diff --git a/gen3/tools/indexing/download_manifest.py b/gen3/tools/indexing/download_manifest.py index 2e5ecffa4..c38781d6d 100644 --- a/gen3/tools/indexing/download_manifest.py +++ b/gen3/tools/indexing/download_manifest.py @@ -151,7 +151,9 @@ async def _write_all_index_records_to_file( os.unlink(output_filename) with open(output_filename, "wb") as outfile: - outfile.write("guid, urls, authz, acl, md5, file_size\n".encode("utf8")) + outfile.write( + "guid, urls, authz, acl, md5, file_size, file_name\n".encode("utf8") + ) for filename in glob.glob(TMP_FOLDER + "*"): if output_filename == filename: # don't want to copy the output into the output @@ -299,6 +301,7 @@ async def _parse_from_queue(queue): " ".join(record.get("acl")), record.get("hashes", {}).get("md5"), record.get("size"), + record.get("file_name"), ] loop.run_in_executor(None, csv_writer.writerow, manifest_row) diff --git a/gen3/tools/indexing/verify_manifest.py b/gen3/tools/indexing/verify_manifest.py index b97499ae7..7f0965534 100644 --- a/gen3/tools/indexing/verify_manifest.py +++ b/gen3/tools/indexing/verify_manifest.py @@ -86,7 +86,12 @@ def _get_md5_from_row(row): Returns: str: md5 sum for file """ - return row.get("md5") + if "md5" in row: + return row["md5"] + elif "md5sum" in row: + return row["md5sum"] + else: + return None def _get_file_size_from_row(row): @@ -100,7 +105,12 @@ def _get_file_size_from_row(row): int: integer representing file size in bytes """ try: - return int(row.get("file_size")) + if "file_size" in row: + return int(row["file_size"]) + elif "size" in row: + return int(row["size"]) + else: + return None except Exception: logging.warning(f"could not convert this to an int: {row.get('file_size')}") return row.get("file_size") @@ -143,7 +153,32 @@ def _get_urls_from_row(row): Returns: List[str]: urls for indexd record file location(s) """ - return [item for item in row.get("urls", "").strip().split(" ") if item] + if "urls" in row: + return [item for item in row.get("urls", 
"").strip().split(" ") if item] + elif "url" in row: + return [item for item in row.get("url", "").strip().split(" ") if item] + else: + return [] + + +def _get_file_name_from_row(row): + """ + Given a row from the manifest, return the field representing file's expected file_name. + + Args: + row (dict): column_name:row_value + + Returns: + str: file_name for indexd record + """ + if "file_name" in row: + return row["file_name"] + elif "filename" in row: + return row["filename"] + elif "name" in row: + return row["name"] + else: + return None manifest_row_parsers = { @@ -153,6 +188,7 @@ def _get_urls_from_row(row): "acl": _get_acl_from_row, "authz": _get_authz_from_row, "urls": _get_urls_from_row, + "file_name": _get_file_name_from_row, } @@ -222,7 +258,7 @@ def _verify_all_index_records_in_file( with open(manifest_file, encoding="utf-8-sig") as csvfile: manifest_reader = csv.DictReader(csvfile, delimiter=manifest_file_delimiter) for row in manifest_reader: - row = {key.strip(" "): value.strip(" ") for key, value in row.items()} + row = {key.strip(" "): value for key, value in row.items()} queue.put(row) logging.info( @@ -298,6 +334,7 @@ def _verify_records_in_indexd(queue, commons_url, manifest_row_parsers): file_size = manifest_row_parsers["file_size"](row) md5 = manifest_row_parsers["md5"](row) urls = manifest_row_parsers["urls"](row) + file_name = manifest_row_parsers["file_name"](row) try: actual_record = index.get_record(guid) @@ -363,6 +400,13 @@ def _verify_records_in_indexd(queue, commons_url, manifest_row_parsers): file.write(output) logging.error(output) + if not actual_record["file_name"] and file_name: + # if the actual record name is "" or None but something was specified + # in the manifest, we have a problem + output = f"{guid}|file_name|expected {file_name}|actual {actual_record['file_name']}\n" + file.write(output) + logging.error(output) + row = queue.get() logging.info(f"{process_name}:Stop") diff --git 
a/tests/test_manifests.py b/tests/test_manifests.py index 7f376e1da..cdc32f85e 100644 --- a/tests/test_manifests.py +++ b/tests/test_manifests.py @@ -91,6 +91,7 @@ def test_download_manifest(monkeypatch, gen3_index): acl=["DEV", "test2"], authz=["/programs/DEV/projects/test2", "/programs/DEV/projects/test2bak"], urls=["gs://test/test.txt"], + file_name="test.txt", ) rec3 = gen3_index.create_record( did="dg.TEST/ed8f4658-6acd-4f96-9dd8-3709890c959e", @@ -120,12 +121,13 @@ def test_download_manifest(monkeypatch, gen3_index): # skip header next(file) for line in file: - guid, urls, authz, acl, md5, file_size = line.split(",") + guid, urls, authz, acl, md5, file_size, file_name = line.split(",") guid = guid.strip("\n") urls = urls.split(" ") authz = authz.split(" ") acl = acl.split(" ") file_size = file_size.strip("\n") + file_name = file_name.strip("\n") records[guid] = { "urls": urls, @@ -133,6 +135,7 @@ def test_download_manifest(monkeypatch, gen3_index): "acl": acl, "md5": md5, "file_size": file_size, + "file_name": file_name, } except Exception: # unexpected file format, fail test @@ -160,10 +163,16 @@ def test_download_manifest(monkeypatch, gen3_index): assert "a1234567891234567890123456789012" in records.get( "dg.TEST/f2a39f98-6ae1-48a5-8d48-825a0c52a22b", {} ).get("md5") + assert not records.get("dg.TEST/f2a39f98-6ae1-48a5-8d48-825a0c52a22b", {}).get( + "file_name" + ) # assert other 2 records exist assert "dg.TEST/ed8f4658-6acd-4f96-9dd8-3709890c959e" in records assert "dg.TEST/1e9d3103-cbe2-4c39-917c-b3abad4750d2" in records + assert "test.txt" == records.get( + "dg.TEST/1e9d3103-cbe2-4c39-917c-b3abad4750d2", {} + ).get("file_name") def _mock_get_guid(guid, **kwargs):