Skip to content

Commit

Permalink
Merge pull request #22 from uc-cdis/feat/file-names
Browse files Browse the repository at this point in the history
feat(indexing-tools): support file names in manifest and improve pars…
  • Loading branch information
giangbui authored Feb 4, 2020
2 parents 76e4c77 + b86891a commit 8461e27
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 6 deletions.
5 changes: 4 additions & 1 deletion gen3/tools/indexing/download_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,9 @@ async def _write_all_index_records_to_file(
os.unlink(output_filename)

with open(output_filename, "wb") as outfile:
outfile.write("guid, urls, authz, acl, md5, file_size\n".encode("utf8"))
outfile.write(
"guid, urls, authz, acl, md5, file_size, file_name\n".encode("utf8")
)
for filename in glob.glob(TMP_FOLDER + "*"):
if output_filename == filename:
# don't want to copy the output into the output
Expand Down Expand Up @@ -299,6 +301,7 @@ async def _parse_from_queue(queue):
" ".join(record.get("acl")),
record.get("hashes", {}).get("md5"),
record.get("size"),
record.get("file_name"),
]
loop.run_in_executor(None, csv_writer.writerow, manifest_row)

Expand Down
52 changes: 48 additions & 4 deletions gen3/tools/indexing/verify_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,12 @@ def _get_md5_from_row(row):
Returns:
str: md5 sum for file
"""
return row.get("md5")
if "md5" in row:
return row["md5"]
elif "md5sum" in row:
return row["md5sum"]
else:
return None


def _get_file_size_from_row(row):
Expand All @@ -100,7 +105,12 @@ def _get_file_size_from_row(row):
int: integer representing file size in bytes
"""
try:
return int(row.get("file_size"))
if "file_size" in row:
return int(row["file_size"])
elif "size" in row:
return int(row["size"])
else:
return None
except Exception:
logging.warning(f"could not convert this to an int: {row.get('file_size')}")
return row.get("file_size")
Expand Down Expand Up @@ -143,7 +153,32 @@ def _get_urls_from_row(row):
Returns:
List[str]: urls for indexd record file location(s)
"""
return [item for item in row.get("urls", "").strip().split(" ") if item]
if "urls" in row:
return [item for item in row.get("urls", "").strip().split(" ") if item]
elif "url" in row:
return [item for item in row.get("urls", "").strip().split(" ") if item]
else:
return []


def _get_file_name_from_row(row):
"""
Given a row from the manifest, return the field representing file's expected file_name.
Args:
row (dict): column_name:row_value
Returns:
List[str]: file_name for indexd record file location(s)
"""
if "file_name" in row:
return row["file_name"]
elif "filename" in row:
return row["filename"]
elif "name" in row:
return row["name"]
else:
return None


manifest_row_parsers = {
Expand All @@ -153,6 +188,7 @@ def _get_urls_from_row(row):
"acl": _get_acl_from_row,
"authz": _get_authz_from_row,
"urls": _get_urls_from_row,
"file_name": _get_file_name_from_row,
}


Expand Down Expand Up @@ -222,7 +258,7 @@ def _verify_all_index_records_in_file(
with open(manifest_file, encoding="utf-8-sig") as csvfile:
manifest_reader = csv.DictReader(csvfile, delimiter=manifest_file_delimiter)
for row in manifest_reader:
row = {key.strip(" "): value.strip(" ") for key, value in row.items()}
row = {key.strip(" "): value for key, value in row.items()}
queue.put(row)

logging.info(
Expand Down Expand Up @@ -298,6 +334,7 @@ def _verify_records_in_indexd(queue, commons_url, manifest_row_parsers):
file_size = manifest_row_parsers["file_size"](row)
md5 = manifest_row_parsers["md5"](row)
urls = manifest_row_parsers["urls"](row)
file_name = manifest_row_parsers["file_name"](row)

try:
actual_record = index.get_record(guid)
Expand Down Expand Up @@ -363,6 +400,13 @@ def _verify_records_in_indexd(queue, commons_url, manifest_row_parsers):
file.write(output)
logging.error(output)

if not actual_record["file_name"] and file_name:
# if the actual record name is "" or None but something was specified
# in the manifest, we have a problem
output = f"{guid}|file_name|expected {file_name}|actual {actual_record['file_name']}\n"
file.write(output)
logging.error(output)

row = queue.get()

logging.info(f"{process_name}:Stop")
11 changes: 10 additions & 1 deletion tests/test_manifests.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ def test_download_manifest(monkeypatch, gen3_index):
acl=["DEV", "test2"],
authz=["/programs/DEV/projects/test2", "/programs/DEV/projects/test2bak"],
urls=["gs://test/test.txt"],
file_name="test.txt",
)
rec3 = gen3_index.create_record(
did="dg.TEST/ed8f4658-6acd-4f96-9dd8-3709890c959e",
Expand Down Expand Up @@ -120,19 +121,21 @@ def test_download_manifest(monkeypatch, gen3_index):
# skip header
next(file)
for line in file:
guid, urls, authz, acl, md5, file_size = line.split(",")
guid, urls, authz, acl, md5, file_size, file_name = line.split(",")
guid = guid.strip("\n")
urls = urls.split(" ")
authz = authz.split(" ")
acl = acl.split(" ")
file_size = file_size.strip("\n")
file_name = file_name.strip("\n")

records[guid] = {
"urls": urls,
"authz": authz,
"acl": acl,
"md5": md5,
"file_size": file_size,
"file_name": file_name,
}
except Exception:
# unexpected file format, fail test
Expand Down Expand Up @@ -160,10 +163,16 @@ def test_download_manifest(monkeypatch, gen3_index):
assert "a1234567891234567890123456789012" in records.get(
"dg.TEST/f2a39f98-6ae1-48a5-8d48-825a0c52a22b", {}
).get("md5")
assert not records.get("dg.TEST/f2a39f98-6ae1-48a5-8d48-825a0c52a22b", {}).get(
"file_name"
)

# assert other 2 records exist
assert "dg.TEST/ed8f4658-6acd-4f96-9dd8-3709890c959e" in records
assert "dg.TEST/1e9d3103-cbe2-4c39-917c-b3abad4750d2" in records
assert "test.txt" == records.get(
"dg.TEST/1e9d3103-cbe2-4c39-917c-b3abad4750d2", {}
).get("file_name")


def _mock_get_guid(guid, **kwargs):
Expand Down

0 comments on commit 8461e27

Please sign in to comment.