Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Coverage tool #4

Merged
merged 33 commits into from
Aug 26, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
f32efef
add mapping to manifest dict
daisieh Aug 6, 2022
e0075cd
clean up imports
daisieh Aug 6, 2022
37c7272
move stuff from translate_mapping to map_row_to_mcodepacket
daisieh Aug 8, 2022
6b9b0f6
split out process_mapping
daisieh Aug 8, 2022
b289b37
Create quantify_coverage.py
daisieh Aug 9, 2022
45dadd9
remove unused imports
daisieh Aug 9, 2022
62f71aa
can't append if doesn't exist
daisieh Aug 11, 2022
17d7db4
progress update
daisieh Aug 11, 2022
5058f5a
check to see that it's not None
daisieh Aug 11, 2022
6840f0a
update create_test_mapping to match
daisieh Aug 11, 2022
d39999e
never hurts to make sure that comments and empty lines are ignored
daisieh Aug 11, 2022
07dc51e
by default, use the whole MCODE_SCHEMA for test map
daisieh Aug 11, 2022
4f1ddd2
Merge branch 'daisieh/coverage' of https://github.com/CanDIG/clinical…
daisieh Aug 11, 2022
a6df61b
Update mcode_template.csv
daisieh Aug 11, 2022
8283a3a
set up two scaffolds to compare
daisieh Aug 11, 2022
43e14e8
compare test to actual
daisieh Aug 11, 2022
4f8d2a8
first pass at cleaning the comparison
daisieh Aug 11, 2022
205760a
remove quotes from csv template
daisieh Aug 15, 2022
24e33f9
map missing in flattened form
daisieh Aug 15, 2022
350e3a0
rename to validate_coverage
daisieh Aug 15, 2022
bda4222
clean up results
daisieh Aug 15, 2022
0470585
reorder output
daisieh Aug 15, 2022
97c4146
update eval for tests
daisieh Aug 15, 2022
74d3426
don't minus one after removing identifier
daisieh Aug 15, 2022
994cb81
squash down all individuals to a single items_used
daisieh Aug 16, 2022
b22bf30
don't include any extra_properties in missing
daisieh Aug 16, 2022
a5ed1ad
add description of quantify_coverage to readme
daisieh Aug 23, 2022
df6d3e3
Merge branch 'daisieh/coverage' of https://github.com/CanDIG/clinical…
daisieh Aug 23, 2022
5d84fdb
docstrings
daisieh Aug 23, 2022
cb1f8d8
ontology placeholder
daisieh Aug 24, 2022
c5daf13
Update validate_coverage.py
daisieh Aug 24, 2022
f7e4ed8
print actual items used
daisieh Aug 24, 2022
f4489a8
also allow string for placeholder_ontology
daisieh Aug 25, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 54 additions & 25 deletions CSVConvert.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,14 @@ def parse_args():
return args


# Combine dataframes from multiple sheets, delete any duplicate patients by merging data
def process_data(raw_csv_dfs, identifier):
# for each dataframe, merge all occurrences for an identifier into a single row with arrayed values
"""Takes a set of raw dataframes with a common identifier and merges into the internal JSON data structure."""
final_merged = {}
cols_index = {}
individuals = []

for page in raw_csv_dfs.keys():
print(f"Processing sheet {page}...")
df = raw_csv_dfs[page].dropna(axis='index', how='all')\
.dropna(axis='columns', how='all')\
.applymap(str)\
Expand Down Expand Up @@ -99,9 +99,9 @@ def process_data(raw_csv_dfs, identifier):


def map_row_to_mcodepacket(identifier, indexed_data, node):
# walk through the provided node of the mcodepacket and fill in the details
"""Given a particular individual's data, and a node in the schema, return the node with mapped data."""
if "str" in str(type(node)) and node != "":
return translate_mapping(identifier, indexed_data, node)
return eval_mapping(identifier, indexed_data, node)
elif "list" in str(type(node)):
new_node = []
for item in node:
Expand All @@ -121,6 +121,7 @@ def map_row_to_mcodepacket(identifier, indexed_data, node):


def translate_mapping(identifier, indexed_data, mapping):
"""Given the identifier field, the data, and a particular mapping, figure out what the method and the mapped values are."""
func_match = re.match(r".*\{(.+?)\((.+)\)\}.*", mapping)
if func_match is not None: # it's a function, prep the dictionary and exec it
items = func_match.group(2).split(";")
Expand All @@ -146,16 +147,27 @@ def translate_mapping(identifier, indexed_data, mapping):
new_dict[item][sheet] = indexed_data["data"][sheet][identifier][item]
else:
new_dict[item][sheet] = []
return func_match.group(1), new_dict
return None, None


def eval_mapping(identifier, indexed_data, node):
"""Given the identifier field, the data, and a particular schema node, evaluate the mapping and return the final JSON for the node in the schema."""
method, mapping = translate_mapping(identifier, indexed_data, node)
if method is not None:
if "mappings" not in mappings.MODULES:
mappings.MODULES["mappings"] = importlib.import_module("mappings")
module = mappings.MODULES["mappings"]
# is the function something in a dynamically-loaded module?
subfunc_match = re.match(r"(.+)\.(.+)", func_match.group(1))
subfunc_match = re.match(r"(.+)\.(.+)", method)
if subfunc_match is not None:
module = mappings.MODULES[subfunc_match.group(1)]
return eval(f'module.{subfunc_match.group(2)}({new_dict})')
return eval(f'mappings.{func_match.group(1)}({new_dict})')
method = subfunc_match.group(2)
return eval(f'module.{method}({mapping})')


# Ingest either an excel file or a directory of csvs
def ingest_raw_data(input_path, indexed):
"""Ingest the csvs or xlsx and create dataframes for processing."""
raw_csv_dfs = {}
output_file = "mCodePacket"
# input can either be an excel file or a directory of csvs
Expand All @@ -181,15 +193,15 @@ def ingest_raw_data(input_path, indexed):
return raw_csv_dfs, output_file


# Create a template for mcodepacket, for use with the --template flag
def generate_mapping_template(node, node_name="", node_names=None):
"""Create a template for mcodepacket, for use with the --template flag."""
if node_names is None:
node_names = []
if node_name != "":
# check to see if the last node_name is a header for this node_name:
if len(node_names) > 0:
x = node_names.pop()
x_match = re.match(r"\"(.+?)\**\",.*", x)
x_match = re.match(r"(.+?)\**,.*", x)
if x_match is not None:
if x_match.group(1) in node_name:
node_names.append(f"##{x}")
Expand All @@ -198,9 +210,9 @@ def generate_mapping_template(node, node_name="", node_names=None):
else:
node_names.append(x)
if "description" in node:
node_names.append(f"\"{node_name}\",\"##{node['description']}\"")
node_names.append(f"{node_name},\"##{node['description']}\"")
else:
node_names.append(f"\"{node_name}\",")
node_names.append(f"{node_name},")
if "type" in node:
if node["type"] == "string":
return "string", node_names
Expand All @@ -222,9 +234,9 @@ def generate_mapping_template(node, node_name="", node_names=None):
or node["$id"] == "katsu:mcode:complex_ontology"):
# add a + to the name of the node to denote that this needs to be looked up in an ontology
name = node_names.pop()
name_match = re.match(r"\"(.+?)\"(.+)", name)
name_match = re.match(r"(.+?),(.+)", name)
if name_match is not None:
name = f"\"{name_match.group(1)}+\"{name_match.group(2)}"
name = f"{name_match.group(1)}+,{name_match.group(2)}"
node_names.append(name)
return node["$id"], node_names
if "properties" in node:
Expand All @@ -242,23 +254,31 @@ def generate_mapping_template(node, node_name="", node_names=None):
return None, node_names


# Given a mapping csv file, create a scaffold mapping.
def process_mapping(line, test=False):
    """Split one csv mapping line into its mapped value and its dotted element path."""
    # quotes are decoration in the template csv; strip them before parsing
    cleaned = line.replace("\"", "")
    parts = re.match(r"(.+?),(.*$)", cleaned)
    if parts is None:
        # not an "element,value" line: hand it back untouched with no path
        return line, None
    raw_value = parts.group(2)
    value = "test" if test else ""
    # "##..." is a description comment, not a mapping; empty means unmapped
    if raw_value and not raw_value.startswith("##"):
        value = raw_value.replace(",", ";")
    # drop the array (*) and ontology (+) markers, then split the dotted path
    path = parts.group(1).replace("*", "").replace("+", "").split(".")
    return value, path


def create_mapping_scaffold(lines, test=False):
"""Given lines from a mapping csv file, create a scaffold mapping."""
props = {}
for line in lines:
if line.startswith("#"):
continue
if re.match(r"^\s*$", line):
continue
line_match = re.match(r"(.+?),(.*$)", line.replace("\"", ""))
if line_match is not None:
element = line_match.group(1)
value = ""
if test:
value = "test"
if line_match.group(2) != "" and not line_match.group(2).startswith("##"):
value = line_match.group(2).replace(",", ";")
elems = element.replace("*", "").replace("+", "").split(".")
value, elems = process_mapping(line, test)
if elems is not None:
x = elems.pop(0)
if x not in props:
props[x] = []
Expand Down Expand Up @@ -295,6 +315,7 @@ def create_mapping_scaffold(lines, test=False):


def load_manifest(mapping):
"""Given a manifest file's path, return the data inside it."""
identifier = None
schema = "mcode"
mapping_scaffold = None
Expand All @@ -313,9 +334,16 @@ def load_manifest(mapping):
mapping_path = os.path.join(manifest_dir, manifest["mapping"])
if os.path.isabs(manifest["mapping"]):
mapping_path = manifest["mapping"]
mapping = []
with open(mapping_path, 'r') as f:
lines = f.readlines()
mapping_scaffold = create_mapping_scaffold(lines)
for line in lines:
if line.startswith("#"):
continue
if re.match(r"^\s*$", line):
continue
mapping.append(line)
mapping_scaffold = create_mapping_scaffold(mapping)
if "functions" in manifest:
for mod in manifest["functions"]:
try:
Expand All @@ -337,6 +365,7 @@ def load_manifest(mapping):
"identifier": identifier,
"schema": schema,
"scaffold": mapping_scaffold,
"mapping": mapping,
"indexed": indexed
}

Expand Down
21 changes: 18 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -111,14 +111,15 @@ Prerequisites:
You'll need to set up a free [account](https://bioportal.bioontology.org/account) at NCBI Bioportal to obtain an API key.

## Running from command line
`$ python CSVConvert.py [-h] [--input INPUT] [--template TEMPLATE] [--mapping|manifest MAPPING]`
```
$ python CSVConvert.py [-h] [--input INPUT] [--template TEMPLATE] [--mapping|manifest MAPPING]

--input: path to dataset to be converted to mCODE data model

--template: If provided, generate a mapping template at the specified file (only needed if you are creating a new template sheet)

--mapping or --manifest: Path to a manifest file describing the mapping

```
## Testing
Continuous Integration is implemented through Pytest and Travis CI which runs when git pushes occur. Build results can be found at [this repository's Travis build page](https://travis-ci.com/github/CanDIG/medidata_mCode_ETL)

Expand All @@ -129,6 +130,20 @@ To run tests manually, enter from command line `$ pytest`
## Creating a dummy json file for testing
You can use a template file (created as described above with `--template`) alone to create a dummy ingest file without actual data.

`python create_test_mapping.py` creates a file at `mcode_template_testmap.json` that is filled in (without using mapping functions) with placeholder or dummy values. You can specify the placeholder value with the argument `--placeholder`.
`python create_test_mapping.py` creates a JSON that is filled in (without using mapping functions) with placeholder or dummy values. You can specify the placeholder value with the argument `--placeholder`. If no template file is specified with `--template`, the current MCODE_SCHEMA of katsu is used and the JSON is output to stdout. Otherwise, the file is saved to `<template>_testmap.json`.

This JSON file can be ingested into katsu and compared with the ingested value using https://github.com/CanDIG/candigv2-ingest/blob/main/katsu_validate_dataset.py.

## Quantifying coverage for datasets and mappings
The `quantify_coverage.py` tool takes the same arguments as `CSVConvert.py`:
```
$ python quantify_coverage.py [-h] [--input INPUT] [--mapping|manifest MAPPING]

--input: path to dataset

--mapping or --manifest: Path to a manifest file describing the mapping
```

This tool outputs information quantifying:
* how much of the schema is covered by the mapping
* how much of the dataset is covered by the mapping
32 changes: 14 additions & 18 deletions create_test_mapping.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,15 @@
from copy import deepcopy
import importlib.util
import json
import os
import re
import yaml
from datetime import datetime
from CSVConvert import create_mapping_scaffold, generate_mapping_template
import argparse
from chord_metadata_service.mcode.schemas import MCODE_SCHEMA


def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('--template', type=str, default="mcode_template.csv", help="Path to a template mapping file.")
parser.add_argument('--template', type=str, help="Path to a template mapping file.")
parser.add_argument('--placeholder', type=str, default="abcd", help="Value for placeholder strings.")
args = parser.parse_args()
return args
Expand Down Expand Up @@ -90,27 +87,26 @@ def pick_value_for_node(placeholder_val, node, schema):

def main(args):
template = args.template
schema, nn = generate_mapping_template(MCODE_SCHEMA)
# print(json.dumps(MCODE_SCHEMA, indent=4))
if template is not None:
with open(template, 'r') as f:
lines = f.readlines()
mapping_scaffold = create_mapping_scaffold(lines, test=True)
# print(json.dumps(mapping_scaffold, indent=4))
if mapping_scaffold is None:
print("No mapping scaffold was loaded. Either katsu was not found or no schema was specified.")
return
mapping = f.readlines()
else:
print("A manifest file is required, using the --manifest argument")
schema, mapping = generate_mapping_template(MCODE_SCHEMA)
mapping_scaffold = create_mapping_scaffold(mapping, test=True)
# print(json.dumps(mapping_scaffold, indent=4))
if mapping_scaffold is None:
print("No mapping scaffold was loaded. Either katsu was not found or no schema was specified.")
return

output_file, ext = os.path.splitext(template)

mcodepackets = [map_to_mcodepacket(args.placeholder, deepcopy(mapping_scaffold), MCODE_SCHEMA)]

with open(f"{output_file}_testmap.json", 'w') as f: # write to json file for ingestion
json.dump(mcodepackets, f, indent=4)
print(f"Test mapping saved as {output_file}_testmap.json")
if template is not None:
output_file, ext = os.path.splitext(template)
with open(f"{output_file}_testmap.json", 'w') as f: # write to json file for ingestion
json.dump(mcodepackets, f, indent=4)
print(f"Test mapping saved as {output_file}_testmap.json")
else:
print(json.dumps(mcodepackets, indent=4))

if __name__ == '__main__':
main(parse_args())
12 changes: 12 additions & 0 deletions mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,3 +97,15 @@ def is_null(cell):
if cell == 'nan' or cell is None or cell == '':
return True
return False

# Placeholder function to make a fake ontology entry
def ontology_placeholder(mapping):
    """Return a fake ontology entry ({id, label}) for a mapped value.

    A bare string is used as the label directly; any other mapping value
    is reduced to one value via single_val() first.
    """
    # isinstance is the reliable string check; the previous
    # `"str" in str(type(mapping))` also matched unrelated classes whose
    # names merely contain "str".
    if isinstance(mapping, str):
        return {
            "id": "placeholder",
            "label": mapping
        }
    return {
        "id": "placeholder",
        "label": single_val(mapping)
    }
Loading