Separate template code #7

Merged · 5 commits · Dec 2, 2022
138 changes: 17 additions & 121 deletions CSVConvert.py
@@ -13,16 +13,12 @@
import yaml

import argparse
from chord_metadata_service.mcode.schemas import MCODE_SCHEMA
from schemas import candigv1_schema


def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('--input', type=str, help="Path to either an xlsx file or a directory of csv files for ingest")
# parser.add_argument('--api_key', type=str, help="BioPortal API key found in BioPortal personal account settings")
# parser.add_argument('--email', type=str, help="Contact email to access NCBI clinvar API. Required by Entrez")
parser.add_argument('--template', type=str, help="If provided, generate a mapping template at the specified file")
parser.add_argument('--schema', type=str, help="Schema to use for template; default is mCodePacket")
parser.add_argument('--mapping', '--manifest', type=str, help="Path to a manifest file describing the mapping."
" See README for more information")
@@ -128,7 +124,13 @@ def translate_mapping(identifier, indexed_data, mapping):
new_dict = {}
mappings.IDENTIFIER = {"id": identifier}
for item in items:
item, sheets = process_ref(item)
item = item.strip()
sheets = None
sheet_match = re.match(r"(.+?)\.(.+)", item)
if sheet_match is not None:
# this is a specific item on a specific sheet:
item = sheet_match.group(2)
sheets = [sheet_match.group(1).replace('"', '').replace("'", "")]
# check to see if this item is even present in the columns:
if item in indexed_data["columns"]:
new_dict[item] = {}
@@ -145,27 +147,6 @@ def translate_mapping(identifier, indexed_data, mapping):
return None, None


def process_ref(item):
"""Given a mapping item, process the reference into the item and the referred sheet."""
item = item.strip()
sheets = None
# are there quotes?
first_quote_match = re.match(r"^[\'\"](.+?)[\'\"]\.*(.*)", item)
if first_quote_match is not None:
if first_quote_match.group(2) == '':
item = first_quote_match.group(1).replace('"', '').replace("'", "")
else:
item = first_quote_match.group(2).replace('"', '').replace("'", "")
sheets = [first_quote_match.group(1).replace('"', '').replace("'", "")]
else:
sheet_match = re.match(r"(.+?)\.(.+)", item)
if sheet_match is not None:
# this is a specific item on a specific sheet:
item = sheet_match.group(2).replace('"', '').replace("'", "")
sheets = [sheet_match.group(1).replace('"', '').replace("'", "")]
return item, sheets


def eval_mapping(identifier, indexed_data, node):
"""Given the identifier field, the data, and a particular schema node, evaluate the mapping and return the final JSON for the node in the schema."""
method, mapping = translate_mapping(identifier, indexed_data, node)
@@ -207,71 +188,9 @@ def ingest_raw_data(input_path, indexed):
raw_csv_dfs[df].reset_index(inplace=True)
return raw_csv_dfs, output_file


def generate_mapping_template(node, node_name="", node_names=None):
"""Create a template for mcodepacket, for use with the --template flag."""
if node_names is None:
node_names = []
if node_name != "":
# check to see if the last node_name is a header for this node_name:
if len(node_names) > 0:
x = node_names.pop()
x_match = re.match(r"(.+?)\**,.*", x)
if x_match is not None:
if x_match.group(1) in node_name:
node_names.append(f"##{x}")
else:
node_names.append(x)
else:
node_names.append(x)
if "description" in node:
node_names.append(f"{node_name},\"##{node['description']}\"")
else:
node_names.append(f"{node_name},")
if "type" in node:
if node["type"] == "string":
return "string", node_names
elif node["type"] == "array":
new_node_name = ".".join((node_name, "0"))
sc, nn = generate_mapping_template(node["items"], new_node_name, node_names)
return [sc], nn
elif node["type"] in ["number", "integer"]:
return 0, node_names
elif node["type"] == "boolean":
return True, node_names
elif node["type"] == "object":
scaffold = {}
if "$id" in node:
scaffold["$id"] = node["$id"]
if len(node_names) > 0:
# if this is an ontology_class_schema, we'll update this data post-mapping
if "$id" in node and (node["$id"] == "katsu:common:ontology_class"
or node["$id"] == "katsu:mcode:complex_ontology"):
# add a + to the name of the node to denote that this needs to be looked up in an ontology
name = node_names.pop()
name_match = re.match(r"(.+?),(.+)", name)
if name_match is not None:
name = f"{name_match.group(1)}+,{name_match.group(2)}"
node_names.append(name)
return node["$id"], node_names
if "properties" in node:
for prop in node["properties"]:
if node_name == "":
new_node_name = prop
else:
new_node_name = ".".join((node_name, prop))
if "required" in node and prop in node["required"]:
new_node_name += "*"
scaffold[prop], node_names = generate_mapping_template(node["properties"][prop], new_node_name, node_names)
return scaffold, node_names
else:
return {}, node_names
return None, node_names


def process_mapping(line, test=False):
"""Given a csv mapping line, process into its component pieces."""
line_match = re.match(r"(.+?),\"*(.*$)\"*", line)
line_match = re.match(r"(.+?),(.*$)", line.replace("\"", ""))
if line_match is not None:
element = line_match.group(1)
value = ""
@@ -295,15 +214,14 @@ def create_mapping_scaffold(lines, test=False):
value, elems = process_mapping(line, test)
if elems is not None:
x = elems.pop(0)
if value is not None and value != "":
if x not in props:
props[x] = []
if len(elems) > 0:
props[x].append(".".join(elems)+","+value)
elif value != "":
props[x].append(value)
else:
props[x] = []
if x not in props:
props[x] = []
if len(elems) > 0:
props[x].append(".".join(elems)+","+value)
elif value != "":
props[x].append(value)
else:
props[x] = []
else:
return line

@@ -390,34 +308,12 @@ def main(args):
# api_key = args.api_key
input_path = args.input
# email = args.email
template = args.template
#template = args.template
mapping = args.mapping
schema = args.schema
mappings.VERBOSE = args.verbose
metadata = ""

# if template is provided, we should generate a template file
if template is not None:
if schema is None:
schema = MCODE_SCHEMA
# get metadata about version of MCODE_SCHEMA used:
metadata += "## schema based on version " + version('katsu') + ",\n"
direct_url = [p for p in files('katsu') if 'direct_url.json' in str(p)]
if len(direct_url) > 0:
d = json.loads(direct_url[0].read_text())
metadata += f"## directly checked out from {d['url']}, commit {d['vcs_info']['commit_id']}\n"
if schema == "candigv1":
schema = candigv1_schema
sc, node_names = generate_mapping_template(schema)

with open(f"{template}.csv", 'w') as f: # write to csv file for mapping
f.write(metadata)
f.write("## mcodepacket element, description (overwrite with mapped element)\n")
f.write("## (.0 is an array element) (* is required) (+ denotes ontology term),\n")
for nn in node_names:
f.write(f"{nn}\n")
return

# if mapping is provided, we should create a mapping scaffold
if mapping is not None:
manifest = load_manifest(mapping)
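The `process_ref` helper is removed above; its sheet-reference parsing now lives inline in `translate_mapping`. A minimal standalone sketch of that behaviour for reference (`split_sheet_ref` is a hypothetical name used only for illustration):

```python
import re

def split_sheet_ref(item):
    # Mirrors the parsing inlined in translate_mapping: "Sheet.column"
    # splits into the column name and a one-element list holding the
    # de-quoted sheet name; a bare column name passes through unchanged.
    item = item.strip()
    sheets = None
    sheet_match = re.match(r"(.+?)\.(.+)", item)
    if sheet_match is not None:
        item = sheet_match.group(2)
        sheets = [sheet_match.group(1).replace('"', '').replace("'", "")]
    return item, sheets

print(split_sheet_ref("Demographics.Site"))  # ('Site', ['Demographics'])
print(split_sheet_ref("Subject"))            # ('Subject', None)
```

Note that, unlike the old `process_ref`, the inlined version no longer strips quotes from the column name itself, only from the sheet name.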
120 changes: 12 additions & 108 deletions README.md
@@ -1,107 +1,6 @@
# clinical_ETL_code

Convert patient clinical data to mCode model for Katsu ingestion with Python

## Creating a mapping
You'll need to create a mapping scheme for converting your raw data into mCodepackets for use by katsu.

#### 1. Generate a template for the current installed version of katsu:

`$ python CSVConvert.py --template output_file`

#### 2. Fill in the fields in the template files by naming your mapping functions and their input.
Example:

`id, {single_val(Subject)}`

In this case, the `id` field in the mCODE packet is being mapped to the `Subject` column of the raw input data using the built-in `single_val` function.
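A rough sketch of what a built-in like `single_val` plausibly does — the body and the assumed shape of `mapping` (a dict of column name to extracted values) are illustrative assumptions, not the actual implementation in `mappings.py`:

```python
def single_val(mapping):
    # Assumed behaviour: take the lone column passed in and return its
    # single value (None if nothing was mapped for this patient).
    values = list(mapping.values())[0]
    if values is None:
        return None
    if isinstance(values, list):
        return values[0] if len(values) > 0 else None
    return values
```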

You can specify nodes to be mapped using custom mapping functions, passing in the set of fields from your raw dataset that you want to map:

`subject.extra_properties, {additional_functions.vital_signs_node(WEIGHT_BEFORE_STD, WEIGHT_BEFORE_STD_UN, WEIGHT_25_AGE_STD, WEIGHT_25_AGE_STD_UN, HEIGHT_STD, HEIGHT_STD_UN)}`

In this case, the `extra_properties` field in the mCODE packet is being mapped to multiple columns of the raw input data, and the custom mapping function is called `vital_signs_node` from the `additional_functions` module.

Note that you can specify the particular field from a particular raw input sheet/csv:
`Patient.provinceOfResidence, {COMPARISON.province_from_site(Demographics.Site)}`

**Notes about mapping functions:**

- The entries available in the template represent the data mappable by katsu's mCODEpacket.

- Data that should be a single value should use the built-in mapping function `single_val`; likewise any that should be a list/array should use `list_val`. **Note**: single value means a patient will only have one value for that field, such as `id` or `sex`. Something like `genetic_variant` would not be single value, since a single patient can have more than one.

- Some transformations are pretty standard, so they are provided in `mappings.py`. These don't require a module to be specified:
`subject.date_of_birth,{date(BIRTH_DATE_YM_RAW)}`

- Any additional data that you'd like to include that is not part of the mCODE fields needs to be mapped to one of the extra_properties dicts.

- Any mappings left blank will be stored as None in the resulting mCODEpacket.

- Entries that begin with `##` are informational: they can be overwritten or deleted completely from the mapping file.

- Entries that have an asterisk are required values for mCODEpacket. There is no validation for this at the moment so the tool will run even if there are missing required values.

- Entries that contain a 0 in the name represent array values: probably the best way to specify mappings for these is to pass all relevant data into a custom mapping function and create the array that way. If you choose to do this, don't also specify mappings for sub-entries in the mapping file, as all of the mapping will happen in the mapping function.

- Some editors (such as LibreOffice) insert commas in the template's empty fields and modify some names with hashtags. If the tool is not working, make sure to remove these characters using a text editor. Refer to this [example](https://github.com/CanDIG/clinical_ETL/blob/main/example/COMPARISON2mCODE.csv).

#### 3. Write your custom mapping functions.

Implement the custom mapping functions that you specified in your template file. These are functions that accept a single argument, `mapping`, and return a python object:

```python
# Example mapping function
def vital_signs_node(mapping):
vital_signs = {
'WEIGHT_BEFORE_STD': 'weight_before_illness',
'WEIGHT_BEFORE_STD_UN': 'weight_before_illness_unit',
'WEIGHT_25_AGE_STD': 'weight_around_25',
'WEIGHT_25_AGE_STD_UN': 'weight_around_25_unit',
'HEIGHT_STD': 'height',
'HEIGHT_STD_UN': 'height_unit'
}
new_dict = {}
for item in mapping.keys():
new_dict[vital_signs[item]] = mapping[item]
return new_dict
```
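Called on a subset of raw columns, the function simply renames them; for example (the input values here are made up):

```python
vital_signs_node({'HEIGHT_STD': 172, 'HEIGHT_STD_UN': 'cm'})
# -> {'height': 172, 'height_unit': 'cm'}
```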

#### 4. Create a new directory that contains your template and mapping functions, then create a `manifest.yml` file in the same directory with the following information:
- `description`: description of the mapping
- `identifier`: column used to identify patients in the input data
- `mapping`: template file
- `functions`: additional mapping functions
- `sheets`: lists of sheets in the clinical data:
    - raw (all sheets available for mapping)
    - final (subset of sheets actually used in the mapping)
- `indexed`: a list of sheets that need a numeric row index, e.g. for specifying particular rows. Any sheets here will have an `index` column available to mapping functions.

**Note:** Files should be specified as paths relative to the location of the manifest file.

Example:
```yaml
description: Test mapping of COMPARISON dataset to mCODEpacket format for katsu
identifier: Subject
mapping: your_mapping.csv
functions:
- additional_functions.py
sheets:
raw:
- Vital Signs
- Diagnosis
- Diagnosis 2
- Hematology
- Outcome
final:
- Vital Signs
- Diagnosis
- Outcome
indexed:
- Diagnosis
```
This repository converts MoH clinical data into the mcodepacket format needed for katsu. The cohort-specific mappings are implemented in a private GitHub repository, not here.

## Set-up & Installation
Prerequisites:
@@ -110,16 +9,21 @@ Prerequisites:

You'll need to set up a free [account](https://bioportal.bioontology.org/account) at NCBO BioPortal to obtain an API key.

## Running from command line
## Converting csvs to mcodepackets
```
$ python CSVConvert.py [-h] [--input INPUT] [--template TEMPLATE] [--mapping|manifest MAPPING]
$ python CSVConvert.py [-h] [--input INPUT] [--mapping|manifest MAPPING]

--input: path to dataset to be converted to mCODE data model

--template: If provided, generate a mapping template at the specified file (only needed if you are creating a new template sheet)

--mapping or --manifest: Path to a manifest file describing the mapping
```

The output mcode packets (`INPUT_map.json` and `INPUT_indexed.json`) will be in the parent of the `INPUT` directory.

## Generating mcode template file

The `generate_template.py` script will generate a template file based on the version of katsu specified in `requirements.txt`.
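Presumably it is invoked along these lines (an assumption — check the script's own help for its actual arguments):

```
$ python generate_template.py
```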

## Testing
Continuous Integration is implemented through Pytest and Travis CI, which runs when git pushes occur. Build results can be found at [this repository's Travis build page](https://travis-ci.com/github/CanDIG/medidata_mCode_ETL)

@@ -128,14 +32,14 @@ To run tests manually, enter from command line `$ pytest`
*Note: updated mCodePacket.json files must be pushed for all tests to pass during Travis builds*

## Creating a dummy json file for testing
You can use a template file (created as described above with `--template`) alone to create a dummy ingest file without actual data.
You can use an mcode template file (created as described above) alone to create a dummy ingest file without actual data.

`python create_test_mapping.py` creates a JSON that is filled in (without using mapping functions) with placeholder or dummy values. You can specify the placeholder value with the argument `--placeholder`. If no template file is specified with `--template`, the current MCODE_SCHEMA of katsu is used and the JSON is written to stdout. Otherwise, the file is saved to `<template>_testmap.json`.

This JSON file can be ingested into katsu and compared with the ingested value using https://github.com/CanDIG/candigv2-ingest/blob/main/katsu_validate_dataset.py.
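For example (the template file name and placeholder value here are illustrative):

```
$ python create_test_mapping.py --template your_mapping.csv --placeholder "dummy"
```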

## Quantifying coverage for datasets and mappings
The `validate_coverage.py` tool takes the same arguments as `CSVConvert.py`:
The `quantify_coverage.py` tool takes the same arguments as `CSVConvert.py`:
```
$ python CSVConvert.py [-h] [--input INPUT] [--mapping|manifest MAPPING]
