Separate template code #7

Merged · 5 commits · Dec 2, 2022
138 changes: 17 additions & 121 deletions CSVConvert.py
@@ -13,16 +13,12 @@
import yaml

import argparse
from chord_metadata_service.mcode.schemas import MCODE_SCHEMA
from schemas import candigv1_schema


def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('--input', type=str, help="Path to either an xlsx file or a directory of csv files for ingest")
# parser.add_argument('--api_key', type=str, help="BioPortal API key found in BioPortal personal account settings")
# parser.add_argument('--email', type=str, help="Contact email to access NCBI clinvar API. Required by Entrez")
parser.add_argument('--template', type=str, help="If provided, generate a mapping template at the specified file")
parser.add_argument('--schema', type=str, help="Schema to use for template; default is mCodePacket")
parser.add_argument('--mapping', '--manifest', type=str, help="Path to a manifest file describing the mapping."
" See README for more information")
@@ -128,7 +124,13 @@ def translate_mapping(identifier, indexed_data, mapping):
new_dict = {}
mappings.IDENTIFIER = {"id": identifier}
for item in items:
item, sheets = process_ref(item)
item = item.strip()
sheets = None
sheet_match = re.match(r"(.+?)\.(.+)", item)
if sheet_match is not None:
# this is a specific item on a specific sheet:
item = sheet_match.group(2)
sheets = [sheet_match.group(1).replace('"', '').replace("'", "")]
# check to see if this item is even present in the columns:
if item in indexed_data["columns"]:
new_dict[item] = {}
@@ -145,27 +147,6 @@ def translate_mapping(identifier, indexed_data, mapping):
return None, None


def process_ref(item):
"""Given a mapping item, process the reference into the item and the referred sheet."""
item = item.strip()
sheets = None
# are there quotes?
first_quote_match = re.match(r"^[\'\"](.+?)[\'\"]\.*(.*)", item)
if first_quote_match is not None:
if first_quote_match.group(2) == '':
item = first_quote_match.group(1).replace('"', '').replace("'", "")
else:
item = first_quote_match.group(2).replace('"', '').replace("'", "")
sheets = [first_quote_match.group(1).replace('"', '').replace("'", "")]
else:
sheet_match = re.match(r"(.+?)\.(.+)", item)
if sheet_match is not None:
# this is a specific item on a specific sheet:
item = sheet_match.group(2).replace('"', '').replace("'", "")
sheets = [sheet_match.group(1).replace('"', '').replace("'", "")]
return item, sheets


def eval_mapping(identifier, indexed_data, node):
"""Given the identifier field, the data, and a particular schema node, evaluate the mapping and return the final JSON for the node in the schema."""
method, mapping = translate_mapping(identifier, indexed_data, node)
@@ -207,71 +188,9 @@ def ingest_raw_data(input_path, indexed):
raw_csv_dfs[df].reset_index(inplace=True)
return raw_csv_dfs, output_file


def generate_mapping_template(node, node_name="", node_names=None):
"""Create a template for mcodepacket, for use with the --template flag."""
if node_names is None:
node_names = []
if node_name != "":
# check to see if the last node_name is a header for this node_name:
if len(node_names) > 0:
x = node_names.pop()
x_match = re.match(r"(.+?)\**,.*", x)
if x_match is not None:
if x_match.group(1) in node_name:
node_names.append(f"##{x}")
else:
node_names.append(x)
else:
node_names.append(x)
if "description" in node:
node_names.append(f"{node_name},\"##{node['description']}\"")
else:
node_names.append(f"{node_name},")
if "type" in node:
if node["type"] == "string":
return "string", node_names
elif node["type"] == "array":
new_node_name = ".".join((node_name, "0"))
sc, nn = generate_mapping_template(node["items"], new_node_name, node_names)
return [sc], nn
elif node["type"] in ["number", "integer"]:
return 0, node_names
elif node["type"] == "boolean":
return True, node_names
elif node["type"] == "object":
scaffold = {}
if "$id" in node:
scaffold["$id"] = node["$id"]
if len(node_names) > 0:
# if this is an ontology_class_schema, we'll update this data post-mapping
if "$id" in node and (node["$id"] == "katsu:common:ontology_class"
or node["$id"] == "katsu:mcode:complex_ontology"):
# add a + to the name of the node to denote that this needs to be looked up in an ontology
name = node_names.pop()
name_match = re.match(r"(.+?),(.+)", name)
if name_match is not None:
name = f"{name_match.group(1)}+,{name_match.group(2)}"
node_names.append(name)
return node["$id"], node_names
if "properties" in node:
for prop in node["properties"]:
if node_name == "":
new_node_name = prop
else:
new_node_name = ".".join((node_name, prop))
if "required" in node and prop in node["required"]:
new_node_name += "*"
scaffold[prop], node_names = generate_mapping_template(node["properties"][prop], new_node_name, node_names)
return scaffold, node_names
else:
return {}, node_names
return None, node_names


def process_mapping(line, test=False):
"""Given a csv mapping line, process into its component pieces."""
line_match = re.match(r"(.+?),\"*(.*$)\"*", line)
line_match = re.match(r"(.+?),(.*$)", line.replace("\"", ""))
if line_match is not None:
element = line_match.group(1)
value = ""
@@ -295,15 +214,14 @@ def create_mapping_scaffold(lines, test=False):
value, elems = process_mapping(line, test)
if elems is not None:
x = elems.pop(0)
if value is not None and value != "":
if x not in props:
props[x] = []
if len(elems) > 0:
props[x].append(".".join(elems)+","+value)
elif value != "":
props[x].append(value)
else:
props[x] = []
if x not in props:
props[x] = []
if len(elems) > 0:
props[x].append(".".join(elems)+","+value)
elif value != "":
props[x].append(value)
else:
props[x] = []
else:
return line

@@ -390,34 +308,12 @@ def main(args):
# api_key = args.api_key
input_path = args.input
# email = args.email
template = args.template
#template = args.template
mapping = args.mapping
schema = args.schema
mappings.VERBOSE = args.verbose
metadata = ""

# if template is provided, we should generate a template file
if template is not None:
if schema is None:
schema = MCODE_SCHEMA
# get metadata about version of MCODE_SCHEMA used:
metadata += "## schema based on version " + version('katsu') + ",\n"
direct_url = [p for p in files('katsu') if 'direct_url.json' in str(p)]
if len(direct_url) > 0:
d = json.loads(direct_url[0].read_text())
metadata += f"## directly checked out from {d['url']}, commit {d['vcs_info']['commit_id']}\n"
if schema == "candigv1":
schema = candigv1_schema
sc, node_names = generate_mapping_template(schema)

with open(f"{template}.csv", 'w') as f: # write to csv file for mapping
f.write(metadata)
f.write("## mcodepacket element, description (overwrite with mapped element)\n")
f.write("## (.0 is an array element) (* is required) (+ denotes ontology term),\n")
for nn in node_names:
f.write(f"{nn}\n")
return

# if mapping is provided, we should create a mapping scaffold
if mapping is not None:
manifest = load_manifest(mapping)
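The `process_ref` helper is removed above; its sheet-reference parsing now lives inline in `translate_mapping`. A minimal standalone sketch of that behaviour for reference (`split_sheet_ref` is a hypothetical name used only for illustration):

```python
import re

def split_sheet_ref(item):
    # Mirrors the parsing inlined in translate_mapping: "Sheet.column"
    # splits into the column name and a one-element list holding the
    # de-quoted sheet name; a bare column name passes through unchanged.
    item = item.strip()
    sheets = None
    sheet_match = re.match(r"(.+?)\.(.+)", item)
    if sheet_match is not None:
        item = sheet_match.group(2)
        sheets = [sheet_match.group(1).replace('"', '').replace("'", "")]
    return item, sheets

print(split_sheet_ref("Demographics.Site"))  # ('Site', ['Demographics'])
print(split_sheet_ref("Subject"))            # ('Subject', None)
```

Note that, unlike the old `process_ref`, the inlined version no longer strips quotes from the column name itself, only from the sheet name.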
120 changes: 12 additions & 108 deletions README.md
@@ -1,107 +1,6 @@
# clinical_ETL_code

Convert patient clinical data to mCode model for Katsu ingestion with Python

## Creating a mapping
You'll need to create a mapping scheme for converting your raw data into mCodepackets for use by katsu.

#### 1. Generate a template for the current installed version of katsu:

`$ python CSVConvert.py --template output_file`

#### 2. Fill in the fields in the template files by naming your mapping functions and their input.
Example:

`id, {single_val(Subject)}`

In this case, the `id` field in the mCODE packet is being mapped to the `Subject` column of the raw input data using the built-in `single_val` function.
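A rough sketch of what a built-in like `single_val` plausibly does — the body and the assumed shape of `mapping` (a dict of column name to extracted values) are illustrative assumptions, not the actual implementation in `mappings.py`:

```python
def single_val(mapping):
    # Assumed behaviour: take the lone column passed in and return its
    # single value (None if nothing was mapped for this patient).
    values = list(mapping.values())[0]
    if values is None:
        return None
    if isinstance(values, list):
        return values[0] if len(values) > 0 else None
    return values
```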

You can specify nodes to be mapped using custom mapping functions, passing in the set of fields from your raw dataset that you want to map:

`subject.extra_properties, {additional_functions.vital_signs_node(WEIGHT_BEFORE_STD, WEIGHT_BEFORE_STD_UN, WEIGHT_25_AGE_STD, WEIGHT_25_AGE_STD_UN, HEIGHT_STD, HEIGHT_STD_UN)}`

In this case, the `extra_properties` field in the mCODE packet is being mapped to multiple columns of the raw input data, and the custom mapping function is called `vital_signs_node` from the `additional_functions` module.

Note that you can specify the particular field from a particular raw input sheet/csv:
`Patient.provinceOfResidence, {COMPARISON.province_from_site(Demographics.Site)}`

**Notes about mapping functions:**

- The entries available in the template represent the data mappable by katsu's mCODEpacket.

- Data that should be a single value should use the built-in mapping function `single_val`; likewise any that should be a list/array should use `list_val`. **Note**: single value means a patient will only have one value for that field, such as `id` or `sex`. Something like `genetic_variant` would not be single value, since a single patient can have more than one.

- Some transformations are pretty standard, so they are provided in `mappings.py`. These don't require a module to be specified:
`subject.date_of_birth,{date(BIRTH_DATE_YM_RAW)}`

- Any additional data that you'd like to include that is not part of the mCODE fields needs to be mapped to one of the extra_properties dicts.

- Any mappings left blank will be stored as None in the resulting mCODEpacket.

- Entries that begin with `##` are informational: they can be overwritten or deleted completely from the mapping file.

- Entries that have an asterisk are required values for mCODEpacket. There is no validation for this at the moment so the tool will run even if there are missing required values.

- Entries that contain a 0 in the name represent array values: probably the best way to specify mappings for these is to pass all relevant data into a custom mapping function and create the array that way. If you choose to do this, don't also specify mappings for sub-entries in the mapping file, as all of the mapping will happen in the mapping function.

- Some editors (such as LibreOffice) insert commas in the template's empty fields and modify some names with hashtags. If the tool is not working, make sure to remove these characters using a text editor. Refer to this [example](https://github.com/CanDIG/clinical_ETL/blob/main/example/COMPARISON2mCODE.csv).

#### 3. Write your custom mapping functions.

Implement the custom mapping functions that you specified in your template file. These are functions that accept a single argument, `mapping`, and return a python object:

```python
# Example mapping function
def vital_signs_node(mapping):
vital_signs = {
'WEIGHT_BEFORE_STD': 'weight_before_illness',
'WEIGHT_BEFORE_STD_UN': 'weight_before_illness_unit',
'WEIGHT_25_AGE_STD': 'weight_around_25',
'WEIGHT_25_AGE_STD_UN': 'weight_around_25_unit',
'HEIGHT_STD': 'height',
'HEIGHT_STD_UN': 'height_unit'
}
new_dict = {}
for item in mapping.keys():
new_dict[vital_signs[item]] = mapping[item]
return new_dict
```
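Called on a subset of raw columns, the function simply renames them; for example (the input values here are made up):

```python
vital_signs_node({'HEIGHT_STD': 172, 'HEIGHT_STD_UN': 'cm'})
# -> {'height': 172, 'height_unit': 'cm'}
```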

#### 4. Create a new directory that contains your template and mapping functions, then create a `manifest.yml` file in the same directory with the following information:
- `description`: description of the mapping
- `identifier`: column used to identify patients in the input data
- `mapping`: template file
- `functions`: additional mapping functions
- `sheets`: lists of sheets in the clinical data:
    - raw (all sheets available for mapping)
    - final (subset of sheets actually used in the mapping)
- `indexed`: a list of sheets that need a numeric row index, e.g. for specifying particular rows. Any sheets here will have an `index` column available to mapping functions.

**Note:** Files should be specified as paths relative to the location of the manifest file.

Example:
```yaml
description: Test mapping of COMPARISON dataset to mCODEpacket format for katsu
identifier: Subject
mapping: your_mapping.csv
functions:
- additional_functions.py
sheets:
raw:
- Vital Signs
- Diagnosis
- Diagnosis 2
- Hematology
- Outcome
final:
- Vital Signs
- Diagnosis
- Outcome
indexed:
- Diagnosis
```
This repository converts MoH clinical data into the mcodepacket format needed for katsu. The cohort-specific mappings are implemented in a private GitHub repository, not here.

## Set-up & Installation
Prerequisites:
@@ -110,16 +9,21 @@ Prerequisites:

You'll need to set up a free [account](https://bioportal.bioontology.org/account) at NCBO BioPortal to obtain an API key.

## Running from command line
## Converting csvs to mcodepackets
```
$ python CSVConvert.py [-h] [--input INPUT] [--template TEMPLATE] [--mapping|manifest MAPPING]
$ python CSVConvert.py [-h] [--input INPUT] [--mapping|manifest MAPPING]

--input: path to dataset to be converted to mCODE data model

--template: If provided, generate a mapping template at the specified file (only needed if you are creating a new template sheet)

--mapping or --manifest: Path to a manifest file describing the mapping
```

The output mcode packets (`INPUT_map.json` and `INPUT_indexed.json`) will be in the parent of the `INPUT` directory.

## Generating mcode template file

The `generate_template.py` script will generate a template file based on the version of katsu specified in `requirements.txt`.
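Presumably it is invoked along these lines (an assumption — check the script's own help for its actual arguments):

```
$ python generate_template.py
```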

## Testing
Continuous Integration is implemented through Pytest and Travis CI, which runs when git pushes occur. Build results can be found at [this repository's Travis build page](https://travis-ci.com/github/CanDIG/medidata_mCode_ETL)

@@ -128,14 +32,14 @@ To run tests manually, enter from command line `$ pytest`
*Note: updated mCodePacket.json files must be pushed for all tests to pass during Travis builds*

## Creating a dummy json file for testing
You can use a template file (created as described above with `--template`) alone to create a dummy ingest file without actual data.
You can use an mcode template file (created as described above) alone to create a dummy ingest file without actual data.

`python create_test_mapping.py` creates a JSON that is filled in (without using mapping functions) with placeholder or dummy values. You can specify the placeholder value with the argument `--placeholder`. If no template file is specified with `--template`, the current MCODE_SCHEMA of katsu is used and the JSON is written to stdout. Otherwise, the file is saved to `<template>_testmap.json`.

This JSON file can be ingested into katsu and compared with the ingested value using https://github.com/CanDIG/candigv2-ingest/blob/main/katsu_validate_dataset.py.
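For example (the template file name and placeholder value here are illustrative):

```
$ python create_test_mapping.py --template your_mapping.csv --placeholder "dummy"
```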

## Quantifying coverage for datasets and mappings
The `validate_coverage.py` tool takes the same arguments as `CSVConvert.py`:
The `quantify_coverage.py` tool takes the same arguments as `CSVConvert.py`:
```
$ python CSVConvert.py [-h] [--input INPUT] [--mapping|manifest MAPPING]
