From 9a248a1d3508f5a8a9dd82c0e9eadfffd72714bd Mon Sep 17 00:00:00 2001
From: Daisie Huang
Date: Tue, 17 Oct 2023 17:20:41 -0700
Subject: [PATCH 01/29] save off indexed data on fail

---
 CSVConvert.py | 4 ++--
 mappings.py   | 4 ++++
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/CSVConvert.py b/CSVConvert.py
index bd87071..2ee3a2f 100644
--- a/CSVConvert.py
+++ b/CSVConvert.py
@@ -581,14 +581,14 @@ def csv_convert(input_path, manifest_file, verbose=False):
 
     # # read the raw data
     print("Reading raw data")
-    raw_csv_dfs, output_file = ingest_raw_data(input_path)
+    raw_csv_dfs, mappings.OUTPUT_FILE = ingest_raw_data(input_path)
     if not raw_csv_dfs:
         print(f"No ingestable files (csv or xlsx) were found at {input_path}")
         return
 
     print("Indexing data")
     mappings.INDEXED_DATA = process_data(raw_csv_dfs)
-    with open(f"{output_file}_indexed.json", 'w') as f:
+    with open(f"{mappings.OUTPUT_FILE}_indexed.json", 'w') as f:
         json.dump(mappings.INDEXED_DATA, f, indent=4)
 
     # if verbose flag is set, warn if column name is present in multiple sheets:
diff --git a/mappings.py b/mappings.py
index a290a27..d2235fe 100644
--- a/mappings.py
+++ b/mappings.py
@@ -9,9 +9,13 @@
 INDEX_STACK = []
 INDEXED_DATA = None
 CURRENT_LINE = ""
+OUTPUT_FILE = ""
 
 
 class MappingError(Exception):
+    with open(f"{mappings.OUTPUT_FILE}_indexed.json", 'w') as f:
+        json.dump(mappings.INDEXED_DATA, f, indent=4)
+
     def __init__(self, value):
         self.value = value
 

From 57c305d6d9c33afe9b06a2942ae16c8175cf4839 Mon Sep 17 00:00:00 2001
From: Daisie Huang
Date: Tue, 17 Oct 2023 17:23:01 -0700
Subject: [PATCH 02/29] Update mappings.py

---
 mappings.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mappings.py b/mappings.py
index d2235fe..0a36474 100644
--- a/mappings.py
+++ b/mappings.py
@@ -13,8 +13,8 @@
 
 
 class MappingError(Exception):
-    with open(f"{mappings.OUTPUT_FILE}_indexed.json", 'w') as f:
-        json.dump(mappings.INDEXED_DATA, f, indent=4)
+    with open(f"{OUTPUT_FILE}_indexed.json", 'w') as f:
+        json.dump(INDEXED_DATA, f, indent=4)
 
     def __init__(self, value):
         self.value = value

From 46996ed54a20859e180afb655e1e10994fbe5a0a Mon Sep 17 00:00:00 2001
From: Daisie Huang
Date: Tue, 17 Oct 2023 17:25:57 -0700
Subject: [PATCH 03/29] Update test_data_ingest.py

---
 test_data_ingest.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_data_ingest.py b/test_data_ingest.py
index 986ad42..e1a0f47 100644
--- a/test_data_ingest.py
+++ b/test_data_ingest.py
@@ -5,7 +5,7 @@
 from mohschema import MoHSchema
 
 # read sheet from given data pathway
-raw_csvs, output_file = CSVConvert.ingest_raw_data("test_data/pytest_data")
+raw_csvs, mappings.OUTPUT_FILE = CSVConvert.ingest_raw_data("test_data/pytest_data")
 mappings.IDENTIFIER_FIELD = "Subject"
 mappings.INDEXED_DATA = CSVConvert.process_data(raw_csvs)
 mappings._push_to_stack(None, None, mappings.IDENTIFIER)

From b4ab3e9d420ffdfd679f1f618ce20cf11d278002 Mon Sep 17 00:00:00 2001
From: Daisie Huang
Date: Tue, 17 Oct 2023 17:26:16 -0700
Subject: [PATCH 04/29] add concat_vals

---
 mappings.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/mappings.py b/mappings.py
index 0a36474..cb816a4 100644
--- a/mappings.py
+++ b/mappings.py
@@ -122,6 +122,14 @@ def flat_list_val(data_values):
     return all_items
 
 
+# concatenate several data values
+def concat_vals(data_values):
+    result = []
+    for x in data_values:
+        result.extend(data_values[x].values())
+    return "_".join(result)
+
+
+# Convert various responses to boolean
 # Convert various responses to boolean
 def boolean(data_values):
     cell = single_val(data_values)
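
[NOTE: PATCH 04] The new `concat_vals` helper walks the per-sheet dict of indexed values and joins everything with underscores. A minimal sketch of its behaviour, assuming `data_values` is shaped like the indexed data this pipeline builds ({sheet: {column: value}}) and that the inner values are strings; both are assumptions for illustration, not taken from the patch:

    # Hypothetical input; the real shape comes from mappings.INDEXED_DATA.
    data_values = {
        "Donor": {"submitter_donor_id": "DONOR_1"},
        "PrimaryDiagnosis": {"submitter_primary_diagnosis_id": "PD_1"},
    }

    def concat_vals(data_values):
        # as in PATCH 04: gather each sheet's values, then underscore-join
        result = []
        for x in data_values:
            result.extend(data_values[x].values())
        return "_".join(result)

    print(concat_vals(data_values))  # -> "DONOR_1_PD_1"

If the inner values were lists rather than strings, `"_".join` would raise a TypeError, so the helper presumably relies on the single-value shape.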
From 274d06a924078374f0bb6911b8fae7877fc2845d Mon Sep 17 00:00:00 2001
From: Daisie Huang
Date: Tue, 17 Oct 2023 17:28:57 -0700
Subject: [PATCH 05/29] Update CSVConvert.py

---
 CSVConvert.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/CSVConvert.py b/CSVConvert.py
index 2ee3a2f..9396aea 100644
--- a/CSVConvert.py
+++ b/CSVConvert.py
@@ -619,7 +619,7 @@ def csv_convert(input_path, manifest_file, verbose=False):
     if mappings._pop_from_stack() is not None:
         raise Exception(f"Stack not empty\n{mappings.IDENTIFIER_FIELD}: {mappings.IDENTIFIER}\n {mappings.INDEX_STACK}")
 
-    with open(f"{output_file}_indexed.json", 'w') as f:
+    with open(f"{mappings.OUTPUT_FILE}_indexed.json", 'w') as f:
         json.dump(mappings.INDEXED_DATA, f, indent=4)
 
     result = {
@@ -628,14 +628,14 @@ def csv_convert(input_path, manifest_file, verbose=False):
     }
     if schema.katsu_sha is not None:
         result["katsu_sha"] = schema.katsu_sha
-    with open(f"{output_file}_map.json", 'w') as f:  # write to json file for ingestion
+    with open(f"{mappings.OUTPUT_FILE}_map.json", 'w') as f:  # write to json file for ingestion
         json.dump(result, f, indent=4)
 
     # add validation data:
     schema.validate_ingest_map(result)
     result["validation_errors"] = schema.validation_failures
     result["statistics"] = schema.statistics
-    with open(f"{output_file}_map.json", 'w') as f:  # write to json file for ingestion
+    with open(f"{mappings.OUTPUT_FILE}_map.json", 'w') as f:  # write to json file for ingestion
         json.dump(result, f, indent=4)
 
     if len(result["validation_errors"]) > 0:

From ee57cb727d9722d20c978f418460a639b59cd960 Mon Sep 17 00:00:00 2001
From: Daisie Huang
Date: Tue, 17 Oct 2023 17:36:21 -0700
Subject: [PATCH 06/29] test

---
 CSVConvert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CSVConvert.py b/CSVConvert.py
index 9396aea..c69bf17 100644
--- a/CSVConvert.py
+++ b/CSVConvert.py
@@ -49,7 +49,7 @@ def map_data_to_scaffold(node, line, rownum):
         return result
     if "str" in str(type(node)) and node != "":
         result = eval_mapping(node, rownum)
-        verbose_print(f"Evaluated result is {result}")
+        verbose_print(f"Evaluated result is {result}, {node}, {rownum}")
         return result
     if "dict" in str(type(node)):
         result = {}

From 9c9e611b24c61b7966481213a5002febe38da1bf Mon Sep 17 00:00:00 2001
From: Daisie Huang
Date: Tue, 17 Oct 2023 17:42:10 -0700
Subject: [PATCH 07/29] Update CSVConvert.py

---
 CSVConvert.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CSVConvert.py b/CSVConvert.py
index c69bf17..ac19d28 100644
--- a/CSVConvert.py
+++ b/CSVConvert.py
@@ -59,6 +59,7 @@ def map_data_to_scaffold(node, line, rownum):
             linekey = f"{line}.{key}"
             dict = map_data_to_scaffold(node[key], f"{linekey}", rownum)
             if dict is not None:
+                print(dict)
                 result[key] = dict
         if result is not None and len(result) == 0:
             return None

From b29ed748b4dfd74290248a2ede636471e6bb89b3 Mon Sep 17 00:00:00 2001
From: Daisie Huang
Date: Tue, 17 Oct 2023 17:46:00 -0700
Subject: [PATCH 08/29] test

---
 CSVConvert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CSVConvert.py b/CSVConvert.py
index ac19d28..57f5b1f 100644
--- a/CSVConvert.py
+++ b/CSVConvert.py
@@ -59,7 +59,7 @@ def map_data_to_scaffold(node, line, rownum):
             linekey = f"{line}.{key}"
             dict = map_data_to_scaffold(node[key], f"{linekey}", rownum)
             if dict is not None:
-                print(dict)
+                print(dict, result)
                 result[key] = dict
         if result is not None and len(result) == 0:
             return None
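
[NOTE: PATCHES 01-05] The recurring edit `raw_csv_dfs, mappings.OUTPUT_FILE = ingest_raw_data(input_path)` unpacks the return tuple straight into a module attribute, so error paths anywhere in the run can recover the output filename without threading it through every call. A self-contained sketch of the pattern, using a stand-in namespace instead of the real mappings module (an assumption for illustration):

    import types

    # stand-in for the mappings module, which defines OUTPUT_FILE = "" at top level
    store = types.SimpleNamespace(OUTPUT_FILE="", INDEXED_DATA=None)

    def ingest_raw_data(input_path):
        # the real function reads csv/xlsx files; fake the (dataframes, filename) shape here
        return {"Donor": ["row"]}, f"{input_path}/raw_data"

    # tuple unpacking can assign directly into an attribute target
    raw_csv_dfs, store.OUTPUT_FILE = ingest_raw_data("test_data")
    assert store.OUTPUT_FILE == "test_data/raw_data"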
From 12cf2da99a0af22c8c4f25976222b5aff2bde6fc Mon Sep 17 00:00:00 2001
From: Daisie Huang
Date: Tue, 17 Oct 2023 17:47:32 -0700
Subject: [PATCH 09/29] Update CSVConvert.py

---
 CSVConvert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CSVConvert.py b/CSVConvert.py
index 57f5b1f..56188af 100644
--- a/CSVConvert.py
+++ b/CSVConvert.py
@@ -59,7 +59,7 @@ def map_data_to_scaffold(node, line, rownum):
             linekey = f"{line}.{key}"
             dict = map_data_to_scaffold(node[key], f"{linekey}", rownum)
             if dict is not None:
-                print(dict, result)
+                print(dict, key)
                 result[key] = dict
         if result is not None and len(result) == 0:
             return None

From 27d569365fecda012e4ea98dcf712d57eca3919f Mon Sep 17 00:00:00 2001
From: Daisie Huang
Date: Tue, 17 Oct 2023 17:47:57 -0700
Subject: [PATCH 10/29] Update CSVConvert.py

---
 CSVConvert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CSVConvert.py b/CSVConvert.py
index 56188af..4898481 100644
--- a/CSVConvert.py
+++ b/CSVConvert.py
@@ -59,7 +59,7 @@ def map_data_to_scaffold(node, line, rownum):
             linekey = f"{line}.{key}"
             dict = map_data_to_scaffold(node[key], f"{linekey}", rownum)
             if dict is not None:
-                print(dict, key)
+                print(dict, key, rownum)
                 result[key] = dict
         if result is not None and len(result) == 0:
             return None

From b110011bc53f74a9f43d3e4dc4f8bb225b223f87 Mon Sep 17 00:00:00 2001
From: Daisie Huang
Date: Tue, 17 Oct 2023 17:55:42 -0700
Subject: [PATCH 11/29] Update CSVConvert.py

---
 CSVConvert.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/CSVConvert.py b/CSVConvert.py
index 4898481..607291d 100644
--- a/CSVConvert.py
+++ b/CSVConvert.py
@@ -59,7 +59,14 @@ def map_data_to_scaffold(node, line, rownum):
             linekey = f"{line}.{key}"
             dict = map_data_to_scaffold(node[key], f"{linekey}", rownum)
             if dict is not None:
-                print(dict, key, rownum)
+                # if "CALCULATED" not in mappings.INDEXED_DATA["data"]:
+                #     mappings.INDEXED_DATA["data"]["CALCULATED"] = {}
+                # if mappings.IDENTIFIER not in mappings.INDEXED_DATA["data"]["CALCULATED"]:
+                #     mappings.INDEXED_DATA["data"]["CALCULATED"] = {}
+                # if key not in mappings.INDEXED_DATA["data"]["CALCULATED"]:
+                #     mappings.INDEXED_DATA["data"]["CALCULATED"][key] = []
+
+                print(f"HELLO {dict}, {key}, {rownum}")
                 result[key] = dict
         if result is not None and len(result) == 0:
             return None

From d3d8516e2a8726c54fcb8e71931af8db269cf9c9 Mon Sep 17 00:00:00 2001
From: Daisie Huang
Date: Tue, 17 Oct 2023 17:57:21 -0700
Subject: [PATCH 12/29] Update CSVConvert.py

---
 CSVConvert.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/CSVConvert.py b/CSVConvert.py
index 607291d..2fec61b 100644
--- a/CSVConvert.py
+++ b/CSVConvert.py
@@ -59,13 +59,13 @@ def map_data_to_scaffold(node, line, rownum):
             linekey = f"{line}.{key}"
             dict = map_data_to_scaffold(node[key], f"{linekey}", rownum)
             if dict is not None:
-                # if "CALCULATED" not in mappings.INDEXED_DATA["data"]:
-                #     mappings.INDEXED_DATA["data"]["CALCULATED"] = {}
-                # if mappings.IDENTIFIER not in mappings.INDEXED_DATA["data"]["CALCULATED"]:
-                #     mappings.INDEXED_DATA["data"]["CALCULATED"] = {}
-                # if key not in mappings.INDEXED_DATA["data"]["CALCULATED"]:
-                #     mappings.INDEXED_DATA["data"]["CALCULATED"][key] = []
-
+                if "CALCULATED" not in mappings.INDEXED_DATA["data"]:
+                    mappings.INDEXED_DATA["data"]["CALCULATED"] = {}
+                if mappings.IDENTIFIER not in mappings.INDEXED_DATA["data"]["CALCULATED"]:
+                    mappings.INDEXED_DATA["data"]["CALCULATED"] = {}
+                if key not in mappings.INDEXED_DATA["data"]["CALCULATED"]:
+                    mappings.INDEXED_DATA["data"]["CALCULATED"][key] = []
+                mappings.INDEXED_DATA["data"]["CALCULATED"][key].append(dict)
print(f"HELLO {dict}, {key}, {rownum}") result[key] = dict if result is not None and len(result) == 0: From 7ba1276b49703364461e0eb53c2cee47a0b3b24c Mon Sep 17 00:00:00 2001 From: Daisie Huang Date: Tue, 17 Oct 2023 17:58:29 -0700 Subject: [PATCH 13/29] Update mappings.py --- mappings.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mappings.py b/mappings.py index cb816a4..2269ac5 100644 --- a/mappings.py +++ b/mappings.py @@ -14,6 +14,7 @@ class MappingError(Exception): with open(f"{OUTPUT_FILE}_indexed.json", 'w') as f: + print("HOWDY") json.dump(INDEXED_DATA, f, indent=4) def __init__(self, value): From 3792740e93a16a5895bd4809b75bc7e692c3f3c2 Mon Sep 17 00:00:00 2001 From: Daisie Huang Date: Tue, 17 Oct 2023 17:59:50 -0700 Subject: [PATCH 14/29] Update mappings.py --- mappings.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mappings.py b/mappings.py index 2269ac5..b1f7baf 100644 --- a/mappings.py +++ b/mappings.py @@ -13,14 +13,12 @@ class MappingError(Exception): - with open(f"{OUTPUT_FILE}_indexed.json", 'w') as f: - print("HOWDY") - json.dump(INDEXED_DATA, f, indent=4) - def __init__(self, value): self.value = value def __str__(self): + with open(f"{OUTPUT_FILE}_indexed.json", 'w') as f: + json.dump(INDEXED_DATA, f, indent=4) return repr(f"Check the values for {IDENTIFIER} in {IDENTIFIER_FIELD}: {self.value}") From 6e6b15bbcb2361db55f3db9242ea1a253388a161 Mon Sep 17 00:00:00 2001 From: Daisie Huang Date: Tue, 17 Oct 2023 18:01:44 -0700 Subject: [PATCH 15/29] Update CSVConvert.py --- CSVConvert.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CSVConvert.py b/CSVConvert.py index 2fec61b..b3dc795 100644 --- a/CSVConvert.py +++ b/CSVConvert.py @@ -62,10 +62,10 @@ def map_data_to_scaffold(node, line, rownum): if "CALCULATED" not in mappings.INDEXED_DATA["data"]: mappings.INDEXED_DATA["data"]["CALCULATED"] = {} if mappings.IDENTIFIER not in mappings.INDEXED_DATA["data"]["CALCULATED"]: - mappings.INDEXED_DATA["data"]["CALCULATED"] = {} - if key not in mappings.INDEXED_DATA["data"]["CALCULATED"]: - mappings.INDEXED_DATA["data"]["CALCULATED"][key] = [] - mappings.INDEXED_DATA["data"]["CALCULATED"][key].append(dict) + mappings.INDEXED_DATA["data"]["CALCULATED"][mappings.IDENTIFIER] = {} + if key not in mappings.INDEXED_DATA["data"]["CALCULATED"][mappings.IDENTIFIER]: + mappings.INDEXED_DATA["data"]["CALCULATED"][mappings.IDENTIFIER][key] = [] + mappings.INDEXED_DATA["data"]["CALCULATED"][mappings.IDENTIFIER][key].append(dict) print(f"HELLO {dict}, {key}, {rownum}") result[key] = dict if result is not None and len(result) == 0: From 1c9164369ae8a5aa5953e681146cda79e821f9fe Mon Sep 17 00:00:00 2001 From: Daisie Huang Date: Tue, 17 Oct 2023 18:05:18 -0700 Subject: [PATCH 16/29] Update CSVConvert.py --- CSVConvert.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CSVConvert.py b/CSVConvert.py index b3dc795..6068012 100644 --- a/CSVConvert.py +++ b/CSVConvert.py @@ -66,6 +66,10 @@ def map_data_to_scaffold(node, line, rownum): if key not in mappings.INDEXED_DATA["data"]["CALCULATED"][mappings.IDENTIFIER]: mappings.INDEXED_DATA["data"]["CALCULATED"][mappings.IDENTIFIER][key] = [] mappings.INDEXED_DATA["data"]["CALCULATED"][mappings.IDENTIFIER][key].append(dict) + if key not in mappings.INDEXED_DATA["columns"]: + mappings.INDEXED_DATA["columns"][key] = [] + if "CALCULATED" not in mappings.INDEXED_DATA["columns"][key]: + mappings.INDEXED_DATA["columns"][key].append("CALCULATED") print(f"HELLO {dict}, {key}, {rownum}") result[key] = dict 
if result is not None and len(result) == 0: From 5c20a02f6c22478019aa84d9867fb8243ec88cb0 Mon Sep 17 00:00:00 2001 From: Daisie Huang Date: Tue, 17 Oct 2023 18:41:13 -0700 Subject: [PATCH 17/29] Update CSVConvert.py --- CSVConvert.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CSVConvert.py b/CSVConvert.py index 6068012..cb90513 100644 --- a/CSVConvert.py +++ b/CSVConvert.py @@ -70,7 +70,6 @@ def map_data_to_scaffold(node, line, rownum): mappings.INDEXED_DATA["columns"][key] = [] if "CALCULATED" not in mappings.INDEXED_DATA["columns"][key]: mappings.INDEXED_DATA["columns"][key].append("CALCULATED") - print(f"HELLO {dict}, {key}, {rownum}") result[key] = dict if result is not None and len(result) == 0: return None @@ -205,7 +204,7 @@ def get_row_for_stack_top(sheet, rownum): result = {} for param in mappings.INDEXED_DATA["data"][sheet][mappings.IDENTIFIER].keys(): result[param] = mappings.INDEXED_DATA["data"][sheet][mappings.IDENTIFIER][param][rownum] - verbose_print(f"get_row_for_stack_top is {result}") + verbose_print(f"get_row_for_stack_top {sheet} is {result}") return result From 266ea3e2008802bde40a2b0a73029b5e70e14e9e Mon Sep 17 00:00:00 2001 From: Daisie Huang Date: Wed, 18 Oct 2023 10:29:43 -0700 Subject: [PATCH 18/29] Update test2moh.csv --- test_data/test2moh.csv | 1 + 1 file changed, 1 insertion(+) diff --git a/test_data/test2moh.csv b/test_data/test2moh.csv index f8344d0..c34be00 100644 --- a/test_data/test2moh.csv +++ b/test_data/test2moh.csv @@ -1,3 +1,4 @@ +## THIS IS A TEST FILE: DO NOT USE FOR EXAMPLE PURPOSES ## Schema generated from https://raw.githubusercontent.com/CanDIG/katsu/develop/chord_metadata_service/mohpackets/docs/schema.yml ## Based on repo commit sha "29fd55d173b7a01daa72fcc89187e3aabd1fb51e" ## MoH template is manually updated to match the MoH clinical data model From 01526bd50c122daffc430a7247d41504c8049b83 Mon Sep 17 00:00:00 2001 From: Daisie Huang Date: Tue, 10 Oct 2023 10:10:40 -0700 Subject: [PATCH 19/29] Don't add required_but_missing fields to failure list --- schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/schema.py b/schema.py index a32b6c0..5567aeb 100644 --- a/schema.py +++ b/schema.py @@ -336,7 +336,7 @@ def validate_schema(self, schema_name, map_json): } self.statistics["required_but_missing"][schema_name][f]["total"] += 1 if f not in map_json: - self.warn(f"{f} required for {schema_name}") + # self.warn(f"{f} required for {schema_name}") self.statistics["required_but_missing"][schema_name][f]["missing"] += 1 if case not in self.statistics["cases_missing_data"]: self.statistics["cases_missing_data"].append(case) From 7b46225baf843561c493780bf326bad3bdf47910 Mon Sep 17 00:00:00 2001 From: Daisie Huang Date: Tue, 10 Oct 2023 10:15:14 -0700 Subject: [PATCH 20/29] update tests --- test_data_ingest.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test_data_ingest.py b/test_data_ingest.py index e1a0f47..9d901f9 100644 --- a/test_data_ingest.py +++ b/test_data_ingest.py @@ -101,7 +101,7 @@ def test_donor_2(packets): def test_validation(packets, schema): schema.validate_ingest_map({"donors": packets}) print(schema.validation_failures) - assert len(schema.validation_failures) == 9 + assert len(schema.validation_failures) == 8 # should be the following 9 failures: # DONOR_5: cause_of_death required if is_deceased = Yes # DONOR_5: date_of_death required if is_deceased = Yes @@ -109,7 +109,6 @@ def test_validation(packets, schema): # DONOR_5 > PD_5 > SPECIMEN_6: Tumour specimens 
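
[NOTE: PATCH 19] Missing required fields stop being validation failures and are only tallied. A runnable sketch of the counter shape the diff implies (the statistics layout is inferred from the context lines, not from the full file):

    statistics = {"required_but_missing": {}, "cases_missing_data": []}

    def count_missing(schema_name, field, map_json, case):
        per_schema = statistics["required_but_missing"].setdefault(schema_name, {})
        counts = per_schema.setdefault(field, {"total": 0, "missing": 0})
        counts["total"] += 1                # every time the field is checked
        if field not in map_json:
            counts["missing"] += 1          # only when it is absent
            if case not in statistics["cases_missing_data"]:
                statistics["cases_missing_data"].append(case)

    count_missing("donors", "date_of_birth", {}, "DONOR_1")
    assert statistics["required_but_missing"]["donors"]["date_of_birth"] == {"total": 1, "missing": 1}

This is presumably why PATCH 20 drops the expected failure count from 9 to 8: the removed "response_to_treatment required" line reads like a required-field warning, which now lands in statistics rather than in validation_failures.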
From 249b6bc9cd30881e81cc196779537cca9c1b4b7c Mon Sep 17 00:00:00 2001
From: Daisie Huang
Date: Thu, 19 Oct 2023 11:25:55 -0700
Subject: [PATCH 21/29] rename first_key to root_schema

---
 schema.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/schema.py b/schema.py
index 5567aeb..a474c89 100644
--- a/schema.py
+++ b/schema.py
@@ -300,10 +300,10 @@ def validate_ingest_map(self, map_json):
             self.validation_schema[key]["extra_args"] = {
                 "index": 0
             }
-        first_key = list(self.validation_schema.keys())[0]
-        for x in range(0, len(map_json[first_key])):
-            jsonschema.validate(map_json[first_key][x], self.json_schema)
-            self.validate_schema(first_key, map_json[first_key][x])
+        root_schema = list(self.validation_schema.keys())[0]
+        for x in range(0, len(map_json[root_schema])):
+            jsonschema.validate(map_json[root_schema][x], self.json_schema)
+            self.validate_schema(root_schema, map_json[root_schema][x])
 
         self.statistics["schemas_not_used"] = list(set(self.validation_schema.keys()) -
                                                    set(self.statistics["schemas_used"]))
         self.statistics["summary_cases"] = {

From 4ba3e39aa5a21c90529bfc5dd3d7e24060597646 Mon Sep 17 00:00:00 2001
From: Daisie Huang
Date: Thu, 19 Oct 2023 12:49:05 -0700
Subject: [PATCH 22/29] Check for duplicate IDs within schemas

---
 schema.py | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/schema.py b/schema.py
index a474c89..dc548e0 100644
--- a/schema.py
+++ b/schema.py
@@ -7,6 +7,7 @@
 from copy import deepcopy
 import jsonschema
 import dateparser
+from collections import Counter
 
 
 class ValidationError(Exception):
@@ -50,6 +51,7 @@ class BaseSchema:
     def __init__(self, url, simple=False):
         self.validation_failures = []
         self.statistics = {}
+        self.identifiers = {}
         self.stack_location = []
         self.schema = {}
         self.openapi_url = url
@@ -106,12 +108,22 @@ def warn(self, message):
-        message = " > ".join(self.stack_location) + ": " + message
+        prefix = " > ".join(self.stack_location)
+        if prefix.strip() == "":
+            prefix = ""
+        else:
+            prefix += ": "
+        message = prefix + message
         self.validation_failures.append(f"{message}")
 
     def fail(self, message):
-        message = " > ".join(self.stack_location) + ": " + message
+        prefix = " > ".join(self.stack_location)
+        if prefix.strip() == "":
+            prefix = ""
+        else:
+            prefix += ": "
+        message = prefix + message
         raise ValidationError(message)
 
@@ -304,7 +316,12 @@ def validate_ingest_map(self, map_json):
         for x in range(0, len(map_json[root_schema])):
             jsonschema.validate(map_json[root_schema][x], self.json_schema)
             self.validate_schema(root_schema, map_json[root_schema][x])
-
+        for schema in self.identifiers:
+            most_common = self.identifiers[schema].most_common()
+            if most_common[0][1] > 1:
+                for x in most_common:
+                    if x[1] > 1:
+                        self.warn(f"Duplicated IDs: in schema {schema}, {x[0]} occurs {x[1]} times")
         self.statistics["schemas_not_used"] = list(set(self.validation_schema.keys()) -
                                                    set(self.statistics["schemas_used"]))
         self.statistics["summary_cases"] = {
             "complete_cases": len(map_json["donors"]) - len(self.statistics["cases_missing_data"]),
@@ -316,6 +333,9 @@ def validate_schema(self, schema_name, map_json):
         id = f"{self.validation_schema[schema_name]['name']} {self.validation_schema[schema_name]['extra_args']['index']}"
         if self.validation_schema[schema_name]["id"] is not None:
             id = map_json[self.validation_schema[schema_name]["id"]]
+        if schema_name not in self.identifiers:
+            self.identifiers[schema_name] = Counter()
+        self.identifiers[schema_name].update([id])
         required_fields = self.validation_schema[schema_name]["required_fields"]
         nested_schemas = self.validation_schema[schema_name]["nested_schemas"]
         self.stack_location.append(str(id))

From 0258ebc3ba169b2f9d652440f9cdc873af6538fd Mon Sep 17 00:00:00 2001
From: Daisie Huang
Date: Thu, 19 Oct 2023 12:49:39 -0700
Subject: [PATCH 23/29] add test for duplicate ID validation

---
 test_data/raw_data/Followup.csv | 1 +
 test_data_ingest.py             | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/test_data/raw_data/Followup.csv b/test_data/raw_data/Followup.csv
index 7cd953b..a1fa9ea 100644
--- a/test_data/raw_data/Followup.csv
+++ b/test_data/raw_data/Followup.csv
@@ -3,3 +3,4 @@ FOLLOW_UP_1,DONOR_1,PD_1,,2022-08,Loco-regional progression,Distant recurrence/m
 FOLLOW_UP_2,DONOR_1,,TR_1,2022-08,Loco-regional progression,Biochemical progression,2022-05,Imaging (procedure)|Laboratory data interpretation (procedure),C05,Lugano staging system,T1d,N1mi,M1a(0),Stage IVBS,
 FOLLOW_UP_3,DONOR_1,,,2022-08,Loco-regional progression,Distant recurrence/metastasis,2022-01,Imaging (procedure)|Laboratory data interpretation (procedure),C06,SEER staging system,T2(m),N2c,M1b(1),Stage IIBES,
 FOLLOW_UP_4,DONOR_1,,,2022-08,Loco-regional progression,Biochemical progression,2022-05,Imaging (procedure)|Laboratory data interpretation (procedure),C05,Lugano staging system,T1d,N1mi,M1a(0),Stage IVBS,
+FOLLOW_UP_4,DONOR_6,,,2022-07,Loco-regional progression,Biochemical progression,2022-05,Imaging (procedure)|Laboratory data interpretation (procedure),C05,Lugano staging system,T1d,N1mi,M1a(0),Stage IVBS,
diff --git a/test_data_ingest.py b/test_data_ingest.py
index 9d901f9..52ff8d3 100644
--- a/test_data_ingest.py
+++ b/test_data_ingest.py
@@ -101,7 +101,7 @@ def test_validation(packets, schema):
     schema.validate_ingest_map({"donors": packets})
     print(schema.validation_failures)
-    assert len(schema.validation_failures) == 8
+    assert len(schema.validation_failures) == 9
     # should be the following 9 failures:
     # DONOR_5: cause_of_death required if is_deceased = Yes
     # DONOR_5: date_of_death required if is_deceased = Yes
     # DONOR_5 > PD_5 > SPECIMEN_6: Tumour specimens require a reference_pathology_confirmed_diagnosis
     # DONOR_5 > PD_5 > TR_5 > Radiation 1: Only one radiation is allowed per treatment
     # DONOR_5 > PD_5 > TR_5 > Radiation 1: reference_radiation_treatment_id required if radiation_boost = Yes
     # DONOR_5 > PD_5 > TR_10: treatment type Immunotherapy should have one or more immunotherapies submitted
     # DONOR_6 > PD_6 > TR_9 > Surgery 0: submitter_specimen_id SPECIMEN_43 does not correspond to one of the available specimen_ids ['SPECIMEN_3']
+    # Duplicated IDs: in schema followups, FOLLOW_UP_4 occurs 2 times
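
[NOTE: PATCH 22] The duplicate-ID check is a straightforward use of collections.Counter: every ID seen for a schema is fed to update(), and most_common(), which sorts by count descending, makes "any duplicates at all?" a single [0][1] > 1 test. A self-contained sketch of the same pattern, with illustrative schema and ID names:

    from collections import Counter

    identifiers = {"followups": Counter()}
    for seen_id in ["FOLLOW_UP_1", "FOLLOW_UP_4", "FOLLOW_UP_4"]:
        identifiers["followups"].update([seen_id])

    for schema_name, counter in identifiers.items():
        most_common = counter.most_common()   # [(id, count), ...] highest count first
        if most_common and most_common[0][1] > 1:
            for dup_id, count in most_common:
                if count > 1:
                    print(f"Duplicated IDs: in schema {schema_name}, {dup_id} occurs {count} times")

In the patch, a Counter entry only exists after at least one update, so most_common[0] is safe there; the sketch adds a truthiness guard anyway to stay robust against empty counters.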
From 13fcec585204d26ca29ce9479e4d272e3262ce67 Mon Sep 17 00:00:00 2001
From: Daisie Huang
Date: Thu, 19 Oct 2023 12:49:55 -0700
Subject: [PATCH 24/29] duplicate IDs in different schemas are OK

---
 test_data/raw_data/Followup.csv         | 1 +
 test_data/raw_data/PrimaryDiagnosis.csv | 1 +
 test_data_ingest.py                     | 5 +++++
 3 files changed, 7 insertions(+)

diff --git a/test_data/raw_data/Followup.csv b/test_data/raw_data/Followup.csv
index a1fa9ea..93615d3 100644
--- a/test_data/raw_data/Followup.csv
+++ b/test_data/raw_data/Followup.csv
@@ -4,3 +4,4 @@ FOLLOW_UP_2,DONOR_1,,TR_1,2022-08,Loco-regional progression,Biochemical progress
 FOLLOW_UP_3,DONOR_1,,,2022-08,Loco-regional progression,Distant recurrence/metastasis,2022-01,Imaging (procedure)|Laboratory data interpretation (procedure),C06,SEER staging system,T2(m),N2c,M1b(1),Stage IIBES,
 FOLLOW_UP_4,DONOR_1,,,2022-08,Loco-regional progression,Biochemical progression,2022-05,Imaging (procedure)|Laboratory data interpretation (procedure),C05,Lugano staging system,T1d,N1mi,M1a(0),Stage IVBS,
 FOLLOW_UP_4,DONOR_6,,,2022-07,Loco-regional progression,Biochemical progression,2022-05,Imaging (procedure)|Laboratory data interpretation (procedure),C05,Lugano staging system,T1d,N1mi,M1a(0),Stage IVBS,
+DUPLICATE_ID,DONOR_4,,,2022-08,Loco-regional progression,Biochemical progression,2022-05,Imaging (procedure)|Laboratory data interpretation (procedure),C05,Lugano staging system,T1d,N1mi,M1a(0),Stage IVBS,
diff --git a/test_data/raw_data/PrimaryDiagnosis.csv b/test_data/raw_data/PrimaryDiagnosis.csv
index 764bb5d..31291cd 100644
--- a/test_data/raw_data/PrimaryDiagnosis.csv
+++ b/test_data/raw_data/PrimaryDiagnosis.csv
@@ -2,6 +2,7 @@ submitter_donor_id, submitter_primary_diagnosis_id, date_of_diagnosis, cancer_ty
 DONOR_1,PD_1,1_2018,C43.1,Cytology,No lymph nodes found in resected specimen,Lymph node dissection/pathological exam,5,International Neuroblastoma Staging System,,,,Stage 1,Left
 DONOR_2,PD_2,3/2020,C04.9,Specific tumour markers,Not applicable,Physical palpation of patient,4,Rai staging system,,,,Stage 1A,Bilateral
 DONOR_3,PD_3,5/2018,C43.9,Unknown,Yes,Imaging,5,AJCC 7th edition,T0,N0,M1a,,Left
+DONOR_3,DUPLICATE_ID,5/2018,C43.9,Unknown,Yes,Imaging,5,AJCC 7th edition,T0,N0,M1a,,Left
 DONOR_4,PD_4,1_2018,C64.9,Death certificate only,Not applicable,Physical palpation of patient,67,Revised International staging system (RISS),,,,Stage 1B,"Unilateral, side not specified"
 DONOR_5,PD_5,3/2020,C64.9,Death certificate only,Yes,Lymph node dissection/pathological exam,5,Revised International staging system (RISS),T1,N0a,M0,,Left
 DONOR_6,PD_6,5/2018,C02.2,Specific tumour markers,No,Physical palpation of patient,2,International Neuroblastoma Staging System,,,,Stage C,"Unilateral, side not specified"
diff --git a/test_data_ingest.py b/test_data_ingest.py
index 52ff8d3..089ddb0 100644
--- a/test_data_ingest.py
+++ b/test_data_ingest.py
@@ -2,6 +2,7 @@
 import yaml
 import CSVConvert
 import mappings
+import json
 from mohschema import MoHSchema
 
 # read sheet from given data pathway
@@ -113,6 +114,10 @@ def test_validation(packets, schema):
     # DONOR_6 > PD_6 > TR_9 > Surgery 0: submitter_specimen_id SPECIMEN_43 does not correspond to one of the available specimen_ids ['SPECIMEN_3']
     # Duplicated IDs: in schema followups, FOLLOW_UP_4 occurs 2 times
 
+    # there should be an item named DUPLICATE_ID in both followup and sample_registration
+    print(json.dumps(schema.identifiers, indent=2))
+    assert schema.identifiers["followups"]["DUPLICATE_ID"] == 1
+    assert schema.identifiers["primary_diagnoses"]["DUPLICATE_ID"] == 1
 
 # test mapping that uses values from multiple sheets:
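
[NOTE: PATCHES 23-24] Taken together, the test data pins down the intended semantics: the same ID appearing twice within one schema (FOLLOW_UP_4 in Followup.csv) must produce a "Duplicated IDs" warning, while reuse of an ID across different schemas (DUPLICATE_ID in both Followup.csv and PrimaryDiagnosis.csv) is allowed, because each schema keeps its own Counter. A condensed restatement of that contract in pytest style (the `schema` fixture wiring from test_data_ingest.py is assumed):

    def test_duplicate_id_semantics(schema):
        # within one schema: two occurrences, flagged by the validator
        assert schema.identifiers["followups"]["FOLLOW_UP_4"] == 2
        # across schemas: one occurrence each, no warning expected
        assert schema.identifiers["followups"]["DUPLICATE_ID"] == 1
        assert schema.identifiers["primary_diagnoses"]["DUPLICATE_ID"] == 1

One observation: the in-code comment says "followup and sample_registration", but the assertions actually check the followups and primary_diagnoses schemas.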
From 8c8016850c71afb11066aeda727b0aff7fe27ee6 Mon Sep 17 00:00:00 2001
From: Daisie Huang
Date: Thu, 19 Oct 2023 16:36:38 -0700
Subject: [PATCH 25/29] check for empty arrays

---
 mohschema.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/mohschema.py b/mohschema.py
index 931f546..a10e85d 100644
--- a/mohschema.py
+++ b/mohschema.py
@@ -316,19 +316,19 @@ def validate_treatments(self, map_json):
         for type in map_json["treatment_type"]:
             match type:
                 case "Chemotherapy":
-                    if "chemotherapies" not in map_json:
+                    if "chemotherapies" not in map_json or len(map_json["chemotherapies"]) == 0:
                         self.warn("treatment type Chemotherapy should have one or more chemotherapies submitted")
                 case "Hormonal therapy":
-                    if "hormone_therapies" not in map_json:
+                    if "hormone_therapies" not in map_json or len(map_json["hormone_therapies"]) == 0:
                         self.warn("treatment type Hormonal therapy should have one or more hormone_therapies submitted")
                 case "Immunotherapy":
-                    if "immunotherapies" not in map_json:
+                    if "immunotherapies" not in map_json or len(map_json["immunotherapies"]) == 0:
                         self.warn("treatment type Immunotherapy should have one or more immunotherapies submitted")
                 case "Radiation therapy":
-                    if "radiations" not in map_json:
+                    if "radiations" not in map_json or len(map_json["radiations"]) == 0:
                         self.warn("treatment type Radiation therapy should have one or more radiation submitted")
                 case "Surgery":
-                    if "surgeries" not in map_json:
+                    if "surgeries" not in map_json or len(map_json["surgeries"]) == 0:
                         self.warn("treatment type Surgery should have one or more surgery submitted")

From 63fea21c72ebd8921ff16ebbc9c2bac4bfe871ea Mon Sep 17 00:00:00 2001
From: Daisie Huang
Date: Thu, 19 Oct 2023 19:33:26 -0700
Subject: [PATCH 26/29] add a lot of checks for None

---
 mohschema.py | 97 ++++++++++++++++++++++++----------------------------
 1 file changed, 45 insertions(+), 52 deletions(-)

diff --git a/mohschema.py b/mohschema.py
index a10e85d..88ddfa5 100644
--- a/mohschema.py
+++ b/mohschema.py
@@ -194,29 +194,34 @@ def validate_donors(self, map_json):
                     if "date_of_death" not in map_json:
                         self.warn("date_of_death required if is_deceased = Yes")
                 case "lost_to_followup_after_clinical_event_identifier":
-                    if map_json["is_deceased"]:
-                        self.warn("lost_to_followup_after_clinical_event_identifier cannot be present if is_deceased = Yes")
+                    if map_json["lost_to_followup_after_clinical_event_identifier"] is not None:
+                        if map_json["is_deceased"]:
+                            self.warn("lost_to_followup_after_clinical_event_identifier cannot be present if is_deceased = Yes")
                 case "lost_to_followup_reason":
-                    if "lost_to_followup_after_clinical_event_identifier" not in map_json:
-                        self.warn("lost_to_followup_reason should only be submitted if lost_to_followup_after_clinical_event_identifier is submitted")
+                    if map_json["lost_to_followup_reason"] is not None:
+                        if "lost_to_followup_after_clinical_event_identifier" not in map_json:
+                            self.warn("lost_to_followup_reason should only be submitted if lost_to_followup_after_clinical_event_identifier is submitted")
                 case "date_alive_after_lost_to_followup":
-                    if "lost_to_followup_after_clinical_event_identifier" not in map_json:
-                        self.warn("lost_to_followup_after_clinical_event_identifier needs to be submitted if date_alive_after_lost_to_followup is submitted")
+                    if map_json["date_alive_after_lost_to_followup"] is not None:
+                        if "lost_to_followup_after_clinical_event_identifier" not in map_json:
+                            self.warn("lost_to_followup_after_clinical_event_identifier needs to be submitted if date_alive_after_lost_to_followup is submitted")
                 case "cause_of_death":
-                    if not map_json["is_deceased"]:
-                        self.warn("cause_of_death should only be submitted if is_deceased = Yes")
+                    if map_json["cause_of_death"] is not None:
+                        if not map_json["is_deceased"]:
map_json["is_deceased"]: + self.warn("cause_of_death should only be submitted if is_deceased = Yes") case "date_of_death": - if not map_json["is_deceased"]: - self.warn("date_of_death should only be submitted if is_deceased = Yes") - else: - if map_json["date_of_death"] is not None and map_json["date_of_birth"] is not None: - death = dateparser.parse(map_json["date_of_death"]).date() - birth = dateparser.parse(map_json["date_of_birth"]).date() - if birth > death: - self.warn("date_of_death cannot be earlier than date_of_birth") + if map_json["date_of_death"] is not None: + if not map_json["is_deceased"]: + self.warn("date_of_death should only be submitted if is_deceased = Yes") + else: + if map_json["date_of_birth"] is not None: + death = dateparser.parse(map_json["date_of_death"]).date() + birth = dateparser.parse(map_json["date_of_birth"]).date() + if birth > death: + self.warn("date_of_death cannot be earlier than date_of_birth") case "biomarkers": for x in map_json["biomarkers"]: - if "test_date" not in x: + if "test_date" not in x or x["test_date"] is None: self.warn("test_date is necessary for biomarkers not associated with nested events") @@ -226,12 +231,12 @@ def validate_primary_diagnoses(self, map_json): is_tumour = False # should either have a clinical staging system specified # OR have a specimen with a pathological staging system specified - if "clinical_tumour_staging_system" in map_json: + if "clinical_tumour_staging_system" in map_json and map_json["clinical_tumour_staging_system"] is not None: is_tumour = True if "specimens" in map_json: for specimen in map_json["specimens"]: specimen_ids.append(specimen["submitter_specimen_id"]) - if "pathological_tumour_staging_system" in specimen: + if "pathological_tumour_staging_system" in specimen and specimen["pathological_tumour_staging_system"] is not None: is_tumour = True self.validation_schema["primary_diagnoses"]["extra_args"]["specimen_ids"] = specimen_ids @@ -241,9 +246,9 @@ def validate_primary_diagnoses(self, map_json): match prop: case "lymph_nodes_examined_status": if map_json["lymph_nodes_examined_status"]: - if "lymph_nodes_examined_method" not in map_json: + if "lymph_nodes_examined_method" not in map_json or map_json["lymph_nodes_examined_method"] is None: self.warn("lymph_nodes_examined_method required if lymph_nodes_examined_status = Yes") - if "number_lymph_nodes_positive" not in map_json: + if "number_lymph_nodes_positive" not in map_json or map_json["number_lymph_nodes_positive"] is None: self.warn("number_lymph_nodes_positive required if lymph_nodes_examined_status = Yes") case "clinical_tumour_staging_system": self.validate_staging_system(map_json, "clinical") @@ -254,7 +259,7 @@ def validate_specimens(self, map_json): # Presence of tumour_histological_type means we have a tumour sample if "tumour_histological_type" in map_json: if not is_clinical_tumour: - if "pathological_tumour_staging_system" not in map_json: + if "pathological_tumour_staging_system" not in map_json or map_json["pathological_tumour_staging_system"] is None: self.warn("Tumour specimens without clinical_tumour_staging_system require a pathological_tumour_staging_system") else: self.validate_staging_system(map_json, "pathological") @@ -333,36 +338,24 @@ def validate_treatments(self, map_json): def validate_chemotherapies(self, map_json): - for prop in map_json: - match prop: - case "prescribed_cumulative_drug_dose": - if "chemotherapy_drug_dose_units" not in map_json: - self.warn("chemotherapy_drug_dose_units required if 
prescribed_cumulative_drug_dose is submitted") - case "actual_cumulative_drug_dose": - if "chemotherapy_drug_dose_units" not in map_json: - self.warn("chemotherapy_drug_dose_units required if actual_cumulative_drug_dose is submitted") + if "chemotherapy_drug_dose_units" not in map_json or map_json["chemotherapy_drug_dose_units"] is None: + for x in ["prescribed_cumulative_drug_dose", "actual_cumulative_drug_dose"]: + if x in map_json and map_json[x] is not None: + self.warn(f"chemotherapy_drug_dose_units required if {x} is submitted") def validate_hormone_therapies(self, map_json): - for prop in map_json: - match prop: - case "prescribed_cumulative_drug_dose": - if "hormone_drug_dose_units" not in map_json: - self.warn("hormone_drug_dose_units required if prescribed_cumulative_drug_dose is submitted") - case "actual_cumulative_drug_dose": - if "hormone_drug_dose_units" not in map_json: - self.warn("hormone_drug_dose_units required if actual_cumulative_drug_dose is submitted") + if "hormone_drug_dose_units" not in map_json or map_json["hormone_drug_dose_units"] is None: + for x in ["prescribed_cumulative_drug_dose", "actual_cumulative_drug_dose"]: + if x in map_json and map_json[x] is not None: + self.warn(f"hormone_drug_dose_units required if {x} is submitted") def validate_immunotherapies(self, map_json): - for prop in map_json: - match prop: - case "prescribed_cumulative_drug_dose": - if "immunotherapy_drug_dose_units" not in map_json: - self.warn("immunotherapy_drug_dose_units required if prescribed_cumulative_drug_dose is submitted") - case "actual_cumulative_drug_dose": - if "immunotherapy_drug_dose_units" not in map_json: - self.warn("immunotherapy_drug_dose_units required if actual_cumulative_drug_dose is submitted") + if "immunotherapy_drug_dose_units" not in map_json or map_json["immunotherapy_drug_dose_units"] is None: + for x in ["prescribed_cumulative_drug_dose", "actual_cumulative_drug_dose"]: + if x in map_json and map_json[x] is not None: + self.warn(f"immunotherapy_drug_dose_units required if {x} is submitted") def validate_radiations(self, map_json): @@ -374,7 +367,7 @@ def validate_radiations(self, map_json): match prop: case "radiation_boost": if map_json["radiation_boost"]: - if "reference_radiation_treatment_id" not in map_json: + if "reference_radiation_treatment_id" not in map_json or map_json["reference_radiation_treatment_id"] is None: self.warn("reference_radiation_treatment_id required if radiation_boost = Yes") @@ -385,9 +378,9 @@ def validate_surgeries(self, map_json): self.warn("Only one surgery is allowed per treatment") if "submitter_specimen_id" not in map_json: - if "surgery_site" not in map_json: + if "surgery_site" not in map_json or map_json["surgery_site"] is None: self.warn("surgery_site required if submitter_specimen_id not submitted") - if "surgery_location" not in map_json: + if "surgery_location" not in map_json or map_json["surgery_location"] is None: self.warn("surgery_location required if submitter_specimen_id not submitted") else: if map_json["submitter_specimen_id"] not in specimen_ids: @@ -404,7 +397,7 @@ def validate_comorbidities(self, map_json): def validate_exposures(self, map_json): is_smoker = False - if "tobacco_smoking_status" not in map_json: + if "tobacco_smoking_status" not in map_json or map_json["tobacco_smoking_status"] is None: self.fail("tobacco_smoking_status required for exposure") else: if map_json["tobacco_smoking_status"] in [ @@ -433,8 +426,8 @@ def validate_staging_system(self, map_json, staging_type): "m_category" ] 
         for f in required_fields:
-            if f"{staging_type}_{f}" not in map_json:
+            if f"{staging_type}_{f}" not in map_json or map_json[f"{staging_type}_{f}"] is None:
                 self.warn(f"{staging_type}_{f} is required if {staging_type}_tumour_staging_system is AJCC")
     else:
-        if f"{staging_type}_stage_group" not in map_json:
+        if f"{staging_type}_stage_group" not in map_json or map_json[f"{staging_type}_stage_group"] is None:
             self.warn(f"{staging_type}_stage_group is required for {staging_type}_tumour_staging_system {map_json[f'{staging_type}_tumour_staging_system']}")

From c22c1557603f7e443f3ded2e4cf042b7a5460583 Mon Sep 17 00:00:00 2001
From: Daisie Huang
Date: Thu, 19 Oct 2023 19:36:18 -0700
Subject: [PATCH 27/29] double print

---
 validate_coverage.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/validate_coverage.py b/validate_coverage.py
index 21d261b..077afa3 100644
--- a/validate_coverage.py
+++ b/validate_coverage.py
@@ -240,7 +240,5 @@ def main(args):
     for line in result:
         print(line)
 
-    print(json.dumps(result, indent=4))
-
 if __name__ == '__main__':
     main(parse_args())
\ No newline at end of file

From 60f9ebf38b20a45f87d1fed5111debf085d769b3 Mon Sep 17 00:00:00 2001
From: Daisie Huang
Date: Wed, 25 Oct 2023 19:50:55 -0700
Subject: [PATCH 28/29] this is moved to data repo instead of code

---
 ingest_redcap_data.py | 102 ------------------------------------------
 1 file changed, 102 deletions(-)
 delete mode 100644 ingest_redcap_data.py

diff --git a/ingest_redcap_data.py b/ingest_redcap_data.py
deleted file mode 100644
index 86c7531..0000000
--- a/ingest_redcap_data.py
+++ /dev/null
@@ -1,102 +0,0 @@
-"""
-Methods to transform the redcap raw data into the csv format expected by
-CSVConcert.py
-"""
-
-import os
-import argparse
-import re
-import pandas
-import json
-from pathlib import Path
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--input', type=str, required = True, help="Raw csv output from Redcap")
-    parser.add_argument('--verbose', '--v', action="store_true", help="Print extra information")
-    parser.add_argument('--output', type=str, default="tmp_out", help="Optional name of output directory in same directory as input; default tmp_out")
-    args = parser.parse_args()
-    return args
-
-def ingest_redcap_files(file):
-    """Test of ingest of redcap output files"""
-    raw_csv_dfs = {}
-    file_match = re.match(r"(.+)\.csv$", file)
-    if file_match is not None:
-        print(f"Reading input file {file}")
-        try:
-            df = pandas.read_csv(file, dtype=str, encoding = "latin-1")
-            #print(f"initial df shape: {df.shape}")
-            # find and drop empty columns
-            df = drop_empty_columns(df)
-            # now we do some renaming, becuase for reasons we don't understand
-            # the program_id and submitter_donor_id columns are swapped
-            df.rename(columns={'program_id':'tempname'},inplace=True)
-            df.rename(columns={'submitter_donor_id':'program_id'},inplace=True)
-            df.rename(columns={'tempname':'submitter_donor_id'},inplace=True)
-            raw_csv_dfs[file] = df
-        except Exception as e:
-            raise Exception(f"File {file} does not seem to be a valid csv file")
-    else:
-        raise Exception(f"File {file} does not seem to be a csv file")
-    return raw_csv_dfs
-
-def extract_repeat_instruments(df):
-    """ Transforms the single (very sparse) dataframe into one dataframe per
-    MoH schema. This makes it easier to look at, and also eliminates a bunch
-    of pandas warnings."""
-    new_dfs={}
-    starting_rows = df.shape[0]
-    repeat_instruments = df['redcap_repeat_instrument'].dropna().unique()
-    total_rows = 0
-    for i in repeat_instruments:
-        # each row has a redcap_repeat_instrument that describes the schema
-        # (e.g. Treatment) and a redcap_repeat_instance that is an id for that
-        # schema (this would be the treatment.id)
-        print(f"Extracting schema {i}")
-        schema_df = df.loc[df['redcap_repeat_instrument'] == i]
-        # drop all of the empty columns that aren't relevent for this schema
-        schema_df = drop_empty_columns(schema_df)
-        # rename the redcap_repeat_instance to the specific id (e.g. treatment_id)
-        schema_df.rename(columns={
-            'redcap_repeat_instance': f"{i}_id"
-            },
-            inplace=True
-        )
-        total_rows += schema_df.shape[0]
-        new_dfs[i]=schema_df
-
-    # now save all of the rows that aren't a repeat_instrument and
-    # label them Singleton for now
-    singletons = df.loc[df['redcap_repeat_instrument'].isnull()]
-    singletons = drop_empty_columns(singletons)
-    # check that we have all of the rows
-    if (total_rows + singletons.shape[0] < starting_rows):
-        print("Warning: not all rows recovered in raw data")
-    new_dfs['Singleton']=singletons
-    return new_dfs
-
-def drop_empty_columns(df):
-    empty_cols = [col for col in df if df[col].isnull().all()]
-    df = df.drop(empty_cols, axis=1)
-    return df
-
-def output_dfs(input_path,output_dir,df_list):
-    parent_path = Path(input_path).parent
-    tmpdir = Path(parent_path,output_dir)
-    if not tmpdir.is_dir():
-        tmpdir.mkdir()
-    print(f"Writing output files to {tmpdir}")
-    for d in df_list:
-        df_list[d].to_csv(Path(tmpdir,f"{d}.csv"), index=False)
-
-def main(args):
-    input_path = args.input
-
-    raw_csv_dfs = ingest_redcap_files(input_path)
-    new_dfs = extract_repeat_instruments(raw_csv_dfs[input_path])
-    output_dir = args.output
-    output_dfs(input_path,output_dir,new_dfs)
-
-if __name__ == '__main__':
-    main(parse_args())

From d1d393710278f8b07df05abc2ccdcb51aca8e7f8 Mon Sep 17 00:00:00 2001
From: Daisie Huang
Date: Wed, 25 Oct 2023 20:06:47 -0700
Subject: [PATCH 29/29] make sure there is an ID in the map_json, if required

---
 schema.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/schema.py b/schema.py
index dc548e0..afda888 100644
--- a/schema.py
+++ b/schema.py
@@ -331,7 +331,7 @@ def validate_ingest_map(self, map_json):
 
     def validate_schema(self, schema_name, map_json):
         id = f"{self.validation_schema[schema_name]['name']} {self.validation_schema[schema_name]['extra_args']['index']}"
-        if self.validation_schema[schema_name]["id"] is not None:
+        if self.validation_schema[schema_name]["id"] is not None and self.validation_schema[schema_name]["id"] in map_json:
             id = map_json[self.validation_schema[schema_name]["id"]]
         if schema_name not in self.identifiers:
             self.identifiers[schema_name] = Counter()
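
[NOTE: PATCHES 25-26, 29] The recurring fix across these patches is that "key absent" and "key present but None" must be treated the same, presumably because the converted JSON can carry explicit nulls. A tiny helper expressing the idiom (hypothetical; the patches inline the checks instead):

    def is_submitted(map_json, field):
        """True only if the field exists and carries a non-None value."""
        return field in map_json and map_json[field] is not None

    # e.g. the chemotherapy dose-unit rule from PATCH 26, restated:
    def check_chemo_units(map_json, warn):
        if not is_submitted(map_json, "chemotherapy_drug_dose_units"):
            for x in ["prescribed_cumulative_drug_dose", "actual_cumulative_drug_dose"]:
                if is_submitted(map_json, x):
                    warn(f"chemotherapy_drug_dose_units required if {x} is submitted")

    check_chemo_units({"prescribed_cumulative_drug_dose": 40}, print)

PATCH 25 applies the same idea to lists ("not in map_json or len(...) == 0"), and PATCH 29 to the schema's ID field, so an absent ID falls back to the synthetic "<name> <index>" label instead of raising a KeyError.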