From 9a248a1d3508f5a8a9dd82c0e9eadfffd72714bd Mon Sep 17 00:00:00 2001
From: Daisie Huang
Date: Tue, 17 Oct 2023 17:20:41 -0700
Subject: [PATCH 01/29] save off indexed data on fail

---
 CSVConvert.py | 4 ++--
 mappings.py   | 4 ++++
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/CSVConvert.py b/CSVConvert.py
index bd87071..2ee3a2f 100644
--- a/CSVConvert.py
+++ b/CSVConvert.py
@@ -581,14 +581,14 @@ def csv_convert(input_path, manifest_file, verbose=False):
 
     # # read the raw data
     print("Reading raw data")
-    raw_csv_dfs, output_file = ingest_raw_data(input_path)
+    raw_csv_dfs, mappings.OUTPUT_FILE = ingest_raw_data(input_path)
     if not raw_csv_dfs:
         print(f"No ingestable files (csv or xlsx) were found at {input_path}")
         return
 
     print("Indexing data")
     mappings.INDEXED_DATA = process_data(raw_csv_dfs)
-    with open(f"{output_file}_indexed.json", 'w') as f:
+    with open(f"{mappings.OUTPUT_FILE}_indexed.json", 'w') as f:
         json.dump(mappings.INDEXED_DATA, f, indent=4)
 
     # if verbose flag is set, warn if column name is present in multiple sheets:
diff --git a/mappings.py b/mappings.py
index a290a27..d2235fe 100644
--- a/mappings.py
+++ b/mappings.py
@@ -9,9 +9,13 @@
 INDEX_STACK = []
 INDEXED_DATA = None
 CURRENT_LINE = ""
+OUTPUT_FILE = ""
 
 
 class MappingError(Exception):
+    with open(f"{mappings.OUTPUT_FILE}_indexed.json", 'w') as f:
+        json.dump(mappings.INDEXED_DATA, f, indent=4)
+
     def __init__(self, value):
         self.value = value
 

From 57c305d6d9c33afe9b06a2942ae16c8175cf4839 Mon Sep 17 00:00:00 2001
From: Daisie Huang
Date: Tue, 17 Oct 2023 17:23:01 -0700
Subject: [PATCH 02/29] Update mappings.py

---
 mappings.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mappings.py b/mappings.py
index d2235fe..0a36474 100644
--- a/mappings.py
+++ b/mappings.py
@@ -13,8 +13,8 @@
 
 
 class MappingError(Exception):
-    with open(f"{mappings.OUTPUT_FILE}_indexed.json", 'w') as f:
-        json.dump(mappings.INDEXED_DATA, f, indent=4)
+    with open(f"{OUTPUT_FILE}_indexed.json", 'w') as f:
+        json.dump(INDEXED_DATA, f, indent=4)
 
     def __init__(self, value):
         self.value = value

From 46996ed54a20859e180afb655e1e10994fbe5a0a Mon Sep 17 00:00:00 2001
From: Daisie Huang
Date: Tue, 17 Oct 2023 17:25:57 -0700
Subject: [PATCH 03/29] Update test_data_ingest.py

---
 test_data_ingest.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_data_ingest.py b/test_data_ingest.py
index 986ad42..e1a0f47 100644
--- a/test_data_ingest.py
+++ b/test_data_ingest.py
@@ -5,7 +5,7 @@
 from mohschema import MoHSchema
 
 # read sheet from given data pathway
-raw_csvs, output_file = CSVConvert.ingest_raw_data("test_data/pytest_data")
+raw_csvs, mappings.OUTPUT_FILE = CSVConvert.ingest_raw_data("test_data/pytest_data")
 mappings.IDENTIFIER_FIELD = "Subject"
 mappings.INDEXED_DATA = CSVConvert.process_data(raw_csvs)
 mappings._push_to_stack(None, None, mappings.IDENTIFIER)

From b4ab3e9d420ffdfd679f1f618ce20cf11d278002 Mon Sep 17 00:00:00 2001
From: Daisie Huang
Date: Tue, 17 Oct 2023 17:26:16 -0700
Subject: [PATCH 04/29] add concat_vals

---
 mappings.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/mappings.py b/mappings.py
index 0a36474..cb816a4 100644
--- a/mappings.py
+++ b/mappings.py
@@ -122,6 +122,14 @@ def flat_list_val(data_values):
     return all_items
 
 
+# concatenate several data values
+def concat_vals(data_values):
+    result = []
+    for x in data_values:
+        result.extend(data_values[x].values())
+    return "_".join(result)
+
+
+# Convert various responses to boolean
 # Convert various responses to boolean
 def boolean(data_values):
     cell = single_val(data_values)
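
[NOTE: PATCH 04] The new `concat_vals` helper walks the per-sheet dict of indexed values and joins everything with underscores. A minimal sketch of its behaviour, assuming `data_values` is shaped like the indexed data this pipeline builds ({sheet: {column: value}}) and that the inner values are strings; both are assumptions for illustration, not taken from the patch:

    # Hypothetical input; the real shape comes from mappings.INDEXED_DATA.
    data_values = {
        "Donor": {"submitter_donor_id": "DONOR_1"},
        "PrimaryDiagnosis": {"submitter_primary_diagnosis_id": "PD_1"},
    }

    def concat_vals(data_values):
        # as in PATCH 04: gather each sheet's values, then underscore-join
        result = []
        for x in data_values:
            result.extend(data_values[x].values())
        return "_".join(result)

    print(concat_vals(data_values))  # -> "DONOR_1_PD_1"

If the inner values were lists rather than strings, `"_".join` would raise a TypeError, so the helper presumably relies on the single-value shape.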
From 274d06a924078374f0bb6911b8fae7877fc2845d Mon Sep 17 00:00:00 2001
From: Daisie Huang
Date: Tue, 17 Oct 2023 17:28:57 -0700
Subject: [PATCH 05/29] Update CSVConvert.py

---
 CSVConvert.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/CSVConvert.py b/CSVConvert.py
index 2ee3a2f..9396aea 100644
--- a/CSVConvert.py
+++ b/CSVConvert.py
@@ -619,7 +619,7 @@ def csv_convert(input_path, manifest_file, verbose=False):
     if mappings._pop_from_stack() is not None:
         raise Exception(f"Stack not empty\n{mappings.IDENTIFIER_FIELD}: {mappings.IDENTIFIER}\n {mappings.INDEX_STACK}")
 
-    with open(f"{output_file}_indexed.json", 'w') as f:
+    with open(f"{mappings.OUTPUT_FILE}_indexed.json", 'w') as f:
         json.dump(mappings.INDEXED_DATA, f, indent=4)
 
     result = {
@@ -628,14 +628,14 @@ def csv_convert(input_path, manifest_file, verbose=False):
     }
     if schema.katsu_sha is not None:
         result["katsu_sha"] = schema.katsu_sha
-    with open(f"{output_file}_map.json", 'w') as f:  # write to json file for ingestion
+    with open(f"{mappings.OUTPUT_FILE}_map.json", 'w') as f:  # write to json file for ingestion
         json.dump(result, f, indent=4)
 
     # add validation data:
     schema.validate_ingest_map(result)
     result["validation_errors"] = schema.validation_failures
     result["statistics"] = schema.statistics
-    with open(f"{output_file}_map.json", 'w') as f:  # write to json file for ingestion
+    with open(f"{mappings.OUTPUT_FILE}_map.json", 'w') as f:  # write to json file for ingestion
         json.dump(result, f, indent=4)
 
     if len(result["validation_errors"]) > 0:

From ee57cb727d9722d20c978f418460a639b59cd960 Mon Sep 17 00:00:00 2001
From: Daisie Huang
Date: Tue, 17 Oct 2023 17:36:21 -0700
Subject: [PATCH 06/29] test

---
 CSVConvert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CSVConvert.py b/CSVConvert.py
index 9396aea..c69bf17 100644
--- a/CSVConvert.py
+++ b/CSVConvert.py
@@ -49,7 +49,7 @@ def map_data_to_scaffold(node, line, rownum):
         return result
     if "str" in str(type(node)) and node != "":
         result = eval_mapping(node, rownum)
-        verbose_print(f"Evaluated result is {result}")
+        verbose_print(f"Evaluated result is {result}, {node}, {rownum}")
         return result
     if "dict" in str(type(node)):
         result = {}

From 9c9e611b24c61b7966481213a5002febe38da1bf Mon Sep 17 00:00:00 2001
From: Daisie Huang
Date: Tue, 17 Oct 2023 17:42:10 -0700
Subject: [PATCH 07/29] Update CSVConvert.py

---
 CSVConvert.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CSVConvert.py b/CSVConvert.py
index c69bf17..ac19d28 100644
--- a/CSVConvert.py
+++ b/CSVConvert.py
@@ -59,6 +59,7 @@ def map_data_to_scaffold(node, line, rownum):
             linekey = f"{line}.{key}"
             dict = map_data_to_scaffold(node[key], f"{linekey}", rownum)
             if dict is not None:
+                print(dict)
                 result[key] = dict
         if result is not None and len(result) == 0:
             return None

From b29ed748b4dfd74290248a2ede636471e6bb89b3 Mon Sep 17 00:00:00 2001
From: Daisie Huang
Date: Tue, 17 Oct 2023 17:46:00 -0700
Subject: [PATCH 08/29] test

---
 CSVConvert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CSVConvert.py b/CSVConvert.py
index ac19d28..57f5b1f 100644
--- a/CSVConvert.py
+++ b/CSVConvert.py
@@ -59,7 +59,7 @@ def map_data_to_scaffold(node, line, rownum):
             linekey = f"{line}.{key}"
             dict = map_data_to_scaffold(node[key], f"{linekey}", rownum)
             if dict is not None:
-                print(dict)
+                print(dict, result)
                 result[key] = dict
         if result is not None and len(result) == 0:
             return None
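
[NOTE: PATCHES 01-05] The recurring edit `raw_csv_dfs, mappings.OUTPUT_FILE = ingest_raw_data(input_path)` unpacks the return tuple straight into a module attribute, so error paths anywhere in the run can recover the output filename without threading it through every call. A self-contained sketch of the pattern, using a stand-in namespace instead of the real mappings module (an assumption for illustration):

    import types

    # stand-in for the mappings module, which defines OUTPUT_FILE = "" at top level
    store = types.SimpleNamespace(OUTPUT_FILE="", INDEXED_DATA=None)

    def ingest_raw_data(input_path):
        # the real function reads csv/xlsx files; fake the (dataframes, filename) shape here
        return {"Donor": ["row"]}, f"{input_path}/raw_data"

    # tuple unpacking can assign directly into an attribute target
    raw_csv_dfs, store.OUTPUT_FILE = ingest_raw_data("test_data")
    assert store.OUTPUT_FILE == "test_data/raw_data"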
From 12cf2da99a0af22c8c4f25976222b5aff2bde6fc Mon Sep 17 00:00:00 2001
From: Daisie Huang
Date: Tue, 17 Oct 2023 17:47:32 -0700
Subject: [PATCH 09/29] Update CSVConvert.py

---
 CSVConvert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CSVConvert.py b/CSVConvert.py
index 57f5b1f..56188af 100644
--- a/CSVConvert.py
+++ b/CSVConvert.py
@@ -59,7 +59,7 @@ def map_data_to_scaffold(node, line, rownum):
             linekey = f"{line}.{key}"
             dict = map_data_to_scaffold(node[key], f"{linekey}", rownum)
             if dict is not None:
-                print(dict, result)
+                print(dict, key)
                 result[key] = dict
         if result is not None and len(result) == 0:
             return None

From 27d569365fecda012e4ea98dcf712d57eca3919f Mon Sep 17 00:00:00 2001
From: Daisie Huang
Date: Tue, 17 Oct 2023 17:47:57 -0700
Subject: [PATCH 10/29] Update CSVConvert.py

---
 CSVConvert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CSVConvert.py b/CSVConvert.py
index 56188af..4898481 100644
--- a/CSVConvert.py
+++ b/CSVConvert.py
@@ -59,7 +59,7 @@ def map_data_to_scaffold(node, line, rownum):
             linekey = f"{line}.{key}"
             dict = map_data_to_scaffold(node[key], f"{linekey}", rownum)
             if dict is not None:
-                print(dict, key)
+                print(dict, key, rownum)
                 result[key] = dict
         if result is not None and len(result) == 0:
             return None

From b110011bc53f74a9f43d3e4dc4f8bb225b223f87 Mon Sep 17 00:00:00 2001
From: Daisie Huang
Date: Tue, 17 Oct 2023 17:55:42 -0700
Subject: [PATCH 11/29] Update CSVConvert.py

---
 CSVConvert.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/CSVConvert.py b/CSVConvert.py
index 4898481..607291d 100644
--- a/CSVConvert.py
+++ b/CSVConvert.py
@@ -59,7 +59,14 @@ def map_data_to_scaffold(node, line, rownum):
             linekey = f"{line}.{key}"
             dict = map_data_to_scaffold(node[key], f"{linekey}", rownum)
             if dict is not None:
-                print(dict, key, rownum)
+                # if "CALCULATED" not in mappings.INDEXED_DATA["data"]:
+                #     mappings.INDEXED_DATA["data"]["CALCULATED"] = {}
+                # if mappings.IDENTIFIER not in mappings.INDEXED_DATA["data"]["CALCULATED"]:
+                #     mappings.INDEXED_DATA["data"]["CALCULATED"] = {}
+                # if key not in mappings.INDEXED_DATA["data"]["CALCULATED"]:
+                #     mappings.INDEXED_DATA["data"]["CALCULATED"][key] = []
+
+                print(f"HELLO {dict}, {key}, {rownum}")
                 result[key] = dict
         if result is not None and len(result) == 0:
             return None

From d3d8516e2a8726c54fcb8e71931af8db269cf9c9 Mon Sep 17 00:00:00 2001
From: Daisie Huang
Date: Tue, 17 Oct 2023 17:57:21 -0700
Subject: [PATCH 12/29] Update CSVConvert.py

---
 CSVConvert.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/CSVConvert.py b/CSVConvert.py
index 607291d..2fec61b 100644
--- a/CSVConvert.py
+++ b/CSVConvert.py
@@ -59,13 +59,13 @@ def map_data_to_scaffold(node, line, rownum):
             linekey = f"{line}.{key}"
             dict = map_data_to_scaffold(node[key], f"{linekey}", rownum)
             if dict is not None:
-                # if "CALCULATED" not in mappings.INDEXED_DATA["data"]:
-                #     mappings.INDEXED_DATA["data"]["CALCULATED"] = {}
-                # if mappings.IDENTIFIER not in mappings.INDEXED_DATA["data"]["CALCULATED"]:
-                #     mappings.INDEXED_DATA["data"]["CALCULATED"] = {}
-                # if key not in mappings.INDEXED_DATA["data"]["CALCULATED"]:
-                #     mappings.INDEXED_DATA["data"]["CALCULATED"][key] = []
-
+                if "CALCULATED" not in mappings.INDEXED_DATA["data"]:
+                    mappings.INDEXED_DATA["data"]["CALCULATED"] = {}
+                if mappings.IDENTIFIER not in mappings.INDEXED_DATA["data"]["CALCULATED"]:
+                    mappings.INDEXED_DATA["data"]["CALCULATED"] = {}
+                if key not in mappings.INDEXED_DATA["data"]["CALCULATED"]:
+                    mappings.INDEXED_DATA["data"]["CALCULATED"][key] = []
+                mappings.INDEXED_DATA["data"]["CALCULATED"][key].append(dict)
print(f"HELLO {dict}, {key}, {rownum}") result[key] = dict if result is not None and len(result) == 0: From 7ba1276b49703364461e0eb53c2cee47a0b3b24c Mon Sep 17 00:00:00 2001 From: Daisie Huang Date: Tue, 17 Oct 2023 17:58:29 -0700 Subject: [PATCH 13/29] Update mappings.py --- mappings.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mappings.py b/mappings.py index cb816a4..2269ac5 100644 --- a/mappings.py +++ b/mappings.py @@ -14,6 +14,7 @@ class MappingError(Exception): with open(f"{OUTPUT_FILE}_indexed.json", 'w') as f: + print("HOWDY") json.dump(INDEXED_DATA, f, indent=4) def __init__(self, value): From 3792740e93a16a5895bd4809b75bc7e692c3f3c2 Mon Sep 17 00:00:00 2001 From: Daisie Huang Date: Tue, 17 Oct 2023 17:59:50 -0700 Subject: [PATCH 14/29] Update mappings.py --- mappings.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mappings.py b/mappings.py index 2269ac5..b1f7baf 100644 --- a/mappings.py +++ b/mappings.py @@ -13,14 +13,12 @@ class MappingError(Exception): - with open(f"{OUTPUT_FILE}_indexed.json", 'w') as f: - print("HOWDY") - json.dump(INDEXED_DATA, f, indent=4) - def __init__(self, value): self.value = value def __str__(self): + with open(f"{OUTPUT_FILE}_indexed.json", 'w') as f: + json.dump(INDEXED_DATA, f, indent=4) return repr(f"Check the values for {IDENTIFIER} in {IDENTIFIER_FIELD}: {self.value}") From 6e6b15bbcb2361db55f3db9242ea1a253388a161 Mon Sep 17 00:00:00 2001 From: Daisie Huang Date: Tue, 17 Oct 2023 18:01:44 -0700 Subject: [PATCH 15/29] Update CSVConvert.py --- CSVConvert.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CSVConvert.py b/CSVConvert.py index 2fec61b..b3dc795 100644 --- a/CSVConvert.py +++ b/CSVConvert.py @@ -62,10 +62,10 @@ def map_data_to_scaffold(node, line, rownum): if "CALCULATED" not in mappings.INDEXED_DATA["data"]: mappings.INDEXED_DATA["data"]["CALCULATED"] = {} if mappings.IDENTIFIER not in mappings.INDEXED_DATA["data"]["CALCULATED"]: - mappings.INDEXED_DATA["data"]["CALCULATED"] = {} - if key not in mappings.INDEXED_DATA["data"]["CALCULATED"]: - mappings.INDEXED_DATA["data"]["CALCULATED"][key] = [] - mappings.INDEXED_DATA["data"]["CALCULATED"][key].append(dict) + mappings.INDEXED_DATA["data"]["CALCULATED"][mappings.IDENTIFIER] = {} + if key not in mappings.INDEXED_DATA["data"]["CALCULATED"][mappings.IDENTIFIER]: + mappings.INDEXED_DATA["data"]["CALCULATED"][mappings.IDENTIFIER][key] = [] + mappings.INDEXED_DATA["data"]["CALCULATED"][mappings.IDENTIFIER][key].append(dict) print(f"HELLO {dict}, {key}, {rownum}") result[key] = dict if result is not None and len(result) == 0: From 1c9164369ae8a5aa5953e681146cda79e821f9fe Mon Sep 17 00:00:00 2001 From: Daisie Huang Date: Tue, 17 Oct 2023 18:05:18 -0700 Subject: [PATCH 16/29] Update CSVConvert.py --- CSVConvert.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CSVConvert.py b/CSVConvert.py index b3dc795..6068012 100644 --- a/CSVConvert.py +++ b/CSVConvert.py @@ -66,6 +66,10 @@ def map_data_to_scaffold(node, line, rownum): if key not in mappings.INDEXED_DATA["data"]["CALCULATED"][mappings.IDENTIFIER]: mappings.INDEXED_DATA["data"]["CALCULATED"][mappings.IDENTIFIER][key] = [] mappings.INDEXED_DATA["data"]["CALCULATED"][mappings.IDENTIFIER][key].append(dict) + if key not in mappings.INDEXED_DATA["columns"]: + mappings.INDEXED_DATA["columns"][key] = [] + if "CALCULATED" not in mappings.INDEXED_DATA["columns"][key]: + mappings.INDEXED_DATA["columns"][key].append("CALCULATED") print(f"HELLO {dict}, {key}, {rownum}") result[key] = dict 
if result is not None and len(result) == 0: From 5c20a02f6c22478019aa84d9867fb8243ec88cb0 Mon Sep 17 00:00:00 2001 From: Daisie Huang Date: Tue, 17 Oct 2023 18:41:13 -0700 Subject: [PATCH 17/29] Update CSVConvert.py --- CSVConvert.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CSVConvert.py b/CSVConvert.py index 6068012..cb90513 100644 --- a/CSVConvert.py +++ b/CSVConvert.py @@ -70,7 +70,6 @@ def map_data_to_scaffold(node, line, rownum): mappings.INDEXED_DATA["columns"][key] = [] if "CALCULATED" not in mappings.INDEXED_DATA["columns"][key]: mappings.INDEXED_DATA["columns"][key].append("CALCULATED") - print(f"HELLO {dict}, {key}, {rownum}") result[key] = dict if result is not None and len(result) == 0: return None @@ -205,7 +204,7 @@ def get_row_for_stack_top(sheet, rownum): result = {} for param in mappings.INDEXED_DATA["data"][sheet][mappings.IDENTIFIER].keys(): result[param] = mappings.INDEXED_DATA["data"][sheet][mappings.IDENTIFIER][param][rownum] - verbose_print(f"get_row_for_stack_top is {result}") + verbose_print(f"get_row_for_stack_top {sheet} is {result}") return result From 266ea3e2008802bde40a2b0a73029b5e70e14e9e Mon Sep 17 00:00:00 2001 From: Daisie Huang Date: Wed, 18 Oct 2023 10:29:43 -0700 Subject: [PATCH 18/29] Update test2moh.csv --- test_data/test2moh.csv | 1 + 1 file changed, 1 insertion(+) diff --git a/test_data/test2moh.csv b/test_data/test2moh.csv index f8344d0..c34be00 100644 --- a/test_data/test2moh.csv +++ b/test_data/test2moh.csv @@ -1,3 +1,4 @@ +## THIS IS A TEST FILE: DO NOT USE FOR EXAMPLE PURPOSES ## Schema generated from https://raw.githubusercontent.com/CanDIG/katsu/develop/chord_metadata_service/mohpackets/docs/schema.yml ## Based on repo commit sha "29fd55d173b7a01daa72fcc89187e3aabd1fb51e" ## MoH template is manually updated to match the MoH clinical data model From 01526bd50c122daffc430a7247d41504c8049b83 Mon Sep 17 00:00:00 2001 From: Daisie Huang Date: Tue, 10 Oct 2023 10:10:40 -0700 Subject: [PATCH 19/29] Don't add required_but_missing fields to failure list --- schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/schema.py b/schema.py index a32b6c0..5567aeb 100644 --- a/schema.py +++ b/schema.py @@ -336,7 +336,7 @@ def validate_schema(self, schema_name, map_json): } self.statistics["required_but_missing"][schema_name][f]["total"] += 1 if f not in map_json: - self.warn(f"{f} required for {schema_name}") + # self.warn(f"{f} required for {schema_name}") self.statistics["required_but_missing"][schema_name][f]["missing"] += 1 if case not in self.statistics["cases_missing_data"]: self.statistics["cases_missing_data"].append(case) From 7b46225baf843561c493780bf326bad3bdf47910 Mon Sep 17 00:00:00 2001 From: Daisie Huang Date: Tue, 10 Oct 2023 10:15:14 -0700 Subject: [PATCH 20/29] update tests --- test_data_ingest.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test_data_ingest.py b/test_data_ingest.py index e1a0f47..9d901f9 100644 --- a/test_data_ingest.py +++ b/test_data_ingest.py @@ -101,7 +101,7 @@ def test_donor_2(packets): def test_validation(packets, schema): schema.validate_ingest_map({"donors": packets}) print(schema.validation_failures) - assert len(schema.validation_failures) == 9 + assert len(schema.validation_failures) == 8 # should be the following 9 failures: # DONOR_5: cause_of_death required if is_deceased = Yes # DONOR_5: date_of_death required if is_deceased = Yes @@ -109,7 +109,6 @@ def test_validation(packets, schema): # DONOR_5 > PD_5 > SPECIMEN_6: Tumour specimens 
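
[NOTE: PATCH 19] Missing required fields stop being validation failures and are only tallied. A runnable sketch of the counter shape the diff implies (the statistics layout is inferred from the context lines, not from the full file):

    statistics = {"required_but_missing": {}, "cases_missing_data": []}

    def count_missing(schema_name, field, map_json, case):
        per_schema = statistics["required_but_missing"].setdefault(schema_name, {})
        counts = per_schema.setdefault(field, {"total": 0, "missing": 0})
        counts["total"] += 1                # every time the field is checked
        if field not in map_json:
            counts["missing"] += 1          # only when it is absent
            if case not in statistics["cases_missing_data"]:
                statistics["cases_missing_data"].append(case)

    count_missing("donors", "date_of_birth", {}, "DONOR_1")
    assert statistics["required_but_missing"]["donors"]["date_of_birth"] == {"total": 1, "missing": 1}

This is presumably why PATCH 20 drops the expected failure count from 9 to 8: the removed "response_to_treatment required" line reads like a required-field warning, which now lands in statistics rather than in validation_failures.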
From 249b6bc9cd30881e81cc196779537cca9c1b4b7c Mon Sep 17 00:00:00 2001
From: Daisie Huang
Date: Thu, 19 Oct 2023 11:25:55 -0700
Subject: [PATCH 21/29] rename first_key to root_schema

---
 schema.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/schema.py b/schema.py
index 5567aeb..a474c89 100644
--- a/schema.py
+++ b/schema.py
@@ -300,10 +300,10 @@ def validate_ingest_map(self, map_json):
             self.validation_schema[key]["extra_args"] = {
                 "index": 0
             }
-        first_key = list(self.validation_schema.keys())[0]
-        for x in range(0, len(map_json[first_key])):
-            jsonschema.validate(map_json[first_key][x], self.json_schema)
-            self.validate_schema(first_key, map_json[first_key][x])
+        root_schema = list(self.validation_schema.keys())[0]
+        for x in range(0, len(map_json[root_schema])):
+            jsonschema.validate(map_json[root_schema][x], self.json_schema)
+            self.validate_schema(root_schema, map_json[root_schema][x])
 
         self.statistics["schemas_not_used"] = list(set(self.validation_schema.keys()) -
                                                    set(self.statistics["schemas_used"]))
         self.statistics["summary_cases"] = {

From 4ba3e39aa5a21c90529bfc5dd3d7e24060597646 Mon Sep 17 00:00:00 2001
From: Daisie Huang
Date: Thu, 19 Oct 2023 12:49:05 -0700
Subject: [PATCH 22/29] Check for duplicate IDs within schemas

---
 schema.py | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/schema.py b/schema.py
index a474c89..dc548e0 100644
--- a/schema.py
+++ b/schema.py
@@ -7,6 +7,7 @@
 from copy import deepcopy
 import jsonschema
 import dateparser
+from collections import Counter
 
 
 class ValidationError(Exception):
@@ -50,6 +51,7 @@ class BaseSchema:
     def __init__(self, url, simple=False):
         self.validation_failures = []
         self.statistics = {}
+        self.identifiers = {}
         self.stack_location = []
         self.schema = {}
         self.openapi_url = url
@@ -106,12 +108,22 @@ def warn(self, message):
-        message = " > ".join(self.stack_location) + ": " + message
+        prefix = " > ".join(self.stack_location)
+        if prefix.strip() == "":
+            prefix = ""
+        else:
+            prefix += ": "
+        message = prefix + message
         self.validation_failures.append(f"{message}")
 
     def fail(self, message):
-        message = " > ".join(self.stack_location) + ": " + message
+        prefix = " > ".join(self.stack_location)
+        if prefix.strip() == "":
+            prefix = ""
+        else:
+            prefix += ": "
+        message = prefix + message
         raise ValidationError(message)
 
@@ -304,7 +316,12 @@ def validate_ingest_map(self, map_json):
         for x in range(0, len(map_json[root_schema])):
             jsonschema.validate(map_json[root_schema][x], self.json_schema)
             self.validate_schema(root_schema, map_json[root_schema][x])
-
+        for schema in self.identifiers:
+            most_common = self.identifiers[schema].most_common()
+            if most_common[0][1] > 1:
+                for x in most_common:
+                    if x[1] > 1:
+                        self.warn(f"Duplicated IDs: in schema {schema}, {x[0]} occurs {x[1]} times")
         self.statistics["schemas_not_used"] = list(set(self.validation_schema.keys()) -
                                                    set(self.statistics["schemas_used"]))
         self.statistics["summary_cases"] = {
             "complete_cases": len(map_json["donors"]) - len(self.statistics["cases_missing_data"]),
@@ -316,6 +333,9 @@ def validate_schema(self, schema_name, map_json):
         id = f"{self.validation_schema[schema_name]['name']} {self.validation_schema[schema_name]['extra_args']['index']}"
         if self.validation_schema[schema_name]["id"] is not None:
             id = map_json[self.validation_schema[schema_name]["id"]]
+        if schema_name not in self.identifiers:
+            self.identifiers[schema_name] = Counter()
+        self.identifiers[schema_name].update([id])
         required_fields = self.validation_schema[schema_name]["required_fields"]
         nested_schemas = self.validation_schema[schema_name]["nested_schemas"]
         self.stack_location.append(str(id))

From 0258ebc3ba169b2f9d652440f9cdc873af6538fd Mon Sep 17 00:00:00 2001
From: Daisie Huang
Date: Thu, 19 Oct 2023 12:49:39 -0700
Subject: [PATCH 23/29] add test for duplicate ID validation

---
 test_data/raw_data/Followup.csv | 1 +
 test_data_ingest.py             | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/test_data/raw_data/Followup.csv b/test_data/raw_data/Followup.csv
index 7cd953b..a1fa9ea 100644
--- a/test_data/raw_data/Followup.csv
+++ b/test_data/raw_data/Followup.csv
@@ -3,3 +3,4 @@ FOLLOW_UP_1,DONOR_1,PD_1,,2022-08,Loco-regional progression,Distant recurrence/m
 FOLLOW_UP_2,DONOR_1,,TR_1,2022-08,Loco-regional progression,Biochemical progression,2022-05,Imaging (procedure)|Laboratory data interpretation (procedure),C05,Lugano staging system,T1d,N1mi,M1a(0),Stage IVBS,
 FOLLOW_UP_3,DONOR_1,,,2022-08,Loco-regional progression,Distant recurrence/metastasis,2022-01,Imaging (procedure)|Laboratory data interpretation (procedure),C06,SEER staging system,T2(m),N2c,M1b(1),Stage IIBES,
 FOLLOW_UP_4,DONOR_1,,,2022-08,Loco-regional progression,Biochemical progression,2022-05,Imaging (procedure)|Laboratory data interpretation (procedure),C05,Lugano staging system,T1d,N1mi,M1a(0),Stage IVBS,
+FOLLOW_UP_4,DONOR_6,,,2022-07,Loco-regional progression,Biochemical progression,2022-05,Imaging (procedure)|Laboratory data interpretation (procedure),C05,Lugano staging system,T1d,N1mi,M1a(0),Stage IVBS,
diff --git a/test_data_ingest.py b/test_data_ingest.py
index 9d901f9..52ff8d3 100644
--- a/test_data_ingest.py
+++ b/test_data_ingest.py
@@ -101,7 +101,7 @@ def test_validation(packets, schema):
     schema.validate_ingest_map({"donors": packets})
     print(schema.validation_failures)
-    assert len(schema.validation_failures) == 8
+    assert len(schema.validation_failures) == 9
     # should be the following 9 failures:
     # DONOR_5: cause_of_death required if is_deceased = Yes
     # DONOR_5: date_of_death required if is_deceased = Yes
     # DONOR_5 > PD_5 > SPECIMEN_6: Tumour specimens require a reference_pathology_confirmed_diagnosis
     # DONOR_5 > PD_5 > TR_5 > Radiation 1: Only one radiation is allowed per treatment
     # DONOR_5 > PD_5 > TR_5 > Radiation 1: reference_radiation_treatment_id required if radiation_boost = Yes
     # DONOR_5 > PD_5 > TR_10: treatment type Immunotherapy should have one or more immunotherapies submitted
     # DONOR_6 > PD_6 > TR_9 > Surgery 0: submitter_specimen_id SPECIMEN_43 does not correspond to one of the available specimen_ids ['SPECIMEN_3']
+    # Duplicated IDs: in schema followups, FOLLOW_UP_4 occurs 2 times
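
[NOTE: PATCH 22] The duplicate-ID check is a straightforward use of collections.Counter: every ID seen for a schema is fed to update(), and most_common(), which sorts by count descending, makes "any duplicates at all?" a single [0][1] > 1 test. A self-contained sketch of the same pattern, with illustrative schema and ID names:

    from collections import Counter

    identifiers = {"followups": Counter()}
    for seen_id in ["FOLLOW_UP_1", "FOLLOW_UP_4", "FOLLOW_UP_4"]:
        identifiers["followups"].update([seen_id])

    for schema_name, counter in identifiers.items():
        most_common = counter.most_common()   # [(id, count), ...] highest count first
        if most_common and most_common[0][1] > 1:
            for dup_id, count in most_common:
                if count > 1:
                    print(f"Duplicated IDs: in schema {schema_name}, {dup_id} occurs {count} times")

In the patch, a Counter entry only exists after at least one update, so most_common[0] is safe there; the sketch adds a truthiness guard anyway to stay robust against empty counters.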
From 13fcec585204d26ca29ce9479e4d272e3262ce67 Mon Sep 17 00:00:00 2001
From: Daisie Huang
Date: Thu, 19 Oct 2023 12:49:55 -0700
Subject: [PATCH 24/29] duplicate IDs in different schemas are OK

---
 test_data/raw_data/Followup.csv         | 1 +
 test_data/raw_data/PrimaryDiagnosis.csv | 1 +
 test_data_ingest.py                     | 5 +++++
 3 files changed, 7 insertions(+)

diff --git a/test_data/raw_data/Followup.csv b/test_data/raw_data/Followup.csv
index a1fa9ea..93615d3 100644
--- a/test_data/raw_data/Followup.csv
+++ b/test_data/raw_data/Followup.csv
@@ -4,3 +4,4 @@ FOLLOW_UP_2,DONOR_1,,TR_1,2022-08,Loco-regional progression,Biochemical progress
 FOLLOW_UP_3,DONOR_1,,,2022-08,Loco-regional progression,Distant recurrence/metastasis,2022-01,Imaging (procedure)|Laboratory data interpretation (procedure),C06,SEER staging system,T2(m),N2c,M1b(1),Stage IIBES,
 FOLLOW_UP_4,DONOR_1,,,2022-08,Loco-regional progression,Biochemical progression,2022-05,Imaging (procedure)|Laboratory data interpretation (procedure),C05,Lugano staging system,T1d,N1mi,M1a(0),Stage IVBS,
 FOLLOW_UP_4,DONOR_6,,,2022-07,Loco-regional progression,Biochemical progression,2022-05,Imaging (procedure)|Laboratory data interpretation (procedure),C05,Lugano staging system,T1d,N1mi,M1a(0),Stage IVBS,
+DUPLICATE_ID,DONOR_4,,,2022-08,Loco-regional progression,Biochemical progression,2022-05,Imaging (procedure)|Laboratory data interpretation (procedure),C05,Lugano staging system,T1d,N1mi,M1a(0),Stage IVBS,
diff --git a/test_data/raw_data/PrimaryDiagnosis.csv b/test_data/raw_data/PrimaryDiagnosis.csv
index 764bb5d..31291cd 100644
--- a/test_data/raw_data/PrimaryDiagnosis.csv
+++ b/test_data/raw_data/PrimaryDiagnosis.csv
@@ -2,6 +2,7 @@ submitter_donor_id, submitter_primary_diagnosis_id, date_of_diagnosis, cancer_ty
 DONOR_1,PD_1,1_2018,C43.1,Cytology,No lymph nodes found in resected specimen,Lymph node dissection/pathological exam,5,International Neuroblastoma Staging System,,,,Stage 1,Left
 DONOR_2,PD_2,3/2020,C04.9,Specific tumour markers,Not applicable,Physical palpation of patient,4,Rai staging system,,,,Stage 1A,Bilateral
 DONOR_3,PD_3,5/2018,C43.9,Unknown,Yes,Imaging,5,AJCC 7th edition,T0,N0,M1a,,Left
+DONOR_3,DUPLICATE_ID,5/2018,C43.9,Unknown,Yes,Imaging,5,AJCC 7th edition,T0,N0,M1a,,Left
 DONOR_4,PD_4,1_2018,C64.9,Death certificate only,Not applicable,Physical palpation of patient,67,Revised International staging system (RISS),,,,Stage 1B,"Unilateral, side not specified"
 DONOR_5,PD_5,3/2020,C64.9,Death certificate only,Yes,Lymph node dissection/pathological exam,5,Revised International staging system (RISS),T1,N0a,M0,,Left
 DONOR_6,PD_6,5/2018,C02.2,Specific tumour markers,No,Physical palpation of patient,2,International Neuroblastoma Staging System,,,,Stage C,"Unilateral, side not specified"
diff --git a/test_data_ingest.py b/test_data_ingest.py
index 52ff8d3..089ddb0 100644
--- a/test_data_ingest.py
+++ b/test_data_ingest.py
@@ -2,6 +2,7 @@
 import yaml
 import CSVConvert
 import mappings
+import json
 from mohschema import MoHSchema
 
 # read sheet from given data pathway
@@ -113,6 +114,10 @@ def test_validation(packets, schema):
     # DONOR_6 > PD_6 > TR_9 > Surgery 0: submitter_specimen_id SPECIMEN_43 does not correspond to one of the available specimen_ids ['SPECIMEN_3']
     # Duplicated IDs: in schema followups, FOLLOW_UP_4 occurs 2 times
 
+    # there should be an item named DUPLICATE_ID in both followup and sample_registration
+    print(json.dumps(schema.identifiers, indent=2))
+    assert schema.identifiers["followups"]["DUPLICATE_ID"] == 1
+    assert schema.identifiers["primary_diagnoses"]["DUPLICATE_ID"] == 1
 
 # test mapping that uses values from multiple sheets:
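
[NOTE: PATCHES 23-24] Taken together, the test data pins down the intended semantics: the same ID appearing twice within one schema (FOLLOW_UP_4 in Followup.csv) must produce a "Duplicated IDs" warning, while reuse of an ID across different schemas (DUPLICATE_ID in both Followup.csv and PrimaryDiagnosis.csv) is allowed, because each schema keeps its own Counter. A condensed restatement of that contract in pytest style (the `schema` fixture wiring from test_data_ingest.py is assumed):

    def test_duplicate_id_semantics(schema):
        # within one schema: two occurrences, flagged by the validator
        assert schema.identifiers["followups"]["FOLLOW_UP_4"] == 2
        # across schemas: one occurrence each, no warning expected
        assert schema.identifiers["followups"]["DUPLICATE_ID"] == 1
        assert schema.identifiers["primary_diagnoses"]["DUPLICATE_ID"] == 1

One observation: the in-code comment says "followup and sample_registration", but the assertions actually check the followups and primary_diagnoses schemas.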
From 8c8016850c71afb11066aeda727b0aff7fe27ee6 Mon Sep 17 00:00:00 2001
From: Daisie Huang
Date: Thu, 19 Oct 2023 16:36:38 -0700
Subject: [PATCH 25/29] check for empty arrays

---
 mohschema.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/mohschema.py b/mohschema.py
index 931f546..a10e85d 100644
--- a/mohschema.py
+++ b/mohschema.py
@@ -316,19 +316,19 @@ def validate_treatments(self, map_json):
         for type in map_json["treatment_type"]:
             match type:
                 case "Chemotherapy":
-                    if "chemotherapies" not in map_json:
+                    if "chemotherapies" not in map_json or len(map_json["chemotherapies"]) == 0:
                         self.warn("treatment type Chemotherapy should have one or more chemotherapies submitted")
                 case "Hormonal therapy":
-                    if "hormone_therapies" not in map_json:
+                    if "hormone_therapies" not in map_json or len(map_json["hormone_therapies"]) == 0:
                         self.warn("treatment type Hormonal therapy should have one or more hormone_therapies submitted")
                 case "Immunotherapy":
-                    if "immunotherapies" not in map_json:
+                    if "immunotherapies" not in map_json or len(map_json["immunotherapies"]) == 0:
                         self.warn("treatment type Immunotherapy should have one or more immunotherapies submitted")
                 case "Radiation therapy":
-                    if "radiations" not in map_json:
+                    if "radiations" not in map_json or len(map_json["radiations"]) == 0:
                         self.warn("treatment type Radiation therapy should have one or more radiation submitted")
                 case "Surgery":
-                    if "surgeries" not in map_json:
+                    if "surgeries" not in map_json or len(map_json["surgeries"]) == 0:
                         self.warn("treatment type Surgery should have one or more surgery submitted")

From 63fea21c72ebd8921ff16ebbc9c2bac4bfe871ea Mon Sep 17 00:00:00 2001
From: Daisie Huang
Date: Thu, 19 Oct 2023 19:33:26 -0700
Subject: [PATCH 26/29] add a lot of checks for None

---
 mohschema.py | 97 ++++++++++++++++++++++++----------------------------
 1 file changed, 45 insertions(+), 52 deletions(-)

diff --git a/mohschema.py b/mohschema.py
index a10e85d..88ddfa5 100644
--- a/mohschema.py
+++ b/mohschema.py
@@ -194,29 +194,34 @@ def validate_donors(self, map_json):
                     if "date_of_death" not in map_json:
                         self.warn("date_of_death required if is_deceased = Yes")
                 case "lost_to_followup_after_clinical_event_identifier":
-                    if map_json["is_deceased"]:
-                        self.warn("lost_to_followup_after_clinical_event_identifier cannot be present if is_deceased = Yes")
+                    if map_json["lost_to_followup_after_clinical_event_identifier"] is not None:
+                        if map_json["is_deceased"]:
+                            self.warn("lost_to_followup_after_clinical_event_identifier cannot be present if is_deceased = Yes")
                 case "lost_to_followup_reason":
-                    if "lost_to_followup_after_clinical_event_identifier" not in map_json:
-                        self.warn("lost_to_followup_reason should only be submitted if lost_to_followup_after_clinical_event_identifier is submitted")
+                    if map_json["lost_to_followup_reason"] is not None:
+                        if "lost_to_followup_after_clinical_event_identifier" not in map_json:
+                            self.warn("lost_to_followup_reason should only be submitted if lost_to_followup_after_clinical_event_identifier is submitted")
                 case "date_alive_after_lost_to_followup":
-                    if "lost_to_followup_after_clinical_event_identifier" not in map_json:
-                        self.warn("lost_to_followup_after_clinical_event_identifier needs to be submitted if date_alive_after_lost_to_followup is submitted")
+                    if map_json["date_alive_after_lost_to_followup"] is not None:
+                        if "lost_to_followup_after_clinical_event_identifier" not in map_json:
+                            self.warn("lost_to_followup_after_clinical_event_identifier needs to be submitted if date_alive_after_lost_to_followup is submitted")
                 case "cause_of_death":
-                    if not map_json["is_deceased"]:
-                        self.warn("cause_of_death should only be submitted if is_deceased = Yes")
+                    if map_json["cause_of_death"] is not None:
+                        if not map_json["is_deceased"]:
map_json["is_deceased"]: + self.warn("cause_of_death should only be submitted if is_deceased = Yes") case "date_of_death": - if not map_json["is_deceased"]: - self.warn("date_of_death should only be submitted if is_deceased = Yes") - else: - if map_json["date_of_death"] is not None and map_json["date_of_birth"] is not None: - death = dateparser.parse(map_json["date_of_death"]).date() - birth = dateparser.parse(map_json["date_of_birth"]).date() - if birth > death: - self.warn("date_of_death cannot be earlier than date_of_birth") + if map_json["date_of_death"] is not None: + if not map_json["is_deceased"]: + self.warn("date_of_death should only be submitted if is_deceased = Yes") + else: + if map_json["date_of_birth"] is not None: + death = dateparser.parse(map_json["date_of_death"]).date() + birth = dateparser.parse(map_json["date_of_birth"]).date() + if birth > death: + self.warn("date_of_death cannot be earlier than date_of_birth") case "biomarkers": for x in map_json["biomarkers"]: - if "test_date" not in x: + if "test_date" not in x or x["test_date"] is None: self.warn("test_date is necessary for biomarkers not associated with nested events") @@ -226,12 +231,12 @@ def validate_primary_diagnoses(self, map_json): is_tumour = False # should either have a clinical staging system specified # OR have a specimen with a pathological staging system specified - if "clinical_tumour_staging_system" in map_json: + if "clinical_tumour_staging_system" in map_json and map_json["clinical_tumour_staging_system"] is not None: is_tumour = True if "specimens" in map_json: for specimen in map_json["specimens"]: specimen_ids.append(specimen["submitter_specimen_id"]) - if "pathological_tumour_staging_system" in specimen: + if "pathological_tumour_staging_system" in specimen and specimen["pathological_tumour_staging_system"] is not None: is_tumour = True self.validation_schema["primary_diagnoses"]["extra_args"]["specimen_ids"] = specimen_ids @@ -241,9 +246,9 @@ def validate_primary_diagnoses(self, map_json): match prop: case "lymph_nodes_examined_status": if map_json["lymph_nodes_examined_status"]: - if "lymph_nodes_examined_method" not in map_json: + if "lymph_nodes_examined_method" not in map_json or map_json["lymph_nodes_examined_method"] is None: self.warn("lymph_nodes_examined_method required if lymph_nodes_examined_status = Yes") - if "number_lymph_nodes_positive" not in map_json: + if "number_lymph_nodes_positive" not in map_json or map_json["number_lymph_nodes_positive"] is None: self.warn("number_lymph_nodes_positive required if lymph_nodes_examined_status = Yes") case "clinical_tumour_staging_system": self.validate_staging_system(map_json, "clinical") @@ -254,7 +259,7 @@ def validate_specimens(self, map_json): # Presence of tumour_histological_type means we have a tumour sample if "tumour_histological_type" in map_json: if not is_clinical_tumour: - if "pathological_tumour_staging_system" not in map_json: + if "pathological_tumour_staging_system" not in map_json or map_json["pathological_tumour_staging_system"] is None: self.warn("Tumour specimens without clinical_tumour_staging_system require a pathological_tumour_staging_system") else: self.validate_staging_system(map_json, "pathological") @@ -333,36 +338,24 @@ def validate_treatments(self, map_json): def validate_chemotherapies(self, map_json): - for prop in map_json: - match prop: - case "prescribed_cumulative_drug_dose": - if "chemotherapy_drug_dose_units" not in map_json: - self.warn("chemotherapy_drug_dose_units required if 
prescribed_cumulative_drug_dose is submitted") - case "actual_cumulative_drug_dose": - if "chemotherapy_drug_dose_units" not in map_json: - self.warn("chemotherapy_drug_dose_units required if actual_cumulative_drug_dose is submitted") + if "chemotherapy_drug_dose_units" not in map_json or map_json["chemotherapy_drug_dose_units"] is None: + for x in ["prescribed_cumulative_drug_dose", "actual_cumulative_drug_dose"]: + if x in map_json and map_json[x] is not None: + self.warn(f"chemotherapy_drug_dose_units required if {x} is submitted") def validate_hormone_therapies(self, map_json): - for prop in map_json: - match prop: - case "prescribed_cumulative_drug_dose": - if "hormone_drug_dose_units" not in map_json: - self.warn("hormone_drug_dose_units required if prescribed_cumulative_drug_dose is submitted") - case "actual_cumulative_drug_dose": - if "hormone_drug_dose_units" not in map_json: - self.warn("hormone_drug_dose_units required if actual_cumulative_drug_dose is submitted") + if "hormone_drug_dose_units" not in map_json or map_json["hormone_drug_dose_units"] is None: + for x in ["prescribed_cumulative_drug_dose", "actual_cumulative_drug_dose"]: + if x in map_json and map_json[x] is not None: + self.warn(f"hormone_drug_dose_units required if {x} is submitted") def validate_immunotherapies(self, map_json): - for prop in map_json: - match prop: - case "prescribed_cumulative_drug_dose": - if "immunotherapy_drug_dose_units" not in map_json: - self.warn("immunotherapy_drug_dose_units required if prescribed_cumulative_drug_dose is submitted") - case "actual_cumulative_drug_dose": - if "immunotherapy_drug_dose_units" not in map_json: - self.warn("immunotherapy_drug_dose_units required if actual_cumulative_drug_dose is submitted") + if "immunotherapy_drug_dose_units" not in map_json or map_json["immunotherapy_drug_dose_units"] is None: + for x in ["prescribed_cumulative_drug_dose", "actual_cumulative_drug_dose"]: + if x in map_json and map_json[x] is not None: + self.warn(f"immunotherapy_drug_dose_units required if {x} is submitted") def validate_radiations(self, map_json): @@ -374,7 +367,7 @@ def validate_radiations(self, map_json): match prop: case "radiation_boost": if map_json["radiation_boost"]: - if "reference_radiation_treatment_id" not in map_json: + if "reference_radiation_treatment_id" not in map_json or map_json["reference_radiation_treatment_id"] is None: self.warn("reference_radiation_treatment_id required if radiation_boost = Yes") @@ -385,9 +378,9 @@ def validate_surgeries(self, map_json): self.warn("Only one surgery is allowed per treatment") if "submitter_specimen_id" not in map_json: - if "surgery_site" not in map_json: + if "surgery_site" not in map_json or map_json["surgery_site"] is None: self.warn("surgery_site required if submitter_specimen_id not submitted") - if "surgery_location" not in map_json: + if "surgery_location" not in map_json or map_json["surgery_location"] is None: self.warn("surgery_location required if submitter_specimen_id not submitted") else: if map_json["submitter_specimen_id"] not in specimen_ids: @@ -404,7 +397,7 @@ def validate_comorbidities(self, map_json): def validate_exposures(self, map_json): is_smoker = False - if "tobacco_smoking_status" not in map_json: + if "tobacco_smoking_status" not in map_json or map_json["tobacco_smoking_status"] is None: self.fail("tobacco_smoking_status required for exposure") else: if map_json["tobacco_smoking_status"] in [ @@ -433,8 +426,8 @@ def validate_staging_system(self, map_json, staging_type): "m_category" ] 
         for f in required_fields:
-            if f"{staging_type}_{f}" not in map_json:
+            if f"{staging_type}_{f}" not in map_json or map_json[f"{staging_type}_{f}"] is None:
                 self.warn(f"{staging_type}_{f} is required if {staging_type}_tumour_staging_system is AJCC")
     else:
-        if f"{staging_type}_stage_group" not in map_json:
+        if f"{staging_type}_stage_group" not in map_json or map_json[f"{staging_type}_stage_group"] is None:
             self.warn(f"{staging_type}_stage_group is required for {staging_type}_tumour_staging_system {map_json[f'{staging_type}_tumour_staging_system']}")

From c22c1557603f7e443f3ded2e4cf042b7a5460583 Mon Sep 17 00:00:00 2001
From: Daisie Huang
Date: Thu, 19 Oct 2023 19:36:18 -0700
Subject: [PATCH 27/29] double print

---
 validate_coverage.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/validate_coverage.py b/validate_coverage.py
index 21d261b..077afa3 100644
--- a/validate_coverage.py
+++ b/validate_coverage.py
@@ -240,7 +240,5 @@ def main(args):
     for line in result:
         print(line)
 
-    print(json.dumps(result, indent=4))
-
 if __name__ == '__main__':
     main(parse_args())
\ No newline at end of file

From 60f9ebf38b20a45f87d1fed5111debf085d769b3 Mon Sep 17 00:00:00 2001
From: Daisie Huang
Date: Wed, 25 Oct 2023 19:50:55 -0700
Subject: [PATCH 28/29] this is moved to data repo instead of code

---
 ingest_redcap_data.py | 102 ------------------------------------------
 1 file changed, 102 deletions(-)
 delete mode 100644 ingest_redcap_data.py

diff --git a/ingest_redcap_data.py b/ingest_redcap_data.py
deleted file mode 100644
index 86c7531..0000000
--- a/ingest_redcap_data.py
+++ /dev/null
@@ -1,102 +0,0 @@
-"""
-Methods to transform the redcap raw data into the csv format expected by
-CSVConcert.py
-"""
-
-import os
-import argparse
-import re
-import pandas
-import json
-from pathlib import Path
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--input', type=str, required = True, help="Raw csv output from Redcap")
-    parser.add_argument('--verbose', '--v', action="store_true", help="Print extra information")
-    parser.add_argument('--output', type=str, default="tmp_out", help="Optional name of output directory in same directory as input; default tmp_out")
-    args = parser.parse_args()
-    return args
-
-def ingest_redcap_files(file):
-    """Test of ingest of redcap output files"""
-    raw_csv_dfs = {}
-    file_match = re.match(r"(.+)\.csv$", file)
-    if file_match is not None:
-        print(f"Reading input file {file}")
-        try:
-            df = pandas.read_csv(file, dtype=str, encoding = "latin-1")
-            #print(f"initial df shape: {df.shape}")
-            # find and drop empty columns
-            df = drop_empty_columns(df)
-            # now we do some renaming, becuase for reasons we don't understand
-            # the program_id and submitter_donor_id columns are swapped
-            df.rename(columns={'program_id':'tempname'},inplace=True)
-            df.rename(columns={'submitter_donor_id':'program_id'},inplace=True)
-            df.rename(columns={'tempname':'submitter_donor_id'},inplace=True)
-            raw_csv_dfs[file] = df
-        except Exception as e:
-            raise Exception(f"File {file} does not seem to be a valid csv file")
-    else:
-        raise Exception(f"File {file} does not seem to be a csv file")
-    return raw_csv_dfs
-
-def extract_repeat_instruments(df):
-    """ Transforms the single (very sparse) dataframe into one dataframe per
-    MoH schema. This makes it easier to look at, and also eliminates a bunch
-    of pandas warnings."""
-    new_dfs={}
-    starting_rows = df.shape[0]
-    repeat_instruments = df['redcap_repeat_instrument'].dropna().unique()
-    total_rows = 0
-    for i in repeat_instruments:
-        # each row has a redcap_repeat_instrument that describes the schema
-        # (e.g. Treatment) and a redcap_repeat_instance that is an id for that
-        # schema (this would be the treatment.id)
-        print(f"Extracting schema {i}")
-        schema_df = df.loc[df['redcap_repeat_instrument'] == i]
-        # drop all of the empty columns that aren't relevent for this schema
-        schema_df = drop_empty_columns(schema_df)
-        # rename the redcap_repeat_instance to the specific id (e.g. treatment_id)
-        schema_df.rename(columns={
-            'redcap_repeat_instance': f"{i}_id"
-            },
-            inplace=True
-        )
-        total_rows += schema_df.shape[0]
-        new_dfs[i]=schema_df
-
-    # now save all of the rows that aren't a repeat_instrument and
-    # label them Singleton for now
-    singletons = df.loc[df['redcap_repeat_instrument'].isnull()]
-    singletons = drop_empty_columns(singletons)
-    # check that we have all of the rows
-    if (total_rows + singletons.shape[0] < starting_rows):
-        print("Warning: not all rows recovered in raw data")
-    new_dfs['Singleton']=singletons
-    return new_dfs
-
-def drop_empty_columns(df):
-    empty_cols = [col for col in df if df[col].isnull().all()]
-    df = df.drop(empty_cols, axis=1)
-    return df
-
-def output_dfs(input_path,output_dir,df_list):
-    parent_path = Path(input_path).parent
-    tmpdir = Path(parent_path,output_dir)
-    if not tmpdir.is_dir():
-        tmpdir.mkdir()
-    print(f"Writing output files to {tmpdir}")
-    for d in df_list:
-        df_list[d].to_csv(Path(tmpdir,f"{d}.csv"), index=False)
-
-def main(args):
-    input_path = args.input
-
-    raw_csv_dfs = ingest_redcap_files(input_path)
-    new_dfs = extract_repeat_instruments(raw_csv_dfs[input_path])
-    output_dir = args.output
-    output_dfs(input_path,output_dir,new_dfs)
-
-if __name__ == '__main__':
-    main(parse_args())

From d1d393710278f8b07df05abc2ccdcb51aca8e7f8 Mon Sep 17 00:00:00 2001
From: Daisie Huang
Date: Wed, 25 Oct 2023 20:06:47 -0700
Subject: [PATCH 29/29] make sure there is an ID in the map_json, if required

---
 schema.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/schema.py b/schema.py
index dc548e0..afda888 100644
--- a/schema.py
+++ b/schema.py
@@ -331,7 +331,7 @@ def validate_ingest_map(self, map_json):
 
     def validate_schema(self, schema_name, map_json):
         id = f"{self.validation_schema[schema_name]['name']} {self.validation_schema[schema_name]['extra_args']['index']}"
-        if self.validation_schema[schema_name]["id"] is not None:
+        if self.validation_schema[schema_name]["id"] is not None and self.validation_schema[schema_name]["id"] in map_json:
             id = map_json[self.validation_schema[schema_name]["id"]]
         if schema_name not in self.identifiers:
             self.identifiers[schema_name] = Counter()
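
[NOTE: PATCHES 25-26, 29] The recurring fix across these patches is that "key absent" and "key present but None" must be treated the same, presumably because the converted JSON can carry explicit nulls. A tiny helper expressing the idiom (hypothetical; the patches inline the checks instead):

    def is_submitted(map_json, field):
        """True only if the field exists and carries a non-None value."""
        return field in map_json and map_json[field] is not None

    # e.g. the chemotherapy dose-unit rule from PATCH 26, restated:
    def check_chemo_units(map_json, warn):
        if not is_submitted(map_json, "chemotherapy_drug_dose_units"):
            for x in ["prescribed_cumulative_drug_dose", "actual_cumulative_drug_dose"]:
                if is_submitted(map_json, x):
                    warn(f"chemotherapy_drug_dose_units required if {x} is submitted")

    check_chemo_units({"prescribed_cumulative_drug_dose": 40}, print)

PATCH 25 applies the same idea to lists ("not in map_json or len(...) == 0"), and PATCH 29 to the schema's ID field, so an absent ID falls back to the synthetic "<name> <index>" label instead of raising a KeyError.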