diff --git a/README.md b/README.md index 91b980a..7fdecc0 100644 --- a/README.md +++ b/README.md @@ -181,9 +181,6 @@ A summarised example of the output is below: "schemas_used": [ "donors" ], - "cases_missing_data": [ - "DONOR_5" - ], "schemas_not_used": [ "exposures", "biomarkers" diff --git a/src/clinical_etl/CSVConvert.py b/src/clinical_etl/CSVConvert.py index 7fddfd4..5ee6559 100644 --- a/src/clinical_etl/CSVConvert.py +++ b/src/clinical_etl/CSVConvert.py @@ -739,7 +739,6 @@ def csv_convert(input_path, manifest_file, minify=False, index_output=False, ver json.dump(mappings.INDEXED_DATA, f, indent=4) result_key = list(schema.validation_schema.keys()).pop(0) - result = { "openapi_url": schema.openapi_url, "schema_class": type(schema).__name__, @@ -747,20 +746,19 @@ def csv_convert(input_path, manifest_file, minify=False, index_output=False, ver } if schema.katsu_sha is not None: result["katsu_sha"] = schema.katsu_sha - print(f"{Bcolors.OKGREEN}Saving packets to file.{Bcolors.ENDC}") - with open(f"{mappings.OUTPUT_FILE}_map.json", 'w') as f: # write to json file for ingestion - if minify: - json.dump(result, f) - else: - json.dump(result, f, indent=4) # add validation data: print(f"\n{Bcolors.OKGREEN}Starting validation...{Bcolors.ENDC}") schema.validate_ingest_map(result) validation_results = {"validation_errors": schema.validation_errors, - "validation_warnings": schema.validation_warnings} + "validation_warnings": schema.validation_warnings, + "cases_missing_data": schema.statistics["cases_missing_data"]} result["statistics"] = schema.statistics - with open(f"{mappings.OUTPUT_FILE}_map.json", 'w') as f: # write to json file for ingestion + result["statistics"].pop("cases_missing_data") # remove donor IDs from _map.json file + + # write ingestion and validation json files + print(f"{Bcolors.OKGREEN}Saving packets to file.{Bcolors.ENDC}") + with open(f"{mappings.OUTPUT_FILE}_map.json", 'w') as f: if minify: json.dump(result, f) else: diff --git a/src/clinical_etl/mohschemav3.py b/src/clinical_etl/mohschemav3.py index 8476ae5..a3ef2e8 100644 --- a/src/clinical_etl/mohschemav3.py +++ b/src/clinical_etl/mohschemav3.py @@ -164,10 +164,6 @@ class MoHSchemaV3(BaseSchema): } def validate_donors(self, map_json): - missing = {field for field, val in map_json.items() if val is None} - for f in missing: - if f in self.validation_schema["donors"]["required_fields"]: - self.warn(f"{f} is a required field") for prop in map_json: match prop: case "is_deceased": @@ -287,15 +283,10 @@ def validate_donors(self, map_json): self.warn("test_date is required for biomarkers not associated with nested events") def validate_primary_diagnoses(self, map_json): - missing = {field for field, val in map_json.items() if val is None} - if "date_of_diagnosis" in missing: - self.warn("date_of_diagnosis is required. NOTE: cannot calculate any date intervals for this patient") - missing.remove("date_of_diagnosis") - for f in missing: - if f in self.validation_schema["primary_diagnoses"]["required_fields"]: - self.warn(f"{f} is a required field") + if map_json["date_of_diagnosis"] is None: + self.warn("NOTE: cannot calculate any date intervals for this patient without date_of_diagnosis") if "clinical_tumour_staging_system" not in map_json and "pathological_tumour_staging_system" not in map_json: - self.warn("Either clinical_tumour_staging_system or pathological_staging_system is required") + self.warn("Either clinical_tumour_staging_system or pathological_staging_system is required") for prop in map_json: if prop == "clinical_tumour_staging_system": self.validate_staging_system(map_json, "clinical") @@ -317,10 +308,6 @@ def validate_staging_system(self, map_json, staging_type): self.warn(f"{staging_type}_stage_group is required for {staging_type}_tumour_staging_system {map_json[f'{staging_type}_tumour_staging_system']}") def validate_specimens(self, map_json): - missing = {field for field, val in map_json.items() if val is None} - for f in missing: - if f in self.validation_schema["specimens"]["required_fields"]: - self.warn(f"{f} is a required field") if "sample_registrations" in map_json: for sample in map_json["sample_registrations"]: if "tumour_normal_designation" in sample and sample["tumour_normal_designation"] == "Tumour": @@ -337,16 +324,9 @@ def validate_specimens(self, map_json): self.warn(f"Tumour specimens require a {f}") def validate_sample_registrations(self, map_json): - missing = {field for field, val in map_json.items() if val is None} - for f in missing: - if f in self.validation_schema["sample_registrations"]["required_fields"]: - self.warn(f"{f} is a required field") + return def validate_treatments(self, map_json): - missing = {field for field, val in map_json.items() if val is None} - for f in missing: - if f in self.validation_schema["treatments"]["required_fields"]: - self.warn(f"{f} is a required field") for prop in map_json: if prop == "treatment_type" and map_json["treatment_type"] is not None: for t_type in map_json["treatment_type"]: @@ -390,10 +370,6 @@ def validate_treatments(self, map_json): self.fail("Systemic therapy end date cannot be after its treatment end date.") def validate_systemic_therapies(self, map_json): - missing = {field for field, val in map_json.items() if val is None} - for f in missing: - if f in self.validation_schema["systemic_therapies"]["required_fields"]: - self.warn(f"{f} is a required field") if "drug_dose_units" not in map_json or map_json["drug_dose_units"] is None: for x in ["prescribed_cumulative_drug_dose", "actual_cumulative_drug_dose"]: if x in map_json and map_json[x] is not None: @@ -415,26 +391,15 @@ def validate_systemic_therapies(self, map_json): self.fail("Systemic therapy start cannot be after systemic therapy end.") def validate_radiations(self, map_json): - missing = {field for field, val in map_json.items() if val is None} - for f in missing: - if f in self.validation_schema["radiations"]["required_fields"]: - self.warn(f"{f} is a required field") for prop in map_json: if prop == "radiation_boost" and map_json["radiation_boost"] == "Yes": if "reference_radiation_treatment_id" not in map_json or map_json["reference_radiation_treatment_id"] is None: self.warn("reference_radiation_treatment_id required if radiation_boost = Yes") def validate_surgeries(self, map_json): - missing = {field for field, val in map_json.items() if val is None} - for f in missing: - if f in self.validation_schema["surgeries"]["required_fields"]: - self.warn(f"{f} is a required field") + return def validate_followups(self, map_json): - missing = {field for field, val in map_json.items() if val is None} - for f in missing: - if f in self.validation_schema["followups"]["required_fields"]: - self.warn(f"{f} is a required field") for prop in map_json: if prop == "disease_status_at_followup": states = [ @@ -457,10 +422,6 @@ def validate_followups(self, map_json): self.warn(f"anatomic_site_progression_or_recurrence is required if disease_status_at_followup is {map_json['disease_status_at_followup']}") def validate_biomarkers(self, map_json): - missing = {field for field, val in map_json.items() if val is None} - for f in missing: - if f in self.validation_schema["biomarkers"]["required_fields"]: - self.warn(f"{f} is a required field") for prop in map_json: match prop: case "hpv_pcr_status": @@ -468,20 +429,12 @@ def validate_biomarkers(self, map_json): self.warn("If hpv_pcr_status is positive, hpv_strain is required") def validate_comorbidities(self, map_json): - missing = {field for field, val in map_json.items() if val is None} - for f in missing: - if f in self.validation_schema["comorbities"]["required_fields"]: - self.warn(f"{f} is a required field") for prop in map_json: if prop == "laterality_of_prior_malignancy": if "prior_malignancy" not in map_json or map_json["prior_malignancy"] != "Yes": self.fail("laterality_of_prior_malignancy should not be submitted unless prior_malignancy = Yes") def validate_exposures(self, map_json): - missing = {field for field, val in map_json.items() if val is None} - for f in missing: - if f in self.validation_schema["exposures"]["required_fields"]: - self.warn(f"{f} is a required field") is_smoker = False if "tobacco_smoking_status" not in map_json or map_json["tobacco_smoking_status"] is None: self.warn("tobacco_smoking_status required for exposure") diff --git a/src/clinical_etl/schema.py b/src/clinical_etl/schema.py index aa1b666..985e7e5 100644 --- a/src/clinical_etl/schema.py +++ b/src/clinical_etl/schema.py @@ -411,7 +411,7 @@ def validate_schema(self, schema_name, map_json): } self.statistics["required_but_missing"][schema_name][f]["total"] += 1 if f not in map_json or map_json[f] == "Not available": - # self.warn(f"{f} required for {schema_name}") + self.warn(f"{f} required for {schema_name}") self.statistics["required_but_missing"][schema_name][f]["missing"] += 1 if case not in self.statistics["cases_missing_data"]: self.statistics["cases_missing_data"].append(case) diff --git a/tests/test_data_ingest.py b/tests/test_data_ingest.py index 40de7ca..91320d5 100644 --- a/tests/test_data_ingest.py +++ b/tests/test_data_ingest.py @@ -85,17 +85,18 @@ def test_validation(packets, schema): schema.validate_ingest_map({"donors": packets}) print(schema.validation_warnings) warnings = [ - "DONOR_2 > PD_2: date_of_diagnosis is required. NOTE: cannot calculate any date intervals for this patient", - "DONOR_3 > PD_3: basis_of_diagnosis is a required field", + "DONOR_2 > PD_2: date_of_diagnosis required for primary_diagnoses", + "DONOR_2 > PD_2: NOTE: cannot calculate any date intervals for this patient without date_of_diagnosis", + "DONOR_3 > PD_3: basis_of_diagnosis required for primary_diagnoses", "DONOR_5: cause_of_death required if is_deceased = Yes", "DONOR_5: date_of_death required if is_deceased = Yes", - "DONOR_5 > PD_5: basis_of_diagnosis is a required field", + "DONOR_5 > PD_5: basis_of_diagnosis required for primary_diagnoses", "DONOR_5 > PD_5: clinical_stage_group is required for clinical_tumour_staging_system Revised International staging system (R-ISS)", - "DONOR_5 > PD_5 > TR_5 > Radiation 0: radiation_therapy_dosage is a required field", + "DONOR_5 > PD_5 > TR_5 > Radiation 0: radiation_therapy_dosage required for radiations", "DONOR_5 > PD_5 > TR_10: Treatment type Systemic therapy should have one or more systemic therapies submitted", ] assert (sorted(schema.validation_warnings) == sorted(warnings)) - assert len(schema.validation_warnings) == 8 + assert len(schema.validation_warnings) == 9 # temporary: remove 'month_interval' errors: