Skip to content

Commit

Permalink
Merge pull request #89 from CanDIG/yavyx/move-missing-cases
Browse files: browse the repository at this point in the history
DIG-1860: Move missing cases to validation file
  • Loading branch information
yavyx authored Nov 18, 2024
2 parents b57146f + 7d9796b commit e295780
Show file tree
Hide file tree
Showing 5 changed files with 19 additions and 70 deletions.
3 changes: 0 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -181,9 +181,6 @@ A summarised example of the output is below:
"schemas_used": [
"donors"
],
"cases_missing_data": [
"DONOR_5"
],
"schemas_not_used": [
"exposures",
"biomarkers"
Expand Down
16 changes: 7 additions & 9 deletions src/clinical_etl/CSVConvert.py
Original file line number Diff line number Diff line change
Expand Up @@ -739,28 +739,26 @@ def csv_convert(input_path, manifest_file, minify=False, index_output=False, ver
json.dump(mappings.INDEXED_DATA, f, indent=4)

result_key = list(schema.validation_schema.keys()).pop(0)

result = {
"openapi_url": schema.openapi_url,
"schema_class": type(schema).__name__,
result_key: packets
}
if schema.katsu_sha is not None:
result["katsu_sha"] = schema.katsu_sha
print(f"{Bcolors.OKGREEN}Saving packets to file.{Bcolors.ENDC}")
with open(f"{mappings.OUTPUT_FILE}_map.json", 'w') as f: # write to json file for ingestion
if minify:
json.dump(result, f)
else:
json.dump(result, f, indent=4)

# add validation data:
print(f"\n{Bcolors.OKGREEN}Starting validation...{Bcolors.ENDC}")
schema.validate_ingest_map(result)
validation_results = {"validation_errors": schema.validation_errors,
"validation_warnings": schema.validation_warnings}
"validation_warnings": schema.validation_warnings,
"cases_missing_data": schema.statistics["cases_missing_data"]}
result["statistics"] = schema.statistics
with open(f"{mappings.OUTPUT_FILE}_map.json", 'w') as f: # write to json file for ingestion
result["statistics"].pop("cases_missing_data") # remove donor IDs from _map.json file

# write ingestion and validation json files
print(f"{Bcolors.OKGREEN}Saving packets to file.{Bcolors.ENDC}")
with open(f"{mappings.OUTPUT_FILE}_map.json", 'w') as f:
if minify:
json.dump(result, f)
else:
Expand Down
57 changes: 5 additions & 52 deletions src/clinical_etl/mohschemav3.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,10 +164,6 @@ class MoHSchemaV3(BaseSchema):
}

def validate_donors(self, map_json):
missing = {field for field, val in map_json.items() if val is None}
for f in missing:
if f in self.validation_schema["donors"]["required_fields"]:
self.warn(f"{f} is a required field")
for prop in map_json:
match prop:
case "is_deceased":
Expand Down Expand Up @@ -287,15 +283,10 @@ def validate_donors(self, map_json):
self.warn("test_date is required for biomarkers not associated with nested events")

def validate_primary_diagnoses(self, map_json):
missing = {field for field, val in map_json.items() if val is None}
if "date_of_diagnosis" in missing:
self.warn("date_of_diagnosis is required. NOTE: cannot calculate any date intervals for this patient")
missing.remove("date_of_diagnosis")
for f in missing:
if f in self.validation_schema["primary_diagnoses"]["required_fields"]:
self.warn(f"{f} is a required field")
if map_json["date_of_diagnosis"] is None:
self.warn("NOTE: cannot calculate any date intervals for this patient without date_of_diagnosis")
if "clinical_tumour_staging_system" not in map_json and "pathological_tumour_staging_system" not in map_json:
self.warn("Either clinical_tumour_staging_system or pathological_staging_system is required")
self.warn("Either clinical_tumour_staging_system or pathological_staging_system is required")
for prop in map_json:
if prop == "clinical_tumour_staging_system":
self.validate_staging_system(map_json, "clinical")
Expand All @@ -317,10 +308,6 @@ def validate_staging_system(self, map_json, staging_type):
self.warn(f"{staging_type}_stage_group is required for {staging_type}_tumour_staging_system {map_json[f'{staging_type}_tumour_staging_system']}")

def validate_specimens(self, map_json):
missing = {field for field, val in map_json.items() if val is None}
for f in missing:
if f in self.validation_schema["specimens"]["required_fields"]:
self.warn(f"{f} is a required field")
if "sample_registrations" in map_json:
for sample in map_json["sample_registrations"]:
if "tumour_normal_designation" in sample and sample["tumour_normal_designation"] == "Tumour":
Expand All @@ -337,16 +324,9 @@ def validate_specimens(self, map_json):
self.warn(f"Tumour specimens require a {f}")

def validate_sample_registrations(self, map_json):
missing = {field for field, val in map_json.items() if val is None}
for f in missing:
if f in self.validation_schema["sample_registrations"]["required_fields"]:
self.warn(f"{f} is a required field")
return

def validate_treatments(self, map_json):
missing = {field for field, val in map_json.items() if val is None}
for f in missing:
if f in self.validation_schema["treatments"]["required_fields"]:
self.warn(f"{f} is a required field")
for prop in map_json:
if prop == "treatment_type" and map_json["treatment_type"] is not None:
for t_type in map_json["treatment_type"]:
Expand Down Expand Up @@ -390,10 +370,6 @@ def validate_treatments(self, map_json):
self.fail("Systemic therapy end date cannot be after its treatment end date.")

def validate_systemic_therapies(self, map_json):
missing = {field for field, val in map_json.items() if val is None}
for f in missing:
if f in self.validation_schema["systemic_therapies"]["required_fields"]:
self.warn(f"{f} is a required field")
if "drug_dose_units" not in map_json or map_json["drug_dose_units"] is None:
for x in ["prescribed_cumulative_drug_dose", "actual_cumulative_drug_dose"]:
if x in map_json and map_json[x] is not None:
Expand All @@ -415,26 +391,15 @@ def validate_systemic_therapies(self, map_json):
self.fail("Systemic therapy start cannot be after systemic therapy end.")

def validate_radiations(self, map_json):
missing = {field for field, val in map_json.items() if val is None}
for f in missing:
if f in self.validation_schema["radiations"]["required_fields"]:
self.warn(f"{f} is a required field")
for prop in map_json:
if prop == "radiation_boost" and map_json["radiation_boost"] == "Yes":
if "reference_radiation_treatment_id" not in map_json or map_json["reference_radiation_treatment_id"] is None:
self.warn("reference_radiation_treatment_id required if radiation_boost = Yes")

def validate_surgeries(self, map_json):
missing = {field for field, val in map_json.items() if val is None}
for f in missing:
if f in self.validation_schema["surgeries"]["required_fields"]:
self.warn(f"{f} is a required field")
return

def validate_followups(self, map_json):
missing = {field for field, val in map_json.items() if val is None}
for f in missing:
if f in self.validation_schema["followups"]["required_fields"]:
self.warn(f"{f} is a required field")
for prop in map_json:
if prop == "disease_status_at_followup":
states = [
Expand All @@ -457,31 +422,19 @@ def validate_followups(self, map_json):
self.warn(f"anatomic_site_progression_or_recurrence is required if disease_status_at_followup is {map_json['disease_status_at_followup']}")

def validate_biomarkers(self, map_json):
missing = {field for field, val in map_json.items() if val is None}
for f in missing:
if f in self.validation_schema["biomarkers"]["required_fields"]:
self.warn(f"{f} is a required field")
for prop in map_json:
match prop:
case "hpv_pcr_status":
if map_json["hpv_pcr_status"] == "Positive" and "hpv_strain" not in map_json:
self.warn("If hpv_pcr_status is positive, hpv_strain is required")

def validate_comorbidities(self, map_json):
missing = {field for field, val in map_json.items() if val is None}
for f in missing:
if f in self.validation_schema["comorbities"]["required_fields"]:
self.warn(f"{f} is a required field")
for prop in map_json:
if prop == "laterality_of_prior_malignancy":
if "prior_malignancy" not in map_json or map_json["prior_malignancy"] != "Yes":
self.fail("laterality_of_prior_malignancy should not be submitted unless prior_malignancy = Yes")

def validate_exposures(self, map_json):
missing = {field for field, val in map_json.items() if val is None}
for f in missing:
if f in self.validation_schema["exposures"]["required_fields"]:
self.warn(f"{f} is a required field")
is_smoker = False
if "tobacco_smoking_status" not in map_json or map_json["tobacco_smoking_status"] is None:
self.warn("tobacco_smoking_status required for exposure")
Expand Down
2 changes: 1 addition & 1 deletion src/clinical_etl/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -411,7 +411,7 @@ def validate_schema(self, schema_name, map_json):
}
self.statistics["required_but_missing"][schema_name][f]["total"] += 1
if f not in map_json or map_json[f] == "Not available":
# self.warn(f"{f} required for {schema_name}")
self.warn(f"{f} required for {schema_name}")
self.statistics["required_but_missing"][schema_name][f]["missing"] += 1
if case not in self.statistics["cases_missing_data"]:
self.statistics["cases_missing_data"].append(case)
Expand Down
11 changes: 6 additions & 5 deletions tests/test_data_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,17 +85,18 @@ def test_validation(packets, schema):
schema.validate_ingest_map({"donors": packets})
print(schema.validation_warnings)
warnings = [
"DONOR_2 > PD_2: date_of_diagnosis is required. NOTE: cannot calculate any date intervals for this patient",
"DONOR_3 > PD_3: basis_of_diagnosis is a required field",
"DONOR_2 > PD_2: date_of_diagnosis required for primary_diagnoses",
"DONOR_2 > PD_2: NOTE: cannot calculate any date intervals for this patient without date_of_diagnosis",
"DONOR_3 > PD_3: basis_of_diagnosis required for primary_diagnoses",
"DONOR_5: cause_of_death required if is_deceased = Yes",
"DONOR_5: date_of_death required if is_deceased = Yes",
"DONOR_5 > PD_5: basis_of_diagnosis is a required field",
"DONOR_5 > PD_5: basis_of_diagnosis required for primary_diagnoses",
"DONOR_5 > PD_5: clinical_stage_group is required for clinical_tumour_staging_system Revised International staging system (R-ISS)",
"DONOR_5 > PD_5 > TR_5 > Radiation 0: radiation_therapy_dosage is a required field",
"DONOR_5 > PD_5 > TR_5 > Radiation 0: radiation_therapy_dosage required for radiations",
"DONOR_5 > PD_5 > TR_10: Treatment type Systemic therapy should have one or more systemic therapies submitted",
]
assert (sorted(schema.validation_warnings) == sorted(warnings))
assert len(schema.validation_warnings) == 8
assert len(schema.validation_warnings) == 9


# temporary: remove 'month_interval' errors:
Expand Down

0 comments on commit e295780

Please sign in to comment.