diff --git a/data-serving/data-service/schemas/cases.schema.json b/data-serving/data-service/schemas/cases.schema.json index 399709364..1a39b16d1 100644 --- a/data-serving/data-service/schemas/cases.schema.json +++ b/data-serving/data-service/schemas/cases.schema.json @@ -276,21 +276,6 @@ } } }, - "outbreakSpecifics": { - "bsonType": "object", - "additionalProperties": false, - "properties": { - "_id": { - "bsonType": "objectId" - }, - "livesInWuhan": { - "bsonType": "bool" - }, - "reportedMarketExposure": { - "bsonType": "bool" - } - } - }, "pathogens": { "bsonType": "array", "uniqueItems": true, @@ -416,6 +401,9 @@ }, "travel_history_binary": { "bsonType": "string" + }, + "lives_in_Wuhan": { + "bsonType": "string" } } } diff --git a/data-serving/data-service/src/model/case.ts b/data-serving/data-service/src/model/case.ts index 455d947ab..9056c3b2c 100644 --- a/data-serving/data-service/src/model/case.ts +++ b/data-serving/data-service/src/model/case.ts @@ -2,10 +2,6 @@ import { DemographicsDocument, demographicsSchema } from './demographics'; import { DictionaryDocument, dictionarySchema } from './dictionary'; import { EventDocument, eventSchema } from './event'; import { LocationDocument, locationSchema } from './location'; -import { - OutbreakSpecificsDocument, - outbreakSpecificsSchema, -} from './outbreak-specifics'; import { PathogenDocument, pathogenSchema } from './pathogen'; import { RevisionMetadataDocument, @@ -37,7 +33,6 @@ const caseSchema = new mongoose.Schema( required: 'Must include revision metadata', }, notes: String, - outbreakSpecifics: outbreakSpecificsSchema, pathogens: [pathogenSchema], sources: { type: [sourceSchema], @@ -75,7 +70,6 @@ type CaseDocument = mongoose.Document & { location: LocationDocument; revisionMetadata: RevisionMetadataDocument; notes: string; - outbreakSpecifics: OutbreakSpecificsDocument; pathogens: [PathogenDocument]; sources: [SourceDocument]; symptoms: DictionaryDocument; diff --git 
a/data-serving/data-service/src/model/outbreak-specifics.ts b/data-serving/data-service/src/model/outbreak-specifics.ts deleted file mode 100644 index 4a9990a60..000000000 --- a/data-serving/data-service/src/model/outbreak-specifics.ts +++ /dev/null @@ -1,11 +0,0 @@ -import mongoose from 'mongoose'; - -export const outbreakSpecificsSchema = new mongoose.Schema({ - livesInWuhan: Boolean, - reportedMarketExposure: Boolean, -}); - -export type OutbreakSpecificsDocument = mongoose.Document & { - livesInWuhan: boolean; - reportedMarketExposure: boolean; -}; diff --git a/data-serving/data-service/test/model/data/case.full.json b/data-serving/data-service/test/model/data/case.full.json index 8f996770d..7dd8dcf1b 100644 --- a/data-serving/data-service/test/model/data/case.full.json +++ b/data-serving/data-service/test/model/data/case.full.json @@ -105,10 +105,6 @@ "url": "https://www.colorado.gov/pacific/cdphe/news/10-new-presumptive-positive-cases-colorado-cdphe-confirms-limited-community-spread-covid-19" } ], - "outbreakSpecifics": { - "livesInWuhan": false, - "reportedMarketExposure": true - }, "pathogens": [ { "name": "sars-cov-2", @@ -131,13 +127,15 @@ "notes": "initial data entry" }, "importedCase": { - "additionalInformation": "Contact of a confirmed case at work.", - "notesForDiscussion": "Other stuff from notes", - "geoResolution": "admin_2", + "ID": "xyz", + "additional_information": "Contact of a confirmed case at work.", + "notes_for_discussion": "Other stuff from notes", + "geo_resolution": "admin_2", "symptoms": "severe pneumonia:dyspnea:weakness:some free-form symptoms:that don't match the symptom dictionary\"", - "chronicDiseaseBinary": true, - "chronicDisease": "hypertension:type 2 diabetes:coronary heart disease:lung cancer:some free-form chronic diseases:that don't match the chronic disease dictionary", + "chronic_disease_binary": "true", "outcome": "discharge 2/12", - "adminId": "291" + "admin_id": "291", + "lives_in_Wuhan": "false", + 
"reported_market_exposure": "true" } } \ No newline at end of file diff --git a/data-serving/data-service/test/model/data/outbreak-specifics.full.json b/data-serving/data-service/test/model/data/outbreak-specifics.full.json deleted file mode 100644 index f8ad6968d..000000000 --- a/data-serving/data-service/test/model/data/outbreak-specifics.full.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "livesInWuhan": false, - "reportedMarketExposure": true -} \ No newline at end of file diff --git a/data-serving/data-service/test/model/data/outbreak-specifics.minimal.json b/data-serving/data-service/test/model/data/outbreak-specifics.minimal.json deleted file mode 100644 index 9e26dfeeb..000000000 --- a/data-serving/data-service/test/model/data/outbreak-specifics.minimal.json +++ /dev/null @@ -1 +0,0 @@ -{} \ No newline at end of file diff --git a/data-serving/data-service/test/model/outbreak-specifics.test.ts b/data-serving/data-service/test/model/outbreak-specifics.test.ts deleted file mode 100644 index 8fe043e82..000000000 --- a/data-serving/data-service/test/model/outbreak-specifics.test.ts +++ /dev/null @@ -1,23 +0,0 @@ -import { - OutbreakSpecificsDocument, - outbreakSpecificsSchema, -} from '../../src/model/outbreak-specifics'; - -import fullModel from './data/date-range.full.json'; -import minimalModel from './data/date-range.minimal.json'; -import mongoose from 'mongoose'; - -const OutbreakSpecifics = mongoose.model( - 'OutbreakSpecifics', - outbreakSpecificsSchema, -); - -describe('validate', () => { - it('a minimal outbreak specifics document is valid', async () => { - return new OutbreakSpecifics(minimalModel).validate(); - }); - - it('a fully specified outbreak specifics document is valid', async () => { - return new OutbreakSpecifics(fullModel).validate(); - }); -}); diff --git a/data-serving/samples/cases.json b/data-serving/samples/cases.json index 783a74587..56cea59fb 100644 --- a/data-serving/samples/cases.json +++ b/data-serving/samples/cases.json @@ -158,10 
+158,6 @@ "url": "https://www.colorado.gov/pacific/cdphe/news/10-new-presumptive-positive-cases-colorado-cdphe-confirms-limited-community-spread-covid-19" } ], - "outbreakSpecifics": { - "livesInWuhan": false, - "reportedMarketExposure": true - }, "pathogens": [ { "name": "sars-cov-2", @@ -194,7 +190,9 @@ "geo_resolution": "admin_2", "chronic_disease_binary": "true", "outcome": "discharge 2/12", - "admin_id": "291" + "admin_id": "291", + "lives_in_Wuhan": "false", + "reported_market_exposure": "true" } }, { diff --git a/data-serving/scripts/convert-data/README.md b/data-serving/scripts/convert-data/README.md index 862232efd..d8dd9750c 100644 --- a/data-serving/scripts/convert-data/README.md +++ b/data-serving/scripts/convert-data/README.md @@ -31,8 +31,6 @@ The following fields are lossy: - `demographics.ageRange`: Some values are too large to be ages. Ex. row `002-23162` with age value `2073`. - `events[name='onsetSymptoms']`: Some values are in an invalid format, ex. row `000-1-20073` with value `08.03.20202` -- `outbreakSpecifics.reportedMarketExposure`: Some values are not bools, ex. row `000-1-13167` has value - `exposed to people who come back from wuhan` - `travelHistory.location`: This field is highly unstructured, and includes lists of locations, free-form text, and locations of all (unmarked) granularity. - `travelHistory.dateRange`: As with `events[name='onsetSymptoms']`, the date format varies. @@ -45,22 +43,16 @@ The following fields are lossy: The following fields are *not* lossy, although they require conversion to a new type: - `sex` -- `outbreakSpecifics.livesInWuhan` - `location.geometry.latitude`, `location.geometry.longitude` - `events[name='admissionHospital']`, `events[name='confirmed']`, `events[name='deathOrDischarge']` ### Future improvements -- Improve disambiguation of `travelHistory.location`. For example, if the person lives in Florida and has traveled to - Georgia, it's more likely to be the state than the country. 
- - Add validation logic to all dates to ensure that they are between 12/2019 and today. - If a date fails to parse/validate in the `mm/dd/yy` format, attempt to parse it in other formats, including `dd/mm/yy`, `mm.dd.yy`, and `dd.mm.yy`. -- Take free-form text from `outbreakSpecifics.reportedMarketExposure` and add it to the notes field. - - Clean up the source data in the case of obvious errors in the logs, e.g. ages in the thousands or dates with one too many or too few digits. @@ -109,6 +101,18 @@ Fields that can't be converted include: - `source.id` and `pathogens.sequenceSource.id`: Sources may have ids to link them to the new `sources` collection; it's possible that we may be able to backfill this later once that dataset is developed and we can cross-reference by URL. +Fields that are not carrying over to the new schema, though they will be included in `importedCase`: + +- Fields that were relevant early on in the outbreak, but aren't tracked any longer: `lives_in_Wuhan`, + `reported_market_exposure` + +- Fields supplanted by new values: `ID` + +- Non-normalized or redundant location fields, including `province`, `geo_resolution`, `location`, `admin3`, + `country_new`, `admin_id` + +- Fields whose values can be imputed from other fields: `geo_resolution`, `chronic_disease_binary` + ### Backfilled fields We are backfilling fields including: diff --git a/data-serving/scripts/convert-data/constants.py b/data-serving/scripts/convert-data/constants.py index f2435d972..0154a1509 100644 --- a/data-serving/scripts/convert-data/constants.py +++ b/data-serving/scripts/convert-data/constants.py @@ -16,14 +16,14 @@ ''' The path to the geocoding script in the nCoV2019 repo. ''' GEOCODER_REPO_PATH = 'code/sheet_cleaner/geocoding' -# TODO(khmoran): Include 'outcome' once the curator UI transitions to using the +# TODO(khmoran): Exclude 'outcome' once the curator UI transitions to using the # new events-based outcome field. 
LOSSY_FIELDS = [ 'ID', 'province', 'geo_resolution', 'date_onset_symptoms', 'date_admission_hospital', 'date_confirmation', 'travel_history_dates', 'travel_history_location', 'reported_market_exposure', 'chronic_disease_binary', 'outcome', 'location', 'admin3', 'country_new', - 'admin_id', 'travel_history_binary' + 'admin_id', 'travel_history_binary', 'lives_in_Wuhan' ] ''' diff --git a/data-serving/scripts/convert-data/convert_data.py b/data-serving/scripts/convert-data/convert_data.py index a89d822d7..082766c63 100644 --- a/data-serving/scripts/convert-data/convert_data.py +++ b/data-serving/scripts/convert-data/convert_data.py @@ -14,7 +14,7 @@ convert_demographics, convert_dictionary_field, convert_events, convert_imported_case, convert_location, convert_revision_metadata_field, convert_notes_field, convert_sources_field, convert_pathogens_field, - convert_outbreak_specifics, convert_travel_history) + convert_travel_history) from typing import Any from constants import ( DATA_CSV_FILENAME, DATA_GZIP_FILENAME, DATA_REPO_PATH, GEOCODER_DB_FILENAME, @@ -151,9 +151,6 @@ def convert(infile: str, outfile: str, geocoder: Any, json_case['pathogens'] = convert_pathogens_field( csv_case['sequence_available']) - json_case['outbreakSpecifics'] = convert_outbreak_specifics( - csv_case['ID'], csv_case['reported_market_exposure'], csv_case['lives_in_Wuhan']) - json_case['travelHistory'] = convert_travel_history( geocoder, csv_case['ID'], csv_case['travel_history_dates'], diff --git a/data-serving/scripts/convert-data/converters.py b/data-serving/scripts/convert-data/converters.py index 1d897a1c1..d991d88f4 100644 --- a/data-serving/scripts/convert-data/converters.py +++ b/data-serving/scripts/convert-data/converters.py @@ -402,49 +402,6 @@ def convert_pathogens_field(sequence: str) -> List[Dict[str, Any]]: }] if sources else None -def convert_outbreak_specifics(id: str, reported_market_exposure: str, - lives_in_wuhan: str) -> Dict[str, bool]: - ''' - Converts the 
covid-19-specific fields into a new outbreakSpecifics - object. - - Parameters: - id: The id of the input row for logging a failed conversion. - - Returns: - None: When the input is empty. - Dict[str, bool]: When the input is nonempty. The dictionary is in the - format: - { - 'reportedMarketExposure': bool, - 'livesInWuhan': bool - } - ''' - - outbreak_specifics = {} - - try: - normalized = parse_bool(reported_market_exposure) - if normalized is not None: - outbreak_specifics['reportedMarketExposure'] = normalized - except ValueError as e: - log_error( - id, 'reported_market_exposure', - 'outbreakSpecifics.reportedMarketExposure', - reported_market_exposure, e) - - try: - normalized = parse_bool(lives_in_wuhan) - if normalized is not None: - outbreak_specifics['livesInWuhan'] = normalized - except ValueError as e: - log_error( - id, 'lives_in_wuhan', 'outbreakSpecifics.livesInWuhan', - lives_in_wuhan, e) - - return outbreak_specifics or None - - def convert_travel_history(geocoder: Any, id: str, dates: str, location: str) -> Dict[str, Any]: '''