cmu-delphi · nmdefries · Nov 5, 2020 · Nov 2, 2020 · Nov 2, 2020 · Nov 3, 2020
diff --git a/validator/delphi_validator/validate.py b/validator/delphi_validator/validate.py
@@ -113,6 +113,7 @@ def __init__(self, params):
         """
         # Get user settings from params or if not provided, set default.
         self.data_source = params['data_source']
+        self.validator_static_file_dir = params.get('validator_static_file_dir', '../validator/static')
 
         # Date/time settings
         self.span_length = timedelta(days=params['span_length'])
@@ -244,9 +245,33 @@ def check_df_format(self, df_to_test, nameformat):
 
         self.increment_total_checks()
 
-    def check_bad_geo_id(self, df_to_test, nameformat, geo_type):
+    def check_bad_geo_id_value(self, df_to_test, filename, geo_type):
         """
-        Check validity of geo type and values, according to regex pattern.
+        Check geo_ids against known values (drawn from historical data); warn on uppercase ids.
+
+        Arguments:
+            - df_to_test: pandas dataframe of CSV source data containing the geo_id column to check
+            - filename: name of the CSV file being checked; attached to any error or warning raised
+            - geo_type: string from CSV name specifying geo type (state, county, msa, etc.) of data
+        """
+        file_path = join(self.validator_static_file_dir, geo_type + '_geo.csv')
+        valid_geos = set(pd.read_csv(file_path, dtype={'geo_id': str})['geo_id'])  # set: O(1) lookups
+        unexpected_geos = [geo for geo in df_to_test['geo_id'] if geo.lower() not in valid_geos]
+        if unexpected_geos:
+            self.raised_errors.append(ValidationError(
+                ("check_bad_geo_id_value", filename),
+                unexpected_geos, "Unrecognized geo_ids (not in historical data)"))
+        self.increment_total_checks()
+        upper_case_geos = [geo for geo in df_to_test['geo_id'] if geo.lower() != geo]
+        if upper_case_geos:
+            self.raised_warnings.append(ValidationError(
+                ("check_geo_id_lowercase", filename),
+                upper_case_geos, "geo_id contains uppercase characters. Lowercase is preferred."))
+        self.increment_total_checks()
+
+    def check_bad_geo_id_format(self, df_to_test, nameformat, geo_type):
+        """
+        Check validity of geo_type and format of geo_ids, according to regex pattern.
 
         Arguments:
             - df_to_test: pandas dataframe of CSV source data
@@ -720,8 +745,9 @@ def validate(self, export_dir):
             data_df = load_csv(join(export_dir, filename))
 
             self.check_df_format(data_df, filename)
-            self.check_bad_geo_id(
+            self.check_bad_geo_id_format(
                 data_df, filename, match.groupdict()['geo_type'])
+            self.check_bad_geo_id_value(data_df, filename, match.groupdict()['geo_type'])
             self.check_bad_val(data_df, filename, match.groupdict()['signal'])
             self.check_bad_se(data_df, filename)
             self.check_bad_sample_size(data_df, filename)

diff --git a/validator/params.json.template b/validator/params.json.template
@@ -4,6 +4,7 @@
     "end_date": "2020-09-08",
     "span_length": 3,
     "ref_window_size": 7,
+    "validator_static_file_dir": "../validator/static",
     "minimum_sample_size": 100,
     "missing_se_allowed": true,
     "missing_sample_size_allowed": true,

diff --git a/validator/scripts/unique_geoids.R b/validator/scripts/unique_geoids.R
@@ -0,0 +1,15 @@
+# Regenerate the validator's static geo_id lists (../static/<geo_type>_geo.csv); run from validator/scripts.
+library(covidcast)
+
+write_geo_ids <- function(geo_ids, geo_type) {
+  # One "geo_id" column of unique ids; FALSE is spelled out since `F` is a reassignable variable.
+  write.table(unique(geo_ids), file = paste0("../static/", geo_type, "_geo.csv"), row.names = FALSE, col.names = "geo_id")
+}
+
+for (type in c("county", "state", "hrr", "msa")) {
+  dtf <- covidcast_signal("indicator-combination", "confirmed_7dav_incidence_num", start_day = "2020-10-01", end_day = "2020-10-01", geo_type = type)
+  write_geo_ids(dtf$geo_value, type)
+}
+dtf <- covidcast_signal("ght", "raw_search", start_day = "2020-10-01", end_day = "2020-10-01", geo_type = "dma")
+write_geo_ids(dtf$geo_value, "dma")
+write_geo_ids("us", "national")  # national_geo.csv lists only "us"