qiita-spots · charles-cowart · Feb 9, 2023 · Feb 8, 2023 · Feb 9, 2023
diff --git a/qiita_db/metadata_template/base_metadata_template.py b/qiita_db/metadata_template/base_metadata_template.py
@@ -54,6 +54,22 @@
 # information
 QIITA_COLUMN_NAME = 'qiita_sample_column_names'
 
+INSDC_NULL_VALUES = {  # accepted lowercase spelling -> canonical value
+    'not collected': 'not collected',
+    'not provided': 'not provided',
+    'restricted access': 'restricted access',
+    'not applicable': 'not applicable',
+    'unspecified': 'not applicable',
+    'not_collected': 'not collected',
+    'not_provided': 'not provided',
+    'restricted_access': 'restricted access',
+    'not_applicable': 'not applicable',
+    'missing: not collected': 'not collected',
+    'missing: not provided': 'not provided',
+    'missing: restricted access': 'restricted access',
+    'missing: not applicable': 'not applicable',
+}
+
 
 def _helper_get_categories(table):
     """This is a helper function to avoid duplication of code"""
@@ -585,6 +601,15 @@ def _clean_validate_template(cls, md_template, study_id,
             raise qdb.exceptions.QiitaDBDuplicateHeaderError(
                 set(duplicates(md_template.columns)))
 
+        # normalize INSDC null values (any casing) to their canonical form
+        _df = md_template.fillna("").applymap(str).applymap(str.lower)
+        _mask = _df.isin(INSDC_NULL_VALUES.keys())
+        for c in _mask.columns[_mask.any(axis=0).values]:
+            # rewrite only matching cells; .loc avoids chained assignment
+            _hit = _mask[c].values
+            md_template.loc[_hit, c] = _df.loc[_hit, c].map(
+                INSDC_NULL_VALUES).values
+
         return md_template
 
     @classmethod

diff --git a/qiita_db/metadata_template/test/test_prep_template.py b/qiita_db/metadata_template/test/test_prep_template.py
@@ -298,6 +298,7 @@ def setUp(self):
                             'qiita_prep_id': 1000,
                             'instrument_model': 'Illumina MiSeq',
                             'library_construction_protocol': 'AAAA',
+                            'insdc_nulls': '3.6',
                             'experiment_design_description': 'BBBB'},
             'SKD8.640184': {'center_name': 'ANL',
                             'center_project_name': 'Test Project',
@@ -311,6 +312,7 @@ def setUp(self):
                             'qiita_prep_id': 1000,
                             'instrument_model': 'Illumina MiSeq',
                             'library_construction_protocol': 'AAAA',
+                            'insdc_nulls': 'NoT applicable',
                             'experiment_design_description': 'BBBB'},
             'SKB7.640196': {'center_name': 'ANL',
                             'center_project_name': 'Test Project',
@@ -324,6 +326,7 @@ def setUp(self):
                             'qiita_prep_id': 1000,
                             'instrument_model': 'Illumina MiSeq',
                             'library_construction_protocol': 'AAAA',
+                            'insdc_nulls': 'unspecified',
                             'experiment_design_description': 'BBBB'}
             }
         self.metadata = pd.DataFrame.from_dict(self.metadata_dict,
@@ -342,6 +345,7 @@ def setUp(self):
                               'qiita_prep_id': 1000,
                               'instrument_model': 'Illumina MiSeq',
                               'library_construction_protocol': 'AAAA',
+                              'insdc_nulls': '3.6',
                               'experiment_design_description': 'BBBB'},
             '1.SKD8.640184': {'center_name': 'ANL',
                               'center_project_name': 'Test Project',
@@ -355,6 +359,7 @@ def setUp(self):
                               'qiita_prep_id': 1000,
                               'instrument_model': 'Illumina MiSeq',
                               'library_construction_protocol': 'AAAA',
+                              'insdc_nulls': 'not applicable',
                               'experiment_design_description': 'BBBB'},
             '1.SKB7.640196': {'center_name': 'ANL',
                               'center_project_name': 'Test Project',
@@ -368,6 +373,7 @@ def setUp(self):
                               'qiita_prep_id': 1000,
                               'instrument_model': 'Illumina MiSeq',
                               'library_construction_protocol': 'AAAA',
+                              'insdc_nulls': 'not applicable',
                               'experiment_design_description': 'BBBB'}
             }
         self.metadata_prefixed = pd.DataFrame.from_dict(metadata_prefixed_dict,
@@ -730,6 +736,7 @@ def test_clean_validate_template(self):
                               'platform': 'Illumina',
                               'instrument_model': 'Illumina MiSeq',
                               'library_construction_protocol': 'AAAA',
+                              'insdc_nulls': '3.6',
                               'experiment_design_description': 'BBBB'},
             '2.SKD8.640184': {'center_name': 'ANL',
                               'center_project_name': 'Test Project',
@@ -742,6 +749,7 @@ def test_clean_validate_template(self):
                               'platform': 'Illumina',
                               'instrument_model': 'Illumina MiSeq',
                               'library_construction_protocol': 'AAAA',
+                              'insdc_nulls': 'not applicable',
                               'experiment_design_description': 'BBBB'},
             '2.SKB7.640196': {'center_name': 'ANL',
                               'center_project_name': 'Test Project',
@@ -754,13 +762,16 @@ def test_clean_validate_template(self):
                               'platform': 'Illumina',
                               'instrument_model': 'Illumina MiSeq',
                               'library_construction_protocol': 'AAAA',
+                              'insdc_nulls': 'not applicable',
                               'experiment_design_description': 'BBBB'}
             }
         exp = pd.DataFrame.from_dict(metadata_dict, orient='index', dtype=str)
+
         obs.sort_index(axis=0, inplace=True)
         obs.sort_index(axis=1, inplace=True)
         exp.sort_index(axis=0, inplace=True)
         exp.sort_index(axis=1, inplace=True)
+
         assert_frame_equal(obs, exp, check_like=True)
 
     def test_clean_validate_template_no_forbidden_words1(self):
@@ -909,7 +920,7 @@ def _common_creation_checks(self, pt, fp_count, name):
                           'run_prefix', 'barcode', 'primer', 'platform',
                           'instrument_model', 'experiment_design_description',
                           'library_construction_protocol', 'center_name',
-                          'center_project_name', 'emp_status'}
+                          'center_project_name', 'insdc_nulls', 'emp_status'}
         self.assertCountEqual(pt.categories, exp_categories)
         exp_dict = {
             '%s.SKB7.640196' % self.test_study.id: {
@@ -924,6 +935,7 @@ def _common_creation_checks(self, pt, fp_count, name):
                 'str_column': 'Value for sample 3',
                 'center_name': 'ANL',
                 'center_project_name': 'Test Project',
+                'insdc_nulls': 'not applicable',
                 'emp_status': 'EMP'},
             '%s.SKB8.640193' % self.test_study.id: {
                 'barcode': 'GTCCGCAAGTTA',
@@ -937,6 +949,7 @@ def _common_creation_checks(self, pt, fp_count, name):
                 'str_column': 'Value for sample 1',
                 'center_name': 'ANL',
                 'center_project_name': 'Test Project',
+                'insdc_nulls': '3.6',
                 'emp_status': 'EMP'},
             '%s.SKD8.640184' % self.test_study.id: {
                 'barcode': 'CGTAGAGCTCTC',
@@ -950,6 +963,7 @@ def _common_creation_checks(self, pt, fp_count, name):
                 'str_column': 'Value for sample 2',
                 'center_name': 'ANL',
                 'center_project_name': 'Test Project',
+                'insdc_nulls': 'not applicable',
                 'emp_status': 'EMP'}
         }
         for s_id in exp_sample_ids:
@@ -1068,7 +1082,7 @@ def test_create_warning(self):
         self.assertEqual(pt._get_sample_ids(), exp_sample_ids)
         self.assertEqual(len(pt), 3)
         exp_categories = {'str_column', 'ebi_submission_accession',
-                          'run_prefix', 'primer', 'platform',
+                          'run_prefix', 'primer', 'platform', 'insdc_nulls',
                           'instrument_model', 'experiment_design_description',
                           'library_construction_protocol', 'center_name',
                           'center_project_name', 'emp_status'}
@@ -1085,6 +1099,7 @@ def test_create_warning(self):
                 'str_column': 'Value for sample 3',
                 'center_name': 'ANL',
                 'center_project_name': 'Test Project',
+                'insdc_nulls': 'not applicable',
                 'emp_status': 'EMP'},
             '%s.SKB8.640193' % self.test_study.id: {
                 'ebi_submission_accession': None,
@@ -1097,6 +1112,7 @@ def test_create_warning(self):
                 'str_column': 'Value for sample 1',
                 'center_name': 'ANL',
                 'center_project_name': 'Test Project',
+                'insdc_nulls': '3.6',
                 'emp_status': 'EMP'},
             '%s.SKD8.640184' % self.test_study.id: {
                 'ebi_submission_accession': None,
@@ -1109,6 +1125,7 @@ def test_create_warning(self):
                 'str_column': 'Value for sample 2',
                 'center_name': 'ANL',
                 'center_project_name': 'Test Project',
+                'insdc_nulls': 'not applicable',
                 'emp_status': 'EMP'}
         }
         for s_id in exp_sample_ids:
@@ -1217,6 +1234,7 @@ def test_to_file(self):
         self._clean_up_files.append(fp)
         with open(fp, newline=None) as f:
             obs = f.read()
+
         self.assertEqual(obs, EXP_PREP_TEMPLATE.format(pt.id))
 
         # cleaning
@@ -1803,17 +1821,17 @@ def test_name_setter(self):
 EXP_PREP_TEMPLATE = (
     'sample_name\tbarcode\tcenter_name\tcenter_project_name\t'
     'ebi_submission_accession\temp_status\texperiment_design_description\t'
-    'instrument_model\tlibrary_construction_protocol\tplatform\tprimer\t'
-    'qiita_prep_id\trun_prefix\tstr_column\n'
+    'insdc_nulls\tinstrument_model\tlibrary_construction_protocol\tplatform\t'
+    'primer\tqiita_prep_id\trun_prefix\tstr_column\n'
     '1.SKB7.640196\tCCTCTGAGAGCT\tANL\tTest Project\t\tEMP\tBBBB\t'
-    'Illumina MiSeq\tAAAA\tIllumina\tGTGCCAGCMGCCGCGGTAA\t{0}\t'
-    's_G1_L002_sequences\tValue for sample 3\n'
+    'not applicable\tIllumina MiSeq\tAAAA\tIllumina\tGTGCCAGCMGCCGCGGTAA\t'
+    '{0}\ts_G1_L002_sequences\tValue for sample 3\n'
     '1.SKB8.640193\tGTCCGCAAGTTA\tANL\tTest Project\t\tEMP\tBBBB\t'
-    'Illumina MiSeq\tAAAA\tIllumina\tGTGCCAGCMGCCGCGGTAA\t{0}\t'
-    's_G1_L001_sequences\tValue for sample 1\n'
+    '3.6\tIllumina MiSeq\tAAAA\tIllumina\tGTGCCAGCMGCCGCGGTAA\t'
+    '{0}\ts_G1_L001_sequences\tValue for sample 1\n'
     '1.SKD8.640184\tCGTAGAGCTCTC\tANL\tTest Project\t\tEMP\tBBBB\t'
-    'Illumina MiSeq\tAAAA\tIllumina\tGTGCCAGCMGCCGCGGTAA\t{0}\t'
-    's_G1_L001_sequences\tValue for sample 2\n')
+    'not applicable\tIllumina MiSeq\tAAAA\tIllumina\tGTGCCAGCMGCCGCGGTAA\t'
+    '{0}\ts_G1_L001_sequences\tValue for sample 2\n')
 
 
 if __name__ == '__main__':