Skip to content

Commit ad40a15

Browse files
committed
fix #3246
1 parent 6684ad0 commit ad40a15

File tree

2 files changed

53 additions and 10 deletions

qiita_db/metadata_template/base_metadata_template.py

Lines changed: 25 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -54,6 +54,22 @@
5454
# information
5555
QIITA_COLUMN_NAME = 'qiita_sample_column_names'
5656

57+
INSDC_NULL_VALUES = {
58+
'not collected': 'not collected',
59+
'not provided': 'not provided',
60+
'restricted access': 'restricted access',
61+
'not applicable': 'not applicable',
62+
'unspecified': 'not applicable',
63+
'not_collected': 'not collected',
64+
'not_provided': 'not provided',
65+
'restricted_access': 'restricted access',
66+
'not_applicable': 'not applicable',
67+
'missing: not collected': 'not collected',
68+
'missing: not provided': 'not provided',
69+
'missing: restricted access': 'restricted access',
70+
'missing: not applicable': 'not applicable',
71+
}
72+
5773

5874
def _helper_get_categories(table):
5975
"""This is a helper function to avoid duplication of code"""
@@ -585,6 +601,15 @@ def _clean_validate_template(cls, md_template, study_id,
585601
raise qdb.exceptions.QiitaDBDuplicateHeaderError(
586602
set(duplicates(md_template.columns)))
587603

604+
# normalize INSDC null values (case-insensitive) to their canonical spelling
605+
_df = md_template.fillna("").applymap(str.lower)
606+
_ddf = _df[_df.isin(INSDC_NULL_VALUES.keys()).any(axis=1)]
607+
if _ddf.shape[0] != 0:
608+
for c in _ddf.columns:
609+
if set(INSDC_NULL_VALUES) & set(_ddf[c].values):
610+
for s, v in _ddf[c].to_dict().items():
611+
md_template[c][s] = INSDC_NULL_VALUES[v]
612+
588613
return md_template
589614

590615
@classmethod

qiita_db/metadata_template/test/test_prep_template.py

Lines changed: 28 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,7 @@ def setUp(self):
298298
'qiita_prep_id': 1000,
299299
'instrument_model': 'Illumina MiSeq',
300300
'library_construction_protocol': 'AAAA',
301+
'insdc_nulls': '3.6',
301302
'experiment_design_description': 'BBBB'},
302303
'SKD8.640184': {'center_name': 'ANL',
303304
'center_project_name': 'Test Project',
@@ -311,6 +312,7 @@ def setUp(self):
311312
'qiita_prep_id': 1000,
312313
'instrument_model': 'Illumina MiSeq',
313314
'library_construction_protocol': 'AAAA',
315+
'insdc_nulls': 'NoT applicable',
314316
'experiment_design_description': 'BBBB'},
315317
'SKB7.640196': {'center_name': 'ANL',
316318
'center_project_name': 'Test Project',
@@ -324,6 +326,7 @@ def setUp(self):
324326
'qiita_prep_id': 1000,
325327
'instrument_model': 'Illumina MiSeq',
326328
'library_construction_protocol': 'AAAA',
329+
'insdc_nulls': 'unspecified',
327330
'experiment_design_description': 'BBBB'}
328331
}
329332
self.metadata = pd.DataFrame.from_dict(self.metadata_dict,
@@ -342,6 +345,7 @@ def setUp(self):
342345
'qiita_prep_id': 1000,
343346
'instrument_model': 'Illumina MiSeq',
344347
'library_construction_protocol': 'AAAA',
348+
'insdc_nulls': '3.6',
345349
'experiment_design_description': 'BBBB'},
346350
'1.SKD8.640184': {'center_name': 'ANL',
347351
'center_project_name': 'Test Project',
@@ -355,6 +359,7 @@ def setUp(self):
355359
'qiita_prep_id': 1000,
356360
'instrument_model': 'Illumina MiSeq',
357361
'library_construction_protocol': 'AAAA',
362+
'insdc_nulls': 'not applicable',
358363
'experiment_design_description': 'BBBB'},
359364
'1.SKB7.640196': {'center_name': 'ANL',
360365
'center_project_name': 'Test Project',
@@ -368,6 +373,7 @@ def setUp(self):
368373
'qiita_prep_id': 1000,
369374
'instrument_model': 'Illumina MiSeq',
370375
'library_construction_protocol': 'AAAA',
376+
'insdc_nulls': 'not applicable',
371377
'experiment_design_description': 'BBBB'}
372378
}
373379
self.metadata_prefixed = pd.DataFrame.from_dict(metadata_prefixed_dict,
@@ -730,6 +736,7 @@ def test_clean_validate_template(self):
730736
'platform': 'Illumina',
731737
'instrument_model': 'Illumina MiSeq',
732738
'library_construction_protocol': 'AAAA',
739+
'insdc_nulls': '3.6',
733740
'experiment_design_description': 'BBBB'},
734741
'2.SKD8.640184': {'center_name': 'ANL',
735742
'center_project_name': 'Test Project',
@@ -742,6 +749,7 @@ def test_clean_validate_template(self):
742749
'platform': 'Illumina',
743750
'instrument_model': 'Illumina MiSeq',
744751
'library_construction_protocol': 'AAAA',
752+
'insdc_nulls': 'not applicable',
745753
'experiment_design_description': 'BBBB'},
746754
'2.SKB7.640196': {'center_name': 'ANL',
747755
'center_project_name': 'Test Project',
@@ -754,13 +762,16 @@ def test_clean_validate_template(self):
754762
'platform': 'Illumina',
755763
'instrument_model': 'Illumina MiSeq',
756764
'library_construction_protocol': 'AAAA',
765+
'insdc_nulls': 'not applicable',
757766
'experiment_design_description': 'BBBB'}
758767
}
759768
exp = pd.DataFrame.from_dict(metadata_dict, orient='index', dtype=str)
769+
760770
obs.sort_index(axis=0, inplace=True)
761771
obs.sort_index(axis=1, inplace=True)
762772
exp.sort_index(axis=0, inplace=True)
763773
exp.sort_index(axis=1, inplace=True)
774+
764775
assert_frame_equal(obs, exp, check_like=True)
765776

766777
def test_clean_validate_template_no_forbidden_words1(self):
@@ -909,7 +920,7 @@ def _common_creation_checks(self, pt, fp_count, name):
909920
'run_prefix', 'barcode', 'primer', 'platform',
910921
'instrument_model', 'experiment_design_description',
911922
'library_construction_protocol', 'center_name',
912-
'center_project_name', 'emp_status'}
923+
'center_project_name', 'insdc_nulls', 'emp_status'}
913924
self.assertCountEqual(pt.categories, exp_categories)
914925
exp_dict = {
915926
'%s.SKB7.640196' % self.test_study.id: {
@@ -924,6 +935,7 @@ def _common_creation_checks(self, pt, fp_count, name):
924935
'str_column': 'Value for sample 3',
925936
'center_name': 'ANL',
926937
'center_project_name': 'Test Project',
938+
'insdc_nulls': 'not applicable',
927939
'emp_status': 'EMP'},
928940
'%s.SKB8.640193' % self.test_study.id: {
929941
'barcode': 'GTCCGCAAGTTA',
@@ -937,6 +949,7 @@ def _common_creation_checks(self, pt, fp_count, name):
937949
'str_column': 'Value for sample 1',
938950
'center_name': 'ANL',
939951
'center_project_name': 'Test Project',
952+
'insdc_nulls': '3.6',
940953
'emp_status': 'EMP'},
941954
'%s.SKD8.640184' % self.test_study.id: {
942955
'barcode': 'CGTAGAGCTCTC',
@@ -950,6 +963,7 @@ def _common_creation_checks(self, pt, fp_count, name):
950963
'str_column': 'Value for sample 2',
951964
'center_name': 'ANL',
952965
'center_project_name': 'Test Project',
966+
'insdc_nulls': 'not applicable',
953967
'emp_status': 'EMP'}
954968
}
955969
for s_id in exp_sample_ids:
@@ -1068,7 +1082,7 @@ def test_create_warning(self):
10681082
self.assertEqual(pt._get_sample_ids(), exp_sample_ids)
10691083
self.assertEqual(len(pt), 3)
10701084
exp_categories = {'str_column', 'ebi_submission_accession',
1071-
'run_prefix', 'primer', 'platform',
1085+
'run_prefix', 'primer', 'platform', 'insdc_nulls',
10721086
'instrument_model', 'experiment_design_description',
10731087
'library_construction_protocol', 'center_name',
10741088
'center_project_name', 'emp_status'}
@@ -1085,6 +1099,7 @@ def test_create_warning(self):
10851099
'str_column': 'Value for sample 3',
10861100
'center_name': 'ANL',
10871101
'center_project_name': 'Test Project',
1102+
'insdc_nulls': 'not applicable',
10881103
'emp_status': 'EMP'},
10891104
'%s.SKB8.640193' % self.test_study.id: {
10901105
'ebi_submission_accession': None,
@@ -1097,6 +1112,7 @@ def test_create_warning(self):
10971112
'str_column': 'Value for sample 1',
10981113
'center_name': 'ANL',
10991114
'center_project_name': 'Test Project',
1115+
'insdc_nulls': '3.6',
11001116
'emp_status': 'EMP'},
11011117
'%s.SKD8.640184' % self.test_study.id: {
11021118
'ebi_submission_accession': None,
@@ -1109,6 +1125,7 @@ def test_create_warning(self):
11091125
'str_column': 'Value for sample 2',
11101126
'center_name': 'ANL',
11111127
'center_project_name': 'Test Project',
1128+
'insdc_nulls': 'not applicable',
11121129
'emp_status': 'EMP'}
11131130
}
11141131
for s_id in exp_sample_ids:
@@ -1217,6 +1234,7 @@ def test_to_file(self):
12171234
self._clean_up_files.append(fp)
12181235
with open(fp, newline=None) as f:
12191236
obs = f.read()
1237+
12201238
self.assertEqual(obs, EXP_PREP_TEMPLATE.format(pt.id))
12211239

12221240
# cleaning
@@ -1803,17 +1821,17 @@ def test_name_setter(self):
18031821
EXP_PREP_TEMPLATE = (
18041822
'sample_name\tbarcode\tcenter_name\tcenter_project_name\t'
18051823
'ebi_submission_accession\temp_status\texperiment_design_description\t'
1806-
'instrument_model\tlibrary_construction_protocol\tplatform\tprimer\t'
1807-
'qiita_prep_id\trun_prefix\tstr_column\n'
1824+
'insdc_nulls\tinstrument_model\tlibrary_construction_protocol\tplatform\t'
1825+
'primer\tqiita_prep_id\trun_prefix\tstr_column\n'
18081826
'1.SKB7.640196\tCCTCTGAGAGCT\tANL\tTest Project\t\tEMP\tBBBB\t'
1809-
'Illumina MiSeq\tAAAA\tIllumina\tGTGCCAGCMGCCGCGGTAA\t{0}\t'
1810-
's_G1_L002_sequences\tValue for sample 3\n'
1827+
'not applicable\tIllumina MiSeq\tAAAA\tIllumina\tGTGCCAGCMGCCGCGGTAA\t'
1828+
'{0}\ts_G1_L002_sequences\tValue for sample 3\n'
18111829
'1.SKB8.640193\tGTCCGCAAGTTA\tANL\tTest Project\t\tEMP\tBBBB\t'
1812-
'Illumina MiSeq\tAAAA\tIllumina\tGTGCCAGCMGCCGCGGTAA\t{0}\t'
1813-
's_G1_L001_sequences\tValue for sample 1\n'
1830+
'3.6\tIllumina MiSeq\tAAAA\tIllumina\tGTGCCAGCMGCCGCGGTAA\t'
1831+
'{0}\ts_G1_L001_sequences\tValue for sample 1\n'
18141832
'1.SKD8.640184\tCGTAGAGCTCTC\tANL\tTest Project\t\tEMP\tBBBB\t'
1815-
'Illumina MiSeq\tAAAA\tIllumina\tGTGCCAGCMGCCGCGGTAA\t{0}\t'
1816-
's_G1_L001_sequences\tValue for sample 2\n')
1833+
'not applicable\tIllumina MiSeq\tAAAA\tIllumina\tGTGCCAGCMGCCGCGGTAA\t'
1834+
'{0}\ts_G1_L001_sequences\tValue for sample 2\n')
18171835

18181836

18191837
if __name__ == '__main__':

0 commit comments

Comments
 (0)