diff --git a/qiita_core/tests/test_configuration_manager.py b/qiita_core/tests/test_configuration_manager.py index 2aa71c36e..a81608645 100644 --- a/qiita_core/tests/test_configuration_manager.py +++ b/qiita_core/tests/test_configuration_manager.py @@ -201,6 +201,7 @@ def test_get_portal(self): obs._get_portal(self.conf) self.assertEqual(obs.portal_dir, "/gold_portal") + CONF = """ # ------------------------------ Main settings -------------------------------- [main] diff --git a/qiita_core/tests/test_util.py b/qiita_core/tests/test_util.py index 351e72c06..2b3fbdc27 100644 --- a/qiita_core/tests/test_util.py +++ b/qiita_core/tests/test_util.py @@ -64,5 +64,6 @@ def test_get_qiita_version(self): # testing just the version self.assertEqual(exp_version, qdb.__version__) + if __name__ == '__main__': main() diff --git a/qiita_db/exceptions.py b/qiita_db/exceptions.py index eea058c86..2b06c702a 100644 --- a/qiita_db/exceptions.py +++ b/qiita_db/exceptions.py @@ -111,4 +111,5 @@ class QiitaDBWarning(UserWarning): """Warning specific for the QiitaDB domain""" pass + warnings.simplefilter('always', QiitaDBWarning) diff --git a/qiita_db/handlers/tests/test_artifact.py b/qiita_db/handlers/tests/test_artifact.py index ebfde2075..07c1805d4 100644 --- a/qiita_db/handlers/tests/test_artifact.py +++ b/qiita_db/handlers/tests/test_artifact.py @@ -239,5 +239,6 @@ def test_post(self): data=data) self.assertEqual(obs.code, 200) + if __name__ == '__main__': main() diff --git a/qiita_db/handlers/tests/test_core.py b/qiita_db/handlers/tests/test_core.py index a0471de20..63be60a2f 100644 --- a/qiita_db/handlers/tests/test_core.py +++ b/qiita_db/handlers/tests/test_core.py @@ -20,5 +20,6 @@ def test_post(self): self.assertEqual(obs.code, 200) self.assertFalse(qdb.user.User.exists('new_user@test.foo')) + if __name__ == '__main__': main() diff --git a/qiita_db/handlers/tests/test_oauth2.py b/qiita_db/handlers/tests/test_oauth2.py index be5b7c117..2c4bd8f3d 100644 --- 
a/qiita_db/handlers/tests/test_oauth2.py +++ b/qiita_db/handlers/tests/test_oauth2.py @@ -294,5 +294,6 @@ def test_authenticate_password_missing_info(self): 'error_description': 'Oauth2 error: missing user information'} self.assertEqual(obs_body, exp) + if __name__ == "__main__": main() diff --git a/qiita_db/handlers/tests/test_plugin.py b/qiita_db/handlers/tests/test_plugin.py index 457104739..036a58f32 100644 --- a/qiita_db/handlers/tests/test_plugin.py +++ b/qiita_db/handlers/tests/test_plugin.py @@ -197,5 +197,6 @@ def test_post(self): data={}) self.assertEqual(obs.code, 200) + if __name__ == '__main__': main() diff --git a/qiita_db/handlers/tests/test_prep_template.py b/qiita_db/handlers/tests/test_prep_template.py index 9d76160ab..1a3f2f2ff 100644 --- a/qiita_db/handlers/tests/test_prep_template.py +++ b/qiita_db/handlers/tests/test_prep_template.py @@ -46,14 +46,18 @@ def test_get(self): path_builder = partial(join, db_test_template_dir) obs = loads(obs.body) - exp = {'data_type': '18S', - 'artifact': 1, - 'investigation_type': 'Metagenomics', - 'study': 1, - 'status': 'private', - 'qiime-map': path_builder('1_prep_1_qiime_19700101-000000.txt'), - 'prep-file': path_builder('1_prep_1_19700101-000000.txt')} - self.assertEqual(obs, exp) + + # have to check per key because since patch 51 we are updating the + # test info files + self.assertEqual(obs['data_type'], '18S') + self.assertEqual(obs['artifact'], 1) + self.assertEqual(obs['investigation_type'], 'Metagenomics') + self.assertEqual(obs['study'], 1) + self.assertEqual(obs['status'], 'private') + self.assertTrue(obs['qiime-map'].startswith( + path_builder('1_prep_1_qiime_'))) + self.assertTrue(obs['prep-file'].startswith( + path_builder('1_prep_1_'))) class PrepTemplateDataHandlerTests(OauthTestingBase): diff --git a/qiita_db/handlers/tests/test_processing_job.py b/qiita_db/handlers/tests/test_processing_job.py index 0a25612e2..c89d5cc26 100644 --- a/qiita_db/handlers/tests/test_processing_job.py +++ 
b/qiita_db/handlers/tests/test_processing_job.py @@ -274,5 +274,6 @@ def test_post_processing_job_status(self): self.assertEqual(qdb.processing_job.ProcessingJob(job_id).status, 'running') + if __name__ == '__main__': main() diff --git a/qiita_db/handlers/tests/test_reference.py b/qiita_db/handlers/tests/test_reference.py index 107ea7842..6403de40f 100644 --- a/qiita_db/handlers/tests/test_reference.py +++ b/qiita_db/handlers/tests/test_reference.py @@ -51,5 +51,6 @@ def test_get(self): exp = {'name': 'Greengenes', 'version': '13_8', 'files': fps} self.assertEqual(loads(obs.body), exp) + if __name__ == '__main__': main() diff --git a/qiita_db/meta_util.py b/qiita_db/meta_util.py index 33a84de37..84e7f70a8 100644 --- a/qiita_db/meta_util.py +++ b/qiita_db/meta_util.py @@ -154,9 +154,9 @@ def update_redis_stats(): artifact filepaths that are not present in the file system """ STUDY = qdb.study.Study - studies = {'public': STUDY.get_by_status('private'), - 'private': STUDY.get_by_status('public'), - 'sanbox': STUDY.get_by_status('sandbox')} + studies = {'public': STUDY.get_by_status('public'), + 'private': STUDY.get_by_status('private'), + 'sandbox': STUDY.get_by_status('sandbox')} number_studies = {k: len(v) for k, v in viewitems(studies)} number_of_samples = {} @@ -186,7 +186,8 @@ def update_redis_stats(): lat_longs = get_lat_longs() - num_studies_ebi = len(ebi_samples_prep) + num_studies_ebi = len([k for k, v in viewitems(ebi_samples_prep) + if v >= 1]) number_samples_ebi_prep = sum([v for _, v in viewitems(ebi_samples_prep)]) # generating file size stats diff --git a/qiita_db/metadata_template/base_metadata_template.py b/qiita_db/metadata_template/base_metadata_template.py index 13b2c9d9b..07a164f90 100644 --- a/qiita_db/metadata_template/base_metadata_template.py +++ b/qiita_db/metadata_template/base_metadata_template.py @@ -1430,7 +1430,11 @@ def validate(self, restriction_dict): else: valid_null = qdb.metadata_template.constants.EBI_NULL_VALUES for column, 
datatype in viewitems(restriction.columns): - for sample, val in viewitems(self.get_category(column)): + # sorting by key (sample id) so we always check in the + # same order, helpful for testing + cats_by_column = self.get_category(column) + for sample in sorted(cats_by_column): + val = cats_by_column[sample] # ignore if valid null value if val in valid_null: continue @@ -1439,11 +1443,8 @@ def validate(self, restriction_dict): val = str(val) formats = [ # 4 digits year - '%m/%d/%Y %H:%M:%S', '%m/%d/%Y %H:%M', - '%m/%d/%Y %H', '%m/%d/%Y', '%m/%Y', '%Y', - # 2 digits year - '%m/%d/%y %H:%M:%S', '%m/%d/%y %H:%M', - '%m/%d/%y %H', '%m/%d/%y', '%m/%y', '%y' + '%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M', + '%Y-%m-%d %H', '%Y-%m-%d', '%Y-%m', '%Y' ] date = None for fmt in formats: diff --git a/qiita_db/metadata_template/constants.py b/qiita_db/metadata_template/constants.py index fb7a4cdb0..b88f71235 100644 --- a/qiita_db/metadata_template/constants.py +++ b/qiita_db/metadata_template/constants.py @@ -89,4 +89,5 @@ def _col_iterator(): for cols in viewkeys(restriction.columns): yield cols + CONTROLLED_COLS = set(col for col in _col_iterator()) diff --git a/qiita_db/metadata_template/test/test_prep_template.py b/qiita_db/metadata_template/test/test_prep_template.py index 729147e2f..05caf8af9 100644 --- a/qiita_db/metadata_template/test/test_prep_template.py +++ b/qiita_db/metadata_template/test/test_prep_template.py @@ -915,6 +915,8 @@ def test_create_qiime_mapping_file(self): exp = pd.read_csv( exp_fp, sep='\t', infer_datetime_format=False, parse_dates=False, index_col=False, comment='\t') + obs = obs.reindex_axis(sorted(obs.columns), axis=1) + exp = exp.reindex_axis(sorted(exp.columns), axis=1) assert_frame_equal(obs, exp) diff --git a/qiita_db/metadata_template/test/test_sample_template.py b/qiita_db/metadata_template/test/test_sample_template.py index 3515a6ed8..c9addf20e 100644 --- a/qiita_db/metadata_template/test/test_sample_template.py +++ 
b/qiita_db/metadata_template/test/test_sample_template.py @@ -113,7 +113,7 @@ def test_getitem_required(self): """ self.assertEqual(self.tester['physical_specimen_location'], 'ANL') self.assertEqual(self.tester['collection_timestamp'], - '11/11/11 13:00:00') + '2011-11-11 13:00:00') self.assertTrue(self.tester['dna_extracted']) def test_getitem_dynamic(self): @@ -152,7 +152,7 @@ def test_values(self): """values returns an iterator over the values""" obs = self.tester.values() self.assertTrue(isinstance(obs, Iterable)) - exp = {'ANL', 'true', 'true', 'ENVO:soil', '11/11/11 13:00:00', + exp = {'ANL', 'true', 'true', 'ENVO:soil', '2011-11-11 13:00:00', '1001:M7', 'Cannabis Soil Microbiome', 'winter', 'n', '64.6 sand, 17.6 silt, 17.8 clay', '1118232', '0.15', '3483', 'root metagenome', '0.164', '114', '15', '1.41', '7.15', '0', @@ -170,7 +170,7 @@ def test_items(self): ('physical_specimen_remaining', 'true'), ('dna_extracted', 'true'), ('sample_type', 'ENVO:soil'), - ('collection_timestamp', '11/11/11 13:00:00'), + ('collection_timestamp', '2011-11-11 13:00:00'), ('host_subject_id', '1001:M7'), ('description', 'Cannabis Soil Microbiome'), ('season_environment', 'winter'), ('assigned_from_geo', 'n'), @@ -261,7 +261,7 @@ def setUp(self): 'physical_specimen_remaining': 'true', 'dna_extracted': 'true', 'sample_type': 'type1', - 'collection_timestamp': '05/29/2014 12:24:15', + 'collection_timestamp': '2014-05-29 12:24:15', 'host_subject_id': 'NotIdentified', 'Description': 'Test Sample 1', 'latitude': '42.42', @@ -272,7 +272,7 @@ def setUp(self): 'physical_specimen_remaining': 'true', 'dna_extracted': 'true', 'sample_type': 'type1', - 'collection_timestamp': '05/29/2014 12:24:15', + 'collection_timestamp': '2014-05-29 12:24:15', 'host_subject_id': 'NotIdentified', 'Description': 'Test Sample 2', 'latitude': '4.2', @@ -283,7 +283,7 @@ def setUp(self): 'physical_specimen_remaining': 'true', 'dna_extracted': 'true', 'sample_type': 'type1', - 'collection_timestamp': '05/29/2014 
12:24:15', + 'collection_timestamp': '2014-05-29 12:24:15', 'host_subject_id': 'NotIdentified', 'Description': 'Test Sample 3', 'latitude': '4.8', @@ -336,7 +336,7 @@ def setUp(self): 'dna_extracted': 'true', 'sample_type': '6', 'collection_timestamp': - '05/29/2014 12:24:15', + '2014-05-29 12:24:15', 'host_subject_id': 'NotIdentified', 'Description': 'Test Sample 1', 'latitude': '42.42', @@ -348,7 +348,7 @@ def setUp(self): 'dna_extracted': 'true', 'sample_type': '5', 'collection_timestamp': - '05/29/2014 12:24:15', + '2014-05-29 12:24:15', 'host_subject_id': 'the only one', 'Description': 'Test Sample 2', 'latitude': '4.2', @@ -360,7 +360,7 @@ def setUp(self): 'dna_extracted': 'true', 'sample_type': '10', 'collection_timestamp': - '05/29/2014 12:24:15', + '2014-05-29 12:24:15', 'host_subject_id': 'NotIdentified', 'Description': 'Test Sample 3', 'latitude': '4.8', @@ -377,7 +377,7 @@ def setUp(self): 'dna_extracted': 'true', 'sample_type': '6', 'collection_timestamp': - '05/29/2014 12:24:15', + '2014-05-29 12:24:15', 'host_subject_id': 'NotIdentified', 'Description': 'Test Sample 1', 'latitude': '42.42', @@ -389,7 +389,7 @@ def setUp(self): 'dna_extracted': 'true', 'sample_type': '5', 'collection_timestamp': - '05/29/2014 12:24:15', + '2014-05-29 12:24:15', 'host_subject_id': 'the only one', 'Description': 'Test Sample 2', 'latitude': '4.2', @@ -401,7 +401,7 @@ def setUp(self): 'dna_extracted': 'true', 'sample_type': '10', 'collection_timestamp': - '05/29/2014 12:24:15', + '2014-05-29 12:24:15', 'host_subject_id': 'NotIdentified', 'Description': 'Test Sample 3', 'latitude': '4.8', @@ -413,7 +413,7 @@ def setUp(self): 'dna_extracted': 'true', 'sample_type': '10', 'collection_timestamp': - '05/29/2014 12:24:15', + '2014-05-29 12:24:15', 'host_subject_id': 'NotIdentified', 'Description': 'Test Sample 4', 'latitude': '4.8', @@ -430,7 +430,7 @@ def setUp(self): 'dna_extracted': 'true', 'sample_type': '6', 'collection_timestamp': - '05/29/2014 12:24:15', + '2014-05-29 
12:24:15', 'host_subject_id': 'NotIdentified', 'Description': 'Test Sample 1', 'latitude': '42.42', @@ -443,7 +443,7 @@ def setUp(self): 'dna_extracted': 'true', 'sample_type': '5', 'collection_timestamp': - '05/29/2014 12:24:15', + '2014-05-29 12:24:15', 'host_subject_id': 'the only one', 'Description': 'Test Sample 2', 'latitude': '4.2', @@ -456,7 +456,7 @@ def setUp(self): 'dna_extracted': 'true', 'sample_type': '10', 'collection_timestamp': - '05/29/2014 12:24:15', + '2014-05-29 12:24:15', 'host_subject_id': 'NotIdentified', 'Description': 'Test Sample 3', 'latitude': '4.8', @@ -775,7 +775,7 @@ def test_clean_validate_template(self): 'dna_extracted': 'true', 'sample_type': 'type1', 'collection_timestamp': - '05/29/2014 12:24:15', + '2014-05-29 12:24:15', 'host_subject_id': 'NotIdentified', 'description': 'Test Sample 1', 'latitude': '42.42', @@ -787,7 +787,7 @@ def test_clean_validate_template(self): 'dna_extracted': 'true', 'sample_type': 'type1', 'collection_timestamp': - '05/29/2014 12:24:15', + '2014-05-29 12:24:15', 'host_subject_id': 'NotIdentified', 'description': 'Test Sample 2', 'latitude': '4.2', @@ -799,7 +799,7 @@ def test_clean_validate_template(self): 'dna_extracted': 'true', 'sample_type': 'type1', 'collection_timestamp': - '05/29/2014 12:24:15', + '2014-05-29 12:24:15', 'host_subject_id': 'NotIdentified', 'description': 'Test Sample 3', 'latitude': '4.8', @@ -908,7 +908,7 @@ def test_create(self): self.assertItemsEqual(st.categories(), exp_categories) exp_dict = { "%s.Sample1" % new_id: { - 'collection_timestamp': '05/29/2014 12:24:15', + 'collection_timestamp': '2014-05-29 12:24:15', 'description': "Test Sample 1", 'dna_extracted': 'true', 'host_subject_id': "NotIdentified", @@ -920,7 +920,7 @@ def test_create(self): 'taxon_id': '9606', 'scientific_name': 'homo sapiens'}, "%s.Sample2" % new_id: { - 'collection_timestamp': '05/29/2014 12:24:15', + 'collection_timestamp': '2014-05-29 12:24:15', 'description': "Test Sample 2", 'dna_extracted': 
'true', 'host_subject_id': "NotIdentified", @@ -932,7 +932,7 @@ def test_create(self): 'taxon_id': '9606', 'scientific_name': 'homo sapiens'}, "%s.Sample3" % new_id: { - 'collection_timestamp': '05/29/2014 12:24:15', + 'collection_timestamp': '2014-05-29 12:24:15', 'description': "Test Sample 3", 'dna_extracted': 'true', 'host_subject_id': "NotIdentified", @@ -974,7 +974,7 @@ def test_create_int_prefix(self): self.assertItemsEqual(st.categories(), exp_categories) exp_dict = { "%s.12.Sample1" % new_id: { - 'collection_timestamp': '05/29/2014 12:24:15', + 'collection_timestamp': '2014-05-29 12:24:15', 'description': "Test Sample 1", 'dna_extracted': 'true', 'host_subject_id': "NotIdentified", @@ -986,7 +986,7 @@ def test_create_int_prefix(self): 'taxon_id': '9606', 'scientific_name': 'homo sapiens'}, "%s.12.Sample2" % new_id: { - 'collection_timestamp': '05/29/2014 12:24:15', + 'collection_timestamp': '2014-05-29 12:24:15', 'description': "Test Sample 2", 'dna_extracted': 'true', 'host_subject_id': "NotIdentified", @@ -998,7 +998,7 @@ def test_create_int_prefix(self): 'taxon_id': '9606', 'scientific_name': 'homo sapiens'}, "%s.12.Sample3" % new_id: { - 'collection_timestamp': '05/29/2014 12:24:15', + 'collection_timestamp': '2014-05-29 12:24:15', 'description': "Test Sample 3", 'dna_extracted': 'true', 'host_subject_id': "NotIdentified", @@ -1040,7 +1040,7 @@ def test_create_str_prefixes(self): self.assertItemsEqual(st.categories(), exp_categories) exp_dict = { "%s.foo.Sample1" % new_id: { - 'collection_timestamp': '05/29/2014 12:24:15', + 'collection_timestamp': '2014-05-29 12:24:15', 'description': "Test Sample 1", 'dna_extracted': 'true', 'host_subject_id': "NotIdentified", @@ -1052,7 +1052,7 @@ def test_create_str_prefixes(self): 'taxon_id': '9606', 'scientific_name': 'homo sapiens'}, "%s.bar.Sample2" % new_id: { - 'collection_timestamp': '05/29/2014 12:24:15', + 'collection_timestamp': '2014-05-29 12:24:15', 'description': "Test Sample 2", 'dna_extracted': 
'true', 'host_subject_id': "NotIdentified", @@ -1064,7 +1064,7 @@ def test_create_str_prefixes(self): 'taxon_id': '9606', 'scientific_name': 'homo sapiens'}, "%s.foo.Sample3" % new_id: { - 'collection_timestamp': '05/29/2014 12:24:15', + 'collection_timestamp': '2014-05-29 12:24:15', 'description': "Test Sample 3", 'dna_extracted': 'true', 'host_subject_id': "NotIdentified", @@ -1108,7 +1108,7 @@ def test_create_already_prefixed_samples(self): self.assertItemsEqual(st.categories(), exp_categories) exp_dict = { "%s.Sample1" % new_id: { - 'collection_timestamp': '05/29/2014 12:24:15', + 'collection_timestamp': '2014-05-29 12:24:15', 'description': "Test Sample 1", 'dna_extracted': 'true', 'host_subject_id': "NotIdentified", @@ -1120,7 +1120,7 @@ def test_create_already_prefixed_samples(self): 'taxon_id': '9606', 'scientific_name': 'homo sapiens'}, "%s.Sample2" % new_id: { - 'collection_timestamp': '05/29/2014 12:24:15', + 'collection_timestamp': '2014-05-29 12:24:15', 'description': "Test Sample 2", 'dna_extracted': 'true', 'host_subject_id': "NotIdentified", @@ -1132,7 +1132,7 @@ def test_create_already_prefixed_samples(self): 'taxon_id': '9606', 'scientific_name': 'homo sapiens'}, "%s.Sample3" % new_id: { - 'collection_timestamp': '05/29/2014 12:24:15', + 'collection_timestamp': '2014-05-29 12:24:15', 'description': "Test Sample 3", 'dna_extracted': 'true', 'host_subject_id': "NotIdentified", @@ -1343,7 +1343,7 @@ def test_extend_add_samples(self): 'physical_specimen_remaining': 'true', 'dna_extracted': 'true', 'sample_type': 'type1', - 'collection_timestamp': '05/29/2014 12:24:15', + 'collection_timestamp': '2014-05-29 12:24:15', 'host_subject_id': 'NotIdentified', 'Description': 'Test Sample 4', 'latitude': '42.42', @@ -1354,7 +1354,7 @@ def test_extend_add_samples(self): 'physical_specimen_remaining': 'true', 'dna_extracted': 'true', 'sample_type': 'type1', - 'collection_timestamp': '05/29/2014 12:24:15', + 'collection_timestamp': '2014-05-29 12:24:15', 
'host_subject_id': 'NotIdentified', 'Description': 'Test Sample 5', 'latitude': '42.42', @@ -1379,7 +1379,7 @@ def test_extend_add_samples(self): self.assertItemsEqual(st.categories(), exp_categories) exp_dict = { "%s.Sample1" % st.id: { - 'collection_timestamp': '05/29/2014 12:24:15', + 'collection_timestamp': '2014-05-29 12:24:15', 'description': "Test Sample 1", 'dna_extracted': 'true', 'host_subject_id': "NotIdentified", @@ -1391,7 +1391,7 @@ def test_extend_add_samples(self): 'taxon_id': '9606', 'scientific_name': 'homo sapiens'}, "%s.Sample2" % st.id: { - 'collection_timestamp': '05/29/2014 12:24:15', + 'collection_timestamp': '2014-05-29 12:24:15', 'description': "Test Sample 2", 'dna_extracted': 'true', 'host_subject_id': "NotIdentified", @@ -1403,7 +1403,7 @@ def test_extend_add_samples(self): 'taxon_id': '9606', 'scientific_name': 'homo sapiens'}, "%s.Sample3" % st.id: { - 'collection_timestamp': '05/29/2014 12:24:15', + 'collection_timestamp': '2014-05-29 12:24:15', 'description': "Test Sample 3", 'dna_extracted': 'true', 'host_subject_id': "NotIdentified", @@ -1419,7 +1419,7 @@ def test_extend_add_samples(self): 'physical_specimen_remaining': 'true', 'dna_extracted': 'true', 'sample_type': 'type1', - 'collection_timestamp': '05/29/2014 12:24:15', + 'collection_timestamp': '2014-05-29 12:24:15', 'host_subject_id': 'NotIdentified', 'description': 'Test Sample 4', 'latitude': '42.42', @@ -1431,7 +1431,7 @@ def test_extend_add_samples(self): 'physical_specimen_remaining': 'true', 'dna_extracted': 'true', 'sample_type': 'type1', - 'collection_timestamp': '05/29/2014 12:24:15', + 'collection_timestamp': '2014-05-29 12:24:15', 'host_subject_id': 'NotIdentified', 'description': 'Test Sample 5', 'latitude': '42.42', @@ -1452,7 +1452,7 @@ def test_extend_add_duplicate_samples(self): 'physical_specimen_remaining': 'true', 'dna_extracted': 'true', 'sample_type': 'type1', - 'collection_timestamp': '05/29/2014 12:24:15', + 'collection_timestamp': '2014-05-29 
12:24:15', 'host_subject_id': 'NotIdentified', 'Description': 'Test Sample 4', 'latitude': '42.42', @@ -1484,7 +1484,7 @@ def test_extend_add_duplicate_samples(self): self.assertItemsEqual(st.categories(), exp_categories) exp_dict = { "%s.Sample1" % st.id: { - 'collection_timestamp': '05/29/2014 12:24:15', + 'collection_timestamp': '2014-05-29 12:24:15', 'description': "Test Sample 1", 'dna_extracted': 'true', 'host_subject_id': "NotIdentified", @@ -1496,7 +1496,7 @@ def test_extend_add_duplicate_samples(self): 'taxon_id': '9606', 'scientific_name': 'homo sapiens'}, "%s.Sample2" % st.id: { - 'collection_timestamp': '05/29/2014 12:24:15', + 'collection_timestamp': '2014-05-29 12:24:15', 'description': "Test Sample 2", 'dna_extracted': 'true', 'host_subject_id': "NotIdentified", @@ -1508,7 +1508,7 @@ def test_extend_add_duplicate_samples(self): 'taxon_id': '9606', 'scientific_name': 'homo sapiens'}, "%s.Sample3" % st.id: { - 'collection_timestamp': '05/29/2014 12:24:15', + 'collection_timestamp': '2014-05-29 12:24:15', 'description': "Test Sample 3", 'dna_extracted': 'true', 'host_subject_id': "NotIdentified", @@ -1524,7 +1524,7 @@ def test_extend_add_duplicate_samples(self): 'physical_specimen_remaining': 'true', 'dna_extracted': 'true', 'sample_type': 'type1', - 'collection_timestamp': '05/29/2014 12:24:15', + 'collection_timestamp': '2014-05-29 12:24:15', 'host_subject_id': 'NotIdentified', 'description': 'Test Sample 4', 'latitude': '42.42', @@ -1566,7 +1566,7 @@ def test_extend_new_columns(self): self.assertItemsEqual(st.categories(), exp_categories) exp_dict = { "%s.Sample1" % st.id: { - 'collection_timestamp': '05/29/2014 12:24:15', + 'collection_timestamp': '2014-05-29 12:24:15', 'description': "Test Sample 1", 'dna_extracted': 'true', 'host_subject_id': "NotIdentified", @@ -1580,7 +1580,7 @@ def test_extend_new_columns(self): 'texture': 'val1', 'tot_nitro': 'val_1'}, "%s.Sample2" % st.id: { - 'collection_timestamp': '05/29/2014 12:24:15', + 
'collection_timestamp': '2014-05-29 12:24:15', 'description': "Test Sample 2", 'dna_extracted': 'true', 'host_subject_id': "NotIdentified", @@ -1594,7 +1594,7 @@ def test_extend_new_columns(self): 'texture': 'val2', 'tot_nitro': 'val_2'}, "%s.Sample3" % st.id: { - 'collection_timestamp': '05/29/2014 12:24:15', + 'collection_timestamp': '2014-05-29 12:24:15', 'description': "Test Sample 3", 'dna_extracted': 'true', 'host_subject_id': "NotIdentified", @@ -1620,7 +1620,7 @@ def test_extend_new_samples_and_columns(self): 'physical_specimen_remaining': 'true', 'dna_extracted': 'true', 'sample_type': 'type1', - 'collection_timestamp': '05/29/2014 12:24:15', + 'collection_timestamp': '2014-05-29 12:24:15', 'host_subject_id': 'NotIdentified', 'Description': 'Test Sample 4', 'latitude': '42.42', @@ -1652,7 +1652,7 @@ def test_extend_new_samples_and_columns(self): self.assertItemsEqual(st.categories(), exp_categories) exp_dict = { "%s.Sample1" % st.id: { - 'collection_timestamp': '05/29/2014 12:24:15', + 'collection_timestamp': '2014-05-29 12:24:15', 'description': "Test Sample 1", 'dna_extracted': 'true', 'host_subject_id': "NotIdentified", @@ -1665,7 +1665,7 @@ def test_extend_new_samples_and_columns(self): 'scientific_name': 'homo sapiens', 'tot_nitro': 'val1'}, "%s.Sample2" % st.id: { - 'collection_timestamp': '05/29/2014 12:24:15', + 'collection_timestamp': '2014-05-29 12:24:15', 'description': "Test Sample 2", 'dna_extracted': 'true', 'host_subject_id': "NotIdentified", @@ -1678,7 +1678,7 @@ def test_extend_new_samples_and_columns(self): 'scientific_name': 'homo sapiens', 'tot_nitro': 'val2'}, "%s.Sample3" % st.id: { - 'collection_timestamp': '05/29/2014 12:24:15', + 'collection_timestamp': '2014-05-29 12:24:15', 'description': "Test Sample 3", 'dna_extracted': 'true', 'host_subject_id': "NotIdentified", @@ -1695,7 +1695,7 @@ def test_extend_new_samples_and_columns(self): 'physical_specimen_remaining': 'true', 'dna_extracted': 'true', 'sample_type': 'type1', - 
'collection_timestamp': '05/29/2014 12:24:15', + 'collection_timestamp': '2014-05-29 12:24:15', 'host_subject_id': 'NotIdentified', 'description': 'Test Sample 4', 'latitude': '42.42', @@ -1716,7 +1716,7 @@ def test_extend_update(self): 'physical_specimen_remaining': 'true', 'dna_extracted': 'true', 'sample_type': 'type1', - 'collection_timestamp': '05/29/2014 12:24:15', + 'collection_timestamp': '2014-05-29 12:24:15', 'host_subject_id': 'NotIdentified', 'Description': 'Test Sample 4', 'latitude': '42.42', @@ -1747,7 +1747,7 @@ def test_extend_update(self): self.assertItemsEqual(st.categories(), exp_categories) exp_dict = { "%s.Sample1" % st.id: { - 'collection_timestamp': '05/29/2014 12:24:15', + 'collection_timestamp': '2014-05-29 12:24:15', 'description': "Changed", 'dna_extracted': 'true', 'host_subject_id': "NotIdentified", @@ -1760,7 +1760,7 @@ def test_extend_update(self): 'scientific_name': 'homo sapiens', 'tot_nitro': 'val1'}, "%s.Sample2" % st.id: { - 'collection_timestamp': '05/29/2014 12:24:15', + 'collection_timestamp': '2014-05-29 12:24:15', 'description': "Test Sample 2", 'dna_extracted': 'true', 'host_subject_id': "NotIdentified", @@ -1773,7 +1773,7 @@ def test_extend_update(self): 'scientific_name': 'Changed dynamic', 'tot_nitro': 'val2'}, "%s.Sample3" % st.id: { - 'collection_timestamp': '05/29/2014 12:24:15', + 'collection_timestamp': '2014-05-29 12:24:15', 'description': "Test Sample 3", 'dna_extracted': 'true', 'host_subject_id': "NotIdentified", @@ -1790,7 +1790,7 @@ def test_extend_update(self): 'physical_specimen_remaining': 'true', 'dna_extracted': 'true', 'sample_type': 'type1', - 'collection_timestamp': '05/29/2014 12:24:15', + 'collection_timestamp': '2014-05-29 12:24:15', 'host_subject_id': 'NotIdentified', 'description': 'Test Sample 4', 'latitude': '42.42', @@ -1814,7 +1814,7 @@ def test_to_dataframe(self): 'dna_extracted': 'true', 'sample_type': 'type1', 'collection_timestamp': - '05/29/2014 12:24:15', + '2014-05-29 12:24:15', 
'host_subject_id': 'NotIdentified', 'description': 'Test Sample 1', 'latitude': '42.42', @@ -1828,7 +1828,7 @@ def test_to_dataframe(self): 'dna_extracted': 'true', 'sample_type': 'type1', 'collection_timestamp': - '05/29/2014 12:24:15', + '2014-05-29 12:24:15', 'host_subject_id': 'NotIdentified', 'description': 'Test Sample 2', 'latitude': '4.2', @@ -1842,7 +1842,7 @@ def test_to_dataframe(self): 'dna_extracted': 'true', 'sample_type': 'type1', 'collection_timestamp': - '05/29/2014 12:24:15', + '2014-05-29 12:24:15', 'host_subject_id': 'NotIdentified', 'description': 'Test Sample 3', 'latitude': '4.8', @@ -2084,11 +2084,11 @@ def test_validate_errors(self): def test_validate_errors_timestampA_year4digits(self): self.metadata.set_value('Sample1', 'collection_timestamp', - '09/20/2016 12:00') + '2016-09-20 12:00') self.metadata.set_value('Sample2', 'collection_timestamp', - '9/20/2016 12') + '2016-09-20 12') self.metadata.set_value('Sample3', 'collection_timestamp', - '09/20/2016') + '2016-09-20') with catch_warnings(record=True) as warn: qdb.metadata_template.sample_template.SampleTemplate.create( @@ -2098,21 +2098,35 @@ def test_validate_errors_timestampA_year4digits(self): def test_validate_errors_timestampA_year2digits(self): self.metadata.set_value('Sample1', 'collection_timestamp', - '09/20/16 12:00') + '16-09-20 12:00') self.metadata.set_value('Sample2', 'collection_timestamp', '9/20/16 12') self.metadata.set_value('Sample3', 'collection_timestamp', - '09/20/16') + '09-20-16') with catch_warnings(record=True) as warn: - qdb.metadata_template.sample_template.SampleTemplate.create( + st = qdb.metadata_template.sample_template.SampleTemplate.create( self.metadata, self.new_study) - # the warnings should be empty - self.assertEqual(warn, []) + exp_message = ( + 'Some functionality will be disabled due to missing ' + 'columns:\n\t' + 'Sample "{0}.Sample1", column "collection_timestamp", ' + 'wrong value "16-09-20 12:00";\n\t' + 'Sample "{0}.Sample2", column 
"collection_timestamp", ' + 'wrong value "9/20/16 12";\n\t' + 'Sample "{0}.Sample3", column "collection_timestamp", ' + 'wrong value "09-20-16".\n' + 'See the Templates tutorial ' + 'for a description of these fields.'.format(st.id)) + # warnings is a list of 1 element + self.assertEqual(len(warn), 1) + # the order might change so testing by elements + self.assertItemsEqual(str(warn[0].message).split('\n'), + exp_message.split('\n')) def test_validate_errors_timestampB_year4digits(self): self.metadata.set_value('Sample1', 'collection_timestamp', - '12/2016') + '2016-12') self.metadata.set_value('Sample2', 'collection_timestamp', '2016') with catch_warnings(record=True) as warn: @@ -2123,14 +2137,24 @@ def test_validate_errors_timestampB_year4digits(self): def test_validate_errors_timestampB_year2digits(self): self.metadata.set_value('Sample1', 'collection_timestamp', - '12/16') + '16-12') self.metadata.set_value('Sample2', 'collection_timestamp', '16') with catch_warnings(record=True) as warn: - qdb.metadata_template.sample_template.SampleTemplate.create( + st = qdb.metadata_template.sample_template.SampleTemplate.create( self.metadata, self.new_study) - # the warnings should be empty - self.assertEqual(warn, []) + exp_message = ( + 'Some functionality will be disabled due to missing ' + 'columns:\n\t' + 'Sample "{0}.Sample1", column "collection_timestamp", wrong ' + 'value "16-12";\n\t' + 'Sample "{0}.Sample2", column "collection_timestamp", wrong ' + 'value "16".\n' + 'See the Templates tutorial for a description ' + 'of these fields.'.format(st.id)) + # warnings is a list of 1 element + self.assertEqual(len(warn), 1) + self.assertEqual(str(warn[0].message), exp_message) def test_delete_column(self): st = qdb.metadata_template.sample_template.SampleTemplate.create( @@ -2146,7 +2170,7 @@ def test_delete_sample(self): 'physical_specimen_remaining': 'true', 'dna_extracted': 'true', 'sample_type': 'type1', - 'collection_timestamp': '05/29/2014 12:24:15', + 
'collection_timestamp': '2014-05-29 12:24:15', 'host_subject_id': 'NotIdentified', 'Description': 'Test Sample 4', 'latitude': '42.42', @@ -2172,11 +2196,11 @@ def test_delete_sample(self): "host_subject_id\tlatitude\tlongitude\tphysical_specimen_location\t" "physical_specimen_remaining\tqiita_study_id\tsample_type\t" "scientific_name\ttaxon_id\n" - "{0}.Sample1\t05/29/2014 12:24:15\tTest Sample 1\ttrue\tNotIdentified\t" + "{0}.Sample1\t2014-05-29 12:24:15\tTest Sample 1\ttrue\tNotIdentified\t" "42.42\t41.41\tlocation1\ttrue\t{0}\ttype1\thomo sapiens\t9606\n" - "{0}.Sample2\t05/29/2014 12:24:15\tTest Sample 2\ttrue\tNotIdentified\t" + "{0}.Sample2\t2014-05-29 12:24:15\tTest Sample 2\ttrue\tNotIdentified\t" "4.2\t1.1\tlocation1\ttrue\t{0}\ttype1\thomo sapiens\t9606\n" - "{0}.Sample3\t05/29/2014 12:24:15\tTest Sample 3\ttrue\tNotIdentified\t" + "{0}.Sample3\t2014-05-29 12:24:15\tTest Sample 3\ttrue\tNotIdentified\t" "4.8\t4.41\tlocation1\ttrue\t{0}\ttype1\thomo sapiens\t9606\n") EXP_SAMPLE_TEMPLATE_FEWER_SAMPLES = ( @@ -2184,9 +2208,9 @@ def test_delete_sample(self): "host_subject_id\tlatitude\tlongitude\tphysical_specimen_location\t" "physical_specimen_remaining\tqiita_study_id\tsample_type\t" "scientific_name\ttaxon_id\n" - "{0}.Sample1\t05/29/2014 12:24:15\tTest Sample 1\ttrue\tNotIdentified\t" + "{0}.Sample1\t2014-05-29 12:24:15\tTest Sample 1\ttrue\tNotIdentified\t" "42.42\t41.41\tlocation1\ttrue\t{0}\ttype1\thomo sapiens\t9606\n" - "{0}.Sample3\t05/29/2014 12:24:15\tTest Sample 3\ttrue\tNotIdentified\t" + "{0}.Sample3\t2014-05-29 12:24:15\tTest Sample 3\ttrue\tNotIdentified\t" "4.8\t4.41\tlocation1\ttrue\t{0}\ttype1\thomo sapiens\t9606\n") diff --git a/qiita_db/metadata_template/test/test_util.py b/qiita_db/metadata_template/test/test_util.py index ad43be487..40d7cc6cc 100644 --- a/qiita_db/metadata_template/test/test_util.py +++ b/qiita_db/metadata_template/test/test_util.py @@ -299,44 +299,44 @@ def test_get_pgsql_reserved_words(self): 
"sample_name\tcollection_timestamp\tdescription\thas_extracted_data\t" "has_physical_specimen\thost_subject_id\tint_column\tlatitude\tlongitude\t" "physical_location\trequired_sample_info_status\tsample_type\tstr_column\n" - "2.Sample1\t05/29/2014 12:24:51\tTest Sample 1\tTrue\tTrue\tNotIdentified" + "2.Sample1\t2014-05-29 12:24:51\tTest Sample 1\tTrue\tTrue\tNotIdentified" "\t1\t42.42\t41.41\tlocation1\treceived\ttype1\tValue for sample 1\n" - "2.Sample2\t05/29/2014 12:24:51\tTest Sample 2\tTrue\tTrue\tNotIdentified" + "2.Sample2\t2014-05-29 12:24:51\tTest Sample 2\tTrue\tTrue\tNotIdentified" "\t2\t4.2\t1.1\tlocation1\treceived\ttype1\tValue for sample 2\n" - "2.Sample3\t05/29/2014 12:24:51\tTest Sample 3\tTrue\tTrue\tNotIdentified" + "2.Sample3\t2014-05-29 12:24:51\tTest Sample 3\tTrue\tTrue\tNotIdentified" "\t3\t4.8\t4.41\tlocation1\treceived\ttype1\tValue for sample 3\n") EXP_SAMPLE_TEMPLATE_MULTICASE = ( "sAmPle_Name\tcollection_timestamp\tDescription\thas_extracted_data\t" "has_physical_specimen\thost_Subject_id\tint_column\tlatitude\tLongitude\t" "physical_location\trequired_sample_info_status\tsample_type\tstr_CoLumn\n" - "2.Sample1\t05/29/2014 12:24:51\tTest Sample 1\tTrue\tTrue\tNotIdentified" + "2.Sample1\t2014-05-29 12:24:51\tTest Sample 1\tTrue\tTrue\tNotIdentified" "\t1\t42.42\t41.41\tlocation1\treceived\ttype1\tValue for sample 1\n" - "2.Sample2\t05/29/2014 12:24:51\tTest Sample 2\tTrue\tTrue\tNotIdentified" + "2.Sample2\t2014-05-29 12:24:51\tTest Sample 2\tTrue\tTrue\tNotIdentified" "\t2\t4.2\t1.1\tlocation1\treceived\ttype1\tValue for sample 2\n" - "2.Sample3\t05/29/2014 12:24:51\tTest Sample 3\tTrue\tTrue\tNotIdentified" + "2.Sample3\t2014-05-29 12:24:51\tTest Sample 3\tTrue\tTrue\tNotIdentified" "\t3\t4.8\t4.41\tlocation1\treceived\ttype1\tValue for sample 3\n") EXP_SAMPLE_TEMPLATE_LAT_ALL_INT = ( "sample_name\tcollection_timestamp\tdescription\thas_extracted_data\t" "has_physical_specimen\thost_subject_id\tint_column\tlatitude\tlongitude\t" 
"physical_location\trequired_sample_info_status\tsample_type\tstr_column\n" - "2.Sample1\t05/29/2014 12:24:51\tTest Sample 1\tTrue\tTrue\tNotIdentified" + "2.Sample1\t2014-05-29 12:24:51\tTest Sample 1\tTrue\tTrue\tNotIdentified" "\t1\t42\t41.41\tlocation1\treceived\ttype1\tValue for sample 1\n" - "2.Sample2\t05/29/2014 12:24:51\tTest Sample 2\tTrue\tTrue\tNotIdentified" + "2.Sample2\t2014-05-29 12:24:51\tTest Sample 2\tTrue\tTrue\tNotIdentified" "\t2\t4\t1.1\tlocation1\treceived\ttype1\tValue for sample 2\n" - "2.Sample3\t05/29/2014 12:24:51\tTest Sample 3\tTrue\tTrue\tNotIdentified" + "2.Sample3\t2014-05-29 12:24:51\tTest Sample 3\tTrue\tTrue\tNotIdentified" "\t3\t4\t4.41\tlocation1\treceived\ttype1\tValue for sample 3\n") EXP_SAMPLE_TEMPLATE_LAT_MIXED_FLOAT_INT = ( "sample_name\tcollection_timestamp\tdescription\thas_extracted_data\t" "has_physical_specimen\thost_subject_id\tint_column\tlatitude\tlongitude\t" "physical_location\trequired_sample_info_status\tsample_type\tstr_column\n" - "2.Sample1\t05/29/2014 12:24:51\tTest Sample 1\tTrue\tTrue\tNotIdentified" + "2.Sample1\t2014-05-29 12:24:51\tTest Sample 1\tTrue\tTrue\tNotIdentified" "\t1\t42\t41.41\tlocation1\treceived\ttype1\tValue for sample 1\n" - "2.Sample2\t05/29/2014 12:24:51\tTest Sample 2\tTrue\tTrue\tNotIdentified" + "2.Sample2\t2014-05-29 12:24:51\tTest Sample 2\tTrue\tTrue\tNotIdentified" "\t2\t4\t1.1\tlocation1\treceived\ttype1\tValue for sample 2\n" - "2.Sample3\t05/29/2014 12:24:51\tTest Sample 3\tTrue\tTrue\tNotIdentified" + "2.Sample3\t2014-05-29 12:24:51\tTest Sample 3\tTrue\tTrue\tNotIdentified" "\t3\t4.8\t4.41\tlocation1\treceived\ttype1\tValue for sample 3\n") EXP_SAMPLE_TEMPLATE_DUPE_COLS = ( @@ -344,13 +344,13 @@ def test_get_pgsql_reserved_words(self): "has_physical_specimen\thost_subject_id\tlatitude\tlongitude\t" "physical_location\trequired_sample_info_status\tsample_type\t" "str_column\tstr_column\n" - "2.Sample1\t05/29/2014 12:24:51\tTest Sample 1\tTrue\tTrue\t" + 
"2.Sample1\t2014-05-29 12:24:51\tTest Sample 1\tTrue\tTrue\t" "NotIdentified\t42.42\t41.41\tlocation1\treceived\ttype1\t" "Value for sample 1\tValue for sample 1\n" - "2.Sample2\t05/29/2014 12:24:51\t" + "2.Sample2\t2014-05-29 12:24:51\t" "Test Sample 2\tTrue\tTrue\tNotIdentified\t4.2\t1.1\tlocation1\treceived\t" "type1\tValue for sample 2\tValue for sample 2\n" - "2.Sample3\t05/29/2014 12:24:51\tTest Sample 3\tTrue\t" + "2.Sample3\t2014-05-29 12:24:51\tTest Sample 3\tTrue\t" "True\tNotIdentified\t4.8\t4.41\tlocation1\treceived\ttype1\t" "Value for sample 3\tValue for sample 3\n") @@ -359,14 +359,14 @@ def test_get_pgsql_reserved_words(self): "has_physical_specimen\thost_subject_id\tint_column\tlatitude\tlongitude\t" "physical_location\trequired_sample_info_status\tsample_type\t" "str_column\n" - "2.Sample1 \t05/29/2014 12:24:51\tTest Sample 1\t" + "2.Sample1 \t2014-05-29 12:24:51\tTest Sample 1\t" '"True\t"\t"\nTrue"\t' "NotIdentified\t1\t42.42\t41.41\tlocation1\treceived\ttype1\t" "Value for sample 1\n" - "2.Sample2 \t05/29/2014 12:24:51\t" + "2.Sample2 \t2014-05-29 12:24:51\t" "Test Sample 2\tTrue\tTrue\tNotIdentified\t2\t4.2\t1.1\tlocation1\t" "received\ttype1\tValue for sample 2\n" - "2.Sample3\t05/29/2014 12:24:51\tTest Sample 3\tTrue\t" + "2.Sample3\t2014-05-29 12:24:51\tTest Sample 3\tTrue\t" "True\tNotIdentified\t3\t4.8\t4.41\tlocation1\treceived\ttype1\t" "Value for sample 3\n") @@ -375,13 +375,13 @@ def test_get_pgsql_reserved_words(self): "has_physical_specimen\thost_subject_id\tint_column\tlatitude\tlongitude\t" "physical_location\trequired_sample_info_status\tsample_type\t" "str_column\n" - "2.Sample1\t05/29/2014 12:24:51\tTest Sample 1\tTrue\tTrue\t" + "2.Sample1\t2014-05-29 12:24:51\tTest Sample 1\tTrue\tTrue\t" "NotIdentified\t1\t42.42\t41.41\tlocation1\treceived\ttype1\t" "Value for sample 1\n" - "2.Sample2\t 05/29/2014 12:24:51 \t" + "2.Sample2\t 2014-05-29 12:24:51 \t" "Test Sample 2\tTrue\tTrue\tNotIdentified\t2\t4.2\t1.1\tlocation1\t" 
"received\ttype1\t Value for sample 2\n" - "2.Sample3\t05/29/2014 12:24:51\t Test Sample 3 \tTrue\t" + "2.Sample3\t2014-05-29 12:24:51\t Test Sample 3 \tTrue\t" "True\tNotIdentified\t3\t4.8\t4.41\tlocation1\treceived\ttype1\t" "Value for sample 3\n") @@ -390,13 +390,13 @@ def test_get_pgsql_reserved_words(self): "has_physical_specimen\thost_subject_id\tint_column\tlatitude\tlongitude\t" "physical_location\trequired_sample_info_status\tsample_type\t" "str_column\n" - "2.Sample1 \t05/29/2014 12:24:51\tTest Sample 1\tTrue\tTrue\t" + "2.Sample1 \t2014-05-29 12:24:51\tTest Sample 1\tTrue\tTrue\t" "NotIdentified\t1\t42.42\t41.41\tlocation1\treceived\ttype1\t" "Value for sample 1\n" - "2.Sample2 \t05/29/2014 12:24:51\t" + "2.Sample2 \t2014-05-29 12:24:51\t" "Test Sample 2\tTrue\tTrue\tNotIdentified\t2\t4.2\t1.1\tlocation1\t" "received\ttype1\tValue for sample 2\n" - "2.Sample3\t05/29/2014 12:24:51\tTest Sample 3\tTrue\t" + "2.Sample3\t2014-05-29 12:24:51\tTest Sample 3\tTrue\t" "True\tNotIdentified\t3\t4.8\t4.41\tlocation1\treceived\ttype1\t" "Value for sample 3\n" "\t\t\t\t\t\t\t\t\t\t\t\t\n" @@ -407,13 +407,13 @@ def test_get_pgsql_reserved_words(self): "has_physical_specimen\thost_subject_id\tint_column\tlatitude\tlongitude\t" "physical_location\trequired_sample_info_status\tsample_type\t" "str_column\t\n" - "2.Sample1 \t05/29/2014 12:24:51\tTest Sample 1\tTrue\tTrue\t" + "2.Sample1 \t2014-05-29 12:24:51\tTest Sample 1\tTrue\tTrue\t" "NotIdentified\t1\t42.42\t41.41\tlocation1\treceived\ttype1\t" "Value for sample 1\t\n" - "2.Sample2 \t05/29/2014 12:24:51\t" + "2.Sample2 \t2014-05-29 12:24:51\t" "Test Sample 2\tTrue\tTrue\tNotIdentified\t2\t4.2\t1.1\tlocation1\t" "received\ttype1\tValue for sample 2\t\n" - "2.Sample3\t05/29/2014 12:24:51\tTest Sample 3\tTrue\t" + "2.Sample3\t2014-05-29 12:24:51\tTest Sample 3\tTrue\t" "True\tNotIdentified\t3\t4.8\t4.41\tlocation1\treceived\ttype1\t" "Value for sample 3\t\n") @@ -422,13 +422,13 @@ def test_get_pgsql_reserved_words(self): 
"has_physical_specimen\thost_subject_id\tlatitude\tlongitude\t" "physical_location\trequired_sample_info_status\tsample_type\t" "str_column\n" - "002.000\t05/29/2014 12:24:51\tTest Sample 1\tTrue\tTrue\t" + "002.000\t2014-05-29 12:24:51\tTest Sample 1\tTrue\tTrue\t" "NotIdentified\t42.42\t41.41\tlocation1\treceived\ttype1\t" "Value for sample 1\n" - "1.11111\t05/29/2014 12:24:51\t" + "1.11111\t2014-05-29 12:24:51\t" "Test Sample 2\tTrue\tTrue\tNotIdentified\t4.2\t1.1\tlocation1\treceived\t" "type1\tValue for sample 2\n" - "0.12121\t05/29/2014 12:24:51\tTest Sample 3\tTrue\t" + "0.12121\t2014-05-29 12:24:51\tTest Sample 3\tTrue\t" "True\tNotIdentified\t4.8\t4.41\tlocation1\treceived\ttype1\t" "Value for sample 3\n") @@ -437,16 +437,16 @@ def test_get_pgsql_reserved_words(self): "has_physical_specimen\thost_subject_id\tint_column\tlatitude\tlongitude\t" "physical_location\trequired_sample_info_status\tsample_type\t" "str_column\n" - "2.Sample1\t05/29/2014 12:24:51\tTest Sample 1\tTrue\tTrue\t" + "2.Sample1\t2014-05-29 12:24:51\tTest Sample 1\tTrue\tTrue\t" "NotIdentified\t1\t42.42\t41.41\tlocation1\treceived\ttype1\t" "Value for sample 1\n" - "2.Sample2\t05/29/2014 12:24:51\t" + "2.Sample2\t2014-05-29 12:24:51\t" "Test Sample 2\tTrue\tTrue\tNotIdentified\t2\t4.2\t1.1\tlocation1\t" "received\ttype1\tValue for sample 2\n" - "2.Sample3\t05/29/2014 12:24:51\tTest Sample 3\tTrue\t" + "2.Sample3\t2014-05-29 12:24:51\tTest Sample 3\tTrue\t" "True\tNotIdentified\t3\t4.8\t4.41\tlocation1\treceived\ttype1\t" "Value for sample 3\n" - "\t05/29/2014 12:24:51\tTest Sample 3\tTrue\t" + "\t2014-05-29 12:24:51\tTest Sample 3\tTrue\t" "True\tNotIdentified\t4.8\t4.41\tlocation1\treceived\ttype1\t" "Value for sample 3\n" "\t\t\t\t\t\t\t\t\t\t\t\n" @@ -457,13 +457,13 @@ def test_get_pgsql_reserved_words(self): "has_physical_specimen\thost_subject_id\tint_column\tlatitude\tlongitude\t" "physical_location\trequired_sample_info_status\tsample_type\t" "str_column\n" - "2.Sample1\t05/29/2014 
12:24:51\tTest Sample 1\tTrue\tTrue\t" + "2.Sample1\t2014-05-29 12:24:51\tTest Sample 1\tTrue\tTrue\t" "NotIdentified\t1\t42.42\t41.41\tlocation1\treceived\ttype1\t" "Value for sample 1\n" - "2.Sample2\t05/29/2014 12:24:51\t" + "2.Sample2\t2014-05-29 12:24:51\t" "Test Sample 2\tTrue\tTrue\tNotIdentified\t2\t4.2\t1.1\tlocation1\t" "received\ttype1\tValue for sample 2\n" - "2.Sample3\t05/29/2014 12:24:51\tTest Sample 3\tTrue\t" + "2.Sample3\t2014-05-29 12:24:51\tTest Sample 3\tTrue\t" "True\tNotIdentified\t3\t4.8\t4.41\tlocation1\treceived\ttype1\t" "Value for sample 3\n" "\t\t\t\t\t \t\t\t\t\t \t\t\n" @@ -474,13 +474,13 @@ def test_get_pgsql_reserved_words(self): "has_physical_specimen\thost_subject_id\tlatitude\tlongitude\t" "physical_location\trequired_sample_info_status\tsample_type\t" "str_column\n" - "2.Sample1\t05/29/2014 12:24:51\tTest Sample 1\tTrue\tTrue\t" + "2.Sample1\t2014-05-29 12:24:51\tTest Sample 1\tTrue\tTrue\t" "NotIdentified\t42.42\t41.41\tlocation1\treceived\ttype1\t" "\n" - "2.Sample2\t05/29/2014 12:24:51\t" + "2.Sample2\t2014-05-29 12:24:51\t" "Test Sample 2\tTrue\tTrue\tNotIdentified\t4.2\t1.1\tlocation1\treceived\t" "type1\t\n" - "2.Sample3\t05/29/2014 12:24:51\tTest Sample 3\tTrue\t" + "2.Sample3\t2014-05-29 12:24:51\tTest Sample 3\tTrue\t" "True\tNotIdentified\t4.8\t4.41\tlocation1\treceived\ttype1\t" "\n") @@ -489,13 +489,13 @@ def test_get_pgsql_reserved_words(self): "has_physical_specimen\thost_subject_id\tlatitude\tlongitude\t" "physical_location\trequired_sample_info_status\tsample_type\t" "str_column\n" - "2.Sample1\t05/29/2014 12:24:51\tTest Sample 1\tTrue\tTrue\t" + "2.Sample1\t2014-05-29 12:24:51\tTest Sample 1\tTrue\tTrue\t" "NotIdentified\t42.42\t41.41\tlocation1\treceived\ttype1\t" "NA\n" - "2.Sample2\t05/29/2014 12:24:51\t" + "2.Sample2\t2014-05-29 12:24:51\t" "Test Sample 2\tTrue\tTrue\tNotIdentified\t4.2\t1.1\tlocation1\treceived\t" "type1\tNA\n" - "2.Sample3\t05/29/2014 12:24:51\tTest Sample 3\tTrue\t" + 
"2.Sample3\t2014-05-29 12:24:51\tTest Sample 3\tTrue\t" "True\tNotIdentified\t4.8\t4.41\tlocation1\treceived\ttype1\t" "NA\n") @@ -504,13 +504,13 @@ def test_get_pgsql_reserved_words(self): "has_physical_specimen\thost_subject_id\tlatitude\tlongitude\t" "physical_location\trequired_sample_info_status\tsample_type\t" "str_column\n" - "2.Sample1\t05/29/2014 12:24:51\tTest Sample 1\tTrue\tTrue\t" + "2.Sample1\t2014-05-29 12:24:51\tTest Sample 1\tTrue\tTrue\t" "NotIdentified\t42.42\t41.41\tlocation1\treceived\ttype1\t" "NA\n" - "2.Sample2\t05/29/2014 12:24:51\t" + "2.Sample2\t2014-05-29 12:24:51\t" "Test Sample 2\tTrue\tTrue\tNotIdentified\t4.2\t1.1\tlocation1\treceived\t" "type1\tNA\n" - "2.Sample3\t05/29/2014 12:24:51\tTest Sample 3\tTrue\t" + "2.Sample3\t2014-05-29 12:24:51\tTest Sample 3\tTrue\t" "True\tNotIdentified\t4.8\t4.41\tlocation1\treceived\ttype1\t" "NA\n") @@ -519,13 +519,13 @@ def test_get_pgsql_reserved_words(self): "has_physical_specimen\thost_subject_id\tlatitude\tlongitude\t" "physical_location\trequired_sample_info_status\tsample_type\t" "str_column\n" - "2.Sample1\t05/29/2014 12:24:51\tTest Sample 1\tTrue\tTrue\t" + "2.Sample1\t2014-05-29 12:24:51\tTest Sample 1\tTrue\tTrue\t" "1\t11.42\t41.41\tlocation1\treceived\ttype1\t" "Value for sample 1\n" - "2.Sample2\t05/29/2014 12:24:51\t" + "2.Sample2\t2014-05-29 12:24:51\t" "Test Sample 2\tTrue\tTrue\1\t4.2\tXXX\tlocation1\treceived\t" "type1\tValue for sample 2\n" - "2.Sample3\t05/29/2014 12:24:51\tTest Sample 3\tTrue\t" + "2.Sample3\t2014-05-29 12:24:51\tTest Sample 3\tTrue\t" "True\1\t4.8\t4.XXXXX41\tlocation1\treceived\ttype1\t" "Value for sample 3\n") @@ -555,9 +555,9 @@ def test_get_pgsql_reserved_words(self): } SAMPLE_TEMPLATE_DICT_FORM = { - 'collection_timestamp': {'2.Sample1': '05/29/2014 12:24:51', - '2.Sample2': '05/29/2014 12:24:51', - '2.Sample3': '05/29/2014 12:24:51'}, + 'collection_timestamp': {'2.Sample1': '2014-05-29 12:24:51', + '2.Sample2': '2014-05-29 12:24:51', + '2.Sample3': 
'2014-05-29 12:24:51'}, 'description': {'2.Sample1': 'Test Sample 1', '2.Sample2': 'Test Sample 2', '2.Sample3': 'Test Sample 3'}, @@ -594,9 +594,9 @@ def test_get_pgsql_reserved_words(self): } SAMPLE_TEMPLATE_LAT_ALL_INT_DICT = { - 'collection_timestamp': {'2.Sample1': '05/29/2014 12:24:51', - '2.Sample2': '05/29/2014 12:24:51', - '2.Sample3': '05/29/2014 12:24:51'}, + 'collection_timestamp': {'2.Sample1': '2014-05-29 12:24:51', + '2.Sample2': '2014-05-29 12:24:51', + '2.Sample3': '2014-05-29 12:24:51'}, 'description': {'2.Sample1': 'Test Sample 1', '2.Sample2': 'Test Sample 2', '2.Sample3': 'Test Sample 3'}, @@ -633,9 +633,9 @@ def test_get_pgsql_reserved_words(self): } SAMPLE_TEMPLATE_MIXED_FLOAT_INT_DICT = { - 'collection_timestamp': {'2.Sample1': '05/29/2014 12:24:51', - '2.Sample2': '05/29/2014 12:24:51', - '2.Sample3': '05/29/2014 12:24:51'}, + 'collection_timestamp': {'2.Sample1': '2014-05-29 12:24:51', + '2.Sample2': '2014-05-29 12:24:51', + '2.Sample3': '2014-05-29 12:24:51'}, 'description': {'2.Sample1': 'Test Sample 1', '2.Sample2': 'Test Sample 2', '2.Sample3': 'Test Sample 3'}, @@ -672,9 +672,9 @@ def test_get_pgsql_reserved_words(self): } SAMPLE_TEMPLATE_NUMBER_SAMPLE_NAMES_DICT_FORM = { - 'collection_timestamp': {'002.000': '05/29/2014 12:24:51', - '1.11111': '05/29/2014 12:24:51', - '0.12121': '05/29/2014 12:24:51'}, + 'collection_timestamp': {'002.000': '2014-05-29 12:24:51', + '1.11111': '2014-05-29 12:24:51', + '0.12121': '2014-05-29 12:24:51'}, 'description': {'002.000': 'Test Sample 1', '1.11111': 'Test Sample 2', '0.12121': 'Test Sample 3'}, @@ -707,9 +707,9 @@ def test_get_pgsql_reserved_words(self): '0.12121': 'Value for sample 3'}} ST_EMPTY_COLUMN_DICT_FORM = \ - {'collection_timestamp': {'2.Sample1': '05/29/2014 12:24:51', - '2.Sample2': '05/29/2014 12:24:51', - '2.Sample3': '05/29/2014 12:24:51'}, + {'collection_timestamp': {'2.Sample1': '2014-05-29 12:24:51', + '2.Sample2': '2014-05-29 12:24:51', + '2.Sample3': '2014-05-29 12:24:51'}, 
'description': {'2.Sample1': 'Test Sample 1', '2.Sample2': 'Test Sample 2', '2.Sample3': 'Test Sample 3'}, @@ -739,9 +739,9 @@ def test_get_pgsql_reserved_words(self): '2.Sample3': 'type1'}} ST_COLUMN_WITH_NAS_DICT_FORM = \ - {'collection_timestamp': {'2.Sample1': '05/29/2014 12:24:51', - '2.Sample2': '05/29/2014 12:24:51', - '2.Sample3': '05/29/2014 12:24:51'}, + {'collection_timestamp': {'2.Sample1': '2014-05-29 12:24:51', + '2.Sample2': '2014-05-29 12:24:51', + '2.Sample3': '2014-05-29 12:24:51'}, 'description': {'2.Sample1': 'Test Sample 1', '2.Sample2': 'Test Sample 2', '2.Sample3': 'Test Sample 3'}, diff --git a/qiita_db/sql_connection.py b/qiita_db/sql_connection.py index 1223c5b88..1c1588ddf 100644 --- a/qiita_db/sql_connection.py +++ b/qiita_db/sql_connection.py @@ -934,5 +934,6 @@ def add_post_rollback_func(self, func, *args, **kwargs): """ self._post_rollback_funcs.append((func, args, kwargs)) + # Singleton pattern, create the transaction for the entire system TRN = Transaction() diff --git a/qiita_db/support_files/patches/51.sql b/qiita_db/support_files/patches/51.sql index a484d5c24..8ad54977e 100644 --- a/qiita_db/support_files/patches/51.sql +++ b/qiita_db/support_files/patches/51.sql @@ -1,115 +1,5 @@ --- Jan 5, 2017 --- Move the analysis to the plugin system. This is a major rewrite of the --- database backend that supports the analysis pipeline. --- After exploring the data on the database, we realized that --- there are a lot of inconsistencies in the data. Unfortunately, this --- makes the process of transferring the data from the old structure --- to the new one a bit more challenging, as we will need to handle --- different special cases. Furthermore, all the information needed is not --- present in the database, since it requires checking BIOM files. 
Due to these --- reason, the vast majority of the data transfer is done in the python patch --- 51.py +-- Feb 9, 2017 +-- changing format of stored timestamps +-- see python patch --- In this file we are just creating the new data structures. The old --- datastructure will be dropped in the python patch once all data has been --- transferred. - --- Create the new data structures - --- Table that links the analysis with the initial set of artifacts -CREATE TABLE qiita.analysis_artifact ( - analysis_id bigint NOT NULL, - artifact_id bigint NOT NULL, - CONSTRAINT idx_analysis_artifact_0 PRIMARY KEY (analysis_id, artifact_id) -); -CREATE INDEX idx_analysis_artifact_analysis ON qiita.analysis_artifact (analysis_id); -CREATE INDEX idx_analysis_artifact_artifact ON qiita.analysis_artifact (artifact_id); -ALTER TABLE qiita.analysis_artifact ADD CONSTRAINT fk_analysis_artifact_analysis FOREIGN KEY ( analysis_id ) REFERENCES qiita.analysis( analysis_id ); -ALTER TABLE qiita.analysis_artifact ADD CONSTRAINT fk_analysis_artifact_artifact FOREIGN KEY ( artifact_id ) REFERENCES qiita.artifact( artifact_id ); - --- Droping the analysis status column cause now it depends on the artifacts --- status, like the study does. 
-ALTER TABLE qiita.analysis DROP COLUMN analysis_status_id; - --- Create a table to link the analysis with the jobs that create the initial --- artifacts -CREATE TABLE qiita.analysis_processing_job ( - analysis_id bigint NOT NULL, - processing_job_id uuid NOT NULL, - CONSTRAINT idx_analysis_processing_job PRIMARY KEY ( analysis_id, processing_job_id ) - ) ; - -CREATE INDEX idx_analysis_processing_job_analysis ON qiita.analysis_processing_job ( analysis_id ) ; -CREATE INDEX idx_analysis_processing_job_pj ON qiita.analysis_processing_job ( processing_job_id ) ; -ALTER TABLE qiita.analysis_processing_job ADD CONSTRAINT fk_analysis_processing_job FOREIGN KEY ( analysis_id ) REFERENCES qiita.analysis( analysis_id ) ; -ALTER TABLE qiita.analysis_processing_job ADD CONSTRAINT fk_analysis_processing_job_pj FOREIGN KEY ( processing_job_id ) REFERENCES qiita.processing_job( processing_job_id ) ; - --- Add a logging column in the analysis -ALTER TABLE qiita.analysis ADD logging_id bigint ; -CREATE INDEX idx_analysis_0 ON qiita.analysis ( logging_id ) ; -ALTER TABLE qiita.analysis ADD CONSTRAINT fk_analysis_logging FOREIGN KEY ( logging_id ) REFERENCES qiita.logging( logging_id ) ; - --- We can handle some of the special cases here, so we simplify the work in the --- python patch - --- Special case 1: there are jobs in the database that do not contain --- any information about the options used to process those parameters. --- However, these jobs do not have any results and all are marked either --- as queued or error, although no error log has been saved. Since these --- jobs are mainly useleess, we are going to remove them from the system -DELETE FROM qiita.analysis_job - WHERE job_id IN (SELECT job_id FROM qiita.job WHERE options = '{}'); -DELETE FROM qiita.job WHERE options = '{}'; - --- Special case 2: there are a fair amount of jobs (719 last time I --- checked) that are not attached to any analysis. 
Not sure how this --- can happen, but these orphan jobs can't be accessed from anywhere --- in the interface. Remove them from the system. Note that we are --- unlinking the files but we are not removing them from the filepath --- table. We will do that on the patch 47.py using the --- purge_filepaths function, as it will make sure that those files are --- not used anywhere else -DELETE FROM qiita.job_results_filepath WHERE job_id IN ( - SELECT job_id FROM qiita.job J WHERE NOT EXISTS ( - SELECT * FROM qiita.analysis_job AJ WHERE J.job_id = AJ.job_id)); -DELETE FROM qiita.job J WHERE NOT EXISTS ( - SELECT * FROM qiita.analysis_job AJ WHERE J.job_id = AJ.job_id); - --- In the analysis pipeline, an artifact can have mutliple datatypes --- (e.g. procrustes). Allow this by creating a new data_type being "multiomic" -INSERT INTO qiita.data_type (data_type) VALUES ('Multiomic'); - - --- The valdiate command from BIOM will have an extra parameter, analysis --- Magic number -> 4 BIOM command_id -> known for sure since it was added in --- patch 36.sql -INSERT INTO qiita.command_parameter (command_id, parameter_name, parameter_type, required) - VALUES (4, 'analysis', 'analysis', FALSE); --- The template comand now becomes optional, since it can be added either to --- an analysis or to a prep template. command_parameter_id known from patch --- 36.sql -UPDATE qiita.command_parameter SET required = FALSE WHERE command_parameter_id = 34; - --- We are going to add a new special software type, and a new software. --- This is going to be used internally by Qiita, so submit the private jobs. --- This is needed for the analysis. 
-INSERT INTO qiita.software_type (software_type, description) - VALUES ('private', 'Internal Qiita jobs'); - -DO $do$ -DECLARE - qiita_sw_id bigint; - baf_cmd_id bigint; -BEGIN - INSERT INTO qiita.software (name, version, description, environment_script, start_script, software_type_id, active) - VALUES ('Qiita', 'alpha', 'Internal Qiita jobs', 'source activate qiita', 'qiita-private-2', 3, True) - RETURNING software_id INTO qiita_sw_id; - - INSERT INTO qiita.software_command (software_id, name, description) - VALUES (qiita_sw_id, 'build_analysis_files', 'Builds the files needed for the analysis') - RETURNING command_id INTO baf_cmd_id; - - INSERT INTO qiita.command_parameter (command_id, parameter_name, parameter_type, required, default_value) - VALUES (baf_cmd_id, 'analysis', 'analysis', True, NULL), - (baf_cmd_id, 'merge_dup_sample_ids', 'bool', False, 'False'); -END $do$ +SELECT 1; diff --git a/qiita_db/support_files/patches/52.sql b/qiita_db/support_files/patches/52.sql new file mode 100644 index 000000000..a484d5c24 --- /dev/null +++ b/qiita_db/support_files/patches/52.sql @@ -0,0 +1,115 @@ +-- Jan 5, 2017 +-- Move the analysis to the plugin system. This is a major rewrite of the +-- database backend that supports the analysis pipeline. +-- After exploring the data on the database, we realized that +-- there are a lot of inconsistencies in the data. Unfortunately, this +-- makes the process of transferring the data from the old structure +-- to the new one a bit more challenging, as we will need to handle +-- different special cases. Furthermore, all the information needed is not +-- present in the database, since it requires checking BIOM files. Due to these +-- reason, the vast majority of the data transfer is done in the python patch +-- 51.py + +-- In this file we are just creating the new data structures. The old +-- datastructure will be dropped in the python patch once all data has been +-- transferred. 
+ +-- Create the new data structures + +-- Table that links the analysis with the initial set of artifacts +CREATE TABLE qiita.analysis_artifact ( + analysis_id bigint NOT NULL, + artifact_id bigint NOT NULL, + CONSTRAINT idx_analysis_artifact_0 PRIMARY KEY (analysis_id, artifact_id) +); +CREATE INDEX idx_analysis_artifact_analysis ON qiita.analysis_artifact (analysis_id); +CREATE INDEX idx_analysis_artifact_artifact ON qiita.analysis_artifact (artifact_id); +ALTER TABLE qiita.analysis_artifact ADD CONSTRAINT fk_analysis_artifact_analysis FOREIGN KEY ( analysis_id ) REFERENCES qiita.analysis( analysis_id ); +ALTER TABLE qiita.analysis_artifact ADD CONSTRAINT fk_analysis_artifact_artifact FOREIGN KEY ( artifact_id ) REFERENCES qiita.artifact( artifact_id ); + +-- Droping the analysis status column cause now it depends on the artifacts +-- status, like the study does. +ALTER TABLE qiita.analysis DROP COLUMN analysis_status_id; + +-- Create a table to link the analysis with the jobs that create the initial +-- artifacts +CREATE TABLE qiita.analysis_processing_job ( + analysis_id bigint NOT NULL, + processing_job_id uuid NOT NULL, + CONSTRAINT idx_analysis_processing_job PRIMARY KEY ( analysis_id, processing_job_id ) + ) ; + +CREATE INDEX idx_analysis_processing_job_analysis ON qiita.analysis_processing_job ( analysis_id ) ; +CREATE INDEX idx_analysis_processing_job_pj ON qiita.analysis_processing_job ( processing_job_id ) ; +ALTER TABLE qiita.analysis_processing_job ADD CONSTRAINT fk_analysis_processing_job FOREIGN KEY ( analysis_id ) REFERENCES qiita.analysis( analysis_id ) ; +ALTER TABLE qiita.analysis_processing_job ADD CONSTRAINT fk_analysis_processing_job_pj FOREIGN KEY ( processing_job_id ) REFERENCES qiita.processing_job( processing_job_id ) ; + +-- Add a logging column in the analysis +ALTER TABLE qiita.analysis ADD logging_id bigint ; +CREATE INDEX idx_analysis_0 ON qiita.analysis ( logging_id ) ; +ALTER TABLE qiita.analysis ADD CONSTRAINT fk_analysis_logging 
FOREIGN KEY ( logging_id ) REFERENCES qiita.logging( logging_id ) ; + +-- We can handle some of the special cases here, so we simplify the work in the +-- python patch + +-- Special case 1: there are jobs in the database that do not contain +-- any information about the options used to process those parameters. +-- However, these jobs do not have any results and all are marked either +-- as queued or error, although no error log has been saved. Since these +-- jobs are mainly useleess, we are going to remove them from the system +DELETE FROM qiita.analysis_job + WHERE job_id IN (SELECT job_id FROM qiita.job WHERE options = '{}'); +DELETE FROM qiita.job WHERE options = '{}'; + +-- Special case 2: there are a fair amount of jobs (719 last time I +-- checked) that are not attached to any analysis. Not sure how this +-- can happen, but these orphan jobs can't be accessed from anywhere +-- in the interface. Remove them from the system. Note that we are +-- unlinking the files but we are not removing them from the filepath +-- table. We will do that on the patch 47.py using the +-- purge_filepaths function, as it will make sure that those files are +-- not used anywhere else +DELETE FROM qiita.job_results_filepath WHERE job_id IN ( + SELECT job_id FROM qiita.job J WHERE NOT EXISTS ( + SELECT * FROM qiita.analysis_job AJ WHERE J.job_id = AJ.job_id)); +DELETE FROM qiita.job J WHERE NOT EXISTS ( + SELECT * FROM qiita.analysis_job AJ WHERE J.job_id = AJ.job_id); + +-- In the analysis pipeline, an artifact can have mutliple datatypes +-- (e.g. procrustes). 
Allow this by creating a new data_type being "multiomic" +INSERT INTO qiita.data_type (data_type) VALUES ('Multiomic'); + + +-- The valdiate command from BIOM will have an extra parameter, analysis +-- Magic number -> 4 BIOM command_id -> known for sure since it was added in +-- patch 36.sql +INSERT INTO qiita.command_parameter (command_id, parameter_name, parameter_type, required) + VALUES (4, 'analysis', 'analysis', FALSE); +-- The template comand now becomes optional, since it can be added either to +-- an analysis or to a prep template. command_parameter_id known from patch +-- 36.sql +UPDATE qiita.command_parameter SET required = FALSE WHERE command_parameter_id = 34; + +-- We are going to add a new special software type, and a new software. +-- This is going to be used internally by Qiita, so submit the private jobs. +-- This is needed for the analysis. +INSERT INTO qiita.software_type (software_type, description) + VALUES ('private', 'Internal Qiita jobs'); + +DO $do$ +DECLARE + qiita_sw_id bigint; + baf_cmd_id bigint; +BEGIN + INSERT INTO qiita.software (name, version, description, environment_script, start_script, software_type_id, active) + VALUES ('Qiita', 'alpha', 'Internal Qiita jobs', 'source activate qiita', 'qiita-private-2', 3, True) + RETURNING software_id INTO qiita_sw_id; + + INSERT INTO qiita.software_command (software_id, name, description) + VALUES (qiita_sw_id, 'build_analysis_files', 'Builds the files needed for the analysis') + RETURNING command_id INTO baf_cmd_id; + + INSERT INTO qiita.command_parameter (command_id, parameter_name, parameter_type, required, default_value) + VALUES (baf_cmd_id, 'analysis', 'analysis', True, NULL), + (baf_cmd_id, 'merge_dup_sample_ids', 'bool', False, 'False'); +END $do$ diff --git a/qiita_db/support_files/patches/python_patches/51.py b/qiita_db/support_files/patches/python_patches/51.py index 9d462d5c9..a8d3eeeb0 100644 --- a/qiita_db/support_files/patches/python_patches/51.py +++ 
b/qiita_db/support_files/patches/python_patches/51.py @@ -1,749 +1,110 @@ -# The code is commented with details on the changes implemented here, -# but here is an overview of the changes needed to transfer the analysis -# data to the plugins structure: -# 1) Create a new type plugin to define the diversity types -# 2) Create the new commands on the existing QIIME plugin to execute the -# existing analyses (beta div, taxa summaries and alpha rarefaction) -# 3) Transfer all the data in the old structures to the plugin structures -# 4) Delete old structures - -from string import ascii_letters, digits -from random import SystemRandom -from os.path import join, exists, basename -from os import makedirs -from json import loads - -from biom import load_table, Table -from biom.util import biom_open - +from future.utils import viewitems +from datetime import datetime + +from qiita_db.metadata_template.constants import ( + SAMPLE_TEMPLATE_COLUMNS, PREP_TEMPLATE_COLUMNS, + PREP_TEMPLATE_COLUMNS_TARGET_GENE) +from qiita_db.metadata_template.prep_template import PrepTemplate +from qiita_db.metadata_template.sample_template import SampleTemplate from qiita_db.sql_connection import TRN -from qiita_db.util import (get_db_files_base_dir, purge_filepaths, - get_mountpoint, compute_checksum) -from qiita_db.artifact import Artifact - -# Create some aux functions that are going to make the code more modular -# and easier to understand, since there is a fair amount of work to do to -# trasnfer the data from the old structure to the new one - - -def get_random_string(length): - """Creates a random string of the given length with alphanumeric chars - - Parameters - ---------- - length : int - The desired length of the string - - Returns - ------- - str - The new random string - """ - sr = SystemRandom() - chars = ascii_letters + digits - return ''.join(sr.choice(chars) for i in range(length)) - -def create_non_rarefied_biom_artifact(analysis, biom_data, rarefied_table): - """Creates the 
initial non-rarefied BIOM artifact of the analysis - - Parameters - ---------- - analysis : dict - Dictionary with the analysis information - biom_data : dict - Dictionary with the biom file information - rarefied_table : biom.Table - The rarefied BIOM table - - Returns - ------- - int - The id of the new artifact - """ - # The non rarefied biom artifact is the initial biom table of the analysis. - # This table does not currently exist anywhere, so we need to actually - # create the BIOM file. To create this BIOM file we need: (1) the samples - # and artifacts they come from and (2) whether the samples where - # renamed or not. (1) is on the database, but we need to inferr (2) from - # the existing rarefied BIOM table. Fun, fun... +# getting columns in each info file that we need to check for +cols_sample = [col + for key, vals in viewitems(SAMPLE_TEMPLATE_COLUMNS) + for col, dt in viewitems(vals.columns) if dt == datetime] +cols_prep = [col + for key, vals in viewitems(PREP_TEMPLATE_COLUMNS) + for col, dt in viewitems(vals.columns) if dt == datetime].extend( + [col + for key, vals in viewitems(PREP_TEMPLATE_COLUMNS_TARGET_GENE) + for col, dt in viewitems(vals.columns)]) + + +def transform_date(value): + # for the way the patches are applied we need to have this import and + # the next 2 variables within this function + from datetime import datetime + + # old format : new format + formats = { + # 4 digits year + '%m/%d/%Y %H:%M:%S': '%Y-%m-%d %H:%M:%S', + '%m-%d-%Y %H:%M': '%Y-%m-%d %H:%M', + '%m/%d/%Y %H': '%Y-%m-%d %H', + '%m-%d-%Y': '%Y-%m-%d', + '%m-%Y': '%Y-%m', + '%Y': '%Y', + # 2 digits year + '%m/%d/%y %H:%M:%S': '%Y-%m-%d %H:%M:%S', + '%m-%d-%y %H:%M': '%Y-%m-%d %H:%M', + '%m/%d/%y %H': '%Y-%m-%d %H', + '%m-%d-%y': '%Y-%m-%d', + '%m-%y': '%Y-%m', + '%y': '%Y' + } + + # loop over the old formats to see which one is it + if value is not None: + date = None + for i, fmt in enumerate(formats): + try: + date = datetime.strptime(value, fmt) + break + except 
ValueError: + pass + if date is not None: + value = date.strftime(formats[fmt]) + + return value + + +if cols_sample: with TRN: - # Get the samples included in the BIOM table grouped by artifact id - # Note that the analysis contains a BIOM table per data type included - # in it, and the table analysis_sample does not differentiate between - # datatypes, so we need to check the data type in the artifact table - sql = """SELECT artifact_id, array_agg(sample_id) - FROM qiita.analysis_sample - JOIN qiita.artifact USING (artifact_id) - WHERE analysis_id = %s AND data_type_id = %s - GROUP BY artifact_id""" - TRN.add(sql, [analysis['analysis_id'], biom_data['data_type_id']]) - samples_by_artifact = TRN.execute_fetchindex() - - # Create an empty BIOM table to be the new master table - new_table = Table([], [], []) - ids_map = {} - for a_id, samples in samples_by_artifact: - # Get the filepath of the BIOM table from the artifact - artifact = Artifact(a_id) - biom_fp = None - for _, fp, fp_type in artifact.filepaths: - if fp_type == 'biom': - biom_fp = fp - # Note that we are sure that the biom table exists for sure, so - # no need to check if biom_fp is undefined - biom_table = load_table(biom_fp) - biom_table.filter(samples, axis='sample', inplace=True) - new_table = new_table.merge(biom_table) - ids_map.update({sid: "%d.%s" % (a_id, sid) - for sid in biom_table.ids()}) - - # Check if we need to rename the sample ids in the biom table - new_table_ids = set(new_table.ids()) - if not new_table_ids.issuperset(rarefied_table.ids()): - # We need to rename the sample ids - new_table.update_ids(ids_map, 'sample', True, True) - - sql = """INSERT INTO qiita.artifact - (generated_timestamp, data_type_id, visibility_id, - artifact_type_id, submitted_to_vamps) - VALUES (%s, %s, %s, %s, %s) - RETURNING artifact_id""" - # Magic number 4 -> visibility sandbox - # Magix number 7 -> biom artifact type - TRN.add(sql, [analysis['timestamp'], biom_data['data_type_id'], - 4, 7, False]) - 
artifact_id = TRN.execute_fetchlast() - - # Associate the artifact with the analysis - sql = """INSERT INTO qiita.analysis_artifact - (analysis_id, artifact_id) - VALUES (%s, %s)""" - TRN.add(sql, [analysis['analysis_id'], artifact_id]) - # Link the artifact with its file - dd_id, mp = get_mountpoint('BIOM')[0] - dir_fp = join(get_db_files_base_dir(), mp, str(artifact_id)) - if not exists(dir_fp): - makedirs(dir_fp) - new_table_fp = join(dir_fp, "biom_table.biom") - with biom_open(new_table_fp, 'w') as f: - new_table.to_hdf5(f, "Generated by Qiita") - - sql = """INSERT INTO qiita.filepath - (filepath, filepath_type_id, checksum, - checksum_algorithm_id, data_directory_id) - VALUES (%s, %s, %s, %s, %s) - RETURNING filepath_id""" - # Magic number 7 -> filepath_type_id = 'biom' - # Magic number 1 -> the checksum algorithm id - TRN.add(sql, [basename(new_table_fp), 7, - compute_checksum(new_table_fp), 1, dd_id]) - fp_id = TRN.execute_fetchlast() - sql = """INSERT INTO qiita.artifact_filepath - (artifact_id, filepath_id) - VALUES (%s, %s)""" - TRN.add(sql, [artifact_id, fp_id]) - TRN.execute() - - return artifact_id - - -def create_rarefaction_job(depth, biom_artifact_id, analysis, srare_cmd_id): - """Create a new rarefaction job - - Parameters - ---------- - depth : int - The rarefaction depth - biom_artifact_id : int - The artifact id of the input rarefaction biom table - analysis : dict - Dictionary with the analysis information - srare_cmd_id : int - The command id of the single rarefaction command - - Returns - ------- - job_id : str - The job id - params : str - The job parameters - """ - # Add the row in the procesisng job table - params = ('{"depth":%d,"subsample_multinomial":false,"biom_table":%s}' - % (depth, biom_artifact_id)) + # a few notes: just getting the preps with duplicated values; ignoring + # column 'sample_id' and tables 'study_sample', 'prep_template', + # 'prep_template_sample' + sql = """SELECT table_name, array_agg(column_name::text) + FROM 
information_schema.columns + WHERE column_name IN %s + AND table_name LIKE 'sample_%%' + AND table_name NOT IN ( + 'prep_template', 'prep_template_sample') + GROUP BY table_name""" + # note that we are looking for those columns with duplicated names in + # the headers + TRN.add(sql, [tuple(set(cols_sample))]) + for table, columns in viewitems(dict(TRN.execute_fetchindex())): + # [1] the format is table_# so taking the # + st = SampleTemplate(int(table.split('_')[1])) + # getting just the columns of interest + st_df = st.to_dataframe()[columns] + # converting to datetime + for col in columns: + st_df[col] = st_df[col].apply(transform_date) + st.update(st_df) + +if cols_prep: with TRN: - # magic number 3: status -> success - sql = """INSERT INTO qiita.processing_job - (email, command_id, command_parameters, - processing_job_status_id) - VALUES (%s, %s, %s, %s) - RETURNING processing_job_id""" - TRN.add(sql, [analysis['email'], srare_cmd_id, params, 3]) - job_id = TRN.execute_fetchlast() - # Step 1.2.b: Link the job with the input artifact - sql = """INSERT INTO qiita.artifact_processing_job - (artifact_id, processing_job_id) - VALUES (%s, %s)""" - TRN.add(sql, [biom_artifact_id, job_id]) - TRN.execute() - return job_id, params - - -def transfer_file_to_artifact(analysis_id, a_timestamp, command_id, - data_type_id, params, artifact_type_id, - filepath_id): - """Creates a new artifact with the given filepath id - - Parameters - ---------- - analysis_id : int - The analysis id to attach the artifact - a_timestamp : datetime.datetime - The generated timestamp of the artifact - command_id : int - The command id of the artifact - data_type_id : int - The data type id of the artifact - params : str - The parameters of the artifact - artifact_type_id : int - The artifact type - filepath_id : int - The filepath id - - Returns - ------- - int - The artifact id - """ - with TRN: - # Add the row in the artifact table - # Magic number 4: Visibility -> sandbox - sql = """INSERT 
INTO qiita.artifact - (generated_timestamp, command_id, data_type_id, - command_parameters, visibility_id, artifact_type_id, - submitted_to_vamps) - VALUES (%s, %s, %s, %s, %s, %s, %s) - RETURNING artifact_id""" - TRN.add(sql, [a_timestamp, command_id, data_type_id, params, 4, - artifact_type_id, False]) - artifact_id = TRN.execute_fetchlast() - # Link the artifact with its file - sql = """INSERT INTO qiita.artifact_filepath (artifact_id, filepath_id) - VALUES (%s, %s)""" - TRN.add(sql, [artifact_id, filepath_id]) - # Link the artifact with the analysis - sql = """INSERT INTO qiita.analysis_artifact - (analysis_id, artifact_id) - VALUES (%s, %s)""" - TRN.add(sql, [analysis_id, artifact_id]) - - return artifact_id - - -def create_rarefied_biom_artifact(analysis, srare_cmd_id, biom_data, params, - parent_biom_artifact_id, rarefaction_job_id, - srare_cmd_out_id): - """Creates the rarefied biom artifact - - Parameters - ---------- - analysis : dict - The analysis information - srare_cmd_id : int - The command id of "Single Rarefaction" - biom_data : dict - The biom information - params : str - The processing parameters - parent_biom_artifact_id : int - The parent biom artifact id - rarefaction_job_id : str - The job id of the rarefaction job - srare_cmd_out_id : int - The id of the single rarefaction output - - Returns - ------- - int - The artifact id - """ - with TRN: - # Transfer the file to an artifact - # Magic number 7: artifact type -> biom - artifact_id = transfer_file_to_artifact( - analysis['analysis_id'], analysis['timestamp'], srare_cmd_id, - biom_data['data_type_id'], params, 7, biom_data['filepath_id']) - # Link the artifact with its parent - sql = """INSERT INTO qiita.parent_artifact (artifact_id, parent_id) - VALUES (%s, %s)""" - TRN.add(sql, [artifact_id, parent_biom_artifact_id]) - # Link the artifact as the job output - sql = """INSERT INTO qiita.artifact_output_processing_job - (artifact_id, processing_job_id, command_output_id) - VALUES (%s, %s, 
%s)""" - TRN.add(sql, [artifact_id, rarefaction_job_id, srare_cmd_out_id]) - return artifact_id - - -def transfer_job(analysis, command_id, params, input_artifact_id, job_data, - cmd_out_id, biom_data, output_artifact_type_id): - """Transfers the job from the old structure to the plugin structure - - Parameters - ---------- - analysis : dict - The analysis information - command_id : int - The id of the command executed - params : str - The parameters used in the job - input_artifact_id : int - The id of the input artifact - job_data : dict - The job information - cmd_out_id : int - The id of the command's output - biom_data : dict - The biom information - output_artifact_type_id : int - The type of the output artifact - """ - with TRN: - # Create the job - # Add the row in the processing job table - # Magic number 3: status -> success - sql = """INSERT INTO qiita.processing_job - (email, command_id, command_parameters, - processing_job_status_id) - VALUES (%s, %s, %s, %s) - RETURNING processing_job_id""" - TRN.add(sql, [analysis['email'], command_id, params, 3]) - job_id = TRN.execute_fetchlast() - - # Link the job with the input artifact - sql = """INSERT INTO qiita.artifact_processing_job - (artifact_id, processing_job_id) - VALUES (rarefied_biom_id, proc_job_id)""" - TRN.add(sql, [input_artifact_id, job_id]) - - # Check if the executed job has results and add them - sql = """SELECT EXISTS(SELECT * - FROM qiita.job_results_filepath - WHERE job_id = %s)""" - TRN.add(sql, [job_data['job_id']]) - if TRN.execute_fetchlast(): - # There are results for the current job. 
- # Transfer the job files to a new artifact - sql = """SELECT filepath_id - FROM qiita.job_results_filepath - WHERE job_id = %s""" - TRN.add(sql, job_data['job_id']) - filepath_id = TRN.execute_fetchlast() - artifact_id = transfer_file_to_artifact( - analysis['analysis_id'], analysis['timestamp'], command_id, - biom_data['data_type_id'], params, output_artifact_type_id, - filepath_id) - - # Link the artifact with its parent - sql = """INSERT INTO qiita.parent_artifact (artifact_id, parent_id) - VALUES (%s, %s)""" - TRN.add(sql, [artifact_id, input_artifact_id]) - # Link the artifact as the job output - sql = """INSERT INTO qiita.artifact_output_processing_job - (artifact_id, processing_job_id, command_output_id) - VALUES (%s, %s, %s)""" - TRN.add(sql, [artifact_id, job_id, cmd_out_id]) - TRN.exeucte() - else: - # There are no results on the current job, so mark it as - # error - if job_data.log_id is None: - # Magic number 2 - we are not using any other severity - # level, so keep using number 2 - sql = """INSERT INTO qiita.logging (time, severity_id, msg) - VALUES (%s, %s, %s) - RETURNING logging_id""" - TRN.add(sql, [analysis['timestamp'], 2, - "Unknown error - patch 47"]) - else: - log_id = job_data['log_id'] - - # Magic number 4 -> status -> error - sql = """UPDATE qiita.processing_job - SET processing_job_status_id = 4, logging_id = %s - WHERE processing_job_id = %s""" - TRN.add(sql, [log_id, job_id]) - - -# The new commands that we are going to add generate new artifact types. -# These new artifact types are going to be added to a different plugin. 
-# In interest of time and given that the artifact type system is going to -# change in the near future, we feel that the easiest way to transfer -# the current analyses results is by creating 3 different types of -# artifacts: (1) distance matrix -> which will include the distance matrix, -# the principal coordinates and the emperor plots; (2) rarefaction -# curves -> which will include all the files generated by alpha rarefaction -# and (3) taxonomy summary, which will include all the files generated -# by summarize_taxa_through_plots.py - -# Step 1: Create the new type -with TRN: - # Magic number 2 -> The "artifact definition" software type - sql = """INSERT INTO qiita.software - (name, version, description, environment_script, start_script, - software_type_id) - VALUES ('Diversity types', '0.1.0', - 'Diversity artifacts type plugin', - 'source activate qiita', 'start_diversity_types', 2) - RETURNING software_id""" - TRN.add(sql) - divtype_id = TRN.execute_fetchlast() - - # Step 2: Create the validate and HTML generator commands - sql = """INSERT INTO qiita.software_command (software_id, name, description) - VALUES (%s, %s, %s) - RETURNING command_id""" - TRN.add(sql, [divtype_id, 'Validate', - 'Validates a new artifact of the given diversity type']) - validate_cmd_id = TRN.execute_fetchlast() - TRN.add(sql, [divtype_id, 'Generate HTML summary', - 'Generates the HTML summary of a given diversity type']) - html_summary_cmd_id = TRN.execute_fetchlast() - - # Step 3: Add the parameters for the previous commands - sql = """INSERT INTO qiita.command_parameter - (command_id, parameter_name, parameter_type, required) - VALUES (%s, %s, %s, %s)""" - sql_args = [(validate_cmd_id, 'files', 'string', True), - (validate_cmd_id, 'artifact_type', 'string', True), - (validate_cmd_id, 'template', 'prep_template', False), - (validate_cmd_id, 'analysis', 'analysis', False), - (validate_cmd_id, 'provenance', 'string', False), - (html_summary_cmd_id, 'input_data', 'artifact', True)] 
- TRN.add(sql, sql_args, many=True) - - # Step 4: Add the new artifact types - sql = """INSERT INTO qiita.artifact_type ( - artifact_type, description, can_be_submitted_to_ebi, - can_be_submitted_to_vamps) - VALUES (%s, %s, %s, %s) - RETURNING artifact_type_id""" - TRN.add(sql, ['distance_matrix', 'Distance matrix holding pairwise ' - 'distance between samples', False, False]) - dm_atype_id = TRN.execute_fetchlast() - TRN.add(sql, ['rarefaction_curves', 'Rarefaction curves', False, False]) - rc_atype_id = TRN.execute_fetchlast() - TRN.add(sql, ['taxa_summary', 'Taxa summary plots', False, False]) - ts_atype_id = TRN.execute_fetchlast() - - # Step 5: Associate each artifact with the filetypes that it accepts - # At this time we are going to add them as directories, just as it is done - # right now. We can make it fancier with the new type system. - # Magic number 8: the filepath_type_id for the directory - sql = """INSERT INTO qiita.artifact_type_filepath_type - (artifact_type_id, filepath_type_id, required) - VALUES (%s, %s, %s)""" - sql_args = [[dm_atype_id, 8, True], - [rc_atype_id, 8, True], - [ts_atype_id, 8, True]] - TRN.add(sql, sql_args, many=True) - - # Step 6: Associate the plugin with the types that it defines - sql = """INSERT INTO qiita.software_artifact_type - (software_id, artifact_type_id) - VALUES (%s, %s)""" - sql_args = [[divtype_id, dm_atype_id], - [divtype_id, rc_atype_id], - [divtype_id, ts_atype_id]] - TRN.add(sql, sql_args, many=True) - - # Step 7: Create the new entries for the data directory - sql = """INSERT INTO qiita.data_directory - (data_type, mountpoint, subdirectory, active) - VALUES (%s, %s, %s, %s)""" - sql_args = [['distance_matrix', 'distance_matrix', True, True], - ['rarefaction_curves', 'rarefaction_curves', True, True], - ['taxa_summary', 'taxa_summary', True, True]] - TRN.add(sql, sql_args, many=True) - - # Step 8: Give a new client id/client secret pair to the plugins - sql = """INSERT INTO qiita.oauth_identifiers 
(client_id, client_secret) - VALUES (%s, %s)""" - # Each plugin needs a client id/secret pair, so we are generating it here - # at random - client_id = get_random_string(50) - client_secret = get_random_string(255) - TRN.add(sql, [client_id, client_secret]) - sql = """INSERT INTO qiita.oauth_software (client_id, software_id) - VALUES (%s, %s)""" - TRN.add(sql, [client_id, divtype_id]) - - # Create the new commands that execute the current analyses. In qiita, - # the only commands that where available are Summarize Taxa, Beta - # Diversity and Alpha Rarefaction. The system was executing rarefaction - # by default, but it should be a different step in the analysis process - # so we are going to create a command for it too. These commands are going - # to be part of the QIIME plugin, so we are going to first retrieve the - # id of the QIIME 1.9.1 plugin, which for sure exists cause it was added - # in patch 33 and there is no way of removing plugins - - # Step 1: Get the QIIME plugin id - sql = """SELECT software_id - FROM qiita.software - WHERE name = 'QIIME' AND version = '1.9.1'""" - TRN.add(sql) - qiime_id = TRN.execute_fetchlast() - - # Step 2: Insert the new commands in the software_command table - sql = """INSERT INTO qiita.software_command (software_id, name, description) - VALUES (%s, %s, %s) - RETURNING command_id""" - TRN.add(sql, [qiime_id, 'Summarize Taxa', 'Plots taxonomy summaries at ' - 'different taxonomy levels']) - sum_taxa_cmd_id = TRN.execute_fetchlast() - TRN.add(sql, [qiime_id, 'Beta Diversity', - 'Computes and plots beta diversity results']) - bdiv_cmd_id = TRN.execute_fetchlast() - TRN.add(sql, [qiime_id, 'Alpha Rarefaction', - 'Computes and plots alpha rarefaction results']) - arare_cmd_id = TRN.execute_fetchlast() - TRN.add(sql, [qiime_id, 'Single Rarefaction', - 'Rarefies the input table by random sampling without ' - 'replacement']) - srare_cmd_id = TRN.execute_fetchlast() - - # Step 3: Insert the parameters for each command - sql = 
"""INSERT INTO qiita.command_parameter - (command_id, parameter_name, parameter_type, required, - default_value) - VALUES (%s, %s, %s, %s, %s) - RETURNING command_parameter_id""" - sql_args = [ - # Summarize Taxa - (sum_taxa_cmd_id, 'metadata_category', 'string', False, ''), - (sum_taxa_cmd_id, 'sort', 'bool', False, 'False'), - # Beta Diversity - (bdiv_cmd_id, 'tree', 'string', False, ''), - (bdiv_cmd_id, 'metric', - 'choice:["abund_jaccard","binary_chisq","binary_chord",' - '"binary_euclidean","binary_hamming","binary_jaccard",' - '"binary_lennon","binary_ochiai","binary_otu_gain","binary_pearson",' - '"binary_sorensen_dice","bray_curtis","bray_curtis_faith",' - '"bray_curtis_magurran","canberra","chisq","chord","euclidean",' - '"gower","hellinger","kulczynski","manhattan","morisita_horn",' - '"pearson","soergel","spearman_approx","specprof","unifrac",' - '"unifrac_g","unifrac_g_full_tree","unweighted_unifrac",' - '"unweighted_unifrac_full_tree","weighted_normalized_unifrac",' - '"weighted_unifrac"]', False, '"binary_jaccard"'), - # Alpha rarefaction - (arare_cmd_id, 'tree', 'string', False, ''), - (arare_cmd_id, 'num_steps', 'integer', False, 10), - (arare_cmd_id, 'min_rare_depth', 'integer', False, 10), - (arare_cmd_id, 'max_rare_depth', 'integer', False, 'Default'), - (arare_cmd_id, 'metrics', - 'mchoice:["ace","berger_parker_d","brillouin_d","chao1","chao1_ci",' - '"dominance","doubles","enspie","equitability","esty_ci",' - '"fisher_alpha","gini_index","goods_coverage","heip_e",' - '"kempton_taylor_q","margalef","mcintosh_d","mcintosh_e",' - '"menhinick","michaelis_menten_fit","observed_otus",' - '"observed_species","osd","simpson_reciprocal","robbins",' - '"shannon","simpson","simpson_e","singles","strong","PD_whole_tree"]', - False, '["chao1","observed_otus"]'), - # Single rarefaction - (srare_cmd_id, 'depth', 'integer', True, None), - (srare_cmd_id, 'subsample_multinomial', 'bool', False, 'False') - ] - TRN.add(sql, sql_args, many=True) - - TRN.add(sql, 
[sum_taxa_cmd_id, 'biom_table', 'artifact', True, None]) - sum_taxa_cmd_param_id = TRN.execute_fetchlast() - TRN.add(sql, [bdiv_cmd_id, 'biom_table', 'artifact', True, None]) - bdiv_cmd_param_id = TRN.execute_fetchlast() - TRN.add(sql, [arare_cmd_id, 'biom_table', 'artifact', True, None]) - arare_cmd_param_id = TRN.execute_fetchlast() - TRN.add(sql, [srare_cmd_id, 'biom_table', 'artifact', True, None]) - srare_cmd_param_id = TRN.execute_fetchlast() - - # Step 4: Connect the artifact parameters with the artifact types that - # they accept - sql = """SELECT artifact_type_id - FROM qiita.artifact_type - WHERE artifact_type = 'BIOM'""" - TRN.add(sql) - biom_atype_id = TRN.execute_fetchlast() - - sql = """INSERT INTO qiita.parameter_artifact_type - (command_parameter_id, artifact_type_id) - VALUES (%s, %s)""" - sql_args = [[sum_taxa_cmd_param_id, biom_atype_id], - [bdiv_cmd_param_id, biom_atype_id], - [arare_cmd_param_id, biom_atype_id], - [srare_cmd_param_id, biom_atype_id]] - TRN.add(sql, sql_args, many=True) - - # Step 5: Add the outputs of the command. 
- sql = """INSERT INTO qiita.command_output - (name, command_id, artifact_type_id) - VALUES (%s, %s, %s) - RETURNING command_output_id""" - TRN.add(sql, ['taxa_summary', sum_taxa_cmd_id, ts_atype_id]) - sum_taxa_cmd_out_id = TRN.execute_fetchlast() - TRN.add(sql, ['distance_matrix', bdiv_cmd_id, dm_atype_id]) - bdiv_cmd_out_id = TRN.execute_fetchlast() - TRN.add(sql, ['rarefaction_curves', arare_cmd_id, rc_atype_id]) - arare_cmd_out_id = TRN.execute_fetchlast() - TRN.add(sql, ['rarefied_table', srare_cmd_id, biom_atype_id]) - srare_cmd_out_id = TRN.execute_fetchlast() - - # Step 6: Add default parameter sets - sql = """INSERT INTO qiita.default_parameter_set - (command_id, parameter_set_name, parameter_set) - VALUES (%s, %s, %s)""" - sql_args = [ - [sum_taxa_cmd_id, 'Defaults', - '{"sort": false, "metadata_category": ""}'], - [bdiv_cmd_id, 'Unweighted UniFrac', - '{"metrics": "unweighted_unifrac", "tree": ""}'], - [arare_cmd_id, 'Defaults', - '{"max_rare_depth": "Default", "tree": "", "num_steps": 10, ' - '"min_rare_depth": 10, "metrics": ["chao1", "observed_otus"]}'], - [srare_cmd_id, 'Defaults', - '{"subsample_multinomial": "False"}']] - TRN.add(sql, sql_args, many=True) - -# At this point we are ready to start transferring the data from the old -# structures to the new structures. Overview of the procedure: -# Step 1: Add initial set of artifacts up to rarefied table -# Step 2: Transfer the "analisys jobs" to processing jobs and create -# the analysis artifacts -db_dir = get_db_files_base_dir() -with TRN: - sql = "SELECT * FROM qiita.analysis" - TRN.add(sql) - analysis_info = TRN.execute_fetchindex() - - # Loop through all the analysis - for analysis in analysis_info: - # Step 1: Add the inital set of artifacts. An analysis starts with - # a set of BIOM artifacts. 
- sql = """SELECT * - FROM qiita.analysis_filepath - JOIN qiita.filepath USING (filepath_id) - JOIN qiita.filepath_type USING (filepath_type_id) - WHERE analysis_id = %s AND filepath_type = 'biom'""" - TRN.add(sql, [analysis['analysis_id']]) - analysis_bioms = TRN.execute_fetchindex() - - # Loop through all the biom tables associated with the current analysis - # so we can create the initial set of artifacts - for biom_data in analysis_bioms: - # Get the path of the BIOM table - sql = """SELECT filepath, mountpoint - FROM qiita.filepath - JOIN qiita.data_directory USING (data_directory_id) - WHERE filepath_id = %s""" - TRN.add(sql, [biom_data['filepath_id']]) - # Magic number 0: There is only a single row in the query result - fp_info = TRN.execute_fetchindex()[0] - filepath = join(db_dir, fp_info['mountpoint'], fp_info['filepath']) - - # We need to check if the BIOM table has been rarefied or not - table = load_table(filepath) - depths = set(table.sum(axis='sample')) - if len(depths) == 1: - # The BIOM table was rarefied - # Create the initial unrarefied artifact - initial_biom_artifact_id = create_non_rarefied_biom_artifact( - analysis, biom_data, table) - # Create the rarefaction job - rarefaction_job_id, params = create_rarefaction_job( - depths.pop(), initial_biom_artifact_id, analysis, - srare_cmd_id) - # Create the rarefied artifact - rarefied_biom_artifact_id = create_rarefied_biom_artifact( - analysis, srare_cmd_id, biom_data, params, - initial_biom_artifact_id, rarefaction_job_id, - srare_cmd_out_id) - else: - # The BIOM table was not rarefied, use current table as initial - initial_biom_id = transfer_file_to_artifact() - - # Loop through all the jobs that used this biom table as input - sql = """SELECT * - FROM qiita.job - WHERE reverse(split_part(reverse( - options::json->>'--otu_table_fp'), '/', 1)) = %s""" - TRN.add(sql, [filepath]) - analysis_jobs = TRN.execute_fetchindex() - for job_data in analysis_jobs: - # Identify which command the current job 
exeucted - if job_data['command_id'] == 1: - # Taxa summaries - cmd_id = sum_taxa_cmd_id - params = ('{"biom_table":%d,"metadata_category":"",' - '"sort":false}' % initial_biom_id) - output_artifact_type_id = ts_atype_id - cmd_out_id = sum_taxa_cmd_out_id - elif job_data['command_id'] == 2: - # Beta diversity - cmd_id = bdiv_cmd_id - tree_fp = loads(job_data['options'])['--tree_fp'] - if tree_fp: - params = ('{"biom_table":%d,"tree":"%s","metrics":' - '["unweighted_unifrac","weighted_unifrac"]}' - % (initial_biom_id, tree_fp)) - else: - params = ('{"biom_table":%d,"metrics":["bray_curtis",' - '"gower","canberra","pearson"]}' - % initial_biom_id) - output_artifact_type_id = dm_atype_id - cmd_out_id = bdiv_cmd_out_id - else: - # Alpha rarefaction - cmd_id = arare_cmd_id - tree_fp = loads(job_data['options'])['--tree_fp'] - params = ('{"biom_table":%d,"tree":"%s","num_steps":"10",' - '"min_rare_depth":"10",' - '"max_rare_depth":"Default"}' - % (initial_biom_id, tree_fp)) - output_artifact_type_id = rc_atype_id - cmd_out_id = arare_cmd_out_id - - transfer_job(analysis, cmd_id, params, initial_biom_id, - job_data, cmd_out_id, biom_data, - output_artifact_type_id) - -errors = [] -with TRN: - # Unlink the analysis from the biom table filepaths - # Magic number 7 -> biom filepath type - sql = """DELETE FROM qiita.analysis_filepath - WHERE filepath_id IN (SELECT filepath_id - FROM qiita.filepath - WHERE filepath_type_id = 7)""" - TRN.add(sql) - TRN.execute() - - # Delete old structures that are not used anymore - tables = ["collection_job", "collection_analysis", "collection_users", - "collection", "collection_status", "analysis_workflow", - "analysis_chain", "analysis_job", "job_results_filepath", "job", - "job_status", "command_data_type", "command", "analysis_status"] - for table in tables: - TRN.add("DROP TABLE qiita.%s" % table) - try: - TRN.execute() - except Exception as e: - errors.append("Error deleting table %s: %s" % (table, str(e))) - -# Purge filepaths -try: - 
purge_filepaths() -except Exception as e: - errors.append("Error purging filepaths: %s" % str(e)) - -if errors: - print "\n".join(errors) + # a few notes: just getting the preps with duplicated values; ignoring + # column 'sample_id' and tables 'study_sample', 'prep_template', + # 'prep_template_sample' + sql = """SELECT table_name, array_agg(column_name::text) + FROM information_schema.columns + WHERE column_name IN %s + AND table_name LIKE 'prep_%%' + AND table_name NOT IN ( + 'prep_template', 'prep_template_sample') + GROUP BY table_name""" + # note that we are looking for those columns with duplicated names in + # the headers + TRN.add(sql, [tuple(set(cols_prep))]) + for table, columns in viewitems(dict(TRN.execute_fetchindex())): + # [1] the format is table_# so taking the # + pt = PrepTemplate(int(table.split('_')[1])) + # getting just the columns of interest + pt_df = pt.to_dataframe()[columns] + # converting to datetime + for col in columns: + pt_df[col] = pt_df[col].apply(transform_date) + pt.update(pt_df) diff --git a/qiita_db/support_files/patches/python_patches/52.py b/qiita_db/support_files/patches/python_patches/52.py new file mode 100644 index 000000000..9d462d5c9 --- /dev/null +++ b/qiita_db/support_files/patches/python_patches/52.py @@ -0,0 +1,749 @@ +# The code is commented with details on the changes implemented here, +# but here is an overview of the changes needed to transfer the analysis +# data to the plugins structure: +# 1) Create a new type plugin to define the diversity types +# 2) Create the new commands on the existing QIIME plugin to execute the +# existing analyses (beta div, taxa summaries and alpha rarefaction) +# 3) Transfer all the data in the old structures to the plugin structures +# 4) Delete old structures + +from string import ascii_letters, digits +from random import SystemRandom +from os.path import join, exists, basename +from os import makedirs +from json import loads + +from biom import load_table, Table +from biom.util 
import biom_open + +from qiita_db.sql_connection import TRN +from qiita_db.util import (get_db_files_base_dir, purge_filepaths, + get_mountpoint, compute_checksum) +from qiita_db.artifact import Artifact + +# Create some aux functions that are going to make the code more modular +# and easier to understand, since there is a fair amount of work to do to +# trasnfer the data from the old structure to the new one + + +def get_random_string(length): + """Creates a random string of the given length with alphanumeric chars + + Parameters + ---------- + length : int + The desired length of the string + + Returns + ------- + str + The new random string + """ + sr = SystemRandom() + chars = ascii_letters + digits + return ''.join(sr.choice(chars) for i in range(length)) + + +def create_non_rarefied_biom_artifact(analysis, biom_data, rarefied_table): + """Creates the initial non-rarefied BIOM artifact of the analysis + + Parameters + ---------- + analysis : dict + Dictionary with the analysis information + biom_data : dict + Dictionary with the biom file information + rarefied_table : biom.Table + The rarefied BIOM table + + Returns + ------- + int + The id of the new artifact + """ + # The non rarefied biom artifact is the initial biom table of the analysis. + # This table does not currently exist anywhere, so we need to actually + # create the BIOM file. To create this BIOM file we need: (1) the samples + # and artifacts they come from and (2) whether the samples where + # renamed or not. (1) is on the database, but we need to inferr (2) from + # the existing rarefied BIOM table. Fun, fun... 
+ + with TRN: + # Get the samples included in the BIOM table grouped by artifact id + # Note that the analysis contains a BIOM table per data type included + # in it, and the table analysis_sample does not differentiate between + # datatypes, so we need to check the data type in the artifact table + sql = """SELECT artifact_id, array_agg(sample_id) + FROM qiita.analysis_sample + JOIN qiita.artifact USING (artifact_id) + WHERE analysis_id = %s AND data_type_id = %s + GROUP BY artifact_id""" + TRN.add(sql, [analysis['analysis_id'], biom_data['data_type_id']]) + samples_by_artifact = TRN.execute_fetchindex() + + # Create an empty BIOM table to be the new master table + new_table = Table([], [], []) + ids_map = {} + for a_id, samples in samples_by_artifact: + # Get the filepath of the BIOM table from the artifact + artifact = Artifact(a_id) + biom_fp = None + for _, fp, fp_type in artifact.filepaths: + if fp_type == 'biom': + biom_fp = fp + # Note that we are sure that the biom table exists for sure, so + # no need to check if biom_fp is undefined + biom_table = load_table(biom_fp) + biom_table.filter(samples, axis='sample', inplace=True) + new_table = new_table.merge(biom_table) + ids_map.update({sid: "%d.%s" % (a_id, sid) + for sid in biom_table.ids()}) + + # Check if we need to rename the sample ids in the biom table + new_table_ids = set(new_table.ids()) + if not new_table_ids.issuperset(rarefied_table.ids()): + # We need to rename the sample ids + new_table.update_ids(ids_map, 'sample', True, True) + + sql = """INSERT INTO qiita.artifact + (generated_timestamp, data_type_id, visibility_id, + artifact_type_id, submitted_to_vamps) + VALUES (%s, %s, %s, %s, %s) + RETURNING artifact_id""" + # Magic number 4 -> visibility sandbox + # Magix number 7 -> biom artifact type + TRN.add(sql, [analysis['timestamp'], biom_data['data_type_id'], + 4, 7, False]) + artifact_id = TRN.execute_fetchlast() + + # Associate the artifact with the analysis + sql = """INSERT INTO 
qiita.analysis_artifact + (analysis_id, artifact_id) + VALUES (%s, %s)""" + TRN.add(sql, [analysis['analysis_id'], artifact_id]) + # Link the artifact with its file + dd_id, mp = get_mountpoint('BIOM')[0] + dir_fp = join(get_db_files_base_dir(), mp, str(artifact_id)) + if not exists(dir_fp): + makedirs(dir_fp) + new_table_fp = join(dir_fp, "biom_table.biom") + with biom_open(new_table_fp, 'w') as f: + new_table.to_hdf5(f, "Generated by Qiita") + + sql = """INSERT INTO qiita.filepath + (filepath, filepath_type_id, checksum, + checksum_algorithm_id, data_directory_id) + VALUES (%s, %s, %s, %s, %s) + RETURNING filepath_id""" + # Magic number 7 -> filepath_type_id = 'biom' + # Magic number 1 -> the checksum algorithm id + TRN.add(sql, [basename(new_table_fp), 7, + compute_checksum(new_table_fp), 1, dd_id]) + fp_id = TRN.execute_fetchlast() + sql = """INSERT INTO qiita.artifact_filepath + (artifact_id, filepath_id) + VALUES (%s, %s)""" + TRN.add(sql, [artifact_id, fp_id]) + TRN.execute() + + return artifact_id + + +def create_rarefaction_job(depth, biom_artifact_id, analysis, srare_cmd_id): + """Create a new rarefaction job + + Parameters + ---------- + depth : int + The rarefaction depth + biom_artifact_id : int + The artifact id of the input rarefaction biom table + analysis : dict + Dictionary with the analysis information + srare_cmd_id : int + The command id of the single rarefaction command + + Returns + ------- + job_id : str + The job id + params : str + The job parameters + """ + # Add the row in the procesisng job table + params = ('{"depth":%d,"subsample_multinomial":false,"biom_table":%s}' + % (depth, biom_artifact_id)) + with TRN: + # magic number 3: status -> success + sql = """INSERT INTO qiita.processing_job + (email, command_id, command_parameters, + processing_job_status_id) + VALUES (%s, %s, %s, %s) + RETURNING processing_job_id""" + TRN.add(sql, [analysis['email'], srare_cmd_id, params, 3]) + job_id = TRN.execute_fetchlast() + # Step 1.2.b: Link the 
job with the input artifact + sql = """INSERT INTO qiita.artifact_processing_job + (artifact_id, processing_job_id) + VALUES (%s, %s)""" + TRN.add(sql, [biom_artifact_id, job_id]) + TRN.execute() + return job_id, params + + +def transfer_file_to_artifact(analysis_id, a_timestamp, command_id, + data_type_id, params, artifact_type_id, + filepath_id): + """Creates a new artifact with the given filepath id + + Parameters + ---------- + analysis_id : int + The analysis id to attach the artifact + a_timestamp : datetime.datetime + The generated timestamp of the artifact + command_id : int + The command id of the artifact + data_type_id : int + The data type id of the artifact + params : str + The parameters of the artifact + artifact_type_id : int + The artifact type + filepath_id : int + The filepath id + + Returns + ------- + int + The artifact id + """ + with TRN: + # Add the row in the artifact table + # Magic number 4: Visibility -> sandbox + sql = """INSERT INTO qiita.artifact + (generated_timestamp, command_id, data_type_id, + command_parameters, visibility_id, artifact_type_id, + submitted_to_vamps) + VALUES (%s, %s, %s, %s, %s, %s, %s) + RETURNING artifact_id""" + TRN.add(sql, [a_timestamp, command_id, data_type_id, params, 4, + artifact_type_id, False]) + artifact_id = TRN.execute_fetchlast() + # Link the artifact with its file + sql = """INSERT INTO qiita.artifact_filepath (artifact_id, filepath_id) + VALUES (%s, %s)""" + TRN.add(sql, [artifact_id, filepath_id]) + # Link the artifact with the analysis + sql = """INSERT INTO qiita.analysis_artifact + (analysis_id, artifact_id) + VALUES (%s, %s)""" + TRN.add(sql, [analysis_id, artifact_id]) + + return artifact_id + + +def create_rarefied_biom_artifact(analysis, srare_cmd_id, biom_data, params, + parent_biom_artifact_id, rarefaction_job_id, + srare_cmd_out_id): + """Creates the rarefied biom artifact + + Parameters + ---------- + analysis : dict + The analysis information + srare_cmd_id : int + The command id of 
"Single Rarefaction" + biom_data : dict + The biom information + params : str + The processing parameters + parent_biom_artifact_id : int + The parent biom artifact id + rarefaction_job_id : str + The job id of the rarefaction job + srare_cmd_out_id : int + The id of the single rarefaction output + + Returns + ------- + int + The artifact id + """ + with TRN: + # Transfer the file to an artifact + # Magic number 7: artifact type -> biom + artifact_id = transfer_file_to_artifact( + analysis['analysis_id'], analysis['timestamp'], srare_cmd_id, + biom_data['data_type_id'], params, 7, biom_data['filepath_id']) + # Link the artifact with its parent + sql = """INSERT INTO qiita.parent_artifact (artifact_id, parent_id) + VALUES (%s, %s)""" + TRN.add(sql, [artifact_id, parent_biom_artifact_id]) + # Link the artifact as the job output + sql = """INSERT INTO qiita.artifact_output_processing_job + (artifact_id, processing_job_id, command_output_id) + VALUES (%s, %s, %s)""" + TRN.add(sql, [artifact_id, rarefaction_job_id, srare_cmd_out_id]) + return artifact_id + + +def transfer_job(analysis, command_id, params, input_artifact_id, job_data, + cmd_out_id, biom_data, output_artifact_type_id): + """Transfers the job from the old structure to the plugin structure + + Parameters + ---------- + analysis : dict + The analysis information + command_id : int + The id of the command executed + params : str + The parameters used in the job + input_artifact_id : int + The id of the input artifact + job_data : dict + The job information + cmd_out_id : int + The id of the command's output + biom_data : dict + The biom information + output_artifact_type_id : int + The type of the output artifact + """ + with TRN: + # Create the job + # Add the row in the processing job table + # Magic number 3: status -> success + sql = """INSERT INTO qiita.processing_job + (email, command_id, command_parameters, + processing_job_status_id) + VALUES (%s, %s, %s, %s) + RETURNING processing_job_id""" + 
TRN.add(sql, [analysis['email'], command_id, params, 3]) + job_id = TRN.execute_fetchlast() + + # Link the job with the input artifact + sql = """INSERT INTO qiita.artifact_processing_job + (artifact_id, processing_job_id) + VALUES (%s, %s)""" + TRN.add(sql, [input_artifact_id, job_id]) + + # Check if the executed job has results and add them + sql = """SELECT EXISTS(SELECT * + FROM qiita.job_results_filepath + WHERE job_id = %s)""" + TRN.add(sql, [job_data['job_id']]) + if TRN.execute_fetchlast(): + # There are results for the current job. + # Transfer the job files to a new artifact + sql = """SELECT filepath_id + FROM qiita.job_results_filepath + WHERE job_id = %s""" + TRN.add(sql, [job_data['job_id']]) + filepath_id = TRN.execute_fetchlast() + artifact_id = transfer_file_to_artifact( + analysis['analysis_id'], analysis['timestamp'], command_id, + biom_data['data_type_id'], params, output_artifact_type_id, + filepath_id) + + # Link the artifact with its parent + sql = """INSERT INTO qiita.parent_artifact (artifact_id, parent_id) + VALUES (%s, %s)""" + TRN.add(sql, [artifact_id, input_artifact_id]) + # Link the artifact as the job output + sql = """INSERT INTO qiita.artifact_output_processing_job + (artifact_id, processing_job_id, command_output_id) + VALUES (%s, %s, %s)""" + TRN.add(sql, [artifact_id, job_id, cmd_out_id]) + TRN.execute() + else: + # There are no results on the current job, so mark it as + # error + if job_data['log_id'] is None: + # Magic number 2 - we are not using any other severity + # level, so keep using number 2 + sql = """INSERT INTO qiita.logging (time, severity_id, msg) + VALUES (%s, %s, %s) + RETURNING logging_id""" + TRN.add(sql, [analysis['timestamp'], 2, + "Unknown error - patch 47"]) + log_id = TRN.execute_fetchlast() + else: + log_id = job_data['log_id'] + + # Magic number 4 -> status -> error + sql = """UPDATE qiita.processing_job + SET processing_job_status_id = 4, logging_id = %s + WHERE processing_job_id = %s""" + TRN.add(sql, [log_id,
job_id]) + + +# The new commands that we are going to add generate new artifact types. +# These new artifact types are going to be added to a different plugin. +# In interest of time and given that the artifact type system is going to +# change in the near future, we feel that the easiest way to transfer +# the current analyses results is by creating 3 different types of +# artifacts: (1) distance matrix -> which will include the distance matrix, +# the principal coordinates and the emperor plots; (2) rarefaction +# curves -> which will include all the files generated by alpha rarefaction +# and (3) taxonomy summary, which will include all the files generated +# by summarize_taxa_through_plots.py + +# Step 1: Create the new type +with TRN: + # Magic number 2 -> The "artifact definition" software type + sql = """INSERT INTO qiita.software + (name, version, description, environment_script, start_script, + software_type_id) + VALUES ('Diversity types', '0.1.0', + 'Diversity artifacts type plugin', + 'source activate qiita', 'start_diversity_types', 2) + RETURNING software_id""" + TRN.add(sql) + divtype_id = TRN.execute_fetchlast() + + # Step 2: Create the validate and HTML generator commands + sql = """INSERT INTO qiita.software_command (software_id, name, description) + VALUES (%s, %s, %s) + RETURNING command_id""" + TRN.add(sql, [divtype_id, 'Validate', + 'Validates a new artifact of the given diversity type']) + validate_cmd_id = TRN.execute_fetchlast() + TRN.add(sql, [divtype_id, 'Generate HTML summary', + 'Generates the HTML summary of a given diversity type']) + html_summary_cmd_id = TRN.execute_fetchlast() + + # Step 3: Add the parameters for the previous commands + sql = """INSERT INTO qiita.command_parameter + (command_id, parameter_name, parameter_type, required) + VALUES (%s, %s, %s, %s)""" + sql_args = [(validate_cmd_id, 'files', 'string', True), + (validate_cmd_id, 'artifact_type', 'string', True), + (validate_cmd_id, 'template', 'prep_template', False), 
+ (validate_cmd_id, 'analysis', 'analysis', False), + (validate_cmd_id, 'provenance', 'string', False), + (html_summary_cmd_id, 'input_data', 'artifact', True)] + TRN.add(sql, sql_args, many=True) + + # Step 4: Add the new artifact types + sql = """INSERT INTO qiita.artifact_type ( + artifact_type, description, can_be_submitted_to_ebi, + can_be_submitted_to_vamps) + VALUES (%s, %s, %s, %s) + RETURNING artifact_type_id""" + TRN.add(sql, ['distance_matrix', 'Distance matrix holding pairwise ' + 'distance between samples', False, False]) + dm_atype_id = TRN.execute_fetchlast() + TRN.add(sql, ['rarefaction_curves', 'Rarefaction curves', False, False]) + rc_atype_id = TRN.execute_fetchlast() + TRN.add(sql, ['taxa_summary', 'Taxa summary plots', False, False]) + ts_atype_id = TRN.execute_fetchlast() + + # Step 5: Associate each artifact with the filetypes that it accepts + # At this time we are going to add them as directories, just as it is done + # right now. We can make it fancier with the new type system. 
+ # Magic number 8: the filepath_type_id for the directory + sql = """INSERT INTO qiita.artifact_type_filepath_type + (artifact_type_id, filepath_type_id, required) + VALUES (%s, %s, %s)""" + sql_args = [[dm_atype_id, 8, True], + [rc_atype_id, 8, True], + [ts_atype_id, 8, True]] + TRN.add(sql, sql_args, many=True) + + # Step 6: Associate the plugin with the types that it defines + sql = """INSERT INTO qiita.software_artifact_type + (software_id, artifact_type_id) + VALUES (%s, %s)""" + sql_args = [[divtype_id, dm_atype_id], + [divtype_id, rc_atype_id], + [divtype_id, ts_atype_id]] + TRN.add(sql, sql_args, many=True) + + # Step 7: Create the new entries for the data directory + sql = """INSERT INTO qiita.data_directory + (data_type, mountpoint, subdirectory, active) + VALUES (%s, %s, %s, %s)""" + sql_args = [['distance_matrix', 'distance_matrix', True, True], + ['rarefaction_curves', 'rarefaction_curves', True, True], + ['taxa_summary', 'taxa_summary', True, True]] + TRN.add(sql, sql_args, many=True) + + # Step 8: Give a new client id/client secret pair to the plugins + sql = """INSERT INTO qiita.oauth_identifiers (client_id, client_secret) + VALUES (%s, %s)""" + # Each plugin needs a client id/secret pair, so we are generating it here + # at random + client_id = get_random_string(50) + client_secret = get_random_string(255) + TRN.add(sql, [client_id, client_secret]) + sql = """INSERT INTO qiita.oauth_software (client_id, software_id) + VALUES (%s, %s)""" + TRN.add(sql, [client_id, divtype_id]) + + # Create the new commands that execute the current analyses. In qiita, + # the only commands that where available are Summarize Taxa, Beta + # Diversity and Alpha Rarefaction. The system was executing rarefaction + # by default, but it should be a different step in the analysis process + # so we are going to create a command for it too. 
These commands are going + # to be part of the QIIME plugin, so we are going to first retrieve the + # id of the QIIME 1.9.1 plugin, which for sure exists cause it was added + # in patch 33 and there is no way of removing plugins + + # Step 1: Get the QIIME plugin id + sql = """SELECT software_id + FROM qiita.software + WHERE name = 'QIIME' AND version = '1.9.1'""" + TRN.add(sql) + qiime_id = TRN.execute_fetchlast() + + # Step 2: Insert the new commands in the software_command table + sql = """INSERT INTO qiita.software_command (software_id, name, description) + VALUES (%s, %s, %s) + RETURNING command_id""" + TRN.add(sql, [qiime_id, 'Summarize Taxa', 'Plots taxonomy summaries at ' + 'different taxonomy levels']) + sum_taxa_cmd_id = TRN.execute_fetchlast() + TRN.add(sql, [qiime_id, 'Beta Diversity', + 'Computes and plots beta diversity results']) + bdiv_cmd_id = TRN.execute_fetchlast() + TRN.add(sql, [qiime_id, 'Alpha Rarefaction', + 'Computes and plots alpha rarefaction results']) + arare_cmd_id = TRN.execute_fetchlast() + TRN.add(sql, [qiime_id, 'Single Rarefaction', + 'Rarefies the input table by random sampling without ' + 'replacement']) + srare_cmd_id = TRN.execute_fetchlast() + + # Step 3: Insert the parameters for each command + sql = """INSERT INTO qiita.command_parameter + (command_id, parameter_name, parameter_type, required, + default_value) + VALUES (%s, %s, %s, %s, %s) + RETURNING command_parameter_id""" + sql_args = [ + # Summarize Taxa + (sum_taxa_cmd_id, 'metadata_category', 'string', False, ''), + (sum_taxa_cmd_id, 'sort', 'bool', False, 'False'), + # Beta Diversity + (bdiv_cmd_id, 'tree', 'string', False, ''), + (bdiv_cmd_id, 'metric', + 'choice:["abund_jaccard","binary_chisq","binary_chord",' + '"binary_euclidean","binary_hamming","binary_jaccard",' + '"binary_lennon","binary_ochiai","binary_otu_gain","binary_pearson",' + '"binary_sorensen_dice","bray_curtis","bray_curtis_faith",' + '"bray_curtis_magurran","canberra","chisq","chord","euclidean",' 
+ '"gower","hellinger","kulczynski","manhattan","morisita_horn",' + '"pearson","soergel","spearman_approx","specprof","unifrac",' + '"unifrac_g","unifrac_g_full_tree","unweighted_unifrac",' + '"unweighted_unifrac_full_tree","weighted_normalized_unifrac",' + '"weighted_unifrac"]', False, '"binary_jaccard"'), + # Alpha rarefaction + (arare_cmd_id, 'tree', 'string', False, ''), + (arare_cmd_id, 'num_steps', 'integer', False, 10), + (arare_cmd_id, 'min_rare_depth', 'integer', False, 10), + (arare_cmd_id, 'max_rare_depth', 'integer', False, 'Default'), + (arare_cmd_id, 'metrics', + 'mchoice:["ace","berger_parker_d","brillouin_d","chao1","chao1_ci",' + '"dominance","doubles","enspie","equitability","esty_ci",' + '"fisher_alpha","gini_index","goods_coverage","heip_e",' + '"kempton_taylor_q","margalef","mcintosh_d","mcintosh_e",' + '"menhinick","michaelis_menten_fit","observed_otus",' + '"observed_species","osd","simpson_reciprocal","robbins",' + '"shannon","simpson","simpson_e","singles","strong","PD_whole_tree"]', + False, '["chao1","observed_otus"]'), + # Single rarefaction + (srare_cmd_id, 'depth', 'integer', True, None), + (srare_cmd_id, 'subsample_multinomial', 'bool', False, 'False') + ] + TRN.add(sql, sql_args, many=True) + + TRN.add(sql, [sum_taxa_cmd_id, 'biom_table', 'artifact', True, None]) + sum_taxa_cmd_param_id = TRN.execute_fetchlast() + TRN.add(sql, [bdiv_cmd_id, 'biom_table', 'artifact', True, None]) + bdiv_cmd_param_id = TRN.execute_fetchlast() + TRN.add(sql, [arare_cmd_id, 'biom_table', 'artifact', True, None]) + arare_cmd_param_id = TRN.execute_fetchlast() + TRN.add(sql, [srare_cmd_id, 'biom_table', 'artifact', True, None]) + srare_cmd_param_id = TRN.execute_fetchlast() + + # Step 4: Connect the artifact parameters with the artifact types that + # they accept + sql = """SELECT artifact_type_id + FROM qiita.artifact_type + WHERE artifact_type = 'BIOM'""" + TRN.add(sql) + biom_atype_id = TRN.execute_fetchlast() + + sql = """INSERT INTO 
qiita.parameter_artifact_type + (command_parameter_id, artifact_type_id) + VALUES (%s, %s)""" + sql_args = [[sum_taxa_cmd_param_id, biom_atype_id], + [bdiv_cmd_param_id, biom_atype_id], + [arare_cmd_param_id, biom_atype_id], + [srare_cmd_param_id, biom_atype_id]] + TRN.add(sql, sql_args, many=True) + + # Step 5: Add the outputs of the command. + sql = """INSERT INTO qiita.command_output + (name, command_id, artifact_type_id) + VALUES (%s, %s, %s) + RETURNING command_output_id""" + TRN.add(sql, ['taxa_summary', sum_taxa_cmd_id, ts_atype_id]) + sum_taxa_cmd_out_id = TRN.execute_fetchlast() + TRN.add(sql, ['distance_matrix', bdiv_cmd_id, dm_atype_id]) + bdiv_cmd_out_id = TRN.execute_fetchlast() + TRN.add(sql, ['rarefaction_curves', arare_cmd_id, rc_atype_id]) + arare_cmd_out_id = TRN.execute_fetchlast() + TRN.add(sql, ['rarefied_table', srare_cmd_id, biom_atype_id]) + srare_cmd_out_id = TRN.execute_fetchlast() + + # Step 6: Add default parameter sets + sql = """INSERT INTO qiita.default_parameter_set + (command_id, parameter_set_name, parameter_set) + VALUES (%s, %s, %s)""" + sql_args = [ + [sum_taxa_cmd_id, 'Defaults', + '{"sort": false, "metadata_category": ""}'], + [bdiv_cmd_id, 'Unweighted UniFrac', + '{"metrics": "unweighted_unifrac", "tree": ""}'], + [arare_cmd_id, 'Defaults', + '{"max_rare_depth": "Default", "tree": "", "num_steps": 10, ' + '"min_rare_depth": 10, "metrics": ["chao1", "observed_otus"]}'], + [srare_cmd_id, 'Defaults', + '{"subsample_multinomial": "False"}']] + TRN.add(sql, sql_args, many=True) + +# At this point we are ready to start transferring the data from the old +# structures to the new structures. 
Overview of the procedure: +# Step 1: Add initial set of artifacts up to rarefied table +# Step 2: Transfer the "analisys jobs" to processing jobs and create +# the analysis artifacts +db_dir = get_db_files_base_dir() +with TRN: + sql = "SELECT * FROM qiita.analysis" + TRN.add(sql) + analysis_info = TRN.execute_fetchindex() + + # Loop through all the analysis + for analysis in analysis_info: + # Step 1: Add the inital set of artifacts. An analysis starts with + # a set of BIOM artifacts. + sql = """SELECT * + FROM qiita.analysis_filepath + JOIN qiita.filepath USING (filepath_id) + JOIN qiita.filepath_type USING (filepath_type_id) + WHERE analysis_id = %s AND filepath_type = 'biom'""" + TRN.add(sql, [analysis['analysis_id']]) + analysis_bioms = TRN.execute_fetchindex() + + # Loop through all the biom tables associated with the current analysis + # so we can create the initial set of artifacts + for biom_data in analysis_bioms: + # Get the path of the BIOM table + sql = """SELECT filepath, mountpoint + FROM qiita.filepath + JOIN qiita.data_directory USING (data_directory_id) + WHERE filepath_id = %s""" + TRN.add(sql, [biom_data['filepath_id']]) + # Magic number 0: There is only a single row in the query result + fp_info = TRN.execute_fetchindex()[0] + filepath = join(db_dir, fp_info['mountpoint'], fp_info['filepath']) + + # We need to check if the BIOM table has been rarefied or not + table = load_table(filepath) + depths = set(table.sum(axis='sample')) + if len(depths) == 1: + # The BIOM table was rarefied + # Create the initial unrarefied artifact + initial_biom_artifact_id = create_non_rarefied_biom_artifact( + analysis, biom_data, table) + # Create the rarefaction job + rarefaction_job_id, params = create_rarefaction_job( + depths.pop(), initial_biom_artifact_id, analysis, + srare_cmd_id) + # Create the rarefied artifact + rarefied_biom_artifact_id = create_rarefied_biom_artifact( + analysis, srare_cmd_id, biom_data, params, + initial_biom_artifact_id, 
rarefaction_job_id, + srare_cmd_out_id) + else: + # The BIOM table was not rarefied, use current table as initial + rarefied_biom_artifact_id = transfer_file_to_artifact( + analysis['analysis_id'], analysis['timestamp'], None, + biom_data['data_type_id'], None, 7, + biom_data['filepath_id']) + + # Loop through all the jobs that used this biom table as input + sql = """SELECT * + FROM qiita.job + WHERE reverse(split_part(reverse( + options::json->>'--otu_table_fp'), '/', 1)) = %s""" + TRN.add(sql, [basename(filepath)]) + analysis_jobs = TRN.execute_fetchindex() + for job_data in analysis_jobs: + # Identify which command the current job executed + if job_data['command_id'] == 1: + # Taxa summaries + cmd_id = sum_taxa_cmd_id + params = ('{"biom_table":%d,"metadata_category":"",' + '"sort":false}' % rarefied_biom_artifact_id) + output_artifact_type_id = ts_atype_id + cmd_out_id = sum_taxa_cmd_out_id + elif job_data['command_id'] == 2: + # Beta diversity + cmd_id = bdiv_cmd_id + tree_fp = loads(job_data['options'])['--tree_fp'] + if tree_fp: + params = ('{"biom_table":%d,"tree":"%s","metrics":' + '["unweighted_unifrac","weighted_unifrac"]}' + % (rarefied_biom_artifact_id, tree_fp)) + else: + params = ('{"biom_table":%d,"metrics":["bray_curtis",' + '"gower","canberra","pearson"]}' + % rarefied_biom_artifact_id) + output_artifact_type_id = dm_atype_id + cmd_out_id = bdiv_cmd_out_id + else: + # Alpha rarefaction + cmd_id = arare_cmd_id + tree_fp = loads(job_data['options'])['--tree_fp'] + params = ('{"biom_table":%d,"tree":"%s","num_steps":"10",' + '"min_rare_depth":"10",' + '"max_rare_depth":"Default"}' + % (rarefied_biom_artifact_id, tree_fp)) + output_artifact_type_id = rc_atype_id + cmd_out_id = arare_cmd_out_id + + transfer_job(analysis, cmd_id, params, rarefied_biom_artifact_id, + job_data, cmd_out_id, biom_data, + output_artifact_type_id) + +errors = [] +with TRN: + # Unlink the analysis from the biom table filepaths + # Magic number 7 -> biom filepath type + sql = """DELETE FROM qiita.analysis_filepath + WHERE filepath_id IN (SELECT filepath_id + FROM qiita.filepath + WHERE filepath_type_id = 7)""" + TRN.add(sql) + TRN.execute() + + # Delete old
structures that are not used anymore + tables = ["collection_job", "collection_analysis", "collection_users", + "collection", "collection_status", "analysis_workflow", + "analysis_chain", "analysis_job", "job_results_filepath", "job", + "job_status", "command_data_type", "command", "analysis_status"] + for table in tables: + TRN.add("DROP TABLE qiita.%s" % table) + try: + TRN.execute() + except Exception as e: + errors.append("Error deleting table %s: %s" % (table, str(e))) + +# Purge filepaths +try: + purge_filepaths() +except Exception as e: + errors.append("Error purging filepaths: %s" % str(e)) + +if errors: + print "\n".join(errors) diff --git a/qiita_db/support_files/test_data/analysis/1_analysis_mapping_exp.txt b/qiita_db/support_files/test_data/analysis/1_analysis_mapping_exp.txt index 7cbe9be8e..e2923d93a 100644 --- a/qiita_db/support_files/test_data/analysis/1_analysis_mapping_exp.txt +++ b/qiita_db/support_files/test_data/analysis/1_analysis_mapping_exp.txt @@ -1,4 +1,4 @@ -#SampleID BarcodeSequence LinkerPrimerSequence center_name center_project_name emp_status experiment_center experiment_design_description experiment_title illumina_technology instrument_model library_construction_protocol pcr_primers platform run_center run_date run_prefix samp_size sample_center sequencing_meth study_center target_gene target_subfragment qiita_prep_id altitude anonymized_name assigned_from_geo collection_timestamp common_name country depth description_duplicate elevation env_biome env_feature has_extracted_data has_physical_specimen host_subject_id host_taxid latitude longitude ph physical_location samp_salinity sample_type season_environment taxon_id temp texture tot_nitro tot_org_carb water_content_soil qiita_study_title qiita_study_alias qiita_owner qiita_principal_investigator Description -1.SKB8.640193 AGCGCTCACATC GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This 
analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0.0 SKB8 n 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Burmese root 114.0 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat True True 1001:M7 3483 74.0894932572 65.3283470202 6.94 ANL 7.15 ENVO:soil winter 1118232 15.0 64.6 sand, 17.6 silt, 17.8 clay 1.41 5.0 0.164 Identification of the Microbiomes for Cannabis Soils Cannabis Soils Dude PIDude Cannabis Soil Microbiome -1.SKD8.640184 TGAGTGGTCTGT GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] 
The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0.0 SKD8 n 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Diesel Root 114.0 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat True True 1001:D9 3483 57.571893782 32.5563076447 6.8 ANL 7.1 ENVO:soil winter 1118232 15.0 66 sand, 16.3 silt, 17.7 clay 1.51 4.32 0.178 Identification of the Microbiomes for Cannabis Soils Cannabis Soils Dude PIDude Cannabis Soil Microbiome -1.SKB7.640196 CGGCCTAAGTTC GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0.0 SKB7 n 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Burmese root 114.0 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat True True 1001:M8 3483 13.089194595 92.5274472082 6.94 ANL 7.15 ENVO:soil winter 1118232 15.0 64.6 sand, 17.6 silt, 17.8 clay 1.41 5.0 0.164 Identification of the Microbiomes for Cannabis Soils Cannabis Soils Dude PIDude Cannabis Soil Microbiome +#SampleID BarcodeSequence LinkerPrimerSequence center_name emp_status experiment_center experiment_design_description experiment_title illumina_technology instrument_model library_construction_protocol pcr_primers platform run_center run_date run_prefix samp_size sample_center sequencing_meth study_center target_gene target_subfragment qiita_prep_id altitude anonymized_name assigned_from_geo collection_timestamp common_name country depth description_duplicate dna_extracted elevation env_biome env_feature host_subject_id host_taxid latitude longitude ph physical_specimen_location physical_specimen_remaining qiita_study_id samp_salinity sample_type scientific_name season_environment taxon_id temp texture tot_nitro tot_org_carb water_content_soil qiita_study_title qiita_study_alias qiita_owner qiita_principal_investigator Description +1.SKB8.640193 AGCGCTCACATC GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. 
[For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0 SKB8 n 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Burmese root true 114 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat 1001:M7 3483 74.0894932572 65.3283470202 6.94 ANL true 1 7.15 ENVO:soil 1118232 winter 1118232 15 64.6 sand, 17.6 silt, 17.8 clay 1.41 5 0.164 Identification of the Microbiomes for Cannabis Soils Cannabis Soils Dude PIDude Cannabis Soil Microbiome +1.SKD8.640184 TGAGTGGTCTGT GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0 SKD8 n 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Diesel Root true 114 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat 1001:D9 3483 57.571893782 32.5563076447 6.8 ANL true 1 7.1 ENVO:soil 1118232 winter 1118232 15 66 sand, 16.3 silt, 17.7 clay 1.51 4.32 0.178 Identification of the Microbiomes for Cannabis Soils Cannabis Soils Dude PIDude Cannabis Soil Microbiome +1.SKB7.640196 CGGCCTAAGTTC GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0 SKB7 n 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Burmese root true 114 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat 1001:M8 3483 13.089194595 92.5274472082 6.94 ANL true 1 7.15 ENVO:soil 1118232 winter 1118232 15 64.6 sand, 17.6 silt, 17.8 clay 1.41 5 0.164 Identification of the Microbiomes for Cannabis Soils Cannabis Soils Dude PIDude Cannabis Soil Microbiome diff --git a/qiita_db/support_files/test_data/analysis/not_merged_samples.txt b/qiita_db/support_files/test_data/analysis/not_merged_samples.txt index b012467e4..854bd0679 100644 --- a/qiita_db/support_files/test_data/analysis/not_merged_samples.txt +++ b/qiita_db/support_files/test_data/analysis/not_merged_samples.txt @@ -1,7 +1,7 @@ -#SampleID BarcodeSequence LinkerPrimerSequence center_name center_project_name emp_status experiment_center experiment_design_description experiment_title illumina_technology instrument_model library_construction_protocol pcr_primers platform run_center run_date run_prefix samp_size sample_center sequencing_meth study_center target_gene target_subfragment qiita_prep_id altitude anonymized_name assigned_from_geo collection_timestamp common_name country depth description_duplicate elevation env_biome env_feature has_extracted_data has_physical_specimen host_subject_id host_taxid latitude longitude ph physical_location samp_salinity sample_type season_environment taxon_id temp texture tot_nitro tot_org_carb water_content_soil original_SampleID qiita_aid qiita_study_title qiita_study_alias qiita_owner qiita_principal_investigator Description -3.1.SKB8.640193 AGCGCTCACATC GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome 
research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0.0 SKB8 n 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Burmese root 114.0 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat True True 1001:M7 3483 74.0894932572 65.3283470202 6.94 ANL 7.15 ENVO:soil winter 1118232 15.0 64.6 sand, 17.6 silt, 17.8 clay 1.41 5.0 0.164 1.SKB8.640193 3 Identification of the Microbiomes for Cannabis Soils Cannabis Soils Dude PIDude Cannabis Soil Microbiome -3.1.SKD8.640184 TGAGTGGTCTGT GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] 
The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0.0 SKD8 n 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Diesel Root 114.0 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat True True 1001:D9 3483 57.571893782 32.5563076447 6.8 ANL 7.1 ENVO:soil winter 1118232 15.0 66 sand, 16.3 silt, 17.7 clay 1.51 4.32 0.178 1.SKD8.640184 3 Identification of the Microbiomes for Cannabis Soils Cannabis Soils Dude PIDude Cannabis Soil Microbiome -3.1.SKB7.640196 CGGCCTAAGTTC GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0.0 SKB7 n 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Burmese root 114.0 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat True True 1001:M8 3483 13.089194595 92.5274472082 6.94 ANL 7.15 ENVO:soil winter 1118232 15.0 64.6 sand, 17.6 silt, 17.8 clay 1.41 5.0 0.164 1.SKB7.640196 3 Identification of the Microbiomes for Cannabis Soils Cannabis Soils Dude PIDude Cannabis Soil Microbiome -4.1.SKB8.640193 AGCGCTCACATC GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0.0 SKB8 n 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Burmese root 114.0 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat True True 1001:M7 3483 74.0894932572 65.3283470202 6.94 ANL 7.15 ENVO:soil winter 1118232 15.0 64.6 sand, 17.6 silt, 17.8 clay 1.41 5.0 0.164 1.SKB8.640193 4 Identification of the Microbiomes for Cannabis Soils Cannabis Soils Dude PIDude Cannabis Soil Microbiome -4.1.SKD8.640184 TGAGTGGTCTGT GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0.0 SKD8 n 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Diesel Root 114.0 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat True True 1001:D9 3483 57.571893782 32.5563076447 6.8 ANL 7.1 ENVO:soil winter 1118232 15.0 66 sand, 16.3 silt, 17.7 clay 1.51 4.32 0.178 1.SKD8.640184 4 Identification of the Microbiomes for Cannabis Soils Cannabis Soils Dude PIDude Cannabis Soil Microbiome -4.1.SKB7.640196 CGGCCTAAGTTC GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0.0 SKB7 n 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Burmese root 114.0 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat True True 1001:M8 3483 13.089194595 92.5274472082 6.94 ANL 7.15 ENVO:soil winter 1118232 15.0 64.6 sand, 17.6 silt, 17.8 clay 1.41 5.0 0.164 1.SKB7.640196 4 Identification of the Microbiomes for Cannabis Soils Cannabis Soils Dude PIDude Cannabis Soil Microbiome +#SampleID BarcodeSequence LinkerPrimerSequence center_name emp_status experiment_center experiment_design_description experiment_title illumina_technology instrument_model library_construction_protocol pcr_primers platform run_center run_date run_prefix samp_size sample_center sequencing_meth study_center target_gene target_subfragment qiita_prep_id altitude anonymized_name assigned_from_geo collection_timestamp common_name country depth description_duplicate dna_extracted elevation env_biome env_feature host_subject_id host_taxid latitude longitude ph physical_specimen_location physical_specimen_remaining qiita_study_id samp_salinity sample_type scientific_name season_environment taxon_id temp texture tot_nitro tot_org_carb water_content_soil original_SampleID qiita_aid qiita_study_title qiita_study_alias qiita_owner qiita_principal_investigator Description +3.1.SKB8.640193 AGCGCTCACATC GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. 
[For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0 SKB8 n 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Burmese root true 114 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat 1001:M7 3483 74.0894932572 65.3283470202 6.94 ANL true 1 7.15 ENVO:soil 1118232 winter 1118232 15 64.6 sand, 17.6 silt, 17.8 clay 1.41 5 0.164 1.SKB8.640193 3 Identification of the Microbiomes for Cannabis Soils Cannabis Soils Dude PIDude Cannabis Soil Microbiome +3.1.SKD8.640184 TGAGTGGTCTGT GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0 SKD8 n 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Diesel Root true 114 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat 1001:D9 3483 57.571893782 32.5563076447 6.8 ANL true 1 7.1 ENVO:soil 1118232 winter 1118232 15 66 sand, 16.3 silt, 17.7 clay 1.51 4.32 0.178 1.SKD8.640184 3 Identification of the Microbiomes for Cannabis Soils Cannabis Soils Dude PIDude Cannabis Soil Microbiome +3.1.SKB7.640196 CGGCCTAAGTTC GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0 SKB7 n 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Burmese root true 114 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat 1001:M8 3483 13.089194595 92.5274472082 6.94 ANL true 1 7.15 ENVO:soil 1118232 winter 1118232 15 64.6 sand, 17.6 silt, 17.8 clay 1.41 5 0.164 1.SKB7.640196 3 Identification of the Microbiomes for Cannabis Soils Cannabis Soils Dude PIDude Cannabis Soil Microbiome +4.1.SKB7.640196 CGGCCTAAGTTC GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0 SKB7 n 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Burmese root true 114 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat 1001:M8 3483 13.089194595 92.5274472082 6.94 ANL true 1 7.15 ENVO:soil 1118232 winter 1118232 15 64.6 sand, 17.6 silt, 17.8 clay 1.41 5 0.164 1.SKB7.640196 4 Identification of the Microbiomes for Cannabis Soils Cannabis Soils Dude PIDude Cannabis Soil Microbiome +4.1.SKD8.640184 TGAGTGGTCTGT GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0 SKD8 n 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Diesel Root true 114 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat 1001:D9 3483 57.571893782 32.5563076447 6.8 ANL true 1 7.1 ENVO:soil 1118232 winter 1118232 15 66 sand, 16.3 silt, 17.7 clay 1.51 4.32 0.178 1.SKD8.640184 4 Identification of the Microbiomes for Cannabis Soils Cannabis Soils Dude PIDude Cannabis Soil Microbiome +4.1.SKB8.640193 AGCGCTCACATC GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0 SKB8 n 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Burmese root true 114 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat 1001:M7 3483 74.0894932572 65.3283470202 6.94 ANL true 1 7.15 ENVO:soil 1118232 winter 1118232 15 64.6 sand, 17.6 silt, 17.8 clay 1.41 5 0.164 1.SKB8.640193 4 Identification of the Microbiomes for Cannabis Soils Cannabis Soils Dude PIDude Cannabis Soil Microbiome diff --git a/qiita_db/support_files/test_data/templates/1_prep_1_qiime_19700101-000000.txt b/qiita_db/support_files/test_data/templates/1_prep_1_qiime_19700101-000000.txt index a34afb0aa..86c8fc819 100644 --- a/qiita_db/support_files/test_data/templates/1_prep_1_qiime_19700101-000000.txt +++ b/qiita_db/support_files/test_data/templates/1_prep_1_qiime_19700101-000000.txt @@ -1,28 +1,28 @@ -#SampleID BarcodeSequence LinkerPrimerSequence center_name center_project_name emp_status experiment_center experiment_design_description experiment_title illumina_technology instrument_model library_construction_protocol pcr_primers platform run_center run_date run_prefix samp_size sample_center sequencing_meth study_center target_gene target_subfragment qiita_prep_id altitude anonymized_name assigned_from_geo collection_timestamp common_name country depth description_duplicate elevation env_biome env_feature has_extracted_data has_physical_specimen host_subject_id host_taxid latitude longitude ph physical_location samp_salinity sample_type season_environment taxon_id temp texture tot_nitro tot_org_carb water_content_soil Description -1.SKB1.640202 GTCCGCAAGTTA GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. 
The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0.0 SKB1 n 2011-11-11 13:00:00 soil metagenome GAZ:United States of America 0.15 Burmese bulk 114.0 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat True True 1001:M2 3483 4.59216095574 63.5115213108 6.94 ANL 7.15 ENVO:soil winter 410658 15.0 64.6 sand, 17.6 silt, 17.8 clay 1.41 5.0 0.164 Cannabis Soil Microbiome -1.SKB2.640194 CGTAGAGCTCTC GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0.0 SKB2 n 2011-11-11 13:00:00 soil metagenome GAZ:United States of America 0.15 Burmese bulk 114.0 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat True True 1001:B4 3483 35.2374368957 68.5041623253 6.94 ANL 7.15 ENVO:soil winter 410658 15.0 64.6 sand, 17.6 silt, 17.8 clay 1.41 5.0 0.164 Cannabis Soil Microbiome -1.SKB3.640195 CCTCTGAGAGCT GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0.0 SKB3 n 2011-11-11 13:00:00 soil metagenome GAZ:United States of America 0.15 Burmese bulk 114.0 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat True True 1001:M6 3483 95.2060749748 27.3592668624 6.94 ANL 7.15 ENVO:soil winter 410658 15.0 64.6 sand, 17.6 silt, 17.8 clay 1.41 5.0 0.164 Cannabis Soil Microbiome -1.SKB4.640189 CCTCGATGCAGT GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0.0 SKB4 n 2011-11-11 13:00:00 rhizosphere metagenome GAZ:United States of America 0.15 Burmese Rhizo 114.0 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat True True 1001:D7 3483 43.9614715197 82.8516734159 6.94 ANL 7.15 ENVO:soil winter 939928 15.0 64.6 sand, 17.6 silt, 17.8 clay 1.41 5.0 0.164 Cannabis Soil Microbiome -1.SKB5.640181 GCGGACTATTCA GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0.0 SKB5 n 2011-11-11 13:00:00 rhizosphere metagenome GAZ:United States of America 0.15 Burmese Rhizo 114.0 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat True True 1001:M4 3483 10.6655599093 70.784770579 6.94 ANL 7.15 ENVO:soil winter 939928 15.0 64.6 sand, 17.6 silt, 17.8 clay 1.41 5.0 0.164 Cannabis Soil Microbiome -1.SKB6.640176 CGTGCACAATTG GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0.0 SKB6 n 2011-11-11 13:00:00 rhizosphere metagenome GAZ:United States of America 0.15 Burmese Rhizo 114.0 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat True True 1001:D5 3483 78.3634273709 74.423907894 6.94 ANL 7.15 ENVO:soil winter 939928 15.0 64.6 sand, 17.6 silt, 17.8 clay 1.41 5.0 0.164 Cannabis Soil Microbiome -1.SKB7.640196 CGGCCTAAGTTC GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0.0 SKB7 n 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Burmese root 114.0 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat True True 1001:M8 3483 13.089194595 92.5274472082 6.94 ANL 7.15 ENVO:soil winter 1118232 15.0 64.6 sand, 17.6 silt, 17.8 clay 1.41 5.0 0.164 Cannabis Soil Microbiome -1.SKB8.640193 AGCGCTCACATC GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0.0 SKB8 n 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Burmese root 114.0 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat True True 1001:M7 3483 74.0894932572 65.3283470202 6.94 ANL 7.15 ENVO:soil winter 1118232 15.0 64.6 sand, 17.6 silt, 17.8 clay 1.41 5.0 0.164 Cannabis Soil Microbiome -1.SKB9.640200 TGGTTATGGCAC GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0.0 SKB9 n 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Burmese root 114.0 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat True True 1001:B3 3483 12.6245524972 96.0693176066 6.8 ANL 7.15 ENVO:soil winter 1118232 15.0 64.6 sand, 17.6 silt, 17.8 clay 1.41 5.0 0.164 Cannabis Soil Microbiome -1.SKD1.640179 CGAGGTTCTGAT GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0.0 SKD1 n 2011-11-11 13:00:00 soil metagenome GAZ:United States of America 0.15 Diesel bulk 114.0 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat True True 1001:M5 3483 68.0991287718 34.8360987059 6.8 ANL 7.1 ENVO:soil winter 410658 15.0 66 sand, 16.3 silt, 17.7 clay 1.51 4.32 0.178 Cannabis Soil Microbiome -1.SKD2.640178 AACTCCTGTGGA GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0.0 SKD2 n 2011-11-11 13:00:00 soil metagenome GAZ:United States of America 0.15 Diesel bulk 114.0 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat True True 1001:B5 3483 53.5050692395 31.6056761814 6.8 ANL 7.1 ENVO:soil winter 410658 15.0 66 sand, 16.3 silt, 17.7 clay 1.51 4.32 0.178 Cannabis Soil Microbiome -1.SKD3.640198 TAATGGTCGTAG GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0.0 SKD3 n 2011-11-11 13:00:00 soil metagenome GAZ:United States of America 0.15 Diesel bulk 114.0 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat True True 1001:B1 3483 84.0030227585 66.8954849864 6.8 ANL 7.1 ENVO:soil winter 410658 15.0 66 sand, 16.3 silt, 17.7 clay 1.51 4.32 0.178 Cannabis Soil Microbiome -1.SKD4.640185 TTGCACCGTCGA GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0.0 SKD4 n 2011-11-11 13:00:00 rhizosphere metagenome GAZ:United States of America 0.15 Diesel Rhizo 114.0 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat True True 1001:M9 3483 40.8623799474 6.66444220187 6.8 ANL 7.1 ENVO:soil winter 939928 15.0 66 sand, 16.3 silt, 17.7 clay 1.51 4.32 0.178 Cannabis Soil Microbiome -1.SKD5.640186 TGCTACAGACGT GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0.0 SKD5 n 2011-11-11 13:00:00 rhizosphere metagenome GAZ:United States of America 0.15 Diesel Rhizo 114.0 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat True True 1001:M1 3483 85.4121476399 15.6526750776 6.8 ANL 7.1 ENVO:soil winter 939928 15.0 66 sand, 16.3 silt, 17.7 clay 1.51 4.32 0.178 Cannabis Soil Microbiome -1.SKD6.640190 ATGGCCTGACTA GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0.0 SKD6 n 2011-11-11 13:00:00 rhizosphere metagenome GAZ:United States of America 0.15 Diesel Rhizo 114.0 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat True True 1001:B9 3483 29.1499460692 82.1270418227 6.8 ANL 7.1 ENVO:soil winter 939928 15.0 66 sand, 16.3 silt, 17.7 clay 1.51 4.32 0.178 Cannabis Soil Microbiome -1.SKD7.640191 ACGCACATACAA GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0.0 SKD7 n 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Diesel Root 114.0 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat True True 1001:D6 3483 68.51099627 2.35063674718 6.8 ANL 7.1 ENVO:soil winter 1118232 15.0 66 sand, 16.3 silt, 17.7 clay 1.51 4.32 0.178 Cannabis Soil Microbiome -1.SKD8.640184 TGAGTGGTCTGT GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0.0 SKD8 n 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Diesel Root 114.0 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat True True 1001:D9 3483 57.571893782 32.5563076447 6.8 ANL 7.1 ENVO:soil winter 1118232 15.0 66 sand, 16.3 silt, 17.7 clay 1.51 4.32 0.178 Cannabis Soil Microbiome -1.SKD9.640182 GATAGCACTCGT GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0.0 SKD9 n 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Diesel Root 114.0 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat True True 1001:D3 3483 23.1218032799 42.838497795 6.82 ANL 7.1 ENVO:soil winter 1118232 15.0 66 sand, 16.3 silt, 17.7 clay 1.51 4.32 0.178 Cannabis Soil Microbiome -1.SKM1.640183 TAGCGCGAACTT GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0.0 SKM1 n 2011-11-11 13:00:00 soil metagenome GAZ:United States of America 0.15 Bucu bulk 114.0 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat True True 1001:D1 3483 38.2627021402 3.48274264219 6.82 ANL 7.44 ENVO:soil winter 410658 15.0 63.1 sand, 17.7 silt, 19.2 clay 1.3 3.31 0.101 Cannabis Soil Microbiome -1.SKM2.640199 CATACACGCACC GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0.0 SKM2 n 2011-11-11 13:00:00 soil metagenome GAZ:United States of America 0.15 Bucu bulk 114.0 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat True True 1001:D4 3483 82.8302905615 86.3615778099 6.82 ANL 7.44 ENVO:soil winter 410658 15.0 63.1 sand, 17.7 silt, 19.2 clay 1.3 3.31 0.101 Cannabis Soil Microbiome -1.SKM3.640197 ACCTCAGTCAAG GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0.0 SKM3 n 2011-11-11 13:00:00 soil metagenome GAZ:United States of America 0.15 Bucu bulk 114.0 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat True True 1001:B7 3483 63.6505562766 31.2003474585 6.82 ANL 7.44 ENVO:soil winter 410658 15.0 63.1 sand, 17.7 silt, 19.2 clay 1.3 3.31 0.101 Cannabis Soil Microbiome -1.SKM4.640180 TCGACCAAACAC GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0.0 SKM4 n 2011-11-11 13:00:00 rhizosphere metagenome GAZ:United States of America 0.15 Bucu Rhizo 114.0 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat True True 1001:D2 3483 31.7167821863 95.5088566087 6.82 ANL 7.44 ENVO:soil winter 939928 15.0 63.1 sand, 17.7 silt, 19.2 clay 1.3 3.31 0.101 Cannabis Soil Microbiome -1.SKM5.640177 CCACCCAGTAAC GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0.0 SKM5 n 2011-11-11 13:00:00 rhizosphere metagenome GAZ:United States of America 0.15 Bucu Rhizo 114.0 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat True True 1001:M3 3483 44.9725384282 66.1920014699 6.82 ANL 7.44 ENVO:soil winter 939928 15.0 63.1 sand, 17.7 silt, 19.2 clay 1.3 3.31 0.101 Cannabis Soil Microbiome -1.SKM6.640187 ATATCGCGATGA GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0.0 SKM6 n 2011-11-11 13:00:00 rhizosphere metagenome GAZ:United States of America 0.15 Bucu Rhizo 114.0 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat True True 1001:B2 3483 0.291867635913 68.5945325743 6.82 ANL 7.44 ENVO:soil winter 939928 15.0 63.1 sand, 17.7 silt, 19.2 clay 1.3 3.31 0.101 Cannabis Soil Microbiome -1.SKM7.640188 CGCCGGTAATCT GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0.0 SKM7 n 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Bucu Roots 114.0 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat True True 1001:B6 3483 60.1102854322 74.7123248382 6.82 ANL 7.44 ENVO:soil winter 1118232 15.0 63.1 sand, 17.7 silt, 19.2 clay 1.3 3.31 0.101 Cannabis Soil Microbiome -1.SKM8.640201 CCGATGCCTTGA GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0.0 SKM8 n 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Bucu Roots 114.0 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat True True 1001:D8 3483 3.21190859967 26.8138925876 6.82 ANL 7.44 ENVO:soil winter 1118232 15.0 63.1 sand, 17.7 silt, 19.2 clay 1.3 3.31 0.101 Cannabis Soil Microbiome -1.SKM9.640192 AGCAGGCACGAA GTGCCAGCMGCCGCGGTAA ANL EMP ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome MiSeq Illumina MiSeq This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT Illumina ANL 8/1/12 s_G1_L001_sequences .25,g ANL Sequencing by synthesis CCME 16S rRNA V4 1 0.0 SKM9 n 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Bucu Roots 114.0 ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat True True 1001:B8 3483 12.7065957714 84.9722975792 6.82 ANL 7.44 ENVO:soil winter 1118232 15.0 63.1 sand, 17.7 silt, 19.2 clay 1.3 3.31 0.101 Cannabis Soil Microbiome +#SampleID BarcodeSequence Description LinkerPrimerSequence altitude anonymized_name assigned_from_geo center_name center_project_name collection_timestamp common_name country depth description_duplicate dna_extracted elevation emp_status env_biome env_feature experiment_center experiment_design_description experiment_title host_subject_id host_taxid illumina_technology instrument_model latitude library_construction_protocol longitude pcr_primers ph physical_specimen_location physical_specimen_remaining platform qiita_prep_id qiita_study_id run_center run_date run_prefix samp_salinity samp_size sample_center sample_type scientific_name season_environment sequencing_meth study_center target_gene target_subfragment taxon_id temp texture tot_nitro tot_org_carb water_content_soil +1.SKB1.640202 GTCCGCAAGTTA Cannabis Soil Microbiome GTGCCAGCMGCCGCGGTAA 0 SKB1 n ANL 2011-11-11 13:00:00 soil metagenome GAZ:United States of America 0.15 Burmese bulk True 114 EMP ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome 1001:M2 3483 MiSeq Illumina MiSeq 4.59216095574 This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. 
[For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 63.5115213108 FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT 6.94 ANL True Illumina 1 1 ANL 8/1/12 s_G1_L001_sequences 7.15 .25,g ANL ENVO:soil 1118232 winter Sequencing by synthesis CCME 16S rRNA V4 410658 15 64.6 sand, 17.6 silt, 17.8 clay 1.41 5.0 0.164 +1.SKB2.640194 CGTAGAGCTCTC Cannabis Soil Microbiome GTGCCAGCMGCCGCGGTAA 0 SKB2 n ANL 2011-11-11 13:00:00 soil metagenome GAZ:United States of America 0.15 Burmese bulk True 114 EMP ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome 1001:B4 3483 MiSeq Illumina MiSeq 35.2374368957 This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
68.5041623253 FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT 6.94 ANL True Illumina 1 1 ANL 8/1/12 s_G1_L001_sequences 7.15 .25,g ANL ENVO:soil 1118232 winter Sequencing by synthesis CCME 16S rRNA V4 410658 15 64.6 sand, 17.6 silt, 17.8 clay 1.41 5.0 0.164 +1.SKB3.640195 CCTCTGAGAGCT Cannabis Soil Microbiome GTGCCAGCMGCCGCGGTAA 0 SKB3 n ANL 2011-11-11 13:00:00 soil metagenome GAZ:United States of America 0.15 Burmese bulk True 114 EMP ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome 1001:M6 3483 MiSeq Illumina MiSeq 95.2060749748 This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
27.3592668624 FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT 6.94 ANL True Illumina 1 1 ANL 8/1/12 s_G1_L001_sequences 7.15 .25,g ANL ENVO:soil 1118232 winter Sequencing by synthesis CCME 16S rRNA V4 410658 15 64.6 sand, 17.6 silt, 17.8 clay 1.41 5.0 0.164 +1.SKB4.640189 CCTCGATGCAGT Cannabis Soil Microbiome GTGCCAGCMGCCGCGGTAA 0 SKB4 n ANL 2011-11-11 13:00:00 rhizosphere metagenome GAZ:United States of America 0.15 Burmese Rhizo True 114 EMP ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome 1001:D7 3483 MiSeq Illumina MiSeq 43.9614715197 This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
82.8516734159 FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT 6.94 ANL True Illumina 1 1 ANL 8/1/12 s_G1_L001_sequences 7.15 .25,g ANL ENVO:soil 1118232 winter Sequencing by synthesis CCME 16S rRNA V4 939928 15 64.6 sand, 17.6 silt, 17.8 clay 1.41 5.0 0.164 +1.SKB5.640181 GCGGACTATTCA Cannabis Soil Microbiome GTGCCAGCMGCCGCGGTAA 0 SKB5 n ANL 2011-11-11 13:00:00 rhizosphere metagenome GAZ:United States of America 0.15 Burmese Rhizo True 114 EMP ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome 1001:M4 3483 MiSeq Illumina MiSeq 10.6655599093 This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
70.784770579 FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT 6.94 ANL True Illumina 1 1 ANL 8/1/12 s_G1_L001_sequences 7.15 .25,g ANL ENVO:soil 1118232 winter Sequencing by synthesis CCME 16S rRNA V4 939928 15 64.6 sand, 17.6 silt, 17.8 clay 1.41 5.0 0.164 +1.SKB6.640176 CGTGCACAATTG Cannabis Soil Microbiome GTGCCAGCMGCCGCGGTAA 0 SKB6 n ANL 2011-11-11 13:00:00 rhizosphere metagenome GAZ:United States of America 0.15 Burmese Rhizo True 114 EMP ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome 1001:D5 3483 MiSeq Illumina MiSeq 78.3634273709 This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
74.423907894 FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT 6.94 ANL True Illumina 1 1 ANL 8/1/12 s_G1_L001_sequences 7.15 .25,g ANL ENVO:soil 1118232 winter Sequencing by synthesis CCME 16S rRNA V4 939928 15 64.6 sand, 17.6 silt, 17.8 clay 1.41 5.0 0.164 +1.SKB7.640196 CGGCCTAAGTTC Cannabis Soil Microbiome GTGCCAGCMGCCGCGGTAA 0 SKB7 n ANL 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Burmese root True 114 EMP ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome 1001:M8 3483 MiSeq Illumina MiSeq 13.089194595 This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
92.5274472082 FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT 6.94 ANL True Illumina 1 1 ANL 8/1/12 s_G1_L001_sequences 7.15 .25,g ANL ENVO:soil 1118232 winter Sequencing by synthesis CCME 16S rRNA V4 1118232 15 64.6 sand, 17.6 silt, 17.8 clay 1.41 5.0 0.164 +1.SKB8.640193 AGCGCTCACATC Cannabis Soil Microbiome GTGCCAGCMGCCGCGGTAA 0 SKB8 n ANL 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Burmese root True 114 EMP ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome 1001:M7 3483 MiSeq Illumina MiSeq 74.0894932572 This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
65.3283470202 FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT 6.94 ANL True Illumina 1 1 ANL 8/1/12 s_G1_L001_sequences 7.15 .25,g ANL ENVO:soil 1118232 winter Sequencing by synthesis CCME 16S rRNA V4 1118232 15 64.6 sand, 17.6 silt, 17.8 clay 1.41 5.0 0.164 +1.SKB9.640200 TGGTTATGGCAC Cannabis Soil Microbiome GTGCCAGCMGCCGCGGTAA 0 SKB9 n ANL 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Burmese root True 114 EMP ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome 1001:B3 3483 MiSeq Illumina MiSeq 12.6245524972 This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
96.0693176066 FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT 6.8 ANL True Illumina 1 1 ANL 8/1/12 s_G1_L001_sequences 7.15 .25,g ANL ENVO:soil 1118232 winter Sequencing by synthesis CCME 16S rRNA V4 1118232 15 64.6 sand, 17.6 silt, 17.8 clay 1.41 5.0 0.164 +1.SKD1.640179 CGAGGTTCTGAT Cannabis Soil Microbiome GTGCCAGCMGCCGCGGTAA 0 SKD1 n ANL 2011-11-11 13:00:00 soil metagenome GAZ:United States of America 0.15 Diesel bulk True 114 EMP ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome 1001:M5 3483 MiSeq Illumina MiSeq 68.0991287718 This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
34.8360987059 FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT 6.8 ANL True Illumina 1 1 ANL 8/1/12 s_G1_L001_sequences 7.1 .25,g ANL ENVO:soil 1118232 winter Sequencing by synthesis CCME 16S rRNA V4 410658 15 66 sand, 16.3 silt, 17.7 clay 1.51 4.32 0.178 +1.SKD2.640178 AACTCCTGTGGA Cannabis Soil Microbiome GTGCCAGCMGCCGCGGTAA 0 SKD2 n ANL 2011-11-11 13:00:00 soil metagenome GAZ:United States of America 0.15 Diesel bulk True 114 EMP ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome 1001:B5 3483 MiSeq Illumina MiSeq 53.5050692395 This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
31.6056761814 FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT 6.8 ANL True Illumina 1 1 ANL 8/1/12 s_G1_L001_sequences 7.1 .25,g ANL ENVO:soil 1118232 winter Sequencing by synthesis CCME 16S rRNA V4 410658 15 66 sand, 16.3 silt, 17.7 clay 1.51 4.32 0.178 +1.SKD3.640198 TAATGGTCGTAG Cannabis Soil Microbiome GTGCCAGCMGCCGCGGTAA 0 SKD3 n ANL 2011-11-11 13:00:00 soil metagenome GAZ:United States of America 0.15 Diesel bulk True 114 EMP ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome 1001:B1 3483 MiSeq Illumina MiSeq 84.0030227585 This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
66.8954849864 FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT 6.8 ANL True Illumina 1 1 ANL 8/1/12 s_G1_L001_sequences 7.1 .25,g ANL ENVO:soil 1118232 winter Sequencing by synthesis CCME 16S rRNA V4 410658 15 66 sand, 16.3 silt, 17.7 clay 1.51 4.32 0.178 +1.SKD4.640185 TTGCACCGTCGA Cannabis Soil Microbiome GTGCCAGCMGCCGCGGTAA 0 SKD4 n ANL 2011-11-11 13:00:00 rhizosphere metagenome GAZ:United States of America 0.15 Diesel Rhizo True 114 EMP ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome 1001:M9 3483 MiSeq Illumina MiSeq 40.8623799474 This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
6.66444220187 FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT 6.8 ANL True Illumina 1 1 ANL 8/1/12 s_G1_L001_sequences 7.1 .25,g ANL ENVO:soil 1118232 winter Sequencing by synthesis CCME 16S rRNA V4 939928 15 66 sand, 16.3 silt, 17.7 clay 1.51 4.32 0.178 +1.SKD5.640186 TGCTACAGACGT Cannabis Soil Microbiome GTGCCAGCMGCCGCGGTAA 0 SKD5 n ANL 2011-11-11 13:00:00 rhizosphere metagenome GAZ:United States of America 0.15 Diesel Rhizo True 114 EMP ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome 1001:M1 3483 MiSeq Illumina MiSeq 85.4121476399 This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
15.6526750776 FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT 6.8 ANL True Illumina 1 1 ANL 8/1/12 s_G1_L001_sequences 7.1 .25,g ANL ENVO:soil 1118232 winter Sequencing by synthesis CCME 16S rRNA V4 939928 15 66 sand, 16.3 silt, 17.7 clay 1.51 4.32 0.178 +1.SKD6.640190 ATGGCCTGACTA Cannabis Soil Microbiome GTGCCAGCMGCCGCGGTAA 0 SKD6 n ANL 2011-11-11 13:00:00 rhizosphere metagenome GAZ:United States of America 0.15 Diesel Rhizo True 114 EMP ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome 1001:B9 3483 MiSeq Illumina MiSeq 29.1499460692 This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
82.1270418227 FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT 6.8 ANL True Illumina 1 1 ANL 8/1/12 s_G1_L001_sequences 7.1 .25,g ANL ENVO:soil 1118232 winter Sequencing by synthesis CCME 16S rRNA V4 939928 15 66 sand, 16.3 silt, 17.7 clay 1.51 4.32 0.178 +1.SKD7.640191 ACGCACATACAA Cannabis Soil Microbiome GTGCCAGCMGCCGCGGTAA 0 SKD7 n ANL 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Diesel Root True 114 EMP ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome 1001:D6 3483 MiSeq Illumina MiSeq 68.51099627 This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
2.35063674718 FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT 6.8 ANL True Illumina 1 1 ANL 8/1/12 s_G1_L001_sequences 7.1 .25,g ANL ENVO:soil 1118232 winter Sequencing by synthesis CCME 16S rRNA V4 1118232 15 66 sand, 16.3 silt, 17.7 clay 1.51 4.32 0.178 +1.SKD8.640184 TGAGTGGTCTGT Cannabis Soil Microbiome GTGCCAGCMGCCGCGGTAA 0 SKD8 n ANL 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Diesel Root True 114 EMP ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome 1001:D9 3483 MiSeq Illumina MiSeq 57.571893782 This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
32.5563076447 FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT 6.8 ANL True Illumina 1 1 ANL 8/1/12 s_G1_L001_sequences 7.1 .25,g ANL ENVO:soil 1118232 winter Sequencing by synthesis CCME 16S rRNA V4 1118232 15 66 sand, 16.3 silt, 17.7 clay 1.51 4.32 0.178 +1.SKD9.640182 GATAGCACTCGT Cannabis Soil Microbiome GTGCCAGCMGCCGCGGTAA 0 SKD9 n ANL 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Diesel Root True 114 EMP ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome 1001:D3 3483 MiSeq Illumina MiSeq 23.1218032799 This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
42.838497795 FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT 6.82 ANL True Illumina 1 1 ANL 8/1/12 s_G1_L001_sequences 7.1 .25,g ANL ENVO:soil 1118232 winter Sequencing by synthesis CCME 16S rRNA V4 1118232 15 66 sand, 16.3 silt, 17.7 clay 1.51 4.32 0.178 +1.SKM1.640183 TAGCGCGAACTT Cannabis Soil Microbiome GTGCCAGCMGCCGCGGTAA 0 SKM1 n ANL 2011-11-11 13:00:00 soil metagenome GAZ:United States of America 0.15 Bucu bulk True 114 EMP ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome 1001:D1 3483 MiSeq Illumina MiSeq 38.2627021402 This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
3.48274264219 FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT 6.82 ANL True Illumina 1 1 ANL 8/1/12 s_G1_L001_sequences 7.44 .25,g ANL ENVO:soil 1118232 winter Sequencing by synthesis CCME 16S rRNA V4 410658 15 63.1 sand, 17.7 silt, 19.2 clay 1.3 3.31 0.101 +1.SKM2.640199 CATACACGCACC Cannabis Soil Microbiome GTGCCAGCMGCCGCGGTAA 0 SKM2 n ANL 2011-11-11 13:00:00 soil metagenome GAZ:United States of America 0.15 Bucu bulk True 114 EMP ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome 1001:D4 3483 MiSeq Illumina MiSeq 82.8302905615 This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
86.3615778099 FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT 6.82 ANL True Illumina 1 1 ANL 8/1/12 s_G1_L001_sequences 7.44 .25,g ANL ENVO:soil 1118232 winter Sequencing by synthesis CCME 16S rRNA V4 410658 15 63.1 sand, 17.7 silt, 19.2 clay 1.3 3.31 0.101 +1.SKM3.640197 ACCTCAGTCAAG Cannabis Soil Microbiome GTGCCAGCMGCCGCGGTAA 0 SKM3 n ANL 2011-11-11 13:00:00 soil metagenome GAZ:United States of America 0.15 Bucu bulk True 114 EMP ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome 1001:B7 3483 MiSeq Illumina MiSeq Not applicable This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
31.2003474585 FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT 6.82 ANL True Illumina 1 1 ANL 8/1/12 s_G1_L001_sequences 7.44 .25,g ANL ENVO:soil 1118232 winter Sequencing by synthesis CCME 16S rRNA V4 410658 15 63.1 sand, 17.7 silt, 19.2 clay 1.3 3.31 0.101 +1.SKM4.640180 TCGACCAAACAC Cannabis Soil Microbiome GTGCCAGCMGCCGCGGTAA 0 SKM4 n ANL 2011-11-11 13:00:00 rhizosphere metagenome GAZ:United States of America 0.15 Bucu Rhizo True 114 EMP ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome 1001:D2 3483 MiSeq Illumina MiSeq Not applicable This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
Not applicable FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT 6.82 ANL True Illumina 1 1 ANL 8/1/12 s_G1_L001_sequences 7.44 .25,g ANL ENVO:soil 1118232 winter Sequencing by synthesis CCME 16S rRNA V4 939928 15 63.1 sand, 17.7 silt, 19.2 clay 1.3 3.31 0.101 +1.SKM5.640177 CCACCCAGTAAC Cannabis Soil Microbiome GTGCCAGCMGCCGCGGTAA 0 SKM5 n ANL 2011-11-11 13:00:00 rhizosphere metagenome GAZ:United States of America 0.15 Bucu Rhizo True 114 EMP ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome 1001:M3 3483 MiSeq Illumina MiSeq 44.9725384282 This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
66.1920014699 FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT 6.82 ANL True Illumina 1 1 ANL 8/1/12 s_G1_L001_sequences 7.44 .25,g ANL ENVO:soil 1118232 winter Sequencing by synthesis CCME 16S rRNA V4 939928 15 63.1 sand, 17.7 silt, 19.2 clay 1.3 3.31 0.101 +1.SKM6.640187 ATATCGCGATGA Cannabis Soil Microbiome GTGCCAGCMGCCGCGGTAA 0 SKM6 n ANL 2011-11-11 13:00:00 rhizosphere metagenome GAZ:United States of America 0.15 Bucu Rhizo True 114 EMP ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome 1001:B2 3483 MiSeq Illumina MiSeq 0.291867635913 This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
68.5945325743 FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT 6.82 ANL True Illumina 1 1 ANL 8/1/12 s_G1_L001_sequences 7.44 .25,g ANL ENVO:soil 1118232 winter Sequencing by synthesis CCME 16S rRNA V4 939928 15 63.1 sand, 17.7 silt, 19.2 clay 1.3 3.31 0.101 +1.SKM7.640188 CGCCGGTAATCT Cannabis Soil Microbiome GTGCCAGCMGCCGCGGTAA 0 SKM7 n ANL 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Bucu Roots True 114 EMP ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome 1001:B6 3483 MiSeq Illumina MiSeq 60.1102854322 This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
74.7123248382 FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT 6.82 ANL True Illumina 1 1 ANL 8/1/12 s_G1_L001_sequences 7.44 .25,g ANL ENVO:soil 1118232 winter Sequencing by synthesis CCME 16S rRNA V4 1118232 15 63.1 sand, 17.7 silt, 19.2 clay 1.3 3.31 0.101 +1.SKM8.640201 CCGATGCCTTGA Cannabis Soil Microbiome GTGCCAGCMGCCGCGGTAA 0 SKM8 n ANL 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Bucu Roots True 114 EMP ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome 1001:D8 3483 MiSeq Illumina MiSeq 3.21190859967 This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
26.8138925876 FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT 6.82 ANL True Illumina 1 1 ANL 8/1/12 s_G1_L001_sequences 7.44 .25,g ANL ENVO:soil 1118232 winter Sequencing by synthesis CCME 16S rRNA V4 1118232 15 63.1 sand, 17.7 silt, 19.2 clay 1.3 3.31 0.101 +1.SKM9.640192 AGCAGGCACGAA Cannabis Soil Microbiome GTGCCAGCMGCCGCGGTAA 0 SKM9 n ANL 2011-11-11 13:00:00 root metagenome GAZ:United States of America 0.15 Bucu Roots True 114 EMP ENVO:Temperate grasslands, savannas, and shrubland biome ENVO:plant-associated habitat ANL micro biome of soil and rhizosphere of cannabis plants from CA Cannabis Soil Microbiome 1001:B8 3483 MiSeq Illumina MiSeq 12.7065957714 This analysis was done as in Caporaso et al 2011 Genome research. The PCR primers (F515/R806) were developed against the V4 region of the 16S rRNA (both bacteria and archaea), which we determined would yield optimal community clustering with reads of this length using a procedure similar to that of ref. 15. [For reference, this primer pair amplifies the region 533_786 in the Escherichia coli strain 83972 sequence (greengenes accession no. prokMSA_id:470367).] The reverse PCR primer is barcoded with a 12-base error-correcting Golay code to facilitate multiplexing of up to 1,500 samples per lane, and both PCR primers contain sequencer adapter regions. 
84.9722975792 FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT 6.82 ANL True Illumina 1 1 ANL 8/1/12 s_G1_L001_sequences 7.44 .25,g ANL ENVO:soil 1118232 winter Sequencing by synthesis CCME 16S rRNA V4 1118232 15 63.1 sand, 17.7 silt, 19.2 clay 1.3 3.31 0.101 diff --git a/qiita_db/test/test_analysis.py b/qiita_db/test/test_analysis.py index 356acc13a..a83f40a5b 100644 --- a/qiita_db/test/test_analysis.py +++ b/qiita_db/test/test_analysis.py @@ -379,14 +379,13 @@ def test_build_mapping_file(self): npt.assert_warns(qdb.exceptions.QiitaDBWarning, analysis._build_mapping_file, samples) obs = analysis.mapping_file + exp = self.get_fp("%s_analysis_mapping.txt" % analysis.id) self.assertEqual(obs, exp) obs = qdb.metadata_template.util.load_template_to_dataframe( obs, index='#SampleID') - exp = npt.assert_warns( - qdb.exceptions.QiitaDBWarning, - qdb.metadata_template.util.load_template_to_dataframe, + exp = qdb.metadata_template.util.load_template_to_dataframe( self.map_exp_fp, index='#SampleID') assert_frame_equal(obs, exp) @@ -399,9 +398,7 @@ def test_build_mapping_file_duplicated_samples_no_merge(self): obs = qdb.metadata_template.util.load_template_to_dataframe( analysis.mapping_file, index='#SampleID') - exp = npt.assert_warns( - qdb.exceptions.QiitaDBWarning, - qdb.metadata_template.util.load_template_to_dataframe, + exp = qdb.metadata_template.util.load_template_to_dataframe( self.duplicated_samples_not_merged, index='#SampleID') # assert_frame_equal assumes same order on the rows, thus sorting @@ -418,9 +415,7 @@ def test_build_mapping_file_duplicated_samples_merge(self): analysis._build_mapping_file, samples) obs = qdb.metadata_template.util.load_template_to_dataframe( analysis.mapping_file, index='#SampleID') - exp = npt.assert_warns( - qdb.exceptions.QiitaDBWarning, - qdb.metadata_template.util.load_template_to_dataframe, + exp = qdb.metadata_template.util.load_template_to_dataframe( self.map_exp_fp, index='#SampleID') assert_frame_equal(obs, exp) diff --git 
a/qiita_db/test/test_commands.py b/qiita_db/test/test_commands.py index 0b4f0da5c..edf6f04f5 100644 --- a/qiita_db/test/test_commands.py +++ b/qiita_db/test/test_commands.py @@ -582,5 +582,6 @@ def test_complete_error(self): max_barcode_errors\t1.5 """ + if __name__ == "__main__": main() diff --git a/qiita_db/test/test_logger.py b/qiita_db/test/test_logger.py index ea9d04050..95764e71d 100644 --- a/qiita_db/test/test_logger.py +++ b/qiita_db/test/test_logger.py @@ -67,5 +67,6 @@ def test_clear_info(self): log_entry.clear_info() self.assertEqual(log_entry.info, []) + if __name__ == '__main__': main() diff --git a/qiita_db/test/test_meta_util.py b/qiita_db/test/test_meta_util.py index 8ceebd2fe..d68260bcd 100644 --- a/qiita_db/test/test_meta_util.py +++ b/qiita_db/test/test_meta_util.py @@ -156,7 +156,7 @@ def test_get_lat_longs_EMP_portal(self): 'physical_specimen_remaining': True, 'dna_extracted': True, 'sample_type': 'type1', - 'collection_timestamp': '05/29/14 12:24:51', + 'collection_timestamp': '2014-05-29 12:24:51', 'host_subject_id': 'NotIdentified', 'Description': 'Test Sample 4', 'str_column': 'Value for sample 4', @@ -185,10 +185,10 @@ def test_update_redis_stats(self): portal = qiita_config.portal vals = [ - ('number_studies', {'sanbox': '0', 'public': '1', - 'private': '0'}, r_client.hgetall), - ('number_of_samples', {'sanbox': '0', 'public': '27', - 'private': '0'}, r_client.hgetall), + ('number_studies', {'sandbox': '0', 'public': '0', + 'private': '1'}, r_client.hgetall), + ('number_of_samples', {'sandbox': '0', 'public': '0', + 'private': '27'}, r_client.hgetall), ('num_users', '4', r_client.get), ('lat_longs', EXP_LAT_LONG, r_client.get), ('num_studies_ebi', '1', r_client.get), @@ -218,5 +218,6 @@ def test_update_redis_stats(self): ' [0.291867635913, 68.5945325743], [85.4121476399, 15.6526750776],' ' [68.0991287718, 34.8360987059]]') + if __name__ == '__main__': main() diff --git a/qiita_db/test/test_reference.py b/qiita_db/test/test_reference.py 
index d733d276f..e92a6f891 100644 --- a/qiita_db/test/test_reference.py +++ b/qiita_db/test/test_reference.py @@ -87,5 +87,6 @@ def test_tree_fp_empty(self): ref = qdb.reference.Reference(2) self.assertEqual(ref.tree_fp, '') + if __name__ == '__main__': main() diff --git a/qiita_db/test/test_setup.py b/qiita_db/test/test_setup.py index 60dfd78f0..fc1f305d5 100644 --- a/qiita_db/test/test_setup.py +++ b/qiita_db/test/test_setup.py @@ -36,7 +36,7 @@ def test_study_experimental_factor(self): self.assertEqual(get_count("qiita.study_experimental_factor"), 1) def test_filepath(self): - self.assertEqual(get_count("qiita.filepath"), 21) + self.assertEqual(get_count("qiita.filepath"), 26) def test_filepath_type(self): self.assertEqual(get_count("qiita.filepath_type"), 21) diff --git a/qiita_db/test/test_software.py b/qiita_db/test/test_software.py index 369d6d0cc..a19d577fe 100644 --- a/qiita_db/test/test_software.py +++ b/qiita_db/test/test_software.py @@ -927,5 +927,6 @@ def test_graph(self): CLIENT_SECRET = %s """ + if __name__ == '__main__': main() diff --git a/qiita_db/test/test_sql_connection.py b/qiita_db/test/test_sql_connection.py index 9ae12847b..4cdf2647b 100644 --- a/qiita_db/test/test_sql_connection.py +++ b/qiita_db/test/test_sql_connection.py @@ -660,5 +660,6 @@ def test_index(self): self.assertEqual(qdb.sql_connection.TRN.index, 0) + if __name__ == "__main__": main() diff --git a/qiita_db/test/test_user.py b/qiita_db/test/test_user.py index f15d70459..0df6ccd11 100644 --- a/qiita_db/test/test_user.py +++ b/qiita_db/test/test_user.py @@ -473,5 +473,6 @@ def test_jobs_defaults(self): # no jobs self.assertEqual(qdb.user.User('admin@foo.bar').jobs(), []) + if __name__ == "__main__": main() diff --git a/qiita_db/test/test_util.py b/qiita_db/test/test_util.py index a368eb1ef..2797e5098 100644 --- a/qiita_db/test/test_util.py +++ b/qiita_db/test/test_util.py @@ -748,33 +748,55 @@ def test_generate_biom_and_metadata_release(self): tmp = topen(tgz, "r:gz") tgz_obs = 
[ti.name for ti in tmp] tmp.close() - tgz_exp = [ - 'processed_data/1_study_1001_closed_reference_otu_table.biom', - 'templates/1_19700101-000000.txt', - 'templates/1_prep_1_19700101-000000.txt', - 'processed_data/1_study_1001_closed_reference_otu_table.biom', - 'templates/1_19700101-000000.txt', - 'templates/1_prep_1_19700101-000000.txt', - 'processed_data/1_study_1001_closed_reference_otu_table_' - 'Silva.biom', 'templates/1_19700101-000000.txt', - 'templates/1_prep_1_19700101-000000.txt'] - self.assertEqual(tgz_obs, tgz_exp) + # files names might change due to updates and patches so just check + # that the prefix exists. + fn = 'processed_data/1_study_1001_closed_reference_otu_table.biom' + self.assertTrue(fn in tgz_obs) + tgz_obs.remove(fn) + # yes, this file is there twice + self.assertTrue(fn in tgz_obs) + tgz_obs.remove(fn) + # let's check the next biom + fn = ('processed_data/1_study_1001_closed_reference_otu_table_Silva.' + 'biom') + self.assertTrue(fn in tgz_obs) + tgz_obs.remove(fn) + # now let's check prep info files based on their suffix, just take + # the first one and check/rm the occurances of that file + fn_prep = [f for f in tgz_obs + if f.startswith('templates/1_prep_1_')][0] + # 3 times + self.assertTrue(fn_prep in tgz_obs) + tgz_obs.remove(fn_prep) + self.assertTrue(fn_prep in tgz_obs) + tgz_obs.remove(fn_prep) + self.assertTrue(fn_prep in tgz_obs) + tgz_obs.remove(fn_prep) + fn_sample = [f for f in tgz_obs if f.startswith('templates/1_')][0] + # 3 times + self.assertTrue(fn_sample in tgz_obs) + tgz_obs.remove(fn_sample) + self.assertTrue(fn_sample in tgz_obs) + tgz_obs.remove(fn_sample) + self.assertTrue(fn_sample in tgz_obs) + tgz_obs.remove(fn_sample) + # now it should be empty + self.assertEqual(tgz_obs, []) tmp = open(txt) txt_obs = tmp.readlines() tmp.close() txt_exp = [ 'biom_fp\tsample_fp\tprep_fp\tqiita_artifact_id\tcommand\n', - 'processed_data/1_study_1001_closed_reference_otu_table.biom\ttem' - 
'plates/1_19700101-000000.txt\ttemplates/1_prep_1_19700101-000000' - '.txt\t4\tPick closed-reference OTUs, Split libraries FASTQ\n', - 'processed_data/1_study_1001_closed_reference_otu_table.biom\ttem' - 'plates/1_19700101-000000.txt\ttemplates/1_prep_1_19700101-000000' - '.txt\t5\tPick closed-reference OTUs, Split libraries FASTQ\n', + 'processed_data/1_study_1001_closed_reference_otu_table.biom\t' + '%s\t%s\t4\tPick closed-reference OTUs, Split libraries FASTQ\n' + % (fn_sample, fn_prep), + 'processed_data/1_study_1001_closed_reference_otu_table.biom\t' + '%s\t%s\t5\tPick closed-reference OTUs, Split libraries FASTQ\n' + % (fn_sample, fn_prep), 'processed_data/1_study_1001_closed_reference_otu_table_Silva.bio' - 'm\ttemplates/1_19700101-000000.txt\ttemplates/1_prep_1_19700101-' - '000000.txt\t6\tPick closed-reference OTUs, Split libraries ' - 'FASTQ\n'] + 'm\t%s\t%s\t6\tPick closed-reference OTUs, Split libraries FASTQ\n' + % (fn_sample, fn_prep)] self.assertEqual(txt_obs, txt_exp) # whatever the configuration was, we will change to settings so we can @@ -798,33 +820,55 @@ def test_generate_biom_and_metadata_release(self): tmp = topen(tgz, "r:gz") tgz_obs = [ti.name for ti in tmp] tmp.close() - tgz_exp = [ - 'processed_data/1_study_1001_closed_reference_otu_table.biom', - 'templates/1_19700101-000000.txt', - 'templates/1_prep_1_19700101-000000.txt', - 'processed_data/1_study_1001_closed_reference_otu_table.biom', - 'templates/1_19700101-000000.txt', - 'templates/1_prep_1_19700101-000000.txt', - 'processed_data/1_study_1001_closed_reference_otu_table_' - 'Silva.biom', 'templates/1_19700101-000000.txt', - 'templates/1_prep_1_19700101-000000.txt'] - self.assertEqual(tgz_obs, tgz_exp) + # files names might change due to updates and patches so just check + # that the prefix exists. 
+ fn = 'processed_data/1_study_1001_closed_reference_otu_table.biom' + self.assertTrue(fn in tgz_obs) + tgz_obs.remove(fn) + # yes, this file is there twice + self.assertTrue(fn in tgz_obs) + tgz_obs.remove(fn) + # let's check the next biom + fn = ('processed_data/1_study_1001_closed_reference_otu_table_Silva.' + 'biom') + self.assertTrue(fn in tgz_obs) + tgz_obs.remove(fn) + # now let's check prep info files based on their suffix, just take + # the first one and check/rm the occurances of that file + fn_prep = [f for f in tgz_obs + if f.startswith('templates/1_prep_1_')][0] + # 3 times + self.assertTrue(fn_prep in tgz_obs) + tgz_obs.remove(fn_prep) + self.assertTrue(fn_prep in tgz_obs) + tgz_obs.remove(fn_prep) + self.assertTrue(fn_prep in tgz_obs) + tgz_obs.remove(fn_prep) + fn_sample = [f for f in tgz_obs if f.startswith('templates/1_')][0] + # 3 times + self.assertTrue(fn_sample in tgz_obs) + tgz_obs.remove(fn_sample) + self.assertTrue(fn_sample in tgz_obs) + tgz_obs.remove(fn_sample) + self.assertTrue(fn_sample in tgz_obs) + tgz_obs.remove(fn_sample) + # now it should be empty + self.assertEqual(tgz_obs, []) tmp = open(txt) txt_obs = tmp.readlines() tmp.close() txt_exp = [ 'biom_fp\tsample_fp\tprep_fp\tqiita_artifact_id\tcommand\n', - 'processed_data/1_study_1001_closed_reference_otu_table.biom\ttem' - 'plates/1_19700101-000000.txt\ttemplates/1_prep_1_19700101-000000' - '.txt\t4\tPick closed-reference OTUs, Split libraries FASTQ\n', - 'processed_data/1_study_1001_closed_reference_otu_table.biom\ttem' - 'plates/1_19700101-000000.txt\ttemplates/1_prep_1_19700101-000000' - '.txt\t5\tPick closed-reference OTUs, Split libraries FASTQ\n', + 'processed_data/1_study_1001_closed_reference_otu_table.biom\t' + '%s\t%s\t4\tPick closed-reference OTUs, Split libraries FASTQ\n' + % (fn_sample, fn_prep), + 'processed_data/1_study_1001_closed_reference_otu_table.biom\t' + '%s\t%s\t5\tPick closed-reference OTUs, Split libraries FASTQ\n' + % (fn_sample, fn_prep), 
'processed_data/1_study_1001_closed_reference_otu_table_Silva.bio' - 'm\ttemplates/1_19700101-000000.txt\ttemplates/1_prep_1_19700101-' - '000000.txt\t6\tPick closed-reference OTUs, Split libraries ' - 'FASTQ\n'] + 'm\t%s\t%s\t6\tPick closed-reference OTUs, Split libraries FASTQ\n' + % (fn_sample, fn_prep)] self.assertEqual(txt_obs, txt_exp) # returning configuration diff --git a/qiita_pet/handlers/api_proxy/tests/test_artifact.py b/qiita_pet/handlers/api_proxy/tests/test_artifact.py index badcd9f06..fa819c96b 100644 --- a/qiita_pet/handlers/api_proxy/tests/test_artifact.py +++ b/qiita_pet/handlers/api_proxy/tests/test_artifact.py @@ -510,5 +510,6 @@ def test_artifact_status_put_req_unknown_status(self): 'message': 'Unknown visiblity value: BADSTAT'} self.assertEqual(obs, exp) + if __name__ == "__main__": main() diff --git a/qiita_pet/handlers/api_proxy/tests/test_ontology.py b/qiita_pet/handlers/api_proxy/tests/test_ontology.py index b45d1ca2f..839fd90f2 100644 --- a/qiita_pet/handlers/api_proxy/tests/test_ontology.py +++ b/qiita_pet/handlers/api_proxy/tests/test_ontology.py @@ -35,5 +35,6 @@ def test_ontology_patch_handler_errors(self): 'message': 'Ontology "ONTOLOGY" does not exist'} self.assertEqual(obs, exp) + if __name__ == '__main__': main() diff --git a/qiita_pet/handlers/api_proxy/tests/test_prep_template.py b/qiita_pet/handlers/api_proxy/tests/test_prep_template.py index 9b825518f..fa43db045 100644 --- a/qiita_pet/handlers/api_proxy/tests/test_prep_template.py +++ b/qiita_pet/handlers/api_proxy/tests/test_prep_template.py @@ -70,8 +70,8 @@ def test_prep_template_ajax_get_req(self): 'message': '', 'name': "Prep information 1", 'files': ["uploaded_file.txt"], - 'download_prep': 20, - 'download_qiime': 21, + 'download_prep': 23, + 'download_qiime': 24, 'num_samples': 27, 'num_columns': 22, 'investigation_type': 'Metagenomics', @@ -174,18 +174,12 @@ def test_prep_template_get_req_no_exists(self): def test_prep_template_filepaths_get_req(self): obs = 
prep_template_filepaths_get_req(1, 'test@foo.bar') - exp = {'status': 'success', - 'message': '', - 'filepaths': [ - (21, join(get_mountpoint('templates')[0][1], - '1_prep_1_qiime_19700101-000000.txt')), - (20, join(get_mountpoint('templates')[0][1], - '1_prep_1_19700101-000000.txt')), - (19, join(get_mountpoint('templates')[0][1], - '1_prep_1_qiime_19700101-000000.txt')), - (18, join(get_mountpoint('templates')[0][1], - '1_prep_1_19700101-000000.txt'))]} - self.assertEqual(obs, exp) + # have to check each key individually as the filepaths will change + self.assertEqual(obs['status'], 'success') + self.assertEqual(obs['message'], '') + # [0] the fp_id is the first element, that should change + fp_ids = [fp[0] for fp in obs['filepaths']] + self.assertItemsEqual(fp_ids, [18, 19, 20, 21, 23, 24]) def test_prep_template_filepaths_get_req_no_access(self): obs = prep_template_filepaths_get_req(1, 'demo@microbio.me') @@ -550,5 +544,6 @@ def test_prep_template_delete_req_no_prep(self): 'message': 'Prep template 3100 does not exist'} self.assertEqual(obs, exp) + if __name__ == '__main__': main() diff --git a/qiita_pet/handlers/api_proxy/tests/test_processing.py b/qiita_pet/handlers/api_proxy/tests/test_processing.py index 357cfffcc..f8c38b47f 100644 --- a/qiita_pet/handlers/api_proxy/tests/test_processing.py +++ b/qiita_pet/handlers/api_proxy/tests/test_processing.py @@ -174,5 +174,6 @@ def test_workflow_handler_patch_req_error(self): 'message': 'Incorrect path parameter'} self.assertEqual(obs, exp) + if __name__ == '__main__': main() diff --git a/qiita_pet/handlers/api_proxy/tests/test_sample_template.py b/qiita_pet/handlers/api_proxy/tests/test_sample_template.py index d7f226189..e8122be29 100644 --- a/qiita_pet/handlers/api_proxy/tests/test_sample_template.py +++ b/qiita_pet/handlers/api_proxy/tests/test_sample_template.py @@ -87,7 +87,7 @@ def test_sample_template_get_req(self): self.assertEqual(len(obs['template']), 27) self.assertEqual(str( 
obs['template']['1.SKB2.640194']['collection_timestamp']), - '11/11/11 13:00:00') + '2011-11-11 13:00:00') del obs['template']['1.SKB2.640194']['collection_timestamp'] self.assertEqual(obs['template']['1.SKB2.640194'], { 'physical_specimen_location': 'ANL', @@ -237,7 +237,7 @@ def test_sample_template_summary_get_req(self): ('Diesel Root', 3), ('Diesel bulk', 3)], 'elevation': [('114', 27)], 'description': [('Cannabis Soil Microbiome', 27)], - 'collection_timestamp': [('11/11/11 13:00:00', 27)], + 'collection_timestamp': [('2011-11-11 13:00:00', 27)], 'physical_specimen_remaining': [('true', 27)], 'dna_extracted': [('true', 27)], 'taxon_id': [('410658', 9), ('939928', 9), ('1118232', 9)], @@ -472,13 +472,13 @@ def test_sample_template_delete_req_no_template(self): 'exist' % self.new_study.id}) def test_sample_template_filepaths_get_req(self): - templates_dir = qdb.util.get_mountpoint('templates')[0][1] obs = sample_template_filepaths_get_req(1, 'test@foo.bar') - exp = {'status': 'success', - 'message': '', - 'filepaths': [(17, join(templates_dir, - '1_19700101-000000.txt'))]} - self.assertEqual(obs, exp) + # have to check each key individually as the filepaths will change + self.assertEqual(obs['status'], 'success') + self.assertEqual(obs['message'], '') + # [0] the fp_id is the first element, that should change + fp_ids = [fp[0] for fp in obs['filepaths']] + self.assertItemsEqual(fp_ids, [17, 22]) def test_sample_template_filepaths_get_req_no_access(self): obs = sample_template_filepaths_get_req(1, 'demo@microbio.me') diff --git a/qiita_pet/handlers/study_handlers/tests/test_artifact.py b/qiita_pet/handlers/study_handlers/tests/test_artifact.py index 608ec4900..304dc2844 100644 --- a/qiita_pet/handlers/study_handlers/tests/test_artifact.py +++ b/qiita_pet/handlers/study_handlers/tests/test_artifact.py @@ -168,5 +168,6 @@ def test_post_admin(self): self.assertEqual(Artifact(3).visibility, 'sandbox') + if __name__ == "__main__": main() diff --git 
a/qiita_pet/handlers/study_handlers/tests/test_base.py b/qiita_pet/handlers/study_handlers/tests/test_base.py index 93187ee4a..33168bbf4 100644 --- a/qiita_pet/handlers/study_handlers/tests/test_base.py +++ b/qiita_pet/handlers/study_handlers/tests/test_base.py @@ -58,5 +58,6 @@ def test_get(self): self.assertEqual(response.code, 200) self.assertNotEqual(response.body, "") + if __name__ == "__main__": main() diff --git a/qiita_pet/handlers/study_handlers/tests/test_ebi_handlers.py b/qiita_pet/handlers/study_handlers/tests/test_ebi_handlers.py index 3bddbbdf6..b29f1ca13 100644 --- a/qiita_pet/handlers/study_handlers/tests/test_ebi_handlers.py +++ b/qiita_pet/handlers/study_handlers/tests/test_ebi_handlers.py @@ -53,5 +53,6 @@ def test_get_no_exist(self): response = self.get('/ebi_submission/100') self.assertEqual(response.code, 404) + if __name__ == "__main__": main() diff --git a/qiita_pet/handlers/study_handlers/tests/test_edit_handlers.py b/qiita_pet/handlers/study_handlers/tests/test_edit_handlers.py index 0caa9d345..8e68b8bfc 100644 --- a/qiita_pet/handlers/study_handlers/tests/test_edit_handlers.py +++ b/qiita_pet/handlers/study_handlers/tests/test_edit_handlers.py @@ -154,5 +154,6 @@ def test_get(self): # make sure responds properly self.assertEqual(response.body, 'False') + if __name__ == "__main__": main() diff --git a/qiita_pet/handlers/study_handlers/tests/test_listing_handlers.py b/qiita_pet/handlers/study_handlers/tests/test_listing_handlers.py index 06d871621..33fa2cbdc 100644 --- a/qiita_pet/handlers/study_handlers/tests/test_listing_handlers.py +++ b/qiita_pet/handlers/study_handlers/tests/test_listing_handlers.py @@ -355,5 +355,6 @@ def test_get_emp_portal(self): self.assertEqual(response.code, 200) self.assertEqual(loads(response.body), self.empty) + if __name__ == "__main__": main() diff --git a/qiita_pet/handlers/study_handlers/tests/test_prep_template.py b/qiita_pet/handlers/study_handlers/tests/test_prep_template.py index 74f71687c..31cb99ca2 
100644 --- a/qiita_pet/handlers/study_handlers/tests/test_prep_template.py +++ b/qiita_pet/handlers/study_handlers/tests/test_prep_template.py @@ -72,5 +72,6 @@ def test_get_files_not_allowed(self): {'type': 'BIOM', 'prep_file': 'uploaded_file.txt', 'study_id': 1}) self.assertEqual(response.code, 405) + if __name__ == "__main__": main() diff --git a/qiita_pet/handlers/study_handlers/tests/test_processing.py b/qiita_pet/handlers/study_handlers/tests/test_processing.py index a480fa1b5..1ef3597ef 100644 --- a/qiita_pet/handlers/study_handlers/tests/test_processing.py +++ b/qiita_pet/handlers/study_handlers/tests/test_processing.py @@ -24,5 +24,6 @@ class ListOptionsHandler(TestHandlerBase): # TODO: Missing tests pass + if __name__ == "__main__": main() diff --git a/qiita_pet/handlers/study_handlers/tests/test_sample_template.py b/qiita_pet/handlers/study_handlers/tests/test_sample_template.py index 7f3dd9822..a2d8527b4 100644 --- a/qiita_pet/handlers/study_handlers/tests/test_sample_template.py +++ b/qiita_pet/handlers/study_handlers/tests/test_sample_template.py @@ -121,5 +121,6 @@ def test_post_error(self): "message": "Category NOEXIST does not exist in sample template"} self.assertEqual(loads(res.body), exp) + if __name__ == "__main__": main() diff --git a/qiita_pet/handlers/study_handlers/tests/test_vamps_handlers.py b/qiita_pet/handlers/study_handlers/tests/test_vamps_handlers.py index 6b482ce66..e17ff5eeb 100644 --- a/qiita_pet/handlers/study_handlers/tests/test_vamps_handlers.py +++ b/qiita_pet/handlers/study_handlers/tests/test_vamps_handlers.py @@ -14,5 +14,6 @@ class VAMPSHandlerTests(TestHandlerBase): # TODO: Missing tests pass + if __name__ == "__main__": main() diff --git a/qiita_pet/portal.py b/qiita_pet/portal.py index 6eb949c41..3953964e3 100644 --- a/qiita_pet/portal.py +++ b/qiita_pet/portal.py @@ -84,4 +84,5 @@ def _get_study_list(self, config): """Get the configuration of the study_list section""" self.example_search = config.get('study_list', 
'EXAMPLE_SEARCH') + portal_styling = PortalStyleManager() diff --git a/qiita_pet/support_files/doc/source/tutorials/images/sharing_analysis.gif b/qiita_pet/support_files/doc/source/tutorials/images/sharing_analysis.gif new file mode 100644 index 000000000..00c96a432 Binary files /dev/null and b/qiita_pet/support_files/doc/source/tutorials/images/sharing_analysis.gif differ diff --git a/qiita_pet/support_files/doc/source/tutorials/images/sharing_study.gif b/qiita_pet/support_files/doc/source/tutorials/images/sharing_study.gif new file mode 100644 index 000000000..f210dd765 Binary files /dev/null and b/qiita_pet/support_files/doc/source/tutorials/images/sharing_study.gif differ diff --git a/qiita_pet/support_files/doc/source/tutorials/index.rst b/qiita_pet/support_files/doc/source/tutorials/index.rst index 10fc7c35f..57f4cfc08 100644 --- a/qiita_pet/support_files/doc/source/tutorials/index.rst +++ b/qiita_pet/support_files/doc/source/tutorials/index.rst @@ -10,6 +10,7 @@ The following is a full list of the available tutorials: prepare-information-files ebi-submission getting-started + sharing analyze-data no-raw-sequences join-paired-end-reads diff --git a/qiita_pet/support_files/doc/source/tutorials/prepare-information-files.rst b/qiita_pet/support_files/doc/source/tutorials/prepare-information-files.rst index 693347c28..72435f14e 100644 --- a/qiita_pet/support_files/doc/source/tutorials/prepare-information-files.rst +++ b/qiita_pet/support_files/doc/source/tutorials/prepare-information-files.rst @@ -72,14 +72,14 @@ These are the columns required for successfully submit your data to EBI: +----------------------------------+-------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+ | Field name | Format | Description | 
+==================================+=========================+=====================================================================================================================================================+ -| ``collection_timestamp`` | ``mm/dd/yyyy hh:mm:ss`` | The time stamp (preferred) of when the sample was collected. Several format are accepted. | -| | or ``mm/dd/yyyy hh:mm`` | | -| | or ``mm/dd/yyyy hh`` | | -| | or ``mm/dd/yyyy`` | | -| | or ``mm/yyyy`` | | +| ``collection_timestamp`` | ``yyyy-mm-dd hh:mm:ss`` | The time stamp (preferred) of when the sample was collected. Several format are accepted, all ISO 8601. | +| | or ``yyyy-mm-dd hh:mm`` | | +| | or ``yyyy-mm-dd hh`` | | +| | or ``yyyy-mm-dd `` | | +| | or ``yyyy-mm`` | | | | or ``yyyy``. | | -| | Years are supported as | | -| | 4 ``yyyy`` or 2 ``yy`` | | +| | Years are only | | +| | supported as 4 ``yyyy`` | | | | digits | | +----------------------------------+-------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------+ | ``physical_specimen_location`` | free text | Where you would go to find physical sample or DNA, regardless of whether it is still available or not. | diff --git a/qiita_pet/support_files/doc/source/tutorials/sharing.rst b/qiita_pet/support_files/doc/source/tutorials/sharing.rst new file mode 100644 index 000000000..a174595b0 --- /dev/null +++ b/qiita_pet/support_files/doc/source/tutorials/sharing.rst @@ -0,0 +1,51 @@ +.. _sharing: + +.. index:: sharing + +.. role:: red + +Study and analysis sharing +========================== + +.. note:: + You can only share studies and analysis with registered users via their + email. + +Qiita allows users to share their own Studies (owner, the User that created the +study) and Analysis with the goal of encourage online collaboration. 
When a +study or analysis are shared the invited User inherits the same rights and +permissions that the owner of the Study has. These permissions are: + +- Add, download and process any Qiita artifact. Most recently, this also + includes the ability to download the raw sequence data added to each of the + preparations. This feature is turned off for any public datasets for + security reasons. +- Create, modify and delete the sample information. file +- Create, modify and delete any preparation information file + + +Sharing a Study +--------------- + +In the “Your Studies (includes shared with you)” section of the Studies List +you have a “Shared With These Users” column that lists all User names that +your study is shared with. You can click on the “Modify” button and add/remove +users. See below. + +.. figure:: images/sharing_study.gif + :align: center + + Study sharing example + + +Sharing an Analysis +------------------- + +Analysis sharing allows full access to the Analysis, including downloading the +files generated or deleting it. In the main display page of the Analysis you +will have a link to share it. See below + +.. 
figure:: images/sharing_study.gif + :align: center + + Analysis sharing example diff --git a/qiita_pet/test/test_base_handlers.py b/qiita_pet/test/test_base_handlers.py index b2a44f224..c08e44399 100644 --- a/qiita_pet/test/test_base_handlers.py +++ b/qiita_pet/test/test_base_handlers.py @@ -13,5 +13,6 @@ def test_get(self): response = self.get('/THISPAGENOEXIST') self.assertEqual(response.code, 404) + if __name__ == "__main__": main() diff --git a/qiita_pet/test/test_logger.py b/qiita_pet/test/test_logger.py index a68477feb..994fd5728 100644 --- a/qiita_pet/test/test_logger.py +++ b/qiita_pet/test/test_logger.py @@ -14,5 +14,6 @@ def test_post(self): response = self.post('/admin/error/', {'numrecords': 20}) self.assertEqual(response.code, 405) + if __name__ == "__main__": main() diff --git a/qiita_pet/test/test_ontology.py b/qiita_pet/test/test_ontology.py index 64eecf4a3..d74d8ad60 100644 --- a/qiita_pet/test/test_ontology.py +++ b/qiita_pet/test/test_ontology.py @@ -19,5 +19,6 @@ def test_patch(self): exp = {'status': 'success', 'message': ''} self.assertEqual(loads(response.body), exp) + if __name__ == '__main__': main() diff --git a/qiita_pet/test/test_prep_template.py b/qiita_pet/test/test_prep_template.py index 7b6f9a6c4..3dab40dde 100644 --- a/qiita_pet/test/test_prep_template.py +++ b/qiita_pet/test/test_prep_template.py @@ -74,5 +74,6 @@ def test_delete(self): "with it"} self.assertEqual(loads(response.body), exp) + if __name__ == '__main__': main() diff --git a/qiita_pet/test/test_upload.py b/qiita_pet/test/test_upload.py index 759932862..093dcf858 100644 --- a/qiita_pet/test/test_upload.py +++ b/qiita_pet/test/test_upload.py @@ -17,5 +17,6 @@ def test_get(self): response = self.get('/upload/') self.assertEqual(response.code, 400) + if __name__ == "__main__": main() diff --git a/qiita_pet/test/test_user_handlers.py b/qiita_pet/test/test_user_handlers.py index 65b056dc1..16a247feb 100644 --- a/qiita_pet/test/test_user_handlers.py +++ 
b/qiita_pet/test/test_user_handlers.py @@ -59,5 +59,6 @@ def test_get(self): response = self.get('/user/jobs/') self.assertEqual(response.code, 200) + if __name__ == "__main__": main() diff --git a/qiita_ware/context.py b/qiita_ware/context.py index 4f2ba3809..0d51f517b 100644 --- a/qiita_ware/context.py +++ b/qiita_ware/context.py @@ -269,6 +269,7 @@ def _submit(ctx, channel, f, *args, **kwargs): ctx.submit_async(_redis_wrap, f, redis_deets, *args, **kwargs) return uuid + # likely want this in qiita_ware.__init__ context = Dispatch() submit = partial(_submit, context) diff --git a/qiita_ware/executor.py b/qiita_ware/executor.py index 5a360de67..e2e5bf80d 100644 --- a/qiita_ware/executor.py +++ b/qiita_ware/executor.py @@ -97,4 +97,5 @@ def _submit(ctx, user, parameters): ctx.submit_async(_redis_wrap, execute, redis_deets, job.id) return job.id + plugin_submit = partial(_submit, context) diff --git a/qiita_ware/test/test_dispatchable.py b/qiita_ware/test/test_dispatchable.py index 4352a0915..7a3eeb377 100644 --- a/qiita_ware/test/test_dispatchable.py +++ b/qiita_ware/test/test_dispatchable.py @@ -181,5 +181,6 @@ def test_delete_sample_or_column(self): '"samples" and "columns"'} self.assertEqual(obs, exp) + if __name__ == '__main__': main() diff --git a/qiita_ware/test/test_ebi.py b/qiita_ware/test/test_ebi.py index 746125258..3f6d643a4 100644 --- a/qiita_ware/test/test_ebi.py +++ b/qiita_ware/test/test_ebi.py @@ -842,7 +842,7 @@ def test_parse_EBI_reply(self): assigned_from_geon - collection_timestamp11/11/11 13:00:00 + collection_timestamp2011-11-11 13:00:00 common_namesoil metagenome @@ -933,7 +933,7 @@ def test_parse_EBI_reply(self): assigned_from_geon - collection_timestamp11/11/11 13:00:00 + collection_timestamp2011-11-11 13:00:00 common_namesoil metagenome @@ -1584,5 +1584,6 @@ def test_parse_EBI_reply(self): """ + if __name__ == "__main__": main() diff --git a/qiita_ware/test/test_util.py b/qiita_ware/test/test_util.py index 0cc06702c..9437a9aa5 100644 --- 
a/qiita_ware/test/test_util.py +++ b/qiita_ware/test/test_util.py @@ -182,6 +182,7 @@ def test_hdf5IO_open(self): os.remove(name) + # comment indicates the expected random value sequences = [ ('a_1', 'AATTGGCC-a1'), # 2, 3624216819017203053 @@ -255,7 +256,7 @@ def test_hdf5IO_open(self): ('TTGCACCGTCGA', 1)], 'center_name': [('ANL', 27)], 'center_project_name': [('None', 27)], - 'collection_timestamp': [('11/11/11 13:00:00', 27)], + 'collection_timestamp': [('2011-11-11 13:00:00', 27)], 'common_name': [('rhizosphere metagenome', 9), ('root metagenome', 9), ('soil metagenome', 9)], diff --git a/scripts/qiita-env b/scripts/qiita-env index 2de58eb5d..d545a094f 100755 --- a/scripts/qiita-env +++ b/scripts/qiita-env @@ -175,5 +175,6 @@ def rem_portal(portal): except qdb.exceptions.QiitaDBLookupError: raise click.BadParameter("Portal name does not exist!") + if __name__ == '__main__': env() diff --git a/scripts/qiita-private b/scripts/qiita-private index 698caf845..40c6bf414 100755 --- a/scripts/qiita-private +++ b/scripts/qiita-private @@ -23,5 +23,6 @@ def qiita_private(): def complete_job(job_id, payload): qdb.commands.complete_job_cmd(job_id, payload) + if __name__ == '__main__': qiita_private() diff --git a/scripts/qiita-test-install b/scripts/qiita-test-install index e570e33a0..ff284aa7f 100755 --- a/scripts/qiita-test-install +++ b/scripts/qiita-test-install @@ -271,6 +271,7 @@ class QiitaConfig(TestCase): ('.'.join(map(str, version)), str_acceptable_min_version, str_acceptable_max_version)) + system_info_header = """ System information ================== diff --git a/setup.py b/setup.py index c15564bd6..5f12eb34f 100644 --- a/setup.py +++ b/setup.py @@ -103,7 +103,7 @@ ]}, scripts=glob('scripts/*'), extras_require={'test': ["nose >= 0.10.1", "pep8", 'mock']}, - install_requires=['psycopg2', 'click >= 3.3', 'future', + install_requires=['psycopg2 < 2.7', 'click >= 3.3', 'future', 'bcrypt', 'pandas >= 0.17', 'numpy >= 1.7', 'tornado==3.1.1', 'toredis', 'redis', 
'six', 'ipython[all] >= 2.4.1, < 2.5', 'pyparsing',