WIP: using new info to merge bioms (#2167)

antgonza · ElDeveloper · commit 9a378bc1fb47 · 2017-07-10T14:14:12.000-07:00
* using new info to merge bioms

* fix errors

* flake8
diff --git a/qiita_db/analysis.py b/qiita_db/analysis.py
@@ -771,6 +771,9 @@ def build_files(self, merge_duplicated_sample_ids):
             # make testing much harder as we will need to have analyses at
             # different stages and possible errors.
             samples = self.samples
+            # getting the info of all the artifacts to save SQL time
+            bioms_info = qdb.util.get_artifacts_bioms_information(
+                samples.keys())
 
             # figuring out if we are going to have duplicated samples, again
             # doing it here cause it's computational cheaper
@@ -780,20 +783,32 @@ def build_files(self, merge_duplicated_sample_ids):
             # are going to create
             rename_dup_samples = False
             grouped_samples = {}
-            for k, v in viewitems(samples):
-                a = qdb.artifact.Artifact(k)
-                p = a.processing_parameters
-                if p is not None and p.command is not None:
-                    ref = (str(p.values['reference'])
-                           if 'reference' in p.values else 'na')
-                    cid = str(p.command.id)
+            for aid, asamples in viewitems(samples):
+                # find the artifact info, [0] there should be only 1 info
+                ainfo = [bi for bi in bioms_info
+                         if bi['artifact_id'] == aid][0]
+
+                data_type = ainfo['data_type']
+                algorithm = ainfo['algorithm']
+                target_subfragment = ainfo['target_subfragment']
+                parameters = ['%s: %s' % (k, v)
+                              for k, v in viewitems(ainfo['parameters'])]
+                files = ainfo['files']
+
+                l = "%s || %s || %s || %s" % (
+                    data_type, algorithm, ','.join(target_subfragment),
+                    ', '.join(parameters))
+                # deblur special case, we need to account for file name
+                if 'deblur-workflow' in algorithm:
+                    # [0] there is always just one biom
+                    l += " || %s" % [f for f in files
+                                     if f.endswith('.biom')][0]
                 else:
-                    ref = 'na'
-                    cid = 'na'
-                l = "%s.%s.%s" % (a.data_type, ref, cid)
+                    l += " ||"
+
                 if l not in grouped_samples:
                     grouped_samples[l] = []
-                grouped_samples[l].append((k, v))
+                grouped_samples[l].append((aid, asamples))
             # 2. if rename_dup_samples is still False, make sure that we don't
             #    need to rename samples by checking that there are not
             #    duplicated samples per group
@@ -826,7 +841,9 @@ def _build_biom_tables(self, grouped_samples, rename_dup_samples=False):
 
             biom_files = []
             for label, tables in viewitems(grouped_samples):
-                data_type, reference_id, command_id = label.split('.')
+                data_type, algorithm, target_subfragment, \
+                    parameters, files = [l.strip() for l in label.split('||')]
+
                 new_table = None
                 artifact_ids = []
                 for aid, samples in tables:
@@ -873,25 +890,16 @@ def _build_biom_tables(self, grouped_samples, rename_dup_samples=False):
                     raise RuntimeError("All samples filtered out from "
                                        "analysis due to rarefaction level")
 
-                # add the metadata column for study the samples come from,
-                # this is useful in case the user download the bioms
-                study_md = {'study': artifact.study.title,
-                            'artifact_ids': ', '.join(artifact_ids),
-                            'reference_id': reference_id,
-                            'command_id': command_id}
-                samples_md = {sid: study_md for sid in new_table.ids()}
-                new_table.add_metadata(samples_md, axis='sample')
-
                 # write out the file
-                fn = "%d_analysis_dt-%s_r-%s_c-%s.biom" % (
-                    self._id, data_type, reference_id, command_id)
+                info = "%s_%s_%s_%s_%s" % (
+                    data_type, algorithm, target_subfragment, parameters,
+                    files)
+                fn = "%d_analysis_%s.biom" % (self._id, info)
                 biom_fp = join(base_fp, fn)
                 with biom_open(biom_fp, 'w') as f:
                     new_table.to_hdf5(
-                        f, "Generated by Qiita. Analysis %d Datatype %s "
-                        "Reference %s Command %s" % (self._id, data_type,
-                                                     reference_id, command_id))
-
+                        f, "Generated by Qiita, analysis id: %d, info: %s" % (
+                            self._id, label))
                 biom_files.append((data_type, biom_fp))
         return biom_files
 
diff --git a/qiita_db/test/test_analysis.py b/qiita_db/test/test_analysis.py
@@ -40,7 +40,7 @@ def setUp(self):
         self.map_exp_fp = self.get_fp("1_analysis_mapping_exp.txt")
 
         from glob import glob
-        conf_files = glob(join(qiita_config.plugin_dir, "*.conf"))
+        conf_files = glob(join(qiita_config.plugin_dir, "BIOM*.conf"))
         for i, fp in enumerate(conf_files):
             qdb.software.Software.from_file(fp, update=True)
 
@@ -413,10 +413,13 @@ def test_build_mapping_file_duplicated_samples_merge(self):
 
     def test_build_biom_tables(self):
         analysis = self._create_analyses_with_samples()
-        grouped_samples = {'18S.1.3': [(
-            4, ['1.SKB8.640193', '1.SKD8.640184', '1.SKB7.640196'])]}
+        grouped_samples = {
+            '18S || algorithm || target_subfragment || parameters '
+            '|| files': [(4, ['1.SKB8.640193', '1.SKD8.640184',
+                              '1.SKB7.640196'])]}
         obs_bioms = analysis._build_biom_tables(grouped_samples)
-        biom_fp = self.get_fp("%s_analysis_dt-18S_r-1_c-3.biom" % analysis.id)
+        biom_fp = self.get_fp("%s_analysis_18S_algorithm_target_subfra"
+                              "gment_parameters_files.biom" % analysis.id)
         obs = [(a, basename(b)) for a, b in obs_bioms]
         self.assertEqual(obs, [('18S', basename(biom_fp))])
 
@@ -425,22 +428,16 @@ def test_build_biom_tables(self):
         exp = {'1.SKB8.640193', '1.SKD8.640184', '1.SKB7.640196'}
         self.assertEqual(obs, exp)
 
-        obs = table.metadata('1.SKB8.640193')
-        exp = {'study':
-               'Identification of the Microbiomes for Cannabis Soils',
-               'artifact_ids': '4',
-               'reference_id': '1',
-               'command_id': '3'}
-        self.assertEqual(obs, exp)
-
     def test_build_biom_tables_duplicated_samples_not_merge(self):
         analysis = self._create_analyses_with_samples()
-        grouped_samples = {'18S.1.3': [
-            (4, ['1.SKB8.640193', '1.SKD8.640184', '1.SKB7.640196']),
-            (5, ['1.SKB8.640193', '1.SKD8.640184', '1.SKB7.640196'])]}
+        grouped_samples = {
+            '18S || algorithm || target_subfragment || parameters || files': [
+                (4, ['1.SKB8.640193', '1.SKD8.640184', '1.SKB7.640196']),
+                (5, ['1.SKB8.640193', '1.SKD8.640184', '1.SKB7.640196'])]}
         obs_bioms = analysis._build_biom_tables(grouped_samples, True)
         obs = [(a, basename(b)) for a, b in obs_bioms]
-        biom_fp = "%s_analysis_dt-18S_r-1_c-3.biom" % analysis.id
+        biom_fp = ("%s_analysis_18S_algorithm_target_subfragment_"
+                   "parameters_files.biom" % analysis.id)
         self.assertEqual(obs, [('18S', biom_fp)])
 
         table = load_table(obs_bioms[0][1])
@@ -450,8 +447,10 @@ def test_build_biom_tables_duplicated_samples_not_merge(self):
         self.assertItemsEqual(obs, exp)
 
     def test_build_biom_tables_raise_error_due_to_sample_selection(self):
-        grouped_samples = {'18S.1.3': [
-            (4, ['sample_name_1', 'sample_name_2', 'sample_name_3'])]}
+        grouped_samples = {
+            '18S || algorithm || target_subfragment || parameters '
+            '|| files': [(4, ['sample_name_1', 'sample_name_2',
+                              'sample_name_3'])]}
         with self.assertRaises(RuntimeError):
             self.analysis._build_biom_tables(grouped_samples)