Skip to content

Commit 9a378bc

Browse files
antgonzaElDeveloper
authored and committed
WIP: using new info to merge bioms (#2167)
* using new info to merge bioms * fix errors * flake8
1 parent ab0e2f2 commit 9a378bc

File tree

2 files changed

+52
-45
lines changed

2 files changed

+52
-45
lines changed

qiita_db/analysis.py

Lines changed: 35 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -771,6 +771,9 @@ def build_files(self, merge_duplicated_sample_ids):
771771
# make testing much harder as we will need to have analyses at
772772
# different stages and possible errors.
773773
samples = self.samples
774+
# getting the info of all the artifacts to save SQL time
775+
bioms_info = qdb.util.get_artifacts_bioms_information(
776+
samples.keys())
774777

775778
# figuring out if we are going to have duplicated samples, again
776779
# doing it here because it's computationally cheaper
@@ -780,20 +783,32 @@ def build_files(self, merge_duplicated_sample_ids):
780783
# are going to create
781784
rename_dup_samples = False
782785
grouped_samples = {}
783-
for k, v in viewitems(samples):
784-
a = qdb.artifact.Artifact(k)
785-
p = a.processing_parameters
786-
if p is not None and p.command is not None:
787-
ref = (str(p.values['reference'])
788-
if 'reference' in p.values else 'na')
789-
cid = str(p.command.id)
786+
for aid, asamples in viewitems(samples):
787+
# find the artifact info; [0] because there should be only 1 info
788+
ainfo = [bi for bi in bioms_info
789+
if bi['artifact_id'] == aid][0]
790+
791+
data_type = ainfo['data_type']
792+
algorithm = ainfo['algorithm']
793+
target_subfragment = ainfo['target_subfragment']
794+
parameters = ['%s: %s' % (k, v)
795+
for k, v in viewitems(ainfo['parameters'])]
796+
files = ainfo['files']
797+
798+
l = "%s || %s || %s || %s" % (
799+
data_type, algorithm, ','.join(target_subfragment),
800+
', '.join(parameters))
801+
# deblur special case, we need to account for file name
802+
if 'deblur-workflow' in algorithm:
803+
# [0] there is always just one biom
804+
l += " || %s" % [f for f in files
805+
if f.endswith('.biom')][0]
790806
else:
791-
ref = 'na'
792-
cid = 'na'
793-
l = "%s.%s.%s" % (a.data_type, ref, cid)
807+
l += " ||"
808+
794809
if l not in grouped_samples:
795810
grouped_samples[l] = []
796-
grouped_samples[l].append((k, v))
811+
grouped_samples[l].append((aid, asamples))
797812
# 2. if rename_dup_samples is still False, make sure that we don't
798813
# need to rename samples by checking that there are not
799814
# duplicated samples per group
@@ -826,7 +841,9 @@ def _build_biom_tables(self, grouped_samples, rename_dup_samples=False):
826841

827842
biom_files = []
828843
for label, tables in viewitems(grouped_samples):
829-
data_type, reference_id, command_id = label.split('.')
844+
data_type, algorithm, target_subfragment, \
845+
parameters, files = [l.strip() for l in label.split('||')]
846+
830847
new_table = None
831848
artifact_ids = []
832849
for aid, samples in tables:
@@ -873,25 +890,16 @@ def _build_biom_tables(self, grouped_samples, rename_dup_samples=False):
873890
raise RuntimeError("All samples filtered out from "
874891
"analysis due to rarefaction level")
875892

876-
# add the metadata column for study the samples come from,
877-
# this is useful in case the user download the bioms
878-
study_md = {'study': artifact.study.title,
879-
'artifact_ids': ', '.join(artifact_ids),
880-
'reference_id': reference_id,
881-
'command_id': command_id}
882-
samples_md = {sid: study_md for sid in new_table.ids()}
883-
new_table.add_metadata(samples_md, axis='sample')
884-
885893
# write out the file
886-
fn = "%d_analysis_dt-%s_r-%s_c-%s.biom" % (
887-
self._id, data_type, reference_id, command_id)
894+
info = "%s_%s_%s_%s_%s" % (
895+
data_type, algorithm, target_subfragment, parameters,
896+
files)
897+
fn = "%d_analysis_%s.biom" % (self._id, info)
888898
biom_fp = join(base_fp, fn)
889899
with biom_open(biom_fp, 'w') as f:
890900
new_table.to_hdf5(
891-
f, "Generated by Qiita. Analysis %d Datatype %s "
892-
"Reference %s Command %s" % (self._id, data_type,
893-
reference_id, command_id))
894-
901+
f, "Generated by Qiita, analysis id: %d, info: %s" % (
902+
self._id, label))
895903
biom_files.append((data_type, biom_fp))
896904
return biom_files
897905

qiita_db/test/test_analysis.py

Lines changed: 17 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ def setUp(self):
4040
self.map_exp_fp = self.get_fp("1_analysis_mapping_exp.txt")
4141

4242
from glob import glob
43-
conf_files = glob(join(qiita_config.plugin_dir, "*.conf"))
43+
conf_files = glob(join(qiita_config.plugin_dir, "BIOM*.conf"))
4444
for i, fp in enumerate(conf_files):
4545
qdb.software.Software.from_file(fp, update=True)
4646

@@ -413,10 +413,13 @@ def test_build_mapping_file_duplicated_samples_merge(self):
413413

414414
def test_build_biom_tables(self):
415415
analysis = self._create_analyses_with_samples()
416-
grouped_samples = {'18S.1.3': [(
417-
4, ['1.SKB8.640193', '1.SKD8.640184', '1.SKB7.640196'])]}
416+
grouped_samples = {
417+
'18S || algorithm || target_subfragment || parameters '
418+
'|| files': [(4, ['1.SKB8.640193', '1.SKD8.640184',
419+
'1.SKB7.640196'])]}
418420
obs_bioms = analysis._build_biom_tables(grouped_samples)
419-
biom_fp = self.get_fp("%s_analysis_dt-18S_r-1_c-3.biom" % analysis.id)
421+
biom_fp = self.get_fp("%s_analysis_18S_algorithm_target_subfra"
422+
"gment_parameters_files.biom" % analysis.id)
420423
obs = [(a, basename(b)) for a, b in obs_bioms]
421424
self.assertEqual(obs, [('18S', basename(biom_fp))])
422425

@@ -425,22 +428,16 @@ def test_build_biom_tables(self):
425428
exp = {'1.SKB8.640193', '1.SKD8.640184', '1.SKB7.640196'}
426429
self.assertEqual(obs, exp)
427430

428-
obs = table.metadata('1.SKB8.640193')
429-
exp = {'study':
430-
'Identification of the Microbiomes for Cannabis Soils',
431-
'artifact_ids': '4',
432-
'reference_id': '1',
433-
'command_id': '3'}
434-
self.assertEqual(obs, exp)
435-
436431
def test_build_biom_tables_duplicated_samples_not_merge(self):
437432
analysis = self._create_analyses_with_samples()
438-
grouped_samples = {'18S.1.3': [
439-
(4, ['1.SKB8.640193', '1.SKD8.640184', '1.SKB7.640196']),
440-
(5, ['1.SKB8.640193', '1.SKD8.640184', '1.SKB7.640196'])]}
433+
grouped_samples = {
434+
'18S || algorithm || target_subfragment || parameters || files': [
435+
(4, ['1.SKB8.640193', '1.SKD8.640184', '1.SKB7.640196']),
436+
(5, ['1.SKB8.640193', '1.SKD8.640184', '1.SKB7.640196'])]}
441437
obs_bioms = analysis._build_biom_tables(grouped_samples, True)
442438
obs = [(a, basename(b)) for a, b in obs_bioms]
443-
biom_fp = "%s_analysis_dt-18S_r-1_c-3.biom" % analysis.id
439+
biom_fp = ("%s_analysis_18S_algorithm_target_subfragment_"
440+
"parameters_files.biom" % analysis.id)
444441
self.assertEqual(obs, [('18S', biom_fp)])
445442

446443
table = load_table(obs_bioms[0][1])
@@ -450,8 +447,10 @@ def test_build_biom_tables_duplicated_samples_not_merge(self):
450447
self.assertItemsEqual(obs, exp)
451448

452449
def test_build_biom_tables_raise_error_due_to_sample_selection(self):
453-
grouped_samples = {'18S.1.3': [
454-
(4, ['sample_name_1', 'sample_name_2', 'sample_name_3'])]}
450+
grouped_samples = {
451+
'18S || algorithm || target_subfragment || parameters '
452+
'|| files': [(4, ['sample_name_1', 'sample_name_2',
453+
'sample_name_3'])]}
455454
with self.assertRaises(RuntimeError):
456455
self.analysis._build_biom_tables(grouped_samples)
457456

0 commit comments

Comments
 (0)