Skip to content

generate biom and metadata release #2066

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions qiita_db/test/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from datetime import datetime
from functools import partial
from string import punctuation
from tarfile import open as topen

import pandas as pd

Expand Down Expand Up @@ -742,6 +743,42 @@ def test_supported_filepath_types(self):
exp = [["biom", True], ["directory", False], ["log", False]]
self.assertItemsEqual(obs, exp)

def test_generate_biom_and_metadata_release(self):
    """Release generation should produce a tgz and txt describing every
    BIOM artifact of the requested study status, plus its metadata files."""
    tgz, txt = qdb.util.generate_biom_and_metadata_release('private')
    self.files_to_remove.extend([tgz, txt])

    # the tarball must contain one (biom, sample, prep) triplet per artifact
    with topen(tgz, "r:gz") as archive:
        tgz_obs = [member.name for member in archive]
    tgz_exp = [
        'processed_data/1_study_1001_closed_reference_otu_table.biom',
        'templates/1_19700101-000000.txt',
        'templates/1_prep_1_19700101-000000.txt',
        'processed_data/1_study_1001_closed_reference_otu_table.biom',
        'templates/1_19700101-000000.txt',
        'templates/1_prep_1_19700101-000000.txt',
        'processed_data/1_study_1001_closed_reference_otu_table_'
        'Silva.biom', 'templates/1_19700101-000000.txt',
        'templates/1_prep_1_19700101-000000.txt']
    self.assertEqual(tgz_obs, tgz_exp)

    # the txt index lists the same triplets, tab-separated, one row per
    # artifact, preceded by a header line
    with open(txt) as index:
        txt_obs = index.readlines()
    txt_exp = [
        'biom_fp\tsample_fp\tprep_fp\tqiita_artifact_id\tcommand\n',
        'processed_data/1_study_1001_closed_reference_otu_table.biom\ttem'
        'plates/1_19700101-000000.txt\ttemplates/1_prep_1_19700101-000000'
        '.txt\t4\tPick closed-reference OTUs, Split libraries FASTQ\n',
        'processed_data/1_study_1001_closed_reference_otu_table.biom\ttem'
        'plates/1_19700101-000000.txt\ttemplates/1_prep_1_19700101-000000'
        '.txt\t5\tPick closed-reference OTUs, Split libraries FASTQ\n',
        'processed_data/1_study_1001_closed_reference_otu_table_Silva.bio'
        'm\ttemplates/1_19700101-000000.txt\ttemplates/1_prep_1_19700101-'
        '000000.txt\t6\tPick closed-reference OTUs, Split libraries '
        'FASTQ\n']
    self.assertEqual(txt_obs, txt_exp)


@qiita_test_checker()
class UtilTests(TestCase):
Expand Down
86 changes: 86 additions & 0 deletions qiita_db/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,10 @@
from json import dumps
from datetime import datetime
from itertools import chain
from tarfile import open as topen

from qiita_core.exceptions import IncompetentQiitaDeveloperError
from qiita_core.configuration_manager import ConfigurationManager
import qiita_db as qdb


Expand Down Expand Up @@ -1549,3 +1551,87 @@ def generate_study_list(study_ids, build_samples, public_only=False):
infolist.append(info)

return infolist


def generate_biom_and_metadata_release(study_status='public'):
    """Generate a list of biom/metadata filepaths and a tgz of those files

    Parameters
    ----------
    study_status : str, optional
        The study status to search for. Note that this should always be set
        to 'public' but having it exposed helps with testing. The other
        options are 'private' and 'sandbox'

    Returns
    -------
    str, str
        tgz_name: the filepath of the new generated tgz
        txt_name: the filepath of the new generated txt
    """
    studies = qdb.study.Study.get_by_status(study_status)
    qiita_config = ConfigurationManager()
    working_dir = qiita_config.working_dir
    portal = qiita_config.portal
    bdir = qdb.util.get_db_files_base_dir()
    # +1 so the slice below also drops the path separator after the base dir
    bdir_len = len(bdir) + 1

    # data accumulates (biom_fp, sample_fp, prep_fp, artifact_id, human_cmd)
    # rows; all filepaths are made relative to the DB files base dir
    data = []
    for s in studies:
        # [0] latest is first, [1] only getting the filepath
        sample_fp = s.sample_template.get_filepaths()[0][1]
        if sample_fp.startswith(bdir):
            sample_fp = sample_fp[bdir_len:]

        for a in s.artifacts(artifact_type='BIOM'):
            # a BIOM with no processing parameters was uploaded directly to
            # Qiita (per PR discussion), so there is no command to report
            if a.processing_parameters is None:
                continue

            cmd_name = a.processing_parameters.command.name

            # this loop is necessary as in theory an artifact can be
            # generated from multiple prep info files
            human_cmd = []
            for p in a.parents:
                pp = p.processing_parameters
                pp_cmd_name = pp.command.name
                if pp_cmd_name == 'Trimming':
                    # e.g. "Pick closed-reference OTUs @ 100"
                    human_cmd.append('%s @ %s' % (
                        cmd_name, str(pp.values['length'])))
                else:
                    human_cmd.append('%s, %s' % (cmd_name, pp_cmd_name))
            human_cmd = ', '.join(human_cmd)

            for _, fp, fp_type in a.filepaths:
                # skip non-biom files and the 16S-only subset tables
                if fp_type != 'biom' or 'only-16s' in fp:
                    continue
                if fp.startswith(bdir):
                    fp = fp[bdir_len:]
                # format: (biom_fp, sample_fp, prep_fp, qiita_artifact_id,
                #          human readable name)
                for pt in a.prep_templates:
                    # keep the first non-QIIME-mapping prep filepath;
                    # NOTE(review): if every filepath contains 'qiime' (or
                    # get_filepaths() is empty) prep_fp carries over the
                    # last/previous value — TODO confirm this is intended
                    for _, prep_fp in pt.get_filepaths():
                        if 'qiime' not in prep_fp:
                            break
                    if prep_fp.startswith(bdir):
                        prep_fp = prep_fp[bdir_len:]
                    data.append((fp, sample_fp, prep_fp, a.id, human_cmd))

    # writing text and tgz file
    ts = datetime.now().strftime('%m%d%y-%H%M%S')
    tgz_dir = join(working_dir, 'releases')
    if not exists(tgz_dir):
        makedirs(tgz_dir)
    tgz_name = join(tgz_dir, '%s-%s-%s.tgz' % (portal, study_status, ts))
    txt_name = join(tgz_dir, '%s-%s-%s.txt' % (portal, study_status, ts))
    with open(txt_name, 'w') as txt, topen(tgz_name, "w|gz") as tgz:
        # writing header for txt
        txt.write("biom_fp\tsample_fp\tprep_fp\tqiita_artifact_id\tcommand\n")
        for biom_fp, sample_fp, prep_fp, artifact_id, human_cmd in data:
            txt.write("%s\t%s\t%s\t%s\t%s\n" % (
                biom_fp, sample_fp, prep_fp, artifact_id, human_cmd))
            # recursive=False: each entry is a single file, never a directory
            tgz.add(join(bdir, biom_fp), arcname=biom_fp, recursive=False)
            tgz.add(join(bdir, sample_fp), arcname=sample_fp, recursive=False)
            tgz.add(join(bdir, prep_fp), arcname=prep_fp, recursive=False)

    return tgz_name, txt_name
6 changes: 5 additions & 1 deletion scripts/qiita-cron-job
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@
# The full license is in the file LICENSE, distributed with this software.
# -----------------------------------------------------------------------------

from qiita_db.util import purge_filepaths, empty_trash_upload_folder
from qiita_db.util import (
purge_filepaths, empty_trash_upload_folder,
generate_biom_and_metadata_release)
from qiita_db.meta_util import update_redis_stats


Expand All @@ -18,6 +20,7 @@ from qiita_db.meta_util import update_redis_stats
# 2. empty_trash_upload_folder: remove files that are present in the trash
# of the upload folders
# 3. update_redis_stats: updates the redis stats information
# 4. generate_biom_and_metadata_release: generate public releases of the
#    biom tables and metadata
#
# Note that it is the responsibility of the Qiita system administrator to
# add this script to a cron job and to decide how often it should run
def main():
    """Run all periodic Qiita maintenance tasks, in order.

    Intended to be invoked from a system cron job; see the header comment
    of this script for a description of each task.
    """
    purge_filepaths(True)
    empty_trash_upload_folder(True)
    update_redis_stats()
    # only public studies are ever released from the cron job
    generate_biom_and_metadata_release('public')


if __name__ == "__main__":
Expand Down