Skip to content

generate biom and metadata release #2066

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions qiita_db/test/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from datetime import datetime
from functools import partial
from string import punctuation
from tarfile import open as topen

import pandas as pd

Expand Down Expand Up @@ -742,6 +743,42 @@ def test_supported_filepath_types(self):
exp = [["biom", True], ["directory", False], ["log", False]]
self.assertItemsEqual(obs, exp)

def test_generate_biom_and_metadata_release(self):
    """Release generation should produce a tgz and txt describing every
    BIOM artifact of the requested study status, plus its metadata files."""
    tgz, txt = qdb.util.generate_biom_and_metadata_release('private')
    self.files_to_remove.extend([tgz, txt])

    # the tarball must contain one (biom, sample, prep) triplet per artifact
    with topen(tgz, "r:gz") as archive:
        tgz_obs = [member.name for member in archive]
    tgz_exp = [
        'processed_data/1_study_1001_closed_reference_otu_table.biom',
        'templates/1_19700101-000000.txt',
        'templates/1_prep_1_19700101-000000.txt',
        'processed_data/1_study_1001_closed_reference_otu_table.biom',
        'templates/1_19700101-000000.txt',
        'templates/1_prep_1_19700101-000000.txt',
        'processed_data/1_study_1001_closed_reference_otu_table_'
        'Silva.biom', 'templates/1_19700101-000000.txt',
        'templates/1_prep_1_19700101-000000.txt']
    self.assertEqual(tgz_obs, tgz_exp)

    # the txt index lists the same triplets, tab-separated, one row per
    # artifact, preceded by a header line
    with open(txt) as index:
        txt_obs = index.readlines()
    txt_exp = [
        'biom_fp\tsample_fp\tprep_fp\tqiita_artifact_id\tcommand\n',
        'processed_data/1_study_1001_closed_reference_otu_table.biom\ttem'
        'plates/1_19700101-000000.txt\ttemplates/1_prep_1_19700101-000000'
        '.txt\t4\tPick closed-reference OTUs, Split libraries FASTQ\n',
        'processed_data/1_study_1001_closed_reference_otu_table.biom\ttem'
        'plates/1_19700101-000000.txt\ttemplates/1_prep_1_19700101-000000'
        '.txt\t5\tPick closed-reference OTUs, Split libraries FASTQ\n',
        'processed_data/1_study_1001_closed_reference_otu_table_Silva.bio'
        'm\ttemplates/1_19700101-000000.txt\ttemplates/1_prep_1_19700101-'
        '000000.txt\t6\tPick closed-reference OTUs, Split libraries '
        'FASTQ\n']
    self.assertEqual(txt_obs, txt_exp)


@qiita_test_checker()
class UtilTests(TestCase):
Expand Down
86 changes: 86 additions & 0 deletions qiita_db/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,10 @@
from json import dumps
from datetime import datetime
from itertools import chain
from tarfile import open as topen

from qiita_core.exceptions import IncompetentQiitaDeveloperError
from qiita_core.configuration_manager import ConfigurationManager
import qiita_db as qdb


Expand Down Expand Up @@ -1549,3 +1551,87 @@ def generate_study_list(study_ids, build_samples, public_only=False):
infolist.append(info)

return infolist


def generate_biom_and_metadata_release(study_status='public'):
    """Generate a list of biom/metadata filepaths and a tgz of those files

    Parameters
    ----------
    study_status : str, optional
        The study status to search for. Note that this should always be set
        to 'public' but having it exposed helps with testing. The other
        options are 'private' and 'sandbox'

    Returns
    -------
    str, str
        tgz_name: the filepath of the new generated tgz
        txt_name: the filepath of the new generated txt
    """
    studies = qdb.study.Study.get_by_status(study_status)
    qiita_config = ConfigurationManager()
    working_dir = qiita_config.working_dir
    portal = qiita_config.portal
    bdir = qdb.util.get_db_files_base_dir()
    # +1 so the slice below also drops the path separator after the base dir
    bdir_len = len(bdir) + 1

    # data accumulates (biom_fp, sample_fp, prep_fp, artifact_id, human_cmd)
    # rows; all filepaths are made relative to the DB files base dir
    data = []
    for s in studies:
        # [0] latest is first, [1] only getting the filepath
        sample_fp = s.sample_template.get_filepaths()[0][1]
        if sample_fp.startswith(bdir):
            sample_fp = sample_fp[bdir_len:]

        for a in s.artifacts(artifact_type='BIOM'):
            # a BIOM with no processing parameters was uploaded directly to
            # Qiita (per PR discussion), so there is no command to report
            if a.processing_parameters is None:
                continue

            cmd_name = a.processing_parameters.command.name

            # this loop is necessary as in theory an artifact can be
            # generated from multiple prep info files
            human_cmd = []
            for p in a.parents:
                pp = p.processing_parameters
                pp_cmd_name = pp.command.name
                if pp_cmd_name == 'Trimming':
                    # e.g. "Pick closed-reference OTUs @ 100"
                    human_cmd.append('%s @ %s' % (
                        cmd_name, str(pp.values['length'])))
                else:
                    human_cmd.append('%s, %s' % (cmd_name, pp_cmd_name))
            human_cmd = ', '.join(human_cmd)

            for _, fp, fp_type in a.filepaths:
                # skip non-biom files and the 16S-only subset tables
                if fp_type != 'biom' or 'only-16s' in fp:
                    continue
                if fp.startswith(bdir):
                    fp = fp[bdir_len:]
                # format: (biom_fp, sample_fp, prep_fp, qiita_artifact_id,
                #          human readable name)
                for pt in a.prep_templates:
                    # keep the first non-QIIME-mapping prep filepath;
                    # NOTE(review): if every filepath contains 'qiime' (or
                    # get_filepaths() is empty) prep_fp carries over the
                    # last/previous value — TODO confirm this is intended
                    for _, prep_fp in pt.get_filepaths():
                        if 'qiime' not in prep_fp:
                            break
                    if prep_fp.startswith(bdir):
                        prep_fp = prep_fp[bdir_len:]
                    data.append((fp, sample_fp, prep_fp, a.id, human_cmd))

    # writing text and tgz file
    ts = datetime.now().strftime('%m%d%y-%H%M%S')
    tgz_dir = join(working_dir, 'releases')
    if not exists(tgz_dir):
        makedirs(tgz_dir)
    tgz_name = join(tgz_dir, '%s-%s-%s.tgz' % (portal, study_status, ts))
    txt_name = join(tgz_dir, '%s-%s-%s.txt' % (portal, study_status, ts))
    with open(txt_name, 'w') as txt, topen(tgz_name, "w|gz") as tgz:
        # writing header for txt
        txt.write("biom_fp\tsample_fp\tprep_fp\tqiita_artifact_id\tcommand\n")
        for biom_fp, sample_fp, prep_fp, artifact_id, human_cmd in data:
            txt.write("%s\t%s\t%s\t%s\t%s\n" % (
                biom_fp, sample_fp, prep_fp, artifact_id, human_cmd))
            # recursive=False: each entry is a single file, never a directory
            tgz.add(join(bdir, biom_fp), arcname=biom_fp, recursive=False)
            tgz.add(join(bdir, sample_fp), arcname=sample_fp, recursive=False)
            tgz.add(join(bdir, prep_fp), arcname=prep_fp, recursive=False)

    return tgz_name, txt_name
6 changes: 5 additions & 1 deletion scripts/qiita-cron-job
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@
# The full license is in the file LICENSE, distributed with this software.
# -----------------------------------------------------------------------------

from qiita_db.util import purge_filepaths, empty_trash_upload_folder
from qiita_db.util import (
purge_filepaths, empty_trash_upload_folder,
generate_biom_and_metadata_release)
from qiita_db.meta_util import update_redis_stats


Expand All @@ -18,6 +20,7 @@ from qiita_db.meta_util import update_redis_stats
# 2. empty_trash_upload_folder: remove files that are present in the trash
# of the upload folders
# 3. update_redis_stats: updates the redis stats information
# 4. generate_biom_and_metadata_release: generate public releases of the
#    biom tables and metadata
#
# Note that it is the responsibility of the Qiita system administrator to
# add this script to a cron job and to decide how often it should run
def main():
    """Run all periodic Qiita maintenance tasks, in order.

    Intended to be invoked from a system cron job; see the header comment
    of this script for a description of each task.
    """
    purge_filepaths(True)
    empty_trash_upload_folder(True)
    update_redis_stats()
    # only public studies are ever released from the cron job
    generate_biom_and_metadata_release('public')


if __name__ == "__main__":
Expand Down