Skip to content

Commit 56a20b4

Browse files
antgonza authored and ElDeveloper committed
fix #2086 (#2102)
* fix #2086 * flake8 * addressing @ElDeveloper comments + fixes * adding the final changes * fix failures * get_qiita_version -> generate_biom_and_metadata_release
1 parent dc0b029 commit 56a20b4

File tree

12 files changed

+388
-226
lines changed

12 files changed

+388
-226
lines changed

qiita_core/tests/test_util.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,8 @@
1212

1313
from qiita_core.util import (
1414
send_email, qiita_test_checker, execute_as_transaction, get_qiita_version,
15-
is_test_environment)
15+
is_test_environment, get_release_info)
16+
from qiita_db.meta_util import generate_biom_and_metadata_release
1617
import qiita_db as qdb
1718

1819

@@ -64,6 +65,21 @@ def test_get_qiita_version(self):
6465
# testing just the version
6566
self.assertEqual(exp_version, qdb.__version__)
6667

68+
def test_get_release_info(self):
    """Release info is populated once generated and empty otherwise"""
    # a 'private' release has to exist before asking for its information
    generate_biom_and_metadata_release('private')

    # the actual values (i.e. the MD5) change on every run, so we can only
    # verify that each one of them was filled in
    md5sum, filepath, timestamp = get_release_info('private')
    for observed in (md5sum, filepath, timestamp):
        self.assertNotEqual(observed, '')

    # no 'public' release was generated, so everything should be empty
    md5sum, filepath, timestamp = get_release_info('public')
    for observed in (md5sum, filepath, timestamp):
        self.assertEqual(observed, '')
82+
6783

6884
# only call main() when this file is executed as a script, not on import
if __name__ == '__main__':
    main()

qiita_core/util.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from os.path import dirname
1212
from git import Repo
1313
from git.exc import InvalidGitRepositoryError
14+
from moi import r_client
1415

1516
from qiita_core.qiita_settings import qiita_config
1617
from qiita_pet import __version__ as qiita_pet_lib_version
@@ -141,3 +142,32 @@ def get_qiita_version():
141142
sha = ''
142143

143144
return (qiita_pet_lib_version, sha)
145+
146+
147+
def get_release_info(study_status='public'):
    """Returns the MD5, filepath and timestamp of a study status release

    The values are read from redis, using the portal of the current
    configuration and `study_status` to build the keys.

    Parameters
    ----------
    study_status : str, optional
        The study status to search for. Note that this should always be set
        to 'public' but having this exposed helps with testing. The other
        options are 'private' and 'sandbox'

    Returns
    -------
    str, str, str
        The release MD5, filepath and timestamp. Any value that is not
        present in redis is returned as an empty string
    """
    portal = qiita_config.portal
    # the order of the templates defines the order of the returned values
    key_templates = (
        '%s:release:%s:md5sum',
        '%s:release:%s:filepath',
        '%s:release:%s:time')

    release_info = []
    for template in key_templates:
        stored = r_client.get(template % (portal, study_status))
        # redis returns None for missing keys; callers expect ''
        release_info.append('' if stored is None else stored)

    return tuple(release_info)

qiita_db/meta_util.py

Lines changed: 101 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,8 @@
2525
from __future__ import division
2626

2727
from moi import r_client
28-
from os import stat
28+
from os import stat, makedirs, rename
29+
from os.path import join, relpath, exists
2930
from time import strftime, localtime
3031
import matplotlib.pyplot as plt
3132
import matplotlib as mpl
@@ -34,8 +35,11 @@
3435
from StringIO import StringIO
3536
from future.utils import viewitems
3637
from datetime import datetime
38+
from tarfile import open as topen, TarInfo
39+
from hashlib import md5
3740

3841
from qiita_core.qiita_settings import qiita_config
42+
from qiita_core.configuration_manager import ConfigurationManager
3943
import qiita_db as qdb
4044

4145

@@ -332,3 +336,99 @@ def get_lat_longs():
332336
qdb.sql_connection.TRN.add(sql)
333337

334338
return qdb.sql_connection.TRN.execute_fetchindex()
339+
340+
341+
def _describe_command(cmd_name, parents):
    """Builds the human readable command description of an artifact

    Parameters
    ----------
    cmd_name : str
        The name of the command that generated the artifact
    parents : iterable of artifacts
        The parents of the artifact; the processing parameters of each one
        are used to build the description

    Returns
    -------
    str
        One description per parent, joined by ', '
    """
    # this loop is necessary as in theory an artifact can be generated from
    # multiple prep info files
    descriptions = []
    for parent in parents:
        parent_params = parent.processing_parameters
        parent_cmd_name = parent_params.command.name
        if parent_cmd_name == 'Trimming':
            descriptions.append('%s @ %s' % (
                cmd_name, str(parent_params.values['length'])))
        else:
            descriptions.append('%s, %s' % (cmd_name, parent_cmd_name))
    return ', '.join(descriptions)


def _select_prep_filepath(prep_template):
    """Selects the prep info filepath to include in the release

    Parameters
    ----------
    prep_template : prep template object
        Its `get_filepaths()` is expected to yield (id, filepath) pairs

    Returns
    -------
    str or None
        The first filepath that does not contain 'qiime'; if all of them
        contain it, the last filepath listed. None if there are no filepaths
    """
    selected = None
    for _, prep_fp in prep_template.get_filepaths():
        selected = prep_fp
        if 'qiime' not in prep_fp:
            break
    return selected


def _md5sum_file(filepath, chunk_size=4096):
    """Returns the hexadecimal MD5 digest of the contents of `filepath`

    The file is read in chunks of `chunk_size` bytes so it is never fully
    loaded in memory.
    """
    checksum = md5()
    with open(filepath, "rb") as handle:
        for chunk in iter(lambda: handle.read(chunk_size), b""):
            checksum.update(chunk)
    return checksum.hexdigest()


def generate_biom_and_metadata_release(study_status='public'):
    """Generate a list of biom/metadata filepaths and a tgz of those files

    For every study with the given status, each biom file of its processed
    BIOM artifacts is paired with the sample info file of the study and the
    prep info file(s) of the artifact. All those files, plus a tab separated
    text file listing them, are added to
    `<working_dir>/releases/<portal>-<study_status>.tgz`. The filepath
    (relative to the working directory), the MD5 of the tgz and the
    generation time are then stored in redis under
    `<portal>:release:<study_status>:{filepath,md5sum,time}`.

    Parameters
    ----------
    study_status : str, optional
        The study status to search for. Note that this should always be set
        to 'public' but having this exposed helps with testing. The other
        options are 'private' and 'sandbox'
    """
    studies = qdb.study.Study.get_by_status(study_status)
    # named `config` so the module level `qiita_config` is not shadowed
    config = ConfigurationManager()
    working_dir = config.working_dir
    portal = config.portal
    bdir = qdb.util.get_db_files_base_dir()
    # a single instant is used for both the stored time and the name of the
    # text file so they always agree
    now = datetime.now()
    timestamp = now.strftime('%m-%d-%y %H:%M:%S')
    ts = now.strftime('%m%d%y-%H%M%S')

    # format: (biom_fp, sample_fp, prep_fp, qiita_artifact_id,
    #          human readable name)
    data = []
    for study in studies:
        # [0] latest is first, [1] only getting the filepath
        sample_fp = relpath(
            study.sample_template.get_filepaths()[0][1], bdir)

        for artifact in study.artifacts(artifact_type='BIOM'):
            params = artifact.processing_parameters
            # artifacts without processing parameters are not included
            if params is None:
                continue

            human_cmd = _describe_command(
                params.command.name, artifact.parents)

            for _, fp, fp_type in artifact.filepaths:
                if fp_type != 'biom' or 'only-16s' in fp:
                    continue
                fp = relpath(fp, bdir)
                for pt in artifact.prep_templates:
                    prep_fp = _select_prep_filepath(pt)
                    # a prep template without files has nothing to release
                    if prep_fp is None:
                        continue
                    data.append((fp, sample_fp, relpath(prep_fp, bdir),
                                 artifact.id, human_cmd))

    # writing text and tgz file
    tgz_dir = join(working_dir, 'releases')
    if not exists(tgz_dir):
        makedirs(tgz_dir)
    # the tgz is built under a temporary name and renamed once complete so
    # the previous release is never replaced by a partial file
    tgz_name = join(tgz_dir, '%s-%s-building.tgz' % (portal, study_status))
    tgz_name_final = join(tgz_dir, '%s-%s.tgz' % (portal, study_status))
    txt_hd = StringIO()
    with topen(tgz_name, "w|gz") as tgz:
        # writing header for txt
        txt_hd.write(
            "biom_fp\tsample_fp\tprep_fp\tqiita_artifact_id\tcommand\n")
        for biom_fp, sample_fp, prep_fp, artifact_id, human_cmd in data:
            txt_hd.write("%s\t%s\t%s\t%s\t%s\n" % (
                biom_fp, sample_fp, prep_fp, artifact_id, human_cmd))
            tgz.add(join(bdir, biom_fp), arcname=biom_fp, recursive=False)
            tgz.add(join(bdir, sample_fp), arcname=sample_fp,
                    recursive=False)
            tgz.add(join(bdir, prep_fp), arcname=prep_fp, recursive=False)

        info = TarInfo(name='%s-%s-%s.txt' % (portal, study_status, ts))
        # the size has to be set by hand for in-memory files
        info.size = len(txt_hd.getvalue())
        txt_hd.seek(0)
        tgz.addfile(tarinfo=info, fileobj=txt_hd)

    md5sum = _md5sum_file(tgz_name)

    rename(tgz_name, tgz_name_final)

    # relpath yields the same value whether or not working_dir ends with '/'
    release_info = [
        ('filepath', relpath(tgz_name_final, working_dir)),
        ('md5sum', md5sum),
        ('time', timestamp)]
    for key, value in release_info:
        redis_key = '%s:release:%s:%s' % (portal, study_status, key)
        # important to "flush" variables to avoid errors
        r_client.delete(redis_key)
        r_client.set(redis_key, value)

qiita_db/test/test_meta_util.py

Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@
88

99
from unittest import TestCase, main
1010
import numpy.testing as npt
11+
from tarfile import open as topen
12+
from os import remove
13+
from os.path import exists, join
1114

1215
import pandas as pd
1316

@@ -22,9 +25,13 @@
2225
class MetaUtilTests(TestCase):
2326
def setUp(self):
    """Start with no files to clean up and remember the current portal"""
    # tests register here the files that tearDown should delete
    self.files_to_remove = []
    self.old_portal = qiita_config.portal
2529

2630
def tearDown(self):
    """Restore the original portal and delete the registered files"""
    qiita_config.portal = self.old_portal
    for path in self.files_to_remove:
        # the file may have never been created or be already gone
        if not exists(path):
            continue
        remove(path)
2835

2936
def _set_artifact_private(self):
3037
self.conn_handler.execute(
@@ -227,6 +234,164 @@ def test_update_redis_stats(self):
227234
redis_key = '%s:stats:%s' % (portal, k)
228235
self.assertEqual(f(redis_key), exp)
229236

237+
def _check_release_contents(self, tgz):
    """Checks the members and the text file of the release in `tgz`

    Parameters
    ----------
    tgz : str
        The filepath of the release tgz to check
    """
    with topen(tgz, "r:gz") as tar:
        tgz_obs = [ti.name for ti in tar]

    biom = 'processed_data/1_study_1001_closed_reference_otu_table.biom'
    biom_silva = ('processed_data/1_study_1001_closed_reference_otu_table_'
                  'Silva.biom')
    # yes, the first biom is there twice
    for fn in (biom, biom, biom_silva):
        self.assertIn(fn, tgz_obs)
        tgz_obs.remove(fn)

    # files names might change due to updates and patches so the prep and
    # sample info files are located by prefix: just take the first match
    # and check/rm the occurrences of that file
    fn_prep = [f for f in tgz_obs
               if f.startswith('templates/1_prep_1_')][0]
    # 3 times
    for _ in range(3):
        self.assertIn(fn_prep, tgz_obs)
        tgz_obs.remove(fn_prep)
    # the prep info files are already gone so this matches the sample info
    fn_sample = [f for f in tgz_obs if f.startswith('templates/1_')][0]
    # 3 times
    for _ in range(3):
        self.assertIn(fn_sample, tgz_obs)
        tgz_obs.remove(fn_sample)

    # now we should only have the text file
    txt = tgz_obs.pop()
    # now it should be empty
    self.assertEqual(tgz_obs, [])

    with topen(tgz, "r:gz") as tar:
        txt_obs = tar.extractfile(txt).readlines()

    command = 'Pick closed-reference OTUs, Split libraries FASTQ'
    txt_exp = ['biom_fp\tsample_fp\tprep_fp\tqiita_artifact_id\tcommand\n']
    for biom_fp, artifact_id in ((biom, 4), (biom, 5), (biom_silva, 6)):
        txt_exp.append('%s\t%s\t%s\t%d\t%s\n' % (
            biom_fp, fn_sample, fn_prep, artifact_id, command))
    self.assertEqual(txt_obs, txt_exp)


def test_generate_biom_and_metadata_release(self):
    """The release is correct with and without a trailing '/' in the
    base data directory setting
    """
    level = 'private'
    portal = qiita_config.portal
    working_dir = qiita_config.working_dir
    # the filepath, md5sum and time are stored but we are only going to
    # check the filepath contents so ignoring the others
    redis_key = '%s:release:%s:filepath' % (portal, level)

    qdb.meta_util.generate_biom_and_metadata_release(level)
    tgz = join(working_dir, r_client.get(redis_key))
    self.files_to_remove.append(tgz)
    self._check_release_contents(tgz)

    # whatever the configuration was, we will change to settings so we can
    # test the other option when dealing with the end '/'
    with qdb.sql_connection.TRN:
        qdb.sql_connection.TRN.add(
            "SELECT base_data_dir FROM settings")
        obdr = qdb.sql_connection.TRN.execute_fetchlast()
        if obdr[-1] == '/':
            bdr = obdr[:-1]
        else:
            bdr = obdr + '/'

        qdb.sql_connection.TRN.add(
            "UPDATE settings SET base_data_dir = '%s'" % bdr)
        qdb.sql_connection.TRN.execute()

    try:
        qdb.meta_util.generate_biom_and_metadata_release(level)
        tgz = join(working_dir, r_client.get(redis_key))
        if tgz not in self.files_to_remove:
            self.files_to_remove.append(tgz)
        self._check_release_contents(tgz)
    finally:
        # returning configuration, even if one of the checks above failed,
        # so the modified setting does not leak to other tests
        with qdb.sql_connection.TRN:
            qdb.sql_connection.TRN.add(
                "UPDATE settings SET base_data_dir = '%s'" % obdr)
            qdb.sql_connection.TRN.execute()
394+
230395

231396
EXP_LAT_LONG = (
232397
'[[60.1102854322, 74.7123248382], [23.1218032799, 42.838497795],'

0 commit comments

Comments
 (0)