Skip to content

Download bulk raw data #2104

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Apr 11, 2017
Binary file not shown.
Binary file not shown.
4 changes: 4 additions & 0 deletions qiita_pet/handlers/api_proxy/studies.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,10 @@ def study_get_req(study_id, user_id):
samples = study.sample_template
study_info['num_samples'] = 0 if samples is None else len(list(samples))
study_info['owner'] = study.owner.id
# Study.has_access no_public=True, will return True only if the user_id is
# the owner of the study or if the study is shared with the user_id
study_info['has_access_to_raw_data'] = study.has_access(
User(user_id), True)

return {'status': 'success',
'message': '',
Expand Down
2 changes: 1 addition & 1 deletion qiita_pet/handlers/api_proxy/tests/test_studies.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,9 +94,9 @@ def test_study_get_req(self):
'number_samples_collected': 27,
'owner': 'test@foo.bar',
'ebi_submission_status': 'submitted',
'has_access_to_raw_data': True,
'ebi_study_accession': 'EBI123456-BB'},
'editable': True}

self.assertEqual(obs, exp)

# Test with no lab person
Expand Down
105 changes: 87 additions & 18 deletions qiita_pet/handlers/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,21 +57,13 @@ def get(self, study_id):
str(study_id)))

study = Study(study_id)
user = self.current_user
basedir = get_db_files_base_dir()
basedir_len = len(basedir) + 1
# loop over artifacts and retrieve those that we have access to
to_download = []
vfabu = validate_filepath_access_by_user
for a in study.artifacts():
if a.artifact_type == 'BIOM':
to_add = True
for i, (fid, path, data_type) in enumerate(a.filepaths):
# validate access only of the first artifact filepath,
# the rest have the same permissions
if (i == 0 and not vfabu(user, fid)):
to_add = False
break
# ignore if tgz as they could create problems and the
# raw data is in the folder
if data_type == 'tgz':
Expand All @@ -97,16 +89,15 @@ def get(self, study_id):
# how to trigger it
to_download.append((path, path, path))

if to_add:
for pt in a.prep_templates:
qmf = pt.qiime_map_fp
if qmf is not None:
sqmf = qmf
if qmf.startswith(basedir):
sqmf = qmf[basedir_len:]
to_download.append(
(qmf, sqmf, 'mapping_files/%s_mapping_file.txt'
% a.id))
for pt in a.prep_templates:
qmf = pt.qiime_map_fp
if qmf is not None:
sqmf = qmf
if qmf.startswith(basedir):
sqmf = qmf[basedir_len:]
to_download.append(
(qmf, sqmf, 'mapping_files/%s_mapping_file.txt'
% a.id))

# If we don't have nginx, write a file that indicates this
all_files = '\n'.join(["- %s /protected/%s %s" % (getsize(fp), sfp, n)
Expand All @@ -130,6 +121,8 @@ def get(self, extras):
_, relpath, _ = get_release_info()

# If we don't have nginx, write a file that indicates this
# Note that this configuration will automatically create and download
# ("on the fly") the zip file via the contents in all_files
self.write("This installation of Qiita was not equipped with nginx, "
"so it is incapable of serving files. The file you "
"attempted to download is located at %s" % relpath)
Expand All @@ -143,5 +136,81 @@ def get(self, extras):
'/protected-working_dir/' + relpath)
self.set_header('Content-Disposition',
'attachment; filename=%s' % basename(relpath))
self.finish()


class DownloadRawData(BaseHandler):
    """Handler that serves all the raw data of a study as a single zip.

    Only artifacts without parents are considered raw data. For each of
    them, every attached file is listed (directories are expanded file by
    file) together with the QIIME mapping file of each of its prep templates.
    """

    @authenticated
    @execute_as_transaction
    def get(self, study_id):
        """Write the manifest of raw files and the zip download headers.

        Parameters
        ----------
        study_id : str
            The study id as captured from the URL; it is cast to int.

        Raises
        ------
        HTTPError
            405 if ``study_get_req`` does not report success for the current
            user, or if ``Study.has_access(user, True)`` is False.
        """
        study_id = int(study_id)
        user = self.current_user

        # Check general access to study
        study_info = study_get_req(study_id, user.id)
        if study_info['status'] != 'success':
            raise HTTPError(405, "%s: %s, %s" % (study_info['message'],
                                                 user.email,
                                                 str(study_id)))

        study = Study(study_id)
        # Check "owner" access to the study; the second argument (True) is
        # passed so that a public study is not enough to grant raw data
        # access
        if not study.has_access(user, True):
            raise HTTPError(405, "%s: %s, %s" % ('No raw data access',
                                                 user.email,
                                                 str(study_id)))

        basedir = get_db_files_base_dir()
        # +1 so the path separator that follows basedir is dropped as well
        basedir_len = len(basedir) + 1

        def _strip_basedir(fp):
            # Path relative to basedir, so it can be appended to /protected/.
            # We are not aware of any case in which a path is outside of
            # basedir, but we want to be overly cautious and return it
            # untouched; there is no test for this cause we don't know how
            # to trigger it
            return fp[basedir_len:] if fp.startswith(basedir) else fp

        # loop over artifacts and retrieve raw data (no parents)
        to_download = []
        for a in study.artifacts():
            if a.parents:
                continue

            for _, path, data_type in a.filepaths:
                if data_type == 'directory':
                    # If we have a directory, we actually need to list
                    # all the files from the directory so NGINX can
                    # actually download all of them
                    for dp, _, fps in walk(path):
                        for fname in fps:
                            fullpath = join(dp, fname)
                            spath = _strip_basedir(fullpath)
                            to_download.append((fullpath, spath, spath))
                else:
                    spath = _strip_basedir(path)
                    to_download.append((path, spath, spath))

            for pt in a.prep_templates:
                qmf = pt.qiime_map_fp
                if qmf is not None:
                    to_download.append(
                        (qmf, _strip_basedir(qmf),
                         'mapping_files/%s_mapping_file.txt' % a.id))

        # No zip file is created here: the response body is the list of
        # files, one "- <size> /protected/<path> <name in zip>" line each,
        # and the X-Archive-Files header below asks nginx to create and
        # download ("on the fly") the zip file via the contents in all_files
        all_files = '\n'.join(["- %s /protected/%s %s" % (getsize(fp), sfp, n)
                               for fp, sfp, n in to_download])
        self.write("%s\n" % all_files)

        zip_fn = 'study_raw_data_%d_%s.zip' % (
            study_id, datetime.now().strftime('%m%d%y-%H%M%S'))

        self.set_header('Content-Description', 'File Transfer')
        self.set_header('Expires', '0')
        self.set_header('Cache-Control', 'no-cache')
        self.set_header('X-Archive-Files', 'zip')
        self.set_header('Content-Disposition',
                        'attachment; filename=%s' % zip_fn)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

where is the file created?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nowhere; it is created on the fly by nginx. Basically, we send nginx a list of filepaths using the "protected" location (which only nginx can access), and the zip is built while it is being downloaded.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, thought that might be the case. Can you add a comment since it's implicit?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

eh?

self.finish()
3 changes: 3 additions & 0 deletions qiita_pet/templates/study_base.html
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,9 @@
<button class="btn btn-default btn-block" onclick="populate_main_div('{% raw qiita_config.portal_dir %}/study/new_prep_template/', { study_id: {{study_info['study_id']}} })" id="add-new-preparation-btn"><span class="glyphicon glyphicon-plus-sign"></span> Add New Preparation</button>
{% end %}
<a class="btn btn-default btn-block" href="{% raw qiita_config.portal_dir %}/download_study_bioms/{{study_info['study_id']}}"><span class="glyphicon glyphicon-download-alt"></span> All QIIME maps and BIOMs</a>
{% if study_info['has_access_to_raw_data'] %}
<a class="btn btn-default btn-block" href="{% raw qiita_config.portal_dir %}/download_raw_data/{{study_info['study_id']}}"><span class="glyphicon glyphicon-download-alt"></span> All raw data</a>
{% end %}
<div style="text-align: center;"><small><a href="{% raw qiita_config.portal_dir %}/static/doc/html/faq.html#how-to-solve-unzip-errors">Issues opening the downloaded zip?</a></small></div>

<div id="data-types-menu"></div>
Expand Down
51 changes: 49 additions & 2 deletions qiita_pet/test/test_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from qiita_pet.test.tornado_test_base import TestHandlerBase
from qiita_pet.handlers.base_handlers import BaseHandler
from qiita_db.user import User
from qiita_db.study import Study
from qiita_db.artifact import Artifact
from qiita_db.software import Parameters, Command

Expand Down Expand Up @@ -77,8 +78,6 @@ def test_download_study(self):
with open(tgz, 'w') as f:
f.write('\n')

self._clean_up_files.append(tmp_dir)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed because it was appended twice; see line 65.


files_biom = [(biom_fp, 'biom'), (smr_dir, 'directory'), (tgz, 'tgz')]

params = Parameters.from_default_params(
Expand Down Expand Up @@ -156,5 +155,53 @@ def test_download(self):
"is located at", response.body)


class TestDownloadRawData(TestHandlerBase):
    """Tests for the /download_raw_data/<study_id> handler."""

    def setUp(self):
        super(TestDownloadRawData, self).setUp()
        # paths created by a test that tearDown must remove
        self._clean_up_files = []

    def tearDown(self):
        super(TestDownloadRawData, self).tearDown()
        for fp in self._clean_up_files:
            if exists(fp):
                if isdir(fp):
                    rmtree(fp)
                else:
                    remove(fp)

    def test_download_raw_data(self):
        # it's possible that one of the tests is deleting the raw data
        # so we will make sure that the files exists so this test passes
        all_files = [fp for a in Study(1).artifacts()
                     for _, fp, _ in a.filepaths]
        for fp in all_files:
            if not exists(fp):
                with open(fp, 'w') as f:
                    f.write('')
        response = self.get('/download_raw_data/1')
        self.assertEqual(response.code, 200)

        exp = (
            '- 0 /protected/raw_data/1_s_G1_L001_sequences.fastq.gz '
            'raw_data/1_s_G1_L001_sequences.fastq.gz\n'
            '- 0 /protected/raw_data/1_s_G1_L001_sequences_barcodes.fastq.gz '
            'raw_data/1_s_G1_L001_sequences_barcodes.fastq.gz\n'
            '- 36615 /protected/templates/1_prep_1_qiime_[0-9]*-[0-9]*.txt '
            'mapping_files/1_mapping_file.txt\n'
            '- 36615 /protected/templates/1_prep_2_qiime_[0-9]*-[0-9]*.txt '
            'mapping_files/7_mapping_file.txt\n')
        self.assertRegexpMatches(response.body, exp)

        # the failure cases must hit the raw data endpoint, not the BIOM
        # one, so the access checks of the handler under test are exercised
        response = self.get('/download_raw_data/200')
        self.assertEqual(response.code, 405)

        # changing user so we can test the failures
        BaseHandler.get_current_user = Mock(
            return_value=User("demo@microbio.me"))
        response = self.get('/download_raw_data/1')
        self.assertEqual(response.code, 405)


if __name__ == '__main__':
main()
4 changes: 3 additions & 1 deletion qiita_pet/webserver.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@
from qiita_pet.handlers.upload import UploadFileHandler, StudyUploadFileHandler
from qiita_pet.handlers.stats import StatsHandler
from qiita_pet.handlers.download import (
DownloadHandler, DownloadStudyBIOMSHandler, DownloadRelease)
DownloadHandler, DownloadStudyBIOMSHandler, DownloadRelease,
DownloadRawData)
from qiita_pet.handlers.prep_template import PrepTemplateHandler
from qiita_pet.handlers.ontology import OntologyHandler
from qiita_db.handlers.processing_job import (
Expand Down Expand Up @@ -150,6 +151,7 @@ def __init__(self):
(r"/download/(.*)", DownloadHandler),
(r"/download_study_bioms/(.*)", DownloadStudyBIOMSHandler),
(r"/release/download/(.*)", DownloadRelease),
(r"/download_raw_data/(.*)", DownloadRawData),
(r"/vamps/(.*)", VAMPSHandler),
# Plugin handlers - the order matters here so do not change
# qiita_db/jobs/(.*) should go after any of the
Expand Down