diff --git a/qiita_db/test/test_util.py b/qiita_db/test/test_util.py index 466eaed7b..11d5ed310 100644 --- a/qiita_db/test/test_util.py +++ b/qiita_db/test/test_util.py @@ -622,6 +622,16 @@ def test_get_timeseries_types(self): [10, 'mixed', 'combo intervention']] self.assertEqual(obs, exp) + def test_get_filepath_information(self): + obs = qdb.util.get_filepath_information(1) + # This path is machine specific. Just checking that it is not empty + self.assertIsNotNone(obs.pop('fullpath')) + exp = {'filepath_id': 1L, 'filepath': '1_s_G1_L001_sequences.fastq.gz', + 'filepath_type': 'raw_forward_seqs', 'checksum': '852952723', + 'data_type': 'raw_data', 'mountpoint': 'raw_data', + 'subdirectory': False, 'active': True} + self.assertEqual(obs, exp) + def test_filepath_id_to_rel_path(self): obs = qdb.util.filepath_id_to_rel_path(1) exp = 'raw_data/1_s_G1_L001_sequences.fastq.gz' diff --git a/qiita_db/util.py b/qiita_db/util.py index 25898fa54..b664bfa8b 100644 --- a/qiita_db/util.py +++ b/qiita_db/util.py @@ -649,6 +649,33 @@ def str_to_id(x): chain.from_iterable(qdb.sql_connection.TRN.execute()[idx:]))) +def _path_builder(db_dir, filepath, mountpoint, subdirectory, obj_id): + """Builds the path of a DB stored file + + Parameters + ---------- + db_dir : str + The DB base dir + filepath : str + The path stored in the DB + mountpoint : str + The mountpoint of the given file + subdirectory : bool + Whether the file is stored in a subdirectory in the mountpoint or not + obj_id : int + The id of the object to which the file is attached + + Returns + ------- + str + The full path of the given file + """ + if subdirectory: + return join(db_dir, mountpoint, str(obj_id), filepath) + else: + return join(db_dir, mountpoint, filepath) + + def retrieve_filepaths(obj_fp_table, obj_id_column, obj_id, sort=None, fp_type=None): """Retrieves the filepaths for the given object id @@ -674,12 +701,6 @@ def retrieve_filepaths(obj_fp_table, obj_id_column, obj_id, sort=None, object id """ - def 
path_builder(db_dir, filepath, mountpoint, subdirectory, obj_id): - if subdirectory: - return join(db_dir, mountpoint, str(obj_id), filepath) - else: - return join(db_dir, mountpoint, filepath) - sql_sort = "" if sort == 'ascending': sql_sort = " ORDER BY filepath_id" @@ -710,7 +731,7 @@ def path_builder(db_dir, filepath, mountpoint, subdirectory, obj_id): results = qdb.sql_connection.TRN.execute_fetchindex() db_dir = get_db_files_base_dir() - return [(fpid, path_builder(db_dir, fp, m, s, obj_id), fp_type_) + return [(fpid, _path_builder(db_dir, fp, m, s, obj_id), fp_type_) for fpid, fp, fp_type_, m, s in results] @@ -845,6 +866,38 @@ def move_filepaths_to_upload_folder(study_id, filepaths): qdb.sql_connection.TRN.execute() +def get_filepath_information(filepath_id): + """Gets the filepath information of filepath_id + + Parameters + ---------- + filepath_id : int + The filepath id + + Returns + ------- + dict + The filepath information + """ + with qdb.sql_connection.TRN: + sql = """SELECT filepath_id, filepath, filepath_type, checksum, + data_type, mountpoint, subdirectory, active, + artifact_id + FROM qiita.filepath + JOIN qiita.filepath_type USING (filepath_type_id) + JOIN qiita.data_directory USING (data_directory_id) + LEFT JOIN qiita.artifact_filepath USING (filepath_id) + WHERE filepath_id = %s""" + qdb.sql_connection.TRN.add(sql, [filepath_id]) + res = dict(qdb.sql_connection.TRN.execute_fetchindex()[0]) + + obj_id = res.pop('artifact_id') + res['fullpath'] = _path_builder(get_db_files_base_dir(), + res['filepath'], res['mountpoint'], + res['subdirectory'], obj_id) + return res + + def filepath_id_to_rel_path(filepath_id): """Gets the relative to the base directory of filepath_id diff --git a/qiita_pet/handlers/download.py b/qiita_pet/handlers/download.py index bb906e532..8b06f7230 100644 --- a/qiita_pet/handlers/download.py +++ b/qiita_pet/handlers/download.py @@ -10,52 +10,21 @@ from tornado.gen import coroutine from future.utils import viewitems -from 
os.path import basename, getsize, join +from os.path import basename, getsize, join, isdir from os import walk from datetime import datetime from .base_handlers import BaseHandler from qiita_pet.handlers.api_proxy import study_get_req from qiita_db.study import Study -from qiita_db.util import filepath_id_to_rel_path, get_db_files_base_dir +from qiita_db.util import (filepath_id_to_rel_path, get_db_files_base_dir, + get_filepath_information) from qiita_db.meta_util import validate_filepath_access_by_user from qiita_db.metadata_template.sample_template import SampleTemplate from qiita_db.metadata_template.prep_template import PrepTemplate from qiita_core.util import execute_as_transaction, get_release_info -class DownloadHandler(BaseHandler): - @authenticated - @coroutine - @execute_as_transaction - def get(self, filepath_id): - fid = int(filepath_id) - - if not validate_filepath_access_by_user(self.current_user, fid): - raise HTTPError( - 403, "%s doesn't have access to " - "filepath_id: %s" % (self.current_user.email, str(fid))) - - relpath = filepath_id_to_rel_path(fid) - fname = basename(relpath) - - # If we don't have nginx, write a file that indicates this - self.write("This installation of Qiita was not equipped with nginx, " - "so it is incapable of serving files. 
The file you " - "attempted to download is located at %s" % relpath) - - self.set_header('Content-Description', 'File Transfer') - self.set_header('Content-Type', 'application/octet-stream') - self.set_header('Content-Transfer-Encoding', 'binary') - self.set_header('Expires', '0') - self.set_header('Cache-Control', 'no-cache') - self.set_header('X-Accel-Redirect', '/protected/' + relpath) - self.set_header('Content-Disposition', - 'attachment; filename=%s' % fname) - - self.finish() - - class BaseHandlerDownload(BaseHandler): def _check_permissions(self, sid): # Check general access to study @@ -77,6 +46,149 @@ def _generate_files(self, header_name, accessions, filename): self.write(text) self.finish() + def _list_dir_files_nginx(self, dirpath): + """Generates the list of files in the given dirpath for nginx + + Parameters + ---------- + dirpath : str + Path to the directory + + Returns + ------- + list of (str, str, str) + The path information needed by nginx for each file in the + directory + """ + basedir = get_db_files_base_dir() + basedir_len = len(basedir) + 1 + to_download = [] + for dp, _, fps in walk(dirpath): + for fn in fps: + fullpath = join(dp, fn) + spath = fullpath + if fullpath.startswith(basedir): + spath = fullpath[basedir_len:] + to_download.append((fullpath, spath, spath)) + return to_download + + def _list_artifact_files_nginx(self, artifact): + """Generates a nginx list of files for the given artifact + + Parameters + ---------- + artifact : qiita_db.artifact.Artifact + The artifact to retrieve the files + + Returns + ------- + list of (str, str, str) + The path information needed by nginx for each file in the artifact + """ + basedir = get_db_files_base_dir() + basedir_len = len(basedir) + 1 + to_download = [] + for i, (fid, path, data_type) in enumerate(artifact.filepaths): + # ignore if tgz as they could create problems and the + # raw data is in the folder + if data_type == 'tgz': + continue + if isdir(path): + # If we have a 
directory, we actually need to list all the + # files from the directory so NGINX can actually download all + # of them + to_download.extend(self._list_dir_files_nginx(path)) + elif path.startswith(basedir): + spath = path[basedir_len:] + to_download.append((path, spath, spath)) + else: + to_download.append((path, path, path)) + + for pt in artifact.prep_templates: + qmf = pt.qiime_map_fp + if qmf is not None: + sqmf = qmf + if qmf.startswith(basedir): + sqmf = qmf[basedir_len:] + to_download.append( + (qmf, sqmf, 'mapping_files/%s_mapping_file.txt' + % artifact.id)) + return to_download + + def _write_nginx_file_list(self, to_download): + """Writes out the nginx file list + + Parameters + ---------- + to_download : list of (str, str, str) + The file list information + """ + all_files = '\n'.join( + ["- %s /protected/%s %s" % (getsize(fp), sfp, n) + for fp, sfp, n in to_download]) + + self.set_header('X-Archive-Files', 'zip') + self.write("%s\n" % all_files) + + def _set_nginx_headers(self, fname): + """Sets common nginx headers + + Parameters + ---------- + fname : str + Nginx's output filename + """ + self.set_header('Content-Description', 'File Transfer') + self.set_header('Expires', '0') + self.set_header('Cache-Control', 'no-cache') + self.set_header('Content-Disposition', + 'attachment; filename=%s' % fname) + + def _write_nginx_placeholder_file(self, fp): + """Writes nginx placeholder file in case that nginx is not set up + + Parameters + ---------- + fp : str + The path to be downloaded through nginx + """ + # If we don't have nginx, write a file that indicates this + self.write("This installation of Qiita was not equipped with " + "nginx, so it is incapable of serving files. 
The file " + "you attempted to download is located at %s" % fp) + + +class DownloadHandler(BaseHandlerDownload): + @authenticated + @coroutine + @execute_as_transaction + def get(self, filepath_id): + fid = int(filepath_id) + + if not validate_filepath_access_by_user(self.current_user, fid): + raise HTTPError( + 403, "%s doesn't have access to " + "filepath_id: %s" % (self.current_user.email, str(fid))) + + relpath = filepath_id_to_rel_path(fid) + fp_info = get_filepath_information(fid) + fname = basename(relpath) + + if fp_info['filepath_type'] in ('directory', 'html_summary_dir'): + # This is a directory, we need to list all the files so NGINX + # can download all of them + to_download = self._list_dir_files_nginx(fp_info['fullpath']) + self._write_nginx_file_list(to_download) + fname = '%s.zip' % fname + else: + self._write_nginx_placeholder_file(relpath) + self.set_header('Content-Type', 'application/octet-stream') + self.set_header('Content-Transfer-Encoding', 'binary') + self.set_header('X-Accel-Redirect', '/protected/' + relpath) + + self._set_nginx_headers(fname) + self.finish() + class DownloadStudyBIOMSHandler(BaseHandlerDownload): @authenticated @@ -85,67 +197,22 @@ class DownloadStudyBIOMSHandler(BaseHandlerDownload): def get(self, study_id): study_id = int(study_id) study = self._check_permissions(study_id) - - basedir = get_db_files_base_dir() - basedir_len = len(basedir) + 1 # loop over artifacts and retrieve those that we have access to to_download = [] for a in study.artifacts(): if a.artifact_type == 'BIOM': - for i, (fid, path, data_type) in enumerate(a.filepaths): - # ignore if tgz as they could create problems and the - # raw data is in the folder - if data_type == 'tgz': - continue - if data_type == 'directory': - # If we have a directory, we actually need to list - # all the files from the directory so NGINX can - # actually download all of them - for dp, _, fps in walk(path): - for fname in fps: - fullpath = join(dp, fname) - spath = 
fullpath - if fullpath.startswith(basedir): - spath = fullpath[basedir_len:] - to_download.append((fullpath, spath, spath)) - elif path.startswith(basedir): - spath = path[basedir_len:] - to_download.append((path, spath, spath)) - else: - # We are not aware of any case that can trigger this - # situation, but we wanted to be overly cautious - # There is no test for this line cause we don't know - # how to trigger it - to_download.append((path, path, path)) - - for pt in a.prep_templates: - qmf = pt.qiime_map_fp - if qmf is not None: - sqmf = qmf - if qmf.startswith(basedir): - sqmf = qmf[basedir_len:] - to_download.append( - (qmf, sqmf, 'mapping_files/%s_mapping_file.txt' - % a.id)) + to_download.extend(self._list_artifact_files_nginx(a)) - # If we don't have nginx, write a file that indicates this - all_files = '\n'.join(["- %s /protected/%s %s" % (getsize(fp), sfp, n) - for fp, sfp, n in to_download]) - self.write("%s\n" % all_files) + self._write_nginx_file_list(to_download) zip_fn = 'study_%d_%s.zip' % ( study_id, datetime.now().strftime('%m%d%y-%H%M%S')) - self.set_header('Content-Description', 'File Transfer') - self.set_header('Expires', '0') - self.set_header('Cache-Control', 'no-cache') - self.set_header('X-Archive-Files', 'zip') - self.set_header('Content-Disposition', - 'attachment; filename=%s' % zip_fn) + self._set_nginx_headers(zip_fn) self.finish() -class DownloadRelease(BaseHandler): +class DownloadRelease(BaseHandlerDownload): @coroutine def get(self, extras): _, relpath, _ = get_release_info() @@ -153,19 +220,14 @@ def get(self, extras): # If we don't have nginx, write a file that indicates this # Note that this configuration will automatically create and download # ("on the fly") the zip file via the contents in all_files - self.write("This installation of Qiita was not equipped with nginx, " - "so it is incapable of serving files. 
The file you " - "attempted to download is located at %s" % relpath) + self._write_nginx_placeholder_file(relpath) + + self._set_nginx_headers(basename(relpath)) - self.set_header('Content-Description', 'File Transfer') self.set_header('Content-Type', 'application/octet-stream') self.set_header('Content-Transfer-Encoding', 'binary') - self.set_header('Expires', '0') - self.set_header('Cache-Control', 'no-cache') self.set_header('X-Accel-Redirect', '/protected-working_dir/' + relpath) - self.set_header('Content-Disposition', - 'attachment; filename=%s' % basename(relpath)) self.finish() @@ -183,60 +245,18 @@ def get(self, study_id): self.current_user.email, str(study_id))) - basedir = get_db_files_base_dir() - basedir_len = len(basedir) + 1 # loop over artifacts and retrieve raw data (no parents) to_download = [] for a in study.artifacts(): if not a.parents: - for i, (fid, path, data_type) in enumerate(a.filepaths): - if data_type == 'directory': - # If we have a directory, we actually need to list - # all the files from the directory so NGINX can - # actually download all of them - for dp, _, fps in walk(path): - for fname in fps: - fullpath = join(dp, fname) - spath = fullpath - if fullpath.startswith(basedir): - spath = fullpath[basedir_len:] - to_download.append((fullpath, spath, spath)) - elif path.startswith(basedir): - spath = path[basedir_len:] - to_download.append((path, spath, spath)) - else: - # We are not aware of any case that can trigger this - # situation, but we wanted to be overly cautious - # There is no test for this line cause we don't know - # how to trigger it - to_download.append((path, path, path)) - - for pt in a.prep_templates: - qmf = pt.qiime_map_fp - if qmf is not None: - sqmf = qmf - if qmf.startswith(basedir): - sqmf = qmf[basedir_len:] - to_download.append( - (qmf, sqmf, 'mapping_files/%s_mapping_file.txt' - % a.id)) + to_download.extend(self._list_artifact_files_nginx(a)) - # If we don't have nginx, write a file that indicates this - 
# Note that this configuration will automatically create and download - # ("on the fly") the zip file via the contents in all_files - all_files = '\n'.join(["- %s /protected/%s %s" % (getsize(fp), sfp, n) - for fp, sfp, n in to_download]) - self.write("%s\n" % all_files) + self._write_nginx_file_list(to_download) zip_fn = 'study_raw_data_%d_%s.zip' % ( study_id, datetime.now().strftime('%m%d%y-%H%M%S')) - self.set_header('Content-Description', 'File Transfer') - self.set_header('Expires', '0') - self.set_header('Cache-Control', 'no-cache') - self.set_header('X-Archive-Files', 'zip') - self.set_header('Content-Disposition', - 'attachment; filename=%s' % zip_fn) + self._set_nginx_headers(zip_fn) self.finish() diff --git a/qiita_pet/test/test_download.py b/qiita_pet/test/test_download.py index d9d0415c8..10ed52bfe 100644 --- a/qiita_pet/test/test_download.py +++ b/qiita_pet/test/test_download.py @@ -8,10 +8,10 @@ from unittest import main from mock import Mock -from os.path import exists, isdir, join -from os import remove, makedirs +from os.path import exists, isdir, join, basename +from os import remove, makedirs, close from shutil import rmtree -from tempfile import mkdtemp +from tempfile import mkdtemp, mkstemp from biom.util import biom_open from biom import example_table as et @@ -28,9 +28,16 @@ class TestDownloadHandler(TestHandlerBase): def setUp(self): super(TestDownloadHandler, self).setUp() + self._clean_up_files = [] def tearDown(self): super(TestDownloadHandler, self).tearDown() + for fp in self._clean_up_files: + if exists(fp): + if isdir(fp): + rmtree(fp) + else: + remove(fp) def test_download(self): # check success @@ -45,6 +52,32 @@ def test_download(self): response = self.get('/download/1000') self.assertEqual(response.code, 403) + # directory + a = Artifact(1) + fd, fp = mkstemp(suffix='.html') + close(fd) + with open(fp, 'w') as f: + f.write('\n') + self._clean_up_files.append(fp) + dirpath = mkdtemp() + fd, fp2 = mkstemp(suffix='.txt', 
dir=dirpath) + close(fd) + with open(fp2, 'w') as f: + f.write('\n') + self._clean_up_files.append(dirpath) + a.set_html_summary(fp, support_dir=dirpath) + for fp_id, _, fp_type in a.filepaths: + if fp_type == 'html_summary_dir': + break + response = self.get('/download/%d' % fp_id) + self.assertEqual(response.code, 200) + + fp_name = basename(fp2) + dirname = basename(dirpath) + self.assertEqual( + response.body, "- 1 /protected/FASTQ/1/%s/%s FASTQ/1/%s/%s\n" + % (dirname, fp_name, dirname, fp_name)) + class TestDownloadStudyBIOMSHandler(TestHandlerBase):