From 9260d96924610a57a8e67b0bcbb74f35d48b7d75 Mon Sep 17 00:00:00 2001
From: John Chilton
Date: Fri, 15 Dec 2017 13:55:35 -0500
Subject: [PATCH] [WIP] Hierarchical upload API optimized for folders & collections.

Allows describing hierarchical data in JSON or inferring structure from archives or directories. Datasets or archive sources can be specified via uploads, URLs, paths (if admin && allow_path_paste), library_import_dir/user_library_import_dir, and/or FTP imports. Unlike existing API endpoints, a mix of these is allowed on a per-file basis, and they work seamlessly between libraries and histories. Supported "archives" include gzip, zip, bagit directories, and bagit archives (with fetching and validation of downloads).

The existing upload API endpoint is quite rough to work with, both in terms of adding parameters (e.g. the file type and dbkey handling in 4563 was difficult to implement, terribly hacky, and should seemingly have been trivial) and in terms of building requests (one needs to build a tool form rather than describe sensible inputs in JSON). This API is built to be intelligible from an API standpoint instead of being constrained to the older-style tool form. Additionally, it is built with hierarchical data in mind, in a way that would not be at all easy to achieve by enhancing tool form components we don't even render.

This implements 5159, though much simpler YAML descriptions of data libraries should now be possible that essentially mirror the API descriptions. We can replace the data library script in Ephemeris (https://github.com/galaxyproject/ephemeris/blob/master/ephemeris/setup_data_libraries.py) with one that converts a simple YAML file into an API call and gains many new options for free. In future PRs I'll add filtering options to this and it will serve as the backend to 4733.
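To give a feel for the request shape, here is a minimal sketch of driving the new `/api/tools/fetch` route with `requests`, modeled on `scripts/api/fetch_to_library.py` and the new API tests in this patch; it is an illustration, not part of the diff. `GALAXY_URL` and `API_KEY` are placeholder assumptions, and the target layout (one `hdca` destination populated from a single URL item) follows `test_upload_collection` and `test_fetch_single_url_to_folder` below:

```python
# Sketch only: assumes a running Galaxy with this patch applied.
# GALAXY_URL and API_KEY are placeholders, not values from the patch.
import json

import requests

GALAXY_URL = "https://galaxy.example.org"
API_KEY = "<your API key>"

# One target: fetch a single URL into a new flat list collection.
target = {
    "destination": {"type": "hdca"},
    "collection_type": "list",
    "items": [
        {
            "src": "url",
            "url": "https://raw.githubusercontent.com/galaxyproject/galaxy/dev/test-data/4.bed",
            "ext": "bed",
            "dbkey": "hg19",
        }
    ],
}

# Create a history to receive the collection, then post the fetch request.
history = requests.post(GALAXY_URL + "/api/histories", data={"key": API_KEY}).json()
payload = {
    "key": API_KEY,
    "history_id": history["id"],
    "targets": json.dumps([target]),
}
response = requests.post(GALAXY_URL + "/api/tools/fetch", data=payload)
print(response.json())
```

The same `targets` structure works for library destinations - swap the `destination` dict for `{"type": "library", "name": "..."}` as in `fetch_to_library_example.yml` and `test_fetch_recursive_archive_to_library`.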
--- lib/galaxy/app.py | 4 + lib/galaxy/datatypes/sniff.py | 7 + lib/galaxy/datatypes/upload_util.py | 47 +++ .../dependencies/pinned-requirements.txt | 1 + lib/galaxy/jobs/__init__.py | 2 +- lib/galaxy/managers/collections.py | 19 +- lib/galaxy/tools/__init__.py | 12 +- lib/galaxy/tools/actions/upload.py | 55 ++++ lib/galaxy/tools/actions/upload_common.py | 8 +- lib/galaxy/tools/data_fetch.py | 290 ++++++++++++++++++ lib/galaxy/tools/data_fetch.xml | 33 ++ lib/galaxy/tools/execute.py | 2 +- lib/galaxy/tools/parameters/output_collect.py | 183 ++++++++++- lib/galaxy/tools/special_tools.py | 1 + lib/galaxy/webapps/galaxy/api/_fetch_util.py | 134 ++++++++ lib/galaxy/webapps/galaxy/api/tools.py | 45 +++ lib/galaxy/webapps/galaxy/buildapp.py | 1 + scripts/api/fetch_to_library.py | 33 ++ scripts/api/fetch_to_library_example.yml | 42 +++ test-data/example-bag.zip | Bin 0 -> 2966 bytes test-data/testdir1.zip | Bin 0 -> 825 bytes test/api/test_dataset_collections.py | 31 ++ test/api/test_libraries.py | 89 ++++++ test/base/integration_util.py | 3 + test/base/populators.py | 49 ++- .../test_upload_configuration_options.py | 65 ++++ tools/data_source/upload.py | 42 +-- 27 files changed, 1137 insertions(+), 61 deletions(-) create mode 100644 lib/galaxy/datatypes/upload_util.py create mode 100644 lib/galaxy/tools/data_fetch.py create mode 100644 lib/galaxy/tools/data_fetch.xml create mode 100644 lib/galaxy/webapps/galaxy/api/_fetch_util.py create mode 100644 scripts/api/fetch_to_library.py create mode 100644 scripts/api/fetch_to_library_example.yml create mode 100644 test-data/example-bag.zip create mode 100644 test-data/testdir1.zip diff --git a/lib/galaxy/app.py b/lib/galaxy/app.py index 25de26171058..ce22ccd7f396 100644 --- a/lib/galaxy/app.py +++ b/lib/galaxy/app.py @@ -12,6 +12,8 @@ from galaxy import config, jobs from galaxy.jobs import metrics as job_metrics from galaxy.managers.collections import DatasetCollectionManager +from galaxy.managers.folders import FolderManager +from galaxy.managers.libraries import LibraryManager from galaxy.managers.tags import GalaxyTagManager from galaxy.openid.providers import OpenIDProviders from galaxy.queue_worker import GalaxyQueueWorker @@ -90,6 +92,8 @@ def __init__(self, **kwargs): self.tag_handler = GalaxyTagManager(self.model.context) # Dataset Collection Plugins self.dataset_collections_service = DatasetCollectionManager(self) + self.library_folder_manager = FolderManager() + self.library_manager = LibraryManager() # Tool Data Tables self._configure_tool_data_tables(from_shed_config=False) diff --git a/lib/galaxy/datatypes/sniff.py b/lib/galaxy/datatypes/sniff.py index cf058cf069ac..4f019234e142 100644 --- a/lib/galaxy/datatypes/sniff.py +++ b/lib/galaxy/datatypes/sniff.py @@ -14,6 +14,7 @@ import zipfile from six import text_type +from six.moves.urllib.request import urlopen from galaxy import util from galaxy.util import compression_utils @@ -39,6 +40,12 @@ def get_test_fname(fname): return full_path +def stream_url_to_file(path): + page = urlopen(path) # page will be .close()ed in stream_to_file + temp_name = stream_to_file(page, prefix='url_paste', source_encoding=util.get_charset_from_http_headers(page.headers)) + return temp_name + + def stream_to_open_named_file(stream, fd, filename, source_encoding=None, source_error='strict', target_encoding=None, target_error='strict'): """Writes a stream to the provided file descriptor, returns the file name. 
Closes file descriptor""" # signature and behavor is somewhat odd, due to backwards compatibility, but this can/should be done better diff --git a/lib/galaxy/datatypes/upload_util.py b/lib/galaxy/datatypes/upload_util.py new file mode 100644 index 000000000000..97bb11862ca3 --- /dev/null +++ b/lib/galaxy/datatypes/upload_util.py @@ -0,0 +1,47 @@ +from galaxy.datatypes import sniff +from galaxy.datatypes.binary import Binary + + +class UploadProblemException(Exception): + + def __init__(self, message): + self.message = message + + +def handle_unsniffable_binary_check(data_type, ext, path, name, is_binary, requested_ext, check_content, registry): + """Return modified values of data_type and ext if unsniffable binary encountered. + + Throw UploadProblemException if content problems or extension mismatches occur. + + Precondition: check_binary called returned True. + """ + if is_binary or registry.is_extension_unsniffable_binary(requested_ext): + # We have a binary dataset, but it is not Bam, Sff or Pdf + data_type = 'binary' + parts = name.split(".") + if len(parts) > 1: + ext = parts[-1].strip().lower() + is_ext_unsniffable_binary = registry.is_extension_unsniffable_binary(ext) + if check_content and not is_ext_unsniffable_binary: + raise UploadProblemException('The uploaded binary file contains inappropriate content') + + elif is_ext_unsniffable_binary and requested_ext != ext: + err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % (ext, ext) + raise UploadProblemException(err_msg) + return data_type, ext + + +def handle_sniffable_binary_check(data_type, ext, path, registry): + """Return modified values of data_type and ext if sniffable binary encountered. + + Precondition: check_binary called returned True. + """ + # Sniff the data type + guessed_ext = sniff.guess_ext(path, registry.sniff_order) + # Set data_type only if guessed_ext is a binary datatype + datatype = registry.get_datatype_by_extension(guessed_ext) + if isinstance(datatype, Binary): + data_type = guessed_ext + ext = guessed_ext + + return data_type, ext diff --git a/lib/galaxy/dependencies/pinned-requirements.txt b/lib/galaxy/dependencies/pinned-requirements.txt index adddb8247b9f..f5cb58868e9f 100644 --- a/lib/galaxy/dependencies/pinned-requirements.txt +++ b/lib/galaxy/dependencies/pinned-requirements.txt @@ -18,6 +18,7 @@ pysam>=0.13 #python_lzo==1.8 # pure Python packages +bdbag==1.1.1 bz2file==0.98; python_version < '3.3' ipaddress==1.0.18; python_version < '3.3' boltons==17.1.0 diff --git a/lib/galaxy/jobs/__init__.py b/lib/galaxy/jobs/__init__.py index 82107f160940..d5f239f083e1 100644 --- a/lib/galaxy/jobs/__init__.py +++ b/lib/galaxy/jobs/__init__.py @@ -1366,7 +1366,7 @@ def path_rewriter(path): collected_datasets = { 'primary': self.tool.collect_primary_datasets(out_data, self.get_tool_provided_job_metadata(), tool_working_directory, input_ext, input_dbkey) } - self.tool.collect_dynamic_collections( + self.tool.collect_dynamic_outputs( out_collections, self.get_tool_provided_job_metadata(), job_working_directory=tool_working_directory, diff --git a/lib/galaxy/managers/collections.py b/lib/galaxy/managers/collections.py index e823b0f44ca3..cff0d0cdc241 100644 --- a/lib/galaxy/managers/collections.py +++ b/lib/galaxy/managers/collections.py @@ -46,17 +46,22 @@ def __init__(self, app): self.tag_manager = tags.GalaxyTagManager(app.model.context) self.ldda_manager = lddas.LDDAManager(app) - def precreate_dataset_collection_instance(self, trans, parent, name, implicit_inputs, 
implicit_output_name, structure): + def precreate_dataset_collection_instance(self, trans, parent, name, structure, implicit_inputs=None, implicit_output_name=None): # TODO: prebuild all required HIDs and send them in so no need to flush in between. - dataset_collection = self.precreate_dataset_collection(structure) + dataset_collection = self.precreate_dataset_collection(structure, allow_unitialized_element=implicit_output_name is not None) instance = self._create_instance_for_collection( trans, parent, name, dataset_collection, implicit_inputs=implicit_inputs, implicit_output_name=implicit_output_name, flush=False ) return instance - def precreate_dataset_collection(self, structure): - if structure.is_leaf or not structure.children_known: - return model.DatasetCollectionElement.UNINITIALIZED_ELEMENT + def precreate_dataset_collection(self, structure, allow_unitialized_element=True): + has_structure = not structure.is_leaf and structure.children_known + if not has_structure and allow_unitialized_element: + dataset_collection = model.DatasetCollectionElement.UNINITIALIZED_ELEMENT + elif not has_structure: + collection_type_description = structure.collection_type_description + dataset_collection = model.DatasetCollection(populated=False) + dataset_collection.collection_type = collection_type_description.collection_type else: collection_type_description = structure.collection_type_description dataset_collection = model.DatasetCollection(populated=False) @@ -67,7 +72,7 @@ def precreate_dataset_collection(self, structure): if substructure.is_leaf: element = model.DatasetCollectionElement.UNINITIALIZED_ELEMENT else: - element = self.precreate_dataset_collection(substructure) + element = self.precreate_dataset_collection(substructure, allow_unitialized_element=allow_unitialized_element) element = model.DatasetCollectionElement( element=element, @@ -78,7 +83,7 @@ def precreate_dataset_collection(self, structure): dataset_collection.elements = elements dataset_collection.element_count = len(elements) - return dataset_collection + return dataset_collection def create(self, trans, parent, name, collection_type, element_identifiers=None, elements=None, implicit_collection_info=None, trusted_identifiers=None, diff --git a/lib/galaxy/tools/__init__.py b/lib/galaxy/tools/__init__.py index 52b42f94fdff..60d7e4b5631e 100755 --- a/lib/galaxy/tools/__init__.py +++ b/lib/galaxy/tools/__init__.py @@ -101,6 +101,7 @@ # Tools that require Galaxy's Python environment to be preserved. GALAXY_LIB_TOOLS_UNVERSIONED = [ "upload1", + "__DATA_FETCH__", # Legacy tools bundled with Galaxy. 
"vcf_to_maf_customtrack1", "laj_1", @@ -1039,7 +1040,10 @@ def parse_input_elem(self, page_source, enctypes, context=None): group.file_type_name = elem.get('file_type_name', group.file_type_name) group.default_file_type = elem.get('default_file_type', group.default_file_type) group.metadata_ref = elem.get('metadata_ref', group.metadata_ref) - rval[group.file_type_name].refresh_on_change = True + try: + rval[group.file_type_name].refresh_on_change = True + except KeyError: + pass group_page_source = XmlPageSource(elem) group.inputs = self.parse_input_elem(group_page_source, enctypes, context) rval[group.name] = group @@ -1576,10 +1580,10 @@ def collect_primary_datasets(self, output, tool_provided_metadata, job_working_d """ return output_collect.collect_primary_datasets(self, output, tool_provided_metadata, job_working_directory, input_ext, input_dbkey=input_dbkey) - def collect_dynamic_collections(self, output, tool_provided_metadata, **kwds): - """ Find files corresponding to dynamically structured collections. + def collect_dynamic_outputs(self, output, tool_provided_metadata, **kwds): + """Collect dynamic outputs associated with a job from this tool. """ - return output_collect.collect_dynamic_collections(self, output, tool_provided_metadata, **kwds) + return output_collect.collect_dynamic_outputs(self, output, tool_provided_metadata, **kwds) def to_archive(self): tool = self diff --git a/lib/galaxy/tools/actions/upload.py b/lib/galaxy/tools/actions/upload.py index 70bf8152c44a..c19fb960ac86 100644 --- a/lib/galaxy/tools/actions/upload.py +++ b/lib/galaxy/tools/actions/upload.py @@ -1,3 +1,4 @@ +import json import logging from galaxy.tools.actions import upload_common @@ -36,3 +37,57 @@ def execute(self, tool, trans, incoming={}, set_output_hid=True, history=None, * rval = upload_common.create_job(trans, incoming, tool, json_file_path, data_list, history=history) log.debug("Created upload job %s" % create_job_timer) return rval + + +class FetchUploadToolAction(ToolAction): + + def execute(self, tool, trans, incoming={}, set_output_hid=True, history=None, **kwargs): + dataset_upload_inputs = [] + for input_name, input in tool.inputs.items(): + if input.type == "upload_dataset": + dataset_upload_inputs.append(input) + assert dataset_upload_inputs, Exception("No dataset upload groups were found.") + + persisting_uploads_timer = ExecutionTimer() + # precreated_datasets = upload_common.get_precreated_datasets(trans, incoming, trans.app.model.HistoryDatasetAssociation) + incoming = upload_common.persist_uploads(incoming, trans) + log.debug("Persisted uploads %s" % persisting_uploads_timer) + + # Now replace references in requests with these. 
+ files = incoming.get("files", []) + files_iter = iter(files) + request = json.loads(incoming.get("request_json")) + + def replace_file_srcs(request_part): + if isinstance(request_part, dict): + if request_part.get("src", None) == "files": + path_def = next(files_iter) + request_part["path"] = path_def["file_data"]["local_filename"] + if "name" not in request_part: + request_part["name"] = path_def["file_data"]["filename"] + request_part["src"] = "path" + else: + for key, value in request_part.items(): + replace_file_srcs(value) + elif isinstance(request_part, list): + for value in request_part: + replace_file_srcs(value) + + replace_file_srcs(request) + + incoming["request_json"] = json.dumps(request) + log.info("incoming are %s" % incoming) + # We can pass an empty string as the cntrller here since it is used to check whether we + # are in an admin view, and this tool is currently not used there. + check_and_cleanup_timer = ExecutionTimer() + # uploaded_datasets = upload_common.get_uploaded_datasets(trans, '', incoming, precreated_datasets, dataset_upload_inputs, history=history) + # upload_common.cleanup_unused_precreated_datasets(precreated_datasets) + + # if not uploaded_datasets: + # return None, 'No data was entered in the upload form, please go back and choose data to upload.' + + log.debug("Checked and cleaned uploads %s" % check_and_cleanup_timer) + create_job_timer = ExecutionTimer() + rval = upload_common.create_job(trans, incoming, tool, None, [], history=history) + log.debug("Created upload job %s" % create_job_timer) + return rval diff --git a/lib/galaxy/tools/actions/upload_common.py b/lib/galaxy/tools/actions/upload_common.py index ae70cae17de3..dc7831403123 100644 --- a/lib/galaxy/tools/actions/upload_common.py +++ b/lib/galaxy/tools/actions/upload_common.py @@ -123,7 +123,7 @@ def persist_uploads(params, trans): local_filename=local_filename) elif type(f) == dict and 'local_filename' not in f: raise Exception('Uploaded file was encoded in a way not understood by Galaxy.') - if upload_dataset['url_paste'] and upload_dataset['url_paste'].strip() != '': + if 'url_paste' in upload_dataset and upload_dataset['url_paste'] and upload_dataset['url_paste'].strip() != '': upload_dataset['url_paste'] = datatypes.sniff.stream_to_file( StringIO(validate_url(upload_dataset['url_paste'], trans.app.config.fetch_url_whitelist_ips)), prefix="strio_url_paste_" @@ -334,7 +334,11 @@ def new_upload(trans, cntrller, uploaded_dataset, library_bunch=None, history=No def get_uploaded_datasets(trans, cntrller, params, precreated_datasets, dataset_upload_inputs, library_bunch=None, history=None): uploaded_datasets = [] for dataset_upload_input in dataset_upload_inputs: - uploaded_datasets.extend(dataset_upload_input.get_uploaded_datasets(trans, params)) + try: + uploaded_datasets.extend(dataset_upload_input.get_uploaded_datasets(trans, params)) + except AttributeError: + # TODO: refine... 
+ pass for uploaded_dataset in uploaded_datasets: data = get_precreated_dataset(precreated_datasets, uploaded_dataset.name) if not data: diff --git a/lib/galaxy/tools/data_fetch.py b/lib/galaxy/tools/data_fetch.py new file mode 100644 index 000000000000..4ea9ccd79d38 --- /dev/null +++ b/lib/galaxy/tools/data_fetch.py @@ -0,0 +1,290 @@ +import argparse +import errno +import json +import os +import shutil +import sys +import tempfile + +import bdbag.bdbag_api + +from galaxy.datatypes import sniff +from galaxy.datatypes.registry import Registry +from galaxy.datatypes.upload_util import ( + handle_sniffable_binary_check, + handle_unsniffable_binary_check, + UploadProblemException, +) +from galaxy.util import in_directory +from galaxy.util.checkers import ( + check_binary, + check_html, +) +from galaxy.util.compression_utils import CompressedFile + +DESCRIPTION = """Data Import Script""" + + +def main(argv=None): + if argv is None: + argv = sys.argv[1:] + args = _arg_parser().parse_args(argv) + + registry = Registry() + registry.load_datatypes(root_dir=args.galaxy_root, config=args.datatypes_registry) + + request_path = args.request + assert os.path.exists(request_path) + with open(request_path) as f: + request = json.load(f) + + upload_config = UploadConfig(request, registry) + galaxy_json = _request_to_galaxy_json(upload_config, request) + with open("galaxy.json", "w") as f: + json.dump(galaxy_json, f) + + +def _request_to_galaxy_json(upload_config, request): + targets = request.get("targets", []) + fetched_targets = [] + + for target in targets: + fetched_target = _fetch_target(upload_config, target) + fetched_targets.append(fetched_target) + + return {"__unnamed_outputs": fetched_targets} + + +def _fetch_target(upload_config, target): + destination = target.get("destination", None) + assert destination, "No destination defined." + + items_from = target.get("items_from", None) + assert not items_from or items_from in ["archive", "bagit", "bagit_archive"] + if items_from == "archive": + decompressed_directory = _decompress_target(target) + items = _directory_to_items(decompressed_directory) + elif items_from == "bagit": + _, items_from_path = _has_src_to_path(target) + items = _bagit_to_items(items_from_path) + elif items_from == "bagit_archive": + decompressed_directory = _decompress_target(target) + items = _bagit_to_items(decompressed_directory) + else: + items = target.get("items") + assert items is not None, "No item definition found for destination [%s]" % destination + + fetched_target = {} + fetched_target["destination"] = destination + if "collection_type" in target: + fetched_target["collection_type"] = target["collection_type"] + + def _resolve_src(item): + converted_path = None + + name, path = _has_src_to_path(item) + dbkey = item.get("dbkey", "?") + requested_ext = item.get("ext", "auto") + info = item.get("info", None) + link_data_only = upload_config.link_data_only + if "link_data_only" in item: + # Allow overriding this on a per file basis. + link_data_only = _link_data_only(item) + to_posix_lines = upload_config.get_option(item, "to_posix_lines") + space_to_tab = upload_config.get_option(item, "space_to_tab") + in_place = item.get("in_place", False) + purge_source = item.get("purge_source", True) + + # Follow upload.py logic but without the auto-decompress logic. 
+ registry = upload_config.registry + check_content = upload_config.check_content + data_type, ext = None, requested_ext + + is_binary = check_binary(path) + if is_binary: + data_type, ext = handle_sniffable_binary_check(data_type, ext, path, registry) + if data_type is None: + if is_binary: + data_type, ext = handle_unsniffable_binary_check( + data_type, ext, path, name, is_binary, requested_ext, check_content, registry + ) + if not data_type and check_content and check_html(path): + raise UploadProblemException('The uploaded file contains inappropriate HTML content') + + if data_type != 'binary': + if not link_data_only: + if to_posix_lines: + if space_to_tab: + line_count, converted_path = sniff.convert_newlines_sep2tabs(path, in_place=in_place, tmp_dir=".") + else: + line_count, converted_path = sniff.convert_newlines(path, in_place=in_place, tmp_dir=".") + + if requested_ext == 'auto': + ext = sniff.guess_ext(path, registry.sniff_order) + else: + ext = requested_ext + + data_type = ext + + if ext == 'auto' and data_type == 'binary': + ext = 'data' + if ext == 'auto' and requested_ext: + ext = requested_ext + if ext == 'auto': + ext = 'data' + + datatype = registry.get_datatype_by_extension(ext) + if link_data_only: + # Never alter a file that will not be copied to Galaxy's local file store. + if datatype.dataset_content_needs_grooming(path): + err_msg = 'The uploaded files need grooming, so change your Copy data into Galaxy? selection to be ' + \ + 'Copy files into Galaxy instead of Link to files without copying into Galaxy so grooming can be performed.' + raise UploadProblemException(err_msg) + + # If this file is not in the workdir make sure it gets there. + if not link_data_only and converted_path: + path = upload_config.ensure_in_working_directory(converted_path, purge_source, in_place) + elif not link_data_only: + path = upload_config.ensure_in_working_directory(path, purge_source, in_place) + + if not link_data_only and datatype and datatype.dataset_content_needs_grooming(path): + # Groom the dataset content if necessary + datatype.groom_dataset_content(path) + + rval = {"name": name, "filename": path, "dbkey": dbkey, "ext": ext, "link_data_only": link_data_only} + if info is not None: + rval["info"] = info + return rval + + elements = elements_tree_map(_resolve_src, items) + + fetched_target["elements"] = elements + return fetched_target + + +def _bagit_to_items(directory): + bdbag.bdbag_api.resolve_fetch(directory) + bdbag.bdbag_api.validate_bag(directory) + items = _directory_to_items(os.path.join(directory, "data")) + return items + + +def _decompress_target(target): + items_from_name, items_from_path = _has_src_to_path(target) + temp_directory = tempfile.mkdtemp(prefix=items_from_name, dir=".") + decompressed_directory = CompressedFile(items_from_path).extract(temp_directory) + return decompressed_directory + + +def elements_tree_map(f, items): + new_items = [] + for item in items: + if "elements" in item: + new_item = item.copy() + new_item["elements"] = elements_tree_map(f, item["elements"]) + new_items.append(new_item) + else: + new_items.append(f(item)) + return new_items + + +def _directory_to_items(directory): + items = [] + dir_elements = {} + for root, dirs, files in os.walk(directory): + if root in dir_elements: + target = dir_elements[root] + else: + target = items + for dir in dirs: + dir_dict = {"name": dir, "elements": []} + dir_elements[os.path.join(root, dir)] = dir_dict["elements"] + target.append(dir_dict) + for file in files: + target.append({"src": 
"path", "path": os.path.join(root, file)}) + + return items + + +def _has_src_to_path(item): + assert "src" in item, item + src = item.get("src") + name = item.get("name") + if src == "url": + url = item.get("url") + path = sniff.stream_url_to_file(url) + if name is None: + name = url.split("/")[-1] + else: + assert src == "path" + path = item["path"] + if name is None: + name = os.path.basename(path) + return name, path + + +def _arg_parser(): + parser = argparse.ArgumentParser(description=DESCRIPTION) + parser.add_argument("--galaxy-root") + parser.add_argument("--datatypes-registry") + parser.add_argument("--request-version") + parser.add_argument("--request") + return parser + + +class UploadConfig(object): + + def __init__(self, request, registry): + self.registry = registry + self.check_content = request.get("check_content" , True) + self.to_posix_lines = request.get("to_posix_lines", False) + self.space_to_tab = request.get("space_to_tab", False) + self.link_data_only = _link_data_only(request) + + self.__workdir = os.path.abspath(".") + self.__upload_count = 0 + + def get_option(self, item, key): + """Return item[key] if specified otherwise use default from UploadConfig. + + This default represents the default for the whole request instead item which + is the option for individual files. + """ + if key in item: + return item[key] + else: + return getattr(self, key) + + def __new_dataset_path(self): + path = "gxupload_%d" % self.__upload_count + self.__upload_count += 1 + return path + + def ensure_in_working_directory(self, path, purge_source, in_place): + if in_directory(path, self.__workdir): + return path + + new_path = self.__new_dataset_path() + if purge_source: + try: + shutil.move(path, new_path) + except OSError as e: + # We may not have permission to remove converted_path + if e.errno != errno.EACCES: + raise + else: + shutil.copy(path, new_path) + + return new_path + + +def _link_data_only(has_config_dict): + link_data_only = has_config_dict.get("link_data_only", False) + if not isinstance(link_data_only, bool): + # Allow the older string values of 'copy_files' and 'link_to_files' + link_data_only = link_data_only == "copy_files" + return link_data_only + + +if __name__ == "__main__": + main() diff --git a/lib/galaxy/tools/data_fetch.xml b/lib/galaxy/tools/data_fetch.xml new file mode 100644 index 000000000000..5160cb2989c8 --- /dev/null +++ b/lib/galaxy/tools/data_fetch.xml @@ -0,0 +1,33 @@ + + + + + + + + + + + + + + + + + $request_json + + + + + diff --git a/lib/galaxy/tools/execute.py b/lib/galaxy/tools/execute.py index 1a83a0d245ff..4ca2cf967278 100644 --- a/lib/galaxy/tools/execute.py +++ b/lib/galaxy/tools/execute.py @@ -278,9 +278,9 @@ def precreate_output_collections(self, history, params): trans=trans, parent=history, name=output_collection_name, + structure=effective_structure, implicit_inputs=implicit_inputs, implicit_output_name=output_name, - structure=effective_structure, ) collection_instance.implicit_collection_jobs = implicit_collection_jobs collection_instances[output_name] = collection_instance diff --git a/lib/galaxy/tools/parameters/output_collect.py b/lib/galaxy/tools/parameters/output_collect.py index a452d5805125..935b994ba267 100644 --- a/lib/galaxy/tools/parameters/output_collect.py +++ b/lib/galaxy/tools/parameters/output_collect.py @@ -9,9 +9,11 @@ from collections import namedtuple from galaxy import util +from galaxy.dataset_collections.structure import UnitializedTree from galaxy.tools.parser.output_collection_def import ( 
DEFAULT_DATASET_COLLECTOR_DESCRIPTION, INPUT_DBKEY_TOKEN, + ToolProvidedMetadataDatasetCollection, ) from galaxy.util import ( ExecutionTimer, @@ -34,6 +36,9 @@ def get_new_dataset_meta_by_basename(self, output_name, basename): def has_failed_outputs(self): return False + def get_unnamed_outputs(self): + return [] + class LegacyToolProvidedMetadata(object): @@ -84,6 +89,9 @@ def has_failed_outputs(self): return found_failed + def get_unnamed_outputs(self): + return [] + class ToolProvidedMetadata(object): @@ -124,14 +132,21 @@ def _elements_to_datasets(self, elements, level=0): def has_failed_outputs(self): found_failed = False - for meta in self.tool_provided_job_metadata.values(): + for output_name, meta in self.tool_provided_job_metadata.items(): + if output_name == "__unnamed_outputs": + continue + if meta.get("failed", False): found_failed = True return found_failed + def get_unnamed_outputs(self): + log.info(self.tool_provided_job_metadata) + return self.tool_provided_job_metadata.get("__unnamed_outputs", []) + -def collect_dynamic_collections( +def collect_dynamic_outputs( tool, output_collections, tool_provided_metadata, @@ -140,6 +155,7 @@ def collect_dynamic_collections( job=None, input_dbkey="?", ): + app = tool.app collections_service = tool.app.dataset_collections_service job_context = JobContext( tool, @@ -149,6 +165,86 @@ def collect_dynamic_collections( inp_data, input_dbkey, ) + log.info(tool_provided_metadata) + for unnamed_output_dict in tool_provided_metadata.get_unnamed_outputs(): + assert "destination" in unnamed_output_dict + assert "elements" in unnamed_output_dict + destination = unnamed_output_dict["destination"] + elements = unnamed_output_dict["elements"] + + assert "type" in destination + destination_type = destination["type"] + trans = job_context.work_context + + if destination_type == "library_folder": + + library_folder_manager = app.library_folder_manager + library_folder = library_folder_manager.get(trans, app.security.decode_id(destination.get("library_folder_id"))) + + def add_elements_to_folder(elements, library_folder): + for element in elements: + if "elements" in element: + assert "name" in element + name = element["name"] + description = element.get("description") + nested_folder = library_folder_manager.create(trans, library_folder.id, name, description) + add_elements_to_folder(element["elements"], nested_folder) + else: + discovered_file = discovered_file_for_unnamed_output(element, job_working_directory) + fields_match = discovered_file.match + designation = fields_match.designation + visible = fields_match.visible + ext = fields_match.ext + dbkey = fields_match.dbkey + info = element.get("info", None) + + # Create new primary dataset + name = fields_match.name or designation + + job_context.create_dataset( + ext=ext, + designation=designation, + visible=visible, + dbkey=dbkey, + name=name, + filename=discovered_file.path, + info=info, + library_folder=library_folder + ) + + add_elements_to_folder(elements, library_folder) + elif destination_type == "hdca": + history = job.history + assert "collection_type" in unnamed_output_dict + name = unnamed_output_dict.get("name", "unnamed collection") + collection_type = unnamed_output_dict["collection_type"] + collection_type_description = collections_service.collection_type_descriptions.for_collection_type(collection_type) + structure = UnitializedTree(collection_type_description) + hdca = collections_service.precreate_dataset_collection_instance( + trans, history, name, structure=structure + ) + 
filenames = odict.odict() + + def add_to_discovered_files(elements): + for element in elements: + if "elements" in element: + add_to_discovered_files(element["elements"]) + else: + discovered_file = discovered_file_for_unnamed_output(element, job_working_directory) + filenames[discovered_file.path] = discovered_file + + add_to_discovered_files(elements) + + collection = hdca.collection + collection_builder = collections_service.collection_builder_for( + collection + ) + job_context.populate_collection_elements( + collection, + collection_builder, + filenames, + ) + collection_builder.populate() for name, has_collection in output_collections.items(): if name not in tool.output_collections: @@ -165,13 +261,19 @@ def collect_dynamic_collections( collection = has_collection try: + collection_builder = collections_service.collection_builder_for( collection ) + dataset_collectors = map(dataset_collector, output_collection_def.dataset_collector_descriptions) + output_name = output_collection_def.name + filenames = job_context.find_files(output_name, collection, dataset_collectors) job_context.populate_collection_elements( collection, collection_builder, - output_collection_def, + filenames, + name=output_collection_def.name, + metadata_source_name=output_collection_def.metadata_source, ) collection_builder.populate() except Exception: @@ -190,6 +292,11 @@ def __init__(self, tool, tool_provided_metadata, job, job_working_directory, inp self.job_working_directory = job_working_directory self.tool_provided_metadata = tool_provided_metadata + @property + def work_context(self): + from galaxy.work.context import WorkRequestContext + return WorkRequestContext(self.app, user=self.job.user) + @property def permissions(self): inp_data = self.inp_data @@ -207,15 +314,14 @@ def find_files(self, output_name, collection, dataset_collectors): filenames[discovered_file.path] = discovered_file return filenames - def populate_collection_elements(self, collection, root_collection_builder, output_collection_def): + def populate_collection_elements(self, collection, root_collection_builder, filenames, name=None, metadata_source_name=None): # TODO: allow configurable sorting. 
# # # # - dataset_collectors = map(dataset_collector, output_collection_def.dataset_collector_descriptions) - output_name = output_collection_def.name - filenames = self.find_files(output_name, collection, dataset_collectors) + if name is None: + name = "unnamed output" element_datasets = [] for filename, discovered_file in filenames.items(): @@ -241,14 +347,14 @@ def populate_collection_elements(self, collection, root_collection_builder, outp dbkey=dbkey, name=name, filename=filename, - metadata_source_name=output_collection_def.metadata_source, + metadata_source_name=metadata_source_name, ) log.debug( "(%s) Created dynamic collection dataset for path [%s] with element identifier [%s] for output [%s] %s", self.job.id, filename, designation, - output_collection_def.name, + name, create_dataset_timer, ) element_datasets.append((element_identifiers, dataset)) @@ -263,7 +369,7 @@ def populate_collection_elements(self, collection, root_collection_builder, outp log.debug( "(%s) Add dynamic collection datsets to history for output [%s] %s", self.job.id, - output_collection_def.name, + name, add_datasets_timer, ) @@ -293,12 +399,17 @@ def create_dataset( dbkey, name, filename, - metadata_source_name, + metadata_source_name=None, + info=None, + library_folder=None, ): app = self.app sa_session = self.sa_session - primary_data = _new_hda(app, sa_session, ext, designation, visible, dbkey, self.permissions) + if not library_folder: + primary_data = _new_hda(app, sa_session, ext, designation, visible, dbkey, self.permissions) + else: + primary_data = _new_ldda(self.work_context, name, ext, visible, dbkey, library_folder) # Copy metadata from one of the inputs if requested. metadata_source = None @@ -318,6 +429,9 @@ def create_dataset( else: primary_data.init_meta() + if info is not None: + primary_data.info = info + primary_data.set_meta() primary_data.set_peek() @@ -484,6 +598,14 @@ def discover_files(output_name, tool_provided_metadata, extra_file_collectors, j yield DiscoveredFile(match.path, collector, match) +def discovered_file_for_unnamed_output(dataset, job_working_directory): + extra_file_collector = DEFAULT_TOOL_PROVIDED_DATASET_COLLECTOR + target_directory = discover_target_directory(extra_file_collector, job_working_directory) + filename = dataset["filename"] + path = os.path.join(target_directory, filename) + return DiscoveredFile(path, extra_file_collector, JsonCollectedDatasetMatch(dataset, extra_file_collector, filename, path=path)) + + def discover_target_directory(extra_file_collector, job_working_directory): directory = job_working_directory if extra_file_collector.directory: @@ -656,6 +778,42 @@ def __init__(self, re_match, collector, filename, path=None): UNSET = object() +def _new_ldda( + trans, + name, + ext, + visible, + dbkey, + library_folder, +): + ld = trans.app.model.LibraryDataset(folder=library_folder, name=name) + trans.sa_session.add(ld) + trans.sa_session.flush() + trans.app.security_agent.copy_library_permissions(trans, library_folder, ld) + + ldda = trans.app.model.LibraryDatasetDatasetAssociation(name=name, + extension=ext, + dbkey=dbkey, + library_dataset=ld, + user=trans.user, + create_dataset=True, + sa_session=trans.sa_session) + trans.sa_session.add(ldda) + ldda.state = ldda.states.OK + # Permissions must be the same on the LibraryDatasetDatasetAssociation and the associated LibraryDataset + trans.app.security_agent.copy_library_permissions(trans, ld, ldda) + # Copy the current user's DefaultUserPermissions to the new LibraryDatasetDatasetAssociation.dataset + 
trans.app.security_agent.set_all_dataset_permissions(ldda.dataset, trans.app.security_agent.user_get_default_permissions(trans.user)) + library_folder.add_library_dataset(ld, genome_build=dbkey) + trans.sa_session.add(library_folder) + trans.sa_session.flush() + + ld.library_dataset_dataset_association_id = ldda.id + trans.sa_session.add(ld) + trans.sa_session.flush() + return ldda + + def _new_hda( app, sa_session, @@ -682,3 +840,4 @@ def _new_hda( DEFAULT_DATASET_COLLECTOR = DatasetCollector(DEFAULT_DATASET_COLLECTOR_DESCRIPTION) +DEFAULT_TOOL_PROVIDED_DATASET_COLLECTOR = ToolMetadataDatasetCollector(ToolProvidedMetadataDatasetCollection()) diff --git a/lib/galaxy/tools/special_tools.py b/lib/galaxy/tools/special_tools.py index 953e69dee647..129b7064a941 100644 --- a/lib/galaxy/tools/special_tools.py +++ b/lib/galaxy/tools/special_tools.py @@ -4,6 +4,7 @@ SPECIAL_TOOLS = { "history export": "galaxy/tools/imp_exp/exp_history_to_archive.xml", "history import": "galaxy/tools/imp_exp/imp_history_from_archive.xml", + "data fetch": "galaxy/tools/data_fetch.xml", } diff --git a/lib/galaxy/webapps/galaxy/api/_fetch_util.py b/lib/galaxy/webapps/galaxy/api/_fetch_util.py new file mode 100644 index 000000000000..7f94c66689d9 --- /dev/null +++ b/lib/galaxy/webapps/galaxy/api/_fetch_util.py @@ -0,0 +1,134 @@ +import logging +import os + +from galaxy.actions.library import ( + validate_path_upload, + validate_server_directory_upload, +) +from galaxy.exceptions import ( + RequestParameterInvalidException +) +from galaxy.tools.actions.upload_common import validate_url +from galaxy.util import ( + relpath, +) + +log = logging.getLogger(__name__) + +VALID_DESTINATION_TYPES = ["library", "library_folder", "hdca"] + + +def validate_and_normalize_targets(trans, payload): + """Validate and normalize all src references in fetch targets. + + - Normalize ftp_import and server_dir src entries into simple path entires + with the relevant paths resolved and permissions / configuration checked. + - Check for file:// URLs in items src of "url" and convert them into path + src items - after verifying path pastes are allowed and user is admin. + - Check for valid URLs to be fetched for http and https entries. + - Based on Galaxy configuration and upload types set purge_source and in_place + as needed for each upload. 
+ """ + targets = payload.get("targets", []) + + for target in targets: + destination = _get_required_item(target, "destination", "Each target must specify a 'destination'") + destination_type = _get_required_item(destination, "type", "Each target destination must specify a 'type'") + if destination_type not in VALID_DESTINATION_TYPES: + template = "Invalid target destination type [%s] encountered, must be one of %s" + msg = template % (destination_type, VALID_DESTINATION_TYPES) + raise RequestParameterInvalidException(msg) + if destination_type == "library": + library_name = _get_required_item(destination, "name", "Must specify a library name") + description = destination.get("description", "") + synopsis = destination.get("synopsis", "") + library = trans.app.library_manager.create( + trans, library_name, description=description, synopsis=synopsis + ) + destination["type"] = "library_folder" + for key in ["name", "description", "synopsis"]: + if key in destination: + del destination[key] + destination["library_folder_id"] = trans.app.security.encode_id(library.root_folder.id) + + # Unlike upload.py we don't transmit or use run_as_real_user in the job - we just make sure + # in_place and purge_source are set on the individual upload fetch sources as needed based + # on this. + run_as_real_user = trans.app.config.external_chown_script is None # See comment in upload.py + purge_ftp_source = getattr(trans.app.config, 'ftp_upload_purge', True) and not run_as_real_user + + payload["check_content"] = trans.app.config.check_upload_content + + def check_src(item): + # Normalize file:// URLs into paths. + if item["src"] == "url" and item["url"].startswith("file://"): + item["src"] = "path" + item["path"] = item["url"][len("file://"):] + del item["path"] + + if "in_place" in item: + raise Exception("in_place cannot be set") + + src = item["src"] + if src == "path" or (src == "url" and item["url"].startswith("file:")): + # Validate is admin, leave alone. + validate_path_upload(trans) + elif src == "server_dir": + # Validate and replace with path definition. + server_dir = item["server_dir"] + full_path, _ = validate_server_directory_upload(trans, server_dir) + item["src"] = "path" + item["path"] = full_path + elif src == "ftp_import": + ftp_path = item["ftp_path"] + + # It'd be nice if this can be de-duplicated with what is in parameters/grouping.py. + user_ftp_dir = trans.user_ftp_dir + assert not os.path.islink(user_ftp_dir), "User FTP directory cannot be a symbolic link" + for (dirpath, dirnames, filenames) in os.walk(user_ftp_dir): + for filename in filenames: + if ftp_path == filename: + path = relpath(os.path.join(dirpath, filename), user_ftp_dir) + if not os.path.islink(os.path.join(dirpath, filename)): + full_path = os.path.abspath(os.path.join(user_ftp_dir, path)) + + if not full_path: + raise RequestParameterInvalidException("Failed to find referenced ftp_path or symbolic link was enountered") + + item["src"] = path + item["path"] = full_path + item["purge_source"] = purge_ftp_source + elif src == "url": + url = item["url"] + looks_like_url = False + for url_prefix in ["http://", "https://", "ftp://", "ftps://"]: + if url.startswith(url_prefix): + looks_like_url = True + break + + if not looks_like_url: + raise RequestParameterInvalidException("Invalid URL [%s] found in src definition." 
% url) + + validate_url(url, trans.app.config.fetch_url_whitelist_ips) + item["in_place"] = run_as_real_user + elif src == "files": + item["in_place"] = run_as_real_user + + _for_each_src(check_src, targets) + + +def _get_required_item(from_dict, key, message): + if key not in from_dict: + raise RequestParameterInvalidException(message) + return from_dict[key] + + +def _for_each_src(f, obj): + if isinstance(obj, list): + for item in obj: + _for_each_src(f, item) + if isinstance(obj, dict): + if "src" in obj: + f(obj) + for key, value in obj.items(): + _for_each_src(f, value) diff --git a/lib/galaxy/webapps/galaxy/api/tools.py b/lib/galaxy/webapps/galaxy/api/tools.py index 415785488880..0ddcbaba14c3 100644 --- a/lib/galaxy/webapps/galaxy/api/tools.py +++ b/lib/galaxy/webapps/galaxy/api/tools.py @@ -12,9 +12,14 @@ from galaxy.web import _future_expose_api_anonymous_and_sessionless as expose_api_anonymous_and_sessionless from galaxy.web.base.controller import BaseAPIController from galaxy.web.base.controller import UsesVisualizationMixin +from ._fetch_util import validate_and_normalize_targets log = logging.getLogger(__name__) +# Do not allow these tools to be called directly - they (it) enforces extra security and +# provides access via a different API endpoint. +PROTECTED_TOOLS = ["__DATA_FETCH__"] + class ToolsController(BaseAPIController, UsesVisualizationMixin): """ @@ -290,12 +295,52 @@ def download(self, trans, id, **kwds): trans.response.headers["Content-Disposition"] = 'attachment; filename="%s.tgz"' % (id) return download_file + @expose_api_anonymous + def fetch(self, trans, payload, **kwd): + """Adapt clean API to tool-constrained API. + """ + log.info("Keywords are %s" % payload) + request_version = '1' + history_id = payload.pop("history_id") + clean_payload = {} + files_payload = {} + for key, value in payload.items(): + if key == "key": + continue + if key.startswith('files_') or key.startswith('__files_'): + files_payload[key] = value + continue + clean_payload[key] = value + log.info("payload %s" % clean_payload) + validate_and_normalize_targets(trans, clean_payload) + clean_payload["check_content"] = trans.app.config.check_upload_content + request = dumps(clean_payload) + log.info(request) + create_payload = { + 'tool_id': "__DATA_FETCH__", + 'history_id': history_id, + 'inputs': { + 'request_version': request_version, + 'request_json': request, + }, + } + create_payload.update(files_payload) + return self._create(trans, create_payload, **kwd) + @expose_api_anonymous def create(self, trans, payload, **kwd): """ POST /api/tools Executes tool using specified inputs and returns tool's outputs. """ + tool_id = payload.get("tool_id") + if tool_id in PROTECTED_TOOLS: + raise exceptions.RequestParameterInvalidException("Cannot execute tool [%s] directly, must use alternative endpoint.") + if tool_id is None: + raise exceptions.RequestParameterInvalidException("Must specify a valid tool_id to use this endpoint.") + return self._create(trans, payload, **kwd) + + def _create(self, trans, payload, **kwd): # HACK: for now, if action is rerun, rerun tool. 
action = payload.get('action', None) if action == 'rerun': diff --git a/lib/galaxy/webapps/galaxy/buildapp.py b/lib/galaxy/webapps/galaxy/buildapp.py index fa14db1e8063..7a9a60bf91bd 100644 --- a/lib/galaxy/webapps/galaxy/buildapp.py +++ b/lib/galaxy/webapps/galaxy/buildapp.py @@ -268,6 +268,7 @@ def populate_api_routes(webapp, app): # ====== TOOLS API ====== # ======================= + webapp.mapper.connect('/api/tools/fetch', action='fetch', controller='tools', conditions=dict(method=["POST"])) webapp.mapper.connect('/api/tools/all_requirements', action='all_requirements', controller="tools") webapp.mapper.connect('/api/tools/{id:.+?}/build', action='build', controller="tools") webapp.mapper.connect('/api/tools/{id:.+?}/reload', action='reload', controller="tools") diff --git a/scripts/api/fetch_to_library.py b/scripts/api/fetch_to_library.py new file mode 100644 index 000000000000..6c497bcb402b --- /dev/null +++ b/scripts/api/fetch_to_library.py @@ -0,0 +1,33 @@ +import argparse +import json + +import requests +import yaml + + +def main(): + parser = argparse.ArgumentParser(description='Upload a directory into a data library') + parser.add_argument("-u", "--url", dest="url", required=True, help="Galaxy URL") + parser.add_argument("-a", "--api", dest="api_key", required=True, help="API Key") + parser.add_argument('target', metavar='FILE', type=str, + help='file describing data library to fetch') + args = parser.parse_args() + with open(args.target, "r") as f: + target = yaml.load(f) + + histories_url = args.url + "/api/histories" + new_history_response = requests.post(histories_url, data={'key': args.api_key}) + + fetch_url = args.url + '/api/tools/fetch' + payload = { + 'key': args.api_key, + 'targets': json.dumps([target]), + 'history_id': new_history_response.json()["id"] + } + + response = requests.post(fetch_url, data=payload) + print(response.content) + + +if __name__ == '__main__': + main() diff --git a/scripts/api/fetch_to_library_example.yml b/scripts/api/fetch_to_library_example.yml new file mode 100644 index 000000000000..84865d739089 --- /dev/null +++ b/scripts/api/fetch_to_library_example.yml @@ -0,0 +1,42 @@ +destination: + type: library + name: Training Material + description: Data for selected tutorials from https://training.galaxyproject.org. +items: + - name: Quality Control + description: | + Data for sequence quality control tutorial at http://galaxyproject.github.io/training-material/topics/sequence-analysis/tutorials/quality-control/tutorial.html. 
+ + 10.5281/zenodo.61771 + elements: + - src: url + url: https://zenodo.org/record/61771/files/GSM461178_untreat_paired_subset_1.fastq + name: GSM461178_untreat_paired_subset_1 + ext: fastqsanger + info: Untreated subseq of GSM461178 from 10.1186/s12864-017-3692-8 + - src: url + url: https://zenodo.org/record/61771/files/GSM461182_untreat_single_subset.fastq + name: GSM461182_untreat_single_subset + ext: fastqsanger + info: Untreated subseq of GSM461182 from 10.1186/s12864-017-3692-8 + - name: Small RNA-Seq + description: | + Data for small RNA-seq tutorial available at http://galaxyproject.github.io/training-material/topics/transcriptomics/tutorials/srna/tutorial.html + + 10.5281/zenodo.826906 + elements: + - src: url + url: https://zenodo.org/record/826906/files/Symp_RNAi_sRNA-seq_rep1_downsampled.fastqsanger.gz + name: Symp RNAi sRNA Rep1 + ext: fastqsanger.gz + info: Downsample rep1 from 10.1186/s12864-017-3692-8 + - src: url + url: https://zenodo.org/record/826906/files/Symp_RNAi_sRNA-seq_rep2_downsampled.fastqsanger.gz + name: Symp RNAi sRNA Rep2 + ext: fastqsanger.gz + info: Downsample rep2 from 10.1186/s12864-017-3692-8 + - src: url + url: https://zenodo.org/record/826906/files/Symp_RNAi_sRNA-seq_rep3_downsampled.fastqsanger.gz + name: Symp RNAi sRNA Rep3 + ext: fastqsanger.gz + info: Downsample rep3 from 10.1186/s12864-017-3692-8 diff --git a/test-data/example-bag.zip b/test-data/example-bag.zip new file mode 100644 index 0000000000000000000000000000000000000000..ee4de52c265be0c5ddfe54ade610957580115dac GIT binary patch literal 2966 zcmWIWW@h1H00G^m9&a!MN{BGXFqEVgm*^%Xrt7AqmLzBBW|Wi^=!b@IGBBsQ8$>Ph zG>9s#;AUWC`Nqh=z#;mYgs!XZ3B?^0f-fFm`Zt`8A z>1ZbBp7rKCq#(fdpw|B&E8D7L;)ZH=W_MR~s?QMXl4xXGwsP;@9}&Ce%>4g9t)Fe( z!ljIh+vZ-Jr?NV+JHQ+@O#^gzJP~5vR?8p#Z|NxI-ed zgitUzC8m3p=!T^h6=&w>St%IkS(v7HxNQv;}(335b>P85!j2=;G^&;>}FV z*p@wglDENthvh)nHl|ez8)f)Dsyn{^+WgMZs?#I zn&~Pev>=Ak0n7%S;31$=noE6!&7uF}kM> z`I-%Q7!J%0|IipDbGz`GmqFP>_ClHPwOS4n7M@Rk`Br?Tl(JE<=9yU=SQvU)v%PeZ zUsRvgy+8e@q3!d9f1}(FO*v@T^Te|`R83Kk;o}LpxAs>|_UHZH`15yk`}v(~@3Nx$ zVE(J~2^~P!uK{9pLOv+YNHj7vBj|}U)_jM6p181g`3qK!JehhUrHd0=r7a4396FR< z9Qc@+Q2F@q&7E0oN;^G*Ro5Ii+I#4r>cmM7iwx3&^Cvbf_?x#kbRbe=f3^ zH!bsb4l~oVyhxKwcAvXJC2rFumE^G;x)7<|xqy>>Wy0xeM`yKrE&OR8w!ZA9ZG3jj z``TT5uOI(_=25AY$@%kuo_vkRqa}&yq{No*S;K8ih8(RA|E*0jTcAE+LgP~g#<@|&pGu@?0@;X;NXNX1&_rZW-snEx@pSYJ(gCrTjqRC?DO=0o)h>IJB$KKEZ)tD z4V<-c@vgep$@4N##7)jVnXu&41_|cU*lOQNy2-!(`gHTgwy(Xq*NMUH+6SeH4=zk9 zu72_PM8)~j$rm(dEGiUV<#_i1%VFkIPRv_x{8PSH?)pmo7g|bHUa&4=cI05WaA)}i);Eug4GLru*t{E`I7t8a5vN&r@9VBz!Dp^6;7HVT zRN3)%sxXWDE53$Xc_us7KL57y<`R*{30H4jQ+@pV^S2lC*FXOL<3`_G=HhPuck^!S zSXHFUvrIKFE#sWw>^Q|MD`qr@Z%|$J(|fmU{;3P9;!ga13gJGIA_1%SOx)#|QlusB z)9?6Z;*{Xs)@6#9mwpetZhrOLs;h1*85Fl(ni6&~=DsX2xd=!u_cA9I`DDK+CYXA7~4#)+1~MX61z(M9*+o0jfu@Sb67|*c+&0%2R0zxKWSkt@^7^NRoTEWf0$nuSm zfq_K?s5k&_ggBZJe8@&1%;5u?LC~B=amj9Opy?nijA2e%W=^Ux*ij(E1TY;{T$%*5 z5TuL?;XV$a`xFeU%-Dctf-s8vKxz%aMt%VL^CJ+W`7=NdcP1$Rx*%D*_~dZUg}VhPRF&8YwzhA<=>6L}cSIBLmquXJq4` zQ3Es+6q*FgM2#4TnZTsLu%xjY!%S#25jGezW{?ejjBGG8o`7b8;t7vqG2;i>OhcfX g;o%7~6Bt`SGeNP1VJ0gmiWyjea0yUXJ;=8V0D^6pPXGV_ literal 0 HcmV?d00001 diff --git a/test/api/test_dataset_collections.py b/test/api/test_dataset_collections.py index ca8950d8bb0c..5367668bda18 100644 --- a/test/api/test_dataset_collections.py +++ b/test/api/test_dataset_collections.py @@ -188,6 +188,37 @@ def test_enforces_unique_names(self): create_response = self._post("dataset_collections", payload) self._assert_status_code_is(create_response, 400) + def test_upload_collection(self): + items = [{"src": 
"files", "dbkey": "hg19", "info": "my cool bed"}] + targets = [{ + "destination": {"type": "hdca"}, + "items": items, + "collection_type": "list", + }] + payload = { + "history_id": self.history_id, # TODO: Shouldn't be needed :( + "targets": json.dumps(targets), + "__files": {"files_0|file_data": open(self.test_data_resolver.get_filename("4.bed"))}, + } + self.dataset_populator.fetch(payload) + hdca = self._assert_one_collection_created_in_history() + assert len(hdca["elements"]) == 1, hdca + element0 = hdca["elements"][0] + assert element0["element_identifier"] == "4.bed" + assert element0["object"]["file_size"] == 61 + + def _assert_one_collection_created_in_history(self): + contents_response = self._get("histories/%s/contents/dataset_collections" % self.history_id) + self._assert_status_code_is(contents_response, 200) + contents = contents_response.json() + assert len(contents) == 1 + hdca = contents[0] + assert hdca["history_content_type"] == "dataset_collection" + hdca_id = hdca["id"] + collection_response = self._get("histories/%s/contents/dataset_collections/%s" % (self.history_id, hdca_id)) + self._assert_status_code_is(collection_response, 200) + return collection_response.json() + def _check_create_response(self, create_response): self._assert_status_code_is(create_response, 200) dataset_collection = create_response.json() diff --git a/test/api/test_libraries.py b/test/api/test_libraries.py index 2a715f50fcbb..7a6d6b46718d 100644 --- a/test/api/test_libraries.py +++ b/test/api/test_libraries.py @@ -1,3 +1,5 @@ +import json + from base import api from base.populators import ( DatasetCollectionPopulator, @@ -95,6 +97,93 @@ def test_create_dataset(self): assert library_dataset["peek"].find("create_test") >= 0 assert library_dataset["file_ext"] == "txt", library_dataset["file_ext"] + def test_fetch_upload_to_folder(self): + history_id, library, destination = self._setup_fetch_to_folder("flat_zip") + items = [{"src": "files", "dbkey": "hg19", "info": "my cool bed"}] + targets = [{ + "destination": destination, + "items": items + }] + payload = { + "history_id": history_id, # TODO: Shouldn't be needed :( + "targets": json.dumps(targets), + "__files": {"files_0|file_data": open(self.test_data_resolver.get_filename("4.bed"))}, + } + self.dataset_populator.fetch(payload) + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/4.bed") + assert dataset["file_size"] == 61, dataset + assert dataset["genome_build"] == "hg19", dataset + assert dataset["misc_info"] == "my cool bed", dataset + assert dataset["file_ext"] == "bed", dataset + + def test_fetch_zip_to_folder(self): + history_id, library, destination = self._setup_fetch_to_folder("flat_zip") + bed_test_data_path = self.test_data_resolver.get_filename("4.bed.zip") + targets = [{ + "destination": destination, + "items_from": "archive", "src": "path", "path": bed_test_data_path, + }] + payload = { + "history_id": history_id, # TODO: Shouldn't be needed :( + "targets": json.dumps(targets), + } + self.dataset_populator.fetch(payload) + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/4.bed") + assert dataset["file_size"] == 61, dataset + + def test_fetch_single_url_to_folder(self): + history_id, library, destination = self._setup_fetch_to_folder("single_url") + items = [{"src": "url", "url": "https://raw.githubusercontent.com/galaxyproject/galaxy/dev/test-data/4.bed"}] + targets = [{ + "destination": destination, + "items": items + }] + payload = { + "history_id": history_id, # 
TODO: Shouldn't be needed :( + "targets": json.dumps(targets), + } + self.dataset_populator.fetch(payload) + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/4.bed") + assert dataset["file_size"] == 61, dataset + + def test_fetch_url_archive_to_folder(self): + history_id, library, destination = self._setup_fetch_to_folder("single_url") + targets = [{ + "destination": destination, + "items_from": "archive", + "src": "url", + "url": "https://raw.githubusercontent.com/galaxyproject/galaxy/dev/test-data/4.bed.zip", + }] + payload = { + "history_id": history_id, # TODO: Shouldn't be needed :( + "targets": json.dumps(targets), + } + self.dataset_populator.fetch(payload) + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/4.bed") + assert dataset["file_size"] == 61, dataset + + def test_fetch_bagit_archive_to_folder(self): + history_id, library, destination = self._setup_fetch_to_folder("bagit_archive") + example_bag_path = self.test_data_resolver.get_filename("example-bag.zip") + targets = [{ + "destination": destination, + "items_from": "bagit_archive", "src": "files", + }] + payload = { + "history_id": history_id, # TODO: Shouldn't be needed :( + "targets": json.dumps(targets), + "__files": {"files_0|file_data": open(example_bag_path)}, + } + self.dataset_populator.fetch(payload) + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/README.txt") + assert dataset["file_size"] == 66, dataset + + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/bdbag-profile.json") + assert dataset["file_size"] == 723, dataset + + def _setup_fetch_to_folder(self, test_name): + return self.library_populator.setup_fetch_to_folder(test_name) + def test_create_dataset_in_folder(self): library = self.library_populator.new_private_library("ForCreateDatasets") folder_response = self._create_folder(library) diff --git a/test/base/integration_util.py b/test/base/integration_util.py index 4339436a9b4a..988d95963808 100644 --- a/test/base/integration_util.py +++ b/test/base/integration_util.py @@ -7,6 +7,8 @@ import os from unittest import skip, TestCase +from galaxy.tools.verify.test_data import TestDataResolver + from .api import UsesApiTestCaseMixin from .driver_util import GalaxyTestDriver @@ -45,6 +47,7 @@ def tearDownClass(cls): cls._app_available = False def setUp(self): + self.test_data_resolver = TestDataResolver() # Setup attributes needed for API testing... 
server_wrapper = self._test_driver.server_wrappers[0] host = server_wrapper.host diff --git a/test/base/populators.py b/test/base/populators.py index c046a78bf2ae..59d48d1004c4 100644 --- a/test/base/populators.py +++ b/test/base/populators.py @@ -148,14 +148,30 @@ def new_dataset_request(self, history_id, content=None, wait=False, **kwds): self.wait_for_tool_run(history_id, run_response, assert_ok=kwds.get('assert_ok', True)) return run_response + def fetch(self, payload, assert_ok=True, timeout=DEFAULT_TIMEOUT): + tool_response = self._post("tools/fetch", data=payload) + if assert_ok: + job = self.check_run(tool_response) + self.wait_for_job(job["id"], timeout=timeout) + + job = tool_response.json()["jobs"][0] + details = self.get_job_details(job["id"]).json() + assert details["state"] == "ok", details + + return tool_response + def wait_for_tool_run(self, history_id, run_response, timeout=DEFAULT_TIMEOUT, assert_ok=True): - run = run_response.json() - assert run_response.status_code == 200, run - job = run["jobs"][0] + job = self.check_run(run_response) self.wait_for_job(job["id"], timeout=timeout) self.wait_for_history(history_id, assert_ok=assert_ok, timeout=timeout) return run_response + def check_run(self, run_response): + run = run_response.json() + assert run_response.status_code == 200, run + job = run["jobs"][0] + return job + def wait_for_history(self, history_id, assert_ok=False, timeout=DEFAULT_TIMEOUT): try: return wait_on_state(lambda: self._get("histories/%s" % history_id), assert_ok=assert_ok, timeout=timeout) @@ -266,8 +282,8 @@ def run_tool(self, tool_id, inputs, history_id, assert_ok=True, **kwds): else: return tool_response - def tools_post(self, payload): - tool_response = self._post("tools", data=payload) + def tools_post(self, payload, url="tools"): + tool_response = self._post(url, data=payload) return tool_response def get_history_dataset_content(self, history_id, wait=True, filename=None, **kwds): @@ -463,6 +479,11 @@ class LibraryPopulator(object): def __init__(self, galaxy_interactor): self.galaxy_interactor = galaxy_interactor + self.dataset_populator = DatasetPopulator(galaxy_interactor) + + def get_libraries(self): + get_response = self.galaxy_interactor.get("libraries") + return get_response.json() def new_private_library(self, name): library = self.new_library(name) @@ -563,6 +584,24 @@ def show(): return library, library_dataset + def get_library_contents_with_path(self, library_id, path): + all_contents_response = self.galaxy_interactor.get("libraries/%s/contents" % library_id) + api_asserts.assert_status_code_is(all_contents_response, 200) + all_contents = all_contents_response.json() + matching = [c for c in all_contents if c["name"] == path] + if len(matching) == 0: + raise Exception("Failed to find library contents with path [%s], contents are %s" % (path, all_contents)) + get_response = self.galaxy_interactor.get(matching[0]["url"]) + api_asserts.assert_status_code_is(get_response, 200) + return get_response.json() + + def setup_fetch_to_folder(self, test_name): + history_id = self.dataset_populator.new_history() + library = self.new_private_library(test_name) + folder_id = library["root_folder_id"][1:] + destination = {"type": "library_folder", "library_folder_id": folder_id} + return history_id, library, destination + class BaseDatasetCollectionPopulator(object): diff --git a/test/integration/test_upload_configuration_options.py b/test/integration/test_upload_configuration_options.py index d5f6789723ca..3e6763df438e 100644 --- 
a/test/integration/test_upload_configuration_options.py +++ b/test/integration/test_upload_configuration_options.py @@ -19,6 +19,7 @@ framework but tested here for FTP uploads. """ +import json import os import re import shutil @@ -470,3 +471,67 @@ def test_library_import_dir_not_available_to_non_admins(self): payload, files = self.library_populator.create_dataset_request(library, upload_option="upload_directory", server_dir="library") response = self.library_populator.raw_library_contents_create(library["id"], payload, files=files) assert response.status_code == 403, response.json() + + +class FetchByPathTestCase(BaseUploadContentConfigurationTestCase): + + require_admin_user = True + + @classmethod + def handle_galaxy_config_kwds(cls, config): + config["allow_path_paste"] = True + + def test_fetch_path_to_folder(self): + history_id, library, destination = self.library_populator.setup_fetch_to_folder("flat_zip") + bed_test_data_path = self.test_data_resolver.get_filename("4.bed") + items = [{"src": "path", "path": bed_test_data_path, "info": "my cool bed"}] + targets = [{ + "destination": destination, + "items": items + }] + payload = { + "history_id": history_id, # TODO: Shouldn't be needed :( + "targets": json.dumps(targets), + } + self.dataset_populator.fetch(payload) + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/4.bed") + assert dataset["file_size"] == 61, dataset + + def test_fetch_recursive_archive(self): + history_id, library, destination = self.library_populator.setup_fetch_to_folder("recursive_archive") + bed_test_data_path = self.test_data_resolver.get_filename("testdir1.zip") + targets = [{ + "destination": destination, + "items_from": "archive", "src": "path", "path": bed_test_data_path, + }] + payload = { + "history_id": history_id, # TODO: Shouldn't be needed :( + "targets": json.dumps(targets), + } + self.dataset_populator.fetch(payload) + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/file1") + assert dataset["file_size"] == 6, dataset + + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/file2") + assert dataset["file_size"] == 6, dataset + + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/dir1/file3") + assert dataset["file_size"] == 11, dataset + + def test_fetch_recursive_archive_to_library(self): + bed_test_data_path = self.test_data_resolver.get_filename("testdir1.zip") + targets = [{ + "destination": {"type": "library", "name": "My Cool Library"}, + "items_from": "archive", "src": "path", "path": bed_test_data_path, + }] + payload = { + "history_id": self.history_id, # TODO: Shouldn't be needed :( + "targets": json.dumps(targets), + } + self.dataset_populator.fetch(payload) + libraries = self.library_populator.get_libraries() + matching = [l for l in libraries if l["name"] == "My Cool Library"] + assert len(matching) == 1 + library = matching[0] + dataset = self.library_populator.get_library_contents_with_path(library["id"], "/file1") + assert dataset["file_size"] == 6, dataset diff --git a/tools/data_source/upload.py b/tools/data_source/upload.py index 0a6eb99e24af..4bff4a05a8c8 100644 --- a/tools/data_source/upload.py +++ b/tools/data_source/upload.py @@ -18,8 +18,12 @@ from galaxy import util from galaxy.datatypes import sniff -from galaxy.datatypes.binary import Binary from galaxy.datatypes.registry import Registry +from galaxy.datatypes.upload_util import ( + handle_sniffable_binary_check, + handle_unsniffable_binary_check, + 
UploadProblemException, +) from galaxy.util.checkers import ( check_binary, check_bz2, @@ -36,12 +40,6 @@ assert sys.version_info[:2] >= (2, 7) -class UploadProblemException(Exception): - - def __init__(self, message): - self.message = message - - def file_err(msg, dataset, json_file): json_file.write(dumps(dict(type='dataset', ext='data', @@ -115,26 +113,21 @@ def add_file(dataset, registry, json_file, output_path): if dataset.type == 'url': try: - page = urlopen(dataset.path) # page will be .close()ed by sniff methods - temp_name = sniff.stream_to_file(page, prefix='url_paste', source_encoding=util.get_charset_from_http_headers(page.headers)) + dataset.path = sniff.stream_url_to_file(dataset.path) except Exception as e: raise UploadProblemException('Unable to fetch %s\n%s' % (dataset.path, str(e))) - dataset.path = temp_name + # See if we have an empty file if not os.path.exists(dataset.path): raise UploadProblemException('Uploaded temporary file (%s) does not exist.' % dataset.path) + if not os.path.getsize(dataset.path) > 0: raise UploadProblemException('The uploaded file is empty') + # Is dataset content supported sniffable binary? is_binary = check_binary(dataset.path) if is_binary: - # Sniff the data type - guessed_ext = sniff.guess_ext(dataset.path, registry.sniff_order) - # Set data_type only if guessed_ext is a binary datatype - datatype = registry.get_datatype_by_extension(guessed_ext) - if isinstance(datatype, Binary): - data_type = guessed_ext - ext = guessed_ext + data_type, ext = handle_sniffable_binary_check(data_type, ext, dataset.path, registry) if not data_type: root_datatype = registry.get_datatype_by_extension(dataset.file_type) if getattr(root_datatype, 'compressed', False): @@ -257,18 +250,9 @@ def add_file(dataset, registry, json_file, output_path): dataset.name = uncompressed_name data_type = 'zip' if not data_type: - if is_binary or registry.is_extension_unsniffable_binary(dataset.file_type): - # We have a binary dataset, but it is not Bam, Sff or Pdf - data_type = 'binary' - parts = dataset.name.split(".") - if len(parts) > 1: - ext = parts[-1].strip().lower() - is_ext_unsniffable_binary = registry.is_extension_unsniffable_binary(ext) - if check_content and not is_ext_unsniffable_binary: - raise UploadProblemException('The uploaded binary file contains inappropriate content') - elif is_ext_unsniffable_binary and dataset.file_type != ext: - err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % (ext, ext) - raise UploadProblemException(err_msg) + data_type, ext = handle_unsniffable_binary_check( + data_type, ext, dataset.path, dataset.name, is_binary, dataset.file_type, check_content, registry + ) if not data_type: # We must have a text file if check_content and check_html(dataset.path):
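
Aside for reviewers, not part of the patch: the tests above drive the new fetch endpoint through the test interactor's "tools/fetch" route. Below is a minimal standalone client sketch of the same request shape. It assumes the route is mounted at /api/tools/fetch, that the API key is accepted as a "key" form field, and it uses placeholder values for the Galaxy URL, API key, and history id.

# Illustrative sketch only; assumptions noted above.
import json

import requests

GALAXY_URL = "http://localhost:8080"  # assumed local instance
API_KEY = "<your-api-key>"            # placeholder
HISTORY_ID = "<a history id>"         # still required by the current implementation

# Same target/items shape the library tests use: unpack a remote zip archive
# into a newly created data library.
targets = [{
    "destination": {"type": "library", "name": "My Cool Library"},
    "items_from": "archive",
    "src": "url",
    "url": "https://raw.githubusercontent.com/galaxyproject/galaxy/dev/test-data/4.bed.zip",
}]

payload = {
    "key": API_KEY,
    "history_id": HISTORY_ID,
    "targets": json.dumps(targets),
}

response = requests.post("%s/api/tools/fetch" % GALAXY_URL, data=payload)
response.raise_for_status()
print(json.dumps(response.json(), indent=4))

Only the transport around the request is assumed here; the targets, items, destination, and __files keys mirror the payloads exercised by the tests in this patch.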