Hierarchical upload API optimized for folders & collections. #5220

Merged Mar 9, 2018 (19 commits)

Commits:
ebac0fd  Upload simplification - just base this check on link_only. (jmchilton, Dec 13, 2017)
94cc4df  Refactor shed's CompressedFile abstraction into galaxy.util.compression_utils. (jmchilton, Feb 26, 2018)
306dded  Improved doc for library API. (jmchilton, Dec 13, 2017)
a45cbfe  Stronger assertion about link_data_only option in upload.py. (jmchilton, Dec 20, 2017)
1c6cc02  Hierarchical upload API optimized for folders & collections. (jmchilton, Mar 8, 2018)
ce2170a  Test case for link_to_files during upload. (jmchilton, Feb 26, 2018)
b6f4bff  Do not allow workflows to run tools that are not workflow-compatible. (jmchilton, Jan 6, 2018)
347f1e1  Don't purge library path pastes and such in upload.py during testing. (jmchilton, Feb 26, 2018)
3bf5990  Allow uploading individual HDAs via fetch API. (jmchilton, Jan 11, 2018)
1720354  More upload testing, some input is getting deleted that shouldn't. (jmchilton, Jan 14, 2018)
5c5dbd2  Handle compressed datatypes appropriately in data fetch API. (jmchilton, Jan 22, 2018)
54ee573  Simplify and reduce duplication of upload actions. (jmchilton, Feb 26, 2018)
60f632b  Precreate certain outputs for upload 2.0 API. (jmchilton, Feb 26, 2018)
d783fc3  Avoid symlinks in upload FTP tests. (jmchilton, Mar 5, 2018)
3314183  Cleanup hierarchical upload commit based on PR comments from @bgruening. (jmchilton, Mar 5, 2018)
ca28000  Fixes for pre-creating HDAs using data fetch API. (jmchilton, Mar 6, 2018)
495d125  Consistent sniffing regardless of in_place. (jmchilton, Mar 6, 2018)
d651348  More upload tests and fixes. (jmchilton, Mar 6, 2018)
bca2c3c  Fix for data_fetch sniffing. (jmchilton, Mar 7, 2018)
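
As background for the commits above, here is a minimal hedged sketch of the kind of request the new hierarchical upload ("fetch") API is built around. The endpoint, header, and payload keys (targets, destination, elements, src) reflect my reading of this PR and may differ in your Galaxy version; the server URL, ids, and file URLs are hypothetical:

    import requests

    galaxy_url = "https://galaxy.example.org"    # hypothetical server
    payload = {
        "history_id": "f2db41e1fa331b3e",        # hypothetical encoded history id
        "targets": [{
            # Pre-create a dataset collection and fill it hierarchically.
            "destination": {"type": "hdca"},
            "collection_type": "list",
            "name": "uploaded-list",
            "elements": [
                {"src": "url", "url": "https://example.org/a.fastq", "name": "sample_a"},
                {"src": "pasted", "paste_content": "ACGT\n", "name": "sample_b", "ext": "txt"},
            ],
        }],
    }
    response = requests.post(
        "%s/api/tools/fetch" % galaxy_url,
        json=payload,
        headers={"x-api-key": "MY_GALAXY_API_KEY"},  # hypothetical key
    )
    response.raise_for_status()
    print(response.json())

Each target pairs a destination (history datasets, a collection, or a library folder) with a nested list of elements, which is what lets a single request populate folders and collections hierarchically rather than one dataset at a time.
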
6 changes: 2 additions & 4 deletions lib/galaxy/actions/library.py
@@ -249,16 +249,14 @@ def _make_library_uploaded_dataset(self, trans, params, name, path, type, librar
         uploaded_dataset.to_posix_lines = params.get('to_posix_lines', None)
         uploaded_dataset.space_to_tab = params.get('space_to_tab', None)
         uploaded_dataset.tag_using_filenames = params.get('tag_using_filenames', True)
+        uploaded_dataset.purge_source = getattr(trans.app.config, 'ftp_upload_purge', True)
         if in_folder:
             uploaded_dataset.in_folder = in_folder
         uploaded_dataset.data = upload_common.new_upload(trans, 'api', uploaded_dataset, library_bunch)
         uploaded_dataset.link_data_only = link_data_only
         uploaded_dataset.uuid = uuid_str
         if link_data_only == 'link_to_files':
-            uploaded_dataset.data.file_name = os.path.abspath(path)
-            # Since we are not copying the file into Galaxy's managed
-            # default file location, the dataset should never be purgable.
-            uploaded_dataset.data.dataset.purgable = False
+            uploaded_dataset.data.link_to(path)
         trans.sa_session.add_all((uploaded_dataset.data, uploaded_dataset.data.dataset))
         trans.sa_session.flush()
         return uploaded_dataset
4 changes: 4 additions & 0 deletions lib/galaxy/app.py
@@ -12,7 +12,9 @@
 from galaxy import config, jobs
 from galaxy.jobs import metrics as job_metrics
 from galaxy.managers.collections import DatasetCollectionManager
+from galaxy.managers.folders import FolderManager
 from galaxy.managers.histories import HistoryManager
+from galaxy.managers.libraries import LibraryManager
 from galaxy.managers.tags import GalaxyTagManager
 from galaxy.openid.providers import OpenIDProviders
 from galaxy.queue_worker import GalaxyQueueWorker
@@ -96,6 +98,8 @@ def __init__(self, **kwargs):
         self.history_manager = HistoryManager(self)
         self.dependency_resolvers_view = DependencyResolversView(self)
         self.test_data_resolver = test_data.TestDataResolver(file_dirs=self.config.tool_test_data_directories)
+        self.library_folder_manager = FolderManager()
+        self.library_manager = LibraryManager()
 
         # Tool Data Tables
         self._configure_tool_data_tables(from_shed_config=False)
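
As a hedged illustration of what the two new app attributes enable, controllers can now reach library and folder management through the application object; the `get` calls below are my best recollection of these managers' API, not verbatim from this diff:

    # e.g. inside an API controller, with ids already decoded:
    folder = trans.app.library_folder_manager.get(trans, decoded_folder_id)
    library = trans.app.library_manager.get(trans, decoded_library_id)
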
22 changes: 17 additions & 5 deletions lib/galaxy/datatypes/sniff.py
@@ -14,6 +14,7 @@
 import zipfile
 
 from six import text_type
+from six.moves.urllib.request import urlopen
 
 from galaxy import util
 from galaxy.util import compression_utils
@@ -39,6 +40,12 @@ def get_test_fname(fname):
     return full_path
 
 
+def stream_url_to_file(path):
+    page = urlopen(path)  # page will be .close()ed in stream_to_file
+    temp_name = stream_to_file(page, prefix='url_paste', source_encoding=util.get_charset_from_http_headers(page.headers))
+    return temp_name
+
+
 def stream_to_open_named_file(stream, fd, filename, source_encoding=None, source_error='strict', target_encoding=None, target_error='strict'):
     """Writes a stream to the provided file descriptor, returns the file name. Closes file descriptor"""
     # signature and behavor is somewhat odd, due to backwards compatibility, but this can/should be done better
@@ -131,7 +138,7 @@ def convert_newlines(fname, in_place=True, tmp_dir=None, tmp_prefix="gxupload"):
     return (i, temp_name)
 
 
-def sep2tabs(fname, in_place=True, patt="\\s+"):
+def sep2tabs(fname, in_place=True, patt="\\s+", tmp_dir=None, tmp_prefix="gxupload"):
     """
     Transforms in place a 'sep' separated file to a tab separated one
 
@@ -143,13 +150,18 @@ def sep2tabs(fname, in_place=True, patt="\\s+"):
     '1\\t2\\n3\\t4\\n'
     """
     regexp = re.compile(patt)
-    fd, temp_name = tempfile.mkstemp()
+    fd, temp_name = tempfile.mkstemp(prefix=tmp_prefix, dir=tmp_dir)
     with os.fdopen(fd, "wt") as fp:
         i = None
         for i, line in enumerate(open(fname)):
-            line = line.rstrip('\r\n')
-            elems = regexp.split(line)
-            fp.write("%s\n" % '\t'.join(elems))
+            if line.endswith("\r"):
+                line = line.rstrip('\r')
+                elems = regexp.split(line)
+                fp.write("%s\r" % '\t'.join(elems))
+            else:
+                line = line.rstrip('\n')
+                elems = regexp.split(line)
+                fp.write("%s\n" % '\t'.join(elems))
     if i is None:
         i = 0
     else:
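
A hedged usage sketch for the updated sep2tabs signature: as I read the function, it returns a (line_count, temp_name) tuple like convert_newlines above, with no temp name surviving an in-place conversion, and the new arguments steer where the scratch file lives. The paths below are hypothetical:

    from galaxy.datatypes.sniff import sep2tabs

    # Rewrite whitespace-separated columns as tabs, in place, keeping the
    # intermediate temp file under an upload-specific scratch directory.
    line_count, _ = sep2tabs(
        '/tmp/upload_scratch/dataset_1.dat',  # hypothetical staged upload
        in_place=True,
        tmp_dir='/tmp/upload_scratch',
        tmp_prefix='gxupload',
    )
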
47 changes: 47 additions & 0 deletions lib/galaxy/datatypes/upload_util.py
@@ -0,0 +1,47 @@
+from galaxy.datatypes import sniff
+from galaxy.datatypes.binary import Binary
+
+
+class UploadProblemException(Exception):
+
+    def __init__(self, message):
+        self.message = message
+
+
+def handle_unsniffable_binary_check(data_type, ext, path, name, is_binary, requested_ext, check_content, registry):
+    """Return modified values of data_type and ext if unsniffable binary encountered.
+
+    Throw UploadProblemException if content problems or extension mismatches occur.
+
+    Precondition: check_binary called returned True.
+    """
+    if is_binary or registry.is_extension_unsniffable_binary(requested_ext):
+        # We have a binary dataset, but it is not Bam, Sff or Pdf
+        data_type = 'binary'
+        parts = name.split(".")
+        if len(parts) > 1:
+            ext = parts[-1].strip().lower()
+            is_ext_unsniffable_binary = registry.is_extension_unsniffable_binary(ext)
+            if check_content and not is_ext_unsniffable_binary:
+                raise UploadProblemException('The uploaded binary file contains inappropriate content')
+
+            elif is_ext_unsniffable_binary and requested_ext != ext:
+                err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % (ext, ext)
+                raise UploadProblemException(err_msg)
+    return data_type, ext
+
+
+def handle_sniffable_binary_check(data_type, ext, path, registry):
+    """Return modified values of data_type and ext if sniffable binary encountered.
+
+    Precondition: check_binary called returned True.
+    """
+    # Sniff the data type
+    guessed_ext = sniff.guess_ext(path, registry.sniff_order)
+    # Set data_type only if guessed_ext is a binary datatype
+    datatype = registry.get_datatype_by_extension(guessed_ext)
+    if isinstance(datatype, Binary):
+        data_type = guessed_ext
+        ext = guessed_ext
+
+    return data_type, ext
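
A hedged sketch of how these two helpers might be combined during upload type detection; the wrapper function and variable names are illustrative, not the PR's exact wiring, and `registry` is Galaxy's datatypes registry:

    from galaxy.datatypes.upload_util import (
        UploadProblemException,
        handle_sniffable_binary_check,
        handle_unsniffable_binary_check,
    )

    def resolve_binary_type(path, name, requested_ext, is_binary, check_content, registry):
        # Precondition for both helpers: check_binary() returned True for path.
        data_type, ext = None, requested_ext
        try:
            # Let registered sniffers claim the binary first (e.g. bam, sff).
            data_type, ext = handle_sniffable_binary_check(data_type, ext, path, registry)
            if data_type is None:
                # Fall back to extension-based rules for unsniffable binaries;
                # this raises UploadProblemException on bad content or an
                # extension mismatch.
                data_type, ext = handle_unsniffable_binary_check(
                    data_type, ext, path, name, is_binary, requested_ext,
                    check_content, registry,
                )
        except UploadProblemException as e:
            return None, None, e.message
        return data_type, ext, None
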
1 change: 1 addition & 0 deletions lib/galaxy/dependencies/pinned-requirements.txt
@@ -15,6 +15,7 @@ uWSGI==2.0.15
 pysam==0.14
 
 # pure Python packages
+bdbag==1.1.1
 bleach==2.1.3
 bz2file==0.98; python_version < '3.3'
 ipaddress==1.0.18; python_version < '3.3'
2 changes: 1 addition & 1 deletion lib/galaxy/jobs/__init__.py
@@ -1380,7 +1380,7 @@ def path_rewriter(path):
         collected_datasets = {
             'primary': self.tool.collect_primary_datasets(out_data, self.get_tool_provided_job_metadata(), tool_working_directory, input_ext, input_dbkey)
         }
-        self.tool.collect_dynamic_collections(
+        self.tool.collect_dynamic_outputs(
             out_collections,
             self.get_tool_provided_job_metadata(),
             job_working_directory=tool_working_directory,
19 changes: 12 additions & 7 deletions lib/galaxy/managers/collections.py
@@ -46,17 +46,22 @@ def __init__(self, app):
         self.tag_manager = tags.GalaxyTagManager(app.model.context)
         self.ldda_manager = lddas.LDDAManager(app)
 
-    def precreate_dataset_collection_instance(self, trans, parent, name, implicit_inputs, implicit_output_name, structure):
+    def precreate_dataset_collection_instance(self, trans, parent, name, structure, implicit_inputs=None, implicit_output_name=None):
         # TODO: prebuild all required HIDs and send them in so no need to flush in between.
-        dataset_collection = self.precreate_dataset_collection(structure)
+        dataset_collection = self.precreate_dataset_collection(structure, allow_unitialized_element=implicit_output_name is not None)
         instance = self._create_instance_for_collection(
             trans, parent, name, dataset_collection, implicit_inputs=implicit_inputs, implicit_output_name=implicit_output_name, flush=False
         )
         return instance
 
-    def precreate_dataset_collection(self, structure):
-        if structure.is_leaf or not structure.children_known:
-            return model.DatasetCollectionElement.UNINITIALIZED_ELEMENT
+    def precreate_dataset_collection(self, structure, allow_unitialized_element=True):
+        has_structure = not structure.is_leaf and structure.children_known
+        if not has_structure and allow_unitialized_element:
+            dataset_collection = model.DatasetCollectionElement.UNINITIALIZED_ELEMENT
+        elif not has_structure:
+            collection_type_description = structure.collection_type_description
+            dataset_collection = model.DatasetCollection(populated=False)
+            dataset_collection.collection_type = collection_type_description.collection_type
         else:
             collection_type_description = structure.collection_type_description
             dataset_collection = model.DatasetCollection(populated=False)
@@ -67,7 +72,7 @@ def precreate_dataset_collection(self, structure):
                 if substructure.is_leaf:
                     element = model.DatasetCollectionElement.UNINITIALIZED_ELEMENT
                 else:
-                    element = self.precreate_dataset_collection(substructure)
+                    element = self.precreate_dataset_collection(substructure, allow_unitialized_element=allow_unitialized_element)
 
                 element = model.DatasetCollectionElement(
                     element=element,
@@ -78,7 +83,7 @@ def precreate_dataset_collection(self, structure):
             dataset_collection.elements = elements
             dataset_collection.element_count = len(elements)
 
-            return dataset_collection
+        return dataset_collection
 
     def create(self, trans, parent, name, collection_type, element_identifiers=None,
                elements=None, implicit_collection_info=None, trusted_identifiers=None,
6 changes: 6 additions & 0 deletions lib/galaxy/model/__init__.py
@@ -2035,6 +2035,12 @@ def set_file_name(self, filename):
         return self.dataset.set_file_name(filename)
     file_name = property(get_file_name, set_file_name)
 
+    def link_to(self, path):
+        self.file_name = os.path.abspath(path)
+        # Since we are not copying the file into Galaxy's managed
+        # default file location, the dataset should never be purgable.
+        self.dataset.purgable = False
+
     @property
     def extra_files_path(self):
         return self.dataset.extra_files_path
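
A short hedged example of the new model helper in action, `hda` standing in for any dataset instance with an associated Dataset; the path is hypothetical:

    # Point the dataset at an existing file instead of copying it into
    # Galaxy's managed object store.
    hda.link_to('/data/shared/sample1.fastq')
    assert hda.dataset.purgable is False  # linked files must never be purged
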
12 changes: 8 additions & 4 deletions lib/galaxy/tools/__init__.py
@@ -103,6 +103,7 @@
 # Tools that require Galaxy's Python environment to be preserved.
 GALAXY_LIB_TOOLS_UNVERSIONED = [
     "upload1",
+    "__DATA_FETCH__",
     # Legacy tools bundled with Galaxy.
     "vcf_to_maf_customtrack1",
     "laj_1",
@@ -1107,7 +1108,10 @@ def parse_input_elem(self, page_source, enctypes, context=None):
                 group.file_type_name = elem.get('file_type_name', group.file_type_name)
                 group.default_file_type = elem.get('default_file_type', group.default_file_type)
                 group.metadata_ref = elem.get('metadata_ref', group.metadata_ref)
-                rval[group.file_type_name].refresh_on_change = True
+                try:
+                    rval[group.file_type_name].refresh_on_change = True
+                except KeyError:
+                    pass
                 group_page_source = XmlPageSource(elem)
                 group.inputs = self.parse_input_elem(group_page_source, enctypes, context)
                 rval[group.name] = group
@@ -1658,10 +1662,10 @@ def collect_primary_datasets(self, output, tool_provided_metadata, job_working_d
         """
         return output_collect.collect_primary_datasets(self, output, tool_provided_metadata, job_working_directory, input_ext, input_dbkey=input_dbkey)
 
-    def collect_dynamic_collections(self, output, tool_provided_metadata, **kwds):
-        """ Find files corresponding to dynamically structured collections.
+    def collect_dynamic_outputs(self, output, tool_provided_metadata, **kwds):
+        """Collect dynamic outputs associated with a job from this tool.
         """
-        return output_collect.collect_dynamic_collections(self, output, tool_provided_metadata, **kwds)
+        return output_collect.collect_dynamic_outputs(self, output, tool_provided_metadata, **kwds)
 
     def to_archive(self):
         tool = self