glue-viz · astrofrog · Aug 17, 2015 · Aug 15, 2015 · Aug 15, 2015 · Aug 15, 2015
diff --git a/CHANGES.md b/CHANGES.md
@@ -6,6 +6,11 @@ v0.6 (unreleased)
 
 * Added a GUI plugin manager in the 'Plugins' menu. [#682]
 
+* Data factories can now be given priorities to determine which ones should
+  take precedence in ambiguous cases. The ``set_default_factory`` and
+  ``get_default_factory`` functions are now deprecated since it is possible to
+  achieve this solely with priorities. [#719]
+
 v0.5.2 (2015-08-13)
 -------------------
 

diff --git a/doc/customizing_guide/customization.rst b/doc/customizing_guide/customization.rst
@@ -110,6 +110,13 @@ file type when loading data:
 If you open a file using this file type selection, Glue will pass the path of
 this file to your function, and use the resulting Data object.
 
+If you are defining a data factory that may clash with an existing one, for
+example if you are defining a loader for a specific type of FITS file, then
+make sure that the identifier function (e.g. ``is_jpeg`` above) returns ``True``
+only for that specific subset of FITS files. Then you can set the ``priority=``
+keyword in the ``@data_factory`` decorator. The value should be an integer or
+floating-point number, with larger numbers indicating a higher priority.
+
 For more examples of custom data loaders, see the `example repository
 <https://github.com/glue-viz/glue-data-loaders>`_.
 

diff --git a/glue/config.py b/glue/config.py
@@ -250,40 +250,60 @@ class DataFactoryRegistry(Registry):
     """Stores data factories. Data factories take filenames as input,
     and return :class:`~glue.core.data.Data` instances
 
-    The members property returns a list of (function, label, identifier)
-    namedtuples:
+    The members property returns a list of (function, label, identifier,
+    priority) namedtuples:
 
     - Function is the factory that creates the data object
     - label is a short human-readable description of the factory
     - identifier is a function that takes ``(filename, **kwargs)`` as input
       and returns True if the factory can open the file
+    - priority is a numerical value that indicates how confident the data
+      factory is that it should read the data, relative to other data
+      factories. For example, a highly specialized FITS reader for specific
+      FITS file types can be given a higher priority than the generic FITS
+      reader in order to take precedence over it.
 
     New data factories can be registered via::
 
-        @data_factory('label_name', identifier, default='txt')
+        @data_factory('label_name', identifier=identifier, priority=10)
         def new_factory(file_name):
             ...
-
-    This has the additional side-effect of associating
-    this this factory with filenames ending in ``txt`` by default
+
+    If not specified, the priority defaults to 0.
     """
-    item = namedtuple('DataFactory', 'function label identifier')
+
+    item = namedtuple('DataFactory', 'function label identifier priority')
 
     def default_members(self):
+
         from .core.data_factories import __factories__
-        return [self.item(f, f.label, f.identifier) for f in __factories__]
 
-    def __call__(self, label, identifier=None, default=''):
-        from .core.data_factories import set_default_factory
+        def get_priority(fact):
+            try:
+                return fact.priority
+            except AttributeError:
+                return 0
+
+        return [self.item(f, f.label, f.identifier, get_priority(f)) for f in __factories__]
+
+    def __call__(self, label, identifier=None, priority=None, default=''):
+
         if identifier is None:
             identifier = lambda *a, **k: False
 
+        if priority is None:
+            priority = 0
+
         def adder(func):
-            set_default_factory(default, func)
-            self.add(self.item(func, label, identifier))
+            self.add(self.item(func, label, identifier, priority))
             return func
+
         return adder
 
+    def __iter__(self):
+        for member in sorted(self.members, key=lambda x: (-x.priority, x.label)):
+            yield member
+
 
 class QtClientRegistry(Registry):
 

diff --git a/glue/core/data_factories/dendro_loader.py b/glue/core/data_factories/dendro_loader.py
@@ -7,6 +7,8 @@
 from astrodendro import Dendrogram
 from ..data import Data
 
+from .gridded import is_fits, is_hdf5
+
 __all__ = ['load_dendro']
 
 
@@ -34,3 +36,69 @@ def load_dendro(file):
     im = Data(intensity=dg.data, structure=dg.index_map)
     im.join_on_key(dendro, 'structure', dendro.pixel_component_ids[0])
     return [dendro, im]
+
+
+def is_dendro(file, **kwargs):
+
+    if is_hdf5(file):
+
+        import h5py
+
+        f = h5py.File(file, 'r')
+
+        return 'data' in f and 'index_map' in f and 'newick' in f
+
+    elif is_fits(file):
+
+        from ...external.astro import fits
+
+        hdulist = fits.open(file)
+
+        # In recent versions of Astropy, we could do 'DATA' in hdulist etc. but
+        # this doesn't work with Astropy 0.3, so we use the following method
+        # instead:
+        try:
+            hdulist['DATA']
+            hdulist['INDEX_MAP']
+            hdulist['NEWICK']
+        except KeyError:
+            pass  # continue
+        else:
+            return True
+
+        # For older versions of astrodendro, the HDUs did not have names
+
+        # Here we use heuristics to figure out if this is likely to be a
+        # dendrogram. Specifically, there should be three HDU extensions.
+        # The primary HDU should be empty, HDU 1 and HDU 2 should have
+        # matching shapes, and HDU 3 should have a 1D array. Also, if the
+        # HDUs do have names then this is not a dendrogram since the old
+        # files did not have names
+
+        # This branch can be removed once we think most dendrogram files
+        # will have HDU names.
+
+        if len(hdulist) != 4:
+            return False
+
+        if hdulist[1].name != '' or hdulist[2].name != '' or hdulist[3].name != '':
+            return False
+
+        if hdulist[0].data is not None:
+            return False
+
+        if hdulist[1].data is None or hdulist[2].data is None or hdulist[3].data is None:
+            return False
+
+        if hdulist[1].data.shape != hdulist[2].data.shape:
+            return False
+
+        if hdulist[3].data.ndim != 1:
+            return False
+
+        # We're probably ok, so return True
+        return True
+
+    else:
+
+        return False
diff --git a/glue/core/data_factories/dendrogram.py b/glue/core/data_factories/dendrogram.py
@@ -5,9 +5,11 @@
 __all__ = []
 
 try:
-    from .dendro_loader import load_dendro
-    __factories__.append(load_dendro)
-    load_dendro.label = 'Dendrogram'
-    load_dendro.identifier = has_extension('fits hdf5 h5')
+    from .dendro_loader import load_dendro, is_dendro
 except ImportError:
     pass
+else:
+    __factories__.append(load_dendro)
+    load_dendro.label = 'Dendrogram'
+    load_dendro.identifier = is_dendro
+    load_dendro.priority = 1000
diff --git a/glue/core/data_factories/excel.py b/glue/core/data_factories/excel.py
@@ -1,7 +1,7 @@
 from __future__ import absolute_import, division, print_function
 
 from .pandas import panda_process
-from .helpers import has_extension, __factories__, set_default_factory
+from .helpers import has_extension, __factories__
 
 __all__ = []
 
@@ -24,5 +24,3 @@ def panda_read_excel(path, sheet='Sheet1', **kwargs):
 panda_read_excel.label = "Excel"
 panda_read_excel.identifier = has_extension('xls xlsx')
 __factories__.append(panda_read_excel)
-set_default_factory('xls', panda_read_excel)
-set_default_factory('xlsx', panda_read_excel)
diff --git a/glue/core/data_factories/gridded.py b/glue/core/data_factories/gridded.py
@@ -5,7 +5,7 @@
 from ...utils import file_format
 from ..coordinates import coordinates_from_header
 
-from .helpers import set_default_factory, __factories__
+from .helpers import __factories__
 
 __all__ = ['is_casalike', 'gridded_data', 'casalike_cube']
 
@@ -71,10 +71,8 @@ def is_gridded_data(filename, **kwargs):
 
 gridded_data.label = "FITS/HDF5 Image"
 gridded_data.identifier = is_gridded_data
+gridded_data.priority = 2
 __factories__.append(gridded_data)
-set_default_factory('fits', gridded_data)
-set_default_factory('hd5', gridded_data)
-set_default_factory('hdf5', gridded_data)
 
 
 def casalike_cube(filename, **kwargs):

diff --git a/glue/core/data_factories/helpers.py b/glue/core/data_factories/helpers.py
@@ -17,17 +17,13 @@
 
 5) The function is added to the __factories__ list
 
-6) Optionally, the function is registered to open a given extension by
-default by calling set_default_factory
-
 Putting this together, the simplest data factory code looks like this::
 
     def dummy_factory(file_name):
         return glue.core.Data()
     dummy_factory.label = "Foo file"
     dummy_factory.identifier = has_extension('foo FOO')
     __factories__.append(dummy_factory)
-    set_default_factory("foo", dummy_factory)
 """
 
 from __future__ import absolute_import, division, print_function
@@ -42,13 +38,12 @@ def dummy_factory(file_name):
 from ..contracts import contract
 
 __all__ = ['FileWatcher', 'LoadLog',
-           'auto_data', 'data_label', 'find_factory', 'get_default_factory',
-           'has_extension', 'load_data', 'set_default_factory',
+           'auto_data', 'data_label', 'find_factory',
+           'has_extension', 'load_data',
            '_extension', '__factories__']
 
 
 __factories__ = []
-_default_factory = {}
 
 
 def _extension(path):
@@ -271,49 +266,62 @@ def data_label(path):
 
 
 @contract(extension='string', factory='callable')
-def set_default_factory(extension, factory):
-    """Register an extension that should be handled by a factory by default
-
-    :param extension: File extension (do not include the '.')
-    :param factory: The factory function to dispatch to
-    """
-    for ex in extension.split():
-        _default_factory[ex] = factory
+def set_default_factory(extension, factory):  # pragma: no cover
+    warnings.warn("set_default_factory is deprecated and no longer has any effect")
 
 
 @contract(extension='string', returns='callable|None')
-def get_default_factory(extension):
-    """Return the default factory function to read a given file extension.
-
-    :param extension: The extension to lookup
-
-    :rtype: A factory function, or None if the extension has no default
-    """
-    try:
-        return _default_factory[extension]
-    except KeyError:
-        return None
+def get_default_factory(extension):  # pragma: no cover
+    warnings.warn("get_default_factory is deprecated and will always return None")
+    return None
 
 
 @contract(filename='string')
 def find_factory(filename, **kwargs):
+
     from ...config import data_factory
 
-    # on first pass, only try the default factory
-    default = _default_factory.get(_extension(filename))
-    for func, _, identifier in data_factory:
-        if func is auto_data:
+    # We no longer try the 'default' factory first because we actually need to
+    # try all identifiers and select the one to use based on the priority. This
+    # allows more specialized loaders to take priority over more general ones.
+    # For example, a FITS file that is a dendrogram should be loaded as a
+    # dendrogram, not a plain FITS file.
+
+    best_priority = None
+    valid_formats = []
+
+    # Iterating over the data factory returns the formats sorted by decreasing
+    # priority, then by label (alphabetically), in order to be deterministic.
+    # This is implemented in DataFactoryRegistry.__iter__.
+
+    for df in data_factory:
+
+        # Once we've found a match, and iterated through the rest of the
+        # importers with the same priority, we can exit the loop.
+        if best_priority is not None and df.priority < best_priority:
+            break
+
+        if df.function is auto_data:
             continue
-        if (func is default) and identifier(filename, **kwargs):
-            return func
 
-    # if that fails, try everything
-    for func, _, identifier in data_factory:
-        if func is auto_data:
+        try:
+            is_format = df.identifier(filename, **kwargs)
+        except ImportError:  # dependencies missing
             continue
-        if identifier(filename, **kwargs):
-            return func
 
+        if is_format:
+            valid_formats.append(df)
+            best_priority = df.priority
+
+    if len(valid_formats) == 0:
+        return None
+    elif len(valid_formats) > 1:
+        labels = ["'{0}'".format(x.label) for x in valid_formats]
+        warnings.warn("Multiple data factories matched the input: {0}. Choosing {1}.".format(', '.join(labels), labels[0]))
+
+    func = valid_formats[0].function
+
+    return func
 
 @contract(filename='string')
 def auto_data(filename, *args, **kwargs):

diff --git a/glue/core/data_factories/io.py b/glue/core/data_factories/io.py
@@ -77,7 +77,8 @@ def extract_hdf5_datasets(handle):
             for key in sub_datasets:
                 datasets[key] = sub_datasets[key]
         elif isinstance(handle[group], h5py.highlevel.Dataset):
-            datasets[handle[group].name] = handle[group]
+            if handle[group].dtype.kind in ('f', 'i'):
+                datasets[handle[group].name] = handle[group]
     return datasets