Addressing #420 (for CCI Toolbox) #421

Merged · 12 commits · Mar 17, 2021
15 changes: 13 additions & 2 deletions CHANGES.md
@@ -1,5 +1,16 @@
## Changes in 0.7.1

## Changes in 0.8.0.dev1 (in development)

* Changed behaviour and signature of `xcube.core.store.DataStore.get_data_ids()`.
The keyword argument `include_titles: bool = True` has been replaced by
`include_attrs: Sequence[str] = None`, and the return value changes accordingly:
- If `include_attrs` is None (the default), the method returns an iterator
of dataset identifiers *data_id* of type `str`.
- If `include_attrs` is a sequence of attribute names, the method returns
an iterator of tuples (*data_id*, *attrs*) of type `Tuple[str, Dict]`.
Hence `include_attrs` can be used to obtain a minimum set of dataset
metadata attributes for each returned *data_id*.
However, `include_attrs` is not yet implemented in the "s3",
"memory", and "directory" data stores. (#420)
* Dataset normalisation no longer reorders latitude coordinates into increasing
order, as this creates datasets that are no longer writable
to Zarr. (#347)
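For context, a minimal usage sketch of the two call forms described above (`store` stands for any data store instance implementing the new interface; printed values are illustrative):

```python
# Default form: get_data_ids() yields plain identifiers of type str.
for data_id in store.get_data_ids():
    print(data_id)                          # e.g. "cube.nc"

# New form: passing include_attrs yields (data_id, attrs) tuples.
# A store that cannot resolve a requested attribute omits it from
# attrs, so attrs may be an empty dict.
for data_id, attrs in store.get_data_ids(include_attrs=['title']):
    print(data_id, attrs.get('title'))      # title may be None
```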
7 changes: 6 additions & 1 deletion test/core/store/accessors/test_gdf.py
@@ -16,7 +16,12 @@ def setUp(self) -> None:
})

def test_get_data_ids(self):
self.assertEqual({('cube_1',None), ('cube_2', None)}, set(self.data_store.get_data_ids()))
self.assertEqual({'cube_1', 'cube_2'},
set(self.data_store.get_data_ids()))
list_with_attrs = list(self.data_store.get_data_ids(include_attrs=[]))
self.assertEqual(2, len(list_with_attrs))
self.assertIn(('cube_1', {}), list_with_attrs)
self.assertIn(('cube_2', {}), list_with_attrs)

def test_open_data(self):
cube_1 = self.data_store.open_data('cube_1')
43 changes: 21 additions & 22 deletions test/core/store/stores/test_directory.py
@@ -134,51 +134,50 @@ def test_get_data_writer_ids(self):
def test_get_data_ids(self):
self.assertEqual(
{
('cube-1-250-250.zarr', None),
('cube-5-100-200.zarr', None),
('cube.nc', None),
'cube-1-250-250.zarr',
'cube-5-100-200.zarr',
'cube.nc',
},
set(self.store.get_data_ids())
)
self.assertEqual(
{
('cube-1-250-250.zarr', None),
('cube-5-100-200.zarr', None),
('cube.nc', None),
'cube-1-250-250.zarr',
'cube-5-100-200.zarr',
'cube.nc',
},
set(self.store.get_data_ids('*'))
)
self.assertEqual(
{
('cube-1-250-250.zarr', None),
('cube-5-100-200.zarr', None),
('cube.nc', None),
'cube-1-250-250.zarr',
'cube-5-100-200.zarr',
'cube.nc',
},
set(self.store.get_data_ids('dataset'))
)
self.assertEqual(
set(),
set(self.store.get_data_ids('dataset[multilevel]'))
)
data_ids_list = list(self.store.get_data_ids(include_attrs=["title"]))
self.assertEqual(3, len(data_ids_list))
# Note: although we request "title" to be included,
# DirectoryStore does not provide it yet.
self.assertIn(('cube-1-250-250.zarr', {}), data_ids_list)
self.assertIn(('cube-5-100-200.zarr', {}), data_ids_list)
self.assertIn(('cube.nc', {}), data_ids_list)
self.assertEqual(
{
('cube-1-250-250.zarr', None),
('cube-5-100-200.zarr', None),
('cube.nc', None),
'cube-1-250-250.zarr',
'cube-5-100-200.zarr',
'cube.nc',
},
set(self.store.get_data_ids(include_titles=False))
)
self.assertEqual(
{
('cube-1-250-250.zarr', None),
('cube-5-100-200.zarr', None),
('cube.nc', None),
},
set(self.store.get_data_ids('dataset', include_titles=False))
set(self.store.get_data_ids('dataset'))
)
self.assertEqual(
set(),
set(self.store.get_data_ids('dataset[multilevel]', include_titles=False))
set(self.store.get_data_ids('dataset[multilevel]'))
)

def test_has_data(self):
8 changes: 5 additions & 3 deletions test/core/store/stores/test_memory.py
Expand Up @@ -43,9 +43,11 @@ def test_get_type_specifiers_for_data(self):
self.assertEqual('Data resource "geodataframe_2" does not exist in store', f'{cm.exception}')

def test_get_data_ids(self):
self.assertEqual({('cube_1', None), ('cube_2', None), ('ds_1', None)}, set(self.store.get_data_ids()))
self.assertEqual({('cube_1', None), ('cube_2', None), ('ds_1', None)},
set(self.store.get_data_ids(include_titles=False)))
data_ids_list = list(self.store.get_data_ids(include_attrs=[]))
self.assertEqual(3, len(data_ids_list))
self.assertIn(('cube_1', {}), data_ids_list)
self.assertIn(('cube_2', {}), data_ids_list)
self.assertIn(('ds_1', {}), data_ids_list)

def test_has_data(self):
self.assertEqual(True, self.store.has_data('cube_1'))
4 changes: 2 additions & 2 deletions xcube/cli/io.py
Expand Up @@ -375,8 +375,8 @@ def _dump_store_writers(data_store: 'xcube.core.store.DataStore') -> int:
# noinspection PyUnresolvedReferences
def _dump_store_data_ids(data_store: 'xcube.core.store.DataStore') -> int:
count = 0
for data_id, title in data_store.get_data_ids():
print(f' {data_id:>32s} {title or _NO_TITLE}')
for data_id, data_attrs in data_store.get_data_ids(include_attrs=['title']):
print(f' {data_id:>32s} {data_attrs.get("title") or _NO_TITLE}')
count += 1
return count

2 changes: 1 addition & 1 deletion xcube/core/store/search.py
Expand Up @@ -37,5 +37,5 @@ def search_data(self, type_specifier: str = None, **search_params) -> Iterator[D
"""
if search_params:
raise DataStoreError(f'Unsupported search parameters: {", ".join(search_params.keys())}')
for data_id, _ in self.get_data_ids(type_specifier=type_specifier, include_titles=False):
for data_id in self.get_data_ids(type_specifier=type_specifier):
yield self.describe_data(data_id)
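Since the default `search_data()` now consumes plain identifiers, a call-site sketch (assuming `store` is a data store using this default implementation):

```python
# Each yielded item is a DataDescriptor for one data resource.
for descriptor in store.search_data(type_specifier='dataset'):
    print(descriptor.data_id)
```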
47 changes: 34 additions & 13 deletions xcube/core/store/store.py
Expand Up @@ -20,7 +20,7 @@
# SOFTWARE.

from abc import abstractmethod, ABC
from typing import Iterator, Tuple, Any, Optional, List, Type
from typing import Iterator, Tuple, Any, Optional, List, Type, Dict, Union, Container

from xcube.constants import EXTENSION_POINT_DATA_STORES
from xcube.util.extension import Extension
@@ -153,22 +153,43 @@ def get_type_specifiers_for_data(self, data_id: str) -> Tuple[str, ...]:
"""

@abstractmethod
def get_data_ids(self, type_specifier: str = None, include_titles: bool = True) -> \
Iterator[Tuple[str, Optional[str]]]:
def get_data_ids(self,
type_specifier: str = None,
include_attrs: Container[str] = None) -> \
Union[Iterator[str], Iterator[Tuple[str, Dict[str, Any]]]]:
"""
Get an iterator over the data resource identifiers for the given type *type_specifier*.
If *type_specifier* is omitted, all data resource identifiers are returned.

If a store implementation supports only a single data type, it should verify that *type_specifier*
is either None or compatible with the supported data type.

The returned iterator items are 2-tuples of the form (*data_id*, *title*), where *data_id*
is the actual data identifier and *title* is an optional, human-readable title for the data.
If *include_titles* is false, the second item of the result tuple will be None.

:param type_specifier: If given, only data identifiers that are available as this type are returned. If this is
omitted, all available data identifiers are returned.
:param include_titles: If true, the store will attempt to also provide a title.
If a store implementation supports only a single data type, it should verify that
*type_specifier* is either None or compatible with the supported data type.
[Inline review comment from a Contributor] Suggested rewording: "If a store implementation supports only a single data type, it should verify that type_specifier is either None or compatible with the supported data type." -> "A store implementation should verify that type_specifier is either None or compatible with a supported data type."

If *include_attrs* is provided, it must be a sequence of names of metadata attributes.
The store will then return extra metadata for each returned data resource identifier,
as tuples (*data_id*, *attrs*), where *attrs* contains the requested metadata attributes.

Hence, the type of the returned iterator items depends on the value of *include_attrs*:

- If *include_attrs* is None (the default), the method returns an iterator
of dataset identifiers *data_id* of type `str`.
- If *include_attrs* is a sequence of attribute names, even an empty one,
the method returns an iterator of tuples (*data_id*, *attrs*) of type
`Tuple[str, Dict]`, where *attrs* is a dictionary filled according to the
names in *include_attrs*. If a store cannot provide a given attribute, it
should simply ignore it. This may even result in an empty dictionary for a given
*data_id*.

The individual attributes do not have to exist in the dataset's metadata; they may also be
generated on-the-fly. An example of a generic attribute name is "title".
A store should try to resolve ```include_attrs=["title"]``` by returning items such as
```("ESACCI-L4_GHRSST-SSTdepth-OSTIA-GLOB_CDR2.1-v02.0-fv01.0.zarr",
{"title": "Level-4 GHRSST Analysed Sea Surface Temperature"})```.

:param type_specifier: If given, only data identifiers that are available as this type are returned.
If this is omitted, all available data identifiers are returned.
:param include_attrs: A sequence of names of attributes to be returned for each dataset identifier.
If given, the store will attempt to provide the set of requested dataset attributes in addition to the data ids.
(added in xcube 0.8.0)
:return: An iterator over data resource identifiers, or over (*data_id*, *attrs*) tuples if *include_attrs* is given.
:raise DataStoreError: If an error occurs.
"""
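To illustrate the contract above, a minimal sketch of a conforming implementation (the class and its `_index` contents are hypothetical; note that `Container[str]` only guarantees membership tests, so the sketch filters known attributes with `in` rather than iterating over *include_attrs*):

```python
from typing import Any, Container, Dict, Iterator, Tuple, Union


class DemoStore:
    """Hypothetical store, shown only to illustrate get_data_ids()."""

    def __init__(self):
        # Maps data_id -> all metadata attributes known to this store.
        self._index = {
            'cube_1.zarr': {'title': 'Cube 1'},
            'cube_2.zarr': {},  # no metadata available
        }

    def get_data_ids(self,
                     type_specifier: str = None,
                     include_attrs: Container[str] = None) -> \
            Union[Iterator[str], Iterator[Tuple[str, Dict[str, Any]]]]:
        for data_id, all_attrs in self._index.items():
            if include_attrs is None:
                yield data_id
            else:
                # Keep only the requested attributes; names the store
                # cannot resolve are silently ignored, so attrs may be {}.
                attrs = {name: value for name, value in all_attrs.items()
                         if name in include_attrs}
                yield data_id, attrs
```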
12 changes: 8 additions & 4 deletions xcube/core/store/stores/directory.py
Expand Up @@ -21,7 +21,7 @@

import os.path
import uuid
from typing import Optional, Iterator, Any, Tuple, List
from typing import Optional, Iterator, Any, Tuple, List, Dict, Union, Container

import geopandas as gpd
import xarray as xr
@@ -116,8 +116,12 @@ def get_type_specifiers_for_data(self, data_id: str) -> Tuple[str, ...]:
actual_type_specifier, _, _ = self._get_accessor_id_parts(data_id)
return actual_type_specifier,

def get_data_ids(self, type_specifier: str = None, include_titles: bool = True) -> \
Iterator[Tuple[str, Optional[str]]]:
def get_data_ids(self,
type_specifier: str = None,
include_attrs: Container[str] = None) -> \
Union[Iterator[str], Iterator[Tuple[str, Dict[str, Any]]]]:
# TODO: do not ignore names in include_attrs
return_tuples = include_attrs is not None
if type_specifier is not None:
type_specifier = TypeSpecifier.normalize(type_specifier)
# TODO: Use os.walk(), which provides a generator rather than a list
@@ -127,7 +131,7 @@ def get_data_ids(self, type_specifier: str = None, include_titles: bool = True)
actual_type_specifier = self._get_type_specifier_for_data_id(data_id, require=False)
if actual_type_specifier is not None:
if type_specifier is None or actual_type_specifier.satisfies(type_specifier):
yield data_id, None
yield (data_id, {}) if return_tuples else data_id

def has_data(self, data_id: str, type_specifier: str = None) -> bool:
assert_given(data_id, 'data_id')
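One possible direction for the TODO above, sketched as a helper method (not part of this PR; `_read_dataset_attrs` is a hypothetical way to read a resource's metadata without fully opening it):

```python
def _attrs_for_data_id(self, data_id: str,
                       include_attrs: Container[str]) -> Dict[str, Any]:
    # Fill only the requested attributes; names this store cannot
    # resolve are ignored, possibly leaving attrs empty.
    attrs: Dict[str, Any] = {}
    if 'title' in include_attrs:
        ds_attrs = self._read_dataset_attrs(data_id)  # hypothetical helper
        if 'title' in ds_attrs:
            attrs['title'] = ds_attrs['title']
    return attrs
```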
14 changes: 9 additions & 5 deletions xcube/core/store/stores/memory.py
Expand Up @@ -20,7 +20,7 @@
# SOFTWARE.

import uuid
from typing import Iterator, Dict, Any, Optional, Tuple, Mapping
from typing import Iterator, Dict, Any, Optional, Tuple, Mapping, Union, Container

from xcube.core.store import DataDescriptor
from xcube.core.store import DataStoreError
@@ -66,17 +66,21 @@ def get_type_specifiers_for_data(self, data_id: str) -> Tuple[str, ...]:
type_specifier = get_type_specifier(self._data_dict[data_id])
return str(type_specifier),

def get_data_ids(self, type_specifier: str = None, include_titles: bool = True) -> Iterator[
Tuple[str, Optional[str]]]:
def get_data_ids(self,
type_specifier: str = None,
include_attrs: Container[str] = None) -> \
Union[Iterator[str], Iterator[Tuple[str, Dict[str, Any]]]]:
# TODO: do not ignore names in include_attrs
return_tuples = include_attrs is not None
if type_specifier is None:
for data_id, data in self._data_dict.items():
yield data_id, None
yield (data_id, {}) if return_tuples else data_id
else:
type_specifier = TypeSpecifier.normalize(type_specifier)
for data_id, data in self._data_dict.items():
data_type_specifier = get_type_specifier(data)
if data_type_specifier is None or data_type_specifier.satisfies(type_specifier):
yield data_id, None
yield (data_id, {}) if return_tuples else data_id

def has_data(self, data_id: str, type_specifier: str = None) -> bool:
assert_given(data_id, 'data_id')
14 changes: 10 additions & 4 deletions xcube/core/store/stores/s3.py
Expand Up @@ -22,7 +22,7 @@
import json
import os.path
import uuid
from typing import Optional, Iterator, Any, Tuple, List
from typing import Optional, Iterator, Any, Tuple, List, Dict, Union, Container

import s3fs
import xarray as xr
@@ -124,13 +124,19 @@ def get_type_specifiers_for_data(self, data_id: str) -> Tuple[str, ...]:
data_type_specifier, _, _ = self._get_accessor_id_parts(data_id)
return data_type_specifier,

def get_data_ids(self, type_specifier: str = None, include_titles=True) -> Iterator[Tuple[str, Optional[str]]]:
# todo do not ignore type_specifier
def get_data_ids(self,
type_specifier: str = None,
include_attrs: Container[str] = None) -> \
Union[Iterator[str], Iterator[Tuple[str, Dict[str, Any]]]]:
# TODO: do not ignore type_specifier
# TODO: do not ignore names in include_attrs
return_tuples = include_attrs is not None
prefix = self._bucket_name + '/'
first_index = len(prefix)
for item in self._s3.listdir(self._bucket_name, detail=False):
if item.startswith(prefix):
yield item[first_index:], None
data_id = item[first_index:]
yield (data_id, {}) if return_tuples else data_id

def has_data(self, data_id: str, type_specifier: str = None) -> bool:
if data_id in self._registry:
2 changes: 1 addition & 1 deletion xcube/version.py
Expand Up @@ -19,4 +19,4 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

version = '0.7.1'
version = '0.8.0.dev1'