Addressing #420 (for CCI Toolbox) #421

Merged · 12 commits · Mar 17, 2021
15 changes: 13 additions & 2 deletions CHANGES.md
@@ -1,5 +1,16 @@
## Changes in 0.7.1

## Changes in 0.8.0.dev1 (in development)

* Changed behaviour and signature of `xcube.core.store.DataStore.get_data_ids()`.
The keyword argument `include_titles: bool = True` has been replaced by
`include_attrs: Sequence[str] = None`, and the return value changes accordingly:
- If `include_attrs` is None (the default), the method returns an iterator
of dataset identifiers *data_id* of type `str`.
- If `include_attrs` is a sequence of attribute names, the method returns
an iterator of tuples (*data_id*, *attrs*) of type `Tuple[str, Dict]`.
Hence `include_attrs` can be used to obtain a minimum set of dataset
metadata attributes for each returned *data_id*.
However, `include_attrs` is not yet implemented in the "s3",
"memory", and "directory" data stores. (#420)
* Dataset normalisation no longer reorders latitude coordinates into increasing
order, as this creates datasets that are no longer writable
to Zarr. (#347)
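For context, a minimal usage sketch of the two call forms described above (`store` stands for any data store instance implementing the new interface; printed values are illustrative):

```python
# Default form: get_data_ids() yields plain identifiers of type str.
for data_id in store.get_data_ids():
    print(data_id)                          # e.g. "cube.nc"

# New form: passing include_attrs yields (data_id, attrs) tuples.
# A store that cannot resolve a requested attribute omits it from
# attrs, so attrs may be an empty dict.
for data_id, attrs in store.get_data_ids(include_attrs=['title']):
    print(data_id, attrs.get('title'))      # title may be None
```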
7 changes: 6 additions & 1 deletion test/core/store/accessors/test_gdf.py
@@ -16,7 +16,12 @@ def setUp(self) -> None:
})

def test_get_data_ids(self):
self.assertEqual({('cube_1',None), ('cube_2', None)}, set(self.data_store.get_data_ids()))
self.assertEqual({'cube_1', 'cube_2'},
set(self.data_store.get_data_ids()))
list_with_attrs = list(self.data_store.get_data_ids(include_attrs=[]))
self.assertEqual(2, len(list_with_attrs))
self.assertIn(('cube_1', {}), list_with_attrs)
self.assertIn(('cube_2', {}), list_with_attrs)

def test_open_data(self):
cube_1 = self.data_store.open_data('cube_1')
43 changes: 21 additions & 22 deletions test/core/store/stores/test_directory.py
@@ -134,51 +134,50 @@ def test_get_data_writer_ids(self):
def test_get_data_ids(self):
self.assertEqual(
{
('cube-1-250-250.zarr', None),
('cube-5-100-200.zarr', None),
('cube.nc', None),
'cube-1-250-250.zarr',
'cube-5-100-200.zarr',
'cube.nc',
},
set(self.store.get_data_ids())
)
self.assertEqual(
{
('cube-1-250-250.zarr', None),
('cube-5-100-200.zarr', None),
('cube.nc', None),
'cube-1-250-250.zarr',
'cube-5-100-200.zarr',
'cube.nc',
},
set(self.store.get_data_ids('*'))
)
self.assertEqual(
{
('cube-1-250-250.zarr', None),
('cube-5-100-200.zarr', None),
('cube.nc', None),
'cube-1-250-250.zarr',
'cube-5-100-200.zarr',
'cube.nc',
},
set(self.store.get_data_ids('dataset'))
)
self.assertEqual(
set(),
set(self.store.get_data_ids('dataset[multilevel]'))
)
data_ids_list = list(self.store.get_data_ids(include_attrs=["title"]))
self.assertEqual(3, len(data_ids_list))
# Note: although we request "title" to be included,
# DirectoryStore does not provide it yet.
self.assertIn(('cube-1-250-250.zarr', {}), data_ids_list)
self.assertIn(('cube-5-100-200.zarr', {}), data_ids_list)
self.assertIn(('cube.nc', {}), data_ids_list)
self.assertEqual(
{
('cube-1-250-250.zarr', None),
('cube-5-100-200.zarr', None),
('cube.nc', None),
'cube-1-250-250.zarr',
'cube-5-100-200.zarr',
'cube.nc',
},
set(self.store.get_data_ids(include_titles=False))
)
self.assertEqual(
{
('cube-1-250-250.zarr', None),
('cube-5-100-200.zarr', None),
('cube.nc', None),
},
set(self.store.get_data_ids('dataset', include_titles=False))
set(self.store.get_data_ids('dataset'))
)
self.assertEqual(
set(),
set(self.store.get_data_ids('dataset[multilevel]', include_titles=False))
set(self.store.get_data_ids('dataset[multilevel]'))
)

def test_has_data(self):
8 changes: 5 additions & 3 deletions test/core/store/stores/test_memory.py
Expand Up @@ -43,9 +43,11 @@ def test_get_type_specifiers_for_data(self):
self.assertEqual('Data resource "geodataframe_2" does not exist in store', f'{cm.exception}')

def test_get_data_ids(self):
self.assertEqual({('cube_1', None), ('cube_2', None), ('ds_1', None)}, set(self.store.get_data_ids()))
self.assertEqual({('cube_1', None), ('cube_2', None), ('ds_1', None)},
set(self.store.get_data_ids(include_titles=False)))
data_ids_list = list(self.store.get_data_ids(include_attrs=[]))
self.assertEqual(3, len(data_ids_list))
self.assertIn(('cube_1', {}), data_ids_list)
self.assertIn(('cube_2', {}), data_ids_list)
self.assertIn(('ds_1', {}), data_ids_list)

def test_has_data(self):
self.assertEqual(True, self.store.has_data('cube_1'))
4 changes: 2 additions & 2 deletions xcube/cli/io.py
Expand Up @@ -375,8 +375,8 @@ def _dump_store_writers(data_store: 'xcube.core.store.DataStore') -> int:
# noinspection PyUnresolvedReferences
def _dump_store_data_ids(data_store: 'xcube.core.store.DataStore') -> int:
count = 0
for data_id, title in data_store.get_data_ids():
print(f' {data_id:>32s} {title or _NO_TITLE}')
for data_id, data_attrs in data_store.get_data_ids(include_attrs=['title']):
print(f' {data_id:>32s} {data_attrs.get("title") or _NO_TITLE}')
count += 1
return count

2 changes: 1 addition & 1 deletion xcube/core/store/search.py
Expand Up @@ -37,5 +37,5 @@ def search_data(self, type_specifier: str = None, **search_params) -> Iterator[D
"""
if search_params:
raise DataStoreError(f'Unsupported search parameters: {", ".join(search_params.keys())}')
for data_id, _ in self.get_data_ids(type_specifier=type_specifier, include_titles=False):
for data_id in self.get_data_ids(type_specifier=type_specifier):
yield self.describe_data(data_id)
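Since the default `search_data()` now consumes plain identifiers, a call-site sketch (assuming `store` is a data store using this default implementation):

```python
# Each yielded item is a DataDescriptor for one data resource.
for descriptor in store.search_data(type_specifier='dataset'):
    print(descriptor.data_id)
```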
47 changes: 34 additions & 13 deletions xcube/core/store/store.py
Expand Up @@ -20,7 +20,7 @@
# SOFTWARE.

from abc import abstractmethod, ABC
from typing import Iterator, Tuple, Any, Optional, List, Type
from typing import Iterator, Tuple, Any, Optional, List, Type, Dict, Union, Container

from xcube.constants import EXTENSION_POINT_DATA_STORES
from xcube.util.extension import Extension
@@ -153,22 +153,43 @@ def get_type_specifiers_for_data(self, data_id: str) -> Tuple[str, ...]:
"""

@abstractmethod
def get_data_ids(self, type_specifier: str = None, include_titles: bool = True) -> \
Iterator[Tuple[str, Optional[str]]]:
def get_data_ids(self,
type_specifier: str = None,
include_attrs: Container[str] = None) -> \
Union[Iterator[str], Iterator[Tuple[str, Dict[str, Any]]]]:
"""
Get an iterator over the data resource identifiers for the given type *type_specifier*.
If *type_specifier* is omitted, all data resource identifiers are returned.

If a store implementation supports only a single data type, it should verify that *type_specifier*
is either None or compatible with the supported data type.

The returned iterator items are 2-tuples of the form (*data_id*, *title*), where *data_id*
is the actual data identifier and *title* is an optional, human-readable title for the data.
If *include_titles* is false, the second item of the result tuple will be None.

:param type_specifier: If given, only data identifiers that are available as this type are returned. If this is
omitted, all available data identifiers are returned.
:param include_titles: If true, the store will attempt to also provide a title.
If a store implementation supports only a single data type, it should verify that
*type_specifier* is either None or compatible with the supported data type.
[Inline review comment from a Contributor] Suggested rewording: "If a store implementation supports only a single data type, it should verify that type_specifier is either None or compatible with the supported data type." -> "A store implementation should verify that type_specifier is either None or compatible with a supported data type."

If *include_attrs* is provided, it must be a sequence of names of metadata attributes.
The store will then return extra metadata for each returned data resource identifier,
as tuples (*data_id*, *attrs*), where *attrs* contains the requested metadata attributes.

Hence, the type of the returned iterator items depends on the value of *include_attrs*:

- If *include_attrs* is None (the default), the method returns an iterator
of dataset identifiers *data_id* of type `str`.
- If *include_attrs* is a sequence of attribute names, even an empty one,
the method returns an iterator of tuples (*data_id*, *attrs*) of type
`Tuple[str, Dict]`, where *attrs* is a dictionary filled according to the
names in *include_attrs*. If a store cannot provide a given attribute, it
should simply ignore it. This may even result in an empty dictionary for a given
*data_id*.

The individual attributes do not have to exist in the dataset's metadata; they may also be
generated on-the-fly. An example of a generic attribute name is "title".
A store should try to resolve ```include_attrs=["title"]``` by returning items such as
```("ESACCI-L4_GHRSST-SSTdepth-OSTIA-GLOB_CDR2.1-v02.0-fv01.0.zarr",
{"title": "Level-4 GHRSST Analysed Sea Surface Temperature"})```.

:param type_specifier: If given, only data identifiers that are available as this type are returned.
If this is omitted, all available data identifiers are returned.
:param include_attrs: A sequence of names of attributes to be returned for each dataset identifier.
If given, the store will attempt to provide the set of requested dataset attributes in addition to the data ids.
(added in xcube 0.8.0)
:return: An iterator over data resource identifiers, or over (*data_id*, *attrs*) tuples if *include_attrs* is given.
:raise DataStoreError: If an error occurs.
"""
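To illustrate the contract above, a minimal sketch of a conforming implementation (the class and its `_index` contents are hypothetical; note that `Container[str]` only guarantees membership tests, so the sketch filters known attributes with `in` rather than iterating over *include_attrs*):

```python
from typing import Any, Container, Dict, Iterator, Tuple, Union


class DemoStore:
    """Hypothetical store, shown only to illustrate get_data_ids()."""

    def __init__(self):
        # Maps data_id -> all metadata attributes known to this store.
        self._index = {
            'cube_1.zarr': {'title': 'Cube 1'},
            'cube_2.zarr': {},  # no metadata available
        }

    def get_data_ids(self,
                     type_specifier: str = None,
                     include_attrs: Container[str] = None) -> \
            Union[Iterator[str], Iterator[Tuple[str, Dict[str, Any]]]]:
        for data_id, all_attrs in self._index.items():
            if include_attrs is None:
                yield data_id
            else:
                # Keep only the requested attributes; names the store
                # cannot resolve are silently ignored, so attrs may be {}.
                attrs = {name: value for name, value in all_attrs.items()
                         if name in include_attrs}
                yield data_id, attrs
```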
12 changes: 8 additions & 4 deletions xcube/core/store/stores/directory.py
Expand Up @@ -21,7 +21,7 @@

import os.path
import uuid
from typing import Optional, Iterator, Any, Tuple, List
from typing import Optional, Iterator, Any, Tuple, List, Dict, Union, Container

import geopandas as gpd
import xarray as xr
@@ -116,8 +116,12 @@ def get_type_specifiers_for_data(self, data_id: str) -> Tuple[str, ...]:
actual_type_specifier, _, _ = self._get_accessor_id_parts(data_id)
return actual_type_specifier,

def get_data_ids(self, type_specifier: str = None, include_titles: bool = True) -> \
Iterator[Tuple[str, Optional[str]]]:
def get_data_ids(self,
type_specifier: str = None,
include_attrs: Container[str] = None) -> \
Union[Iterator[str], Iterator[Tuple[str, Dict[str, Any]]]]:
# TODO: do not ignore names in include_attrs
return_tuples = include_attrs is not None
if type_specifier is not None:
type_specifier = TypeSpecifier.normalize(type_specifier)
# TODO: Use os.walk(), which provides a generator rather than a list
@@ -127,7 +131,7 @@ def get_data_ids(self, type_specifier: str = None, include_titles: bool = True)
actual_type_specifier = self._get_type_specifier_for_data_id(data_id, require=False)
if actual_type_specifier is not None:
if type_specifier is None or actual_type_specifier.satisfies(type_specifier):
yield data_id, None
yield (data_id, {}) if return_tuples else data_id

def has_data(self, data_id: str, type_specifier: str = None) -> bool:
assert_given(data_id, 'data_id')
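One possible direction for the TODO above, sketched as a helper method (not part of this PR; `_read_dataset_attrs` is a hypothetical way to read a resource's metadata without fully opening it):

```python
def _attrs_for_data_id(self, data_id: str,
                       include_attrs: Container[str]) -> Dict[str, Any]:
    # Fill only the requested attributes; names this store cannot
    # resolve are ignored, possibly leaving attrs empty.
    attrs: Dict[str, Any] = {}
    if 'title' in include_attrs:
        ds_attrs = self._read_dataset_attrs(data_id)  # hypothetical helper
        if 'title' in ds_attrs:
            attrs['title'] = ds_attrs['title']
    return attrs
```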
14 changes: 9 additions & 5 deletions xcube/core/store/stores/memory.py
Expand Up @@ -20,7 +20,7 @@
# SOFTWARE.

import uuid
from typing import Iterator, Dict, Any, Optional, Tuple, Mapping
from typing import Iterator, Dict, Any, Optional, Tuple, Mapping, Union, Container

from xcube.core.store import DataDescriptor
from xcube.core.store import DataStoreError
@@ -66,17 +66,21 @@ def get_type_specifiers_for_data(self, data_id: str) -> Tuple[str, ...]:
type_specifier = get_type_specifier(self._data_dict[data_id])
return str(type_specifier),

def get_data_ids(self, type_specifier: str = None, include_titles: bool = True) -> Iterator[
Tuple[str, Optional[str]]]:
def get_data_ids(self,
type_specifier: str = None,
include_attrs: Container[str] = None) -> \
Union[Iterator[str], Iterator[Tuple[str, Dict[str, Any]]]]:
# TODO: do not ignore names in include_attrs
return_tuples = include_attrs is not None
if type_specifier is None:
for data_id, data in self._data_dict.items():
yield data_id, None
yield (data_id, {}) if return_tuples else data_id
else:
type_specifier = TypeSpecifier.normalize(type_specifier)
for data_id, data in self._data_dict.items():
data_type_specifier = get_type_specifier(data)
if data_type_specifier is None or data_type_specifier.satisfies(type_specifier):
yield data_id, None
yield (data_id, {}) if return_tuples else data_id

def has_data(self, data_id: str, type_specifier: str = None) -> bool:
assert_given(data_id, 'data_id')
14 changes: 10 additions & 4 deletions xcube/core/store/stores/s3.py
Expand Up @@ -22,7 +22,7 @@
import json
import os.path
import uuid
from typing import Optional, Iterator, Any, Tuple, List
from typing import Optional, Iterator, Any, Tuple, List, Dict, Union, Container

import s3fs
import xarray as xr
@@ -124,13 +124,19 @@ def get_type_specifiers_for_data(self, data_id: str) -> Tuple[str, ...]:
data_type_specifier, _, _ = self._get_accessor_id_parts(data_id)
return data_type_specifier,

def get_data_ids(self, type_specifier: str = None, include_titles=True) -> Iterator[Tuple[str, Optional[str]]]:
# todo do not ignore type_specifier
def get_data_ids(self,
type_specifier: str = None,
include_attrs: Container[str] = None) -> \
Union[Iterator[str], Iterator[Tuple[str, Dict[str, Any]]]]:
# TODO: do not ignore type_specifier
# TODO: do not ignore names in include_attrs
return_tuples = include_attrs is not None
prefix = self._bucket_name + '/'
first_index = len(prefix)
for item in self._s3.listdir(self._bucket_name, detail=False):
if item.startswith(prefix):
yield item[first_index:], None
data_id = item[first_index:]
yield (data_id, {}) if return_tuples else data_id

def has_data(self, data_id: str, type_specifier: str = None) -> bool:
if data_id in self._registry:
2 changes: 1 addition & 1 deletion xcube/version.py
Expand Up @@ -19,4 +19,4 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

version = '0.7.1'
version = '0.8.0.dev1'