Updated xcube server's dataset configuration extraction methodology #1048

Merged
7 changes: 7 additions & 0 deletions CHANGES.md
@@ -43,6 +43,13 @@
 * Improved the way color mapping works in xcube server to support simplified
   color bar management in xcube viewer,
   see https://github.com/xcube-dev/xcube-viewer/issues/390. (#1043)
+* The xcube server's dataset configuration extraction methodology has been
+  updated. When a data resource ID is provided in the `Path` field, xcube
+  attempts to access the dataset directly using the given ID. If the `Path`
+  contains wildcard patterns, the server crawls through the data store to
+  find matching data IDs, which may result in a long setup time if the data
+  store contains numerous data IDs. In that case, a `UserWarning` is issued
+  for the "stac" data store, which is known to hold many data IDs.


## Changes in 1.6.0

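To illustrate the two lookup modes described in the changelog entry above, here is a hypothetical dataset configuration, written as the Python equivalent of the server's YAML `DataStores` section (store and path names are invented for this sketch):

```python
# Hypothetical excerpt of an xcube server configuration; store and path
# names are illustrative, not taken from this PR.
data_stores = [
    {
        "Identifier": "my-store",
        "StoreId": "s3",
        "StoreParams": {"root": "my-bucket"},
        "Datasets": [
            # Exact data ID: the dataset is accessed directly by this ID.
            {"Path": "cube-1.zarr"},
            # Wildcard pattern: the store is crawled for matching data IDs,
            # which may be slow for stores with many IDs (e.g. "stac").
            {"Path": "chl-*.zarr"},
        ],
    }
]
```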
92 changes: 57 additions & 35 deletions xcube/webapi/datasets/context.py
@@ -50,6 +50,7 @@

 # We use tilde, because it is not a reserved URI characters
 STORE_DS_ID_SEPARATOR = "~"
+DATA_STORE_IDS_WARNING = ["stac"]
 FS_TYPE_TO_PROTOCOL = {
     "local": "file",
     "obs": "s3",
@@ -366,49 +367,48 @@ def get_dataset_configs_from_stores(
     for store_instance_id in data_store_pool.store_instance_ids:
         LOG.info(f"Scanning store {store_instance_id!r}")
         data_store_config = data_store_pool.get_store_config(store_instance_id)
-        data_store = data_store_pool.get_store(store_instance_id)
-
-        # Note by forman: This iterator chaining is inefficient.
-        # Preferably, we should offer
-        #
-        # store_dataset_ids = data_store.get_data_ids(
-        #     data_type=(DATASET_TYPE, MULTI_LEVEL_DATASET_TYPE)
-        # )
-        #
-        store_dataset_ids = itertools.chain(
-            data_store.get_data_ids(data_type=DATASET_TYPE),
-            data_store.get_data_ids(data_type=MULTI_LEVEL_DATASET_TYPE),
-        )
-        for store_dataset_id in store_dataset_ids:
-            dataset_config_base = {}
-            store_dataset_configs: list[ServerConfig] = data_store_config.user_data
-            if store_dataset_configs:
-                for store_dataset_config in store_dataset_configs:
-                    dataset_id_pattern = store_dataset_config.get("Path", "*")
-                    if fnmatch.fnmatch(store_dataset_id, dataset_id_pattern):
-                        dataset_config_base = store_dataset_config
-                        break
-                else:
-                    dataset_config_base = None
-            if dataset_config_base is not None:
-                LOG.debug(f"Selected dataset {store_dataset_id!r}")
-                dataset_config = dict(
-                    StoreInstanceId=store_instance_id, **dataset_config_base
-                )
-                if dataset_config.get("Identifier") is not None:
-                    if dataset_config["Path"] == store_dataset_id:
-                        # we will use the preconfigured identifier
-                        all_dataset_configs.append(dataset_config)
-                        continue
-                    raise ApiError.InvalidServerConfig(
-                        "User-defined identifiers can only be assigned"
-                        " to datasets with non-wildcard paths."
-                    )
-                dataset_config["Path"] = store_dataset_id
-                dataset_config["Identifier"] = (
-                    f"{store_instance_id}{STORE_DS_ID_SEPARATOR}"
-                    f"{store_dataset_id}"
-                )
-                all_dataset_configs.append(dataset_config)
+        store_dataset_configs: list[ServerConfig] = data_store_config.user_data
+        if store_dataset_configs:
+            for store_dataset_config in store_dataset_configs:
+                dataset_id_pattern = store_dataset_config.get("Path", "*")
+                if _is_wildard(dataset_id_pattern):
+                    if store_instance_id in DATA_STORE_IDS_WARNING:
+                        warnings.warn(
+                            f"The data store with ID '{store_instance_id}' has "
+                            "many data IDs. Using wildcard patterns to select "
+                            "datasets may cause a long setup time of the server."
+                        )
+                    data_store = data_store_pool.get_store(store_instance_id)
+                    store_dataset_ids = itertools.chain(
+                        data_store.get_data_ids(data_type=DATASET_TYPE),
+                        data_store.get_data_ids(data_type=MULTI_LEVEL_DATASET_TYPE),
+                    )
+                    for store_dataset_id in store_dataset_ids:
+                        if fnmatch.fnmatch(store_dataset_id, dataset_id_pattern):
+                            all_dataset_configs.append(
+                                _get_selected_dataset_config(
+                                    store_dataset_id,
+                                    store_instance_id,
+                                    store_dataset_config,
+                                )
+                            )
+                else:
+                    all_dataset_configs.append(
+                        _get_selected_dataset_config(
+                            store_dataset_config["Path"],
+                            store_instance_id,
+                            store_dataset_config,
+                        )
+                    )

     # # Just for testing:
     # debug_file = 'all_dataset_configs.json'
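Distilled from the hunk above, the new selection flow reduces to the following standalone sketch (illustrative names and data IDs; not the server's actual API):

```python
import fnmatch

def is_wildcard(pattern: str) -> bool:
    # Mirrors the _is_wildard() helper introduced by this PR.
    return "?" in pattern or "*" in pattern

def resolve_paths(configured_path: str, store_data_ids: list[str]) -> list[str]:
    """Return the data IDs selected by a single dataset configuration entry."""
    if is_wildcard(configured_path):
        # Wildcard: crawl the store's data IDs for matches (potentially slow
        # for stores with many data IDs, hence the new UserWarning).
        return [i for i in store_data_ids if fnmatch.fnmatch(i, configured_path)]
    # Exact ID: used directly, without listing the store at all.
    return [configured_path]

# Only the wildcard entry triggers a crawl:
ids = ["chl-2023.zarr", "chl-2024.zarr", "sst-2024.zarr"]
assert resolve_paths("chl-*.zarr", ids) == ["chl-2023.zarr", "chl-2024.zarr"]
assert resolve_paths("sst-2024.zarr", ids) == ["sst-2024.zarr"]
```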
@@ -865,6 +865,24 @@ def _get_common_prefixes(p):
     )


+def _get_selected_dataset_config(
+    store_dataset_id: str, store_instance_id: str, dataset_config_base: dict
+) -> dict:
+    LOG.debug(f"Selected dataset {store_dataset_id!r}")
+    dataset_config = dict(StoreInstanceId=store_instance_id, **dataset_config_base)
+    if "Identifier" in dataset_config and dataset_config["Path"] != store_dataset_id:
+        raise ApiError.InvalidServerConfig(
+            "User-defined identifiers can only be assigned"
+            " to datasets with non-wildcard paths."
+        )
+    elif "Identifier" not in dataset_config:
+        dataset_config["Path"] = store_dataset_id
+        dataset_config["Identifier"] = (
+            f"{store_instance_id}{STORE_DS_ID_SEPARATOR}{store_dataset_id}"
+        )
+    return dataset_config


 def _lastindex(prefix, symbol):
     try:
         return prefix.rindex(symbol)
@@ -875,3 +893,7 @@ def _lastindex(prefix, symbol):
 _MULTI_LEVEL_DATASET_OPENERS = {
     "memory": _open_ml_dataset_from_python_code,
 }
+
+
+def _is_wildard(string: str) -> bool:
+    return "?" in string or "*" in string