Updated xcube server's dataset configuration extraction methodology #1048

Merged
7 changes: 7 additions & 0 deletions CHANGES.md
@@ -43,6 +43,13 @@
 * Improved the way color mapping works in xcube server to support simplified
   color bar management in xcube viewer,
   see https://github.com/xcube-dev/xcube-viewer/issues/390. (#1043)
+* The xcube server's dataset configuration extraction methodology has been
+  updated. When a data resource ID is provided in the `Path` field, xcube
+  attempts to access the dataset directly using the given ID. If the `Path`
+  contains wildcard patterns, the server crawls through the data store to
+  find matching data IDs, which may result in a long setup time if the data
+  store contains numerous data IDs. In that case, a `UserWarning` is issued
+  for the "stac" data store, which is known to hold many data IDs.


## Changes in 1.6.0

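To illustrate the two lookup modes described in the changelog entry above, here is a hypothetical dataset configuration, written as the Python equivalent of the server's YAML `DataStores` section (store and path names are invented for this sketch):

```python
# Hypothetical excerpt of an xcube server configuration; store and path
# names are illustrative, not taken from this PR.
data_stores = [
    {
        "Identifier": "my-store",
        "StoreId": "s3",
        "StoreParams": {"root": "my-bucket"},
        "Datasets": [
            # Exact data ID: the dataset is accessed directly by this ID.
            {"Path": "cube-1.zarr"},
            # Wildcard pattern: the store is crawled for matching data IDs,
            # which may be slow for stores with many IDs (e.g. "stac").
            {"Path": "chl-*.zarr"},
        ],
    }
]
```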
92 changes: 57 additions & 35 deletions xcube/webapi/datasets/context.py
@@ -50,6 +50,7 @@

 # We use tilde, because it is not a reserved URI characters
 STORE_DS_ID_SEPARATOR = "~"
+DATA_STORE_IDS_WARNING = ["stac"]
 FS_TYPE_TO_PROTOCOL = {
     "local": "file",
     "obs": "s3",
@@ -366,49 +367,48 @@ def get_dataset_configs_from_stores(
     for store_instance_id in data_store_pool.store_instance_ids:
         LOG.info(f"Scanning store {store_instance_id!r}")
         data_store_config = data_store_pool.get_store_config(store_instance_id)
-        data_store = data_store_pool.get_store(store_instance_id)
-
-        # Note by forman: This iterator chaining is inefficient.
-        # Preferably, we should offer
-        #
-        # store_dataset_ids = data_store.get_data_ids(
-        #     data_type=(DATASET_TYPE, MULTI_LEVEL_DATASET_TYPE)
-        # )
-        #
-        store_dataset_ids = itertools.chain(
-            data_store.get_data_ids(data_type=DATASET_TYPE),
-            data_store.get_data_ids(data_type=MULTI_LEVEL_DATASET_TYPE),
-        )
-        for store_dataset_id in store_dataset_ids:
-            dataset_config_base = {}
-            store_dataset_configs: list[ServerConfig] = data_store_config.user_data
-            if store_dataset_configs:
-                for store_dataset_config in store_dataset_configs:
-                    dataset_id_pattern = store_dataset_config.get("Path", "*")
-                    if fnmatch.fnmatch(store_dataset_id, dataset_id_pattern):
-                        dataset_config_base = store_dataset_config
-                        break
-                else:
-                    dataset_config_base = None
-            if dataset_config_base is not None:
-                LOG.debug(f"Selected dataset {store_dataset_id!r}")
-                dataset_config = dict(
-                    StoreInstanceId=store_instance_id, **dataset_config_base
-                )
-                if dataset_config.get("Identifier") is not None:
-                    if dataset_config["Path"] == store_dataset_id:
-                        # we will use the preconfigured identifier
-                        all_dataset_configs.append(dataset_config)
-                        continue
-                    raise ApiError.InvalidServerConfig(
-                        "User-defined identifiers can only be assigned"
-                        " to datasets with non-wildcard paths."
-                    )
-                dataset_config["Path"] = store_dataset_id
-                dataset_config["Identifier"] = (
-                    f"{store_instance_id}{STORE_DS_ID_SEPARATOR}"
-                    f"{store_dataset_id}"
-                )
-                all_dataset_configs.append(dataset_config)
+        store_dataset_configs: list[ServerConfig] = data_store_config.user_data
+        if store_dataset_configs:
+            for store_dataset_config in store_dataset_configs:
+                dataset_id_pattern = store_dataset_config.get("Path", "*")
+                if _is_wildard(dataset_id_pattern):
+                    if store_instance_id in DATA_STORE_IDS_WARNING:
+                        warnings.warn(
+                            f"The data store with ID '{store_instance_id}' has "
+                            "many data IDs. Using wildcard patterns to select "
+                            "datasets may cause a long setup time of the server."
+                        )
+                    data_store = data_store_pool.get_store(store_instance_id)
+                    store_dataset_ids = itertools.chain(
+                        data_store.get_data_ids(data_type=DATASET_TYPE),
+                        data_store.get_data_ids(data_type=MULTI_LEVEL_DATASET_TYPE),
+                    )
+                    for store_dataset_id in store_dataset_ids:
+                        if fnmatch.fnmatch(store_dataset_id, dataset_id_pattern):
+                            all_dataset_configs.append(
+                                _get_selected_dataset_config(
+                                    store_dataset_id,
+                                    store_instance_id,
+                                    store_dataset_config,
+                                )
+                            )
+                else:
+                    all_dataset_configs.append(
+                        _get_selected_dataset_config(
+                            store_dataset_config["Path"],
+                            store_instance_id,
+                            store_dataset_config,
+                        )
+                    )

     # # Just for testing:
     # debug_file = 'all_dataset_configs.json'
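Distilled from the hunk above, the new selection flow reduces to the following standalone sketch (illustrative names and data IDs; not the server's actual API):

```python
import fnmatch

def is_wildcard(pattern: str) -> bool:
    # Mirrors the _is_wildard() helper introduced by this PR.
    return "?" in pattern or "*" in pattern

def resolve_paths(configured_path: str, store_data_ids: list[str]) -> list[str]:
    """Return the data IDs selected by a single dataset configuration entry."""
    if is_wildcard(configured_path):
        # Wildcard: crawl the store's data IDs for matches (potentially slow
        # for stores with many data IDs, hence the new UserWarning).
        return [i for i in store_data_ids if fnmatch.fnmatch(i, configured_path)]
    # Exact ID: used directly, without listing the store at all.
    return [configured_path]

# Only the wildcard entry triggers a crawl:
ids = ["chl-2023.zarr", "chl-2024.zarr", "sst-2024.zarr"]
assert resolve_paths("chl-*.zarr", ids) == ["chl-2023.zarr", "chl-2024.zarr"]
assert resolve_paths("sst-2024.zarr", ids) == ["sst-2024.zarr"]
```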
@@ -865,6 +865,24 @@ def _get_common_prefixes(p):
     )


+def _get_selected_dataset_config(
+    store_dataset_id: str, store_instance_id: str, dataset_config_base: dict
+) -> dict:
+    LOG.debug(f"Selected dataset {store_dataset_id!r}")
+    dataset_config = dict(StoreInstanceId=store_instance_id, **dataset_config_base)
+    if "Identifier" in dataset_config and dataset_config["Path"] != store_dataset_id:
+        raise ApiError.InvalidServerConfig(
+            "User-defined identifiers can only be assigned"
+            " to datasets with non-wildcard paths."
+        )
+    elif "Identifier" not in dataset_config:
+        dataset_config["Path"] = store_dataset_id
+        dataset_config["Identifier"] = (
+            f"{store_instance_id}{STORE_DS_ID_SEPARATOR}{store_dataset_id}"
+        )
+    return dataset_config


 def _lastindex(prefix, symbol):
     try:
         return prefix.rindex(symbol)
@@ -875,3 +893,7 @@ def _lastindex(prefix, symbol):
 _MULTI_LEVEL_DATASET_OPENERS = {
     "memory": _open_ml_dataset_from_python_code,
 }
+
+
+def _is_wildard(string: str) -> bool:
+    return "?" in string or "*" in string