diff --git a/docs/examples/grib_metadata_object.ipynb b/docs/examples/grib_metadata_object.ipynb index ab2e33d8..5b354924 100644 --- a/docs/examples/grib_metadata_object.ipynb +++ b/docs/examples/grib_metadata_object.ipynb @@ -26,7 +26,7 @@ "tags": [] }, "source": [ - "We will work with a GRIB file containing 6 messages. First we ensure the example file is available, then read the file with :ref:`from_source() `." + "In this notebook we will work with a GRIB file containing 6 messages. First we ensure the example file is available, then read the file with :ref:`from_source() `." ] }, { @@ -137,7 +137,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 3, @@ -246,21 +246,84 @@ }, "tags": [] }, + "outputs": [], + "source": [ + "md_copy = md.override()" + ] + }, + { + "cell_type": "raw", + "id": "1c511db3-f841-4936-a7cb-25dd3e3a1010", + "metadata": { + "editable": true, + "raw_mimetype": "text/restructuredtext", + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "By default :py:meth:`~data.readers.grib.metadata.GribMetadata.override` is called with the ``headers_only_clone=True`` option to clone a new GRIB handle with all the data values (and some related information) removed. With this the resulting object can be significantly smaller, especially if the data section is large. The downside is that now the value related keys either cannot be accessed or give back wrong values. 
E.g when using the \"average\" key we get:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "c9ab97f0-40b7-403e-b7c9-6ce16c894d7d", + "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "(279.70703560965404, 47485.4296875)" ] }, - "execution_count": 5, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "md_copy = md.override()\n", - "md_copy" + "md[\"average\"], md_copy[\"average\"]" + ] + }, + { + "cell_type": "markdown", + "id": "fe340c05-ac04-4f7d-8c43-3ed1f415a979", + "metadata": {}, + "source": [ + "To get a copy without shrinking the GRIB handle use ``headers_only_clone=False``." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "a789bb5e-b493-45cd-b7ce-e5cb8b34e125", + "metadata": {}, + "outputs": [], + "source": [ + "md_copy_full = md.override(headers_only_clone=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "3b030549-2a7e-494e-bd46-0a4cd19a09ad", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "279.70703560965404" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "md_copy_full[\"average\"]" ] }, { @@ -294,7 +357,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 9, "id": "40c6d232-03de-402b-82bf-8647e8a7bece", "metadata": { "editable": true, @@ -310,7 +373,7 @@ "('z', 850)" ] }, - "execution_count": 6, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -336,7 +399,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 10, "id": "ef78a3ec-4ea2-4ff5-8c90-e60b5e07e77f", "metadata": { "editable": true, @@ -352,7 +415,7 @@ "('t', 1000)" ] }, - "execution_count": 7, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -386,7 +449,7 @@ "tags": [] }, "source": [ - "GRIB metadata objects play a part in building new fieldlist from (altered) values and metadata." 
+ "GRIB metadata objects play a part in building new fieldlists from (altered) values and metadata." ] }, { @@ -405,7 +468,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 11, "id": "cb59ad5f-c48b-4943-984d-3abdf48fda8d", "metadata": { "editable": true, @@ -427,7 +490,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 12, "id": "85c32bfb-c929-404f-add9-9adae40418d2", "metadata": { "editable": true, @@ -496,7 +559,7 @@ "0 an 0 regular_ll " ] }, - "execution_count": 9, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -520,12 +583,12 @@ } }, "source": [ - "Please note that the resulting :py:class:`~data.sources.array_list.ArrayFieldList` always contains a :py:class:`~data.readers.grib.metadata.RestrictedGribMetadata` object for each field. These objects possess their own GRIB handles, which is ensured by creating a copy with ``override()`` when needed. On top of that metadata access is limited to keys not related to data values. Getting metadata on any other keys will throw an exception. " + "The resulting fieldlist contains an :py:class:`~data.sources.array_list.ArrayField`, which is composed of a numpy array storing the values and a metadata object owning its own GRIB handle with a trimmed down data section. Since the values array is decoupled from the GRIB handle stored in the metadata object, accessing metadata keys related to the data values is forbidden. Getting metadata on these keys will throw an exception. 
" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 13, "id": "c6fe87ed-ee88-4f4d-a2b6-9401b364e2df", "metadata": { "editable": true, @@ -538,10 +601,10 @@ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 10, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -552,7 +615,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 14, "id": "27686ac4-9382-4916-ad0e-be96a649d034", "metadata": { "editable": true, @@ -568,7 +631,7 @@ "'Wind speed'" ] }, - "execution_count": 11, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -579,7 +642,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 15, "id": "dc28fa77-4020-431f-ad37-e480a69f9d7f", "metadata": { "editable": true, @@ -620,7 +683,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 16, "id": "8eab3462-3661-4fc1-9d23-8be05dc99cd8", "metadata": { "editable": true, @@ -636,7 +699,7 @@ "7.450183054360252" ] }, - "execution_count": 13, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } diff --git a/src/earthkit/data/core/metadata.py b/src/earthkit/data/core/metadata.py index 2785e0c2..b71e387a 100644 --- a/src/earthkit/data/core/metadata.py +++ b/src/earthkit/data/core/metadata.py @@ -360,10 +360,52 @@ def _hide_internal_keys(self): class WrappedMetadata: - def __init__(self, metadata, extra=None, hidden=None, owner=None, merge=True): + r"""Wrapper around a Metadata object to add extra metadata entries and hide some keys. + + Parameters + ---------- + metadata: Metadata + The metadata object to wrap. + extra: dict-like, optional + Additional metadata entries, which are not part of ``metadata``. + Methods like `:obj:get` first search for entries in `extra` before + querying ``metadata``. The ``extra`` entries can be callables that + take the following arguments: owner, key, metadata (where metadata + is the wrapped metadata object). 
+ hidden: list of str, None, optional + Metadata keys to hide from ``metadata``. + hidden_namespaces: list of str, None, optional + Namespaces to hide from ``metadata``. + enforced_namespaces: list of str, None, optional + Keys in these namespaces cannot be hidden. + owner: object, optional + The owner. + merge: bool, optional + Used when True and ``metadata`` is a :class:`WrappedMetadata`. In this case + merges ``extra``, ``hidden``, ``hidden_namespaces``, and ``enforced_namespaces`` + from the original metadata and also replaces it with the object the original + metadata wrapped. + + Raises + ------ + ValueError: If a key in ``hidden`` is also in ``extra``. + """ + + def __init__( + self, + metadata, + extra=None, + hidden=None, + hidden_namespaces=None, + enforced_namespaces=None, + owner=None, + merge=True, + ): self.metadata = metadata self.extra = extra if extra is not None else dict() self.hidden = hidden if hidden is not None else [] + self.hidden_namespaces = hidden_namespaces + self.enforced_namespaces = enforced_namespaces self.owner = owner for k in self.hidden: @@ -372,12 +414,7 @@ def __init__(self, metadata, extra=None, hidden=None, owner=None, merge=True): if merge and isinstance(metadata, WrappedMetadata): self.metadata = metadata.metadata - v = dict(**metadata.extra) - v.update(self.extra) - self.extra = v - for x in metadata.hidden: - if x not in self.hidden: - self.hidden.append(x) + self._update(metadata) def __len__(self): r"""Return the number of metadata entries.""" @@ -397,7 +434,16 @@ def __contains__(self, key): return key in self.metadata def _is_hidden(self, key): - return key in self.hidden + name = key + if "." in key: + ns, _, name = key.partition(".") + if name == "": + name = key + ns = "" + if ns and self.enforced_namespaces and ns in self.enforced_namespaces: + return False + + return name in self.hidden def keys(self): r"""Return the metadata keys. 
@@ -455,22 +501,81 @@ def _extra_value(self, key): v = v(self.owner, key, self.metadata) return v + def namespaces(self): + if self.hidden_namespaces: + return [x for x in self.metadata.namespaces() if x not in self.hidden_namespaces] + else: + return self.metadata.namespaces() + def as_namespace(self, namespace): + print("as_namespace namespace=", namespace) + if self.hidden_namespaces and namespace in self.hidden_namespaces: + return {} + r = dict() if namespace is None: r = dict(self.items()) - for k, v in self.extra.items(): + for k, _ in self.extra.items(): if k in r: r[k] = self._extra_value(k) else: r = self.metadata.as_namespace(namespace) + for k in list(r.keys()): + if k in self.hidden: + del r[k] # TODO: add filtering based on extra return r + def dump(self, namespace=all, **kwargs): + if namespace is all: + namespace = self.namespaces() + return self.metadata.dump(namespace=namespace, **kwargs) + def override(self, *args, **kwargs): md = self.metadata.override(*args, **kwargs) - return self.__class__(md, self.extra, hidden=self.hidden, merge=True) + if self.metadata is md: + return self + return self._clone(md) + + def _hide_internal_keys(self): + if self.hidden: + return self + + md = self.metadata._hide_internal_keys() + if self.metadata is md: + return self + elif isinstance(md, WrappedMetadata) and md.metadata is self.metadata: + md._update(self) + return md + else: + return self._clone(md) + + @staticmethod + def merge_list(v1, v2): + if v1 and v2: + if v2 is None: + v2 = list(v1) + return v2 + r = [x for x in v1 if x not in v2] + if r: + v2 = list(v2) + v2.extend(r) + + def _update(self, other): + assert isinstance(other, WrappedMetadata) + v = dict(**other.extra) + # self.extra.update(other.extra) + v.update(self.extra) + self.extra = v + self.merge_list(other.hidden, self.hidden) + self.merge_list(other.hidden_namespaces, self.hidden_namespaces) + self.merge_list(other.enforced_namespaces, self.enforced_namespaces) + + def _clone(self, 
metadata): + r = self.__class__(metadata) + r._update(self) + return r def __getitem__(self, key): return self.get(key, raise_on_missing=True) @@ -478,6 +583,22 @@ def __getitem__(self, key): def __getattr__(self, name): return getattr(self.metadata, name) + def __getstate__(self) -> dict: + ret = {} + ret["metadata"] = self.metadata + ret["extra"] = self.extra + ret["hidden"] = self.hidden + ret["hidden_namespaces"] = self.hidden_namespaces + ret["enforced_namespaces"] = self.enforced_namespaces + return ret + + def __setstate__(self, state: dict): + self.metadata = state.pop("metadata") + self.extra = state.pop("extra") + self.hidden = state.pop("hidden") + self.hidden_namespaces = state.pop("hidden_namespaces") + self.enforced_namespaces = state.pop("enforced_namespaces") + class RawMetadata(Metadata): r"""Metadata implementation based on key/value pairs. diff --git a/src/earthkit/data/readers/grib/metadata.py b/src/earthkit/data/readers/grib/metadata.py index 6a20dc1f..a5d5ad1c 100644 --- a/src/earthkit/data/readers/grib/metadata.py +++ b/src/earthkit/data/readers/grib/metadata.py @@ -7,6 +7,7 @@ # nor does it submit to any jurisdiction. # +import logging import warnings from abc import abstractmethod from functools import cached_property @@ -22,6 +23,8 @@ from earthkit.data.utils.dates import datetime_from_grib from earthkit.data.utils.dates import to_timedelta +LOG = logging.getLogger(__name__) + def missing_is_none(x): return None if x == 2147483647 else x @@ -371,6 +374,10 @@ def _key_name(key): if not raise_on_missing: _kwargs["default"] = default + # allow using the "grib." prefix. 
+ if key.startswith("grib."): + key = key[5:] + key = _key_name(key) v = self._handle.get(key, ktype=astype, **_kwargs) @@ -387,14 +394,37 @@ def _copy_key(self, target_handle, key): target_handle.set_long(key, v_ori) def override(self, *args, headers_only_clone=True, **kwargs): - d = dict(*args, **kwargs) + r"""Create a new metadata object by cloning a new GRIB handle and setting the keys in it. + + Parameters + ---------- + *args: tuple + Positional arguments. When present must be a dict with the GRIB keys to set in + the new GRIB handle. + headers_only_clone: bool, optional + If True, the new GRIB handle will be created with headers_only=True to reduce the + data section. With this the GRIB handle size will be significantly smaller, but the + data section becomes unusable. Default is True. + **kwargs: dict, optional + Other keyword arguments specifying the GRIB keys to set. + + Returns + ------- + :class:`WrappedMetadata` + The new metadata object. There is always a :class:`StandAloneGribMetadata` object + created containing the new GRIB handle updated with the specified keys. + It is then wrapped in a :class:`WrappedMetadata` object storing ``"bitsPerValue"`` + as an extra key. - # using headers_only_clone=True can cause problems when we want to write GRIB - # to disk or modify the generated handle. Until it is fixed, we use headers_only_clone=False. - headers_only_clone = False + + Notes + ----- + - When ``"bitsPerValue"`` is a key to set it is not written to the new handle. Instead, it + is stored as an extra key in the resulting :class:`WrappedMetadata` object. 
+ """ + d = dict(*args, **kwargs) new_value_size = None - # extra = None gridspec = d.pop("gridspec", None) if gridspec is not None: from earthkit.data.readers.grib.gridspec import GridSpecConverter @@ -405,13 +435,58 @@ def override(self, *args, headers_only_clone=True, **kwargs): handle = self._handle.clone(headers_only=headers_only_clone) - # some keys, needed later, are not copied into the clone when - # headers_only=True. We store them as extra keys. - if "bitsPerValue" not in d: - self._copy_key(handle, "bitsPerValue") + extra = {} + + # For the steps below consider the following: + # - we cannot reliably determine whether the original handle is reduced or not + # - "bitsPerValue" needs special treatment, because it cannot be set without + # repacking the data. + # - we want to carry "bitsPerValue" over to the clone if possible + + # When headers_only=True, "bitsPerValue" in the clone is unreliable. Since we need to + # carry "bitsPerValue" over ideally we should copy it into the clone but we + # cannot do it since we just trimmed down the data section, so a proper repacking + # is not possible. As a solution, we will generate a WrappedMetadata object and + # store the original "bitsPerValue" in the extra dict. + # When headers_only=False, we do not know whether the original handle was trimmed down + # or not. Therefore, instead of applying complicated logic we follow the same + # approach as for headers_only=True. 
+ key = "bitsPerValue" + if key in d: + extra[key] = d.pop(key) + else: + # we get the value from the original metadata object and not from the handle since + # the handle can already be trimmed down + v = self.get(key, default=None) + if v is not None and v > 0: + extra[key] = v + # as a fallback we try to get the value from the clone + else: + v_clone = handle.get(key, None) + if v_clone is not None and v_clone > 0: + extra[key] = v_clone if d: - handle.set_multiple(d) + single = {} + multiple = {} + for k, v in d.items(): + if isinstance(v, (int, float, str, bool)): + single[k] = v + else: + multiple[k] = v + + try: + # Try to set all metadata at once + # This is needed when we set multiple keys that are interdependent + handle.set_multiple(single) + except Exception as e: + LOG.error("Failed to set metadata at once: %s", e) + # Try again, but one by one + for k, v in single.items(): + handle.set(k, v) + + for k, v in multiple.items(): + handle.set(k, v) # we need to set the values to the new size otherwise the clone generated # with headers_only=True will be inconsistent @@ -422,7 +497,15 @@ def override(self, *args, headers_only_clone=True, **kwargs): handle.set_values(vals) # ensure that the cache settings are the same - return StandAloneGribMetadata(handle, cache=MetadataCacheHandler.clone_empty(self._cache)) + r = StandAloneGribMetadata( + handle, + cache=MetadataCacheHandler.clone_empty(self._cache), + ) + + if extra: + r = WrappedMetadata(r, extra=extra) + + return r def namespaces(self): return self.NAMESPACES @@ -562,6 +645,9 @@ def data_format(self): def gridspec(self): return self.geography.gridspec() + def _make_restricted(self, r): + return RestrictedGribMetadata(self) + class GribFieldMetadata(GribMetadata): """Represent the metadata of a GRIB field. 
@@ -643,7 +729,7 @@ class RestrictedGribMetadata(WrappedMetadata): :ref:`/examples/grib_metadata_object.ipynb` """ - EKD_NAMESPACE = "grib" + EKD_NAMESPACE = ["grib"] # ideally bitsPerValue should be here. However, it is treated as an # extra key and cannot be an internal key. @@ -677,65 +763,20 @@ class RestrictedGribMetadata(WrappedMetadata): ] INTERNAL_NAMESPACES = ["statistics"] - def __init__(self, md): - assert isinstance(md, StandAloneGribMetadata) - super().__init__(md, hidden=self.INTERNAL_KEYS) - - def _is_hidden(self, key): - ns, _, name = key.partition(".") - if name == "": - name = key - ns = "" - - if ns == self.EKD_NAMESPACE: - return False - else: - return name in self.hidden - - def get(self, key, default=None, *, astype=None, raise_on_missing=False): - ns, _, name = key.partition(".") - if name == "": - name = key - ns = "" - - if ns == self.EKD_NAMESPACE: - key = name - - return super().get(key, default=default, astype=astype, raise_on_missing=raise_on_missing) - - def namespaces(self): - if self.INTERNAL_NAMESPACES: - return [x for x in self.metadata.namespaces() if x not in self.INTERNAL_NAMESPACES] - else: - return self.metadata.namespaces() - - def as_namespace(self, namespace): - if namespace in self.INTERNAL_NAMESPACES: - return {} - - r = self.metadata.as_namespace(namespace) - for k in list(r.keys()): - if k in self.INTERNAL_KEYS: - del r[k] - return r - - def dump(self, namespace=all, **kwargs): - if namespace is all: - namespace = self.namespaces() - return self.metadata.dump(namespace=namespace, **kwargs) - - def override(self, *args, **kwargs): - r = self.metadata.override(*args, **kwargs) - return RestrictedGribMetadata(r) + def __init__(self, metadata): + super().__init__( + metadata, + hidden=self.INTERNAL_KEYS, + hidden_namespaces=self.INTERNAL_NAMESPACES, + enforced_namespaces=self.EKD_NAMESPACE, + ) def _hide_internal_keys(self): return self def __getstate__(self) -> dict: - ret = {} - ret["metadata"] = self.metadata - return 
ret + state = super().__getstate__() + return state def __setstate__(self, state: dict): - md = state.pop("metadata") - super().__init__(md, hidden=self.INTERNAL_KEYS) + super().__setstate__(state) diff --git a/src/earthkit/data/readers/grib/output.py b/src/earthkit/data/readers/grib/output.py index 07ca195a..429dda71 100644 --- a/src/earthkit/data/readers/grib/output.py +++ b/src/earthkit/data/readers/grib/output.py @@ -44,7 +44,6 @@ def __getitem__(self, key): class GribCoder: def __init__(self, template=None, **kwargs): - self.template = template self._bbox = {} self.kwargs = kwargs @@ -80,6 +79,7 @@ def encode( handle = self.handle_from_metadata(values, metadata, compulsory) else: handle = template.handle.clone() + self.update_metadata_from_template(metadata, template, handle) self.update_metadata(handle, metadata, compulsory) @@ -331,6 +331,35 @@ def _gg_field(self, values, metadata): else: return f"reduced_gg_{levtype}_{N}_grib{edition}" + def update_metadata_from_template(self, metadata, template, handle): + # the template can contain extra metadata that is not encoded in the handle + bpv = None + if hasattr(template, "metadata"): + template_md = template.metadata() + from earthkit.data.core.metadata import WrappedMetadata + + if isinstance(template_md, WrappedMetadata): + for k in template_md.extra.keys(): + if k != "bitsPerValue" and k not in metadata: + metadata[k] = template_md.get(k) + + if "bitsPerValue" not in metadata: + bpv = template.metadata("bitsPerValue", default=None) + + # Either the handle has valid bitsPerValue or has to be extracted + # from the template and added to the metadata to be encoded + if "bitsPerValue" not in metadata: + if bpv is None: + try: + bpv = template.handle.get("bitsPerValue", None) + except Exception: + bpv = None + + if bpv is not None and bpv > 0: + bpv_h = handle.get("bitsPerValue", None) + if bpv != bpv_h: + metadata["bitsPerValue"] = bpv + @lru_cache(maxsize=None) def _gg_pl(N): diff --git 
a/src/earthkit/data/utils/metadata/args.py b/src/earthkit/data/utils/metadata/args.py index a044385d..060098d9 100644 --- a/src/earthkit/data/utils/metadata/args.py +++ b/src/earthkit/data/utils/metadata/args.py @@ -48,5 +48,7 @@ def metadata_argument(*args, namespace=None, astype=None): namespace = [] elif isinstance(namespace, str) or namespace == all: namespace = [namespace] + elif namespace == [""]: + namespace = [] return (key, namespace, astype, key_arg_type) diff --git a/src/earthkit/data/utils/xarray/builder.py b/src/earthkit/data/utils/xarray/builder.py index 69088635..7f844bec 100644 --- a/src/earthkit/data/utils/xarray/builder.py +++ b/src/earthkit/data/utils/xarray/builder.py @@ -67,8 +67,10 @@ def __init__( def build(self, add_earthkit_attrs=True): if add_earthkit_attrs: + md = self.tensor.source[0].metadata().override() attrs = { - "message": self.tensor.source[0].metadata().override()._handle.get_buffer(), + "message": md._handle.get_buffer(), + "bitsPerValue": md.get("bitsPerValue", 0), } self._attrs["_earthkit"] = attrs diff --git a/src/earthkit/data/utils/xarray/engine.py b/src/earthkit/data/utils/xarray/engine.py index d60e5e5c..86a9f8ec 100644 --- a/src/earthkit/data/utils/xarray/engine.py +++ b/src/earthkit/data/utils/xarray/engine.py @@ -337,9 +337,15 @@ def metadata(self): data = md["message"] from earthkit.data.readers.grib.memory import GribMessageMemoryReader from earthkit.data.readers.grib.metadata import StandAloneGribMetadata + from earthkit.data.readers.grib.metadata import WrappedMetadata handle = next(GribMessageMemoryReader(data)).handle - return StandAloneGribMetadata(handle) + bpv = md.get("bitsPerValue", 0) + res_md = StandAloneGribMetadata(handle) + if bpv is not None and bpv > 0: + return WrappedMetadata(res_md, extra={"bitsPerValue": bpv}) + else: + return res_md raise ValueError( ( diff --git a/src/earthkit/data/writers/grib.py b/src/earthkit/data/writers/grib.py index e9d90858..e4410972 100644 --- 
a/src/earthkit/data/writers/grib.py +++ b/src/earthkit/data/writers/grib.py @@ -36,9 +36,6 @@ def write(self, f, field, values=None, check_nans=True, bits_per_value=None): output = new_grib_output(f, template=field) md = {} - # wrapped metadata - if hasattr(field._metadata, "extra"): - md = {k: field._metadata._extra_value(k) for k, v in field._metadata.extra.items()} if bits_per_value is not None: if field._metadata.get("bitsPerValue", 0) != bits_per_value: @@ -49,11 +46,15 @@ def write(self, f, field, values=None, check_nans=True, bits_per_value=None): md["generatingProcessIdentifier"] = None if values is None: - try: - if field._has_new_values(): - values = field.values - except AttributeError: - pass + # when bitsPerValue is set, we need to repack the values + if "bitsPerValue" in md: + values = field.values + else: + try: + if field._has_new_values(): + values = field.values + except AttributeError: + pass output.write(values, check_nans=check_nans, missing_value=field.handle.MISSING_VALUE - 1, **md) diff --git a/tests/array_fieldlist/array_fl_fixtures.py b/tests/array_fieldlist/array_fl_fixtures.py index d275286e..2aab7a79 100644 --- a/tests/array_fieldlist/array_fl_fixtures.py +++ b/tests/array_fieldlist/array_fl_fixtures.py @@ -32,7 +32,7 @@ def load_array_fl(num, array_backend=None): ds = [] for x in ds_in: - print("len", len(x)) + # print("len", len(x)) ds.append(FieldList.from_array(x.values, [m.override(edition=1) for m in x.metadata()])) return (*ds, md) diff --git a/tests/core/test_metadata.py b/tests/core/test_metadata.py index 44f67351..1e115c4b 100644 --- a/tests/core/test_metadata.py +++ b/tests/core/test_metadata.py @@ -15,7 +15,6 @@ from earthkit.data import from_source from earthkit.data.core.metadata import RawMetadata from earthkit.data.readers.grib.metadata import GribFieldMetadata -from earthkit.data.readers.grib.metadata import RestrictedGribMetadata from earthkit.data.readers.grib.metadata import StandAloneGribMetadata from 
earthkit.data.testing import earthkit_examples_file from earthkit.data.testing import earthkit_test_data_file @@ -263,47 +262,6 @@ def test_grib_metadata_override_invalid(): assert "EncodingError" in e.typename -@pytest.mark.skipif(True, reason="headers_only_clone has to be fixed") -def test_grib_metadata_override_headers_only_true(): - ds = from_source("file", earthkit_examples_file("test.grib")) - ref_size = ds[0].metadata("totalLength") - - md1 = ds[0].metadata().override(headers_only_clone=True) - assert isinstance(md1, StandAloneGribMetadata) - assert md1._handle is not None - assert md1._handle != ds[0]._handle - assert md1["totalLength"] - ref_size < -10 - - md2 = md1._hide_internal_keys() - assert isinstance(md2, RestrictedGribMetadata) - assert md2._handle is not None - assert md2._handle != ds[0]._handle - assert md2._handle == md1._handle - - with pytest.raises(KeyError): - md2["average"] - - -def test_grib_metadata_override_headers_only_false(): - ds = from_source("file", earthkit_examples_file("test.grib")) - ref_size = ds[0].metadata("totalLength") - - md1 = ds[0].metadata().override(headers_only_clone=False) - assert isinstance(md1, StandAloneGribMetadata) - assert md1._handle is not None - assert md1._handle != ds[0]._handle - assert np.isclose(md1["totalLength"], ref_size) - - md2 = md1._hide_internal_keys() - assert isinstance(md2, RestrictedGribMetadata) - assert md2._handle is not None - assert md2._handle != ds[0]._handle - assert md2._handle == md1._handle - - with pytest.raises(KeyError): - md2["average"] - - def test_grib_metadata_wrapped_core(): ds = from_source("file", earthkit_examples_file("test.grib")) md = ds[0].metadata() diff --git a/tests/data/xr_engine/xr_grid.yaml b/tests/data/xr_engine/xr_grid.yaml index cd07257b..ca284920 100644 --- a/tests/data/xr_engine/xr_grid.yaml +++ b/tests/data/xr_engine/xr_grid.yaml @@ -89,6 +89,17 @@ - values - 6114 distinct_ll: false +- file: reduced_gg_N32.grib2 + dims: + values: 6114 + coords: + 
latitude: + - values + - 6114 + longitude: + - values + - 6114 + distinct_ll: false - file: reduced_gg_O32.grib1 dims: values: 5248 @@ -100,6 +111,17 @@ - values - 5248 distinct_ll: false +- file: reduced_gg_O32.grib2 + dims: + values: 5248 + coords: + latitude: + - values + - 5248 + longitude: + - values + - 5248 + distinct_ll: false - file: regular_gg_F16.grib1 dims: latitude: 32 diff --git a/tests/grib/test_grib_headers_only.py b/tests/grib/test_grib_headers_only.py new file mode 100644 index 00000000..018f50d3 --- /dev/null +++ b/tests/grib/test_grib_headers_only.py @@ -0,0 +1,285 @@ +#!/usr/bin/env python3 + +# (C) Copyright 2020 ECMWF. +# +# This software is licensed under the terms of the Apache Licence Version 2.0 +# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +# In applying this licence, ECMWF does not waive the privileges and immunities +# granted to it by virtue of its status as an intergovernmental organisation +# nor does it submit to any jurisdiction. 
+# + +import os + +import numpy as np +import pytest +import yaml + +from earthkit.data import from_source +from earthkit.data.core.temporary import temp_file +from earthkit.data.readers.grib.metadata import RestrictedGribMetadata +from earthkit.data.readers.grib.metadata import StandAloneGribMetadata +from earthkit.data.readers.grib.metadata import WrappedMetadata +from earthkit.data.readers.grib.output import new_grib_output +from earthkit.data.testing import earthkit_examples_file +from earthkit.data.testing import earthkit_remote_test_data_file +from earthkit.data.testing import earthkit_test_data_file +from earthkit.data.utils import ensure_iterable + + +def to_tuple(x): + return tuple(ensure_iterable(x)) + + +def grid_list(files=None, skip=None): + with open(earthkit_test_data_file(os.path.join("xr_engine", "xr_grid.yaml")), "r") as f: + r = yaml.safe_load(f) + + files = [] if files is None else files + skip = [] if skip is None else skip + for item in r: + if not files or item["file"] in files and item["file"] not in skip: + yield item["file"] + + +def check_field_write(f, md_ref, shape_ref, values_ref, use_writer=False, **kwargs): + with temp_file() as tmp: + if use_writer: + fp = new_grib_output(tmp, template=f, **kwargs) + fp.write(f.values) + fp.close() + else: + f.save(tmp, **kwargs) + assert os.path.exists(tmp) + r_tmp = from_source("file", tmp) + assert len(r_tmp) == 1 + f = r_tmp[0] + md = {k: f.metadata(k) for k in md_ref} + md_ref_tmp = dict(**md_ref) + + # for single point data, the original bitsPerValue is 12. In the cloned handle it becomes 0. + # For some reason when we try to set it again in the clone it results in 0! 
+ if len(values_ref) == 1: + md_ref_tmp["bitsPerValue"] = 0 + + assert md == md_ref_tmp + + assert f.shape == shape_ref + assert len(f.values) == len(values_ref) + if md["gridType"] != "sh": + assert np.allclose(f.values, values_ref, rtol=1e-1) + + +# @pytest.mark.skipif(True, reason="headers_only_clone has to be fixed") +def test_grib_metadata_override_headers_only_true_core(): + ds = from_source("file", earthkit_examples_file("test.grib")) + ref_size = ds[0].metadata("totalLength") + + md1 = ds[0].metadata().override(headers_only_clone=True) + assert isinstance(md1, WrappedMetadata) + assert md1._handle is not None + assert md1._handle != ds[0]._handle + assert md1["totalLength"] - ref_size < -10 + assert md1["bitsPerValue"] == 16 + assert md1["shortName"] == "2t" + assert md1["typeOfLevel"] == "surface" + + md2 = md1._hide_internal_keys() + assert isinstance(md2, RestrictedGribMetadata) + assert md2._handle is not None + assert md2._handle != ds[0]._handle + assert md2._handle == md1._handle + assert md2.extra == {"bitsPerValue": 16} + assert md2["bitsPerValue"] == 16 + assert md2["shortName"] == "2t" + assert md2["typeOfLevel"] == "surface" + + with pytest.raises(KeyError): + md2["average"] + + md3 = md2.override(headers_only_clone=True, shortName="2d") + assert isinstance(md3, RestrictedGribMetadata) + assert md3._handle is not None + assert md3._handle != ds[0]._handle + assert md3._handle != md1._handle + assert md3._handle != md2._handle + assert md3["totalLength"] - ref_size < -10 + assert md3.extra == {"bitsPerValue": 16} + assert md3["bitsPerValue"] == 16 + assert md3["shortName"] == "2d" + assert md3["typeOfLevel"] == "surface" + + md4 = md3._hide_internal_keys() + assert md4 is md3 + + md5 = md3.override(headers_only_clone=True, bitsPerValue=8) + assert isinstance(md5, RestrictedGribMetadata) + assert md5._handle is not None + assert md5._handle != ds[0]._handle + assert md5._handle != md1._handle + assert md5._handle != md3._handle + assert 
md5["totalLength"] - ref_size < -10 + assert md5.extra == {"bitsPerValue": 8} + assert md5["bitsPerValue"] == 8 + assert md5["shortName"] == "2d" + assert md5["typeOfLevel"] == "surface" + + +def test_grib_metadata_override_headers_only_false_core(): + ds = from_source("file", earthkit_examples_file("test.grib")) + ref_size = ds[0].metadata("totalLength") + + md1 = ds[0].metadata().override(headers_only_clone=False) + assert isinstance(md1, WrappedMetadata) + assert isinstance(md1.metadata, StandAloneGribMetadata) + assert md1._handle is not None + assert md1._handle != ds[0]._handle + assert np.isclose(md1["totalLength"], ref_size) + + md2 = md1._hide_internal_keys() + assert isinstance(md2, RestrictedGribMetadata) + assert isinstance(md2.metadata, StandAloneGribMetadata) + assert md2._handle is not None + assert md2._handle != ds[0]._handle + assert md2._handle == md1._handle + + with pytest.raises(KeyError): + md2["average"] + + +@pytest.mark.cache +@pytest.mark.parametrize( + "file", + # grid_list(files=["regular_ll_single_point.grib1"]), + grid_list(), +) +def test_grib_metadata_headers_only_clone_true_grids(file): + ds0 = from_source("url", earthkit_remote_test_data_file("test-data", "xr_engine", "grid", file)) + + keys = ["bitsPerValue", "level", "shortName", "gridType", "packingType", "date"] + md_ref = {k: ds0[0].metadata(k) for k in keys} + shape_ref = ds0[0].shape + vals_ref = ds0[0].values + + # array field + ds = ds0.to_fieldlist() + md = {k: ds[0].metadata(k) for k in keys} + assert isinstance(ds[0].metadata(), RestrictedGribMetadata) + assert md == md_ref + assert ds[0].shape == shape_ref + + # create new field with modified metadata + f = ds[0].copy(metadata=ds[0].metadata().override(date=19810102)) + assert isinstance(f.metadata(), RestrictedGribMetadata) + md_ref_1 = dict(**md_ref) + md_ref_1["date"] = 19810102 + md = {k: f.metadata(k) for k in keys} + assert md == md_ref_1 + + # save to disk, should use the original bitsPerValue + + # for single 
point data, the original bitsPerValue is 12. In the cloned handle it becomes 0. + # For some reason when we try to set it again to 12 on the clone it results in 0! + check_field_write(f, md_ref_1, shape_ref, vals_ref) + + # save to disk, with the given bitsPerValue + md_ref_tmp = dict(**md_ref_1) + md_ref_tmp["bitsPerValue"] = 24 + check_field_write(f, md_ref_tmp, shape_ref, vals_ref, bits_per_value=24) + + # create new field with modified bitsPerValue + f = f.copy(metadata=f.metadata().override(bitsPerValue=8)) + assert isinstance(f.metadata(), RestrictedGribMetadata) + md_ref_1["bitsPerValue"] = 8 + md = {k: f.metadata(k) for k in keys} + assert md == md_ref_1 + + # save to disk, with the given bitsPerValue + check_field_write(f, md_ref_1, shape_ref, vals_ref) + + # save to disk, with the given bitsPerValue + check_field_write(f, md_ref_1, shape_ref, vals_ref, use_writer=True) + + +@pytest.mark.cache +@pytest.mark.parametrize( + "file", + # grid_list(files=["reduced_gg_N32.grib1"]), + grid_list(), +) +def test_grib_metadata_headers_only_clone_false_grids(file): + ds0 = from_source("url", earthkit_remote_test_data_file("test-data", "xr_engine", "grid", file)) + + keys = ["bitsPerValue", "level", "shortName", "gridType", "packingType", "date"] + md_ref = {k: ds0[0].metadata(k) for k in keys} + shape_ref = ds0[0].shape + vals_ref = ds0[0].values + + # array field + ds = ds0.from_fields([ds0[0].copy(metadata=ds0[0].metadata().override(headers_only_clone=False))]) + md = {k: ds[0].metadata(k) for k in keys} + assert isinstance(ds[0].metadata(), RestrictedGribMetadata) + assert md == md_ref + assert ds[0].shape == shape_ref + + # create new field with modified metadata + f = ds[0].copy(metadata=ds[0].metadata().override(headers_only_clone=False, date=19810102)) + assert isinstance(f.metadata(), RestrictedGribMetadata) + md_ref_1 = dict(**md_ref) + md_ref_1["date"] = 19810102 + md = {k: f.metadata(k) for k in keys} + assert md == md_ref_1 + + # save to disk, should use 
the original bitsPerValue + + # for single point data, the original bitsPerValue is 12. In the cloned handle it becomes 0. + # For some reason when we try to set it again to 12 on the clone it results in 0! + check_field_write(f, md_ref_1, shape_ref, vals_ref) + + # save to disk, with the given bitsPerValue + md_ref_tmp = dict(**md_ref_1) + md_ref_tmp["bitsPerValue"] = 24 + check_field_write(f, md_ref_tmp, shape_ref, vals_ref, bits_per_value=24) + + # create new field with modified bitsPerValue + f = f.copy(metadata=f.metadata().override(bitsPerValue=8, headers_only_clone=False)) + assert isinstance(f.metadata(), RestrictedGribMetadata) + md_ref_1["bitsPerValue"] = 8 + md = {k: f.metadata(k) for k in keys} + assert md == md_ref_1 + + # save to disk, with the given bitsPerValue + check_field_write(f, md_ref_1, shape_ref, vals_ref) + + # save to disk, with the given bitsPerValue + check_field_write(f, md_ref_1, shape_ref, vals_ref, use_writer=True) + + +def test_grib_headers_only_clone_standalone_metadata(): + ds = from_source("file", earthkit_examples_file("test.grib")) + + md_ref = { + "param": "2t", + "date": 20200513, + "time": 1200, + "step": 0, + "level": 0, + "gridType": "regular_ll", + "type": "an", + } + + md0 = ds[0].metadata().override() + md1 = StandAloneGribMetadata(md0._handle) + for k, v in md_ref.items(): + assert md1[k] == v + + # the handle does not contain bitsPerValue + assert md1["bitsPerValue"] == 0 + + md0 = ds[0].metadata().override(bitsPerValue=8) + md1 = StandAloneGribMetadata(md0._handle) + for k, v in md_ref.items(): + assert md1[k] == v + + # the handle does not contain bitsPerValue + assert md1["bitsPerValue"] == 0 diff --git a/tests/grib/test_grib_output.py b/tests/grib/test_grib_output.py index a044698d..4cd5078c 100644 --- a/tests/grib/test_grib_output.py +++ b/tests/grib/test_grib_output.py @@ -306,6 +306,8 @@ def test_grib_output_field_template(array): if array: ds = ds.to_fieldlist() + assert ds[0].metadata("bitsPerValue") == 4 + 
path = os.path.join(tmp, "a.grib") f = earthkit.data.new_grib_output(path, template=ds[0], date=20010101) f.write(data, param="pt", bitsPerValue=16) @@ -318,6 +320,7 @@ def test_grib_output_field_template(array): assert ds[0].metadata("levtype") == "pl" assert ds[0].metadata("edition") == 1 assert ds[0].metadata("generatingProcessIdentifier") == 255 + assert ds[0].metadata("bitsPerValue") == 16 assert np.allclose(ds[0].to_numpy(), data, rtol=1e-2, atol=1e-2) diff --git a/tests/grib/test_grib_serialise.py b/tests/grib/test_grib_serialise.py index c5adc5f7..2435e776 100644 --- a/tests/grib/test_grib_serialise.py +++ b/tests/grib/test_grib_serialise.py @@ -18,6 +18,7 @@ from earthkit.data import from_source from earthkit.data.core.temporary import temp_file +from earthkit.data.readers.grib.metadata import StandAloneGribMetadata from earthkit.data.testing import earthkit_examples_file here = os.path.dirname(__file__) @@ -55,6 +56,29 @@ def test_grib_serialise_metadata(fl_type, representation): assert md[k] == md2[k] +@pytest.mark.parametrize("representation", ["file", "memory"]) +def test_grib_serialise_standalone_metadata(representation): + ds = from_source("file", earthkit_examples_file("test.grib")) + + md_ref = { + "param": "2t", + "date": 20200513, + "time": 1200, + "step": 0, + "level": 0, + "gridType": "regular_ll", + "type": "an", + } + + md = StandAloneGribMetadata(ds[0].handle) + for k, v in md_ref.items(): + assert md[k] == v + + md2 = _pickle(md, representation) + for k, v in md_ref.items(): + assert md2[k] == v + + @pytest.mark.parametrize("fl_type", FL_NUMPY) @pytest.mark.parametrize("representation", ["file", "memory"]) def test_grib_serialise_array_field_memory(fl_type, representation): diff --git a/tests/xr_engine/test_xr_write.py b/tests/xr_engine/test_xr_write.py index ded43f6c..118bb8c5 100644 --- a/tests/xr_engine/test_xr_write.py +++ b/tests/xr_engine/test_xr_write.py @@ -225,3 +225,29 @@ def test_xr_write_seasonal(): assert 
sorted(r.metadata(["indexingDate", "indexingTime", "forecastMonth"])) == sorted( ds_ek.metadata(["indexingDate", "indexingTime", "forecastMonth"]) ) + + +def test_xr_write_bits_per_value(): + ds_ek = from_source("url", earthkit_remote_test_data_file("test-data/xr_engine/level/pl.grib")) + ds_ek = ds_ek.sel(param=["t", "r"], level=[500, 850]) + + ref_bpm = ds_ek[0].metadata("bitsPerValue") + assert ref_bpm == 16 + + ds_ek = ds_ek.to_fieldlist() + ds_ek = ds_ek.from_fields([f.clone(bitsPerValue=8) for f in ds_ek]) + assert ds_ek[0].metadata("bitsPerValue") == 8 + assert ds_ek[0].handle.get("bitsPerValue") == 0 + + import xarray as xr + + xr.set_options(keep_attrs=True) + + ds = ds_ek.to_xarray(**{"profile": "mars", "time_dim_mode": "raw"}) + ds += 1 + + # data-array + r = ds["t"].earthkit.to_fieldlist() + assert len(r) == 16 + assert r.index("shortName") == ["t"] + assert r[0].metadata("bitsPerValue") == 8