diff --git a/doc/conf.py b/doc/conf.py
index 3bf62cee4e..14f8afc2ec 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -456,10 +456,12 @@
     'iris': ('https://scitools-iris.readthedocs.io/en/stable/', None),
     'esmf_regrid': ('https://iris-esmf-regrid.readthedocs.io/en/stable/', None),
     'matplotlib': ('https://matplotlib.org/stable/', None),
+    'ncdata': ('https://ncdata.readthedocs.io/en/stable/', None),
     'numpy': ('https://numpy.org/doc/stable/', None),
     'pyesgf': ('https://esgf-pyclient.readthedocs.io/en/stable/', None),
     'python': ('https://docs.python.org/3/', None),
     'scipy': ('https://docs.scipy.org/doc/scipy/', None),
+    'xarray': ('https://docs.xarray.dev/en/stable/', None),
 }

 # -- Extlinks extension -------------------------------------------------------
diff --git a/doc/develop/fixing_data.rst b/doc/develop/fixing_data.rst
index 68b6e27221..5ea5c53758 100644
--- a/doc/develop/fixing_data.rst
+++ b/doc/develop/fixing_data.rst
@@ -126,9 +126,9 @@ Then we have to create the class for the fix deriving from
 Next we must choose the method to use between the ones offered by the Fix
 class:

-- ``fix_file``: should be used only to fix errors that prevent data loading.
-  As a rule of thumb, you should only use it if the execution halts before
-  reaching the checks.
+- ``fix_file``: you need to fix errors that prevent loading the data with Iris,
+  or to perform operations that are more efficient with other packages (e.g.,
+  loading files with lots of variables is much faster with Xarray than Iris).

 - ``fix_metadata``: you want to change something in the cube that is not the
   data (e.g., variable or coordinate names, data units).
diff --git a/doc/quickstart/configure.rst b/doc/quickstart/configure.rst
index a85e85e535..465092d0cc 100644
--- a/doc/quickstart/configure.rst
+++ b/doc/quickstart/configure.rst
@@ -934,6 +934,7 @@ the preprocessing chain.

 Currently supported preprocessor steps:

+* :func:`~esmvalcore.preprocessor.fix_file`
 * :func:`~esmvalcore.preprocessor.load`

 Here is an example on how to ignore specific warnings during the preprocessor
diff --git a/doc/recipe/preprocessor.rst b/doc/recipe/preprocessor.rst
index 35190c1bee..906314725f 100644
--- a/doc/recipe/preprocessor.rst
+++ b/doc/recipe/preprocessor.rst
@@ -272,20 +272,22 @@ ESMValCore deals with those issues by applying specific fixes for those
 datasets that require them. Fixes are applied at three different preprocessor
 steps:

-  - ``fix_file``: apply fixes directly to a copy of the file.
-    Copying the files is costly, so only errors that prevent Iris to load the
-    file are fixed here.
-    See :func:`esmvalcore.preprocessor.fix_file`.
-
-  - ``fix_metadata``: metadata fixes are done just before concatenating the
-    cubes loaded from different files in the final one.
-    Automatic metadata fixes are also applied at this step.
-    See :func:`esmvalcore.preprocessor.fix_metadata`.
-
-  - ``fix_data``: data fixes are applied before starting any operation that
-    will alter the data itself.
-    Automatic data fixes are also applied at this step.
-    See :func:`esmvalcore.preprocessor.fix_data`.
+- ``fix_file``: apply fixes to the data before loading them with Iris.
+  This is mainly intended to fix errors that prevent data loading with Iris
+  (e.g., those related to ``missing_value`` or ``_FillValue``) or to perform
+  operations that are more efficient with other packages (e.g., loading
+  files with lots of variables is much faster with Xarray than Iris). See
+  :func:`esmvalcore.preprocessor.fix_file`.
+
+- ``fix_metadata``: metadata fixes are done just before concatenating the
+  cubes loaded from different files into the final one.
+  Automatic metadata fixes are also applied at this step.
+  See :func:`esmvalcore.preprocessor.fix_metadata`.
+
+- ``fix_data``: data fixes are applied before starting any operation that
+  will alter the data itself.
+  Automatic data fixes are also applied at this step.
+  See :func:`esmvalcore.preprocessor.fix_data`.

 To get an overview on data fixes and how to implement new ones, please go to
 :ref:`fixing_data`.
diff --git a/environment.yml b/environment.yml
index b793092e50..a2e19d8b03 100644
--- a/environment.yml
+++ b/environment.yml
@@ -25,6 +25,7 @@ dependencies:
   - jinja2
   - libnetcdf !=4.9.1 # to avoid hdf5 warnings
   - nc-time-axis
+  - ncdata
   - nested-lookup
   - netcdf4
   - numpy !=1.24.3
diff --git a/esmvalcore/cmor/_fixes/cmip6/cesm2.py b/esmvalcore/cmor/_fixes/cmip6/cesm2.py
index 9b190adc63..a5b0545a5e 100644
--- a/esmvalcore/cmor/_fixes/cmip6/cesm2.py
+++ b/esmvalcore/cmor/_fixes/cmip6/cesm2.py
@@ -25,6 +25,7 @@ def _fix_formula_terms(
         filepath,
         output_dir,
         add_unique_suffix=False,
+        ignore_warnings=None,
     ):
         """Fix ``formula_terms`` attribute."""
         new_path = self.get_fixed_filepath(
diff --git a/esmvalcore/cmor/_fixes/cmip6/cesm2_waccm.py b/esmvalcore/cmor/_fixes/cmip6/cesm2_waccm.py
index d3bbc4dafe..6cad2ede52 100644
--- a/esmvalcore/cmor/_fixes/cmip6/cesm2_waccm.py
+++ b/esmvalcore/cmor/_fixes/cmip6/cesm2_waccm.py
@@ -15,7 +15,13 @@ class Cl(BaseCl):
     """Fixes for cl."""

-    def fix_file(self, filepath, output_dir, add_unique_suffix=False):
+    def fix_file(
+        self,
+        filepath,
+        output_dir,
+        add_unique_suffix=False,
+        ignore_warnings=None,
+    ):
         """Fix hybrid pressure coordinate.

         Adds missing ``formula_terms`` attribute to file.
diff --git a/esmvalcore/cmor/_fixes/emac/emac.py b/esmvalcore/cmor/_fixes/emac/emac.py
index 028c82af8b..026dfde98d 100644
--- a/esmvalcore/cmor/_fixes/emac/emac.py
+++ b/esmvalcore/cmor/_fixes/emac/emac.py
@@ -40,7 +40,13 @@ class AllVars(EmacFix):
         "kg/m**2s": "kg m-2 s-1",
     }

-    def fix_file(self, filepath, output_dir, add_unique_suffix=False):
+    def fix_file(
+        self,
+        filepath,
+        output_dir,
+        add_unique_suffix=False,
+        ignore_warnings=None,
+    ):
         """Fix file.

         Fixes hybrid pressure level coordinate.
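Editor's note (illustrative, not part of the diff): every ``fix_file`` implementation above gains an ``ignore_warnings`` argument. Per the docstrings added below, each list element holds the keyword arguments for one ``warnings.filterwarnings`` call, with ``action`` defaulting to ``"ignore"``. A minimal sketch of those semantics; the variable name is arbitrary, and the example filter is one of the defaults added to ``esmvalcore/iris_helpers.py`` further down::

    import warnings

    # One dict per warnings.filterwarnings() call made while loading the data
    ignore_warnings = [
        {
            "message": "Ignoring invalid units .* on netCDF variable .*",
            "category": UserWarning,
            "module": "iris",
        },
    ]

    for kwargs in ignore_warnings:
        kwargs.setdefault("action", "ignore")  # "action" defaults to "ignore"
        warnings.filterwarnings(**kwargs)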
diff --git a/esmvalcore/cmor/_fixes/fix.py b/esmvalcore/cmor/_fixes/fix.py
index 9a229b2dc4..734d422afd 100644
--- a/esmvalcore/cmor/_fixes/fix.py
+++ b/esmvalcore/cmor/_fixes/fix.py
@@ -11,7 +11,12 @@ from typing import TYPE_CHECKING, Any, Optional

 import dask
+import iris
+import ncdata.iris
+import ncdata.iris_xarray
+import ncdata.threadlock_sharing
 import numpy as np
+import xarray as xr
 from cf_units import Unit
 from iris.coords import Coord, CoordExtent
 from iris.cube import Cube, CubeList
@@ -27,7 +32,11 @@
 )
 from esmvalcore.cmor.fixes import get_time_bounds
 from esmvalcore.cmor.table import get_var_info
-from esmvalcore.iris_helpers import has_unstructured_grid, safe_convert_units
+from esmvalcore.iris_helpers import (
+    has_unstructured_grid,
+    ignore_warnings_context,
+    safe_convert_units,
+)

 if TYPE_CHECKING:
     from esmvalcore.cmor.table import CoordinateInfo, VariableInfo
@@ -36,6 +45,9 @@
 logger = logging.getLogger(__name__)
 generic_fix_logger = logging.getLogger(f"{__name__}.genericfix")

+# Enable lock sharing between ncdata and iris/xarray
+ncdata.threadlock_sharing.enable_lockshare(iris=True, xarray=True)
+

 class Fix:
     """Base class for dataset fixes."""
@@ -78,28 +90,43 @@ def fix_file(
         filepath: Path,
         output_dir: Path,
         add_unique_suffix: bool = False,
-    ) -> str | Path:
-        """Apply fixes to the files prior to creating the cube.
+        ignore_warnings: Optional[list[dict]] = None,
+    ) -> str | Path | Cube | CubeList:
+        """Fix files before loading them into a :class:`~iris.cube.CubeList`.

-        Should be used only to fix errors that prevent loading or cannot be
-        fixed in the cube (e.g., those related to `missing_value` or
-        `_FillValue`).
+        This is mainly intended to fix errors that prevent loading the data
+        with Iris (e.g., those related to ``missing_value`` or ``_FillValue``)
+        or to perform operations that are more efficient with other packages
+        (e.g., loading files with lots of variables is much faster with Xarray
+        than Iris).
+
+        Warning
+        -------
+        A path should only be returned if it points to the original
+        (unchanged) file (i.e., if no fix was necessary). If a fix is
+        necessary, this function should return a :class:`~iris.cube.Cube` or
+        :class:`~iris.cube.CubeList`, which can for example be created from an
+        :class:`~ncdata.NcData` or :class:`~xarray.Dataset` object using the
+        helper function ``Fix.dataset_to_iris()``. Under no circumstances
+        should a copy of the input data be created (this is very inefficient).

         Parameters
         ----------
         filepath:
-            File to fix.
+            Path to the original file. Original files should not be
+            overwritten.
         output_dir:
             Output directory for fixed files.
         add_unique_suffix:
-            Adds a unique suffix to `output_dir` for thread safety.
+            Adds a unique suffix to ``output_dir`` for thread safety.
+        ignore_warnings:
+            Keyword arguments passed to :func:`warnings.filterwarnings` used
+            to ignore warnings during data loading. Each list element
+            corresponds to one call to :func:`warnings.filterwarnings`.

         Returns
         -------
-        str or pathlib.Path
-            Path to the corrected file. It can be different from the original
-            filepath if a fix has been applied, but if not it should be the
-            original filepath.
+        str | Path | Cube | CubeList:
+            The fixed cube(s), or the path to the original (unchanged) file
+            if no fix was necessary.
""" return filepath @@ -157,6 +184,84 @@ def get_cube_from_list( return cube raise ValueError(f'Cube for variable "{short_name}" not found') + @staticmethod + def _get_attribute( + data: ncdata.NcData | ncdata.NcVariable | xr.Dataset | xr.DataArray, + attribute_name: str, + ) -> Any: + """Get attribute from an ncdata or xarray object.""" + if hasattr(data, "attributes"): # ncdata.NcData | ncdata.NcVariable + attribute = data.attributes[attribute_name].value + else: # xr.Dataset | xr.DataArray + attribute = data.attrs[attribute_name] + return attribute + + def dataset_to_iris( + self, + dataset: ncdata.NcData | xr.Dataset, + filepath: str | Path, + ignore_warnings: Optional[list[dict]] = None, + ) -> CubeList: + """Convert dataset to :class:`~iris.cube.CubeList`. + + This function mimics the behavior of + :func:`esmvalcore.preprocessor.load`. + + Parameters + ---------- + dataset: + The dataset object to convert. + filepath: + The path that the dataset was loaded from. + ignore_warnings: + Keyword arguments passed to :func:`warnings.filterwarnings` used to + ignore warnings during data loading. Each list element corresponds + to one call to :func:`warnings.filterwarnings`. + + Returns + ------- + CubeList + :class:`~iris.cube.CubeList` containing the requested cubes. + + Raises + ------ + TypeError + Invalid type for ``dataset`` given. + + """ + if isinstance(dataset, ncdata.NcData): + conversion_func = ncdata.iris.to_iris + ds_coords = dataset.variables + elif isinstance(dataset, xr.Dataset): + conversion_func = ncdata.iris_xarray.cubes_from_xarray + ds_coords = dataset.coords + else: + raise TypeError( + f"Expected type ncdata.NcData or xr.Dataset for dataset, got " + f"type {type(dataset)}" + ) + + with ignore_warnings_context(ignore_warnings): + cubes = conversion_func(dataset) + + # Restore the lat/lon coordinate units that iris changes to degrees + for coord_name in ["latitude", "longitude"]: + for cube in cubes: + try: + coord = cube.coord(coord_name) + except iris.exceptions.CoordinateNotFoundError: + pass + else: + if coord.var_name in ds_coords: + ds_coord = ds_coords[coord.var_name] + coord.units = self._get_attribute(ds_coord, "units") + + # Add the source file as an attribute to support grouping by + # file when calling fix_metadata. + cube.attributes["source_file"] = str(filepath) + + return cubes + def fix_data(self, cube: Cube) -> Cube: """Apply fixes to the data of the cube. diff --git a/esmvalcore/cmor/_fixes/ipslcm/ipsl_cm6.py b/esmvalcore/cmor/_fixes/ipslcm/ipsl_cm6.py index 5902711a23..3cbdc57fbb 100644 --- a/esmvalcore/cmor/_fixes/ipslcm/ipsl_cm6.py +++ b/esmvalcore/cmor/_fixes/ipslcm/ipsl_cm6.py @@ -17,7 +17,13 @@ class AllVars(Fix): """Fixes for all IPSLCM variables.""" - def fix_file(self, filepath, output_dir, add_unique_suffix=False): + def fix_file( + self, + filepath, + output_dir, + add_unique_suffix=False, + ignore_warnings=None, + ): """Select IPSLCM variable in filepath. This is done only if input file is a multi-variable one. This diff --git a/esmvalcore/cmor/fix.py b/esmvalcore/cmor/fix.py index ab81353cfb..2bdf1f97e6 100644 --- a/esmvalcore/cmor/fix.py +++ b/esmvalcore/cmor/fix.py @@ -33,19 +33,30 @@ def fix_file( add_unique_suffix: bool = False, session: Optional[Session] = None, frequency: Optional[str] = None, + ignore_warnings: Optional[list[dict]] = None, **extra_facets, -) -> str | Path: - """Fix files before ESMValTool can load them. +) -> str | Path | Cube | CubeList: + """Fix files before loading them into a :class:`~iris.cube.CubeList`. 

-    These fixes are only for issues that prevent iris from loading the cube or
-    that cannot be fixed after the cube is loaded.
+    This is mainly intended to fix errors that prevent loading the data with
+    Iris (e.g., those related to ``missing_value`` or ``_FillValue``) or to
+    perform operations that are more efficient with other packages (e.g.,
+    loading files with lots of variables is much faster with Xarray than Iris).

-    Original files are not overwritten.
+    Warning
+    -------
+    A path should only be returned if it points to the original (unchanged)
+    file (i.e., if no fix was necessary). If a fix is necessary, this function
+    should return a :class:`~iris.cube.Cube` or :class:`~iris.cube.CubeList`,
+    which can for example be created from an :class:`~ncdata.NcData` or
+    :class:`~xarray.Dataset` object using the helper function
+    ``Fix.dataset_to_iris()``. Under no circumstances should a copy of the
+    input data be created (this is very inefficient).

     Parameters
     ----------
     file:
-        Path to the original file.
+        Path to the original file. Original files are not overwritten.
     short_name:
         Variable's short name.
     project:
@@ -57,19 +68,23 @@
     output_dir:
         Output directory for fixed files.
     add_unique_suffix:
-        Adds a unique suffix to `output_dir` for thread safety.
+        Adds a unique suffix to ``output_dir`` for thread safety.
     session:
         Current session which includes configuration and directory
         information.
     frequency:
         Variable's data frequency, if available.
+    ignore_warnings:
+        Keyword arguments passed to :func:`warnings.filterwarnings` used to
+        ignore warnings during data loading. Each list element corresponds to
+        one call to :func:`warnings.filterwarnings`.
     **extra_facets:
         Extra facets are mainly used for data outside of the big projects
         like CMIP, CORDEX, obs4MIPs. For details, see :ref:`extra_facets`.

     Returns
     -------
-    str or pathlib.Path
-        Path to the fixed file.
+    str | Path | Cube | CubeList:
+        The fixed cube(s), or the path to the original (unchanged) file if no
+        fix was necessary.
""" # Update extra_facets with variable information given as regular arguments @@ -94,7 +109,10 @@ def fix_file( frequency=frequency, ): file = fix.fix_file( - file, output_dir, add_unique_suffix=add_unique_suffix + file, + output_dir, + add_unique_suffix=add_unique_suffix, + ignore_warnings=ignore_warnings, ) return file diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 6717485e38..206ba9482f 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -753,6 +753,9 @@ def _load(self) -> Cube: "output_dir": fix_dir_prefix, "add_unique_suffix": True, "session": self.session, + "ignore_warnings": get_ignored_warnings( + self.facets["project"], "fix_file" + ), **self.facets, } settings["load"] = { diff --git a/esmvalcore/iris_helpers.py b/esmvalcore/iris_helpers.py index 1077c3845c..2a7ddd807d 100644 --- a/esmvalcore/iris_helpers.py +++ b/esmvalcore/iris_helpers.py @@ -2,17 +2,19 @@ from __future__ import annotations +import contextlib import warnings -from collections.abc import Generator -from contextlib import contextmanager -from typing import Dict, Iterable, List, Literal, Sequence +from collections.abc import Generator, Iterable +from typing import Literal, Optional, Sequence import dask.array as da import iris +import iris.aux_factory import iris.cube +import iris.exceptions import iris.util import numpy as np -from cf_units import Unit +from cf_units import Unit, suppress_errors from iris.coords import Coord from iris.cube import Cube from iris.exceptions import CoordinateMultiDimError, CoordinateNotFoundError @@ -21,7 +23,7 @@ from esmvalcore.typing import NetCDFAttr -@contextmanager +@contextlib.contextmanager def ignore_iris_vague_metadata_warnings() -> Generator[None]: """Ignore specific warnings. @@ -163,7 +165,7 @@ def merge_cube_attributes( return # Step 1: collect all attribute values in a list - attributes: Dict[str, List[NetCDFAttr]] = {} + attributes: dict[str, list[NetCDFAttr]] = {} for cube in cubes: for attr, val in cube.attributes.items(): attributes.setdefault(attr, []) @@ -498,3 +500,52 @@ def safe_convert_units(cube: Cube, units: str | Unit) -> Cube: f"'{cube.standard_name}'" ) return cube + + +@contextlib.contextmanager +def ignore_warnings_context( + warnings_to_ignore: Optional[list[dict]] = None, +) -> Generator[None]: + """Ignore warnings (context manager). + + Parameters + ---------- + warnings_to_ignore: + Additional warnings to ignore (by default, Iris warnings about missing + CF-netCDF measure variables and invalid units are ignored). 
+
+    """
+    if warnings_to_ignore is None:
+        warnings_to_ignore = []
+
+    default_warnings_to_ignore: list[dict] = [
+        {
+            "message": "Missing CF-netCDF measure variable .*",
+            "category": UserWarning,
+            "module": "iris",
+        },
+        {  # iris < 3.8
+            "message": "Ignoring netCDF variable '.*' invalid units '.*'",
+            "category": UserWarning,
+            "module": "iris",
+        },
+        {  # iris >= 3.8
+            "message": "Ignoring invalid units .* on netCDF variable .*",
+            "category": UserWarning,
+            "module": "iris",
+        },
+    ]
+
+    with contextlib.ExitStack() as stack:
+        # Regular warnings
+        stack.enter_context(warnings.catch_warnings())
+        for warning_kwargs in warnings_to_ignore + default_warnings_to_ignore:
+            warning_kwargs.setdefault("action", "ignore")
+            warnings.filterwarnings(**warning_kwargs)
+
+        # Suppress UDUNITS-2 error messages that cannot be ignored with
+        # warnings.filterwarnings
+        # (see https://github.com/SciTools/cf-units/issues/240)
+        stack.enter_context(suppress_errors())
+
+        yield
diff --git a/esmvalcore/preprocessor/_io.py b/esmvalcore/preprocessor/_io.py
index 2e817fe958..a3663902a4 100644
--- a/esmvalcore/preprocessor/_io.py
+++ b/esmvalcore/preprocessor/_io.py
@@ -5,7 +5,7 @@
 import copy
 import logging
 import os
-from collections.abc import Sequence
+from collections.abc import Iterable, Sequence
 from itertools import groupby
 from pathlib import Path
 from typing import NamedTuple, Optional
@@ -17,13 +17,16 @@
 import iris.exceptions
 import numpy as np
 import yaml
-from cf_units import suppress_errors
 from dask.delayed import Delayed
-from iris.cube import CubeList
+from iris.coords import Coord
+from iris.cube import Cube, CubeList

 from esmvalcore.cmor.check import CheckLevels
 from esmvalcore.esgf.facets import FACETS
-from esmvalcore.iris_helpers import merge_cube_attributes
+from esmvalcore.iris_helpers import (
+    ignore_warnings_context,
+    merge_cube_attributes,
+)
 from esmvalcore.preprocessor._shared import _rechunk_aux_factory_dependencies

 from .._task import write_ncl_settings
@@ -43,6 +46,7 @@


 def _get_attr_from_field_coord(ncfield, coord_name, attr):
+    """Get attribute from netCDF field coordinate."""
     if coord_name is not None:
         attrs = ncfield.cf_group[coord_name].cf_attrs()
         attr_val = [value for (key, value) in attrs if key == attr]
@@ -51,31 +55,30 @@
         return None


-def _load_callback(raw_cube, field, _):
-    """Use this callback to fix anything Iris tries to break."""
-    # Remove attributes that cause issues with merging and concatenation
-    _delete_attributes(
-        raw_cube, ("creation_date", "tracking_id", "history", "comment")
-    )
-    for coord in raw_cube.coords():
-        # Iris chooses to change longitude and latitude units to degrees
-        # regardless of value in file, so reinstating file value
+def _restore_lat_lon_units(
+    cube,
+    field,
+    filename,
+):  # pylint: disable=unused-argument
+    """Use this callback to restore the original lat/lon units."""
+    # Iris chooses to change longitude and latitude units to degrees
+    # regardless of value in file, so reinstating file value
+    for coord in cube.coords():
         if coord.standard_name in ["longitude", "latitude"]:
             units = _get_attr_from_field_coord(field, coord.var_name, "units")
             if units is not None:
                 coord.units = units
-        # CMOR sometimes adds a history to the coordinates.
-        _delete_attributes(coord, ("history",))


-def _delete_attributes(iris_object, atts):
+def _delete_attributes(iris_object: Cube | Coord, atts: Iterable[str]) -> None:
+    """Delete attributes from Iris cube or coordinate."""
     for att in atts:
         if att in iris_object.attributes:
             del iris_object.attributes[att]


 def load(
-    file: str | Path,
+    file: str | Path | Cube | CubeList,
     ignore_warnings: Optional[list[dict]] = None,
 ) -> CubeList:
     """Load iris cubes from string or Path objects.

     Parameters
     ----------
     file:
-        File to be loaded. Could be string or POSIX Path object.
+        File to be loaded. If ``file`` is already a :class:`~iris.cube.Cube` or
+        :class:`~iris.cube.CubeList`, this object will be returned as a
+        :class:`~iris.cube.CubeList`.
     ignore_warnings:
         Keyword arguments passed to :func:`warnings.filterwarnings` used to
         ignore warnings issued by :func:`iris.load_raw`. Each list element
@@ -99,55 +104,22 @@
     ValueError
         Cubes are empty.
     """
+    if isinstance(file, Cube):
+        return CubeList([file])
+    if isinstance(file, CubeList):
+        return file
+
+    file = Path(file)
     logger.debug("Loading:\n%s", file)

-    if ignore_warnings is None:
-        ignore_warnings = []
-
-    # Avoid duplication of ignored warnings when load() is called more often
-    # than once
-    ignore_warnings = list(ignore_warnings)
-
-    # Default warnings ignored for every dataset
-    ignore_warnings.append(
-        {
-            "message": "Missing CF-netCDF measure variable .*",
-            "category": UserWarning,
-            "module": "iris",
-        }
-    )
-    ignore_warnings.append(
-        {
-            "message": "Ignoring netCDF variable '.*' invalid units '.*'",
-            "category": UserWarning,
-            "module": "iris",
-        }
-    )  # iris < 3.8
-    ignore_warnings.append(
-        {
-            "message": "Ignoring invalid units .* on netCDF variable .*",
-            "category": UserWarning,
-            "module": "iris",
-        }
-    )  # iris >= 3.8
-
-    # Filter warnings
-    with catch_warnings():
-        for warning_kwargs in ignore_warnings:
-            warning_kwargs.setdefault("action", "ignore")
-            filterwarnings(**warning_kwargs)
-        # Suppress UDUNITS-2 error messages that cannot be ignored with
-        # warnings.filterwarnings
-        # (see https://github.com/SciTools/cf-units/issues/240)
-        with suppress_errors():
-            # GRIB files need to be loaded with iris.load, otherwise we will
-            # get separate (lat, lon) slices for each time step, pressure
-            # level, etc.
-            if file.suffix in GRIB_FORMATS:
-                raw_cubes = iris.load(file, callback=_load_callback)
-            else:
-                raw_cubes = iris.load_raw(file, callback=_load_callback)
+    with ignore_warnings_context(ignore_warnings):
+        # GRIB files need to be loaded with iris.load, otherwise we will
+        # get separate (lat, lon) slices for each time step, pressure
+        # level, etc.
+        if file.suffix in GRIB_FORMATS:
+            raw_cubes = iris.load(file, callback=_restore_lat_lon_units)
+        else:
+            raw_cubes = iris.load_raw(file, callback=_restore_lat_lon_units)
     logger.debug("Done with loading %s", file)

     if not raw_cubes:
@@ -176,7 +148,7 @@ def _concatenate_cubes(cubes, check_level):
             "and derived coordinates present in the cubes.",
         )

-    concatenated = iris.cube.CubeList(cubes).concatenate(**kwargs)
+    concatenated = CubeList(cubes).concatenate(**kwargs)

     return concatenated

@@ -196,7 +168,7 @@ def __getitem__(self, key):
         return self.times[key]


-def _check_time_overlaps(cubes: iris.cube.CubeList) -> iris.cube.CubeList:
+def _check_time_overlaps(cubes: CubeList) -> CubeList:
     """Handle time overlaps.

     Parameters
@@ -215,7 +187,7 @@ def _check_time_overlaps(cubes: iris.cube.CubeList) -> iris.cube.CubeList:
         return cubes

     class _TrackedCube(NamedTuple):
-        cube: iris.cube.Cube
+        cube: Cube
         times: iris.coords.DimCoord
         start: float
         end: float
@@ -227,7 +199,7 @@ def from_cube(cls, cube):
             start, end = times.core_points()[[0, -1]]
             return cls(cube, times, start, end)

-    new_cubes = iris.cube.CubeList()
+    new_cubes = CubeList()
     current_cube = _TrackedCube.from_cube(cubes[0])
     for new_cube in map(_TrackedCube.from_cube, cubes[1:]):
         if new_cube.start > current_cube.end:
@@ -308,7 +280,7 @@ def _get_concatenation_error(cubes):
     """Raise an error for concatenation."""
     # Concatenation not successful -> retrieve exact error message
     try:
-        iris.cube.CubeList(cubes).concatenate_cube()
+        CubeList(cubes).concatenate_cube()
     except iris.exceptions.ConcatenateError as exc:
         msg = str(exc)
         logger.error("Can not concatenate cubes into a single one: %s", msg)
@@ -338,9 +310,7 @@ def _sort_cubes_by_time(cubes):
     return cubes


-def _concatenate_cubes_by_experiment(
-    cubes: list[iris.cube.Cube],
-) -> list[iris.cube.Cube]:
+def _concatenate_cubes_by_experiment(cubes: list[Cube]) -> list[Cube]:
     """Concatenate cubes by experiment.

     This ensures overlapping (branching) experiments are handled correctly.
@@ -351,7 +321,7 @@
         project["exp"] for project in FACETS.values() if "exp" in project
     }

-    def get_exp(cube: iris.cube.Cube) -> str:
+    def get_exp(cube: Cube) -> str:
         for key in exp_facet_names:
             if key in cube.attributes:
                 return cube.attributes[key]
@@ -393,6 +363,15 @@ def concatenate(cubes, check_level=CheckLevels.DEFAULT):
     if len(cubes) == 1:
         return cubes[0]

+    for cube in cubes:
+        # Remove attributes that cause issues with merging and concatenation
+        _delete_attributes(
+            cube, ("creation_date", "tracking_id", "history", "comment")
+        )
+        for coord in cube.coords():
+            # CMOR sometimes adds a history to the coordinates.
+            _delete_attributes(coord, ("history",))
+
     cubes = _concatenate_cubes_by_experiment(cubes)

     merge_cube_attributes(cubes)
@@ -411,7 +390,7 @@


 def save(
-    cubes: Sequence[iris.cube.Cube],
+    cubes: Sequence[Cube],
     filename: Path | str,
     optimize_access: str = "",
     compress: bool = False,
@@ -445,14 +424,15 @@
         Var name to use when saving instead of the one in the cube.

     compute : bool, default=True
-        Default is ``True``, meaning complete the file immediately, and return ``None``.
-
-        When ``False``, create the output file but don't write any lazy array content to
-        its variables, such as lazy cube data or aux-coord points and bounds.
-        Instead return a :class:`dask.delayed.Delayed` which, when computed, will
-        stream all the lazy content via :meth:`dask.store`, to complete the file.
-        Several such data saves can be performed in parallel, by passing a list of them
-        into a :func:`dask.compute` call.
+        Default is ``True``, meaning complete the file immediately, and return
+        ``None``.
+
+        When ``False``, create the output file but don't write any lazy array
+        content to its variables, such as lazy cube data or aux-coord points
+        and bounds. Instead return a :class:`dask.delayed.Delayed` which, when
+        computed, will stream all the lazy content via :meth:`dask.store`, to
+        complete the file. Several such data saves can be performed in
+        parallel, by passing a list of them into a :func:`dask.compute` call.

     **kwargs:
         See :func:`iris.fileformats.netcdf.saver.save` for additional
diff --git a/pyproject.toml b/pyproject.toml
index bdd6681be4..bf7a74657f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -49,6 +49,7 @@ dependencies = [
     "isodate>=0.7.0",
     "jinja2",
     "nc-time-axis", # needed by iris.plot
+    "ncdata",
     "nested-lookup",
     "netCDF4",
     "numpy!=1.24.3",
diff --git a/tests/__init__.py b/tests/__init__.py
index e24d27357e..ce163afaa6 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -4,6 +4,15 @@
 from unittest import mock

 import numpy as np
+from cf_units import Unit
+from iris.coords import (
+    AncillaryVariable,
+    AuxCoord,
+    CellMeasure,
+    CellMethod,
+    DimCoord,
+)
+from iris.cube import Cube

 from esmvalcore.preprocessor import PreprocessorFile as PreprocessorFileBase

@@ -86,3 +95,62 @@ def __init__(self, cubes, filename, attributes, settings=None, **kwargs):
         self.copy_provenance = mock.Mock(return_value=self)

     group = PreprocessorFileBase.group
+
+
+def create_realistic_4d_cube():
+    """Create a realistic 4D cube."""
+    time = DimCoord(
+        [11.0, 12.0],
+        standard_name="time",
+        units=Unit("hours since 1851-01-01", calendar="360_day"),
+    )
+    plev = DimCoord([50000], standard_name="air_pressure", units="Pa")
+    lat = DimCoord([0.0, 1.0], standard_name="latitude", units="degrees_north")
+    lon = DimCoord(
+        [0.0, 20.0, 345.0], standard_name="longitude", units="degrees_east"
+    )
+
+    aux_2d_data = np.arange(2 * 3).reshape(2, 3)
+    aux_2d_bounds = np.stack(
+        (aux_2d_data - 1, aux_2d_data, aux_2d_data + 1), axis=-1
+    )
+    aux_2d = AuxCoord(aux_2d_data, var_name="aux_2d")
+    aux_2d_with_bnds = AuxCoord(
+        aux_2d_data, bounds=aux_2d_bounds, var_name="aux_2d_with_bnds"
+    )
+    aux_time = AuxCoord(["Jan", "Jan"], var_name="aux_time")
+    aux_lon = AuxCoord([0, 1, 2], var_name="aux_lon")
+
+    cell_area = CellMeasure(
+        np.arange(2 * 2 * 3).reshape(2, 2, 3) + 10,
+        standard_name="cell_area",
+        units="m2",
+        measure="area",
+    )
+    type_var = AncillaryVariable(
+        [["sea", "land", "lake"], ["lake", "sea", "land"]],
+        var_name="type",
+        units="no_unit",
+    )
+
+    cube = Cube(
+        np.ma.masked_inside(
+            np.arange(2 * 1 * 2 * 3).reshape(2, 1, 2, 3), 1, 3
+        ),
+        var_name="ta",
+        standard_name="air_temperature",
+        long_name="Air Temperature",
+        units="K",
+        cell_methods=[CellMethod("mean", "time")],
+        dim_coords_and_dims=[(time, 0), (plev, 1), (lat, 2), (lon, 3)],
+        aux_coords_and_dims=[
+            (aux_2d, (0, 3)),
+            (aux_2d_with_bnds, (0, 3)),
+            (aux_time, 0),
+            (aux_lon, 3),
+        ],
+        cell_measures_and_dims=[(cell_area, (0, 2, 3))],
+        ancillary_variables_and_dims=[(type_var, (0, 3))],
+        attributes={"test": 1},
+    )
+    return cube
diff --git a/tests/integration/cmor/_fixes/test_fix.py b/tests/integration/cmor/_fixes/test_fix.py
index 13ce5dc03c..5d80d86da9 100644
--- a/tests/integration/cmor/_fixes/test_fix.py
+++ b/tests/integration/cmor/_fixes/test_fix.py
@@ -1,10 +1,16 @@
 """Integration tests for fixes."""

 import os
+import warnings
 from pathlib import Path

+import ncdata.iris
+import ncdata.iris_xarray
+import numpy as np
 import pytest
-from iris.cube import Cube
+import xarray as xr
+from iris.cube import Cube, CubeList
+from iris.warnings import IrisUnknownCellMethodWarning

 from esmvalcore.cmor._fixes.cmip5.bnu_esm import Ch4
 from esmvalcore.cmor._fixes.cmip5.canesm2 import FgCo2
@@ -18,6 +24,7 @@
 from esmvalcore.cmor.fix import Fix
 from esmvalcore.cmor.table import get_var_info
 from esmvalcore.config import CFG
+from tests import create_realistic_4d_cube


 def test_get_fix():
@@ -205,3 +212,106 @@ def test_frequency_not_from_vardef():
     vardef = get_var_info("CMIP6", "Amon", "tas")
     fix = Fix(vardef, frequency="3hr")
     assert fix.frequency == "3hr"
+
+
+@pytest.fixture
+def dummy_cubes():
+    cube = create_realistic_4d_cube()
+    cube.remove_ancillary_variable(cube.ancillary_variables()[0])
+    cube.data = cube.lazy_data()
+    return CubeList([cube])
+
+
+@pytest.fixture
+def empty_cubes():
+    return CubeList([Cube(0.0)])
+
+
+@pytest.mark.parametrize(
+    "conversion_func",
+    [ncdata.iris.from_iris, ncdata.iris_xarray.cubes_to_xarray],
+)
+def test_dataset_to_iris(conversion_func, dummy_cubes):
+    dataset = conversion_func(dummy_cubes)
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")
+        cubes = Fix(None).dataset_to_iris(dataset, "path/to/file.nc")
+
+    assert len(cubes) == 1
+    cube = cubes[0]
+    assert cube.has_lazy_data()
+    np.testing.assert_allclose(cube.data, dummy_cubes[0].data)
+    assert cube.standard_name == dummy_cubes[0].standard_name
+    assert cube.var_name == dummy_cubes[0].var_name
+    assert cube.long_name == dummy_cubes[0].long_name
+    assert cube.units == dummy_cubes[0].units
+    assert str(cube.coord("latitude").units) == "degrees_north"
+    assert str(cube.coord("longitude").units) == "degrees_east"
+    assert cube.attributes["source_file"] == "path/to/file.nc"
+
+
+@pytest.mark.parametrize(
+    "conversion_func",
+    [ncdata.iris.from_iris, ncdata.iris_xarray.cubes_to_xarray],
+)
+def test_dataset_to_iris_empty_cube(conversion_func, empty_cubes):
+    dataset = conversion_func(empty_cubes)
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")
+        cubes = Fix(None).dataset_to_iris(dataset, "path/to/file.nc")
+
+    assert len(cubes) == 1
+    cube = cubes[0]
+
+    assert cube.has_lazy_data()
+    np.testing.assert_allclose(cube.data, np.array(0.0))
+    assert cube.standard_name is None
+    assert cube.var_name == "unknown"
+    assert cube.long_name is None
+    assert cube.units == "unknown"
+    assert not cube.coords()
+
+
+@pytest.mark.parametrize(
+    "conversion_func",
+    [ncdata.iris.from_iris, ncdata.iris_xarray.cubes_to_xarray],
+)
+def test_dataset_to_iris_ignore_warnings(conversion_func, dummy_cubes):
+    dataset = conversion_func(dummy_cubes)
+    if isinstance(dataset, xr.Dataset):
+        dataset.ta.attrs["units"] = "invalid_units"
+    else:
+        dataset.variables["ta"].attributes["units"] = "invalid_units"
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("error")
+        Fix(None).dataset_to_iris(dataset, "path/to/file.nc")
+
+
+@pytest.mark.parametrize(
+    "conversion_func",
+    [ncdata.iris.from_iris, ncdata.iris_xarray.cubes_to_xarray],
+)
+def test_dataset_to_iris_no_ignore_warnings(conversion_func, dummy_cubes):
+    dataset = conversion_func(dummy_cubes)
+    if isinstance(dataset, xr.Dataset):
+        dataset.ta.attrs["cell_methods"] = "time: invalid_method"
+    else:
+        dataset.variables["ta"].set_attrval(
+            "cell_methods", "time: invalid_method"
+        )
+
+    msg = r"NetCDF variable 'ta' contains unknown cell method 'invalid_method'"
+    with pytest.warns(IrisUnknownCellMethodWarning, match=msg):
+        Fix(None).dataset_to_iris(dataset, "path/to/file.nc")
+
+
+def test_dataset_to_iris_invalid_type_fail():
+    msg = (
+        r"Expected type ncdata.NcData or xr.Dataset for dataset, got type "
+        r"<class 'int'>"
+    )
+    with pytest.raises(TypeError, match=msg):
+        Fix(None).dataset_to_iris(1, "path/to/file.nc")
diff --git a/tests/integration/preprocessor/_io/test_concatenate.py b/tests/integration/preprocessor/_io/test_concatenate.py
index 1fef5f9693..f600e1d095 100644
--- a/tests/integration/preprocessor/_io/test_concatenate.py
+++ b/tests/integration/preprocessor/_io/test_concatenate.py
@@ -348,6 +348,26 @@ def test_concatenate_by_experiment_first(self):
         assert_array_equal(result.coord("time").points, np.arange(7))
         assert_array_equal(result.data, np.array([0, 0, 0, 1, 1, 1, 1]))

+    def test_concatenate_remove_unwanted_attributes(self):
+        """Test concatenate removes unwanted attributes."""
+        attributes = ("history", "creation_date", "tracking_id", "comment")
+        for i, cube in enumerate(self.raw_cubes):
+            for attr in attributes:
+                cube.attributes[attr] = f"{attr}-{i}"
+        concatenated = _io.concatenate(self.raw_cubes)
+        assert not set(attributes) & set(concatenated.attributes)
+
+    def test_concatenate_remove_unwanted_attributes_from_coords(self):
+        """Test concatenate removes unwanted attributes from coords."""
+        attributes = ("history",)
+        for i, cube in enumerate(self.raw_cubes):
+            for coord in cube.coords():
+                for attr in attributes:
+                    coord.attributes[attr] = f"{attr}-{i}"
+        concatenated = _io.concatenate(self.raw_cubes)
+        for coord in concatenated.coords():
+            assert not set(attributes) & set(coord.attributes)
+
     def test_concatenate_differing_attributes(self):
         """Test concatenation of cubes with different attributes."""
         cubes = CubeList(self.raw_cubes)
diff --git a/tests/integration/preprocessor/_io/test_load.py b/tests/integration/preprocessor/_io/test_load.py
index e776b9caa2..1d6d8b6142 100644
--- a/tests/integration/preprocessor/_io/test_load.py
+++ b/tests/integration/preprocessor/_io/test_load.py
@@ -12,7 +12,7 @@
 from iris.cube import Cube, CubeList

 import esmvalcore
-from esmvalcore.preprocessor._io import load
+from esmvalcore.preprocessor._io import _get_attr_from_field_coord, load


 def _create_sample_cube():
@@ -54,6 +54,12 @@ def test_load(self):
             (cube.coord("latitude").points == np.array([1, 2])).all()
         )

+    def test_load_cube(self):
+        """Test loading an Iris Cube."""
+        cube = _create_sample_cube()
+        cubes = load(cube)
+        assert cubes == CubeList([cube])
+
     def test_load_grib(self):
         """Test loading a grib file."""
         grib_path = Path(
@@ -72,45 +78,11 @@ def test_load_grib(self):
         assert cube.shape == (200, 247)
         assert "source_file" in cube.attributes

-    def test_callback_remove_attributes(self):
-        """Test callback remove unwanted attributes."""
-        attributes = ("history", "creation_date", "tracking_id", "comment")
-        for _ in range(2):
-            cube = _create_sample_cube()
-            for attr in attributes:
-                cube.attributes[attr] = attr
-            self._save_cube(cube)
-        for temp_file in self.temp_files:
-            cubes = load(temp_file)
-            cube = cubes[0]
-            self.assertEqual(1, len(cubes))
-            self.assertTrue((cube.data == np.array([1, 2])).all())
-            self.assertTrue(
-                (cube.coord("latitude").points == np.array([1, 2])).all()
-            )
-            for attr in attributes:
-                self.assertTrue(attr not in cube.attributes)
-
-    def test_callback_remove_attributes_from_coords(self):
-        """Test callback remove unwanted attributes from coords."""
-        attributes = ("history",)
-        for _ in range(2):
-            cube = _create_sample_cube()
-            for coord in cube.coords():
-                for attr in attributes:
-                    coord.attributes[attr] = attr
-            self._save_cube(cube)
-        for temp_file in self.temp_files:
-            cubes = load(temp_file)
-            cube = cubes[0]
-            self.assertEqual(1, len(cubes))
-            self.assertTrue((cube.data == np.array([1, 2])).all())
-            self.assertTrue(
-                (cube.coord("latitude").points == np.array([1, 2])).all()
-            )
-            for coord in cube.coords():
-                for attr in attributes:
-                    self.assertTrue(attr not in coord.attributes)
+    def test_load_cubes(self):
+        """Test loading an Iris CubeList."""
+        cube = _create_sample_cube()
+        cubes = load(CubeList([cube]))
+        assert cubes == CubeList([cube])

     def test_callback_fix_lat_units(self):
         """Test callback for fixing units."""
@@ -126,6 +98,13 @@
         )
         self.assertEqual(cube.coord("latitude").units, "degrees_north")

+    def test_get_attr_from_field_coord_none(self):
+        """Test ``_get_attr_from_field_coord``."""
+        attr = _get_attr_from_field_coord(
+            unittest.mock.sentinel.ncfield, None, "attr"
+        )
+        assert attr is None
+
     @unittest.mock.patch("iris.load_raw", autospec=True)
     def test_fail_empty_cubes(self, mock_load_raw):
         """Test that ValueError is raised when cubes are empty."""
diff --git a/tests/integration/preprocessor/_time/test_time.py b/tests/integration/preprocessor/_time/test_time.py
index d283e04515..00e9711e7a 100644
--- a/tests/integration/preprocessor/_time/test_time.py
+++ b/tests/integration/preprocessor/_time/test_time.py
@@ -8,14 +8,13 @@
     AncillaryVariable,
     AuxCoord,
     CellMeasure,
-    CellMethod,
     DimCoord,
 )
 from iris.cube import Cube
 from iris.exceptions import CoordinateMultiDimError, CoordinateNotFoundError

 from esmvalcore.preprocessor._time import climate_statistics, local_solar_time
-from tests import assert_array_equal
+from tests import assert_array_equal, create_realistic_4d_cube


 @pytest.fixture
@@ -79,61 +78,7 @@ def test_statistical_operators(

 @pytest.fixture
 def realistic_4d_cube():
     """Create realistic 4D cube."""
-    time = DimCoord(
-        [11.0, 12.0],
-        standard_name="time",
-        units=Unit("hours since 1851-01-01", calendar="360_day"),
-    )
-    plev = DimCoord([50000], standard_name="air_pressure", units="Pa")
-    lat = DimCoord([0.0, 1.0], standard_name="latitude", units="degrees")
-    lon = DimCoord(
-        [0.0, 20.0, 345.0], standard_name="longitude", units="degrees"
-    )
-
-    aux_2d_data = np.arange(2 * 3).reshape(2, 3)
-    aux_2d_bounds = np.stack(
-        (aux_2d_data - 1, aux_2d_data, aux_2d_data + 1), axis=-1
-    )
-    aux_2d = AuxCoord(aux_2d_data, var_name="aux_2d")
-    aux_2d_with_bnds = AuxCoord(
-        aux_2d_data, bounds=aux_2d_bounds, var_name="aux_2d_with_bnds"
-    )
-    aux_time = AuxCoord(["Jan", "Jan"], var_name="aux_time")
-    aux_lon = AuxCoord([0, 1, 2], var_name="aux_lon")
-
-    cell_area = CellMeasure(
-        np.arange(2 * 2 * 3).reshape(2, 2, 3) + 10,
-        standard_name="cell_area",
-        units="m2",
-        measure="area",
-    )
-    type_var = AncillaryVariable(
-        [["sea", "land", "lake"], ["lake", "sea", "land"]],
-        var_name="type",
-        units="no_unit",
-    )
-
-    cube = Cube(
-        np.ma.masked_inside(
-            np.arange(2 * 1 * 2 * 3).reshape(2, 1, 2, 3), 1, 3
-        ),
-        var_name="ta",
-        standard_name="air_temperature",
-        long_name="Air Temperature",
-        units="K",
-        cell_methods=[CellMethod("mean", "time")],
-        dim_coords_and_dims=[(time, 0), (plev, 1), (lat, 2), (lon, 3)],
-        aux_coords_and_dims=[
-            (aux_2d, (0, 3)),
-            (aux_2d_with_bnds, (0, 3)),
-            (aux_time, 0),
-            (aux_lon, 3),
-        ],
-        cell_measures_and_dims=[(cell_area, (0, 2, 3))],
-        ancillary_variables_and_dims=[(type_var, (0, 3))],
-        attributes={"test": 1},
-    )
-    return cube
+    return create_realistic_4d_cube()


 def test_local_solar_time_regular(realistic_4d_cube):
diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py
index 232803b627..2f73811444 100644
--- a/tests/unit/test_dataset.py
+++ b/tests/unit/test_dataset.py
@@ -1723,14 +1723,15 @@ def mock_preprocess(
         },
         "fix_file": {
             "add_unique_suffix": True,
-            "session": session,
             "dataset": "CanESM2",
             "ensemble": "r1i1p1",
             "exp": "historical",
             "frequency": "yr",
+            "ignore_warnings": None,
             "mip": "Oyr",
             "output_dir": fix_dir_prefix,
"project": "CMIP5", + "session": session, "short_name": "chl", "timerange": "2000/2005", },