Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

quantify fails if dataset is loaded with xr.open_mfdataset #252

Closed
aaschwanden opened this issue Apr 24, 2024 · 1 comment
Closed

Comments

@aaschwanden
Copy link

Calling .pint.quantify() fails on a dataset opened using xr.open_mfdataset, but everything works as expected when the two datasets are loaded individually with xr.open_dataset and then concatenated.
The bug is insensitive to options such as parallel and chunking. Not sure how I could debug this any further.

Libraries

pint_xarray.__version__ =  0.3
cf_xarray.__version__ =  0.9.0

Expected behavior

quantify() should not fail.

Minimal example

import cf_xarray.units
import pint_xarray

import cftime
import xarray as xr
import re

def preprocess_nc(
    ds: xr.Dataset,
    regexp: str = "id_(.+?)_",
    dim: str = "exp_id",
) -> xr.Dataset:
    """
    Add experiment 'exp_id' to the dataset.

    This function adds an experiment id ('exp_id') to the dataset, extracted
    from the source path recorded in ``ds.encoding["source"]`` using the
    provided regular expression, and expands the dataset along a new
    size-1 dimension labelled with that id.

    Parameters
    ----------
    ds : xr.Dataset
        The dataset to be preprocessed.
    regexp : str, optional
        The regular expression used to extract the experiment id from the
        source encoding, by default "id_(.+?)_".
    dim : str, optional
        The name of the dimension to be added to the dataset, by default "exp_id".

    Returns
    -------
    xr.Dataset
        The preprocessed dataset.

    Raises
    ------
    AssertionError
        If `regexp` does not match the dataset's source path.
    """
    m_id_re = re.search(regexp, ds.encoding["source"])
    assert m_id_re is not None

    # Prefer an integer id; fall back to the raw string when the captured
    # group is not a valid integer literal. Bare `except:` replaced with the
    # specific exception int() raises, and the redundant str() dropped —
    # re.Match.group(1) is already a str.
    m_id: str | int
    try:
        m_id = int(m_id_re.group(1))
    except ValueError:
        m_id = m_id_re.group(1)

    # BUG FIX: xarray operations return new objects — the original called
    # ds.expand_dims(dim) and discarded the result, making it a no-op.
    # Assign the expanded dataset back, then label the new size-1 dimension
    # with the extracted id (a length-1 sequence matches the dim size).
    ds = ds.expand_dims(dim)
    ds[dim] = [m_id]
    return ds


# Minimal CF-compliant dataset spelled out as a dict so the example is fully
# self-contained (no external .nc fixtures needed). A 5x5 spatial grid with
# 2-D lat/lon coordinates, projected x/y coordinates in metres, a single
# cftime (noleap-calendar) time step, and one data variable 'thk' with a
# pint-parsable unit attribute ('m').
d = {'coords': {'lat': {'dims': ('y', 'x'),
   'attrs': {'units': 'degree_north',
    'valid_range': [-90.0, 90.0],
    'long_name': 'latitude',
    'standard_name': 'latitude'},
   'data': [[0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0]]},
  'lon': {'dims': ('y', 'x'),
   'attrs': {'units': 'degree_east',
    'valid_range': [-180.0, 180.0],
    'long_name': 'longitude',
    'standard_name': 'longitude'},
   'data': [[0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0]]},
  'time': {'dims': ('time',),
   'attrs': {'axis': 'T', 'long_name': 'time'},
   'data': [cftime.DatetimeNoLeap(1001, 1, 1, 0, 0, 0, 0, has_year_zero=True)]},
  'x': {'dims': ('x',),
   'attrs': {'units': 'm',
    'axis': 'X',
    'long_name': 'X-coordinate in Cartesian system',
    'standard_name': 'projection_x_coordinate',
    'spacing_meters': 375000.0},
   'data': [-750000.0, -375000.0, 0.0, 375000.0, 750000.0]},
  'y': {'dims': ('y',),
   'attrs': {'units': 'm',
    'axis': 'Y',
    'long_name': 'Y-coordinate in Cartesian system',
    'standard_name': 'projection_y_coordinate',
    'spacing_meters': 375000.0},
   'data': [-750000.0, -375000.0, 0.0, 375000.0, 750000.0]}},
 'attrs': {'Conventions': 'CF-1.6'},
 'dims': {'y': 5, 'x': 5, 'time': 1},
 'data_vars': {'thk': {'dims': ('time', 'y', 'x'),
   'attrs': {'units': 'm',
    'valid_min': 0.0,
    'long_name': 'land ice thickness',
    'standard_name': 'land_ice_thickness'},
   'data': [[[0.0, 0.0, 0.0, 0.0, 0.0],
     [0.0, 0.0, 499.6684394216143, 0.0, 0.0],
     [0.0, 499.6684394216143, 499.66844085642595, 499.6684394216143, 0.0],
     [0.0, 0.0, 499.6684394216143, 0.0, 0.0],
     [0.0, 0.0, 0.0, 0.0, 0.0]]]}}}

ds = xr.Dataset.from_dict(d)

# Filenames chosen so the default regexp "id_(.+?)_" in preprocess_nc
# extracts 'foo' and 'bar' as experiment ids.
file1 = "test_id_foo_0_1000.nc"
file2 = "test_id_bar_0_1000.nc"

# Write the same dataset twice; only the filename (and hence exp_id) differs.
ds.to_netcdf(file1)
ds.to_netcdf(file2)

# Working path: open each file individually, preprocess, then concatenate.
# quantify() succeeds here.
ds1 = preprocess_nc(xr.open_dataset(file1)).chunk("auto")
ds2 = preprocess_nc(xr.open_dataset(file2)).chunk("auto")
single_ds = xr.concat([ds1, ds2], dim="exp_id")
single_ds.pint.quantify()

# Failing path: the same files opened via open_mfdataset. NOTE(review):
# decode_times=False is the key difference from the path above — it leaves
# the raw CF time units string (e.g. "days since 1001-01-01") as an attribute
# that pint later tries (and fails) to parse as a unit expression.
mf_ds = xr.open_mfdataset("test_id_*.nc",
                          preprocess=preprocess_nc,
                          concat_dim="exp_id",
                          combine="nested",
                          chunks="auto",
                          parallel=True,
                          decode_times=False)

# Raises TypeError from pint's expression parser (see traceback below).
mf_ds.pint.quantify()

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[6], line 117
    107 single_ds.pint.quantify()
    109 mf_ds = xr.open_mfdataset("test_id_*.nc",
    110                           preprocess=preprocess_nc,
    111                           concat_dim="exp_id",
   (...)
    114                           parallel=True,
    115                           decode_times=False)
--> 117 mf_ds.pint.quantify()

File ~/miniforge3/envs/glacier-flow-tools/lib/python3.12/site-packages/pint_xarray/accessors.py:1085, in PintDatasetAccessor.quantify(self, units, unit_registry, **unit_kwargs)
   1083 if unit is not _default or attr not in (None, _default):
   1084     try:
-> 1085         new_units[name] = _decide_units(unit, registry, attr)
   1086     except (ValueError, pint.UndefinedUnitError) as e:
   1087         if unit is not _default:

File ~/miniforge3/envs/glacier-flow-tools/lib/python3.12/site-packages/pint_xarray/accessors.py:138, in _decide_units(units, registry, unit_attribute)
    136         units = unit_attribute
    137     else:
--> 138         units = registry.parse_units(unit_attribute)
    139 else:
    140     units = registry.parse_units(units)

File ~/miniforge3/envs/glacier-flow-tools/lib/python3.12/site-packages/pint/facets/plain/registry.py:1189, in GenericPlainRegistry.parse_units(self, input_string, as_delta, case_sensitive)
   1161 def parse_units(
   1162     self,
   1163     input_string: str,
   1164     as_delta: Optional[bool] = None,
   1165     case_sensitive: Optional[bool] = None,
   1166 ) -> UnitT:
   1167     """Parse a units expression and returns a UnitContainer with
   1168     the canonical names.
   1169 
   (...)
   1185 
   1186     """
   1188     return self.Unit(
-> 1189         self.parse_units_as_container(input_string, as_delta, case_sensitive)
   1190     )

File ~/miniforge3/envs/glacier-flow-tools/lib/python3.12/site-packages/pint/facets/nonmultiplicative/registry.py:70, in GenericNonMultiplicativeRegistry.parse_units_as_container(self, input_string, as_delta, case_sensitive)
     67 if as_delta is None:
     68     as_delta = self.default_as_delta
---> 70 return super().parse_units_as_container(input_string, as_delta, case_sensitive)

File ~/miniforge3/envs/glacier-flow-tools/lib/python3.12/site-packages/pint/facets/plain/registry.py:1204, in GenericPlainRegistry.parse_units_as_container(self, input_string, as_delta, case_sensitive)
   1198 as_delta = (
   1199     as_delta if as_delta is not None else True
   1200 )  # TODO This only exists in nonmultiplicative
   1201 case_sensitive = (
   1202     case_sensitive if case_sensitive is not None else self.case_sensitive
   1203 )
-> 1204 return self._parse_units_as_container(input_string, as_delta, case_sensitive)

File ~/miniforge3/envs/glacier-flow-tools/lib/python3.12/site-packages/pint/facets/plain/registry.py:1232, in GenericPlainRegistry._parse_units_as_container(self, input_string, as_delta, case_sensitive)
   1229 # Sanitize input_string with whitespaces.
   1230 input_string = input_string.strip()
-> 1232 units = ParserHelper.from_string(input_string, self.non_int_type)
   1233 if units.scale != 1:
   1234     raise ValueError("Unit expression cannot have a scaling factor.")

File ~/miniforge3/envs/glacier-flow-tools/lib/python3.12/site-packages/pint/util.py:767, in ParserHelper.from_string(cls, input_string, non_int_type)
    764     reps = False
    766 gen = pint_eval.tokenizer(input_string)
--> 767 ret = build_eval_tree(gen).evaluate(
    768     partial(cls.eval_token, non_int_type=non_int_type)
    769 )
    771 if isinstance(ret, Number):
    772     return cls(ret, non_int_type=non_int_type)

File ~/miniforge3/envs/glacier-flow-tools/lib/python3.12/site-packages/pint/pint_eval.py:384, in EvalTreeNode.evaluate(self, define_op, bin_op, un_op)
    380     if op_text not in bin_op:
    381         raise DefinitionSyntaxError(f"missing binary operator '{op_text}'")
    383     return bin_op[op_text](
--> 384         self.left.evaluate(define_op, bin_op, un_op),
    385         self.right.evaluate(define_op, bin_op, un_op),
    386     )
    387 elif self.operator:
    388     assert isinstance(self.left, EvalTreeNode), "self.left not EvalTreeNode (4)"

File ~/miniforge3/envs/glacier-flow-tools/lib/python3.12/site-packages/pint/pint_eval.py:383, in EvalTreeNode.evaluate(self, define_op, bin_op, un_op)
    380     if op_text not in bin_op:
    381         raise DefinitionSyntaxError(f"missing binary operator '{op_text}'")
--> 383     return bin_op[op_text](
    384         self.left.evaluate(define_op, bin_op, un_op),
    385         self.right.evaluate(define_op, bin_op, un_op),
    386     )
    387 elif self.operator:
    388     assert isinstance(self.left, EvalTreeNode), "self.left not EvalTreeNode (4)"

TypeError: unsupported operand type(s) for -: 'ParserHelper' and 'int'

@keewis
Copy link
Collaborator

keewis commented Apr 26, 2024

Thanks for the report. Your issue is caused by decode_times=False: time coordinates have units containing a date, typically something like seconds since 2019-03-02 23:51:11. pint thinks this is an arithmetic expression (2019 - 03 - 02) and tries to evaluate it, only to fail because it can only parse expressions involving units.

In #241 I simply told pint-xarray to ignore variables with time units. In other words, this is already fixed on main, and I'll take this issue as a reminder to issue a new release.

@keewis keewis closed this as completed Apr 26, 2024
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants