Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

quantify fails if dataset is loaded with xr.open_mfdataset #252

Closed
aaschwanden opened this issue Apr 24, 2024 · 1 comment
Closed

Comments

@aaschwanden
Copy link

Calling .pint.quantify() fails on a dataset opened using xr.open_mfdataset, but everything works as expected when the two datasets are loaded individually with xr.open_dataset and then concatenated.
The bug is insensitive to options such as parallel and chunking. Not sure how I could debug this any further.

Libraries

pint_xarray.__version__ =  0.3
cf_xarray.__version__ =  0.9.0

Expected behavior

quantify() should not fail.

Minimal example

import cf_xarray.units
import pint_xarray

import cftime
import xarray as xr
import re

def preprocess_nc(
    ds: xr.Dataset,
    regexp: str = "id_(.+?)_",
    dim: str = "exp_id",
) -> xr.Dataset:
    """
    Add experiment 'exp_id' to the dataset.

    This function adds an experiment id ('exp_id') to the dataset, extracted
    from the source path recorded in ``ds.encoding["source"]`` using the
    provided regular expression, and expands the dataset along a new
    size-1 dimension labelled with that id.

    Parameters
    ----------
    ds : xr.Dataset
        The dataset to be preprocessed.
    regexp : str, optional
        The regular expression used to extract the experiment id from the
        source encoding, by default "id_(.+?)_".
    dim : str, optional
        The name of the dimension to be added to the dataset, by default "exp_id".

    Returns
    -------
    xr.Dataset
        The preprocessed dataset.

    Raises
    ------
    AssertionError
        If `regexp` does not match the dataset's source path.
    """
    m_id_re = re.search(regexp, ds.encoding["source"])
    assert m_id_re is not None

    # Prefer an integer id; fall back to the raw string when the captured
    # group is not a valid integer literal. Bare `except:` replaced with the
    # specific exception int() raises, and the redundant str() dropped —
    # re.Match.group(1) is already a str.
    m_id: str | int
    try:
        m_id = int(m_id_re.group(1))
    except ValueError:
        m_id = m_id_re.group(1)

    # BUG FIX: xarray operations return new objects — the original called
    # ds.expand_dims(dim) and discarded the result, making it a no-op.
    # Assign the expanded dataset back, then label the new size-1 dimension
    # with the extracted id (a length-1 sequence matches the dim size).
    ds = ds.expand_dims(dim)
    ds[dim] = [m_id]
    return ds


# Minimal CF-compliant dataset spelled out as a dict so the example is fully
# self-contained (no external .nc fixtures needed). A 5x5 spatial grid with
# 2-D lat/lon coordinates, projected x/y coordinates in metres, a single
# cftime (noleap-calendar) time step, and one data variable 'thk' with a
# pint-parsable unit attribute ('m').
d = {'coords': {'lat': {'dims': ('y', 'x'),
   'attrs': {'units': 'degree_north',
    'valid_range': [-90.0, 90.0],
    'long_name': 'latitude',
    'standard_name': 'latitude'},
   'data': [[0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0]]},
  'lon': {'dims': ('y', 'x'),
   'attrs': {'units': 'degree_east',
    'valid_range': [-180.0, 180.0],
    'long_name': 'longitude',
    'standard_name': 'longitude'},
   'data': [[0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0],
    [0.0, 0.0, 0.0, 0.0, 0.0]]},
  'time': {'dims': ('time',),
   'attrs': {'axis': 'T', 'long_name': 'time'},
   'data': [cftime.DatetimeNoLeap(1001, 1, 1, 0, 0, 0, 0, has_year_zero=True)]},
  'x': {'dims': ('x',),
   'attrs': {'units': 'm',
    'axis': 'X',
    'long_name': 'X-coordinate in Cartesian system',
    'standard_name': 'projection_x_coordinate',
    'spacing_meters': 375000.0},
   'data': [-750000.0, -375000.0, 0.0, 375000.0, 750000.0]},
  'y': {'dims': ('y',),
   'attrs': {'units': 'm',
    'axis': 'Y',
    'long_name': 'Y-coordinate in Cartesian system',
    'standard_name': 'projection_y_coordinate',
    'spacing_meters': 375000.0},
   'data': [-750000.0, -375000.0, 0.0, 375000.0, 750000.0]}},
 'attrs': {'Conventions': 'CF-1.6'},
 'dims': {'y': 5, 'x': 5, 'time': 1},
 'data_vars': {'thk': {'dims': ('time', 'y', 'x'),
   'attrs': {'units': 'm',
    'valid_min': 0.0,
    'long_name': 'land ice thickness',
    'standard_name': 'land_ice_thickness'},
   'data': [[[0.0, 0.0, 0.0, 0.0, 0.0],
     [0.0, 0.0, 499.6684394216143, 0.0, 0.0],
     [0.0, 499.6684394216143, 499.66844085642595, 499.6684394216143, 0.0],
     [0.0, 0.0, 499.6684394216143, 0.0, 0.0],
     [0.0, 0.0, 0.0, 0.0, 0.0]]]}}}

ds = xr.Dataset.from_dict(d)

# Filenames chosen so the default regexp "id_(.+?)_" in preprocess_nc
# extracts 'foo' and 'bar' as experiment ids.
file1 = "test_id_foo_0_1000.nc"
file2 = "test_id_bar_0_1000.nc"

# Write the same dataset twice; only the filename (and hence exp_id) differs.
ds.to_netcdf(file1)
ds.to_netcdf(file2)

# Working path: open each file individually, preprocess, then concatenate.
# quantify() succeeds here.
ds1 = preprocess_nc(xr.open_dataset(file1)).chunk("auto")
ds2 = preprocess_nc(xr.open_dataset(file2)).chunk("auto")
single_ds = xr.concat([ds1, ds2], dim="exp_id")
single_ds.pint.quantify()

# Failing path: the same files opened via open_mfdataset. NOTE(review):
# decode_times=False is the key difference from the path above — it leaves
# the raw CF time units string (e.g. "days since 1001-01-01") as an attribute
# that pint later tries (and fails) to parse as a unit expression.
mf_ds = xr.open_mfdataset("test_id_*.nc",
                          preprocess=preprocess_nc,
                          concat_dim="exp_id",
                          combine="nested",
                          chunks="auto",
                          parallel=True,
                          decode_times=False)

# Raises TypeError from pint's expression parser (see traceback below).
mf_ds.pint.quantify()

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[6], line 117
    107 single_ds.pint.quantify()
    109 mf_ds = xr.open_mfdataset("test_id_*.nc",
    110                           preprocess=preprocess_nc,
    111                           concat_dim="exp_id",
   (...)
    114                           parallel=True,
    115                           decode_times=False)
--> 117 mf_ds.pint.quantify()

File ~/miniforge3/envs/glacier-flow-tools/lib/python3.12/site-packages/pint_xarray/accessors.py:1085, in PintDatasetAccessor.quantify(self, units, unit_registry, **unit_kwargs)
   1083 if unit is not _default or attr not in (None, _default):
   1084     try:
-> 1085         new_units[name] = _decide_units(unit, registry, attr)
   1086     except (ValueError, pint.UndefinedUnitError) as e:
   1087         if unit is not _default:

File ~/miniforge3/envs/glacier-flow-tools/lib/python3.12/site-packages/pint_xarray/accessors.py:138, in _decide_units(units, registry, unit_attribute)
    136         units = unit_attribute
    137     else:
--> 138         units = registry.parse_units(unit_attribute)
    139 else:
    140     units = registry.parse_units(units)

File ~/miniforge3/envs/glacier-flow-tools/lib/python3.12/site-packages/pint/facets/plain/registry.py:1189, in GenericPlainRegistry.parse_units(self, input_string, as_delta, case_sensitive)
   1161 def parse_units(
   1162     self,
   1163     input_string: str,
   1164     as_delta: Optional[bool] = None,
   1165     case_sensitive: Optional[bool] = None,
   1166 ) -> UnitT:
   1167     """Parse a units expression and returns a UnitContainer with
   1168     the canonical names.
   1169 
   (...)
   1185 
   1186     """
   1188     return self.Unit(
-> 1189         self.parse_units_as_container(input_string, as_delta, case_sensitive)
   1190     )

File ~/miniforge3/envs/glacier-flow-tools/lib/python3.12/site-packages/pint/facets/nonmultiplicative/registry.py:70, in GenericNonMultiplicativeRegistry.parse_units_as_container(self, input_string, as_delta, case_sensitive)
     67 if as_delta is None:
     68     as_delta = self.default_as_delta
---> 70 return super().parse_units_as_container(input_string, as_delta, case_sensitive)

File ~/miniforge3/envs/glacier-flow-tools/lib/python3.12/site-packages/pint/facets/plain/registry.py:1204, in GenericPlainRegistry.parse_units_as_container(self, input_string, as_delta, case_sensitive)
   1198 as_delta = (
   1199     as_delta if as_delta is not None else True
   1200 )  # TODO This only exists in nonmultiplicative
   1201 case_sensitive = (
   1202     case_sensitive if case_sensitive is not None else self.case_sensitive
   1203 )
-> 1204 return self._parse_units_as_container(input_string, as_delta, case_sensitive)

File ~/miniforge3/envs/glacier-flow-tools/lib/python3.12/site-packages/pint/facets/plain/registry.py:1232, in GenericPlainRegistry._parse_units_as_container(self, input_string, as_delta, case_sensitive)
   1229 # Sanitize input_string with whitespaces.
   1230 input_string = input_string.strip()
-> 1232 units = ParserHelper.from_string(input_string, self.non_int_type)
   1233 if units.scale != 1:
   1234     raise ValueError("Unit expression cannot have a scaling factor.")

File ~/miniforge3/envs/glacier-flow-tools/lib/python3.12/site-packages/pint/util.py:767, in ParserHelper.from_string(cls, input_string, non_int_type)
    764     reps = False
    766 gen = pint_eval.tokenizer(input_string)
--> 767 ret = build_eval_tree(gen).evaluate(
    768     partial(cls.eval_token, non_int_type=non_int_type)
    769 )
    771 if isinstance(ret, Number):
    772     return cls(ret, non_int_type=non_int_type)

File ~/miniforge3/envs/glacier-flow-tools/lib/python3.12/site-packages/pint/pint_eval.py:384, in EvalTreeNode.evaluate(self, define_op, bin_op, un_op)
    380     if op_text not in bin_op:
    381         raise DefinitionSyntaxError(f"missing binary operator '{op_text}'")
    383     return bin_op[op_text](
--> 384         self.left.evaluate(define_op, bin_op, un_op),
    385         self.right.evaluate(define_op, bin_op, un_op),
    386     )
    387 elif self.operator:
    388     assert isinstance(self.left, EvalTreeNode), "self.left not EvalTreeNode (4)"

File ~/miniforge3/envs/glacier-flow-tools/lib/python3.12/site-packages/pint/pint_eval.py:383, in EvalTreeNode.evaluate(self, define_op, bin_op, un_op)
    380     if op_text not in bin_op:
    381         raise DefinitionSyntaxError(f"missing binary operator '{op_text}'")
--> 383     return bin_op[op_text](
    384         self.left.evaluate(define_op, bin_op, un_op),
    385         self.right.evaluate(define_op, bin_op, un_op),
    386     )
    387 elif self.operator:
    388     assert isinstance(self.left, EvalTreeNode), "self.left not EvalTreeNode (4)"

TypeError: unsupported operand type(s) for -: 'ParserHelper' and 'int'

@keewis
Copy link
Collaborator

keewis commented Apr 26, 2024

Thanks for the report. Your issue is caused by decode_times=False: time coordinates have units containing a date, typically something like seconds since 2019-03-02 23:51:11. pint thinks this is an arithmetic expression (2019 - 03 - 02) and tries to evaluate it, only to fail because it can only parse expressions involving units.

In #241 I simply told pint-xarray to ignore variables with time units. In other words, this is already fixed on main, and I'll take this issue as a reminder to issue a new release.

@keewis keewis closed this as completed Apr 26, 2024
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants