Commit d5e20c6: Merge branch 'main' into concat
dcherian authored Dec 6, 2023
2 parents d04a2e7 + 3d6ec7e
Showing 22 changed files with 285 additions and 66 deletions.
1 change: 1 addition & 0 deletions doc/api.rst
@@ -192,6 +192,7 @@ Computation
Dataset.map_blocks
Dataset.polyfit
Dataset.curvefit
+Dataset.eval

Aggregation
-----------
2 changes: 1 addition & 1 deletion doc/gallery/plot_cartopy_facetgrid.py
@@ -30,7 +30,7 @@
transform=ccrs.PlateCarree(), # the data's projection
col="time",
col_wrap=1, # multiplot settings
-aspect=ds.dims["lon"] / ds.dims["lat"], # for a sensible figsize
+aspect=ds.sizes["lon"] / ds.sizes["lat"], # for a sensible figsize
subplot_kws={"projection": map_proj}, # the plot's projection
)

4 changes: 2 additions & 2 deletions doc/user-guide/interpolation.rst
@@ -292,8 +292,8 @@ Let's see how :py:meth:`~xarray.DataArray.interp` works on real data.
axes[0].set_title("Raw data")
# Interpolated data
-new_lon = np.linspace(ds.lon[0], ds.lon[-1], ds.dims["lon"] * 4)
-new_lat = np.linspace(ds.lat[0], ds.lat[-1], ds.dims["lat"] * 4)
+new_lon = np.linspace(ds.lon[0], ds.lon[-1], ds.sizes["lon"] * 4)
+new_lat = np.linspace(ds.lat[0], ds.lat[-1], ds.sizes["lat"] * 4)
dsi = ds.interp(lat=new_lat, lon=new_lon)
dsi.air.plot(ax=axes[1])
@savefig interpolation_sample3.png width=8in
9 changes: 4 additions & 5 deletions doc/user-guide/terminology.rst
@@ -47,9 +47,9 @@ complete examples, please consult the relevant documentation.*
all but one of these degrees of freedom is fixed. We can think of each
dimension axis as having a name, for example the "x dimension". In
xarray, a ``DataArray`` object's *dimensions* are its named dimension
-axes, and the name of the ``i``-th dimension is ``arr.dims[i]``. If an
-array is created without dimension names, the default dimension names are
-``dim_0``, ``dim_1``, and so forth.
+axes ``da.dims``, and the name of the ``i``-th dimension is ``da.dims[i]``.
+If an array is created without specifying dimension names, the default dimension
+names will be ``dim_0``, ``dim_1``, and so forth.
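
To make the two cases concrete, here is a short sketch (our own example, not part of the diff):

```python
import numpy as np
import xarray as xr

# Dimension names supplied explicitly at construction time.
named = xr.DataArray(np.zeros((2, 3)), dims=("x", "y"))
print(named.dims)  # ('x', 'y')

# Without names, xarray falls back to the defaults dim_0, dim_1, ...
unnamed = xr.DataArray(np.zeros((2, 3)))
print(unnamed.dims)  # ('dim_0', 'dim_1')
```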

Coordinate
An array that labels a dimension or set of dimensions of another
@@ -61,8 +61,7 @@ complete examples, please consult the relevant documentation.*
``arr.coords[x]``. A ``DataArray`` can have more coordinates than
dimensions because a single dimension can be labeled by multiple
coordinate arrays. However, only one coordinate array can be assigned
-as a particular dimension's dimension coordinate array. As a
-consequence, ``len(arr.dims) <= len(arr.coords)`` in general.
+as a particular dimension's dimension coordinate array.
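
A brief illustration of several coordinates sharing one dimension (our own sketch; the names are invented):

```python
import numpy as np
import xarray as xr

arr = xr.DataArray(
    np.zeros(3),
    dims="x",
    coords={
        "x": [10, 20, 30],  # dimension coordinate: same name as the dimension
        "label": ("x", ["a", "b", "c"]),  # a second coordinate along "x"
    },
)
print(len(arr.dims), len(arr.coords))  # 1 2
```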

Dimension coordinate
A one-dimensional coordinate array assigned to ``arr`` with both a name
16 changes: 13 additions & 3 deletions doc/whats-new.rst
@@ -41,6 +41,9 @@ New Features
- :py:meth:`~xarray.DataArray.rank` now operates on dask-backed arrays, assuming
the core dim has exactly one chunk. (:pull:`8475`).
By `Maximilian Roos <https://github.com/max-sixty>`_.
+- Add a :py:meth:`Dataset.eval` method, similar to the pandas method of the
+  same name. (:pull:`7163`). This is currently marked as experimental and
+  doesn't yet support the ``numexpr`` engine.
- :py:meth:`Dataset.drop_vars` & :py:meth:`DataArray.drop_vars` allow passing a
callable, similar to :py:meth:`Dataset.where` & :py:meth:`Dataset.sortby` & others.
(:pull:`8511`).
@@ -66,10 +69,17 @@ Deprecations
currently ``PendingDeprecationWarning``, which are silenced by default. We'll
convert these to ``DeprecationWarning`` in a future release.
By `Maximilian Roos <https://github.com/max-sixty>`_.
-- :py:meth:`Dataset.drop` &
-  :py:meth:`DataArray.drop` are now deprecated, since pending deprecation for
+- Raise a ``FutureWarning`` that the type of :py:meth:`Dataset.dims` will be changed
+  from a mapping of dimension names to lengths to a set of dimension names.
+  This is to increase consistency with :py:meth:`DataArray.dims`.
+  To access a mapping of dimension names to lengths, please use :py:meth:`Dataset.sizes`.
+  The same change also applies to `DatasetGroupBy.dims`.
+  (:issue:`8496`, :pull:`8500`)
+  By `Tom Nicholas <https://github.com/TomNicholas>`_.
+- :py:meth:`Dataset.drop` & :py:meth:`DataArray.drop` are now deprecated, since pending deprecation for
  several years. :py:meth:`DataArray.drop_sel` & :py:meth:`DataArray.drop_var`
-  replace them for labels & variables respectively.
+  replace them for labels & variables respectively. (:pull:`8497`)
By `Maximilian Roos <https://github.com/max-sixty>`_.
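
To illustrate the ``Dataset.dims`` entry above, a minimal migration sketch (our own example; the dataset is invented):

```python
import numpy as np
import xarray as xr

ds = xr.Dataset({"a": (("x", "y"), np.zeros((2, 3)))})

# Old idiom: treating ds.dims as a name-to-length mapping. This still works
# but now emits a FutureWarning; eventually ds.dims will be just {"x", "y"}.
# n = ds.dims["x"]

# Forward-compatible idiom: ds.sizes is, and remains, a mapping.
n = ds.sizes["x"]
print(n)  # 2
```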

Bug fixes
~~~~~~~~~
2 changes: 1 addition & 1 deletion xarray/core/common.py
@@ -1167,7 +1167,7 @@ def _dataset_indexer(dim: Hashable) -> DataArray:
cond_wdim = cond.drop_vars(
var for var in cond if dim not in cond[var].dims
)
-keepany = cond_wdim.any(dim=(d for d in cond.dims.keys() if d != dim))
+keepany = cond_wdim.any(dim=(d for d in cond.dims if d != dim))
return keepany.to_dataarray().any("variable")

_get_indexer = (
4 changes: 2 additions & 2 deletions xarray/core/concat.py
@@ -315,7 +315,7 @@ def _calc_concat_over(datasets, dim, dim_names, data_vars: T_DataVars, coords, c
if dim in ds:
ds = ds.set_coords(dim)
concat_over.update(k for k, v in ds.variables.items() if dim in v.dims)
-concat_dim_lengths.append(ds.dims.get(dim, 1))
+concat_dim_lengths.append(ds.sizes.get(dim, 1))

def process_subset_opt(opt, subset):
if isinstance(opt, str):
@@ -431,7 +431,7 @@ def _parse_datasets(
variables_order: dict[Hashable, Variable] = {} # variables in order of appearance

for ds in datasets:
-dims_sizes.update(ds.dims)
+dims_sizes.update(ds.sizes)
all_coord_names.update(ds.coords)
data_vars.update(ds.data_vars)
variables_order.update(ds.variables)
103 changes: 84 additions & 19 deletions xarray/core/dataset.py
@@ -98,13 +98,15 @@
Self,
T_ChunkDim,
T_Chunks,
+T_DataArray,
T_DataArrayOrSet,
T_Dataset,
ZarrWriteModes,
)
from xarray.core.utils import (
Default,
Frozen,
+FrozenMappingWarningOnValuesAccess,
HybridMappingProxy,
OrderedSet,
_default,
@@ -778,14 +780,15 @@ def dims(self) -> Frozen[Hashable, int]:
Note that type of this object differs from `DataArray.dims`.
See `Dataset.sizes` and `DataArray.sizes` for consistently named
-properties.
+properties. This property will be changed to return a type more consistent with
+`DataArray.dims` in the future, i.e. a set of dimension names.
See Also
--------
Dataset.sizes
DataArray.dims
"""
-return Frozen(self._dims)
+return FrozenMappingWarningOnValuesAccess(self._dims)

@property
def sizes(self) -> Frozen[Hashable, int]:
@@ -800,7 +803,7 @@ def sizes(self) -> Frozen[Hashable, int]:
--------
DataArray.sizes
"""
-return self.dims
+return Frozen(self._dims)
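
A minimal sketch of what a wrapper like ``FrozenMappingWarningOnValuesAccess`` plausibly does (our own reconstruction, not xarray's actual implementation): behave as a read-only mapping, but emit a ``FutureWarning`` whenever callers rely on behaviour that a future set-of-names return type cannot provide.

```python
import warnings
from collections.abc import Mapping


class WarnOnValuesAccess(Mapping):
    """Read-only mapping that warns on value access (illustrative sketch)."""

    def __init__(self, data: dict):
        self._data = data

    def _warn(self) -> None:
        warnings.warn(
            "Dataset.dims will return a set of dimension names in the future; "
            "to get a mapping of names to lengths, use Dataset.sizes.",
            FutureWarning,
            stacklevel=3,
        )

    def __getitem__(self, key):
        self._warn()  # looking up a length will not be possible on a set
        return self._data[key]

    def __iter__(self):
        return iter(self._data)  # iterating over names stays valid

    def __len__(self):
        return len(self._data)
```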

@property
def dtypes(self) -> Frozen[Hashable, np.dtype]:
@@ -1411,7 +1414,7 @@ def _copy_listed(self, names: Iterable[Hashable]) -> Self:
variables[name] = self._variables[name]
except KeyError:
ref_name, var_name, var = _get_virtual_variable(
-self._variables, name, self.dims
+self._variables, name, self.sizes
)
variables[var_name] = var
if ref_name in self._coord_names or ref_name in self.dims:
Expand All @@ -1426,7 +1429,7 @@ def _copy_listed(self, names: Iterable[Hashable]) -> Self:
for v in variables.values():
needed_dims.update(v.dims)

-dims = {k: self.dims[k] for k in needed_dims}
+dims = {k: self.sizes[k] for k in needed_dims}

# preserves ordering of coordinates
for k in self._variables:
@@ -1448,7 +1451,7 @@ def _construct_dataarray(self, name: Hashable) -> DataArray:
try:
variable = self._variables[name]
except KeyError:
-_, name, variable = _get_virtual_variable(self._variables, name, self.dims)
+_, name, variable = _get_virtual_variable(self._variables, name, self.sizes)

needed_dims = set(variable.dims)

@@ -1475,7 +1478,7 @@ def _item_sources(self) -> Iterable[Mapping[Hashable, Any]]:
yield HybridMappingProxy(keys=self._coord_names, mapping=self.coords)

# virtual coordinates
-yield HybridMappingProxy(keys=self.dims, mapping=self)
+yield HybridMappingProxy(keys=self.sizes, mapping=self)

def __contains__(self, key: object) -> bool:
"""The 'in' operator will return true or false depending on whether
@@ -2569,7 +2572,7 @@ def info(self, buf: IO | None = None) -> None:
lines = []
lines.append("xarray.Dataset {")
lines.append("dimensions:")
-for name, size in self.dims.items():
+for name, size in self.sizes.items():
lines.append(f"\t{name} = {size} ;")
lines.append("\nvariables:")
for name, da in self.variables.items():
@@ -2697,10 +2700,10 @@ def chunk(
else:
chunks_mapping = either_dict_or_kwargs(chunks, chunks_kwargs, "chunk")

-bad_dims = chunks_mapping.keys() - self.dims.keys()
+bad_dims = chunks_mapping.keys() - self.sizes.keys()
if bad_dims:
raise ValueError(
f"chunks keys {tuple(bad_dims)} not found in data dimensions {tuple(self.dims)}"
f"chunks keys {tuple(bad_dims)} not found in data dimensions {tuple(self.sizes.keys())}"
)

chunkmanager = guess_chunkmanager(chunked_array_type)
@@ -3952,7 +3955,7 @@ def maybe_variable(obj, k):
try:
return obj._variables[k]
except KeyError:
-return as_variable((k, range(obj.dims[k])))
+return as_variable((k, range(obj.sizes[k])))

def _validate_interp_indexer(x, new_x):
# In the case of datetimes, the restrictions placed on indexers
@@ -4176,7 +4179,7 @@ def _rename_vars(
return variables, coord_names

def _rename_dims(self, name_dict: Mapping[Any, Hashable]) -> dict[Hashable, int]:
-return {name_dict.get(k, k): v for k, v in self.dims.items()}
+return {name_dict.get(k, k): v for k, v in self.sizes.items()}

def _rename_indexes(
self, name_dict: Mapping[Any, Hashable], dims_dict: Mapping[Any, Hashable]
@@ -5168,7 +5171,7 @@ def _get_stack_index(
if dim in self._variables:
var = self._variables[dim]
else:
-_, _, var = _get_virtual_variable(self._variables, dim, self.dims)
+_, _, var = _get_virtual_variable(self._variables, dim, self.sizes)
# dummy index (only `stack_coords` will be used to construct the multi-index)
stack_index = PandasIndex([0], dim)
stack_coords = {dim: var}
@@ -5195,7 +5198,7 @@ def _stack_once(
if any(d in var.dims for d in dims):
add_dims = [d for d in dims if d not in var.dims]
vdims = list(var.dims) + add_dims
-shape = [self.dims[d] for d in vdims]
+shape = [self.sizes[d] for d in vdims]
exp_var = var.set_dims(vdims, shape)
stacked_var = exp_var.stack(**{new_dim: dims})
new_variables[name] = stacked_var
@@ -6351,15 +6354,15 @@ def dropna(
if subset is None:
subset = iter(self.data_vars)

-count = np.zeros(self.dims[dim], dtype=np.int64)
+count = np.zeros(self.sizes[dim], dtype=np.int64)
size = np.int_(0) # for type checking

for k in subset:
array = self._variables[k]
if dim in array.dims:
dims = [d for d in array.dims if d != dim]
count += np.asarray(array.count(dims))
-size += math.prod([self.dims[d] for d in dims])
+size += math.prod([self.sizes[d] for d in dims])

if thresh is not None:
mask = count >= thresh
@@ -7136,7 +7139,7 @@ def _normalize_dim_order(
f"Dataset: {list(self.dims)}"
)

-ordered_dims = {k: self.dims[k] for k in dim_order}
+ordered_dims = {k: self.sizes[k] for k in dim_order}

return ordered_dims

@@ -7396,7 +7399,7 @@ def to_dask_dataframe(
var = self.variables[name]
except KeyError:
# dimension without a matching coordinate
-size = self.dims[name]
+size = self.sizes[name]
data = da.arange(size, chunks=size, dtype=np.int64)
var = Variable((name,), data)

@@ -7469,7 +7472,7 @@ def to_dict(
d: dict = {
"coords": {},
"attrs": decode_numpy_dict_values(self.attrs),
"dims": dict(self.dims),
"dims": dict(self.sizes),
"data_vars": {},
}
for k in self.coords:
@@ -9552,6 +9555,68 @@ def argmax(self, dim: Hashable | None = None, **kwargs) -> Self:
"Dataset.argmin() with a sequence or ... for dim"
)

+    def eval(
+        self,
+        statement: str,
+        *,
+        parser: QueryParserOptions = "pandas",
+    ) -> Self | T_DataArray:
+        """
+        Calculate an expression supplied as a string in the context of the dataset.
+
+        This is currently experimental; the API may change particularly around
+        assignments, which currently return a ``Dataset`` with the additional variable.
+        Currently only the ``python`` engine is supported, which has the same
+        performance as executing in python.
+
+        Parameters
+        ----------
+        statement : str
+            String containing the Python-like expression to evaluate.
+
+        Returns
+        -------
+        result : Dataset or DataArray, depending on whether ``statement`` contains an
+            assignment.
+
+        Examples
+        --------
+        >>> ds = xr.Dataset(
+        ...     {"a": ("x", np.arange(0, 5, 1)), "b": ("x", np.linspace(0, 1, 5))}
+        ... )
+        >>> ds
+        <xarray.Dataset>
+        Dimensions:  (x: 5)
+        Dimensions without coordinates: x
+        Data variables:
+            a        (x) int64 0 1 2 3 4
+            b        (x) float64 0.0 0.25 0.5 0.75 1.0
+        >>> ds.eval("a + b")
+        <xarray.DataArray (x: 5)>
+        array([0.  , 1.25, 2.5 , 3.75, 5.  ])
+        Dimensions without coordinates: x
+        >>> ds.eval("c = a + b")
+        <xarray.Dataset>
+        Dimensions:  (x: 5)
+        Dimensions without coordinates: x
+        Data variables:
+            a        (x) int64 0 1 2 3 4
+            b        (x) float64 0.0 0.25 0.5 0.75 1.0
+            c        (x) float64 0.0 1.25 2.5 3.75 5.0
+        """
+
+        return pd.eval(
+            statement,
+            resolvers=[self],
+            target=self,
+            parser=parser,
+            # Because numexpr returns a numpy array, using that engine results in
+            # different behavior. We'd be very open to a contribution handling this.
+            engine="python",
+        )
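
The implementation leans on ``pd.eval``'s ``resolvers`` mechanism: any object supporting ``__getitem__`` lookup by name can supply values for identifiers in the expression, which is why the ``Dataset`` itself is passed. A standalone sketch with a plain dict standing in for the dataset (our own example):

```python
import numpy as np
import pandas as pd

data = {"a": np.arange(0, 5, 1), "b": np.linspace(0, 1, 5)}

# Identifiers in the expression are resolved by indexing into `data`.
result = pd.eval("a + b", resolvers=[data], engine="python", parser="pandas")
print(result)  # [0.   1.25 2.5  3.75 5.  ]
```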

    def query(
        self,
        queries: Mapping[Any, Any] | None = None,
4 changes: 2 additions & 2 deletions xarray/core/formatting.py
@@ -357,7 +357,7 @@ def summarize_attr(key, value, col_width=None):


def _calculate_col_width(col_items):
-max_name_length = max(len(str(s)) for s in col_items) if col_items else 0
+max_name_length = max((len(str(s)) for s in col_items), default=0)
col_width = max(max_name_length, 7) + 6
return col_width
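
The motivation for this one-line change: ``max()`` over an empty iterable raises ``ValueError``, while the ``default=`` keyword returns the fallback instead, so the separate emptiness check can be dropped. A quick demonstration:

```python
col_items = []
# max(len(str(s)) for s in col_items)  # ValueError: max() arg is an empty sequence
print(max((len(str(s)) for s in col_items), default=0))  # 0
```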

@@ -739,7 +739,7 @@ def dataset_repr(ds):


def diff_dim_summary(a, b):
-if a.dims != b.dims:
+if a.sizes != b.sizes:
return f"Differing dimensions:\n ({dim_summary(a)}) != ({dim_summary(b)})"
else:
return ""
[Diff truncated: the remaining changed files are not shown.]