pydata · dcherian · Apr 28, 2023 · Apr 7, 2023 · Apr 7, 2023 · Apr 7, 2023
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -30,6 +30,10 @@ New Features
 - Added ability to save ``DataArray`` objects directly to Zarr using :py:meth:`~xarray.DataArray.to_zarr`.
   (:issue:`7692`, :pull:`7693`) .
   By `Joe Hamman <https://github.com/jhamman>`_.
+- Added ``numpy_data`` keyword argument to both :py:meth:`xarray.Dataset.to_dict` and
+  :py:meth:`xarray.DataArray.to_dict` to return data as numpy objects instead of native Python objects.
+  (:issue:`1599`, :pull:`7739`) .
+  By `James McCreight <https://github.com/jmccreight>`_.
 
 Breaking changes
 ~~~~~~~~~~~~~~~~

diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
@@ -4174,7 +4174,9 @@ def to_zarr(
             zarr_version=zarr_version,
         )
 
-    def to_dict(self, data: bool = True, encoding: bool = False) -> dict[str, Any]:
+    def to_dict(
+        self, data: bool = True, encoding: bool = False, numpy_data: bool = False
+    ) -> dict[str, Any]:
         """
         Convert this xarray.DataArray into a dictionary following xarray
         naming conventions.
@@ -4190,6 +4192,9 @@ def to_dict(self, data: bool = True, encoding: bool = False) -> dict[str, Any]:
             False, returns just the schema.
         encoding : bool, default: False
             Whether to include the Dataset's encoding in the dictionary.
+        numpy_data : bool, default: False
+            Whether to return data as numpy objects rather than native Python
+            objects. Only has an effect when ``data`` is True.
 
         Returns
         -------
@@ -4200,10 +4205,10 @@ def to_dict(self, data: bool = True, encoding: bool = False) -> dict[str, Any]:
         DataArray.from_dict
         Dataset.to_dict
         """
-        d = self.variable.to_dict(data=data)
+        d = self.variable.to_dict(data=data, numpy_data=numpy_data)
         d.update({"coords": {}, "name": self.name})
         for k, coord in self.coords.items():
-            d["coords"][k] = coord.variable.to_dict(data=data)
+            d["coords"][k] = coord.variable.to_dict(data=data, numpy_data=numpy_data)
         if encoding:
             d["encoding"] = dict(self.encoding)
         return d

diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
@@ -6441,7 +6441,9 @@ def to_dask_dataframe(
 
         return df
 
-    def to_dict(self, data: bool = True, encoding: bool = False) -> dict[str, Any]:
+    def to_dict(
+        self, data: bool = True, encoding: bool = False, numpy_data: bool = False
+    ) -> dict[str, Any]:
         """
         Convert this dataset to a dictionary following xarray naming
         conventions.
@@ -6457,6 +6459,9 @@ def to_dict(self, data: bool = True, encoding: bool = False) -> dict[str, Any]:
             False, returns just the schema.
         encoding : bool, default: False
             Whether to include the Dataset's encoding in the dictionary.
+        numpy_data : bool, default: False
+            Whether to return data as numpy objects rather than native Python
+            objects. Only has an effect when ``data`` is True.
 
         Returns
         -------
@@ -6477,11 +6482,19 @@ def to_dict(self, data: bool = True, encoding: bool = False) -> dict[str, Any]:
         }
         for k in self.coords:
             d["coords"].update(
-                {k: self[k].variable.to_dict(data=data, encoding=encoding)}
+                {
+                    k: self[k].variable.to_dict(
+                        data=data, encoding=encoding, numpy_data=numpy_data
+                    )
+                }
             )
         for k in self.data_vars:
             d["data_vars"].update(
-                {k: self[k].variable.to_dict(data=data, encoding=encoding)}
+                {
+                    k: self[k].variable.to_dict(
+                        data=data, encoding=encoding, numpy_data=numpy_data
+                    )
+                }
             )
         if encoding:
             d["encoding"] = dict(self.encoding)

diff --git a/xarray/core/variable.py b/xarray/core/variable.py
@@ -633,11 +633,18 @@ def to_index(self) -> pd.Index:
         """Convert this variable to a pandas.Index"""
         return self.to_index_variable().to_index()
 
-    def to_dict(self, data: bool = True, encoding: bool = False) -> dict:
+    def to_dict(
+        self, data: bool = True, encoding: bool = False, numpy_data: bool = False
+    ) -> dict:
         """Dictionary representation of variable."""
-        item = {"dims": self.dims, "attrs": decode_numpy_dict_values(self.attrs)}
+        item: dict[str, Any] = {
+            "dims": self.dims,
+            "attrs": decode_numpy_dict_values(self.attrs),
+        }
         if data:
-            item["data"] = ensure_us_time_resolution(self.values).tolist()
+            item["data"] = ensure_us_time_resolution(self.values)
+            if not numpy_data:
+                item["data"] = item["data"].tolist()
         else:
             item.update({"dtype": str(self.dtype), "shape": self.shape})
 

diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py
@@ -3324,25 +3324,35 @@ def test_series_categorical_index(self) -> None:
         arr = DataArray(s)
         assert "'a'" in repr(arr)  # should not error
 
+    @pytest.mark.parametrize("numpy_data", [True, False])
     @pytest.mark.parametrize("encoding", [True, False])
-    def test_to_and_from_dict(self, encoding) -> None:
+    def test_to_and_from_dict(self, encoding, numpy_data) -> None:
+        encoding_data = {"bar": "spam"}
         array = DataArray(
             np.random.randn(2, 3), {"x": ["a", "b"]}, ["x", "y"], name="foo"
         )
-        array.encoding = {"bar": "spam"}
-        expected = {
+        array.encoding = encoding_data
+
+        data = array.values
+        coords_data = np.array(["a", "b"])
+        if not numpy_data:
+            data = data.tolist()
+            coords_data = coords_data.tolist()
+
+        expected: dict[str, Any] = {
             "name": "foo",
             "dims": ("x", "y"),
-            "data": array.values.tolist(),
+            "data": data,
             "attrs": {},
-            "coords": {"x": {"dims": ("x",), "data": ["a", "b"], "attrs": {}}},
+            "coords": {"x": {"dims": ("x",), "data": coords_data, "attrs": {}}},
         }
         if encoding:
-            expected["encoding"] = {"bar": "spam"}
-        actual = array.to_dict(encoding=encoding)
+            expected["encoding"] = encoding_data
+
+        actual = array.to_dict(encoding=encoding, numpy_data=numpy_data)
 
         # check that they are identical
-        assert expected == actual
+        np.testing.assert_equal(expected, actual)
 
         # check roundtrip
         assert_identical(array, DataArray.from_dict(actual))

diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py
@@ -4589,7 +4589,8 @@ def test_convert_dataframe_with_many_types_and_multiindex(self) -> None:
         expected = df.apply(np.asarray)
         assert roundtripped.equals(expected)
 
-    def test_to_and_from_dict(self) -> None:
+    @pytest.mark.parametrize("numpy_data", [True, False])
+    def test_to_and_from_dict(self, numpy_data) -> None:
         # <xarray.Dataset>
         # Dimensions:  (t: 10)
         # Coordinates:
@@ -4611,10 +4612,10 @@ def test_to_and_from_dict(self) -> None:
             },
         }
 
-        actual = ds.to_dict()
+        actual = ds.to_dict(numpy_data=numpy_data)
 
         # check that they are identical
-        assert expected == actual
+        np.testing.assert_equal(expected, actual)
 
         # check roundtrip
         assert_identical(ds, Dataset.from_dict(actual))
@@ -4633,7 +4634,7 @@ def test_to_and_from_dict(self) -> None:
 
         # verify coords are included roundtrip
         expected_ds = ds.set_coords("b")
-        actual2 = Dataset.from_dict(expected_ds.to_dict())
+        actual2 = Dataset.from_dict(expected_ds.to_dict(numpy_data=numpy_data))
 
         assert_identical(expected_ds, actual2)
 
@@ -4683,7 +4684,8 @@ def test_to_and_from_dict_with_time_dim(self) -> None:
         roundtripped = Dataset.from_dict(ds.to_dict())
         assert_identical(ds, roundtripped)
 
-    def test_to_and_from_dict_with_nan_nat(self) -> None:
+    @pytest.mark.parametrize("numpy_data", [True, False])
+    def test_to_and_from_dict_with_nan_nat(self, numpy_data) -> None:
         x = np.random.randn(10, 3)
         y = np.random.randn(10, 3)
         y[2] = np.nan
@@ -4699,7 +4701,7 @@ def test_to_and_from_dict_with_nan_nat(self) -> None:
                 "lat": ("lat", lat),
             }
         )
-        roundtripped = Dataset.from_dict(ds.to_dict())
+        roundtripped = Dataset.from_dict(ds.to_dict(numpy_data=numpy_data))
         assert_identical(ds, roundtripped)
 
     def test_to_dict_with_numpy_attrs(self) -> None: