Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ds.to_dict with data as arrays, not lists #7739

Merged
merged 19 commits into from
Apr 28, 2023
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,10 @@ New Features
- Added ability to save ``DataArray`` objects directly to Zarr using :py:meth:`~xarray.DataArray.to_zarr`.
(:issue:`7692`, :pull:`7693`) .
By `Joe Hamman <https://github.com/jhamman>`_.
- Added ``numpy_data`` keyword argument to both :py:meth:`xarray.Dataset.to_dict` and
:py:meth:`xarray.DataArray.to_dict` to return data as numpy objects instead of native Python objects.
(:issue:`1599`, :pull:`7739`) .
By `James McCreight <https://github.com/jmccreight>`_.

Breaking changes
~~~~~~~~~~~~~~~~
Expand Down
11 changes: 8 additions & 3 deletions xarray/core/dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -4174,7 +4174,9 @@ def to_zarr(
zarr_version=zarr_version,
)

def to_dict(self, data: bool = True, encoding: bool = False) -> dict[str, Any]:
def to_dict(
self, data: bool = True, encoding: bool = False, numpy_data: bool = False
) -> dict[str, Any]:
"""
Convert this xarray.DataArray into a dictionary following xarray
naming conventions.
Expand All @@ -4190,6 +4192,9 @@ def to_dict(self, data: bool = True, encoding: bool = False) -> dict[str, Any]:
False, returns just the schema.
encoding : bool, default: False
Whether to include the DataArray's encoding in the dictionary.
numpy_data : bool, default: False
Whether to return data as numpy objects rather than native Python
objects. Only applies when ``data`` is True.

Returns
-------
Expand All @@ -4200,10 +4205,10 @@ def to_dict(self, data: bool = True, encoding: bool = False) -> dict[str, Any]:
DataArray.from_dict
Dataset.to_dict
"""
d = self.variable.to_dict(data=data)
d = self.variable.to_dict(data=data, numpy_data=numpy_data)
d.update({"coords": {}, "name": self.name})
for k, coord in self.coords.items():
d["coords"][k] = coord.variable.to_dict(data=data)
d["coords"][k] = coord.variable.to_dict(data=data, numpy_data=numpy_data)
if encoding:
d["encoding"] = dict(self.encoding)
return d
Expand Down
19 changes: 16 additions & 3 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -6441,7 +6441,9 @@ def to_dask_dataframe(

return df

def to_dict(self, data: bool = True, encoding: bool = False) -> dict[str, Any]:
def to_dict(
self, data: bool = True, encoding: bool = False, numpy_data: bool = False
) -> dict[str, Any]:
"""
Convert this dataset to a dictionary following xarray naming
conventions.
Expand All @@ -6457,6 +6459,9 @@ def to_dict(self, data: bool = True, encoding: bool = False) -> dict[str, Any]:
False, returns just the schema.
encoding : bool, default: False
Whether to include the Dataset's encoding in the dictionary.
numpy_data : bool, default: False
Whether to return data as numpy objects rather than native Python
objects. Only applies when ``data`` is True.

Returns
-------
Expand All @@ -6477,11 +6482,19 @@ def to_dict(self, data: bool = True, encoding: bool = False) -> dict[str, Any]:
}
for k in self.coords:
d["coords"].update(
{k: self[k].variable.to_dict(data=data, encoding=encoding)}
{
k: self[k].variable.to_dict(
data=data, encoding=encoding, numpy_data=numpy_data
)
}
)
for k in self.data_vars:
d["data_vars"].update(
{k: self[k].variable.to_dict(data=data, encoding=encoding)}
{
k: self[k].variable.to_dict(
data=data, encoding=encoding, numpy_data=numpy_data
)
}
)
if encoding:
d["encoding"] = dict(self.encoding)
Expand Down
13 changes: 10 additions & 3 deletions xarray/core/variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -633,11 +633,18 @@ def to_index(self) -> pd.Index:
"""Convert this variable to a pandas.Index"""
return self.to_index_variable().to_index()

def to_dict(self, data: bool = True, encoding: bool = False) -> dict:
def to_dict(
self, data: bool = True, encoding: bool = False, numpy_data: bool = False
) -> dict:
"""Dictionary representation of variable."""
item = {"dims": self.dims, "attrs": decode_numpy_dict_values(self.attrs)}
item: dict[str, Any] = {
"dims": self.dims,
"attrs": decode_numpy_dict_values(self.attrs),
}
if data:
item["data"] = ensure_us_time_resolution(self.values).tolist()
item["data"] = ensure_us_time_resolution(self.values)
if not numpy_data:
item["data"] = item["data"].tolist()
else:
item.update({"dtype": str(self.dtype), "shape": self.shape})

Expand Down
26 changes: 18 additions & 8 deletions xarray/tests/test_dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -3324,25 +3324,35 @@ def test_series_categorical_index(self) -> None:
arr = DataArray(s)
assert "'a'" in repr(arr) # should not error

@pytest.mark.parametrize("numpy_data", [True, False])
@pytest.mark.parametrize("encoding", [True, False])
def test_to_and_from_dict(self, encoding) -> None:
def test_to_and_from_dict(self, encoding, numpy_data) -> None:
encoding_data = {"bar": "spam"}
array = DataArray(
np.random.randn(2, 3), {"x": ["a", "b"]}, ["x", "y"], name="foo"
)
array.encoding = {"bar": "spam"}
expected = {
array.encoding = encoding_data

data = array.values
coords_data = np.array(["a", "b"])
if not numpy_data:
data = data.tolist()
coords_data = coords_data.tolist()

expected: dict[str, Any] = {
"name": "foo",
"dims": ("x", "y"),
"data": array.values.tolist(),
"data": data,
"attrs": {},
"coords": {"x": {"dims": ("x",), "data": ["a", "b"], "attrs": {}}},
"coords": {"x": {"dims": ("x",), "data": coords_data, "attrs": {}}},
}
if encoding:
expected["encoding"] = {"bar": "spam"}
actual = array.to_dict(encoding=encoding)
expected["encoding"] = encoding_data

actual = array.to_dict(encoding=encoding, numpy_data=numpy_data)

# check that they are identical
assert expected == actual
np.testing.assert_equal(expected, actual)

# check roundtrip
assert_identical(array, DataArray.from_dict(actual))
Expand Down
14 changes: 8 additions & 6 deletions xarray/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -4589,7 +4589,8 @@ def test_convert_dataframe_with_many_types_and_multiindex(self) -> None:
expected = df.apply(np.asarray)
assert roundtripped.equals(expected)

def test_to_and_from_dict(self) -> None:
@pytest.mark.parametrize("numpy_data", [True, False])
def test_to_and_from_dict(self, numpy_data) -> None:
# <xarray.Dataset>
# Dimensions: (t: 10)
# Coordinates:
Expand All @@ -4611,10 +4612,10 @@ def test_to_and_from_dict(self) -> None:
},
}

actual = ds.to_dict()
actual = ds.to_dict(numpy_data=numpy_data)

# check that they are identical
assert expected == actual
np.testing.assert_equal(expected, actual)

# check roundtrip
assert_identical(ds, Dataset.from_dict(actual))
Expand All @@ -4633,7 +4634,7 @@ def test_to_and_from_dict(self) -> None:

# verify coords are included roundtrip
expected_ds = ds.set_coords("b")
actual2 = Dataset.from_dict(expected_ds.to_dict())
actual2 = Dataset.from_dict(expected_ds.to_dict(numpy_data=numpy_data))

assert_identical(expected_ds, actual2)

Expand Down Expand Up @@ -4683,7 +4684,8 @@ def test_to_and_from_dict_with_time_dim(self) -> None:
roundtripped = Dataset.from_dict(ds.to_dict())
assert_identical(ds, roundtripped)

def test_to_and_from_dict_with_nan_nat(self) -> None:
@pytest.mark.parametrize("numpy_data", [True, False])
def test_to_and_from_dict_with_nan_nat(self, numpy_data) -> None:
x = np.random.randn(10, 3)
y = np.random.randn(10, 3)
y[2] = np.nan
Expand All @@ -4699,7 +4701,7 @@ def test_to_and_from_dict_with_nan_nat(self) -> None:
"lat": ("lat", lat),
}
)
roundtripped = Dataset.from_dict(ds.to_dict())
roundtripped = Dataset.from_dict(ds.to_dict(numpy_data=numpy_data))
assert_identical(ds, roundtripped)

def test_to_dict_with_numpy_attrs(self) -> None:
Expand Down