From ce6b158abd9b0c7a46746e41e16a137c4a3e9b0e Mon Sep 17 00:00:00 2001 From: Noah Benson Date: Wed, 24 Apr 2024 17:43:15 -0700 Subject: [PATCH 1/6] Updates the example in the doc-string for the Dataset class to be clearer. The example in the doc-string of the `Dataset` class prior to this commit uses an example array whose size is `2 x 2 x 3` with the first two dimensions labeled `"x"` and `"y"` and the final dimension labeled `"time"`. This was confusing due to the fact that `"x"` and `"y"` are just arbitrary names for these axes and that no reason is given for the data to be organized in a `2x2x3` array instead of a `2x2` matrix. This commit clarifies the example. See issue #8970 for more information. --- xarray/core/dataset.py | 68 +++++++++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 27 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 96f3be00995..650bbd85ea2 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -589,43 +589,56 @@ class Dataset( Examples -------- - Create data: + In this example dataset, we will represent measurements of the temperature + and pressure that were made under various conditions: + * the measurements were made on four different days; + * they were made at two separate locations, which we will represent using + their latitude and longitude; and + * they were made using three instrument developed by three different + manufacturers, which we will refer to using the strings `'manufac1'`, + `'manufac2'`, and `'manufac3'`. 
>>> np.random.seed(0) - >>> temperature = 15 + 8 * np.random.randn(2, 2, 3) - >>> precipitation = 10 * np.random.rand(2, 2, 3) - >>> lon = [[-99.83, -99.32], [-99.79, -99.23]] - >>> lat = [[42.25, 42.21], [42.63, 42.59]] - >>> time = pd.date_range("2014-09-06", periods=3) + >>> temperature = 15 + 8 * np.random.randn(2, 3, 4) + >>> precipitation = 10 * np.random.rand(2, 3, 4) + >>> lon = [-99.83, -99.32] + >>> lat = [42.25, 42.21] + >>> instruments = ['manufac1', 'manufac2', 'manufac3'] + >>> time = pd.date_range("2014-09-06", periods=4) >>> reference_time = pd.Timestamp("2014-09-05") - Initialize a dataset with multiple dimensions: + Here, we initialize the dataset with multiple dimensions. We use the string + `"loc"` to represent the location dimension of the data, the string + `"instrument"` to represent the instrument manufacturer dimension, and the + string `"time"` for the time dimension. >>> ds = xr.Dataset( ... data_vars=dict( - ... temperature=(["x", "y", "time"], temperature), - ... precipitation=(["x", "y", "time"], precipitation), + ... temperature=(["loc", "instrument", "time"], temperature), + ... precipitation=(["loc", "instrument", "time"], precipitation), ... ), ... coords=dict( - ... lon=(["x", "y"], lon), - ... lat=(["x", "y"], lat), + ... lon=("loc", lon), + ... lat=("loc", lat), + ... instrument=instruments, ... time=time, ... reference_time=reference_time, ... ), ... attrs=dict(description="Weather related data."), ... 
) >>> ds - Size: 288B - Dimensions: (x: 2, y: 2, time: 3) + + Dimensions: (loc: 2, instrument: 3, time: 4) Coordinates: - lon (x, y) float64 32B -99.83 -99.32 -99.79 -99.23 - lat (x, y) float64 32B 42.25 42.21 42.63 42.59 - * time (time) datetime64[ns] 24B 2014-09-06 2014-09-07 2014-09-08 - reference_time datetime64[ns] 8B 2014-09-05 - Dimensions without coordinates: x, y + lon (loc) float64 -99.83 -99.32 + lat (loc) float64 42.25 42.21 + * instrument (instrument) >> ds.isel(ds.temperature.argmin(...)) - Size: 48B + Dimensions: () Coordinates: - lon float64 8B -99.32 - lat float64 8B 42.21 - time datetime64[ns] 8B 2014-09-08 - reference_time datetime64[ns] 8B 2014-09-05 + lon float64 -99.32 + lat float64 42.21 + instrument Date: Wed, 24 Apr 2024 18:24:26 -0700 Subject: [PATCH 2/6] Updates the documentation of the Dataset class to have clearer examples. These changes to the documentation bring it into alignment with the changes to the `Dataset` doc-string committed previously. See issue #8970 for more information. --- doc/user-guide/data-structures.rst | 58 ++++++++++++++++++------------ 1 file changed, 35 insertions(+), 23 deletions(-) diff --git a/doc/user-guide/data-structures.rst b/doc/user-guide/data-structures.rst index 64e7b3625ac..0f344ad70b4 100644 --- a/doc/user-guide/data-structures.rst +++ b/doc/user-guide/data-structures.rst @@ -282,27 +282,39 @@ variables (``data_vars``), coordinates (``coords``) and attributes (``attrs``). - ``attrs`` should be a dictionary. -Let's create some fake data for the example we show above: - -.. ipython:: python - - temp = 15 + 8 * np.random.randn(2, 2, 3) - precip = 10 * np.random.rand(2, 2, 3) - lon = [[-99.83, -99.32], [-99.79, -99.23]] - lat = [[42.25, 42.21], [42.63, 42.59]] +Let's create some fake data for the example we show above. 
In this +example dataset, we will represent measurements of the temperature and +pressure that were made under various conditions: +* the measurements were made on four different days; +* they were made at two separate locations, which we will represent using + their latitude and longitude; and +* they were made using three different sets of instruments, which we will + refer to as `'inst1'`, `'inst2'`, and `'inst3'`. + +.. ipython:: python + + np.random.seed(0) + temperature = 15 + 8 * np.random.randn(2, 3, 4) + precipitation = 10 * np.random.rand(2, 3, 4) + lon = [-99.83, -99.32] + lat = [42.25, 42.21] + instruments = ['inst1', 'inst2', 'inst3'] + time = pd.date_range("2014-09-06", periods=4) + reference_time = pd.Timestamp("2014-09-05") # for real use cases, its good practice to supply array attributes such as # units, but we won't bother here for the sake of brevity ds = xr.Dataset( { - "temperature": (["x", "y", "time"], temp), - "precipitation": (["x", "y", "time"], precip), + "temperature": (["loc", "instrument", "time"], temperature), + "precipitation": (["loc", "instrument", "time"], precipitation), }, coords={ - "lon": (["x", "y"], lon), - "lat": (["x", "y"], lat), - "time": pd.date_range("2014-09-06", periods=3), - "reference_time": pd.Timestamp("2014-09-05"), + "lon": (["loc"], lon), + "lat": (["loc"], lat), + "instrument": instruments, + "time": time, + "reference_time": reference_time, }, ) ds @@ -387,12 +399,12 @@ example, to create this example dataset from scratch, we could have written: .. 
ipython:: python ds = xr.Dataset() - ds["temperature"] = (("x", "y", "time"), temp) - ds["temperature_double"] = (("x", "y", "time"), temp * 2) - ds["precipitation"] = (("x", "y", "time"), precip) - ds.coords["lat"] = (("x", "y"), lat) - ds.coords["lon"] = (("x", "y"), lon) - ds.coords["time"] = pd.date_range("2014-09-06", periods=3) + ds["temperature"] = (("loc", "instrument", "time"), temp) + ds["temperature_double"] = (("loc", "instrument", "time"), temp * 2) + ds["precipitation"] = (("loc", "instrument", "time"), precip) + ds.coords["lat"] = (("loc",), lat) + ds.coords["lon"] = (("loc",), lon) + ds.coords["time"] = pd.date_range("2014-09-06", periods=4) ds.coords["reference_time"] = pd.Timestamp("2014-09-05") To change the variables in a ``Dataset``, you can use all the standard dictionary @@ -452,8 +464,8 @@ follow nested function calls: # these lines are equivalent, but with pipe we can make the logic flow # entirely from left to right - plt.plot((2 * ds.temperature.sel(x=0)).mean("y")) - (ds.temperature.sel(x=0).pipe(lambda x: 2 * x).mean("y").pipe(plt.plot)) + plt.plot((2 * ds.temperature.sel(loc=0)).mean("instrument")) + (ds.temperature.sel(loc=0).pipe(lambda x: 2 * x).mean("instrument").pipe(plt.plot)) Both ``pipe`` and ``assign`` replicate the pandas methods of the same names (:py:meth:`DataFrame.pipe ` and @@ -479,7 +491,7 @@ dimension and non-dimension variables: .. ipython:: python - ds.coords["day"] = ("time", [6, 7, 8]) + ds.coords["day"] = ("time", [6, 7, 8, 9]) ds.swap_dims({"time": "day"}) .. 
_coordinates: From 6454852cd2edb6102d3c580f23bb92c5e663bd70 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 25 Apr 2024 01:43:26 +0000 Subject: [PATCH 3/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- doc/user-guide/data-structures.rst | 2 +- xarray/core/dataset.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/user-guide/data-structures.rst b/doc/user-guide/data-structures.rst index 0f344ad70b4..074fba9f5ac 100644 --- a/doc/user-guide/data-structures.rst +++ b/doc/user-guide/data-structures.rst @@ -298,7 +298,7 @@ pressure that were made under various conditions: precipitation = 10 * np.random.rand(2, 3, 4) lon = [-99.83, -99.32] lat = [42.25, 42.21] - instruments = ['inst1', 'inst2', 'inst3'] + instruments = ["inst1", "inst2", "inst3"] time = pd.date_range("2014-09-06", periods=4) reference_time = pd.Timestamp("2014-09-05") diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 650bbd85ea2..ef5466fde63 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -603,7 +603,7 @@ class Dataset( >>> precipitation = 10 * np.random.rand(2, 3, 4) >>> lon = [-99.83, -99.32] >>> lat = [42.25, 42.21] - >>> instruments = ['manufac1', 'manufac2', 'manufac3'] + >>> instruments = ["manufac1", "manufac2", "manufac3"] >>> time = pd.date_range("2014-09-06", periods=4) >>> reference_time = pd.Timestamp("2014-09-05") From 2280ba4f26167336a0eb24f71858a13cd3b2068d Mon Sep 17 00:00:00 2001 From: Noah Benson Date: Wed, 24 Apr 2024 18:54:34 -0700 Subject: [PATCH 4/6] Adds dataset size reports to the output of the example in the Dataset docstring. 
--- xarray/core/dataset.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 650bbd85ea2..ebb148fdf38 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -627,18 +627,18 @@ class Dataset( ... attrs=dict(description="Weather related data."), ... ) >>> ds - + Size: 552B Dimensions: (loc: 2, instrument: 3, time: 4) Coordinates: - lon (loc) float64 -99.83 -99.32 - lat (loc) float64 42.25 42.21 - * instrument (instrument) >> ds.isel(ds.temperature.argmin(...)) - + Size: 80B Dimensions: () Coordinates: - lon float64 -99.32 - lat float64 42.21 - instrument Date: Wed, 24 Apr 2024 19:12:00 -0700 Subject: [PATCH 5/6] Fixes the documentation errors in the previous commits. --- doc/user-guide/data-structures.rst | 6 +++--- xarray/core/dataset.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/user-guide/data-structures.rst b/doc/user-guide/data-structures.rst index 074fba9f5ac..64ca9e9416f 100644 --- a/doc/user-guide/data-structures.rst +++ b/doc/user-guide/data-structures.rst @@ -399,9 +399,9 @@ example, to create this example dataset from scratch, we could have written: .. 
ipython:: python ds = xr.Dataset() - ds["temperature"] = (("loc", "instrument", "time"), temp) - ds["temperature_double"] = (("loc", "instrument", "time"), temp * 2) - ds["precipitation"] = (("loc", "instrument", "time"), precip) + ds["temperature"] = (("loc", "instrument", "time"), temperature) + ds["temperature_double"] = (("loc", "instrument", "time"), temperature * 2) + ds["precipitation"] = (("loc", "instrument", "time"), precipitation) ds.coords["lat"] = (("loc",), lat) ds.coords["lon"] = (("loc",), lon) ds.coords["time"] = pd.date_range("2014-09-06", periods=4) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 76d2727ca55..fc2bbd80884 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -633,12 +633,12 @@ class Dataset( lon (loc) float64 16B -99.83 -99.32 lat (loc) float64 16B 42.25 42.21 * instrument (instrument) Date: Wed, 24 Apr 2024 20:02:06 -0700 Subject: [PATCH 6/6] Fixes indentation errors in the docs for previous commits. --- doc/user-guide/data-structures.rst | 7 ++++--- xarray/core/dataset.py | 13 +++++++------ 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/doc/user-guide/data-structures.rst b/doc/user-guide/data-structures.rst index 64ca9e9416f..a1794f4123d 100644 --- a/doc/user-guide/data-structures.rst +++ b/doc/user-guide/data-structures.rst @@ -285,11 +285,12 @@ variables (``data_vars``), coordinates (``coords``) and attributes (``attrs``). Let's create some fake data for the example we show above. In this example dataset, we will represent measurements of the temperature and pressure that were made under various conditions: + * the measurements were made on four different days; * they were made at two separate locations, which we will represent using their latitude and longitude; and -* they were made using three different sets of instruments, which we will - refer to as `'inst1'`, `'inst2'`, and `'inst3'`. 
+* they were made using instruments by three different manufacturers, which we + will refer to as `'manufac1'`, `'manufac2'`, and `'manufac3'`. .. ipython:: python @@ -298,7 +299,7 @@ pressure that were made under various conditions: precipitation = 10 * np.random.rand(2, 3, 4) lon = [-99.83, -99.32] lat = [42.25, 42.21] - instruments = ["inst1", "inst2", "inst3"] + instruments = ["manufac1", "manufac2", "manufac3"] time = pd.date_range("2014-09-06", periods=4) reference_time = pd.Timestamp("2014-09-05") diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index fc2bbd80884..79388e6afbf 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -591,12 +591,13 @@ class Dataset( -------- In this example dataset, we will represent measurements of the temperature and pressure that were made under various conditions: - * the measurements were made on four different days; - * they were made at two separate locations, which we will represent using - their latitude and longitude; and - * they were made using three instrument developed by three different - manufacturers, which we will refer to using the strings `'manufac1'`, - `'manufac2'`, and `'manufac3'`. + + * the measurements were made on four different days; + * they were made at two separate locations, which we will represent using + their latitude and longitude; and + * they were made using three instruments developed by three different + manufacturers, which we will refer to using the strings `'manufac1'`, + `'manufac2'`, and `'manufac3'`. >>> np.random.seed(0) >>> temperature = 15 + 8 * np.random.randn(2, 3, 4)