diff --git a/doc/user-guide/data-structures.rst b/doc/user-guide/data-structures.rst index 64e7b3625ac..a1794f4123d 100644 --- a/doc/user-guide/data-structures.rst +++ b/doc/user-guide/data-structures.rst @@ -282,27 +282,40 @@ variables (``data_vars``), coordinates (``coords``) and attributes (``attrs``). - ``attrs`` should be a dictionary. -Let's create some fake data for the example we show above: +Let's create some fake data for the example we show above. In this +example dataset, we will represent measurements of the temperature and +pressure that were made under various conditions: + +* the measurements were made on four different days; +* they were made at two separate locations, which we will represent using + their latitude and longitude; and +* they were made using instruments by three different manufacutrers, which we + will refer to as `'manufac1'`, `'manufac2'`, and `'manufac3'`. .. ipython:: python - temp = 15 + 8 * np.random.randn(2, 2, 3) - precip = 10 * np.random.rand(2, 2, 3) - lon = [[-99.83, -99.32], [-99.79, -99.23]] - lat = [[42.25, 42.21], [42.63, 42.59]] + np.random.seed(0) + temperature = 15 + 8 * np.random.randn(2, 3, 4) + precipitation = 10 * np.random.rand(2, 3, 4) + lon = [-99.83, -99.32] + lat = [42.25, 42.21] + instruments = ["manufac1", "manufac2", "manufac3"] + time = pd.date_range("2014-09-06", periods=4) + reference_time = pd.Timestamp("2014-09-05") # for real use cases, its good practice to supply array attributes such as # units, but we won't bother here for the sake of brevity ds = xr.Dataset( { - "temperature": (["x", "y", "time"], temp), - "precipitation": (["x", "y", "time"], precip), + "temperature": (["loc", "instrument", "time"], temperature), + "precipitation": (["loc", "instrument", "time"], precipitation), }, coords={ - "lon": (["x", "y"], lon), - "lat": (["x", "y"], lat), - "time": pd.date_range("2014-09-06", periods=3), - "reference_time": pd.Timestamp("2014-09-05"), + "lon": (["loc"], lon), + "lat": (["loc"], lat), + "instrument": instruments, + "time": time, + "reference_time": reference_time, }, ) ds @@ -387,12 +400,12 @@ example, to create this example dataset from scratch, we could have written: .. ipython:: python ds = xr.Dataset() - ds["temperature"] = (("x", "y", "time"), temp) - ds["temperature_double"] = (("x", "y", "time"), temp * 2) - ds["precipitation"] = (("x", "y", "time"), precip) - ds.coords["lat"] = (("x", "y"), lat) - ds.coords["lon"] = (("x", "y"), lon) - ds.coords["time"] = pd.date_range("2014-09-06", periods=3) + ds["temperature"] = (("loc", "instrument", "time"), temperature) + ds["temperature_double"] = (("loc", "instrument", "time"), temperature * 2) + ds["precipitation"] = (("loc", "instrument", "time"), precipitation) + ds.coords["lat"] = (("loc",), lat) + ds.coords["lon"] = (("loc",), lon) + ds.coords["time"] = pd.date_range("2014-09-06", periods=4) ds.coords["reference_time"] = pd.Timestamp("2014-09-05") To change the variables in a ``Dataset``, you can use all the standard dictionary @@ -452,8 +465,8 @@ follow nested function calls: # these lines are equivalent, but with pipe we can make the logic flow # entirely from left to right - plt.plot((2 * ds.temperature.sel(x=0)).mean("y")) - (ds.temperature.sel(x=0).pipe(lambda x: 2 * x).mean("y").pipe(plt.plot)) + plt.plot((2 * ds.temperature.sel(loc=0)).mean("instrument")) + (ds.temperature.sel(loc=0).pipe(lambda x: 2 * x).mean("instrument").pipe(plt.plot)) Both ``pipe`` and ``assign`` replicate the pandas methods of the same names (:py:meth:`DataFrame.pipe ` and @@ -479,7 +492,7 @@ dimension and non-dimension variables: .. ipython:: python - ds.coords["day"] = ("time", [6, 7, 8]) + ds.coords["day"] = ("time", [6, 7, 8, 9]) ds.swap_dims({"time": "day"}) .. _coordinates: diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index ffed9da1069..57ddcd9d39d 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -590,43 +590,57 @@ class Dataset( Examples -------- - Create data: + In this example dataset, we will represent measurements of the temperature + and pressure that were made under various conditions: + + * the measurements were made on four different days; + * they were made at two separate locations, which we will represent using + their latitude and longitude; and + * they were made using three instrument developed by three different + manufacturers, which we will refer to using the strings `'manufac1'`, + `'manufac2'`, and `'manufac3'`. >>> np.random.seed(0) - >>> temperature = 15 + 8 * np.random.randn(2, 2, 3) - >>> precipitation = 10 * np.random.rand(2, 2, 3) - >>> lon = [[-99.83, -99.32], [-99.79, -99.23]] - >>> lat = [[42.25, 42.21], [42.63, 42.59]] - >>> time = pd.date_range("2014-09-06", periods=3) + >>> temperature = 15 + 8 * np.random.randn(2, 3, 4) + >>> precipitation = 10 * np.random.rand(2, 3, 4) + >>> lon = [-99.83, -99.32] + >>> lat = [42.25, 42.21] + >>> instruments = ["manufac1", "manufac2", "manufac3"] + >>> time = pd.date_range("2014-09-06", periods=4) >>> reference_time = pd.Timestamp("2014-09-05") - Initialize a dataset with multiple dimensions: + Here, we initialize the dataset with multiple dimensions. We use the string + `"loc"` to represent the location dimension of the data, the string + `"instrument"` to represent the instrument manufacturer dimension, and the + string `"time"` for the time dimension. >>> ds = xr.Dataset( ... data_vars=dict( - ... temperature=(["x", "y", "time"], temperature), - ... precipitation=(["x", "y", "time"], precipitation), + ... temperature=(["loc", "instrument", "time"], temperature), + ... precipitation=(["loc", "instrument", "time"], precipitation), ... ), ... coords=dict( - ... lon=(["x", "y"], lon), - ... lat=(["x", "y"], lat), + ... lon=("loc", lon), + ... lat=("loc", lat), + ... instrument=instruments, ... time=time, ... reference_time=reference_time, ... ), ... attrs=dict(description="Weather related data."), ... ) >>> ds - Size: 288B - Dimensions: (x: 2, y: 2, time: 3) + Size: 552B + Dimensions: (loc: 2, instrument: 3, time: 4) Coordinates: - lon (x, y) float64 32B -99.83 -99.32 -99.79 -99.23 - lat (x, y) float64 32B 42.25 42.21 42.63 42.59 - * time (time) datetime64[ns] 24B 2014-09-06 2014-09-07 2014-09-08 + lon (loc) float64 16B -99.83 -99.32 + lat (loc) float64 16B 42.25 42.21 + * instrument (instrument) >> ds.isel(ds.temperature.argmin(...)) - Size: 48B + Size: 80B Dimensions: () Coordinates: lon float64 8B -99.32 lat float64 8B 42.21 - time datetime64[ns] 8B 2014-09-08 + instrument