eurec4a · leifdenby · Aug 21, 2020 · Aug 11, 2020 · Aug 11, 2020 · Aug 17, 2020
diff --git a/eurec4a_environment/nomenclature.py b/eurec4a_environment/nomenclature.py
@@ -0,0 +1,118 @@
+"""
+To ensure that all functions in eurec4a-environment can access the variables
+required the module uses a common nomenclature for variable names throughout.
+To add a new variable to the nomenclature simply a) add a new constant below
+setting the assumed field name and b) add (if available) the CF-conventions
+standard name mapping in `CF_STANDARD_NAMES` if the variable has a "standard
+name" (See http://cfconventions.org/standard-names.html for the list of
+"standard names")
+"""
+import inspect
+import xarray as xr
+
+
+TEMPERATURE = "T"
-TEMPERATURE = "T"
+TEMPERATURE = "ta"
-TEMPERATURE = "T"
+TEMPERATURE = "ta"
+ALTITUDE = "height"
-ALTITUDE = "height"
+ALTITUDE = "alt"
-ALTITUDE = "height"
+ALTITUDE = "alt"
+RELATIVE_HUMIDITY = "rh"
+POTENTIAL_TEMPERATURE = "theta"
+PRESSURE = "p"
+
+# From CF-conventions: "specific" means per unit mass. Specific humidity is
+# the mass fraction of water vapor in (moist) air.
+SPECIFIC_HUMIDITY = "q"
+
+# From CF-conventions: Speed is the magnitude of velocity. Wind is defined as
+# a two-dimensional (horizontal) air velocity vector, with no vertical
+# component. (Vertical motion in the atmosphere has the standard name
+# upward_air_velocity.) The wind speed is the magnitude of the wind velocity
+WIND_SPEED = "wspd"
+
+# From CF-conventions: "Eastward" indicates a vector component which is
+# positive when directed eastward (negative westward). Wind is defined as a
+# two-dimensional (horizontal) air velocity vector, with no vertical component.
+# (Vertical motion in the atmosphere has the standard name
+# upward_air_velocity.)
+ZONAL_WIND = "u"
+
+# From CF-conventions: "Northward" indicates a vector component which is
+# positive when directed northward (negative southward). Wind is defined as a
+# two-dimensional (horizontal) air velocity vector, with no vertical component.
+# (Vertical motion in the atmosphere has the standard name
+# upward_air_velocity.)
+MERIDIONAL_WIND = "v"
+
+
+CF_STANDARD_NAMES = {
+    TEMPERATURE: "air_temperature",
+    ALTITUDE: "geopotential_height",
+    RELATIVE_HUMIDITY: "relative_humidity",
+    POTENTIAL_TEMPERATURE: "potential_temperature",
+    PRESSURE: "air_pressure",
+    SPECIFIC_HUMIDITY: "specific_humidity",
+    WIND_SPEED: "wind_speed",
+    ZONAL_WIND: "eastward_wind",
+    MERIDIONAL_WIND: "northward_wind",
+}
+
+
+def _get_calling_function_name():
+    curframe = inspect.currentframe()
+    calframe = inspect.getouterframes(curframe, 2)
+    return calframe[1][3]
+
+
+class FieldMissingException(Exception):
+    pass
+
+
+def get_field(ds, field_name):
+    if field_name in ds:
+        return ds[field_name]
+
+    calling_function_name = _get_calling_function_name()
+
+    if field_name not in CF_STANDARD_NAMES:
+        raise FieldMissingException(
+            f"Couldn't find the variable `{field_name}` in the provided dataset."
+            f" To use {calling_function_name} you need to provide a variable"
+            f" with the name `{field_name}`"
+        )
+    else:
+        standard_name = CF_STANDARD_NAMES[field_name]
+        matching_dataarrays = {}
+        vars_and_coords = list(ds.data_vars) + list(ds.coords)
+        for v in vars_and_coords:
+            print(v)
+            if ds[v].attrs.get("standard_name", None) == standard_name:
+                matching_dataarrays[v] = ds[v]
+
+        if len(matching_dataarrays) == 0:
+            raise FieldMissingException(
+                f"Couldn't find the variable `{field_name}` in the provided dataset."
+                f" To use {calling_function_name} you need to provide a variable"
+                f" with the name `{field_name}` or set the `standard_name`"
+                f" attribute to `{standard_name}` for one or more of the variables"
+                " in the dataset"
+            )
+        elif len(matching_dataarrays) == 1:
+            return list(matching_dataarrays.values())[0]
+        else:
+            dims = [da.dims for da in matching_dataarrays.values()]
+            if len(set(dims)) == 1:
+                # all variables have the same dims, so we can create a
+                # composite data-array out of these
+                var_names = list(matching_dataarrays.keys())
+                var_dataarrays = list(matching_dataarrays.values())
+
+                # TODO: should we check units here too?
+                da_combined = xr.concat(var_dataarrays, dim="var_name")
+                da_combined.coords["var_name"] = var_names
+                return da_combined
+            else:
+                raise FieldMissingException(
+                    "More than one variable was found in the dataset with"
+                    f" the standard name `{standard_name}`, but these couldn't"
+                    " be merged to a single xarray.DataArray because they"
+                    " don't exist in the same coordinate system"
+                )
+                pass
diff --git a/eurec4a_environment/variables/boundary_layer/mixed_layer_height.py b/eurec4a_environment/variables/boundary_layer/mixed_layer_height.py
@@ -4,13 +4,16 @@
 import statsmodels.api as sm
 
 
-def calc_peak_RH(ds, altitude="alt", rh="RH", z_min=200.0, z_max=900.0):
+from ... import nomenclature as nom
+
+
+def calc_peak_RH(ds, altitude=nom.ALTITUDE, rh=nom.RELATIVE_HUMIDITY, z_min=200.0, z_max=900.0):
     """
     Calculate height at maximum relative humidity values
     """
     ds_mixed_layer = ds.sel({altitude: slice(z_min, z_max)})
 
-    da_rh = ds_mixed_layer[rh]
+    da_rh = nom.get_field(ds=ds_mixed_layer, field_name=rh)
     dims = list(ds.dims.keys())
     # drop the height coord
     del dims[dims.index(altitude)]
@@ -26,8 +29,8 @@ def calc_peak_RH(ds, altitude="alt", rh="RH", z_min=200.0, z_max=900.0):
 
 def calc_peakRH_linearize(
     ds,
-    altitude="height",
-    rh="rh",
+    altitude=nom.ALTITUDE,
+    rh=nom.RELATIVE_HUMIDITY,
     time_dim="sounding",
     z_min=200.0,
     z_max=900.0,
@@ -47,26 +50,28 @@ def calc_peakRH_linearize(
      Outputs:
          -- da: datarray containing h_peakRH_linfit
      """
-    h_peakRH_linfit = np.zeros(len(ds[rh]))
+    da_rh = nom.get_field(ds=ds, field_name=rh)
+    da_alt = nom.get_field(ds=ds, field_name=altitude)
 
-    dz = int(ds[altitude].diff(dim=altitude)[1])  # dz=10m
+    h_peakRH_linfit = np.zeros(len(da_rh))
+
+    dz = int(da_alt.diff(dim=altitude)[1])  # dz=10m
 
     mixed_layer = np.logical_and(
-        ds[altitude] >= z_min, ds[altitude] <= 1500
+        da_alt >= z_min, da_alt <= 1500
     )  # enforce z_max later
     ml_linfit = np.logical_and(
-        ds[altitude] >= z_min_lin, ds[altitude] <= z_max_lin
+        da_alt >= z_min_lin, da_alt <= z_max_lin
     )  # for linearized RH profile
 
-    for i in range(len(ds[rh])):
-
-        rh_profile = ds[rh].isel({time_dim: i}).interpolate_na(dim=altitude)
+    for i in range(len(da_rh)):
+        rh_profile = da_rh.isel({time_dim: i}).interpolate_na(dim=altitude)
         X_height = rh_profile[ml_linfit][altitude]
         X = sm.add_constant(X_height.values)  # add intercept
         model = sm.OLS(
             rh_profile[ml_linfit].values, X
         ).fit()  # instantiate linear model
-        linearized_RH = model.predict(sm.add_constant(ds[altitude]))
+        linearized_RH = model.predict(sm.add_constant(da_alt))
 
         # 'find_peaks' is scipy function
         idx_peaks_RH_raw, _ = find_peaks(rh_profile[mixed_layer])

diff --git a/tests/test_nomenclature.py b/tests/test_nomenclature.py
@@ -0,0 +1,88 @@
+import pytest
+
+
+from eurec4a_environment import nomenclature as nom
+import eurec4a_environment.source_data
+
+
+@pytest.mark.parametrize(
+    "field_name",
+    [
+        nom.TEMPERATURE,
+        nom.ALTITUDE,
+        nom.POTENTIAL_TEMPERATURE,
+        nom.PRESSURE,
+        nom.RELATIVE_HUMIDITY,
+        nom.SPECIFIC_HUMIDITY,
+        nom.WIND_SPEED,
+        nom.MERIDIONAL_WIND,
+        nom.ZONAL_WIND,
+    ],
+)
+def test_get_field_by_name(field_name):
+    ds = eurec4a_environment.source_data.open_joanne_dataset()
+
+    nom.get_field(ds=ds, field_name=field_name)
+
+
+@pytest.mark.parametrize(
+    "field_name",
+    [
+        nom.TEMPERATURE,
+        nom.ALTITUDE,
+        nom.POTENTIAL_TEMPERATURE,
+        nom.PRESSURE,
+        nom.RELATIVE_HUMIDITY,
+        nom.SPECIFIC_HUMIDITY,
+        nom.WIND_SPEED,
+        nom.MERIDIONAL_WIND,
+        nom.ZONAL_WIND,
+    ],
+)
+def test_get_field_by_standard_name(field_name):
+    ds = eurec4a_environment.source_data.open_joanne_dataset()
+
+    # make a dataset where the variable name is doesn't match
+    ds_copy = ds[[field_name]]
+    ds_copy = ds[[field_name]].rename({field_name: "_foobar_field_"})
+
+    nom.get_field(ds=ds_copy, field_name=field_name)
+
+
+def test_get_field_missing_no_standard_name():
+    field_name = nom.TEMPERATURE
+    ds = eurec4a_environment.source_data.open_joanne_dataset()
+
+    # make a dataset where the variable name is doesn't match and the
+    # standard_name isn't set
+    ds_copy = ds[[field_name]].rename({field_name: "_foobar_field_"})
+    del ds_copy["_foobar_field_"].attrs["standard_name"]
+
+    with pytest.raises(nom.FieldMissingException):
+        nom.get_field(ds=ds_copy, field_name=field_name)
+
+
+def test_get_field_missing_unknown_standard_name():
+    field_name = "_non_existing_field_"
+    ds = eurec4a_environment.source_data.open_joanne_dataset()
+
+    with pytest.raises(nom.FieldMissingException):
+        nom.get_field(ds=ds, field_name=field_name)
+
+
+def test_get_field_multiple_by_standard_name():
+    """
+    Tests that merging multiple variables on the same dataset works
+    """
+    field_name = nom.TEMPERATURE
+    field_name_copy = field_name + "_copy"
+    # XXX: currently if there's a field with correct name then the standard
+    # names are ignored, is this ok?
+    field_name_renamed = field_name + "_renamed"
+    ds = eurec4a_environment.source_data.open_joanne_dataset()
+    ds_copy = ds[[]]
+    ds_copy[field_name_copy] = ds[field_name].copy()
+    ds_copy[field_name_renamed] = ds[field_name].copy()
+
+    da = nom.get_field(ds=ds_copy, field_name=field_name)
+    assert list(da["var_name"]) == [field_name_copy, field_name_renamed]