eurec4a · leifdenby · Aug 21, 2020 · Aug 11, 2020 · Aug 11, 2020 · Aug 17, 2020
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
@@ -16,6 +16,8 @@ jobs:
 
     steps:
     - uses: actions/checkout@v2
+    - name: Install udunits2
+      run: sudo apt-get install libudunits2-dev
     - name: Set up Python ${{ matrix.python-version }}
       uses: actions/setup-python@v1
       with:

diff --git a/eurec4a_environment/__init__.py b/eurec4a_environment/__init__.py
@@ -1,3 +1,4 @@
+from .nomenclature import get_field
 from ._version import get_versions
 
 __version__ = get_versions()["version"]

diff --git a/eurec4a_environment/nomenclature.py b/eurec4a_environment/nomenclature.py
@@ -0,0 +1,168 @@
+"""
+To ensure that all functions in eurec4a-environment can access the variables
+required the module uses a common nomenclature for variable names throughout.
+To add a new variable to the nomenclature simply a) add a new constant below
+setting the assumed field name and b) add (if available) the CF-conventions
+standard name mapping in `CF_STANDARD_NAMES` if the variable has a "standard
+name" (See http://cfconventions.org/standard-names.html for the list of
+"standard names")
+"""
+import inspect
+import xarray as xr
+try:
+    import cfunits
+    HAS_UDUNITS2 = True
+except FileNotFoundError:
+    import warnings
+    HAS_UDUNITS2 = False
+
+
+# TODO: update temperature to be called `ta` once JOANNE dataset is released
+# which uses this definition
+TEMPERATURE = "T"
-TEMPERATURE = "T"
+TEMPERATURE = "ta"
-TEMPERATURE = "T"
+TEMPERATURE = "ta"
+# TODO: update altitude to be called `alt` once JOANNE dataset is released
+# which uses this definition
+ALTITUDE = "height"
-ALTITUDE = "height"
+ALTITUDE = "alt"
-ALTITUDE = "height"
+ALTITUDE = "alt"
+RELATIVE_HUMIDITY = "rh"
+POTENTIAL_TEMPERATURE = "theta"
+PRESSURE = "p"
+
+# From CF-conventions: "specific" means per unit mass. Specific humidity is
+# the mass fraction of water vapor in (moist) air.
+SPECIFIC_HUMIDITY = "q"
+
+# From CF-conventions: Speed is the magnitude of velocity. Wind is defined as
+# a two-dimensional (horizontal) air velocity vector, with no vertical
+# component. (Vertical motion in the atmosphere has the standard name
+# upward_air_velocity.) The wind speed is the magnitude of the wind velocity
+WIND_SPEED = "wspd"
+
+# From CF-conventions: "Eastward" indicates a vector component which is
+# positive when directed eastward (negative westward). Wind is defined as a
+# two-dimensional (horizontal) air velocity vector, with no vertical component.
+# (Vertical motion in the atmosphere has the standard name
+# upward_air_velocity.)
+ZONAL_WIND = "u"
+
+# From CF-conventions: "Northward" indicates a vector component which is
+# positive when directed northward (negative southward). Wind is defined as a
+# two-dimensional (horizontal) air velocity vector, with no vertical component.
+# (Vertical motion in the atmosphere has the standard name
+# upward_air_velocity.)
+MERIDIONAL_WIND = "v"
+
+
+CF_STANDARD_NAMES = {
+    TEMPERATURE: "air_temperature",
+    ALTITUDE: "geopotential_height",
+    RELATIVE_HUMIDITY: "relative_humidity",
+    POTENTIAL_TEMPERATURE: "air_potential_temperature",
+    PRESSURE: "air_pressure",
+    SPECIFIC_HUMIDITY: "specific_humidity",
+    WIND_SPEED: "wind_speed",
+    ZONAL_WIND: "eastward_wind",
+    MERIDIONAL_WIND: "northward_wind",
+}
+
+
+def _get_calling_function_name():
+    curframe = inspect.currentframe()
+    calframe = inspect.getouterframes(curframe, 2)
+    return calframe[1][3]
+
+
+class FieldMissingException(Exception):
+    pass
+
+
+def get_field(ds, field_name, units=None):
+    """
+    Get field described by `field_name` in dataset `ds` and ensure it is in the
+    units defined by `units` (if `units` != None). The units definition is any
+    unit definition supported by
+    [UDUNITS](https://www.unidata.ucar.edu/software/udunits/)
+    """
+    if field_name in ds:
+        da = ds[field_name]
+    else:
+
+        calling_function_name = _get_calling_function_name()
+
+        if field_name not in CF_STANDARD_NAMES:
+            raise FieldMissingException(
+                f"Couldn't find the variable `{field_name}` in the provided dataset."
+                f" To use {calling_function_name} you need to provide a variable"
+                f" with the name `{field_name}`"
+            )
+        else:
+            standard_name = CF_STANDARD_NAMES[field_name]
+            matching_dataarrays = {}
+            vars_and_coords = list(ds.data_vars) + list(ds.coords)
+            for v in vars_and_coords:
+                print(v)
+                if ds[v].attrs.get("standard_name", None) == standard_name:
+                    matching_dataarrays[v] = ds[v]
+
+            if len(matching_dataarrays) == 0:
+                raise FieldMissingException(
+                    f"Couldn't find the variable `{field_name}` in the provided dataset."
+                    f" To use {calling_function_name} you need to provide a variable"
+                    f" with the name `{field_name}` or set the `standard_name`"
+                    f" attribute to `{standard_name}` for one or more of the variables"
+                    " in the dataset"
+                )
+            elif len(matching_dataarrays) == 1:
+                da = list(matching_dataarrays.values())[0]
+            else:
+                dims = [da.dims for da in matching_dataarrays.values()]
+                if len(set(dims)) == 1:
+                    # all variables have the same dims, so we can create a
+                    # composite data-array out of these
+                    var_names = list(matching_dataarrays.keys())
+                    var_dataarrays = list(matching_dataarrays.values())
+
+                    # TODO: should we check units here too?
+                    da_combined = xr.concat(var_dataarrays, dim="var_name")
+                    da_combined.coords["var_name"] = var_names
+                    da = da_combined
+                else:
+                    raise FieldMissingException(
+                        "More than one variable was found in the dataset with"
+                        f" the standard name `{standard_name}`, but these couldn't"
+                        " be merged to a single xarray.DataArray because they"
+                        " don't exist in the same coordinate system"
+                    )
+                    pass
+
+    if units is None:
+        return da
+    else:
+        # ensure that output is in correct units
+        if "units" not in da.attrs:
+            field_name = da.name
+            raise Exception(
+                f"Units haven't been set on `{field_name}` field in dataset"
+            )
+        if da.attrs["units"] == units:
+            return da
+
+        if not HAS_UDUNITS2:
+            raise Exception(
+                "To do correct unit conversion udunits2 is required, without"
+                " it no unit conversion will be done. udunits2 can be installed"
+                " with conda, `conda install -c conda-forge udunits2` or see"
+                " https://stackoverflow.com/a/42387825 for general instructions"
+            )
+
+        old_units = cfunits.Units(da.attrs["units"])
+        new_units = cfunits.Units(units)
+        if old_units == new_units:
+            return da
+        else:
+            values_converted = cfunits.Units.conform(da.values, old_units, new_units)
+            attrs = dict(da.attrs)
+            attrs["units"] = units
+            da_converted = xr.DataArray(
+                values_converted, coords=da.coords, dims=da.dims, attrs=attrs
+            )
+            return da_converted
diff --git a/eurec4a_environment/variables/boundary_layer/inversion_height.py b/eurec4a_environment/variables/boundary_layer/inversion_height.py
@@ -1,26 +1,35 @@
 import xarray as xr
 
+from ... import nomenclature as nom
+
+
 def find_inversion_height_grad_RH(
-    ds, altitude="alt", rh="rh", smoothing_win_size=None, z_min=1500, z_max=4000.0
+    ds,
+    altitude=nom.ALTITUDE,
+    rh=nom.RELATIVE_HUMIDITY,
+    smoothing_win_size=None,
+    z_min=1500,
+    z_max=4000.0,
 ):
-
     """
         Returns inversion height defined as the maximum in the vertical gradient of RH
-        """
+    """
+
     ds_lowertroposhere = ds.sel({altitude: slice(z_min, z_max)})
+    da_rh = nom.get_field(ds=ds_lowertroposhere, field_name=rh)
+    da_z = nom.get_field(ds=ds_lowertroposhere, field_name=altitude)
+
     if smoothing_win_size:
-        RH = (
-            ds_lowertroposhere[rh]
-            .rolling(alt=smoothing_win_size, min_periods=smoothing_win_size, center=True)
-            .mean(skipna=True)
-        )
+        RH = da_rh.rolling(
+            alt=smoothing_win_size, min_periods=smoothing_win_size, center=True
+        ).mean(skipna=True)
     else:
-        RH = ds_lowertroposhere[rh]
+        RH = da_rh
 
     RHg = RH.differentiate(coord=altitude)
     ix = RHg.argmin(dim=altitude, skipna=True)
-    da_z = RHg.isel({ altitude: ix }).alt
+    da_z = RHg.isel({altitude: ix})[altitude]
     da_z.attrs["long_name"] = "inversion layer height (from RH gradient)"
-    da_z.attrs["units"] = "m"
+    da_z.attrs["units"] = da_z.units
 
     return da_z
diff --git a/eurec4a_environment/variables/boundary_layer/lcl.py b/eurec4a_environment/variables/boundary_layer/lcl.py
@@ -1,9 +1,12 @@
 import numpy as np
 
 from ...constants import cp_d, g
+from ... import nomenclature as nom
 
 
-def find_LCL_Bolton(ds, temperature="T", rh="RH", altitude="alt"):
+def find_LCL_Bolton(
+    ds, temperature=nom.TEMPERATURE, rh=nom.RELATIVE_HUMIDITY, altitude=nom.ALTITUDE
+):
     """
     Calculates distribution of LCL from RH and T at different vertical levels
     returns mean LCL from this distribution
@@ -28,20 +31,9 @@ def _calc_LCL_Bolton(da_temperature, da_rh, da_altitude):
         mean_zlcl = np.mean(zlcl)
         return mean_zlcl
 
-    # Celsius to Kelvin
-    if ds[temperature].units == "C":
-        da_temperature = ds[temperature] + 273.15
-        da_temperature.attrs["units"] = "K"
-    else:
-        da_temperature = ds[temperature]
-    # RH from % to [0,1] value
-    if ds[rh].max().values > 2:
-        da_rh = ds[rh] / 100
-        da_rh.attrs["units"] = "1"
-    else:
-        da_rh = ds[rh]
-
-    da_altitude = ds[altitude]
+    da_temperature = nom.get_field(ds=ds, field_name=temperature, units="K")
+    da_rh = nom.get_field(ds=ds, field_name=rh, units="1")
+    da_altitude = nom.get_field(ds=ds, field_name=altitude, units="m")
 
     return _calc_LCL_Bolton(
         da_temperature=da_temperature, da_rh=da_rh, da_altitude=da_altitude

diff --git a/eurec4a_environment/variables/boundary_layer/mixed_layer_height.py b/eurec4a_environment/variables/boundary_layer/mixed_layer_height.py
@@ -4,13 +4,18 @@
 import statsmodels.api as sm
 
 
-def calc_peak_RH(ds, altitude="alt", rh="RH", z_min=200.0, z_max=900.0):
+from ... import nomenclature as nom
+
+
+def calc_peak_RH(
+    ds, altitude=nom.ALTITUDE, rh=nom.RELATIVE_HUMIDITY, z_min=200.0, z_max=900.0
+):
     """
     Calculate height at maximum relative humidity values
     """
     ds_mixed_layer = ds.sel({altitude: slice(z_min, z_max)})
 
-    da_rh = ds_mixed_layer[rh]
+    da_rh = nom.get_field(ds=ds_mixed_layer, field_name=rh, units="%")
     dims = list(ds.dims.keys())
     # drop the height coord
     del dims[dims.index(altitude)]
@@ -26,8 +31,8 @@ def calc_peak_RH(ds, altitude="alt", rh="RH", z_min=200.0, z_max=900.0):
 
 def calc_peakRH_linearize(
     ds,
-    altitude="height",
-    rh="rh",
+    altitude=nom.ALTITUDE,
+    rh=nom.RELATIVE_HUMIDITY,
     time_dim="sounding",
     z_min=200.0,
     z_max=1500.0,
@@ -47,24 +52,26 @@ def calc_peakRH_linearize(
      Outputs:
          -- da: datarray containing h_peakRH_linfit
      """
-    h_peakRH_linfit = np.zeros(len(ds[rh]))
+    da_rh = nom.get_field(ds=ds, field_name=rh, units="%")
+    da_alt = nom.get_field(ds=ds, field_name=altitude, units="m")
 
-    dz = int(ds[altitude].diff(dim=altitude)[1])  # dz=10m
+    h_peakRH_linfit = np.zeros(len(da_rh))
 
-    mixed_layer = np.logical_and(ds[altitude] >= z_min, ds[altitude] <= z_max)
+    dz = int(da_alt.diff(dim=altitude)[1])  # dz=10m
+
+    mixed_layer = np.logical_and(da_alt >= z_min, da_alt <= z_max)
     ml_linfit = np.logical_and(
-        ds[altitude] >= z_min_lin, ds[altitude] <= z_max_lin
+        da_alt >= z_min_lin, da_alt <= z_max_lin
     )  # for linearized RH profile
 
-    for i in range(len(ds[rh])):
-
-        rh_profile = ds[rh].isel({time_dim: i}).interpolate_na(dim=altitude)
+    for i in range(len(da_rh)):
+        rh_profile = da_rh.isel({time_dim: i}).interpolate_na(dim=altitude)
         X_height = rh_profile[ml_linfit][altitude]
         X = sm.add_constant(X_height.values)  # add intercept
         model = sm.OLS(
             rh_profile[ml_linfit].values, X
         ).fit()  # instantiate linear model
-        linearized_RH = model.predict(sm.add_constant(ds[altitude]))
+        linearized_RH = model.predict(sm.add_constant(da_alt))
 
         # 'find_peaks' is scipy function
         idx_peaks_RH_raw, _ = find_peaks(rh_profile[mixed_layer])
@@ -94,7 +101,7 @@ def calc_peakRH_linearize(
 
 
 def calc_from_gradient(
-    ds, var, threshold, z_min=200, altitude="height", time="sounding"
+    ds, var, threshold, z_min=200, altitude=nom.ALTITUDE, time="sounding"
 ):
     """
     Find mixed layer height as layer over which x(z+1) - x(z) < threshold
@@ -116,7 +123,7 @@ def calculateHmix_var(
         var_profile,
         threshold,
         z_min,
-        altitude="height",
+        altitude=nom.ALTITUDE,
         time="sounding",
     ):
 
@@ -156,7 +163,7 @@ def calculateHmix_var(
             var_profile,
             threshold,
             z_min,
-            altitude="height",
+            altitude=altitude,
             time="sounding",
         )