Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Consistent variable naming #30

Merged
merged 10 commits into from
Aug 21, 2020
118 changes: 118 additions & 0 deletions eurec4a_environment/nomenclature.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
"""
To ensure that all functions in eurec4a-environment can access the variables
required the module uses a common nomenclature for variable names throughout.
To add a new variable to the nomenclature simply a) add a new constant below
setting the assumed field name and b) add (if available) the CF-conventions
standard name mapping in `CF_STANDARD_NAMES` if the variable has a "standard
name" (See http://cfconventions.org/standard-names.html for the list of
"standard names")
"""
import inspect
import xarray as xr


TEMPERATURE = "T"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
TEMPERATURE = "T"
TEMPERATURE = "ta"

This is just to be consistent with the latest variable names in JOANNE - most variable names are finalised, but some are still being discussed.. :/

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No problem :)

ALTITUDE = "height"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
ALTITUDE = "height"
ALTITUDE = "alt"

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, I'll update these. Should I update the JOANNE dataset in the intake catalog already?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did these get updated?

RELATIVE_HUMIDITY = "rh"
POTENTIAL_TEMPERATURE = "theta"
PRESSURE = "p"

# From CF-conventions: "specific" means per unit mass. Specific humidity is
# the mass fraction of water vapor in (moist) air.
SPECIFIC_HUMIDITY = "q"

# From CF-conventions: Speed is the magnitude of velocity. Wind is defined as
# a two-dimensional (horizontal) air velocity vector, with no vertical
# component. (Vertical motion in the atmosphere has the standard name
# upward_air_velocity.) The wind speed is the magnitude of the wind velocity
WIND_SPEED = "wspd"

# From CF-conventions: "Eastward" indicates a vector component which is
# positive when directed eastward (negative westward). Wind is defined as a
# two-dimensional (horizontal) air velocity vector, with no vertical component.
# (Vertical motion in the atmosphere has the standard name
# upward_air_velocity.)
ZONAL_WIND = "u"

# From CF-conventions: "Northward" indicates a vector component which is
# positive when directed northward (negative southward). Wind is defined as a
# two-dimensional (horizontal) air velocity vector, with no vertical component.
# (Vertical motion in the atmosphere has the standard name
# upward_air_velocity.)
MERIDIONAL_WIND = "v"


CF_STANDARD_NAMES = {
TEMPERATURE: "air_temperature",
ALTITUDE: "geopotential_height",
RELATIVE_HUMIDITY: "relative_humidity",
POTENTIAL_TEMPERATURE: "potential_temperature",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The CF standard is actually "air_potential_temperature". I think this is an inconsistency with the JOANNE dataset

Copy link
Collaborator

@Geet-George Geet-George Aug 12, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the correction. I'll update it for the next JOANNE version.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for catching this @LSaffin. I'll check this and make sure we use the next JOANNE version as soon as it's ready. Maybe you could give me a ping on mattermost @Geet-George and then I'll update the intake catalog?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I'll update you on it. 👍

PRESSURE: "air_pressure",
SPECIFIC_HUMIDITY: "specific_humidity",
WIND_SPEED: "wind_speed",
ZONAL_WIND: "eastward_wind",
MERIDIONAL_WIND: "northward_wind",
}


def _get_calling_function_name():
curframe = inspect.currentframe()
calframe = inspect.getouterframes(curframe, 2)
return calframe[1][3]


class FieldMissingException(Exception):
pass


def get_field(ds, field_name):
if field_name in ds:
return ds[field_name]

calling_function_name = _get_calling_function_name()

if field_name not in CF_STANDARD_NAMES:
raise FieldMissingException(
f"Couldn't find the variable `{field_name}` in the provided dataset."
f" To use {calling_function_name} you need to provide a variable"
f" with the name `{field_name}`"
)
else:
standard_name = CF_STANDARD_NAMES[field_name]
matching_dataarrays = {}
vars_and_coords = list(ds.data_vars) + list(ds.coords)
for v in vars_and_coords:
print(v)
if ds[v].attrs.get("standard_name", None) == standard_name:
matching_dataarrays[v] = ds[v]

if len(matching_dataarrays) == 0:
raise FieldMissingException(
f"Couldn't find the variable `{field_name}` in the provided dataset."
f" To use {calling_function_name} you need to provide a variable"
f" with the name `{field_name}` or set the `standard_name`"
f" attribute to `{standard_name}` for one or more of the variables"
" in the dataset"
)
elif len(matching_dataarrays) == 1:
return list(matching_dataarrays.values())[0]
else:
dims = [da.dims for da in matching_dataarrays.values()]
if len(set(dims)) == 1:
# all variables have the same dims, so we can create a
# composite data-array out of these
var_names = list(matching_dataarrays.keys())
var_dataarrays = list(matching_dataarrays.values())

# TODO: should we check units here too?
da_combined = xr.concat(var_dataarrays, dim="var_name")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is well thought of. Good implementation! However, all functions would have to be written to deal with this, i.e. multiple variables. e.g. estimating theta would require an array each of pressure and temperature. If get_field returns two temperature arrays and one pressure array, the function should know what to do... Ideally, return two theta arrays. Did I understand that correctly?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks 😄 Yes, the idea would be that what is returned is always temperature for example, but it might be multiple temperature profiles. I've elaborated a bit on this idea in my comment below. Hope that makes sense...

da_combined.coords["var_name"] = var_names
return da_combined
else:
raise FieldMissingException(
"More than one variable was found in the dataset with"
f" the standard name `{standard_name}`, but these couldn't"
" be merged to a single xarray.DataArray because they"
" don't exist in the same coordinate system"
)
pass
29 changes: 17 additions & 12 deletions eurec4a_environment/variables/boundary_layer/mixed_layer_height.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,16 @@
import statsmodels.api as sm


def calc_peak_RH(ds, altitude="alt", rh="RH", z_min=200.0, z_max=900.0):
from ... import nomenclature as nom


def calc_peak_RH(ds, altitude=nom.ALTITUDE, rh=nom.RELATIVE_HUMIDITY, z_min=200.0, z_max=900.0):
"""
Calculate height at maximum relative humidity values
"""
ds_mixed_layer = ds.sel({altitude: slice(z_min, z_max)})

da_rh = ds_mixed_layer[rh]
da_rh = nom.get_field(ds=ds_mixed_layer, field_name=rh)
dims = list(ds.dims.keys())
# drop the height coord
del dims[dims.index(altitude)]
Expand All @@ -26,8 +29,8 @@ def calc_peak_RH(ds, altitude="alt", rh="RH", z_min=200.0, z_max=900.0):

def calc_peakRH_linearize(
ds,
altitude="height",
rh="rh",
altitude=nom.ALTITUDE,
rh=nom.RELATIVE_HUMIDITY,
time_dim="sounding",
z_min=200.0,
z_max=900.0,
Expand All @@ -47,26 +50,28 @@ def calc_peakRH_linearize(
Outputs:
-- da: datarray containing h_peakRH_linfit
"""
h_peakRH_linfit = np.zeros(len(ds[rh]))
da_rh = nom.get_field(ds=ds, field_name=rh)
da_alt = nom.get_field(ds=ds, field_name=altitude)

dz = int(ds[altitude].diff(dim=altitude)[1]) # dz=10m
h_peakRH_linfit = np.zeros(len(da_rh))

dz = int(da_alt.diff(dim=altitude)[1]) # dz=10m

mixed_layer = np.logical_and(
ds[altitude] >= z_min, ds[altitude] <= 1500
da_alt >= z_min, da_alt <= 1500
) # enforce z_max later
ml_linfit = np.logical_and(
ds[altitude] >= z_min_lin, ds[altitude] <= z_max_lin
da_alt >= z_min_lin, da_alt <= z_max_lin
) # for linearized RH profile

for i in range(len(ds[rh])):

rh_profile = ds[rh].isel({time_dim: i}).interpolate_na(dim=altitude)
for i in range(len(da_rh)):
rh_profile = da_rh.isel({time_dim: i}).interpolate_na(dim=altitude)
X_height = rh_profile[ml_linfit][altitude]
X = sm.add_constant(X_height.values) # add intercept
model = sm.OLS(
rh_profile[ml_linfit].values, X
).fit() # instantiate linear model
linearized_RH = model.predict(sm.add_constant(ds[altitude]))
linearized_RH = model.predict(sm.add_constant(da_alt))

# 'find_peaks' is scipy function
idx_peaks_RH_raw, _ = find_peaks(rh_profile[mixed_layer])
Expand Down
88 changes: 88 additions & 0 deletions tests/test_nomenclature.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import pytest


from eurec4a_environment import nomenclature as nom
import eurec4a_environment.source_data


@pytest.mark.parametrize(
"field_name",
[
nom.TEMPERATURE,
nom.ALTITUDE,
nom.POTENTIAL_TEMPERATURE,
nom.PRESSURE,
nom.RELATIVE_HUMIDITY,
nom.SPECIFIC_HUMIDITY,
nom.WIND_SPEED,
nom.MERIDIONAL_WIND,
nom.ZONAL_WIND,
],
)
def test_get_field_by_name(field_name):
ds = eurec4a_environment.source_data.open_joanne_dataset()

nom.get_field(ds=ds, field_name=field_name)


@pytest.mark.parametrize(
"field_name",
[
nom.TEMPERATURE,
nom.ALTITUDE,
nom.POTENTIAL_TEMPERATURE,
nom.PRESSURE,
nom.RELATIVE_HUMIDITY,
nom.SPECIFIC_HUMIDITY,
nom.WIND_SPEED,
nom.MERIDIONAL_WIND,
nom.ZONAL_WIND,
],
)
def test_get_field_by_standard_name(field_name):
ds = eurec4a_environment.source_data.open_joanne_dataset()

# make a dataset where the variable name is doesn't match
ds_copy = ds[[field_name]]
ds_copy = ds[[field_name]].rename({field_name: "_foobar_field_"})

nom.get_field(ds=ds_copy, field_name=field_name)


def test_get_field_missing_no_standard_name():
field_name = nom.TEMPERATURE
ds = eurec4a_environment.source_data.open_joanne_dataset()

# make a dataset where the variable name is doesn't match and the
# standard_name isn't set
ds_copy = ds[[field_name]].rename({field_name: "_foobar_field_"})
del ds_copy["_foobar_field_"].attrs["standard_name"]

with pytest.raises(nom.FieldMissingException):
nom.get_field(ds=ds_copy, field_name=field_name)


def test_get_field_missing_unknown_standard_name():
field_name = "_non_existing_field_"
ds = eurec4a_environment.source_data.open_joanne_dataset()

with pytest.raises(nom.FieldMissingException):
nom.get_field(ds=ds, field_name=field_name)


def test_get_field_multiple_by_standard_name():
"""
Tests that merging multiple variables on the same dataset works
"""
field_name = nom.TEMPERATURE
field_name_copy = field_name + "_copy"
# XXX: currently if there's a field with correct name then the standard
# names are ignored, is this ok?
field_name_renamed = field_name + "_renamed"
ds = eurec4a_environment.source_data.open_joanne_dataset()
ds_copy = ds[[]]
ds_copy[field_name_copy] = ds[field_name].copy()
ds_copy[field_name_renamed] = ds[field_name].copy()

da = nom.get_field(ds=ds_copy, field_name=field_name)
assert list(da["var_name"]) == [field_name_copy, field_name_renamed]