Skip to content

Commit

Permalink
Merge pull request #55 from esm-tools/feat/dimensionless-units
Browse files Browse the repository at this point in the history
Dimensionless Units
  • Loading branch information
mandresm authored Nov 25, 2024
2 parents 4c61f1e + e8a406b commit 3042e47
Show file tree
Hide file tree
Showing 12 changed files with 337 additions and 58 deletions.
4 changes: 4 additions & 0 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@
"tests.fixtures.fake_data.fesom_mesh",
"tests.fixtures.fake_filesystem",
"tests.fixtures.sample_rules",
"tests.fixtures.config_files",
"tests.fixtures.CV_Dir",
"tests.fixtures.CMIP_Tables_Dir",
"tests.fixtures.data_requests",
]


Expand Down
1 change: 1 addition & 0 deletions examples/sample.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ pymorize:
fixed_jobs: 12
# minimum_jobs: 8
# maximum_jobs: 30
dimensionless_mapping_table: ../data/dimensionless_mappings.yaml
rules:
- name: paul_example_rule
description: "You can put some text here"
Expand Down
5 changes: 4 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,10 @@ def read(filename):
"externals=pymorize.externals:externals",
],
},
package_data={},
include_package_data=True,
package_data={
"pymorize": ["data/*.yaml"],
},
classifiers=[
"Development Status :: 2 - Pre-Alpha",
"License :: OSI Approved :: MIT License",
Expand Down
95 changes: 94 additions & 1 deletion src/pymorize/cmorizer.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import copy
from importlib.resources import files
from pathlib import Path

import dask # noqa: F401
import pandas as pd
import questionary
import xarray as xr # noqa: F401
import yaml
from dask.distributed import Client
from dask_jobqueue import SLURMCluster
Expand All @@ -20,9 +22,14 @@
from .pipeline import Pipeline
from .rule import Rule
from .timeaverage import _frequency_from_approx_interval
from .units import handle_unit_conversion
from .utils import wait_for_workers
from .validate import PIPELINES_VALIDATOR, RULES_VALIDATOR

# Default dimensionless-unit mapping table: the ``dimensionless_mappings.yaml``
# resource located inside the ``pymorize.data`` package.
DIMENSIONLESS_MAPPING_TABLE = files("pymorize.data") / "dimensionless_mappings.yaml"


class CMORizer:
def __init__(
Expand Down Expand Up @@ -88,6 +95,7 @@ def __init__(
self._post_init_read_bare_tables()
self._post_init_create_data_request()
self._post_init_populate_rules_with_tables()
self._post_init_read_dimensionless_unit_mappings()
self._post_init_data_request_variables()
################################################################################

Expand Down Expand Up @@ -187,6 +195,38 @@ def _post_init_data_request_variables(self):
self._rules_expand_drvs()
self._rules_depluralize_drvs()

def _post_init_read_dimensionless_unit_mappings(self):
    """
    Read the dimensionless unit mappings and attach them to every rule.

    The path of the mapping file is taken from the
    ``dimensionless_mapping_table`` entry of the pymorize configuration.
    When that entry is absent, ``DIMENSIONLESS_MAPPING_TABLE`` is used
    instead. When the entry is explicitly set to ``None``, a warning is
    logged and an empty mapping is used.

    The resulting dictionary is stored on each rule in ``self.rules`` as
    ``dimensionless_unit_mappings`` (the same object is shared by all
    rules).

    Raises
    ------
    OSError
        If the mapping file cannot be opened (e.g. it does not exist).
    """
    pymorize_cfg = self._pymorize_cfg
    unit_map_file = pymorize_cfg.get(
        "dimensionless_mapping_table", DIMENSIONLESS_MAPPING_TABLE
    )
    if unit_map_file is None:
        logger.warning("No dimensionless unit mappings file specified!")
        dimensionless_unit_mappings = {}
    else:
        with open(unit_map_file, "r") as f:
            # ``yaml.safe_load`` returns None for an empty document; fall
            # back to an empty dict so that later ``.get`` lookups on the
            # mapping do not fail.
            dimensionless_unit_mappings = yaml.safe_load(f) or {}
    # Add to rules:
    for rule in self.rules:
        rule.dimensionless_unit_mappings = dimensionless_unit_mappings

def find_matching_rule(
self, data_request_variable: DataRequestVariable
) -> Rule or None:
Expand Down Expand Up @@ -254,7 +294,15 @@ def _post_init_create_pipelines(self):
self.pipelines = pipelines

def _post_init_create_rules(self):
    """
    Normalise ``self.rules`` into a list of ``Rule`` instances.

    Entries that already are ``Rule`` instances are kept as they are,
    dictionaries are converted with ``Rule.from_dict``. Afterwards the
    inherit and pymorize-config attachment steps are run.

    Raises
    ------
    TypeError
        If an entry is neither a ``Rule`` nor a ``dict``.
    """
    # NOTE: the former one-line comprehension
    # ``[Rule.from_dict(p) for p in self.rules if not isinstance(p, Rule)]``
    # is intentionally not executed any more: it discarded entries that
    # were already Rule instances before this loop could keep them.
    _rules = []
    for p in self.rules:
        if isinstance(p, Rule):
            _rules.append(p)
        elif isinstance(p, dict):
            _rules.append(Rule.from_dict(p))
        else:
            raise TypeError("rule must be an instance of Rule or dict")
    self.rules = _rules
    self._post_init_inherit_rules()
    self._post_init_attach_pymorize_config_rules()

Expand All @@ -276,6 +324,7 @@ def validate(self):
# self._check_rules_for_output_dir()
# FIXME(PS): Turn off this check, see GH #59 (https://tinyurl.com/3z7d8uuy)
# self._check_is_subperiod()
self._check_units()

def _check_is_subperiod(self):
logger.info("checking frequency in netcdf file and in table...")
Expand Down Expand Up @@ -308,6 +357,49 @@ def _check_is_subperiod(self):
if errors:
for err in errors:
logger.error(err)
raise errors[0]

def _check_units(self):
    """
    Check that model units and CMOR units of every rule are compatible
    with respect to dimensionless quantities.

    For the first file of each input collection of each rule, two
    conditions are collected as errors:

    * the model unit is ``None`` while the CMOR unit is neither scalar
      nor ``"%"``;
    * the CMOR unit is scalar, the model unit is not, and no entry for
      the CMOR unit exists in the rule's ``dimensionless_unit_mappings``
      for this CMOR variable.

    All collected errors are logged; the first one is raised.

    Raises
    ------
    ValueError
        The first of the collected unit errors, if any.
    """
    # TODO (MA): This function needs to be cleaned up if it needs to stay
    # but it will probably be removed soon if we do the validation checks
    # via dryruns of the steps.
    def is_unit_scalar(value):
        """Return True only if ``value`` converts to a float equal to 1."""
        # NOTE(review): a unit string such as "0.001" is therefore NOT
        # considered scalar here -- confirm this is intended.
        if value is None:
            return False
        try:
            x = float(value)
        except (ValueError, TypeError):
            # Not a number-like value (TypeError covers non-string,
            # non-numeric objects).
            return False
        return (x - 1) == 0

    errors = []
    for rule in self.rules:
        for input_collection in rule.inputs:
            try:
                filename = input_collection.files[0]
            except IndexError:
                # No files in this collection: stop checking the remaining
                # input collections of this rule.
                break
            # NOTE(review): ``fc`` is not defined in the visible part of
            # this module -- presumably imported elsewhere; verify.
            model_units = rule.get("model_unit") or fc.get(filename).units
            # NOTE(review): other code reads ``data_request_variable.unit``
            # (singular); confirm which attribute name is correct.
            cmor_units = rule.data_request_variable.units
            # NOTE(review): ``data_request_variables`` (plural) is used
            # here, ``data_request_variable`` above -- confirm.
            cmor_variable = rule.data_request_variables.get("cmor_variable")
            if model_units is None:
                if not (is_unit_scalar(cmor_units) or cmor_units == "%"):
                    errors.append(
                        ValueError(
                            f"dimensionless variables must have dimensionless units ({model_units} {cmor_units})"
                        )
                    )
            if is_unit_scalar(cmor_units):
                if not is_unit_scalar(model_units):
                    dimless = rule.get("dimensionless_unit_mappings", {})
                    if cmor_units not in dimless.get(cmor_variable, {}):
                        # Must be an exception instance: ``raise errors[0]``
                        # below cannot raise a plain string.
                        errors.append(
                            ValueError(
                                f"Missing mapping for dimensionless variable {cmor_variable}"
                            )
                        )
    if errors:
        for err in errors:
            logger.error(err)
        raise errors[0]

@classmethod
def from_dict(cls, data):
Expand Down Expand Up @@ -337,6 +429,7 @@ def from_dict(cls, data):
instance._post_init_populate_rules_with_tables()
instance._post_init_create_data_request()
instance._post_init_data_request_variables()
instance._post_init_read_dimensionless_unit_mappings()
return instance

def add_rule(self, rule):
Expand Down
Empty file added src/pymorize/data/__init__.py
Empty file.
7 changes: 7 additions & 0 deletions src/pymorize/data/dimensionless_mappings.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# In general (the top-level key is the CMOR variable name):
# cmor_variable_name:
#   cmor_unit_string: pint_friendly_SI_units
so:
"0.001": g/kg
sos:
"0.001": g/kg
1 change: 1 addition & 0 deletions src/pymorize/frequency.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,3 +171,4 @@ def for_name(cls, n):
]

# Adding a global reference to ALL frequencies
# Frequency.ALL = ALL
3 changes: 3 additions & 0 deletions src/pymorize/rule.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ class Rule:
def __init__(
self,
*,
name: str = None,
inputs: typing.List[dict] = [],
cmor_variable: str,
pipelines: typing.List[pipeline.Pipeline] = [],
Expand All @@ -41,6 +42,7 @@ def __init__(
data_request_variables : DataRequestVariable or None :
The DataRequestVariables this rule should create
"""
self.name = name
self.inputs = [InputFileCollection.from_dict(inp_dict) for inp_dict in inputs]
self.cmor_variable = cmor_variable
self._pipelines = pipelines or [pipeline.DefaultPipeline()]
Expand Down Expand Up @@ -185,6 +187,7 @@ def from_dict(cls, data):
A dictionary containing the rule data.
"""
return cls(
name=data.pop("name", None),
inputs=data.pop("inputs"),
cmor_variable=data.pop("cmor_variable"),
pipelines=data.pop("pipelines", []),
Expand Down
124 changes: 95 additions & 29 deletions src/pymorize/units.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,10 @@
In case the units in model files differ from CMIP Tables, this module attempts to
convert them automatically.
In case of missing units in either model files or CMIP Tables, this module
cannot convert from a dimensionless base to something with dimension. Such
cases have to be dealt with in the `action` section of the Rules module on a
per-variable basis.
Additionally, the cmip frequencies are mapped here. The CMIP6 frequency
names and corresponding number of days are available as a dictionary in the
``CMIP_FREQUENCIES`` variable. Assignment of these frequencies to the unit registry
can be done with the ``assign_frequency_to_unit_registry`` function.
Conversion to or from a dimensionless quantity is ambiguous. In this case,
provide a mapping that states what the dimensionless quantity represents; that
mapping is then used for the conversion. `data/dimensionless_mappings.yaml`
contains some examples of how the mapping is written.
"""

import re
Expand All @@ -29,19 +24,12 @@
import xarray as xr
from chemicals import periodic_table

from .frequency import CMIP_FREQUENCIES
from .logging import logger
from .rule import Rule

ureg = pint_xarray.unit_registry


def assign_frequency_to_unit_registry():
    """Define every CMIP6 frequency name as a unit in the unit registry.

    Each item of ``CMIP_FREQUENCIES`` (frequency name, number of days) is
    registered on ``ureg`` as ``<name> = <days> * d``.
    """
    for name, n_days in CMIP_FREQUENCIES.items():
        definition = f"{name} = {n_days} * d"
        ureg.define(definition)


def handle_chemicals(
s: Union[str, None] = None, pattern: Pattern = re.compile(r"mol(?P<symbol>\w+)")
):
Expand Down Expand Up @@ -106,31 +94,109 @@ def handle_unit_conversion(da: xr.DataArray, rule: Rule) -> xr.DataArray:
"""
if not isinstance(da, xr.DataArray):
raise TypeError(f"Expected xr.DataArray, got {type(da)}")
# data_request_variable needs to be defined at this point

drv = rule.data_request_variable
to_unit = drv.unit
dimless_mappings = rule.get("dimensionless_unit_mappings", {})

# Process model's unit (from_unit)
# --------------------------------
# (defined in the yaml file or in the original file)
model_unit = rule.get("model_unit")
from_unit = da.attrs.get("units")
# Overwrite model unit if defined in the yaml file
if model_unit is not None:
logger.debug(
f"using user defined unit ({model_unit}) instead of ({from_unit}) from DataArray "
logger.info(
f"using user defined unit ({model_unit}) instead of ({from_unit}) from the "
"original file"
)
from_unit = model_unit
# Raise error if unit is not defined anywhere
if not from_unit:
logger.error(
"Unit not defined neither in the original file nor in the yaml "
"configuration file. Please, define the unit for your data under "
f"rules.{rule.name}.model_unit"
)
raise ValueError("Unit not defined")

# Process table's unit (to_unit)
# ------------------------------
to_unit = drv.unit
cmor_variable_id = drv.variable_id
# Check for `to_unit` defined as `None`, `False`, empty string...
if not to_unit:
logger.error(
"Unit of CMOR variable '{cmor_variable_id}' not defined in the data "
f"request table/s {rule.tables}"
)
raise ValueError("Unit not defined")

# Check if the data request unit is a float
if unit_can_be_float(to_unit):
logger.debug(
f"Unit of CMOR variable '{cmor_variable_id}' can be a float: {to_unit}"
)
try:
_to_unit = dimless_mappings.get(cmor_variable_id, {})[to_unit]
except KeyError:
logger.error(
f"Dimensionless unit '{to_unit}' not found in mappings for "
f"CMOR variable '{cmor_variable_id}'"
)
raise KeyError("Dimensionless unit not found in mappings")
logger.info(
f"Converting units: ({da.name} -> {cmor_variable_id}) {from_unit} -> "
f"{to_unit}"
)
else:
_to_unit = to_unit
logger.info(
f"Converting units: ({da.name} -> {cmor_variable_id}) {from_unit} -> "
f"{_to_unit} ({to_unit})"
)

# Chemicals
# ---------
handle_chemicals(from_unit)
handle_chemicals(to_unit)
new_da = da.pint.quantify(from_unit)
logger.debug(f"Converting units: {from_unit} -> {to_unit}")
new_da = new_da.pint.to(to_unit).pint.dequantify()

# Unit conversion
# ---------------
try:
new_da = da.pint.quantify(from_unit)
new_da = new_da.pint.to(_to_unit).pint.dequantify()
except ValueError as e:
logger.error(
f"Unit conversion of '{cmor_variable_id}' from {from_unit} to {to_unit} "
f"({_to_unit}) failed: {e}"
)
raise ValueError(f"Unit conversion failed: {e}")

# Reset final unit to the original value as defined in the cmor table
if new_da.attrs.get("units") != to_unit:
logger.debug(
"Pint auto-unit attribute setter different from requested unit string, setting manually."
"Pint auto-unit attribute setter different from requested unit string "
f"({new_da.attrs.get('units')} vs {to_unit}). Setting manually."
)
new_da.attrs["units"] = to_unit
# Ensure a units attribute is present, default to None (this should never happen)

# Ensure a units attribute is present
if "units" not in new_da.attrs:
logger.warning(
"Units attribute not present in DataArray after conversion, please check carefully!"
logger.error("Units attribute not present in DataArray after conversion!")
raise AttributeError(
"Units attribute not present in DataArray after conversion!"
)
logger.warning("Setting to None")
new_da.attrs["units"] = None

return new_da


def unit_can_be_float(value):
    """Report whether ``value`` can be converted with ``float()``.

    Returns ``True`` when ``float(value)`` succeeds. When the conversion
    raises ``ValueError`` or ``TypeError``, the error is logged at debug
    level and ``False`` is returned.
    """
    try:
        float(value)
    except (ValueError, TypeError) as e:
        logger.debug(f"unit_can_be_float: {e}")
        return False
    return True
Loading

0 comments on commit 3042e47

Please sign in to comment.