# Method of ControlledVocabularies (src/pymorize/controlled_vocabularies.py).
# Relies on the module-level `json`, `re` and `requests` imports of that file.
@classmethod
def load_from_git(cls, tag: str = "6.2.58.73", *, timeout: float = 30.0):
    """Load the controlled vocabularies from the CMIP6_CVs git repository.

    Every file listed below is downloaded from raw.githubusercontent.com and
    stored under its file stem with the ``CMIP6_`` prefix removed, e.g.
    ``CMIP6_source_id.json`` ends up under the key ``"source_id"``.

    Parameters
    ----------
    tag : str
        The git tag to use. Default is 6.2.58.73
        If tag is None, the main branch is used.
    timeout : float
        Seconds to wait for each HTTP request before giving up.

    Returns
    -------
    ControlledVocabularies
        A new ControlledVocabularies object, behaves like a dictionary.

    Raises
    ------
    requests.HTTPError
        If one of the files cannot be downloaded.
    """
    # raw.githubusercontent.com addresses content as <repo>/<ref>/<path>;
    # the "blob/" prefix only exists on github.com HTML pages.
    ref = "refs/heads/main" if tag is None else f"refs/tags/{tag}"
    url = f"https://raw.githubusercontent.com/WCRP-CMIP/CMIP6_CVs/{ref}"
    filenames = (
        "CMIP6_DRS.json",
        "CMIP6_activity_id.json",
        "CMIP6_experiment_id.json",
        "CMIP6_frequency.json",
        "CMIP6_grid_label.json",
        "CMIP6_institution_id.json",
        "CMIP6_license.json",
        "CMIP6_nominal_resolution.json",
        "CMIP6_realm.json",
        "CMIP6_required_global_attributes.json",
        "CMIP6_source_id.json",
        "CMIP6_source_type.json",
        "CMIP6_sub_experiment_id.json",
        "CMIP6_table_id.json",
        "mip_era.json",
    )
    # The named group is what the loop reads back; without "<name>" the
    # pattern does not even compile.
    name_pattern = re.compile(r"^(?:CMIP6_)?(?P<name>[^\.]+)\.json$").match
    data = {}
    for fname in filenames:
        name = name_pattern(fname).group("name")
        # A timeout keeps an unresponsive server from hanging the caller.
        response = requests.get(f"{url}/{fname}", timeout=timeout)
        response.raise_for_status()
        content = json.loads(response.content.decode())
        # Each file wraps its payload in a top-level key equal to `name`.
        data[name] = content.get(name)
    obj = cls([])
    obj.update(data)
    return obj
# global_attributes.py
"""Build the CMIP6 global attributes of a dataset from a rule and the CVs."""

import functools
import json
import logging
import re
from pathlib import Path

logger = logging.getLogger(__name__)

_parent_fields = (
    "branch_method",
    "branch_time_in_child",
    "branch_time_in_parent",
    "parent_experiment_id",
    "parent_mip_era",
    "parent_source_id",
    "parent_time_units",
    "parent_variant_label",
)


defaults = {
    "institution_id": "AWI",
    "license_type": "CC BY-SA 4.0",
    "maintainer_url": None,
}

# Strict "r<int>i<int>p<int>f<int>" matcher; used with fullmatch so that no
# leading or trailing characters (not even a newline) are tolerated.
_VARIANT_LABEL = re.compile(
    r"r(?P<realization_index>\d+)"
    r"i(?P<initialization_index>\d+)"
    r"p(?P<physics_index>\d+)"
    r"f(?P<forcing_index>\d+)"
)


@functools.lru_cache(maxsize=None)
def _load_cv():
    """Download the controlled vocabularies once and cache the result."""
    # Imported here so that importing this module neither touches the
    # network nor requires the CV loader until the CVs are actually needed.
    from .controlled_vocabularies import ControlledVocabularies

    return ControlledVocabularies.load_from_git()


def __getattr__(name):
    """Lazily serve the module attributes ``cv`` and ``required_global_attributes``.

    Both used to be computed at import time, which triggered the download of
    all CV files on import. They are still reachable under the same names.
    """
    if name == "cv":
        return _load_cv()
    if name == "required_global_attributes":
        return _load_cv()["required_global_attributes"]
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


def set_global_attributes(ds, rule):
    """Collect the global attributes for ``rule`` and store them on ``ds``.

    Parameters
    ----------
    ds
        Object with a dict-like ``attrs`` attribute.
        NOTE(review): presumably an xarray Dataset, as in the unit test.
    rule
        Object offering ``get(name)`` for user settings and a
        ``data_request_variable`` with ``variable_id`` and ``table``.

    Returns
    -------
    The same ``ds`` object, with ``ds.attrs`` updated in place.

    Raises
    ------
    ValueError
        If the variant label is malformed, or if ``institution_id`` or
        ``activity_id`` cannot be determined unambiguously.
    """
    cv = _load_cv()
    gattrs = {}
    update_variant_label(rule.get("variant_label"), gattrs)
    gattrs["variable_id"] = rule.data_request_variable.variable_id
    gattrs["table_id"] = rule.data_request_variable.table.table_id
    _update_global_attributes_from_table_header(gattrs, rule)

    gattrs["source_id"] = source_id = rule.get("source_id")
    source_id_cv = cv["source_id"][source_id]
    _institution_id = source_id_cv.get("institution_id")
    if len(_institution_id) > 1:
        # Several institutions share this source_id: the user has to choose.
        institution_ids = ", ".join(_institution_id)
        institution_id = rule.get("institution_id")
        if institution_id is None:
            raise ValueError(
                f"institution_id -- {institution_ids} -- has multiple value for source_id {source_id}."
            )
        if institution_id not in _institution_id:
            # Explicit raise instead of `assert`, which vanishes under -O.
            raise ValueError(
                f"institution_id {institution_id} is not valid for source_id {source_id}; "
                f"expected one of: {institution_ids}."
            )
    else:
        institution_id = _institution_id[0]
    gattrs["institution_id"] = institution_id

    license_type = source_id_cv["license_info"]["id"]
    further_info_url = rule.get("further_info_url")
    update_license(gattrs, cv, institution_id, license_type, further_info_url)

    gattrs["source"] = source = rule.get("source")  # model_component
    component = source_id_cv["model_component"][source]
    gattrs["grid"] = component["description"]
    gattrs["nominal_resolution"] = component["native_nominal_resolution"]
    gattrs["source_type"] = rule.get("source_type")

    experiment_id = rule.get("experiment_id")
    # Looked up unconditionally: it is needed below even when the rule
    # supplies its own activity_id.
    _experiment_id_cv = cv.get("experiment_id", {}).get(experiment_id, {})
    activity_id = rule.get("activity_id", None)
    if activity_id is None:
        candidates = _experiment_id_cv.get("activity_id", [])
        if not candidates:
            raise ValueError(f"no activity_id found for experiment_id {experiment_id}")
        if len(candidates) > 1:
            activity_ids = ", ".join(candidates)
            raise ValueError(
                f"activity_id -- {activity_ids} -- has multiple value for experiment_id {experiment_id}."
            )
        activity_id = candidates[0]
    gattrs["activity_id"] = activity_id
    gattrs["experiment"] = _experiment_id_cv.get("experiment", "")
    gattrs["experiment_id"] = experiment_id
    # ignore parent_experiment_id for now, in the first iteration
    gattrs["sub_experiment"] = rule.get("sub_experiment", "")
    gattrs["sub_experiment_id"] = _experiment_id_cv.get("sub_experiment_id")

    ds.attrs.update(gattrs)
    return ds


def _update_global_attributes_from_table_header(gattrs, rule):
    """Copy the table-header entries that are global attributes into ``gattrs``."""
    header = rule.data_request_variable.table._data["Header"]
    for key in ("data_specs_version", "Conventions", "mip_era", "realm", "product"):
        gattrs[key] = header[key]


def parse_variant_label(label: str) -> dict:
    """Extract the index values from a variant label.

    ``label`` must be of the form "r<int>i<int>p<int>f<int>", e.g. "r1i1p1f1".

    Returns a dict with the integer values of ``realization_index``,
    ``initialization_index``, ``physics_index`` and ``forcing_index``.

    Raises ``ValueError`` for anything else, including ``None``.
    """
    match = _VARIANT_LABEL.fullmatch(label) if isinstance(label, str) else None
    if match is None:
        raise ValueError(
            f"`label` must be of the form 'r<int>i<int>p<int>f<int>', Got: {label}"
        )
    return {name: int(val) for name, val in match.groupdict().items()}


def update_variant_label(label: str, gattrs: dict) -> dict:
    """Add ``variant_label`` and its four indices to ``gattrs`` and return it."""
    gattrs.update(parse_variant_label(label))
    gattrs["variant_label"] = label
    return gattrs


def update_license(
    gattrs: dict,
    cv: dict,
    institution_id: str = None,
    license_type: str = None,
    further_info_url: str = None,
):
    """
    Updates the license attribute in the global attributes dictionary.

    The template ``cv["license"]["license"]`` contains ``<...>`` placeholders
    which are filled, in order, with the institution, the license id, the
    license url and (only when given) the maintainer url. Without a
    maintainer url the optional ``[...]`` part of the template is dropped.

    Args:
        gattrs (dict): The global attributes dictionary to update.
        cv (dict): The controlled vocabulary dictionary.
        institution_id (str, optional): The institution ID. Defaults to
            ``defaults["institution_id"]``.
        license_type (str, optional): The license type. Defaults to
            ``defaults["license_type"]``.
        further_info_url (str, optional): The maintainer URL. Defaults to
            ``defaults["maintainer_url"]``.

    Returns:
        None

    Raises:
        KeyError: If the institution or license type is missing from ``cv``.
    """
    institution_id = institution_id or defaults.get("institution_id")
    license_type = license_type or defaults.get("license_type")
    # `defaults` stores this value under "maintainer_url".
    further_info_url = further_info_url or defaults.get("maintainer_url")
    logger.debug("institution_id=%r", institution_id)
    logger.debug("license_type=%r", license_type)
    logger.debug("further_info_url=%r", further_info_url)

    lic = cv["license"]
    license_text = lic["license"]
    option = lic["license_options"][license_type]
    fillers = [cv["institution_id"][institution_id], option["license_id"], option["license_url"]]
    if further_info_url is None:
        logger.debug(
            "Removing placeholder for maintainer url from license text as it is not provided."
        )
        license_text = re.sub(r"\[.*?\]", "", license_text)
    else:
        fillers.append(further_info_url)

    logger.debug(
        "Creating place-holders in license template found in CMIP6_license.json"
    )
    text = re.sub(r"<.*?>", "{}", license_text).format(*fillers)
    logger.debug("License: %s", text)
    gattrs["license"] = text


# Underscore-prefixed names kept for backward compatibility.
_parse_variant_label = parse_variant_label
_update_variant_label = update_variant_label
_update_license = update_license
| | variant_label | | +| 19 | product | x | table | model-output | +| 20 | realization_index | x | derived from | 1 | +| | | | variant_label | | +| 21 | realm | x | table | ocean | +| 22 | source | | | AWI-CM-1-1-MR | +| 23 | source_id | | | AWI-CM-1-1-MR | +| 24 | source_type | | | AOGCM | +| 25 | sub_experiment | | | none | +| 26 | sub_experiment_id | | | none | +| 27 | table_id | x | USER / use | Omon | +| | | | all matching | | +| | | | tables | | +| 28 | tracking_id | | | hdl:21.14100/84bfc093-b0a3-44ee-b733-91239b6fa6b2 | +| 29 | variable_id | x | USER | fgco2 | +| 30 | variant_label | x | USER | r1i1p1f1 | + + + +EXAMPLE + - further_info_url: "https://furtherinfo.es-doc.org/CMIP6.AWI.AWI-CM-1-1-MR.piControl.none.r1i1p1f1" ; + - grid: "FESOM 1.4 (unstructured grid in the horizontal with 830305 wet nodes; 46 levels; top grid cell 0-5 m)" + - institution: "Alfred Wegener Institute, Helmholtz Centre for Polar and Marine Research, Am Handelshafen 12, 27570 Bremerhaven, Germany" + - license: "CMIP6 model data produced by Alfred Wegener Institute, Helmholtz + Centre for Polar and Marine Research, Am Handelshafen 12, 27570 Bremerhaven, + Germany is licensed under a Creative Commons Attribution-ShareAlike 4.0 + International License (https://creativecommons.org/licenses/). Consult + https://pcmdi.llnl.gov/CMIP6/TermsOfUse for terms of use governing CMIP6 + output, including citation requirements and proper acknowledgment. Further + information about this data, including some limitations, can be found via + the further_info_url (recorded as a global attribute in this file). The data + producers and data providers make no warranty, either express or implied, + including, but not limited to, warranties of merchantability and fitness for + a particular purpose. All liabilities arising from the supply of the + information (including any liability arising in negligence) are excluded to + the fullest extent permitted by law." 
"""Tests for pymorize.global_attributes.

Covers the variant-label and license helpers
(tests/test_global_attributes.py) and the presence of every required global
attribute (tests/unit/test_global_attributes.py).
"""

import pytest

import pymorize.global_attributes as ga
from pymorize.global_attributes import (
    parse_variant_label,
    set_global_attributes,
    update_variant_label,
)


def _indices(r, i, p, f):
    """Build the dict expected from parsing the label r<r>i<i>p<p>f<f>."""
    return {
        "realization_index": r,
        "initialization_index": i,
        "physics_index": p,
        "forcing_index": f,
    }


# --- variant label ---------------------------------------------------------

simple_cases = [
    # (label, expected)
    ("r1i2p3f4", _indices(1, 2, 3, 4)),
    ("r10i20p30f40", _indices(10, 20, 30, 40)),
    ("r0i0p0f0", _indices(0, 0, 0, 0)),
]


@pytest.mark.parametrize("label, expected", simple_cases)
def test_parse_variant_label_realistic_labels(label, expected):
    result = parse_variant_label(label)
    assert result == expected, f"Failed Test ID: {label}"


# Leading zeros are accepted and dropped by the int conversion.
edge_cases = [
    ("r01i02p03f04", _indices(1, 2, 3, 4)),
    ("r001i0002p0003f0004", _indices(1, 2, 3, 4)),
]


@pytest.mark.parametrize("label, expected", edge_cases)
def test_parse_variant_label_edge_cases(label, expected):
    result = parse_variant_label(label)
    assert result == expected, f"Failed Test ID: {label}"


error_cases = [
    "r1i2p3",
    "r1i2p3f",
    "1i2p3f4",
    "r1i2p3f4x",
    "",
    None,
    # negative indices not supported. should they be?
    "r-1i-2p-3f-4",
    # strict match, no trailing extra characters
    "r1i2p3f4a0b1",
    # strict match, no leading extra characters
    "c2d2r1i2p3f4",
    # strict match, no leading or trailing extra characters
    "c2d2r1i2p3f4a0b1",
    # no spaces
    "r1 i2 p3 f4",
    "r 1 i 2 p 3 f 4",
]


@pytest.mark.parametrize("label", error_cases)
def test_parse_variant_label_error_cases(label):
    with pytest.raises(ValueError):
        parse_variant_label(label)


def test_update_variant_label_adds_label_to_gattrs():
    label = "r10i20p30f40"
    d = {}
    update_variant_label(label=label, gattrs=d)
    assert label == d["variant_label"]


def test_update_variant_label_overrides_existing_label():
    label = "r10i20p30f40"
    d = dict(_indices(1, 2, 3, 4), variant_label="r1i2p3f4")
    update_variant_label(label=label, gattrs=d)
    assert d["variant_label"] == label
    assert d["realization_index"] == 10
    assert d["initialization_index"] == 20
    assert d["physics_index"] == 30
    assert d["forcing_index"] == 40


# --- license ---------------------------------------------------------------


def test_update_license_with_no_extra_arguments():
    cv = {
        "institution_id": {"AWI": "AWI"},
        "license": {
            # NOTE(review): the <...> placeholders are reconstructed; the
            # license helper replaces each of them, and the assertions below
            # can only hold if the template contains them.
            "license": (
                "CMI6 model <institution> under license <license_id> License "
                "(<license_url>). further_info_url [and at <further_info_url>]. "
                "yata yata yata."
            ),
            "license_options": {
                "CC BY-SA 4.0": {
                    "license_id": "Creative Common",
                    "license_url": "https://cc.org",
                }
            },
        },
    }
    d = {}
    ga.update_license(d, cv)
    assert "license" in d
    assert "Creative Common" in d["license"]
    assert "AWI" in d["license"]


# --- required global attributes -------------------------------------------

# (name, expected pass) -- a list, so that the parametrization order is stable.
required_attributes = [
    ("activity_id", True),
    ("Conventions", True),
    ("creation_date", True),
    ("data_specs_version", True),
    ("experiment", True),
    ("experiment_id", True),
    ("forcing_index", True),
    ("frequency", True),
    ("further_info_url", True),
    ("grid", True),
    ("grid_label", True),
    ("initialization_index", True),
    ("institution", True),
    ("institution_id", True),
    ("license", True),
    ("mip_era", True),
    ("nominal_resolution", True),
    ("physics_index", True),
    ("product", True),
    ("realization_index", True),
    ("realm", True),
    ("source", True),
    ("source_id", True),
    ("source_type", True),
    ("sub_experiment", True),
    ("sub_experiment_id", True),
    ("table_id", True),
    ("tracking_id", False),
    ("variable_id", True),
    ("variant_label", True),
]


@pytest.mark.parametrize("added_attributes, expected_pass", required_attributes)
def test_global_attributes_has_expected_attributes(
    pi_uxarray_temp_rule, pi_uxarray_data, added_attributes, expected_pass
):
    # `xr` was used without being imported; load it here so that only this
    # test depends on xarray being installed.
    xr = pytest.importorskip("xarray")
    if not expected_pass:
        pytest.xfail(f"Test should fail with attribute {added_attributes}")
    matching_files = [
        f
        for f in (pi_uxarray_data / "outdata/fesom/").iterdir()
        if f.name.startswith("temp.fesom")
    ]
    ds = xr.open_mfdataset(
        matching_files,
        engine="h5netcdf",
    )
    ds_out = set_global_attributes(ds, pi_uxarray_temp_rule)
    assert added_attributes in ds_out.attrs