From e4c50143523508013ab20e5d219e9bd8491998db Mon Sep 17 00:00:00 2001 From: dcherian Date: Tue, 3 Aug 2021 10:48:54 -0600 Subject: [PATCH 1/3] Add attribute tracking functionality --- cf_xarray/__init__.py | 2 ++ cf_xarray/tracking.py | 68 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100644 cf_xarray/tracking.py diff --git a/cf_xarray/__init__.py b/cf_xarray/__init__.py index f80585e6..d9afa399 100644 --- a/cf_xarray/__init__.py +++ b/cf_xarray/__init__.py @@ -1,6 +1,8 @@ +from . import tracking # noqa from .accessor import CFAccessor # noqa from .helpers import bounds_to_vertices, vertices_to_bounds # noqa from .options import set_options # noqa +from .tracking import track_cf_attributes # noqa from .utils import _get_version __version__ = _get_version() diff --git a/cf_xarray/tracking.py b/cf_xarray/tracking.py new file mode 100644 index 00000000..8d11e08a --- /dev/null +++ b/cf_xarray/tracking.py @@ -0,0 +1,68 @@ +# This module provides functions for adding CF attribtues +# and tracking history, provenance using xarray's keep_attrs +# functionality + +import copy +import functools + +CELL_METHODS = { + "sum": "sum", + "max": "maximum", + "min": "minimum", + "median": "median", + "mean": "mean", + "std": "standard_deviation", + "var": "variance", +} + + +def add_cell_methods(attrs, context): + """Add appropriate cell_methods attribute.""" + assert len(attrs) == 1 + cell_methods = attrs[0].get("cell_methods", "") + # TODO: get dim_name from context + return {"cell_methods": f"dim_name: {CELL_METHODS[context.func]} {cell_methods}"} + + +def add_history(attrs, context): + return {"history": None} + + +def _tracker( + attrs, + context, + strict: bool = False, + cell_methods: bool = True, + history: bool = True, +): + + # can only handle single variable attrs for now + assert len(attrs) == 1 + attrs_out = copy.deepcopy(attrs[0]) + + if cell_methods and context.func in CELL_METHODS: + attrs_out.update(add_cell_methods(attrs, context)) + if history: + attrs_out.update(add_history(attrs, context)) + pass + return attrs_out + + +def track_cf_attributes( + *, strict: bool = False, cell_methods: bool = True, history: bool = True +): + """Top-level user-facing function. + + Parameters + ---------- + strict: bool + Controls if an error is raised when an appropriate attribute cannot + be added because of lack of information. + cell_methods: bool + Add cell_methods attribute when possible + history: bool + Adds a history attribute like NCO and follows the NUG convention. + """ + return functools.partial( + _tracker, strict=strict, cell_methods=cell_methods, history=history + ) From e972ddedebe6f09b33e504c8c2ce7b9c458f7e9f Mon Sep 17 00:00:00 2001 From: dcherian Date: Tue, 3 Aug 2021 16:50:38 -0600 Subject: [PATCH 2/3] Update history attribute --- cf_xarray/tracking.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/cf_xarray/tracking.py b/cf_xarray/tracking.py index 8d11e08a..e559774a 100644 --- a/cf_xarray/tracking.py +++ b/cf_xarray/tracking.py @@ -4,6 +4,7 @@ import copy import functools +from datetime import datetime CELL_METHODS = { "sum": "sum", @@ -25,7 +26,23 @@ def add_cell_methods(attrs, context): def add_history(attrs, context): - return {"history": None} + """Adds a history attribute following the NetCDF User Guide convention.""" + + # https://www.unidata.ucar.edu/software/netcdf/documentation/4.7.4-pre/attribute_conventions.html + # A global attribute for an audit trail. This is a character array with a line + # for each invocation of a program that has modified the dataset. Well-behaved + # generic netCDF applications should append a line containing: + # date, time of day, user name, program name and command arguments. + + # nco uses the ctime format + now = datetime.now().ctime() + history = attrs[0].get("history", []) + new_history = ( + f"{now}:" + f" {context.func}(args)\n" + # TODO: should we record software versions? + ) + return {"history": history + [new_history]} def _tracker( @@ -63,6 +80,8 @@ def track_cf_attributes( history: bool Adds a history attribute like NCO and follows the NUG convention. """ + + # TODO: check xarray version here. return functools.partial( _tracker, strict=strict, cell_methods=cell_methods, history=history ) From 4b3864efc69c4ab11db1ce1b085a673b7736ea94 Mon Sep 17 00:00:00 2001 From: David Huard Date: Fri, 3 Sep 2021 14:44:32 -0400 Subject: [PATCH 3/3] added a basic provenance tracking prototype --- cf_xarray/tests/test_options.py | 12 +- cf_xarray/tracking.py | 223 +++++++++++++++++++++++++++++++- 2 files changed, 229 insertions(+), 6 deletions(-) diff --git a/cf_xarray/tests/test_options.py b/cf_xarray/tests/test_options.py index 786ac337..2b3e9ef6 100644 --- a/cf_xarray/tests/test_options.py +++ b/cf_xarray/tests/test_options.py @@ -3,7 +3,7 @@ """ import pytest - +import xarray as xr import cf_xarray as cfxr @@ -12,3 +12,13 @@ def test_options(): # test for inputting a nonexistent option with pytest.raises(ValueError): cfxr.set_options(DISPLAY_WIDTH=80) + + +def test_tracker(): + da = xr.DataArray([1, 2, 3], dims="time", name="my_array", attrs={"comment": "A comment"}) + da = da.chunk({"time": 3}) + with xr.set_options(keep_attrs=cfxr.track_cf_attributes(cell_methods=True, history=True)): + out = da.mean("time") + print(out.__dask_graph__()) + assert out.attrs["cell_methods"] == "time: mean" + assert out.attrs["history"] == "mean(dim='time', skipna=None)" diff --git a/cf_xarray/tracking.py b/cf_xarray/tracking.py index e559774a..3f4639fa 100644 --- a/cf_xarray/tracking.py +++ b/cf_xarray/tracking.py @@ -5,6 +5,11 @@ import copy import functools from datetime import datetime +from rdflib import Graph, URIRef, Namespace, Literal +from rdflib.namespace import RDF, RDFS +import xarray as xr + +PROV_KEY = "__prov__" CELL_METHODS = { "sum": "sum", @@ -16,13 +21,26 @@ "var": "variance", } +def call_signature(func, **kwargs): + callargstr = [] + + for (k, v) in kwargs.items(): + if isinstance(v, (xr.DataArray)): + callargstr.append(f"{k}=") + elif isinstance(v, (float, int, str)): + callargstr.append(f"{k}={v!r}") # repr so strings have ' ' + else: + # don't take chance of having unprintable values + callargstr.append(f"{k}={type(v)}") + + return f"{func.__name__}({callargstr})" + def add_cell_methods(attrs, context): """Add appropriate cell_methods attribute.""" assert len(attrs) == 1 cell_methods = attrs[0].get("cell_methods", "") - # TODO: get dim_name from context - return {"cell_methods": f"dim_name: {CELL_METHODS[context.func]} {cell_methods}"} + return {"cell_methods": f"context.dim: {CELL_METHODS[context.func]} {cell_methods}".strip()} def add_history(attrs, context): @@ -37,6 +55,7 @@ def add_history(attrs, context): # nco uses the ctime format now = datetime.now().ctime() history = attrs[0].get("history", []) + new_history = ( f"{now}:" f" {context.func}(args)\n" @@ -45,12 +64,49 @@ def add_history(attrs, context): return {"history": history + [new_history]} +def init_graph(): + """Create empty graph and bind namespaces.""" + g = Graph() + # The metaclip ontology + DS = Namespace("http://www.metaclip.org/datasource/datasource.owl#") + g.namespace_manager.bind("ds", DS) + + # The namespace describing xarray objects (not sure if this is standard) + XR = Namespace("xarray:") + g.namespace_manager.bind("xr", XR) + return g + + +def add_provenance(attrs, context): + """Add provenance information related to the operational context.""" + # Fetch the DataArray graph and instantiate namespaces. + g = attrs[0].get(PROV_KEY, init_graph()) + ns = dict(g.namespaces()) + XR = Namespace(ns["xr"]) + DS = Namespace(ns["ds"]) + + # Creating vertex for the function itself + cmd = XR[f"call:{context.func}"] + # For now it's just a generic command, but there could be an Ontology defined for xarray functions giving more + # information on what they're doing. + # Also, this is limited because we don't know the arguments to the function nor the dimension it operates on. + g.add((cmd, RDF.type, DS.Command)) + + # Linking that function to the DataArray + # Unclear how we know exactly which DataArray this command operates on. + # Cheating a bit here... + ref = attrs[0]["__prov_da_id__"] + g.add((ref, DS.hadCommandCall, cmd)) + return {PROV_KEY: g} + + def _tracker( attrs, context, strict: bool = False, cell_methods: bool = True, history: bool = True, + prov: bool = True ): # can only handle single variable attrs for now @@ -61,12 +117,14 @@ def _tracker( attrs_out.update(add_cell_methods(attrs, context)) if history: attrs_out.update(add_history(attrs, context)) - pass + if prov: + attrs_out.update(add_provenance(attrs, context)) + return attrs_out def track_cf_attributes( - *, strict: bool = False, cell_methods: bool = True, history: bool = True + *, strict: bool = False, cell_methods: bool = True, history: bool = True, prov: bool = True ): """Top-level user-facing function. @@ -79,9 +137,164 @@ def track_cf_attributes( Add cell_methods attribute when possible history: bool Adds a history attribute like NCO and follows the NUG convention. + prov: bool + Add provenance information to an RDF graph. """ # TODO: check xarray version here. return functools.partial( - _tracker, strict=strict, cell_methods=cell_methods, history=history + _tracker, strict=strict, cell_methods=cell_methods, history=history, prov=prov ) + + +def track_provenance_with_rdflib(ds, varname): + """Create provenance document.""" + prov = ds.attrs.get("has_provenance") + if prov is not None: + raise NotImplementedError + + g = init_graph() + ns = dict(g.namespaces()) + XR = Namespace(ns["xr"]) + DS = Namespace(ns["ds"]) + + # Each vertex has an identifier in the graph + e = XR[f"ds:{id(ds)}"] # Creates a URIRef + + # Here we add an RDF triplet (subject, predicate, object) + # What the next line does is tell the graph entity `e` has type `ds:Dataset` + g.add((e, RDF.type, DS.Dataset)) + + if "project_id" in ds.attrs: + label = ds.attrs["project_id"] + ref = XR[f"project:{label}"] + g.add((ref, RDF.type, DS.Project)) + g.add((e, DS.hadProject, ref)) + g.add((ref, RDFS.label, Literal(label))) + + if "institute_id" in ds.attrs: + label = ds.attrs["institute_id"] + ref = XR[f"institute:{label.replace(' ', '_')}"] + g.add((ref, RDF.type, DS.ModellingCenter)) + g.add((e, DS.hadModellingCenter, ref)) + g.add((ref, RDFS.label, Literal(label))) + + # Add vertex for the variable + key = varname + da = ds[key] + # Copy or deepcopy does not make an independent object. The copies still link to the original graph. + # This will look weird if we want to assign a provenance graph to each variable (they'll all be identical) + da.attrs[PROV_KEY] = vg = copy.copy(g) + v = XR[f"da:{key}:{id(da)}"] + da.attrs["__prov_da_id__"] = v + # Create DatasetSubset + vg.add((v, RDF.type, DS.DatasetSubset)) + vg.add((e, DS.hadDatasetSubset, v)) + # Create Variable + vg.add((v, DS.hasVariable, XR[key])) + vg.add((XR[key], RDF.type, DS.Variable)) + vg.add((XR[key], RDFS.label, Literal(key))) + if "units" in da.attrs: + vg.add((XR[key], DS.withUnits, Literal(da.attrs["units"]))) + # TODO: add info about temporal and spatial extent + + return ds + + +def track_provenance_with_prov(ds, varname): + """Not working for now.""" + import prov + from prov.model import ProvDocument + from prov.identifier import Namespace + from uuid import uuid4 + # Create an xarray namespace for what happens here + XARRAY = Namespace("xarray", uri="urn:xarray:") + + def get_record(label, klass, ns={}): + """Search namespaces to find a class instance with the given label. + + Use the output to create a new provenance entity or activity. + """ + # TODO: Search into ns + # Default when label is not found + identifier = XARRAY[f"{klass}.{uuid4()}"] + attributes = {prov.model.PROV_LABEL: label, + prov.model.PROV_TYPE: klass} + # PROV class, identifier, None, attributes + return dict(identifier=identifier, other_attributes=attributes) + + # Create the provenance document + doc = ProvDocument() + + # Identify namespaces, here we're using the METACLIP ontologies + ns = {"ds": "http://www.metaclip.org/datasource/datasource.owl#", + "ipcc": "http://www.metaclip.org/ipcc_terms/ipcc_terms.owl#", + "veri": "http://www.metaclip.org/verification/verification.owl#", + "cal": "http://www.metaclip.org/calibration/calibration.owl#", + "go": "http://www.metaclip.org/graphical_output/graphical_output.owl#"} + for key, uri in ns.items(): + doc.add_namespace(key, uri) + + # Create a `Dataset` entity with an identifier that uniquely identifies this object + # I suppose this could be a __hash__ + ds_id = id(ds) + + # ds:Dataset is a subclass of entity + e = doc.entity(XARRAY[f"dataset_{ds_id}"], + {prov.model.PROV_TYPE: "ds:Dataset"}) + + # Add attributes + # Some attributes might have a corresponding node in the ontology. In that case, we want to link it here. + # Otherwise, we create an new `instance` of the attribute class. + # TODO: these attributes are CMIP5 specific. CMIP6 has slight differences in how attributes are named. Ideally, + # users could create mappings from dataset attributes to ontology classes. More ideally, an inference engine + # could do this mapping automatically. + if "project_id" in ds.attrs: + label = ds.attrs["project_id"] + # Project is a subclass of prov:activity + a = doc.activity(**get_record(label, "ds:Project", ns)) + # ds:hadProject is a sub property of prov:wasGeneratedBy + e.wasGeneratedBy(a, attributes={prov.model.PROV_TYPE: "ds:hadProject"}) + + if "institute_id" in ds.attrs: + label = ds.attrs["institute_id"] + # ModellingCenter is a subclass of prov.Organization + a = doc.agent(**get_record(label, "ds:ModellingCenter", ns)) + e.wasAttributedTo(a) + + # ... + + for key in ds.data_vars: + # ds:DatasetSubset is a subclass of ds: Step, which is a subclass of prov:Derivation + # A variable is a prov:Entity + # ds:hasVariable is a property + da = ds[key] + da.attrs[PROV_KEY] = vdoc = copy.copy(doc) + identifier = id(da) + se = vdoc.entity(XARRAY[f"subset_{identifier}"], + {prov.model.PROV_TYPE: "ds:DatasetSubset"}) + se.wasDerivedFrom(e, {prov.model.PROV_TYPE: "ds:hadDatasetSubset"}) + try: + v = vdoc.entity(XARRAY[f"dataarray_{identifier}"], + {prov.model.PROV_TYPE: "ds:Variable", + prov.model.PROV_LABEL: key, + "ds:withUnits": da.attrs["units"], + }) + # ... ? + except KeyError: + pass + + # Don't know how to make an edge with ds:hasVariable + + +def test_prov_tracking(): + ds = xr.open_dataset("/home/david/data/cmip5/pr_Amon_GFDL-CM3_historical_r1i1p1_186001-186412.nc") + + # Create RDF graph in attribute '__prov__' + track_provenance_with_rdflib(ds, "pr") + + # Run operation + with xr.set_options(keep_attrs=track_cf_attributes(prov=True)): + ds.pr.mean(dim="time") + + print(ds.pr.__prov__.serialize())