diff --git a/doc/conf.py b/doc/conf.py
index a4ac21bd..9ffd4c1f 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -79,6 +79,7 @@
     "pandas": ("http://pandas.pydata.org/pandas-docs/stable/", None),
     "xarray": ("http://xarray.pydata.org/en/stable/", None),
     "chemicals": ("https://chemicals.readthedocs.io/", None),
+    "cerberus": ("https://docs.python-cerberus.org/", None),
 }
 
 
diff --git a/setup.py b/setup.py
index 20cae219..b0fa30bb 100644
--- a/setup.py
+++ b/setup.py
@@ -31,12 +31,12 @@ def read(filename):
     package_dir={"": "src"},
     packages=find_packages(where="src", exclude=("tests",)),
     install_requires=[
+        "cerberus",
         "cf_xarray",
         "chemicals",
         "click-loguru",
         "dask",
         "distributed",
-        "dill",
         "dpath",
         "pendulum",
         "pint-xarray",
@@ -50,6 +50,7 @@ def read(filename):
     extras_require={
         "dev": [
             "black",
+            "dill",
             "flake8",
             "isort",
             "pooch",
diff --git a/src/pymorize/cmorizer.py b/src/pymorize/cmorizer.py
index 670a884f..7111c243 100644
--- a/src/pymorize/cmorizer.py
+++ b/src/pymorize/cmorizer.py
@@ -9,6 +9,7 @@
 from .logging import logger
 from .pipeline import Pipeline
 from .rule import Rule
+from .validate import PIPELINES_VALIDATOR, RULES_VALIDATOR
 
 
 class CMORizer:
@@ -155,15 +156,22 @@ def from_dict(cls, data):
             pymorize_cfg=data.get("pymorize", {}),
             general_cfg=data.get("general", {}),
         )
+        if "rules" in data:
+            if not RULES_VALIDATOR.validate({"rules": data["rules"]}):
+                raise ValueError(RULES_VALIDATOR.errors)
         for rule in data.get("rules", []):
             rule_obj = Rule.from_dict(rule)
             instance.add_rule(rule_obj)
-        instance._post_init_populate_rules_with_tables()
-        instance._post_init_create_data_request()
-        instance._post_init_data_request_variables()
+        if "pipelines" in data:
+            if not PIPELINES_VALIDATOR.validate({"pipelines": data["pipelines"]}):
+                raise ValueError(PIPELINES_VALIDATOR.errors)
         for pipeline in data.get("pipelines", []):
             pipeline_obj = Pipeline.from_dict(pipeline)
             instance.add_pipeline(pipeline_obj)
+
+        instance._post_init_populate_rules_with_tables()
+        instance._post_init_create_data_request()
+        instance._post_init_data_request_variables()
         return instance
 
     def add_rule(self, rule):
diff --git a/src/pymorize/generic.py b/src/pymorize/generic.py
index a4995490..d3a3179b 100644
--- a/src/pymorize/generic.py
+++ b/src/pymorize/generic.py
@@ -156,13 +156,10 @@ def dummy_load_data(data, rule_spec, cmorizer, *args, **kwargs):
     """
     A dummy function for testing. Loads the xarray tutorial data
     """
-    allowed_input_sources = ["xr_tutorial"]
     logger.info("Loading data")
     input_source = rule_spec.get("input_source", "xr_tutorial")
     if input_source == "xr_tutorial":
         data = xr.tutorial.open_dataset("air_temperature")
-    else:
-        raise NotImplementedError(f"Only {allowed_input_sources} are supported for now")
     if rule_spec.get("input_type") == "xr.DataArray":
         data = getattr(data, rule_spec.get("da_name", "air"))
     return data
diff --git a/src/pymorize/validate.py b/src/pymorize/validate.py
new file mode 100644
index 00000000..e9b021a2
--- /dev/null
+++ b/src/pymorize/validate.py
@@ -0,0 +1,127 @@
+"""
+Provides validation of user configuration files by checking against a schema.
+"""
+
+import importlib
+
+from cerberus import Validator
+
+
+class PipelineValidator(Validator):
+    """
+    Validator for pipeline configuration.
+
+    See Also
+    --------
+    * https://cerberus-sanhe.readthedocs.io/customize.html#class-based-custom-validators
+    """
+
+    def _validate_is_qualname(self, is_qualname, field, value):
+        """Test if a string is an importable dotted Python name.
+
+        When the rule is enabled, ``value`` must be a string of the form
+        ``package.module.attribute`` whose module part can be imported and
+        exposes the attribute. Use a boolean for "is_qualname" in a schema.
+
+        The rule's arguments are validated against this schema:
+        {'type': 'boolean'}
+        """
+        if not is_qualname:
+            return
+        if not isinstance(value, str):
+            self._error(field, "Must be a string")
+            # Nothing further can be checked on a non-string value.
+            return
+        module_name, _, attr_name = value.rpartition(".")
+        if not module_name:
+            # A bare name has no module part to import.
+            self._error(field, "Must be a valid Python qualname")
+            return
+        try:
+            module = importlib.import_module(module_name)
+        except (ImportError, TypeError, ValueError):
+            # TypeError/ValueError cover relative or otherwise malformed names.
+            self._error(field, "Must be a valid Python qualname")
+            return
+        if not hasattr(module, attr_name):
+            self._error(field, "Must be a valid Python qualname")
+
+    def _validate(self, document):
+        # Run the parent check, then require one of "steps" or "uses".
+        # NOTE(review): confirm the Cerberus base class defines and calls
+        # ``_validate``; its documented entry point is ``validate``.
+        super()._validate(document)
+        if "steps" not in document and "uses" not in document:
+            self._error(
+                "document", 'At least one of "steps" or "uses" must be specified'
+            )
+
+
+PIPELINES_SCHEMA = {
+    "pipelines": {
+        "type": "list",
+        "schema": {
+            "type": "dict",
+            "schema": {
+                "name": {"type": "string", "required": False},
+                "uses": {"type": "string", "excludes": "steps"},
+                "steps": {
+                    "type": "list",
+                    "excludes": "uses",
+                    "schema": {"type": "string", "is_qualname": True},
+                },
+            },
+        },
+    },
+}
+"""dict : Schema for validating pipelines configuration."""
+
+PIPELINES_VALIDATOR = PipelineValidator(PIPELINES_SCHEMA)
+"""Validator : Validator for pipelines configuration."""
+
+RULES_SCHEMA = {
+    "rules": {
+        "type": "list",
+        "schema": {
+            "type": "dict",
+            "schema": {
+                "name": {"type": "string", "required": False},
+                "cmor_variable": {"type": "string", "required": True},
+                "input_type": {
+                    "type": "string",
+                    "required": False,
+                    "allowed": [
+                        "xr.DataArray",
+                        "xr.Dataset",
+                    ],
+                },
+                "input_source": {
+                    "type": "string",
+                    "required": False,
+                    "allowed": [
+                        "xr_tutorial",
+                    ],
+                },
+                "input_patterns": {
+                    "type": "list",
+                    "schema": {"type": "string"},
+                    "required": True,
+                },
+                "enabled": {"type": "boolean", "required": False},
+                "description": {"type": "string", "required": False},
+                "pipelines": {
+                    "type": "list",
+                    # FIXME(PG): Should cross-check with pipelines.
+                    "schema": {"type": "string"},
+                },
+                "cmor_units": {"type": "string", "required": False},
+                # FIXME(PS): How is it currently defined?
+                "model_units": {"type": "string", "required": False},
+            },
+        },
+    },
+}
+"""dict : Schema for validating rules configuration."""
+
+RULES_VALIDATOR = Validator(RULES_SCHEMA)
+"""Validator : Validator for rules configuration."""
diff --git a/tests/configs/test_config.yaml b/tests/configs/test_config.yaml
index 7b1c9e4a..73135b8b 100644
--- a/tests/configs/test_config.yaml
+++ b/tests/configs/test_config.yaml
@@ -31,3 +31,11 @@ rules:
     input_source: "xr_tutorial"
     input_patterns:
       - "test_input"
+  - name: test_rule3
+    enabled: false
+    input_patterns: ["/a/b/c"]
+    cmor_variable: "so"
+  - name: test_rule4
+    cmor_variable: "thetao"
+    pipelines: ["sleeper_pipeline"]
+    input_patterns: ["/a/b/c"]
diff --git a/tests/unit/test_validate.py b/tests/unit/test_validate.py
new file mode 100644
index 00000000..0f956c84
--- /dev/null
+++ b/tests/unit/test_validate.py
@@ -0,0 +1,49 @@
+import pytest
+
+from pymorize.validate import PIPELINES_SCHEMA, PipelineValidator
+
+
+@pytest.fixture
+def validator():
+    return PipelineValidator(PIPELINES_SCHEMA)
+
+
+def test_initialize(validator):
+    assert validator.schema == PIPELINES_SCHEMA
+
+
+def test_is_qualname(validator):
+    # Test with valid qualname
+    validator._validate_is_qualname(True, "field", "os.path.join")
+
+
+def test_is_qualname_error(validator):
+    # Test with invalid qualname
+    with pytest.raises(Exception):
+        validator._validate_is_qualname(True, "field", "non.existent.module")
+
+
+def test_validate(validator):
+    # Test with valid document
+    document = {"pipelines": [{"steps": ["os.path.join"]}]}
+    assert validator.validate(document)
+
+
+def test_validate_neither_steps_nor_uses(validator):
+    # Test with invalid document (neither 'steps' nor 'uses' specified)
+    document = {"name": "test"}
+    valid_document = validator.validate(document)
+    assert valid_document is False
+    # with pytest.raises(
+    #     Exception, match='At least one of "steps" or "uses" must be specified'
+    # ):
+    #     validator.validate(document)
+
+
+def test_validate_error_non_qualname(validator):
+    # Test with invalid pipeline configuration (invalid 'steps' qualname)
+    pipelines = {"pipelines": [{"name": "test", "steps": ["non.existent.module"]}]}
+    valid_document = validator.validate(pipelines)
+    assert valid_document is False
+    # with pytest.raises(Exception, match="Must be a valid Python qualname"):
+    #     validator.validate(pipelines)