diff --git a/CHANGELOG.md b/CHANGELOG.md
index 24d6f495..fc364ccd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,7 +16,9 @@
 
 ### Changed
 
+- Remove `conda_env` and `model_name` arguments from `MlflowPipelineHook` and add them to `PipelineML` and `pipeline_ml`. This is necessary for incoming hook auto-discovery in a future release and it enables having multiple `PipelineML` in the same project. ([#58](https://github.com/Galileo-Galilei/kedro-mlflow/pull/58))
+- `flatten_dict_params`, `recursive` and `sep` arguments of the `MlflowNodeHook` are moved to the `mlflow.yml` config file to prepare plugin auto-registration. This also modifies the `run.py` template (to remove the args) and the `mlflow.yml` keys to add a `hooks` entry. ([#59](https://github.com/Galileo-Galilei/kedro-mlflow/pull/59))
 
 ## [0.2.1] - 2020-08-06
 
 
diff --git a/kedro_mlflow/framework/context/config.py b/kedro_mlflow/framework/context/config.py
index 540efd6b..37e178e0 100644
--- a/kedro_mlflow/framework/context/config.py
+++ b/kedro_mlflow/framework/context/config.py
@@ -17,6 +17,8 @@ class KedroMlflowConfig:
 
     UI_OPTS = {"port": None, "host": None}
 
+    NODE_HOOK_OPTS = {"flatten_dict_params": False, "recursive": True, "sep": "."}
+
     def __init__(
         self,
         project_path: Union[str, Path],
@@ -24,6 +26,7 @@ def __init__(
         experiment_opts: Union[Dict[str, Any], None] = None,
         run_opts: Union[Dict[str, Any], None] = None,
         ui_opts: Union[Dict[str, Any], None] = None,
+        node_hook_opts: Union[Dict[str, Any], None] = None,
     ):
 
         # declare attributes in __init__.py to avoid pylint complaining
@@ -39,6 +42,7 @@ def __init__(
         self.experiment_opts = None
         self.run_opts = None
         self.ui_opts = None
+        self.node_hook_opts = None
         self.mlflow_client = None  # the client to interact with the mlflow database
         self.experiment = (
             None  # the mlflow experiment object to interact directly with it
@@ -52,6 +56,7 @@ def __init__(
             experiment=experiment_opts,
             run=run_opts,
             ui=ui_opts,
+            hooks=dict(node=node_hook_opts),
         )
         self.from_dict(configuration)
 
@@ -65,20 +70,29 @@ def from_dict(self, configuration: Dict[str, str]):
             configuration {Dict[str, str]} -- A dict with the following format:
                 {
                     mlflow_tracking_uri: a valid string for mlflow tracking storage,
-                    experiments_opts:
+                    experiments:
                         {
                             name {str}: the name of the experiment
                             create {bool}: should the experiment be created if it does not exist?
                         },
-                    run_opts:
+                    run:
                         {
                             nested {bool}: should we allow nested runs within the context?
                         },
-                    ui_opts:
+                    ui:
                         {
                             port {int}: the port where the ui must be served
                             host {str}: the host for the ui
                         }
+                    hooks:
+                        {
+                            node:
+                            {
+                                flatten_dict_params {bool}: when a parameter is a dict, should we create one mlflow parameter per entry of the dict? This may be necessary because mlflow has a size limit for parameters. Defaults to False.
+                                recursive {bool}: if we flatten dict parameters, should we apply the strategy recursively in case of nested dicts? Defaults to True.
+                                sep {str}: the separator used when flattening nested dicts: {level1: {p1: 1, p2: 2}} will be logged as level1.p1 and level1.p2. Defaults to ".".
+                            }
+                        }
                 }
         """
 
@@ -87,6 +101,11 @@ def from_dict(self, configuration: Dict[str, str]):
         experiment_opts = configuration.get("experiment")
         run_opts = configuration.get("run")
         ui_opts = configuration.get("ui")
+        node_hook_opts = (
+            configuration.get("hooks").get("node")
+            if configuration.get("hooks") is not None
+            else None
+        )
 
         self.mlflow_tracking_uri = self._validate_uri(uri=mlflow_tracking_uri)
         self.experiment_opts = _validate_opts(
@@ -94,6 +113,9 @@ def from_dict(self, configuration: Dict[str, str]):
         )
         self.run_opts = _validate_opts(opts=run_opts, default=self.RUN_OPTS)
         self.ui_opts = _validate_opts(opts=ui_opts, default=self.UI_OPTS)
+        self.node_hook_opts = _validate_opts(
+            opts=node_hook_opts, default=self.NODE_HOOK_OPTS
+        )
 
         # instantiate mlflow objects to interact with the database
         # the client must not be created before carefully checking the uri,
@@ -126,6 +148,7 @@ def to_dict(self):
             "experiments": self.experiment_opts,
             "run": self.run_opts,
             "ui": self.ui_opts,
+            "hooks": {"node": self.node_hook_opts},
         }
 
         return info
diff --git a/kedro_mlflow/framework/hooks/node_hook.py b/kedro_mlflow/framework/hooks/node_hook.py
index c3756fa8..a017c75d 100644
--- a/kedro_mlflow/framework/hooks/node_hook.py
+++ b/kedro_mlflow/framework/hooks/node_hook.py
@@ -5,14 +5,16 @@
 from kedro.io import DataCatalog
 from kedro.pipeline.node import Node
 
+from kedro_mlflow.framework.context import get_mlflow_config
+
 
 class MlflowNodeHook:
-    def __init__(
-        self, flatten_dict_params: bool = False, recursive: bool = True, sep: str = "."
-    ):
-        self.flatten = flatten_dict_params
-        self.recursive = recursive
-        self.sep = sep
+    def __init__(self):
+        # options are now read from the mlflow.yml "hooks" entry instead of constructor arguments
+        config = get_mlflow_config()
+        self.flatten = config.node_hook_opts["flatten_dict_params"]
+        self.recursive = config.node_hook_opts["recursive"]
+        self.sep = config.node_hook_opts["sep"]
 
     @hook_impl
     def before_node_run(
diff --git a/kedro_mlflow/template/project/mlflow.yml b/kedro_mlflow/template/project/mlflow.yml
index e45e811c..2aee6080 100644
--- a/kedro_mlflow/template/project/mlflow.yml
+++ b/kedro_mlflow/template/project/mlflow.yml
@@ -25,6 +25,13 @@ run:
   name: null  # if `name` is None, pipeline name will be used for the run name
   nested: True  # if `nested` is False, you won't be able to launch sub-runs inside your nodes
 
+hooks:
+  node:
+    flatten_dict_params: False  # if True, parameters which are dictionaries will be split into multiple parameters when logged in mlflow, one for each key.
+    recursive: True  # should the dictionary flattening be applied recursively (i.e. for nested dictionaries)? Not used if `flatten_dict_params` is False.
+    sep: "."  # in case of recursive flattening, what separator should be used between the keys? E.g. {hyperparam1: {p1: 1, p2: 2}} will be logged as hyperparam1.p1 and hyperparam1.p2 in mlflow.
+
+
 
 # UI-RELATED PARAMETERS -----------------
 
 ui:
diff --git a/kedro_mlflow/template/project/run.py b/kedro_mlflow/template/project/run.py
index 82d44918..6e8d3228 100644
--- a/kedro_mlflow/template/project/run.py
+++ b/kedro_mlflow/template/project/run.py
@@ -48,10 +48,8 @@ class ProjectContext(KedroContext):
     project_version = "{{ cookiecutter.kedro_version }}"
    package_name = "{{ cookiecutter.python_package }}"
     hooks = (
-        MlflowNodeHook(flatten_dict_params=False),
-        MlflowPipelineHook(
-            model_name="{{ cookiecutter.python_package }}", conda_env="src/requirements.txt",
-        ),
+        MlflowNodeHook(),
+        MlflowPipelineHook(),
     )
 
     def _get_pipelines(self) -> Dict[str, Pipeline]:
diff --git a/tests/framework/context/test_config.py b/tests/framework/context/test_config.py
index 091e0901..1b0aa640 100644
--- a/tests/framework/context/test_config.py
+++ b/tests/framework/context/test_config.py
@@ -55,6 +55,7 @@ def test_kedro_mlflow_config_init(tmp_path):
         experiments=KedroMlflowConfig.EXPERIMENT_OPTS,
         run=KedroMlflowConfig.RUN_OPTS,
         ui=KedroMlflowConfig.UI_OPTS,
+        hooks=dict(node=KedroMlflowConfig.NODE_HOOK_OPTS),
     )
 
 
diff --git a/tests/framework/context/test_mlflow_context.py b/tests/framework/context/test_mlflow_context.py
index 2d24af37..4cf4cee6 100644
--- a/tests/framework/context/test_mlflow_context.py
+++ b/tests/framework/context/test_mlflow_context.py
@@ -23,6 +23,7 @@ def _write_yaml(filepath, config):
             experiment=dict(name="fake_package", create=True),
             run=dict(id="123456789", name="my_run", nested=True),
             ui=dict(port="5151", host="localhost"),
+            hooks=dict(node=dict(flatten_dict_params=True, recursive=False, sep="-")),
         ),
     )
     expected = {
@@ -30,5 +31,8 @@ def _write_yaml(filepath, config):
         "experiments": {"name": "fake_package", "create": True},
         "run": {"id": "123456789", "name": "my_run", "nested": True},
         "ui": {"port": "5151", "host": "localhost"},
+        "hooks": {
+            "node": {"flatten_dict_params": True, "recursive": False, "sep": "-"}
+        },
     }
     assert get_mlflow_config(project_path=tmp_path, env="local").to_dict() == expected
diff --git a/tests/framework/hooks/test_node_hook.py b/tests/framework/hooks/test_node_hook.py
index a1866367..89182c7a 100644
--- a/tests/framework/hooks/test_node_hook.py
+++ b/tests/framework/hooks/test_node_hook.py
@@ -1,8 +1,10 @@
 import mlflow
+import pytest
 from kedro.io import DataCatalog, MemoryDataSet
 from kedro.pipeline import node
 from mlflow.tracking import MlflowClient
 
+from kedro_mlflow.framework.context.config import KedroMlflowConfig
 from kedro_mlflow.framework.hooks import MlflowNodeHook
 from kedro_mlflow.framework.hooks.node_hook import flatten_dict
 
@@ -36,8 +38,26 @@ def test_flatten_dict_nested_2_levels():
     }
 
 
-def test_node_hook(tmp_path):
-    mlflow_node_hook = MlflowNodeHook(flatten_dict_params=True, recursive=True, sep="-")
+@pytest.mark.parametrize(
+    "flatten_dict_params,expected",
+    [
+        (True, {"param1": "1", "parameters-param1": "1", "parameters-param2": "2"}),
+        (False, {"param1": "1", "parameters": "{'param1': 1, 'param2': 2}"}),
+    ],
+)
+def test_node_hook_logging(tmp_path, mocker, flatten_dict_params, expected):
+
+    mocker.patch("kedro_mlflow.utils._is_kedro_project", return_value=True)
+    config = KedroMlflowConfig(
+        project_path=tmp_path,
+        node_hook_opts={"flatten_dict_params": flatten_dict_params, "sep": "-"},
+    )
+    # the function is imported inside the other file and this is the file to patch
+    # see https://stackoverflow.com/questions/30987973/python-mock-patch-doesnt-work-as-expected-for-public-method
+    mocker.patch(
"kedro_mlflow.framework.hooks.node_hook.get_mlflow_config", return_value=config + ) + mlflow_node_hook = MlflowNodeHook() def fake_fun(arg1, arg2, arg3): return None @@ -71,8 +91,4 @@ def fake_fun(arg1, arg2, arg3): mlflow_client = MlflowClient(mlflow_tracking_uri) current_run = mlflow_client.get_run(run_id) - assert current_run.data.params == { - "param1": "1", - "parameters-param1": "1", - "parameters-param2": "2", - } + assert current_run.data.params == expected diff --git a/tests/template/project/test_mlflow_yml.py b/tests/template/project/test_mlflow_yml.py index 8eaa8240..ce976904 100644 --- a/tests/template/project/test_mlflow_yml.py +++ b/tests/template/project/test_mlflow_yml.py @@ -32,6 +32,7 @@ def test_mlflow_yml_rendering(template_mlflowyml): ui=KedroMlflowConfig.UI_OPTS, run=KedroMlflowConfig.RUN_OPTS, experiment=KedroMlflowConfig.EXPERIMENT_OPTS, + hooks=dict(node=KedroMlflowConfig.NODE_HOOK_OPTS), ) expected_config["experiment"]["name"] = "fake_project" # check for proper rendering assert mlflow_config == expected_config diff --git a/tests/template/project/test_template_pyfiles.py b/tests/template/project/test_template_pyfiles.py index 7322e52f..9bcfeaaf 100644 --- a/tests/template/project/test_template_pyfiles.py +++ b/tests/template/project/test_template_pyfiles.py @@ -94,7 +94,7 @@ def test_runpy_template_is_consistent_with_kedro(): "", ) kedro_mlflow_runpy = kedro_mlflow_runpy.replace( - ' hooks = (\n MlflowNodeHook(flatten_dict_params=False),\n MlflowPipelineHook(\n model_name="fake_project", conda_env="src/requirements.txt",\n ),\n )\n', + " hooks = (\n MlflowNodeHook(),\n MlflowPipelineHook(),\n )\n", "", )