
Commit 691fb5a
FIX #59: move arguments of MlflowNodeHook to mlflow.yml
Galileo-Galilei committed Sep 8, 2020
1 parent 404da28 commit 691fb5a
Showing 10 changed files with 72 additions and 18 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -17,6 +17,7 @@
### Changed

- Remove `conda_env` and `model_name` arguments from `MlflowPipelineHook` and add them to `PipelineML` and `pipeline_ml`. This is necessary for the upcoming hook auto-discovery in a future release and enables having multiple `PipelineML` objects in the same project. [#58](https://github.com/Galileo-Galilei/kedro-mlflow/pull/58)
- The `flatten_dict_params`, `recursive` and `sep` arguments of `MlflowNodeHook` are moved to the `mlflow.yml` config file to prepare for plugin auto-registration. This also modifies the `run.py` template (to remove the args) and the `mlflow.yml` keys to add a `hooks` entry. ([#59](https://github.com/Galileo-Galilei/kedro-mlflow/pull/59))

## [0.2.1] - 2020-08-06

29 changes: 26 additions & 3 deletions kedro_mlflow/framework/context/config.py
@@ -17,13 +17,16 @@ class KedroMlflowConfig:

UI_OPTS = {"port": None, "host": None}

NODE_HOOK_OPTS = {"flatten_dict_params": False, "recursive": True, "sep": "."}

def __init__(
self,
project_path: Union[str, Path],
mlflow_tracking_uri: str = "mlruns",
experiment_opts: Union[Dict[str, Any], None] = None,
run_opts: Union[Dict[str, Any], None] = None,
ui_opts: Union[Dict[str, Any], None] = None,
node_hook_opts: Union[Dict[str, Any], None] = None,
):

# declare attributes in __init__ to avoid pylint complaining
@@ -39,6 +42,7 @@ def __init__(
self.experiment_opts = None
self.run_opts = None
self.ui_opts = None
self.node_hook_opts = None
self.mlflow_client = None # the client to interact with the mlflow database
self.experiment = (
None # the mlflow experiment object to interact directly with it
@@ -52,6 +56,7 @@ def __init__(
experiment=experiment_opts,
run=run_opts,
ui=ui_opts,
hooks=dict(node=node_hook_opts),
)
self.from_dict(configuration)

@@ -65,20 +70,29 @@ def from_dict(self, configuration: Dict[str, str]):
configuration {Dict[str, str]} -- A dict with the following format:
{
mlflow_tracking_uri: a valid string for mlflow tracking storage,
experiments_opts:
experiments:
{
name {str}: the name of the experiment
create {bool}: should the experiment be created if it does not exist?
},
run_opts:
run:
{
nested {bool}: should we allow nested run within the context?
},
ui_opts:
ui:
{
port {int} : the port where the ui must be served
host {str} : the host for the ui
}
hooks:
{
node:
{
flatten_dict_params {bool}: When the parameter is a dict, should we create several parameters, one for each entry in the dict? This may be necessary because mlflow has a size limit for parameters. Default to False.
recursive {bool}: If we flatten dict parameters, should we apply the strategy recursively in case of nested dicts? Default to True.
sep {str}: The separator in case of nested dict flattening: {level1: {p1: 1, p2: 2}} will be logged as level1.p1, level1.p2. Default to ".".
}
}
}
"""
@@ -87,13 +101,21 @@ def from_dict(self, configuration: Dict[str, str]):
experiment_opts = configuration.get("experiment")
run_opts = configuration.get("run")
ui_opts = configuration.get("ui")
node_hook_opts = (
configuration.get("hooks").get("node")
if configuration.get("hooks") is not None
else None
)

self.mlflow_tracking_uri = self._validate_uri(uri=mlflow_tracking_uri)
self.experiment_opts = _validate_opts(
opts=experiment_opts, default=self.EXPERIMENT_OPTS
)
self.run_opts = _validate_opts(opts=run_opts, default=self.RUN_OPTS)
self.ui_opts = _validate_opts(opts=ui_opts, default=self.UI_OPTS)
self.node_hook_opts = _validate_opts(
opts=node_hook_opts, default=self.NODE_HOOK_OPTS
)

# instantiate mlflow objects to interact with the database
# the client must not be created before carefully checking the uri,
@@ -126,6 +148,7 @@ def to_dict(self):
"experiments": self.experiment_opts,
"run": self.run_opts,
"ui": self.ui_opts,
"hooks": {"node": self.node_hook_opts},
}
return info

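To make the new option concrete, here is a minimal sketch (not part of the commit) of how `node_hook_opts` flows through `KedroMlflowConfig`. It assumes it is run from the root of a kedro project, since the constructor validates `project_path`, and that `_validate_opts` fills unspecified keys from `NODE_HOOK_OPTS`, as the diff suggests:

from pathlib import Path

from kedro_mlflow.framework.context.config import KedroMlflowConfig

config = KedroMlflowConfig(
    project_path=Path.cwd(),  # assumption: cwd is a kedro project root
    node_hook_opts={"flatten_dict_params": True, "sep": "-"},
)
# "recursive" was not passed, so it should fall back to the default (True)
assert config.to_dict()["hooks"]["node"] == {
    "flatten_dict_params": True,
    "recursive": True,
    "sep": "-",
}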
9 changes: 6 additions & 3 deletions kedro_mlflow/framework/hooks/node_hook.py
@@ -5,14 +5,17 @@
from kedro.io import DataCatalog
from kedro.pipeline.node import Node

from kedro_mlflow.framework.context import get_mlflow_config


class MlflowNodeHook:
def __init__(
self, flatten_dict_params: bool = False, recursive: bool = True, sep: str = "."
):
self.flatten = flatten_dict_params
self.recursive = recursive
self.sep = sep
config = get_mlflow_config()
self.flatten = config.node_hook_opts["flatten_dict_params"]
self.recursive = config.node_hook_opts["recursive"]
self.sep = config.node_hook_opts["sep"]

@hook_impl
def before_node_run(
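The practical consequence of this diff: `MlflowNodeHook` no longer takes constructor arguments and instead resolves its settings from `mlflow.yml` at instantiation time. A minimal sketch of the assumed usage, which must run inside a kedro project so that `get_mlflow_config()` can load the configuration:

from kedro_mlflow.framework.hooks import MlflowNodeHook

hook = MlflowNodeHook()  # reads hooks.node.* from mlflow.yml
print(hook.flatten, hook.recursive, hook.sep)  # defaults: False True .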
7 changes: 7 additions & 0 deletions kedro_mlflow/template/project/mlflow.yml
@@ -25,6 +25,13 @@ run:
name: null # if `name` is None, pipeline name will be used for the run name
nested: True # if `nested` is False, you won't be able to launch sub-runs inside your nodes

hooks:
node:
flatten_dict_params: False # if True, parameters which are dictionaries will be split into multiple parameters when logged in mlflow, one for each key.
recursive: True # Should the dictionary flattening be applied recursively (i.e. for nested dictionaries)? Not used if `flatten_dict_params` is False.
sep: "." # In case of recursive flattening, what separator should be used between the keys? E.g. {hyperparam1: {p1: 1, p2: 2}} will be logged as hyperparam1.p1 and hyperparam1.p2 in mlflow.


# UI-RELATED PARAMETERS -----------------

ui:
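For readers unfamiliar with the flattening described in the mlflow.yml comments above, here is a hypothetical illustration using the `flatten_dict` helper that the tests below import from `kedro_mlflow.framework.hooks.node_hook`; its exact signature is assumed from the test imports, not shown in this diff:

from kedro_mlflow.framework.hooks.node_hook import flatten_dict

params = {"hyperparam1": {"p1": 1, "p2": 2}}
# assumed signature: flatten_dict(d, recursive, sep)
flat = flatten_dict(d=params, recursive=True, sep=".")
# expected, per the mlflow.yml comment above:
# {"hyperparam1.p1": 1, "hyperparam1.p2": 2}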
6 changes: 2 additions & 4 deletions kedro_mlflow/template/project/run.py
@@ -48,10 +48,8 @@ class ProjectContext(KedroContext):
project_version = "{{ cookiecutter.kedro_version }}"
package_name = "{{ cookiecutter.python_package }}"
hooks = (
MlflowNodeHook(flatten_dict_params=False),
MlflowPipelineHook(
model_name="{{ cookiecutter.python_package }}", conda_env="src/requirements.txt",
),
MlflowNodeHook(),
MlflowPipelineHook(),
)

def _get_pipelines(self) -> Dict[str, Pipeline]:
1 change: 1 addition & 0 deletions tests/framework/context/test_config.py
@@ -55,6 +55,7 @@ def test_kedro_mlflow_config_init(tmp_path):
experiments=KedroMlflowConfig.EXPERIMENT_OPTS,
run=KedroMlflowConfig.RUN_OPTS,
ui=KedroMlflowConfig.UI_OPTS,
hooks=dict(node=KedroMlflowConfig.NODE_HOOK_OPTS),
)


4 changes: 4 additions & 0 deletions tests/framework/context/test_mlflow_context.py
@@ -23,12 +23,16 @@ def _write_yaml(filepath, config):
experiment=dict(name="fake_package", create=True),
run=dict(id="123456789", name="my_run", nested=True),
ui=dict(port="5151", host="localhost"),
hooks=dict(node=dict(flatten_dict_params=True, recursive=False, sep="-")),
),
)
expected = {
"mlflow_tracking_uri": (tmp_path / "mlruns").as_uri(),
"experiments": {"name": "fake_package", "create": True},
"run": {"id": "123456789", "name": "my_run", "nested": True},
"ui": {"port": "5151", "host": "localhost"},
"hooks": {
"node": {"flatten_dict_params": True, "recursive": False, "sep": "-"}
},
}
assert get_mlflow_config(project_path=tmp_path, env="local").to_dict() == expected
30 changes: 23 additions & 7 deletions tests/framework/hooks/test_node_hook.py
@@ -1,8 +1,10 @@
import mlflow
import pytest
from kedro.io import DataCatalog, MemoryDataSet
from kedro.pipeline import node
from mlflow.tracking import MlflowClient

from kedro_mlflow.framework.context.config import KedroMlflowConfig
from kedro_mlflow.framework.hooks import MlflowNodeHook
from kedro_mlflow.framework.hooks.node_hook import flatten_dict

@@ -36,8 +38,26 @@ def test_flatten_dict_nested_2_levels():
}


def test_node_hook(tmp_path):
mlflow_node_hook = MlflowNodeHook(flatten_dict_params=True, recursive=True, sep="-")
@pytest.mark.parametrize(
"flatten_dict_params,expected",
[
(True, {"param1": "1", "parameters-param1": "1", "parameters-param2": "2"}),
(False, {"param1": "1", "parameters": "{'param1': 1, 'param2': 2}"}),
],
)
def test_node_hook_logging(tmp_path, mocker, flatten_dict_params, expected):

mocker.patch("kedro_mlflow.utils._is_kedro_project", return_value=True)
config = KedroMlflowConfig(
project_path=tmp_path,
node_hook_opts={"flatten_dict_params": flatten_dict_params, "sep": "-"},
)
# the function is imported inside the other file and this is the file to patch
# see https://stackoverflow.com/questions/30987973/python-mock-patch-doesnt-work-as-expected-for-public-method
mocker.patch(
"kedro_mlflow.framework.hooks.node_hook.get_mlflow_config", return_value=config
)
mlflow_node_hook = MlflowNodeHook()

def fake_fun(arg1, arg2, arg3):
return None
@@ -71,8 +91,4 @@ def fake_fun(arg1, arg2, arg3):

mlflow_client = MlflowClient(mlflow_tracking_uri)
current_run = mlflow_client.get_run(run_id)
assert current_run.data.params == {
"param1": "1",
"parameters-param1": "1",
"parameters-param2": "2",
}
assert current_run.data.params == expected
1 change: 1 addition & 0 deletions tests/template/project/test_mlflow_yml.py
@@ -32,6 +32,7 @@ def test_mlflow_yml_rendering(template_mlflowyml):
ui=KedroMlflowConfig.UI_OPTS,
run=KedroMlflowConfig.RUN_OPTS,
experiment=KedroMlflowConfig.EXPERIMENT_OPTS,
hooks=dict(node=KedroMlflowConfig.NODE_HOOK_OPTS),
)
expected_config["experiment"]["name"] = "fake_project" # check for proper rendering
assert mlflow_config == expected_config
2 changes: 1 addition & 1 deletion tests/template/project/test_template_pyfiles.py
@@ -94,7 +94,7 @@ def test_runpy_template_is_consistent_with_kedro():
"",
)
kedro_mlflow_runpy = kedro_mlflow_runpy.replace(
' hooks = (\n MlflowNodeHook(flatten_dict_params=False),\n MlflowPipelineHook(\n model_name="fake_project", conda_env="src/requirements.txt",\n ),\n )\n',
" hooks = (\n MlflowNodeHook(),\n MlflowPipelineHook(),\n )\n",
"",
)

