Galileo-Galilei · Galileo-Galilei · Nov 27, 2020 · Nov 26, 2020
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,13 +2,19 @@
 
 ## [Unreleased]
 
+### Added
+
+- It is now possible to supply credentials for the mlflow tracking server within `mlflow.yml` and `credentials.yml`. They will be exported as environment variables during the run. ([#31](https://github.com/Galileo-Galilei/kedro-mlflow/issues/31))
+
 ### Fixed
 
--   Fix `TypeError: unsupported operand type(s) for /: 'str' and 'str'` when using `MlflowArtifactDataSet` with `MlflowModelSaverDataSet` ([#116](https://github.com/Galileo-Galilei/kedro-mlflow/issues/116))
-- Fix various docs typo
+- Fix `TypeError: unsupported operand type(s) for /: 'str' and 'str'` when using `MlflowArtifactDataSet` with `MlflowModelSaverDataSet` ([#116](https://github.com/Galileo-Galilei/kedro-mlflow/issues/116))
+- Fix various typos in the docs ([#6](https://github.com/Galileo-Galilei/kedro-mlflow/issues/6))
 
 ### Changed
-- Refactor doc structure for readability
+
+- Refactor doc structure for readability ([#6](https://github.com/Galileo-Galilei/kedro-mlflow/issues/6))
+- The `KedroMlflowConfig` no longer creates the mlflow experiment nor accesses the mlflow tracking server when it is instantiated. A new `setup()` method sets up the mlflow configuration (tracking uri, credentials and experiment management), and it must now be called explicitly. ([#97](https://github.com/Galileo-Galilei/kedro-mlflow/issues/97))
 
 ## [0.4.0] - 2020-11-03
 

diff --git a/docs/source/04_experimentation_tracking/01_configuration.md b/docs/source/04_experimentation_tracking/01_configuration.md
@@ -31,6 +31,24 @@ mlflow_tracking_uri: mlruns
 
 This is the **only mandatory key in the `mlflow.yml` file**, but there are many others described hereafter that provide fine-grained control on your mlflow setup.
 
+You can also declare the environment variables needed by mlflow (e.g. `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`) in `credentials.yml` and reference them from `mlflow.yml`. Every key of the referenced entry will be automatically exported as an environment variable.
+
+Your `credentials.yml` will look as follows:
+
+```yaml
+my_mlflow_credentials:
+  AWS_ACCESS_KEY_ID: <your-key>
+  AWS_SECRET_ACCESS_KEY: <your-secret-key>
+```
+
+and you can then reference this entry with the `credentials` key of `mlflow.yml`:
+
+```yaml
+credentials: my_mlflow_credentials
+```
+
+For safety reasons, the credential values will not be accessible within `KedroMlflowConfig` objects (only the name of the entry is stored). They will be exported as environment variables *on the fly* when running the pipeline.
+
 ### Configure mlflow experiment
 
 Mlflow enables the user to create "experiments" to organize their work. The different experiments will be visible on the left panel of the mlflow user interface. You can create an experiment through the `mlflow.yml` file with the `experiment` key:

diff --git a/kedro_mlflow/framework/context/config.py b/kedro_mlflow/framework/context/config.py
@@ -1,6 +1,7 @@
 import logging
+import os
 from pathlib import Path, PurePath
-from typing import Any, Dict, Union
+from typing import Any, Dict, Optional, Union
 
 import mlflow
 
@@ -23,6 +24,7 @@ def __init__(
         self,
         project_path: Union[str, Path],
         mlflow_tracking_uri: str = "mlruns",
+        credentials: Optional[Dict[str, str]] = None,
         experiment_opts: Union[Dict[str, Any], None] = None,
         run_opts: Union[Dict[str, Any], None] = None,
         ui_opts: Union[Dict[str, Any], None] = None,
@@ -39,6 +41,9 @@ def __init__(
         self.project_path = Path(project_path)
         # TODO we may add mlflow_registry_uri future release
         self.mlflow_tracking_uri = "mlruns"
+        self.credentials = (
+            credentials or {}
+        )  # replace None by {}, but do not use {} as the argument default since it is mutable
         self.experiment_opts = None
         self.run_opts = None
         self.ui_opts = None
@@ -53,13 +58,26 @@ def __init__(
         # for loading the configuration
         configuration = dict(
             mlflow_tracking_uri=mlflow_tracking_uri,
+            credentials=credentials,
             experiment=experiment_opts,
             run=run_opts,
             ui=ui_opts,
             hooks=dict(node=node_hook_opts),
         )
         self.from_dict(configuration)
 
+    def setup(self, context):
+        """Set up the mlflow configuration.
+
+        The steps run in this order: the credentials are exported with
+        ``_export_credentials``, the global mlflow tracking uri is set to
+        ``self.mlflow_tracking_uri``, then ``_get_or_create_experiment``
+        is called.
+
+        Arguments:
+            context -- The object forwarded to ``_export_credentials``
+                (the kedro context in the callers visible in this patch).
+        """
+
+        self._export_credentials(context)
+
+        # we set the tracking uri now, after the credentials are exported:
+        # the configured value takes priority over a value which may
+        # already have been set in _export_credentials
+
+        mlflow.set_tracking_uri(self.mlflow_tracking_uri)
+
+        self._get_or_create_experiment()
+
     def from_dict(self, configuration: Dict[str, str]):
         """This functions populates all the attributes of the class through a dictionary.
         This is the preferred method because the configuration is intended
@@ -70,6 +88,7 @@ def from_dict(self, configuration: Dict[str, str]):
             configuration {Dict[str, str]} -- A dict with the following format :
             {
                 mlflow_tracking_uri: a valid string for mlflow tracking storage,
+                credentials: a valid string which exists in credentials.yml,
                 experiments:
                     {
                         name {str}: the name of the experiment
@@ -98,12 +117,14 @@ def from_dict(self, configuration: Dict[str, str]):
         """
 
         mlflow_tracking_uri = configuration.get("mlflow_tracking_uri")
+        credentials = configuration.get("credentials")
         experiment_opts = configuration.get("experiment")
         run_opts = configuration.get("run")
         ui_opts = configuration.get("ui")
         node_hook_opts = configuration.get("hooks", {}).get("node")
 
         self.mlflow_tracking_uri = self._validate_uri(uri=mlflow_tracking_uri)
+        self.credentials = credentials  # do not replace by value here for safety
         self.experiment_opts = _validate_opts(
             opts=experiment_opts, default=self.EXPERIMENT_OPTS
         )
@@ -119,7 +140,6 @@ def from_dict(self, configuration: Dict[str, str]):
         self.mlflow_client = mlflow.tracking.MlflowClient(
             tracking_uri=self.mlflow_tracking_uri
         )
-        self._get_or_create_experiment()
 
     def to_dict(self):
         """Retrieve all the attributes needed to setup the config
@@ -128,6 +148,7 @@ def to_dict(self):
             Dict[str, Any] -- All attributes with the following format:
             {
                 mlflow_tracking_uri: a valid string for mlflow tracking storage,
+                credentials: a valid string which exists in credentials.yml,
                 experiments_opts:
                     {
                         name {str}: the name of the experiment
@@ -141,6 +162,7 @@ def to_dict(self):
         """
         info = {
             "mlflow_tracking_uri": self.mlflow_tracking_uri,
+            "credentials": self.credentials,
             "experiments": self.experiment_opts,
             "run": self.run_opts,
             "ui": self.ui_opts,
@@ -208,6 +230,12 @@ def _validate_uri(self, uri: Union[str, None]) -> str:
 
         return valid_uri
 
+    def _export_credentials(self, context):
+        """Export the mlflow credentials as environment variables.
+
+        The entry of the project credentials whose name is stored in
+        ``self.credentials`` is looked up and each of its key/value pairs
+        is written to ``os.environ``. Nothing is exported when no entry
+        matches (e.g. when ``self.credentials`` is ``None``).
+
+        Arguments:
+            context -- An object exposing ``_get_config_credentials()``,
+                whose result is queried with ``.get(name, {})``.
+        """
+        conf_creds = context._get_config_credentials()
+        mlflow_creds = conf_creds.get(self.credentials, {})
+        for key, value in mlflow_creds.items():
+            # yaml may parse a value as a non-string (e.g. `132456` is an int)
+            # while os.environ only accepts strings: coerce before exporting
+            os.environ[key] = str(value)
+
 
 def _validate_opts(opts: Dict[str, Any], default: Dict[str, Any]) -> Dict:
     """This functions creates a valid dictionary containing options

diff --git a/kedro_mlflow/framework/hooks/pipeline_hook.py b/kedro_mlflow/framework/hooks/pipeline_hook.py
@@ -75,7 +75,7 @@ def before_pipeline_run(
         )
 
         mlflow_conf = get_mlflow_config(self.context)
-        mlflow.set_tracking_uri(mlflow_conf.mlflow_tracking_uri)
+        mlflow_conf.setup(self.context)
 
         run_name = (
             mlflow_conf.run_opts["name"]
@@ -109,7 +109,10 @@ def before_pipeline_run(
 
     @hook_impl
     def after_pipeline_run(
-        self, run_params: Dict[str, Any], pipeline: Pipeline, catalog: DataCatalog,
+        self,
+        run_params: Dict[str, Any],
+        pipeline: Pipeline,
+        catalog: DataCatalog,
     ) -> None:
         """Hook to be invoked after a pipeline runs.
         Args:

diff --git a/kedro_mlflow/template/project/mlflow.yml b/kedro_mlflow/template/project/mlflow.yml
@@ -7,6 +7,17 @@
 # at the root of the project
 mlflow_tracking_uri: mlruns
 
+# All credentials needed for mlflow must be stored in `credentials.yml` as a dict;
+# they will be exported as environment variables.
+# If you want to set some credentials, e.g. AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY
+# > in `credentials.yml`:
+# your_mlflow_credentials:
+#   AWS_ACCESS_KEY_ID: 132456
+#   AWS_SECRET_ACCESS_KEY: 132456
+# > in this file `mlflow.yml`:
+# credentials: your_mlflow_credentials
+
+credentials: null  # must be a valid key in credentials.yml
 
 # EXPERIMENT-RELATED PARAMETERS ----------
 

diff --git a/tests/framework/context/test_config.py b/tests/framework/context/test_config.py
@@ -1,5 +1,9 @@
+import os
+
+import mlflow
 import pytest
 import yaml
+from kedro.context import load_context
 from mlflow.tracking import MlflowClient
 
 from kedro_mlflow.framework.context.config import (
@@ -52,14 +56,17 @@ def test_kedro_mlflow_config_init(tmp_path):
     config = KedroMlflowConfig(project_path=tmp_path)
     assert config.to_dict() == dict(
         mlflow_tracking_uri=(tmp_path / "mlruns").as_uri(),
+        credentials=None,
         experiments=KedroMlflowConfig.EXPERIMENT_OPTS,
         run=KedroMlflowConfig.RUN_OPTS,
         ui=KedroMlflowConfig.UI_OPTS,
         hooks=dict(node=KedroMlflowConfig.NODE_HOOK_OPTS),
     )
 
 
-def test_kedro_mlflow_config_new_experiment_does_not_exists(mocker, tmp_path):
+def test_kedro_mlflow_config_new_experiment_does_not_exists(
+    mocker, tmp_path, config_dir
+):
     # create a ".kedro.yml" file to identify "tmp_path" as the root of a kedro project
     mocker.patch("kedro_mlflow.utils._is_kedro_project", return_value=True)
 
@@ -68,10 +75,12 @@ def test_kedro_mlflow_config_new_experiment_does_not_exists(mocker, tmp_path):
         mlflow_tracking_uri="mlruns",
         experiment_opts=dict(name="exp1"),
     )
+    context = load_context(tmp_path)
+    config.setup(context)
     assert "exp1" in [exp.name for exp in config.mlflow_client.list_experiments()]
 
 
-def test_kedro_mlflow_config_experiment_exists(mocker, tmp_path):
+def test_kedro_mlflow_config_experiment_exists(mocker, tmp_path, config_dir):
     # create a ".kedro.yml" file to identify "tmp_path" as the root of a kedro project
     mocker.patch("kedro_mlflow.utils._is_kedro_project", return_value=True)
 
@@ -83,10 +92,12 @@ def test_kedro_mlflow_config_experiment_exists(mocker, tmp_path):
         mlflow_tracking_uri="mlruns",
         experiment_opts=dict(name="exp1"),
     )
+    context = load_context(tmp_path)
+    config.setup(context)
     assert "exp1" in [exp.name for exp in config.mlflow_client.list_experiments()]
 
 
-def test_kedro_mlflow_config_experiment_was_deleted(mocker, tmp_path):
+def test_kedro_mlflow_config_experiment_was_deleted(mocker, tmp_path, config_dir):
     # create a ".kedro.yml" file to identify "tmp_path" as the root of a kedro project
     mocker.patch("kedro_mlflow.utils._is_kedro_project", lambda x: True)
 
@@ -104,9 +115,73 @@ def test_kedro_mlflow_config_experiment_was_deleted(mocker, tmp_path):
         mlflow_tracking_uri="mlruns",
         experiment_opts=dict(name="exp1"),
     )
+    context = load_context(tmp_path)
+    config.setup(context)
     assert "exp1" in [exp.name for exp in config.mlflow_client.list_experiments()]
 
 
+def test_kedro_mlflow_config_setup_set_tracking_uri(mocker, tmp_path, config_dir):
+    """Check that ``setup`` sets the global mlflow tracking uri."""
+    # patch the kedro project check so that "tmp_path" is accepted as a project root
+    mocker.patch("kedro_mlflow.utils._is_kedro_project", lambda x: True)
+
+    # expected value: the relative uri given below, located under "tmp_path"
+    mlflow_tracking_uri = (tmp_path / "awesome_tracking").as_uri()
+
+    # the config only receives the relative uri
+    config = KedroMlflowConfig(
+        project_path=tmp_path,
+        mlflow_tracking_uri="awesome_tracking",
+        experiment_opts=dict(name="exp1"),
+    )
+    context = load_context(tmp_path)
+    config.setup(context)
+
+    assert mlflow.get_tracking_uri() == mlflow_tracking_uri
+
+
+def test_kedro_mlflow_config_setup_export_credentials(mocker, tmp_path, config_dir):
+    """Check that ``setup`` exports the referenced credentials to ``os.environ``."""
+    # patch the kedro project check so that "tmp_path" is accepted as a project root
+    mocker.patch("kedro_mlflow.utils._is_kedro_project", lambda x: True)
+    # snapshot the environment: it is restored when the test ends, so that
+    # the exported variable does not leak to the other tests
+    mocker.patch.dict(os.environ)
+
+    (tmp_path / "conf/base/credentials.yml").write_text(
+        yaml.dump(dict(my_mlflow_creds=dict(fake_mlflow_cred="my_fake_cred")))
+    )
+
+    # "credentials" is the name of the entry of credentials.yml to export
+    config = KedroMlflowConfig(project_path=tmp_path, credentials="my_mlflow_creds")
+    context = load_context(tmp_path)
+    config.setup(context)
+
+    assert os.environ["fake_mlflow_cred"] == "my_fake_cred"
+
+
+def test_kedro_mlflow_config_setup_tracking_priority(mocker, tmp_path, config_dir):
+    """Test that the tracking uri which is set is the one of mlflow.yml
+    if ``mlflow_tracking_uri`` also exists in the credentials.
+
+    Args:
+        mocker: fixture used to patch the kedro project check and to
+            restore ``os.environ`` when the test ends.
+        tmp_path: temporary folder used as the project path.
+        config_dir: fixture which is not used directly in the body
+            (presumably it creates the "conf" folders -- to be confirmed).
+    """
+    # patch the kedro project check so that "tmp_path" is accepted as a project root
+    mocker.patch("kedro_mlflow.utils._is_kedro_project", lambda x: True)
+    # snapshot the environment: it is restored when the test ends, so that
+    # the exported "mlflow_tracking_uri" does not leak to the other tests
+    mocker.patch.dict(os.environ)
+
+    (tmp_path / "conf/base/credentials.yml").write_text(
+        yaml.dump(dict(my_mlflow_creds=dict(mlflow_tracking_uri="mlruns2")))
+    )
+
+    config = KedroMlflowConfig(
+        project_path=tmp_path,
+        mlflow_tracking_uri="mlruns1",
+        credentials="my_mlflow_creds",
+    )
+    context = load_context(tmp_path)
+    config.setup(context)
+
+    # the uri given to the config wins over the one found in the credentials
+    assert mlflow.get_tracking_uri() == (tmp_path / "mlruns1").as_uri()
+
+
 @pytest.mark.parametrize(
     "uri",
     [

diff --git a/tests/framework/context/test_mlflow_context.py b/tests/framework/context/test_mlflow_context.py
@@ -23,6 +23,7 @@ def test_get_mlflow_config(mocker, tmp_path, config_dir):
         tmp_path / "conf" / "base" / "mlflow.yml",
         dict(
             mlflow_tracking_uri="mlruns",
+            credentials=None,
             experiment=dict(name="fake_package", create=True),
             run=dict(id="123456789", name="my_run", nested=True),
             ui=dict(port="5151", host="localhost"),
@@ -31,6 +32,7 @@ def test_get_mlflow_config(mocker, tmp_path, config_dir):
     )
     expected = {
         "mlflow_tracking_uri": (tmp_path / "mlruns").as_uri(),
+        "credentials": None,
         "experiments": {"name": "fake_package", "create": True},
         "run": {"id": "123456789", "name": "my_run", "nested": True},
         "ui": {"port": "5151", "host": "localhost"},
@@ -48,6 +50,7 @@ def test_mlflow_config_with_templated_config(mocker, tmp_path, config_dir):
         tmp_path / "conf" / "base" / "mlflow.yml",
         dict(
             mlflow_tracking_uri="${mlflow_tracking_uri}",
+            credentials=None,
             experiment=dict(name="fake_package", create=True),
             run=dict(id="123456789", name="my_run", nested=True),
             ui=dict(port="5151", host="localhost"),
@@ -62,6 +65,7 @@ def test_mlflow_config_with_templated_config(mocker, tmp_path, config_dir):
 
     expected = {
         "mlflow_tracking_uri": (tmp_path / "testruns").as_uri(),
+        "credentials": None,
         "experiments": {"name": "fake_package", "create": True},
         "run": {"id": "123456789", "name": "my_run", "nested": True},
         "ui": {"port": "5151", "host": "localhost"},

diff --git a/tests/template/project/test_mlflow_yml.py b/tests/template/project/test_mlflow_yml.py
@@ -29,6 +29,7 @@ def test_mlflow_yml_rendering(template_mlflowyml):
         mlflow_config = yaml.load(file_handler)
     expected_config = dict(
         mlflow_tracking_uri="mlruns",
+        credentials=None,
         ui=KedroMlflowConfig.UI_OPTS,
         run=KedroMlflowConfig.RUN_OPTS,
         experiment=KedroMlflowConfig.EXPERIMENT_OPTS,