Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
…Galilei#30 Galileo-Galilei#31 Galileo-Galilei#72 Galileo-Galilei#29 Galileo-Galilei#62 - context access - kedro 16.5 - hook auto registration
  • Loading branch information
takikadiri committed Oct 11, 2020
1 parent 4c0b6a7 commit d691b96
Show file tree
Hide file tree
Showing 17 changed files with 303 additions and 281 deletions.
2 changes: 1 addition & 1 deletion .isort.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,4 @@ line_length=88
ensure_newline_before_comments=True
sections=FUTURE,STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER
known_first_party=kedro_mlflow
known_third_party=black,click,cookiecutter,flake8,isort,jinja2,kedro,mlflow,pandas,pytest,pytest_lazyfixture,setuptools,yaml
known_third_party=click,cookiecutter,jinja2,kedro,mlflow,pandas,pytest,pytest_lazyfixture,setuptools,yaml
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,15 @@

### Added

- kedro-mlflow hooks are now auto-registered (#29)
- Add dataset ``MlflowMetricsDataSet`` for metrics logging ([#9](https://github.com/Galileo-Galilei/kedro-mlflow/issues/9)) and update documentation for metrics.
- Add CI workflow `create-release` to ensure release consistency and up-to-date CHANGELOG. ([#57](https://github.com/Galileo-Galilei/kedro-mlflow/issues/57), [#68](https://github.com/Galileo-Galilei/kedro-mlflow/pull/68))
- Add templates for issues and pull requests ([#57](https://github.com/Galileo-Galilei/kedro-mlflow/issues/57), [#68](https://github.com/Galileo-Galilei/kedro-mlflow/pull/68))

### Fixed

- `get_mlflow_config` now uses the kedro project config_loader to get configs (#66 #64 #54 #30 #31 #72)
- Fix the tests that were broken by the 0.16.5 update (#62)
- Versioned datasets artifacts logging are handled correctly ([#41](https://github.com/Galileo-Galilei/kedro-mlflow/issues/41))
- MlflowDataSet handles correctly datasets which are inherited from AbstractDataSet ([#45](https://github.com/Galileo-Galilei/kedro-mlflow/issues/45))
- Change the test in `_generate_kedro_command` to accept both empty `Iterable`s(default in CLI mode) and `None` values (default in interactive mode) ([#50](https://github.com/Galileo-Galilei/kedro-mlflow/issues/50))
Expand All @@ -21,6 +24,9 @@

### Changed

- Bump kedro requirement to <=0.16.5
- `kedro mlflow init` is no longer touching the run.py file
- `MlflowNodeHook` now has a `before_pipeline_run` hook so it can access the project run parameters
- Remove `conda_env` and `model_name` arguments from `MlflowPipelineHook` and add them to `PipelineML` and `pipeline_ml`. This is necessary for incoming hook auto-discovery in future release and it enables having multiple `PipelineML` in the same project ([#58](https://github.com/Galileo-Galilei/kedro-mlflow/pull/58)). This mechanically fixes [#54](https://github.com/Galileo-Galilei/kedro-mlflow/issues/54) by making `conda_env` path absolute for airflow support.
- `flatten_dict_params`, `recursive` and `sep` arguments of the `MlflowNodeHook` are moved to the `mlflow.yml` config file to prepare plugin auto registration. This also modifies the `run.py` template (to remove the args) and the `mlflow.yml` keys to add a `hooks` entry. ([#59](https://github.com/Galileo-Galilei/kedro-mlflow/pull/59))
- Rename CI workflow to `test` ([#57](https://github.com/Galileo-Galilei/kedro-mlflow/issues/57), [#68](https://github.com/Galileo-Galilei/kedro-mlflow/pull/68))
Expand Down
142 changes: 74 additions & 68 deletions kedro_mlflow/framework/cli/cli.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,12 @@
import os
import subprocess
from pathlib import Path

import click
from kedro import __file__ as KEDRO_PATH
from kedro.framework.context import get_static_project_data, load_context

from kedro_mlflow.framework.cli.cli_utils import (
render_jinja_template,
write_jinja_template,
)
from kedro_mlflow.framework.cli.cli_utils import write_jinja_template
from kedro_mlflow.framework.context import get_mlflow_config
from kedro_mlflow.utils import _already_updated, _get_project_globals, _is_kedro_project
from kedro_mlflow.utils import _already_updated, _is_kedro_project

TEMPLATE_FOLDER_PATH = Path(__file__).parent.parent.parent / "template" / "project"

Expand Down Expand Up @@ -87,80 +83,86 @@ def init(force, silent):

# get constants
project_path = Path().cwd()
project_globals = _get_project_globals()
context = load_context(project_path)
# project_globals = _get_project_globals()
project_globals = get_static_project_data(project_path)

# mlflow.yml is just a static file,
# but the name of the experiment is set to be the same as the project
conf_root = context.CONF_ROOT
mlflow_yml = "mlflow.yml"
write_jinja_template(
src=TEMPLATE_FOLDER_PATH / mlflow_yml,
is_cookiecutter=False,
dst=project_path / "conf" / "base" / mlflow_yml,
python_package=project_globals["python_package"],
dst=project_path / conf_root / "base" / mlflow_yml,
python_package=project_globals["package_name"],
)
if not silent:
click.secho(
click.style("'conf/base/mlflow.yml' successfully updated.", fg="green")
click.style(
f"'{conf_root}/base/mlflow.yml' successfully updated.", fg="green"
)
)
# make a check whether the project run.py is strictly identical to the template
# if yes, replace the script by the template silently
# if no, raise a warning and send a message to INSERT_DOC_URL
flag_erase_runpy = force
runpy_project_path = (
project_path
/ "src"
/ (Path(project_globals["context_path"]).parent.as_posix() + ".py")
)
if not force:
kedro_path = Path(KEDRO_PATH).parent
runpy_template_path = (
kedro_path
/ "templates"
/ "project"
/ "{{ cookiecutter.repo_name }}"
/ "src"
/ "{{ cookiecutter.python_package }}"
/ "run.py"
)
kedro_runpy_template = render_jinja_template(
src=runpy_template_path,
is_cookiecutter=True,
python_package=project_globals["python_package"],
project_name=project_globals["project_name"],
kedro_version=project_globals["kedro_version"],
)

with open(runpy_project_path, mode="r") as file_handler:
kedro_runpy_project = file_handler.read()

# beware : black formatting could change slightly this test which is very strict
if kedro_runpy_project == kedro_runpy_template:
flag_erase_runpy = True

if flag_erase_runpy:
os.remove(runpy_project_path)
write_jinja_template(
src=TEMPLATE_FOLDER_PATH / "run.py",
dst=runpy_project_path,
is_cookiecutter=True,
python_package=project_globals["python_package"],
project_name=project_globals["project_name"],
kedro_version=project_globals["kedro_version"],
)
if not silent:
click.secho(click.style("'run.py' successfully updated", fg="green"))
else:
click.secho(
click.style(
"You have modified your 'run.py' since project creation.\n"
+ "In order to use kedro-mlflow, you must either:\n"
+ " - set up your run.py with the following instructions :\n"
+ "INSERT_DOC_URL\n"
+ " - call the following command:\n"
+ "$ kedro mlflow init --force",
fg="yellow",
)
)
# flag_erase_runpy = force
# runpy_project_path = (
# project_path
# / "src"
# / project_globals["package_name"]
# / "run.py"
# )
# if not force:
# kedro_path = Path(KEDRO_PATH).parent
# runpy_template_path = (
# kedro_path
# / "templates"
# / "project"
# / "{{ cookiecutter.repo_name }}"
# / "src"
# / "{{ cookiecutter.python_package }}"
# / "run.py"
# )
# kedro_runpy_template = render_jinja_template(
# src=runpy_template_path,
# is_cookiecutter=True,
# python_package=project_globals["package_name"],
# project_name=project_globals["project_name"],
# kedro_version=project_globals["project_version"],
# )

# with open(runpy_project_path, mode="r") as file_handler:
# kedro_runpy_project = file_handler.read()

# # beware : black formatting could change slightly this test which is very strict
# if kedro_runpy_project == kedro_runpy_template:
# flag_erase_runpy = True

# if flag_erase_runpy:
# os.remove(runpy_project_path)
# write_jinja_template(
# src=TEMPLATE_FOLDER_PATH / "run.py",
# dst=runpy_project_path,
# is_cookiecutter=True,
# python_package=project_globals["package_name"],
# project_name=project_globals["project_name"],
# kedro_version=project_globals["project_version"],
# )
# if not silent:
# click.secho(click.style("'run.py' successfully updated", fg="green"))
# else:
# click.secho(
# click.style(
# "You have modified your 'run.py' since project creation.\n"
# + "In order to use kedro-mlflow, you must either:\n"
# + " - set up your run.py with the following instructions :\n"
# + "INSERT_DOC_URL\n"
# + " - call the following command:\n"
# + "$ kedro mlflow init --force",
# fg="yellow",
# )
# )


@mlflow_commands.command()
Expand All @@ -185,7 +187,11 @@ def ui(project_path, env):
"""

    # the context must contain the self.mlflow attributes with the mlflow configuration
mlflow_conf = get_mlflow_config(project_path=project_path, env=env)
if not project_path:
project_path = Path().cwd()

context = load_context(project_path=project_path, env=env)
mlflow_conf = get_mlflow_config(context)

# call mlflow ui with specific options
# TODO : add more options for ui
Expand Down
19 changes: 5 additions & 14 deletions kedro_mlflow/framework/context/mlflow_context.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,14 @@
from pathlib import Path

from kedro.config import ConfigLoader
from kedro.framework.context import KedroContext

from kedro_mlflow.framework.context.config import KedroMlflowConfig


# this could be a read-only property in the context
# with a @property decorator
# but for consistency with hook system, it is an external function
def get_mlflow_config(project_path=None, env="local"):
if project_path is None:
project_path = Path.cwd()
project_path = Path(project_path)
conf_paths = [
str(project_path / "conf" / "base"),
str(project_path / "conf" / env),
]
config_loader = ConfigLoader(conf_paths=conf_paths)
conf_mlflow_yml = config_loader.get("mlflow*", "mlflow*/**")
conf_mlflow = KedroMlflowConfig(project_path=project_path)
def get_mlflow_config(context: KedroContext):
    """Load the plugin configuration from the project's ``mlflow.yml``.

    Uses the context's ``config_loader`` to read every configuration file
    matching ``mlflow*`` / ``mlflow*/**`` and folds the resulting dict into
    a ``KedroMlflowConfig`` rooted at ``context.project_path``.

    Args:
        context: The active ``KedroContext``; only its ``config_loader``
            and ``project_path`` attributes are used here.

    Returns:
        A ``KedroMlflowConfig`` instance populated from the loaded
        ``mlflow*`` configuration files.
    """
    conf_mlflow_yml = context.config_loader.get("mlflow*", "mlflow*/**")
    conf_mlflow = KedroMlflowConfig(context.project_path)
    conf_mlflow.from_dict(conf_mlflow_yml)
    return conf_mlflow
44 changes: 43 additions & 1 deletion kedro_mlflow/framework/hooks/node_hook.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,55 @@
from typing import Any, Dict

import mlflow
from kedro.framework.context import load_context
from kedro.framework.hooks import hook_impl
from kedro.io import DataCatalog
from kedro.pipeline import Pipeline
from kedro.pipeline.node import Node

from kedro_mlflow.framework.context import get_mlflow_config


class MlflowNodeHook:
def __init__(self):
config = get_mlflow_config()
self.context = None
self.flatten = False
self.recursive = True
self.sep = "."

@hook_impl
def before_pipeline_run(
self, run_params: Dict[str, Any], pipeline: Pipeline, catalog: DataCatalog
) -> None:
"""Hook to be invoked before a pipeline runs.
Args:
run_params: The params needed for the given run.
Should be identical to the data logged by Journal.
# @fixme: this needs to be modelled explicitly as code, instead of comment
Schema: {
"run_id": str,
"project_path": str,
"env": str,
"kedro_version": str,
"tags": Optional[List[str]],
"from_nodes": Optional[List[str]],
"to_nodes": Optional[List[str]],
"node_names": Optional[List[str]],
"from_inputs": Optional[List[str]],
"load_versions": Optional[List[str]],
"pipeline_name": str,
"extra_params": Optional[Dict[str, Any]],
}
pipeline: The ``Pipeline`` that will be run.
catalog: The ``DataCatalog`` to be used during the run.
"""

self.context = load_context(
project_path=run_params["project_path"],
env=run_params["env"],
extra_params=run_params["extra_params"],
)
config = get_mlflow_config(self.context)
self.flatten = config.node_hook_opts["flatten_dict_params"]
self.recursive = config.node_hook_opts["recursive"]
self.sep = config.node_hook_opts["sep"]
Expand Down Expand Up @@ -51,6 +90,9 @@ def before_node_run(
mlflow.log_params(params_inputs)


mlflow_node_hooks = MlflowNodeHook()


def flatten_dict(d, recursive: bool = True, sep="."):
def expand(key, value):
if isinstance(value, dict):
Expand Down
15 changes: 13 additions & 2 deletions kedro_mlflow/framework/hooks/pipeline_hook.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import mlflow
import yaml
from kedro.framework.context import load_context
from kedro.framework.hooks import hook_impl
from kedro.io import DataCatalog
from kedro.pipeline import Pipeline
Expand All @@ -17,6 +18,9 @@


class MlflowPipelineHook:
def __init__(self):
self.context = None

@hook_impl
def after_catalog_created(
self,
Expand Down Expand Up @@ -62,9 +66,13 @@ def before_pipeline_run(
pipeline: The ``Pipeline`` that will be run.
catalog: The ``DataCatalog`` to be used during the run.
"""
mlflow_conf = get_mlflow_config(
project_path=run_params["project_path"], env=run_params["env"]
context = load_context(
project_path=run_params["project_path"],
env=run_params["env"],
extra_params=run_params["extra_params"],
)

mlflow_conf = get_mlflow_config(context)
mlflow.set_tracking_uri(mlflow_conf.mlflow_tracking_uri)
# TODO : if the pipeline fails, we need to be able to end stop the mlflow run
# cannot figure out how to do this within hooks
Expand Down Expand Up @@ -177,6 +185,9 @@ def on_pipeline_error(
mlflow.end_run()


mlflow_pipeline_hooks = MlflowPipelineHook()


def _generate_kedro_command(
tags, node_names, from_nodes, to_nodes, from_inputs, load_versions, pipeline_name
):
Expand Down
16 changes: 1 addition & 15 deletions kedro_mlflow/template/project/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,34 +28,20 @@

"""Application entry point."""
from pathlib import Path
from typing import Dict

from kedro.framework.context import KedroContext, load_package_context
from kedro.pipeline import Pipeline

from {{ cookiecutter.python_package }}.pipeline import create_pipelines

from kedro_mlflow.framework.hooks import MlflowNodeHook, MlflowPipelineHook


class ProjectContext(KedroContext):
"""Users can override the remaining methods from the parent class here,
or create new ones (e.g. as required by plugins)
"""

project_name = "{{ cookiecutter.project_name }}"
# `project_version` is the version of kedro used to generate the project
project_version = "{{ cookiecutter.kedro_version }}"
package_name = "{{ cookiecutter.python_package }}"
hooks = (
MlflowNodeHook(),
MlflowPipelineHook(),
)

def _get_pipelines(self) -> Dict[str, Pipeline]:
return create_pipelines()


def run_package():
# Entry point for running a Kedro project packaged with `kedro package`
# using `python -m <project_package>.run` command.
Expand All @@ -66,4 +52,4 @@ def run_package():


if __name__ == "__main__":
run_package()
run_package()
2 changes: 1 addition & 1 deletion requirements/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
mlflow>=1.0.0, <2.0.0
kedro>=0.16.0, <=0.16.4 # 0.16.5 breaks pipeline_ml, template and hooks test
kedro>=0.16.0, <=0.16.5
Loading

0 comments on commit d691b96

Please sign in to comment.