diff --git a/docs/lifecycle.dot b/docs/lifecycle.dot
index c499dce21f8..961aca6e5cc 100644
--- a/docs/lifecycle.dot
+++ b/docs/lifecycle.dot
@@ -41,6 +41,7 @@ digraph Metaflow {
     validate_dag [label="{graph|validate}", fillcolor=lightpink2]
     init_environment [label="{environment|init_environment}", fillcolor=palegreen2]
     package_init [label="{decorator|package_init}", fillcolor=lightblue2]
+    add_custom_package [label="{decorator|add_to_package}", fillcolor=lightblue2]
     add_to_package [label="{environment|add_to_package}", fillcolor=palegreen2]
     package [label="{package|create}", fillcolor=lightpink2]
   }
@@ -148,7 +149,8 @@ digraph Metaflow {
   /* package */
   validate_dag -> init_environment
   init_environment -> package_init
-  package_init -> add_to_package
+  package_init -> add_custom_package
+  add_custom_package -> add_to_package
   add_to_package -> package
   package -> command_run
   package -> stepfunctions_create
diff --git a/docs/lifecycle.png b/docs/lifecycle.png
index c2cc3fd1341..a412b651af9 100644
Binary files a/docs/lifecycle.png and b/docs/lifecycle.png differ
diff --git a/metaflow/decorators.py b/metaflow/decorators.py
index 241124db3b8..b977d715ea9 100644
--- a/metaflow/decorators.py
+++ b/metaflow/decorators.py
@@ -239,6 +239,20 @@ def package_init(self, flow, step_name, environment):
         """
         pass
 
+    def add_to_package(self):
+        """
+        Called to add custom packages needed for a decorator. This hook will be
+        called in the `MetaflowPackage` class where metaflow compiles the code package
+        tarball. This hook is invoked in the `MetaflowPackage`'s `path_tuples`
+        function. The `path_tuples` function is a generator that yields a tuple of
+        `(file_path, arcname)`. `file_path` is the path of the file in the local file system;
+        the `arcname` is the path of the file in the constructed tarball or the path of the file
+        after decompressing the tarball.
+
+        Returns a list of tuples where each tuple represents (file_path, arcname)
+        """
+        return []
+
     def step_task_retry_count(self):
         """
         Called to determine the number of times this task should be retried.
diff --git a/metaflow/package.py b/metaflow/package.py
index d181705284e..7f7082c1f39 100644
--- a/metaflow/package.py
+++ b/metaflow/package.py
@@ -7,12 +7,25 @@
 from itertools import chain
 
 from .metaflow_config import DEFAULT_PACKAGE_SUFFIXES
+from .exception import MetaflowException
 from .util import to_unicode
 from . import R
 
 DEFAULT_SUFFIXES_LIST = DEFAULT_PACKAGE_SUFFIXES.split(",")
 
 
+class NonUniqueFileNameToFilePathMappingException(MetaflowException):
+    headline = "Non Unique file path for a file name included in code package"
+
+    def __init__(self, filename, file_paths, lineno=None):
+        msg = (
+            "Filename %s included in the code package includes multiple different paths for the same name : %s.\n"
+            "The `filename` in the `add_to_package` decorator hook requires a unique `file_path` to `file_name` mapping"
+            % (filename, ", ".join(file_paths))
+        )
+        super().__init__(msg=msg, lineno=lineno)
+
+
 class MetaflowPackage(object):
     def __init__(self, flow, environment, echo, suffixes=DEFAULT_SUFFIXES_LIST):
         self.suffixes = list(set().union(suffixes, DEFAULT_SUFFIXES_LIST))
@@ -31,6 +44,7 @@ def __init__(self, flow, environment, echo, suffixes=DEFAULT_SUFFIXES_LIST):
         )
 
         self.flow_name = flow.name
+        self._flow = flow
         self.create_time = time.time()
         environment.init_environment(echo)
         for step in flow:
@@ -75,6 +89,23 @@ def path_tuples(self):
                 addl_suffixes=self.metaflow_extensions_addl_suffixes,
             ):
                 yield path_tuple
+
+        # Any custom packages exposed via decorators
+        deco_module_paths = {}
+        for step in self._flow:
+            for deco in step.decorators:
+                for path_tuple in deco.add_to_package():
+                    file_path, file_name = path_tuple
+                    # Check if the path is not duplicated as
+                    # many steps can have the same packages being imported
+                    if file_name not in deco_module_paths:
+                        deco_module_paths[file_name] = file_path
+                        yield path_tuple
+                    elif deco_module_paths[file_name] != file_path:
+                        raise NonUniqueFileNameToFilePathMappingException(
+                            file_name, [deco_module_paths[file_name], file_path]
+                        )
+
         # the package folders for environment
         for path_tuple in self.environment.add_to_package():
             yield path_tuple