Update for publishing

connorferster · connorferster · commit 3eb3b3fa144a · 2025-07-17T11:46:53.000-07:00
diff --git a/README.md b/README.md
@@ -0,0 +1,154 @@
+# Millrun
+
+## A Python library and CLI tool for automating the execution of papermill
+
+### Motivation
+
+Papermill is great: it parameterizes a single notebook for you. Ok, so what about this whole directory of notebooks that I would like to execute with this list of different parameters?
+
+**Millrun** will execute either a single notebook or all of the notebooks in a directory (recursively, if you want), using either a list of alternative parameter dictionaries or a dictionary of lists of variations.
+
+In short, it iterates both over notebooks in a directory AND over lists of parameters.
+
+_When executed as a CLI tool, notebooks are executed in parallel using multiprocessing_.
+
+## Installation
+
+`pip install millrun`
+
+## Usage: Python Library
+
+```python
+import millrun
+
+millrun.execute_run(
+    notebook_dir_or_file: pathlib.Path | str,
+    bulk_params: list | dict,
+    output_dir: Optional[pathlib.Path | str] = None,
+    output_prepend_components: Optional[list[str]] = None,
+    output_append_components: Optional[list[str]] = None,
+    recursive: bool = False,
+    exclude_glob_pattern: Optional[str] = None,
+    include_glob_pattern: Optional[str] = None,
+    use_multiprocessing: bool = False,
+    **kwargs, # kwargs are passed through to papermill
+)
+```
+
+## Usage: CLI tool
+
+```
+millrun --help
+                                                                                                       
+ Usage: millrun [OPTIONS] NOTEBOOK_DIR_OR_FILE PARAMS                                                  
+                                                                                                       
+ Executes a notebook or directory of notebooks using the provided bulk parameters JSON file            
+                                                                                                       
+                                                                                                       
+╭─ Arguments ─────────────────────────────────────────────────────────────────────────────────────────╮
+│ *    notebook_dir_or_file      TEXT  Path to a notebook file or a directory containing notebooks.   │
+│                                      [default: None]                                                │
+│                                      [required]                                                     │
+│ *    params                    TEXT  JSON file that contains parameters for notebook execution. Can │
+│                                      either be a 'list of dict' or 'dict of list'.                  │
+│                                      [default: None]                                                │
+│                                      [required]                                                     │
+╰─────────────────────────────────────────────────────────────────────────────────────────────────────╯
+╭─ Options ───────────────────────────────────────────────────────────────────────────────────────────╮
+│ --output-dir                                TEXT  Directory to place output files into. If not      │
+│                                                   provided the file directory will be used.         │
+│                                                   [default: None]                                   │
+│ --prepend                                   TEXT  Prepend components to use on output filename.Can  │
+│                                                   use dict keys from 'params' which will be         │
+│                                                   evaluated.(Comma-separated values).               │
+│                                                   [default: None]                                   │
+│ --append                                    TEXT  Append components to use on output filename.Can   │
+│                                                   use dict keys from 'params' which will be         │
+│                                                   evaluated.(Comma-separated values).               │
+│                                                   [default: None]                                   │
+│ --recursive               --no-recursive          [default: no-recursive]                           │
+│ --exclude-glob-pattern                      TEXT  [default: None]                                   │
+│ --include-glob-pattern                      TEXT  [default: None]                                   │
+│ --help                                            Show this message and exit.                       │
+╰─────────────────────────────────────────────────────────────────────────────────────────────────────╯
+
+```
+
+### Example
+
+While the `prepend` argument is optional, it is highly recommended that you take advantage of it. If you do not, your output file names will be automatically prepended with an integer index to differentiate the output files.
+
+```
+millrun ./Notebooks_Dir params.json --prepend id_key_in_params
+```
+
+Where `id_key_in_params` is one of the keys in your params.json that you can use to uniquely identify each iteration. If you do not have a single unique key, you can provide a list of keys and they will all be prepended:
+
+Let's say my params.json looks like this:
+
+```json
+{
+    "x_values": [0, 1, 2],
+    "y_values": [45, 32, 60]
+}
+
+```
+
+I could execute like this:
+
+```
+millrun ./Notebooks_Dir params.json --prepend x_values,y_values,results
+```
+
+And my output files would look like:
+
+```
+0-45-results-special_calculation.ipynb
+1-32-results-special_calculation.ipynb
+2-60-results-special_calculation.ipynb
+```
+
+**Notice**: Since "results" was not a key in my params.json, it gets passed through as a string literal.
+
+## Organizing your parameters
+
+You can have your parameters dictionary/JSON in one of two formats:
+
+### Format 1: A list of dicts
+
+```python
+[
+    {"param1": 0, "param2": "hat", "param3": 21.2},
+    {"param1": 1, "param2": "cat", "param3": 34.3},
+    {"param1": 2, "param2": "bat", "param3": 200.0}
+]
+```
+
+Where each notebook given to millrun will execute against each dictionary in the list.
+
+
+### Format 2: A dict of lists
+
+```python
+{
+    "param1": [0, 1, 2],
+    "param2": ["hat", "cat", "bat"],
+    "param3": [21.2, 34.3, 200.0]
+}
+```
+
+This format is offered as a convenience format. Internally, it is converted into "Format 1" prior to execution.
+
+
+## CLI parallel execution
+
+Since millrun iterates over two dimensions (each notebook and then each dict of parameters in the list), there are two ways of parallelizing:
+
+1. Execute each notebook in sequence and parallelize the execution of the different parameter variations
+2. Execute each notebook in parallel and sequentialize the execution of the different parameter variations
+
+Because of my own personal use cases, it is more efficient for me to use **1.** because I have way more parameter variations than I do notebooks. 
+
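+However, this method becomes inefficient if you have MANY notebooks and only 1-3 variations. In that case, you would probably prefer method **2.** Even so, method **1.** is still faster than single-process execution (like you get by default when calling `millrun.execute_run` from Python, where `use_multiprocessing` defaults to `False`).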
+
+If you need this use case then feel free to raise an issue and/or contribute a PR to implement it as an option for execution.
diff --git a/src/millrun/__init__.py b/src/millrun/__init__.py
@@ -1,2 +1,8 @@
-def main() -> None:
-    print("Hello from millrun!")
+"""
+Millrun: A Python library and CLI tool to automate the execution of notebooks
+with papermill. 
+"""
+
+__version__ = "0.1.0"
+
+from .millrun import execute_run
diff --git a/src/millrun/cli.py b/src/millrun/cli.py
@@ -4,7 +4,7 @@
 import pathlib
 
 import typer
-from .millrun import execute_batch
+from .millrun import execute_run
 
 
 def _parse_json(filepath: str) -> dict:
@@ -69,7 +69,7 @@ def run(
         output_dir = pathlib.Path(output_dir)
     else:
         output_dir = pathlib.Path.cwd()
-    execute_batch(
+    execute_run(
         notebook_dir_or_file,
         params,
         output_dir,
diff --git a/src/millrun/millrun.py b/src/millrun/millrun.py
@@ -9,7 +9,7 @@
 
 
 
-def execute_batch(
+def execute_run(
     notebook_dir_or_file: pathlib.Path | str,
     bulk_params: list | dict,
     output_dir: Optional[pathlib.Path | str] = None,
@@ -90,15 +90,15 @@ def execute_batch(
         output_dir = notebook_dir
     if not output_dir.exists():
         output_dir.mkdir(exist_ok=True, parents=True)
-
     if notebook_filename is not None:
         execute_notebooks(
             notebook_dir / notebook_filename,
             bulk_params_list,
             output_prepend_components,
             output_append_components,
             output_dir,
-            use_multiprocessing
+            use_multiprocessing,
+            **kwargs
         )
     else:
         glob_method = notebook_dir.glob
@@ -115,19 +115,17 @@ def execute_batch(
         included_paths = set(glob_method(glob_pattern))
 
         notebook_paths = sorted(included_paths - excluded_paths)
-
         for notebook_path in notebook_paths:
+
             execute_notebooks(
                 notebook_path,
                 bulk_params_list,
                 output_prepend_components,
                 output_append_components,
                 output_dir,
                 use_multiprocessing,
-            )
-            # Multiprocessing approach inspired by 
-            # https://www.deanmontgomery.com/2022/03/24/rich-progress-and-multiprocessing/
-            
+                **kwargs
+            )            
 
 
 
@@ -163,35 +161,6 @@ def convert_bulk_params_to_list(bulk_params: dict[str, list]):
     return bulk_params_list
 
 
-def get_output_name(
-    notebook_filename: str,
-    output_prepend_components: list[str] | None,
-    output_append_components: list[str] | None,
-    notebook_params: dict[str, Any]
-) -> str:
-    """
-    Returns the output name given the included components.
-    """
-    if output_prepend_components is None:
-        output_prepend_components = []
-    if output_append_components is None:
-        output_append_components = []
-    prepends = [notebook_params[comp] for comp in output_prepend_components]
-    appends = [notebook_params[comp] for comp in output_append_components]
-    prepend_str = "-".join(prepends)
-    append_str = "-".join(appends)
-    notebook_filename = pathlib.Path(notebook_filename)
-    return "-".join([elem for elem in [prepend_str, notebook_filename.stem, append_str] if elem]) + notebook_filename.suffix
-
-                                        # notebook_path,
-                                        # bulk_params_list,
-                                        # output_prepend_components,
-                                        # output_append_components,
-                                        # output_dir,
-                                        # _progress, 
-                                        # task_id
-
-
 def execute_notebooks(
     notebook_path: pathlib.Path,
     bulk_params_list: dict[str, Any],
@@ -229,6 +198,8 @@ def execute_notebooks(
             TimeElapsedColumn(),
             refresh_per_second=1,  # bit slower updates
         ) as progress:
+            # Multiprocessing approach inspired by 
+            # https://www.deanmontgomery.com/2022/03/24/rich-progress-and-multiprocessing/
             futures = []  # keep track of the jobs
             with multiprocessing.Manager() as manager:
                 _progress = manager.dict()
@@ -275,7 +246,8 @@ def execute_notebook(
         notebook_filename, 
         output_prepend_components, 
         output_append_components,
-        notebook_params
+        notebook_params,
+        current_iteration
     )
     pm.execute_notebook(
         notebook_filename,
@@ -286,4 +258,27 @@ def execute_notebook(
         **kwargs
     )
     if _progress is not None:
-        _progress[_task_id] = {"progress": current_iteration / total_variations, "total": total_variations}    
+        _progress[_task_id] = {"progress": current_iteration, "total": total_variations}    
+
+
+def get_output_name(
+    notebook_filename: str,
+    output_prepend_components: list[str] | None,
+    output_append_components: list[str] | None,
+    notebook_params: dict[str, Any],
+    current_index: int
+) -> str:
+    """
+    Returns the output file name built from the included components.
+
+    The name has the form "<prepends>-<notebook stem>-<appends><suffix>",
+    with empty sections omitted.
+
+    'notebook_filename': name (or path) of the notebook; only its stem and
+        suffix are used.
+    'output_prepend_components': components placed before the stem. If None,
+        'current_index' is used as the sole prepend component.
+    'output_append_components': components placed after the stem. If None,
+        nothing is appended.
+    'notebook_params': parameters for this iteration. A component that is a
+        key in this dict is replaced by the corresponding value; any other
+        component is passed through as a string literal.
+    'current_index': index of the current iteration.
+    """
+    if output_prepend_components is None:
+        output_prepend_components = [str(current_index)]
+    if output_append_components is None:
+        output_append_components = []
+
+    def resolve(components: list[str]) -> str:
+        # Parameter values are frequently not strings (e.g. ints and floats
+        # loaded from JSON), so convert each one before joining; str.join
+        # raises TypeError on non-str items.
+        return "-".join(str(notebook_params.get(comp, comp)) for comp in components)
+
+    notebook_path = pathlib.Path(notebook_filename)
+    sections = [
+        resolve(output_prepend_components),
+        notebook_path.stem,
+        resolve(output_append_components),
+    ]
+    return "-".join(section for section in sections if section) + notebook_path.suffix