Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DM-45079: Execute only non-filtered notebooks when requesting explicit notebooks #363

Merged
merged 3 commits into from
Jul 23, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 81 additions & 18 deletions src/mobu/models/business/notebookrunner.py
Original file line number Diff line number Diff line change
@@ -12,14 +12,44 @@
from .nublado import NubladoBusinessData, NubladoBusinessOptions

__all__ = [
"ListNotebookRunnerOptions",
"NotebookFilterResults",
"NotebookRunnerConfig",
"NotebookRunnerData",
"NotebookRunnerOptions",
]


class NotebookRunnerOptions(NubladoBusinessOptions):
"""Options for NotebookRunner monkey business."""
class BaseNotebookRunnerOptions(NubladoBusinessOptions):
    """Options common to every flavor of NotebookRunner monkey business.

    Describes which notebook repository (URL and Git ref) to execute and
    which directories inside it should be skipped.
    """

    repo_ref: str = Field(
        default=NOTEBOOK_REPO_BRANCH,
        title="Git ref of notebook repository to execute",
        description="Only used by the NotebookRunner",
        examples=["main", "03cd564dd2025bf17054d9ebfeeb5c5a266e3484"],
    )

    repo_url: str = Field(
        default=NOTEBOOK_REPO_URL,
        title="Git URL of notebook repository to execute",
        description="Only used by the NotebookRunner",
    )

    # Paths are interpreted relative to the repository root (see the
    # field description); child directories are excluded as well.
    exclude_dirs: set[Path] = Field(
        default=set(),
        title="Any notebooks in these directories will not be run",
        description=(
            " These directories are relative to the repo root. Any notebooks"
            " in child directories of these directories will also be excluded."
            " Only used by the NotebookRunner."
        ),
        examples=["some-dir", "some-dir/some-other-dir"],
    )


class NotebookRunnerOptions(BaseNotebookRunnerOptions):
"""Options to specify a fixed number of notebooks to run per session."""

max_executions: int = Field(
25,
@@ -37,26 +67,14 @@ class NotebookRunnerOptions(NubladoBusinessOptions):
ge=1,
)

repo_ref: str = Field(
NOTEBOOK_REPO_BRANCH,
title="Git ref of notebook repository to execute",
description="Only used by the NotebookRunner",
examples=["main", "03cd564dd2025bf17054d9ebfeeb5c5a266e3484"],
)

repo_url: str = Field(
NOTEBOOK_REPO_URL,
title="Git URL of notebook repository to execute",
description="Only used by the NotebookRunner",
)
class ListNotebookRunnerOptions(BaseNotebookRunnerOptions):
    """Options selecting an explicit list of notebooks for each session."""

    notebooks_to_run: list[Path] = Field(
        default=[],
        title="Specific notebooks to run",
        description="Only these specific notebooks will be executed.",
    )


@@ -67,7 +85,7 @@ class NotebookRunnerConfig(BusinessConfig):
..., title="Type of business to run"
)

options: NotebookRunnerOptions = Field(
options: NotebookRunnerOptions | ListNotebookRunnerOptions = Field(
default_factory=NotebookRunnerOptions,
title="Options for the monkey business",
)
@@ -104,3 +122,48 @@ class NotebookMetadata(BaseModel):
),
examples=[{"tap", "ssotap", "butler"}],
)


class NotebookFilterResults(BaseModel):
    """Valid notebooks and categories for invalid notebooks.

    Each field is a set of notebook paths. ``runnable`` holds the notebooks
    left after filtering; each ``excluded_by_*`` field records one reason a
    notebook was filtered out.
    """

    # NOTE: the field name shadows the ``all`` builtin inside the class
    # body; it is part of the model's public interface, so it is kept.
    all: set[Path] = Field(
        default=set(),
        title="All notebooks",
        description="All notebooks in the repository",
    )

    runnable: set[Path] = Field(
        default=set(),
        title="Runnable notebooks",
        description=(
            "These are the notebooks to run after all filtering has been done"
        ),
    )

    excluded_by_dir: set[Path] = Field(
        default=set(),
        title="Excluded by directory",
        description=(
            "These notebooks won't be run because they are in a directory that"
            " is explicitly excluded"
        ),
    )

    excluded_by_service: set[Path] = Field(
        default=set(),
        title="Excluded by service availability",
        description=(
            "These notebooks won't be run because they depend on services"
            " which are not available in this environment"
        ),
    )

    excluded_by_requested: set[Path] = Field(
        default=set(),
        title="Excluded by explicit list",
        description=(
            "These notebooks won't be run because a list of explicitly"
            " requested notebooks was provided, and they weren't in it."
        ),
    )
100 changes: 72 additions & 28 deletions src/mobu/services/business/notebookrunner.py
Original file line number Diff line number Diff line change
@@ -23,6 +23,8 @@
from ...constants import GITHUB_REPO_CONFIG_PATH
from ...exceptions import NotebookRepositoryError, RepositoryConfigError
from ...models.business.notebookrunner import (
ListNotebookRunnerOptions,
NotebookFilterResults,
NotebookMetadata,
NotebookRunnerData,
NotebookRunnerOptions,
@@ -53,7 +55,7 @@ class NotebookRunner(NubladoBusiness):

def __init__(
self,
options: NotebookRunnerOptions,
options: NotebookRunnerOptions | ListNotebookRunnerOptions,
fajpunk marked this conversation as resolved.
Show resolved Hide resolved
user: AuthenticatedUser,
http_client: AsyncClient,
logger: BoundLogger,
@@ -65,6 +67,14 @@ def __init__(
self._exclude_paths: set[Path] = set()
self._running_code: str | None = None
self._git = Git(logger=logger)
self._max_executions: int | None = None
self._notebooks_to_run: list[Path] | None = None

match options:
case NotebookRunnerOptions(max_executions=max_executions):
self._max_executions = max_executions
case ListNotebookRunnerOptions(notebooks_to_run=notebooks_to_run):
self._notebooks_to_run = notebooks_to_run

def annotations(self, cell_id: str | None = None) -> dict[str, str]:
result = super().annotations()
@@ -81,8 +91,15 @@ async def startup(self) -> None:
async def cleanup(self) -> None:
    """Delete the checked-out repository and reset the cached state.

    Safe to call when no repository directory is set: in that case
    nothing is removed from disk and only the attributes are reset.
    """
    if self._repo_dir is not None:
        # Without this guard, str(None) would make rmtree target a literal
        # "None" path relative to the current working directory.
        shutil.rmtree(str(self._repo_dir))
    self._repo_dir = None
    self._notebook_filter_results = None

async def initialize(self) -> None:
"""Prepare to run the business.

* Check out the repository
* Parse the in-repo config
* Filter the notebooks
"""
if self._repo_dir is None:
self._repo_dir = Path(TemporaryDirectory(delete=False).name)
await self.clone_repo()
@@ -106,6 +123,7 @@ async def initialize(self) -> None:

exclude_dirs = repo_config.exclude_dirs
self._exclude_paths = {self._repo_dir / path for path in exclude_dirs}
self._notebooks = self.find_notebooks()
self.logger.info("Repository cloned and ready")

async def shutdown(self) -> None:
@@ -149,46 +167,67 @@ def missing_services(self, notebook: Path) -> bool:
return True
return False

def find_notebooks(self) -> NotebookFilterResults:
    """Discover notebooks in the checked-out repo and classify them.

    Every ``*.ipynb`` file under the repository directory is collected,
    then sorted into exclusion categories: excluded by directory, excluded
    because of missing services, and (only when an explicit list of
    notebooks was requested) excluded because it was not requested.
    Whatever remains is recorded as runnable.

    Returns
    -------
    NotebookFilterResults
        All notebooks found, the runnable subset, and one set per
        exclusion reason.

    Raises
    ------
    NotebookRepositoryError
        If the repository directory is not set, if the repository
        contains no notebooks at all, or if an explicitly requested
        notebook does not exist in the repository.
    """
    with self.timings.start("find_notebooks"):
        if self._repo_dir is None:
            raise NotebookRepositoryError(
                "Repository directory must be set", self.user.username
            )

        all_notebooks = set(self._repo_dir.glob("**/*.ipynb"))
        if not all_notebooks:
            # Must be an f-string, otherwise the placeholder is emitted
            # literally instead of the directory path.
            msg = f"No notebooks found in {self._repo_dir}"
            raise NotebookRepositoryError(msg, self.user.username)

        filter_results = NotebookFilterResults(all=all_notebooks)
        filter_results.excluded_by_dir = {
            n for n in filter_results.all if self.is_excluded(n)
        }
        filter_results.excluded_by_service = {
            n for n in filter_results.all if self.missing_services(n)
        }

        if self._notebooks_to_run:
            # Requested paths are relative to the repo root; anchor them
            # so they compare equal to the globbed paths.
            requested = {
                self._repo_dir / notebook
                for notebook in self._notebooks_to_run
            }
            not_found = requested - filter_results.all
            if not_found:
                msg = (
                    "Requested notebooks do not exist in"
                    f" {self._repo_dir}: {not_found}"
                )
                raise NotebookRepositoryError(msg, self.user.username)
            filter_results.excluded_by_requested = (
                filter_results.all - requested
            )

        filter_results.runnable = (
            filter_results.all
            - filter_results.excluded_by_service
            - filter_results.excluded_by_dir
            - filter_results.excluded_by_requested
        )
        # An empty runnable set is reported as a warning, not an error.
        if filter_results.runnable:
            self.logger.info(
                "Found notebooks to run",
                filter_results=filter_results.model_dump(),
            )
        else:
            self.logger.warning(
                "No notebooks to run after filtering!",
                filter_results=filter_results.model_dump(),
            )

        return filter_results

def next_notebook(self) -> Path:
    """Return the next notebook to execute.

    Filter results are computed on first use. While the pending queue
    still holds entries, the last one is popped; once it is empty, the
    queue is rebuilt from the runnable notebooks in random order before
    popping.
    """
    if not self._notebooks:
        self._notebooks = self.find_notebooks()

    if self._notebook_paths:
        return self._notebook_paths.pop()

    # Queue exhausted (or never built): refill from the runnable set.
    queue = list(self._notebooks.runnable)
    random.shuffle(queue)
    self._notebook_paths = queue
    return queue.pop()

def read_notebook_metadata(self, notebook: Path) -> NotebookMetadata:
@@ -238,14 +277,19 @@ async def open_session(
yield session

async def execute_code(self, session: JupyterLabSession) -> None:
for count in range(self.options.max_executions):
"""Run a set number of notebooks (flocks), or all available (CI)."""
if self._max_executions:
num_executions = self._max_executions
else:
num_executions = len(self._notebooks.runnable)
for count in range(num_executions):
if self.refreshing:
await self.refresh()
return

self._notebook = self.next_notebook()

iteration = f"{count + 1}/{self.options.max_executions}"
iteration = f"{count + 1}/{num_executions}"
msg = f"Notebook {self._notebook.name} iteration {iteration}"
self.logger.info(msg)

10 changes: 7 additions & 3 deletions src/mobu/services/github_ci/ci_notebook_job.py
Original file line number Diff line number Diff line change
@@ -6,8 +6,8 @@
from structlog.stdlib import BoundLogger

from mobu.models.business.notebookrunner import (
ListNotebookRunnerOptions,
NotebookRunnerConfig,
NotebookRunnerOptions,
)
from mobu.models.solitary import SolitaryConfig
from mobu.models.user import User
@@ -73,15 +73,19 @@ async def run(self, user: User, scopes: list[str]) -> None:
# Run notebooks using a Solitary runner
summary = "Running these notebooks via Mobu:\n" + "\n".join(
[f"* {notebook}" for notebook in self._notebooks]
+ [
"Note that not all of these may run. Some may be exluded based"
" on config in the repo:"
" https://mobu.lsst.io/user_guide/in_repo_config.html"
]
)
await self.check_run.start(summary=summary)
solitary_config = SolitaryConfig(
user=user,
scopes=[str(scope) for scope in scopes],
business=NotebookRunnerConfig(
type="NotebookRunner",
options=NotebookRunnerOptions(
max_executions=len(self._notebooks),
options=ListNotebookRunnerOptions(
repo_ref=self._github.ref,
repo_url=f"https://github.com/{self._github.repo_owner}/{self._github.repo_name}.git",
notebooks_to_run=self._notebooks,
Loading