Skip to content

Commit

Permalink
Rework state management for file servers
Browse files Browse the repository at this point in the history
Use the same pattern as the new state management in LabManager:
a dictionary of state objects per user, where the user is never
removed once we've seen them. In this case, the state object only
needs to hold a lock, not a full-blown manager class.

Rather than spawning a separate background task (and Kubernetes
watch) for every running file server, use a single background task
that watches for all pod changes in the namespace. Use that to
catch pod phase changes. Add a background reconcile task to look
for any other problems (such as Kubernetes object deletion) and
clean up after those.

Add Slack reporting of errors in file server operations.
  • Loading branch information
rra committed Nov 1, 2023
1 parent c47c471 commit 175cf1e
Show file tree
Hide file tree
Showing 8 changed files with 342 additions and 101 deletions.
8 changes: 8 additions & 0 deletions src/jupyterlabcontroller/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"DOCKER_SECRETS_PATH",
"DROPDOWN_SENTINEL_VALUE",
"GROUPNAME_REGEX",
"FILE_SERVER_REFRESH_INTERVAL",
"IMAGE_REFRESH_INTERVAL",
"KUBERNETES_DELETE_TIMEOUT",
"KUBERNETES_REQUEST_TIMEOUT",
Expand Down Expand Up @@ -45,6 +46,13 @@
DROPDOWN_SENTINEL_VALUE = "use_image_from_dropdown"
"""Used in the lab form for ``image_list`` when ``image_dropdown`` is used."""

FILE_SERVER_REFRESH_INTERVAL = timedelta(minutes=60)
"""How frequently to refresh file server state from Kubernetes.
This will detect when file servers disappear out from under us, such as being
terminated by Kubernetes node replacements or upgrades.
"""

IMAGE_REFRESH_INTERVAL = timedelta(minutes=5)
"""How frequently to refresh the list of remote and cached images."""

Expand Down
1 change: 1 addition & 0 deletions src/jupyterlabcontroller/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ async def from_config(cls, config: Config) -> Self:
fileserver_storage=FileserverStorage(
kubernetes_client, logger
),
slack_client=slack_client,
logger=logger,
)

Expand Down
13 changes: 12 additions & 1 deletion src/jupyterlabcontroller/models/domain/kubernetes.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from enum import Enum
from typing import Self

from kubernetes_asyncio.client import V1ContainerImage, V1ObjectMeta
from kubernetes_asyncio.client import V1ContainerImage, V1ObjectMeta, V1Pod
from typing_extensions import Protocol

from .docker import DockerReference
Expand Down Expand Up @@ -112,3 +112,14 @@ class PodPhase(str, Enum):
SUCCEEDED = "Succeeded"
FAILED = "Failed"
UNKNOWN = "Unknown"


@dataclass
class PodChange:
"""Represents a change (not creation or deletion) of a pod."""

phase: PodPhase
"""New phase of the pod."""

pod: V1Pod
"""Full object for the pod that changed."""
24 changes: 24 additions & 0 deletions src/jupyterlabcontroller/services/builder/fileserver.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from __future__ import annotations

import re
from typing import Any
from urllib.parse import urlparse

Expand All @@ -12,6 +13,7 @@
V1Job,
V1JobSpec,
V1ObjectMeta,
V1Pod,
V1PodSecurityContext,
V1PodSpec,
V1PodTemplateSpec,
Expand Down Expand Up @@ -96,6 +98,28 @@ def build_name(self, username: str) -> str:
"""
return f"{username}-fs"

def get_username_for_pod(self, pod: V1Pod) -> str | None:
"""Determine the username for a file server pod.
Parameters
----------
pod
Pod object.
Returns
-------
str
Username corresponding to that file server pod, or `None` if no
username information could be found.
"""
labels = pod.metadata.labels
if labels and "nublado.lsst.io/user" in labels:
return labels["nublado.lsst.io/user"]
elif m := re.match("^(.*)-fs$", pod.metadata.name):
return m.group(1)
else:
return None

def is_valid(self, username: str, state: FileserverStateObjects) -> bool:
"""Determine whether a running fileserver is valid.
Expand Down
Loading

0 comments on commit 175cf1e

Please sign in to comment.