Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Emit events from the Contents Service #954

Merged
merged 4 commits into from
Aug 31, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions jupyter_server/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""The Jupyter Server"""
import os
import pathlib
import subprocess
import sys

Expand All @@ -10,6 +11,8 @@
]

DEFAULT_JUPYTER_SERVER_PORT = 8888
JUPYTER_SERVER_EVENTS_URI = "https://events.jupyter.org/jupyter_server"
DEFAULT_EVENTS_SCHEMA_PATH = pathlib.Path(__file__).parent / "event_schemas"

del os

Expand Down
73 changes: 73 additions & 0 deletions jupyter_server/event_schemas/contents_service/v1.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
"$id": https://events.jupyter.org/jupyter_server/contents_service/v1
version: 1
title: Contents Manager activities
personal-data: true
description: |
Record actions on files via the ContentsManager.

The notebook ContentsManager REST API is used by all frontends to retrieve,
save, list, delete and perform other actions on notebooks, directories,
and other files through the UI. This is pluggable - the default acts on
the file system, but can be replaced with a different ContentsManager
implementation - to work on S3, Postgres, other object stores, etc.
The events get recorded regardless of the ContentsManager implementation
being used.

Limitations:

1. This does not record all filesystem access, just the ones that happen
explicitly via the notebook server's REST API. Users can (and often do)
trivially access the filesystem in many other ways (such as `open()` calls
in their code), so this is rarely a complete record.
2. As with all events recorded by the notebook server, users most likely
have the ability to modify the code of the notebook server. Unless other
security measures are in place, these events should be treated as user
controlled and not used in high security areas.
3. Events are only recorded when an action succeeds.
type: object
required:
- action
- path
properties:
action:
enum:
- get
- create
- save
- upload
- rename
- copy
- delete
description: |
Action performed by the ContentsManager API.

This is a required field.

Possible values:

1. get
Get contents of a particular file, or list contents of a directory.

2. save
Save a file at path with contents from the client

3. rename
Rename a file or directory from value in source_path to
value in path.

4. copy
Copy a file or directory from value in source_path to
value in path.

5. delete
Delete a file or empty directory at given path
path:
type: string
description: |
Logical path on which the operation was performed.

This is a required field.
source_path:
type: string
description: |
Source path of an operation when action is 'copy' or 'rename'
15 changes: 15 additions & 0 deletions jupyter_server/serverapp.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,9 +83,11 @@
from traitlets.config.application import boolean_flag, catch_config_error

from jupyter_server import (
DEFAULT_EVENTS_SCHEMA_PATH,
DEFAULT_JUPYTER_SERVER_PORT,
DEFAULT_STATIC_FILES_PATH,
DEFAULT_TEMPLATE_PATH_LIST,
JUPYTER_SERVER_EVENTS_URI,
__version__,
)
from jupyter_server._sysinfo import get_sys_info
Expand Down Expand Up @@ -1951,6 +1953,19 @@ def init_logging(self):
def init_event_logger(self):
    """Initialize the Event Bus and register the core event schemas.

    Creates the server's ``EventLogger`` (with this application as its
    parent) and registers every core Jupyter Server schema listed below.
    Each schema file is looked up under ``DEFAULT_EVENTS_SCHEMA_PATH`` at
    the location given by the part of the schema ID that follows
    ``JUPYTER_SERVER_EVENTS_URI``, with a ``.yaml`` suffix appended.

    Raises
    ------
    ValueError
        If a schema ID does not start with ``JUPYTER_SERVER_EVENTS_URI``.
    """
    self.event_logger = EventLogger(parent=self)
    # Load the core Jupyter Server event schemas.
    # All event schemas must start with Jupyter Server's
    # events URI, `JUPYTER_SERVER_EVENTS_URI`.
    schema_ids = [
        "https://events.jupyter.org/jupyter_server/contents_service/v1",
    ]
    # Strip the URI as a *prefix*. `str.lstrip()` takes a set of characters,
    # so it would keep eating leading characters of the relative part that
    # happen to appear in the URI (e.g. an ID ending in `/gateway_client/v1`
    # would lose its leading "g"). The trailing "/" is included so that the
    # remainder is a relative path and joins below the schema directory.
    uri_prefix = JUPYTER_SERVER_EVENTS_URI + "/"
    for schema_id in schema_ids:
        if not schema_id.startswith(uri_prefix):
            raise ValueError(
                "Event schema ID %r does not start with %r" % (schema_id, uri_prefix)
            )
        # Get the schema path from the schema ID.
        rel_schema_path = schema_id[len(uri_prefix) :] + ".yaml"
        schema_path = DEFAULT_EVENTS_SCHEMA_PATH / rel_schema_path
        # Use this pathlib object to register the schema.
        self.event_logger.register_event_schema(schema_path)
Zsailer marked this conversation as resolved.
Show resolved Hide resolved

def init_webapp(self):
"""initialize tornado webapp"""
Expand Down
6 changes: 4 additions & 2 deletions jupyter_server/services/contents/filemanager.py
Original file line number Diff line number Diff line change
Expand Up @@ -395,6 +395,7 @@ def get(self, path, content=True, type=None, format=None):
if type == "directory":
raise web.HTTPError(400, "%s is not a directory" % path, reason="bad type")
model = self._file_model(path, content=content, format=format)
self.emit(data={"action": "get", "path": path})
Copy link
Contributor

@dlqqq dlqqq Aug 29, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have a vague performance concern about emitting an event on every get and save. These are by far the most common ContentsManager actions called by clients. In my opinion, we should only emit events that have demonstrated a developer need. For my File ID manager, I do not need these events. If there are no known uses for these events, I suggest we remove these events entirely rather than attempt to anticipate future uses at the cost of performance.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In my opinion, we should only emit events that have demonstrated a developer need

I think this is too stringent of a condition.

Events certainly have use-cases outside of just developers. Operators/admins look to use this event system to audit user activity—in fact, this was the original demand driving the jupyter events/telemetry efforts mentioned in this Jupyter Telemetry enhancement proposal (jupyter/enhancement-proposals#41).

If this work is causing a noticeable performance degradation, we should address it rather than limit the usage of the event system for operators.

Also, keep in mind, in scenarios where there are no handlers or listeners, the .emit method immediately returns: https://github.com/jupyter/jupyter_events/blob/50746633a2adc7e41e2e0e1b0631db10ffb165db/jupyter_events/logger.py#L337-L342

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I think we actually are in agreement; I forgot Jupyter maintains a distinction between developers and operators, and I was using the word "developer" to mean both. Yes, we should definitely be conscious of performance when building this, though I'm sure it's quite difficult to measure.

Also, keep in mind, in scenarios where there are no handlers or listeners, the .emit method immediately returns:

I took a look at that logic, and I'm not sure if that's sufficient. That checks if there are any handlers/listeners attached to the current event logger, meaning if I add a listener to another schema or another event type, then a Contents Manager event still performs a deep copy, validation, and building an event capsule only to do nothing with it.

Of course, this is out of scope for the PR. I'll make an issue of this in Jupyter Events for further discussion.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I think we actually are in agreement; I forgot Jupyter maintains a distinction between developers and operators, and I was using the word "developer" to mean both. Yes, we should definitely be conscious of performance when building this, though I'm sure it's quite difficult to measure.

I think the main point is that Jupyter Server should reach a future state where it provides high volumes of fine-grained, structured event data from most areas across Jupyter Server, allowing operators to get detailed information about what's happening in a running server. In many deployment scenarios, it's required that operators keep a detailed log of everything happening in the application.

If performance is the concern, the answer isn't to avoid adding events; it's to improve performance. I think we can do it! 😎

I took a look at that logic, and I'm not sure if that's sufficient. That checks if there are any handlers/listeners attached to the current event logger, meaning if I add a listener to another schema or another event type, then a Contents Manager event still performs a deep copy, validation, and building an event capsule only to do nothing with it.

Yeah, that's right. I think we should update jupyter_events to check if the specific event is being watched by any listeners.

This is a little trickier (not impossible) to do with handlers, since we don't keep a mapping of handlers to their specific events (consequence of using Python's logging libraries for the main EventLogger). All handlers listen to all events, unless someone adds a logging.Filter object to look for specific events. The challenge is that we can't inspect the filter objects easily. I have some ideas on how we can solve this issue in jupyter_events, but this shouldn't block the PR here.

return model

def _save_directory(self, os_path, model, path=""):
Expand Down Expand Up @@ -459,7 +460,7 @@ def save(self, model, path=""):
model["message"] = validation_message

self.run_post_save_hooks(model=model, os_path=os_path)

self.emit(data={"action": "save", "path": path})
return model

def delete_file(self, path):
Expand Down Expand Up @@ -735,6 +736,7 @@ async def get(self, path, content=True, type=None, format=None):
if type == "directory":
raise web.HTTPError(400, "%s is not a directory" % path, reason="bad type")
model = await self._file_model(path, content=content, format=format)
self.emit(data={"action": "get", "path": path})
return model

async def _save_directory(self, os_path, model, path=""):
Expand Down Expand Up @@ -795,7 +797,7 @@ async def save(self, model, path=""):
model["message"] = validation_message

self.run_post_save_hooks(model=model, os_path=os_path)

self.emit(data={"action": "save", "path": path})
return model

async def delete_file(self, path):
Expand Down
3 changes: 3 additions & 0 deletions jupyter_server/services/contents/largefilemanager.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ def save(self, model, path=""):
# Last chunk
if chunk == -1:
self.run_post_save_hooks(model=model, os_path=os_path)
self.emit(data={"action": "save", "path": path})
return model
else:
return super().save(model, path)
Expand Down Expand Up @@ -125,6 +126,8 @@ async def save(self, model, path=""):
# Last chunk
if chunk == -1:
self.run_post_save_hooks(model=model, os_path=os_path)

self.emit(data={"action": "save", "path": path})
return model
else:
return await super().save(model, path)
Expand Down
26 changes: 26 additions & 0 deletions jupyter_server/services/contents/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import warnings
from fnmatch import fnmatch

from jupyter_events import EventLogger
from nbformat import ValidationError, sign
from nbformat import validate as validate_nb
from nbformat.v4 import new_notebook
Expand All @@ -25,6 +26,7 @@
)
from traitlets.config.configurable import LoggingConfigurable

from jupyter_server import DEFAULT_EVENTS_SCHEMA_PATH, JUPYTER_SERVER_EVENTS_URI
from jupyter_server.transutils import _i18n
from jupyter_server.utils import ensure_async, import_item

Expand Down Expand Up @@ -53,6 +55,24 @@ class ContentsManager(LoggingConfigurable):

"""

# ID of the event schema used for every event emitted through `emit()`:
# the Jupyter Server events URI followed by "/contents_service/v1".
event_schema_id = JUPYTER_SERVER_EVENTS_URI + "/contents_service/v1"
# EventLogger instance that `emit()` forwards events to; declared as a
# traitlets `Instance` trait and tagged `config=True`.
event_logger = Instance(EventLogger).tag(config=True)

@default("event_logger")
def _default_event_logger(self):
    """Provide the default value for the ``event_logger`` trait.

    The parent's ``event_logger`` is reused when this manager has a truthy
    parent exposing that attribute. Otherwise a fresh ``EventLogger`` is
    created, the contents-service v1 schema is registered on it, and that
    logger is returned.
    """
    owner = self.parent
    if not owner or not hasattr(owner, "event_logger"):
        # No usable parent logger: build a standalone one that knows the
        # contents-service schema.
        standalone = EventLogger()
        standalone.register_event_schema(
            DEFAULT_EVENTS_SCHEMA_PATH / "contents_service" / "v1.yaml"
        )
        return standalone
    return owner.event_logger

def emit(self, data):
    """Forward ``data`` to the event logger under this manager's schema ID.

    ``data`` is passed through unchanged, together with
    ``self.event_schema_id``, to ``self.event_logger.emit``.
    """
    bus = self.event_logger
    schema = self.event_schema_id
    bus.emit(schema_id=schema, data=data)

root_dir = Unicode("/", config=True)

allow_hidden = Bool(False, config=True, help="Allow access to hidden files")
Expand Down Expand Up @@ -416,11 +436,13 @@ def delete(self, path):
raise HTTPError(400, "Can't delete root")
self.delete_file(path)
self.checkpoints.delete_all_checkpoints(path)
self.emit(data={"action": "delete", "path": path})

def rename(self, old_path, new_path):
    """Move a file to ``new_path`` and carry its checkpoints along.

    The file is renamed first, then its checkpoints. Afterwards a
    ``rename`` event is emitted with ``new_path`` as ``path`` and
    ``old_path`` as ``source_path``.
    """
    self.rename_file(old_path, new_path)
    checkpoint_store = self.checkpoints
    checkpoint_store.rename_all_checkpoints(old_path, new_path)
    event = {"action": "rename", "path": new_path, "source_path": old_path}
    self.emit(data=event)

def update(self, model, path):
"""Update the file's path
Expand Down Expand Up @@ -616,6 +638,7 @@ def copy(self, from_path, to_path=None):
raise HTTPError(404, "No such directory: %s" % to_path)

model = self.save(model, to_path)
self.emit(data={"action": "copy", "path": to_path, "source_path": from_path})
return model

def log_info(self):
Expand Down Expand Up @@ -819,11 +842,13 @@ async def delete(self, path):

await self.delete_file(path)
await self.checkpoints.delete_all_checkpoints(path)
self.emit(data={"action": "delete", "path": path})

async def rename(self, old_path, new_path):
    """Move a file to ``new_path`` and carry its checkpoints along.

    Awaits the file rename first, then the checkpoint rename. Afterwards a
    ``rename`` event is emitted with ``new_path`` as ``path`` and
    ``old_path`` as ``source_path``.
    """
    await self.rename_file(old_path, new_path)
    checkpoint_store = self.checkpoints
    await checkpoint_store.rename_all_checkpoints(old_path, new_path)
    event = {"action": "rename", "path": new_path, "source_path": old_path}
    self.emit(data=event)

async def update(self, model, path):
"""Update the file's path
Expand Down Expand Up @@ -985,6 +1010,7 @@ async def copy(self, from_path, to_path=None):
raise HTTPError(404, "No such directory: %s" % to_path)

model = await self.save(model, to_path)
self.emit(data={"action": "copy", "path": to_path, "source_path": from_path})
return model

async def trust_notebook(self, path):
Expand Down