Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow arbitrary contents managers #24

Merged
merged 2 commits into from
Oct 25, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions docs/user_guide/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,11 @@ and asserting that `jupyter_server_fileid` is enabled.

## Usage

`jupyter_server_fileid`, by default, constructs a `FileIdManager` instance and
stores it under `serverapp.settings["file_id_manager"]`. This `FileIdManager`
`jupyter_server_fileid`, by default, constructs a `LocalFileIdManager` instance and
stores it under `serverapp.settings["file_id_manager"]`. This `LocalFileIdManager`
instance is a developer's way of accessing its key methods.

Once you obtain a reference to the `FileIdManager` instance and bind it to some
Once you obtain a reference to the `LocalFileIdManager` instance and bind it to some
name, e.g. `fim`, all file ID operations can be performed as methods on that
object. File ID operations are best illustrated in the following examples.

Expand Down
49 changes: 30 additions & 19 deletions jupyter_server_fileid/extension.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
from jupyter_events.logger import EventLogger
from jupyter_server.extension.application import ExtensionApp
from traitlets import Type, default
from jupyter_server.services.contents.filemanager import FileContentsManager
from traitlets import Type

from jupyter_server_fileid.manager import FileIdManager
from jupyter_server_fileid.manager import (
ArbitraryFileIdManager,
BaseFileIdManager,
LocalFileIdManager,
)

from .handlers import FileId2PathHandler, FilePath2IdHandler

Expand All @@ -12,18 +17,32 @@ class FileIdExtension(ExtensionApp):
name = "jupyter_server_fileid"

file_id_manager_class = Type(
klass=FileIdManager,
help="File ID manager instance to use. Defaults to FileIdManager.",
klass=BaseFileIdManager,
help="""File ID manager instance to use.

Defaults to:
- LocalFileIdManager if contents manager is a FileContentsManager,
- ArbitraryFileIdManager otherwise.
""",
config=True,
default_value=None,
allow_none=True,
)

@default("file_id_manager")
def _file_id_manager_default(self):
self.log.debug("No File ID manager configured. Defaulting to FileIdManager.")
return FileIdManager

def initialize_settings(self):
self.log.debug(f"Configured File ID manager: {self.file_id_manager_class.__name__}")
if self.file_id_manager_class is None:
if isinstance(self.settings["contents_manager"], FileContentsManager):
self.log.info(
"Contents manager is a FileContentsManager. Defaulting to LocalFileIdManager."
)
self.file_id_manager_class = LocalFileIdManager
else:
self.log.info(
"Contents manager is not a FileContentsManager. Defaulting to ArbitraryFileIdManager."
)
self.file_id_manager_class = ArbitraryFileIdManager

self.log.info(f"Configured File ID manager: {self.file_id_manager_class.__name__}")
file_id_manager = self.file_id_manager_class(
log=self.log, root_dir=self.serverapp.root_dir, config=self.config
)
Expand All @@ -35,15 +54,7 @@ def initialize_settings(self):

def initialize_event_listeners(self):
file_id_manager = self.settings["file_id_manager"]

# define event handlers per contents manager action
handlers_by_action = {
"get": None,
"save": lambda data: file_id_manager.save(data["path"]),
"rename": lambda data: file_id_manager.move(data["source_path"], data["path"]),
"copy": lambda data: file_id_manager.copy(data["source_path"], data["path"]),
"delete": lambda data: file_id_manager.delete(data["path"]),
}
handlers_by_action = file_id_manager.get_handlers_by_action()

async def cm_listener(logger: EventLogger, schema_id: str, data: dict) -> None:
handler = handlers_by_action[data["action"]]
Expand Down
198 changes: 169 additions & 29 deletions jupyter_server_fileid/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import sqlite3
import stat
import time
from typing import Optional
from typing import Any, Callable, Dict, Optional, Union

from jupyter_core.paths import jupyter_data_dir
from traitlets import Int, TraitError, Unicode, validate
Expand Down Expand Up @@ -37,35 +37,174 @@ def wrapped(self, *args, **kwargs):
return decorator


class FileIdManager(LoggingConfigurable):
class BaseFileIdManager(LoggingConfigurable):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should derive from ABC along with a metaclass definition in order for @abstractmethod decorators to be effective. A good example of this can be found here. By not deriving BaseFileIdManager from ABC, @abstractmethod decorators do not work (and I see those too have been removed). As a result, a subclass's violation for not implementing various methods will not be discovered until that method is called, rather than when the class instance is instantiated. Proper decoration also prevents BaseFileIdManager from being instantiated (i.e., a true abstract base class).

"""
Manager that supports tracking files across their lifetime by associating
each with a unique file ID, which is maintained across filesystem operations.

Notes
-----

All private helper methods prefixed with an underscore (except `__init__()`)
do NOT commit their SQL statements in a transaction via `self.con.commit()`.
This responsibility is delegated to the public method calling them to
increase performance. Committing multiple SQL transactions in serial is much
slower than committing a single SQL transaction wrapping all SQL statements
performed during a method's procedure body.
Base class for File ID manager implementations. All File ID
managers should inherit from this class.
"""

root_dir = Unicode(
help=("The root being served by Jupyter server. Must be an absolute path."), config=True
help=("The root directory being served by Jupyter server. Must be an absolute path."),
config=False,
)

db_path = Unicode(
default_value=default_db_path,
help=(
"The path of the DB file used by `FileIdManager`. "
"The path of the DB file used by File ID manager implementations. "
"Defaults to `jupyter_data_dir()/file_id_manager.db`."
),
config=True,
)

@validate("root_dir", "db_path")
def _validate_abspath_traits(self, proposal):
if proposal["value"] is None:
raise TraitError(f"FileIdManager : {proposal['trait'].name} must not be None")
if not os.path.isabs(proposal["value"]):
raise TraitError(f"FileIdManager : {proposal['trait'].name} must be an absolute path")
return proposal["value"]

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

def index(self, path: str) -> Union[int, str, None]:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think we can have dual forms of IDs as suggested here. Users should not be expected to change from int to UUID, rebuild all indices, and, worst of all, update all external references that may be squirreled away when they find out that int as ID is insufficient.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see this was merged as I must have been editing this comment. This topic needs to be revisited but we can let #3 be that forum.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To the best of my knowledge, this is not a public method and should not exist on the ABC.

raise NotImplementedError("must be implemented by subclass")

def get_id(self, path: str) -> Union[int, str, None]:
raise NotImplementedError("must be implemented by subclass")
Comment on lines +74 to +75
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With a proper ABC definition @abstractmethod is sufficient and this method can use pass. It should also include the basic help-string text from which all subclasses will derive.

Same goes for get_path() and get_handlers_by_action below.


def get_path(self, id: Union[int, str]) -> Union[int, str, None]:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
def get_path(self, id: Union[int, str]) -> Union[int, str, None]:
def get_path(self, id: Union[int, str]) -> Optional[str]:

raise NotImplementedError("must be implemented by subclass")

def move(self, old_path: str, new_path: str) -> Union[int, str, None]:
raise NotImplementedError("must be implemented by subclass")

def copy(self, from_path: str, to_path: str) -> Union[int, str, None]:
raise NotImplementedError("must be implemented by subclass")

def delete(self, path: str) -> None:
raise NotImplementedError("must be implemented by subclass")

def save(self, path: str) -> None:
raise NotImplementedError("must be implemented by subclass")
Comment on lines +80 to +90
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To the best of my knowledge, these are not public methods and should not exist on the ABC.


def get_handlers_by_action(self) -> Dict[str, Optional[Callable[[Dict[str, Any]], Any]]]:
"""Returns a dictionary whose keys are contents manager event actions
and whose values are callables invoked upon receipt of an event of the
same action. The callable accepts the body of the event as its only
argument. To ignore an event action, set the value to `None`."""
raise NotImplementedError("must be implemented by subclass")


class ArbitraryFileIdManager(BaseFileIdManager):
"""
File ID manager that works on arbitrary filesystems. Each file is assigned a
unique ID. The path is only updated upon calling `move()`, `copy()`, or
`delete()`, e.g. upon receipt of contents manager events emitted by Jupyter
Server 2.
"""

def __init__(self, *args, **kwargs):
# pass args and kwargs to parent Configurable
super().__init__(*args, **kwargs)
# initialize instance attrs
self._update_cursor = False
# initialize connection with db
self.log.info(f"ArbitraryFileIdManager : Configured root dir: {self.root_dir}")
self.log.info(f"ArbitraryFileIdManager : Configured database path: {self.db_path}")
self.con = sqlite3.connect(self.db_path)
self.log.info("ArbitraryFileIdManager : Successfully connected to database file.")
self.log.info("ArbitraryFileIdManager : Creating File ID tables and indices.")
# do not allow reads to block writes. required when using multiple processes
self.con.execute("PRAGMA journal_mode = WAL")
self.con.execute(
"CREATE TABLE IF NOT EXISTS Files("
"id INTEGER PRIMARY KEY AUTOINCREMENT, "
"path TEXT NOT NULL UNIQUE"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the ArbitraryFileIdManager is going to want to store the root_dir value as it can change between invocations and may be necessary. We should also store any information returned from the Contents events.

"root_dir TEXT NOT NULL"

")"
)
self.con.execute("CREATE INDEX IF NOT EXISTS ix_Files_path ON Files (path)")
self.con.commit()

def index(self, path: str) -> int:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not public, should be prefixed with _ and should probably take whatever other columns are necessary.

row = self.con.execute("SELECT id FROM Files WHERE path = ?", (path,)).fetchone()
existing_id = row and row[0]

if existing_id:
return existing_id

# create new record
cursor = self.con.execute("INSERT INTO Files (path) VALUES (?)", (path,))
self.con.commit()
return cursor.lastrowid # type:ignore

def get_id(self, path: str) -> Optional[int]:
row = self.con.execute("SELECT id FROM Files WHERE path = ?", (path,)).fetchone()
return row and row[0]

def get_path(self, id: Union[int, str]) -> Optional[int]:
row = self.con.execute("SELECT path FROM Files WHERE id = ?", (id,)).fetchone()
return row and row[0]

def move(self, old_path: str, new_path: str) -> None:
row = self.con.execute("SELECT id FROM Files WHERE path = ?", (old_path,)).fetchone()
id = row and row[0]

if id:
self.con.execute("UPDATE Files SET path = ? WHERE path = ?", (new_path, old_path))
else:
cursor = self.con.execute("INSERT INTO Files (path) VALUES (?)", (new_path,))
id = cursor.lastrowid

self.con.commit()
return id

def copy(self, from_path: str, to_path: str) -> Optional[int]:
cursor = self.con.execute("INSERT INTO Files (path) VALUES (?)", (to_path,))
self.con.commit()
return cursor.lastrowid

def delete(self, path: str) -> None:
self.con.execute("DELETE FROM Files WHERE path = ?", (path,))
self.con.commit()
Comment on lines +150 to +170
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not public, should be prefixed with _.


def save(self, path: str) -> None:
return
Comment on lines +172 to +173
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be implemented to call _index() if its not already indexed. Otherwise, no entries will ever be inserted into the FILES table.

Also, _get() should be implemented such that it should be called on Get Contents events. Similar reason to why Save events must be handled.


def get_handlers_by_action(self) -> Dict[str, Optional[Callable[[Dict[str, Any]], Any]]]:
return {
"get": None,
"save": None,
Comment on lines +177 to +178
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Per previous comment, these should call _get() and _save(), respectively.

"rename": lambda data: self.move(data["source_path"], data["path"]),
"copy": lambda data: self.copy(data["source_path"], data["path"]),
"delete": lambda data: self.delete(data["path"]),
}

def __del__(self):
"""Cleans up `ArbitraryFileIdManager` by committing any pending
transactions and closing the connection."""
if hasattr(self, "con"):
self.con.commit()
self.con.close()


class LocalFileIdManager(BaseFileIdManager):
"""
File ID manager that supports tracking files in local filesystems by
associating each with a unique file ID, which is maintained across
filesystem operations.

Notes
-----
All private helper methods prefixed with an underscore (except `__init__()`)
do NOT commit their SQL statements in a transaction via `self.con.commit()`.
This responsibility is delegated to the public method calling them to
increase performance. Committing multiple SQL transactions in serial is much
slower than committing a single SQL transaction wrapping all SQL statements
performed during a method's procedure body.
"""

autosync_interval = Int(
default_value=5,
help=(
Expand All @@ -84,11 +223,11 @@ def __init__(self, *args, **kwargs):
self._update_cursor = False
self._last_sync = 0.0
# initialize connection with db
self.log.info(f"FileIdManager : Configured root dir: {self.root_dir}")
self.log.info(f"FileIdManager : Configured database path: {self.db_path}")
self.log.info(f"LocalFileIdManager : Configured root dir: {self.root_dir}")
self.log.info(f"LocalFileIdManager : Configured database path: {self.db_path}")
self.con = sqlite3.connect(self.db_path)
self.log.info("FileIdManager : Successfully connected to database file.")
self.log.info("FileIdManager : Creating File ID tables and indices.")
self.log.info("LocalFileIdManager : Successfully connected to database file.")
self.log.info("LocalFileIdManager : Creating File ID tables and indices.")
# do not allow reads to block writes. required when using multiple processes
self.con.execute("PRAGMA journal_mode = WAL")
self.con.execute(
Expand All @@ -109,14 +248,6 @@ def __init__(self, *args, **kwargs):
self.con.execute("CREATE INDEX IF NOT EXISTS ix_Files_is_dir ON Files (is_dir)")
self.con.commit()

@validate("root_dir", "db_path")
def _validate_abspath_traits(self, proposal):
if proposal["value"] is None:
raise TraitError(f"FileIdManager : {proposal['trait'].name} must not be None")
if not os.path.isabs(proposal["value"]):
raise TraitError(f"FileIdManager : {proposal['trait'].name} must be an absolute path")
return self._normalize_path(proposal["value"])

def _index_all(self):
"""Recursively indexes all directories under the server root."""
self._index_dir_recursively(self.root_dir, self._stat(self.root_dir))
Expand Down Expand Up @@ -598,8 +729,17 @@ def save(self, path):
self._update(id, stat_info)
self.con.commit()

def get_handlers_by_action(self) -> Dict[str, Optional[Callable[[Dict[str, Any]], Any]]]:
return {
"get": None,
"save": lambda data: self.save(data["path"]),
"rename": lambda data: self.move(data["source_path"], data["path"]),
"copy": lambda data: self.copy(data["source_path"], data["path"]),
"delete": lambda data: self.delete(data["path"]),
}

def __del__(self):
"""Cleans up `FileIdManager` by committing any pending transactions and
"""Cleans up `LocalFileIdManager` by committing any pending transactions and
closing the connection."""
if hasattr(self, "con"):
self.con.commit()
Expand Down
17 changes: 14 additions & 3 deletions jupyter_server_fileid/pytest_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import pytest

from jupyter_server_fileid.manager import FileIdManager
from jupyter_server_fileid.manager import ArbitraryFileIdManager, LocalFileIdManager


@pytest.fixture
Expand All @@ -30,8 +30,8 @@ def delete_fid_db(fid_db_path):

@pytest.fixture
def fid_manager(fid_db_path, jp_root_dir):
"""Fixture returning a test-configured instance of `FileIdManager`."""
fid_manager = FileIdManager(db_path=fid_db_path, root_dir=str(jp_root_dir))
"""Fixture returning a test-configured instance of `LocalFileIdManager`."""
fid_manager = LocalFileIdManager(db_path=fid_db_path, root_dir=str(jp_root_dir))
# disable journal so no temp journal file is created under `tmp_path`.
# reduces test flakiness since sometimes journal file has same ino and
# crtime as a deleted file, so FID manager detects it wrongly as a move
Expand All @@ -40,6 +40,17 @@ def fid_manager(fid_db_path, jp_root_dir):
return fid_manager


@pytest.fixture(params=["local", "arbitrary"])
def any_fid_manager(request, fid_db_path, jp_root_dir):
"""Parametrized fixture that runs the test with each of the default File ID
manager implementations."""
class_by_param = {"local": LocalFileIdManager, "arbitrary": ArbitraryFileIdManager}

fid_manager = class_by_param[request.param](db_path=fid_db_path, root_dir=str(jp_root_dir))
fid_manager.con.execute("PRAGMA journal_mode = OFF") # type: ignore[attr-defined]
return fid_manager


@pytest.fixture
def fs_helpers(jp_root_dir):
class FsHelpers:
Expand Down
Loading