Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add pip resolver #1697

Merged
merged 43 commits into from
May 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
d336eb5
Add pip installer
JCZuurmond May 15, 2024
c13c1e6
Add pip installer to context
JCZuurmond May 15, 2024
66d0ae3
Add installer to dependency graph
JCZuurmond May 15, 2024
7bc69b5
Pass installer form context to workflow linter to dependency graph
JCZuurmond May 15, 2024
7e962ff
Use installer in job task
JCZuurmond May 15, 2024
86b26e1
Add TODO's
JCZuurmond May 15, 2024
5172127
Add pip installer tests
JCZuurmond May 15, 2024
1efa14f
Pass installer on dependency graph
JCZuurmond May 15, 2024
8871b80
Fix type
JCZuurmond May 15, 2024
420b918
Test pip installer on global context
JCZuurmond May 15, 2024
ba7eaed
Remove raise NotImplementedError
JCZuurmond May 15, 2024
45e467c
Add test for workflow task container
JCZuurmond May 15, 2024
5e2a0c3
Use mock autospec for class
JCZuurmond May 15, 2024
013d8a7
Avoid disabling pylint
JCZuurmond May 15, 2024
bc36dbe
Add test for attribute not None
JCZuurmond May 15, 2024
69f7d81
Rewrite to pytest.mark.parameterize
JCZuurmond May 15, 2024
8a4cadd
Move global context integration tests to unit test
JCZuurmond May 15, 2024
562c758
Add unit test for graph install library
JCZuurmond May 15, 2024
490149b
Fix reference to install task libraries
JCZuurmond May 15, 2024
a22a29d
Test install pypi package in job
JCZuurmond May 15, 2024
6f08e06
Test for installing unknown library
JCZuurmond May 15, 2024
6a8a46f
Rewrite Installer similar to Resolver
JCZuurmond May 15, 2024
7917ebf
Add coverage for not yet implemented install libraries
JCZuurmond May 15, 2024
490677f
Test raise NotImplementedError
JCZuurmond May 15, 2024
7025307
Format
JCZuurmond May 15, 2024
08d3cb9
Merge Installer with Resolver
JCZuurmond May 15, 2024
9cc9218
Fix merge issues
JCZuurmond May 15, 2024
4f3afe6
Avoid complexity warning
JCZuurmond May 15, 2024
c8fc07e
Add test to increase coverage
JCZuurmond May 15, 2024
68ef91f
Add pip cell installs to dependency graph (#1698)
JCZuurmond May 15, 2024
5416af0
Rename resolve_library_pip to resolve_library
JCZuurmond May 15, 2024
4e16abb
Add library resolvers to DependencyResolvers
JCZuurmond May 16, 2024
fe39ece
Add BaseLibraryResolver
JCZuurmond May 16, 2024
501f3f4
Add BaseLibraryResolver
JCZuurmond May 16, 2024
05b05dd
Chain library resolvers
JCZuurmond May 16, 2024
a1b2807
Use library resolver for PipResolver
JCZuurmond May 16, 2024
45ede0c
Move not-yet-implemented to register_library
JCZuurmond May 16, 2024
7b90413
Fix issues
JCZuurmond May 16, 2024
8655a0e
Move PipResolver in application
JCZuurmond May 16, 2024
3454256
Update test names
JCZuurmond May 16, 2024
fb40711
Put back TODO's
JCZuurmond May 16, 2024
661d2a3
Update test names
JCZuurmond May 16, 2024
d979f74
Add todo
JCZuurmond May 16, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions src/databricks/labs/ucx/contexts/application.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
from databricks.labs.ucx.source_code.path_lookup import PathLookup
from databricks.labs.ucx.source_code.graph import DependencyResolver
from databricks.labs.ucx.source_code.whitelist import WhitelistResolver, Whitelist
from databricks.labs.ucx.source_code.site_packages import SitePackages
from databricks.labs.ucx.source_code.site_packages import PipResolver, SitePackages
from databricks.labs.ucx.source_code.languages import Languages
from databricks.labs.ucx.source_code.redash import Redash
from databricks.labs.ucx.workspace_access import generic, redash
Expand Down Expand Up @@ -350,6 +350,10 @@ def workspace_info(self):
def verify_has_metastore(self):
return VerifyHasMetastore(self.workspace_client)

@cached_property
def pip_resolver(self):
return PipResolver()

@cached_property
def notebook_loader(self) -> NotebookLoader:
return NotebookLoader()
Expand Down Expand Up @@ -392,7 +396,8 @@ def file_resolver(self):
@cached_property
def dependency_resolver(self):
import_resolvers = [self.file_resolver, self.whitelist_resolver]
return DependencyResolver(self.notebook_resolver, import_resolvers, self.path_lookup)
library_resolvers = [self.pip_resolver]
return DependencyResolver(library_resolvers, self.notebook_resolver, import_resolvers, self.path_lookup)

@cached_property
def workflow_linter(self):
Expand Down
52 changes: 50 additions & 2 deletions src/databricks/labs/ucx/source_code/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,16 @@
def path(self):
return self._dependency.path

def register_library(self, name: str) -> MaybeGraph:
def register_library(self, library: str) -> list[DependencyProblem]:
# TODO: use DistInfoResolver to load wheel/egg/pypi dependencies
# TODO: https://github.com/databrickslabs/ucx/issues/1642
# TODO: https://github.com/databrickslabs/ucx/issues/1643
# TODO: https://github.com/databrickslabs/ucx/issues/1640
return MaybeGraph(None, [DependencyProblem('not-yet-implemented', f'Library dependency: {name}')])
maybe = self._resolver.resolve_library(self.path_lookup, library)
JCZuurmond marked this conversation as resolved.
Show resolved Hide resolved
if not maybe.dependency:
return maybe.problems
maybe_graph = self.register_dependency(maybe.dependency)
return maybe_graph.problems

Check warning on line 57 in src/databricks/labs/ucx/source_code/graph.py

View check run for this annotation

Codecov / codecov/patch

src/databricks/labs/ucx/source_code/graph.py#L56-L57

Added lines #L56 - L57 were not covered by tests

def register_notebook(self, path: Path) -> list[DependencyProblem]:
maybe = self._resolver.resolve_notebook(self.path_lookup, path)
Expand Down Expand Up @@ -80,6 +84,7 @@
child_graph = DependencyGraph(dependency, self, self._resolver, self._path_lookup)
self._dependencies[dependency] = child_graph
container = dependency.load(self.path_lookup)
# TODO: Return either (child) graph OR problems
if not container:
problem = DependencyProblem('dependency-register-failed', 'Failed to register dependency', dependency.path)
return MaybeGraph(child_graph, [problem])
Expand Down Expand Up @@ -247,6 +252,20 @@
return f"<WrappingLoader source_container={self._source_container}>"


class BaseLibraryResolver(abc.ABC):

def __init__(self, next_resolver: BaseLibraryResolver | None):
self._next_resolver = next_resolver

@abc.abstractmethod
def with_next_resolver(self, resolver: BaseLibraryResolver) -> BaseLibraryResolver:
"""required to create a linked list of resolvers"""

def resolve_library(self, path_lookup: PathLookup, library: str) -> MaybeDependency:
assert self._next_resolver is not None
return self._next_resolver.resolve_library(path_lookup, library)

Check warning on line 266 in src/databricks/labs/ucx/source_code/graph.py

View check run for this annotation

Codecov / codecov/patch

src/databricks/labs/ucx/source_code/graph.py#L265-L266

Added lines #L265 - L266 were not covered by tests


class BaseNotebookResolver(abc.ABC):

@abc.abstractmethod
Expand Down Expand Up @@ -284,6 +303,22 @@
"""locates a file"""


class StubLibraryResolver(BaseLibraryResolver):

def __init__(self):
super().__init__(None)

def with_next_resolver(self, resolver: BaseLibraryResolver) -> BaseLibraryResolver:
raise NotImplementedError("Should never happen!")

Check warning on line 312 in src/databricks/labs/ucx/source_code/graph.py

View check run for this annotation

Codecov / codecov/patch

src/databricks/labs/ucx/source_code/graph.py#L312

Added line #L312 was not covered by tests

def resolve_library(self, path_lookup: PathLookup, library: str) -> MaybeDependency:
return self._fail('library-not-found', f"Could not resolve library: {library}")

@staticmethod
def _fail(code: str, message: str):
return MaybeDependency(None, [DependencyProblem(code, message)])


class StubImportResolver(BaseImportResolver):

def __init__(self):
Expand Down Expand Up @@ -316,14 +351,24 @@

def __init__(
self,
library_resolvers: list[BaseLibraryResolver],
notebook_resolver: BaseNotebookResolver,
import_resolvers: list[BaseImportResolver],
path_lookup: PathLookup,
):
self._library_resolver = self._chain_library_resolvers(library_resolvers)
self._notebook_resolver = notebook_resolver
self._import_resolver = self._chain_import_resolvers(import_resolvers)
self._path_lookup = path_lookup

@staticmethod
def _chain_library_resolvers(library_resolvers: list[BaseLibraryResolver]) -> BaseLibraryResolver:
previous: BaseLibraryResolver = StubLibraryResolver()
for resolver in library_resolvers:
resolver = resolver.with_next_resolver(previous)
previous = resolver
return previous

@staticmethod
def _chain_import_resolvers(import_resolvers: list[BaseImportResolver]) -> BaseImportResolver:
previous: BaseImportResolver = StubImportResolver()
Expand All @@ -338,6 +383,9 @@
def resolve_import(self, path_lookup: PathLookup, name: str) -> MaybeDependency:
return self._import_resolver.resolve_import(path_lookup, name)

def resolve_library(self, path_lookup: PathLookup, library: str) -> MaybeDependency:
return self._library_resolver.resolve_library(path_lookup, library)

def build_local_file_dependency_graph(self, path: Path) -> MaybeGraph:
"""Builds a dependency graph starting from a file. This method is mainly intended for testing purposes.
In case of problems, the paths in the problems will be relative to the starting path lookup."""
Expand Down
27 changes: 12 additions & 15 deletions src/databricks/labs/ucx/source_code/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,12 @@
from databricks.labs.ucx.source_code.base import CurrentSessionState
from databricks.labs.ucx.source_code.files import LocalFile
from databricks.labs.ucx.source_code.graph import (
Dependency,
DependencyGraph,
SourceContainer,
DependencyProblem,
Dependency,
WrappingLoader,
DependencyResolver,
SourceContainer,
WrappingLoader,
)
from databricks.labs.ucx.source_code.languages import Languages
from databricks.labs.ucx.source_code.notebooks.sources import Notebook, NotebookLinter, FileLinter
Expand Down Expand Up @@ -76,32 +76,29 @@ def _register_task_dependencies(self, graph: DependencyGraph) -> Iterable[Depend
yield from self._register_existing_cluster_id(graph)
yield from self._register_spark_submit_task(graph)

def _register_libraries(self, graph: DependencyGraph):
def _register_libraries(self, graph: DependencyGraph) -> Iterable[DependencyProblem]:
if not self._task.libraries:
return
for library in self._task.libraries:
yield from self._lint_library(library, graph)
yield from self._register_library(graph, library)

def _lint_library(self, library: compute.Library, graph: DependencyGraph) -> Iterable[DependencyProblem]:
@staticmethod
def _register_library(graph: DependencyGraph, library: compute.Library) -> Iterable[DependencyProblem]:
if library.pypi:
# TODO: https://github.com/databrickslabs/ucx/issues/1642
maybe = graph.register_library(library.pypi.package)
if maybe.problems:
yield from maybe.problems
problems = graph.register_library(library.pypi.package)
if problems:
yield from problems
if library.jar:
# TODO: https://github.com/databrickslabs/ucx/issues/1641
yield DependencyProblem('not-yet-implemented', 'Jar library is not yet implemented')
if library.egg:
# TODO: https://github.com/databrickslabs/ucx/issues/1643
maybe = graph.register_library(library.egg)
if maybe.problems:
yield from maybe.problems
yield DependencyProblem("not-yet-implemented", "Egg library is not yet implemented")
if library.whl:
# TODO: download the wheel somewhere local and add it to "virtual sys.path" via graph.path_lookup.push_path
# TODO: https://github.com/databrickslabs/ucx/issues/1640
maybe = graph.register_library(library.whl)
if maybe.problems:
yield from maybe.problems
yield DependencyProblem("not-yet-implemented", "Wheel library is not yet implemented")
if library.requirements:
# TODO: download and add every entry via graph.register_library
# TODO: https://github.com/databrickslabs/ucx/issues/1644
Expand Down
13 changes: 11 additions & 2 deletions src/databricks/labs/ucx/source_code/notebooks/cells.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,9 +182,18 @@ def language(self):
def is_runnable(self) -> bool:
return True # TODO

def build_dependency_graph(self, _: DependencyGraph) -> list[DependencyProblem]:
def build_dependency_graph(self, graph: DependencyGraph) -> list[DependencyProblem]:
# TODO: https://github.com/databrickslabs/ucx/issues/1642
return []
# TODO: this is very basic code, we need to improve it
splits = self.original_code.split(' ')
if len(splits) < 3:
return [DependencyProblem("library-install-failed", f"Missing arguments in '{self.original_code}'")]
if splits[1] != "install":
return [DependencyProblem("library-install-failed", f"Unsupported %pip command: {splits[1]}")]
# TODO: we need to support different formats of the library name and etc
library = splits[2]

return graph.register_library(library)


class CellLanguage(Enum):
Expand Down
45 changes: 45 additions & 0 deletions src/databricks/labs/ucx/source_code/site_packages.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,50 @@
from __future__ import annotations

import os
import subprocess
import tempfile
from functools import cached_property
from pathlib import Path
from subprocess import CalledProcessError

from databricks.labs.ucx.source_code.graph import BaseLibraryResolver, DependencyProblem, MaybeDependency
from databricks.labs.ucx.source_code.path_lookup import PathLookup


class PipResolver(BaseLibraryResolver):
# TODO: use DistInfoResolver to load wheel/egg/pypi dependencies
# TODO: https://github.com/databrickslabs/ucx/issues/1642
# TODO: https://github.com/databrickslabs/ucx/issues/1643
# TODO: https://github.com/databrickslabs/ucx/issues/1640

def __init__(self, next_resolver: BaseLibraryResolver | None = None) -> None:
super().__init__(next_resolver)

def with_next_resolver(self, resolver: BaseLibraryResolver) -> PipResolver:
return PipResolver(resolver)

def resolve_library(self, path_lookup: PathLookup, library: str) -> MaybeDependency:
"""Pip install library and augment path look-up to resolve the library at import"""
# invoke pip install via subprocess to install this library into lib_install_folder
pip_install_arguments = ["pip", "install", library, "-t", self._temporary_virtual_environment.as_posix()]
try:
subprocess.run(pip_install_arguments, check=True)
except CalledProcessError as e:
problem = DependencyProblem("library-install-failed", f"Failed to install {library}: {e}")
return MaybeDependency(None, [problem])

path_lookup.append_path(self._temporary_virtual_environment)
return MaybeDependency(None, [])

@cached_property
def _temporary_virtual_environment(self) -> Path:
# TODO: for `databricks labs ucx lint-local-code`, detect if we already have a virtual environment
# and use that one. See Databricks CLI code for the labs command to see how to detect the virtual
# environment. If we don't have a virtual environment, create a temporary one.
# simulate notebook-scoped virtual environment
lib_install_folder = tempfile.mkdtemp(prefix="ucx-")
return Path(lib_install_folder)


COMMENTED_OUT_FOR_PR_1685 = """
class SitePackageContainer(SourceContainer):
Expand Down Expand Up @@ -42,6 +85,8 @@ def __init__(self, packages: list[SitePackage]):
self._packages[top_level] = package

def __getitem__(self, item: str) -> SitePackage | None:
# TODO: Replace hyphen with underscores
# TODO: Don't use get, raise KeyError
return self._packages.get(item, None)


Expand Down
9 changes: 0 additions & 9 deletions tests/integration/contexts/test_global_context.py

This file was deleted.

12 changes: 12 additions & 0 deletions tests/unit/contexts/test_application.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import pytest

from databricks.labs.ucx.contexts.application import GlobalContext


@pytest.mark.parametrize("attribute", ["dependency_resolver", "pip_resolver", "site_packages", "site_packages_path"])
def test_global_context_attributes_not_none(attribute: str):
"""Attributes should be not None"""
# Goal is to improve test coverage
ctx = GlobalContext()
assert hasattr(ctx, attribute)
assert getattr(ctx, attribute) is not None
Empty file.
88 changes: 88 additions & 0 deletions tests/unit/source_code/notebooks/test_cells.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
from pathlib import Path
from unittest.mock import create_autospec

from databricks.labs.ucx.source_code.graph import Dependency, DependencyGraph, DependencyResolver
from databricks.labs.ucx.source_code.files import FileLoader
from databricks.labs.ucx.source_code.notebooks.cells import CellLanguage, PipCell
from databricks.labs.ucx.source_code.notebooks.loaders import (
NotebookResolver,
NotebookLoader,
)
from databricks.labs.ucx.source_code.site_packages import PipResolver


def test_pip_cell_language_is_pip():
assert PipCell("code").language == CellLanguage.PIP


def test_pip_cell_build_dependency_graph_invokes_register_library():
graph = create_autospec(DependencyGraph)

code = "%pip install databricks"
cell = PipCell(code)

problems = cell.build_dependency_graph(graph)

assert len(problems) == 0
graph.register_library.assert_called_once_with("databricks")


def test_pip_cell_build_dependency_graph_pip_registers_missing_library():
graph = create_autospec(DependencyGraph)

code = "%pip install"
cell = PipCell(code)

problems = cell.build_dependency_graph(graph)

assert len(problems) == 1
assert problems[0].code == "library-install-failed"
assert problems[0].message == "Missing arguments in '%pip install'"
graph.register_library.assert_not_called()


def test_pip_cell_build_dependency_graph_reports_incorrect_syntax():
graph = create_autospec(DependencyGraph)

code = "%pip installl pytest" # typo on purpose
cell = PipCell(code)

problems = cell.build_dependency_graph(graph)

assert len(problems) == 1
assert problems[0].code == "library-install-failed"
assert problems[0].message == "Unsupported %pip command: installl"
graph.register_library.assert_not_called()


def test_pip_cell_build_dependency_graph_reports_unknown_library(mock_path_lookup):
dependency = Dependency(FileLoader(), Path("test"))
notebook_loader = NotebookLoader()
notebook_resolver = NotebookResolver(notebook_loader)
dependency_resolver = DependencyResolver([PipResolver()], notebook_resolver, [], mock_path_lookup)
graph = DependencyGraph(dependency, None, dependency_resolver, mock_path_lookup)

code = "%pip install unknown-library-name"
cell = PipCell(code)

problems = cell.build_dependency_graph(graph)

assert len(problems) == 1
assert problems[0].code == "library-install-failed"
assert problems[0].message.startswith("Failed to install unknown-library-name")


def test_pip_cell_build_dependency_graph_resolves_installed_library(mock_path_lookup):
dependency = Dependency(FileLoader(), Path("test"))
notebook_loader = NotebookLoader()
notebook_resolver = NotebookResolver(notebook_loader)
dependency_resolver = DependencyResolver([PipResolver()], notebook_resolver, [], mock_path_lookup)
graph = DependencyGraph(dependency, None, dependency_resolver, mock_path_lookup)

code = "%pip install pytest"
cell = PipCell(code)

problems = cell.build_dependency_graph(graph)

assert len(problems) == 0
assert graph.path_lookup.resolve(Path("pytest")).exists()
Loading