Skip to content

Commit

Permalink
Detect dependencies of libraries installed via pip (#1703)
Browse files Browse the repository at this point in the history
## Changes
Builds child dependency graph of libraries resolved via PipResolver,
using DistInfo data
Changed some tests that would consequently take minutes

### Linked issues
#1202
Resolves #1642 

### Functionality 

- [ ] added relevant user documentation
- [ ] added new CLI command
- [ ] modified existing command: `databricks labs ucx ...`
- [ ] added a new workflow
- [ ] modified existing workflow: `...`
- [ ] added a new table
- [ ] modified existing table: `...`

### Tests
- [ ] manually tested
- [x] added unit tests
- [ ] added integration tests
- [ ] verified on staging environment (screenshot attached)

---------

Co-authored-by: Eric Vergnaud <eic.vergnaud@databricks.com>
  • Loading branch information
ericvergnaud and Eric Vergnaud authored May 17, 2024
1 parent a42a83b commit ee8fffc
Show file tree
Hide file tree
Showing 19 changed files with 385 additions and 225 deletions.
8 changes: 2 additions & 6 deletions src/databricks/labs/ucx/contexts/application.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from databricks.labs.blueprint.tui import Prompts
from databricks.labs.blueprint.wheels import ProductInfo, WheelsV2
from databricks.labs.lsql.backends import SqlBackend
from databricks.labs.ucx.source_code.python_libraries import PipResolver
from databricks.sdk import AccountClient, WorkspaceClient, core
from databricks.sdk.service import sql

Expand Down Expand Up @@ -43,7 +44,6 @@
from databricks.labs.ucx.source_code.path_lookup import PathLookup
from databricks.labs.ucx.source_code.graph import DependencyResolver
from databricks.labs.ucx.source_code.whitelist import WhitelistResolver, Whitelist
from databricks.labs.ucx.source_code.site_packages import PipResolver, SitePackages
from databricks.labs.ucx.source_code.languages import Languages
from databricks.labs.ucx.source_code.redash import Redash
from databricks.labs.ucx.workspace_access import generic, redash
Expand Down Expand Up @@ -352,7 +352,7 @@ def verify_has_metastore(self):

@cached_property
def pip_resolver(self):
return PipResolver()
return PipResolver(self.file_loader, self.whitelist)

@cached_property
def notebook_loader(self) -> NotebookLoader:
Expand All @@ -367,10 +367,6 @@ def site_packages_path(self):
lookup = self.path_lookup
return next(path for path in lookup.paths if "site-packages" in path.as_posix())

@cached_property
def site_packages(self):
return SitePackages.parse(self.site_packages_path)

@cached_property
def path_lookup(self):
# TODO find a solution to enable a different cwd per job/task (maybe it's not necessary or possible?)
Expand Down
30 changes: 22 additions & 8 deletions src/databricks/labs/ucx/source_code/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,6 @@ def path(self):
return self._dependency.path

def register_library(self, library: str) -> list[DependencyProblem]:
# TODO: use DistInfoResolver to load wheel/egg/pypi dependencies
# TODO: https://github.com/databrickslabs/ucx/issues/1642
# TODO: https://github.com/databrickslabs/ucx/issues/1643
# TODO: https://github.com/databrickslabs/ucx/issues/1640
maybe = self._resolver.resolve_library(self.path_lookup, library)
Expand All @@ -74,8 +72,8 @@ def register_import(self, name: str) -> list[DependencyProblem]:

def register_dependency(self, dependency: Dependency) -> MaybeGraph:
# TODO: this has to be a private method, because we don't want to allow free-form dependencies.
# the only case we have for this method to be used outside of this class is for SitePackages (or DistInfo)
# See databricks.labs.ucx.source_code.site_packages.SitePackageContainer.build_dependency_graph for reference
# the only case we have for this method to be used outside of this class is for DistInfoPackage
# See databricks.labs.ucx.source_code.python_libraries.DistInfoContainer.build_dependency_graph for reference
maybe = self.locate_dependency(dependency.path)
if maybe.graph is not None:
self._dependencies[dependency] = maybe.graph
Expand Down Expand Up @@ -261,7 +259,7 @@ def __init__(self, next_resolver: BaseLibraryResolver | None):
def with_next_resolver(self, resolver: BaseLibraryResolver) -> BaseLibraryResolver:
"""required to create a linked list of resolvers"""

def resolve_library(self, path_lookup: PathLookup, library: str) -> MaybeDependency:
def resolve_library(self, path_lookup: PathLookup, library: Path) -> MaybeDependency:
assert self._next_resolver is not None
return self._next_resolver.resolve_library(path_lookup, library)

Expand Down Expand Up @@ -311,8 +309,8 @@ def __init__(self):
def with_next_resolver(self, resolver: BaseLibraryResolver) -> BaseLibraryResolver:
raise NotImplementedError("Should never happen!")

def resolve_library(self, path_lookup: PathLookup, library: str) -> MaybeDependency:
return self._fail('library-not-found', f"Could not resolve library: {library}")
def resolve_library(self, path_lookup: PathLookup, library: Path) -> MaybeDependency:
return self._fail('library-not-found', f"Could not resolve library: {library.as_posix()}")

@staticmethod
def _fail(code: str, message: str):
Expand Down Expand Up @@ -384,7 +382,7 @@ def resolve_import(self, path_lookup: PathLookup, name: str) -> MaybeDependency:
return self._import_resolver.resolve_import(path_lookup, name)

def resolve_library(self, path_lookup: PathLookup, library: str) -> MaybeDependency:
return self._library_resolver.resolve_library(path_lookup, library)
return self._library_resolver.resolve_library(path_lookup, Path(library))

def build_local_file_dependency_graph(self, path: Path) -> MaybeGraph:
"""Builds a dependency graph starting from a file. This method is mainly intended for testing purposes.
Expand Down Expand Up @@ -440,6 +438,22 @@ def _make_relative_paths(self, problems: list[DependencyProblem], path: Path) ->
adjusted_problems.append(problem.replace(source_path=out_path))
return adjusted_problems

def build_library_dependency_graph(self, path: Path):
"""Builds a dependency graph starting from a library. This method is mainly intended for testing purposes.
In case of problems, the paths in the problems will be relative to the starting path lookup."""
maybe = self._library_resolver.resolve_library(self._path_lookup, path)
if not maybe.dependency:
return MaybeGraph(None, self._make_relative_paths(maybe.problems, path))
graph = DependencyGraph(maybe.dependency, None, self, self._path_lookup)
container = maybe.dependency.load(graph.path_lookup)
if container is None:
problem = DependencyProblem('cannot-load-library', f"Could not load library {path}")
return MaybeGraph(None, [problem])
problems = container.build_dependency_graph(graph)
if problems:
problems = self._make_relative_paths(problems, path)
return MaybeGraph(graph, problems)

def __repr__(self):
return f"<DependencyResolver {self._notebook_resolver} {self._import_resolver} {self._path_lookup}>"

Expand Down
7 changes: 3 additions & 4 deletions src/databricks/labs/ucx/source_code/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,13 +85,9 @@ def _register_libraries(self, graph: DependencyGraph) -> Iterable[DependencyProb
@staticmethod
def _register_library(graph: DependencyGraph, library: compute.Library) -> Iterable[DependencyProblem]:
if library.pypi:
# TODO: https://github.com/databrickslabs/ucx/issues/1642
problems = graph.register_library(library.pypi.package)
if problems:
yield from problems
if library.jar:
# TODO: https://github.com/databrickslabs/ucx/issues/1641
yield DependencyProblem('not-yet-implemented', 'Jar library is not yet implemented')
if library.egg:
# TODO: https://github.com/databrickslabs/ucx/issues/1643
yield DependencyProblem("not-yet-implemented", "Egg library is not yet implemented")
Expand All @@ -103,6 +99,9 @@ def _register_library(graph: DependencyGraph, library: compute.Library) -> Itera
# TODO: download and add every entry via graph.register_library
# TODO: https://github.com/databrickslabs/ucx/issues/1644
yield DependencyProblem('not-yet-implemented', 'Requirements library is not yet implemented')
if library.jar:
# TODO: https://github.com/databrickslabs/ucx/issues/1641
yield DependencyProblem('not-yet-implemented', 'Jar library is not yet implemented')

def _register_notebook(self, graph: DependencyGraph) -> Iterable[DependencyProblem]:
if not self._task.notebook_task:
Expand Down
2 changes: 0 additions & 2 deletions src/databricks/labs/ucx/source_code/notebooks/cells.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,6 @@ def is_runnable(self) -> bool:
return True # TODO

def build_dependency_graph(self, graph: DependencyGraph) -> list[DependencyProblem]:
# TODO: https://github.com/databrickslabs/ucx/issues/1642
# TODO: this is very basic code, we need to improve it
splits = self.original_code.split(' ')
if len(splits) < 3:
Expand All @@ -192,7 +191,6 @@ def build_dependency_graph(self, graph: DependencyGraph) -> list[DependencyProbl
return [DependencyProblem("library-install-failed", f"Unsupported %pip command: {splits[1]}")]
# TODO: we need to support different formats of the library name and etc
library = splits[2]

return graph.register_library(library)


Expand Down
205 changes: 205 additions & 0 deletions src/databricks/labs/ucx/source_code/python_libraries.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
from __future__ import annotations

import os
import subprocess
import tempfile
from pathlib import Path
from subprocess import CalledProcessError

from databricks.labs.ucx.source_code.files import FileLoader
from databricks.labs.ucx.source_code.graph import (
BaseLibraryResolver,
DependencyProblem,
MaybeDependency,
SourceContainer,
DependencyGraph,
Dependency,
WrappingLoader,
)
from databricks.labs.ucx.source_code.path_lookup import PathLookup
from databricks.labs.ucx.source_code.whitelist import Whitelist


class PipResolver(BaseLibraryResolver):
# TODO: https://github.com/databrickslabs/ucx/issues/1643
# TODO: https://github.com/databrickslabs/ucx/issues/1640

def __init__(
self, file_loader: FileLoader, white_list: Whitelist, next_resolver: BaseLibraryResolver | None = None
) -> None:
super().__init__(next_resolver)
self._file_loader = file_loader
self._white_list = white_list
self._lib_install_folder = ""

def with_next_resolver(self, resolver: BaseLibraryResolver) -> PipResolver:
return PipResolver(self._file_loader, self._white_list, resolver)

def resolve_library(self, path_lookup: PathLookup, library: Path) -> MaybeDependency:
library_path = self._locate_library(path_lookup, library)
if library_path is None: # not installed yet
return self._install_library(path_lookup, library)
dist_info_path = self._locate_dist_info(library_path, library)
if dist_info_path is None: # old package style
problem = DependencyProblem('no-dist-info', f"No dist-info found for {library.name}")
return MaybeDependency(None, [problem])
return self._create_dependency(library, dist_info_path)

def _locate_library(self, path_lookup: PathLookup, library: Path) -> Path | None:
# start the quick way
full_path = path_lookup.resolve(library)
if full_path is not None:
return full_path
# maybe the name needs tweaking
if "-" in library.name:
name = library.name.replace("-", "_")
return self._locate_library(path_lookup, Path(name))
return None

def _locate_dist_info(self, library_path: Path, library: Path) -> Path | None:
packages = os.listdir(library_path.parent.as_posix())
dist_info_dir = next(
(package for package in packages if package.startswith(library.name) and package.endswith(".dist-info")),
None,
)
if dist_info_dir is not None:
return Path(library_path.parent, Path(dist_info_dir))
# maybe the name needs tweaking
if "-" in library.name:
name = library.name.replace("-", "_")
return self._locate_dist_info(library_path, Path(name))
return None

def _install_library(self, path_lookup: PathLookup, library: Path) -> MaybeDependency:
"""Pip install library and augment path look-up to resolve the library at import"""
# invoke pip install via subprocess to install this library into self._lib_install_folder
venv = self._temporary_virtual_environment(path_lookup).as_posix()
existing_packages = os.listdir(venv)
pip_install_arguments = ["pip", "install", library.name, "-t", venv]
try:
subprocess.run(pip_install_arguments, check=True)
except CalledProcessError as e:
problem = DependencyProblem("library-install-failed", f"Failed to install {library}: {e}")
return MaybeDependency(None, [problem])
added_packages = set(os.listdir(venv)).difference(existing_packages)
dist_info_dirs = list(filter(lambda package: package.endswith(".dist-info"), added_packages))
dist_info_dir = next((dir for dir in dist_info_dirs if dir.startswith(library.name)), None)
if dist_info_dir is None and "-" in library.name:
name = library.name.replace("-", "_")
dist_info_dir = next((dir for dir in dist_info_dirs if dir.startswith(name)), None)
if dist_info_dir is None:
# TODO handle other distribution types
problem = DependencyProblem('no-dist-info', f"No dist-info found for {library.name}")
return MaybeDependency(None, [problem])
dist_info_path = path_lookup.resolve(Path(dist_info_dir))
assert dist_info_path is not None
return self._create_dependency(library, dist_info_path)

def _create_dependency(self, library: Path, dist_info_path: Path):
package = DistInfoPackage.parse(dist_info_path)
container = DistInfoContainer(self._file_loader, self._white_list, package)
dependency = Dependency(WrappingLoader(container), library)
return MaybeDependency(dependency, [])

def _temporary_virtual_environment(self, path_lookup: PathLookup) -> Path:
# TODO: for `databricks labs ucx lint-local-code`, detect if we already have a virtual environment
# and use that one. See Databricks CLI code for the labs command to see how to detect the virtual
# environment. If we don't have a virtual environment, create a temporary one.
# simulate notebook-scoped virtual environment
if len(self._lib_install_folder) == 0:
self._lib_install_folder = tempfile.mkdtemp(prefix="ucx-python-libs-")
path_lookup.append_path(Path(self._lib_install_folder))
return Path(self._lib_install_folder)


class DistInfoContainer(SourceContainer):

def __init__(self, file_loader: FileLoader, white_list: Whitelist, package: DistInfoPackage):
self._file_loader = file_loader
self._white_list = white_list
self._package = package

def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]:
problems: list[DependencyProblem] = []
for module_path in self._package.module_paths:
maybe = parent.register_dependency(Dependency(self._file_loader, module_path))
if maybe.problems:
problems.extend(maybe.problems)
for library_name in self._package.library_names:
compatibility = self._white_list.compatibility(library_name)
if compatibility is not None:
# TODO attach compatibility to dependency, see https://github.com/databrickslabs/ucx/issues/1382
continue
more_problems = parent.register_library(library_name)
problems.extend(more_problems)
return problems

def __repr__(self):
return f"<DistInfoContainer {self._package}>"


REQUIRES_DIST_PREFIX = "Requires-Dist: "


class DistInfoPackage:
"""
represents a wheel package installable via pip
see https://packaging.python.org/en/latest/specifications/binary-distribution-format/
"""

@classmethod
def parse(cls, path: Path):
# not using importlib.metadata because it only works on accessible packages
# which we can't guarantee since we have our own emulated sys.paths
with open(Path(path, "METADATA"), encoding="utf-8") as metadata_file:
lines = metadata_file.readlines()
requirements = filter(lambda line: line.startswith(REQUIRES_DIST_PREFIX), lines)
library_names = [
cls._extract_library_name_from_requires_dist(requirement[len(REQUIRES_DIST_PREFIX) :])
for requirement in requirements
]
with open(Path(path, "RECORD"), encoding="utf-8") as record_file:
lines = record_file.readlines()
files = [line.split(',')[0] for line in lines]
modules = list(filter(lambda line: line.endswith(".py"), files))
top_levels_path = Path(path, "top_level.txt")
if top_levels_path.exists():
with open(top_levels_path, encoding="utf-8") as top_levels_file:
top_levels = [line.strip() for line in top_levels_file.readlines()]
else:
dir_name = path.name
# strip extension
dir_name = dir_name[: dir_name.rindex('.')]
# strip version
dir_name = dir_name[: dir_name.rindex('-')]
top_levels = [dir_name]
return DistInfoPackage(path, top_levels, [Path(module) for module in modules], list(library_names))

@classmethod
def _extract_library_name_from_requires_dist(cls, requirement: str) -> str:
delimiters = {' ', '@', '<', '>', ';'}
for i, char in enumerate(requirement):
if char in delimiters:
return requirement[:i]
return requirement

def __init__(self, dist_info_path: Path, top_levels: list[str], module_paths: list[Path], library_names: list[str]):
self._dist_info_path = dist_info_path
self._top_levels = top_levels
self._module_paths = module_paths
self._library_names = library_names

@property
def top_levels(self) -> list[str]:
return self._top_levels

@property
def module_paths(self) -> list[Path]:
return [Path(self._dist_info_path.parent, path) for path in self._module_paths]

@property
def library_names(self) -> list[str]:
return self._library_names

def __repr__(self):
return f"<DistInfoPackage {self._dist_info_path}>"
Loading

0 comments on commit ee8fffc

Please sign in to comment.