Skip to content

Commit

Permalink
Added support for .egg Python libraries in jobs (#1789)
Browse files Browse the repository at this point in the history
## Changes
Register egg library from job dependencies to DependencyGraph for
linting

### Linked issues
Resolves #1643 

### Functionality 

- [ ] added relevant user documentation
- [ ] added new CLI command
- [ ] modified existing command: `databricks labs ucx ...`
- [ ] added a new workflow
- [ ] modified existing workflow: `...`
- [ ] added a new table
- [ ] modified existing table: `...`

### Tests
<!-- How is this tested? Please see the checklist below and also
describe any other relevant tests -->

- [x] manually tested
- [x] added unit tests
- [x] added integration tests
- [ ] verified on staging environment (screenshot attached)
  • Loading branch information
JCZuurmond authored May 31, 2024
1 parent 0cb1881 commit dd30f52
Show file tree
Hide file tree
Showing 16 changed files with 326 additions and 72 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
!tests/integration/source_code/samples/library-egg/*.egg
MANIFEST

# PyInstaller
Expand Down Expand Up @@ -152,4 +153,4 @@ dev/cleanup.py
.python-version
.databricks-login.json
*.out
foo
foo
4 changes: 2 additions & 2 deletions src/databricks/labs/ucx/contexts/application.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from databricks.labs.ucx.recon.metadata_retriever import DatabricksTableMetadataRetriever
from databricks.labs.ucx.recon.migration_recon import MigrationRecon
from databricks.labs.ucx.recon.schema_comparator import StandardSchemaComparator
from databricks.labs.ucx.source_code.python_libraries import PipResolver
from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver
from databricks.sdk import AccountClient, WorkspaceClient, core
from databricks.sdk.errors import ResourceDoesNotExist
from databricks.sdk.service import sql
Expand Down Expand Up @@ -355,7 +355,7 @@ def verify_has_metastore(self):

@cached_property
def pip_resolver(self):
return PipResolver(self.whitelist)
return PythonLibraryResolver(self.whitelist)

@cached_property
def notebook_loader(self) -> NotebookLoader:
Expand Down
9 changes: 7 additions & 2 deletions src/databricks/labs/ucx/source_code/jobs.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import functools
import logging
import tempfile
from collections.abc import Iterable
from dataclasses import dataclass
from pathlib import Path
Expand Down Expand Up @@ -93,8 +94,12 @@ def _register_library(self, graph: DependencyGraph, library: compute.Library) ->
if problems:
yield from problems
if library.egg:
# TODO: https://github.com/databrickslabs/ucx/issues/1643
yield DependencyProblem("not-yet-implemented", "Egg library is not yet implemented")
logger.info(f"Registering library from {library.egg}")
with self._ws.workspace.download(library.egg, format=ExportFormat.AUTO) as remote_file:
with tempfile.TemporaryDirectory() as directory:
local_file = Path(directory) / Path(library.egg).name
local_file.write_bytes(remote_file.read())
yield from graph.register_library(local_file.as_posix())
if library.whl:
# TODO: download the wheel somewhere local and add it to "virtual sys.path" via graph.path_lookup.push_path
# TODO: https://github.com/databrickslabs/ucx/issues/1640
Expand Down
54 changes: 45 additions & 9 deletions src/databricks/labs/ucx/source_code/path_lookup.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,12 @@


class PathLookup:
"""
Mimic Python's importlib Lookup.
Sources:
See Lookup in importlib.metadata.__init__.py
"""

@classmethod
def from_pathlike_string(cls, cwd: Path, syspath: str):
Expand All @@ -27,19 +33,49 @@ def change_directory(self, new_working_directory: Path) -> PathLookup:
return PathLookup(new_working_directory, self._sys_paths)

def resolve(self, path: Path) -> Path | None:
if path.is_absolute() and path.exists():
# eliminate “..” components
return path.resolve()
for parent in self.library_roots:
absolute_path = parent / path
try:
if path.is_absolute() and path.exists():
# eliminate “..” components
return path.resolve()
except PermissionError:
logger.warning(f"Permission denied to access {path}")
return None
for library_root in self.library_roots:
try:
if absolute_path.exists():
# eliminate “..” components
return absolute_path.resolve()
resolved_path = self._resolve_in_library_root(library_root, path)
except PermissionError:
logger.warning(f"Permission denied to access {absolute_path}")
logger.warning(f"Permission denied to access files or directories in {library_root}")
continue
if resolved_path is not None:
return resolved_path
return None

def _resolve_in_library_root(self, library_root: Path, path: Path) -> Path | None:
if not library_root.is_dir():
return None
absolute_path = library_root / path
if absolute_path.exists():
return absolute_path.resolve() # eliminate “..” components
return self._resolve_egg_in_library_root(library_root, path)

def _resolve_egg_in_library_root(self, library: Path, path: Path) -> Path | None:
for child in library.iterdir():
if not self._is_egg_folder(child):
continue
absolute_path = child / path
if absolute_path.exists():
return absolute_path.resolve() # eliminate “..” components
return None

@staticmethod
def _is_egg_folder(path: Path) -> bool:
"""Egg folders end with `.egg` and have a 'EGG-INFO' file."""
return (
path.is_dir()
and path.suffix == ".egg"
and any(subfolder.name.lower() == "egg-info" for subfolder in path.iterdir())
)

def has_path(self, path: Path):
return next(p for p in self._sys_paths if path == p) is not None

Expand Down
67 changes: 61 additions & 6 deletions src/databricks/labs/ucx/source_code/python_libraries.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
# pylint: disable=import-outside-toplevel
from __future__ import annotations

import logging
import tempfile
import zipfile
from collections.abc import Callable
from functools import cached_property
from pathlib import Path

from databricks.labs.blueprint.entrypoint import is_in_debug

from databricks.labs.ucx.framework.utils import run_command
from databricks.labs.ucx.source_code.graph import (
LibraryResolver,
Expand All @@ -17,8 +21,7 @@
logger = logging.getLogger(__name__)


class PipResolver(LibraryResolver):
# TODO: https://github.com/databrickslabs/ucx/issues/1643
class PythonLibraryResolver(LibraryResolver):
# TODO: https://github.com/databrickslabs/ucx/issues/1640

def __init__(self, whitelist: Whitelist, runner: Callable[[str], tuple[int, str, str]] = run_command) -> None:
Expand All @@ -45,18 +48,70 @@ def _temporary_virtual_environment(self):

def _install_library(self, path_lookup: PathLookup, library: Path) -> list[DependencyProblem]:
"""Pip install library and augment path look-up to resolve the library at import"""
venv = self._temporary_virtual_environment
path_lookup.append_path(venv)
# resolve relative pip installs from notebooks: %pip install ../../foo.whl
path_lookup.append_path(self._temporary_virtual_environment)

# Resolve relative pip installs from notebooks: %pip install ../../foo.whl
maybe_library = path_lookup.resolve(library)
if maybe_library is not None:
library = maybe_library
return_code, stdout, stderr = self._runner(f"pip install {library} -t {venv}")

if library.suffix == ".egg":
return self._install_egg(library)
return self._install_pip(library)

def _install_pip(self, library: Path) -> list[DependencyProblem]:
return_code, stdout, stderr = self._runner(f"pip install {library} -t {self._temporary_virtual_environment}")
logger.debug(f"pip output:\n{stdout}\n{stderr}")
if return_code != 0:
problem = DependencyProblem("library-install-failed", f"Failed to install {library}: {stderr}")
return [problem]
return []

def _install_egg(self, library: Path) -> list[DependencyProblem]:
"""Install egss using easy_install.
Sources:
See easy_install in setuptools.command.easy_install
"""
verbosity = "--verbose" if is_in_debug() else "--quiet"
easy_install_arguments = [
"easy_install",
verbosity,
"--always-unzip",
"--install-dir",
self._temporary_virtual_environment.as_posix(),
library.as_posix(),
]
setup = self._get_setup()
if callable(setup):
try:
setup(script_args=easy_install_arguments)
return []
except (SystemExit, ImportError, ValueError) as e:
logger.warning(f"Failed to install {library} with (setuptools|distutils).setup, unzipping instead: {e}")
library_folder = self._temporary_virtual_environment / library.name
library_folder.mkdir(parents=True, exist_ok=True)
try:
# Not a "real" install, though, the best effort to still lint eggs without dependencies
with zipfile.ZipFile(library, "r") as zip_ref:
zip_ref.extractall(library_folder)
except zipfile.BadZipfile as e:
problem = DependencyProblem("library-install-failed", f"Failed to install {library}: {e}")
return [problem]
return []

@staticmethod
def _get_setup() -> Callable | None:
try:
# Mypy can't analyze setuptools due to missing type hints
from setuptools import setup # type: ignore
except ImportError:
try:
from distutils.core import setup # pylint: disable=deprecated-module
except ImportError:
logger.warning("Could not import setup.")
setup = None
return setup

def __str__(self) -> str:
return f"PipResolver(whitelist={self._whitelist})"
Binary file not shown.
25 changes: 25 additions & 0 deletions tests/integration/source_code/test_jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,3 +214,28 @@ def test_workflow_linter_lints_job_with_requirements_dependency(
problems = simple_ctx.workflow_linter.lint_job(job_with_pytest_library.job_id)

assert len([problem for problem in problems if problem.message == expected_problem_message]) == 0


def test_workflow_linter_lints_job_with_egg_dependency(
simple_ctx,
ws,
make_job,
make_notebook,
make_directory,
):
expected_problem_message = "Could not locate import: pkgdir"
egg_file = Path(__file__).parent / "samples" / "library-egg" / "demo_egg-0.0.1-py3.6.egg"

entrypoint = make_directory()

remote_egg_file = f"{entrypoint}/{egg_file.name}"
with egg_file.open("rb") as f:
ws.workspace.upload(remote_egg_file, f.read(), format=ImportFormat.AUTO)
library = compute.Library(egg=remote_egg_file)

notebook = make_notebook(path=f"{entrypoint}/notebook.ipynb", content=b"import pkgdir")
job_with_egg_dependency = make_job(notebook_path=notebook, libraries=[library])

problems = simple_ctx.workflow_linter.lint_job(job_with_egg_dependency.job_id)

assert len([problem for problem in problems if problem.message == expected_problem_message]) == 0
8 changes: 4 additions & 4 deletions tests/unit/source_code/linters/test_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from databricks.labs.blueprint.tui import MockPrompts
from databricks.labs.ucx.source_code.graph import DependencyResolver, SourceContainer
from databricks.labs.ucx.source_code.notebooks.loaders import NotebookResolver, NotebookLoader
from databricks.labs.ucx.source_code.python_libraries import PipResolver
from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver
from databricks.labs.ucx.source_code.known import Whitelist

from databricks.sdk.service.workspace import Language
Expand Down Expand Up @@ -90,11 +90,11 @@ def test_migrator_walks_directory():


def test_linter_walks_directory(mock_path_lookup, migration_index):
mock_path_lookup.append_path(_samples_path(SourceContainer))
mock_path_lookup.append_path(Path(_samples_path(SourceContainer)))
file_loader = FileLoader()
folder_loader = FolderLoader(file_loader)
whitelist = Whitelist()
pip_resolver = PipResolver(whitelist)
pip_resolver = PythonLibraryResolver(whitelist)
resolver = DependencyResolver(
pip_resolver,
NotebookResolver(NotebookLoader()),
Expand Down Expand Up @@ -152,7 +152,7 @@ def test_known_issues(path: Path, migration_index):
whitelist = Whitelist()
notebook_resolver = NotebookResolver(NotebookLoader())
import_resolver = ImportFileResolver(file_loader, whitelist)
pip_resolver = PipResolver(whitelist)
pip_resolver = PythonLibraryResolver(whitelist)
resolver = DependencyResolver(pip_resolver, notebook_resolver, import_resolver, path_lookup)
linter = LocalCodeLinter(file_loader, folder_loader, path_lookup, resolver, lambda: LinterContext(migration_index))
linter.lint(MockPrompts({}), path)
6 changes: 3 additions & 3 deletions tests/unit/source_code/notebooks/test_cells.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
NotebookResolver,
NotebookLoader,
)
from databricks.labs.ucx.source_code.python_libraries import PipResolver
from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver
from databricks.labs.ucx.source_code.known import Whitelist


Expand Down Expand Up @@ -60,7 +60,7 @@ def test_pip_cell_build_dependency_graph_reports_unknown_library(mock_path_looku
dependency = Dependency(FileLoader(), Path("test"))
notebook_loader = NotebookLoader()
notebook_resolver = NotebookResolver(notebook_loader)
pip_resolver = PipResolver(Whitelist())
pip_resolver = PythonLibraryResolver(Whitelist())
dependency_resolver = DependencyResolver(pip_resolver, notebook_resolver, [], mock_path_lookup)
graph = DependencyGraph(dependency, None, dependency_resolver, mock_path_lookup)

Expand All @@ -80,7 +80,7 @@ def test_pip_cell_build_dependency_graph_resolves_installed_library(mock_path_lo
notebook_resolver = NotebookResolver(notebook_loader)
whitelist = Whitelist()
file_loader = FileLoader()
pip_resolver = PipResolver(whitelist)
pip_resolver = PythonLibraryResolver(whitelist)
import_resolver = ImportFileResolver(file_loader, whitelist)
dependency_resolver = DependencyResolver(pip_resolver, notebook_resolver, import_resolver, mock_path_lookup)
graph = DependencyGraph(dependency, None, dependency_resolver, mock_path_lookup)
Expand Down
Loading

0 comments on commit dd30f52

Please sign in to comment.