From dd30f52b980f272dccc517ecdc441ffdce27662d Mon Sep 17 00:00:00 2001 From: Cor Date: Fri, 31 May 2024 10:42:59 +0200 Subject: [PATCH] Added support for `.egg` Python libraries in jobs (#1789) ## Changes Register egg library from job dependencies to DependencyGraph for linting ### Linked issues Resolves #1643 ### Functionality - [ ] added relevant user documentation - [ ] added new CLI command - [ ] modified existing command: `databricks labs ucx ...` - [ ] added a new workflow - [ ] modified existing workflow: `...` - [ ] added a new table - [ ] modified existing table: `...` ### Tests - [x] manually tested - [x] added unit tests - [x] added integration tests - [ ] verified on staging environment (screenshot attached) --- .gitignore | 3 +- .../labs/ucx/contexts/application.py | 4 +- src/databricks/labs/ucx/source_code/jobs.py | 9 +- .../labs/ucx/source_code/path_lookup.py | 54 +++++++-- .../labs/ucx/source_code/python_libraries.py | 67 ++++++++++- .../library-egg/demo_egg-0.0.1-py3.6.egg | Bin 0 -> 1489 bytes tests/integration/source_code/test_jobs.py | 25 +++++ tests/unit/source_code/linters/test_files.py | 8 +- .../unit/source_code/notebooks/test_cells.py | 6 +- tests/unit/source_code/test_dependencies.py | 38 +++---- tests/unit/source_code/test_graph.py | 6 +- tests/unit/source_code/test_jobs.py | 44 +++++++- tests/unit/source_code/test_notebook.py | 12 +- .../test_path_lookup_simulation.py | 106 +++++++++++++++++- .../unit/source_code/test_python_libraries.py | 10 +- tests/unit/source_code/test_s3fs.py | 6 +- 16 files changed, 326 insertions(+), 72 deletions(-) create mode 100644 tests/integration/source_code/samples/library-egg/demo_egg-0.0.1-py3.6.egg diff --git a/.gitignore b/.gitignore index bfb8e46e57..8a030e881a 100644 --- a/.gitignore +++ b/.gitignore @@ -29,6 +29,7 @@ share/python-wheels/ *.egg-info/ .installed.cfg *.egg +!tests/integration/source_code/samples/library-egg/*.egg MANIFEST # PyInstaller @@ -152,4 +153,4 @@ dev/cleanup.py .python-version .databricks-login.json *.out -foo \ No newline at end of file +foo diff --git a/src/databricks/labs/ucx/contexts/application.py b/src/databricks/labs/ucx/contexts/application.py index 204b5bb481..d21c096a72 100644 --- a/src/databricks/labs/ucx/contexts/application.py +++ b/src/databricks/labs/ucx/contexts/application.py @@ -15,7 +15,7 @@ from databricks.labs.ucx.recon.metadata_retriever import DatabricksTableMetadataRetriever from databricks.labs.ucx.recon.migration_recon import MigrationRecon from databricks.labs.ucx.recon.schema_comparator import StandardSchemaComparator -from databricks.labs.ucx.source_code.python_libraries import PipResolver +from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver from databricks.sdk import AccountClient, WorkspaceClient, core from databricks.sdk.errors import ResourceDoesNotExist from databricks.sdk.service import sql @@ -355,7 +355,7 @@ def verify_has_metastore(self): @cached_property def pip_resolver(self): - return PipResolver(self.whitelist) + return PythonLibraryResolver(self.whitelist) @cached_property def notebook_loader(self) -> NotebookLoader: diff --git a/src/databricks/labs/ucx/source_code/jobs.py b/src/databricks/labs/ucx/source_code/jobs.py index e24298f625..4adbddda56 100644 --- a/src/databricks/labs/ucx/source_code/jobs.py +++ b/src/databricks/labs/ucx/source_code/jobs.py @@ -1,5 +1,6 @@ import functools import logging +import tempfile from collections.abc import Iterable from dataclasses import dataclass from pathlib import Path @@ -93,8 +94,12 @@ def _register_library(self, graph: DependencyGraph, library: compute.Library) -> if problems: yield from problems if library.egg: - # TODO: https://github.com/databrickslabs/ucx/issues/1643 - yield DependencyProblem("not-yet-implemented", "Egg library is not yet implemented") + logger.info(f"Registering library from {library.egg}") + with self._ws.workspace.download(library.egg, format=ExportFormat.AUTO) as remote_file: + with tempfile.TemporaryDirectory() as directory: + local_file = Path(directory) / Path(library.egg).name + local_file.write_bytes(remote_file.read()) + yield from graph.register_library(local_file.as_posix()) if library.whl: # TODO: download the wheel somewhere local and add it to "virtual sys.path" via graph.path_lookup.push_path # TODO: https://github.com/databrickslabs/ucx/issues/1640 diff --git a/src/databricks/labs/ucx/source_code/path_lookup.py b/src/databricks/labs/ucx/source_code/path_lookup.py index f1a3b49a4d..219598e1c3 100644 --- a/src/databricks/labs/ucx/source_code/path_lookup.py +++ b/src/databricks/labs/ucx/source_code/path_lookup.py @@ -9,6 +9,12 @@ class PathLookup: + """ + Mimic Python's importlib Lookup. + + Sources: + See Lookup in importlib.metadata.__init__.py + """ @classmethod def from_pathlike_string(cls, cwd: Path, syspath: str): @@ -27,19 +33,49 @@ def change_directory(self, new_working_directory: Path) -> PathLookup: return PathLookup(new_working_directory, self._sys_paths) def resolve(self, path: Path) -> Path | None: - if path.is_absolute() and path.exists(): - # eliminate “..” components - return path.resolve() - for parent in self.library_roots: - absolute_path = parent / path + try: + if path.is_absolute() and path.exists(): + # eliminate “..” components + return path.resolve() + except PermissionError: + logger.warning(f"Permission denied to access {path}") + return None + for library_root in self.library_roots: try: - if absolute_path.exists(): - # eliminate “..” components - return absolute_path.resolve() + resolved_path = self._resolve_in_library_root(library_root, path) except PermissionError: - logger.warning(f"Permission denied to access {absolute_path}") + logger.warning(f"Permission denied to access files or directories in {library_root}") + continue + if resolved_path is not None: + return resolved_path + return None + + def _resolve_in_library_root(self, library_root: Path, path: Path) -> Path | None: + if not library_root.is_dir(): + return None + absolute_path = library_root / path + if absolute_path.exists(): + return absolute_path.resolve() # eliminate “..” components + return self._resolve_egg_in_library_root(library_root, path) + + def _resolve_egg_in_library_root(self, library: Path, path: Path) -> Path | None: + for child in library.iterdir(): + if not self._is_egg_folder(child): + continue + absolute_path = child / path + if absolute_path.exists(): + return absolute_path.resolve() # eliminate “..” components return None + @staticmethod + def _is_egg_folder(path: Path) -> bool: + """Egg folders end with `.egg` and have a 'EGG-INFO' file.""" + return ( + path.is_dir() + and path.suffix == ".egg" + and any(subfolder.name.lower() == "egg-info" for subfolder in path.iterdir()) + ) + def has_path(self, path: Path): return next(p for p in self._sys_paths if path == p) is not None diff --git a/src/databricks/labs/ucx/source_code/python_libraries.py b/src/databricks/labs/ucx/source_code/python_libraries.py index 256d8bcb52..5b5fb9cc42 100644 --- a/src/databricks/labs/ucx/source_code/python_libraries.py +++ b/src/databricks/labs/ucx/source_code/python_libraries.py @@ -1,11 +1,15 @@ +# pylint: disable=import-outside-toplevel from __future__ import annotations import logging import tempfile +import zipfile from collections.abc import Callable from functools import cached_property from pathlib import Path +from databricks.labs.blueprint.entrypoint import is_in_debug + from databricks.labs.ucx.framework.utils import run_command from databricks.labs.ucx.source_code.graph import ( LibraryResolver, @@ -17,8 +21,7 @@ logger = logging.getLogger(__name__) -class PipResolver(LibraryResolver): - # TODO: https://github.com/databrickslabs/ucx/issues/1643 +class PythonLibraryResolver(LibraryResolver): # TODO: https://github.com/databrickslabs/ucx/issues/1640 def __init__(self, whitelist: Whitelist, runner: Callable[[str], tuple[int, str, str]] = run_command) -> None: @@ -45,18 +48,70 @@ def _temporary_virtual_environment(self): def _install_library(self, path_lookup: PathLookup, library: Path) -> list[DependencyProblem]: """Pip install library and augment path look-up to resolve the library at import""" - venv = self._temporary_virtual_environment - path_lookup.append_path(venv) - # resolve relative pip installs from notebooks: %pip install ../../foo.whl + path_lookup.append_path(self._temporary_virtual_environment) + + # Resolve relative pip installs from notebooks: %pip install ../../foo.whl maybe_library = path_lookup.resolve(library) if maybe_library is not None: library = maybe_library - return_code, stdout, stderr = self._runner(f"pip install {library} -t {venv}") + + if library.suffix == ".egg": + return self._install_egg(library) + return self._install_pip(library) + + def _install_pip(self, library: Path) -> list[DependencyProblem]: + return_code, stdout, stderr = self._runner(f"pip install {library} -t {self._temporary_virtual_environment}") logger.debug(f"pip output:\n{stdout}\n{stderr}") if return_code != 0: problem = DependencyProblem("library-install-failed", f"Failed to install {library}: {stderr}") return [problem] return [] + def _install_egg(self, library: Path) -> list[DependencyProblem]: + """Install egss using easy_install. + + Sources: + See easy_install in setuptools.command.easy_install + """ + verbosity = "--verbose" if is_in_debug() else "--quiet" + easy_install_arguments = [ + "easy_install", + verbosity, + "--always-unzip", + "--install-dir", + self._temporary_virtual_environment.as_posix(), + library.as_posix(), + ] + setup = self._get_setup() + if callable(setup): + try: + setup(script_args=easy_install_arguments) + return [] + except (SystemExit, ImportError, ValueError) as e: + logger.warning(f"Failed to install {library} with (setuptools|distutils).setup, unzipping instead: {e}") + library_folder = self._temporary_virtual_environment / library.name + library_folder.mkdir(parents=True, exist_ok=True) + try: + # Not a "real" install, though, the best effort to still lint eggs without dependencies + with zipfile.ZipFile(library, "r") as zip_ref: + zip_ref.extractall(library_folder) + except zipfile.BadZipfile as e: + problem = DependencyProblem("library-install-failed", f"Failed to install {library}: {e}") + return [problem] + return [] + + @staticmethod + def _get_setup() -> Callable | None: + try: + # Mypy can't analyze setuptools due to missing type hints + from setuptools import setup # type: ignore + except ImportError: + try: + from distutils.core import setup # pylint: disable=deprecated-module + except ImportError: + logger.warning("Could not import setup.") + setup = None + return setup + def __str__(self) -> str: return f"PipResolver(whitelist={self._whitelist})" diff --git a/tests/integration/source_code/samples/library-egg/demo_egg-0.0.1-py3.6.egg b/tests/integration/source_code/samples/library-egg/demo_egg-0.0.1-py3.6.egg new file mode 100644 index 0000000000000000000000000000000000000000..474e5a7bf29e9b8b7e425ab399325e6ec8514258 GIT binary patch literal 1489 zcmWIWW@Zs#U|`^2NLp6ryTwh5-I9@kftiVcK@ceF>h7-V>F4IJAK(pPga-QNA2Q(C z`#n73(Vc0zm-sAs4=!2g=-8ASzUNv?&ok%F-53AL%_1GvhMu3ITG3_KKD?i?8RYCr!MY6J5f4uf2$7ROXO+E%l<|)>Rh1)3#n-ach=cIqGwu z?e)IRg zcUmQEoOS4g>&5(4yMEW~vO8=(rB%8A4(FO%2bTE9%$U_BbM)|s|4)}BXIosmRQmeS zuNi!Ej_cnE|8`uZJ^&@0`De^D%?5_=QXmF}K0KU*{X>JCU4!*XDoVJEN=rVLUOnym zP|HtOQ}=@QiYH#4XV3Qg`}$vJPP%($&z73M4Eg1;{`TheIzb2jEp$A$?qHRwsA@z? za1eLU=b)g^le8Zf8-3o?U8tdbQorN8r>^%|#?#^{%a^B!y*z$A{aNa?BgdKNvM``J zZZg|-BW9qZfa!oi7TIwrsRgNdDXDqMmGL>5dD+DvS3YJ!)6HPH+l&*aiyer?kad^j z7sThJmZj!^v}vF4^FOKc2wgL>)hOw&DziYhI5901#X7SkWxjR>Ay*ZE*2n>|5D*t+ zr>A5V>Bq-s=4F<|$LkeTYHOT2qkS=;tN;9&(>k5~*R@xKoH!G#0d_#pC3MfTZnt{a z1vF#{s-h8P!)3t6^&-MuJ zE~C$0VMZd2TbeuLmw7IoX{q_kQ*-7~)vBGHCx0Fl-POicHEoxuSlY)lwO=b=zWgP? z{AFU#-lK_=XU`9sH96}zM-j^jS73xNGKnzb&ecF~fx(hS5CuF9>a 0 @@ -30,7 +30,7 @@ def test_dependency_resolver_repr(mock_notebook_resolver, mock_path_lookup): def test_dependency_resolver_visits_workspace_notebook_dependencies(mock_path_lookup): notebook_loader = NotebookLoader() notebook_resolver = NotebookResolver(notebook_loader) - pip_resolver = PipResolver(Whitelist()) + pip_resolver = PythonLibraryResolver(Whitelist()) dependency_resolver = DependencyResolver(pip_resolver, notebook_resolver, [], mock_path_lookup) maybe = dependency_resolver.build_notebook_dependency_graph(Path("root3.run.py")) assert not maybe.failed @@ -40,7 +40,7 @@ def test_dependency_resolver_visits_workspace_notebook_dependencies(mock_path_lo def test_dependency_resolver_visits_local_notebook_dependencies(mock_path_lookup): notebook_loader = NotebookLoader() notebook_resolver = NotebookResolver(notebook_loader) - pip_resolver = PipResolver(Whitelist()) + pip_resolver = PythonLibraryResolver(Whitelist()) dependency_resolver = DependencyResolver(pip_resolver, notebook_resolver, [], mock_path_lookup) maybe = dependency_resolver.build_notebook_dependency_graph(Path("root4.py")) assert not maybe.failed @@ -53,7 +53,7 @@ def test_dependency_resolver_visits_workspace_file_dependencies(mock_path_lookup notebook_loader = NotebookLoader() notebook_resolver = NotebookResolver(notebook_loader) import_resolver = ImportFileResolver(file_loader, whitelist) - pip_resolver = PipResolver(whitelist) + pip_resolver = PythonLibraryResolver(whitelist) dependency_resolver = DependencyResolver(pip_resolver, notebook_resolver, import_resolver, mock_path_lookup) maybe = dependency_resolver.build_local_file_dependency_graph(Path('./root8.py')) assert not maybe.failed @@ -65,7 +65,7 @@ def test_dependency_resolver_raises_problem_with_unfound_workspace_notebook_depe notebook_loader = NotebookLoader() notebook_resolver = NotebookResolver(notebook_loader) import_resolver = ImportFileResolver(file_loader, Whitelist()) - pip_resolver = PipResolver(Whitelist()) + pip_resolver = PythonLibraryResolver(Whitelist()) dependency_resolver = DependencyResolver(pip_resolver, notebook_resolver, import_resolver, mock_path_lookup) maybe = dependency_resolver.build_notebook_dependency_graph(Path("root1-no-leaf.run.py")) assert list(maybe.problems) == [ @@ -84,7 +84,7 @@ def test_dependency_resolver_raises_problem_with_unfound_workspace_notebook_depe def test_dependency_resolver_raises_problem_with_unfound_local_notebook_dependency(mock_path_lookup): notebook_loader = NotebookLoader() notebook_resolver = NotebookResolver(notebook_loader) - pip_resolver = PipResolver(Whitelist()) + pip_resolver = PythonLibraryResolver(Whitelist()) dependency_resolver = DependencyResolver(pip_resolver, notebook_resolver, [], mock_path_lookup) maybe = dependency_resolver.build_notebook_dependency_graph(Path("root4-no-leaf.py")) assert list(maybe.problems) == [ @@ -97,7 +97,7 @@ def test_dependency_resolver_raises_problem_with_unfound_local_notebook_dependen def test_dependency_resolver_raises_problem_with_non_constant_local_notebook_dependency(mock_path_lookup): notebook_loader = NotebookLoader() notebook_resolver = NotebookResolver(notebook_loader) - pip_resolver = PipResolver(Whitelist()) + pip_resolver = PythonLibraryResolver(Whitelist()) dependency_resolver = DependencyResolver(pip_resolver, notebook_resolver, [], mock_path_lookup) maybe = dependency_resolver.build_notebook_dependency_graph(Path('root10.py')) assert list(maybe.problems) == [ @@ -116,7 +116,7 @@ def test_dependency_resolver_raises_problem_with_non_constant_local_notebook_dep def test_dependency_resolver_raises_problem_with_invalid_run_cell(mock_path_lookup): notebook_loader = NotebookLoader() notebook_resolver = NotebookResolver(notebook_loader) - pip_resolver = PipResolver(Whitelist()) + pip_resolver = PythonLibraryResolver(Whitelist()) dependency_resolver = DependencyResolver(pip_resolver, notebook_resolver, [], mock_path_lookup) maybe = dependency_resolver.build_notebook_dependency_graph(Path('leaf6.py')) assert list(maybe.problems) == [ @@ -139,7 +139,7 @@ def test_dependency_resolver_raises_problem_with_unresolved_import(mock_path_loo notebook_resolver = NotebookResolver(notebook_loader) whitelist = Whitelist() import_resolver = ImportFileResolver(FileLoader(), whitelist) - pip_resolver = PipResolver(whitelist) + pip_resolver = PythonLibraryResolver(whitelist) dependency_resolver = DependencyResolver(pip_resolver, notebook_resolver, import_resolver, mock_path_lookup) maybe = dependency_resolver.build_local_file_dependency_graph(Path('root7.py')) assert list(maybe.problems) == [ @@ -152,7 +152,7 @@ def test_dependency_resolver_raises_problem_with_non_constant_notebook_argument( notebook_resolver = NotebookResolver(notebook_loader) whitelist = Whitelist() import_resolver = ImportFileResolver(FileLoader(), whitelist) - pip_resolver = PipResolver(whitelist) + pip_resolver = PythonLibraryResolver(whitelist) dependency_resolver = DependencyResolver(pip_resolver, notebook_resolver, import_resolver, mock_path_lookup) maybe = dependency_resolver.build_local_file_dependency_graph(Path("run_notebooks.py")) assert list(maybe.problems) == [ @@ -173,7 +173,7 @@ def test_dependency_resolver_visits_file_dependencies(mock_path_lookup): notebook_resolver = NotebookResolver(notebook_loader) whitelist = Whitelist() import_resolver = ImportFileResolver(FileLoader(), whitelist) - pip_resolver = PipResolver(whitelist) + pip_resolver = PythonLibraryResolver(whitelist) dependency_resolver = DependencyResolver(pip_resolver, notebook_resolver, import_resolver, mock_path_lookup) maybe = dependency_resolver.build_local_file_dependency_graph(Path("root5.py")) assert not maybe.failed @@ -198,7 +198,7 @@ def test_dependency_resolver_ignores_known_dependencies(mock_path_lookup): notebook_loader = NotebookLoader() notebook_resolver = NotebookResolver(notebook_loader) import_resolver = ImportFileResolver(FileLoader(), Whitelist()) - pip_resolver = PipResolver(Whitelist()) + pip_resolver = PythonLibraryResolver(Whitelist()) dependency_resolver = DependencyResolver(pip_resolver, notebook_resolver, import_resolver, mock_path_lookup) maybe = dependency_resolver.build_local_file_dependency_graph(Path("python_builtins.py")) assert maybe.graph @@ -213,7 +213,7 @@ def test_dependency_resolver_terminates_at_known_libraries(empty_index, mock_not lookup.append_path(site_packages_path) file_loader = FileLoader() import_resolver = ImportFileResolver(file_loader, Whitelist()) - pip_resolver = PipResolver(Whitelist()) + pip_resolver = PythonLibraryResolver(Whitelist()) dependency_resolver = DependencyResolver(pip_resolver, mock_notebook_resolver, import_resolver, lookup) maybe = dependency_resolver.build_local_file_dependency_graph(Path("import-site-package.py")) assert not maybe.failed @@ -225,7 +225,7 @@ def test_dependency_resolver_terminates_at_known_libraries(empty_index, mock_not def test_dependency_resolver_raises_problem_with_unfound_root_file(mock_path_lookup, mock_notebook_resolver): import_resolver = ImportFileResolver(FileLoader(), Whitelist()) - pip_resolver = PipResolver(Whitelist()) + pip_resolver = PythonLibraryResolver(Whitelist()) dependency_resolver = DependencyResolver(pip_resolver, mock_notebook_resolver, import_resolver, mock_path_lookup) maybe = dependency_resolver.build_local_file_dependency_graph(Path("non-existing.py")) assert list(maybe.problems) == [ @@ -236,7 +236,7 @@ def test_dependency_resolver_raises_problem_with_unfound_root_file(mock_path_loo def test_dependency_resolver_raises_problem_with_unfound_root_notebook(mock_path_lookup): notebook_loader = NotebookLoader() notebook_resolver = NotebookResolver(notebook_loader) - pip_resolver = PipResolver(Whitelist()) + pip_resolver = PythonLibraryResolver(Whitelist()) dependency_resolver = DependencyResolver(pip_resolver, notebook_resolver, [], mock_path_lookup) maybe = dependency_resolver.build_notebook_dependency_graph(Path("unknown_notebook")) assert list(maybe.problems) == [ @@ -253,7 +253,7 @@ def load_dependency(self, path_lookup: PathLookup, dependency: Dependency) -> So file_loader = FailingFileLoader() whitelist = Whitelist() import_resolver = ImportFileResolver(file_loader, whitelist) - pip_resolver = PipResolver(whitelist) + pip_resolver = PythonLibraryResolver(whitelist) dependency_resolver = DependencyResolver(pip_resolver, mock_notebook_resolver, import_resolver, mock_path_lookup) maybe = dependency_resolver.build_local_file_dependency_graph(Path("import-sub-site-package.py")) assert list(maybe.problems) == [ @@ -271,7 +271,7 @@ def load_dependency(self, path_lookup: PathLookup, dependency: Dependency) -> So notebook_loader = FailingNotebookLoader() notebook_resolver = NotebookResolver(notebook_loader) - pip_resolver = PipResolver(Whitelist()) + pip_resolver = PythonLibraryResolver(Whitelist()) dependency_resolver = DependencyResolver(pip_resolver, notebook_resolver, [], mock_path_lookup) maybe = dependency_resolver.build_notebook_dependency_graph(Path("root5.py")) assert list(maybe.problems) == [ @@ -280,7 +280,7 @@ def load_dependency(self, path_lookup: PathLookup, dependency: Dependency) -> So def test_dependency_resolver_raises_problem_with_missing_file_loader(mock_notebook_resolver, mock_path_lookup): - pip_resolver = PipResolver(Whitelist()) + pip_resolver = PythonLibraryResolver(Whitelist()) dependency_resolver = DependencyResolver(pip_resolver, mock_notebook_resolver, [], mock_path_lookup) maybe = dependency_resolver.build_local_file_dependency_graph(Path("import-sub-site-package.py")) assert list(maybe.problems) == [ diff --git a/tests/unit/source_code/test_graph.py b/tests/unit/source_code/test_graph.py index 6cb1dcb9ce..583b54d5b5 100644 --- a/tests/unit/source_code/test_graph.py +++ b/tests/unit/source_code/test_graph.py @@ -3,7 +3,7 @@ from databricks.labs.ucx.source_code.linters.files import FileLoader, ImportFileResolver, FolderLoader from databricks.labs.ucx.source_code.graph import Dependency, DependencyGraph, DependencyResolver from databricks.labs.ucx.source_code.notebooks.loaders import NotebookResolver, NotebookLoader -from databricks.labs.ucx.source_code.python_libraries import PipResolver +from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver from databricks.labs.ucx.source_code.known import Whitelist @@ -12,7 +12,7 @@ def test_dependency_graph_registers_library(mock_path_lookup): file_loader = FileLoader() whitelist = Whitelist() dependency_resolver = DependencyResolver( - PipResolver(whitelist), + PythonLibraryResolver(whitelist), NotebookResolver(NotebookLoader()), ImportFileResolver(file_loader, whitelist), mock_path_lookup, @@ -30,7 +30,7 @@ def test_folder_loads_content(mock_path_lookup): file_loader = FileLoader() whitelist = Whitelist() dependency_resolver = DependencyResolver( - PipResolver(whitelist), + PythonLibraryResolver(whitelist), NotebookResolver(NotebookLoader()), ImportFileResolver(file_loader, whitelist), mock_path_lookup, diff --git a/tests/unit/source_code/test_jobs.py b/tests/unit/source_code/test_jobs.py index 08b3737a73..f1fcd4726f 100644 --- a/tests/unit/source_code/test_jobs.py +++ b/tests/unit/source_code/test_jobs.py @@ -4,7 +4,7 @@ from unittest.mock import create_autospec import pytest -from databricks.labs.ucx.source_code.python_libraries import PipResolver +from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver from databricks.labs.ucx.source_code.known import Whitelist from databricks.sdk import WorkspaceClient from databricks.sdk.service import compute, jobs @@ -31,7 +31,7 @@ def dependency_resolver(mock_path_lookup) -> DependencyResolver: file_loader = FileLoader() whitelist = Whitelist() resolver = DependencyResolver( - PipResolver(whitelist), + PythonLibraryResolver(whitelist), NotebookResolver(NotebookLoader()), ImportFileResolver(file_loader, whitelist), mock_path_lookup, @@ -49,13 +49,13 @@ def graph(mock_path_lookup, dependency_resolver) -> DependencyGraph: def test_workflow_task_container_builds_dependency_graph_not_yet_implemented(mock_path_lookup, graph): # Goal of test is to raise test coverage, remove after implementing ws = create_autospec(WorkspaceClient) - library = compute.Library(jar="library.jar", egg="library.egg", whl="library.whl") + library = compute.Library(jar="library.jar", whl="library.whl") task = jobs.Task(task_key="test", libraries=[library], existing_cluster_id="id") workflow_task_container = WorkflowTaskContainer(ws, task) problems = workflow_task_container.build_dependency_graph(graph) - assert len(problems) == 3 + assert len(problems) == 2 assert all(problem.code == "not-yet-implemented" for problem in problems) ws.assert_not_called() @@ -204,3 +204,39 @@ def test_workflow_task_container_with_existing_cluster_builds_dependency_graph_p problems = workflow_task_container.build_dependency_graph(graph) assert len(problems) == 0 ws.assert_not_called() + + +def test_workflow_task_container_builds_dependency_graph_with_unknown_egg_library(mock_path_lookup, graph): + ws = create_autospec(WorkspaceClient) + ws.workspace.download.return_value = io.BytesIO(b"test") + + unknown_library = "/path/to/unknown/library.egg" + libraries = [compute.Library(egg=unknown_library)] + task = jobs.Task(task_key="test", libraries=libraries) + + workflow_task_container = WorkflowTaskContainer(ws, task) + problems = workflow_task_container.build_dependency_graph(graph) + + assert len(problems) == 1 + assert problems[0].code == "library-install-failed" + assert problems[0].message.startswith("Failed to install") + assert graph.path_lookup.resolve(Path(unknown_library)) is None + ws.workspace.download.assert_called_once_with(unknown_library, format=ExportFormat.AUTO) + + +def test_workflow_task_container_builds_dependency_graph_with_known_egg_library(mock_path_lookup, graph): + ws = create_autospec(WorkspaceClient) + + egg_file = Path(__file__).parent / "samples" / "library-egg" / "demo_egg-0.0.1-py3.6.egg" + with egg_file.open("rb") as f: + ws.workspace.download.return_value = io.BytesIO(f.read()) + + libraries = [compute.Library(egg=egg_file.as_posix())] + task = jobs.Task(task_key="test", libraries=libraries) + + workflow_task_container = WorkflowTaskContainer(ws, task) + problems = workflow_task_container.build_dependency_graph(graph) + + assert len(problems) == 0 + assert graph.path_lookup.resolve(Path("pkgdir")) is not None + ws.workspace.download.assert_called_once_with(egg_file.as_posix(), format=ExportFormat.AUTO) diff --git a/tests/unit/source_code/test_notebook.py b/tests/unit/source_code/test_notebook.py index d7433d0796..a1c52834c6 100644 --- a/tests/unit/source_code/test_notebook.py +++ b/tests/unit/source_code/test_notebook.py @@ -12,7 +12,7 @@ NotebookResolver, NotebookLoader, ) -from databricks.labs.ucx.source_code.python_libraries import PipResolver +from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver from databricks.labs.ucx.source_code.linters.imports import DbutilsLinter from tests.unit import _load_sources @@ -130,7 +130,7 @@ def test_notebook_generates_runnable_cells(source: tuple[str, Language, list[str def test_notebook_builds_leaf_dependency_graph(mock_path_lookup): notebook_loader = NotebookLoader() notebook_resolver = NotebookResolver(notebook_loader) - pip_resolver = PipResolver(Whitelist()) + pip_resolver = PythonLibraryResolver(Whitelist()) dependency_resolver = DependencyResolver(pip_resolver, notebook_resolver, [], mock_path_lookup) maybe = dependency_resolver.resolve_notebook(mock_path_lookup, Path("leaf1.py")) graph = DependencyGraph(maybe.dependency, None, dependency_resolver, mock_path_lookup) @@ -149,7 +149,7 @@ def test_notebook_builds_depth1_dependency_graph(mock_path_lookup): paths = ["root1.run.py", "leaf1.py", "leaf2.py"] notebook_loader = NotebookLoader() notebook_resolver = NotebookResolver(notebook_loader) - pip_resolver = PipResolver(Whitelist()) + pip_resolver = PythonLibraryResolver(Whitelist()) dependency_resolver = DependencyResolver(pip_resolver, notebook_resolver, [], mock_path_lookup) maybe = dependency_resolver.resolve_notebook(mock_path_lookup, Path(paths[0])) graph = DependencyGraph(maybe.dependency, None, dependency_resolver, mock_path_lookup) @@ -163,7 +163,7 @@ def test_notebook_builds_depth2_dependency_graph(mock_path_lookup): paths = ["root2.run.py", "root1.run.py", "leaf1.py", "leaf2.py"] notebook_loader = NotebookLoader() notebook_resolver = NotebookResolver(notebook_loader) - pip_resolver = PipResolver(Whitelist()) + pip_resolver = PythonLibraryResolver(Whitelist()) dependency_resolver = DependencyResolver(pip_resolver, notebook_resolver, [], mock_path_lookup) maybe = dependency_resolver.resolve_notebook(mock_path_lookup, Path(paths[0])) graph = DependencyGraph(maybe.dependency, None, dependency_resolver, mock_path_lookup) @@ -177,7 +177,7 @@ def test_notebook_builds_dependency_graph_avoiding_duplicates(mock_path_lookup): paths = ["root3.run.py", "root1.run.py", "leaf1.py", "leaf2.py"] notebook_loader = NotebookLoader() notebook_resolver = NotebookResolver(notebook_loader) - pip_resolver = PipResolver(Whitelist()) + pip_resolver = PythonLibraryResolver(Whitelist()) dependency_resolver = DependencyResolver(pip_resolver, notebook_resolver, [], mock_path_lookup) maybe = dependency_resolver.resolve_notebook(mock_path_lookup, Path(paths[0])) graph = DependencyGraph(maybe.dependency, None, dependency_resolver, mock_path_lookup) @@ -192,7 +192,7 @@ def test_notebook_builds_cyclical_dependency_graph(mock_path_lookup): paths = ["cyclical1.run.py", "cyclical2.run.py"] notebook_loader = NotebookLoader() notebook_resolver = NotebookResolver(notebook_loader) - pip_resolver = PipResolver(Whitelist()) + pip_resolver = PythonLibraryResolver(Whitelist()) dependency_resolver = DependencyResolver(pip_resolver, notebook_resolver, [], mock_path_lookup) maybe = dependency_resolver.resolve_notebook(mock_path_lookup, Path(paths[0])) graph = DependencyGraph(maybe.dependency, None, dependency_resolver, mock_path_lookup) diff --git a/tests/unit/source_code/test_path_lookup_simulation.py b/tests/unit/source_code/test_path_lookup_simulation.py index 0658d6df23..38c89cbd76 100644 --- a/tests/unit/source_code/test_path_lookup_simulation.py +++ b/tests/unit/source_code/test_path_lookup_simulation.py @@ -1,5 +1,6 @@ from pathlib import Path from tempfile import TemporaryDirectory +from unittest.mock import create_autospec import pytest from databricks.labs.ucx.source_code.linters.files import ImportFileResolver, FileLoader @@ -7,7 +8,7 @@ from databricks.labs.ucx.source_code.graph import SourceContainer, DependencyResolver from databricks.labs.ucx.source_code.notebooks.loaders import NotebookResolver, NotebookLoader from databricks.labs.ucx.source_code.known import Whitelist -from databricks.labs.ucx.source_code.python_libraries import PipResolver +from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver from tests.unit import ( _samples_path, ) @@ -44,7 +45,7 @@ def test_locates_notebooks(source: list[str], expected: int, mock_path_lookup): notebook_resolver = NotebookResolver(notebook_loader) whitelist = Whitelist() import_resolver = ImportFileResolver(file_loader, whitelist) - pip_resolver = PipResolver(whitelist) + pip_resolver = PythonLibraryResolver(whitelist) dependency_resolver = DependencyResolver(pip_resolver, notebook_resolver, import_resolver, mock_path_lookup) maybe = dependency_resolver.build_notebook_dependency_graph(notebook_path) assert not maybe.problems @@ -70,7 +71,7 @@ def test_locates_files(source: list[str], expected: int): notebook_loader = NotebookLoader() notebook_resolver = NotebookResolver(notebook_loader) import_resolver = ImportFileResolver(file_loader, whitelist) - pip_resolver = PipResolver(whitelist) + pip_resolver = PythonLibraryResolver(whitelist) resolver = DependencyResolver(pip_resolver, notebook_resolver, import_resolver, lookup) maybe = resolver.build_local_file_dependency_graph(file_path) assert not maybe.problems @@ -109,7 +110,7 @@ def test_locates_notebooks_with_absolute_path(): file_loader = FileLoader() whitelist = Whitelist() import_resolver = ImportFileResolver(file_loader, whitelist) - pip_resolver = PipResolver(whitelist) + pip_resolver = PythonLibraryResolver(whitelist) resolver = DependencyResolver(pip_resolver, notebook_resolver, import_resolver, lookup) maybe = resolver.build_notebook_dependency_graph(parent_file_path) assert not maybe.problems @@ -148,9 +149,104 @@ def func(): whitelist = Whitelist() file_loader = FileLoader() import_resolver = ImportFileResolver(file_loader, whitelist) - pip_resolver = PipResolver(whitelist) + pip_resolver = PythonLibraryResolver(whitelist) resolver = DependencyResolver(pip_resolver, notebook_resolver, import_resolver, lookup) maybe = resolver.build_notebook_dependency_graph(parent_file_path) assert not maybe.problems assert maybe.graph is not None assert maybe.graph.all_relative_names() == {"some_file.py", "import_file.py"} + + +def test_path_lookup_skips_resolving_within_file_library(tmp_path): + file = tmp_path / "file.py" + file.touch() + + lookup = PathLookup(Path.cwd(), [file]) + + try: + lookup.resolve(Path("package.py")) + except NotADirectoryError: + assert False, "PathLookup should skip resolving files" + else: + assert True + + +def test_path_lookup_skips_resolving_non_existing_library(tmp_path): + library = tmp_path / "non-existing-library" + + lookup = PathLookup(Path.cwd(), [library]) + + try: + lookup.resolve(Path("package.py")) + except FileNotFoundError: + assert False, "PathLookup should skip resolving non-existing libraries" + else: + assert True + + +def test_path_lookup_resolves_egg_package(tmp_path): + egg_library = tmp_path / "library.egg" + egg_library.mkdir() + (egg_library / "EGG-INFO").touch() + + package_path = egg_library / "package.py" + package_path.touch() + + lookup = PathLookup(Path.cwd(), [tmp_path]) + resolved_path = lookup.resolve(Path("package.py")) + + assert resolved_path == package_path + + +def test_path_lookup_does_not_resolve_package_in_corrupt_egg_package(tmp_path): + # Missing egg-info + egg_library = tmp_path / "library.egg" + egg_library.mkdir() + + package_path = egg_library / "package.py" + package_path.touch() + + lookup = PathLookup(Path.cwd(), [tmp_path]) + resolved_path = lookup.resolve(Path("package.py")) + + assert resolved_path is None + + +def test_path_lookup_skips_resolving_egg_files(tmp_path): + egg_library = tmp_path / "library.egg" + egg_library.touch() + + lookup = PathLookup(Path.cwd(), [tmp_path]) + + try: + lookup.resolve(Path("package.py")) + except NotADirectoryError: + assert False, "PathLookup should skip resolving egg files" + else: + assert True + + +def raise_permission_error(): + raise PermissionError("Can't access path") + + +def test_path_lookup_raises_permission_error_for_path(): + path = create_autospec(Path) + path.is_absolute.side_effect = raise_permission_error + + lookup = PathLookup(Path.cwd(), []) + + resolved_path = lookup.resolve(path) + assert resolved_path is None + path.is_absolute.assert_called_once() + + +def test_path_lookup_raises_permission_error_for_library_root(): + library_root = create_autospec(Path) + library_root.is_dir.side_effect = raise_permission_error + + lookup = PathLookup(Path.cwd(), [library_root]) + + resolved_path = lookup.resolve(Path("library.py")) + assert resolved_path is None + library_root.is_dir.assert_called_once() diff --git a/tests/unit/source_code/test_python_libraries.py b/tests/unit/source_code/test_python_libraries.py index c53f247afb..dc6c14ee5b 100644 --- a/tests/unit/source_code/test_python_libraries.py +++ b/tests/unit/source_code/test_python_libraries.py @@ -3,7 +3,7 @@ from databricks.labs.ucx.source_code.graph import DependencyProblem from databricks.labs.ucx.source_code.path_lookup import PathLookup -from databricks.labs.ucx.source_code.python_libraries import PipResolver +from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver from databricks.labs.ucx.source_code.known import Whitelist @@ -12,7 +12,7 @@ def mock_pip_install(command): assert command.startswith("pip install anything -t") return 0, "", "" - pip_resolver = PipResolver(Whitelist(), mock_pip_install) + pip_resolver = PythonLibraryResolver(Whitelist(), mock_pip_install) problems = pip_resolver.register_library(mock_path_lookup, Path("anything")) assert len(problems) == 0 @@ -22,7 +22,7 @@ def test_pip_resolver_failing(mock_path_lookup): def mock_pip_install(_): return 1, "", "nope" - pip_resolver = PipResolver(Whitelist(), mock_pip_install) + pip_resolver = PythonLibraryResolver(Whitelist(), mock_pip_install) problems = pip_resolver.register_library(mock_path_lookup, Path("anything")) assert problems == [DependencyProblem("library-install-failed", "Failed to install anything: nope")] @@ -33,7 +33,7 @@ def mock_pip_install(_): return 0, "", "" path_lookup = create_autospec(PathLookup) - pip_resolver = PipResolver(Whitelist(), mock_pip_install) + pip_resolver = PythonLibraryResolver(Whitelist(), mock_pip_install) problems = pip_resolver.register_library(path_lookup, Path("library")) assert len(problems) == 0 @@ -48,7 +48,7 @@ def test_pip_resolver_resolves_library_with_known_problems(mock_path_lookup): def mock_pip_install(_): return 0, "", "" - pip_resolver = PipResolver(Whitelist(), mock_pip_install) + pip_resolver = PythonLibraryResolver(Whitelist(), mock_pip_install) problems = pip_resolver.register_library(mock_path_lookup, Path("boto3==1.17.0")) assert len(problems) == 1 diff --git a/tests/unit/source_code/test_s3fs.py b/tests/unit/source_code/test_s3fs.py index a9d16f5943..34b299e357 100644 --- a/tests/unit/source_code/test_s3fs.py +++ b/tests/unit/source_code/test_s3fs.py @@ -9,7 +9,7 @@ from databricks.labs.ucx.source_code.linters.files import FileLoader, ImportFileResolver from databricks.labs.ucx.source_code.notebooks.loaders import NotebookLoader, NotebookResolver from databricks.labs.ucx.source_code.known import Whitelist -from databricks.labs.ucx.source_code.python_libraries import PipResolver +from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver S3FS_DEPRECATION_MESSAGE = ( 'S3fs library assumes AWS IAM Instance Profile to work with ' @@ -120,7 +120,7 @@ def test_detect_s3fs_import(empty_index, source: str, expected: list[DependencyP file_loader = FileLoader() notebook_resolver = NotebookResolver(notebook_loader) import_resolver = ImportFileResolver(file_loader, whitelist) - pip_resolver = PipResolver(whitelist) + pip_resolver = PythonLibraryResolver(whitelist) dependency_resolver = DependencyResolver(pip_resolver, notebook_resolver, import_resolver, mock_path_lookup) maybe = dependency_resolver.build_local_file_dependency_graph(sample) assert maybe.problems == [_.replace(source_path=sample) for _ in expected] @@ -150,7 +150,7 @@ def test_detect_s3fs_import_in_dependencies( file_loader = FileLoader() whitelist = Whitelist() import_resolver = ImportFileResolver(file_loader, whitelist) - pip_resolver = PipResolver(whitelist) + pip_resolver = PythonLibraryResolver(whitelist) dependency_resolver = DependencyResolver(pip_resolver, mock_notebook_resolver, import_resolver, mock_path_lookup) sample = mock_path_lookup.cwd / "root9.py" maybe = dependency_resolver.build_local_file_dependency_graph(sample)