diff --git a/src/databricks/labs/ucx/github.py b/src/databricks/labs/ucx/github.py new file mode 100644 index 0000000000..5db61fe257 --- /dev/null +++ b/src/databricks/labs/ucx/github.py @@ -0,0 +1,38 @@ +from enum import Enum +import urllib.parse + + +DOCS_URL = "https://databrickslabs.github.io/ucx/docs/" +GITHUB_URL = "https://github.com/databrickslabs/ucx" + + +class IssueType(Enum): + """The issue type""" + + FEATURE = "Feature" + BUG = "Bug" + TASK = "Task" + + +def construct_new_issue_url( + issue_type: IssueType, + title: str, + body: str, + *, + labels: set[str] | None = None, +) -> str: + """Construct a new issue URL. + + References: + - https://docs.github.com/en/issues/tracking-your-work-with-issues/using-issues/creating-an-issue#creating-an-issue-from-a-url-query + """ + labels = labels or set() + labels.add("needs-triage") + parameters = { + "type": issue_type.value, + "title": title, + "body": body, + "labels": ",".join(sorted(labels)), + } + query = "&".join(f"{key}={urllib.parse.quote_plus(value)}" for key, value in parameters.items()) + return f"{GITHUB_URL}/issues/new?{query}" diff --git a/src/databricks/labs/ucx/install.py b/src/databricks/labs/ucx/install.py index b291ab466b..d2da9015e2 100644 --- a/src/databricks/labs/ucx/install.py +++ b/src/databricks/labs/ucx/install.py @@ -48,6 +48,7 @@ ) from databricks.sdk.useragent import with_extra +from databricks.labs.ucx.github import DOCS_URL from databricks.labs.ucx.__about__ import __version__ from databricks.labs.ucx.assessment.azure import AzureServicePrincipalInfo from databricks.labs.ucx.assessment.clusters import ClusterInfo, PolicyInfo @@ -218,7 +219,7 @@ def run( if isinstance(err.__cause__, RequestsConnectionError): logger.warning( f"Cannot connect with {self.workspace_client.config.host} see " - f"https://github.com/databrickslabs/ucx#network-connectivity-issues for help: {err}" + f"{DOCS_URL}reference/common_challenges/#network-connectivity-issues for help: {err}" ) raise err return 
config @@ -573,7 +574,7 @@ def _create_database(self): if "Unable to load AWS credentials from any provider in the chain" in str(err): msg = ( "The UCX installation is configured to use external metastore. There is issue with the external metastore connectivity.\n" - "Please check the UCX installation instruction https://github.com/databrickslabs/ucx?tab=readme-ov-file#prerequisites" + f"Please check the UCX installation instruction {DOCS_URL}installation " "and re-run installation.\n" "Please Follow the Below Command to uninstall and Install UCX\n" "UCX Uninstall: databricks labs uninstall ucx.\n" diff --git a/src/databricks/labs/ucx/source_code/known.py b/src/databricks/labs/ucx/source_code/known.py index 0f1e24aba7..fde858e513 100644 --- a/src/databricks/labs/ucx/source_code/known.py +++ b/src/databricks/labs/ucx/source_code/known.py @@ -14,6 +14,7 @@ from databricks.labs.blueprint.entrypoint import get_logger +from databricks.labs.ucx.github import GITHUB_URL from databricks.labs.ucx.hive_metastore.table_migration_status import TableMigrationIndex from databricks.labs.ucx.source_code.base import Advice, CurrentSessionState from databricks.labs.ucx.source_code.graph import ( @@ -24,6 +25,7 @@ from databricks.labs.ucx.source_code.path_lookup import PathLookup logger = logging.getLogger(__name__) +KNOWN_URL = f"{GITHUB_URL}/blob/main/src/databricks/labs/ucx/source_code/known.json" """ Known libraries that are not in known.json @@ -282,10 +284,9 @@ class KnownDependency(Dependency): """A dependency for known libraries, see :class:KnownList.""" def __init__(self, module_name: str, problems: list[KnownProblem]): - known_url = "https://github.com/databrickslabs/ucx/blob/main/src/databricks/labs/ucx/source_code/known.json" # Note that Github does not support navigating JSON files, hence the # does nothing. 
# https://docs.github.com/en/repositories/working-with-files/using-files/navigating-code-on-github - super().__init__(KnownLoader(), Path(f"{known_url}#{module_name}"), inherits_context=False) + super().__init__(KnownLoader(), Path(f"{KNOWN_URL}#{module_name}"), inherits_context=False) self._module_name = module_name self.problems = problems diff --git a/src/databricks/labs/ucx/source_code/linters/base.py b/src/databricks/labs/ucx/source_code/linters/base.py index 0841648bad..28bbfa1872 100644 --- a/src/databricks/labs/ucx/source_code/linters/base.py +++ b/src/databricks/labs/ucx/source_code/linters/base.py @@ -114,6 +114,35 @@ def lint(self, code: str) -> Iterable[Advice]: def lint_tree(self, tree: Tree) -> Iterable[Advice]: ... +class PythonFixer(Fixer): + """Fix python source code.""" + + def apply(self, code: str) -> str: + """Apply the changes to Python source code. + + The source code is parsed into an AST tree, and the fixes are applied + to the tree. + """ + maybe_tree = MaybeTree.from_source_code(code) + if maybe_tree.failure: + # Fixing does not yield parse failures, linting does + logger.warning(f"Parsing source code resulted in failure `{maybe_tree.failure}`: {code}") + return code + assert maybe_tree.tree is not None + tree = self.apply_tree(maybe_tree.tree) + return tree.node.as_string() + + @abstractmethod + def apply_tree(self, tree: Tree) -> Tree: + """Apply the fixes to the AST tree. + + For Python, the fixes are applied to a Tree so that we + - Can chain multiple fixers without transpiling back and forth between + source code and AST tree + - Can extend the tree with (brought into scope) nodes, e.g. 
to add imports + """ + + class DfsaPyCollector(DfsaCollector, ABC): def collect_dfsas(self, source_code: str) -> Iterable[DirectFsAccess]: diff --git a/src/databricks/labs/ucx/source_code/linters/files.py b/src/databricks/labs/ucx/source_code/linters/files.py index 77a023ac60..cd8b1f72dc 100644 --- a/src/databricks/labs/ucx/source_code/linters/files.py +++ b/src/databricks/labs/ucx/source_code/linters/files.py @@ -17,7 +17,7 @@ from databricks.labs.ucx.source_code.files import LocalFile from databricks.labs.ucx.source_code.graph import Dependency from databricks.labs.ucx.source_code.known import KnownDependency -from databricks.labs.ucx.source_code.linters.base import PythonLinter +from databricks.labs.ucx.source_code.linters.base import PythonFixer, PythonLinter from databricks.labs.ucx.source_code.linters.context import LinterContext from databricks.labs.ucx.source_code.linters.imports import SysPathChange, UnresolvedPath from databricks.labs.ucx.source_code.notebooks.cells import ( @@ -26,7 +26,6 @@ RunCell, RunCommand, ) -from databricks.labs.ucx.source_code.notebooks.loaders import NotebookLoader from databricks.labs.ucx.source_code.notebooks.magic import MagicLine from databricks.labs.ucx.source_code.notebooks.sources import Notebook from databricks.labs.ucx.source_code.path_lookup import PathLookup @@ -42,7 +41,11 @@ class NotebookLinter: """ def __init__( - self, notebook: Notebook, path_lookup: PathLookup, context: LinterContext, parent_tree: Tree | None = None + self, + notebook: Notebook, + path_lookup: PathLookup, + context: LinterContext, + parent_tree: Tree | None = None, ): self._context: LinterContext = context self._path_lookup = path_lookup @@ -76,6 +79,37 @@ def lint(self) -> Iterable[Advice]: ) return + def apply(self) -> None: + """Apply changes to the notebook.""" + maybe_tree = self._parse_notebook(self._notebook, parent_tree=self._parent_tree) + if maybe_tree and maybe_tree.failure: + logger.warning("Failed to parse the notebook, run linter 
for more details.") + return + for cell in self._notebook.cells: + try: + linter = self._context.linter(cell.language.language) + except ValueError: # Language is not supported (yet) + continue + fixed_code = cell.original_code # For default fixing + tree = self._python_tree_cache.get((self._notebook.path, cell)) # For Python fixing + is_python_cell = isinstance(cell, PythonCell) + if is_python_cell and tree: + advices = cast(PythonLinter, linter).lint_tree(tree) + else: + advices = linter.lint(cell.original_code) + for advice in advices: + fixer = self._context.fixer(cell.language.language, advice.code) + if not fixer: + continue + if is_python_cell and tree: + # By calling `apply_tree` instead of `apply`, we chain fixes on the same tree + tree = cast(PythonFixer, fixer).apply_tree(tree) + else: + fixed_code = fixer.apply(fixed_code) + cell.migrated_code = tree.node.as_string() if tree else fixed_code + self._notebook.back_up_original_and_flush_migrated_code() + return + def _parse_notebook(self, notebook: Notebook, *, parent_tree: Tree) -> MaybeTree | None: """Parse a notebook by parsing its cells. 
@@ -264,6 +298,8 @@ def apply(self) -> None: source_container = self._dependency.load(self._path_lookup) if isinstance(source_container, LocalFile): self._apply_file(source_container) + elif isinstance(source_container, Notebook): + self._apply_notebook(source_container) def _apply_file(self, local_file: LocalFile) -> None: """Apply changes to a local file.""" @@ -271,43 +307,7 @@ def _apply_file(self, local_file: LocalFile) -> None: local_file.migrated_code = fixed_code local_file.back_up_original_and_flush_migrated_code() - -class NotebookMigrator: - def __init__(self, languages: LinterContext): - # TODO: move languages to `apply` - self._languages = languages - - def revert(self, path: Path) -> bool: - backup_path = path.with_suffix(".bak") - if not backup_path.exists(): - return False - return path.write_text(backup_path.read_text()) > 0 - - def apply(self, path: Path) -> bool: - if not path.exists(): - return False - dependency = Dependency(NotebookLoader(), path) - # TODO: the interface for this method has to be changed - lookup = PathLookup.from_sys_path(Path.cwd()) - container = dependency.load(lookup) - assert isinstance(container, Notebook) - return self._apply(container) - - def _apply(self, notebook: Notebook) -> bool: - changed = False - for cell in notebook.cells: - # %run is not a supported language, so this needs to come first - if isinstance(cell, RunCell): - # TODO migration data, see https://github.com/databrickslabs/ucx/issues/1327 - continue - if not self._languages.is_supported(cell.language.language): - continue - migrated_code = self._languages.apply_fixes(cell.language.language, cell.original_code) - if migrated_code != cell.original_code: - cell.migrated_code = migrated_code - changed = True - if changed: - # TODO https://github.com/databrickslabs/ucx/issues/1327 store 'migrated' status - notebook.path.replace(notebook.path.with_suffix(".bak")) - notebook.path.write_text(notebook.to_migrated_code()) - return changed + def 
_apply_notebook(self, notebook: Notebook) -> None: + """Apply changes to a notebook.""" + notebook_linter = NotebookLinter(notebook, self._path_lookup, self._context, self._inherited_tree) + notebook_linter.apply() diff --git a/src/databricks/labs/ucx/source_code/linters/pyspark.py b/src/databricks/labs/ucx/source_code/linters/pyspark.py index 7c0478c3ea..af7590d40a 100644 --- a/src/databricks/labs/ucx/source_code/linters/pyspark.py +++ b/src/databricks/labs/ucx/source_code/linters/pyspark.py @@ -6,6 +6,8 @@ from typing import TypeVar from astroid import Attribute, Call, Const, Name, NodeNG # type: ignore + +from databricks.labs.ucx.github import IssueType, construct_new_issue_url from databricks.labs.ucx.hive_metastore.table_migration_status import TableMigrationIndex, TableMigrationStatus from databricks.labs.ucx.source_code.base import ( Advice, @@ -20,6 +22,7 @@ from databricks.labs.ucx.source_code.linters.base import ( SqlLinter, Fixer, + PythonFixer, PythonLinter, DfsaPyCollector, TablePyCollector, @@ -33,7 +36,6 @@ from databricks.labs.ucx.source_code.linters.from_table import FromTableSqlLinter from databricks.labs.ucx.source_code.python.python_ast import ( MatchingVisitor, - MaybeTree, Tree, TreeHelper, ) @@ -170,8 +172,16 @@ def lint( def apply(self, from_table: FromTableSqlLinter, index: TableMigrationIndex, node: Call) -> None: table_arg = self._get_table_arg(node) - assert isinstance(table_arg, Const) - # TODO locate constant when value is inferred + if not isinstance(table_arg, Const): + # TODO: https://github.com/databrickslabs/ucx/issues/3695 + source_code = node.as_string() + body = ( + "# Desired behaviour\n\nAutofix following Python code\n\n" + f"``` python\nTODO: Add relevant source code\n{source_code}\n```" + ) + url = construct_new_issue_url(IssueType.FEATURE, "Autofix the following Python code", body) + logger.warning(f"Cannot fix the following Python code: {source_code}. 
Please report this issue at {url}") + return info = UsedTable.parse(table_arg.value, from_table.schema) dst = self._find_dest(index, info) if dst is not None: @@ -393,7 +403,7 @@ def matchers(self) -> dict[str, _TableNameMatcher]: return self._matchers -class SparkTableNamePyLinter(PythonLinter, Fixer, TablePyCollector): +class SparkTableNamePyLinter(PythonLinter, PythonFixer, TablePyCollector): """Linter for table name references in PySpark Examples: @@ -427,21 +437,15 @@ def lint_tree(self, tree: Tree) -> Iterable[Advice]: assert isinstance(node, Call) yield from matcher.lint(self._from_table, self._index, self._session_state, node) - def apply(self, code: str) -> str: - maybe_tree = MaybeTree.from_source_code(code) - if not maybe_tree.tree: - assert maybe_tree.failure is not None - logger.warning(maybe_tree.failure.message) - return code - tree = maybe_tree.tree - # we won't be doing it like this in production, but for the sake of the example + def apply_tree(self, tree: Tree) -> Tree: + """Apply the fixes to the AST tree.""" for node in tree.walk(): matcher = self._find_matcher(node) if matcher is None: continue assert isinstance(node, Call) matcher.apply(self._from_table, self._index, node) - return tree.node.as_string() + return tree def _find_matcher(self, node: NodeNG) -> _TableNameMatcher | None: if not isinstance(node, Call): @@ -476,7 +480,7 @@ def _visit_call_nodes(cls, tree: Tree) -> Iterable[tuple[Call, NodeNG]]: yield call_node, query -class _SparkSqlPyLinter(_SparkSqlAnalyzer, PythonLinter, Fixer): +class _SparkSqlPyLinter(_SparkSqlAnalyzer, PythonLinter, PythonFixer): """Linter for SparkSQL used within PySpark.""" def __init__(self, sql_linter: SqlLinter, sql_fixer: Fixer | None): @@ -503,15 +507,10 @@ def lint_tree(self, tree: Tree) -> Iterable[Advice]: code = self.diagnostic_code yield dataclasses.replace(advice.replace_from_node(call_node), code=code) - def apply(self, code: str) -> str: + def apply_tree(self, tree: Tree) -> Tree: + """Apply the 
fixes to the AST tree.""" if not self._sql_fixer: - return code - maybe_tree = MaybeTree.from_source_code(code) - if maybe_tree.failure: - logger.warning(maybe_tree.failure.message) - return code - assert maybe_tree.tree is not None - tree = maybe_tree.tree + return tree for _call_node, query in self._visit_call_nodes(tree): if not isinstance(query, Const) or not isinstance(query.value, str): continue @@ -519,7 +518,7 @@ def apply(self, code: str) -> str: # this requires changing 'apply' API in order to check advice fragment location new_query = self._sql_fixer.apply(query.value) query.value = new_query - return tree.node.as_string() + return tree class FromTableSqlPyLinter(_SparkSqlPyLinter): diff --git a/src/databricks/labs/ucx/source_code/notebooks/sources.py b/src/databricks/labs/ucx/source_code/notebooks/sources.py index 5e3de710b0..da7cdcad48 100644 --- a/src/databricks/labs/ucx/source_code/notebooks/sources.py +++ b/src/databricks/labs/ucx/source_code/notebooks/sources.py @@ -6,6 +6,7 @@ from databricks.sdk.service.workspace import Language +from databricks.labs.ucx.source_code.base import back_up_path, safe_write_text, revert_back_up_path from databricks.labs.ucx.source_code.graph import ( SourceContainer, DependencyGraph, @@ -23,18 +24,23 @@ class Notebook(SourceContainer): + """A notebook source code container. 
- @staticmethod - def parse(path: Path, source: str, default_language: Language) -> Notebook: + TODO: + Let `Notebook` inherit from `LocalFile` + """ + + @classmethod + def parse(cls, path: Path, source: str, default_language: Language) -> Notebook: default_cell_language = CellLanguage.of_language(default_language) cells = default_cell_language.extract_cells(source) if cells is None: raise ValueError(f"Could not parse Notebook: {path}") - return Notebook(path, source, default_language, cells, source.endswith('\n')) + return cls(path, source, default_language, cells, source.endswith('\n')) def __init__(self, path: Path, source: str, language: Language, cells: list[Cell], ends_with_lf: bool): self._path = path - self._source = source + self._original_code = source self._language = language self._cells = cells self._ends_with_lf = ends_with_lf @@ -49,9 +55,11 @@ def cells(self) -> list[Cell]: @property def original_code(self) -> str: - return self._source + return self._original_code - def to_migrated_code(self) -> str: + @property + def migrated_code(self) -> str: + """Format the migrated code by chaining the migrated cells.""" default_language = CellLanguage.of_language(self._language) header = f"{default_language.comment_prefix} {NOTEBOOK_HEADER}" sources = [header] @@ -68,6 +76,39 @@ def to_migrated_code(self) -> str: sources.append('') # following join will append lf return '\n'.join(sources) + def _safe_write_text(self, contents: str) -> int | None: + """Write content to the local file.""" + return safe_write_text(self._path, contents) + + def _back_up_path(self) -> Path | None: + """Back up the original file.""" + return back_up_path(self._path) + + def back_up_original_and_flush_migrated_code(self) -> int | None: + """Back up the original notebook and flush the migrated code to the file. + + This is a single method to avoid overwriting the original file without a backup. + + Returns : + int : The number of characters written. 
If None, nothing is written to the file. + + TODO: + Let `Notebook` inherit from `LocalFile` and reuse implementation of + `back_up_original_and_flush_migrated_code`. + """ + if self.original_code == self.migrated_code: + # Avoiding unnecessary back up and flush + return len(self.migrated_code) + backed_up_path = self._back_up_path() + if not backed_up_path: + # Failed to back up the original file, avoid overwriting existing file + return None + number_of_characters_written = self._safe_write_text(self.migrated_code) + if number_of_characters_written is None: + # Failed to overwrite original file, clean up by reverting backup + revert_back_up_path(self._path) + return number_of_characters_written + def build_dependency_graph(self, parent: DependencyGraph) -> list[DependencyProblem]: """Check for any problems with dependencies of the cells in this notebook. diff --git a/src/databricks/labs/ucx/source_code/python/python_ast.py b/src/databricks/labs/ucx/source_code/python/python_ast.py index 1817fcf7e2..7057f49f6d 100644 --- a/src/databricks/labs/ucx/source_code/python/python_ast.py +++ b/src/databricks/labs/ucx/source_code/python/python_ast.py @@ -4,7 +4,6 @@ import logging import sys import re -import urllib.parse from abc import ABC from collections.abc import Iterable from dataclasses import dataclass @@ -29,9 +28,9 @@ ) from astroid.exceptions import AstroidSyntaxError # type: ignore -from databricks.labs.ucx.source_code.base import ( - Failure, -) +from databricks.labs.ucx.github import IssueType, construct_new_issue_url +from databricks.labs.ucx.source_code.base import Failure + logger = logging.getLogger(__name__) missing_handlers: set[str] = set() @@ -100,12 +99,8 @@ def _failure_from_exception(source_code: str, e: Exception) -> Failure: end_line=(e.error.end_lineno or 2) - 1, end_col=(e.error.end_offset or 2) - 1, ) - new_issue_url = ( - "https://github.com/databrickslabs/ucx/issues/new?title=[BUG]:+Python+parse+error" - 
"&labels=migrate/code,needs-triage,bug" "&body=%23+Current+behaviour%0A%0ACannot+parse+the+following+Python+code" f"%0A%0A%60%60%60+python%0A{urllib.parse.quote_plus(source_code)}%0A%60%60%60" ) + body = f"# Desired behaviour\n\nCannot parse the following Python code\n\n``` python\n{source_code}\n```" + new_issue_url = construct_new_issue_url(IssueType.BUG, "Python parse error", body) return Failure( code="python-parse-error", message=( diff --git a/src/databricks/labs/ucx/source_code/python_libraries.py b/src/databricks/labs/ucx/source_code/python_libraries.py index 41ead804f7..f6b8b82f25 100644 --- a/src/databricks/labs/ucx/source_code/python_libraries.py +++ b/src/databricks/labs/ucx/source_code/python_libraries.py @@ -12,7 +12,7 @@ from databricks.labs.ucx.framework.utils import run_command from databricks.labs.ucx.source_code.graph import DependencyProblem, LibraryResolver -from databricks.labs.ucx.source_code.known import KnownList +from databricks.labs.ucx.source_code.known import KNOWN_URL, KnownList from databricks.labs.ucx.source_code.path_lookup import PathLookup @@ -35,7 +35,6 @@ def register_library(self, path_lookup: PathLookup, *libraries: str) -> list[Dep """We delegate to pip to install the library and augment the path look-up to resolve the library at import. 
This gives us the flexibility to install any library that is not in the allow-list, and we don't have to bother about parsing cross-version dependencies in our code.""" - known_url = "https://github.com/databrickslabs/ucx/blob/main/src/databricks/labs/ucx/source_code/known.json" if len(libraries) == 0: return [] if len(libraries) == 1: # Multiple libraries might be installation flags @@ -43,7 +42,7 @@ def register_library(self, path_lookup: PathLookup, *libraries: str) -> list[Dep compatibility = self._allow_list.distribution_compatibility(library) if compatibility.known: # TODO: Pass in the line number and column number https://github.com/databrickslabs/ucx/issues/3625 - path = Path(f"{known_url}#{library}") + path = Path(f"{KNOWN_URL}#{library}") problems = [DependencyProblem(p.code, p.message, path) for p in compatibility.problems] return problems return self._install_library(path_lookup, *libraries) diff --git a/tests/integration/source_code/linters/test_files.py b/tests/integration/source_code/linters/test_files.py index 0ae136b713..5990c80e8c 100644 --- a/tests/integration/source_code/linters/test_files.py +++ b/tests/integration/source_code/linters/test_files.py @@ -27,6 +27,6 @@ def test_local_code_linter_lints_ucx(local_checkout_context, ucx_source_path) -> assert len(problems) > 0, "Found no problems while linting ucx" -def test_local_code_migrator_fixes_ucx(local_checkout_context, ucx_source_path) -> None: +def test_local_code_linter_applies_ucx(local_checkout_context, ucx_source_path) -> None: problems = list(local_checkout_context.local_code_linter.apply(ucx_source_path)) assert len(problems) == 0, "Found no problems while fixing ucx" diff --git a/tests/unit/source_code/linters/test_base.py b/tests/unit/source_code/linters/test_base.py new file mode 100644 index 0000000000..c1f1036d20 --- /dev/null +++ b/tests/unit/source_code/linters/test_base.py @@ -0,0 +1,52 @@ +from databricks.labs.ucx.source_code.linters.base import PythonFixer +from 
databricks.labs.ucx.source_code.python.python_ast import Tree + + +def test_python_fixer_has_dummy_code() -> None: + class _PythonFixer(PythonFixer): + + @property + def diagnostic_code(self) -> str: + """Dummy diagnostic code""" + return "dummy" + + def apply_tree(self, tree: Tree) -> Tree: + """Dummy fixer""" + return tree + + fixer = _PythonFixer() + assert fixer.diagnostic_code == "dummy" + + +def test_python_fixer_applies_valid_python() -> None: + class _PythonFixer(PythonFixer): + + @property + def diagnostic_code(self) -> str: + """Dummy diagnostic code""" + return "dummy" + + def apply_tree(self, tree: Tree) -> Tree: + """Dummy fixer""" + return tree + + fixer = _PythonFixer() + assert fixer.apply("print(1)\n") == "print(1)\n\n" # Format introduces EOF newline + + +def test_python_fixer_applies_invalid_python() -> None: + """Cannot fix invalid Python, thus nothing should happen""" + + class _PythonFixer(PythonFixer): + + @property + def diagnostic_code(self) -> str: + """Dummy diagnostic code""" + return "dummy" + + def apply_tree(self, tree: Tree) -> Tree: + """Dummy fixer""" + return tree + + fixer = _PythonFixer() + assert fixer.apply("print(1") == "print(1" # closing parenthesis is missing on purpose diff --git a/tests/unit/source_code/linters/test_files.py b/tests/unit/source_code/linters/test_files.py index eaa781247e..d8b2031852 100644 --- a/tests/unit/source_code/linters/test_files.py +++ b/tests/unit/source_code/linters/test_files.py @@ -13,7 +13,7 @@ from databricks.labs.ucx.source_code.graph import Dependency, DependencyResolver, SourceContainer from databricks.labs.ucx.source_code.linters.base import PythonLinter from databricks.labs.ucx.source_code.linters.context import LinterContext -from databricks.labs.ucx.source_code.linters.files import FileLinter, NotebookLinter, NotebookMigrator +from databricks.labs.ucx.source_code.linters.files import FileLinter, NotebookLinter from databricks.labs.ucx.source_code.linters.folders import 
LocalCodeLinter from databricks.labs.ucx.source_code.notebooks.loaders import NotebookLoader, NotebookResolver from databricks.labs.ucx.source_code.notebooks.sources import Notebook @@ -142,20 +142,6 @@ def test_file_linter_applies_migrated(tmp_path, mock_path_lookup, migration_inde assert path.read_text().rstrip() == "df = spark.read.table('brand.new.stuff')" -def test_notebook_migrator_ignores_unsupported_extensions() -> None: - languages = LinterContext(TableMigrationIndex([])) - migrator = NotebookMigrator(languages) - path = Path('unsupported.ext') - assert not migrator.apply(path) - - -def test_notebook_migrator_supported_language_no_diagnostics(mock_path_lookup) -> None: - languages = LinterContext(TableMigrationIndex([])) - migrator = NotebookMigrator(languages) - path = mock_path_lookup.resolve(Path("root1.run.py")) - assert not migrator.apply(path) - - def test_triple_dot_import() -> None: file_resolver = ImportFileResolver(FileLoader()) path_lookup = create_autospec(PathLookup) @@ -266,6 +252,52 @@ def test_notebook_linter_lints_source_yielding_parse_failure(migration_index, mo ] +def test_notebook_linter_applies_migrated_table_with_rename(tmp_path, migration_index, mock_path_lookup) -> None: + """The table should be migrated to the new table name as defined in the migration index.""" + path = tmp_path / "notebook.py" + source = "# Databricks notebook source\nspark.table('old.things')" + path.write_text(source) + notebook = Notebook.parse(path, source, Language.PYTHON) + linter = NotebookLinter(notebook, mock_path_lookup, LinterContext(migration_index)) + + linter.apply() + + contents = path.read_text() + assert "old.things" not in contents + assert "brand.new.stuff" in contents + + +def test_notebook_linter_applies_two_migrated_table_with_rename(tmp_path, migration_index, mock_path_lookup) -> None: + """The table should be migrated to the new table names as defined in the migration index.""" + path = tmp_path / "notebook.py" + source = "# Databricks 
notebook source\nspark.table('old.things')\nspark.table('other.matters')" + path.write_text(source) + notebook = Notebook.parse(path, source, Language.PYTHON) + linter = NotebookLinter(notebook, mock_path_lookup, LinterContext(migration_index)) + + linter.apply() + + contents = path.read_text() + assert "old.things" not in contents + assert "brand.new.stuff" in contents + assert "some.certain.issues" in contents + + +def test_notebook_linter_applies_no_op_for_non_migrated_table(tmp_path, migration_index, mock_path_lookup) -> None: + """The table should not be migrated to the new table name as defined in the migration index.""" + path = tmp_path / "notebook.py" + source = "# Databricks notebook source\nspark.table('not.migrated.table')" + path.write_text(source) + notebook = Notebook.parse(path, source, Language.PYTHON) + linter = NotebookLinter(notebook, mock_path_lookup, LinterContext(migration_index)) + + linter.apply() + + contents = path.read_text() + assert "not.migrated.table" in contents + assert "brand.new.stuff" not in contents + + @pytest.mark.xfail(reason="https://github.com/databrickslabs/ucx/issues/3556") def test_notebook_linter_lints_source_yielding_parse_failures(migration_index, mock_path_lookup) -> None: source = """ @@ -301,18 +333,37 @@ def test_notebook_linter_lints_source_yielding_parse_failures(migration_index, m ] -def test_notebook_linter_lints_migrated_table(migration_index, mock_path_lookup) -> None: +@pytest.mark.skip(reason="https://github.com/databrickslabs/ucx/issues/3695") +def test_notebook_linter_lints_and_applies_migrated_table(tmp_path, migration_index, mock_path_lookup) -> None: """Regression test with the tests below.""" - source = """ + expected = """ # Databricks notebook source +table_name = 'brand.new.stuff' + -table_name = "old.things" # Migrated table according to the migration index # COMMAND ---------- spark.table(table_name) + + """.lstrip() - linter = _NotebookLinter.from_source_code(migration_index, mock_path_lookup, 
source, Language.PYTHON) + source_code = """ +# Databricks notebook source +table_name = 'old.things' + + + +# COMMAND ---------- + +spark.table(table_name) + + +""".lstrip() + path = tmp_path / "notebook.py" + path.write_text(source_code) + notebook = Notebook.parse(path, source_code, Language.PYTHON) + linter = NotebookLinter(notebook, mock_path_lookup, LinterContext(migration_index)) advices = list(linter.lint()) @@ -320,83 +371,141 @@ def test_notebook_linter_lints_migrated_table(migration_index, mock_path_lookup) assert advices[0] == Deprecation( code='table-migrated-to-uc-python', message='Table old.things is migrated to brand.new.stuff in Unity Catalog', - start_line=6, + start_line=7, start_col=0, - end_line=6, + end_line=7, end_col=23, ) + linter.apply() + + assert path.read_text() == expected + -def test_notebook_linter_lints_not_migrated_table(migration_index, mock_path_lookup) -> None: +def test_notebook_linter_lints_and_applies_not_migrated_table(tmp_path, migration_index, mock_path_lookup) -> None: """Regression test with the tests above and below.""" - source = """ + source_code = """ # Databricks notebook source +table_name = 'not_migrated.table' + -table_name = "not_migrated.table" # NOT a migrated table according to the migration index # COMMAND ---------- spark.table(table_name) + + """.lstrip() - linter = _NotebookLinter.from_source_code(migration_index, mock_path_lookup, source, Language.PYTHON) + path = tmp_path / "notebook.py" + path.write_text(source_code) + notebook = Notebook.parse(path, source_code, Language.PYTHON) + linter = NotebookLinter(notebook, mock_path_lookup, LinterContext(migration_index)) advices = list(linter.lint()) assert not [advice for advice in advices if advice.code == "table-migrated-to-uc"] + linter.apply() + + assert path.read_text() == source_code + -def test_notebook_linter_lints_migrated_table_with_rename(migration_index, mock_path_lookup) -> None: 
+@pytest.mark.skip(reason="https://github.com/databrickslabs/ucx/issues/3695") +def test_notebook_linter_lints_and_applies_migrated_table_with_rename( + tmp_path, migration_index, mock_path_lookup +) -> None: """The spark.table should read the table defined above the call not below. This is a regression test with the tests above and below. """ - source = """ + expected = """ # Databricks notebook source +table_name = 'brand.new.stuff' + -table_name = "old.things" # Migrated table according to the migration index # COMMAND ---------- spark.table(table_name) + + # COMMAND ---------- -table_name = "not_migrated.table" # NOT a migrated table according to the migration index +table_name = 'not_migrated.table' + + """.lstrip() - linter = _NotebookLinter.from_source_code(migration_index, mock_path_lookup, source, Language.PYTHON) + source_code = """ +# Databricks notebook source +table_name = 'old.things' + + + +# COMMAND ---------- + +spark.table(table_name) + + +# COMMAND ---------- + +table_name = 'not_migrated.table' + + +""".lstrip() + path = tmp_path / "notebook.py" + path.write_text(source_code) + notebook = Notebook.parse(path, source_code, Language.PYTHON) + linter = NotebookLinter(notebook, mock_path_lookup, LinterContext(migration_index)) first_advice = next(iter(linter.lint())) assert first_advice == Deprecation( code='table-migrated-to-uc-python', message='Table old.things is migrated to brand.new.stuff in Unity Catalog', - start_line=6, + start_line=7, start_col=0, - end_line=6, + end_line=7, end_col=23, ) + linter.apply() + + assert path.read_text() == expected -def test_notebook_linter_lints_not_migrated_table_with_rename(migration_index, mock_path_lookup) -> None: + +def test_notebook_linter_lints_not_migrated_table_with_rename(tmp_path, migration_index, mock_path_lookup) -> None: """The spark.table should read the table defined above the call not below. This is a regression test with the tests above. 
""" - source = """ + source_code = """ # Databricks notebook source +table_name = 'not_migrated.table' + -table_name = "not_migrated.table" # NOT a migrated table according to the migration index # COMMAND ---------- spark.table(table_name) + + # COMMAND ---------- -table_name = "old.things" # Migrated table according to the migration index +table_name = 'old.things' + + """.lstrip() - linter = _NotebookLinter.from_source_code(migration_index, mock_path_lookup, source, Language.PYTHON) + path = tmp_path / "notebook.py" + path.write_text(source_code) + notebook = Notebook.parse(path, source_code, Language.PYTHON) + linter = NotebookLinter(notebook, mock_path_lookup, LinterContext(migration_index)) advices = list(linter.lint()) assert not [advice for advice in advices if advice.code == "table-migrated-to-uc"] + + linter.apply() + + assert path.read_text() == source_code diff --git a/tests/unit/source_code/linters/test_folders.py b/tests/unit/source_code/linters/test_folders.py index 1f6544f367..845c7c33b3 100644 --- a/tests/unit/source_code/linters/test_folders.py +++ b/tests/unit/source_code/linters/test_folders.py @@ -1,3 +1,4 @@ +import shutil from pathlib import Path import pytest @@ -43,8 +44,14 @@ def test_local_code_linter_lint_walks_directory(mock_path_lookup, local_code_lin assert not advices -def test_local_code_linter_apply_walks_directory(mock_path_lookup, local_code_linter) -> None: - advices = list(local_code_linter.apply(Path("simulate-sys-path/"))) +def test_local_code_linter_apply_walks_directory(tmp_path, mock_path_lookup, local_code_linter) -> None: + # Copy the parent-child-context directory to a temporary directory so that fixes are cleaned up after the test + source_path = mock_path_lookup.resolve(Path("simulate-sys-path/")) + destination_path = tmp_path / "simulate-sys-path" + copied_path = shutil.copytree(source_path, destination_path) + + advices = list(local_code_linter.apply(copied_path)) + assert 
len(mock_path_lookup.successfully_resolved_paths) > 10 assert not advices @@ -56,10 +63,22 @@ def test_local_code_linter_lints_child_in_context(mock_path_lookup, local_code_l assert not advices -def test_local_code_linter_applies_child_in_context(mock_path_lookup, local_code_linter) -> None: - expected = {Path("parent-child-context/parent.py"), Path("child.py")} - advices = list(local_code_linter.apply(Path("parent-child-context/parent.py"))) - assert not expected - mock_path_lookup.successfully_resolved_paths +def test_local_code_linter_applies_child_in_context(tmp_path, mock_path_lookup, local_code_linter) -> None: + # Copy the parent-child-context directory to a temporary directory so that fixes are cleaned up after the test + source_path = mock_path_lookup.resolve(Path("parent-child-context/")) + destination_path = tmp_path / "parent-child-context" + copied_path = shutil.copytree(source_path, destination_path) + + parent_path = copied_path / "parent.py" + child_path = copied_path / "child.py" + # 1./2. The full parent and child paths are expected to be resolved to read the notebooks + # 3. The relative child path is expected to be resolved as it is defined in the parent notebook + # 4. 
The parent-child-context directory is resolved at the top of this test + expected = {parent_path, child_path, Path("child.py"), Path("parent-child-context")} + + advices = list(local_code_linter.apply(parent_path)) + + assert mock_path_lookup.successfully_resolved_paths == expected assert not advices diff --git a/tests/unit/source_code/notebooks/test_sources.py b/tests/unit/source_code/notebooks/test_sources.py new file mode 100644 index 0000000000..3e68d8ba15 --- /dev/null +++ b/tests/unit/source_code/notebooks/test_sources.py @@ -0,0 +1,100 @@ +from databricks.sdk.service.workspace import Language + +from databricks.labs.ucx.source_code.notebooks.sources import Notebook + + +def test_notebook_flush_migrated_code(tmp_path) -> None: + """Happy path of flushing and backing up fixed code""" + source_code = "# Databricks notebook source\nprint(1)" + migrated_code = "# Databricks notebook source\nprint(2)" + + path = tmp_path / "test.py" + path.write_text(source_code) + notebook = Notebook.parse(path, source_code, Language.PYTHON) + notebook.cells[0].migrated_code = "print(2)" + + number_of_characters_written = notebook.back_up_original_and_flush_migrated_code() + + assert number_of_characters_written == len(migrated_code) + assert notebook.original_code == source_code + assert notebook.migrated_code == migrated_code + assert path.with_suffix(".py.bak").read_text() == source_code + assert path.read_text() == migrated_code + + +def test_notebook_flush_migrated_code_with_empty_cell_contents(tmp_path) -> None: + """Verify the method handles a cell without contents.""" + source_code = "# Databricks notebook source\nprint(1)" + migrated_code = "# Databricks notebook source\n" + + path = tmp_path / "test.py" + path.write_text(source_code) + notebook = Notebook.parse(path, source_code, Language.PYTHON) + notebook.cells[0].migrated_code = "" + + number_of_characters_written = notebook.back_up_original_and_flush_migrated_code() + + assert number_of_characters_written == 
len(migrated_code)
+    assert notebook.original_code == source_code
+    assert notebook.migrated_code == migrated_code
+    assert path.with_suffix(".py.bak").read_text() == source_code
+    assert path.read_text() == migrated_code
+
+
+def test_notebook_flush_non_migrated_code(tmp_path) -> None:
+    """No-op in case the code is not migrated"""
+    source_code = "# Databricks notebook source\nprint(1)"
+
+    path = tmp_path / "test.py"
+    path.write_text(source_code)
+    notebook = Notebook.parse(path, source_code, Language.PYTHON)
+
+    number_of_characters_written = notebook.back_up_original_and_flush_migrated_code()
+
+    assert number_of_characters_written == len(source_code)
+    assert notebook.original_code == source_code
+    assert notebook.migrated_code == source_code
+    assert not path.with_suffix(".py.bak").is_file()
+    assert path.read_text() == source_code
+
+
+def test_notebook_does_not_flush_migrated_code_when_backup_fails(tmp_path) -> None:
+    """If backup fails the method should not flush the migrated code"""
+
+    class _Notebook(Notebook):
+        def _back_up_path(self) -> None:
+            # Simulate an error, back_up_path handles the error, no return signals an error
+            pass
+
+    source_code = "# Databricks notebook source\nprint(1)"
+    path = tmp_path / "test.py"
+    path.write_text(source_code)
+    notebook = _Notebook.parse(path, source_code, Language.PYTHON)
+    notebook.cells[0].migrated_code = "print(2)"
+
+    number_of_characters_written = notebook.back_up_original_and_flush_migrated_code()
+
+    assert number_of_characters_written is None
+    assert not path.with_suffix(".py.bak").is_file()
+    assert path.read_text() == source_code
+
+
+def test_notebook_flush_migrated_code_with_error(tmp_path) -> None:
+    """If flush fails, the method should revert the backup"""
+
+    class _Notebook(Notebook):
+        def _safe_write_text(self, contents: str) -> None:
+            # Simulate an error, safe_write_text handles the error, no return signals an error
+            _ = contents
+
+    source_code = "# Databricks notebook 
source\nprint(1)" + path = tmp_path / "test.py" + path.write_text(source_code) + notebook = _Notebook.parse(path, source_code, Language.PYTHON) + notebook.cells[0].migrated_code = "print(2)" + + number_of_characters_written = notebook.back_up_original_and_flush_migrated_code() + + assert number_of_characters_written is None + assert not path.with_suffix(".py.bak").is_file() + assert path.read_text() == source_code diff --git a/tests/unit/source_code/test_notebook.py b/tests/unit/source_code/test_notebook.py index db12d80ae0..7fdb296c02 100644 --- a/tests/unit/source_code/test_notebook.py +++ b/tests/unit/source_code/test_notebook.py @@ -101,9 +101,8 @@ def test_notebook_rebuilds_same_code(source: tuple[str, Language, list[str]]) -> assert len(sources) == 1 notebook = Notebook.parse(Path(path), sources[0], source[1]) assert notebook is not None - new_source = notebook.to_migrated_code() # ignore trailing whitespaces - actual_purified = re.sub(r'\s+$', '', new_source, flags=re.MULTILINE) + actual_purified = re.sub(r'\s+$', '', notebook.migrated_code, flags=re.MULTILINE) expected_purified = re.sub(r'\s+$', '', sources[0], flags=re.MULTILINE) assert actual_purified == expected_purified diff --git a/tests/unit/test_github.py b/tests/unit/test_github.py new file mode 100644 index 0000000000..c07fd7831c --- /dev/null +++ b/tests/unit/test_github.py @@ -0,0 +1,44 @@ +import pytest + +from databricks.labs.ucx.github import IssueType, construct_new_issue_url + + +def test_construct_new_issue_url_starts_with_github_url() -> None: + """Test that the URL starts with the GitHub URL.""" + url = construct_new_issue_url(IssueType.FEATURE, "title", "body") + assert url.startswith("https://github.com/databrickslabs/ucx") + + +def test_construct_new_issue_url_with_labels() -> None: + """Test that the URL contains the labels.""" + url = construct_new_issue_url(IssueType.FEATURE, "title", "body", labels={"label1", "label2"}) + assert "label1" in url + assert "label2" in url + + 
+@pytest.mark.parametrize("labels", [None, set(), {"label1", "label2"}])
+def test_construct_new_issue_url_always_has_needs_triage_label(labels: set[str] | None) -> None:
+    """Test that the URL always has the `needs-triage` label."""
+    url = construct_new_issue_url(IssueType.FEATURE, "title", "body", labels=labels)
+    assert "needs-triage" in url
+
+
+def test_construct_new_issue_url_makes_url_safe() -> None:
+    """Test that the URL is properly URL-encoded."""
+    url = construct_new_issue_url(IssueType.FEATURE, "title", "body with spaces")
+    assert "body+with+spaces" in url
+
+
+def test_construct_new_issue_url_advanced() -> None:
+    """Test that the URL is properly constructed with advanced parameters."""
+    expected = (
+        "https://github.com/databrickslabs/ucx/issues/new"
+        "?type=Feature"
+        "&title=Autofix+the+following+Python+code"
+        "&body=%23+Desired+behaviour%0A%0AAutofix+following+Python+code"
+        "%0A%0A%60%60%60+python%0ATODO%3A+Add+relevant+source+code%0A%60%60%60"
+        "&labels=migrate%2Fcode%2Cneeds-triage"
+    )
+    body = "# Desired behaviour\n\nAutofix following Python code\n\n``` python\nTODO: Add relevant source code\n```"
+    url = construct_new_issue_url(IssueType.FEATURE, "Autofix the following Python code", body, labels={"migrate/code"})
+    assert url == expected