Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make MaybeTree the main Python AST entrypoint for constructing the syntax tree #3550

Merged
merged 22 commits into from
Jan 28, 2025
Merged
Show file tree
Hide file tree
Changes from 20 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/databricks/labs/ucx/source_code/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
)
from databricks.labs.ucx.source_code.linters.context import LinterContext
from databricks.labs.ucx.source_code.notebooks.cells import CellLanguage
from databricks.labs.ucx.source_code.python.python_ast import Tree, PythonSequentialLinter
from databricks.labs.ucx.source_code.python.python_ast import MaybeTree, Tree, PythonSequentialLinter
from databricks.labs.ucx.source_code.notebooks.sources import FileLinter, Notebook
from databricks.labs.ucx.source_code.path_lookup import PathLookup
from databricks.labs.ucx.source_code.used_table import UsedTablesCrawler
Expand Down Expand Up @@ -657,7 +657,7 @@ def _collect_from_notebook(
if cell.language is CellLanguage.PYTHON:
if inherited_tree is None:
inherited_tree = Tree.new_module()
maybe_tree = Tree.maybe_normalized_parse(cell.original_code)
maybe_tree = MaybeTree.from_source_code(cell.original_code)
if maybe_tree.failure:
logger.warning(maybe_tree.failure.message)
continue
Expand Down
11 changes: 6 additions & 5 deletions src/databricks/labs/ucx/source_code/linters/pyspark.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,13 @@
from databricks.labs.ucx.source_code.python.python_infer import InferredValue
from databricks.labs.ucx.source_code.linters.from_table import FromTableSqlLinter
from databricks.labs.ucx.source_code.python.python_ast import (
Tree,
TreeHelper,
DfsaPyCollector,
MatchingVisitor,
MaybeTree,
PythonLinter,
TablePyCollector,
DfsaPyCollector,
Tree,
TreeHelper,
)

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -412,7 +413,7 @@ def lint_tree(self, tree: Tree) -> Iterable[Advice]:
yield from matcher.lint(self._from_table, self._index, self._session_state, node)

def apply(self, code: str) -> str:
maybe_tree = Tree.maybe_parse(code)
maybe_tree = MaybeTree.from_source_code(code)
if not maybe_tree.tree:
assert maybe_tree.failure is not None
logger.warning(maybe_tree.failure.message)
Expand Down Expand Up @@ -486,7 +487,7 @@ def lint_tree(self, tree: Tree) -> Iterable[Advice]:
def apply(self, code: str) -> str:
if not self._sql_fixer:
return code
maybe_tree = Tree.maybe_normalized_parse(code)
maybe_tree = MaybeTree.from_source_code(code)
if maybe_tree.failure:
logger.warning(maybe_tree.failure.message)
return code
Expand Down
4 changes: 2 additions & 2 deletions src/databricks/labs/ucx/source_code/notebooks/sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
UnresolvedPath,
)
from databricks.labs.ucx.source_code.notebooks.magic import MagicLine
from databricks.labs.ucx.source_code.python.python_ast import Tree, NodeBase, PythonSequentialLinter
from databricks.labs.ucx.source_code.python.python_ast import MaybeTree, NodeBase, PythonSequentialLinter, Tree
from databricks.labs.ucx.source_code.notebooks.cells import (
CellLanguage,
Cell,
Expand Down Expand Up @@ -196,7 +196,7 @@ def _load_tree_from_notebook(self, notebook: Notebook, register_trees: bool) ->
continue

def _load_tree_from_python_cell(self, python_cell: PythonCell, register_trees: bool) -> Iterable[Advice]:
maybe_tree = Tree.maybe_normalized_parse(python_cell.original_code)
maybe_tree = MaybeTree.from_source_code(python_cell.original_code)
if maybe_tree.failure:
yield maybe_tree.failure
tree = maybe_tree.tree
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
UnresolvedPath,
)
from databricks.labs.ucx.source_code.notebooks.magic import MagicLine
from databricks.labs.ucx.source_code.python.python_ast import Tree, NodeBase
from databricks.labs.ucx.source_code.python.python_ast import MaybeTree, Tree, NodeBase

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -93,7 +93,7 @@ def build_inherited_context(self, child_path: Path) -> InheritedContext:

def _parse_and_extract_nodes(self) -> tuple[Tree | None, list[NodeBase], Iterable[DependencyProblem]]:
problems: list[DependencyProblem] = []
maybe_tree = Tree.maybe_normalized_parse(self._python_code)
maybe_tree = MaybeTree.from_source_code(self._python_code)
if maybe_tree.failure:
return None, [], [DependencyProblem(maybe_tree.failure.code, maybe_tree.failure.message)]
assert maybe_tree.tree is not None
Expand Down
130 changes: 65 additions & 65 deletions src/databricks/labs/ucx/source_code/python/python_ast.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,88 +48,85 @@

@dataclass(frozen=True)
class MaybeTree:
tree: Tree | None
failure: Failure | None

def walk(self) -> Iterable[NodeNG]:
# mainly a helper method for unit testing
if self.tree is None: # no cov
assert self.failure is not None
logger.warning(self.failure.message)
return []
return self.tree.walk()
"""A :class:`Tree` or a :class:`Failure`.

The `MaybeTree` is designed to either contain a `Tree` OR a `Failure`,
never both or neither. Typically, a `Tree` is constructed using the
`MaybeTree` class method(s) that yields a `Failure` if the `Tree` could
NOT be constructed, otherwise it yields the `Tree`, resulting in code that
looks like:
JCZuurmond marked this conversation as resolved.
Show resolved Hide resolved

``` python
maybe_tree = Tree.from_<class method>(<arguments>)
JCZuurmond marked this conversation as resolved.
Show resolved Hide resolved
if maybe_tree.failure:
# Handle failure and return early
assert maybe_tree.tree, "Tree should be not-None when Failure is None."
# Use tree
```
"""

def first_statement(self) -> NodeNG | None:
# mainly a helper method for unit testing
if self.tree is None: # no cov
assert self.failure is not None
logger.warning(self.failure.message)
return None
return self.tree.first_statement()
tree: Tree | None
"""The UCX Python abstract syntax tree object"""

failure: Failure | None
"""The failure during constructing the tree"""
JCZuurmond marked this conversation as resolved.
Show resolved Hide resolved

class Tree: # pylint: disable=too-many-public-methods
@classmethod
def from_source_code(cls, code: str) -> MaybeTree:
"""Normalize and parse the source code to get a `Tree` or parse `Failure`."""
code = cls._normalize(code)
return cls._maybe_parse(code)

@classmethod
def maybe_parse(cls, code: str) -> MaybeTree:
def _maybe_parse(cls, code: str) -> MaybeTree:
try:
root = parse(code)
tree = Tree(root)
return MaybeTree(tree, None)
return cls(tree, None)
except Exception as e: # pylint: disable=broad-exception-caught
# see https://github.com/databrickslabs/ucx/issues/2976
return cls._definitely_failure(code, e)
failure = cls._failure_from_exception(code, e)
return cls(None, failure)

@staticmethod
def _definitely_failure(source_code: str, e: Exception) -> MaybeTree:
def _failure_from_exception(source_code: str, e: Exception) -> Failure:
if isinstance(e, AstroidSyntaxError) and isinstance(e.error, SyntaxError):
return MaybeTree(
None,
Failure(
code="python-parse-error",
message=f"Failed to parse code due to invalid syntax: {source_code}",
# Lines and columns are both 0-based: the first line is line 0.
start_line=(e.error.lineno or 1) - 1,
start_col=(e.error.offset or 1) - 1,
end_line=(e.error.end_lineno or 2) - 1,
end_col=(e.error.end_offset or 2) - 1,
),
return Failure(
code="python-parse-error",
message=f"Failed to parse code due to invalid syntax: {source_code}",
# Lines and columns are both 0-based: the first line is line 0.
start_line=(e.error.lineno or 1) - 1,
start_col=(e.error.offset or 1) - 1,
end_line=(e.error.end_lineno or 2) - 1,
end_col=(e.error.end_offset or 2) - 1,
)
new_issue_url = (
"https://github.com/databrickslabs/ucx/issues/new?title=[BUG]:+Python+parse+error"
"&labels=migrate/code,needs-triage,bug"
"&body=%23+Current+behaviour%0A%0ACannot+parse+the+following+Python+code"
f"%0A%0A%60%60%60+python%0A{source_code}%0A%60%60%60"
)
return MaybeTree(
None,
Failure(
code="python-parse-error",
message=(
f"Please report the following error as an issue on UCX GitHub: {new_issue_url}\n"
f"Caught error `{type(e)} : {e}` while parsing code: {source_code}"
),
# Lines and columns are both 0-based: the first line is line 0.
start_line=0,
start_col=0,
end_line=1,
end_col=1,
return Failure(
code="python-parse-error",
message=(
f"Please report the following error as an issue on UCX GitHub: {new_issue_url}\n"
f"Caught error `{type(e)} : {e}` while parsing code: {source_code}"
),
# Lines and columns are both 0-based: the first line is line 0.
start_line=0,
start_col=0,
end_line=1,
end_col=1,
JCZuurmond marked this conversation as resolved.
Show resolved Hide resolved
)

@classmethod
def maybe_normalized_parse(cls, code: str) -> MaybeTree:
code = cls.normalize(code)
return cls.maybe_parse(code)

@classmethod
def normalize(cls, code: str) -> str:
def _normalize(cls, code: str) -> str:
code = cls._normalize_indents(code)
code = cls._convert_magic_lines_to_magic_commands(code)
return code

@classmethod
def _normalize_indents(cls, python_code: str) -> str:
@staticmethod
def _normalize_indents(python_code: str) -> str:
lines = python_code.split("\n")
for line in lines:
# skip leading ws and comments
Expand All @@ -147,8 +144,8 @@ def _normalize_indents(cls, python_code: str) -> str:
return "\n".join(lines)
return python_code

@classmethod
def _convert_magic_lines_to_magic_commands(cls, python_code: str) -> str:
@staticmethod
def _convert_magic_lines_to_magic_commands(python_code: str) -> str:
lines = python_code.split("\n")
magic_markers = {"%", "!"}
in_multi_line_comment = False
Expand All @@ -164,10 +161,14 @@ def _convert_magic_lines_to_magic_commands(cls, python_code: str) -> str:
in_multi_line_comment = not in_multi_line_comment
return "\n".join(lines)


class Tree:
"""The UCX Python abstract syntax tree object"""

@classmethod
def new_module(cls) -> Tree:
node = Module("root")
return Tree(node)
return cls(node)

def __init__(self, node: NodeNG):
self._node: NodeNG = node
Expand All @@ -186,11 +187,10 @@ def root(self) -> NodeNG:
def walk(self) -> Iterable[NodeNG]:
yield from self._walk(self._node)

@classmethod
def _walk(cls, node: NodeNG) -> Iterable[NodeNG]:
def _walk(self, node: NodeNG) -> Iterable[NodeNG]:
yield node
for child in node.get_children():
yield from cls._walk(child)
yield from self._walk(child)
Comment on lines -190 to +199
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this can remain a class method?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, maybe, it does not make sense to me though. This is a protected method thus it should not be reached from outside the class, thus losing the benefit of calling the method without initializing the class, like Tree._walk. Furthermore, it is only called from the Tree.walk method, which is a regular method already, so why should this not be a regular method?


def locate(self, node_type: type[T], match_nodes: list[tuple[str, type]]) -> list[T]:
visitor = MatchingVisitor(node_type, match_nodes)
Expand Down Expand Up @@ -626,7 +626,7 @@ def __repr__(self):
class PythonLinter(Linter):

def lint(self, code: str) -> Iterable[Advice]:
maybe_tree = Tree.maybe_normalized_parse(code)
maybe_tree = MaybeTree.from_source_code(code)
if maybe_tree.failure:
yield maybe_tree.failure
return
Expand All @@ -640,7 +640,7 @@ def lint_tree(self, tree: Tree) -> Iterable[Advice]: ...
class TablePyCollector(TableCollector, ABC):

def collect_tables(self, source_code: str) -> Iterable[UsedTable]:
maybe_tree = Tree.maybe_normalized_parse(source_code)
maybe_tree = MaybeTree.from_source_code(source_code)
if maybe_tree.failure:
logger.warning(maybe_tree.failure.message)
return
Expand All @@ -655,7 +655,7 @@ def collect_tables_from_tree(self, tree: Tree) -> Iterable[UsedTableNode]: ...
class DfsaPyCollector(DfsaCollector, ABC):

def collect_dfsas(self, source_code: str) -> Iterable[DirectFsAccess]:
maybe_tree = Tree.maybe_normalized_parse(source_code)
maybe_tree = MaybeTree.from_source_code(source_code)
if maybe_tree.failure:
logger.warning(maybe_tree.failure.message)
return
Expand Down Expand Up @@ -693,7 +693,7 @@ def lint_tree(self, tree: Tree) -> Iterable[Advice]:
yield from linter.lint_tree(tree)

def _parse_and_append(self, code: str) -> MaybeTree:
maybe_tree = Tree.maybe_normalized_parse(code)
maybe_tree = MaybeTree.from_source_code(code)
if maybe_tree.failure:
return maybe_tree
assert maybe_tree.tree is not None
Expand All @@ -711,7 +711,7 @@ def append_globals(self, globs: dict) -> None:

def process_child_cell(self, code: str) -> None:
this_tree = self._make_tree()
maybe_tree = Tree.maybe_normalized_parse(code)
maybe_tree = MaybeTree.from_source_code(code)
if maybe_tree.failure:
# TODO: bubble up this error
logger.warning(maybe_tree.failure.message)
Expand Down
4 changes: 2 additions & 2 deletions tests/integration/source_code/message_codes.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from databricks.labs.blueprint.wheels import ProductInfo

from databricks.labs.ucx.source_code.base import Advice
from databricks.labs.ucx.source_code.python.python_ast import Tree
from databricks.labs.ucx.source_code.python.python_ast import MaybeTree


def main():
Expand All @@ -11,7 +11,7 @@ def main():
product_info = ProductInfo.from_class(Advice)
source_code = product_info.version_file().parent
for file in source_code.glob("**/*.py"):
maybe_tree = Tree.maybe_parse(file.read_text())
maybe_tree = MaybeTree.from_source_code(file.read_text())
if not maybe_tree.tree:
continue
tree = maybe_tree.tree
Expand Down
Loading
Loading