Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Safe read files in more places #3394

Merged
merged 8 commits into from
Dec 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 15 additions & 5 deletions src/databricks/labs/ucx/source_code/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -459,6 +459,20 @@ def read_text(path: Path, size: int = -1) -> str:
return f.read(size)


def safe_read_text(path: Path, size: int = -1) -> str | None:
"""Safe read a text file by handling reading exceptions, see :func:_read_text.

Returns
str : Content of file
None : If error occurred during reading.
"""
try:
return read_text(path, size=size)
except (FileNotFoundError, UnicodeDecodeError, PermissionError) as e:
logger.warning(f"Could not read file: {path}", exc_info=e)
return None


# duplicated from CellLanguage to prevent cyclic import
LANGUAGE_COMMENT_PREFIXES = {Language.PYTHON: '#', Language.SCALA: '//', Language.SQL: '--'}
NOTEBOOK_HEADER = "Databricks notebook source"
Expand All @@ -475,9 +489,5 @@ def is_a_notebook(path: Path, content: str | None = None) -> bool:
magic_header = f"{LANGUAGE_COMMENT_PREFIXES.get(language)} {NOTEBOOK_HEADER}"
if content is not None:
return content.startswith(magic_header)
try:
file_header = read_text(path, size=len(magic_header))
except (FileNotFoundError, UnicodeDecodeError, PermissionError):
logger.warning(f"Could not read file {path}")
return False
file_header = safe_read_text(path, size=len(magic_header))
return file_header == magic_header
6 changes: 4 additions & 2 deletions src/databricks/labs/ucx/source_code/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
SourceInfo,
UsedTable,
LineageAtom,
read_text,
safe_read_text,
)
from databricks.labs.ucx.source_code.directfs_access import (
DirectFsAccessCrawler,
Expand Down Expand Up @@ -633,7 +633,9 @@ def _process_dependency(
logger.warning(f"Unknown language for {dependency.path}")
return
cell_language = CellLanguage.of_language(language)
source = read_text(dependency.path)
source = safe_read_text(dependency.path)
if not source:
return
if is_a_notebook(dependency.path):
yield from self._collect_from_notebook(source, cell_language, dependency.path, inherited_tree)
elif dependency.path.is_file():
Expand Down
40 changes: 21 additions & 19 deletions src/databricks/labs/ucx/source_code/notebooks/sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import locale
import logging
from collections.abc import Iterable
from functools import cached_property
from pathlib import Path
from typing import cast

Expand All @@ -14,14 +13,15 @@

from databricks.labs.ucx.hive_metastore.table_migration_status import TableMigrationIndex
from databricks.labs.ucx.source_code.base import (
Advice,
Failure,
Linter,
CurrentSessionState,
Advisory,
file_language,
is_a_notebook,
safe_read_text,
read_text,
Advice,
Advisory,
CurrentSessionState,
Failure,
Linter,
)

from databricks.labs.ucx.source_code.graph import (
Expand Down Expand Up @@ -297,7 +297,9 @@ def _load_source_from_path(self, path: Path | None) -> Notebook | None:
if language is not Language.PYTHON:
logger.warning(f"Unsupported notebook language: {language}")
return None
source = read_text(resolved)
source = safe_read_text(resolved)
if not source:
return None
return Notebook.parse(path, source, language)

def _linter(self, language: Language) -> Linter:
Expand All @@ -315,7 +317,9 @@ def _load_source_from_run_cell(self, run_cell: RunCell) -> None:
language = file_language(resolved)
if language is not Language.PYTHON:
return
source = read_text(resolved)
source = safe_read_text(resolved)
if not source:
return
notebook = Notebook.parse(path, source, language)
for cell in notebook.cells:
if isinstance(cell, RunCell):
Expand Down Expand Up @@ -390,16 +394,11 @@ def __init__(
self._inherited_tree = inherited_tree
self._content = content

@cached_property
def _source_code(self) -> str:
if self._content is None:
self._content = read_text(self._path)
return self._content

def lint(self) -> Iterable[Advice]:
encoding = locale.getpreferredencoding(False)
try:
is_notebook = self._is_notebook()
# Not using `safe_read_text` here to surface read errors
self._content = self._content or read_text(self._path)
except FileNotFoundError:
failure_message = f"File not found: {self._path}"
yield Failure("file-not-found", failure_message, 0, 0, 1, 1)
Expand All @@ -413,19 +412,21 @@ def lint(self) -> Iterable[Advice]:
yield Failure("file-permission", failure_message, 0, 0, 1, 1)
return

if is_notebook:
if self._is_notebook():
yield from self._lint_notebook()
else:
yield from self._lint_file()

def _is_notebook(self) -> bool:
assert self._content is not None, "Content should be read from path before calling this method"
# pre-check to avoid loading unsupported content
language = file_language(self._path)
if not language:
return False
return is_a_notebook(self._path, self._source_code)
return is_a_notebook(self._path, self._content)

def _lint_file(self) -> Iterable[Advice]:
assert self._content is not None, "Content should be read from path before calling this method"
language = file_language(self._path)
if not language:
suffix = self._path.suffix.lower()
Expand All @@ -440,17 +441,18 @@ def _lint_file(self) -> Iterable[Advice]:
linter = self._ctx.linter(language)
if self._inherited_tree is not None and isinstance(linter, PythonSequentialLinter):
linter.append_tree(self._inherited_tree)
yield from linter.lint(self._source_code)
yield from linter.lint(self._content)
except ValueError as err:
failure_message = f"Error while parsing content of {self._path.as_posix()}: {err}"
yield Failure("unsupported-content", failure_message, 0, 0, 1, 1)

def _lint_notebook(self) -> Iterable[Advice]:
assert self._content is not None, "Content should be read from path before calling this method"
language = file_language(self._path)
if not language:
yield Failure("unknown-language", f"Cannot detect language for {self._path}", 0, 0, 1, 1)
return
notebook = Notebook.parse(self._path, self._source_code, language)
notebook = Notebook.parse(self._path, self._content, language)
notebook_linter = NotebookLinter(
self._ctx,
self._path_lookup,
Expand Down
46 changes: 46 additions & 0 deletions tests/integration/source_code/test_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import codecs
import logging
from pathlib import Path
from unittest.mock import create_autospec

import pytest

from databricks.labs.ucx.source_code.base import read_text, safe_read_text


@pytest.mark.parametrize(
"exception",
[
FileNotFoundError(),
UnicodeDecodeError("utf-8", b"\x80\x81\x82", 0, 1, "invalid start byte"),
PermissionError(),
],
)
def test_safe_read_text_handles_and_warns_read_errors(caplog, exception: Exception) -> None:
path = create_autospec(Path)
path.open.side_effect = exception

with caplog.at_level(logging.WARNING, logger="databricks.labs.ucx.account.aggregate"):
text = safe_read_text(path)

assert not text
assert f"Could not read file: {path}" in caplog.messages


@pytest.mark.parametrize(
"bom, encoding",
[
(codecs.BOM_UTF8, "utf-8"),
(codecs.BOM_UTF16_LE, "utf-16-le"),
(codecs.BOM_UTF16_BE, "utf-16-be"),
(codecs.BOM_UTF32_LE, "utf-32-le"),
(codecs.BOM_UTF32_BE, "utf-32-be"),
],
)
def test_read__encoded_file_with_bom(tmp_path, bom, encoding) -> None:
path = tmp_path / "file.py"
path.write_bytes(bom + "a = 12".encode(encoding))

text = read_text(path)

assert text == "a = 12"
44 changes: 20 additions & 24 deletions tests/unit/source_code/notebooks/test_sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,26 +19,6 @@ def test_file_linter_lints_supported_language(path, content, migration_index, mo
assert not advices


@pytest.mark.parametrize("path", ["xyz.scala", "xyz.r", "xyz.sh"])
def test_file_linter_lints_not_yet_supported_language(path, migration_index, mock_path_lookup) -> None:
linter = FileLinter(LinterContext(migration_index), mock_path_lookup, CurrentSessionState(), Path(path), None, "")
advices = list(linter.lint())
assert [advice.code for advice in advices] == ["unsupported-language"]


class FriendFileLinter(FileLinter):

def source_code(self):
return self._source_code


def test_checks_encoding_of_pseudo_file(migration_index, mock_path_lookup) -> None:
linter = FriendFileLinter(
LinterContext(migration_index), mock_path_lookup, CurrentSessionState(), Path("whatever"), None, "a=b"
)
assert linter.source_code() == "a=b"


@pytest.mark.parametrize(
"bom, encoding",
[
Expand All @@ -49,11 +29,25 @@ def test_checks_encoding_of_pseudo_file(migration_index, mock_path_lookup) -> No
(codecs.BOM_UTF32_BE, "utf-32-be"),
],
)
def test_checks_encoding_of_file_with_bom(migration_index, bom, encoding, tmp_path, mock_path_lookup) -> None:
def test_file_linter_lints_supported_language_encoded_file_with_bom(
tmp_path, migration_index, mock_path_lookup, bom, encoding
) -> None:
path = tmp_path / "file.py"
path.write_bytes(bom + "a = 12".encode(encoding))
linter = FriendFileLinter(LinterContext(migration_index), mock_path_lookup, CurrentSessionState(), path)
assert linter.source_code() == "a = 12"
linter = FileLinter(LinterContext(migration_index), mock_path_lookup, CurrentSessionState(), path, None)

advices = list(linter.lint())

assert not advices


@pytest.mark.parametrize("path", ["xyz.scala", "xyz.r", "xyz.sh"])
def test_file_linter_lints_not_yet_supported_language(tmp_path, path, migration_index, mock_path_lookup) -> None:
path = tmp_path / path
path.touch()
linter = FileLinter(LinterContext(migration_index), mock_path_lookup, CurrentSessionState(), Path(path), None, "")
advices = list(linter.lint())
assert [advice.code for advice in advices] == ["unsupported-language"]


@pytest.mark.parametrize(
Expand All @@ -75,7 +69,9 @@ def test_checks_encoding_of_file_with_bom(migration_index, bom, encoding, tmp_pa
".DS_Store", # on MacOS
],
)
def test_file_linter_lints_ignorable_language(path, migration_index, mock_path_lookup) -> None:
def test_file_linter_lints_ignorable_language(tmp_path, path, migration_index, mock_path_lookup) -> None:
path = tmp_path / path
path.touch()
linter = FileLinter(LinterContext(migration_index), mock_path_lookup, CurrentSessionState(), Path(path), None)
advices = list(linter.lint())
assert not advices
Expand Down
Loading