databrickslabs · JCZuurmond · Dec 2, 2024 · Nov 29, 2024 · Nov 29, 2024 · Nov 29, 2024
@@ -459,6 +459,20 @@ def read_text(path: Path, size: int = -1) -> str:
         return f.read(size)
 
 
+def safe_read_text(path: Path, size: int = -1) -> str | None:
+    """Safe read a text file by handling reading exceptions, see :func:_read_text.
+
+    Returns
+        str : Content of file
+        None : If error occurred during reading.
+    """
+    try:
+        return read_text(path, size=size)
+    except (FileNotFoundError, UnicodeDecodeError, PermissionError) as e:
+        logger.warning(f"Could not read file: {path}", exc_info=e)
+        return None
+
+
 # duplicated from CellLanguage to prevent cyclic import
 LANGUAGE_COMMENT_PREFIXES = {Language.PYTHON: '#', Language.SCALA: '//', Language.SQL: '--'}
 NOTEBOOK_HEADER = "Databricks notebook source"
@@ -475,9 +489,5 @@ def is_a_notebook(path: Path, content: str | None = None) -> bool:
     magic_header = f"{LANGUAGE_COMMENT_PREFIXES.get(language)} {NOTEBOOK_HEADER}"
     if content is not None:
         return content.startswith(magic_header)
-    try:
-        file_header = read_text(path, size=len(magic_header))
-    except (FileNotFoundError, UnicodeDecodeError, PermissionError):
-        logger.warning(f"Could not read file {path}")
-        return False
+    file_header = safe_read_text(path, size=len(magic_header))
     return file_header == magic_header
@@ -34,7 +34,7 @@
     SourceInfo,
     UsedTable,
     LineageAtom,
-    read_text,
+    safe_read_text,
 )
 from databricks.labs.ucx.source_code.directfs_access import (
     DirectFsAccessCrawler,
@@ -633,7 +633,9 @@ def _process_dependency(
             logger.warning(f"Unknown language for {dependency.path}")
             return
         cell_language = CellLanguage.of_language(language)
-        source = read_text(dependency.path)
+        source = safe_read_text(dependency.path)
+        if not source:
+            return
         if is_a_notebook(dependency.path):
             yield from self._collect_from_notebook(source, cell_language, dependency.path, inherited_tree)
         elif dependency.path.is_file():

@@ -4,7 +4,6 @@
 import locale
 import logging
 from collections.abc import Iterable
-from functools import cached_property
 from pathlib import Path
 from typing import cast
 
@@ -14,14 +13,15 @@
 
 from databricks.labs.ucx.hive_metastore.table_migration_status import TableMigrationIndex
 from databricks.labs.ucx.source_code.base import (
-    Advice,
-    Failure,
-    Linter,
-    CurrentSessionState,
-    Advisory,
     file_language,
     is_a_notebook,
+    safe_read_text,
     read_text,
+    Advice,
+    Advisory,
+    CurrentSessionState,
+    Failure,
+    Linter,
 )
 
 from databricks.labs.ucx.source_code.graph import (
@@ -297,7 +297,9 @@ def _load_source_from_path(self, path: Path | None) -> Notebook | None:
         if language is not Language.PYTHON:
             logger.warning(f"Unsupported notebook language: {language}")
             return None
-        source = read_text(resolved)
+        source = safe_read_text(resolved)
+        if not source:
+            return None
         return Notebook.parse(path, source, language)
 
     def _linter(self, language: Language) -> Linter:
@@ -315,7 +317,9 @@ def _load_source_from_run_cell(self, run_cell: RunCell) -> None:
         language = file_language(resolved)
         if language is not Language.PYTHON:
             return
-        source = read_text(resolved)
+        source = safe_read_text(resolved)
+        if not source:
+            return
         notebook = Notebook.parse(path, source, language)
         for cell in notebook.cells:
             if isinstance(cell, RunCell):
@@ -390,16 +394,11 @@ def __init__(
         self._inherited_tree = inherited_tree
         self._content = content
 
-    @cached_property
-    def _source_code(self) -> str:
-        if self._content is None:
-            self._content = read_text(self._path)
-        return self._content
-
     def lint(self) -> Iterable[Advice]:
         encoding = locale.getpreferredencoding(False)
         try:
-            is_notebook = self._is_notebook()
+            # Not using `safe_read_text` here to surface read errors
+            self._content = self._content or read_text(self._path)
         except FileNotFoundError:
             failure_message = f"File not found: {self._path}"
             yield Failure("file-not-found", failure_message, 0, 0, 1, 1)
@@ -413,19 +412,21 @@ def lint(self) -> Iterable[Advice]:
             yield Failure("file-permission", failure_message, 0, 0, 1, 1)
             return
 
-        if is_notebook:
+        if self._is_notebook():
             yield from self._lint_notebook()
         else:
             yield from self._lint_file()
 
     def _is_notebook(self) -> bool:
+        assert self._content is not None, "Content should be read from path before calling this method"
         # pre-check to avoid loading unsupported content
         language = file_language(self._path)
         if not language:
             return False
-        return is_a_notebook(self._path, self._source_code)
+        return is_a_notebook(self._path, self._content)
 
     def _lint_file(self) -> Iterable[Advice]:
+        assert self._content is not None, "Content should be read from path before calling this method"
         language = file_language(self._path)
         if not language:
             suffix = self._path.suffix.lower()
@@ -440,17 +441,18 @@ def _lint_file(self) -> Iterable[Advice]:
                 linter = self._ctx.linter(language)
                 if self._inherited_tree is not None and isinstance(linter, PythonSequentialLinter):
                     linter.append_tree(self._inherited_tree)
-                yield from linter.lint(self._source_code)
+                yield from linter.lint(self._content)
             except ValueError as err:
                 failure_message = f"Error while parsing content of {self._path.as_posix()}: {err}"
                 yield Failure("unsupported-content", failure_message, 0, 0, 1, 1)
 
     def _lint_notebook(self) -> Iterable[Advice]:
+        assert self._content is not None, "Content should be read from path before calling this method"
         language = file_language(self._path)
         if not language:
             yield Failure("unknown-language", f"Cannot detect language for {self._path}", 0, 0, 1, 1)
             return
-        notebook = Notebook.parse(self._path, self._source_code, language)
+        notebook = Notebook.parse(self._path, self._content, language)
         notebook_linter = NotebookLinter(
             self._ctx,
             self._path_lookup,

@@ -0,0 +1,46 @@
+import codecs
+import logging
+from pathlib import Path
+from unittest.mock import create_autospec
+
+import pytest
+
+from databricks.labs.ucx.source_code.base import read_text, safe_read_text
+
+
+@pytest.mark.parametrize(
+    "exception",
+    [
+        FileNotFoundError(),
+        UnicodeDecodeError("utf-8", b"\x80\x81\x82", 0, 1, "invalid start byte"),
+        PermissionError(),
+    ],
+)
+def test_safe_read_text_handles_and_warns_read_errors(caplog, exception: Exception) -> None:
+    path = create_autospec(Path)
+    path.open.side_effect = exception
+
+    with caplog.at_level(logging.WARNING, logger="databricks.labs.ucx.account.aggregate"):
+        text = safe_read_text(path)
+
+    assert not text
+    assert f"Could not read file: {path}" in caplog.messages
+
+
+@pytest.mark.parametrize(
+    "bom, encoding",
+    [
+        (codecs.BOM_UTF8, "utf-8"),
+        (codecs.BOM_UTF16_LE, "utf-16-le"),
+        (codecs.BOM_UTF16_BE, "utf-16-be"),
+        (codecs.BOM_UTF32_LE, "utf-32-le"),
+        (codecs.BOM_UTF32_BE, "utf-32-be"),
+    ],
+)
+def test_read__encoded_file_with_bom(tmp_path, bom, encoding) -> None:
+    path = tmp_path / "file.py"
+    path.write_bytes(bom + "a = 12".encode(encoding))
+
+    text = read_text(path)
+
+    assert text == "a = 12"
@@ -19,26 +19,6 @@ def test_file_linter_lints_supported_language(path, content, migration_index, mo
     assert not advices
 
 
-@pytest.mark.parametrize("path", ["xyz.scala", "xyz.r", "xyz.sh"])
-def test_file_linter_lints_not_yet_supported_language(path, migration_index, mock_path_lookup) -> None:
-    linter = FileLinter(LinterContext(migration_index), mock_path_lookup, CurrentSessionState(), Path(path), None, "")
-    advices = list(linter.lint())
-    assert [advice.code for advice in advices] == ["unsupported-language"]
-
-
-class FriendFileLinter(FileLinter):
-
-    def source_code(self):
-        return self._source_code
-
-
-def test_checks_encoding_of_pseudo_file(migration_index, mock_path_lookup) -> None:
-    linter = FriendFileLinter(
-        LinterContext(migration_index), mock_path_lookup, CurrentSessionState(), Path("whatever"), None, "a=b"
-    )
-    assert linter.source_code() == "a=b"
-
-
 @pytest.mark.parametrize(
     "bom, encoding",
     [
@@ -49,11 +29,25 @@ def test_checks_encoding_of_pseudo_file(migration_index, mock_path_lookup) -> No
         (codecs.BOM_UTF32_BE, "utf-32-be"),
     ],
 )
-def test_checks_encoding_of_file_with_bom(migration_index, bom, encoding, tmp_path, mock_path_lookup) -> None:
+def test_file_linter_lints_supported_language_encoded_file_with_bom(
+    tmp_path, migration_index, mock_path_lookup, bom, encoding
+) -> None:
     path = tmp_path / "file.py"
     path.write_bytes(bom + "a = 12".encode(encoding))
-    linter = FriendFileLinter(LinterContext(migration_index), mock_path_lookup, CurrentSessionState(), path)
-    assert linter.source_code() == "a = 12"
+    linter = FileLinter(LinterContext(migration_index), mock_path_lookup, CurrentSessionState(), path, None)
+
+    advices = list(linter.lint())
+
+    assert not advices
+
+
+@pytest.mark.parametrize("path", ["xyz.scala", "xyz.r", "xyz.sh"])
+def test_file_linter_lints_not_yet_supported_language(tmp_path, path, migration_index, mock_path_lookup) -> None:
+    path = tmp_path / path
+    path.touch()
+    linter = FileLinter(LinterContext(migration_index), mock_path_lookup, CurrentSessionState(), Path(path), None, "")
+    advices = list(linter.lint())
+    assert [advice.code for advice in advices] == ["unsupported-language"]
 
 
 @pytest.mark.parametrize(
@@ -75,7 +69,9 @@ def test_checks_encoding_of_file_with_bom(migration_index, bom, encoding, tmp_pa
         ".DS_Store",  # on MacOS
     ],
 )
-def test_file_linter_lints_ignorable_language(path, migration_index, mock_path_lookup) -> None:
+def test_file_linter_lints_ignorable_language(tmp_path, path, migration_index, mock_path_lookup) -> None:
+    path = tmp_path / path
+    path.touch()
     linter = FileLinter(LinterContext(migration_index), mock_path_lookup, CurrentSessionState(), Path(path), None)
     advices = list(linter.lint())
     assert not advices