From 3376bd3c1e2fdb76f33647f210d1000f94a36b1b Mon Sep 17 00:00:00 2001
From: Ned Batchelder <ned@nedbatchelder.com>
Date: Sun, 7 Apr 2024 09:44:51 -0400
Subject: [PATCH] refactor: clean up incremental data and logic

---
 coverage/html.py   | 198 ++++++++++++++++++++++++---------------------
 pyproject.toml     |   1 +
 tests/test_html.py |   2 +-
 3 files changed, 110 insertions(+), 91 deletions(-)

diff --git a/coverage/html.py b/coverage/html.py
index 2bab61bae..a7c464f09 100644
--- a/coverage/html.py
+++ b/coverage/html.py
@@ -15,8 +15,8 @@
 import shutil
 import string
 
-from dataclasses import dataclass
-from typing import Any, Iterable, TYPE_CHECKING, TypedDict, cast
+from dataclasses import dataclass, field
+from typing import Any, Iterable, TYPE_CHECKING, cast
 
 import coverage
 from coverage.data import CoverageData, add_data_to_hash
@@ -41,21 +41,6 @@
 os = isolate_module(os)
 
 
-class IndexInfoDict(TypedDict):
-    """Information for each file, to render the index file."""
-    # For in-memory use, we have Numbers.  For serialization, we write a list
-    # of ints.  Two fields keeps the type-checker happier.
-    nums: Numbers | None
-    numlist: list[int]
-    html_filename: str
-    relative_filename: str
-
-class FileInfoDict(TypedDict):
-    """Summary of the information from last rendering, to avoid duplicate work."""
-    hash: str
-    index: IndexInfoDict
-
-
 def data_filename(fname: str) -> str:
     """Return the path to an "htmlfiles" data file of ours.
     """
@@ -244,7 +229,7 @@ def __init__(self, cov: Coverage) -> None:
         self.data = self.coverage.get_data()
         self.has_arcs = self.data.has_arcs()
 
-        self.file_summaries: list[IndexInfoDict] = []
+        self.file_summaries: list[IndexInfo] = []
         self.all_files_nums: list[Numbers] = []
         self.incr = IncrementalChecker(self.directory)
         self.datagen = HtmlDataGeneration(self.coverage)
@@ -380,6 +365,7 @@ def write_html_file(self, ftr: FileToReport) -> None:
 
         # Find out if the file on disk is already correct.
         if self.incr.can_skip_file(self.data, ftr.fr, ftr.rootname):
+            print("LOOK:",self.incr.index_info(ftr.rootname))
             self.file_summaries.append(self.incr.index_info(ftr.rootname))
             return
 
@@ -460,12 +446,11 @@ def write_html_file(self, ftr: FileToReport) -> None:
         write_html(html_path, html)
 
         # Save this file's information for the index file.
-        index_info: IndexInfoDict = {
-            "nums": ftr.analysis.numbers,
-            "numlist": [],
-            "html_filename": ftr.html_filename,
-            "relative_filename": ftr.fr.relative_filename(),
-        }
+        index_info = IndexInfo(
+            nums = ftr.analysis.numbers,
+            html_filename = ftr.html_filename,
+            relative_filename = ftr.fr.relative_filename(),
+        )
         self.file_summaries.append(index_info)
         self.incr.set_index_info(ftr.rootname, index_info)
 
@@ -499,108 +484,144 @@ def index_file(self, first_html: str, final_html: str) -> None:
         self.incr.write()
 
 
+@dataclass
+class IndexInfo:
+    """Information for each file, to render the index file."""
+    html_filename: str = ""
+    relative_filename: str = ""
+    nums: Numbers = field(default_factory=Numbers)
+
+
+@dataclass
+class FileInfo:
+    """Summary of the information from last rendering, to avoid duplicate work."""
+    hash: str = ""
+    index: IndexInfo = field(default_factory=IndexInfo)
+
+
 class IncrementalChecker:
-    """Logic and data to support incremental reporting."""
+    """Logic and data to support incremental reporting.
+
+    When generating an HTML report, often only a few of the source files have
+    changed since the last time we made the HTML report.  This means previously
+    created HTML pages can be reused without generating them again, speeding
+    the command.
+
+    This class manages a JSON data file that captures enough information to
+    know whether an HTML page for a .py file needs to be regenerated or not.
+    The data file also needs to store all the information needed to create the
+    entry for the file on the index page so that if the HTML page is reused,
+    the index page can still be created to refer to it.
+
+    The data looks like::
+
+        {
+            "note": "This file is an internal implementation detail ...",
+            // A fixed number indicating the data format.  STATUS_FORMAT
+            "format": 3,
+            // The version of coverage.py
+            "version": "7.4.4",
+            // A hash of a number of global things, including the configuration
+            // settings and the pyfile.html template itself.
+            "globals": "540ee119c15d52a68a53fe6f0897346d",
+            "files": {
+                // An entry for each source file keyed by the flat_rootname().
+                "z_7b071bdc2a35fa80___init___py": {
+                    // Hash of the source, the text of the .py file.
+                    "hash": "e45581a5b48f879f301c0f30bf77a50c",
+                    // Information for the index.html file.
+                    "index": {
+                        "html_filename": "z_7b071bdc2a35fa80___init___py.html",
+                        "relative_filename": "cogapp/__init__.py",
+                        // The Numbers for this file.
+                        "nums": { "precision": 2, "n_files": 1, "n_statements": 43, ... }
+                    }
+                },
+                ...
+            }
+        }
+
+    """
 
     STATUS_FILE = "status.json"
-    STATUS_FORMAT = 2
+    STATUS_FORMAT = 3
     NOTE = (
         "This file is an internal implementation detail to speed up HTML report"
         + " generation. Its format can change at any time. You might be looking"
         + " for the JSON report: https://coverage.rtfd.io/cmd.html#cmd-json"
     )
 
-    #  The data looks like:
-    #
-    #  {
-    #      "note": "This file is an internal implementation detail ...",
-    #      "format": 2,
-    #      "version": "4.0a1",
-    #      "globals": "540ee119c15d52a68a53fe6f0897346d",
-    #      "files": {
-    #          "cogapp___init__": {
-    #              "hash": "e45581a5b48f879f301c0f30bf77a50c",
-    #              "index": {
-    #                  "html_filename": "cogapp___init__.html",
-    #                  "relative_filename": "cogapp/__init__",
-    #                  "nums": [ 1, 14, 0, 0, 0, 0, 0 ]
-    #              }
-    #          },
-    #          ...
-    #          "cogapp_whiteutils": {
-    #              "hash": "8504bb427fc488c4176809ded0277d51",
-    #              "index": {
-    #                  "html_filename": "cogapp_whiteutils.html",
-    #                  "relative_filename": "cogapp/whiteutils",
-    #                  "nums": [ 1, 59, 0, 1, 28, 2, 2 ]
-    #              }
-    #          }
-    #      }
-    #  }
-
     def __init__(self, directory: str) -> None:
         self.directory = directory
-        self.reset()
+        self._reset()
 
-    def reset(self) -> None:
+    def _reset(self) -> None:
         """Initialize to empty. Causes all files to be reported."""
         self.globals = ""
-        self.files: dict[str, FileInfoDict] = {}
+        self.files: dict[str, FileInfo] = {}
 
     def read(self) -> None:
         """Read the information we stored last time."""
-        usable = False
         try:
             status_file = os.path.join(self.directory, self.STATUS_FILE)
             with open(status_file) as fstatus:
                 status = json.load(fstatus)
         except (OSError, ValueError):
+            # Status file is missing or malformed.
             usable = False
         else:
-            usable = True
             if status["format"] != self.STATUS_FORMAT:
                 usable = False
             elif status["version"] != coverage.__version__:
                 usable = False
+            else:
+                usable = True
 
         if usable:
             self.files = {}
-            for filename, fileinfo in status["files"].items():
-                fileinfo["index"]["nums"] = Numbers(*fileinfo["index"]["numlist"])
+            for filename, filedict in status["files"].items():
+                indexdict = filedict["index"]
+                indexinfo = IndexInfo(**indexdict)
+                indexinfo.nums = Numbers(**indexdict["nums"])
+                fileinfo = FileInfo(
+                    hash=filedict["hash"],
+                    index=indexinfo,
+                )
                 self.files[filename] = fileinfo
             self.globals = status["globals"]
         else:
-            self.reset()
+            self._reset()
 
     def write(self) -> None:
         """Write the current status."""
         status_file = os.path.join(self.directory, self.STATUS_FILE)
-        files = {}
-        for filename, fileinfo in self.files.items():
-            index = fileinfo["index"]
-            assert index["nums"] is not None
-            index["numlist"] = list(dataclasses.astuple(index["nums"]))
-            index["nums"] = None
-            files[filename] = fileinfo
-
-        status = {
+        status_data = {
             "note": self.NOTE,
             "format": self.STATUS_FORMAT,
             "version": coverage.__version__,
             "globals": self.globals,
-            "files": files,
+            "files": {
+                fname: dataclasses.asdict(finfo)
+                for fname, finfo in self.files.items()
+            },
         }
         with open(status_file, "w") as fout:
-            json.dump(status, fout, separators=(",", ":"))
+            json.dump(status_data, fout, separators=(",", ":"))
 
     def check_global_data(self, *data: Any) -> None:
-        """Check the global data that can affect incremental reporting."""
+        """Check the global data that can affect incremental reporting.
+
+        Pass in whatever global information could affect the content of the
+        HTML pages.  If the global data has changed since last time, this will
+        clear the data so that all files are regenerated.
+
+        """
         m = Hasher()
         for d in data:
             m.update(d)
         these_globals = m.hexdigest()
         if self.globals != these_globals:
-            self.reset()
+            self._reset()
             self.globals = these_globals
 
     def can_skip_file(self, data: CoverageData, fr: FileReporter, rootname: str) -> bool:
@@ -608,36 +629,33 @@ def can_skip_file(self, data: CoverageData, fr: FileReporter, rootname: str) ->
 
         `data` is a CoverageData object, `fr` is a `FileReporter`, and
         `rootname` is the name being used for the file.
+
+        Returns True if the HTML page is fine as-is, False if we need to recreate
+        the HTML page.
+
         """
         m = Hasher()
         m.update(fr.source().encode("utf-8"))
         add_data_to_hash(data, fr.filename, m)
         this_hash = m.hexdigest()
 
-        that_hash = self.file_hash(rootname)
+        file_info = self.files.setdefault(rootname, FileInfo())
 
-        if this_hash == that_hash:
+        if this_hash == file_info.hash:
             # Nothing has changed to require the file to be reported again.
             return True
         else:
-            self.set_file_hash(rootname, this_hash)
+            # File has changed, record the latest hash and force regeneration.
+            file_info.hash = this_hash
             return False
 
-    def file_hash(self, fname: str) -> str:
-        """Get the hash of `fname`'s contents."""
-        return self.files.get(fname, {}).get("hash", "")    # type: ignore[call-overload]
-
-    def set_file_hash(self, fname: str, val: str) -> None:
-        """Set the hash of `fname`'s contents."""
-        self.files.setdefault(fname, {})["hash"] = val      # type: ignore[typeddict-item]
-
-    def index_info(self, fname: str) -> IndexInfoDict:
+    def index_info(self, fname: str) -> IndexInfo:
         """Get the information for index.html for `fname`."""
-        return self.files.get(fname, {}).get("index", {})   # type: ignore
+        return self.files.get(fname, FileInfo()).index
 
-    def set_index_info(self, fname: str, info: IndexInfoDict) -> None:
+    def set_index_info(self, fname: str, info: IndexInfo) -> None:
         """Set the information for index.html for `fname`."""
-        self.files.setdefault(fname, {})["index"] = info    # type: ignore[typeddict-item]
+        self.files.setdefault(fname, FileInfo()).index = info
 
 
 # Helpers for templates and generating HTML
diff --git a/pyproject.toml b/pyproject.toml
index 9830a43cc..7dc56333f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -41,6 +41,7 @@ defining-attr-methods = [
     "__post_init__",
     "setUp",
     "reset",
+    "_reset",
 ]
 
 [tool.pylint.design]
diff --git a/tests/test_html.py b/tests/test_html.py
index bac9ee7ce..c53566344 100644
--- a/tests/test_html.py
+++ b/tests/test_html.py
@@ -310,7 +310,7 @@ def test_status_format_change(self) -> None:
         with open("htmlcov/status.json") as status_json:
             status_data = json.load(status_json)
 
-        assert status_data['format'] == 2
+        assert status_data['format'] == 3
         status_data['format'] = 99
         with open("htmlcov/status.json", "w") as status_json:
             json.dump(status_data, status_json)