Make lcov parser more accepting of corrupt coverage lines (#726)

codecov · Sep 19, 2024 · 3c67255 · 3c67255
1 parent ff40366
commit 3c67255
Show file tree

Hide file tree

Showing 2 changed files with 118 additions and 88 deletions.
diff --git a/services/report/languages/lcov.py b/services/report/languages/lcov.py
@@ -1,6 +1,6 @@
 import logging
 from collections import defaultdict
-from decimal import Decimal
+from decimal import Decimal, InvalidOperation
 from io import BytesIO
 
 import sentry_sdk
@@ -34,13 +34,13 @@ def from_txt(reports: bytes, report_builder_session: ReportBuilderSession) -> No
 def _process_file(
     doc: bytes, report_builder_session: ReportBuilderSession
 ) -> ReportFile | None:
-    _already_informed_of_negative_execution_count = False
     branches: dict[str, dict[str, int]] = defaultdict(dict)
-    fln, fh = {}, {}
+    fn_lines: set[str] = set()  # lines of function definitions
+
     JS = False
     CPP = False
     skip_lines: list[str] = []
-    _file = None
+    _file: ReportFile | None = None
 
     for encoded_line in BytesIO(doc):
         line = encoded_line.decode(errors="replace").rstrip("\n")
@@ -49,14 +49,15 @@ def _process_file(
 
         method, content = line.split(":", 1)
         content = content.strip()
-        if method in ("TN", "LF", "LH", "BRF", "BRH"):
+        if method in ("TN", "LF", "LH", "FNF", "FNH", "BRF", "BRH", "FNDA"):
             # TN: test title
             # LF: lines found
             # LH: lines hit
             # FNF: functions found
             # FNH: functions hit
             # BRF: branches found
             # BRH: branches hit
+            # FNDA: function data
             continue
 
         if method == "SF":
@@ -86,35 +87,27 @@ def _process_file(
             if line.startswith("undefined,"):
                 continue
 
-            splitted_content = content.split(",")
-            line = splitted_content[0]
-            hit = splitted_content[1]
-            if line[0] in ("0", "n") or hit[0] in ("=", "s"):
+            split = content.split(",", 2)
+            if len(split) < 2:
                 continue
+            line_str = split[0]
+            hit = split[1]
 
-            if hit == "undefined" or line == "undefined":
+            if line_str in ("", "undefined") or hit in ("", "undefined"):
+                continue
+            if line_str[0] in ("0", "n") or hit[0] in ("=", "s"):
                 continue
 
-            if hit.isnumeric():
-                cov = int(hit)
-            else:
-                # Huge ints may be expressed in scientific notation.
-                # int(float(hit)) may lose precision, but Decimal shouldn't.
-                cov = int(Decimal(hit))
-
-            if cov < -1:
-                # https://github.com/linux-test-project/lcov/commit/dfec606f3b30e1ac0f4114cfb98b29f91e9edb21
-                if not _already_informed_of_negative_execution_count:
-                    log.warning(
-                        "At least one occurrence of negative execution counts on Lcov",
-                        extra=dict(
-                            execution_count=cov, lcov_report_filename=_file.name
-                        ),
-                    )
-                    _already_informed_of_negative_execution_count = True
-                cov = 0
-            coverage_line = report_builder_session.create_coverage_line(cov)
-            _file.append(int(line), coverage_line)
+            try:
+                ln = int(line_str)
+                cov = parse_int(hit)
+            except (ValueError, InvalidOperation):
+                continue
+
+            cov = max(cov, 0)  # clamp to 0
+
+            _line = report_builder_session.create_coverage_line(cov)
+            _file.append(ln, _line)
 
         elif method == "FN" and not JS:
             """
@@ -123,69 +116,79 @@ def _process_file(
 
             FN:<line number of function start>,<function name>
             """
-            line, name = content.split(",", 1)
-            if CPP and name[:2] in ("_Z", "_G"):
-                skip_lines.append(line)
-                continue
 
-            fln[name] = line
+            split = content.split(",", 1)
+            if len(split) < 2:
+                continue
+            line_str, name = split
 
-        elif method == "FNDA" and not JS:
-            #  FNDA:<execution count>,<function name>
-            hit, name = content.split(",", 1)
-            if CPP and name[0] == "_":
-                skip_lines.append(line)
+            if CPP and name[:2] in ("_Z", "_G"):
+                skip_lines.append(line_str)
                 continue
 
-            if hit:
-                if hit.isnumeric():
-                    fh[name] = int(hit)
-                else:
-                    fh[name] = int(Decimal(hit))
+            fn_lines.add(line_str)
 
         elif method == "BRDA" and not JS:
             """
-            Branch coverage information is stored which one line per branch:
+            Branch coverage information is stored with one line per branch:
 
               BRDA:<line number>,<block number>,<branch number>,<taken>
 
-            Block  number  and  branch  number are gcc internal IDs for the branch.
-            Taken is either "-" if the basic block containing the branch was  never
+            Block number and branch number are gcc internal IDs for the branch.
+            Taken is either "-" if the basic block containing the branch was never
             executed or a number indicating how often that branch was taken.
             """
             # BRDA:<line number>,<block number>,<branch number>,<taken>
-            ln, block, branch, taken = content.split(",", 3)
-            if ln == "1" and _file.name.endswith(".ts"):
+            split = content.split(",", 3)
+            if len(split) < 4:
+                continue
+            line_str, block, branch, taken = split
+
+            if line_str == "1" and _file.name.endswith(".ts"):
                 continue
 
-            elif ln not in ("0", ""):
-                branches[ln]["%s:%s" % (block, branch)] = (
+            elif line_str not in ("0", ""):
+                branches[line_str]["%s:%s" % (block, branch)] = (
                     0 if taken in ("-", "0") else 1
                 )
 
     if _file is None:
         return None
 
-    # remove skipped
+    # remove skipped branches
     for sl in skip_lines:
         branches.pop(sl, None)
 
-    methods = fln.values()
-
     # work branches
-    for ln, br in branches.items():
-        s, li = sum(br.values()), len(br.values())
-        mb = [bid for bid, cov in br.items() if cov == 0]
-        coverage = f"{s}/{li}"
-        coverage_type = CoverageType.method if ln in methods else CoverageType.branch
-
-        _file.append(
-            int(ln),
-            report_builder_session.create_coverage_line(
-                coverage,
-                coverage_type,
-                missing_branches=(mb if mb != [] else None),
-            ),
+    for line_str, br in branches.items():
+        try:
+            ln = int(line_str)
+        except ValueError:
+            continue
+
+        branch_num = len(br.values())
+        branch_sum = sum(br.values())
+        missing_branches = [bid for bid, cov in br.items() if cov == 0]
+
+        coverage = f"{branch_sum}/{branch_num}"
+        coverage_type = (
+            CoverageType.method if line_str in fn_lines else CoverageType.branch
         )
 
+        _line = report_builder_session.create_coverage_line(
+            coverage,
+            coverage_type,
+            missing_branches=(missing_branches if missing_branches != [] else None),
+        )
+        _file.append(ln, _line)
+
     return _file
+
+
+def parse_int(n: str) -> int:
+    if n.isnumeric():
+        return int(n)
+
+    # Huge ints may be expressed in scientific notation.
+    # int(float(hit)) may lose precision, but Decimal shouldn't.
+    return int(Decimal(n))
diff --git a/services/report/languages/tests/unit/test_lcov.py b/services/report/languages/tests/unit/test_lcov.py
@@ -3,7 +3,8 @@
 
 from . import create_report_builder_session
 
-txt = b"""TN:
+txt = b"""
+TN:
 SF:file.js
 FNDA:76,jsx
 FN:76,(anonymous_1)
@@ -86,6 +87,36 @@
 end_of_record
 """
 
+negative_count = b"""
+TN:
+SF:file.js
+DA:1,1
+DA:2,2
+DA:3,0
+DA:4,-1
+DA:5,-5
+DA:6,-20
+end_of_record
+"""
+
+corrupt_txt = b"""
+TN:
+SF:foo.cpp
+
+DA:1,1
+
+DA:DA:130,0
+DA:0,
+DA:,0
+DA:
+DA:not_int,123
+DA:123,not_decimal
+
+FN:just_a_name_no_line
+
+end_of_record
+"""
+
 
 class TestLcov(BaseTestCase):
     def test_report(self):
@@ -100,7 +131,7 @@ def fixes(path):
         report = report_builder_session.output_report()
         processed_report = self.convert_report_to_better_readable(report)
 
-        expected_result_archive = {
+        assert processed_report["archive"] == {
             "file.cpp": [
                 (1, 1, None, [[0, 1, None, None, None]], None, None),
                 (2, "1/3", "m", [[0, "1/3", ["1:1", "1:3"], None, None]], None, None),
@@ -121,7 +152,6 @@ def fixes(path):
                     None,
                     None,
                 ),
-                # TODO (Thiago): This is out of order compared to the original, verify what happened
             ],
             "file.js": [
                 (1, 1, None, [[0, 1, None, None, None]], None, None),
@@ -130,8 +160,6 @@ def fixes(path):
             "file.ts": [(2, 1, None, [[0, 1, None, None, None]], None, None)],
         }
 
-        assert expected_result_archive == processed_report["archive"]
-
     def test_detect(self):
         processor = lcov.LcovProcessor()
         assert processor.matches_content(b"hello\nend_of_record\n", "", "") is True
@@ -140,21 +168,8 @@ def test_detect(self):
         assert processor.matches_content(b"", "", "") is False
 
     def test_negative_execution_count(self):
-        text = "\n".join(
-            [
-                "TN:",
-                "SF:file.js",
-                "DA:1,1",
-                "DA:2,2",
-                "DA:3,0",
-                "DA:4,-1",
-                "DA:5,-5",
-                "DA:6,-20",
-                "end_of_record",
-            ]
-        ).encode()
         report_builder_session = create_report_builder_session()
-        lcov.from_txt(text, report_builder_session)
+        lcov.from_txt(negative_count, report_builder_session)
         report = report_builder_session.output_report()
         processed_report = self.convert_report_to_better_readable(report)
 
@@ -163,8 +178,20 @@ def test_negative_execution_count(self):
                 (1, 1, None, [[0, 1, None, None, None]], None, None),
                 (2, 2, None, [[0, 2, None, None, None]], None, None),
                 (3, 0, None, [[0, 0, None, None, None]], None, None),
-                (4, -1, None, [[0, -1, None, None, None]], None, None),
+                (4, 0, None, [[0, 0, None, None, None]], None, None),
                 (5, 0, None, [[0, 0, None, None, None]], None, None),
                 (6, 0, None, [[0, 0, None, None, None]], None, None),
             ]
         }
+
+    def test_skips_corrupted_lines(self):
+        report_builder_session = create_report_builder_session()
+        lcov.from_txt(corrupt_txt, report_builder_session)
+        report = report_builder_session.output_report()
+        processed_report = self.convert_report_to_better_readable(report)
+
+        assert processed_report["archive"] == {
+            "foo.cpp": [
+                (1, 1, None, [[0, 1, None, None, None]], None, None),
+            ]
+        }