Find license identifiers in comments with ASCII art frames

fsfe · Jul 18, 2022 · 50deb5f · 50deb5f
1 parent 8683586
commit 50deb5f
Show file tree

Hide file tree

Showing 2 changed files with 39 additions and 3 deletions.
diff --git a/src/reuse/_util.py b/src/reuse/_util.py
@@ -22,7 +22,7 @@
 from hashlib import sha1
 from os import PathLike
 from pathlib import Path
-from typing import BinaryIO, List, Optional, Set
+from typing import BinaryIO, List, Optional, Set, Iterator
 
 from boolean.boolean import Expression, ParseError
 from debian.copyright import Copyright
@@ -53,7 +53,7 @@
     )
 )
 _IDENTIFIER_PATTERN = re.compile(
-    r"SPDX-License-Identifier:[ \t]+(.*?)" + _END_PATTERN, re.MULTILINE
+    r"^(.*?)SPDX-License-Identifier:[ \t]+(.*?)" + _END_PATTERN, re.MULTILINE
 )
 _COPYRIGHT_PATTERNS = [
     re.compile(
@@ -264,7 +264,7 @@ def extract_spdx_info(text: str) -> SpdxInfo:
     :raises ParseError: if an SPDX expression could not be parsed
     """
     text = filter_ignore_block(text)
-    expression_matches = set(map(str.strip, _IDENTIFIER_PATTERN.findall(text)))
+    expression_matches = set(find_license_identifiers(text))
     expressions = set()
     copyright_matches = set()
     for expression in expression_matches:
@@ -287,6 +287,28 @@ def extract_spdx_info(text: str) -> SpdxInfo:
     return SpdxInfo(expressions, copyright_matches)
 
 
+def find_license_identifiers(text: str) -> Iterator[str]:
+    """Yield each license identifier _IDENTIFIER_PATTERN finds in *text*,
+    stripped of surrounding whitespace and of a mirrored comment-frame suffix."""
+    for prefix, identifier in _IDENTIFIER_PATTERN.findall(text):
+        prefix, identifier = prefix.strip(), identifier.strip()
+
+        # Some comment headers have ASCII art to "frame" the comment, like this:
+        #   /***********************\
+        #   |*  This is a comment  *|
+        #   \***********************/
+        # To ensure we parse them correctly, if the line ends with the inverse
+        # of the comment prefix, we strip that suffix. See #343 for a real
+        # world example of a project doing this (LLVM).
+        # NOTE(review): the prefix is mirrored whatever it contains, so a "C"
+        # prefix would presumably also trim "ISC" to "IS" -- TODO confirm.
+        suffix = prefix[::-1]
+        if suffix and identifier.endswith(suffix):
+            identifier = identifier[: -len(suffix)]
+
+        yield identifier.strip()
+
+
 def filter_ignore_block(text: str) -> str:
     """Filter out blocks beginning with REUSE_IGNORE_START and ending with
     REUSE_IGNORE_END to remove lines that should not be treated as copyright and

diff --git a/tests/test_util.py b/tests/test_util.py
@@ -51,6 +51,20 @@ def test_extract_expression():
         assert result.spdx_expressions == {_LICENSING.parse(expression)}
 
 
+def test_extract_expression_from_ascii_art_frame():
+    """An expression inside an ASCII art frame is parsed without the frame."""
+    result = _util.extract_spdx_info(
+        cleandoc(
+            """
+             /**********************************\\
+             |*  SPDX-License-Identifier: MIT  *|
+             \\**********************************/
+            """
+        )
+    )
+    assert result.spdx_expressions == {_LICENSING.parse("MIT")}
+
+
 def test_extract_erroneous_expression():
     """Parse an incorrect expression."""
     expression = "SPDX-License-Identifier: GPL-3.0-or-later AND (MIT OR)"