Skip to content

Commit

Permalink
find license identifiers in comments with ascii art frames
Browse files Browse the repository at this point in the history
  • Loading branch information
pietroalbini committed Jul 18, 2022
1 parent 8683586 commit 50deb5f
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 3 deletions.
28 changes: 25 additions & 3 deletions src/reuse/_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from hashlib import sha1
from os import PathLike
from pathlib import Path
from typing import BinaryIO, List, Optional, Set
from typing import BinaryIO, List, Optional, Set, Iterator

from boolean.boolean import Expression, ParseError
from debian.copyright import Copyright
Expand Down Expand Up @@ -53,7 +53,7 @@
)
)
_IDENTIFIER_PATTERN = re.compile(
r"SPDX-License-Identifier:[ \t]+(.*?)" + _END_PATTERN, re.MULTILINE
r"^(.*?)SPDX-License-Identifier:[ \t]+(.*?)" + _END_PATTERN, re.MULTILINE
)
_COPYRIGHT_PATTERNS = [
re.compile(
Expand Down Expand Up @@ -264,7 +264,7 @@ def extract_spdx_info(text: str) -> SpdxInfo:
:raises ParseError: if an SPDX expression could not be parsed
"""
text = filter_ignore_block(text)
expression_matches = set(map(str.strip, _IDENTIFIER_PATTERN.findall(text)))
expression_matches = set(find_license_identifiers(text))
expressions = set()
copyright_matches = set()
for expression in expression_matches:
Expand All @@ -287,6 +287,28 @@ def extract_spdx_info(text: str) -> SpdxInfo:
return SpdxInfo(expressions, copyright_matches)


def find_license_identifiers(text: str) -> Iterator[str]:
    """Yield each license identifier that _IDENTIFIER_PATTERN finds in *text*.

    Every yielded value has its surrounding whitespace removed, together with
    any trailing decoration that mirrors the text preceding the SPDX tag on
    the same line.
    """
    for raw_prefix, raw_identifier in _IDENTIFIER_PATTERN.findall(text):
        candidate = raw_identifier.strip()

        # Comments are sometimes "framed" with ASCII art, for instance:
        #
        #   /***********************\
        #   |* This is a comment   *|
        #   \***********************/
        #
        # On such a line the closing decoration is the opening one written
        # backwards, so when the identifier ends with the reversed prefix,
        # that tail is cut off. See #343 for a real world example of a
        # project doing this (LLVM).
        mirrored = raw_prefix.strip()[::-1]
        if mirrored and candidate.endswith(mirrored):
            candidate = candidate[: len(candidate) - len(mirrored)]

        # Cutting the frame off may expose padding that sat before it.
        yield candidate.strip()


def filter_ignore_block(text: str) -> str:
"""Filter out blocks beginning with REUSE_IGNORE_START and ending with
REUSE_IGNORE_END to remove lines that should not be treated as copyright and
Expand Down
14 changes: 14 additions & 0 deletions tests/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,20 @@ def test_extract_expression():
assert result.spdx_expressions == {_LICENSING.parse(expression)}


def test_extract_expression_from_ascii_art_frame():
    """An expression inside an ASCII art frame is still extracted."""
    # The backslashes are doubled because this is a regular (non-raw) string.
    framed_comment = cleandoc(
        """
        /**********************************\\
        |* SPDX-License-Identifier: MIT *|
        \\**********************************/
        """
    )
    spdx_info = _util.extract_spdx_info(framed_comment)
    assert spdx_info.spdx_expressions == {_LICENSING.parse("MIT")}


def test_extract_erroneous_expression():
"""Parse an incorrect expression."""
expression = "SPDX-License-Identifier: GPL-3.0-or-later AND (MIT OR)"
Expand Down

0 comments on commit 50deb5f

Please sign in to comment.