Skip to content

Commit

Permalink
Update adding unknown matches #2390
Browse files Browse the repository at this point in the history
Instead of adding a general `unknown_debian_license` rule, create
a synthetic UnknownRule object and a LicenseMatch object out of the
unknown license text. Also update test expectations after reindexing
licenses with new rules added from the develop branch.

Signed-off-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com>
  • Loading branch information
AyanSinhaMahapatra committed Jun 1, 2021
1 parent dc61705 commit 40a303a
Show file tree
Hide file tree
Showing 117 changed files with 517 additions and 480 deletions.
1 change: 0 additions & 1 deletion src/licensedcode/data/rules/unknown_debian_license.RULE

This file was deleted.

3 changes: 0 additions & 3 deletions src/licensedcode/data/rules/unknown_debian_license.yml

This file was deleted.

114 changes: 100 additions & 14 deletions src/packagedcode/debian_copyright.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import sys
from os import environ
from os import path
import traceback

import attr

Expand All @@ -28,6 +29,13 @@
from packagedcode.licensing import get_normalized_expression
from packagedcode.utils import combine_expressions

from licensedcode.models import Rule
from licensedcode.models import InvalidRule
from licensedcode.match import LicenseMatch
from licensedcode.query import Query
from licensedcode.spans import Span
from licensedcode.cache import get_index

from textcode.analysis import unicode_text

"""
Expand All @@ -38,6 +46,7 @@

TRACE = environ.get("SCANCODE_DEBUG_PACKAGE", False) or False

MATCHER_UNKNOWN = '5-unknown'

def logger_debug(*args):
    """
    No-op debug logger: accept any positional arguments, ignore them and
    return None.
    """
    return None
Expand Down Expand Up @@ -371,7 +380,7 @@ def get_license_detection(paragraph, debian_licensing):
"""
name = paragraph.license.name
if not name:
return get_license_detection_from_paragraph_text(paragraph=paragraph)
return get_license_detection_from_nameless_paragraph(paragraph=paragraph)

normalized_expression = debian_licensing.get_normalized_expression(name)

Expand Down Expand Up @@ -585,9 +594,7 @@ def get_normalized_expression(self, exp):

try:
debian_expression = self.licensing.parse(cleaned)
normalized_expression = debian_expression.subs(
self.substitutions
)
normalized_expression = debian_expression.subs(self.substitutions)

except ExpressionError:
# If Expression fails to parse we lookup exact string matches in License paras
Expand All @@ -603,7 +610,7 @@ def get_normalized_expression(self, exp):
else:
# Case where expression is not parsable and the same expression is not present in
# the license paragraphs
unknown_matches = get_unknown_matches(name=exp, text=None)
unknown_matches = add_unknown_matches(name=exp, text=None)
normalized_expression = get_license_expression_from_matches(
license_matches=unknown_matches
)
Expand Down Expand Up @@ -656,7 +663,7 @@ def parse_paras_with_license_text(paras_with_license):
if text_matches:
matches.extend(text_matches)
else:
matches.extend(get_unknown_matches(name=name, text=text))
matches.extend(add_unknown_matches(name=name, text=text))

if license_paragraph.comment:
comment = license_paragraph.comment.text
Expand Down Expand Up @@ -775,7 +782,7 @@ def other_paragraphs(self):
return other_paras

@property
def duplicate_license_paragraphss(self):
def duplicate_license_paragraphs(self):

seen_license_names = set()
duplicate_license_paras = []
Expand Down Expand Up @@ -916,22 +923,101 @@ def get_license_expression_from_matches(license_matches):
return combine_expressions(license_expressions, unique=False)


def get_unknown_matches(name, text):
def add_unknown_matches(name, text):
    """
    Return a list containing a single synthetic LicenseMatch object created
    for an unknown license, given a license `name` and a license `text`.
    A missing (None or empty) `name` or `text` is treated as an empty string.

    The match is built from the first query run of a Query made from the
    combined "License: <name>" and text string. It is attached to an
    UnknownRule that uses the `unknown-license-reference` license expression.
    """
    license_text = f"License: {name or ''}\n {text or ''}".strip()

    index = get_index()
    unknown_query = Query(query_string=license_text, idx=index)

    # Only the first query run is used to build the match.
    first_run = unknown_query.query_runs[0]
    start = first_run.start
    length = len(first_run)
    tokens = first_run.tokens

    # Positions (relative to the run tokens) of the tokens whose id is below
    # the index `len_legalese`: these make up the match `hispan`.
    legalese_limit = index.len_legalese
    high_positions = (
        pos for pos, token_id in enumerate(tokens) if token_id < legalese_limit
    )

    unknown_rule = UnknownRule(
        license_expression='unknown-license-reference',
        stored_text=license_text,
        length=length,
    )

    unknown_match = LicenseMatch(
        rule=unknown_rule,
        qspan=Span(range(start, first_run.end + 1)),
        ispan=Span(range(0, length)),
        hispan=Span(high_positions),
        query_run_start=start,
        matcher=MATCHER_UNKNOWN,
        query=first_run.query,
    )

    return [unknown_match]

@attr.s(slots=True, repr=False)
class UnknownRule(Rule):
    """
    A specialized rule object that is used for the special case of unknown
    matches in debian copyright files.

    Since there can be a lot of unknown licenses in a debian copyright file,
    the rule and the LicenseMatch objects for those are built at matching time.
    """

    def __attrs_post_init__(self, *args, **kwargs):
        """
        Finish the initialization of this rule: set its identifier, parse and
        normalize its license expression and set its flags and relevance.

        Raise an InvalidRule exception if the license expression cannot be
        parsed.
        """
        self.identifier = 'debian-unknown-' + self.license_expression

        try:
            expression = self.licensing.parse(self.license_expression)
        except Exception as e:
            # Catch Exception rather than using a bare except so that
            # KeyboardInterrupt and SystemExit are not converted to an
            # InvalidRule. Chain the original error to keep its context.
            raise InvalidRule(
                'Unable to parse License rule expression: '
                + repr(self.license_expression)
                + ' for: Debian unknown rule:'
                + repr(self.stored_text)
                + '\n' + traceback.format_exc()
            ) from e

        if expression is None:
            raise InvalidRule(
                'Unable to parse License rule expression: '
                + repr(self.license_expression)
                + ' for:' + repr(self.data_file)
            )

        # Store both the normalized expression string and the parsed object.
        self.license_expression = expression.render()
        self.license_expression_object = expression
        self.is_license_tag = True
        self.is_small = False
        self.relevance = 100
        self.has_stored_relevance = True

    def load(self):
        """
        Not supported: always raise NotImplementedError.
        """
        raise NotImplementedError

    def dump(self):
        """
        Not supported: always raise NotImplementedError.
        """
        raise NotImplementedError

def build_unknown_symbol(licenses_db=None):
    """
    Return the unknown SPDX license symbol given a `licenses_db` mapping of
    {key: License} or the standard license db.

    When `licenses_db` is None or empty, the licenses are loaded with
    `load_licenses()`. The symbol is built from the license stored under the
    'unknown' key of that mapping.
    """
    from license_expression import LicenseSymbolLike
    from licensedcode.models import load_licenses

    licenses_db = licenses_db or load_licenses()
    return LicenseSymbolLike(licenses_db['unknown'])


def get_license_detection_from_paragraph_text(paragraph):
def get_license_detection_from_nameless_paragraph(paragraph):
"""
Return a LicenseDetection object built from a paragraph license text.
Return a LicenseDetection object built from any paragraph without a license name.
"""
assert not paragraph.license.name
matches = get_license_matches(paragraph.license.text)

if not matches:
unknown_matches = get_unknown_matches(name=None, text=paragraph.license.text)
unknown_matches = add_unknown_matches(name=None, text=paragraph.license.text)
normalized_expression = get_license_expression_from_matches(
license_matches=unknown_matches
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@
"code_view_url": null,
"vcs_url": null,
"copyright": "Copyright (c) 1998-2016 Free Software Foundation, Inc.\nCopyright (c) 2001 by Pradeep Padala\nCopyright (c) 1994 X Consortium\nCopyright (c) 1980, 1991, 1992, 1993 The Regents of the University of California\nCopyright 1996-2007 by Thomas E. Dickey",
"license_expression": "bsd-new AND mit AND unknown AND x11-fsf AND x11-xconsortium",
"declared_license": "",
"license_expression": "x11-fsf AND x11-xconsortium AND bsd-new",
"declared_license": null,
"notice_text": null,
"root_path": null,
"dependencies": [],
Expand Down Expand Up @@ -117,9 +117,9 @@
"bug_tracking_url": null,
"code_view_url": null,
"vcs_url": null,
"copyright": "Copyright 2013 Jiri Pirko <jiri@resnulli.us>\nCopyright 2014 Andrew Ayer <agwa@andrewayer.name>",
"license_expression": "lgpl-2.1 AND lgpl-2.1-plus AND unknown",
"declared_license": "LGPL-2.1+",
"copyright": "Copyright 2013 Jiri Pirko <jiri@resnulli.us>",
"license_expression": "lgpl-2.1-plus AND lgpl-2.1-plus AND lgpl-2.1 AND lgpl-2.1",
"declared_license": ["LGPL-2.1+"],
"notice_text": null,
"root_path": null,
"dependencies": [],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,35 +51,27 @@
- WOL
- FSFUL
- (gpl-2.0 AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-2.0 AND gpl-2.0 AND gpl-1.0-plus
AND unknown-license-reference AND unknown-license-reference AND gpl-1.0-plus AND gpl-1.0-plus
AND gpl-1.0-plus AND gpl-1.0-plus AND unknown-license-reference AND gpl-1.0-plus AND gpl-1.0-plus
AND unknown-license-reference AND unknown-license-reference AND unknown-license-reference)
AND fsf-free AND autoconf-simple-exception-2.0 AND mit-old-style-no-advert AND gpl-2.0 AND
gpl-2.0 AND none AND bsd-original-uc AND gpl-2.0 AND tu-berlin AND bsd-new AND bsd-new AND
bsd-original AND gpl-2.0-plus AND public-domain AND none AND gpl-2.0 AND public-domain AND
other-permissive AND (mit AND (gpl-2.0 AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus
AND gpl-2.0 AND gpl-2.0 AND gpl-1.0-plus AND unknown-license-reference AND unknown-license-reference
AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus AND unknown-license-reference
AND gpl-1.0-plus AND gpl-1.0-plus AND unknown-license-reference AND unknown-license-reference
AND unknown-license-reference)) AND gpl-2.0-plus AND free-unknown AND (bsd-original-uc AND
isc AND (gpl-2.0 AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-2.0 AND gpl-2.0
AND gpl-1.0-plus AND unknown-license-reference AND unknown-license-reference AND gpl-1.0-plus
AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus AND unknown-license-reference AND gpl-1.0-plus
AND gpl-1.0-plus AND unknown-license-reference AND unknown-license-reference AND unknown-license-reference))
AND bsd-original-uc AND bison-exception-2.2 AND bsd-new AND gpl-2.0 AND (wol AND (gpl-2.0
AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-2.0 AND gpl-2.0 AND gpl-1.0-plus
AND unknown-license-reference AND unknown-license-reference AND gpl-1.0-plus AND gpl-1.0-plus
AND gpl-1.0-plus AND gpl-1.0-plus AND unknown-license-reference AND gpl-1.0-plus AND gpl-1.0-plus
AND unknown-license-reference AND unknown-license-reference AND unknown-license-reference))
AND (lgpl-2.1 OR (gpl-2.0 AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-2.0 AND
gpl-2.0 AND gpl-1.0-plus AND unknown-license-reference AND unknown-license-reference AND gpl-1.0-plus
AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus AND unknown-license-reference AND gpl-1.0-plus
AND gpl-1.0-plus AND unknown-license-reference AND unknown-license-reference AND unknown-license-reference))
AND public-domain AND isc AND brian-clapper AND lgpl-2.1 AND mit-0 AND (gpl-2.0 AND gpl-1.0-plus
AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-2.0 AND gpl-2.0 AND gpl-1.0-plus AND unknown-license-reference
AND unknown-license-reference AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus
AND unknown-license-reference AND gpl-1.0-plus AND gpl-1.0-plus AND unknown-license-reference
AND unknown-license-reference AND unknown-license-reference) AND gpl-2.0
AND gpl-1.0-plus AND gpl-1.0-plus AND unknown-license-reference) AND fsf-free AND autoconf-simple-exception-2.0
AND mit-old-style-no-advert AND gpl-2.0 AND gpl-2.0 AND none AND bsd-original-uc AND gpl-2.0
AND tu-berlin AND bsd-new AND bsd-new AND bsd-original AND gpl-2.0-plus AND public-domain
AND none AND gpl-2.0 AND public-domain AND other-permissive AND (mit AND (gpl-2.0 AND gpl-1.0-plus
AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-2.0 AND gpl-2.0 AND gpl-1.0-plus AND unknown-license-reference
AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus
AND unknown-license-reference)) AND gpl-2.0-plus AND unknown-license-reference AND (bsd-original-uc
AND isc AND (gpl-2.0 AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-2.0 AND gpl-2.0
AND gpl-1.0-plus AND unknown-license-reference AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus
AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus AND unknown-license-reference)) AND bsd-original-uc
AND bison-exception-2.2 AND bsd-new AND gpl-2.0 AND (wol AND (gpl-2.0 AND gpl-1.0-plus AND
gpl-1.0-plus AND gpl-1.0-plus AND gpl-2.0 AND gpl-2.0 AND gpl-1.0-plus AND unknown-license-reference
AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus
AND unknown-license-reference)) AND (lgpl-2.1 OR (gpl-2.0 AND gpl-1.0-plus AND gpl-1.0-plus
AND gpl-1.0-plus AND gpl-2.0 AND gpl-2.0 AND gpl-1.0-plus AND unknown-license-reference AND
gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus
AND unknown-license-reference)) AND public-domain AND isc AND brian-clapper AND lgpl-2.1 AND
mit-0 AND (gpl-2.0 AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-2.0 AND gpl-2.0
AND gpl-1.0-plus AND unknown-license-reference AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus
AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus AND unknown-license-reference) AND gpl-2.0
- |
1999-2006, Brett Bryant <bbryant@digium.com>
1999-2006, Mark Spencer <markster@digium.com>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,30 +26,23 @@
- Expat
- WOL
- (gpl-2.0 AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-2.0 AND gpl-2.0 AND gpl-1.0-plus
AND unknown-license-reference AND unknown-license-reference AND gpl-1.0-plus AND gpl-1.0-plus
AND gpl-1.0-plus AND gpl-1.0-plus AND unknown-license-reference AND gpl-1.0-plus AND gpl-1.0-plus
AND unknown-license-reference AND unknown-license-reference AND unknown-license-reference)
AND fsf-free AND autoconf-simple-exception-2.0 AND mit-old-style-no-advert AND gpl-2.0 AND
none AND bsd-original-uc AND tu-berlin AND bsd-new AND bsd-new AND bsd-original AND gpl-2.0-plus
AND public-domain AND other-permissive AND (mit AND (gpl-2.0 AND gpl-1.0-plus AND gpl-1.0-plus
AND gpl-1.0-plus AND gpl-2.0 AND gpl-2.0 AND gpl-1.0-plus AND unknown-license-reference AND
unknown-license-reference AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus
AND unknown-license-reference AND gpl-1.0-plus AND gpl-1.0-plus AND unknown-license-reference
AND unknown-license-reference AND unknown-license-reference)) AND free-unknown AND (bsd-original-uc
AND isc AND (gpl-2.0 AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-2.0 AND gpl-2.0
AND gpl-1.0-plus AND unknown-license-reference AND unknown-license-reference AND gpl-1.0-plus
AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus AND unknown-license-reference AND gpl-1.0-plus
AND gpl-1.0-plus AND unknown-license-reference AND unknown-license-reference AND unknown-license-reference))
AND unknown-license-reference AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus
AND gpl-1.0-plus AND gpl-1.0-plus AND unknown-license-reference) AND fsf-free AND autoconf-simple-exception-2.0
AND mit-old-style-no-advert AND gpl-2.0 AND none AND bsd-original-uc AND tu-berlin AND bsd-new
AND bsd-new AND bsd-original AND gpl-2.0-plus AND public-domain AND other-permissive AND (mit
AND (gpl-2.0 AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-2.0 AND gpl-2.0 AND
gpl-1.0-plus AND unknown-license-reference AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus
AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus AND unknown-license-reference)) AND unknown-license-reference
AND (bsd-original-uc AND isc AND (gpl-2.0 AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus
AND gpl-2.0 AND gpl-2.0 AND gpl-1.0-plus AND unknown-license-reference AND gpl-1.0-plus AND
gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus AND unknown-license-reference))
AND bison-exception-2.2 AND bsd-new AND (wol AND (gpl-2.0 AND gpl-1.0-plus AND gpl-1.0-plus
AND gpl-1.0-plus AND gpl-2.0 AND gpl-2.0 AND gpl-1.0-plus AND unknown-license-reference AND
unknown-license-reference AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus
AND unknown-license-reference AND gpl-1.0-plus AND gpl-1.0-plus AND unknown-license-reference
AND unknown-license-reference AND unknown-license-reference)) AND (lgpl-2.1 OR (gpl-2.0 AND
gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-2.0 AND gpl-2.0 AND gpl-1.0-plus AND
unknown-license-reference AND unknown-license-reference AND gpl-1.0-plus AND gpl-1.0-plus
AND gpl-1.0-plus AND gpl-1.0-plus AND unknown-license-reference AND gpl-1.0-plus AND gpl-1.0-plus
AND unknown-license-reference AND unknown-license-reference AND unknown-license-reference))
AND isc AND brian-clapper AND lgpl-2.1 AND mit-0
gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus
AND unknown-license-reference)) AND (lgpl-2.1 OR (gpl-2.0 AND gpl-1.0-plus AND gpl-1.0-plus
AND gpl-1.0-plus AND gpl-2.0 AND gpl-2.0 AND gpl-1.0-plus AND unknown-license-reference AND
gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus AND gpl-1.0-plus
AND unknown-license-reference)) AND isc AND brian-clapper AND lgpl-2.1 AND mit-0
- |
1999-2006, Brett Bryant <bbryant@digium.com>
1999-2006, Mark Spencer <markster@digium.com>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,8 @@
AND zlib AND zlib AND isc AND fsf-unlimited AND fsf-free AND fsf-unlimited-no-warranty AND
fsf-ap AND fsf-ap AND bzip2-libbzip-2010 AND (public-domain AND public-domain) AND public-domain
AND public-domain AND x11-tiff AND (public-domain AND public-domain) AND (apache-2.0 AND gpl-1.0-plus
AND apache-2.0 AND apache-2.0 AND unknown-license-reference AND gpl-1.0-plus AND apache-2.0
AND apache-2.0) AND (gpl-3.0-plus AND bison-exception-2.2 AND gpl-1.0-plus AND other-copyleft
AND gpl-3.0 AND gpl-3.0)
AND apache-2.0 AND apache-2.0 AND gpl-1.0-plus AND apache-2.0 AND apache-2.0) AND (gpl-3.0-plus
AND bison-exception-2.2 AND gpl-1.0-plus AND other-copyleft AND gpl-3.0 AND gpl-3.0)
- |
1998-2015, Sourcefire, Inc.
2002-2007, Tomasz Kojm <tkojm@clamav.net>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,8 @@
AND bsd-new AND zlib AND isc AND fsf-unlimited AND fsf-free AND fsf-unlimited-no-warranty
AND fsf-ap AND fsf-ap AND bzip2-libbzip-2010 AND (public-domain AND public-domain) AND public-domain
AND public-domain AND x11-tiff AND (public-domain AND public-domain) AND (apache-2.0 AND gpl-1.0-plus
AND apache-2.0 AND apache-2.0 AND unknown-license-reference AND gpl-1.0-plus AND apache-2.0
AND apache-2.0) AND (gpl-3.0-plus AND bison-exception-2.2 AND gpl-1.0-plus AND other-copyleft
AND gpl-3.0 AND gpl-3.0)
AND apache-2.0 AND apache-2.0 AND gpl-1.0-plus AND apache-2.0 AND apache-2.0) AND (gpl-3.0-plus
AND bison-exception-2.2 AND gpl-1.0-plus AND other-copyleft AND gpl-3.0 AND gpl-3.0)
- |
1998-2015, Sourcefire, Inc.
2002-2007, Tomasz Kojm <tkojm@clamav.net>
Expand Down
Loading

0 comments on commit 40a303a

Please sign in to comment.