Skip to content

Commit

Permalink
Add required phrases from other rules
Browse files Browse the repository at this point in the history
Signed-off-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com>
  • Loading branch information
AyanSinhaMahapatra committed Aug 19, 2024
1 parent 966adde commit 66f2be5
Show file tree
Hide file tree
Showing 7 changed files with 417 additions and 88 deletions.
18 changes: 1 addition & 17 deletions etc/scripts/licenses/buildrules.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from licensedcode import models
from licensedcode import match_hash
from licensedcode import frontmatter
from licensedcode.models import rule_exists
from license_expression import Licensing

"""
Expand Down Expand Up @@ -129,23 +130,6 @@ def load_data(location="00-new-licenses.txt"):
return rules


def rule_exists(text):
    """
    Return the matched rule identifier if the text is an existing rule matched
    exactly, False otherwise.

    The text counts as an existing rule only when the index returns exactly
    one match, made by the hash matcher with a score of 100.
    """
    idx = cache.get_index()
    matches = idx.match(query_string=text)

    # Zero matches or several matches: the text is not one exact rule.
    if not matches or len(matches) > 1:
        return False

    match = matches[0]
    if match.matcher == match_hash.MATCH_HASH and match.score() == 100:
        return match.rule.identifier

    # A single match that is not an exact hash match: return the documented
    # False explicitly rather than falling through to an implicit None.
    return False


def all_rule_by_tokens():
"""
Return a mapping of {tuples of tokens: rule id}, with one item for each
Expand Down
2 changes: 2 additions & 0 deletions etc/scripts/licenses/report_license_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@
"is_license_reference",
"is_license_intro",
"is_license_clue",
"is_required_phrase",
"skip_creating_required_phrases",
"is_deprecated",
"has_unknown",
"only_known_words",
Expand Down
102 changes: 86 additions & 16 deletions src/licensedcode/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -1425,13 +1425,33 @@ class BasicRule:
'Mutually exclusive from any is_license_* flag')
)

# Flag marking a rule whose whole text is a required phrase; it defaults to
# False and is left out of the repr.
is_required_phrase = attr.ib(
    default=False,
    repr=False,
    metadata=dict(
        help='True if this rule text is a required phrase. '
        'A required phrase is often a part of another larger rule text '
        'but is an essential section of the rule text which must be '
        'present in the case of partial matches, otherwise the match '
        'will be a false positive and misleading.')
)

# Flag telling the required-phrase rule creation to skip this rule; it
# defaults to False and is left out of the repr.
skip_creating_required_phrases = attr.ib(
default=False,
repr=False,
metadata=dict(
help='True if this rule needs to be skipped while creating '
'required phrase rules. Required phrase rules are created out '
'of other rule texts which have marked required phrases. ')
)

language = attr.ib(
default='en',
repr=False,
metadata=dict(
help='Two-letter ISO 639-1 language code if this license text is '
'not in English. See https://en.wikipedia.org/wiki/ISO_639-1 .')
)
)

minimum_coverage = attr.ib(
default=0,
Expand Down Expand Up @@ -1793,22 +1813,27 @@ def has_unknown(self):
# license flag instead
return self.license_expression and 'unknown' in self.license_expression

def validate(self, licensing=None, thorough=False):
"""
Validate this rule using the provided ``licensing`` Licensing and yield
one error message for each type of error detected.
"""
is_false_positive = self.is_false_positive

license_flags = (
@property
def license_flags(self):
    """
    Return a tuple with the current values of the ``is_license_*`` flags
    followed by ``is_required_phrase``, in a fixed order.
    """
    flag_names = (
        'is_license_notice',
        'is_license_text',
        'is_license_reference',
        'is_license_tag',
        'is_license_intro',
        'is_license_clue',
        'is_required_phrase',
    )
    return tuple(getattr(self, flag_name) for flag_name in flag_names)

def validate(self, licensing=None, thorough=False):
"""
Validate this rule using the provided ``licensing`` Licensing and yield
one error message for each type of error detected.
"""
is_false_positive = self.is_false_positive

license_flags = self.license_flags

has_license_flags = any(license_flags)
has_many_license_flags = len([l for l in license_flags if l]) != 1

Expand Down Expand Up @@ -1961,6 +1986,7 @@ def get_flags_mapping(self):
'is_license_tag',
'is_license_intro',
'is_license_clue',
'is_required_phrase',
'is_continuous',
]

Expand All @@ -1987,6 +2013,8 @@ def to_reference(self):
data['is_license_tag'] = self.is_license_tag
data['is_license_intro'] = self.is_license_intro
data['is_license_clue'] = self.is_license_clue
data['is_required_phrase'] = self.is_required_phrase
data['skip_creating_required_phrases'] = self.skip_creating_required_phrases
data['is_continuous'] = self.is_continuous
data['is_builtin'] = self.is_builtin
data['is_from_license'] = self.is_from_license
Expand Down Expand Up @@ -2019,13 +2047,15 @@ def to_dict(self, include_text=False):

flags = (
'is_false_positive',
'is_required_phrase',
'is_license_text',
'is_license_notice',
'is_license_reference',
'is_license_tag',
'is_license_intro',
'is_license_clue',
'is_continuous',
'skip_creating_required_phrases',
'is_deprecated'
)

Expand Down Expand Up @@ -2206,11 +2236,11 @@ def build_key_phrase_spans(self):
Return a list of Spans marking key phrases token positions of that must
be present for this rule to be matched.
"""
from licensedcode.required_phrases import get_key_phrase_spans
from licensedcode.required_phrases import get_key_phrase_spans_or_tokens
if self.is_from_license:
return []
try:
return list(get_key_phrase_spans(self.text))
return list(get_key_phrase_spans_or_tokens(self.text))
except Exception as e:
raise InvalidRule(f'Invalid rule: {self}') from e

Expand Down Expand Up @@ -2241,7 +2271,7 @@ def compute_thresholds(self, small_rule=SMALL_RULE):

self.is_small = self.length < small_rule

def dump(self, rules_data_dir):
def dump(self, rules_data_dir, **kwargs):
"""
Dump a representation of this rule as a .RULE file stored in
``rules_data_dir`` as a UTF-8 file having:
Expand All @@ -2258,6 +2288,8 @@ def dump(self, rules_data_dir):
rule_file = self.rule_file(rules_data_dir=rules_data_dir)

metadata = self.to_dict()
if kwargs:
metadata.update(kwargs)
content = self.text
output = dumps_frontmatter(content=content, metadata=metadata)
with open(rule_file, 'w') as of:
Expand Down Expand Up @@ -2300,6 +2332,8 @@ def load(self, rule_file, with_checks=True):
self.license_expression = data.get('license_expression')

self.is_false_positive = data.get('is_false_positive', False)
self.is_required_phrase = data.get('is_required_phrase', False)
self.skip_creating_required_phrases = data.get('skip_creating_required_phrases', False)

relevance = as_int(float(data.get('relevance') or 0))
# Keep track if we have a stored relevance of not.
Expand Down Expand Up @@ -2359,7 +2393,7 @@ def set_relevance(self):
- relevance is computed based on the rule length
"""

if self.is_false_positive:
if self.is_false_positive or self.is_required_phrase:
self.relevance = 100
self.has_stored_relevance = True
return
Expand Down Expand Up @@ -2814,6 +2848,26 @@ def build_ignorables_mapping(copyrights, holders, authors, urls, emails):
return {k: v for k, v in sorted(ignorables.items()) if v}


def rule_exists(text):
    """
    Return the identifier of the existing rule that ``text`` matches exactly,
    False otherwise.

    The text counts as an existing rule only when the index returns exactly
    one match, made by the hash matcher with a score of 100.
    """
    # NOTE(review): these imports are function-local, presumably to avoid a
    # circular import at module load time -- confirm before moving them.
    from licensedcode.match_hash import MATCH_HASH
    from licensedcode import cache

    idx = cache.get_index()
    matches = idx.match(query_string=text)

    # Zero matches or several matches: the text is not one exact rule.
    if not matches or len(matches) > 1:
        return False

    match = matches[0]
    if match.matcher == MATCH_HASH and match.score() == 100:
        return match.rule.identifier

    # A single match that is not an exact hash match: return the documented
    # False explicitly rather than falling through to an implicit None.
    return False


def find_rule_base_location(name_prefix, rules_directory=rules_data_dir):
"""
Return a new, unique and non-existing base location in ``rules_directory``
Expand Down Expand Up @@ -2842,10 +2896,9 @@ def find_rule_base_location(name_prefix, rules_directory=rules_data_dir):
idx += 1


def get_rules_by_expression(rules_data_dir=rules_data_dir):
def get_rules_by_identifier(rules_data_dir=rules_data_dir):
"""
Get a dictionary (sorted by license_expression) of {license_expression: rules}
where `rules` is a list of all rule objects having the `license_expression`.
Get a dictionary of {rule_identifier: rule} for all license rules.
"""
rules = list(load_rules(rules_data_dir=rules_data_dir))

Expand All @@ -2854,6 +2907,14 @@ def get_rules_by_expression(rules_data_dir=rules_data_dir):
for rule in rules
}

return rules_by_identifier


def map_rules_by_expression(rules_by_identifier):
"""
Get a dictionary (sorted by license_expression) of {license_expression: rules}
from a dictionary of rules by their identifier.
"""
rules_by_expression = defaultdict(list)

for rule in rules_by_identifier.values():
Expand All @@ -2862,3 +2923,12 @@ def get_rules_by_expression(rules_data_dir=rules_data_dir):
rules_by_expression[rule.license_expression].append(rule)

return OrderedDict(sorted(rules_by_expression.items()))


def get_rules_by_expression(rules_data_dir=rules_data_dir):
    """
    Return a mapping of {license_expression: rules} sorted by
    license_expression, where `rules` is the list of all rule objects
    having that `license_expression`.

    The rules come from ``rules_data_dir``: they are first collected by
    identifier and then regrouped by license expression.
    """
    return map_rules_by_expression(get_rules_by_identifier(rules_data_dir))
Loading

0 comments on commit 66f2be5

Please sign in to comment.