Add required phrases from other rules

Signed-off-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com>
aboutcode-org · Aug 19, 2024 · 66f2be5 · 66f2be5
1 parent 966adde
commit 66f2be5
Show file tree

Hide file tree

Showing 7 changed files with 417 additions and 88 deletions.
diff --git a/etc/scripts/licenses/buildrules.py b/etc/scripts/licenses/buildrules.py
@@ -16,6 +16,7 @@
 from licensedcode import models
 from licensedcode import match_hash
 from licensedcode import frontmatter
+from licensedcode.models import rule_exists
 from license_expression import Licensing
 
 """
@@ -129,23 +130,6 @@ def load_data(location="00-new-licenses.txt"):
     return rules
 
 
-def rule_exists(text):
-    """
-    Return the matched rule identifier if the text is an existing rule matched
-    exactly, False otherwise.
-    """
-    idx = cache.get_index()
-
-    matches = idx.match(query_string=text)
-    if not matches:
-        return False
-    if len(matches) > 1:
-        return False
-    match = matches[0]
-    if match.matcher == match_hash.MATCH_HASH and match.score() == 100:
-        return match.rule.identifier
-
-
 def all_rule_by_tokens():
     """
     Return a mapping of {tuples of tokens: rule id}, with one item for each

diff --git a/etc/scripts/licenses/report_license_rules.py b/etc/scripts/licenses/report_license_rules.py
@@ -62,6 +62,8 @@
     "is_license_reference",
     "is_license_intro",
     "is_license_clue",
+    "is_required_phrase",
+    "skip_creating_required_phrases",
     "is_deprecated",
     "has_unknown",
     "only_known_words",

diff --git a/src/licensedcode/models.py b/src/licensedcode/models.py
@@ -1425,13 +1425,33 @@ class BasicRule:
             'Mutually exclusive from any is_license_* flag')
     )
 
+    is_required_phrase = attr.ib(
+        default=False,
+        repr=False,
+        metadata=dict(
+            help='True if this rule text is a required phrase. '
+            'A required phrase is often a part of another larger rule text '
+            'but is an essential section of the rule text which must be '
+            'present in the case of partial matches, otherwise the match '
+            'will be a false positive and misleading. ')
+    )
+
+    skip_creating_required_phrases = attr.ib(
+        default=False,
+        repr=False,
+        metadata=dict(
+            help='True if this rule needs to be skipped while creating '
+            'required phrase rules. Required phrase rules are created out '
+            'of other rule texts which have marked required phrases. ')
+    )
+
     language = attr.ib(
         default='en',
         repr=False,
         metadata=dict(
             help='Two-letter ISO 639-1 language code if this license text is '
             'not in English. See https://en.wikipedia.org/wiki/ISO_639-1 .')
-        )
+    )
 
     minimum_coverage = attr.ib(
         default=0,
@@ -1793,22 +1813,27 @@ def has_unknown(self):
         # license flag instead
         return self.license_expression and 'unknown' in self.license_expression
 
-    def validate(self, licensing=None, thorough=False):
-        """
-        Validate this rule using the provided ``licensing`` Licensing and yield
-        one error message for each type of error detected.
-        """
-        is_false_positive = self.is_false_positive
-
-        license_flags = (
+    @property
+    def license_flags(self):
+        return (
             self.is_license_notice,
             self.is_license_text,
             self.is_license_reference,
             self.is_license_tag,
             self.is_license_intro,
             self.is_license_clue,
+            self.is_required_phrase,
         )
 
+    def validate(self, licensing=None, thorough=False):
+        """
+        Validate this rule using the provided ``licensing`` Licensing and yield
+        one error message for each type of error detected.
+        """
+        is_false_positive = self.is_false_positive
+
+        license_flags = self.license_flags
+
         has_license_flags = any(license_flags)
         has_many_license_flags = len([l for l in license_flags if l]) != 1
 
@@ -1961,6 +1986,7 @@ def get_flags_mapping(self):
             'is_license_tag',
             'is_license_intro',
             'is_license_clue',
+            'is_required_phrase',
             'is_continuous',
         ]
 
@@ -1987,6 +2013,8 @@ def to_reference(self):
         data['is_license_tag'] = self.is_license_tag
         data['is_license_intro'] = self.is_license_intro
         data['is_license_clue'] = self.is_license_clue
+        data['is_required_phrase'] = self.is_required_phrase
+        data['skip_creating_required_phrases'] = self.skip_creating_required_phrases
         data['is_continuous'] = self.is_continuous
         data['is_builtin'] = self.is_builtin
         data['is_from_license'] = self.is_from_license
@@ -2019,13 +2047,15 @@ def to_dict(self, include_text=False):
 
         flags = (
             'is_false_positive',
+            'is_required_phrase',
             'is_license_text',
             'is_license_notice',
             'is_license_reference',
             'is_license_tag',
             'is_license_intro',
             'is_license_clue',
             'is_continuous',
+            'skip_creating_required_phrases',
             'is_deprecated'
         )
 
@@ -2206,11 +2236,11 @@ def build_key_phrase_spans(self):
         Return a list of Spans marking key phrases token positions of that must
         be present for this rule to be matched.
         """
-        from licensedcode.required_phrases import get_key_phrase_spans
+        from licensedcode.required_phrases import get_key_phrase_spans_or_tokens
         if self.is_from_license:
             return []
         try:
-            return list(get_key_phrase_spans(self.text))
+            return list(get_key_phrase_spans_or_tokens(self.text))
         except Exception as e:
             raise InvalidRule(f'Invalid rule: {self}') from e
 
@@ -2241,7 +2271,7 @@ def compute_thresholds(self, small_rule=SMALL_RULE):
 
         self.is_small = self.length < small_rule
 
-    def dump(self, rules_data_dir):
+    def dump(self, rules_data_dir, **kwargs):
         """
         Dump a representation of this rule as a .RULE file stored in
         ``rules_data_dir`` as a UTF-8 file having:
@@ -2258,6 +2288,8 @@ def dump(self, rules_data_dir):
         rule_file = self.rule_file(rules_data_dir=rules_data_dir)
 
         metadata = self.to_dict()
+        if kwargs:
+            metadata.update(kwargs)
         content = self.text
         output = dumps_frontmatter(content=content, metadata=metadata)
         with open(rule_file, 'w') as of:
@@ -2300,6 +2332,8 @@ def load(self, rule_file, with_checks=True):
         self.license_expression = data.get('license_expression')
 
         self.is_false_positive = data.get('is_false_positive', False)
+        self.is_required_phrase = data.get('is_required_phrase', False)
+        self.skip_creating_required_phrases = data.get('skip_creating_required_phrases', False)
 
         relevance = as_int(float(data.get('relevance') or 0))
         # Keep track if we have a stored relevance of not.
@@ -2359,7 +2393,7 @@ def set_relevance(self):
         - relevance is computed based on the rule length
         """
 
-        if self.is_false_positive:
+        if self.is_false_positive or self.is_required_phrase:
             self.relevance = 100
             self.has_stored_relevance = True
             return
@@ -2814,6 +2848,26 @@ def build_ignorables_mapping(copyrights, holders, authors, urls, emails):
     return {k: v for k, v in sorted(ignorables.items()) if v}
 
 
+def rule_exists(text):
+    """
+    Return the identifier of the existing rule that ``text`` matches exactly
+    (a single hash match with a score of 100), or False otherwise.
+    """
+    from licensedcode.match_hash import MATCH_HASH
+    from licensedcode import cache
+
+    idx = cache.get_index()
+
+    matches = idx.match(query_string=text)
+    # No match, or several matches, cannot be one exact whole-rule match.
+    if not matches or len(matches) > 1:
+        return False
+    match = matches[0]
+    # Report a miss as False, as documented, not as an implicit None.
+    is_exact = match.matcher == MATCH_HASH and match.score() == 100
+    return match.rule.identifier if is_exact else False
+
+
 def find_rule_base_location(name_prefix, rules_directory=rules_data_dir):
     """
     Return a new, unique and non-existing base location in ``rules_directory``
@@ -2842,10 +2896,9 @@ def find_rule_base_location(name_prefix, rules_directory=rules_data_dir):
         idx += 1
 
 
-def get_rules_by_expression(rules_data_dir=rules_data_dir):
+def get_rules_by_identifier(rules_data_dir=rules_data_dir):
     """
-    Get a dictionary (sorted by license_expression) of {license_expression: rules}
-    where `rules` is a list of all rule objects having the `license_expression`.
+    Get a dictionary of {rule_identifier: rule} for all license rules.
     """
     rules = list(load_rules(rules_data_dir=rules_data_dir))
 
@@ -2854,6 +2907,14 @@ def get_rules_by_expression(rules_data_dir=rules_data_dir):
         for rule in rules
     }
 
+    return rules_by_identifier
+
+
+def map_rules_by_expression(rules_by_identifier):
+    """
+    Get a dictionary (sorted by license_expression) of {license_expression: rules}
+    from a dictionary of rules by their identifier.
+    """
     rules_by_expression = defaultdict(list)
 
     for rule in rules_by_identifier.values():
@@ -2862,3 +2923,12 @@ def get_rules_by_expression(rules_data_dir=rules_data_dir):
             rules_by_expression[rule.license_expression].append(rule)
 
     return OrderedDict(sorted(rules_by_expression.items()))
+
+
+def get_rules_by_expression(rules_data_dir=rules_data_dir):
+    """
+    Return a mapping of {license_expression: [rules]} sorted by expression,
+    built from all the rules loaded from ``rules_data_dir``.
+    """
+    by_identifier = get_rules_by_identifier(rules_data_dir)
+    return map_rules_by_expression(rules_by_identifier=by_identifier)