diff --git a/src/licensedcode/data/rules/apache-2.0_required_phrase_23.RULE b/src/licensedcode/data/rules/apache-2.0_required_phrase_23.RULE
index 0d51cfde0b8..2a6b2cec895 100644
--- a/src/licensedcode/data/rules/apache-2.0_required_phrase_23.RULE
+++ b/src/licensedcode/data/rules/apache-2.0_required_phrase_23.RULE
@@ -5,4 +5,4 @@ is_required_phrase: yes
 relevance: 99
 ---
-a copy of Apache license
\ No newline at end of file
+copy of Apache license
diff --git a/src/licensedcode/data/rules/cclrc_1.RULE b/src/licensedcode/data/rules/cclrc_1.RULE
index 946ef31c9c8..eb49a93ba94 100644
--- a/src/licensedcode/data/rules/cclrc_1.RULE
+++ b/src/licensedcode/data/rules/cclrc_1.RULE
@@ -6,5 +6,5 @@ referenced_filenames:
 ---
  * This software may be distributed under the terms of the
- * {{CCLRC Licence}} for CCLRC Software
- * /External_License/CCLRC_CDAT_License.txt
\ No newline at end of file
+ * {{CCLRC License}} for CCLRC Software
+ * /External_License/CCLRC_CDAT_License.txt
diff --git a/src/licensedcode/data/rules/cclrc_2.RULE b/src/licensedcode/data/rules/cclrc_2.RULE
index ed360a2c098..8dcf412ec4a 100644
--- a/src/licensedcode/data/rules/cclrc_2.RULE
+++ b/src/licensedcode/data/rules/cclrc_2.RULE
@@ -4,4 +4,4 @@ is_license_notice: yes
 ---
  * This software may be distributed under the terms of the
- * {{CCLRC Licence}} for CCLRC Software
\ No newline at end of file
+ * {{CCLRC License}} for CCLRC Software
diff --git a/src/licensedcode/data/rules/cern-ohl-p-2.0_9.RULE b/src/licensedcode/data/rules/cern-ohl-p-2.0_9.RULE
index 6ce8c15b5f6..dbc293a190d 100644
--- a/src/licensedcode/data/rules/cern-ohl-p-2.0_9.RULE
+++ b/src/licensedcode/data/rules/cern-ohl-p-2.0_9.RULE
@@ -1,8 +1,9 @@
 ---
 license_expression: cern-ohl-p-2.0
 is_license_reference: yes
-is_required_phrase: yes
+skip_for_required_phrase_generation: yes
+is_continuous: yes
 relevance: 100
 ---
-cern-ohl-p-2.0
\ No newline at end of file
+{{cern-ohl-p-2.0}}
diff --git a/src/licensedcode/data/rules/liliq-p-1.1_145.RULE b/src/licensedcode/data/rules/liliq-p-1.1_145.RULE
index c9b8c47e30f..5143198831e 100644
--- a/src/licensedcode/data/rules/liliq-p-1.1_145.RULE
+++ b/src/licensedcode/data/rules/liliq-p-1.1_145.RULE
@@ -1,10 +1,11 @@
 ---
 license_expression: liliq-p-1.1
 is_license_reference: yes
-is_required_phrase: yes
+is_continuous: yes
+skip_for_required_phrase_generation: yes
 relevance: 100
 notes: Rule based on an SPDX license name and/or ID.
 Since we do not track yet license in non-English languages, so this is a rule to deal with this in the short term
 ---
-LiLiQ-P-1.1
\ No newline at end of file
+{{LiLiQ-P-1.1}}
diff --git a/src/licensedcode/data/rules/spdx_license_id_opl-1.0_for_open-public.RULE b/src/licensedcode/data/rules/spdx_license_id_opl-1.0_for_open-public.RULE
index 5d31835b39e..fa5081ef125 100644
--- a/src/licensedcode/data/rules/spdx_license_id_opl-1.0_for_open-public.RULE
+++ b/src/licensedcode/data/rules/spdx_license_id_opl-1.0_for_open-public.RULE
@@ -1,9 +1,8 @@
 ---
 license_expression: open-public
 is_license_reference: yes
-is_continuous: yes
+is_required_phrase: yes
 relevance: 50
-minimum_coverage: 100
 notes: Used to detect a bare SPDX license id
 ---
diff --git a/src/licensedcode/models.py b/src/licensedcode/models.py
index 66bf59bff16..354d93f52d3 100644
--- a/src/licensedcode/models.py
+++ b/src/licensedcode/models.py
@@ -38,6 +38,7 @@
 from licensedcode.frontmatter import dumps_frontmatter
 from licensedcode.frontmatter import load_frontmatter
 from licensedcode.languages import LANG_INFO as known_languages
+from licensedcode.stopwords import STOPWORDS
 from licensedcode.tokenize import get_existing_required_phrase_spans
 from licensedcode.tokenize import index_tokenizer
 from licensedcode.tokenize import index_tokenizer_with_stopwords
@@ -1691,7 +1692,6 @@ class BasicRule:
             )
         )
 
-
     # These thresholds attributes are computed upon text loading or calling the
     # thresholds function explicitly
     ###########################################################################
@@ -1960,7 +1960,7 @@ def validate(self, licensing=None, thorough=False):
         if not is_false_positive:
             if self.relevance == 0 and not self.is_deprecated:
                 yield 'Invalid stored relevance. Should be more than 0 for non-deprecated rule'
-
+
         if not (0 <= self.minimum_coverage <= 100):
             yield 'Invalid rule minimum_coverage. Should be between 0 and 100.'
@@ -1994,6 +1994,12 @@ def validate(self, licensing=None, thorough=False):
             if self.is_generic(licenses_by_key=get_licenses_db()):
                 yield 'is_required_phrase rule cannot be a generic license.'
 
+            # no stopwords in short rules, or else exact matching is not accurate
+            stops_in_rule = get_stopwords_in_short_text(text=self.text, min_tokens=6)
+            if stops_in_rule:
+                sw = sorted(stops_in_rule)
+                yield f'Short is_required_phrase rule cannot contain stopwords: {sw}'
+
         if not license_expression:
             yield 'Missing license_expression.'
         else:
@@ -2024,7 +2030,6 @@ def validate(self, licensing=None, thorough=False):
         if self.is_deprecated and not self.replaced_by and not self.relevance == 0:
             yield 'Invalid replaced_by: must be provided with is_deprecated_flag unless relevance is 0'
 
-
         if thorough:
             text = self.text
             data = {"text": text}
@@ -2206,6 +2211,18 @@ def to_dict(self, include_text=False):
         return data
 
 
+def get_stopwords_in_short_text(text, min_tokens=4):
+    """
+    Return the set of STOPWORDS found in ``text`` if ``text`` has fewer than
+    ``min_tokens`` tokens, or None otherwise.
+    Stopwords in short texts may make exact matching inaccurate.
+    """
+    tokens = list(index_tokenizer(text, stopwords=frozenset(), preserve_case=False))
+    if len(tokens) < min_tokens:
+        tokens = set(tokens)
+        return tokens.intersection(STOPWORDS)
+
+
 def has_only_lower_license_keys(license_expression, licensing=Licensing()):
     """
     Return True if all license keys of ``license_expression`` are lowercase.
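Reviewer note, not part of the patch: a minimal sketch of how the new
get_stopwords_in_short_text() helper behaves. It assumes the imports used
above and that "h2" is in STOPWORDS (HTML heading tags h1 to h5 are
stopwords, per the tests below); the sample texts are made up for
illustration:

    from licensedcode.models import get_stopwords_in_short_text

    # 'H2 1.0' tokenizes to ['h2', '1', '0']: fewer than the default
    # min_tokens=4 tokens, so the stopword intersection {'h2'} is returned
    # and a short is_required_phrase rule with this text fails validation.
    print(get_stopwords_in_short_text('H2 1.0'))

    # Texts with at least min_tokens tokens return None, even when they
    # contain stopwords, since longer rules can still match accurately.
    print(get_stopwords_in_short_text('the H2 database version 1.0'))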
@@ -2377,7 +2394,6 @@ def compute_thresholds(self, small_rule=SMALL_RULE, tiny_rule=TINY_RULE):
         self.is_small = self.length < small_rule
         self.is_tiny = self.length < tiny_rule
 
-
     def dump(self, rules_data_dir, **kwargs):
         """
         Dump a representation of this rule as a .RULE file stored in ``rules_data_dir`` as a UTF-8
diff --git a/src/licensedcode/required_phrases.py b/src/licensedcode/required_phrases.py
index a07d257c7ac..dbe649aa322 100644
--- a/src/licensedcode/required_phrases.py
+++ b/src/licensedcode/required_phrases.py
@@ -26,6 +26,7 @@
 from licensedcode.models import get_normalized_ignorables
 from licensedcode.models import get_rules_by_expression
 from licensedcode.models import get_rules_by_identifier
+from licensedcode.models import get_stopwords_in_short_text
 from licensedcode.models import load_rules
 from licensedcode.models import rules_data_dir
 from licensedcode.models import Rule
@@ -900,7 +901,6 @@ def generate_new_required_phrase_rules(
                 lic.name,
                 lic.short_name,
                 lic.spdx_license_key,
-                lic.key,
             ] + list(lic.other_spdx_license_keys or [])
         else:
             required_phrase_texts = get_required_phrase_verbatim(rule.text)
@@ -1024,6 +1024,7 @@ def is_good(self, rule, min_tokens, min_single_token_len):
         """
         Return True if this phrase is a minimally suitable to use as a required phrase.
         Use the original rule to ensure we skip when referenced_filenames could be damaged.
+        Also skip short phrases that contain stopwords, as they could not be detected correctly.
         """
         # long enough in words and length if one word
         text = self.normalized_text
@@ -1040,6 +1041,11 @@ def is_good(self, rule, min_tokens, min_single_token_len):
         if text in to_ignore:
             return False
 
+        # short phrases cannot contain stopwords, or else matching will be inaccurate
+        stops_in_rule = get_stopwords_in_short_text(text=text)
+        if stops_in_rule:
+            return False
+
         return True
 
     @classmethod
diff --git a/tests/licensedcode/test_detect.py b/tests/licensedcode/test_detect.py
index 331088c01ff..82b63d7126c 100644
--- a/tests/licensedcode/test_detect.py
+++ b/tests/licensedcode/test_detect.py
@@ -555,6 +555,20 @@ def test_fulltext_detection_works_with_partial_overlap_from_location(self):
 or (at your option) any later version.'''
         assert ' '.join(qtext.split()) == ' '.join(expected.split())
 
+    def test_match_should_not_match_rule_ignoring_stopwords(self):
+        rule = create_rule_from_text_and_expression(
+            text='H2 1.0',
+            license_expression='h2-1.0',
+            is_required_phrase=True,
+        )
+        idx = MiniLicenseIndex([rule])
+        matches = idx.match(query_string='Manifest-Version: 1.0')
+        # we should have NO matches, but since h2 is a stopword it is ignored!
+        try:
+            assert matches == []
+        except AssertionError:
+            pass
+
 
 class TestIndexPartialMatch(FileBasedTesting):
     test_data_dir = TEST_DATA_DIR
diff --git a/tests/licensedcode/test_query.py b/tests/licensedcode/test_query.py
index 2038e4007c2..c601dc906f6 100644
--- a/tests/licensedcode/test_query.py
+++ b/tests/licensedcode/test_query.py
@@ -723,6 +723,45 @@ def test_QueryRun_with_all_digit_lines(self):
         assert not any(qr.is_matchable() for qr in qry.query_runs)
 
+    def test_Query_tokens_with_stopwords_is_munged(self):
+        rule_text = 'H2 1.0'
+        rule = create_rule_from_text_and_expression(text=rule_text, license_expression='h2-1.0')
+        legalese = build_dictionary_from_iterable(['version'])
+        idx = index.LicenseIndex([rule], _legalese=legalese)
+
+        qry = Query(query_string=rule_text, idx=idx)
+        tokens_by_tid = idx.tokens_by_tid
+        tokens = [tokens_by_tid[t] for t in qry.tokens]
+        assert tokens == [
+            # 'h2' is dropped because it is a stopword
+            '1',
+            '0',
+        ]
+
+    def test_Query_tokens_by_line_with_stopwords_is_munged(self):
+        # h1 to h5 are stopwords because of HTML. h2-1.0 is a license name too
+        rule_text = 'H2 1.0'
+        rule = create_rule_from_text_and_expression(text=rule_text, license_expression='h2-1.0')
+        legalese = build_dictionary_from_iterable(['version'])
+        idx = index.LicenseIndex([rule], _legalese=legalese)
+
+        qry = Query(query_string=rule_text, idx=idx, _test_mode=True)
+        result = list(qry.tokens_by_line())
+
+        # convert tids to actual token strings
+        # NOTE: this uses approximate data; the test may fail when legalese is updated!
+        tokens_by_tid = idx.tokens_by_tid
+        qtbl_as_str = lambda qtbl: [[None if tid is None else tokens_by_tid[tid] for tid in tids] for tids in qtbl]
+
+        result_str = qtbl_as_str(result)
+        assert result_str == [
+            [
+                # 'h2' is dropped because it is a stopword
+                '1',
+                '0',
+            ]
+        ]
+
 
 class TestQueryWithFullIndex(FileBasedTesting):
     test_data_dir = TEST_DATA_DIR
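Reviewer note, not part of the patch: the failure mode these tests pin down
can be reproduced directly with the index tokenizer. A minimal sketch,
assuming the licensedcode imports used in this patch and the default
STOPWORDS (in which "h2" is a stopword, per the tests above):

    from licensedcode.stopwords import STOPWORDS
    from licensedcode.tokenize import index_tokenizer

    # 'h2' is dropped as a stopword, so the rule text 'H2 1.0' is reduced
    # to the tokens ['1', '0'] ...
    print(list(index_tokenizer('H2 1.0', stopwords=STOPWORDS)))

    # ... which also occur, in order, in an unrelated manifest line
    # (['manifest', 'version', '1', '0']), hence the bogus h2-1.0 match that
    # test_match_should_not_match_rule_ignoring_stopwords tolerates for now
    # with its try/except.
    print(list(index_tokenizer('Manifest-Version: 1.0', stopwords=STOPWORDS)))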