diff --git a/etc/scripts/licenses/buildrules.py b/etc/scripts/licenses/buildrules.py index 731702b0e04..38161e8bb56 100644 --- a/etc/scripts/licenses/buildrules.py +++ b/etc/scripts/licenses/buildrules.py @@ -16,6 +16,7 @@ from licensedcode import models from licensedcode import match_hash from licensedcode import frontmatter +from licensedcode.models import rule_exists from license_expression import Licensing """ @@ -129,23 +130,6 @@ def load_data(location="00-new-licenses.txt"): return rules -def rule_exists(text): - """ - Return the matched rule identifier if the text is an existing rule matched - exactly, False otherwise. - """ - idx = cache.get_index() - - matches = idx.match(query_string=text) - if not matches: - return False - if len(matches) > 1: - return False - match = matches[0] - if match.matcher == match_hash.MATCH_HASH and match.score() == 100: - return match.rule.identifier - - def all_rule_by_tokens(): """ Return a mapping of {tuples of tokens: rule id}, with one item for each diff --git a/etc/scripts/licenses/report_license_rules.py b/etc/scripts/licenses/report_license_rules.py index 8e8ff04abfb..cf9b89a4758 100644 --- a/etc/scripts/licenses/report_license_rules.py +++ b/etc/scripts/licenses/report_license_rules.py @@ -62,6 +62,8 @@ "is_license_reference", "is_license_intro", "is_license_clue", + "is_required_phrase", + "skip_creating_required_phrases", "is_deprecated", "has_unknown", "only_known_words", diff --git a/src/licensedcode/models.py b/src/licensedcode/models.py index 7d1ef92449f..5d87d581600 100644 --- a/src/licensedcode/models.py +++ b/src/licensedcode/models.py @@ -1425,13 +1425,33 @@ class BasicRule: 'Mutually exclusive from any is_license_* flag') ) + is_required_phrase = attr.ib( + default=False, + repr=False, + metadata=dict( + help='True if this is rule text is a required phrase ' + 'A required phrase is often a part of another larger rule text ' + 'but is an essential section of the rule text which must be ' + 'present in the 
case of partial matches, otherwise the match ' + 'will be a false positive and misleading. ') + ) + + skip_creating_required_phrases = attr.ib( + default=False, + repr=False, + metadata=dict( + help='True if this rule needs to be skipped while creating ' + 'required phrase rules. Required phrase rules are created out ' + 'of other rule texts which have marked required phrases. ') + ) + language = attr.ib( default='en', repr=False, metadata=dict( help='Two-letter ISO 639-1 language code if this license text is ' 'not in English. See https://en.wikipedia.org/wiki/ISO_639-1 .') - ) + ) minimum_coverage = attr.ib( default=0, @@ -1793,22 +1813,27 @@ def has_unknown(self): # license flag instead return self.license_expression and 'unknown' in self.license_expression - def validate(self, licensing=None, thorough=False): - """ - Validate this rule using the provided ``licensing`` Licensing and yield - one error message for each type of error detected. - """ - is_false_positive = self.is_false_positive - - license_flags = ( + @property + def license_flags(self): + return ( self.is_license_notice, self.is_license_text, self.is_license_reference, self.is_license_tag, self.is_license_intro, self.is_license_clue, + self.is_required_phrase, ) + def validate(self, licensing=None, thorough=False): + """ + Validate this rule using the provided ``licensing`` Licensing and yield + one error message for each type of error detected. 
+ """ + is_false_positive = self.is_false_positive + + license_flags = self.license_flags + has_license_flags = any(license_flags) has_many_license_flags = len([l for l in license_flags if l]) != 1 @@ -1961,6 +1986,7 @@ def get_flags_mapping(self): 'is_license_tag', 'is_license_intro', 'is_license_clue', + 'is_required_phrase', 'is_continuous', ] @@ -1987,6 +2013,8 @@ def to_reference(self): data['is_license_tag'] = self.is_license_tag data['is_license_intro'] = self.is_license_intro data['is_license_clue'] = self.is_license_clue + data['is_required_phrase'] = self.is_required_phrase + data['skip_creating_required_phrases'] = self.skip_creating_required_phrases data['is_continuous'] = self.is_continuous data['is_builtin'] = self.is_builtin data['is_from_license'] = self.is_from_license @@ -2019,6 +2047,7 @@ def to_dict(self, include_text=False): flags = ( 'is_false_positive', + 'is_required_phrase', 'is_license_text', 'is_license_notice', 'is_license_reference', @@ -2026,6 +2055,7 @@ def to_dict(self, include_text=False): 'is_license_intro', 'is_license_clue', 'is_continuous', + 'skip_creating_required_phrases', 'is_deprecated' ) @@ -2206,11 +2236,11 @@ def build_key_phrase_spans(self): Return a list of Spans marking key phrases token positions of that must be present for this rule to be matched. 
""" - from licensedcode.required_phrases import get_key_phrase_spans + from licensedcode.required_phrases import get_key_phrase_spans_or_tokens if self.is_from_license: return [] try: - return list(get_key_phrase_spans(self.text)) + return list(get_key_phrase_spans_or_tokens(self.text)) except Exception as e: raise InvalidRule(f'Invalid rule: {self}') from e @@ -2241,7 +2271,7 @@ def compute_thresholds(self, small_rule=SMALL_RULE): self.is_small = self.length < small_rule - def dump(self, rules_data_dir): + def dump(self, rules_data_dir, **kwargs): """ Dump a representation of this rule as a .RULE file stored in ``rules_data_dir`` as a UTF-8 file having: @@ -2258,6 +2288,8 @@ def dump(self, rules_data_dir): rule_file = self.rule_file(rules_data_dir=rules_data_dir) metadata = self.to_dict() + if kwargs: + metadata.update(kwargs) content = self.text output = dumps_frontmatter(content=content, metadata=metadata) with open(rule_file, 'w') as of: @@ -2300,6 +2332,8 @@ def load(self, rule_file, with_checks=True): self.license_expression = data.get('license_expression') self.is_false_positive = data.get('is_false_positive', False) + self.is_required_phrase = data.get('is_required_phrase', False) + self.skip_creating_required_phrases = data.get('skip_creating_required_phrases', False) relevance = as_int(float(data.get('relevance') or 0)) # Keep track if we have a stored relevance of not. @@ -2359,7 +2393,7 @@ def set_relevance(self): - relevance is computed based on the rule length """ - if self.is_false_positive: + if self.is_false_positive or self.is_required_phrase: self.relevance = 100 self.has_stored_relevance = True return @@ -2814,6 +2848,26 @@ def build_ignorables_mapping(copyrights, holders, authors, urls, emails): return {k: v for k, v in sorted(ignorables.items()) if v} +def rule_exists(text): + """ + Return the matched rule if the text is an existing rule matched + exactly, False otherwise. 
+ """ + from licensedcode.match_hash import MATCH_HASH + from licensedcode import cache + + idx = cache.get_index() + + matches = idx.match(query_string=text) + if not matches: + return False + if len(matches) > 1: + return False + match = matches[0] + if match.matcher == MATCH_HASH and match.score() == 100: + return match.rule.identifier + + def find_rule_base_location(name_prefix, rules_directory=rules_data_dir): """ Return a new, unique and non-existing base location in ``rules_directory`` @@ -2842,10 +2896,9 @@ def find_rule_base_location(name_prefix, rules_directory=rules_data_dir): idx += 1 -def get_rules_by_expression(rules_data_dir=rules_data_dir): +def get_rules_by_identifier(rules_data_dir=rules_data_dir): """ - Get a dictionary (sorted by license_expression) of {license_expression: rules} - where `rules` is a list of all rule objects having the `license_expression`. + Get a dictionary of {rule_identifier: rule} for all license rules. """ rules = list(load_rules(rules_data_dir=rules_data_dir)) @@ -2854,6 +2907,14 @@ def get_rules_by_expression(rules_data_dir=rules_data_dir): for rule in rules } + return rules_by_identifier + + +def map_rules_by_expression(rules_by_identifier): + """ + Get a dictionary (sorted by license_expression) of {license_expression: rules} + from a dictionary of rules by their identifier. + """ rules_by_expression = defaultdict(list) for rule in rules_by_identifier.values(): @@ -2862,3 +2923,12 @@ def get_rules_by_expression(rules_data_dir=rules_data_dir): rules_by_expression[rule.license_expression].append(rule) return OrderedDict(sorted(rules_by_expression.items())) + + +def get_rules_by_expression(rules_data_dir=rules_data_dir): + """ + Get a dictionary (sorted by license_expression) of {license_expression: rules} + where `rules` is a list of all rule objects having the `license_expression`. 
+ """ + rules_by_identifier = get_rules_by_identifier(rules_data_dir) + return map_rules_by_expression(rules_by_identifier) diff --git a/src/licensedcode/required_phrases.py b/src/licensedcode/required_phrases.py index 718728c9e8c..a14a70a0f6d 100644 --- a/src/licensedcode/required_phrases.py +++ b/src/licensedcode/required_phrases.py @@ -8,6 +8,7 @@ # See https://aboutcode.org for more information about nexB OSS projects. # +import attr import os import click @@ -15,11 +16,15 @@ from licensedcode import TINY_RULE from commoncode.cliutils import PluggableCommandLineOption +from licensedcode.models import map_rules_by_expression +from licensedcode.models import get_rules_by_identifier from licensedcode.models import get_rules_by_expression from licensedcode.models import load_licenses from licensedcode.models import InvalidRule from licensedcode.models import rules_data_dir from licensedcode.models import Rule +from licensedcode.models import rule_exists +from licensedcode.models import find_rule_base_location from licensedcode.spans import Span from licensedcode.tokenize import key_phrase_tokenizer @@ -32,38 +37,44 @@ -def get_key_phrase_spans(text): +def get_key_phrase_spans_or_tokens(text, yield_tokens=False): """ - Yield Spans of key phrase token positions found in the rule ``text``. + Yield Spans of key phrase token positions or if ``yield_tokens`` is True + yield lists of tokens for each key phrase found in the rule ``text``. Tokens form a key phrase when enclosed in {{double curly braces}}. 
For example: >>> text = 'This is enclosed in {{double curly braces}}' >>> # 0 1 2 3 4 5 6 - >>> x = list(get_key_phrase_spans(text)) + >>> x = list(get_key_phrase_spans_or_tokens(text)) >>> assert x == [Span(4, 6)], x + >>> text = 'This is enclosed in {{double curly braces}}' + >>> # 0 1 2 3 4 5 6 + >>> x = list(get_key_phrase_spans_or_tokens(text=text, yield_tokens=True)) + >>> assert x == ['double', 'curly', 'braces'], x + >>> text = 'This is {{enclosed}} a {{double curly braces}} or not' >>> # 0 1 2 SW 3 4 5 6 7 - >>> x = list(get_key_phrase_spans(text)) + >>> x = list(get_key_phrase_spans_or_tokens(text)) >>> assert x == [Span(2), Span(3, 5)], x >>> text = 'This {{is}} enclosed a {{double curly braces}} or not' >>> # 0 1 2 SW 3 4 5 6 7 - >>> x = list(get_key_phrase_spans(text)) + >>> x = list(get_key_phrase_spans_or_tokens(text)) >>> assert x == [Span([1]), Span([3, 4, 5])], x >>> text = '{{AGPL-3.0 GNU Affero General Public License v3.0}}' >>> # 0 1 2 3 4 5 6 7 8 9 - >>> x = list(get_key_phrase_spans(text)) + >>> x = list(get_key_phrase_spans_or_tokens(text)) >>> assert x == [Span(0, 9)], x - >>> assert list(get_key_phrase_spans('{This}')) == [] + >>> assert list(get_key_phrase_spans_or_tokens('{This}')) == [] >>> def check_exception(text): ... try: - ... return list(get_key_phrase_spans(text)) + ... return list(get_key_phrase_spans_or_tokens(text)) ... except InvalidRule: ... 
pass @@ -97,7 +108,10 @@ def get_key_phrase_spans(text): elif token == KEY_PHRASE_CLOSE: if in_key_phrase: if key_phrase: - yield Span(key_phrase) + if yield_tokens: + yield key_phrase.copy() + else: + yield Span(key_phrase) key_phrase.clear() else: raise InvalidRule('Invalid rule with empty key phrase {{}} braces', text) @@ -107,13 +121,207 @@ def get_key_phrase_spans(text): continue else: if in_key_phrase: - key_phrase.append(ipos) + if yield_tokens: + key_phrase.append(token) + else: + key_phrase.append(ipos) ipos += 1 if key_phrase or in_key_phrase: raise InvalidRule(f'Invalid rule with dangling key phrase missing final closing braces', text) +def get_required_phrase_texts_from_rule(rule): + for key_phrase_tokens in get_key_phrase_spans_or_tokens( + text=rule.text, + yield_tokens=True, + ): + yield " ".join(key_phrase_tokens) + + +@attr.s +class RequiredPhraseDetails: + + license_expression = attr.ib( + default=None, + metadata=dict( + help='A license expression string for this particular required phrase.') + ) + + rule = attr.ib( + default=None, + metadata=dict( + help='The Rule object for this particular required phrase rule.') + ) + + required_phrase_text = attr.ib( + default=None, + metadata=dict( + help='Normalized required phrase text.') + ) + + sources = attr.ib( + default=attr.Factory(list), + metadata=dict( + help='List of all rule identifiers where this required phrase is present.' + ) + ) + + length = attr.ib( + default=0, + metadata=dict( + help='Length of text for this required phrase text (used to sort).' 
+ ) + ) + + @classmethod + def create_required_phrase_details( + cls, + license_expression, + rule, + required_phrase_text, + sources, + length, + ): + + base_name = f"{rule.license_expression}_required_phrase" + base_loc = find_rule_base_location(name_prefix=base_name) + identifier = f"{base_loc}.RULE" + + rule = Rule( + license_expression=license_expression, + identifier=identifier, + text=required_phrase_text, + is_required_phrase=True, + ) + rule.dump(rules_data_dir) + + return cls( + license_expression=license_expression, + rule=rule, + required_phrase_text=required_phrase_text, + sources=sources, + length=length, + ) + + def update_sources(self, source_identifier): + if not source_identifier in self.sources: + self.sources.append(source_identifier) + + +@attr.s +class ListOfRequiredPhrases: + + required_phrases = attr.ib( + default=attr.Factory(list), + metadata=dict( + help='A list of RequiredPhraseDetails objects for all the required phrases.') + ) + + def match_required_phrase_present(self, required_phrase_text, rules_by_id=None): + # check in all rules which are in the index + rule_id = rule_exists(text=required_phrase_text) + if not rule_id: + # check in all rules which are in the collected list of required phrases + for required_phrase in self.required_phrases: + if required_phrase.required_phrase_text == required_phrase_text: + rule = required_phrase.rule + return rule + + if rule_id and rules_by_id: + rule = rules_by_id.get(rule_id) + return rule + + def update_required_phrase_sources(self, rule): + + for required_phrase in self.required_phrases: + if required_phrase.rule.identifier == rule.identifier: + required_phrase.update_sources(rule.identifier) + return + + #TODO: + # Update old rules which are required phrases + + def sort_required_phrases(self): + self.required_phrases = sorted( + self.required_phrases, + key=lambda x: x.length, + reverse=True, + ) + + +def get_required_phrases_in_rules( + license_expression=None, + 
delete_required_phrases_debug=False,
+    write_required_phrases=False,
+):
+
+    rules_by_identifier = get_rules_by_identifier()
+    rules_by_expression = map_rules_by_expression(rules_by_identifier)
+
+    #
+    required_phrases_by_expression = {}
+
+    if license_expression:
+        rules_by_expression = {license_expression: rules_by_expression[license_expression]}
+    else:
+        rules_by_expression = rules_by_expression
+
+    licensing = Licensing()
+
+    # collect and create required phrase rules
+    for license_expression, rules in rules_by_expression.items():
+
+        license_keys = licensing.license_keys(license_expression)
+        if len(license_keys) != 1:
+            continue
+
+        required_phrases_list = ListOfRequiredPhrases()
+
+        for rule in rules:
+            required_phrase_texts_in_rule = get_required_phrase_texts_from_rule(rule)
+
+            for required_phrase_text in required_phrase_texts_in_rule:
+                required_phrase_rule = required_phrases_list.match_required_phrase_present(
+                    required_phrase_text=required_phrase_text,
+                    rules_by_id=rules_by_identifier,
+                )
+                if not required_phrase_rule:
+                    required_phrase_detail = RequiredPhraseDetails.create_required_phrase_details(
+                        license_expression=license_expression, rule=rule,
+                        required_phrase_text=required_phrase_text,
+                        sources=[rule.identifier],
+                        length=len(required_phrase_text),
+                    )
+                    required_phrases_list.required_phrases.append(required_phrase_detail)
+                else:
+                    required_phrases_list.update_required_phrase_sources(rule)
+
+        required_phrases_list.sort_required_phrases()
+        required_phrases_by_expression[license_expression] = required_phrases_list
+
+    # add required phrases to rules from other rules
+    for license_expression, rules in rules_by_expression.items():
+
+        if not license_expression in required_phrases_by_expression:
+            continue
+
+        required_phrases_for_expression = required_phrases_by_expression.get(license_expression)
+        add_key_phrases_for_required_phrases(
+            rules=rules,
+            required_phrases=required_phrases_for_expression.required_phrases,
+        )
+
+    if write_required_phrases:
+        for required_phrases_list in required_phrases_by_expression.values():
+            for required_phrase_detail in required_phrases_list.required_phrases:
+                if required_phrase_detail.sources:
+                    required_phrase_detail.rule.dump(
+                        rules_data_dir=rules_data_dir,
+                        sources=required_phrase_detail.sources
+                    )
+
+
 def add_key_phrases_for_license_fields(licence_object, rules):
 
     license_fields_mapping_by_order = {
@@ -129,44 +337,62 @@ def add_key_phrases_for_license_fields(licence_object, rules):
             continue
 
         for license_field_value in license_fields_mapping_by_order.values():
+            add_key_phrase_to_rule(rule=rule, key_phrase=license_field_value)
 
-            # Reload from file as there could be changes from other license fields
-            rule_file = os.path.join(rules_data_dir, rule.identifier)
-            reloaded_rule = Rule.from_file(rule_file)
 
-            # we get spans for name/short_name if they exist
-            new_key_phrase_spans = return_spans_for_key_phrase_in_text(
-                text=reloaded_rule.text,
-                key_phrase=license_field_value
-            )
+def add_key_phrases_for_required_phrases(required_phrases, rules):
 
-            # we get spans for already existing key phrases and ignorables
-            ignorable_spans = get_ignorable_spans(reloaded_rule)
-            old_key_phrase_spans = reloaded_rule.build_key_phrase_spans()
+    for rule in rules:
+        # skip small rules
+        if len(rule.text) < TINY_RULE:
+            continue
 
-            # we verify whether there are spans which overlap with the
-            # already present key phrases or ignorables
-            spans_to_add = list(
-                get_non_overlapping_spans(
-                    old_key_phrase_spans=old_key_phrase_spans + ignorable_spans,
-                    new_key_phrase_spans=new_key_phrase_spans
-                )
+        for required_phrase in required_phrases:
+            add_key_phrase_to_rule(
+                rule=rule,
+                key_phrase=required_phrase.required_phrase_text,
             )
 
-            text_rule = reloaded_rule.text
-
-            # we add key phrase markers for the non-overlapping spans
-            for span_to_add in spans_to_add:
-                text_rule = add_key_phrase_markers(
-                    text=text_rule,
-                    key_phrase_span=span_to_add
-                )
 
-            # write the rule on disk if there are any updates
-            if text_rule != 
reloaded_rule.text: - click.echo(f"Updating rule: {reloaded_rule.identifier}") - reloaded_rule.text = text_rule - reloaded_rule.dump(rules_data_dir) +def add_key_phrase_to_rule(rule, key_phrase): + + # Reload from file as there could be changes from other license fields + rule_file = os.path.join(rules_data_dir, rule.identifier) + reloaded_rule = Rule.from_file(rule_file) + + # we get spans for name/short_name if they exist + new_key_phrase_spans = return_spans_for_key_phrase_in_text( + text=reloaded_rule.text, + key_phrase=key_phrase, + ) + + # we get spans for already existing key phrases and ignorables + ignorable_spans = get_ignorable_spans(reloaded_rule) + old_key_phrase_spans = reloaded_rule.build_key_phrase_spans() + + # we verify whether there are spans which overlap with the + # already present key phrases or ignorables + spans_to_add = list( + get_non_overlapping_spans( + old_key_phrase_spans=old_key_phrase_spans + ignorable_spans, + new_key_phrase_spans=new_key_phrase_spans + ) + ) + + text_rule = reloaded_rule.text + + # we add key phrase markers for the non-overlapping spans + for span_to_add in spans_to_add: + text_rule = add_key_phrase_markers( + text=text_rule, + key_phrase_span=span_to_add + ) + + # write the rule on disk if there are any updates + if text_rule != reloaded_rule.text: + click.echo(f"Updating rule: {reloaded_rule.identifier}") + reloaded_rule.text = text_rule + reloaded_rule.dump(rules_data_dir) def add_required_phrases_to_rules(license_expression=None, reindex=False, cli=False): @@ -224,12 +450,36 @@ def add_required_phrases_to_rules(license_expression=None, reindex=False, cli=Fa help="Also reindex the license/rules to check for inconsistencies. 
", cls=PluggableCommandLineOption, ) +@click.option( + "-d", + "--delete-required-phrases-debug", + is_flag=True, + default=False, + help="Write into their corresponding rule files the sources for all required phrase rules.", + cls=PluggableCommandLineOption, +) +@click.option( + "-w", + "--write-required-phrases", + is_flag=True, + default=False, + help="Also reindex the license/rules to check for inconsistencies. ", + cls=PluggableCommandLineOption, +) @click.help_option("-h", "--help") -def add_required_phrases(license_expression, reindex): +def add_required_phrases(license_expression, reindex, delete_required_phrases_debug, write_required_phrases): """ For all rules with the `license_expression`, add required phrases from the license fields. """ + + # creates a list of all required phrases and adds rule files for them + get_required_phrases_in_rules( + delete_required_phrases_debug=delete_required_phrases_debug, + write_required_phrases=write_required_phrases, + ) + + # Marks required phrases in already present rules add_required_phrases_to_rules( license_expression=license_expression, reindex=reindex, diff --git a/tests/licensedcode/test_models.py b/tests/licensedcode/test_models.py index 7e34a6f4611..0f15b906cdc 100644 --- a/tests/licensedcode/test_models.py +++ b/tests/licensedcode/test_models.py @@ -228,6 +228,7 @@ def test_rules_have_only_one_flag_of_bool_type(self): r.is_license_intro, r.is_license_clue, r.is_false_positive, + r.is_required_phrases, ] number_of_flags_set = 0 for rule_flag in rule_flags: diff --git a/tests/licensedcode/test_required_phrases.py b/tests/licensedcode/test_required_phrases.py index 2263ef0a009..6af8af254a2 100644 --- a/tests/licensedcode/test_required_phrases.py +++ b/tests/licensedcode/test_required_phrases.py @@ -10,57 +10,72 @@ import os from unittest import TestCase as TestCaseClass -from licensedcode.required_phrases import get_key_phrase_spans +from licensedcode.required_phrases import get_key_phrase_spans_or_tokens 
+from licensedcode.required_phrases import get_required_phrase_texts_from_rule +from licensedcode.models import Rule from licensedcode.models import InvalidRule from licensedcode.spans import Span class TestGetKeyPhrases(TestCaseClass): + text = ( + 'This released software is {{released}} by under {{the MIT license}}. ' + 'Which is a license originating at Massachusetts Institute of Technology (MIT).' + ) def test_get_key_phrases_yields_spans(self): - text = ( - 'This released software is {{released}} by under {{the MIT license}}. ' - 'Which is a license originating at Massachusetts Institute of Technology (MIT).' - ) - - key_phrase_spans = get_key_phrase_spans(text) + key_phrase_spans = get_key_phrase_spans_or_tokens(self.text) assert list(key_phrase_spans) == [Span(4), Span(7, 9)] + def test_get_key_phrase_spans_or_tokens_yields_tokens(self): + key_phrase_spans = get_key_phrase_spans_or_tokens(text=self.text, yield_tokens=True) + assert list(key_phrase_spans) == [['released'], ['the', 'mit', 'license']] + + def test_get_required_phrase_texts_from_rule_works(self): + rule = Rule( + identifier="test_required_phrase.RULE", + license_expression="mit", + text=self.text, + is_license_notice=True, + ) + key_phrase_texts = get_required_phrase_texts_from_rule(rule) + assert list(key_phrase_texts) == ['released', 'the mit license'] + def test_get_key_phrases_raises_exception_key_phrase_markup_is_not_closed(self): text = 'This software is {{released by under the MIT license.' try: - list(get_key_phrase_spans(text)) + list(get_key_phrase_spans_or_tokens(text)) raise Exception('Exception should be raised') except InvalidRule: pass def test_get_key_phrases_ignores_stopwords_in_positions(self): text = 'The word comma is a stop word so comma does not increase the span position {{MIT license}}.' 
- key_phrase_spans = get_key_phrase_spans(text) + key_phrase_spans = get_key_phrase_spans_or_tokens(text) assert list(key_phrase_spans) == [Span(11, 12)] def test_get_key_phrases_yields_spans_without_stop_words(self): text = 'This released software is {{released span}} by under {{the MIT quot license}}.' - key_phrase_spans = get_key_phrase_spans(text) + key_phrase_spans = get_key_phrase_spans_or_tokens(text) assert list(key_phrase_spans) == [Span(4), Span(7, 9)] def test_get_key_phrases_does_not_yield_empty_spans(self): text = 'This released software {{comma}} is {{}} by under {{the MIT license}}.' try: - list(get_key_phrase_spans(text)) + list(get_key_phrase_spans_or_tokens(text)) raise Exception('Exception should be raised') except InvalidRule: pass def test_get_key_phrases_only_considers_outer_key_phrase_markup(self): text = 'This released {{{software under the MIT}}} license.' - key_phrase_spans = get_key_phrase_spans(text) + key_phrase_spans = get_key_phrase_spans_or_tokens(text) assert list(key_phrase_spans) == [Span(2, 5)] def test_get_key_phrases_ignores_nested_key_phrase_markup(self): text = 'This released {{software {{under the}} MIT}} license.' try: - list(get_key_phrase_spans(text)) + list(get_key_phrase_spans_or_tokens(text)) raise Exception('Exception should be raised') except InvalidRule: pass diff --git a/tests/licensedcode/test_tokenize.py b/tests/licensedcode/test_tokenize.py index 5b36ec59bc4..c04dc428c69 100644 --- a/tests/licensedcode/test_tokenize.py +++ b/tests/licensedcode/test_tokenize.py @@ -485,6 +485,13 @@ def test_key_phrase_tokenizer_returns_key_phrase_markup_as_tokens_for_multiple_t 'binary', '}}', 'is', 'permitted', ] + def test_key_phrase_tokenizer_returns_key_phrase_for_multiple_key_phrases(self): + text = 'Redistribution and {{use}} in {{binary}} is permitted.' 
+ assert list(key_phrase_tokenizer(text)) == [ + 'redistribution', 'and', '{{', 'use', '}}', 'in', + '{{', 'binary', '}}', 'is', 'permitted', + ] + def test_key_phrase_tokenizer_returns_key_phrase_markup_as_tokens_after_newline(self): text = '{{IS_RIGHT\nThis program is distributed under GPL\n}}IS_RIGHT' assert list(key_phrase_tokenizer(text)) == [