From b1cf4aa16386e447b024b2fd3f5386251560628d Mon Sep 17 00:00:00 2001 From: Kevin Ji Date: Sat, 21 May 2022 15:51:01 -0400 Subject: [PATCH] Add support for external licenses in scans #480 This adds `-dir` or `--additional_directories` as a command line option in license detection. This allows users to specify paths to directories of licenses and rules they'd like to use during license detection, but would not like to add to the ScanCode database of licenses. Each additional directory is expected to contain a `licenses` subdirectory and a `rules` subdirectory. This involves adding a new option in `licensedcode/plugin_license.py`, and this option is used as a parameter in `scancode/api.py`. In this approach, the licenses and rules contained in these additional directories are combined with the existing licenses and rules in the ScanCode database to produce a single index. The code for this is found in `licensedcode/cache.py` and the helper methods for loading these licenses and rules are found in `licensedcode/models.py`. This commit also includes a unit test, found in `tests/licensedcode/test_plugin_license.py`, that verifies that license detection succeeds with an additional directory. Part of the setup for the unit test and future tests involves creating a new directory in `tests/licensedcode/data` that contains sample external licenses used in the unit tests. 
Signed-off-by: Kevin Ji --- src/licensedcode/cache.py | 62 +++++++++++--- src/licensedcode/models.py | 58 +++++++++++++ src/licensedcode/plugin_license.py | 14 +++- src/scancode/api.py | 4 +- .../example1/licenses/example1.LICENSE | 1 + .../example1/licenses/example1.yml | 5 ++ .../example1/rules/example1.RULE | 1 + .../example1/rules/example1.yml | 2 + .../external_licenses/scan.expected.json | 81 +++++++++++++++++++ .../external_licenses/scan/license.txt | 1 + tests/licensedcode/test_plugin_license.py | 17 ++++ tests/scancode/data/help/help.txt | 39 ++++----- 12 files changed, 256 insertions(+), 29 deletions(-) create mode 100644 tests/licensedcode/data/example_external_licenses/example1/licenses/example1.LICENSE create mode 100644 tests/licensedcode/data/example_external_licenses/example1/licenses/example1.yml create mode 100644 tests/licensedcode/data/example_external_licenses/example1/rules/example1.RULE create mode 100644 tests/licensedcode/data/example_external_licenses/example1/rules/example1.yml create mode 100644 tests/licensedcode/data/plugin_license/external_licenses/scan.expected.json create mode 100644 tests/licensedcode/data/plugin_license/external_licenses/scan/license.txt diff --git a/src/licensedcode/cache.py b/src/licensedcode/cache.py index 9b244728143..445ddd96dc3 100644 --- a/src/licensedcode/cache.py +++ b/src/licensedcode/cache.py @@ -29,6 +29,7 @@ # global in-memory cache of the LicenseCache _LICENSE_CACHE = None +_CACHED_DIRECTORIES = [] LICENSE_INDEX_LOCK_TIMEOUT = 60 * 4 LICENSE_INDEX_DIR = 'license_index' @@ -58,6 +59,7 @@ def load_or_build( timeout=LICENSE_INDEX_LOCK_TIMEOUT, licenses_data_dir=None, rules_data_dir=None, + additional_directories=None, ): """ Load or build and save and return a LicenseCache object. 
@@ -92,6 +94,8 @@ def load_or_build( from licensedcode.models import licenses_data_dir as ldd from licensedcode.models import rules_data_dir as rdd from licensedcode.models import load_licenses + from licensedcode.models import load_licenses_from_multiple_dirs + from licensedcode.models import get_license_dirs from scancode import lockfile licenses_data_dir = licenses_data_dir or ldd @@ -106,13 +110,21 @@ def load_or_build( # Here, the cache is either stale or non-existing: we need to # rebuild all cached data (e.g. mostly the index) and cache it - licenses_db = load_licenses(licenses_data_dir=licenses_data_dir) + if additional_directories: + additional_license_dirs = get_license_dirs(additional_dirs=additional_directories) + combined_directories = [licenses_data_dir] + additional_license_dirs + licenses_db = load_licenses_from_multiple_dirs(license_directories=combined_directories) + else: + licenses_db = load_licenses(licenses_data_dir=licenses_data_dir) + # create a single merged index containing license data from licenses_data_dir + # and data from additional directories index = build_index( licenses_db=licenses_db, licenses_data_dir=licenses_data_dir, rules_data_dir=rules_data_dir, index_all_languages=index_all_languages, + additional_directories=additional_directories, ) spdx_symbols = build_spdx_symbols(licenses_db=licenses_db) @@ -143,6 +155,7 @@ def build_index( licenses_data_dir=None, rules_data_dir=None, index_all_languages=False, + additional_directories=None, ): """ Return an index built from rules and licenses directories @@ -151,19 +164,35 @@ def build_index( Otherwise, only include the English license texts and rules (the default) """ from licensedcode.index import LicenseIndex + from licensedcode.models import get_license_dirs + from licensedcode.models import get_rule_dirs from licensedcode.models import get_rules + from licensedcode.models import get_rules_from_multiple_dirs from licensedcode.models import get_all_spdx_key_tokens from 
licensedcode.models import get_license_tokens from licensedcode.models import licenses_data_dir as ldd from licensedcode.models import rules_data_dir as rdd from licensedcode.models import load_licenses + from licensedcode.models import load_licenses_from_multiple_dirs from licensedcode.legalese import common_license_words licenses_data_dir = licenses_data_dir or ldd rules_data_dir = rules_data_dir or rdd - licenses_db = licenses_db or load_licenses(licenses_data_dir=licenses_data_dir) - rules = get_rules(licenses_db=licenses_db, rules_data_dir=rules_data_dir) + if not licenses_db: + if additional_directories: + additional_license_dirs = get_license_dirs(additional_dirs=additional_directories) + combined_license_directories = [licenses_data_dir] + additional_license_dirs + licenses_db = load_licenses_from_multiple_dirs(license_dirs=combined_license_directories) + else: + licenses_db = load_licenses(licenses_data_dir=licenses_data_dir) + + if additional_directories: + additional_rule_dirs = get_rule_dirs(additional_dirs=additional_directories) + combined_rule_directories = [rules_data_dir] + additional_rule_dirs + rules = get_rules_from_multiple_dirs(licenses_db=licenses_db, rule_directories=combined_rule_directories) + else: + rules = get_rules(licenses_db=licenses_db, rules_data_dir=rules_data_dir) legalese = common_license_words spdx_tokens = set(get_all_spdx_key_tokens(licenses_db)) @@ -299,7 +328,7 @@ def build_unknown_spdx_symbol(licenses_db=None): return LicenseSymbolLike(licenses_db['unknown-spdx']) -def get_cache(force=False, index_all_languages=False): +def get_cache(force=False, index_all_languages=False, additional_directories=None): """ Return a LicenseCache either rebuilt, cached or loaded from disk. @@ -307,17 +336,25 @@ def get_cache(force=False, index_all_languages=False): building the license index. 
Otherwise, only include the English license \ texts and rules (the default) """ - populate_cache(force=force, index_all_languages=index_all_languages) + populate_cache(force=force, index_all_languages=index_all_languages, additional_directories=additional_directories) global _LICENSE_CACHE return _LICENSE_CACHE -def populate_cache(force=False, index_all_languages=False): +def populate_cache(force=False, index_all_languages=False, additional_directories=None): """ Load or build and cache a LicenseCache. Return None. """ global _LICENSE_CACHE - if force or not _LICENSE_CACHE: + global _CACHED_DIRECTORIES + # check if we've already cached this set of additional directories + # if we have, pass + should_cache_additional_directories = additional_directories is not None \ + and sorted(additional_directories) != sorted(_CACHED_DIRECTORIES) + if should_cache_additional_directories: + # otherwise we will just return previous cache on line 84 + force = True + if force or not _LICENSE_CACHE or should_cache_additional_directories: _LICENSE_CACHE = LicenseCache.load_or_build( licensedcode_cache_dir=licensedcode_cache_dir, scancode_cache_dir=scancode_cache_dir, @@ -325,7 +362,10 @@ def populate_cache(force=False, index_all_languages=False): index_all_languages=index_all_languages, # used for testing only timeout=LICENSE_INDEX_LOCK_TIMEOUT, + additional_directories=additional_directories, ) + if additional_directories: + _CACHED_DIRECTORIES = additional_directories def load_cache_file(cache_file): @@ -346,11 +386,15 @@ def load_cache_file(cache_file): raise Exception(msg) from e -def get_index(force=False, index_all_languages=False): +def get_index(force=False, index_all_languages=False, additional_directories=None): """ Return and eventually build and cache a LicenseIndex. 
""" - return get_cache(force=force, index_all_languages=index_all_languages).index + return get_cache( + force=force, + index_all_languages=index_all_languages, + additional_directories=additional_directories + ).index get_cached_index = get_index diff --git a/src/licensedcode/models.py b/src/licensedcode/models.py index 31e4d71d0b3..4ab115a4f11 100644 --- a/src/licensedcode/models.py +++ b/src/licensedcode/models.py @@ -768,6 +768,64 @@ def get_rules( licenses_as_rules = build_rules_from_licenses(licenses_db) return chain(licenses_as_rules, rules) +def get_license_dirs( + additional_dirs, +): + """ + Takes in a list of additional directories specified during license detection + and produces a list of all the subdirectories containing license files. + """ + return [f"{path}/licenses" for path in additional_dirs] + +def get_rule_dirs( + additional_dirs, +): + """ + Takes in a list of additional directories specified during license detection + and produces a list of all the subdirectories containing rule files. + """ + return [f"{path}/rules" for path in additional_dirs] + +def load_licenses_from_multiple_dirs( + license_directories, + with_deprecated=False, +): + """ + Takes in a list of directories containing additional licenses to use in + license detection and combines all the licenses into the same mapping. + """ + combined_licenses = {} + for license_dir in license_directories: + licenses = load_licenses(licenses_data_dir=license_dir, with_deprecated=False) + # this syntax for merging is described here: https://stackoverflow.com/a/26853961 + combined_licenses = {**combined_licenses, **licenses} + return combined_licenses + +def get_rules_from_multiple_dirs( + licenses_db, + rule_directories, +): + """ + Takes in a license database, which is a mapping from key->License objects, + and a list of all directories containing rules to use in license detection. + Combines all rules together into the same data structure and validates them. 
+ """ + # TODO: error handling in case additional_directories is empty + if rule_directories: + combined_rules = [] + for rules_dir in rule_directories: + r = list(load_rules( + rules_data_dir=rules_dir, + )) + combined_rules.append(r) + rules = chain.from_iterable(combined_rules) + else: + rules = get_rules(licenses_db=licenses_db, rules_data_dir=rules_data_dir) + + validate_rules(rules, licenses_db) + licenses_as_rules = build_rules_from_licenses(licenses_db) + return chain(licenses_as_rules, rules) + class InvalidRule(Exception): pass diff --git a/src/licensedcode/plugin_license.py b/src/licensedcode/plugin_license.py index 8658ffbc356..c4370fd63b4 100644 --- a/src/licensedcode/plugin_license.py +++ b/src/licensedcode/plugin_license.py @@ -18,6 +18,7 @@ from commoncode.resource import clean_path from plugincode.scan import ScanPlugin from plugincode.scan import scan_impl +import click from scancode.api import SCANCODE_LICENSEDB_URL @@ -139,6 +140,14 @@ class LicenseScanner(ScanPlugin): help_group=SCAN_OPTIONS_GROUP, ), + PluggableCommandLineOption( + ('-dir', '--additional_directories'), + multiple=True, + type=click.Path(exists=True, readable=True, path_type=str), + help='Include additional directories for license detection.', + help_group=SCAN_OPTIONS_GROUP, + ), + PluggableCommandLineOption( ('--reindex-licenses',), is_flag=True, is_eager=True, @@ -167,7 +176,8 @@ def setup(self, **kwargs): loaded index. 
""" from licensedcode.cache import populate_cache - populate_cache() + additional_directories = kwargs.get('additional_directories') + populate_cache(additional_directories=additional_directories) def get_scanner( self, @@ -176,6 +186,7 @@ def get_scanner( license_text_diagnostics=False, license_url_template=SCANCODE_LICENSEDB_URL, unknown_licenses=False, + additional_directories=None, **kwargs ): @@ -186,6 +197,7 @@ def get_scanner( license_text_diagnostics=license_text_diagnostics, license_url_template=license_url_template, unknown_licenses=unknown_licenses, + additional_directories=additional_directories, ) def process_codebase(self, codebase, unknown_licenses, **kwargs): diff --git a/src/scancode/api.py b/src/scancode/api.py index 1f3333c6a4b..1acd26ef174 100644 --- a/src/scancode/api.py +++ b/src/scancode/api.py @@ -142,6 +142,7 @@ def get_licenses( license_url_template=SCANCODE_LICENSEDB_URL, unknown_licenses=False, deadline=sys.maxsize, + additional_directories=None, **kwargs, ): """ @@ -168,7 +169,7 @@ def get_licenses( from licensedcode import cache from licensedcode.spans import Span - idx = cache.get_index() + idx = cache.get_index(additional_directories=additional_directories) detected_licenses = [] detected_expressions = [] @@ -252,6 +253,7 @@ def _licenses_data_from_match( result['homepage_url'] = lic.homepage_url result['text_url'] = lic.text_urls[0] if lic.text_urls else '' result['reference_url'] = license_url_template.format(lic.key) + # TODO: change this in the case of a private license? 
result['scancode_text_url'] = SCANCODE_LICENSE_TEXT_URL.format(lic.key) result['scancode_data_url'] = SCANCODE_LICENSE_DATA_URL.format(lic.key) diff --git a/tests/licensedcode/data/example_external_licenses/example1/licenses/example1.LICENSE b/tests/licensedcode/data/example_external_licenses/example1/licenses/example1.LICENSE new file mode 100644 index 00000000000..8fe2a4b5ad1 --- /dev/null +++ b/tests/licensedcode/data/example_external_licenses/example1/licenses/example1.LICENSE @@ -0,0 +1 @@ +The quick brown fox jumps over the lazy dog. \ No newline at end of file diff --git a/tests/licensedcode/data/example_external_licenses/example1/licenses/example1.yml b/tests/licensedcode/data/example_external_licenses/example1/licenses/example1.yml new file mode 100644 index 00000000000..d7d1ea640ec --- /dev/null +++ b/tests/licensedcode/data/example_external_licenses/example1/licenses/example1.yml @@ -0,0 +1,5 @@ +key: example1 +short_name: Example External License 1 +name: Example External License 1 +category: Permissive +owner: NexB \ No newline at end of file diff --git a/tests/licensedcode/data/example_external_licenses/example1/rules/example1.RULE b/tests/licensedcode/data/example_external_licenses/example1/rules/example1.RULE new file mode 100644 index 00000000000..8fe2a4b5ad1 --- /dev/null +++ b/tests/licensedcode/data/example_external_licenses/example1/rules/example1.RULE @@ -0,0 +1 @@ +The quick brown fox jumps over the lazy dog. 
\ No newline at end of file diff --git a/tests/licensedcode/data/example_external_licenses/example1/rules/example1.yml b/tests/licensedcode/data/example_external_licenses/example1/rules/example1.yml new file mode 100644 index 00000000000..96535e6c24b --- /dev/null +++ b/tests/licensedcode/data/example_external_licenses/example1/rules/example1.yml @@ -0,0 +1,2 @@ +license_expression: example1 +is_license_text: yes \ No newline at end of file diff --git a/tests/licensedcode/data/plugin_license/external_licenses/scan.expected.json b/tests/licensedcode/data/plugin_license/external_licenses/scan.expected.json new file mode 100644 index 00000000000..9f291212e54 --- /dev/null +++ b/tests/licensedcode/data/plugin_license/external_licenses/scan.expected.json @@ -0,0 +1,81 @@ +{ + "headers": [ + { + "tool_name": "scancode-toolkit", + "options": { + "input": "", + "-dir": "", + "--json": "", + "--license": true, + "--strip-root": true + }, + "notice": "Generated with ScanCode and provided on an \"AS IS\" BASIS, WITHOUT WARRANTIES\nOR CONDITIONS OF ANY KIND, either express or implied. No content created from\nScanCode should be considered or used as legal advice. Consult an Attorney\nfor any legal advice.\nScanCode is a free software code scanning tool from nexB Inc. 
and others.\nVisit https://github.com/nexB/scancode-toolkit/ for support and download.", + "output_format_version": "2.0.0", + "message": null, + "errors": [], + "warnings": [], + "extra_data": { + "system_environment": { + "operating_system": "linux", + "cpu_architecture": "64", + "platform": "Linux-5.4.0-109-generic-x86_64-with-Ubuntu-18.04-bionic", + "platform_version": "#123~18.04.1-Ubuntu SMP Fri Apr 8 09:48:52 UTC 2022", + "python_version": "3.6.9 (default, Mar 15 2022, 13:55:28) \n[GCC 8.4.0]" + }, + "spdx_license_list_version": "3.16", + "files_count": 2 + } + } + ], + "files": [ + { + "path": "license.txt", + "type": "file", + "licenses": [ + { + "key": "example1", + "score": 100.0, + "name": "Example External License 1", + "short_name": "Example External License 1", + "category": "Permissive", + "is_exception": false, + "is_unknown": false, + "owner": "NexB", + "homepage_url": "", + "text_url": "", + "reference_url": "https://scancode-licensedb.aboutcode.org/example1", + "scancode_text_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/example1.LICENSE", + "scancode_data_url": "https://github.com/nexB/scancode-toolkit/tree/develop/src/licensedcode/data/licenses/example1.yml", + "spdx_license_key": "", + "spdx_url": "", + "start_line": 1, + "end_line": 1, + "matched_rule": { + "identifier": "example1.LICENSE", + "license_expression": "example1", + "licenses": [ + "example1" + ], + "referenced_filenames": [], + "is_license_text": true, + "is_license_notice": false, + "is_license_reference": false, + "is_license_tag": false, + "is_license_intro": false, + "has_unknown": false, + "matcher": "1-hash", + "rule_length": 9, + "matched_length": 9, + "match_coverage": 100.0, + "rule_relevance": 100 + } + } + ], + "license_expressions": [ + "example1" + ], + "percentage_of_license_text": 100.0, + "scan_errors": [] + } + ] +} diff --git a/tests/licensedcode/data/plugin_license/external_licenses/scan/license.txt 
b/tests/licensedcode/data/plugin_license/external_licenses/scan/license.txt new file mode 100644 index 00000000000..2fe6575e76e --- /dev/null +++ b/tests/licensedcode/data/plugin_license/external_licenses/scan/license.txt @@ -0,0 +1 @@ +The quick brown fox jumps over the lazy dog. diff --git a/tests/licensedcode/test_plugin_license.py b/tests/licensedcode/test_plugin_license.py index 0d5d20a469a..284466e97cc 100644 --- a/tests/licensedcode/test_plugin_license.py +++ b/tests/licensedcode/test_plugin_license.py @@ -242,6 +242,23 @@ def test_reindex_licenses_works(): run_scan_click(['--reindex-licenses']) +def test_detection_with_external_license_directories(): + test_dir = test_env.get_test_loc('plugin_license/external_licenses/scan', copy=True) + example1_dir = test_env.get_test_loc('example_external_licenses/example1') + result_file = test_env.get_temp_file('json') + args = [ + '--license', + '--strip-root', + '--verbose', + '-dir', example1_dir, + '--json', result_file, + test_dir, + ] + run_scan_click(args) + test_loc = test_env.get_test_loc('plugin_license/external_licenses/scan.expected.json') + check_json_scan(test_loc, result_file, regen=REGEN_TEST_FIXTURES) + + @pytest.mark.scanslow def test_scan_license_with_url_template(): test_dir = test_env.get_test_loc('plugin_license/license_url', copy=True) diff --git a/tests/scancode/data/help/help.txt b/tests/scancode/data/help/help.txt index 747550db517..850fe149b89 100644 --- a/tests/scancode/data/help/help.txt +++ b/tests/scancode/data/help/help.txt @@ -21,24 +21,27 @@ Options: -u, --url Scan for urls. scan options: - --license-score INTEGER Do not return license matches with a score lower - than this score. A number between 0 and 100. - [default: 0] - --license-text Include the detected licenses matched text. - --license-text-diagnostics In the matched license text, include diagnostic - highlights surrounding with square brackets [] - words that are not matched. 
- --license-url-template TEXT Set the template URL used for the license - reference URLs. Curly braces ({}) are replaced by - the license key. [default: https://scancode- - licensedb.aboutcode.org/{}] - --max-email INT Report only up to INT emails found in a file. Use - 0 for no limit. [default: 50] - --max-url INT Report only up to INT urls found in a file. Use 0 - for no limit. [default: 50] - --unknown-licenses [EXPERIMENTAL] Detect unknown licenses and follow - license references such as "See license in file - COPYING". + --license-score INTEGER Do not return license matches with a score lower + than this score. A number between 0 and 100. + [default: 0] + --license-text Include the detected licenses matched text. + --license-text-diagnostics In the matched license text, include diagnostic + highlights surrounding with square brackets [] + words that are not matched. + --license-url-template TEXT Set the template URL used for the license + reference URLs. Curly braces ({}) are replaced + by the license key. [default: + https://scancode-licensedb.aboutcode.org/{}] + --max-email INT Report only up to INT emails found in a file. + Use 0 for no limit. [default: 50] + --max-url INT Report only up to INT urls found in a file. + Use 0 for no limit. [default: 50] + --unknown-licenses [EXPERIMENTAL] Detect unknown licenses and + follow license references such as "See license + in file COPYING". + -dir, --additional_directories PATH + Include additional directories for license + detection. output formats: --json FILE Write scan output as compact JSON to FILE.