Skip to content

Commit

Permalink
Add support for external licenses in scans aboutcode-org#480
Browse files Browse the repository at this point in the history
This adds `-dir` or `--additional_directories` as a command line
option in license detection. This allows users to specify paths to
directories of licenses and rules they'd like to use during license
detection, but would not like to add to the ScanCode database of
licenses.

This involves adding a new option in `licensedcode/plugin_license.py`,
and this option is used as a parameter in `scancode/api.py`. In this
approach, the licenses and rules contained in these additional
directories are combined with the existing licenses and rules in the
ScanCode database to produce a single index. The code for this is found
in `licensedcode/cache.py` and the helper methods for loading these
licenses and rules are found in `licensedcode/models.py`. This commit
also includes a unit test, found in
`tests/licensedcode/test_plugin_license.py`, that verifies that license
detection succeeds with an additional directory. Part of the setup for the
unit test and future tests involves creating a new directory in
`tests/licensedcode/data` that contains sample external licenses used
in the unit tests.

Signed-off-by: Kevin Ji <kyji1011@gmail.com>
  • Loading branch information
KevinJi22 committed Jul 25, 2022
1 parent c1599f2 commit 942af3c
Show file tree
Hide file tree
Showing 17 changed files with 465 additions and 29 deletions.
103 changes: 94 additions & 9 deletions src/licensedcode/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
LICENSE_INDEX_FILENAME = 'index_cache'
LICENSE_LOCKFILE_NAME = 'scancode_license_index_lockfile'
LICENSE_CHECKSUM_FILE = 'scancode_license_index_tree_checksums'
CACHED_DIRECTORIES_FILENAME = 'cached_directories'


@attr.s(slots=True)
Expand All @@ -58,6 +59,7 @@ def load_or_build(
timeout=LICENSE_INDEX_LOCK_TIMEOUT,
licenses_data_dir=None,
rules_data_dir=None,
additional_directories=None,
):
"""
Load or build and save and return a LicenseCache object.
Expand All @@ -66,7 +68,8 @@ def load_or_build(
On the side, we load cached or build license db, SPDX symbols and other
license-related data structures.
- If the cache exists, it is returned unless corrupted or ``force`` is True.
- If the cache exists, it is returned unless corrupted, ``force`` is True, or if we pass in additional
directories containing licenses that are not present in the existing cache.
- If the cache does not exist, a new index is built and cached.
- If ``index_all_languages`` is True, include texts in all languages when
building the license index. Otherwise, only include the English license \
Expand All @@ -75,12 +78,17 @@ def load_or_build(
idx_cache_dir = os.path.join(licensedcode_cache_dir, LICENSE_INDEX_DIR)
create_dir(idx_cache_dir)
cache_file = os.path.join(idx_cache_dir, LICENSE_INDEX_FILENAME)
cached_directories_file = os.path.join(idx_cache_dir, CACHED_DIRECTORIES_FILENAME)

has_cache = os.path.exists(cache_file) and os.path.getsize(cache_file)

# bypass build if cache exists
if has_cache and not force:
try:
# save the list of additional directories included in the cache, or None if the cache does not
# include any additional directories
with open(cached_directories_file, 'wb') as file:
pickle.dump(additional_directories, file, protocol=PICKLE_PROTOCOL)
return load_cache_file(cache_file)
except Exception as e:
# work around some rare Windows quirks
Expand All @@ -92,6 +100,8 @@ def load_or_build(
from licensedcode.models import licenses_data_dir as ldd
from licensedcode.models import rules_data_dir as rdd
from licensedcode.models import load_licenses
from licensedcode.models import load_licenses_from_multiple_dirs
from licensedcode.models import get_license_dirs
from scancode import lockfile

licenses_data_dir = licenses_data_dir or ldd
Expand All @@ -106,13 +116,21 @@ def load_or_build(
# Here, the cache is either stale or non-existing: we need to
# rebuild all cached data (e.g. mostly the index) and cache it

licenses_db = load_licenses(licenses_data_dir=licenses_data_dir)
if additional_directories:
additional_license_dirs = get_license_dirs(additional_dirs=additional_directories)
combined_directories = [licenses_data_dir] + additional_license_dirs
licenses_db = load_licenses_from_multiple_dirs(license_directories=combined_directories)
else:
licenses_db = load_licenses(licenses_data_dir=licenses_data_dir)

# create a single merged index containing license data from licenses_data_dir
# and data from additional directories
index = build_index(
licenses_db=licenses_db,
licenses_data_dir=licenses_data_dir,
rules_data_dir=rules_data_dir,
index_all_languages=index_all_languages,
additional_directories=additional_directories,
)

spdx_symbols = build_spdx_symbols(licenses_db=licenses_db)
Expand All @@ -131,6 +149,11 @@ def load_or_build(
with open(cache_file, 'wb') as fn:
pickle.dump(license_cache, fn, protocol=PICKLE_PROTOCOL)

# save the list of additional directories included in the cache, or None if the cache does not
# include any additional directories
with open(cached_directories_file, 'wb') as file:
pickle.dump(additional_directories, file, protocol=PICKLE_PROTOCOL)

return license_cache

except lockfile.LockTimeout:
Expand All @@ -143,27 +166,50 @@ def build_index(
licenses_data_dir=None,
rules_data_dir=None,
index_all_languages=False,
additional_directories=None,
):
"""
Return an index built from rules and licenses directories
If ``index_all_languages`` is True, include texts and rules in all languages.
Otherwise, only include the English license texts and rules (the default)
If ``additional_directories`` is not None, we will include licenses and rules
from these additional directories in the returned index.
"""
from licensedcode.index import LicenseIndex
from licensedcode.models import get_license_dirs
from licensedcode.models import get_rule_dirs
from licensedcode.models import get_rules
from licensedcode.models import get_rules_from_multiple_dirs
from licensedcode.models import get_all_spdx_key_tokens
from licensedcode.models import get_license_tokens
from licensedcode.models import licenses_data_dir as ldd
from licensedcode.models import rules_data_dir as rdd
from licensedcode.models import load_licenses
from licensedcode.models import load_licenses_from_multiple_dirs
from licensedcode.legalese import common_license_words

licenses_data_dir = licenses_data_dir or ldd
rules_data_dir = rules_data_dir or rdd

licenses_db = licenses_db or load_licenses(licenses_data_dir=licenses_data_dir)
rules = get_rules(licenses_db=licenses_db, rules_data_dir=rules_data_dir)
if not licenses_db:
if additional_directories:
# combine the licenses in these additional directories with the licenses in the original DB
additional_license_dirs = get_license_dirs(additional_dirs=additional_directories)
combined_license_directories = [licenses_data_dir] + additional_license_dirs
# generate a single combined license db with all licenses
licenses_db = load_licenses_from_multiple_dirs(license_dirs=combined_license_directories)
else:
licenses_db = load_licenses(licenses_data_dir=licenses_data_dir)

if additional_directories:
# if we have additional directories, extract the rules from them
additional_rule_dirs = get_rule_dirs(additional_dirs=additional_directories)
# then combine the rules in these additional directories with the rules in the original rules directory
combined_rule_directories = [rules_data_dir] + additional_rule_dirs
rules = get_rules_from_multiple_dirs(licenses_db=licenses_db, rule_directories=combined_rule_directories)
else:
rules = get_rules(licenses_db=licenses_db, rules_data_dir=rules_data_dir)

legalese = common_license_words
spdx_tokens = set(get_all_spdx_key_tokens(licenses_db))
Expand Down Expand Up @@ -299,24 +345,26 @@ def build_unknown_spdx_symbol(licenses_db=None):
return LicenseSymbolLike(licenses_db['unknown-spdx'])


def get_cache(force=False, index_all_languages=False):
def get_cache(force=False, index_all_languages=False, additional_directories=None):
"""
Return a LicenseCache either rebuilt, cached or loaded from disk.
If ``index_all_languages`` is True, include texts in all languages when
building the license index. Otherwise, only include the English license \
texts and rules (the default)
"""
populate_cache(force=force, index_all_languages=index_all_languages)
populate_cache(force=force, index_all_languages=index_all_languages, additional_directories=additional_directories)
global _LICENSE_CACHE
return _LICENSE_CACHE


def populate_cache(force=False, index_all_languages=False):
def populate_cache(force=False, index_all_languages=False, additional_directories=None):
"""
Load or build and cache a LicenseCache. Return None.
"""
global _LICENSE_CACHE
if need_cache_rebuild(additional_directories):
force = True
if force or not _LICENSE_CACHE:
_LICENSE_CACHE = LicenseCache.load_or_build(
licensedcode_cache_dir=licensedcode_cache_dir,
Expand All @@ -325,9 +373,42 @@ def populate_cache(force=False, index_all_languages=False):
index_all_languages=index_all_languages,
# used for testing only
timeout=LICENSE_INDEX_LOCK_TIMEOUT,
additional_directories=additional_directories,
)


def need_cache_rebuild(additional_directories):
    """
    Return True if the license index cache must be rebuilt so that it includes
    the licenses and rules of ``additional_directories``.

    ``additional_directories`` is an iterable of directory paths, or None.

    The set of additional directories included in the current cache is read
    from the pickled ``CACHED_DIRECTORIES_FILENAME`` file of the index cache
    directory. A rebuild is needed when at least one of the requested
    directories is not part of that set, or when that file does not exist.
    A rebuild is never requested when no additional directory is passed in.
    """
    if not additional_directories:
        # An empty request is a subset of whatever is cached: nothing to do.
        return False

    idx_cache_dir = os.path.join(licensedcode_cache_dir, LICENSE_INDEX_DIR)
    cached_directories_file = os.path.join(idx_cache_dir, CACHED_DIRECTORIES_FILENAME)

    try:
        with open(cached_directories_file, 'rb') as file:
            cached_additional_directories = pickle.load(file)
    except FileNotFoundError:
        # No record of cached directories, and some directories are requested:
        # a new cache is needed.
        return True
    except EOFError:
        # Empty file: treat as a cache built without additional directories.
        cached_additional_directories = None

    # The pickled value is None when the cache was built without additional
    # directories; normalize it to an empty set so that the subset test below
    # does not fail with a TypeError.
    cached_additional_directories = set(cached_additional_directories or ())

    # Rebuild only if some requested directory is not already in the cache.
    return not set(additional_directories).issubset(cached_additional_directories)


def load_cache_file(cache_file):
"""
Return a LicenseCache loaded from ``cache_file``.
Expand All @@ -346,11 +427,15 @@ def load_cache_file(cache_file):
raise Exception(msg) from e


def get_index(force=False, index_all_languages=False):
def get_index(force=False, index_all_languages=False, additional_directories=None):
"""
Return and eventually build and cache a LicenseIndex.
"""
return get_cache(force=force, index_all_languages=index_all_languages).index
return get_cache(
force=force,
index_all_languages=index_all_languages,
additional_directories=additional_directories
).index


get_cached_index = get_index
Expand Down
63 changes: 63 additions & 0 deletions src/licensedcode/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from os.path import dirname
from os.path import exists
from os.path import join
from pathlib import Path

import attr
import saneyaml
Expand Down Expand Up @@ -772,6 +773,68 @@ def get_rules(
return chain(licenses_as_rules, rules)


def get_license_dirs(
    additional_dirs,
):
    """
    Return a list of absolute path strings, one for the ``licenses``
    subdirectory of each directory of the ``additional_dirs`` iterable of
    paths. Return an empty list if ``additional_dirs`` is None or empty.

    The existence of the returned directories is not checked.
    """
    # Convert to an absolute path in case the user passes in a relative path,
    # which messes up building rules from licenses. Join with pathlib rather
    # than a hard-coded "/" so the native path separator is used.
    return [
        str(Path(path).absolute() / 'licenses')
        for path in (additional_dirs or ())
    ]


def get_rule_dirs(
    additional_dirs,
):
    """
    Return a list of absolute path strings, one for the ``rules``
    subdirectory of each directory of the ``additional_dirs`` iterable of
    paths. Return an empty list if ``additional_dirs`` is None or empty.

    The existence of the returned directories is not checked.
    """
    # Relative paths are made absolute, and the subdirectory is joined with
    # pathlib rather than a hard-coded "/" so the native separator is used.
    return [
        str(Path(path).absolute() / 'rules')
        for path in (additional_dirs or ())
    ]


def load_licenses_from_multiple_dirs(
    license_directories,
    with_deprecated=False,
):
    """
    Return a single mapping combining the licenses loaded from each directory
    of the ``license_directories`` iterable of directory paths.

    Each directory is loaded with ``load_licenses`` and ``with_deprecated`` is
    passed through to it unchanged. Directories are processed in order: when
    the same key is loaded from more than one directory, the entry from the
    later directory replaces the earlier one.
    """
    combined_licenses = {}
    for license_dir in license_directories:
        licenses = load_licenses(
            licenses_data_dir=license_dir,
            with_deprecated=with_deprecated,
        )
        # Merge in place; later directories win on duplicate keys.
        combined_licenses.update(licenses)
    return combined_licenses


def get_rules_from_multiple_dirs(
    licenses_db,
    rule_directories,
):
    """
    Return an iterable of rules combining the rules built from the licenses of
    ``licenses_db`` with the rules loaded from each directory of the
    ``rule_directories`` list of directory paths.

    ``licenses_db`` is a mapping of key -> License object. The rules loaded
    from all directories are validated together against ``licenses_db`` with
    ``validate_rules`` before being returned.

    If ``rule_directories`` is empty or None, fall back to ``get_rules``.
    """
    if not rule_directories:
        # NOTE(review): ``rules_data_dir`` is not a parameter of this function;
        # presumably this resolves to the module-level default rules
        # directory -- confirm.
        return get_rules(licenses_db=licenses_db, rules_data_dir=rules_data_dir)

    # Collect the rules of every directory in a single flat list.
    rules = []
    for rules_dir in rule_directories:
        rules.extend(load_rules(rules_data_dir=rules_dir))

    validate_rules(rules, licenses_db)
    licenses_as_rules = build_rules_from_licenses(licenses_db)
    return chain(licenses_as_rules, rules)


class InvalidRule(Exception):
pass

Expand Down
15 changes: 14 additions & 1 deletion src/licensedcode/plugin_license.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from commoncode.resource import clean_path
from plugincode.scan import ScanPlugin
from plugincode.scan import scan_impl
import click

from scancode.api import SCANCODE_LICENSEDB_URL

Expand Down Expand Up @@ -139,6 +140,15 @@ class LicenseScanner(ScanPlugin):
help_group=SCAN_OPTIONS_GROUP,
),

PluggableCommandLineOption(
('-dir', '--additional_directories'),
required_options=['license'],
multiple=True,
type=click.Path(exists=True, readable=True, path_type=str),
help='Include additional directories for license detection.',
help_group=SCAN_OPTIONS_GROUP,
),

PluggableCommandLineOption(
('--reindex-licenses',),
is_flag=True, is_eager=True,
Expand Down Expand Up @@ -167,7 +177,8 @@ def setup(self, **kwargs):
loaded index.
"""
from licensedcode.cache import populate_cache
populate_cache()
additional_directories = kwargs.get('additional_directories')
populate_cache(additional_directories=additional_directories)

def get_scanner(
self,
Expand All @@ -176,6 +187,7 @@ def get_scanner(
license_text_diagnostics=False,
license_url_template=SCANCODE_LICENSEDB_URL,
unknown_licenses=False,
additional_directories=None,
**kwargs
):

Expand All @@ -186,6 +198,7 @@ def get_scanner(
license_text_diagnostics=license_text_diagnostics,
license_url_template=license_url_template,
unknown_licenses=unknown_licenses,
additional_directories=additional_directories,
)

def process_codebase(self, codebase, unknown_licenses, **kwargs):
Expand Down
4 changes: 3 additions & 1 deletion src/scancode/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ def get_licenses(
license_url_template=SCANCODE_LICENSEDB_URL,
unknown_licenses=False,
deadline=sys.maxsize,
additional_directories=None,
**kwargs,
):
"""
Expand All @@ -168,7 +169,7 @@ def get_licenses(
from licensedcode import cache
from licensedcode.spans import Span

idx = cache.get_index()
idx = cache.get_index(additional_directories=additional_directories)

detected_licenses = []
detected_expressions = []
Expand Down Expand Up @@ -252,6 +253,7 @@ def _licenses_data_from_match(
result['homepage_url'] = lic.homepage_url
result['text_url'] = lic.text_urls[0] if lic.text_urls else ''
result['reference_url'] = license_url_template.format(lic.key)
# TODO: change this in the case of a private license?
result['scancode_text_url'] = SCANCODE_LICENSE_TEXT_URL.format(lic.key)
result['scancode_data_url'] = SCANCODE_LICENSE_DATA_URL.format(lic.key)

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
The quick brown fox jumps over the lazy dog.
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
key: example1
short_name: Example External License 1
name: Example External License 1
category: Permissive
owner: NexB
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
The quick brown fox jumps over the lazy dog.
The quick brown fox jumps over the lazy dog.
Loading

0 comments on commit 942af3c

Please sign in to comment.