aboutcode-org · JonoYang · Mar 15, 2023 · Mar 1, 2023 · Mar 3, 2023 · Mar 7, 2023
diff --git a/Makefile b/Makefile
@@ -94,15 +94,21 @@ seed:
 	${MANAGE} seed
 
 run_visit: seed
-	${MANAGE} run_visit
+	${MANAGE} run_visit --ignore-robots --ignore-throttle
 
 run_map:
 	${MANAGE} run_map
 
+request_scans:
+	${MANAGE} request_scans
+
+process_scans:
+	${MANAGE} process_scans
+
 test:
 	@echo "-> Run the test suite"
 	${ACTIVATE} DJANGO_SETTINGS_MODULE=purldb.settings ${PYTHON_EXE} -m pytest -vvs --ignore matchcode-toolkit
-	${ACTIVATE} ${PYTHON_EXE} -m pytest -vvs matchcode-toolkit
+	${ACTIVATE} ${PYTHON_EXE} -m pytest -vvs matchcode-toolkit --ignore matchcode-toolkit/src/matchcode_toolkit/pipelines
 
 shell:
 	${MANAGE} shell

diff --git a/README.rst b/README.rst
@@ -63,9 +63,36 @@ To populate the PackageDB using visited package metadata:
 
     make run_map
 
-If you have an empty PackageDB without Package and Package Resource information,
-ClearCode should be run for a while so it can populate the PackageDB
-with Package and Package Resource information from clearlydefined.
+Populating Package Resource Data
+--------------------------------
+
+The Resources of Packages can be collected using the scan queue. By default, a
+scan request will be created for each mapped Package.
+
+The following environment variables will have to be set for the scan queue
+commands to work::
+
+    SCANCODEIO_URL=<ScanCode.io API URL>
+    SCANCODEIO_API_KEY=<ScanCode.io API Key>
+
+The scan queue is run using two commands::
+
+    make request_scans
+
+``request_scans`` will send a Package scan request to a configured ScanCode.io
+instance. ScanCode.io will download, extract, and scan the files of the
+requested Package. ::
+
+    make process_scans
+
+``process_scans`` will poll ScanCode.io for the status of the Package scans
+requested by ``request_scans``. When a Package scan on ScanCode.io is ready,
+``process_scans`` will use that data to create Resources and populate the
+MatchCode directory fingerprint indices.
+
+Package Resource data can also be gathered by running ClearCode, where Package
+scan data from clearlydefined is collected and its results are used to create
+Packages and Resources.
 ::
 
     make clearsync

diff --git a/docker-compose.yml b/docker-compose.yml
@@ -74,6 +74,24 @@ services:
       - db
       - web # Ensure that potential db migrations run first
 
+  request_scan:
+    build: .
+    command: wait-for-it web:8000 -- python manage.py request_scans
+    profiles:
+      - scan_queue
+    depends_on:
+      - db
+      - web
+
+  process_scan:
+    build: .
+    command: wait-for-it web:8000 -- python manage.py process_scans
+    profiles:
+      - scan_queue
+    depends_on:
+      - db
+      - web
+
   nginx:
     image: nginx
     ports:

diff --git a/matchcode-toolkit/setup.cfg b/matchcode-toolkit/setup.cfg
@@ -65,3 +65,7 @@ docs =
 [options.entry_points]
 scancode_post_scan =
     match = matchcode_toolkit.plugin_match:Match
+
+scancodeio_pipelines =
+    scan_and_fingerprint_codebase = matchcode_toolkit.pipelines.scan_and_fingerprint_codebase:ScanAndFingerprintCodebase
+    matching = matchcode_toolkit.pipelines.matching:Matching
diff --git a/matchcode-toolkit/src/matchcode_toolkit/fingerprinting.py b/matchcode-toolkit/src/matchcode_toolkit/fingerprinting.py
@@ -60,7 +60,10 @@ def create_structure_fingerprint(directory, children):
         if not child.path:
             continue
         child_subpath = _get_resource_subpath(child, directory)
-        rounded_child_size = int(child.size / 10) * 10
+        if not child.size:
+            rounded_child_size = 0
+        else:
+            rounded_child_size = int(child.size / 10) * 10
         path_feature = str(rounded_child_size) + child_subpath
         features.append(path_feature)
     return _create_directory_fingerprint(features)

diff --git a/matchcode-toolkit/src/matchcode_toolkit/pipelines/matching.py b/matchcode-toolkit/src/matchcode_toolkit/pipelines/matching.py
@@ -0,0 +1,210 @@
+# SPDX-License-Identifier: Apache-2.0
+#
+# http://nexb.com and https://github.com/nexB/scancode.io
+# The ScanCode.io software is licensed under the Apache License version 2.0.
+# Data generated with ScanCode.io is provided as-is without warranties.
+# ScanCode is a trademark of nexB Inc.
+#
+# You may not use this software except in compliance with the License.
+# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
+# ScanCode.io should be considered or used as legal advice. Consult an Attorney
+# for any legal advice.
+#
+# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
+# Visit https://github.com/nexB/scancode.io for support and download.
+
+from collections import defaultdict
+from os import getenv
+
+from django.conf import settings
+import requests
+
+from matchcode_toolkit.fingerprinting import compute_directory_fingerprints
+from scanpipe.pipelines import Pipeline
+from scanpipe.pipes import update_or_create_package
+from scanpipe.pipes.scancode import set_codebase_resource_for_package
+from scanpipe.pipes.codebase import ProjectCodebase
+
+
+def get_settings(var_name):
+    """Return `var_name` from the environment, else from Django settings, else ''."""
+    environment_value = getenv(var_name)
+    configured_value = environment_value or getattr(settings, var_name, None)
+    return configured_value or ''
+
+
+# PurlDB base URL without trailing slash; endpoints are None when it is unset
+PURLDB_URL = get_settings('PURLDB_URL').rstrip('/')
+MATCHCODE_ENDPOINT = f'{PURLDB_URL}/approximate_directory_content_index/match/' if PURLDB_URL else None
+PURLDB_PACKAGE_ENDPOINT = f'{PURLDB_URL}/packages/' if PURLDB_URL else None
+PURLDB_RESOURCE_ENDPOINT = f'{PURLDB_URL}/resources/' if PURLDB_URL else None
+
+# Token authentication headers; left empty when no API key is configured
+PURLDB_API_KEY = get_settings('PURLDB_API_KEY')
+PURLDB_AUTH_HEADERS = {'Authorization': f'Token {PURLDB_API_KEY}'} if PURLDB_API_KEY else {}
+
+
+class PackageInfo:
+    """Resources of the PackageDB Package at `package_link`, indexed by path."""
+    def __init__(self, package_link):
+        self.package_link = package_link
+        self.package_resources = self.get_resources_from_packagedb(package_link)
+        self.package_resource_by_paths = self.create_package_resource_by_paths()
+
+    @classmethod
+    def get_resources_from_packagedb(cls, package_link):
+        """Return the Resources of the Package at `package_link`, page by page."""
+        package_resources = []
+        response = requests.get(package_link)
+        # A failed lookup or a Package without a resources link has no Resources
+        resources_link = response.json().get('resources') if response.ok else None
+        page_count = 1
+        while resources_link:
+            response = requests.get(f"{resources_link.rstrip('/')}/?page={page_count}")
+            # Stop on an error status or an empty page instead of paging forever
+            resources = response.json() if response.ok else []
+            if not resources:
+                break
+            package_resources.extend(resources)
+            page_count += 1
+        return package_resources
+
+    def create_package_resource_by_paths(self):
+        """Return a mapping of Resource path to Resource data."""
+        return {resource.get('path'): resource for resource in self.package_resources}
+
+
+def path_suffixes(path):
+    """
+    Generate each trailing sub-path of `path`, longest first: 'a/b/c' gives
+    'a/b/c', then 'b/c', then 'c'. Leading and trailing slashes are ignored.
+    """
+    parts = path.strip('/').split('/')
+    for start, _ in enumerate(parts):
+        yield '/'.join(parts[start:])
+
+
+def check_resource_path(resource, package_resources_by_path):
+    """
+    Return True when some path suffix of `resource` is a key of
+    `package_resources_by_path` whose entry is the same Resource, else False.
+    """
+    for path_suffix in path_suffixes(resource.path):
+        if path_suffix not in package_resources_by_path:
+            continue
+        package_resource = package_resources_by_path[path_suffix]
+        package_is_file = package_resource.get('is_file')
+        # Files must agree on SHA1; two directories match on path alone
+        if resource.is_file == True and package_is_file == True:
+            if resource.sha1 == package_resource.get('sha1', ''):
+                return True
+        elif resource.is_file == False and package_is_file == False:
+            return True
+    return False
+
+
+def determine_best_package_match(directory, package_info_by_package_links):
+    """
+    For all potential package matches in `package_info_by_package_links`,
+    return the link of the package whose codebase structure matches ours the
+    most and the codebase paths matched to it, or (None, []) if there are none.
+    """
+    best_package_match_link = None
+    highest_match_ratio = -1
+    matched_codebase_paths_by_package_link = defaultdict(list)
+    for package_link, package_info in package_info_by_package_links.items():
+        matched_codebase_paths = matched_codebase_paths_by_package_link[package_link]
+        package_resource_by_paths = package_info.package_resource_by_paths
+
+        # The matched directory can itself be indexed on the package side (e.g.
+        # `package`); `check_resource_path` only pairs it with another directory
+        if check_resource_path(directory, package_resource_by_paths):
+            matched_codebase_paths.append(directory.path)
+
+        for child in directory.walk(topdown=True):
+            if check_resource_path(child, package_resource_by_paths):
+                matched_codebase_paths.append(child.path)
+
+        # Ratio of package files found in codebase. A package without any
+        # Resources scores 0 instead of raising ZeroDivisionError
+        package_resources_count = len(package_resource_by_paths)
+        ratio = len(matched_codebase_paths) / package_resources_count if package_resources_count else 0
+        # `>=` lets the last package reaching the top ratio win, as before
+        if ratio >= highest_match_ratio:
+            highest_match_ratio = ratio
+            best_package_match_link = package_link
+
+    return best_package_match_link, matched_codebase_paths_by_package_link[best_package_match_link]
+
+
+class Matching(Pipeline):
+    """Match codebase directories to PurlDB packages by directory fingerprint."""
+    @classmethod
+    def steps(cls):
+        return (
+            cls.get_project_codebase,
+            cls.create_fingerprints,
+            cls.perform_matching,
+        )
+
+    def get_project_codebase(self):
+        self.project_codebase = ProjectCodebase(self.project)
+
+    def create_fingerprints(self):
+        compute_directory_fingerprints(self.project_codebase)
+
+    def perform_matching(self):
+        """Attach the best matched package to each fingerprinted directory."""
+        for resource in self.project_codebase.walk(topdown=True):
+            # Collect directory fingerprints, if available
+            directory_content_fingerprint = resource.extra_data.get('directory_content', '')
+
+            # Skip resource if it is not a directory, does not contain directory
+            # fingerprints, or if it has already been matched
+            if (resource.is_file
+                    or not directory_content_fingerprint
+                    or resource.extra_data.get('matched', False)):
+                continue
+
+            # Send fingerprint to matchcode for matching and get the links of
+            # the matched packages
+            payload = {'fingerprint': [directory_content_fingerprint]}
+            response = requests.get(MATCHCODE_ENDPOINT, params=payload)
+            # A failed request must not reuse the links matched for a previous directory
+            if not response:
+                continue
+            matched_package_links = [result['package'] for result in response.json() if result.get('package')]
+            if not matched_package_links:
+                continue
+
+            # Get the paths of the resources from matched packages
+            package_info_by_package_links = {link: PackageInfo(link) for link in matched_package_links}
+
+            # Calculate the percent of package files found in codebase
+            best_package_match_link, matched_codebase_paths = determine_best_package_match(resource, package_info_by_package_links)
+
+            # Query PackageDB for the best matched package; skip when it is unavailable
+            response = requests.get(best_package_match_link)
+            if not response:
+                continue
+
+            # Create DiscoveredPackage for the best matched package
+            package_data = response.json()
+            purl = package_data.get('purl', '')
+            uuid = package_data.pop('uuid', '')
+            package_data['package_uid'] = f'{purl}?uuid={uuid}'
+            discovered_package = update_or_create_package(self.project, package_data)
+
+            # Associate the package to the resource and its children
+            for matched_codebase_path in matched_codebase_paths:
+                r = self.project.codebaseresources.get(path=matched_codebase_path)
+                set_codebase_resource_for_package(r, discovered_package)
+                r.extra_data['matched'] = True
+                r.save()
diff --git a/matchcode-toolkit/src/matchcode_toolkit/pipelines/scan_and_fingerprint_codebase.py b/matchcode-toolkit/src/matchcode_toolkit/pipelines/scan_and_fingerprint_codebase.py
@@ -0,0 +1,60 @@
+# SPDX-License-Identifier: Apache-2.0
+#
+# http://nexb.com and https://github.com/nexB/scancode.io
+# The ScanCode.io software is licensed under the Apache License version 2.0.
+# Data generated with ScanCode.io is provided as-is without warranties.
+# ScanCode is a trademark of nexB Inc.
+#
+# You may not use this software except in compliance with the License.
+# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
+# ScanCode.io should be considered or used as legal advice. Consult an Attorney
+# for any legal advice.
+#
+# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
+# Visit https://github.com/nexB/scancode.io for support and download.
+
+from scanpipe.pipelines.scan_codebase import ScanCodebase
+from scanpipe.pipes.codebase import ProjectCodebase
+
+from matchcode_toolkit.fingerprinting import compute_directory_fingerprints
+
+
+class ScanAndFingerprintCodebase(ScanCodebase):
+    """
+    Run the ScanCode-toolkit codebase scan pipeline with one extra step that
+    computes directory fingerprints.
+
+    Input files are copied to the project's codebase/ directory and extracted
+    in place before scanning. The code may also be copied manually to the
+    project codebase/ directory.
+    """
+
+    # Set to True to extract recursively nested archives in archives.
+    extract_recursively = False
+
+    @classmethod
+    def steps(cls):
+        return (
+            cls.copy_inputs_to_codebase_directory,
+            cls.extract_archives,
+            cls.collect_and_create_codebase_resources,
+            cls.fingerprint_codebase,
+            cls.tag_empty_files,
+            cls.scan_for_application_packages,
+            cls.scan_for_files,
+        )
+
+    def fingerprint_codebase(self):
+        """
+        Compute directory fingerprints for the resources of the project
+        codebase, for matching purposes.
+        """
+        compute_directory_fingerprints(ProjectCodebase(self.project))
diff --git a/minecode/management/commands/make_scannableuris.py b/minecode/management/commands/make_scannableuris.py
@@ -29,9 +29,12 @@ def handle(self, *args, **options):
         for package in Package.objects.all():
             package_uri = package.download_url
             try:
-                new_scannableURI = ScannableURI(uri=package_uri, package=package)
-                new_scannableURI.save()
-                self.stdout.write('ScannableURI created for: {}'.format(package_uri))
+                _, created = ScannableURI.objects.get_or_create(
+                    uri=package_uri,
+                    package=package
+                )
+                if created:
+                    self.stdout.write('ScannableURI created for: {}'.format(package_uri))
             except Exception as e:
                 msg = 'Error creating ScannableURI for: {}'.format(package_uri)
                 msg += get_error_message(e)