Skip to content

Commit

Permalink
Add AlpinePackages pipeline
Browse files Browse the repository at this point in the history
A pipeline that complements missing package data. Downloads the aports
repository and all its necessary branches (alpine versions), then
iterates over all alpine packages associated with the pipeline's
project. For each package it copies additional files from the aports
repository into the scan target directory, then downloads and extracts all
the source archives, performs a scan and saves its output to the package's
database entry.

Signed-off-by: Mateusz Perc <m.perc@samsung.com>
  • Loading branch information
quepop committed Aug 31, 2021
1 parent e574fa9 commit c8aee9d
Show file tree
Hide file tree
Showing 4 changed files with 215 additions and 0 deletions.
107 changes: 107 additions & 0 deletions scanpipe/pipelines/alpine_packages.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
# SPDX-License-Identifier: Apache-2.0
#
# http://nexb.com and https://github.com/nexB/scancode.io
# The ScanCode.io software is licensed under the Apache License version 2.0.
# Data generated with ScanCode.io is provided as-is without warranties.
# ScanCode is a trademark of nexB Inc.
#
# You may not use this software except in compliance with the License.
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
#
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
# for any legal advice.
#
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/nexB/scancode.io for support and download.

from scanpipe.pipelines import Pipeline
from scanpipe.pipes.alpine import download_or_checkout_aports
from scanpipe.pipes.alpine import extract_summary_fields
from scanpipe.pipes.alpine import get_unscanned_packages_from_db
from scanpipe.pipes.alpine import prepare_scan_dir
from scanpipe.pipes.scancode import run_extractcode
from scanpipe.pipes.scancode import run_scancode


class AlpinePackages(Pipeline):
    """
    A pipeline to complement missing alpine package data.

    Downloads and extracts needed information from the aports repository and
    package source files. Alpine Linux does not provide copyrights and (in some
    cases) licenses for its packages.
    """

    @classmethod
    def steps(cls):
        """Return the pipeline steps in their execution order."""
        return (
            cls.create_alpine_versions_dict,
            cls.download_aports_repo,
            cls.complement_missing_package_data,
        )

    # Options forwarded to `run_scancode` for every scanned package.
    scancode_options = ["--copyright", "--summary"]

    def create_alpine_versions_dict(self):
        """
        Create a dict mapping alpine image ids from the database to alpine versions.
        """
        self.alpine_versions = {
            image["image_id"]: image["distro"]["version_id"]
            for image in self.project.extra_data["images"]
            if image["distro"]["identifier"] == "alpine"
        }

    def download_aports_repo(self):
        """
        Set pipeline's `aports_dir_path` variable to its project temporary path.
        Iterate over every distinct alpine version associated with this project.
        Download corresponding aports repository branches (alpine versions).
        """
        self.aports_dir_path = self.project.tmp_path
        # Several images may share the same alpine version: fetch each branch
        # only once. `dict.fromkeys` de-duplicates while keeping the order.
        unique_versions = dict.fromkeys(self.alpine_versions.values())
        for alpine_version in unique_versions:
            download_or_checkout_aports(
                aports_dir_path=self.aports_dir_path,
                alpine_version=alpine_version,
            )

    def complement_missing_package_data(self):
        """
        Iterate over alpine packages associated with this project.
        Checkout aports repository to the corresponding alpine version and a commit.
        Prepare scan target directory - download and extract package's sources.
        Run scancode and extract missing data (only copyrights for now).
        Update and save package's missing data to database.
        """
        unscanned_packages = get_unscanned_packages_from_db(
            project=self.project, alpine_versions=self.alpine_versions
        )
        for (
            alpine_version,
            commit_id,
            scan_target_path,
            scan_result_path,
            package,
        ) in unscanned_packages:
            # Skip the package when the aports checkout yields a falsy result.
            checked_out = download_or_checkout_aports(
                aports_dir_path=self.aports_dir_path,
                alpine_version=alpine_version,
                commit_id=commit_id,
            )
            if not checked_out:
                continue

            # Skip the package when its aports directory was not found or is empty.
            prepared = prepare_scan_dir(
                package_name=package.name, scan_target_path=scan_target_path
            )
            if not prepared:
                continue

            run_extractcode(location=str(scan_target_path))
            run_scancode(
                location=str(scan_target_path),
                output_file=str(scan_result_path),
                options=self.scancode_options,
            )
            package.update_extra_data(
                data=extract_summary_fields(
                    scan_result_path=scan_result_path,
                    summary_field_names=["copyrights"],
                )
            )
106 changes: 106 additions & 0 deletions scanpipe/pipes/alpine.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,114 @@
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/nexB/scancode.io for support and download.


import json
from shutil import copytree

from fetchcode import fetch
from fetchcode.vcs.git import fetch_via_git
from packagedcode import alpine

from scanpipe.models import DiscoveredPackage

APORTS_URL = "https://gitlab.alpinelinux.org/alpine/aports.git"
APORTS_DIR_NAME = "aports"
APORTS_SUBDIRS = ["main", "non-free", "testing", "community", "unmaintained"]


def download_or_checkout_aports(aports_dir_path, alpine_version, commit_id=None):
    """
    Download the aports repository and its branch based on `alpine_version`.

    The repository location is the `APORTS_DIR_NAME` directory inside
    `aports_dir_path`. The branch that gets checked out is
    `<major>.<minor>-stable`, built from the first two dot-separated components
    of `alpine_version`.
    If `commit_id` is provided, also checkout that commit.
    Return the repository location as a string.

    TODO: Proper fetchcode patch required (extending #54) to report whether
    the checkout(s) succeeded.
    """
    major, minor = alpine_version.split(".")[:2]
    stable_branch = f"{major}.{minor}-stable"
    checkout_location = str(aports_dir_path / APORTS_DIR_NAME)

    fetch_via_git(
        url=f"git+{APORTS_URL}@{stable_branch}", location=checkout_location
    )
    if commit_id:
        fetch_via_git(
            url=f"git+{APORTS_URL}@{commit_id}", location=checkout_location
        )
    return checkout_location


def get_unscanned_packages_from_db(project, alpine_versions):
    """
    Return an iterator of 5-tuples
    (alpine_version, commit_id, scan_target_path, scan_result_path, package) where:

    `alpine_version` is the alpine version a package comes from (obtained from
    the `alpine_versions` dict through the package's `image_id` extra data),
    `commit_id` is the id of the aports repository commit that added the
    corresponding version of a package (the text following `id=` in `vcs_url`),
    `scan_target_path` is the path of the directory on which a scan will be
    performed,
    `scan_result_path` is the path of the scan result json file,
    `package` is a DiscoveredPackage instance that belongs to `project` and has
    the alpine package type.

    The returned iterator contains not-a-subpackage alpine packages that don't
    have an existing scan result file. Packages for which no alpine version or
    no commit id can be determined are skipped, since the matching aports
    sources could not be checked out for them.
    """
    alpine_packages = DiscoveredPackage.objects.filter(project=project, type="alpine")
    for package in alpine_packages:
        # A package counts as a subpackage when it lists source packages and
        # the first of them is not part of its own purl.
        source_packages = package.source_packages
        if source_packages and source_packages[0] not in package.purl:
            continue

        scan_id = f"{package.name}_{package.version}"
        scan_result_path = project.output_path / (scan_id + ".json")
        if scan_result_path.exists():
            continue

        # The version is needed to select the aports branch; without it the
        # later `alpine_version.split(".")` would fail on None.
        image_id = (package.extra_data or {}).get("image_id")
        alpine_version = alpine_versions.get(image_id)
        if not alpine_version:
            continue

        # Same extraction as `vcs_url.split("id=")[1]`, but a missing marker
        # (or a missing url) skips the package instead of raising.
        url_parts = (package.vcs_url or "").split("id=")
        if len(url_parts) < 2:
            continue
        commit_id = url_parts[1]

        scan_target_path = project.tmp_path / scan_id
        yield alpine_version, commit_id, scan_target_path, scan_result_path, package


def prepare_scan_dir(package_name, scan_target_path, aports_dir_path=None):
    """
    Gather all the package's source files in `scan_target_path`.

    Source files of an alpine package are obtained from its aports directory,
    whose location has to be guessed. Such a directory is present in one of the
    five aports repository subdirectories (main, non-free, testing, community,
    unmaintained). Its name is the same as the value of the corresponding
    package's `name` field (`package_name`).
    Here are some path examples:
        .../aports/main/acf-db
        .../aports/non-free/mongodb
    Inside, there are some extra files (patches) and an APKBUILD which contains
    urls to source tarballs. This function copies all these files (including
    APKBUILD) and downloads all the source tarballs to `scan_target_path`.

    The default value of `aports_dir_path` is the parent of `scan_target_path`.
    Return `scan_target_path` if the package's aports directory is found and is
    not empty, None otherwise.

    NOTE: the copy is done with `shutil.copytree` without `dirs_exist_ok`, so
    `scan_target_path` must not exist yet.
    """
    if aports_dir_path is None:
        aports_dir_path = scan_target_path.parent
    aports_root = aports_dir_path / APORTS_DIR_NAME

    for subdir_name in APORTS_SUBDIRS:
        package_aports_dir = aports_root / subdir_name / package_name
        if not package_aports_dir.exists():
            continue

        # The first existing candidate decides: an empty directory means
        # there is nothing to scan, and the remaining subdirs are not tried.
        if next(package_aports_dir.iterdir(), None) is None:
            return None

        copytree(package_aports_dir, scan_target_path)

        apkbuild_data = alpine.parse_apkbuild(scan_target_path / "APKBUILD").to_dict()
        declared_sources = apkbuild_data.get("extra_data").get("sources") or []
        # Only sources that declare a url are downloaded.
        for declared_source in declared_sources:
            url = declared_source.get("url")
            if not url:
                continue
            fetch(url, scan_target_path)
        return scan_target_path

    return None


def extract_summary_fields(scan_result_path, summary_field_names):
    """
    Extract values from the `summary` section of a scancode result file.

    `scan_result_path` is the path of the scancode JSON result file.
    `summary_field_names` is an iterable of `summary` field names to extract.

    Return a dict mapping each requested field name to the list of truthy
    `value` entries found for that field. A field that is absent from the
    summary maps to an empty list.
    Raise KeyError if the scan result has no `summary` section.
    """
    # The context manager closes the file even when parsing fails.
    with open(scan_result_path, encoding="utf-8") as scan_result:
        summaries = json.load(scan_result)["summary"]

    return {
        field_name: [
            entry["value"]
            for entry in summaries.get(field_name, [])
            if entry["value"]
        ]
        for field_name in summary_field_names
    }


def package_getter(root_dir, **kwargs):
"""
Expand Down
1 change: 1 addition & 0 deletions scanpipe/pipes/docker.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ def scan_image_for_system_packages(project, image, detect_licenses=True):
for i, (purl, package, layer) in enumerate(installed_packages):
logger.info(f"Creating package #{i}: {purl}")
created_package = pipes.update_or_create_package(project, package.to_dict())
created_package.extra_data = {"image_id": image.image_id}

# We have no files for this installed package, we cannot go further.
if not package.installed_files:
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
"root_filesystems = scanpipe.pipelines.root_filesystems:RootFS",
"scan_codebase = scanpipe.pipelines.scan_codebase:ScanCodebase",
"scan_package = scanpipe.pipelines.scan_package:ScanPackage",
"alpine_packages = scanpipe.pipelines.alpine_packages:AlpinePackages"
],
},
classifiers=[
Expand Down

0 comments on commit c8aee9d

Please sign in to comment.