diff --git a/README.md b/README.md
index ebd079d..ac627cb 100644
--- a/README.md
+++ b/README.md
@@ -49,6 +49,7 @@ Multiple Python scripts are developed to work together with these tools.
 - bomsh_index_ws.py script, which creates a blob index database for a software build workspace.
 - bomsh_sbom.py script, which creates or updates SPDX SBOM documents with OmniBOR info.
 - bomsh_spdx_rpm.py script, which creates or updates SPDX SBOM documents for RPMs built from their src RPM.
+- bomsh_spdx_deb.py script, which creates or updates SPDX SBOM documents for DEBs built from their source package.
 - bomsh_art_tree.py script, which grafts new subtrees or prunes existing subtrees of OmniBOR artifact trees.
 - bomsh_dynlib.py script, which creates a raw_logfile of runtime-dependency fragments for ELF executables.
 - bomsh_pylib.py script, which creates a raw_logfile of runtime-dependency fragments for Python scripts.
@@ -70,7 +71,7 @@ For a quick start of using the Bomsh tool, run the below command:
 $
 $ # the above should take only a few minutes, and the below may take tens of minutes
 $ wget https://buildinfos.debian.net/buildinfo-pool/s/sysstat/sysstat_11.7.3-1_all-amd64-source.buildinfo
-$ bomsh/scripts/bomsh_rebuild_deb.py -f sysstat_11.7.3-1_all-amd64-source.buildinfo -d bomsh/scripts/sample_sysstat_cvedb.json -o outdir2 --syft_sbom --mmdebstrap_no_cleanup
+$ bomsh/scripts/bomsh_rebuild_deb.py -f sysstat_11.7.3-1_all-amd64-source.buildinfo -d bomsh/scripts/sample_sysstat_cvedb.json -o outdir2 --syft_sbom --bomsh_spdx --mmdebstrap_no_cleanup
 $ grep -B1 -A3 CVElist outdir2/bomsher_out/bomsh_logfiles/bomsh_search_jsonfile-details.json

Then explore and inspect all the output files in the outdir/bomsher_out directory,
@@ -82,7 +83,7 @@ contain the constructed OmniBOR tree with relevant metadata for the built RPM/DE
 the bomsh_logfiles/bomsh-index-* files contain the relevant package/blobs database,
 the syft_sbom/omnibor* files contain the syft-generated SPDX SBOM documents with the ExternalRef OmniBOR identifier,
 and the bomsh_sbom/* files contain the SPDX SBOM documents with the ExternalRef OmniBOR identifier
-generated by the bomsh_spdx_rpm.py script.
+generated by the bomsh_spdx_rpm.py or bomsh_spdx_deb.py script.

Compile Bombash and Bomtrace from Source
----------------------------------------
@@ -138,7 +139,7 @@ Except for this difference, all other steps to generate OmniBOR documents are th
By running all C code instead of invoking Python scripts, Bomtrace3 avoids a lot of
process context-switch overhead, significantly improving performance over Bomtrace2.
-Bomtrace2 is a few times (2x to 5x) slower than the baseline run, while Bomtrace3 has only about 10% or 20% runtime overhead.
+Bomtrace2 is a few times (2x to 5x) slower than the baseline run, while Bomtrace3 has only about 20% runtime overhead.
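+The new bomsh_spdx_deb.py script can also be run standalone against the artifacts of an
+earlier rebuild. Below is a minimal sketch, assuming outdir2 was populated by a previous
+bomsh_rebuild_deb.py run as above (the exact .deb file name is illustrative only):
+
+ $ bomsh/scripts/bomsh_spdx_deb.py -l outdir2/bomsher_out/bomsh_logfiles \
+       --debs_dir outdir2/bomsher_out/debs \
+       -F outdir2/bomsher_out/debs/sysstat_11.7.3-1_amd64.deb \
+       -o bomsh_sbom --sbom_server_url http://your.org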
Generating OmniBOR Docs with Bomtrace2
--------------------------------------

diff --git a/scripts/bomsh_rebuild_deb.py b/scripts/bomsh_rebuild_deb.py
index 2981cc5..274f39e 100755
--- a/scripts/bomsh_rebuild_deb.py
+++ b/scripts/bomsh_rebuild_deb.py
@@ -200,7 +200,10 @@ def fix_broken_symlinks(bomsher_outdir):
     /tmp/bomsh_search_cve.py --derive_sbom -b omnibor_dir $cvedb_file_param -f $debfiles -vvv ; cp /tmp/bomsh_search_jsonfile* bomsh_logfiles/ ; \\
     # Extra handling of syft generated SPDX SBOM documents ; \\
     if [ "${SYFT_SBOM}" ]; then /tmp/bomsh_sbom.py -b omnibor_dir -F $debfiles -vv --output_dir syft_sbom --sbom_format spdx --force_insert ; fi ; \\
-    if [ "${SYFT_SBOM}" ]; then /tmp/bomsh_sbom.py -b omnibor_dir -F $debfiles -vv --output_dir syft_sbom --sbom_format spdx-json --force_insert ; fi ;
+    if [ "${SYFT_SBOM}" ]; then /tmp/bomsh_sbom.py -b omnibor_dir -F $debfiles -vv --output_dir syft_sbom --sbom_format spdx-json --force_insert ; fi ; \\
+    # Extra handling of bomsh-spdx generated SPDX SBOM documents ; \\
+    export PYTHONPATH=/root/tools-python/src:/root/beartype:/root/packageurl-python/src ; \\
+    if [ "${BOMSH_SPDX}" ]; then /tmp/bomsh_spdx_deb.py -F $debfiles --output_dir bomsh_sbom --sbom_server_url http://your.org ; fi ;
 '''

 def create_dockerfile(work_dir):
@@ -213,6 +216,14 @@ def create_dockerfile(work_dir):
     else:
         from_str = 'FROM debian:bookworm'
     dockerfile_str = from_str + g_bomsh_dockerfile_str
+    if args.bomsh_spdx:
+        # bomsh_spdx_deb.py requires additional python libraries
+        dockerfile_str = dockerfile_str.replace("rm -rf /var/lib/apt/lists/* ;",
+                "apt install -y python3-requests python3-license-expression python3-uritools python3-rdflib python3-xmltodict python3-yaml ; \\\n"
+                "    cd /root ; git clone https://github.com/spdx/tools-python.git ; \\\n"
+                "    git clone https://github.com/beartype/beartype.git ; \\\n"
+                "    git clone https://github.com/package-url/packageurl-python.git ; \\\n"
+                "    rm -rf /var/lib/apt/lists/* ;")
     dockerfile = os.path.join(work_dir, "Dockerfile")
     write_text_file(dockerfile, dockerfile_str)
@@ -251,6 +262,9 @@ def run_docker(buildinfo_file, output_dir):
     if args.syft_sbom:
         # Generate SBOM document with the syft tool
         docker_cmd += ' -e SYFT_SBOM=1'
+    if args.bomsh_spdx:
+        # Generate SPDX SBOM document with the bomsh_spdx_deb.py tool
+        docker_cmd += ' -e BOMSH_SPDX=1'
     docker_cmd += ' -v ' + output_dir + ':/out $(docker build -t bomsher-deb -q ' + bomsher_indir + ')'
     verbose("==== Here is the docker run command: " + docker_cmd, LEVEL_1)
     os.system(docker_cmd)
@@ -289,6 +303,9 @@ def rtd_parse_options():
     parser.add_argument("--syft_sbom",
                         action = "store_true",
                         help = "run syft to generate DEB SBOM in spdx/spdx-json SBOM format")
+    parser.add_argument("--bomsh_spdx",
+                        action = "store_true",
+                        help = "run bomsh_spdx_deb.py to generate DEB SBOM in spdx-json SBOM format")
     parser.add_argument("--mmdebstrap_no_cleanup",
                         action = "store_true",
                         help = "do not cleanup chroot directory after mmdebstrap run")
diff --git a/scripts/bomsh_rebuild_rpm.py b/scripts/bomsh_rebuild_rpm.py
index cbaff80..f236619 100755
--- a/scripts/bomsh_rebuild_rpm.py
+++ b/scripts/bomsh_rebuild_rpm.py
@@ -141,9 +141,6 @@ def fix_broken_symlinks(bomsher_outdir):
     ./bootstrap && ./configure --enable-mpers=check && make ; \\
     cp src/strace /tmp/bomtrace2 ;

-# Set up SPDX tools-python environment
-RUN cd /root ; git clone https://github.com/spdx/tools-python.git ;
-
 # Bomtrace/Bomsh mock build run to generate OmniBOR documents
 # if BASELINE_REBUILD is not empty, then it will not use
bomtrace2 to run mock, that is, the baseline run. # if CHROOT_CFG is not empty, then the provided mock chroot_cfg will be used, otherwise, default.cfg is used. @@ -175,7 +172,7 @@ def fix_broken_symlinks(bomsher_outdir): if [ "${SYFT_SBOM}" ]; then /tmp/bomsh_sbom.py -b omnibor_dir -F $rpmfiles -vv --output_dir syft_sbom --sbom_format spdx-json ; fi ; \\ # Extra handling of bomsh-spdx generated SPDX SBOM documents ; \\ export PYTHONPATH=/root/tools-python/src ; \\ - if [ "${BOMSH_SPDX}" ]; then /tmp/bomsh_spdx_rpm.py -r $rpmfiles --output_dir bomsh_sbom --sbom_server_url http://your.org ; fi ; + if [ "${BOMSH_SPDX}" ]; then /tmp/bomsh_spdx_rpm.py -F $rpmfiles --output_dir bomsh_sbom --sbom_server_url http://your.org ; fi ; ''' def create_dockerfile(work_dir): @@ -196,6 +193,7 @@ def create_dockerfile(work_dir): # bomsh_spdx_rpm.py requires additional python libraries from pip3 bomsh_dockerfile_str = bomsh_dockerfile_str.replace("dnf clean all ;", "pip3 install requests license-expression beartype uritools rdflib xmltodict pyyaml packageurl-python ; \\\n" + " cd /root ; git clone https://github.com/spdx/tools-python.git ; \\\n" " dnf clean all ;") if args.bomsh_spdx and "almalinux:8" in from_str: # almalinux8 has python3.6 version as default, but we need at least python3.8 version for bomsh_spdx_rpm.py and spdx/tools-python library diff --git a/scripts/bomsh_spdx_deb.py b/scripts/bomsh_spdx_deb.py new file mode 100755 index 0000000..5c62c26 --- /dev/null +++ b/scripts/bomsh_spdx_deb.py @@ -0,0 +1,894 @@ +#! /usr/bin/env python3 +# Copyright (c) 2024 Cisco and/or its affiliates. +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Bomsh script to create SPDX documents for Debian packages built from its src. 
+""" + +import sys +import stat +import os +import logging +import shutil +from datetime import datetime +from typing import List +import tempfile + +import subprocess +import uuid +import secrets +import requests +import argparse + +# for special filename handling with shell +try: + from shlex import quote as cmd_quote +except ImportError: + from pipes import quote as cmd_quote + +from spdx_tools.common.spdx_licensing import spdx_licensing +from spdx_tools.spdx.model import ( + Actor, + ActorType, + Checksum, + ChecksumAlgorithm, + CreationInfo, + Document, + ExternalPackageRef, + ExternalPackageRefCategory, + File, + FileType, + SpdxNoAssertion, + Package, + PackagePurpose, + PackageVerificationCode, + Relationship, + RelationshipType, +) + +from spdx_tools.spdx.model import Checksum, ChecksumAlgorithm, File, PackageVerificationCode +from spdx_tools.spdx.spdx_element_utils import calculate_file_checksum, calculate_package_verification_code + +from spdx_tools.spdx.validation.document_validator import validate_full_spdx_document +from spdx_tools.spdx.validation.validation_message import ValidationMessage +from spdx_tools.spdx.writer.write_anything import write_file +from spdx_tools.spdx.writer.write_utils import convert, validate_and_deduplicate + +import json +from urllib.parse import urlsplit + +from packageurl import PackageURL + +from pathlib import Path +from requests.adapters import HTTPAdapter +from requests.packages.urllib3.util.retry import Retry + +TOOL_VERSION = '0.0.1' +VERSION = '%(prog)s ' + TOOL_VERSION + +args = None + +g_tmpdir = "/tmp" + +# This is the database of blob_id => bom_id mappings +g_bom_mappings_db = None + +# This is the database of blob_id => SBOM-info mappings +g_pkg_sbom_db = None + +# This is the database of pkg_name => blobs/pkg-info mappings +g_pkg_index_db = None + +# This is the server where all the SBOM info goes +SBOM_SERVER_URL = "" + +# This is the server where all the OmniBOR ADG info goes +ADG_SERVER_URL = "" + +# This is the organization name of the SPDX creator +CREATOR_ORG = "your-organization-name" + +# This is the email address of the SPDX creator +CREATOR_EMAIL = "sbom@your-organization-name" + +# This is the dir where all the built DEBs end up +RPMS_DIR = "/out/bomsher_out/debs" + +# TODO: this is for development purposes. Once we figure out how we are going to leverage this script we will +# come up with a better way of identifying where our DB files live +LOGFILE_DIR = "/out/bomsher_out/bomsh_logfiles" + +# This is the file name of the generated file that contains package dependency information for each generated package +PKG_SBOM_DB = "bomsh_search_jsonfile-sbom.json" + +# This is the file name of the generated file that contains package manager information on the dependent packages +PKG_INDEX_DB = "bomsh-index-pkg-db.json" + +# This is the file I created that holds the OS Release environment variables. +OS_REL_INFO = "mock-os-release" + +DB_FN = "bomsh_search_jsonfile-details.json" +BOM_MAPPING_FN = "bomsh_search_jsonfile-bom-mappings.json" + +def get_or_create_dir(destdir): + """ + Create a directory if it does not exist. 
+    Otherwise, return it directly.
+    Returns the absolute path of destdir.
+    """
+    if destdir and os.path.exists(destdir):
+        return os.path.abspath(destdir)
+    os.makedirs(destdir)
+    return os.path.abspath(destdir)
+
+def load_json_db(db_file):
+    """ Load the data from a JSON file
+
+    :param db_file: the JSON database file
+    :returns a dictionary that contains the data
+    """
+    db = dict()
+    with open(db_file, 'r') as f:
+        db = json.load(f)
+    return db
+
+def get_shell_cmd_output(cmd):
+    """
+    Returns the output of the shell command "cmd".
+
+    :param cmd: the shell command to execute
+    """
+    #print (cmd)
+    output = subprocess.check_output(cmd, shell=True, universal_newlines=True)
+    return output
+
+def find_all_regular_files(builddir):
+    """
+    Find all regular files in the build dir, excluding symbolic link files.
+
+    It simply runs the shell's find command and saves the result.
+
+    :param builddir: String, build dir of the workspace
+    :returns a list that contains all the regular file names.
+    """
+    #verbose("entering find_all_regular_files: the build dir is " + builddir, LEVEL_4)
+    builddir = os.path.abspath(builddir)
+    findcmd = "find " + cmd_quote(builddir) + ' -type f -print || true '
+    output = subprocess.check_output(findcmd, shell=True, universal_newlines=True)
+    files = output.splitlines()
+    return files
+
+def unbundle_package(pkgfile, destdir=''):
+    '''
+    Unbundle an RPM/DEB package to destdir.
+    :param pkgfile: the RPM/DEB package file to unbundle
+    :param destdir: the destination directory to save unbundled files
+    '''
+    if not destdir:
+        extract_dir = os.path.join(g_tmpdir, "bomsh_extract_dir")
+        if not os.path.exists(extract_dir):
+            os.makedirs(extract_dir)
+        destdir = os.path.join(extract_dir, os.path.basename(pkgfile) + ".extractdir")
+    if pkgfile[-4:] == ".rpm":
+        cmd = "rm -rf " + destdir + " ; mkdir -p " + destdir + " ; cd " + destdir + " ; rpm2cpio " + pkgfile + " | cpio -idm || true"
+    elif pkgfile[-4:] == ".deb" or pkgfile[-5:] in (".udeb", ".ddeb"):
+        cmd = "rm -rf " + destdir + " ; mkdir -p " + destdir + " ; dpkg-deb -xv " + pkgfile + " " + destdir + " || true"
+    elif pkgfile[-4:] == ".tgz" or pkgfile[-7:] in (".tar.gz", ".tar.xz") or pkgfile[-8:] == ".tar.bz2":
+        cmd = "rm -rf " + destdir + " ; mkdir -p " + destdir + " ; tar -xf " + pkgfile + " -C " + destdir + " || true"
+    else:
+        print("Unsupported package format in " + pkgfile + " file, skipping it.")
+        return ''
+    print(f'Unpacking archive: {pkgfile} to dir: {destdir}\n')
+    get_shell_cmd_output(cmd)
+    return destdir
+
+# This generates a 16-digit unique hex string that can be appended to values to make them unique
+# syft does this via IDByHash()
+# In theory we could have the caller specify the number of digits they want
+def unique():
+    return secrets.token_hex(8)
+
+def deb_unpack(debfile):
+    return unbundle_package(debfile)
+
+# Instead of getting the full pkg info from the DB, this one gets it from the package itself
+def deb_query_pkg(deb_file):
+    cmd_str = f'dpkg -f {deb_file}'
+    query_data = subprocess.run(cmd_str.split(" "), capture_output=True, text=True)
+
+    pkg_info = query_data.stdout.splitlines()
+    pkg_data = parse_pkg_info(pkg_info)
+    return pkg_data
+
+def get_pkg_name(pkg_data):
+    pkg_name = ''
+    if 'Package' in pkg_data:
+        pkg_name = pkg_data['Package']
+    else:
+        pkg_name = pkg_data['Source'] + ".Source"
+    return pkg_name
+
+def deb_pkg_nvra(pkg_data):
+    # rpm prints "Name        : gcc"; dpkg uses the Package field instead
+    #pkg_name = pkg_data['Name']
+    pkg_name = get_pkg_name(pkg_data)
+
+    # "Version     : 8.5.0"
+    pkg_ver = pkg_data['Version']
+
+    # There is no Name or Release field in Debian package data.
+    # rpm would print "Release     : 18.el8.alma"; here we return an empty string instead
+    #pkg_rel = pkg_data['Release']
+
+    # "Architecture: x86_64"
+    pkg_arch = pkg_data['Architecture']
+
+    return (pkg_name, pkg_ver, '', pkg_arch)
+
+def valid_spdx_id(name):
+    # spdx_id must only contain letters, numbers, "." and "-"
+    # If we find a "_" change it to a "-"
+    n = name.replace("_", "-")
+
+    # If there are other substitutions, we can place them here
+    return n
+
+def get_pkg_gitoid(pkg_name):
+    cmd_str = f'git hash-object {pkg_name}'
+    hash_data = subprocess.run(cmd_str.split(" "), capture_output=True, text=True)
+    return hash_data.stdout.strip()
+
+def parse_pkg_info(pkg_info_array):
+    # Our return value
+    pkg_info = dict()
+
+    # Set up a description list to hold all the lines of the description
+    desc_list = list()
+
+    # Get the length of the array for later use
+    end = len(pkg_info_array)
+
+    # Just in case this changes - we only want to specify it once.
+    desc_tag = "Description"
+
+    for idx, l in enumerate(pkg_info_array):
+        if ":" not in l:
+            continue
+
+        # Every line will have a ': ' format except the description field
+        # We only want to split at the first ':' (or else we will be splitting timestamps)
+        v = l.split(':', 1)
+        # The tag will most likely have trailing spaces
+        tag = v[0].strip()
+        if tag == desc_tag:
+            # Just in case we have a value sitting on the same line as the Description
+            desc_list.append(v[1].lstrip())
+            # Everything after the Description tag is the description value
+            break
+
+        # For everything else we just assign the left-trimmed value to the tag
+        value = v[1].lstrip()
+        # There will be stuff like this in the list:
+        #   "Signature   : (none)",
+        #   "Source RPM  : (none)",
+        # We could either put in a tag and a None value or not even put in the tag.
+        # TODO: Are all these fields deemed "required" and thus something that someone would expect
+        # to find?
+        if value != "(none)":
+            pkg_info[tag] = value
+
+    # We are done with the loop. Everything else is the description
+    desc_list += pkg_info_array[idx+1:end]
+    pkg_info[desc_tag] = desc_list
+    #print(pkg_info)
+
+    return pkg_info
+
+def build_pkg_purl(rel_data, pkg_data):
+
+    # scheme:type/namespace/name@version?qualifiers#subpath
+
+    qual_d = dict()
+    arch = pkg_data.get('Architecture')
+    if arch:
+        qual_d['arch'] = arch
+
+    epoch = pkg_data.get('Epoch')
+    if epoch:
+        qual_d['epoch'] = epoch
+
+    # dpkg calls the source-package field 'Source' (the rpm script used 'Source RPM')
+    src_pkg = pkg_data.get('Source')
+    if src_pkg:
+        qual_d['upstream'] = src_pkg
+
+    qual_d['distro'] = f"{rel_data['ID']}"
+    #qual_d['distro'] = f"{rel_data['ID']}-{rel_data['VERSION_ID']}"
+
+    # TODO: Need to figure out where subpath comes into play. Where do I find an example?
+
+    pkg_name = get_pkg_name(pkg_data)
+
+    purl = PackageURL(type='deb',
+                      namespace=rel_data['ID'],
+                      name=pkg_name,
+                      version=f"{pkg_data['Version']}",
+                      #version=f"{pkg_data['Version']}-{pkg_data['Release']}",
+                      qualifiers=qual_d,
+                      subpath=None)
+
+    return str(purl)
+
+def make_purl_ref(pkg_data, os_rel_data):
+    purl = build_pkg_purl(os_rel_data, pkg_data)
+
+    ref = ExternalPackageRef(
+        category=ExternalPackageRefCategory.PACKAGE_MANAGER,
+        reference_type="purl",
+        locator=purl,
+        # comment="external reference comment",
+    )
+
+    return ref
+
+def build_base_spdx(pkg_name, doc_uuid):
+
+    # First up, we need general information about the creation of the document, summarised by the CreationInfo class.
+ creation_info = CreationInfo( + # This is a hard coded value (cisco standard is 2.3 for now) + spdx_version = "SPDX-2.3", + # Another hard coded value + spdx_id = "SPDXRef-DOCUMENT", + # This will be the name of the package for which we are creating the SBOM + name = pkg_name, + # Another hard coded value + data_license = "CC0-1.0", + # This is the current cisco standard + document_namespace = f'{SBOM_SERVER_URL}/spdxdocs/{pkg_name}-{doc_uuid}', + # Not sure about the e-mail, but I left it in as a placeholder. It is an optional parameter + creators=[Actor(ActorType.ORGANIZATION, CREATOR_ORG, CREATOR_EMAIL)], + created=datetime.now() + ) + + # Create our document instance + return Document(creation_info) + +def spdx_add_package(spdx_doc, rpm_file, bom_id, file_verification_code, os_rel_data): + # Only name, spdx_id and download_location are mandatory in SPDX v2.3. + + # This one + pkg_data = deb_query_pkg(rpm_file) + + (pkg_name, pkg_ver, pkg_rel, pkg_arch) = deb_pkg_nvra(pkg_data) + + sha1_hash = calculate_file_checksum(rpm_file, hash_algorithm=ChecksumAlgorithm.SHA1) + md5_hash = calculate_file_checksum(rpm_file, hash_algorithm=ChecksumAlgorithm.MD5) + # This was not parsing. We will have to figure out how to handle non-standard license stuff. + # pkg_license = rpm_query("{LICENSE}", rpm_file) + + package = Package( + name = pkg_name, + spdx_id = f'SPDXRef-Package-{valid_spdx_id(pkg_name)}-{unique()}', + download_location = SpdxNoAssertion(), + license_concluded = SpdxNoAssertion(), + version = f'{pkg_ver}', + #version = f'{pkg_ver}-{pkg_rel}', + file_name = os.path.basename(rpm_file), + # TODO: In theory, we could have the file verification code default to None and if + # there was a value, then set the file_analyzed / verification_code but since we + # know we want all the files in the package this should always be here + files_analyzed = True, + verification_code = file_verification_code, + checksums=[ + Checksum(ChecksumAlgorithm.SHA1, sha1_hash), + Checksum(ChecksumAlgorithm.MD5, md5_hash), + ], + external_references=[ + ExternalPackageRef( + category=ExternalPackageRefCategory.PERSISTENT_ID, + reference_type="gitoid", + locator=f'gitoid:blob:sha1:{bom_id}', + comment=f'{ADG_SERVER_URL}/adg/tree/{bom_id}', + ), + make_purl_ref(pkg_data, os_rel_data) + ] + # license_concluded=spdx_licensing.parse("GPL-2.0-only OR MIT"), + # license_info_from_files=[spdx_licensing.parse("GPL-2.0-only"), spdx_licensing.parse("MIT")], + # license_declared=spdx_licensing.parse("GPL-2.0-or-later"), + # license_comment="license comment", + # supplier=Actor(ActorType.PERSON, "Jane Doe", "jane.doe@example.com"), + # originator=Actor(ActorType.ORGANIZATION, "some organization", "contact@example.com"), + # copyright_text="Copyright 2022 Jane Doe", + # description="package description", + # attribution_texts=["package attribution"], + # primary_package_purpose=PackagePurpose.LIBRARY, + # release_date=datetime(2015, 1, 1), + # ], + ) + + # Now that we have a package defined, we can add it to the document's package property. + spdx_doc.packages = [package] + + # A DESCRIBES relationship asserts that the document indeed describes the package. 
+ describes_relationship = Relationship("SPDXRef-DOCUMENT", RelationshipType.DESCRIBES, package.spdx_id) + spdx_doc.relationships = [describes_relationship] + + return spdx_doc + +def analyze_files(rpm_file, unpack_dir): + # The list of file records + file_list = [] + files = find_all_regular_files(unpack_dir) + len_prefix = len(unpack_dir.rstrip("/")) + 1 + + # We have a choice as to how we want to analyze our files + # We can use the RPM tool to query the package or we can look at the extracted CPIO archive + # A very non-scientific test using sysstat shows that the regular file count (meaning things not directories + # or symlinks) is the same both ways. + # We will use the RPM tool to query the package as the preferred mechanism and use the extracted files when that + # proves unsatisfactory + #####file_data = rpm_query_files(rpm_file) + + for f in files: + # NOTE: Another design choice here. If we want we can also add the file to the reference + # + spdx_file_ref = f'SPDXRef-File-{unique()}' + # The parser does not like filenames that start with '/' + file_name = f[len_prefix:] + file_sha1 = calculate_file_checksum(f, hash_algorithm=ChecksumAlgorithm.SHA1) + file_sha256 = calculate_file_checksum(f, hash_algorithm=ChecksumAlgorithm.SHA256) + file_rec = File( + name=file_name, + spdx_id=spdx_file_ref, + + # TODO: There are ways we can do a better job of figuring out the file type. + # In particular, there are these options from the RPM cmd + # -c, --configfiles list all configuration files + # -d, --docfiles list all documentation files + # -L, --licensefiles list all license files + # -A, --artifactfiles list all artifact files + # and then there is the linux "file" command + + file_types=[FileType.SOURCE], + checksums=[ + Checksum(ChecksumAlgorithm.SHA1, file_sha1), + Checksum(ChecksumAlgorithm.SHA256, file_sha256), + ], + # license_concluded=spdx_licensing.parse("MIT"), + # license_info_in_file=[spdx_licensing.parse("MIT")], + # copyright_text="Copyright 2022 Jane Doe", + ) + + # Append the file record + file_list.append(file_rec) + + return file_list + +def spdx_add_files(spdx_doc, file_list): + for file_rec in file_list: + spdx_doc.files += [file_rec] + + # Create the contains relationship + # TODO: We may want to look into a better way of referencing the package but this will do for now since + # we know we only have one package + contains_relationship = Relationship(spdx_doc.packages[0].spdx_id, RelationshipType.CONTAINS, file_rec.spdx_id) + + # The spdx library uses run-time type checks when assigning properties. + # Because in-place alterations like .append() circumvent these checks, we don't use them here. 
+        spdx_doc.relationships += [contains_relationship]
+    return spdx_doc
+
+def pkg_exists(spdx_doc, pkg):
+    pkg_file_name = f'{os.path.basename(pkg)}.deb'
+
+    for p in spdx_doc.packages:
+        if p.file_name == pkg_file_name:
+            print(f'Found package: {p.file_name}')
+            return p
+    return None
+
+def build_basic_spdx_package(pkg, pkg_db, os_rel_data):
+    db_entry = pkg_db.get(pkg)
+    if not db_entry:
+        print(f'Could not find package in DB: {pkg}')
+        return None
+
+    pkg_data = parse_pkg_info(db_entry['pkg_info'])
+
+    (pkg_name, pkg_ver, pkg_rel, pkg_arch) = deb_pkg_nvra(pkg_data)
+
+    # There are only a couple of mandatory package fields
+    package = Package(
+        name = pkg_name,
+        spdx_id = f'SPDXRef-Package-{valid_spdx_id(pkg_name)}-{unique()}',
+        download_location = SpdxNoAssertion(),
+        files_analyzed = False,
+        # Everything else is optional
+        version = f'{pkg_ver}',
+        file_name = f'{os.path.basename(pkg)}',
+        external_references=[ make_purl_ref(pkg_data, os_rel_data) ]
+    )
+
+    return package
+
+def spdx_add_src_pkg_dependency(spdx_doc, gitoid, sbom_db, pkg_db, os_rel_data, key, dependency):
+    # For this to work, we will need to add each of the packages in the package section (though we could also
+    # add them as external documents)
+
+    # TODO: We should probably harmonize this with the other package add method above
+
+    # Pull out the list of dependent source package names
+    pkg_list = [ _ for _ in sbom_db[gitoid][key].keys()]
+
+    # If we currently don't have a package entry, add one.
+    for pkg in pkg_list:
+        # The summarization of the SBOM detail output leaves the following string in the list
+        # of packages. We don't want that in our output
+        if pkg == "UNKNOWN_COMPONENT_VERSION":
+            continue
+        # Files that were generated during the build do not have an origination package.
+        # They are listed under a synthesized package name with the 'DERIVED_PKG ' prefix.
+        # The input files used to generate such a file do have an origination pkg, and those are captured elsewhere
+        if pkg.startswith('DERIVED_PKG '):
+            continue
+        package = pkg_exists(spdx_doc, pkg)
+        if not package:
+            package = build_basic_spdx_package(pkg, pkg_db, os_rel_data)
+            if not package:
+                # No pkg DB entry, so we cannot build a package record; skip it
+                continue
+            spdx_doc.packages += [package]
+
+        # Then we add the dependency relationship.
+        # We assume that the generated package is always Package 0
+        depends_relationship = Relationship(spdx_doc.packages[0].spdx_id, dependency, package.spdx_id)
+        spdx_doc.relationships += [depends_relationship]
+
+    return spdx_doc
+
+def build_sbom(rpm_file, os_rel_db):
+
+    # We will assume that the package name is the basename of the file provided
+    # If needed we can probably pull / construct this from the package itself
+    pkg_name = os.path.basename(rpm_file)
+
+    # This will be a random value that will be provided for the document namespace
+    doc_uuid = uuid.uuid4()
+
+    # At a minimum, we will need to get some SHA1 information from the package files,
+    # so unpack the DEB and save the directory name
+    unpack_dir = deb_unpack(rpm_file)
+
+    # Build the basic SPDX structure
+    spdx_doc = build_base_spdx(pkg_name, doc_uuid)
+
+    # In order to compute the packageVerificationCode for a package you need to have a list
+    # of the files in the package (along with their SHA1 hash)
+    # packageVerificationCode is mandatory if filesAnalyzed = True
+    file_list = analyze_files(rpm_file, unpack_dir)
+
+    # We want to include the OmniBOR BOM ID for our package
+    pkg_blob_id = get_pkg_gitoid(rpm_file)
+    if pkg_blob_id in g_bom_mappings_db:
+        pkg_bom_id = g_bom_mappings_db[pkg_blob_id]
+    else:
+        pkg_bom_id = pkg_blob_id
+
+    # Now that we have the files, we can calculate the verification_code
+    verification_code = calculate_package_verification_code(file_list)
+
+    # The only package will be the package we generated
+    spdx_doc = spdx_add_package(spdx_doc, rpm_file, pkg_bom_id, verification_code, os_rel_db)
+
+    # Add the files from the package to the document
+    spdx_doc = spdx_add_files(spdx_doc, file_list)
+
+    # Now for the special sauce.
+    # For each file in the package file list, there are input files that were used to create that file.
+    # In some cases, the file came directly from the package upstream source.
+    # The files could also be patched versions of the upstream source, using patches in the distro source package.
+    # In other cases (particularly for binary files) there are a number of source files that were used
+    # in the compilation process, both from the upstream source and other packages.
+    # We want to know all the packages from which all those files came.
+    #
+    # Each file in our package should have a set of those input / source packages
+    #
+    # This information is generated as part of the bomsh script and stored in a JSON file (bomsh_search_jsonfile-sbom.json)
+
+    # This file is indexed by the generated package gitoid and contains the build-time and dynamic-link package dependency info
+    pkg_sbom_db = g_pkg_sbom_db
+
+    # This file is indexed by the package name and contains the build-time package manager info for the package
+    pkg_index_db = g_pkg_index_db
+
+    # Add the build dependency info for the input file source packages
+    # BUILD_DEPENDENCY_OF - Is to be used when SPDXRef-A is a build dependency of SPDXRef-B.
+    # EX: A is in the compile scope of B in a Maven project.
+    spdx_doc = spdx_add_src_pkg_dependency(spdx_doc, pkg_blob_id, pkg_sbom_db, pkg_index_db, os_rel_db,
+                                           'prov_pkgs', RelationshipType.BUILD_DEPENDENCY_OF)
+
+    # These are the run-time dependencies
+    # DEPENDS_ON - Is to be used when SPDXRef-A depends on SPDXRef-B.
+ # EX: Package A depends on the presence of package B in order to build and run + spdx_doc = spdx_add_src_pkg_dependency(spdx_doc, pkg_blob_id, pkg_sbom_db, pkg_index_db, os_rel_db, + 'dyn_libs', RelationshipType.DEPENDS_ON) + + # TODO: At this point we should cleanup our extract dir + + # This library provides comprehensive validation against the SPDX specification. + # Note that details of the validation depend on the SPDX version of the document. + validation_messages: List[ValidationMessage] = validate_full_spdx_document(spdx_doc) + + # You can have a look at each entry's message and context (like spdx_id, parent_id, full_element) + # which will help you pinpoint the location of the invalidity. + for message in validation_messages: + logging.warning(message.validation_message) + logging.warning(message.context) + + return (doc_uuid, pkg_bom_id, spdx_doc, validation_messages) + +def rest_put(url, payload): + headers = { + 'Content-Type': 'application/json' + } + + # Add custom timeout since artifactory seems to have issues at times + retries = Retry(total=3, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504]) + adapter = HTTPAdapter(max_retries=retries) + + session = requests.Session() + session.mount('http://', adapter) + session.mount('https://', adapter) + + # DEBUG + # print ( f' PUT URL = {url}') + + response = session.request("PUT", url, + auth=None, + # auth=(self._art_usr, self._art_token), + headers=headers, data=json.dumps(payload)) + return response + +def build_all_spdx(env_data, tree_db): + if not (SBOM_SERVER_URL and ADG_SERVER_URL): + print("Please configure the SBOM server and OmniBOR ADG server") + return + + SBOM_URL = f"{SBOM_SERVER_URL}/sbom/db/" + OMNIBOR_URL = f"{ADG_SERVER_URL}/adg/db" + + rpm_path = Path(RPMS_DIR) + # We could also add an "if _.suffix == '.rpm'" to the end if we want that restriction + # This will put a full posix path for each RPM in the array + # use .name if you only want the name + rpms = [_ for _ in rpm_path.iterdir() if str(_).endswith(".deb")] + print(rpms) + + # For now there will be a key for each package that we built + for rpm_path in rpms: + gitoid = get_pkg_gitoid(str(rpm_path)) + (doc_uuid, pkg_bom_id, spdx_doc, validation_messages) = build_sbom(str(rpm_path), env_data) + + # If the document is valid, validation_messages will be empty. + if validation_messages != []: + print(f'Could not validate SBOM generated for file: {rpm_path}') + # See if we can process the rest + continue + + # This is basically what the write file command does but it takes the extra step of writing to a file + validated_doc = validate_and_deduplicate(spdx_doc) + sbom_dict = convert(validated_doc, None) + + print(f'Package_name: {rpm_path.name}') + print(f'Package gitoid: {gitoid}') + print(f'Package OmniBOR BOM-ID: {pkg_bom_id}') + + pkg_data = deb_query_pkg(rpm_path) + (pkg_name, pkg_ver, pkg_rel, pkg_arch) = deb_pkg_nvra(pkg_data) + pkg_nvra = ".".join([pkg_name, pkg_ver, pkg_arch]) + + #pkg_nvra = rpm_query_fmt('NVRA', str(rpm_path)) + # If this is a source package, the result will be the string '(none)' + # TODO: Can we make this True / False? 
+ #source_pkg = rpm_query_fmt('SOURCERPM', str(rpm_path)) + # There is no Source for Debian package + #source_pkg = pkg_data['Source'] + ".Source" + + print(f'NVRA = {pkg_nvra}') + #print(f'Source Package: {source_pkg}') + + sbom_payload = { + "sbom_name": rpm_path.name, + "sbom_uuid": str(doc_uuid), + "nvra": pkg_nvra, + #"source_pkg" : source_pkg, + "distro" : env_data['ID'], + #"release" : env_data['VERSION_ID'], + "gitoid": gitoid, + "sbom": sbom_dict + } + #print(sbom_payload) + + response = rest_put(SBOM_URL, sbom_payload) + print(f'Storing SBOM: {response}\n') + + omnibor_payload = { + "gitoid": gitoid, + "pkg_name": rpm_path.name, + "distro" : env_data['ID'], + #"release" : env_data['VERSION_ID'], + "adg": tree_db[gitoid] + } + #print(omnibor_payload) + + response = rest_put(OMNIBOR_URL, omnibor_payload) + print(f'Storing ADG: {response}\n') + + print("==========\n") + +def handle_files(rel_data): + if args.output_dir: + output_dir = get_or_create_dir(args.output_dir) + else: + output_dir = os.getcwd() + + # If we supply an argument then build the SPDX doc for those RPM packages + omnibor_sbom_docs = [] + for rpm_file in args.deb_files.split(","): + if not os.path.exists(rpm_file): + print(f'File ({rpm_file}) does not exist') + exit(1) + + (doc_uuid, pkg_bom_id, spdx_doc, validation_messages) = build_sbom(rpm_file, rel_data) + + # If the document is valid, validation_messages will be empty. + if validation_messages != []: + print(f'Could not validate SBOM generated for file: {rpm_file}') + exit(1) + + # Finally, we can serialize the document to any of the five supported formats. + # Using the write_file() method from the write_anything module, + # the format will be determined by the file ending: .spdx (tag-value), .json, .xml, .yaml. or .rdf (or .rdf.xml) + # The document namespace will be something like this: + # https://sbom.your-org.com/spdxdocs/sysstat-11.7.3-9.el8.src.rpm-b184657e-6b09-48d5-a5fc-df2f106f40b5 + # so the path will be: sysstat-11.7.3-9.el8.src.rpm-b184657e-6b09-48d5-a5fc-df2f106f40b5.spdx.json + output_fn = f'{os.path.basename(urlsplit(spdx_doc.creation_info.document_namespace).path)}.spdx.json' + output_file = os.path.join(output_dir, output_fn) + write_file(spdx_doc, output_file) + omnibor_sbom_docs.append(output_file) + + print("\nDone. All bomsh created SPDX SBOM documents with OmniBOR info are: " + str(omnibor_sbom_docs)) + +######################################### + +def rtd_parse_options(): + """ + Parse command options. 
+ """ + parser = argparse.ArgumentParser( + description = "This tool creates SPDX documents for Debian packages built from its src") + parser.add_argument("--version", + action = "version", + version=VERSION) + parser.add_argument("-F", '--deb_files', + help = "comma-separated list of Debian files to create SPDX documents") + parser.add_argument('-o', '--output_dir', + help = "the output directory to store the created SPDX documents, the default is current dir") + parser.add_argument('--sbom_server_url', + help = "the URL of the SBOM database server") + parser.add_argument('--adg_server_url', + help = "the URL of the OmniBOR ADG database server") + parser.add_argument('--creator_organization', + help = "the organization name of the creator used in SPDX document") + parser.add_argument('--creator_email', + help = "the email address of the creator used in SPDX document") + parser.add_argument("-l", "--logs_dir", + help = "the directory with bomsh log files") + parser.add_argument("--debs_dir", + help = "the directory with Debian package files") + parser.add_argument("--keep_intermediate_files", + action = "store_true", + help = "after run completes, keep all intermediate files like unbundled packages, etc.") + parser.add_argument("-v", "--verbose", + action = "count", + default = 0, + help = "verbose output, can be supplied multiple times" + " to increase verbosity") + + # Parse the command line arguments + args = parser.parse_args() + + global LOGFILE_DIR + if args.logs_dir: + LOGFILE_DIR = args.logs_dir + global RPMS_DIR + if args.debs_dir: + RPMS_DIR = args.debs_dir + global SBOM_SERVER_URL + if args.sbom_server_url: + SBOM_SERVER_URL = args.sbom_server_url + global ADG_SERVER_URL + if args.adg_server_url: + ADG_SERVER_URL = args.adg_server_url + if not ADG_SERVER_URL: + # if adg server is not configured, then make it same as sbom server + ADG_SERVER_URL = SBOM_SERVER_URL + global CREATOR_ORG + if args.creator_organization: + CREATOR_ORG = args.creator_organization + global CREATOR_EMAIL + if args.creator_email: + CREATOR_EMAIL = args.creator_email + + if not (LOGFILE_DIR): + print ("Please specify the directory of bomsh log files with -l option!") + print ("") + parser.print_help() + sys.exit() + + print ("Your command line is:") + print (" ".join(sys.argv)) + print ("The current directory is: " + os.getcwd()) + print ("") + return args + + +def main(): + global args + # parse command line options first + args = rtd_parse_options() + + # We will need the build env OS information regardless of how we build + ENV_PATH = os.path.join(LOGFILE_DIR, OS_REL_INFO) + if not os.path.exists(ENV_PATH): + print(f'File ({ENV_PATH}) does not exist') + exit(1) + + with open(ENV_PATH, 'r') as f: + rel_info = f.readlines() + + DB_PATH = os.path.join(LOGFILE_DIR, DB_FN) + if not os.path.exists(DB_PATH): + print(f'File ({DB_PATH}) does not exist') + exit(1) + + # Pull in our the entire tree DB + tree_db = load_json_db(DB_PATH) + + # Just in case someone hid a second '=' somewhere in there + # We also don't want the redundant double quotes + rel_data = dict([_.strip().replace('"','').split('=', 1) for _ in rel_info if '=' in _]) + print(rel_data) + + # This file is indexed by the generated package gitoid and contains the OmniBOR bom-id info + global g_bom_mappings_db + g_bom_mappings_db = load_json_db(os.path.join(LOGFILE_DIR, BOM_MAPPING_FN)) + + # This file is indexed by the generated package gitoid and contains the build time and dynamic link package dependency info + global g_pkg_sbom_db + g_pkg_sbom_db 
= load_json_db(os.path.join(LOGFILE_DIR, PKG_SBOM_DB)) + + # This file is indexed by the package name and contains the build time package manager info for the package + global g_pkg_index_db + g_pkg_index_db = load_json_db(os.path.join(LOGFILE_DIR, PKG_INDEX_DB)) + + if args.deb_files: + handle_files(rel_data) + else: + build_all_spdx(rel_data, tree_db) + + extract_dir = os.path.join(g_tmpdir, "bomsh_extract_dir") + if os.path.exists(extract_dir) and not args.keep_intermediate_files: + shutil.rmtree(extract_dir) + +if __name__ == "__main__": + main() + diff --git a/scripts/bomsh_spdx_rpm.py b/scripts/bomsh_spdx_rpm.py index c875d18..019684f 100755 --- a/scripts/bomsh_spdx_rpm.py +++ b/scripts/bomsh_spdx_rpm.py @@ -22,6 +22,7 @@ import stat import os import logging +import shutil from datetime import datetime from typing import List import tempfile @@ -74,6 +75,8 @@ args = None +g_tmpdir = "/tmp" + # This is the database of blob_id => bom_id mappings g_bom_mappings_db = None @@ -142,7 +145,7 @@ def unique(): return secrets.token_hex(8) def rpm_unpack(rpm_name): - unpack_base = "/tmp/expand" + unpack_base = os.path.join(g_tmpdir, "bomsh_extract_dir") unpack_cmd_1 = f'/usr/bin/rpm2cpio {rpm_name}' unpack_cmd_2 = 'cpio -idm' @@ -627,11 +630,12 @@ def build_all_spdx(env_data, tree_db): # We could also add an "if _.suffix == '.rpm'" to the end if we want that restriction # This will put a full posix path for each RPM in the array # use .name if you only want the name - rpms = [_ for _ in rpm_path.iterdir()] + rpms = [_ for _ in rpm_path.iterdir() if str(_).endswith(".rpm")] # For now there will be a key for each package that we built for rpm_path in rpms: - (doc_uuid, gitoid, spdx_doc, validation_messages) = build_sbom(str(rpm_path), env_data) + gitoid = get_pkg_gitoid(str(rpm_path)) + (doc_uuid, pkg_bom_id, spdx_doc, validation_messages) = build_sbom(str(rpm_path), env_data) # If the document is valid, validation_messages will be empty. if validation_messages != []: @@ -645,6 +649,7 @@ def build_all_spdx(env_data, tree_db): print(f'Package_name: {rpm_path.name}') print(f'Package gitoid: {gitoid}') + print(f'Package OmniBOR BOM-ID: {pkg_bom_id}') pkg_nvra = rpm_query_fmt('NVRA', str(rpm_path)) # If this is a source package, the result will be the string '(none)' @@ -681,6 +686,39 @@ def build_all_spdx(env_data, tree_db): print("==========\n") +def handle_files(rel_data): + if args.output_dir: + output_dir = get_or_create_dir(args.output_dir) + else: + output_dir = os.getcwd() + + # If we supply an argument then build the SPDX doc for those RPM packages + omnibor_sbom_docs = [] + for rpm_file in args.rpm_files.split(","): + if not os.path.exists(rpm_file): + print(f'File ({rpm_file}) does not exist') + exit(1) + + (doc_uuid, pkg_bom_id, spdx_doc, validation_messages) = build_sbom(rpm_file, rel_data) + + # If the document is valid, validation_messages will be empty. + if validation_messages != []: + print(f'Could not validate SBOM generated for file: {rpm_file}') + exit(1) + + # Finally, we can serialize the document to any of the five supported formats. + # Using the write_file() method from the write_anything module, + # the format will be determined by the file ending: .spdx (tag-value), .json, .xml, .yaml. 
or .rdf (or .rdf.xml) + # The document namespace will be something like this: + # https://sbom.your-org.com/spdxdocs/sysstat-11.7.3-9.el8.src.rpm-b184657e-6b09-48d5-a5fc-df2f106f40b5 + # so the path will be: sysstat-11.7.3-9.el8.src.rpm-b184657e-6b09-48d5-a5fc-df2f106f40b5.spdx.json + output_fn = f'{os.path.basename(urlsplit(spdx_doc.creation_info.document_namespace).path)}.spdx.json' + output_file = os.path.join(output_dir, output_fn) + write_file(spdx_doc, output_file) + omnibor_sbom_docs.append(output_file) + + print("\nDone. All bomsh created SPDX SBOM documents with OmniBOR info are: " + str(omnibor_sbom_docs)) + ######################################### def rtd_parse_options(): @@ -692,7 +730,7 @@ def rtd_parse_options(): parser.add_argument("--version", action = "version", version=VERSION) - parser.add_argument('-r', '--rpm_files', + parser.add_argument('-F', '--rpm_files', help = "comma-separated list of RPM files to create SPDX documents") parser.add_argument('-o', '--output_dir', help = "the output directory to store the created SPDX documents, the default is current dir") @@ -708,6 +746,9 @@ def rtd_parse_options(): help = "the directory with bomsh log files") parser.add_argument("--rpms_dir", help = "the directory with RPM files") + parser.add_argument("--keep_intermediate_files", + action = "store_true", + help = "after run completes, keep all intermediate files like unbundled packages, etc.") parser.add_argument("-v", "--verbose", action = "count", default = 0, @@ -790,41 +831,14 @@ def main(): global g_pkg_index_db g_pkg_index_db = load_json_db(os.path.join(LOGFILE_DIR, PKG_INDEX_DB)) - if not args.rpm_files: - build_all_spdx(rel_data, tree_db) - return - - if args.output_dir: - output_dir = get_or_create_dir(args.output_dir) + if args.rpm_files: + handle_files(rel_data) else: - output_dir = os.getcwd() - - # If we supply an argument then build the SPDX doc for those RPM packages - omnibor_sbom_docs = [] - for rpm_file in args.rpm_files.split(","): - if not os.path.exists(rpm_file): - print(f'File ({rpm_file}) does not exist') - exit(1) - - (doc_uuid, pkg_bom_id, spdx_doc, validation_messages) = build_sbom(rpm_file, rel_data) - - # If the document is valid, validation_messages will be empty. - if validation_messages != []: - print(f'Could not validate SBOM generated for file: {rpm_file}') - exit(1) - - # Finally, we can serialize the document to any of the five supported formats. - # Using the write_file() method from the write_anything module, - # the format will be determined by the file ending: .spdx (tag-value), .json, .xml, .yaml. or .rdf (or .rdf.xml) - # The document namespace will be something like this: - # https://sbom.your-org.com/spdxdocs/sysstat-11.7.3-9.el8.src.rpm-b184657e-6b09-48d5-a5fc-df2f106f40b5 - # so the path will be: sysstat-11.7.3-9.el8.src.rpm-b184657e-6b09-48d5-a5fc-df2f106f40b5.spdx.json - output_fn = f'{os.path.basename(urlsplit(spdx_doc.creation_info.document_namespace).path)}.spdx.json' - output_file = os.path.join(output_dir, output_fn) - write_file(spdx_doc, output_file) - omnibor_sbom_docs.append(output_file) + build_all_spdx(rel_data, tree_db) - print("\nDone. All bomsh created SPDX SBOM documents with OmniBOR info are: " + str(omnibor_sbom_docs)) + extract_dir = os.path.join(g_tmpdir, "bomsh_extract_dir") + if os.path.exists(extract_dir) and not args.keep_intermediate_files: + shutil.rmtree(extract_dir) if __name__ == "__main__":
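Usage note: with this change, bomsh_spdx_rpm.py takes its comma-separated package file
list via -F (matching bomsh_spdx_deb.py) instead of the old -r flag. A minimal standalone
invocation sketch, mirroring the in-container command above (the RPM path and file name
are hypothetical; the name follows the sysstat example used in the code comments):

    $ bomsh/scripts/bomsh_spdx_rpm.py -F /out/bomsher_out/rpms/sysstat-11.7.3-9.el8.x86_64.rpm \
          --output_dir bomsh_sbom --sbom_server_url http://your.org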