Skip to content

Commit

Permalink
Fixed deduplication of SPDX packages
Browse files Browse the repository at this point in the history
Signed-off-by: Jindrich Luza <jluza@redhat.com>
  • Loading branch information
midnightercz committed Aug 26, 2024
1 parent b7414b6 commit 4a10a9d
Show file tree
Hide file tree
Showing 3 changed files with 125 additions and 65 deletions.
33 changes: 30 additions & 3 deletions cachi2/core/models/sbom.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from typing import Any, Literal, Optional
from typing import Any, Iterable, Literal, Optional

import pydantic

from cachi2.core.models.validators import unique_sorted, unique_sorted_multikey
from cachi2.core.models.validators import unique_sorted

PropertyName = Literal[
"cachi2:found_by",
Expand Down Expand Up @@ -122,6 +122,9 @@ class SPDXPackageExternalRef(pydantic.BaseModel):
referenceLocator: str
referenceType: str

def __hash__(self) -> int:
    """Hash the reference by its locator, type and category."""
    identity = (self.referenceLocator, self.referenceType, self.referenceCategory)
    return hash(identity)


class SPDXPackage(pydantic.BaseModel):
"""SPDX Package.
Expand Down Expand Up @@ -177,6 +180,30 @@ class SPDXCreationInfo(pydantic.BaseModel):
creators: list[str] = []


def deduplicate_spdx_packages(items: Iterable[SPDXPackage]) -> list[SPDXPackage]:
    """Deduplicate SPDX packages and merge their external references.

    Packages that share the same (name, version) pair are collapsed into a single
    new package whose external references are the de-duplicated union of the
    references of all the merged packages.

    The returned packages are sorted by (name, version) and the external references
    of each package are sorted by (referenceLocator, referenceType, referenceCategory).
    """
    # NOTE(review): only name, version and externalRefs are carried over to the
    # merged package -- confirm that no other SPDXPackage field must be preserved.
    merged: dict[tuple[Any, Any], SPDXPackage] = {}
    for item in items:
        key = (item.name, item.version)
        if key not in merged:
            merged[key] = SPDXPackage(name=item.name, version=item.version)
            # Copy the list so that later extend() calls never touch the input package.
            merged[key].externalRefs = item.externalRefs[:]
        else:
            merged[key].externalRefs.extend(item.externalRefs)

    for package in merged.values():
        # The set drops duplicated references; sorting makes the output deterministic.
        package.externalRefs = sorted(
            set(package.externalRefs),
            key=lambda ref: (ref.referenceLocator, ref.referenceType, ref.referenceCategory),
        )

    return sorted(merged.values(), key=lambda package: (package.name, package.version))


class SPDXSbom(pydantic.BaseModel):
"""Software bill of materials in the SPDX format.
Expand All @@ -195,4 +222,4 @@ class SPDXSbom(pydantic.BaseModel):
@pydantic.field_validator("packages")
def _unique_packages(cls, packages: list[SPDXPackage]) -> list[SPDXPackage]:
"""Sort and de-duplicate components."""
return unique_sorted_multikey(packages, by=lambda package: package.key())
return deduplicate_spdx_packages(packages)
38 changes: 1 addition & 37 deletions cachi2/core/models/validators.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import os
from pathlib import Path
from typing import Any, Callable, Iterable, List, TypeVar
from typing import Any, Callable, Iterable, TypeVar

T = TypeVar("T")

Expand All @@ -24,31 +24,6 @@ def unique(items: Iterable[T], by: Callable[[T], Any], dedupe: bool = True) -> l
return list(by_key.values())


def unique_multikey(items: Iterable[T], by: Callable[[T], List[Any]]) -> list[T]:
    """Keep only the first item out of every group of items that share a key.

    The 'by' function must return a list of hashable keys for an item; every value
    in that list is treated as a key on its own.

    An item is kept only if none of its keys belongs to an item that was kept
    earlier. Otherwise the item is silently discarded: nothing is merged and no
    error is raised. Kept items are returned in the order they were first seen.
    """
    # Every key that belongs to an already kept item.
    claimed_keys: set[Any] = set()
    by_key: dict[tuple[Any, ...], T] = {}
    for item in items:
        multi_key = tuple(by(item))
        if any(key in claimed_keys for key in multi_key):
            continue
        claimed_keys.update(multi_key)
        by_key[multi_key] = item
    return list(by_key.values())


def unique_sorted(items: Iterable[T], by: Callable[[T], Any], dedupe: bool = True) -> list[T]:
"""Make sure input items are unique and sort them.
Expand All @@ -59,17 +34,6 @@ def unique_sorted(items: Iterable[T], by: Callable[[T], Any], dedupe: bool = Tru
return unique_items


def unique_sorted_multikey(items: Iterable[T], by: Callable[[T], Any]) -> list[T]:
    """De-duplicate the input items and return them sorted.

    Variant of unique_sorted for items whose key is a list of several values, each of
    which acts as a key on its own, so a single item can have more than one key.
    """
    return sorted(unique_multikey(items, by), key=by)


def check_sane_relpath(path: Path) -> Path:
"""Check that the path is relative and looks sane."""
if path.is_absolute():
Expand Down
119 changes: 94 additions & 25 deletions tests/unit/models/test_sbom.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
SPDXPackage,
SPDXPackageExternalRef,
SPDXSbom,
deduplicate_spdx_packages,
)


Expand Down Expand Up @@ -311,22 +312,6 @@ def test_sort_and_dedupe_packages(self) -> None:
}
],
},
{
"name": "github.com/org/A",
"version": "v1.1.0",
"externalRefs": [
{
"referenceCategory": "PACKAGE-MANAGER",
"referenceLocator": "pkg:golang/github.com/org/A@v1.1.0?repository_id=R1",
"referenceType": "purl",
},
{
"referenceCategory": "PACKAGE-MANAGER",
"referenceLocator": "pkg:golang/github.com/org/A@v1.1.0?repository_id=R2",
"referenceType": "purl",
},
],
},
{
"name": "github.com/org/A",
"version": "v1.0.0",
Expand Down Expand Up @@ -385,7 +370,7 @@ def test_sort_and_dedupe_packages(self) -> None:
],
)
print(sbom.packages)
assert len(sbom.packages) == 6
assert len(sbom.packages) == 5
assert sbom.packages == [
SPDXPackage(
name="bytes",
Expand Down Expand Up @@ -426,18 +411,12 @@ def test_sort_and_dedupe_packages(self) -> None:
referenceCategory="PACKAGE-MANAGER",
referenceLocator="pkg:golang/github.com/org/A@v1.1.0?repository_id=R1",
referenceType="purl",
)
],
),
SPDXPackage(
name="github.com/org/A",
version="v1.1.0",
externalRefs=[
),
SPDXPackageExternalRef(
referenceCategory="PACKAGE-MANAGER",
referenceLocator="pkg:golang/github.com/org/A@v1.1.0?repository_id=R2",
referenceType="purl",
)
),
],
),
SPDXPackage(
Expand All @@ -452,3 +431,93 @@ def test_sort_and_dedupe_packages(self) -> None:
],
),
]


def test_deduplicate_spdx_packages() -> None:
    """Packages sharing name and version are merged together with their external refs."""

    def make_ref(locator: str) -> SPDXPackageExternalRef:
        # Every reference in this test is a package-manager purl.
        return SPDXPackageExternalRef(
            referenceCategory="PACKAGE-MANAGER",
            referenceLocator=locator,
            referenceType="purl",
        )

    def make_package(name: str, locators: list[str]) -> SPDXPackage:
        return SPDXPackage(
            name=name,
            version="v1.0.0",
            externalRefs=[make_ref(locator) for locator in locators],
        )

    a_repo1 = "pkg:golang/github.com/org/A@v1.0.0?repository_id=R1"
    a_repo2 = "pkg:golang/github.com/org/A@v1.0.0?repository_id=R2"
    b_plain = "pkg:golang/github.com/org/B@v1.0.0"
    b_repo1 = "pkg:golang/github.com/org/B@v1.0.0?repository_id=R1"

    packages = [
        make_package("github.com/org/A", [a_repo1]),
        make_package("github.com/org/A", [a_repo1, a_repo2]),
        make_package("github.com/org/B", [b_plain]),
        make_package("github.com/org/B", [b_repo1]),
    ]

    deduped_packages = deduplicate_spdx_packages(packages)

    assert len(deduped_packages) == 2
    assert deduped_packages == [
        make_package("github.com/org/A", [a_repo1, a_repo2]),
        make_package("github.com/org/B", [b_plain, b_repo1]),
    ]

0 comments on commit 4a10a9d

Please sign in to comment.