Add initial SBOM library implementation

Add implementation of standard-agnostic model for SBOM, and simple SPDX
v2.3 model. Also add convenience functions for converting DNF4 package
set to the standard-agnostic model and for converting it to SPDX model.

Cover the functionality with unit tests.

Signed-off-by: Tomáš Hozza <thozza@redhat.com>
This commit is contained in:
Tomáš Hozza 2024-06-26 13:22:00 +02:00 committed by Simon de Vlieger
parent 75b6fb4abe
commit 0b68f8123b
11 changed files with 1436 additions and 1 deletions

View file

@ -0,0 +1 @@
"""Module for working with Software Bill of Materials (SBOM) files."""

106
osbuild/util/sbom/dnf.py Normal file
View file

@ -0,0 +1,106 @@
from datetime import datetime
from typing import Dict, List
import dnf
import hawkey
import osbuild.util.sbom.model as sbom_model
def bom_chksum_algorithm_from_hawkey(chksum_type: int) -> sbom_model.ChecksumAlgorithm:
    """
    Map a hawkey checksum type constant to the matching SBOM checksum algorithm.

    Raises ValueError if the value matches none of the handled hawkey constants
    (MD5, SHA1, SHA256, SHA384, SHA512).
    """
    # The hawkey constant "CHKSUM_<NAME>" and the SBOM enum member "<NAME>" share the same
    # suffix, so a single list of names drives both lookups. Candidates are checked in order
    # and the hawkey attribute is only resolved when its turn comes.
    for algorithm_name in ("MD5", "SHA1", "SHA256", "SHA384", "SHA512"):
        if chksum_type == getattr(hawkey, f"CHKSUM_{algorithm_name}"):
            return getattr(sbom_model.ChecksumAlgorithm, algorithm_name)
    raise ValueError(f"Unknown Hawkey checksum type: {chksum_type}")
# pylint: disable=too-many-branches
def dnf_pkgset_to_sbom_pkgset(dnf_pkgset: List[dnf.package.Package]) -> List[sbom_model.BasePackage]:
    """
    Convert a dnf package set to a SBOM package set.

    Each dnf package is turned into an sbom_model.RPMPackage carrying its metadata, checksum,
    download / repository URL and RPM dependency lists. Afterwards the "depends_on" and
    "optional_depends_on" sets of every package are filled in with the packages from the same
    set that provide the required / recommended / suggested capabilities or files.

    The returned list contains one entry per distinct package name.

    Raises ValueError (from bom_chksum_algorithm_from_hawkey) if a package uses a checksum
    type that is not supported.
    """
    # Lookup of the converted package by its name; also the source of the returned list.
    # NOTE(review): packages sharing a name (e.g. several versions or architectures of the same
    # package) overwrite each other here - confirm whether such package sets can occur.
    pkgs_by_name = {}
    # Lookup of all packages providing a given capability name or file path.
    pkgs_by_provides: Dict[str, List[sbom_model.BasePackage]] = {}

    # Step 1: convert every dnf package and index it by name and by what it provides.
    for dnf_pkg in dnf_pkgset:
        pkg = sbom_model.RPMPackage(
            name=dnf_pkg.name,
            version=dnf_pkg.version,
            release=dnf_pkg.release,
            architecture=dnf_pkg.arch,
            epoch=dnf_pkg.epoch,
            license_declared=dnf_pkg.license,
            vendor=dnf_pkg.vendor,
            build_date=datetime.fromtimestamp(dnf_pkg.buildtime),
            summary=dnf_pkg.summary,
            description=dnf_pkg.description,
            source_rpm=dnf_pkg.sourcerpm,
            homepage=dnf_pkg.url,
        )

        # dnf_pkg.chksum is used as a (hawkey checksum type, raw digest) pair; the digest is
        # stored as a hex string.
        if dnf_pkg.chksum:
            pkg.checksums = {
                bom_chksum_algorithm_from_hawkey(dnf_pkg.chksum[0]): dnf_pkg.chksum[1].hex()
            }

        if dnf_pkg.remote_location():
            pkg.download_url = dnf_pkg.remote_location()

        # if dnf_pkg.from_repo is empty, the pkg is not installed. determine from remote_location
        # if dnf_pkg.from_repo is "@commandline", the pkg was installed from the command line, there is no repo URL
        # if dnf_pkg.reponame is "@System", the package is installed and there is no repo URL
        # if dnf_pkg.from_repo is a string with repo ID, determine the repo URL from the repo configuration
        if not dnf_pkg.from_repo and dnf_pkg.remote_location():
            # Strip the trailing "/<relativepath>" from the download URL to get the repository URL.
            pkg.repository_url = dnf_pkg.remote_location()[:-len("/" + dnf_pkg.relativepath)]
        elif dnf_pkg.from_repo != "@commandline" and dnf_pkg.reponame != "@System":
            # Prefer the base URL, then the metalink, then the mirrorlist; empty string if none is set.
            # NOTE(review): the value is stored as-is; verify that repo.baseurl is a plain string
            # here and not a list of URLs.
            repo_url = ""
            if dnf_pkg.repo.baseurl:
                repo_url = dnf_pkg.repo.baseurl
            elif dnf_pkg.repo.metalink:
                repo_url = dnf_pkg.repo.metalink
            elif dnf_pkg.repo.mirrorlist:
                repo_url = dnf_pkg.repo.mirrorlist
            pkg.repository_url = repo_url

        pkg.rpm_provides = [sbom_model.RPMDependency(r.name, r.relation, r.version) for r in dnf_pkg.provides]
        pkg.rpm_requires = [sbom_model.RPMDependency(r.name, r.relation, r.version) for r in dnf_pkg.requires]
        pkg.rpm_recommends = [sbom_model.RPMDependency(r.name, r.relation, r.version) for r in dnf_pkg.recommends]
        pkg.rpm_suggests = [sbom_model.RPMDependency(r.name, r.relation, r.version) for r in dnf_pkg.suggests]

        # The dnf_pkgset is not sorted by package dependencies. We need to determine relationships in two steps:
        # 1. Collect all packages that provide a certain capability
        # 2. Resolve dependencies for each package using previously constructed list of capabilities by package.
        # Doing this in two steps ensures that all soft dependencies satisfied by a package from the same set are
        # resolved.
        for provide in pkg.rpm_provides:
            pkgs_by_provides.setdefault(provide.name, []).append(pkg)
        # Packages can also depend directly on files provided by other packages. Collect these as well.
        for provided_file in dnf_pkg.files:
            pkgs_by_provides.setdefault(provided_file, []).append(pkg)

        pkgs_by_name[pkg.name] = pkg

    # Step 2: resolve hard and soft dependencies against the providers collected above.
    # Only the dependency name is matched; relation and version are not compared.
    for pkg in pkgs_by_name.values():
        for require in pkg.rpm_requires:
            # skip conditional dependencies if the required package is not in the set
            # "relation" contains whitespace on both sides
            if require.relation.strip() == "if" and pkgs_by_name.get(require.version) is None:
                continue
            for provider_pkg in pkgs_by_provides.get(require.name, []):
                pkg.depends_on.add(provider_pkg)

        # Recommends and Suggests are both recorded as optional dependencies.
        for soft_dep in pkg.rpm_recommends + pkg.rpm_suggests:
            for provider_pkg in pkgs_by_provides.get(soft_dep.name, []):
                pkg.optional_depends_on.add(provider_pkg)

    return list(pkgs_by_name.values())

185
osbuild/util/sbom/model.py Normal file
View file

@ -0,0 +1,185 @@
"""Defines standard-agnostic data model for an SBOM."""
import abc
import urllib.parse
import uuid
from datetime import datetime
from enum import Enum, auto
from typing import Dict, List, Optional, Set
class ChecksumAlgorithm(Enum):
    """Checksum algorithms that can be attached to a package in the standard-agnostic model."""
    SHA1 = auto()
    SHA224 = auto()
    SHA256 = auto()
    SHA384 = auto()
    SHA512 = auto()
    MD5 = auto()
class BasePackage(abc.ABC):
    """
    Standard-agnostic description of a single software package.

    Concrete package types subclass this and implement uuid(), source_info() and purl().
    """

    # pylint: disable=too-many-instance-attributes

    def __init__(
        self,
        name: str,
        version: str,
        filename: str = "",
        license_declared: str = "",
        vendor: str = "",
        checksums: Optional[Dict[ChecksumAlgorithm, str]] = None,
        homepage: str = "",
        download_url: str = "",
        build_date: Optional[datetime] = None,
        summary: str = "",
        description: str = "",
        depends_on: Optional[Set["BasePackage"]] = None,
        optional_depends_on: Optional[Set["BasePackage"]] = None,
    ) -> None:
        """
        Store the package metadata.

        Falsy "checksums", "depends_on" and "optional_depends_on" arguments are replaced by a
        fresh empty container, so instances never share these containers by default.
        """
        # Identity
        self.name = name
        self.version = version
        self.filename = filename

        # Descriptive metadata
        self.license_declared = license_declared
        self.vendor = vendor
        self.homepage = homepage
        self.download_url = download_url
        self.build_date = build_date
        self.summary = summary
        self.description = description

        # Containers
        self.checksums = checksums if checksums else {}
        self.depends_on = depends_on if depends_on else set()
        self.optional_depends_on = optional_depends_on if optional_depends_on else set()

    @abc.abstractmethod
    def uuid(self) -> str:
        """
        Returns a stable UUID for the package.
        """

    @abc.abstractmethod
    def source_info(self) -> str:
        """
        Return a string describing the source of the package.
        """

    @abc.abstractmethod
    def purl(self) -> str:
        """
        Return a Package URL for the package.

        The PURL format is:
        pkg:<type>/<namespace>/<name>@<version>?<qualifiers>#<subpath>

        Core PURL spec is defined at:
        https://github.com/package-url/purl-spec/blob/master/PURL-SPECIFICATION.rst
        """
class RPMDependency:
    """A single RPM dependency or provided capability: a name with an optional relation and version."""

    def __init__(self, name: str, relation: str = "", version: str = "") -> None:
        """Store the capability name together with its (possibly empty) relation and version."""
        self.name, self.relation, self.version = name, relation, version

    def __str__(self) -> str:
        """Return name, relation and version separated by single spaces (empty parts are kept)."""
        return "{} {} {}".format(self.name, self.relation, self.version)
class RPMPackage(BasePackage):
    """An RPM package: the generic package metadata plus RPM-specific fields and dependency lists."""

    def __init__(
        self,
        name: str,
        version: str,
        release: str,
        architecture: str,
        epoch: int = 0,
        filename: str = "",
        license_declared: str = "",
        vendor: str = "",
        checksums: Optional[Dict[ChecksumAlgorithm, str]] = None,
        homepage: str = "",
        download_url: str = "",
        build_date: Optional[datetime] = None,
        summary: str = "",
        description: str = "",
        depends_on: Optional[Set["BasePackage"]] = None,
        optional_depends_on: Optional[Set["BasePackage"]] = None,
        repository_url: str = "",
        source_rpm: str = "",
        rpm_provides: Optional[List[RPMDependency]] = None,
        rpm_requires: Optional[List[RPMDependency]] = None,
        rpm_recommends: Optional[List[RPMDependency]] = None,
        rpm_suggests: Optional[List[RPMDependency]] = None,
    ) -> None:
        """
        Store the generic metadata via BasePackage and the RPM-specific fields on this instance.

        Falsy dependency list arguments are replaced by a fresh empty list.
        """
        super().__init__(
            name=name,
            version=version,
            filename=filename,
            license_declared=license_declared,
            vendor=vendor,
            checksums=checksums,
            homepage=homepage,
            download_url=download_url,
            build_date=build_date,
            summary=summary,
            description=description,
            depends_on=depends_on,
            optional_depends_on=optional_depends_on,
        )

        # RPM identity (NEVRA parts not covered by the base class)
        self.release = release
        self.architecture = architecture
        self.epoch = epoch

        # Origin of the package
        self.repository_url = repository_url
        self.source_rpm = source_rpm

        # Raw RPM dependency metadata
        self.rpm_provides = rpm_provides if rpm_provides else []
        self.rpm_requires = rpm_requires if rpm_requires else []
        self.rpm_recommends = rpm_recommends if rpm_recommends else []
        self.rpm_suggests = rpm_suggests if rpm_suggests else []

    def source_info(self) -> str:
        """
        Return "Source RPM: <source_rpm>" when the source RPM is known, otherwise an empty string.
        """
        return f"Source RPM: {self.source_rpm}" if self.source_rpm else ""

    def uuid(self) -> str:
        """
        Return a stable UUID for the RPM package.

        The UUID is a name-based (version 3) UUID in the URL namespace, derived from the PURL
        without the repository URL, so the same package gets the same UUID regardless of the
        repository it came from.
        """
        repo_independent_purl = self._purl(with_repo_url=False)
        return str(uuid.uuid3(uuid.NAMESPACE_URL, repo_independent_purl))

    def _purl(self, with_repo_url=True) -> str:
        """
        Return a Package URL for the RPM package.

        The lower-cased, percent-encoded vendor is used as the PURL namespace when set. The
        "arch" qualifier is always present; "epoch" is added only for a non-zero epoch and
        "repository_url" only when requested and known.

        Leaving out the repository URL is useful to generate a PURL that identifies the same
        package regardless of the repository it was found in.

        PURL spec for RPMs is defined at:
        https://github.com/package-url/purl-spec/blob/master/PURL-TYPES.rst#rpm
        """
        vendor_namespace = f"{urllib.parse.quote(self.vendor.lower())}/" if self.vendor else ""

        qualifiers = [f"arch={self.architecture}"]
        if self.epoch:
            qualifiers.append(f"epoch={self.epoch}")
        if with_repo_url and self.repository_url:
            # https://github.com/package-url/purl-spec/blob/master/PURL-SPECIFICATION.rst#character-encoding
            qualifiers.append(f"repository_url={urllib.parse.quote(self.repository_url, safe='/:=')}")

        return f"pkg:rpm/{vendor_namespace}{self.name}@{self.version}-{self.release}?" + "&".join(qualifiers)

    def purl(self) -> str:
        """Return the full Package URL of the RPM package, including the repository URL if known."""
        return self._purl()

123
osbuild/util/sbom/spdx.py Normal file
View file

@ -0,0 +1,123 @@
from datetime import datetime
from typing import List, Union
from uuid import uuid4
import osbuild
import osbuild.util.sbom.model as sbom_model
import osbuild.util.sbom.spdx2 as spdx2
def spdx2_checksum_algorithm(algorithm: sbom_model.ChecksumAlgorithm) -> spdx2.ChecksumAlgorithm:
    """
    Translate a standard-agnostic checksum algorithm to the SPDX v2 checksum algorithm of the same name.

    Raises ValueError if the given value matches none of the handled algorithms.
    """
    # Both enums use the same member names for the algorithms handled here.
    for member_name in ("SHA1", "SHA224", "SHA256", "SHA384", "SHA512", "MD5"):
        if algorithm == getattr(sbom_model.ChecksumAlgorithm, member_name):
            return getattr(spdx2.ChecksumAlgorithm, member_name)
    raise ValueError(f"Unknown checksum algorithm: {algorithm}")
def create_spdx2_document():
    """
    Create an empty SPDX 2.3 document attributed to the running osbuild version.

    The document is named "sbom-by-osbuild-<version>", gets a namespace made unique by a
    random UUID, lists osbuild as the creating tool and is stamped with the current time.
    """
    tool_name = f"osbuild-{osbuild.__version__}"
    document_name = f"sbom-by-{tool_name}"

    return spdx2.Document(
        spdx2.CreationInfo(
            spdx_version="SPDX-2.3",
            spdx_id="SPDXRef-DOCUMENT",
            name=document_name,
            data_license="CC0-1.0",
            document_namespace=f"https://osbuild.org/spdxdocs/{document_name}-{uuid4()}",
            creators=[spdx2.Creator(spdx2.CreatorType.TOOL, tool_name)],
            created=datetime.now(),
        )
    )
def bom_pkgset_to_spdx2_doc(pkgset: List[sbom_model.BasePackage]) -> spdx2.Document:
    """
    Build an SPDX v2 document describing the given SBOM package set.

    Every package becomes an SPDX package with the ID "SPDXRef-<package uuid>" and a PURL
    external reference. For each package the document gets, in this order:
    - a DESCRIBES relationship from the document to the package,
    - a DEPENDS_ON relationship from the package to each of its dependencies,
    - an OPTIONAL_DEPENDENCY_OF relationship from each optional dependency to the package.
    Dependencies are emitted sorted by UUID to keep the output stable.
    """
    def uuid_of(package: sbom_model.BasePackage) -> str:
        return package.uuid()

    document = create_spdx2_document()
    all_relationships = []

    for bom_pkg in pkgset:
        # SPDX requires a download location; fall back to NOASSERTION when it is unknown.
        location: Union[str, spdx2.NoAssertionValue] = bom_pkg.download_url or spdx2.NoAssertionValue()

        spdx_pkg = spdx2.Package(
            spdx_id=f"SPDXRef-{bom_pkg.uuid()}",
            name=bom_pkg.name,
            download_location=location,
            version=bom_pkg.version,
            files_analyzed=False,
            license_declared=bom_pkg.license_declared,
            external_references=[
                spdx2.ExternalPackageRef(
                    category=spdx2.ExternalPackageRefCategory.PACKAGE_MANAGER,
                    reference_type="purl",
                    locator=bom_pkg.purl(),
                )
            ]
        )

        # Optional fields are only copied over when the source package has a value for them.
        for field_name in ("homepage", "summary", "description"):
            field_value = getattr(bom_pkg, field_name)
            if field_value:
                setattr(spdx_pkg, field_name, field_value)

        source_info = bom_pkg.source_info()
        if source_info:
            spdx_pkg.source_info = source_info

        spdx_pkg.checksums.extend(
            spdx2.Checksum(algorithm=spdx2_checksum_algorithm(hash_type), value=hash_value)
            for hash_type, hash_value in bom_pkg.checksums.items()
        )

        if bom_pkg.build_date:
            spdx_pkg.built_date = bom_pkg.build_date

        document.packages.append(spdx_pkg)

        all_relationships.append(
            spdx2.Relationship(
                spdx_element_id=document.creation_info.spdx_id,
                relationship_type=spdx2.RelationshipType.DESCRIBES,
                related_spdx_element_id=spdx_pkg.spdx_id,
            )
        )

        all_relationships.extend(
            spdx2.Relationship(
                spdx_element_id=spdx_pkg.spdx_id,
                relationship_type=spdx2.RelationshipType.DEPENDS_ON,
                related_spdx_element_id=f"SPDXRef-{dependency.uuid()}",
            )
            for dependency in sorted(bom_pkg.depends_on, key=uuid_of)
        )

        all_relationships.extend(
            spdx2.Relationship(
                spdx_element_id=f"SPDXRef-{optional_dependency.uuid()}",
                relationship_type=spdx2.RelationshipType.OPTIONAL_DEPENDENCY_OF,
                related_spdx_element_id=spdx_pkg.spdx_id,
            )
            for optional_dependency in sorted(bom_pkg.optional_depends_on, key=uuid_of)
        )

    document.relationships = all_relationships
    return document

View file

@ -0,0 +1,33 @@
"""Module for creating SPDX spec v2 Software Bill of Materials (SBOM) files."""
from .model import (
Checksum,
ChecksumAlgorithm,
CreationInfo,
Creator,
CreatorType,
Document,
ExternalPackageRef,
ExternalPackageRefCategory,
NoAssertionValue,
NoneValue,
Package,
Relationship,
RelationshipType,
)
# Public API of the package: exactly the names re-exported from .model above.
__all__ = [
    "Checksum",
    "ChecksumAlgorithm",
    "CreationInfo",
    "Creator",
    "CreatorType",
    "Document",
    "ExternalPackageRef",
    "ExternalPackageRefCategory",
    "NoAssertionValue",
    "NoneValue",
    "Package",
    "Relationship",
    "RelationshipType"
]

View file

@ -0,0 +1,338 @@
"""
A base implementation of SPDX 2.3 model, as described on:
https://spdx.github.io/spdx-spec/v2.3/
"""
import re
from datetime import datetime, timezone
from enum import Enum, auto
from typing import Dict, List, Optional, Union
class CreatorType(Enum):
    """Enumeration of SPDX actor types."""

    PERSON = auto()
    ORGANIZATION = auto()
    TOOL = auto()

    def __str__(self) -> str:
        """Return the member name with only its first letter in upper case, e.g. "Person"."""
        label = self.name
        return label[:1].upper() + label[1:].lower()
class Creator:
    """Represents a Creator in SPDX: an actor type, a name and an optional e-mail address."""

    def __init__(self, creator_type: CreatorType, name: str, email: Optional[str] = None) -> None:
        """Store the creator type, name and optional e-mail address."""
        self.creator_type = creator_type
        self.name = name
        self.email = email

    def __str__(self):
        """Return "<type>: <name>", followed by " (<email>)" when an e-mail address is set."""
        rendered = f"{self.creator_type}: {self.name}"
        if self.email:
            rendered += f" ({self.email})"
        return rendered
class EntityWithSpdxId():
    """
    Represents an SPDX entity with an SPDX ID.

    An SPDX ID has the form "SPDXRef-<idstring>", where <idstring> consists of
    letters, digits, "." and "-" only.

    https://spdx.github.io/spdx-spec/v2.3/package-information/#72-package-spdx-identifier-field
    """

    # Compiled once for the class instead of on every instantiation.
    _SPDX_ID_REGEX = re.compile(r"SPDXRef-[a-zA-Z0-9\.\-]+")

    def __init__(self, spdx_id: str) -> None:
        """
        Validate and store the SPDX ID.

        Raises ValueError if the ID does not have the "SPDXRef-<idstring>" form.
        """
        # fullmatch() is used rather than match() with a "$" anchor: "$" also matches right
        # before a trailing newline, which would let an ID such as "SPDXRef-abc\n" through.
        if not self._SPDX_ID_REGEX.fullmatch(spdx_id):
            raise ValueError(f"Invalid SPDX ID '{spdx_id}'")
        self.spdx_id = spdx_id
def datetime_to_iso8601(dt: datetime) -> str:
    """
    Render a datetime as the ISO8601 string form required by SPDX.

    The value is converted to UTC, truncated to whole seconds (SPDX does not support
    microseconds) and suffixed with "Z", e.g. "2024-06-26T11:22:00Z".

    https://spdx.github.io/spdx-spec/v2.3/document-creation-information/#69-created-field
    """
    # Dropping tzinfo after the conversion keeps isoformat() from emitting a "+00:00" offset.
    utc_seconds = dt.astimezone(timezone.utc).replace(tzinfo=None, microsecond=0)
    return f"{utc_seconds.isoformat()}Z"
class CreationInfo(EntityWithSpdxId):
    """
    Represents SPDX creation information.

    https://spdx.github.io/spdx-spec/v2.3/document-creation-information/
    """

    def __init__(
        self,
        spdx_version: str,
        spdx_id: str,
        name: str,
        document_namespace: str,
        creators: List[Creator],
        created: datetime,
        data_license: str = "CC0-1.0",
    ) -> None:
        """
        Validate and store the document creation information.

        Raises ValueError if the SPDX ID is malformed or is not "SPDXRef-DOCUMENT", or if the
        SPDX version does not start with "SPDX-".
        """
        super().__init__(spdx_id)

        version_is_valid = spdx_version.startswith("SPDX-")
        if not version_is_valid:
            raise ValueError(f"Invalid SPDX version '{spdx_version}'")

        # The document itself must always use the fixed ID.
        if spdx_id != "SPDXRef-DOCUMENT":
            raise ValueError(f"Invalid SPDX ID '{spdx_id}'")

        self.spdx_version = spdx_version
        self.name = name
        self.data_license = data_license
        self.document_namespace = document_namespace
        self.creators = creators
        self.created = created

    def to_dict(self):
        """Return the document-level fields as a dictionary using the SPDX JSON key names."""
        creation_info = {
            "created": datetime_to_iso8601(self.created),
            "creators": list(map(str, self.creators)),
        }
        return {
            "SPDXID": self.spdx_id,
            "creationInfo": creation_info,
            "dataLicense": self.data_license,
            "name": self.name,
            "spdxVersion": self.spdx_version,
            "documentNamespace": self.document_namespace,
        }
class NoAssertionValue:
    """Represents the SPDX No Assertion value."""

    # String form used when the value is serialized.
    VALUE = "NOASSERTION"

    def __str__(self):
        """Return the "NOASSERTION" marker string."""
        return self.VALUE
class NoneValue:
    """Represents the SPDX None value."""

    # String form used when the value is serialized.
    VALUE = "NONE"

    def __str__(self):
        """Return the "NONE" marker string."""
        return self.VALUE
class ExternalPackageRefCategory(Enum):
    """Enumeration of external package reference categories."""

    SECURITY = auto()
    PACKAGE_MANAGER = auto()
    PERSISTENT_ID = auto()
    OTHER = auto()

    def __str__(self) -> str:
        """Return the member name with underscores turned into hyphens, e.g. "PACKAGE-MANAGER"."""
        return "-".join(self.name.split("_"))
# Reference types accepted for each external reference category. ExternalPackageRef rejects a
# reference type that is not listed for its category; an empty list (OTHER) disables the check,
# so any reference type is accepted for that category.
CATEGORY_TO_REPOSITORY_TYPE: Dict[ExternalPackageRefCategory, List[str]] = {
    ExternalPackageRefCategory.SECURITY: ["cpe22Type", "cpe23Type", "advisory", "fix", "url", "swid"],
    ExternalPackageRefCategory.PACKAGE_MANAGER: ["maven-central", "nuget", "bower", "purl"],
    ExternalPackageRefCategory.PERSISTENT_ID: ["swh", "gitoid"],
    ExternalPackageRefCategory.OTHER: [],
}
class ExternalPackageRef:
    """
    Represents an external package reference.

    https://spdx.github.io/spdx-spec/v2.3/package-information/#721-external-reference-field
    """

    def __init__(self, category: ExternalPackageRefCategory, reference_type: str, locator: str) -> None:
        """
        Validate and store the reference.

        Raises ValueError if the category restricts its reference types and the given
        reference type is not one of them.
        """
        # An empty list means the category accepts any reference type.
        allowed_types = CATEGORY_TO_REPOSITORY_TYPE[category]
        if allowed_types and reference_type not in allowed_types:
            raise ValueError(f"Invalid repository type '{reference_type}' for category '{category}'")

        self.category = category
        self.reference_type = reference_type
        self.locator = locator

    def to_dict(self):
        """Return the reference as a dictionary using the SPDX JSON key names."""
        serialized = {}
        serialized["referenceCategory"] = str(self.category)
        serialized["referenceType"] = self.reference_type
        serialized["referenceLocator"] = self.locator
        return serialized
class ChecksumAlgorithm(Enum):
    """Enumeration of SPDX checksum algorithms."""

    # SHA-1 / SHA-2 family
    SHA1 = auto()
    SHA224 = auto()
    SHA256 = auto()
    SHA384 = auto()
    SHA512 = auto()
    # SHA-3 family
    SHA3_256 = auto()
    SHA3_384 = auto()
    SHA3_512 = auto()
    # BLAKE family
    BLAKE2b_256 = auto()
    BLAKE2b_384 = auto()
    BLAKE2b_512 = auto()
    BLAKE3 = auto()
    # MD family
    MD2 = auto()
    MD4 = auto()
    MD5 = auto()
    MD6 = auto()
    # Other
    ADLER32 = auto()

    def __str__(self) -> str:
        """
        Return the member name with underscores turned into hyphens, e.g. "BLAKE2b-256".

        NOTE(review): the SPDX 2.3 JSON schema appears to spell the SHA3 variants with an
        underscore (e.g. "SHA3_256") - confirm before emitting those members in JSON output.
        """
        return self.name.translate(str.maketrans("_", "-"))
class Checksum:
    """
    Represents a checksum: an algorithm together with the checksum value.

    https://spdx.github.io/spdx-spec/v2.3/package-information/#72-checksum-fields
    """

    def __init__(self, algorithm: ChecksumAlgorithm, value: str) -> None:
        """Store the checksum algorithm and the checksum value."""
        self.algorithm, self.value = algorithm, value

    def to_dict(self):
        """Return the checksum as a dictionary using the SPDX JSON key names."""
        return dict(algorithm=str(self.algorithm), checksumValue=self.value)
# pylint: disable=too-many-instance-attributes
class Package(EntityWithSpdxId):
    """Represents an SPDX package."""

    def __init__(
        self,
        spdx_id: str,
        name: str,
        download_location: Union[str, NoAssertionValue, NoneValue],
        version: Optional[str] = None,
        files_analyzed: Optional[bool] = None,
        checksums: Optional[List[Checksum]] = None,
        homepage: Optional[Union[str, NoAssertionValue, NoneValue]] = None,
        source_info: Optional[str] = None,
        license_declared: Optional[Union[str, NoAssertionValue, NoneValue]] = None,
        summary: Optional[str] = None,
        description: Optional[str] = None,
        external_references: Optional[List[ExternalPackageRef]] = None,
        built_date: Optional[datetime] = None,
    ) -> None:
        """
        Validate the SPDX ID and store the package fields.

        Falsy "checksums" and "external_references" arguments are replaced by a fresh empty list.
        Raises ValueError if the SPDX ID is malformed.
        """
        super().__init__(spdx_id)

        # Mandatory fields
        self.name = name
        self.download_location = download_location

        # Optional scalar fields
        self.version = version
        self.files_analyzed = files_analyzed
        self.homepage = homepage
        self.source_info = source_info
        self.license_declared = license_declared
        self.summary = summary
        self.description = description
        self.built_date = built_date

        # Optional list fields
        self.checksums = checksums if checksums else []
        self.external_references = external_references if external_references else []

    def to_dict(self):
        """
        Return the package as a dictionary using the SPDX JSON key names.

        "SPDXID", "name" and "downloadLocation" are always present. "filesAnalyzed" is emitted
        whenever it is not None (so an explicit False is kept); every other optional field is
        emitted only when it has a truthy value.
        """
        serialized = {
            "SPDXID": self.spdx_id,
            "name": self.name,
            "downloadLocation": str(self.download_location)
        }

        if self.files_analyzed is not None:
            serialized["filesAnalyzed"] = self.files_analyzed

        # (JSON key, raw value, serializer or None to store the value as-is), in output order.
        optional_fields = (
            ("versionInfo", self.version, None),
            ("checksums", self.checksums, lambda items: [item.to_dict() for item in items]),
            ("homepage", self.homepage, str),
            ("sourceInfo", self.source_info, None),
            ("licenseDeclared", self.license_declared, str),
            ("summary", self.summary, None),
            ("description", self.description, None),
            ("externalRefs", self.external_references, lambda refs: [ref.to_dict() for ref in refs]),
            ("builtDate", self.built_date, datetime_to_iso8601),
        )
        for json_key, raw_value, serializer in optional_fields:
            if raw_value:
                serialized[json_key] = serializer(raw_value) if serializer else raw_value

        return serialized
class RelationshipType(Enum):
    """Enumeration of SPDX relationship types."""

    DESCRIBES = auto()
    DEPENDS_ON = auto()
    OPTIONAL_DEPENDENCY_OF = auto()

    def __str__(self) -> str:
        """Return the member name unchanged, e.g. "DEPENDS_ON"."""
        member_name = self.name
        return member_name
class Relationship:
    """Represents a relationship between SPDX elements."""

    def __init__(
        self,
        spdx_element_id: str,
        relationship_type: RelationshipType,
        related_spdx_element_id: Union[str, NoneValue, NoAssertionValue],
        comment: Optional[str] = None,
    ) -> None:
        """Store both ends of the relationship, its type and an optional comment."""
        self.spdx_element_id = spdx_element_id
        self.relationship_type = relationship_type
        self.related_spdx_element_id = related_spdx_element_id
        self.comment = comment

    def to_dict(self):
        """
        Return the relationship as a dictionary using the SPDX JSON key names.

        The "comment" key is only present when a non-empty comment is set.
        """
        serialized = dict(
            spdxElementId=self.spdx_element_id,
            relationshipType=str(self.relationship_type),
            relatedSpdxElement=str(self.related_spdx_element_id),
        )
        if not self.comment:
            return serialized
        serialized["comment"] = self.comment
        return serialized
class Document:
    """Represents an SPDX document: creation information, packages and relationships."""

    def __init__(
        self,
        creation_info: CreationInfo,
        packages: Optional[List[Package]] = None,
        relationships: Optional[List[Relationship]] = None,
    ) -> None:
        """
        Store the creation information and the package / relationship lists.

        Falsy "packages" and "relationships" arguments are replaced by a fresh empty list.
        """
        self.creation_info = creation_info
        self.packages = packages if packages else []
        self.relationships = relationships if relationships else []

    def to_dict(self):
        """
        Return the document as a dictionary using the SPDX JSON key names.

        The result starts from the creation information fields; the "packages" and
        "relationships" keys are only added when there is at least one entry.
        """
        serialized = self.creation_info.to_dict()

        package_dicts = [package.to_dict() for package in self.packages]
        if package_dicts:
            serialized["packages"] = package_dicts

        relationship_dicts = [relationship.to_dict() for relationship in self.relationships]
        if relationship_dicts:
            serialized["relationships"] = relationship_dicts

        return serialized