""" A base implementation of SPDX 2.3 model, as described on: https://spdx.github.io/spdx-spec/v2.3/ """ import hashlib import re from datetime import datetime, timezone from enum import Enum, auto from typing import Dict, List, Optional, Union class CreatorType(Enum): """Enumeration of SPDX actor types.""" PERSON = auto() ORGANIZATION = auto() TOOL = auto() def __str__(self) -> str: return self.name.capitalize() class Creator(): """Represents a Creator in SPDX.""" def __init__(self, creator_type: CreatorType, name: str, email: Optional[str] = None) -> None: self.creator_type = creator_type self.name = name self.email = email def __str__(self): email_str = f" ({self.email})" if self.email else "" return f"{self.creator_type}: {self.name}{email_str}" class EntityWithSpdxId(): """ Represents an SPDX entity with an SPDX ID. https://spdx.github.io/spdx-spec/v2.3/package-information/#72-package-spdx-identifier-field """ def __init__(self, spdx_id: str) -> None: id_regex = re.compile(r"^SPDXRef-[a-zA-Z0-9\.\-]+$") if not id_regex.match(spdx_id): raise ValueError(f"Invalid SPDX ID '{spdx_id}'") self.spdx_id = spdx_id def datetime_to_iso8601(dt: datetime) -> str: """ Converts a datetime object to an SPDX-compliant ISO8601 string. This means that: - The timezone is UTC - The microsecond part is removed https://spdx.github.io/spdx-spec/v2.3/document-creation-information/#69-created-field """ date = dt.astimezone(timezone.utc) date = date.replace(tzinfo=None) # Microseconds are not supported by SPDX date = date.replace(microsecond=0) return date.isoformat() + "Z" class CreationInfo(EntityWithSpdxId): """ Represents SPDX creation information. https://spdx.github.io/spdx-spec/v2.3/document-creation-information/ """ def __init__( self, spdx_version: str, spdx_id: str, name: str, document_namespace: str, creators: List[Creator], created: datetime, data_license: str = "CC0-1.0", ) -> None: super().__init__(spdx_id) if not spdx_version.startswith("SPDX-"): raise ValueError(f"Invalid SPDX version '{spdx_version}'") if spdx_id != "SPDXRef-DOCUMENT": raise ValueError(f"Invalid SPDX ID '{spdx_id}'") self.spdx_version = spdx_version self.name = name self.data_license = data_license self.document_namespace = document_namespace self.creators = creators self.created = created def to_dict(self): return { "SPDXID": self.spdx_id, "creationInfo": { "created": datetime_to_iso8601(self.created), "creators": [str(creator) for creator in self.creators], }, "dataLicense": self.data_license, "name": self.name, "spdxVersion": self.spdx_version, "documentNamespace": self.document_namespace, } class NoAssertionValue(): """Represents the SPDX No Assertion value.""" VALUE = "NOASSERTION" def __str__(self): return self.VALUE class NoneValue(): """Represents the SPDX None value.""" VALUE = "NONE" def __str__(self): return self.VALUE class ExternalPackageRefCategory(Enum): """Enumeration of external package reference categories.""" SECURITY = auto() PACKAGE_MANAGER = auto() PERSISTENT_ID = auto() OTHER = auto() def __str__(self) -> str: return self.name.replace("_", "-") CATEGORY_TO_REPOSITORY_TYPE: Dict[ExternalPackageRefCategory, List[str]] = { ExternalPackageRefCategory.SECURITY: ["cpe22Type", "cpe23Type", "advisory", "fix", "url", "swid"], ExternalPackageRefCategory.PACKAGE_MANAGER: ["maven-central", "nuget", "bower", "purl"], ExternalPackageRefCategory.PERSISTENT_ID: ["swh", "gitoid"], ExternalPackageRefCategory.OTHER: [], } class ExternalPackageRef(): """ Represents an external package reference. https://spdx.github.io/spdx-spec/v2.3/package-information/#721-external-reference-field """ def __init__(self, category: ExternalPackageRefCategory, reference_type: str, locator: str) -> None: if len(CATEGORY_TO_REPOSITORY_TYPE[category] ) > 0 and reference_type not in CATEGORY_TO_REPOSITORY_TYPE[category]: raise ValueError(f"Invalid repository type '{reference_type}' for category '{category}'") self.category = category self.reference_type = reference_type self.locator = locator def to_dict(self): return { "referenceCategory": str(self.category), "referenceType": self.reference_type, "referenceLocator": self.locator, } class ChecksumAlgorithm(Enum): """Enumeration of SPDX checksum algorithms.""" SHA1 = auto() SHA224 = auto() SHA256 = auto() SHA384 = auto() SHA512 = auto() SHA3_256 = auto() SHA3_384 = auto() SHA3_512 = auto() BLAKE2b_256 = auto() BLAKE2b_384 = auto() BLAKE2b_512 = auto() BLAKE3 = auto() MD2 = auto() MD4 = auto() MD5 = auto() MD6 = auto() ADLER32 = auto() def __str__(self) -> str: return self.name.replace("_", "-") class Checksum(): """ Represents a checksum. https://spdx.github.io/spdx-spec/v2.3/package-information/#72-checksum-fields """ def __init__(self, algorithm: ChecksumAlgorithm, value: str) -> None: self.algorithm = algorithm self.value = value def to_dict(self): return { "algorithm": str(self.algorithm), "checksumValue": self.value, } def normalize_name_for_license_id(name: str) -> str: """ Normalize a license name to be used within an SPDX license ID. The function does the following things: - Ensures that the returned string contains only letters, numbers, "." and/or "-". All other characters are replaced with "-". - Deduplicates consecutive "." and "-" characters. See also: https://spdx.github.io/spdx-spec/v2.3/other-licensing-information-detected/#1011-description: """ normalized_name = re.sub(r"[^a-zA-Z0-9.-]", "-", name) normalized_name = re.sub(r"([.-])\1+", r"\1", normalized_name) return normalized_name def generate_license_id(extracted_text: str, name: Optional[str] = None) -> str: """ Generate a unique SPDX license ID by hashing the extracted text using SHA-256. If a license name is provided, include it in the license ID. """ extracted_text_hash = hashlib.sha256(extracted_text.encode()).hexdigest() if name is not None: return f"LicenseRef-{normalize_name_for_license_id(name)}-{extracted_text_hash}" return f"LicenseRef-{extracted_text_hash}" class ExtractedLicensingInfo(): """ Represents extracted licensing information for a license not on the SPDX License List. https://spdx.github.io/spdx-spec/v2.3/other-licensing-information-detected/ """ def __init__(self, extracted_text: str, name: Optional[str] = None) -> None: self.extracted_text = extracted_text self.name = name self.license_ref_id = generate_license_id(self.extracted_text, self.name) def __str__(self): return self.license_ref_id def to_dict(self): d = { "licenseId": self.license_ref_id, "extractedText": self.extracted_text, } if self.name: d["name"] = self.name return d # pylint: disable=too-many-instance-attributes class Package(EntityWithSpdxId): """Represents an SPDX package.""" def __init__( self, spdx_id: str, name: str, download_location: Union[str, NoAssertionValue, NoneValue], version: Optional[str] = None, files_analyzed: Optional[bool] = None, checksums: Optional[List[Checksum]] = None, homepage: Optional[Union[str, NoAssertionValue, NoneValue]] = None, source_info: Optional[str] = None, license_declared: Optional[Union[str, ExtractedLicensingInfo, NoAssertionValue, NoneValue]] = None, summary: Optional[str] = None, description: Optional[str] = None, external_references: Optional[List[ExternalPackageRef]] = None, built_date: Optional[datetime] = None, ) -> None: super().__init__(spdx_id) self.name = name self.download_location = download_location self.version = version self.files_analyzed = files_analyzed self.checksums = checksums or [] self.homepage = homepage self.source_info = source_info self.license_declared = license_declared self.summary = summary self.description = description self.external_references = external_references or [] self.built_date = built_date def to_dict(self): d = { "SPDXID": self.spdx_id, "name": self.name, "downloadLocation": str(self.download_location) } if self.files_analyzed is not None: d["filesAnalyzed"] = self.files_analyzed if self.version: d["versionInfo"] = self.version if self.checksums: d["checksums"] = [checksum.to_dict() for checksum in self.checksums] if self.homepage: d["homepage"] = str(self.homepage) if self.source_info: d["sourceInfo"] = self.source_info if self.license_declared: d["licenseDeclared"] = str(self.license_declared) if self.summary: d["summary"] = self.summary if self.description: d["description"] = self.description if self.external_references: d["externalRefs"] = [ref.to_dict() for ref in self.external_references] if self.built_date: d["builtDate"] = datetime_to_iso8601(self.built_date) return d class RelationshipType(Enum): """Enumeration of SPDX relationship types.""" DESCRIBES = auto() DEPENDS_ON = auto() OPTIONAL_DEPENDENCY_OF = auto() def __str__(self) -> str: return self.name class Relationship(): """Represents a relationship between SPDX elements.""" def __init__( self, spdx_element_id: str, relationship_type: RelationshipType, related_spdx_element_id: Union[str, NoneValue, NoAssertionValue], comment: Optional[str] = None, ) -> None: self.spdx_element_id = spdx_element_id self.relationship_type = relationship_type self.related_spdx_element_id = related_spdx_element_id self.comment = comment def to_dict(self): d = { "spdxElementId": self.spdx_element_id, "relationshipType": str(self.relationship_type), "relatedSpdxElement": str(self.related_spdx_element_id), } if self.comment: d["comment"] = self.comment return d class Document(): """Represents an SPDX document.""" def __init__( self, creation_info: CreationInfo, packages: Optional[List[Package]] = None, relationships: Optional[List[Relationship]] = None, extracted_licensing_infos: Optional[List[ExtractedLicensingInfo]] = None, ) -> None: self.creation_info = creation_info self.packages = packages or [] self.relationships = relationships or [] self.extracted_licensing_infos = extracted_licensing_infos or [] def to_dict(self): d = self.creation_info.to_dict() for package in self.packages: d.setdefault("packages", []).append(package.to_dict()) for extracted_licensing_info in self.extracted_licensing_infos: d.setdefault("hasExtractedLicensingInfos", []).append(extracted_licensing_info.to_dict()) for relationship in self.relationships: d.setdefault("relationships", []).append(relationship.to_dict()) return d