debian-forge/sources/org.osbuild.curl
Ondřej Budai af0e849081 sources/curl: Use our own User-Agent
Currently, osbuild downloads are identified as coming from `curl`. This
is unfortunate because some RPM mirrors block requests from curl. Let's
"fix" that by introducing our own user-agent. While this can certainly
be seen as "circumventing" a policy, I think that this change is
actually helpful: Now, the mirror maintainers can actually distinguish
osbuild requests from regular curl calls. If they want to block osbuild,
they certainly can, we have no power there, but at least this allows
more fine-grained filtering. Also, our new user-agent contains our
domain name, so if there's a problem, they can contact us.
2024-04-30 03:10:44 +02:00

217 lines
7.7 KiB
Python
Executable file

#!/usr/bin/python3
"""
Source for downloading files from URLs.
The files are indexed by their content hash. It can download files
that require secrets. The secret providers currently supported are:
- `org.osbuild.rhsm` for downloading Red Hat content that requires
a subscription.
- `org.osbuild.mtls` for downloading content that requires client
certificates. The paths to the key and cert should be set in the
environment in OSBUILD_SOURCES_CURL_SSL_CLIENT_KEY,
OSBUILD_SOURCES_CURL_SSL_CLIENT_CERT, and optionally
OSBUILD_SOURCES_CURL_SSL_CA_CERT.
It uses curl to download the files; the files are cached in an
internal cache. Multiple parallel connections are used to speed
up the download.
"""
import concurrent.futures
import os
import platform
import re
import subprocess
import sys
import tempfile
import urllib.parse
from typing import Dict

from osbuild import sources
from osbuild.util.checksum import verify_file
from osbuild.util.rhsm import Subscriptions
# JSON schema fragment (kept as a string) describing the options this source
# accepts. Exactly one of `items` or `urls` must be given; both refer to the
# same `item` definition: an object whose keys match
# `<md5|sha1|sha256|sha384|sha512>:<hex digest>` and whose values are either a
# URL string or an object with a required `url` plus an optional `insecure`
# boolean and an optional `secrets` object naming a secrets provider.
# NOTE(review): the string has no enclosing braces, so it is presumably wrapped
# by whatever consumes SCHEMA -- confirm against the osbuild module loader.
SCHEMA = """
"additionalProperties": false,
"definitions": {
"item": {
"description": "The files to fetch indexed their content checksum",
"type": "object",
"additionalProperties": false,
"patternProperties": {
"(md5|sha1|sha256|sha384|sha512):[0-9a-f]{32,128}": {
"oneOf": [
{
"type": "string",
"description": "URL to download the file from."
},
{
"type": "object",
"additionalProperties": false,
"required": [
"url"
],
"properties": {
"url": {
"type": "string",
"description": "URL to download the file from."
},
"insecure": {
"type": "boolean",
"description": "Skip the verification step for secure connections and proceed without checking",
"default": false
},
"secrets": {
"type": "object",
"additionalProperties": false,
"required": [
"name"
],
"properties": {
"name": {
"type": "string",
"description": "Name of the secrets provider."
}
}
}
}
}
]
}
}
}
},
"properties": {
"items": {"$ref": "#/definitions/item"},
"urls": {"$ref": "#/definitions/item"}
},
"oneOf": [{
"required": ["items"]
}, {
"required": ["urls"]
}]
"""
class CurlSource(sources.SourceService):
    """Source service that downloads files with curl into the source cache.

    Every item is keyed by its checksum. A download is first written to a
    temporary directory inside the cache and is only moved into place once
    the file has been verified against that checksum.
    """

    content_type = "org.osbuild.files"

    # Upper bound for parallel downloads: two per CPU (two in total when the
    # CPU count cannot be determined).
    max_workers = 2 * (os.cpu_count() or 1)

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Populated lazily by amend_secrets() the first time an item requests
        # rhsm secrets, then reused for all later items.
        self.subscriptions = None

    def amend_secrets(self, checksum, desc_or_url):
        """Normalize an item to a descriptor dict and resolve its secrets.

        Args:
            checksum: The item's checksum key; returned unchanged.
            desc_or_url: Either a plain URL string or a descriptor dict
                with a ``url`` and optionally ``insecure`` / ``secrets``.

        Returns:
            A ``(checksum, desc)`` tuple where ``desc`` is always a dict. If
            the item named the ``org.osbuild.rhsm`` or ``org.osbuild.mtls``
            provider, ``desc["secrets"]`` holds the resolved secrets instead
            of the provider name.

        Raises:
            RuntimeError: If mtls secrets are requested but the client key or
                client certificate environment variable is not set.
        """
        if isinstance(desc_or_url, dict):
            # Shallow copy: the resolved secrets replace desc["secrets"], and
            # that must not be written back into the caller's item.
            desc = dict(desc_or_url)
        else:
            desc = {"url": desc_or_url}

        provider = desc.get("secrets", {}).get("name")
        if provider == "org.osbuild.rhsm":
            # rhsm secrets only need to be retrieved once and can then be reused
            if self.subscriptions is None:
                self.subscriptions = Subscriptions.from_host_system()
            desc["secrets"] = self.subscriptions.get_secrets(desc.get("url"))
        elif provider == "org.osbuild.mtls":
            key = os.getenv("OSBUILD_SOURCES_CURL_SSL_CLIENT_KEY")
            cert = os.getenv("OSBUILD_SOURCES_CURL_SSL_CLIENT_CERT")
            if not (key and cert):
                raise RuntimeError(f"mtls secrets required but key ({key}) or cert ({cert}) not defined")
            desc["secrets"] = {
                'ssl_ca_cert': os.getenv("OSBUILD_SOURCES_CURL_SSL_CA_CERT"),
                'ssl_client_cert': cert,
                'ssl_client_key': key,
            }

        return checksum, desc

    @staticmethod
    def _quote_url(url: str) -> str:
        """Return *url* with its path component percent-encoded.

        Characters that are not allowed in a URL path (e.g. spaces) are
        encoded. Existing ``%XX`` escapes are kept as they are, so a URL that
        is already encoded is not encoded a second time. A ``%`` that does
        not start a valid escape is encoded as ``%25``.
        """
        purl = urllib.parse.urlparse(url)
        # Treat "%" as safe so existing escapes survive quoting ...
        path = urllib.parse.quote(purl.path, safe="/%")
        # ... then encode only the "%" characters that are not part of one.
        path = re.sub(r"%(?![0-9A-Fa-f]{2})", "%25", path)
        return purl._replace(path=path).geturl()

    def fetch_all(self, items: Dict) -> None:
        """Download every item that is not already in the cache.

        Args:
            items: Mapping of checksum to URL string or descriptor dict.

        Any exception raised while fetching an item propagates to the caller.
        """
        # Items already in the cache are skipped; the rest get their secrets
        # resolved up front, before any download starts.
        pending = [
            self.amend_secrets(checksum, desc)
            for checksum, desc in items.items()
            if not self.exists(checksum, desc)
        ]
        if not pending:
            return

        checksums, descs = zip(*pending)
        with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            # Drain the result iterator so exceptions from workers are raised here.
            for _ in executor.map(self.fetch_one, checksums, descs):
                pass

    @staticmethod
    def _curl_command(checksum, url, secrets, insecure, proxy):
        """Build the curl argument list that downloads *url* to a file named *checksum*."""
        arch = platform.machine()
        command = [
            "curl",
            "--silent",
            "--speed-limit", "1000",
            "--connect-timeout", "30",
            "--fail",
            "--location",
            "--header", f"User-Agent: osbuild (Linux.{arch}; https://osbuild.org/)",
            "--output", checksum,
        ]
        if proxy:
            command.extend(["--proxy", proxy])
        if secrets:
            tls_options = (
                ("--cacert", "ssl_ca_cert"),
                ("--cert", "ssl_client_cert"),
                ("--key", "ssl_client_key"),
            )
            for option, name in tls_options:
                value = secrets.get(name)
                if value:
                    command.extend([option, value])
        if insecure:
            command.append("--insecure")
        # url must follow options
        command.append(url)
        return command

    def fetch_one(self, checksum, desc):
        """Download one item, verify it and move it into the cache.

        Args:
            checksum: Checksum key of the item; also used as the file name.
            desc: Descriptor dict as returned by amend_secrets().

        Raises:
            RuntimeError: If curl still fails after 10 attempts, or if the
                downloaded file does not match *checksum*.
        """
        url = self._quote_url(desc.get("url"))
        # The command is the same for every attempt, so build it only once.
        curl_command = self._curl_command(
            checksum,
            url,
            desc.get("secrets"),
            desc.get("insecure"),
            os.getenv("OSBUILD_SOURCES_CURL_PROXY"),
        )

        # Download to a temporary sub cache until we have verified the checksum. Use a
        # subdirectory, so we avoid copying across block devices.
        with tempfile.TemporaryDirectory(prefix="osbuild-unverified-file-", dir=self.cache) as tmpdir:
            # some mirrors are sometimes broken. retry manually, because we could be
            # redirected to a different, working, one on retry.
            return_code = 0
            for _ in range(10):
                curl = subprocess.run(curl_command, encoding="utf-8", cwd=tmpdir, check=False)
                return_code = curl.returncode
                if return_code == 0:
                    break
            else:
                raise RuntimeError(f"curl: error downloading {url}: error code {return_code}")

            if not verify_file(f"{tmpdir}/{checksum}", checksum):
                raise RuntimeError(f"checksum mismatch: {checksum} {url}")

            # The checksum has been verified, move the file into place. in case we race
            # another download of the same file, we simply ignore the error as their
            # contents are guaranteed to be the same.
            try:
                os.rename(f"{tmpdir}/{checksum}", f"{self.cache}/{checksum}")
            except FileExistsError:
                pass

            # Workaround the lack of structured progress reporting from
            # stages/sources. It generates messages of the form
            # "message": "source/org.osbuild.curl (org.osbuild.curl): Downloaded <url>\n
            #
            # Without it just a long pause with no progress while curl
            # downloads.
            print(f"Downloaded {url}")
def main():
    """Build the curl source service from the command-line arguments and run it."""
    CurlSource.from_args(sys.argv[1:]).main()


# Only start the service when executed as a script, not when imported.
if __name__ == '__main__':
    main()