debian-forge/sources/org.osbuild.curl
Christian Kellner c902a7a754 sources: port to host services
Port sources to also use the host services infrastructure that is
used by inputs, devices and mounts. Sources are a bit different
from the other services in that they don't run for the duration of
the stage but are run before anything is built. By using the same
infrastructure we re-use the process management and inter-process
communication. Additionally, this will forward all messages from
sources to the existing monitoring framework.
Adapt all existing sources and tests.
2021-09-22 00:00:20 +02:00

#!/usr/bin/python3
"""
Source for downloading files from URLs.

The files are indexed by their content hash. Files that require
secrets can be downloaded as well; the only secrets provider
currently supported is `org.osbuild.rhsm`, for downloading Red Hat
content that requires a subscription.

Internally, curl is used to download the files; the files are cached
in an internal cache. Multiple parallel connections are used to
speed up the download.
"""
import concurrent.futures
import itertools
import math
import os
import subprocess
import sys
import tempfile
import time
from osbuild import sources
from osbuild.util.checksum import verify_file
from osbuild.util.rhsm import Subscriptions


SCHEMA = """
"additionalProperties": false,
"definitions": {
  "item": {
    "description": "The files to fetch, indexed by their content checksum",
    "type": "object",
    "additionalProperties": false,
    "patternProperties": {
      "(md5|sha1|sha256|sha384|sha512):[0-9a-f]{32,128}": {
        "oneOf": [
          {
            "type": "string",
            "description": "URL to download the file from."
          },
          {
            "type": "object",
            "additionalProperties": false,
            "required": [
              "url"
            ],
            "properties": {
              "url": {
                "type": "string",
                "description": "URL to download the file from."
              },
              "secrets": {
                "type": "object",
                "additionalProperties": false,
                "required": [
                  "name"
                ],
                "properties": {
                  "name": {
                    "type": "string",
                    "description": "Name of the secrets provider."
                  }
                }
              }
            }
          }
        ]
      }
    }
  }
},
"properties": {
  "items": {"$ref": "#/definitions/item"},
  "urls": {"$ref": "#/definitions/item"}
},
"oneOf": [{
  "required": ["items"]
}, {
  "required": ["urls"]
}]
"""


def fetch(url, checksum, directory):
    secrets = url.get("secrets")
    url_path = url.get("url")
    # Download to a temporary directory until we have verified the checksum. Use a
    # subdirectory, so we avoid copying across block devices.
    with tempfile.TemporaryDirectory(prefix="osbuild-unverified-file-", dir=directory) as tmpdir:
        # Some mirrors are sometimes broken. Retry manually, because we could be
        # redirected to a different, working one on retry.
        start_time = time.monotonic()
        return_code = 0
        for _ in range(20):
            elapsed_time = time.monotonic() - start_time
            if elapsed_time >= 300:
                continue
            curl_command = [
                "curl",
                "--silent",
                "--max-time", f"{int(math.ceil(300 - elapsed_time))}",
                "--connect-timeout", "60",
                "--fail",
                "--location",
                "--output", checksum,
            ]
            if secrets:
                if secrets.get('ssl_ca_cert'):
                    curl_command.extend(["--cacert", secrets.get('ssl_ca_cert')])
                if secrets.get('ssl_client_cert'):
                    curl_command.extend(["--cert", secrets.get('ssl_client_cert')])
                if secrets.get('ssl_client_key'):
                    curl_command.extend(["--key", secrets.get('ssl_client_key')])
            # url must follow options
            curl_command.append(url_path)

            curl = subprocess.run(curl_command, encoding="utf-8", cwd=tmpdir, check=False)
            return_code = curl.returncode
            if return_code == 0:
                break
        else:
            raise RuntimeError(f"curl: error downloading {url}: error code {return_code}")

        if not verify_file(f"{tmpdir}/{checksum}", checksum):
            raise RuntimeError(f"checksum mismatch: {checksum} {url}")

        # The checksum has been verified, move the file into place. If we race
        # another download of the same file, we simply ignore the error, as the
        # contents are guaranteed to be the same.
        try:
            os.rename(f"{tmpdir}/{checksum}", f"{directory}/{checksum}")
        except FileExistsError:
            pass
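
# A minimal usage sketch of fetch() (checksum, URL, and directory are
# hypothetical): the file is downloaded into `directory`, verified, and
# named after its checksum. Note that the url argument must be a dict;
# download() below normalizes plain string URLs into this form.
#
#   fetch({"url": "https://example.com/some-file"},
#         "sha256:<expected content checksum>",
#         "/var/cache/osbuild/sources/org.osbuild.files")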


def download(items, cache):
    with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
        requested_urls = []
        requested_checksums = []
        subscriptions = None
        for (checksum, url) in items.items():
            # Invariant: all files in @directory must be named after their (verified) checksum.
            # Check this before secrets so that if everything is pre-downloaded we don't need secrets
            if os.path.isfile(f"{cache}/{checksum}"):
                continue

            if not isinstance(url, dict):
                url = {"url": url}

            # check if url needs rhsm secrets
            if url.get("secrets", {}).get("name") == "org.osbuild.rhsm":
                # rhsm secrets only need to be retrieved once and can then be reused
                if subscriptions is None:
                    subscriptions = Subscriptions.from_host_system()
                url["secrets"] = subscriptions.get_secrets(url.get("url"))

            requested_urls.append(url)
            requested_checksums.append(checksum)

        results = executor.map(fetch, requested_urls, requested_checksums, itertools.repeat(cache))

        # Iterate over the results to re-raise any exception that occurred in a worker.
        for _ in results:
            pass
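
# A sketch of calling download() directly (values are hypothetical): items
# already present in the cache under their checksum are skipped, everything
# else is fetched with up to four parallel workers, and any
# "org.osbuild.rhsm" secrets are resolved from the host subscriptions first.
#
#   items = {
#       "sha256:<checksum A>": "https://example.com/first-file",
#       "sha256:<checksum B>": "https://example.com/second-file",
#   }
#   download(items, "/var/cache/osbuild/sources/org.osbuild.files")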


class CurlSource(sources.SourceService):
    def download(self, items, cache, _options):
        cache = os.path.join(cache, "org.osbuild.files")
        os.makedirs(cache, exist_ok=True)
        download(items, cache)


def main():
    service = CurlSource.from_args(sys.argv[1:])
    service.main()


if __name__ == '__main__':
    main()