#!/usr/bin/python3
"""
Source for downloading files from URLs.

The files are indexed by their content hash. Can download files
that require secrets. The only secret provider currently supported
is `org.osbuild.rhsm` for downloading Red Hat content that requires
a subscription.

Internally uses curl to download the files; the files are cached in
an internal cache. Multiple parallel connections are used to speed
up the download.
"""


import concurrent.futures
import itertools
import math
import os
import subprocess
import sys
import tempfile
import time

from osbuild import sources
from osbuild.util.checksum import verify_file
from osbuild.util.rhsm import Subscriptions

SCHEMA = """
"additionalProperties": false,
"definitions": {
  "item": {
    "description": "The files to fetch indexed their content checksum",
    "type": "object",
    "additionalProperties": false,
    "patternProperties": {
      "(md5|sha1|sha256|sha384|sha512):[0-9a-f]{32,128}": {
        "oneOf": [
          {
            "type": "string",
            "description": "URL to download the file from."
          },
          {
            "type": "object",
            "additionalProperties": false,
            "required": [
              "url"
            ],
            "properties": {
              "url": {
                "type": "string",
                "description": "URL to download the file from."
              },
              "secrets": {
                "type": "object",
                "additionalProperties": false,
                "required": [
                  "name"
                ],
                "properties": {
                  "name": {
                    "type": "string",
                    "description": "Name of the secrets provider."
                  }
                }
              }
            }
          }
        ]
      }
    }
  }
},
"properties": {
  "items": {"$ref": "#/definitions/item"},
  "urls": {"$ref": "#/definitions/item"}
},
"oneOf": [{
  "required": ["items"]
}, {
  "required": ["urls"]
}]
"""

# Download policy for a single file.
MAX_ATTEMPTS = 20       # upper bound on curl invocations per file
TOTAL_TIMEOUT = 300     # seconds, shared by all attempts for one file
CONNECT_TIMEOUT = 60    # seconds, per curl invocation
MAX_WORKERS = 4         # number of parallel download processes

# Mapping of curl TLS options to the keys looked up in the secrets dict.
_SECRET_OPTIONS = (
    ("--cacert", "ssl_ca_cert"),
    ("--cert", "ssl_client_cert"),
    ("--key", "ssl_client_key"),
)


def _curl_command(url_path, output, secrets, max_time):
    """Build the curl argument vector for one download attempt.

    `output` is the file name curl writes to (relative to curl's working
    directory), `secrets` is an optional dict that may carry the
    `ssl_ca_cert`, `ssl_client_cert` and `ssl_client_key` entries, and
    `max_time` is the number of seconds curl may spend on this attempt.
    """
    command = [
        "curl",
        "--silent",
        "--max-time", f"{max_time}",
        "--connect-timeout", f"{CONNECT_TIMEOUT}",
        "--fail",
        "--location",
        "--output", output,
    ]
    if secrets:
        for option, key in _SECRET_OPTIONS:
            value = secrets.get(key)
            if value:
                command.extend([option, value])
    # url must follow options
    command.append(url_path)
    return command


def fetch(url, checksum, directory):
    """Download one file with curl, verify it, and move it into `directory`.

    `url` is a dict with a "url" entry and an optional "secrets" entry;
    `checksum` is both the expected checksum and the final file name.

    Raises RuntimeError if curl never succeeds within the retry/time
    budget, or if the downloaded file does not match `checksum`.
    """
    secrets = url.get("secrets")
    url_path = url.get("url")

    # Download to a temporary directory until we have verified the checksum. Use a
    # subdirectory, so we avoid copying across block devices.
    with tempfile.TemporaryDirectory(prefix="osbuild-unverified-file-", dir=directory) as tmpdir:
        # some mirrors are sometimes broken. retry manually, because we could be
        # redirected to a different, working, one on retry.
        start_time = time.monotonic()
        return_code = 0
        downloaded = False
        for _ in range(MAX_ATTEMPTS):
            remaining = TOTAL_TIMEOUT - (time.monotonic() - start_time)
            if remaining <= 0:
                # Out of time: stop retrying and report the last curl error.
                break
            command = _curl_command(url_path, checksum, secrets, int(math.ceil(remaining)))
            curl = subprocess.run(command, encoding="utf-8", cwd=tmpdir, check=False)
            return_code = curl.returncode
            if return_code == 0:
                downloaded = True
                break

        if not downloaded:
            # Report only the URL string; the full dict would also expose the
            # secrets (certificate and key paths) in the error message.
            raise RuntimeError(f"curl: error downloading {url_path}: error code {return_code}")

        if not verify_file(f"{tmpdir}/{checksum}", checksum):
            raise RuntimeError(f"checksum mismatch: {checksum} {url_path}")

        # The checksum has been verified, move the file into place. in case we race
        # another download of the same file, we simply ignore the error as their
        # contents are guaranteed to be the same.
        try:
            os.rename(f"{tmpdir}/{checksum}", f"{directory}/{checksum}")
        except FileExistsError:
            pass


def download(items, cache):
    """Fetch every entry of `items` that is not yet present in `cache`.

    `items` maps a checksum to either a URL string or a dict with a "url"
    entry and an optional "secrets" entry. Entries that name the
    `org.osbuild.rhsm` secrets provider get their secrets resolved through
    `Subscriptions` before downloading. `items` itself is left unmodified.

    Any exception raised by a worker is re-raised here.
    """
    requested_urls = []
    requested_checksums = []
    subscriptions = None

    for checksum, url in items.items():
        # Invariant: all files in @directory must be named after their (verified) checksum.
        # Check this before secrets so that if everything is pre-downloaded we don't need secrets
        if os.path.isfile(f"{cache}/{checksum}"):
            continue

        if not isinstance(url, dict):
            url = {"url": url}

        # check if url needs rhsm secrets
        if url.get("secrets", {}).get("name") == "org.osbuild.rhsm":
            # rhsm secrets only need to be retrieved once and can then be reused
            if subscriptions is None:
                subscriptions = Subscriptions.from_host_system()
            # Build a new dict rather than writing the resolved secrets back
            # into the caller's `items`.
            url = {**url, "secrets": subscriptions.get_secrets(url.get("url"))}

        requested_urls.append(url)
        requested_checksums.append(checksum)

    if not requested_urls:
        return

    with concurrent.futures.ProcessPoolExecutor(max_workers=MAX_WORKERS) as executor:
        results = executor.map(fetch, requested_urls, requested_checksums, itertools.repeat(cache))
        # Drain the iterator so that exceptions raised in workers propagate.
        for _ in results:
            pass


class CurlSource(sources.SourceService):
    """Source service that stores downloaded files under `org.osbuild.files`."""

    def download(self, items, cache, _options):
        """Download `items` into the `org.osbuild.files` subdirectory of `cache`."""
        cache = os.path.join(cache, "org.osbuild.files")
        os.makedirs(cache, exist_ok=True)

        download(items, cache)


def main():
    """Create the service from the command line arguments and run it."""
    service = CurlSource.from_args(sys.argv[1:])
    service.main()


if __name__ == '__main__':
    main()