diff --git a/.travis.yml b/.travis.yml
index 2d742a30..6fbbf67f 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -16,7 +16,9 @@ jobs:
       script: make rpm-nodeps
     - name: pipeline-noop
      before_install: sudo apt-get install -y systemd-container
-      script: sudo env "PATH=$PATH" python3 -m osbuild --libdir . --output . samples/noop.json
+      script:
+        - sudo env "PATH=$PATH" python3 -m osbuild --libdir . --output . samples/noop.json
+        - sudo env "PATH=$PATH" python3 -m osbuild --libdir . --output . samples/noop.json
     - name: pipeline-yum
       before_install: sudo apt-get install -y systemd-container yum
       script: sudo env "PATH=$PATH" python3 -m osbuild --libdir . --output . samples/build-from-yum.json
diff --git a/README.md b/README.md
index 80b89781..f7fb0b06 100644
--- a/README.md
+++ b/README.md
@@ -67,7 +67,7 @@ The above pipeline has no base and produces a qcow2 image.
 ## Running
 
 ```
-usage: python3 -m osbuild [-h] [--objects DIRECTORY] [-l DIRECTORY] -o DIRECTORY
+usage: python3 -m osbuild [-h] [--store DIRECTORY] [-l DIRECTORY] -o DIRECTORY
                           PIPELINE
 
 Build operating system images
@@ -77,7 +77,7 @@ positional arguments:
 
 optional arguments:
   -h, --help            show this help message and exit
-  --objects DIRECTORY   the directory where intermediary os trees are stored
+  --store DIRECTORY     the directory where intermediary os trees are stored
   -l DIRECTORY, --libdir DIRECTORY
                         the directory containing stages, assemblers, and the
                         osbuild library
diff --git a/osbuild/__init__.py b/osbuild/__init__.py
index 96880163..84428938 100644
--- a/osbuild/__init__.py
+++ b/osbuild/__init__.py
@@ -1,5 +1,6 @@
 import contextlib
+import errno
 import hashlib
 import json
 import os
@@ -69,6 +70,134 @@ class TmpFs:
         self.root = None
 
 
+def treesum(m, dir_fd):
+    """Compute a content hash of a filesystem tree
+
+    Parameters
+    ----------
+    m : hash object
+        the hash object to append the treesum to
+    dir_fd : int
+        directory file descriptor number to operate on
+
+    The hash is stable between runs, and guarantees that two filesystem
+    trees with the same hash are functionally equivalent from the OS
+    point of view.
+
+    The file, symlink and directory names and contents are recursively
+    hashed, together with security-relevant metadata."""
+
+    with os.scandir(dir_fd) as it:
+        for dirent in sorted(it, key=(lambda d: d.name)):
+            stat_result = dirent.stat(follow_symlinks=False)
+            metadata = {}
+            metadata["name"] = os.fsdecode(dirent.name)
+            metadata["mode"] = stat_result.st_mode
+            metadata["uid"] = stat_result.st_uid
+            metadata["gid"] = stat_result.st_gid
+            # include the size of symlink target/file-contents so we don't have to delimit it
+            metadata["size"] = stat_result.st_size
+            # getxattr cannot operate on a dir_fd, so do a trick and rely on the entries in /proc
+            stable_file_path = os.path.join(f"/proc/self/fd/{dir_fd}", dirent.name)
+            try:
+                selinux_label = os.getxattr(stable_file_path, b"security.selinux", follow_symlinks=False)
+            except OSError as e:
+                # SELinux support is optional
+                if e.errno != errno.ENODATA:
+                    raise
+            else:
+                metadata["selinux"] = os.fsdecode(selinux_label)
+            # hash the JSON representation of the metadata to stay unique/stable/well-defined
+            m.update(json.dumps(metadata, sort_keys=True).encode())
+            if dirent.is_symlink():
+                m.update(os.fsdecode(os.readlink(dirent.name, dir_fd=dir_fd)).encode())
+            else:
+                fd = os.open(dirent.name, flags=os.O_RDONLY, dir_fd=dir_fd)
+                try:
+                    if dirent.is_dir(follow_symlinks=False):
+                        treesum(m, fd)
+                    elif dirent.is_file(follow_symlinks=False):
+                        # hash a page at a time (using f with fd as default is a hack to please pylint)
+                        for byte_block in iter(lambda f=fd: os.read(f, 4096), b""):
+                            m.update(byte_block)
+                    else:
+                        raise ValueError("Found unexpected filetype on OS image")
+                finally:
+                    os.close(fd)
+
+
+class ObjectStore:
+    def __init__(self, store):
+        self.store = store
+        self.objects = f"{store}/objects"
+        self.refs = f"{store}/refs"
+        os.makedirs(self.store, exist_ok=True)
+        os.makedirs(self.objects, exist_ok=True)
+        os.makedirs(self.refs, exist_ok=True)
+
+    def has_tree(self, tree_id):
+        if not tree_id:
+            return False
+        return os.access(f"{self.refs}/{tree_id}", os.F_OK)
+
+    @contextlib.contextmanager
+    def get_tree(self, tree_id):
+        with tempfile.TemporaryDirectory(dir=self.store) as tmp:
+            if tree_id:
+                subprocess.run(["mount", "-o", "bind,ro,mode=0755", f"{self.refs}/{tree_id}", tmp], check=True)
+                yield tmp
+                subprocess.run(["umount", "--lazy", tmp], check=True)
+            else:
+                # None was given as tree_id, just return an empty directory
+                yield tmp
+
+    @contextlib.contextmanager
+    def new_tree(self, tree_id, base_id=None):
+        with tempfile.TemporaryDirectory(dir=self.store) as tmp:
+            # the tree that is yielded will be added to the content store
+            # on success as tree_id
+
+            tree = f"{tmp}/tree"
+            link = f"{tmp}/link"
+            os.mkdir(tree, mode=0o755)
+
+            if base_id:
+                # the base, the working tree and the output tree are all on
+                # the same fs, so attempt a lightweight copy if the fs
+                # supports it
+                subprocess.run(["cp", "--reflink=auto", "-a", f"{self.refs}/{base_id}/.", tree], check=True)
+
+            yield tree
+
+            # if the yield raises an exception, the working tree is cleaned
+            # up by tempfile; otherwise, we save it in the correct place:
+            fd = os.open(tree, os.O_DIRECTORY)
+            try:
+                m = hashlib.sha256()
+                treesum(m, fd)
+                treesum_hash = m.hexdigest()
+            finally:
+                os.close(fd)
+            # the tree is stored in the objects directory using its content
+            # hash as its name; ideally a given tree_id (i.e., a given config)
+            # will always produce the same content hash, but that is not
+            # guaranteed
+            output_tree = f"{self.objects}/{treesum_hash}"
+            try:
+                os.rename(tree, output_tree)
+            except OSError as e:
+                if e.errno == errno.EEXIST:
+                    pass  # a tree with the same content hash already exists, use that
+                else:
+                    raise
+            # symlink the tree_id (config hash) in the refs directory to the treesum
+            # (content hash) in the objects directory. If a symlink by that name
+            # already exists, atomically replace it, but leave the backing object
+            # in place (it may be in use).
+            os.symlink(f"../objects/{treesum_hash}", link)
+            os.replace(link, f"{self.refs}/{tree_id}")
+
+
 class BuildRoot:
     def __init__(self, root, path="/run/osbuild"):
         self.root = tempfile.mkdtemp(prefix="osbuild-buildroot-", dir=path)
@@ -241,32 +370,38 @@ class Pipeline:
     def set_assembler(self, name, options=None):
         self.assembler = Assembler(name, options or {})
 
-    def run(self, output_dir, objects=None, interactive=False, check=True, libdir=None):
+    def run(self, output_dir, store, interactive=False, check=True, libdir=None):
         os.makedirs("/run/osbuild", exist_ok=True)
-        if objects:
-            os.makedirs(objects, exist_ok=True)
-        elif self.base:
-            raise ValueError("'objects' argument must be given when pipeline has a 'base'")
-
+        if self.base and not store:
+            raise ValueError("'store' argument must be given when pipeline has a 'base'")
+        object_store = ObjectStore(store)
         results = {
             "stages": []
         }
-        with TmpFs() as tree:
-            if self.base:
-                subprocess.run(["cp", "-a", f"{objects}/{self.base}/.", tree], check=True)
+        if self.stages:
+            tree_id = self.stages[-1].id
+            if not object_store.has_tree(tree_id):
+                # The tree does not exist. Create it and save it to the object store. If
+                # two run() calls race each other, two trees may be generated, and it
+                # is nondeterministic which of them will end up referenced by the tree_id
+                # in the content store. However, we guarantee that all tree_ids and all
+                # generated trees remain valid.
+                with object_store.new_tree(tree_id, base_id=self.base) as tree:
+                    for stage in self.stages:
+                        r = stage.run(tree,
+                                      "/",
+                                      interactive=interactive,
+                                      check=check,
+                                      libdir=libdir)
+                        results["stages"].append(r)
+                        if r["returncode"] != 0:
+                            results["returncode"] = r["returncode"]
+                            return results
+        else:
+            tree_id = None
 
-            for stage in self.stages:
-                r = stage.run(tree,
-                              "/",
-                              interactive=interactive,
-                              check=check,
-                              libdir=libdir)
-                results["stages"].append(r)
-                if r["returncode"] != 0:
-                    results["returncode"] = r["returncode"]
-                    return results
-
-            if self.assembler:
+        if self.assembler:
+            with object_store.get_tree(tree_id) as tree:
                 r = self.assembler.run(tree,
                                        "/",
                                        output_dir=output_dir,
@@ -278,13 +413,6 @@ class Pipeline:
                     results["returncode"] = r["returncode"]
                     return results
 
-            last = self.stages[-1].id if self.stages else self.base
-            if objects and last:
-                output_tree = f"{objects}/{last}"
-                shutil.rmtree(output_tree, ignore_errors=True)
-                os.makedirs(output_tree, mode=0o755)
-                subprocess.run(["cp", "-a", f"{tree}/.", output_tree], check=True)
-
         results["returncode"] = 0
         return results
 
diff --git a/osbuild/__main__.py b/osbuild/__main__.py
index 9ac33a4d..64d6d660 100755
--- a/osbuild/__main__.py
+++ b/osbuild/__main__.py
@@ -14,8 +14,8 @@ def main():
     parser = argparse.ArgumentParser(description="Build operating system images")
     parser.add_argument("pipeline_path", metavar="PIPELINE",
                         help="json file containing the pipeline that should be built")
-    parser.add_argument("--objects", metavar="DIRECTORY", type=os.path.abspath,
-                        default=".osbuild/objects",
+    parser.add_argument("--store", metavar="DIRECTORY", type=os.path.abspath,
+                        default=".osbuild/store",
                         help="the directory where intermediary os trees are stored")
     parser.add_argument("-l", "--libdir", metavar="DIRECTORY", type=os.path.abspath,
                         help="the directory containing stages, assemblers, and the osbuild library")
@@ -28,7 +28,7 @@ def main():
         pipeline = osbuild.load(json.load(f))
 
     try:
-        pipeline.run(args.output_dir, args.objects, interactive=True, libdir=args.libdir)
+        pipeline.run(args.output_dir, args.store, interactive=True, libdir=args.libdir)
     except KeyboardInterrupt:
         print()
         print(f"{RESET}{BOLD}{RED}Aborted{RESET}")
diff --git a/test/run-tests.py b/test/run-tests.py
index e352aedd..8507f724 100644
--- a/test/run-tests.py
+++ b/test/run-tests.py
@@ -20,7 +20,7 @@ logging.basicConfig(level=logging.getLevelName(os.environ.get("TESTS_LOGLEVEL",
 
 
 def run_osbuild(pipeline: str, check=True):
-    cmd = OSBUILD + ["--objects", OBJECTS, "-o", OUTPUT_DIR, pipeline]
+    cmd = OSBUILD + ["--store", OBJECTS, "-o", OUTPUT_DIR, pipeline]
     logging.info(f"Running osbuild: {cmd}")
     osbuild = subprocess.run(cmd, capture_output=True)
     if osbuild.returncode != 0:
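
A quick way to check the stability property promised in the `treesum` docstring is to hash the same scratch tree twice and compare the digests. This is only a sketch: it assumes the patch is applied, that it runs from the repository root on a Linux host, and the file names and contents are made up for illustration.

```python
import hashlib
import os
import tempfile

from osbuild import treesum


def tree_digest(path):
    # open the directory itself so treesum() can walk it via dir_fd
    m = hashlib.sha256()
    fd = os.open(path, os.O_DIRECTORY)
    try:
        treesum(m, fd)
    finally:
        os.close(fd)
    return m.hexdigest()


with tempfile.TemporaryDirectory() as tmp:
    # a tiny tree: one regular file and one symlink (names are illustrative)
    with open(os.path.join(tmp, "hello.txt"), "w") as f:
        f.write("hello\n")
    os.symlink("hello.txt", os.path.join(tmp, "greeting"))

    # hashing the same tree twice must yield the same digest
    assert tree_digest(tmp) == tree_digest(tmp)
```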
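
The caching that lets `.travis.yml` run `samples/noop.json` twice (the second run should be a cache hit) can also be exercised directly through `ObjectStore`, without root and without a full pipeline. Again a hedged sketch under the same assumptions; `"example-config-hash"` is a hypothetical `tree_id`, whereas `Pipeline.run` passes the id of its last stage.

```python
import os

from osbuild import ObjectStore

# .osbuild/store mirrors the new CLI default for --store
store = ObjectStore(".osbuild/store")
tree_id = "example-config-hash"  # hypothetical; real callers use a stage id

if not store.has_tree(tree_id):
    # new_tree() yields a scratch directory; on clean exit it is hashed with
    # treesum(), moved into objects/ under its content hash, and refs/<tree_id>
    # is atomically pointed at it
    with store.new_tree(tree_id) as tree:
        os.makedirs(os.path.join(tree, "etc"))
        with open(os.path.join(tree, "etc", "motd"), "w") as f:
            f.write("hello from the object store\n")

# a second run with the same tree_id is a cache hit
assert store.has_tree(tree_id)
```

Because `refs/<tree_id>` is swapped with `os.replace()`, a reader that already resolved an older symlink keeps using its backing object; only new lookups see the new tree.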