From 7c7fcecd471aa586edfd3161e02df14c3f298667 Mon Sep 17 00:00:00 2001
From: Tom Gundersen
Date: Sat, 20 Jul 2019 13:38:04 +0200
Subject: [PATCH] ObjectStore: add an object store class

This also changes the structure of the object store, though the basic
idea is the same.

The object store contains a directory of objects, which are
content-addressable filesystem trees. Currently we only ever use their
content hash internally, but the idea for this is basically Lars
Karlitski and Kay Sievers' `treesum()`. We may expose this in the
future.

Moreover, it contains a directory of refs, which are symlinks named
after the stage id they correspond to (as before), pointing to an
object generated from that stage id.

The ObjectStore exposes three methods:

`has_tree()`: Checks whether the content store contains the given tree.
If so, we can rely on the tree remaining there.

`get_tree()`: Meant to be used in a `with` block; yields the path to a
read-only instance of the tree with the given id. If the tree_id passed
in is None, an empty directory is yielded instead.

`new_tree()`: Meant to be used in a `with` block; yields the path to a
directory in which the tree with the given id should be created. If a
base_id is passed in, the tree is initialized from the tree with that
base id. Only when the block is exited successfully is the tree written
to the content store, referenced by the id in question.

Use this in Pipeline.run() to avoid regenerating trees unnecessarily.
In order to trigger a regeneration, the content store currently has to
be flushed manually.

Update the Travis test to run the noop pipeline twice, verifying that
the stage is only run the first time.

Signed-off-by: Tom Gundersen
---
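To make the intended use of the three methods concrete, here is a rough sketch (not part of the patch itself; the store path and tree id are placeholders, and `get_tree()` has to run as root because it bind-mounts the stored tree):

```
import os

from osbuild import ObjectStore

store = ObjectStore("/var/tmp/osbuild-store")  # placeholder store location
tree_id = "example-stage-id"                   # placeholder; normally the last stage's id

if not store.has_tree(tree_id):
    # new_tree() yields a scratch directory; the result is committed to the
    # object store, and the ref created, only if the block exits cleanly.
    with store.new_tree(tree_id) as tree:
        with open(os.path.join(tree, "hello.txt"), "w") as f:
            f.write("example content\n")

# get_tree() yields a read-only bind mount of the committed tree.
with store.get_tree(tree_id) as tree:
    print(os.listdir(tree))
```

On disk this ends up as `refs/example-stage-id` pointing at `../objects/<treesum>`, so several refs can share a single backing object.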
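The object names come from the new `treesum()` helper. A minimal sketch of computing the hash of an arbitrary directory (the path is a placeholder), mirroring how `new_tree()` derives the object name:

```
import hashlib
import os

from osbuild import treesum

fd = os.open("/path/to/some/tree", os.O_DIRECTORY)  # placeholder path
try:
    m = hashlib.sha256()
    treesum(m, fd)  # recursively hashes names, contents and security-relevant metadata
finally:
    os.close(fd)

print(m.hexdigest())  # stable across runs for functionally equivalent trees
```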
 .travis.yml         |   4 +-
 README.md           |   4 +-
 osbuild/__init__.py | 184 +++++++++++++++++++++++++++++++++++++-------
 osbuild/__main__.py |   6 +-
 test/run-tests.py   |   2 +-
 5 files changed, 165 insertions(+), 35 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 2d742a30..6fbbf67f 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -16,7 +16,9 @@ jobs:
     script: make rpm-nodeps
   - name: pipeline-noop
     before_install: sudo apt-get install -y systemd-container
-    script: sudo env "PATH=$PATH" python3 -m osbuild --libdir . --output . samples/noop.json
+    script:
+    - sudo env "PATH=$PATH" python3 -m osbuild --libdir . --output . samples/noop.json
+    - sudo env "PATH=$PATH" python3 -m osbuild --libdir . --output . samples/noop.json
   - name: pipeline-yum
     before_install: sudo apt-get install -y systemd-container yum
     script: sudo env "PATH=$PATH" python3 -m osbuild --libdir . --output . samples/build-from-yum.json
diff --git a/README.md b/README.md
index 80b89781..f7fb0b06 100644
--- a/README.md
+++ b/README.md
@@ -67,7 +67,7 @@ The above pipeline has no base and produces a qcow2 image.
 
 ## Running
 ```
-usage: python3 -m osbuild [-h] [--objects DIRECTORY] [-l DIRECTORY] -o DIRECTORY
+usage: python3 -m osbuild [-h] [--store DIRECTORY] [-l DIRECTORY] -o DIRECTORY
                           PIPELINE
 
 Build operating system images
@@ -77,7 +77,7 @@ positional arguments:
 
 optional arguments:
   -h, --help            show this help message and exit
-  --objects DIRECTORY   the directory where intermediary os trees are stored
+  --store DIRECTORY     the directory where intermediary os trees are stored
   -l DIRECTORY, --libdir DIRECTORY
                         the directory containing stages, assemblers, and the
                         osbuild library
diff --git a/osbuild/__init__.py b/osbuild/__init__.py
index 96880163..84428938 100644
--- a/osbuild/__init__.py
+++ b/osbuild/__init__.py
@@ -1,5 +1,6 @@
 import contextlib
+import errno
 import hashlib
 import json
 import os
@@ -69,6 +70,134 @@ class TmpFs:
         self.root = None
 
 
+def treesum(m, dir_fd):
+    """Compute a content hash of a filesystem tree
+
+    Parameters
+    ----------
+    m : hash object
+        the hash object to append the treesum to
+    dir_fd : int
+        directory file descriptor number to operate on
+
+    The hash is stable between runs and guarantees that two filesystem
+    trees with the same hash are functionally equivalent from the OS
+    point of view.
+
+    The file, symlink and directory names and contents are recursively
+    hashed, together with security-relevant metadata."""
+
+    with os.scandir(dir_fd) as it:
+        for dirent in sorted(it, key=(lambda d: d.name)):
+            stat_result = dirent.stat(follow_symlinks=False)
+            metadata = {}
+            metadata["name"] = os.fsdecode(dirent.name)
+            metadata["mode"] = stat_result.st_mode
+            metadata["uid"] = stat_result.st_uid
+            metadata["gid"] = stat_result.st_gid
+            # include the size of symlink target/file-contents so we don't have to delimit it
+            metadata["size"] = stat_result.st_size
+            # getxattr cannot operate on a dir_fd, so do a trick and rely on the entries in /proc
+            stable_file_path = os.path.join(f"/proc/self/fd/{dir_fd}", dirent.name)
+            try:
+                selinux_label = os.getxattr(stable_file_path, b"security.selinux", follow_symlinks=False)
+            except OSError as e:
+                # SELinux support is optional
+                if e.errno != errno.ENODATA:
+                    raise
+            else:
+                metadata["selinux"] = os.fsdecode(selinux_label)
+            # hash the JSON representation of the metadata to stay unique/stable/well-defined
+            m.update(json.dumps(metadata, sort_keys=True).encode())
+            if dirent.is_symlink():
+                m.update(os.fsdecode(os.readlink(dirent.name, dir_fd=dir_fd)).encode())
+            else:
+                fd = os.open(dirent.name, flags=os.O_RDONLY, dir_fd=dir_fd)
+                try:
+                    if dirent.is_dir(follow_symlinks=False):
+                        treesum(m, fd)
+                    elif dirent.is_file(follow_symlinks=False):
+                        # hash a page at a time (using f with fd as default is a hack to please pylint)
+                        for byte_block in iter(lambda f=fd: os.read(f, 4096), b""):
+                            m.update(byte_block)
+                    else:
+                        raise ValueError("Found unexpected filetype on OS image")
+                finally:
+                    os.close(fd)
+
+
+class ObjectStore:
+    def __init__(self, store):
+        self.store = store
+        self.objects = f"{store}/objects"
+        self.refs = f"{store}/refs"
+        os.makedirs(self.store, exist_ok=True)
+        os.makedirs(self.objects, exist_ok=True)
+        os.makedirs(self.refs, exist_ok=True)
+
+    def has_tree(self, tree_id):
+        if not tree_id:
+            return False
+        return os.access(f"{self.refs}/{tree_id}", os.F_OK)
+
+    @contextlib.contextmanager
+    def get_tree(self, tree_id):
+        with tempfile.TemporaryDirectory(dir=self.store) as tmp:
+            if tree_id:
+                subprocess.run(["mount", "-o", "bind,ro,mode=0755", f"{self.refs}/{tree_id}", tmp], check=True)
+                yield tmp
+                subprocess.run(["umount", "--lazy", tmp], check=True)
+            else:
+                # None was given as tree_id, just return an empty directory
+                yield tmp
+
+    @contextlib.contextmanager
+    def new_tree(self, tree_id, base_id=None):
+        with tempfile.TemporaryDirectory(dir=self.store) as tmp:
+            # the tree that is yielded will be added to the content store
+            # on success as tree_id
+
+            tree = f"{tmp}/tree"
+            link = f"{tmp}/link"
+            os.mkdir(tree, mode=0o755)
+
+            if base_id:
+                # the base, the working tree and the output tree are all on
+                # the same fs, so attempt a lightweight copy if the fs
+                # supports it
+                subprocess.run(["cp", "--reflink=auto", "-a", f"{self.refs}/{base_id}/.", tree], check=True)
+
+            yield tree
+
+            # if the yield raises an exception, the working tree is cleaned
+            # up by tempfile; otherwise, we save it in the correct place:
+            fd = os.open(tree, os.O_DIRECTORY)
+            try:
+                m = hashlib.sha256()
+                treesum(m, fd)
+                treesum_hash = m.hexdigest()
+            finally:
+                os.close(fd)
+            # the tree is stored in the objects directory using its content
+            # hash as its name; ideally a given tree_id (i.e., a given config)
+            # will always produce the same content hash, but that is not
+            # guaranteed
+            output_tree = f"{self.objects}/{treesum_hash}"
+            try:
+                os.rename(tree, output_tree)
+            except OSError as e:
+                if e.errno == errno.EEXIST:
+                    pass  # a tree with the same content hash already exists, use that
+                else:
+                    raise
+            # symlink the tree_id (config hash) in the refs directory to the treesum
+            # (content hash) in the objects directory. If a symlink by that name
+            # already exists, atomically replace it, but leave the backing object
+            # in place (it may be in use).
+            os.symlink(f"../objects/{treesum_hash}", link)
+            os.replace(link, f"{self.refs}/{tree_id}")
+
+
 class BuildRoot:
     def __init__(self, root, path="/run/osbuild"):
         self.root = tempfile.mkdtemp(prefix="osbuild-buildroot-", dir=path)
@@ -241,32 +370,38 @@ class Pipeline:
     def set_assembler(self, name, options=None):
         self.assembler = Assembler(name, options or {})
 
-    def run(self, output_dir, objects=None, interactive=False, check=True, libdir=None):
+    def run(self, output_dir, store, interactive=False, check=True, libdir=None):
         os.makedirs("/run/osbuild", exist_ok=True)
-        if objects:
-            os.makedirs(objects, exist_ok=True)
-        elif self.base:
-            raise ValueError("'objects' argument must be given when pipeline has a 'base'")
-
+        if self.base and not store:
+            raise ValueError("'store' argument must be given when pipeline has a 'base'")
+        object_store = ObjectStore(store)
         results = {
             "stages": []
         }
-        with TmpFs() as tree:
-            if self.base:
-                subprocess.run(["cp", "-a", f"{objects}/{self.base}/.", tree], check=True)
+        if self.stages:
+            tree_id = self.stages[-1].id
+            if not object_store.has_tree(tree_id):
+                # The tree does not exist. Create it and save it to the object store. If
+                # two run() calls race each other, two trees may be generated, and it
+                # is nondeterministic which of them will end up referenced by the tree_id
+                # in the content store. However, we guarantee that all tree_ids and all
+                # generated trees remain valid.
+                with object_store.new_tree(tree_id, base_id=self.base) as tree:
+                    for stage in self.stages:
+                        r = stage.run(tree,
+                                      "/",
+                                      interactive=interactive,
+                                      check=check,
+                                      libdir=libdir)
+                        results["stages"].append(r)
+                        if r["returncode"] != 0:
+                            results["returncode"] = r["returncode"]
+                            return results
+        else:
+            tree_id = None
 
-            for stage in self.stages:
-                r = stage.run(tree,
-                              "/",
-                              interactive=interactive,
-                              check=check,
-                              libdir=libdir)
-                results["stages"].append(r)
-                if r["returncode"] != 0:
-                    results["returncode"] = r["returncode"]
-                    return results
-
-            if self.assembler:
+        if self.assembler:
+            with object_store.get_tree(tree_id) as tree:
                 r = self.assembler.run(tree,
                                        "/",
                                        output_dir=output_dir,
@@ -278,13 +413,6 @@
                     results["returncode"] = r["returncode"]
                     return results
 
-            last = self.stages[-1].id if self.stages else self.base
-            if objects and last:
-                output_tree = f"{objects}/{last}"
-                shutil.rmtree(output_tree, ignore_errors=True)
-                os.makedirs(output_tree, mode=0o755)
-                subprocess.run(["cp", "-a", f"{tree}/.", output_tree], check=True)
-
         results["returncode"] = 0
         return results
 
diff --git a/osbuild/__main__.py b/osbuild/__main__.py
index 9ac33a4d..64d6d660 100755
--- a/osbuild/__main__.py
+++ b/osbuild/__main__.py
@@ -14,8 +14,8 @@ def main():
     parser = argparse.ArgumentParser(description="Build operating system images")
     parser.add_argument("pipeline_path", metavar="PIPELINE",
                         help="json file containing the pipeline that should be built")
-    parser.add_argument("--objects", metavar="DIRECTORY", type=os.path.abspath,
-                        default=".osbuild/objects",
+    parser.add_argument("--store", metavar="DIRECTORY", type=os.path.abspath,
+                        default=".osbuild/store",
                         help="the directory where intermediary os trees are stored")
     parser.add_argument("-l", "--libdir", metavar="DIRECTORY", type=os.path.abspath,
                         help="the directory containing stages, assemblers, and the osbuild library")
@@ -28,7 +28,7 @@ def main():
         pipeline = osbuild.load(json.load(f))
 
     try:
-        pipeline.run(args.output_dir, args.objects, interactive=True, libdir=args.libdir)
+        pipeline.run(args.output_dir, args.store, interactive=True, libdir=args.libdir)
     except KeyboardInterrupt:
         print()
         print(f"{RESET}{BOLD}{RED}Aborted{RESET}")
diff --git a/test/run-tests.py b/test/run-tests.py
index e352aedd..8507f724 100644
--- a/test/run-tests.py
+++ b/test/run-tests.py
@@ -20,7 +20,7 @@ logging.basicConfig(level=logging.getLevelName(os.environ.get("TESTS_LOGLEVEL",
 
 
 def run_osbuild(pipeline: str, check=True):
-    cmd = OSBUILD + ["--objects", OBJECTS, "-o", OUTPUT_DIR, pipeline]
+    cmd = OSBUILD + ["--store", OBJECTS, "-o", OUTPUT_DIR, pipeline]
     logging.info(f"Running osbuild: {cmd}")
     osbuild = subprocess.run(cmd, capture_output=True)
     if osbuild.returncode != 0: