diff --git a/osbuild/objectstore.py b/osbuild/objectstore.py index abdb8e99..f4deca3d 100644 --- a/osbuild/objectstore.py +++ b/osbuild/objectstore.py @@ -1,15 +1,13 @@ import contextlib -import errno -import hashlib import os import subprocess import tempfile +import uuid from typing import Optional from osbuild.util.types import PathLike -from osbuild.util import ctx, jsoncomm, rmrf +from osbuild.util import jsoncomm, rmrf from . import api -from . import treesum __all__ = [ @@ -86,15 +84,6 @@ class Object: self._base = base_id self.id = base_id - @property - def treesum(self) -> str: - """Calculate the treesum of the object""" - with self._open() as fd: - m = hashlib.sha256() - treesum.treesum(m, fd) - treesum_hash = m.hexdigest() - return treesum_hash - @property def _path(self) -> str: if self._base and not self._init: @@ -146,20 +135,21 @@ class Object: umount(target) self._readers -= 1 - def store_tree(self, destination: str): - """Store the tree at destination and reset itself + def store_tree(self): + """Store the tree with a fresh name and reset itself - Moves the tree atomically by using rename(2). If the - target already exist, does nothing. Afterwards it - resets itself and can be used as if it was new. + Moves the tree atomically by using rename(2), to a + randomly generated unique name. Afterwards it resets + itself and can be used as if it was new. """ self._check_writable() self._check_readers() self._check_writer() self.init() - with ctx.suppress_oserror(errno.ENOTEMPTY, errno.EEXIST): - os.rename(self._tree, destination) + destination = str(uuid.uuid4()) + os.rename(self._tree, os.path.join(self.store.objects, destination)) self.reset() + return destination def reset(self): self.cleanup() @@ -349,41 +339,34 @@ class ObjectStore(contextlib.AbstractContextManager): """Commits a Object to the object store Move the contents of the obj (Object) to object directory - of the store with the content hash (obj.treesum) as its name. - Creates a symlink to that ('objects/{hash}') in the references + of the store with a universally unique name. Creates a + symlink to that ('objects/{hash}') in the references directory with the object_id as the name ('refs/{object_id}). If the link already exists, it will be atomically replaced. - Returns: The treesum of the object + Returns: The name of the object """ - treesum_hash = obj.treesum - # the object is stored in the objects directory using its content - # hash as its name, ideally a given object_id (i.e., given config) - # will always produce the same content hash, but that is not - # guaranteed. If an object with the same treesum already exist, us - # the existing one instead - obj.store_tree(os.path.join(self.objects, treesum_hash)) + # the object is stored in the objects directory using its unique + # name. This means that eatch commit will always result in a new + # object in the store, even if an identical one exists. + object_name = obj.store_tree() # symlink the object_id (config hash) in the refs directory to the - # treesum (content hash) in the objects directory. If a symlink by - # that name already exists, atomically replace it, but leave the - # backing object in place (it may be in use). + # object name in the objects directory. If a symlink by that name + # already exists, atomically replace it, but leave the backing object + # in place (it may be in use). with self.tempdir() as tmp: link = f"{tmp}/link" - os.symlink(f"../objects/{treesum_hash}", link) + os.symlink(f"../objects/{object_name}", link) os.replace(link, self.resolve_ref(object_id)) - # the reference that is pointing to `treesum_hash` is now the base + # the reference that is pointing to `object_name` is now the base # of `obj`. It is not actively initialized but any subsequent calls # to `obj.write()` will initialize it again - # NB: in the case that an object with the same treesum as `obj` - # already existed in the store obj.store_tree() will not actually - # have written anything to the store. In this case `obj` will then - # be initialized with the content of the already existing object. obj.base = object_id - return treesum_hash + return object_name def cleanup(self): """Cleanup all created Objects that are still alive""" diff --git a/osbuild/treesum.py b/osbuild/treesum.py deleted file mode 100644 index b17c5817..00000000 --- a/osbuild/treesum.py +++ /dev/null @@ -1,63 +0,0 @@ -import errno -import json -import os -import stat - - -#pylint: disable=too-many-branches -def treesum(m, dir_fd): - """Compute a content hash of a filesystem tree - - Parameters - ---------- - m : hash object - the hash object to append the treesum to - dir_fd : int - directory file descriptor number to operate on - - The hash is stable between runs, and guarantees that two filesystem - trees with the same hash, are functionally equivalent from the OS - point of view. - - The file, symlink and directory names and contents are recursively - hashed, together with security-relevant metadata.""" - - with os.scandir(f"/proc/self/fd/{dir_fd}") as it: - for dirent in sorted(it, key=(lambda d: d.name)): - stat_result = dirent.stat(follow_symlinks=False) - metadata = {} - metadata["name"] = os.fsdecode(dirent.name) - metadata["mode"] = stat_result.st_mode - metadata["uid"] = stat_result.st_uid - metadata["gid"] = stat_result.st_gid - # include the size of symlink target/file-contents so we don't have to delimit it - metadata["size"] = stat_result.st_size - # getxattr cannot operate on a dir_fd, so do a trick and rely on the entries in /proc - stable_file_path = os.path.join(f"/proc/self/fd/{dir_fd}", dirent.name) - try: - selinux_label = os.getxattr(stable_file_path, b"security.selinux", follow_symlinks=False) - except OSError as e: - # SELinux support is optional - if e.errno != errno.ENODATA: - raise - else: - metadata["selinux"] = os.fsdecode(selinux_label) - # hash the JSON representation of the metadata to stay unique/stable/well-defined - m.update(json.dumps(metadata, sort_keys=True).encode()) - if dirent.is_symlink(): - m.update(os.fsdecode(os.readlink(dirent.name, dir_fd=dir_fd)).encode()) - else: - fd = os.open(dirent.name, flags=os.O_RDONLY, dir_fd=dir_fd) - try: - if dirent.is_dir(follow_symlinks=False): - treesum(m, fd) - elif dirent.is_file(follow_symlinks=False): - # hash a page at a time (using f with fd as default is a hack to please pylint) - for byte_block in iter(lambda f=fd: os.read(f, 4096), b""): - m.update(byte_block) - elif stat.S_ISCHR(stat_result.st_mode) or stat.S_ISBLK(stat_result.st_mode): - m.update(json.dumps({"dev": stat_result.st_rdev}).encode()) - else: - raise ValueError("Found unexpected filetype on OS image.") - finally: - os.close(fd) diff --git a/test/mod/test_objectstore.py b/test/mod/test_objectstore.py index addce9c1..6e04c6d8 100644 --- a/test/mod/test_objectstore.py +++ b/test/mod/test_objectstore.py @@ -101,7 +101,7 @@ class TestObjectStore(unittest.TestCase): assert os.path.exists(f"{object_store.refs}/b/A") assert len(os.listdir(object_store.refs)) == 2 - assert len(os.listdir(object_store.objects)) == 1 + assert len(os.listdir(object_store.objects)) == 2 assert len(os.listdir(f"{object_store.refs}/a/")) == 1 assert len(os.listdir(f"{object_store.refs}/b/")) == 1 @@ -133,7 +133,7 @@ class TestObjectStore(unittest.TestCase): assert os.path.exists(f"{object_store.refs}/c/C") assert len(os.listdir(object_store.refs)) == 3 - assert len(os.listdir(object_store.objects)) == 2 + assert len(os.listdir(object_store.objects)) == 3 def test_object_copy_on_write(self): # operate with a clean object store @@ -152,7 +152,7 @@ class TestObjectStore(unittest.TestCase): st = os.fstat(f.fileno()) data_inode = st.st_ino # commit the object as "x" - x_hash = object_store.commit(tree, "x") + object_store.commit(tree, "x") # after the commit, "x" is now the base # of "tree" self.assertEqual(tree.base, "x") @@ -171,7 +171,6 @@ class TestObjectStore(unittest.TestCase): # the very same content with object_store.new(base_id="x") as tree: self.assertEqual(tree.base, "x") - self.assertEqual(tree.treesum, x_hash) with tree.read() as path: with open(os.path.join(path, "data"), "r") as f: # copy-on-write: since we have not written @@ -189,9 +188,6 @@ class TestObjectStore(unittest.TestCase): self.assertNotEqual(st.st_ino, data_inode) p = Path(path, "other_data") p.touch() - # now that we have written, the treesum - # should have changed - self.assertNotEqual(tree.treesum, x_hash) def test_object_mode(self): object_store = objectstore.ObjectStore(self.store) @@ -205,9 +201,7 @@ class TestObjectStore(unittest.TestCase): # check multiple readers are ok with tree.read() as _: - # calculating the treesum also is reading, - # so this is 3 nested readers - _ = tree.treesum + pass # writing should still fail with self.assertRaises(ValueError):