objectstore: don't store objects by their treesum

The treesum of a filesystem tree is the content hash of all its
files, its directory structure and file metadata.

By storing trees by their treesum we avoid storing duplicates of
identical trees, at the cost of computing the hashes for every
commit to the store.

This has limited benefit as the likelihood of two trees being
identical is slim, in particular when we already have the ability
to cache based on pipeline/stage ID (i.e., we can avoid rebuilding
trees if the pipelines that built them were the same).

Drop the concept of a treesum entirely, even though I very much
liked the idea in theory...

Signed-off-by: Tom Gundersen <teg@jklm.no>
This commit is contained in:
Tom Gundersen 2021-10-30 16:02:36 +01:00
parent bf3c80372a
commit e97f6ef34e
3 changed files with 27 additions and 113 deletions

View file

@ -1,15 +1,13 @@
import contextlib
import errno
import hashlib
import os
import subprocess
import tempfile
import uuid
from typing import Optional
from osbuild.util.types import PathLike
from osbuild.util import ctx, jsoncomm, rmrf
from osbuild.util import jsoncomm, rmrf
from . import api
from . import treesum
__all__ = [
@ -86,15 +84,6 @@ class Object:
self._base = base_id
self.id = base_id
@property
def treesum(self) -> str:
"""Calculate the treesum of the object"""
with self._open() as fd:
m = hashlib.sha256()
treesum.treesum(m, fd)
treesum_hash = m.hexdigest()
return treesum_hash
@property
def _path(self) -> str:
if self._base and not self._init:
@ -146,20 +135,21 @@ class Object:
umount(target)
self._readers -= 1
def store_tree(self, destination: str):
"""Store the tree at destination and reset itself
def store_tree(self):
"""Store the tree with a fresh name and reset itself
Moves the tree atomically by using rename(2). If the
target already exist, does nothing. Afterwards it
resets itself and can be used as if it was new.
Moves the tree atomically by using rename(2), to a
randomly generated unique name. Afterwards it resets
itself and can be used as if it was new.
"""
self._check_writable()
self._check_readers()
self._check_writer()
self.init()
with ctx.suppress_oserror(errno.ENOTEMPTY, errno.EEXIST):
os.rename(self._tree, destination)
destination = str(uuid.uuid4())
os.rename(self._tree, os.path.join(self.store.objects, destination))
self.reset()
return destination
def reset(self):
self.cleanup()
@ -349,41 +339,34 @@ class ObjectStore(contextlib.AbstractContextManager):
"""Commits a Object to the object store
Move the contents of the obj (Object) to object directory
of the store with the content hash (obj.treesum) as its name.
Creates a symlink to that ('objects/{hash}') in the references
of the store with a universally unique name. Creates a
symlink to that ('objects/{hash}') in the references
directory with the object_id as the name ('refs/{object_id}).
If the link already exists, it will be atomically replaced.
Returns: The treesum of the object
Returns: The name of the object
"""
treesum_hash = obj.treesum
# the object is stored in the objects directory using its content
# hash as its name, ideally a given object_id (i.e., given config)
# will always produce the same content hash, but that is not
# guaranteed. If an object with the same treesum already exist, us
# the existing one instead
obj.store_tree(os.path.join(self.objects, treesum_hash))
# the object is stored in the objects directory using its unique
# name. This means that eatch commit will always result in a new
# object in the store, even if an identical one exists.
object_name = obj.store_tree()
# symlink the object_id (config hash) in the refs directory to the
# treesum (content hash) in the objects directory. If a symlink by
# that name already exists, atomically replace it, but leave the
# backing object in place (it may be in use).
# object name in the objects directory. If a symlink by that name
# already exists, atomically replace it, but leave the backing object
# in place (it may be in use).
with self.tempdir() as tmp:
link = f"{tmp}/link"
os.symlink(f"../objects/{treesum_hash}", link)
os.symlink(f"../objects/{object_name}", link)
os.replace(link, self.resolve_ref(object_id))
# the reference that is pointing to `treesum_hash` is now the base
# the reference that is pointing to `object_name` is now the base
# of `obj`. It is not actively initialized but any subsequent calls
# to `obj.write()` will initialize it again
# NB: in the case that an object with the same treesum as `obj`
# already existed in the store obj.store_tree() will not actually
# have written anything to the store. In this case `obj` will then
# be initialized with the content of the already existing object.
obj.base = object_id
return treesum_hash
return object_name
def cleanup(self):
"""Cleanup all created Objects that are still alive"""