objectstore: don't store objects by their treesum

The treesum of a filesystem tree is the content hash of all its
files, its directory structure and file metadata.

By storing trees by their treesum, we avoid storing duplicates of
identical trees, at the cost of computing the hashes on every
commit to the store.

This has limited benefit, as two trees are unlikely to be
identical, particularly since we can already cache based on
pipeline/stage ID (i.e., we can avoid rebuilding trees if the
pipelines that built them were the same).

Drop the concept of a treesum entirely, even though I very much
liked the idea in theory...

Signed-off-by: Tom Gundersen <teg@jklm.no>
This commit is contained in:
Tom Gundersen 2021-10-30 16:02:36 +01:00
parent bf3c80372a
commit e97f6ef34e
3 changed files with 27 additions and 113 deletions

View file

@ -101,7 +101,7 @@ class TestObjectStore(unittest.TestCase):
assert os.path.exists(f"{object_store.refs}/b/A")
assert len(os.listdir(object_store.refs)) == 2
assert len(os.listdir(object_store.objects)) == 1
assert len(os.listdir(object_store.objects)) == 2
assert len(os.listdir(f"{object_store.refs}/a/")) == 1
assert len(os.listdir(f"{object_store.refs}/b/")) == 1
@ -133,7 +133,7 @@ class TestObjectStore(unittest.TestCase):
assert os.path.exists(f"{object_store.refs}/c/C")
assert len(os.listdir(object_store.refs)) == 3
assert len(os.listdir(object_store.objects)) == 2
assert len(os.listdir(object_store.objects)) == 3
def test_object_copy_on_write(self):
# operate with a clean object store
@ -152,7 +152,7 @@ class TestObjectStore(unittest.TestCase):
st = os.fstat(f.fileno())
data_inode = st.st_ino
# commit the object as "x"
x_hash = object_store.commit(tree, "x")
object_store.commit(tree, "x")
# after the commit, "x" is now the base
# of "tree"
self.assertEqual(tree.base, "x")
@ -171,7 +171,6 @@ class TestObjectStore(unittest.TestCase):
# the very same content
with object_store.new(base_id="x") as tree:
self.assertEqual(tree.base, "x")
self.assertEqual(tree.treesum, x_hash)
with tree.read() as path:
with open(os.path.join(path, "data"), "r") as f:
# copy-on-write: since we have not written
@ -189,9 +188,6 @@ class TestObjectStore(unittest.TestCase):
self.assertNotEqual(st.st_ino, data_inode)
p = Path(path, "other_data")
p.touch()
# now that we have written, the treesum
# should have changed
self.assertNotEqual(tree.treesum, x_hash)
def test_object_mode(self):
object_store = objectstore.ObjectStore(self.store)
@ -205,9 +201,7 @@ class TestObjectStore(unittest.TestCase):
# check multiple readers are ok
with tree.read() as _:
# calculating the treesum also is reading,
# so this is 3 nested readers
_ = tree.treesum
pass
# writing should still fail
with self.assertRaises(ValueError):