osbuild: integrate FsCache into ObjectStore

Integrate the recently added file system cache `FsCache` into our
object store `ObjectStore`. NB: This changes the semantics of it:
previously a call to `ObjectStore.commit` resulted in the object
being in the cache (i/o errors aside). But `FsCache.store`, which
is now the backing store for objects, will only commit objects if
there is enough space left. Thus we cannot rely on objects being
present for reading after a call to `FsCache.store`. To cope with
this we now always copy the object into the cache, even for cases
where we previously moved it: for the case where commit is called
with `object_id` matching `Object.id`, which is the case for when
`commit` is called for last stage in the pipeline. We could keep
this optimization but then we would have to special case it and
not call `commit` for these cases but only after we exported all
objects; or in other words, after we are sure we will never read
from any committed object again. The extra complexity seems not
worth it for the little gain of the optimization.
Convert all the tests to the new semantics and also remove a lot
of them that make no sense under this new paradigm.

Add a new command line option `--cache-max-size` which will set
the maximum size of the cache, if specified.
This commit is contained in:
Christian Kellner 2022-12-06 18:41:52 +01:00
parent 1e0e1fa2c2
commit ae0680da11
6 changed files with 158 additions and 247 deletions

View file

@ -43,6 +43,9 @@ is not listed here, **osbuild** will deny startup and exit with an error.
are stored are stored
-l DIR, --libdir=DIR directory containing stages, assemblers, and -l DIR, --libdir=DIR directory containing stages, assemblers, and
the osbuild library the osbuild library
--cache-max-size=SIZE maximum size of the cache (bytes) or 'unlimited'
for no restriction (size may include an optional
unit suffix, like kB, kiB, MB, MiB and so on)
--checkpoint=CHECKPOINT stage to commit to the object store during --checkpoint=CHECKPOINT stage to commit to the object store during
build (can be passed multiple times) build (can be passed multiple times)
--export=OBJECT object to export (can be passed multiple times) --export=OBJECT object to export (can be passed multiple times)

View file

@ -15,6 +15,7 @@ import osbuild
import osbuild.meta import osbuild.meta
import osbuild.monitor import osbuild.monitor
from osbuild.objectstore import ObjectStore from osbuild.objectstore import ObjectStore
from osbuild.util.parsing import parse_size
from osbuild.util.term import fmt as vt from osbuild.util.term import fmt as vt
@ -66,6 +67,8 @@ def parse_arguments(sys_argv):
help="directory where intermediary os trees are stored") help="directory where intermediary os trees are stored")
parser.add_argument("-l", "--libdir", metavar="DIRECTORY", type=os.path.abspath, default="/usr/lib/osbuild", parser.add_argument("-l", "--libdir", metavar="DIRECTORY", type=os.path.abspath, default="/usr/lib/osbuild",
help="directory containing stages, assemblers, and the osbuild library") help="directory containing stages, assemblers, and the osbuild library")
parser.add_argument("--cache-max-size", metavar="SIZE", type=parse_size, default=None,
help="maximum size of the cache (bytes) or 'unlimited' for no restriction")
parser.add_argument("--checkpoint", metavar="ID", action="append", type=str, default=None, parser.add_argument("--checkpoint", metavar="ID", action="append", type=str, default=None,
help="stage to commit to the object store during build (can be passed multiple times)") help="stage to commit to the object store during build (can be passed multiple times)")
parser.add_argument("--export", metavar="ID", action="append", type=str, default=[], parser.add_argument("--export", metavar="ID", action="append", type=str, default=[],
@ -150,6 +153,9 @@ def osbuild_cli():
try: try:
with ObjectStore(args.store) as object_store: with ObjectStore(args.store) as object_store:
if args.cache_max_size is not None:
object_store.maximum_size = args.cache_max_size
stage_timeout = args.stage_timeout stage_timeout = args.stage_timeout
pipelines = manifest.depsolve(object_store, exports) pipelines = manifest.depsolve(object_store, exports)

View file

@ -4,10 +4,10 @@ import json
import os import os
import subprocess import subprocess
import tempfile import tempfile
import uuid from typing import Any, Optional, Set, Union
from typing import Any, Optional, Set
from osbuild.util import jsoncomm, rmrf from osbuild.util import jsoncomm
from osbuild.util.fscache import FsCache, FsCacheInfo
from osbuild.util.mnt import mount, umount from osbuild.util.mnt import mount, umount
from osbuild.util.types import PathLike from osbuild.util.types import PathLike
@ -105,22 +105,34 @@ class Object:
def __fspath__(self): def __fspath__(self):
return self.path return self.path
def __init__(self, store: "ObjectStore", uid: str, mode: Mode): def __init__(self, cache: FsCache, uid: str, mode: Mode):
self._cache = cache
self._mode = mode self._mode = mode
self._workdir = None
self._id = uid self._id = uid
self.store = store self._path = None
self._meta: Optional[Object.Metadata] = None
self._stack: Optional[contextlib.ExitStack] = None
def _open_for_reading(self):
name = self._stack.enter_context(
self._cache.load(self.id)
)
self._path = os.path.join(self._cache, name)
def _open_for_writing(self):
name = self._stack.enter_context(
self._cache.stage()
)
self._path = os.path.join(self._cache, name)
os.makedirs(os.path.join(self._path, "tree"))
def __enter__(self):
assert not self.active
self._stack = contextlib.ExitStack()
if self.mode == Object.Mode.READ: if self.mode == Object.Mode.READ:
path = self.store.resolve_ref(uid) self._open_for_reading()
assert path is not None
self._path = os.path.join(path, "data")
else: else:
workdir = self.tempdir("workdir") self._open_for_writing()
self._workdir = workdir
self._path = os.path.join(workdir.name, "data")
tree = os.path.join(self._path, "tree")
os.makedirs(tree)
# Expose our base path as `os.PathLike` via `PathAdater` # Expose our base path as `os.PathLike` via `PathAdater`
# so any changes to it, e.g. via `store_tree`, will be # so any changes to it, e.g. via `store_tree`, will be
@ -128,6 +140,16 @@ class Object:
wrapped = PathAdapter(self, "_path") wrapped = PathAdapter(self, "_path")
self._meta = self.Metadata(wrapped, folder="meta") self._meta = self.Metadata(wrapped, folder="meta")
return self
def __exit__(self, exc_type, exc_value, exc_tb):
assert self.active
self.cleanup()
@property
def active(self) -> bool:
return self._stack is not None
@property @property
def id(self) -> Optional[str]: def id(self) -> Optional[str]:
return self._id return self._id
@ -139,39 +161,22 @@ class Object:
def init(self, base: "Object"): def init(self, base: "Object"):
"""Initialize the object with the base object""" """Initialize the object with the base object"""
self._check_mode(Object.Mode.WRITE) self._check_mode(Object.Mode.WRITE)
assert self.active
assert self._path
base.clone(self._path) base.clone(self._path)
@property @property
def tree(self) -> str: def tree(self) -> str:
assert self.active
assert self._path
return os.path.join(self._path, "tree") return os.path.join(self._path, "tree")
@property @property
def meta(self) -> Metadata: def meta(self) -> Metadata:
assert self.active
assert self._meta
return self._meta return self._meta
def store_tree(self):
"""Store the tree with a fresh name and close it
Moves the tree atomically by using rename(2), to a
randomly generated unique name.
This puts the object into the READ state.
"""
self._check_mode(Object.Mode.WRITE)
name = str(uuid.uuid4())
base = os.path.join(self.store.objects, name)
os.makedirs(base)
destination = os.path.join(base, "data")
os.rename(self._path, destination)
self._path = destination
self.finalize()
self.cleanup()
return name
def finalize(self): def finalize(self):
if self.mode != Object.Mode.WRITE: if self.mode != Object.Mode.WRITE:
return return
@ -180,27 +185,15 @@ class Object:
self._mode = Object.Mode.READ self._mode = Object.Mode.READ
def cleanup(self): def cleanup(self):
workdir = self._workdir if self._stack:
if workdir: self._stack.close()
# manually remove the tree, it might contain self._stack = None
# files with immutable flag set, which will
# throw off standard Python 3 tempdir cleanup
rmrf.rmtree(os.path.join(workdir.name, "data"))
workdir.cleanup()
self._workdir = None
def _check_mode(self, want: Mode): def _check_mode(self, want: Mode):
"""Internal: Raise a ValueError if we are not in the desired mode""" """Internal: Raise a ValueError if we are not in the desired mode"""
if self.mode != want: if self.mode != want:
raise ValueError(f"Wrong object mode: {self.mode}, want {want}") raise ValueError(f"Wrong object mode: {self.mode}, want {want}")
def tempdir(self, suffix=None):
if suffix:
suffix = "-" + suffix
name = f"object-{self._id[:7]}-"
return self.store.tempdir(prefix=name, suffix=suffix)
def export(self, to_directory: PathLike): def export(self, to_directory: PathLike):
"""Copy object into an external directory""" """Copy object into an external directory"""
subprocess.run( subprocess.run(
@ -283,16 +276,14 @@ class HostTree:
class ObjectStore(contextlib.AbstractContextManager): class ObjectStore(contextlib.AbstractContextManager):
def __init__(self, store: PathLike): def __init__(self, store: PathLike):
self.store = store self.cache = FsCache("osbuild", store)
self.objects = os.path.join(store, "objects")
self.refs = os.path.join(store, "refs")
self.tmp = os.path.join(store, "tmp") self.tmp = os.path.join(store, "tmp")
os.makedirs(self.store, exist_ok=True) os.makedirs(self.store, exist_ok=True)
os.makedirs(self.objects, exist_ok=True) os.makedirs(self.objects, exist_ok=True)
os.makedirs(self.refs, exist_ok=True)
os.makedirs(self.tmp, exist_ok=True) os.makedirs(self.tmp, exist_ok=True)
self._objs: Set[Object] = set() self._objs: Set[Object] = set()
self._host_tree: Optional[HostTree] = None self._host_tree: Optional[HostTree] = None
self._stack = contextlib.ExitStack()
def _get_floating(self, object_id: str) -> Optional[Object]: def _get_floating(self, object_id: str) -> Optional[Object]:
"""Internal: get a non-committed object""" """Internal: get a non-committed object"""
@ -301,8 +292,33 @@ class ObjectStore(contextlib.AbstractContextManager):
return obj return obj
return None return None
@property
def maximum_size(self) -> Optional[Union[int, str]]:
info = self.cache.info
return info.maximum_size
@maximum_size.setter
def maximum_size(self, size: Union[int, str]):
info = FsCacheInfo(maximum_size=size)
self.cache.info = info
@property
def active(self) -> bool:
#pylint: disable=protected-access
return self.cache._is_active()
@property
def store(self):
return os.fspath(self.cache)
@property
def objects(self):
return os.path.join(self.cache, "objects")
@property @property
def host_tree(self) -> HostTree: def host_tree(self) -> HostTree:
assert self.active
if not self._host_tree: if not self._host_tree:
self._host_tree = HostTree(self) self._host_tree = HostTree(self)
return self._host_tree return self._host_tree
@ -314,13 +330,11 @@ class ObjectStore(contextlib.AbstractContextManager):
if self._get_floating(object_id): if self._get_floating(object_id):
return True return True
return os.access(self.resolve_ref(object_id), os.F_OK) try:
with self.cache.load(object_id):
def resolve_ref(self, object_id: Optional[str]) -> Optional[str]: return True
"""Returns the path to the given object_id""" except FsCache.MissError:
if not object_id: return False
return None
return os.path.join(self.refs, object_id)
def tempdir(self, prefix=None, suffix=None): def tempdir(self, prefix=None, suffix=None):
"""Return a tempfile.TemporaryDirectory within the store""" """Return a tempfile.TemporaryDirectory within the store"""
@ -329,75 +343,51 @@ class ObjectStore(contextlib.AbstractContextManager):
suffix=suffix) suffix=suffix)
def get(self, object_id): def get(self, object_id):
assert self.active
obj = self._get_floating(object_id) obj = self._get_floating(object_id)
if obj: if obj:
return obj return obj
if not self.contains(object_id): try:
obj = Object(self.cache, object_id, Object.Mode.READ)
self._stack.enter_context(obj)
return obj
except FsCache.MissError:
return None return None
return Object(self, object_id, Object.Mode.READ)
def new(self, object_id: str): def new(self, object_id: str):
"""Creates a new `Object` and open it for writing. """Creates a new `Object` and open it for writing.
It returns a temporary instance of `Object`, the base It returns a instance of `Object` that can be used to
optionally set to `base_id`. It can be used to interact write tree and metadata. Use `commit` to attempt to
with the store. store the object in the cache.
If changes to the object's content were made (by calling
`Object.write`), these must manually be committed to the
store via `commit()`.
""" """
assert self.active
obj = Object(self, object_id, Object.Mode.WRITE) obj = Object(self.cache, object_id, Object.Mode.WRITE)
self._stack.enter_context(obj)
self._objs.add(obj) self._objs.add(obj)
return obj return obj
def commit(self, obj: Object, object_id: str) -> str: def commit(self, obj: Object, object_id: str):
"""Commits a Object to the object store """Commits the Object to the object cache as `object_id`.
Move the contents of the obj (Object) to object directory Attempts to store the contents of `obj` and its metadata
of the store with a universally unique name. Creates a in the object cache. Whether anything is actually stored
symlink to that ('objects/{hash}') in the references depends on the configuration of the cache, i.e. its size
directory with the object_id as the name ('refs/{object_id}). and how much free space is left or can be made available.
If the link already exists, it will be atomically replaced. Therefore the caller should not assume that the stored
object can be retrived at all.
If object_id is different from the id of the object, a copy
of the object will be stored.
Returns: The name of the object
""" """
# The supplied object_id is not the object's final id, so assert self.active
# we have to make a copy first
if obj.id != object_id:
tmp = self.new(object_id)
tmp.init(obj)
obj = tmp
# The object is stored in the objects directory using its unique with self.cache.store(object_id) as name:
# name. This means that each commit will always result in a new path = os.path.join(self.cache, name)
# object in the store, even if an identical one exists. obj.clone(path)
object_name = obj.store_tree()
# symlink the object_id (config hash) in the refs directory to the
# object name in the objects directory. If a symlink by that name
# already exists, atomically replace it, but leave the backing object
# in place (it may be in use).
with self.tempdir() as tmp:
link = f"{tmp}/link"
os.symlink(f"../objects/{object_name}", link)
ref = self.resolve_ref(object_id)
if not ref:
raise RuntimeError("commit with unresolvable ref")
os.replace(link, ref)
return object_name
def cleanup(self): def cleanup(self):
"""Cleanup all created Objects that are still alive""" """Cleanup all created Objects that are still alive"""
@ -405,10 +395,19 @@ class ObjectStore(contextlib.AbstractContextManager):
self._host_tree.cleanup() self._host_tree.cleanup()
self._host_tree = None self._host_tree = None
for obj in self._objs: self._stack.close()
obj.cleanup() self._objs = set()
def __fspath__(self):
return os.fspath(self.store)
def __enter__(self):
assert not self.active
self._stack.enter_context(self.cache)
return self
def __exit__(self, exc_type, exc_val, exc_tb): def __exit__(self, exc_type, exc_val, exc_tb):
assert self.active
self.cleanup() self.cleanup()

View file

@ -16,6 +16,9 @@ git checkout "$MANIFEST_DB_COMMIT"
OSBUILD_LABEL=$(matchpathcon -n /usr/bin/osbuild) OSBUILD_LABEL=$(matchpathcon -n /usr/bin/osbuild)
chcon $OSBUILD_LABEL tools/image-info chcon $OSBUILD_LABEL tools/image-info
# set the maximum cache size to unlimited
echo "{}" | sudo osbuild --cache-max-size unlimited -
# run the tests from the manifest-db for this arch+distro # run the tests from the manifest-db for this arch+distro
echo "Running the osbuild-image-test for arch $ARCH and ditribution $DISTRO_CODE" echo "Running the osbuild-image-test for arch $ARCH and ditribution $DISTRO_CODE"
sudo tools/osbuild-image-test --arch=$ARCH --distro=$DISTRO_CODE --image-info-path=tools/image-info sudo tools/osbuild-image-test --arch=$ARCH --distro=$DISTRO_CODE --image-info-path=tools/image-info

View file

@ -15,12 +15,10 @@ from .. import test
def store_path(store: objectstore.ObjectStore, ref: str, path: str) -> bool: def store_path(store: objectstore.ObjectStore, ref: str, path: str) -> bool:
if not store.contains(ref): obj = store.get(ref)
if not obj:
return False return False
obj = store.resolve_ref(ref) return os.path.exists(os.path.join(obj, path))
if not obj or not os.path.exists(obj):
return False
return os.path.exists(os.path.join(obj, "data", "tree", path))
@unittest.skipUnless(test.TestBase.can_bind_mount(), "root-only") @unittest.skipUnless(test.TestBase.can_bind_mount(), "root-only")
@ -35,8 +33,9 @@ class TestObjectStore(unittest.TestCase):
def test_basic(self): def test_basic(self):
# always use a temporary store so item counting works # always use a temporary store so item counting works
with objectstore.ObjectStore(self.store) as object_store: with objectstore.ObjectStore(self.store) as object_store:
object_store.maximum_size = 1024*1024*1024
# No objects or references should be in the store # No objects or references should be in the store
assert len(os.listdir(object_store.refs)) == 0
assert len(os.listdir(object_store.objects)) == 0 assert len(os.listdir(object_store.objects)) == 0
tree = object_store.new("a") tree = object_store.new("a")
@ -47,157 +46,51 @@ class TestObjectStore(unittest.TestCase):
p = Path(tree, "A") p = Path(tree, "A")
p.touch() p.touch()
# consumes the object, puts it into read mode tree.finalize() # put the object into READ mode
object_store.commit(tree, "a")
assert tree.mode == objectstore.Object.Mode.READ assert tree.mode == objectstore.Object.Mode.READ
assert object_store.contains("a") # commit makes a copy, if space
object_store.commit(tree, "a")
assert store_path(object_store, "a", "A") assert store_path(object_store, "a", "A")
assert len(os.listdir(object_store.refs)) == 1 # second object, based on the first one
assert len(os.listdir(object_store.objects)) == 1 obj2 = object_store.new("b")
obj2.init(tree)
tree = object_store.new("b") p = Path(obj2, "B")
p = Path(tree, "A")
p.touch()
p = Path(tree, "B")
p.touch() p.touch()
# consumes the object, puts it into read mode obj2.finalize() # put the object into READ mode
assert obj2.mode == objectstore.Object.Mode.READ
# commit always makes a copy, if space
object_store.commit(tree, "b") object_store.commit(tree, "b")
assert object_store.contains("b") assert object_store.contains("b")
assert store_path(object_store, "b", "A")
assert store_path(object_store, "b", "B") assert store_path(object_store, "b", "B")
assert len(os.listdir(object_store.refs)) == 2
assert len(os.listdir(object_store.objects)) == 2 assert len(os.listdir(object_store.objects)) == 2
# assert len(os.listdir(f"{object_store.refs}/b/")) == 2
self.assertEqual(object_store.resolve_ref(None), None)
self.assertEqual(object_store.resolve_ref("a"),
f"{object_store.refs}/a")
# object should exist and should be in read mode
tree = object_store.get("b") tree = object_store.get("b")
assert tree is not None assert tree is not None
assert tree.mode == objectstore.Object.Mode.READ assert tree.mode == objectstore.Object.Mode.READ
def test_cleanup(self): def test_cleanup(self):
# always use a temporary store so item counting works # always use a temporary store so item counting works
with tempfile.TemporaryDirectory(dir="/var/tmp") as tmp: with objectstore.ObjectStore(self.store) as object_store:
with objectstore.ObjectStore(tmp) as object_store: object_store.maximum_size = 1024*1024*1024
tree = object_store.new("a")
self.assertEqual(len(os.listdir(object_store.tmp)), 1)
p = Path(tree, "A")
p.touch()
# there should be no temporary Objects dirs anymore stage = os.path.join(object_store, "stage")
self.assertEqual(len(os.listdir(object_store.tmp)), 0) tree = object_store.new("a")
self.assertEqual(len(os.listdir(stage)), 1)
def test_commit_clone(self):
# operate with a clean object store
with tempfile.TemporaryDirectory(dir="/var/tmp") as tmp:
# sample data to be used for read, write checks
data = "23"
with objectstore.ObjectStore(tmp) as store:
assert len(os.listdir(store.refs)) == 0
tree = store.new("a")
with open(os.path.join(tree, "data"), "w",
encoding="utf-8") as f:
f.write(data)
st = os.fstat(f.fileno())
data_inode = st.st_ino
# commit the object as "x", making a copy
store.commit(tree, "x")
# check that "data" got indeed copied
tree = store.get("x")
assert tree is not None
with open(os.path.join(tree, "data"), "r",
encoding="utf-8") as f:
st = os.fstat(f.fileno())
self.assertNotEqual(st.st_ino, data_inode)
data_read = f.read()
self.assertEqual(data, data_read)
def test_commit_consume(self):
# operate with a clean object store
with tempfile.TemporaryDirectory(dir="/var/tmp") as tmp:
# sample data to be used for read, write checks
data = "23"
with objectstore.ObjectStore(tmp) as store:
assert len(os.listdir(store.refs)) == 0
tree = store.new("a")
with open(os.path.join(tree, "data"), "w", encoding="utf8") as f:
f.write(data)
st = os.fstat(f.fileno())
data_inode = st.st_ino
# commit the object as "a"
store.commit(tree, "a")
assert len(os.listdir(store.refs)) == 1
# check that "data" is still the very
# same file after committing
with open(os.path.join(tree, "data"), "r", encoding="utf8") as f:
st = os.fstat(f.fileno())
self.assertEqual(st.st_ino, data_inode)
data_read = f.read()
self.assertEqual(data, data_read)
def test_object_base(self):
with objectstore.ObjectStore(self.store) as store:
assert len(os.listdir(store.refs)) == 0
assert len(os.listdir(store.objects)) == 0
base = store.new("a")
p = Path(base, "A")
p.touch()
store.commit(base, "a")
assert store.contains("a")
assert store_path(store, "a", "A")
tree = store.new("b")
tree.init(base)
p = Path(tree, "B")
p.touch()
tree.finalize()
assert os.path.exists(os.path.join(tree, "A"))
assert os.path.exists(os.path.join(tree, "B"))
def test_snapshot(self):
with objectstore.ObjectStore(self.store) as store:
tree = store.new("b")
p = Path(tree, "A") p = Path(tree, "A")
p.touch() p.touch()
assert not store.contains("a") # there should be no temporary Objects dirs anymore
store.commit(tree, "a") # store via "a", creates a clone with objectstore.ObjectStore(self.store) as object_store:
assert store.contains("a") assert object_store.get("A") is None
p = Path(tree, "B")
p.touch()
store.commit(tree, "b")
# check the references exist
assert os.path.exists(f"{store.refs}/a")
assert os.path.exists(f"{store.refs}/b")
# check the contents of the trees
assert store_path(store, "a", "A")
assert not store_path(store, "a", "B")
assert store_path(store, "b", "A")
assert store_path(store, "b", "B")
def test_metadata(self): def test_metadata(self):
@ -256,6 +149,7 @@ class TestObjectStore(unittest.TestCase):
assert md.get("a") == data assert md.get("a") == data
with objectstore.ObjectStore(self.store) as store: with objectstore.ObjectStore(self.store) as store:
store.maximum_size = 1024*1024*1024
obj = store.new("a") obj = store.new("a")
p = Path(obj, "A") p = Path(obj, "A")
p.touch() p.touch()

View file

@ -12,6 +12,7 @@ import tempfile
import unittest import unittest
import osbuild.meta import osbuild.meta
from osbuild.objectstore import ObjectStore
from osbuild.util import linux from osbuild.util import linux
@ -277,6 +278,8 @@ class OSBuild(contextlib.AbstractContextManager):
_exitstack = None _exitstack = None
_cachedir = None _cachedir = None
maximum_cache_size = 20 * 1024 * 1024 * 1024 # 20 GB
def __init__(self, *, cache_from=None): def __init__(self, *, cache_from=None):
self._cache_from = cache_from self._cache_from = cache_from
@ -297,6 +300,9 @@ class OSBuild(contextlib.AbstractContextManager):
self._cachedir self._cachedir
], check=True) ], check=True)
with ObjectStore(self._cachedir) as store:
store.maximum_size = self.maximum_cache_size
# Keep our ExitStack for `__exit__()`. # Keep our ExitStack for `__exit__()`.
self._exitstack = self._exitstack.pop_all() self._exitstack = self._exitstack.pop_all()