This commit adds code that removes the least recently used entries when a store() operation does not succeed because the cache is full. To amortize the cost, it tries to free twice the requested size (the factor is currently hard-coded).
"""File System Cache
|
|
|
|
This module implements a data cache that uses the file system to store data
|
|
as well as protect parallel access. It implements automatic cache management
|
|
and allows purging the cache during runtime, pruning old entries and keeping
|
|
the cache under a given limit.
|
|
"""
|
|
|
|
# pylint: disable=too-many-lines
|
|
|
|
import contextlib
|
|
import ctypes
|
|
import errno
|
|
import json
|
|
import os
|
|
import subprocess
|
|
import uuid
|
|
from typing import Any, Dict, List, NamedTuple, Optional, Tuple, Union
|
|
|
|
from osbuild.util import ctx, linux, rmrf
|
|
|
|
__all__ = [
|
|
"FsCache",
|
|
"FsCacheInfo",
|
|
]
|
|
|
|
|
|
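

# A minimal usage sketch (hypothetical application-ID and cache path; the
# `FsCache` class below provides the actual API and semantics):
#
#     with FsCache("com.example.app", "/var/cache/example") as cache:
#         with cache.store("entry-name") as rpath:
#             with open(os.path.join(cache, rpath, "blob"), "w") as f:
#                 f.write("content")
#         try:
#             with cache.load("entry-name") as rpath:
#                 ...  # read from os.path.join(cache, rpath)
#         except FsCache.MissError:
#             pass  # not committed (e.g., cache full) or already evicted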


MaximumSizeType = Optional[Union[int, str]]


class FsCacheInfo(NamedTuple):
    """File System Cache Information

    This type represents static cache information. It is an immutable named
    tuple and used to query or set the configuration of a cache.

    creation_boot_id - Hashed linux boot-id at the time of cache-creation
    maximum_size - Maximum cache size in bytes, or "unlimited"
    version - Version of the cache data structures
    """

    creation_boot_id: Optional[str] = None
    maximum_size: MaximumSizeType = None
    version: Optional[int] = None

    @classmethod
    def from_json(cls, data: Any) -> "FsCacheInfo":
        """Create tuple from parsed JSON

        This takes a parsed JSON value and converts it into a tuple with the
        same information. Unknown fields in the input are ignored. The input
        is usually taken from `json.load()` and similar.
        """

        if not isinstance(data, dict):
            return cls()

        creation_boot_id = None
        maximum_size: MaximumSizeType = None
        version = None

        # parse "creation-boot-id"
        _creation_boot_id = data.get("creation-boot-id")
        if isinstance(_creation_boot_id, str) and len(_creation_boot_id) == 32:
            creation_boot_id = _creation_boot_id

        # parse "maximum-size"
        _maximum_size = data.get("maximum-size")
        if isinstance(_maximum_size, int):
            maximum_size = _maximum_size
        elif isinstance(_maximum_size, str) and _maximum_size == "unlimited":
            maximum_size = "unlimited"

        # parse "version"
        _version = data.get("version")
        if isinstance(_version, int):
            version = _version

        # create immutable tuple
        return cls(
            creation_boot_id,
            maximum_size,
            version,
        )

    def to_json(self) -> Dict[str, Any]:
        """Convert tuple into parsed JSON

        Return a parsed JSON value that represents the same values as this
        tuple does. Unset values are skipped. The returned value can be
        converted into formatted JSON via `json.dump()` and similar.
        """

        data: Dict[str, Any] = {}
        if self.creation_boot_id is not None:
            data["creation-boot-id"] = self.creation_boot_id
        if self.maximum_size is not None:
            data["maximum-size"] = self.maximum_size
        if self.version is not None:
            data["version"] = self.version
        return data
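

# A round-trip sketch (assumed values; the keys mirror `from_json()` and
# `to_json()` above):
#
#     info = FsCacheInfo.from_json({"version": 1, "maximum-size": "unlimited"})
#     assert info.maximum_size == "unlimited"
#     assert info.to_json() == {"maximum-size": "unlimited", "version": 1}
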

class FsCacheObjectInfo(NamedTuple):
    """File System Cache Object Information

    This type represents information about a single cache object. The
    `last_used` information is only guaranteed to be valid while the cache
    is locked.
    """
    name: str
    last_used: float


class FsCache(contextlib.AbstractContextManager, os.PathLike):
    """File System Cache

    This file system cache context represents an on-disk cache. That is, it
    allows storing information on the file system, and retrieving it from
    other contexts.

    A single cache directory can be shared between many processes at the same
    time. The cache protects access to the cached data. The cache must not be
    shared over non-coherent network storage, but is designed for system-local
    linux file-systems.

    The file-system layout is as follows:

        [cache]/
        ├── cache.info
        ├── cache.lock
        ├── cache.size
        ├── objects/
        │   ├── [id0]
        │   ├── [id1]/
        │   │   ├── data/
        │   │   │   └── ...
        │   │   ├── object.info
        │   │   └── object.lock
        │   └── ...
        └── stage/
            ├── uuid-[uuid0]
            ├── uuid-[uuid1]/
            │   ├── data/
            │   │   └── ...
            │   ├── object.info
            │   └── object.lock
            └── ...

    The central data store is in the `objects` subdirectory. Every cache entry
    has a separate subdirectory there. To guard access, a read-lock on
    `object.lock` is required for all readers, a write-lock is required for
    all writers. Static information about the object is available in the
    `object.info` file.

    As an optimization, entries in the object store consisting of a single
    file can be stored directly underneath `objects` without a separate
    subdirectory hierarchy. Their guarding lock is taken directly on this file
    and no metadata is available, other than the file information itself. This
    is used extensively by the cache management to prepare objects for atomic
    replacements. Due to the lack of metadata, they are volatile and can be
    deleted as soon as they are unlocked.

    Generally, access to the cache is non-blocking. That is, if a read-lock
    cannot be acquired, an entry is considered non-existent. Thus, unless
    treated as a `write-once` cache, cache efficiency will decrease when
    taking write-locks.

    The `data/` directory contains the content of a cache entry. Its content
    is solely defined by the creator of the entry and the cache makes no
    assumptions about its layout. Note that the `data/` directory itself can
    be modified (e.g., permission-changes) if an unnamed top-level directory
    is desired (e.g., to store a directory tree).

    In addition to the `objects/` directory, a similar `stage/` directory is
    provided. This directory is `write-only` and used to prepare entries for
    the object store before committing them. The staging area is optional. It
    is completely safe to do the same directly in the object store. However,
    the separation allows putting the staging area on a different file-system
    (e.g., symlinking to a tmpfs), and thus improving performance for larger
    operations. Otherwise, the staging area follows the same rules as the
    object store, except that only writers are expected. Hence, staging
    entries always use a unique UUID as name. To commit a staging entry, a
    user is expected to create an entry in the object store and copy/move the
    `data/` directory over.

    To guard against parallel accesses, a set of locks is utilized. Generally,
    a `*.lock`-file locks the directory it is in, while a lock on any other
    file just locks that file (unfortunately, we cannot acquire write-locks on
    directories directly, since it would require opening them for writing,
    which is not possible on linux). `cache.lock` can be used to guard the
    entire cache. A write-lock will keep any other parallel operation out,
    while a read-lock merely acquires cache access (you are still allowed to
    modify the cache, but need fine-grained locking). Hence, a write-lock on
    the global `cache.lock` file is only required for operations that cannot
    use fine-grained locking. The latter requires individual locking for each
    file or each object store entry you modify. In all those cases you must
    account for parallel modifications, since lock acquisition on file-systems
    can only be done after opening a file.
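
    As an illustration, an operation that needs the entire cache exclusively
    would take a write-lock on `cache.lock` (a sketch only, using the same
    `linux.fcntl_flock` helper this module uses internally):

        fd = os.open(os.path.join(cache, "cache.lock"), os.O_RDWR | os.O_CLOEXEC)
        try:
            linux.fcntl_flock(fd, linux.fcntl.F_WRLCK, wait=True)
            ...  # exclusive cache-wide work
        finally:
            linux.fcntl_flock(fd, linux.fcntl.F_UNLCK)
            os.close(fd)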
"""
|
|
|
|
class MissError(Exception):
|
|
"""Cache Miss Exception
|
|
|
|
This error is raised when a cache entry is not found. Due to the
|
|
shared nature of the cache, a caller must be aware that any entry can
|
|
be created or deleted by other concurrent operations, at any point in
|
|
time. Hence, a cache miss only reflects the state of the cache at a
|
|
particular time under a particular lock.
|
|
"""
|
|
|
|
# static parameters
|
|
_dirname_data = "data"
|
|
_dirname_objects = "objects"
|
|
_dirname_stage = "stage"
|
|
_filename_cache_info = "cache.info"
|
|
_filename_cache_lock = "cache.lock"
|
|
_filename_cache_size = "cache.size"
|
|
_filename_cache_tag = "CACHEDIR.TAG"
|
|
_filename_object_info = "object.info"
|
|
_filename_object_lock = "object.lock"
|
|
_version_current = 1
|
|
_version_minimum = 1
|
|
|
|
# constant properties
|
|
_appid: str
|
|
_tracers: Dict[str, Any]
|
|
_path_cache: Any
|
|
|
|
# context-manager properties
|
|
_active: bool
|
|
_bootid: Optional[str]
|
|
_lock: Optional[int]
|
|
_info: FsCacheInfo
|
|
_info_maximum_size: int
|
|
|
|
def __init__(self, appid: str, path_cache: Any):
|
|
"""Create File System Cache
|
|
|
|
This creates a new file-system cache. It does not create the cache, nor
|
|
access any of its content. You must enter its context-manager to prepare
|
|
the cache for access. Any access outside of a context-manager will raise
|
|
an assertion error, unless explicitly stated otherwise.
|
|
|
|
Parameters:
|
|
-----------
|
|
appid
|
|
The application-ID of the caller. This can be any random string. It
|
|
is used to initialize the application-specific boot-ID used to tag
|
|
caches and detect whether an entry was created during the same boot.
|
|
path_cache
|
|
The path to the cache directory. The directory (and the path to it)
|
|
is created if it does not exist.
|
|
"""
|
|
|
|
self._appid = appid
|
|
self._tracers = {}
|
|
self._path_cache = os.fspath(path_cache)
|
|
|
|
self._active = False
|
|
self._bootid = None
|
|
self._lock = None
|
|
self._info = FsCacheInfo()
|
|
self._info_maximum_size = 0
|
|
|
|
def _trace(self, trace: str):
|
|
"""Trace execution
|
|
|
|
Execute registered trace-hooks for the given trace string. This allows
|
|
tests to register callbacks that are executed at runtime at a specific
|
|
location in the code. During normal operation, no such hooks should be
|
|
used.
|
|
|
|
The trace-hooks are used to trigger race-conditions during tests and
|
|
verify they are handled gracefully.
|
|
|
|
Parameters:
|
|
-----------
|
|
trace
|
|
The trace-hook to run.
|
|
"""
|
|
|
|
if trace in self._tracers:
|
|
self._tracers[trace]()
|
|
|
|
@staticmethod
|
|
def _calculate_space(path_target: str) -> int:
|
|
"""Calculate total space of a directory tree
|
|
|
|
Calculate the total amount of storage required for a directory tree in
|
|
bytes. This does not account for metadata, but only for stored file
|
|
content.
|
|
|
|
Note that this may differ from the sum of the file sizes as it
|
|
takes sparse files into account.
|
|
|
|
Parameters:
|
|
-----------
|
|
path_target
|
|
File-system path to the directory to operate on.
|
|
"""
|
|
|
|
return os.lstat(path_target).st_blocks * 512 + sum(
|
|
os.lstat(
|
|
os.path.join(path, f)
|
|
).st_blocks * 512 for path, dirs, files in os.walk(
|
|
path_target
|
|
) for f in files + dirs
|
|
)
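
    # Note: `st_blocks` counts 512-byte blocks actually allocated, so a large
    # sparse file with only 4 KiB of written data contributes roughly 4 KiB
    # here, not the nominal `st_size`, which is what the cache-size
    # accounting wants.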

    def __fspath__(self) -> Any:
        """Return cache path

        Return the path to this cache as provided to the constructor of the
        cache. No conversions are applied, so the path is absolute if the
        path as provided by the caller was absolute, and vice-versa.

        This is part of the `os.PathLike` interface. See its documentation.
        """

        return self._path_cache

    def _path(self, *rpaths):
        """Return absolute path into cache location

        Take the relative path from the caller and turn it into an absolute
        path. Since most operations take a relative path from the cache root
        to a cache location, this function can be used to make those paths
        absolute.

        Parameters:
        -----------
        rpaths
            Relative paths from cache root to the desired cache location.
        """

        return os.path.join(self, *rpaths)

    @contextlib.contextmanager
    def _atomic_open(
            self,
            rpath: str,
            *,
            wait: bool,
            write: bool,
            closefd: bool = True,
            oflags: int = 0,
    ):
        """Atomically open and lock file

        Open the cache-file at the specified relative path and acquire a
        lock on it. Yield the file-descriptor to the caller. Once control
        returns, all locks are released (if not already done so by the
        caller) and the file-descriptor is closed.

        Note that this operation involves a retry-loop in case the file is
        replaced or moved before the lock is acquired.

        Parameters:
        -----------
        rpath
            Relative path from the cache-root to the file to open.
        wait
            Whether to wait for locks to be acquired.
        write
            If false, the file is opened for reading and a read lock is
            acquired. If true, it is opened for read and write and a write
            lock is acquired.
        closefd
            If false, retain file-descriptor (and lock) on success.
        oflags
            Additional open-flags to pass to `os.open()`.
        """

        fd = None
        path = self._path(rpath)

        try:
            while True:
                # Open the file and acquire a lock. Make sure not to modify
                # the file in any way, ever. If non-blocking operation was
                # requested the lock call will raise `EAGAIN` if contended.
                flags = os.O_RDONLY | os.O_CLOEXEC | oflags
                lock = linux.fcntl.F_RDLCK
                if write:
                    flags = flags | os.O_RDWR
                    lock = linux.fcntl.F_WRLCK
                self._trace("_atomic_open:open")
                fd = os.open(path, flags, 0o644)
                self._trace("_atomic_open:lock")
                linux.fcntl_flock(fd, lock, wait=wait)

                # The file might have been replaced between opening it and
                # acquiring the lock. Hence, run `stat(2)` on the path again
                # and compare it to `fstat(2)` of the open file. If they
                # differ, simply retry.
                # On NFS, the lock-acquisition has invalidated the caches,
                # hence the metadata is refetched. On linux, the first query
                # will succeed and reflect the drop in link-count. Every
                # further query will yield `ESTALE`. Yet, we cannot rely on
                # being the first to query, so proceed carefully.
                # On non-NFS, information is coherent and we can simply
                # proceed comparing the DEV+INO information to see whether
                # the file was replaced.

                retry = False

                try:
                    st_fd = os.stat(fd)
                except OSError as e:
                    if e.errno != errno.ESTALE:
                        raise
                    retry = True

                try:
                    st_path = os.stat(path)
                except OSError as e:
                    if e.errno not in [errno.ENOENT, errno.ESTALE]:
                        raise
                    retry = True

                if retry or st_fd.st_dev != st_path.st_dev or st_fd.st_ino != st_path.st_ino:
                    linux.fcntl_flock(fd, linux.fcntl.F_UNLCK)
                    os.close(fd)
                    fd = None
                    continue

                # Yield control to the caller to make use of the FD. If the
                # FD is to be retained, clear it before returning to the
                # cleanup handlers.
                yield fd

                if not closefd:
                    fd = None

                return
        finally:
            if fd is not None:
                linux.fcntl_flock(fd, linux.fcntl.F_UNLCK)
                os.close(fd)

    @contextlib.contextmanager
    def _atomic_file(
            self,
            rpath: str,
            rpath_store: str,
            closefd: bool = True,
            ignore_exist: bool = False,
            replace: bool = False,
    ):
        """Create and link temporary file

        Create a new temporary file and yield control to the caller to fill
        in data and metadata. Once control is returned, the file is linked at
        the specified location. If an exception is raised, the temporary file
        is discarded.

        This function emulates the behavior of `O_TMPFILE` for systems and
        file-systems where it is not available.

        Parameters:
        -----------
        rpath
            Relative path from cache-root to the location where to link the
            file on success.
        rpath_store
            Relative path from cache-root to the store to use for temporary
            files. This must share the same mount-instance as the final path.
        closefd
            If false, retain file-descriptor (and lock) on success.
        ignore_exist
            If true, an existing file at the desired location during a
            replacement will not cause an error.
        replace
            If true, replace a previous file at the specified location. If
            false, no replacement takes place and the temporary file is
            discarded.
        """

        assert not replace or not ignore_exist

        rpath_tmp = None

        try:
            # First create a random file in the selected store. This file
            # will have a UUID as name and thus we can safely use
            # `O_CREAT|O_EXCL` to create it and guarantee its uniqueness.
            name = "uuid-" + uuid.uuid4().hex
            rpath_tmp = os.path.join(rpath_store, name)
            with self._atomic_open(
                    rpath_tmp,
                    wait=True,
                    write=True,
                    closefd=closefd,
                    oflags=os.O_CREAT | os.O_EXCL,
            ) as fd:
                # Yield control to the caller to fill in data and metadata.
                with os.fdopen(fd, "r+", closefd=False, encoding="utf8") as file:
                    yield file

                suppress = []
                if ignore_exist:
                    suppress.append(errno.EEXIST)

                if replace:
                    # Move the file into the desired location, possibly
                    # replacing any existing entry.
                    os.rename(
                        src=self._path(rpath_tmp),
                        dst=self._path(rpath),
                    )
                else:
                    # Preferably, we would use `RENAME_NOREPLACE`, but this
                    # is not supported on NFS. Instead, we create a hard-link,
                    # which will fail if the target already exists. We rely
                    # on the cleanup-path to drop the original link.
                    with ctx.suppress_oserror(*suppress):
                        os.link(
                            src=self._path(rpath_tmp),
                            dst=self._path(rpath),
                            follow_symlinks=False,
                        )
        finally:
            if rpath_tmp is not None:
                # If the temporary file exists, we delete it. If we haven't
                # created it, or if we already moved it, this will be a
                # no-op. Due to the unique name, we will never delete a file
                # we do not own. If we hard-linked the file, this merely
                # deletes the original temporary link.
                # On fatal errors, we leak the file into the object store.
                # Due to the released lock and UUID name, cache management
                # will clean it up.
                with ctx.suppress_oserror(errno.ENOENT):
                    os.unlink(self._path(rpath_tmp))
    def _atomic_dir(self, rpath_store: str) -> Tuple[str, int]:
        """Atomically create and lock an anonymous directory

        Create an anonymous directory in the specified storage directory
        relative to the cache-root. The directory will have a UUID as name.
        On success, the name of the directory and the open file-descriptor
        to its acquired lock file (write-locked) are returned.

        The lock-file logic follows the cache-logic for objects. Hence, the
        cache scaffolding for the specified store must exist. No other cache
        infrastructure is required, though.

        Parameters:
        -----------
        rpath_store
            Relative path from the cache-root to the storage directory to
            create the new anonymous directory in. Most likely, this is
            either the object-store or the staging-area.
        """

        rpath_dir = None
        rpath_lock = None

        try:
            while True:
                # Allocate a UUID for the new directory and prepare the
                # paths to the directory and lock-file inside.
                name = "uuid-" + uuid.uuid4().hex
                rpath_dir = os.path.join(rpath_store, name)
                rpath_lock = os.path.join(rpath_dir, self._filename_object_lock)

                # Create an anonymous lock-file, but before linking it create
                # the target directory to link the file in. Use an ExitStack
                # to control exactly where to catch exceptions.
                with contextlib.ExitStack() as es:
                    f = es.enter_context(
                        self._atomic_file(
                            rpath_lock,
                            rpath_store,
                            closefd=False,
                        )
                    )
                    lockfd = f.fileno()
                    os.mkdir(self._path(rpath_dir))

                    # Exit the `_atomic_file()` context, thus triggering a
                    # link of the anonymous lock-file into the new directory.
                    # A parallel cleanup might have deleted the empty
                    # directory, so catch `ENOENT` and retry.
                    try:
                        es.close()
                    except OSError as e:
                        if e.errno == errno.ENOENT:
                            continue
                        raise

                return (name, lockfd)
        except BaseException:
            # On error, we might have already created the directory or even
            # linked the lock-file. Try unlinking both, but ignore errors if
            # they do not exist. Due to using UUIDs as names we cannot
            # conflict with entries created by someone else.
            if rpath_lock is not None:
                with ctx.suppress_oserror(errno.ENOENT, errno.ENOTDIR):
                    os.unlink(self._path(rpath_lock))
            if rpath_dir is not None:
                with ctx.suppress_oserror(errno.ENOENT, errno.ENOTDIR):
                    os.rmdir(self._path(rpath_dir))
            raise

    def _create_scaffolding(self):
        """Create cache scaffolding

        Create the directories leading to the cache, as well as the internal
        scaffolding directories and files. This ensures that an existing
        cache is not interrupted or rewritten. Hence, this can safely be
        called in parallel, even on live caches.

        If this happens to create a new cache, it is initialized with its
        default configuration and constraints. By default, this means the
        cache has a maximum size of 0 and thus is only used as staging area
        with no long-time storage.

        This call requires no cache-infrastructure to be in place, and can
        be called repeatedly at any time.
        """

        # Create the directory-scaffolding of the cache. Make sure to ignore
        # errors when they already exist, to allow for parallel setups.
        dirs = [
            self._path(self._dirname_objects),
            self._path(self._dirname_stage),
        ]
        for i in dirs:
            os.makedirs(i, exist_ok=True)

        # Create the file-scaffolding of the cache. We fill in the default
        # information and ignore racing operations.
        with self._atomic_file(self._filename_cache_tag, self._dirname_objects, ignore_exist=True) as f:
            f.write(
                "Signature: 8a477f597d28d172789f06886806bc55\n"
                "# This is a cache directory tag created by osbuild (see https://bford.info/cachedir/)\n"
            )
        with self._atomic_file(self._filename_cache_info, self._dirname_objects, ignore_exist=True) as f:
            json.dump({"version": self._version_current}, f)
        with self._atomic_file(self._filename_cache_lock, self._dirname_objects, ignore_exist=True):
            pass
        with self._atomic_file(self._filename_cache_size, self._dirname_objects, ignore_exist=True) as f:
            f.write("0")
    def _load_cache_info(self, info: Optional[FsCacheInfo] = None):
        """Load cache information

        This loads information about the cache into this cache-instance. The
        cache-information is itself cached on this instance and only updated
        on request. If the underlying file in the cache changes at runtime,
        it is not automatically re-loaded; the information is only reloaded
        when this function is called.

        By default this function reads the cache-information from the
        respective file in the cache and then caches it on this instance. If
        the `info` argument is not `None`, then no information is read from
        the file-system, but instead the information is taken from the
        `info` argument. This allows changing the cache-information of this
        instance without necessarily modifying the underlying file.

        This call requires the cache scaffolding to be fully created.

        Parameters:
        -----------
        info
            If `None`, the cache info file is read. Otherwise, the
            information is taken from this tuple.
        """

        # Parse the JSON data into python.
        if info is None:
            with open(self._path(self._filename_cache_info), "r", encoding="utf8") as f:
                info_raw = json.load(f)

            info = FsCacheInfo.from_json(info_raw)

        # Retain information.
        self._info = info

        # Parse `maximum-size` into internal representation.
        if info.maximum_size == "unlimited":
            self._info_maximum_size = -1
        elif isinstance(info.maximum_size, int):
            self._info_maximum_size = info.maximum_size
        elif info.maximum_size is None:
            self._info_maximum_size = 0
        else:
            raise ValueError(
                f"maximum-size can only be set to 'unlimited' or an integer value, got {type(info.maximum_size)}")

    def _is_active(self):
        # Internal helper to verify we are in an active context-manager.
        return self._active

    def _is_compatible(self):
        # Internal helper to verify the cache-version is supported.
        return self._info.version is not None and \
            self._version_minimum <= self._info.version <= self._version_current

    def __enter__(self):
        assert not self._active

        try:
            # Acquire the current boot-id so we can tag entries accordingly,
            # and judge entries that are from previous boots.
            self._bootid = linux.proc_boot_id(self._appid).hex

            # Create the scaffolding for the entire cache.
            self._create_scaffolding()

            # Acquire a shared cache lock.
            self._lock = os.open(
                self._path(self._filename_cache_lock),
                os.O_RDONLY | os.O_CLOEXEC,
            )
            linux.fcntl_flock(self._lock, linux.fcntl.F_RDLCK, wait=True)

            # Read the cache configuration.
            self._load_cache_info()

            self._active = True
            return self
        except BaseException:
            self.__exit__(None, None, None)
            raise

    def __exit__(self, exc_type, exc_value, exc_tb):
        # Discard any state of this context and reset to original state.
        if self._lock is not None:
            linux.fcntl_flock(self._lock, linux.fcntl.F_UNLCK)
            os.close(self._lock)
            self._lock = None
        self._active = False
        self._bootid = None
        self._info = FsCacheInfo()
        # We always have to leave the file-system scaffolding around. Even
        # if the cache is entirely empty, we cannot know whether there are
        # other parallel accesses (without unreasonable effort).

    def _update_cache_size(self, diff: int) -> bool:
        """Update cache size

        Update the total cache size by the specified amount, unless it
        exceeds the cache limits.

        This carefully updates the stored cache size to allow for parallel
        updates by other cache users. If the cache limits are exceeded, the
        operation is canceled and `False` is returned. Otherwise, `True` is
        returned.

        If the specified amount is negative, the operation always succeeds.
        If the cache size would end up negative, it is capped at 0.

        This operation requires an active context.
        """

        assert self._is_active()
        assert self._is_compatible()

        # Open the cache-size file and lock it for writing. But instead of
        # writing directly to it, we replace it with a new file. This
        # guarantees that we cannot crash while writing a partial size, but
        # always atomically update the content.
        with self._atomic_open(self._filename_cache_size, write=True, wait=True) as fd:
            with os.fdopen(fd, "r", closefd=False, encoding="utf8") as f:
                size = json.load(f)

            if size + diff < 0:
                size = 0
            elif (self._info_maximum_size < 0) or (size + diff <= self._info_maximum_size):
                size = size + diff
            else:
                return False

            with self._atomic_file(self._filename_cache_size, self._dirname_objects, replace=True) as f:
                json.dump(size, f)

        return True
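
    # An accounting sketch (values assumed): with a stored size of 90 and
    # `maximum-size` set to 100, `_update_cache_size(20)` returns `False`
    # and leaves the size untouched, while `_update_cache_size(-200)`
    # succeeds and caps the stored size at 0.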

    def _rm_r_object(self, rpath_dir: str):
        """Remove object

        Recursively remove all traces of a stored object. This either
        requires the caller to hold a write-lock on the entry, or otherwise
        guarantee that no cache lookups can acquire the entry concurrently.

        This carefully deletes any traces of the entry, making sure to first
        mark the object as invalid, and dropping the lock-file last. This
        can safely be called on partially constructed or non-existing
        entries.

        Parameters:
        -----------
        rpath_dir
            Relative path from the cache-root to the object directory.
        """

        path_dir = self._path(rpath_dir)
        path_info = os.path.join(path_dir, self._filename_object_info)
        path_lock = os.path.join(path_dir, self._filename_object_lock)

        # Optimization: Bail out early if the entry is non-existent.
        if not os.path.lexists(path_dir):
            return

        # First step, we unlink the info-file. This will mark the entry as
        # volatile and thus it will get cleaned up by cache management in
        # case we crash while deleting it. Furthermore, no cache lookups
        # will ever consider the entry again if the info-file is missing.
        with ctx.suppress_oserror(errno.ENOENT, errno.ENOTDIR):
            os.unlink(path_info)

        # Now iterate the directory and drop everything _except_ the lock
        # file. This makes sure no parallel operation will needlessly race
        # with us. In case no lock is acquired, we still allow for parallel
        # racing cleanups.
        #
        # Note that racing cleanups might delete the entire directory at any
        # time during this iteration. Furthermore, `scandir()` is not atomic
        # but repeatedly calls into the kernel. Hence, we carefully bail out
        # once it reports a non-existent directory.
        with ctx.suppress_oserror(errno.ENOENT, errno.ENOTDIR):
            for entry in os.scandir(path_dir):
                if entry.name == self._filename_object_lock:
                    continue
                with ctx.suppress_oserror(errno.ENOENT, errno.ENOTDIR):
                    if entry.is_dir():
                        rmrf.rmtree(entry.path)
                    else:
                        os.unlink(entry.path)

        # With everything gone, we unlink the lock-file and eventually
        # delete the directory. Again, cleanup routines might have raced us,
        # so avoid failing in case the entries are already gone.
        with ctx.suppress_oserror(errno.ENOENT, errno.ENOTDIR):
            os.unlink(path_lock)
        with ctx.suppress_oserror(errno.ENOENT, errno.ENOTDIR):
            os.rmdir(path_dir)

    @contextlib.contextmanager
    def stage(self):
        """Create staging entry

        Create a new entry in the staging area and yield control to the
        caller with the relative path to the entry. Once control returns,
        the staging entry is completely discarded.

        If the application crashes while holding a staging entry, it will be
        left behind in the staging directory, but unlocked and marked as
        stale. Hence, any cache management routine will discard it.
        """

        # We check for an active context, but we never check for
        # version-compatibility, because there is no way we can run without
        # a staging area. Hence, the staging-area has to be backwards
        # compatible at all times.
        assert self._is_active()

        uuidname = None
        lockfd = None

        try:
            # Create and lock a new anonymous object in the staging area.
            uuidname, lockfd = self._atomic_dir(self._dirname_stage)

            rpath_data = os.path.join(
                self._dirname_stage,
                uuidname,
                self._dirname_data,
            )

            # Prepare an empty data directory and yield it to the caller.
            os.mkdir(self._path(rpath_data))
            yield rpath_data
        finally:
            if lockfd is not None:
                self._rm_r_object(os.path.join(self._dirname_stage, uuidname))
                linux.fcntl_flock(lockfd, linux.fcntl.F_UNLCK)
                os.close(lockfd)
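
    # A staging sketch (the yielded path is relative to the cache root,
    # which `FsCache` exposes via `os.PathLike`):
    #
    #     with cache.stage() as rpath:
    #         with open(os.path.join(cache, rpath, "scratch.txt"), "w") as f:
    #             f.write("temporary data")
    #     # the staging entry is discarded once the context exits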

    @contextlib.contextmanager
    def store(self, name: str):
        """Store object in cache

        Create a new entry and store it in the cache with the specified
        name. The entry is first created with an anonymous name and control
        is yielded to the caller to fill in data. Once control returns, the
        entry is committed with the specified name.

        The final commit is skipped if an entry with the given name already
        exists, or its name is claimed for other reasons. Furthermore, the
        commit is skipped if cache limits are exceeded, or if cache
        maintenance refuses the commit. Hence, a commit can never be relied
        upon and the entry might be deleted from the cache as soon as the
        commit was invoked.

        Parameters:
        -----------
        name
            Name to store the object under.
        """

        assert self._is_active()
        assert self._bootid is not None

        if not name:
            raise ValueError()

        # If the cache-version is incompatible with this implementation, we
        # divert this call into the staging-area (which is always
        # compatible). This avoids raising an exception (at the cost of
        # dealing with this in the caller), and instead just creates a
        # temporary copy which we discard.
        if not self._is_compatible():
            with self.stage() as p:
                yield p
            return

        uuidname = None
        lockfd = None

        try:
            # Create and lock a new anonymous object in the object store.
            uuidname, lockfd = self._atomic_dir(self._dirname_objects)

            rpath_uuid = os.path.join(
                self._dirname_objects,
                uuidname,
            )
            rpath_data = os.path.join(
                rpath_uuid,
                self._dirname_data,
            )
            rpath_info = os.path.join(
                rpath_uuid,
                self._filename_object_info,
            )
            path_uuid = self._path(rpath_uuid)
            path_data = self._path(rpath_data)
            path_info = self._path(rpath_info)

            # Prepare an empty data directory and yield it to the caller.
            os.mkdir(path_data)
            yield rpath_data

            # Collect metadata about the new entry.
            info: Dict[str, Any] = {}
            info["creation-boot-id"] = self._bootid
            info["size"] = self._calculate_space(path_data)

            # Exit early if it is never going to fit.
            if self._info_maximum_size > -1 and info["size"] > self._info_maximum_size:
                return

            # Update the total cache-size. If it exceeds the limits, remove
            # least recently used objects until there is enough space.
            #
            # Note that if we crash after updating the total cache size, but
            # before committing the object information, the total cache size
            # will be out of sync.
            #
            # However, it is never overcommitted, so we will never violate
            # any cache invariants. Future code needs to resync the cache
            # (e.g. on open with some simple journal strategy).
            if not self._update_cache_size(info["size"]):
                # try to free space
                self._remove_lru(info["size"])
                # and see if the update can happen now
                if not self._update_cache_size(info["size"]):
                    # still could not free enough space
                    return

            try:
                # Commit the object-information, thus marking it as fully
                # committed and accounted in the cache.
                with open(path_info, "x", encoding="utf8") as f:
                    json.dump(info, f)

                # As last step move the entry to the desired location. If
                # the target name is already taken, we bail out and pretend
                # the entry was immediately overwritten by another one.
                #
                # Preferably, we would use RENAME_NOREPLACE, but this is not
                # available on all file-systems. Hence, we rely on the fact
                # that non-empty directories cannot be replaced, so we
                # automatically get RENAME_NOREPLACE behavior.
                path_name = self._path(self._dirname_objects, name)
                try:
                    os.rename(
                        src=path_uuid,
                        dst=path_name,
                    )
                except OSError as e:
                    ignore = [errno.EEXIST, errno.ENOTDIR, errno.ENOTEMPTY]
                    if e.errno not in ignore:
                        raise

                uuidname = None
            finally:
                # If the anonymous entry still exists, it will be cleaned up
                # by the outer handler. Hence, make sure to drop the info
                # file again and de-account it, so we don't overcommit.
                if os.path.lexists(path_uuid):
                    with ctx.suppress_oserror(errno.ENOENT, errno.ENOTDIR):
                        os.unlink(path_info)
                    self._update_cache_size(-info["size"])
        finally:
            if lockfd is not None:
                if uuidname is not None:
                    # In case this runs after the object was renamed, but
                    # before `uuidname` was cleared, then `_rm_r_object()`
                    # will be a no-op.
                    self._rm_r_object(os.path.join(self._dirname_objects, uuidname))
                linux.fcntl_flock(lockfd, linux.fcntl.F_UNLCK)
                os.close(lockfd)

    @contextlib.contextmanager
    def load(self, name: str):
        """Load a cache entry

        Find the cache entry with the given name, acquire a read-lock and
        yield its path back to the caller. Once control returns, the entry
        is released.

        The returned path is the relative path between the cache and the top
        level directory of the cache entry.

        Parameters:
        -----------
        name
            Name of the cache entry to find.
        """

        assert self._is_active()

        if not name:
            raise ValueError()
        if not self._is_compatible():
            raise self.MissError()

        with contextlib.ExitStack() as es:
            # Use an ExitStack so we can catch exceptions raised by the
            # `__enter__()` call on the context-manager. We want to catch
            # `OSError` exceptions and convert them to cache-misses.
            obj_lock_path = os.path.join(
                self._dirname_objects, name, self._filename_object_lock)
            try:
                lock_fd = es.enter_context(
                    self._atomic_open(
                        obj_lock_path,
                        write=False,
                        wait=False,
                    )
                )
            except OSError as e:
                if e.errno in [errno.EAGAIN, errno.ENOENT, errno.ENOTDIR]:
                    raise self.MissError() from None
                raise e

            # Bump the access-time of the lock-file to the current time, but
            # leave the modification-time untouched. The access-time serves
            # as the "last-used" timestamp for the LRU accounting (see
            # `_last_used()`).
            libc = linux.Libc.default()
            libc.futimens(lock_fd, ctypes.byref(linux.c_timespec_times2(
                atime=linux.c_timespec(tv_sec=0, tv_nsec=libc.UTIME_NOW),
                mtime=linux.c_timespec(tv_sec=0, tv_nsec=libc.UTIME_OMIT),
            )))

            yield os.path.join(
                self._dirname_objects,
                name,
                self._dirname_data,
            )
    def _last_used(self, name: str) -> float:
        """Return the time the given object was last used.

        Note that the resolution is only as good as what the filesystem
        "atime" gives us.
        """
        obj_lock_path = os.path.join(
            self._dirname_objects, name, self._filename_object_lock)
        try:
            return os.stat(self._path(obj_lock_path)).st_atime
        except OSError as e:
            if e.errno in [errno.EAGAIN, errno.ENOENT, errno.ENOTDIR]:
                raise self.MissError() from None
            raise e

    def _last_used_objs(self) -> List[FsCacheObjectInfo]:
        """Return a list of FsCacheObjectInfo with name and last_used
        information, sorted by last_used time.

        Note that this function will be racy when used without a lock and
        the caller needs to handle this.
        """
        objs = []
        for name in os.listdir(self._path(self._dirname_objects)):
            try:
                last_used = self._last_used(name)
            except (OSError, FsCache.MissError):
                continue
            objs.append(FsCacheObjectInfo(name=name, last_used=last_used))
        return sorted(objs, key=lambda obj: obj.last_used)
    def _remove_lru(self, required_size: int) -> bool:
        """Make room in the cache

        Make room in the cache for `required_size` bytes by removing the
        least recently used entries from the cache. Note that the cache may
        clear more than `required_size`.

        Returns `True` if at least the required size got freed.
        """
        # To avoid having to take a global cache lock the strategy is:
        # 1. Get list of (object, last_used) sorted from oldest to newest.
        #    This is racy so we need to take care of that in step (2).
        # 2. Start with the oldest entry, try to take a write_lock
        #    (with O_NOATIME to be extra sure that atime information is
        #    correct). Get the "last_used" (atime) time and compare to what
        #    we expect in the list. If it diverges the object got load()ed
        #    while we iterated. Skip it and go to (2) again.
        # 3. Remove entry, update cache size after the entry is removed.
        #
        # Note that there is a risk to get out-of-sync in (3). If the
        # process dies while removing and before updating the cache size,
        # the cache size will be over-reported.

        # Try to clean at least twice the requested size to avoid having
        # to do this all over again. The factor is currently hard-coded.
        try_to_free = required_size * 2
        freed_so_far = 0
        for name, last_used in self._last_used_objs():
            # take a write-lock for the individual object
            rpath = os.path.join(self._dirname_objects, name)
            rpath_lock = os.path.join(rpath, self._filename_object_lock)
            # Ideally there would be some lock helper instead of the
            # low-level file manipulation to abstract this a bit more.
            try:
                with self._atomic_open(
                        rpath_lock,
                        wait=False,
                        write=True,
                        # atime carries the "last-used" data so don't alter it
                        oflags=os.O_EXCL | os.O_NOATIME,
                ):
                    if last_used != self._last_used(name):
                        continue
                    # This is racy right now if the process is killed
                    # during "_rm_r_object(rpath)" because then the
                    # cache size is never reduced by the amount that
                    # was about to be deleted.
                    #
                    # To fix it we need to (atomically) rename the
                    # "object.info" file in _rm_r_object() to
                    # something like "object.removing". Then when
                    # opening the cache scan for leftover
                    # "object.removing" files and finish the cleanup
                    # and update the cache size based on the size
                    # recorded inside "object.removing".
                    size = self._calculate_space(self._path(rpath))
                    self._rm_r_object(rpath)
                    self._update_cache_size(-size)
                    freed_so_far += size
                    if freed_so_far >= try_to_free:
                        break
            except BlockingIOError:
                continue

        # return True if at least the required size got freed
        return freed_so_far >= required_size

    @property
    def info(self) -> FsCacheInfo:
        """Query Cache Information

        Return the parsed cache information which is currently cached on
        this cache-instance. The cache information has all unknown fields
        stripped.

        Unset values are represented by `None`, and the cache will interpret
        it as the default value for the respective field.
        """

        assert self._is_active()

        return self._info

    @info.setter
    def info(self, info: FsCacheInfo):
        """Write Cache Information

        Update and write the cache-information onto the file-system. This
        first locks the cache-information file, reads it in, updates the
        newly read information with the data from `info`, writes the result
        back to disk and finally unlocks the file.

        There are a few caveats to take into account:

        * The locking guarantees that simultaneous updates will be properly
          ordered and never discard any information.
        * Since this reads in the newest cache-information, this function
          can update cache-information values other than the ones from
          `info`. Any value unset in `info` will be re-read from disk and
          thus might change (in the future, if required, this can be
          adjusted to allow a caller to hook into the operation while the
          lock is held).
        * You cannot strip known values from the cache-information. Any
          value not present in `info` is left unchanged. You must explicitly
          set a value to its default to reset it.
        * Cache-information fields that are not known to this implementation
          are never exposed to the caller, but are left unchanged on-disk.
          This guarantees that future extensions are left alone and are not
          accidentally stripped.

        The cached information of this instance is updated to reflect the
        changes.

        Parameters:
        -----------
        info
            Cache information object to consume and write.
        """

        assert self._is_active()

        with self._atomic_open(self._filename_cache_info, write=True, wait=True) as fd:
            with os.fdopen(fd, "r", closefd=False, encoding="utf8") as f:
                info_raw = json.load(f)

            # If the on-disk data is in an unexpected format, we never touch
            # it. If it is a JSON-object, we update it with the new values
            # and then re-parse it into a full `FsCacheInfo` with all known
            # fields populated.
            if isinstance(info_raw, dict):
                info_raw.update(info.to_json())
                info = FsCacheInfo.from_json(info_raw)

                # Replace the file with the new values. This releases the
                # lock.
                if self._is_compatible():
                    with self._atomic_file(self._filename_cache_info, self._dirname_objects, replace=True) as f:
                        json.dump(info_raw, f)

        self._load_cache_info(info)

    def store_tree(self, name: str, tree: Any):
        """Store file system tree in cache

        Create a new entry in the object store containing a copy of the file
        system tree specified as `tree`. This behaves like `store()` but
        instead of providing a context to the caller it will copy the
        specified tree.

        Similar to `store()`, when the entry is committed it is immediately
        unlocked and released to the cache. This means it might vanish at
        any moment due to a parallel cleanup. Hence, a caller cannot rely on
        the object being available in the cache once this call returns.

        If `tree` points to a file, the file is copied. If it points to a
        directory, the entire directory tree is copied including the root
        entry itself. To copy an entire directory without its root entry,
        use the `path/.` notation. Links are never followed but copied
        verbatim. All metadata is preserved, if possible.

        Parameters:
        -----------
        name
            Name to store the object under.
        tree
            Path to the file system tree to copy.
        """

        with self.store(name) as rpath_data:
            r = subprocess.run(
                [
                    "cp",
                    "--reflink=auto",
                    "-a",
                    "--",
                    os.fspath(tree),
                    self._path(rpath_data),
                ],
                check=False,
                encoding="utf-8",
                stderr=subprocess.STDOUT,
                stdout=subprocess.PIPE,
            )
            if r.returncode != 0:
                code = r.returncode
                msg = r.stdout.strip()
                raise RuntimeError(f"Cannot copy into file-system cache ({code}): {msg}")