dnfjson: size-based cache management

Functions for managing the repository cache based on a maximum
desirable size for the entire dnf-json cache directory.
While none of the functions are currently used, the workflow should
be as follows:
- Update the timestamp of a repository whenever it's used in a
  transaction by calling `touchRepo()` with the repository ID and the
  current time.
- Update the internal cache information when desired by calling
  `updateInfo()`.  This should be called for example after multiple
  depsolve transactions are run for a single build request.
- Shrink the cache to below the configured maxSize by calling
  `shrink()`.

The most important work happens in `updateInfo()`.  It collects all the
information it needs from the on-disk cache directories and organises it
in a way that makes it convenient for the `shrink()` function to run
efficiently.  It stores three important pieces of information:
1. repoElements: a map that links a repository ID with all the
   information about a repository's cache:
    - the top-level elements (files and directories) for the cache
    - size of the repository cache (total of all elements)
    - most recent mtime from all the elements which, if the
      `touchRepo()` call is consistently used, should reflect the most
      recent time the repository was used
2. repoRecency: a list of repository IDs sorted by mtime (oldest first)
3. size: the total size of the cache (total of all repository caches)

This way, when `shrink()` is called, the paths associated with the
least-recently-used repositories can be easily deleted by iterating on
repoRecency, obtaining the repository info from the map, deleting every
path in the repository's paths list, and subtracting the repository's size
from the total.  The `shrink()` function stops when the new size is
below the maxSize (or when all repositories have been deleted).
This commit is contained in:
Achilleas Koutsou 2022-05-30 18:32:44 +02:00 committed by Tom Gundersen
parent b8d16bc395
commit a7a1f1ac07

View file

@ -1,41 +1,55 @@
package dnfjson
import (
"fmt"
"io/fs"
"os"
"path/filepath"
"sort"
"time"
"github.com/gobwas/glob"
)
// A collection of directory paths, their total size, and their most recent
// modification time.
// One pathInfo describes the full cache footprint of a single repository;
// it is the value type of rpmCache.repoElements.
type pathInfo struct {
// top-level cache elements (files and directories) belonging to one repository
paths []string
// total size in bytes of all elements in paths
size uint64
// most recent modification time seen across all elements in paths
mtime time.Time
}
// rpmCache tracks the on-disk dnf-json cache: per-repository path/size/mtime
// records, a recency ordering of repository IDs, and the total cache size,
// so the cache can be shrunk below maxSize by deleting the caches of the
// least-recently-used repositories.
type rpmCache struct {
// root path for the cache
root string
// individual repository cache data
repoElements map[string]pathInfo
// list of known repository IDs, sorted by mtime
repoRecency []string
// total cache size
size uint64
// max cache size
maxSize uint64
}
// newRPMCache creates an rpmCache rooted at path with the given maximum size
// and populates its bookkeeping fields from the on-disk cache contents.
// NOTE(review): the body below is diff residue — the pre-change constructor
// body and the removed size() method are interleaved with the new
// implementation, so this region is not valid Go as rendered; reconcile
// against the upstream commit before building.
func newRPMCache(path string, maxSize uint64) *rpmCache {
return &rpmCache{
root: path,
maxSize: maxSize,
r := &rpmCache{
root: path,
repoElements: make(map[string]pathInfo),
size: 0,
maxSize: maxSize,
}
}
// NOTE(review): old size() method (removed by this commit in favor of the
// free function dirSize) follows, fused into the same region by the diff.
func (r *rpmCache) size() (uint64, error) {
var size uint64
sizer := func(path string, info fs.FileInfo, err error) error {
if err != nil {
return err
}
size += uint64(info.Size())
return nil
}
err := filepath.Walk(r.root, sizer)
return size, err
// collect existing cache paths and timestamps
r.updateInfo()
return r
}
// clean measures the current on-disk cache size and (in the elided portion)
// reacts to it; the middle of this function is hidden by the diff hunk
// marker below, so its full behavior cannot be confirmed from this view.
// NOTE(review): the two curSize lines are the old call (r.size()) and its
// replacement (dirSize(r.root)) shown together by the diff; only the
// dirSize form exists after this commit.
func (r *rpmCache) clean() error {
curSize, err := r.size()
curSize, err := dirSize(r.root)
if err != nil {
return err
}
@ -44,3 +58,137 @@ func (r *rpmCache) clean() error {
}
return nil
}
// updateInfo rebuilds the repoElements, repoRecency, and size fields of the
// rpmCache from the current on-disk contents of the cache root directory.
// It has no return value; entries that cannot be read or sized are skipped.
func (r *rpmCache) updateInfo() {
	// A missing or unreadable cache root simply yields no entries; the error
	// is deliberately ignored so a fresh (empty) cache is handled gracefully.
	cacheEntries, _ := os.ReadDir(r.root)

	// each repository has multiple cache entries (3 on average), so using the
	// number of cacheEntries to allocate the map and ID slice is a high upper
	// bound, but guarantees we won't need to grow and reallocate either.
	repos := make(map[string]pathInfo, len(cacheEntries))
	repoIDs := make([]string, 0, len(cacheEntries))
	var totalSize uint64

	// Collect the paths grouped by their repo ID.
	// We assume the first 64 characters of a file or directory name are the
	// repository ID because we use a sha256 sum of the repository config to
	// create the ID (64 hex chars).
	for _, entry := range cacheEntries {
		eInfo, err := entry.Info()
		if err != nil {
			// unreadable entry; skip it
			continue
		}
		fname := entry.Name()
		if len(fname) < 64 {
			// unknown file in cache; ignore
			continue
		}
		repoID := fname[:64]
		ePath := filepath.Join(r.root, fname)
		// calculate the entry size before registering anything for this ID
		size, err := dirSize(ePath)
		if err != nil {
			// entry we cannot size; skip it
			continue
		}
		repo, ok := repos[repoID]
		if !ok {
			// New repo ID.  Register it only after the dirSize check above
			// succeeded, so repoRecency never contains an ID that has no
			// corresponding repoElements entry (the previous version appended
			// before the error check, leaving dangling IDs on failure).
			repoIDs = append(repoIDs, repoID)
		}
		repo.size += size
		totalSize += size
		// add path
		repo.paths = append(repo.paths, ePath)
		// if for some reason the mtimes of the various entries of a single
		// repository are out of sync, use the most recent one
		if mtime := eInfo.ModTime(); repo.mtime.Before(mtime) {
			repo.mtime = mtime
		}
		// update the collection
		repos[repoID] = repo
	}

	// sort IDs by mtime (oldest first)
	sort.Slice(repoIDs, func(idx, jdx int) bool {
		return repos[repoIDs[idx]].mtime.Before(repos[repoIDs[jdx]].mtime)
	})

	r.size = totalSize
	r.repoElements = repos
	r.repoRecency = repoIDs
}
// shrink deletes the caches of the least-recently-used repositories until
// the total cache size drops below maxSize (or every repository is gone).
// It updates size, repoElements, and repoRecency as repositories are removed
// and returns the first filesystem error encountered, if any.
func (r *rpmCache) shrink() error {
	removed := 0
	for _, repoID := range r.repoRecency {
		// stop as soon as we are under the configured limit
		if r.size < r.maxSize {
			break
		}
		repo := r.repoElements[repoID]
		for _, element := range repo.paths {
			if err := os.RemoveAll(element); err != nil {
				return err
			}
		}
		r.size -= repo.size
		delete(r.repoElements, repoID)
		removed++
	}
	// drop the deleted repositories from the front of the recency list
	r.repoRecency = r.repoRecency[removed:]
	return nil
}
// Update file atime and mtime on the filesystem to time t for all files in the
// root of the cache that match the repo ID. This should be called whenever a
// repository is used.
// This function does not update the internal cache info. A call to
// updateInfo() should be made after touching one or more repositories.
func (r *rpmCache) touchRepo(repoID string, t time.Time) error {
	// match any top-level cache entry whose name starts with the repo ID
	matcher, err := glob.Compile(fmt.Sprintf("%s*", repoID))
	if err != nil {
		return err
	}
	// we only touch the top-level directories and files of the cache
	entries, err := os.ReadDir(r.root)
	if err != nil {
		return err
	}
	for _, entry := range entries {
		name := entry.Name()
		if !matcher.Match(name) {
			continue
		}
		if err := os.Chtimes(filepath.Join(r.root, name), t, t); err != nil {
			return err
		}
	}
	return nil
}
func dirSize(path string) (uint64, error) {
var size uint64
sizer := func(path string, info fs.FileInfo, err error) error {
if err != nil {
return err
}
size += uint64(info.Size())
return nil
}
err := filepath.Walk(path, sizer)
return size, err
}