import errno
import json
import os


def _entry_metadata(dirent, dir_path):
    """Collect the security-relevant metadata of one directory entry.

    Parameters
    ----------
    dirent : os.DirEntry
        the entry to describe, as yielded by ``os.scandir(dir_path)``
    dir_path : str
        the ``/proc/self/fd/<n>`` path of the directory that contains `dirent`

    Returns
    -------
    dict
        name, mode, uid, gid and size of the entry itself (symlinks are not
        followed), plus its SELinux label under the key "selinux" when the
        entry has one.

    Raises
    ------
    OSError
        if the entry cannot be stat'ed, or reading the SELinux label fails
        for any reason other than the label being absent or unsupported.
    """
    stat_result = dirent.stat(follow_symlinks=False)
    metadata = {
        "name": os.fsdecode(dirent.name),
        "mode": stat_result.st_mode,
        "uid": stat_result.st_uid,
        "gid": stat_result.st_gid,
        # include the size of symlink target/file-contents so we don't have to delimit it
        "size": stat_result.st_size,
    }

    # getxattr cannot operate on a dir_fd, so do a trick and rely on the entries in /proc
    stable_file_path = os.path.join(dir_path, dirent.name)
    try:
        selinux_label = os.getxattr(stable_file_path, b"security.selinux", follow_symlinks=False)
    except OSError as e:
        # SELinux support is optional: ENODATA means the entry carries no
        # label, ENOTSUP means the filesystem has no xattr support at all.
        if e.errno not in (errno.ENODATA, errno.ENOTSUP):
            raise
    else:
        metadata["selinux"] = os.fsdecode(selinux_label)

    return metadata


def _hash_file_contents(m, fd):
    """Feed everything readable from file descriptor `fd` into hash object `m`."""
    # hash a page at a time so large files are never held in memory
    while True:
        byte_block = os.read(fd, 4096)
        if not byte_block:
            break
        m.update(byte_block)


def treesum(m, dir_fd):
    """Compute a content hash of a filesystem tree

    Parameters
    ----------
    m : hash object
        the hash object to append the treesum to
    dir_fd : int
        directory file descriptor number to operate on

    Raises
    ------
    ValueError
        if the tree contains anything other than directories, regular files
        and symlinks (e.g. a FIFO, socket or device node). Such entries are
        never opened.
    OSError
        if an entry cannot be inspected, opened or read.

    The hash is stable between runs, and guarantees that two filesystem trees
    with the same hash, are functionally equivalent from the OS point of view.

    The file, symlink and directory names and contents are recursively hashed,
    together with security-relevant metadata."""

    dir_path = f"/proc/self/fd/{dir_fd}"
    with os.scandir(dir_path) as it:
        # sort by name so the result does not depend on directory order
        for dirent in sorted(it, key=lambda d: d.name):
            # hash the JSON representation of the metadata to stay unique/stable/well-defined
            metadata = _entry_metadata(dirent, dir_path)
            m.update(json.dumps(metadata, sort_keys=True).encode())

            if dirent.is_symlink():
                # Hash the raw bytes of the target: fsencode() round-trips
                # targets that are not valid UTF-8, where a strict
                # str.encode() would raise UnicodeEncodeError.
                m.update(os.fsencode(os.readlink(dirent.name, dir_fd=dir_fd)))
                continue

            # Decide what the entry is *before* opening it: opening a FIFO
            # read-only blocks until a writer shows up, and opening a device
            # node can have side effects.
            is_dir = dirent.is_dir(follow_symlinks=False)
            if not is_dir and not dirent.is_file(follow_symlinks=False):
                raise ValueError("Found unexpected filetype on OS image")

            fd = os.open(dirent.name, flags=os.O_RDONLY, dir_fd=dir_fd)
            try:
                if is_dir:
                    treesum(m, fd)
                else:
                    _hash_file_contents(m, fd)
            finally:
                os.close(fd)