Diffstat (limited to 'scripts/sstate-cache-management.py')
-rwxr-xr-x  scripts/sstate-cache-management.py  329
1 file changed, 329 insertions, 0 deletions
diff --git a/scripts/sstate-cache-management.py b/scripts/sstate-cache-management.py
new file mode 100755
index 0000000000..d3f600bd28
--- /dev/null
+++ b/scripts/sstate-cache-management.py
@@ -0,0 +1,329 @@
+#!/usr/bin/env python3
+#
+# Copyright OpenEmbedded Contributors
+#
+# SPDX-License-Identifier: MIT
+#
+
+import argparse
+import os
+import re
+import sys
+
+from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor
+from dataclasses import dataclass
+from pathlib import Path
+
+if sys.version_info < (3, 8, 0):
+    raise RuntimeError("Sorry, python 3.8.0 or later is required for this script.")
+
+SSTATE_PREFIX = "sstate:"
+SSTATE_EXTENSION = ".tar.zst"
+# SSTATE_EXTENSION = ".tgz"
+# NOTE: the original script also mentions .siginfo.done files; it's unclear
+# whether those still need handling.
+SSTATE_SUFFIXES = (
+    SSTATE_EXTENSION,
+    f"{SSTATE_EXTENSION}.siginfo",
+    f"{SSTATE_EXTENSION}.done",
+)
+
+RE_SSTATE_PKGSPEC = re.compile(
+ rf"""sstate:(?P<pn>[^:]*):
+ (?P<package_target>[^:]*):
+ (?P<pv>[^:]*):
+ (?P<pr>[^:]*):
+ (?P<sstate_pkgarch>[^:]*):
+ (?P<sstate_version>[^_]*):
+ (?P<bb_unihash>[^_]*)_
+ (?P<bb_task>[^:]*)
+ (?P<ext>({"|".join([re.escape(s) for s in SSTATE_SUFFIXES])}))$""",
+ re.X,
+)
+
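+# For illustration, a hypothetical (shortened) cache entry filename this
+# pattern matches:
+#
+#   sstate:zlib:core2-64-poky-linux:1.3:r0:core2-64:10:0123abcd_package.tar.zst
+#
+# parses as pn=zlib, package_target=core2-64-poky-linux, pv=1.3, pr=r0,
+# sstate_pkgarch=core2-64, sstate_version=10, bb_unihash=0123abcd (a real
+# unihash is much longer), bb_task=package, ext=.tar.zst.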
+
+# Really we'd like something like a Path subclass which implements a stat
+# cache here, unfortunately there's no good way to do that transparently
+# (yet); see:
+#
+# https://github.com/python/cpython/issues/70219
+# https://discuss.python.org/t/make-pathlib-extensible/3428/77
+@dataclass
+class SstateEntry:
+ """Class for keeping track of an entry in sstate-cache."""
+
+ path: Path
+ match: re.Match
+ stat_result: os.stat_result = None
+
+ def __hash__(self):
+ return self.path.__hash__()
+
+ def __getattr__(self, name):
+ return self.match.group(name)
+
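+# Given a match for the hypothetical filename above, attribute access on an
+# SstateEntry is delegated to the named regex groups by __getattr__, e.g.
+# entry.pn == "zlib" and entry.bb_task == "package"; entry.path and
+# entry.stat_result are regular dataclass fields.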
+
+# this is what's in the original script; as far as I can tell, it's an
+# implementation artefact which we don't need?
+def find_archs():
+    # all_archs
+    builder_arch = os.uname().machine
+
+    # FIXME
+    layer_paths = [Path("../..")]
+
+    tune_archs = set()
+    re_tune = re.compile(r'AVAILTUNES .*=.*"(.*)"')
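+    # Matches tune definitions such as (illustrative):
+    #   AVAILTUNES += "core2-64 corei7-64"
+    # collecting "core2-64" and "corei7-64" into tune_archs below.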
+    for path in layer_paths:
+        for tunefile in [
+            p for p in path.glob("meta*/conf/machine/include/**/*") if p.is_file()
+        ]:
+            with open(tunefile) as f:
+                for line in f:
+                    m = re_tune.match(line)
+                    if m:
+                        tune_archs.update(m.group(1).split())
+
+    # all_machines
+    machine_archs = set()
+    for path in layer_paths:
+        for machine_file in path.glob("meta*/conf/machine/*.conf"):
+            machine_archs.add(machine_file.parts[-1][:-5])
+
+    extra_archs = set()
+    all_archs = (
+        set(
+            arch.replace("-", "_")
+            for arch in machine_archs | tune_archs | set(["allarch", builder_arch])
+        )
+        | extra_archs
+    )
+
+    print(all_archs)
+
+
+# again, not needed?
+def find_tasks(paths):
+    print(set([p.bb_task for p in paths]))
+
+
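+# Cache entries typically live in hash-prefixed subdirectories (e.g.
+# sstate-cache/ab/cd/sstate:...), so the scan below recurses rather than
+# assuming a flat layout.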
+def collect_sstate_paths(args):
+    def scandir(path, paths):
+        # Assume everything is a directory; by not checking we avoid needing an
+        # additional stat which is potentially a synchronous roundtrip over NFS
+        try:
+            for p in path.iterdir():
+                filename = p.parts[-1]
+                if filename.startswith(SSTATE_PREFIX):
+                    if filename.endswith(SSTATE_SUFFIXES):
+                        m = RE_SSTATE_PKGSPEC.match(p.parts[-1])
+                        assert m
+                        paths.add(SstateEntry(p, m))
+                    # ignore other things (includes things like lockfiles)
+                else:
+                    scandir(p, paths)
+
+        except NotADirectoryError:
+            pass
+
+    paths = set()
+    # TODO: parallelise scandir
+    scandir(Path(args.cache_dir), paths)
+
+    def path_stat(p):
+        p.stat_result = p.path.lstat()
+
+    if args.remove_duplicated:
+        # This is probably slightly performance-negative on a local filesystem
+        # when we interact with the GIL; over NFS it's a massive win.
+        with ThreadPoolExecutor(max_workers=args.jobs) as executor:
+            executor.map(path_stat, paths)
+
+    return paths
+
+
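+# The stamp globs below expect paths like (illustrative):
+#   <stamps-dir>/<arch>/<pn>/<pv>-<pr>.do_compile.sigdata.<unihash>
+# the trailing unihash is collected into all_sums, and any cache entry whose
+# bb_unihash is not in that set is returned for removal.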
+def remove_by_stamps(args, paths):
+    all_sums = set()
+    for stamps_dir in args.stamps_dir:
+        stamps_path = Path(stamps_dir)
+        assert stamps_path.is_dir()
+        re_sigdata = re.compile(r"do_.*\.sigdata\.([^.]*)")
+        all_sums |= set(
+            [
+                re_sigdata.search(x.parts[-1]).group(1)
+                for x in stamps_path.glob("*/*/*.do_*.sigdata.*")
+            ]
+        )
+        re_setscene = re.compile(r"do_.*_setscene\.([^.]*)")
+        all_sums |= set(
+            [
+                re_setscene.search(x.parts[-1]).group(1)
+                for x in stamps_path.glob("*/*/*.do_*_setscene.*")
+            ]
+        )
+    return [p for p in paths if p.bb_unihash not in all_sums]
+
+
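+# Two entries whose (pn, sstate_pkgarch, bb_task, ext) all match are
+# duplicates: the one with the newer mtime stays in keep, the older one is
+# returned for removal.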
+def remove_duplicated(args, paths):
+    # Skip populate_lic as it produces duplicates in a normal build
+    #
+    # 9ae16469e707 sstate-cache-management: skip populate_lic archives when removing duplicates
+    valid_paths = [p for p in paths if p.bb_task != "populate_lic"]
+
+    keep = dict()
+    remove = list()
+    for p in valid_paths:
+        sstate_sig = ":".join([p.pn, p.sstate_pkgarch, p.bb_task, p.ext])
+        if sstate_sig not in keep:
+            keep[sstate_sig] = p
+        elif p.stat_result.st_mtime > keep[sstate_sig].stat_result.st_mtime:
+            remove.append(keep[sstate_sig])
+            keep[sstate_sig] = p
+        else:
+            remove.append(p)
+
+    return remove
+
+
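+# E.g. a lone .tar.zst.siginfo with no matching .tar.zst archive in its
+# (pn, sstate_pkgarch, bb_task) group is an orphan and is returned for
+# removal, along with any other tracking files in that group.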
+def remove_orphans(args, paths):
+    remove = list()
+    pathsigs = defaultdict(list)
+    for p in paths:
+        sstate_sig = ":".join([p.pn, p.sstate_pkgarch, p.bb_task])
+        pathsigs[sstate_sig].append(p)
+    for k, v in pathsigs.items():
+        if len([p for p in v if p.ext == SSTATE_EXTENSION]) == 0:
+            remove.extend(v)
+    return remove
+
+
+def parse_arguments():
+    parser = argparse.ArgumentParser(description="sstate cache management utility.")
+
+    parser.add_argument(
+        "--cache-dir",
+        default=os.environ.get("SSTATE_CACHE_DIR"),
+        help="""Specify the sstate cache directory; defaults to the
+        SSTATE_CACHE_DIR environment variable if not specified.""",
+    )
+
+    # parser.add_argument(
+    #     "--extra-archs",
+    #     help="""Specify a list of architectures which should be tested; this
+    #     list will be extended with the native arch, allarch and the empty
+    #     arch. The script won't try to generate the list of available archs
+    #     from AVAILTUNES in tune files.""",
+    # )
+
+    # parser.add_argument(
+    #     "--extra-layer",
+    #     help="""Specify the layer which will be used for searching the archs;
+    #     by default the meta and meta-* layers in the top dir are searched,
+    #     and meta, meta-*, <layer1>, <layer2>, ...<layern> when specified.
+    #     Use "," as the separator.
+    #
+    #     This is useless for --stamps-dir or when --extra-archs is used.""",
+    # )
+
+    parser.add_argument(
+        "-d",
+        "--remove-duplicated",
+        action="store_true",
+        help="""Remove duplicated sstate cache files of one package; only the
+        newest one will be kept. Duplicated sstate cache files of one package
+        must have the same arch, which means sstate cache files with
+        different archs are not considered duplicates.
+
+        Conflicts with --stamps-dir.""",
+    )
+
+    parser.add_argument(
+        "--remove-orphans",
+        action="store_true",
+        help=f"""Remove orphan siginfo files from the sstate cache, i.e. those
+        where there is no {SSTATE_EXTENSION} file but there are associated
+        tracking files.""",
+    )
+
+    parser.add_argument(
+        "--stamps-dir",
+        action="append",
+        help="""Specify the build directory's stamps directories; sstate cache
+        files which ARE USED by these build directories will be KEPT, and
+        other sstate cache files in cache-dir will be removed. Can be
+        specified multiple times for several directories.
+
+        Conflicts with --remove-duplicated.""",
+    )
+
+    parser.add_argument(
+        "-j", "--jobs", default=8, type=int, help="Run JOBS jobs in parallel."
+    )
+
+    # parser.add_argument(
+    #     "-L",
+    #     "--follow-symlink",
+    #     action="store_true",
+    #     help="Remove both the symbolic link and the destination file, default: no.",
+    # )
+
+    parser.add_argument(
+        "-y",
+        "--yes",
+        action="store_true",
+        help="""Automatic yes to prompts; assume "yes" as answer to all prompts
+        and run non-interactively.""",
+    )
+
+    parser.add_argument(
+        "-v", "--verbose", action="store_true", help="Explain what is being done."
+    )
+
+    parser.add_argument(
+        "-D",
+        "--debug",
+        action="count",
+        default=0,
+        help="Show debug info, repeat for more debug info.",
+    )
+
+    args = parser.parse_args()
+    if args.cache_dir is None or (
+        not args.remove_duplicated and not args.stamps_dir and not args.remove_orphans
+    ):
+        parser.print_usage()
+        sys.exit(1)
+
+    return args
+
+
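+# Typical invocations (illustrative paths):
+#   sstate-cache-management.py --cache-dir=sstate-cache --remove-duplicated -y
+#   sstate-cache-management.py --cache-dir=sstate-cache --stamps-dir=tmp/stamps
+#   sstate-cache-management.py --cache-dir=sstate-cache --remove-orphans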
+def main():
+    args = parse_arguments()
+
+    paths = collect_sstate_paths(args)
+    if args.remove_duplicated:
+        remove = remove_duplicated(args, paths)
+    elif args.stamps_dir:
+        remove = remove_by_stamps(args, paths)
+    else:
+        remove = list()
+
+    if args.remove_orphans:
+        remove = set(remove) | set(remove_orphans(args, paths))
+
+    if args.debug >= 1:
+        print("\n".join([str(p.path) for p in remove]))
+    print(f"{len(remove)} out of {len(paths)} files will be removed!")
+    if not args.yes:
+        print("Do you want to continue (y/n)?")
+        confirm = input() in ("y", "Y")
+    else:
+        confirm = True
+    if confirm:
+        # TODO: parallelise remove
+        for p in remove:
+            p.path.unlink()
+
+
+if __name__ == "__main__":
+    main()