From 0c8fc90cb2f0101f60a78953668b391e44d3cdca Mon Sep 17 00:00:00 2001
From: Zac Medico <zmedico@gentoo.org>
Date: Mon, 9 Jun 2008 13:55:06 +0000
Subject: Add CONTENTS indexing support for optimization of owner lookups.

The vardbapi cache maintains a hash table (inside vdb_metadata.pickle) that
serves to index package contents by mapping the basename of a file to a
list of possible packages that own it. This is used to optimize owner
lookups by narrowing the search down to a smaller number of packages. It
increases the size of vdb_metadata.pickle by approximately 30% and it's
used in the following cases:

* When an unexpected file collision occurs (whether or not
  collision-protect is enabled)

* `emerge <file>`

* `portageq owners <root> <files>`

svn path=/main/trunk/; revision=10609
---
 bin/portageq                 |  35 +++---
 pym/_emerge/__init__.py      |  15 +--
 pym/portage/dbapi/vartree.py | 293 +++++++++++++++++++++++++++++++++++--------
 3 files changed, 264 insertions(+), 79 deletions(-)

diff --git a/bin/portageq b/bin/portageq
index c3fe8b37d..fe5260bc0 100755
--- a/bin/portageq
+++ b/bin/portageq
@@ -188,26 +188,21 @@ def owners(argv):
 			return 2
 		files.append(f[len(root):])
 
-	found_owner = False
-	for cpv in vardb.cpv_all():
-		cat, pkg = catsplit(cpv)
-		mylink = dblink(cat, pkg, root, settings, vartree=vardb.vartree)
-		myfiles = []
-		for f in files:
-			if mylink.isowner(f, root):
-				myfiles.append(f)
-		if myfiles:
-			found_owner = True
-			sys.stdout.write("%s\n" % cpv)
-			for f in myfiles:
-				sys.stdout.write("\t%s\n" % \
-					os.path.join(root, f.lstrip(os.path.sep)))
-	sys.stdout.flush()
-	if not found_owner:
-		sys.stderr.write("None of the installed packages claim the file(s).\n")
-		sys.stderr.flush()
-		return 1
-	return 0
+	owners = vardb._owners.get_owners(files)
+
+	for pkg, owned_files in owners.iteritems():
+		cpv = pkg.mycpv
+		sys.stdout.write("%s\n" % cpv)
+		for f in sorted(owned_files):
+			sys.stdout.write("\t%s\n" % \
+				os.path.join(root, f.lstrip(os.path.sep)))
+	if owners:
+		sys.stdout.flush()
+		return 0
+
+	sys.stderr.write("None of the installed packages claim the file(s).\n")
+	sys.stderr.flush()
+	return 1
 
 owners.uses_root = True
 
diff --git a/pym/_emerge/__init__.py b/pym/_emerge/__init__.py
index 520a86104..68f976cbe 100644
--- a/pym/_emerge/__init__.py
+++ b/pym/_emerge/__init__.py
@@ -2560,6 +2560,7 @@ class depgraph(object):
 		myroot = self.target_root
 		dbs = self._filtered_trees[myroot]["dbs"]
 		vardb = self.trees[myroot]["vartree"].dbapi
+		real_vardb = self._trees_orig[myroot]["vartree"].dbapi
 		portdb = self.trees[myroot]["porttree"].dbapi
 		bindb = self.trees[myroot]["bintree"].dbapi
 		pkgsettings = self.pkgsettings[myroot]
@@ -2638,16 +2639,12 @@ class depgraph(object):
 						" $ROOT.\n") % x, noiselevel=-1)
 					return 0, []
 				relative_path = x[len(myroot):]
-				vartree = self._trees_orig[myroot]["vartree"]
 				owner_cpv = None
-				for cpv in vardb.cpv_all():
-					self.spinner.update()
-					cat, pf = portage.catsplit(cpv)
-					if portage.dblink(cat, pf, myroot,
-						pkgsettings, vartree=vartree).isowner(
-						relative_path, myroot):
-						owner_cpv = cpv
-						break
+				for pkg, relative_path in \
+					real_vardb._owners.iter_owners([relative_path]):
+					owner_cpv = pkg.mycpv
+					break
+
 				if owner_cpv is None:
 					portage.writemsg(("\n\n!!! '%s' is not claimed " + \
 						"by any package.\n") % x, noiselevel=-1)
diff --git a/pym/portage/dbapi/vartree.py b/pym/portage/dbapi/vartree.py
index f2593be62..fd28a6705 100644
--- a/pym/portage/dbapi/vartree.py
+++ b/pym/portage/dbapi/vartree.py
@@ -238,6 +238,9 @@ class vardbapi(dbapi):
 	_excluded_dirs = re.compile(r'^(\..*|-MERGING-.*|' + \
 		"|".join(_excluded_dirs) + r')$')
 
+	_aux_cache_version = "1"
+	_owners_cache_version = "1"
+
 	# Number of uncached packages to trigger cache update, since
 	# it's wasteful to update it for every vdb change.
 	_aux_cache_threshold = 5
@@ -275,8 +278,7 @@ class vardbapi(dbapi):
 			"EAPI", "HOMEPAGE", "IUSE", "KEYWORDS",
 			"LICENSE", "PDEPEND", "PROVIDE", "RDEPEND",
 			"repository", "RESTRICT" , "SLOT", "USE"])
-		self._aux_cache = None
-		self._aux_cache_version = "1"
+		self._aux_cache_obj = None
 		self._aux_cache_filename = os.path.join(self.root,
 			CACHE_PATH.lstrip(os.path.sep), "vdb_metadata.pickle")
 		self._counter_path = os.path.join(root,
@@ -290,6 +292,7 @@ class vardbapi(dbapi):
 			self.plib_registry = None
 
 		self.linkmap = LinkageMap(self)
+		self._owners = self._owners_db(self)
 
 	def getpath(self, mykey, filename=None):
 		rValue = os.path.join(self.root, VDB_PATH, mykey)
@@ -562,6 +565,7 @@ class vardbapi(dbapi):
 		if self._aux_cache is not None and \
 			len(self._aux_cache["modified"]) >= self._aux_cache_threshold and \
 			secpass >= 2:
+			self._owners.populate() # index any unindexed contents
 			valid_nodes = set(self.cpv_all())
 			for cpv in self._aux_cache["packages"].keys():
 				if cpv not in valid_nodes:
@@ -577,6 +581,54 @@ class vardbapi(dbapi):
 				pass
 			self._aux_cache["modified"] = set()
 
+	@property
+	def _aux_cache(self):
+		if self._aux_cache_obj is None:
+			self._aux_cache_init()
+		return self._aux_cache_obj
+
+	def _aux_cache_init(self):
+		try:
+			f = open(self._aux_cache_filename)
+			mypickle = cPickle.Unpickler(f)
+			mypickle.find_global = None
+			aux_cache = mypickle.load()
+			f.close()
+			del f
+		except (IOError, OSError, EOFError, cPickle.UnpicklingError), e:
+			if isinstance(e, cPickle.UnpicklingError):
+				writemsg("!!! Error loading '%s': %s\n" % \
+					(self._aux_cache_filename, str(e)), noiselevel=-1)
+			del e
+
+		if not aux_cache or \
+			not isinstance(aux_cache, dict) or \
+			aux_cache.get("version") != self._aux_cache_version or \
+			not aux_cache.get("packages"):
+			aux_cache = {"version": self._aux_cache_version}
+			aux_cache["packages"] = {}
+
+		owners = aux_cache.get("owners")
+		if owners is not None:
+			if not isinstance(owners, dict):
+				owners = None
+			elif "version" not in owners:
+				owners = None
+			elif owners["version"] != self._owners_cache_version:
+				owners = None
+			elif "base_names" not in owners:
+				owners = None
+
+		if owners is None:
+			owners = {
+				"base_names" : {},
+				"version" : self._owners_cache_version
+			}
+			aux_cache["owners"] = owners
+
+		aux_cache["modified"] = set()
+		self._aux_cache_obj = aux_cache
+
 	def aux_get(self, mycpv, wants):
 		"""This automatically caches selected keys that are frequently needed
 		by emerge for dependency calculations. The cached metadata is
@@ -601,26 +653,6 @@ class vardbapi(dbapi):
 		cache_these = set(self._aux_cache_keys)
 		cache_these.update(cache_these_wants)
 
-		if self._aux_cache is None:
-			try:
-				f = open(self._aux_cache_filename)
-				mypickle = cPickle.Unpickler(f)
-				mypickle.find_global = None
-				self._aux_cache = mypickle.load()
-				f.close()
-				del f
-			except (IOError, OSError, EOFError, cPickle.UnpicklingError), e:
-				if isinstance(e, cPickle.UnpicklingError):
-					writemsg("!!! Error loading '%s': %s\n" % \
-						(self._aux_cache_filename, str(e)), noiselevel=-1)
-				del e
-			if not self._aux_cache or \
-				not isinstance(self._aux_cache, dict) or \
-				self._aux_cache.get("version") != self._aux_cache_version or \
-				not self._aux_cache.get("packages"):
-				self._aux_cache = {"version": self._aux_cache_version}
-				self._aux_cache["packages"] = {}
-			self._aux_cache["modified"] = set()
 		mydir = self.getpath(mycpv)
 		mydir_stat = None
 		try:
@@ -801,6 +833,173 @@ class vardbapi(dbapi):
 			write_atomic(self._counter_path, str(counter))
 		return counter
 
+	def _dblink(self, cpv):
+		category, pf = catsplit(cpv)
+		return dblink(category, pf, self.root,
+			self.settings, vartree=self.vartree)
+
+	class _owners_cache(object):
+		"""
+		This class maintains a hash table that serves to index package
+		contents by mapping the basename of a file to a list of possible
+		packages that own it. This is used to optimize owner lookups
+		by narrowing the search down to a smaller number of packages.
+		"""
+		try:
+			from hashlib import md5 as _new_hash
+		except ImportError:
+			from md5 import new as _new_hash
+
+		_hash_bits = 16
+		_hex_chars = _hash_bits / 4
+
+		def __init__(self, vardb):
+			self._vardb = vardb
+
+		def add(self, cpv):
+			root_len = len(self._vardb.root)
+			contents = self._vardb._dblink(cpv).getcontents()
+			pkg_hash = self._hash_pkg(cpv)
+			if not contents:
+				# Empty path is a code used to represent empty contents.
+				self._add_path("", pkg_hash)
+			for x in contents:
+				relative_path = x[root_len:]
+				self._add_path(x, pkg_hash)
+			self._vardb._aux_cache["modified"].add(cpv)
+
+		def _add_path(self, path, pkg_hash):
+			"""
+			Empty path is a code that represents empty contents.
+			"""
+			if path:
+				name = os.path.basename(path.rstrip(os.path.sep))
+				if not name:
+					return
+			else:
+				name = path
+			name_hash = self._hash_str(name)
+			base_names = self._vardb._aux_cache["owners"]["base_names"]
+			pkgs = base_names.get(name_hash)
+			if pkgs is None:
+				pkgs = {}
+				base_names[name_hash] = pkgs
+			pkgs[pkg_hash] = None
+
+		def _hash_str(self, s):
+			h = self._new_hash()
+			h.update(s)
+			h = h.hexdigest()
+			h = h[-self._hex_chars:]
+			h = int(h, 16)
+			return h
+
+		def _hash_pkg(self, cpv):
+			counter, mtime = self._vardb.aux_get(
+				cpv, ["COUNTER", "_mtime_"])
+			try:
+				counter = int(counter)
+			except ValueError:
+				counter = 0
+			return (cpv, counter, mtime)
+
+	class _owners_db(object):
+
+		def __init__(self, vardb):
+			self._vardb = vardb
+
+		def populate(self):
+			self._populate()
+
+		def _populate(self):
+			owners_cache = vardbapi._owners_cache(self._vardb)
+			cached_hashes = set()
+			base_names = self._vardb._aux_cache["owners"]["base_names"]
+
+			# Take inventory of all cached package hashes.
+			for hash_values in base_names.itervalues():
+				cached_hashes.update(hash_values)
+
+			# Create sets of valid package hashes and uncached packages.
+			uncached_pkgs = set()
+			hash_pkg = owners_cache._hash_pkg
+			valid_pkg_hashes = set()
+			for cpv in self._vardb.cpv_all():
+				hash_value = hash_pkg(cpv)
+				valid_pkg_hashes.add(hash_value)
+				if hash_value not in cached_hashes:
+					uncached_pkgs.add(cpv)
+
+			# Cache any missing packages.
+			for cpv in uncached_pkgs:
+				owners_cache.add(cpv)
+
+			# Delete any stale cache entries.
+			stale_hashes = cached_hashes.difference(valid_pkg_hashes)
+			if stale_hashes:
+				for base_name_hash, bucket in base_names.items():
+					for hash_value in stale_hashes.intersection(bucket):
+						del bucket[hash_value]
+					if not bucket:
+						del base_names[base_name_hash]
+
+			return owners_cache
+
+		def get_owners(self, path_iter):
+			"""
+			@return the owners as a dblink -> set(files) mapping.
+			"""
+			owners = {}
+			for owner, f in self.iter_owners(path_iter):
+				owned_files = owners.get(owner)
+				if owned_files is None:
+					owned_files = set()
+					owners[owner] = owned_files
+				owned_files.add(f)
+			return owners
+
+		def iter_owners(self, path_iter):
+			"""
+			Iterate over tuples of (dblink, path). In order to avoid
+			consuming too many resources for too much time, resources
+			are only allocated for the duration of a given iter_owners()
+			call. Therefore, to maximize reuse of resources when searching
+			for multiple files, it's best to search for them all in a single
+			call.
+			"""
+
+			owners_cache = self._populate()
+
+			vardb = self._vardb
+			root = vardb.root
+			hash_pkg = owners_cache._hash_pkg
+			hash_str = owners_cache._hash_str
+			base_names = self._vardb._aux_cache["owners"]["base_names"]
+
+			dblink_cache = {}
+
+			def dblink(cpv):
+				x = dblink_cache.get(cpv)
+				if x is None:
+					x = self._vardb._dblink(cpv)
+					dblink_cache[cpv] = x
+				return x
+
+			for path in path_iter:
+				name = os.path.basename(path.rstrip(os.path.sep))
+				if not name:
+					continue
+
+				name_hash = hash_str(name)
+				pkgs = base_names.get(name_hash)
+				if pkgs is not None:
+					for hash_value in pkgs:
+						cpv, counter, mtime = hash_value
+						if hash_pkg(cpv) != hash_value:
+							continue
+						if dblink(cpv).isowner(path, root):
+							yield dblink(cpv), path
+
 class vartree(object):
 	"this tree will scan a var/db/pkg database located at root (passed to init)"
 	def __init__(self, root="/", virtual=None, clone=None, categories=None,
@@ -2202,36 +2401,30 @@ class dblink(object):
 
 			eerror(msg)
 
-			if collision_protect:
+			msg = []
+			msg.append("")
+			msg.append("Searching all installed" + \
+				" packages for file collisions...")
+			msg.append("")
+			msg.append("Press Ctrl-C to Stop")
+			msg.append("")
+			eerror(msg)
+
+			owners = self.vartree.dbapi._owners.get_owners(files)
+			self.vartree.dbapi.flush_cache()
+
+			for pkg, owned_files in owners.iteritems():
+				cpv = pkg.mycpv
 				msg = []
-				msg.append("")
-				msg.append("Searching all installed" + \
-					" packages for file collisions...")
-				msg.append("")
-				msg.append("Press Ctrl-C to Stop")
-				msg.append("")
+				msg.append("%s" % cpv)
+				for f in sorted(owned_files):
+					msg.append("\t%s" % os.path.join(destroot,
+						f.lstrip(os.path.sep)))
 				eerror(msg)
-
-				found_owner = False
-				for cpv in self.vartree.dbapi.cpv_all():
-					cat, pkg = catsplit(cpv)
-					mylink = dblink(cat, pkg, destroot, self.settings,
-						vartree=self.vartree)
-					mycollisions = []
-					for f in collisions:
-						if mylink.isowner(f, destroot):
-							mycollisions.append(f)
-					if mycollisions:
-						found_owner = True
-						msg = []
-						msg.append("%s" % cpv)
-						for f in mycollisions:
-							msg.append("\t%s" % os.path.join(destroot,
-								f.lstrip(os.path.sep)))
-						eerror(msg)
-				if not found_owner:
-					eerror(["None of the installed" + \
-						" packages claim the file(s)."])
+			if not owners:
+				eerror(["None of the installed" + \
+					" packages claim the file(s)."])
+			if collision_protect:
 				return 1
 
 		writemsg_stdout(">>> Merging %s to %s\n" % (self.mycpv, destroot))
--
cgit v1.2.3-1-g7c22
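
A standalone illustration of the indexing scheme (not part of the patch):
each installed file is bucketed under a 16-bit truncated md5 hash of its
basename, so an owner query only has to verify the few packages that share
one bucket instead of walking every package's CONTENTS. In the sketch below
the package names and helper functions are made up for the example, and it
uses modern Python 3 idioms rather than the Python 2 style of the patch:

    import hashlib
    import os

    HASH_BITS = 16
    HEX_CHARS = HASH_BITS // 4  # mirrors _owners_cache._hash_bits / _hex_chars

    def hash_basename(name):
        # Keep only the last HEX_CHARS hex digits (16 bits) of the md5
        # digest, as _owners_cache._hash_str() does.
        return int(hashlib.md5(name.encode("utf-8")).hexdigest()[-HEX_CHARS:], 16)

    def build_index(package_contents):
        # Map basename-hash -> set of packages installing a file with that basename.
        index = {}
        for pkg, paths in package_contents.items():
            for path in paths:
                name = os.path.basename(path.rstrip(os.path.sep))
                if name:
                    index.setdefault(hash_basename(name), set()).add(pkg)
        return index

    def find_owners(index, package_contents, path):
        # Narrow the search to one hash bucket, then verify each candidate
        # against its full contents (the patch uses dblink.isowner() here).
        name = os.path.basename(path.rstrip(os.path.sep))
        candidates = index.get(hash_basename(name), ())
        return [pkg for pkg in candidates if path in package_contents[pkg]]

    # Hypothetical example data, not real vdb contents.
    contents = {
        "app-shells/bash-3.2": {"/bin/bash", "/etc/bash/bashrc"},
        "sys-apps/baselayout-2.0": {"/etc/hosts"},
    }
    index = build_index(contents)
    print(find_owners(index, contents, "/bin/bash"))  # ['app-shells/bash-3.2']

Truncating the hash to 16 bits keeps the pickled index small; the resulting
cross-basename collisions are harmless because every candidate is re-verified
before it is reported as an owner. The patch additionally keys each package
by a (cpv, COUNTER, mtime) tuple (see _hash_pkg()) so that reinstalling a
package automatically invalidates its stale index entries; the sketch omits
that detail.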