From 3ba595090de65c3d1eef2c9c6ad99f50f8b8c911 Mon Sep 17 00:00:00 2001
From: Zac Medico <zmedico@gentoo.org>
Date: Mon, 9 Jun 2008 15:34:21 +0000
Subject: Add CONTENTS indexing support for optimization of owner lookups.

The vardbapi cache maintains a hash table (inside vdb_metadata.pickle)
that serves to index package contents by mapping the basename of a file
to a list of possible packages that own it. This is used to optimize
owner lookups by narrowing the search down to a smaller number of
packages. It increases the size of vdb_metadata.pickle by approximately
30% and it's used in the following cases:

* When an unexpected file collision occurs (whether or not
  collision-protect is enabled)
* `emerge <file>`
* `portageq owners`

(trunk r10609)

svn path=/main/branches/2.1.2/; revision=10622
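To make the indexing scheme concrete, here is a minimal standalone
sketch of the idea described above. This is illustrative only and not
code from the patch: the OwnerIndex name, its methods, and the example
package atoms are hypothetical stand-ins for the _owners_cache
machinery added below.

    import hashlib
    import os

    class OwnerIndex(object):
        _hash_bits = 16
        _hex_chars = _hash_bits // 4

        def __init__(self):
            # Maps a 16-bit hash of a file's basename to the set of
            # packages that install a file with that basename.
            self.base_names = {}

        def _hash_str(self, s):
            # Keep only the last 4 hex digits of the md5 digest, so
            # each key is a small integer and the index stays compact.
            h = hashlib.md5(s.encode("utf-8")).hexdigest()
            return int(h[-self._hex_chars:], 16)

        def add(self, pkg, paths):
            for path in paths:
                name = os.path.basename(path.rstrip(os.sep))
                if name:
                    self.base_names.setdefault(
                        self._hash_str(name), set()).add(pkg)

        def candidates(self, path):
            # Only packages in this bucket can possibly own the path.
            # A 16-bit hash collision merely adds false candidates,
            # which a real CONTENTS check then filters out.
            name = os.path.basename(path.rstrip(os.sep))
            return self.base_names.get(self._hash_str(name), set())

    index = OwnerIndex()
    index.add("app-shells/bash-3.2_p39",
        ["/bin/bash", "/etc/bash/bashrc"])
    index.add("sys-apps/baselayout-1.12.11", ["/etc/hosts"])
    print(index.candidates("/bin/bash"))

Truncating the hash trades accuracy for size: a bucket may contain
false candidates, but the final CONTENTS membership test keeps lookups
exact.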
---
 pym/portage.py | 293 +++++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 243 insertions(+), 50 deletions(-)

(limited to 'pym')

diff --git a/pym/portage.py b/pym/portage.py
index 68354a358..83a85b322 100644
--- a/pym/portage.py
+++ b/pym/portage.py
@@ -6740,6 +6740,9 @@ class vardbapi(dbapi):
 	_excluded_dirs = re.compile(r'^(\..*|-MERGING-.*|' + \
 		"|".join(_excluded_dirs) + r')$')
 
+	_aux_cache_version = "1"
+	_owners_cache_version = "1"
+
 	# Number of uncached packages to trigger cache update, since
 	# it's wasteful to update it for every vdb change.
 	_aux_cache_threshold = 5
@@ -6771,12 +6774,12 @@ class vardbapi(dbapi):
 			"EAPI", "HOMEPAGE", "IUSE", "KEYWORDS",
 			"LICENSE", "PDEPEND", "PROVIDE", "RDEPEND",
 			"repository", "RESTRICT" , "SLOT", "USE"])
-		self._aux_cache = None
-		self._aux_cache_version = "1"
+		self._aux_cache_obj = None
 		self._aux_cache_filename = os.path.join(self.root,
 			CACHE_PATH.lstrip(os.path.sep), "vdb_metadata.pickle")
 		self._counter_path = os.path.join(root,
 			CACHE_PATH.lstrip(os.path.sep), "counter")
+		self._owners = self._owners_db(self)
 
 	def cpv_exists(self,mykey):
 		"Tells us whether an actual ebuild exists on disk (no masking)"
@@ -7042,6 +7045,7 @@ class vardbapi(dbapi):
 		if self._aux_cache is not None and \
 			len(self._aux_cache["modified"]) >= self._aux_cache_threshold and \
 			secpass >= 2:
+			self._owners.populate() # index any unindexed contents
 			valid_nodes = set(self.cpv_all())
 			for cpv in self._aux_cache["packages"].keys():
 				if cpv not in valid_nodes:
@@ -7057,6 +7061,54 @@ class vardbapi(dbapi):
 			pass
 		self._aux_cache["modified"] = set()
 
+	@property
+	def _aux_cache(self):
+		if self._aux_cache_obj is None:
+			self._aux_cache_init()
+		return self._aux_cache_obj
+
+	def _aux_cache_init(self):
+		aux_cache = None
+		try:
+			f = open(self._aux_cache_filename)
+			mypickle = cPickle.Unpickler(f)
+			mypickle.find_global = None
+			aux_cache = mypickle.load()
+			f.close()
+			del f
+		except (IOError, OSError, EOFError, cPickle.UnpicklingError), e:
+			if isinstance(e, cPickle.UnpicklingError):
+				writemsg("!!! Error loading '%s': %s\n" % \
+					(self._aux_cache_filename, str(e)), noiselevel=-1)
+			del e
+
+		if not aux_cache or \
+			not isinstance(aux_cache, dict) or \
+			aux_cache.get("version") != self._aux_cache_version or \
+			not aux_cache.get("packages"):
+			aux_cache = {"version": self._aux_cache_version}
+			aux_cache["packages"] = {}
+
+		owners = aux_cache.get("owners")
+		if owners is not None:
+			if not isinstance(owners, dict):
+				owners = None
+			elif "version" not in owners:
+				owners = None
+			elif owners["version"] != self._owners_cache_version:
+				owners = None
+			elif "base_names" not in owners:
+				owners = None
+
+		if owners is None:
+			owners = {
+				"base_names" : {},
+				"version" : self._owners_cache_version
+			}
+			aux_cache["owners"] = owners
+
+		aux_cache["modified"] = set()
+		self._aux_cache_obj = aux_cache
+
 	def aux_get(self, mycpv, wants):
 		"""This automatically caches selected keys that are frequently needed
 		by emerge for dependency calculations.  The cached metadata is
@@ -7078,26 +7130,6 @@ class vardbapi(dbapi):
 		cache_these = set(self._aux_cache_keys)
 		cache_these.update(cache_these_wants)
 
-		if self._aux_cache is None:
-			try:
-				f = open(self._aux_cache_filename)
-				mypickle = cPickle.Unpickler(f)
-				mypickle.find_global = None
-				self._aux_cache = mypickle.load()
-				f.close()
-				del f
-			except (IOError, OSError, EOFError, cPickle.UnpicklingError), e:
-				if isinstance(e, cPickle.UnpicklingError):
-					writemsg("!!! Error loading '%s': %s\n" % \
-						(self._aux_cache_filename, str(e)), noiselevel=-1)
-				del e
-			if not self._aux_cache or \
-				not isinstance(self._aux_cache, dict) or \
-				self._aux_cache.get("version") != self._aux_cache_version or \
-				not self._aux_cache.get("packages"):
-				self._aux_cache = {"version": self._aux_cache_version}
-				self._aux_cache["packages"] = {}
-			self._aux_cache["modified"] = set()
 		mydir = os.path.join(self.root, VDB_PATH, mycpv)
 		mydir_stat = None
 		try:
@@ -7272,6 +7304,173 @@ class vardbapi(dbapi):
 			write_atomic(self._counter_path, str(counter))
 		return counter
 
+	def _dblink(self, cpv):
+		category, pf = catsplit(cpv)
+		return dblink(category, pf, self.root,
+			self.settings, vartree=self.vartree)
+
+	class _owners_cache(object):
+		"""
+		This class maintains a hash table that serves to index package
+		contents by mapping the basename of a file to a list of possible
+		packages that own it. This is used to optimize owner lookups
+		by narrowing the search down to a smaller number of packages.
+		"""
+		try:
+			from hashlib import md5 as _new_hash
+		except ImportError:
+			from md5 import new as _new_hash
+
+		_hash_bits = 16
+		_hex_chars = _hash_bits / 4
+
+		def __init__(self, vardb):
+			self._vardb = vardb
+
+		def add(self, cpv):
+			root_len = len(self._vardb.root)
+			contents = self._vardb._dblink(cpv).getcontents()
+			pkg_hash = self._hash_pkg(cpv)
+			if not contents:
+				# Empty path is a code used to represent empty contents.
+				self._add_path("", pkg_hash)
+			for x in contents:
+				relative_path = x[root_len:]
+				self._add_path(relative_path, pkg_hash)
+			self._vardb._aux_cache["modified"].add(cpv)
+
+		def _add_path(self, path, pkg_hash):
+			"""
+			Empty path is a code that represents empty contents.
+ """ + if path: + name = os.path.basename(path.rstrip(os.path.sep)) + if not name: + return + else: + name = path + name_hash = self._hash_str(name) + base_names = self._vardb._aux_cache["owners"]["base_names"] + pkgs = base_names.get(name_hash) + if pkgs is None: + pkgs = {} + base_names[name_hash] = pkgs + pkgs[pkg_hash] = None + + def _hash_str(self, s): + h = self._new_hash() + h.update(s) + h = h.hexdigest() + h = h[-self._hex_chars:] + h = int(h, 16) + return h + + def _hash_pkg(self, cpv): + counter, mtime = self._vardb.aux_get( + cpv, ["COUNTER", "_mtime_"]) + try: + counter = int(counter) + except ValueError: + counter = 0 + return (cpv, counter, mtime) + + class _owners_db(object): + + def __init__(self, vardb): + self._vardb = vardb + + def populate(self): + self._populate() + + def _populate(self): + owners_cache = vardbapi._owners_cache(self._vardb) + cached_hashes = set() + base_names = self._vardb._aux_cache["owners"]["base_names"] + + # Take inventory of all cached package hashes. + for hash_values in base_names.itervalues(): + cached_hashes.update(hash_values) + + # Create sets of valid package hashes and uncached packages. + uncached_pkgs = set() + hash_pkg = owners_cache._hash_pkg + valid_pkg_hashes = set() + for cpv in self._vardb.cpv_all(): + hash_value = hash_pkg(cpv) + valid_pkg_hashes.add(hash_value) + if hash_value not in cached_hashes: + uncached_pkgs.add(cpv) + + # Cache any missing packages. + for cpv in uncached_pkgs: + owners_cache.add(cpv) + + # Delete any stale cache. + stale_hashes = cached_hashes.difference(valid_pkg_hashes) + if stale_hashes: + for base_name_hash, bucket in base_names.items(): + for hash_value in stale_hashes.intersection(bucket): + del bucket[hash_value] + if not bucket: + del base_names[base_name_hash] + + return owners_cache + + def get_owners(self, path_iter): + """ + @return the owners as a dblink -> set(files) mapping. + """ + owners = {} + for owner, f in self.iter_owners(path_iter): + owned_files = owners.get(owner) + if owned_files is None: + owned_files = set() + owners[owner] = owned_files + owned_files.add(f) + return owners + + def iter_owners(self, path_iter): + """ + Iterate over tuples of (dblink, path). In order to avoid + consuming too many resources for too much time, resources + are only allocated for the duration of a given iter_owners() + call. Therefore, to maximize reuse of resources when searching + for multiple files, it's best to search for them all in a single + call. 
+ """ + + owners_cache = self._populate() + + vardb = self._vardb + root = vardb.root + hash_pkg = owners_cache._hash_pkg + hash_str = owners_cache._hash_str + base_names = self._vardb._aux_cache["owners"]["base_names"] + + dblink_cache = {} + + def dblink(cpv): + x = dblink_cache.get(cpv) + if x is None: + x = self._vardb._dblink(cpv) + dblink_cache[cpv] = x + return x + + for path in path_iter: + name = os.path.basename(path.rstrip(os.path.sep)) + if not name: + continue + + name_hash = hash_str(name) + pkgs = base_names.get(name_hash) + if pkgs is not None: + for hash_value in pkgs: + cpv, counter, mtime = hash_value + if hash_pkg(cpv) != hash_value: + continue + if dblink(cpv).isowner(path, root): + yield dblink(cpv), path + class vartree(object): "this tree will scan a var/db/pkg database located at root (passed to init)" def __init__(self, root="/", virtual=None, clone=None, categories=None, @@ -9691,36 +9890,30 @@ class dblink: eerror(msg) - if collision_protect: + msg = [] + msg.append("") + msg.append("Searching all installed" + \ + " packages for file collisions...") + msg.append("") + msg.append("Press Ctrl-C to Stop") + msg.append("") + eerror(msg) + + owners = self.vartree.dbapi._owners.get_owners(files) + self.vartree.dbapi.flush_cache() + + for pkg, owned_files in owners.iteritems(): + cpv = pkg.mycpv msg = [] - msg.append("") - msg.append("Searching all installed" + \ - " packages for file collisions...") - msg.append("") - msg.append("Press Ctrl-C to Stop") - msg.append("") + msg.append("%s" % cpv) + for f in sorted(owned_files): + msg.append("\t%s" % os.path.join(destroot, + f.lstrip(os.path.sep))) eerror(msg) - - found_owner = False - for cpv in self.vartree.dbapi.cpv_all(): - cat, pkg = catsplit(cpv) - mylink = dblink(cat, pkg, destroot, self.settings, - vartree=self.vartree) - mycollisions = [] - for f in collisions: - if mylink.isowner(f, destroot): - mycollisions.append(f) - if mycollisions: - found_owner = True - msg = [] - msg.append("%s" % cpv) - for f in mycollisions: - msg.append("\t%s" % os.path.join(destroot, - f.lstrip(os.path.sep))) - eerror(msg) - if not found_owner: - eerror(["None of the installed" + \ - " packages claim the file(s)."]) + if not owners: + eerror(["None of the installed" + \ + " packages claim the file(s)."]) + if collision_protect: return 1 writemsg_stdout(">>> Merging %s to %s\n" % (self.mycpv, destroot)) -- cgit v1.2.3-1-g7c22