From 7c4949f2d0cc00e5082cfa81f062fc055e4d37bf Mon Sep 17 00:00:00 2001
From: "C. Titus Brown"
Date: Sun, 4 Jul 2021 07:03:42 -0700
Subject: [PATCH] add DirectoryIndex

---
NOTE(review): line breaks and indentation in this patch were reconstructed
from a whitespace-collapsed copy. Hunk sizes and the diffstat are consistent
with the reconstruction, but the wrap points of the three leading context
lines in the first hunk are a best guess, and the author e-mail address was
not present in the copy. Confirm against upstream before applying.

 src/sourmash/index.py | 67 ++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 63 insertions(+), 4 deletions(-)

diff --git a/src/sourmash/index.py b/src/sourmash/index.py
index 19c21ca24c..5b57860661 100644
--- a/src/sourmash/index.py
+++ b/src/sourmash/index.py
@@ -449,8 +449,9 @@ class LazyLinearIndex(Index):
     * this class stores the provided index 'db' in memory. If you need
       a class that does lazy loading of signatures from disk and does not
       store signatures in memory, see LazyLoadedIndex.
-    * if you want efficient in-memory manifest-based selection, consider
-      LazyMultiIndex.
+    * if you want efficient manifest-based selection, consider
+      MultiIndex (signatures in memory) and LazyMultiIndex (signatures loaded
+      upon request).
     """
 
     def __init__(self, db, selection_dict={}):
@@ -895,9 +896,18 @@ def sigloc_iter():
         return cls(manifest, parent=parent)
 
     @classmethod
-    def load_from_directory(cls, pathname, force=False):
+    def load_from_directory(cls, pathname, *, force=False):
+        """Create a MultiIndex from a directory.
+
+        Takes directory path plus optional boolean 'force'. Attempts to
+        load all files ending in .sig or .sig.gz, by default; if 'force' is
+        True, will attempt to load _all_ files, ignoring errors.
+        """
         from .sourmash_args import traverse_find_sigs
 
+        if not os.path.isdir(pathname):
+            raise ValueError(f"'{pathname}' must be a directory.")
+
         index_list = []
         source_list = []
 
@@ -980,6 +990,56 @@ def select(self, **kwargs):
         return MultiIndex(new_manifest, parent=self.parent)
 
 
+class DirectoryIndex(Index):
+    """
+    Notes:
+    * Lazy - loads signatures only when requested
+    """
+    def __init__(self, parent, manifest):
+        self.parent = parent
+        self.manifest = manifest
+
+    @classmethod
+    def load(cls, pathname):
+        "Create a DirectoryIndex from a directory with an existing manifest."
+        if not os.path.isdir(pathname):
+            raise ValueError(f"'{pathname}' must be a directory.")
+
+        manifest_path = os.path.join(pathname, "SOURMASH-MANIFEST.csv")
+        if not os.path.exists(manifest_path):
+            raise ValueError(f"Cannot find manifest '{manifest_path}'")
+
+        with open(manifest_path, newline="") as csvfp:
+            manifest = CollectionManifest.load_from_csv(csvfp)
+
+        return cls(pathname, manifest)
+
+    def signatures(self):
+        for ss, _ in self.signatures_with_location():
+            yield ss
+
+    def signatures_with_location(self):
+        for location in self.manifest.locations():
+            fullpath = os.path.join(self.parent, location)
+            for ss in load_signatures(fullpath):
+                if ss in self.manifest:
+                    yield ss, fullpath
+
+    def __len__(self):
+        return len(self.manifest)
+
+    def insert(self, *args):
+        raise NotImplementedError
+
+    def save(self, *args):
+        raise NotImplementedError
+
+    def select(self, **kwargs):
+        "Run 'select' on the manifest."
+        new_manifest = self.manifest.select_to_manifest(**kwargs)
+        return DirectoryIndex(self.parent, new_manifest)
+
+
 class LazyLoadedIndex(Index):
     """Given an index location and a manifest, do select only on the manifest
     until signatures are actually requested, and only then load the index.
@@ -991,7 +1051,6 @@ class LazyLoadedIndex(Index):
     from disk every time they are needed (e.g. 'find(...)', 'signatures()').
 
     Can be used with LazyMultiIndex to support many such indices at once.
-
     """
     def __init__(self, filename, manifest):
         "Create an Index with given filename and manifest."