Skip to content

Commit

Permalink
add DirectoryIndex
Browse files Browse the repository at this point in the history
  • Loading branch information
ctb committed Jul 4, 2021
1 parent 9726be9 commit 7c4949f
Showing 1 changed file with 63 additions and 4 deletions.
67 changes: 63 additions & 4 deletions src/sourmash/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -449,8 +449,9 @@ class LazyLinearIndex(Index):
* this class stores the provided index 'db' in memory. If you need
a class that does lazy loading of signatures from disk and does not
store signatures in memory, see LazyLoadedIndex.
* if you want efficient in-memory manifest-based selection, consider
LazyMultiIndex.
* if you want efficient manifest-based selection, consider
MultiIndex (signatures in memory) and LazyMultiIndex (signatures loaded
upon request).
"""

def __init__(self, db, selection_dict={}):
Expand Down Expand Up @@ -895,9 +896,18 @@ def sigloc_iter():
return cls(manifest, parent=parent)

@classmethod
def load_from_directory(cls, pathname, force=False):
def load_from_directory(cls, pathname, *, force=False):
"""Create a MultiIndex from a directory.
Takes directory path plus optional boolean 'force'. Attempts to
load all files ending in .sig or .sig.gz, by default; if 'force' is
True, will attempt to load _all_ files, ignoring errors.
"""
from .sourmash_args import traverse_find_sigs

if not os.path.isdir(pathname):
raise ValueError(f"'{pathname}' must be a directory.")

index_list = []
source_list = []

Expand Down Expand Up @@ -980,6 +990,56 @@ def select(self, **kwargs):
return MultiIndex(new_manifest, parent=self.parent)


class DirectoryIndex(Index):
    """An Index backed by a directory that carries its own manifest.

    Notes:
    * Lazy - signatures are only loaded from disk when they are requested.
    * Selection operates on the manifest; see 'select'.
    * Read-only: 'insert' and 'save' raise NotImplementedError.
    """

    def __init__(self, parent, manifest):
        "Create a DirectoryIndex from a parent directory and a manifest."
        # 'parent' is joined with each manifest location to build full paths.
        self.parent = parent
        self.manifest = manifest

    @classmethod
    def load(cls, pathname):
        """Create a DirectoryIndex from a directory with an existing manifest.

        The manifest is read from 'SOURMASH-MANIFEST.csv' directly inside
        'pathname'. Raises ValueError if 'pathname' is not a directory or
        if the manifest file does not exist.
        """
        if not os.path.isdir(pathname):
            raise ValueError(f"'{pathname}' must be a directory.")

        manifest_path = os.path.join(pathname, "SOURMASH-MANIFEST.csv")
        if not os.path.exists(manifest_path):
            raise ValueError(f"Cannot find manifest '{manifest_path}'")

        with open(manifest_path, newline="") as fp:
            loaded = CollectionManifest.load_from_csv(fp)

        return cls(pathname, loaded)

    def signatures(self):
        "Yield each signature in this index, without its location."
        yield from (sig for sig, _ in self.signatures_with_location())

    def signatures_with_location(self):
        """Yield (signature, full path) pairs.

        Every location listed in the manifest is loaded relative to
        'self.parent'; signatures not contained in the manifest are skipped.
        """
        for relpath in self.manifest.locations():
            sigfile = os.path.join(self.parent, relpath)
            for sig in load_signatures(sigfile):
                # a file may hold signatures the manifest does not include
                if sig not in self.manifest:
                    continue
                yield sig, sigfile

    def __len__(self):
        "Return the length of the manifest."
        return len(self.manifest)

    def insert(self, *args):
        "Not supported for this index."
        raise NotImplementedError

    def save(self, *args):
        "Not supported for this index."
        raise NotImplementedError

    def select(self, **kwargs):
        "Run 'select' on the manifest; return a new DirectoryIndex."
        return DirectoryIndex(
            self.parent,
            self.manifest.select_to_manifest(**kwargs),
        )


class LazyLoadedIndex(Index):
"""Given an index location and a manifest, do select only on the manifest
until signatures are actually requested, and only then load the index.
Expand All @@ -991,7 +1051,6 @@ class LazyLoadedIndex(Index):
from disk every time they are needed (e.g. 'find(...)', 'signatures()').
Can be used with LazyMultiIndex to support many such indices at once.
"""
def __init__(self, filename, manifest):
"Create an Index with given filename and manifest."
Expand Down

0 comments on commit 7c4949f

Please sign in to comment.