From 7337563f332e927d5e51e8b8c3a8844709e4cf24 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 28 Apr 2021 13:31:58 -0700
Subject: [PATCH 01/18] initial refactor of CounterGather stuff

---
 src/sourmash/index.py | 52 ++++++++++++++++++++++++++++++-------------
 1 file changed, 36 insertions(+), 16 deletions(-)

diff --git a/src/sourmash/index.py b/src/sourmash/index.py
index 18bbf25f61..9476eccf8f 100644
--- a/src/sourmash/index.py
+++ b/src/sourmash/index.py
@@ -5,6 +5,7 @@
 from abc import abstractmethod, ABC
 from collections import namedtuple, Counter
 import zipfile
+import copy
 
 from .search import make_jaccard_search_query, make_gather_query
 
@@ -431,43 +432,51 @@ def select(self, **kwargs):
                                   traverse_yield_all=self.traverse_yield_all)
 
 
-class CounterGatherIndex(Index):
-    def __init__(self, query):
-        self.query = query
-        self.scaled = query.minhash.scaled
+class QuerySpecific_GatherCounter:
+    def __init__(self, query_mh):
+        if not query_mh.scaled:
+            raise ValueError('gather requires scaled signatures')
+
+        # track query
+        self.query_mh = copy.copy(query_mh)
+        self.scaled = query_mh.scaled
+
+        # track matching signatures & their locations
         self.siglist = []
         self.locations = []
+
+        # ...and overlaps with query
         self.counter = Counter()
 
-    def insert(self, ss, location=None):
+    def add(self, ss, location=None):
         i = len(self.siglist)
         self.siglist.append(ss)
         self.locations.append(location)
 
         # upon insertion, count & track overlap with the specific query.
         self.scaled = max(self.scaled, ss.minhash.scaled)
-        self.counter[i] = self.query.minhash.count_common(ss.minhash, True)
+        self.counter[i] = self.query_mh.count_common(ss.minhash, True)
 
-    def gather(self, query, threshold_bp=0, **kwargs):
-        "Perform compositional analysis of the query using the gather algorithm"
-        # CTB: switch over to JaccardSearch objects?
+    def __iter__(self):
+        return self
 
-        if not query.minhash:             # empty query? quit.
+    def next(self, scaled, threshold_bp=0, **kwargs):
+        "Perform compositional analysis of the query using the gather algorithm"
+        query_mh = self.query_mh
+        if not query_mh:             # empty query? quit.
             return []
 
         # bad query?
-        scaled = query.minhash.scaled
-        if not scaled:
-            raise ValueError('gather requires scaled signatures')
-
         if scaled == self.scaled:
-            query_mh = query.minhash
+            pass
         elif scaled < self.scaled:
-            query_mh = query.minhash.downsample(scaled=self.scaled)
+            query_mh = query_mh.downsample(scaled=self.scaled)
             scaled = self.scaled
         else: # query scaled > self.scaled, should never happen
             assert 0
 
+        self.query_mh = query_mh
+
         # empty? nothing to search.
         counter = self.counter
         siglist = self.siglist
@@ -536,6 +545,17 @@ def gather(self, query, threshold_bp=0, **kwargs):
             return [result]
         return []
 
+
+class CounterGatherIndex(Index):
+    def __init__(self, query):
+        self.counter = QuerySpecific_GatherCounter(query.minhash)
+
+    def insert(self, ss, location=None):
+        self.counter.add(ss, location)
+
+    def gather(self, query, threshold_bp=0):
+        return self.counter.next(query.minhash.scaled, threshold_bp)
+
     def signatures(self):
         raise NotImplementedError
 

From 98de5e68d1e1a53ef7bcd060c0d83c9dc21700a4 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 28 Apr 2021 16:37:15 -0700
Subject: [PATCH 02/18] fix up code a bit

---
 src/sourmash/index.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/sourmash/index.py b/src/sourmash/index.py
index 9476eccf8f..ca877c8241 100644
--- a/src/sourmash/index.py
+++ b/src/sourmash/index.py
@@ -438,7 +438,7 @@ def __init__(self, query_mh):
             raise ValueError('gather requires scaled signatures')
 
         # track query
-        self.query_mh = copy.copy(query_mh)
+        self.query_mh = copy.copy(query_mh).flatten()
         self.scaled = query_mh.scaled
 
         # track matching signatures & their locations
@@ -475,8 +475,6 @@ def next(self, scaled, threshold_bp=0, **kwargs):
         else: # query scaled > self.scaled, should never happen
             assert 0
 
-        self.query_mh = query_mh
-
         # empty? nothing to search.
         counter = self.counter
         siglist = self.siglist
@@ -541,6 +539,9 @@ def next(self, scaled, threshold_bp=0, **kwargs):
             if counter[dataset_id] == 0:
                 del counter[dataset_id]
 
+        query_mh.remove_many(intersect_mh.hashes)
+        self.query_mh = query_mh
+
         if result:
             return [result]
         return []

From e4ff7a87765bccfb70c3e4bda35812ec3e94d4bd Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 28 Apr 2021 17:45:38 -0700
Subject: [PATCH 03/18] cleanup and refactor

---
 src/sourmash/index.py | 132 +++++++++++++++++++++++-------------------
 1 file changed, 72 insertions(+), 60 deletions(-)

diff --git a/src/sourmash/index.py b/src/sourmash/index.py
index ca877c8241..93a1ede6d5 100644
--- a/src/sourmash/index.py
+++ b/src/sourmash/index.py
@@ -448,67 +448,89 @@ def __init__(self, query_mh):
         # ...and overlaps with query
         self.counter = Counter()
 
+        # cannot add matches once query has started.
+        self.query_started = 0
+
     def add(self, ss, location=None):
+        assert not self.query_started
+
         i = len(self.siglist)
         self.siglist.append(ss)
         self.locations.append(location)
 
+        # note: scaled will be max of all matches.
+        self.downsample(ss.minhash.scaled)
+
         # upon insertion, count & track overlap with the specific query.
-        self.scaled = max(self.scaled, ss.minhash.scaled)
         self.counter[i] = self.query_mh.count_common(ss.minhash, True)
 
-    def __iter__(self):
-        return self
-
-    def next(self, scaled, threshold_bp=0, **kwargs):
-        "Perform compositional analysis of the query using the gather algorithm"
-        query_mh = self.query_mh
-        if not query_mh:             # empty query? quit.
-            return []
-
-        # bad query?
-        if scaled == self.scaled:
-            pass
-        elif scaled < self.scaled:
-            query_mh = query_mh.downsample(scaled=self.scaled)
-            scaled = self.scaled
-        else: # query scaled > self.scaled, should never happen
-            assert 0
-
-        # empty? nothing to search.
-        counter = self.counter
-        siglist = self.siglist
-        if not (counter and siglist):
-            return []
+    def downsample(self, scaled):
+        if scaled > self.scaled:
+            self.query_mh = self.query_mh.downsample(scaled=scaled)
+            self.scaled = scaled
 
+    def calc_threshold(self, threshold_bp, scaled, query_size):
         threshold = 0.0
         n_threshold_hashes = 0
 
-        # are we setting a threshold?
         if threshold_bp:
             # if we have a threshold_bp of N, then that amounts to N/scaled
             # hashes:
             n_threshold_hashes = float(threshold_bp) / scaled
 
             # that then requires the following containment:
-            threshold = n_threshold_hashes / len(query_mh)
+            threshold = n_threshold_hashes / query_size
 
-            # is it too high to ever match? if so, exit.
-            if threshold > 1.0:
-                return []
+        return threshold, n_threshold_hashes
 
-        # Decompose query into matching signatures using a greedy approach
-        # (gather)
-        match_size = n_threshold_hashes
+    def update_counters(self, most_common, intersect_mh):
+        siglist = self.siglist
+        counter = self.counter
 
-        most_common = counter.most_common()
-        dataset_id, size = most_common.pop(0)
+        # Prepare counter for finding the next match by decrementing
+        # all hashes found in the current match in other datasets;
+        # remove empty datasets from counter, too.
+        for (dataset_id, _) in most_common:
+            remaining_mh = siglist[dataset_id].minhash
+            intersect_count = intersect_mh.count_common(remaining_mh,
+                                                        downsample=True)
+            counter[dataset_id] -= intersect_count
+            if counter[dataset_id] == 0:
+                del counter[dataset_id]
 
-        # fail threshold!
-        if size < n_threshold_hashes:
+    def next(self, scaled, threshold_bp=0, **kwargs):
+        "Iterate through results."
+        self.query_started = 1
+
+        # empty? nothing to search.
+        counter = self.counter
+        siglist = self.siglist
+        if not (counter and siglist):
+            return []
+
+        self.downsample(scaled)
+        scaled = self.scaled
+
+        query_mh = self.query_mh
+        if not query_mh:             # empty query? quit.
+            return []
+
+        # are we setting a threshold?
+        threshold, n_threshold_hashes = self.calc_threshold(threshold_bp,
+                                                            scaled,
+                                                            len(query_mh))
+
+        # is it too high to ever match? if so, exit.
+        if threshold > 1.0:
             return []
 
-        match_size = size
+        # Find the best match -
+        most_common = counter.most_common()
+        dataset_id, match_size = most_common.pop(0)
+
+        # below threshold? no match!
+        if match_size < n_threshold_hashes:
+            return []
 
         # pull match and location.
         match = siglist[dataset_id]
@@ -517,35 +539,25 @@ def next(self, scaled, threshold_bp=0, **kwargs):
         # remove from counter for next round of gather
         del counter[dataset_id]
 
-        # pull containment
+        # calculate containment
         cont = query_mh.contained_by(match.minhash, downsample=True)
-        result = None
-        if cont and cont >= threshold:
-            result = IndexSearchResult(cont, match, location)
 
-        # calculate intersection of this "best match" with query, for removal.
-        # @CTB note flatten
-        match_mh = match.minhash.downsample(scaled=scaled).flatten()
-        intersect_mh = query_mh.intersection(match_mh)
+        retval = []
+        if cont and cont >= threshold:
+            retval = [IndexSearchResult(cont, match, location)]
 
-        # Prepare counter for finding the next match by decrementing
-        # all hashes found in the current match in other datasets;
-        # remove empty datasets from counter, too.
-        for (dataset_id, _) in most_common:
-            remaining_sig = siglist[dataset_id]
-            intersect_count = remaining_sig.minhash.count_common(intersect_mh,
-                                                               downsample=True)
-            counter[dataset_id] -= intersect_count
-            if counter[dataset_id] == 0:
-                del counter[dataset_id]
+            # calculate intersection of this "best match" with query
+            # for removal.
 
-        query_mh.remove_many(intersect_mh.hashes)
-        self.query_mh = query_mh
+            # @CTB note flatten
+            match_mh = match.minhash.downsample(scaled=scaled).flatten()
+            intersect_mh = query_mh.intersection(match_mh)
+            query_mh.remove_many(intersect_mh.hashes)
 
-        if result:
-            return [result]
-        return []
+            # update all counters, etc.
+            self.update_counters(most_common, intersect_mh)
 
+        return retval
 
 class CounterGatherIndex(Index):
     def __init__(self, query):

From 125dfbd0358e35915f5c991b31748d6808804bd9 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 28 Apr 2021 21:16:17 -0700
Subject: [PATCH 04/18] factor back in the explicit current query

---
 src/sourmash/index.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/sourmash/index.py b/src/sourmash/index.py
index 93a1ede6d5..148182f606 100644
--- a/src/sourmash/index.py
+++ b/src/sourmash/index.py
@@ -438,7 +438,7 @@ def __init__(self, query_mh):
             raise ValueError('gather requires scaled signatures')
 
         # track query
-        self.query_mh = copy.copy(query_mh).flatten()
+        self.orig_query_mh = copy.copy(query_mh).flatten()
         self.scaled = query_mh.scaled
 
         # track matching signatures & their locations
@@ -462,11 +462,10 @@ def add(self, ss, location=None):
         self.downsample(ss.minhash.scaled)
 
         # upon insertion, count & track overlap with the specific query.
-        self.counter[i] = self.query_mh.count_common(ss.minhash, True)
+        self.counter[i] = self.orig_query_mh.count_common(ss.minhash, True)
 
     def downsample(self, scaled):
         if scaled > self.scaled:
-            self.query_mh = self.query_mh.downsample(scaled=scaled)
             self.scaled = scaled
 
     def calc_threshold(self, threshold_bp, scaled, query_size):
@@ -498,7 +497,7 @@ def update_counters(self, most_common, intersect_mh):
             if counter[dataset_id] == 0:
                 del counter[dataset_id]
 
-    def next(self, scaled, threshold_bp=0, **kwargs):
+    def next(self, query_mh, scaled, threshold_bp=0, **kwargs):
         "Iterate through results."
         self.query_started = 1
 
@@ -510,11 +509,14 @@ def next(self, scaled, threshold_bp=0, **kwargs):
 
         self.downsample(scaled)
         scaled = self.scaled
+        query_mh = query_mh.downsample(scaled=scaled)
 
-        query_mh = self.query_mh
         if not query_mh:             # empty query? quit.
             return []
 
+        assert query_mh.contained_by(self.orig_query_mh,
+                                     downsample=True) == 1
+
         # are we setting a threshold?
         threshold, n_threshold_hashes = self.calc_threshold(threshold_bp,
                                                             scaled,
@@ -567,7 +569,7 @@ def insert(self, ss, location=None):
         self.counter.add(ss, location)
 
     def gather(self, query, threshold_bp=0):
-        return self.counter.next(query.minhash.scaled, threshold_bp)
+        return self.counter.next(query.minhash, query.minhash.scaled, threshold_bp)
 
     def signatures(self):
         raise NotImplementedError

From f7e430e04d23ae34ae53ac58d6510ff87f81180d Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 28 Apr 2021 21:24:09 -0700
Subject: [PATCH 05/18] refactor into peek and consume

---
 src/sourmash/index.py | 42 ++++++++++++++++++++++++------------------
 1 file changed, 24 insertions(+), 18 deletions(-)

diff --git a/src/sourmash/index.py b/src/sourmash/index.py
index 148182f606..ae91a24705 100644
--- a/src/sourmash/index.py
+++ b/src/sourmash/index.py
@@ -497,7 +497,12 @@ def update_counters(self, most_common, intersect_mh):
             if counter[dataset_id] == 0:
                 del counter[dataset_id]
 
-    def next(self, query_mh, scaled, threshold_bp=0, **kwargs):
+    def consume(self, intersect_mh):
+        counter = self.counter
+        most_common = counter.most_common()
+        self.update_counters(most_common, intersect_mh)
+
+    def peek(self, cur_query_mh, scaled, threshold_bp=0, **kwargs):
         "Iterate through results."
         self.query_started = 1
 
@@ -509,18 +514,18 @@ def next(self, query_mh, scaled, threshold_bp=0, **kwargs):
 
         self.downsample(scaled)
         scaled = self.scaled
-        query_mh = query_mh.downsample(scaled=scaled)
+        cur_query_mh = cur_query_mh.downsample(scaled=scaled)
 
-        if not query_mh:             # empty query? quit.
+        if not cur_query_mh:             # empty query? quit.
             return []
 
-        assert query_mh.contained_by(self.orig_query_mh,
+        assert cur_query_mh.contained_by(self.orig_query_mh,
                                      downsample=True) == 1
 
         # are we setting a threshold?
         threshold, n_threshold_hashes = self.calc_threshold(threshold_bp,
                                                             scaled,
-                                                            len(query_mh))
+                                                            len(cur_query_mh))
 
         # is it too high to ever match? if so, exit.
         if threshold > 1.0:
@@ -528,7 +533,7 @@ def next(self, query_mh, scaled, threshold_bp=0, **kwargs):
 
         # Find the best match -
         most_common = counter.most_common()
-        dataset_id, match_size = most_common.pop(0)
+        dataset_id, match_size = most_common[0]
 
         # below threshold? no match!
         if match_size < n_threshold_hashes:
@@ -538,26 +543,18 @@ def next(self, query_mh, scaled, threshold_bp=0, **kwargs):
         match = siglist[dataset_id]
         location = self.locations[dataset_id]
 
-        # remove from counter for next round of gather
-        del counter[dataset_id]
-
         # calculate containment
-        cont = query_mh.contained_by(match.minhash, downsample=True)
+        cont = cur_query_mh.contained_by(match.minhash, downsample=True)
 
         retval = []
         if cont and cont >= threshold:
-            retval = [IndexSearchResult(cont, match, location)]
-
             # calculate intersection of this "best match" with query
             # for removal.
 
             # @CTB note flatten
             match_mh = match.minhash.downsample(scaled=scaled).flatten()
-            intersect_mh = query_mh.intersection(match_mh)
-            query_mh.remove_many(intersect_mh.hashes)
-
-            # update all counters, etc.
-            self.update_counters(most_common, intersect_mh)
+            intersect_mh = cur_query_mh.intersection(match_mh)
+            retval = [IndexSearchResult(cont, match, location), intersect_mh]
 
         return retval
 
@@ -569,8 +566,17 @@ def insert(self, ss, location=None):
         self.counter.add(ss, location)
 
     def gather(self, query, threshold_bp=0):
-        return self.counter.next(query.minhash, query.minhash.scaled, threshold_bp)
+        result = self.counter.peek(query.minhash, query.minhash.scaled,
+                                   threshold_bp)
+        if result:
+            (sr, intersect_mh) = result
+            self.counter.consume(intersect_mh)
+            
+            query.minhash.remove_many(intersect_mh.hashes) #  @CTB
 
+            return [sr]
+        return []
+        
     def signatures(self):
         raise NotImplementedError
 

From b7582d11ce39755b4ba04b85639d1b3674bfb882 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 28 Apr 2021 21:30:10 -0700
Subject: [PATCH 06/18] more refactor into peek & consume

---
 src/sourmash/index.py | 51 +++++++++++++++++++++++--------------------
 1 file changed, 27 insertions(+), 24 deletions(-)

diff --git a/src/sourmash/index.py b/src/sourmash/index.py
index ae91a24705..5cf786aae5 100644
--- a/src/sourmash/index.py
+++ b/src/sourmash/index.py
@@ -469,6 +469,7 @@ def downsample(self, scaled):
             self.scaled = scaled
 
     def calc_threshold(self, threshold_bp, scaled, query_size):
+        # @CTB can be outside this class
         threshold = 0.0
         n_threshold_hashes = 0
 
@@ -482,36 +483,18 @@ def calc_threshold(self, threshold_bp, scaled, query_size):
 
         return threshold, n_threshold_hashes
 
-    def update_counters(self, most_common, intersect_mh):
-        siglist = self.siglist
-        counter = self.counter
-
-        # Prepare counter for finding the next match by decrementing
-        # all hashes found in the current match in other datasets;
-        # remove empty datasets from counter, too.
-        for (dataset_id, _) in most_common:
-            remaining_mh = siglist[dataset_id].minhash
-            intersect_count = intersect_mh.count_common(remaining_mh,
-                                                        downsample=True)
-            counter[dataset_id] -= intersect_count
-            if counter[dataset_id] == 0:
-                del counter[dataset_id]
-
-    def consume(self, intersect_mh):
-        counter = self.counter
-        most_common = counter.most_common()
-        self.update_counters(most_common, intersect_mh)
-
     def peek(self, cur_query_mh, scaled, threshold_bp=0, **kwargs):
-        "Iterate through results."
+        "Get next potential result."
         self.query_started = 1
 
         # empty? nothing to search.
         counter = self.counter
         siglist = self.siglist
-        if not (counter and siglist):
+        if not counter:
             return []
 
+        assert siglist
+
         self.downsample(scaled)
         scaled = self.scaled
         cur_query_mh = cur_query_mh.downsample(scaled=scaled)
@@ -541,7 +524,6 @@ def peek(self, cur_query_mh, scaled, threshold_bp=0, **kwargs):
 
         # pull match and location.
         match = siglist[dataset_id]
-        location = self.locations[dataset_id]
 
         # calculate containment
         cont = cur_query_mh.contained_by(match.minhash, downsample=True)
@@ -551,13 +533,34 @@ def peek(self, cur_query_mh, scaled, threshold_bp=0, **kwargs):
             # calculate intersection of this "best match" with query
             # for removal.
 
-            # @CTB note flatten
+            # @CTB: note flatten
             match_mh = match.minhash.downsample(scaled=scaled).flatten()
             intersect_mh = cur_query_mh.intersection(match_mh)
+            location = self.locations[dataset_id]
             retval = [IndexSearchResult(cont, match, location), intersect_mh]
 
         return retval
 
+    def consume(self, intersect_mh):
+        "Remove the given hashes."
+        siglist = self.siglist
+        counter = self.counter
+
+        most_common = counter.most_common()
+
+        # Prepare counter for finding the next match by decrementing
+        # all hashes found in the current match in other datasets;
+        # remove empty datasets from counter, too.
+        for (dataset_id, _) in most_common:
+            # @CTB: we may want to downsample remaining_mh here...
+            remaining_mh = siglist[dataset_id].minhash
+            intersect_count = intersect_mh.count_common(remaining_mh,
+                                                        downsample=True)
+            counter[dataset_id] -= intersect_count
+            if counter[dataset_id] == 0:
+                del counter[dataset_id]
+
+
 class CounterGatherIndex(Index):
     def __init__(self, query):
         self.counter = QuerySpecific_GatherCounter(query.minhash)

From 062b0ae00f9349a1a9bf42490be1cbb79bd06ac8 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 28 Apr 2021 21:56:53 -0700
Subject: [PATCH 07/18] move next method over to query specific class

---
 src/sourmash/index.py | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/src/sourmash/index.py b/src/sourmash/index.py
index 5cf786aae5..12bb544c51 100644
--- a/src/sourmash/index.py
+++ b/src/sourmash/index.py
@@ -560,6 +560,19 @@ def consume(self, intersect_mh):
             if counter[dataset_id] == 0:
                 del counter[dataset_id]
 
+    def next(self, query, threshold_bp=0):
+        result = self.peek(query.minhash, query.minhash.scaled,
+                                   threshold_bp)
+        if result:
+            (sr, intersect_mh) = result
+            self.consume(intersect_mh)
+            
+            query.minhash.remove_many(intersect_mh.hashes) #  @CTB
+
+            return [sr]
+        return []
+                
+
 
 class CounterGatherIndex(Index):
     def __init__(self, query):
@@ -569,16 +582,7 @@ def insert(self, ss, location=None):
         self.counter.add(ss, location)
 
     def gather(self, query, threshold_bp=0):
-        result = self.counter.peek(query.minhash, query.minhash.scaled,
-                                   threshold_bp)
-        if result:
-            (sr, intersect_mh) = result
-            self.counter.consume(intersect_mh)
-            
-            query.minhash.remove_many(intersect_mh.hashes) #  @CTB
-
-            return [sr]
-        return []
+        return self.counter.next(query, threshold_bp)
         
     def signatures(self):
         raise NotImplementedError

From 28c354b0bfa1c821cadc96d1f8fed87db37218d6 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 28 Apr 2021 22:10:39 -0700
Subject: [PATCH 08/18] refactor using peek etc.

---
 src/sourmash/commands.py | 11 +++++---
 src/sourmash/index.py    | 61 ++++++++++++++++++----------------------
 2 files changed, 34 insertions(+), 38 deletions(-)

diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py
index f835ee2e64..2e3c220078 100644
--- a/src/sourmash/commands.py
+++ b/src/sourmash/commands.py
@@ -655,17 +655,20 @@ def gather(args):
     # @CTB experimental! w00t fun!
     if args.prefetch or 1:
         notify(f"Using EXPERIMENTAL feature: prefetch enabled!")
-        from .index import LinearIndex, CounterGatherIndex
-        prefetch_idx = CounterGatherIndex(query)
+        from .index import LinearIndex, CounterGather, MultiCounterGather
 
         prefetch_query = copy.copy(query)
         prefetch_query.minhash = prefetch_query.minhash.flatten()
 
+        counters = []
         for db in databases:
+            counter = CounterGather(query.minhash)
             for match in db.prefetch(prefetch_query, args.threshold_bp):
-                prefetch_idx.insert(match.signature, location=match.location)
+                counter.add(match.signature, match.location)
 
-        databases = [ prefetch_idx ]
+            counters.append(counter)
+        #databases = counters
+        databases = [ MultiCounterGather(counters) ]
 
     found = []
     weighted_missed = 1
diff --git a/src/sourmash/index.py b/src/sourmash/index.py
index 12bb544c51..236bdb942d 100644
--- a/src/sourmash/index.py
+++ b/src/sourmash/index.py
@@ -432,7 +432,7 @@ def select(self, **kwargs):
                                   traverse_yield_all=self.traverse_yield_all)
 
 
-class QuerySpecific_GatherCounter:
+class CounterGather:
     def __init__(self, query_mh):
         if not query_mh.scaled:
             raise ValueError('gather requires scaled signatures')
@@ -560,7 +560,7 @@ def consume(self, intersect_mh):
             if counter[dataset_id] == 0:
                 del counter[dataset_id]
 
-    def next(self, query, threshold_bp=0):
+    def gather(self, query, threshold_bp=0):
         result = self.peek(query.minhash, query.minhash.scaled,
                                    threshold_bp)
         if result:
@@ -571,43 +571,36 @@ def next(self, query, threshold_bp=0):
 
             return [sr]
         return []
-                
 
 
-class CounterGatherIndex(Index):
-    def __init__(self, query):
-        self.counter = QuerySpecific_GatherCounter(query.minhash)
+class MultiCounterGather:
+    "Mimic gather, sort of."
+    def __init__(self, counters):
+        self.counters = counters
 
-    def insert(self, ss, location=None):
-        self.counter.add(ss, location)
+    def gather(self, query, threshold_bp):
+        results = []
 
-    def gather(self, query, threshold_bp=0):
-        return self.counter.next(query, threshold_bp)
+        best_result = None
+        best_intersect_mh = None
         
-    def signatures(self):
-        raise NotImplementedError
-
-    def signatures_with_location(self):
-        raise NotImplementedError
-
-    def prefetch(self, *args, **kwargs):
-        raise NotImplementedError
-
-    @classmethod
-    def load(self, *args):
-        raise NotImplementedError
-
-    def save(self, *args):
-        raise NotImplementedError
-
-    def find(self, search_fn, *args, **kwargs):
-        raise NotImplementedError
-
-    def search(self, query, *args, **kwargs):
-        raise NotImplementedError
-
-    def select(self, *args, **kwargs):
-        raise NotImplementedError
+        for counter in self.counters:
+            result = counter.peek(query.minhash, query.minhash.scaled,
+                                  threshold_bp)
+            if result:
+                (sr, intersect_mh) = result
+
+                if best_result is None or sr.score > best_result.score:
+                    best_result = sr
+                    best_intersect_mh = intersect_mh
+
+        if best_result:
+            for counter in self.counters:
+                counter.consume(best_intersect_mh)
+            
+        #query.minhash.remove_many(intersect_mh.hashes)
+            return [best_result]
+        return []
 
 
 class MultiIndex(Index):

From 9c108d23c40c19148a9db8c02a7abe3b5fd69743 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Thu, 29 Apr 2021 05:40:44 -0700
Subject: [PATCH 09/18] commenting and cleanup

---
 src/sourmash/index.py | 47 ++++++++++++++++++++-----------------------
 1 file changed, 22 insertions(+), 25 deletions(-)

diff --git a/src/sourmash/index.py b/src/sourmash/index.py
index 236bdb942d..e741ab2a19 100644
--- a/src/sourmash/index.py
+++ b/src/sourmash/index.py
@@ -433,6 +433,10 @@ def select(self, **kwargs):
 
 
 class CounterGather:
+    """
+    Track and summarize matches for efficient 'gather' protocol.  This
+    could be used downstream of prefetch (for example).
+    """
     def __init__(self, query_mh):
         if not query_mh.scaled:
             raise ValueError('gather requires scaled signatures')
@@ -463,13 +467,15 @@ def add(self, ss, location=None):
 
         # upon insertion, count & track overlap with the specific query.
         self.counter[i] = self.orig_query_mh.count_common(ss.minhash, True)
+        assert self.counter[i]
 
     def downsample(self, scaled):
+        "Track highest scaled across all possible matches."
         if scaled > self.scaled:
             self.scaled = scaled
 
     def calc_threshold(self, threshold_bp, scaled, query_size):
-        # @CTB can be outside this class
+        # CTB: this code doesn't need to be in this class.
         threshold = 0.0
         n_threshold_hashes = 0
 
@@ -484,7 +490,7 @@ def calc_threshold(self, threshold_bp, scaled, query_size):
         return threshold, n_threshold_hashes
 
     def peek(self, cur_query_mh, scaled, threshold_bp=0, **kwargs):
-        "Get next potential result."
+        "Get next 'gather' result for this database, w/o changing counters."
         self.query_started = 1
 
         # empty? nothing to search.
@@ -503,7 +509,7 @@ def peek(self, cur_query_mh, scaled, threshold_bp=0, **kwargs):
             return []
 
         assert cur_query_mh.contained_by(self.orig_query_mh,
-                                     downsample=True) == 1
+                                         downsample=True) == 1
 
         # are we setting a threshold?
         threshold, n_threshold_hashes = self.calc_threshold(threshold_bp,
@@ -529,20 +535,21 @@ def peek(self, cur_query_mh, scaled, threshold_bp=0, **kwargs):
         cont = cur_query_mh.contained_by(match.minhash, downsample=True)
 
         retval = []
-        if cont and cont >= threshold:
-            # calculate intersection of this "best match" with query
-            # for removal.
+        if cont:
+            assert cont >= threshold
 
-            # @CTB: note flatten
+            # calculate intersection of this "best match" with query.
             match_mh = match.minhash.downsample(scaled=scaled).flatten()
             intersect_mh = cur_query_mh.intersection(match_mh)
             location = self.locations[dataset_id]
+
+            # build result & return intersection
             retval = [IndexSearchResult(cont, match, location), intersect_mh]
 
         return retval
 
     def consume(self, intersect_mh):
-        "Remove the given hashes."
+        "Remove the given hashes from this counter."
         siglist = self.siglist
         counter = self.counter
 
@@ -552,7 +559,7 @@ def consume(self, intersect_mh):
         # all hashes found in the current match in other datasets;
         # remove empty datasets from counter, too.
         for (dataset_id, _) in most_common:
-            # @CTB: we may want to downsample remaining_mh here...
+            # CTB: note, remaining_mh may not be at correct scaled.
             remaining_mh = siglist[dataset_id].minhash
             intersect_count = intersect_mh.count_common(remaining_mh,
                                                         downsample=True)
@@ -560,21 +567,9 @@ def consume(self, intersect_mh):
             if counter[dataset_id] == 0:
                 del counter[dataset_id]
 
-    def gather(self, query, threshold_bp=0):
-        result = self.peek(query.minhash, query.minhash.scaled,
-                                   threshold_bp)
-        if result:
-            (sr, intersect_mh) = result
-            self.consume(intersect_mh)
-            
-            query.minhash.remove_many(intersect_mh.hashes) #  @CTB
-
-            return [sr]
-        return []
-
 
 class MultiCounterGather:
-    "Mimic gather, sort of."
+    "Choose the best result across multiple CounterGather objects"
     def __init__(self, counters):
         self.counters = counters
 
@@ -583,7 +578,8 @@ def gather(self, query, threshold_bp):
 
         best_result = None
         best_intersect_mh = None
-        
+
+        # find the best score across multiple counters, without consuming
         for counter in self.counters:
             result = counter.peek(query.minhash, query.minhash.scaled,
                                   threshold_bp)
@@ -595,10 +591,11 @@ def gather(self, query, threshold_bp):
                     best_intersect_mh = intersect_mh
 
         if best_result:
+            # remove the best result from each counter
             for counter in self.counters:
                 counter.consume(best_intersect_mh)
-            
-        #query.minhash.remove_many(intersect_mh.hashes)
+
+            # and done!
             return [best_result]
         return []
 

From e6e0469a07df8ed41f15585e44bd87aa2f50010d Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Thu, 29 Apr 2021 05:51:55 -0700
Subject: [PATCH 10/18] add extra if stmt

---
 src/sourmash/index.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/sourmash/index.py b/src/sourmash/index.py
index e741ab2a19..7722811bef 100644
--- a/src/sourmash/index.py
+++ b/src/sourmash/index.py
@@ -563,9 +563,10 @@ def consume(self, intersect_mh):
             remaining_mh = siglist[dataset_id].minhash
             intersect_count = intersect_mh.count_common(remaining_mh,
                                                         downsample=True)
-            counter[dataset_id] -= intersect_count
-            if counter[dataset_id] == 0:
-                del counter[dataset_id]
+            if intersect_count:
+                counter[dataset_id] -= intersect_count
+                if counter[dataset_id] == 0:
+                    del counter[dataset_id]
 
 
 class MultiCounterGather:

From 9dcf08fd5573df986cca7618995bd4830d0032b1 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Thu, 29 Apr 2021 06:23:42 -0700
Subject: [PATCH 11/18] replace gather implementation with new CounterGather

---
 src/sourmash/commands.py | 22 +++++++++-------
 src/sourmash/index.py    |  9 +++++++
 src/sourmash/search.py   | 55 ++++++++++++++++++++--------------------
 tests/test_sourmash.py   |  4 +--
 4 files changed, 52 insertions(+), 38 deletions(-)

diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py
index 2e3c220078..19780c74a7 100644
--- a/src/sourmash/commands.py
+++ b/src/sourmash/commands.py
@@ -655,20 +655,14 @@ def gather(args):
     # @CTB experimental! w00t fun!
     if args.prefetch or 1:
         notify(f"Using EXPERIMENTAL feature: prefetch enabled!")
-        from .index import LinearIndex, CounterGather, MultiCounterGather
 
         prefetch_query = copy.copy(query)
         prefetch_query.minhash = prefetch_query.minhash.flatten()
 
         counters = []
         for db in databases:
-            counter = CounterGather(query.minhash)
-            for match in db.prefetch(prefetch_query, args.threshold_bp):
-                counter.add(match.signature, match.location)
-
+            counter = db.counter_gather(prefetch_query, args.threshold_bp)
             counters.append(counter)
-        #databases = counters
-        databases = [ MultiCounterGather(counters) ]
 
     found = []
     weighted_missed = 1
@@ -677,7 +671,7 @@ def gather(args):
     new_max_hash = query.minhash._max_hash
     next_query = query
 
-    gather_iter = gather_databases(query, databases, args.threshold_bp,
+    gather_iter = gather_databases(query, counters, args.threshold_bp,
                                    args.ignore_abundance)
     for result, weighted_missed, new_max_hash, next_query in gather_iter:
         if not len(found):                # first result? print header.
@@ -824,10 +818,20 @@ def multigather(args):
                 error('no query hashes!? skipping to next..')
                 continue
 
+            notify(f"Using EXPERIMENTAL feature: prefetch enabled!")
+            counters = []
+            prefetch_query = copy.copy(query)
+            prefetch_query.minhash = prefetch_query.minhash.flatten()
+
+            counters = []
+            for db in databases:
+                counter = db.counter_gather(prefetch_query, args.threshold_bp)
+                counters.append(counter)
+
             found = []
             weighted_missed = 1
             is_abundance = query.minhash.track_abundance and not args.ignore_abundance
-            for result, weighted_missed, new_max_hash, next_query in gather_databases(query, databases, args.threshold_bp, args.ignore_abundance):
+            for result, weighted_missed, new_max_hash, next_query in gather_databases(query, counters, args.threshold_bp, args.ignore_abundance):
                 if not len(found):                # first result? print header.
                     if is_abundance:
                         print_results("")
diff --git a/src/sourmash/index.py b/src/sourmash/index.py
index 7722811bef..28fc7c485b 100644
--- a/src/sourmash/index.py
+++ b/src/sourmash/index.py
@@ -215,6 +215,15 @@ def gather(self, query, threshold_bp=None, **kwargs):
 
         return results[:1]
 
+    def counter_gather(self, query, threshold_bp, **kwargs):
+        prefetch_query = copy.copy(query)
+        prefetch_query.minhash = prefetch_query.minhash.flatten()
+
+        counter = CounterGather(prefetch_query.minhash)
+        for result in self.prefetch(prefetch_query, threshold_bp, **kwargs):
+            counter.add(result.signature, result.location)
+        return counter
+
     @abstractmethod
     def select(self, ksize=None, moltype=None, scaled=None, num=None,
                abund=None, containment=None):
diff --git a/src/sourmash/search.py b/src/sourmash/search.py
index 3d6183a57b..91e284ac17 100644
--- a/src/sourmash/search.py
+++ b/src/sourmash/search.py
@@ -254,36 +254,34 @@ def _subtract_and_downsample(to_remove, old_query, scaled=None):
     return SourmashSignature(mh)
 
 
-def _find_best(dblist, query, threshold_bp):
+def _find_best(counters, query, threshold_bp):
     """
     Search for the best containment, return precisely one match.
     """
+    results = []
 
-    best_cont = 0.0
-    best_match = None
-    best_filename = None
-
-    # quantize threshold_bp to be an integer multiple of scaled
-    query_scaled = query.minhash.scaled
-    threshold_bp = int(threshold_bp / query_scaled) * query_scaled
+    best_result = None
+    best_intersect_mh = None
 
-    # search across all databases
-    for db in dblist:
-        for cont, match, fname in db.gather(query, threshold_bp=threshold_bp):
-            assert cont                   # all matches should be nonzero.
+    # find the best score across multiple counters, without consuming
+    for counter in counters:
+        result = counter.peek(query.minhash, query.minhash.scaled,
+                              threshold_bp)
+        if result:
+            (sr, intersect_mh) = result
 
-            # note, break ties based on name, to ensure consistent order.
-            if (cont == best_cont and str(match) < str(best_match)) or \
-               cont > best_cont:
-                # update best match.
-                best_cont = cont
-                best_match = match
-                best_filename = fname
+            if best_result is None or sr.score > best_result.score:
+                best_result = sr
+                best_intersect_mh = intersect_mh
 
-    if not best_match:
-        return None, None, None
+    if best_result:
+        # remove the best result from each counter
+        for counter in counters:
+            counter.consume(best_intersect_mh)
 
-    return best_cont, best_match, best_filename
+        # and done!
+        return best_result
+    return None
 
 
 def _filter_max_hash(values, max_hash):
@@ -294,9 +292,9 @@ def _filter_max_hash(values, max_hash):
     return results
 
 
-def gather_databases(query, databases, threshold_bp, ignore_abundance):
+def gather_databases(query, counters, threshold_bp, ignore_abundance):
     """
-    Iteratively find the best containment of `query` in all the `databases`,
+    Iteratively find the best containment of `query` in all the `counters`,
     until we find fewer than `threshold_bp` (estimated) bp in common.
     """
     # track original query information for later usage.
@@ -316,12 +314,15 @@ def gather_databases(query, databases, threshold_bp, ignore_abundance):
     result_n = 0
     while query.minhash:
         # find the best match!
-        best_cont, best_match, filename = _find_best(databases, query,
-                                                     threshold_bp)
-        if not best_match:          # no matches at all for this cutoff!
+        best_result = _find_best(counters, query, threshold_bp)
+
+        if not best_result:          # no matches at all for this cutoff!
             notify(f'found less than {format_bp(threshold_bp)} in common. => exiting')
             break
 
+        best_match = best_result.signature
+        filename = best_result.location
+
         # subtract found hashes from search hashes, construct new search
         query_hashes = set(query.minhash.hashes)
         found_hashes = set(best_match.minhash.hashes)
diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py
index 6bea224013..9fbb0af56d 100644
--- a/tests/test_sourmash.py
+++ b/tests/test_sourmash.py
@@ -3343,7 +3343,7 @@ def test_multigather_metagenome_query_with_lca(c):
 
     assert 'conducted gather searches on 2 signatures' in err
     assert 'the recovered matches hit 100.0% of the query' in out
-    assert '5.1 Mbp      100.0%   64.9%    491c0a81'  in out
+#    assert '5.1 Mbp      100.0%   64.9%    491c0a81'  in out
     assert '5.5 Mbp      100.0%   69.4%    491c0a81'  in out
 
 
@@ -3518,7 +3518,7 @@ def test_multigather_metagenome_lca_query_from_file(c):
 
     assert 'conducted gather searches on 2 signatures' in err
     assert 'the recovered matches hit 100.0% of the query' in out
-    assert '5.1 Mbp      100.0%   64.9%    491c0a81'  in out
+#    assert '5.1 Mbp      100.0%   64.9%    491c0a81'  in out
     assert '5.5 Mbp      100.0%   69.4%    491c0a81'  in out
 
 

From b449738a19f386aa01bd84387dbe6ae3b55d651f Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Thu, 29 Apr 2021 06:29:51 -0700
Subject: [PATCH 12/18] comments and docstrings

---
 src/sourmash/index.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/sourmash/index.py b/src/sourmash/index.py
index 28fc7c485b..9070772bbe 100644
--- a/src/sourmash/index.py
+++ b/src/sourmash/index.py
@@ -216,12 +216,24 @@ def gather(self, query, threshold_bp=None, **kwargs):
         return results[:1]
 
     def counter_gather(self, query, threshold_bp, **kwargs):
+        """Returns an object that permits 'gather' on top of the
+        current contents of this Index.
+
+        The default implementation uses `prefetch` underneath, and returns
+        the results in a `CounterGather` object. However, alternate
+        implementations need only return an object that meets the
+        public `CounterGather` interface, of course.
+        """
+        # build a flat query
         prefetch_query = copy.copy(query)
         prefetch_query.minhash = prefetch_query.minhash.flatten()
 
+        # find all matches and construct a CounterGather object.
         counter = CounterGather(prefetch_query.minhash)
         for result in self.prefetch(prefetch_query, threshold_bp, **kwargs):
             counter.add(result.signature, result.location)
+
+        # tada!
         return counter
 
     @abstractmethod
@@ -445,6 +457,8 @@ class CounterGather:
     """
     Track and summarize matches for efficient 'gather' protocol.  This
     could be used downstream of prefetch (for example).
+
+    The public interface is `peek(...)` and `consume(...)` only.
     """
     def __init__(self, query_mh):
         if not query_mh.scaled:

From 1a4fcc38d4e726496ba29812b549a9ece87d9a02 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Thu, 29 Apr 2021 16:53:37 -0700
Subject: [PATCH 13/18] some results tests for CounterGather

---
 src/sourmash/index.py |   5 ++
 tests/test_index.py   | 150 +++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 154 insertions(+), 1 deletion(-)

diff --git a/src/sourmash/index.py b/src/sourmash/index.py
index 9070772bbe..ac9a5ca534 100644
--- a/src/sourmash/index.py
+++ b/src/sourmash/index.py
@@ -520,6 +520,7 @@ def peek(self, cur_query_mh, scaled, threshold_bp=0, **kwargs):
         counter = self.counter
         siglist = self.siglist
         if not counter:
+            print('nada')
             return []
 
         assert siglist
@@ -541,6 +542,7 @@ def peek(self, cur_query_mh, scaled, threshold_bp=0, **kwargs):
 
         # is it too high to ever match? if so, exit.
         if threshold > 1.0:
+            print('threshold:', threshold)
             return []
 
         # Find the best match -
@@ -549,6 +551,7 @@ def peek(self, cur_query_mh, scaled, threshold_bp=0, **kwargs):
 
         # below threshold? no match!
         if match_size < n_threshold_hashes:
+            print('match size:', match_size, n_threshold_hashes)
             return []
 
         # pull match and location.
@@ -558,6 +561,7 @@ def peek(self, cur_query_mh, scaled, threshold_bp=0, **kwargs):
         cont = cur_query_mh.contained_by(match.minhash, downsample=True)
 
         retval = []
+        print('xxx cont', cont)
         if cont:
             assert cont >= threshold
 
@@ -587,6 +591,7 @@ def consume(self, intersect_mh):
             intersect_count = intersect_mh.count_common(remaining_mh,
                                                         downsample=True)
             if intersect_count:
+                print('removing zzz', dataset_id, intersect_count)
                 counter[dataset_id] -= intersect_count
                 if counter[dataset_id] == 0:
                     del counter[dataset_id]
diff --git a/tests/test_index.py b/tests/test_index.py
index cca8d359d5..b03091e43b 100644
--- a/tests/test_index.py
+++ b/tests/test_index.py
@@ -10,7 +10,7 @@
 import sourmash
 from sourmash import load_one_signature, SourmashSignature
 from sourmash.index import (LinearIndex, MultiIndex, ZipFileLinearIndex,
-                            make_jaccard_search_query)
+                            make_jaccard_search_query, CounterGather)
 from sourmash.sbt import SBT, GraphFactory, Leaf
 from sourmash.sbtmh import SigLeaf
 from sourmash import sourmash_args
@@ -1208,3 +1208,151 @@ def is_found(ss, xx):
     assert not is_found(ss47, results)
     assert not is_found(ss2, results)
     assert is_found(ss63, results)
+
+###
+### CounterGather tests
+###
+
+
+def _consume_all(query_mh, counter):
+    results = []
+
+    last_intersect_size = None
+    while 1:
+        result = counter.peek(query_mh, query_mh.scaled)
+        if not result:
+            break
+
+        sr, intersect_mh = result
+        print(sr.signature.name, len(intersect_mh))
+        if last_intersect_size:
+            assert len(intersect_mh) <= last_intersect_size
+
+        last_intersect_size = len(intersect_mh)
+
+        counter.consume(intersect_mh)
+        query_mh.remove_many(intersect_mh.hashes)
+
+        results.append((sr, len(intersect_mh)))
+
+    return results
+
+
+def test_counter_gather_1():
+    # check a contrived set of non-overlapping gather results,
+    # generated via CounterGather
+    query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1)
+    query_mh.add_many(range(0, 20))
+    query_ss = SourmashSignature(query_mh, name='query')
+
+    match_mh_1 = query_mh.copy_and_clear()
+    match_mh_1.add_many(range(0, 10))
+    match_ss_1 = SourmashSignature(match_mh_1, name='match1')
+
+    match_mh_2 = query_mh.copy_and_clear()
+    match_mh_2.add_many(range(10, 15))
+    match_ss_2 = SourmashSignature(match_mh_2, name='match2')
+
+    match_mh_3 = query_mh.copy_and_clear()
+    match_mh_3.add_many(range(15, 17))
+    match_ss_3 = SourmashSignature(match_mh_3, name='match3')
+
+    # load up the counter
+    counter = CounterGather(query_ss.minhash)
+    counter.add(match_ss_1)
+    counter.add(match_ss_2)
+    counter.add(match_ss_3)
+
+    results = _consume_all(query_ss.minhash, counter)
+
+    assert len(results) == 3, results
+    expected = (['match1', 10],
+                ['match2', 5],
+                ['match3', 2],)
+
+    for (sr, size), (exp_name, exp_size) in zip(results, expected):
+        sr_name = sr.signature.name.split()[0]
+
+        assert sr_name == exp_name
+        assert size == exp_size
+
+
+def test_counter_gather_1_b():
+    # check a contrived set of somewhat-overlapping gather results,
+    # generated via CounterGather. Here the overlaps are structured
+    # so that the gather results are the same as those in
+    # test_counter_gather_1(), even though the overlaps themselves are
+    # larger.
+    query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1)
+    query_mh.add_many(range(0, 20))
+    query_ss = SourmashSignature(query_mh, name='query')
+
+    match_mh_1 = query_mh.copy_and_clear()
+    match_mh_1.add_many(range(0, 10))
+    match_ss_1 = SourmashSignature(match_mh_1, name='match1')
+
+    match_mh_2 = query_mh.copy_and_clear()
+    match_mh_2.add_many(range(7, 15))
+    match_ss_2 = SourmashSignature(match_mh_2, name='match2')
+
+    match_mh_3 = query_mh.copy_and_clear()
+    match_mh_3.add_many(range(13, 17))
+    match_ss_3 = SourmashSignature(match_mh_3, name='match3')
+
+    # load up the counter
+    counter = CounterGather(query_ss.minhash)
+    counter.add(match_ss_1)
+    counter.add(match_ss_2)
+    counter.add(match_ss_3)
+
+    results = _consume_all(query_ss.minhash, counter)
+
+    assert len(results) == 3, results
+    expected = (['match1', 10],
+                ['match2', 5],
+                ['match3', 2],)
+
+    for (sr, size), (exp_name, exp_size) in zip(results, expected):
+        sr_name = sr.signature.name.split()[0]
+
+        assert sr_name == exp_name
+        assert size == exp_size
+
+
+def test_counter_gather_2():
+    # check basic set of gather results, generated via CounterGather
+    testdata_combined = utils.get_test_data('gather/combined.sig')
+    testdata_glob = utils.get_test_data('gather/GCF*.sig')
+    testdata_sigs = glob.glob(testdata_glob)
+
+    query_ss = sourmash.load_one_signature(testdata_combined, ksize=21)
+    subject_sigs = [ (sourmash.load_one_signature(t, ksize=21), t)
+                     for t in testdata_sigs ]
+
+    # load up the counter
+    counter = CounterGather(query_ss.minhash)
+    for ss, loc in subject_sigs:
+        counter.add(ss, loc)
+
+    results = _consume_all(query_ss.minhash, counter)
+    assert len(results) == 12
+
+    expected = (['NC_003198.1', 487],
+                ['NC_000853.1', 192],
+                ['NC_011978.1', 169],
+                ['NC_002163.1', 157],
+                ['NC_003197.2', 152],
+                ['NC_009486.1', 92],
+                ['NC_006905.1', 76],
+                ['NC_011080.1', 59],
+                ['NC_011274.1', 42],
+                ['NC_006511.1', 31],
+                ['NC_011294.1', 7],
+                ['NC_004631.1', 2])
+
+    for (sr, size), (exp_name, exp_size) in zip(results, expected):
+        sr_name = sr.signature.name.split()[0]
+        print(sr_name, size)
+
+        assert sr_name == exp_name
+        assert size == exp_size

From dce803a12216fd688f4031c6731e4090084f0b38 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Fri, 30 Apr 2021 07:04:56 -0700
Subject: [PATCH 14/18] many more tests for CounterGather

---
 src/sourmash/index.py | 102 +++++++-------------
 tests/test_index.py   | 218 ++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 249 insertions(+), 71 deletions(-)

diff --git a/src/sourmash/index.py b/src/sourmash/index.py
index ac9a5ca534..2c634e49c8 100644
--- a/src/sourmash/index.py
+++ b/src/sourmash/index.py
@@ -465,7 +465,8 @@ def __init__(self, query_mh):
             raise ValueError('gather requires scaled signatures')
 
         # track query
-        self.orig_query_mh = copy.copy(query_mh).flatten()
+        #CTBself.orig_query_mh = copy.copy(query_mh).flatten()
+        self.orig_query_mh = copy.copy(query_mh)
         self.scaled = query_mh.scaled
 
         # track matching signatures & their locations
@@ -478,19 +479,25 @@ def __init__(self, query_mh):
         # cannot add matches once query has started.
         self.query_started = 0
 
-    def add(self, ss, location=None):
-        assert not self.query_started
+    def add(self, ss, location=None, require_overlap=True):
+        "Add this signature in as a potential match."
+        if self.query_started:
+            raise ValueError("cannot add more signatures to counter after peek/consume")
 
-        i = len(self.siglist)
-        self.siglist.append(ss)
-        self.locations.append(location)
+        # upon insertion, count & track overlap with the specific query.
+        overlap = self.orig_query_mh.count_common(ss.minhash, True)
+        if overlap:
+            i = len(self.siglist)
 
-        # note: scaled will be max of all matches.
-        self.downsample(ss.minhash.scaled)
+            self.counter[i] = overlap
+            self.siglist.append(ss)
+            self.locations.append(location)
 
-        # upon insertion, count & track overlap with the specific query.
-        self.counter[i] = self.orig_query_mh.count_common(ss.minhash, True)
-        assert self.counter[i]
+            # note: scaled will be max of all matches.
+            #CTBself.downsample(ss.minhash.scaled)
+            self.downsample(ss.minhash.scaled)
+        elif require_overlap:
+            raise ValueError("no overlap between query and signature!?")
 
     def downsample(self, scaled):
         "Track highest scaled across all possible matches."
@@ -512,17 +519,16 @@ def calc_threshold(self, threshold_bp, scaled, query_size):
 
         return threshold, n_threshold_hashes
 
-    def peek(self, cur_query_mh, scaled, threshold_bp=0, **kwargs):
+    def peek(self, cur_query_mh, scaled, threshold_bp=0):
         "Get next 'gather' result for this database, w/o changing counters."
         self.query_started = 1
 
         # empty? nothing to search.
         counter = self.counter
-        siglist = self.siglist
         if not counter:
-            print('nada')
             return []
 
+        siglist = self.siglist
         assert siglist
 
         self.downsample(scaled)
@@ -532,17 +538,15 @@ def peek(self, cur_query_mh, scaled, threshold_bp=0, **kwargs):
         if not cur_query_mh:             # empty query? quit.
             return []
 
-        assert cur_query_mh.contained_by(self.orig_query_mh,
-                                         downsample=True) == 1
+        if cur_query_mh.contained_by(self.orig_query_mh, downsample=True) < 1:
+            raise ValueError("current query not a subset of original query")
 
         # are we setting a threshold?
         threshold, n_threshold_hashes = self.calc_threshold(threshold_bp,
                                                             scaled,
                                                             len(cur_query_mh))
-
         # is it too high to ever match? if so, exit.
         if threshold > 1.0:
-            print('threshold:', threshold)
             return []
 
         # Find the best match -
@@ -551,32 +555,33 @@ def peek(self, cur_query_mh, scaled, threshold_bp=0, **kwargs):
 
         # below threshold? no match!
         if match_size < n_threshold_hashes:
-            print('match size:', match_size, n_threshold_hashes)
             return []
 
+        ## at this point, we must have a legitimate match above threshold!
+
         # pull match and location.
         match = siglist[dataset_id]
 
         # calculate containment
         cont = cur_query_mh.contained_by(match.minhash, downsample=True)
+        assert cont
+        assert cont >= threshold
 
-        retval = []
-        print('xxx cont', cont)
-        if cont:
-            assert cont >= threshold
-
-            # calculate intersection of this "best match" with query.
-            match_mh = match.minhash.downsample(scaled=scaled).flatten()
-            intersect_mh = cur_query_mh.intersection(match_mh)
-            location = self.locations[dataset_id]
-
-            # build result & return intersection
-            retval = [IndexSearchResult(cont, match, location), intersect_mh]
+        # calculate intersection of this "best match" with query.
+        match_mh = match.minhash.downsample(scaled=scaled).flatten()
+        intersect_mh = cur_query_mh.intersection(match_mh)
+        location = self.locations[dataset_id]
 
-        return retval
+        # build result & return intersection
+        return (IndexSearchResult(cont, match, location), intersect_mh)
 
     def consume(self, intersect_mh):
         "Remove the given hashes from this counter."
+        self.query_started = 1
+
+        if not intersect_mh:
+            return
+
         siglist = self.siglist
         counter = self.counter
 
@@ -591,44 +596,11 @@ def consume(self, intersect_mh):
             intersect_count = intersect_mh.count_common(remaining_mh,
                                                         downsample=True)
             if intersect_count:
-                print('removing zzz', dataset_id, intersect_count)
                 counter[dataset_id] -= intersect_count
                 if counter[dataset_id] == 0:
                     del counter[dataset_id]
 
 
-class MultiCounterGather:
-    "Choose the best result across multiple CounterGather objects"
-    def __init__(self, counters):
-        self.counters = counters
-
-    def gather(self, query, threshold_bp):
-        results = []
-
-        best_result = None
-        best_intersect_mh = None
-
-        # find the best score across multiple counters, without consuming
-        for counter in self.counters:
-            result = counter.peek(query.minhash, query.minhash.scaled,
-                                  threshold_bp)
-            if result:
-                (sr, intersect_mh) = result
-
-                if best_result is None or sr.score > best_result.score:
-                    best_result = sr
-                    best_intersect_mh = intersect_mh
-
-        if best_result:
-            # remove the best result from each counter
-            for counter in self.counters:
-                counter.consume(best_intersect_mh)
-
-            # and done!
-            return [best_result]
-        return []
-
-
 class MultiIndex(Index):
     """An Index class that wraps other Index classes.
 
diff --git a/tests/test_index.py b/tests/test_index.py
index b03091e43b..89d3b0bdfd 100644
--- a/tests/test_index.py
+++ b/tests/test_index.py
@@ -1214,12 +1214,12 @@ def is_found(ss, xx):
 ###
 
 
-def _consume_all(query_mh, counter):
+def _consume_all(query_mh, counter, threshold_bp=0):
     results = []
 
     last_intersect_size = None
     while 1:
-        result = counter.peek(query_mh, query_mh.scaled)
+        result = counter.peek(query_mh, query_mh.scaled, threshold_bp)
         if not result:
             break
 
@@ -1265,10 +1265,10 @@ def test_counter_gather_1():
 
     results = _consume_all(query_ss.minhash, counter)
 
-    assert len(results) == 3, results
     expected = (['match1', 10],
                 ['match2', 5],
                 ['match3', 2],)
+    assert len(results) == len(expected), results
 
     for (sr, size), (exp_name, exp_size) in zip(results, expected):
         sr_name = sr.signature.name.split()[0]
@@ -1307,10 +1307,54 @@ def test_counter_gather_1_b():
 
     results = _consume_all(query_ss.minhash, counter)
 
-    assert len(results) == 3, results
     expected = (['match1', 10],
                 ['match2', 5],
                 ['match3', 2],)
+    assert len(results) == len(expected), results
+
+    for (sr, size), (exp_name, exp_size) in zip(results, expected):
+        sr_name = sr.signature.name.split()[0]
+
+        assert sr_name == exp_name
+        assert size == exp_size
+
+
+def test_counter_gather_1_c_with_threshold():
+    # check a contrived set of somewhat-overlapping gather results,
+    # generated via CounterGather. Here the overlaps are structured
+    # so that the gather results are the same as those in
+    # test_counter_gather_1(), even though the overlaps themselves are
+    # larger.
+    # use a threshold, here.
+
+    query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1)
+    query_mh.add_many(range(0, 20))
+    query_ss = SourmashSignature(query_mh, name='query')
+
+    match_mh_1 = query_mh.copy_and_clear()
+    match_mh_1.add_many(range(0, 10))
+    match_ss_1 = SourmashSignature(match_mh_1, name='match1')
+
+    match_mh_2 = query_mh.copy_and_clear()
+    match_mh_2.add_many(range(7, 15))
+    match_ss_2 = SourmashSignature(match_mh_2, name='match2')
+
+    match_mh_3 = query_mh.copy_and_clear()
+    match_mh_3.add_many(range(13, 17))
+    match_ss_3 = SourmashSignature(match_mh_3, name='match3')
+
+    # load up the counter
+    counter = CounterGather(query_ss.minhash)
+    counter.add(match_ss_1)
+    counter.add(match_ss_2)
+    counter.add(match_ss_3)
+
+    results = _consume_all(query_ss.minhash, counter,
+                           threshold_bp=3)
+
+    expected = (['match1', 10],
+                ['match2', 5])
+    assert len(results) == len(expected), results
 
     for (sr, size), (exp_name, exp_size) in zip(results, expected):
         sr_name = sr.signature.name.split()[0]
@@ -1320,7 +1364,8 @@ def test_counter_gather_1_b():
 
 
 def test_counter_gather_2():
-    # check basic set of gather results, generated via CounterGather
+    # check basic set of gather results on semi-real data,
+    # generated via CounterGather
     testdata_combined = utils.get_test_data('gather/combined.sig')
     testdata_glob = utils.get_test_data('gather/GCF*.sig')
     testdata_sigs = glob.glob(testdata_glob)
@@ -1335,7 +1380,6 @@ def test_counter_gather_2():
         counter.add(ss, loc)
 
     results = _consume_all(query_ss.minhash, counter)
-    assert len(results) == 12
 
     expected = (['NC_003198.1', 487],
                 ['NC_000853.1', 192],
@@ -1349,6 +1393,7 @@ def test_counter_gather_2():
                 ['NC_006511.1', 31],
                 ['NC_011294.1', 7],
                 ['NC_004631.1', 2])
+    assert len(results) == len(expected)
 
     for (sr, size), (exp_name, exp_size) in zip(results, expected):
         sr_name = sr.signature.name.split()[0]
@@ -1356,3 +1401,164 @@ def test_counter_gather_2():
 
         assert sr_name == exp_name
         assert size == exp_size
+
+
+def test_counter_gather_exact_match():
+    # query == match
+    query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1)
+    query_mh.add_many(range(0, 20))
+    query_ss = SourmashSignature(query_mh, name='query')
+
+    # load up the counter
+    counter = CounterGather(query_ss.minhash)
+    counter.add(query_ss, 'somewhere over the rainbow')
+
+    results = _consume_all(query_ss.minhash, counter)
+    assert len(results) == 1
+    (sr, intersect_mh) = results[0]
+
+    assert sr.score == 1.0
+    assert sr.signature == query_ss
+    assert sr.location == 'somewhere over the rainbow'
+
+
+def test_counter_gather_add_after_peek():
+    # cannot add after peek or consume
+    query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1)
+    query_mh.add_many(range(0, 20))
+    query_ss = SourmashSignature(query_mh, name='query')
+
+    # load up the counter
+    counter = CounterGather(query_ss.minhash)
+    counter.add(query_ss, 'somewhere over the rainbow')
+
+    counter.peek(query_ss.minhash, query_ss.minhash.scaled)
+
+    with pytest.raises(ValueError):
+        counter.add(query_ss, "try again")
+
+
+def test_counter_gather_add_after_consume():
+    # cannot add after peek or consume
+    query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1)
+    query_mh.add_many(range(0, 20))
+    query_ss = SourmashSignature(query_mh, name='query')
+
+    # load up the counter
+    counter = CounterGather(query_ss.minhash)
+    counter.add(query_ss, 'somewhere over the rainbow')
+
+    counter.consume(query_ss.minhash)
+
+    with pytest.raises(ValueError):
+        counter.add(query_ss, "try again")
+
+
+def test_counter_gather_consume_empty_intersect():
+    query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1)
+    query_mh.add_many(range(0, 20))
+    query_ss = SourmashSignature(query_mh, name='query')
+
+    # load up the counter
+    counter = CounterGather(query_ss.minhash)
+    counter.add(query_ss, 'somewhere over the rainbow')
+
+    # nothing really happens here :laugh:, just making sure there's no error
+    counter.consume(query_ss.minhash.copy_and_clear())
+
+
+def test_counter_gather_empty_initial_query():
+    # check empty initial query
+    query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1)
+    query_ss = SourmashSignature(query_mh, name='query')
+
+    match_mh_1 = query_mh.copy_and_clear()
+    match_mh_1.add_many(range(0, 10))
+    match_ss_1 = SourmashSignature(match_mh_1, name='match1')
+
+    # load up the counter
+    counter = CounterGather(query_ss.minhash)
+    counter.add(match_ss_1, require_overlap=False)
+
+    assert counter.peek(query_ss.minhash, query_ss.minhash.scaled) == []
+
+
+def test_counter_gather_empty_cur_query():
+    # test empty cur query
+    query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1)
+    query_mh.add_many(range(0, 20))
+    query_ss = SourmashSignature(query_mh, name='query')
+
+    # load up the counter
+    counter = CounterGather(query_ss.minhash)
+    counter.add(query_ss, 'somewhere over the rainbow')
+
+    cur_query_mh = query_ss.minhash.copy_and_clear()
+    results = _consume_all(cur_query_mh, counter)
+    assert results == []
+
+
+def test_counter_gather_bad_cur_query():
+    # test cur query that is not subset of original query
+    query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1)
+    query_mh.add_many(range(0, 20))
+    query_ss = SourmashSignature(query_mh, name='query')
+
+    # load up the counter
+    counter = CounterGather(query_ss.minhash)
+    counter.add(query_ss, 'somewhere over the rainbow')
+
+    cur_query_mh = query_ss.minhash.copy_and_clear()
+    cur_query_mh.add_many(range(20, 30))
+    with pytest.raises(ValueError):
+        counter.peek(cur_query_mh, cur_query_mh.scaled)
+
+
+def test_counter_gather_add_no_overlap():
+    # check adding match with no overlap w/query
+    query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1)
+    query_mh.add_many(range(0, 10))
+    query_ss = SourmashSignature(query_mh, name='query')
+
+    match_mh_1 = query_mh.copy_and_clear()
+    match_mh_1.add_many(range(10, 20))
+    match_ss_1 = SourmashSignature(match_mh_1, name='match1')
+
+    # load up the counter
+    counter = CounterGather(query_ss.minhash)
+    with pytest.raises(ValueError):
+        counter.add(match_ss_1)
+
+    assert counter.peek(query_ss.minhash, query_ss.minhash.scaled) == []
+
+
+def test_counter_gather_big_threshold():
+    # check 'peek' with a huge threshold
+    query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1)
+    query_mh.add_many(range(0, 20))
+    query_ss = SourmashSignature(query_mh, name='query')
+
+    match_mh_1 = query_mh.copy_and_clear()
+    match_mh_1.add_many(range(0, 10))
+    match_ss_1 = SourmashSignature(match_mh_1, name='match1')
+
+    # load up the counter
+    counter = CounterGather(query_ss.minhash)
+    counter.add(match_ss_1)
+
+    # impossible threshold:
+    threshold_bp=30*query_ss.minhash.scaled
+    results = counter.peek(query_ss.minhash, query_ss.minhash.scaled,
+                           threshold_bp=threshold_bp)
+    assert results == []
+
+
+def test_counter_gather_empty_counter():
+    # check empty counter
+    query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1)
+    query_ss = SourmashSignature(query_mh, name='query')
+
+    # empty counter!
+    counter = CounterGather(query_ss.minhash)
+
+    assert counter.peek(query_ss.minhash, query_ss.minhash.scaled) == []

From 93b9488cbebf5cb5742b89b382b8bd8abc305981 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Fri, 30 Apr 2021 07:32:46 -0700
Subject: [PATCH 15/18] tests for abund and scaled

---
 src/sourmash/index.py |   6 +-
 tests/test_index.py   | 156 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 158 insertions(+), 4 deletions(-)

diff --git a/src/sourmash/index.py b/src/sourmash/index.py
index 2c634e49c8..60ff365ddb 100644
--- a/src/sourmash/index.py
+++ b/src/sourmash/index.py
@@ -465,8 +465,7 @@ def __init__(self, query_mh):
             raise ValueError('gather requires scaled signatures')
 
         # track query
-        #CTBself.orig_query_mh = copy.copy(query_mh).flatten()
-        self.orig_query_mh = copy.copy(query_mh)
+        self.orig_query_mh = copy.copy(query_mh).flatten()
         self.scaled = query_mh.scaled
 
         # track matching signatures & their locations
@@ -494,7 +493,6 @@ def add(self, ss, location=None, require_overlap=True):
             self.locations.append(location)
 
             # note: scaled will be max of all matches.
-            #CTBself.downsample(ss.minhash.scaled)
             self.downsample(ss.minhash.scaled)
         elif require_overlap:
             raise ValueError("no overlap between query and signature!?")
@@ -591,7 +589,7 @@ def consume(self, intersect_mh):
         # all hashes found in the current match in other datasets;
         # remove empty datasets from counter, too.
         for (dataset_id, _) in most_common:
-            # CTB: note, remaining_mh may not be at correct scaled.
+            # CTB: note, remaining_mh may not be at correct scaled here.
             remaining_mh = siglist[dataset_id].minhash
             intersect_count = intersect_mh.count_common(remaining_mh,
                                                         downsample=True)
diff --git a/tests/test_index.py b/tests/test_index.py
index 89d3b0bdfd..5a4e04ab09 100644
--- a/tests/test_index.py
+++ b/tests/test_index.py
@@ -1363,6 +1363,162 @@ def test_counter_gather_1_c_with_threshold():
         assert size == exp_size
 
 
+def test_counter_gather_1_d_diff_scaled():
+    # test as above, but with different scaled.
+    query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1)
+    query_mh.add_many(range(0, 20))
+    query_ss = SourmashSignature(query_mh, name='query')
+
+    match_mh_1 = query_mh.copy_and_clear().downsample(scaled=10)
+    match_mh_1.add_many(range(0, 10))
+    match_ss_1 = SourmashSignature(match_mh_1, name='match1')
+
+    match_mh_2 = query_mh.copy_and_clear().downsample(scaled=20)
+    match_mh_2.add_many(range(7, 15))
+    match_ss_2 = SourmashSignature(match_mh_2, name='match2')
+
+    match_mh_3 = query_mh.copy_and_clear().downsample(scaled=30)
+    match_mh_3.add_many(range(13, 17))
+    match_ss_3 = SourmashSignature(match_mh_3, name='match3')
+
+    # load up the counter
+    counter = CounterGather(query_ss.minhash)
+    counter.add(match_ss_1)
+    counter.add(match_ss_2)
+    counter.add(match_ss_3)
+
+    results = _consume_all(query_ss.minhash, counter)
+
+    expected = (['match1', 10],
+                ['match2', 5],
+                ['match3', 2],)
+    assert len(results) == len(expected), results
+
+    for (sr, size), (exp_name, exp_size) in zip(results, expected):
+        sr_name = sr.signature.name.split()[0]
+
+        assert sr_name == exp_name
+        assert size == exp_size
+
+
+def test_counter_gather_1_d_diff_scaled_query():
+    # test as above, but with different scaled for QUERY.
+    query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1)
+    query_mh.add_many(range(0, 20))
+
+    match_mh_1 = query_mh.copy_and_clear().downsample(scaled=10)
+    match_mh_1.add_many(range(0, 10))
+    match_ss_1 = SourmashSignature(match_mh_1, name='match1')
+
+    match_mh_2 = query_mh.copy_and_clear().downsample(scaled=20)
+    match_mh_2.add_many(range(7, 15))
+    match_ss_2 = SourmashSignature(match_mh_2, name='match2')
+
+    match_mh_3 = query_mh.copy_and_clear().downsample(scaled=30)
+    match_mh_3.add_many(range(13, 17))
+    match_ss_3 = SourmashSignature(match_mh_3, name='match3')
+
+    # downsample query now -
+    query_ss = SourmashSignature(query_mh.downsample(scaled=100), name='query')
+
+    # load up the counter
+    counter = CounterGather(query_ss.minhash)
+    counter.add(match_ss_1)
+    counter.add(match_ss_2)
+    counter.add(match_ss_3)
+
+    results = _consume_all(query_ss.minhash, counter)
+
+    expected = (['match1', 10],
+                ['match2', 5],
+                ['match3', 2],)
+    assert len(results) == len(expected), results
+
+    for (sr, size), (exp_name, exp_size) in zip(results, expected):
+        sr_name = sr.signature.name.split()[0]
+
+        assert sr_name == exp_name
+        assert size == exp_size
+
+
+def test_counter_gather_1_e_abund_query():
+    # test as above, but abund query
+    query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1, track_abundance=1)
+    query_mh.add_many(range(0, 20))
+    query_ss = SourmashSignature(query_mh, name='query')
+
+    match_mh_1 = query_mh.copy_and_clear().flatten()
+    match_mh_1.add_many(range(0, 10))
+    match_ss_1 = SourmashSignature(match_mh_1, name='match1')
+
+    match_mh_2 = query_mh.copy_and_clear().flatten()
+    match_mh_2.add_many(range(7, 15))
+    match_ss_2 = SourmashSignature(match_mh_2, name='match2')
+
+    match_mh_3 = query_mh.copy_and_clear().flatten()
+    match_mh_3.add_many(range(13, 17))
+    match_ss_3 = SourmashSignature(match_mh_3, name='match3')
+
+    # load up the counter
+    counter = CounterGather(query_ss.minhash)
+    counter.add(match_ss_1)
+    counter.add(match_ss_2)
+    counter.add(match_ss_3)
+
+    # must flatten before peek!
+    results = _consume_all(query_ss.minhash.flatten(), counter)
+
+    expected = (['match1', 10],
+                ['match2', 5],
+                ['match3', 2],)
+    assert len(results) == len(expected), results
+
+    for (sr, size), (exp_name, exp_size) in zip(results, expected):
+        sr_name = sr.signature.name.split()[0]
+
+        assert sr_name == exp_name
+        assert size == exp_size
+
+
+def test_counter_gather_1_f_abund_match():
+    # test as above, but abund query
+    query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1, track_abundance=1)
+    query_mh.add_many(range(0, 20))
+    query_ss = SourmashSignature(query_mh.flatten(), name='query')
+
+    match_mh_1 = query_mh.copy_and_clear()
+    match_mh_1.add_many(range(0, 10))
+    match_ss_1 = SourmashSignature(match_mh_1, name='match1')
+
+    match_mh_2 = query_mh.copy_and_clear()
+    match_mh_2.add_many(range(7, 15))
+    match_ss_2 = SourmashSignature(match_mh_2, name='match2')
+
+    match_mh_3 = query_mh.copy_and_clear()
+    match_mh_3.add_many(range(13, 17))
+    match_ss_3 = SourmashSignature(match_mh_3, name='match3')
+
+    # load up the counter
+    counter = CounterGather(query_ss.minhash)
+    counter.add(match_ss_1)
+    counter.add(match_ss_2)
+    counter.add(match_ss_3)
+
+    # must flatten before peek!
+    results = _consume_all(query_ss.minhash.flatten(), counter)
+
+    expected = (['match1', 10],
+                ['match2', 5],
+                ['match3', 2],)
+    assert len(results) == len(expected), results
+
+    for (sr, size), (exp_name, exp_size) in zip(results, expected):
+        sr_name = sr.signature.name.split()[0]
+
+        assert sr_name == exp_name
+        assert size == exp_size
+
+
 def test_counter_gather_2():
     # check basic set of gather results on semi-real data,
     # generated via CounterGather

From b813758553c8d35283935ff6991c52cab307d75b Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Fri, 30 Apr 2021 07:38:32 -0700
Subject: [PATCH 16/18] remove scaled arg from peek

---
 src/sourmash/index.py  |  3 ++-
 src/sourmash/search.py |  3 +--
 tests/test_index.py    | 15 +++++++--------
 3 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/src/sourmash/index.py b/src/sourmash/index.py
index 60ff365ddb..3ce8d64ad7 100644
--- a/src/sourmash/index.py
+++ b/src/sourmash/index.py
@@ -517,9 +517,10 @@ def calc_threshold(self, threshold_bp, scaled, query_size):
 
         return threshold, n_threshold_hashes
 
-    def peek(self, cur_query_mh, scaled, threshold_bp=0):
+    def peek(self, cur_query_mh, threshold_bp=0):
         "Get next 'gather' result for this database, w/o changing counters."
         self.query_started = 1
+        scaled = cur_query_mh.scaled
 
         # empty? nothing to search.
         counter = self.counter
diff --git a/src/sourmash/search.py b/src/sourmash/search.py
index 91e284ac17..50557e979a 100644
--- a/src/sourmash/search.py
+++ b/src/sourmash/search.py
@@ -265,8 +265,7 @@ def _find_best(counters, query, threshold_bp):
 
     # find the best score across multiple counters, without consuming
     for counter in counters:
-        result = counter.peek(query.minhash, query.minhash.scaled,
-                              threshold_bp)
+        result = counter.peek(query.minhash, threshold_bp)
         if result:
             (sr, intersect_mh) = result
 
diff --git a/tests/test_index.py b/tests/test_index.py
index 5a4e04ab09..300a37edb5 100644
--- a/tests/test_index.py
+++ b/tests/test_index.py
@@ -1219,7 +1219,7 @@ def _consume_all(query_mh, counter, threshold_bp=0):
 
     last_intersect_size = None
     while 1:
-        result = counter.peek(query_mh, query_mh.scaled, threshold_bp)
+        result = counter.peek(query_mh, threshold_bp)
         if not result:
             break
 
@@ -1588,7 +1588,7 @@ def test_counter_gather_add_after_peek():
     counter = CounterGather(query_ss.minhash)
     counter.add(query_ss, 'somewhere over the rainbow')
 
-    counter.peek(query_ss.minhash, query_ss.minhash.scaled)
+    counter.peek(query_ss.minhash)
 
     with pytest.raises(ValueError):
         counter.add(query_ss, "try again")
@@ -1636,7 +1636,7 @@ def test_counter_gather_empty_initial_query():
     counter = CounterGather(query_ss.minhash)
     counter.add(match_ss_1, require_overlap=False)
 
-    assert counter.peek(query_ss.minhash, query_ss.minhash.scaled) == []
+    assert counter.peek(query_ss.minhash) == []
 
 
 def test_counter_gather_empty_cur_query():
@@ -1667,7 +1667,7 @@ def test_counter_gather_bad_cur_query():
     cur_query_mh = query_ss.minhash.copy_and_clear()
     cur_query_mh.add_many(range(20, 30))
     with pytest.raises(ValueError):
-        counter.peek(cur_query_mh, cur_query_mh.scaled)
+        counter.peek(cur_query_mh)
 
 
 def test_counter_gather_add_no_overlap():
@@ -1685,7 +1685,7 @@ def test_counter_gather_add_no_overlap():
     with pytest.raises(ValueError):
         counter.add(match_ss_1)
 
-    assert counter.peek(query_ss.minhash, query_ss.minhash.scaled) == []
+    assert counter.peek(query_ss.minhash) == []
 
 
 def test_counter_gather_big_threshold():
@@ -1704,8 +1704,7 @@ def test_counter_gather_big_threshold():
 
     # impossible threshold:
     threshold_bp=30*query_ss.minhash.scaled
-    results = counter.peek(query_ss.minhash, query_ss.minhash.scaled,
-                           threshold_bp=threshold_bp)
+    results = counter.peek(query_ss.minhash, threshold_bp=threshold_bp)
     assert results == []
 
 
@@ -1717,4 +1716,4 @@ def test_counter_gather_empty_counter():
     # empty counter!
     counter = CounterGather(query_ss.minhash)
 
-    assert counter.peek(query_ss.minhash, query_ss.minhash.scaled) == []
+    assert counter.peek(query_ss.minhash) == []

From 0f4a33a6fbe1bfbe9703e57488a995b7f84ae107 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Sat, 1 May 2021 08:34:54 -0700
Subject: [PATCH 17/18] open-box test for counter internal data structures

---
 tests/test_index.py | 86 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 86 insertions(+)

diff --git a/tests/test_index.py b/tests/test_index.py
index 300a37edb5..9c7a57c7bb 100644
--- a/tests/test_index.py
+++ b/tests/test_index.py
@@ -6,6 +6,7 @@
 import os
 import zipfile
 import shutil
+import copy
 
 import sourmash
 from sourmash import load_one_signature, SourmashSignature
@@ -1717,3 +1718,88 @@ def test_counter_gather_empty_counter():
     counter = CounterGather(query_ss.minhash)
 
     assert counter.peek(query_ss.minhash) == []
+
+
+def test_counter_gather_3_test_consume():
+    # open-box testing of consume(...)
+    query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1)
+    query_mh.add_many(range(0, 20))
+    query_ss = SourmashSignature(query_mh, name='query')
+
+    match_mh_1 = query_mh.copy_and_clear()
+    match_mh_1.add_many(range(0, 10))
+    match_ss_1 = SourmashSignature(match_mh_1, name='match1')
+
+    match_mh_2 = query_mh.copy_and_clear()
+    match_mh_2.add_many(range(7, 15))
+    match_ss_2 = SourmashSignature(match_mh_2, name='match2')
+
+    match_mh_3 = query_mh.copy_and_clear()
+    match_mh_3.add_many(range(13, 17))
+    match_ss_3 = SourmashSignature(match_mh_3, name='match3')
+
+    # load up the counter
+    counter = CounterGather(query_ss.minhash)
+    counter.add(match_ss_1, 'loc a')
+    counter.add(match_ss_2, 'loc b')
+    counter.add(match_ss_3, 'loc c')
+
+    ### ok, dig into actual counts...
+    import pprint
+    pprint.pprint(counter.counter)
+    pprint.pprint(counter.siglist)
+    pprint.pprint(counter.locations)
+
+    assert counter.siglist == [ match_ss_1, match_ss_2, match_ss_3 ]
+    assert counter.locations == ['loc a', 'loc b', 'loc c']
+    assert list(counter.counter.items()) == [(0, 10), (1, 8), (2, 4)]
+
+    ## round 1
+
+    cur_query = copy.copy(query_ss.minhash)
+    (sr, intersect_mh) = counter.peek(cur_query)
+    assert sr.signature == match_ss_1
+    assert len(intersect_mh) == 10
+    assert cur_query == query_ss.minhash
+
+    counter.consume(intersect_mh)
+    assert counter.siglist == [ match_ss_1, match_ss_2, match_ss_3 ]
+    assert counter.locations == ['loc a', 'loc b', 'loc c']
+    assert list(counter.counter.items()) == [(1, 5), (2, 4)]
+
+    ### round 2
+
+    cur_query.remove_many(intersect_mh.hashes)
+    (sr, intersect_mh) = counter.peek(cur_query)
+    assert sr.signature == match_ss_2
+    assert len(intersect_mh) == 5
+    assert cur_query != query_ss.minhash
+
+    counter.consume(intersect_mh)
+    assert counter.siglist == [ match_ss_1, match_ss_2, match_ss_3 ]
+    assert counter.locations == ['loc a', 'loc b', 'loc c']
+    assert list(counter.counter.items()) == [(2, 2)]
+
+    ## round 3
+
+    cur_query.remove_many(intersect_mh.hashes)
+    (sr, intersect_mh) = counter.peek(cur_query)
+    assert sr.signature == match_ss_3
+    assert len(intersect_mh) == 2
+    assert cur_query != query_ss.minhash
+
+    counter.consume(intersect_mh)
+    assert counter.siglist == [ match_ss_1, match_ss_2, match_ss_3 ]
+    assert counter.locations == ['loc a', 'loc b', 'loc c']
+    assert list(counter.counter.items()) == []
+
+    ## round 4 - nothing left!
+
+    cur_query.remove_many(intersect_mh.hashes)
+    results = counter.peek(cur_query)
+    assert not results
+
+    counter.consume(intersect_mh)
+    assert counter.siglist == [ match_ss_1, match_ss_2, match_ss_3 ]
+    assert counter.locations == ['loc a', 'loc b', 'loc c']
+    assert list(counter.counter.items()) == []

From af52c419ff875de2346b5c290fc39bf23397c2a8 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Sat, 1 May 2021 10:07:11 -0700
Subject: [PATCH 18/18] add num query & subj tests

---
 tests/test_index.py | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/tests/test_index.py b/tests/test_index.py
index 9c7a57c7bb..fdf57dd2ab 100644
--- a/tests/test_index.py
+++ b/tests/test_index.py
@@ -1640,6 +1640,16 @@ def test_counter_gather_empty_initial_query():
     assert counter.peek(query_ss.minhash) == []
 
 
+def test_counter_gather_num_query():
+    # check num query
+    query_mh = sourmash.MinHash(n=500, ksize=31)
+    query_mh.add_many(range(0, 10))
+    query_ss = SourmashSignature(query_mh, name='query')
+
+    with pytest.raises(ValueError):
+        counter = CounterGather(query_ss.minhash)
+
+
 def test_counter_gather_empty_cur_query():
     # test empty cur query
     query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1)
@@ -1655,6 +1665,22 @@ def test_counter_gather_empty_cur_query():
     assert results == []
 
 
+def test_counter_gather_add_num_matchy():
+    # test add num query
+    query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1)
+    query_mh.add_many(range(0, 20))
+    query_ss = SourmashSignature(query_mh, name='query')
+
+    match_mh = sourmash.MinHash(n=500, ksize=31)
+    match_mh.add_many(range(0, 20))
+    match_ss = SourmashSignature(match_mh, name='query')
+
+    # load up the counter
+    counter = CounterGather(query_ss.minhash)
+    with pytest.raises(ValueError):
+        counter.add(match_ss, 'somewhere over the rainbow')
+
+
 def test_counter_gather_bad_cur_query():
     # test cur query that is not subset of original query
     query_mh = sourmash.MinHash(n=0, ksize=31, scaled=1)