From 2ee322c84a6e3e8f7bdbe0682e1d7decd012fec1 Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Thu, 25 Jul 2013 18:09:50 -0400
Subject: [PATCH 001/140] added threading params to filter-abund.py

---
 scripts/filter-abund.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/scripts/filter-abund.py b/scripts/filter-abund.py
index 20dea0343a..3e836f3b42 100755
--- a/scripts/filter-abund.py
+++ b/scripts/filter-abund.py
@@ -11,7 +11,7 @@
 import os
 import khmer
 from khmer.thread_utils import ThreadedSequenceProcessor, verbose_loader
-
+from khmer import threading_args as targs
 from khmer.counting_args import build_counting_multifile_args
 
 ###
@@ -21,6 +21,7 @@
 
 def main():
     parser = build_counting_multifile_args()
+    targs.add_threading_args(parser)
     parser.add_argument('--cutoff', '-C', dest='cutoff',
                         default=DEFAULT_CUTOFF, type=int,
                         help="Trim at k-mers below this abundance.")
@@ -35,6 +36,7 @@ def main():
 
     counting_ht = args.input_table
     infiles = args.input_filenames
+    n_threads = int(args.n_threads)
 
     print 'file with ht: %s' % counting_ht
 
@@ -69,7 +71,7 @@ def process_fn(record):
         outfile = os.path.basename(infile) + '.abundfilt'
         outfp = open(outfile, 'w')
 
-        tsp = ThreadedSequenceProcessor(process_fn)
+        tsp = ThreadedSequenceProcessor(process_fn, n_workers=n_threads)
         tsp.start(verbose_loader(infile), outfp)
 
         print 'output in', outfile

From c9467fb56d90b9cf19884535b3d2f6a388928c81 Mon Sep 17 00:00:00 2001
From: CS Welcher <cs.welcher@gmail.com>
Date: Wed, 4 Sep 2013 15:02:56 -0400
Subject: [PATCH 002/140] Added typedefs to hashtable.hh

---
 lib/hashtable.hh | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/lib/hashtable.hh b/lib/hashtable.hh
index 84a711d59f..7104527590 100644
--- a/lib/hashtable.hh
+++ b/lib/hashtable.hh
@@ -34,6 +34,9 @@ namespace khmer {
   typedef std::map<PartitionID, unsigned int> PartitionCountMap;
   typedef std::map<unsigned long long, unsigned long long> PartitionCountDistribution;
 
+  typedef unsigned int Color;
+  typedef std::multimap<HashIntoType, Color> TagColorMap;
+
   struct HashTablePerformanceMetrics : public IPerformanceMetrics
   {
 	

From 05edfb8b8da1b0204cdbc7a2fdc5e2b34055bc56 Mon Sep 17 00:00:00 2001
From: CS Welcher <cs.welcher@gmail.com>
Date: Wed, 4 Sep 2013 15:29:37 -0400
Subject: [PATCH 003/140] Added function prototype for consume and tag with
 colors

---
 lib/hashbits.cc | 63 +++++++++++++++++++++++++++++++++++++++++++++++++
 lib/hashbits.hh |  7 ++++--
 2 files changed, 68 insertions(+), 2 deletions(-)

diff --git a/lib/hashbits.cc b/lib/hashbits.cc
index 0300033df0..3b55116245 100644
--- a/lib/hashbits.cc
+++ b/lib/hashbits.cc
@@ -467,6 +467,69 @@ void Hashbits::consume_sequence_and_tag(const std::string& seq,
   }
 }
 
+/* This is essentially the same code as above, only it assigns colors to the
+ * tags through a multimap
+ */
+void Hashbits::consume_sequence_and_tag_with_colors(const std::string& seq,
+					unsigned long long& n_consumed,
+					SeenSet * found_tags)
+{
+  bool is_new_kmer;
+  bool kmer_tagged;
+
+  KMerIterator kmers(seq.c_str(), _ksize);
+  HashIntoType kmer;
+
+  unsigned int since = _tag_density / 2 + 1;
+
+  while(!kmers.done()) {
+    kmer = kmers.next();
+
+    if ((is_new_kmer = test_and_set_bits( kmer )))
+      ++n_consumed;
+
+#if (1)
+    if (is_new_kmer) ++since;
+    else
+    {
+      ACQUIRE_ALL_TAGS_SPIN_LOCK
+      kmer_tagged = set_contains(all_tags, kmer);
+      RELEASE_ALL_TAGS_SPIN_LOCK
+      if (kmer_tagged)
+      {
+	since = 1;
+	if (found_tags) { found_tags->insert(kmer); }
+      }
+      else ++since;
+    }
+#else
+    if (!is_new_kmer && set_contains(all_tags, kmer)) {
+      since = 1;
+      if (found_tags) { found_tags->insert(kmer); }
+    } else {
+      since++;
+    }
+#endif
+
+    if (since >= _tag_density) {
+      ACQUIRE_ALL_TAGS_SPIN_LOCK
+      all_tags.insert(kmer);
+      RELEASE_ALL_TAGS_SPIN_LOCK
+      if (found_tags) { found_tags->insert(kmer); }
+      since = 1;
+    }
+
+  } // iteration over kmers
+
+  if (since >= _tag_density/2 - 1) {
+    ACQUIRE_ALL_TAGS_SPIN_LOCK
+    all_tags.insert(kmer);	// insert the last k-mer, too.
+    RELEASE_ALL_TAGS_SPIN_LOCK
+    if (found_tags) { found_tags->insert(kmer); }
+  }
+}
+
+
 //
 // consume_fasta_and_tag_with_stoptags: consume a FASTA file of reads,
 //     tagging reads every so often.  Do not insert matches to stoptags,
diff --git a/lib/hashbits.hh b/lib/hashbits.hh
index 2f628ece5b..ebcf3fd2d5 100644
--- a/lib/hashbits.hh
+++ b/lib/hashbits.hh
@@ -169,8 +169,11 @@ namespace khmer {
     void consume_sequence_and_tag(const std::string& seq,
 				  unsigned long long& n_consumed,
 				  SeenSet * new_tags = 0);
-
-
+				  
+    void consume_sequence_and_tag_with_colors(const std::string& seq,
+					unsigned long long& n_consumed,
+					SeenSet * found_tags)
+    
     void consume_fasta_and_tag_with_stoptags(const std::string &filename,
 					     unsigned int &total_reads,
 					     unsigned long long &n_consumed,

From 0e92a8d3686a40d3c0dab27fd920166dbb0330a8 Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Thu, 5 Sep 2013 18:36:06 -0400
Subject: [PATCH 004/140] added code to insert colors into colormap

---
 lib/hashbits.cc  | 43 +++++++++++++++++++++++++++++++------------
 lib/hashbits.hh  | 23 ++++++++++++++++++++++-
 lib/hashtable.hh |  1 +
 3 files changed, 54 insertions(+), 13 deletions(-)

diff --git a/lib/hashbits.cc b/lib/hashbits.cc
index 3b55116245..574e07b856 100644
--- a/lib/hashbits.cc
+++ b/lib/hashbits.cc
@@ -468,10 +468,12 @@ void Hashbits::consume_sequence_and_tag(const std::string& seq,
 }
 
 /* This is essentially the same code as above, only it assigns colors to the
- * tags through a multimap
+ * tags through multimap TagColorMap defined in hashtable.hh, declared in
+ * hashbits.hh
  */
 void Hashbits::consume_sequence_and_tag_with_colors(const std::string& seq,
 					unsigned long long& n_consumed,
+					Color& current_color,
 					SeenSet * found_tags)
 {
   bool is_new_kmer;
@@ -489,19 +491,27 @@ void Hashbits::consume_sequence_and_tag_with_colors(const std::string& seq,
       ++n_consumed;
 
 #if (1)
-    if (is_new_kmer) ++since;
-    else
-    {
+    if (is_new_kmer) {
+      ++since;
+    } else {
       ACQUIRE_ALL_TAGS_SPIN_LOCK
       kmer_tagged = set_contains(all_tags, kmer);
       RELEASE_ALL_TAGS_SPIN_LOCK
-      if (kmer_tagged)
-      {
-	since = 1;
-	if (found_tags) { found_tags->insert(kmer); }
-      }
-      else ++since;
-    }
+      if (kmer_tagged) {
+	    since = 1;
+	    
+	    // Coloring code
+	    // TODO: MAKE THREADSAFE!
+	    
+	    if (!_map_contains(color_map, kmer, current_color)) {
+	      color_map.insert(TagColorPair(kmer, current_color))
+	    }
+	    if (found_tags) {
+	      found_tags->insert(kmer);
+	    }
+      }  else ++since;
+    }
+    // Should I bother adding new code down here?
 #else
     if (!is_new_kmer && set_contains(all_tags, kmer)) {
       since = 1;
@@ -510,11 +520,16 @@ void Hashbits::consume_sequence_and_tag_with_colors(const std::string& seq,
       since++;
     }
 #endif
-
+    //
     if (since >= _tag_density) {
       ACQUIRE_ALL_TAGS_SPIN_LOCK
       all_tags.insert(kmer);
       RELEASE_ALL_TAGS_SPIN_LOCK
+      
+      // Coloring code
+      // TODO: MAKE THREADSAFE!
+      color_map.insert(TagColorPair(kmer, current_color))
+      
       if (found_tags) { found_tags->insert(kmer); }
       since = 1;
     }
@@ -525,6 +540,10 @@ void Hashbits::consume_sequence_and_tag_with_colors(const std::string& seq,
     ACQUIRE_ALL_TAGS_SPIN_LOCK
     all_tags.insert(kmer);	// insert the last k-mer, too.
     RELEASE_ALL_TAGS_SPIN_LOCK
+    
+    // Color code: TODO: MAKE THREADSAFE!
+    color_map.insert(TagColorPair(kmer, current_color))
+    
     if (found_tags) { found_tags->insert(kmer); }
   }
 }
diff --git a/lib/hashbits.hh b/lib/hashbits.hh
index ebcf3fd2d5..6b54b5dcf7 100644
--- a/lib/hashbits.hh
+++ b/lib/hashbits.hh
@@ -27,6 +27,8 @@ namespace khmer {
 	HashIntoType _n_overlap_kmers;
     Byte ** _counts;
 
+    Color _tag_color;
+
     virtual void _allocate_counters() {
       _n_tables = _tablesizes.size();
 
@@ -49,6 +51,20 @@ namespace khmer {
 	partition->_clear_all_partitions();
       }
     }
+    
+    
+    // Check if the given TagToColorMap already has the tag with the given color
+    bool _map_contains(TagToColorMap& cmap,
+                        HashIntoType& kmer,
+                        Color& the_color)
+    {
+      std::pair<TagColorPair::iterator, TagColorPair::iterator> ret;
+      ret = cmap->equal_range(kmer);
+      for (TagToColorMap::iterator it=ret.first; it!=ret.second; ++it) {
+        if (it->second == the_color) return true;
+      }
+      return false;
+    }
 
     uint32_t _all_tags_spin_lock;
 
@@ -57,6 +73,8 @@ namespace khmer {
     SeenSet all_tags;
     SeenSet stop_tags;
     SeenSet repart_small_tags;
+    TagToColorMap color_map;
+
 
     void _validate_pmap() {
       if (partition) { partition->_validate_pmap(); }
@@ -74,6 +92,8 @@ namespace khmer {
       _n_unique_kmers = 0;
       _n_overlap_kmers = 0;
 
+      _tag_color = 0;
+
       _allocate_counters();
     }
 
@@ -172,7 +192,8 @@ namespace khmer {
 				  
     void consume_sequence_and_tag_with_colors(const std::string& seq,
 					unsigned long long& n_consumed,
-					SeenSet * found_tags)
+					Color& current_color,
+					SeenSet * new_tags = 0)
     
     void consume_fasta_and_tag_with_stoptags(const std::string &filename,
 					     unsigned int &total_reads,
diff --git a/lib/hashtable.hh b/lib/hashtable.hh
index 7104527590..51a4c0e174 100644
--- a/lib/hashtable.hh
+++ b/lib/hashtable.hh
@@ -36,6 +36,7 @@ namespace khmer {
 
   typedef unsigned int Color;
   typedef std::multimap<HashIntoType, Color> TagColorMap;
+  typedef std::pair<HashIntoType, Color> TagColorPair;
 
   struct HashTablePerformanceMetrics : public IPerformanceMetrics
   {

From 1465c745b44ccd6efa2596bb0da1a50991fc76ee Mon Sep 17 00:00:00 2001
From: CS <cs.welcher@gmail.com>
Date: Fri, 6 Sep 2013 00:45:28 -0400
Subject: [PATCH 005/140] added consume_fasta functions

---
 lib/hashbits.cc | 118 ++++++++++++++++++++++++++++++++++++++++++++++++
 lib/hashbits.hh |  16 +++++++
 2 files changed, 134 insertions(+)

diff --git a/lib/hashbits.cc b/lib/hashbits.cc
index 574e07b856..fa605ab032 100644
--- a/lib/hashbits.cc
+++ b/lib/hashbits.cc
@@ -467,6 +467,124 @@ void Hashbits::consume_sequence_and_tag(const std::string& seq,
   }
 }
 
+/*
+ * Pretty much copy-pasta from the above functions
+ * Might be time for a refactor: could do a general consume_fasta
+ * function which accepts a consume_sequence function pointer as a parameter
+ */
+
+void
+Hashbits::
+consume_fasta_and_tag_with_colors(
+  std:: string const  &filename,
+  unsigned int	      &total_reads, unsigned long long	&n_consumed,
+  CallbackFn	      callback,	    void *		callback_data
+)
+{
+  khmer:: Config    &the_config	  = khmer:: get_active_config( );
+
+  // Note: Always assume only 1 thread if invoked this way.
+  IParser *	  parser = 
+  IParser::get_parser(
+    filename, 1, the_config.get_reads_input_buffer_size( ),
+    the_config.get_reads_parser_trace_level( )
+  );
+
+
+  consume_fasta_and_tag_with_colors(
+    parser,
+    total_reads, n_consumed,
+    callback, callback_data
+  );
+
+  delete parser;
+}
+
+void
+Hashbits::
+consume_fasta_and_tag_with_colors(
+  read_parsers:: IParser *  parser,
+  unsigned int		    &total_reads,   unsigned long long	&n_consumed,
+  CallbackFn		    callback,	    void *		callback_data
+)
+{
+  Hasher		  &hasher		= 
+  _get_hasher( parser->uuid( ) );
+  unsigned int		  total_reads_LOCAL	= 0;
+#if (0) // Note: Used with callback - currently disabled.
+  unsigned long long int  n_consumed_LOCAL	= 0;
+#endif
+  Read			  read;
+
+  // TODO? Delete the following assignments.
+  total_reads = 0;
+  n_consumed = 0;
+  
+  hasher.trace_logger(
+    TraceLogger:: TLVL_DEBUG2,
+    "Starting trace of 'consume_fasta_and_tag'....\n"
+  );
+
+  // Iterate through the reads and consume their k-mers.
+  while (!parser->is_complete( ))
+  {
+    unsigned long long this_n_consumed   = 0;
+
+    read = parser->get_next_read( );
+
+    if (check_and_normalize_read( read.sequence ))
+    {
+      // TODO: make threadsafe!
+      consume_sequence_and_tag_with_colors( read.sequence,
+					    this_n_consumed,
+					    _tag_color );
+      ++_tag_color;
+
+#ifdef WITH_INTERNAL_METRICS
+      hasher.pmetrics.start_timers( );
+#endif
+#if (0) // Note: Used with callback - currently disabled.
+      n_consumed_LOCAL  = __sync_add_and_fetch( &n_consumed, this_n_consumed );
+#else
+      __sync_add_and_fetch( &n_consumed, this_n_consumed );
+#endif
+      total_reads_LOCAL = __sync_add_and_fetch( &total_reads, 1 );
+#ifdef WITH_INTERNAL_METRICS
+      hasher.pmetrics.stop_timers( );
+      hasher.pmetrics.accumulate_timer_deltas(
+	(uint32_t)HashTablePerformanceMetrics:: MKEY_TIME_UPDATE_TALLIES
+      );
+#endif
+    }
+
+    if (0 == (total_reads_LOCAL % 10000))
+      hasher.trace_logger(
+	TraceLogger:: TLVL_DEBUG3,
+	"Total number of reads processed: %llu\n",
+	(unsigned long long int)total_reads_LOCAL
+      );
+
+    // TODO: Figure out alternative to callback into Python VM
+    //       Cannot use in multi-threaded operation.
+#if (0)
+      // run callback, if specified
+      if (total_reads_TL % CALLBACK_PERIOD == 0 && callback) {
+	std::cout << "n tags: " << all_tags.size() << "\n";
+	try {
+	  callback("consume_fasta_and_tag", callback_data, total_reads_TL,
+		   n_consumed);
+	} catch (...) {
+	  delete parser;
+	  throw;
+	}
+      }
+#endif // 0
+
+  } // while reads left for parser
+
+}
+
+
 /* This is essentially the same code as above, only it assigns colors to the
  * tags through multimap TagColorMap defined in hashtable.hh, declared in
  * hashbits.hh
diff --git a/lib/hashbits.hh b/lib/hashbits.hh
index 6b54b5dcf7..d0a2ce9a0c 100644
--- a/lib/hashbits.hh
+++ b/lib/hashbits.hh
@@ -186,6 +186,22 @@ namespace khmer {
 	void *		    callback_data   = NULL
     );
 
+    
+    void consume_fasta_and_tag_with_colors(
+      std::string const	  &filename,
+      unsigned int	  &total_reads,
+      unsigned long long  &n_consumed,
+      CallbackFn	  callback	  = NULL,
+      void *		  callback_data	  = NULL
+    );
+
+    void consume_fasta_and_tag_with_colors(
+	read_parsers:: IParser *	    parser,
+	unsigned int	    &total_reads,
+	unsigned long long  &n_consumed,
+	CallbackFn	    callback	    = NULL,
+	void *		    callback_data   = NULL
+    );
     void consume_sequence_and_tag(const std::string& seq,
 				  unsigned long long& n_consumed,
 				  SeenSet * new_tags = 0);

From b33c71aa6beb19faf72ee282198a411cbdbe83f9 Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Fri, 6 Sep 2013 12:20:36 -0400
Subject: [PATCH 006/140] promoted new code to hashtable to be in line with
 refactor

---
 lib/hashbits.cc  | 200 -----------------------------------------------
 lib/hashbits.hh  |  22 ------
 lib/hashtable.cc | 200 +++++++++++++++++++++++++++++++++++++++++++++++
 lib/hashtable.hh |  23 ++++++
 lib/khmer.hh     |   1 +
 5 files changed, 224 insertions(+), 222 deletions(-)

diff --git a/lib/hashbits.cc b/lib/hashbits.cc
index 09518b37d9..27d780a92f 100644
--- a/lib/hashbits.cc
+++ b/lib/hashbits.cc
@@ -112,206 +112,6 @@ unsigned int Hashbits::check_and_process_read_overlap(std::string &read,
    return consume_string_overlap(read, ht2);
 }
 
-/*
- * Pretty much copy-pasta from the above functions
- * Might be time for a refactor: could do a general consume_fasta
- * function which accepts a consume_sequence function pointer as a parameter
- */
-
-void
-Hashbits::
-consume_fasta_and_tag_with_colors(
-  std:: string const  &filename,
-  unsigned int	      &total_reads, unsigned long long	&n_consumed,
-  CallbackFn	      callback,	    void *		callback_data
-)
-{
-  khmer:: Config    &the_config	  = khmer:: get_active_config( );
-
-  // Note: Always assume only 1 thread if invoked this way.
-  IParser *	  parser = 
-  IParser::get_parser(
-    filename, 1, the_config.get_reads_input_buffer_size( ),
-    the_config.get_reads_parser_trace_level( )
-  );
-
-
-  consume_fasta_and_tag_with_colors(
-    parser,
-    total_reads, n_consumed,
-    callback, callback_data
-  );
-
-  delete parser;
-}
-
-void
-Hashbits::
-consume_fasta_and_tag_with_colors(
-  read_parsers:: IParser *  parser,
-  unsigned int		    &total_reads,   unsigned long long	&n_consumed,
-  CallbackFn		    callback,	    void *		callback_data
-)
-{
-  Hasher		  &hasher		= 
-  _get_hasher( parser->uuid( ) );
-  unsigned int		  total_reads_LOCAL	= 0;
-#if (0) // Note: Used with callback - currently disabled.
-  unsigned long long int  n_consumed_LOCAL	= 0;
-#endif
-  Read			  read;
-
-  // TODO? Delete the following assignments.
-  total_reads = 0;
-  n_consumed = 0;
-  
-  hasher.trace_logger(
-    TraceLogger:: TLVL_DEBUG2,
-    "Starting trace of 'consume_fasta_and_tag'....\n"
-  );
-
-  // Iterate through the reads and consume their k-mers.
-  while (!parser->is_complete( ))
-  {
-    unsigned long long this_n_consumed   = 0;
-
-    read = parser->get_next_read( );
-
-    if (check_and_normalize_read( read.sequence ))
-    {
-      // TODO: make threadsafe!
-      consume_sequence_and_tag_with_colors( read.sequence,
-					    this_n_consumed,
-					    _tag_color );
-      ++_tag_color;
-
-#ifdef WITH_INTERNAL_METRICS
-      hasher.pmetrics.start_timers( );
-#endif
-#if (0) // Note: Used with callback - currently disabled.
-      n_consumed_LOCAL  = __sync_add_and_fetch( &n_consumed, this_n_consumed );
-#else
-      __sync_add_and_fetch( &n_consumed, this_n_consumed );
-#endif
-      total_reads_LOCAL = __sync_add_and_fetch( &total_reads, 1 );
-#ifdef WITH_INTERNAL_METRICS
-      hasher.pmetrics.stop_timers( );
-      hasher.pmetrics.accumulate_timer_deltas(
-	(uint32_t)HashTablePerformanceMetrics:: MKEY_TIME_UPDATE_TALLIES
-      );
-#endif
-    }
-
-    if (0 == (total_reads_LOCAL % 10000))
-      hasher.trace_logger(
-	TraceLogger:: TLVL_DEBUG3,
-	"Total number of reads processed: %llu\n",
-	(unsigned long long int)total_reads_LOCAL
-      );
-
-    // TODO: Figure out alternative to callback into Python VM
-    //       Cannot use in multi-threaded operation.
-#if (0)
-      // run callback, if specified
-      if (total_reads_TL % CALLBACK_PERIOD == 0 && callback) {
-	std::cout << "n tags: " << all_tags.size() << "\n";
-	try {
-	  callback("consume_fasta_and_tag", callback_data, total_reads_TL,
-		   n_consumed);
-	} catch (...) {
-	  delete parser;
-	  throw;
-	}
-      }
-#endif // 0
-
-  } // while reads left for parser
-
-}
-
-
-/* This is essentially the same code as above, only it assigns colors to the
- * tags through multimap TagColorMap defined in hashtable.hh, declared in
- * hashbits.hh
- */
-void Hashbits::consume_sequence_and_tag_with_colors(const std::string& seq,
-					unsigned long long& n_consumed,
-					Color& current_color,
-					SeenSet * found_tags)
-{
-  bool is_new_kmer;
-  bool kmer_tagged;
-
-  KMerIterator kmers(seq.c_str(), _ksize);
-  HashIntoType kmer;
-
-  unsigned int since = _tag_density / 2 + 1;
-
-  while(!kmers.done()) {
-    kmer = kmers.next();
-
-    if ((is_new_kmer = test_and_set_bits( kmer )))
-      ++n_consumed;
-
-#if (1)
-    if (is_new_kmer) {
-      ++since;
-    } else {
-      ACQUIRE_ALL_TAGS_SPIN_LOCK
-      kmer_tagged = set_contains(all_tags, kmer);
-      RELEASE_ALL_TAGS_SPIN_LOCK
-      if (kmer_tagged) {
-	    since = 1;
-	    
-	    // Coloring code
-	    // TODO: MAKE THREADSAFE!
-	    
-	    if (!_map_contains(color_map, kmer, current_color)) {
-	      color_map.insert(TagColorPair(kmer, current_color))
-	    }
-	    if (found_tags) {
-	      found_tags->insert(kmer);
-	    }
-      }  else ++since;
-    }
-    // Should I bother adding new code down here?
-#else
-    if (!is_new_kmer && set_contains(all_tags, kmer)) {
-      since = 1;
-      if (found_tags) { found_tags->insert(kmer); }
-    } else {
-      since++;
-    }
-#endif
-    //
-    if (since >= _tag_density) {
-      ACQUIRE_ALL_TAGS_SPIN_LOCK
-      all_tags.insert(kmer);
-      RELEASE_ALL_TAGS_SPIN_LOCK
-      
-      // Coloring code
-      // TODO: MAKE THREADSAFE!
-      color_map.insert(TagColorPair(kmer, current_color))
-      
-      if (found_tags) { found_tags->insert(kmer); }
-      since = 1;
-    }
-
-  } // iteration over kmers
-
-  if (since >= _tag_density/2 - 1) {
-    ACQUIRE_ALL_TAGS_SPIN_LOCK
-    all_tags.insert(kmer);	// insert the last k-mer, too.
-    RELEASE_ALL_TAGS_SPIN_LOCK
-    
-    // Color code: TODO: MAKE THREADSAFE!
-    color_map.insert(TagColorPair(kmer, current_color))
-    
-    if (found_tags) { found_tags->insert(kmer); }
-  }
-}
-
-
 //
 // consume_fasta: consume a FASTA file of reads
 //
diff --git a/lib/hashbits.hh b/lib/hashbits.hh
index 5a9bb4c6b9..6c0f19698c 100644
--- a/lib/hashbits.hh
+++ b/lib/hashbits.hh
@@ -76,28 +76,6 @@ namespace khmer {
 
     virtual void save(std::string);
     virtual void load(std::string);
-
-    
-    void consume_fasta_and_tag_with_colors(
-      std::string const	  &filename,
-      unsigned int	  &total_reads,
-      unsigned long long  &n_consumed,
-      CallbackFn	  callback	  = NULL,
-      void *		  callback_data	  = NULL
-    );
-
-    void consume_fasta_and_tag_with_colors(
-	read_parsers:: IParser *	    parser,
-	unsigned int	    &total_reads,
-	unsigned long long  &n_consumed,
-	CallbackFn	    callback	    = NULL,
-	void *		    callback_data   = NULL
-    );
-				  
-    void consume_sequence_and_tag_with_colors(const std::string& seq,
-					unsigned long long& n_consumed,
-					Color& current_color,
-					SeenSet * new_tags = 0)
     
     // for overlap k-mer counting
     void consume_fasta_overlap(const std::string &filename,HashIntoType curve[2][100],
diff --git a/lib/hashtable.cc b/lib/hashtable.cc
index 6782742387..bac6995173 100644
--- a/lib/hashtable.cc
+++ b/lib/hashtable.cc
@@ -1943,6 +1943,206 @@ void Hashtable::extract_unique_paths(std::string seq,
       i++;
     }
   }
+  
+/*
+ * Pretty much copy-pasta
+ * Might be time for a refactor: could do a general consume_fasta
+ * function which accepts a consume_sequence function pointer as a parameter
+ */
+
+void
+Hashbits::
+consume_fasta_and_tag_with_colors(
+  std:: string const  &filename,
+  unsigned int	      &total_reads, unsigned long long	&n_consumed,
+  CallbackFn	      callback,	    void *		callback_data
+)
+{
+  khmer:: Config    &the_config	  = khmer:: get_active_config( );
+
+  // Note: Always assume only 1 thread if invoked this way.
+  IParser *	  parser = 
+  IParser::get_parser(
+    filename, 1, the_config.get_reads_input_buffer_size( ),
+    the_config.get_reads_parser_trace_level( )
+  );
+
+
+  consume_fasta_and_tag_with_colors(
+    parser,
+    total_reads, n_consumed,
+    callback, callback_data
+  );
+
+  delete parser;
+}
+
+void
+Hashbits::
+consume_fasta_and_tag_with_colors(
+  read_parsers:: IParser *  parser,
+  unsigned int		    &total_reads,   unsigned long long	&n_consumed,
+  CallbackFn		    callback,	    void *		callback_data
+)
+{
+  Hasher		  &hasher		= 
+  _get_hasher( parser->uuid( ) );
+  unsigned int		  total_reads_LOCAL	= 0;
+#if (0) // Note: Used with callback - currently disabled.
+  unsigned long long int  n_consumed_LOCAL	= 0;
+#endif
+  Read			  read;
+
+  // TODO? Delete the following assignments.
+  total_reads = 0;
+  n_consumed = 0;
+  
+  hasher.trace_logger(
+    TraceLogger:: TLVL_DEBUG2,
+    "Starting trace of 'consume_fasta_and_tag'....\n"
+  );
+
+  // Iterate through the reads and consume their k-mers.
+  while (!parser->is_complete( ))
+  {
+    unsigned long long this_n_consumed   = 0;
+
+    read = parser->get_next_read( );
+
+    if (check_and_normalize_read( read.sequence ))
+    {
+      // TODO: make threadsafe!
+      consume_sequence_and_tag_with_colors( read.sequence,
+					    this_n_consumed,
+					    _tag_color );
+      ++_tag_color;
+
+#ifdef WITH_INTERNAL_METRICS
+      hasher.pmetrics.start_timers( );
+#endif
+#if (0) // Note: Used with callback - currently disabled.
+      n_consumed_LOCAL  = __sync_add_and_fetch( &n_consumed, this_n_consumed );
+#else
+      __sync_add_and_fetch( &n_consumed, this_n_consumed );
+#endif
+      total_reads_LOCAL = __sync_add_and_fetch( &total_reads, 1 );
+#ifdef WITH_INTERNAL_METRICS
+      hasher.pmetrics.stop_timers( );
+      hasher.pmetrics.accumulate_timer_deltas(
+	(uint32_t)HashTablePerformanceMetrics:: MKEY_TIME_UPDATE_TALLIES
+      );
+#endif
+    }
+
+    if (0 == (total_reads_LOCAL % 10000))
+      hasher.trace_logger(
+	TraceLogger:: TLVL_DEBUG3,
+	"Total number of reads processed: %llu\n",
+	(unsigned long long int)total_reads_LOCAL
+      );
+
+    // TODO: Figure out alternative to callback into Python VM
+    //       Cannot use in multi-threaded operation.
+#if (0)
+      // run callback, if specified
+      if (total_reads_TL % CALLBACK_PERIOD == 0 && callback) {
+	std::cout << "n tags: " << all_tags.size() << "\n";
+	try {
+	  callback("consume_fasta_and_tag", callback_data, total_reads_TL,
+		   n_consumed);
+	} catch (...) {
+	  delete parser;
+	  throw;
+	}
+      }
+#endif // 0
+
+  } // while reads left for parser
+
+}
+
+
+/* This is essentially the same code as above, only it assigns colors to the
+ * tags through multimap TagColorMap defined in hashtable.hh, declared in
+ * hashbits.hh
+ */
+void Hashbits::consume_sequence_and_tag_with_colors(const std::string& seq,
+					unsigned long long& n_consumed,
+					Color& current_color,
+					SeenSet * found_tags)
+{
+  bool is_new_kmer;
+  bool kmer_tagged;
+
+  KMerIterator kmers(seq.c_str(), _ksize);
+  HashIntoType kmer;
+
+  unsigned int since = _tag_density / 2 + 1;
+
+  while(!kmers.done()) {
+    kmer = kmers.next();
+
+    if ((is_new_kmer = test_and_set_bits( kmer )))
+      ++n_consumed;
+
+#if (1)
+    if (is_new_kmer) {
+      ++since;
+    } else {
+      ACQUIRE_ALL_TAGS_SPIN_LOCK
+      kmer_tagged = set_contains(all_tags, kmer);
+      RELEASE_ALL_TAGS_SPIN_LOCK
+      if (kmer_tagged) {
+	    since = 1;
+	    
+	    // Coloring code
+	    // TODO: MAKE THREADSAFE!
+	    
+	    if (!_map_contains(color_map, kmer, current_color)) {
+	      color_map.insert(TagColorPair(kmer, current_color))
+	    }
+	    if (found_tags) {
+	      found_tags->insert(kmer);
+	    }
+      }  else ++since;
+    }
+    // Should I bother adding new code down here?
+#else
+    if (!is_new_kmer && set_contains(all_tags, kmer)) {
+      since = 1;
+      if (found_tags) { found_tags->insert(kmer); }
+    } else {
+      since++;
+    }
+#endif
+    //
+    if (since >= _tag_density) {
+      ACQUIRE_ALL_TAGS_SPIN_LOCK
+      all_tags.insert(kmer);
+      RELEASE_ALL_TAGS_SPIN_LOCK
+      
+      // Coloring code
+      // TODO: MAKE THREADSAFE!
+      color_map.insert(TagColorPair(kmer, current_color))
+      
+      if (found_tags) { found_tags->insert(kmer); }
+      since = 1;
+    }
+
+  } // iteration over kmers
+
+  if (since >= _tag_density/2 - 1) {
+    ACQUIRE_ALL_TAGS_SPIN_LOCK
+    all_tags.insert(kmer);	// insert the last k-mer, too.
+    RELEASE_ALL_TAGS_SPIN_LOCK
+    
+    // Color code: TODO: MAKE THREADSAFE!
+    color_map.insert(TagColorPair(kmer, current_color))
+    
+    if (found_tags) { found_tags->insert(kmer); }
+  }
+}
+
 }
 
 // vim: set sts=2 sw=2:
diff --git a/lib/hashtable.hh b/lib/hashtable.hh
index f8d1103d41..586aa308be 100644
--- a/lib/hashtable.hh
+++ b/lib/hashtable.hh
@@ -320,6 +320,8 @@ namespace khmer {
     SeenSet all_tags;
     SeenSet stop_tags;
     SeenSet repart_small_tags;
+    TagColorMap tag_colors;
+    ColorTagPrtMap color_tag_ptrs;
 
     // accessor to get 'k'
     const WordLength ksize() const { return _ksize; }
@@ -439,6 +441,27 @@ namespace khmer {
 					     unsigned long long &n_consumed,
 					     CallbackFn callback = 0,
 					     void * callback_data = 0);
+					     
+
+    
+    void consume_fasta_and_tag_with_colors(
+                        std::string const	  &filename,
+                        unsigned int	  &total_reads,
+                        unsigned long long  &n_consumed,
+                        CallbackFn	  callback	  = NULL,
+                        void *		  callback_data	  = NULL);
+
+    void consume_fasta_and_tag_with_colors(
+	                read_parsers:: IParser *	    parser,
+	                unsigned int	    &total_reads,
+	                unsigned long long  &n_consumed,
+	                CallbackFn	    callback	    = NULL,
+	                void *		    callback_data   = NULL);
+				  
+    void consume_sequence_and_tag_with_colors(const std::string& seq,
+					unsigned long long& n_consumed,
+					Color& current_color,
+					SeenSet * new_tags = 0)
 
     void consume_fasta_and_traverse(const std::string &filename,
 				    unsigned int distance,
diff --git a/lib/khmer.hh b/lib/khmer.hh
index 86096be83d..372e96ddea 100644
--- a/lib/khmer.hh
+++ b/lib/khmer.hh
@@ -89,6 +89,7 @@ namespace khmer {
   
   typedef unsigned int Color;
   typedef std::multimap<HashIntoType, Color> TagColorMap;
+  typedef std::multimap<Color, HashIntoType*> ColorTagPtrMap;
   typedef std::pair<HashIntoType, Color> TagColorPair;
 }
 

From c097fb508591bb2dccf705c36fa9a9e02f351441 Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Fri, 6 Sep 2013 12:28:08 -0400
Subject: [PATCH 007/140] added _cmap_contains back in after being nuked by
 merge

---
 lib/hashtable.cc |  3 ++-
 lib/hashtable.hh | 14 ++++++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/lib/hashtable.cc b/lib/hashtable.cc
index bac6995173..8e36c5f1f4 100644
--- a/lib/hashtable.cc
+++ b/lib/hashtable.cc
@@ -2098,8 +2098,9 @@ void Hashbits::consume_sequence_and_tag_with_colors(const std::string& seq,
 	    // Coloring code
 	    // TODO: MAKE THREADSAFE!
 	    
-	    if (!_map_contains(color_map, kmer, current_color)) {
+	    if (!_cmap_contains(color_map, kmer, current_color)) {
 	      color_map.insert(TagColorPair(kmer, current_color))
+	      
 	    }
 	    if (found_tags) {
 	      found_tags->insert(kmer);
diff --git a/lib/hashtable.hh b/lib/hashtable.hh
index 586aa308be..9f35bb3446 100644
--- a/lib/hashtable.hh
+++ b/lib/hashtable.hh
@@ -183,6 +183,20 @@ namespace khmer {
     HashIntoType    bitmask;
     unsigned int    _nbits_sub_1;
 
+    // Check if the given TagToColorMap already has the tag with the given color
+    bool _cmap_contains(TagToColorMap& cmap,
+                        HashIntoType& kmer,
+                        Color& the_color)
+    {
+      std::pair<TagColorPair::iterator, TagColorPair::iterator> ret;
+      ret = cmap->equal_range(kmer);
+      for (TagToColorMap::iterator it=ret.first; it!=ret.second; ++it) {
+        if (it->second == the_color) return true;
+      }
+      return false;
+    }
+
+
     Hashtable(
 	WordLength	ksize,
 	uint32_t const	number_of_threads   = 

From 90e83061c69c25f75d883326ba9380725d3f1762 Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Fri, 6 Sep 2013 14:33:19 -0400
Subject: [PATCH 008/140] fixed for proper pointers/refs, added reverse color
 map

---
 lib/hashtable.cc |  9 ++++-----
 lib/hashtable.hh | 29 ++++++++++++++++++++++-------
 2 files changed, 26 insertions(+), 12 deletions(-)

diff --git a/lib/hashtable.cc b/lib/hashtable.cc
index 8e36c5f1f4..12107c03c9 100644
--- a/lib/hashtable.cc
+++ b/lib/hashtable.cc
@@ -1951,7 +1951,7 @@ void Hashtable::extract_unique_paths(std::string seq,
  */
 
 void
-Hashbits::
+Hashtable::
 consume_fasta_and_tag_with_colors(
   std:: string const  &filename,
   unsigned int	      &total_reads, unsigned long long	&n_consumed,
@@ -1978,7 +1978,7 @@ consume_fasta_and_tag_with_colors(
 }
 
 void
-Hashbits::
+Hashtable::
 consume_fasta_and_tag_with_colors(
   read_parsers:: IParser *  parser,
   unsigned int		    &total_reads,   unsigned long long	&n_consumed,
@@ -2066,7 +2066,7 @@ consume_fasta_and_tag_with_colors(
  * tags through multimap TagColorMap defined in hashtable.hh, declared in
  * hashbits.hh
  */
-void Hashbits::consume_sequence_and_tag_with_colors(const std::string& seq,
+void Hashtable::consume_sequence_and_tag_with_colors(const std::string& seq,
 					unsigned long long& n_consumed,
 					Color& current_color,
 					SeenSet * found_tags)
@@ -2099,8 +2099,7 @@ void Hashbits::consume_sequence_and_tag_with_colors(const std::string& seq,
 	    // TODO: MAKE THREADSAFE!
 	    
 	    if (!_cmap_contains(color_map, kmer, current_color)) {
-	      color_map.insert(TagColorPair(kmer, current_color))
-	      
+	      link_tag_and_color(kmer, current_color);
 	    }
 	    if (found_tags) {
 	      found_tags->insert(kmer);
diff --git a/lib/hashtable.hh b/lib/hashtable.hh
index 9f35bb3446..897a5f184c 100644
--- a/lib/hashtable.hh
+++ b/lib/hashtable.hh
@@ -142,7 +142,6 @@ namespace khmer {
     bool done() { return index >= length; }
   }; // class KMerIterator
 
-
   class Hashtable {		// Base class implementation of a Bloom ht.
     friend class SubsetPartition;
   protected:
@@ -183,20 +182,31 @@ namespace khmer {
     HashIntoType    bitmask;
     unsigned int    _nbits_sub_1;
 
-    // Check if the given TagToColorMap already has the tag with the given color
-    bool _cmap_contains(TagToColorMap& cmap,
+    // Does the given tag already have the given color?
+    bool _cmap_contains_color(const TagColorPtrMap& cmap,
                         HashIntoType& kmer,
                         Color& the_color)
     {
-      std::pair<TagColorPair::iterator, TagColorPair::iterator> ret;
+      std::pair<TagColorPtrPair::iterator, TagColorPtrPair::iterator> ret;
       ret = cmap->equal_range(kmer);
-      for (TagToColorMap::iterator it=ret.first; it!=ret.second; ++it) {
-        if (it->second == the_color) return true;
+      for (TagColorPtrMap::iterator it=ret.first; it!=ret.second; ++it) {
+        if (*(it->second) == the_color) return true;
       }
       return false;
     }
 
-
+    // Does the given color already have a tag associated with it?
+    bool _cmap_contains_tag(const ColorTagPtrMap& cmap,
+                            Color& the_color,
+                            HashIntoType& kmer) {
+      std::pair<ColorTagPtrPair:: iterator, ColorTagPtrPair::iterator> ret;
+      ret = cmap->equal_range(the_color);
+      for (ColorTagPtrMap::iterator it=ret.first; it!=ret.second; ++it) {
+        if(*(it->second) == kmer) return true;
+      }
+      return false;
+    }
+    
     Hashtable(
 	WordLength	ksize,
 	uint32_t const	number_of_threads   = 
@@ -477,6 +487,11 @@ namespace khmer {
 					Color& current_color,
 					SeenSet * new_tags = 0)
 
+    void link_tag_and_color(HashIntoType& kmer, Color& color) {
+        tag_colors.insert(TagColorPtrPair(kmer, &current_color));
+        color_tag_ptrs.insert(ColorTagPtrPair(current_color, &kmer));
+    }
+
     void consume_fasta_and_traverse(const std::string &filename,
 				    unsigned int distance,
 				    unsigned int big_threshold,

From ce1d0277b6231dd45b20548c62bb7e49825cf634 Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Fri, 6 Sep 2013 17:23:18 -0400
Subject: [PATCH 009/140] added sweep function, getters for colors and tags,
 started color reconciliation

---
 lib/hashtable.cc | 313 ++++++++++++++++++++++++++---------------------
 lib/hashtable.hh |  30 ++++-
 lib/khmer.hh     |   7 +-
 3 files changed, 209 insertions(+), 141 deletions(-)

diff --git a/lib/hashtable.cc b/lib/hashtable.cc
index 12107c03c9..211b0b418d 100644
--- a/lib/hashtable.cc
+++ b/lib/hashtable.cc
@@ -1951,8 +1951,7 @@ void Hashtable::extract_unique_paths(std::string seq,
  */
 
 void
-Hashtable::
-consume_fasta_and_tag_with_colors(
+Hashtable::consume_fasta_and_tag_with_colors(
   std:: string const  &filename,
   unsigned int	      &total_reads, unsigned long long	&n_consumed,
   CallbackFn	      callback,	    void *		callback_data
@@ -1978,88 +1977,87 @@ consume_fasta_and_tag_with_colors(
 }
 
 void
-Hashtable::
-consume_fasta_and_tag_with_colors(
-  read_parsers:: IParser *  parser,
-  unsigned int		    &total_reads,   unsigned long long	&n_consumed,
-  CallbackFn		    callback,	    void *		callback_data
-)
-{
-  Hasher		  &hasher		= 
-  _get_hasher( parser->uuid( ) );
-  unsigned int		  total_reads_LOCAL	= 0;
-#if (0) // Note: Used with callback - currently disabled.
-  unsigned long long int  n_consumed_LOCAL	= 0;
-#endif
-  Read			  read;
-
-  // TODO? Delete the following assignments.
-  total_reads = 0;
-  n_consumed = 0;
-  
-  hasher.trace_logger(
-    TraceLogger:: TLVL_DEBUG2,
-    "Starting trace of 'consume_fasta_and_tag'....\n"
-  );
-
-  // Iterate through the reads and consume their k-mers.
-  while (!parser->is_complete( ))
+Hashtable::consume_fasta_and_tag_with_colors(
+    read_parsers:: IParser *  parser,
+    unsigned int		    &total_reads,   unsigned long long	&n_consumed,
+    CallbackFn		    callback,	    void *		callback_data
+  )
   {
-    unsigned long long this_n_consumed   = 0;
-
-    read = parser->get_next_read( );
+    Hasher		  &hasher		= 
+    _get_hasher( parser->uuid( ) );
+    unsigned int		  total_reads_LOCAL	= 0;
+  #if (0) // Note: Used with callback - currently disabled.
+    unsigned long long int  n_consumed_LOCAL	= 0;
+  #endif
+    Read			  read;
+
+    // TODO? Delete the following assignments.
+    total_reads = 0;
+    n_consumed = 0;
+    
+    hasher.trace_logger(
+      TraceLogger:: TLVL_DEBUG2,
+      "Starting trace of 'consume_fasta_and_tag'....\n"
+    );
 
-    if (check_and_normalize_read( read.sequence ))
+    // Iterate through the reads and consume their k-mers.
+    while (!parser->is_complete( ))
     {
-      // TODO: make threadsafe!
-      consume_sequence_and_tag_with_colors( read.sequence,
-					    this_n_consumed,
-					    _tag_color );
-      ++_tag_color;
+      unsigned long long this_n_consumed   = 0;
 
-#ifdef WITH_INTERNAL_METRICS
-      hasher.pmetrics.start_timers( );
-#endif
-#if (0) // Note: Used with callback - currently disabled.
-      n_consumed_LOCAL  = __sync_add_and_fetch( &n_consumed, this_n_consumed );
-#else
-      __sync_add_and_fetch( &n_consumed, this_n_consumed );
-#endif
-      total_reads_LOCAL = __sync_add_and_fetch( &total_reads, 1 );
-#ifdef WITH_INTERNAL_METRICS
-      hasher.pmetrics.stop_timers( );
-      hasher.pmetrics.accumulate_timer_deltas(
-	(uint32_t)HashTablePerformanceMetrics:: MKEY_TIME_UPDATE_TALLIES
-      );
-#endif
-    }
+      read = parser->get_next_read( );
 
-    if (0 == (total_reads_LOCAL % 10000))
-      hasher.trace_logger(
-	TraceLogger:: TLVL_DEBUG3,
-	"Total number of reads processed: %llu\n",
-	(unsigned long long int)total_reads_LOCAL
-      );
-
-    // TODO: Figure out alternative to callback into Python VM
-    //       Cannot use in multi-threaded operation.
-#if (0)
-      // run callback, if specified
-      if (total_reads_TL % CALLBACK_PERIOD == 0 && callback) {
-	std::cout << "n tags: " << all_tags.size() << "\n";
-	try {
-	  callback("consume_fasta_and_tag", callback_data, total_reads_TL,
-		   n_consumed);
-	} catch (...) {
-	  delete parser;
-	  throw;
-	}
+      if (check_and_normalize_read( read.sequence ))
+      {
+        // TODO: make threadsafe!
+        consume_sequence_and_tag_with_colors( read.sequence,
+					      this_n_consumed,
+					      _tag_color );
+        ++_tag_color;
+
+  #ifdef WITH_INTERNAL_METRICS
+        hasher.pmetrics.start_timers( );
+  #endif
+  #if (0) // Note: Used with callback - currently disabled.
+        n_consumed_LOCAL  = __sync_add_and_fetch( &n_consumed, this_n_consumed );
+  #else
+        __sync_add_and_fetch( &n_consumed, this_n_consumed );
+  #endif
+        total_reads_LOCAL = __sync_add_and_fetch( &total_reads, 1 );
+  #ifdef WITH_INTERNAL_METRICS
+        hasher.pmetrics.stop_timers( );
+        hasher.pmetrics.accumulate_timer_deltas(
+	  (uint32_t)HashTablePerformanceMetrics:: MKEY_TIME_UPDATE_TALLIES
+        );
+  #endif
       }
-#endif // 0
 
-  } // while reads left for parser
+      if (0 == (total_reads_LOCAL % 10000))
+        hasher.trace_logger(
+	  TraceLogger:: TLVL_DEBUG3,
+	  "Total number of reads processed: %llu\n",
+	  (unsigned long long int)total_reads_LOCAL
+        );
+
+      // TODO: Figure out alternative to callback into Python VM
+      //       Cannot use in multi-threaded operation.
+  #if (0)
+        // run callback, if specified
+        if (total_reads_TL % CALLBACK_PERIOD == 0 && callback) {
+	  std::cout << "n tags: " << all_tags.size() << "\n";
+	  try {
+	    callback("consume_fasta_and_tag", callback_data, total_reads_TL,
+		     n_consumed);
+	  } catch (...) {
+	    delete parser;
+	    throw;
+	  }
+        }
+  #endif // 0
 
-}
+    } // while reads left for parser
+
+  }
 
 
 /* This is essentially the same code as above, only it assigns colors to the
@@ -2070,79 +2068,120 @@ void Hashtable::consume_sequence_and_tag_with_colors(const std::string& seq,
 					unsigned long long& n_consumed,
 					Color& current_color,
 					SeenSet * found_tags)
-{
-  bool is_new_kmer;
-  bool kmer_tagged;
-
-  KMerIterator kmers(seq.c_str(), _ksize);
-  HashIntoType kmer;
-
-  unsigned int since = _tag_density / 2 + 1;
-
-  while(!kmers.done()) {
-    kmer = kmers.next();
+  {
+    bool is_new_kmer;
+    bool kmer_tagged;
+
+    KMerIterator kmers(seq.c_str(), _ksize);
+    HashIntoType kmer;
+
+    unsigned int since = _tag_density / 2 + 1;
+
+    while(!kmers.done()) {
+      kmer = kmers.next();
+
+      if ((is_new_kmer = test_and_set_bits( kmer )))
+        ++n_consumed;
+
+  #if (1)
+      if (is_new_kmer) {
+        ++since;
+      } else {
+        ACQUIRE_ALL_TAGS_SPIN_LOCK
+        kmer_tagged = set_contains(all_tags, kmer);
+        RELEASE_ALL_TAGS_SPIN_LOCK
+        if (kmer_tagged) {
+	      since = 1;
+	      
+	      // Coloring code
+	      // TODO: MAKE THREADSAFE!
+	      
+	      if (!_cmap_contains_color(color_map, kmer, current_color)) {
+	        link_tag_and_color(kmer, current_color);
+	      }
+	      if (found_tags) {
+	        found_tags->insert(kmer);
+	      }
+        }  else ++since;
+      }
+      // Should I bother adding new code down here?
+  #else
+      if (!is_new_kmer && set_contains(all_tags, kmer)) {
+        since = 1;
+        if (found_tags) { found_tags->insert(kmer); }
+      } else {
+        since++;
+      }
+  #endif
+      //
+      if (since >= _tag_density) {
+        ACQUIRE_ALL_TAGS_SPIN_LOCK
+        all_tags.insert(kmer);
+        RELEASE_ALL_TAGS_SPIN_LOCK
+        
+        // Coloring code
+        // TODO: MAKE THREADSAFE!
+        link_tag_and_color(kmer, current_color)
+        
+        if (found_tags) { found_tags->insert(kmer); }
+        since = 1;
+      }
 
-    if ((is_new_kmer = test_and_set_bits( kmer )))
-      ++n_consumed;
+    } // iteration over kmers
 
-#if (1)
-    if (is_new_kmer) {
-      ++since;
-    } else {
-      ACQUIRE_ALL_TAGS_SPIN_LOCK
-      kmer_tagged = set_contains(all_tags, kmer);
-      RELEASE_ALL_TAGS_SPIN_LOCK
-      if (kmer_tagged) {
-	    since = 1;
-	    
-	    // Coloring code
-	    // TODO: MAKE THREADSAFE!
-	    
-	    if (!_cmap_contains(color_map, kmer, current_color)) {
-	      link_tag_and_color(kmer, current_color);
-	    }
-	    if (found_tags) {
-	      found_tags->insert(kmer);
-	    }
-      }  else ++since;
-    }
-    // Should I bother adding new code down here?
-#else
-    if (!is_new_kmer && set_contains(all_tags, kmer)) {
-      since = 1;
-      if (found_tags) { found_tags->insert(kmer); }
-    } else {
-      since++;
-    }
-#endif
-    //
-    if (since >= _tag_density) {
+    if (since >= _tag_density/2 - 1) {
       ACQUIRE_ALL_TAGS_SPIN_LOCK
-      all_tags.insert(kmer);
+      all_tags.insert(kmer);	// insert the last k-mer, too.
       RELEASE_ALL_TAGS_SPIN_LOCK
       
-      // Coloring code
-      // TODO: MAKE THREADSAFE!
-      color_map.insert(TagColorPair(kmer, current_color))
+      // Color code: TODO: MAKE THREADSAFE!
+      link_tag_and_color(kmer, current_color)
       
       if (found_tags) { found_tags->insert(kmer); }
-      since = 1;
     }
-
-  } // iteration over kmers
-
-  if (since >= _tag_density/2 - 1) {
-    ACQUIRE_ALL_TAGS_SPIN_LOCK
-    all_tags.insert(kmer);	// insert the last k-mer, too.
-    RELEASE_ALL_TAGS_SPIN_LOCK
+  }
+/*
+ * Find all colors associated with the sequence
+ * For now, check /every/ k-mer with find_all_tags
+ * THIS SUCKS AND IT'S YOUR FAULT @CTB
+ */
+void Hashtable::sweep_sequence_for_colors(const std::string& seq,
+					unsigned long long& n_consumed,
+					SeenSet * found_tags,
+					bool break_on_stoptags,
+					bool stop_big_traversals) {
+					
+    SeenSet tagged_kmers;
+    ColorPtrSet found_colors;
     
-    // Color code: TODO: MAKE THREADSAFE!
-    color_map.insert(TagColorPair(kmer, current_color))
+    const unsigned char ksize = _ht->ktsize();
+    HashIntoType kmer_f, kmer_r, kmer;
     
-    if (found_tags) { found_tags->insert(kmer); }
-  }
+    KMerIterator kmers(seq.c_str(), _ksize);
+    HashIntoType kmer_s;
+
+    while (!kmers.done()) {
+      kmer_s = kmers.next();
+      kmer = _hash(kmer_s.c_str(), ksize, kmer_f, kmer_r);
+      
+      find_all_tags(kmer_f, kmer_r, tagged_kmers, _ht->all_tags,
+          break_on_stoptags, stop_big_traversals);
+    }
 }
 
+void Hashtable::traverse_colors_and_resolve(const SeenSet& tagged_kmers,
+                                              ColorPtrSet& found_colors) {
+  
+  SeenSet::const_iterator si;
+  unsigned int num_colors = 0;
+  for (si=tagged_kmers.begin(); si!=tagged_kmers.end(); ++si) {
+    tag = *si;
+    // get the colors associated with this tag
+    num_colors = _get_tag_colors(tag, tag_colors, found_colors)
+    if (num_colors > 1) {
+      // reconcile colors
+    }
+  }
 }
 
 // vim: set sts=2 sw=2:
diff --git a/lib/hashtable.hh b/lib/hashtable.hh
index 897a5f184c..f339840bc8 100644
--- a/lib/hashtable.hh
+++ b/lib/hashtable.hh
@@ -188,7 +188,7 @@ namespace khmer {
                         Color& the_color)
     {
       std::pair<TagColorPtrPair::iterator, TagColorPtrPair::iterator> ret;
-      ret = cmap->equal_range(kmer);
+      ret = cmap.equal_range(kmer);
       for (TagColorPtrMap::iterator it=ret.first; it!=ret.second; ++it) {
         if (*(it->second) == the_color) return true;
       }
@@ -200,13 +200,39 @@ namespace khmer {
                             Color& the_color,
                             HashIntoType& kmer) {
       std::pair<ColorTagPtrPair:: iterator, ColorTagPtrPair::iterator> ret;
-      ret = cmap->equal_range(the_color);
+      ret = cmap.equal_range(the_color);
       for (ColorTagPtrMap::iterator it=ret.first; it!=ret.second; ++it) {
         if(*(it->second) == kmer) return true;
       }
       return false;
     }
     
+    unsigned int _get_tag_colors(const HashIntoType& tag,
+                          const TagColorPrtMap& cmap,
+                          ColorPtrSet& found_colors) {
+        unsigned int num_colors = 0;
+        std::pair<ColorTagPtrPair:: iterator, ColorTagPtrPair::iterator> ret;
+        ret = cmap.equal_range(tag);
+        for (TagColorPtrMap::iterator it=ret.first; it!=ret.second; ++it) {
+            found_colors.insert(it->second);
+            ++num_colors;
+        }
+        return num_colors;
+    }
+    
+    unsigned int _get_tags_from_color(const Color& color,
+                               const ColorTagPtrMap& cmap,
+                               TagPtrSet& colored_tags) {
+        unsigned int num_tags = 0;
+        std::pair<ColorTagPtrPair:: iterator, ColorTagPtrPair::iterator> ret;
+        ret = cmap.equal_range(color);
+        for (ColorTagPtrMap::iterator it=ret.first; it!=ret.second; ++it) {
+            color_tags.insert(it->second);
+            ++num_tags;
+        }
+        return num_tags;
+    }
+    
     Hashtable(
 	WordLength	ksize,
 	uint32_t const	number_of_threads   = 
diff --git a/lib/khmer.hh b/lib/khmer.hh
index 372e96ddea..e096919e62 100644
--- a/lib/khmer.hh
+++ b/lib/khmer.hh
@@ -88,9 +88,12 @@ namespace khmer {
   typedef std::map<unsigned long long, unsigned long long> PartitionCountDistribution;
   
   typedef unsigned int Color;
-  typedef std::multimap<HashIntoType, Color> TagColorMap;
+  typedef std::multimap<HashIntoType, Color*> TagColorPtrMap;
   typedef std::multimap<Color, HashIntoType*> ColorTagPtrMap;
-  typedef std::pair<HashIntoType, Color> TagColorPair;
+  typedef std::pair<HashIntoType, Color*> TagColorPtrPair;
+  typedef std::pair<Color, HashIntoType*> ColorTagPtrPair;
+  typedef std::set<Color*> ColorPtrSet;
+  typedef std::set<HashIntoType*> TagPtrSet;
 }
 
 #endif // KHMER_HH

From 24f738095bef986557e4e4856146f01e6ed1fbdc Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Mon, 9 Sep 2013 11:56:07 -0400
Subject: [PATCH 010/140] color sweep added

---
 lib/hashtable.cc | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/lib/hashtable.cc b/lib/hashtable.cc
index 211b0b418d..33765f23d1 100644
--- a/lib/hashtable.cc
+++ b/lib/hashtable.cc
@@ -2147,12 +2147,12 @@ void Hashtable::consume_sequence_and_tag_with_colors(const std::string& seq,
  */
 void Hashtable::sweep_sequence_for_colors(const std::string& seq,
 					unsigned long long& n_consumed,
-					SeenSet * found_tags,
+					ColorPtrSet& found_colors,
 					bool break_on_stoptags,
 					bool stop_big_traversals) {
 					
     SeenSet tagged_kmers;
-    ColorPtrSet found_colors;
+    //ColorPtrSet found_colors;
     
     const unsigned char ksize = _ht->ktsize();
     HashIntoType kmer_f, kmer_r, kmer;
@@ -2166,6 +2166,7 @@ void Hashtable::sweep_sequence_for_colors(const std::string& seq,
       
       find_all_tags(kmer_f, kmer_r, tagged_kmers, _ht->all_tags,
           break_on_stoptags, stop_big_traversals);
+      traverse_colors_and_resolve(tagged_kmers, found_colors);
     }
 }
 
@@ -2180,6 +2181,7 @@ void Hashtable::traverse_colors_and_resolve(const SeenSet& tagged_kmers,
     num_colors = _get_tag_colors(tag, tag_colors, found_colors)
     if (num_colors > 1) {
       // reconcile colors
+      // for now do nothing ha
     }
   }
 }

From a961a2122286698b3b2126330d1dc45f9ed359bc Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Mon, 9 Sep 2013 12:58:32 -0400
Subject: [PATCH 011/140] added parse function to python glue

---
 python/_khmermodule.cc | 33 ++++++++++++++++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/python/_khmermodule.cc b/python/_khmermodule.cc
index b0af974b3a..04c7ad568e 100644
--- a/python/_khmermodule.cc
+++ b/python/_khmermodule.cc
@@ -2195,6 +2195,37 @@ static PyObject * hash_do_subset_partition_with_abundance(PyObject * self, PyObj
   return (PyObject *) subset_obj;
 }
 
+static PyObject * hash_consume_fasta_and_tag_with_colors(PyObject * self, PyObject * args)
+{
+  khmer_KCountingHashObject * me = (khmer_KCountingHashObject *) self;
+  khmer::CountingHash * counting = me->counting;
+  
+  PyObject * callback_obs = NULL;
+  PyObject * rparser_obj = NULL;
+  
+  if (!PyArg_ParseType(args, "O|O", &rparser_obj, &callback_obj)) {
+    return NULL;
+  }
+  
+  khmer:: read_parsers:: IParser * rparser =
+  _PyObject_to_khmer_ReadParser( rparser_obj );
+  unsigned long long n_consumed;
+  unsigned int total_reads;
+  bool exc_raised = false;
+  
+  Py_BEGIN_ALLOW_THREADS
+  try {
+    counting->consume_fasta_and_tag_with_colors(rparser, total_reads, n_consumed,
+                                                _report_fn, callback_obj);
+  } catch (_khmer_signal &e) {
+    exc_raised = TRUE;
+  }
+  Py_END_ALLOW_THREADS
+  if (exc_raised) return NULL;
+  
+  return Py_BuildValue("iL", total_reads, n_consumed);
+  
+}
 
 static PyMethodDef khmer_counting_methods[] = {
   { "ksize", hash_get_ksize, METH_VARARGS, "" },
@@ -2232,7 +2263,7 @@ static PyMethodDef khmer_counting_methods[] = {
   { "consume_fasta_and_tag", hash_consume_fasta_and_tag, METH_VARARGS, "Count all k-mers in a given file" },
   { "do_subset_partition_with_abundance", hash_do_subset_partition_with_abundance, METH_VARARGS, "" },
   { "find_all_tags_truncate_on_abundance", hash_find_all_tags_truncate_on_abundance, METH_VARARGS, "" },
-
+  { "consume_fasta_and_tag_with_colors", hash_consume_fasta_and_tag_with_colors, METH_VARARGS, "" },
   {NULL, NULL, 0, NULL}           /* sentinel */
 };
 

From 1c07a014713ea22df2ff45238e35a71f2b95e956 Mon Sep 17 00:00:00 2001
From: CS Welcher <cs.welcher@gmail.com>
Date: Mon, 9 Sep 2013 13:29:38 -0400
Subject: [PATCH 012/140] added a spin lock for tag_colors (not strictly
 necessary, but ctb likes explicit things i hear)

---
 lib/hashtable.cc | 4 ++++
 lib/hashtable.hh | 9 ++++++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/lib/hashtable.cc b/lib/hashtable.cc
index 33765f23d1..4030ee8993 100644
--- a/lib/hashtable.cc
+++ b/lib/hashtable.cc
@@ -2097,7 +2097,9 @@ void Hashtable::consume_sequence_and_tag_with_colors(const std::string& seq,
 	      // TODO: MAKE THREADSAFE!
 	      
 	      if (!_cmap_contains_color(color_map, kmer, current_color)) {
+	        ACQUIRE_TAG_COLORS_SPIN_LOCK
 	        link_tag_and_color(kmer, current_color);
+	        RELEASE_TAG_COLORS_SPIN_LOCK
 	      }
 	      if (found_tags) {
 	        found_tags->insert(kmer);
@@ -2121,7 +2123,9 @@ void Hashtable::consume_sequence_and_tag_with_colors(const std::string& seq,
         
         // Coloring code
         // TODO: MAKE THREADSAFE!
+        ACQUIRE_TAG_COLORS_SPIN_LOCK
         link_tag_and_color(kmer, current_color)
+        RELEASE_TAG_COLORS_SPIN_LOCK
         
         if (found_tags) { found_tags->insert(kmer); }
         since = 1;
diff --git a/lib/hashtable.hh b/lib/hashtable.hh
index f339840bc8..68d6162459 100644
--- a/lib/hashtable.hh
+++ b/lib/hashtable.hh
@@ -252,6 +252,7 @@ namespace khmer {
       partition = new SubsetPartition(this);
       _init_bitstuff();
       _all_tags_spin_lock = 0;
+      _tag_colors_spin_lock = 0;
     }
 
     virtual ~Hashtable( )
@@ -364,7 +365,7 @@ namespace khmer {
     }
 
     uint32_t _all_tags_spin_lock;
-
+    uint32_t _tag_colors_spin_lock;
   public:
     SubsetPartition * partition;
     SeenSet all_tags;
@@ -626,4 +627,10 @@ namespace khmer {
 #define RELEASE_ALL_TAGS_SPIN_LOCK \
   __sync_bool_compare_and_swap( &_all_tags_spin_lock, 1, 0 );
 
+#define ACQUIRE_TAG_COLORS_SPIN_LOCK \
+  while(!__sync_bool_compare_and_swap( &_tag_colors_spin_lock, 0, 1));
+
+#define ACQUIRE_TAG_COLORS_SPIN_LOCK \
+  __sync_bool_compare_and_swap( &_tag_colors_spin_lock, 1, 0);
+
 #endif // HASHTABLE_HH

From e0f7dca0062237445f59d680f5cbeb526f66fcf6 Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Mon, 9 Sep 2013 14:46:07 -0400
Subject: [PATCH 013/140] adding sweep glue

---
 lib/hashtable.cc       |  1 -
 lib/hashtable.hh       |  7 +++++++
 python/_khmermodule.cc | 19 +++++++++++++++++++
 3 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/lib/hashtable.cc b/lib/hashtable.cc
index 4030ee8993..0af75db535 100644
--- a/lib/hashtable.cc
+++ b/lib/hashtable.cc
@@ -2150,7 +2150,6 @@ void Hashtable::consume_sequence_and_tag_with_colors(const std::string& seq,
  * THIS SUCKS AND IT'S YOUR FAULT @CTB
  */
 void Hashtable::sweep_sequence_for_colors(const std::string& seq,
-					unsigned long long& n_consumed,
 					ColorPtrSet& found_colors,
 					bool break_on_stoptags,
 					bool stop_big_traversals) {
diff --git a/lib/hashtable.hh b/lib/hashtable.hh
index 68d6162459..9931ee1da3 100644
--- a/lib/hashtable.hh
+++ b/lib/hashtable.hh
@@ -59,6 +59,13 @@ namespace khmer {
 
 	virtual void	accumulate_timer_deltas( uint32_t metrics_key );
 
+  };
+  
+  struct tag_color_info {
+    HashIntoType kmer;
+    SeenSet tagged_kmers;
+    
+  
   };
 
   //
diff --git a/python/_khmermodule.cc b/python/_khmermodule.cc
index 04c7ad568e..4efbe2ca8c 100644
--- a/python/_khmermodule.cc
+++ b/python/_khmermodule.cc
@@ -2227,6 +2227,25 @@ static PyObject * hash_consume_fasta_and_tag_with_colors(PyObject * self, PyObje
   
 }
 
+static PyObject * hash_sweep_sequence_for_colors(PyObject * self, PyObject * args) {
+  khmer_KCountingHashObject * me = (khmer_KCountingHashObject *) self;
+  khmer::CountingHash * counting = me->counting;
+  
+  char * seq = NULL;
+  bool break_on_stoptags = NULL;
+  bool stop_big_traversals = NULL;
+  
+  if(!PyArg_ParseTuple(args, "spp", &seq, &break_on_stoptags, &stop_big_traversals)) {
+    return NULL;
+  }
+  
+  if (strlen(kmer_s) < counting->ksize()) {
+    return NULL;
+  }
+  
+  _pre_partition_info
+}
+
 static PyMethodDef khmer_counting_methods[] = {
   { "ksize", hash_get_ksize, METH_VARARGS, "" },
   { "hashsizes", hash_get_hashsizes, METH_VARARGS, "" },

From a64264d60104a6554f82d8435c613de58e82b56c Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Mon, 9 Sep 2013 18:48:20 -0400
Subject: [PATCH 014/140] woot compiles and segfaults i win

---
 lib/hashtable.cc       | 35 ++++++++++++++++-------------
 lib/hashtable.hh       | 48 +++++++++++++++++++--------------------
 python/_khmermodule.cc | 51 ++++++++++++++++++++++++++++++------------
 3 files changed, 80 insertions(+), 54 deletions(-)

diff --git a/lib/hashtable.cc b/lib/hashtable.cc
index 0af75db535..2d6e74d38b 100644
--- a/lib/hashtable.cc
+++ b/lib/hashtable.cc
@@ -1943,7 +1943,7 @@ void Hashtable::extract_unique_paths(std::string seq,
       i++;
     }
   }
-  
+}
 /*
  * Pretty much copy-pasta
  * Might be time for a refactor: could do a general consume_fasta
@@ -1999,7 +1999,9 @@ Hashtable::consume_fasta_and_tag_with_colors(
       TraceLogger:: TLVL_DEBUG2,
       "Starting trace of 'consume_fasta_and_tag'....\n"
     );
-
+    
+    Color _tag_color = 0;
+    Color * the_color = new Color(_tag_color++);
     // Iterate through the reads and consume their k-mers.
     while (!parser->is_complete( ))
     {
@@ -2012,8 +2014,8 @@ Hashtable::consume_fasta_and_tag_with_colors(
         // TODO: make threadsafe!
         consume_sequence_and_tag_with_colors( read.sequence,
 					      this_n_consumed,
-					      _tag_color );
-        ++_tag_color;
+					      *the_color );
+        the_color = new Color(_tag_color++);
 
   #ifdef WITH_INTERNAL_METRICS
         hasher.pmetrics.start_timers( );
@@ -2059,7 +2061,10 @@ Hashtable::consume_fasta_and_tag_with_colors(
 
   }
 
-
+void Hashtable::link_tag_and_color(HashIntoType& kmer, Color& kmer_color) {
+  tag_colors.insert(TagColorPtrPair(kmer, &kmer_color));
+  color_tag_ptrs.insert(ColorTagPtrPair(kmer_color, &kmer));
+}
 /* This is essentially the same code as above, only it assigns colors to the
  * tags through multimap TagColorMap defined in hashtable.hh, declared in
  * hashbits.hh
@@ -2096,7 +2101,7 @@ void Hashtable::consume_sequence_and_tag_with_colors(const std::string& seq,
 	      // Coloring code
 	      // TODO: MAKE THREADSAFE!
 	      
-	      if (!_cmap_contains_color(color_map, kmer, current_color)) {
+	      if (!_cmap_contains_color(tag_colors, kmer, current_color)) {
 	        ACQUIRE_TAG_COLORS_SPIN_LOCK
 	        link_tag_and_color(kmer, current_color);
 	        RELEASE_TAG_COLORS_SPIN_LOCK
@@ -2124,7 +2129,7 @@ void Hashtable::consume_sequence_and_tag_with_colors(const std::string& seq,
         // Coloring code
         // TODO: MAKE THREADSAFE!
         ACQUIRE_TAG_COLORS_SPIN_LOCK
-        link_tag_and_color(kmer, current_color)
+        link_tag_and_color(kmer, current_color);
         RELEASE_TAG_COLORS_SPIN_LOCK
         
         if (found_tags) { found_tags->insert(kmer); }
@@ -2139,7 +2144,7 @@ void Hashtable::consume_sequence_and_tag_with_colors(const std::string& seq,
       RELEASE_ALL_TAGS_SPIN_LOCK
       
       // Color code: TODO: MAKE THREADSAFE!
-      link_tag_and_color(kmer, current_color)
+      link_tag_and_color(kmer, current_color);
       
       if (found_tags) { found_tags->insert(kmer); }
     }
@@ -2157,17 +2162,17 @@ void Hashtable::sweep_sequence_for_colors(const std::string& seq,
     SeenSet tagged_kmers;
     //ColorPtrSet found_colors;
     
-    const unsigned char ksize = _ht->ktsize();
     HashIntoType kmer_f, kmer_r, kmer;
     
     KMerIterator kmers(seq.c_str(), _ksize);
-    HashIntoType kmer_s;
+    std::string kmer_s;
 
     while (!kmers.done()) {
-      kmer_s = kmers.next();
-      kmer = _hash(kmer_s.c_str(), ksize, kmer_f, kmer_r);
+      kmer = kmers.next();
+      kmer_s = _revhash(kmer, _ksize);
+      _hash(kmer_s.c_str(), _ksize, kmer_f, kmer_r);
       
-      find_all_tags(kmer_f, kmer_r, tagged_kmers, _ht->all_tags,
+      partition->find_all_tags(kmer_f, kmer_r, tagged_kmers, all_tags,
           break_on_stoptags, stop_big_traversals);
       traverse_colors_and_resolve(tagged_kmers, found_colors);
     }
@@ -2179,9 +2184,9 @@ void Hashtable::traverse_colors_and_resolve(const SeenSet& tagged_kmers,
   SeenSet::const_iterator si;
   unsigned int num_colors = 0;
   for (si=tagged_kmers.begin(); si!=tagged_kmers.end(); ++si) {
-    tag = *si;
+    HashIntoType tag = *si;
     // get the colors associated with this tag
-    num_colors = _get_tag_colors(tag, tag_colors, found_colors)
+    num_colors = _get_tag_colors(tag, tag_colors, found_colors);
     if (num_colors > 1) {
       // reconcile colors
       // for now do nothing ha
diff --git a/lib/hashtable.hh b/lib/hashtable.hh
index 9931ee1da3..61c60a7869 100644
--- a/lib/hashtable.hh
+++ b/lib/hashtable.hh
@@ -59,13 +59,6 @@ namespace khmer {
 
 	virtual void	accumulate_timer_deltas( uint32_t metrics_key );
 
-  };
-  
-  struct tag_color_info {
-    HashIntoType kmer;
-    SeenSet tagged_kmers;
-    
-  
   };
 
   //
@@ -194,9 +187,9 @@ namespace khmer {
                         HashIntoType& kmer,
                         Color& the_color)
     {
-      std::pair<TagColorPtrPair::iterator, TagColorPtrPair::iterator> ret;
+      std::pair<TagColorPtrMap::const_iterator, TagColorPtrMap::const_iterator> ret;
       ret = cmap.equal_range(kmer);
-      for (TagColorPtrMap::iterator it=ret.first; it!=ret.second; ++it) {
+      for (TagColorPtrMap::const_iterator it=ret.first; it!=ret.second; ++it) {
         if (*(it->second) == the_color) return true;
       }
       return false;
@@ -206,21 +199,21 @@ namespace khmer {
     bool _cmap_contains_tag(const ColorTagPtrMap& cmap,
                             Color& the_color,
                             HashIntoType& kmer) {
-      std::pair<ColorTagPtrPair:: iterator, ColorTagPtrPair::iterator> ret;
+      std::pair<ColorTagPtrMap::const_iterator, ColorTagPtrMap::const_iterator> ret;
       ret = cmap.equal_range(the_color);
-      for (ColorTagPtrMap::iterator it=ret.first; it!=ret.second; ++it) {
+      for (ColorTagPtrMap::const_iterator it=ret.first; it!=ret.second; ++it) {
         if(*(it->second) == kmer) return true;
       }
       return false;
     }
     
     unsigned int _get_tag_colors(const HashIntoType& tag,
-                          const TagColorPrtMap& cmap,
+                          const TagColorPtrMap& cmap,
                           ColorPtrSet& found_colors) {
         unsigned int num_colors = 0;
-        std::pair<ColorTagPtrPair:: iterator, ColorTagPtrPair::iterator> ret;
+        std::pair<TagColorPtrMap::const_iterator, TagColorPtrMap::const_iterator> ret;
         ret = cmap.equal_range(tag);
-        for (TagColorPtrMap::iterator it=ret.first; it!=ret.second; ++it) {
+        for (TagColorPtrMap::const_iterator it=ret.first; it!=ret.second; ++it) {
             found_colors.insert(it->second);
             ++num_colors;
         }
@@ -231,10 +224,10 @@ namespace khmer {
                                const ColorTagPtrMap& cmap,
                                TagPtrSet& colored_tags) {
         unsigned int num_tags = 0;
-        std::pair<ColorTagPtrPair:: iterator, ColorTagPtrPair::iterator> ret;
+        std::pair<ColorTagPtrMap::const_iterator, ColorTagPtrMap::const_iterator> ret;
         ret = cmap.equal_range(color);
-        for (ColorTagPtrMap::iterator it=ret.first; it!=ret.second; ++it) {
-            color_tags.insert(it->second);
+        for (ColorTagPtrMap::const_iterator it=ret.first; it!=ret.second; ++it) {
+            colored_tags.insert(it->second);
             ++num_tags;
         }
         return num_tags;
@@ -378,8 +371,8 @@ namespace khmer {
     SeenSet all_tags;
     SeenSet stop_tags;
     SeenSet repart_small_tags;
-    TagColorMap tag_colors;
-    ColorTagPrtMap color_tag_ptrs;
+    TagColorPtrMap tag_colors;
+    ColorTagPtrMap color_tag_ptrs;
 
     // accessor to get 'k'
     const WordLength ksize() const { return _ksize; }
@@ -519,12 +512,17 @@ namespace khmer {
     void consume_sequence_and_tag_with_colors(const std::string& seq,
 					unsigned long long& n_consumed,
 					Color& current_color,
-					SeenSet * new_tags = 0)
+					SeenSet * new_tags = 0);
 
-    void link_tag_and_color(HashIntoType& kmer, Color& color) {
-        tag_colors.insert(TagColorPtrPair(kmer, &current_color));
-        color_tag_ptrs.insert(ColorTagPtrPair(current_color, &kmer));
-    }
+    void link_tag_and_color(HashIntoType& kmer, Color& color);
+    
+    void sweep_sequence_for_colors(const std::string& seq,
+					ColorPtrSet& found_colors,
+					bool break_on_stoptags,
+					bool stop_big_traversals);
+					
+	void traverse_colors_and_resolve(const SeenSet& tagged_kmers,
+                                     ColorPtrSet& found_colors);
 
     void consume_fasta_and_traverse(const std::string &filename,
 				    unsigned int distance,
@@ -637,7 +635,7 @@ namespace khmer {
 #define ACQUIRE_TAG_COLORS_SPIN_LOCK \
   while(!__sync_bool_compare_and_swap( &_tag_colors_spin_lock, 0, 1));
 
-#define ACQUIRE_TAG_COLORS_SPIN_LOCK \
+#define RELEASE_TAG_COLORS_SPIN_LOCK \
   __sync_bool_compare_and_swap( &_tag_colors_spin_lock, 1, 0);
 
 #endif // HASHTABLE_HH
diff --git a/python/_khmermodule.cc b/python/_khmermodule.cc
index 4efbe2ca8c..66089766c0 100644
--- a/python/_khmermodule.cc
+++ b/python/_khmermodule.cc
@@ -2195,15 +2195,15 @@ static PyObject * hash_do_subset_partition_with_abundance(PyObject * self, PyObj
   return (PyObject *) subset_obj;
 }
 
-static PyObject * hash_consume_fasta_and_tag_with_colors(PyObject * self, PyObject * args)
+static PyObject * hashbits_consume_fasta_and_tag_with_colors(PyObject * self, PyObject * args)
 {
-  khmer_KCountingHashObject * me = (khmer_KCountingHashObject *) self;
-  khmer::CountingHash * counting = me->counting;
+  khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
+  khmer::Hashbits * hb = me->hashbits;
   
-  PyObject * callback_obs = NULL;
+  PyObject * callback_obj = NULL;
   PyObject * rparser_obj = NULL;
   
-  if (!PyArg_ParseType(args, "O|O", &rparser_obj, &callback_obj)) {
+  if (!PyArg_ParseTuple(args, "O|O", &rparser_obj, &callback_obj)) {
     return NULL;
   }
   
@@ -2215,10 +2215,10 @@ static PyObject * hash_consume_fasta_and_tag_with_colors(PyObject * self, PyObje
   
   Py_BEGIN_ALLOW_THREADS
   try {
-    counting->consume_fasta_and_tag_with_colors(rparser, total_reads, n_consumed,
+    hb->consume_fasta_and_tag_with_colors(rparser, total_reads, n_consumed,
                                                 _report_fn, callback_obj);
   } catch (_khmer_signal &e) {
-    exc_raised = TRUE;
+    exc_raised = true;
   }
   Py_END_ALLOW_THREADS
   if (exc_raised) return NULL;
@@ -2227,9 +2227,9 @@ static PyObject * hash_consume_fasta_and_tag_with_colors(PyObject * self, PyObje
   
 }
 
-static PyObject * hash_sweep_sequence_for_colors(PyObject * self, PyObject * args) {
-  khmer_KCountingHashObject * me = (khmer_KCountingHashObject *) self;
-  khmer::CountingHash * counting = me->counting;
+static PyObject * hashbits_sweep_sequence_for_colors(PyObject * self, PyObject * args) {
+  khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
+  khmer::Hashbits * hb = me->hashbits;
   
   char * seq = NULL;
   bool break_on_stoptags = NULL;
@@ -2239,11 +2239,33 @@ static PyObject * hash_sweep_sequence_for_colors(PyObject * self, PyObject * arg
     return NULL;
   }
   
-  if (strlen(kmer_s) < counting->ksize()) {
+  if (strlen(seq) < hb->ksize()) {
     return NULL;
   }
   
-  _pre_partition_info
+  //std::pair<TagColorPtrPair::iterator, TagColorPtrPair::iterator> ret;
+  ColorPtrSet found_colors;
+  
+  bool exc_raised = false;
+  Py_BEGIN_ALLOW_THREADS
+  try {
+    hb->sweep_sequence_for_colors(seq, found_colors, break_on_stoptags, stop_big_traversals);
+  } catch (_khmer_signal &e) {
+    exc_raised = true;
+  }
+  Py_END_ALLOW_THREADS
+  
+  if (exc_raised) return NULL;
+  
+  PyObject * x =  PyList_New(found_colors.size());
+  khmer::ColorPtrSet::const_iterator si;
+  unsigned long long i = 0;
+  for (si=found_colors.begin(); si!=found_colors.end(); ++si) {
+    PyList_SET_ITEM(x, i, Py_BuildValue("i", *(*si)));
+    i++;
+  }
+  
+  return x;
 }
 
 static PyMethodDef khmer_counting_methods[] = {
@@ -2282,7 +2304,7 @@ static PyMethodDef khmer_counting_methods[] = {
   { "consume_fasta_and_tag", hash_consume_fasta_and_tag, METH_VARARGS, "Count all k-mers in a given file" },
   { "do_subset_partition_with_abundance", hash_do_subset_partition_with_abundance, METH_VARARGS, "" },
   { "find_all_tags_truncate_on_abundance", hash_find_all_tags_truncate_on_abundance, METH_VARARGS, "" },
-  { "consume_fasta_and_tag_with_colors", hash_consume_fasta_and_tag_with_colors, METH_VARARGS, "" },
+  
   {NULL, NULL, 0, NULL}           /* sentinel */
 };
 
@@ -3937,7 +3959,8 @@ static PyMethodDef khmer_hashbits_methods[] = {
   { "traverse_from_tags", hashbits_traverse_from_tags, METH_VARARGS, "" },
   { "repartition_largest_partition", hashbits_repartition_largest_partition, METH_VARARGS, "" },
   { "get_median_count", hashbits_get_median_count, METH_VARARGS, "Get the median, average, and stddev of the k-mer counts in the string" },
-
+  { "consume_fasta_and_tag_with_colors", hashbits_consume_fasta_and_tag_with_colors, METH_VARARGS, "" },
+  { "sweep_sequence_for_colors", hashbits_sweep_sequence_for_colors, METH_VARARGS, "" },
   {NULL, NULL, 0, NULL}           /* sentinel */
 };
 

From db935a1e5b3a0c1725502278336423a41b1f10ca Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Tue, 10 Sep 2013 18:13:17 -0400
Subject: [PATCH 015/140] consuming, coloring, and sweeping functions
 successfully integrated with python glue, appear to be outputting correct
 colors

---
 lib/hashbits.hh        |   4 -
 lib/hashtable.cc       |   4 +
 lib/hashtable.hh       |   2 +-
 python/_khmermodule.cc | 177 ++++++++++++++++++++++++-----------------
 4 files changed, 109 insertions(+), 78 deletions(-)

diff --git a/lib/hashbits.hh b/lib/hashbits.hh
index 6c0f19698c..a695fb30da 100644
--- a/lib/hashbits.hh
+++ b/lib/hashbits.hh
@@ -22,8 +22,6 @@ namespace khmer {
 	HashIntoType _n_overlap_kmers;
     Byte ** _counts;
 
-    Color _tag_color;
-
     virtual void _allocate_counters() {
       _n_tables = _tablesizes.size();
 
@@ -50,8 +48,6 @@ namespace khmer {
       _n_unique_kmers = 0;
       _n_overlap_kmers = 0;
 
-      _tag_color = 0;
-
       _allocate_counters();
     }
 
diff --git a/lib/hashtable.cc b/lib/hashtable.cc
index 2d6e74d38b..00052cbdb1 100644
--- a/lib/hashtable.cc
+++ b/lib/hashtable.cc
@@ -1950,6 +1950,10 @@ void Hashtable::extract_unique_paths(std::string seq,
  * function which accepts a consume_sequence function pointer as a parameter
  */
 
+void Hashtable::do_nothing() {
+  std::cout << "doing nothing\n";
+}
+
 void
 Hashtable::consume_fasta_and_tag_with_colors(
   std:: string const  &filename,
diff --git a/lib/hashtable.hh b/lib/hashtable.hh
index 61c60a7869..8655751ad7 100644
--- a/lib/hashtable.hh
+++ b/lib/hashtable.hh
@@ -493,7 +493,7 @@ namespace khmer {
 					     CallbackFn callback = 0,
 					     void * callback_data = 0);
 					     
-
+    void do_nothing();
     
     void consume_fasta_and_tag_with_colors(
                         std::string const	  &filename,
diff --git a/python/_khmermodule.cc b/python/_khmermodule.cc
index 66089766c0..dc8aff3932 100644
--- a/python/_khmermodule.cc
+++ b/python/_khmermodule.cc
@@ -2195,79 +2195,6 @@ static PyObject * hash_do_subset_partition_with_abundance(PyObject * self, PyObj
   return (PyObject *) subset_obj;
 }
 
-static PyObject * hashbits_consume_fasta_and_tag_with_colors(PyObject * self, PyObject * args)
-{
-  khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
-  khmer::Hashbits * hb = me->hashbits;
-  
-  PyObject * callback_obj = NULL;
-  PyObject * rparser_obj = NULL;
-  
-  if (!PyArg_ParseTuple(args, "O|O", &rparser_obj, &callback_obj)) {
-    return NULL;
-  }
-  
-  khmer:: read_parsers:: IParser * rparser =
-  _PyObject_to_khmer_ReadParser( rparser_obj );
-  unsigned long long n_consumed;
-  unsigned int total_reads;
-  bool exc_raised = false;
-  
-  Py_BEGIN_ALLOW_THREADS
-  try {
-    hb->consume_fasta_and_tag_with_colors(rparser, total_reads, n_consumed,
-                                                _report_fn, callback_obj);
-  } catch (_khmer_signal &e) {
-    exc_raised = true;
-  }
-  Py_END_ALLOW_THREADS
-  if (exc_raised) return NULL;
-  
-  return Py_BuildValue("iL", total_reads, n_consumed);
-  
-}
-
-static PyObject * hashbits_sweep_sequence_for_colors(PyObject * self, PyObject * args) {
-  khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
-  khmer::Hashbits * hb = me->hashbits;
-  
-  char * seq = NULL;
-  bool break_on_stoptags = NULL;
-  bool stop_big_traversals = NULL;
-  
-  if(!PyArg_ParseTuple(args, "spp", &seq, &break_on_stoptags, &stop_big_traversals)) {
-    return NULL;
-  }
-  
-  if (strlen(seq) < hb->ksize()) {
-    return NULL;
-  }
-  
-  //std::pair<TagColorPtrPair::iterator, TagColorPtrPair::iterator> ret;
-  ColorPtrSet found_colors;
-  
-  bool exc_raised = false;
-  Py_BEGIN_ALLOW_THREADS
-  try {
-    hb->sweep_sequence_for_colors(seq, found_colors, break_on_stoptags, stop_big_traversals);
-  } catch (_khmer_signal &e) {
-    exc_raised = true;
-  }
-  Py_END_ALLOW_THREADS
-  
-  if (exc_raised) return NULL;
-  
-  PyObject * x =  PyList_New(found_colors.size());
-  khmer::ColorPtrSet::const_iterator si;
-  unsigned long long i = 0;
-  for (si=found_colors.begin(); si!=found_colors.end(); ++si) {
-    PyList_SET_ITEM(x, i, Py_BuildValue("i", *(*si)));
-    i++;
-  }
-  
-  return x;
-}
-
 static PyMethodDef khmer_counting_methods[] = {
   { "ksize", hash_get_ksize, METH_VARARGS, "" },
   { "hashsizes", hash_get_hashsizes, METH_VARARGS, "" },
@@ -3891,6 +3818,109 @@ static PyObject * hashbits_get_median_count(PyObject * self, PyObject * args)
   return Py_BuildValue("iff", med, average, stddev);
 }
 
+static PyObject * hashbits_consume_fasta_and_tag_with_colors(PyObject * self, PyObject * args)
+{
+  khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
+  khmer::Hashbits * hb = me->hashbits;
+  
+  std::ofstream outfile;
+  outfile.open("lazyoutput.txt");
+  outfile << ">> we're in c++ land folks\n";
+  
+  char * filename;
+  PyObject * callback_obj = NULL;
+
+  if (!PyArg_ParseTuple(args, "s|O", &filename, &callback_obj)) {
+    return NULL;
+  }
+  
+  unsigned long long n_consumed;
+  unsigned int total_reads;
+  bool exc_raised = false;
+  
+  outfile << ">> about to start the tagging function...\n";
+  outfile.close();
+  //Py_BEGIN_ALLOW_THREADS
+  try {
+    hb->consume_fasta_and_tag_with_colors(filename, total_reads, n_consumed,
+                                                _report_fn, callback_obj);
+  } catch (_khmer_signal &e) {
+    exc_raised = true;
+  }
+  //Py_END_ALLOW_THREADS
+  if (exc_raised) return NULL;
+  
+  return Py_BuildValue("iL", total_reads, n_consumed);
+  
+}
+
+static PyObject * hashbits_sweep_sequence_for_colors(PyObject * self, PyObject * args) {
+  khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
+  khmer::Hashbits * hb = me->hashbits;
+  
+  char * seq = NULL;
+  PyObject * break_on_stop_tags_o = NULL;
+  PyObject * stop_big_traversals_o = NULL;
+
+  if (!PyArg_ParseTuple(args, "s|OO", &seq,
+			&break_on_stop_tags_o,
+			&stop_big_traversals_o)) {
+    return NULL;
+  }
+
+  bool break_on_stop_tags = false;
+  if (break_on_stop_tags_o && PyObject_IsTrue(break_on_stop_tags_o)) {
+    break_on_stop_tags = true;
+  }
+  bool stop_big_traversals = false;
+  if (stop_big_traversals_o && PyObject_IsTrue(stop_big_traversals_o)) {
+    stop_big_traversals = true;
+  }
+  
+  if (strlen(seq) < hb->ksize()) {
+    return NULL;
+  }
+  
+  //std::pair<TagColorPtrPair::iterator, TagColorPtrPair::iterator> ret;
+  ColorPtrSet found_colors;
+  
+  bool exc_raised = false;
+  //Py_BEGIN_ALLOW_THREADS
+  try {
+    hb->sweep_sequence_for_colors(seq, found_colors, break_on_stop_tags, stop_big_traversals);
+  } catch (_khmer_signal &e) {
+    exc_raised = true;
+  }
+  //Py_END_ALLOW_THREADS
+  
+  if (exc_raised) return NULL;
+  
+  PyObject * x =  PyList_New(found_colors.size());
+  khmer::ColorPtrSet::const_iterator si;
+  unsigned long long i = 0;
+  for (si=found_colors.begin(); si!=found_colors.end(); ++si) {
+    PyList_SET_ITEM(x, i, Py_BuildValue("i", *(*si)));
+    i++;
+  }
+  
+  return x;
+}
+
+static PyObject * hashbits_do_nothing(PyObject * self, PyObject * args) {
+  khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
+  khmer::Hashbits * hb = me->hashbits;
+  
+  bool exc_raised = false;
+  try {
+    hb->do_nothing();
+  } catch (_khmer_signal &e) {
+    exc_raised = true;
+  }
+  if (exc_raised) return NULL;
+  
+  return Py_True;
+}
+
 static PyMethodDef khmer_hashbits_methods[] = {
   { "extract_unique_paths", hashbits_extract_unique_paths, METH_VARARGS, "" },
   { "ksize", hashbits_get_ksize, METH_VARARGS, "" },
@@ -3961,6 +3991,7 @@ static PyMethodDef khmer_hashbits_methods[] = {
   { "get_median_count", hashbits_get_median_count, METH_VARARGS, "Get the median, average, and stddev of the k-mer counts in the string" },
   { "consume_fasta_and_tag_with_colors", hashbits_consume_fasta_and_tag_with_colors, METH_VARARGS, "" },
   { "sweep_sequence_for_colors", hashbits_sweep_sequence_for_colors, METH_VARARGS, "" },
+  { "do_nothing", hashbits_do_nothing, METH_VARARGS, ""},
   {NULL, NULL, 0, NULL}           /* sentinel */
 };
 

From e6ba5464b1fac3bd583e6f390841f4b3858ade02 Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Tue, 10 Sep 2013 18:35:25 -0400
Subject: [PATCH 016/140] silly testing python script added (need to add nose
 tests)

---
 lib/test_coloring.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)
 create mode 100644 lib/test_coloring.py

diff --git a/lib/test_coloring.py b/lib/test_coloring.py
new file mode 100644
index 0000000000..11811debfd
--- /dev/null
+++ b/lib/test_coloring.py
@@ -0,0 +1,12 @@
+import khmer
+print khmer.__file__
+
+
+ht = khmer.new_hashbits(20,1e8,4)
+print '#' * 200
+ht.consume_fasta_and_tag_with_colors('/w/2013-lamprey/syn_part/syn.trinity.fasta')
+print ht.sweep_sequence_for_colors('CACACACGGACATCGGAGAGAGGCTGAGACAGCGAGACACACAGAGACAGAGCGGAGAGGGCACAGACAGACAAGAGCATGAGAGATCGGCAGAGCGGTG', False, False)
+print ht.sweep_sequence_for_colors('CGCCGTAGTCGTACTGGTTCTCCTCCGTGTACTCGTGCGCTGCCTCCACCTCTGGGCTGCTCATGCCCTCCATGTGACCTTCAGGCATGCCCTCGGAGAT', False, False)
+print ht.sweep_sequence_for_colors('GGAGAGCCTGGGGCCAAGCCCGAGGGCATGCCTGAAGGTCACATGGAGGGCATGAGCAGCCCAG', False, False)
+print ht.sweep_sequence_for_colors('TTTTTTGAATACGTTTAGTTAATATTTGTACTTCAATTAATAAAAATTTGCTATAATTTTTCCATTATCGCCAGTCACTCGCGTGATATAGGAAAAGGTT', False, False)
+print ht.sweep_sequence_for_colors('AAGCAGTGGTATCAACGCAGAGTACGCGGGGACTCTGTCGCTGCTCCTCTAGCACAGAGAGCCAGAGACGGCTTACAGCAGCAGCATCATATAGCCTC', False, False)

From 0e350c073e2a7d79db4190becf011f97809cc4c5 Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Fri, 13 Sep 2013 15:36:43 -0400
Subject: [PATCH 017/140] added function to tag fasta by partition id

---
 lib/hashtable.cc       | 54 ++++++++++++++++++++++++++++++++++++++++++
 lib/hashtable.hh       |  8 ++++++-
 lib/test_coloring.py   | 41 +++++++++++++++++++++++++++-----
 python/_khmermodule.cc | 34 ++++++++++++++++++++++----
 4 files changed, 126 insertions(+), 11 deletions(-)

diff --git a/lib/hashtable.cc b/lib/hashtable.cc
index 00052cbdb1..d7dc85f3da 100644
--- a/lib/hashtable.cc
+++ b/lib/hashtable.cc
@@ -2065,10 +2065,64 @@ Hashtable::consume_fasta_and_tag_with_colors(
 
   }
 
+void Hashtable::consume_partitioned_fasta_and_tag_with_colors(const std::string &filename,
+					  unsigned int &total_reads,
+					  unsigned long long &n_consumed,
+					  CallbackFn callback,
+					  void * callback_data)
+{
+  total_reads = 0;
+  n_consumed = 0;
+
+  IParser* parser = IParser::get_parser(filename.c_str());
+  Read read;
+
+  string seq = "";
+
+  // reset the master subset partition
+  delete partition;
+  partition = new SubsetPartition(this);
+
+  //
+  // iterate through the FASTA file & consume the reads.
+  //
+  Color * c;
+  while(!parser->is_complete())  {
+    read = parser->get_next_read();
+    seq = read.sequence;
+
+    if (check_and_normalize_read(seq)) {
+      // First, figure out what the partition is (if non-zero), and save that.
+      c = new Color(_parse_partition_id(read.name));
+
+      consume_sequence_and_tag_with_colors( seq,
+					      n_consumed,
+					      *c );
+    }
+	       
+    // reset the sequence info, increment read number
+    total_reads++;
+
+    // run callback, if specified
+    if (total_reads % CALLBACK_PERIOD == 0 && callback) {
+      try {
+        callback("consume_partitioned_fasta_and_tag_with_colors", callback_data, 
+        total_reads, n_consumed);
+      } catch (...) {
+	delete parser;
+        throw;
+      }
+    }
+  }
+
+  delete parser;
+}
+
 void Hashtable::link_tag_and_color(HashIntoType& kmer, Color& kmer_color) {
   tag_colors.insert(TagColorPtrPair(kmer, &kmer_color));
   color_tag_ptrs.insert(ColorTagPtrPair(kmer_color, &kmer));
 }
+
 /* This is essentially the same code as above, only it assigns colors to the
  * tags through multimap TagColorMap defined in hashtable.hh, declared in
  * hashbits.hh
diff --git a/lib/hashtable.hh b/lib/hashtable.hh
index 8655751ad7..f2564ba0bf 100644
--- a/lib/hashtable.hh
+++ b/lib/hashtable.hh
@@ -508,7 +508,13 @@ namespace khmer {
 	                unsigned long long  &n_consumed,
 	                CallbackFn	    callback	    = NULL,
 	                void *		    callback_data   = NULL);
-				  
+	                
+    void consume_partitioned_fasta_and_tag_with_colors(const std::string &filename,
+					  unsigned int &total_reads,
+					  unsigned long long &n_consumed,
+					  CallbackFn callback,
+					  void * callback_data);
+					  			  
     void consume_sequence_and_tag_with_colors(const std::string& seq,
 					unsigned long long& n_consumed,
 					Color& current_color,
diff --git a/lib/test_coloring.py b/lib/test_coloring.py
index 11811debfd..da8c5eff08 100644
--- a/lib/test_coloring.py
+++ b/lib/test_coloring.py
@@ -1,12 +1,41 @@
 import khmer
-print khmer.__file__
+import screed
 
 
 ht = khmer.new_hashbits(20,1e8,4)
 print '#' * 200
 ht.consume_fasta_and_tag_with_colors('/w/2013-lamprey/syn_part/syn.trinity.fasta')
-print ht.sweep_sequence_for_colors('CACACACGGACATCGGAGAGAGGCTGAGACAGCGAGACACACAGAGACAGAGCGGAGAGGGCACAGACAGACAAGAGCATGAGAGATCGGCAGAGCGGTG', False, False)
-print ht.sweep_sequence_for_colors('CGCCGTAGTCGTACTGGTTCTCCTCCGTGTACTCGTGCGCTGCCTCCACCTCTGGGCTGCTCATGCCCTCCATGTGACCTTCAGGCATGCCCTCGGAGAT', False, False)
-print ht.sweep_sequence_for_colors('GGAGAGCCTGGGGCCAAGCCCGAGGGCATGCCTGAAGGTCACATGGAGGGCATGAGCAGCCCAG', False, False)
-print ht.sweep_sequence_for_colors('TTTTTTGAATACGTTTAGTTAATATTTGTACTTCAATTAATAAAAATTTGCTATAATTTTTCCATTATCGCCAGTCACTCGCGTGATATAGGAAAAGGTT', False, False)
-print ht.sweep_sequence_for_colors('AAGCAGTGGTATCAACGCAGAGTACGCGGGGACTCTGTCGCTGCTCCTCTAGCACAGAGAGCCAGAGACGGCTTACAGCAGCAGCATCATATAGCCTC', False, False)
+#print ht.sweep_sequence_for_colors('CACACACGGACATCGGAGAGAGGCTGAGACAGCGAGACACACAGAGACAGAGCGGAGAGGGCACAGACAGACAAGAGCATGAGAGATCGGCAGAGCGGTG', False, False)
+#print ht.sweep_sequence_for_colors('CGCCGTAGTCGTACTGGTTCTCCTCCGTGTACTCGTGCGCTGCCTCCACCTCTGGGCTGCTCATGCCCTCCATGTGACCTTCAGGCATGCCCTCGGAGAT', False, False)
+#print ht.sweep_sequence_for_colors('GGAGAGCCTGGGGCCAAGCCCGAGGGCATGCCTGAAGGTCACATGGAGGGCATGAGCAGCCCAG', False, False)
+#print ht.sweep_sequence_for_colors('TTTTTTGAATACGTTTAGTTAATATTTGTACTTCAATTAATAAAAATTTGCTATAATTTTTCCATTATCGCCAGTCACTCGCGTGATATAGGAAAAGGTT', False, False)
+#print ht.sweep_sequence_for_colors('AAGCAGTGGTATCAACGCAGAGTACGCGGGGACTCTGTCGCTGCTCCTCTAGCACAGAGAGCCAGAGACGGCTTACAGCAGCAGCATCATATAGCCTC', False, False)
+
+N=1000000000
+
+'''
+file_pointers = {}
+for n, record in enumerate(screed.open('/w/2013-lamprey/syn_part/syn.sweep.fa')):
+    if n >= N:
+        break
+    if n % 1000 == 0:
+        print '...processed {} reads'.format(n)
+    colors = ht.sweep_sequence_for_colors(record.sequence, False, False)
+    for c in colors:
+        if c in file_pointers.viewkeys():
+            file_pointers[c].write('>{}\n{}\n'.format(record.name, record.sequence))
+        else:
+            file_pointers[c] = open('color_{}.fa'.format(c), 'wb')
+            file_pointers[c].write('>{}\n{}\n'.format(record.name, record.sequence))\
+'''
+
+ht = khmer.new_hashbits(25, 1e9,4)
+ht.consume_partitioned_fasta_and_tag_with_colors('/w/2013-lamprey/test.fp')
+
+for n, record in enumerate(screed.open('/w/lamprey-mrnaseq/reads/single/L82-a.fq.gz')):
+    if n >= N:
+        break
+    colors = ht.sweep_sequence_for_colors(record.sequence, False,  False)
+    if colors:
+        print colors
+
diff --git a/python/_khmermodule.cc b/python/_khmermodule.cc
index dc8aff3932..be0ecb640e 100644
--- a/python/_khmermodule.cc
+++ b/python/_khmermodule.cc
@@ -3824,8 +3824,6 @@ static PyObject * hashbits_consume_fasta_and_tag_with_colors(PyObject * self, Py
   khmer::Hashbits * hb = me->hashbits;
   
   std::ofstream outfile;
-  outfile.open("lazyoutput.txt");
-  outfile << ">> we're in c++ land folks\n";
   
   char * filename;
   PyObject * callback_obj = NULL;
@@ -3838,8 +3836,6 @@ static PyObject * hashbits_consume_fasta_and_tag_with_colors(PyObject * self, Py
   unsigned int total_reads;
   bool exc_raised = false;
   
-  outfile << ">> about to start the tagging function...\n";
-  outfile.close();
   //Py_BEGIN_ALLOW_THREADS
   try {
     hb->consume_fasta_and_tag_with_colors(filename, total_reads, n_consumed,
@@ -3854,6 +3850,34 @@ static PyObject * hashbits_consume_fasta_and_tag_with_colors(PyObject * self, Py
   
 }
 
+static PyObject * hashbits_consume_partitioned_fasta_and_tag_with_colors(
+                                            PyObject * self, PyObject * args)
+{
+  khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
+  khmer::Hashbits * hashbits = me->hashbits;
+
+  char * filename;
+  PyObject * callback_obj = NULL;
+
+  if (!PyArg_ParseTuple(args, "s|O", &filename, &callback_obj)) {
+    return NULL;
+  }
+
+  // call the C++ function, and trap signals => Python
+
+  unsigned long long n_consumed;
+  unsigned int total_reads;
+
+  try {
+    hashbits->consume_partitioned_fasta_and_tag_with_colors(filename, 
+    total_reads, n_consumed, _report_fn, callback_obj);
+  } catch (_khmer_signal &e) {
+    return NULL;
+  }
+
+  return Py_BuildValue("iL", total_reads, n_consumed);
+}
+
 static PyObject * hashbits_sweep_sequence_for_colors(PyObject * self, PyObject * args) {
   khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
   khmer::Hashbits * hb = me->hashbits;
@@ -3992,6 +4016,8 @@ static PyMethodDef khmer_hashbits_methods[] = {
   { "consume_fasta_and_tag_with_colors", hashbits_consume_fasta_and_tag_with_colors, METH_VARARGS, "" },
   { "sweep_sequence_for_colors", hashbits_sweep_sequence_for_colors, METH_VARARGS, "" },
   { "do_nothing", hashbits_do_nothing, METH_VARARGS, ""},
+  {"consume_partitioned_fasta_and_tag_with_colors", hashbits_consume_partitioned_fasta_and_tag_with_colors, METH_VARARGS, "" },
+  
   {NULL, NULL, 0, NULL}           /* sentinel */
 };
 

From 2b1b6b2907f5380262bebc4642967ee2e594160b Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Sun, 15 Sep 2013 21:50:21 -0400
Subject: [PATCH 018/140] a few comments on @cswelcher code

---
 lib/hashtable.cc | 3 +++
 lib/hashtable.hh | 1 +
 2 files changed, 4 insertions(+)

diff --git a/lib/hashtable.cc b/lib/hashtable.cc
index d7dc85f3da..773b515463 100644
--- a/lib/hashtable.cc
+++ b/lib/hashtable.cc
@@ -2005,6 +2005,9 @@ Hashtable::consume_fasta_and_tag_with_colors(
     );
     
     Color _tag_color = 0;
+
+    // @CTB: pls keep increment separate from function call so that
+    // order is explicit.
     Color * the_color = new Color(_tag_color++);
     // Iterate through the reads and consume their k-mers.
     while (!parser->is_complete( ))
diff --git a/lib/hashtable.hh b/lib/hashtable.hh
index f2564ba0bf..bdcb303c95 100644
--- a/lib/hashtable.hh
+++ b/lib/hashtable.hh
@@ -493,6 +493,7 @@ namespace khmer {
 					     CallbackFn callback = 0,
 					     void * callback_data = 0);
 					     
+    // @CTB ahem?
     void do_nothing();
     
     void consume_fasta_and_tag_with_colors(

From 67c1b458ddb223c50a36a5fc069827e1f58775b5 Mon Sep 17 00:00:00 2001
From: CS Welcher <cs.welcher@gmail.com>
Date: Wed, 18 Sep 2013 16:39:57 -0700
Subject: [PATCH 019/140] exposing more stuff for python glue

---
 lib/hashtable.cc | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/lib/hashtable.cc b/lib/hashtable.cc
index d7dc85f3da..ba7fb25368 100644
--- a/lib/hashtable.cc
+++ b/lib/hashtable.cc
@@ -2236,6 +2236,14 @@ void Hashtable::sweep_sequence_for_colors(const std::string& seq,
     }
 }
 
+ColorPtrSet& Hashtable::get_tag_colors(const HashIntoType& tag) {
+  ColorPtrSet colors;
+  num_colors = _get_tag_colors(tag, tag_colors, colors);
+  return &colors;
+}
+
+
+
 void Hashtable::traverse_colors_and_resolve(const SeenSet& tagged_kmers,
                                               ColorPtrSet& found_colors) {
   

From d948a6e97eb3758b04e104563f9a2693fef180ea Mon Sep 17 00:00:00 2001
From: CS Welcher <cs.welcher@gmail.com>
Date: Thu, 19 Sep 2013 14:20:39 -0700
Subject: [PATCH 020/140] added more glue functions

---
 lib/color_tst.py       | 41 +++++++++++++++++++++++++++++++++
 lib/hashtable.cc       | 14 ++++++++----
 lib/hashtable.hh       |  5 ++++-
 lib/khmer.hh           |  6 +++++
 python/_khmermodule.cc | 51 ++++++++++++++++++++++++++++++++++++++++--
 5 files changed, 110 insertions(+), 7 deletions(-)
 create mode 100644 lib/color_tst.py

diff --git a/lib/color_tst.py b/lib/color_tst.py
new file mode 100644
index 0000000000..da8c5eff08
--- /dev/null
+++ b/lib/color_tst.py
@@ -0,0 +1,41 @@
+import khmer
+import screed
+
+
+ht = khmer.new_hashbits(20,1e8,4)
+print '#' * 200
+ht.consume_fasta_and_tag_with_colors('/w/2013-lamprey/syn_part/syn.trinity.fasta')
+#print ht.sweep_sequence_for_colors('CACACACGGACATCGGAGAGAGGCTGAGACAGCGAGACACACAGAGACAGAGCGGAGAGGGCACAGACAGACAAGAGCATGAGAGATCGGCAGAGCGGTG', False, False)
+#print ht.sweep_sequence_for_colors('CGCCGTAGTCGTACTGGTTCTCCTCCGTGTACTCGTGCGCTGCCTCCACCTCTGGGCTGCTCATGCCCTCCATGTGACCTTCAGGCATGCCCTCGGAGAT', False, False)
+#print ht.sweep_sequence_for_colors('GGAGAGCCTGGGGCCAAGCCCGAGGGCATGCCTGAAGGTCACATGGAGGGCATGAGCAGCCCAG', False, False)
+#print ht.sweep_sequence_for_colors('TTTTTTGAATACGTTTAGTTAATATTTGTACTTCAATTAATAAAAATTTGCTATAATTTTTCCATTATCGCCAGTCACTCGCGTGATATAGGAAAAGGTT', False, False)
+#print ht.sweep_sequence_for_colors('AAGCAGTGGTATCAACGCAGAGTACGCGGGGACTCTGTCGCTGCTCCTCTAGCACAGAGAGCCAGAGACGGCTTACAGCAGCAGCATCATATAGCCTC', False, False)
+
+N=1000000000
+
+'''
+file_pointers = {}
+for n, record in enumerate(screed.open('/w/2013-lamprey/syn_part/syn.sweep.fa')):
+    if n >= N:
+        break
+    if n % 1000 == 0:
+        print '...processed {} reads'.format(n)
+    colors = ht.sweep_sequence_for_colors(record.sequence, False, False)
+    for c in colors:
+        if c in file_pointers.viewkeys():
+            file_pointers[c].write('>{}\n{}\n'.format(record.name, record.sequence))
+        else:
+            file_pointers[c] = open('color_{}.fa'.format(c), 'wb')
+            file_pointers[c].write('>{}\n{}\n'.format(record.name, record.sequence))\
+'''
+
+ht = khmer.new_hashbits(25, 1e9,4)
+ht.consume_partitioned_fasta_and_tag_with_colors('/w/2013-lamprey/test.fp')
+
+for n, record in enumerate(screed.open('/w/lamprey-mrnaseq/reads/single/L82-a.fq.gz')):
+    if n >= N:
+        break
+    colors = ht.sweep_sequence_for_colors(record.sequence, False,  False)
+    if colors:
+        print colors
+
diff --git a/lib/hashtable.cc b/lib/hashtable.cc
index 986a7698f0..6fa5525909 100644
--- a/lib/hashtable.cc
+++ b/lib/hashtable.cc
@@ -2239,13 +2239,19 @@ void Hashtable::sweep_sequence_for_colors(const std::string& seq,
     }
 }
 
-ColorPtrSet& Hashtable::get_tag_colors(const HashIntoType& tag) {
+ColorPtrSet Hashtable::get_tag_colors(const HashIntoType& tag) {
   ColorPtrSet colors;
-  num_colors = _get_tag_colors(tag, tag_colors, colors);
-  return &colors;
+  unsigned int num_colors;
+  _get_tag_colors(tag, tag_colors, colors);
+  return colors;
 }
 
-
+TagPtrSet Hashtable::get_color_tags(const Color& color) {
+  TagPtrSet tags;
+  unsigned int num_tags;
+  _get_tags_from_color(color, color_tag_ptrs, tags);
+  return tags;
+}
 
 void Hashtable::traverse_colors_and_resolve(const SeenSet& tagged_kmers,
                                               ColorPtrSet& found_colors) {
diff --git a/lib/hashtable.hh b/lib/hashtable.hh
index bdcb303c95..c7e09be11f 100644
--- a/lib/hashtable.hh
+++ b/lib/hashtable.hh
@@ -520,6 +520,9 @@ namespace khmer {
 					unsigned long long& n_consumed,
 					Color& current_color,
 					SeenSet * new_tags = 0);
+    
+    ColorPtrSet get_tag_colors(const HashIntoType& tag);
+    TagPtrSet get_color_tags(const Color& color);
 
     void link_tag_and_color(HashIntoType& kmer, Color& color);
     
@@ -528,7 +531,7 @@ namespace khmer {
 					bool break_on_stoptags,
 					bool stop_big_traversals);
 					
-	void traverse_colors_and_resolve(const SeenSet& tagged_kmers,
+    void traverse_colors_and_resolve(const SeenSet& tagged_kmers,
                                      ColorPtrSet& found_colors);
 
     void consume_fasta_and_traverse(const std::string &filename,
diff --git a/lib/khmer.hh b/lib/khmer.hh
index e096919e62..2bb275eab9 100644
--- a/lib/khmer.hh
+++ b/lib/khmer.hh
@@ -94,6 +94,12 @@ namespace khmer {
   typedef std::pair<Color, HashIntoType*> ColorTagPtrPair;
   typedef std::set<Color*> ColorPtrSet;
   typedef std::set<HashIntoType*> TagPtrSet;
+
+  Template <typename T>
+  void deallocate_ptr_set(T& s) {
+    for (typename T::iterator i = c.begin(); i != c.end(); ++i)
+      delete *i;
+  }
 }
 
 #endif // KHMER_HH
diff --git a/python/_khmermodule.cc b/python/_khmermodule.cc
index be0ecb640e..67d5fb1cca 100644
--- a/python/_khmermodule.cc
+++ b/python/_khmermodule.cc
@@ -2991,7 +2991,7 @@ static PyObject * hashbits_consume_partitioned_fasta(PyObject * self, PyObject *
   try {
     hashbits->consume_partitioned_fasta(filename, total_reads, n_consumed,
 					 _report_fn, callback_obj);
-  } catch (_khmer_signal &e) {
+  } catch (_khmer_signal) {
     return NULL;
   }
 
@@ -3945,6 +3945,52 @@ static PyObject * hashbits_do_nothing(PyObject * self, PyObject * args) {
   return Py_True;
 }
 
+// Same as find_all_tags, but returns tags in a way actually useable by python
+static PyObject * hashbits_get_all_tags(PyObject * self, PyObject *args)
+{
+  khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
+  khmer::Hashbits * hashbits = me->hashbits;
+
+  char * kmer_s = NULL;
+
+  if (!PyArg_ParseTuple(args, "s", &kmer_s)) {
+    return NULL;
+  }
+
+  if (strlen(kmer_s) < hashbits->ksize()) { // @@
+    return NULL;
+  }
+
+  khmer::SeenSet tagged_kmers;
+  
+  //Py_BEGIN_ALLOW_THREADS
+
+    khmer::HashIntoType kmer, kmer_f, kmer_r;
+    kmer = khmer::_hash(kmer_s, hashbits->ksize(), kmer_f, kmer_r);
+
+    hashbits->partition->find_all_tags(kmer_f, kmer_r, tagged_kmers,
+				       hashbits->all_tags);
+    hashbits->add_kmer_to_tags(kmer);
+
+  //Py_END_ALLOW_THREADS
+
+  PyObject * x =  PyList_New(tagged_kmers.size());
+  khmer::SeenSet::const_iterator si;
+  unsigned long long i = 0;
+  for (si=tagged_kmers.begin(); si!=tagged_kmers.end(); ++si) {
+    //std::string kmer_s = _revhash(*si, hashbits->ksize());
+    PyList_SET_ITEM(x, i, Py_BuildValue("i", *si));
+    i++;
+  }
+
+  return x;
+}
+
+
+static PyObject * hashbits_get_tag_colors(PyObject * self, PyObject * args) {
+  return Py_True;
+}
+
 static PyMethodDef khmer_hashbits_methods[] = {
   { "extract_unique_paths", hashbits_extract_unique_paths, METH_VARARGS, "" },
   { "ksize", hashbits_get_ksize, METH_VARARGS, "" },
@@ -4017,7 +4063,8 @@ static PyMethodDef khmer_hashbits_methods[] = {
   { "sweep_sequence_for_colors", hashbits_sweep_sequence_for_colors, METH_VARARGS, "" },
   { "do_nothing", hashbits_do_nothing, METH_VARARGS, ""},
   {"consume_partitioned_fasta_and_tag_with_colors", hashbits_consume_partitioned_fasta_and_tag_with_colors, METH_VARARGS, "" },
-  
+  {"get_all_tags", hashbits_get_all_tags, METH_VARARGS, "" },
+ 
   {NULL, NULL, 0, NULL}           /* sentinel */
 };
 

From ba9cab4eb2359b574690f5b592ee932e48317a19 Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Tue, 24 Sep 2013 16:50:28 -0400
Subject: [PATCH 021/140] finished tag export to Python and switched tag/color
 conversion to unsigned long long; added get_tag_colors; removed do_nothing

---
 lib/hashtable.cc       |  5 +----
 lib/hashtable.hh       |  3 ---
 python/_khmermodule.cc | 49 +++++++++++++++++++++++++-----------------
 3 files changed, 30 insertions(+), 27 deletions(-)

diff --git a/lib/hashtable.cc b/lib/hashtable.cc
index 6fa5525909..21b1927304 100644
--- a/lib/hashtable.cc
+++ b/lib/hashtable.cc
@@ -1946,14 +1946,11 @@ void Hashtable::extract_unique_paths(std::string seq,
 }
 /*
  * Pretty much copy-pasta
+ * @cswelcher
  * Might be time for a refactor: could do a general consume_fasta
  * function which accepts a consume_sequence function pointer as a parameter
  */
 
-void Hashtable::do_nothing() {
-  std::cout << "doing nothing\n";
-}
-
 void
 Hashtable::consume_fasta_and_tag_with_colors(
   std:: string const  &filename,
diff --git a/lib/hashtable.hh b/lib/hashtable.hh
index c7e09be11f..739115038b 100644
--- a/lib/hashtable.hh
+++ b/lib/hashtable.hh
@@ -492,9 +492,6 @@ namespace khmer {
 					     unsigned long long &n_consumed,
 					     CallbackFn callback = 0,
 					     void * callback_data = 0);
-					     
-    // @CTB ahem?
-    void do_nothing();
     
     void consume_fasta_and_tag_with_colors(
                         std::string const	  &filename,
diff --git a/python/_khmermodule.cc b/python/_khmermodule.cc
index 67d5fb1cca..3e8831a69d 100644
--- a/python/_khmermodule.cc
+++ b/python/_khmermodule.cc
@@ -3923,28 +3923,13 @@ static PyObject * hashbits_sweep_sequence_for_colors(PyObject * self, PyObject *
   khmer::ColorPtrSet::const_iterator si;
   unsigned long long i = 0;
   for (si=found_colors.begin(); si!=found_colors.end(); ++si) {
-    PyList_SET_ITEM(x, i, Py_BuildValue("i", *(*si)));
+    PyList_SET_ITEM(x, i, Py_BuildValue("K", *(*si)));
     i++;
   }
   
   return x;
 }
 
-static PyObject * hashbits_do_nothing(PyObject * self, PyObject * args) {
-  khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
-  khmer::Hashbits * hb = me->hashbits;
-  
-  bool exc_raised = false;
-  try {
-    hb->do_nothing();
-  } catch (_khmer_signal &e) {
-    exc_raised = true;
-  }
-  if (exc_raised) return NULL;
-  
-  return Py_True;
-}
-
 // Same as find_all_tags, but returns tags in a way actually useable by python
 static PyObject * hashbits_get_all_tags(PyObject * self, PyObject *args)
 {
@@ -3970,7 +3955,6 @@ static PyObject * hashbits_get_all_tags(PyObject * self, PyObject *args)
 
     hashbits->partition->find_all_tags(kmer_f, kmer_r, tagged_kmers,
 				       hashbits->all_tags);
-    hashbits->add_kmer_to_tags(kmer);
 
   //Py_END_ALLOW_THREADS
 
@@ -3979,7 +3963,8 @@ static PyObject * hashbits_get_all_tags(PyObject * self, PyObject *args)
   unsigned long long i = 0;
   for (si=tagged_kmers.begin(); si!=tagged_kmers.end(); ++si) {
     //std::string kmer_s = _revhash(*si, hashbits->ksize());
-    PyList_SET_ITEM(x, i, Py_BuildValue("i", *si));
+    // type K for python unsigned long long
+    PyList_SET_ITEM(x, i, Py_BuildValue("K", *si));
     i++;
   }
 
@@ -3988,7 +3973,31 @@ static PyObject * hashbits_get_all_tags(PyObject * self, PyObject *args)
 
 
 static PyObject * hashbits_get_tag_colors(PyObject * self, PyObject * args) {
-  return Py_True;
+  
+  khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
+  khmer::Hashbits * hashbits = me->hashbits;
+  
+  khmer::HashIntoType tag;
+  
+  if (!PyArg_ParseTuple(args, "K", &tag)) {
+    return NULL;
+  }
+  
+  khmer::ColorPtrSet colors;
+  
+  colors = hashbits->get_tag_colors(tag);
+  
+  PyObject * x =  PyList_New(colors.size());
+  khmer::ColorPtrSet::const_iterator si;
+  unsigned long long i = 0;
+  for (si=colors.begin(); si!=colors.end(); ++si) {
+    //std::string kmer_s = _revhash(*si, hashbits->ksize());
+    PyList_SET_ITEM(x, i, Py_BuildValue("K", *(*si)));
+    i++;
+  }
+
+  return x;
+  
 }
 
 static PyMethodDef khmer_hashbits_methods[] = {
@@ -4061,9 +4070,9 @@ static PyMethodDef khmer_hashbits_methods[] = {
   { "get_median_count", hashbits_get_median_count, METH_VARARGS, "Get the median, average, and stddev of the k-mer counts in the string" },
   { "consume_fasta_and_tag_with_colors", hashbits_consume_fasta_and_tag_with_colors, METH_VARARGS, "" },
   { "sweep_sequence_for_colors", hashbits_sweep_sequence_for_colors, METH_VARARGS, "" },
-  { "do_nothing", hashbits_do_nothing, METH_VARARGS, ""},
   {"consume_partitioned_fasta_and_tag_with_colors", hashbits_consume_partitioned_fasta_and_tag_with_colors, METH_VARARGS, "" },
   {"get_all_tags", hashbits_get_all_tags, METH_VARARGS, "" },
+  {"get_tag_colors", hashbits_get_tag_colors, METH_VARARGS, ""},
  
   {NULL, NULL, 0, NULL}           /* sentinel */
 };

From 90f9dba602e5ba5fed2ad20ca0c7c4f5cbbd8dae Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Tue, 24 Sep 2013 17:37:45 -0400
Subject: [PATCH 022/140] reworking get_all_tags to walk every k-mer of a
 sequence with optional stop flags; temporarily broken (does not compile)

---
 python/_khmermodule.cc | 39 ++++++++++++++++++++++++++++++---------
 1 file changed, 30 insertions(+), 9 deletions(-)

diff --git a/python/_khmermodule.cc b/python/_khmermodule.cc
index 3e8831a69d..d8b5e9e998 100644
--- a/python/_khmermodule.cc
+++ b/python/_khmermodule.cc
@@ -3931,30 +3931,51 @@ static PyObject * hashbits_sweep_sequence_for_colors(PyObject * self, PyObject *
 }
 
 // Same as find_all_tags, but returns tags in a way actually useable by python
+// @cswelcher TODO: this is broken az, fix it asap
+// need a tags_in_sequence iterator or function in c++ land for reuse in all
+// these functions
 static PyObject * hashbits_get_all_tags(PyObject * self, PyObject *args)
 {
   khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
   khmer::Hashbits * hashbits = me->hashbits;
 
-  char * kmer_s = NULL;
+  std::string seq = NULL;
+  PyObject * break_on_stop_tags_o = NULL;
+  PyObject * stop_big_traversals_o = NULL;
 
-  if (!PyArg_ParseTuple(args, "s", &kmer_s)) {
+  if (!PyArg_ParseTuple(args, "s|OO", &seq,
+			&break_on_stop_tags_o,
+			&stop_big_traversals_o)) {
     return NULL;
   }
 
-  if (strlen(kmer_s) < hashbits->ksize()) { // @@
+  bool break_on_stop_tags = false;
+  if (break_on_stop_tags_o && PyObject_IsTrue(break_on_stop_tags_o)) {
+    break_on_stop_tags = true;
+  }
+  bool stop_big_traversals = false;
+  if (stop_big_traversals_o && PyObject_IsTrue(stop_big_traversals_o)) {
+    stop_big_traversals = true;
+  }
+  
+  if (strlen(seq) < hashbits->ksize()) {
     return NULL;
   }
 
   khmer::SeenSet tagged_kmers;
-  
+  khmer::HashIntoType kmer_f, kmer_r, kmer;
+  KMerIterator kmers(seq.c_str(), hashbits->_ksize());
+  std::string kmer_s;
   //Py_BEGIN_ALLOW_THREADS
 
-    khmer::HashIntoType kmer, kmer_f, kmer_r;
-    kmer = khmer::_hash(kmer_s, hashbits->ksize(), kmer_f, kmer_r);
-
-    hashbits->partition->find_all_tags(kmer_f, kmer_r, tagged_kmers,
-				       hashbits->all_tags);
+    while (!kmers.done()) {
+      kmer = kmers.next();
+      kmer_s = khmer::_revhash(kmer, hashbits->(_ksize));
+      kmer = khmer::_hash(kmer_s.c_str(), hashbits->_ksize(), kmer_f, kmer_r);
+      
+      hashbits->partition->find_all_tags(kmer_f, kmer_r, tagged_kmers, 
+            hashbits->all_tags, break_on_stoptags, stop_big_traversals);
+    }
 
   //Py_END_ALLOW_THREADS
 

From 56894425d132f0231f6b4bbc3ee75b28551a54d3 Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Wed, 25 Sep 2013 11:55:23 -0400
Subject: [PATCH 023/140] fixed issues with get_all_tags

---
 python/_khmermodule.cc | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/python/_khmermodule.cc b/python/_khmermodule.cc
index d8b5e9e998..2cf16e4645 100644
--- a/python/_khmermodule.cc
+++ b/python/_khmermodule.cc
@@ -3939,7 +3939,7 @@ static PyObject * hashbits_get_all_tags(PyObject * self, PyObject *args)
   khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
   khmer::Hashbits * hashbits = me->hashbits;
 
-  std::string seq = NULL;
+  char * seq = NULL;
   PyObject * break_on_stop_tags_o = NULL;
   PyObject * stop_big_traversals_o = NULL;
 
@@ -3964,17 +3964,17 @@ static PyObject * hashbits_get_all_tags(PyObject * self, PyObject *args)
 
   khmer::SeenSet tagged_kmers;
   khmer::HashIntoType kmer_f, kmer_r, kmer;
-  KMerIterator kmers(seq.c_str(), hashbits->_ksize());
+  KMerIterator kmers(seq, hashbits->ksize());
   std::string kmer_s;
   //Py_BEGIN_ALLOW_THREADS
 
     while (!kmers.done()) {
       kmer = kmers.next();
-      kmer_s = khmer::_revhash(kmer, hashbits->(_ksize));
-      kmer = khmer::_hash(kmer_s.c_str(), hashbits->_ksize(), kmer_f, kmer_r);
+      kmer_s = khmer::_revhash(kmer, hashbits->ksize());
+      kmer = khmer::_hash(kmer_s.c_str(), hashbits->ksize(), kmer_f, kmer_r);
       
       hashbits->partition->find_all_tags(kmer_f, kmer_r, tagged_kmers, 
-            hashbits->all_tags, break_on_stoptags, stop_big_traversals);
+            hashbits->all_tags, break_on_stop_tags, stop_big_traversals);
     }
 
   //Py_END_ALLOW_THREADS

From 18ebd4a7bf17caa34e6ef6066aa8a83f67ef89eb Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Wed, 25 Sep 2013 12:00:13 -0400
Subject: [PATCH 024/140] moved _tag_color increment out of the Color
 constructor call, per CTB's comment

---
 lib/hashtable.cc | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/lib/hashtable.cc b/lib/hashtable.cc
index 21b1927304..dfa985dabf 100644
--- a/lib/hashtable.cc
+++ b/lib/hashtable.cc
@@ -2005,7 +2005,7 @@ Hashtable::consume_fasta_and_tag_with_colors(
 
     // @CTB: pls keep increment separate from function call so that
     // order is explicit.
-    Color * the_color = new Color(_tag_color++);
+    Color * the_color = new Color(_tag_color);
     // Iterate through the reads and consume their k-mers.
     while (!parser->is_complete( ))
     {
@@ -2019,7 +2019,8 @@ Hashtable::consume_fasta_and_tag_with_colors(
         consume_sequence_and_tag_with_colors( read.sequence,
 					      this_n_consumed,
 					      *the_color );
-        the_color = new Color(_tag_color++);
+	    _tag_color++;
+        the_color = new Color(_tag_color);
 
   #ifdef WITH_INTERNAL_METRICS
         hasher.pmetrics.start_timers( );

From 22333f25b7e19d17f85f3075d75bb5617761eee2 Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Wed, 25 Sep 2013 13:35:26 -0400
Subject: [PATCH 025/140] added n_colors python function

---
 lib/hashtable.hh       |  1 +
 python/_khmermodule.cc | 14 +++++++++++++-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/lib/hashtable.hh b/lib/hashtable.hh
index 739115038b..9d4589c41a 100644
--- a/lib/hashtable.hh
+++ b/lib/hashtable.hh
@@ -452,6 +452,7 @@ namespace khmer {
     // Partitioning stuff.
 
     unsigned int n_tags() const { return all_tags.size(); }
+    unsigned int N-colors() const { return tag_colors.size(); }
 
     void divide_tags_into_subsets(unsigned int subset_size, SeenSet& divvy);
 
diff --git a/python/_khmermodule.cc b/python/_khmermodule.cc
index 2cf16e4645..975a045cb5 100644
--- a/python/_khmermodule.cc
+++ b/python/_khmermodule.cc
@@ -4018,7 +4018,18 @@ static PyObject * hashbits_get_tag_colors(PyObject * self, PyObject * args) {
   }
 
   return x;
-  
+}
+
+static PyObject * hashbits_n_colors(PyObject * self, PyObject * args)
+{
+  khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
+  khmer::Hashbits * hashbits = me->hashbits;
+
+  if (!PyArg_ParseTuple(args, "")) {
+    return NULL;
+  }
+
+  return PyInt_FromLong(hashbits->n_colors());
 }
 
 static PyMethodDef khmer_hashbits_methods[] = {
@@ -4094,6 +4105,7 @@ static PyMethodDef khmer_hashbits_methods[] = {
   {"consume_partitioned_fasta_and_tag_with_colors", hashbits_consume_partitioned_fasta_and_tag_with_colors, METH_VARARGS, "" },
   {"get_all_tags", hashbits_get_all_tags, METH_VARARGS, "" },
   {"get_tag_colors", hashbits_get_tag_colors, METH_VARARGS, ""},
+  {"n_colors", hashbits_n_colors, METH_VARARGS, ""},
  
   {NULL, NULL, 0, NULL}           /* sentinel */
 };

From b51df4776e314997d4710363f42ef65d0f398985 Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Wed, 25 Sep 2013 13:52:49 -0400
Subject: [PATCH 026/140] added test_consume_fasta_and_tag_with_colors, test
 passes

---
 lib/hashtable.cc       |  2 --
 lib/hashtable.hh       |  2 +-
 lib/test_coloring.py   | 41 -----------------------------------------
 tests/test_hashbits.py | 14 ++++++++++++++
 4 files changed, 15 insertions(+), 44 deletions(-)
 delete mode 100644 lib/test_coloring.py

diff --git a/lib/hashtable.cc b/lib/hashtable.cc
index dfa985dabf..e3a84260d0 100644
--- a/lib/hashtable.cc
+++ b/lib/hashtable.cc
@@ -2003,8 +2003,6 @@ Hashtable::consume_fasta_and_tag_with_colors(
     
     Color _tag_color = 0;
 
-    // @CTB: pls keep increment separate from function call so that
-    // order is explicit.
     Color * the_color = new Color(_tag_color);
     // Iterate through the reads and consume their k-mers.
     while (!parser->is_complete( ))
diff --git a/lib/hashtable.hh b/lib/hashtable.hh
index 9d4589c41a..c662cc0f5b 100644
--- a/lib/hashtable.hh
+++ b/lib/hashtable.hh
@@ -452,7 +452,7 @@ namespace khmer {
     // Partitioning stuff.
 
     unsigned int n_tags() const { return all_tags.size(); }
-    unsigned int N-colors() const { return tag_colors.size(); }
+    unsigned int n_colors() const { return tag_colors.size(); }
 
     void divide_tags_into_subsets(unsigned int subset_size, SeenSet& divvy);
 
diff --git a/lib/test_coloring.py b/lib/test_coloring.py
deleted file mode 100644
index da8c5eff08..0000000000
--- a/lib/test_coloring.py
+++ /dev/null
@@ -1,41 +0,0 @@
-import khmer
-import screed
-
-
-ht = khmer.new_hashbits(20,1e8,4)
-print '#' * 200
-ht.consume_fasta_and_tag_with_colors('/w/2013-lamprey/syn_part/syn.trinity.fasta')
-#print ht.sweep_sequence_for_colors('CACACACGGACATCGGAGAGAGGCTGAGACAGCGAGACACACAGAGACAGAGCGGAGAGGGCACAGACAGACAAGAGCATGAGAGATCGGCAGAGCGGTG', False, False)
-#print ht.sweep_sequence_for_colors('CGCCGTAGTCGTACTGGTTCTCCTCCGTGTACTCGTGCGCTGCCTCCACCTCTGGGCTGCTCATGCCCTCCATGTGACCTTCAGGCATGCCCTCGGAGAT', False, False)
-#print ht.sweep_sequence_for_colors('GGAGAGCCTGGGGCCAAGCCCGAGGGCATGCCTGAAGGTCACATGGAGGGCATGAGCAGCCCAG', False, False)
-#print ht.sweep_sequence_for_colors('TTTTTTGAATACGTTTAGTTAATATTTGTACTTCAATTAATAAAAATTTGCTATAATTTTTCCATTATCGCCAGTCACTCGCGTGATATAGGAAAAGGTT', False, False)
-#print ht.sweep_sequence_for_colors('AAGCAGTGGTATCAACGCAGAGTACGCGGGGACTCTGTCGCTGCTCCTCTAGCACAGAGAGCCAGAGACGGCTTACAGCAGCAGCATCATATAGCCTC', False, False)
-
-N=1000000000
-
-'''
-file_pointers = {}
-for n, record in enumerate(screed.open('/w/2013-lamprey/syn_part/syn.sweep.fa')):
-    if n >= N:
-        break
-    if n % 1000 == 0:
-        print '...processed {} reads'.format(n)
-    colors = ht.sweep_sequence_for_colors(record.sequence, False, False)
-    for c in colors:
-        if c in file_pointers.viewkeys():
-            file_pointers[c].write('>{}\n{}\n'.format(record.name, record.sequence))
-        else:
-            file_pointers[c] = open('color_{}.fa'.format(c), 'wb')
-            file_pointers[c].write('>{}\n{}\n'.format(record.name, record.sequence))\
-'''
-
-ht = khmer.new_hashbits(25, 1e9,4)
-ht.consume_partitioned_fasta_and_tag_with_colors('/w/2013-lamprey/test.fp')
-
-for n, record in enumerate(screed.open('/w/lamprey-mrnaseq/reads/single/L82-a.fq.gz')):
-    if n >= N:
-        break
-    colors = ht.sweep_sequence_for_colors(record.sequence, False,  False)
-    if colors:
-        print colors
-
diff --git a/tests/test_hashbits.py b/tests/test_hashbits.py
index da7cb27643..6e8e92fa71 100644
--- a/tests/test_hashbits.py
+++ b/tests/test_hashbits.py
@@ -500,3 +500,17 @@ def test_simple_median():
     assert median == 1
     assert average == 1.0
     assert stddev == 0.0
+
+def test_consume_fasta_and_tag_with_colors():
+    hb = khmer.new_hashbits(20, 1e7, 4)
+    
+    filename = utils.get_test_data('test-transcript.fa')
+    total_reads, n_consumed = hb.consume_fasta_and_tag_with_colors(filename)
+    
+    #assert n_consumed == 3
+    assert total_reads == 3
+    
+    assert hb.n_colors() == 3
+    
+    
+    

From d131776756a90d9bb8e1cb468d8b593bcb35401e Mon Sep 17 00:00:00 2001
From: CS Welcher <cs.welcher@gmail.com>
Date: Wed, 25 Sep 2013 16:18:44 -0400
Subject: [PATCH 027/140] fixed consume_partitioned_fasta_and_tag_with_colors
 to properly check for color existence and use color pointers

---
 lib/hashtable.cc | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/lib/hashtable.cc b/lib/hashtable.cc
index e3a84260d0..ae20429ea3 100644
--- a/lib/hashtable.cc
+++ b/lib/hashtable.cc
@@ -2085,14 +2085,21 @@ void Hashtable::consume_partitioned_fasta_and_tag_with_colors(const std::string
   //
   // iterate through the FASTA file & consume the reads.
   //
+  ColorPtrMap colors;
   Color * c;
+  PartitionID p;
   while(!parser->is_complete())  {
     read = parser->get_next_read();
     seq = read.sequence;
 
     if (check_and_normalize_read(seq)) {
       // First, figure out what the partition is (if non-zero), and save that.
-      c = new Color(_parse_partition_id(read.name));
+      p = _parse_partition_id(read.name);
+      if (colors.count(p)) {
+	c = colors[p];
+      } else {
+	c = new Color(p);
+      }
 
       consume_sequence_and_tag_with_colors( seq,
 					      n_consumed,
@@ -2114,9 +2121,11 @@ void Hashtable::consume_partitioned_fasta_and_tag_with_colors(const std::string
     }
   }
 
+  // @cswelcher TODO: deallocate ColorPtrMap
   delete parser;
 }
 
+// @cswelcher: double-check -- is it valid to pull the address from a reference?
 void Hashtable::link_tag_and_color(HashIntoType& kmer, Color& kmer_color) {
   tag_colors.insert(TagColorPtrPair(kmer, &kmer_color));
   color_tag_ptrs.insert(ColorTagPtrPair(kmer_color, &kmer));
@@ -2168,7 +2177,6 @@ void Hashtable::consume_sequence_and_tag_with_colors(const std::string& seq,
 	      }
         }  else ++since;
       }
-      // Should I bother adding new code down here?
   #else
       if (!is_new_kmer && set_contains(all_tags, kmer)) {
         since = 1;

From afd1792025cacc79d14e00567ac202143ed26631 Mon Sep 17 00:00:00 2001
From: CS Welcher <cs.welcher@gmail.com>
Date: Wed, 25 Sep 2013 16:21:35 -0400
Subject: [PATCH 028/140] added delete of temp colorptrmap

---
 lib/hashtable.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/hashtable.cc b/lib/hashtable.cc
index ae20429ea3..80f33c93ca 100644
--- a/lib/hashtable.cc
+++ b/lib/hashtable.cc
@@ -2121,8 +2121,9 @@ void Hashtable::consume_partitioned_fasta_and_tag_with_colors(const std::string
     }
   }
 
-  // @cswelcher TODO: deallocate ColorPtrMap
+  // @cswelcher TODO: check that deallocate ColorPtrMap is correct
   delete parser;
+  delete colors;
 }
 
 // @cswelcher: double-check -- is it valid to pull the address from a reference?

From 0f10ae5aaa4d8aa9ae051241a4f80eb9dec9b08a Mon Sep 17 00:00:00 2001
From: CS Welcher <cs.welcher@gmail.com>
Date: Thu, 26 Sep 2013 01:41:19 -0400
Subject: [PATCH 029/140] added ColorPtrMap typedef to khmer.hh

---
 lib/khmer.hh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/khmer.hh b/lib/khmer.hh
index 2bb275eab9..c02998d68d 100644
--- a/lib/khmer.hh
+++ b/lib/khmer.hh
@@ -94,6 +94,7 @@ namespace khmer {
   typedef std::pair<Color, HashIntoType*> ColorTagPtrPair;
   typedef std::set<Color*> ColorPtrSet;
   typedef std::set<HashIntoType*> TagPtrSet;
+  typedef std::map<Color, Color*> ColorPtrMap;
 
   Template <typename T>
   void deallocate_ptr_set(T& s) {

From 6043518e1c4135860c505df8960fa32e12397402 Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Thu, 26 Sep 2013 01:43:55 -0400
Subject: [PATCH 030/140] fixed typos in deallocate_ptr_set template; widened
 Color to unsigned long long

---
 lib/khmer.hh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/lib/khmer.hh b/lib/khmer.hh
index 2bb275eab9..c23a2cce71 100644
--- a/lib/khmer.hh
+++ b/lib/khmer.hh
@@ -87,7 +87,7 @@ namespace khmer {
   typedef std::map<PartitionID, unsigned int> PartitionCountMap;
   typedef std::map<unsigned long long, unsigned long long> PartitionCountDistribution;
   
-  typedef unsigned int Color;
+  typedef unsigned long long int Color;
   typedef std::multimap<HashIntoType, Color*> TagColorPtrMap;
   typedef std::multimap<Color, HashIntoType*> ColorTagPtrMap;
   typedef std::pair<HashIntoType, Color*> TagColorPtrPair;
@@ -95,9 +95,9 @@ namespace khmer {
   typedef std::set<Color*> ColorPtrSet;
   typedef std::set<HashIntoType*> TagPtrSet;
 
-  Template <typename T>
+  template <typename T>
   void deallocate_ptr_set(T& s) {
-    for (typename T::iterator i = c.begin(); i != c.end(); ++i)
+    for (typename T::iterator i = s.begin(); i != s.end(); ++i)
       delete *i;
   }
 }

From eb944968be4494cd8398f822cd3d40329c3d1685 Mon Sep 17 00:00:00 2001
From: CS <cs.welcher@gmail.com>
Date: Thu, 26 Sep 2013 01:49:18 -0400
Subject: [PATCH 031/140] changed delete of tmp colorptrmap to clear

---
 lib/hashtable.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/hashtable.cc b/lib/hashtable.cc
index 80f33c93ca..0d729864eb 100644
--- a/lib/hashtable.cc
+++ b/lib/hashtable.cc
@@ -2123,7 +2123,7 @@ void Hashtable::consume_partitioned_fasta_and_tag_with_colors(const std::string
 
   // @cswelcher TODO: check that deallocate ColorPtrMap is correct
   delete parser;
-  delete colors;
+  colors.clear();
 }
 
 // @cswelcher: double-check -- is it valid to pull the address from a reference?

From 3684974c3a219bdc59b6bf9030654cf9fef4b8a6 Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Thu, 26 Sep 2013 01:51:05 -0400
Subject: [PATCH 032/140] updated color_tst, remove soon

---
 lib/color_tst.py | 40 ++++++++++++++++++++++++++++++++++++----
 1 file changed, 36 insertions(+), 4 deletions(-)

diff --git a/lib/color_tst.py b/lib/color_tst.py
index da8c5eff08..dcac725ec0 100644
--- a/lib/color_tst.py
+++ b/lib/color_tst.py
@@ -1,17 +1,49 @@
 import khmer
 import screed
 
+def reverse_comp(s):
+    ret = ''
+    for i in range(len(s)-1,-1,-1):
+        c = s[i]
+        if c == 'A':
+            ret += 'T'
+        elif c == 'T':
+            ret += 'A'
+        elif c == 'G':
+            ret += 'C'
+        else:
+            ret += 'G'
+    return ret
 
 ht = khmer.new_hashbits(20,1e8,4)
 print '#' * 200
-ht.consume_fasta_and_tag_with_colors('/w/2013-lamprey/syn_part/syn.trinity.fasta')
+ht.consume_fasta_and_tag_with_colors('../tests/test-data/test-reads.fa')
 #print ht.sweep_sequence_for_colors('CACACACGGACATCGGAGAGAGGCTGAGACAGCGAGACACACAGAGACAGAGCGGAGAGGGCACAGACAGACAAGAGCATGAGAGATCGGCAGAGCGGTG', False, False)
 #print ht.sweep_sequence_for_colors('CGCCGTAGTCGTACTGGTTCTCCTCCGTGTACTCGTGCGCTGCCTCCACCTCTGGGCTGCTCATGCCCTCCATGTGACCTTCAGGCATGCCCTCGGAGAT', False, False)
 #print ht.sweep_sequence_for_colors('GGAGAGCCTGGGGCCAAGCCCGAGGGCATGCCTGAAGGTCACATGGAGGGCATGAGCAGCCCAG', False, False)
 #print ht.sweep_sequence_for_colors('TTTTTTGAATACGTTTAGTTAATATTTGTACTTCAATTAATAAAAATTTGCTATAATTTTTCCATTATCGCCAGTCACTCGCGTGATATAGGAAAAGGTT', False, False)
 #print ht.sweep_sequence_for_colors('AAGCAGTGGTATCAACGCAGAGTACGCGGGGACTCTGTCGCTGCTCCTCTAGCACAGAGAGCCAGAGACGGCTTACAGCAGCAGCATCATATAGCCTC', False, False)
 
-N=1000000000
+t0 = 'CCATGTAGCGCCGCACACCTTTGTAGGTGTTGTAATAATCTTCGATGACTTTCTTCGCTTCCTGACGGCTTATGCC'
+t1 = 'ACCGCGCGCGAATCGACGGTTGTCAGCCAAAGGCGTTCAACACCAGCACCGCCCTTAAGCCGCCCGCCCGCCGCCC'
+N = 1000
+
+for n, record in enumerate(screed.open('../tests/test-data/test-reads.fa')):
+    if n > N:
+        break
+    print '*' * 40
+    seq = record.sequence
+    print seq
+    colors = ht.sweep_sequence_for_colors(seq, False, False)
+    print 'colors from sweep:', colors
+    tags = ht.get_all_tags(seq)
+    print 'tags from get_all_tags:', tags
+    print 'colors from get_tag_colors:'
+    t_colors = set()
+    for tag in tags:
+        t_colors.update(ht.get_tag_colors(tag))
+    print t_colors
+    assert len(t_colors) == len(colors)
 
 '''
 file_pointers = {}
@@ -28,7 +60,7 @@
             file_pointers[c] = open('color_{}.fa'.format(c), 'wb')
             file_pointers[c].write('>{}\n{}\n'.format(record.name, record.sequence))\
 '''
-
+'''
 ht = khmer.new_hashbits(25, 1e9,4)
 ht.consume_partitioned_fasta_and_tag_with_colors('/w/2013-lamprey/test.fp')
 
@@ -38,4 +70,4 @@
     colors = ht.sweep_sequence_for_colors(record.sequence, False,  False)
     if colors:
         print colors
-
+'''

From bc767df1c3eaaa4f89558782ba2f9b6ebafbd1c5 Mon Sep 17 00:00:00 2001
From: CS <cs.welcher@gmail.com>
Date: Thu, 26 Sep 2013 02:58:53 -0400
Subject: [PATCH 033/140] added bunch more tests, TODO fix n_colors

---
 tests/test_hashbits.py | 52 +++++++++++++++++++++++++++++++++++++-----
 1 file changed, 46 insertions(+), 6 deletions(-)

diff --git a/tests/test_hashbits.py b/tests/test_hashbits.py
index 6e8e92fa71..624de8c611 100644
--- a/tests/test_hashbits.py
+++ b/tests/test_hashbits.py
@@ -501,16 +501,56 @@ def test_simple_median():
     assert average == 1.0
     assert stddev == 0.0
 
-def test_consume_fasta_and_tag_with_colors():
+def test_get_all_tags():
+    hb = khmer.new_hashbits(20, 1e7, 4)
+    filename = utils.get_test_data('single-read.fq')
+    hb.consume_fasta_and_tag(filename)
+    
+    tags = hb.get_all_tags('CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT')
+    assert len(tags) == 1
+    assert tags.pop() == 173473779682L
+
+def test_get_tag_colors():
     hb = khmer.new_hashbits(20, 1e7, 4)
+    filename = utils.get_test_data('single-read.fq')
+    hb.consume_fasta_and_tag_with_colors(filename)
+    tag = 173473779682L
+
+    colors = hb.get_tag_colors(tag)
+    assert len(colors) == 1
+    assert colors.pop() == 0L
+
+def test_sweep_sequence_for_colors():
+    hb = khmer.new_hashbits(20, 1e7, 4)
+    filename = utils.get_test_data('single-read.fq')
+    hb.consume_fasta_and_tag_with_colors(filename)
     
+    colors = hb.sweep_sequence_for_colors('CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT')
+    assert len(colors) == 1
+    assert colors.pop() == 0L
+
+def test_consume_fasta_and_tag_with_colors():
+    hb = khmer.new_hashbits(20, 1e7, 4)
+    read_1 = 'ACGTAACCGGTTAAACCCGGGTTTAAAACCCCGGGGTTTT'
     filename = utils.get_test_data('test-transcript.fa')
+
     total_reads, n_consumed = hb.consume_fasta_and_tag_with_colors(filename)
     
-    #assert n_consumed == 3
+    assert hb.get(read_1[:20])
     assert total_reads == 3
+    #assert hb.n_colors() == 3
     
-    assert hb.n_colors() == 3
-    
-    
-    
+ 
+def test_consume_partitioned_fasta_and_tag_with_colors():
+    hb = khmer.new_hashbits(20, 1e7, 4)
+    filename = utils.get_test_data('real-partition-small.fa')
+
+    total_reads, n_consumed = hb.consume_partitioned_fasta_and_tag_with_colors(filename)
+    #assert hb.n_colors() == 1
+    colors = set()
+    for record in screed.open(filename):
+        seq = record.sequence
+        colors.update(hb.sweep_sequence_for_colors(seq, False, False))
+    assert len(colors) == 1
+    assert colors.pop() == 2L
+    #assert hb.n_colors() == 1   

From d29061c99e35da2439cf04a261b12723532fc9d4 Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Thu, 26 Sep 2013 13:34:49 -0400
Subject: [PATCH 034/140] added a sweep-reads script to scripts, TODO add tests

---
 scripts/sweep-reads-by-partition.py | 134 ++++++++++++++++++++++++++++
 1 file changed, 134 insertions(+)
 create mode 100755 scripts/sweep-reads-by-partition.py

diff --git a/scripts/sweep-reads-by-partition.py b/scripts/sweep-reads-by-partition.py
new file mode 100755
index 0000000000..f2a59f73a4
--- /dev/null
+++ b/scripts/sweep-reads-by-partition.py
@@ -0,0 +1,134 @@
+#! /usr/bin/env python
+#
+# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# the three-clause BSD license; see doc/LICENSE.txt. Contact: ctb@msu.edu
+#
+"""
+Tag and color the given partitioned fasta, then find all reads in the neighborhood
+of each partition and output to a file
+
+% python scripts/sweep-reads-by-partition.py [ -p <partitions/file> ] -i <fastp> <reads1> <reads2> ...
+
+Use '-h' for parameter help.
+"""
+
+import khmer
+import screed
+import sys
+from khmer.counting_args import build_construct_args, DEFAULT_MIN_HASHSIZE
+
+DEFAULT_PPF = 1
+
+def write_read(fp, seq, name, color):
+    """Write one read to fp as FASTA with header '>name<TAB>color'."""
+    fp.write('>{name}\t{color}\n{seq}\n'.format(seq=seq, name=name, color=color))
+
+def main():
+    """Color the partitions in the -i file, then bin reads by swept color."""
+    parser = build_construct_args()
+    # type=int: a value given on the command line would otherwise stay a
+    # string and never equal len(cur_colors) in the rollover check below.
+    parser.add_argument('-p', '--partitions_per_file', type=int,
+                        dest='partitions_per_file', default=DEFAULT_PPF)
+    parser.add_argument('-i', '--input_fastp', dest='input_fastp',
+                        required=True)
+    parser.add_argument('input_reads', nargs='+')
+    args = parser.parse_args()
+    
+    if not args.quiet:
+        if args.min_hashsize == DEFAULT_MIN_HASHSIZE:
+            print >>sys.stderr, \
+                "** WARNING: hashsize is default!  " \
+                "You absodefly want to increase this!\n** " \
+                "Please read the docs!"
+
+        print >>sys.stderr, '\nPARAMETERS:'
+        print >>sys.stderr, \
+            ' - kmer size =    {ksize:d} \t\t(-k)'.format(ksize=args.ksize)
+        print >>sys.stderr, \
+            ' - n hashes =     {nhash:d} \t\t(-N)'.format(nhash=args.n_hashes)
+        print >>sys.stderr, \
+            ' - min hashsize = {mh:-5.2g} \t(-x)'.format(mh=args.min_hashsize)
+        print >>sys.stderr, ''
+        print >>sys.stderr, \
+            'Estimated memory usage is {prod:.2g} bytes ' \
+            '(n_hashes x min_hashsize)'.format(prod=args.n_hashes*args.min_hashsize)
+        print >>sys.stderr, '-' * 8
+    
+    ppf = args.partitions_per_file
+    
+    ht = khmer.new_hashbits(args.ksize, args.min_hashsize, args.n_hashes)
+    ht.consume_partitioned_fasta_and_tag_with_colors(args.input_fastp)
+    
+    cur_colors = []         # colors assigned to the output file being filled
+    color_to_fp_dict = {}   # color -> open output file; ppf colors share one
+    cur_fp = None
+    
+    color_number_dist = []  # number of colors found for each swept read
+    
+    n_reads = 0
+    n_orphaned = 0
+    n_colored = 0
+    n_mcolored = 0
+    n_files = 0
+    try:
+        for read_file in args.input_reads:
+            print >>sys.stderr,'** sweeping {read_file} for colors...'.format(read_file=read_file)
+            
+            for n, record in enumerate(screed.open(read_file)):
+                if n % 10000 == 0:
+                    print >>sys.stderr, '\tswept {n} reads [{nc} colored, {no} orphaned]' \
+                                        .format(n=n, nc=n_colored, no=n_orphaned)
+                n_reads += 1
+                seq = record.sequence
+                name = record.name
+                
+                colors = ht.sweep_sequence_for_colors(seq, False, False)
+                color_number_dist.append(len(colors))
+                if colors:
+                    n_colored += 1
+                    if len(colors) > 1:
+                        n_mcolored += 1
+                    for color in colors:
+                        # do we have a file for this color already? use it!
+                        if color in color_to_fp_dict:
+                            fp = color_to_fp_dict[color]
+                            write_read(fp, seq, name, color)
+                        # no file yet? make a new one
+                        else:
+                            if len(cur_colors) == 0:
+                                cur_fp = open('colored_reads_{fn}.fa'.format(fn=n_files),
+                                              'wb')
+                                n_files += 1
+
+                            color_to_fp_dict[color] = cur_fp
+                            cur_colors.append(color)
+                            write_read(cur_fp, seq, name, color)
+                            
+                            if len(cur_colors) == ppf:
+                                cur_colors = []
+                else:
+                    n_orphaned += 1
+
+    except IOError as e:
+        print >>sys.stderr, 'ERROR:', e
+        print >>sys.stderr, '** exiting...'
+    finally:
+        # several colors can share one file, so close each file only once
+        for fp in set(color_to_fp_dict.values()):
+            fp.close()
+
+    print >>sys.stderr, 'swept {n_reads} for colors...'.format(n_reads=n_reads)
+    print >>sys.stderr, '...with {nc} colored and {no} orphaned'.format(
+                                    nc=n_colored, no=n_orphaned)
+    print >>sys.stderr, '...and {nmc} multicolored'.format(nmc=n_mcolored)
+    print >>sys.stderr, '...to {nf} files'.format(nf=n_files)
+    
+    print >>sys.stderr, '** outputting color number distribution...'
+    with open('color_dist.txt', 'wb') as outfp:
+        for nc in color_number_dist:
+            outfp.write('{nc}\n'.format(nc=nc))
+    
+if __name__ == '__main__':
+    main()

From feb0f3988c08cb462a03ed15407089a0ad88afe1 Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Fri, 27 Sep 2013 14:49:01 -0400
Subject: [PATCH 035/140] added test for correctness of color tagging and
 traversal

---
 tests/test_hashbits.py | 46 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/tests/test_hashbits.py b/tests/test_hashbits.py
index 624de8c611..b38112c4df 100644
--- a/tests/test_hashbits.py
+++ b/tests/test_hashbits.py
@@ -501,6 +501,11 @@ def test_simple_median():
     assert average == 1.0
     assert stddev == 0.0
 
+#
+# @cswelcher TODO: more tests! 
+#  * thread-safety
+#  * n_colors -- make sure to use test-data with multi-colored tags
+
 def test_get_all_tags():
     hb = khmer.new_hashbits(20, 1e7, 4)
     filename = utils.get_test_data('single-read.fq')
@@ -539,7 +544,48 @@ def test_consume_fasta_and_tag_with_colors():
     assert hb.get(read_1[:20])
     assert total_reads == 3
     #assert hb.n_colors() == 3
+
+'''
+* The test data set has four reads: A, B, C, and D
+* Overlaps are A <-> B <-> C, with D on its own
+* Thus, traversing from A should find colors from A and B,
+  traversing from B should find colors from A, B, and C,
+  and traversing from C should find colors from B and C
+'''
+def test_color_tag_correctness():
+    hb = khmer.new_hashbits(20, 1e7, 4)
+    filename = utils.get_test_data('test-colors.fa')
+    hb.consume_fasta_and_tag_with_colors(filename)
     
+    # read A
+    colors = hb.sweep_sequence_for_colors('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG')
+    
+    print colors
+    assert len(colors) == 2
+    assert 0L in colors
+    assert 1L in colors
+    
+    # read B
+    colors = hb.sweep_sequence_for_colors('GCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA')
+    print colors
+    assert len(colors) == 3
+    assert 0L in colors
+    assert 1L in colors
+    assert 2L in colors
+    
+    # read C
+    colors = hb.sweep_sequence_for_colors('TGTTGGGGATAGATAGATAGATGAGTGTAGATCCAACAACACATACA')
+    print colors
+    assert len(colors) == 2
+    assert 1L in colors
+    assert 2L in colors
+    
+    # read D
+    colors = hb.sweep_sequence_for_colors('TATATATATAGCTAGCTAGCTAACTAGCTAGCATCGATCGATCGATC')
+    print colors
+    assert len(colors) == 1
+    assert 3L in colors
+        
  
 def test_consume_partitioned_fasta_and_tag_with_colors():
     hb = khmer.new_hashbits(20, 1e7, 4)

From 01f233d71693628b48eb6691802aa49703a6c14b Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Fri, 27 Sep 2013 16:27:42 -0400
Subject: [PATCH 036/140] started traversal optimization

---
 lib/hashtable.cc |  12 ++--
 lib/subset.cc    | 146 +++++++++++++++++++++++++++++++++++++++++++++++
 lib/subset.hh    |   8 +++
 3 files changed, 162 insertions(+), 4 deletions(-)

diff --git a/lib/hashtable.cc b/lib/hashtable.cc
index 0d729864eb..a5d8264213 100644
--- a/lib/hashtable.cc
+++ b/lib/hashtable.cc
@@ -2232,15 +2232,19 @@ void Hashtable::sweep_sequence_for_colors(const std::string& seq,
     
     KMerIterator kmers(seq.c_str(), _ksize);
     std::string kmer_s;
-
+    // keep a list of kmers which have already been traversed
+    SeenSet traversed_kmers;
     while (!kmers.done()) {
       kmer = kmers.next();
       kmer_s = _revhash(kmer, _ksize);
       _hash(kmer_s.c_str(), _ksize, kmer_f, kmer_r);
       
-      partition->find_all_tags(kmer_f, kmer_r, tagged_kmers, all_tags,
-          break_on_stoptags, stop_big_traversals);
-      traverse_colors_and_resolve(tagged_kmers, found_colors);
+      // don't even try traversing from k-mers not in the hashtable
+      if (get_count(uniqify_rc(kmer_f,kmer_r))) {
+        partition->find_all_tags(kmer_f, kmer_r, tagged_kmers,
+          traversed_kmers, all_tags, break_on_stoptags, stop_big_traversals);
+        traverse_colors_and_resolve(tagged_kmers, found_colors);
+      }
     }
 }
 
diff --git a/lib/subset.cc b/lib/subset.cc
index 47fbe53fe2..8c5c18d89e 100644
--- a/lib/subset.cc
+++ b/lib/subset.cc
@@ -446,6 +446,152 @@ void SubsetPartition::find_all_tags(HashIntoType kmer_f,
   }
 }
 
+// Same as find_all_tags, but keep track of traversed k-mers 
+//
+void SubsetPartition::find_all_tags(HashIntoType kmer_f,
+				    HashIntoType kmer_r,
+				    SeenSet& tagged_kmers,
+				    SeenSet& traversed_kmers,
+				    const SeenSet& all_tags,
+				    bool break_on_stop_tags,
+				    bool stop_big_traversals)
+{
+  const HashIntoType bitmask = _ht->bitmask;
+
+  HashIntoType f, r;
+  bool first = true;
+  NodeQueue node_q;
+  std::queue<unsigned int> breadth_q;
+  unsigned int cur_breadth = 0;
+  unsigned int breadth = 0;
+  const unsigned int max_breadth = (2 * _ht->_tag_density) + 1;
+
+  const unsigned int rc_left_shift = _ht->ksize()*2 - 2;
+  unsigned int total = 0;
+
+  // start breadth-first search.
+
+  node_q.push(kmer_f);
+  node_q.push(kmer_r);
+  breadth_q.push(0);
+
+  while(!node_q.empty()) {
+    if (stop_big_traversals && traversed_kmers.size() > BIG_TRAVERSALS_ARE) {
+      tagged_kmers.clear();
+      break;
+    }
+
+    kmer_f = node_q.front();
+    node_q.pop();
+    kmer_r = node_q.front();
+    node_q.pop();
+    breadth = breadth_q.front();
+    breadth_q.pop();
+
+    HashIntoType kmer = uniqify_rc(kmer_f, kmer_r);
+
+    // Have we already seen this k-mer?  If so, skip.
+    if (set_contains(traversed_kmers, kmer)) {
+      continue;
+    }
+
+    // Do we want to traverse through this k-mer?  If not, skip.
+    if (break_on_stop_tags && set_contains(_ht->stop_tags, kmer)) {
+      // @CTB optimize by inserting into traversed_kmers set?
+      continue;
+    }
+
+    // keep track of seen kmers
+    traversed_kmers.insert(kmer);
+    total++;
+
+    // Is this a kmer-to-tag, and have we put this tag in a partition already?
+    // Search no further in this direction.  (This is where we connect
+    // partitions.)
+    if (!first && set_contains(all_tags, kmer)) {
+      tagged_kmers.insert(kmer);
+      continue;
+    }
+
+    assert(breadth >= cur_breadth); // keep track of watermark, for debugging.
+    if (breadth > cur_breadth) { cur_breadth = breadth; }
+
+    if (breadth >= max_breadth) { continue; } // truncate search @CTB exit?
+
+    //
+    // Enqueue next set of nodes.
+    //
+
+    // NEXT
+    f = next_f(kmer_f, 'A');
+    r = next_r(kmer_r, 'A');
+    if (_ht->get_count(uniqify_rc(f,r)) &&
+	!set_contains(traversed_kmers, uniqify_rc(f,r))) {
+      node_q.push(f); node_q.push(r);
+      breadth_q.push(breadth + 1);
+    }
+
+    f = next_f(kmer_f, 'C');
+    r = next_r(kmer_r, 'C');
+    if (_ht->get_count(uniqify_rc(f,r)) && 
+        !set_contains(traversed_kmers, uniqify_rc(f,r))) {
+      node_q.push(f); node_q.push(r);
+      breadth_q.push(breadth + 1);
+    }
+
+    f = next_f(kmer_f, 'G');
+    r = next_r(kmer_r, 'G');
+    if (_ht->get_count(uniqify_rc(f,r)) && 
+        !set_contains(traversed_kmers, uniqify_rc(f,r))) {
+      node_q.push(f); node_q.push(r);
+      breadth_q.push(breadth + 1);
+    }
+
+    f = next_f(kmer_f, 'T');
+    r = next_r(kmer_r, 'T');
+    if (_ht->get_count(uniqify_rc(f,r)) && 
+        !set_contains(traversed_kmers, uniqify_rc(f,r))) {
+      node_q.push(f); node_q.push(r);
+      breadth_q.push(breadth + 1);
+    }
+
+    // PREVIOUS.
+    r = prev_r(kmer_r, 'A');
+    f = prev_f(kmer_f, 'A');
+    if (_ht->get_count(uniqify_rc(f,r)) && 
+        !set_contains(traversed_kmers, uniqify_rc(f,r))) {
+      node_q.push(f); node_q.push(r);
+      breadth_q.push(breadth + 1);
+    }
+
+    r = prev_r(kmer_r, 'C');
+    f = prev_f(kmer_f, 'C');
+    if (_ht->get_count(uniqify_rc(f,r)) && 
+        !set_contains(traversed_kmers, uniqify_rc(f,r))) {
+      node_q.push(f); node_q.push(r);
+      breadth_q.push(breadth + 1);
+    }
+    
+    r = prev_r(kmer_r, 'G');
+    f = prev_f(kmer_f, 'G');
+    if (_ht->get_count(uniqify_rc(f,r)) && 
+        !set_contains(traversed_kmers, uniqify_rc(f,r))) {
+      node_q.push(f); node_q.push(r);
+      breadth_q.push(breadth + 1);
+    }
+
+    r = prev_r(kmer_r, 'T');
+    f = prev_f(kmer_f, 'T');
+    if (_ht->get_count(uniqify_rc(f,r)) && 
+        !set_contains(traversed_kmers, uniqify_rc(f,r))) {
+      node_q.push(f); node_q.push(r);
+      breadth_q.push(breadth + 1);
+    }
+
+    first = false;
+  }
+}
+
 // find_all_tags: the core of the partitioning code.  finds all tagged k-mers
 //    connected to kmer_f/kmer_r in the graph.
 
diff --git a/lib/subset.hh b/lib/subset.hh
index cab01273b9..282a9a6ac9 100644
--- a/lib/subset.hh
+++ b/lib/subset.hh
@@ -69,6 +69,14 @@ namespace khmer {
 		       bool break_on_stop_tags=false,
 		       bool stop_big_traversals=false);
 
+    void find_all_tags(HashIntoType kmer_f,
+				    HashIntoType kmer_r,
+				    SeenSet& tagged_kmers,
+				    SeenSet& traversed_kmers,
+				    const SeenSet& all_tags,
+				    bool break_on_stop_tags,
+				    bool stop_big_traversals);
+				    
     void find_all_tags_truncate_on_abundance(HashIntoType kmer_f,
 					     HashIntoType kmer_r,
 					     SeenSet& tagged_kmers,

From 1b97d8cf50c5e208cee72b2029bd8f0e303e661c Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Mon, 30 Sep 2013 16:26:18 -0400
Subject: [PATCH 037/140] fixed color allocation error; added persistent color
 to color pointer map and associated allocation function run checks for all
 newly allocated colors

---
 lib/hashtable.cc       | 13 ++++---------
 lib/hashtable.hh       | 16 +++++++++++++++-
 python/_khmermodule.cc | 26 +++++++++++++++++++++++++-
 3 files changed, 44 insertions(+), 11 deletions(-)

diff --git a/lib/hashtable.cc b/lib/hashtable.cc
index a5d8264213..22a0d6d640 100644
--- a/lib/hashtable.cc
+++ b/lib/hashtable.cc
@@ -2003,7 +2003,7 @@ Hashtable::consume_fasta_and_tag_with_colors(
     
     Color _tag_color = 0;
 
-    Color * the_color = new Color(_tag_color);
+    Color * the_color = check_and_allocate_color(_tag_color);
     // Iterate through the reads and consume their k-mers.
     while (!parser->is_complete( ))
     {
@@ -2018,7 +2018,7 @@ Hashtable::consume_fasta_and_tag_with_colors(
 					      this_n_consumed,
 					      *the_color );
 	    _tag_color++;
-        the_color = new Color(_tag_color);
+        the_color = check_and_allocate_color(_tag_color);
 
   #ifdef WITH_INTERNAL_METRICS
         hasher.pmetrics.start_timers( );
@@ -2085,7 +2085,6 @@ void Hashtable::consume_partitioned_fasta_and_tag_with_colors(const std::string
   //
   // iterate through the FASTA file & consume the reads.
   //
-  ColorPtrMap colors;
   Color * c;
   PartitionID p;
   while(!parser->is_complete())  {
@@ -2095,11 +2094,7 @@ void Hashtable::consume_partitioned_fasta_and_tag_with_colors(const std::string
     if (check_and_normalize_read(seq)) {
       // First, figure out what the partition is (if non-zero), and save that.
       p = _parse_partition_id(read.name);
-      if (colors.count(p)) {
-	c = colors[p];
-      } else {
-	c = new Color(p);
-      }
+      c = check_and_allocate_color(p);
 
       consume_sequence_and_tag_with_colors( seq,
 					      n_consumed,
@@ -2123,7 +2118,6 @@ void Hashtable::consume_partitioned_fasta_and_tag_with_colors(const std::string
 
   // @cswelcher TODO: check that deallocate ColorPtrMap is correct
   delete parser;
-  colors.clear();
 }
 
 // @cswelcher: double-check -- is it valid to pull the address from a reference?
@@ -2135,6 +2129,7 @@ void Hashtable::link_tag_and_color(HashIntoType& kmer, Color& kmer_color) {
 /* This is essentially the same code as above, only it assigns colors to the
  * tags through multimap TagColorMap defined in hashtable.hh, declared in
  * hashbits.hh
+ * @cswelcher TODO: should I instead send in the pointer to the new color?
  */
 void Hashtable::consume_sequence_and_tag_with_colors(const std::string& seq,
 					unsigned long long& n_consumed,
diff --git a/lib/hashtable.hh b/lib/hashtable.hh
index c662cc0f5b..9c100008e3 100644
--- a/lib/hashtable.hh
+++ b/lib/hashtable.hh
@@ -233,6 +233,7 @@ namespace khmer {
         return num_tags;
     }
     
+    
     Hashtable(
 	WordLength	ksize,
 	uint32_t const	number_of_threads   = 
@@ -253,6 +254,7 @@ namespace khmer {
       _init_bitstuff();
       _all_tags_spin_lock = 0;
       _tag_colors_spin_lock = 0;
+      
     }
 
     virtual ~Hashtable( )
@@ -373,6 +375,7 @@ namespace khmer {
     SeenSet repart_small_tags;
     TagColorPtrMap tag_colors;
     ColorTagPtrMap color_tag_ptrs;
+    ColorPtrMap color_ptrs;
 
     // accessor to get 'k'
     const WordLength ksize() const { return _ksize; }
@@ -482,7 +485,18 @@ namespace khmer {
 	CallbackFn	    callback	    = NULL,
 	void *		    callback_data   = NULL
     );
-
+    
+    Color * check_and_allocate_color(Color new_color) {
+        Color * c;
+        if (color_ptrs.count(new_color)) {
+            c = color_ptrs[new_color];
+        } else {
+            c = new Color(new_color);
+            color_ptrs[*c] = c;
+        }
+        return c;
+    }
+    
     void consume_sequence_and_tag(const std::string& seq,
 				  unsigned long long& n_consumed,
 				  SeenSet * new_tags = 0);
diff --git a/python/_khmermodule.cc b/python/_khmermodule.cc
index 975a045cb5..10be84a59f 100644
--- a/python/_khmermodule.cc
+++ b/python/_khmermodule.cc
@@ -3878,6 +3878,29 @@ static PyObject * hashbits_consume_partitioned_fasta_and_tag_with_colors(
   return Py_BuildValue("iL", total_reads, n_consumed);
 }
 
+static PyObject * hashbits_consume_sequence_and_tag_with_colors(PyObject * self, PyObject * args) {
+  khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
+  khmer::Hashbits * hb = me->hashbits;
+  
+  char * seq = NULL;
+  unsigned long long c;
+  if (!PyArg_ParseTuple(args, "sK", &seq, &c)) {
+    return NULL;
+  }
+  
+  unsigned long long n_consumed = 0;
+  khmer::Color * the_color = new Color(c);
+
+  try { 
+  //if (hb->check_and_normalize_read(seq)) {
+    hb->consume_sequence_and_tag_with_colors(seq, n_consumed, *the_color);
+  //}
+  } catch (_khmer_signal &e) {
+    return NULL;
+  }
+  return Py_BuildValue("L", n_consumed);
+}
+
 static PyObject * hashbits_sweep_sequence_for_colors(PyObject * self, PyObject * args) {
   khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
   khmer::Hashbits * hb = me->hashbits;
@@ -3923,7 +3946,7 @@ static PyObject * hashbits_sweep_sequence_for_colors(PyObject * self, PyObject *
   khmer::ColorPtrSet::const_iterator si;
   unsigned long long i = 0;
   for (si=found_colors.begin(); si!=found_colors.end(); ++si) {
-    PyList_SET_ITEM(x, i, Py_BuildValue("K", *(*si)));
+    PyList_SET_ITEM(x, i, Py_BuildValue("K", *si));
     i++;
   }
   
@@ -4105,6 +4128,7 @@ static PyMethodDef khmer_hashbits_methods[] = {
   {"consume_partitioned_fasta_and_tag_with_colors", hashbits_consume_partitioned_fasta_and_tag_with_colors, METH_VARARGS, "" },
   {"get_all_tags", hashbits_get_all_tags, METH_VARARGS, "" },
   {"get_tag_colors", hashbits_get_tag_colors, METH_VARARGS, ""},
+  {"consume_sequence_and_tag_with_colors", hashbits_consume_sequence_and_tag_with_colors, METH_VARARGS, "" },
   {"n_colors", hashbits_n_colors, METH_VARARGS, ""},
  
   {NULL, NULL, 0, NULL}           /* sentinel */

From 2113c23cea8805593d3de49b152e2323359c1a0f Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Mon, 30 Sep 2013 16:29:00 -0400
Subject: [PATCH 038/140] changed back deref on color sweep after bugfixing,
 switched python exposed consume_sequence_and_tag_with_colors to use new color
 allocation function

---
 python/_khmermodule.cc | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/python/_khmermodule.cc b/python/_khmermodule.cc
index 10be84a59f..dcafd2a35f 100644
--- a/python/_khmermodule.cc
+++ b/python/_khmermodule.cc
@@ -3889,10 +3889,11 @@ static PyObject * hashbits_consume_sequence_and_tag_with_colors(PyObject * self,
   }
   
   unsigned long long n_consumed = 0;
-  khmer::Color * the_color = new Color(c);
+  khmer::Color * the_color = hb->check_and_allocate_color(c);
 
   try { 
   //if (hb->check_and_normalize_read(seq)) {
+    
     hb->consume_sequence_and_tag_with_colors(seq, n_consumed, *the_color);
   //}
   } catch (_khmer_signal &e) {
@@ -3946,7 +3947,7 @@ static PyObject * hashbits_sweep_sequence_for_colors(PyObject * self, PyObject *
   khmer::ColorPtrSet::const_iterator si;
   unsigned long long i = 0;
   for (si=found_colors.begin(); si!=found_colors.end(); ++si) {
-    PyList_SET_ITEM(x, i, Py_BuildValue("K", *si));
+    PyList_SET_ITEM(x, i, Py_BuildValue("K", *(*si)));
     i++;
   }
   

From 072adfd9e267f2c0151655fd83f8a2239e56d049 Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Mon, 30 Sep 2013 17:46:49 -0400
Subject: [PATCH 039/140] working on new traversal code, broke off neighbor
 finding to its own function

---
 lib/hashtable.cc       |   3 +-
 lib/hashtable.hh       |   4 +-
 lib/subset.cc          | 162 +++++++++++++++++++++--------------------
 python/_khmermodule.cc |  14 ++--
 4 files changed, 97 insertions(+), 86 deletions(-)

diff --git a/lib/hashtable.cc b/lib/hashtable.cc
index 22a0d6d640..72d332109f 100644
--- a/lib/hashtable.cc
+++ b/lib/hashtable.cc
@@ -2215,7 +2215,7 @@ void Hashtable::consume_sequence_and_tag_with_colors(const std::string& seq,
  * For now, check /every/ k-mer with find_all_tags
  * THIS SUCKS AND IT'S YOUR FAULT @CTB
  */
-void Hashtable::sweep_sequence_for_colors(const std::string& seq,
+unsigned int Hashtable::sweep_sequence_for_colors(const std::string& seq,
 					ColorPtrSet& found_colors,
 					bool break_on_stoptags,
 					bool stop_big_traversals) {
@@ -2241,6 +2241,7 @@ void Hashtable::sweep_sequence_for_colors(const std::string& seq,
         traverse_colors_and_resolve(tagged_kmers, found_colors);
       }
     }
+    return traversed_kmers.size();
 }
 
 ColorPtrSet Hashtable::get_tag_colors(const HashIntoType& tag) {
diff --git a/lib/hashtable.hh b/lib/hashtable.hh
index 9c100008e3..c0c25584e3 100644
--- a/lib/hashtable.hh
+++ b/lib/hashtable.hh
@@ -455,7 +455,7 @@ namespace khmer {
     // Partitioning stuff.
 
     unsigned int n_tags() const { return all_tags.size(); }
-    unsigned int n_colors() const { return tag_colors.size(); }
+    unsigned int n_colors() const { return color_ptrs.size(); }
 
     void divide_tags_into_subsets(unsigned int subset_size, SeenSet& divvy);
 
@@ -538,7 +538,7 @@ namespace khmer {
 
     void link_tag_and_color(HashIntoType& kmer, Color& color);
     
-    void sweep_sequence_for_colors(const std::string& seq,
+    unsigned int sweep_sequence_for_colors(const std::string& seq,
 					ColorPtrSet& found_colors,
 					bool break_on_stoptags,
 					bool stop_big_traversals);
diff --git a/lib/subset.cc b/lib/subset.cc
index 8c5c18d89e..e379b2b5ee 100644
--- a/lib/subset.cc
+++ b/lib/subset.cc
@@ -296,6 +296,80 @@ unsigned int SubsetPartition::find_unpart(const std::string infilename,
   return n_singletons;
 }
 
+/* @cswelcher Brilliant idea: let's *not* copy this same piece of code
+ * over and over again!
+ */
+void SubsetPartition::queue_neighbors(HashIntoType kmer_f,
+                                    HashIntoType kmer_r,
+                                    NodeQueue& node_q,
+                                    std::queue<unsigned int> breadth_q) {
+    
+    f = next_f(kmer_f, 'A');
+    r = next_r(kmer_r, 'A');
+    if (_ht->get_count(uniqify_rc(f,r)) &&
+	!set_contains(keeper, uniqify_rc(f,r))) {
+      node_q.push(f); node_q.push(r);
+      breadth_q.push(breadth + 1);
+    }
+
+    f = next_f(kmer_f, 'C');
+    r = next_r(kmer_r, 'C');
+    if (_ht->get_count(uniqify_rc(f,r)) && 
+        !set_contains(keeper, uniqify_rc(f,r))) {
+      node_q.push(f); node_q.push(r);
+      breadth_q.push(breadth + 1);
+    }
+
+    f = next_f(kmer_f, 'G');
+    r = next_r(kmer_r, 'G');
+    if (_ht->get_count(uniqify_rc(f,r)) && 
+        !set_contains(keeper, uniqify_rc(f,r))) {
+      node_q.push(f); node_q.push(r);
+      breadth_q.push(breadth + 1);
+    }
+
+    f = next_f(kmer_f, 'T');
+    r = next_r(kmer_r, 'T');
+    if (_ht->get_count(uniqify_rc(f,r)) && 
+        !set_contains(keeper, uniqify_rc(f,r))) {
+      node_q.push(f); node_q.push(r);
+      breadth_q.push(breadth + 1);
+    }
+
+    // PREVIOUS.
+    r = prev_r(kmer_r, 'A');
+    f = prev_f(kmer_f, 'A');
+    if (_ht->get_count(uniqify_rc(f,r)) && 
+        !set_contains(keeper, uniqify_rc(f,r))) {
+      node_q.push(f); node_q.push(r);
+      breadth_q.push(breadth + 1);
+    }
+
+    r = prev_r(kmer_r, 'C');
+    f = prev_f(kmer_f, 'C');
+    if (_ht->get_count(uniqify_rc(f,r)) && 
+        !set_contains(keeper, uniqify_rc(f,r))) {
+      node_q.push(f); node_q.push(r);
+      breadth_q.push(breadth + 1);
+    }
+    
+    r = prev_r(kmer_r, 'G');
+    f = prev_f(kmer_f, 'G');
+    if (_ht->get_count(uniqify_rc(f,r)) && 
+        !set_contains(keeper, uniqify_rc(f,r))) {
+      node_q.push(f); node_q.push(r);
+      breadth_q.push(breadth + 1);
+    }
+
+    r = prev_r(kmer_r, 'T');
+    f = prev_f(kmer_f, 'T');
+    if (_ht->get_count(uniqify_rc(f,r)) && 
+        !set_contains(keeper, uniqify_rc(f,r))) {
+      node_q.push(f); node_q.push(r);
+      breadth_q.push(breadth + 1);
+    }
+}
+
 ///
 
 // find_all_tags: the core of the partitioning code.  finds all tagged k-mers
@@ -345,6 +419,7 @@ void SubsetPartition::find_all_tags(HashIntoType kmer_f,
     HashIntoType kmer = uniqify_rc(kmer_f, kmer_r);
 
     // Have we already seen this k-mer?  If so, skip.
+    // @cswelcher this is redundant, as we already check before queuing
     if (set_contains(keeper, kmer)) {
       continue;
     }
@@ -446,12 +521,13 @@ void SubsetPartition::find_all_tags(HashIntoType kmer_f,
   }
 }
 
-// Same as find_all_tags, but keep track of traversed k-mers 
-//
-void SubsetPartition::find_all_tags(HashIntoType kmer_f,
+
+
+// Perform a breadth-first search starting from the k-mers in the given sequence
+void SubsetPartition::sweep_for_tags(
+                    HashIntoType kmer_f,
 				    HashIntoType kmer_r,
 				    SeenSet& tagged_kmers,
-				    SeenSet& traversed_kmers,
 				    const SeenSet& all_tags,
 				    bool break_on_stop_tags,
 				    bool stop_big_traversals)
@@ -491,9 +567,10 @@ void SubsetPartition::find_all_tags(HashIntoType kmer_f,
     HashIntoType kmer = uniqify_rc(kmer_f, kmer_r);
 
     // Have we already seen this k-mer?  If so, skip.
-    if (set_contains(traversed_kmers, kmer)) {
-      continue;
-    }
+    // @cswelcher we already check before queuing
+    //if (set_contains(traversed_kmers, kmer)) {
+    // continue;
+    //}
 
     // Do we want to traverse through this k-mer?  If not, skip.
     if (break_on_stop_tags && set_contains(_ht->stop_tags, kmer)) {
@@ -518,76 +595,7 @@ void SubsetPartition::find_all_tags(HashIntoType kmer_f,
 
     if (breadth >= max_breadth) { continue; } // truncate search @CTB exit?
 
-    //
-    // Enqueue next set of nodes.
-    //
-
-    // NEXT
-    f = next_f(kmer_f, 'A');
-    r = next_r(kmer_r, 'A');
-    if (_ht->get_count(uniqify_rc(f,r)) &&
-	!set_contains(traversed_kmers, uniqify_rc(f,r))) {
-      node_q.push(f); node_q.push(r);
-      breadth_q.push(breadth + 1);
-    }
-
-    f = next_f(kmer_f, 'C');
-    r = next_r(kmer_r, 'C');
-    if (_ht->get_count(uniqify_rc(f,r)) && 
-        !set_contains(traversed_kmers, uniqify_rc(f,r))) {
-      node_q.push(f); node_q.push(r);
-      breadth_q.push(breadth + 1);
-    }
-
-    f = next_f(kmer_f, 'G');
-    r = next_r(kmer_r, 'G');
-    if (_ht->get_count(uniqify_rc(f,r)) && 
-        !set_contains(traversed_kmers, uniqify_rc(f,r))) {
-      node_q.push(f); node_q.push(r);
-      breadth_q.push(breadth + 1);
-    }
-
-    f = next_f(kmer_f, 'T');
-    r = next_r(kmer_r, 'T');
-    if (_ht->get_count(uniqify_rc(f,r)) && 
-        !set_contains(traversed_kmers, uniqify_rc(f,r))) {
-      node_q.push(f); node_q.push(r);
-      breadth_q.push(breadth + 1);
-    }
-
-    // PREVIOUS.
-    r = prev_r(kmer_r, 'A');
-    f = prev_f(kmer_f, 'A');
-    if (_ht->get_count(uniqify_rc(f,r)) && 
-        !set_contains(traversed_kmers, uniqify_rc(f,r))) {
-      node_q.push(f); node_q.push(r);
-      breadth_q.push(breadth + 1);
-    }
-
-    r = prev_r(kmer_r, 'C');
-    f = prev_f(kmer_f, 'C');
-    if (_ht->get_count(uniqify_rc(f,r)) && 
-        !set_contains(traversed_kmers, uniqify_rc(f,r))) {
-      node_q.push(f); node_q.push(r);
-      breadth_q.push(breadth + 1);
-    }
-    
-    r = prev_r(kmer_r, 'G');
-    f = prev_f(kmer_f, 'G');
-    if (_ht->get_count(uniqify_rc(f,r)) && 
-        !set_contains(traversed_kmers, uniqify_rc(f,r))) {
-      node_q.push(f); node_q.push(r);
-      breadth_q.push(breadth + 1);
-    }
-
-    r = prev_r(kmer_r, 'T');
-    f = prev_f(kmer_f, 'T');
-    if (_ht->get_count(uniqify_rc(f,r)) && 
-        !set_contains(traversed_kmers, uniqify_rc(f,r))) {
-      node_q.push(f); node_q.push(r);
-      breadth_q.push(breadth + 1);
-    }
-
+    queue_neighbors(kmer_f, kmer_r, node_q, breadth_q);    
     first = false;
   }
 }
diff --git a/python/_khmermodule.cc b/python/_khmermodule.cc
index dcafd2a35f..2a8c39c65c 100644
--- a/python/_khmermodule.cc
+++ b/python/_khmermodule.cc
@@ -3883,7 +3883,7 @@ static PyObject * hashbits_consume_sequence_and_tag_with_colors(PyObject * self,
   khmer::Hashbits * hb = me->hashbits;
   
   char * seq = NULL;
-  unsigned long long c;
+  unsigned long long c = NULL;
   if (!PyArg_ParseTuple(args, "sK", &seq, &c)) {
     return NULL;
   }
@@ -3902,15 +3902,16 @@ static PyObject * hashbits_consume_sequence_and_tag_with_colors(PyObject * self,
   return Py_BuildValue("L", n_consumed);
 }
 
-static PyObject * hashbits_sweep_sequence_for_colors(PyObject * self, PyObject * args) {
+static PyObject * hashbits_sweep_color_neighborhood(PyObject * self, PyObject * args) {
   khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
   khmer::Hashbits * hb = me->hashbits;
   
   char * seq = NULL;
+  unsigned int range = NULL;
   PyObject * break_on_stop_tags_o = NULL;
   PyObject * stop_big_traversals_o = NULL;
 
-  if (!PyArg_ParseTuple(args, "s|OO", &seq,
+  if (!PyArg_ParseTuple(args, "si|OO", &seq, &range,
 			&break_on_stop_tags_o,
 			&stop_big_traversals_o)) {
     return NULL;
@@ -3935,7 +3936,7 @@ static PyObject * hashbits_sweep_sequence_for_colors(PyObject * self, PyObject *
   bool exc_raised = false;
   //Py_BEGIN_ALLOW_THREADS
   try {
-    hb->sweep_sequence_for_colors(seq, found_colors, break_on_stop_tags, stop_big_traversals);
+    hb->sweep_sequence_for_colors(seq, found_colors, range, break_on_stop_tags, stop_big_traversals);
   } catch (_khmer_signal &e) {
     exc_raised = true;
   }
@@ -3958,16 +3959,17 @@ static PyObject * hashbits_sweep_sequence_for_colors(PyObject * self, PyObject *
 // @cswelcher TODO: this is broken az, fix it asap
 // need a tags_in_sequence iterator or function in c++ land for reuse in all
 // these functions
-static PyObject * hashbits_get_all_tags(PyObject * self, PyObject *args)
+static PyObject * hashbits_sweep_tag_neighborhood(PyObject * self, PyObject *args)
 {
   khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
   khmer::Hashbits * hashbits = me->hashbits;
 
   char * seq = NULL;
+  unsigned long range = NULL;
   PyObject * break_on_stop_tags_o = NULL;
   PyObject * stop_big_traversals_o = NULL;
 
-  if (!PyArg_ParseTuple(args, "s|OO", &seq,
+  if (!PyArg_ParseTuple(args, "si|OO", &seq, &range,
 			&break_on_stop_tags_o,
 			&stop_big_traversals_o)) {
     return NULL;

From 88154dcd1cd1d44a166439981878747980f94072 Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Tue, 1 Oct 2013 11:55:28 -0400
Subject: [PATCH 040/140] implemented new perimeter traversal as part of sweep
 optimization

---
 lib/subset.cc          | 36 +++++++++++++++++++++++++-----------
 lib/subset.hh          |  7 +++++++
 python/_khmermodule.cc |  9 +++++++--
 3 files changed, 39 insertions(+), 13 deletions(-)

diff --git a/lib/subset.cc b/lib/subset.cc
index e379b2b5ee..1a251e5e6a 100644
--- a/lib/subset.cc
+++ b/lib/subset.cc
@@ -303,6 +303,8 @@ void SubsetPartition::queue_neighbors(HashIntoType kmer_f,
                                     HashIntoType kmer_r,
                                     NodeQueue& node_q,
                                     std::queue<unsigned int> breadth_q) {
+                                    
+    HashIntoType f, r;
     
     f = next_f(kmer_f, 'A');
     r = next_r(kmer_r, 'A');
@@ -524,18 +526,16 @@ void SubsetPartition::find_all_tags(HashIntoType kmer_f,
 
 
 // Perform a breadth-first search starting from the k-mers in the given sequence
-void SubsetPartition::sweep_for_tags(
-                    HashIntoType kmer_f,
-				    HashIntoType kmer_r,
+unsigned int SubsetPartition::sweep_for_tags(char * seq,
 				    SeenSet& tagged_kmers,
 				    const SeenSet& all_tags,
+				    unsigned int range,
 				    bool break_on_stop_tags,
 				    bool stop_big_traversals)
 {
   const HashIntoType bitmask = _ht->bitmask;
 
-  HashIntoType f, r;
-  bool first = true;
+  SeenSet traversed_kmers;
   NodeQueue node_q;
   std::queue<unsigned int> breadth_q;
   unsigned int cur_breadth = 0;
@@ -547,11 +547,25 @@ void SubsetPartition::sweep_for_tags(
 
   // start breadth-first search.
 
-  node_q.push(kmer_f);
-  node_q.push(kmer_r);
-  breadth_q.push(0);
+  HashIntoType kmer_f, kmer_r, kmer;
+  KMerIterator kmers(seq, ksize());
+  str::string kmer_s;
+  
+  // Queue up all the sequenes k-mers at breadth zero
+  // We are searching around the perimeter of the known k-mers
+  // @cswelcher still using kludgy kmer iterator, let's fix this sometime...
+  while (!kmers.done()) {
+    kmer = kmers.next();
+    kmer_s = revhash(kmer, ksize());
+    kmer = _hash(kmer_s.c_str(), ksize(), kmer_f, kmer_r);
+    
+    node_q.push(kmer_f);
+    node_q.push(kmer_r);
+    breadth_q.push(0);
+  }
 
   while(!node_q.empty()) {
+    // change this to a better hueristic
     if (stop_big_traversals && traversed_kmers.size() > BIG_TRAVERSALS_ARE) {
       tagged_kmers.clear();
       break;
@@ -585,7 +599,7 @@ void SubsetPartition::sweep_for_tags(
     // Is this a kmer-to-tag, and have we put this tag in a partition already?
     // Search no further in this direction.  (This is where we connect
     // partitions.)
-    if (!first && set_contains(all_tags, kmer)) {
+    if (breadth && set_contains(all_tags, kmer)) {
       tagged_kmers.insert(kmer);
       continue;
     }
@@ -593,11 +607,11 @@ void SubsetPartition::sweep_for_tags(
     assert(breadth >= cur_breadth); // keep track of watermark, for debugging.
     if (breadth > cur_breadth) { cur_breadth = breadth; }
 
-    if (breadth >= max_breadth) { continue; } // truncate search @CTB exit?
+    if (breadth >= max_breadth or breatdth >= range) { continue; } // truncate search @CTB exit?
 
     queue_neighbors(kmer_f, kmer_r, node_q, breadth_q);    
-    first = false;
   }
+  return total;
 }
 
 // find_all_tags: the core of the partitioning code.  finds all tagged k-mers
diff --git a/lib/subset.hh b/lib/subset.hh
index 282a9a6ac9..cc08eff1aa 100644
--- a/lib/subset.hh
+++ b/lib/subset.hh
@@ -76,6 +76,13 @@ namespace khmer {
 				    const SeenSet& all_tags,
 				    bool break_on_stop_tags,
 				    bool stop_big_traversals);
+
+    unsigned int sweep_for_tags(char * seq,
+				    SeenSet& tagged_kmers,
+				    const SeenSet& all_tags,
+				    unsigned int range,
+				    bool break_on_stop_tags,
+				    bool stop_big_traversals);
 				    
     void find_all_tags_truncate_on_abundance(HashIntoType kmer_f,
 					     HashIntoType kmer_r,
diff --git a/python/_khmermodule.cc b/python/_khmermodule.cc
index 2a8c39c65c..73c61da559 100644
--- a/python/_khmermodule.cc
+++ b/python/_khmermodule.cc
@@ -3965,16 +3965,21 @@ static PyObject * hashbits_sweep_tag_neighborhood(PyObject * self, PyObject *arg
   khmer::Hashbits * hashbits = me->hashbits;
 
   char * seq = NULL;
-  unsigned long range = NULL;
+  PyObject * r = NULL;
   PyObject * break_on_stop_tags_o = NULL;
   PyObject * stop_big_traversals_o = NULL;
 
-  if (!PyArg_ParseTuple(args, "si|OO", &seq, &range,
+  if (!PyArg_ParseTuple(args, "s|iOO", &seq, &r,
 			&break_on_stop_tags_o,
 			&stop_big_traversals_o)) {
     return NULL;
   }
 
+  unsigned int range = (2 * hashbits->_tag_density) + 1;
+  if (r) {
+    range = r;
+  }
+
   bool break_on_stop_tags = false;
   if (break_on_stop_tags_o && PyObject_IsTrue(break_on_stop_tags_o)) {
     break_on_stop_tags = true;

From ab23874575fd1e6b9dceed39837722812ad7b8dd Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Tue, 1 Oct 2013 14:55:06 -0400
Subject: [PATCH 041/140] fixed error with new perimeter alg

---
 lib/hashtable.cc       |  5 +++--
 lib/hashtable.hh       |  2 +-
 lib/subset.cc          | 41 +++++++++++++++++++++++------------------
 lib/subset.hh          |  7 +++++++
 python/_khmermodule.cc | 24 ++++++++----------------
 5 files changed, 42 insertions(+), 37 deletions(-)

diff --git a/lib/hashtable.cc b/lib/hashtable.cc
index 72d332109f..9ff868a161 100644
--- a/lib/hashtable.cc
+++ b/lib/hashtable.cc
@@ -2235,13 +2235,14 @@ unsigned int Hashtable::sweep_sequence_for_colors(const std::string& seq,
       _hash(kmer_s.c_str(), _ksize, kmer_f, kmer_r);
       
       // don't even try traversing from k-mers not in the hashtable
+      //traversed_kmers.clear();
       if (get_count(uniqify_rc(kmer_f,kmer_r))) {
         partition->find_all_tags(kmer_f, kmer_r, tagged_kmers,
-          traversed_kmers, all_tags, break_on_stoptags, stop_big_traversals);
+                   all_tags, break_on_stoptags, stop_big_traversals);
         traverse_colors_and_resolve(tagged_kmers, found_colors);
       }
     }
-    return traversed_kmers.size()
+    return traversed_kmers.size();
 }
 
 ColorPtrSet Hashtable::get_tag_colors(const HashIntoType& tag) {
diff --git a/lib/hashtable.hh b/lib/hashtable.hh
index c0c25584e3..2fa703c153 100644
--- a/lib/hashtable.hh
+++ b/lib/hashtable.hh
@@ -455,7 +455,7 @@ namespace khmer {
     // Partitioning stuff.
 
     unsigned int n_tags() const { return all_tags.size(); }
-    unsigned int n_colors() const { return colors_ptrs.size(); }
+    unsigned int n_colors() const { return color_ptrs.size(); }
 
     void divide_tags_into_subsets(unsigned int subset_size, SeenSet& divvy);
 
diff --git a/lib/subset.cc b/lib/subset.cc
index 1a251e5e6a..6dd8715dd8 100644
--- a/lib/subset.cc
+++ b/lib/subset.cc
@@ -301,15 +301,19 @@ unsigned int SubsetPartition::find_unpart(const std::string infilename,
  */
 void SubsetPartition::queue_neighbors(HashIntoType kmer_f,
                                     HashIntoType kmer_r,
+                                    unsigned int breadth,
+                                    SeenSet& traversed_kmers,
                                     NodeQueue& node_q,
                                     std::queue<unsigned int> breadth_q) {
                                     
     HashIntoType f, r;
+    const unsigned int rc_left_shift = _ht->ksize()*2 - 2;
+    const HashIntoType bitmask = _ht->bitmask;
     
     f = next_f(kmer_f, 'A');
     r = next_r(kmer_r, 'A');
     if (_ht->get_count(uniqify_rc(f,r)) &&
-	!set_contains(keeper, uniqify_rc(f,r))) {
+	!set_contains(traversed_kmers, uniqify_rc(f,r))) {
       node_q.push(f); node_q.push(r);
       breadth_q.push(breadth + 1);
     }
@@ -317,7 +321,7 @@ void SubsetPartition::queue_neighbors(HashIntoType kmer_f,
     f = next_f(kmer_f, 'C');
     r = next_r(kmer_r, 'C');
     if (_ht->get_count(uniqify_rc(f,r)) && 
-        !set_contains(keeper, uniqify_rc(f,r))) {
+        !set_contains(traversed_kmers, uniqify_rc(f,r))) {
       node_q.push(f); node_q.push(r);
       breadth_q.push(breadth + 1);
     }
@@ -325,7 +329,7 @@ void SubsetPartition::queue_neighbors(HashIntoType kmer_f,
     f = next_f(kmer_f, 'G');
     r = next_r(kmer_r, 'G');
     if (_ht->get_count(uniqify_rc(f,r)) && 
-        !set_contains(keeper, uniqify_rc(f,r))) {
+        !set_contains(traversed_kmers, uniqify_rc(f,r))) {
       node_q.push(f); node_q.push(r);
       breadth_q.push(breadth + 1);
     }
@@ -333,7 +337,7 @@ void SubsetPartition::queue_neighbors(HashIntoType kmer_f,
     f = next_f(kmer_f, 'T');
     r = next_r(kmer_r, 'T');
     if (_ht->get_count(uniqify_rc(f,r)) && 
-        !set_contains(keeper, uniqify_rc(f,r))) {
+        !set_contains(traversed_kmers, uniqify_rc(f,r))) {
       node_q.push(f); node_q.push(r);
       breadth_q.push(breadth + 1);
     }
@@ -342,7 +346,7 @@ void SubsetPartition::queue_neighbors(HashIntoType kmer_f,
     r = prev_r(kmer_r, 'A');
     f = prev_f(kmer_f, 'A');
     if (_ht->get_count(uniqify_rc(f,r)) && 
-        !set_contains(keeper, uniqify_rc(f,r))) {
+        !set_contains(traversed_kmers, uniqify_rc(f,r))) {
       node_q.push(f); node_q.push(r);
       breadth_q.push(breadth + 1);
     }
@@ -350,7 +354,7 @@ void SubsetPartition::queue_neighbors(HashIntoType kmer_f,
     r = prev_r(kmer_r, 'C');
     f = prev_f(kmer_f, 'C');
     if (_ht->get_count(uniqify_rc(f,r)) && 
-        !set_contains(keeper, uniqify_rc(f,r))) {
+        !set_contains(traversed_kmers, uniqify_rc(f,r))) {
       node_q.push(f); node_q.push(r);
       breadth_q.push(breadth + 1);
     }
@@ -358,7 +362,7 @@ void SubsetPartition::queue_neighbors(HashIntoType kmer_f,
     r = prev_r(kmer_r, 'G');
     f = prev_f(kmer_f, 'G');
     if (_ht->get_count(uniqify_rc(f,r)) && 
-        !set_contains(keeper, uniqify_rc(f,r))) {
+        !set_contains(traversed_kmers, uniqify_rc(f,r))) {
       node_q.push(f); node_q.push(r);
       breadth_q.push(breadth + 1);
     }
@@ -366,7 +370,7 @@ void SubsetPartition::queue_neighbors(HashIntoType kmer_f,
     r = prev_r(kmer_r, 'T');
     f = prev_f(kmer_f, 'T');
     if (_ht->get_count(uniqify_rc(f,r)) && 
-        !set_contains(keeper, uniqify_rc(f,r))) {
+        !set_contains(traversed_kmers, uniqify_rc(f,r))) {
       node_q.push(f); node_q.push(r);
       breadth_q.push(breadth + 1);
     }
@@ -533,7 +537,6 @@ unsigned int SubsetPartition::sweep_for_tags(char * seq,
 				    bool break_on_stop_tags,
 				    bool stop_big_traversals)
 {
-  const HashIntoType bitmask = _ht->bitmask;
 
   SeenSet traversed_kmers;
   NodeQueue node_q;
@@ -542,22 +545,22 @@ unsigned int SubsetPartition::sweep_for_tags(char * seq,
   unsigned int breadth = 0;
   const unsigned int max_breadth = (2 * _ht->_tag_density) + 1;
 
-  const unsigned int rc_left_shift = _ht->ksize()*2 - 2;
+
   unsigned int total = 0;
 
   // start breadth-first search.
 
   HashIntoType kmer_f, kmer_r, kmer;
-  KMerIterator kmers(seq, ksize());
-  str::string kmer_s;
+  KMerIterator kmers(seq, _ht->ksize());
+  std::string kmer_s;
   
   // Queue up all the sequenes k-mers at breadth zero
   // We are searching around the perimeter of the known k-mers
   // @cswelcher still using kludgy kmer iterator, let's fix this sometime...
   while (!kmers.done()) {
     kmer = kmers.next();
-    kmer_s = revhash(kmer, ksize());
-    kmer = _hash(kmer_s.c_str(), ksize(), kmer_f, kmer_r);
+    kmer_s = _revhash(kmer, _ht->ksize());
+    kmer = _hash(kmer_s.c_str(), _ht->ksize(), kmer_f, kmer_r);
     
     node_q.push(kmer_f);
     node_q.push(kmer_r);
@@ -604,12 +607,13 @@ unsigned int SubsetPartition::sweep_for_tags(char * seq,
       continue;
     }
 
-    assert(breadth >= cur_breadth); // keep track of watermark, for debugging.
-    if (breadth > cur_breadth) { cur_breadth = breadth; }
+    // removed for not doing anything
+    //assert(breadth >= cur_breadth); // keep track of watermark, for debugging.
+    //if (breadth > cur_breadth) { cur_breadth = breadth; }
 
-    if (breadth >= max_breadth or breatdth >= range) { continue; } // truncate search @CTB exit?
+    if (breadth >= max_breadth or breadth >= range) { continue; } // truncate search @CTB exit?
 
-    queue_neighbors(kmer_f, kmer_r, node_q, breadth_q);    
+    queue_neighbors(kmer_f, kmer_r, breadth, traversed_kmers, node_q, breadth_q);    
   }
   return total;
 }
@@ -690,6 +694,7 @@ void SubsetPartition::find_all_tags_truncate_on_abundance(HashIntoType kmer_f,
       continue;
     }
 
+    // @cswelcher Do these lines actually do anything?
     assert(breadth >= cur_breadth); // keep track of watermark, for debugging.
     if (breadth > cur_breadth) { cur_breadth = breadth; }
 
diff --git a/lib/subset.hh b/lib/subset.hh
index cc08eff1aa..3a5dfb0ab3 100644
--- a/lib/subset.hh
+++ b/lib/subset.hh
@@ -63,6 +63,13 @@ namespace khmer {
     void load_partitionmap(std::string infile);
     void _validate_pmap();
 
+    void queue_neighbors(HashIntoType kmer_f,
+                                    HashIntoType kmer_r,
+                                    unsigned int breadth,
+                                    SeenSet& traversed_kmers,
+                                    NodeQueue& node_q,
+                                    std::queue<unsigned int> breadth_q);
+
     void find_all_tags(HashIntoType kmer_f, HashIntoType kmer_r,
 		       SeenSet& tagged_kmers,
 		       const SeenSet& all_tags,
diff --git a/python/_khmermodule.cc b/python/_khmermodule.cc
index 73c61da559..81d2864130 100644
--- a/python/_khmermodule.cc
+++ b/python/_khmermodule.cc
@@ -3936,7 +3936,7 @@ static PyObject * hashbits_sweep_color_neighborhood(PyObject * self, PyObject *
   bool exc_raised = false;
   //Py_BEGIN_ALLOW_THREADS
   try {
-    hb->sweep_sequence_for_colors(seq, found_colors, range, break_on_stop_tags, stop_big_traversals);
+    hb->sweep_sequence_for_colors(seq, found_colors, break_on_stop_tags, stop_big_traversals);
   } catch (_khmer_signal &e) {
     exc_raised = true;
   }
@@ -3965,7 +3965,7 @@ static PyObject * hashbits_sweep_tag_neighborhood(PyObject * self, PyObject *arg
   khmer::Hashbits * hashbits = me->hashbits;
 
   char * seq = NULL;
-  PyObject * r = NULL;
+  unsigned int r = NULL;
   PyObject * break_on_stop_tags_o = NULL;
   PyObject * stop_big_traversals_o = NULL;
 
@@ -3975,7 +3975,7 @@ static PyObject * hashbits_sweep_tag_neighborhood(PyObject * self, PyObject *arg
     return NULL;
   }
 
-  unsigned int range = (2 * hashbits->_tag_density) + 1;
+  unsigned int range = (2 * hashbits->_get_tag_density()) + 1;
   if (r) {
     range = r;
   }
@@ -3994,19 +3994,11 @@ static PyObject * hashbits_sweep_tag_neighborhood(PyObject * self, PyObject *arg
   }
 
   khmer::SeenSet tagged_kmers;
-  khmer::HashIntoType kmer_f, kmer_r, kmer;
-  KMerIterator kmers(seq, hashbits->ksize());
-  std::string kmer_s;
+
   //Py_BEGIN_ALLOW_THREADS
 
-    while (!kmers.done()) {
-      kmer = kmers.next();
-      kmer_s = khmer::_revhash(kmer, hashbits->ksize());
-      kmer = khmer::_hash(kmer_s.c_str(), hashbits->ksize(), kmer_f, kmer_r);
-      
-      hashbits->partition->find_all_tags(kmer_f, kmer_r, tagged_kmers, 
-            hashbits->all_tags, break_on_stop_tags, stop_big_traversals);
-    }
+  hashbits->partition->sweep_for_tags(seq, tagged_kmers, 
+            hashbits->all_tags, range, break_on_stop_tags, stop_big_traversals);
 
   //Py_END_ALLOW_THREADS
 
@@ -4132,9 +4124,9 @@ static PyMethodDef khmer_hashbits_methods[] = {
   { "repartition_largest_partition", hashbits_repartition_largest_partition, METH_VARARGS, "" },
   { "get_median_count", hashbits_get_median_count, METH_VARARGS, "Get the median, average, and stddev of the k-mer counts in the string" },
   { "consume_fasta_and_tag_with_colors", hashbits_consume_fasta_and_tag_with_colors, METH_VARARGS, "" },
-  { "sweep_sequence_for_colors", hashbits_sweep_sequence_for_colors, METH_VARARGS, "" },
+  { "sweep_color_neighborhood", hashbits_sweep_color_neighborhood, METH_VARARGS, "" },
   {"consume_partitioned_fasta_and_tag_with_colors", hashbits_consume_partitioned_fasta_and_tag_with_colors, METH_VARARGS, "" },
-  {"get_all_tags", hashbits_get_all_tags, METH_VARARGS, "" },
+  {"sweep_tag_neighborhood", hashbits_sweep_tag_neighborhood, METH_VARARGS, "" },
   {"get_tag_colors", hashbits_get_tag_colors, METH_VARARGS, ""},
   {"consume_sequence_and_tag_with_colors", hashbits_consume_sequence_and_tag_with_colors, METH_VARARGS, "" },
   {"n_colors", hashbits_n_colors, METH_VARARGS, ""},

From 74c9a88f67ad6903aa4573da611635bf246a7a00 Mon Sep 17 00:00:00 2001
From: CS Welcher <cs.welcher@gmail.com>
Date: Tue, 1 Oct 2013 16:11:49 -0400
Subject: [PATCH 042/140] fixed error in traversal params

---
 lib/color_tst_opt.py  | 78 +++++++++++++++++++++++++++++++++++++++++++
 lib/color_tst_slow.py | 78 +++++++++++++++++++++++++++++++++++++++++++
 lib/subset.cc         |  2 +-
 3 files changed, 157 insertions(+), 1 deletion(-)
 create mode 100644 lib/color_tst_opt.py
 create mode 100644 lib/color_tst_slow.py

diff --git a/lib/color_tst_opt.py b/lib/color_tst_opt.py
new file mode 100644
index 0000000000..8c75fe2e5b
--- /dev/null
+++ b/lib/color_tst_opt.py
@@ -0,0 +1,78 @@
+import khmer
+import screed
+
+def reverse_comp(s):
+    ret = ''
+    for i in range(len(s)-1,-1,-1):
+        c = s[i]
+        if c == 'A':
+            ret += 'T'
+        elif c == 'T':
+            ret += 'A'
+        elif c == 'G':
+            ret += 'C'
+        else:
+            ret += 'G'
+    return ret
+
+ht = khmer.new_hashbits(20,1e8,4)
+ht.consume_fasta_and_tag_with_colors('../tests/test-data/test-reads.fa')
+N = 100
+for n, record in enumerate(screed.open('../tests/test-data/test-reads.fa')):
+    if n >= N:
+        break
+    ht.sweep_tag_neighborhood(record.sequence, 10)
+
+#print ht.sweep_sequence_for_colors('CACACACGGACATCGGAGAGAGGCTGAGACAGCGAGACACACAGAGACAGAGCGGAGAGGGCACAGACAGACAAGAGCATGAGAGATCGGCAGAGCGGTG', False, False)
+#print ht.sweep_sequence_for_colors('CGCCGTAGTCGTACTGGTTCTCCTCCGTGTACTCGTGCGCTGCCTCCACCTCTGGGCTGCTCATGCCCTCCATGTGACCTTCAGGCATGCCCTCGGAGAT', False, False)
+#print ht.sweep_sequence_for_colors('GGAGAGCCTGGGGCCAAGCCCGAGGGCATGCCTGAAGGTCACATGGAGGGCATGAGCAGCCCAG', False, False)
+#print ht.sweep_sequence_for_colors('TTTTTTGAATACGTTTAGTTAATATTTGTACTTCAATTAATAAAAATTTGCTATAATTTTTCCATTATCGCCAGTCACTCGCGTGATATAGGAAAAGGTT', False, False)
+#print ht.sweep_sequence_for_colors('AAGCAGTGGTATCAACGCAGAGTACGCGGGGACTCTGTCGCTGCTCCTCTAGCACAGAGAGCCAGAGACGGCTTACAGCAGCAGCATCATATAGCCTC', False, False)
+
+#t0 = 'CCATGTAGCGCCGCACACCTTTGTAGGTGTTGTAATAATCTTCGATGACTTTCTTCGCTTCCTGACGGCTTATGCC'
+#t1 = 'ACCGCGCGCGAATCGACGGTTGTCAGCCAAAGGCGTTCAACACCAGCACCGCCCTTAAGCCGCCCGCCCGCCGCCC'
+'''
+N = 100
+for n, record in enumerate(screed.open('../tests/test-data/test-reads.fa')):
+    if n > N:
+        break
+    print '*' * 40
+    seq = record.sequence
+    print seq
+    colors = ht.sweep_sequence_for_colors(seq, False, False)
+    print 'colors from sweep:', colors
+    tags = ht.get_all_tags(seq)
+    print 'tags from get_all_tags:', tags
+    print 'colors from get_tag_colors:'
+    t_colors = set()
+    for tag in tags:
+        t_colors.update(ht.get_tag_colors(tag))
+    print t_colors
+    assert len(t_colors) == len(colors)
+'''
+'''
+file_pointers = {}
+for n, record in enumerate(screed.open('/w/2013-lamprey/syn_part/syn.sweep.fa')):
+    if n >= N:
+        break
+    if n % 1000 == 0:
+        print '...processed {} reads'.format(n)
+    colors = ht.sweep_sequence_for_colors(record.sequence, False, False)
+    for c in colors:
+        if c in file_pointers.viewkeys():
+            file_pointers[c].write('>{}\n{}\n'.format(record.name, record.sequence))
+        else:
+            file_pointers[c] = open('color_{}.fa'.format(c), 'wb')
+            file_pointers[c].write('>{}\n{}\n'.format(record.name, record.sequence))\
+'''
+'''
+ht = khmer.new_hashbits(25, 1e9,4)
+ht.consume_partitioned_fasta_and_tag_with_colors('/w/2013-lamprey/test.fp')
+
+for n, record in enumerate(screed.open('/w/lamprey-mrnaseq/reads/single/L82-a.fq.gz')):
+    if n >= N:
+        break
+    colors = ht.sweep_sequence_for_colors(record.sequence, False,  False)
+    if colors:
+        print colors
+'''
diff --git a/lib/color_tst_slow.py b/lib/color_tst_slow.py
new file mode 100644
index 0000000000..2f25f857e4
--- /dev/null
+++ b/lib/color_tst_slow.py
@@ -0,0 +1,78 @@
+import khmer
+import screed
+
+def reverse_comp(s):
+    ret = ''
+    for i in range(len(s)-1,-1,-1):
+        c = s[i]
+        if c == 'A':
+            ret += 'T'
+        elif c == 'T':
+            ret += 'A'
+        elif c == 'G':
+            ret += 'C'
+        else:
+            ret += 'G'
+    return ret
+
+ht = khmer.new_hashbits(20,1e8,4)
+ht.consume_fasta_and_tag_with_colors('../tests/test-data/test-reads.fa')
+N = 100
+for n, record in enumerate(screed.open('../tests/test-data/test-reads.fa')):
+    if n >= N:
+        break
+    ht.sweep_color_neighborhood(record.sequence)
+
+#print ht.sweep_sequence_for_colors('CACACACGGACATCGGAGAGAGGCTGAGACAGCGAGACACACAGAGACAGAGCGGAGAGGGCACAGACAGACAAGAGCATGAGAGATCGGCAGAGCGGTG', False, False)
+#print ht.sweep_sequence_for_colors('CGCCGTAGTCGTACTGGTTCTCCTCCGTGTACTCGTGCGCTGCCTCCACCTCTGGGCTGCTCATGCCCTCCATGTGACCTTCAGGCATGCCCTCGGAGAT', False, False)
+#print ht.sweep_sequence_for_colors('GGAGAGCCTGGGGCCAAGCCCGAGGGCATGCCTGAAGGTCACATGGAGGGCATGAGCAGCCCAG', False, False)
+#print ht.sweep_sequence_for_colors('TTTTTTGAATACGTTTAGTTAATATTTGTACTTCAATTAATAAAAATTTGCTATAATTTTTCCATTATCGCCAGTCACTCGCGTGATATAGGAAAAGGTT', False, False)
+#print ht.sweep_sequence_for_colors('AAGCAGTGGTATCAACGCAGAGTACGCGGGGACTCTGTCGCTGCTCCTCTAGCACAGAGAGCCAGAGACGGCTTACAGCAGCAGCATCATATAGCCTC', False, False)
+
+#t0 = 'CCATGTAGCGCCGCACACCTTTGTAGGTGTTGTAATAATCTTCGATGACTTTCTTCGCTTCCTGACGGCTTATGCC'
+#t1 = 'ACCGCGCGCGAATCGACGGTTGTCAGCCAAAGGCGTTCAACACCAGCACCGCCCTTAAGCCGCCCGCCCGCCGCCC'
+'''
+N = 100
+for n, record in enumerate(screed.open('../tests/test-data/test-reads.fa')):
+    if n > N:
+        break
+    print '*' * 40
+    seq = record.sequence
+    print seq
+    colors = ht.sweep_sequence_for_colors(seq, False, False)
+    print 'colors from sweep:', colors
+    tags = ht.get_all_tags(seq)
+    print 'tags from get_all_tags:', tags
+    print 'colors from get_tag_colors:'
+    t_colors = set()
+    for tag in tags:
+        t_colors.update(ht.get_tag_colors(tag))
+    print t_colors
+    assert len(t_colors) == len(colors)
+'''
+'''
+file_pointers = {}
+for n, record in enumerate(screed.open('/w/2013-lamprey/syn_part/syn.sweep.fa')):
+    if n >= N:
+        break
+    if n % 1000 == 0:
+        print '...processed {} reads'.format(n)
+    colors = ht.sweep_sequence_for_colors(record.sequence, False, False)
+    for c in colors:
+        if c in file_pointers.viewkeys():
+            file_pointers[c].write('>{}\n{}\n'.format(record.name, record.sequence))
+        else:
+            file_pointers[c] = open('color_{}.fa'.format(c), 'wb')
+            file_pointers[c].write('>{}\n{}\n'.format(record.name, record.sequence))\
+'''
+'''
+ht = khmer.new_hashbits(25, 1e9,4)
+ht.consume_partitioned_fasta_and_tag_with_colors('/w/2013-lamprey/test.fp')
+
+for n, record in enumerate(screed.open('/w/lamprey-mrnaseq/reads/single/L82-a.fq.gz')):
+    if n >= N:
+        break
+    colors = ht.sweep_sequence_for_colors(record.sequence, False,  False)
+    if colors:
+        print colors
+'''
diff --git a/lib/subset.cc b/lib/subset.cc
index 6dd8715dd8..ba54f8e50f 100644
--- a/lib/subset.cc
+++ b/lib/subset.cc
@@ -304,7 +304,7 @@ void SubsetPartition::queue_neighbors(HashIntoType kmer_f,
                                     unsigned int breadth,
                                     SeenSet& traversed_kmers,
                                     NodeQueue& node_q,
-                                    std::queue<unsigned int> breadth_q) {
+                                    std::queue<unsigned int>& breadth_q) {
                                     
     HashIntoType f, r;
     const unsigned int rc_left_shift = _ht->ksize()*2 - 2;

From 05cbe35928bca46cbb79be89ca5af27b1b0ff47d Mon Sep 17 00:00:00 2001
From: CS Welcher <cs.welcher@gmail.com>
Date: Tue, 1 Oct 2013 17:08:14 -0400
Subject: [PATCH 043/140] fixed func prototype to match prev change

---
 lib/subset.hh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/subset.hh b/lib/subset.hh
index 3a5dfb0ab3..9664b71a36 100644
--- a/lib/subset.hh
+++ b/lib/subset.hh
@@ -68,7 +68,7 @@ namespace khmer {
                                     unsigned int breadth,
                                     SeenSet& traversed_kmers,
                                     NodeQueue& node_q,
-                                    std::queue<unsigned int> breadth_q);
+                                    std::queue<unsigned int>& breadth_q);
 
     void find_all_tags(HashIntoType kmer_f, HashIntoType kmer_r,
 		       SeenSet& tagged_kmers,

From 235b746d7d061f5f86668b11c5caf0c0ec6ff1d7 Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Tue, 1 Oct 2013 19:37:37 -0400
Subject: [PATCH 044/140] fixed bug in color allocation during consume_fasta,
 fixed unexpected behavior in new traversal code and improved performance,
 fixed tests, added new function for exporting all colors

---
 lib/hashtable.cc       | 19 ++++++++--
 lib/hashtable.hh       |  6 ++++
 lib/subset.cc          | 13 ++++---
 lib/subset.hh          | 10 +-----
 python/_khmermodule.cc | 81 +++++++++++++++++++++++++++++++++++++++---
 tests/test_hashbits.py | 57 ++++++++++++++++-------------
 6 files changed, 139 insertions(+), 47 deletions(-)

diff --git a/lib/hashtable.cc b/lib/hashtable.cc
index 9ff868a161..ad1adca0f9 100644
--- a/lib/hashtable.cc
+++ b/lib/hashtable.cc
@@ -2003,7 +2003,7 @@ Hashtable::consume_fasta_and_tag_with_colors(
     
     Color _tag_color = 0;
 
-    Color * the_color = check_and_allocate_color(_tag_color);
+    Color * the_color;
     // Iterate through the reads and consume their k-mers.
     while (!parser->is_complete( ))
     {
@@ -2014,11 +2014,11 @@ Hashtable::consume_fasta_and_tag_with_colors(
       if (check_and_normalize_read( read.sequence ))
       {
         // TODO: make threadsafe!
+        the_color = check_and_allocate_color(_tag_color);
         consume_sequence_and_tag_with_colors( read.sequence,
 					      this_n_consumed,
 					      *the_color );
 	    _tag_color++;
-        the_color = check_and_allocate_color(_tag_color);
 
   #ifdef WITH_INTERNAL_METRICS
         hasher.pmetrics.start_timers( );
@@ -2245,6 +2245,21 @@ unsigned int Hashtable::sweep_sequence_for_colors(const std::string& seq,
     return traversed_kmers.size();
 }
 
+unsigned int Hashtable::sweep_color_neighborhood(const std::string& seq,
+                                                  ColorPtrSet& found_colors,
+                                                  unsigned int range,
+                                                  bool break_on_stoptags,
+                                                  bool stop_big_traversals) {
+
+    SeenSet tagged_kmers;
+    unsigned int num_traversed;
+    num_traversed = partition->sweep_for_tags(seq, tagged_kmers, all_tags, 
+                              range, break_on_stoptags, stop_big_traversals);
+    traverse_colors_and_resolve(tagged_kmers, found_colors);
+
+    return num_traversed;
+}
+
 ColorPtrSet Hashtable::get_tag_colors(const HashIntoType& tag) {
   ColorPtrSet colors;
   unsigned int num_colors;
diff --git a/lib/hashtable.hh b/lib/hashtable.hh
index 2fa703c153..23c8c7b0ec 100644
--- a/lib/hashtable.hh
+++ b/lib/hashtable.hh
@@ -543,6 +543,12 @@ namespace khmer {
 					bool break_on_stoptags,
 					bool stop_big_traversals);
 					
+    unsigned int sweep_color_neighborhood(const std::string & seq,
+                                                  ColorPtrSet& found_colors,
+                                                  unsigned int range,
+                                                  bool break_on_stoptags,
+                                                  bool stop_big_traversals);
+                                                  			
     void traverse_colors_and_resolve(const SeenSet& tagged_kmers,
                                      ColorPtrSet& found_colors);
 
diff --git a/lib/subset.cc b/lib/subset.cc
index ba54f8e50f..7060865c50 100644
--- a/lib/subset.cc
+++ b/lib/subset.cc
@@ -530,7 +530,7 @@ void SubsetPartition::find_all_tags(HashIntoType kmer_f,
 
 
 // Perform a breadth-first search starting from the k-mers in the given sequence
-unsigned int SubsetPartition::sweep_for_tags(char * seq,
+unsigned int SubsetPartition::sweep_for_tags(const std::string& seq,
 				    SeenSet& tagged_kmers,
 				    const SeenSet& all_tags,
 				    unsigned int range,
@@ -551,16 +551,17 @@ unsigned int SubsetPartition::sweep_for_tags(char * seq,
   // start breadth-first search.
 
   HashIntoType kmer_f, kmer_r, kmer;
-  KMerIterator kmers(seq, _ht->ksize());
+  KMerIterator kmers(seq.c_str(), _ht->ksize());
   std::string kmer_s;
   
-  // Queue up all the sequenes k-mers at breadth zero
+  // Queue up all the sequence's k-mers at breadth zero
   // We are searching around the perimeter of the known k-mers
   // @cswelcher still using kludgy kmer iterator, let's fix this sometime...
   while (!kmers.done()) {
     kmer = kmers.next();
     kmer_s = _revhash(kmer, _ht->ksize());
     kmer = _hash(kmer_s.c_str(), _ht->ksize(), kmer_f, kmer_r);
+    traversed_kmers.insert(kmer);
     
     node_q.push(kmer_f);
     node_q.push(kmer_r);
@@ -599,10 +600,8 @@ unsigned int SubsetPartition::sweep_for_tags(char * seq,
     traversed_kmers.insert(kmer);
     total++;
 
-    // Is this a kmer-to-tag, and have we put this tag in a partition already?
-    // Search no further in this direction.  (This is where we connect
-    // partitions.)
-    if (breadth && set_contains(all_tags, kmer)) {
+    // 
+    if (set_contains(all_tags, kmer)) {
       tagged_kmers.insert(kmer);
       continue;
     }
diff --git a/lib/subset.hh b/lib/subset.hh
index 9664b71a36..94809ca421 100644
--- a/lib/subset.hh
+++ b/lib/subset.hh
@@ -76,15 +76,7 @@ namespace khmer {
 		       bool break_on_stop_tags=false,
 		       bool stop_big_traversals=false);
 
-    void find_all_tags(HashIntoType kmer_f,
-				    HashIntoType kmer_r,
-				    SeenSet& tagged_kmers,
-				    SeenSet& traversed_kmers,
-				    const SeenSet& all_tags,
-				    bool break_on_stop_tags,
-				    bool stop_big_traversals);
-
-    unsigned int sweep_for_tags(char * seq,
+    unsigned int sweep_for_tags(const std::string& seq,
 				    SeenSet& tagged_kmers,
 				    const SeenSet& all_tags,
 				    unsigned int range,
diff --git a/python/_khmermodule.cc b/python/_khmermodule.cc
index 81d2864130..e0461e94da 100644
--- a/python/_khmermodule.cc
+++ b/python/_khmermodule.cc
@@ -3818,6 +3818,20 @@ static PyObject * hashbits_get_median_count(PyObject * self, PyObject * args)
   return Py_BuildValue("iff", med, average, stddev);
 }
 
+static PyObject * hashbits_get_color_dict(PyObject * self, PyObject * args) {
+  khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
+  khmer::Hashbits * hb = me->hashbits;
+  
+  PyObject * d = PyDict_New();
+  khmer::ColorPtrMap::iterator it;
+  
+  for (it = hb->color_ptrs.begin(); it!=hb->color_ptrs.end(); ++it) {
+    PyDict_SetItem(d, Py_BuildValue("K", it->first), Py_BuildValue("K", it->second));
+  }
+  
+  return d;
+}
+
 static PyObject * hashbits_consume_fasta_and_tag_with_colors(PyObject * self, PyObject * args)
 {
   khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
@@ -3907,11 +3921,69 @@ static PyObject * hashbits_sweep_color_neighborhood(PyObject * self, PyObject *
   khmer::Hashbits * hb = me->hashbits;
   
   char * seq = NULL;
-  unsigned int range = NULL;
+  unsigned int r = NULL;
+  PyObject * break_on_stop_tags_o = NULL;
+  PyObject * stop_big_traversals_o = NULL;
+
+  if (!PyArg_ParseTuple(args, "s|iOO", &seq, &r,
+			&break_on_stop_tags_o,
+			&stop_big_traversals_o)) {
+    return NULL;
+  }
+
+  unsigned int range = (2 * hb->_get_tag_density()) + 1;
+  if (r) {
+    range = r;
+  }
+
+  bool break_on_stop_tags = false;
+  if (break_on_stop_tags_o && PyObject_IsTrue(break_on_stop_tags_o)) {
+    break_on_stop_tags = true;
+  }
+  bool stop_big_traversals = false;
+  if (stop_big_traversals_o && PyObject_IsTrue(stop_big_traversals_o)) {
+    stop_big_traversals = true;
+  }
+  
+  if (strlen(seq) < hb->ksize()) {
+    return NULL;
+  }
+  
+  //std::pair<TagColorPtrPair::iterator, TagColorPtrPair::iterator> ret;
+  ColorPtrSet found_colors;
+  
+  bool exc_raised = false;
+  //Py_BEGIN_ALLOW_THREADS
+  try {
+    hb->sweep_color_neighborhood(seq, found_colors, range, break_on_stop_tags, stop_big_traversals);
+  } catch (_khmer_signal &e) {
+    exc_raised = true;
+  }
+  //Py_END_ALLOW_THREADS
+  
+  if (exc_raised) return NULL;
+  
+  PyObject * x =  PyList_New(found_colors.size());
+  khmer::ColorPtrSet::const_iterator si;
+  unsigned long long i = 0;
+  for (si=found_colors.begin(); si!=found_colors.end(); ++si) {
+    PyList_SET_ITEM(x, i, Py_BuildValue("K", *(*si)));
+    i++;
+  }
+  
+  return x;
+}
+
+
+static PyObject * hashbits_sweep_color_neighborhood_old(PyObject * self, PyObject * args) {
+  khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
+  khmer::Hashbits * hb = me->hashbits;
+  
+  char * seq = NULL;
   PyObject * break_on_stop_tags_o = NULL;
   PyObject * stop_big_traversals_o = NULL;
 
-  if (!PyArg_ParseTuple(args, "si|OO", &seq, &range,
+  if (!PyArg_ParseTuple(args, "s|OO", &seq,
 			&break_on_stop_tags_o,
 			&stop_big_traversals_o)) {
     return NULL;
@@ -3955,8 +4027,7 @@ static PyObject * hashbits_sweep_color_neighborhood(PyObject * self, PyObject *
   return x;
 }
 
-// Same as find_all_tags, but returns tags in a way actually useable by python
-// @cswelcher TODO: this is broken az, fix it asap
+// Similar to find_all_tags, but returns tags in a way actually useable by python
 // need a tags_in_sequence iterator or function in c++ land for reuse in all
 // these functions
 static PyObject * hashbits_sweep_tag_neighborhood(PyObject * self, PyObject *args)
@@ -4125,11 +4196,13 @@ static PyMethodDef khmer_hashbits_methods[] = {
   { "get_median_count", hashbits_get_median_count, METH_VARARGS, "Get the median, average, and stddev of the k-mer counts in the string" },
   { "consume_fasta_and_tag_with_colors", hashbits_consume_fasta_and_tag_with_colors, METH_VARARGS, "" },
   { "sweep_color_neighborhood", hashbits_sweep_color_neighborhood, METH_VARARGS, "" },
+  { "sweep_color_neighborhood_old", hashbits_sweep_color_neighborhood_old, METH_VARARGS, "" },
   {"consume_partitioned_fasta_and_tag_with_colors", hashbits_consume_partitioned_fasta_and_tag_with_colors, METH_VARARGS, "" },
   {"sweep_tag_neighborhood", hashbits_sweep_tag_neighborhood, METH_VARARGS, "" },
   {"get_tag_colors", hashbits_get_tag_colors, METH_VARARGS, ""},
   {"consume_sequence_and_tag_with_colors", hashbits_consume_sequence_and_tag_with_colors, METH_VARARGS, "" },
   {"n_colors", hashbits_n_colors, METH_VARARGS, ""},
+  {"get_color_dict", hashbits_get_color_dict, METH_VARARGS, "" },
  
   {NULL, NULL, 0, NULL}           /* sentinel */
 };
diff --git a/tests/test_hashbits.py b/tests/test_hashbits.py
index b38112c4df..c69ccf2e4f 100644
--- a/tests/test_hashbits.py
+++ b/tests/test_hashbits.py
@@ -506,12 +506,12 @@ def test_simple_median():
 #  * thread-safety
 #  * n_colors -- make sure to use test-data with multi-colored tags
 
-def test_get_all_tags():
+def test_sweep_tag_neighborhood():
     hb = khmer.new_hashbits(20, 1e7, 4)
     filename = utils.get_test_data('single-read.fq')
     hb.consume_fasta_and_tag(filename)
     
-    tags = hb.get_all_tags('CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT')
+    tags = hb.sweep_tag_neighborhood('CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT')
     assert len(tags) == 1
     assert tags.pop() == 173473779682L
 
@@ -530,20 +530,42 @@ def test_sweep_sequence_for_colors():
     filename = utils.get_test_data('single-read.fq')
     hb.consume_fasta_and_tag_with_colors(filename)
     
-    colors = hb.sweep_sequence_for_colors('CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT')
+    colors = hb.sweep_color_neighborhood('CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT')
     assert len(colors) == 1
     assert colors.pop() == 0L
 
+def test_consume_partitioned_fasta_and_tag_with_colors():
+    hb = khmer.new_hashbits(20, 1e7, 4)
+    filename = utils.get_test_data('real-partition-small.fa')
+
+    total_reads, n_consumed = hb.consume_partitioned_fasta_and_tag_with_colors(filename)
+    colors = set()
+    for record in screed.open(filename):
+        seq = record.sequence
+        colors.update(hb.sweep_color_neighborhood(seq, False, False))
+    #print hb.n_colors()
+    #print colors
+    assert len(colors) == 1
+    assert colors.pop() == 2L
+    assert hb.n_colors() == 1 
+
 def test_consume_fasta_and_tag_with_colors():
     hb = khmer.new_hashbits(20, 1e7, 4)
     read_1 = 'ACGTAACCGGTTAAACCCGGGTTTAAAACCCCGGGGTTTT'
     filename = utils.get_test_data('test-transcript.fa')
 
     total_reads, n_consumed = hb.consume_fasta_and_tag_with_colors(filename)
-    
+
     assert hb.get(read_1[:20])
     assert total_reads == 3
-    #assert hb.n_colors() == 3
+    print hb.n_colors()
+    print hb.get_color_dict()
+    for tag in hb.get_tagset():
+        print tag, khmer.forward_hash(tag, 20)
+    for record in screed.open(filename):
+        print hb.sweep_tag_neighborhood(record.sequence, 40)
+        print hb.sweep_color_neighborhood(record.sequence, 40)
+    assert hb.n_colors() == 3
 
 '''
 * The test data set as four reads: A, B, C, and D
@@ -558,7 +580,7 @@ def test_color_tag_correctness():
     hb.consume_fasta_and_tag_with_colors(filename)
     
     # read A
-    colors = hb.sweep_sequence_for_colors('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG')
+    colors = hb.sweep_color_neighborhood('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG')
     
     print colors
     assert len(colors) == 2
@@ -566,7 +588,7 @@ def test_color_tag_correctness():
     assert 1L in colors
     
     # read B
-    colors = hb.sweep_sequence_for_colors('GCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA')
+    colors = hb.sweep_color_neighborhood('GCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA')
     print colors
     assert len(colors) == 3
     assert 0L in colors
@@ -574,29 +596,14 @@ def test_color_tag_correctness():
     assert 2L in colors
     
     # read C
-    colors = hb.sweep_sequence_for_colors('TGTTGGGGATAGATAGATAGATGAGTGTAGATCCAACAACACATACA')
+    colors = hb.sweep_color_neighborhood('TGTTGGGGATAGATAGATAGATGAGTGTAGATCCAACAACACATACA')
     print colors
     assert len(colors) == 2
     assert 1L in colors
     assert 2L in colors
     
     # read D
-    colors = hb.sweep_sequence_for_colors('TATATATATAGCTAGCTAGCTAACTAGCTAGCATCGATCGATCGATC')
+    colors = hb.sweep_color_neighborhood('TATATATATAGCTAGCTAGCTAACTAGCTAGCATCGATCGATCGATC')
     print colors
     assert len(colors) == 1
-    assert 3L in colors
-        
- 
-def test_consume_partitioned_fasta_and_tag_with_colors():
-    hb = khmer.new_hashbits(20, 1e7, 4)
-    filename = utils.get_test_data('real-partition-small.fa')
-
-    total_reads, n_consumed = hb.consume_partitioned_fasta_and_tag_with_colors(filename)
-    #assert hb.n_colors() == 1
-    colors = set()
-    for record in screed.open(filename):
-        seq = record.sequence
-        colors.update(hb.sweep_sequence_for_colors(seq, False, False))
-    assert len(colors) == 1
-    assert colors.pop() == 2L
-    #assert hb.n_colors() == 1   
+    assert 3L in colors 

From 48069138f2b96beb348ac94d2852bc9d8898eafa Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Tue, 1 Oct 2013 19:54:01 -0400
Subject: [PATCH 045/140] added new tests for n_colors and get_color_dict

---
 scripts/sweep-reads-by-partition.py |  6 +++---
 tests/test_hashbits.py              | 21 ++++++++++++++++++++-
 2 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/scripts/sweep-reads-by-partition.py b/scripts/sweep-reads-by-partition.py
index f2a59f73a4..70870bf5e9 100755
--- a/scripts/sweep-reads-by-partition.py
+++ b/scripts/sweep-reads-by-partition.py
@@ -77,13 +77,13 @@ def main():
             print >>sys.stderr,'** sweeping {read_file} for colors...'.format(read_file=read_file)
             
             for n, record in enumerate(screed.open(read_file)):
-                if n % 10000 == 0:
-                    print >>sys.stderr, '\tswept {n} reads [{nc} colored, {no} orphaned' \
+                if n % 50000 == 0:
+                    print >>sys.stderr, '\tswept {n} reads [{nc} colored, {no} orphaned]' \
                                         .format(n=n, nc=n_colored, no=n_orphaned)
                 seq = record.sequence
                 name = record.name
                 
-                colors = ht.sweep_sequence_for_colors(seq, False, False)
+                colors = ht.sweep_color_neighborhood(seq)
                 color_number_dist.append(len(colors))
                 if colors:
                     n_colored += 1
diff --git a/tests/test_hashbits.py b/tests/test_hashbits.py
index c69ccf2e4f..4d91ca9bb8 100644
--- a/tests/test_hashbits.py
+++ b/tests/test_hashbits.py
@@ -504,7 +504,26 @@ def test_simple_median():
 #
 # @cswelcher TODO: more tests! 
 #  * thread-safety
-#  * n_colors -- make sure to use test-data with multi-colored tags
+
+def test_n_colors():
+    hb = khmer.new_hashbits(20, 1e7, 4)
+    filename = utils.get_test_data('test-colors.fa')
+    hb.consume_fasta_and_tag_with_colors(filename)
+    
+    print hb.n_colors()
+    assert hb.n_colors() == 4
+
+def test_get_color_dict():
+    hb = khmer.new_hashbits(20, 1e7, 4)
+    filename = utils.get_test_data('test-colors.fa')
+    hb.consume_fasta_and_tag_with_colors(filename)
+    
+    colors = hb.get_color_dict()
+    expected = [0L, 1L, 2L, 3L]
+    for e_color in expected:
+        assert e_color in colors
+    for a_color in colors:
+        assert a_color in expected
 
 def test_sweep_tag_neighborhood():
     hb = khmer.new_hashbits(20, 1e7, 4)

From 15d7a88d6638d4325b7fc41dd469ab0c24d1eba1 Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Tue, 1 Oct 2013 20:16:01 -0400
Subject: [PATCH 046/140] fixed incorrect memory estimation on
 sweep-reads-by-partition

---
 scripts/sweep-reads-by-partition.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/sweep-reads-by-partition.py b/scripts/sweep-reads-by-partition.py
index 70870bf5e9..ed6d31dcd8 100755
--- a/scripts/sweep-reads-by-partition.py
+++ b/scripts/sweep-reads-by-partition.py
@@ -48,7 +48,7 @@ def main():
         print >>sys.stderr, ''
         print >>sys.stderr, \
             'Estimated memory usage is {prod:.2g} bytes \
-            (n_hashes x min_hashsize)'.format(prod=args.n_hashes*args.min_hashsize)
+            (n_hashes x min_hashsize / 8)'.format(prod=args.n_hashes*args.min_hashsize/8)
         print >>sys.stderr, '-' * 8
     
     K = args.ksize

From b20d00a99d3a8aef525a33253e65eb7a60d68334 Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Wed, 2 Oct 2013 17:02:40 -0400
Subject: [PATCH 047/140] squashed bug with using a traversal range of 0

---
 lib/hashtable.cc       |  5 ++++-
 lib/subset.cc          |  7 +++++--
 python/_khmermodule.cc | 11 +++++++----
 3 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/lib/hashtable.cc b/lib/hashtable.cc
index ad1adca0f9..94d1a1a9da 100644
--- a/lib/hashtable.cc
+++ b/lib/hashtable.cc
@@ -2256,7 +2256,10 @@ unsigned int Hashtable::sweep_color_neighborhood(const std::string& seq,
     num_traversed = partition->sweep_for_tags(seq, tagged_kmers, all_tags, 
                               range, break_on_stoptags, stop_big_traversals);
     traverse_colors_and_resolve(tagged_kmers, found_colors);
-
+    //printf("range=%u ", range);
+    if (range == 0) {
+      assert(num_traversed == seq.length()-ksize()+1);
+    }
     return num_traversed;
 }
 
diff --git a/lib/subset.cc b/lib/subset.cc
index 7060865c50..c20ca04a65 100644
--- a/lib/subset.cc
+++ b/lib/subset.cc
@@ -543,7 +543,7 @@ unsigned int SubsetPartition::sweep_for_tags(const std::string& seq,
   std::queue<unsigned int> breadth_q;
   unsigned int cur_breadth = 0;
   unsigned int breadth = 0;
-  const unsigned int max_breadth = (2 * _ht->_tag_density) + 1;
+  const unsigned int max_breadth = range;
 
 
   unsigned int total = 0;
@@ -568,9 +568,12 @@ unsigned int SubsetPartition::sweep_for_tags(const std::string& seq,
     breadth_q.push(0);
   }
 
+  unsigned int seq_length = node_q.size() / 2;
+  unsigned int BIG_PERIMETER_TRAVERSALS = BIG_TRAVERSALS_ARE * seq_length;
+
   while(!node_q.empty()) {
     // change this to a better hueristic
-    if (stop_big_traversals && traversed_kmers.size() > BIG_TRAVERSALS_ARE) {
+    if (stop_big_traversals && traversed_kmers.size() > BIG_PERIMETER_TRAVERSALS) {
       tagged_kmers.clear();
       break;
     }
diff --git a/python/_khmermodule.cc b/python/_khmermodule.cc
index e0461e94da..60ebc1cb12 100644
--- a/python/_khmermodule.cc
+++ b/python/_khmermodule.cc
@@ -3889,7 +3889,7 @@ static PyObject * hashbits_consume_partitioned_fasta_and_tag_with_colors(
     return NULL;
   }
 
-  return Py_BuildValue("iL", total_reads, n_consumed);
+  return Py_BuildValue("iK", total_reads, n_consumed);
 }
 
 static PyObject * hashbits_consume_sequence_and_tag_with_colors(PyObject * self, PyObject * args) {
@@ -3932,7 +3932,7 @@ static PyObject * hashbits_sweep_color_neighborhood(PyObject * self, PyObject *
   }
 
   unsigned int range = (2 * hb->_get_tag_density()) + 1;
-  if (r) {
+  if (r >= 0) {
     range = r;
   }
 
@@ -3953,14 +3953,17 @@ static PyObject * hashbits_sweep_color_neighborhood(PyObject * self, PyObject *
   ColorPtrSet found_colors;
   
   bool exc_raised = false;
+  unsigned int num_traversed = 0;
   //Py_BEGIN_ALLOW_THREADS
   try {
-    hb->sweep_color_neighborhood(seq, found_colors, range, break_on_stop_tags, stop_big_traversals);
+    num_traversed = hb->sweep_color_neighborhood(seq, found_colors, range, break_on_stop_tags, stop_big_traversals);
   } catch (_khmer_signal &e) {
     exc_raised = true;
   }
   //Py_END_ALLOW_THREADS
   
+  //printf("...%u kmers traversed\n", num_traversed);
+  
   if (exc_raised) return NULL;
   
   PyObject * x =  PyList_New(found_colors.size());
@@ -4047,7 +4050,7 @@ static PyObject * hashbits_sweep_tag_neighborhood(PyObject * self, PyObject *arg
   }
 
   unsigned int range = (2 * hashbits->_get_tag_density()) + 1;
-  if (r) {
+  if (r >= 0) {
     range = r;
   }
 

From add2eab20c3205384d7e77887aef1036cca6b00c Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Wed, 2 Oct 2013 17:45:02 -0400
Subject: [PATCH 048/140] couple debugging things added to subset

---
 lib/subset.cc | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/lib/subset.cc b/lib/subset.cc
index c20ca04a65..e58b862edd 100644
--- a/lib/subset.cc
+++ b/lib/subset.cc
@@ -544,7 +544,7 @@ unsigned int SubsetPartition::sweep_for_tags(const std::string& seq,
   unsigned int cur_breadth = 0;
   unsigned int breadth = 0;
   const unsigned int max_breadth = range;
-
+  unsigned int breadth_seen = 0;
 
   unsigned int total = 0;
 
@@ -571,6 +571,7 @@ unsigned int SubsetPartition::sweep_for_tags(const std::string& seq,
   unsigned int seq_length = node_q.size() / 2;
   unsigned int BIG_PERIMETER_TRAVERSALS = BIG_TRAVERSALS_ARE * seq_length;
 
+  unsigned int cur_it = 0;
   while(!node_q.empty()) {
     // change this to a better hueristic
     if (stop_big_traversals && traversed_kmers.size() > BIG_PERIMETER_TRAVERSALS) {
@@ -584,6 +585,12 @@ unsigned int SubsetPartition::sweep_for_tags(const std::string& seq,
     node_q.pop();
     breadth = breadth_q.front();
     breadth_q.pop();
+    cur_it++;
+    printf("current iteration: %u, current breadth: %u\n", cur_it, breadth);
+    
+    if (breadth > breadth_seen) {
+      breadth_seen = breadth;
+    }
 
     HashIntoType kmer = uniqify_rc(kmer_f, kmer_r);
 
@@ -617,6 +624,7 @@ unsigned int SubsetPartition::sweep_for_tags(const std::string& seq,
 
     queue_neighbors(kmer_f, kmer_r, breadth, traversed_kmers, node_q, breadth_q);    
   }
+  printf("breadth_seen=%u, total=%u, traverse_kmers=%u\n", breadth_seen, total, traversed_kmers.size());
   return total;
 }
 

From bf5a54482c16c7762e0fc81bc52fe5fc04b664df Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Wed, 2 Oct 2013 23:29:25 -0400
Subject: [PATCH 049/140] added new test file

---
 tests/test-data/test-colors.fa | 8 ++++++++
 1 file changed, 8 insertions(+)
 create mode 100644 tests/test-data/test-colors.fa

diff --git a/tests/test-data/test-colors.fa b/tests/test-data/test-colors.fa
new file mode 100644
index 0000000000..80741ffcbf
--- /dev/null
+++ b/tests/test-data/test-colors.fa
@@ -0,0 +1,8 @@
+>read_A
+ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG
+>read_B_overlap_A
+GCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA
+>read_C_overlap_B
+TGTTGGGGATAGATAGATAGATGAGTGTAGATCCAACAACACATACA
+>read_D
+TATATATATAGCTAGCTAGCTAACTAGCTAGCATCGATCGATCGATC

From e8d06aae3c527b77d7e86a78eaa939f4c0e2f280 Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Thu, 3 Oct 2013 02:09:24 -0400
Subject: [PATCH 050/140] final optimizations to traversal

---
 lib/subset.cc | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/lib/subset.cc b/lib/subset.cc
index e58b862edd..09ff44a6c9 100644
--- a/lib/subset.cc
+++ b/lib/subset.cc
@@ -541,10 +541,10 @@ unsigned int SubsetPartition::sweep_for_tags(const std::string& seq,
   SeenSet traversed_kmers;
   NodeQueue node_q;
   std::queue<unsigned int> breadth_q;
-  unsigned int cur_breadth = 0;
+  //unsigned int cur_breadth = 0;
   unsigned int breadth = 0;
-  const unsigned int max_breadth = range;
-  unsigned int breadth_seen = 0;
+  unsigned int max_breadth = range;
+  //unsigned int breadth_seen = 0;
 
   unsigned int total = 0;
 
@@ -571,7 +571,7 @@ unsigned int SubsetPartition::sweep_for_tags(const std::string& seq,
   unsigned int seq_length = node_q.size() / 2;
   unsigned int BIG_PERIMETER_TRAVERSALS = BIG_TRAVERSALS_ARE * seq_length;
 
-  unsigned int cur_it = 0;
+  //unsigned int cur_it = 0;
   while(!node_q.empty()) {
     // change this to a better hueristic
     if (stop_big_traversals && traversed_kmers.size() > BIG_PERIMETER_TRAVERSALS) {
@@ -585,12 +585,12 @@ unsigned int SubsetPartition::sweep_for_tags(const std::string& seq,
     node_q.pop();
     breadth = breadth_q.front();
     breadth_q.pop();
-    cur_it++;
-    printf("current iteration: %u, current breadth: %u\n", cur_it, breadth);
+    //cur_it++;
+    //printf("current iteration: %u, current breadth: %u\n", cur_it, breadth);
     
-    if (breadth > breadth_seen) {
-      breadth_seen = breadth;
-    }
+    //if (breadth > breadth_seen) {
+    //  breadth_seen = breadth;
+    //}
 
     HashIntoType kmer = uniqify_rc(kmer_f, kmer_r);
 
@@ -613,6 +613,9 @@ unsigned int SubsetPartition::sweep_for_tags(const std::string& seq,
     // 
     if (set_contains(all_tags, kmer)) {
       tagged_kmers.insert(kmer);
+      // if we find a tag, finish the remaining queued nodes,
+      // but don't queue up any more
+      max_breadth = breadth;
       continue;
     }
 
@@ -620,11 +623,15 @@ unsigned int SubsetPartition::sweep_for_tags(const std::string& seq,
     //assert(breadth >= cur_breadth); // keep track of watermark, for debugging.
     //if (breadth > cur_breadth) { cur_breadth = breadth; }
 
-    if (breadth >= max_breadth or breadth >= range) { continue; } // truncate search @CTB exit?
+    if (breadth == max_breadth) { continue; }
+    // finish up nodes on the current level, but if we go beyond, end it immediately
+    // this keeps from having to look at nodes which have already been queued once we
+    // lower the limit after finding a tag
+    else if (breadth > max_breadth) { return total; } // truncate search @CTB exit?
 
     queue_neighbors(kmer_f, kmer_r, breadth, traversed_kmers, node_q, breadth_q);    
   }
-  printf("breadth_seen=%u, total=%u, traverse_kmers=%u\n", breadth_seen, total, traversed_kmers.size());
+  //printf("breadth_seen=%u, total=%u, traverse_kmers=%u\n", breadth_seen, total, traversed_kmers.size());
   return total;
 }
 

From 76e91d7ab2501f1870a90aa3c4b3676284c630a8 Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Thu, 3 Oct 2013 10:50:45 -0400
Subject: [PATCH 051/140] added temporary testing scripts, different sweep
 scripts, etc

---
 lib/sweep_perf.py                           |  28 +++++
 scripts/sweep-reads-by-partition-to-file.py | 123 ++++++++++++++++++++
 tests/test_hashbits.py                      |   7 +-
 3 files changed, 155 insertions(+), 3 deletions(-)
 create mode 100755 lib/sweep_perf.py
 create mode 100755 scripts/sweep-reads-by-partition-to-file.py

diff --git a/lib/sweep_perf.py b/lib/sweep_perf.py
new file mode 100755
index 0000000000..923c2da8bb
--- /dev/null
+++ b/lib/sweep_perf.py
@@ -0,0 +1,28 @@
+#! /w/khmer_dev/bin/python
+
+import khmer
+import screed
+import sys
+import time
+
+R = int(sys.argv[1])
+print R
+K = 20
+test_file = '/w/khmer/tests/test-data/biglump-random-20-a.fa'
+
+ht = khmer.new_hashbits(K, 1e9, 4)
+ht.consume_fasta_and_tag_with_colors(test_file)
+
+N = 10
+for n, record in enumerate(screed.open(test_file)):
+    if n > N:
+        break
+    print '*' * 40
+    print '{} k-mers in sequence'.format(len(record.sequence)-K+1)
+    
+    stime = time.clock()
+    colors = ht.sweep_color_neighborhood(record.sequence, R)
+    etime = time.clock()
+    
+    print 'traversal took {} seconds'.format(etime-stime)
+    print 'found {} colors'.format(len(colors))
diff --git a/scripts/sweep-reads-by-partition-to-file.py b/scripts/sweep-reads-by-partition-to-file.py
new file mode 100755
index 0000000000..6cb0ed687a
--- /dev/null
+++ b/scripts/sweep-reads-by-partition-to-file.py
@@ -0,0 +1,123 @@
+#! /w/khmer_dev/bin/python
+#
+# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# the three-clause BSD license; see doc/LICENSE.txt. Contact: ctb@msu.edu
+#
+"""
+Tag and color the given partitioned fasta, then find all reads in the neighborhood
+of each partition and output to a file
+
+% python scripts/normalize-by-median.py [ -p <partitions/file> ] -i <fastp> <reads1> <reads2> ...
+
+Use '-h' for parameter help.
+"""
+
+import khmer
+import screed
+import sys
+import time
+from khmer.counting_args import build_construct_args, DEFAULT_MIN_HASHSIZE
+
+MAX_FILES=512
+READS_PER_FILE = 100000000
+
+def write_read(fp, seq, name, color):
+    fp.write('>{name}\t{color}\n{seq}\n'.format(seq=seq, name=name, color=color))
+
+def main():
+    parser = build_construct_args()
+    #parser.add_argument('-p', '--partitions_per_file', 
+    #                    dest='partitions_per_file', default=DEFAULT_PPF)
+    parser.add_argument('-i', '--input_fastp',dest='input_fastp')
+    parser.add_argument('-r', '--traversal_range', type=int, dest='traversal_range')
+    parser.add_argument('input_reads', nargs='+')
+    args = parser.parse_args()
+    
+    if not args.quiet:
+        if args.min_hashsize == DEFAULT_MIN_HASHSIZE:
+            print >>sys.stderr, \
+                "** WARNING: hashsize is default!  " \
+                "You absodefly want to increase this!\n** " \
+                "Please read the docs!"
+
+        print >>sys.stderr, '\nPARAMETERS:'
+        print >>sys.stderr, \
+            ' - kmer size =    {ksize:d} \t\t(-k)'.format(ksize=args.ksize)
+        print >>sys.stderr, \
+            ' - n hashes =     {nhash:d} \t\t(-N)'.format(nhash=args.n_hashes)
+        print >>sys.stderr, \
+            ' - min hashsize = {mh:-5.2g} \t(-x)'.format(mh=args.min_hashsize)
+        print >>sys.stderr, ''
+        print >>sys.stderr, \
+            'Estimated memory usage is {prod:.2g} bytes \
+            (n_hashes x min_hashsize / 8)'.format(prod=args.n_hashes*args.min_hashsize/8)
+        print >>sys.stderr, '-' * 8
+    
+    K = args.ksize
+    HT_SIZE = args.min_hashsize
+    N_HT = args.n_hashes
+    
+    traversal_range = args.traversal_range
+    input_reads = args.input_reads
+    input_fastp = args.input_fastp
+    
+    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)
+    print >>sys.stderr, 'consuming fastp...'
+    ht.consume_partitioned_fasta_and_tag_with_colors(input_fastp)
+ 
+    color_number_dist = []
+    
+    n_orphaned = 0
+    n_colored = 0
+    n_mcolored = 0
+    n_files = 0
+    try:
+        outfp = open('colored_reads_0.fa', 'wb')
+        start_t = time.clock()
+        for read_file in input_reads:
+            print >>sys.stderr,'** sweeping {read_file} for colors...'.format(read_file=read_file)
+            total_t = 0.0
+            for n, record in enumerate(screed.open(read_file)):
+                if n % 50000 == 0:
+                    end_t = time.clock()
+                    batch_t = end_t - start_t
+                    total_t += batch_t
+                    print >>sys.stderr, '\tswept {n} reads [{nc} colored, {no} orphaned] ** {sec}s ({sect}s total)' \
+                                        .format(n=n, nc=n_colored, no=n_orphaned, sec=batch_t, sect=total_t)
+                    start_t = time.clock()
+                seq = record.sequence
+                name = record.name
+                
+                colors = ht.sweep_color_neighborhood(seq, traversal_range)
+                color_number_dist.append(len(colors))
+                if colors:
+                    n_colored += 1
+                    if len(colors) > 1:
+                        n_mcolored += 1
+                    for color in colors:
+                        write_read(outfp, seq, name, color)
+                else:
+                    n_orphaned += 1
+
+                if n_colored % READS_PER_FILE == 0 and n_colored != 0:
+                    n_files += 1
+                    outfp = open('colored_reads_{}.fa'.format(n_files), 'wb')
+
+    except IOError as e:
+        print >>sys.stderr, 'ERROR:', e
+        print >>sys.stderr, '** exiting...'
+        
+    print >>sys.stderr, 'swept {n_reads} for colors...'.format(n_reads=n)
+    print >>sys.stderr, '...with {nc} colored and {no} orphaned'.format(
+                                    nc=n_colored, no=n_orphaned)
+    print >>sys.stderr, '...and {nmc} multicolored'.format(nmc=n_mcolored)
+    print >>sys.stderr, '...to {nf} files'.format(nf=n_files)
+    
+    print >>sys.stderr, '** outputting color number distribution...'
+    with open('color_dist.txt', 'wb') as outfp:
+        for nc in color_number_dist:
+            outfp.write('{nc}\n'.format(nc=nc))
+    
+if __name__ == '__main__':
+    main()
diff --git a/tests/test_hashbits.py b/tests/test_hashbits.py
index 4d91ca9bb8..71cd8f8394 100644
--- a/tests/test_hashbits.py
+++ b/tests/test_hashbits.py
@@ -599,9 +599,10 @@ def test_color_tag_correctness():
     hb.consume_fasta_and_tag_with_colors(filename)
     
     # read A
-    colors = hb.sweep_color_neighborhood('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG')
-    
+    colors = hb.sweep_color_neighborhood('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAGCTAGGCTAGGTGTGCTCTGCCTAGAGCTAGGCTAGGTGT')
+    print hb.sweep_tag_neighborhood('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAGCTAGGCTAGGTGTGCTCTGCCTAGAGCTAGGCTAGGTGT')
     print colors
+    print len('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG')-19 
     assert len(colors) == 2
     assert 0L in colors
     assert 1L in colors
@@ -625,4 +626,4 @@ def test_color_tag_correctness():
     colors = hb.sweep_color_neighborhood('TATATATATAGCTAGCTAGCTAACTAGCTAGCATCGATCGATCGATC')
     print colors
     assert len(colors) == 1
-    assert 3L in colors 
+    assert 3L in colors

From 23c197ef33837b3fdb517d883e389d371bdbcdfe Mon Sep 17 00:00:00 2001
From: CS Welcher <cs.welcher@gmail.com>
Date: Thu, 3 Oct 2013 11:02:03 -0400
Subject: [PATCH 052/140] added fix for tag color correctness test not working
 on HPCC, still no idea why it failed

---
 tests/test-data/test-colors.fa | 6 +++---
 tests/test_hashbits.py         | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/test-data/test-colors.fa b/tests/test-data/test-colors.fa
index 80741ffcbf..b93d7c3c64 100644
--- a/tests/test-data/test-colors.fa
+++ b/tests/test-data/test-colors.fa
@@ -1,8 +1,8 @@
 >read_A
-ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG
+ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG
 >read_B_overlap_A
-GCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA
+GCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA
 >read_C_overlap_B
-TGTTGGGGATAGATAGATAGATGAGTGTAGATCCAACAACACATACA
+TGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGAGTTGGGGATAGATAGATAGATGAGTGTAGATCCAACAACACATACA
 >read_D
 TATATATATAGCTAGCTAGCTAACTAGCTAGCATCGATCGATCGATC
diff --git a/tests/test_hashbits.py b/tests/test_hashbits.py
index 71cd8f8394..c72245a63f 100644
--- a/tests/test_hashbits.py
+++ b/tests/test_hashbits.py
@@ -600,7 +600,7 @@ def test_color_tag_correctness():
     
     # read A
     colors = hb.sweep_color_neighborhood('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAGCTAGGCTAGGTGTGCTCTGCCTAGAGCTAGGCTAGGTGT')
-    print hb.sweep_tag_neighborhood('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAGCTAGGCTAGGTGTGCTCTGCCTAGAGCTAGGCTAGGTGT')
+    print hb.sweep_tag_neighborhood('TTCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAGCTAGGCTAGGTGTGCTCTGCCTAGAGCTAGGCTAGGTGT')
     print colors
     print len('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG')-19 
     assert len(colors) == 2
@@ -608,7 +608,7 @@ def test_color_tag_correctness():
     assert 1L in colors
     
     # read B
-    colors = hb.sweep_color_neighborhood('GCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA')
+    colors = hb.sweep_color_neighborhood('GCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA')
     print colors
     assert len(colors) == 3
     assert 0L in colors
@@ -616,7 +616,7 @@ def test_color_tag_correctness():
     assert 2L in colors
     
     # read C
-    colors = hb.sweep_color_neighborhood('TGTTGGGGATAGATAGATAGATGAGTGTAGATCCAACAACACATACA')
+    colors = hb.sweep_color_neighborhood('TGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGAGTTGGGGATAGATAGATAGATGAGTGTAGATCCAACAACACATACA')
     print colors
     assert len(colors) == 2
     assert 1L in colors

From b2f6164e75f43e0804d448ff5879f471efde4da8 Mon Sep 17 00:00:00 2001
From: CS Welcher <cs.welcher@gmail.com>
Date: Thu, 3 Oct 2013 16:13:47 -0400
Subject: [PATCH 053/140] c++ land color test

---
 lib/Makefile       | 15 ++++++--
 lib/test-Colors.cc | 88 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 100 insertions(+), 3 deletions(-)
 create mode 100644 lib/test-Colors.cc

diff --git a/lib/Makefile b/lib/Makefile
index ebd2e25869..8681274695 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -14,8 +14,9 @@ BZIP2_OBJS_BASE= \
 	decompress.o bzlib.o
 BZIP2_OBJS=$(addprefix $(BZIP2_DIR)/, $(BZIP2_OBJS_BASE))
 
-DRV_PROGS=bittest ktable_test # test-StreamReader test-CacheManager test-Parser test-HashTables
+#DRV_PROGS=bittest ktable_test test-Colors # test-StreamReader test-CacheManager test-Parser test-HashTables
 DRV_PROGS+=#graphtest #consume_prof
+DRV_PROGS=test-Colors
 AUX_PROGS=#ht-diff
 
 CORE_OBJS= error.o khmer_config.o thread_id_map.o trace_logger.o perf_metrics.o ktable.o
@@ -37,7 +38,9 @@ DRV_TEST_HASHTABLES_OBJS= \
 	test-HashTables.o counting.o hashbits.o hashtable.o subset.o \
 	$(PARSERS_OBJS) $(CORE_OBJS) $(ZLIB_OBJS) $(BZIP2_OBJS)
 HT_DIFF_OBJS=ht-diff.o counting.o hashtable.o $(PARSERS_OBJS) $(CORE_OBJS) $(ZLIB_OBJS) $(BZIP2_OBJS)
-
+DRV_TEST_COLORS_OBJS= \
+	test-Colors.o counting.o hashbits.o hashtable.o subset.o \
+	$(PARSERS_OBJS) $(CORE_OBJS) $(ZLIB_OBJS) $(BZIP2_OBJS)
 test-StreamReader: $(DRV_TEST_STREAM_READER_OBJS)
 	$(CXX) -o $@ $(DRV_TEST_STREAM_READER_OBJS) $(LIBS)
 
@@ -49,6 +52,9 @@ test-Parser: $(DRV_TEST_PARSER_OBJS)
 
 test-HashTables: $(DRV_TEST_HASHTABLES_OBJS)
 	$(CXX) -o $@ $(DRV_TEST_HASHTABLES_OBJS) $(LIBS) -fopenmp
+test-Colors: $(DRV_TEST_COLORS_OBJS)
+	$(CXX) -o $@ $(DRV_TEST_COLORS_OBJS) $(LIBS) -fopenmp
+
 
 ht-diff: $(HT_DIFF_OBJS)
 	$(CXX) -o $@ $(HT_DIFF_OBJS) $(LIBS)
@@ -56,9 +62,12 @@ ht-diff: $(HT_DIFF_OBJS)
 bittest: bittest.o ktable.o
 	$(CXX) -o $@ bittest.o ktable.o
 
-ktable_test: ktable_test.o hashtable.o subset.o $(PARSERS_OBJS) $(CORE_OBJS) $(ZLIB_OBJS) $(BZIP2_OBJS)
+#ktable_test: ktable_test.o hashtable.o subset.o $(PARSERS_OBJS) $(CORE_OBJS) $(ZLIB_OBJS) $(BZIP2_OBJS)
+#	$(CXX) -o $@ ktable_test.o hashtable.o subset.o $(PARSERS_OBJS) $(CORE_OBJS) $(ZLIB_OBJS) $(BZIP2_OBJS) $(LIBS)
+color_test: test-Colors.o hashtable.o subset.o $(PARSERS_OBJS) $(CORE_OBJS) $(ZLIB_OBJS) $(BZIP2_OBJS)
 	$(CXX) -o $@ ktable_test.o hashtable.o subset.o $(PARSERS_OBJS) $(CORE_OBJS) $(ZLIB_OBJS) $(BZIP2_OBJS) $(LIBS)
 
+
 # NOTE: Disabled due to broken constructor call.
 #graphtest: graphtest.o ktable.o hashtable.o
 #	$(CXX) -o $@ graphtest.o ktable.o hashtable.o
diff --git a/lib/test-Colors.cc b/lib/test-Colors.cc
new file mode 100644
index 0000000000..aaa1ebb103
--- /dev/null
+++ b/lib/test-Colors.cc
@@ -0,0 +1,88 @@
+//
+// This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+// Copyright (C) Michigan State University, 2009-2013. It is licensed under
+// the three-clause BSD license; see doc/LICENSE.txt. Contact: ctb@msu.edu
+//
+
+// Simple C++ timing driver for color tagging and sweep_color_neighborhood.
+
+
+#include <cstring>
+#include <cstdio>
+#include <cerrno>
+#include <cstdlib>
+#include <unistd.h>
+#include <getopt.h>
+#include <time.h>
+#include <omp.h>
+
+//#define HASH_TYPE_TO_TEST   1 // Counting Hash
+#define HASH_TYPE_TO_TEST   2 // Bit Hash
+
+// #define OUTPUT_HASHTABLE
+
+
+#include "error.hh"
+#include "read_parsers.hh"
+#if HASH_TYPE_TO_TEST == 1
+#  include "counting.hh"
+#elif HASH_TYPE_TO_TEST == 2
+#  include "hashbits.hh"
+#else
+#  error "No HASH_TYPE_TO_TEST macro defined."
+#endif
+#include "primes.hh"
+
+using namespace std;
+using namespace khmer;
+using namespace khmer:: read_parsers;
+
+
+
+
+int main( int argc, char * argv[ ] )
+{
+    unsigned long	kmer_length	    = 20;
+    float		ht_size_FP	    = 1.0E8;
+    unsigned long	ht_count	    = 4;
+    uint64_t		cache_size	    = 4L * 1024 * 1024 * 1024;
+    unsigned int	range		    = 40;
+    int			rc		    = 0;
+    int			opt		    = -1;
+    char *		conv_residue	    = NULL;
+    string		rfile_name = "/mnt/scratch/tg/w/khmer/tests/test-data/test-reads.fa";
+    string		ifile_name = "/mnt/scratch/tg/w/khmer/tests/test-data/test-reads.fa";
+    // FILE *		ofile		    = NULL;
+    HashIntoType	    ht_size		= (HashIntoType)ht_size_FP;
+    Primes primetab( ht_size );
+    vector<HashIntoType> ht_sizes;
+    for ( unsigned int i = 0; i < ht_count; ++i )
+	ht_sizes.push_back( primetab.get_next_prime( ) );
+
+    unsigned int	    reads_total		= 0;
+    unsigned long long int  n_consumed		= 0;
+
+    Hashbits ht( kmer_length, ht_sizes );
+    ht.consume_fasta_and_tag_with_colors( ifile_name, reads_total, n_consumed );
+    IParser * parser = IParser:: get_parser(rfile_name.c_str());
+    Read read;
+    unsigned int num_traversed;
+    string seq = "";
+    clock_t st;
+    while(!parser->is_complete()) {
+	read = parser->get_next_read();
+	seq = read.sequence;
+	st = clock();
+	ColorPtrSet found_colors;
+	num_traversed = ht.sweep_color_neighborhood(seq, found_colors, range, false, false);
+	st = clock() - st;
+	printf("traversed %u reads in %d ticks (%f seconds)\n", num_traversed,
+								st,
+								((float)st/CLOCKS_PER_SEC));
+	
+    }
+    return rc;
+}
+
+
+// vim: set sts=4 sw=4 tw=80:

From fd9728a4951290bfe5cc43c27ddfe484d8073c99 Mon Sep 17 00:00:00 2001
From: CS Welcher <cs.welcher@gmail.com>
Date: Thu, 3 Oct 2013 17:58:12 -0400
Subject: [PATCH 054/140] changes to test-Colors

---
 lib/hashtable.hh   |  4 ++--
 lib/test-Colors.cc | 25 +++++++++++++++----------
 2 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/lib/hashtable.hh b/lib/hashtable.hh
index 23c8c7b0ec..653ad7ba95 100644
--- a/lib/hashtable.hh
+++ b/lib/hashtable.hh
@@ -525,8 +525,8 @@ namespace khmer {
     void consume_partitioned_fasta_and_tag_with_colors(const std::string &filename,
 					  unsigned int &total_reads,
 					  unsigned long long &n_consumed,
-					  CallbackFn callback,
-					  void * callback_data);
+					  CallbackFn callback = NULL,
+					  void * callback_datac = NULL);
 					  			  
     void consume_sequence_and_tag_with_colors(const std::string& seq,
 					unsigned long long& n_consumed,
diff --git a/lib/test-Colors.cc b/lib/test-Colors.cc
index aaa1ebb103..2dafc79960 100644
--- a/lib/test-Colors.cc
+++ b/lib/test-Colors.cc
@@ -46,12 +46,12 @@ int main( int argc, char * argv[ ] )
     float		ht_size_FP	    = 1.0E8;
     unsigned long	ht_count	    = 4;
     uint64_t		cache_size	    = 4L * 1024 * 1024 * 1024;
-    unsigned int	range		    = 40;
+    unsigned int	range		    = 1;
     int			rc		    = 0;
     int			opt		    = -1;
     char *		conv_residue	    = NULL;
-    string		rfile_name = "/mnt/scratch/tg/w/khmer/tests/test-data/test-reads.fa";
-    string		ifile_name = "/mnt/scratch/tg/w/khmer/tests/test-data/test-reads.fa";
+    string		rfile_name = "/mnt/scratch/tg/w/2013-lamprey/data/AK.fq.gz";
+    string		ifile_name = "/mnt/scratch/tg/w/petMar_test.fp";
     // FILE *		ofile		    = NULL;
     HashIntoType	    ht_size		= (HashIntoType)ht_size_FP;
     Primes primetab( ht_size );
@@ -61,25 +61,30 @@ int main( int argc, char * argv[ ] )
 
     unsigned int	    reads_total		= 0;
     unsigned long long int  n_consumed		= 0;
-
+    printf("consuming test fastp...\n");
     Hashbits ht( kmer_length, ht_sizes );
-    ht.consume_fasta_and_tag_with_colors( ifile_name, reads_total, n_consumed );
+    ht.consume_partitioned_fasta_and_tag_with_colors( ifile_name, reads_total, n_consumed );
+    printf("consume %u sequences, graph has %u colors\n", reads_total, ht.n_colors());
     IParser * parser = IParser:: get_parser(rfile_name.c_str());
     Read read;
     unsigned int num_traversed;
+    unsigned int num_reads = 0;
     string seq = "";
-    clock_t st;
+    clock_t st = clock();
     while(!parser->is_complete()) {
 	read = parser->get_next_read();
 	seq = read.sequence;
-	st = clock();
 	ColorPtrSet found_colors;
 	num_traversed = ht.sweep_color_neighborhood(seq, found_colors, range, false, false);
-	st = clock() - st;
-	printf("traversed %u reads in %d ticks (%f seconds)\n", num_traversed,
+	if (num_reads % 50000 == 0) {
+	    st = clock() - st;
+	    printf("traversed %u kmers in %d ticks (%f seconds)\n", num_traversed,
 								st,
 								((float)st/CLOCKS_PER_SEC));
-	
+	st = clock();
+	}
+	found_colors.clear();
+	num_reads++;
     }
     return rc;
 }

From 4c419ed87b43f693ac1145897550e22cb0b0a306 Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Mon, 7 Oct 2013 11:21:23 -0400
Subject: [PATCH 055/140] resync with hpcc

---
 lib/hashtable.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/hashtable.cc b/lib/hashtable.cc
index 94d1a1a9da..5a9eb9624d 100644
--- a/lib/hashtable.cc
+++ b/lib/hashtable.cc
@@ -2260,6 +2260,7 @@ unsigned int Hashtable::sweep_color_neighborhood(const std::string& seq,
     if (range == 0) {
       assert(num_traversed == seq.length()-ksize()+1);
     }
+    tagged_kmers.clear();
     return num_traversed;
 }
 

From e8466b15f5cccf82601dae0dac252099c3bdbf7a Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Mon, 7 Oct 2013 11:22:01 -0400
Subject: [PATCH 056/140] changes to test-colors

---
 tests/test-data/test-colors.fa | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test-data/test-colors.fa b/tests/test-data/test-colors.fa
index 80741ffcbf..bc725498e7 100644
--- a/tests/test-data/test-colors.fa
+++ b/tests/test-data/test-colors.fa
@@ -1,5 +1,5 @@
 >read_A
-ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG
+ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAGCTAGGCTAGGTGTGCTCTGCCTAGAGCTAGGCTAGGTGT
 >read_B_overlap_A
 GCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA
 >read_C_overlap_B

From 639099f3225b3eda87123cf1dbed549c9f94cd85 Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Thu, 10 Oct 2013 19:23:13 -0400
Subject: [PATCH 057/140] testing out buffer based splitter

---
 scripts/split-reads-by-color.py | 131 ++++++++++++++++++++++++++++++++
 1 file changed, 131 insertions(+)
 create mode 100644 scripts/split-reads-by-color.py

diff --git a/scripts/split-reads-by-color.py b/scripts/split-reads-by-color.py
new file mode 100644
index 0000000000..6d90a2ec45
--- /dev/null
+++ b/scripts/split-reads-by-color.py
@@ -0,0 +1,131 @@
+# In-progress read-buffering approach to writing out colors to many files
+# Basic idea is to buffer some number of reads in memory, then dump them all at once
+# Hope that each file accrues, on average, BUFFER_SIZE / NUM_PARTS reads
+# ie, if we buffer 1000000 reads, and we have 100000 partitions/colors,
+# we should expect the mean buffer size to be 10 reads
+
+import screed
+import sys
+import argparse
+import time
+
+def fastp_iter(filename):
+    for record in screed.open(filename, parse_description=False):
+        name = record.name
+        try:
+            name, partition_id = name.rsplit('\t', 1)
+        except ValueError:
+            print >>sys.stderr, '%%% ERROR: Derp! Is this file partitioned? %%%'
+            sys.exit(1)
+        # convert name to blast format if necessary
+        nname = name.split('|', 2)
+        if len(nname) >= 2:
+            name = nname[2]
+        name = name.split(' ')[0]
+        yield name, int(partition_id), record.sequence
+
+class Seq:
+
+    def __init__(self, name, color, seq):
+        self.name = name
+        self.color = color
+        self.seq = seq
+
+    def write(self, fp):
+        fp.write('>{}\t{}\n{}\n'.format(self.name, self.color, self.seq))
+
+class ReadBuffer:
+
+    def __init__(self, max_files=512, max_reads=1000000, est_files=100000, output_pref='reads_'):
+        self.buffers = {}
+        self.buffer_counts = {}
+        self.max_files = max_files
+        self.max_reads = max_reads
+
+        self.est_files = est_files
+        self.output_pref = output_pref
+        self.buffer_flush = self.max_reads / self.est_files
+
+        self.cur_reads = 0
+        self.cur_files = 0
+
+    def add_seq(self, seq):
+        color = seq.color
+        if color in self.buffers:
+            count = self.buffer_counts[color]
+            self.buffers[color].append(seq)
+            self.buffer_counts[color] += 1
+            if count > self.buffer_flush:
+                self.flush_buffer(color)
+
+        else:
+            self.buffers[color] = [seq]
+            self.buffer_counts[color] = 1
+        self.cur_reads += 1
+        if self.cur_reads > self.max_reads:
+            self.flush_all()
+    
+    def flush_buffer(self, color):
+        with open('{}{}.fa'.format(self.output_pref, color), 'a') as outfp:
+            for read in self.buffers[color]:
+                read.write(outfp)
+                self.cur_reads -= 1
+            del self.buffer_counts[color]
+            del self.buffers[color]
+
+    def flush_all(self):
+        print >>sys.stderr, '** reached max buffer size, flushing all to files...'
+        for color in self.buffers:
+            self.flush_buffer(color)
+        assert self.cur_reads == 0
+
+def main():
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-b', '--buffer_size', dest='buffer_size', type=int)
+    parser.add_argument('-e', '--files_estimate', dest='files_estimate', type=int)
+    parser.add_argument('-o', '--output_prefix', dest='output_prefix')
+    parser.add_argument('input_files', nargs='+')
+    args = parser.parse_args()
+
+    output_pref = args.output_prefix
+    buf_size = args.buffer_size
+    est = args.files_estimate
+    input_files = args.input_files
+
+    output_buffer = ReadBuffer(max_reads=buf_size, est_files=est, output_pref=output_pref)
+
+    multi_fp = open('{}_multi.fa'.format(output_pref), 'a')
+    
+    n_reads = 0
+    total_t = 0.0
+    start_t = time.clock()
+    for input_file in args.input_files:
+        print >>sys.stderr, '* splitting reads in {}...'.format(input_file)
+
+        current_read = ''
+        seen_twice = False
+
+        for name, color, seq in fastp_iter(input_file):
+            n_reads += 1
+            seq_obj = Seq(name, color, seq)
+
+            if n_reads % 100000 == 0:
+                end_t = time.clock()
+                batch_t = end_t - start_t
+                total_t += batch_t
+                print >>sys.stderr, '** processed {} reads from {} [{}s, {}s total]'.format(n_reads, input_file, batch_t, total_t)
+                start_t = time.clock()
+ 
+            if name == current_read:
+                if not seen_twice:
+                    seq_obj.write(multi_fp)
+                seen_twice = True
+            
+            else:
+                seen_twice = False
+                output_buffer.add_seq(Seq(name,color,seq))
+            current_read = name
+
+if __name__ == '__main__':
+    main()

From aca8b2c03d99c80d46bf52c57fa7f0c85510a45c Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Thu, 10 Oct 2013 19:55:49 -0400
Subject: [PATCH 058/140] some changes to buffered splitting

---
 scripts/split-reads-by-color.py | 33 ++++++++++++++++++++++++++++-----
 1 file changed, 28 insertions(+), 5 deletions(-)

diff --git a/scripts/split-reads-by-color.py b/scripts/split-reads-by-color.py
index 6d90a2ec45..3f9d38b989 100644
--- a/scripts/split-reads-by-color.py
+++ b/scripts/split-reads-by-color.py
@@ -36,10 +36,10 @@ def write(self, fp):
 
 class ReadBuffer:
 
-    def __init__(self, max_files=512, max_reads=1000000, est_files=100000, output_pref='reads_'):
+    def __init__(self, max_buffers=10000, max_reads=1000000, est_files=100000, output_pref='reads_'):
         self.buffers = {}
         self.buffer_counts = {}
-        self.max_files = max_files
+        self.max_buffers = max_buffers
         self.max_reads = max_reads
 
         self.est_files = est_files
@@ -57,6 +57,7 @@ def add_seq(self, seq):
             self.buffer_counts[color] += 1
             if count > self.buffer_flush:
                 self.flush_buffer(color)
+                self.del_buffer(color)
 
         else:
             self.buffers[color] = [seq]
@@ -64,36 +65,58 @@ def add_seq(self, seq):
         self.cur_reads += 1
         if self.cur_reads > self.max_reads:
             self.flush_all()
+        if len(self.buffers) > self.max_buffers:
+            #self.clean_buffers(2)
+            self.flush_all()
     
     def flush_buffer(self, color):
         with open('{}{}.fa'.format(self.output_pref, color), 'a') as outfp:
             for read in self.buffers[color]:
                 read.write(outfp)
                 self.cur_reads -= 1
-            del self.buffer_counts[color]
-            del self.buffers[color]
+            
+    def del_buffer(self, color):
+        del self.buffer_counts[color]
+        del self.buffers[color]
 
     def flush_all(self):
         print >>sys.stderr, '** reached max buffer size, flushing all to files...'
         for color in self.buffers:
             self.flush_buffer(color)
+        colors = self.buffers.keys()
+        for color in colors:
+            self.del_buffer(color)
+        del colors
         assert self.cur_reads == 0
 
+    def clean_buffers(self, cutoff):
+        print >>sys.stderr, '** flushing low-abundance buffers...'
+        flushed = []
+        for color in self.buffers:
+            if self.buffer_counts[color] < cutoff:
+                self.flush_buffer(color)
+                flushed.append(color)
+        for color in flushed:
+            self.del_buffer(color)
+        del flushed
+
 def main():
 
     parser = argparse.ArgumentParser()
     parser.add_argument('-b', '--buffer_size', dest='buffer_size', type=int)
     parser.add_argument('-e', '--files_estimate', dest='files_estimate', type=int)
     parser.add_argument('-o', '--output_prefix', dest='output_prefix')
+    parser.add_argument('-m', '--max_buffers', dest='max_buffers', type=int)
     parser.add_argument('input_files', nargs='+')
     args = parser.parse_args()
 
+    max_buffers = args.max_buffers
     output_pref = args.output_prefix
     buf_size = args.buffer_size
     est = args.files_estimate
     input_files = args.input_files
 
-    output_buffer = ReadBuffer(max_reads=buf_size, est_files=est, output_pref=output_pref)
+    output_buffer = ReadBuffer(max_buffers=max_buffers, max_reads=buf_size, est_files=est, output_pref=output_pref)
 
     multi_fp = open('{}_multi.fa'.format(output_pref), 'a')
     

From fa8d5bd790a861f353891b1456c529a0c68ac94f Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Tue, 22 Oct 2013 10:32:03 -0400
Subject: [PATCH 059/140] added debugging option to make

---
 Makefile | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Makefile b/Makefile
index c3d95ce015..264c6a7f33 100644
--- a/Makefile
+++ b/Makefile
@@ -9,6 +9,10 @@ clean:
 	cd lib && make clean
 	cd tests && rm -rf khmertest_*
 
+debug:
+	export CFLAGS="-pg -fprofile-arcs"; python setup.py build_ext --debug
+	python setup.py install
+
 doc: FORCE
 	python setup.py build_sphinx --fresh-env
 	@echo ''

From 4e30cecb29fdb208850e7bddae61e6b3bb35bfa6 Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Tue, 22 Oct 2013 10:32:19 -0400
Subject: [PATCH 060/140] fixed color test build params

---
 lib/Makefile | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/lib/Makefile b/lib/Makefile
index 4774174742..ef3ed73e20 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -1,6 +1,6 @@
 # Profile?
 # Set this variable to true if you wish to profile the codes.
-WANT_PROFILING=false
+WANT_PROFILING=true
 
 # Which profiling tool to use?
 # Assuming you have TAU installed and setup properly, 
@@ -207,8 +207,8 @@ bittest: bittest.o ktable.o
 
 #ktable_test: ktable_test.o hashtable.o subset.o $(PARSERS_OBJS) $(CORE_OBJS) $(ZLIB_OBJS) $(BZIP2_OBJS)
 #	$(CXX) -o $@ ktable_test.o hashtable.o subset.o $(PARSERS_OBJS) $(CORE_OBJS) $(ZLIB_OBJS) $(BZIP2_OBJS) $(LIBS)
-color_test: test-Colors.o hashtable.o subset.o $(PARSERS_OBJS) $(CORE_OBJS) $(ZLIB_OBJS) $(BZIP2_OBJS)
-	$(CXX) -o $@ ktable_test.o hashtable.o subset.o $(PARSERS_OBJS) $(CORE_OBJS) $(ZLIB_OBJS) $(BZIP2_OBJS) $(LIBS)
+#color_test: test-Colors.o hashtable.o subset.o $(PARSERS_OBJS) $(CORE_OBJS) $(ZLIB_OBJS) $(BZIP2_OBJS)
+#	$(CXX) -o $@ hashtable.o subset.o $(PARSERS_OBJS) $(CORE_OBJS) $(ZLIB_OBJS) $(BZIP2_OBJS) $(LIBS)
 
 # NOTE: Disabled due to broken constructor call.
 #graphtest: graphtest.o ktable.o hashtable.o
@@ -262,6 +262,9 @@ subset.o: subset.cc subset.hh hashbits.hh ktable.hh khmer.hh
 
 counting.o: counting.cc counting.hh hashtable.hh ktable.hh khmer.hh
 
+test-Colors.o: test-Colors.cc
+	$(CXX) $(CXXFLAGS) -c -o $@ test-Colors.cc -fopenmp
+
 test-StreamReader.o: test-StreamReader.cc read_parsers.hh
 
 test-CacheManager.o: test-CacheManager.cc read_parsers.hh

From cd866748d62b6761115888f9f45c028e6ac38e63 Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Tue, 22 Oct 2013 10:32:44 -0400
Subject: [PATCH 061/140] couple changes to color-Test

---
 lib/test-Colors.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/lib/test-Colors.cc b/lib/test-Colors.cc
index 2dafc79960..6da9e7e500 100644
--- a/lib/test-Colors.cc
+++ b/lib/test-Colors.cc
@@ -46,12 +46,12 @@ int main( int argc, char * argv[ ] )
     float		ht_size_FP	    = 1.0E8;
     unsigned long	ht_count	    = 4;
     uint64_t		cache_size	    = 4L * 1024 * 1024 * 1024;
-    unsigned int	range		    = 1;
+    unsigned int	range		    = 82;
     int			rc		    = 0;
     int			opt		    = -1;
     char *		conv_residue	    = NULL;
-    string		rfile_name = "/mnt/scratch/tg/w/2013-lamprey/data/AK.fq.gz";
-    string		ifile_name = "/mnt/scratch/tg/w/petMar_test.fp";
+    string		rfile_name = "/w/tag_coloring/test_reads.fq";
+    string		ifile_name = "/w/tag_coloring/petMar_test.fp";
     // FILE *		ofile		    = NULL;
     HashIntoType	    ht_size		= (HashIntoType)ht_size_FP;
     Primes primetab( ht_size );

From e54b216f13a14a4e88fae3107a8a569eb0ea65c2 Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Tue, 22 Oct 2013 10:33:21 -0400
Subject: [PATCH 062/140] changes to original sweep reads, now deprecated...

---
 scripts/sweep-reads-by-partition.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/scripts/sweep-reads-by-partition.py b/scripts/sweep-reads-by-partition.py
index ed6d31dcd8..b0cd79961b 100755
--- a/scripts/sweep-reads-by-partition.py
+++ b/scripts/sweep-reads-by-partition.py
@@ -19,6 +19,7 @@
 from khmer.counting_args import build_construct_args, DEFAULT_MIN_HASHSIZE
 
 DEFAULT_PPF = 1
+MAX_FILES=512
 
 def write_read(fp, seq, name, color):
     fp.write('>{name}\t{color}\n{seq}\n'.format(seq=seq, name=name, color=color))
@@ -28,6 +29,7 @@ def main():
     parser.add_argument('-p', '--partitions_per_file', 
                         dest='partitions_per_file', default=DEFAULT_PPF)
     parser.add_argument('-i', '--input_fastp', dest='input_fastp')
+    parser.add_argument('-r', '--traversal_range', dest='traversal_range')
     parser.add_argument('input_reads', nargs='+')
     args = parser.parse_args()
     
@@ -55,6 +57,7 @@ def main():
     HT_SIZE = args.min_hashsize
     N_HT = args.n_hashes
     
+    traversal_range = args.traversal_range
     input_reads = args.input_reads
     input_fastp = args.input_fastp
     ppf = args.partitions_per_file

From d13cb52425d928ac165d6cfc5df1402143e2f8a9 Mon Sep 17 00:00:00 2001
From: CS <cs.welcher@gmail.com>
Date: Fri, 1 Nov 2013 04:35:03 -0400
Subject: [PATCH 063/140] added combined sweep and file output script

---
 scripts/sweep-reads-by-partition-buffered.py | 199 +++++++++++++++++++
 1 file changed, 199 insertions(+)
 create mode 100755 scripts/sweep-reads-by-partition-buffered.py

diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py
new file mode 100755
index 0000000000..9af1c4bba9
--- /dev/null
+++ b/scripts/sweep-reads-by-partition-buffered.py
@@ -0,0 +1,199 @@
+#! /w/khmer_dev/bin/python
+
+import screed
+import sys
+import argparse
+import time
+import khmer
+from khmer.counting_args import build_construct_args, DEFAULT_MIN_HASHSIZE
+
+# little class to store sequence information for the buffering class
+class Seq:
+    def __init__(self, name, color, seq):
+        self.name = name
+        self.color = color
+        self.seq = seq
+    def write(self, fp):
+        fp.write('>{}\t{}\n{}\n'.format(self.name, self.color, self.seq))
+
+# stores reads in memory and flushes them to their appropriate files
+# when certain criteria are met
+# Basic idea is to buffer some number of reads in memory, then dump them all at once
+# Hope that each file accrues, on average, BUFFER_SIZE / NUM_PARTS reads
+# ie, if we buffer 1000000 reads, and we have 100000 partitions or colors,
+# we should expect the mean buffer size to be 10 reads
+class ReadBuffer:
+
+    def __init__(self, max_buffers=10000, max_reads=1000000, est_files=100000, output_pref='reads_'):
+        self.buffers = {}
+        self.buffer_counts = {}
+        self.max_buffers = max_buffers
+        self.max_reads = max_reads
+
+        self.est_files = est_files
+        self.output_pref = output_pref
+        self.buffer_flush = self.max_reads / self.est_files
+
+        self.cur_reads = 0
+        self.cur_files = 0
+
+    def add_seq(self, seq):
+        color = seq.color
+        if color in self.buffers:
+            count = self.buffer_counts[color]
+            self.buffers[color].append(seq)
+            self.buffer_counts[color] += 1
+            if count > self.buffer_flush:
+                self.flush_buffer(color)
+                self.del_buffer(color)
+
+        else:
+            self.buffers[color] = [seq]
+            self.buffer_counts[color] = 1
+        self.cur_reads += 1
+        if self.cur_reads > self.max_reads:
+            self.flush_all()
+        if len(self.buffers) > self.max_buffers:
+            #self.clean_buffers(2)
+            self.flush_all()
+    
+    def flush_buffer(self, color):
+        with open('{}_{}.fa'.format(self.output_pref, color), 'a') as outfp:
+            for read in self.buffers[color]:
+                read.write(outfp)
+                self.cur_reads -= 1
+            
+    def del_buffer(self, color):
+        del self.buffer_counts[color]
+        del self.buffers[color]
+
+    def flush_all(self):
+        print >>sys.stderr, '** reached max buffer size, flushing all to files...'
+        for color in self.buffers:
+            self.flush_buffer(color)
+        colors = self.buffers.keys()
+        for color in colors:
+            self.del_buffer(color)
+        del colors
+        assert self.cur_reads == 0
+
+    def clean_buffers(self, cutoff):
+        print >>sys.stderr, '** flushing low-abundance buffers...'
+        flushed = []
+        for color in self.buffers:
+            if self.buffer_counts[color] < cutoff:
+                self.flush_buffer(color)
+                flushed.append(color)
+        for color in flushed:
+            self.del_buffer(color)
+        del flushed
+
+def main():
+
+    parser = build_construct_args()
+    parser.add_argument('-i', '--input_fastp',dest='input_fastp')
+    parser.add_argument('-r', '--traversal_range', type=int, dest='traversal_range')
+    parser.add_argument('-b', '--buffer_size', dest='buffer_size', type=int)
+    parser.add_argument('-e', '--files_estimate', dest='files_estimate', type=int)
+    parser.add_argument('-o', '--output_prefix', dest='output_prefix')
+    parser.add_argument('-m', '--max_buffers', dest='max_buffers', type=int)
+    parser.add_argument('input_files', nargs='+')
+    args = parser.parse_args()
+    
+    if not args.quiet:
+        if args.min_hashsize == DEFAULT_MIN_HASHSIZE:
+            print >>sys.stderr, \
+                "** WARNING: hashsize is default!  " \
+                "You absodefly want to increase this!\n** " \
+                "Please read the docs!"
+
+        print >>sys.stderr, '\nPARAMETERS:'
+        print >>sys.stderr, \
+            ' - kmer size =    {ksize:d} \t\t(-k)'.format(ksize=args.ksize)
+        print >>sys.stderr, \
+            ' - n hashes =     {nhash:d} \t\t(-N)'.format(nhash=args.n_hashes)
+        print >>sys.stderr, \
+            ' - min hashsize = {mh:-5.2g} \t(-x)'.format(mh=args.min_hashsize)
+        print >>sys.stderr, ''
+        print >>sys.stderr, \
+            'Estimated memory usage is {prod:.2g} bytes \
+            (n_hashes x min_hashsize / 8)'.format(prod=args.n_hashes*args.min_hashsize/8)
+        print >>sys.stderr, '-' * 8
+    
+    K = args.ksize
+    HT_SIZE = args.min_hashsize
+    N_HT = args.n_hashes
+    
+    traversal_range = args.traversal_range
+    input_fastp = args.input_fastp
+
+    max_buffers = args.max_buffers
+    output_pref = args.output_prefix
+    buf_size = args.buffer_size
+    est = args.files_estimate
+    input_files = args.input_files
+
+    output_buffer = ReadBuffer(max_buffers=max_buffers, max_reads=buf_size, est_files=est, output_pref=output_pref)
+
+	# file for multicolored reads, just keep this one around the whole time
+    multi_fp = open('{}_multi.fp'.format(output_pref), 'a')
+    orphaned_fp = open('{}_orphaned.fa'.format(output_pref), 'a')
+
+	# consume the partitioned fasta with which to color the graph
+    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)
+    print >>sys.stderr, 'consuming fastp...'
+    ht.consume_partitioned_fasta_and_tag_with_colors(input_fastp)
+
+    color_number_dist = []
+    
+    n_orphaned = 0
+    n_colored = 0
+    n_mcolored = 0
+    try:
+        total_t = time.clock()
+        start_t = time.clock()
+        for read_file in input_files:
+            print >>sys.stderr,'** sweeping {read_file} for colors...'.format(read_file=read_file)
+            file_t = 0.0
+            for n, record in enumerate(screed.open(read_file)):
+
+                if n % 50000 == 0:
+                    end_t = time.clock()
+                    batch_t = end_t - start_t
+                    file_t += batch_t
+                    print >>sys.stderr, '\tswept {n} reads [{nc} colored, {no} orphaned] ** {sec}s ({sect}s total)' \
+                                        .format(n=n, nc=n_colored, no=n_orphaned, sec=batch_t, sect=file_t)
+                    start_t = time.clock()
+                seq = record.sequence
+                name = record.name
+                
+                colors = ht.sweep_color_neighborhood(seq, traversal_range)
+                color_number_dist.append(len(colors))
+                if colors:
+                    n_colored += 1
+                    if len(colors) > 1:
+                        multi_fp.write('>{}\t{}\n{}\n'.format(name, '\t'.join([str(c) for c in colors]), seq))
+                    else:
+                        output_buffer.add_seq(Seq(name, colors[0], seq))
+                else:
+                    n_orphaned += 1
+                    orphaned_fp.write('>{}\n{}\n'.format(name, seq))
+
+    except IOError as e:
+        print >>sys.stderr, 'ERROR:', e
+        print >>sys.stderr, '** exiting...'
+    
+	total_t = time.clock() - total_t
+
+    print >>sys.stderr, 'swept {n_reads} for colors...'.format(n_reads=n)
+    print >>sys.stderr, '...with {nc} colored and {no} orphaned'.format(
+                                    nc=n_colored, no=n_orphaned)
+    print >>sys.stderr, '...and {nmc} multicolored'.format(nmc=n_mcolored)
+    
+    print >>sys.stderr, '** outputting color number distribution...'
+    with open('color_dist.txt', 'wb') as outfp:
+        for nc in color_number_dist:
+            outfp.write('{nc}\n'.format(nc=nc))
+    
+if __name__ == '__main__':
+    main()

From 812036290e36ac5cdd9d90030ead05c6342d62d1 Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Tue, 5 Nov 2013 14:43:29 -0500
Subject: [PATCH 064/140] changed bad env line

---
 scripts/sweep-reads-by-partition-buffered.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py
index 9af1c4bba9..ee0fbb1f8a 100755
--- a/scripts/sweep-reads-by-partition-buffered.py
+++ b/scripts/sweep-reads-by-partition-buffered.py
@@ -1,4 +1,4 @@
-#! /w/khmer_dev/bin/python
+#! /usr/bin/python
 
 import screed
 import sys

From 0d899219553548f48d9fdbd464807a93cae8c9a4 Mon Sep 17 00:00:00 2001
From: CS Welcher <cs.welcher@gmail.com>
Date: Tue, 12 Nov 2013 14:25:07 -0500
Subject: [PATCH 065/140] important change in traversal code: removed
 optimization which truncated search at that breadth when a tag is found

---
 lib/subset.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/subset.cc b/lib/subset.cc
index 09ff44a6c9..8f6ae2bf16 100644
--- a/lib/subset.cc
+++ b/lib/subset.cc
@@ -615,7 +615,7 @@ unsigned int SubsetPartition::sweep_for_tags(const std::string& seq,
       tagged_kmers.insert(kmer);
       // if we find a tag, finish the remaining queued nodes,
       // but don't queue up any more
-      max_breadth = breadth;
+      // max_breadth = breadth;
       continue;
     }
 

From bd2fcdb48a209196be4f0d1d0ebebf1712e68a0d Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Wed, 13 Nov 2013 13:32:45 -0500
Subject: [PATCH 066/140] added default parameters to buffered sweep

---
 scripts/sweep-reads-by-partition-buffered.py | 31 ++++++++++++++++----
 1 file changed, 26 insertions(+), 5 deletions(-)

diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py
index ee0fbb1f8a..2dfae2981d 100755
--- a/scripts/sweep-reads-by-partition-buffered.py
+++ b/scripts/sweep-reads-by-partition-buffered.py
@@ -1,4 +1,16 @@
 #! /usr/bin/python
+#
+# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# the three-clause BSD license; see doc/LICENSE.txt. Contact: ctb@msu.edu
+#
+
+"""
+Find all reads connected to the given contigs on a per-partition basis.
+
+% python scripts/normalize-by-median.py -r <range> -i <contigs fastp> \
+<reads1> <reads2> ... <readsN>
+"""
 
 import screed
 import sys
@@ -7,6 +19,11 @@
 import khmer
 from khmer.counting_args import build_construct_args, DEFAULT_MIN_HASHSIZE
 
+DEFAULT_NUM_BUFFERS=50000
+DEFAULT_BUFFER_SIZE=1000000
+DEFAULT_NUM_PARTITIONS=100000
+DEFAULT_OUT_PREF='reads_'
+
 # little class to store sequence information for the buffering class
 class Seq:
     def __init__(self, name, color, seq):
@@ -24,7 +41,7 @@ def write(self, fp):
 # we should expect the mean buffer size to be 10 reads
 class ReadBuffer:
 
-    def __init__(self, max_buffers=10000, max_reads=1000000, est_files=100000, output_pref='reads_'):
+    def __init__(self, max_buffers, max_reads, est_files, output_pref):
         self.buffers = {}
         self.buffer_counts = {}
         self.max_buffers = max_buffers
@@ -93,10 +110,14 @@ def main():
     parser = build_construct_args()
     parser.add_argument('-i', '--input_fastp',dest='input_fastp')
     parser.add_argument('-r', '--traversal_range', type=int, dest='traversal_range')
-    parser.add_argument('-b', '--buffer_size', dest='buffer_size', type=int)
-    parser.add_argument('-e', '--files_estimate', dest='files_estimate', type=int)
-    parser.add_argument('-o', '--output_prefix', dest='output_prefix')
-    parser.add_argument('-m', '--max_buffers', dest='max_buffers', type=int)
+    parser.add_argument('-b', '--buffer_size', dest='buffer_size', type=int, \
+                        default=DEFAULT_BUFFER_SIZE)
+    parser.add_argument('-e', '--files_estimate', dest='files_estimate', type=int, \
+                        default=DEFAULT_NUM_PARTITIONS)
+    parser.add_argument('-o', '--output_prefix', dest='output_prefix',
+                        default=DEFAULT_OUT_PREF)
+    parser.add_argument('-m', '--max_buffers', dest='max_buffers', type=int, \
+                        default=DEFAULT_NUM_BUFFERS)
     parser.add_argument('input_files', nargs='+')
     args = parser.parse_args()
     

From 9278ecd955cc1d1b99ac74d720102c63883fa3a8 Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Wed, 13 Nov 2013 14:30:26 -0500
Subject: [PATCH 067/140] added error handling to file opening and buffer
 flushing

---
 scripts/sweep-reads-by-partition-buffered.py | 77 +++++++++++++++-----
 1 file changed, 58 insertions(+), 19 deletions(-)

diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py
index 2dfae2981d..934706e09d 100755
--- a/scripts/sweep-reads-by-partition-buffered.py
+++ b/scripts/sweep-reads-by-partition-buffered.py
@@ -30,8 +30,19 @@ def __init__(self, name, color, seq):
         self.name = name
         self.color = color
         self.seq = seq
+    
+    def __repr__(self):
+        return '''>{name}\t{color}\n
+{seq}\n'''.format(name=self.name, color=self.color, seq=self.seq)
+
     def write(self, fp):
-        fp.write('>{}\t{}\n{}\n'.format(self.name, self.color, self.seq))
+        try:
+            fp.write('\n>{}\t{}\n{}\n'.format(self.name, self.color, self.seq))
+        except IOError:
+            print >>sys.stderr, 'Error writing {seq} to {fn}'.format(seq=self, fn=fp)
+            return 1
+        else:
+            return 0
 
 # stores reads in memory and flushes them to their approriate files
 # when certain criteria are met
@@ -54,6 +65,9 @@ def __init__(self, max_buffers, max_reads, est_files, output_pref):
         self.cur_reads = 0
         self.cur_files = 0
 
+        self.num_write_errors = 0
+        self.num_file_errors = 0
+
     def add_seq(self, seq):
         color = seq.color
         if color in self.buffers:
@@ -75,11 +89,19 @@ def add_seq(self, seq):
             self.flush_all()
     
     def flush_buffer(self, color):
-        with open('{}_{}.fa'.format(self.output_pref, color), 'a') as outfp:
+        fn = '{}_{}.fa'.format(self.output_pref, color)
+        try:
+            outfp = open(fn, 'a')
+        except IOError as e:
+            print >>sys.stderr, 'ERROR: {e}'.format(e=e)
+            print >>sys.stderr, '*** Failed to open {fn} for buffer flush'.format(fn)
+            self.num_file_errors += 1
+        else:
             for read in self.buffers[color]:
-                read.write(outfp)
+                self.num_write_errors += read.write(outfp)
                 self.cur_reads -= 1
-            
+            outfp.close()
+
     def del_buffer(self, color):
         del self.buffer_counts[color]
         del self.buffers[color]
@@ -154,11 +176,21 @@ def main():
     est = args.files_estimate
     input_files = args.input_files
 
-    output_buffer = ReadBuffer(max_buffers=max_buffers, max_reads=buf_size, est_files=est, output_pref=output_pref)
+    output_buffer = ReadBuffer(max_buffers, buf_size, est, output_pref)
 
 	# file for multicolored reads, just keep this one around the whole time
-    multi_fp = open('{}_multi.fp'.format(output_pref), 'a')
-    orphaned_fp = open('{}_orphaned.fa'.format(output_pref), 'a')
+    multi_fn = '{}_multi.fp'.format(output_pref)
+    try:
+        multi_fp = open(multi_fn, 'a')
+    except IOError as e:
+        print >>sys.stderr, 'ERROR: {e}'.format(e=e)
+        print >>sys.stderr, '*** Failed to open {fn}'.format(multi_fn)
+    orphaned_fn = '{}_orphaned.fa'.format(output_pref)
+    try:
+        orphaned_fp = open(orphaned_fn, 'a')
+    except IOError as e:
+        print >>sys.stderr, 'ERROR: {e}'.format(e=e)
+        print >>sys.stderr, '*** Failed to open {fn}'.format(orphaned_fn)
 
 	# consume the partitioned fasta with which to color the graph
     ht = khmer.new_hashbits(K, HT_SIZE, N_HT)
@@ -170,19 +202,25 @@ def main():
     n_orphaned = 0
     n_colored = 0
     n_mcolored = 0
-    try:
-        total_t = time.clock()
-        start_t = time.clock()
-        for read_file in input_files:
-            print >>sys.stderr,'** sweeping {read_file} for colors...'.format(read_file=read_file)
-            file_t = 0.0
-            for n, record in enumerate(screed.open(read_file)):
 
+    total_t = time.clock()
+    start_t = time.clock()
+    for read_file in input_files:
+        print >>sys.stderr,'** sweeping {read_file} for colors...'.format(read_file=read_file)
+        file_t = 0.0
+        try:
+            read_fp = screed.open(read_file)
+        except IOError as e:
+            print >>sys.stderr, 'ERROR:', e
+            print >>sys.stderr, '*** Could not open {fn}, skipping...'.format(read_file)
+        else:
+            for n, record in enumerate(read_fp):
                 if n % 50000 == 0:
                     end_t = time.clock()
                     batch_t = end_t - start_t
                     file_t += batch_t
-                    print >>sys.stderr, '\tswept {n} reads [{nc} colored, {no} orphaned] ** {sec}s ({sect}s total)' \
+                    print >>sys.stderr, '\tswept {n} reads [{nc} colored, {no} orphaned] \
+                                        ** {sec}s ({sect}s total)' \
                                         .format(n=n, nc=n_colored, no=n_orphaned, sec=batch_t, sect=file_t)
                     start_t = time.clock()
                 seq = record.sequence
@@ -190,6 +228,7 @@ def main():
                 
                 colors = ht.sweep_color_neighborhood(seq, traversal_range)
                 color_number_dist.append(len(colors))
+                SeqOb = Seq
                 if colors:
                     n_colored += 1
                     if len(colors) > 1:
@@ -199,13 +238,13 @@ def main():
                 else:
                     n_orphaned += 1
                     orphaned_fp.write('>{}\n{}\n'.format(name, seq))
-
-    except IOError as e:
-        print >>sys.stderr, 'ERROR:', e
-        print >>sys.stderr, '** exiting...'
+            read_fp.close()
     
 	total_t = time.clock() - total_t
 
+    multi_fp.close()
+    orphaned_fp.close()
+
     print >>sys.stderr, 'swept {n_reads} for colors...'.format(n_reads=n)
     print >>sys.stderr, '...with {nc} colored and {no} orphaned'.format(
                                     nc=n_colored, no=n_orphaned)

From ac63a8b9815c0fb32770dd968bd2240021c247cf Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Wed, 13 Nov 2013 14:35:17 -0500
Subject: [PATCH 068/140] added warning output for errors, updated description

---
 scripts/sweep-reads-by-partition-buffered.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py
index 934706e09d..31a26a3939 100755
--- a/scripts/sweep-reads-by-partition-buffered.py
+++ b/scripts/sweep-reads-by-partition-buffered.py
@@ -10,6 +10,13 @@
 
 % python scripts/normalize-by-median.py -r <range> -i <contigs fastp> \
 <reads1> <reads2> ... <readsN>
+
+This script is very lenient on IO errors, due to the large number of file
+operations needed. Thus, errors opening a file for buffer flush or writeing
+a read to a file will not crash the program; instead, if there were errors,
+the user will be warned at the end of execution. Errors with opening read files
+are also handled -- we move on to the next read file if there is an error opening.
+
 """
 
 import screed
@@ -244,6 +251,11 @@ def main():
 
     multi_fp.close()
     orphaned_fp.close()
+    
+    if output_buffer.num_write_errors > 0 or output_buffer.num_file_errors > 0:
+        print >>sys.stderr, 'WARNING: Sweep finished with errors!'
+        print >>sys.stderr, '** {writee} reads not written'.format(writee=output_buffer.num_write_errors)
+        print >>sys.stderr, '** {filee} errors opening files'.format(filee=output_buffer.num_file_errors)
 
     print >>sys.stderr, 'swept {n_reads} for colors...'.format(n_reads=n)
     print >>sys.stderr, '...with {nc} colored and {no} orphaned'.format(

From d5242574966a24e63aa7905c188915426d511cfb Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Wed, 13 Nov 2013 17:09:07 -0500
Subject: [PATCH 069/140] added minimum k and hashsizes to prevent inanely
 complex useless searches

---
 scripts/sweep-reads-by-partition-buffered.py | 33 +++++++++++++-------
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py
index 31a26a3939..02f6d7d075 100755
--- a/scripts/sweep-reads-by-partition-buffered.py
+++ b/scripts/sweep-reads-by-partition-buffered.py
@@ -30,6 +30,10 @@
 DEFAULT_BUFFER_SIZE=1000000
 DEFAULT_NUM_PARTITIONS=100000
 DEFAULT_OUT_PREF='reads_'
+DEFAULT_RANGE=-1
+
+MIN_HSIZE=4e7
+MIN_KSIZE=21
 
 # little class to store sequence information for the buffering class
 class Seq:
@@ -59,15 +63,15 @@ def write(self, fp):
 # we should expect the mean buffer size to be 10 reads
 class ReadBuffer:
 
-    def __init__(self, max_buffers, max_reads, est_files, output_pref):
+    def __init__(self, max_buffers, max_size, est_files, output_pref):
         self.buffers = {}
         self.buffer_counts = {}
         self.max_buffers = max_buffers
-        self.max_reads = max_reads
+        self.max_size = max_size
 
         self.est_files = est_files
         self.output_pref = output_pref
-        self.buffer_flush = self.max_reads / self.est_files
+        self.buffer_flush = self.max_size / self.est_files
 
         self.cur_reads = 0
         self.cur_files = 0
@@ -89,7 +93,7 @@ def add_seq(self, seq):
             self.buffers[color] = [seq]
             self.buffer_counts[color] = 1
         self.cur_reads += 1
-        if self.cur_reads > self.max_reads:
+        if self.cur_reads > self.max_size:
             self.flush_all()
         if len(self.buffers) > self.max_buffers:
             #self.clean_buffers(2)
@@ -138,7 +142,8 @@ def main():
 
     parser = build_construct_args()
     parser.add_argument('-i', '--input_fastp',dest='input_fastp')
-    parser.add_argument('-r', '--traversal_range', type=int, dest='traversal_range')
+    parser.add_argument('-r', '--traversal_range', type=int, dest='traversal_range', \
+                        default=DEFAULT_RANGE)
     parser.add_argument('-b', '--buffer_size', dest='buffer_size', type=int, \
                         default=DEFAULT_BUFFER_SIZE)
     parser.add_argument('-e', '--files_estimate', dest='files_estimate', type=int, \
@@ -150,6 +155,15 @@ def main():
     parser.add_argument('input_files', nargs='+')
     args = parser.parse_args()
     
+   
+    K = args.ksize
+    HT_SIZE = args.min_hashsize
+    if HT_SIZE < MIN_HSIZE:
+        HT_SIZE = MIN_HSIZE
+    if K < MIN_KSIZE:
+        K = MIN_KSIZE
+    N_HT = args.n_hashes
+
     if not args.quiet:
         if args.min_hashsize == DEFAULT_MIN_HASHSIZE:
             print >>sys.stderr, \
@@ -159,21 +173,17 @@ def main():
 
         print >>sys.stderr, '\nPARAMETERS:'
         print >>sys.stderr, \
-            ' - kmer size =    {ksize:d} \t\t(-k)'.format(ksize=args.ksize)
+            ' - kmer size =    {ksize:d} \t\t(-k)'.format(ksize=K)
         print >>sys.stderr, \
             ' - n hashes =     {nhash:d} \t\t(-N)'.format(nhash=args.n_hashes)
         print >>sys.stderr, \
-            ' - min hashsize = {mh:-5.2g} \t(-x)'.format(mh=args.min_hashsize)
+            ' - min hashsize = {mh:-5.2g} \t(-x)'.format(mh=HT_SIZE)
         print >>sys.stderr, ''
         print >>sys.stderr, \
             'Estimated memory usage is {prod:.2g} bytes \
             (n_hashes x min_hashsize / 8)'.format(prod=args.n_hashes*args.min_hashsize/8)
         print >>sys.stderr, '-' * 8
     
-    K = args.ksize
-    HT_SIZE = args.min_hashsize
-    N_HT = args.n_hashes
-    
     traversal_range = args.traversal_range
     input_fastp = args.input_fastp
 
@@ -235,7 +245,6 @@ def main():
                 
                 colors = ht.sweep_color_neighborhood(seq, traversal_range)
                 color_number_dist.append(len(colors))
-                SeqOb = Seq
                 if colors:
                     n_colored += 1
                     if len(colors) > 1:

From 3a7ff00d1509a77031a4c44f5c656036c5dd75b9 Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Wed, 13 Nov 2013 18:14:36 -0500
Subject: [PATCH 070/140] started tests

---
 tests/test_scripts.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/tests/test_scripts.py b/tests/test_scripts.py
index 697a195bb3..89c72c81fe 100644
--- a/tests/test_scripts.py
+++ b/tests/test_scripts.py
@@ -1121,3 +1121,15 @@ def test_sample_reads_randomly():
                         '895:1:1:1327:15301', '895:1:1:1265:2265',
                         '895:1:1:1327:13028', '895:1:1:1368:4434',
                         '895:1:1:1335:19932', '895:1:1:1340:19387'])
+
+def test_sweep_reads_by_partition_buffered():
+    readfile = utils.get_temp_filename('reads.fa')
+    contigfile = utils.get_temp_filename('contigs.fp')
+    in_dir = os.path.dirname(infile)
+    
+    shutil.copyfile(utils.get_test_data('test-sweep-reads.fa'), infile)
+    shutil.copyfile(utils.get_test_data('test-sweep-contigs.fp'), contigfile)    
+
+    script = scriptpath('sweep-reads-by-partition-buffered.py')
+    args = ['-o', 'test', '-i', contigfile, readfile]
+

From 37ac9d89090cbaf8c7593de41cf8bf7a14a2154d Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Thu, 14 Nov 2013 11:01:19 -0500
Subject: [PATCH 071/140] working on tests

---
 tests/test_scripts.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/test_scripts.py b/tests/test_scripts.py
index 89c72c81fe..9a50e56721 100644
--- a/tests/test_scripts.py
+++ b/tests/test_scripts.py
@@ -1132,4 +1132,9 @@ def test_sweep_reads_by_partition_buffered():
 
     script = scriptpath('sweep-reads-by-partition-buffered.py')
     args = ['-o', 'test', '-i', contigfile, readfile]
+    status, out, err = runscript(script, args, in_dir)
+    
+    outfiles = ['test_0.fa', 'test_1.fa']
 
+    seqs1 = set([r.name for r in screed.open(outfiles[0])])
+    seqs2 = set([r.name for r in screed.open(outfiles[1])])

From 0ccb6231727038946d857ebd83895d3ac31f4b0d Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Thu, 14 Nov 2013 11:01:44 -0500
Subject: [PATCH 072/140] added -d/--debug option to profile the sweep with yep

---
 scripts/sweep-reads-by-partition-buffered.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py
index 02f6d7d075..3a82471053 100755
--- a/scripts/sweep-reads-by-partition-buffered.py
+++ b/scripts/sweep-reads-by-partition-buffered.py
@@ -152,6 +152,7 @@ def main():
                         default=DEFAULT_OUT_PREF)
     parser.add_argument('-m', '--max_buffers', dest='max_buffers', type=int, \
                         default=DEFAULT_NUM_BUFFERS)
+    parser.add_argument('-d', '--debug', dest='debug', default=None)
     parser.add_argument('input_files', nargs='+')
     args = parser.parse_args()
     
@@ -193,6 +194,10 @@ def main():
     est = args.files_estimate
     input_files = args.input_files
 
+    debug = args.debug
+    if debug:
+        import yep
+
     output_buffer = ReadBuffer(max_buffers, buf_size, est, output_pref)
 
 	# file for multicolored reads, just keep this one around the whole time
@@ -212,6 +217,8 @@ def main():
 	# consume the partitioned fasta with which to color the graph
     ht = khmer.new_hashbits(K, HT_SIZE, N_HT)
     print >>sys.stderr, 'consuming fastp...'
+    if debug:
+        yep.start(debug)
     ht.consume_partitioned_fasta_and_tag_with_colors(input_fastp)
 
     color_number_dist = []
@@ -260,7 +267,8 @@ def main():
 
     multi_fp.close()
     orphaned_fp.close()
-    
+    if debug:
+        yep.stop()
     if output_buffer.num_write_errors > 0 or output_buffer.num_file_errors > 0:
         print >>sys.stderr, 'WARNING: Sweep finished with errors!'
         print >>sys.stderr, '** {writee} reads not written'.format(writee=output_buffer.num_write_errors)

From eb7b34601593a1f47957ad48e22a23c0c5cdc92c Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Thu, 14 Nov 2013 13:56:46 -0500
Subject: [PATCH 073/140] removed old sweep scripts

---
 scripts/split-reads-by-color.py             | 154 --------------------
 scripts/sweep-reads-by-partition-to-file.py | 123 ----------------
 scripts/sweep-reads-by-partition.py         | 137 -----------------
 3 files changed, 414 deletions(-)
 delete mode 100644 scripts/split-reads-by-color.py
 delete mode 100755 scripts/sweep-reads-by-partition-to-file.py
 delete mode 100755 scripts/sweep-reads-by-partition.py

diff --git a/scripts/split-reads-by-color.py b/scripts/split-reads-by-color.py
deleted file mode 100644
index 3f9d38b989..0000000000
--- a/scripts/split-reads-by-color.py
+++ /dev/null
@@ -1,154 +0,0 @@
-# In-progress read-buffering approach to writing out colors to many files
-# Basic idea is to buffer some number of reads in memory, then dump them all at once
-# Hope that each file acrues, on average, BUFFER_SIZE / NUM_PARTS reads
-# ie, if we buffer 1000000 reads, and we have 100000 partitios/colors,
-# we should expect the mean buffer size to be 10 reads
-
-import screed
-import sys
-import argparse
-import time
-
-def fastp_iter(filename):
-    for record in screed.open(filename, parse_description=False):
-        name = record.name
-        try:
-            name, partition_id = name.rsplit('\t', 1)
-        except ValueError:
-            print >>sys.stderr, '%%% ERROR: Derp! Is this file partitioned? %%%'
-            sys.exit(1)
-        # convert name to blast format if necessary
-        nname = name.split('|', 2)
-        if len(nname) >= 2:
-            name = nname[2]
-        name = name.split(' ')[0]
-        yield name, int(partition_id), record.sequence
-
-class Seq:
-
-    def __init__(self, name, color, seq):
-        self.name = name
-        self.color = color
-        self.seq = seq
-
-    def write(self, fp):
-        fp.write('>{}\t{}\n{}\n'.format(self.name, self.color, self.seq))
-
-class ReadBuffer:
-
-    def __init__(self, max_buffers=10000, max_reads=1000000, est_files=100000, output_pref='reads_'):
-        self.buffers = {}
-        self.buffer_counts = {}
-        self.max_buffers = max_buffers
-        self.max_reads = max_reads
-
-        self.est_files = est_files
-        self.output_pref = output_pref
-        self.buffer_flush = self.max_reads / self.est_files
-
-        self.cur_reads = 0
-        self.cur_files = 0
-
-    def add_seq(self, seq):
-        color = seq.color
-        if color in self.buffers:
-            count = self.buffer_counts[color]
-            self.buffers[color].append(seq)
-            self.buffer_counts[color] += 1
-            if count > self.buffer_flush:
-                self.flush_buffer(color)
-                self.del_buffer(color)
-
-        else:
-            self.buffers[color] = [seq]
-            self.buffer_counts[color] = 1
-        self.cur_reads += 1
-        if self.cur_reads > self.max_reads:
-            self.flush_all()
-        if len(self.buffers) > self.max_buffers:
-            #self.clean_buffers(2)
-            self.flush_all()
-    
-    def flush_buffer(self, color):
-        with open('{}{}.fa'.format(self.output_pref, color), 'a') as outfp:
-            for read in self.buffers[color]:
-                read.write(outfp)
-                self.cur_reads -= 1
-            
-    def del_buffer(self, color):
-        del self.buffer_counts[color]
-        del self.buffers[color]
-
-    def flush_all(self):
-        print >>sys.stderr, '** reached max buffer size, flushing all to files...'
-        for color in self.buffers:
-            self.flush_buffer(color)
-        colors = self.buffers.keys()
-        for color in colors:
-            self.del_buffer(color)
-        del colors
-        assert self.cur_reads == 0
-
-    def clean_buffers(self, cutoff):
-        print >>sys.stderr, '** flushing low-abundance buffers...'
-        flushed = []
-        for color in self.buffers:
-            if self.buffer_counts[color] < cutoff:
-                self.flush_buffer(color)
-                flushed.append(color)
-        for color in flushed:
-            self.del_buffer(color)
-        del flushed
-
-def main():
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument('-b', '--buffer_size', dest='buffer_size', type=int)
-    parser.add_argument('-e', '--files_estimate', dest='files_estimate', type=int)
-    parser.add_argument('-o', '--output_prefix', dest='output_prefix')
-    parser.add_argument('-m', '--max_buffers', dest='max_buffers', type=int)
-    parser.add_argument('input_files', nargs='+')
-    args = parser.parse_args()
-
-    max_buffers = args.max_buffers
-    output_pref = args.output_prefix
-    buf_size = args.buffer_size
-    est = args.files_estimate
-    input_files = args.input_files
-
-    output_buffer = ReadBuffer(max_buffers=max_buffers, max_reads=buf_size, est_files=est, output_pref=output_pref)
-
-    multi_fp = open('{}_multi.fa'.format(output_pref), 'a')
-    
-    n_reads = 0
-    total_t = 0.0
-    start_t = time.clock()
-    for input_file in args.input_files:
-        print >>sys.stderr, '* splitting reads in {}...'.format(input_file)
-
-        current_read = ''
-        seen_twice = False
-
-        for name, color, seq in fastp_iter(input_file):
-            n_reads += 1
-            seq_obj = Seq(name, color, seq)
-
-            if n_reads % 100000 == 0:
-                end_t = time.clock()
-                batch_t = end_t - start_t
-                total_t += batch_t
-                print >>sys.stderr, '** processed {} reads from {} [{}s, {}s total]'.format(n_reads, input_file, batch_t, total_t)
-                start_t = time.clock()
- 
-            if name == current_read:
-                if not seen_twice:
-                    seq_obj.write(multi_fp)
-                seen_twice = True
-            
-            else:
-                seen_twice = False
-                output_buffer.add_seq(Seq(name,color,seq))
-            current_read = name
-
-if __name__ == '__main__':
-    main()
diff --git a/scripts/sweep-reads-by-partition-to-file.py b/scripts/sweep-reads-by-partition-to-file.py
deleted file mode 100755
index 6cb0ed687a..0000000000
--- a/scripts/sweep-reads-by-partition-to-file.py
+++ /dev/null
@@ -1,123 +0,0 @@
-#! /w/khmer_dev/bin/python
-#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
-# the three-clause BSD license; see doc/LICENSE.txt. Contact: ctb@msu.edu
-#
-"""
-Tag and color the given partitioned fasta, then find all reads in the neighborhood
-of each partition and output to a file
-
-% python scripts/normalize-by-median.py [ -p <partitions/file> ] -i <fastp> <reads1> <reads2> ...
-
-Use '-h' for parameter help.
-"""
-
-import khmer
-import screed
-import sys
-import time
-from khmer.counting_args import build_construct_args, DEFAULT_MIN_HASHSIZE
-
-MAX_FILES=512
-READS_PER_FILE = 100000000
-
-def write_read(fp, seq, name, color):
-    fp.write('>{name}\t{color}\n{seq}\n'.format(seq=seq, name=name, color=color))
-
-def main():
-    parser = build_construct_args()
-    #parser.add_argument('-p', '--partitions_per_file', 
-    #                    dest='partitions_per_file', default=DEFAULT_PPF)
-    parser.add_argument('-i', '--input_fastp',dest='input_fastp')
-    parser.add_argument('-r', '--traversal_range', type=int, dest='traversal_range')
-    parser.add_argument('input_reads', nargs='+')
-    args = parser.parse_args()
-    
-    if not args.quiet:
-        if args.min_hashsize == DEFAULT_MIN_HASHSIZE:
-            print >>sys.stderr, \
-                "** WARNING: hashsize is default!  " \
-                "You absodefly want to increase this!\n** " \
-                "Please read the docs!"
-
-        print >>sys.stderr, '\nPARAMETERS:'
-        print >>sys.stderr, \
-            ' - kmer size =    {ksize:d} \t\t(-k)'.format(ksize=args.ksize)
-        print >>sys.stderr, \
-            ' - n hashes =     {nhash:d} \t\t(-N)'.format(nhash=args.n_hashes)
-        print >>sys.stderr, \
-            ' - min hashsize = {mh:-5.2g} \t(-x)'.format(mh=args.min_hashsize)
-        print >>sys.stderr, ''
-        print >>sys.stderr, \
-            'Estimated memory usage is {prod:.2g} bytes \
-            (n_hashes x min_hashsize / 8)'.format(prod=args.n_hashes*args.min_hashsize/8)
-        print >>sys.stderr, '-' * 8
-    
-    K = args.ksize
-    HT_SIZE = args.min_hashsize
-    N_HT = args.n_hashes
-    
-    traversal_range = args.traversal_range
-    input_reads = args.input_reads
-    input_fastp = args.input_fastp
-    
-    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)
-    print >>sys.stderr, 'consuming fastp...'
-    ht.consume_partitioned_fasta_and_tag_with_colors(input_fastp)
- 
-    color_number_dist = []
-    
-    n_orphaned = 0
-    n_colored = 0
-    n_mcolored = 0
-    n_files = 0
-    try:
-        outfp = open('colored_reads_0.fa', 'wb')
-        start_t = time.clock()
-        for read_file in input_reads:
-            print >>sys.stderr,'** sweeping {read_file} for colors...'.format(read_file=read_file)
-            total_t = 0.0
-            for n, record in enumerate(screed.open(read_file)):
-                if n % 50000 == 0:
-                    end_t = time.clock()
-                    batch_t = end_t - start_t
-                    total_t += batch_t
-                    print >>sys.stderr, '\tswept {n} reads [{nc} colored, {no} orphaned] ** {sec}s ({sect}s total)' \
-                                        .format(n=n, nc=n_colored, no=n_orphaned, sec=batch_t, sect=total_t)
-                    start_t = time.clock()
-                seq = record.sequence
-                name = record.name
-                
-                colors = ht.sweep_color_neighborhood(seq, traversal_range)
-                color_number_dist.append(len(colors))
-                if colors:
-                    n_colored += 1
-                    if len(colors) > 1:
-                        n_mcolored += 1
-                    for color in colors:
-                        write_read(outfp, seq, name, color)
-                else:
-                    n_orphaned += 1
-
-                if n_colored % READS_PER_FILE == 0 and n_colored != 0:
-                    n_files += 1
-                    outfp = open('colored_reads_{}.fa'.format(n_files), 'wb')
-
-    except IOError as e:
-        print >>sys.stderr, 'ERROR:', e
-        print >>sys.stderr, '** exiting...'
-        
-    print >>sys.stderr, 'swept {n_reads} for colors...'.format(n_reads=n)
-    print >>sys.stderr, '...with {nc} colored and {no} orphaned'.format(
-                                    nc=n_colored, no=n_orphaned)
-    print >>sys.stderr, '...and {nmc} multicolored'.format(nmc=n_mcolored)
-    print >>sys.stderr, '...to {nf} files'.format(nf=n_files)
-    
-    print >>sys.stderr, '** outputting color number distribution...'
-    with open('color_dist.txt', 'wb') as outfp:
-        for nc in color_number_dist:
-            outfp.write('{nc}\n'.format(nc=nc))
-    
-if __name__ == '__main__':
-    main()
diff --git a/scripts/sweep-reads-by-partition.py b/scripts/sweep-reads-by-partition.py
deleted file mode 100755
index b0cd79961b..0000000000
--- a/scripts/sweep-reads-by-partition.py
+++ /dev/null
@@ -1,137 +0,0 @@
-#! /w/khmer_dev/bin/python
-#
-# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-# Copyright (C) Michigan State University, 2009-2013. It is licensed under
-# the three-clause BSD license; see doc/LICENSE.txt. Contact: ctb@msu.edu
-#
-"""
-Tag and color the given partitioned fasta, then find all reads in the neighborhood
-of each partition and output to a file
-
-% python scripts/normalize-by-median.py [ -p <partitions/file> ] -i <fastp> <reads1> <reads2> ...
-
-Use '-h' for parameter help.
-"""
-
-import khmer
-import screed
-import sys
-from khmer.counting_args import build_construct_args, DEFAULT_MIN_HASHSIZE
-
-DEFAULT_PPF = 1
-MAX_FILES=512
-
-def write_read(fp, seq, name, color):
-    fp.write('>{name}\t{color}\n{seq}\n'.format(seq=seq, name=name, color=color))
-
-def main():
-    parser = build_construct_args()
-    parser.add_argument('-p', '--partitions_per_file', 
-                        dest='partitions_per_file', default=DEFAULT_PPF)
-    parser.add_argument('-i', '--input_fastp', dest='input_fastp')
-    parser.add_argument('-r', '--traversal_range', dest='traversal_range')
-    parser.add_argument('input_reads', nargs='+')
-    args = parser.parse_args()
-    
-    if not args.quiet:
-        if args.min_hashsize == DEFAULT_MIN_HASHSIZE:
-            print >>sys.stderr, \
-                "** WARNING: hashsize is default!  " \
-                "You absodefly want to increase this!\n** " \
-                "Please read the docs!"
-
-        print >>sys.stderr, '\nPARAMETERS:'
-        print >>sys.stderr, \
-            ' - kmer size =    {ksize:d} \t\t(-k)'.format(ksize=args.ksize)
-        print >>sys.stderr, \
-            ' - n hashes =     {nhash:d} \t\t(-N)'.format(nhash=args.n_hashes)
-        print >>sys.stderr, \
-            ' - min hashsize = {mh:-5.2g} \t(-x)'.format(mh=args.min_hashsize)
-        print >>sys.stderr, ''
-        print >>sys.stderr, \
-            'Estimated memory usage is {prod:.2g} bytes \
-            (n_hashes x min_hashsize / 8)'.format(prod=args.n_hashes*args.min_hashsize/8)
-        print >>sys.stderr, '-' * 8
-    
-    K = args.ksize
-    HT_SIZE = args.min_hashsize
-    N_HT = args.n_hashes
-    
-    traversal_range = args.traversal_range
-    input_reads = args.input_reads
-    input_fastp = args.input_fastp
-    ppf = args.partitions_per_file
-    
-    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)
-    ht.consume_partitioned_fasta_and_tag_with_colors(input_fastp)
-    
-    cur_colors = []
-    color_to_fp_dict = {}
-    cur_fp = file
-    
-    color_number_dist = []
-    
-    n_orphaned = 0
-    n_colored = 0
-    n_mcolored = 0
-    n_files = 0
-    try:
-        for read_file in input_reads:
-            print >>sys.stderr,'** sweeping {read_file} for colors...'.format(read_file=read_file)
-            
-            for n, record in enumerate(screed.open(read_file)):
-                if n % 50000 == 0:
-                    print >>sys.stderr, '\tswept {n} reads [{nc} colored, {no} orphaned]' \
-                                        .format(n=n, nc=n_colored, no=n_orphaned)
-                seq = record.sequence
-                name = record.name
-                
-                colors = ht.sweep_color_neighborhood(seq)
-                color_number_dist.append(len(colors))
-                if colors:
-                    n_colored += 1
-                    if len(colors) > 1:
-                        n_mcolored += 1
-                    for color in colors:
-                        # do we have a file for this color already? use it!
-                        if color in color_to_fp_dict:
-                            fp = color_to_fp_dict[color]
-                            write_read(fp, seq, name, color)
-                        # no file yet? make a new one
-                        else:
-                            if len(cur_colors) == 0:
-                                #print '** opening new file...'
-                                cur_fp = open('colored_reads_{fn}.fa'.format(fn=n_files),
-                                              'wb')
-                                              
-                            color_to_fp_dict[color] = cur_fp
-                            cur_colors.append(color)
-                            write_read(cur_fp, seq, name, color)
-                            n_files += 1
-                            
-                            if len(cur_colors) == ppf:
-                                cur_colors = []
-                else:
-                    n_orphaned += 1
-            
-        for key in color_to_fp_dict:
-            if color_to_fp_dict[key]:
-                color_to_fp_dict[key].close()
-
-    except IOError as e:
-        print >>sys.stderr, 'ERROR:', e
-        print >>sys.stderr, '** exiting...'
-        
-    print >>sys.stderr, 'swept {n_reads} for colors...'.format(n_reads=n)
-    print >>sys.stderr, '...with {nc} colored and {no} orphaned'.format(
-                                    nc=n_colored, no=n_orphaned)
-    print >>sys.stderr, '...and {nmc} multicolored'.format(nmc=n_mcolored)
-    print >>sys.stderr, '...to {nf} files'.format(nf=n_files)
-    
-    print >>sys.stderr, '** outputting color number distribution...'
-    with open('color_dist.txt', 'wb') as outfp:
-        for nc in color_number_dist:
-            outfp.write('{nc}\n'.format(nc=nc))
-    
-if __name__ == '__main__':
-    main()

From c253a91b3e5f90890cf18a4b8bf2180dbf338b9f Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Thu, 14 Nov 2013 14:23:56 -0500
Subject: [PATCH 074/140] sweeped reads properly puts output files in source
 dir

---
 scripts/sweep-reads-by-partition-buffered.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py
index 3a82471053..b909c04f3a 100755
--- a/scripts/sweep-reads-by-partition-buffered.py
+++ b/scripts/sweep-reads-by-partition-buffered.py
@@ -63,7 +63,7 @@ def write(self, fp):
 # we should expect the mean buffer size to be 10 reads
 class ReadBuffer:
 
-    def __init__(self, max_buffers, max_size, est_files, output_pref):
+    def __init__(self, max_buffers, max_size, est_files, output_pref, outdir):
         self.buffers = {}
         self.buffer_counts = {}
         self.max_buffers = max_buffers
@@ -71,6 +71,7 @@ def __init__(self, max_buffers, max_size, est_files, output_pref):
 
         self.est_files = est_files
         self.output_pref = output_pref
+        self.outdir = outdir
         self.buffer_flush = self.max_size / self.est_files
 
         self.cur_reads = 0
@@ -101,11 +102,12 @@ def add_seq(self, seq):
     
     def flush_buffer(self, color):
         fn = '{}_{}.fa'.format(self.output_pref, color)
+        fpath = os.path.join(self.outdir, fn)
         try:
-            outfp = open(fn, 'a')
+            outfp = open(fpath, 'a')
         except IOError as e:
             print >>sys.stderr, 'ERROR: {e}'.format(e=e)
-            print >>sys.stderr, '*** Failed to open {fn} for buffer flush'.format(fn)
+            print >>sys.stderr, '*** Failed to open {fn} for buffer flush'.format(fpath)
             self.num_file_errors += 1
         else:
             for read in self.buffers[color]:
@@ -187,6 +189,7 @@ def main():
     
     traversal_range = args.traversal_range
     input_fastp = args.input_fastp
+    outdir = os.path.dirname(input_fastp)
 
     max_buffers = args.max_buffers
     output_pref = args.output_prefix
@@ -198,16 +201,16 @@ def main():
     if debug:
         import yep
 
-    output_buffer = ReadBuffer(max_buffers, buf_size, est, output_pref)
+    output_buffer = ReadBuffer(max_buffers, buf_size, est, output_pref, outdir)
 
 	# file for multicolored reads, just keep this one around the whole time
-    multi_fn = '{}_multi.fp'.format(output_pref)
+    multi_fn = os.path.join(outdir, '{}_multi.fp'.format(output_pref))
     try:
         multi_fp = open(multi_fn, 'a')
     except IOError as e:
         print >>sys.stderr, 'ERROR: {e}'.format(e=e)
         print >>sys.stderr, '*** Failed to open {fn}'.format(multi_fn)
-    orphaned_fn = '{}_orphaned.fa'.format(output_pref)
+    orphaned_fn = os.path.join(outdir, '{}_orphaned.fa'.format(output_pref))
     try:
         orphaned_fp = open(orphaned_fn, 'a')
     except IOError as e:

From 3b6f9afcaebf0a80f132ecf2efe553daf54fed90 Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Thu, 14 Nov 2013 14:29:42 -0500
Subject: [PATCH 075/140] fixed key error in error checking code

---
 scripts/sweep-reads-by-partition-buffered.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py
index b909c04f3a..b46a34eecd 100755
--- a/scripts/sweep-reads-by-partition-buffered.py
+++ b/scripts/sweep-reads-by-partition-buffered.py
@@ -21,6 +21,7 @@
 
 import screed
 import sys
+import os
 import argparse
 import time
 import khmer
@@ -239,7 +240,7 @@ def main():
             read_fp = screed.open(read_file)
         except IOError as e:
             print >>sys.stderr, 'ERROR:', e
-            print >>sys.stderr, '*** Could not open {fn}, skipping...'.format(read_file)
+            print >>sys.stderr, '*** Could not open {fn}, skipping...'.format(fn=read_file)
         else:
             for n, record in enumerate(read_fp):
                 if n % 50000 == 0:

From 3278230ecf0ee30966fc863379f6d926e11b4724 Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Thu, 14 Nov 2013 14:38:58 -0500
Subject: [PATCH 076/140] was not flushing buffer at end of run! oops...

---
 scripts/sweep-reads-by-partition-buffered.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py
index b46a34eecd..e0546de0f2 100755
--- a/scripts/sweep-reads-by-partition-buffered.py
+++ b/scripts/sweep-reads-by-partition-buffered.py
@@ -265,8 +265,11 @@ def main():
                 else:
                     n_orphaned += 1
                     orphaned_fp.write('>{}\n{}\n'.format(name, seq))
+            output_buffer.flush_all()
             read_fp.close()
-    
+
+    # gotta output anything left in the buffers at the end!
+    output_buffer.flush_all() 
 	total_t = time.clock() - total_t
 
     multi_fp.close()
@@ -278,7 +281,7 @@ def main():
         print >>sys.stderr, '** {writee} reads not written'.format(writee=output_buffer.num_write_errors)
         print >>sys.stderr, '** {filee} errors opening files'.format(filee=output_buffer.num_file_errors)
 
-    print >>sys.stderr, 'swept {n_reads} for colors...'.format(n_reads=n)
+    print >>sys.stderr, 'swept {n_reads} for colors...'.format(n_reads=n_colored+n_mcolored+n_orphaned)
     print >>sys.stderr, '...with {nc} colored and {no} orphaned'.format(
                                     nc=n_colored, no=n_orphaned)
     print >>sys.stderr, '...and {nmc} multicolored'.format(nmc=n_mcolored)

From 0f8fb291319eec63daf11a6d9b7797c8a5d5319a Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Thu, 14 Nov 2013 14:48:22 -0500
Subject: [PATCH 077/140] fixed screwy extra spaces in output

---
 scripts/sweep-reads-by-partition-buffered.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py
index e0546de0f2..82fbd4e33d 100755
--- a/scripts/sweep-reads-by-partition-buffered.py
+++ b/scripts/sweep-reads-by-partition-buffered.py
@@ -49,7 +49,7 @@ def __repr__(self):
 
     def write(self, fp):
         try:
-            fp.write('\n>{}\t{}\n{}\n'.format(self.name, self.color, self.seq))
+            fp.write('>{}\t{}\n{}\n'.format(self.name, self.color, self.seq))
         except IOError:
             print >>sys.stderr, 'Error writing {seq} to {fn}'.format(seq=self, fn=fp)
             return 1
@@ -270,7 +270,7 @@ def main():
 
     # gotta output anything left in the buffers at the end!
     output_buffer.flush_all() 
-	total_t = time.clock() - total_t
+    total_t = time.clock() - total_t
 
     multi_fp.close()
     orphaned_fp.close()

From e7ece3329b333e4e71803eb36bb34b4cd926e9a6 Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Thu, 14 Nov 2013 14:51:50 -0500
Subject: [PATCH 078/140] added initial test for sweep

---
 tests/test_scripts.py | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/tests/test_scripts.py b/tests/test_scripts.py
index 9a50e56721..2a7e8dd4d7 100644
--- a/tests/test_scripts.py
+++ b/tests/test_scripts.py
@@ -1125,16 +1125,24 @@ def test_sample_reads_randomly():
 def test_sweep_reads_by_partition_buffered():
     readfile = utils.get_temp_filename('reads.fa')
     contigfile = utils.get_temp_filename('contigs.fp')
-    in_dir = os.path.dirname(infile)
+    in_dir = os.path.dirname(contigfile)
     
-    shutil.copyfile(utils.get_test_data('test-sweep-reads.fa'), infile)
+    shutil.copyfile(utils.get_test_data('test-sweep-reads.fa'), readfile)
     shutil.copyfile(utils.get_test_data('test-sweep-contigs.fp'), contigfile)    
 
     script = scriptpath('sweep-reads-by-partition-buffered.py')
-    args = ['-o', 'test', '-i', contigfile, readfile]
+    args = ['-k', '25', '-o', 'test', '-i', contigfile, readfile]
     status, out, err = runscript(script, args, in_dir)
     
-    outfiles = ['test_0.fa', 'test_1.fa']
+    out1 = os.path.join(in_dir, 'test_0.fa')
+    out2 = os.path.join(in_dir, 'test_1.fa')
+    
+    print os.listdir(in_dir)
 
-    seqs1 = set([r.name for r in screed.open(outfiles[0])])
-    seqs2 = set([r.name for r in screed.open(outfiles[1])])
+    seqs1 = set([r.name for r in screed.open(out1)])
+    seqs2 = set([r.name for r in screed.open(out2)])
+    
+    print seqs1
+    print seqs2
+    assert seqs1 == set(['read1_p0\t0', 'read2_p0\t0'])
+    assert seqs2 == set(['read3_p1\t1'])

From 409fa78a7ae74238e605cbd4edff18673e2f3a4e Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Thu, 14 Nov 2013 14:52:09 -0500
Subject: [PATCH 079/140] test data for sweep tests

---
 tests/test-data/test-sweep-contigs.fp | 8 ++++++++
 tests/test-data/test-sweep-reads.fa   | 6 ++++++
 2 files changed, 14 insertions(+)
 create mode 100644 tests/test-data/test-sweep-contigs.fp
 create mode 100644 tests/test-data/test-sweep-reads.fa

diff --git a/tests/test-data/test-sweep-contigs.fp b/tests/test-data/test-sweep-contigs.fp
new file mode 100644
index 0000000000..92be07782a
--- /dev/null
+++ b/tests/test-data/test-sweep-contigs.fp
@@ -0,0 +1,8 @@
+>read_A	0
+ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG
+>read_B_overlap_A	0
+GCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA
+>read_C_overlap_B	0
+TGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGAGTTGGGGATAGATAGATAGATGAGTGTAGATCCAACAACACATACA
+>read_D	1
+TATATATATAGCTAGCTAGCTAACTAGCTAGCATCGATCGATCGATC
diff --git a/tests/test-data/test-sweep-reads.fa b/tests/test-data/test-sweep-reads.fa
new file mode 100644
index 0000000000..15696127ab
--- /dev/null
+++ b/tests/test-data/test-sweep-reads.fa
@@ -0,0 +1,6 @@
+>read1_p0
+ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCGTAATCGTAAGCTCTGCCTA
+>read2_p0
+CTAGAGCTAGGCTAGCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTTTGCTCTGCTCGCGCTCGCTCG
+>read3_p1
+AACTAGCTAGCATCGATCGATCGATCTGCTGATCG

From 2b312a892725a5c7e7edacef33d28b086691abb9 Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Thu, 14 Nov 2013 15:06:49 -0500
Subject: [PATCH 080/140] added checking multi and orphan reads to sweep tests

---
 tests/test_scripts.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/tests/test_scripts.py b/tests/test_scripts.py
index 2a7e8dd4d7..0ab0ddc323 100644
--- a/tests/test_scripts.py
+++ b/tests/test_scripts.py
@@ -1136,13 +1136,21 @@ def test_sweep_reads_by_partition_buffered():
     
     out1 = os.path.join(in_dir, 'test_0.fa')
     out2 = os.path.join(in_dir, 'test_1.fa')
+    mout = os.path.join(in_dir, 'test_multi.fa')
+    oout = os.path.join(in_dir, 'test_orphaned.fa')
     
     print os.listdir(in_dir)
 
     seqs1 = set([r.name for r in screed.open(out1)])
     seqs2 = set([r.name for r in screed.open(out2)])
-    
+    seqsm = set([r.name for r in screed.open(mout)])
+    seqso = set([r.name for r in screed.open(oout)])    
+
     print seqs1
     print seqs2
+    print seqsm
+    print seqso
     assert seqs1 == set(['read1_p0\t0', 'read2_p0\t0'])
     assert seqs2 == set(['read3_p1\t1'])
+    assert seqsm == set(['read4_multi\t0\t1'])
+    assert seqso == set(['read5_orphan'])

From 51255b69f8ccb3099a9ee7a7aa684adbb6ba572d Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Thu, 14 Nov 2013 15:09:21 -0500
Subject: [PATCH 081/140] added case for missing file to sweep

---
 tests/test_scripts.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tests/test_scripts.py b/tests/test_scripts.py
index 0ab0ddc323..0008dccb6e 100644
--- a/tests/test_scripts.py
+++ b/tests/test_scripts.py
@@ -1131,9 +1131,13 @@ def test_sweep_reads_by_partition_buffered():
     shutil.copyfile(utils.get_test_data('test-sweep-contigs.fp'), contigfile)    
 
     script = scriptpath('sweep-reads-by-partition-buffered.py')
-    args = ['-k', '25', '-o', 'test', '-i', contigfile, readfile]
+    args = ['-k', '25', '-o', 'test', '-i', contigfile, readfile, 'junkfile.fa']
     status, out, err = runscript(script, args, in_dir)
     
+    # check if the bad file was skipped without issue
+    assert 'ERROR' in err
+    assert 'skipping' in err
+
     out1 = os.path.join(in_dir, 'test_0.fa')
     out2 = os.path.join(in_dir, 'test_1.fa')
     mout = os.path.join(in_dir, 'test_multi.fa')

From 6e0621973cdb2db224f30305adfe26ba330d5b77 Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Thu, 14 Nov 2013 15:16:22 -0500
Subject: [PATCH 082/140] removed old color sweep functions

---
 khmer/_khmermodule.cc | 53 -------------------------------------------
 1 file changed, 53 deletions(-)

diff --git a/khmer/_khmermodule.cc b/khmer/_khmermodule.cc
index 60ebc1cb12..c31df597ec 100644
--- a/khmer/_khmermodule.cc
+++ b/khmer/_khmermodule.cc
@@ -3978,58 +3978,6 @@ static PyObject * hashbits_sweep_color_neighborhood(PyObject * self, PyObject *
 }
 
 
-static PyObject * hashbits_sweep_color_neighborhood_old(PyObject * self, PyObject * args) {
-  khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
-  khmer::Hashbits * hb = me->hashbits;
-  
-  char * seq = NULL;
-  PyObject * break_on_stop_tags_o = NULL;
-  PyObject * stop_big_traversals_o = NULL;
-
-  if (!PyArg_ParseTuple(args, "s|OO", &seq,
-			&break_on_stop_tags_o,
-			&stop_big_traversals_o)) {
-    return NULL;
-  }
-
-  bool break_on_stop_tags = false;
-  if (break_on_stop_tags_o && PyObject_IsTrue(break_on_stop_tags_o)) {
-    break_on_stop_tags = true;
-  }
-  bool stop_big_traversals = false;
-  if (stop_big_traversals_o && PyObject_IsTrue(stop_big_traversals_o)) {
-    stop_big_traversals = true;
-  }
-  
-  if (strlen(seq) < hb->ksize()) {
-    return NULL;
-  }
-  
-  //std::pair<TagColorPtrPair::iterator, TagColorPtrPair::iterator> ret;
-  ColorPtrSet found_colors;
-  
-  bool exc_raised = false;
-  //Py_BEGIN_ALLOW_THREADS
-  try {
-    hb->sweep_sequence_for_colors(seq, found_colors, break_on_stop_tags, stop_big_traversals);
-  } catch (_khmer_signal &e) {
-    exc_raised = true;
-  }
-  //Py_END_ALLOW_THREADS
-  
-  if (exc_raised) return NULL;
-  
-  PyObject * x =  PyList_New(found_colors.size());
-  khmer::ColorPtrSet::const_iterator si;
-  unsigned long long i = 0;
-  for (si=found_colors.begin(); si!=found_colors.end(); ++si) {
-    PyList_SET_ITEM(x, i, Py_BuildValue("K", *(*si)));
-    i++;
-  }
-  
-  return x;
-}
-
 // Similar to find_all_tags, but returns tags in a way actually useable by python
 // need a tags_in_sequence iterator or function in c++ land for reuse in all
 // these functions
@@ -4199,7 +4147,6 @@ static PyMethodDef khmer_hashbits_methods[] = {
   { "get_median_count", hashbits_get_median_count, METH_VARARGS, "Get the median, average, and stddev of the k-mer counts in the string" },
   { "consume_fasta_and_tag_with_colors", hashbits_consume_fasta_and_tag_with_colors, METH_VARARGS, "" },
   { "sweep_color_neighborhood", hashbits_sweep_color_neighborhood, METH_VARARGS, "" },
-  { "sweep_color_neighborhood_old", hashbits_sweep_color_neighborhood_old, METH_VARARGS, "" },
   {"consume_partitioned_fasta_and_tag_with_colors", hashbits_consume_partitioned_fasta_and_tag_with_colors, METH_VARARGS, "" },
   {"sweep_tag_neighborhood", hashbits_sweep_tag_neighborhood, METH_VARARGS, "" },
   {"get_tag_colors", hashbits_get_tag_colors, METH_VARARGS, ""},

From d94f7074183c57f3951654162661e7b59d580b0b Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Thu, 14 Nov 2013 15:27:34 -0500
Subject: [PATCH 083/140] fix for stochastic color order in tests

---
 tests/test_scripts.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test_scripts.py b/tests/test_scripts.py
index 0008dccb6e..8f37289e67 100644
--- a/tests/test_scripts.py
+++ b/tests/test_scripts.py
@@ -1156,5 +1156,6 @@ def test_sweep_reads_by_partition_buffered():
     print seqso
     assert seqs1 == set(['read1_p0\t0', 'read2_p0\t0'])
     assert seqs2 == set(['read3_p1\t1'])
-    assert seqsm == set(['read4_multi\t0\t1'])
+    assert (seqsm == set(['read4_multi\t0\t1']) or \
+            seqsm == set(['read4_multi\t1\t0']))
     assert seqso == set(['read5_orphan'])

From 33c711ce7a920fc1783f57379817dfb8ee6d9089 Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Thu, 14 Nov 2013 15:30:45 -0500
Subject: [PATCH 084/140] updated multi file for standard name

---
 scripts/sweep-reads-by-partition-buffered.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py
index 82fbd4e33d..9fd08cb282 100755
--- a/scripts/sweep-reads-by-partition-buffered.py
+++ b/scripts/sweep-reads-by-partition-buffered.py
@@ -205,7 +205,7 @@ def main():
     output_buffer = ReadBuffer(max_buffers, buf_size, est, output_pref, outdir)
 
 	# file for multicolored reads, just keep this one around the whole time
-    multi_fn = os.path.join(outdir, '{}_multi.fp'.format(output_pref))
+    multi_fn = os.path.join(outdir, '{}_multi.fa'.format(output_pref))
     try:
         multi_fp = open(multi_fn, 'a')
     except IOError as e:

From 297ca6859a9273d9511b2c39a26cda0b9d8fd6b6 Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Thu, 14 Nov 2013 15:38:42 -0500
Subject: [PATCH 085/140] updated test data

---
 tests/test-data/test-sweep-reads.fa | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/test-data/test-sweep-reads.fa b/tests/test-data/test-sweep-reads.fa
index 15696127ab..293f2b9e7e 100644
--- a/tests/test-data/test-sweep-reads.fa
+++ b/tests/test-data/test-sweep-reads.fa
@@ -4,3 +4,7 @@ ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCGTAATCGTAAGCTCTGCCTA
 CTAGAGCTAGGCTAGCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTTTGCTCTGCTCGCGCTCGCTCG
 >read3_p1
 AACTAGCTAGCATCGATCGATCGATCTGCTGATCG
+>read4_multi
+CTAGAGCTAGGCTAGCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTTTGCTCTGCTCGCGCTCGCTCGCAACTAGCTAGCATCGATCGATCGATCTGCTGATCG
+>read5_orphan
+TGCTGATATATAGCTAGATATATATATAGCAGGCTGGTGTATCGCGCTAGCTAGCTAGCTTTCTCTTTTTTTTTTTTTAGGGA

From b65ce5cb766b65ca2339a0e73a58418cb4a11571 Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Thu, 14 Nov 2013 17:10:44 -0500
Subject: [PATCH 086/140] fixed horked mem usage estimate

---
 scripts/sweep-reads-by-partition-buffered.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py
index 9fd08cb282..adee5b06a9 100755
--- a/scripts/sweep-reads-by-partition-buffered.py
+++ b/scripts/sweep-reads-by-partition-buffered.py
@@ -185,7 +185,7 @@ def main():
         print >>sys.stderr, ''
         print >>sys.stderr, \
             'Estimated memory usage is {prod:.2g} bytes \
-            (n_hashes x min_hashsize / 8)'.format(prod=args.n_hashes*args.min_hashsize/8)
+            (n_hashes x min_hashsize / 8)'.format(prod=args.n_hashes*HT_SIZE/8)
         print >>sys.stderr, '-' * 8
     
     traversal_range = args.traversal_range

From 3758564073a627771131832b65fdde56ccb7e858 Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Mon, 18 Nov 2013 14:30:37 -0500
Subject: [PATCH 087/140] added error handling for too-short reads

---
 scripts/sweep-reads-by-partition-buffered.py | 50 +++++++++++++-------
 1 file changed, 33 insertions(+), 17 deletions(-)

diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py
index adee5b06a9..d7d6fe0f7f 100755
--- a/scripts/sweep-reads-by-partition-buffered.py
+++ b/scripts/sweep-reads-by-partition-buffered.py
@@ -81,6 +81,13 @@ def __init__(self, max_buffers, max_size, est_files, output_pref, outdir):
         self.num_write_errors = 0
         self.num_file_errors = 0
 
+        print >>sys.stderr, '''Init new ReadBuffer [
+        Max Buffers: {num_bufs}
+        Max Reads: {max_reads}
+        Est. Files: {est_files}
+        ]'''.format(num_bufs=self.max_buffers, max_reads=self.max_size,
+                    est_files=self.est_files)
+
     def add_seq(self, seq):
         color = seq.color
         if color in self.buffers:
@@ -96,9 +103,11 @@ def add_seq(self, seq):
             self.buffer_counts[color] = 1
         self.cur_reads += 1
         if self.cur_reads > self.max_size:
+            print >>sys.stderr, '** Reached max num reads...'
             self.flush_all()
         if len(self.buffers) > self.max_buffers:
             #self.clean_buffers(2)
+            print >>sys.stderr, '** Reached max num buffers...'
             self.flush_all()
     
     def flush_buffer(self, color):
@@ -107,7 +116,7 @@ def flush_buffer(self, color):
         try:
             outfp = open(fpath, 'a')
         except IOError as e:
-            print >>sys.stderr, 'ERROR: {e}'.format(e=e)
+            print >>sys.stderr, '!! ERROR: {e} !!'.format(e=e)
             print >>sys.stderr, '*** Failed to open {fn} for buffer flush'.format(fpath)
             self.num_file_errors += 1
         else:
@@ -121,7 +130,7 @@ def del_buffer(self, color):
         del self.buffers[color]
 
     def flush_all(self):
-        print >>sys.stderr, '** reached max buffer size, flushing all to files...'
+        print >>sys.stderr, '*** Flushing all to files...'
         for color in self.buffers:
             self.flush_buffer(color)
         colors = self.buffers.keys()
@@ -209,13 +218,13 @@ def main():
     try:
         multi_fp = open(multi_fn, 'a')
     except IOError as e:
-        print >>sys.stderr, 'ERROR: {e}'.format(e=e)
+        print >>sys.stderr, '!! ERROR: {e} !!'.format(e=e)
         print >>sys.stderr, '*** Failed to open {fn}'.format(multi_fn)
     orphaned_fn = os.path.join(outdir, '{}_orphaned.fa'.format(output_pref))
     try:
         orphaned_fp = open(orphaned_fn, 'a')
     except IOError as e:
-        print >>sys.stderr, 'ERROR: {e}'.format(e=e)
+        print >>sys.stderr, '!! ERROR: {e} !!'.format(e=e)
         print >>sys.stderr, '*** Failed to open {fn}'.format(orphaned_fn)
 
 	# consume the partitioned fasta with which to color the graph
@@ -239,7 +248,7 @@ def main():
         try:
             read_fp = screed.open(read_file)
         except IOError as e:
-            print >>sys.stderr, 'ERROR:', e
+            print >>sys.stderr, '!! ERROR: !!', e
             print >>sys.stderr, '*** Could not open {fn}, skipping...'.format(fn=read_file)
         else:
             for n, record in enumerate(read_fp):
@@ -253,22 +262,29 @@ def main():
                     start_t = time.clock()
                 seq = record.sequence
                 name = record.name
-                
-                colors = ht.sweep_color_neighborhood(seq, traversal_range)
-                color_number_dist.append(len(colors))
-                if colors:
-                    n_colored += 1
-                    if len(colors) > 1:
-                        multi_fp.write('>{}\t{}\n{}\n'.format(name, '\t'.join([str(c) for c in colors]), seq))
-                    else:
-                        output_buffer.add_seq(Seq(name, colors[0], seq))
+                try:
+                    colors = ht.sweep_color_neighborhood(seq, traversal_range)
+                except ValueError as e:
+                    print >>sys.stderr, '!! ERROR: {e} !!'.format(e=e)
+                    print >>sys.stderr, 'Read length less than k-mer size'
                 else:
-                    n_orphaned += 1
-                    orphaned_fp.write('>{}\n{}\n'.format(name, seq))
+                    color_number_dist.append(len(colors))
+                    if colors:
+                        n_colored += 1
+                        if len(colors) > 1:
+                            multi_fp.write('>{}\t{}\n{}\n'.format(
+                                name, '\t'.join([str(c) for c in colors]), seq))
+                        else:
+                            output_buffer.add_seq(Seq(name, colors[0], seq))
+                    else:
+                        n_orphaned += 1
+                        orphaned_fp.write('>{}\n{}\n'.format(name, seq))
+            print >>sys.stderr, '** End of file {fn}...'.format(fn=read_file)
             output_buffer.flush_all()
             read_fp.close()
 
     # gotta output anything left in the buffers at the end!
+    print >>sys.stderr, '** End of run...'
     output_buffer.flush_all() 
     total_t = time.clock() - total_t
 
@@ -277,7 +293,7 @@ def main():
     if debug:
         yep.stop()
     if output_buffer.num_write_errors > 0 or output_buffer.num_file_errors > 0:
-        print >>sys.stderr, 'WARNING: Sweep finished with errors!'
+        print >>sys.stderr, '! WARNING: Sweep finished with errors !'
         print >>sys.stderr, '** {writee} reads not written'.format(writee=output_buffer.num_write_errors)
         print >>sys.stderr, '** {filee} errors opening files'.format(filee=output_buffer.num_file_errors)
 

From 48a0e8cfbf83d74dd7d5ea5f2ef4669c77177093 Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Tue, 19 Nov 2013 11:34:36 -0500
Subject: [PATCH 088/140] fixed error in error handling code FACEPALM

---
 scripts/sweep-reads-by-partition-buffered.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py
index d7d6fe0f7f..793acbe236 100755
--- a/scripts/sweep-reads-by-partition-buffered.py
+++ b/scripts/sweep-reads-by-partition-buffered.py
@@ -117,7 +117,7 @@ def flush_buffer(self, color):
             outfp = open(fpath, 'a')
         except IOError as e:
             print >>sys.stderr, '!! ERROR: {e} !!'.format(e=e)
-            print >>sys.stderr, '*** Failed to open {fn} for buffer flush'.format(fpath)
+            print >>sys.stderr, '*** Failed to open {fn} for buffer flush'.format(fn=fpath)
             self.num_file_errors += 1
         else:
             for read in self.buffers[color]:
@@ -139,6 +139,7 @@ def flush_all(self):
         del colors
         assert self.cur_reads == 0
 
+    # experimental, doesn't work very well
     def clean_buffers(self, cutoff):
         print >>sys.stderr, '** flushing low-abundance buffers...'
         flushed = []
@@ -214,18 +215,18 @@ def main():
     output_buffer = ReadBuffer(max_buffers, buf_size, est, output_pref, outdir)
 
 	# file for multicolored reads, just keep this one around the whole time
-    multi_fn = os.path.join(outdir, '{}_multi.fa'.format(output_pref))
+    multi_fn = os.path.join(outdir, '{pref}_multi.fa'.format(pref=output_pref))
     try:
         multi_fp = open(multi_fn, 'a')
     except IOError as e:
         print >>sys.stderr, '!! ERROR: {e} !!'.format(e=e)
-        print >>sys.stderr, '*** Failed to open {fn}'.format(multi_fn)
-    orphaned_fn = os.path.join(outdir, '{}_orphaned.fa'.format(output_pref))
+        print >>sys.stderr, '*** Failed to open {fn}'.format(fn=multi_fn)
+    orphaned_fn = os.path.join(outdir, '{pref}_orphaned.fa'.format(pref=output_pref))
     try:
         orphaned_fp = open(orphaned_fn, 'a')
     except IOError as e:
         print >>sys.stderr, '!! ERROR: {e} !!'.format(e=e)
-        print >>sys.stderr, '*** Failed to open {fn}'.format(orphaned_fn)
+        print >>sys.stderr, '*** Failed to open {fn}'.format(fn=orphaned_fn)
 
 	# consume the partitioned fasta with which to color the graph
     ht = khmer.new_hashbits(K, HT_SIZE, N_HT)

From b43328b71359daea6a64e1f17f0619a346c6a6f6 Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Tue, 19 Nov 2013 14:40:49 -0500
Subject: [PATCH 089/140] rewrote buffering code for better performance and
 simplicity, changed references to coloring to use labeling

---
 scripts/sweep-reads-by-partition-buffered.py | 199 +++++++++----------
 1 file changed, 91 insertions(+), 108 deletions(-)

diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py
index 793acbe236..13de17aee4 100755
--- a/scripts/sweep-reads-by-partition-buffered.py
+++ b/scripts/sweep-reads-by-partition-buffered.py
@@ -26,6 +26,8 @@
 import time
 import khmer
 from khmer.counting_args import build_construct_args, DEFAULT_MIN_HASHSIZE
+from collections import namedtuple as nt
+
 
 DEFAULT_NUM_BUFFERS=50000
 DEFAULT_BUFFER_SIZE=1000000
@@ -35,34 +37,48 @@
 
 MIN_HSIZE=4e7
 MIN_KSIZE=21
-
-# little class to store sequence information for the buffering class
-class Seq:
-    def __init__(self, name, color, seq):
-        self.name = name
-        self.color = color
-        self.seq = seq
     
-    def __repr__(self):
-        return '''>{name}\t{color}\n
-{seq}\n'''.format(name=self.name, color=self.color, seq=self.seq)
-
-    def write(self, fp):
-        try:
-            fp.write('>{}\t{}\n{}\n'.format(self.name, self.color, self.seq))
-        except IOError:
-            print >>sys.stderr, 'Error writing {seq} to {fn}'.format(seq=self, fn=fp)
-            return 1
-        else:
-            return 0
+def fmt_fasta(name, seq, labels=[]):
+        return '>{name}\t{labels}\n{seq}'.format(name=name, 
+            labels='\t'.join([str(l) for l in labels]), seq=seq)
 
-# stores reads in memory and flushes them to their approriate files
+def write_seq(fp, name, seq, labels=[]):
+    try:
+        fp.write(fmt_fasta(name, seq, labels=labels))
+    except IOError:
+        print >>sys.stderr, 'Error writing {read}'.format(
+                read=fmt_fasta(name, seq, labels=labels))
+        return 1
+    else:
+        return 0
+
+# stores reads in memory and flushes them to their appropriate files
 # when certain criteria are met
 # Basic idea is to buffer some number of reads in memory, then dump them all at once
 # Hope that each file acrues, on average, BUFFER_SIZE / NUM_PARTS reads
-# ie, if we buffer 1000000 reads, and we have 100000 partitions or colors,
+# ie, if we buffer 1000000 reads, and we have 100000 partitions or labels,
 # we should expect the mean buffer size to be 10 reads
 class ReadBuffer:
+    
+    def __init__(self):
+        self.buf = []
+
+    def push(self, seq_str):
+        self.buf.append(seq_str)
+
+    def flush(self):
+        return '\n'.join(self.buf)
+
+    def is_full(self, full):
+        if len(self.buf) >= full:
+            return True
+        else:
+            return False
+
+    def __len__(self):
+        return len(self.buf)
+
+class ReadBufferManager:
 
     def __init__(self, max_buffers, max_size, est_files, output_pref, outdir):
         self.buffers = {}
@@ -85,33 +101,12 @@ def __init__(self, max_buffers, max_size, est_files, output_pref, outdir):
         Max Buffers: {num_bufs}
         Max Reads: {max_reads}
         Est. Files: {est_files}
+        Buffer flush: {buf_flush}
         ]'''.format(num_bufs=self.max_buffers, max_reads=self.max_size,
-                    est_files=self.est_files)
-
-    def add_seq(self, seq):
-        color = seq.color
-        if color in self.buffers:
-            count = self.buffer_counts[color]
-            self.buffers[color].append(seq)
-            self.buffer_counts[color] += 1
-            if count > self.buffer_flush:
-                self.flush_buffer(color)
-                self.del_buffer(color)
+                    est_files=self.est_files, buf_flush=self.buffer_flush)
 
-        else:
-            self.buffers[color] = [seq]
-            self.buffer_counts[color] = 1
-        self.cur_reads += 1
-        if self.cur_reads > self.max_size:
-            print >>sys.stderr, '** Reached max num reads...'
-            self.flush_all()
-        if len(self.buffers) > self.max_buffers:
-            #self.clean_buffers(2)
-            print >>sys.stderr, '** Reached max num buffers...'
-            self.flush_all()
-    
-    def flush_buffer(self, color):
-        fn = '{}_{}.fa'.format(self.output_pref, color)
+    def flush_buffer(self, buf_id):
+        fn = '{}_{}.fa'.format(self.output_pref, buf_id)
         fpath = os.path.join(self.outdir, fn)
         try:
             outfp = open(fpath, 'a')
@@ -120,37 +115,38 @@ def flush_buffer(self, color):
             print >>sys.stderr, '*** Failed to open {fn} for buffer flush'.format(fn=fpath)
             self.num_file_errors += 1
         else:
-            for read in self.buffers[color]:
-                self.num_write_errors += read.write(outfp)
-                self.cur_reads -= 1
+            buf = self.buffers[buf_id]
+            outfp.write(buf.flush())
+            self.cur_reads -= len(buf)
             outfp.close()
+            del self.buffers[buf_id]
 
-    def del_buffer(self, color):
-        del self.buffer_counts[color]
-        del self.buffers[color]
+    def queue(self, seq_str, buf_id):
+        if buf_id in self.buffers:
+            self.buffers[buf_id].push(seq_str)
+            if self.buffers[buf_id].is_full(self.buffer_flush):
+                self.flush_buffer(buf_id)
+        else:
+            new_buf = ReadBuffer()
+            new_buf.push(seq_str)
+            self.buffers[buf_id] = new_buf
+            
+        self.cur_reads += 1
+        if self.cur_reads > self.max_size:
+            print >>sys.stderr, '** Reached max num reads...'
+            self.flush_all()
+        if len(self.buffers) > self.max_buffers:
+            #self.clean_buffers(2)
+            print >>sys.stderr, '** Reached max num buffers...'
+            self.flush_all()
 
     def flush_all(self):
         print >>sys.stderr, '*** Flushing all to files...'
-        for color in self.buffers:
-            self.flush_buffer(color)
-        colors = self.buffers.keys()
-        for color in colors:
-            self.del_buffer(color)
-        del colors
+        buf_ids = self.buffers.keys()
+        for buf_id in buf_ids:
+            self.flush_buffer(buf_id)
         assert self.cur_reads == 0
 
-    # experimental, doesn't work very well
-    def clean_buffers(self, cutoff):
-        print >>sys.stderr, '** flushing low-abundance buffers...'
-        flushed = []
-        for color in self.buffers:
-            if self.buffer_counts[color] < cutoff:
-                self.flush_buffer(color)
-                flushed.append(color)
-        for color in flushed:
-            self.del_buffer(color)
-        del flushed
-
 def main():
 
     parser = build_construct_args()
@@ -212,39 +208,25 @@ def main():
     if debug:
         import yep
 
-    output_buffer = ReadBuffer(max_buffers, buf_size, est, output_pref, outdir)
-
-	# file for multicolored reads, just keep this one around the whole time
-    multi_fn = os.path.join(outdir, '{pref}_multi.fa'.format(pref=output_pref))
-    try:
-        multi_fp = open(multi_fn, 'a')
-    except IOError as e:
-        print >>sys.stderr, '!! ERROR: {e} !!'.format(e=e)
-        print >>sys.stderr, '*** Failed to open {fn}'.format(fn=multi_fn)
-    orphaned_fn = os.path.join(outdir, '{pref}_orphaned.fa'.format(pref=output_pref))
-    try:
-        orphaned_fp = open(orphaned_fn, 'a')
-    except IOError as e:
-        print >>sys.stderr, '!! ERROR: {e} !!'.format(e=e)
-        print >>sys.stderr, '*** Failed to open {fn}'.format(fn=orphaned_fn)
+    output_buffer = ReadBufferManager(max_buffers, buf_size, est, output_pref, outdir)
 
-	# consume the partitioned fasta with which to color the graph
+	# consume the partitioned fasta with which to label the graph
     ht = khmer.new_hashbits(K, HT_SIZE, N_HT)
     print >>sys.stderr, 'consuming fastp...'
     if debug:
         yep.start(debug)
     ht.consume_partitioned_fasta_and_tag_with_colors(input_fastp)
 
-    color_number_dist = []
+    label_number_dist = []
     
     n_orphaned = 0
-    n_colored = 0
-    n_mcolored = 0
+    n_labeled = 0
+    n_mlabeled = 0
 
     total_t = time.clock()
     start_t = time.clock()
     for read_file in input_files:
-        print >>sys.stderr,'** sweeping {read_file} for colors...'.format(read_file=read_file)
+        print >>sys.stderr,'** sweeping {read_file} for labels...'.format(read_file=read_file)
         file_t = 0.0
         try:
             read_fp = screed.open(read_file)
@@ -257,29 +239,30 @@ def main():
                     end_t = time.clock()
                     batch_t = end_t - start_t
                     file_t += batch_t
-                    print >>sys.stderr, '\tswept {n} reads [{nc} colored, {no} orphaned] \
+                    print >>sys.stderr, '\tswept {n} reads [{nc} labeled, {no} orphaned] \
                                         ** {sec}s ({sect}s total)' \
-                                        .format(n=n, nc=n_colored, no=n_orphaned, sec=batch_t, sect=file_t)
+                                        .format(n=n, nc=n_labeled, no=n_orphaned, sec=batch_t, sect=file_t)
                     start_t = time.clock()
                 seq = record.sequence
                 name = record.name
                 try:
-                    colors = ht.sweep_color_neighborhood(seq, traversal_range)
+                    labels = ht.sweep_color_neighborhood(seq, traversal_range)
                 except ValueError as e:
                     print >>sys.stderr, '!! ERROR: {e} !!'.format(e=e)
                     print >>sys.stderr, 'Read length less than k-mer size'
                 else:
-                    color_number_dist.append(len(colors))
-                    if colors:
-                        n_colored += 1
-                        if len(colors) > 1:
-                            multi_fp.write('>{}\t{}\n{}\n'.format(
-                                name, '\t'.join([str(c) for c in colors]), seq))
+                    seq_str = fmt_fasta(name, seq, labels)
+                    label_number_dist.append(len(labels))
+                    if labels:
+                        n_labeled += 1
+                        if len(labels) > 1:
+                            output_buffer.queue(seq_str, 'multi')
+                            n_mlabeled += 1
                         else:
-                            output_buffer.add_seq(Seq(name, colors[0], seq))
+                            output_buffer.queue(seq_str, labels[0])
                     else:
                         n_orphaned += 1
-                        orphaned_fp.write('>{}\n{}\n'.format(name, seq))
+                        output_buffer.queue(seq_str, 'orphaned')
             print >>sys.stderr, '** End of file {fn}...'.format(fn=read_file)
             output_buffer.flush_all()
             read_fp.close()
@@ -298,14 +281,14 @@ def main():
         print >>sys.stderr, '** {writee} reads not written'.format(writee=output_buffer.num_write_errors)
         print >>sys.stderr, '** {filee} errors opening files'.format(filee=output_buffer.num_file_errors)
 
-    print >>sys.stderr, 'swept {n_reads} for colors...'.format(n_reads=n_colored+n_mcolored+n_orphaned)
-    print >>sys.stderr, '...with {nc} colored and {no} orphaned'.format(
-                                    nc=n_colored, no=n_orphaned)
-    print >>sys.stderr, '...and {nmc} multicolored'.format(nmc=n_mcolored)
+    print >>sys.stderr, 'swept {n_reads} for labels...'.format(n_reads=n_labeled+n_mlabeled+n_orphaned)
+    print >>sys.stderr, '...with {nc} labeled and {no} orphaned'.format(
+                                    nc=n_labeled, no=n_orphaned)
+    print >>sys.stderr, '...and {nmc} multilabeled'.format(nmc=n_mlabeled)
     
-    print >>sys.stderr, '** outputting color number distribution...'
-    with open('color_dist.txt', 'wb') as outfp:
-        for nc in color_number_dist:
+    print >>sys.stderr, '** outputting label number distribution...'
+    with open('label_dist.txt', 'wb') as outfp:
+        for nc in label_number_dist:
             outfp.write('{nc}\n'.format(nc=nc))
     
 if __name__ == '__main__':

From a915e7b67b7a315831a23a057c75e5c727100c5d Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Tue, 19 Nov 2013 14:45:40 -0500
Subject: [PATCH 090/140] removed deprecated reference to fps

---
 scripts/sweep-reads-by-partition-buffered.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py
index 13de17aee4..0df68b2861 100755
--- a/scripts/sweep-reads-by-partition-buffered.py
+++ b/scripts/sweep-reads-by-partition-buffered.py
@@ -272,8 +272,6 @@ def main():
     output_buffer.flush_all() 
     total_t = time.clock() - total_t
 
-    multi_fp.close()
-    orphaned_fp.close()
     if debug:
         yep.stop()
     if output_buffer.num_write_errors > 0 or output_buffer.num_file_errors > 0:

From 750cc36511704db33dd0f456e6deff47bc0bced1 Mon Sep 17 00:00:00 2001
From: CS Welcher <cs.welcher@gmail.com>
Date: Tue, 19 Nov 2013 15:44:54 -0500
Subject: [PATCH 091/140] changed all names using color to use label

---
 khmer/_khmermodule.cc          |  64 +++++++++----------
 lib/hashtable.cc               | 104 +++++++++++++++---------------
 lib/hashtable.hh               | 112 ++++++++++++++++-----------------
 lib/khmer.hh                   |  14 ++---
 tests/test-data/test-labels.fa |   8 +++
 5 files changed, 155 insertions(+), 147 deletions(-)
 create mode 100644 tests/test-data/test-labels.fa

diff --git a/khmer/_khmermodule.cc b/khmer/_khmermodule.cc
index c31df597ec..70f6fc7feb 100644
--- a/khmer/_khmermodule.cc
+++ b/khmer/_khmermodule.cc
@@ -3818,21 +3818,21 @@ static PyObject * hashbits_get_median_count(PyObject * self, PyObject * args)
   return Py_BuildValue("iff", med, average, stddev);
 }
 
-static PyObject * hashbits_get_color_dict(PyObject * self, PyObject * args) {
+static PyObject * hashbits_get_label_dict(PyObject * self, PyObject * args) {
   khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
   khmer::Hashbits * hb = me->hashbits;
   
   PyObject * d = PyDict_New();
-  khmer::ColorPtrMap::iterator it;
+  khmer::LabelPtrMap::iterator it;
   
-  for (it = hb->color_ptrs.begin(); it!=hb->color_ptrs.end(); ++it) {
+  for (it = hb->label_ptrs.begin(); it!=hb->label_ptrs.end(); ++it) {
     PyDict_SetItem(d, Py_BuildValue("K", it->first), Py_BuildValue("K", it->second));
   }
   
   return d;
 }
 
-static PyObject * hashbits_consume_fasta_and_tag_with_colors(PyObject * self, PyObject * args)
+static PyObject * hashbits_consume_fasta_and_tag_with_labels(PyObject * self, PyObject * args)
 {
   khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
   khmer::Hashbits * hb = me->hashbits;
@@ -3852,7 +3852,7 @@ static PyObject * hashbits_consume_fasta_and_tag_with_colors(PyObject * self, Py
   
   //Py_BEGIN_ALLOW_THREADS
   try {
-    hb->consume_fasta_and_tag_with_colors(filename, total_reads, n_consumed,
+    hb->consume_fasta_and_tag_with_labels(filename, total_reads, n_consumed,
                                                 _report_fn, callback_obj);
   } catch (_khmer_signal &e) {
     exc_raised = true;
@@ -3864,7 +3864,7 @@ static PyObject * hashbits_consume_fasta_and_tag_with_colors(PyObject * self, Py
   
 }
 
-static PyObject * hashbits_consume_partitioned_fasta_and_tag_with_colors(
+static PyObject * hashbits_consume_partitioned_fasta_and_tag_with_labels(
                                             PyObject * self, PyObject * args)
 {
   khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
@@ -3883,7 +3883,7 @@ static PyObject * hashbits_consume_partitioned_fasta_and_tag_with_colors(
   unsigned int total_reads;
 
   try {
-    hashbits->consume_partitioned_fasta_and_tag_with_colors(filename, 
+    hashbits->consume_partitioned_fasta_and_tag_with_labels(filename, 
     total_reads, n_consumed, _report_fn, callback_obj);
   } catch (_khmer_signal &e) {
     return NULL;
@@ -3892,7 +3892,7 @@ static PyObject * hashbits_consume_partitioned_fasta_and_tag_with_colors(
   return Py_BuildValue("iK", total_reads, n_consumed);
 }
 
-static PyObject * hashbits_consume_sequence_and_tag_with_colors(PyObject * self, PyObject * args) {
+static PyObject * hashbits_consume_sequence_and_tag_with_labels(PyObject * self, PyObject * args) {
   khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
   khmer::Hashbits * hb = me->hashbits;
   
@@ -3903,12 +3903,12 @@ static PyObject * hashbits_consume_sequence_and_tag_with_colors(PyObject * self,
   }
   
   unsigned long long n_consumed = 0;
-  khmer::Color * the_color = hb->check_and_allocate_color(c);
+  khmer::Label * the_label = hb->check_and_allocate_label(c);
 
   try { 
   //if (hb->check_and_normalize_read(seq)) {
     
-    hb->consume_sequence_and_tag_with_colors(seq, n_consumed, *the_color);
+    hb->consume_sequence_and_tag_with_labels(seq, n_consumed, *the_label);
   //}
   } catch (_khmer_signal &e) {
     return NULL;
@@ -3916,7 +3916,7 @@ static PyObject * hashbits_consume_sequence_and_tag_with_colors(PyObject * self,
   return Py_BuildValue("L", n_consumed);
 }
 
-static PyObject * hashbits_sweep_color_neighborhood(PyObject * self, PyObject * args) {
+static PyObject * hashbits_sweep_label_neighborhood(PyObject * self, PyObject * args) {
   khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
   khmer::Hashbits * hb = me->hashbits;
   
@@ -3949,14 +3949,14 @@ static PyObject * hashbits_sweep_color_neighborhood(PyObject * self, PyObject *
     return NULL;
   }
   
-  //std::pair<TagColorPtrPair::iterator, TagColorPtrPair::iterator> ret;
-  ColorPtrSet found_colors;
+  //std::pair<TagLabelPtrPair::iterator, TagLabelPtrPair::iterator> ret;
+  LabelPtrSet found_labels;
   
   bool exc_raised = false;
   unsigned int num_traversed = 0;
   //Py_BEGIN_ALLOW_THREADS
   try {
-    num_traversed = hb->sweep_color_neighborhood(seq, found_colors, range, break_on_stop_tags, stop_big_traversals);
+    num_traversed = hb->sweep_label_neighborhood(seq, found_labels, range, break_on_stop_tags, stop_big_traversals);
   } catch (_khmer_signal &e) {
     exc_raised = true;
   }
@@ -3966,10 +3966,10 @@ static PyObject * hashbits_sweep_color_neighborhood(PyObject * self, PyObject *
   
   if (exc_raised) return NULL;
   
-  PyObject * x =  PyList_New(found_colors.size());
-  khmer::ColorPtrSet::const_iterator si;
+  PyObject * x =  PyList_New(found_labels.size());
+  khmer::LabelPtrSet::const_iterator si;
   unsigned long long i = 0;
-  for (si=found_colors.begin(); si!=found_colors.end(); ++si) {
+  for (si=found_labels.begin(); si!=found_labels.end(); ++si) {
     PyList_SET_ITEM(x, i, Py_BuildValue("K", *(*si)));
     i++;
   }
@@ -4038,7 +4038,7 @@ static PyObject * hashbits_sweep_tag_neighborhood(PyObject * self, PyObject *arg
 }
 
 
-static PyObject * hashbits_get_tag_colors(PyObject * self, PyObject * args) {
+static PyObject * hashbits_get_tag_labels(PyObject * self, PyObject * args) {
   
   khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
   khmer::Hashbits * hashbits = me->hashbits;
@@ -4049,14 +4049,14 @@ static PyObject * hashbits_get_tag_colors(PyObject * self, PyObject * args) {
     return NULL;
   }
   
-  khmer::ColorPtrSet colors;
+  khmer::LabelPtrSet labels;
   
-  colors = hashbits->get_tag_colors(tag);
+  labels = hashbits->get_tag_labels(tag);
   
-  PyObject * x =  PyList_New(colors.size());
-  khmer::ColorPtrSet::const_iterator si;
+  PyObject * x =  PyList_New(labels.size());
+  khmer::LabelPtrSet::const_iterator si;
   unsigned long long i = 0;
-  for (si=colors.begin(); si!=colors.end(); ++si) {
+  for (si=labels.begin(); si!=labels.end(); ++si) {
     //std::string kmer_s = _revhash(*si, hashbits->ksize());
     PyList_SET_ITEM(x, i, Py_BuildValue("K", *(*si)));
     i++;
@@ -4065,7 +4065,7 @@ static PyObject * hashbits_get_tag_colors(PyObject * self, PyObject * args) {
   return x;
 }
 
-static PyObject * hashbits_n_colors(PyObject * self, PyObject * args)
+static PyObject * hashbits_n_labels(PyObject * self, PyObject * args)
 {
   khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
   khmer::Hashbits * hashbits = me->hashbits;
@@ -4074,7 +4074,7 @@ static PyObject * hashbits_n_colors(PyObject * self, PyObject * args)
     return NULL;
   }
 
-  return PyInt_FromLong(hashbits->n_colors());
+  return PyInt_FromLong(hashbits->n_labels());
 }
 
 static PyMethodDef khmer_hashbits_methods[] = {
@@ -4145,14 +4145,14 @@ static PyMethodDef khmer_hashbits_methods[] = {
   { "traverse_from_tags", hashbits_traverse_from_tags, METH_VARARGS, "" },
   { "repartition_largest_partition", hashbits_repartition_largest_partition, METH_VARARGS, "" },
   { "get_median_count", hashbits_get_median_count, METH_VARARGS, "Get the median, average, and stddev of the k-mer counts in the string" },
-  { "consume_fasta_and_tag_with_colors", hashbits_consume_fasta_and_tag_with_colors, METH_VARARGS, "" },
-  { "sweep_color_neighborhood", hashbits_sweep_color_neighborhood, METH_VARARGS, "" },
-  {"consume_partitioned_fasta_and_tag_with_colors", hashbits_consume_partitioned_fasta_and_tag_with_colors, METH_VARARGS, "" },
+  { "consume_fasta_and_tag_with_labels", hashbits_consume_fasta_and_tag_with_labels, METH_VARARGS, "" },
+  { "sweep_label_neighborhood", hashbits_sweep_label_neighborhood, METH_VARARGS, "" },
+  {"consume_partitioned_fasta_and_tag_with_labels", hashbits_consume_partitioned_fasta_and_tag_with_labels, METH_VARARGS, "" },
   {"sweep_tag_neighborhood", hashbits_sweep_tag_neighborhood, METH_VARARGS, "" },
-  {"get_tag_colors", hashbits_get_tag_colors, METH_VARARGS, ""},
-  {"consume_sequence_and_tag_with_colors", hashbits_consume_sequence_and_tag_with_colors, METH_VARARGS, "" },
-  {"n_colors", hashbits_n_colors, METH_VARARGS, ""},
-  {"get_color_dict", hashbits_get_color_dict, METH_VARARGS, "" },
+  {"get_tag_labels", hashbits_get_tag_labels, METH_VARARGS, ""},
+  {"consume_sequence_and_tag_with_labels", hashbits_consume_sequence_and_tag_with_labels, METH_VARARGS, "" },
+  {"n_labels", hashbits_n_labels, METH_VARARGS, ""},
+  {"get_label_dict", hashbits_get_label_dict, METH_VARARGS, "" },
  
   {NULL, NULL, 0, NULL}           /* sentinel */
 };
diff --git a/lib/hashtable.cc b/lib/hashtable.cc
index 5a9eb9624d..c8c075143c 100644
--- a/lib/hashtable.cc
+++ b/lib/hashtable.cc
@@ -1952,7 +1952,7 @@ void Hashtable::extract_unique_paths(std::string seq,
  */
 
 void
-Hashtable::consume_fasta_and_tag_with_colors(
+Hashtable::consume_fasta_and_tag_with_labels(
   std:: string const  &filename,
   unsigned int	      &total_reads, unsigned long long	&n_consumed,
   CallbackFn	      callback,	    void *		callback_data
@@ -1968,7 +1968,7 @@ Hashtable::consume_fasta_and_tag_with_colors(
   );
 
 
-  consume_fasta_and_tag_with_colors(
+  consume_fasta_and_tag_with_labels(
     parser,
     total_reads, n_consumed,
     callback, callback_data
@@ -1978,7 +1978,7 @@ Hashtable::consume_fasta_and_tag_with_colors(
 }
 
 void
-Hashtable::consume_fasta_and_tag_with_colors(
+Hashtable::consume_fasta_and_tag_with_labels(
     read_parsers:: IParser *  parser,
     unsigned int		    &total_reads,   unsigned long long	&n_consumed,
     CallbackFn		    callback,	    void *		callback_data
@@ -2001,9 +2001,9 @@ Hashtable::consume_fasta_and_tag_with_colors(
       "Starting trace of 'consume_fasta_and_tag'....\n"
     );
     
-    Color _tag_color = 0;
+    Label _tag_label = 0;
 
-    Color * the_color;
+    Label * the_label;
     // Iterate through the reads and consume their k-mers.
     while (!parser->is_complete( ))
     {
@@ -2014,11 +2014,11 @@ Hashtable::consume_fasta_and_tag_with_colors(
       if (check_and_normalize_read( read.sequence ))
       {
         // TODO: make threadsafe!
-        the_color = check_and_allocate_color(_tag_color);
-        consume_sequence_and_tag_with_colors( read.sequence,
+        the_label = check_and_allocate_label(_tag_label);
+        consume_sequence_and_tag_with_labels( read.sequence,
 					      this_n_consumed,
-					      *the_color );
-	    _tag_color++;
+					      *the_label );
+	    _tag_label++;
 
   #ifdef WITH_INTERNAL_METRICS
         hasher.pmetrics.start_timers( );
@@ -2064,7 +2064,7 @@ Hashtable::consume_fasta_and_tag_with_colors(
 
   }
 
-void Hashtable::consume_partitioned_fasta_and_tag_with_colors(const std::string &filename,
+void Hashtable::consume_partitioned_fasta_and_tag_with_labels(const std::string &filename,
 					  unsigned int &total_reads,
 					  unsigned long long &n_consumed,
 					  CallbackFn callback,
@@ -2085,7 +2085,7 @@ void Hashtable::consume_partitioned_fasta_and_tag_with_colors(const std::string
   //
   // iterate through the FASTA file & consume the reads.
   //
-  Color * c;
+  Label * c;
   PartitionID p;
   while(!parser->is_complete())  {
     read = parser->get_next_read();
@@ -2094,9 +2094,9 @@ void Hashtable::consume_partitioned_fasta_and_tag_with_colors(const std::string
     if (check_and_normalize_read(seq)) {
       // First, figure out what the partition is (if non-zero), and save that.
       p = _parse_partition_id(read.name);
-      c = check_and_allocate_color(p);
+      c = check_and_allocate_label(p);
 
-      consume_sequence_and_tag_with_colors( seq,
+      consume_sequence_and_tag_with_labels( seq,
 					      n_consumed,
 					      *c );
     }
@@ -2107,7 +2107,7 @@ void Hashtable::consume_partitioned_fasta_and_tag_with_colors(const std::string
     // run callback, if specified
     if (total_reads % CALLBACK_PERIOD == 0 && callback) {
       try {
-        callback("consume_partitioned_fasta_and_tag_with_colors", callback_data, 
+        callback("consume_partitioned_fasta_and_tag_with_labels", callback_data, 
         total_reads, n_consumed);
       } catch (...) {
 	delete parser;
@@ -2116,24 +2116,24 @@ void Hashtable::consume_partitioned_fasta_and_tag_with_colors(const std::string
     }
   }
 
-  // @cswelcher TODO: check that deallocate ColorPtrMap is correct
+  // @cswelcher TODO: check that deallocate LabelPtrMap is correct
   delete parser;
 }
 
 // @cswelcher: double-check -- is it valid to pull the address from a reference?
-void Hashtable::link_tag_and_color(HashIntoType& kmer, Color& kmer_color) {
-  tag_colors.insert(TagColorPtrPair(kmer, &kmer_color));
-  color_tag_ptrs.insert(ColorTagPtrPair(kmer_color, &kmer));
+void Hashtable::link_tag_and_label(HashIntoType& kmer, Label& kmer_label) {
+  tag_labels.insert(TagLabelPtrPair(kmer, &kmer_label));
+  label_tag_ptrs.insert(LabelTagPtrPair(kmer_label, &kmer));
 }
 
-/* This is essentially the same code as above, only it assigns colors to the
- * tags through multimap TagColorMap defined in hashtable.hh, declared in
+/* This is essentially the same code as above, only it assigns labels to the
+ * tags through multimap TagLabelMap defined in hashtable.hh, declared in
  * hashbits.hh
- * @cswelcher TODO: should I instead send in the pointer to the new color?
+ * @cswelcher TODO: should I instead send in the pointer to the new label?
  */
-void Hashtable::consume_sequence_and_tag_with_colors(const std::string& seq,
+void Hashtable::consume_sequence_and_tag_with_labels(const std::string& seq,
 					unsigned long long& n_consumed,
-					Color& current_color,
+					Label& current_label,
 					SeenSet * found_tags)
   {
     bool is_new_kmer;
@@ -2160,12 +2160,12 @@ void Hashtable::consume_sequence_and_tag_with_colors(const std::string& seq,
         if (kmer_tagged) {
 	      since = 1;
 	      
-	      // Coloring code
+	      // Labeling code
 	      // TODO: MAKE THREADSAFE!
 	      
-	      if (!_cmap_contains_color(tag_colors, kmer, current_color)) {
+	      if (!_cmap_contains_label(tag_labels, kmer, current_label)) {
 	        ACQUIRE_TAG_COLORS_SPIN_LOCK
-	        link_tag_and_color(kmer, current_color);
+	        link_tag_and_label(kmer, current_label);
 	        RELEASE_TAG_COLORS_SPIN_LOCK
 	      }
 	      if (found_tags) {
@@ -2187,10 +2187,10 @@ void Hashtable::consume_sequence_and_tag_with_colors(const std::string& seq,
         all_tags.insert(kmer);
         RELEASE_ALL_TAGS_SPIN_LOCK
         
-        // Coloring code
+        // Labeling code
         // TODO: MAKE THREADSAFE!
         ACQUIRE_TAG_COLORS_SPIN_LOCK
-        link_tag_and_color(kmer, current_color);
+        link_tag_and_label(kmer, current_label);
         RELEASE_TAG_COLORS_SPIN_LOCK
         
         if (found_tags) { found_tags->insert(kmer); }
@@ -2204,24 +2204,24 @@ void Hashtable::consume_sequence_and_tag_with_colors(const std::string& seq,
       all_tags.insert(kmer);	// insert the last k-mer, too.
       RELEASE_ALL_TAGS_SPIN_LOCK
       
-      // Color code: TODO: MAKE THREADSAFE!
-      link_tag_and_color(kmer, current_color);
+      // Label code: TODO: MAKE THREADSAFE!
+      link_tag_and_label(kmer, current_label);
       
       if (found_tags) { found_tags->insert(kmer); }
     }
   }
 /*
- * Find all colors associated with the sequence
+ * Find all labels associated with the sequence
  * For now, check /every/ k-mer with find_all_tags
  * THIS SUCKS AND IT'S YOUR FAULT @CTB
  */
-unsigned int Hashtable::sweep_sequence_for_colors(const std::string& seq,
-					ColorPtrSet& found_colors,
+unsigned int Hashtable::sweep_sequence_for_labels(const std::string& seq,
+					LabelPtrSet& found_labels,
 					bool break_on_stoptags,
 					bool stop_big_traversals) {
 					
     SeenSet tagged_kmers;
-    //ColorPtrSet found_colors;
+    //LabelPtrSet found_labels;
     
     HashIntoType kmer_f, kmer_r, kmer;
     
@@ -2239,14 +2239,14 @@ unsigned int Hashtable::sweep_sequence_for_colors(const std::string& seq,
       if (get_count(uniqify_rc(kmer_f,kmer_r))) {
         partition->find_all_tags(kmer_f, kmer_r, tagged_kmers,
                    all_tags, break_on_stoptags, stop_big_traversals);
-        traverse_colors_and_resolve(tagged_kmers, found_colors);
+        traverse_labels_and_resolve(tagged_kmers, found_labels);
       }
     }
     return traversed_kmers.size();
 }
 
-unsigned int Hashtable::sweep_color_neighborhood(const std::string& seq,
-                                                  ColorPtrSet& found_colors,
+unsigned int Hashtable::sweep_label_neighborhood(const std::string& seq,
+                                                  LabelPtrSet& found_labels,
                                                   unsigned int range,
                                                   bool break_on_stoptags,
                                                   bool stop_big_traversals) {
@@ -2255,7 +2255,7 @@ unsigned int Hashtable::sweep_color_neighborhood(const std::string& seq,
     unsigned int num_traversed;
     num_traversed = partition->sweep_for_tags(seq, tagged_kmers, all_tags, 
                               range, break_on_stoptags, stop_big_traversals);
-    traverse_colors_and_resolve(tagged_kmers, found_colors);
+    traverse_labels_and_resolve(tagged_kmers, found_labels);
     //printf("range=%u ", range);
     if (range == 0) {
       assert(num_traversed == seq.length()-ksize()+1);
@@ -2264,31 +2264,31 @@ unsigned int Hashtable::sweep_color_neighborhood(const std::string& seq,
     return num_traversed;
 }
 
-ColorPtrSet Hashtable::get_tag_colors(const HashIntoType& tag) {
-  ColorPtrSet colors;
-  unsigned int num_colors;
-  _get_tag_colors(tag, tag_colors, colors);
-  return colors;
+LabelPtrSet Hashtable::get_tag_labels(const HashIntoType& tag) {
+  LabelPtrSet labels;
+  unsigned int num_labels;
+  _get_tag_labels(tag, tag_labels, labels);
+  return labels;
 }
 
-TagPtrSet Hashtable::get_color_tags(const Color& color) {
+TagPtrSet Hashtable::get_label_tags(const Label& label) {
   TagPtrSet tags;
   unsigned int num_tags;
-  _get_tags_from_color(color, color_tag_ptrs, tags);
+  _get_tags_from_label(label, label_tag_ptrs, tags);
   return tags;
 }
 
-void Hashtable::traverse_colors_and_resolve(const SeenSet& tagged_kmers,
-                                              ColorPtrSet& found_colors) {
+void Hashtable::traverse_labels_and_resolve(const SeenSet& tagged_kmers,
+                                              LabelPtrSet& found_labels) {
   
   SeenSet::const_iterator si;
-  unsigned int num_colors = 0;
+  unsigned int num_labels = 0;
   for (si=tagged_kmers.begin(); si!=tagged_kmers.end(); ++si) {
     HashIntoType tag = *si;
-    // get the colors associated with this tag
-    num_colors = _get_tag_colors(tag, tag_colors, found_colors);
-    if (num_colors > 1) {
-      // reconcile colors
+    // get the labels associated with this tag
+    num_labels = _get_tag_labels(tag, tag_labels, found_labels);
+    if (num_labels > 1) {
+      // reconcile labels
       // for now do nothing ha
     }
   }
diff --git a/lib/hashtable.hh b/lib/hashtable.hh
index 653ad7ba95..f13dcd51a2 100644
--- a/lib/hashtable.hh
+++ b/lib/hashtable.hh
@@ -182,52 +182,52 @@ namespace khmer {
     HashIntoType    bitmask;
     unsigned int    _nbits_sub_1;
 
-    // Does the given tag already have the given color?
-    bool _cmap_contains_color(const TagColorPtrMap& cmap,
+    // Does the given tag already have the given label?
+    bool _cmap_contains_label(const TagLabelPtrMap& cmap,
                         HashIntoType& kmer,
-                        Color& the_color)
+                        Label& the_label)
     {
-      std::pair<TagColorPtrMap::const_iterator, TagColorPtrMap::const_iterator> ret;
+      std::pair<TagLabelPtrMap::const_iterator, TagLabelPtrMap::const_iterator> ret;
       ret = cmap.equal_range(kmer);
-      for (TagColorPtrMap::const_iterator it=ret.first; it!=ret.second; ++it) {
-        if (*(it->second) == the_color) return true;
+      for (TagLabelPtrMap::const_iterator it=ret.first; it!=ret.second; ++it) {
+        if (*(it->second) == the_label) return true;
       }
       return false;
     }
 
-    // Does the given color already have a tag associated with it?
-    bool _cmap_contains_tag(const ColorTagPtrMap& cmap,
-                            Color& the_color,
+    // Does the given label already have a tag associated with it?
+    bool _cmap_contains_tag(const LabelTagPtrMap& cmap,
+                            Label& the_label,
                             HashIntoType& kmer) {
-      std::pair<ColorTagPtrMap::const_iterator, ColorTagPtrMap::const_iterator> ret;
-      ret = cmap.equal_range(the_color);
-      for (ColorTagPtrMap::const_iterator it=ret.first; it!=ret.second; ++it) {
+      std::pair<LabelTagPtrMap::const_iterator, LabelTagPtrMap::const_iterator> ret;
+      ret = cmap.equal_range(the_label);
+      for (LabelTagPtrMap::const_iterator it=ret.first; it!=ret.second; ++it) {
         if(*(it->second) == kmer) return true;
       }
       return false;
     }
     
-    unsigned int _get_tag_colors(const HashIntoType& tag,
-                          const TagColorPtrMap& cmap,
-                          ColorPtrSet& found_colors) {
-        unsigned int num_colors = 0;
-        std::pair<TagColorPtrMap::const_iterator, TagColorPtrMap::const_iterator> ret;
+    unsigned int _get_tag_labels(const HashIntoType& tag,
+                          const TagLabelPtrMap& cmap,
+                          LabelPtrSet& found_labels) {
+        unsigned int num_labels = 0;
+        std::pair<TagLabelPtrMap::const_iterator, TagLabelPtrMap::const_iterator> ret;
         ret = cmap.equal_range(tag);
-        for (TagColorPtrMap::const_iterator it=ret.first; it!=ret.second; ++it) {
-            found_colors.insert(it->second);
-            ++num_colors;
+        for (TagLabelPtrMap::const_iterator it=ret.first; it!=ret.second; ++it) {
+            found_labels.insert(it->second);
+            ++num_labels;
         }
-        return num_colors;
+        return num_labels;
     }
     
-    unsigned int _get_tags_from_color(const Color& color,
-                               const ColorTagPtrMap& cmap,
-                               TagPtrSet& colored_tags) {
+    unsigned int _get_tags_from_label(const Label& label,
+                               const LabelTagPtrMap& cmap,
+                               TagPtrSet& labeled_tags) {
         unsigned int num_tags = 0;
-        std::pair<ColorTagPtrMap::const_iterator, ColorTagPtrMap::const_iterator> ret;
-        ret = cmap.equal_range(color);
-        for (ColorTagPtrMap::const_iterator it=ret.first; it!=ret.second; ++it) {
-            colored_tags.insert(it->second);
+        std::pair<LabelTagPtrMap::const_iterator, LabelTagPtrMap::const_iterator> ret;
+        ret = cmap.equal_range(label);
+        for (LabelTagPtrMap::const_iterator it=ret.first; it!=ret.second; ++it) {
+            labeled_tags.insert(it->second);
             ++num_tags;
         }
         return num_tags;
@@ -253,7 +253,7 @@ namespace khmer {
       partition = new SubsetPartition(this);
       _init_bitstuff();
       _all_tags_spin_lock = 0;
-      _tag_colors_spin_lock = 0;
+      _tag_labels_spin_lock = 0;
       
     }
 
@@ -367,15 +367,15 @@ namespace khmer {
     }
 
     uint32_t _all_tags_spin_lock;
-    uint32_t _tag_colors_spin_lock;
+    uint32_t _tag_labels_spin_lock;
   public:
     SubsetPartition * partition;
     SeenSet all_tags;
     SeenSet stop_tags;
     SeenSet repart_small_tags;
-    TagColorPtrMap tag_colors;
-    ColorTagPtrMap color_tag_ptrs;
-    ColorPtrMap color_ptrs;
+    TagLabelPtrMap tag_labels;
+    LabelTagPtrMap label_tag_ptrs;
+    LabelPtrMap label_ptrs;
 
     // accessor to get 'k'
     const WordLength ksize() const { return _ksize; }
@@ -455,7 +455,7 @@ namespace khmer {
     // Partitioning stuff.
 
     unsigned int n_tags() const { return all_tags.size(); }
-    unsigned int n_colors() const { return color_ptrs.size(); }
+    unsigned int n_labels() const { return label_ptrs.size(); }
 
     void divide_tags_into_subsets(unsigned int subset_size, SeenSet& divvy);
 
@@ -486,13 +486,13 @@ namespace khmer {
 	void *		    callback_data   = NULL
     );
     
-    Color * check_and_allocate_color(Color new_color) {
-        Color * c;
-        if (color_ptrs.count(new_color)) {
-            c = color_ptrs[new_color];
+    Label * check_and_allocate_label(Label new_label) {
+        Label * c;
+        if (label_ptrs.count(new_label)) {
+            c = label_ptrs[new_label];
         } else {
-            c = new Color(new_color);
-            color_ptrs[*c] = c;
+            c = new Label(new_label);
+            label_ptrs[*c] = c;
         }
         return c;
     }
@@ -508,49 +508,49 @@ namespace khmer {
 					     CallbackFn callback = 0,
 					     void * callback_data = 0);
     
-    void consume_fasta_and_tag_with_colors(
+    void consume_fasta_and_tag_with_labels(
                         std::string const	  &filename,
                         unsigned int	  &total_reads,
                         unsigned long long  &n_consumed,
                         CallbackFn	  callback	  = NULL,
                         void *		  callback_data	  = NULL);
 
-    void consume_fasta_and_tag_with_colors(
+    void consume_fasta_and_tag_with_labels(
 	                read_parsers:: IParser *	    parser,
 	                unsigned int	    &total_reads,
 	                unsigned long long  &n_consumed,
 	                CallbackFn	    callback	    = NULL,
 	                void *		    callback_data   = NULL);
 	                
-    void consume_partitioned_fasta_and_tag_with_colors(const std::string &filename,
+    void consume_partitioned_fasta_and_tag_with_labels(const std::string &filename,
 					  unsigned int &total_reads,
 					  unsigned long long &n_consumed,
 					  CallbackFn callback = NULL,
 					  void * callback_datac = NULL);
 					  			  
-    void consume_sequence_and_tag_with_colors(const std::string& seq,
+    void consume_sequence_and_tag_with_labels(const std::string& seq,
 					unsigned long long& n_consumed,
-					Color& current_color,
+					Label& current_label,
 					SeenSet * new_tags = 0);
     
-    ColorPtrSet get_tag_colors(const HashIntoType& tag);
-    TagPtrSet get_color_tags(const Color& color);
+    LabelPtrSet get_tag_labels(const HashIntoType& tag);
+    TagPtrSet get_label_tags(const Label& label);
 
-    void link_tag_and_color(HashIntoType& kmer, Color& color);
+    void link_tag_and_label(HashIntoType& kmer, Label& label);
     
-    unsigned int sweep_sequence_for_colors(const std::string& seq,
-					ColorPtrSet& found_colors,
+    unsigned int sweep_sequence_for_labels(const std::string& seq,
+					LabelPtrSet& found_labels,
 					bool break_on_stoptags,
 					bool stop_big_traversals);
 					
-    unsigned int sweep_color_neighborhood(const std::string & seq,
-                                                  ColorPtrSet& found_colors,
+    unsigned int sweep_label_neighborhood(const std::string & seq,
+                                                  LabelPtrSet& found_labels,
                                                   unsigned int range,
                                                   bool break_on_stoptags,
                                                   bool stop_big_traversals);
                                                   			
-    void traverse_colors_and_resolve(const SeenSet& tagged_kmers,
-                                     ColorPtrSet& found_colors);
+    void traverse_labels_and_resolve(const SeenSet& tagged_kmers,
+                                     LabelPtrSet& found_labels);
 
     void consume_fasta_and_traverse(const std::string &filename,
 				    unsigned int distance,
@@ -661,9 +661,9 @@ namespace khmer {
   __sync_bool_compare_and_swap( &_all_tags_spin_lock, 1, 0 );
 
 #define ACQUIRE_TAG_COLORS_SPIN_LOCK \
-  while(!__sync_bool_compare_and_swap( &_tag_colors_spin_lock, 0, 1));
+  while(!__sync_bool_compare_and_swap( &_tag_labels_spin_lock, 0, 1));
 
 #define RELEASE_TAG_COLORS_SPIN_LOCK \
-  __sync_bool_compare_and_swap( &_tag_colors_spin_lock, 1, 0);
+  __sync_bool_compare_and_swap( &_tag_labels_spin_lock, 1, 0);
 
 #endif // HASHTABLE_HH
diff --git a/lib/khmer.hh b/lib/khmer.hh
index 882ea88bad..507c1e443a 100644
--- a/lib/khmer.hh
+++ b/lib/khmer.hh
@@ -87,14 +87,14 @@ namespace khmer {
   typedef std::map<PartitionID, unsigned int> PartitionCountMap;
   typedef std::map<unsigned long long, unsigned long long> PartitionCountDistribution;
   
-  typedef unsigned long long int Color;
-  typedef std::multimap<HashIntoType, Color*> TagColorPtrMap;
-  typedef std::multimap<Color, HashIntoType*> ColorTagPtrMap;
-  typedef std::pair<HashIntoType, Color*> TagColorPtrPair;
-  typedef std::pair<Color, HashIntoType*> ColorTagPtrPair;
-  typedef std::set<Color*> ColorPtrSet;
+  typedef unsigned long long int Label;
+  typedef std::multimap<HashIntoType, Label*> TagLabelPtrMap;
+  typedef std::multimap<Label, HashIntoType*> LabelTagPtrMap;
+  typedef std::pair<HashIntoType, Label*> TagLabelPtrPair;
+  typedef std::pair<Label, HashIntoType*> LabelTagPtrPair;
+  typedef std::set<Label*> LabelPtrSet;
   typedef std::set<HashIntoType*> TagPtrSet;
-  typedef std::map<Color, Color*> ColorPtrMap;
+  typedef std::map<Label, Label*> LabelPtrMap;
 
   template <typename T>
   void deallocate_ptr_set(T& s) {
diff --git a/tests/test-data/test-labels.fa b/tests/test-data/test-labels.fa
new file mode 100644
index 0000000000..b93d7c3c64
--- /dev/null
+++ b/tests/test-data/test-labels.fa
@@ -0,0 +1,8 @@
+>read_A
+ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG
+>read_B_overlap_A
+GCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA
+>read_C_overlap_B
+TGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGAGTTGGGGATAGATAGATAGATGAGTGTAGATCCAACAACACATACA
+>read_D
+TATATATATAGCTAGCTAGCTAACTAGCTAGCATCGATCGATCGATC

From c9f4b1027c136e93a925d899402c882b01f86abb Mon Sep 17 00:00:00 2001
From: CS Welcher <cs.welcher@gmail.com>
Date: Tue, 19 Nov 2013 15:47:30 -0500
Subject: [PATCH 092/140] forgot to commit test changes

---
 tests/test_hashbits.py | 130 ++++++++++++++++++++---------------------
 1 file changed, 65 insertions(+), 65 deletions(-)

diff --git a/tests/test_hashbits.py b/tests/test_hashbits.py
index c72245a63f..bb9299c92a 100644
--- a/tests/test_hashbits.py
+++ b/tests/test_hashbits.py
@@ -505,25 +505,25 @@ def test_simple_median():
 # @cswelcher TODO: more tests! 
 #  * thread-safety
 
-def test_n_colors():
+def test_n_labels():
     hb = khmer.new_hashbits(20, 1e7, 4)
-    filename = utils.get_test_data('test-colors.fa')
-    hb.consume_fasta_and_tag_with_colors(filename)
+    filename = utils.get_test_data('test-labels.fa')
+    hb.consume_fasta_and_tag_with_labels(filename)
     
-    print hb.n_colors()
-    assert hb.n_colors() == 4
+    print hb.n_labels()
+    assert hb.n_labels() == 4
 
-def test_get_color_dict():
+def test_get_label_dict():
     hb = khmer.new_hashbits(20, 1e7, 4)
-    filename = utils.get_test_data('test-colors.fa')
-    hb.consume_fasta_and_tag_with_colors(filename)
+    filename = utils.get_test_data('test-labels.fa')
+    hb.consume_fasta_and_tag_with_labels(filename)
     
-    colors = hb.get_color_dict()
+    labels = hb.get_label_dict()
     expected = [0L, 1L, 2L, 3L]
-    for e_color in expected:
-        assert e_color in colors
-    for a_color in colors:
-        assert a_color in expected
+    for e_label in expected:
+        assert e_label in labels
+    for a_label in labels:
+        assert a_label in expected
 
 def test_sweep_tag_neighborhood():
     hb = khmer.new_hashbits(20, 1e7, 4)
@@ -534,96 +534,96 @@ def test_sweep_tag_neighborhood():
     assert len(tags) == 1
     assert tags.pop() == 173473779682L
 
-def test_get_tag_colors():
+def test_get_tag_labels():
     hb = khmer.new_hashbits(20, 1e7, 4)
     filename = utils.get_test_data('single-read.fq')
-    hb.consume_fasta_and_tag_with_colors(filename)
+    hb.consume_fasta_and_tag_with_labels(filename)
     tag = 173473779682L
 
-    colors = hb.get_tag_colors(tag)
-    assert len(colors) == 1
-    assert colors.pop() == 0L
+    labels = hb.get_tag_labels(tag)
+    assert len(labels) == 1
+    assert labels.pop() == 0L
 
-def test_sweep_sequence_for_colors():
+def test_sweep_sequence_for_labels():
     hb = khmer.new_hashbits(20, 1e7, 4)
     filename = utils.get_test_data('single-read.fq')
-    hb.consume_fasta_and_tag_with_colors(filename)
+    hb.consume_fasta_and_tag_with_labels(filename)
     
-    colors = hb.sweep_color_neighborhood('CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT')
-    assert len(colors) == 1
-    assert colors.pop() == 0L
+    labels = hb.sweep_label_neighborhood('CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT')
+    assert len(labels) == 1
+    assert labels.pop() == 0L
 
-def test_consume_partitioned_fasta_and_tag_with_colors():
+def test_consume_partitioned_fasta_and_tag_with_labels():
     hb = khmer.new_hashbits(20, 1e7, 4)
     filename = utils.get_test_data('real-partition-small.fa')
 
-    total_reads, n_consumed = hb.consume_partitioned_fasta_and_tag_with_colors(filename)
-    colors = set()
+    total_reads, n_consumed = hb.consume_partitioned_fasta_and_tag_with_labels(filename)
+    labels = set()
     for record in screed.open(filename):
         seq = record.sequence
-        colors.update(hb.sweep_color_neighborhood(seq, False, False))
-    #print hb.n_colors()
-    #print colors
-    assert len(colors) == 1
-    assert colors.pop() == 2L
-    assert hb.n_colors() == 1 
-
-def test_consume_fasta_and_tag_with_colors():
+        labels.update(hb.sweep_label_neighborhood(seq, False, False))
+    #print hb.n_labels()
+    #print labels
+    assert len(labels) == 1
+    assert labels.pop() == 2L
+    assert hb.n_labels() == 1 
+
+def test_consume_fasta_and_tag_with_labels():
     hb = khmer.new_hashbits(20, 1e7, 4)
     read_1 = 'ACGTAACCGGTTAAACCCGGGTTTAAAACCCCGGGGTTTT'
     filename = utils.get_test_data('test-transcript.fa')
 
-    total_reads, n_consumed = hb.consume_fasta_and_tag_with_colors(filename)
+    total_reads, n_consumed = hb.consume_fasta_and_tag_with_labels(filename)
 
     assert hb.get(read_1[:20])
     assert total_reads == 3
-    print hb.n_colors()
-    print hb.get_color_dict()
+    print hb.n_labels()
+    print hb.get_label_dict()
     for tag in hb.get_tagset():
         print tag, khmer.forward_hash(tag, 20)
     for record in screed.open(filename):
         print hb.sweep_tag_neighborhood(record.sequence, 40)
-        print hb.sweep_color_neighborhood(record.sequence, 40)
-    assert hb.n_colors() == 3
+        print hb.sweep_label_neighborhood(record.sequence, 40)
+    assert hb.n_labels() == 3
 
 '''
 * The test data set as four reads: A, B, C, and D
 * Overlaps are A <-> B <-> C, with D on its own
-* Thus, traversing from A should find colors from A and B,
-  traversing from B should find colors from A, B, and C,
-  and traversing from C should find colors from B and C
+* Thus, traversing from A should find labels from A and B,
+  traversing from B should find labels from A, B, and C,
+  and traversing from C should find labels from B and C
 '''
-def test_color_tag_correctness():
+def test_label_tag_correctness():
     hb = khmer.new_hashbits(20, 1e7, 4)
-    filename = utils.get_test_data('test-colors.fa')
-    hb.consume_fasta_and_tag_with_colors(filename)
+    filename = utils.get_test_data('test-labels.fa')
+    hb.consume_fasta_and_tag_with_labels(filename)
     
     # read A
-    colors = hb.sweep_color_neighborhood('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAGCTAGGCTAGGTGTGCTCTGCCTAGAGCTAGGCTAGGTGT')
+    labels = hb.sweep_label_neighborhood('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAGCTAGGCTAGGTGTGCTCTGCCTAGAGCTAGGCTAGGTGT')
     print hb.sweep_tag_neighborhood('TTCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAGCTAGGCTAGGTGTGCTCTGCCTAGAGCTAGGCTAGGTGT')
-    print colors
+    print labels
     print len('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG')-19 
-    assert len(colors) == 2
-    assert 0L in colors
-    assert 1L in colors
+    assert len(labels) == 2
+    assert 0L in labels
+    assert 1L in labels
     
     # read B
-    colors = hb.sweep_color_neighborhood('GCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA')
-    print colors
-    assert len(colors) == 3
-    assert 0L in colors
-    assert 1L in colors
-    assert 2L in colors
+    labels = hb.sweep_label_neighborhood('GCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA')
+    print labels
+    assert len(labels) == 3
+    assert 0L in labels
+    assert 1L in labels
+    assert 2L in labels
     
     # read C
-    colors = hb.sweep_color_neighborhood('TGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGAGTTGGGGATAGATAGATAGATGAGTGTAGATCCAACAACACATACA')
-    print colors
-    assert len(colors) == 2
-    assert 1L in colors
-    assert 2L in colors
+    labels = hb.sweep_label_neighborhood('TGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGAGTTGGGGATAGATAGATAGATGAGTGTAGATCCAACAACACATACA')
+    print labels
+    assert len(labels) == 2
+    assert 1L in labels
+    assert 2L in labels
     
     # read D
-    colors = hb.sweep_color_neighborhood('TATATATATAGCTAGCTAGCTAACTAGCTAGCATCGATCGATCGATC')
-    print colors
-    assert len(colors) == 1
-    assert 3L in colors
+    labels = hb.sweep_label_neighborhood('TATATATATAGCTAGCTAGCTAACTAGCTAGCATCGATCGATCGATC')
+    print labels
+    assert len(labels) == 1
+    assert 3L in labels

From d40d868c9742f8389b82c11f691f76eaf625971a Mon Sep 17 00:00:00 2001
From: CS Welcher <cs.welcher@gmail.com>
Date: Tue, 19 Nov 2013 15:49:52 -0500
Subject: [PATCH 093/140] rename color -> label calls in sweep-reads-by-partition-buffered.py

---
 scripts/sweep-reads-by-partition-buffered.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py
index 0df68b2861..266d1bef9f 100755
--- a/scripts/sweep-reads-by-partition-buffered.py
+++ b/scripts/sweep-reads-by-partition-buffered.py
@@ -215,7 +215,7 @@ def main():
     print >>sys.stderr, 'consuming fastp...'
     if debug:
         yep.start(debug)
-    ht.consume_partitioned_fasta_and_tag_with_colors(input_fastp)
+    ht.consume_partitioned_fasta_and_tag_with_labels(input_fastp)
 
     label_number_dist = []
     
@@ -246,7 +246,7 @@ def main():
                 seq = record.sequence
                 name = record.name
                 try:
-                    labels = ht.sweep_color_neighborhood(seq, traversal_range)
+                    labels = ht.sweep_label_neighborhood(seq, traversal_range)
                 except ValueError as e:
                     print >>sys.stderr, '!! ERROR: {e} !!'.format(e=e)
                     print >>sys.stderr, 'Read length less than k-mer size'

From 8b28ba39b12aba1f174015da58e1f1c1915d841c Mon Sep 17 00:00:00 2001
From: CS Welcher <cs.welcher@gmail.com>
Date: Tue, 19 Nov 2013 16:23:11 -0500
Subject: [PATCH 094/140] writing script to build sparse graph

---
 sandbox/build-sparse-graph.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)
 create mode 100644 sandbox/build-sparse-graph.py

diff --git a/sandbox/build-sparse-graph.py b/sandbox/build-sparse-graph.py
new file mode 100644
index 0000000000..1f037afb82
--- /dev/null
+++ b/sandbox/build-sparse-graph.py
@@ -0,0 +1,13 @@
+import khmer
+import sys
+import screed
+
+
+input_fasta = sys.argv[3]
+K = sys.argv[1]
+x = sys.argv[2]
+
+
+ht = khmer.new_hashbits(K, x, 4)
+
+

From 048e542f1b975d933a490c9d68d623804e923535 Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Tue, 19 Nov 2013 18:10:57 -0500
Subject: [PATCH 095/140] WIP: import graph_tool and start sparse graph build loop (incomplete)

---
 sandbox/build-sparse-graph.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/sandbox/build-sparse-graph.py b/sandbox/build-sparse-graph.py
index 1f037afb82..a8e6ac8969 100644
--- a/sandbox/build-sparse-graph.py
+++ b/sandbox/build-sparse-graph.py
@@ -1,6 +1,7 @@
 import khmer
 import sys
 import screed
+import graph_tool.all as gt
 
 
 input_fasta = sys.argv[3]
@@ -10,4 +11,7 @@
 
 ht = khmer.new_hashbits(K, x, 4)
 
+sparse_graph = gt.Graph()
 
+for n, record in enumerate(screed.open(input_fasta)):
+    

From a693a87860da276e1a725233fcc380cb6064eded Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Fri, 22 Nov 2013 14:32:06 -0500
Subject: [PATCH 096/140] removed --debug option and yep start/stop calls

---
 scripts/sweep-reads-by-partition-buffered.py | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py
index 266d1bef9f..7f460caff4 100755
--- a/scripts/sweep-reads-by-partition-buffered.py
+++ b/scripts/sweep-reads-by-partition-buffered.py
@@ -161,7 +161,6 @@ def main():
                         default=DEFAULT_OUT_PREF)
     parser.add_argument('-m', '--max_buffers', dest='max_buffers', type=int, \
                         default=DEFAULT_NUM_BUFFERS)
-    parser.add_argument('-d', '--debug', dest='debug', default=None)
     parser.add_argument('input_files', nargs='+')
     args = parser.parse_args()
     
@@ -204,17 +203,11 @@ def main():
     est = args.files_estimate
     input_files = args.input_files
 
-    debug = args.debug
-    if debug:
-        import yep
-
     output_buffer = ReadBufferManager(max_buffers, buf_size, est, output_pref, outdir)
 
 	# consume the partitioned fasta with which to label the graph
     ht = khmer.new_hashbits(K, HT_SIZE, N_HT)
     print >>sys.stderr, 'consuming fastp...'
-    if debug:
-        yep.start(debug)
     ht.consume_partitioned_fasta_and_tag_with_labels(input_fastp)
 
     label_number_dist = []
@@ -272,8 +265,6 @@ def main():
     output_buffer.flush_all() 
     total_t = time.clock() - total_t
 
-    if debug:
-        yep.stop()
     if output_buffer.num_write_errors > 0 or output_buffer.num_file_errors > 0:
         print >>sys.stderr, '! WARNING: Sweep finished with errors !'
         print >>sys.stderr, '** {writee} reads not written'.format(writee=output_buffer.num_write_errors)

From d6e4088069b79c10162d06f815141ba7f694e2c9 Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Fri, 22 Nov 2013 14:40:43 -0500
Subject: [PATCH 097/140] playing with sparse graph viz

---
 sandbox/build-sparse-graph.py | 44 ++++++++++++++++++++++++++++++++---
 1 file changed, 41 insertions(+), 3 deletions(-)

diff --git a/sandbox/build-sparse-graph.py b/sandbox/build-sparse-graph.py
index a8e6ac8969..4cf9cd5523 100644
--- a/sandbox/build-sparse-graph.py
+++ b/sandbox/build-sparse-graph.py
@@ -3,15 +3,53 @@
 import screed
 import graph_tool.all as gt
 
-
 input_fasta = sys.argv[3]
-K = sys.argv[1]
-x = sys.argv[2]
+K = int(sys.argv[1])
+x = float(sys.argv[2])
 
 
 ht = khmer.new_hashbits(K, x, 4)
 
 sparse_graph = gt.Graph()
+hashes = sparse_graph.new_vertex_property("long long")
+
 
 for n, record in enumerate(screed.open(input_fasta)):
+    if n % 1000 == 0:
+        print >>sys.stderr, '...loaded and tagged {} sequences'.format(n)
+    name = record.name
+    sequence = record.sequence
     
+    ht.consume_sequence_and_tag_with_labels(sequence, n)
+    tags = ht.sweep_tag_neighborhood(sequence, 0)
+    for i in xrange(len(tags)-1):
+        src = tags[i]
+        dst = tags[i+1]
+
+        new = False
+
+        srcv = gt.find_vertex(sparse_graph, hashes, src)
+        if not srcv:
+            srcv = sparse_graph.add_vertex()
+            hashes[srcv] = src
+            new = True
+        else:
+            srcv = srcv[0]
+
+        dstv = gt.find_vertex(sparse_graph, hashes, dst)
+        if not dstv:
+            dstv = sparse_graph.add_vertex()
+            hashes[dstv] = dst
+            new = True        
+        else:
+            dstv = dstv[0]
+
+        if new:
+            e = sparse_graph.add_edge(srcv, dstv)
+
+print 'Sparse graph has {} nodes, {} edges'.format(sparse_graph.num_vertices(), sparse_graph.num_edges())
+comp = gt.label_largest_component(sparse_graph, directed=False)
+#pos = gt.radial_tree_layout(sparse_graph, sparse_graph.vertex(0))
+gt.graph_draw(sparse_graph, output_size=(5000,5000), output=input_fasta+'_sparse.png')
+sparse_graph.set_vertex_filter(comp)
+gt.graph_draw(sparse_graph, output_size=(5000,5000), output=input_fasta+'_sparse_comp.png')

From 32c0c39f9a466a43257763b8f18fe8f71d349369 Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Fri, 22 Nov 2013 16:17:18 -0500
Subject: [PATCH 098/140] changed buffering parameters to make more sense

---
 scripts/sweep-reads-by-partition-buffered.py | 29 ++++++++++----------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py
index 7f460caff4..a2a3e6050c 100755
--- a/scripts/sweep-reads-by-partition-buffered.py
+++ b/scripts/sweep-reads-by-partition-buffered.py
@@ -30,8 +30,8 @@
 
 
 DEFAULT_NUM_BUFFERS=50000
-DEFAULT_BUFFER_SIZE=1000000
-DEFAULT_NUM_PARTITIONS=100000
+DEFAULT_MAX_READS=1000000
+DEFAULT_BUFFER_SIZE=10
 DEFAULT_OUT_PREF='reads_'
 DEFAULT_RANGE=-1
 
@@ -80,16 +80,15 @@ def __len__(self):
 
 class ReadBufferManager:
 
-    def __init__(self, max_buffers, max_size, est_files, output_pref, outdir):
+    def __init__(self, max_buffers, max_reads, max_size, output_pref, outdir):
         self.buffers = {}
         self.buffer_counts = {}
         self.max_buffers = max_buffers
-        self.max_size = max_size
+        self.max_reads = max_reads
 
-        self.est_files = est_files
         self.output_pref = output_pref
         self.outdir = outdir
-        self.buffer_flush = self.max_size / self.est_files
+        self.buffer_flush = max_size
 
         self.cur_reads = 0
         self.cur_files = 0
@@ -100,10 +99,9 @@ def __init__(self, max_buffers, max_size, est_files, output_pref, outdir):
         print >>sys.stderr, '''Init new ReadBuffer [
         Max Buffers: {num_bufs}
         Max Reads: {max_reads}
-        Est. Files: {est_files}
         Buffer flush: {buf_flush}
-        ]'''.format(num_bufs=self.max_buffers, max_reads=self.max_size,
-                    est_files=self.est_files, buf_flush=self.buffer_flush)
+        ]'''.format(num_bufs=self.max_buffers, max_reads=self.max_reads,
+                    buf_flush=self.buffer_flush)
 
     def flush_buffer(self, buf_id):
         fn = '{}_{}.fa'.format(self.output_pref, buf_id)
@@ -132,7 +130,7 @@ def queue(self, seq_str, buf_id):
             self.buffers[buf_id] = new_buf
             
         self.cur_reads += 1
-        if self.cur_reads > self.max_size:
+        if self.cur_reads > self.max_reads:
             print >>sys.stderr, '** Reached max num reads...'
             self.flush_all()
         if len(self.buffers) > self.max_buffers:
@@ -153,10 +151,10 @@ def main():
     parser.add_argument('-i', '--input_fastp',dest='input_fastp')
     parser.add_argument('-r', '--traversal_range', type=int, dest='traversal_range', \
                         default=DEFAULT_RANGE)
-    parser.add_argument('-b', '--buffer_size', dest='buffer_size', type=int, \
+    parser.add_argument('-b', '--buffer_size', dest='max_reads', type=int, \
+                        default=DEFAULT_MAX_READS)
+    parser.add_argument('-l', '--buffer_length', dest='buffer_size', type=int, \
                         default=DEFAULT_BUFFER_SIZE)
-    parser.add_argument('-e', '--files_estimate', dest='files_estimate', type=int, \
-                        default=DEFAULT_NUM_PARTITIONS)
     parser.add_argument('-o', '--output_prefix', dest='output_prefix',
                         default=DEFAULT_OUT_PREF)
     parser.add_argument('-m', '--max_buffers', dest='max_buffers', type=int, \
@@ -200,10 +198,11 @@ def main():
     max_buffers = args.max_buffers
     output_pref = args.output_prefix
     buf_size = args.buffer_size
-    est = args.files_estimate
+    max_reads = args.max_reads
+    
     input_files = args.input_files
 
-    output_buffer = ReadBufferManager(max_buffers, buf_size, est, output_pref, outdir)
+    output_buffer = ReadBufferManager(max_buffers, max_reads, buf_size, output_pref, outdir)
 
 	# consume the partitioned fasta with which to label the graph
     ht = khmer.new_hashbits(K, HT_SIZE, N_HT)

From d79a5ba7a130527f1a68217d73ed383c0bfdc1fb Mon Sep 17 00:00:00 2001
From: CS Welcher <cs.welcher@gmail.com>
Date: Wed, 4 Dec 2013 12:45:35 -0500
Subject: [PATCH 099/140] fixed env line

---
 scripts/sweep-reads-by-partition-buffered.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py
index d7d6fe0f7f..45046ef0ac 100755
--- a/scripts/sweep-reads-by-partition-buffered.py
+++ b/scripts/sweep-reads-by-partition-buffered.py
@@ -1,4 +1,4 @@
-#! /usr/bin/python
+#!/usr/bin/env python
 #
 # This file is part of khmer, http://github.com/ged-lab/khmer/, and is
 # Copyright (C) Michigan State University, 2009-2013. It is licensed under

From 7d80ee97d30f1caf7229d3940fc2af43d404782f Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Wed, 4 Dec 2013 17:25:34 -0500
Subject: [PATCH 100/140] fixed derped up merge from partition_on_abundance,
 properly 3 way merged that branch along with master

---
 khmer/_khmermodule.cc  | 330 ++++++++++++++++++++++-------------------
 tests/test_filter.py   |  26 ----
 tests/test_hashbits.py |   1 +
 3 files changed, 181 insertions(+), 176 deletions(-)

diff --git a/khmer/_khmermodule.cc b/khmer/_khmermodule.cc
index 7aaf268c1f..f8f7cb39b3 100644
--- a/khmer/_khmermodule.cc
+++ b/khmer/_khmermodule.cc
@@ -2608,26 +2608,6 @@ static PyObject * hashbits_repartition_largest_partition(PyObject * self, PyObje
   return PyInt_FromLong(next_largest);
 }
 
-static PyObject * hashbits_hitraverse_to_stoptags(PyObject * self, PyObject * args)
-{
-  khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
-  khmer::Hashbits * hashbits = me->hashbits;
-
-  PyObject * counting_o = NULL;
-  unsigned int cutoff = 0;
-  const char * filename = NULL;
-
-  if (!PyArg_ParseTuple(args, "sOI", &filename, &counting_o, &cutoff)) {
-    return NULL;
-  }
-
-  khmer::CountingHash * counting = ((khmer_KCountingHashObject *) counting_o)->counting;
-
-  hashbits->hitraverse_to_stoptags(filename, *counting, cutoff);
-  
-  Py_RETURN_NONE;
-}
-
 static PyObject * hashbits_get(PyObject * self, PyObject * args)
 {
   khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
@@ -2695,64 +2675,6 @@ static PyObject * hashbits_kmer_degree(PyObject * self, PyObject * args)
   return PyInt_FromLong(hashbits->kmer_degree(kmer_s));
 }
 
-static PyObject * hashbits_trim_on_degree(PyObject * self, PyObject * args)
-{
-  khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
-  khmer::Hashbits * hashbits = me->hashbits;
-
-  const char * seq = NULL;
-  unsigned int max_degree = 0;
-
-  if (!PyArg_ParseTuple(args, "sI", &seq, &max_degree)) {
-    return NULL;
-  }
-
-  unsigned int trim_at;
-  Py_BEGIN_ALLOW_THREADS
-
-    trim_at = hashbits->trim_on_degree(seq, max_degree);
-
-  Py_END_ALLOW_THREADS;
-
-  PyObject * trim_seq = PyString_FromStringAndSize(seq, trim_at);
-  if (trim_seq == NULL) {
-      return NULL;
-  }
-  PyObject * ret = Py_BuildValue("OI", trim_seq, trim_at);
-  Py_DECREF(trim_seq);
-
-  return ret;
-}
-
-static PyObject * hashbits_trim_on_sodd(PyObject * self, PyObject * args)
-{
-  khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
-  khmer::Hashbits * hashbits = me->hashbits;
-
-  const char * seq = NULL;
-  unsigned int max_sodd = 0;
-
-  if (!PyArg_ParseTuple(args, "sI", &seq, &max_sodd)) {
-    return NULL;
-  }
-
-  unsigned int trim_at;
-  Py_BEGIN_ALLOW_THREADS
-
-    trim_at = hashbits->trim_on_sodd(seq, max_sodd);
-
-  Py_END_ALLOW_THREADS;
-
-  PyObject * trim_seq = PyString_FromStringAndSize(seq, trim_at);
-  if (trim_seq == NULL) {
-      return NULL;
-  }
-  PyObject * ret = Py_BuildValue("OI", trim_seq, trim_at);
-  Py_DECREF(trim_seq);
-
-  return ret;
-}
-
 static PyObject * hashbits_trim_on_stoptags(PyObject * self, PyObject * args)
 {
   khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
@@ -2808,12 +2730,6 @@ static PyObject * hashbits_identify_stoptags_by_position(PyObject * self, PyObje
   return x;
 }
 
-void free_subset_partition_info(void * p)
-{
-  khmer::SubsetPartition * subset_p = (khmer::SubsetPartition *) p;
-  delete subset_p;
-}
-
 static PyObject * hashbits_do_subset_partition(PyObject * self, PyObject * args)
 {
   khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
@@ -2970,30 +2886,6 @@ static PyObject * hashbits_consume_fasta_with_reads_parser(
   return Py_BuildValue("IK", total_reads, n_consumed);
 }
 
-static PyObject * hashbits_traverse_from_reads(PyObject * self, PyObject * args)
-{
-  khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
-  khmer::Hashbits * hashbits = me->hashbits;
-
-  const char * filename;
-  unsigned int radius, big_threshold, transfer_threshold;
-  PyObject * counting_o = NULL;
-
-  if (!PyArg_ParseTuple(args, "sIIIO", &filename,
-			&radius, &big_threshold, &transfer_threshold,
-			&counting_o)) {
-    return NULL;
-  }
-
-  khmer::CountingHash * counting = ((khmer_KCountingHashObject *) counting_o)->counting;
-
-  hashbits->traverse_from_reads(filename, radius, big_threshold,
-				transfer_threshold, *counting);
-      
-
-  Py_RETURN_NONE;
-}
-
 static PyObject * hashbits_consume_fasta_and_traverse(PyObject * self, PyObject * args)
 {
   khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
@@ -3139,12 +3031,6 @@ static PyObject * hashbits_consume_partitioned_fasta(PyObject * self, PyObject *
   return Py_BuildValue("IK", total_reads, n_consumed);
 }
 
-void free_pre_partition_info(void * p)
-{
-  _pre_partition_info * ppi = (_pre_partition_info *) p;
-  delete ppi;
-}
-
 static PyObject * hashbits_find_all_tags(PyObject * self, PyObject *args)
 {
   khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
@@ -3854,37 +3740,6 @@ static PyObject * hashbits_count_kmers_on_radius(PyObject * self, PyObject * arg
   return PyLong_FromUnsignedLong(n);
 }
 
-static PyObject * hashbits_trim_on_density_explosion(PyObject * self, PyObject * args)
-{
-  khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
-  khmer::Hashbits * hashbits = me->hashbits;
-
-  const char * seq = NULL;
-  unsigned int radius = 0;
-  unsigned int max_volume = 0;
-
-  if (!PyArg_ParseTuple(args, "sII", &seq, &radius, &max_volume)) {
-    return NULL;
-  }
-
-  unsigned int trim_at;
-  Py_BEGIN_ALLOW_THREADS
-
-    trim_at = hashbits->trim_on_density_explosion(seq, radius, max_volume);
-
-  Py_END_ALLOW_THREADS;
-
-  PyObject * trim_seq = PyString_FromStringAndSize(seq, trim_at);
-  if (trim_seq == NULL) {
-      return NULL;
-  }
-  
-  PyObject * ret = Py_BuildValue("OI", trim_seq, trim_at);
-  Py_DECREF(trim_seq);
-
-  return ret;
-}
-
 static PyObject * hashbits_find_radius_for_volume(PyObject * self, PyObject * args)
 {
   khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
@@ -4273,11 +4128,8 @@ static PyMethodDef khmer_hashbits_methods[] = {
   { "get", hashbits_get, METH_VARARGS, "Get the count for the given k-mer" },
   { "calc_connected_graph_size", hashbits_calc_connected_graph_size, METH_VARARGS, "" },
   { "kmer_degree", hashbits_kmer_degree, METH_VARARGS, "" },
-  { "trim_on_degree", hashbits_trim_on_degree, METH_VARARGS, "" },
-  { "trim_on_sodd", hashbits_trim_on_sodd, METH_VARARGS, "" },
   { "trim_on_stoptags", hashbits_trim_on_stoptags, METH_VARARGS, "" },
   { "identify_stoptags_by_position", hashbits_identify_stoptags_by_position, METH_VARARGS, "" },
-  { "trim_on_density_explosion", hashbits_trim_on_density_explosion, METH_VARARGS, "" },
   { "do_subset_partition", hashbits_do_subset_partition, METH_VARARGS, "" },
   { "find_all_tags", hashbits_find_all_tags, METH_VARARGS, "" },
   { "assign_partition_id", hashbits_assign_partition_id, METH_VARARGS, "" },
@@ -4304,7 +4156,6 @@ static PyMethodDef khmer_hashbits_methods[] = {
   { "consume_fasta_and_tag", hashbits_consume_fasta_and_tag, METH_VARARGS, "Count all k-mers in a given file" },
   { "consume_fasta_and_tag_with_reads_parser", hashbits_consume_fasta_and_tag_with_reads_parser, 
     METH_VARARGS, "Count all k-mers using a given reads parser" },
-  { "traverse_from_reads", hashbits_traverse_from_reads, METH_VARARGS, "" },
   { "consume_fasta_and_traverse", hashbits_consume_fasta_and_traverse, METH_VARARGS, "" },
   { "consume_fasta_and_tag_with_stoptags", hashbits_consume_fasta_and_tag_with_stoptags, METH_VARARGS, "Count all k-mers in a given file" },
   { "consume_partitioned_fasta", hashbits_consume_partitioned_fasta, METH_VARARGS, "Count all k-mers in a given file" },
@@ -4326,7 +4177,6 @@ static PyMethodDef khmer_hashbits_methods[] = {
   { "count_kmers_within_radius", hashbits_count_kmers_within_radius, METH_VARARGS, "" },
   { "count_kmers_on_radius", hashbits_count_kmers_on_radius, METH_VARARGS, "" },
   { "find_radius_for_volume", hashbits_find_radius_for_volume, METH_VARARGS, "" },
-  { "hitraverse_to_stoptags", hashbits_hitraverse_to_stoptags, METH_VARARGS, "" },
   { "traverse_from_tags", hashbits_traverse_from_tags, METH_VARARGS, "" },
   { "repartition_largest_partition", hashbits_repartition_largest_partition, METH_VARARGS, "" },
   { "get_median_count", hashbits_get_median_count, METH_VARARGS, "Get the median, average, and stddev of the k-mer counts in the string" },
@@ -4348,6 +4198,174 @@ khmer_hashbits_getattr(PyObject * obj, char * name)
   return Py_FindMethod(khmer_hashbits_methods, obj, name);
 }
 
+////////////////////////////////////////////////////////////////////////////
+
+static PyObject * subset_count_partitions(PyObject * self,
+					  PyObject * args)
+{
+  khmer_KSubsetPartitionObject * me = (khmer_KSubsetPartitionObject *) self;
+  khmer::SubsetPartition * subset_p = me->subset;
+
+  if (!PyArg_ParseTuple(args, "")) {
+    return NULL;
+  }
+  
+  unsigned int n_partitions = 0, n_unassigned = 0;
+  subset_p->count_partitions(n_partitions, n_unassigned);
+
+  return Py_BuildValue("ii", n_partitions, n_unassigned);
+}
+
+static PyObject * subset_report_on_partitions(PyObject * self,
+						   PyObject * args)
+{
+  khmer_KSubsetPartitionObject * me = (khmer_KSubsetPartitionObject *) self;
+  khmer::SubsetPartition * subset_p = me->subset;
+
+  if (!PyArg_ParseTuple(args, "")) {
+    return NULL;
+  }
+  
+  subset_p->report_on_partitions();
+
+  Py_INCREF(Py_None);
+  return Py_None;
+}
+
+static PyObject * subset_compare_partitions(PyObject * self,
+					    PyObject * args)
+{
+  khmer_KSubsetPartitionObject * me = (khmer_KSubsetPartitionObject *) self;
+  khmer::SubsetPartition * subset1_p = me->subset;
+
+  PyObject * subset2_obj = NULL;
+  unsigned int pid1, pid2;	// @CTB ensure that these are unsigned?
+
+  if (!PyArg_ParseTuple(args, "iOi",
+			&pid1, &subset2_obj, &pid2)) {
+    return NULL;
+  }
+
+  khmer_KSubsetPartitionObject *other = (khmer_KSubsetPartitionObject *) subset2_obj;
+  khmer::SubsetPartition * subset2_p = other->subset;
+
+  unsigned int n_only1 = 0, n_only2 = 0, n_shared = 0;
+  subset1_p->compare_to_partition((PartitionID) pid1,
+				  subset2_p, (PartitionID) pid2,
+				  n_only1, n_only2, n_shared);
+
+  return Py_BuildValue("iii", n_only1, n_only2, n_shared);
+}
+
+static PyObject * subset_partition_size_distribution(PyObject * self,
+						     PyObject * args)
+{
+  khmer_KSubsetPartitionObject * me = (khmer_KSubsetPartitionObject *) self;
+  khmer::SubsetPartition * subset_p = me->subset;
+
+  if (!PyArg_ParseTuple(args, "")) {
+    return NULL;
+  }
+  
+  khmer::PartitionCountDistribution d;
+
+  unsigned int n_unassigned = 0;
+  subset_p->partition_size_distribution(d, n_unassigned);
+
+  PyObject * x = PyList_New(d.size());
+  khmer::PartitionCountDistribution::const_iterator di;
+
+  unsigned int i;
+  for (i = 0, di = d.begin(); di != d.end(); di++, i++) {
+    PyList_SET_ITEM(x, i, Py_BuildValue("LL", di->first, di->second));
+  }
+  assert (i == d.size());
+
+  return Py_BuildValue("Oi", x, n_unassigned);
+}
+
+static PyObject * subset_partition_sizes(PyObject * self,
+					 PyObject * args)
+{
+  khmer_KSubsetPartitionObject * me = (khmer_KSubsetPartitionObject *) self;
+  khmer::SubsetPartition * subset_p = me->subset;
+
+  unsigned int min_size = 0;
+
+  if (!PyArg_ParseTuple(args, "|i", &min_size)) {
+    return NULL;
+  }
+  
+  khmer::PartitionCountMap cm;
+  unsigned int n_unassigned = 0;
+  subset_p->partition_sizes(cm, n_unassigned);
+
+  unsigned int i;
+  khmer::PartitionCountMap::const_iterator mi;
+  for (i = 0, mi = cm.begin(); mi != cm.end(); mi++) {
+    if (mi->second >= min_size) i++;
+  }
+
+  PyObject * x = PyList_New(i);
+
+  // this should probably be a dict. @CTB
+  for (i = 0, mi = cm.begin(); mi != cm.end(); mi++) {
+    if (mi->second >= min_size) {
+      PyList_SET_ITEM(x, i, Py_BuildValue("LL", mi->first, mi->second));
+      i++;
+    }
+  }
+
+  return Py_BuildValue("Oi", x, n_unassigned);
+}
+
+static PyObject * subset_partition_average_coverages(PyObject * self,
+						     PyObject * args)
+{
+  khmer_KSubsetPartitionObject * me = (khmer_KSubsetPartitionObject *) self;
+  khmer::SubsetPartition * subset_p = me->subset;
+
+  PyObject * counting_o;
+
+  if (!PyArg_ParseTuple(args, "O", &counting_o)) {
+    return NULL;
+  }
+  
+  khmer::CountingHash * counting = ((khmer_KCountingHashObject *) counting_o)->counting;
+  
+  khmer::PartitionCountMap cm;
+  subset_p->partition_average_coverages(cm, counting);
+
+  unsigned int i;
+  khmer::PartitionCountMap::const_iterator mi;
+
+  PyObject * x = PyList_New(cm.size());
+
+  // this should probably be a dict. @CTB
+  for (i = 0, mi = cm.begin(); mi != cm.end(); mi++, i++) {
+    PyList_SET_ITEM(x, i, Py_BuildValue("LL", mi->first, mi->second));
+  }
+
+  return Py_BuildValue("O", x);
+}
+
+static PyMethodDef khmer_subset_methods[] = {
+  { "count_partitions", subset_count_partitions, METH_VARARGS, "" },
+  { "report_on_partitions", subset_report_on_partitions, METH_VARARGS, "" },
+  { "compare_partitions", subset_compare_partitions, METH_VARARGS, "" },
+  { "partition_size_distribution", subset_partition_size_distribution, METH_VARARGS, "" },
+  { "partition_sizes", subset_partition_sizes, METH_VARARGS, "" },
+  { "partition_average_coverages", subset_partition_average_coverages, METH_VARARGS, "" },
+  {NULL, NULL, 0, NULL}           /* sentinel */
+};
+
+static PyObject *
+khmer_subset_getattr(PyObject * obj, char * name)
+{
+  return Py_FindMethod(khmer_subset_methods, obj, name);
+}
+
+
 //
 // GRAPHALIGN addition
 //
@@ -4589,6 +4607,18 @@ static void khmer_hashbits_dealloc(PyObject* self)
   
   PyObject_Del((PyObject *) obj);
 }
+//
+// khmer_subset_dealloc -- clean up a subset partition object.
+//
+
+static void khmer_subset_dealloc(PyObject* self)
+{
+  khmer_KSubsetPartitionObject * obj = (khmer_KSubsetPartitionObject *) self;
+  delete obj->subset;
+  obj->subset = NULL;
+  
+  PyObject_Del((PyObject *) obj);
+}
 
 //////////////////////////////
 // standalone functions
diff --git a/tests/test_filter.py b/tests/test_filter.py
index 6f85c52fd1..f832fb9860 100644
--- a/tests/test_filter.py
+++ b/tests/test_filter.py
@@ -44,29 +44,3 @@ def test_abund(self):
         assert ['1'] * (114 - 10 + 1) == output
 
         fd.close()
-
-@attr('highmem')
-def test_filter_sodd():
-    K = 32
-    HASHTABLE_SIZE = int(8e7)
-    N_HT = 4
-    MAX_SODD = 3
-
-    ht = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT)
-    filename = utils.get_test_data('../../data/high-sodd.fa')
-
-    ht.consume_fasta(filename)
-
-    seq = "CGTTAGTTGCGGTGCCGACCGGCAAACTTGGTTTTGCCAAAAATTTTTACAGTTAGAAATTATTC" \
-          "ACAAAGTTGCACCGGAATTCGGTTACAAACGTCATTCTAACTAAT"
-    trim_seq, trim_at = ht.trim_on_sodd(seq, MAX_SODD)
-    assert trim_seq == "CGTTAGTTGCGGTGCCGACCGGCAAACTTGGT"
-
-    seq = "ACAAAATTCCACATATAGTCATAATTGTGGGCAATTTTCGTCCCAAATTAGTTAGAATGACGTTT" \
-          "GTAACCGAATTCCGGTGCAACTTTGTGAATAATTTCTAACTGTAAAAAT"
-    trim_seq, trim_at = ht.trim_on_sodd(seq, MAX_SODD)
-    assert trim_seq == "ACAAAATTCCACATATAGTCATAATTGTGGGCAATT"
-
-    seq = "GCACGCAGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTG"
-    trim_seq, trim_at = ht.trim_on_sodd(seq, MAX_SODD)
-    assert trim_seq == seq
diff --git a/tests/test_hashbits.py b/tests/test_hashbits.py
index b7e4e01e65..2456d8d788 100644
--- a/tests/test_hashbits.py
+++ b/tests/test_hashbits.py
@@ -6,6 +6,7 @@
 import khmer
 
 from screed.fasta import fasta_iter
+import screed
 
 import khmer_tst_utils as utils
 from nose.plugins.attrib import attr

From 72b945ee85a0e8b833b6ccac636059e58dfe1aec Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Wed, 4 Dec 2013 18:11:32 -0500
Subject: [PATCH 101/140] fixed error in buffer flushing

---
 scripts/sweep-reads-by-partition-buffered.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py
index 59e75ac7a2..8e2cd7c6d9 100755
--- a/scripts/sweep-reads-by-partition-buffered.py
+++ b/scripts/sweep-reads-by-partition-buffered.py
@@ -39,7 +39,7 @@
 MIN_KSIZE=21
     
 def fmt_fasta(name, seq, labels=[]):
-        return '>{name}\t{labels}\n{seq}'.format(name=name, 
+        return '>{name}\t{labels}\n{seq}\n'.format(name=name, 
             labels='\t'.join([str(l) for l in labels]), seq=seq)
 
 def write_seq(fp, name, seq, labels=[]):
@@ -67,7 +67,7 @@ def push(self, seq_str):
         self.buf.append(seq_str)
 
     def flush(self):
-        return '\n'.join(self.buf)
+        return ''.join(self.buf)
 
     def is_full(self, full):
         if len(self.buf) >= full:

From 36b6a807b28da11e46a53e1f3a4041938acf232f Mon Sep 17 00:00:00 2001
From: Chris Welcher <cs.welcher@gmail.com>
Date: Tue, 10 Dec 2013 16:43:06 -0500
Subject: [PATCH 102/140] beginning subclassing of label stuff

---
 lib/labelhash.hh | 149 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 149 insertions(+)
 create mode 100644 lib/labelhash.hh

diff --git a/lib/labelhash.hh b/lib/labelhash.hh
new file mode 100644
index 0000000000..670b9f1c34
--- /dev/null
+++ b/lib/labelhash.hh
@@ -0,0 +1,149 @@
+//
+// This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+// Copyright (C) Michigan State University, 2009-2013. It is licensed under
+// the three-clause BSD license; see doc/LICENSE.txt. Contact: ctb@msu.edu
+//
+
+#ifndef LABELHASH_HH
+#define LABELHASH_HH
+
+#include "khmer.hh"
+
+namespace khmer {
+    
+    class LabelHash : public khmer:Hashbits {
+    protected:
+        LabelHash( WordLength ksize, std::vector<HashIntoType& tablesizes)
+        : khmer::Hashbits(ksize, tablesizes)
+        {
+            // constructor
+            _tag_labels_spin_lock = 0;
+
+        }
+        
+        ~LabelHash();
+
+        // Does the given tag already have the given label?
+        bool _cmap_contains_label(const TagLabelPtrMap& cmap,
+                HashIntoType& kmer,
+                Label& the_label) {
+            std::pair<TagLabelPtrMap::const_iterator, TagLabelPtrMap::const_iterator> ret;
+            ret = cmap.equal_range(kmer);
+            for (TagLabelPtrMap::const_iterator it=ret.first; it!=ret.second; ++it) {
+                if (*(it->second) == the_label) return true;
+            }
+            return false;
+        }
+
+        // Does the given label already have a tag associated with it?
+        bool _cmap_contains_tag(const LabelTagPtrMap& cmap,
+                    Label& the_label,
+                    HashIntoType& kmer) {
+            std::pair<LabelTagPtrMap::const_iterator, LabelTagPtrMap::const_iterator> ret;
+            ret = cmap.equal_range(the_label);
+            for (LabelTagPtrMap::const_iterator it=ret.first; it!=ret.second; ++it) {
+                if(*(it->second) == kmer) return true;
+            }
+            return false;
+        }
+        
+        unsigned int _get_tag_labels(const HashIntoType& tag,
+                  const TagLabelPtrMap& cmap,
+                  LabelPtrSet& found_labels) {
+            unsigned int num_labels = 0;
+            std::pair<TagLabelPtrMap::const_iterator, TagLabelPtrMap::const_iterator> ret;
+            ret = cmap.equal_range(tag);
+            for (TagLabelPtrMap::const_iterator it=ret.first; it!=ret.second; ++it) {
+                found_labels.insert(it->second);
+                ++num_labels;
+            }
+            return num_labels;
+            }
+        
+        unsigned int _get_tags_from_label(const Label& label,
+                       const LabelTagPtrMap& cmap,
+                       TagPtrSet& labeled_tags) {
+            unsigned int num_tags = 0;
+            std::pair<LabelTagPtrMap::const_iterator, LabelTagPtrMap::const_iterator> ret;
+            ret = cmap.equal_range(label);
+            for (LabelTagPtrMap::const_iterator it=ret.first; it!=ret.second; ++it) {
+                labeled_tags.insert(it->second);
+                ++num_tags;
+            }
+            return num_tags;
+            }
+
+        uint32_t _tag_labels_spin_lock;
+
+    public:
+        TagLabelPtrMap tag_labels;
+        LabelTagPtrMap label_tag_ptrs;
+        LabelPtrMap label_ptrs;
+
+        unsigned int n_labels() const { return label_ptrs.size(); }
+ 
+    
+        Label * check_and_allocate_label(Label new_label) {
+            Label * c;
+            if (label_ptrs.count(new_label)) {
+                c = label_ptrs[new_label];
+            } else {
+                c = new Label(new_label);
+                label_ptrs[*c] = c;
+            }
+            return c;
+        }
+        void consume_fasta_and_tag_with_labels(
+                            std::string const	  &filename,
+                            unsigned int	  &total_reads,
+                            unsigned long long  &n_consumed,
+                            CallbackFn	  callback	  = NULL,
+                            void *		  callback_data	  = NULL);
+
+        void consume_fasta_and_tag_with_labels(
+                        read_parsers:: IParser *	    parser,
+                        unsigned int	    &total_reads,
+                        unsigned long long  &n_consumed,
+                        CallbackFn	    callback	    = NULL,
+                        void *		    callback_data   = NULL);
+                        
+        void consume_partitioned_fasta_and_tag_with_labels(const std::string &filename,
+                          unsigned int &total_reads,
+                          unsigned long long &n_consumed,
+                          CallbackFn callback = NULL,
+                          void * callback_datac = NULL);
+                                      
+        void consume_sequence_and_tag_with_labels(const std::string& seq,
+                        unsigned long long& n_consumed,
+                        Label& current_label,
+                        SeenSet * new_tags = 0);
+        
+        LabelPtrSet get_tag_labels(const HashIntoType& tag);
+        TagPtrSet get_label_tags(const Label& label);
+
+        void link_tag_and_label(HashIntoType& kmer, Label& label);
+        
+        unsigned int sweep_sequence_for_labels(const std::string& seq,
+                        LabelPtrSet& found_labels,
+                        bool break_on_stoptags,
+                        bool stop_big_traversals);
+                        
+        unsigned int sweep_label_neighborhood(const std::string & seq,
+                                                      LabelPtrSet& found_labels,
+                                                      unsigned int range,
+                                                      bool break_on_stoptags,
+                                                      bool stop_big_traversals);
+                                                                
+        void traverse_labels_and_resolve(const SeenSet& tagged_kmers,
+                                         LabelPtrSet& found_labels);
+
+    }
+}
+
+#define ACQUIRE_TAG_COLORS_SPIN_LOCK \
+  while(!__sync_bool_compare_and_swap( &_tag_labels_spin_lock, 0, 1));
+
+#define RELEASE_TAG_COLORS_SPIN_LOCK \
+  __sync_bool_compare_and_swap( &_tag_labels_spin_lock, 1, 0);
+
+#endif LABELHASH_HH

From 902cc2b0fe62c2f06f0ce190cdd42d532469d85f Mon Sep 17 00:00:00 2001
From: CS Welcher <cs.welcher@gmail.com>
Date: Wed, 11 Dec 2013 01:31:02 -0500
Subject: [PATCH 103/140] added labelhash.cc

---
 lib/khmer.hh     |   3 +-
 lib/labelhash.cc | 357 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 359 insertions(+), 1 deletion(-)
 create mode 100644 lib/labelhash.cc

diff --git a/lib/khmer.hh b/lib/khmer.hh
index 9e11a65f6d..f0fc8d134c 100644
--- a/lib/khmer.hh
+++ b/lib/khmer.hh
@@ -84,7 +84,8 @@ namespace khmer {
   typedef std::map<HashIntoType, unsigned int> TagCountMap;
   typedef std::map<PartitionID, unsigned int> PartitionCountMap;
   typedef std::map<unsigned long long, unsigned long long> PartitionCountDistribution;
-  
+
+  // types used in @camillescott's sparse labeling extension  
   typedef unsigned long long int Label;
   typedef std::multimap<HashIntoType, Label*> TagLabelPtrMap;
   typedef std::multimap<Label, HashIntoType*> LabelTagPtrMap;
diff --git a/lib/labelhash.cc b/lib/labelhash.cc
new file mode 100644
index 0000000000..de57042695
--- /dev/null
+++ b/lib/labelhash.cc
@@ -0,0 +1,357 @@
+//
+// This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+// Copyright (C) Michigan State University, 2009-2013. It is licensed under
+// the three-clause BSD license; see doc/LICENSE.txt. Contact: ctb@msu.edu
+//
+
+#include "labelhash.hh"
+
+using namespace std;
+using namespace khmer;
+using namespace khmer:: read_parsers;
+
+/*
+ * @camillescott
+ * Might be time for a refactor: could do a general consume_fasta
+ * function which accepts a consume_sequence function pointer as a parameter
+ */
+
+void
+LabelHash::consume_fasta_and_tag_with_labels(
+  std:: string const  &filename,
+  unsigned int	      &total_reads, unsigned long long	&n_consumed,
+  CallbackFn	      callback,	    void *		callback_data
+)
+{
+  khmer:: Config    &the_config	  = khmer:: get_active_config( );
+
+  // Note: Always assume only 1 thread if invoked this way.
+  IParser *	  parser = 
+  IParser::get_parser(
+    filename, 1, the_config.get_reads_input_buffer_size( ),
+    the_config.get_reads_parser_trace_level( )
+  );
+
+
+  consume_fasta_and_tag_with_labels(
+    parser,
+    total_reads, n_consumed,
+    callback, callback_data
+  );
+
+  delete parser;
+}
+
+void
+LabelHash::consume_fasta_and_tag_with_labels(
+    read_parsers:: IParser *  parser,
+    unsigned int		    &total_reads,   unsigned long long	&n_consumed,
+    CallbackFn		    callback,	    void *		callback_data
+  )
+  {
+    Hasher		  &hasher		= 
+    _get_hasher( parser->uuid( ) );
+    unsigned int		  total_reads_LOCAL	= 0;
+  #if (0) // Note: Used with callback - currently disabled.
+    unsigned long long int  n_consumed_LOCAL	= 0;
+  #endif
+    Read			  read;
+
+    // TODO? Delete the following assignments.
+    total_reads = 0;
+    n_consumed = 0;
+    
+    hasher.trace_logger(
+      TraceLogger:: TLVL_DEBUG2,
+      "Starting trace of 'consume_fasta_and_tag_with_labels'....\n"
+    );
+    
+    Label _tag_label = 0;
+
+    Label * the_label;
+    // Iterate through the reads and consume their k-mers.
+    while (!parser->is_complete( ))
+    {
+      unsigned long long this_n_consumed   = 0;
+
+      read = parser->get_next_read( );
+
+      if (check_and_normalize_read( read.sequence ))
+      {
+        // TODO: make threadsafe!
+        the_label = check_and_allocate_label(_tag_label);
+        consume_sequence_and_tag_with_labels( read.sequence,
+					      this_n_consumed,
+					      *the_label );
+	    _tag_label++;
+
+  #ifdef WITH_INTERNAL_METRICS
+        hasher.pmetrics.start_timers( );
+  #endif
+  #if (0) // Note: Used with callback - currently disabled.
+        n_consumed_LOCAL  = __sync_add_and_fetch( &n_consumed, this_n_consumed );
+  #else
+        __sync_add_and_fetch( &n_consumed, this_n_consumed );
+  #endif
+        total_reads_LOCAL = __sync_add_and_fetch( &total_reads, 1 );
+  #ifdef WITH_INTERNAL_METRICS
+        hasher.pmetrics.stop_timers( );
+        hasher.pmetrics.accumulate_timer_deltas(
+	  (uint32_t)HashTablePerformanceMetrics:: MKEY_TIME_UPDATE_TALLIES
+        );
+  #endif
+      }
+
+      if (0 == (total_reads_LOCAL % 10000))
+        hasher.trace_logger(
+	  TraceLogger:: TLVL_DEBUG3,
+	  "Total number of reads processed: %llu\n",
+	  (unsigned long long int)total_reads_LOCAL
+        );
+
+      // TODO: Figure out alternative to callback into Python VM
+      //       Cannot use in multi-threaded operation.
+  #if (0)
+        // run callback, if specified
+        if (total_reads_TL % CALLBACK_PERIOD == 0 && callback) {
+	  std::cout << "n tags: " << all_tags.size() << "\n";
+	  try {
+	    callback("consume_fasta_and_tag_with_labels", callback_data, total_reads_TL,
+		     n_consumed);
+	  } catch (...) {
+	    delete parser;
+	    throw;
+	  }
+        }
+  #endif // 0
+
+    } // while reads left for parser
+
+  }
+
+void LabelHash::consume_partitioned_fasta_and_tag_with_labels(const std::string &filename,
+					  unsigned int &total_reads,
+					  unsigned long long &n_consumed,
+					  CallbackFn callback,
+					  void * callback_data)
+{
+  total_reads = 0;
+  n_consumed = 0;
+
+  IParser* parser = IParser::get_parser(filename.c_str());
+  Read read;
+
+  string seq = "";
+
+  // reset the master subset partition
+  delete partition;
+  partition = new SubsetPartition(this);
+
+  //
+  // iterate through the FASTA file & consume the reads.
+  //
+  Label * c;
+  PartitionID p;
+  while(!parser->is_complete())  {
+    read = parser->get_next_read();
+    seq = read.sequence;
+
+    if (check_and_normalize_read(seq)) {
+      // First, figure out what the partition is (if non-zero), and save that.
+      p = _parse_partition_id(read.name);
+      c = check_and_allocate_label(p);
+
+      consume_sequence_and_tag_with_labels( seq,
+					      n_consumed,
+					      *c );
+    }
+	       
+    // reset the sequence info, increment read number
+    total_reads++;
+
+    // run callback, if specified
+    if (total_reads % CALLBACK_PERIOD == 0 && callback) {
+      try {
+        callback("consume_partitioned_fasta_and_tag_with_labels", callback_data, 
+        total_reads, n_consumed);
+      } catch (...) {
+	delete parser;
+        throw;
+      }
+    }
+  }
+
+  // @cswelcher TODO: check that deallocate LabelPtrMap is correct
+  delete parser;
+}
+
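+// @cswelcher: FIXME -- taking the address of a reference is legal, but consume_sequence_and_tag_with_labels passes a stack-local kmer, so the &kmer stored in label_tag_ptrs dangles.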
+void LabelHash::link_tag_and_label(HashIntoType& kmer, Label& kmer_label) {
+  tag_labels.insert(TagLabelPtrPair(kmer, &kmer_label));
+  label_tag_ptrs.insert(LabelTagPtrPair(kmer_label, &kmer));
+}
+
+void LabelHash::consume_sequence_and_tag_with_labels(const std::string& seq,
+					unsigned long long& n_consumed,
+					Label& current_label,
+					SeenSet * found_tags)
+  {
+    bool is_new_kmer;
+    bool kmer_tagged;
+
+    KMerIterator kmers(seq.c_str(), _ksize);
+    HashIntoType kmer;
+
+    unsigned int since = _tag_density / 2 + 1;
+
+    while(!kmers.done()) {
+      kmer = kmers.next();
+
+      if ((is_new_kmer = test_and_set_bits( kmer )))
+        ++n_consumed;
+
+  #if (1)
+      if (is_new_kmer) {
+        ++since;
+      } else {
+        ACQUIRE_ALL_TAGS_SPIN_LOCK
+        kmer_tagged = set_contains(all_tags, kmer);
+        RELEASE_ALL_TAGS_SPIN_LOCK
+        if (kmer_tagged) {
+	      since = 1;
+	      
+	      // Labeling code
+	      // TODO: MAKE THREADSAFE!
+	      
+	      if (!_cmap_contains_label(tag_labels, kmer, current_label)) {
+	        ACQUIRE_TAG_COLORS_SPIN_LOCK
+	        link_tag_and_label(kmer, current_label);
+	        RELEASE_TAG_COLORS_SPIN_LOCK
+	      }
+	      if (found_tags) {
+	        found_tags->insert(kmer);
+	      }
+        }  else ++since;
+      }
+  #else
+      if (!is_new_kmer && set_contains(all_tags, kmer)) {
+        since = 1;
+        if (found_tags) { found_tags->insert(kmer); }
+      } else {
+        since++;
+      }
+  #endif
+      //
+      if (since >= _tag_density) {
+        ACQUIRE_ALL_TAGS_SPIN_LOCK
+        all_tags.insert(kmer);
+        RELEASE_ALL_TAGS_SPIN_LOCK
+        
+        // Labeling code
+        // TODO: MAKE THREADSAFE!
+        ACQUIRE_TAG_COLORS_SPIN_LOCK
+        link_tag_and_label(kmer, current_label);
+        RELEASE_TAG_COLORS_SPIN_LOCK
+        
+        if (found_tags) { found_tags->insert(kmer); }
+        since = 1;
+      }
+
+    } // iteration over kmers
+
+    if (since >= _tag_density/2 - 1) {
+      ACQUIRE_ALL_TAGS_SPIN_LOCK
+      all_tags.insert(kmer);	// insert the last k-mer, too.
+      RELEASE_ALL_TAGS_SPIN_LOCK
+      
+      // Label code: TODO: MAKE THREADSAFE!
+      link_tag_and_label(kmer, current_label);
+      
+      if (found_tags) { found_tags->insert(kmer); }
+    }
+  }
+/*
+ * Find all labels associated with the sequence
+ * For now, check /every/ k-mer with find_all_tags
+ * THIS SUCKS AND IT'S YOUR FAULT @CTB
+ */
+unsigned int LabelHash::sweep_sequence_for_labels(const std::string& seq,
+					LabelPtrSet& found_labels,
+					bool break_on_stoptags,
+					bool stop_big_traversals) {
+					
+    SeenSet tagged_kmers;
+    //LabelPtrSet found_labels;
+    
+    HashIntoType kmer_f, kmer_r, kmer;
+    
+    KMerIterator kmers(seq.c_str(), _ksize);
+    std::string kmer_s;
+    // keep a list of kmers which have already been traversed
+    SeenSet traversed_kmers;
+    while (!kmers.done()) {
+      kmer = kmers.next();
+      kmer_s = _revhash(kmer, _ksize);
+      _hash(kmer_s.c_str(), _ksize, kmer_f, kmer_r);
+      
+      // don't even try traversing from k-mers not in the hashtable
+      //traversed_kmers.clear();
+      if (get_count(uniqify_rc(kmer_f,kmer_r))) {
+        partition->find_all_tags(kmer_f, kmer_r, tagged_kmers,
+                   all_tags, break_on_stoptags, stop_big_traversals);
+        traverse_labels_and_resolve(tagged_kmers, found_labels);
+      }
+    }
+    return traversed_kmers.size();
+}
+
+unsigned int LabelHash::sweep_label_neighborhood(const std::string& seq,
+                                                  LabelPtrSet& found_labels,
+                                                  unsigned int range,
+                                                  bool break_on_stoptags,
+                                                  bool stop_big_traversals) {
+
+    SeenSet tagged_kmers;
+    unsigned int num_traversed;
+    num_traversed = partition->sweep_for_tags(seq, tagged_kmers, all_tags, 
+                              range, break_on_stoptags, stop_big_traversals);
+    traverse_labels_and_resolve(tagged_kmers, found_labels);
+    //printf("range=%u ", range);
+    if (range == 0) {
+      assert(num_traversed == seq.length()-ksize()+1);
+    }
+    tagged_kmers.clear();
+    return num_traversed;
+}
+
+LabelPtrSet LabelHash::get_tag_labels(const HashIntoType& tag) {
+  LabelPtrSet labels;
+  unsigned int num_labels;
+  _get_tag_labels(tag, tag_labels, labels);
+  return labels;
+}
+
+TagPtrSet LabelHash::get_label_tags(const Label& label) {
+  TagPtrSet tags;
+  unsigned int num_tags;
+  _get_tags_from_label(label, label_tag_ptrs, tags);
+  return tags;
+}
+
+void LabelHash::traverse_labels_and_resolve(const SeenSet& tagged_kmers,
+                                              LabelPtrSet& found_labels) {
+  
+  SeenSet::const_iterator si;
+  unsigned int num_labels = 0;
+  for (si=tagged_kmers.begin(); si!=tagged_kmers.end(); ++si) {
+    HashIntoType tag = *si;
+    // get the labels associated with this tag
+    num_labels = _get_tag_labels(tag, tag_labels, found_labels);
+    if (num_labels > 1) {
+      // reconcile labels
+      // for now do nothing ha
+    }
+  }
+}
+
+

From 605c04e7fe62de19dfaf5aadfd9fc1d8b1eb170f Mon Sep 17 00:00:00 2001
From: Camille Scott <camille.scott.w@gmail.com>
Date: Wed, 11 Dec 2013 17:01:11 -0500
Subject: [PATCH 104/140] moved parse pid to read_parsers file, fixed syntax
 errors in labelhash, added labelhash to setup.py

---
 lib/hashtable.cc    | 20 --------------------
 lib/labelhash.cc    |  4 ++--
 lib/labelhash.hh    | 11 +++++++----
 lib/read_parsers.cc |  1 -
 lib/read_parsers.hh | 24 ++++++++++++++++++++++--
 5 files changed, 31 insertions(+), 29 deletions(-)

diff --git a/lib/hashtable.cc b/lib/hashtable.cc
index c8c075143c..7f4ea47d90 100644
--- a/lib/hashtable.cc
+++ b/lib/hashtable.cc
@@ -723,26 +723,6 @@ void Hashtable::divide_tags_into_subsets(unsigned int subset_size,
   }
 }
 
-static PartitionID _parse_partition_id(string name)
-{
-  PartitionID p = 0;
-  const char * s = name.c_str() + name.length() - 1;
-  assert(*(s + 1) == (unsigned int) NULL);
-
-  while(*s != '\t' && s >= name.c_str()) {
-    s--;
-  }
-
-  if (*s == '\t') {
-    p = (PartitionID) atoi(s + 1);
-  } else {
-    cerr << "consume_partitioned_fasta barfed on read "  << name << "\n";
-    assert(0);
-  }
-
-  return p;
-}
-
 //
 // consume_partitioned_fasta: consume a FASTA file of reads
 //
diff --git a/lib/labelhash.cc b/lib/labelhash.cc
index de57042695..b3d9a15ea8 100644
--- a/lib/labelhash.cc
+++ b/lib/labelhash.cc
@@ -326,14 +326,14 @@ unsigned int LabelHash::sweep_label_neighborhood(const std::string& seq,
 
 LabelPtrSet LabelHash::get_tag_labels(const HashIntoType& tag) {
   LabelPtrSet labels;
-  unsigned int num_labels;
+  //unsigned int num_labels;
   _get_tag_labels(tag, tag_labels, labels);
   return labels;
 }
 
 TagPtrSet LabelHash::get_label_tags(const Label& label) {
   TagPtrSet tags;
-  unsigned int num_tags;
+  //unsigned int num_tags;
   _get_tags_from_label(label, label_tag_ptrs, tags);
   return tags;
 }
diff --git a/lib/labelhash.hh b/lib/labelhash.hh
index 670b9f1c34..960d64158d 100644
--- a/lib/labelhash.hh
+++ b/lib/labelhash.hh
@@ -7,13 +7,16 @@
 #ifndef LABELHASH_HH
 #define LABELHASH_HH
 
+#include <string>
+
 #include "khmer.hh"
+#include "hashbits.hh"
 
 namespace khmer {
     
-    class LabelHash : public khmer:Hashbits {
+    class LabelHash : public khmer::Hashbits {
     protected:
-        LabelHash( WordLength ksize, std::vector<HashIntoType& tablesizes)
+        LabelHash( WordLength ksize, std::vector<HashIntoType>& tablesizes)
         : khmer::Hashbits(ksize, tablesizes)
         {
             // constructor
@@ -137,7 +140,7 @@ namespace khmer {
         void traverse_labels_and_resolve(const SeenSet& tagged_kmers,
                                          LabelPtrSet& found_labels);
 
-    }
+    };
 }
 
 #define ACQUIRE_TAG_COLORS_SPIN_LOCK \
@@ -146,4 +149,4 @@ namespace khmer {
 #define RELEASE_TAG_COLORS_SPIN_LOCK \
   __sync_bool_compare_and_swap( &_tag_labels_spin_lock, 1, 0);
 
-#endif LABELHASH_HH
+#endif
diff --git a/lib/read_parsers.cc b/lib/read_parsers.cc
index e9449d526d..7d0f1aef56 100644
--- a/lib/read_parsers.cc
+++ b/lib/read_parsers.cc
@@ -1936,7 +1936,6 @@ _is_valid_read_pair(
 		    ==	the_read_pair.second.name.substr( 0, match_1.rm_so ));
 }
 
-
 } // namespace read_parsers
 
 
diff --git a/lib/read_parsers.hh b/lib/read_parsers.hh
index c2a07f90c5..e8ca7e7968 100644
--- a/lib/read_parsers.hh
+++ b/lib/read_parsers.hh
@@ -10,7 +10,7 @@
 
 #include <cassert>
 #include <cstdarg>
-
+#include <iostream>
 #include <string>
 #include <utility>
 
@@ -28,7 +28,6 @@ extern "C"
 #include "trace_logger.hh"
 #include "perf_metrics.hh"
 
-
 namespace khmer
 {
 
@@ -544,6 +543,27 @@ struct FastqParser : public IParser
 
 };
 
+static PartitionID _parse_partition_id(std::string name)
+{
+  PartitionID p = 0;
+  const char * s = name.c_str() + name.length() - 1;
+  assert(*(s + 1) == (unsigned int) NULL);
+
+  while(*s != '\t' && s >= name.c_str()) {
+    s--;
+  }
+
+  if (*s == '\t') {
+    p = (PartitionID) atoi(s + 1);
+  } else {
+    std::cerr << "consume_partitioned_fasta barfed on read "  << name << "\n";
+    assert(0);
+  }
+
+  return p;
+}
+
+
 
 } // namespace read_parsers
 

From 4127089cb8b4c44a809f6001b4b548aba1a1cabc Mon Sep 17 00:00:00 2001
From: Camille Scott <camille.scott.w@gmail.com>
Date: Thu, 12 Dec 2013 03:37:38 -0500
Subject: [PATCH 105/140] added labelhash defs to khmermodule, started
 stripping labeling methods from hashbits, definitely fails tests

---
 khmer/_khmermodule.cc | 819 ++++++++++++++++++++++++------------------
 1 file changed, 474 insertions(+), 345 deletions(-)

diff --git a/khmer/_khmermodule.cc b/khmer/_khmermodule.cc
index f8f7cb39b3..cb85fad1cc 100644
--- a/khmer/_khmermodule.cc
+++ b/khmer/_khmermodule.cc
@@ -21,6 +21,7 @@
 #include "hashbits.hh"
 #include "storage.hh"
 #include "aligner.hh"
+#include "labelhash.hh"
 
 //
 // Function necessary for Python loading:
@@ -3853,265 +3854,6 @@ static PyObject * hashbits_get_median_count(PyObject * self, PyObject * args)
   return Py_BuildValue("iff", med, average, stddev);
 }
 
-static PyObject * hashbits_get_label_dict(PyObject * self, PyObject * args) {
-  khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
-  khmer::Hashbits * hb = me->hashbits;
-  
-  PyObject * d = PyDict_New();
-  khmer::LabelPtrMap::iterator it;
-  
-  for (it = hb->label_ptrs.begin(); it!=hb->label_ptrs.end(); ++it) {
-    PyDict_SetItem(d, Py_BuildValue("K", it->first), Py_BuildValue("K", it->second));
-  }
-  
-  return d;
-}
-
-static PyObject * hashbits_consume_fasta_and_tag_with_labels(PyObject * self, PyObject * args)
-{
-  khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
-  khmer::Hashbits * hb = me->hashbits;
-  
-  std::ofstream outfile;
-  
-  char * filename;
-  PyObject * callback_obj = NULL;
-
-  if (!PyArg_ParseTuple(args, "s|O", &filename, &callback_obj)) {
-    return NULL;
-  }
-  
-  unsigned long long n_consumed;
-  unsigned int total_reads;
-  bool exc_raised = false;
-  
-  //Py_BEGIN_ALLOW_THREADS
-  try {
-    hb->consume_fasta_and_tag_with_labels(filename, total_reads, n_consumed,
-                                                _report_fn, callback_obj);
-  } catch (_khmer_signal &e) {
-    exc_raised = true;
-  }
-  //Py_END_ALLOW_THREADS
-  if (exc_raised) return NULL;
-  
-  return Py_BuildValue("iL", total_reads, n_consumed);
-  
-}
-
-static PyObject * hashbits_consume_partitioned_fasta_and_tag_with_labels(
-                                            PyObject * self, PyObject * args)
-{
-  khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
-  khmer::Hashbits * hashbits = me->hashbits;
-
-  char * filename;
-  PyObject * callback_obj = NULL;
-
-  if (!PyArg_ParseTuple(args, "s|O", &filename, &callback_obj)) {
-    return NULL;
-  }
-
-  // call the C++ function, and trap signals => Python
-
-  unsigned long long n_consumed;
-  unsigned int total_reads;
-
-  try {
-    hashbits->consume_partitioned_fasta_and_tag_with_labels(filename, 
-    total_reads, n_consumed, _report_fn, callback_obj);
-  } catch (_khmer_signal &e) {
-    return NULL;
-  }
-
-  return Py_BuildValue("iK", total_reads, n_consumed);
-}
-
-static PyObject * hashbits_consume_sequence_and_tag_with_labels(PyObject * self, PyObject * args) {
-  khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
-  khmer::Hashbits * hb = me->hashbits;
-  
-  char * seq = NULL;
-  unsigned long long c = NULL;
-  if (!PyArg_ParseTuple(args, "sK", &seq, &c)) {
-    return NULL;
-  }
-  
-  unsigned long long n_consumed = 0;
-  khmer::Label * the_label = hb->check_and_allocate_label(c);
-
-  try { 
-  //if (hb->check_and_normalize_read(seq)) {
-    
-    hb->consume_sequence_and_tag_with_labels(seq, n_consumed, *the_label);
-  //}
-  } catch (_khmer_signal &e) {
-    return NULL;
-  }
-  return Py_BuildValue("L", n_consumed);
-}
-
-static PyObject * hashbits_sweep_label_neighborhood(PyObject * self, PyObject * args) {
-  khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
-  khmer::Hashbits * hb = me->hashbits;
-  
-  char * seq = NULL;
-  unsigned int r = NULL;
-  PyObject * break_on_stop_tags_o = NULL;
-  PyObject * stop_big_traversals_o = NULL;
-
-  if (!PyArg_ParseTuple(args, "s|iOO", &seq, &r,
-			&break_on_stop_tags_o,
-			&stop_big_traversals_o)) {
-    return NULL;
-  }
-
-  unsigned int range = (2 * hb->_get_tag_density()) + 1;
-  if (r >= 0) {
-    range = r;
-  }
-
-  bool break_on_stop_tags = false;
-  if (break_on_stop_tags_o && PyObject_IsTrue(break_on_stop_tags_o)) {
-    break_on_stop_tags = true;
-  }
-  bool stop_big_traversals = false;
-  if (stop_big_traversals_o && PyObject_IsTrue(stop_big_traversals_o)) {
-    stop_big_traversals = true;
-  }
-  
-  if (strlen(seq) < hb->ksize()) {
-    return NULL;
-  }
-  
-  //std::pair<TagLabelPtrPair::iterator, TagLabelPtrPair::iterator> ret;
-  LabelPtrSet found_labels;
-  
-  bool exc_raised = false;
-  unsigned int num_traversed = 0;
-  //Py_BEGIN_ALLOW_THREADS
-  try {
-    num_traversed = hb->sweep_label_neighborhood(seq, found_labels, range, break_on_stop_tags, stop_big_traversals);
-  } catch (_khmer_signal &e) {
-    exc_raised = true;
-  }
-  //Py_END_ALLOW_THREADS
-  
-  //printf("...%u kmers traversed\n", num_traversed);
-  
-  if (exc_raised) return NULL;
-  
-  PyObject * x =  PyList_New(found_labels.size());
-  khmer::LabelPtrSet::const_iterator si;
-  unsigned long long i = 0;
-  for (si=found_labels.begin(); si!=found_labels.end(); ++si) {
-    PyList_SET_ITEM(x, i, Py_BuildValue("K", *(*si)));
-    i++;
-  }
-  
-  return x;
-}
-
-
-// Similar to find_all_tags, but returns tags in a way actually useable by python
-// need a tags_in_sequence iterator or function in c++ land for reuse in all
-// these functions
-static PyObject * hashbits_sweep_tag_neighborhood(PyObject * self, PyObject *args)
-{
-  khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
-  khmer::Hashbits * hashbits = me->hashbits;
-
-  char * seq = NULL;
-  unsigned int r = NULL;
-  PyObject * break_on_stop_tags_o = NULL;
-  PyObject * stop_big_traversals_o = NULL;
-
-  if (!PyArg_ParseTuple(args, "s|iOO", &seq, &r,
-			&break_on_stop_tags_o,
-			&stop_big_traversals_o)) {
-    return NULL;
-  }
-
-  unsigned int range = (2 * hashbits->_get_tag_density()) + 1;
-  if (r >= 0) {
-    range = r;
-  }
-
-  bool break_on_stop_tags = false;
-  if (break_on_stop_tags_o && PyObject_IsTrue(break_on_stop_tags_o)) {
-    break_on_stop_tags = true;
-  }
-  bool stop_big_traversals = false;
-  if (stop_big_traversals_o && PyObject_IsTrue(stop_big_traversals_o)) {
-    stop_big_traversals = true;
-  }
-  
-  if (strlen(seq) < hashbits->ksize()) {
-    return NULL;
-  }
-
-  khmer::SeenSet tagged_kmers;
-
-  //Py_BEGIN_ALLOW_THREADS
-
-  hashbits->partition->sweep_for_tags(seq, tagged_kmers, 
-            hashbits->all_tags, range, break_on_stop_tags, stop_big_traversals);
-
-  //Py_END_ALLOW_THREADS
-
-  PyObject * x =  PyList_New(tagged_kmers.size());
-  khmer::SeenSet::const_iterator si;
-  unsigned long long i = 0;
-  for (si=tagged_kmers.begin(); si!=tagged_kmers.end(); ++si) {
-    //std::string kmer_s = _revhash(*si, hashbits->ksize());
-    // type K for python unsigned long long
-    PyList_SET_ITEM(x, i, Py_BuildValue("K", *si));
-    i++;
-  }
-
-  return x;
-}
-
-
-static PyObject * hashbits_get_tag_labels(PyObject * self, PyObject * args) {
-  
-  khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
-  khmer::Hashbits * hashbits = me->hashbits;
-  
-  khmer::HashIntoType tag;
-  
-  if (!PyArg_ParseTuple(args, "K", &tag)) {
-    return NULL;
-  }
-  
-  khmer::LabelPtrSet labels;
-  
-  labels = hashbits->get_tag_labels(tag);
-  
-  PyObject * x =  PyList_New(labels.size());
-  khmer::LabelPtrSet::const_iterator si;
-  unsigned long long i = 0;
-  for (si=labels.begin(); si!=labels.end(); ++si) {
-    //std::string kmer_s = _revhash(*si, hashbits->ksize());
-    PyList_SET_ITEM(x, i, Py_BuildValue("K", *(*si)));
-    i++;
-  }
-
-  return x;
-}
-
-static PyObject * hashbits_n_labels(PyObject * self, PyObject * args)
-{
-  khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
-  khmer::Hashbits * hashbits = me->hashbits;
-
-  if (!PyArg_ParseTuple(args, "")) {
-    return NULL;
-  }
-
-  return PyInt_FromLong(hashbits->n_labels());
-}
-
 static PyMethodDef khmer_hashbits_methods[] = {
   { "extract_unique_paths", hashbits_extract_unique_paths, METH_VARARGS, "" },
   { "ksize", hashbits_get_ksize, METH_VARARGS, "" },
@@ -4180,15 +3922,6 @@ static PyMethodDef khmer_hashbits_methods[] = {
   { "traverse_from_tags", hashbits_traverse_from_tags, METH_VARARGS, "" },
   { "repartition_largest_partition", hashbits_repartition_largest_partition, METH_VARARGS, "" },
   { "get_median_count", hashbits_get_median_count, METH_VARARGS, "Get the median, average, and stddev of the k-mer counts in the string" },
-  { "consume_fasta_and_tag_with_labels", hashbits_consume_fasta_and_tag_with_labels, METH_VARARGS, "" },
-  { "sweep_label_neighborhood", hashbits_sweep_label_neighborhood, METH_VARARGS, "" },
-  {"consume_partitioned_fasta_and_tag_with_labels", hashbits_consume_partitioned_fasta_and_tag_with_labels, METH_VARARGS, "" },
-  {"sweep_tag_neighborhood", hashbits_sweep_tag_neighborhood, METH_VARARGS, "" },
-  {"get_tag_labels", hashbits_get_tag_labels, METH_VARARGS, ""},
-  {"consume_sequence_and_tag_with_labels", hashbits_consume_sequence_and_tag_with_labels, METH_VARARGS, "" },
-  {"n_labels", hashbits_n_labels, METH_VARARGS, ""},
-  {"get_label_dict", hashbits_get_label_dict, METH_VARARGS, "" },
- 
   {NULL, NULL, 0, NULL}           /* sentinel */
 };
 
@@ -4238,134 +3971,484 @@ static PyObject * subset_compare_partitions(PyObject * self,
   khmer_KSubsetPartitionObject * me = (khmer_KSubsetPartitionObject *) self;
   khmer::SubsetPartition * subset1_p = me->subset;
 
-  PyObject * subset2_obj = NULL;
-  unsigned int pid1, pid2;	// @CTB ensure that these are unsigned?
+  PyObject * subset2_obj = NULL;
+  unsigned int pid1, pid2;	// @CTB ensure that these are unsigned?
+
+  if (!PyArg_ParseTuple(args, "iOi",
+			&pid1, &subset2_obj, &pid2)) {
+    return NULL;
+  }
+
+  khmer_KSubsetPartitionObject *other = (khmer_KSubsetPartitionObject *) subset2_obj;
+  khmer::SubsetPartition * subset2_p = other->subset;
+
+  unsigned int n_only1 = 0, n_only2 = 0, n_shared = 0;
+  subset1_p->compare_to_partition((PartitionID) pid1,
+				  subset2_p, (PartitionID) pid2,
+				  n_only1, n_only2, n_shared);
+
+  return Py_BuildValue("iii", n_only1, n_only2, n_shared);
+}
+
+static PyObject * subset_partition_size_distribution(PyObject * self,
+						     PyObject * args)
+{
+  khmer_KSubsetPartitionObject * me = (khmer_KSubsetPartitionObject *) self;
+  khmer::SubsetPartition * subset_p = me->subset;
+
+  if (!PyArg_ParseTuple(args, "")) {
+    return NULL;
+  }
+  
+  khmer::PartitionCountDistribution d;
+
+  unsigned int n_unassigned = 0;
+  subset_p->partition_size_distribution(d, n_unassigned);
+
+  PyObject * x = PyList_New(d.size());
+  khmer::PartitionCountDistribution::const_iterator di;
+
+  unsigned int i;
+  for (i = 0, di = d.begin(); di != d.end(); di++, i++) {
+    PyList_SET_ITEM(x, i, Py_BuildValue("LL", di->first, di->second));
+  }
+  assert (i == d.size());
+
+  return Py_BuildValue("Oi", x, n_unassigned);
+}
+
+static PyObject * subset_partition_sizes(PyObject * self,
+					 PyObject * args)
+{
+  khmer_KSubsetPartitionObject * me = (khmer_KSubsetPartitionObject *) self;
+  khmer::SubsetPartition * subset_p = me->subset;
+
+  unsigned int min_size = 0;
+
+  if (!PyArg_ParseTuple(args, "|i", &min_size)) {
+    return NULL;
+  }
+  
+  khmer::PartitionCountMap cm;
+  unsigned int n_unassigned = 0;
+  subset_p->partition_sizes(cm, n_unassigned);
+
+  unsigned int i;
+  khmer::PartitionCountMap::const_iterator mi;
+  for (i = 0, mi = cm.begin(); mi != cm.end(); mi++) {
+    if (mi->second >= min_size) i++;
+  }
+
+  PyObject * x = PyList_New(i);
+
+  // this should probably be a dict. @CTB
+  for (i = 0, mi = cm.begin(); mi != cm.end(); mi++) {
+    if (mi->second >= min_size) {
+      PyList_SET_ITEM(x, i, Py_BuildValue("LL", mi->first, mi->second));
+      i++;
+    }
+  }
+
+  return Py_BuildValue("Oi", x, n_unassigned);
+}
+
+static PyObject * subset_partition_average_coverages(PyObject * self,
+						     PyObject * args)
+{
+  khmer_KSubsetPartitionObject * me = (khmer_KSubsetPartitionObject *) self;
+  khmer::SubsetPartition * subset_p = me->subset;
+
+  PyObject * counting_o;
+
+  if (!PyArg_ParseTuple(args, "O", &counting_o)) {
+    return NULL;
+  }
+  
+  khmer::CountingHash * counting = ((khmer_KCountingHashObject *) counting_o)->counting;
+  
+  khmer::PartitionCountMap cm;
+  subset_p->partition_average_coverages(cm, counting);
+
+  unsigned int i;
+  khmer::PartitionCountMap::const_iterator mi;
+
+  PyObject * x = PyList_New(cm.size());
+
+  // this should probably be a dict. @CTB
+  for (i = 0, mi = cm.begin(); mi != cm.end(); mi++, i++) {
+    PyList_SET_ITEM(x, i, Py_BuildValue("LL", mi->first, mi->second));
+  }
+
+  return Py_BuildValue("O", x);
+}
+
+static PyMethodDef khmer_subset_methods[] = {
+  { "count_partitions", subset_count_partitions, METH_VARARGS, "" },
+  { "report_on_partitions", subset_report_on_partitions, METH_VARARGS, "" },
+  { "compare_partitions", subset_compare_partitions, METH_VARARGS, "" },
+  { "partition_size_distribution", subset_partition_size_distribution, METH_VARARGS, "" },
+  { "partition_sizes", subset_partition_sizes, METH_VARARGS, "" },
+  { "partition_average_coverages", subset_partition_average_coverages, METH_VARARGS, "" },
+  {NULL, NULL, 0, NULL}           /* sentinel */
+};
+
+static PyObject *
+khmer_subset_getattr(PyObject * obj, char * name)
+{
+  return Py_FindMethod(khmer_subset_methods, obj, name);
+}
+
+/////////////////
+// LabelHash
+/////////////////
+
+// LabelHash addition
+typedef struct {
+  PyObject_HEAD
+
+  /* @camillescott late night notes:
+     need to experiment. might be able to call hashbits py methods
+     directly with the labelhash object, because they all instantiate
+     a new hashbits pointer on themselves to call the functions and labelhash
+     inherits from hashbits; or, we define a hashbits object as part of this struct
+     as called for in the c-api reference. need to grok that still.
+     If this is how it's done, remove PyObject_HEAD, which will already be included
+     in the base class struct.
+     See http://docs.python.org/2.7/extending/newtypes.html#subclassing-other-types
+     for details...
+  */
+  LabelHash * labelhash;
+} khmer_KLabelHashObject;
+
+static void khmer_labelhash_dealloc(PyObject *);
+static PyObject * khmer_labelhash_getattr(PyObject * obj, char * name);
+
+static PyTypeObject khmer_KLabelHashType = {
+    PyObject_HEAD_INIT(NULL)
+    0,
+    "KLabelHash", sizeof(khmer_KLabelHashObject),
+    0,
+    khmer_labelhash_dealloc,	/*tp_dealloc*/
+    0,				/*tp_print*/
+    khmer_labelhash_getattr,	/*tp_getattr*/
+    0,				/*tp_setattr*/
+    0,				/*tp_compare*/
+    0,				/*tp_repr*/
+    0,				/*tp_as_number*/
+    0,				/*tp_as_sequence*/
+    0,				/*tp_as_mapping*/
+    0,				/*tp_hash */
+    0,				/*tp_call*/
+    0,				/*tp_str*/
+    0,				/*tp_getattro*/
+    0,				/*tp_setattro*/
+    0,				/*tp_as_buffer*/
+    Py_TPFLAGS_DEFAULT,		/*tp_flags*/
+    "labelhash object",           /* tp_doc */
+};
+
+#define is_labelhash_obj(v)  ((v)->ob_type == &khmer_KLabelHashType)
+
+
+static PyObject * labelhash_get_label_dict(PyObject * self, PyObject * args) {
+  khmer_KLabelHashObject * me = (khmer_KLabelHashObject *) self;
+  khmer::LabelHash * hb = me->labelhash;
+  
+  PyObject * d = PyDict_New();
+  khmer::LabelPtrMap::iterator it;
+  
+  for (it = hb->label_ptrs.begin(); it!=hb->label_ptrs.end(); ++it) {
+    PyDict_SetItem(d, Py_BuildValue("K", it->first), Py_BuildValue("K", it->second));
+  }
+  
+  return d;
+}
+
+static PyObject * labelhash_consume_fasta_and_tag_with_labels(PyObject * self, PyObject * args)
+{
+  khmer_KLabelHashObject * me = (khmer_KLabelHashObject *) self;
+  khmer::LabelHash * hb = me->labelhash;
+  
+  std::ofstream outfile;
+  
+  char * filename;
+  PyObject * callback_obj = NULL;
+
+  if (!PyArg_ParseTuple(args, "s|O", &filename, &callback_obj)) {
+    return NULL;
+  }
+  
+  unsigned long long n_consumed;
+  unsigned int total_reads;
+  bool exc_raised = false;
+  
+  //Py_BEGIN_ALLOW_THREADS
+  try {
+    hb->consume_fasta_and_tag_with_labels(filename, total_reads, n_consumed,
+                                                _report_fn, callback_obj);
+  } catch (_khmer_signal &e) {
+    exc_raised = true;
+  }
+  //Py_END_ALLOW_THREADS
+  if (exc_raised) return NULL;
+  
+  return Py_BuildValue("iL", total_reads, n_consumed);
+  
+}
+
+static PyObject * labelhash_consume_partitioned_fasta_and_tag_with_labels(
+                                            PyObject * self, PyObject * args)
+{
+  khmer_KLabelHashObject * me = (khmer_KLabelHashObject *) self;
+  khmer::LabelHash * labelhash = me->labelhash;
+
+  char * filename;
+  PyObject * callback_obj = NULL;
 
-  if (!PyArg_ParseTuple(args, "iOi",
-			&pid1, &subset2_obj, &pid2)) {
+  if (!PyArg_ParseTuple(args, "s|O", &filename, &callback_obj)) {
     return NULL;
   }
 
-  khmer_KSubsetPartitionObject *other = (khmer_KSubsetPartitionObject *) subset2_obj;
-  khmer::SubsetPartition * subset2_p = other->subset;
+  // call the C++ function, and trap signals => Python
 
-  unsigned int n_only1 = 0, n_only2 = 0, n_shared = 0;
-  subset1_p->compare_to_partition((PartitionID) pid1,
-				  subset2_p, (PartitionID) pid2,
-				  n_only1, n_only2, n_shared);
+  unsigned long long n_consumed;
+  unsigned int total_reads;
 
-  return Py_BuildValue("iii", n_only1, n_only2, n_shared);
-}
+  try {
+    labelhash->consume_partitioned_fasta_and_tag_with_labels(filename, 
+    total_reads, n_consumed, _report_fn, callback_obj);
+  } catch (_khmer_signal &e) {
+    return NULL;
+  }
 
-static PyObject * subset_partition_size_distribution(PyObject * self,
-						     PyObject * args)
-{
-  khmer_KSubsetPartitionObject * me = (khmer_KSubsetPartitionObject *) self;
-  khmer::SubsetPartition * subset_p = me->subset;
+  return Py_BuildValue("iK", total_reads, n_consumed);
+}
 
-  if (!PyArg_ParseTuple(args, "")) {
+static PyObject * labelhash_consume_sequence_and_tag_with_labels(PyObject * self, PyObject * args) {
+  khmer_KLabelHashObject * me = (khmer_KLabelHashObject *) self;
+  khmer::LabelHash * hb = me->labelhash;
+  
+  char * seq = NULL;
+  unsigned long long c = NULL;
+  if (!PyArg_ParseTuple(args, "sK", &seq, &c)) {
     return NULL;
   }
   
-  khmer::PartitionCountDistribution d;
+  unsigned long long n_consumed = 0;
+  khmer::Label * the_label = hb->check_and_allocate_label(c);
 
-  unsigned int n_unassigned = 0;
-  subset_p->partition_size_distribution(d, n_unassigned);
+  try { 
+  //if (hb->check_and_normalize_read(seq)) {
+    
+    hb->consume_sequence_and_tag_with_labels(seq, n_consumed, *the_label);
+  //}
+  } catch (_khmer_signal &e) {
+    return NULL;
+  }
+  return Py_BuildValue("L", n_consumed);
+}
 
-  PyObject * x = PyList_New(d.size());
-  khmer::PartitionCountDistribution::const_iterator di;
+static PyObject * labelhash_sweep_label_neighborhood(PyObject * self, PyObject * args) {
+  khmer_KLabelHashObject * me = (khmer_KLabelHashObject *) self;
+  khmer::LabelHash * hb = me->labelhash;
+  
+  char * seq = NULL;
+  unsigned int r = NULL;
+  PyObject * break_on_stop_tags_o = NULL;
+  PyObject * stop_big_traversals_o = NULL;
 
-  unsigned int i;
-  for (i = 0, di = d.begin(); di != d.end(); di++, i++) {
-    PyList_SET_ITEM(x, i, Py_BuildValue("LL", di->first, di->second));
+  if (!PyArg_ParseTuple(args, "s|iOO", &seq, &r,
+			&break_on_stop_tags_o,
+			&stop_big_traversals_o)) {
+    return NULL;
   }
-  assert (i == d.size());
 
-  return Py_BuildValue("Oi", x, n_unassigned);
+  unsigned int range = (2 * hb->_get_tag_density()) + 1;
+  if (r >= 0) {
+    range = r;
+  }
+
+  bool break_on_stop_tags = false;
+  if (break_on_stop_tags_o && PyObject_IsTrue(break_on_stop_tags_o)) {
+    break_on_stop_tags = true;
+  }
+  bool stop_big_traversals = false;
+  if (stop_big_traversals_o && PyObject_IsTrue(stop_big_traversals_o)) {
+    stop_big_traversals = true;
+  }
+  
+  if (strlen(seq) < hb->ksize()) {
+    return NULL;
+  }
+  
+  //std::pair<TagLabelPtrPair::iterator, TagLabelPtrPair::iterator> ret;
+  LabelPtrSet found_labels;
+  
+  bool exc_raised = false;
+  unsigned int num_traversed = 0;
+  //Py_BEGIN_ALLOW_THREADS
+  try {
+    num_traversed = hb->sweep_label_neighborhood(seq, found_labels, range, break_on_stop_tags, stop_big_traversals);
+  } catch (_khmer_signal &e) {
+    exc_raised = true;
+  }
+  //Py_END_ALLOW_THREADS
+  
+  //printf("...%u kmers traversed\n", num_traversed);
+  
+  if (exc_raised) return NULL;
+  
+  PyObject * x =  PyList_New(found_labels.size());
+  khmer::LabelPtrSet::const_iterator si;
+  unsigned long long i = 0;
+  for (si=found_labels.begin(); si!=found_labels.end(); ++si) {
+    PyList_SET_ITEM(x, i, Py_BuildValue("K", *(*si)));
+    i++;
+  }
+  
+  return x;
 }
 
-static PyObject * subset_partition_sizes(PyObject * self,
-					 PyObject * args)
+
+// Similar to find_all_tags, but returns tags in a way actually useable by python
+// need a tags_in_sequence iterator or function in c++ land for reuse in all
+// these functions
+static PyObject * labelhash_sweep_tag_neighborhood(PyObject * self, PyObject *args)
 {
-  khmer_KSubsetPartitionObject * me = (khmer_KSubsetPartitionObject *) self;
-  khmer::SubsetPartition * subset_p = me->subset;
+  khmer_KLabelHashObject * me = (khmer_KLabelHashObject *) self;
+  khmer::LabelHash * labelhash = me->labelhash;
 
-  unsigned int min_size = 0;
+  char * seq = NULL;
+  unsigned int r = NULL;
+  PyObject * break_on_stop_tags_o = NULL;
+  PyObject * stop_big_traversals_o = NULL;
 
-  if (!PyArg_ParseTuple(args, "|i", &min_size)) {
+  if (!PyArg_ParseTuple(args, "s|iOO", &seq, &r,
+			&break_on_stop_tags_o,
+			&stop_big_traversals_o)) {
     return NULL;
   }
-  
-  khmer::PartitionCountMap cm;
-  unsigned int n_unassigned = 0;
-  subset_p->partition_sizes(cm, n_unassigned);
 
-  unsigned int i;
-  khmer::PartitionCountMap::const_iterator mi;
-  for (i = 0, mi = cm.begin(); mi != cm.end(); mi++) {
-    if (mi->second >= min_size) i++;
+  unsigned int range = (2 * labelhash->_get_tag_density()) + 1;
+  if (r >= 0) {
+    range = r;
   }
 
-  PyObject * x = PyList_New(i);
+  bool break_on_stop_tags = false;
+  if (break_on_stop_tags_o && PyObject_IsTrue(break_on_stop_tags_o)) {
+    break_on_stop_tags = true;
+  }
+  bool stop_big_traversals = false;
+  if (stop_big_traversals_o && PyObject_IsTrue(stop_big_traversals_o)) {
+    stop_big_traversals = true;
+  }
+  
+  if (strlen(seq) < labelhash->ksize()) {
+    return NULL;
+  }
 
-  // this should probably be a dict. @CTB
-  for (i = 0, mi = cm.begin(); mi != cm.end(); mi++) {
-    if (mi->second >= min_size) {
-      PyList_SET_ITEM(x, i, Py_BuildValue("LL", mi->first, mi->second));
-      i++;
-    }
+  khmer::SeenSet tagged_kmers;
+
+  //Py_BEGIN_ALLOW_THREADS
+
+  labelhash->partition->sweep_for_tags(seq, tagged_kmers, 
+            labelhash->all_tags, range, break_on_stop_tags, stop_big_traversals);
+
+  //Py_END_ALLOW_THREADS
+
+  PyObject * x =  PyList_New(tagged_kmers.size());
+  khmer::SeenSet::const_iterator si;
+  unsigned long long i = 0;
+  for (si=tagged_kmers.begin(); si!=tagged_kmers.end(); ++si) {
+    //std::string kmer_s = _revhash(*si, labelhash->ksize());
+    // type K for python unsigned long long
+    PyList_SET_ITEM(x, i, Py_BuildValue("K", *si));
+    i++;
   }
 
-  return Py_BuildValue("Oi", x, n_unassigned);
+  return x;
 }
 
-static PyObject * subset_partition_average_coverages(PyObject * self,
-						     PyObject * args)
-{
-  khmer_KSubsetPartitionObject * me = (khmer_KSubsetPartitionObject *) self;
-  khmer::SubsetPartition * subset_p = me->subset;
-
-  PyObject * counting_o;
 
-  if (!PyArg_ParseTuple(args, "O", &counting_o)) {
+static PyObject * labelhash_get_tag_labels(PyObject * self, PyObject * args) {
+  
+  khmer_KLabelHashObject * me = (khmer_KLabelHashObject *) self;
+  khmer::LabelHash * labelhash = me->labelhash;
+  
+  khmer::HashIntoType tag;
+  
+  if (!PyArg_ParseTuple(args, "K", &tag)) {
     return NULL;
   }
   
-  khmer::CountingHash * counting = ((khmer_KCountingHashObject *) counting_o)->counting;
+  khmer::LabelPtrSet labels;
   
-  khmer::PartitionCountMap cm;
-  subset_p->partition_average_coverages(cm, counting);
+  labels = labelhash->get_tag_labels(tag);
+  
+  PyObject * x =  PyList_New(labels.size());
+  khmer::LabelPtrSet::const_iterator si;
+  unsigned long long i = 0;
+  for (si=labels.begin(); si!=labels.end(); ++si) {
+    //std::string kmer_s = _revhash(*si, labelhash->ksize());
+    PyList_SET_ITEM(x, i, Py_BuildValue("K", *(*si)));
+    i++;
+  }
 
-  unsigned int i;
-  khmer::PartitionCountMap::const_iterator mi;
+  return x;
+}
 
-  PyObject * x = PyList_New(cm.size());
+static PyObject * labelhash_n_labels(PyObject * self, PyObject * args)
+{
+  khmer_KLabelHashObject * me = (khmer_KLabelHashObject *) self;
+  khmer::LabelHash * labelhash = me->labelhash;
 
-  // this should probably be a dict. @CTB
-  for (i = 0, mi = cm.begin(); mi != cm.end(); mi++, i++) {
-    PyList_SET_ITEM(x, i, Py_BuildValue("LL", mi->first, mi->second));
+  if (!PyArg_ParseTuple(args, "")) {
+    return NULL;
   }
 
-  return Py_BuildValue("O", x);
+  return PyInt_FromLong(labelhash->n_labels());
 }
 
-static PyMethodDef khmer_subset_methods[] = {
-  { "count_partitions", subset_count_partitions, METH_VARARGS, "" },
-  { "report_on_partitions", subset_report_on_partitions, METH_VARARGS, "" },
-  { "compare_partitions", subset_compare_partitions, METH_VARARGS, "" },
-  { "partition_size_distribution", subset_partition_size_distribution, METH_VARARGS, "" },
-  { "partition_sizes", subset_partition_sizes, METH_VARARGS, "" },
-  { "partition_average_coverages", subset_partition_average_coverages, METH_VARARGS, "" },
+
+static PyMethodDef khmer_labelhash_methods[] = {
+  { "ksize", labelhash_get_ksize, METH_VARARGS, "" },
+  { "hashsizes", labelhash_get_hashsizes, METH_VARARGS, "" },
+  { "n_occupied", labelhash_n_occupied, METH_VARARGS, "Count the number of occupied bins" },
+  { "n_unique_kmers", labelhash_n_unique_kmers,  METH_VARARGS, "Count the number of unique kmers" },
+  { "count", labelhash_count, METH_VARARGS, "Count the given kmer" },
+  { "get", labelhash_get, METH_VARARGS, "Get the count for the given k-mer" },
+  { "kmer_degree", labelhash_kmer_degree, METH_VARARGS, "" },
+  { "load", labelhash_load, METH_VARARGS, "" },
+  { "save", labelhash_save, METH_VARARGS, "" },
+  { "load_tagset", labelhash_load_tagset, METH_VARARGS, "" },
+  { "save_tagset", labelhash_save_tagset, METH_VARARGS, "" },
+  { "n_tags", labelhash_n_tags, METH_VARARGS, "" },
+  { "_get_tag_density", labelhash__get_tag_density, METH_VARARGS, "" },
+  { "_set_tag_density", labelhash__set_tag_density, METH_VARARGS, "" },
+ { "consume_fasta_and_tag", labelhash_consume_fasta_and_tag, METH_VARARGS, "Count all k-mers in a given file" },
+  { "consume_fasta_and_tag_with_reads_parser", labelhash_consume_fasta_and_tag_with_reads_parser, 
+    METH_VARARGS, "Count all k-mers using a given reads parser" },
+ { "consume_partitioned_fasta", labelhash_consume_partitioned_fasta, METH_VARARGS, "Count all k-mers in a given file" },
+ { "consume_fasta_and_tag_with_labels", labelhash_consume_fasta_and_tag_with_labels, METH_VARARGS, "" },
+  { "sweep_label_neighborhood", labelhash_sweep_label_neighborhood, METH_VARARGS, "" },
+  {"consume_partitioned_fasta_and_tag_with_labels", labelhash_consume_partitioned_fasta_and_tag_with_labels, METH_VARARGS, "" },
+  {"sweep_tag_neighborhood", labelhash_sweep_tag_neighborhood, METH_VARARGS, "" },
+  {"get_tag_labels", labelhash_get_tag_labels, METH_VARARGS, ""},
+  {"consume_sequence_and_tag_with_labels", labelhash_consume_sequence_and_tag_with_labels, METH_VARARGS, "" },
+  {"n_labels", labelhash_n_labels, METH_VARARGS, ""},
+  {"get_label_dict", labelhash_get_label_dict, METH_VARARGS, "" },
+ 
   {NULL, NULL, 0, NULL}           /* sentinel */
 };
 
 static PyObject *
-khmer_subset_getattr(PyObject * obj, char * name)
+khmer_labelhash_getattr(PyObject * obj, char * name)
 {
-  return Py_FindMethod(khmer_subset_methods, obj, name);
+  return Py_FindMethod(khmer_labelhash_methods, obj, name);
 }
 
 
+
 //
 // GRAPHALIGN addition
 //
@@ -4549,6 +4632,37 @@ static PyObject* _new_hashbits(PyObject * self, PyObject * args)
   return (PyObject *) khashbits_obj;
 }
 
+//
+// new_labelhash
+//
+
+static PyObject* _new_labelhash(PyObject * self, PyObject * args)
+{
+  unsigned int k = 0;
+  PyObject* sizes_list_o = NULL;
+
+  if (!PyArg_ParseTuple(args, "IO", &k, &sizes_list_o)) {
+    return NULL;
+  }
+
+  std::vector<khmer::HashIntoType> sizes;
+  for (int i = 0; i < PyObject_Length(sizes_list_o); i++) {
+    PyObject * size_o = PyList_GET_ITEM(sizes_list_o, i);
+    sizes.push_back(PyLong_AsLongLong(size_o));
+  }
+
+  khmer_KLabelHash * klabelhash_obj = (khmer_KLabelHashObject *) \
+    PyObject_New(khmer_KLabelHashsObject, &khmer_KLabelHashType);
+
+  if (klabelhash_obj == NULL) {
+      return NULL;
+  }
+  
+  klabelhash_obj->labelhash = new khmer::LabelHash(k, sizes);
+
+  return (PyObject *) klabelhash_obj;
+}
+
 static PyObject * hash_collect_high_abundance_kmers(PyObject * self, PyObject * args)
 {
   khmer_KCountingHashObject * me = (khmer_KCountingHashObject *) self;
@@ -4607,6 +4721,21 @@ static void khmer_hashbits_dealloc(PyObject* self)
   
   PyObject_Del((PyObject *) obj);
 }
+
+
+//
+// khmer_labelhash_dealloc -- clean up a labelhash object.
+//
+
+static void khmer_hashbits_dealloc(PyObject* self)
+{
+  khmer_KLabelHashObject * obj = (khmer_LabelHashObject *) self;
+  delete obj->labelhash;
+  obj->labelhash = NULL;
+  
+  PyObject_Del((PyObject *) obj);
+}
+
 //
 // khmer_subset_dealloc -- clean up a hashbits object.
 //

From 09dd66a76e733fd67d670c1fdcdc62d1d31bdba7 Mon Sep 17 00:00:00 2001
From: Camille Scott <camille.scott.w@gmail.com>
Date: Tue, 17 Dec 2013 12:09:44 -0500
Subject: [PATCH 106/140] finished most of integration, added new tests,
 dealing with linker errors

---
 khmer/__init__.py       |   1 +
 khmer/_khmermodule.cc   | 326 ++++++++++++++++++++++++----------------
 lib/hashbits.hh         |   3 +-
 lib/hashtable.hh        |  65 +-------
 lib/labelhash.hh        |  27 ++--
 tests/test_hashbits.py  | 128 ----------------
 tests/test_labelhash.py | 143 ++++++++++++++++++
 7 files changed, 362 insertions(+), 331 deletions(-)
 create mode 100644 tests/test_labelhash.py

diff --git a/khmer/__init__.py b/khmer/__init__.py
index b6031527e6..01683832e0 100644
--- a/khmer/__init__.py
+++ b/khmer/__init__.py
@@ -15,6 +15,7 @@
 from _khmer import reverse_hash
 from _khmer import get_config
 from _khmer import ReadParser
+from _khmer import LabelHash
 
 from ._version import get_versions
 __version__ = get_versions()['version']
diff --git a/khmer/_khmermodule.cc b/khmer/_khmermodule.cc
index cb85fad1cc..cef9f7d724 100644
--- a/khmer/_khmermodule.cc
+++ b/khmer/_khmermodule.cc
@@ -1336,35 +1336,6 @@ typedef struct {
   khmer::Hashbits * hashbits;
 } khmer_KHashbitsObject;
 
-static void khmer_hashbits_dealloc(PyObject *);
-static PyObject * khmer_hashbits_getattr(PyObject * obj, char * name);
-
-static PyTypeObject khmer_KHashbitsType = {
-    PyObject_HEAD_INIT(NULL)
-    0,
-    "KHashbits", sizeof(khmer_KHashbitsObject),
-    0,
-    khmer_hashbits_dealloc,	/*tp_dealloc*/
-    0,				/*tp_print*/
-    khmer_hashbits_getattr,	/*tp_getattr*/
-    0,				/*tp_setattr*/
-    0,				/*tp_compare*/
-    0,				/*tp_repr*/
-    0,				/*tp_as_number*/
-    0,				/*tp_as_sequence*/
-    0,				/*tp_as_mapping*/
-    0,				/*tp_hash */
-    0,				/*tp_call*/
-    0,				/*tp_str*/
-    0,				/*tp_getattro*/
-    0,				/*tp_setattro*/
-    0,				/*tp_as_buffer*/
-    Py_TPFLAGS_DEFAULT,		/*tp_flags*/
-    "hashbits object",           /* tp_doc */
-};
-
-#define is_hashbits_obj(v)  ((v)->ob_type == &khmer_KHashbitsType)
-
 static void khmer_subset_dealloc(PyObject *);
 static PyObject * khmer_subset_getattr(PyObject * obj, char * name);
 
@@ -1394,6 +1365,8 @@ static PyTypeObject khmer_KSubsetPartitionType = {
 
 #define is_subset_obj(v)  ((v)->ob_type == &khmer_KSubsetPartitionType)
 
+// MOVED HASHBITS TYPE TO BELOW METHODS
+
 /* GRAPHALIGN addition */
 typedef struct {
   PyObject_HEAD
@@ -3931,6 +3904,85 @@ khmer_hashbits_getattr(PyObject * obj, char * name)
   return Py_FindMethod(khmer_hashbits_methods, obj, name);
 }
 
+static void khmer_hashbits_dealloc(PyObject *);
+static PyObject* khmer_hashbits_new(PyTypeObject * type, PyObject * args, PyObject * kwds);
+static int khmer_hashbits_init(khmer_KHashbitsObject * self, PyObject * args, PyObject * kwds); 
+
+static PyTypeObject khmer_KHashbitsType = {
+    PyObject_HEAD_INIT(NULL)
+    0,
+    "Hashbits", sizeof(khmer_KHashbitsObject),
+    0,
+    khmer_hashbits_dealloc,	/*tp_dealloc*/
+    0,				/*tp_print*/
+    khmer_hashbits_getattr,	/*tp_getattr*/
+    0,				/*tp_setattr*/
+    0,				/*tp_compare*/
+    0,				/*tp_repr*/
+    0,				/*tp_as_number*/
+    0,				/*tp_as_sequence*/
+    0,				/*tp_as_mapping*/
+    0,				/*tp_hash */
+    0,				/*tp_call*/
+    0,				/*tp_str*/
+    0,				/*tp_getattro*/
+    0,				/*tp_setattro*/
+    0,				/*tp_as_buffer*/
+    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE,		/*tp_flags*/
+    "hashbits object",           /* tp_doc */
+    0,                       /* tp_traverse */
+    0,                       /* tp_clear */
+    0,                       /* tp_richcompare */
+    0,                       /* tp_weaklistoffset */
+    0,                       /* tp_iter */
+    0,                       /* tp_iternext */
+    khmer_hashbits_methods,  /* tp_methods */
+    0,                       /* tp_members */
+    0,                       /* tp_getset */
+    0,                       /* tp_base */
+    0,                       /* tp_dict */
+    0,                       /* tp_descr_get */
+    0,                       /* tp_descr_set */
+    0,                       /* tp_dictoffset */
+    (initproc)khmer_hashbits_init,   /* tp_init */
+    0,                       /* tp_alloc */
+};
+
+// __new__ for hashbits; necessary for proper subclassing.
+// This essentially does what the old factory function did. Unlike many __new__
+// methods, we take our arguments here, because there's no "uninitialized" hashbits
+// object; we must have k and the table sizes before creating the new object.
+static PyObject* khmer_hashbits_new(PyTypeObject * type, PyObject * args, PyObject * kwds)
+{
+    // tp_alloc zero-fills the new object, so self->hashbits starts out NULL.
+    khmer_KHashbitsObject * self = (khmer_KHashbitsObject *)type->tp_alloc(type, 0);
+
+    if (self != NULL) {
+        unsigned int k = 0;
+        PyObject* sizes_list_o = NULL;
+
+        if (!PyArg_ParseTuple(args, "IO", &k, &sizes_list_o)) {
+            Py_DECREF(self);    // don't leak the half-built object on bad args
+            return NULL;
+        }
+        std::vector<khmer::HashIntoType> sizes;
+        for (int i = 0; i < PyObject_Length(sizes_list_o); i++) {
+            PyObject * size_o = PyList_GET_ITEM(sizes_list_o, i);
+            sizes.push_back(PyLong_AsLongLong(size_o));
+        }
+
+        self->hashbits = new khmer::Hashbits(k, sizes);
+    }
+    return (PyObject *) self;
+}
+
+// there are no attributes that we need at this time, so we'll just return 0
+static int khmer_hashbits_init(khmer_KHashbitsObject * self, PyObject * args, PyObject * kwds) {
+    return 0;
+}
+
+#define is_hashbits_obj(v)  ((v)->ob_type == &khmer_KHashbitsType)
+
 ////////////////////////////////////////////////////////////////////////////
 
 static PyObject * subset_count_partitions(PyObject * self,
@@ -4104,8 +4156,8 @@ khmer_subset_getattr(PyObject * obj, char * name)
 
 // LabelHash addition
 typedef struct {
-  PyObject_HEAD
-
+  //PyObject_HEAD
+  khmer_KHashbitsObject khashbits;
   /* @camillescott late night notes:
      need to experiment. might be able to call hashbits py methods
      directly with the labelhash object, because they all instantiate
@@ -4117,38 +4169,65 @@ typedef struct {
      See http://docs.python.org/2.7/extending/newtypes.html#subclassing-other-types
      for details...
   */
-  LabelHash * labelhash;
+  khmer::LabelHash * labelhash;
 } khmer_KLabelHashObject;
 
 static void khmer_labelhash_dealloc(PyObject *);
-static PyObject * khmer_labelhash_getattr(PyObject * obj, char * name);
-
-static PyTypeObject khmer_KLabelHashType = {
-    PyObject_HEAD_INIT(NULL)
-    0,
-    "KLabelHash", sizeof(khmer_KLabelHashObject),
-    0,
-    khmer_labelhash_dealloc,	/*tp_dealloc*/
-    0,				/*tp_print*/
-    khmer_labelhash_getattr,	/*tp_getattr*/
-    0,				/*tp_setattr*/
-    0,				/*tp_compare*/
-    0,				/*tp_repr*/
-    0,				/*tp_as_number*/
-    0,				/*tp_as_sequence*/
-    0,				/*tp_as_mapping*/
-    0,				/*tp_hash */
-    0,				/*tp_call*/
-    0,				/*tp_str*/
-    0,				/*tp_getattro*/
-    0,				/*tp_setattro*/
-    0,				/*tp_as_buffer*/
-    Py_TPFLAGS_DEFAULT,		/*tp_flags*/
-    "labelhash object",           /* tp_doc */
-};
+static int khmer_labelhash_init(khmer_KLabelHashObject * self, PyObject *args, PyObject *kwds);
+static PyObject * khmer_labelhash_new(PyTypeObject * type, PyObject *args, PyObject *kwds);
 
 #define is_labelhash_obj(v)  ((v)->ob_type == &khmer_KLabelHashType)
 
+//
+// khmer_labelhash_dealloc -- clean up a labelhash object.
+//
+
+static void khmer_labelhash_dealloc(PyObject* self)
+{
+  khmer_KLabelHashObject * obj = (khmer_KLabelHashObject *) self;
+  delete obj->labelhash;
+  obj->labelhash = NULL;
+  obj->khashbits.hashbits = NULL;  // base-class pointer must not outlive the LabelHash
+  self->ob_type->tp_free(self);    // tp_free (not PyObject_Del) so subclass instances free correctly
+}
+
+// a little weird; we don't actually want to call Hashbits' new method. Rather, we
+// define our own new method, and redirect the base's hashbits pointer to point to
+// our labelhash object
+static PyObject * khmer_labelhash_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
+{
+    // tp_alloc zero-fills the new object, so both table pointers start out NULL.
+    khmer_KLabelHashObject *self = (khmer_KLabelHashObject*)type->tp_alloc(type, 0);
+
+    if (self!=NULL) {
+        unsigned int k = 0;
+        PyObject* sizes_list_o = NULL;
+
+        if (!PyArg_ParseTuple(args, "IO", &k, &sizes_list_o)) {
+            Py_DECREF(self);    // don't leak the half-built object on bad args
+            return NULL;
+        }
+        std::vector<khmer::HashIntoType> sizes;
+        for (int i = 0; i < PyObject_Length(sizes_list_o); i++) {
+            PyObject * size_o = PyList_GET_ITEM(sizes_list_o, i);
+            sizes.push_back(PyLong_AsLongLong(size_o));
+        }
+
+        // Create the LabelHash first, then point the base class's hashbits pointer at
+        // it, so that the KHashbits methods are called on the correct object (a LabelHash)
+        self->labelhash = new khmer::LabelHash(k, sizes);
+        self->khashbits.hashbits = (khmer::Hashbits *)self->labelhash;
+    }
+
+    return (PyObject *) self;
+}
+
+static int khmer_labelhash_init(khmer_KLabelHashObject * self, PyObject *args, PyObject *kwds)
+{
+    if (khmer_KHashbitsType.tp_init((PyObject *)self, args, kwds) < 0)
+        return -1;
+    return 0;
+}
 
 static PyObject * labelhash_get_label_dict(PyObject * self, PyObject * args) {
   khmer_KLabelHashObject * me = (khmer_KLabelHashObject *) self;
@@ -4309,7 +4388,6 @@ static PyObject * labelhash_sweep_label_neighborhood(PyObject * self, PyObject *
   return x;
 }
 
-
 // Similar to find_all_tags, but returns tags in a way actually useable by python
 // need a tags_in_sequence iterator or function in c++ land for reuse in all
 // these functions
@@ -4409,27 +4487,8 @@ static PyObject * labelhash_n_labels(PyObject * self, PyObject * args)
   return PyInt_FromLong(labelhash->n_labels());
 }
 
-
 static PyMethodDef khmer_labelhash_methods[] = {
-  { "ksize", labelhash_get_ksize, METH_VARARGS, "" },
-  { "hashsizes", labelhash_get_hashsizes, METH_VARARGS, "" },
-  { "n_occupied", labelhash_n_occupied, METH_VARARGS, "Count the number of occupied bins" },
-  { "n_unique_kmers", labelhash_n_unique_kmers,  METH_VARARGS, "Count the number of unique kmers" },
-  { "count", labelhash_count, METH_VARARGS, "Count the given kmer" },
-  { "get", labelhash_get, METH_VARARGS, "Get the count for the given k-mer" },
-  { "kmer_degree", labelhash_kmer_degree, METH_VARARGS, "" },
-  { "load", labelhash_load, METH_VARARGS, "" },
-  { "save", labelhash_save, METH_VARARGS, "" },
-  { "load_tagset", labelhash_load_tagset, METH_VARARGS, "" },
-  { "save_tagset", labelhash_save_tagset, METH_VARARGS, "" },
-  { "n_tags", labelhash_n_tags, METH_VARARGS, "" },
-  { "_get_tag_density", labelhash__get_tag_density, METH_VARARGS, "" },
-  { "_set_tag_density", labelhash__set_tag_density, METH_VARARGS, "" },
- { "consume_fasta_and_tag", labelhash_consume_fasta_and_tag, METH_VARARGS, "Count all k-mers in a given file" },
-  { "consume_fasta_and_tag_with_reads_parser", labelhash_consume_fasta_and_tag_with_reads_parser, 
-    METH_VARARGS, "Count all k-mers using a given reads parser" },
- { "consume_partitioned_fasta", labelhash_consume_partitioned_fasta, METH_VARARGS, "Count all k-mers in a given file" },
- { "consume_fasta_and_tag_with_labels", labelhash_consume_fasta_and_tag_with_labels, METH_VARARGS, "" },
+  { "consume_fasta_and_tag_with_labels", labelhash_consume_fasta_and_tag_with_labels, METH_VARARGS, "" },
   { "sweep_label_neighborhood", labelhash_sweep_label_neighborhood, METH_VARARGS, "" },
   {"consume_partitioned_fasta_and_tag_with_labels", labelhash_consume_partitioned_fasta_and_tag_with_labels, METH_VARARGS, "" },
   {"sweep_tag_neighborhood", labelhash_sweep_tag_neighborhood, METH_VARARGS, "" },
@@ -4441,13 +4500,53 @@ static PyMethodDef khmer_labelhash_methods[] = {
   {NULL, NULL, 0, NULL}           /* sentinel */
 };
 
+// still necessary?
 static PyObject *
 khmer_labelhash_getattr(PyObject * obj, char * name)
 {
   return Py_FindMethod(khmer_labelhash_methods, obj, name);
 }
 
-
+static PyTypeObject khmer_KLabelHashType = {
+    PyObject_HEAD_INIT(NULL)
+    0,                       /* ob_size */
+   "LabelHash",            /* tp_name */ 
+    sizeof(khmer_KLabelHashObject), /* tp_basicsize */
+    0,                       /* tp_itemsize */
+    (destructor)khmer_labelhash_dealloc, /* tp_dealloc */
+    0,                       /* tp_print */
+    0,  /* khmer_labelhash_getattr, tp_getattr */
+    0,                       /* tp_setattr */
+    0,                       /* tp_compare */
+    0,                       /* tp_repr */
+    0,                       /* tp_as_number */
+    0,                       /* tp_as_sequence */
+    0,                       /* tp_as_mapping */
+    0,                       /* tp_hash */
+    0,                       /* tp_call */
+    0,                       /* tp_str */
+    0,                       /* tp_getattro */
+    0,                       /* tp_setattro */
+    0,                       /* tp_as_buffer */
+    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE,   /* tp_flags */
+    0,                       /* tp_doc */
+    0,                       /* tp_traverse */
+    0,                       /* tp_clear */
+    0,                       /* tp_richcompare */
+    0,                       /* tp_weaklistoffset */
+    0,                       /* tp_iter */
+    0,                       /* tp_iternext */
+    khmer_labelhash_methods, /* tp_methods */
+    0,                       /* tp_members */
+    0,                       /* tp_getset */
+    0,                       /* tp_base */
+    0,                       /* tp_dict */
+    0,                       /* tp_descr_get */
+    0,                       /* tp_descr_set */
+    0,                       /* tp_dictoffset */
+    (initproc)khmer_labelhash_init,   /* tp_init */
+    0,                       /* tp_alloc */
+};
 
 //
 // GRAPHALIGN addition
@@ -4632,37 +4731,6 @@ static PyObject* _new_hashbits(PyObject * self, PyObject * args)
   return (PyObject *) khashbits_obj;
 }
 
-//
-// new_labelhash
-//
-
-static PyObject* _new_labelhash(PyObject * self, PyObject * args)
-{
-  unsigned int k = 0;
-  PyObject* sizes_list_o = NULL;
-
-  if (!PyArg_ParseTuple(args, "IO", &k, &sizes_list_o)) {
-    return NULL;
-  }
-
-  std::vector<khmer::HashIntoType> sizes;
-  for (int i = 0; i < PyObject_Length(sizes_list_o); i++) {
-    PyObject * size_o = PyList_GET_ITEM(sizes_list_o, i);
-    sizes.push_back(PyLong_AsLongLong(size_o));
-  }
-
-  khmer_KLabelHash * klabelhash_obj = (khmer_KLabelHashObject *) \
-    PyObject_New(khmer_KLabelHashsObject, &khmer_KLabelHashType);
-
-  if (klabelhash_obj == NULL) {
-      return NULL;
-  }
-  
-  klabelhash_obj->labelhash = new khmer::LabelHash(k, sizes);
-
-  return (PyObject *) klabelhash_obj;
-}
-
 static PyObject * hash_collect_high_abundance_kmers(PyObject * self, PyObject * args)
 {
   khmer_KCountingHashObject * me = (khmer_KCountingHashObject *) self;
@@ -4722,20 +4790,6 @@ static void khmer_hashbits_dealloc(PyObject* self)
   PyObject_Del((PyObject *) obj);
 }
 
-
-//
-// khmer_labelhash_dealloc -- clean up a labelhash object.
-//
-
-static void khmer_hashbits_dealloc(PyObject* self)
-{
-  khmer_KLabelHashObject * obj = (khmer_LabelHashObject *) self;
-  delete obj->labelhash;
-  obj->labelhash = NULL;
-  
-  PyObject_Del((PyObject *) obj);
-}
-
 //
 // khmer_subset_dealloc -- clean up a hashbits object.
 //
@@ -4871,8 +4925,24 @@ init_khmer(void)
     khmer_KTableType.ob_type	      = &PyType_Type;
     khmer_KCountingHashType.ob_type   = &PyType_Type;
 
+    // implemented __new__ for Hashbits; keeping factory func around as well
+    // for backwards compat with old scripts
+    khmer_KHashbitsType.tp_new = khmer_hashbits_new;
+    if (PyType_Ready(&khmer_KHashbitsType) < 0) {
+        std::cout << "_khmer.KHashbitsType failed PyType_Ready" << std::endl;
+        return;
+    }
+    // add LabelHash
+    khmer_KLabelHashType.tp_base = &khmer_KHashbitsType;
+    khmer_KLabelHashType.tp_new = khmer_labelhash_new;
+    if (PyType_Ready(&khmer_KLabelHashType) < 0) {
+        std::cout << "_khmer.KLabelHashType failed PyType_Ready" << std::endl; 
+        return;
+    }
+
     PyObject * m;
-    m = Py_InitModule( "_khmer", KhmerMethods );
+    m = Py_InitModule3( "_khmer", KhmerMethods, 
+                        "interface for the khmer module low-level extensions" );
     if (m == NULL) {
 	return;
     }
@@ -4901,7 +4971,11 @@ init_khmer(void)
     // TODO: Add other types here as their 'new' methods are implemented.
     //	     Then, remove the corresponding factory functions.
 
-    
+    Py_INCREF(&khmer_KHashbitsType);
+    PyModule_AddObject(m, "Hashbits", (PyObject *)&khmer_KHashbitsType);
+
+    Py_INCREF(&khmer_KLabelHashType);
+    PyModule_AddObject(m, "LabelHash", (PyObject *)&khmer_KLabelHashType);
 }
 
 // vim: set ft=cpp sts=4 sw=4 tw=79:
diff --git a/lib/hashbits.hh b/lib/hashbits.hh
index 84c93d9aa3..9b1d5462db 100644
--- a/lib/hashbits.hh
+++ b/lib/hashbits.hh
@@ -12,6 +12,7 @@
 
 namespace khmer {
   class CountingHash;
+  class LabelHash;
 
   class Hashbits : public khmer::Hashtable {
   protected:
@@ -240,7 +241,7 @@ namespace khmer {
 };
 
 #include "counting.hh"
-
+#include "labelhash.hh"
 #endif // HASHBITS_HH
 
 // vim: set sts=2 sw=2:
diff --git a/lib/hashtable.hh b/lib/hashtable.hh
index f13dcd51a2..a306816581 100644
--- a/lib/hashtable.hh
+++ b/lib/hashtable.hh
@@ -455,7 +455,6 @@ namespace khmer {
     // Partitioning stuff.
 
     unsigned int n_tags() const { return all_tags.size(); }
-    unsigned int n_labels() const { return label_ptrs.size(); }
 
     void divide_tags_into_subsets(unsigned int subset_size, SeenSet& divvy);
 
@@ -485,18 +484,7 @@ namespace khmer {
 	CallbackFn	    callback	    = NULL,
 	void *		    callback_data   = NULL
     );
-    
-    Label * check_and_allocate_label(Label new_label) {
-        Label * c;
-        if (label_ptrs.count(new_label)) {
-            c = label_ptrs[new_label];
-        } else {
-            c = new Label(new_label);
-            label_ptrs[*c] = c;
-        }
-        return c;
-    }
-    
+       
     void consume_sequence_and_tag(const std::string& seq,
 				  unsigned long long& n_consumed,
 				  SeenSet * new_tags = 0);
@@ -507,51 +495,6 @@ namespace khmer {
 					     unsigned long long &n_consumed,
 					     CallbackFn callback = 0,
 					     void * callback_data = 0);
-    
-    void consume_fasta_and_tag_with_labels(
-                        std::string const	  &filename,
-                        unsigned int	  &total_reads,
-                        unsigned long long  &n_consumed,
-                        CallbackFn	  callback	  = NULL,
-                        void *		  callback_data	  = NULL);
-
-    void consume_fasta_and_tag_with_labels(
-	                read_parsers:: IParser *	    parser,
-	                unsigned int	    &total_reads,
-	                unsigned long long  &n_consumed,
-	                CallbackFn	    callback	    = NULL,
-	                void *		    callback_data   = NULL);
-	                
-    void consume_partitioned_fasta_and_tag_with_labels(const std::string &filename,
-					  unsigned int &total_reads,
-					  unsigned long long &n_consumed,
-					  CallbackFn callback = NULL,
-					  void * callback_datac = NULL);
-					  			  
-    void consume_sequence_and_tag_with_labels(const std::string& seq,
-					unsigned long long& n_consumed,
-					Label& current_label,
-					SeenSet * new_tags = 0);
-    
-    LabelPtrSet get_tag_labels(const HashIntoType& tag);
-    TagPtrSet get_label_tags(const Label& label);
-
-    void link_tag_and_label(HashIntoType& kmer, Label& label);
-    
-    unsigned int sweep_sequence_for_labels(const std::string& seq,
-					LabelPtrSet& found_labels,
-					bool break_on_stoptags,
-					bool stop_big_traversals);
-					
-    unsigned int sweep_label_neighborhood(const std::string & seq,
-                                                  LabelPtrSet& found_labels,
-                                                  unsigned int range,
-                                                  bool break_on_stoptags,
-                                                  bool stop_big_traversals);
-                                                  			
-    void traverse_labels_and_resolve(const SeenSet& tagged_kmers,
-                                     LabelPtrSet& found_labels);
-
     void consume_fasta_and_traverse(const std::string &filename,
 				    unsigned int distance,
 				    unsigned int big_threshold,
@@ -660,10 +603,4 @@ namespace khmer {
 #define RELEASE_ALL_TAGS_SPIN_LOCK \
   __sync_bool_compare_and_swap( &_all_tags_spin_lock, 1, 0 );
 
-#define ACQUIRE_TAG_COLORS_SPIN_LOCK \
-  while(!__sync_bool_compare_and_swap( &_tag_labels_spin_lock, 0, 1));
-
-#define RELEASE_TAG_COLORS_SPIN_LOCK \
-  __sync_bool_compare_and_swap( &_tag_labels_spin_lock, 1, 0);
-
 #endif // HASHTABLE_HH
diff --git a/lib/labelhash.hh b/lib/labelhash.hh
index 960d64158d..374ff02d78 100644
--- a/lib/labelhash.hh
+++ b/lib/labelhash.hh
@@ -11,21 +11,13 @@
 
 #include "khmer.hh"
 #include "hashbits.hh"
+#include "hashtable.hh"
+#include "read_parsers.hh"
 
 namespace khmer {
-    
+
     class LabelHash : public khmer::Hashbits {
     protected:
-        LabelHash( WordLength ksize, std::vector<HashIntoType>& tablesizes)
-        : khmer::Hashbits(ksize, tablesizes)
-        {
-            // constructor
-            _tag_labels_spin_lock = 0;
-
-        }
-        
-        ~LabelHash();
-
         // Does the given tag already have the given label?
         bool _cmap_contains_label(const TagLabelPtrMap& cmap,
                 HashIntoType& kmer,
@@ -79,6 +71,17 @@ namespace khmer {
         uint32_t _tag_labels_spin_lock;
 
     public:
+
+        LabelHash( WordLength ksize, std::vector<HashIntoType>& tablesizes)
+        : khmer::Hashbits(ksize, tablesizes)
+        {
+            // constructor
+            _tag_labels_spin_lock = 0;
+
+        }
+        
+        ~LabelHash();
+
         TagLabelPtrMap tag_labels;
         LabelTagPtrMap label_tag_ptrs;
         LabelPtrMap label_ptrs;
@@ -141,7 +144,7 @@ namespace khmer {
                                          LabelPtrSet& found_labels);
 
     };
-}
+};
 
 #define ACQUIRE_TAG_COLORS_SPIN_LOCK \
   while(!__sync_bool_compare_and_swap( &_tag_labels_spin_lock, 0, 1));
diff --git a/tests/test_hashbits.py b/tests/test_hashbits.py
index 2456d8d788..aca4dad430 100644
--- a/tests/test_hashbits.py
+++ b/tests/test_hashbits.py
@@ -11,11 +11,9 @@
 import khmer_tst_utils as utils
 from nose.plugins.attrib import attr
 
-
 def teardown():
     utils.cleanup()
 
-
 def test__get_set_tag_density():
     ht = khmer.new_hashbits(32, 1, 1)
 
@@ -547,129 +545,3 @@ def test_simple_median():
     assert average == 1.0
     assert stddev == 0.0
 
-#
-# @cswelcher TODO: more tests! 
-#  * thread-safety
-
-def test_n_labels():
-    hb = khmer.new_hashbits(20, 1e7, 4)
-    filename = utils.get_test_data('test-labels.fa')
-    hb.consume_fasta_and_tag_with_labels(filename)
-    
-    print hb.n_labels()
-    assert hb.n_labels() == 4
-
-def test_get_label_dict():
-    hb = khmer.new_hashbits(20, 1e7, 4)
-    filename = utils.get_test_data('test-labels.fa')
-    hb.consume_fasta_and_tag_with_labels(filename)
-    
-    labels = hb.get_label_dict()
-    expected = [0L, 1L, 2L, 3L]
-    for e_label in expected:
-        assert e_label in labels
-    for a_label in labels:
-        assert a_label in expected
-
-def test_sweep_tag_neighborhood():
-    hb = khmer.new_hashbits(20, 1e7, 4)
-    filename = utils.get_test_data('single-read.fq')
-    hb.consume_fasta_and_tag(filename)
-    
-    tags = hb.sweep_tag_neighborhood('CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT')
-    assert len(tags) == 1
-    assert tags.pop() == 173473779682L
-
-def test_get_tag_labels():
-    hb = khmer.new_hashbits(20, 1e7, 4)
-    filename = utils.get_test_data('single-read.fq')
-    hb.consume_fasta_and_tag_with_labels(filename)
-    tag = 173473779682L
-
-    labels = hb.get_tag_labels(tag)
-    assert len(labels) == 1
-    assert labels.pop() == 0L
-
-def test_sweep_sequence_for_labels():
-    hb = khmer.new_hashbits(20, 1e7, 4)
-    filename = utils.get_test_data('single-read.fq')
-    hb.consume_fasta_and_tag_with_labels(filename)
-    
-    labels = hb.sweep_label_neighborhood('CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT')
-    assert len(labels) == 1
-    assert labels.pop() == 0L
-
-def test_consume_partitioned_fasta_and_tag_with_labels():
-    hb = khmer.new_hashbits(20, 1e7, 4)
-    filename = utils.get_test_data('real-partition-small.fa')
-
-    total_reads, n_consumed = hb.consume_partitioned_fasta_and_tag_with_labels(filename)
-    labels = set()
-    for record in screed.open(filename):
-        seq = record.sequence
-        labels.update(hb.sweep_label_neighborhood(seq, False, False))
-    #print hb.n_labels()
-    #print labels
-    assert len(labels) == 1
-    assert labels.pop() == 2L
-    assert hb.n_labels() == 1 
-
-def test_consume_fasta_and_tag_with_labels():
-    hb = khmer.new_hashbits(20, 1e7, 4)
-    read_1 = 'ACGTAACCGGTTAAACCCGGGTTTAAAACCCCGGGGTTTT'
-    filename = utils.get_test_data('test-transcript.fa')
-
-    total_reads, n_consumed = hb.consume_fasta_and_tag_with_labels(filename)
-
-    assert hb.get(read_1[:20])
-    assert total_reads == 3
-    print hb.n_labels()
-    print hb.get_label_dict()
-    for tag in hb.get_tagset():
-        print tag, khmer.forward_hash(tag, 20)
-    for record in screed.open(filename):
-        print hb.sweep_tag_neighborhood(record.sequence, 40)
-        print hb.sweep_label_neighborhood(record.sequence, 40)
-    assert hb.n_labels() == 3
-
-'''
-* The test data set as four reads: A, B, C, and D
-* Overlaps are A <-> B <-> C, with D on its own
-* Thus, traversing from A should find labels from A and B,
-  traversing from B should find labels from A, B, and C,
-  and traversing from C should find labels from B and C
-'''
-def test_label_tag_correctness():
-    hb = khmer.new_hashbits(20, 1e7, 4)
-    filename = utils.get_test_data('test-labels.fa')
-    hb.consume_fasta_and_tag_with_labels(filename)
-    
-    # read A
-    labels = hb.sweep_label_neighborhood('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAGCTAGGCTAGGTGTGCTCTGCCTAGAGCTAGGCTAGGTGT')
-    print hb.sweep_tag_neighborhood('TTCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAGCTAGGCTAGGTGTGCTCTGCCTAGAGCTAGGCTAGGTGT')
-    print labels
-    print len('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG')-19 
-    assert len(labels) == 2
-    assert 0L in labels
-    assert 1L in labels
-    
-    # read B
-    labels = hb.sweep_label_neighborhood('GCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA')
-    print labels
-    assert len(labels) == 3
-    assert 0L in labels
-    assert 1L in labels
-    assert 2L in labels
-    
-    # read C
-    labels = hb.sweep_label_neighborhood('TGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGAGTTGGGGATAGATAGATAGATGAGTGTAGATCCAACAACACATACA')
-    print labels
-    assert len(labels) == 2
-    assert 1L in labels
-    assert 2L in labels
-    
-    # read D
-    labels = hb.sweep_label_neighborhood('TATATATATAGCTAGCTAGCTAACTAGCTAGCATCGATCGATCGATC')
-    print labels
-    assert len(labels) == 1
-    assert 3L in labels
diff --git a/tests/test_labelhash.py b/tests/test_labelhash.py
new file mode 100644
index 0000000000..7993091d74
--- /dev/null
+++ b/tests/test_labelhash.py
@@ -0,0 +1,143 @@
+#
+# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# the three-clause BSD license; see doc/LICENSE.txt. Contact: ctb@msu.edu
+#
+import khmer
+from khmer import LabelHash
+from screed.fasta import fasta_iter
+import screed
+
+import khmer_tst_utils as utils
+from nose.plugins.attrib import attr
+
+def teardown():
+    utils.cleanup()
+
+#
+# @camillescott TODO: more tests! 
+#  * thread-safety
+
+def test_n_labels():
+    lh = LabelHash(20, 1e7, 4)
+    filename = utils.get_test_data('test-labels.fa')
+    lh.consume_fasta_and_tag_with_labels(filename)
+    
+    print lh.n_labels()
+    assert lh.n_labels() == 4
+
+def test_get_label_dict():
+    lb = LabelHash(20, 1e7, 4)
+    filename = utils.get_test_data('test-labels.fa')
+    lb.consume_fasta_and_tag_with_labels(filename)
+    
+    labels = lb.get_label_dict()
+    expected = [0L, 1L, 2L, 3L]
+    for e_label in expected:
+        assert e_label in labels
+    for a_label in labels:
+        assert a_label in expected
+
+def test_get_tag_labels():
+    lb = LabelHash(20, 1e7, 4)
+    filename = utils.get_test_data('single-read.fq')
+    lb.consume_fasta_and_tag_with_labels(filename)
+    tag = 173473779682L
+
+    labels = lb.get_tag_labels(tag)
+    assert len(labels) == 1
+    assert labels.pop() == 0L
+
+def test_consume_partitioned_fasta_and_tag_with_labels():
+    lb = LabelHash(20, 1e7, 4)
+    filename = utils.get_test_data('real-partition-small.fa')
+
+    total_reads, n_consumed = lb.consume_partitioned_fasta_and_tag_with_labels(filename)
+    labels = set()
+    for record in screed.open(filename):
+        seq = record.sequence
+        labels.update(lb.sweep_label_neighborhood(seq, False, False))
+    #print lb.n_labels()
+    #print labels
+    assert len(labels) == 1
+    assert labels.pop() == 2L
+    assert lb.n_labels() == 1 
+
+def test_consume_fasta_and_tag_with_labels():
+    lb = LabelHash(20, 1e7, 4)
+    read_1 = 'ACGTAACCGGTTAAACCCGGGTTTAAAACCCCGGGGTTTT'
+    filename = utils.get_test_data('test-transcript.fa')
+
+    total_reads, n_consumed = lb.consume_fasta_and_tag_with_labels(filename)
+
+    assert lb.get(read_1[:20])
+    assert total_reads == 3
+    print lb.n_labels()
+    print lb.get_label_dict()
+    for tag in lb.get_tagset():
+        print tag, khmer.forward_hash(tag, 20)
+    for record in screed.open(filename):
+        print lb.sweep_tag_neighborhood(record.sequence, 40)
+        print lb.sweep_label_neighborhood(record.sequence, 40)
+    assert lb.n_labels() == 3
+
+'''
+* The test data set as four reads: A, B, C, and D
+* Overlaps are A <-> B <-> C, with D on its own
+* Thus, traversing from A should find labels from A and B,
+  traversing from B should find labels from A, B, and C,
+  and traversing from C should find labels from B and C
+'''
+def test_label_tag_correctness():
+    lb = LabelHash(20, 1e7, 4)
+    filename = utils.get_test_data('test-labels.fa')
+    lb.consume_fasta_and_tag_with_labels(filename)
+    
+    # read A
+    labels = lb.sweep_label_neighborhood('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAGCTAGGCTAGGTGTGCTCTGCCTAGAGCTAGGCTAGGTGT')
+    print lb.sweep_tag_neighborhood('TTCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAGCTAGGCTAGGTGTGCTCTGCCTAGAGCTAGGCTAGGTGT')
+    print labels
+    print len('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG')-19 
+    assert len(labels) == 2
+    assert 0L in labels
+    assert 1L in labels
+    
+    # read B
+    labels = lb.sweep_label_neighborhood('GCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA')
+    print labels
+    assert len(labels) == 3
+    assert 0L in labels
+    assert 1L in labels
+    assert 2L in labels
+    
+    # read C
+    labels = lb.sweep_label_neighborhood('TGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGAGTTGGGGATAGATAGATAGATGAGTGTAGATCCAACAACACATACA')
+    print labels
+    assert len(labels) == 2
+    assert 1L in labels
+    assert 2L in labels
+    
+    # read D
+    labels = lb.sweep_label_neighborhood('TATATATATAGCTAGCTAGCTAACTAGCTAGCATCGATCGATCGATC')
+    print labels
+    assert len(labels) == 1
+    assert 3L in labels
+
+def test_sweep_tag_neighborhood():
+    lb = LabelHash(20, 1e7, 4)
+    filename = utils.get_test_data('single-read.fq')
+    lb.consume_fasta_and_tag(filename)
+    
+    tags = lb.sweep_tag_neighborhood('CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT')
+    assert len(tags) == 1
+    assert tags.pop() == 173473779682L
+
+
+def test_sweep_label_neighborhood():
+    lb = LabelHash(20, 1e7, 4)
+    filename = utils.get_test_data('single-read.fq')
+    lb.consume_fasta_and_tag_with_labels(filename)
+    
+    labels = lb.sweep_label_neighborhood('CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT')
+    assert len(labels) == 1
+    assert labels.pop() == 0L

From ebf8a9e8f5c5544e36a1ddd44e51e14b85a23d4f Mon Sep 17 00:00:00 2001
From: Camille Scott <camille.scott.w@gmail.com>
Date: Tue, 17 Dec 2013 12:11:08 -0500
Subject: [PATCH 107/140] stripped label stuff from hashtable.cc

---
 lib/hashtable.cc | 350 -----------------------------------------------
 1 file changed, 350 deletions(-)

diff --git a/lib/hashtable.cc b/lib/hashtable.cc
index 7f4ea47d90..16b463a879 100644
--- a/lib/hashtable.cc
+++ b/lib/hashtable.cc
@@ -1924,354 +1924,4 @@ void Hashtable::extract_unique_paths(std::string seq,
     }
   }
 }
-/*
- * Pretty much copy-pasta
- * @cswelcher
- * Might be time for a refactor: could do a general consume_fasta
- * function which accepts a consume_sequence function pointer as a parameter
- */
-
-void
-Hashtable::consume_fasta_and_tag_with_labels(
-  std:: string const  &filename,
-  unsigned int	      &total_reads, unsigned long long	&n_consumed,
-  CallbackFn	      callback,	    void *		callback_data
-)
-{
-  khmer:: Config    &the_config	  = khmer:: get_active_config( );
-
-  // Note: Always assume only 1 thread if invoked this way.
-  IParser *	  parser = 
-  IParser::get_parser(
-    filename, 1, the_config.get_reads_input_buffer_size( ),
-    the_config.get_reads_parser_trace_level( )
-  );
-
-
-  consume_fasta_and_tag_with_labels(
-    parser,
-    total_reads, n_consumed,
-    callback, callback_data
-  );
-
-  delete parser;
-}
-
-void
-Hashtable::consume_fasta_and_tag_with_labels(
-    read_parsers:: IParser *  parser,
-    unsigned int		    &total_reads,   unsigned long long	&n_consumed,
-    CallbackFn		    callback,	    void *		callback_data
-  )
-  {
-    Hasher		  &hasher		= 
-    _get_hasher( parser->uuid( ) );
-    unsigned int		  total_reads_LOCAL	= 0;
-  #if (0) // Note: Used with callback - currently disabled.
-    unsigned long long int  n_consumed_LOCAL	= 0;
-  #endif
-    Read			  read;
-
-    // TODO? Delete the following assignments.
-    total_reads = 0;
-    n_consumed = 0;
-    
-    hasher.trace_logger(
-      TraceLogger:: TLVL_DEBUG2,
-      "Starting trace of 'consume_fasta_and_tag'....\n"
-    );
-    
-    Label _tag_label = 0;
-
-    Label * the_label;
-    // Iterate through the reads and consume their k-mers.
-    while (!parser->is_complete( ))
-    {
-      unsigned long long this_n_consumed   = 0;
-
-      read = parser->get_next_read( );
-
-      if (check_and_normalize_read( read.sequence ))
-      {
-        // TODO: make threadsafe!
-        the_label = check_and_allocate_label(_tag_label);
-        consume_sequence_and_tag_with_labels( read.sequence,
-					      this_n_consumed,
-					      *the_label );
-	    _tag_label++;
-
-  #ifdef WITH_INTERNAL_METRICS
-        hasher.pmetrics.start_timers( );
-  #endif
-  #if (0) // Note: Used with callback - currently disabled.
-        n_consumed_LOCAL  = __sync_add_and_fetch( &n_consumed, this_n_consumed );
-  #else
-        __sync_add_and_fetch( &n_consumed, this_n_consumed );
-  #endif
-        total_reads_LOCAL = __sync_add_and_fetch( &total_reads, 1 );
-  #ifdef WITH_INTERNAL_METRICS
-        hasher.pmetrics.stop_timers( );
-        hasher.pmetrics.accumulate_timer_deltas(
-	  (uint32_t)HashTablePerformanceMetrics:: MKEY_TIME_UPDATE_TALLIES
-        );
-  #endif
-      }
-
-      if (0 == (total_reads_LOCAL % 10000))
-        hasher.trace_logger(
-	  TraceLogger:: TLVL_DEBUG3,
-	  "Total number of reads processed: %llu\n",
-	  (unsigned long long int)total_reads_LOCAL
-        );
-
-      // TODO: Figure out alternative to callback into Python VM
-      //       Cannot use in multi-threaded operation.
-  #if (0)
-        // run callback, if specified
-        if (total_reads_TL % CALLBACK_PERIOD == 0 && callback) {
-	  std::cout << "n tags: " << all_tags.size() << "\n";
-	  try {
-	    callback("consume_fasta_and_tag", callback_data, total_reads_TL,
-		     n_consumed);
-	  } catch (...) {
-	    delete parser;
-	    throw;
-	  }
-        }
-  #endif // 0
-
-    } // while reads left for parser
-
-  }
-
-void Hashtable::consume_partitioned_fasta_and_tag_with_labels(const std::string &filename,
-					  unsigned int &total_reads,
-					  unsigned long long &n_consumed,
-					  CallbackFn callback,
-					  void * callback_data)
-{
-  total_reads = 0;
-  n_consumed = 0;
-
-  IParser* parser = IParser::get_parser(filename.c_str());
-  Read read;
-
-  string seq = "";
-
-  // reset the master subset partition
-  delete partition;
-  partition = new SubsetPartition(this);
-
-  //
-  // iterate through the FASTA file & consume the reads.
-  //
-  Label * c;
-  PartitionID p;
-  while(!parser->is_complete())  {
-    read = parser->get_next_read();
-    seq = read.sequence;
-
-    if (check_and_normalize_read(seq)) {
-      // First, figure out what the partition is (if non-zero), and save that.
-      p = _parse_partition_id(read.name);
-      c = check_and_allocate_label(p);
-
-      consume_sequence_and_tag_with_labels( seq,
-					      n_consumed,
-					      *c );
-    }
-	       
-    // reset the sequence info, increment read number
-    total_reads++;
-
-    // run callback, if specified
-    if (total_reads % CALLBACK_PERIOD == 0 && callback) {
-      try {
-        callback("consume_partitioned_fasta_and_tag_with_labels", callback_data, 
-        total_reads, n_consumed);
-      } catch (...) {
-	delete parser;
-        throw;
-      }
-    }
-  }
-
-  // @cswelcher TODO: check that deallocate LabelPtrMap is correct
-  delete parser;
-}
-
-// @cswelcher: double-check -- is it valid to pull the address from a reference?
-void Hashtable::link_tag_and_label(HashIntoType& kmer, Label& kmer_label) {
-  tag_labels.insert(TagLabelPtrPair(kmer, &kmer_label));
-  label_tag_ptrs.insert(LabelTagPtrPair(kmer_label, &kmer));
-}
-
-/* This is essentially the same code as above, only it assigns labels to the
- * tags through multimap TagLabelMap defined in hashtable.hh, declared in
- * hashbits.hh
- * @cswelcher TODO: should I instead send in the pointer to the new label?
- */
-void Hashtable::consume_sequence_and_tag_with_labels(const std::string& seq,
-					unsigned long long& n_consumed,
-					Label& current_label,
-					SeenSet * found_tags)
-  {
-    bool is_new_kmer;
-    bool kmer_tagged;
-
-    KMerIterator kmers(seq.c_str(), _ksize);
-    HashIntoType kmer;
-
-    unsigned int since = _tag_density / 2 + 1;
-
-    while(!kmers.done()) {
-      kmer = kmers.next();
-
-      if ((is_new_kmer = test_and_set_bits( kmer )))
-        ++n_consumed;
-
-  #if (1)
-      if (is_new_kmer) {
-        ++since;
-      } else {
-        ACQUIRE_ALL_TAGS_SPIN_LOCK
-        kmer_tagged = set_contains(all_tags, kmer);
-        RELEASE_ALL_TAGS_SPIN_LOCK
-        if (kmer_tagged) {
-	      since = 1;
-	      
-	      // Labeling code
-	      // TODO: MAKE THREADSAFE!
-	      
-	      if (!_cmap_contains_label(tag_labels, kmer, current_label)) {
-	        ACQUIRE_TAG_COLORS_SPIN_LOCK
-	        link_tag_and_label(kmer, current_label);
-	        RELEASE_TAG_COLORS_SPIN_LOCK
-	      }
-	      if (found_tags) {
-	        found_tags->insert(kmer);
-	      }
-        }  else ++since;
-      }
-  #else
-      if (!is_new_kmer && set_contains(all_tags, kmer)) {
-        since = 1;
-        if (found_tags) { found_tags->insert(kmer); }
-      } else {
-        since++;
-      }
-  #endif
-      //
-      if (since >= _tag_density) {
-        ACQUIRE_ALL_TAGS_SPIN_LOCK
-        all_tags.insert(kmer);
-        RELEASE_ALL_TAGS_SPIN_LOCK
-        
-        // Labeling code
-        // TODO: MAKE THREADSAFE!
-        ACQUIRE_TAG_COLORS_SPIN_LOCK
-        link_tag_and_label(kmer, current_label);
-        RELEASE_TAG_COLORS_SPIN_LOCK
-        
-        if (found_tags) { found_tags->insert(kmer); }
-        since = 1;
-      }
-
-    } // iteration over kmers
-
-    if (since >= _tag_density/2 - 1) {
-      ACQUIRE_ALL_TAGS_SPIN_LOCK
-      all_tags.insert(kmer);	// insert the last k-mer, too.
-      RELEASE_ALL_TAGS_SPIN_LOCK
-      
-      // Label code: TODO: MAKE THREADSAFE!
-      link_tag_and_label(kmer, current_label);
-      
-      if (found_tags) { found_tags->insert(kmer); }
-    }
-  }
-/*
- * Find all labels associated with the sequence
- * For now, check /every/ k-mer with find_all_tags
- * THIS SUCKS AND IT'S YOUR FAULT @CTB
- */
-unsigned int Hashtable::sweep_sequence_for_labels(const std::string& seq,
-					LabelPtrSet& found_labels,
-					bool break_on_stoptags,
-					bool stop_big_traversals) {
-					
-    SeenSet tagged_kmers;
-    //LabelPtrSet found_labels;
-    
-    HashIntoType kmer_f, kmer_r, kmer;
-    
-    KMerIterator kmers(seq.c_str(), _ksize);
-    std::string kmer_s;
-    // keep a list of kmers which have already been traversed
-    SeenSet traversed_kmers;
-    while (!kmers.done()) {
-      kmer = kmers.next();
-      kmer_s = _revhash(kmer, _ksize);
-      _hash(kmer_s.c_str(), _ksize, kmer_f, kmer_r);
-      
-      // don't even try traversing from k-mers not in the hashtable
-      //traversed_kmers.clear();
-      if (get_count(uniqify_rc(kmer_f,kmer_r))) {
-        partition->find_all_tags(kmer_f, kmer_r, tagged_kmers,
-                   all_tags, break_on_stoptags, stop_big_traversals);
-        traverse_labels_and_resolve(tagged_kmers, found_labels);
-      }
-    }
-    return traversed_kmers.size();
-}
-
-unsigned int Hashtable::sweep_label_neighborhood(const std::string& seq,
-                                                  LabelPtrSet& found_labels,
-                                                  unsigned int range,
-                                                  bool break_on_stoptags,
-                                                  bool stop_big_traversals) {
-
-    SeenSet tagged_kmers;
-    unsigned int num_traversed;
-    num_traversed = partition->sweep_for_tags(seq, tagged_kmers, all_tags, 
-                              range, break_on_stoptags, stop_big_traversals);
-    traverse_labels_and_resolve(tagged_kmers, found_labels);
-    //printf("range=%u ", range);
-    if (range == 0) {
-      assert(num_traversed == seq.length()-ksize()+1);
-    }
-    tagged_kmers.clear();
-    return num_traversed;
-}
-
-LabelPtrSet Hashtable::get_tag_labels(const HashIntoType& tag) {
-  LabelPtrSet labels;
-  unsigned int num_labels;
-  _get_tag_labels(tag, tag_labels, labels);
-  return labels;
-}
-
-TagPtrSet Hashtable::get_label_tags(const Label& label) {
-  TagPtrSet tags;
-  unsigned int num_tags;
-  _get_tags_from_label(label, label_tag_ptrs, tags);
-  return tags;
-}
-
-void Hashtable::traverse_labels_and_resolve(const SeenSet& tagged_kmers,
-                                              LabelPtrSet& found_labels) {
-  
-  SeenSet::const_iterator si;
-  unsigned int num_labels = 0;
-  for (si=tagged_kmers.begin(); si!=tagged_kmers.end(); ++si) {
-    HashIntoType tag = *si;
-    // get the labels associated with this tag
-    num_labels = _get_tag_labels(tag, tag_labels, found_labels);
-    if (num_labels > 1) {
-      // reconcile labels
-      // for now do nothing ha
-    }
-  }
-}
-
 // vim: set sts=2 sw=2:

From 6ee7f91c26e4e4414657f56da9f18e3ab1649ee5 Mon Sep 17 00:00:00 2001
From: Camille Scott <camille.scott.w@gmail.com>
Date: Tue, 17 Dec 2013 13:03:05 -0500
Subject: [PATCH 108/140] switched include ordering back

---
 khmer/_khmermodule.cc | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/khmer/_khmermodule.cc b/khmer/_khmermodule.cc
index cef9f7d724..4757df82d2 100644
--- a/khmer/_khmermodule.cc
+++ b/khmer/_khmermodule.cc
@@ -17,8 +17,8 @@
 #include "khmer_config.hh"
 #include "ktable.hh"
 #include "hashtable.hh"
-#include "counting.hh"
 #include "hashbits.hh"
+#include "counting.hh"
 #include "storage.hh"
 #include "aligner.hh"
 #include "labelhash.hh"
@@ -4933,13 +4933,14 @@ init_khmer(void)
         return;
     }
     // add LabelHash
+    
     khmer_KLabelHashType.tp_base = &khmer_KHashbitsType;
     khmer_KLabelHashType.tp_new = khmer_labelhash_new;
     if (PyType_Ready(&khmer_KLabelHashType) < 0) {
         std::cout << "_khmer.KLabelHashType failed PyType_Ready" << std::endl; 
         return;
     }
-
+    
     PyObject * m;
     m = Py_InitModule3( "_khmer", KhmerMethods, 
                         "interface for the khmer module low-level extensions" );

From 7e8a1ac297b39e42cbadc93fdc0682ed3e7e943c Mon Sep 17 00:00:00 2001
From: Camille Scott <camille.scott.w@gmail.com>
Date: Tue, 17 Dec 2013 14:16:07 -0500
Subject: [PATCH 109/140] added setup.py which had been left out (still doesn't
 work)

---
 setup.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/setup.py b/setup.py
index ed76f07837..827bd02077 100755
--- a/setup.py
+++ b/setup.py
@@ -61,7 +61,7 @@
 build_depends.extend(map(
     lambda bn: path_join("lib", bn + ".hh"),
     [
-        "storage", "khmer", "khmer_config", "ktable", "hashtable", "counting",
+        "storage", "khmer", "khmer_config", "ktable", "hashtable", "counting", "hashbits",
     ]
 ))
 
@@ -70,9 +70,9 @@
     lambda bn: path_join("lib", bn + ".cc"),
     [
         "khmer_config", "thread_id_map", "trace_logger", "perf_metrics",
-        "read_parsers", "ktable", "hashtable", "hashbits", "counting",
-        "subset", "aligner", "scoringmatrix", "node", "kmer",
-    ]
+        "read_parsers", "ktable", "hashtable", "hashbits", "labelhash", "counting",
+        "subset", "aligner", "scoringmatrix", "node", "kmer",  
+   ]
 ))
 
 extension_mod_DICT = \
@@ -103,7 +103,7 @@
         "long_description": open("README.rst").read(),
         "author": 'Michael R. Crusoe, Greg Edvenson, Jordan Fish,'
         ' Adina Howe, Eric McDonald, Joshua Nahum, Kaben Nanlohy,'
-        ' Jason Pell, Jared Simpson, C. S. Welcher,'
+        ' Jason Pell, Jared Simpson, Camille Scott,'
         ' Qingpeng Zhang, and C. Titus Brown',
         "author_email": 'khmer-project@idyll.org',
         #"maintainer": 'Michael R. Crusoe', # this overrides the author field

From 5bbb493e7a58b75ccdfe5cd0a67124d507da151e Mon Sep 17 00:00:00 2001
From: Camille Scott <camille.scott.w@gmail.com>
Date: Tue, 17 Dec 2013 14:47:12 -0500
Subject: [PATCH 110/140] commented out undefined destructor

---
 lib/labelhash.hh | 3 +--
 setup.py         | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/lib/labelhash.hh b/lib/labelhash.hh
index 374ff02d78..3133a5c70b 100644
--- a/lib/labelhash.hh
+++ b/lib/labelhash.hh
@@ -75,12 +75,11 @@ namespace khmer {
         LabelHash( WordLength ksize, std::vector<HashIntoType>& tablesizes)
         : khmer::Hashbits(ksize, tablesizes)
         {
-            // constructor
             _tag_labels_spin_lock = 0;
 
         }
         
-        ~LabelHash();
+        //~LabelHash();
 
         TagLabelPtrMap tag_labels;
         LabelTagPtrMap label_tag_ptrs;
diff --git a/setup.py b/setup.py
index 827bd02077..bf8bc05272 100755
--- a/setup.py
+++ b/setup.py
@@ -61,7 +61,7 @@
 build_depends.extend(map(
     lambda bn: path_join("lib", bn + ".hh"),
     [
-        "storage", "khmer", "khmer_config", "ktable", "hashtable", "counting", "hashbits",
+        "storage", "khmer", "khmer_config", "ktable", "hashtable", "counting", "hashbits", "labelhash",
     ]
 ))
 

From b07e3f0b169439cdc4df5686cc46986d5ef9eba8 Mon Sep 17 00:00:00 2001
From: Camille Scott <camille.scott.w@gmail.com>
Date: Tue, 17 Dec 2013 14:53:25 -0500
Subject: [PATCH 111/140] fixed namespaces

---
 lib/labelhash.cc | 2 --
 lib/labelhash.hh | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/lib/labelhash.cc b/lib/labelhash.cc
index b3d9a15ea8..d4e0a29627 100644
--- a/lib/labelhash.cc
+++ b/lib/labelhash.cc
@@ -6,9 +6,7 @@
 
 #include "labelhash.hh"
 
-using namespace std;
 using namespace khmer;
-using namespace khmer:: read_parsers;
 
 /*
  * @camillescott
diff --git a/lib/labelhash.hh b/lib/labelhash.hh
index 3133a5c70b..55f0a43ac1 100644
--- a/lib/labelhash.hh
+++ b/lib/labelhash.hh
@@ -79,7 +79,7 @@ namespace khmer {
 
         }
         
-        //~LabelHash();
+        ~LabelHash() {};
 
         TagLabelPtrMap tag_labels;
         LabelTagPtrMap label_tag_ptrs;

From a2fd39ae5f4f077970ddd91e6f1d9b53b571d295 Mon Sep 17 00:00:00 2001
From: Camille Scott <camille.scott.w@gmail.com>
Date: Tue, 17 Dec 2013 14:56:44 -0500
Subject: [PATCH 112/140] now the namespaces are nice fishjord

---
 lib/labelhash.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/labelhash.cc b/lib/labelhash.cc
index d4e0a29627..7093760bcc 100644
--- a/lib/labelhash.cc
+++ b/lib/labelhash.cc
@@ -7,6 +7,7 @@
 #include "labelhash.hh"
 
 using namespace khmer;
+using namespace khmer:: read_parsers;
 
 /*
  * @camillescott
@@ -139,7 +140,7 @@ void LabelHash::consume_partitioned_fasta_and_tag_with_labels(const std::string
   IParser* parser = IParser::get_parser(filename.c_str());
   Read read;
 
-  string seq = "";
+  std::string seq = "";
 
   // reset the master subset partition
   delete partition;

From 1ac3d6f59e9fdc207a71837ea11a5458b37617d0 Mon Sep 17 00:00:00 2001
From: Camille Scott <camille.scott.w@gmail.com>
Date: Wed, 18 Dec 2013 01:18:38 -0500
Subject: [PATCH 113/140] stripped final remnants of labeling code out of
 hashtable

---
 khmer/__init__.py     | 13 +++++++++++--
 khmer/_khmermodule.cc | 14 ++++++++------
 lib/labelhash.cc      |  3 ++-
 3 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/khmer/__init__.py b/khmer/__init__.py
index 01683832e0..04f2038f0f 100644
--- a/khmer/__init__.py
+++ b/khmer/__init__.py
@@ -15,7 +15,7 @@
 from _khmer import reverse_hash
 from _khmer import get_config
 from _khmer import ReadParser
-from _khmer import LabelHash
+from _khmer import _LabelHash
 
 from ._version import get_versions
 __version__ = get_versions()['version']
@@ -29,7 +29,6 @@ def new_hashbits(k, starting_size, n_tables=2):
 
     return _new_hashbits(k, primes)
 
-
 def new_counting_hash(k, starting_size, n_tables=2, n_threads=1):
     primes = get_n_primes_above_x(n_tables, starting_size)
 
@@ -149,4 +148,14 @@ def get_n_primes_above_x(n, x):
         i += 2
     return primes
 
+class LabelHash(_LabelHash):
+    def __new__(cls, k, starting_size, n_tables):
+        print "** LabelHash __new__"
+        print "\t*** Getting primes..."
+        primes = get_n_primes_above_x(n_tables, starting_size)
+        print "\t*** Invoking parent..."
+        c = _LabelHash.__new__(cls, k, primes)
+        print "\t*** Done with parent, returning class object"
+        c.primes = primes
+        return c
 
diff --git a/khmer/_khmermodule.cc b/khmer/_khmermodule.cc
index 4757df82d2..e1b85d3f54 100644
--- a/khmer/_khmermodule.cc
+++ b/khmer/_khmermodule.cc
@@ -4215,8 +4215,8 @@ static PyObject * khmer_labelhash_new(PyTypeObject *type, PyObject *args, PyObje
 
         // We want the hashbits pointer in the base class to point to our labelhash,
         // so that the KHashbits methods are called on the correct object (a LabelHash)
-        self->khashbits.hashbits = (khmer::Hashbits *)self->labelhash;
         self->labelhash = new khmer::LabelHash(k, sizes);
+        self->khashbits.hashbits = (khmer::Hashbits *)self->labelhash;
     }
 
     return (PyObject *) self;     
@@ -4226,6 +4226,7 @@ static int khmer_labelhash_init(khmer_KLabelHashObject * self, PyObject *args, P
 {
     if (khmer_KHashbitsType.tp_init((PyObject *)self, args, kwds) < 0)
         return -1;
+    std::cout << "testing my pointer ref to hashbits: " << self->khashbits.hashbits->n_tags() << std::endl;
     return 0;
 }
 
@@ -4306,24 +4307,25 @@ static PyObject * labelhash_consume_partitioned_fasta_and_tag_with_labels(
 static PyObject * labelhash_consume_sequence_and_tag_with_labels(PyObject * self, PyObject * args) {
   khmer_KLabelHashObject * me = (khmer_KLabelHashObject *) self;
   khmer::LabelHash * hb = me->labelhash;
-  
+  std::cout << "inside labelhash consume cpython func, parsing args..." << std::endl;
   char * seq = NULL;
   unsigned long long c = NULL;
   if (!PyArg_ParseTuple(args, "sK", &seq, &c)) {
     return NULL;
   }
-  
+  std::cout << "parsed args, getting new label" << std::endl;
   unsigned long long n_consumed = 0;
   khmer::Label * the_label = hb->check_and_allocate_label(c);
 
   try { 
   //if (hb->check_and_normalize_read(seq)) {
-    
+    std::cout << "calling low level consume func on labelhash..." << std::endl;
     hb->consume_sequence_and_tag_with_labels(seq, n_consumed, *the_label);
   //}
   } catch (_khmer_signal &e) {
     return NULL;
   }
+  std::cout << "packaging return value and returning!" << std::endl;
   return Py_BuildValue("L", n_consumed);
 }
 
@@ -4510,7 +4512,7 @@ khmer_labelhash_getattr(PyObject * obj, char * name)
 static PyTypeObject khmer_KLabelHashType = {
     PyObject_HEAD_INIT(NULL)
     0,                       /* ob_size */
-   "LabelHash",            /* tp_name */ 
+   "_LabelHash",            /* tp_name */ 
     sizeof(khmer_KLabelHashObject), /* tp_basicsize */
     0,                       /* tp_itemsize */
     (destructor)khmer_labelhash_dealloc, /* tp_dealloc */
@@ -4976,7 +4978,7 @@ init_khmer(void)
     PyModule_AddObject(m, "Hashbits", (PyObject *)&khmer_KHashbitsType);
 
     Py_INCREF(&khmer_KLabelHashType);
-    PyModule_AddObject(m, "LabelHash", (PyObject *)&khmer_KLabelHashType);
+    PyModule_AddObject(m, "_LabelHash", (PyObject *)&khmer_KLabelHashType);
 }
 
 // vim: set ft=cpp sts=4 sw=4 tw=79:
diff --git a/lib/labelhash.cc b/lib/labelhash.cc
index 7093760bcc..7577abd453 100644
--- a/lib/labelhash.cc
+++ b/lib/labelhash.cc
@@ -195,6 +195,8 @@ void LabelHash::consume_sequence_and_tag_with_labels(const std::string& seq,
 					Label& current_label,
 					SeenSet * found_tags)
   {
+
+    std::cout << "inside low-level labelhash consume sequence function" << std::endl;
     bool is_new_kmer;
     bool kmer_tagged;
 
@@ -272,7 +274,6 @@ void LabelHash::consume_sequence_and_tag_with_labels(const std::string& seq,
 /*
  * Find all labels associated with the sequence
  * For now, check /every/ k-mer with find_all_tags
- * THIS SUCKS AND IT'S YOUR FAULT @CTB
  */
 unsigned int LabelHash::sweep_sequence_for_labels(const std::string& seq,
 					LabelPtrSet& found_labels,

From 7478d691fc1b374493abdb854b90908f7fea5b3c Mon Sep 17 00:00:00 2001
From: Camille Scott <camille.scott.w@gmail.com>
Date: Wed, 18 Dec 2013 01:19:04 -0500
Subject: [PATCH 114/140] some inlined functions in hashtable.hh removed

---
 lib/hashtable.hh | 59 +-----------------------------------------------
 1 file changed, 1 insertion(+), 58 deletions(-)

diff --git a/lib/hashtable.hh b/lib/hashtable.hh
index a306816581..3d8ec0f72b 100644
--- a/lib/hashtable.hh
+++ b/lib/hashtable.hh
@@ -180,59 +180,7 @@ namespace khmer {
 
     WordLength	    _ksize;
     HashIntoType    bitmask;
-    unsigned int    _nbits_sub_1;
-
-    // Does the given tag already have the given label?
-    bool _cmap_contains_label(const TagLabelPtrMap& cmap,
-                        HashIntoType& kmer,
-                        Label& the_label)
-    {
-      std::pair<TagLabelPtrMap::const_iterator, TagLabelPtrMap::const_iterator> ret;
-      ret = cmap.equal_range(kmer);
-      for (TagLabelPtrMap::const_iterator it=ret.first; it!=ret.second; ++it) {
-        if (*(it->second) == the_label) return true;
-      }
-      return false;
-    }
-
-    // Does the given label already have a tag associated with it?
-    bool _cmap_contains_tag(const LabelTagPtrMap& cmap,
-                            Label& the_label,
-                            HashIntoType& kmer) {
-      std::pair<LabelTagPtrMap::const_iterator, LabelTagPtrMap::const_iterator> ret;
-      ret = cmap.equal_range(the_label);
-      for (LabelTagPtrMap::const_iterator it=ret.first; it!=ret.second; ++it) {
-        if(*(it->second) == kmer) return true;
-      }
-      return false;
-    }
-    
-    unsigned int _get_tag_labels(const HashIntoType& tag,
-                          const TagLabelPtrMap& cmap,
-                          LabelPtrSet& found_labels) {
-        unsigned int num_labels = 0;
-        std::pair<TagLabelPtrMap::const_iterator, TagLabelPtrMap::const_iterator> ret;
-        ret = cmap.equal_range(tag);
-        for (TagLabelPtrMap::const_iterator it=ret.first; it!=ret.second; ++it) {
-            found_labels.insert(it->second);
-            ++num_labels;
-        }
-        return num_labels;
-    }
-    
-    unsigned int _get_tags_from_label(const Label& label,
-                               const LabelTagPtrMap& cmap,
-                               TagPtrSet& labeled_tags) {
-        unsigned int num_tags = 0;
-        std::pair<LabelTagPtrMap::const_iterator, LabelTagPtrMap::const_iterator> ret;
-        ret = cmap.equal_range(label);
-        for (LabelTagPtrMap::const_iterator it=ret.first; it!=ret.second; ++it) {
-            labeled_tags.insert(it->second);
-            ++num_tags;
-        }
-        return num_tags;
-    }
-    
+    unsigned int    _nbits_sub_1;  
     
     Hashtable(
 	WordLength	ksize,
@@ -253,7 +201,6 @@ namespace khmer {
       partition = new SubsetPartition(this);
       _init_bitstuff();
       _all_tags_spin_lock = 0;
-      _tag_labels_spin_lock = 0;
       
     }
 
@@ -367,15 +314,11 @@ namespace khmer {
     }
 
     uint32_t _all_tags_spin_lock;
-    uint32_t _tag_labels_spin_lock;
   public:
     SubsetPartition * partition;
     SeenSet all_tags;
     SeenSet stop_tags;
     SeenSet repart_small_tags;
-    TagLabelPtrMap tag_labels;
-    LabelTagPtrMap label_tag_ptrs;
-    LabelPtrMap label_ptrs;
 
     // accessor to get 'k'
     const WordLength ksize() const { return _ksize; }

From e75a83816340977ccf9dde2b46c31b0c9323f167 Mon Sep 17 00:00:00 2001
From: Camille Scott <camille.scott.w@gmail.com>
Date: Fri, 20 Dec 2013 10:10:17 -0500
Subject: [PATCH 115/140] narrowing down freeze-up: all_tags spinlock in
 labelhash

---
 lib/labelhash.cc   |  35 ++++++++++++----
 lib/labelhash.hh   |   1 +
 lib/test-Colors.cc | 101 +++++++--------------------------------------
 3 files changed, 42 insertions(+), 95 deletions(-)

diff --git a/lib/labelhash.cc b/lib/labelhash.cc
index 7577abd453..761b6bdeb8 100644
--- a/lib/labelhash.cc
+++ b/lib/labelhash.cc
@@ -6,6 +6,9 @@
 
 #include "labelhash.hh"
 
+#define LABEL_DBG 1
+#define printdbg(m) if(LABEL_DBG) std::cout << #m << std::endl;
+
 using namespace khmer;
 using namespace khmer:: read_parsers;
 
@@ -186,8 +189,10 @@ void LabelHash::consume_partitioned_fasta_and_tag_with_labels(const std::string
 
 // @cswelcher: double-check -- is it valid to pull the address from a reference?
 void LabelHash::link_tag_and_label(HashIntoType& kmer, Label& kmer_label) {
-  tag_labels.insert(TagLabelPtrPair(kmer, &kmer_label));
-  label_tag_ptrs.insert(LabelTagPtrPair(kmer_label, &kmer));
+    printdbg(linking tag and label)
+    tag_labels.insert(TagLabelPtrPair(kmer, &kmer_label));
+    label_tag_ptrs.insert(LabelTagPtrPair(kmer_label, &kmer));
+    printdbg(done linking tag and label)
 }
 
 void LabelHash::consume_sequence_and_tag_with_labels(const std::string& seq,
@@ -196,7 +201,8 @@ void LabelHash::consume_sequence_and_tag_with_labels(const std::string& seq,
 					SeenSet * found_tags)
   {
 
-    std::cout << "inside low-level labelhash consume sequence function" << std::endl;
+    printdbg(inside low-level labelhash consume sequence function)
+
     bool is_new_kmer;
     bool kmer_tagged;
 
@@ -204,35 +210,45 @@ void LabelHash::consume_sequence_and_tag_with_labels(const std::string& seq,
     HashIntoType kmer;
 
     unsigned int since = _tag_density / 2 + 1;
-
+    
+    printdbg(entering while loop)
     while(!kmers.done()) {
       kmer = kmers.next();
 
       if ((is_new_kmer = test_and_set_bits( kmer )))
         ++n_consumed;
+        printdbg(test_and_set_bits)
 
   #if (1)
       if (is_new_kmer) {
+        printdbg(new kmer...)
         ++since;
       } else {
+        printdbg(entering tag spin lock)
         ACQUIRE_ALL_TAGS_SPIN_LOCK
         kmer_tagged = set_contains(all_tags, kmer);
         RELEASE_ALL_TAGS_SPIN_LOCK
+        printdbg(released tag spin lock)
         if (kmer_tagged) {
 	      since = 1;
-	      
+	      printdbg(kmer already in all_tags)
 	      // Labeling code
 	      // TODO: MAKE THREADSAFE!
 	      
 	      if (!_cmap_contains_label(tag_labels, kmer, current_label)) {
+            printdbg(tag was not labeled: adding to labels...)
 	        ACQUIRE_TAG_COLORS_SPIN_LOCK
 	        link_tag_and_label(kmer, current_label);
 	        RELEASE_TAG_COLORS_SPIN_LOCK
+            printdbg(released label spin lock)
 	      }
 	      if (found_tags) {
 	        found_tags->insert(kmer);
 	      }
-        }  else ++since;
+        }  else {
+            printdbg(inc since var)
+            ++since;
+        }
       }
   #else
       if (!is_new_kmer && set_contains(all_tags, kmer)) {
@@ -244,9 +260,12 @@ void LabelHash::consume_sequence_and_tag_with_labels(const std::string& seq,
   #endif
       //
       if (since >= _tag_density) {
+        printdbg(exceeded tag density: drop a tag and label -- getting tag lock)
         ACQUIRE_ALL_TAGS_SPIN_LOCK
+        printdbg(in tag spin lock)
         all_tags.insert(kmer);
         RELEASE_ALL_TAGS_SPIN_LOCK
+        printdbg(released tag spin lock)
         
         // Labeling code
         // TODO: MAKE THREADSAFE!
@@ -257,9 +276,9 @@ void LabelHash::consume_sequence_and_tag_with_labels(const std::string& seq,
         if (found_tags) { found_tags->insert(kmer); }
         since = 1;
       }
-
+    printdbg(moving to next iter)
     } // iteration over kmers
-
+    printdbg(finished iteration: dropping last tag)
     if (since >= _tag_density/2 - 1) {
       ACQUIRE_ALL_TAGS_SPIN_LOCK
       all_tags.insert(kmer);	// insert the last k-mer, too.
diff --git a/lib/labelhash.hh b/lib/labelhash.hh
index 55f0a43ac1..755054d433 100644
--- a/lib/labelhash.hh
+++ b/lib/labelhash.hh
@@ -76,6 +76,7 @@ namespace khmer {
         : khmer::Hashbits(ksize, tablesizes)
         {
             _tag_labels_spin_lock = 0;
+            _all_tags_spin_lock = 0;
 
         }
         
diff --git a/lib/test-Colors.cc b/lib/test-Colors.cc
index 6da9e7e500..dbd8a2dc02 100644
--- a/lib/test-Colors.cc
+++ b/lib/test-Colors.cc
@@ -1,93 +1,20 @@
-//
-// This file is part of khmer, http://github.com/ged-lab/khmer/, and is
-// Copyright (C) Michigan State University, 2009-2013. It is licensed under
-// the three-clause BSD license; see doc/LICENSE.txt. Contact: ctb@msu.edu
-//
+#include "khmer.hh"
+#include "hashtable.hh"
+#include "hashbits.hh"
+#include "labelhash.hh"
+#include <iostream>
 
-// Simple C++ implementation of the 'load-graph' Python script.
-
-
-#include <cstring>
-#include <cstdio>
-#include <cerrno>
-#include <cstdlib>
-#include <unistd.h>
-#include <getopt.h>
-#include <time.h>
-#include <omp.h>
-
-//#define HASH_TYPE_TO_TEST   1 // Counting Hash
-#define HASH_TYPE_TO_TEST   2 // Bit Hash
-
-// #define OUTPUT_HASHTABLE
-
-
-#include "error.hh"
-#include "read_parsers.hh"
-#if HASH_TYPE_TO_TEST == 1
-#  include "counting.hh"
-#elif HASH_TYPE_TO_TEST == 2
-#  include "hashbits.hh"
-#else
-#  error "No HASH_TYPE_TO_TEST macro defined."
-#endif
-#include "primes.hh"
-
-using namespace std;
 using namespace khmer;
-using namespace khmer:: read_parsers;
-
-
 
+int main() {
+    HashIntoType sizes[] = { 100000003, 100000004, 100000007, 10000000011};
+    std::vector<HashIntoType> sizes_vec (sizes, sizes + sizeof(sizes) / sizeof(HashIntoType) );
 
-int main( int argc, char * argv[ ] )
-{
-    unsigned long	kmer_length	    = 20;
-    float		ht_size_FP	    = 1.0E8;
-    unsigned long	ht_count	    = 4;
-    uint64_t		cache_size	    = 4L * 1024 * 1024 * 1024;
-    unsigned int	range		    = 82;
-    int			rc		    = 0;
-    int			opt		    = -1;
-    char *		conv_residue	    = NULL;
-    string		rfile_name = "/w/tag_coloring/test_reads.fq";
-    string		ifile_name = "/w/tag_coloring/petMar_test.fp";
-    // FILE *		ofile		    = NULL;
-    HashIntoType	    ht_size		= (HashIntoType)ht_size_FP;
-    Primes primetab( ht_size );
-    vector<HashIntoType> ht_sizes;
-    for ( unsigned int i = 0; i < ht_count; ++i )
-	ht_sizes.push_back( primetab.get_next_prime( ) );
+    khmer::LabelHash * lh_pointer = new khmer::LabelHash(20, sizes_vec);
+    khmer::Hashbits * hb_pointer = (khmer::Hashbits *)lh_pointer;
 
-    unsigned int	    reads_total		= 0;
-    unsigned long long int  n_consumed		= 0;
-    printf("consuming test fastp...\n");
-    Hashbits ht( kmer_length, ht_sizes );
-    ht.consume_partitioned_fasta_and_tag_with_colors( ifile_name, reads_total, n_consumed );
-    printf("consume %u sequences, graph has %u colors\n", reads_total, ht.n_colors());
-    IParser * parser = IParser:: get_parser(rfile_name.c_str());
-    Read read;
-    unsigned int num_traversed;
-    unsigned int num_reads = 0;
-    string seq = "";
-    clock_t st = clock();
-    while(!parser->is_complete()) {
-	read = parser->get_next_read();
-	seq = read.sequence;
-	ColorPtrSet found_colors;
-	num_traversed = ht.sweep_color_neighborhood(seq, found_colors, range, false, false);
-	if (num_reads % 50000 == 0) {
-	    st = clock() - st;
-	    printf("traversed %u kmers in %d ticks (%f seconds)\n", num_traversed,
-								st,
-								((float)st/CLOCKS_PER_SEC));
-	st = clock();
-	}
-	found_colors.clear();
-	num_reads++;
-    }
-    return rc;
+    std::cout << "lh_pointer n_tags: " << lh_pointer->n_tags() << std::endl;
+    std::cout << "hb_pointer n_tags: " << hb_pointer->n_tags() << std::endl;
+    
+    return 0;
 }
-
-
-// vim: set sts=4 sw=4 tw=80:

From 13bff6ebca93f9f16017823b12769c5b23de9ffa Mon Sep 17 00:00:00 2001
From: Camille Scott <camille.scott.w@gmail.com>
Date: Fri, 20 Dec 2013 15:09:46 -0500
Subject: [PATCH 116/140] tracked down SIGBUS error to labelhash_dealloc
 function not calling tp_free, fixed

---
 khmer/_khmermodule.cc                        | 22 +++---
 lib/labelhash.cc                             | 34 +++++----
 lib/labelhash.hh                             |  2 +-
 scripts/sweep-reads-by-partition-buffered.py |  2 +-
 tests/test_labelhash.py                      | 72 +++++++++++---------
 5 files changed, 76 insertions(+), 56 deletions(-)

diff --git a/khmer/_khmermodule.cc b/khmer/_khmermodule.cc
index e1b85d3f54..0b7b31ba58 100644
--- a/khmer/_khmermodule.cc
+++ b/khmer/_khmermodule.cc
@@ -1835,7 +1835,7 @@ static PyObject * hash_abundance_distribution(PyObject * self, PyObject * args)
     return NULL;
   }
 
-  assert(is_hashbits_obj(tracking_obj));
+  //assert(is_hashbits_obj(tracking_obj));
 
   khmer_KHashbitsObject * tracking_o = (khmer_KHashbitsObject *) tracking_obj;
   khmer::Hashbits * hashbits = tracking_o->hashbits;
@@ -1875,7 +1875,7 @@ static PyObject * hash_abundance_distribution_with_reads_parser(PyObject * self,
   khmer:: read_parsers:: IParser * rparser = 
     _PyObject_to_khmer_ReadParser(rparser_obj);
 
-  assert(is_hashbits_obj(tracking_obj));
+  //assert(is_hashbits_obj(tracking_obj));
 
   khmer_KHashbitsObject * tracking_o = (khmer_KHashbitsObject *) tracking_obj;
   khmer::Hashbits * hashbits = tracking_o->hashbits;
@@ -4182,13 +4182,15 @@ static PyObject * khmer_labelhash_new(PyTypeObject * type, PyObject *args, PyObj
 // khmer_labelhash_dealloc -- clean up a labelhash object.
 //
 
-static void khmer_labelhash_dealloc(PyObject* self)
+static void khmer_labelhash_dealloc(PyObject* obj)
 {
-  khmer_KLabelHashObject * obj = (khmer_KLabelHashObject *) self;
-  delete obj->labelhash;
-  obj->labelhash = NULL;
+  khmer_KLabelHashObject * self = (khmer_KLabelHashObject *) obj;
+
+  delete self->labelhash;
+  self->labelhash = NULL;
   
-  PyObject_Del((PyObject *) obj);
+  obj->ob_type->tp_free((PyObject*)self);
+  //PyObject_Del((PyObject *) obj);
 }
 
 // a little wierd; we don't actually want to call Hashbits' new method. Rather, we
@@ -4204,6 +4206,7 @@ static PyObject * khmer_labelhash_new(PyTypeObject *type, PyObject *args, PyObje
         PyObject* sizes_list_o = NULL;
 
         if (!PyArg_ParseTuple(args, "IO", &k, &sizes_list_o)) {
+            Py_DECREF(self);
             return NULL;
         }
 
@@ -4227,6 +4230,8 @@ static int khmer_labelhash_init(khmer_KLabelHashObject * self, PyObject *args, P
     if (khmer_KHashbitsType.tp_init((PyObject *)self, args, kwds) < 0)
         return -1;
     std::cout << "testing my pointer ref to hashbits: " << self->khashbits.hashbits->n_tags() << std::endl;
+    std::cout << "hashbits: " << self->khashbits.hashbits << std::endl;
+    std::cout << "labelhash: " << self->labelhash << std::endl;
     return 0;
 }
 
@@ -4298,9 +4303,10 @@ static PyObject * labelhash_consume_partitioned_fasta_and_tag_with_labels(
     labelhash->consume_partitioned_fasta_and_tag_with_labels(filename, 
     total_reads, n_consumed, _report_fn, callback_obj);
   } catch (_khmer_signal &e) {
+    std::cout << "caught exception in consume_partitioned_fasta_and_tag_with_labels!" << std::endl;
     return NULL;
   }
-
+  std::cout << "building value for return..." << std::endl;
   return Py_BuildValue("iK", total_reads, n_consumed);
 }
 
diff --git a/lib/labelhash.cc b/lib/labelhash.cc
index 761b6bdeb8..ebeb6554b7 100644
--- a/lib/labelhash.cc
+++ b/lib/labelhash.cc
@@ -6,7 +6,7 @@
 
 #include "labelhash.hh"
 
-#define LABEL_DBG 1
+#define LABEL_DBG 0
 #define printdbg(m) if(LABEL_DBG) std::cout << #m << std::endl;
 
 using namespace khmer;
@@ -146,8 +146,8 @@ void LabelHash::consume_partitioned_fasta_and_tag_with_labels(const std::string
   std::string seq = "";
 
   // reset the master subset partition
-  delete partition;
-  partition = new SubsetPartition(this);
+  //delete partition;
+  //partition = new SubsetPartition(this);
 
   //
   // iterate through the FASTA file & consume the reads.
@@ -160,12 +160,15 @@ void LabelHash::consume_partitioned_fasta_and_tag_with_labels(const std::string
 
     if (check_and_normalize_read(seq)) {
       // First, figure out what the partition is (if non-zero), and save that.
+      printdbg(parsing partition id)
       p = _parse_partition_id(read.name);
+      printdbg(checking label and allocating if necessary)
       c = check_and_allocate_label(p);
-
+      printdbg(consuming sequence and tagging)
       consume_sequence_and_tag_with_labels( seq,
 					      n_consumed,
 					      *c );
+      printdbg(back in consume_partitioned)
     }
 	       
     // reset the sequence info, increment read number
@@ -182,9 +185,11 @@ void LabelHash::consume_partitioned_fasta_and_tag_with_labels(const std::string
       }
     }
   }
+  printdbg(done with while loop in consume_partitioned)
 
   // @cswelcher TODO: check that deallocate LabelPtrMap is correct
   delete parser;
+  printdbg(deleted parser and exiting)
 }
 
 // @cswelcher: double-check -- is it valid to pull the address from a reference?
@@ -225,9 +230,9 @@ void LabelHash::consume_sequence_and_tag_with_labels(const std::string& seq,
         ++since;
       } else {
         printdbg(entering tag spin lock)
-        ACQUIRE_ALL_TAGS_SPIN_LOCK
+        //ACQUIRE_ALL_TAGS_SPIN_LOCK
         kmer_tagged = set_contains(all_tags, kmer);
-        RELEASE_ALL_TAGS_SPIN_LOCK
+        //RELEASE_ALL_TAGS_SPIN_LOCK
         printdbg(released tag spin lock)
         if (kmer_tagged) {
 	      since = 1;
@@ -237,9 +242,9 @@ void LabelHash::consume_sequence_and_tag_with_labels(const std::string& seq,
 	      
 	      if (!_cmap_contains_label(tag_labels, kmer, current_label)) {
             printdbg(tag was not labeled: adding to labels...)
-	        ACQUIRE_TAG_COLORS_SPIN_LOCK
+	        //ACQUIRE_TAG_COLORS_SPIN_LOCK
 	        link_tag_and_label(kmer, current_label);
-	        RELEASE_TAG_COLORS_SPIN_LOCK
+	        //RELEASE_TAG_COLORS_SPIN_LOCK
             printdbg(released label spin lock)
 	      }
 	      if (found_tags) {
@@ -261,17 +266,17 @@ void LabelHash::consume_sequence_and_tag_with_labels(const std::string& seq,
       //
       if (since >= _tag_density) {
         printdbg(exceeded tag density: drop a tag and label -- getting tag lock)
-        ACQUIRE_ALL_TAGS_SPIN_LOCK
+        //ACQUIRE_ALL_TAGS_SPIN_LOCK
         printdbg(in tag spin lock)
         all_tags.insert(kmer);
-        RELEASE_ALL_TAGS_SPIN_LOCK
+        //RELEASE_ALL_TAGS_SPIN_LOCK
         printdbg(released tag spin lock)
         
         // Labeling code
         // TODO: MAKE THREADSAFE!
-        ACQUIRE_TAG_COLORS_SPIN_LOCK
+        //ACQUIRE_TAG_COLORS_SPIN_LOCK
         link_tag_and_label(kmer, current_label);
-        RELEASE_TAG_COLORS_SPIN_LOCK
+        //RELEASE_TAG_COLORS_SPIN_LOCK
         
         if (found_tags) { found_tags->insert(kmer); }
         since = 1;
@@ -280,15 +285,16 @@ void LabelHash::consume_sequence_and_tag_with_labels(const std::string& seq,
     } // iteration over kmers
     printdbg(finished iteration: dropping last tag)
     if (since >= _tag_density/2 - 1) {
-      ACQUIRE_ALL_TAGS_SPIN_LOCK
+      //ACQUIRE_ALL_TAGS_SPIN_LOCK
       all_tags.insert(kmer);	// insert the last k-mer, too.
-      RELEASE_ALL_TAGS_SPIN_LOCK
+      //RELEASE_ALL_TAGS_SPIN_LOCK
       
       // Label code: TODO: MAKE THREADSAFE!
       link_tag_and_label(kmer, current_label);
       
       if (found_tags) { found_tags->insert(kmer); }
     }
+  printdbg(done with low-level consume)
   }
 /*
  * Find all labels associated with the sequence
diff --git a/lib/labelhash.hh b/lib/labelhash.hh
index 755054d433..6abaa788b1 100644
--- a/lib/labelhash.hh
+++ b/lib/labelhash.hh
@@ -80,7 +80,7 @@ namespace khmer {
 
         }
         
-        ~LabelHash() {};
+        //~LabelHash() {};
 
         TagLabelPtrMap tag_labels;
         LabelTagPtrMap label_tag_ptrs;
diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py
index 8e2cd7c6d9..f153282319 100755
--- a/scripts/sweep-reads-by-partition-buffered.py
+++ b/scripts/sweep-reads-by-partition-buffered.py
@@ -205,7 +205,7 @@ def main():
     output_buffer = ReadBufferManager(max_buffers, max_reads, buf_size, output_pref, outdir)
 
 	# consume the partitioned fasta with which to label the graph
-    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)
+    ht = khmer.LabelHash(K, HT_SIZE, N_HT)
     print >>sys.stderr, 'consuming fastp...'
     ht.consume_partitioned_fasta_and_tag_with_labels(input_fastp)
 
diff --git a/tests/test_labelhash.py b/tests/test_labelhash.py
index 7993091d74..f5a497edd9 100644
--- a/tests/test_labelhash.py
+++ b/tests/test_labelhash.py
@@ -48,6 +48,31 @@ def test_get_tag_labels():
     assert len(labels) == 1
     assert labels.pop() == 0L
 
+def test_consume_fasta_and_tag_with_labels():
+    lb = LabelHash(20, 1e7, 4)
+    read_1 = 'ACGTAACCGGTTAAACCCGGGTTTAAAACCCCGGGGTTTT'
+    filename = utils.get_test_data('test-transcript.fa')
+
+    total_reads, n_consumed = lb.consume_fasta_and_tag_with_labels(filename)
+    print "doing get"
+    assert lb.get(read_1[:20])
+    assert total_reads == 3
+    print "doing n_labels"
+    print lb.n_labels()
+    print "doing label dict"
+    print lb.get_label_dict()
+    print "get tagset"
+    for tag in lb.get_tagset():
+        print "forward hash"
+        print tag, khmer.forward_hash(tag, 20)
+    for record in screed.open(filename):
+        print "Sweeping tags"
+        print lb.sweep_tag_neighborhood(record.sequence, 40)
+        print "Sweeping labels..."
+        print lb.sweep_label_neighborhood(record.sequence, 40)
+    assert lb.n_labels() == 3
+
+
 def test_consume_partitioned_fasta_and_tag_with_labels():
     lb = LabelHash(20, 1e7, 4)
     filename = utils.get_test_data('real-partition-small.fa')
@@ -63,23 +88,24 @@ def test_consume_partitioned_fasta_and_tag_with_labels():
     assert labels.pop() == 2L
     assert lb.n_labels() == 1 
 
-def test_consume_fasta_and_tag_with_labels():
+def test_sweep_tag_neighborhood():
     lb = LabelHash(20, 1e7, 4)
-    read_1 = 'ACGTAACCGGTTAAACCCGGGTTTAAAACCCCGGGGTTTT'
-    filename = utils.get_test_data('test-transcript.fa')
+    filename = utils.get_test_data('single-read.fq')
+    lb.consume_fasta_and_tag(filename)
+    
+    tags = lb.sweep_tag_neighborhood('CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT')
+    assert len(tags) == 1
+    assert tags.pop() == 173473779682L
 
-    total_reads, n_consumed = lb.consume_fasta_and_tag_with_labels(filename)
 
-    assert lb.get(read_1[:20])
-    assert total_reads == 3
-    print lb.n_labels()
-    print lb.get_label_dict()
-    for tag in lb.get_tagset():
-        print tag, khmer.forward_hash(tag, 20)
-    for record in screed.open(filename):
-        print lb.sweep_tag_neighborhood(record.sequence, 40)
-        print lb.sweep_label_neighborhood(record.sequence, 40)
-    assert lb.n_labels() == 3
+def test_sweep_label_neighborhood():
+    lb = LabelHash(20, 1e7, 4)
+    filename = utils.get_test_data('single-read.fq')
+    lb.consume_fasta_and_tag_with_labels(filename)
+    
+    labels = lb.sweep_label_neighborhood('CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT')
+    assert len(labels) == 1
+    assert labels.pop() == 0L
 
 '''
 * The test data set as four reads: A, B, C, and D
@@ -123,21 +149,3 @@ def test_label_tag_correctness():
     assert len(labels) == 1
     assert 3L in labels
 
-def test_sweep_tag_neighborhood():
-    lb = LabelHash(20, 1e7, 4)
-    filename = utils.get_test_data('single-read.fq')
-    lb.consume_fasta_and_tag(filename)
-    
-    tags = lb.sweep_tag_neighborhood('CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT')
-    assert len(tags) == 1
-    assert tags.pop() == 173473779682L
-
-
-def test_sweep_label_neighborhood():
-    lb = LabelHash(20, 1e7, 4)
-    filename = utils.get_test_data('single-read.fq')
-    lb.consume_fasta_and_tag_with_labels(filename)
-    
-    labels = lb.sweep_label_neighborhood('CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT')
-    assert len(labels) == 1
-    assert labels.pop() == 0L

From 9ac22f072b9ddca2018d42406f9df02b8a234ad2 Mon Sep 17 00:00:00 2001
From: Camille Scott <camille.scott.w@gmail.com>
Date: Fri, 20 Dec 2013 15:34:46 -0500
Subject: [PATCH 117/140] swapped out tst-Colors in lib Makefile

---
 lib/Makefile | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lib/Makefile b/lib/Makefile
index 4a7a00def4..dbfb936414 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -74,7 +74,6 @@ WITH_INTERNAL_METRICS=false
 
 ### NOTE: No user-servicable parts below this line! ###
 
-
 CXXFLAGS=
 CXX_WARNING_FLAGS=-Wall
 CXX_OPTIMIZATION_FLAGS=-O3
@@ -171,7 +170,7 @@ CORE_OBJS= error.o khmer_config.o thread_id_map.o trace_logger.o perf_metrics.o
 PARSERS_OBJS= read_parsers.o
 
 all: $(ZLIB_OBJS) $(BZIP2_OBJS) $(CORE_OBJS) $(PARSERS_OBJS) hashtable.o hashbits.o subset.o counting.o test aligner.o scoringmatrix.o node.o kmer.o
-
+	echo ~~~~~~ YO WE IN THIS MAKEFILE ~~~~~~
 clean:
 	-(cd $(ZLIB_DIR) && make clean)
 	(cd $(BZIP2_DIR) && make -f Makefile-libbz2_so clean)

From 747dab0b5b3594c83b5d8d86df9e8133c6fd0617 Mon Sep 17 00:00:00 2001
From: Camille Scott <camille.scott.w@gmail.com>
Date: Fri, 20 Dec 2013 15:35:44 -0500
Subject: [PATCH 118/140] removed rogue test files?

---
 lib/color_tst.py      | 73 ----------------------------------------
 lib/color_tst_opt.py  | 78 -------------------------------------------
 lib/color_tst_slow.py | 78 -------------------------------------------
 3 files changed, 229 deletions(-)
 delete mode 100644 lib/color_tst.py
 delete mode 100644 lib/color_tst_opt.py
 delete mode 100644 lib/color_tst_slow.py

diff --git a/lib/color_tst.py b/lib/color_tst.py
deleted file mode 100644
index dcac725ec0..0000000000
--- a/lib/color_tst.py
+++ /dev/null
@@ -1,73 +0,0 @@
-import khmer
-import screed
-
-def reverse_comp(s):
-    ret = ''
-    for i in range(len(s)-1,-1,-1):
-        c = s[i]
-        if c == 'A':
-            ret += 'T'
-        elif c == 'T':
-            ret += 'A'
-        elif c == 'G':
-            ret += 'C'
-        else:
-            ret += 'G'
-    return ret
-
-ht = khmer.new_hashbits(20,1e8,4)
-print '#' * 200
-ht.consume_fasta_and_tag_with_colors('../tests/test-data/test-reads.fa')
-#print ht.sweep_sequence_for_colors('CACACACGGACATCGGAGAGAGGCTGAGACAGCGAGACACACAGAGACAGAGCGGAGAGGGCACAGACAGACAAGAGCATGAGAGATCGGCAGAGCGGTG', False, False)
-#print ht.sweep_sequence_for_colors('CGCCGTAGTCGTACTGGTTCTCCTCCGTGTACTCGTGCGCTGCCTCCACCTCTGGGCTGCTCATGCCCTCCATGTGACCTTCAGGCATGCCCTCGGAGAT', False, False)
-#print ht.sweep_sequence_for_colors('GGAGAGCCTGGGGCCAAGCCCGAGGGCATGCCTGAAGGTCACATGGAGGGCATGAGCAGCCCAG', False, False)
-#print ht.sweep_sequence_for_colors('TTTTTTGAATACGTTTAGTTAATATTTGTACTTCAATTAATAAAAATTTGCTATAATTTTTCCATTATCGCCAGTCACTCGCGTGATATAGGAAAAGGTT', False, False)
-#print ht.sweep_sequence_for_colors('AAGCAGTGGTATCAACGCAGAGTACGCGGGGACTCTGTCGCTGCTCCTCTAGCACAGAGAGCCAGAGACGGCTTACAGCAGCAGCATCATATAGCCTC', False, False)
-
-t0 = 'CCATGTAGCGCCGCACACCTTTGTAGGTGTTGTAATAATCTTCGATGACTTTCTTCGCTTCCTGACGGCTTATGCC'
-t1 = 'ACCGCGCGCGAATCGACGGTTGTCAGCCAAAGGCGTTCAACACCAGCACCGCCCTTAAGCCGCCCGCCCGCCGCCC'
-N = 1000
-
-for n, record in enumerate(screed.open('../tests/test-data/test-reads.fa')):
-    if n > N:
-        break
-    print '*' * 40
-    seq = record.sequence
-    print seq
-    colors = ht.sweep_sequence_for_colors(seq, False, False)
-    print 'colors from sweep:', colors
-    tags = ht.get_all_tags(seq)
-    print 'tags from get_all_tags:', tags
-    print 'colors from get_tag_colors:'
-    t_colors = set()
-    for tag in tags:
-        t_colors.update(ht.get_tag_colors(tag))
-    print t_colors
-    assert len(t_colors) == len(colors)
-
-'''
-file_pointers = {}
-for n, record in enumerate(screed.open('/w/2013-lamprey/syn_part/syn.sweep.fa')):
-    if n >= N:
-        break
-    if n % 1000 == 0:
-        print '...processed {} reads'.format(n)
-    colors = ht.sweep_sequence_for_colors(record.sequence, False, False)
-    for c in colors:
-        if c in file_pointers.viewkeys():
-            file_pointers[c].write('>{}\n{}\n'.format(record.name, record.sequence))
-        else:
-            file_pointers[c] = open('color_{}.fa'.format(c), 'wb')
-            file_pointers[c].write('>{}\n{}\n'.format(record.name, record.sequence))\
-'''
-'''
-ht = khmer.new_hashbits(25, 1e9,4)
-ht.consume_partitioned_fasta_and_tag_with_colors('/w/2013-lamprey/test.fp')
-
-for n, record in enumerate(screed.open('/w/lamprey-mrnaseq/reads/single/L82-a.fq.gz')):
-    if n >= N:
-        break
-    colors = ht.sweep_sequence_for_colors(record.sequence, False,  False)
-    if colors:
-        print colors
-'''
diff --git a/lib/color_tst_opt.py b/lib/color_tst_opt.py
deleted file mode 100644
index 8c75fe2e5b..0000000000
--- a/lib/color_tst_opt.py
+++ /dev/null
@@ -1,78 +0,0 @@
-import khmer
-import screed
-
-def reverse_comp(s):
-    ret = ''
-    for i in range(len(s)-1,-1,-1):
-        c = s[i]
-        if c == 'A':
-            ret += 'T'
-        elif c == 'T':
-            ret += 'A'
-        elif c == 'G':
-            ret += 'C'
-        else:
-            ret += 'G'
-    return ret
-
-ht = khmer.new_hashbits(20,1e8,4)
-ht.consume_fasta_and_tag_with_colors('../tests/test-data/test-reads.fa')
-N = 100
-for n, record in enumerate(screed.open('../tests/test-data/test-reads.fa')):
-    if n >= N:
-        break
-    ht.sweep_tag_neighborhood(record.sequence, 10)
-
-#print ht.sweep_sequence_for_colors('CACACACGGACATCGGAGAGAGGCTGAGACAGCGAGACACACAGAGACAGAGCGGAGAGGGCACAGACAGACAAGAGCATGAGAGATCGGCAGAGCGGTG', False, False)
-#print ht.sweep_sequence_for_colors('CGCCGTAGTCGTACTGGTTCTCCTCCGTGTACTCGTGCGCTGCCTCCACCTCTGGGCTGCTCATGCCCTCCATGTGACCTTCAGGCATGCCCTCGGAGAT', False, False)
-#print ht.sweep_sequence_for_colors('GGAGAGCCTGGGGCCAAGCCCGAGGGCATGCCTGAAGGTCACATGGAGGGCATGAGCAGCCCAG', False, False)
-#print ht.sweep_sequence_for_colors('TTTTTTGAATACGTTTAGTTAATATTTGTACTTCAATTAATAAAAATTTGCTATAATTTTTCCATTATCGCCAGTCACTCGCGTGATATAGGAAAAGGTT', False, False)
-#print ht.sweep_sequence_for_colors('AAGCAGTGGTATCAACGCAGAGTACGCGGGGACTCTGTCGCTGCTCCTCTAGCACAGAGAGCCAGAGACGGCTTACAGCAGCAGCATCATATAGCCTC', False, False)
-
-#t0 = 'CCATGTAGCGCCGCACACCTTTGTAGGTGTTGTAATAATCTTCGATGACTTTCTTCGCTTCCTGACGGCTTATGCC'
-#t1 = 'ACCGCGCGCGAATCGACGGTTGTCAGCCAAAGGCGTTCAACACCAGCACCGCCCTTAAGCCGCCCGCCCGCCGCCC'
-'''
-N = 100
-for n, record in enumerate(screed.open('../tests/test-data/test-reads.fa')):
-    if n > N:
-        break
-    print '*' * 40
-    seq = record.sequence
-    print seq
-    colors = ht.sweep_sequence_for_colors(seq, False, False)
-    print 'colors from sweep:', colors
-    tags = ht.get_all_tags(seq)
-    print 'tags from get_all_tags:', tags
-    print 'colors from get_tag_colors:'
-    t_colors = set()
-    for tag in tags:
-        t_colors.update(ht.get_tag_colors(tag))
-    print t_colors
-    assert len(t_colors) == len(colors)
-'''
-'''
-file_pointers = {}
-for n, record in enumerate(screed.open('/w/2013-lamprey/syn_part/syn.sweep.fa')):
-    if n >= N:
-        break
-    if n % 1000 == 0:
-        print '...processed {} reads'.format(n)
-    colors = ht.sweep_sequence_for_colors(record.sequence, False, False)
-    for c in colors:
-        if c in file_pointers.viewkeys():
-            file_pointers[c].write('>{}\n{}\n'.format(record.name, record.sequence))
-        else:
-            file_pointers[c] = open('color_{}.fa'.format(c), 'wb')
-            file_pointers[c].write('>{}\n{}\n'.format(record.name, record.sequence))\
-'''
-'''
-ht = khmer.new_hashbits(25, 1e9,4)
-ht.consume_partitioned_fasta_and_tag_with_colors('/w/2013-lamprey/test.fp')
-
-for n, record in enumerate(screed.open('/w/lamprey-mrnaseq/reads/single/L82-a.fq.gz')):
-    if n >= N:
-        break
-    colors = ht.sweep_sequence_for_colors(record.sequence, False,  False)
-    if colors:
-        print colors
-'''
diff --git a/lib/color_tst_slow.py b/lib/color_tst_slow.py
deleted file mode 100644
index 2f25f857e4..0000000000
--- a/lib/color_tst_slow.py
+++ /dev/null
@@ -1,78 +0,0 @@
-import khmer
-import screed
-
-def reverse_comp(s):
-    ret = ''
-    for i in range(len(s)-1,-1,-1):
-        c = s[i]
-        if c == 'A':
-            ret += 'T'
-        elif c == 'T':
-            ret += 'A'
-        elif c == 'G':
-            ret += 'C'
-        else:
-            ret += 'G'
-    return ret
-
-ht = khmer.new_hashbits(20,1e8,4)
-ht.consume_fasta_and_tag_with_colors('../tests/test-data/test-reads.fa')
-N = 100
-for n, record in enumerate(screed.open('../tests/test-data/test-reads.fa')):
-    if n >= N:
-        break
-    ht.sweep_color_neighborhood(record.sequence)
-
-#print ht.sweep_sequence_for_colors('CACACACGGACATCGGAGAGAGGCTGAGACAGCGAGACACACAGAGACAGAGCGGAGAGGGCACAGACAGACAAGAGCATGAGAGATCGGCAGAGCGGTG', False, False)
-#print ht.sweep_sequence_for_colors('CGCCGTAGTCGTACTGGTTCTCCTCCGTGTACTCGTGCGCTGCCTCCACCTCTGGGCTGCTCATGCCCTCCATGTGACCTTCAGGCATGCCCTCGGAGAT', False, False)
-#print ht.sweep_sequence_for_colors('GGAGAGCCTGGGGCCAAGCCCGAGGGCATGCCTGAAGGTCACATGGAGGGCATGAGCAGCCCAG', False, False)
-#print ht.sweep_sequence_for_colors('TTTTTTGAATACGTTTAGTTAATATTTGTACTTCAATTAATAAAAATTTGCTATAATTTTTCCATTATCGCCAGTCACTCGCGTGATATAGGAAAAGGTT', False, False)
-#print ht.sweep_sequence_for_colors('AAGCAGTGGTATCAACGCAGAGTACGCGGGGACTCTGTCGCTGCTCCTCTAGCACAGAGAGCCAGAGACGGCTTACAGCAGCAGCATCATATAGCCTC', False, False)
-
-#t0 = 'CCATGTAGCGCCGCACACCTTTGTAGGTGTTGTAATAATCTTCGATGACTTTCTTCGCTTCCTGACGGCTTATGCC'
-#t1 = 'ACCGCGCGCGAATCGACGGTTGTCAGCCAAAGGCGTTCAACACCAGCACCGCCCTTAAGCCGCCCGCCCGCCGCCC'
-'''
-N = 100
-for n, record in enumerate(screed.open('../tests/test-data/test-reads.fa')):
-    if n > N:
-        break
-    print '*' * 40
-    seq = record.sequence
-    print seq
-    colors = ht.sweep_sequence_for_colors(seq, False, False)
-    print 'colors from sweep:', colors
-    tags = ht.get_all_tags(seq)
-    print 'tags from get_all_tags:', tags
-    print 'colors from get_tag_colors:'
-    t_colors = set()
-    for tag in tags:
-        t_colors.update(ht.get_tag_colors(tag))
-    print t_colors
-    assert len(t_colors) == len(colors)
-'''
-'''
-file_pointers = {}
-for n, record in enumerate(screed.open('/w/2013-lamprey/syn_part/syn.sweep.fa')):
-    if n >= N:
-        break
-    if n % 1000 == 0:
-        print '...processed {} reads'.format(n)
-    colors = ht.sweep_sequence_for_colors(record.sequence, False, False)
-    for c in colors:
-        if c in file_pointers.viewkeys():
-            file_pointers[c].write('>{}\n{}\n'.format(record.name, record.sequence))
-        else:
-            file_pointers[c] = open('color_{}.fa'.format(c), 'wb')
-            file_pointers[c].write('>{}\n{}\n'.format(record.name, record.sequence))\
-'''
-'''
-ht = khmer.new_hashbits(25, 1e9,4)
-ht.consume_partitioned_fasta_and_tag_with_colors('/w/2013-lamprey/test.fp')
-
-for n, record in enumerate(screed.open('/w/lamprey-mrnaseq/reads/single/L82-a.fq.gz')):
-    if n >= N:
-        break
-    colors = ht.sweep_sequence_for_colors(record.sequence, False,  False)
-    if colors:
-        print colors
-'''

From 9289e0c0fad9d495d939172f4e03aefad6fea0e7 Mon Sep 17 00:00:00 2001
From: Camille Scott <camille.scott.w@gmail.com>
Date: Fri, 20 Dec 2013 15:46:36 -0500
Subject: [PATCH 119/140] added same tp_free call to dealloc for Hashbits

---
 khmer/_khmermodule.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/khmer/_khmermodule.cc b/khmer/_khmermodule.cc
index 0b7b31ba58..2d2994dc30 100644
--- a/khmer/_khmermodule.cc
+++ b/khmer/_khmermodule.cc
@@ -4795,6 +4795,7 @@ static void khmer_hashbits_dealloc(PyObject* self)
   delete obj->hashbits;
   obj->hashbits = NULL;
   
+  self->ob_type->tp_free((PyObject*)obj);
   PyObject_Del((PyObject *) obj);
 }
 

From 3ec384d40730466412d58c71f880254f2d786c63 Mon Sep 17 00:00:00 2001
From: Camille Scott <camille.scott.w@gmail.com>
Date: Fri, 20 Dec 2013 16:37:40 -0500
Subject: [PATCH 120/140] exposed Hashbits object to python with constructor
 providing primes

---
 khmer/__init__.py     | 19 +++++++++++++++----
 khmer/_khmermodule.cc |  2 +-
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/khmer/__init__.py b/khmer/__init__.py
index 04f2038f0f..59a4eb2d76 100644
--- a/khmer/__init__.py
+++ b/khmer/__init__.py
@@ -16,6 +16,7 @@
 from _khmer import get_config
 from _khmer import ReadParser
 from _khmer import _LabelHash
+from _khmer import _Hashbits
 
 from ._version import get_versions
 __version__ = get_versions()['version']
@@ -148,14 +149,24 @@ def get_n_primes_above_x(n, x):
         i += 2
     return primes
 
+'''
+Expose the cpython objects with __new__ implementations.
+These constructors add the functionality provided by the existing
+factory methods to the constructors defined over in cpython land.
+Additional functionality can be added to these classes as appropriate.
+'''
+
 class LabelHash(_LabelHash):
     def __new__(cls, k, starting_size, n_tables):
-        print "** LabelHash __new__"
-        print "\t*** Getting primes..."
         primes = get_n_primes_above_x(n_tables, starting_size)
-        print "\t*** Invoking parent..."
         c = _LabelHash.__new__(cls, k, primes)
-        print "\t*** Done with parent, returning class object"
+        c.primes = primes
+        return c
+
+class Hashbits(_Hashbits):
+    def __new__(cls, k, starting_size, n_tables):
+        primes = get_n_primes_above_x(n_tables, starting_size)
+        c = _Hashbits.__new__(cls, k, primes)
         c.primes = primes
         return c
 
diff --git a/khmer/_khmermodule.cc b/khmer/_khmermodule.cc
index 2d2994dc30..6eb251c7f9 100644
--- a/khmer/_khmermodule.cc
+++ b/khmer/_khmermodule.cc
@@ -4982,7 +4982,7 @@ init_khmer(void)
     //	     Then, remove the corresponding factory functions.
 
     Py_INCREF(&khmer_KHashbitsType);
-    PyModule_AddObject(m, "Hashbits", (PyObject *)&khmer_KHashbitsType);
+    PyModule_AddObject(m, "_Hashbits", (PyObject *)&khmer_KHashbitsType);
 
     Py_INCREF(&khmer_KLabelHashType);
     PyModule_AddObject(m, "_LabelHash", (PyObject *)&khmer_KLabelHashType);

From a50061d835e33db469e3b11e049ba4c52b135b4d Mon Sep 17 00:00:00 2001
From: Camille Scott <camille.scott.w@gmail.com>
Date: Sun, 22 Dec 2013 00:44:55 -0500
Subject: [PATCH 121/140] fixed Hashbits dealloc by removing old PyObject_Del,
 added tests for Hashbits and for Hashbits methods available through Labelhash

---
 khmer/_khmermodule.cc   |  18 +-
 tests/test_Hashbits.py  | 554 ++++++++++++++++++++++++++++++++++++++++
 tests/test_labelhash.py | 535 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 1104 insertions(+), 3 deletions(-)
 create mode 100644 tests/test_Hashbits.py

diff --git a/khmer/_khmermodule.cc b/khmer/_khmermodule.cc
index 6eb251c7f9..b32124012e 100644
--- a/khmer/_khmermodule.cc
+++ b/khmer/_khmermodule.cc
@@ -3904,7 +3904,7 @@ khmer_hashbits_getattr(PyObject * obj, char * name)
   return Py_FindMethod(khmer_hashbits_methods, obj, name);
 }
 
-static void khmer_hashbits_dealloc(PyObject *);
+static void khmer_hashbits_dealloc(PyObject * obj);
 static PyObject* khmer_hashbits_new(PyTypeObject * type, PyObject * args, PyObject * kwds);
 static int khmer_hashbits_init(khmer_KHashbitsObject * self, PyObject * args, PyObject * kwds); 
 
@@ -3913,7 +3913,7 @@ static PyTypeObject khmer_KHashbitsType = {
     0,
     "Hashbits", sizeof(khmer_KHashbitsObject),
     0,
-    khmer_hashbits_dealloc,	/*tp_dealloc*/
+    (destructor)khmer_hashbits_dealloc,	/*tp_dealloc*/
     0,				/*tp_print*/
     khmer_hashbits_getattr,	/*tp_getattr*/
     0,				/*tp_setattr*/
@@ -4788,7 +4788,7 @@ static void khmer_counting_dealloc(PyObject* self)
 //
 // khmer_hashbits_dealloc -- clean up a hashbits object.
 //
-
+/*
 static void khmer_hashbits_dealloc(PyObject* self)
 {
   khmer_KHashbitsObject * obj = (khmer_KHashbitsObject *) self;
@@ -4798,6 +4798,18 @@ static void khmer_hashbits_dealloc(PyObject* self)
   self->ob_type->tp_free((PyObject*)obj);
   PyObject_Del((PyObject *) obj);
 }
+*/
+static void khmer_hashbits_dealloc(PyObject* obj)
+{
+  khmer_KHashbitsObject * self = (khmer_KHashbitsObject *) obj;
+
+  delete self->hashbits;
+  self->hashbits = NULL;
+  
+  self->ob_type->tp_free((PyObject*)obj);
+  //PyObject_Del((PyObject *) obj);
+}
+
 
 //
 // khmer_subset_dealloc -- clean up a hashbits object.
diff --git a/tests/test_Hashbits.py b/tests/test_Hashbits.py
new file mode 100644
index 0000000000..761eb58c77
--- /dev/null
+++ b/tests/test_Hashbits.py
@@ -0,0 +1,554 @@
+#
+# This file is part of khmer, http://github.com/ged-lab/khmer/, and is
+# Copyright (C) Michigan State University, 2009-2013. It is licensed under
+# the three-clause BSD license; see doc/LICENSE.txt. Contact: ctb@msu.edu
+#
+
+#
+# This is an exact copy of test_hashbits, with all invocations of
+# khmer.new_hashbits replaced by khmer.Hashbits constructor calls
+#
+
+import khmer
+from khmer import Hashbits
+
+from screed.fasta import fasta_iter
+import screed
+
+import khmer_tst_utils as utils
+from nose.plugins.attrib import attr
+
+def teardown():
+    utils.cleanup()
+
+def test__get_set_tag_density():
+    ht = khmer.Hashbits(32, 1, 1)
+
+    orig = ht._get_tag_density()
+    assert orig != 2
+    ht._set_tag_density(2)
+    assert ht._get_tag_density() == 2
+
+
+def test_n_occupied_1():
+    filename = utils.get_test_data('random-20-a.fa')
+
+    K = 20  # size of kmer
+    HT_SIZE = 100000  # size of hashtable
+    N_HT = 1  # number of hashtables
+
+    # test modified c++ n_occupied code
+    ht1 = khmer.Hashbits(K, HT_SIZE, N_HT)
+
+    for n, record in enumerate(fasta_iter(open(filename))):
+        ht1.consume(record['sequence'])
+
+    # this number calculated independently
+    assert ht1.n_occupied() == 3877
+
+
+def test_bloom_python_1():
+    # test python code to count unique kmers using bloom filter
+    filename = utils.get_test_data('random-20-a.fa')
+
+    K = 20  # size of kmer
+    HT_SIZE = 100000  # size of hashtable
+    N_HT = 3  # number of hashtables
+
+    ht2 = khmer.Hashbits(K, HT_SIZE, N_HT)
+
+    n_unique = 0
+    for n, record in enumerate(fasta_iter(open(filename))):
+        sequence = record['sequence']
+        seq_len = len(sequence)
+        for n in range(0, seq_len + 1 - K):
+            kmer = sequence[n:n + K]
+            if (not ht2.get(kmer)):
+                n_unique += 1
+            ht2.count(kmer)
+
+    assert n_unique == 3960
+    assert ht2.n_occupied() == 3882
+    assert ht2.n_unique_kmers() == 3960  # this number equals to n_unique
+
+
+def test_bloom_c_1():
+    # test c++ code to count unique kmers using bloom filter
+
+    filename = utils.get_test_data('random-20-a.fa')
+
+    K = 20  # size of kmer
+    HT_SIZE = 100000  # size of hashtable
+    N_HT = 3  # number of hashtables
+
+    ht3 = khmer.Hashbits(K, HT_SIZE, N_HT)
+
+    for n, record in enumerate(fasta_iter(open(filename))):
+        ht3.consume(record['sequence'])
+
+    assert ht3.n_occupied() == 3882
+    assert ht3.n_unique_kmers() == 3960
+
+
+def test_n_occupied_2():  # simple one
+    K = 4
+    HT_SIZE = 10  # use 11
+    N_HT = 1
+
+    ht1 = khmer.Hashbits(K, HT_SIZE, N_HT)
+    ht1.count('AAAA')  # 00 00 00 00 = 0
+    assert ht1.n_occupied() == 1
+
+    ht1.count('ACTG')  # 00 10 01 11 =
+    assert ht1.n_occupied() == 2
+
+    ht1.count('AACG')  # 00 00 10 11 = 11  # collision 1
+
+    assert ht1.n_occupied() == 2
+    ht1.count('AGAC')   # 00  11 00 10 # collision 2
+    assert ht1.n_occupied() == 2
+
+
+def test_bloom_c_2():  # simple one
+    K = 4
+    HT_SIZE = 10  # use 11
+    N_HT1 = 1    # hashtable size = 11
+    N_HT2 = 2    # hashtable size = 11,13
+
+    # use only 1 hashtable, no bloom filter
+    ht1 = khmer.Hashbits(K, HT_SIZE, N_HT1)
+    ht1.count('AAAA')  # 00 00 00 00 = 0
+    ht1.count('ACTG')  # 00 10 01 11 =
+    assert ht1.n_unique_kmers() == 2
+    ht1.count('AACG')  # 00 00 10 11 = 11  # collision  with 1st kmer
+    assert ht1.n_unique_kmers() == 2
+    ht1.count('AGAC')   # 00  11 00 10 # collision  with 2nd kmer
+    assert ht1.n_unique_kmers() == 2
+
+    # use two hashtables with 11,13
+    ht2 = khmer.Hashbits(K, HT_SIZE, N_HT2)
+    ht2.count('AAAA')  # 00 00 00 00 = 0
+
+    ht2.count('ACTG')  # 00 10 01 11 = 2*16 +4 +3 = 39
+    assert ht2.n_unique_kmers() == 2
+    ht2.count('AACG')  # 00 00 10 11 = 11  # collision with only 1st kmer
+    assert ht2.n_unique_kmers() == 3
+    ht2.count('AGAC')   # 00  11 00 10  3*16 +2 = 50
+    # collision with both 2nd and 3rd kmers
+
+    assert ht2.n_unique_kmers() == 3
+
+
+@attr('highmem')
+def test_filter_if_present():
+    ht = khmer.Hashbits(32, 1e6, 2)
+
+    maskfile = utils.get_test_data('filter-test-A.fa')
+    inputfile = utils.get_test_data('filter-test-B.fa')
+    outfile = utils.get_temp_filename('filter')
+
+    ht.consume_fasta(maskfile)
+    ht.filter_if_present(inputfile, outfile)
+
+    records = list(fasta_iter(open(outfile)))
+    assert len(records) == 1
+    assert records[0]['name'] == '3'
+
+
+@attr('highmem')
+def test_combine_pe():
+    inpfile = utils.get_test_data('combine_parts_1.fa')
+    ht = khmer.Hashbits(32, 1, 1)
+
+    ht.consume_partitioned_fasta(inpfile)
+    assert ht.count_partitions() == (2, 0)
+
+    s1 = "CATGCAGAAGTTCCGCAACCATACCGTTCAGT"
+    pid1 = ht.get_partition_id(s1)
+
+    s2 = "CAAATGTACATGCACTTAAAATCATCCAGCCG"
+    pid2 = ht.get_partition_id(s2)
+
+    assert pid1 == 2
+    assert pid2 == 80293
+
+    ht.join_partitions(pid1, pid2)
+
+    pid1 = ht.get_partition_id(s1)
+    pid2 = ht.get_partition_id(s2)
+
+    assert pid1 == pid2
+    assert ht.count_partitions() == (1, 0)
+
+
+@attr('highmem')
+def test_load_partitioned():
+    inpfile = utils.get_test_data('combine_parts_1.fa')
+    ht = khmer.Hashbits(32, 1, 1)
+
+    ht.consume_partitioned_fasta(inpfile)
+    assert ht.count_partitions() == (2, 0)
+
+    s1 = "CATGCAGAAGTTCCGCAACCATACCGTTCAGT"
+    assert ht.get(s1)
+
+    s2 = "CAAATGTACATGCACTTAAAATCATCCAGCCG"
+    assert ht.get(s2)
+
+    s3 = "CATGCAGAAGTTCCGCAACCATACCGTTCAGTTCCTGGTGGCTA"[-32:]
+    assert ht.get(s3)
+
+
+@attr('highmem')
+def test_count_within_radius_simple():
+    inpfile = utils.get_test_data('all-A.fa')
+    ht = khmer.Hashbits(4, 1e6, 2)
+
+    print ht.consume_fasta(inpfile)
+    n = ht.count_kmers_within_radius('AAAA', 1)
+    assert n == 1
+
+    n = ht.count_kmers_within_radius('AAAA', 10)
+    assert n == 1
+
+
+@attr('highmem')
+def test_count_within_radius_big():
+    inpfile = utils.get_test_data('random-20-a.fa')
+    ht = khmer.Hashbits(20, 1e6, 4)
+
+    ht.consume_fasta(inpfile)
+    n = ht.count_kmers_within_radius('CGCAGGCTGGATTCTAGAGG', int(1e6))
+    assert n == 3960
+
+    ht = khmer.Hashbits(21, 1e6, 4)
+    ht.consume_fasta(inpfile)
+    n = ht.count_kmers_within_radius('CGCAGGCTGGATTCTAGAGGC', int(1e6))
+    assert n == 39
+
+
+@attr('highmem')
+def test_count_kmer_degree():
+    inpfile = utils.get_test_data('all-A.fa')
+    ht = khmer.Hashbits(4, 1e6, 2)
+    ht.consume_fasta(inpfile)
+
+    assert ht.kmer_degree('AAAA') == 2
+    assert ht.kmer_degree('AAAT') == 1
+    assert ht.kmer_degree('AATA') == 0
+    assert ht.kmer_degree('TAAA') == 1
+
+
+@attr('highmem')
+def test_find_radius_for_volume():
+    inpfile = utils.get_test_data('all-A.fa')
+    ht = khmer.Hashbits(4, 1e6, 2)
+    ht.consume_fasta(inpfile)
+
+    assert ht.find_radius_for_volume('AAAA', 0, 100) == 0
+    assert ht.find_radius_for_volume('AAAA', 1, 100) == 0
+    assert ht.find_radius_for_volume('AAAA', 2, 100) == 100
+
+
+def test_circumference():
+    ht = khmer.Hashbits(4, 1e6, 2)
+
+    ht.count('ATGC')
+    ht.count('GATG')
+    ht.count('ATGG')
+
+    x = ht.count_kmers_on_radius('GATG', 1, 200)
+    assert x == 2
+
+    ht.count('ATGA')
+    x = ht.count_kmers_on_radius('GATG', 1, 200)
+    assert x == 3, x
+
+    ht.count('TGAT')
+    x = ht.count_kmers_on_radius('GATG', 1, 200)
+    assert x == 4, x
+
+
+def test_save_load_tagset():
+    ht = khmer.Hashbits(32, 1, 1)
+
+    outfile = utils.get_temp_filename('tagset')
+
+    ht.add_tag('A' * 32)
+    ht.save_tagset(outfile)
+
+    ht.add_tag('G' * 32)
+
+    ht.load_tagset(outfile)              # implicitly => clear_tags=True
+    ht.save_tagset(outfile)
+
+    # if tags have been cleared, then the new tagfile will be smaller
+    # (22 bytes); else larger (30 bytes).
+
+    fp = open(outfile, 'rb')
+    data = fp.read()
+    fp.close()
+    assert len(data) == 22, len(data)
+
+
+def test_save_load_tagset_noclear():
+    ht = khmer.Hashbits(32, 1, 1)
+
+    outfile = utils.get_temp_filename('tagset')
+
+    ht.add_tag('A' * 32)
+    ht.save_tagset(outfile)
+
+    ht.add_tag('G' * 32)
+
+    ht.load_tagset(outfile, False)       # set clear_tags => False; zero tags
+    ht.save_tagset(outfile)
+
+    # if tags have been cleared, then the new tagfile will be small
+    # (22 bytes); else large (30 bytes).
+
+    fp = open(outfile, 'rb')
+    data = fp.read()
+    fp.close()
+    assert len(data) == 30, len(data)
+
+
+@attr('highmem')
+def test_stop_traverse():
+    filename = utils.get_test_data('random-20-a.fa')
+
+    K = 20  # size of kmer
+    HT_SIZE = 100000  # size of hashtable
+    N_HT = 3  # number of hashtables
+
+    ht = khmer.Hashbits(K, HT_SIZE, N_HT)
+
+    # without tagging/joining across consume, this breaks into two partition;
+    # with, it is one partition.
+    ht.add_stop_tag('TTGCATACGTTGAGCCAGCG')
+
+    ht.consume_fasta_and_tag(filename)   # DO NOT join reads across stoptags
+    subset = ht.do_subset_partition(0, 0, True)
+    ht.merge_subset(subset)
+
+    n, _ = ht.count_partitions()
+    assert n == 2, n
+
+
+@attr('highmem')
+def test_tag_across_stoptraverse():
+    filename = utils.get_test_data('random-20-a.fa')
+
+    K = 20  # size of kmer
+    HT_SIZE = 100000  # size of hashtable
+    N_HT = 3  # number of hashtables
+
+    ht = khmer.Hashbits(K, HT_SIZE, N_HT)
+
+    # without tagging/joining across consume, this breaks into two partition;
+    # with, it is one partition.
+    ht.add_stop_tag('CCGAATATATAACAGCGACG')
+
+    ht.consume_fasta_and_tag_with_stoptags(filename)  # DO join reads across
+
+    subset = ht.do_subset_partition(0, 0)
+    n, _ = ht.count_partitions()
+    assert n == 99                       # reads only connected by traversal...
+
+    n, _ = ht.subset_count_partitions(subset)
+    assert n == 2                        # but need main to cross stoptags.
+
+    ht.merge_subset(subset)
+
+    n, _ = ht.count_partitions()         # ta-da!
+    assert n == 1, n
+
+
+@attr('highmem')
+def test_notag_across_stoptraverse():
+    filename = utils.get_test_data('random-20-a.fa')
+
+    K = 20  # size of kmer
+    HT_SIZE = 100000  # size of hashtable
+    N_HT = 3  # number of hashtables
+
+    ht = khmer.Hashbits(K, HT_SIZE, N_HT)
+
+    # connecting k-mer at the beginning/end of a read: breaks up into two.
+    ht.add_stop_tag('TTGCATACGTTGAGCCAGCG')
+
+    ht.consume_fasta_and_tag_with_stoptags(filename)
+
+    subset = ht.do_subset_partition(0, 0)
+    ht.merge_subset(subset)
+
+    n, _ = ht.count_partitions()
+    assert n == 2, n
+
+
+def test_find_stoptags():
+    ht = khmer.Hashbits(5, 1, 1)
+    ht.add_stop_tag("AAAAA")
+
+    assert ht.identify_stoptags_by_position("AAAAA") == [0]
+    assert ht.identify_stoptags_by_position("AAAAAA") == [0, 1]
+    assert ht.identify_stoptags_by_position("TTTTT") == [0]
+    assert ht.identify_stoptags_by_position("TTTTTT") == [0, 1]
+
+
+def test_find_stoptags2():
+    ht = khmer.Hashbits(4, 1, 1)
+    ht.add_stop_tag("ATGC")
+
+    x = ht.identify_stoptags_by_position("ATGCATGCGCAT")
+    assert x == [0, 2, 4, 8], x
+
+
+def test_get_ksize():
+    kh = khmer.Hashbits(22, 1, 1)
+    assert kh.ksize() == 22
+
+
+def test_get_hashsizes():
+    kh = khmer.Hashbits(22, 100, 4)
+    assert kh.hashsizes() == [101, 103, 107, 109], kh.hashsizes()
+
+
+def test_extract_unique_paths_0():
+    kh = khmer.Hashbits(10, 1e5, 4)
+
+    x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
+    assert x == ['ATGGAGAGACACAGATAGACAGGAGTGGCGATG']
+
+    kh.consume('ATGGAGAGACACAGATAGACAGGAGTGGCGATG')
+    x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
+    assert not x
+
+
+def test_extract_unique_paths_1():
+    kh = khmer.Hashbits(10, 1e5, 4)
+
+    kh.consume('AGTGGCGATG')
+    x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
+    print x
+    assert x == ['ATGGAGAGACACAGATAGACAGGAGTGGCGAT']  # all but the last k-mer
+
+
+def test_extract_unique_paths_2():
+    kh = khmer.Hashbits(10, 1e5, 4)
+
+    kh.consume('ATGGAGAGAC')
+    x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
+    print x
+    assert x == ['TGGAGAGACACAGATAGACAGGAGTGGCGATG']  # all but the 1st k-mer
+
+
+def test_extract_unique_paths_3():
+    kh = khmer.Hashbits(10, 1e5, 4)
+
+    kh.consume('ATGGAGAGAC')
+    kh.consume('AGTGGCGATG')
+    x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
+    print x
+    # all but the 1st/last k-mer
+    assert x == ['TGGAGAGACACAGATAGACAGGAGTGGCGAT']
+
+
+def test_extract_unique_paths_4():
+    kh = khmer.Hashbits(10, 1e5, 4)
+
+    kh.consume('ATGGAGAGAC')
+    kh.consume('AGTGGCGATG')
+
+    kh.consume('ATAGACAGGA')
+
+    x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
+    print x
+    assert x == ['TGGAGAGACACAGATAGACAGG', 'TAGACAGGAGTGGCGAT']
+
+
+@attr('highmem')
+def test_find_unpart():
+    filename = utils.get_test_data('random-20-a.odd.fa')
+    filename2 = utils.get_test_data('random-20-a.even.fa')
+
+    K = 20  # size of kmer
+    HT_SIZE = 100000  # size of hashtable
+    N_HT = 3  # number of hashtables
+
+    ht = khmer.Hashbits(K, HT_SIZE, N_HT)
+    ht.consume_fasta_and_tag(filename)
+
+    subset = ht.do_subset_partition(0, 0)
+    ht.merge_subset(subset)
+
+    n, _ = ht.count_partitions()
+    assert n == 49
+
+    ht.find_unpart(filename2, True, False)
+    n, _ = ht.count_partitions()
+    assert n == 1, n                     # all sequences connect
+
+
+@attr('highmem')
+def test_find_unpart_notraverse():
+    filename = utils.get_test_data('random-20-a.odd.fa')
+    filename2 = utils.get_test_data('random-20-a.even.fa')
+
+    K = 20  # size of kmer
+    HT_SIZE = 100000  # size of hashtable
+    N_HT = 3  # number of hashtables
+
+    ht = khmer.Hashbits(K, HT_SIZE, N_HT)
+    ht.consume_fasta_and_tag(filename)
+
+    subset = ht.do_subset_partition(0, 0)
+    ht.merge_subset(subset)
+
+    n, _ = ht.count_partitions()
+    assert n == 49
+
+    ht.find_unpart(filename2, False, False)     # <-- don't traverse
+    n, _ = ht.count_partitions()
+    assert n == 99, n                    # all sequences disconnected
+
+
+@attr('highmem')
+def test_find_unpart_fail():
+    filename = utils.get_test_data('random-20-a.odd.fa')
+    filename2 = utils.get_test_data('random-20-a.odd.fa')  # <- switch to odd
+
+    K = 20  # size of kmer
+    HT_SIZE = 100000  # size of hashtable
+    N_HT = 3  # number of hashtables
+
+    ht = khmer.Hashbits(K, HT_SIZE, N_HT)
+    ht.consume_fasta_and_tag(filename)
+
+    subset = ht.do_subset_partition(0, 0)
+    ht.merge_subset(subset)
+
+    n, _ = ht.count_partitions()
+    assert n == 49
+
+    ht.find_unpart(filename2, True, False)
+    n, _ = ht.count_partitions()
+    assert n == 49, n                    # only 49 sequences worth of tags
+
+
+def test_simple_median():
+    hi = khmer.Hashbits(6, 1e6, 2)
+
+    (median, average, stddev) = hi.get_median_count("AAAAAA")
+    print median, average, stddev
+    assert median == 0
+    assert average == 0.0
+    assert stddev == 0.0
+
+    hi.consume("AAAAAA")
+    (median, average, stddev) = hi.get_median_count("AAAAAA")
+    print median, average, stddev
+    assert median == 1
+    assert average == 1.0
+    assert stddev == 0.0
+
diff --git a/tests/test_labelhash.py b/tests/test_labelhash.py
index f5a497edd9..4ab73a8319 100644
--- a/tests/test_labelhash.py
+++ b/tests/test_labelhash.py
@@ -149,3 +149,538 @@ def test_label_tag_correctness():
     assert len(labels) == 1
     assert 3L in labels
 
+#
+# Begin Hashbits tests
+#
+
+def test__get_set_tag_density():
+    ht = khmer.LabelHash(32, 1, 1)
+
+    orig = ht._get_tag_density()
+    assert orig != 2
+    ht._set_tag_density(2)
+    assert ht._get_tag_density() == 2
+
+
+def test_n_occupied_1():
+    filename = utils.get_test_data('random-20-a.fa')
+
+    K = 20  # size of kmer
+    HT_SIZE = 100000  # size of hashtable
+    N_HT = 1  # number of hashtables
+
+    # test modified c++ n_occupied code
+    ht1 = khmer.LabelHash(K, HT_SIZE, N_HT)
+
+    for n, record in enumerate(fasta_iter(open(filename))):
+        ht1.consume(record['sequence'])
+
+    # this number calculated independently
+    assert ht1.n_occupied() == 3877
+
+
+def test_bloom_python_1():
+    # test python code to count unique kmers using bloom filter
+    filename = utils.get_test_data('random-20-a.fa')
+
+    K = 20  # size of kmer
+    HT_SIZE = 100000  # size of hashtable
+    N_HT = 3  # number of hashtables
+
+    ht2 = khmer.LabelHash(K, HT_SIZE, N_HT)
+
+    n_unique = 0
+    for n, record in enumerate(fasta_iter(open(filename))):
+        sequence = record['sequence']
+        seq_len = len(sequence)
+        for n in range(0, seq_len + 1 - K):
+            kmer = sequence[n:n + K]
+            if (not ht2.get(kmer)):
+                n_unique += 1
+            ht2.count(kmer)
+
+    assert n_unique == 3960
+    assert ht2.n_occupied() == 3882
+    assert ht2.n_unique_kmers() == 3960  # this number equals to n_unique
+
+
+def test_bloom_c_1():
+    # test c++ code to count unique kmers using bloom filter
+
+    filename = utils.get_test_data('random-20-a.fa')
+
+    K = 20  # size of kmer
+    HT_SIZE = 100000  # size of hashtable
+    N_HT = 3  # number of hashtables
+
+    ht3 = khmer.LabelHash(K, HT_SIZE, N_HT)
+
+    for n, record in enumerate(fasta_iter(open(filename))):
+        ht3.consume(record['sequence'])
+
+    assert ht3.n_occupied() == 3882
+    assert ht3.n_unique_kmers() == 3960
+
+
+def test_n_occupied_2():  # simple one
+    K = 4
+    HT_SIZE = 10  # use 11
+    N_HT = 1
+
+    ht1 = khmer.LabelHash(K, HT_SIZE, N_HT)
+    ht1.count('AAAA')  # 00 00 00 00 = 0
+    assert ht1.n_occupied() == 1
+
+    ht1.count('ACTG')  # 00 10 01 11 =
+    assert ht1.n_occupied() == 2
+
+    ht1.count('AACG')  # 00 00 10 11 = 11  # collision 1
+
+    assert ht1.n_occupied() == 2
+    ht1.count('AGAC')   # 00  11 00 10 # collision 2
+    assert ht1.n_occupied() == 2
+
+
+def test_bloom_c_2():  # simple one
+    K = 4
+    HT_SIZE = 10  # use 11
+    N_HT1 = 1    # hashtable size = 11
+    N_HT2 = 2    # hashtable size = 11,13
+
+    # use only 1 hashtable, no bloom filter
+    ht1 = khmer.LabelHash(K, HT_SIZE, N_HT1)
+    ht1.count('AAAA')  # 00 00 00 00 = 0
+    ht1.count('ACTG')  # 00 10 01 11 =
+    assert ht1.n_unique_kmers() == 2
+    ht1.count('AACG')  # 00 00 10 11 = 11  # collision  with 1st kmer
+    assert ht1.n_unique_kmers() == 2
+    ht1.count('AGAC')   # 00  11 00 10 # collision  with 2nd kmer
+    assert ht1.n_unique_kmers() == 2
+
+    # use two hashtables with 11,13
+    ht2 = khmer.LabelHash(K, HT_SIZE, N_HT2)
+    ht2.count('AAAA')  # 00 00 00 00 = 0
+
+    ht2.count('ACTG')  # 00 10 01 11 = 2*16 +4 +3 = 39
+    assert ht2.n_unique_kmers() == 2
+    ht2.count('AACG')  # 00 00 10 11 = 11  # collision with only 1st kmer
+    assert ht2.n_unique_kmers() == 3
+    ht2.count('AGAC')   # 00  11 00 10  3*16 +2 = 50
+    # collision with both 2nd and 3rd kmers
+
+    assert ht2.n_unique_kmers() == 3
+
+
+@attr('highmem')
+def test_filter_if_present():
+    ht = khmer.LabelHash(32, 1e6, 2)
+
+    maskfile = utils.get_test_data('filter-test-A.fa')
+    inputfile = utils.get_test_data('filter-test-B.fa')
+    outfile = utils.get_temp_filename('filter')
+
+    ht.consume_fasta(maskfile)
+    ht.filter_if_present(inputfile, outfile)
+
+    records = list(fasta_iter(open(outfile)))
+    assert len(records) == 1
+    assert records[0]['name'] == '3'
+
+
+@attr('highmem')
+def test_combine_pe():
+    inpfile = utils.get_test_data('combine_parts_1.fa')
+    ht = khmer.LabelHash(32, 1, 1)
+
+    ht.consume_partitioned_fasta(inpfile)
+    assert ht.count_partitions() == (2, 0)
+
+    s1 = "CATGCAGAAGTTCCGCAACCATACCGTTCAGT"
+    pid1 = ht.get_partition_id(s1)
+
+    s2 = "CAAATGTACATGCACTTAAAATCATCCAGCCG"
+    pid2 = ht.get_partition_id(s2)
+
+    assert pid1 == 2
+    assert pid2 == 80293
+
+    ht.join_partitions(pid1, pid2)
+
+    pid1 = ht.get_partition_id(s1)
+    pid2 = ht.get_partition_id(s2)
+
+    assert pid1 == pid2
+    assert ht.count_partitions() == (1, 0)
+
+
+@attr('highmem')
+def test_load_partitioned():
+    inpfile = utils.get_test_data('combine_parts_1.fa')
+    ht = khmer.LabelHash(32, 1, 1)
+
+    ht.consume_partitioned_fasta(inpfile)
+    assert ht.count_partitions() == (2, 0)
+
+    s1 = "CATGCAGAAGTTCCGCAACCATACCGTTCAGT"
+    assert ht.get(s1)
+
+    s2 = "CAAATGTACATGCACTTAAAATCATCCAGCCG"
+    assert ht.get(s2)
+
+    s3 = "CATGCAGAAGTTCCGCAACCATACCGTTCAGTTCCTGGTGGCTA"[-32:]
+    assert ht.get(s3)
+
+
+@attr('highmem')
+def test_count_within_radius_simple():
+    inpfile = utils.get_test_data('all-A.fa')
+    ht = khmer.LabelHash(4, 1e6, 2)
+
+    print ht.consume_fasta(inpfile)
+    n = ht.count_kmers_within_radius('AAAA', 1)
+    assert n == 1
+
+    n = ht.count_kmers_within_radius('AAAA', 10)
+    assert n == 1
+
+
+@attr('highmem')
+def test_count_within_radius_big():
+    inpfile = utils.get_test_data('random-20-a.fa')
+    ht = khmer.LabelHash(20, 1e6, 4)
+
+    ht.consume_fasta(inpfile)
+    n = ht.count_kmers_within_radius('CGCAGGCTGGATTCTAGAGG', int(1e6))
+    assert n == 3960
+
+    ht = khmer.LabelHash(21, 1e6, 4)
+    ht.consume_fasta(inpfile)
+    n = ht.count_kmers_within_radius('CGCAGGCTGGATTCTAGAGGC', int(1e6))
+    assert n == 39
+
+
+@attr('highmem')
+def test_count_kmer_degree():
+    inpfile = utils.get_test_data('all-A.fa')
+    ht = khmer.LabelHash(4, 1e6, 2)
+    ht.consume_fasta(inpfile)
+
+    assert ht.kmer_degree('AAAA') == 2
+    assert ht.kmer_degree('AAAT') == 1
+    assert ht.kmer_degree('AATA') == 0
+    assert ht.kmer_degree('TAAA') == 1
+
+
+@attr('highmem')
+def test_find_radius_for_volume():
+    inpfile = utils.get_test_data('all-A.fa')
+    ht = khmer.LabelHash(4, 1e6, 2)
+    ht.consume_fasta(inpfile)
+
+    assert ht.find_radius_for_volume('AAAA', 0, 100) == 0
+    assert ht.find_radius_for_volume('AAAA', 1, 100) == 0
+    assert ht.find_radius_for_volume('AAAA', 2, 100) == 100
+
+
+def test_circumference():
+    ht = khmer.LabelHash(4, 1e6, 2)
+
+    ht.count('ATGC')
+    ht.count('GATG')
+    ht.count('ATGG')
+
+    x = ht.count_kmers_on_radius('GATG', 1, 200)
+    assert x == 2
+
+    ht.count('ATGA')
+    x = ht.count_kmers_on_radius('GATG', 1, 200)
+    assert x == 3, x
+
+    ht.count('TGAT')
+    x = ht.count_kmers_on_radius('GATG', 1, 200)
+    assert x == 4, x
+
+
+def test_save_load_tagset():
+    ht = khmer.LabelHash(32, 1, 1)
+
+    outfile = utils.get_temp_filename('tagset')
+
+    ht.add_tag('A' * 32)
+    ht.save_tagset(outfile)
+
+    ht.add_tag('G' * 32)
+
+    ht.load_tagset(outfile)              # implicitly => clear_tags=True
+    ht.save_tagset(outfile)
+
+    # if tags have been cleared, then the new tagfile will be smaller
+    # (22 bytes); else larger (30 bytes).
+
+    fp = open(outfile, 'rb')
+    data = fp.read()
+    fp.close()
+    assert len(data) == 22, len(data)
+
+
+def test_save_load_tagset_noclear():
+    ht = khmer.LabelHash(32, 1, 1)
+
+    outfile = utils.get_temp_filename('tagset')
+
+    ht.add_tag('A' * 32)
+    ht.save_tagset(outfile)
+
+    ht.add_tag('G' * 32)
+
+    ht.load_tagset(outfile, False)       # set clear_tags => False; zero tags
+    ht.save_tagset(outfile)
+
+    # if tags have been cleared, then the new tagfile will be small
+    # (22 bytes); else large (30 bytes).
+
+    fp = open(outfile, 'rb')
+    data = fp.read()
+    fp.close()
+    assert len(data) == 30, len(data)
+
+
+@attr('highmem')
+def test_stop_traverse():
+    filename = utils.get_test_data('random-20-a.fa')
+
+    K = 20  # size of kmer
+    HT_SIZE = 100000  # size of hashtable
+    N_HT = 3  # number of hashtables
+
+    ht = khmer.LabelHash(K, HT_SIZE, N_HT)
+
+    # without tagging/joining across consume, this breaks into two partition;
+    # with, it is one partition.
+    ht.add_stop_tag('TTGCATACGTTGAGCCAGCG')
+
+    ht.consume_fasta_and_tag(filename)   # DO NOT join reads across stoptags
+    subset = ht.do_subset_partition(0, 0, True)
+    ht.merge_subset(subset)
+
+    n, _ = ht.count_partitions()
+    assert n == 2, n
+
+
+@attr('highmem')
+def test_tag_across_stoptraverse():
+    filename = utils.get_test_data('random-20-a.fa')
+
+    K = 20  # size of kmer
+    HT_SIZE = 100000  # size of hashtable
+    N_HT = 3  # number of hashtables
+
+    ht = khmer.LabelHash(K, HT_SIZE, N_HT)
+
+    # without tagging/joining across consume, this breaks into two partition;
+    # with, it is one partition.
+    ht.add_stop_tag('CCGAATATATAACAGCGACG')
+
+    ht.consume_fasta_and_tag_with_stoptags(filename)  # DO join reads across
+
+    subset = ht.do_subset_partition(0, 0)
+    n, _ = ht.count_partitions()
+    assert n == 99                       # reads only connected by traversal...
+
+    n, _ = ht.subset_count_partitions(subset)
+    assert n == 2                        # but need main to cross stoptags.
+
+    ht.merge_subset(subset)
+
+    n, _ = ht.count_partitions()         # ta-da!
+    assert n == 1, n
+
+
+@attr('highmem')
+def test_notag_across_stoptraverse():
+    filename = utils.get_test_data('random-20-a.fa')
+
+    K = 20  # size of kmer
+    HT_SIZE = 100000  # size of hashtable
+    N_HT = 3  # number of hashtables
+
+    ht = khmer.LabelHash(K, HT_SIZE, N_HT)
+
+    # connecting k-mer at the beginning/end of a read: breaks up into two.
+    ht.add_stop_tag('TTGCATACGTTGAGCCAGCG')
+
+    ht.consume_fasta_and_tag_with_stoptags(filename)
+
+    subset = ht.do_subset_partition(0, 0)
+    ht.merge_subset(subset)
+
+    n, _ = ht.count_partitions()
+    assert n == 2, n
+
+
+def test_find_stoptags():
+    ht = khmer.LabelHash(5, 1, 1)
+    ht.add_stop_tag("AAAAA")
+
+    assert ht.identify_stoptags_by_position("AAAAA") == [0]
+    assert ht.identify_stoptags_by_position("AAAAAA") == [0, 1]
+    assert ht.identify_stoptags_by_position("TTTTT") == [0]
+    assert ht.identify_stoptags_by_position("TTTTTT") == [0, 1]
+
+
+def test_find_stoptags2():
+    ht = khmer.LabelHash(4, 1, 1)
+    ht.add_stop_tag("ATGC")
+
+    x = ht.identify_stoptags_by_position("ATGCATGCGCAT")
+    assert x == [0, 2, 4, 8], x
+
+
+def test_get_ksize():
+    kh = khmer.LabelHash(22, 1, 1)
+    assert kh.ksize() == 22
+
+
+def test_get_hashsizes():
+    kh = khmer.LabelHash(22, 100, 4)
+    assert kh.hashsizes() == [101, 103, 107, 109], kh.hashsizes()
+
+
+def test_extract_unique_paths_0():
+    kh = khmer.LabelHash(10, 1e5, 4)
+
+    x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
+    assert x == ['ATGGAGAGACACAGATAGACAGGAGTGGCGATG']
+
+    kh.consume('ATGGAGAGACACAGATAGACAGGAGTGGCGATG')
+    x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
+    assert not x
+
+
+def test_extract_unique_paths_1():
+    kh = khmer.LabelHash(10, 1e5, 4)
+
+    kh.consume('AGTGGCGATG')
+    x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
+    print x
+    assert x == ['ATGGAGAGACACAGATAGACAGGAGTGGCGAT']  # all but the last k-mer
+
+
+def test_extract_unique_paths_2():
+    kh = khmer.LabelHash(10, 1e5, 4)
+
+    kh.consume('ATGGAGAGAC')
+    x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
+    print x
+    assert x == ['TGGAGAGACACAGATAGACAGGAGTGGCGATG']  # all but the 1st k-mer
+
+
+def test_extract_unique_paths_3():
+    kh = khmer.LabelHash(10, 1e5, 4)
+
+    kh.consume('ATGGAGAGAC')
+    kh.consume('AGTGGCGATG')
+    x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
+    print x
+    # all but the 1st/last k-mer
+    assert x == ['TGGAGAGACACAGATAGACAGGAGTGGCGAT']
+
+
+def test_extract_unique_paths_4():
+    kh = khmer.LabelHash(10, 1e5, 4)
+
+    kh.consume('ATGGAGAGAC')
+    kh.consume('AGTGGCGATG')
+
+    kh.consume('ATAGACAGGA')
+
+    x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
+    print x
+    assert x == ['TGGAGAGACACAGATAGACAGG', 'TAGACAGGAGTGGCGAT']
+
+
+@attr('highmem')
+def test_find_unpart():
+    filename = utils.get_test_data('random-20-a.odd.fa')
+    filename2 = utils.get_test_data('random-20-a.even.fa')
+
+    K = 20  # size of kmer
+    HT_SIZE = 100000  # size of hashtable
+    N_HT = 3  # number of hashtables
+
+    ht = khmer.LabelHash(K, HT_SIZE, N_HT)
+    ht.consume_fasta_and_tag(filename)
+
+    subset = ht.do_subset_partition(0, 0)
+    ht.merge_subset(subset)
+
+    n, _ = ht.count_partitions()
+    assert n == 49
+
+    ht.find_unpart(filename2, True, False)
+    n, _ = ht.count_partitions()
+    assert n == 1, n                     # all sequences connect
+
+
+@attr('highmem')
+def test_find_unpart_notraverse():
+    filename = utils.get_test_data('random-20-a.odd.fa')
+    filename2 = utils.get_test_data('random-20-a.even.fa')
+
+    K = 20  # size of kmer
+    HT_SIZE = 100000  # size of hashtable
+    N_HT = 3  # number of hashtables
+
+    ht = khmer.LabelHash(K, HT_SIZE, N_HT)
+    ht.consume_fasta_and_tag(filename)
+
+    subset = ht.do_subset_partition(0, 0)
+    ht.merge_subset(subset)
+
+    n, _ = ht.count_partitions()
+    assert n == 49
+
+    ht.find_unpart(filename2, False, False)     # <-- don't traverse
+    n, _ = ht.count_partitions()
+    assert n == 99, n                    # all sequences disconnected
+
+
+@attr('highmem')
+def test_find_unpart_fail():
+    filename = utils.get_test_data('random-20-a.odd.fa')
+    filename2 = utils.get_test_data('random-20-a.odd.fa')  # <- switch to odd
+
+    K = 20  # size of kmer
+    HT_SIZE = 100000  # size of hashtable
+    N_HT = 3  # number of hashtables
+
+    ht = khmer.LabelHash(K, HT_SIZE, N_HT)
+    ht.consume_fasta_and_tag(filename)
+
+    subset = ht.do_subset_partition(0, 0)
+    ht.merge_subset(subset)
+
+    n, _ = ht.count_partitions()
+    assert n == 49
+
+    ht.find_unpart(filename2, True, False)
+    n, _ = ht.count_partitions()
+    assert n == 49, n                    # only 49 sequences worth of tags
+
+
+def test_simple_median():
+    hi = khmer.LabelHash(6, 1e6, 2)
+
+    (median, average, stddev) = hi.get_median_count("AAAAAA")
+    print median, average, stddev
+    assert median == 0
+    assert average == 0.0
+    assert stddev == 0.0
+
+    hi.consume("AAAAAA")
+    (median, average, stddev) = hi.get_median_count("AAAAAA")
+    print median, average, stddev
+    assert median == 1
+    assert average == 1.0
+    assert stddev == 0.0
+

From 10598e46231021b7691c410a628cf04c21dc15a6 Mon Sep 17 00:00:00 2001
From: Camille Scott <camille.scott.w@gmail.com>
Date: Fri, 10 Jan 2014 14:08:16 -0500
Subject: [PATCH 122/140] killed noisy init code

---
 khmer/_khmermodule.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/khmer/_khmermodule.cc b/khmer/_khmermodule.cc
index b32124012e..4cf587cc2d 100644
--- a/khmer/_khmermodule.cc
+++ b/khmer/_khmermodule.cc
@@ -4229,9 +4229,9 @@ static int khmer_labelhash_init(khmer_KLabelHashObject * self, PyObject *args, P
 {
     if (khmer_KHashbitsType.tp_init((PyObject *)self, args, kwds) < 0)
         return -1;
-    std::cout << "testing my pointer ref to hashbits: " << self->khashbits.hashbits->n_tags() << std::endl;
-    std::cout << "hashbits: " << self->khashbits.hashbits << std::endl;
-    std::cout << "labelhash: " << self->labelhash << std::endl;
+    //std::cout << "testing my pointer ref to hashbits: " << self->khashbits.hashbits->n_tags() << std::endl;
+    //std::cout << "hashbits: " << self->khashbits.hashbits << std::endl;
+    //std::cout << "labelhash: " << self->labelhash << std::endl;
     return 0;
 }
 

From acb63483350fca26dbb063e09161750178859ed4 Mon Sep 17 00:00:00 2001
From: "Michael R. Crusoe" <mcrusoe@msu.edu>
Date: Fri, 10 Jan 2014 14:21:27 -0500
Subject: [PATCH 123/140] autopep8

---
 khmer/__init__.py                            |  6 +-
 khmer/_version.py                            | 34 ++++----
 scripts/extract-partitions.py                |  5 +-
 scripts/sweep-reads-by-partition-buffered.py | 84 +++++++++++---------
 setup.cfg                                    |  4 +-
 setup.py                                     | 19 ++---
 tests/test_Hashbits.py                       |  3 +-
 tests/test_graph.py                          |  2 +-
 tests/test_hashbits.py                       |  3 +-
 tests/test_labelhash.py                      | 51 +++++++-----
 tests/test_scripts.py                        | 37 +++++----
 tests/test_subset_graph.py                   | 28 ++++---
 12 files changed, 163 insertions(+), 113 deletions(-)

diff --git a/khmer/__init__.py b/khmer/__init__.py
index 59a4eb2d76..ba1a9ad359 100644
--- a/khmer/__init__.py
+++ b/khmer/__init__.py
@@ -30,6 +30,7 @@ def new_hashbits(k, starting_size, n_tables=2):
 
     return _new_hashbits(k, primes)
 
+
 def new_counting_hash(k, starting_size, n_tables=2, n_threads=1):
     primes = get_n_primes_above_x(n_tables, starting_size)
 
@@ -156,17 +157,20 @@ def get_n_primes_above_x(n, x):
 Additional functionality can be added to these classes as appropriate.
 '''
 
+
 class LabelHash(_LabelHash):
+
     def __new__(cls, k, starting_size, n_tables):
         primes = get_n_primes_above_x(n_tables, starting_size)
         c = _LabelHash.__new__(cls, k, primes)
         c.primes = primes
         return c
 
+
 class Hashbits(_Hashbits):
+
     def __new__(cls, k, starting_size, n_tables):
         primes = get_n_primes_above_x(n_tables, starting_size)
         c = _Hashbits.__new__(cls, k, primes)
         c.primes = primes
         return c
-
diff --git a/khmer/_version.py b/khmer/_version.py
index 6e5155675f..cdd5a1359b 100644
--- a/khmer/_version.py
+++ b/khmer/_version.py
@@ -17,6 +17,7 @@
 import subprocess
 import sys
 
+
 def run_command(args, cwd=None, verbose=False, hide_stderr=False):
     try:
         # remember shell=False, so use git.cmd on windows, not just git
@@ -42,6 +43,7 @@ def run_command(args, cwd=None, verbose=False, hide_stderr=False):
 import re
 import os.path
 
+
 def get_expanded_variables(versionfile_source):
     # the code embedded in _version.py can just fetch the value of these
     # variables. When used from setup.py, we don't want to import
@@ -49,7 +51,7 @@ def get_expanded_variables(versionfile_source):
     # used from _version.py.
     variables = {}
     try:
-        f = open(versionfile_source,"r")
+        f = open(versionfile_source, "r")
         for line in f.readlines():
             if line.strip().startswith("git_refnames ="):
                 mo = re.search(r'=\s*"(.*)"', line)
@@ -64,12 +66,13 @@ def get_expanded_variables(versionfile_source):
         pass
     return variables
 
+
 def versions_from_expanded_variables(variables, tag_prefix, verbose=False):
     refnames = variables["refnames"].strip()
     if refnames.startswith("$Format"):
         if verbose:
             print("variables are unexpanded, not using")
-        return {} # unexpanded, so not in an unpacked git-archive tarball
+        return {}  # unexpanded, so not in an unpacked git-archive tarball
     refs = set([r.strip() for r in refnames.strip("()").split(",")])
     # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of
     # just "foo-1.0". If we see a "tag: " prefix, prefer those.
@@ -85,7 +88,7 @@ def versions_from_expanded_variables(variables, tag_prefix, verbose=False):
         # "stabilization", as well as "HEAD" and "master".
         tags = set([r for r in refs if re.search(r'\d', r)])
         if verbose:
-            print("discarding '%s', no digits" % ",".join(refs-tags))
+            print("discarding '%s', no digits" % ",".join(refs - tags))
     if verbose:
         print("likely tags: %s" % ",".join(sorted(tags)))
     for ref in sorted(tags):
@@ -94,13 +97,14 @@ def versions_from_expanded_variables(variables, tag_prefix, verbose=False):
             r = ref[len(tag_prefix):]
             if verbose:
                 print("picking %s" % r)
-            return { "version": r,
-                     "full": variables["full"].strip() }
+            return {"version": r,
+                    "full": variables["full"].strip()}
     # no suitable tags, so we use the full revision id
     if verbose:
         print("no suitable tags, using full revision id")
-    return { "version": variables["full"].strip(),
-             "full": variables["full"].strip() }
+    return {"version": variables["full"].strip(),
+            "full": variables["full"].strip()}
+
 
 def versions_from_vcs(tag_prefix, versionfile_source, verbose=False):
     # this runs 'git' from the root of the source tree. That either means
@@ -117,7 +121,7 @@ def versions_from_vcs(tag_prefix, versionfile_source, verbose=False):
         here = os.path.abspath(__file__)
     except NameError:
         # some py2exe/bbfreeze/non-CPython implementations don't do __file__
-        return {} # not always correct
+        return {}  # not always correct
 
     GIT = "git"
     if sys.platform == "win32":
@@ -145,7 +149,8 @@ def versions_from_vcs(tag_prefix, versionfile_source, verbose=False):
         return {}
     if not stdout.startswith(tag_prefix):
         if verbose:
-            print("tag '%s' doesn't start with prefix '%s'" % (stdout, tag_prefix))
+            print("tag '%s' doesn't start with prefix '%s'" %
+                  (stdout, tag_prefix))
         return {}
     tag = stdout[len(tag_prefix):]
     stdout = run_command([GIT, "rev-parse", "HEAD"], cwd=root)
@@ -167,7 +172,7 @@ def versions_from_parentdir(parentdir_prefix, versionfile_source, verbose=False)
             here = os.path.abspath(__file__)
         except NameError:
             # py2exe/bbfreeze/non-CPython don't have __file__
-            return {} # without __file__, we have no hope
+            return {}  # without __file__, we have no hope
         # versionfile_source is the relative path from the top of the source
         # tree to _version.py. Invert this to find the root from __file__.
         root = here
@@ -184,8 +189,9 @@ def versions_from_parentdir(parentdir_prefix, versionfile_source, verbose=False)
     dirname = os.path.basename(root)
     if not dirname.startswith(parentdir_prefix):
         if verbose:
-            print("guessing rootdir is '%s', but '%s' doesn't start with prefix '%s'" %
-                  (root, dirname, parentdir_prefix))
+            print(
+                "guessing rootdir is '%s', but '%s' doesn't start with prefix '%s'" %
+                (root, dirname, parentdir_prefix))
         return None
     return {"version": dirname[len(parentdir_prefix):], "full": ""}
 
@@ -193,8 +199,9 @@ def versions_from_parentdir(parentdir_prefix, versionfile_source, verbose=False)
 parentdir_prefix = "."
 versionfile_source = "khmer/_version.py"
 
+
 def get_versions(default={"version": "unknown", "full": ""}, verbose=False):
-    variables = { "refnames": git_refnames, "full": git_full }
+    variables = {"refnames": git_refnames, "full": git_full}
     ver = versions_from_expanded_variables(variables, tag_prefix, verbose)
     if not ver:
         ver = versions_from_vcs(tag_prefix, versionfile_source, verbose)
@@ -204,4 +211,3 @@ def get_versions(default={"version": "unknown", "full": ""}, verbose=False):
     if not ver:
         ver = default
     return ver
-
diff --git a/scripts/extract-partitions.py b/scripts/extract-partitions.py
index dd2225a574..e787a383f3 100755
--- a/scripts/extract-partitions.py
+++ b/scripts/extract-partitions.py
@@ -31,6 +31,7 @@ def read_partition_file(filename):
         name, partition_id = record.name.rsplit('\t', 1)
         yield n, record, int(partition_id)
 
+
 def output_single(r):
     if hasattr(r, 'accuracy'):
         return "@%s\n%s\n+\n%s\n" % (r.name, r.sequence, r.accuracy)
@@ -104,12 +105,12 @@ def main():
             else:
                 assert not hasattr(r, 'accuracy'), \
                     "all input files must be FASTA if the first one is"
-                
+
             break
 
     if output_unassigned:
         unassigned_fp = open('%s.unassigned.%s' % (prefix, SUFFIX), 'w')
-        
+
     count = {}
     for filename in args.part_filenames:
         for n, r, pid in read_partition_file(filename):
diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py
index f153282319..75996be973 100755
--- a/scripts/sweep-reads-by-partition-buffered.py
+++ b/scripts/sweep-reads-by-partition-buffered.py
@@ -29,25 +29,27 @@
 from collections import namedtuple as nt
 
 
-DEFAULT_NUM_BUFFERS=50000
-DEFAULT_MAX_READS=1000000
-DEFAULT_BUFFER_SIZE=10
-DEFAULT_OUT_PREF='reads_'
-DEFAULT_RANGE=-1
-
-MIN_HSIZE=4e7
-MIN_KSIZE=21
-    
+DEFAULT_NUM_BUFFERS = 50000
+DEFAULT_MAX_READS = 1000000
+DEFAULT_BUFFER_SIZE = 10
+DEFAULT_OUT_PREF = 'reads_'
+DEFAULT_RANGE = -1
+
+MIN_HSIZE = 4e7
+MIN_KSIZE = 21
+
+
 def fmt_fasta(name, seq, labels=[]):
-        return '>{name}\t{labels}\n{seq}\n'.format(name=name, 
-            labels='\t'.join([str(l) for l in labels]), seq=seq)
+    return '>{name}\t{labels}\n{seq}\n'.format(name=name,
+                                               labels='\t'.join([str(l) for l in labels]), seq=seq)
+
 
 def write_seq(fp, name, seq, labels=[]):
     try:
         fp.write(fmt_fasta(name, seq, labels=labels))
     except IOError:
         print >>sys.stderr, 'Error writing {read}'.format(
-                read=fmt_fasta(name, seq, labels=labels))
+            read=fmt_fasta(name, seq, labels=labels))
         return 1
     else:
         return 0
@@ -58,8 +60,10 @@ def write_seq(fp, name, seq, labels=[]):
 # Hope that each file acrues, on average, BUFFER_SIZE / NUM_PARTS reads
 # ie, if we buffer 1000000 reads, and we have 100000 partitions or labels,
 # we should expect the mean buffer size to be 10 reads
+
+
 class ReadBuffer:
-    
+
     def __init__(self):
         self.buf = []
 
@@ -78,6 +82,7 @@ def is_full(self, full):
     def __len__(self):
         return len(self.buf)
 
+
 class ReadBufferManager:
 
     def __init__(self, max_buffers, max_reads, max_size, output_pref, outdir):
@@ -128,13 +133,13 @@ def queue(self, seq_str, buf_id):
             new_buf = ReadBuffer()
             new_buf.push(seq_str)
             self.buffers[buf_id] = new_buf
-            
+
         self.cur_reads += 1
         if self.cur_reads > self.max_reads:
             print >>sys.stderr, '** Reached max num reads...'
             self.flush_all()
         if len(self.buffers) > self.max_buffers:
-            #self.clean_buffers(2)
+            # self.clean_buffers(2)
             print >>sys.stderr, '** Reached max num buffers...'
             self.flush_all()
 
@@ -145,24 +150,25 @@ def flush_all(self):
             self.flush_buffer(buf_id)
         assert self.cur_reads == 0
 
+
 def main():
 
     parser = build_construct_args()
-    parser.add_argument('-i', '--input_fastp',dest='input_fastp')
-    parser.add_argument('-r', '--traversal_range', type=int, dest='traversal_range', \
-                        default=DEFAULT_RANGE)
-    parser.add_argument('-b', '--buffer_size', dest='max_reads', type=int, \
+    parser.add_argument('-i', '--input_fastp', dest='input_fastp')
+    parser.add_argument(
+        '-r', '--traversal_range', type=int, dest='traversal_range',
+        default=DEFAULT_RANGE)
+    parser.add_argument('-b', '--buffer_size', dest='max_reads', type=int,
                         default=DEFAULT_MAX_READS)
-    parser.add_argument('-l', '--buffer_length', dest='buffer_size', type=int, \
+    parser.add_argument('-l', '--buffer_length', dest='buffer_size', type=int,
                         default=DEFAULT_BUFFER_SIZE)
     parser.add_argument('-o', '--output_prefix', dest='output_prefix',
                         default=DEFAULT_OUT_PREF)
-    parser.add_argument('-m', '--max_buffers', dest='max_buffers', type=int, \
+    parser.add_argument('-m', '--max_buffers', dest='max_buffers', type=int,
                         default=DEFAULT_NUM_BUFFERS)
     parser.add_argument('input_files', nargs='+')
     args = parser.parse_args()
-    
-   
+
     K = args.ksize
     HT_SIZE = args.min_hashsize
     if HT_SIZE < MIN_HSIZE:
@@ -188,9 +194,9 @@ def main():
         print >>sys.stderr, ''
         print >>sys.stderr, \
             'Estimated memory usage is {prod:.2g} bytes \
-            (n_hashes x min_hashsize / 8)'.format(prod=args.n_hashes*HT_SIZE/8)
+            (n_hashes x min_hashsize / 8)'.format(prod=args.n_hashes * HT_SIZE / 8)
         print >>sys.stderr, '-' * 8
-    
+
     traversal_range = args.traversal_range
     input_fastp = args.input_fastp
     outdir = os.path.dirname(input_fastp)
@@ -199,18 +205,19 @@ def main():
     output_pref = args.output_prefix
     buf_size = args.buffer_size
     max_reads = args.max_reads
-    
+
     input_files = args.input_files
 
-    output_buffer = ReadBufferManager(max_buffers, max_reads, buf_size, output_pref, outdir)
+    output_buffer = ReadBufferManager(
+        max_buffers, max_reads, buf_size, output_pref, outdir)
 
-	# consume the partitioned fasta with which to label the graph
+        # consume the partitioned fasta with which to label the graph
     ht = khmer.LabelHash(K, HT_SIZE, N_HT)
     print >>sys.stderr, 'consuming fastp...'
     ht.consume_partitioned_fasta_and_tag_with_labels(input_fastp)
 
     label_number_dist = []
-    
+
     n_orphaned = 0
     n_labeled = 0
     n_mlabeled = 0
@@ -218,7 +225,7 @@ def main():
     total_t = time.clock()
     start_t = time.clock()
     for read_file in input_files:
-        print >>sys.stderr,'** sweeping {read_file} for labels...'.format(read_file=read_file)
+        print >>sys.stderr, '** sweeping {read_file} for labels...'.format(read_file=read_file)
         file_t = 0.0
         try:
             read_fp = screed.open(read_file)
@@ -261,23 +268,26 @@ def main():
 
     # gotta output anything left in the buffers at the end!
     print >>sys.stderr, '** End of run...'
-    output_buffer.flush_all() 
+    output_buffer.flush_all()
     total_t = time.clock() - total_t
 
     if output_buffer.num_write_errors > 0 or output_buffer.num_file_errors > 0:
         print >>sys.stderr, '! WARNING: Sweep finished with errors !'
-        print >>sys.stderr, '** {writee} reads not written'.format(writee=output_buffer.num_write_errors)
-        print >>sys.stderr, '** {filee} errors opening files'.format(filee=output_buffer.num_file_errors)
+        print >>sys.stderr, '** {writee} reads not written'.format(
+            writee=output_buffer.num_write_errors)
+        print >>sys.stderr, '** {filee} errors opening files'.format(
+            filee=output_buffer.num_file_errors)
 
-    print >>sys.stderr, 'swept {n_reads} for labels...'.format(n_reads=n_labeled+n_mlabeled+n_orphaned)
+    print >>sys.stderr, 'swept {n_reads} for labels...'.format(
+        n_reads=n_labeled + n_mlabeled + n_orphaned)
     print >>sys.stderr, '...with {nc} labeled and {no} orphaned'.format(
-                                    nc=n_labeled, no=n_orphaned)
+        nc=n_labeled, no=n_orphaned)
     print >>sys.stderr, '...and {nmc} multilabeled'.format(nmc=n_mlabeled)
-    
+
     print >>sys.stderr, '** outputting label number distribution...'
     with open('label_dist.txt', 'wb') as outfp:
         for nc in label_number_dist:
             outfp.write('{nc}\n'.format(nc=nc))
-    
+
 if __name__ == '__main__':
     main()
diff --git a/setup.cfg b/setup.cfg
index a7c3a5e01a..51f93526b2 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,8 +1,8 @@
 [nosetests]
 verbosity = 2
 stop = TRUE
-attr = !known_failing
-#attr = !known_failing,!highmem
+#attr = !known_failing
+attr = !known_failing,!highmem
 # where highmem > 0.5GiB memory
 
 [build_ext]
diff --git a/setup.py b/setup.py
index 78bff96df6..722be23337 100755
--- a/setup.py
+++ b/setup.py
@@ -71,8 +71,8 @@
     [
         "khmer_config", "thread_id_map", "trace_logger", "perf_metrics",
         "read_parsers", "ktable", "hashtable", "hashbits", "labelhash", "counting",
-        "subset", "aligner", "scoringmatrix", "node", "kmer",  
-   ]
+        "subset", "aligner", "scoringmatrix", "node", "kmer",
+    ]
 ))
 
 extension_mod_DICT = \
@@ -106,18 +106,18 @@
         ' Jason Pell, Jared Simpson, Camille Scott,'
         ' Qingpeng Zhang, and C. Titus Brown',
         "author_email": 'khmer-project@idyll.org',
-        #"maintainer": 'Michael R. Crusoe', # this overrides the author field
-        #"maintainer_email": 'mcrusoe@msu.edu', # so don't include it
-        #http://docs.python.org/2/distutils/setupscript.html
-        # #additiona-meta-data note #3
+        # "maintainer": 'Michael R. Crusoe', # this overrides the author field
+        # "maintainer_email": 'mcrusoe@msu.edu', # so don't include it
+        # http://docs.python.org/2/distutils/setupscript.html
+        # additiona-meta-data note #3
         "url": 'http://ged.msu.edu/',
         "packages": ['khmer'],
         "install_requires": ["screed >= 0.7.1", 'argparse >= 1.2.1', ],
         "setup_requires": ['nose >= 1.0', 'sphinx', ],
         "scripts": scripts,
         "ext_modules": [extension_mod, ],
-        #"platforms": '', # empty as is conveyed by the classifiers below
-        #"license": '', # empty as is conveyed by the classifier below
+        # "platforms": '', # empty as is conveyed by the classifiers below
+        # "license": '', # empty as is conveyed by the classifier below
         "include_package_data": True,
         "classifiers":  [
             "Development Status :: 4 - Beta",
@@ -132,7 +132,7 @@
             "Programming Language :: C++",
             "Programming Language :: Python :: 2.7",
             "Topic :: Scientific/Engineering :: Bio-Informatics",
-            ],
+        ],
     }
 
 # Only run lib setup when needed, not on every invocation
@@ -140,6 +140,7 @@
 
 
 class build_ext(_build_ext):
+
         """Specialized Python extension builder."""
 
         def run(self):
diff --git a/tests/test_Hashbits.py b/tests/test_Hashbits.py
index 761eb58c77..f0d09cf44e 100644
--- a/tests/test_Hashbits.py
+++ b/tests/test_Hashbits.py
@@ -18,9 +18,11 @@
 import khmer_tst_utils as utils
 from nose.plugins.attrib import attr
 
+
 def teardown():
     utils.cleanup()
 
+
 def test__get_set_tag_density():
     ht = khmer.Hashbits(32, 1, 1)
 
@@ -551,4 +553,3 @@ def test_simple_median():
     assert median == 1
     assert average == 1.0
     assert stddev == 0.0
-
diff --git a/tests/test_graph.py b/tests/test_graph.py
index 3891df740e..a0df526b56 100644
--- a/tests/test_graph.py
+++ b/tests/test_graph.py
@@ -241,7 +241,7 @@ def test_output_fq(self):
 
         print open(output_file).read()
 
-        x = set([ r.accuracy for r in screed.open(output_file) ])
+        x = set([r.accuracy for r in screed.open(output_file)])
         assert x, x
 
     @attr('highmem')
diff --git a/tests/test_hashbits.py b/tests/test_hashbits.py
index aca4dad430..92d5484a1e 100644
--- a/tests/test_hashbits.py
+++ b/tests/test_hashbits.py
@@ -11,9 +11,11 @@
 import khmer_tst_utils as utils
 from nose.plugins.attrib import attr
 
+
 def teardown():
     utils.cleanup()
 
+
 def test__get_set_tag_density():
     ht = khmer.new_hashbits(32, 1, 1)
 
@@ -544,4 +546,3 @@ def test_simple_median():
     assert median == 1
     assert average == 1.0
     assert stddev == 0.0
-
diff --git a/tests/test_labelhash.py b/tests/test_labelhash.py
index 4ab73a8319..51431ccd56 100644
--- a/tests/test_labelhash.py
+++ b/tests/test_labelhash.py
@@ -11,26 +11,29 @@
 import khmer_tst_utils as utils
 from nose.plugins.attrib import attr
 
+
 def teardown():
     utils.cleanup()
 
 #
-# @camillescott TODO: more tests! 
+# @camillescott TODO: more tests!
 #  * thread-safety
 
+
 def test_n_labels():
     lh = LabelHash(20, 1e7, 4)
     filename = utils.get_test_data('test-labels.fa')
     lh.consume_fasta_and_tag_with_labels(filename)
-    
+
     print lh.n_labels()
     assert lh.n_labels() == 4
 
+
 def test_get_label_dict():
     lb = LabelHash(20, 1e7, 4)
     filename = utils.get_test_data('test-labels.fa')
     lb.consume_fasta_and_tag_with_labels(filename)
-    
+
     labels = lb.get_label_dict()
     expected = [0L, 1L, 2L, 3L]
     for e_label in expected:
@@ -38,6 +41,7 @@ def test_get_label_dict():
     for a_label in labels:
         assert a_label in expected
 
+
 def test_get_tag_labels():
     lb = LabelHash(20, 1e7, 4)
     filename = utils.get_test_data('single-read.fq')
@@ -48,6 +52,7 @@ def test_get_tag_labels():
     assert len(labels) == 1
     assert labels.pop() == 0L
 
+
 def test_consume_fasta_and_tag_with_labels():
     lb = LabelHash(20, 1e7, 4)
     read_1 = 'ACGTAACCGGTTAAACCCGGGTTTAAAACCCCGGGGTTTT'
@@ -77,22 +82,24 @@ def test_consume_partitioned_fasta_and_tag_with_labels():
     lb = LabelHash(20, 1e7, 4)
     filename = utils.get_test_data('real-partition-small.fa')
 
-    total_reads, n_consumed = lb.consume_partitioned_fasta_and_tag_with_labels(filename)
+    total_reads, n_consumed = lb.consume_partitioned_fasta_and_tag_with_labels(
+        filename)
     labels = set()
     for record in screed.open(filename):
         seq = record.sequence
         labels.update(lb.sweep_label_neighborhood(seq, False, False))
-    #print lb.n_labels()
-    #print labels
+    # print lb.n_labels()
+    # print labels
     assert len(labels) == 1
     assert labels.pop() == 2L
-    assert lb.n_labels() == 1 
+    assert lb.n_labels() == 1
+
 
 def test_sweep_tag_neighborhood():
     lb = LabelHash(20, 1e7, 4)
     filename = utils.get_test_data('single-read.fq')
     lb.consume_fasta_and_tag(filename)
-    
+
     tags = lb.sweep_tag_neighborhood('CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT')
     assert len(tags) == 1
     assert tags.pop() == 173473779682L
@@ -102,7 +109,7 @@ def test_sweep_label_neighborhood():
     lb = LabelHash(20, 1e7, 4)
     filename = utils.get_test_data('single-read.fq')
     lb.consume_fasta_and_tag_with_labels(filename)
-    
+
     labels = lb.sweep_label_neighborhood('CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT')
     assert len(labels) == 1
     assert labels.pop() == 0L
@@ -114,37 +121,43 @@ def test_sweep_label_neighborhood():
   traversing from B should find labels from A, B, and C,
   and traversing from C should find labels from B and C
 '''
+
+
 def test_label_tag_correctness():
     lb = LabelHash(20, 1e7, 4)
     filename = utils.get_test_data('test-labels.fa')
     lb.consume_fasta_and_tag_with_labels(filename)
-    
+
     # read A
-    labels = lb.sweep_label_neighborhood('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAGCTAGGCTAGGTGTGCTCTGCCTAGAGCTAGGCTAGGTGT')
+    labels = lb.sweep_label_neighborhood(
+        'ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAGCTAGGCTAGGTGTGCTCTGCCTAGAGCTAGGCTAGGTGT')
     print lb.sweep_tag_neighborhood('TTCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAGCTAGGCTAGGTGTGCTCTGCCTAGAGCTAGGCTAGGTGT')
     print labels
-    print len('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG')-19 
+    print len('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG') - 19
     assert len(labels) == 2
     assert 0L in labels
     assert 1L in labels
-    
+
     # read B
-    labels = lb.sweep_label_neighborhood('GCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA')
+    labels = lb.sweep_label_neighborhood(
+        'GCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA')
     print labels
     assert len(labels) == 3
     assert 0L in labels
     assert 1L in labels
     assert 2L in labels
-    
+
     # read C
-    labels = lb.sweep_label_neighborhood('TGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGAGTTGGGGATAGATAGATAGATGAGTGTAGATCCAACAACACATACA')
+    labels = lb.sweep_label_neighborhood(
+        'TGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGAGTTGGGGATAGATAGATAGATGAGTGTAGATCCAACAACACATACA')
     print labels
     assert len(labels) == 2
     assert 1L in labels
     assert 2L in labels
-    
+
     # read D
-    labels = lb.sweep_label_neighborhood('TATATATATAGCTAGCTAGCTAACTAGCTAGCATCGATCGATCGATC')
+    labels = lb.sweep_label_neighborhood(
+        'TATATATATAGCTAGCTAGCTAACTAGCTAGCATCGATCGATCGATC')
     print labels
     assert len(labels) == 1
     assert 3L in labels
@@ -153,6 +166,7 @@ def test_label_tag_correctness():
 # Begin Hashbits tests
 #
 
+
 def test__get_set_tag_density():
     ht = khmer.LabelHash(32, 1, 1)
 
@@ -683,4 +697,3 @@ def test_simple_median():
     assert median == 1
     assert average == 1.0
     assert stddev == 0.0
-
diff --git a/tests/test_scripts.py b/tests/test_scripts.py
index 806e88d2ea..8f585888f6 100644
--- a/tests/test_scripts.py
+++ b/tests/test_scripts.py
@@ -3,7 +3,9 @@
 # Copyright (C) Michigan State University, 2009-2013. It is licensed under
 # the three-clause BSD license; see doc/LICENSE.txt. Contact: ctb@msu.edu
 #
-import sys, os, shutil
+import sys
+import os
+import shutil
 from cStringIO import StringIO
 import traceback
 
@@ -433,6 +435,7 @@ def test_normalize_by_median_force():
     assert '*** Skipping' in err
     assert '** IOErrors' in err
 
+
 def test_normalize_by_median_no_bigcount():
     infile = utils.get_temp_filename('test.fa')
     hashfile = utils.get_temp_filename('test-out.kh')
@@ -447,12 +450,13 @@ def test_normalize_by_median_no_bigcount():
     (status, out, err) = runscript(script, args)
     assert status == 0, (out, err)
     print (out, err)
-    
+
     assert os.path.exists(hashfile), hashfile
     kh = khmer.load_counting_hash(hashfile)
-    
+
     assert kh.get('GGTTGACG') == 255
 
+
 def test_normalize_by_median_dumpfrequency():
     CUTOFF = '1'
 
@@ -864,9 +868,11 @@ def test_extract_partitions():
     parts = set(parts)
     assert len(parts) == 1, len(parts)
 
+
 def test_extract_partitions_fq():
     seqfile = utils.get_test_data('random-20-a.fq')
-    graphbase = _make_graph(seqfile, do_partition=True, annotate_partitions=True)
+    graphbase = _make_graph(
+        seqfile, do_partition=True, annotate_partitions=True)
     in_dir = os.path.dirname(graphbase)
 
     # get the final part file
@@ -875,7 +881,7 @@ def test_extract_partitions_fq():
     # ok, now run extract-partitions.
     script = scriptpath('extract-partitions.py')
     args = ['extracted', partfile]
-    
+
     runscript(script, args, in_dir)
 
     distfile = os.path.join(in_dir, 'extracted.dist')
@@ -886,15 +892,16 @@ def test_extract_partitions_fq():
     dist = open(distfile).readline()
     assert dist.strip() == '99 1 1 99'
 
-    parts = [ r.name.split('\t')[1] for r in screed.open(partfile) ]
+    parts = [r.name.split('\t')[1] for r in screed.open(partfile)]
     assert len(parts) == 99, len(parts)
     parts = set(parts)
     assert len(parts) == 1, len(parts)
 
-    quals = set([ r.accuracy for r in screed.open(partfile) ])
+    quals = set([r.accuracy for r in screed.open(partfile)])
     quals = list(quals)
     assert quals[0], quals
 
+
 def test_abundance_dist():
     infile = utils.get_temp_filename('test.fa')
     outfile = utils.get_temp_filename('test.dist')
@@ -1222,18 +1229,20 @@ def test_sample_reads_randomly():
                         '895:1:1:1327:13028', '895:1:1:1368:4434',
                         '895:1:1:1335:19932', '895:1:1:1340:19387'])
 
+
 def test_sweep_reads_by_partition_buffered():
     readfile = utils.get_temp_filename('reads.fa')
     contigfile = utils.get_temp_filename('contigs.fp')
     in_dir = os.path.dirname(contigfile)
-    
+
     shutil.copyfile(utils.get_test_data('test-sweep-reads.fa'), readfile)
-    shutil.copyfile(utils.get_test_data('test-sweep-contigs.fp'), contigfile)    
+    shutil.copyfile(utils.get_test_data('test-sweep-contigs.fp'), contigfile)
 
     script = scriptpath('sweep-reads-by-partition-buffered.py')
-    args = ['-k', '25', '-o', 'test', '-i', contigfile, readfile, 'junkfile.fa']
+    args = ['-k', '25', '-o', 'test', '-i',
+            contigfile, readfile, 'junkfile.fa']
     status, out, err = runscript(script, args, in_dir)
-    
+
     # check if the bad file was skipped without issue
     assert 'ERROR' in err
     assert 'skipping' in err
@@ -1242,13 +1251,13 @@ def test_sweep_reads_by_partition_buffered():
     out2 = os.path.join(in_dir, 'test_1.fa')
     mout = os.path.join(in_dir, 'test_multi.fa')
     oout = os.path.join(in_dir, 'test_orphaned.fa')
-    
+
     print os.listdir(in_dir)
 
     seqs1 = set([r.name for r in screed.open(out1)])
     seqs2 = set([r.name for r in screed.open(out2)])
     seqsm = set([r.name for r in screed.open(mout)])
-    seqso = set([r.name for r in screed.open(oout)])    
+    seqso = set([r.name for r in screed.open(oout)])
 
     print seqs1
     print seqs2
@@ -1256,6 +1265,6 @@ def test_sweep_reads_by_partition_buffered():
     print seqso
     assert seqs1 == set(['read1_p0\t0', 'read2_p0\t0'])
     assert seqs2 == set(['read3_p1\t1'])
-    assert (seqsm == set(['read4_multi\t0\t1']) or \
+    assert (seqsm == set(['read4_multi\t0\t1']) or
             seqsm == set(['read4_multi\t1\t0']))
     assert seqso == set(['read5_orphan'])
diff --git a/tests/test_subset_graph.py b/tests/test_subset_graph.py
index 0075630250..c38ced8c29 100644
--- a/tests/test_subset_graph.py
+++ b/tests/test_subset_graph.py
@@ -406,13 +406,14 @@ def test_small_real_partitions():
 CCTCGGGCCTTTCCGTTCCGTTGCCGCCCAAGCTCTCTAGCATCGAATCGGTCAAGCGGT\
 """
 
+
 def test_partition_on_abundance_1():
     print (a,)
     print (b,)
     kh = khmer.new_counting_hash(20, 1e6, 4)
     for i in range(10):
         print kh.consume_and_tag(a)
-        
+
     for i in range(10):
         print kh.consume_and_tag(b)
 
@@ -421,11 +422,12 @@ def test_partition_on_abundance_1():
     x = p.count_partitions()
     assert x == (1, 0)                  # one partition, no remainders
 
+
 def test_partition_on_abundance_2():
     kh = khmer.new_counting_hash(20, 1e6, 4)
     for i in range(10):
         print kh.consume_and_tag(a)
-        
+
     for i in range(5):
         print kh.consume_and_tag(b)
 
@@ -434,11 +436,12 @@ def test_partition_on_abundance_2():
     x = p.count_partitions()
     assert x == (1, 6)                  # one partition, six disconnected
 
+
 def test_partition_on_abundance_3():
     kh = khmer.new_counting_hash(20, 1e6, 4)
     for i in range(10):
         print kh.consume_and_tag(a)
-        
+
     for i in range(5):
         print kh.consume_and_tag(b)
 
@@ -447,16 +450,17 @@ def test_partition_on_abundance_3():
 
     # this will get paths only in 'b'
     p = kh.do_subset_partition_with_abundance(5, 10)
-    
+
     x = p.count_partitions()
     print x
     assert x == (2, 2)                  # two partitions, two ignored tags
 
+
 def test_partition_overlap_1():
     kh = khmer.new_counting_hash(20, 1e6, 4)
     for i in range(10):
         kh.consume_and_tag(a)
-        
+
     for i in range(10):
         kh.consume_and_tag(b)
 
@@ -466,17 +470,18 @@ def test_partition_overlap_1():
     # this will get paths only in 'a', again -- should be the same!
     p2 = kh.do_subset_partition_with_abundance(10, 50)
 
-    #p1.report_on_partitions()
-    #p2.report_on_partitions()
+    # p1.report_on_partitions()
+    # p2.report_on_partitions()
 
     x = p1.compare_partitions(3, p2, 3)
     assert x == (0, 0, 14), x
 
+
 def test_partition_overlap_2():
     kh = khmer.new_counting_hash(20, 1e6, 4)
     for i in range(10):
         kh.consume_and_tag(a)
-        
+
     for i in range(5):
         kh.consume_and_tag(b)
 
@@ -486,8 +491,8 @@ def test_partition_overlap_2():
     # this will get paths only in 'b'
     p2 = kh.do_subset_partition_with_abundance(5, 10)
 
-    #p1.report_on_partitions()
-    #p2.report_on_partitions()
+    # p1.report_on_partitions()
+    # p2.report_on_partitions()
 
     x = p1.compare_partitions(3, p2, 3)
     assert x == (8, 6, 0), x
@@ -497,7 +502,7 @@ def test_partition_overlap_2():
 
     x = p1.partition_sizes()
     assert x == ([(3L, 8L)], 0), x
-    
+
     x = p2.partition_sizes()
     assert x == ([(3L, 6L), (5L, 6L)], 2), x
 
@@ -506,4 +511,3 @@ def test_partition_overlap_2():
 
     x = p2.partition_average_coverages(kh)
     assert x == [(3L, 5L), (5L, 10L)], x
-

From 9a511c40e11ab93b77f45c96d8310d94946568ca Mon Sep 17 00:00:00 2001
From: "Michael R. Crusoe" <mcrusoe@msu.edu>
Date: Fri, 10 Jan 2014 17:57:29 -0500
Subject: [PATCH 124/140] cpychecker fixes

---
 khmer/_khmermodule.cc | 253 +++++++++++++++++++++++-------------------
 lib/subset.cc         |   2 +-
 2 files changed, 141 insertions(+), 114 deletions(-)

diff --git a/khmer/_khmermodule.cc b/khmer/_khmermodule.cc
index 4cf587cc2d..50c011b1bb 100644
--- a/khmer/_khmermodule.cc
+++ b/khmer/_khmermodule.cc
@@ -159,9 +159,11 @@ void _report_fn(const char * info, void * data, unsigned long long n_reads,
     PyObject * obj = (PyObject *) data;
     if (obj != Py_None) {
       PyObject * args = Py_BuildValue("sKK", info, n_reads, other);
-      PyObject * r = PyObject_Call(obj, args, NULL);
-      Py_XDECREF(r);
-      Py_DECREF(args);
+      if (args != NULL) {
+        PyObject * r = PyObject_Call(obj, args, NULL); 
+        Py_XDECREF(r);
+      }
+      Py_XDECREF(args);
     }
   }
 
@@ -1970,10 +1972,11 @@ void _dump_report_fn(const char * info, unsigned int count, void * data)
     PyObject * obj = (PyObject *) data;
     if (obj != Py_None) {
       PyObject * args = Py_BuildValue("sI", info, count);
-
-      PyObject * r = PyObject_Call(obj, args, NULL);
-      Py_XDECREF(r);
-      Py_DECREF(args);
+      if (args != NULL) {
+	PyObject * r = PyObject_Call(obj, args, NULL);
+        Py_XDECREF(r);
+      }
+      Py_XDECREF(args);
     }
   }
 
@@ -2061,8 +2064,7 @@ static PyObject * hash_consume_and_tag(PyObject * self, PyObject * args)
   khmer_KCountingHashObject * me = (khmer_KCountingHashObject *) self;
   khmer::CountingHash * counting = me->counting;
 
-  char * seq;
-  PyObject * callback_obj = NULL;
+  const char * seq;
 
   if (!PyArg_ParseTuple(args, "s", &seq)) {
     return NULL;
@@ -2078,7 +2080,7 @@ static PyObject * hash_consume_and_tag(PyObject * self, PyObject * args)
     return NULL;
   }
 
-  return Py_BuildValue("L", n_consumed);
+  return Py_BuildValue("K", n_consumed);
 }
 
 static PyObject * hash_consume_fasta_and_tag(PyObject * self, PyObject * args)
@@ -2086,7 +2088,7 @@ static PyObject * hash_consume_fasta_and_tag(PyObject * self, PyObject * args)
   khmer_KCountingHashObject * me = (khmer_KCountingHashObject *) self;
   khmer::CountingHash * counting = me->counting;
 
-  char * filename;
+  const char * filename;
   PyObject * callback_obj = NULL;
 
   if (!PyArg_ParseTuple(args, "s|O", &filename, &callback_obj)) {
@@ -2105,7 +2107,7 @@ static PyObject * hash_consume_fasta_and_tag(PyObject * self, PyObject * args)
     return NULL;
   }
 
-  return Py_BuildValue("iL", total_reads, n_consumed);
+  return Py_BuildValue("IK", total_reads, n_consumed);
 }
 
 static PyObject * hash_find_all_tags_truncate_on_abundance(PyObject * self, PyObject *args)
@@ -2113,10 +2115,10 @@ static PyObject * hash_find_all_tags_truncate_on_abundance(PyObject * self, PyOb
   khmer_KCountingHashObject * me = (khmer_KCountingHashObject *) self;
   khmer::CountingHash * counting = me->counting;
 
-  char * kmer_s = NULL;
-  unsigned int min_count, max_count;
+  const char * kmer_s = NULL;
+  BoundedCounterType min_count, max_count;
 
-  if (!PyArg_ParseTuple(args, "sii", &kmer_s, &min_count, &max_count)) {
+  if (!PyArg_ParseTuple(args, "sHH", &kmer_s, &min_count, &max_count)) {
     return NULL;
   }
 
@@ -2153,9 +2155,9 @@ static PyObject * hash_do_subset_partition_with_abundance(PyObject * self, PyObj
   khmer::HashIntoType start_kmer = 0, end_kmer = 0;
   PyObject * break_on_stop_tags_o = NULL;
   PyObject * stop_big_traversals_o = NULL;
-  unsigned int min_count, max_count;
+  BoundedCounterType min_count, max_count;
 
-  if (!PyArg_ParseTuple(args, "ii|KKOOO",
+  if (!PyArg_ParseTuple(args, "HH|KKOOO",
 			&min_count, &max_count,
 			&start_kmer, &end_kmer,
 			&break_on_stop_tags_o,
@@ -2190,6 +2192,10 @@ static PyObject * hash_do_subset_partition_with_abundance(PyObject * self, PyObj
   khmer_KSubsetPartitionObject * subset_obj = (khmer_KSubsetPartitionObject *)\
     PyObject_New(khmer_KSubsetPartitionObject, &khmer_KSubsetPartitionType);
 
+  if (subset_obj == NULL) {
+      return NULL;
+  }
+
   subset_obj->subset = subset_p;
 
   return (PyObject *) subset_obj;
@@ -2309,7 +2315,7 @@ static PyObject* _new_counting_hash(PyObject * self, PyObject * args)
   std::vector<khmer::HashIntoType> sizes;
   Py_ssize_t sizes_list_o_length = PyObject_Length(sizes_list_o);
   if (sizes_list_o_length == -1) {
-     return NULL;
+      return NULL;
   } 
   for (int i = 0; i < sizes_list_o_length; i++) {
     PyObject * size_o = PyList_GET_ITEM(sizes_list_o, i);
@@ -2332,6 +2338,51 @@ static PyObject* _new_counting_hash(PyObject * self, PyObject * args)
 // hashbits stuff
 //
 
+static void khmer_hashbits_dealloc(PyObject * obj);
+static PyObject* khmer_hashbits_new(PyTypeObject * type, PyObject * args, PyObject * kwds);
+static int khmer_hashbits_init(khmer_KHashbitsObject * self, PyObject * args, PyObject * kwds); 
+static PyObject * khmer_hashbits_getattr(PyObject * obj, char * name);
+
+static PyTypeObject khmer_KHashbitsType = {
+    PyObject_HEAD_INIT(NULL)
+    0,
+    "Hashbits", sizeof(khmer_KHashbitsObject),
+    0,
+    (destructor)khmer_hashbits_dealloc,	/*tp_dealloc*/
+    0,				/*tp_print*/
+    khmer_hashbits_getattr,	/*tp_getattr*/
+    0,				/*tp_setattr*/
+    0,				/*tp_compare*/
+    0,				/*tp_repr*/
+    0,				/*tp_as_number*/
+    0,				/*tp_as_sequence*/
+    0,				/*tp_as_mapping*/
+    0,				/*tp_hash */
+    0,				/*tp_call*/
+    0,				/*tp_str*/
+    0,				/*tp_getattro*/
+    0,				/*tp_setattro*/
+    0,				/*tp_as_buffer*/
+    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE,		/*tp_flags*/
+    "hashbits object",           /* tp_doc */
+    0,                       /* tp_traverse */
+    0,                       /* tp_clear */
+    0,                       /* tp_richcompare */
+    0,                       /* tp_weaklistoffset */
+    0,                       /* tp_iter */
+    0,                       /* tp_iternext */
+    0,  /* tp_methods */
+    0,                       /* tp_members */
+    0,                       /* tp_getset */
+    0,                       /* tp_base */
+    0,                       /* tp_dict */
+    0,                       /* tp_descr_get */
+    0,                       /* tp_descr_set */
+    0,                       /* tp_dictoffset */
+    (initproc)khmer_hashbits_init,   /* tp_init */
+    0,                       /* tp_alloc */
+};
+
 static PyObject * hashbits_n_unique_kmers(PyObject * self, PyObject * args)
 {
     khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self;
@@ -2358,7 +2409,7 @@ static PyObject * hashbits_count_overlap(PyObject * self, PyObject * args)
   PyObject * callback_obj = NULL;
   khmer::Hashbits * ht2;
 
-  if (!PyArg_ParseTuple(args, "sO|O", &filename, &ht2_argu,
+  if (!PyArg_ParseTuple(args, "sO!|O", &filename, &khmer_KHashbitsType, &ht2_argu,
 			&callback_obj)) {
     return NULL;
   }
@@ -3904,50 +3955,6 @@ khmer_hashbits_getattr(PyObject * obj, char * name)
   return Py_FindMethod(khmer_hashbits_methods, obj, name);
 }
 
-static void khmer_hashbits_dealloc(PyObject * obj);
-static PyObject* khmer_hashbits_new(PyTypeObject * type, PyObject * args, PyObject * kwds);
-static int khmer_hashbits_init(khmer_KHashbitsObject * self, PyObject * args, PyObject * kwds); 
-
-static PyTypeObject khmer_KHashbitsType = {
-    PyObject_HEAD_INIT(NULL)
-    0,
-    "Hashbits", sizeof(khmer_KHashbitsObject),
-    0,
-    (destructor)khmer_hashbits_dealloc,	/*tp_dealloc*/
-    0,				/*tp_print*/
-    khmer_hashbits_getattr,	/*tp_getattr*/
-    0,				/*tp_setattr*/
-    0,				/*tp_compare*/
-    0,				/*tp_repr*/
-    0,				/*tp_as_number*/
-    0,				/*tp_as_sequence*/
-    0,				/*tp_as_mapping*/
-    0,				/*tp_hash */
-    0,				/*tp_call*/
-    0,				/*tp_str*/
-    0,				/*tp_getattro*/
-    0,				/*tp_setattro*/
-    0,				/*tp_as_buffer*/
-    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE,		/*tp_flags*/
-    "hashbits object",           /* tp_doc */
-    0,                       /* tp_traverse */
-    0,                       /* tp_clear */
-    0,                       /* tp_richcompare */
-    0,                       /* tp_weaklistoffset */
-    0,                       /* tp_iter */
-    0,                       /* tp_iternext */
-    khmer_hashbits_methods,  /* tp_methods */
-    0,                       /* tp_members */
-    0,                       /* tp_getset */
-    0,                       /* tp_base */
-    0,                       /* tp_dict */
-    0,                       /* tp_descr_get */
-    0,                       /* tp_descr_set */
-    0,                       /* tp_dictoffset */
-    (initproc)khmer_hashbits_init,   /* tp_init */
-    0,                       /* tp_alloc */
-};
-
 // __new__ for hashbits; necessary for proper subclassing
 // This will essentially do what the old factory function did. Unlike many __new__
 // methods, we take our arguments here, because there's no "unitialized" hashbits
@@ -3962,7 +3969,8 @@ static PyObject* khmer_hashbits_new(PyTypeObject * type, PyObject * args, PyObje
         PyObject* sizes_list_o = NULL;
 
         if (!PyArg_ParseTuple(args, "IO", &k, &sizes_list_o)) {
-            return NULL;
+            Py_DECREF(self);
+	    return NULL;
         }
 
         std::vector<khmer::HashIntoType> sizes;
@@ -3998,7 +4006,7 @@ static PyObject * subset_count_partitions(PyObject * self,
   unsigned int n_partitions = 0, n_unassigned = 0;
   subset_p->count_partitions(n_partitions, n_unassigned);
 
-  return Py_BuildValue("ii", n_partitions, n_unassigned);
+  return Py_BuildValue("II", n_partitions, n_unassigned);
 }
 
 static PyObject * subset_report_on_partitions(PyObject * self,
@@ -4024,9 +4032,9 @@ static PyObject * subset_compare_partitions(PyObject * self,
   khmer::SubsetPartition * subset1_p = me->subset;
 
   PyObject * subset2_obj = NULL;
-  unsigned int pid1, pid2;	// @CTB ensure that these are unsigned?
+  PartitionID pid1, pid2;	// @CTB ensure that these are unsigned?
 
-  if (!PyArg_ParseTuple(args, "iOi",
+  if (!PyArg_ParseTuple(args, "IOI",
 			&pid1, &subset2_obj, &pid2)) {
     return NULL;
   }
@@ -4035,11 +4043,10 @@ static PyObject * subset_compare_partitions(PyObject * self,
   khmer::SubsetPartition * subset2_p = other->subset;
 
   unsigned int n_only1 = 0, n_only2 = 0, n_shared = 0;
-  subset1_p->compare_to_partition((PartitionID) pid1,
-				  subset2_p, (PartitionID) pid2,
+  subset1_p->compare_to_partition(pid1, subset2_p, pid2,
 				  n_only1, n_only2, n_shared);
 
-  return Py_BuildValue("iii", n_only1, n_only2, n_shared);
+  return Py_BuildValue("III", n_only1, n_only2, n_shared);
 }
 
 static PyObject * subset_partition_size_distribution(PyObject * self,
@@ -4058,15 +4065,24 @@ static PyObject * subset_partition_size_distribution(PyObject * self,
   subset_p->partition_size_distribution(d, n_unassigned);
 
   PyObject * x = PyList_New(d.size());
-  khmer::PartitionCountDistribution::const_iterator di;
+  if (x == NULL) {
+      return NULL;
+  }
+  khmer::PartitionCountDistribution::iterator di;
 
   unsigned int i;
   for (i = 0, di = d.begin(); di != d.end(); di++, i++) {
-    PyList_SET_ITEM(x, i, Py_BuildValue("LL", di->first, di->second));
+      PyObject * tup = Py_BuildValue("KK", di->first, di->second);
+      if (tup != NULL) {
+	  PyList_SET_ITEM(x, i, tup);
+      }
+      Py_XDECREF(tup);
   }
   assert (i == d.size());
 
-  return Py_BuildValue("Oi", x, n_unassigned);
+  PyObject * ret = Py_BuildValue("OI", x, n_unassigned);
+  Py_DECREF(x);
+  return ret;
 }
 
 static PyObject * subset_partition_sizes(PyObject * self,
@@ -4077,7 +4093,7 @@ static PyObject * subset_partition_sizes(PyObject * self,
 
   unsigned int min_size = 0;
 
-  if (!PyArg_ParseTuple(args, "|i", &min_size)) {
+  if (!PyArg_ParseTuple(args, "|I", &min_size)) {
     return NULL;
   }
   
@@ -4085,23 +4101,33 @@ static PyObject * subset_partition_sizes(PyObject * self,
   unsigned int n_unassigned = 0;
   subset_p->partition_sizes(cm, n_unassigned);
 
-  unsigned int i;
+  unsigned int i = 0;
   khmer::PartitionCountMap::const_iterator mi;
-  for (i = 0, mi = cm.begin(); mi != cm.end(); mi++) {
+  for (mi = cm.begin(); mi != cm.end(); mi++) {
     if (mi->second >= min_size) i++;
   }
 
   PyObject * x = PyList_New(i);
+  if (x == NULL) {
+      return NULL;
+  }
 
   // this should probably be a dict. @CTB
   for (i = 0, mi = cm.begin(); mi != cm.end(); mi++) {
     if (mi->second >= min_size) {
-      PyList_SET_ITEM(x, i, Py_BuildValue("LL", mi->first, mi->second));
+	PyObject * tup = Py_BuildValue("II", mi->first, mi->second);
+	if (tup != NULL) {
+	    PyList_SET_ITEM(x, i, tup);
+	}
+	Py_XDECREF(tup);
       i++;
     }
   }
 
-  return Py_BuildValue("Oi", x, n_unassigned);
+  PyObject * ret = Py_BuildValue("OI", x, n_unassigned);
+  Py_DECREF(x);
+
+  return ret;
 }
 
 static PyObject * subset_partition_average_coverages(PyObject * self,
@@ -4122,16 +4148,23 @@ static PyObject * subset_partition_average_coverages(PyObject * self,
   subset_p->partition_average_coverages(cm, counting);
 
   unsigned int i;
-  khmer::PartitionCountMap::const_iterator mi;
+  khmer::PartitionCountMap::iterator mi;
 
   PyObject * x = PyList_New(cm.size());
+  if (x == NULL) {
+      return NULL;
+  }
 
   // this should probably be a dict. @CTB
   for (i = 0, mi = cm.begin(); mi != cm.end(); mi++, i++) {
-    PyList_SET_ITEM(x, i, Py_BuildValue("LL", mi->first, mi->second));
+      PyObject * tup = Py_BuildValue("II", mi->first, mi->second);
+      if (tup != NULL) {
+	PyList_SET_ITEM(x, i, tup);
+      }
+      Py_XDECREF(tup);
   }
 
-  return Py_BuildValue("O", x);
+  return x;
 }
 
 static PyMethodDef khmer_subset_methods[] = {
@@ -4240,12 +4273,21 @@ static PyObject * labelhash_get_label_dict(PyObject * self, PyObject * args) {
   khmer::LabelHash * hb = me->labelhash;
   
   PyObject * d = PyDict_New();
+  if (d == NULL) {
+      return NULL;
+  }
   khmer::LabelPtrMap::iterator it;
   
   for (it = hb->label_ptrs.begin(); it!=hb->label_ptrs.end(); ++it) {
-    PyDict_SetItem(d, Py_BuildValue("K", it->first), Py_BuildValue("K", it->second));
+    PyObject * key = Py_BuildValue("K", it->first);
+    Py_XDECREF(key);
+    PyObject * val = Py_BuildValue("K", it->second);
+    Py_XDECREF(val);
+    if (key != NULL && val != NULL) {
+	PyDict_SetItem(d, key, val);
+    }
   }
-  
+
   return d;
 }
 
@@ -4256,7 +4298,7 @@ static PyObject * labelhash_consume_fasta_and_tag_with_labels(PyObject * self, P
   
   std::ofstream outfile;
   
-  char * filename;
+  const char * filename;
   PyObject * callback_obj = NULL;
 
   if (!PyArg_ParseTuple(args, "s|O", &filename, &callback_obj)) {
@@ -4277,7 +4319,7 @@ static PyObject * labelhash_consume_fasta_and_tag_with_labels(PyObject * self, P
   //Py_END_ALLOW_THREADS
   if (exc_raised) return NULL;
   
-  return Py_BuildValue("iL", total_reads, n_consumed);
+  return Py_BuildValue("IK", total_reads, n_consumed);
   
 }
 
@@ -4287,7 +4329,7 @@ static PyObject * labelhash_consume_partitioned_fasta_and_tag_with_labels(
   khmer_KLabelHashObject * me = (khmer_KLabelHashObject *) self;
   khmer::LabelHash * labelhash = me->labelhash;
 
-  char * filename;
+  const char * filename;
   PyObject * callback_obj = NULL;
 
   if (!PyArg_ParseTuple(args, "s|O", &filename, &callback_obj)) {
@@ -4306,41 +4348,34 @@ static PyObject * labelhash_consume_partitioned_fasta_and_tag_with_labels(
     std::cout << "caught exception in consume_partitioned_fasta_and_tag_with_labels!" << std::endl;
     return NULL;
   }
-  std::cout << "building value for return..." << std::endl;
-  return Py_BuildValue("iK", total_reads, n_consumed);
+  return Py_BuildValue("IK", total_reads, n_consumed);
 }
 
 static PyObject * labelhash_consume_sequence_and_tag_with_labels(PyObject * self, PyObject * args) {
   khmer_KLabelHashObject * me = (khmer_KLabelHashObject *) self;
   khmer::LabelHash * hb = me->labelhash;
-  std::cout << "inside labelhash consume cpython func, parsing args..." << std::endl;
-  char * seq = NULL;
-  unsigned long long c = NULL;
+  const char * seq = NULL;
+  unsigned long long c = 0;
   if (!PyArg_ParseTuple(args, "sK", &seq, &c)) {
     return NULL;
   }
-  std::cout << "parsed args, getting new label" << std::endl;
   unsigned long long n_consumed = 0;
   khmer::Label * the_label = hb->check_and_allocate_label(c);
 
   try { 
-  //if (hb->check_and_normalize_read(seq)) {
-    std::cout << "calling low level consume func on labelhash..." << std::endl;
     hb->consume_sequence_and_tag_with_labels(seq, n_consumed, *the_label);
-  //}
   } catch (_khmer_signal &e) {
     return NULL;
   }
-  std::cout << "packaging return value and returning!" << std::endl;
-  return Py_BuildValue("L", n_consumed);
+  return Py_BuildValue("K", n_consumed);
 }
 
 static PyObject * labelhash_sweep_label_neighborhood(PyObject * self, PyObject * args) {
   khmer_KLabelHashObject * me = (khmer_KLabelHashObject *) self;
   khmer::LabelHash * hb = me->labelhash;
   
-  char * seq = NULL;
-  unsigned int r = NULL;
+  const char * seq = NULL;
+  int r = -1;
   PyObject * break_on_stop_tags_o = NULL;
   PyObject * stop_big_traversals_o = NULL;
 
@@ -4372,10 +4407,10 @@ static PyObject * labelhash_sweep_label_neighborhood(PyObject * self, PyObject *
   LabelPtrSet found_labels;
   
   bool exc_raised = false;
-  unsigned int num_traversed = 0;
+  //unsigned int num_traversed = 0;
   //Py_BEGIN_ALLOW_THREADS
   try {
-    num_traversed = hb->sweep_label_neighborhood(seq, found_labels, range, break_on_stop_tags, stop_big_traversals);
+    hb->sweep_label_neighborhood(seq, found_labels, range, break_on_stop_tags, stop_big_traversals);
   } catch (_khmer_signal &e) {
     exc_raised = true;
   }
@@ -4404,8 +4439,8 @@ static PyObject * labelhash_sweep_tag_neighborhood(PyObject * self, PyObject *ar
   khmer_KLabelHashObject * me = (khmer_KLabelHashObject *) self;
   khmer::LabelHash * labelhash = me->labelhash;
 
-  char * seq = NULL;
-  unsigned int r = NULL;
+  const char * seq = NULL;
+  int r = -1;
   PyObject * break_on_stop_tags_o = NULL;
   PyObject * stop_big_traversals_o = NULL;
 
@@ -4443,7 +4478,7 @@ static PyObject * labelhash_sweep_tag_neighborhood(PyObject * self, PyObject *ar
   //Py_END_ALLOW_THREADS
 
   PyObject * x =  PyList_New(tagged_kmers.size());
-  khmer::SeenSet::const_iterator si;
+  khmer::SeenSet::iterator si;
   unsigned long long i = 0;
   for (si=tagged_kmers.begin(); si!=tagged_kmers.end(); ++si) {
     //std::string kmer_s = _revhash(*si, labelhash->ksize());
@@ -4508,13 +4543,6 @@ static PyMethodDef khmer_labelhash_methods[] = {
   {NULL, NULL, 0, NULL}           /* sentinel */
 };
 
-// still necessary?
-static PyObject *
-khmer_labelhash_getattr(PyObject * obj, char * name)
-{
-  return Py_FindMethod(khmer_labelhash_methods, obj, name);
-}
-
 static PyTypeObject khmer_KLabelHashType = {
     PyObject_HEAD_INIT(NULL)
     0,                       /* ob_size */
@@ -4949,8 +4977,8 @@ init_khmer(void)
     // implemented __new__ for Hashbits; keeping factory func around as well
     // for backwards compat with old scripts
     khmer_KHashbitsType.tp_new = khmer_hashbits_new;
+    khmer_KHashbitsType.tp_methods = khmer_hashbits_methods;
     if (PyType_Ready(&khmer_KHashbitsType) < 0) {
-        std::cout << "_khmer.KHashbitsType failed PyType_Ready" << std::endl;
         return;
     }
     // add LabelHash
@@ -4958,7 +4986,6 @@ init_khmer(void)
     khmer_KLabelHashType.tp_base = &khmer_KHashbitsType;
     khmer_KLabelHashType.tp_new = khmer_labelhash_new;
     if (PyType_Ready(&khmer_KLabelHashType) < 0) {
-        std::cout << "_khmer.KLabelHashType failed PyType_Ready" << std::endl; 
         return;
     }
     
diff --git a/lib/subset.cc b/lib/subset.cc
index 3d59244099..16b12576b9 100644
--- a/lib/subset.cc
+++ b/lib/subset.cc
@@ -1406,7 +1406,7 @@ const
 
   partition_sizes(cm, n_unassigned);
 
-  for (PartitionCountMap::const_iterator cmi = cm.begin(); cmi != cm.end();
+  for (PartitionCountMap::iterator cmi = cm.begin(); cmi != cm.end();
        cmi++) {
     d[cmi->second]++;
   }

From ee8e819089a969a40df8e5710b8231444af16dce Mon Sep 17 00:00:00 2001
From: Camille Scott <camille.scott.w@gmail.com>
Date: Fri, 10 Jan 2014 18:19:51 -0500
Subject: [PATCH 125/140] fixed reference count errors in pylist_setitem and
 pydict_setitem causing incorrect garbage collection

---
 khmer/_khmermodule.cc | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/khmer/_khmermodule.cc b/khmer/_khmermodule.cc
index 50c011b1bb..987d5a73ba 100644
--- a/khmer/_khmermodule.cc
+++ b/khmer/_khmermodule.cc
@@ -4119,7 +4119,6 @@ static PyObject * subset_partition_sizes(PyObject * self,
 	if (tup != NULL) {
 	    PyList_SET_ITEM(x, i, tup);
 	}
-	Py_XDECREF(tup);
       i++;
     }
   }
@@ -4161,7 +4160,6 @@ static PyObject * subset_partition_average_coverages(PyObject * self,
       if (tup != NULL) {
 	PyList_SET_ITEM(x, i, tup);
       }
-      Py_XDECREF(tup);
   }
 
   return x;
@@ -4279,13 +4277,13 @@ static PyObject * labelhash_get_label_dict(PyObject * self, PyObject * args) {
   khmer::LabelPtrMap::iterator it;
   
   for (it = hb->label_ptrs.begin(); it!=hb->label_ptrs.end(); ++it) {
-    PyObject * key = Py_BuildValue("K", it->first);
-    Py_XDECREF(key);
+    PyObject * key = Py_BuildValue("K", it->first);    
     PyObject * val = Py_BuildValue("K", it->second);
-    Py_XDECREF(val);
     if (key != NULL && val != NULL) {
-	PyDict_SetItem(d, key, val);
+	    PyDict_SetItem(d, key, val);
     }
+    Py_XDECREF(key);
+    Py_XDECREF(val);
   }
 
   return d;
@@ -4375,7 +4373,7 @@ static PyObject * labelhash_sweep_label_neighborhood(PyObject * self, PyObject *
   khmer::LabelHash * hb = me->labelhash;
   
   const char * seq = NULL;
-  int r = -1;
+  int r = 0;
   PyObject * break_on_stop_tags_o = NULL;
   PyObject * stop_big_traversals_o = NULL;
 
@@ -4440,7 +4438,7 @@ static PyObject * labelhash_sweep_tag_neighborhood(PyObject * self, PyObject *ar
   khmer::LabelHash * labelhash = me->labelhash;
 
   const char * seq = NULL;
-  int r = -1;
+  int r = 0;
   PyObject * break_on_stop_tags_o = NULL;
   PyObject * stop_big_traversals_o = NULL;
 

From ab6760896cb5554bea7b975f8e781df55a96c573 Mon Sep 17 00:00:00 2001
From: "Michael R. Crusoe" <mcrusoe@msu.edu>
Date: Mon, 13 Jan 2014 12:54:55 -0500
Subject: [PATCH 126/140] add some error messages

---
 khmer/_khmermodule.cc | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/khmer/_khmermodule.cc b/khmer/_khmermodule.cc
index 50c011b1bb..266b55dcb3 100644
--- a/khmer/_khmermodule.cc
+++ b/khmer/_khmermodule.cc
@@ -31,7 +31,6 @@ extern "C" {
   void init_khmer();
 }
 
-
 // Configure module logging.
 //#define WITH_INTERNAL_TRACING
 namespace khmer
@@ -2315,6 +2314,7 @@ static PyObject* _new_counting_hash(PyObject * self, PyObject * args)
   std::vector<khmer::HashIntoType> sizes;
   Py_ssize_t sizes_list_o_length = PyObject_Length(sizes_list_o);
   if (sizes_list_o_length == -1) {
+      PyErr_SetString(PyExc_ValueError, "error with hashtable primes!");
       return NULL;
   } 
   for (int i = 0; i < sizes_list_o_length; i++) {
@@ -2381,7 +2381,7 @@ static PyTypeObject khmer_KHashbitsType = {
     0,                       /* tp_dictoffset */
     (initproc)khmer_hashbits_init,   /* tp_init */
     0,                       /* tp_alloc */
-};
+} ;
 
 static PyObject * hashbits_n_unique_kmers(PyObject * self, PyObject * args)
 {
@@ -4400,6 +4400,8 @@ static PyObject * labelhash_sweep_label_neighborhood(PyObject * self, PyObject *
   }
   
   if (strlen(seq) < hb->ksize()) {
+      PyErr_SetString(PyExc_ValueError,
+	      "string length must >= the hashtable k-mer size");
     return NULL;
   }
   
@@ -4465,6 +4467,8 @@ static PyObject * labelhash_sweep_tag_neighborhood(PyObject * self, PyObject *ar
   }
   
   if (strlen(seq) < labelhash->ksize()) {
+      PyErr_SetString(PyExc_ValueError,
+	      "string length must >= the hashtable k-mer size");
     return NULL;
   }
 
@@ -4478,6 +4482,9 @@ static PyObject * labelhash_sweep_tag_neighborhood(PyObject * self, PyObject *ar
   //Py_END_ALLOW_THREADS
 
   PyObject * x =  PyList_New(tagged_kmers.size());
+  if (x == NULL) {
+      return NULL;
+  }
   khmer::SeenSet::iterator si;
   unsigned long long i = 0;
   for (si=tagged_kmers.begin(); si!=tagged_kmers.end(); ++si) {

From 9a0e71ea13f573cfd84114aab8a956b67cd98e8e Mon Sep 17 00:00:00 2001
From: Camille Scott <camille.scott.w@gmail.com>
Date: Mon, 13 Jan 2014 15:26:58 -0500
Subject: [PATCH 127/140] fixed typo and cleaned up old code comments

---
 khmer/_khmermodule.cc                        | 11 -----------
 scripts/sweep-reads-by-partition-buffered.py |  9 +--------
 2 files changed, 1 insertion(+), 19 deletions(-)

diff --git a/khmer/_khmermodule.cc b/khmer/_khmermodule.cc
index 6139f9a87f..7f93188adb 100644
--- a/khmer/_khmermodule.cc
+++ b/khmer/_khmermodule.cc
@@ -4189,17 +4189,6 @@ khmer_subset_getattr(PyObject * obj, char * name)
 typedef struct {
   //PyObject_HEAD
   khmer_KHashbitsObject khashbits;
-  /* @camillescott late night notes:
-     need to experiment. might be able to call hashbits py methods
-     directly with the labelhash object, because they all instantiate
-     a new hashbits pointer on themselves to call the functions and labelhash
-     inherits from hashbits; or, we define a hashbits object as part of this struct
-     as called for in the c-api reference. need to grok that still.
-     If this is how it's done, remove PyObject_HEAD, which will already be included
-     in the base class struct.
-     See http://docs.python.org/2.7/extending/newtypes.html#subclassing-other-types
-     for details...
-  */
   khmer::LabelHash * labelhash;
 } khmer_KLabelHashObject;
 
diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py
index 75996be973..0c621b16a3 100755
--- a/scripts/sweep-reads-by-partition-buffered.py
+++ b/scripts/sweep-reads-by-partition-buffered.py
@@ -12,7 +12,7 @@
 <reads1> <reads2> ... <readsN>
 
 This script is very lenient on IO errors, due to the large number of file
-operations needed. Thus, errors opening a file for buffer flush or writeing
+operations needed. Thus, errors opening a file for buffer flush or writing
 a read to a file will not crash the program; instead, if there were errors,
 the user will be warned at the end of execution. Errors with opening read files
 are also handled -- we move on to the next read file if there is an error opening.
@@ -54,13 +54,6 @@ def write_seq(fp, name, seq, labels=[]):
     else:
         return 0
 
-# stores reads in memory and flushes them to their appropriate files
-# when certain criteria are met
-# Basic idea is to buffer some number of reads in memory, then dump them all at once
-# Hope that each file acrues, on average, BUFFER_SIZE / NUM_PARTS reads
-# ie, if we buffer 1000000 reads, and we have 100000 partitions or labels,
-# we should expect the mean buffer size to be 10 reads
-
 
 class ReadBuffer:
 

From ad71becfb4498c1eb38738a16a1521d1e9afcb17 Mon Sep 17 00:00:00 2001
From: Michael Crusoe <mcrusoe@msu.edu>
Date: Mon, 13 Jan 2014 15:36:24 -0500
Subject: [PATCH 128/140] one more error message

---
 khmer/_khmermodule.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/khmer/_khmermodule.cc b/khmer/_khmermodule.cc
index 7f93188adb..9b2f7856e1 100644
--- a/khmer/_khmermodule.cc
+++ b/khmer/_khmermodule.cc
@@ -2121,7 +2121,9 @@ static PyObject * hash_find_all_tags_truncate_on_abundance(PyObject * self, PyOb
     return NULL;
   }
 
-  if (strlen(kmer_s) < counting->ksize()) { // @@
+  if (strlen(kmer_s) < counting->ksize()) {
+    PyErr_SetString(PyExc_ValueError,
+	    "kmer_s must be less than the k-mer size of the counting hash");
     return NULL;
   }
 

From e9afda7020bef5a4882d82182c7517aa5e336888 Mon Sep 17 00:00:00 2001
From: Camille Scott <camille.scott.w@gmail.com>
Date: Mon, 13 Jan 2014 15:37:13 -0500
Subject: [PATCH 129/140] added PyErr_SetString usage to exception in
 consume_...tag_with_labels

---
 khmer/_khmermodule.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/khmer/_khmermodule.cc b/khmer/_khmermodule.cc
index 7f93188adb..8987569ef0 100644
--- a/khmer/_khmermodule.cc
+++ b/khmer/_khmermodule.cc
@@ -4332,7 +4332,7 @@ static PyObject * labelhash_consume_partitioned_fasta_and_tag_with_labels(
     labelhash->consume_partitioned_fasta_and_tag_with_labels(filename, 
     total_reads, n_consumed, _report_fn, callback_obj);
   } catch (_khmer_signal &e) {
-    std::cout << "caught exception in consume_partitioned_fasta_and_tag_with_labels!" << std::endl;
+    PyErr_SetString( PyExc_IOError, "error parsing in consume_partitioned_fasta_and_tag_with_labels");
     return NULL;
   }
   return Py_BuildValue("IK", total_reads, n_consumed);

From 9c08a5763b8f7b5ec0206699280f6b319090cb88 Mon Sep 17 00:00:00 2001
From: Camille Scott <camille.scott.w@gmail.com>
Date: Mon, 13 Jan 2014 15:37:40 -0500
Subject: [PATCH 130/140] reverted change in setup.cfg

---
 setup.cfg | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index 51f93526b2..a7c3a5e01a 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,8 +1,8 @@
 [nosetests]
 verbosity = 2
 stop = TRUE
-#attr = !known_failing
-attr = !known_failing,!highmem
+attr = !known_failing
+#attr = !known_failing,!highmem
 # where highmem > 0.5GiB memory
 
 [build_ext]

From 4b1c98ab6b69eb29e3c97f20a010c36541e15bc0 Mon Sep 17 00:00:00 2001
From: Camille Scott <camille.scott.w@gmail.com>
Date: Mon, 13 Jan 2014 15:37:55 -0500
Subject: [PATCH 131/140] inlined _parse_partition_id

---
 lib/read_parsers.hh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/read_parsers.hh b/lib/read_parsers.hh
index e8ca7e7968..1357e994e8 100644
--- a/lib/read_parsers.hh
+++ b/lib/read_parsers.hh
@@ -543,7 +543,7 @@ struct FastqParser : public IParser
 
 };
 
-static PartitionID _parse_partition_id(std::string name)
+inline PartitionID _parse_partition_id(std::string name)
 {
   PartitionID p = 0;
   const char * s = name.c_str() + name.length() - 1;

From 5f385878058f4fbcf1dcb920487753be11b23330 Mon Sep 17 00:00:00 2001
From: Camille Scott <camille.scott.w@gmail.com>
Date: Mon, 13 Jan 2014 15:38:21 -0500
Subject: [PATCH 132/140] updated description in comments and argparse to match
 function

---
 scripts/sweep-reads-by-partition-buffered.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py
index 0c621b16a3..25ecd05c88 100755
--- a/scripts/sweep-reads-by-partition-buffered.py
+++ b/scripts/sweep-reads-by-partition-buffered.py
@@ -8,7 +8,7 @@
 """
 Find all reads connected to the given contigs on a per-partition basis.
 
-% python scripts/normalize-by-median.py -r <range> -i <contigs fastp> \
+% sweep-reads-by-partition.py -r <range> -i <contigs fastp> \
 <reads1> <reads2> ... <readsN>
 
 This script is very lenient on IO errors, due to the large number of file
@@ -146,7 +146,8 @@ def flush_all(self):
 
 def main():
 
-    parser = build_construct_args()
+    parser = build_construct_args('Takes a partitioned reference file and a list of reads, \
+                                  and sorts reads by which partition they connect to')
     parser.add_argument('-i', '--input_fastp', dest='input_fastp')
     parser.add_argument(
         '-r', '--traversal_range', type=int, dest='traversal_range',

From 62a6504638799da1dfb4047ec5a28ab13865e1d4 Mon Sep 17 00:00:00 2001
From: Camille Scott <camille.scott.w@gmail.com>
Date: Mon, 13 Jan 2014 15:47:18 -0500
Subject: [PATCH 133/140] moved sandboxy script to sandbox, removed explicit
 file path

---
 {lib => sandbox}/sweep_perf.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
 rename {lib => sandbox}/sweep_perf.py (86%)

diff --git a/lib/sweep_perf.py b/sandbox/sweep_perf.py
similarity index 86%
rename from lib/sweep_perf.py
rename to sandbox/sweep_perf.py
index 923c2da8bb..4a091f69a4 100755
--- a/lib/sweep_perf.py
+++ b/sandbox/sweep_perf.py
@@ -1,4 +1,4 @@
-#! /w/khmer_dev/bin/python
+#! /usr/bin/env python
 
 import khmer
 import screed
@@ -8,7 +8,7 @@
 R = int(sys.argv[1])
 print R
 K = 20
-test_file = '/w/khmer/tests/test-data/biglump-random-20-a.fa'
+test_file = '../tests/test-data/biglump-random-20-a.fa'
 
 ht = khmer.new_hashbits(K, 1e9, 4)
 ht.consume_fasta_and_tag_with_colors(test_file)

From 769ae394f49f9fb363b2860c0226e8def0205eed Mon Sep 17 00:00:00 2001
From: Camille Scott <camille.scott.w@gmail.com>
Date: Mon, 13 Jan 2014 15:57:15 -0500
Subject: [PATCH 134/140] fixed long lines

---
 scripts/sweep-reads-by-partition-buffered.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py
index 25ecd05c88..c9f7fd7059 100755
--- a/scripts/sweep-reads-by-partition-buffered.py
+++ b/scripts/sweep-reads-by-partition-buffered.py
@@ -15,8 +15,8 @@
 operations needed. Thus, errors opening a file for buffer flush or writing
 a read to a file will not crash the program; instead, if there were errors,
 the user will be warned at the end of execution. Errors with opening read files
-are also handled -- we move on to the next read file if there is an error opening.
-
+are also handled -- we move on to the next read file if there is an error
+opening.
 """
 
 import screed
@@ -41,7 +41,7 @@
 
 def fmt_fasta(name, seq, labels=[]):
     return '>{name}\t{labels}\n{seq}\n'.format(name=name,
-                                               labels='\t'.join([str(l) for l in labels]), seq=seq)
+            labels='\t'.join([str(l) for l in labels]), seq=seq)
 
 
 def write_seq(fp, name, seq, labels=[]):

From c99a4718705fc03752b97af2f36c9612ebc07e2a Mon Sep 17 00:00:00 2001
From: Camille Scott <camille.scott.w@gmail.com>
Date: Mon, 13 Jan 2014 16:00:44 -0500
Subject: [PATCH 135/140] fixed more long lines

---
 scripts/sweep-reads-by-partition-buffered.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py
index c9f7fd7059..2ad1c8e537 100755
--- a/scripts/sweep-reads-by-partition-buffered.py
+++ b/scripts/sweep-reads-by-partition-buffered.py
@@ -219,22 +219,27 @@ def main():
     total_t = time.clock()
     start_t = time.clock()
     for read_file in input_files:
-        print >>sys.stderr, '** sweeping {read_file} for labels...'.format(read_file=read_file)
+        print >>sys.stderr, '** sweeping {read_file} for labels...'.format(
+                                                        read_file=read_file)
         file_t = 0.0
         try:
             read_fp = screed.open(read_file)
         except IOError as e:
             print >>sys.stderr, '!! ERROR: !!', e
-            print >>sys.stderr, '*** Could not open {fn}, skipping...'.format(fn=read_file)
+            print >>sys.stderr, '*** Could not open {fn}, skipping...'.format(
+                                                                 fn=read_file)
         else:
             for n, record in enumerate(read_fp):
                 if n % 50000 == 0:
                     end_t = time.clock()
                     batch_t = end_t - start_t
                     file_t += batch_t
-                    print >>sys.stderr, '\tswept {n} reads [{nc} labeled, {no} orphaned] \
+                    print >>sys.stderr, '\tswept {n} reads [{nc} labeled, \
+                                         {no} orphaned] \
                                         ** {sec}s ({sect}s total)' \
-                                        .format(n=n, nc=n_labeled, no=n_orphaned, sec=batch_t, sect=file_t)
+                                        .format(n=n, nc=n_labeled, 
+                                                no=n_orphaned, 
+                                                sec=batch_t, sect=file_t)
                     start_t = time.clock()
                 seq = record.sequence
                 name = record.name

From 7bbe46007137f2b94e45bc3f50a81a37cec0f96c Mon Sep 17 00:00:00 2001
From: Camille Scott <camille.scott.w@gmail.com>
Date: Mon, 13 Jan 2014 16:30:06 -0500
Subject: [PATCH 136/140] made fastp non-optional, shortened lines, removed
 imports

---
 scripts/sweep-reads-by-partition-buffered.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py
index 2ad1c8e537..bc233694cc 100755
--- a/scripts/sweep-reads-by-partition-buffered.py
+++ b/scripts/sweep-reads-by-partition-buffered.py
@@ -8,7 +8,7 @@
 """
 Find all reads connected to the given contigs on a per-partition basis.
 
-% sweep-reads-by-partition.py -r <range> -i <contigs fastp> \
+% sweep-reads-by-partition.py -r <range> <contigs fastp> \
 <reads1> <reads2> ... <readsN>
 
 This script is very lenient on IO errors, due to the large number of file
@@ -22,11 +22,9 @@
 import screed
 import sys
 import os
-import argparse
 import time
 import khmer
 from khmer.counting_args import build_construct_args, DEFAULT_MIN_HASHSIZE
-from collections import namedtuple as nt
 
 
 DEFAULT_NUM_BUFFERS = 50000
@@ -146,9 +144,9 @@ def flush_all(self):
 
 def main():
 
-    parser = build_construct_args('Takes a partitioned reference file and a list of reads, \
-                                  and sorts reads by which partition they connect to')
-    parser.add_argument('-i', '--input_fastp', dest='input_fastp')
+    parser = build_construct_args('Takes a partitioned reference file \
+                                  and a list of reads, and sorts reads \
+                                  by which partition they connect to')
     parser.add_argument(
         '-r', '--traversal_range', type=int, dest='traversal_range',
         default=DEFAULT_RANGE)
@@ -160,6 +158,7 @@ def main():
                         default=DEFAULT_OUT_PREF)
     parser.add_argument('-m', '--max_buffers', dest='max_buffers', type=int,
                         default=DEFAULT_NUM_BUFFERS)
+    parser.add_argument(dest='input_fastp')
     parser.add_argument('input_files', nargs='+')
     args = parser.parse_args()
 
@@ -205,7 +204,7 @@ def main():
     output_buffer = ReadBufferManager(
         max_buffers, max_reads, buf_size, output_pref, outdir)
 
-        # consume the partitioned fasta with which to label the graph
+    # consume the partitioned fasta with which to label the graph
     ht = khmer.LabelHash(K, HT_SIZE, N_HT)
     print >>sys.stderr, 'consuming fastp...'
     ht.consume_partitioned_fasta_and_tag_with_labels(input_fastp)

From c35ef384535840e7fffad7f9168ec84b38a6c98e Mon Sep 17 00:00:00 2001
From: Camille Scott <camille.scott.w@gmail.com>
Date: Mon, 13 Jan 2014 16:30:32 -0500
Subject: [PATCH 137/140] updated tests to reflect new script option format

---
 tests/test_scripts.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_scripts.py b/tests/test_scripts.py
index 8f585888f6..49fac0ad42 100644
--- a/tests/test_scripts.py
+++ b/tests/test_scripts.py
@@ -1239,7 +1239,7 @@ def test_sweep_reads_by_partition_buffered():
     shutil.copyfile(utils.get_test_data('test-sweep-contigs.fp'), contigfile)
 
     script = scriptpath('sweep-reads-by-partition-buffered.py')
-    args = ['-k', '25', '-o', 'test', '-i',
+    args = ['-k', '25', '-o', 'test',
             contigfile, readfile, 'junkfile.fa']
     status, out, err = runscript(script, args, in_dir)
 

From 69447b6ad00e27a139a121e8b2b241a38a64daa7 Mon Sep 17 00:00:00 2001
From: CS Welcher <cs.welcher@gmail.com>
Date: Mon, 13 Jan 2014 17:09:29 -0500
Subject: [PATCH 138/140] removed deprecated sweep_sequence_for_labels, added
 test for consume_sequence_and_tag_with_labels

---
 lib/labelhash.cc        | 33 ---------------------------------
 lib/labelhash.hh        |  7 +------
 tests/test_labelhash.py | 13 ++++++++++++-
 3 files changed, 13 insertions(+), 40 deletions(-)

diff --git a/lib/labelhash.cc b/lib/labelhash.cc
index ebeb6554b7..be8605fc03 100644
--- a/lib/labelhash.cc
+++ b/lib/labelhash.cc
@@ -295,39 +295,6 @@ void LabelHash::consume_sequence_and_tag_with_labels(const std::string& seq,
       if (found_tags) { found_tags->insert(kmer); }
     }
   printdbg(done with low-level consume)
-  }
-/*
- * Find all labels associated with the sequence
- * For now, check /every/ k-mer with find_all_tags
- */
-unsigned int LabelHash::sweep_sequence_for_labels(const std::string& seq,
-					LabelPtrSet& found_labels,
-					bool break_on_stoptags,
-					bool stop_big_traversals) {
-					
-    SeenSet tagged_kmers;
-    //LabelPtrSet found_labels;
-    
-    HashIntoType kmer_f, kmer_r, kmer;
-    
-    KMerIterator kmers(seq.c_str(), _ksize);
-    std::string kmer_s;
-    // keep a list of kmers which have already been traversed
-    SeenSet traversed_kmers;
-    while (!kmers.done()) {
-      kmer = kmers.next();
-      kmer_s = _revhash(kmer, _ksize);
-      _hash(kmer_s.c_str(), _ksize, kmer_f, kmer_r);
-      
-      // don't even try traversing from k-mers not in the hashtable
-      //traversed_kmers.clear();
-      if (get_count(uniqify_rc(kmer_f,kmer_r))) {
-        partition->find_all_tags(kmer_f, kmer_r, tagged_kmers,
-                   all_tags, break_on_stoptags, stop_big_traversals);
-        traverse_labels_and_resolve(tagged_kmers, found_labels);
-      }
-    }
-    return traversed_kmers.size();
 }
 
 unsigned int LabelHash::sweep_label_neighborhood(const std::string& seq,
diff --git a/lib/labelhash.hh b/lib/labelhash.hh
index 6abaa788b1..cbc7b5aedc 100644
--- a/lib/labelhash.hh
+++ b/lib/labelhash.hh
@@ -128,12 +128,7 @@ namespace khmer {
         TagPtrSet get_label_tags(const Label& label);
 
         void link_tag_and_label(HashIntoType& kmer, Label& label);
-        
-        unsigned int sweep_sequence_for_labels(const std::string& seq,
-                        LabelPtrSet& found_labels,
-                        bool break_on_stoptags,
-                        bool stop_big_traversals);
-                        
+                               
         unsigned int sweep_label_neighborhood(const std::string & seq,
                                                       LabelPtrSet& found_labels,
                                                       unsigned int range,
diff --git a/tests/test_labelhash.py b/tests/test_labelhash.py
index 51431ccd56..1195224e90 100644
--- a/tests/test_labelhash.py
+++ b/tests/test_labelhash.py
@@ -87,13 +87,24 @@ def test_consume_partitioned_fasta_and_tag_with_labels():
     labels = set()
     for record in screed.open(filename):
         seq = record.sequence
-        labels.update(lb.sweep_label_neighborhood(seq, False, False))
+        labels.update(lb.sweep_label_neighborhood(seq, 0, False, False))
     # print lb.n_labels()
     # print labels
     assert len(labels) == 1
     assert labels.pop() == 2L
     assert lb.n_labels() == 1
 
+def test_consume_sequence_and_tag_with_labels():
+    lb = LabelHash(20, 1e6, 4)
+    label = 0L
+    sequence = 'ATGCATCGATCGATCGATCGATCGATCGATCGATCGATCG'
+    
+    n_consumed = lb.consume_sequence_and_tag_with_labels(sequence, label)
+    labels = set()
+    labels.update(lb.sweep_label_neighborhood(sequence))
+
+    assert label in labels
+    assert len(labels) == 1
 
 def test_sweep_tag_neighborhood():
     lb = LabelHash(20, 1e7, 4)

From 037115db77412c14939cb8a603c218b1c0e1ac7d Mon Sep 17 00:00:00 2001
From: CS Welcher <cs.welcher@gmail.com>
Date: Mon, 13 Jan 2014 17:12:19 -0500
Subject: [PATCH 139/140] shortened remaining long lines

---
 scripts/sweep-reads-by-partition-buffered.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py
index bc233694cc..07b5bd8530 100755
--- a/scripts/sweep-reads-by-partition-buffered.py
+++ b/scripts/sweep-reads-by-partition-buffered.py
@@ -106,7 +106,8 @@ def flush_buffer(self, buf_id):
             outfp = open(fpath, 'a')
         except IOError as e:
             print >>sys.stderr, '!! ERROR: {e} !!'.format(e=e)
-            print >>sys.stderr, '*** Failed to open {fn} for buffer flush'.format(fn=fpath)
+            print >>sys.stderr, '*** Failed to open {fn} for \
+                                buffer flush'.format(fn=fpath)
             self.num_file_errors += 1
         else:
             buf = self.buffers[buf_id]
@@ -187,7 +188,8 @@ def main():
         print >>sys.stderr, ''
         print >>sys.stderr, \
             'Estimated memory usage is {prod:.2g} bytes \
-            (n_hashes x min_hashsize / 8)'.format(prod=args.n_hashes * HT_SIZE / 8)
+            (n_hashes x min_hashsize / 8)'.format(
+                            prod=args.n_hashes * HT_SIZE / 8)
         print >>sys.stderr, '-' * 8
 
     traversal_range = args.traversal_range

From 3c4158f8174c77cc8b858e34352ae3da59fa4ff3 Mon Sep 17 00:00:00 2001
From: CS Welcher <cs.welcher@gmail.com>
Date: Mon, 13 Jan 2014 17:52:05 -0500
Subject: [PATCH 140/140] greatly expounding on help

---
 scripts/sweep-reads-by-partition-buffered.py | 25 +++++++++++++++-----
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py
index 07b5bd8530..456127c3b6 100755
--- a/scripts/sweep-reads-by-partition-buffered.py
+++ b/scripts/sweep-reads-by-partition-buffered.py
@@ -10,6 +10,13 @@
 
 % sweep-reads-by-partition.py -r <range> <contigs fastp> \
 <reads1> <reads2> ... <readsN>
+"""
+
+epilog = """
+Output will be a collection of files corresponding to the partitions;
+each partition gets a file (prefixed with the output prefix option), 
+which means this could output many tens or hundreds of thousands of files. 
+Users should plan accordingly.
 
 This script is very lenient on IO errors, due to the large number of file
 operations needed. Thus, errors opening a file for buffer flush or writing
@@ -148,19 +155,25 @@ def main():
     parser = build_construct_args('Takes a partitioned reference file \
                                   and a list of reads, and sorts reads \
                                   by which partition they connect to')
+    parser.epilog = epilog
     parser.add_argument(
         '-r', '--traversal_range', type=int, dest='traversal_range',
         default=DEFAULT_RANGE)
     parser.add_argument('-b', '--buffer_size', dest='max_reads', type=int,
-                        default=DEFAULT_MAX_READS)
+                        default=DEFAULT_MAX_READS,
+                        help='Max total reads to buffer before flushing')
     parser.add_argument('-l', '--buffer_length', dest='buffer_size', type=int,
-                        default=DEFAULT_BUFFER_SIZE)
+                        default=DEFAULT_BUFFER_SIZE,
+                        help='Max length of an individual label buffer \
+                              before flushing')
     parser.add_argument('-o', '--output_prefix', dest='output_prefix',
-                        default=DEFAULT_OUT_PREF)
+                        default=DEFAULT_OUT_PREF,
+                        help='Prefix for sorted read files')
     parser.add_argument('-m', '--max_buffers', dest='max_buffers', type=int,
-                        default=DEFAULT_NUM_BUFFERS)
-    parser.add_argument(dest='input_fastp')
-    parser.add_argument('input_files', nargs='+')
+                        default=DEFAULT_NUM_BUFFERS,
+                        help='Max individual label buffers before flushing')
+    parser.add_argument(dest='input_fastp', help='Partitioned reference fasta')
+    parser.add_argument('input_files', nargs='+', help='Reads to be swept/sorted')
     args = parser.parse_args()
 
     K = args.ksize