[MRG] remove redundant ACGT-checking code. #1590

Merged: 55 commits, Jun 2, 2017.

Commits
83c755a remove unneeded check_and_normalize_read calls (ctb, Jan 24, 2017)
7d46f07 remove unneeded check_and_process_read code (ctb, Jan 24, 2017)
db51c34 Merge branch 'master' of github.com:dib-lab/khmer into remove/is_vali… (ctb, Feb 14, 2017)
ffe5076 Merge branch 'master' of github.com:dib-lab/khmer into remove/is_vali… (ctb, Feb 14, 2017)
b9b708d remove check_and_normalize_read calls from abundance_distribution fun… (ctb, Feb 14, 2017)
976cf30 Merge branch 'master' of github.com:dib-lab/khmer into remove/is_vali… (ctb, Feb 19, 2017)
55f36e1 remove outdated comment ref merge (ctb, Feb 19, 2017)
4cbe6bd fix another comment (ctb, Feb 19, 2017)
a34ae08 Merge branch 'remove/is_valid_dna_tests' into remove/is_valid_dna (ctb, Feb 19, 2017)
26180cd update consume_seqfile to use cleaned_seq (ctb, Feb 19, 2017)
fde5960 Merge branch 'remove/is_valid_dna_tests' into remove/is_valid_dna (ctb, Feb 19, 2017)
8029d19 update for add'l tests (ctb, Feb 19, 2017)
19e014a Merge branch 'remove/is_valid_dna_tests' into remove/is_valid_dna (ctb, Feb 20, 2017)
1b5c03f abundance foo (ctb, Feb 20, 2017)
f91ebee Merge branch 'remove/is_valid_dna_tests' into remove/is_valid_dna (ctb, Feb 20, 2017)
0aa103c made output_partitioned_file use cleaned_seq (ctb, Feb 20, 2017)
e687229 switch to using kmer iterator in output_partitioned_file (ctb, Feb 20, 2017)
fe3d9cb Merge branch 'remove/is_valid_dna_tests' into remove/is_valid_dna (ctb, Feb 20, 2017)
0f1bf48 Merge branch 'master' of github.com:dib-lab/khmer into remove/is_vali… (ctb, Feb 25, 2017)
bfeebef remove check_and_normalize_read from trim_on_stoptags (ctb, Feb 25, 2017)
efd2bb3 Merge branch 'remove/is_valid_dna_tests' into remove/is_valid_dna (ctb, Feb 25, 2017)
6d97388 Merge branch 'remove/is_valid_dna_tests' into remove/is_valid_dna (ctb, Feb 25, 2017)
0cdd034 split tests out to test_sequence_validation (ctb, Feb 25, 2017)
f63a4d8 Merge branch 'remove/is_valid_dna_tests' into remove/is_valid_dna (ctb, Feb 25, 2017)
c7123da Merge branch 'remove/is_valid_dna_tests' into remove/is_valid_dna (ctb, Feb 25, 2017)
cad8213 add exceptions (ctb, Feb 25, 2017)
c269436 Merge remote-tracking branch 'origin/master' into remove/is_valid_dna (ctb, Mar 19, 2017)
990dc37 fix setup.py for offline operation (ctb, Mar 19, 2017)
85a730e Merge branch 'remove/is_valid_dna_tests' into remove/is_valid_dna (ctb, Mar 19, 2017)
264dc29 Merge branch 'remove/is_valid_dna_tests' into remove/is_valid_dna (ctb, Mar 19, 2017)
416f94b remove check_and_normalize_read from hashgraph (ctb, Mar 19, 2017)
cec6243 Merge branch 'remove/is_valid_dna_tests' into remove/is_valid_dna (ctb, Mar 19, 2017)
7375dbd fix problems revealed by larger ksize (ctb, Mar 19, 2017)
12aa4d8 Merge branch 'remove/is_valid_dna_tests' into remove/is_valid_dna (ctb, Mar 19, 2017)
06342dc updated read cleaning in labelhash & tests (ctb, Mar 19, 2017)
8b34213 removed check_and_normalize_read, yay (ctb, Mar 19, 2017)
b17f386 uncomment pytest-runner req (ctb, Mar 19, 2017)
3164408 Merge branch 'remove/is_valid_dna_tests' into remove/is_valid_dna (ctb, Mar 19, 2017)
8b846a1 Merge branch 'remove/is_valid_dna_tests' into remove/is_valid_dna (ctb, Mar 19, 2017)
f708a76 switch to countingtype (ctb, Mar 19, 2017)
b2d0838 switch to countingtype x 2 (ctb, Mar 19, 2017)
10ce9bd update trim functions on bad dna to be properly undefined (ctb, Mar 19, 2017)
98cd756 Merge branch 'remove/is_valid_dna_tests' into remove/is_valid_dna (ctb, Mar 19, 2017)
8a683c8 Merge branch 'fix/consume_partitioned_err' into remove/is_valid_dna (ctb, Mar 19, 2017)
fc63010 Merge branch 'remove/is_valid_dna_tests' into remove/is_valid_dna (ctb, Mar 19, 2017)
01c3b44 Merge branch 'fix/consume_partitioned_err' into remove/is_valid_dna (ctb, Mar 19, 2017)
cb3d6cc Merge branch 'remove/is_valid_dna_tests' into remove/is_valid_dna (ctb, Mar 19, 2017)
9a5080b add partition ID to new 'bad' sequence (ctb, Mar 19, 2017)
3f6a673 adjust tests for new 'bad dna' sequence (ctb, Mar 19, 2017)
d4eb8a5 Merge branch 'remove/is_valid_dna_tests' into remove/is_valid_dna (ctb, Mar 19, 2017)
cf8ebc0 Merge branch 'master' of github.com:dib-lab/khmer into remove/is_vali… (ctb, Mar 20, 2017)
122258c Merge branch 'master' into remove/is_valid_dna (betatim, Mar 22, 2017)
c031499 Merge branch 'master' of github.com:dib-lab/khmer into remove/is_vali… (ctb, Jun 2, 2017)
4b3d4ed update Changelog (ctb, Jun 2, 2017)
1a54b45 grr typo (ctb, Jun 2, 2017)
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -7,12 +7,15 @@ The khmer project's command line scripts adhere to
under semantic versioning, but will be in future versions of khmer.

## [Unreleased]

### Added
- Cython wrapper for liboxli.
- Cython containers for parsing, assembly, and hashing.
- Header install for liboxli.

### Changed
- Non-ACTG handling significantly changed so that only bulk-loading functions
"clean" sequences of non-DNA characters. See #1590 for details.
- Split CPython wrapper file into per-class files under src/khmer and include/khmer.
- Moved liboxli headers to include/oxli and implementations to src/oxli.
- Removed CPython assembler wrappers.
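The CHANGELOG entry above captures the new contract: the per-call ACGT checks inside the table classes are gone, and bulk-loading code instead works from a cleaned copy of each read (read.set_clean_seq() followed by read.cleaned_seq in the diffs below). The standalone sketch that follows is not khmer code; clean_sequence() is a hypothetical helper, and substituting 'A' for non-DNA characters is an assumed policy used only to make the example concrete. Trim and error-finding functions, by contrast, no longer clean or validate at all; see the note after the hashtable.cc diff.

// Illustrative sketch only; the real cleanup lives in Read::set_clean_seq().
// clean_sequence() and the 'A' substitution are assumptions for this example,
// not the library's actual rule.
#include <iostream>
#include <string>
#include <cctype>

std::string clean_sequence(std::string seq)
{
    for (char &c : seq) {
        c = std::toupper(static_cast<unsigned char>(c));   // normalize case
        if (c != 'A' && c != 'C' && c != 'G' && c != 'T') {
            c = 'A';                                        // assumed placeholder
        }
    }
    return seq;
}

int main()
{
    // Bulk loaders clean once up front; every consumer then uses the cleaned copy.
    std::cout << clean_sequence("acgtNNNacgt") << "\n";     // prints ACGTAAAACGT
    return 0;
}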
7 changes: 0 additions & 7 deletions include/oxli/hashtable.hh
@@ -260,13 +260,6 @@ public:
// count every k-mer in the string.
unsigned int consume_string(const std::string &s);

// checks each read for non-ACGT characters
bool check_and_normalize_read(std::string &read) const;

// check each read for non-ACGT characters, and then consume it.
unsigned int check_and_process_read(std::string &read,
bool &is_valid);

// Count every k-mer in a file containing nucleotide sequences.
template<typename SeqIO>
void consume_seqfile(
38 changes: 16 additions & 22 deletions src/oxli/hashgraph.cc
@@ -309,13 +309,12 @@ void Hashgraph::consume_seqfile_and_tag(
break;
}

if (check_and_normalize_read( read.sequence )) {
unsigned long long this_n_consumed = 0;
consume_sequence_and_tag( read.sequence, this_n_consumed );
read.set_clean_seq();
unsigned long long this_n_consumed = 0;
consume_sequence_and_tag(read.cleaned_seq, this_n_consumed);

__sync_add_and_fetch( &n_consumed, this_n_consumed );
__sync_add_and_fetch( &total_reads, 1 );
}
__sync_add_and_fetch(&n_consumed, this_n_consumed);
__sync_add_and_fetch(&total_reads, 1);
} // while reads left for parser

}
@@ -373,21 +372,20 @@ void Hashgraph::consume_partitioned_fasta(
} catch (NoMoreReadsAvailable &exc) {
break;
}
seq = read.sequence;
read.set_clean_seq();
seq = read.cleaned_seq;

if (check_and_normalize_read(seq)) {
// First, figure out what the partition is (if non-zero), and save that.
PartitionID p = _parse_partition_id(read.name);
// First, figure out what the partition is (if non-zero), and save that.
PartitionID p = _parse_partition_id(read.name);

// Then consume the sequence
n_consumed += consume_string(seq); // @CTB why are we doing this?
// Then consume the sequence
n_consumed += consume_string(seq); // @CTB why are we doing this?

// Next, compute the tag & set the partition, if nonzero
HashIntoType kmer = hash_dna(seq.c_str());
all_tags.insert(kmer);
if (p > 0) {
partition->set_partition_id(kmer, p);
}
// Next, compute the tag & set the partition, if nonzero
HashIntoType kmer = hash_dna(seq.c_str());
all_tags.insert(kmer);
if (p > 0) {
partition->set_partition_id(kmer, p);
}

// reset the sequence info, increment read number
@@ -467,10 +465,6 @@ unsigned int Hashgraph::kmer_degree(const char * kmer_s)

size_t Hashgraph::trim_on_stoptags(std::string seq) const
{
if (!check_and_normalize_read(seq)) {
return 0;
}

KmerIterator kmers(seq.c_str(), _ksize);

size_t i = _ksize - 2;
81 changes: 11 additions & 70 deletions src/oxli/hashtable.cc
@@ -57,51 +57,6 @@ using namespace oxli;
using namespace oxli:: read_parsers;


//
// check_and_process_read: checks for non-ACGT characters before consuming
//

unsigned int Hashtable::check_and_process_read(std::string &read,
bool &is_valid)
{
is_valid = check_and_normalize_read(read);

if (!is_valid) {
return 0;
}

return consume_string(read);
}

//
// check_and_normalize_read: checks for non-ACGT characters
// converts lowercase characters to uppercase one
// Note: Usually it is desirable to keep checks and mutations separate.
// However, in the interests of efficiency (we are potentially working
// with TB of data), a check and mutation have been placed inside the
// same loop. Potentially trillions fewer fetches from memory would
// seem to be a worthwhile goal.
//

bool Hashtable::check_and_normalize_read(std::string &read) const
{
bool rc = true;

if (read.length() < _ksize) {
return false;
}

for (unsigned int i = 0; i < read.length(); i++) {
read[ i ] &= 0xdf; // toupper - knock out the "lowercase bit"
if (!is_valid_dna( read[ i ] )) {
rc = false;
break;
}
}

return rc;
}

//
// consume_seqfile: consume a file of reads
//
@@ -136,8 +91,8 @@ void Hashtable::consume_seqfile(
break;
}

unsigned int this_n_consumed =
check_and_process_read(read.sequence, is_valid);
read.set_clean_seq();
unsigned int this_n_consumed = consume_string(read.cleaned_seq);

__sync_add_and_fetch( &n_consumed, this_n_consumed );
__sync_add_and_fetch( &total_reads, 1 );
@@ -350,20 +305,19 @@ uint64_t * Hashtable::abundance_distribution(
} catch (NoMoreReadsAvailable &exc) {
break;
}
seq = read.sequence;
read.set_clean_seq();
seq = read.cleaned_seq;

if (check_and_normalize_read(seq)) {
KmerHashIteratorPtr kmers = new_kmer_iterator(seq);
KmerHashIteratorPtr kmers = new_kmer_iterator(seq);

while(!kmers->done()) {
HashIntoType kmer = kmers->next();
while(!kmers->done()) {
HashIntoType kmer = kmers->next();

if (!tracking->get_count(kmer)) {
tracking->count(kmer);
if (!tracking->get_count(kmer)) {
tracking->count(kmer);

BoundedCounterType n = get_count(kmer);
dist[n]++;
}
BoundedCounterType n = get_count(kmer);
dist[n]++;
}

name.clear();
@@ -387,10 +341,6 @@ unsigned long Hashtable::trim_on_abundance(
BoundedCounterType min_abund)
const
{
if (!check_and_normalize_read(seq)) {
return 0;
}

KmerHashIteratorPtr kmers = new_kmer_iterator(seq);

HashIntoType kmer;
@@ -422,12 +372,7 @@ unsigned long Hashtable::trim_below_abundance(
BoundedCounterType max_abund)
const
{
if (!check_and_normalize_read(seq)) {
return 0;
}

KmerHashIteratorPtr kmers = new_kmer_iterator(seq);

HashIntoType kmer;

if (kmers->done()) {
@@ -458,10 +403,6 @@ std::vector<unsigned int> Hashtable::find_spectral_error_positions(
const
{
std::vector<unsigned int> posns;
if (!check_and_normalize_read(seq)) {
throw oxli_exception("invalid read");
}

KmerHashIteratorPtr kmers = new_kmer_iterator(seq);

HashIntoType kmer = kmers->next();
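Taken together, the hashtable.cc changes above remove every internal guard: trim_on_abundance, trim_below_abundance, and find_spectral_error_positions now assume clean input, and their behavior on non-ACGT sequence is deliberately left undefined (commit 10ce9bd). A caller that cannot guarantee clean reads should check before trimming. The sketch below is a hypothetical caller-side guard, not oxli API; looks_like_clean_dna() simply mirrors what the removed check_and_normalize_read used to enforce (uppercase A/C/G/T only, at least one full k-mer of sequence).

// Hypothetical caller-side guard, not part of oxli; mirrors the removed
// check_and_normalize_read: uppercase A/C/G/T only, length >= ksize.
#include <iostream>
#include <string>

bool looks_like_clean_dna(const std::string &seq, size_t ksize)
{
    if (seq.length() < ksize) {
        return false;                       // too short to hold a single k-mer
    }
    for (char c : seq) {
        if (c != 'A' && c != 'C' && c != 'G' && c != 'T') {
            return false;                   // lowercase or non-DNA character
        }
    }
    return true;
}

int main()
{
    const size_t k = 21;
    const std::string seq = "ACGTNACGTACGTACGTACGTACGT";
    if (!looks_like_clean_dna(seq, k)) {
        // After this PR the trim_* functions assume clean input,
        // so rejecting (or cleaning) the read is the caller's job.
        std::cout << "skipping unclean or too-short read\n";
    }
    return 0;
}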
48 changes: 24 additions & 24 deletions src/oxli/labelhash.cc
@@ -106,21 +106,21 @@ void LabelHash::consume_seqfile_and_tag_with_labels(
break;
}

if (graph->check_and_normalize_read( read.sequence )) {
// TODO: make threadsafe!
unsigned long long this_n_consumed = 0;
consume_sequence_and_tag_with_labels( read.sequence,
this_n_consumed,
the_label );
the_label++;
read.set_clean_seq();

// TODO: make threadsafe!
unsigned long long this_n_consumed = 0;
consume_sequence_and_tag_with_labels( read.cleaned_seq,
this_n_consumed,
the_label );
the_label++;

#if (0) // Note: Used with callback - currently disabled.
n_consumed_LOCAL = __sync_add_and_fetch( &n_consumed, this_n_consumed );
n_consumed_LOCAL = __sync_add_and_fetch( &n_consumed, this_n_consumed );
#else
__sync_add_and_fetch( &n_consumed, this_n_consumed );
__sync_add_and_fetch( &n_consumed, this_n_consumed );
#endif
__sync_add_and_fetch( &total_reads, 1 );
}
__sync_add_and_fetch( &total_reads, 1 );

// TODO: Figure out alternative to callback into Python VM
// Cannot use in multi-threaded operation.
@@ -169,19 +169,19 @@ void LabelHash::consume_partitioned_fasta_and_tag_with_labels(
PartitionID p;
while(!parser->is_complete()) {
read = parser->get_next_read();
seq = read.sequence;

if (graph->check_and_normalize_read(seq)) {
// First, figure out what the partition is (if non-zero), and
// save that.
printdbg(parsing partition id)
p = _parse_partition_id(read.name);
printdbg(consuming sequence and tagging)
consume_sequence_and_tag_with_labels( seq,
n_consumed,
p );
printdbg(back in consume_partitioned)
}

read.set_clean_seq();
seq = read.cleaned_seq;

// First, figure out what the partition is (if non-zero), and
// save that.
printdbg(parsing partition id)
p = _parse_partition_id(read.name);
printdbg(consuming sequence and tagging)
consume_sequence_and_tag_with_labels( seq,
n_consumed,
p );
printdbg(back in consume_partitioned)

// reset the sequence info, increment read number
total_reads++;
92 changes: 45 additions & 47 deletions src/oxli/subset.cc
@@ -147,61 +147,59 @@ size_t SubsetPartition::output_partitioned_file(
break;
}

seq = read.sequence;

if (_ht->check_and_normalize_read(seq)) {
const char * kmer_s = seq.c_str();

bool found_tag = false;
for (unsigned int i = 0; i < seq.length() - ksize + 1; i++) {
kmer = _ht->hash_dna(kmer_s + i);

// is this a known tag?
if (set_contains(partition_map, kmer)) {
found_tag = true;
break;
}
read.set_clean_seq();
seq = read.cleaned_seq;

bool found_tag = false;
KmerHashIteratorPtr kmers = _ht->new_kmer_iterator(read.cleaned_seq);
while (!kmers->done()) {
kmer = kmers->next();

// is this a known tag?
if (set_contains(partition_map, kmer)) {
found_tag = true;
break;
}
}

// all sequences should have at least one tag in them.
// assert(found_tag); @CTB currently breaks tests. give fn flag
// to disable.
// all sequences should have at least one tag in them.
// assert(found_tag); @CTB currently breaks tests. give fn flag
// to disable.

PartitionID partition_id = 0;
if (found_tag) {
PartitionID * partition_p = partition_map[kmer];
if (partition_p == NULL ) {
partition_id = 0;
n_singletons++;
} else {
partition_id = *partition_p;
partitions.insert(partition_id);
}
PartitionID partition_id = 0;
if (found_tag) {
PartitionID * partition_p = partition_map[kmer];
if (partition_p == NULL ) {
partition_id = 0;
n_singletons++;
} else {
partition_id = *partition_p;
partitions.insert(partition_id);
}
}

if (partition_id > 0 || output_unassigned) {
if (read.quality.length()) { // FASTQ
outfile << "@" << read.name << "\t" << partition_id
<< "\n";
outfile << seq << "\n+\n";
outfile << read.quality << "\n";
} else { // FASTA
outfile << ">" << read.name << "\t" << partition_id;
outfile << "\n" << seq << "\n";
}
if (partition_id > 0 || output_unassigned) {
if (read.quality.length()) { // FASTQ
outfile << "@" << read.name << "\t" << partition_id
<< "\n";
outfile << seq << "\n+\n";
outfile << read.quality << "\n";
} else { // FASTA
outfile << ">" << read.name << "\t" << partition_id;
outfile << "\n" << seq << "\n";
}
}

total_reads++;
total_reads++;

// run callback, if specified
if (total_reads % CALLBACK_PERIOD == 0 && callback) {
try {
callback("output_partitions", callback_data,
total_reads, reads_kept);
} catch (...) {
outfile.close();
throw;
}
// run callback, if specified
if (total_reads % CALLBACK_PERIOD == 0 && callback) {
try {
callback("output_partitions", callback_data,
total_reads, reads_kept);
} catch (...) {
outfile.close();
throw;
}
}
}
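One design note on the subset.cc change above: output_partitioned_file now walks k-mers through the hashtable's KmerHashIteratorPtr instead of hashing each substring position by hand, which keeps k-mer enumeration and hashing policy in one place. The standalone sketch below imitates only the done()/next() loop shape; KmerStringIterator is a toy stand-in that yields raw k-mer strings, not oxli's iterator or its hash values.

// Toy illustration of the done()/next() iterator idiom adopted above.
// KmerStringIterator is a stand-in, not oxli's KmerHashIterator.
#include <iostream>
#include <string>

class KmerStringIterator
{
    const std::string &seq_;
    size_t ksize_;
    size_t pos_ = 0;
public:
    KmerStringIterator(const std::string &seq, size_t ksize)
        : seq_(seq), ksize_(ksize) {}

    bool done() const { return pos_ + ksize_ > seq_.length(); }

    std::string next() { return seq_.substr(pos_++, ksize_); }   // k-mer at current offset
};

int main()
{
    const std::string seq = "ACGTACGGT";
    KmerStringIterator kmers(seq, 4);

    // Same loop shape as the tag search in output_partitioned_file:
    // pull k-mers until exhausted, breaking early when one matches.
    while (!kmers.done()) {
        std::string kmer = kmers.next();
        std::cout << kmer << "\n";
        if (kmer == "TACG") {
            break;                          // e.g. "found a known tag"
        }
    }
    return 0;
}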