From 2ee322c84a6e3e8f7bdbe0682e1d7decd012fec1 Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Thu, 25 Jul 2013 18:09:50 -0400 Subject: [PATCH 001/140] added threading params to filter-abund.py --- scripts/filter-abund.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/filter-abund.py b/scripts/filter-abund.py index 20dea0343a..3e836f3b42 100755 --- a/scripts/filter-abund.py +++ b/scripts/filter-abund.py @@ -11,7 +11,7 @@ import os import khmer from khmer.thread_utils import ThreadedSequenceProcessor, verbose_loader - +from khmer import threading_args as targs from khmer.counting_args import build_counting_multifile_args ### @@ -21,6 +21,7 @@ def main(): parser = build_counting_multifile_args() + targs.add_threading_args(parser) parser.add_argument('--cutoff', '-C', dest='cutoff', default=DEFAULT_CUTOFF, type=int, help="Trim at k-mers below this abundance.") @@ -35,6 +36,7 @@ def main(): counting_ht = args.input_table infiles = args.input_filenames + n_threads = int(args.n_threads) print 'file with ht: %s' % counting_ht @@ -69,7 +71,7 @@ def process_fn(record): outfile = os.path.basename(infile) + '.abundfilt' outfp = open(outfile, 'w') - tsp = ThreadedSequenceProcessor(process_fn) + tsp = ThreadedSequenceProcessor(process_fn, n_workers=n_threads) tsp.start(verbose_loader(infile), outfp) print 'output in', outfile From c9467fb56d90b9cf19884535b3d2f6a388928c81 Mon Sep 17 00:00:00 2001 From: CS Welcher Date: Wed, 4 Sep 2013 15:02:56 -0400 Subject: [PATCH 002/140] Added typedefs to hashtable.hh --- lib/hashtable.hh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/hashtable.hh b/lib/hashtable.hh index 84a711d59f..7104527590 100644 --- a/lib/hashtable.hh +++ b/lib/hashtable.hh @@ -34,6 +34,9 @@ namespace khmer { typedef std::map PartitionCountMap; typedef std::map PartitionCountDistribution; + typedef unsigned int Color; + typedef std::multimap TagColorMap; + struct HashTablePerformanceMetrics : public IPerformanceMetrics { From 
05edfb8b8da1b0204cdbc7a2fdc5e2b34055bc56 Mon Sep 17 00:00:00 2001 From: CS Welcher Date: Wed, 4 Sep 2013 15:29:37 -0400 Subject: [PATCH 003/140] Added function prototype for consume and tag with colors --- lib/hashbits.cc | 63 +++++++++++++++++++++++++++++++++++++++++++++++++ lib/hashbits.hh | 7 ++++-- 2 files changed, 68 insertions(+), 2 deletions(-) diff --git a/lib/hashbits.cc b/lib/hashbits.cc index 0300033df0..3b55116245 100644 --- a/lib/hashbits.cc +++ b/lib/hashbits.cc @@ -467,6 +467,69 @@ void Hashbits::consume_sequence_and_tag(const std::string& seq, } } +/* This is essentially the same code as above, only it assigns colors to the + * tags through a multimap + */ +void Hashbits::consume_sequence_and_tag_with_colors(const std::string& seq, + unsigned long long& n_consumed, + SeenSet * found_tags) +{ + bool is_new_kmer; + bool kmer_tagged; + + KMerIterator kmers(seq.c_str(), _ksize); + HashIntoType kmer; + + unsigned int since = _tag_density / 2 + 1; + + while(!kmers.done()) { + kmer = kmers.next(); + + if ((is_new_kmer = test_and_set_bits( kmer ))) + ++n_consumed; + +#if (1) + if (is_new_kmer) ++since; + else + { + ACQUIRE_ALL_TAGS_SPIN_LOCK + kmer_tagged = set_contains(all_tags, kmer); + RELEASE_ALL_TAGS_SPIN_LOCK + if (kmer_tagged) + { + since = 1; + if (found_tags) { found_tags->insert(kmer); } + } + else ++since; + } +#else + if (!is_new_kmer && set_contains(all_tags, kmer)) { + since = 1; + if (found_tags) { found_tags->insert(kmer); } + } else { + since++; + } +#endif + + if (since >= _tag_density) { + ACQUIRE_ALL_TAGS_SPIN_LOCK + all_tags.insert(kmer); + RELEASE_ALL_TAGS_SPIN_LOCK + if (found_tags) { found_tags->insert(kmer); } + since = 1; + } + + } // iteration over kmers + + if (since >= _tag_density/2 - 1) { + ACQUIRE_ALL_TAGS_SPIN_LOCK + all_tags.insert(kmer); // insert the last k-mer, too. 
+ RELEASE_ALL_TAGS_SPIN_LOCK + if (found_tags) { found_tags->insert(kmer); } + } +} + + // // consume_fasta_and_tag_with_stoptags: consume a FASTA file of reads, // tagging reads every so often. Do not insert matches to stoptags, diff --git a/lib/hashbits.hh b/lib/hashbits.hh index 2f628ece5b..ebcf3fd2d5 100644 --- a/lib/hashbits.hh +++ b/lib/hashbits.hh @@ -169,8 +169,11 @@ namespace khmer { void consume_sequence_and_tag(const std::string& seq, unsigned long long& n_consumed, SeenSet * new_tags = 0); - - + + void consume_sequence_and_tag_with_colors(const std::string& seq, + unsigned long long& n_consumed, + SeenSet * found_tags) + void consume_fasta_and_tag_with_stoptags(const std::string &filename, unsigned int &total_reads, unsigned long long &n_consumed, From 0e92a8d3686a40d3c0dab27fd920166dbb0330a8 Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Thu, 5 Sep 2013 18:36:06 -0400 Subject: [PATCH 004/140] added code to insert colors into colormap --- lib/hashbits.cc | 43 +++++++++++++++++++++++++++++++------------ lib/hashbits.hh | 23 ++++++++++++++++++++++- lib/hashtable.hh | 1 + 3 files changed, 54 insertions(+), 13 deletions(-) diff --git a/lib/hashbits.cc b/lib/hashbits.cc index 3b55116245..574e07b856 100644 --- a/lib/hashbits.cc +++ b/lib/hashbits.cc @@ -468,10 +468,12 @@ void Hashbits::consume_sequence_and_tag(const std::string& seq, } /* This is essentially the same code as above, only it assigns colors to the - * tags through a multimap + * tags through multimap TagColorMap defined in hashtable.hh, declared in + * hashbits.hh */ void Hashbits::consume_sequence_and_tag_with_colors(const std::string& seq, unsigned long long& n_consumed, + Color& current_color, SeenSet * found_tags) { bool is_new_kmer; @@ -489,19 +491,27 @@ void Hashbits::consume_sequence_and_tag_with_colors(const std::string& seq, ++n_consumed; #if (1) - if (is_new_kmer) ++since; - else - { + if (is_new_kmer) { + ++since; + } else { ACQUIRE_ALL_TAGS_SPIN_LOCK kmer_tagged = 
set_contains(all_tags, kmer); RELEASE_ALL_TAGS_SPIN_LOCK - if (kmer_tagged) - { - since = 1; - if (found_tags) { found_tags->insert(kmer); } - } - else ++since; - } + if (kmer_tagged) { + since = 1; + + // Coloring code + // TODO: MAKE THREADSAFE! + + if (!_map_contains(color_map, kmer, current_color)) { + color_map.insert(TagColorPair(kmer, current_color)) + } + if (found_tags) { + found_tags->insert(kmer); + } + } else ++since; + } + // Should I bother adding new code down here? #else if (!is_new_kmer && set_contains(all_tags, kmer)) { since = 1; @@ -510,11 +520,16 @@ void Hashbits::consume_sequence_and_tag_with_colors(const std::string& seq, since++; } #endif - + // if (since >= _tag_density) { ACQUIRE_ALL_TAGS_SPIN_LOCK all_tags.insert(kmer); RELEASE_ALL_TAGS_SPIN_LOCK + + // Coloring code + // TODO: MAKE THREADSAFE! + color_map.insert(TagColorPair(kmer, current_color)) + if (found_tags) { found_tags->insert(kmer); } since = 1; } @@ -525,6 +540,10 @@ void Hashbits::consume_sequence_and_tag_with_colors(const std::string& seq, ACQUIRE_ALL_TAGS_SPIN_LOCK all_tags.insert(kmer); // insert the last k-mer, too. RELEASE_ALL_TAGS_SPIN_LOCK + + // Color code: TODO: MAKE THREADSAFE! 
+ color_map.insert(TagColorPair(kmer, current_color)) + if (found_tags) { found_tags->insert(kmer); } } } diff --git a/lib/hashbits.hh b/lib/hashbits.hh index ebcf3fd2d5..6b54b5dcf7 100644 --- a/lib/hashbits.hh +++ b/lib/hashbits.hh @@ -27,6 +27,8 @@ namespace khmer { HashIntoType _n_overlap_kmers; Byte ** _counts; + Color _tag_color; + virtual void _allocate_counters() { _n_tables = _tablesizes.size(); @@ -49,6 +51,20 @@ namespace khmer { partition->_clear_all_partitions(); } } + + + // Check if the given TagToColorMap already has the tag with the given color + bool _map_contains(TagToColorMap& cmap, + HashIntoType& kmer, + Color& the_color) + { + std::pair ret; + ret = cmap->equal_range(kmer); + for (TagToColorMap::iterator it=ret.first; it!=ret.second; ++it) { + if (it->second == the_color) return true; + } + return false; + } uint32_t _all_tags_spin_lock; @@ -57,6 +73,8 @@ namespace khmer { SeenSet all_tags; SeenSet stop_tags; SeenSet repart_small_tags; + TagToColorMap color_map; + void _validate_pmap() { if (partition) { partition->_validate_pmap(); } @@ -74,6 +92,8 @@ namespace khmer { _n_unique_kmers = 0; _n_overlap_kmers = 0; + _tag_color = 0; + _allocate_counters(); } @@ -172,7 +192,8 @@ namespace khmer { void consume_sequence_and_tag_with_colors(const std::string& seq, unsigned long long& n_consumed, - SeenSet * found_tags) + Color& current_color, + SeenSet * new_tags = 0) void consume_fasta_and_tag_with_stoptags(const std::string &filename, unsigned int &total_reads, diff --git a/lib/hashtable.hh b/lib/hashtable.hh index 7104527590..51a4c0e174 100644 --- a/lib/hashtable.hh +++ b/lib/hashtable.hh @@ -36,6 +36,7 @@ namespace khmer { typedef unsigned int Color; typedef std::multimap TagColorMap; + typedef std::pair TagColorPair; struct HashTablePerformanceMetrics : public IPerformanceMetrics { From 1465c745b44ccd6efa2596bb0da1a50991fc76ee Mon Sep 17 00:00:00 2001 From: CS Date: Fri, 6 Sep 2013 00:45:28 -0400 Subject: [PATCH 005/140] added consume_fasta 
functions --- lib/hashbits.cc | 118 ++++++++++++++++++++++++++++++++++++++++++++++++ lib/hashbits.hh | 16 +++++++ 2 files changed, 134 insertions(+) diff --git a/lib/hashbits.cc b/lib/hashbits.cc index 574e07b856..fa605ab032 100644 --- a/lib/hashbits.cc +++ b/lib/hashbits.cc @@ -467,6 +467,124 @@ void Hashbits::consume_sequence_and_tag(const std::string& seq, } } +/* + * Pretty much copy-pasta from the above functions + * Might be time for a refactor: could do a general consume_fasta + * function which accepts a consume_sequence function pointer as a parameter + */ + +void +Hashbits:: +consume_fasta_and_tag_with_colors( + std:: string const &filename, + unsigned int &total_reads, unsigned long long &n_consumed, + CallbackFn callback, void * callback_data +) +{ + khmer:: Config &the_config = khmer:: get_active_config( ); + + // Note: Always assume only 1 thread if invoked this way. + IParser * parser = + IParser::get_parser( + filename, 1, the_config.get_reads_input_buffer_size( ), + the_config.get_reads_parser_trace_level( ) + ); + + + consume_fasta_and_tag_with_colors( + parser, + total_reads, n_consumed, + callback, callback_data + ); + + delete parser; +} + +void +Hashbits:: +consume_fasta_and_tag_with_colors( + read_parsers:: IParser * parser, + unsigned int &total_reads, unsigned long long &n_consumed, + CallbackFn callback, void * callback_data +) +{ + Hasher &hasher = + _get_hasher( parser->uuid( ) ); + unsigned int total_reads_LOCAL = 0; +#if (0) // Note: Used with callback - currently disabled. + unsigned long long int n_consumed_LOCAL = 0; +#endif + Read read; + + // TODO? Delete the following assignments. + total_reads = 0; + n_consumed = 0; + + hasher.trace_logger( + TraceLogger:: TLVL_DEBUG2, + "Starting trace of 'consume_fasta_and_tag'....\n" + ); + + // Iterate through the reads and consume their k-mers. 
+ while (!parser->is_complete( )) + { + unsigned long long this_n_consumed = 0; + + read = parser->get_next_read( ); + + if (check_and_normalize_read( read.sequence )) + { + // TODO: make threadsafe! + consume_sequence_and_tag_with_colors( read.sequence, + this_n_consumed, + _tag_color ); + ++_tag_color; + +#ifdef WITH_INTERNAL_METRICS + hasher.pmetrics.start_timers( ); +#endif +#if (0) // Note: Used with callback - currently disabled. + n_consumed_LOCAL = __sync_add_and_fetch( &n_consumed, this_n_consumed ); +#else + __sync_add_and_fetch( &n_consumed, this_n_consumed ); +#endif + total_reads_LOCAL = __sync_add_and_fetch( &total_reads, 1 ); +#ifdef WITH_INTERNAL_METRICS + hasher.pmetrics.stop_timers( ); + hasher.pmetrics.accumulate_timer_deltas( + (uint32_t)HashTablePerformanceMetrics:: MKEY_TIME_UPDATE_TALLIES + ); +#endif + } + + if (0 == (total_reads_LOCAL % 10000)) + hasher.trace_logger( + TraceLogger:: TLVL_DEBUG3, + "Total number of reads processed: %llu\n", + (unsigned long long int)total_reads_LOCAL + ); + + // TODO: Figure out alternative to callback into Python VM + // Cannot use in multi-threaded operation. +#if (0) + // run callback, if specified + if (total_reads_TL % CALLBACK_PERIOD == 0 && callback) { + std::cout << "n tags: " << all_tags.size() << "\n"; + try { + callback("consume_fasta_and_tag", callback_data, total_reads_TL, + n_consumed); + } catch (...) 
{ + delete parser; + throw; + } + } +#endif // 0 + + } // while reads left for parser + +} + + /* This is essentially the same code as above, only it assigns colors to the * tags through multimap TagColorMap defined in hashtable.hh, declared in * hashbits.hh diff --git a/lib/hashbits.hh b/lib/hashbits.hh index 6b54b5dcf7..d0a2ce9a0c 100644 --- a/lib/hashbits.hh +++ b/lib/hashbits.hh @@ -186,6 +186,22 @@ namespace khmer { void * callback_data = NULL ); + + void consume_fasta_and_tag_with_colors( + std::string const &filename, + unsigned int &total_reads, + unsigned long long &n_consumed, + CallbackFn callback = NULL, + void * callback_data = NULL + ); + + void consume_fasta_and_tag_with_colors( + read_parsers:: IParser * parser, + unsigned int &total_reads, + unsigned long long &n_consumed, + CallbackFn callback = NULL, + void * callback_data = NULL + ); void consume_sequence_and_tag(const std::string& seq, unsigned long long& n_consumed, SeenSet * new_tags = 0); From b33c71aa6beb19faf72ee282198a411cbdbe83f9 Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Fri, 6 Sep 2013 12:20:36 -0400 Subject: [PATCH 006/140] promoted new code to hashtable to be in line with refactor --- lib/hashbits.cc | 200 ----------------------------------------------- lib/hashbits.hh | 22 ------ lib/hashtable.cc | 200 +++++++++++++++++++++++++++++++++++++++++++++++ lib/hashtable.hh | 23 ++++++ lib/khmer.hh | 1 + 5 files changed, 224 insertions(+), 222 deletions(-) diff --git a/lib/hashbits.cc b/lib/hashbits.cc index 09518b37d9..27d780a92f 100644 --- a/lib/hashbits.cc +++ b/lib/hashbits.cc @@ -112,206 +112,6 @@ unsigned int Hashbits::check_and_process_read_overlap(std::string &read, return consume_string_overlap(read, ht2); } -/* - * Pretty much copy-pasta from the above functions - * Might be time for a refactor: could do a general consume_fasta - * function which accepts a consume_sequence function pointer as a parameter - */ - -void -Hashbits:: -consume_fasta_and_tag_with_colors( - std:: 
string const &filename, - unsigned int &total_reads, unsigned long long &n_consumed, - CallbackFn callback, void * callback_data -) -{ - khmer:: Config &the_config = khmer:: get_active_config( ); - - // Note: Always assume only 1 thread if invoked this way. - IParser * parser = - IParser::get_parser( - filename, 1, the_config.get_reads_input_buffer_size( ), - the_config.get_reads_parser_trace_level( ) - ); - - - consume_fasta_and_tag_with_colors( - parser, - total_reads, n_consumed, - callback, callback_data - ); - - delete parser; -} - -void -Hashbits:: -consume_fasta_and_tag_with_colors( - read_parsers:: IParser * parser, - unsigned int &total_reads, unsigned long long &n_consumed, - CallbackFn callback, void * callback_data -) -{ - Hasher &hasher = - _get_hasher( parser->uuid( ) ); - unsigned int total_reads_LOCAL = 0; -#if (0) // Note: Used with callback - currently disabled. - unsigned long long int n_consumed_LOCAL = 0; -#endif - Read read; - - // TODO? Delete the following assignments. - total_reads = 0; - n_consumed = 0; - - hasher.trace_logger( - TraceLogger:: TLVL_DEBUG2, - "Starting trace of 'consume_fasta_and_tag'....\n" - ); - - // Iterate through the reads and consume their k-mers. - while (!parser->is_complete( )) - { - unsigned long long this_n_consumed = 0; - - read = parser->get_next_read( ); - - if (check_and_normalize_read( read.sequence )) - { - // TODO: make threadsafe! - consume_sequence_and_tag_with_colors( read.sequence, - this_n_consumed, - _tag_color ); - ++_tag_color; - -#ifdef WITH_INTERNAL_METRICS - hasher.pmetrics.start_timers( ); -#endif -#if (0) // Note: Used with callback - currently disabled. 
- n_consumed_LOCAL = __sync_add_and_fetch( &n_consumed, this_n_consumed ); -#else - __sync_add_and_fetch( &n_consumed, this_n_consumed ); -#endif - total_reads_LOCAL = __sync_add_and_fetch( &total_reads, 1 ); -#ifdef WITH_INTERNAL_METRICS - hasher.pmetrics.stop_timers( ); - hasher.pmetrics.accumulate_timer_deltas( - (uint32_t)HashTablePerformanceMetrics:: MKEY_TIME_UPDATE_TALLIES - ); -#endif - } - - if (0 == (total_reads_LOCAL % 10000)) - hasher.trace_logger( - TraceLogger:: TLVL_DEBUG3, - "Total number of reads processed: %llu\n", - (unsigned long long int)total_reads_LOCAL - ); - - // TODO: Figure out alternative to callback into Python VM - // Cannot use in multi-threaded operation. -#if (0) - // run callback, if specified - if (total_reads_TL % CALLBACK_PERIOD == 0 && callback) { - std::cout << "n tags: " << all_tags.size() << "\n"; - try { - callback("consume_fasta_and_tag", callback_data, total_reads_TL, - n_consumed); - } catch (...) { - delete parser; - throw; - } - } -#endif // 0 - - } // while reads left for parser - -} - - -/* This is essentially the same code as above, only it assigns colors to the - * tags through multimap TagColorMap defined in hashtable.hh, declared in - * hashbits.hh - */ -void Hashbits::consume_sequence_and_tag_with_colors(const std::string& seq, - unsigned long long& n_consumed, - Color& current_color, - SeenSet * found_tags) -{ - bool is_new_kmer; - bool kmer_tagged; - - KMerIterator kmers(seq.c_str(), _ksize); - HashIntoType kmer; - - unsigned int since = _tag_density / 2 + 1; - - while(!kmers.done()) { - kmer = kmers.next(); - - if ((is_new_kmer = test_and_set_bits( kmer ))) - ++n_consumed; - -#if (1) - if (is_new_kmer) { - ++since; - } else { - ACQUIRE_ALL_TAGS_SPIN_LOCK - kmer_tagged = set_contains(all_tags, kmer); - RELEASE_ALL_TAGS_SPIN_LOCK - if (kmer_tagged) { - since = 1; - - // Coloring code - // TODO: MAKE THREADSAFE! 
- - if (!_map_contains(color_map, kmer, current_color)) { - color_map.insert(TagColorPair(kmer, current_color)) - } - if (found_tags) { - found_tags->insert(kmer); - } - } else ++since; - } - // Should I bother adding new code down here? -#else - if (!is_new_kmer && set_contains(all_tags, kmer)) { - since = 1; - if (found_tags) { found_tags->insert(kmer); } - } else { - since++; - } -#endif - // - if (since >= _tag_density) { - ACQUIRE_ALL_TAGS_SPIN_LOCK - all_tags.insert(kmer); - RELEASE_ALL_TAGS_SPIN_LOCK - - // Coloring code - // TODO: MAKE THREADSAFE! - color_map.insert(TagColorPair(kmer, current_color)) - - if (found_tags) { found_tags->insert(kmer); } - since = 1; - } - - } // iteration over kmers - - if (since >= _tag_density/2 - 1) { - ACQUIRE_ALL_TAGS_SPIN_LOCK - all_tags.insert(kmer); // insert the last k-mer, too. - RELEASE_ALL_TAGS_SPIN_LOCK - - // Color code: TODO: MAKE THREADSAFE! - color_map.insert(TagColorPair(kmer, current_color)) - - if (found_tags) { found_tags->insert(kmer); } - } -} - - // // consume_fasta: consume a FASTA file of reads // diff --git a/lib/hashbits.hh b/lib/hashbits.hh index 5a9bb4c6b9..6c0f19698c 100644 --- a/lib/hashbits.hh +++ b/lib/hashbits.hh @@ -76,28 +76,6 @@ namespace khmer { virtual void save(std::string); virtual void load(std::string); - - - void consume_fasta_and_tag_with_colors( - std::string const &filename, - unsigned int &total_reads, - unsigned long long &n_consumed, - CallbackFn callback = NULL, - void * callback_data = NULL - ); - - void consume_fasta_and_tag_with_colors( - read_parsers:: IParser * parser, - unsigned int &total_reads, - unsigned long long &n_consumed, - CallbackFn callback = NULL, - void * callback_data = NULL - ); - - void consume_sequence_and_tag_with_colors(const std::string& seq, - unsigned long long& n_consumed, - Color& current_color, - SeenSet * new_tags = 0) // for overlap k-mer counting void consume_fasta_overlap(const std::string &filename,HashIntoType curve[2][100], diff --git 
a/lib/hashtable.cc b/lib/hashtable.cc index 6782742387..bac6995173 100644 --- a/lib/hashtable.cc +++ b/lib/hashtable.cc @@ -1943,6 +1943,206 @@ void Hashtable::extract_unique_paths(std::string seq, i++; } } + +/* + * Pretty much copy-pasta + * Might be time for a refactor: could do a general consume_fasta + * function which accepts a consume_sequence function pointer as a parameter + */ + +void +Hashbits:: +consume_fasta_and_tag_with_colors( + std:: string const &filename, + unsigned int &total_reads, unsigned long long &n_consumed, + CallbackFn callback, void * callback_data +) +{ + khmer:: Config &the_config = khmer:: get_active_config( ); + + // Note: Always assume only 1 thread if invoked this way. + IParser * parser = + IParser::get_parser( + filename, 1, the_config.get_reads_input_buffer_size( ), + the_config.get_reads_parser_trace_level( ) + ); + + + consume_fasta_and_tag_with_colors( + parser, + total_reads, n_consumed, + callback, callback_data + ); + + delete parser; +} + +void +Hashbits:: +consume_fasta_and_tag_with_colors( + read_parsers:: IParser * parser, + unsigned int &total_reads, unsigned long long &n_consumed, + CallbackFn callback, void * callback_data +) +{ + Hasher &hasher = + _get_hasher( parser->uuid( ) ); + unsigned int total_reads_LOCAL = 0; +#if (0) // Note: Used with callback - currently disabled. + unsigned long long int n_consumed_LOCAL = 0; +#endif + Read read; + + // TODO? Delete the following assignments. + total_reads = 0; + n_consumed = 0; + + hasher.trace_logger( + TraceLogger:: TLVL_DEBUG2, + "Starting trace of 'consume_fasta_and_tag'....\n" + ); + + // Iterate through the reads and consume their k-mers. + while (!parser->is_complete( )) + { + unsigned long long this_n_consumed = 0; + + read = parser->get_next_read( ); + + if (check_and_normalize_read( read.sequence )) + { + // TODO: make threadsafe! 
+ consume_sequence_and_tag_with_colors( read.sequence, + this_n_consumed, + _tag_color ); + ++_tag_color; + +#ifdef WITH_INTERNAL_METRICS + hasher.pmetrics.start_timers( ); +#endif +#if (0) // Note: Used with callback - currently disabled. + n_consumed_LOCAL = __sync_add_and_fetch( &n_consumed, this_n_consumed ); +#else + __sync_add_and_fetch( &n_consumed, this_n_consumed ); +#endif + total_reads_LOCAL = __sync_add_and_fetch( &total_reads, 1 ); +#ifdef WITH_INTERNAL_METRICS + hasher.pmetrics.stop_timers( ); + hasher.pmetrics.accumulate_timer_deltas( + (uint32_t)HashTablePerformanceMetrics:: MKEY_TIME_UPDATE_TALLIES + ); +#endif + } + + if (0 == (total_reads_LOCAL % 10000)) + hasher.trace_logger( + TraceLogger:: TLVL_DEBUG3, + "Total number of reads processed: %llu\n", + (unsigned long long int)total_reads_LOCAL + ); + + // TODO: Figure out alternative to callback into Python VM + // Cannot use in multi-threaded operation. +#if (0) + // run callback, if specified + if (total_reads_TL % CALLBACK_PERIOD == 0 && callback) { + std::cout << "n tags: " << all_tags.size() << "\n"; + try { + callback("consume_fasta_and_tag", callback_data, total_reads_TL, + n_consumed); + } catch (...) 
{ + delete parser; + throw; + } + } +#endif // 0 + + } // while reads left for parser + +} + + +/* This is essentially the same code as above, only it assigns colors to the + * tags through multimap TagColorMap defined in hashtable.hh, declared in + * hashbits.hh + */ +void Hashbits::consume_sequence_and_tag_with_colors(const std::string& seq, + unsigned long long& n_consumed, + Color& current_color, + SeenSet * found_tags) +{ + bool is_new_kmer; + bool kmer_tagged; + + KMerIterator kmers(seq.c_str(), _ksize); + HashIntoType kmer; + + unsigned int since = _tag_density / 2 + 1; + + while(!kmers.done()) { + kmer = kmers.next(); + + if ((is_new_kmer = test_and_set_bits( kmer ))) + ++n_consumed; + +#if (1) + if (is_new_kmer) { + ++since; + } else { + ACQUIRE_ALL_TAGS_SPIN_LOCK + kmer_tagged = set_contains(all_tags, kmer); + RELEASE_ALL_TAGS_SPIN_LOCK + if (kmer_tagged) { + since = 1; + + // Coloring code + // TODO: MAKE THREADSAFE! + + if (!_map_contains(color_map, kmer, current_color)) { + color_map.insert(TagColorPair(kmer, current_color)) + } + if (found_tags) { + found_tags->insert(kmer); + } + } else ++since; + } + // Should I bother adding new code down here? +#else + if (!is_new_kmer && set_contains(all_tags, kmer)) { + since = 1; + if (found_tags) { found_tags->insert(kmer); } + } else { + since++; + } +#endif + // + if (since >= _tag_density) { + ACQUIRE_ALL_TAGS_SPIN_LOCK + all_tags.insert(kmer); + RELEASE_ALL_TAGS_SPIN_LOCK + + // Coloring code + // TODO: MAKE THREADSAFE! + color_map.insert(TagColorPair(kmer, current_color)) + + if (found_tags) { found_tags->insert(kmer); } + since = 1; + } + + } // iteration over kmers + + if (since >= _tag_density/2 - 1) { + ACQUIRE_ALL_TAGS_SPIN_LOCK + all_tags.insert(kmer); // insert the last k-mer, too. + RELEASE_ALL_TAGS_SPIN_LOCK + + // Color code: TODO: MAKE THREADSAFE! 
+ color_map.insert(TagColorPair(kmer, current_color)) + + if (found_tags) { found_tags->insert(kmer); } + } +} + } // vim: set sts=2 sw=2: diff --git a/lib/hashtable.hh b/lib/hashtable.hh index f8d1103d41..586aa308be 100644 --- a/lib/hashtable.hh +++ b/lib/hashtable.hh @@ -320,6 +320,8 @@ namespace khmer { SeenSet all_tags; SeenSet stop_tags; SeenSet repart_small_tags; + TagColorMap tag_colors; + ColorTagPrtMap color_tag_ptrs; // accessor to get 'k' const WordLength ksize() const { return _ksize; } @@ -439,6 +441,27 @@ namespace khmer { unsigned long long &n_consumed, CallbackFn callback = 0, void * callback_data = 0); + + + + void consume_fasta_and_tag_with_colors( + std::string const &filename, + unsigned int &total_reads, + unsigned long long &n_consumed, + CallbackFn callback = NULL, + void * callback_data = NULL); + + void consume_fasta_and_tag_with_colors( + read_parsers:: IParser * parser, + unsigned int &total_reads, + unsigned long long &n_consumed, + CallbackFn callback = NULL, + void * callback_data = NULL); + + void consume_sequence_and_tag_with_colors(const std::string& seq, + unsigned long long& n_consumed, + Color& current_color, + SeenSet * new_tags = 0) void consume_fasta_and_traverse(const std::string &filename, unsigned int distance, diff --git a/lib/khmer.hh b/lib/khmer.hh index 86096be83d..372e96ddea 100644 --- a/lib/khmer.hh +++ b/lib/khmer.hh @@ -89,6 +89,7 @@ namespace khmer { typedef unsigned int Color; typedef std::multimap TagColorMap; + typedef std::multimap ColorTagPtrMap; typedef std::pair TagColorPair; } From c097fb508591bb2dccf705c36fa9a9e02f351441 Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Fri, 6 Sep 2013 12:28:08 -0400 Subject: [PATCH 007/140] added _cmap_contains back in after being nuked by merge --- lib/hashtable.cc | 3 ++- lib/hashtable.hh | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/lib/hashtable.cc b/lib/hashtable.cc index bac6995173..8e36c5f1f4 100644 --- a/lib/hashtable.cc +++ 
b/lib/hashtable.cc @@ -2098,8 +2098,9 @@ void Hashbits::consume_sequence_and_tag_with_colors(const std::string& seq, // Coloring code // TODO: MAKE THREADSAFE! - if (!_map_contains(color_map, kmer, current_color)) { + if (!_cmap_contains(color_map, kmer, current_color)) { color_map.insert(TagColorPair(kmer, current_color)) + } if (found_tags) { found_tags->insert(kmer); diff --git a/lib/hashtable.hh b/lib/hashtable.hh index 586aa308be..9f35bb3446 100644 --- a/lib/hashtable.hh +++ b/lib/hashtable.hh @@ -183,6 +183,20 @@ namespace khmer { HashIntoType bitmask; unsigned int _nbits_sub_1; + // Check if the given TagToColorMap already has the tag with the given color + bool _cmap_contains(TagToColorMap& cmap, + HashIntoType& kmer, + Color& the_color) + { + std::pair ret; + ret = cmap->equal_range(kmer); + for (TagToColorMap::iterator it=ret.first; it!=ret.second; ++it) { + if (it->second == the_color) return true; + } + return false; + } + + Hashtable( WordLength ksize, uint32_t const number_of_threads = From 90e83061c69c25f75d883326ba9380725d3f1762 Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Fri, 6 Sep 2013 14:33:19 -0400 Subject: [PATCH 008/140] fixed for proper pointers/refs, added reverse color map --- lib/hashtable.cc | 9 ++++----- lib/hashtable.hh | 29 ++++++++++++++++++++++------- 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/lib/hashtable.cc b/lib/hashtable.cc index 8e36c5f1f4..12107c03c9 100644 --- a/lib/hashtable.cc +++ b/lib/hashtable.cc @@ -1951,7 +1951,7 @@ void Hashtable::extract_unique_paths(std::string seq, */ void -Hashbits:: +Hashtable:: consume_fasta_and_tag_with_colors( std:: string const &filename, unsigned int &total_reads, unsigned long long &n_consumed, @@ -1978,7 +1978,7 @@ consume_fasta_and_tag_with_colors( } void -Hashbits:: +Hashtable:: consume_fasta_and_tag_with_colors( read_parsers:: IParser * parser, unsigned int &total_reads, unsigned long long &n_consumed, @@ -2066,7 +2066,7 @@ consume_fasta_and_tag_with_colors( 
* tags through multimap TagColorMap defined in hashtable.hh, declared in * hashbits.hh */ -void Hashbits::consume_sequence_and_tag_with_colors(const std::string& seq, +void Hashtable::consume_sequence_and_tag_with_colors(const std::string& seq, unsigned long long& n_consumed, Color& current_color, SeenSet * found_tags) @@ -2099,8 +2099,7 @@ void Hashbits::consume_sequence_and_tag_with_colors(const std::string& seq, // TODO: MAKE THREADSAFE! if (!_cmap_contains(color_map, kmer, current_color)) { - color_map.insert(TagColorPair(kmer, current_color)) - + link_tag_and_color(kmer, current_color); } if (found_tags) { found_tags->insert(kmer); diff --git a/lib/hashtable.hh b/lib/hashtable.hh index 9f35bb3446..897a5f184c 100644 --- a/lib/hashtable.hh +++ b/lib/hashtable.hh @@ -142,7 +142,6 @@ namespace khmer { bool done() { return index >= length; } }; // class KMerIterator - class Hashtable { // Base class implementation of a Bloom ht. friend class SubsetPartition; protected: @@ -183,20 +182,31 @@ namespace khmer { HashIntoType bitmask; unsigned int _nbits_sub_1; - // Check if the given TagToColorMap already has the tag with the given color - bool _cmap_contains(TagToColorMap& cmap, + // Does the given tag already have the given color? + bool _cmap_contains_color(const TagColorPtrMap& cmap, HashIntoType& kmer, Color& the_color) { - std::pair ret; + std::pair ret; ret = cmap->equal_range(kmer); - for (TagToColorMap::iterator it=ret.first; it!=ret.second; ++it) { - if (it->second == the_color) return true; + for (TagColorPtrMap::iterator it=ret.first; it!=ret.second; ++it) { + if (*(it->second) == the_color) return true; } return false; } - + // Does the given color already have a tag associated with it? 
+ bool _cmap_contains_tag(const ColorTagPtrMap& cmap, + Color& the_color, + HashIntoType& kmer) { + std::pair ret; + ret = cmap->equal_range(the_color); + for (ColorTagPtrMap::iterator it=ret.first; it!=ret.second; ++it) { + if(*(it->second) == kmer) return true; + } + return false; + } + Hashtable( WordLength ksize, uint32_t const number_of_threads = @@ -477,6 +487,11 @@ namespace khmer { Color& current_color, SeenSet * new_tags = 0) + void link_tag_and_color(HashIntoType& kmer, Color& color) { + tag_colors.insert(TagColorPtrPair(kmer, &current_color)); + color_tag_ptrs.insert(ColorTagPtrPair(current_color, &kmer)); + } + void consume_fasta_and_traverse(const std::string &filename, unsigned int distance, unsigned int big_threshold, From ce1d0277b6231dd45b20548c62bb7e49825cf634 Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Fri, 6 Sep 2013 17:23:18 -0400 Subject: [PATCH 009/140] added sweep function, getters for colors and tags, started color reconciliation --- lib/hashtable.cc | 313 ++++++++++++++++++++++++++--------------------- lib/hashtable.hh | 30 ++++- lib/khmer.hh | 7 +- 3 files changed, 209 insertions(+), 141 deletions(-) diff --git a/lib/hashtable.cc b/lib/hashtable.cc index 12107c03c9..211b0b418d 100644 --- a/lib/hashtable.cc +++ b/lib/hashtable.cc @@ -1951,8 +1951,7 @@ void Hashtable::extract_unique_paths(std::string seq, */ void -Hashtable:: -consume_fasta_and_tag_with_colors( +Hashtable::consume_fasta_and_tag_with_colors( std:: string const &filename, unsigned int &total_reads, unsigned long long &n_consumed, CallbackFn callback, void * callback_data @@ -1978,88 +1977,87 @@ consume_fasta_and_tag_with_colors( } void -Hashtable:: -consume_fasta_and_tag_with_colors( - read_parsers:: IParser * parser, - unsigned int &total_reads, unsigned long long &n_consumed, - CallbackFn callback, void * callback_data -) -{ - Hasher &hasher = - _get_hasher( parser->uuid( ) ); - unsigned int total_reads_LOCAL = 0; -#if (0) // Note: Used with callback - currently disabled. 
- unsigned long long int n_consumed_LOCAL = 0; -#endif - Read read; - - // TODO? Delete the following assignments. - total_reads = 0; - n_consumed = 0; - - hasher.trace_logger( - TraceLogger:: TLVL_DEBUG2, - "Starting trace of 'consume_fasta_and_tag'....\n" - ); - - // Iterate through the reads and consume their k-mers. - while (!parser->is_complete( )) +Hashtable::consume_fasta_and_tag_with_colors( + read_parsers:: IParser * parser, + unsigned int &total_reads, unsigned long long &n_consumed, + CallbackFn callback, void * callback_data + ) { - unsigned long long this_n_consumed = 0; - - read = parser->get_next_read( ); + Hasher &hasher = + _get_hasher( parser->uuid( ) ); + unsigned int total_reads_LOCAL = 0; + #if (0) // Note: Used with callback - currently disabled. + unsigned long long int n_consumed_LOCAL = 0; + #endif + Read read; + + // TODO? Delete the following assignments. + total_reads = 0; + n_consumed = 0; + + hasher.trace_logger( + TraceLogger:: TLVL_DEBUG2, + "Starting trace of 'consume_fasta_and_tag'....\n" + ); - if (check_and_normalize_read( read.sequence )) + // Iterate through the reads and consume their k-mers. + while (!parser->is_complete( )) { - // TODO: make threadsafe! - consume_sequence_and_tag_with_colors( read.sequence, - this_n_consumed, - _tag_color ); - ++_tag_color; + unsigned long long this_n_consumed = 0; -#ifdef WITH_INTERNAL_METRICS - hasher.pmetrics.start_timers( ); -#endif -#if (0) // Note: Used with callback - currently disabled. 
- n_consumed_LOCAL = __sync_add_and_fetch( &n_consumed, this_n_consumed ); -#else - __sync_add_and_fetch( &n_consumed, this_n_consumed ); -#endif - total_reads_LOCAL = __sync_add_and_fetch( &total_reads, 1 ); -#ifdef WITH_INTERNAL_METRICS - hasher.pmetrics.stop_timers( ); - hasher.pmetrics.accumulate_timer_deltas( - (uint32_t)HashTablePerformanceMetrics:: MKEY_TIME_UPDATE_TALLIES - ); -#endif - } + read = parser->get_next_read( ); - if (0 == (total_reads_LOCAL % 10000)) - hasher.trace_logger( - TraceLogger:: TLVL_DEBUG3, - "Total number of reads processed: %llu\n", - (unsigned long long int)total_reads_LOCAL - ); - - // TODO: Figure out alternative to callback into Python VM - // Cannot use in multi-threaded operation. -#if (0) - // run callback, if specified - if (total_reads_TL % CALLBACK_PERIOD == 0 && callback) { - std::cout << "n tags: " << all_tags.size() << "\n"; - try { - callback("consume_fasta_and_tag", callback_data, total_reads_TL, - n_consumed); - } catch (...) { - delete parser; - throw; - } + if (check_and_normalize_read( read.sequence )) + { + // TODO: make threadsafe! + consume_sequence_and_tag_with_colors( read.sequence, + this_n_consumed, + _tag_color ); + ++_tag_color; + + #ifdef WITH_INTERNAL_METRICS + hasher.pmetrics.start_timers( ); + #endif + #if (0) // Note: Used with callback - currently disabled. 
+ n_consumed_LOCAL = __sync_add_and_fetch( &n_consumed, this_n_consumed ); + #else + __sync_add_and_fetch( &n_consumed, this_n_consumed ); + #endif + total_reads_LOCAL = __sync_add_and_fetch( &total_reads, 1 ); + #ifdef WITH_INTERNAL_METRICS + hasher.pmetrics.stop_timers( ); + hasher.pmetrics.accumulate_timer_deltas( + (uint32_t)HashTablePerformanceMetrics:: MKEY_TIME_UPDATE_TALLIES + ); + #endif } -#endif // 0 - } // while reads left for parser + if (0 == (total_reads_LOCAL % 10000)) + hasher.trace_logger( + TraceLogger:: TLVL_DEBUG3, + "Total number of reads processed: %llu\n", + (unsigned long long int)total_reads_LOCAL + ); + + // TODO: Figure out alternative to callback into Python VM + // Cannot use in multi-threaded operation. + #if (0) + // run callback, if specified + if (total_reads_TL % CALLBACK_PERIOD == 0 && callback) { + std::cout << "n tags: " << all_tags.size() << "\n"; + try { + callback("consume_fasta_and_tag", callback_data, total_reads_TL, + n_consumed); + } catch (...) 
{ + delete parser; + throw; + } + } + #endif // 0 -} + } // while reads left for parser + + } /* This is essentially the same code as above, only it assigns colors to the @@ -2070,79 +2068,120 @@ void Hashtable::consume_sequence_and_tag_with_colors(const std::string& seq, unsigned long long& n_consumed, Color& current_color, SeenSet * found_tags) -{ - bool is_new_kmer; - bool kmer_tagged; - - KMerIterator kmers(seq.c_str(), _ksize); - HashIntoType kmer; - - unsigned int since = _tag_density / 2 + 1; - - while(!kmers.done()) { - kmer = kmers.next(); + { + bool is_new_kmer; + bool kmer_tagged; + + KMerIterator kmers(seq.c_str(), _ksize); + HashIntoType kmer; + + unsigned int since = _tag_density / 2 + 1; + + while(!kmers.done()) { + kmer = kmers.next(); + + if ((is_new_kmer = test_and_set_bits( kmer ))) + ++n_consumed; + + #if (1) + if (is_new_kmer) { + ++since; + } else { + ACQUIRE_ALL_TAGS_SPIN_LOCK + kmer_tagged = set_contains(all_tags, kmer); + RELEASE_ALL_TAGS_SPIN_LOCK + if (kmer_tagged) { + since = 1; + + // Coloring code + // TODO: MAKE THREADSAFE! + + if (!_cmap_contains_color(color_map, kmer, current_color)) { + link_tag_and_color(kmer, current_color); + } + if (found_tags) { + found_tags->insert(kmer); + } + } else ++since; + } + // Should I bother adding new code down here? + #else + if (!is_new_kmer && set_contains(all_tags, kmer)) { + since = 1; + if (found_tags) { found_tags->insert(kmer); } + } else { + since++; + } + #endif + // + if (since >= _tag_density) { + ACQUIRE_ALL_TAGS_SPIN_LOCK + all_tags.insert(kmer); + RELEASE_ALL_TAGS_SPIN_LOCK + + // Coloring code + // TODO: MAKE THREADSAFE! 
+ link_tag_and_color(kmer, current_color) + + if (found_tags) { found_tags->insert(kmer); } + since = 1; + } - if ((is_new_kmer = test_and_set_bits( kmer ))) - ++n_consumed; + } // iteration over kmers -#if (1) - if (is_new_kmer) { - ++since; - } else { - ACQUIRE_ALL_TAGS_SPIN_LOCK - kmer_tagged = set_contains(all_tags, kmer); - RELEASE_ALL_TAGS_SPIN_LOCK - if (kmer_tagged) { - since = 1; - - // Coloring code - // TODO: MAKE THREADSAFE! - - if (!_cmap_contains(color_map, kmer, current_color)) { - link_tag_and_color(kmer, current_color); - } - if (found_tags) { - found_tags->insert(kmer); - } - } else ++since; - } - // Should I bother adding new code down here? -#else - if (!is_new_kmer && set_contains(all_tags, kmer)) { - since = 1; - if (found_tags) { found_tags->insert(kmer); } - } else { - since++; - } -#endif - // - if (since >= _tag_density) { + if (since >= _tag_density/2 - 1) { ACQUIRE_ALL_TAGS_SPIN_LOCK - all_tags.insert(kmer); + all_tags.insert(kmer); // insert the last k-mer, too. RELEASE_ALL_TAGS_SPIN_LOCK - // Coloring code - // TODO: MAKE THREADSAFE! - color_map.insert(TagColorPair(kmer, current_color)) + // Color code: TODO: MAKE THREADSAFE! + link_tag_and_color(kmer, current_color) if (found_tags) { found_tags->insert(kmer); } - since = 1; } - - } // iteration over kmers - - if (since >= _tag_density/2 - 1) { - ACQUIRE_ALL_TAGS_SPIN_LOCK - all_tags.insert(kmer); // insert the last k-mer, too. - RELEASE_ALL_TAGS_SPIN_LOCK + } +/* + * Find all colors associated with the sequence + * For now, check /every/ k-mer with find_all_tags + * THIS SUCKS AND IT'S YOUR FAULT @CTB + */ +void Hashtable::sweep_sequence_for_colors(const std::string& seq, + unsigned long long& n_consumed, + SeenSet * found_tags, + bool break_on_stoptags, + bool stop_big_traversals) { + + SeenSet tagged_kmers; + ColorPtrSet found_colors; - // Color code: TODO: MAKE THREADSAFE! 
- color_map.insert(TagColorPair(kmer, current_color)) + const unsigned char ksize = _ht->ktsize(); + HashIntoType kmer_f, kmer_r, kmer; - if (found_tags) { found_tags->insert(kmer); } - } + KMerIterator kmers(seq.c_str(), _ksize); + HashIntoType kmer_s; + + while (!kmers.done()) { + kmer_s = kmers.next(); + kmer = _hash(kmer_s.c_str(), ksize, kmer_f, kmer_r); + + find_all_tags(kmer_f, kmer_r, tagged_kmers, _ht->all_tags, + break_on_stoptags, stop_big_traversals); + } } +void Hashtable::traverse_colors_and_resolve(const SeenSet& tagged_kmers, + ColorPtrSet& found_colors) { + + SeenSet::const_iterator si; + unsigned int num_colors = 0; + for (si=tagged_kmers.begin(); si!=tagged_kmers.end(); ++si) { + tag = *si; + // get the colors associated with this tag + num_colors = _get_tag_colors(tag, tag_colors, found_colors) + if (num_colors > 1) { + // reconcile colors + } + } } // vim: set sts=2 sw=2: diff --git a/lib/hashtable.hh b/lib/hashtable.hh index 897a5f184c..f339840bc8 100644 --- a/lib/hashtable.hh +++ b/lib/hashtable.hh @@ -188,7 +188,7 @@ namespace khmer { Color& the_color) { std::pair ret; - ret = cmap->equal_range(kmer); + ret = cmap.equal_range(kmer); for (TagColorPtrMap::iterator it=ret.first; it!=ret.second; ++it) { if (*(it->second) == the_color) return true; } @@ -200,13 +200,39 @@ namespace khmer { Color& the_color, HashIntoType& kmer) { std::pair ret; - ret = cmap->equal_range(the_color); + ret = cmap.equal_range(the_color); for (ColorTagPtrMap::iterator it=ret.first; it!=ret.second; ++it) { if(*(it->second) == kmer) return true; } return false; } + unsigned int _get_tag_colors(const HashIntoType& tag, + const TagColorPrtMap& cmap, + ColorPtrSet& found_colors) { + unsigned int num_colors = 0; + std::pair ret; + ret = cmap.equal_range(tag); + for (TagColorPtrMap::iterator it=ret.first; it!=ret.second; ++it) { + found_colors.insert(it->second); + ++num_colors; + } + return num_colors; + } + + unsigned int _get_tags_from_color(const Color& color, + const 
ColorTagPtrMap& cmap, + TagPtrSet& colored_tags) { + unsigned int num_tags = 0; + std::pair ret; + ret = cmap.equal_range(color); + for (ColorTagPtrMap::iterator it=ret.first; it!=ret.second; ++it) { + color_tags.insert(it->second); + ++num_tags; + } + return num_tags; + } + Hashtable( WordLength ksize, uint32_t const number_of_threads = diff --git a/lib/khmer.hh b/lib/khmer.hh index 372e96ddea..e096919e62 100644 --- a/lib/khmer.hh +++ b/lib/khmer.hh @@ -88,9 +88,12 @@ namespace khmer { typedef std::map PartitionCountDistribution; typedef unsigned int Color; - typedef std::multimap TagColorMap; + typedef std::multimap TagColorPtrMap; typedef std::multimap ColorTagPtrMap; - typedef std::pair TagColorPair; + typedef std::pair TagColorPtrPair; + typedef std::pair ColorTagPtrPair; + typedef std::set ColorPtrSet; + typedef std::set TagPtrSet; } #endif // KHMER_HH From 24f738095bef986557e4e4856146f01e6ed1fbdc Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Mon, 9 Sep 2013 11:56:07 -0400 Subject: [PATCH 010/140] color sweep added --- lib/hashtable.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/hashtable.cc b/lib/hashtable.cc index 211b0b418d..33765f23d1 100644 --- a/lib/hashtable.cc +++ b/lib/hashtable.cc @@ -2147,12 +2147,12 @@ void Hashtable::consume_sequence_and_tag_with_colors(const std::string& seq, */ void Hashtable::sweep_sequence_for_colors(const std::string& seq, unsigned long long& n_consumed, - SeenSet * found_tags, + ColorPtrSet& found_colors, bool break_on_stoptags, bool stop_big_traversals) { SeenSet tagged_kmers; - ColorPtrSet found_colors; + //ColorPtrSet found_colors; const unsigned char ksize = _ht->ktsize(); HashIntoType kmer_f, kmer_r, kmer; @@ -2166,6 +2166,7 @@ void Hashtable::sweep_sequence_for_colors(const std::string& seq, find_all_tags(kmer_f, kmer_r, tagged_kmers, _ht->all_tags, break_on_stoptags, stop_big_traversals); + traverse_colors_and_resolve(tagged_kmers, found_colors); } } @@ -2180,6 +2181,7 @@ void 
Hashtable::traverse_colors_and_resolve(const SeenSet& tagged_kmers, num_colors = _get_tag_colors(tag, tag_colors, found_colors) if (num_colors > 1) { // reconcile colors + // for now do nothing ha } } } From a961a2122286698b3b2126330d1dc45f9ed359bc Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Mon, 9 Sep 2013 12:58:32 -0400 Subject: [PATCH 011/140] added parse function to python glue --- python/_khmermodule.cc | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/python/_khmermodule.cc b/python/_khmermodule.cc index b0af974b3a..04c7ad568e 100644 --- a/python/_khmermodule.cc +++ b/python/_khmermodule.cc @@ -2195,6 +2195,37 @@ static PyObject * hash_do_subset_partition_with_abundance(PyObject * self, PyObj return (PyObject *) subset_obj; } +static PyObject * hash_consume_fasta_and_tag_with_colors(PyObject * self, PyObject * args) +{ + khmer_KCountingHashObject * me = (khmer_KCountingHashObject *) self; + khmer::CountingHash * counting = me->counting; + + PyObject * callback_obs = NULL; + PyObject * rparser_obj = NULL; + + if (!PyArg_ParseType(args, "O|O", &rparser_obj, &callback_obj)) { + return NULL; + } + + khmer:: read_parsers:: IParser * rparser = + _PyObject_to_khmer_ReadParser( rparser_obj ); + unsigned long long n_consumed; + unsigned int total_reads; + bool exc_raised = false; + + Py_BEGIN_ALLOW_THREADS + try { + counting->consume_fasta_and_tag_with_colors(rparser, total_reads, n_consumed, + _report_fn, callback_obj); + } catch (_khmer_signal &e) { + exc_raised = TRUE; + } + Py_END_ALLOW_THREADS + if (exc_raised) return NULL; + + return Py_BuildValue("iL", total_reads, n_consumed); + +} static PyMethodDef khmer_counting_methods[] = { { "ksize", hash_get_ksize, METH_VARARGS, "" }, @@ -2232,7 +2263,7 @@ static PyMethodDef khmer_counting_methods[] = { { "consume_fasta_and_tag", hash_consume_fasta_and_tag, METH_VARARGS, "Count all k-mers in a given file" }, { "do_subset_partition_with_abundance", 
hash_do_subset_partition_with_abundance, METH_VARARGS, "" }, { "find_all_tags_truncate_on_abundance", hash_find_all_tags_truncate_on_abundance, METH_VARARGS, "" }, - + { "consume_fasta_and_tag_with_colors", hash_consume_fasta_and_tag_with_colors, METH_VARARGS, "" }, {NULL, NULL, 0, NULL} /* sentinel */ }; From 1c07a014713ea22df2ff45238e35a71f2b95e956 Mon Sep 17 00:00:00 2001 From: CS Welcher Date: Mon, 9 Sep 2013 13:29:38 -0400 Subject: [PATCH 012/140] added a spin lock for tag_colors (not strictly necessary, but ctb likes explicit things i hear) --- lib/hashtable.cc | 4 ++++ lib/hashtable.hh | 9 ++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/lib/hashtable.cc b/lib/hashtable.cc index 33765f23d1..4030ee8993 100644 --- a/lib/hashtable.cc +++ b/lib/hashtable.cc @@ -2097,7 +2097,9 @@ void Hashtable::consume_sequence_and_tag_with_colors(const std::string& seq, // TODO: MAKE THREADSAFE! if (!_cmap_contains_color(color_map, kmer, current_color)) { + ACQUIRE_TAG_COLORS_SPIN_LOCK link_tag_and_color(kmer, current_color); + RELEASE_TAG_COLORS_SPIN_LOCK } if (found_tags) { found_tags->insert(kmer); @@ -2121,7 +2123,9 @@ void Hashtable::consume_sequence_and_tag_with_colors(const std::string& seq, // Coloring code // TODO: MAKE THREADSAFE! 
+ ACQUIRE_TAG_COLORS_SPIN_LOCK link_tag_and_color(kmer, current_color) + RELEASE_TAG_COLORS_SPIN_LOCK if (found_tags) { found_tags->insert(kmer); } since = 1; diff --git a/lib/hashtable.hh b/lib/hashtable.hh index f339840bc8..68d6162459 100644 --- a/lib/hashtable.hh +++ b/lib/hashtable.hh @@ -252,6 +252,7 @@ namespace khmer { partition = new SubsetPartition(this); _init_bitstuff(); _all_tags_spin_lock = 0; + _tag_colors_spin_lock = 0; } virtual ~Hashtable( ) @@ -364,7 +365,7 @@ namespace khmer { } uint32_t _all_tags_spin_lock; - + uint32_t _tag_colors_spin_lock; public: SubsetPartition * partition; SeenSet all_tags; @@ -626,4 +627,10 @@ namespace khmer { #define RELEASE_ALL_TAGS_SPIN_LOCK \ __sync_bool_compare_and_swap( &_all_tags_spin_lock, 1, 0 ); +#define ACQUIRE_TAG_COLORS_SPIN_LOCK \ + while(!__sync_bool_compare_and_swap( &_tag_colors_spin_lock, 0, 1)); + +#define ACQUIRE_TAG_COLORS_SPIN_LOCK \ + __sync_bool_compare_and_swap( &_tag_colors_spin_lock, 1, 0); + #endif // HASHTABLE_HH From e0f7dca0062237445f59d680f5cbeb526f66fcf6 Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Mon, 9 Sep 2013 14:46:07 -0400 Subject: [PATCH 013/140] adding sweep glue --- lib/hashtable.cc | 1 - lib/hashtable.hh | 7 +++++++ python/_khmermodule.cc | 19 +++++++++++++++++++ 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/lib/hashtable.cc b/lib/hashtable.cc index 4030ee8993..0af75db535 100644 --- a/lib/hashtable.cc +++ b/lib/hashtable.cc @@ -2150,7 +2150,6 @@ void Hashtable::consume_sequence_and_tag_with_colors(const std::string& seq, * THIS SUCKS AND IT'S YOUR FAULT @CTB */ void Hashtable::sweep_sequence_for_colors(const std::string& seq, - unsigned long long& n_consumed, ColorPtrSet& found_colors, bool break_on_stoptags, bool stop_big_traversals) { diff --git a/lib/hashtable.hh b/lib/hashtable.hh index 68d6162459..9931ee1da3 100644 --- a/lib/hashtable.hh +++ b/lib/hashtable.hh @@ -59,6 +59,13 @@ namespace khmer { virtual void accumulate_timer_deltas( uint32_t 
metrics_key ); + }; + + struct tag_color_info { + HashIntoType kmer; + SeenSet tagged_kmers; + + }; // diff --git a/python/_khmermodule.cc b/python/_khmermodule.cc index 04c7ad568e..4efbe2ca8c 100644 --- a/python/_khmermodule.cc +++ b/python/_khmermodule.cc @@ -2227,6 +2227,25 @@ static PyObject * hash_consume_fasta_and_tag_with_colors(PyObject * self, PyObje } +static PyObject * hash_sweep_sequence_for_colors(PyObject * self, PyObject * args) { + khmer_KCountingHashObject * me = (khmer_KCountingHashObject *) self; + khmer::CountingHash * counting = me->counting; + + char * seq = NULL; + bool break_on_stoptags = NULL; + bool stop_big_traversals = NULL; + + if(!PyArg_ParseTuple(args, "spp", &seq, &break_on_stoptags, &stop_big_traversals)) { + return NULL; + } + + if (strlen(kmer_s) < counting->ksize()) { + return NULL; + } + + _pre_partition_info +} + static PyMethodDef khmer_counting_methods[] = { { "ksize", hash_get_ksize, METH_VARARGS, "" }, { "hashsizes", hash_get_hashsizes, METH_VARARGS, "" }, From a64264d60104a6554f82d8435c613de58e82b56c Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Mon, 9 Sep 2013 18:48:20 -0400 Subject: [PATCH 014/140] woot compiles and segfaults i win --- lib/hashtable.cc | 35 ++++++++++++++++------------- lib/hashtable.hh | 48 +++++++++++++++++++-------------------- python/_khmermodule.cc | 51 ++++++++++++++++++++++++++++++------------ 3 files changed, 80 insertions(+), 54 deletions(-) diff --git a/lib/hashtable.cc b/lib/hashtable.cc index 0af75db535..2d6e74d38b 100644 --- a/lib/hashtable.cc +++ b/lib/hashtable.cc @@ -1943,7 +1943,7 @@ void Hashtable::extract_unique_paths(std::string seq, i++; } } - +} /* * Pretty much copy-pasta * Might be time for a refactor: could do a general consume_fasta @@ -1999,7 +1999,9 @@ Hashtable::consume_fasta_and_tag_with_colors( TraceLogger:: TLVL_DEBUG2, "Starting trace of 'consume_fasta_and_tag'....\n" ); - + + Color _tag_color = 0; + Color * the_color = new Color(_tag_color++); // Iterate through the 
reads and consume their k-mers. while (!parser->is_complete( )) { @@ -2012,8 +2014,8 @@ Hashtable::consume_fasta_and_tag_with_colors( // TODO: make threadsafe! consume_sequence_and_tag_with_colors( read.sequence, this_n_consumed, - _tag_color ); - ++_tag_color; + *the_color ); + the_color = new Color(_tag_color++); #ifdef WITH_INTERNAL_METRICS hasher.pmetrics.start_timers( ); @@ -2059,7 +2061,10 @@ Hashtable::consume_fasta_and_tag_with_colors( } - +void Hashtable::link_tag_and_color(HashIntoType& kmer, Color& kmer_color) { + tag_colors.insert(TagColorPtrPair(kmer, &kmer_color)); + color_tag_ptrs.insert(ColorTagPtrPair(kmer_color, &kmer)); +} /* This is essentially the same code as above, only it assigns colors to the * tags through multimap TagColorMap defined in hashtable.hh, declared in * hashbits.hh @@ -2096,7 +2101,7 @@ void Hashtable::consume_sequence_and_tag_with_colors(const std::string& seq, // Coloring code // TODO: MAKE THREADSAFE! - if (!_cmap_contains_color(color_map, kmer, current_color)) { + if (!_cmap_contains_color(tag_colors, kmer, current_color)) { ACQUIRE_TAG_COLORS_SPIN_LOCK link_tag_and_color(kmer, current_color); RELEASE_TAG_COLORS_SPIN_LOCK @@ -2124,7 +2129,7 @@ void Hashtable::consume_sequence_and_tag_with_colors(const std::string& seq, // Coloring code // TODO: MAKE THREADSAFE! ACQUIRE_TAG_COLORS_SPIN_LOCK - link_tag_and_color(kmer, current_color) + link_tag_and_color(kmer, current_color); RELEASE_TAG_COLORS_SPIN_LOCK if (found_tags) { found_tags->insert(kmer); } @@ -2139,7 +2144,7 @@ void Hashtable::consume_sequence_and_tag_with_colors(const std::string& seq, RELEASE_ALL_TAGS_SPIN_LOCK // Color code: TODO: MAKE THREADSAFE! 
- link_tag_and_color(kmer, current_color) + link_tag_and_color(kmer, current_color); if (found_tags) { found_tags->insert(kmer); } } @@ -2157,17 +2162,17 @@ void Hashtable::sweep_sequence_for_colors(const std::string& seq, SeenSet tagged_kmers; //ColorPtrSet found_colors; - const unsigned char ksize = _ht->ktsize(); HashIntoType kmer_f, kmer_r, kmer; KMerIterator kmers(seq.c_str(), _ksize); - HashIntoType kmer_s; + std::string kmer_s; while (!kmers.done()) { - kmer_s = kmers.next(); - kmer = _hash(kmer_s.c_str(), ksize, kmer_f, kmer_r); + kmer = kmers.next(); + kmer_s = _revhash(kmer, _ksize); + _hash(kmer_s.c_str(), _ksize, kmer_f, kmer_r); - find_all_tags(kmer_f, kmer_r, tagged_kmers, _ht->all_tags, + partition->find_all_tags(kmer_f, kmer_r, tagged_kmers, all_tags, break_on_stoptags, stop_big_traversals); traverse_colors_and_resolve(tagged_kmers, found_colors); } @@ -2179,9 +2184,9 @@ void Hashtable::traverse_colors_and_resolve(const SeenSet& tagged_kmers, SeenSet::const_iterator si; unsigned int num_colors = 0; for (si=tagged_kmers.begin(); si!=tagged_kmers.end(); ++si) { - tag = *si; + HashIntoType tag = *si; // get the colors associated with this tag - num_colors = _get_tag_colors(tag, tag_colors, found_colors) + num_colors = _get_tag_colors(tag, tag_colors, found_colors); if (num_colors > 1) { // reconcile colors // for now do nothing ha diff --git a/lib/hashtable.hh b/lib/hashtable.hh index 9931ee1da3..61c60a7869 100644 --- a/lib/hashtable.hh +++ b/lib/hashtable.hh @@ -59,13 +59,6 @@ namespace khmer { virtual void accumulate_timer_deltas( uint32_t metrics_key ); - }; - - struct tag_color_info { - HashIntoType kmer; - SeenSet tagged_kmers; - - }; // @@ -194,9 +187,9 @@ namespace khmer { HashIntoType& kmer, Color& the_color) { - std::pair ret; + std::pair ret; ret = cmap.equal_range(kmer); - for (TagColorPtrMap::iterator it=ret.first; it!=ret.second; ++it) { + for (TagColorPtrMap::const_iterator it=ret.first; it!=ret.second; ++it) { if (*(it->second) == 
the_color) return true; } return false; @@ -206,21 +199,21 @@ namespace khmer { bool _cmap_contains_tag(const ColorTagPtrMap& cmap, Color& the_color, HashIntoType& kmer) { - std::pair ret; + std::pair ret; ret = cmap.equal_range(the_color); - for (ColorTagPtrMap::iterator it=ret.first; it!=ret.second; ++it) { + for (ColorTagPtrMap::const_iterator it=ret.first; it!=ret.second; ++it) { if(*(it->second) == kmer) return true; } return false; } unsigned int _get_tag_colors(const HashIntoType& tag, - const TagColorPrtMap& cmap, + const TagColorPtrMap& cmap, ColorPtrSet& found_colors) { unsigned int num_colors = 0; - std::pair ret; + std::pair ret; ret = cmap.equal_range(tag); - for (TagColorPtrMap::iterator it=ret.first; it!=ret.second; ++it) { + for (TagColorPtrMap::const_iterator it=ret.first; it!=ret.second; ++it) { found_colors.insert(it->second); ++num_colors; } @@ -231,10 +224,10 @@ namespace khmer { const ColorTagPtrMap& cmap, TagPtrSet& colored_tags) { unsigned int num_tags = 0; - std::pair ret; + std::pair ret; ret = cmap.equal_range(color); - for (ColorTagPtrMap::iterator it=ret.first; it!=ret.second; ++it) { - color_tags.insert(it->second); + for (ColorTagPtrMap::const_iterator it=ret.first; it!=ret.second; ++it) { + colored_tags.insert(it->second); ++num_tags; } return num_tags; @@ -378,8 +371,8 @@ namespace khmer { SeenSet all_tags; SeenSet stop_tags; SeenSet repart_small_tags; - TagColorMap tag_colors; - ColorTagPrtMap color_tag_ptrs; + TagColorPtrMap tag_colors; + ColorTagPtrMap color_tag_ptrs; // accessor to get 'k' const WordLength ksize() const { return _ksize; } @@ -519,12 +512,17 @@ namespace khmer { void consume_sequence_and_tag_with_colors(const std::string& seq, unsigned long long& n_consumed, Color& current_color, - SeenSet * new_tags = 0) + SeenSet * new_tags = 0); - void link_tag_and_color(HashIntoType& kmer, Color& color) { - tag_colors.insert(TagColorPtrPair(kmer, ¤t_color)); - color_tag_ptrs.insert(ColorTagPtrPair(current_color, &kmer)); - } 
+ void link_tag_and_color(HashIntoType& kmer, Color& color); + + void sweep_sequence_for_colors(const std::string& seq, + ColorPtrSet& found_colors, + bool break_on_stoptags, + bool stop_big_traversals); + + void traverse_colors_and_resolve(const SeenSet& tagged_kmers, + ColorPtrSet& found_colors); void consume_fasta_and_traverse(const std::string &filename, unsigned int distance, @@ -637,7 +635,7 @@ namespace khmer { #define ACQUIRE_TAG_COLORS_SPIN_LOCK \ while(!__sync_bool_compare_and_swap( &_tag_colors_spin_lock, 0, 1)); -#define ACQUIRE_TAG_COLORS_SPIN_LOCK \ +#define RELEASE_TAG_COLORS_SPIN_LOCK \ __sync_bool_compare_and_swap( &_tag_colors_spin_lock, 1, 0); #endif // HASHTABLE_HH diff --git a/python/_khmermodule.cc b/python/_khmermodule.cc index 4efbe2ca8c..66089766c0 100644 --- a/python/_khmermodule.cc +++ b/python/_khmermodule.cc @@ -2195,15 +2195,15 @@ static PyObject * hash_do_subset_partition_with_abundance(PyObject * self, PyObj return (PyObject *) subset_obj; } -static PyObject * hash_consume_fasta_and_tag_with_colors(PyObject * self, PyObject * args) +static PyObject * hashbits_consume_fasta_and_tag_with_colors(PyObject * self, PyObject * args) { - khmer_KCountingHashObject * me = (khmer_KCountingHashObject *) self; - khmer::CountingHash * counting = me->counting; + khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; + khmer::Hashbits * hb = me->hashbits; - PyObject * callback_obs = NULL; + PyObject * callback_obj = NULL; PyObject * rparser_obj = NULL; - if (!PyArg_ParseType(args, "O|O", &rparser_obj, &callback_obj)) { + if (!PyArg_ParseTuple(args, "O|O", &rparser_obj, &callback_obj)) { return NULL; } @@ -2215,10 +2215,10 @@ static PyObject * hash_consume_fasta_and_tag_with_colors(PyObject * self, PyObje Py_BEGIN_ALLOW_THREADS try { - counting->consume_fasta_and_tag_with_colors(rparser, total_reads, n_consumed, + hb->consume_fasta_and_tag_with_colors(rparser, total_reads, n_consumed, _report_fn, callback_obj); } catch (_khmer_signal &e) { - 
exc_raised = TRUE; + exc_raised = true; } Py_END_ALLOW_THREADS if (exc_raised) return NULL; @@ -2227,9 +2227,9 @@ static PyObject * hash_consume_fasta_and_tag_with_colors(PyObject * self, PyObje } -static PyObject * hash_sweep_sequence_for_colors(PyObject * self, PyObject * args) { - khmer_KCountingHashObject * me = (khmer_KCountingHashObject *) self; - khmer::CountingHash * counting = me->counting; +static PyObject * hashbits_sweep_sequence_for_colors(PyObject * self, PyObject * args) { + khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; + khmer::Hashbits * hb = me->hashbits; char * seq = NULL; bool break_on_stoptags = NULL; @@ -2239,11 +2239,33 @@ static PyObject * hash_sweep_sequence_for_colors(PyObject * self, PyObject * arg return NULL; } - if (strlen(kmer_s) < counting->ksize()) { + if (strlen(seq) < hb->ksize()) { return NULL; } - _pre_partition_info + //std::pair ret; + ColorPtrSet found_colors; + + bool exc_raised = false; + Py_BEGIN_ALLOW_THREADS + try { + hb->sweep_sequence_for_colors(seq, found_colors, break_on_stoptags, stop_big_traversals); + } catch (_khmer_signal &e) { + exc_raised = true; + } + Py_END_ALLOW_THREADS + + if (exc_raised) return NULL; + + PyObject * x = PyList_New(found_colors.size()); + khmer::ColorPtrSet::const_iterator si; + unsigned long long i = 0; + for (si=found_colors.begin(); si!=found_colors.end(); ++si) { + PyList_SET_ITEM(x, i, Py_BuildValue("i", *(*si))); + i++; + } + + return x; } static PyMethodDef khmer_counting_methods[] = { @@ -2282,7 +2304,7 @@ static PyMethodDef khmer_counting_methods[] = { { "consume_fasta_and_tag", hash_consume_fasta_and_tag, METH_VARARGS, "Count all k-mers in a given file" }, { "do_subset_partition_with_abundance", hash_do_subset_partition_with_abundance, METH_VARARGS, "" }, { "find_all_tags_truncate_on_abundance", hash_find_all_tags_truncate_on_abundance, METH_VARARGS, "" }, - { "consume_fasta_and_tag_with_colors", hash_consume_fasta_and_tag_with_colors, METH_VARARGS, "" }, + {NULL, 
NULL, 0, NULL} /* sentinel */ }; @@ -3937,7 +3959,8 @@ static PyMethodDef khmer_hashbits_methods[] = { { "traverse_from_tags", hashbits_traverse_from_tags, METH_VARARGS, "" }, { "repartition_largest_partition", hashbits_repartition_largest_partition, METH_VARARGS, "" }, { "get_median_count", hashbits_get_median_count, METH_VARARGS, "Get the median, average, and stddev of the k-mer counts in the string" }, - + { "consume_fasta_and_tag_with_colors", hashbits_consume_fasta_and_tag_with_colors, METH_VARARGS, "" }, + { "sweep_sequence_for_colors", hashbits_sweep_sequence_for_colors, METH_VARARGS, "" }, {NULL, NULL, 0, NULL} /* sentinel */ }; From db935a1e5b3a0c1725502278336423a41b1f10ca Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Tue, 10 Sep 2013 18:13:17 -0400 Subject: [PATCH 015/140] consuming, coloring, and sweeping functions successfully integrated with python glue, appear to be outputting correct colors --- lib/hashbits.hh | 4 - lib/hashtable.cc | 4 + lib/hashtable.hh | 2 +- python/_khmermodule.cc | 177 ++++++++++++++++++++++++----------------- 4 files changed, 109 insertions(+), 78 deletions(-) diff --git a/lib/hashbits.hh b/lib/hashbits.hh index 6c0f19698c..a695fb30da 100644 --- a/lib/hashbits.hh +++ b/lib/hashbits.hh @@ -22,8 +22,6 @@ namespace khmer { HashIntoType _n_overlap_kmers; Byte ** _counts; - Color _tag_color; - virtual void _allocate_counters() { _n_tables = _tablesizes.size(); @@ -50,8 +48,6 @@ namespace khmer { _n_unique_kmers = 0; _n_overlap_kmers = 0; - _tag_color = 0; - _allocate_counters(); } diff --git a/lib/hashtable.cc b/lib/hashtable.cc index 2d6e74d38b..00052cbdb1 100644 --- a/lib/hashtable.cc +++ b/lib/hashtable.cc @@ -1950,6 +1950,10 @@ void Hashtable::extract_unique_paths(std::string seq, * function which accepts a consume_sequence function pointer as a parameter */ +void Hashtable::do_nothing() { + std::cout << "doing nothing\n"; +} + void Hashtable::consume_fasta_and_tag_with_colors( std:: string const &filename, diff --git 
a/lib/hashtable.hh b/lib/hashtable.hh index 61c60a7869..8655751ad7 100644 --- a/lib/hashtable.hh +++ b/lib/hashtable.hh @@ -493,7 +493,7 @@ namespace khmer { CallbackFn callback = 0, void * callback_data = 0); - + void do_nothing(); void consume_fasta_and_tag_with_colors( std::string const &filename, diff --git a/python/_khmermodule.cc b/python/_khmermodule.cc index 66089766c0..dc8aff3932 100644 --- a/python/_khmermodule.cc +++ b/python/_khmermodule.cc @@ -2195,79 +2195,6 @@ static PyObject * hash_do_subset_partition_with_abundance(PyObject * self, PyObj return (PyObject *) subset_obj; } -static PyObject * hashbits_consume_fasta_and_tag_with_colors(PyObject * self, PyObject * args) -{ - khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; - khmer::Hashbits * hb = me->hashbits; - - PyObject * callback_obj = NULL; - PyObject * rparser_obj = NULL; - - if (!PyArg_ParseTuple(args, "O|O", &rparser_obj, &callback_obj)) { - return NULL; - } - - khmer:: read_parsers:: IParser * rparser = - _PyObject_to_khmer_ReadParser( rparser_obj ); - unsigned long long n_consumed; - unsigned int total_reads; - bool exc_raised = false; - - Py_BEGIN_ALLOW_THREADS - try { - hb->consume_fasta_and_tag_with_colors(rparser, total_reads, n_consumed, - _report_fn, callback_obj); - } catch (_khmer_signal &e) { - exc_raised = true; - } - Py_END_ALLOW_THREADS - if (exc_raised) return NULL; - - return Py_BuildValue("iL", total_reads, n_consumed); - -} - -static PyObject * hashbits_sweep_sequence_for_colors(PyObject * self, PyObject * args) { - khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; - khmer::Hashbits * hb = me->hashbits; - - char * seq = NULL; - bool break_on_stoptags = NULL; - bool stop_big_traversals = NULL; - - if(!PyArg_ParseTuple(args, "spp", &seq, &break_on_stoptags, &stop_big_traversals)) { - return NULL; - } - - if (strlen(seq) < hb->ksize()) { - return NULL; - } - - //std::pair ret; - ColorPtrSet found_colors; - - bool exc_raised = false; - Py_BEGIN_ALLOW_THREADS 
- try { - hb->sweep_sequence_for_colors(seq, found_colors, break_on_stoptags, stop_big_traversals); - } catch (_khmer_signal &e) { - exc_raised = true; - } - Py_END_ALLOW_THREADS - - if (exc_raised) return NULL; - - PyObject * x = PyList_New(found_colors.size()); - khmer::ColorPtrSet::const_iterator si; - unsigned long long i = 0; - for (si=found_colors.begin(); si!=found_colors.end(); ++si) { - PyList_SET_ITEM(x, i, Py_BuildValue("i", *(*si))); - i++; - } - - return x; -} - static PyMethodDef khmer_counting_methods[] = { { "ksize", hash_get_ksize, METH_VARARGS, "" }, { "hashsizes", hash_get_hashsizes, METH_VARARGS, "" }, @@ -3891,6 +3818,109 @@ static PyObject * hashbits_get_median_count(PyObject * self, PyObject * args) return Py_BuildValue("iff", med, average, stddev); } +static PyObject * hashbits_consume_fasta_and_tag_with_colors(PyObject * self, PyObject * args) +{ + khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; + khmer::Hashbits * hb = me->hashbits; + + std::ofstream outfile; + outfile.open("lazyoutput.txt"); + outfile << ">> we're in c++ land folks\n"; + + char * filename; + PyObject * callback_obj = NULL; + + if (!PyArg_ParseTuple(args, "s|O", &filename, &callback_obj)) { + return NULL; + } + + unsigned long long n_consumed; + unsigned int total_reads; + bool exc_raised = false; + + outfile << ">> about to start the tagging function...\n"; + outfile.close(); + //Py_BEGIN_ALLOW_THREADS + try { + hb->consume_fasta_and_tag_with_colors(filename, total_reads, n_consumed, + _report_fn, callback_obj); + } catch (_khmer_signal &e) { + exc_raised = true; + } + //Py_END_ALLOW_THREADS + if (exc_raised) return NULL; + + return Py_BuildValue("iL", total_reads, n_consumed); + +} + +static PyObject * hashbits_sweep_sequence_for_colors(PyObject * self, PyObject * args) { + khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; + khmer::Hashbits * hb = me->hashbits; + + char * seq = NULL; + PyObject * break_on_stop_tags_o = NULL; + PyObject * 
stop_big_traversals_o = NULL; + + if (!PyArg_ParseTuple(args, "s|OO", &seq, + &break_on_stop_tags_o, + &stop_big_traversals_o)) { + return NULL; + } + + bool break_on_stop_tags = false; + if (break_on_stop_tags_o && PyObject_IsTrue(break_on_stop_tags_o)) { + break_on_stop_tags = true; + } + bool stop_big_traversals = false; + if (stop_big_traversals_o && PyObject_IsTrue(stop_big_traversals_o)) { + stop_big_traversals = true; + } + + if (strlen(seq) < hb->ksize()) { + return NULL; + } + + //std::pair ret; + ColorPtrSet found_colors; + + bool exc_raised = false; + //Py_BEGIN_ALLOW_THREADS + try { + hb->sweep_sequence_for_colors(seq, found_colors, break_on_stop_tags, stop_big_traversals); + } catch (_khmer_signal &e) { + exc_raised = true; + } + //Py_END_ALLOW_THREADS + + if (exc_raised) return NULL; + + PyObject * x = PyList_New(found_colors.size()); + khmer::ColorPtrSet::const_iterator si; + unsigned long long i = 0; + for (si=found_colors.begin(); si!=found_colors.end(); ++si) { + PyList_SET_ITEM(x, i, Py_BuildValue("i", *(*si))); + i++; + } + + return x; +} + +static PyObject * hashbits_do_nothing(PyObject * self, PyObject * args) { + khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; + khmer::Hashbits * hb = me->hashbits; + + bool exc_raised = false; + try { + hb->do_nothing(); + } catch (_khmer_signal &e) { + exc_raised = true; + } + if (exc_raised) return NULL; + + return Py_True; +} + static PyMethodDef khmer_hashbits_methods[] = { { "extract_unique_paths", hashbits_extract_unique_paths, METH_VARARGS, "" }, { "ksize", hashbits_get_ksize, METH_VARARGS, "" }, @@ -3961,6 +3991,7 @@ static PyMethodDef khmer_hashbits_methods[] = { { "get_median_count", hashbits_get_median_count, METH_VARARGS, "Get the median, average, and stddev of the k-mer counts in the string" }, { "consume_fasta_and_tag_with_colors", hashbits_consume_fasta_and_tag_with_colors, METH_VARARGS, "" }, { "sweep_sequence_for_colors", hashbits_sweep_sequence_for_colors, METH_VARARGS, "" }, + 
{ "do_nothing", hashbits_do_nothing, METH_VARARGS, ""}, {NULL, NULL, 0, NULL} /* sentinel */ }; From e6ba5464b1fac3bd583e6f390841f4b3858ade02 Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Tue, 10 Sep 2013 18:35:25 -0400 Subject: [PATCH 016/140] silly testing python script added (need to add nose tests) --- lib/test_coloring.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 lib/test_coloring.py diff --git a/lib/test_coloring.py b/lib/test_coloring.py new file mode 100644 index 0000000000..11811debfd --- /dev/null +++ b/lib/test_coloring.py @@ -0,0 +1,12 @@ +import khmer +print khmer.__file__ + + +ht = khmer.new_hashbits(20,1e8,4) +print '#' * 200 +ht.consume_fasta_and_tag_with_colors('/w/2013-lamprey/syn_part/syn.trinity.fasta') +print ht.sweep_sequence_for_colors('CACACACGGACATCGGAGAGAGGCTGAGACAGCGAGACACACAGAGACAGAGCGGAGAGGGCACAGACAGACAAGAGCATGAGAGATCGGCAGAGCGGTG', False, False) +print ht.sweep_sequence_for_colors('CGCCGTAGTCGTACTGGTTCTCCTCCGTGTACTCGTGCGCTGCCTCCACCTCTGGGCTGCTCATGCCCTCCATGTGACCTTCAGGCATGCCCTCGGAGAT', False, False) +print ht.sweep_sequence_for_colors('GGAGAGCCTGGGGCCAAGCCCGAGGGCATGCCTGAAGGTCACATGGAGGGCATGAGCAGCCCAG', False, False) +print ht.sweep_sequence_for_colors('TTTTTTGAATACGTTTAGTTAATATTTGTACTTCAATTAATAAAAATTTGCTATAATTTTTCCATTATCGCCAGTCACTCGCGTGATATAGGAAAAGGTT', False, False) +print ht.sweep_sequence_for_colors('AAGCAGTGGTATCAACGCAGAGTACGCGGGGACTCTGTCGCTGCTCCTCTAGCACAGAGAGCCAGAGACGGCTTACAGCAGCAGCATCATATAGCCTC', False, False) From 0e350c073e2a7d79db4190becf011f97809cc4c5 Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Fri, 13 Sep 2013 15:36:43 -0400 Subject: [PATCH 017/140] added function to tag fasta by partition id --- lib/hashtable.cc | 54 ++++++++++++++++++++++++++++++++++++++++++ lib/hashtable.hh | 8 ++++++- lib/test_coloring.py | 41 +++++++++++++++++++++++++++----- python/_khmermodule.cc | 34 ++++++++++++++++++++++---- 4 files changed, 126 insertions(+), 11 deletions(-) diff --git a/lib/hashtable.cc 
b/lib/hashtable.cc index 00052cbdb1..d7dc85f3da 100644 --- a/lib/hashtable.cc +++ b/lib/hashtable.cc @@ -2065,10 +2065,64 @@ Hashtable::consume_fasta_and_tag_with_colors( } +void Hashtable::consume_partitioned_fasta_and_tag_with_colors(const std::string &filename, + unsigned int &total_reads, + unsigned long long &n_consumed, + CallbackFn callback, + void * callback_data) +{ + total_reads = 0; + n_consumed = 0; + + IParser* parser = IParser::get_parser(filename.c_str()); + Read read; + + string seq = ""; + + // reset the master subset partition + delete partition; + partition = new SubsetPartition(this); + + // + // iterate through the FASTA file & consume the reads. + // + Color * c; + while(!parser->is_complete()) { + read = parser->get_next_read(); + seq = read.sequence; + + if (check_and_normalize_read(seq)) { + // First, figure out what the partition is (if non-zero), and save that. + c = new Color(_parse_partition_id(read.name)); + + consume_sequence_and_tag_with_colors( seq, + n_consumed, + *c ); + } + + // reset the sequence info, increment read number + total_reads++; + + // run callback, if specified + if (total_reads % CALLBACK_PERIOD == 0 && callback) { + try { + callback("consume_partitioned_fasta_and_tag_with_colors", callback_data, + total_reads, n_consumed); + } catch (...) 
{ + delete parser; + throw; + } + } + } + + delete parser; +} + void Hashtable::link_tag_and_color(HashIntoType& kmer, Color& kmer_color) { tag_colors.insert(TagColorPtrPair(kmer, &kmer_color)); color_tag_ptrs.insert(ColorTagPtrPair(kmer_color, &kmer)); } + /* This is essentially the same code as above, only it assigns colors to the * tags through multimap TagColorMap defined in hashtable.hh, declared in * hashbits.hh diff --git a/lib/hashtable.hh b/lib/hashtable.hh index 8655751ad7..f2564ba0bf 100644 --- a/lib/hashtable.hh +++ b/lib/hashtable.hh @@ -508,7 +508,13 @@ namespace khmer { unsigned long long &n_consumed, CallbackFn callback = NULL, void * callback_data = NULL); - + + void consume_partitioned_fasta_and_tag_with_colors(const std::string &filename, + unsigned int &total_reads, + unsigned long long &n_consumed, + CallbackFn callback, + void * callback_data); + void consume_sequence_and_tag_with_colors(const std::string& seq, unsigned long long& n_consumed, Color& current_color, diff --git a/lib/test_coloring.py b/lib/test_coloring.py index 11811debfd..da8c5eff08 100644 --- a/lib/test_coloring.py +++ b/lib/test_coloring.py @@ -1,12 +1,41 @@ import khmer -print khmer.__file__ +import screed ht = khmer.new_hashbits(20,1e8,4) print '#' * 200 ht.consume_fasta_and_tag_with_colors('/w/2013-lamprey/syn_part/syn.trinity.fasta') -print ht.sweep_sequence_for_colors('CACACACGGACATCGGAGAGAGGCTGAGACAGCGAGACACACAGAGACAGAGCGGAGAGGGCACAGACAGACAAGAGCATGAGAGATCGGCAGAGCGGTG', False, False) -print ht.sweep_sequence_for_colors('CGCCGTAGTCGTACTGGTTCTCCTCCGTGTACTCGTGCGCTGCCTCCACCTCTGGGCTGCTCATGCCCTCCATGTGACCTTCAGGCATGCCCTCGGAGAT', False, False) -print ht.sweep_sequence_for_colors('GGAGAGCCTGGGGCCAAGCCCGAGGGCATGCCTGAAGGTCACATGGAGGGCATGAGCAGCCCAG', False, False) -print ht.sweep_sequence_for_colors('TTTTTTGAATACGTTTAGTTAATATTTGTACTTCAATTAATAAAAATTTGCTATAATTTTTCCATTATCGCCAGTCACTCGCGTGATATAGGAAAAGGTT', False, False) -print 
ht.sweep_sequence_for_colors('AAGCAGTGGTATCAACGCAGAGTACGCGGGGACTCTGTCGCTGCTCCTCTAGCACAGAGAGCCAGAGACGGCTTACAGCAGCAGCATCATATAGCCTC', False, False) +#print ht.sweep_sequence_for_colors('CACACACGGACATCGGAGAGAGGCTGAGACAGCGAGACACACAGAGACAGAGCGGAGAGGGCACAGACAGACAAGAGCATGAGAGATCGGCAGAGCGGTG', False, False) +#print ht.sweep_sequence_for_colors('CGCCGTAGTCGTACTGGTTCTCCTCCGTGTACTCGTGCGCTGCCTCCACCTCTGGGCTGCTCATGCCCTCCATGTGACCTTCAGGCATGCCCTCGGAGAT', False, False) +#print ht.sweep_sequence_for_colors('GGAGAGCCTGGGGCCAAGCCCGAGGGCATGCCTGAAGGTCACATGGAGGGCATGAGCAGCCCAG', False, False) +#print ht.sweep_sequence_for_colors('TTTTTTGAATACGTTTAGTTAATATTTGTACTTCAATTAATAAAAATTTGCTATAATTTTTCCATTATCGCCAGTCACTCGCGTGATATAGGAAAAGGTT', False, False) +#print ht.sweep_sequence_for_colors('AAGCAGTGGTATCAACGCAGAGTACGCGGGGACTCTGTCGCTGCTCCTCTAGCACAGAGAGCCAGAGACGGCTTACAGCAGCAGCATCATATAGCCTC', False, False) + +N=1000000000 + +''' +file_pointers = {} +for n, record in enumerate(screed.open('/w/2013-lamprey/syn_part/syn.sweep.fa')): + if n >= N: + break + if n % 1000 == 0: + print '...processed {} reads'.format(n) + colors = ht.sweep_sequence_for_colors(record.sequence, False, False) + for c in colors: + if c in file_pointers.viewkeys(): + file_pointers[c].write('>{}\n{}\n'.format(record.name, record.sequence)) + else: + file_pointers[c] = open('color_{}.fa'.format(c), 'wb') + file_pointers[c].write('>{}\n{}\n'.format(record.name, record.sequence))\ +''' + +ht = khmer.new_hashbits(25, 1e9,4) +ht.consume_partitioned_fasta_and_tag_with_colors('/w/2013-lamprey/test.fp') + +for n, record in enumerate(screed.open('/w/lamprey-mrnaseq/reads/single/L82-a.fq.gz')): + if n >= N: + break + colors = ht.sweep_sequence_for_colors(record.sequence, False, False) + if colors: + print colors + diff --git a/python/_khmermodule.cc b/python/_khmermodule.cc index dc8aff3932..be0ecb640e 100644 --- a/python/_khmermodule.cc +++ b/python/_khmermodule.cc @@ -3824,8 +3824,6 @@ static PyObject * 
hashbits_consume_fasta_and_tag_with_colors(PyObject * self, Py khmer::Hashbits * hb = me->hashbits; std::ofstream outfile; - outfile.open("lazyoutput.txt"); - outfile << ">> we're in c++ land folks\n"; char * filename; PyObject * callback_obj = NULL; @@ -3838,8 +3836,6 @@ static PyObject * hashbits_consume_fasta_and_tag_with_colors(PyObject * self, Py unsigned int total_reads; bool exc_raised = false; - outfile << ">> about to start the tagging function...\n"; - outfile.close(); //Py_BEGIN_ALLOW_THREADS try { hb->consume_fasta_and_tag_with_colors(filename, total_reads, n_consumed, @@ -3854,6 +3850,34 @@ static PyObject * hashbits_consume_fasta_and_tag_with_colors(PyObject * self, Py } +static PyObject * hashbits_consume_partitioned_fasta_and_tag_with_colors( + PyObject * self, PyObject * args) +{ + khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; + khmer::Hashbits * hashbits = me->hashbits; + + char * filename; + PyObject * callback_obj = NULL; + + if (!PyArg_ParseTuple(args, "s|O", &filename, &callback_obj)) { + return NULL; + } + + // call the C++ function, and trap signals => Python + + unsigned long long n_consumed; + unsigned int total_reads; + + try { + hashbits->consume_partitioned_fasta_and_tag_with_colors(filename, + total_reads, n_consumed, _report_fn, callback_obj); + } catch (_khmer_signal &e) { + return NULL; + } + + return Py_BuildValue("iL", total_reads, n_consumed); +} + static PyObject * hashbits_sweep_sequence_for_colors(PyObject * self, PyObject * args) { khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; khmer::Hashbits * hb = me->hashbits; @@ -3992,6 +4016,8 @@ static PyMethodDef khmer_hashbits_methods[] = { { "consume_fasta_and_tag_with_colors", hashbits_consume_fasta_and_tag_with_colors, METH_VARARGS, "" }, { "sweep_sequence_for_colors", hashbits_sweep_sequence_for_colors, METH_VARARGS, "" }, { "do_nothing", hashbits_do_nothing, METH_VARARGS, ""}, + {"consume_partitioned_fasta_and_tag_with_colors", 
hashbits_consume_partitioned_fasta_and_tag_with_colors, METH_VARARGS, "" }, + {NULL, NULL, 0, NULL} /* sentinel */ }; From 2b1b6b2907f5380262bebc4642967ee2e594160b Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 15 Sep 2013 21:50:21 -0400 Subject: [PATCH 018/140] a few comments on @cswelcher code --- lib/hashtable.cc | 3 +++ lib/hashtable.hh | 1 + 2 files changed, 4 insertions(+) diff --git a/lib/hashtable.cc b/lib/hashtable.cc index d7dc85f3da..773b515463 100644 --- a/lib/hashtable.cc +++ b/lib/hashtable.cc @@ -2005,6 +2005,9 @@ Hashtable::consume_fasta_and_tag_with_colors( ); Color _tag_color = 0; + + // @CTB: pls keep increment separate from function call so that + // order is explicit. Color * the_color = new Color(_tag_color++); // Iterate through the reads and consume their k-mers. while (!parser->is_complete( )) diff --git a/lib/hashtable.hh b/lib/hashtable.hh index f2564ba0bf..bdcb303c95 100644 --- a/lib/hashtable.hh +++ b/lib/hashtable.hh @@ -493,6 +493,7 @@ namespace khmer { CallbackFn callback = 0, void * callback_data = 0); + // @CTB ahem? 
void do_nothing(); void consume_fasta_and_tag_with_colors( From 67c1b458ddb223c50a36a5fc069827e1f58775b5 Mon Sep 17 00:00:00 2001 From: CS Welcher Date: Wed, 18 Sep 2013 16:39:57 -0700 Subject: [PATCH 019/140] exposing more stuff for python glue --- lib/hashtable.cc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lib/hashtable.cc b/lib/hashtable.cc index d7dc85f3da..ba7fb25368 100644 --- a/lib/hashtable.cc +++ b/lib/hashtable.cc @@ -2236,6 +2236,14 @@ void Hashtable::sweep_sequence_for_colors(const std::string& seq, } } +ColorPtrSet& Hashtable::get_tag_colors(const HashIntoType& tag) { + ColorPtrSet colors; + num_colors = _get_tag_colors(tag, tag_colors, colors); + return &colors; +} + + + void Hashtable::traverse_colors_and_resolve(const SeenSet& tagged_kmers, ColorPtrSet& found_colors) { From d948a6e97eb3758b04e104563f9a2693fef180ea Mon Sep 17 00:00:00 2001 From: CS Welcher Date: Thu, 19 Sep 2013 14:20:39 -0700 Subject: [PATCH 020/140] added more glue functions --- lib/color_tst.py | 41 +++++++++++++++++++++++++++++++++ lib/hashtable.cc | 14 ++++++++---- lib/hashtable.hh | 5 ++++- lib/khmer.hh | 6 +++++ python/_khmermodule.cc | 51 ++++++++++++++++++++++++++++++++++++++++-- 5 files changed, 110 insertions(+), 7 deletions(-) create mode 100644 lib/color_tst.py diff --git a/lib/color_tst.py b/lib/color_tst.py new file mode 100644 index 0000000000..da8c5eff08 --- /dev/null +++ b/lib/color_tst.py @@ -0,0 +1,41 @@ +import khmer +import screed + + +ht = khmer.new_hashbits(20,1e8,4) +print '#' * 200 +ht.consume_fasta_and_tag_with_colors('/w/2013-lamprey/syn_part/syn.trinity.fasta') +#print ht.sweep_sequence_for_colors('CACACACGGACATCGGAGAGAGGCTGAGACAGCGAGACACACAGAGACAGAGCGGAGAGGGCACAGACAGACAAGAGCATGAGAGATCGGCAGAGCGGTG', False, False) +#print ht.sweep_sequence_for_colors('CGCCGTAGTCGTACTGGTTCTCCTCCGTGTACTCGTGCGCTGCCTCCACCTCTGGGCTGCTCATGCCCTCCATGTGACCTTCAGGCATGCCCTCGGAGAT', False, False) +#print 
ht.sweep_sequence_for_colors('GGAGAGCCTGGGGCCAAGCCCGAGGGCATGCCTGAAGGTCACATGGAGGGCATGAGCAGCCCAG', False, False) +#print ht.sweep_sequence_for_colors('TTTTTTGAATACGTTTAGTTAATATTTGTACTTCAATTAATAAAAATTTGCTATAATTTTTCCATTATCGCCAGTCACTCGCGTGATATAGGAAAAGGTT', False, False) +#print ht.sweep_sequence_for_colors('AAGCAGTGGTATCAACGCAGAGTACGCGGGGACTCTGTCGCTGCTCCTCTAGCACAGAGAGCCAGAGACGGCTTACAGCAGCAGCATCATATAGCCTC', False, False) + +N=1000000000 + +''' +file_pointers = {} +for n, record in enumerate(screed.open('/w/2013-lamprey/syn_part/syn.sweep.fa')): + if n >= N: + break + if n % 1000 == 0: + print '...processed {} reads'.format(n) + colors = ht.sweep_sequence_for_colors(record.sequence, False, False) + for c in colors: + if c in file_pointers.viewkeys(): + file_pointers[c].write('>{}\n{}\n'.format(record.name, record.sequence)) + else: + file_pointers[c] = open('color_{}.fa'.format(c), 'wb') + file_pointers[c].write('>{}\n{}\n'.format(record.name, record.sequence))\ +''' + +ht = khmer.new_hashbits(25, 1e9,4) +ht.consume_partitioned_fasta_and_tag_with_colors('/w/2013-lamprey/test.fp') + +for n, record in enumerate(screed.open('/w/lamprey-mrnaseq/reads/single/L82-a.fq.gz')): + if n >= N: + break + colors = ht.sweep_sequence_for_colors(record.sequence, False, False) + if colors: + print colors + diff --git a/lib/hashtable.cc b/lib/hashtable.cc index 986a7698f0..6fa5525909 100644 --- a/lib/hashtable.cc +++ b/lib/hashtable.cc @@ -2239,13 +2239,19 @@ void Hashtable::sweep_sequence_for_colors(const std::string& seq, } } -ColorPtrSet& Hashtable::get_tag_colors(const HashIntoType& tag) { +ColorPtrSet Hashtable::get_tag_colors(const HashIntoType& tag) { ColorPtrSet colors; - num_colors = _get_tag_colors(tag, tag_colors, colors); - return &colors; + unsigned int num_colors; + _get_tag_colors(tag, tag_colors, colors); + return colors; } - +TagPtrSet Hashtable::get_color_tags(const Color& color) { + TagPtrSet tags; + unsigned int num_tags; + _get_tags_from_color(color, color_tag_ptrs, 
tags); + return tags; +} void Hashtable::traverse_colors_and_resolve(const SeenSet& tagged_kmers, ColorPtrSet& found_colors) { diff --git a/lib/hashtable.hh b/lib/hashtable.hh index bdcb303c95..c7e09be11f 100644 --- a/lib/hashtable.hh +++ b/lib/hashtable.hh @@ -520,6 +520,9 @@ namespace khmer { unsigned long long& n_consumed, Color& current_color, SeenSet * new_tags = 0); + + ColorPtrSet get_tag_colors(const HashIntoType& tag); + TagPtrSet get_color_tags(const Color& color); void link_tag_and_color(HashIntoType& kmer, Color& color); @@ -528,7 +531,7 @@ namespace khmer { bool break_on_stoptags, bool stop_big_traversals); - void traverse_colors_and_resolve(const SeenSet& tagged_kmers, + void traverse_colors_and_resolve(const SeenSet& tagged_kmers, ColorPtrSet& found_colors); void consume_fasta_and_traverse(const std::string &filename, diff --git a/lib/khmer.hh b/lib/khmer.hh index e096919e62..2bb275eab9 100644 --- a/lib/khmer.hh +++ b/lib/khmer.hh @@ -94,6 +94,12 @@ namespace khmer { typedef std::pair ColorTagPtrPair; typedef std::set ColorPtrSet; typedef std::set TagPtrSet; + + Template + void deallocate_ptr_set(T& s) { + for (typename T::iterator i = c.begin(); i != c.end(); ++i) + delete *i; + } } #endif // KHMER_HH diff --git a/python/_khmermodule.cc b/python/_khmermodule.cc index be0ecb640e..67d5fb1cca 100644 --- a/python/_khmermodule.cc +++ b/python/_khmermodule.cc @@ -2991,7 +2991,7 @@ static PyObject * hashbits_consume_partitioned_fasta(PyObject * self, PyObject * try { hashbits->consume_partitioned_fasta(filename, total_reads, n_consumed, _report_fn, callback_obj); - } catch (_khmer_signal &e) { + } catch (_khmer_signal) { return NULL; } @@ -3945,6 +3945,52 @@ static PyObject * hashbits_do_nothing(PyObject * self, PyObject * args) { return Py_True; } +// Same as find_all_tags, but returns tags in a way actually useable by python +static PyObject * hashbits_get_all_tags(PyObject * self, PyObject *args) +{ + khmer_KHashbitsObject * me = (khmer_KHashbitsObject 
*) self; + khmer::Hashbits * hashbits = me->hashbits; + + char * kmer_s = NULL; + + if (!PyArg_ParseTuple(args, "s", &kmer_s)) { + return NULL; + } + + if (strlen(kmer_s) < hashbits->ksize()) { // @@ + return NULL; + } + + khmer::SeenSet tagged_kmers; + + //Py_BEGIN_ALLOW_THREADS + + khmer::HashIntoType kmer, kmer_f, kmer_r; + kmer = khmer::_hash(kmer_s, hashbits->ksize(), kmer_f, kmer_r); + + hashbits->partition->find_all_tags(kmer_f, kmer_r, tagged_kmers, + hashbits->all_tags); + hashbits->add_kmer_to_tags(kmer); + + //Py_END_ALLOW_THREADS + + PyObject * x = PyList_New(tagged_kmers.size()); + khmer::SeenSet::const_iterator si; + unsigned long long i = 0; + for (si=tagged_kmers.begin(); si!=tagged_kmers.end(); ++si) { + //std::string kmer_s = _revhash(*si, hashbits->ksize()); + PyList_SET_ITEM(x, i, Py_BuildValue("i", *si)); + i++; + } + + return x; +} + + +static PyObject * hashbits_get_tag_colors(PyObject * self, PyObject * args) { + return Py_True; +} + static PyMethodDef khmer_hashbits_methods[] = { { "extract_unique_paths", hashbits_extract_unique_paths, METH_VARARGS, "" }, { "ksize", hashbits_get_ksize, METH_VARARGS, "" }, @@ -4017,7 +4063,8 @@ static PyMethodDef khmer_hashbits_methods[] = { { "sweep_sequence_for_colors", hashbits_sweep_sequence_for_colors, METH_VARARGS, "" }, { "do_nothing", hashbits_do_nothing, METH_VARARGS, ""}, {"consume_partitioned_fasta_and_tag_with_colors", hashbits_consume_partitioned_fasta_and_tag_with_colors, METH_VARARGS, "" }, - + {"get_all_tags", hashbits_get_all_tags, METH_VARARGS, "" }, + {NULL, NULL, 0, NULL} /* sentinel */ }; From ba9cab4eb2359b574690f5b592ee932e48317a19 Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Tue, 24 Sep 2013 16:50:28 -0400 Subject: [PATCH 021/140] finished tag export to python land and fixed tag parsing to ull int. 
added direct get_tag_colors to python land --- lib/hashtable.cc | 5 +---- lib/hashtable.hh | 3 --- python/_khmermodule.cc | 49 +++++++++++++++++++++++++----------------- 3 files changed, 30 insertions(+), 27 deletions(-) diff --git a/lib/hashtable.cc b/lib/hashtable.cc index 6fa5525909..21b1927304 100644 --- a/lib/hashtable.cc +++ b/lib/hashtable.cc @@ -1946,14 +1946,11 @@ void Hashtable::extract_unique_paths(std::string seq, } /* * Pretty much copy-pasta + * @cswelcher * Might be time for a refactor: could do a general consume_fasta * function which accepts a consume_sequence function pointer as a parameter */ -void Hashtable::do_nothing() { - std::cout << "doing nothing\n"; -} - void Hashtable::consume_fasta_and_tag_with_colors( std:: string const &filename, diff --git a/lib/hashtable.hh b/lib/hashtable.hh index c7e09be11f..739115038b 100644 --- a/lib/hashtable.hh +++ b/lib/hashtable.hh @@ -492,9 +492,6 @@ namespace khmer { unsigned long long &n_consumed, CallbackFn callback = 0, void * callback_data = 0); - - // @CTB ahem? 
- void do_nothing(); void consume_fasta_and_tag_with_colors( std::string const &filename, diff --git a/python/_khmermodule.cc b/python/_khmermodule.cc index 67d5fb1cca..3e8831a69d 100644 --- a/python/_khmermodule.cc +++ b/python/_khmermodule.cc @@ -3923,28 +3923,13 @@ static PyObject * hashbits_sweep_sequence_for_colors(PyObject * self, PyObject * khmer::ColorPtrSet::const_iterator si; unsigned long long i = 0; for (si=found_colors.begin(); si!=found_colors.end(); ++si) { - PyList_SET_ITEM(x, i, Py_BuildValue("i", *(*si))); + PyList_SET_ITEM(x, i, Py_BuildValue("K", *(*si))); i++; } return x; } -static PyObject * hashbits_do_nothing(PyObject * self, PyObject * args) { - khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; - khmer::Hashbits * hb = me->hashbits; - - bool exc_raised = false; - try { - hb->do_nothing(); - } catch (_khmer_signal &e) { - exc_raised = true; - } - if (exc_raised) return NULL; - - return Py_True; -} - // Same as find_all_tags, but returns tags in a way actually useable by python static PyObject * hashbits_get_all_tags(PyObject * self, PyObject *args) { @@ -3970,7 +3955,6 @@ static PyObject * hashbits_get_all_tags(PyObject * self, PyObject *args) hashbits->partition->find_all_tags(kmer_f, kmer_r, tagged_kmers, hashbits->all_tags); - hashbits->add_kmer_to_tags(kmer); //Py_END_ALLOW_THREADS @@ -3979,7 +3963,8 @@ static PyObject * hashbits_get_all_tags(PyObject * self, PyObject *args) unsigned long long i = 0; for (si=tagged_kmers.begin(); si!=tagged_kmers.end(); ++si) { //std::string kmer_s = _revhash(*si, hashbits->ksize()); - PyList_SET_ITEM(x, i, Py_BuildValue("i", *si)); + // type K for python unsigned long long + PyList_SET_ITEM(x, i, Py_BuildValue("K", *si)); i++; } @@ -3988,7 +3973,31 @@ static PyObject * hashbits_get_all_tags(PyObject * self, PyObject *args) static PyObject * hashbits_get_tag_colors(PyObject * self, PyObject * args) { - return Py_True; + + khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; + 
khmer::Hashbits * hashbits = me->hashbits; + + khmer::HashIntoType tag; + + if (!PyArg_ParseTuple(args, "K", &tag)) { + return NULL; + } + + khmer::ColorPtrSet colors; + + colors = hashbits->get_tag_colors(tag); + + PyObject * x = PyList_New(colors.size()); + khmer::ColorPtrSet::const_iterator si; + unsigned long long i = 0; + for (si=colors.begin(); si!=colors.end(); ++si) { + //std::string kmer_s = _revhash(*si, hashbits->ksize()); + PyList_SET_ITEM(x, i, Py_BuildValue("K", *(*si))); + i++; + } + + return x; + } static PyMethodDef khmer_hashbits_methods[] = { @@ -4061,9 +4070,9 @@ static PyMethodDef khmer_hashbits_methods[] = { { "get_median_count", hashbits_get_median_count, METH_VARARGS, "Get the median, average, and stddev of the k-mer counts in the string" }, { "consume_fasta_and_tag_with_colors", hashbits_consume_fasta_and_tag_with_colors, METH_VARARGS, "" }, { "sweep_sequence_for_colors", hashbits_sweep_sequence_for_colors, METH_VARARGS, "" }, - { "do_nothing", hashbits_do_nothing, METH_VARARGS, ""}, {"consume_partitioned_fasta_and_tag_with_colors", hashbits_consume_partitioned_fasta_and_tag_with_colors, METH_VARARGS, "" }, {"get_all_tags", hashbits_get_all_tags, METH_VARARGS, "" }, + {"get_tag_colors", hashbits_get_tag_colors, METH_VARARGS, ""}, {NULL, NULL, 0, NULL} /* sentinel */ }; From 90f9dba602e5ba5fed2ad20ca0c7c4f5cbbd8dae Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Tue, 24 Sep 2013 17:37:45 -0400 Subject: [PATCH 022/140] mucking about with get_all_tags function, temporarily broken... 
--- python/_khmermodule.cc | 39 ++++++++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/python/_khmermodule.cc b/python/_khmermodule.cc index 3e8831a69d..d8b5e9e998 100644 --- a/python/_khmermodule.cc +++ b/python/_khmermodule.cc @@ -3931,30 +3931,51 @@ static PyObject * hashbits_sweep_sequence_for_colors(PyObject * self, PyObject * } // Same as find_all_tags, but returns tags in a way actually useable by python +// @cswelcher TODO: this is broken az, fix it asap +// need a tags_in_sequence iterator or function in c++ land for reuse in all +// these functions static PyObject * hashbits_get_all_tags(PyObject * self, PyObject *args) { khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; khmer::Hashbits * hashbits = me->hashbits; - char * kmer_s = NULL; + std::string seq = NULL; + PyObject * break_on_stop_tags_o = NULL; + PyObject * stop_big_traversals_o = NULL; - if (!PyArg_ParseTuple(args, "s", &kmer_s)) { + if (!PyArg_ParseTuple(args, "s|OO", &seq, + &break_on_stop_tags_o, + &stop_big_traversals_o)) { return NULL; } - if (strlen(kmer_s) < hashbits->ksize()) { // @@ + bool break_on_stop_tags = false; + if (break_on_stop_tags_o && PyObject_IsTrue(break_on_stop_tags_o)) { + break_on_stop_tags = true; + } + bool stop_big_traversals = false; + if (stop_big_traversals_o && PyObject_IsTrue(stop_big_traversals_o)) { + stop_big_traversals = true; + } + + if (strlen(seq) < hashbits->ksize()) { return NULL; } khmer::SeenSet tagged_kmers; - + khmer::HashIntoType kmer_f, kmer_r, kmer; + KMerIterator kmers(seq.c_str(), hashbits->_ksize()); + std::string kmer_s; //Py_BEGIN_ALLOW_THREADS - khmer::HashIntoType kmer, kmer_f, kmer_r; - kmer = khmer::_hash(kmer_s, hashbits->ksize(), kmer_f, kmer_r); - - hashbits->partition->find_all_tags(kmer_f, kmer_r, tagged_kmers, - hashbits->all_tags); + while (!kmers.done()) { + kmer = kmers.next(); + kmer_s = khmer::_revhash(kmer, hashbits->(_ksize)); + kmer = khmer::_hash(kmer_s.c_str(), 
hashbits->_ksize(), kmer_f, kmer_r); + + hashbits->partition->find_all_tags(kmer_f, kmer_r, tagged_kmers, + hashbits->all_tags, break_on_stoptags, stop_big_traversals); + } //Py_END_ALLOW_THREADS From 56894425d132f0231f6b4bbc3ee75b28551a54d3 Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Wed, 25 Sep 2013 11:55:23 -0400 Subject: [PATCH 023/140] fixed issues with get_all_tags --- python/_khmermodule.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/_khmermodule.cc b/python/_khmermodule.cc index d8b5e9e998..2cf16e4645 100644 --- a/python/_khmermodule.cc +++ b/python/_khmermodule.cc @@ -3939,7 +3939,7 @@ static PyObject * hashbits_get_all_tags(PyObject * self, PyObject *args) khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; khmer::Hashbits * hashbits = me->hashbits; - std::string seq = NULL; + char * seq = NULL; PyObject * break_on_stop_tags_o = NULL; PyObject * stop_big_traversals_o = NULL; @@ -3964,17 +3964,17 @@ static PyObject * hashbits_get_all_tags(PyObject * self, PyObject *args) khmer::SeenSet tagged_kmers; khmer::HashIntoType kmer_f, kmer_r, kmer; - KMerIterator kmers(seq.c_str(), hashbits->_ksize()); + KMerIterator kmers(seq, hashbits->ksize()); std::string kmer_s; //Py_BEGIN_ALLOW_THREADS while (!kmers.done()) { kmer = kmers.next(); - kmer_s = khmer::_revhash(kmer, hashbits->(_ksize)); - kmer = khmer::_hash(kmer_s.c_str(), hashbits->_ksize(), kmer_f, kmer_r); + kmer_s = khmer::_revhash(kmer, hashbits->ksize()); + kmer = khmer::_hash(kmer_s.c_str(), hashbits->ksize(), kmer_f, kmer_r); hashbits->partition->find_all_tags(kmer_f, kmer_r, tagged_kmers, - hashbits->all_tags, break_on_stoptags, stop_big_traversals); + hashbits->all_tags, break_on_stop_tags, stop_big_traversals); } //Py_END_ALLOW_THREADS From 18ebd4a7bf17caa34e6ef6066aa8a83f67ef89eb Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Wed, 25 Sep 2013 12:00:13 -0400 Subject: [PATCH 024/140] fixed in-call increment re ctb comment --- lib/hashtable.cc 
| 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/hashtable.cc b/lib/hashtable.cc index 21b1927304..dfa985dabf 100644 --- a/lib/hashtable.cc +++ b/lib/hashtable.cc @@ -2005,7 +2005,7 @@ Hashtable::consume_fasta_and_tag_with_colors( // @CTB: pls keep increment separate from function call so that // order is explicit. - Color * the_color = new Color(_tag_color++); + Color * the_color = new Color(_tag_color); // Iterate through the reads and consume their k-mers. while (!parser->is_complete( )) { @@ -2019,7 +2019,8 @@ Hashtable::consume_fasta_and_tag_with_colors( consume_sequence_and_tag_with_colors( read.sequence, this_n_consumed, *the_color ); - the_color = new Color(_tag_color++); + _tag_color++; + the_color = new Color(_tag_color); #ifdef WITH_INTERNAL_METRICS hasher.pmetrics.start_timers( ); From 22333f25b7e19d17f85f3075d75bb5617761eee2 Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Wed, 25 Sep 2013 13:35:26 -0400 Subject: [PATCH 025/140] added n_colors python function --- lib/hashtable.hh | 1 + python/_khmermodule.cc | 14 +++++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/lib/hashtable.hh b/lib/hashtable.hh index 739115038b..9d4589c41a 100644 --- a/lib/hashtable.hh +++ b/lib/hashtable.hh @@ -452,6 +452,7 @@ namespace khmer { // Partitioning stuff. 
unsigned int n_tags() const { return all_tags.size(); } + unsigned int N-colors() const { return tag_colors.size(); } void divide_tags_into_subsets(unsigned int subset_size, SeenSet& divvy); diff --git a/python/_khmermodule.cc b/python/_khmermodule.cc index 2cf16e4645..975a045cb5 100644 --- a/python/_khmermodule.cc +++ b/python/_khmermodule.cc @@ -4018,7 +4018,18 @@ static PyObject * hashbits_get_tag_colors(PyObject * self, PyObject * args) { } return x; - +} + +static PyObject * hashbits_n_colors(PyObject * self, PyObject * args) +{ + khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; + khmer::Hashbits * hashbits = me->hashbits; + + if (!PyArg_ParseTuple(args, "")) { + return NULL; + } + + return PyInt_FromLong(hashbits->n_colors()); } static PyMethodDef khmer_hashbits_methods[] = { @@ -4094,6 +4105,7 @@ static PyMethodDef khmer_hashbits_methods[] = { {"consume_partitioned_fasta_and_tag_with_colors", hashbits_consume_partitioned_fasta_and_tag_with_colors, METH_VARARGS, "" }, {"get_all_tags", hashbits_get_all_tags, METH_VARARGS, "" }, {"get_tag_colors", hashbits_get_tag_colors, METH_VARARGS, ""}, + {"n_colors", hashbits_n_colors, METH_VARARGS, ""}, {NULL, NULL, 0, NULL} /* sentinel */ }; From b51df4776e314997d4710363f42ef65d0f398985 Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Wed, 25 Sep 2013 13:52:49 -0400 Subject: [PATCH 026/140] added test_consume_fasta_and_tag_with_colors, test passes --- lib/hashtable.cc | 2 -- lib/hashtable.hh | 2 +- lib/test_coloring.py | 41 ----------------------------------------- tests/test_hashbits.py | 14 ++++++++++++++ 4 files changed, 15 insertions(+), 44 deletions(-) delete mode 100644 lib/test_coloring.py diff --git a/lib/hashtable.cc b/lib/hashtable.cc index dfa985dabf..e3a84260d0 100644 --- a/lib/hashtable.cc +++ b/lib/hashtable.cc @@ -2003,8 +2003,6 @@ Hashtable::consume_fasta_and_tag_with_colors( Color _tag_color = 0; - // @CTB: pls keep increment separate from function call so that - // order is explicit. 
Color * the_color = new Color(_tag_color); // Iterate through the reads and consume their k-mers. while (!parser->is_complete( )) diff --git a/lib/hashtable.hh b/lib/hashtable.hh index 9d4589c41a..c662cc0f5b 100644 --- a/lib/hashtable.hh +++ b/lib/hashtable.hh @@ -452,7 +452,7 @@ namespace khmer { // Partitioning stuff. unsigned int n_tags() const { return all_tags.size(); } - unsigned int N-colors() const { return tag_colors.size(); } + unsigned int n_colors() const { return tag_colors.size(); } void divide_tags_into_subsets(unsigned int subset_size, SeenSet& divvy); diff --git a/lib/test_coloring.py b/lib/test_coloring.py deleted file mode 100644 index da8c5eff08..0000000000 --- a/lib/test_coloring.py +++ /dev/null @@ -1,41 +0,0 @@ -import khmer -import screed - - -ht = khmer.new_hashbits(20,1e8,4) -print '#' * 200 -ht.consume_fasta_and_tag_with_colors('/w/2013-lamprey/syn_part/syn.trinity.fasta') -#print ht.sweep_sequence_for_colors('CACACACGGACATCGGAGAGAGGCTGAGACAGCGAGACACACAGAGACAGAGCGGAGAGGGCACAGACAGACAAGAGCATGAGAGATCGGCAGAGCGGTG', False, False) -#print ht.sweep_sequence_for_colors('CGCCGTAGTCGTACTGGTTCTCCTCCGTGTACTCGTGCGCTGCCTCCACCTCTGGGCTGCTCATGCCCTCCATGTGACCTTCAGGCATGCCCTCGGAGAT', False, False) -#print ht.sweep_sequence_for_colors('GGAGAGCCTGGGGCCAAGCCCGAGGGCATGCCTGAAGGTCACATGGAGGGCATGAGCAGCCCAG', False, False) -#print ht.sweep_sequence_for_colors('TTTTTTGAATACGTTTAGTTAATATTTGTACTTCAATTAATAAAAATTTGCTATAATTTTTCCATTATCGCCAGTCACTCGCGTGATATAGGAAAAGGTT', False, False) -#print ht.sweep_sequence_for_colors('AAGCAGTGGTATCAACGCAGAGTACGCGGGGACTCTGTCGCTGCTCCTCTAGCACAGAGAGCCAGAGACGGCTTACAGCAGCAGCATCATATAGCCTC', False, False) - -N=1000000000 - -''' -file_pointers = {} -for n, record in enumerate(screed.open('/w/2013-lamprey/syn_part/syn.sweep.fa')): - if n >= N: - break - if n % 1000 == 0: - print '...processed {} reads'.format(n) - colors = ht.sweep_sequence_for_colors(record.sequence, False, False) - for c in colors: - if c in file_pointers.viewkeys(): - 
file_pointers[c].write('>{}\n{}\n'.format(record.name, record.sequence)) - else: - file_pointers[c] = open('color_{}.fa'.format(c), 'wb') - file_pointers[c].write('>{}\n{}\n'.format(record.name, record.sequence))\ -''' - -ht = khmer.new_hashbits(25, 1e9,4) -ht.consume_partitioned_fasta_and_tag_with_colors('/w/2013-lamprey/test.fp') - -for n, record in enumerate(screed.open('/w/lamprey-mrnaseq/reads/single/L82-a.fq.gz')): - if n >= N: - break - colors = ht.sweep_sequence_for_colors(record.sequence, False, False) - if colors: - print colors - diff --git a/tests/test_hashbits.py b/tests/test_hashbits.py index da7cb27643..6e8e92fa71 100644 --- a/tests/test_hashbits.py +++ b/tests/test_hashbits.py @@ -500,3 +500,17 @@ def test_simple_median(): assert median == 1 assert average == 1.0 assert stddev == 0.0 + +def test_consume_fasta_and_tag_with_colors(): + hb = khmer.new_hashbits(20, 1e7, 4) + + filename = utils.get_test_data('test-transcript.fa') + total_reads, n_consumed = hb.consume_fasta_and_tag_with_colors(filename) + + #assert n_consumed == 3 + assert total_reads == 3 + + assert hb.n_colors() == 3 + + + From d131776756a90d9bb8e1cb468d8b593bcb35401e Mon Sep 17 00:00:00 2001 From: CS Welcher Date: Wed, 25 Sep 2013 16:18:44 -0400 Subject: [PATCH 027/140] fixed consume_partitioned_fasta_and_tag_with_colors to properly check for color existence and use color pointers --- lib/hashtable.cc | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/lib/hashtable.cc b/lib/hashtable.cc index e3a84260d0..ae20429ea3 100644 --- a/lib/hashtable.cc +++ b/lib/hashtable.cc @@ -2085,14 +2085,21 @@ void Hashtable::consume_partitioned_fasta_and_tag_with_colors(const std::string // // iterate through the FASTA file & consume the reads. 
// + ColorPtrMap colors; Color * c; + PartitionID p; while(!parser->is_complete()) { read = parser->get_next_read(); seq = read.sequence; if (check_and_normalize_read(seq)) { // First, figure out what the partition is (if non-zero), and save that. - c = new Color(_parse_partition_id(read.name)); + p = _parse_partition_id(read.name); + if (colors.count(p)) { + c = colors[p]; + } else { + c = new Color(p); + } consume_sequence_and_tag_with_colors( seq, n_consumed, @@ -2114,9 +2121,11 @@ void Hashtable::consume_partitioned_fasta_and_tag_with_colors(const std::string } } + // @cswelcher TODO: deallocate ColorPtrMap delete parser; } +// @cswelcher: double-check -- is it valid to pull the address from a reference? void Hashtable::link_tag_and_color(HashIntoType& kmer, Color& kmer_color) { tag_colors.insert(TagColorPtrPair(kmer, &kmer_color)); color_tag_ptrs.insert(ColorTagPtrPair(kmer_color, &kmer)); @@ -2168,7 +2177,6 @@ void Hashtable::consume_sequence_and_tag_with_colors(const std::string& seq, } } else ++since; } - // Should I bother adding new code down here? #else if (!is_new_kmer && set_contains(all_tags, kmer)) { since = 1; From afd1792025cacc79d14e00567ac202143ed26631 Mon Sep 17 00:00:00 2001 From: CS Welcher Date: Wed, 25 Sep 2013 16:21:35 -0400 Subject: [PATCH 028/140] added delete of temp colorptrmap --- lib/hashtable.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/hashtable.cc b/lib/hashtable.cc index ae20429ea3..80f33c93ca 100644 --- a/lib/hashtable.cc +++ b/lib/hashtable.cc @@ -2121,8 +2121,9 @@ void Hashtable::consume_partitioned_fasta_and_tag_with_colors(const std::string } } - // @cswelcher TODO: deallocate ColorPtrMap + // @cswelcher TODO: check that deallocate ColorPtrMap is correct delete parser; + delete colors; } // @cswelcher: double-check -- is it valid to pull the address from a reference? 
From 0f10ae5aaa4d8aa9ae051241a4f80eb9dec9b08a Mon Sep 17 00:00:00 2001 From: CS Welcher Date: Thu, 26 Sep 2013 01:41:19 -0400 Subject: [PATCH 029/140] added khmer colormap changes --- lib/khmer.hh | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/khmer.hh b/lib/khmer.hh index 2bb275eab9..c02998d68d 100644 --- a/lib/khmer.hh +++ b/lib/khmer.hh @@ -94,6 +94,7 @@ namespace khmer { typedef std::pair ColorTagPtrPair; typedef std::set ColorPtrSet; typedef std::set TagPtrSet; + typedef std::map ColorPtrMap; Template void deallocate_ptr_set(T& s) { From 6043518e1c4135860c505df8960fa32e12397402 Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Thu, 26 Sep 2013 01:43:55 -0400 Subject: [PATCH 030/140] added dealloc func --- lib/khmer.hh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/khmer.hh b/lib/khmer.hh index 2bb275eab9..c23a2cce71 100644 --- a/lib/khmer.hh +++ b/lib/khmer.hh @@ -87,7 +87,7 @@ namespace khmer { typedef std::map PartitionCountMap; typedef std::map PartitionCountDistribution; - typedef unsigned int Color; + typedef unsigned long long int Color; typedef std::multimap TagColorPtrMap; typedef std::multimap ColorTagPtrMap; typedef std::pair TagColorPtrPair; @@ -95,9 +95,9 @@ namespace khmer { typedef std::set ColorPtrSet; typedef std::set TagPtrSet; - Template + template void deallocate_ptr_set(T& s) { - for (typename T::iterator i = c.begin(); i != c.end(); ++i) + for (typename T::iterator i = s.begin(); i != s.end(); ++i) delete *i; } } From eb944968be4494cd8398f822cd3d40329c3d1685 Mon Sep 17 00:00:00 2001 From: CS Date: Thu, 26 Sep 2013 01:49:18 -0400 Subject: [PATCH 031/140] changed delete of tmp colorptrmap to clear --- lib/hashtable.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/hashtable.cc b/lib/hashtable.cc index 80f33c93ca..0d729864eb 100644 --- a/lib/hashtable.cc +++ b/lib/hashtable.cc @@ -2123,7 +2123,7 @@ void Hashtable::consume_partitioned_fasta_and_tag_with_colors(const std::string // 
@cswelcher TODO: check that deallocate ColorPtrMap is correct delete parser; - delete colors; + colors.clear(); } // @cswelcher: double-check -- is it valid to pull the address from a reference? From 3684974c3a219bdc59b6bf9030654cf9fef4b8a6 Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Thu, 26 Sep 2013 01:51:05 -0400 Subject: [PATCH 032/140] updated color_tst, remove soon --- lib/color_tst.py | 40 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/lib/color_tst.py b/lib/color_tst.py index da8c5eff08..dcac725ec0 100644 --- a/lib/color_tst.py +++ b/lib/color_tst.py @@ -1,17 +1,49 @@ import khmer import screed +def reverse_comp(s): + ret = '' + for i in range(len(s)-1,-1,-1): + c = s[i] + if c == 'A': + ret += 'T' + elif c == 'T': + ret += 'A' + elif c == 'G': + ret += 'C' + else: + ret += 'G' + return ret ht = khmer.new_hashbits(20,1e8,4) print '#' * 200 -ht.consume_fasta_and_tag_with_colors('/w/2013-lamprey/syn_part/syn.trinity.fasta') +ht.consume_fasta_and_tag_with_colors('../tests/test-data/test-reads.fa') #print ht.sweep_sequence_for_colors('CACACACGGACATCGGAGAGAGGCTGAGACAGCGAGACACACAGAGACAGAGCGGAGAGGGCACAGACAGACAAGAGCATGAGAGATCGGCAGAGCGGTG', False, False) #print ht.sweep_sequence_for_colors('CGCCGTAGTCGTACTGGTTCTCCTCCGTGTACTCGTGCGCTGCCTCCACCTCTGGGCTGCTCATGCCCTCCATGTGACCTTCAGGCATGCCCTCGGAGAT', False, False) #print ht.sweep_sequence_for_colors('GGAGAGCCTGGGGCCAAGCCCGAGGGCATGCCTGAAGGTCACATGGAGGGCATGAGCAGCCCAG', False, False) #print ht.sweep_sequence_for_colors('TTTTTTGAATACGTTTAGTTAATATTTGTACTTCAATTAATAAAAATTTGCTATAATTTTTCCATTATCGCCAGTCACTCGCGTGATATAGGAAAAGGTT', False, False) #print ht.sweep_sequence_for_colors('AAGCAGTGGTATCAACGCAGAGTACGCGGGGACTCTGTCGCTGCTCCTCTAGCACAGAGAGCCAGAGACGGCTTACAGCAGCAGCATCATATAGCCTC', False, False) -N=1000000000 +t0 = 'CCATGTAGCGCCGCACACCTTTGTAGGTGTTGTAATAATCTTCGATGACTTTCTTCGCTTCCTGACGGCTTATGCC' +t1 = 'ACCGCGCGCGAATCGACGGTTGTCAGCCAAAGGCGTTCAACACCAGCACCGCCCTTAAGCCGCCCGCCCGCCGCCC' +N 
= 1000 + +for n, record in enumerate(screed.open('../tests/test-data/test-reads.fa')): + if n > N: + break + print '*' * 40 + seq = record.sequence + print seq + colors = ht.sweep_sequence_for_colors(seq, False, False) + print 'colors from sweep:', colors + tags = ht.get_all_tags(seq) + print 'tags from get_all_tags:', tags + print 'colors from get_tag_colors:' + t_colors = set() + for tag in tags: + t_colors.update(ht.get_tag_colors(tag)) + print t_colors + assert len(t_colors) == len(colors) ''' file_pointers = {} @@ -28,7 +60,7 @@ file_pointers[c] = open('color_{}.fa'.format(c), 'wb') file_pointers[c].write('>{}\n{}\n'.format(record.name, record.sequence))\ ''' - +''' ht = khmer.new_hashbits(25, 1e9,4) ht.consume_partitioned_fasta_and_tag_with_colors('/w/2013-lamprey/test.fp') @@ -38,4 +70,4 @@ colors = ht.sweep_sequence_for_colors(record.sequence, False, False) if colors: print colors - +''' From bc767df1c3eaaa4f89558782ba2f9b6ebafbd1c5 Mon Sep 17 00:00:00 2001 From: CS Date: Thu, 26 Sep 2013 02:58:53 -0400 Subject: [PATCH 033/140] added bunch more tests, TODO fix n_colors --- tests/test_hashbits.py | 52 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/tests/test_hashbits.py b/tests/test_hashbits.py index 6e8e92fa71..624de8c611 100644 --- a/tests/test_hashbits.py +++ b/tests/test_hashbits.py @@ -501,16 +501,56 @@ def test_simple_median(): assert average == 1.0 assert stddev == 0.0 -def test_consume_fasta_and_tag_with_colors(): +def test_get_all_tags(): + hb = khmer.new_hashbits(20, 1e7, 4) + filename = utils.get_test_data('single-read.fq') + hb.consume_fasta_and_tag(filename) + + tags = hb.get_all_tags('CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT') + assert len(tags) == 1 + assert tags.pop() == 173473779682L + +def test_get_tag_colors(): hb = khmer.new_hashbits(20, 1e7, 4) + filename = utils.get_test_data('single-read.fq') + hb.consume_fasta_and_tag_with_colors(filename) + tag = 173473779682L + + colors = 
hb.get_tag_colors(tag) + assert len(colors) == 1 + assert colors.pop() == 0L + +def test_sweep_sequence_for_colors(): + hb = khmer.new_hashbits(20, 1e7, 4) + filename = utils.get_test_data('single-read.fq') + hb.consume_fasta_and_tag_with_colors(filename) + colors = hb.sweep_sequence_for_colors('CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT') + assert len(colors) == 1 + assert colors.pop() == 0L + +def test_consume_fasta_and_tag_with_colors(): + hb = khmer.new_hashbits(20, 1e7, 4) + read_1 = 'ACGTAACCGGTTAAACCCGGGTTTAAAACCCCGGGGTTTT' filename = utils.get_test_data('test-transcript.fa') + total_reads, n_consumed = hb.consume_fasta_and_tag_with_colors(filename) - #assert n_consumed == 3 + assert hb.get(read_1[:20]) assert total_reads == 3 + #assert hb.n_colors() == 3 - assert hb.n_colors() == 3 - - - + +def test_consume_partitioned_fasta_and_tag_with_colors(): + hb = khmer.new_hashbits(20, 1e7, 4) + filename = utils.get_test_data('real-partition-small.fa') + + total_reads, n_consumed = hb.consume_partitioned_fasta_and_tag_with_colors(filename) + #assert hb.n_colors() == 1 + colors = set() + for record in screed.open(filename): + seq = record.sequence + colors.update(hb.sweep_sequence_for_colors(seq, False, False)) + assert len(colors) == 1 + assert colors.pop() == 2L + #assert hb.n_colors() == 1 From d29061c99e35da2439cf04a261b12723532fc9d4 Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Thu, 26 Sep 2013 13:34:49 -0400 Subject: [PATCH 034/140] added a sweep-reads script to scripts, TODO add tests --- scripts/sweep-reads-by-partition.py | 134 ++++++++++++++++++++++++++++ 1 file changed, 134 insertions(+) create mode 100755 scripts/sweep-reads-by-partition.py diff --git a/scripts/sweep-reads-by-partition.py b/scripts/sweep-reads-by-partition.py new file mode 100755 index 0000000000..f2a59f73a4 --- /dev/null +++ b/scripts/sweep-reads-by-partition.py @@ -0,0 +1,134 @@ +#! 
/w/khmer_dev/bin/python +# +# This file is part of khmer, http://github.com/ged-lab/khmer/, and is +# Copyright (C) Michigan State University, 2009-2013. It is licensed under +# the three-clause BSD license; see doc/LICENSE.txt. Contact: ctb@msu.edu +# +""" +Tag and color the given partitioned fasta, then find all reads in the neighborhood +of each partition and output to a file + +% python scripts/normalize-by-median.py [ -p ] -i ... + +Use '-h' for parameter help. +""" + +import khmer +import screed +import sys +from khmer.counting_args import build_construct_args, DEFAULT_MIN_HASHSIZE + +DEFAULT_PPF = 1 + +def write_read(fp, seq, name, color): + fp.write('>{name}\t{color}\n{seq}\n'.format(seq=seq, name=name, color=color)) + +def main(): + parser = build_construct_args() + parser.add_argument('-p', '--partitions_per_file', + dest='partitions_per_file', default=DEFAULT_PPF) + parser.add_argument('-i', '--input_fastp', dest='input_fastp') + parser.add_argument('input_reads', nargs='+') + args = parser.parse_args() + + if not args.quiet: + if args.min_hashsize == DEFAULT_MIN_HASHSIZE: + print >>sys.stderr, \ + "** WARNING: hashsize is default! " \ + "You absodefly want to increase this!\n** " \ + "Please read the docs!" 
+ + print >>sys.stderr, '\nPARAMETERS:' + print >>sys.stderr, \ + ' - kmer size = {ksize:d} \t\t(-k)'.format(ksize=args.ksize) + print >>sys.stderr, \ + ' - n hashes = {nhash:d} \t\t(-N)'.format(nhash=args.n_hashes) + print >>sys.stderr, \ + ' - min hashsize = {mh:-5.2g} \t(-x)'.format(mh=args.min_hashsize) + print >>sys.stderr, '' + print >>sys.stderr, \ + 'Estimated memory usage is {prod:.2g} bytes \ + (n_hashes x min_hashsize)'.format(prod=args.n_hashes*args.min_hashsize) + print >>sys.stderr, '-' * 8 + + K = args.ksize + HT_SIZE = args.min_hashsize + N_HT = args.n_hashes + + input_reads = args.input_reads + input_fastp = args.input_fastp + ppf = args.partitions_per_file + + ht = khmer.new_hashbits(K, HT_SIZE, N_HT) + ht.consume_partitioned_fasta_and_tag_with_colors(input_fastp) + + cur_colors = [] + color_to_fp_dict = {} + cur_fp = file + + color_number_dist = [] + + n_orphaned = 0 + n_colored = 0 + n_mcolored = 0 + n_files = 0 + try: + for read_file in input_reads: + print >>sys.stderr,'** sweeping {read_file} for colors...'.format(read_file=read_file) + + for n, record in enumerate(screed.open(read_file)): + if n % 10000 == 0: + print >>sys.stderr, '\tswept {n} reads [{nc} colored, {no} orphaned' \ + .format(n=n, nc=n_colored, no=n_orphaned) + seq = record.sequence + name = record.name + + colors = ht.sweep_sequence_for_colors(seq, False, False) + color_number_dist.append(len(colors)) + if colors: + n_colored += 1 + if len(colors) > 1: + n_mcolored += 1 + for color in colors: + # do we have a file for this color already? use it! + if color in color_to_fp_dict: + fp = color_to_fp_dict[color] + write_read(fp, seq, name, color) + # no file yet? make a new one + else: + if len(cur_colors) == 0: + #print '** opening new file...' 
+ cur_fp = open('colored_reads_{fn}.fa'.format(fn=n_files), + 'wb') + + color_to_fp_dict[color] = cur_fp + cur_colors.append(color) + write_read(cur_fp, seq, name, color) + n_files += 1 + + if len(cur_colors) == ppf: + cur_colors = [] + else: + n_orphaned += 1 + + for key in color_to_fp_dict: + if color_to_fp_dict[key]: + color_to_fp_dict[key].close() + + except IOError as e: + print >>sys.stderr, 'ERROR:', e + print >>sys.stderr, '** exiting...' + + print >>sys.stderr, 'swept {n_reads} for colors...'.format(n_reads=n) + print >>sys.stderr, '...with {nc} colored and {no} orphaned'.format( + nc=n_colored, no=n_orphaned) + print >>sys.stderr, '...and {nmc} multicolored'.format(nmc=n_mcolored) + print >>sys.stderr, '...to {nf} files'.format(nf=n_files) + + print >>sys.stderr, '** outputting color number distribution...' + with open('color_dist.txt', 'wb') as outfp: + for nc in color_number_dist: + outfp.write('{nc}\n'.format(nc=nc)) + +if __name__ == '__main__': + main() From feb0f3988c08cb462a03ed15407089a0ad88afe1 Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Fri, 27 Sep 2013 14:49:01 -0400 Subject: [PATCH 035/140] added test for correctness of color tagging and traversal --- tests/test_hashbits.py | 46 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/tests/test_hashbits.py b/tests/test_hashbits.py index 624de8c611..b38112c4df 100644 --- a/tests/test_hashbits.py +++ b/tests/test_hashbits.py @@ -501,6 +501,11 @@ def test_simple_median(): assert average == 1.0 assert stddev == 0.0 +# +# @cswelcher TODO: more tests! 
+# * thread-safety +# * n_colors -- make sure to use test-data with multi-colored tags + def test_get_all_tags(): hb = khmer.new_hashbits(20, 1e7, 4) filename = utils.get_test_data('single-read.fq') @@ -539,7 +544,48 @@ def test_consume_fasta_and_tag_with_colors(): assert hb.get(read_1[:20]) assert total_reads == 3 #assert hb.n_colors() == 3 + +''' +* The test data set as four reads: A, B, C, and D +* Overlaps are A <-> B <-> C, with D on its own +* Thus, traversing from A should find colors from A and B, + traversing from B should find colors from A, B, and C, + and traversing from C should find colors from B and C +''' +def test_color_tag_correctness(): + hb = khmer.new_hashbits(20, 1e7, 4) + filename = utils.get_test_data('test-colors.fa') + hb.consume_fasta_and_tag_with_colors(filename) + # read A + colors = hb.sweep_sequence_for_colors('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG') + + print colors + assert len(colors) == 2 + assert 0L in colors + assert 1L in colors + + # read B + colors = hb.sweep_sequence_for_colors('GCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA') + print colors + assert len(colors) == 3 + assert 0L in colors + assert 1L in colors + assert 2L in colors + + # read C + colors = hb.sweep_sequence_for_colors('TGTTGGGGATAGATAGATAGATGAGTGTAGATCCAACAACACATACA') + print colors + assert len(colors) == 2 + assert 1L in colors + assert 2L in colors + + # read D + colors = hb.sweep_sequence_for_colors('TATATATATAGCTAGCTAGCTAACTAGCTAGCATCGATCGATCGATC') + print colors + assert len(colors) == 1 + assert 3L in colors + def test_consume_partitioned_fasta_and_tag_with_colors(): hb = khmer.new_hashbits(20, 1e7, 4) From 01f233d71693628b48eb6691802aa49703a6c14b Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Fri, 27 Sep 2013 16:27:42 -0400 Subject: [PATCH 036/140] started traversal optimization --- lib/hashtable.cc | 12 ++-- lib/subset.cc | 146 +++++++++++++++++++++++++++++++++++++++++++++++ lib/subset.hh | 8 +++ 3 files changed, 162 insertions(+), 4 
deletions(-) diff --git a/lib/hashtable.cc b/lib/hashtable.cc index 0d729864eb..a5d8264213 100644 --- a/lib/hashtable.cc +++ b/lib/hashtable.cc @@ -2232,15 +2232,19 @@ void Hashtable::sweep_sequence_for_colors(const std::string& seq, KMerIterator kmers(seq.c_str(), _ksize); std::string kmer_s; - + // keep a list of kmers which have already been traversed + SeenSet traversed_kmers; while (!kmers.done()) { kmer = kmers.next(); kmer_s = _revhash(kmer, _ksize); _hash(kmer_s.c_str(), _ksize, kmer_f, kmer_r); - partition->find_all_tags(kmer_f, kmer_r, tagged_kmers, all_tags, - break_on_stoptags, stop_big_traversals); - traverse_colors_and_resolve(tagged_kmers, found_colors); + // don't even try traversing from k-mers not in the hashtable + if (get_count(uniqify_rc(kmer_f,kmer_r))) { + partition->find_all_tags(kmer_f, kmer_r, tagged_kmers, + traversed_kmers, all_tags, break_on_stoptags, stop_big_traversals); + traverse_colors_and_resolve(tagged_kmers, found_colors); + } } } diff --git a/lib/subset.cc b/lib/subset.cc index 47fbe53fe2..8c5c18d89e 100644 --- a/lib/subset.cc +++ b/lib/subset.cc @@ -446,6 +446,152 @@ void SubsetPartition::find_all_tags(HashIntoType kmer_f, } } +// Same as find_all_tags, but keep track of traversed k-mers +// +void SubsetPartition::find_all_tags(HashIntoType kmer_f, + HashIntoType kmer_r, + SeenSet& tagged_kmers, + SeenSet& traversed_kmers, + const SeenSet& all_tags, + bool break_on_stop_tags, + bool stop_big_traversals) +{ + const HashIntoType bitmask = _ht->bitmask; + + HashIntoType f, r; + bool first = true; + NodeQueue node_q; + std::queue breadth_q; + unsigned int cur_breadth = 0; + unsigned int breadth = 0; + const unsigned int max_breadth = (2 * _ht->_tag_density) + 1; + + const unsigned int rc_left_shift = _ht->ksize()*2 - 2; + unsigned int total = 0; + + // start breadth-first search. 
+ + node_q.push(kmer_f); + node_q.push(kmer_r); + breadth_q.push(0); + + while(!node_q.empty()) { + if (stop_big_traversals && traversed_kmers.size() > BIG_TRAVERSALS_ARE) { + tagged_kmers.clear(); + break; + } + + kmer_f = node_q.front(); + node_q.pop(); + kmer_r = node_q.front(); + node_q.pop(); + breadth = breadth_q.front(); + breadth_q.pop(); + + HashIntoType kmer = uniqify_rc(kmer_f, kmer_r); + + // Have we already seen this k-mer? If so, skip. + if (set_contains(traversed_kmers, kmer)) { + continue; + } + + // Do we want to traverse through this k-mer? If not, skip. + if (break_on_stop_tags && set_contains(_ht->stop_tags, kmer)) { + // @CTB optimize by inserting into traversed_kmers set? + continue; + } + + // keep track of seen kmers + traversed_kmers.insert(kmer); + total++; + + // Is this a kmer-to-tag, and have we put this tag in a partition already? + // Search no further in this direction. (This is where we connect + // partitions.) + if (!first && set_contains(all_tags, kmer)) { + tagged_kmers.insert(kmer); + continue; + } + + assert(breadth >= cur_breadth); // keep track of watermark, for debugging. + if (breadth > cur_breadth) { cur_breadth = breadth; } + + if (breadth >= max_breadth) { continue; } // truncate search @CTB exit? + + // + // Enqueue next set of nodes. 
+ // + + // NEXT + f = next_f(kmer_f, 'A'); + r = next_r(kmer_r, 'A'); + if (_ht->get_count(uniqify_rc(f,r)) && + !set_contains(traversed_kmers, uniqify_rc(f,r))) { + node_q.push(f); node_q.push(r); + breadth_q.push(breadth + 1); + } + + f = next_f(kmer_f, 'C'); + r = next_r(kmer_r, 'C'); + if (_ht->get_count(uniqify_rc(f,r)) && + !set_contains(traversed_kmers, uniqify_rc(f,r))) { + node_q.push(f); node_q.push(r); + breadth_q.push(breadth + 1); + } + + f = next_f(kmer_f, 'G'); + r = next_r(kmer_r, 'G'); + if (_ht->get_count(uniqify_rc(f,r)) && + !set_contains(traversed_kmers, uniqify_rc(f,r))) { + node_q.push(f); node_q.push(r); + breadth_q.push(breadth + 1); + } + + f = next_f(kmer_f, 'T'); + r = next_r(kmer_r, 'T'); + if (_ht->get_count(uniqify_rc(f,r)) && + !set_contains(traversed_kmers, uniqify_rc(f,r))) { + node_q.push(f); node_q.push(r); + breadth_q.push(breadth + 1); + } + + // PREVIOUS. + r = prev_r(kmer_r, 'A'); + f = prev_f(kmer_f, 'A'); + if (_ht->get_count(uniqify_rc(f,r)) && + !set_contains(traversed_kmers, uniqify_rc(f,r))) { + node_q.push(f); node_q.push(r); + breadth_q.push(breadth + 1); + } + + r = prev_r(kmer_r, 'C'); + f = prev_f(kmer_f, 'C'); + if (_ht->get_count(uniqify_rc(f,r)) && + !set_contains(traversed_kmers, uniqify_rc(f,r))) { + node_q.push(f); node_q.push(r); + breadth_q.push(breadth + 1); + } + + r = prev_r(kmer_r, 'G'); + f = prev_f(kmer_f, 'G'); + if (_ht->get_count(uniqify_rc(f,r)) && + !set_contains(traversed_kmers, uniqify_rc(f,r))) { + node_q.push(f); node_q.push(r); + breadth_q.push(breadth + 1); + } + + r = prev_r(kmer_r, 'T'); + f = prev_f(kmer_f, 'T'); + if (_ht->get_count(uniqify_rc(f,r)) && + !set_contains(traversed_kmers, uniqify_rc(f,r))) { + node_q.push(f); node_q.push(r); + breadth_q.push(breadth + 1); + } + + first = false; + } +} + // find_all_tags: the core of the partitioning code. finds all tagged k-mers // connected to kmer_f/kmer_r in the graph. 
diff --git a/lib/subset.hh b/lib/subset.hh index cab01273b9..282a9a6ac9 100644 --- a/lib/subset.hh +++ b/lib/subset.hh @@ -69,6 +69,14 @@ namespace khmer { bool break_on_stop_tags=false, bool stop_big_traversals=false); + void find_all_tags(HashIntoType kmer_f, + HashIntoType kmer_r, + SeenSet& tagged_kmers, + SeenSet& traversed_kmers, + const SeenSet& all_tags, + bool break_on_stop_tags, + bool stop_big_traversals); + void find_all_tags_truncate_on_abundance(HashIntoType kmer_f, HashIntoType kmer_r, SeenSet& tagged_kmers, From 1b97d8cf50c5e208cee72b2029bd8f0e303e661c Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Mon, 30 Sep 2013 16:26:18 -0400 Subject: [PATCH 037/140] fixed color allocation error; added persistent color to color pointer map and associated allocation function run checks for all newly allocated colors --- lib/hashtable.cc | 13 ++++--------- lib/hashtable.hh | 16 +++++++++++++++- python/_khmermodule.cc | 26 +++++++++++++++++++++++++- 3 files changed, 44 insertions(+), 11 deletions(-) diff --git a/lib/hashtable.cc b/lib/hashtable.cc index a5d8264213..22a0d6d640 100644 --- a/lib/hashtable.cc +++ b/lib/hashtable.cc @@ -2003,7 +2003,7 @@ Hashtable::consume_fasta_and_tag_with_colors( Color _tag_color = 0; - Color * the_color = new Color(_tag_color); + Color * the_color = check_and_allocate_color(_tag_color); // Iterate through the reads and consume their k-mers. while (!parser->is_complete( )) { @@ -2018,7 +2018,7 @@ Hashtable::consume_fasta_and_tag_with_colors( this_n_consumed, *the_color ); _tag_color++; - the_color = new Color(_tag_color); + the_color = check_and_allocate_color(_tag_color); #ifdef WITH_INTERNAL_METRICS hasher.pmetrics.start_timers( ); @@ -2085,7 +2085,6 @@ void Hashtable::consume_partitioned_fasta_and_tag_with_colors(const std::string // // iterate through the FASTA file & consume the reads. 
// - ColorPtrMap colors; Color * c; PartitionID p; while(!parser->is_complete()) { @@ -2095,11 +2094,7 @@ void Hashtable::consume_partitioned_fasta_and_tag_with_colors(const std::string if (check_and_normalize_read(seq)) { // First, figure out what the partition is (if non-zero), and save that. p = _parse_partition_id(read.name); - if (colors.count(p)) { - c = colors[p]; - } else { - c = new Color(p); - } + c = check_and_allocate_color(p); consume_sequence_and_tag_with_colors( seq, n_consumed, @@ -2123,7 +2118,6 @@ void Hashtable::consume_partitioned_fasta_and_tag_with_colors(const std::string // @cswelcher TODO: check that deallocate ColorPtrMap is correct delete parser; - colors.clear(); } // @cswelcher: double-check -- is it valid to pull the address from a reference? @@ -2135,6 +2129,7 @@ void Hashtable::link_tag_and_color(HashIntoType& kmer, Color& kmer_color) { /* This is essentially the same code as above, only it assigns colors to the * tags through multimap TagColorMap defined in hashtable.hh, declared in * hashbits.hh + * @cswelcher TODO: should I instead send in the pointer to the new color? 
*/ void Hashtable::consume_sequence_and_tag_with_colors(const std::string& seq, unsigned long long& n_consumed, diff --git a/lib/hashtable.hh b/lib/hashtable.hh index c662cc0f5b..9c100008e3 100644 --- a/lib/hashtable.hh +++ b/lib/hashtable.hh @@ -233,6 +233,7 @@ namespace khmer { return num_tags; } + Hashtable( WordLength ksize, uint32_t const number_of_threads = @@ -253,6 +254,7 @@ namespace khmer { _init_bitstuff(); _all_tags_spin_lock = 0; _tag_colors_spin_lock = 0; + } virtual ~Hashtable( ) @@ -373,6 +375,7 @@ namespace khmer { SeenSet repart_small_tags; TagColorPtrMap tag_colors; ColorTagPtrMap color_tag_ptrs; + ColorPtrMap color_ptrs; // accessor to get 'k' const WordLength ksize() const { return _ksize; } @@ -482,7 +485,18 @@ namespace khmer { CallbackFn callback = NULL, void * callback_data = NULL ); - + + Color * check_and_allocate_color(Color new_color) { + Color * c; + if (color_ptrs.count(new_color)) { + c = color_ptrs[new_color]; + } else { + c = new Color(new_color); + color_ptrs[*c] = c; + } + return c; + } + void consume_sequence_and_tag(const std::string& seq, unsigned long long& n_consumed, SeenSet * new_tags = 0); diff --git a/python/_khmermodule.cc b/python/_khmermodule.cc index 975a045cb5..10be84a59f 100644 --- a/python/_khmermodule.cc +++ b/python/_khmermodule.cc @@ -3878,6 +3878,29 @@ static PyObject * hashbits_consume_partitioned_fasta_and_tag_with_colors( return Py_BuildValue("iL", total_reads, n_consumed); } +static PyObject * hashbits_consume_sequence_and_tag_with_colors(PyObject * self, PyObject * args) { + khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; + khmer::Hashbits * hb = me->hashbits; + + char * seq = NULL; + unsigned long long c; + if (!PyArg_ParseTuple(args, "sK", &seq, &c)) { + return NULL; + } + + unsigned long long n_consumed = 0; + khmer::Color * the_color = new Color(c); + + try { + //if (hb->check_and_normalize_read(seq)) { + hb->consume_sequence_and_tag_with_colors(seq, n_consumed, *the_color); + //} + } 
catch (_khmer_signal &e) { + return NULL; + } + return Py_BuildValue("L", n_consumed); +} + static PyObject * hashbits_sweep_sequence_for_colors(PyObject * self, PyObject * args) { khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; khmer::Hashbits * hb = me->hashbits; @@ -3923,7 +3946,7 @@ static PyObject * hashbits_sweep_sequence_for_colors(PyObject * self, PyObject * khmer::ColorPtrSet::const_iterator si; unsigned long long i = 0; for (si=found_colors.begin(); si!=found_colors.end(); ++si) { - PyList_SET_ITEM(x, i, Py_BuildValue("K", *(*si))); + PyList_SET_ITEM(x, i, Py_BuildValue("K", *si)); i++; } @@ -4105,6 +4128,7 @@ static PyMethodDef khmer_hashbits_methods[] = { {"consume_partitioned_fasta_and_tag_with_colors", hashbits_consume_partitioned_fasta_and_tag_with_colors, METH_VARARGS, "" }, {"get_all_tags", hashbits_get_all_tags, METH_VARARGS, "" }, {"get_tag_colors", hashbits_get_tag_colors, METH_VARARGS, ""}, + {"consume_sequence_and_tag_with_colors", hashbits_consume_sequence_and_tag_with_colors, METH_VARARGS, "" }, {"n_colors", hashbits_n_colors, METH_VARARGS, ""}, {NULL, NULL, 0, NULL} /* sentinel */ From 2113c23cea8805593d3de49b152e2323359c1a0f Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Mon, 30 Sep 2013 16:29:00 -0400 Subject: [PATCH 038/140] changed back deref on color sweep after bugfixing, switched python exposed consume_sequence_and_tag_with_colors to use new color allocation function --- python/_khmermodule.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/_khmermodule.cc b/python/_khmermodule.cc index 10be84a59f..dcafd2a35f 100644 --- a/python/_khmermodule.cc +++ b/python/_khmermodule.cc @@ -3889,10 +3889,11 @@ static PyObject * hashbits_consume_sequence_and_tag_with_colors(PyObject * self, } unsigned long long n_consumed = 0; - khmer::Color * the_color = new Color(c); + khmer::Color * the_color = hb->check_and_allocate_color(c); try { //if (hb->check_and_normalize_read(seq)) { + 
hb->consume_sequence_and_tag_with_colors(seq, n_consumed, *the_color); //} } catch (_khmer_signal &e) { @@ -3946,7 +3947,7 @@ static PyObject * hashbits_sweep_sequence_for_colors(PyObject * self, PyObject * khmer::ColorPtrSet::const_iterator si; unsigned long long i = 0; for (si=found_colors.begin(); si!=found_colors.end(); ++si) { - PyList_SET_ITEM(x, i, Py_BuildValue("K", *si)); + PyList_SET_ITEM(x, i, Py_BuildValue("K", *(*si))); i++; } From 072adfd9e267f2c0151655fd83f8a2239e56d049 Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Mon, 30 Sep 2013 17:46:49 -0400 Subject: [PATCH 039/140] working on new traversal code, broke off neighbor finding to its own function --- lib/hashtable.cc | 3 +- lib/hashtable.hh | 4 +- lib/subset.cc | 162 +++++++++++++++++++++-------------------- python/_khmermodule.cc | 14 ++-- 4 files changed, 97 insertions(+), 86 deletions(-) diff --git a/lib/hashtable.cc b/lib/hashtable.cc index 22a0d6d640..72d332109f 100644 --- a/lib/hashtable.cc +++ b/lib/hashtable.cc @@ -2215,7 +2215,7 @@ void Hashtable::consume_sequence_and_tag_with_colors(const std::string& seq, * For now, check /every/ k-mer with find_all_tags * THIS SUCKS AND IT'S YOUR FAULT @CTB */ -void Hashtable::sweep_sequence_for_colors(const std::string& seq, +unsigned int Hashtable::sweep_sequence_for_colors(const std::string& seq, ColorPtrSet& found_colors, bool break_on_stoptags, bool stop_big_traversals) { @@ -2241,6 +2241,7 @@ void Hashtable::sweep_sequence_for_colors(const std::string& seq, traverse_colors_and_resolve(tagged_kmers, found_colors); } } + return traversed_kmers.size() } ColorPtrSet Hashtable::get_tag_colors(const HashIntoType& tag) { diff --git a/lib/hashtable.hh b/lib/hashtable.hh index 9c100008e3..c0c25584e3 100644 --- a/lib/hashtable.hh +++ b/lib/hashtable.hh @@ -455,7 +455,7 @@ namespace khmer { // Partitioning stuff. 
unsigned int n_tags() const { return all_tags.size(); } - unsigned int n_colors() const { return tag_colors.size(); } + unsigned int n_colors() const { return colors_ptrs.size(); } void divide_tags_into_subsets(unsigned int subset_size, SeenSet& divvy); @@ -538,7 +538,7 @@ namespace khmer { void link_tag_and_color(HashIntoType& kmer, Color& color); - void sweep_sequence_for_colors(const std::string& seq, + unsigned int sweep_sequence_for_colors(const std::string& seq, ColorPtrSet& found_colors, bool break_on_stoptags, bool stop_big_traversals); diff --git a/lib/subset.cc b/lib/subset.cc index 8c5c18d89e..e379b2b5ee 100644 --- a/lib/subset.cc +++ b/lib/subset.cc @@ -296,6 +296,80 @@ unsigned int SubsetPartition::find_unpart(const std::string infilename, return n_singletons; } +/* @cswelcher Brilliant idea: let's *not* copy this same piece of code + * over and over again! + */ +void SubsetPartition::queue_neighbors(HashIntoType kmer_f, + HashIntoType kmer_r, + NodeQueue& node_q, + std::queue breadth_q) { + + f = next_f(kmer_f, 'A'); + r = next_r(kmer_r, 'A'); + if (_ht->get_count(uniqify_rc(f,r)) && + !set_contains(keeper, uniqify_rc(f,r))) { + node_q.push(f); node_q.push(r); + breadth_q.push(breadth + 1); + } + + f = next_f(kmer_f, 'C'); + r = next_r(kmer_r, 'C'); + if (_ht->get_count(uniqify_rc(f,r)) && + !set_contains(keeper, uniqify_rc(f,r))) { + node_q.push(f); node_q.push(r); + breadth_q.push(breadth + 1); + } + + f = next_f(kmer_f, 'G'); + r = next_r(kmer_r, 'G'); + if (_ht->get_count(uniqify_rc(f,r)) && + !set_contains(keeper, uniqify_rc(f,r))) { + node_q.push(f); node_q.push(r); + breadth_q.push(breadth + 1); + } + + f = next_f(kmer_f, 'T'); + r = next_r(kmer_r, 'T'); + if (_ht->get_count(uniqify_rc(f,r)) && + !set_contains(keeper, uniqify_rc(f,r))) { + node_q.push(f); node_q.push(r); + breadth_q.push(breadth + 1); + } + + // PREVIOUS. 
+ r = prev_r(kmer_r, 'A'); + f = prev_f(kmer_f, 'A'); + if (_ht->get_count(uniqify_rc(f,r)) && + !set_contains(keeper, uniqify_rc(f,r))) { + node_q.push(f); node_q.push(r); + breadth_q.push(breadth + 1); + } + + r = prev_r(kmer_r, 'C'); + f = prev_f(kmer_f, 'C'); + if (_ht->get_count(uniqify_rc(f,r)) && + !set_contains(keeper, uniqify_rc(f,r))) { + node_q.push(f); node_q.push(r); + breadth_q.push(breadth + 1); + } + + r = prev_r(kmer_r, 'G'); + f = prev_f(kmer_f, 'G'); + if (_ht->get_count(uniqify_rc(f,r)) && + !set_contains(keeper, uniqify_rc(f,r))) { + node_q.push(f); node_q.push(r); + breadth_q.push(breadth + 1); + } + + r = prev_r(kmer_r, 'T'); + f = prev_f(kmer_f, 'T'); + if (_ht->get_count(uniqify_rc(f,r)) && + !set_contains(keeper, uniqify_rc(f,r))) { + node_q.push(f); node_q.push(r); + breadth_q.push(breadth + 1); + } +} + /// // find_all_tags: the core of the partitioning code. finds all tagged k-mers @@ -345,6 +419,7 @@ void SubsetPartition::find_all_tags(HashIntoType kmer_f, HashIntoType kmer = uniqify_rc(kmer_f, kmer_r); // Have we already seen this k-mer? If so, skip. + // @cswelcher this is redundant, as we already check before queuing if (set_contains(keeper, kmer)) { continue; } @@ -446,12 +521,13 @@ void SubsetPartition::find_all_tags(HashIntoType kmer_f, } } -// Same as find_all_tags, but keep track of traversed k-mers -// -void SubsetPartition::find_all_tags(HashIntoType kmer_f, + + +// Perform a breadth-first search starting from the k-mers in the given sequence +void SubsetPartition::sweep_for_tags( + HashIntoType kmer_f, HashIntoType kmer_r, SeenSet& tagged_kmers, - SeenSet& traversed_kmers, const SeenSet& all_tags, bool break_on_stop_tags, bool stop_big_traversals) @@ -491,9 +567,10 @@ void SubsetPartition::find_all_tags(HashIntoType kmer_f, HashIntoType kmer = uniqify_rc(kmer_f, kmer_r); // Have we already seen this k-mer? If so, skip. 
- if (set_contains(traversed_kmers, kmer)) { - continue; - } + // @cswelcher we already check before queuing + //if (set_contains(traversed_kmers, kmer)) { + // continue; + //} // Do we want to traverse through this k-mer? If not, skip. if (break_on_stop_tags && set_contains(_ht->stop_tags, kmer)) { @@ -518,76 +595,7 @@ void SubsetPartition::find_all_tags(HashIntoType kmer_f, if (breadth >= max_breadth) { continue; } // truncate search @CTB exit? - // - // Enqueue next set of nodes. - // - - // NEXT - f = next_f(kmer_f, 'A'); - r = next_r(kmer_r, 'A'); - if (_ht->get_count(uniqify_rc(f,r)) && - !set_contains(traversed_kmers, uniqify_rc(f,r))) { - node_q.push(f); node_q.push(r); - breadth_q.push(breadth + 1); - } - - f = next_f(kmer_f, 'C'); - r = next_r(kmer_r, 'C'); - if (_ht->get_count(uniqify_rc(f,r)) && - !set_contains(traversed_kmers, uniqify_rc(f,r))) { - node_q.push(f); node_q.push(r); - breadth_q.push(breadth + 1); - } - - f = next_f(kmer_f, 'G'); - r = next_r(kmer_r, 'G'); - if (_ht->get_count(uniqify_rc(f,r)) && - !set_contains(traversed_kmers, uniqify_rc(f,r))) { - node_q.push(f); node_q.push(r); - breadth_q.push(breadth + 1); - } - - f = next_f(kmer_f, 'T'); - r = next_r(kmer_r, 'T'); - if (_ht->get_count(uniqify_rc(f,r)) && - !set_contains(traversed_kmers, uniqify_rc(f,r))) { - node_q.push(f); node_q.push(r); - breadth_q.push(breadth + 1); - } - - // PREVIOUS. 
- r = prev_r(kmer_r, 'A'); - f = prev_f(kmer_f, 'A'); - if (_ht->get_count(uniqify_rc(f,r)) && - !set_contains(traversed_kmers, uniqify_rc(f,r))) { - node_q.push(f); node_q.push(r); - breadth_q.push(breadth + 1); - } - - r = prev_r(kmer_r, 'C'); - f = prev_f(kmer_f, 'C'); - if (_ht->get_count(uniqify_rc(f,r)) && - !set_contains(traversed_kmers, uniqify_rc(f,r))) { - node_q.push(f); node_q.push(r); - breadth_q.push(breadth + 1); - } - - r = prev_r(kmer_r, 'G'); - f = prev_f(kmer_f, 'G'); - if (_ht->get_count(uniqify_rc(f,r)) && - !set_contains(traversed_kmers, uniqify_rc(f,r))) { - node_q.push(f); node_q.push(r); - breadth_q.push(breadth + 1); - } - - r = prev_r(kmer_r, 'T'); - f = prev_f(kmer_f, 'T'); - if (_ht->get_count(uniqify_rc(f,r)) && - !set_contains(traversed_kmers, uniqify_rc(f,r))) { - node_q.push(f); node_q.push(r); - breadth_q.push(breadth + 1); - } - + queue_neighbors(kmer_f, kmer_r, node_q, breadth_q); first = false; } } diff --git a/python/_khmermodule.cc b/python/_khmermodule.cc index dcafd2a35f..2a8c39c65c 100644 --- a/python/_khmermodule.cc +++ b/python/_khmermodule.cc @@ -3883,7 +3883,7 @@ static PyObject * hashbits_consume_sequence_and_tag_with_colors(PyObject * self, khmer::Hashbits * hb = me->hashbits; char * seq = NULL; - unsigned long long c; + unsigned long long c = NULL; if (!PyArg_ParseTuple(args, "sK", &seq, &c)) { return NULL; } @@ -3902,15 +3902,16 @@ static PyObject * hashbits_consume_sequence_and_tag_with_colors(PyObject * self, return Py_BuildValue("L", n_consumed); } -static PyObject * hashbits_sweep_sequence_for_colors(PyObject * self, PyObject * args) { +static PyObject * hashbits_sweep_color_neighborhood(PyObject * self, PyObject * args) { khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; khmer::Hashbits * hb = me->hashbits; char * seq = NULL; + unsigned int range = NULL; PyObject * break_on_stop_tags_o = NULL; PyObject * stop_big_traversals_o = NULL; - if (!PyArg_ParseTuple(args, "s|OO", &seq, + if 
(!PyArg_ParseTuple(args, "si|OO", &seq, &range, &break_on_stop_tags_o, &stop_big_traversals_o)) { return NULL; @@ -3935,7 +3936,7 @@ static PyObject * hashbits_sweep_sequence_for_colors(PyObject * self, PyObject * bool exc_raised = false; //Py_BEGIN_ALLOW_THREADS try { - hb->sweep_sequence_for_colors(seq, found_colors, break_on_stop_tags, stop_big_traversals); + hb->sweep_sequence_for_colors(seq, found_colors, range, break_on_stop_tags, stop_big_traversals); } catch (_khmer_signal &e) { exc_raised = true; } @@ -3958,16 +3959,17 @@ static PyObject * hashbits_sweep_sequence_for_colors(PyObject * self, PyObject * // @cswelcher TODO: this is broken az, fix it asap // need a tags_in_sequence iterator or function in c++ land for reuse in all // these functions -static PyObject * hashbits_get_all_tags(PyObject * self, PyObject *args) +static PyObject * hashbits_sweep_tag_neighborhood(PyObject * self, PyObject *args) { khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; khmer::Hashbits * hashbits = me->hashbits; char * seq = NULL; + unsigned long range = NULL; PyObject * break_on_stop_tags_o = NULL; PyObject * stop_big_traversals_o = NULL; - if (!PyArg_ParseTuple(args, "s|OO", &seq, + if (!PyArg_ParseTuple(args, "si|OO", &seq, &range, &break_on_stop_tags_o, &stop_big_traversals_o)) { return NULL; From 88154dcd1cd1d44a166439981878747980f94072 Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Tue, 1 Oct 2013 11:55:28 -0400 Subject: [PATCH 040/140] implemented new perimeter traversal as part of sweep opimization --- lib/subset.cc | 36 +++++++++++++++++++++++++----------- lib/subset.hh | 7 +++++++ python/_khmermodule.cc | 9 +++++++-- 3 files changed, 39 insertions(+), 13 deletions(-) diff --git a/lib/subset.cc b/lib/subset.cc index e379b2b5ee..1a251e5e6a 100644 --- a/lib/subset.cc +++ b/lib/subset.cc @@ -303,6 +303,8 @@ void SubsetPartition::queue_neighbors(HashIntoType kmer_f, HashIntoType kmer_r, NodeQueue& node_q, std::queue breadth_q) { + + HashIntoType f, r; f = 
next_f(kmer_f, 'A'); r = next_r(kmer_r, 'A'); @@ -524,18 +526,16 @@ void SubsetPartition::find_all_tags(HashIntoType kmer_f, // Perform a breadth-first search starting from the k-mers in the given sequence -void SubsetPartition::sweep_for_tags( - HashIntoType kmer_f, - HashIntoType kmer_r, +unsigned int SubsetPartition::sweep_for_tags(char * seq, SeenSet& tagged_kmers, const SeenSet& all_tags, + unsigned int range, bool break_on_stop_tags, bool stop_big_traversals) { const HashIntoType bitmask = _ht->bitmask; - HashIntoType f, r; - bool first = true; + SeenSet traversed_kmers; NodeQueue node_q; std::queue breadth_q; unsigned int cur_breadth = 0; @@ -547,11 +547,25 @@ void SubsetPartition::sweep_for_tags( // start breadth-first search. - node_q.push(kmer_f); - node_q.push(kmer_r); - breadth_q.push(0); + HashIntoType kmer_f, kmer_r, kmer; + KMerIterator kmers(seq, ksize()); + str::string kmer_s; + + // Queue up all the sequenes k-mers at breadth zero + // We are searching around the perimeter of the known k-mers + // @cswelcher still using kludgy kmer iterator, let's fix this sometime... + while (!kmers.done()) { + kmer = kmers.next(); + kmer_s = revhash(kmer, ksize()); + kmer = _hash(kmer_s.c_str(), ksize(), kmer_f, kmer_r); + + node_q.push(kmer_f); + node_q.push(kmer_r); + breadth_q.push(0); + } while(!node_q.empty()) { + // change this to a better hueristic if (stop_big_traversals && traversed_kmers.size() > BIG_TRAVERSALS_ARE) { tagged_kmers.clear(); break; @@ -585,7 +599,7 @@ void SubsetPartition::sweep_for_tags( // Is this a kmer-to-tag, and have we put this tag in a partition already? // Search no further in this direction. (This is where we connect // partitions.) - if (!first && set_contains(all_tags, kmer)) { + if (breadth && set_contains(all_tags, kmer)) { tagged_kmers.insert(kmer); continue; } @@ -593,11 +607,11 @@ void SubsetPartition::sweep_for_tags( assert(breadth >= cur_breadth); // keep track of watermark, for debugging. 
if (breadth > cur_breadth) { cur_breadth = breadth; } - if (breadth >= max_breadth) { continue; } // truncate search @CTB exit? + if (breadth >= max_breadth or breatdth >= range) { continue; } // truncate search @CTB exit? queue_neighbors(kmer_f, kmer_r, node_q, breadth_q); - first = false; } + return total; } // find_all_tags: the core of the partitioning code. finds all tagged k-mers diff --git a/lib/subset.hh b/lib/subset.hh index 282a9a6ac9..cc08eff1aa 100644 --- a/lib/subset.hh +++ b/lib/subset.hh @@ -76,6 +76,13 @@ namespace khmer { const SeenSet& all_tags, bool break_on_stop_tags, bool stop_big_traversals); + + unsigned int sweep_for_tags(char * seq, + SeenSet& tagged_kmers, + const SeenSet& all_tags, + unsigned int range, + bool break_on_stop_tags, + bool stop_big_traversals); void find_all_tags_truncate_on_abundance(HashIntoType kmer_f, HashIntoType kmer_r, diff --git a/python/_khmermodule.cc b/python/_khmermodule.cc index 2a8c39c65c..73c61da559 100644 --- a/python/_khmermodule.cc +++ b/python/_khmermodule.cc @@ -3965,16 +3965,21 @@ static PyObject * hashbits_sweep_tag_neighborhood(PyObject * self, PyObject *arg khmer::Hashbits * hashbits = me->hashbits; char * seq = NULL; - unsigned long range = NULL; + PyObject * r = NULL; PyObject * break_on_stop_tags_o = NULL; PyObject * stop_big_traversals_o = NULL; - if (!PyArg_ParseTuple(args, "si|OO", &seq, &range, + if (!PyArg_ParseTuple(args, "s|iOO", &seq, &r, &break_on_stop_tags_o, &stop_big_traversals_o)) { return NULL; } + unsigned int range = (2 * hashbits->_tag_density) + 1; + if (r) { + range = r; + } + bool break_on_stop_tags = false; if (break_on_stop_tags_o && PyObject_IsTrue(break_on_stop_tags_o)) { break_on_stop_tags = true; From ab23874575fd1e6b9dceed39837722812ad7b8dd Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Tue, 1 Oct 2013 14:55:06 -0400 Subject: [PATCH 041/140] fixed error with new perimeter alg --- lib/hashtable.cc | 5 +++-- lib/hashtable.hh | 2 +- lib/subset.cc | 41 
+++++++++++++++++++++++------------------ lib/subset.hh | 7 +++++++ python/_khmermodule.cc | 24 ++++++++---------------- 5 files changed, 42 insertions(+), 37 deletions(-) diff --git a/lib/hashtable.cc b/lib/hashtable.cc index 72d332109f..9ff868a161 100644 --- a/lib/hashtable.cc +++ b/lib/hashtable.cc @@ -2235,13 +2235,14 @@ unsigned int Hashtable::sweep_sequence_for_colors(const std::string& seq, _hash(kmer_s.c_str(), _ksize, kmer_f, kmer_r); // don't even try traversing from k-mers not in the hashtable + //traversed_kmers.clear(); if (get_count(uniqify_rc(kmer_f,kmer_r))) { partition->find_all_tags(kmer_f, kmer_r, tagged_kmers, - traversed_kmers, all_tags, break_on_stoptags, stop_big_traversals); + all_tags, break_on_stoptags, stop_big_traversals); traverse_colors_and_resolve(tagged_kmers, found_colors); } } - return traversed_kmers.size() + return traversed_kmers.size(); } ColorPtrSet Hashtable::get_tag_colors(const HashIntoType& tag) { diff --git a/lib/hashtable.hh b/lib/hashtable.hh index c0c25584e3..2fa703c153 100644 --- a/lib/hashtable.hh +++ b/lib/hashtable.hh @@ -455,7 +455,7 @@ namespace khmer { // Partitioning stuff. 
unsigned int n_tags() const { return all_tags.size(); } - unsigned int n_colors() const { return colors_ptrs.size(); } + unsigned int n_colors() const { return color_ptrs.size(); } void divide_tags_into_subsets(unsigned int subset_size, SeenSet& divvy); diff --git a/lib/subset.cc b/lib/subset.cc index 1a251e5e6a..6dd8715dd8 100644 --- a/lib/subset.cc +++ b/lib/subset.cc @@ -301,15 +301,19 @@ unsigned int SubsetPartition::find_unpart(const std::string infilename, */ void SubsetPartition::queue_neighbors(HashIntoType kmer_f, HashIntoType kmer_r, + unsigned int breadth, + SeenSet& traversed_kmers, NodeQueue& node_q, std::queue breadth_q) { HashIntoType f, r; + const unsigned int rc_left_shift = _ht->ksize()*2 - 2; + const HashIntoType bitmask = _ht->bitmask; f = next_f(kmer_f, 'A'); r = next_r(kmer_r, 'A'); if (_ht->get_count(uniqify_rc(f,r)) && - !set_contains(keeper, uniqify_rc(f,r))) { + !set_contains(traversed_kmers, uniqify_rc(f,r))) { node_q.push(f); node_q.push(r); breadth_q.push(breadth + 1); } @@ -317,7 +321,7 @@ void SubsetPartition::queue_neighbors(HashIntoType kmer_f, f = next_f(kmer_f, 'C'); r = next_r(kmer_r, 'C'); if (_ht->get_count(uniqify_rc(f,r)) && - !set_contains(keeper, uniqify_rc(f,r))) { + !set_contains(traversed_kmers, uniqify_rc(f,r))) { node_q.push(f); node_q.push(r); breadth_q.push(breadth + 1); } @@ -325,7 +329,7 @@ void SubsetPartition::queue_neighbors(HashIntoType kmer_f, f = next_f(kmer_f, 'G'); r = next_r(kmer_r, 'G'); if (_ht->get_count(uniqify_rc(f,r)) && - !set_contains(keeper, uniqify_rc(f,r))) { + !set_contains(traversed_kmers, uniqify_rc(f,r))) { node_q.push(f); node_q.push(r); breadth_q.push(breadth + 1); } @@ -333,7 +337,7 @@ void SubsetPartition::queue_neighbors(HashIntoType kmer_f, f = next_f(kmer_f, 'T'); r = next_r(kmer_r, 'T'); if (_ht->get_count(uniqify_rc(f,r)) && - !set_contains(keeper, uniqify_rc(f,r))) { + !set_contains(traversed_kmers, uniqify_rc(f,r))) { node_q.push(f); node_q.push(r); breadth_q.push(breadth + 1); } 
@@ -342,7 +346,7 @@ void SubsetPartition::queue_neighbors(HashIntoType kmer_f, r = prev_r(kmer_r, 'A'); f = prev_f(kmer_f, 'A'); if (_ht->get_count(uniqify_rc(f,r)) && - !set_contains(keeper, uniqify_rc(f,r))) { + !set_contains(traversed_kmers, uniqify_rc(f,r))) { node_q.push(f); node_q.push(r); breadth_q.push(breadth + 1); } @@ -350,7 +354,7 @@ void SubsetPartition::queue_neighbors(HashIntoType kmer_f, r = prev_r(kmer_r, 'C'); f = prev_f(kmer_f, 'C'); if (_ht->get_count(uniqify_rc(f,r)) && - !set_contains(keeper, uniqify_rc(f,r))) { + !set_contains(traversed_kmers, uniqify_rc(f,r))) { node_q.push(f); node_q.push(r); breadth_q.push(breadth + 1); } @@ -358,7 +362,7 @@ void SubsetPartition::queue_neighbors(HashIntoType kmer_f, r = prev_r(kmer_r, 'G'); f = prev_f(kmer_f, 'G'); if (_ht->get_count(uniqify_rc(f,r)) && - !set_contains(keeper, uniqify_rc(f,r))) { + !set_contains(traversed_kmers, uniqify_rc(f,r))) { node_q.push(f); node_q.push(r); breadth_q.push(breadth + 1); } @@ -366,7 +370,7 @@ void SubsetPartition::queue_neighbors(HashIntoType kmer_f, r = prev_r(kmer_r, 'T'); f = prev_f(kmer_f, 'T'); if (_ht->get_count(uniqify_rc(f,r)) && - !set_contains(keeper, uniqify_rc(f,r))) { + !set_contains(traversed_kmers, uniqify_rc(f,r))) { node_q.push(f); node_q.push(r); breadth_q.push(breadth + 1); } @@ -533,7 +537,6 @@ unsigned int SubsetPartition::sweep_for_tags(char * seq, bool break_on_stop_tags, bool stop_big_traversals) { - const HashIntoType bitmask = _ht->bitmask; SeenSet traversed_kmers; NodeQueue node_q; @@ -542,22 +545,22 @@ unsigned int SubsetPartition::sweep_for_tags(char * seq, unsigned int breadth = 0; const unsigned int max_breadth = (2 * _ht->_tag_density) + 1; - const unsigned int rc_left_shift = _ht->ksize()*2 - 2; + unsigned int total = 0; // start breadth-first search. 
HashIntoType kmer_f, kmer_r, kmer; - KMerIterator kmers(seq, ksize()); - str::string kmer_s; + KMerIterator kmers(seq, _ht->ksize()); + std::string kmer_s; // Queue up all the sequenes k-mers at breadth zero // We are searching around the perimeter of the known k-mers // @cswelcher still using kludgy kmer iterator, let's fix this sometime... while (!kmers.done()) { kmer = kmers.next(); - kmer_s = revhash(kmer, ksize()); - kmer = _hash(kmer_s.c_str(), ksize(), kmer_f, kmer_r); + kmer_s = _revhash(kmer, _ht->ksize()); + kmer = _hash(kmer_s.c_str(), _ht->ksize(), kmer_f, kmer_r); node_q.push(kmer_f); node_q.push(kmer_r); @@ -604,12 +607,13 @@ unsigned int SubsetPartition::sweep_for_tags(char * seq, continue; } - assert(breadth >= cur_breadth); // keep track of watermark, for debugging. - if (breadth > cur_breadth) { cur_breadth = breadth; } + // removed for not doing anything + //assert(breadth >= cur_breadth); // keep track of watermark, for debugging. + //if (breadth > cur_breadth) { cur_breadth = breadth; } - if (breadth >= max_breadth or breatdth >= range) { continue; } // truncate search @CTB exit? + if (breadth >= max_breadth or breadth >= range) { continue; } // truncate search @CTB exit? - queue_neighbors(kmer_f, kmer_r, node_q, breadth_q); + queue_neighbors(kmer_f, kmer_r, breadth, traversed_kmers, node_q, breadth_q); } return total; } @@ -690,6 +694,7 @@ void SubsetPartition::find_all_tags_truncate_on_abundance(HashIntoType kmer_f, continue; } + // @cswelcher Do these lines actually do anything? assert(breadth >= cur_breadth); // keep track of watermark, for debugging. 
if (breadth > cur_breadth) { cur_breadth = breadth; } diff --git a/lib/subset.hh b/lib/subset.hh index cc08eff1aa..3a5dfb0ab3 100644 --- a/lib/subset.hh +++ b/lib/subset.hh @@ -63,6 +63,13 @@ namespace khmer { void load_partitionmap(std::string infile); void _validate_pmap(); + void queue_neighbors(HashIntoType kmer_f, + HashIntoType kmer_r, + unsigned int breadth, + SeenSet& traversed_kmers, + NodeQueue& node_q, + std::queue breadth_q); + void find_all_tags(HashIntoType kmer_f, HashIntoType kmer_r, SeenSet& tagged_kmers, const SeenSet& all_tags, diff --git a/python/_khmermodule.cc b/python/_khmermodule.cc index 73c61da559..81d2864130 100644 --- a/python/_khmermodule.cc +++ b/python/_khmermodule.cc @@ -3936,7 +3936,7 @@ static PyObject * hashbits_sweep_color_neighborhood(PyObject * self, PyObject * bool exc_raised = false; //Py_BEGIN_ALLOW_THREADS try { - hb->sweep_sequence_for_colors(seq, found_colors, range, break_on_stop_tags, stop_big_traversals); + hb->sweep_sequence_for_colors(seq, found_colors, break_on_stop_tags, stop_big_traversals); } catch (_khmer_signal &e) { exc_raised = true; } @@ -3965,7 +3965,7 @@ static PyObject * hashbits_sweep_tag_neighborhood(PyObject * self, PyObject *arg khmer::Hashbits * hashbits = me->hashbits; char * seq = NULL; - PyObject * r = NULL; + unsigned int r = NULL; PyObject * break_on_stop_tags_o = NULL; PyObject * stop_big_traversals_o = NULL; @@ -3975,7 +3975,7 @@ static PyObject * hashbits_sweep_tag_neighborhood(PyObject * self, PyObject *arg return NULL; } - unsigned int range = (2 * hashbits->_tag_density) + 1; + unsigned int range = (2 * hashbits->_get_tag_density()) + 1; if (r) { range = r; } @@ -3994,19 +3994,11 @@ static PyObject * hashbits_sweep_tag_neighborhood(PyObject * self, PyObject *arg } khmer::SeenSet tagged_kmers; - khmer::HashIntoType kmer_f, kmer_r, kmer; - KMerIterator kmers(seq, hashbits->ksize()); - std::string kmer_s; + //Py_BEGIN_ALLOW_THREADS - while (!kmers.done()) { - kmer = kmers.next(); - kmer_s = 
khmer::_revhash(kmer, hashbits->ksize()); - kmer = khmer::_hash(kmer_s.c_str(), hashbits->ksize(), kmer_f, kmer_r); - - hashbits->partition->find_all_tags(kmer_f, kmer_r, tagged_kmers, - hashbits->all_tags, break_on_stop_tags, stop_big_traversals); - } + hashbits->partition->sweep_for_tags(seq, tagged_kmers, + hashbits->all_tags, range, break_on_stop_tags, stop_big_traversals); //Py_END_ALLOW_THREADS @@ -4132,9 +4124,9 @@ static PyMethodDef khmer_hashbits_methods[] = { { "repartition_largest_partition", hashbits_repartition_largest_partition, METH_VARARGS, "" }, { "get_median_count", hashbits_get_median_count, METH_VARARGS, "Get the median, average, and stddev of the k-mer counts in the string" }, { "consume_fasta_and_tag_with_colors", hashbits_consume_fasta_and_tag_with_colors, METH_VARARGS, "" }, - { "sweep_sequence_for_colors", hashbits_sweep_sequence_for_colors, METH_VARARGS, "" }, + { "sweep_color_neighborhood", hashbits_sweep_color_neighborhood, METH_VARARGS, "" }, {"consume_partitioned_fasta_and_tag_with_colors", hashbits_consume_partitioned_fasta_and_tag_with_colors, METH_VARARGS, "" }, - {"get_all_tags", hashbits_get_all_tags, METH_VARARGS, "" }, + {"sweep_tag_neighborhood", hashbits_sweep_tag_neighborhood, METH_VARARGS, "" }, {"get_tag_colors", hashbits_get_tag_colors, METH_VARARGS, ""}, {"consume_sequence_and_tag_with_colors", hashbits_consume_sequence_and_tag_with_colors, METH_VARARGS, "" }, {"n_colors", hashbits_n_colors, METH_VARARGS, ""}, From 74c9a88f67ad6903aa4573da611635bf246a7a00 Mon Sep 17 00:00:00 2001 From: CS Welcher Date: Tue, 1 Oct 2013 16:11:49 -0400 Subject: [PATCH 042/140] fixed error in traversal params --- lib/color_tst_opt.py | 78 +++++++++++++++++++++++++++++++++++++++++++ lib/color_tst_slow.py | 78 +++++++++++++++++++++++++++++++++++++++++++ lib/subset.cc | 2 +- 3 files changed, 157 insertions(+), 1 deletion(-) create mode 100644 lib/color_tst_opt.py create mode 100644 lib/color_tst_slow.py diff --git a/lib/color_tst_opt.py 
b/lib/color_tst_opt.py new file mode 100644 index 0000000000..8c75fe2e5b --- /dev/null +++ b/lib/color_tst_opt.py @@ -0,0 +1,78 @@ +import khmer +import screed + +def reverse_comp(s): + ret = '' + for i in range(len(s)-1,-1,-1): + c = s[i] + if c == 'A': + ret += 'T' + elif c == 'T': + ret += 'A' + elif c == 'G': + ret += 'C' + else: + ret += 'G' + return ret + +ht = khmer.new_hashbits(20,1e8,4) +ht.consume_fasta_and_tag_with_colors('../tests/test-data/test-reads.fa') +N = 100 +for n, record in enumerate(screed.open('../tests/test-data/test-reads.fa')): + if n >= N: + break + ht.sweep_tag_neighborhood(record.sequence, 10) + +#print ht.sweep_sequence_for_colors('CACACACGGACATCGGAGAGAGGCTGAGACAGCGAGACACACAGAGACAGAGCGGAGAGGGCACAGACAGACAAGAGCATGAGAGATCGGCAGAGCGGTG', False, False) +#print ht.sweep_sequence_for_colors('CGCCGTAGTCGTACTGGTTCTCCTCCGTGTACTCGTGCGCTGCCTCCACCTCTGGGCTGCTCATGCCCTCCATGTGACCTTCAGGCATGCCCTCGGAGAT', False, False) +#print ht.sweep_sequence_for_colors('GGAGAGCCTGGGGCCAAGCCCGAGGGCATGCCTGAAGGTCACATGGAGGGCATGAGCAGCCCAG', False, False) +#print ht.sweep_sequence_for_colors('TTTTTTGAATACGTTTAGTTAATATTTGTACTTCAATTAATAAAAATTTGCTATAATTTTTCCATTATCGCCAGTCACTCGCGTGATATAGGAAAAGGTT', False, False) +#print ht.sweep_sequence_for_colors('AAGCAGTGGTATCAACGCAGAGTACGCGGGGACTCTGTCGCTGCTCCTCTAGCACAGAGAGCCAGAGACGGCTTACAGCAGCAGCATCATATAGCCTC', False, False) + +#t0 = 'CCATGTAGCGCCGCACACCTTTGTAGGTGTTGTAATAATCTTCGATGACTTTCTTCGCTTCCTGACGGCTTATGCC' +#t1 = 'ACCGCGCGCGAATCGACGGTTGTCAGCCAAAGGCGTTCAACACCAGCACCGCCCTTAAGCCGCCCGCCCGCCGCCC' +''' +N = 100 +for n, record in enumerate(screed.open('../tests/test-data/test-reads.fa')): + if n > N: + break + print '*' * 40 + seq = record.sequence + print seq + colors = ht.sweep_sequence_for_colors(seq, False, False) + print 'colors from sweep:', colors + tags = ht.get_all_tags(seq) + print 'tags from get_all_tags:', tags + print 'colors from get_tag_colors:' + t_colors = set() + for tag in tags: + t_colors.update(ht.get_tag_colors(tag)) + print 
t_colors + assert len(t_colors) == len(colors) +''' +''' +file_pointers = {} +for n, record in enumerate(screed.open('/w/2013-lamprey/syn_part/syn.sweep.fa')): + if n >= N: + break + if n % 1000 == 0: + print '...processed {} reads'.format(n) + colors = ht.sweep_sequence_for_colors(record.sequence, False, False) + for c in colors: + if c in file_pointers.viewkeys(): + file_pointers[c].write('>{}\n{}\n'.format(record.name, record.sequence)) + else: + file_pointers[c] = open('color_{}.fa'.format(c), 'wb') + file_pointers[c].write('>{}\n{}\n'.format(record.name, record.sequence))\ +''' +''' +ht = khmer.new_hashbits(25, 1e9,4) +ht.consume_partitioned_fasta_and_tag_with_colors('/w/2013-lamprey/test.fp') + +for n, record in enumerate(screed.open('/w/lamprey-mrnaseq/reads/single/L82-a.fq.gz')): + if n >= N: + break + colors = ht.sweep_sequence_for_colors(record.sequence, False, False) + if colors: + print colors +''' diff --git a/lib/color_tst_slow.py b/lib/color_tst_slow.py new file mode 100644 index 0000000000..2f25f857e4 --- /dev/null +++ b/lib/color_tst_slow.py @@ -0,0 +1,78 @@ +import khmer +import screed + +def reverse_comp(s): + ret = '' + for i in range(len(s)-1,-1,-1): + c = s[i] + if c == 'A': + ret += 'T' + elif c == 'T': + ret += 'A' + elif c == 'G': + ret += 'C' + else: + ret += 'G' + return ret + +ht = khmer.new_hashbits(20,1e8,4) +ht.consume_fasta_and_tag_with_colors('../tests/test-data/test-reads.fa') +N = 100 +for n, record in enumerate(screed.open('../tests/test-data/test-reads.fa')): + if n >= N: + break + ht.sweep_color_neighborhood(record.sequence) + +#print ht.sweep_sequence_for_colors('CACACACGGACATCGGAGAGAGGCTGAGACAGCGAGACACACAGAGACAGAGCGGAGAGGGCACAGACAGACAAGAGCATGAGAGATCGGCAGAGCGGTG', False, False) +#print ht.sweep_sequence_for_colors('CGCCGTAGTCGTACTGGTTCTCCTCCGTGTACTCGTGCGCTGCCTCCACCTCTGGGCTGCTCATGCCCTCCATGTGACCTTCAGGCATGCCCTCGGAGAT', False, False) +#print 
ht.sweep_sequence_for_colors('GGAGAGCCTGGGGCCAAGCCCGAGGGCATGCCTGAAGGTCACATGGAGGGCATGAGCAGCCCAG', False, False) +#print ht.sweep_sequence_for_colors('TTTTTTGAATACGTTTAGTTAATATTTGTACTTCAATTAATAAAAATTTGCTATAATTTTTCCATTATCGCCAGTCACTCGCGTGATATAGGAAAAGGTT', False, False) +#print ht.sweep_sequence_for_colors('AAGCAGTGGTATCAACGCAGAGTACGCGGGGACTCTGTCGCTGCTCCTCTAGCACAGAGAGCCAGAGACGGCTTACAGCAGCAGCATCATATAGCCTC', False, False) + +#t0 = 'CCATGTAGCGCCGCACACCTTTGTAGGTGTTGTAATAATCTTCGATGACTTTCTTCGCTTCCTGACGGCTTATGCC' +#t1 = 'ACCGCGCGCGAATCGACGGTTGTCAGCCAAAGGCGTTCAACACCAGCACCGCCCTTAAGCCGCCCGCCCGCCGCCC' +''' +N = 100 +for n, record in enumerate(screed.open('../tests/test-data/test-reads.fa')): + if n > N: + break + print '*' * 40 + seq = record.sequence + print seq + colors = ht.sweep_sequence_for_colors(seq, False, False) + print 'colors from sweep:', colors + tags = ht.get_all_tags(seq) + print 'tags from get_all_tags:', tags + print 'colors from get_tag_colors:' + t_colors = set() + for tag in tags: + t_colors.update(ht.get_tag_colors(tag)) + print t_colors + assert len(t_colors) == len(colors) +''' +''' +file_pointers = {} +for n, record in enumerate(screed.open('/w/2013-lamprey/syn_part/syn.sweep.fa')): + if n >= N: + break + if n % 1000 == 0: + print '...processed {} reads'.format(n) + colors = ht.sweep_sequence_for_colors(record.sequence, False, False) + for c in colors: + if c in file_pointers.viewkeys(): + file_pointers[c].write('>{}\n{}\n'.format(record.name, record.sequence)) + else: + file_pointers[c] = open('color_{}.fa'.format(c), 'wb') + file_pointers[c].write('>{}\n{}\n'.format(record.name, record.sequence))\ +''' +''' +ht = khmer.new_hashbits(25, 1e9,4) +ht.consume_partitioned_fasta_and_tag_with_colors('/w/2013-lamprey/test.fp') + +for n, record in enumerate(screed.open('/w/lamprey-mrnaseq/reads/single/L82-a.fq.gz')): + if n >= N: + break + colors = ht.sweep_sequence_for_colors(record.sequence, False, False) + if colors: + print colors +''' diff --git 
a/lib/subset.cc b/lib/subset.cc index 6dd8715dd8..ba54f8e50f 100644 --- a/lib/subset.cc +++ b/lib/subset.cc @@ -304,7 +304,7 @@ void SubsetPartition::queue_neighbors(HashIntoType kmer_f, unsigned int breadth, SeenSet& traversed_kmers, NodeQueue& node_q, - std::queue breadth_q) { + std::queue& breadth_q) { HashIntoType f, r; const unsigned int rc_left_shift = _ht->ksize()*2 - 2; From 05cbe35928bca46cbb79be89ca5af27b1b0ff47d Mon Sep 17 00:00:00 2001 From: CS Welcher Date: Tue, 1 Oct 2013 17:08:14 -0400 Subject: [PATCH 043/140] fixed func prototype to match prev change --- lib/subset.hh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/subset.hh b/lib/subset.hh index 3a5dfb0ab3..9664b71a36 100644 --- a/lib/subset.hh +++ b/lib/subset.hh @@ -68,7 +68,7 @@ namespace khmer { unsigned int breadth, SeenSet& traversed_kmers, NodeQueue& node_q, - std::queue breadth_q); + std::queue& breadth_q); void find_all_tags(HashIntoType kmer_f, HashIntoType kmer_r, SeenSet& tagged_kmers, From 235b746d7d061f5f86668b11c5caf0c0ec6ff1d7 Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Tue, 1 Oct 2013 19:37:37 -0400 Subject: [PATCH 044/140] fixed bug in color allocation during consume_fasta, fixed unexpected behavior in new traversal code and improved performance, fixed tests, added new function for exporting all colors --- lib/hashtable.cc | 19 ++++++++-- lib/hashtable.hh | 6 ++++ lib/subset.cc | 13 ++++--- lib/subset.hh | 10 +----- python/_khmermodule.cc | 81 +++++++++++++++++++++++++++++++++++++++--- tests/test_hashbits.py | 57 ++++++++++++++++------------- 6 files changed, 139 insertions(+), 47 deletions(-) diff --git a/lib/hashtable.cc b/lib/hashtable.cc index 9ff868a161..ad1adca0f9 100644 --- a/lib/hashtable.cc +++ b/lib/hashtable.cc @@ -2003,7 +2003,7 @@ Hashtable::consume_fasta_and_tag_with_colors( Color _tag_color = 0; - Color * the_color = check_and_allocate_color(_tag_color); + Color * the_color; // Iterate through the reads and consume their k-mers. 
while (!parser->is_complete( )) { @@ -2014,11 +2014,11 @@ Hashtable::consume_fasta_and_tag_with_colors( if (check_and_normalize_read( read.sequence )) { // TODO: make threadsafe! + the_color = check_and_allocate_color(_tag_color); consume_sequence_and_tag_with_colors( read.sequence, this_n_consumed, *the_color ); _tag_color++; - the_color = check_and_allocate_color(_tag_color); #ifdef WITH_INTERNAL_METRICS hasher.pmetrics.start_timers( ); @@ -2245,6 +2245,21 @@ unsigned int Hashtable::sweep_sequence_for_colors(const std::string& seq, return traversed_kmers.size(); } +unsigned int Hashtable::sweep_color_neighborhood(const std::string& seq, + ColorPtrSet& found_colors, + unsigned int range, + bool break_on_stoptags, + bool stop_big_traversals) { + + SeenSet tagged_kmers; + unsigned int num_traversed; + num_traversed = partition->sweep_for_tags(seq, tagged_kmers, all_tags, + range, break_on_stoptags, stop_big_traversals); + traverse_colors_and_resolve(tagged_kmers, found_colors); + + return num_traversed; +} + ColorPtrSet Hashtable::get_tag_colors(const HashIntoType& tag) { ColorPtrSet colors; unsigned int num_colors; diff --git a/lib/hashtable.hh b/lib/hashtable.hh index 2fa703c153..23c8c7b0ec 100644 --- a/lib/hashtable.hh +++ b/lib/hashtable.hh @@ -543,6 +543,12 @@ namespace khmer { bool break_on_stoptags, bool stop_big_traversals); + unsigned int sweep_color_neighborhood(const std::string & seq, + ColorPtrSet& found_colors, + unsigned int range, + bool break_on_stoptags, + bool stop_big_traversals); + void traverse_colors_and_resolve(const SeenSet& tagged_kmers, ColorPtrSet& found_colors); diff --git a/lib/subset.cc b/lib/subset.cc index ba54f8e50f..7060865c50 100644 --- a/lib/subset.cc +++ b/lib/subset.cc @@ -530,7 +530,7 @@ void SubsetPartition::find_all_tags(HashIntoType kmer_f, // Perform a breadth-first search starting from the k-mers in the given sequence -unsigned int SubsetPartition::sweep_for_tags(char * seq, +unsigned int 
SubsetPartition::sweep_for_tags(const std::string& seq, SeenSet& tagged_kmers, const SeenSet& all_tags, unsigned int range, @@ -551,16 +551,17 @@ unsigned int SubsetPartition::sweep_for_tags(char * seq, // start breadth-first search. HashIntoType kmer_f, kmer_r, kmer; - KMerIterator kmers(seq, _ht->ksize()); + KMerIterator kmers(seq.c_str(), _ht->ksize()); std::string kmer_s; - // Queue up all the sequenes k-mers at breadth zero + // Queue up all the sequence's k-mers at breadth zero // We are searching around the perimeter of the known k-mers // @cswelcher still using kludgy kmer iterator, let's fix this sometime... while (!kmers.done()) { kmer = kmers.next(); kmer_s = _revhash(kmer, _ht->ksize()); kmer = _hash(kmer_s.c_str(), _ht->ksize(), kmer_f, kmer_r); + traversed_kmers.insert(kmer); node_q.push(kmer_f); node_q.push(kmer_r); @@ -599,10 +600,8 @@ unsigned int SubsetPartition::sweep_for_tags(char * seq, traversed_kmers.insert(kmer); total++; - // Is this a kmer-to-tag, and have we put this tag in a partition already? - // Search no further in this direction. (This is where we connect - // partitions.) 
- if (breadth && set_contains(all_tags, kmer)) { + // + if (set_contains(all_tags, kmer)) { tagged_kmers.insert(kmer); continue; } diff --git a/lib/subset.hh b/lib/subset.hh index 9664b71a36..94809ca421 100644 --- a/lib/subset.hh +++ b/lib/subset.hh @@ -76,15 +76,7 @@ namespace khmer { bool break_on_stop_tags=false, bool stop_big_traversals=false); - void find_all_tags(HashIntoType kmer_f, - HashIntoType kmer_r, - SeenSet& tagged_kmers, - SeenSet& traversed_kmers, - const SeenSet& all_tags, - bool break_on_stop_tags, - bool stop_big_traversals); - - unsigned int sweep_for_tags(char * seq, + unsigned int sweep_for_tags(const std::string& seq, SeenSet& tagged_kmers, const SeenSet& all_tags, unsigned int range, diff --git a/python/_khmermodule.cc b/python/_khmermodule.cc index 81d2864130..e0461e94da 100644 --- a/python/_khmermodule.cc +++ b/python/_khmermodule.cc @@ -3818,6 +3818,20 @@ static PyObject * hashbits_get_median_count(PyObject * self, PyObject * args) return Py_BuildValue("iff", med, average, stddev); } +static PyObject * hashbits_get_color_dict(PyObject * self, PyObject * args) { + khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; + khmer::Hashbits * hb = me->hashbits; + + PyObject * d = PyDict_New(); + khmer::ColorPtrMap::iterator it; + + for (it = hb->color_ptrs.begin(); it!=hb->color_ptrs.end(); ++it) { + PyDict_SetItem(d, Py_BuildValue("K", it->first), Py_BuildValue("K", it->second)); + } + + return d; +} + static PyObject * hashbits_consume_fasta_and_tag_with_colors(PyObject * self, PyObject * args) { khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; @@ -3907,11 +3921,69 @@ static PyObject * hashbits_sweep_color_neighborhood(PyObject * self, PyObject * khmer::Hashbits * hb = me->hashbits; char * seq = NULL; - unsigned int range = NULL; + unsigned int r = NULL; + PyObject * break_on_stop_tags_o = NULL; + PyObject * stop_big_traversals_o = NULL; + + if (!PyArg_ParseTuple(args, "s|iOO", &seq, &r, + &break_on_stop_tags_o, + 
&stop_big_traversals_o)) { + return NULL; + } + + unsigned int range = (2 * hb->_get_tag_density()) + 1; + if (r) { + range = r; + } + + bool break_on_stop_tags = false; + if (break_on_stop_tags_o && PyObject_IsTrue(break_on_stop_tags_o)) { + break_on_stop_tags = true; + } + bool stop_big_traversals = false; + if (stop_big_traversals_o && PyObject_IsTrue(stop_big_traversals_o)) { + stop_big_traversals = true; + } + + if (strlen(seq) < hb->ksize()) { + return NULL; + } + + //std::pair ret; + ColorPtrSet found_colors; + + bool exc_raised = false; + //Py_BEGIN_ALLOW_THREADS + try { + hb->sweep_color_neighborhood(seq, found_colors, range, break_on_stop_tags, stop_big_traversals); + } catch (_khmer_signal &e) { + exc_raised = true; + } + //Py_END_ALLOW_THREADS + + if (exc_raised) return NULL; + + PyObject * x = PyList_New(found_colors.size()); + khmer::ColorPtrSet::const_iterator si; + unsigned long long i = 0; + for (si=found_colors.begin(); si!=found_colors.end(); ++si) { + PyList_SET_ITEM(x, i, Py_BuildValue("K", *(*si))); + i++; + } + + return x; +} + + +static PyObject * hashbits_sweep_color_neighborhood_old(PyObject * self, PyObject * args) { + khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; + khmer::Hashbits * hb = me->hashbits; + + char * seq = NULL; PyObject * break_on_stop_tags_o = NULL; PyObject * stop_big_traversals_o = NULL; - if (!PyArg_ParseTuple(args, "si|OO", &seq, &range, + if (!PyArg_ParseTuple(args, "s|OO", &seq, &break_on_stop_tags_o, &stop_big_traversals_o)) { return NULL; @@ -3955,8 +4027,7 @@ static PyObject * hashbits_sweep_color_neighborhood(PyObject * self, PyObject * return x; } -// Same as find_all_tags, but returns tags in a way actually useable by python -// @cswelcher TODO: this is broken az, fix it asap +// Similar to find_all_tags, but returns tags in a way actually useable by python // need a tags_in_sequence iterator or function in c++ land for reuse in all // these functions static PyObject * 
hashbits_sweep_tag_neighborhood(PyObject * self, PyObject *args) @@ -4125,11 +4196,13 @@ static PyMethodDef khmer_hashbits_methods[] = { { "get_median_count", hashbits_get_median_count, METH_VARARGS, "Get the median, average, and stddev of the k-mer counts in the string" }, { "consume_fasta_and_tag_with_colors", hashbits_consume_fasta_and_tag_with_colors, METH_VARARGS, "" }, { "sweep_color_neighborhood", hashbits_sweep_color_neighborhood, METH_VARARGS, "" }, + { "sweep_color_neighborhood_old", hashbits_sweep_color_neighborhood_old, METH_VARARGS, "" }, {"consume_partitioned_fasta_and_tag_with_colors", hashbits_consume_partitioned_fasta_and_tag_with_colors, METH_VARARGS, "" }, {"sweep_tag_neighborhood", hashbits_sweep_tag_neighborhood, METH_VARARGS, "" }, {"get_tag_colors", hashbits_get_tag_colors, METH_VARARGS, ""}, {"consume_sequence_and_tag_with_colors", hashbits_consume_sequence_and_tag_with_colors, METH_VARARGS, "" }, {"n_colors", hashbits_n_colors, METH_VARARGS, ""}, + {"get_color_dict", hashbits_get_color_dict, METH_VARARGS, "" }, {NULL, NULL, 0, NULL} /* sentinel */ }; diff --git a/tests/test_hashbits.py b/tests/test_hashbits.py index b38112c4df..c69ccf2e4f 100644 --- a/tests/test_hashbits.py +++ b/tests/test_hashbits.py @@ -506,12 +506,12 @@ def test_simple_median(): # * thread-safety # * n_colors -- make sure to use test-data with multi-colored tags -def test_get_all_tags(): +def test_sweep_tag_neighborhood(): hb = khmer.new_hashbits(20, 1e7, 4) filename = utils.get_test_data('single-read.fq') hb.consume_fasta_and_tag(filename) - tags = hb.get_all_tags('CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT') + tags = hb.sweep_tag_neighborhood('CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT') assert len(tags) == 1 assert tags.pop() == 173473779682L @@ -530,20 +530,42 @@ def test_sweep_sequence_for_colors(): filename = utils.get_test_data('single-read.fq') hb.consume_fasta_and_tag_with_colors(filename) - colors = hb.sweep_sequence_for_colors('CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT') + 
colors = hb.sweep_color_neighborhood('CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT') assert len(colors) == 1 assert colors.pop() == 0L +def test_consume_partitioned_fasta_and_tag_with_colors(): + hb = khmer.new_hashbits(20, 1e7, 4) + filename = utils.get_test_data('real-partition-small.fa') + + total_reads, n_consumed = hb.consume_partitioned_fasta_and_tag_with_colors(filename) + colors = set() + for record in screed.open(filename): + seq = record.sequence + colors.update(hb.sweep_color_neighborhood(seq, False, False)) + #print hb.n_colors() + #print colors + assert len(colors) == 1 + assert colors.pop() == 2L + assert hb.n_colors() == 1 + def test_consume_fasta_and_tag_with_colors(): hb = khmer.new_hashbits(20, 1e7, 4) read_1 = 'ACGTAACCGGTTAAACCCGGGTTTAAAACCCCGGGGTTTT' filename = utils.get_test_data('test-transcript.fa') total_reads, n_consumed = hb.consume_fasta_and_tag_with_colors(filename) - + assert hb.get(read_1[:20]) assert total_reads == 3 - #assert hb.n_colors() == 3 + print hb.n_colors() + print hb.get_color_dict() + for tag in hb.get_tagset(): + print tag, khmer.forward_hash(tag, 20) + for record in screed.open(filename): + print hb.sweep_tag_neighborhood(record.sequence, 40) + print hb.sweep_color_neighborhood(record.sequence, 40) + assert hb.n_colors() == 3 ''' * The test data set as four reads: A, B, C, and D @@ -558,7 +580,7 @@ def test_color_tag_correctness(): hb.consume_fasta_and_tag_with_colors(filename) # read A - colors = hb.sweep_sequence_for_colors('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG') + colors = hb.sweep_color_neighborhood('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG') print colors assert len(colors) == 2 @@ -566,7 +588,7 @@ def test_color_tag_correctness(): assert 1L in colors # read B - colors = hb.sweep_sequence_for_colors('GCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA') + colors = hb.sweep_color_neighborhood('GCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA') print colors assert len(colors) == 3 assert 0L in colors @@ -574,29 
+596,14 @@ def test_color_tag_correctness(): assert 2L in colors # read C - colors = hb.sweep_sequence_for_colors('TGTTGGGGATAGATAGATAGATGAGTGTAGATCCAACAACACATACA') + colors = hb.sweep_color_neighborhood('TGTTGGGGATAGATAGATAGATGAGTGTAGATCCAACAACACATACA') print colors assert len(colors) == 2 assert 1L in colors assert 2L in colors # read D - colors = hb.sweep_sequence_for_colors('TATATATATAGCTAGCTAGCTAACTAGCTAGCATCGATCGATCGATC') + colors = hb.sweep_color_neighborhood('TATATATATAGCTAGCTAGCTAACTAGCTAGCATCGATCGATCGATC') print colors assert len(colors) == 1 - assert 3L in colors - - -def test_consume_partitioned_fasta_and_tag_with_colors(): - hb = khmer.new_hashbits(20, 1e7, 4) - filename = utils.get_test_data('real-partition-small.fa') - - total_reads, n_consumed = hb.consume_partitioned_fasta_and_tag_with_colors(filename) - #assert hb.n_colors() == 1 - colors = set() - for record in screed.open(filename): - seq = record.sequence - colors.update(hb.sweep_sequence_for_colors(seq, False, False)) - assert len(colors) == 1 - assert colors.pop() == 2L - #assert hb.n_colors() == 1 + assert 3L in colors From 48069138f2b96beb348ac94d2852bc9d8898eafa Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Tue, 1 Oct 2013 19:54:01 -0400 Subject: [PATCH 045/140] added new tests for n_colors and get_color_dict --- scripts/sweep-reads-by-partition.py | 6 +++--- tests/test_hashbits.py | 21 ++++++++++++++++++++- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/scripts/sweep-reads-by-partition.py b/scripts/sweep-reads-by-partition.py index f2a59f73a4..70870bf5e9 100755 --- a/scripts/sweep-reads-by-partition.py +++ b/scripts/sweep-reads-by-partition.py @@ -77,13 +77,13 @@ def main(): print >>sys.stderr,'** sweeping {read_file} for colors...'.format(read_file=read_file) for n, record in enumerate(screed.open(read_file)): - if n % 10000 == 0: - print >>sys.stderr, '\tswept {n} reads [{nc} colored, {no} orphaned' \ + if n % 50000 == 0: + print >>sys.stderr, '\tswept {n} reads 
[{nc} colored, {no} orphaned]' \ .format(n=n, nc=n_colored, no=n_orphaned) seq = record.sequence name = record.name - colors = ht.sweep_sequence_for_colors(seq, False, False) + colors = ht.sweep_color_neighborhood(seq) color_number_dist.append(len(colors)) if colors: n_colored += 1 diff --git a/tests/test_hashbits.py b/tests/test_hashbits.py index c69ccf2e4f..4d91ca9bb8 100644 --- a/tests/test_hashbits.py +++ b/tests/test_hashbits.py @@ -504,7 +504,26 @@ def test_simple_median(): # # @cswelcher TODO: more tests! # * thread-safety -# * n_colors -- make sure to use test-data with multi-colored tags + +def test_n_colors(): + hb = khmer.new_hashbits(20, 1e7, 4) + filename = utils.get_test_data('test-colors.fa') + hb.consume_fasta_and_tag_with_colors(filename) + + print hb.n_colors() + assert hb.n_colors() == 4 + +def test_get_color_dict(): + hb = khmer.new_hashbits(20, 1e7, 4) + filename = utils.get_test_data('test-colors.fa') + hb.consume_fasta_and_tag_with_colors(filename) + + colors = hb.get_color_dict() + expected = [0L, 1L, 2L, 3L] + for e_color in expected: + assert e_color in colors + for a_color in colors: + assert a_color in expected def test_sweep_tag_neighborhood(): hb = khmer.new_hashbits(20, 1e7, 4) From 15d7a88d6638d4325b7fc41dd469ab0c24d1eba1 Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Tue, 1 Oct 2013 20:16:01 -0400 Subject: [PATCH 046/140] fixed incorrect memory estimation on sweep-reads-by-partition --- scripts/sweep-reads-by-partition.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sweep-reads-by-partition.py b/scripts/sweep-reads-by-partition.py index 70870bf5e9..ed6d31dcd8 100755 --- a/scripts/sweep-reads-by-partition.py +++ b/scripts/sweep-reads-by-partition.py @@ -48,7 +48,7 @@ def main(): print >>sys.stderr, '' print >>sys.stderr, \ 'Estimated memory usage is {prod:.2g} bytes \ - (n_hashes x min_hashsize)'.format(prod=args.n_hashes*args.min_hashsize) + (n_hashes x min_hashsize / 
8)'.format(prod=args.n_hashes*args.min_hashsize/8) print >>sys.stderr, '-' * 8 K = args.ksize From b20d00a99d3a8aef525a33253e65eb7a60d68334 Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Wed, 2 Oct 2013 17:02:40 -0400 Subject: [PATCH 047/140] squashed bug with using a traversal range of 0 --- lib/hashtable.cc | 5 ++++- lib/subset.cc | 7 +++++-- python/_khmermodule.cc | 11 +++++++---- 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/lib/hashtable.cc b/lib/hashtable.cc index ad1adca0f9..94d1a1a9da 100644 --- a/lib/hashtable.cc +++ b/lib/hashtable.cc @@ -2256,7 +2256,10 @@ unsigned int Hashtable::sweep_color_neighborhood(const std::string& seq, num_traversed = partition->sweep_for_tags(seq, tagged_kmers, all_tags, range, break_on_stoptags, stop_big_traversals); traverse_colors_and_resolve(tagged_kmers, found_colors); - + //printf("range=%u ", range); + if (range == 0) { + assert(num_traversed == seq.length()-ksize()+1); + } return num_traversed; } diff --git a/lib/subset.cc b/lib/subset.cc index 7060865c50..c20ca04a65 100644 --- a/lib/subset.cc +++ b/lib/subset.cc @@ -543,7 +543,7 @@ unsigned int SubsetPartition::sweep_for_tags(const std::string& seq, std::queue breadth_q; unsigned int cur_breadth = 0; unsigned int breadth = 0; - const unsigned int max_breadth = (2 * _ht->_tag_density) + 1; + const unsigned int max_breadth = range; unsigned int total = 0; @@ -568,9 +568,12 @@ unsigned int SubsetPartition::sweep_for_tags(const std::string& seq, breadth_q.push(0); } + unsigned int seq_length = node_q.size() / 2; + unsigned int BIG_PERIMETER_TRAVERSALS = BIG_TRAVERSALS_ARE * seq_length; + while(!node_q.empty()) { // change this to a better hueristic - if (stop_big_traversals && traversed_kmers.size() > BIG_TRAVERSALS_ARE) { + if (stop_big_traversals && traversed_kmers.size() > BIG_PERIMETER_TRAVERSALS) { tagged_kmers.clear(); break; } diff --git a/python/_khmermodule.cc b/python/_khmermodule.cc index e0461e94da..60ebc1cb12 100644 --- 
a/python/_khmermodule.cc +++ b/python/_khmermodule.cc @@ -3889,7 +3889,7 @@ static PyObject * hashbits_consume_partitioned_fasta_and_tag_with_colors( return NULL; } - return Py_BuildValue("iL", total_reads, n_consumed); + return Py_BuildValue("iK", total_reads, n_consumed); } static PyObject * hashbits_consume_sequence_and_tag_with_colors(PyObject * self, PyObject * args) { @@ -3932,7 +3932,7 @@ static PyObject * hashbits_sweep_color_neighborhood(PyObject * self, PyObject * } unsigned int range = (2 * hb->_get_tag_density()) + 1; - if (r) { + if (r >= 0) { range = r; } @@ -3953,14 +3953,17 @@ static PyObject * hashbits_sweep_color_neighborhood(PyObject * self, PyObject * ColorPtrSet found_colors; bool exc_raised = false; + unsigned int num_traversed = 0; //Py_BEGIN_ALLOW_THREADS try { - hb->sweep_color_neighborhood(seq, found_colors, range, break_on_stop_tags, stop_big_traversals); + num_traversed = hb->sweep_color_neighborhood(seq, found_colors, range, break_on_stop_tags, stop_big_traversals); } catch (_khmer_signal &e) { exc_raised = true; } //Py_END_ALLOW_THREADS + //printf("...%u kmers traversed\n", num_traversed); + if (exc_raised) return NULL; PyObject * x = PyList_New(found_colors.size()); @@ -4047,7 +4050,7 @@ static PyObject * hashbits_sweep_tag_neighborhood(PyObject * self, PyObject *arg } unsigned int range = (2 * hashbits->_get_tag_density()) + 1; - if (r) { + if (r >= 0) { range = r; } From add2eab20c3205384d7e77887aef1036cca6b00c Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Wed, 2 Oct 2013 17:45:02 -0400 Subject: [PATCH 048/140] couple debugging things added to subset --- lib/subset.cc | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/lib/subset.cc b/lib/subset.cc index c20ca04a65..e58b862edd 100644 --- a/lib/subset.cc +++ b/lib/subset.cc @@ -544,7 +544,7 @@ unsigned int SubsetPartition::sweep_for_tags(const std::string& seq, unsigned int cur_breadth = 0; unsigned int breadth = 0; const unsigned int max_breadth = range; 
- + unsigned int breadth_seen = 0; unsigned int total = 0; @@ -571,6 +571,7 @@ unsigned int SubsetPartition::sweep_for_tags(const std::string& seq, unsigned int seq_length = node_q.size() / 2; unsigned int BIG_PERIMETER_TRAVERSALS = BIG_TRAVERSALS_ARE * seq_length; + unsigned int cur_it = 0; while(!node_q.empty()) { // change this to a better hueristic if (stop_big_traversals && traversed_kmers.size() > BIG_PERIMETER_TRAVERSALS) { @@ -584,6 +585,12 @@ unsigned int SubsetPartition::sweep_for_tags(const std::string& seq, node_q.pop(); breadth = breadth_q.front(); breadth_q.pop(); + cur_it++; + printf("current iteration: %u, current breadth: %u\n", cur_it, breadth); + + if (breadth > breadth_seen) { + breadth_seen = breadth; + } HashIntoType kmer = uniqify_rc(kmer_f, kmer_r); @@ -617,6 +624,7 @@ unsigned int SubsetPartition::sweep_for_tags(const std::string& seq, queue_neighbors(kmer_f, kmer_r, breadth, traversed_kmers, node_q, breadth_q); } + printf("breadth_seen=%u, total=%u, traverse_kmers=%u\n", breadth_seen, total, traversed_kmers.size()); return total; } From bf5a54482c16c7762e0fc81bc52fe5fc04b664df Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Wed, 2 Oct 2013 23:29:25 -0400 Subject: [PATCH 049/140] added new test file --- tests/test-data/test-colors.fa | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 tests/test-data/test-colors.fa diff --git a/tests/test-data/test-colors.fa b/tests/test-data/test-colors.fa new file mode 100644 index 0000000000..80741ffcbf --- /dev/null +++ b/tests/test-data/test-colors.fa @@ -0,0 +1,8 @@ +>read_A +ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG +>read_B_overlap_A +GCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA +>read_C_overlap_B +TGTTGGGGATAGATAGATAGATGAGTGTAGATCCAACAACACATACA +>read_D +TATATATATAGCTAGCTAGCTAACTAGCTAGCATCGATCGATCGATC From e8d06aae3c527b77d7e86a78eaa939f4c0e2f280 Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Thu, 3 Oct 2013 02:09:24 -0400 Subject: [PATCH 050/140] final optimizations to 
traversal --- lib/subset.cc | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/lib/subset.cc b/lib/subset.cc index e58b862edd..09ff44a6c9 100644 --- a/lib/subset.cc +++ b/lib/subset.cc @@ -541,10 +541,10 @@ unsigned int SubsetPartition::sweep_for_tags(const std::string& seq, SeenSet traversed_kmers; NodeQueue node_q; std::queue breadth_q; - unsigned int cur_breadth = 0; + //unsigned int cur_breadth = 0; unsigned int breadth = 0; - const unsigned int max_breadth = range; - unsigned int breadth_seen = 0; + unsigned int max_breadth = range; + //unsigned int breadth_seen = 0; unsigned int total = 0; @@ -571,7 +571,7 @@ unsigned int SubsetPartition::sweep_for_tags(const std::string& seq, unsigned int seq_length = node_q.size() / 2; unsigned int BIG_PERIMETER_TRAVERSALS = BIG_TRAVERSALS_ARE * seq_length; - unsigned int cur_it = 0; + //unsigned int cur_it = 0; while(!node_q.empty()) { // change this to a better hueristic if (stop_big_traversals && traversed_kmers.size() > BIG_PERIMETER_TRAVERSALS) { @@ -585,12 +585,12 @@ unsigned int SubsetPartition::sweep_for_tags(const std::string& seq, node_q.pop(); breadth = breadth_q.front(); breadth_q.pop(); - cur_it++; - printf("current iteration: %u, current breadth: %u\n", cur_it, breadth); + //cur_it++; + //printf("current iteration: %u, current breadth: %u\n", cur_it, breadth); - if (breadth > breadth_seen) { - breadth_seen = breadth; - } + //if (breadth > breadth_seen) { + // breadth_seen = breadth; + //} HashIntoType kmer = uniqify_rc(kmer_f, kmer_r); @@ -613,6 +613,9 @@ unsigned int SubsetPartition::sweep_for_tags(const std::string& seq, // if (set_contains(all_tags, kmer)) { tagged_kmers.insert(kmer); + // if we find a tag, finish the remaining queued nodes, + // but don't queue up any more + max_breadth = breadth; continue; } @@ -620,11 +623,15 @@ unsigned int SubsetPartition::sweep_for_tags(const std::string& seq, //assert(breadth >= cur_breadth); // keep track of watermark, 
for debugging. //if (breadth > cur_breadth) { cur_breadth = breadth; } - if (breadth >= max_breadth or breadth >= range) { continue; } // truncate search @CTB exit? + if (breadth == max_breadth) { continue; } + // finish up nodes on the current level, but if we go beyond, end it immediately + // this keeps from having to look at nodes which have already been queued once we + // lower the limit after finding a tag + else if (breadth > max_breadth) { return total; } // truncate search @CTB exit? queue_neighbors(kmer_f, kmer_r, breadth, traversed_kmers, node_q, breadth_q); } - printf("breadth_seen=%u, total=%u, traverse_kmers=%u\n", breadth_seen, total, traversed_kmers.size()); + //printf("breadth_seen=%u, total=%u, traverse_kmers=%u\n", breadth_seen, total, traversed_kmers.size()); return total; } From 76e91d7ab2501f1870a90aa3c4b3676284c630a8 Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Thu, 3 Oct 2013 10:50:45 -0400 Subject: [PATCH 051/140] added temporary testing scripts, different sweep scripts, etc --- lib/sweep_perf.py | 28 +++++ scripts/sweep-reads-by-partition-to-file.py | 123 ++++++++++++++++++++ tests/test_hashbits.py | 7 +- 3 files changed, 155 insertions(+), 3 deletions(-) create mode 100755 lib/sweep_perf.py create mode 100755 scripts/sweep-reads-by-partition-to-file.py diff --git a/lib/sweep_perf.py b/lib/sweep_perf.py new file mode 100755 index 0000000000..923c2da8bb --- /dev/null +++ b/lib/sweep_perf.py @@ -0,0 +1,28 @@ +#! 
/w/khmer_dev/bin/python + +import khmer +import screed +import sys +import time + +R = int(sys.argv[1]) +print R +K = 20 +test_file = '/w/khmer/tests/test-data/biglump-random-20-a.fa' + +ht = khmer.new_hashbits(K, 1e9, 4) +ht.consume_fasta_and_tag_with_colors(test_file) + +N = 10 +for n, record in enumerate(screed.open(test_file)): + if n > N: + break + print '*' * 40 + print '{} k-mers in sequence'.format(len(record.sequence)-K+1) + + stime = time.clock() + colors = ht.sweep_color_neighborhood(record.sequence, R) + etime = time.clock() + + print 'traversal took {} seconds'.format(etime-stime) + print 'found {} colors'.format(len(colors)) diff --git a/scripts/sweep-reads-by-partition-to-file.py b/scripts/sweep-reads-by-partition-to-file.py new file mode 100755 index 0000000000..6cb0ed687a --- /dev/null +++ b/scripts/sweep-reads-by-partition-to-file.py @@ -0,0 +1,123 @@ +#! /w/khmer_dev/bin/python +# +# This file is part of khmer, http://github.com/ged-lab/khmer/, and is +# Copyright (C) Michigan State University, 2009-2013. It is licensed under +# the three-clause BSD license; see doc/LICENSE.txt. Contact: ctb@msu.edu +# +""" +Tag and color the given partitioned fasta, then find all reads in the neighborhood +of each partition and output to a file + +% python scripts/normalize-by-median.py [ -p ] -i ... + +Use '-h' for parameter help. 
+""" + +import khmer +import screed +import sys +import time +from khmer.counting_args import build_construct_args, DEFAULT_MIN_HASHSIZE + +MAX_FILES=512 +READS_PER_FILE = 100000000 + +def write_read(fp, seq, name, color): + fp.write('>{name}\t{color}\n{seq}\n'.format(seq=seq, name=name, color=color)) + +def main(): + parser = build_construct_args() + #parser.add_argument('-p', '--partitions_per_file', + # dest='partitions_per_file', default=DEFAULT_PPF) + parser.add_argument('-i', '--input_fastp',dest='input_fastp') + parser.add_argument('-r', '--traversal_range', type=int, dest='traversal_range') + parser.add_argument('input_reads', nargs='+') + args = parser.parse_args() + + if not args.quiet: + if args.min_hashsize == DEFAULT_MIN_HASHSIZE: + print >>sys.stderr, \ + "** WARNING: hashsize is default! " \ + "You absodefly want to increase this!\n** " \ + "Please read the docs!" + + print >>sys.stderr, '\nPARAMETERS:' + print >>sys.stderr, \ + ' - kmer size = {ksize:d} \t\t(-k)'.format(ksize=args.ksize) + print >>sys.stderr, \ + ' - n hashes = {nhash:d} \t\t(-N)'.format(nhash=args.n_hashes) + print >>sys.stderr, \ + ' - min hashsize = {mh:-5.2g} \t(-x)'.format(mh=args.min_hashsize) + print >>sys.stderr, '' + print >>sys.stderr, \ + 'Estimated memory usage is {prod:.2g} bytes \ + (n_hashes x min_hashsize / 8)'.format(prod=args.n_hashes*args.min_hashsize/8) + print >>sys.stderr, '-' * 8 + + K = args.ksize + HT_SIZE = args.min_hashsize + N_HT = args.n_hashes + + traversal_range = args.traversal_range + input_reads = args.input_reads + input_fastp = args.input_fastp + + ht = khmer.new_hashbits(K, HT_SIZE, N_HT) + print >>sys.stderr, 'consuming fastp...' 
+ ht.consume_partitioned_fasta_and_tag_with_colors(input_fastp) + + color_number_dist = [] + + n_orphaned = 0 + n_colored = 0 + n_mcolored = 0 + n_files = 0 + try: + outfp = open('colored_reads_0.fa', 'wb') + start_t = time.clock() + for read_file in input_reads: + print >>sys.stderr,'** sweeping {read_file} for colors...'.format(read_file=read_file) + total_t = 0.0 + for n, record in enumerate(screed.open(read_file)): + if n % 50000 == 0: + end_t = time.clock() + batch_t = end_t - start_t + total_t += batch_t + print >>sys.stderr, '\tswept {n} reads [{nc} colored, {no} orphaned] ** {sec}s ({sect}s total)' \ + .format(n=n, nc=n_colored, no=n_orphaned, sec=batch_t, sect=total_t) + start_t = time.clock() + seq = record.sequence + name = record.name + + colors = ht.sweep_color_neighborhood(seq, traversal_range) + color_number_dist.append(len(colors)) + if colors: + n_colored += 1 + if len(colors) > 1: + n_mcolored += 1 + for color in colors: + write_read(outfp, seq, name, color) + else: + n_orphaned += 1 + + if n_colored % READS_PER_FILE == 0 and n_colored != 0: + n_files += 1 + outfp = open('colored_reads_{}.fa'.format(n_files), 'wb') + + except IOError as e: + print >>sys.stderr, 'ERROR:', e + print >>sys.stderr, '** exiting...' + + print >>sys.stderr, 'swept {n_reads} for colors...'.format(n_reads=n) + print >>sys.stderr, '...with {nc} colored and {no} orphaned'.format( + nc=n_colored, no=n_orphaned) + print >>sys.stderr, '...and {nmc} multicolored'.format(nmc=n_mcolored) + print >>sys.stderr, '...to {nf} files'.format(nf=n_files) + + print >>sys.stderr, '** outputting color number distribution...' 
+ with open('color_dist.txt', 'wb') as outfp: + for nc in color_number_dist: + outfp.write('{nc}\n'.format(nc=nc)) + +if __name__ == '__main__': + main() diff --git a/tests/test_hashbits.py b/tests/test_hashbits.py index 4d91ca9bb8..71cd8f8394 100644 --- a/tests/test_hashbits.py +++ b/tests/test_hashbits.py @@ -599,9 +599,10 @@ def test_color_tag_correctness(): hb.consume_fasta_and_tag_with_colors(filename) # read A - colors = hb.sweep_color_neighborhood('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG') - + colors = hb.sweep_color_neighborhood('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAGCTAGGCTAGGTGTGCTCTGCCTAGAGCTAGGCTAGGTGT') + print hb.sweep_tag_neighborhood('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAGCTAGGCTAGGTGTGCTCTGCCTAGAGCTAGGCTAGGTGT') print colors + print len('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG')-19 assert len(colors) == 2 assert 0L in colors assert 1L in colors @@ -625,4 +626,4 @@ def test_color_tag_correctness(): colors = hb.sweep_color_neighborhood('TATATATATAGCTAGCTAGCTAACTAGCTAGCATCGATCGATCGATC') print colors assert len(colors) == 1 - assert 3L in colors + assert 3L in colors From 23c197ef33837b3fdb517d883e389d371bdbcdfe Mon Sep 17 00:00:00 2001 From: CS Welcher Date: Thu, 3 Oct 2013 11:02:03 -0400 Subject: [PATCH 052/140] addded fix for tag color correctness test not working on HPCC, still no idea why it failed --- tests/test-data/test-colors.fa | 6 +++--- tests/test_hashbits.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test-data/test-colors.fa b/tests/test-data/test-colors.fa index 80741ffcbf..b93d7c3c64 100644 --- a/tests/test-data/test-colors.fa +++ b/tests/test-data/test-colors.fa @@ -1,8 +1,8 @@ >read_A -ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG +ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG >read_B_overlap_A -GCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA 
+GCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA >read_C_overlap_B -TGTTGGGGATAGATAGATAGATGAGTGTAGATCCAACAACACATACA +TGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGAGTTGGGGATAGATAGATAGATGAGTGTAGATCCAACAACACATACA >read_D TATATATATAGCTAGCTAGCTAACTAGCTAGCATCGATCGATCGATC diff --git a/tests/test_hashbits.py b/tests/test_hashbits.py index 71cd8f8394..c72245a63f 100644 --- a/tests/test_hashbits.py +++ b/tests/test_hashbits.py @@ -600,7 +600,7 @@ def test_color_tag_correctness(): # read A colors = hb.sweep_color_neighborhood('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAGCTAGGCTAGGTGTGCTCTGCCTAGAGCTAGGCTAGGTGT') - print hb.sweep_tag_neighborhood('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAGCTAGGCTAGGTGTGCTCTGCCTAGAGCTAGGCTAGGTGT') + print hb.sweep_tag_neighborhood('TTCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAGCTAGGCTAGGTGTGCTCTGCCTAGAGCTAGGCTAGGTGT') print colors print len('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG')-19 assert len(colors) == 2 @@ -608,7 +608,7 @@ def test_color_tag_correctness(): assert 1L in colors # read B - colors = hb.sweep_color_neighborhood('GCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA') + colors = hb.sweep_color_neighborhood('GCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA') print colors assert len(colors) == 3 assert 0L in colors @@ -616,7 +616,7 @@ def test_color_tag_correctness(): assert 2L in colors # read C - colors = hb.sweep_color_neighborhood('TGTTGGGGATAGATAGATAGATGAGTGTAGATCCAACAACACATACA') + colors = hb.sweep_color_neighborhood('TGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGAGTTGGGGATAGATAGATAGATGAGTGTAGATCCAACAACACATACA') print colors assert len(colors) == 2 assert 1L in colors From b2f6164e75f43e0804d448ff5879f471efde4da8 Mon Sep 
17 00:00:00 2001 From: CS Welcher Date: Thu, 3 Oct 2013 16:13:47 -0400 Subject: [PATCH 053/140] c++ land color test --- lib/Makefile | 15 ++++++-- lib/test-Colors.cc | 88 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+), 3 deletions(-) create mode 100644 lib/test-Colors.cc diff --git a/lib/Makefile b/lib/Makefile index ebd2e25869..8681274695 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -14,8 +14,9 @@ BZIP2_OBJS_BASE= \ decompress.o bzlib.o BZIP2_OBJS=$(addprefix $(BZIP2_DIR)/, $(BZIP2_OBJS_BASE)) -DRV_PROGS=bittest ktable_test # test-StreamReader test-CacheManager test-Parser test-HashTables +#DRV_PROGS=bittest ktable_test test-Colors # test-StreamReader test-CacheManager test-Parser test-HashTables DRV_PROGS+=#graphtest #consume_prof +DRV_PROGS=test-Colors AUX_PROGS=#ht-diff CORE_OBJS= error.o khmer_config.o thread_id_map.o trace_logger.o perf_metrics.o ktable.o @@ -37,7 +38,9 @@ DRV_TEST_HASHTABLES_OBJS= \ test-HashTables.o counting.o hashbits.o hashtable.o subset.o \ $(PARSERS_OBJS) $(CORE_OBJS) $(ZLIB_OBJS) $(BZIP2_OBJS) HT_DIFF_OBJS=ht-diff.o counting.o hashtable.o $(PARSERS_OBJS) $(CORE_OBJS) $(ZLIB_OBJS) $(BZIP2_OBJS) - +DRV_TEST_COLORS_OBJS= \ + test-Colors.o counting.o hashbits.o hashtable.o subset.o \ + $(PARSERS_OBJS) $(CORE_OBJS) $(ZLIB_OBJS) $(BZIP2_OBJS) test-StreamReader: $(DRV_TEST_STREAM_READER_OBJS) $(CXX) -o $@ $(DRV_TEST_STREAM_READER_OBJS) $(LIBS) @@ -49,6 +52,9 @@ test-Parser: $(DRV_TEST_PARSER_OBJS) test-HashTables: $(DRV_TEST_HASHTABLES_OBJS) $(CXX) -o $@ $(DRV_TEST_HASHTABLES_OBJS) $(LIBS) -fopenmp +test-Colors: $(DRV_TEST_COLORS_OBJS) + $(CXX) -o $@ $(DRV_TEST_COLORS_OBJS) $(LIBS) -fopenmp + ht-diff: $(HT_DIFF_OBJS) $(CXX) -o $@ $(HT_DIFF_OBJS) $(LIBS) @@ -56,9 +62,12 @@ ht-diff: $(HT_DIFF_OBJS) bittest: bittest.o ktable.o $(CXX) -o $@ bittest.o ktable.o -ktable_test: ktable_test.o hashtable.o subset.o $(PARSERS_OBJS) $(CORE_OBJS) $(ZLIB_OBJS) $(BZIP2_OBJS) +#ktable_test: ktable_test.o hashtable.o 
subset.o $(PARSERS_OBJS) $(CORE_OBJS) $(ZLIB_OBJS) $(BZIP2_OBJS) +# $(CXX) -o $@ ktable_test.o hashtable.o subset.o $(PARSERS_OBJS) $(CORE_OBJS) $(ZLIB_OBJS) $(BZIP2_OBJS) $(LIBS) +color_test: test-Colors.o hashtable.o subset.o $(PARSERS_OBJS) $(CORE_OBJS) $(ZLIB_OBJS) $(BZIP2_OBJS) $(CXX) -o $@ ktable_test.o hashtable.o subset.o $(PARSERS_OBJS) $(CORE_OBJS) $(ZLIB_OBJS) $(BZIP2_OBJS) $(LIBS) + # NOTE: Disabled due to broken constructor call. #graphtest: graphtest.o ktable.o hashtable.o # $(CXX) -o $@ graphtest.o ktable.o hashtable.o diff --git a/lib/test-Colors.cc b/lib/test-Colors.cc new file mode 100644 index 0000000000..aaa1ebb103 --- /dev/null +++ b/lib/test-Colors.cc @@ -0,0 +1,88 @@ +// +// This file is part of khmer, http://github.com/ged-lab/khmer/, and is +// Copyright (C) Michigan State University, 2009-2013. It is licensed under +// the three-clause BSD license; see doc/LICENSE.txt. Contact: ctb@msu.edu +// + +// Simple C++ implementation of the 'load-graph' Python script. + + +#include +#include +#include +#include +#include +#include +#include +#include + +//#define HASH_TYPE_TO_TEST 1 // Counting Hash +#define HASH_TYPE_TO_TEST 2 // Bit Hash + +// #define OUTPUT_HASHTABLE + + +#include "error.hh" +#include "read_parsers.hh" +#if HASH_TYPE_TO_TEST == 1 +# include "counting.hh" +#elif HASH_TYPE_TO_TEST == 2 +# include "hashbits.hh" +#else +# error "No HASH_TYPE_TO_TEST macro defined." 
+#endif +#include "primes.hh" + +using namespace std; +using namespace khmer; +using namespace khmer:: read_parsers; + + + + +int main( int argc, char * argv[ ] ) +{ + unsigned long kmer_length = 20; + float ht_size_FP = 1.0E8; + unsigned long ht_count = 4; + uint64_t cache_size = 4L * 1024 * 1024 * 1024; + unsigned int range = 40; + int rc = 0; + int opt = -1; + char * conv_residue = NULL; + string rfile_name = "/mnt/scratch/tg/w/khmer/tests/test-data/test-reads.fa"; + string ifile_name = "/mnt/scratch/tg/w/khmer/tests/test-data/test-reads.fa"; + // FILE * ofile = NULL; + HashIntoType ht_size = (HashIntoType)ht_size_FP; + Primes primetab( ht_size ); + vector ht_sizes; + for ( unsigned int i = 0; i < ht_count; ++i ) + ht_sizes.push_back( primetab.get_next_prime( ) ); + + unsigned int reads_total = 0; + unsigned long long int n_consumed = 0; + + Hashbits ht( kmer_length, ht_sizes ); + ht.consume_fasta_and_tag_with_colors( ifile_name, reads_total, n_consumed ); + IParser * parser = IParser:: get_parser(rfile_name.c_str()); + Read read; + unsigned int num_traversed; + string seq = ""; + clock_t st; + while(!parser->is_complete()) { + read = parser->get_next_read(); + seq = read.sequence; + st = clock(); + ColorPtrSet found_colors; + num_traversed = ht.sweep_color_neighborhood(seq, found_colors, range, false, false); + st = clock() - st; + printf("traversed %u reads in %d ticks (%f seconds)\n", num_traversed, + st, + ((float)st/CLOCKS_PER_SEC)); + + } + return rc; +} + + +// vim: set sts=4 sw=4 tw=80: From fd9728a4951290bfe5cc43c27ddfe484d8073c99 Mon Sep 17 00:00:00 2001 From: CS Welcher Date: Thu, 3 Oct 2013 17:58:12 -0400 Subject: [PATCH 054/140] changes to test-Colors --- lib/hashtable.hh | 4 ++-- lib/test-Colors.cc | 25 +++++++++++++++---------- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/lib/hashtable.hh b/lib/hashtable.hh index 23c8c7b0ec..653ad7ba95 100644 --- a/lib/hashtable.hh +++ b/lib/hashtable.hh @@ -525,8 +525,8 @@ namespace khmer { 
void consume_partitioned_fasta_and_tag_with_colors(const std::string &filename, unsigned int &total_reads, unsigned long long &n_consumed, - CallbackFn callback, - void * callback_data); + CallbackFn callback = NULL, + void * callback_datac = NULL); void consume_sequence_and_tag_with_colors(const std::string& seq, unsigned long long& n_consumed, diff --git a/lib/test-Colors.cc b/lib/test-Colors.cc index aaa1ebb103..2dafc79960 100644 --- a/lib/test-Colors.cc +++ b/lib/test-Colors.cc @@ -46,12 +46,12 @@ int main( int argc, char * argv[ ] ) float ht_size_FP = 1.0E8; unsigned long ht_count = 4; uint64_t cache_size = 4L * 1024 * 1024 * 1024; - unsigned int range = 40; + unsigned int range = 1; int rc = 0; int opt = -1; char * conv_residue = NULL; - string rfile_name = "/mnt/scratch/tg/w/khmer/tests/test-data/test-reads.fa"; - string ifile_name = "/mnt/scratch/tg/w/khmer/tests/test-data/test-reads.fa"; + string rfile_name = "/mnt/scratch/tg/w/2013-lamprey/data/AK.fq.gz"; + string ifile_name = "/mnt/scratch/tg/w/petMar_test.fp"; // FILE * ofile = NULL; HashIntoType ht_size = (HashIntoType)ht_size_FP; Primes primetab( ht_size ); @@ -61,25 +61,30 @@ int main( int argc, char * argv[ ] ) unsigned int reads_total = 0; unsigned long long int n_consumed = 0; - + printf("consuming test fastp...\n"); Hashbits ht( kmer_length, ht_sizes ); - ht.consume_fasta_and_tag_with_colors( ifile_name, reads_total, n_consumed ); + ht.consume_partitioned_fasta_and_tag_with_colors( ifile_name, reads_total, n_consumed ); + printf("consume %u sequences, graph has %u colors\n", reads_total, ht.n_colors()); IParser * parser = IParser:: get_parser(rfile_name.c_str()); Read read; unsigned int num_traversed; + unsigned int num_reads = 0; string seq = ""; - clock_t st; + clock_t st = clock(); while(!parser->is_complete()) { read = parser->get_next_read(); seq = read.sequence; - st = clock(); ColorPtrSet found_colors; num_traversed = ht.sweep_color_neighborhood(seq, found_colors, range, false, false); - 
st = clock() - st; - printf("traversed %u reads in %d ticks (%f seconds)\n", num_traversed, + if (num_reads % 50000 == 0) { + st = clock() - st; + printf("traversed %u kmers in %d ticks (%f seconds)\n", num_traversed, st, ((float)st/CLOCKS_PER_SEC)); - + st = clock(); + } + found_colors.clear(); + num_reads++; } return rc; } From 4c419ed87b43f693ac1145897550e22cb0b0a306 Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Mon, 7 Oct 2013 11:21:23 -0400 Subject: [PATCH 055/140] resync with hpcc --- lib/hashtable.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/hashtable.cc b/lib/hashtable.cc index 94d1a1a9da..5a9eb9624d 100644 --- a/lib/hashtable.cc +++ b/lib/hashtable.cc @@ -2260,6 +2260,7 @@ unsigned int Hashtable::sweep_color_neighborhood(const std::string& seq, if (range == 0) { assert(num_traversed == seq.length()-ksize()+1); } + tagged_kmers.clear(); return num_traversed; } From e8466b15f5cccf82601dae0dac252099c3bdbf7a Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Mon, 7 Oct 2013 11:22:01 -0400 Subject: [PATCH 056/140] changes to test-colors --- tests/test-data/test-colors.fa | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test-data/test-colors.fa b/tests/test-data/test-colors.fa index 80741ffcbf..bc725498e7 100644 --- a/tests/test-data/test-colors.fa +++ b/tests/test-data/test-colors.fa @@ -1,5 +1,5 @@ >read_A -ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG +ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAGCTAGGCTAGGTGTGCTCTGCCTAGAGCTAGGCTAGGTGT >read_B_overlap_A GCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA >read_C_overlap_B From 639099f3225b3eda87123cf1dbed549c9f94cd85 Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Thu, 10 Oct 2013 19:23:13 -0400 Subject: [PATCH 057/140] testing out buffer based spitter --- scripts/split-reads-by-color.py | 131 ++++++++++++++++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 scripts/split-reads-by-color.py diff --git a/scripts/split-reads-by-color.py 
b/scripts/split-reads-by-color.py new file mode 100644 index 0000000000..6d90a2ec45 --- /dev/null +++ b/scripts/split-reads-by-color.py @@ -0,0 +1,131 @@ +# In-progress read-buffering approach to writing out colors to many files +# Basic idea is to buffer some number of reads in memory, then dump them all at once +# Hope that each file acrues, on average, BUFFER_SIZE / NUM_PARTS reads +# ie, if we buffer 1000000 reads, and we have 100000 partitios/colors, +# we should expect the mean buffer size to be 10 reads + +import screed +import sys +import argparse +import time + +def fastp_iter(filename): + for record in screed.open(filename, parse_description=False): + name = record.name + try: + name, partition_id = name.rsplit('\t', 1) + except ValueError: + print >>sys.stderr, '%%% ERROR: Derp! Is this file partitioned? %%%' + sys.exit(1) + # convert name to blast format if necessary + nname = name.split('|', 2) + if len(nname) >= 2: + name = nname[2] + name = name.split(' ')[0] + yield name, int(partition_id), record.sequence + +class Seq: + + def __init__(self, name, color, seq): + self.name = name + self.color = color + self.seq = seq + + def write(self, fp): + fp.write('>{}\t{}\n{}\n'.format(self.name, self.color, self.seq)) + +class ReadBuffer: + + def __init__(self, max_files=512, max_reads=1000000, est_files=100000, output_pref='reads_'): + self.buffers = {} + self.buffer_counts = {} + self.max_files = max_files + self.max_reads = max_reads + + self.est_files = est_files + self.output_pref = output_pref + self.buffer_flush = self.max_reads / self.est_files + + self.cur_reads = 0 + self.cur_files = 0 + + def add_seq(self, seq): + color = seq.color + if color in self.buffers: + count = self.buffer_counts[color] + self.buffers[color].append(seq) + self.buffer_counts[color] += 1 + if count > self.buffer_flush: + self.flush_buffer(color) + + else: + self.buffers[color] = [seq] + self.buffer_counts[color] = 1 + self.cur_reads += 1 + if self.cur_reads > self.max_reads: 
+ self.flush_all() + + def flush_buffer(self, color): + with open('{}{}.fa'.format(self.output_pref, color), 'a') as outfp: + for read in self.buffers[color]: + read.write(outfp) + self.cur_reads -= 1 + del self.buffer_counts[color] + del self.buffers[color] + + def flush_all(self): + print >>sys.stderr, '** reached max buffer size, flushing all to files...' + for color in self.buffers: + self.flush_buffer(color) + assert self.cur_reads == 0 + +def main(): + + parser = argparse.ArgumentParser() + parser.add_argument('-b', '--buffer_size', dest='buffer_size', type=int) + parser.add_argument('-e', '--files_estimate', dest='files_estimate', type=int) + parser.add_argument('-o', '--output_prefix', dest='output_prefix') + parser.add_argument('input_files', nargs='+') + args = parser.parse_args() + + output_pref = args.output_prefix + buf_size = args.buffer_size + est = args.files_estimate + input_files = args.input_files + + output_buffer = ReadBuffer(max_reads=buf_size, est_files=est, output_pref=output_pref) + + multi_fp = open('{}_multi.fa'.format(output_pref), 'a') + + n_reads = 0 + total_t = 0.0 + start_t = time.clock() + for input_file in args.input_files: + print >>sys.stderr, '* splitting reads in {}...'.format(input_file) + + current_read = '' + seen_twice = False + + for name, color, seq in fastp_iter(input_file): + n_reads += 1 + seq_obj = Seq(name, color, seq) + + if n_reads % 100000 == 0: + end_t = time.clock() + batch_t = end_t - start_t + total_t += batch_t + print >>sys.stderr, '** processed {} reads from {} [{}s, {}s total]'.format(n_reads, input_file, batch_t, total_t) + start_t = time.clock() + + if name == current_read: + if not seen_twice: + seq_obj.write(multi_fp) + seen_twice = True + + else: + seen_twice = False + output_buffer.add_seq(Seq(name,color,seq)) + current_read = name + +if __name__ == '__main__': + main() From aca8b2c03d99c80d46bf52c57fa7f0c85510a45c Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Thu, 10 Oct 2013 19:55:49 -0400 
Subject: [PATCH 058/140] some changes to buffered splitting --- scripts/split-reads-by-color.py | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/scripts/split-reads-by-color.py b/scripts/split-reads-by-color.py index 6d90a2ec45..3f9d38b989 100644 --- a/scripts/split-reads-by-color.py +++ b/scripts/split-reads-by-color.py @@ -36,10 +36,10 @@ def write(self, fp): class ReadBuffer: - def __init__(self, max_files=512, max_reads=1000000, est_files=100000, output_pref='reads_'): + def __init__(self, max_buffers=10000, max_reads=1000000, est_files=100000, output_pref='reads_'): self.buffers = {} self.buffer_counts = {} - self.max_files = max_files + self.max_buffers = max_buffers self.max_reads = max_reads self.est_files = est_files @@ -57,6 +57,7 @@ def add_seq(self, seq): self.buffer_counts[color] += 1 if count > self.buffer_flush: self.flush_buffer(color) + self.del_buffer(color) else: self.buffers[color] = [seq] @@ -64,36 +65,58 @@ def add_seq(self, seq): self.cur_reads += 1 if self.cur_reads > self.max_reads: self.flush_all() + if len(self.buffers) > self.max_buffers: + #self.clean_buffers(2) + self.flush_all() def flush_buffer(self, color): with open('{}{}.fa'.format(self.output_pref, color), 'a') as outfp: for read in self.buffers[color]: read.write(outfp) self.cur_reads -= 1 - del self.buffer_counts[color] - del self.buffers[color] + + def del_buffer(self, color): + del self.buffer_counts[color] + del self.buffers[color] def flush_all(self): print >>sys.stderr, '** reached max buffer size, flushing all to files...' for color in self.buffers: self.flush_buffer(color) + colors = self.buffers.keys() + for color in colors: + self.del_buffer(color) + del colors assert self.cur_reads == 0 + def clean_buffers(self, cutoff): + print >>sys.stderr, '** flushing low-abundance buffers...' 
+ flushed = [] + for color in self.buffers: + if self.buffer_counts[color] < cutoff: + self.flush_buffer(color) + flushed.append(color) + for color in flushed: + self.del_buffer(color) + del flushed + def main(): parser = argparse.ArgumentParser() parser.add_argument('-b', '--buffer_size', dest='buffer_size', type=int) parser.add_argument('-e', '--files_estimate', dest='files_estimate', type=int) parser.add_argument('-o', '--output_prefix', dest='output_prefix') + parser.add_argument('-m', '--max_buffers', dest='max_buffers', type=int) parser.add_argument('input_files', nargs='+') args = parser.parse_args() + max_buffers = args.max_buffers output_pref = args.output_prefix buf_size = args.buffer_size est = args.files_estimate input_files = args.input_files - output_buffer = ReadBuffer(max_reads=buf_size, est_files=est, output_pref=output_pref) + output_buffer = ReadBuffer(max_buffers=max_buffers, max_reads=buf_size, est_files=est, output_pref=output_pref) multi_fp = open('{}_multi.fa'.format(output_pref), 'a') From fa8d5bd790a861f353891b1456c529a0c68ac94f Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Tue, 22 Oct 2013 10:32:03 -0400 Subject: [PATCH 059/140] added debugging option to make --- Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile b/Makefile index c3d95ce015..264c6a7f33 100644 --- a/Makefile +++ b/Makefile @@ -9,6 +9,10 @@ clean: cd lib && make clean cd tests && rm -rf khmertest_* +debug: + export CFLAGS="-pg -fprofile-arcs"; python setup.py build_ext --debug + python setup.py install + doc: FORCE python setup.py build_sphinx --fresh-env @echo '' From 4e30cecb29fdb208850e7bddae61e6b3bb35bfa6 Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Tue, 22 Oct 2013 10:32:19 -0400 Subject: [PATCH 060/140] fixed color test build params --- lib/Makefile | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/lib/Makefile b/lib/Makefile index 4774174742..ef3ed73e20 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -1,6 +1,6 
@@ # Profile? # Set this variable to true if you wish to profile the codes. -WANT_PROFILING=false +WANT_PROFILING=true # Which profiling tool to use? # Assuming you have TAU installed and setup properly, @@ -207,8 +207,8 @@ bittest: bittest.o ktable.o #ktable_test: ktable_test.o hashtable.o subset.o $(PARSERS_OBJS) $(CORE_OBJS) $(ZLIB_OBJS) $(BZIP2_OBJS) # $(CXX) -o $@ ktable_test.o hashtable.o subset.o $(PARSERS_OBJS) $(CORE_OBJS) $(ZLIB_OBJS) $(BZIP2_OBJS) $(LIBS) -color_test: test-Colors.o hashtable.o subset.o $(PARSERS_OBJS) $(CORE_OBJS) $(ZLIB_OBJS) $(BZIP2_OBJS) - $(CXX) -o $@ ktable_test.o hashtable.o subset.o $(PARSERS_OBJS) $(CORE_OBJS) $(ZLIB_OBJS) $(BZIP2_OBJS) $(LIBS) +#color_test: test-Colors.o hashtable.o subset.o $(PARSERS_OBJS) $(CORE_OBJS) $(ZLIB_OBJS) $(BZIP2_OBJS) +# $(CXX) -o $@ hashtable.o subset.o $(PARSERS_OBJS) $(CORE_OBJS) $(ZLIB_OBJS) $(BZIP2_OBJS) $(LIBS) # NOTE: Disabled due to broken constructor call. #graphtest: graphtest.o ktable.o hashtable.o @@ -262,6 +262,9 @@ subset.o: subset.cc subset.hh hashbits.hh ktable.hh khmer.hh counting.o: counting.cc counting.hh hashtable.hh ktable.hh khmer.hh +test-Colors.o: test-Colors.cc + $(CXX) $(CXXFLAGS) -c -o $@ test-Colors.cc -fopenmp + test-StreamReader.o: test-StreamReader.cc read_parsers.hh test-CacheManager.o: test-CacheManager.cc read_parsers.hh From cd866748d62b6761115888f9f45c028e6ac38e63 Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Tue, 22 Oct 2013 10:32:44 -0400 Subject: [PATCH 061/140] couple changes to color-Test --- lib/test-Colors.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/test-Colors.cc b/lib/test-Colors.cc index 2dafc79960..6da9e7e500 100644 --- a/lib/test-Colors.cc +++ b/lib/test-Colors.cc @@ -46,12 +46,12 @@ int main( int argc, char * argv[ ] ) float ht_size_FP = 1.0E8; unsigned long ht_count = 4; uint64_t cache_size = 4L * 1024 * 1024 * 1024; - unsigned int range = 1; + unsigned int range = 82; int rc = 0; int opt = -1; char * 
conv_residue = NULL; - string rfile_name = "/mnt/scratch/tg/w/2013-lamprey/data/AK.fq.gz"; - string ifile_name = "/mnt/scratch/tg/w/petMar_test.fp"; + string rfile_name = "/w/tag_coloring/test_reads.fq"; + string ifile_name = "/w/tag_coloring/petMar_test.fp"; // FILE * ofile = NULL; HashIntoType ht_size = (HashIntoType)ht_size_FP; Primes primetab( ht_size ); From e54b216f13a14a4e88fae3107a8a569eb0ea65c2 Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Tue, 22 Oct 2013 10:33:21 -0400 Subject: [PATCH 062/140] changes to original sweep reads, now deprecated... --- scripts/sweep-reads-by-partition.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/sweep-reads-by-partition.py b/scripts/sweep-reads-by-partition.py index ed6d31dcd8..b0cd79961b 100755 --- a/scripts/sweep-reads-by-partition.py +++ b/scripts/sweep-reads-by-partition.py @@ -19,6 +19,7 @@ from khmer.counting_args import build_construct_args, DEFAULT_MIN_HASHSIZE DEFAULT_PPF = 1 +MAX_FILES=512 def write_read(fp, seq, name, color): fp.write('>{name}\t{color}\n{seq}\n'.format(seq=seq, name=name, color=color)) @@ -28,6 +29,7 @@ def main(): parser.add_argument('-p', '--partitions_per_file', dest='partitions_per_file', default=DEFAULT_PPF) parser.add_argument('-i', '--input_fastp', dest='input_fastp') + parser.add_argument('-r', '--traversal_range', dest='traversal_range') parser.add_argument('input_reads', nargs='+') args = parser.parse_args() @@ -55,6 +57,7 @@ def main(): HT_SIZE = args.min_hashsize N_HT = args.n_hashes + traversal_range = args.traversal_range input_reads = args.input_reads input_fastp = args.input_fastp ppf = args.partitions_per_file From d13cb52425d928ac165d6cfc5df1402143e2f8a9 Mon Sep 17 00:00:00 2001 From: CS Date: Fri, 1 Nov 2013 04:35:03 -0400 Subject: [PATCH 063/140] added combined sweep and file output script --- scripts/sweep-reads-by-partition-buffered.py | 199 +++++++++++++++++++ 1 file changed, 199 insertions(+) create mode 100755 
scripts/sweep-reads-by-partition-buffered.py diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py new file mode 100755 index 0000000000..9af1c4bba9 --- /dev/null +++ b/scripts/sweep-reads-by-partition-buffered.py @@ -0,0 +1,199 @@ +#! /w/khmer_dev/bin/python + +import screed +import sys +import argparse +import time +import khmer +from khmer.counting_args import build_construct_args, DEFAULT_MIN_HASHSIZE + +# little class to store sequence information for the buffering class +class Seq: + def __init__(self, name, color, seq): + self.name = name + self.color = color + self.seq = seq + def write(self, fp): + fp.write('>{}\t{}\n{}\n'.format(self.name, self.color, self.seq)) + +# stores reads in memory and flushes them to their approriate files +# when certain criteria are met +# Basic idea is to buffer some number of reads in memory, then dump them all at once +# Hope that each file acrues, on average, BUFFER_SIZE / NUM_PARTS reads +# ie, if we buffer 1000000 reads, and we have 100000 partitions or colors, +# we should expect the mean buffer size to be 10 reads +class ReadBuffer: + + def __init__(self, max_buffers=10000, max_reads=1000000, est_files=100000, output_pref='reads_'): + self.buffers = {} + self.buffer_counts = {} + self.max_buffers = max_buffers + self.max_reads = max_reads + + self.est_files = est_files + self.output_pref = output_pref + self.buffer_flush = self.max_reads / self.est_files + + self.cur_reads = 0 + self.cur_files = 0 + + def add_seq(self, seq): + color = seq.color + if color in self.buffers: + count = self.buffer_counts[color] + self.buffers[color].append(seq) + self.buffer_counts[color] += 1 + if count > self.buffer_flush: + self.flush_buffer(color) + self.del_buffer(color) + + else: + self.buffers[color] = [seq] + self.buffer_counts[color] = 1 + self.cur_reads += 1 + if self.cur_reads > self.max_reads: + self.flush_all() + if len(self.buffers) > self.max_buffers: + #self.clean_buffers(2) 
+ self.flush_all() + + def flush_buffer(self, color): + with open('{}_{}.fa'.format(self.output_pref, color), 'a') as outfp: + for read in self.buffers[color]: + read.write(outfp) + self.cur_reads -= 1 + + def del_buffer(self, color): + del self.buffer_counts[color] + del self.buffers[color] + + def flush_all(self): + print >>sys.stderr, '** reached max buffer size, flushing all to files...' + for color in self.buffers: + self.flush_buffer(color) + colors = self.buffers.keys() + for color in colors: + self.del_buffer(color) + del colors + assert self.cur_reads == 0 + + def clean_buffers(self, cutoff): + print >>sys.stderr, '** flushing low-abundance buffers...' + flushed = [] + for color in self.buffers: + if self.buffer_counts[color] < cutoff: + self.flush_buffer(color) + flushed.append(color) + for color in flushed: + self.del_buffer(color) + del flushed + +def main(): + + parser = build_construct_args() + parser.add_argument('-i', '--input_fastp',dest='input_fastp') + parser.add_argument('-r', '--traversal_range', type=int, dest='traversal_range') + parser.add_argument('-b', '--buffer_size', dest='buffer_size', type=int) + parser.add_argument('-e', '--files_estimate', dest='files_estimate', type=int) + parser.add_argument('-o', '--output_prefix', dest='output_prefix') + parser.add_argument('-m', '--max_buffers', dest='max_buffers', type=int) + parser.add_argument('input_files', nargs='+') + args = parser.parse_args() + + if not args.quiet: + if args.min_hashsize == DEFAULT_MIN_HASHSIZE: + print >>sys.stderr, \ + "** WARNING: hashsize is default! " \ + "You absodefly want to increase this!\n** " \ + "Please read the docs!" 
+ + print >>sys.stderr, '\nPARAMETERS:' + print >>sys.stderr, \ + ' - kmer size = {ksize:d} \t\t(-k)'.format(ksize=args.ksize) + print >>sys.stderr, \ + ' - n hashes = {nhash:d} \t\t(-N)'.format(nhash=args.n_hashes) + print >>sys.stderr, \ + ' - min hashsize = {mh:-5.2g} \t(-x)'.format(mh=args.min_hashsize) + print >>sys.stderr, '' + print >>sys.stderr, \ + 'Estimated memory usage is {prod:.2g} bytes \ + (n_hashes x min_hashsize / 8)'.format(prod=args.n_hashes*args.min_hashsize/8) + print >>sys.stderr, '-' * 8 + + K = args.ksize + HT_SIZE = args.min_hashsize + N_HT = args.n_hashes + + traversal_range = args.traversal_range + input_fastp = args.input_fastp + + max_buffers = args.max_buffers + output_pref = args.output_prefix + buf_size = args.buffer_size + est = args.files_estimate + input_files = args.input_files + + output_buffer = ReadBuffer(max_buffers=max_buffers, max_reads=buf_size, est_files=est, output_pref=output_pref) + + # file for multicolored reads, just keep this one around the whole time + multi_fp = open('{}_multi.fp'.format(output_pref), 'a') + orphaned_fp = open('{}_orphaned.fa'.format(output_pref), 'a') + + # consume the partitioned fasta with which to color the graph + ht = khmer.new_hashbits(K, HT_SIZE, N_HT) + print >>sys.stderr, 'consuming fastp...' 
+ ht.consume_partitioned_fasta_and_tag_with_colors(input_fastp) + + color_number_dist = [] + + n_orphaned = 0 + n_colored = 0 + n_mcolored = 0 + try: + total_t = time.clock() + start_t = time.clock() + for read_file in input_files: + print >>sys.stderr,'** sweeping {read_file} for colors...'.format(read_file=read_file) + file_t = 0.0 + for n, record in enumerate(screed.open(read_file)): + + if n % 50000 == 0: + end_t = time.clock() + batch_t = end_t - start_t + file_t += batch_t + print >>sys.stderr, '\tswept {n} reads [{nc} colored, {no} orphaned] ** {sec}s ({sect}s total)' \ + .format(n=n, nc=n_colored, no=n_orphaned, sec=batch_t, sect=file_t) + start_t = time.clock() + seq = record.sequence + name = record.name + + colors = ht.sweep_color_neighborhood(seq, traversal_range) + color_number_dist.append(len(colors)) + if colors: + n_colored += 1 + if len(colors) > 1: + multi_fp.write('>{}\t{}\n{}\n'.format(name, '\t'.join([str(c) for c in colors]), seq)) + else: + output_buffer.add_seq(Seq(name, colors[0], seq)) + else: + n_orphaned += 1 + orphaned_fp.write('>{}\n{}\n'.format(name, seq)) + + except IOError as e: + print >>sys.stderr, 'ERROR:', e + print >>sys.stderr, '** exiting...' + + total_t = time.clock() - total_t + + print >>sys.stderr, 'swept {n_reads} for colors...'.format(n_reads=n) + print >>sys.stderr, '...with {nc} colored and {no} orphaned'.format( + nc=n_colored, no=n_orphaned) + print >>sys.stderr, '...and {nmc} multicolored'.format(nmc=n_mcolored) + + print >>sys.stderr, '** outputting color number distribution...' 
+ with open('color_dist.txt', 'wb') as outfp: + for nc in color_number_dist: + outfp.write('{nc}\n'.format(nc=nc)) + +if __name__ == '__main__': + main() From 812036290e36ac5cdd9d90030ead05c6342d62d1 Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Tue, 5 Nov 2013 14:43:29 -0500 Subject: [PATCH 064/140] changed bad env line --- scripts/sweep-reads-by-partition-buffered.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py index 9af1c4bba9..ee0fbb1f8a 100755 --- a/scripts/sweep-reads-by-partition-buffered.py +++ b/scripts/sweep-reads-by-partition-buffered.py @@ -1,4 +1,4 @@ -#! /w/khmer_dev/bin/python +#! /usr/bin/python import screed import sys From 0d899219553548f48d9fdbd464807a93cae8c9a4 Mon Sep 17 00:00:00 2001 From: CS Welcher Date: Tue, 12 Nov 2013 14:25:07 -0500 Subject: [PATCH 065/140] important change in traversal code: removed optimization which truncated search at that breadth when a tag is found --- lib/subset.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/subset.cc b/lib/subset.cc index 09ff44a6c9..8f6ae2bf16 100644 --- a/lib/subset.cc +++ b/lib/subset.cc @@ -615,7 +615,7 @@ unsigned int SubsetPartition::sweep_for_tags(const std::string& seq, tagged_kmers.insert(kmer); // if we find a tag, finish the remaining queued nodes, // but don't queue up any more - max_breadth = breadth; + // max_breadth = breadth; continue; } From bd2fcdb48a209196be4f0d1d0ebebf1712e68a0d Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Wed, 13 Nov 2013 13:32:45 -0500 Subject: [PATCH 066/140] added default parameters to buffered sweep --- scripts/sweep-reads-by-partition-buffered.py | 31 ++++++++++++++++---- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py index ee0fbb1f8a..2dfae2981d 100755 --- a/scripts/sweep-reads-by-partition-buffered.py 
+++ b/scripts/sweep-reads-by-partition-buffered.py @@ -1,4 +1,16 @@ #! /usr/bin/python +# +# This file is part of khmer, http://github.com/ged-lab/khmer/, and is +# Copyright (C) Michigan State University, 2009-2013. It is licensed under +# the three-clause BSD license; see doc/LICENSE.txt. Contact: ctb@msu.edu +# + +""" +Find all reads connected to the given contigs on a per-partition basis. + +% python scripts/normalize-by-median.py -r -i \ + ... +""" import screed import sys @@ -7,6 +19,11 @@ import khmer from khmer.counting_args import build_construct_args, DEFAULT_MIN_HASHSIZE +DEFAULT_NUM_BUFFERS=50000 +DEFAULT_BUFFER_SIZE=1000000 +DEFAULT_NUM_PARTITIONS=100000 +DEFAULT_OUT_PREF='reads_' + # little class to store sequence information for the buffering class class Seq: def __init__(self, name, color, seq): @@ -24,7 +41,7 @@ def write(self, fp): # we should expect the mean buffer size to be 10 reads class ReadBuffer: - def __init__(self, max_buffers=10000, max_reads=1000000, est_files=100000, output_pref='reads_'): + def __init__(self, max_buffers, max_reads, est_files, output_pref): self.buffers = {} self.buffer_counts = {} self.max_buffers = max_buffers @@ -93,10 +110,14 @@ def main(): parser = build_construct_args() parser.add_argument('-i', '--input_fastp',dest='input_fastp') parser.add_argument('-r', '--traversal_range', type=int, dest='traversal_range') - parser.add_argument('-b', '--buffer_size', dest='buffer_size', type=int) - parser.add_argument('-e', '--files_estimate', dest='files_estimate', type=int) - parser.add_argument('-o', '--output_prefix', dest='output_prefix') - parser.add_argument('-m', '--max_buffers', dest='max_buffers', type=int) + parser.add_argument('-b', '--buffer_size', dest='buffer_size', type=int, \ + default=DEFAULT_BUFFER_SIZE) + parser.add_argument('-e', '--files_estimate', dest='files_estimate', type=int, \ + default=DEFAULT_NUM_PARTITIONS) + parser.add_argument('-o', '--output_prefix', dest='output_prefix', + 
default=DEFAULT_OUT_PREF) + parser.add_argument('-m', '--max_buffers', dest='max_buffers', type=int, \ + default=DEFAULT_NUM_BUFFERS) parser.add_argument('input_files', nargs='+') args = parser.parse_args() From 9278ecd955cc1d1b99ac74d720102c63883fa3a8 Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Wed, 13 Nov 2013 14:30:26 -0500 Subject: [PATCH 067/140] added error handling to file opening and buffer flushing --- scripts/sweep-reads-by-partition-buffered.py | 77 +++++++++++++++----- 1 file changed, 58 insertions(+), 19 deletions(-) diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py index 2dfae2981d..934706e09d 100755 --- a/scripts/sweep-reads-by-partition-buffered.py +++ b/scripts/sweep-reads-by-partition-buffered.py @@ -30,8 +30,19 @@ def __init__(self, name, color, seq): self.name = name self.color = color self.seq = seq + + def __repr__(self): + return '''>{name}\t{color}\n +{seq}\n'''.format(name=self.name, color=self.color, seq=self.seq) + def write(self, fp): - fp.write('>{}\t{}\n{}\n'.format(self.name, self.color, self.seq)) + try: + fp.write('\n>{}\t{}\n{}\n'.format(self.name, self.color, self.seq)) + except IOError: + print >>sys.stderr, 'Error writing {seq} to {fn}'.format(seq=self, fn=fp) + return 1 + else: + return 0 # stores reads in memory and flushes them to their approriate files # when certain criteria are met @@ -54,6 +65,9 @@ def __init__(self, max_buffers, max_reads, est_files, output_pref): self.cur_reads = 0 self.cur_files = 0 + self.num_write_errors = 0 + self.num_file_errors = 0 + def add_seq(self, seq): color = seq.color if color in self.buffers: @@ -75,11 +89,19 @@ def add_seq(self, seq): self.flush_all() def flush_buffer(self, color): - with open('{}_{}.fa'.format(self.output_pref, color), 'a') as outfp: + fn = '{}_{}.fa'.format(self.output_pref, color) + try: + outfp = open(fn, 'a') + except IOError as e: + print >>sys.stderr, 'ERROR: {e}'.format(e=e) + print >>sys.stderr, '*** 
Failed to open {fn} for buffer flush'.format(fn) + self.num_file_errors += 1 + else: for read in self.buffers[color]: - read.write(outfp) + self.num_write_errors += read.write(outfp) self.cur_reads -= 1 - + outfp.close() + def del_buffer(self, color): del self.buffer_counts[color] del self.buffers[color] @@ -154,11 +176,21 @@ def main(): est = args.files_estimate input_files = args.input_files - output_buffer = ReadBuffer(max_buffers=max_buffers, max_reads=buf_size, est_files=est, output_pref=output_pref) + output_buffer = ReadBuffer(max_buffers, buf_size, est, output_pref) # file for multicolored reads, just keep this one around the whole time - multi_fp = open('{}_multi.fp'.format(output_pref), 'a') - orphaned_fp = open('{}_orphaned.fa'.format(output_pref), 'a') + multi_fn = '{}_multi.fp'.format(output_pref) + try: + multi_fp = open(multi_fn, 'a') + except IOError as e: + print >>sys.stderr, 'ERROR: {e}'.format(e=e) + print >>sys.stderr, '*** Failed to open {fn}'.format(multi_fn) + orphaned_fn = '{}_orphaned.fa'.format(output_pref) + try: + orphaned_fp = open(orphaned_fn, 'a') + except IOError as e: + print >>sys.stderr, 'ERROR: {e}'.format(e=e) + print >>sys.stderr, '*** Failed to open {fn}'.format(orphaned_fn) # consume the partitioned fasta with which to color the graph ht = khmer.new_hashbits(K, HT_SIZE, N_HT) @@ -170,19 +202,25 @@ def main(): n_orphaned = 0 n_colored = 0 n_mcolored = 0 - try: - total_t = time.clock() - start_t = time.clock() - for read_file in input_files: - print >>sys.stderr,'** sweeping {read_file} for colors...'.format(read_file=read_file) - file_t = 0.0 - for n, record in enumerate(screed.open(read_file)): + total_t = time.clock() + start_t = time.clock() + for read_file in input_files: + print >>sys.stderr,'** sweeping {read_file} for colors...'.format(read_file=read_file) + file_t = 0.0 + try: + read_fp = screed.open(read_file) + except IOError as e: + print >>sys.stderr, 'ERROR:', e + print >>sys.stderr, '*** Could not open {fn}, 
skipping...'.format(read_file) + else: + for n, record in enumerate(read_fp): if n % 50000 == 0: end_t = time.clock() batch_t = end_t - start_t file_t += batch_t - print >>sys.stderr, '\tswept {n} reads [{nc} colored, {no} orphaned] ** {sec}s ({sect}s total)' \ + print >>sys.stderr, '\tswept {n} reads [{nc} colored, {no} orphaned] \ + ** {sec}s ({sect}s total)' \ .format(n=n, nc=n_colored, no=n_orphaned, sec=batch_t, sect=file_t) start_t = time.clock() seq = record.sequence @@ -190,6 +228,7 @@ def main(): colors = ht.sweep_color_neighborhood(seq, traversal_range) color_number_dist.append(len(colors)) + SeqOb = Seq if colors: n_colored += 1 if len(colors) > 1: @@ -199,13 +238,13 @@ def main(): else: n_orphaned += 1 orphaned_fp.write('>{}\n{}\n'.format(name, seq)) - - except IOError as e: - print >>sys.stderr, 'ERROR:', e - print >>sys.stderr, '** exiting...' + read_fp.close() total_t = time.clock() - total_t + multi_fp.close() + orphaned_fp.close() + print >>sys.stderr, 'swept {n_reads} for colors...'.format(n_reads=n) print >>sys.stderr, '...with {nc} colored and {no} orphaned'.format( nc=n_colored, no=n_orphaned) From ac63a8b9815c0fb32770dd968bd2240021c247cf Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Wed, 13 Nov 2013 14:35:17 -0500 Subject: [PATCH 068/140] added warning output for errors, updated description --- scripts/sweep-reads-by-partition-buffered.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py index 934706e09d..31a26a3939 100755 --- a/scripts/sweep-reads-by-partition-buffered.py +++ b/scripts/sweep-reads-by-partition-buffered.py @@ -10,6 +10,13 @@ % python scripts/normalize-by-median.py -r -i \ ... + +This script is very lenient on IO errors, due to the large number of file +operations needed. 
Thus, errors opening a file for buffer flush or writeing +a read to a file will not crash the program; instead, if there were errors, +the user will be warned at the end of execution. Errors with opening read files +are also handled -- we move on to the next read file if there is an error opening. + """ import screed @@ -244,6 +251,11 @@ def main(): multi_fp.close() orphaned_fp.close() + + if output_buffer.num_write_errors > 0 or output_buffer.num_file_errors > 0: + print >>sys.stderr, 'WARNING: Sweep finished with errors!' + print >>sys.stderr, '** {writee} reads not written'.format(writee=output_buffer.num_write_errors) + print >>sys.stderr, '** {filee} errors opening files'.format(filee=output_buffer.num_file_errors) print >>sys.stderr, 'swept {n_reads} for colors...'.format(n_reads=n) print >>sys.stderr, '...with {nc} colored and {no} orphaned'.format( From d5242574966a24e63aa7905c188915426d511cfb Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Wed, 13 Nov 2013 17:09:07 -0500 Subject: [PATCH 069/140] added minimum k and hashsizes to prevent inanely complex useless searches --- scripts/sweep-reads-by-partition-buffered.py | 33 +++++++++++++------- 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py index 31a26a3939..02f6d7d075 100755 --- a/scripts/sweep-reads-by-partition-buffered.py +++ b/scripts/sweep-reads-by-partition-buffered.py @@ -30,6 +30,10 @@ DEFAULT_BUFFER_SIZE=1000000 DEFAULT_NUM_PARTITIONS=100000 DEFAULT_OUT_PREF='reads_' +DEFAULT_RANGE=-1 + +MIN_HSIZE=4e7 +MIN_KSIZE=21 # little class to store sequence information for the buffering class class Seq: @@ -59,15 +63,15 @@ def write(self, fp): # we should expect the mean buffer size to be 10 reads class ReadBuffer: - def __init__(self, max_buffers, max_reads, est_files, output_pref): + def __init__(self, max_buffers, max_size, est_files, output_pref): self.buffers = {} self.buffer_counts = {} 
self.max_buffers = max_buffers - self.max_reads = max_reads + self.max_size = max_size self.est_files = est_files self.output_pref = output_pref - self.buffer_flush = self.max_reads / self.est_files + self.buffer_flush = self.max_size / self.est_files self.cur_reads = 0 self.cur_files = 0 @@ -89,7 +93,7 @@ def add_seq(self, seq): self.buffers[color] = [seq] self.buffer_counts[color] = 1 self.cur_reads += 1 - if self.cur_reads > self.max_reads: + if self.cur_reads > self.max_size: self.flush_all() if len(self.buffers) > self.max_buffers: #self.clean_buffers(2) @@ -138,7 +142,8 @@ def main(): parser = build_construct_args() parser.add_argument('-i', '--input_fastp',dest='input_fastp') - parser.add_argument('-r', '--traversal_range', type=int, dest='traversal_range') + parser.add_argument('-r', '--traversal_range', type=int, dest='traversal_range', \ + default=DEFAULT_RANGE) parser.add_argument('-b', '--buffer_size', dest='buffer_size', type=int, \ default=DEFAULT_BUFFER_SIZE) parser.add_argument('-e', '--files_estimate', dest='files_estimate', type=int, \ @@ -150,6 +155,15 @@ def main(): parser.add_argument('input_files', nargs='+') args = parser.parse_args() + + K = args.ksize + HT_SIZE = args.min_hashsize + if HT_SIZE < MIN_HSIZE: + HT_SIZE = MIN_HSIZE + if K < MIN_KSIZE: + K = MIN_KSIZE + N_HT = args.n_hashes + if not args.quiet: if args.min_hashsize == DEFAULT_MIN_HASHSIZE: print >>sys.stderr, \ @@ -159,21 +173,17 @@ def main(): print >>sys.stderr, '\nPARAMETERS:' print >>sys.stderr, \ - ' - kmer size = {ksize:d} \t\t(-k)'.format(ksize=args.ksize) + ' - kmer size = {ksize:d} \t\t(-k)'.format(ksize=K) print >>sys.stderr, \ ' - n hashes = {nhash:d} \t\t(-N)'.format(nhash=args.n_hashes) print >>sys.stderr, \ - ' - min hashsize = {mh:-5.2g} \t(-x)'.format(mh=args.min_hashsize) + ' - min hashsize = {mh:-5.2g} \t(-x)'.format(mh=HT_SIZE) print >>sys.stderr, '' print >>sys.stderr, \ 'Estimated memory usage is {prod:.2g} bytes \ (n_hashes x min_hashsize / 
8)'.format(prod=args.n_hashes*args.min_hashsize/8) print >>sys.stderr, '-' * 8 - K = args.ksize - HT_SIZE = args.min_hashsize - N_HT = args.n_hashes - traversal_range = args.traversal_range input_fastp = args.input_fastp @@ -235,7 +245,6 @@ def main(): colors = ht.sweep_color_neighborhood(seq, traversal_range) color_number_dist.append(len(colors)) - SeqOb = Seq if colors: n_colored += 1 if len(colors) > 1: From 3a7ff00d1509a77031a4c44f5c656036c5dd75b9 Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Wed, 13 Nov 2013 18:14:36 -0500 Subject: [PATCH 070/140] started tests --- tests/test_scripts.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/test_scripts.py b/tests/test_scripts.py index 697a195bb3..89c72c81fe 100644 --- a/tests/test_scripts.py +++ b/tests/test_scripts.py @@ -1121,3 +1121,15 @@ def test_sample_reads_randomly(): '895:1:1:1327:15301', '895:1:1:1265:2265', '895:1:1:1327:13028', '895:1:1:1368:4434', '895:1:1:1335:19932', '895:1:1:1340:19387']) + +def test_sweep_reads_by_partition_buffered(): + readfile = utils.get_temp_filename('reads.fa') + contigfile = utils.get_temp_filename('contigs.fp') + in_dir = os.path.dirname(infile) + + shutil.copyfile(utils.get_test_data('test-sweep-reads.fa'), infile) + shutil.copyfile(utils.get_test_data('test-sweep-contigs.fp'), contigfile) + + script = scriptpath('sweep-reads-by-partition-buffered.py') + args = ['-o', 'test', '-i', contigfile, readfile] + From 37ac9d89090cbaf8c7593de41cf8bf7a14a2154d Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Thu, 14 Nov 2013 11:01:19 -0500 Subject: [PATCH 071/140] working on tests --- tests/test_scripts.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/test_scripts.py b/tests/test_scripts.py index 89c72c81fe..9a50e56721 100644 --- a/tests/test_scripts.py +++ b/tests/test_scripts.py @@ -1132,4 +1132,9 @@ def test_sweep_reads_by_partition_buffered(): script = scriptpath('sweep-reads-by-partition-buffered.py') args = ['-o', 'test', '-i', 
contigfile, readfile] + status, out, err = runscript(script, args, in_dir) + + outfiles = ['test_0.fa', 'test_1.fa'] + seqs1 = set([r.name for r in screed.open(outfiles[0])]) + seqs2 = set([r.name for r in screed.open(outfiles[1])]) From 0ccb6231727038946d857ebd83895d3ac31f4b0d Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Thu, 14 Nov 2013 11:01:44 -0500 Subject: [PATCH 072/140] changed something... --- scripts/sweep-reads-by-partition-buffered.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py index 02f6d7d075..3a82471053 100755 --- a/scripts/sweep-reads-by-partition-buffered.py +++ b/scripts/sweep-reads-by-partition-buffered.py @@ -152,6 +152,7 @@ def main(): default=DEFAULT_OUT_PREF) parser.add_argument('-m', '--max_buffers', dest='max_buffers', type=int, \ default=DEFAULT_NUM_BUFFERS) + parser.add_argument('-d', '--debug', dest='debug', default=None) parser.add_argument('input_files', nargs='+') args = parser.parse_args() @@ -193,6 +194,10 @@ def main(): est = args.files_estimate input_files = args.input_files + debug = args.debug + if debug: + import yep + output_buffer = ReadBuffer(max_buffers, buf_size, est, output_pref) # file for multicolored reads, just keep this one around the whole time @@ -212,6 +217,8 @@ def main(): # consume the partitioned fasta with which to color the graph ht = khmer.new_hashbits(K, HT_SIZE, N_HT) print >>sys.stderr, 'consuming fastp...' + if debug: + yep.start(debug) ht.consume_partitioned_fasta_and_tag_with_colors(input_fastp) color_number_dist = [] @@ -260,7 +267,8 @@ def main(): multi_fp.close() orphaned_fp.close() - + if debug: + yep.stop() if output_buffer.num_write_errors > 0 or output_buffer.num_file_errors > 0: print >>sys.stderr, 'WARNING: Sweep finished with errors!' 
print >>sys.stderr, '** {writee} reads not written'.format(writee=output_buffer.num_write_errors) From eb7b34601593a1f47957ad48e22a23c0c5cdc92c Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Thu, 14 Nov 2013 13:56:46 -0500 Subject: [PATCH 073/140] removed old sweep scripts --- scripts/split-reads-by-color.py | 154 -------------------- scripts/sweep-reads-by-partition-to-file.py | 123 ---------------- scripts/sweep-reads-by-partition.py | 137 ----------------- 3 files changed, 414 deletions(-) delete mode 100644 scripts/split-reads-by-color.py delete mode 100755 scripts/sweep-reads-by-partition-to-file.py delete mode 100755 scripts/sweep-reads-by-partition.py diff --git a/scripts/split-reads-by-color.py b/scripts/split-reads-by-color.py deleted file mode 100644 index 3f9d38b989..0000000000 --- a/scripts/split-reads-by-color.py +++ /dev/null @@ -1,154 +0,0 @@ -# In-progress read-buffering approach to writing out colors to many files -# Basic idea is to buffer some number of reads in memory, then dump them all at once -# Hope that each file acrues, on average, BUFFER_SIZE / NUM_PARTS reads -# ie, if we buffer 1000000 reads, and we have 100000 partitios/colors, -# we should expect the mean buffer size to be 10 reads - -import screed -import sys -import argparse -import time - -def fastp_iter(filename): - for record in screed.open(filename, parse_description=False): - name = record.name - try: - name, partition_id = name.rsplit('\t', 1) - except ValueError: - print >>sys.stderr, '%%% ERROR: Derp! Is this file partitioned? 
%%%' - sys.exit(1) - # convert name to blast format if necessary - nname = name.split('|', 2) - if len(nname) >= 2: - name = nname[2] - name = name.split(' ')[0] - yield name, int(partition_id), record.sequence - -class Seq: - - def __init__(self, name, color, seq): - self.name = name - self.color = color - self.seq = seq - - def write(self, fp): - fp.write('>{}\t{}\n{}\n'.format(self.name, self.color, self.seq)) - -class ReadBuffer: - - def __init__(self, max_buffers=10000, max_reads=1000000, est_files=100000, output_pref='reads_'): - self.buffers = {} - self.buffer_counts = {} - self.max_buffers = max_buffers - self.max_reads = max_reads - - self.est_files = est_files - self.output_pref = output_pref - self.buffer_flush = self.max_reads / self.est_files - - self.cur_reads = 0 - self.cur_files = 0 - - def add_seq(self, seq): - color = seq.color - if color in self.buffers: - count = self.buffer_counts[color] - self.buffers[color].append(seq) - self.buffer_counts[color] += 1 - if count > self.buffer_flush: - self.flush_buffer(color) - self.del_buffer(color) - - else: - self.buffers[color] = [seq] - self.buffer_counts[color] = 1 - self.cur_reads += 1 - if self.cur_reads > self.max_reads: - self.flush_all() - if len(self.buffers) > self.max_buffers: - #self.clean_buffers(2) - self.flush_all() - - def flush_buffer(self, color): - with open('{}{}.fa'.format(self.output_pref, color), 'a') as outfp: - for read in self.buffers[color]: - read.write(outfp) - self.cur_reads -= 1 - - def del_buffer(self, color): - del self.buffer_counts[color] - del self.buffers[color] - - def flush_all(self): - print >>sys.stderr, '** reached max buffer size, flushing all to files...' - for color in self.buffers: - self.flush_buffer(color) - colors = self.buffers.keys() - for color in colors: - self.del_buffer(color) - del colors - assert self.cur_reads == 0 - - def clean_buffers(self, cutoff): - print >>sys.stderr, '** flushing low-abundance buffers...' 
- flushed = [] - for color in self.buffers: - if self.buffer_counts[color] < cutoff: - self.flush_buffer(color) - flushed.append(color) - for color in flushed: - self.del_buffer(color) - del flushed - -def main(): - - parser = argparse.ArgumentParser() - parser.add_argument('-b', '--buffer_size', dest='buffer_size', type=int) - parser.add_argument('-e', '--files_estimate', dest='files_estimate', type=int) - parser.add_argument('-o', '--output_prefix', dest='output_prefix') - parser.add_argument('-m', '--max_buffers', dest='max_buffers', type=int) - parser.add_argument('input_files', nargs='+') - args = parser.parse_args() - - max_buffers = args.max_buffers - output_pref = args.output_prefix - buf_size = args.buffer_size - est = args.files_estimate - input_files = args.input_files - - output_buffer = ReadBuffer(max_buffers=max_buffers, max_reads=buf_size, est_files=est, output_pref=output_pref) - - multi_fp = open('{}_multi.fa'.format(output_pref), 'a') - - n_reads = 0 - total_t = 0.0 - start_t = time.clock() - for input_file in args.input_files: - print >>sys.stderr, '* splitting reads in {}...'.format(input_file) - - current_read = '' - seen_twice = False - - for name, color, seq in fastp_iter(input_file): - n_reads += 1 - seq_obj = Seq(name, color, seq) - - if n_reads % 100000 == 0: - end_t = time.clock() - batch_t = end_t - start_t - total_t += batch_t - print >>sys.stderr, '** processed {} reads from {} [{}s, {}s total]'.format(n_reads, input_file, batch_t, total_t) - start_t = time.clock() - - if name == current_read: - if not seen_twice: - seq_obj.write(multi_fp) - seen_twice = True - - else: - seen_twice = False - output_buffer.add_seq(Seq(name,color,seq)) - current_read = name - -if __name__ == '__main__': - main() diff --git a/scripts/sweep-reads-by-partition-to-file.py b/scripts/sweep-reads-by-partition-to-file.py deleted file mode 100755 index 6cb0ed687a..0000000000 --- a/scripts/sweep-reads-by-partition-to-file.py +++ /dev/null @@ -1,123 +0,0 @@ -#! 
/w/khmer_dev/bin/python -# -# This file is part of khmer, http://github.com/ged-lab/khmer/, and is -# Copyright (C) Michigan State University, 2009-2013. It is licensed under -# the three-clause BSD license; see doc/LICENSE.txt. Contact: ctb@msu.edu -# -""" -Tag and color the given partitioned fasta, then find all reads in the neighborhood -of each partition and output to a file - -% python scripts/normalize-by-median.py [ -p ] -i ... - -Use '-h' for parameter help. -""" - -import khmer -import screed -import sys -import time -from khmer.counting_args import build_construct_args, DEFAULT_MIN_HASHSIZE - -MAX_FILES=512 -READS_PER_FILE = 100000000 - -def write_read(fp, seq, name, color): - fp.write('>{name}\t{color}\n{seq}\n'.format(seq=seq, name=name, color=color)) - -def main(): - parser = build_construct_args() - #parser.add_argument('-p', '--partitions_per_file', - # dest='partitions_per_file', default=DEFAULT_PPF) - parser.add_argument('-i', '--input_fastp',dest='input_fastp') - parser.add_argument('-r', '--traversal_range', type=int, dest='traversal_range') - parser.add_argument('input_reads', nargs='+') - args = parser.parse_args() - - if not args.quiet: - if args.min_hashsize == DEFAULT_MIN_HASHSIZE: - print >>sys.stderr, \ - "** WARNING: hashsize is default! " \ - "You absodefly want to increase this!\n** " \ - "Please read the docs!" 
- - print >>sys.stderr, '\nPARAMETERS:' - print >>sys.stderr, \ - ' - kmer size = {ksize:d} \t\t(-k)'.format(ksize=args.ksize) - print >>sys.stderr, \ - ' - n hashes = {nhash:d} \t\t(-N)'.format(nhash=args.n_hashes) - print >>sys.stderr, \ - ' - min hashsize = {mh:-5.2g} \t(-x)'.format(mh=args.min_hashsize) - print >>sys.stderr, '' - print >>sys.stderr, \ - 'Estimated memory usage is {prod:.2g} bytes \ - (n_hashes x min_hashsize / 8)'.format(prod=args.n_hashes*args.min_hashsize/8) - print >>sys.stderr, '-' * 8 - - K = args.ksize - HT_SIZE = args.min_hashsize - N_HT = args.n_hashes - - traversal_range = args.traversal_range - input_reads = args.input_reads - input_fastp = args.input_fastp - - ht = khmer.new_hashbits(K, HT_SIZE, N_HT) - print >>sys.stderr, 'consuming fastp...' - ht.consume_partitioned_fasta_and_tag_with_colors(input_fastp) - - color_number_dist = [] - - n_orphaned = 0 - n_colored = 0 - n_mcolored = 0 - n_files = 0 - try: - outfp = open('colored_reads_0.fa', 'wb') - start_t = time.clock() - for read_file in input_reads: - print >>sys.stderr,'** sweeping {read_file} for colors...'.format(read_file=read_file) - total_t = 0.0 - for n, record in enumerate(screed.open(read_file)): - if n % 50000 == 0: - end_t = time.clock() - batch_t = end_t - start_t - total_t += batch_t - print >>sys.stderr, '\tswept {n} reads [{nc} colored, {no} orphaned] ** {sec}s ({sect}s total)' \ - .format(n=n, nc=n_colored, no=n_orphaned, sec=batch_t, sect=total_t) - start_t = time.clock() - seq = record.sequence - name = record.name - - colors = ht.sweep_color_neighborhood(seq, traversal_range) - color_number_dist.append(len(colors)) - if colors: - n_colored += 1 - if len(colors) > 1: - n_mcolored += 1 - for color in colors: - write_read(outfp, seq, name, color) - else: - n_orphaned += 1 - - if n_colored % READS_PER_FILE == 0 and n_colored != 0: - n_files += 1 - outfp = open('colored_reads_{}.fa'.format(n_files), 'wb') - - except IOError as e: - print >>sys.stderr, 'ERROR:', e - 
print >>sys.stderr, '** exiting...' - - print >>sys.stderr, 'swept {n_reads} for colors...'.format(n_reads=n) - print >>sys.stderr, '...with {nc} colored and {no} orphaned'.format( - nc=n_colored, no=n_orphaned) - print >>sys.stderr, '...and {nmc} multicolored'.format(nmc=n_mcolored) - print >>sys.stderr, '...to {nf} files'.format(nf=n_files) - - print >>sys.stderr, '** outputting color number distribution...' - with open('color_dist.txt', 'wb') as outfp: - for nc in color_number_dist: - outfp.write('{nc}\n'.format(nc=nc)) - -if __name__ == '__main__': - main() diff --git a/scripts/sweep-reads-by-partition.py b/scripts/sweep-reads-by-partition.py deleted file mode 100755 index b0cd79961b..0000000000 --- a/scripts/sweep-reads-by-partition.py +++ /dev/null @@ -1,137 +0,0 @@ -#! /w/khmer_dev/bin/python -# -# This file is part of khmer, http://github.com/ged-lab/khmer/, and is -# Copyright (C) Michigan State University, 2009-2013. It is licensed under -# the three-clause BSD license; see doc/LICENSE.txt. Contact: ctb@msu.edu -# -""" -Tag and color the given partitioned fasta, then find all reads in the neighborhood -of each partition and output to a file - -% python scripts/normalize-by-median.py [ -p ] -i ... - -Use '-h' for parameter help. 
-""" - -import khmer -import screed -import sys -from khmer.counting_args import build_construct_args, DEFAULT_MIN_HASHSIZE - -DEFAULT_PPF = 1 -MAX_FILES=512 - -def write_read(fp, seq, name, color): - fp.write('>{name}\t{color}\n{seq}\n'.format(seq=seq, name=name, color=color)) - -def main(): - parser = build_construct_args() - parser.add_argument('-p', '--partitions_per_file', - dest='partitions_per_file', default=DEFAULT_PPF) - parser.add_argument('-i', '--input_fastp', dest='input_fastp') - parser.add_argument('-r', '--traversal_range', dest='traversal_range') - parser.add_argument('input_reads', nargs='+') - args = parser.parse_args() - - if not args.quiet: - if args.min_hashsize == DEFAULT_MIN_HASHSIZE: - print >>sys.stderr, \ - "** WARNING: hashsize is default! " \ - "You absodefly want to increase this!\n** " \ - "Please read the docs!" - - print >>sys.stderr, '\nPARAMETERS:' - print >>sys.stderr, \ - ' - kmer size = {ksize:d} \t\t(-k)'.format(ksize=args.ksize) - print >>sys.stderr, \ - ' - n hashes = {nhash:d} \t\t(-N)'.format(nhash=args.n_hashes) - print >>sys.stderr, \ - ' - min hashsize = {mh:-5.2g} \t(-x)'.format(mh=args.min_hashsize) - print >>sys.stderr, '' - print >>sys.stderr, \ - 'Estimated memory usage is {prod:.2g} bytes \ - (n_hashes x min_hashsize / 8)'.format(prod=args.n_hashes*args.min_hashsize/8) - print >>sys.stderr, '-' * 8 - - K = args.ksize - HT_SIZE = args.min_hashsize - N_HT = args.n_hashes - - traversal_range = args.traversal_range - input_reads = args.input_reads - input_fastp = args.input_fastp - ppf = args.partitions_per_file - - ht = khmer.new_hashbits(K, HT_SIZE, N_HT) - ht.consume_partitioned_fasta_and_tag_with_colors(input_fastp) - - cur_colors = [] - color_to_fp_dict = {} - cur_fp = file - - color_number_dist = [] - - n_orphaned = 0 - n_colored = 0 - n_mcolored = 0 - n_files = 0 - try: - for read_file in input_reads: - print >>sys.stderr,'** sweeping {read_file} for colors...'.format(read_file=read_file) - - for n, record in 
enumerate(screed.open(read_file)): - if n % 50000 == 0: - print >>sys.stderr, '\tswept {n} reads [{nc} colored, {no} orphaned]' \ - .format(n=n, nc=n_colored, no=n_orphaned) - seq = record.sequence - name = record.name - - colors = ht.sweep_color_neighborhood(seq) - color_number_dist.append(len(colors)) - if colors: - n_colored += 1 - if len(colors) > 1: - n_mcolored += 1 - for color in colors: - # do we have a file for this color already? use it! - if color in color_to_fp_dict: - fp = color_to_fp_dict[color] - write_read(fp, seq, name, color) - # no file yet? make a new one - else: - if len(cur_colors) == 0: - #print '** opening new file...' - cur_fp = open('colored_reads_{fn}.fa'.format(fn=n_files), - 'wb') - - color_to_fp_dict[color] = cur_fp - cur_colors.append(color) - write_read(cur_fp, seq, name, color) - n_files += 1 - - if len(cur_colors) == ppf: - cur_colors = [] - else: - n_orphaned += 1 - - for key in color_to_fp_dict: - if color_to_fp_dict[key]: - color_to_fp_dict[key].close() - - except IOError as e: - print >>sys.stderr, 'ERROR:', e - print >>sys.stderr, '** exiting...' - - print >>sys.stderr, 'swept {n_reads} for colors...'.format(n_reads=n) - print >>sys.stderr, '...with {nc} colored and {no} orphaned'.format( - nc=n_colored, no=n_orphaned) - print >>sys.stderr, '...and {nmc} multicolored'.format(nmc=n_mcolored) - print >>sys.stderr, '...to {nf} files'.format(nf=n_files) - - print >>sys.stderr, '** outputting color number distribution...' 
- with open('color_dist.txt', 'wb') as outfp: - for nc in color_number_dist: - outfp.write('{nc}\n'.format(nc=nc)) - -if __name__ == '__main__': - main() From c253a91b3e5f90890cf18a4b8bf2180dbf338b9f Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Thu, 14 Nov 2013 14:23:56 -0500 Subject: [PATCH 074/140] sweeped reads properly puts output files in source dir --- scripts/sweep-reads-by-partition-buffered.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py index 3a82471053..b909c04f3a 100755 --- a/scripts/sweep-reads-by-partition-buffered.py +++ b/scripts/sweep-reads-by-partition-buffered.py @@ -63,7 +63,7 @@ def write(self, fp): # we should expect the mean buffer size to be 10 reads class ReadBuffer: - def __init__(self, max_buffers, max_size, est_files, output_pref): + def __init__(self, max_buffers, max_size, est_files, output_pref, outdir): self.buffers = {} self.buffer_counts = {} self.max_buffers = max_buffers @@ -71,6 +71,7 @@ def __init__(self, max_buffers, max_size, est_files, output_pref): self.est_files = est_files self.output_pref = output_pref + self.outdir = outdir self.buffer_flush = self.max_size / self.est_files self.cur_reads = 0 @@ -101,11 +102,12 @@ def add_seq(self, seq): def flush_buffer(self, color): fn = '{}_{}.fa'.format(self.output_pref, color) + fpath = os.path.join(self.outdir, fn) try: - outfp = open(fn, 'a') + outfp = open(fpath, 'a') except IOError as e: print >>sys.stderr, 'ERROR: {e}'.format(e=e) - print >>sys.stderr, '*** Failed to open {fn} for buffer flush'.format(fn) + print >>sys.stderr, '*** Failed to open {fn} for buffer flush'.format(fpath) self.num_file_errors += 1 else: for read in self.buffers[color]: @@ -187,6 +189,7 @@ def main(): traversal_range = args.traversal_range input_fastp = args.input_fastp + outdir = os.path.dirname(input_fastp) max_buffers = args.max_buffers output_pref = 
args.output_prefix @@ -198,16 +201,16 @@ def main(): if debug: import yep - output_buffer = ReadBuffer(max_buffers, buf_size, est, output_pref) + output_buffer = ReadBuffer(max_buffers, buf_size, est, output_pref, outdir) # file for multicolored reads, just keep this one around the whole time - multi_fn = '{}_multi.fp'.format(output_pref) + multi_fn = os.path.join(outdir, '{}_multi.fp'.format(output_pref)) try: multi_fp = open(multi_fn, 'a') except IOError as e: print >>sys.stderr, 'ERROR: {e}'.format(e=e) print >>sys.stderr, '*** Failed to open {fn}'.format(multi_fn) - orphaned_fn = '{}_orphaned.fa'.format(output_pref) + orphaned_fn = os.path.join(outdir, '{}_orphaned.fa'.format(output_pref)) try: orphaned_fp = open(orphaned_fn, 'a') except IOError as e: From 3b6f9afcaebf0a80f132ecf2efe553daf54fed90 Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Thu, 14 Nov 2013 14:29:42 -0500 Subject: [PATCH 075/140] fixed key error in error checking code --- scripts/sweep-reads-by-partition-buffered.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py index b909c04f3a..b46a34eecd 100755 --- a/scripts/sweep-reads-by-partition-buffered.py +++ b/scripts/sweep-reads-by-partition-buffered.py @@ -21,6 +21,7 @@ import screed import sys +import os import argparse import time import khmer @@ -239,7 +240,7 @@ def main(): read_fp = screed.open(read_file) except IOError as e: print >>sys.stderr, 'ERROR:', e - print >>sys.stderr, '*** Could not open {fn}, skipping...'.format(read_file) + print >>sys.stderr, '*** Could not open {fn}, skipping...'.format(fn=read_file) else: for n, record in enumerate(read_fp): if n % 50000 == 0: From 3278230ecf0ee30966fc863379f6d926e11b4724 Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Thu, 14 Nov 2013 14:38:58 -0500 Subject: [PATCH 076/140] was not flushing buffer at end of run! oops... 
--- scripts/sweep-reads-by-partition-buffered.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py index b46a34eecd..e0546de0f2 100755 --- a/scripts/sweep-reads-by-partition-buffered.py +++ b/scripts/sweep-reads-by-partition-buffered.py @@ -265,8 +265,11 @@ def main(): else: n_orphaned += 1 orphaned_fp.write('>{}\n{}\n'.format(name, seq)) + output_buffer.flush_all() read_fp.close() - + + # gotta output anything left in the buffers at the end! + output_buffer.flush_all() total_t = time.clock() - total_t multi_fp.close() @@ -278,7 +281,7 @@ def main(): print >>sys.stderr, '** {writee} reads not written'.format(writee=output_buffer.num_write_errors) print >>sys.stderr, '** {filee} errors opening files'.format(filee=output_buffer.num_file_errors) - print >>sys.stderr, 'swept {n_reads} for colors...'.format(n_reads=n) + print >>sys.stderr, 'swept {n_reads} for colors...'.format(n_reads=n_colored+n_mcolored+n_orphaned) print >>sys.stderr, '...with {nc} colored and {no} orphaned'.format( nc=n_colored, no=n_orphaned) print >>sys.stderr, '...and {nmc} multicolored'.format(nmc=n_mcolored) From 0f8fb291319eec63daf11a6d9b7797c8a5d5319a Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Thu, 14 Nov 2013 14:48:22 -0500 Subject: [PATCH 077/140] fixed screwy extra spaces in output --- scripts/sweep-reads-by-partition-buffered.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py index e0546de0f2..82fbd4e33d 100755 --- a/scripts/sweep-reads-by-partition-buffered.py +++ b/scripts/sweep-reads-by-partition-buffered.py @@ -49,7 +49,7 @@ def __repr__(self): def write(self, fp): try: - fp.write('\n>{}\t{}\n{}\n'.format(self.name, self.color, self.seq)) + fp.write('>{}\t{}\n{}\n'.format(self.name, self.color, self.seq)) except IOError: print >>sys.stderr, 'Error 
writing {seq} to {fn}'.format(seq=self, fn=fp) return 1 @@ -270,7 +270,7 @@ def main(): # gotta output anything left in the buffers at the end! output_buffer.flush_all() - total_t = time.clock() - total_t + total_t = time.clock() - total_t multi_fp.close() orphaned_fp.close() From e7ece3329b333e4e71803eb36bb34b4cd926e9a6 Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Thu, 14 Nov 2013 14:51:50 -0500 Subject: [PATCH 078/140] added initial test for sweep --- tests/test_scripts.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/tests/test_scripts.py b/tests/test_scripts.py index 9a50e56721..2a7e8dd4d7 100644 --- a/tests/test_scripts.py +++ b/tests/test_scripts.py @@ -1125,16 +1125,24 @@ def test_sample_reads_randomly(): def test_sweep_reads_by_partition_buffered(): readfile = utils.get_temp_filename('reads.fa') contigfile = utils.get_temp_filename('contigs.fp') - in_dir = os.path.dirname(infile) + in_dir = os.path.dirname(contigfile) - shutil.copyfile(utils.get_test_data('test-sweep-reads.fa'), infile) + shutil.copyfile(utils.get_test_data('test-sweep-reads.fa'), readfile) shutil.copyfile(utils.get_test_data('test-sweep-contigs.fp'), contigfile) script = scriptpath('sweep-reads-by-partition-buffered.py') - args = ['-o', 'test', '-i', contigfile, readfile] + args = ['-k', '25', '-o', 'test', '-i', contigfile, readfile] status, out, err = runscript(script, args, in_dir) - outfiles = ['test_0.fa', 'test_1.fa'] + out1 = os.path.join(in_dir, 'test_0.fa') + out2 = os.path.join(in_dir, 'test_1.fa') + + print os.listdir(in_dir) - seqs1 = set([r.name for r in screed.open(outfiles[0])]) - seqs2 = set([r.name for r in screed.open(outfiles[1])]) + seqs1 = set([r.name for r in screed.open(out1)]) + seqs2 = set([r.name for r in screed.open(out2)]) + + print seqs1 + print seqs2 + assert seqs1 == set(['read1_p0\t0', 'read2_p0\t0']) + assert seqs2 == set(['read3_p1\t1']) From 409fa78a7ae74238e605cbd4edff18673e2f3a4e Mon Sep 17 00:00:00 2001 
From: Chris Welcher Date: Thu, 14 Nov 2013 14:52:09 -0500 Subject: [PATCH 079/140] test data for sweep tests --- tests/test-data/test-sweep-contigs.fp | 8 ++++++++ tests/test-data/test-sweep-reads.fa | 6 ++++++ 2 files changed, 14 insertions(+) create mode 100644 tests/test-data/test-sweep-contigs.fp create mode 100644 tests/test-data/test-sweep-reads.fa diff --git a/tests/test-data/test-sweep-contigs.fp b/tests/test-data/test-sweep-contigs.fp new file mode 100644 index 0000000000..92be07782a --- /dev/null +++ b/tests/test-data/test-sweep-contigs.fp @@ -0,0 +1,8 @@ +>read_A 0 +ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG +>read_B_overlap_A 0 +GCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA +>read_C_overlap_B 0 +TGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGAGTTGGGGATAGATAGATAGATGAGTGTAGATCCAACAACACATACA +>read_D 1 +TATATATATAGCTAGCTAGCTAACTAGCTAGCATCGATCGATCGATC diff --git a/tests/test-data/test-sweep-reads.fa b/tests/test-data/test-sweep-reads.fa new file mode 100644 index 0000000000..15696127ab --- /dev/null +++ b/tests/test-data/test-sweep-reads.fa @@ -0,0 +1,6 @@ +>read1_p0 +ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCGTAATCGTAAGCTCTGCCTA +>read2_p0 +CTAGAGCTAGGCTAGCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTTTGCTCTGCTCGCGCTCGCTCG +>read3_p1 +AACTAGCTAGCATCGATCGATCGATCTGCTGATCG From 2b312a892725a5c7e7edacef33d28b086691abb9 Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Thu, 14 Nov 2013 15:06:49 -0500 Subject: [PATCH 080/140] added checking multi and orphan reads to sweep tests --- tests/test_scripts.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/test_scripts.py b/tests/test_scripts.py index 2a7e8dd4d7..0ab0ddc323 100644 --- a/tests/test_scripts.py +++ b/tests/test_scripts.py @@ -1136,13 +1136,21 @@ def 
test_sweep_reads_by_partition_buffered(): out1 = os.path.join(in_dir, 'test_0.fa') out2 = os.path.join(in_dir, 'test_1.fa') + mout = os.path.join(in_dir, 'test_multi.fa') + oout = os.path.join(in_dir, 'test_orphaned.fa') print os.listdir(in_dir) seqs1 = set([r.name for r in screed.open(out1)]) seqs2 = set([r.name for r in screed.open(out2)]) - + seqsm = set([r.name for r in screed.open(mout)]) + seqso = set([r.name for r in screed.open(oout)]) + print seqs1 print seqs2 + print seqsm + print seqso assert seqs1 == set(['read1_p0\t0', 'read2_p0\t0']) assert seqs2 == set(['read3_p1\t1']) + assert seqsm == set(['read4_multi\t0\t1']) + assert seqso == set(['read5_orphan']) From 51255b69f8ccb3099a9ee7a7aa684adbb6ba572d Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Thu, 14 Nov 2013 15:09:21 -0500 Subject: [PATCH 081/140] added case for missing file to sweep --- tests/test_scripts.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/test_scripts.py b/tests/test_scripts.py index 0ab0ddc323..0008dccb6e 100644 --- a/tests/test_scripts.py +++ b/tests/test_scripts.py @@ -1131,9 +1131,13 @@ def test_sweep_reads_by_partition_buffered(): shutil.copyfile(utils.get_test_data('test-sweep-contigs.fp'), contigfile) script = scriptpath('sweep-reads-by-partition-buffered.py') - args = ['-k', '25', '-o', 'test', '-i', contigfile, readfile] + args = ['-k', '25', '-o', 'test', '-i', contigfile, readfile, 'junkfile.fa'] status, out, err = runscript(script, args, in_dir) + # check if the bad file was skipped without issue + assert 'ERROR' in err + assert 'skipping' in err + out1 = os.path.join(in_dir, 'test_0.fa') out2 = os.path.join(in_dir, 'test_1.fa') mout = os.path.join(in_dir, 'test_multi.fa') From 6e0621973cdb2db224f30305adfe26ba330d5b77 Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Thu, 14 Nov 2013 15:16:22 -0500 Subject: [PATCH 082/140] removed old color sweep functions --- khmer/_khmermodule.cc | 53 ------------------------------------------- 1 
file changed, 53 deletions(-) diff --git a/khmer/_khmermodule.cc b/khmer/_khmermodule.cc index 60ebc1cb12..c31df597ec 100644 --- a/khmer/_khmermodule.cc +++ b/khmer/_khmermodule.cc @@ -3978,58 +3978,6 @@ static PyObject * hashbits_sweep_color_neighborhood(PyObject * self, PyObject * } -static PyObject * hashbits_sweep_color_neighborhood_old(PyObject * self, PyObject * args) { - khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; - khmer::Hashbits * hb = me->hashbits; - - char * seq = NULL; - PyObject * break_on_stop_tags_o = NULL; - PyObject * stop_big_traversals_o = NULL; - - if (!PyArg_ParseTuple(args, "s|OO", &seq, - &break_on_stop_tags_o, - &stop_big_traversals_o)) { - return NULL; - } - - bool break_on_stop_tags = false; - if (break_on_stop_tags_o && PyObject_IsTrue(break_on_stop_tags_o)) { - break_on_stop_tags = true; - } - bool stop_big_traversals = false; - if (stop_big_traversals_o && PyObject_IsTrue(stop_big_traversals_o)) { - stop_big_traversals = true; - } - - if (strlen(seq) < hb->ksize()) { - return NULL; - } - - //std::pair ret; - ColorPtrSet found_colors; - - bool exc_raised = false; - //Py_BEGIN_ALLOW_THREADS - try { - hb->sweep_sequence_for_colors(seq, found_colors, break_on_stop_tags, stop_big_traversals); - } catch (_khmer_signal &e) { - exc_raised = true; - } - //Py_END_ALLOW_THREADS - - if (exc_raised) return NULL; - - PyObject * x = PyList_New(found_colors.size()); - khmer::ColorPtrSet::const_iterator si; - unsigned long long i = 0; - for (si=found_colors.begin(); si!=found_colors.end(); ++si) { - PyList_SET_ITEM(x, i, Py_BuildValue("K", *(*si))); - i++; - } - - return x; -} - // Similar to find_all_tags, but returns tags in a way actually useable by python // need a tags_in_sequence iterator or function in c++ land for reuse in all // these functions @@ -4199,7 +4147,6 @@ static PyMethodDef khmer_hashbits_methods[] = { { "get_median_count", hashbits_get_median_count, METH_VARARGS, "Get the median, average, and stddev of the k-mer 
counts in the string" }, { "consume_fasta_and_tag_with_colors", hashbits_consume_fasta_and_tag_with_colors, METH_VARARGS, "" }, { "sweep_color_neighborhood", hashbits_sweep_color_neighborhood, METH_VARARGS, "" }, - { "sweep_color_neighborhood_old", hashbits_sweep_color_neighborhood_old, METH_VARARGS, "" }, {"consume_partitioned_fasta_and_tag_with_colors", hashbits_consume_partitioned_fasta_and_tag_with_colors, METH_VARARGS, "" }, {"sweep_tag_neighborhood", hashbits_sweep_tag_neighborhood, METH_VARARGS, "" }, {"get_tag_colors", hashbits_get_tag_colors, METH_VARARGS, ""}, From d94f7074183c57f3951654162661e7b59d580b0b Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Thu, 14 Nov 2013 15:27:34 -0500 Subject: [PATCH 083/140] fix for stochastic color order in tests --- tests/test_scripts.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_scripts.py b/tests/test_scripts.py index 0008dccb6e..8f37289e67 100644 --- a/tests/test_scripts.py +++ b/tests/test_scripts.py @@ -1156,5 +1156,6 @@ def test_sweep_reads_by_partition_buffered(): print seqso assert seqs1 == set(['read1_p0\t0', 'read2_p0\t0']) assert seqs2 == set(['read3_p1\t1']) - assert seqsm == set(['read4_multi\t0\t1']) + assert (seqsm == set(['read4_multi\t0\t1']) or \ + seqsm == set(['read4_multi\t1\t0'])) assert seqso == set(['read5_orphan']) From 33c711ce7a920fc1783f57379817dfb8ee6d9089 Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Thu, 14 Nov 2013 15:30:45 -0500 Subject: [PATCH 084/140] updated multi file for standard name --- scripts/sweep-reads-by-partition-buffered.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py index 82fbd4e33d..9fd08cb282 100755 --- a/scripts/sweep-reads-by-partition-buffered.py +++ b/scripts/sweep-reads-by-partition-buffered.py @@ -205,7 +205,7 @@ def main(): output_buffer = ReadBuffer(max_buffers, buf_size, est, output_pref, outdir) # file for 
multicolored reads, just keep this one around the whole time - multi_fn = os.path.join(outdir, '{}_multi.fp'.format(output_pref)) + multi_fn = os.path.join(outdir, '{}_multi.fa'.format(output_pref)) try: multi_fp = open(multi_fn, 'a') except IOError as e: From 297ca6859a9273d9511b2c39a26cda0b9d8fd6b6 Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Thu, 14 Nov 2013 15:38:42 -0500 Subject: [PATCH 085/140] updated test data --- tests/test-data/test-sweep-reads.fa | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test-data/test-sweep-reads.fa b/tests/test-data/test-sweep-reads.fa index 15696127ab..293f2b9e7e 100644 --- a/tests/test-data/test-sweep-reads.fa +++ b/tests/test-data/test-sweep-reads.fa @@ -4,3 +4,7 @@ ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCGTAATCGTAAGCTCTGCCTA CTAGAGCTAGGCTAGCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTTTGCTCTGCTCGCGCTCGCTCG >read3_p1 AACTAGCTAGCATCGATCGATCGATCTGCTGATCG +>read4_multi +CTAGAGCTAGGCTAGCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTTTGCTCTGCTCGCGCTCGCTCGCAACTAGCTAGCATCGATCGATCGATCTGCTGATCG +>read5_orphan +TGCTGATATATAGCTAGATATATATATAGCAGGCTGGTGTATCGCGCTAGCTAGCTAGCTTTCTCTTTTTTTTTTTTTAGGGA From b65ce5cb766b65ca2339a0e73a58418cb4a11571 Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Thu, 14 Nov 2013 17:10:44 -0500 Subject: [PATCH 086/140] fixed horked mem usage estimate --- scripts/sweep-reads-by-partition-buffered.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py index 9fd08cb282..adee5b06a9 100755 --- a/scripts/sweep-reads-by-partition-buffered.py +++ b/scripts/sweep-reads-by-partition-buffered.py @@ -185,7 +185,7 @@ def main(): print >>sys.stderr, '' print >>sys.stderr, \ 'Estimated memory usage is {prod:.2g} bytes \ - (n_hashes x min_hashsize / 8)'.format(prod=args.n_hashes*args.min_hashsize/8) + (n_hashes x min_hashsize / 8)'.format(prod=args.n_hashes*HT_SIZE/8) print >>sys.stderr, '-' * 8 traversal_range = 
args.traversal_range From 3758564073a627771131832b65fdde56ccb7e858 Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Mon, 18 Nov 2013 14:30:37 -0500 Subject: [PATCH 087/140] added error handling for too-short reads --- scripts/sweep-reads-by-partition-buffered.py | 50 +++++++++++++------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py index adee5b06a9..d7d6fe0f7f 100755 --- a/scripts/sweep-reads-by-partition-buffered.py +++ b/scripts/sweep-reads-by-partition-buffered.py @@ -81,6 +81,13 @@ def __init__(self, max_buffers, max_size, est_files, output_pref, outdir): self.num_write_errors = 0 self.num_file_errors = 0 + print >>sys.stderr, '''Init new ReadBuffer [ + Max Buffers: {num_bufs} + Max Reads: {max_reads} + Est. Files: {est_files} + ]'''.format(num_bufs=self.max_buffers, max_reads=self.max_size, + est_files=self.est_files) + def add_seq(self, seq): color = seq.color if color in self.buffers: @@ -96,9 +103,11 @@ def add_seq(self, seq): self.buffer_counts[color] = 1 self.cur_reads += 1 if self.cur_reads > self.max_size: + print >>sys.stderr, '** Reached max num reads...' self.flush_all() if len(self.buffers) > self.max_buffers: #self.clean_buffers(2) + print >>sys.stderr, '** Reached max num buffers...' self.flush_all() def flush_buffer(self, color): @@ -107,7 +116,7 @@ def flush_buffer(self, color): try: outfp = open(fpath, 'a') except IOError as e: - print >>sys.stderr, 'ERROR: {e}'.format(e=e) + print >>sys.stderr, '!! ERROR: {e} !!'.format(e=e) print >>sys.stderr, '*** Failed to open {fn} for buffer flush'.format(fpath) self.num_file_errors += 1 else: @@ -121,7 +130,7 @@ def del_buffer(self, color): del self.buffers[color] def flush_all(self): - print >>sys.stderr, '** reached max buffer size, flushing all to files...' + print >>sys.stderr, '*** Flushing all to files...' 
for color in self.buffers: self.flush_buffer(color) colors = self.buffers.keys() @@ -209,13 +218,13 @@ def main(): try: multi_fp = open(multi_fn, 'a') except IOError as e: - print >>sys.stderr, 'ERROR: {e}'.format(e=e) + print >>sys.stderr, '!! ERROR: {e} !!'.format(e=e) print >>sys.stderr, '*** Failed to open {fn}'.format(multi_fn) orphaned_fn = os.path.join(outdir, '{}_orphaned.fa'.format(output_pref)) try: orphaned_fp = open(orphaned_fn, 'a') except IOError as e: - print >>sys.stderr, 'ERROR: {e}'.format(e=e) + print >>sys.stderr, '!! ERROR: {e} !!'.format(e=e) print >>sys.stderr, '*** Failed to open {fn}'.format(orphaned_fn) # consume the partitioned fasta with which to color the graph @@ -239,7 +248,7 @@ def main(): try: read_fp = screed.open(read_file) except IOError as e: - print >>sys.stderr, 'ERROR:', e + print >>sys.stderr, '!! ERROR: !!', e print >>sys.stderr, '*** Could not open {fn}, skipping...'.format(fn=read_file) else: for n, record in enumerate(read_fp): @@ -253,22 +262,29 @@ def main(): start_t = time.clock() seq = record.sequence name = record.name - - colors = ht.sweep_color_neighborhood(seq, traversal_range) - color_number_dist.append(len(colors)) - if colors: - n_colored += 1 - if len(colors) > 1: - multi_fp.write('>{}\t{}\n{}\n'.format(name, '\t'.join([str(c) for c in colors]), seq)) - else: - output_buffer.add_seq(Seq(name, colors[0], seq)) + try: + colors = ht.sweep_color_neighborhood(seq, traversal_range) + except ValueError as e: + print >>sys.stderr, '!! 
ERROR: {e} !!'.format(e=e) + print >>sys.stderr, 'Read length less than k-mer size' else: - n_orphaned += 1 - orphaned_fp.write('>{}\n{}\n'.format(name, seq)) + color_number_dist.append(len(colors)) + if colors: + n_colored += 1 + if len(colors) > 1: + multi_fp.write('>{}\t{}\n{}\n'.format( + name, '\t'.join([str(c) for c in colors]), seq)) + else: + output_buffer.add_seq(Seq(name, colors[0], seq)) + else: + n_orphaned += 1 + orphaned_fp.write('>{}\n{}\n'.format(name, seq)) + print >>sys.stderr, '** End of file {fn}...'.format(fn=read_file) output_buffer.flush_all() read_fp.close() # gotta output anything left in the buffers at the end! + print >>sys.stderr, '** End of run...' output_buffer.flush_all() total_t = time.clock() - total_t @@ -277,7 +293,7 @@ def main(): if debug: yep.stop() if output_buffer.num_write_errors > 0 or output_buffer.num_file_errors > 0: - print >>sys.stderr, 'WARNING: Sweep finished with errors!' + print >>sys.stderr, '! WARNING: Sweep finished with errors !' print >>sys.stderr, '** {writee} reads not written'.format(writee=output_buffer.num_write_errors) print >>sys.stderr, '** {filee} errors opening files'.format(filee=output_buffer.num_file_errors) From 48a0e8cfbf83d74dd7d5ea5f2ef4669c77177093 Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Tue, 19 Nov 2013 11:34:36 -0500 Subject: [PATCH 088/140] fixed error in error handling code FACEPALM --- scripts/sweep-reads-by-partition-buffered.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py index d7d6fe0f7f..793acbe236 100755 --- a/scripts/sweep-reads-by-partition-buffered.py +++ b/scripts/sweep-reads-by-partition-buffered.py @@ -117,7 +117,7 @@ def flush_buffer(self, color): outfp = open(fpath, 'a') except IOError as e: print >>sys.stderr, '!! 
ERROR: {e} !!'.format(e=e) - print >>sys.stderr, '*** Failed to open {fn} for buffer flush'.format(fpath) + print >>sys.stderr, '*** Failed to open {fn} for buffer flush'.format(fn=fpath) self.num_file_errors += 1 else: for read in self.buffers[color]: @@ -139,6 +139,7 @@ def flush_all(self): del colors assert self.cur_reads == 0 + # experimental, doesn't work very well def clean_buffers(self, cutoff): print >>sys.stderr, '** flushing low-abundance buffers...' flushed = [] @@ -214,18 +215,18 @@ def main(): output_buffer = ReadBuffer(max_buffers, buf_size, est, output_pref, outdir) # file for multicolored reads, just keep this one around the whole time - multi_fn = os.path.join(outdir, '{}_multi.fa'.format(output_pref)) + multi_fn = os.path.join(outdir, '{pref}_multi.fa'.format(pref=output_pref)) try: multi_fp = open(multi_fn, 'a') except IOError as e: print >>sys.stderr, '!! ERROR: {e} !!'.format(e=e) - print >>sys.stderr, '*** Failed to open {fn}'.format(multi_fn) - orphaned_fn = os.path.join(outdir, '{}_orphaned.fa'.format(output_pref)) + print >>sys.stderr, '*** Failed to open {fn}'.format(fn=multi_fn) + orphaned_fn = os.path.join(outdir, '{pref}_orphaned.fa'.format(pref=output_pref)) try: orphaned_fp = open(orphaned_fn, 'a') except IOError as e: print >>sys.stderr, '!! 
ERROR: {e} !!'.format(e=e) - print >>sys.stderr, '*** Failed to open {fn}'.format(orphaned_fn) + print >>sys.stderr, '*** Failed to open {fn}'.format(fn=orphaned_fn) # consume the partitioned fasta with which to color the graph ht = khmer.new_hashbits(K, HT_SIZE, N_HT) From b43328b71359daea6a64e1f17f0619a346c6a6f6 Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Tue, 19 Nov 2013 14:40:49 -0500 Subject: [PATCH 089/140] rewrote buffering code for better performance and simplicity, changed references to coloring to use labeling --- scripts/sweep-reads-by-partition-buffered.py | 199 +++++++++---------- 1 file changed, 91 insertions(+), 108 deletions(-) diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py index 793acbe236..13de17aee4 100755 --- a/scripts/sweep-reads-by-partition-buffered.py +++ b/scripts/sweep-reads-by-partition-buffered.py @@ -26,6 +26,8 @@ import time import khmer from khmer.counting_args import build_construct_args, DEFAULT_MIN_HASHSIZE +from collections import namedtuple as nt + DEFAULT_NUM_BUFFERS=50000 DEFAULT_BUFFER_SIZE=1000000 @@ -35,34 +37,48 @@ MIN_HSIZE=4e7 MIN_KSIZE=21 - -# little class to store sequence information for the buffering class -class Seq: - def __init__(self, name, color, seq): - self.name = name - self.color = color - self.seq = seq - def __repr__(self): - return '''>{name}\t{color}\n -{seq}\n'''.format(name=self.name, color=self.color, seq=self.seq) - - def write(self, fp): - try: - fp.write('>{}\t{}\n{}\n'.format(self.name, self.color, self.seq)) - except IOError: - print >>sys.stderr, 'Error writing {seq} to {fn}'.format(seq=self, fn=fp) - return 1 - else: - return 0 +def fmt_fasta(name, seq, labels=[]): + return '>{name}\t{labels}\n{seq}'.format(name=name, + labels='\t'.join([str(l) for l in labels]), seq=seq) -# stores reads in memory and flushes them to their approriate files +def write_seq(fp, name, seq, labels=[]): + try: + fp.write(fmt_fasta(name, seq, 
labels=labels)) + except IOError: + print >>sys.stderr, 'Error writing {read}'.format( + read=fmt_fasta(name, seq, labels=labels)) + return 1 + else: + return 0 + +# stores reads in memory and flushes them to their appropriate files # when certain criteria are met # Basic idea is to buffer some number of reads in memory, then dump them all at once # Hope that each file acrues, on average, BUFFER_SIZE / NUM_PARTS reads -# ie, if we buffer 1000000 reads, and we have 100000 partitions or colors, +# ie, if we buffer 1000000 reads, and we have 100000 partitions or labels, # we should expect the mean buffer size to be 10 reads class ReadBuffer: + + def __init__(self): + self.buf = [] + + def push(self, seq_str): + self.buf.append(seq_str) + + def flush(self): + return '\n'.join(self.buf) + + def is_full(self, full): + if len(self.buf) >= full: + return True + else: + return False + + def __len__(self): + return len(self.buf) + +class ReadBufferManager: def __init__(self, max_buffers, max_size, est_files, output_pref, outdir): self.buffers = {} @@ -85,33 +101,12 @@ def __init__(self, max_buffers, max_size, est_files, output_pref, outdir): Max Buffers: {num_bufs} Max Reads: {max_reads} Est. Files: {est_files} + Buffer flush: {buf_flush} ]'''.format(num_bufs=self.max_buffers, max_reads=self.max_size, - est_files=self.est_files) - - def add_seq(self, seq): - color = seq.color - if color in self.buffers: - count = self.buffer_counts[color] - self.buffers[color].append(seq) - self.buffer_counts[color] += 1 - if count > self.buffer_flush: - self.flush_buffer(color) - self.del_buffer(color) + est_files=self.est_files, buf_flush=self.buffer_flush) - else: - self.buffers[color] = [seq] - self.buffer_counts[color] = 1 - self.cur_reads += 1 - if self.cur_reads > self.max_size: - print >>sys.stderr, '** Reached max num reads...' - self.flush_all() - if len(self.buffers) > self.max_buffers: - #self.clean_buffers(2) - print >>sys.stderr, '** Reached max num buffers...' 
- self.flush_all() - - def flush_buffer(self, color): - fn = '{}_{}.fa'.format(self.output_pref, color) + def flush_buffer(self, buf_id): + fn = '{}_{}.fa'.format(self.output_pref, buf_id) fpath = os.path.join(self.outdir, fn) try: outfp = open(fpath, 'a') @@ -120,37 +115,38 @@ def flush_buffer(self, color): print >>sys.stderr, '*** Failed to open {fn} for buffer flush'.format(fn=fpath) self.num_file_errors += 1 else: - for read in self.buffers[color]: - self.num_write_errors += read.write(outfp) - self.cur_reads -= 1 + buf = self.buffers[buf_id] + outfp.write(buf.flush()) + self.cur_reads -= len(buf) outfp.close() + del self.buffers[buf_id] - def del_buffer(self, color): - del self.buffer_counts[color] - del self.buffers[color] + def queue(self, seq_str, buf_id): + if buf_id in self.buffers: + self.buffers[buf_id].push(seq_str) + if self.buffers[buf_id].is_full(self.buffer_flush): + self.flush_buffer(buf_id) + else: + new_buf = ReadBuffer() + new_buf.push(seq_str) + self.buffers[buf_id] = new_buf + + self.cur_reads += 1 + if self.cur_reads > self.max_size: + print >>sys.stderr, '** Reached max num reads...' + self.flush_all() + if len(self.buffers) > self.max_buffers: + #self.clean_buffers(2) + print >>sys.stderr, '** Reached max num buffers...' + self.flush_all() def flush_all(self): print >>sys.stderr, '*** Flushing all to files...' - for color in self.buffers: - self.flush_buffer(color) - colors = self.buffers.keys() - for color in colors: - self.del_buffer(color) - del colors + buf_ids = self.buffers.keys() + for buf_id in buf_ids: + self.flush_buffer(buf_id) assert self.cur_reads == 0 - # experimental, doesn't work very well - def clean_buffers(self, cutoff): - print >>sys.stderr, '** flushing low-abundance buffers...' 
- flushed = [] - for color in self.buffers: - if self.buffer_counts[color] < cutoff: - self.flush_buffer(color) - flushed.append(color) - for color in flushed: - self.del_buffer(color) - del flushed - def main(): parser = build_construct_args() @@ -212,39 +208,25 @@ def main(): if debug: import yep - output_buffer = ReadBuffer(max_buffers, buf_size, est, output_pref, outdir) - - # file for multicolored reads, just keep this one around the whole time - multi_fn = os.path.join(outdir, '{pref}_multi.fa'.format(pref=output_pref)) - try: - multi_fp = open(multi_fn, 'a') - except IOError as e: - print >>sys.stderr, '!! ERROR: {e} !!'.format(e=e) - print >>sys.stderr, '*** Failed to open {fn}'.format(fn=multi_fn) - orphaned_fn = os.path.join(outdir, '{pref}_orphaned.fa'.format(pref=output_pref)) - try: - orphaned_fp = open(orphaned_fn, 'a') - except IOError as e: - print >>sys.stderr, '!! ERROR: {e} !!'.format(e=e) - print >>sys.stderr, '*** Failed to open {fn}'.format(fn=orphaned_fn) + output_buffer = ReadBufferManager(max_buffers, buf_size, est, output_pref, outdir) - # consume the partitioned fasta with which to color the graph + # consume the partitioned fasta with which to label the graph ht = khmer.new_hashbits(K, HT_SIZE, N_HT) print >>sys.stderr, 'consuming fastp...' 
if debug: yep.start(debug) ht.consume_partitioned_fasta_and_tag_with_colors(input_fastp) - color_number_dist = [] + label_number_dist = [] n_orphaned = 0 - n_colored = 0 - n_mcolored = 0 + n_labeled = 0 + n_mlabeled = 0 total_t = time.clock() start_t = time.clock() for read_file in input_files: - print >>sys.stderr,'** sweeping {read_file} for colors...'.format(read_file=read_file) + print >>sys.stderr,'** sweeping {read_file} for labels...'.format(read_file=read_file) file_t = 0.0 try: read_fp = screed.open(read_file) @@ -257,29 +239,30 @@ def main(): end_t = time.clock() batch_t = end_t - start_t file_t += batch_t - print >>sys.stderr, '\tswept {n} reads [{nc} colored, {no} orphaned] \ + print >>sys.stderr, '\tswept {n} reads [{nc} labeled, {no} orphaned] \ ** {sec}s ({sect}s total)' \ - .format(n=n, nc=n_colored, no=n_orphaned, sec=batch_t, sect=file_t) + .format(n=n, nc=n_labeled, no=n_orphaned, sec=batch_t, sect=file_t) start_t = time.clock() seq = record.sequence name = record.name try: - colors = ht.sweep_color_neighborhood(seq, traversal_range) + labels = ht.sweep_color_neighborhood(seq, traversal_range) except ValueError as e: print >>sys.stderr, '!! 
ERROR: {e} !!'.format(e=e) print >>sys.stderr, 'Read length less than k-mer size' else: - color_number_dist.append(len(colors)) - if colors: - n_colored += 1 - if len(colors) > 1: - multi_fp.write('>{}\t{}\n{}\n'.format( - name, '\t'.join([str(c) for c in colors]), seq)) + seq_str = fmt_fasta(name, seq, labels) + label_number_dist.append(len(labels)) + if labels: + n_labeled += 1 + if len(labels) > 1: + output_buffer.queue(seq_str, 'multi') + n_mlabeled += 1 else: - output_buffer.add_seq(Seq(name, colors[0], seq)) + output_buffer.queue(seq_str, labels[0]) else: n_orphaned += 1 - orphaned_fp.write('>{}\n{}\n'.format(name, seq)) + output_buffer.queue(seq_str, 'orphaned') print >>sys.stderr, '** End of file {fn}...'.format(fn=read_file) output_buffer.flush_all() read_fp.close() @@ -298,14 +281,14 @@ def main(): print >>sys.stderr, '** {writee} reads not written'.format(writee=output_buffer.num_write_errors) print >>sys.stderr, '** {filee} errors opening files'.format(filee=output_buffer.num_file_errors) - print >>sys.stderr, 'swept {n_reads} for colors...'.format(n_reads=n_colored+n_mcolored+n_orphaned) - print >>sys.stderr, '...with {nc} colored and {no} orphaned'.format( - nc=n_colored, no=n_orphaned) - print >>sys.stderr, '...and {nmc} multicolored'.format(nmc=n_mcolored) + print >>sys.stderr, 'swept {n_reads} for labels...'.format(n_reads=n_labeled+n_mlabeled+n_orphaned) + print >>sys.stderr, '...with {nc} labeled and {no} orphaned'.format( + nc=n_labeled, no=n_orphaned) + print >>sys.stderr, '...and {nmc} multilabeled'.format(nmc=n_mlabeled) - print >>sys.stderr, '** outputting color number distribution...' - with open('color_dist.txt', 'wb') as outfp: - for nc in color_number_dist: + print >>sys.stderr, '** outputting label number distribution...' 
+ with open('label_dist.txt', 'wb') as outfp: + for nc in label_number_dist: outfp.write('{nc}\n'.format(nc=nc)) if __name__ == '__main__': From a915e7b67b7a315831a23a057c75e5c727100c5d Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Tue, 19 Nov 2013 14:45:40 -0500 Subject: [PATCH 090/140] removed deprecated reference to fps --- scripts/sweep-reads-by-partition-buffered.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py index 13de17aee4..0df68b2861 100755 --- a/scripts/sweep-reads-by-partition-buffered.py +++ b/scripts/sweep-reads-by-partition-buffered.py @@ -272,8 +272,6 @@ def main(): output_buffer.flush_all() total_t = time.clock() - total_t - multi_fp.close() - orphaned_fp.close() if debug: yep.stop() if output_buffer.num_write_errors > 0 or output_buffer.num_file_errors > 0: From 750cc36511704db33dd0f456e6deff47bc0bced1 Mon Sep 17 00:00:00 2001 From: CS Welcher Date: Tue, 19 Nov 2013 15:44:54 -0500 Subject: [PATCH 091/140] changed all names using color to use label --- khmer/_khmermodule.cc | 64 +++++++++---------- lib/hashtable.cc | 104 +++++++++++++++--------------- lib/hashtable.hh | 112 ++++++++++++++++----------------- lib/khmer.hh | 14 ++--- tests/test-data/test-labels.fa | 8 +++ 5 files changed, 155 insertions(+), 147 deletions(-) create mode 100644 tests/test-data/test-labels.fa diff --git a/khmer/_khmermodule.cc b/khmer/_khmermodule.cc index c31df597ec..70f6fc7feb 100644 --- a/khmer/_khmermodule.cc +++ b/khmer/_khmermodule.cc @@ -3818,21 +3818,21 @@ static PyObject * hashbits_get_median_count(PyObject * self, PyObject * args) return Py_BuildValue("iff", med, average, stddev); } -static PyObject * hashbits_get_color_dict(PyObject * self, PyObject * args) { +static PyObject * hashbits_get_label_dict(PyObject * self, PyObject * args) { khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; khmer::Hashbits * hb = me->hashbits; PyObject * d = 
PyDict_New(); - khmer::ColorPtrMap::iterator it; + khmer::LabelPtrMap::iterator it; - for (it = hb->color_ptrs.begin(); it!=hb->color_ptrs.end(); ++it) { + for (it = hb->label_ptrs.begin(); it!=hb->label_ptrs.end(); ++it) { PyDict_SetItem(d, Py_BuildValue("K", it->first), Py_BuildValue("K", it->second)); } return d; } -static PyObject * hashbits_consume_fasta_and_tag_with_colors(PyObject * self, PyObject * args) +static PyObject * hashbits_consume_fasta_and_tag_with_labels(PyObject * self, PyObject * args) { khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; khmer::Hashbits * hb = me->hashbits; @@ -3852,7 +3852,7 @@ static PyObject * hashbits_consume_fasta_and_tag_with_colors(PyObject * self, Py //Py_BEGIN_ALLOW_THREADS try { - hb->consume_fasta_and_tag_with_colors(filename, total_reads, n_consumed, + hb->consume_fasta_and_tag_with_labels(filename, total_reads, n_consumed, _report_fn, callback_obj); } catch (_khmer_signal &e) { exc_raised = true; @@ -3864,7 +3864,7 @@ static PyObject * hashbits_consume_fasta_and_tag_with_colors(PyObject * self, Py } -static PyObject * hashbits_consume_partitioned_fasta_and_tag_with_colors( +static PyObject * hashbits_consume_partitioned_fasta_and_tag_with_labels( PyObject * self, PyObject * args) { khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; @@ -3883,7 +3883,7 @@ static PyObject * hashbits_consume_partitioned_fasta_and_tag_with_colors( unsigned int total_reads; try { - hashbits->consume_partitioned_fasta_and_tag_with_colors(filename, + hashbits->consume_partitioned_fasta_and_tag_with_labels(filename, total_reads, n_consumed, _report_fn, callback_obj); } catch (_khmer_signal &e) { return NULL; @@ -3892,7 +3892,7 @@ static PyObject * hashbits_consume_partitioned_fasta_and_tag_with_colors( return Py_BuildValue("iK", total_reads, n_consumed); } -static PyObject * hashbits_consume_sequence_and_tag_with_colors(PyObject * self, PyObject * args) { +static PyObject * 
hashbits_consume_sequence_and_tag_with_labels(PyObject * self, PyObject * args) { khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; khmer::Hashbits * hb = me->hashbits; @@ -3903,12 +3903,12 @@ static PyObject * hashbits_consume_sequence_and_tag_with_colors(PyObject * self, } unsigned long long n_consumed = 0; - khmer::Color * the_color = hb->check_and_allocate_color(c); + khmer::Label * the_label = hb->check_and_allocate_label(c); try { //if (hb->check_and_normalize_read(seq)) { - hb->consume_sequence_and_tag_with_colors(seq, n_consumed, *the_color); + hb->consume_sequence_and_tag_with_labels(seq, n_consumed, *the_label); //} } catch (_khmer_signal &e) { return NULL; @@ -3916,7 +3916,7 @@ static PyObject * hashbits_consume_sequence_and_tag_with_colors(PyObject * self, return Py_BuildValue("L", n_consumed); } -static PyObject * hashbits_sweep_color_neighborhood(PyObject * self, PyObject * args) { +static PyObject * hashbits_sweep_label_neighborhood(PyObject * self, PyObject * args) { khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; khmer::Hashbits * hb = me->hashbits; @@ -3949,14 +3949,14 @@ static PyObject * hashbits_sweep_color_neighborhood(PyObject * self, PyObject * return NULL; } - //std::pair ret; - ColorPtrSet found_colors; + //std::pair ret; + LabelPtrSet found_labels; bool exc_raised = false; unsigned int num_traversed = 0; //Py_BEGIN_ALLOW_THREADS try { - num_traversed = hb->sweep_color_neighborhood(seq, found_colors, range, break_on_stop_tags, stop_big_traversals); + num_traversed = hb->sweep_label_neighborhood(seq, found_labels, range, break_on_stop_tags, stop_big_traversals); } catch (_khmer_signal &e) { exc_raised = true; } @@ -3966,10 +3966,10 @@ static PyObject * hashbits_sweep_color_neighborhood(PyObject * self, PyObject * if (exc_raised) return NULL; - PyObject * x = PyList_New(found_colors.size()); - khmer::ColorPtrSet::const_iterator si; + PyObject * x = PyList_New(found_labels.size()); + khmer::LabelPtrSet::const_iterator 
si; unsigned long long i = 0; - for (si=found_colors.begin(); si!=found_colors.end(); ++si) { + for (si=found_labels.begin(); si!=found_labels.end(); ++si) { PyList_SET_ITEM(x, i, Py_BuildValue("K", *(*si))); i++; } @@ -4038,7 +4038,7 @@ static PyObject * hashbits_sweep_tag_neighborhood(PyObject * self, PyObject *arg } -static PyObject * hashbits_get_tag_colors(PyObject * self, PyObject * args) { +static PyObject * hashbits_get_tag_labels(PyObject * self, PyObject * args) { khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; khmer::Hashbits * hashbits = me->hashbits; @@ -4049,14 +4049,14 @@ static PyObject * hashbits_get_tag_colors(PyObject * self, PyObject * args) { return NULL; } - khmer::ColorPtrSet colors; + khmer::LabelPtrSet labels; - colors = hashbits->get_tag_colors(tag); + labels = hashbits->get_tag_labels(tag); - PyObject * x = PyList_New(colors.size()); - khmer::ColorPtrSet::const_iterator si; + PyObject * x = PyList_New(labels.size()); + khmer::LabelPtrSet::const_iterator si; unsigned long long i = 0; - for (si=colors.begin(); si!=colors.end(); ++si) { + for (si=labels.begin(); si!=labels.end(); ++si) { //std::string kmer_s = _revhash(*si, hashbits->ksize()); PyList_SET_ITEM(x, i, Py_BuildValue("K", *(*si))); i++; @@ -4065,7 +4065,7 @@ static PyObject * hashbits_get_tag_colors(PyObject * self, PyObject * args) { return x; } -static PyObject * hashbits_n_colors(PyObject * self, PyObject * args) +static PyObject * hashbits_n_labels(PyObject * self, PyObject * args) { khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; khmer::Hashbits * hashbits = me->hashbits; @@ -4074,7 +4074,7 @@ static PyObject * hashbits_n_colors(PyObject * self, PyObject * args) return NULL; } - return PyInt_FromLong(hashbits->n_colors()); + return PyInt_FromLong(hashbits->n_labels()); } static PyMethodDef khmer_hashbits_methods[] = { @@ -4145,14 +4145,14 @@ static PyMethodDef khmer_hashbits_methods[] = { { "traverse_from_tags", hashbits_traverse_from_tags, 
METH_VARARGS, "" }, { "repartition_largest_partition", hashbits_repartition_largest_partition, METH_VARARGS, "" }, { "get_median_count", hashbits_get_median_count, METH_VARARGS, "Get the median, average, and stddev of the k-mer counts in the string" }, - { "consume_fasta_and_tag_with_colors", hashbits_consume_fasta_and_tag_with_colors, METH_VARARGS, "" }, - { "sweep_color_neighborhood", hashbits_sweep_color_neighborhood, METH_VARARGS, "" }, - {"consume_partitioned_fasta_and_tag_with_colors", hashbits_consume_partitioned_fasta_and_tag_with_colors, METH_VARARGS, "" }, + { "consume_fasta_and_tag_with_labels", hashbits_consume_fasta_and_tag_with_labels, METH_VARARGS, "" }, + { "sweep_label_neighborhood", hashbits_sweep_label_neighborhood, METH_VARARGS, "" }, + {"consume_partitioned_fasta_and_tag_with_labels", hashbits_consume_partitioned_fasta_and_tag_with_labels, METH_VARARGS, "" }, {"sweep_tag_neighborhood", hashbits_sweep_tag_neighborhood, METH_VARARGS, "" }, - {"get_tag_colors", hashbits_get_tag_colors, METH_VARARGS, ""}, - {"consume_sequence_and_tag_with_colors", hashbits_consume_sequence_and_tag_with_colors, METH_VARARGS, "" }, - {"n_colors", hashbits_n_colors, METH_VARARGS, ""}, - {"get_color_dict", hashbits_get_color_dict, METH_VARARGS, "" }, + {"get_tag_labels", hashbits_get_tag_labels, METH_VARARGS, ""}, + {"consume_sequence_and_tag_with_labels", hashbits_consume_sequence_and_tag_with_labels, METH_VARARGS, "" }, + {"n_labels", hashbits_n_labels, METH_VARARGS, ""}, + {"get_label_dict", hashbits_get_label_dict, METH_VARARGS, "" }, {NULL, NULL, 0, NULL} /* sentinel */ }; diff --git a/lib/hashtable.cc b/lib/hashtable.cc index 5a9eb9624d..c8c075143c 100644 --- a/lib/hashtable.cc +++ b/lib/hashtable.cc @@ -1952,7 +1952,7 @@ void Hashtable::extract_unique_paths(std::string seq, */ void -Hashtable::consume_fasta_and_tag_with_colors( +Hashtable::consume_fasta_and_tag_with_labels( std:: string const &filename, unsigned int &total_reads, unsigned long long &n_consumed, 
CallbackFn callback, void * callback_data @@ -1968,7 +1968,7 @@ Hashtable::consume_fasta_and_tag_with_colors( ); - consume_fasta_and_tag_with_colors( + consume_fasta_and_tag_with_labels( parser, total_reads, n_consumed, callback, callback_data @@ -1978,7 +1978,7 @@ Hashtable::consume_fasta_and_tag_with_colors( } void -Hashtable::consume_fasta_and_tag_with_colors( +Hashtable::consume_fasta_and_tag_with_labels( read_parsers:: IParser * parser, unsigned int &total_reads, unsigned long long &n_consumed, CallbackFn callback, void * callback_data @@ -2001,9 +2001,9 @@ Hashtable::consume_fasta_and_tag_with_colors( "Starting trace of 'consume_fasta_and_tag'....\n" ); - Color _tag_color = 0; + Label _tag_label = 0; - Color * the_color; + Label * the_label; // Iterate through the reads and consume their k-mers. while (!parser->is_complete( )) { @@ -2014,11 +2014,11 @@ Hashtable::consume_fasta_and_tag_with_colors( if (check_and_normalize_read( read.sequence )) { // TODO: make threadsafe! - the_color = check_and_allocate_color(_tag_color); - consume_sequence_and_tag_with_colors( read.sequence, + the_label = check_and_allocate_label(_tag_label); + consume_sequence_and_tag_with_labels( read.sequence, this_n_consumed, - *the_color ); - _tag_color++; + *the_label ); + _tag_label++; #ifdef WITH_INTERNAL_METRICS hasher.pmetrics.start_timers( ); @@ -2064,7 +2064,7 @@ Hashtable::consume_fasta_and_tag_with_colors( } -void Hashtable::consume_partitioned_fasta_and_tag_with_colors(const std::string &filename, +void Hashtable::consume_partitioned_fasta_and_tag_with_labels(const std::string &filename, unsigned int &total_reads, unsigned long long &n_consumed, CallbackFn callback, @@ -2085,7 +2085,7 @@ void Hashtable::consume_partitioned_fasta_and_tag_with_colors(const std::string // // iterate through the FASTA file & consume the reads. 
// - Color * c; + Label * c; PartitionID p; while(!parser->is_complete()) { read = parser->get_next_read(); @@ -2094,9 +2094,9 @@ void Hashtable::consume_partitioned_fasta_and_tag_with_colors(const std::string if (check_and_normalize_read(seq)) { // First, figure out what the partition is (if non-zero), and save that. p = _parse_partition_id(read.name); - c = check_and_allocate_color(p); + c = check_and_allocate_label(p); - consume_sequence_and_tag_with_colors( seq, + consume_sequence_and_tag_with_labels( seq, n_consumed, *c ); } @@ -2107,7 +2107,7 @@ void Hashtable::consume_partitioned_fasta_and_tag_with_colors(const std::string // run callback, if specified if (total_reads % CALLBACK_PERIOD == 0 && callback) { try { - callback("consume_partitioned_fasta_and_tag_with_colors", callback_data, + callback("consume_partitioned_fasta_and_tag_with_labels", callback_data, total_reads, n_consumed); } catch (...) { delete parser; @@ -2116,24 +2116,24 @@ void Hashtable::consume_partitioned_fasta_and_tag_with_colors(const std::string } } - // @cswelcher TODO: check that deallocate ColorPtrMap is correct + // @cswelcher TODO: check that deallocate LabelPtrMap is correct delete parser; } // @cswelcher: double-check -- is it valid to pull the address from a reference? 
-void Hashtable::link_tag_and_color(HashIntoType& kmer, Color& kmer_color) { - tag_colors.insert(TagColorPtrPair(kmer, &kmer_color)); - color_tag_ptrs.insert(ColorTagPtrPair(kmer_color, &kmer)); +void Hashtable::link_tag_and_label(HashIntoType& kmer, Label& kmer_label) { + tag_labels.insert(TagLabelPtrPair(kmer, &kmer_label)); + label_tag_ptrs.insert(LabelTagPtrPair(kmer_label, &kmer)); } -/* This is essentially the same code as above, only it assigns colors to the - * tags through multimap TagColorMap defined in hashtable.hh, declared in +/* This is essentially the same code as above, only it assigns labels to the + * tags through multimap TagLabelMap defined in hashtable.hh, declared in * hashbits.hh - * @cswelcher TODO: should I instead send in the pointer to the new color? + * @cswelcher TODO: should I instead send in the pointer to the new label? */ -void Hashtable::consume_sequence_and_tag_with_colors(const std::string& seq, +void Hashtable::consume_sequence_and_tag_with_labels(const std::string& seq, unsigned long long& n_consumed, - Color& current_color, + Label& current_label, SeenSet * found_tags) { bool is_new_kmer; @@ -2160,12 +2160,12 @@ void Hashtable::consume_sequence_and_tag_with_colors(const std::string& seq, if (kmer_tagged) { since = 1; - // Coloring code + // Labeling code // TODO: MAKE THREADSAFE! - if (!_cmap_contains_color(tag_colors, kmer, current_color)) { + if (!_cmap_contains_label(tag_labels, kmer, current_label)) { ACQUIRE_TAG_COLORS_SPIN_LOCK - link_tag_and_color(kmer, current_color); + link_tag_and_label(kmer, current_label); RELEASE_TAG_COLORS_SPIN_LOCK } if (found_tags) { @@ -2187,10 +2187,10 @@ void Hashtable::consume_sequence_and_tag_with_colors(const std::string& seq, all_tags.insert(kmer); RELEASE_ALL_TAGS_SPIN_LOCK - // Coloring code + // Labeling code // TODO: MAKE THREADSAFE! 
ACQUIRE_TAG_COLORS_SPIN_LOCK - link_tag_and_color(kmer, current_color); + link_tag_and_label(kmer, current_label); RELEASE_TAG_COLORS_SPIN_LOCK if (found_tags) { found_tags->insert(kmer); } @@ -2204,24 +2204,24 @@ void Hashtable::consume_sequence_and_tag_with_colors(const std::string& seq, all_tags.insert(kmer); // insert the last k-mer, too. RELEASE_ALL_TAGS_SPIN_LOCK - // Color code: TODO: MAKE THREADSAFE! - link_tag_and_color(kmer, current_color); + // Label code: TODO: MAKE THREADSAFE! + link_tag_and_label(kmer, current_label); if (found_tags) { found_tags->insert(kmer); } } } /* - * Find all colors associated with the sequence + * Find all labels associated with the sequence * For now, check /every/ k-mer with find_all_tags * THIS SUCKS AND IT'S YOUR FAULT @CTB */ -unsigned int Hashtable::sweep_sequence_for_colors(const std::string& seq, - ColorPtrSet& found_colors, +unsigned int Hashtable::sweep_sequence_for_labels(const std::string& seq, + LabelPtrSet& found_labels, bool break_on_stoptags, bool stop_big_traversals) { SeenSet tagged_kmers; - //ColorPtrSet found_colors; + //LabelPtrSet found_labels; HashIntoType kmer_f, kmer_r, kmer; @@ -2239,14 +2239,14 @@ unsigned int Hashtable::sweep_sequence_for_colors(const std::string& seq, if (get_count(uniqify_rc(kmer_f,kmer_r))) { partition->find_all_tags(kmer_f, kmer_r, tagged_kmers, all_tags, break_on_stoptags, stop_big_traversals); - traverse_colors_and_resolve(tagged_kmers, found_colors); + traverse_labels_and_resolve(tagged_kmers, found_labels); } } return traversed_kmers.size(); } -unsigned int Hashtable::sweep_color_neighborhood(const std::string& seq, - ColorPtrSet& found_colors, +unsigned int Hashtable::sweep_label_neighborhood(const std::string& seq, + LabelPtrSet& found_labels, unsigned int range, bool break_on_stoptags, bool stop_big_traversals) { @@ -2255,7 +2255,7 @@ unsigned int Hashtable::sweep_color_neighborhood(const std::string& seq, unsigned int num_traversed; num_traversed = 
partition->sweep_for_tags(seq, tagged_kmers, all_tags, range, break_on_stoptags, stop_big_traversals); - traverse_colors_and_resolve(tagged_kmers, found_colors); + traverse_labels_and_resolve(tagged_kmers, found_labels); //printf("range=%u ", range); if (range == 0) { assert(num_traversed == seq.length()-ksize()+1); @@ -2264,31 +2264,31 @@ unsigned int Hashtable::sweep_color_neighborhood(const std::string& seq, return num_traversed; } -ColorPtrSet Hashtable::get_tag_colors(const HashIntoType& tag) { - ColorPtrSet colors; - unsigned int num_colors; - _get_tag_colors(tag, tag_colors, colors); - return colors; +LabelPtrSet Hashtable::get_tag_labels(const HashIntoType& tag) { + LabelPtrSet labels; + unsigned int num_labels; + _get_tag_labels(tag, tag_labels, labels); + return labels; } -TagPtrSet Hashtable::get_color_tags(const Color& color) { +TagPtrSet Hashtable::get_label_tags(const Label& label) { TagPtrSet tags; unsigned int num_tags; - _get_tags_from_color(color, color_tag_ptrs, tags); + _get_tags_from_label(label, label_tag_ptrs, tags); return tags; } -void Hashtable::traverse_colors_and_resolve(const SeenSet& tagged_kmers, - ColorPtrSet& found_colors) { +void Hashtable::traverse_labels_and_resolve(const SeenSet& tagged_kmers, + LabelPtrSet& found_labels) { SeenSet::const_iterator si; - unsigned int num_colors = 0; + unsigned int num_labels = 0; for (si=tagged_kmers.begin(); si!=tagged_kmers.end(); ++si) { HashIntoType tag = *si; - // get the colors associated with this tag - num_colors = _get_tag_colors(tag, tag_colors, found_colors); - if (num_colors > 1) { - // reconcile colors + // get the labels associated with this tag + num_labels = _get_tag_labels(tag, tag_labels, found_labels); + if (num_labels > 1) { + // reconcile labels // for now do nothing ha } } diff --git a/lib/hashtable.hh b/lib/hashtable.hh index 653ad7ba95..f13dcd51a2 100644 --- a/lib/hashtable.hh +++ b/lib/hashtable.hh @@ -182,52 +182,52 @@ namespace khmer { HashIntoType bitmask; unsigned int 
_nbits_sub_1; - // Does the given tag already have the given color? - bool _cmap_contains_color(const TagColorPtrMap& cmap, + // Does the given tag already have the given label? + bool _cmap_contains_label(const TagLabelPtrMap& cmap, HashIntoType& kmer, - Color& the_color) + Label& the_label) { - std::pair ret; + std::pair ret; ret = cmap.equal_range(kmer); - for (TagColorPtrMap::const_iterator it=ret.first; it!=ret.second; ++it) { - if (*(it->second) == the_color) return true; + for (TagLabelPtrMap::const_iterator it=ret.first; it!=ret.second; ++it) { + if (*(it->second) == the_label) return true; } return false; } - // Does the given color already have a tag associated with it? - bool _cmap_contains_tag(const ColorTagPtrMap& cmap, - Color& the_color, + // Does the given label already have a tag associated with it? + bool _cmap_contains_tag(const LabelTagPtrMap& cmap, + Label& the_label, HashIntoType& kmer) { - std::pair ret; - ret = cmap.equal_range(the_color); - for (ColorTagPtrMap::const_iterator it=ret.first; it!=ret.second; ++it) { + std::pair ret; + ret = cmap.equal_range(the_label); + for (LabelTagPtrMap::const_iterator it=ret.first; it!=ret.second; ++it) { if(*(it->second) == kmer) return true; } return false; } - unsigned int _get_tag_colors(const HashIntoType& tag, - const TagColorPtrMap& cmap, - ColorPtrSet& found_colors) { - unsigned int num_colors = 0; - std::pair ret; + unsigned int _get_tag_labels(const HashIntoType& tag, + const TagLabelPtrMap& cmap, + LabelPtrSet& found_labels) { + unsigned int num_labels = 0; + std::pair ret; ret = cmap.equal_range(tag); - for (TagColorPtrMap::const_iterator it=ret.first; it!=ret.second; ++it) { - found_colors.insert(it->second); - ++num_colors; + for (TagLabelPtrMap::const_iterator it=ret.first; it!=ret.second; ++it) { + found_labels.insert(it->second); + ++num_labels; } - return num_colors; + return num_labels; } - unsigned int _get_tags_from_color(const Color& color, - const ColorTagPtrMap& cmap, - TagPtrSet& 
colored_tags) { + unsigned int _get_tags_from_label(const Label& label, + const LabelTagPtrMap& cmap, + TagPtrSet& labeled_tags) { unsigned int num_tags = 0; - std::pair ret; - ret = cmap.equal_range(color); - for (ColorTagPtrMap::const_iterator it=ret.first; it!=ret.second; ++it) { - colored_tags.insert(it->second); + std::pair ret; + ret = cmap.equal_range(label); + for (LabelTagPtrMap::const_iterator it=ret.first; it!=ret.second; ++it) { + labeled_tags.insert(it->second); ++num_tags; } return num_tags; @@ -253,7 +253,7 @@ namespace khmer { partition = new SubsetPartition(this); _init_bitstuff(); _all_tags_spin_lock = 0; - _tag_colors_spin_lock = 0; + _tag_labels_spin_lock = 0; } @@ -367,15 +367,15 @@ namespace khmer { } uint32_t _all_tags_spin_lock; - uint32_t _tag_colors_spin_lock; + uint32_t _tag_labels_spin_lock; public: SubsetPartition * partition; SeenSet all_tags; SeenSet stop_tags; SeenSet repart_small_tags; - TagColorPtrMap tag_colors; - ColorTagPtrMap color_tag_ptrs; - ColorPtrMap color_ptrs; + TagLabelPtrMap tag_labels; + LabelTagPtrMap label_tag_ptrs; + LabelPtrMap label_ptrs; // accessor to get 'k' const WordLength ksize() const { return _ksize; } @@ -455,7 +455,7 @@ namespace khmer { // Partitioning stuff. 
unsigned int n_tags() const { return all_tags.size(); } - unsigned int n_colors() const { return color_ptrs.size(); } + unsigned int n_labels() const { return label_ptrs.size(); } void divide_tags_into_subsets(unsigned int subset_size, SeenSet& divvy); @@ -486,13 +486,13 @@ namespace khmer { void * callback_data = NULL ); - Color * check_and_allocate_color(Color new_color) { - Color * c; - if (color_ptrs.count(new_color)) { - c = color_ptrs[new_color]; + Label * check_and_allocate_label(Label new_label) { + Label * c; + if (label_ptrs.count(new_label)) { + c = label_ptrs[new_label]; } else { - c = new Color(new_color); - color_ptrs[*c] = c; + c = new Label(new_label); + label_ptrs[*c] = c; } return c; } @@ -508,49 +508,49 @@ namespace khmer { CallbackFn callback = 0, void * callback_data = 0); - void consume_fasta_and_tag_with_colors( + void consume_fasta_and_tag_with_labels( std::string const &filename, unsigned int &total_reads, unsigned long long &n_consumed, CallbackFn callback = NULL, void * callback_data = NULL); - void consume_fasta_and_tag_with_colors( + void consume_fasta_and_tag_with_labels( read_parsers:: IParser * parser, unsigned int &total_reads, unsigned long long &n_consumed, CallbackFn callback = NULL, void * callback_data = NULL); - void consume_partitioned_fasta_and_tag_with_colors(const std::string &filename, + void consume_partitioned_fasta_and_tag_with_labels(const std::string &filename, unsigned int &total_reads, unsigned long long &n_consumed, CallbackFn callback = NULL, void * callback_datac = NULL); - void consume_sequence_and_tag_with_colors(const std::string& seq, + void consume_sequence_and_tag_with_labels(const std::string& seq, unsigned long long& n_consumed, - Color& current_color, + Label& current_label, SeenSet * new_tags = 0); - ColorPtrSet get_tag_colors(const HashIntoType& tag); - TagPtrSet get_color_tags(const Color& color); + LabelPtrSet get_tag_labels(const HashIntoType& tag); + TagPtrSet get_label_tags(const Label& label); - 
void link_tag_and_color(HashIntoType& kmer, Color& color); + void link_tag_and_label(HashIntoType& kmer, Label& label); - unsigned int sweep_sequence_for_colors(const std::string& seq, - ColorPtrSet& found_colors, + unsigned int sweep_sequence_for_labels(const std::string& seq, + LabelPtrSet& found_labels, bool break_on_stoptags, bool stop_big_traversals); - unsigned int sweep_color_neighborhood(const std::string & seq, - ColorPtrSet& found_colors, + unsigned int sweep_label_neighborhood(const std::string & seq, + LabelPtrSet& found_labels, unsigned int range, bool break_on_stoptags, bool stop_big_traversals); - void traverse_colors_and_resolve(const SeenSet& tagged_kmers, - ColorPtrSet& found_colors); + void traverse_labels_and_resolve(const SeenSet& tagged_kmers, + LabelPtrSet& found_labels); void consume_fasta_and_traverse(const std::string &filename, unsigned int distance, @@ -661,9 +661,9 @@ namespace khmer { __sync_bool_compare_and_swap( &_all_tags_spin_lock, 1, 0 ); #define ACQUIRE_TAG_COLORS_SPIN_LOCK \ - while(!__sync_bool_compare_and_swap( &_tag_colors_spin_lock, 0, 1)); + while(!__sync_bool_compare_and_swap( &_tag_labels_spin_lock, 0, 1)); #define RELEASE_TAG_COLORS_SPIN_LOCK \ - __sync_bool_compare_and_swap( &_tag_colors_spin_lock, 1, 0); + __sync_bool_compare_and_swap( &_tag_labels_spin_lock, 1, 0); #endif // HASHTABLE_HH diff --git a/lib/khmer.hh b/lib/khmer.hh index 882ea88bad..507c1e443a 100644 --- a/lib/khmer.hh +++ b/lib/khmer.hh @@ -87,14 +87,14 @@ namespace khmer { typedef std::map PartitionCountMap; typedef std::map PartitionCountDistribution; - typedef unsigned long long int Color; - typedef std::multimap TagColorPtrMap; - typedef std::multimap ColorTagPtrMap; - typedef std::pair TagColorPtrPair; - typedef std::pair ColorTagPtrPair; - typedef std::set ColorPtrSet; + typedef unsigned long long int Label; + typedef std::multimap TagLabelPtrMap; + typedef std::multimap LabelTagPtrMap; + typedef std::pair TagLabelPtrPair; + typedef std::pair 
LabelTagPtrPair; + typedef std::set LabelPtrSet; typedef std::set TagPtrSet; - typedef std::map ColorPtrMap; + typedef std::map LabelPtrMap; template void deallocate_ptr_set(T& s) { diff --git a/tests/test-data/test-labels.fa b/tests/test-data/test-labels.fa new file mode 100644 index 0000000000..b93d7c3c64 --- /dev/null +++ b/tests/test-data/test-labels.fa @@ -0,0 +1,8 @@ +>read_A +ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG +>read_B_overlap_A +GCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA +>read_C_overlap_B +TGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGAGTTGGGGATAGATAGATAGATGAGTGTAGATCCAACAACACATACA +>read_D +TATATATATAGCTAGCTAGCTAACTAGCTAGCATCGATCGATCGATC From c9f4b1027c136e93a925d899402c882b01f86abb Mon Sep 17 00:00:00 2001 From: CS Welcher Date: Tue, 19 Nov 2013 15:47:30 -0500 Subject: [PATCH 092/140] forgot to commit test changes --- tests/test_hashbits.py | 130 ++++++++++++++++++++--------------------- 1 file changed, 65 insertions(+), 65 deletions(-) diff --git a/tests/test_hashbits.py b/tests/test_hashbits.py index c72245a63f..bb9299c92a 100644 --- a/tests/test_hashbits.py +++ b/tests/test_hashbits.py @@ -505,25 +505,25 @@ def test_simple_median(): # @cswelcher TODO: more tests! 
# * thread-safety -def test_n_colors(): +def test_n_labels(): hb = khmer.new_hashbits(20, 1e7, 4) - filename = utils.get_test_data('test-colors.fa') - hb.consume_fasta_and_tag_with_colors(filename) + filename = utils.get_test_data('test-labels.fa') + hb.consume_fasta_and_tag_with_labels(filename) - print hb.n_colors() - assert hb.n_colors() == 4 + print hb.n_labels() + assert hb.n_labels() == 4 -def test_get_color_dict(): +def test_get_label_dict(): hb = khmer.new_hashbits(20, 1e7, 4) - filename = utils.get_test_data('test-colors.fa') - hb.consume_fasta_and_tag_with_colors(filename) + filename = utils.get_test_data('test-labels.fa') + hb.consume_fasta_and_tag_with_labels(filename) - colors = hb.get_color_dict() + labels = hb.get_label_dict() expected = [0L, 1L, 2L, 3L] - for e_color in expected: - assert e_color in colors - for a_color in colors: - assert a_color in expected + for e_label in expected: + assert e_label in labels + for a_label in labels: + assert a_label in expected def test_sweep_tag_neighborhood(): hb = khmer.new_hashbits(20, 1e7, 4) @@ -534,96 +534,96 @@ def test_sweep_tag_neighborhood(): assert len(tags) == 1 assert tags.pop() == 173473779682L -def test_get_tag_colors(): +def test_get_tag_labels(): hb = khmer.new_hashbits(20, 1e7, 4) filename = utils.get_test_data('single-read.fq') - hb.consume_fasta_and_tag_with_colors(filename) + hb.consume_fasta_and_tag_with_labels(filename) tag = 173473779682L - colors = hb.get_tag_colors(tag) - assert len(colors) == 1 - assert colors.pop() == 0L + labels = hb.get_tag_labels(tag) + assert len(labels) == 1 + assert labels.pop() == 0L -def test_sweep_sequence_for_colors(): +def test_sweep_sequence_for_labels(): hb = khmer.new_hashbits(20, 1e7, 4) filename = utils.get_test_data('single-read.fq') - hb.consume_fasta_and_tag_with_colors(filename) + hb.consume_fasta_and_tag_with_labels(filename) - colors = hb.sweep_color_neighborhood('CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT') - assert len(colors) == 1 - assert 
colors.pop() == 0L + labels = hb.sweep_label_neighborhood('CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT') + assert len(labels) == 1 + assert labels.pop() == 0L -def test_consume_partitioned_fasta_and_tag_with_colors(): +def test_consume_partitioned_fasta_and_tag_with_labels(): hb = khmer.new_hashbits(20, 1e7, 4) filename = utils.get_test_data('real-partition-small.fa') - total_reads, n_consumed = hb.consume_partitioned_fasta_and_tag_with_colors(filename) - colors = set() + total_reads, n_consumed = hb.consume_partitioned_fasta_and_tag_with_labels(filename) + labels = set() for record in screed.open(filename): seq = record.sequence - colors.update(hb.sweep_color_neighborhood(seq, False, False)) - #print hb.n_colors() - #print colors - assert len(colors) == 1 - assert colors.pop() == 2L - assert hb.n_colors() == 1 - -def test_consume_fasta_and_tag_with_colors(): + labels.update(hb.sweep_label_neighborhood(seq, False, False)) + #print hb.n_labels() + #print labels + assert len(labels) == 1 + assert labels.pop() == 2L + assert hb.n_labels() == 1 + +def test_consume_fasta_and_tag_with_labels(): hb = khmer.new_hashbits(20, 1e7, 4) read_1 = 'ACGTAACCGGTTAAACCCGGGTTTAAAACCCCGGGGTTTT' filename = utils.get_test_data('test-transcript.fa') - total_reads, n_consumed = hb.consume_fasta_and_tag_with_colors(filename) + total_reads, n_consumed = hb.consume_fasta_and_tag_with_labels(filename) assert hb.get(read_1[:20]) assert total_reads == 3 - print hb.n_colors() - print hb.get_color_dict() + print hb.n_labels() + print hb.get_label_dict() for tag in hb.get_tagset(): print tag, khmer.forward_hash(tag, 20) for record in screed.open(filename): print hb.sweep_tag_neighborhood(record.sequence, 40) - print hb.sweep_color_neighborhood(record.sequence, 40) - assert hb.n_colors() == 3 + print hb.sweep_label_neighborhood(record.sequence, 40) + assert hb.n_labels() == 3 ''' * The test data set as four reads: A, B, C, and D * Overlaps are A <-> B <-> C, with D on its own -* Thus, traversing from A 
should find colors from A and B, - traversing from B should find colors from A, B, and C, - and traversing from C should find colors from B and C +* Thus, traversing from A should find labels from A and B, + traversing from B should find labels from A, B, and C, + and traversing from C should find labels from B and C ''' -def test_color_tag_correctness(): +def test_label_tag_correctness(): hb = khmer.new_hashbits(20, 1e7, 4) - filename = utils.get_test_data('test-colors.fa') - hb.consume_fasta_and_tag_with_colors(filename) + filename = utils.get_test_data('test-labels.fa') + hb.consume_fasta_and_tag_with_labels(filename) # read A - colors = hb.sweep_color_neighborhood('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAGCTAGGCTAGGTGTGCTCTGCCTAGAGCTAGGCTAGGTGT') + labels = hb.sweep_label_neighborhood('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAGCTAGGCTAGGTGTGCTCTGCCTAGAGCTAGGCTAGGTGT') print hb.sweep_tag_neighborhood('TTCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAGCTAGGCTAGGTGTGCTCTGCCTAGAGCTAGGCTAGGTGT') - print colors + print labels print len('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG')-19 - assert len(colors) == 2 - assert 0L in colors - assert 1L in colors + assert len(labels) == 2 + assert 0L in labels + assert 1L in labels # read B - colors = hb.sweep_color_neighborhood('GCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA') - print colors - assert len(colors) == 3 - assert 0L in colors - assert 1L in colors - assert 2L in colors + labels = hb.sweep_label_neighborhood('GCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA') + print labels + assert len(labels) == 3 + assert 0L in labels + assert 1L in labels + assert 2L in labels # read C - colors = 
hb.sweep_color_neighborhood('TGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGAGTTGGGGATAGATAGATAGATGAGTGTAGATCCAACAACACATACA') - print colors - assert len(colors) == 2 - assert 1L in colors - assert 2L in colors + labels = hb.sweep_label_neighborhood('TGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGAGTTGGGGATAGATAGATAGATGAGTGTAGATCCAACAACACATACA') + print labels + assert len(labels) == 2 + assert 1L in labels + assert 2L in labels # read D - colors = hb.sweep_color_neighborhood('TATATATATAGCTAGCTAGCTAACTAGCTAGCATCGATCGATCGATC') - print colors - assert len(colors) == 1 - assert 3L in colors + labels = hb.sweep_label_neighborhood('TATATATATAGCTAGCTAGCTAACTAGCTAGCATCGATCGATCGATC') + print labels + assert len(labels) == 1 + assert 3L in labels From d40d868c9742f8389b82c11f691f76eaf625971a Mon Sep 17 00:00:00 2001 From: CS Welcher Date: Tue, 19 Nov 2013 15:49:52 -0500 Subject: [PATCH 093/140] aaaaaand the script --- scripts/sweep-reads-by-partition-buffered.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py index 0df68b2861..266d1bef9f 100755 --- a/scripts/sweep-reads-by-partition-buffered.py +++ b/scripts/sweep-reads-by-partition-buffered.py @@ -215,7 +215,7 @@ def main(): print >>sys.stderr, 'consuming fastp...' if debug: yep.start(debug) - ht.consume_partitioned_fasta_and_tag_with_colors(input_fastp) + ht.consume_partitioned_fasta_and_tag_with_labels(input_fastp) label_number_dist = [] @@ -246,7 +246,7 @@ def main(): seq = record.sequence name = record.name try: - labels = ht.sweep_color_neighborhood(seq, traversal_range) + labels = ht.sweep_label_neighborhood(seq, traversal_range) except ValueError as e: print >>sys.stderr, '!! 
ERROR: {e} !!'.format(e=e) print >>sys.stderr, 'Read length less than k-mer size' From 8b28ba39b12aba1f174015da58e1f1c1915d841c Mon Sep 17 00:00:00 2001 From: CS Welcher Date: Tue, 19 Nov 2013 16:23:11 -0500 Subject: [PATCH 094/140] writing script to build sparse graph --- sandbox/build-sparse-graph.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 sandbox/build-sparse-graph.py diff --git a/sandbox/build-sparse-graph.py b/sandbox/build-sparse-graph.py new file mode 100644 index 0000000000..1f037afb82 --- /dev/null +++ b/sandbox/build-sparse-graph.py @@ -0,0 +1,13 @@ +import khmer +import sys +import screed + + +input_fasta = sys.argv[3] +K = sys.argv[1] +x = sys.argv[2] + + +ht = khmer.new_hashbits(K, x, 4) + + From 048e542f1b975d933a490c9d68d623804e923535 Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Tue, 19 Nov 2013 18:10:57 -0500 Subject: [PATCH 095/140] done for the day --- sandbox/build-sparse-graph.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sandbox/build-sparse-graph.py b/sandbox/build-sparse-graph.py index 1f037afb82..a8e6ac8969 100644 --- a/sandbox/build-sparse-graph.py +++ b/sandbox/build-sparse-graph.py @@ -1,6 +1,7 @@ import khmer import sys import screed +import graph_tool.all as gt input_fasta = sys.argv[3] @@ -10,4 +11,7 @@ ht = khmer.new_hashbits(K, x, 4) +sparse_graph = gt.Graph() +for n, record in enumerate(screed.open(input_fasta)): + From a693a87860da276e1a725233fcc380cb6064eded Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Fri, 22 Nov 2013 14:32:06 -0500 Subject: [PATCH 096/140] removed references to debugger --- scripts/sweep-reads-by-partition-buffered.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py index 266d1bef9f..7f460caff4 100755 --- a/scripts/sweep-reads-by-partition-buffered.py +++ b/scripts/sweep-reads-by-partition-buffered.py @@ -161,7 +161,6 @@ def main(): default=DEFAULT_OUT_PREF) 
parser.add_argument('-m', '--max_buffers', dest='max_buffers', type=int, \ default=DEFAULT_NUM_BUFFERS) - parser.add_argument('-d', '--debug', dest='debug', default=None) parser.add_argument('input_files', nargs='+') args = parser.parse_args() @@ -204,17 +203,11 @@ def main(): est = args.files_estimate input_files = args.input_files - debug = args.debug - if debug: - import yep - output_buffer = ReadBufferManager(max_buffers, buf_size, est, output_pref, outdir) # consume the partitioned fasta with which to label the graph ht = khmer.new_hashbits(K, HT_SIZE, N_HT) print >>sys.stderr, 'consuming fastp...' - if debug: - yep.start(debug) ht.consume_partitioned_fasta_and_tag_with_labels(input_fastp) label_number_dist = [] @@ -272,8 +265,6 @@ def main(): output_buffer.flush_all() total_t = time.clock() - total_t - if debug: - yep.stop() if output_buffer.num_write_errors > 0 or output_buffer.num_file_errors > 0: print >>sys.stderr, '! WARNING: Sweep finished with errors !' print >>sys.stderr, '** {writee} reads not written'.format(writee=output_buffer.num_write_errors) From d6e4088069b79c10162d06f815141ba7f694e2c9 Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Fri, 22 Nov 2013 14:40:43 -0500 Subject: [PATCH 097/140] playing with sparse graph viz --- sandbox/build-sparse-graph.py | 44 ++++++++++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/sandbox/build-sparse-graph.py b/sandbox/build-sparse-graph.py index a8e6ac8969..4cf9cd5523 100644 --- a/sandbox/build-sparse-graph.py +++ b/sandbox/build-sparse-graph.py @@ -3,15 +3,53 @@ import screed import graph_tool.all as gt - input_fasta = sys.argv[3] -K = sys.argv[1] -x = sys.argv[2] +K = int(sys.argv[1]) +x = float(sys.argv[2]) ht = khmer.new_hashbits(K, x, 4) sparse_graph = gt.Graph() +hashes = sparse_graph.new_vertex_property("long long") + for n, record in enumerate(screed.open(input_fasta)): + if n % 1000 == 0: + print >>sys.stderr, '...loaded and tagged {} sequences'.format(n) + 
name = record.name + sequence = record.sequence + ht.consume_sequence_and_tag_with_labels(sequence, n) + tags = ht.sweep_tag_neighborhood(sequence, 0) + for i in xrange(len(tags)-1): + src = tags[i] + dst = tags[i+1] + + new = False + + srcv = gt.find_vertex(sparse_graph, hashes, src) + if not srcv: + srcv = sparse_graph.add_vertex() + hashes[srcv] = src + new = True + else: + srcv = srcv[0] + + dstv = gt.find_vertex(sparse_graph, hashes, dst) + if not dstv: + dstv = sparse_graph.add_vertex() + hashes[dstv] = dst + new = True + else: + dstv = dstv[0] + + if new: + e = sparse_graph.add_edge(srcv, dstv) + +print 'Sparse graph has {} nodes, {} edges'.format(sparse_graph.num_vertices(), sparse_graph.num_edges()) +comp = gt.label_largest_component(sparse_graph, directed=False) +#pos = gt.radial_tree_layout(sparse_graph, sparse_graph.vertex(0)) +gt.graph_draw(sparse_graph, output_size=(5000,5000), output=input_fasta+'_sparse.png') +sparse_graph.set_vertex_filter(comp) +gt.graph_draw(sparse_graph, output_size=(5000,5000), output=input_fasta+'_sparse_comp.png') From 32c0c39f9a466a43257763b8f18fe8f71d349369 Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Fri, 22 Nov 2013 16:17:18 -0500 Subject: [PATCH 098/140] changed buffering parameters to make more sense --- scripts/sweep-reads-by-partition-buffered.py | 29 ++++++++++---------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py index 7f460caff4..a2a3e6050c 100755 --- a/scripts/sweep-reads-by-partition-buffered.py +++ b/scripts/sweep-reads-by-partition-buffered.py @@ -30,8 +30,8 @@ DEFAULT_NUM_BUFFERS=50000 -DEFAULT_BUFFER_SIZE=1000000 -DEFAULT_NUM_PARTITIONS=100000 +DEFAULT_MAX_READS=1000000 +DEFAULT_BUFFER_SIZE=10 DEFAULT_OUT_PREF='reads_' DEFAULT_RANGE=-1 @@ -80,16 +80,15 @@ def __len__(self): class ReadBufferManager: - def __init__(self, max_buffers, max_size, est_files, output_pref, outdir): + def __init__(self, 
max_buffers, max_reads, max_size, output_pref, outdir): self.buffers = {} self.buffer_counts = {} self.max_buffers = max_buffers - self.max_size = max_size + self.max_reads = max_reads - self.est_files = est_files self.output_pref = output_pref self.outdir = outdir - self.buffer_flush = self.max_size / self.est_files + self.buffer_flush = max_size self.cur_reads = 0 self.cur_files = 0 @@ -100,10 +99,9 @@ def __init__(self, max_buffers, max_size, est_files, output_pref, outdir): print >>sys.stderr, '''Init new ReadBuffer [ Max Buffers: {num_bufs} Max Reads: {max_reads} - Est. Files: {est_files} Buffer flush: {buf_flush} - ]'''.format(num_bufs=self.max_buffers, max_reads=self.max_size, - est_files=self.est_files, buf_flush=self.buffer_flush) + ]'''.format(num_bufs=self.max_buffers, max_reads=self.max_reads, + buf_flush=self.buffer_flush) def flush_buffer(self, buf_id): fn = '{}_{}.fa'.format(self.output_pref, buf_id) @@ -132,7 +130,7 @@ def queue(self, seq_str, buf_id): self.buffers[buf_id] = new_buf self.cur_reads += 1 - if self.cur_reads > self.max_size: + if self.cur_reads > self.max_reads: print >>sys.stderr, '** Reached max num reads...' 
self.flush_all() if len(self.buffers) > self.max_buffers: @@ -153,10 +151,10 @@ def main(): parser.add_argument('-i', '--input_fastp',dest='input_fastp') parser.add_argument('-r', '--traversal_range', type=int, dest='traversal_range', \ default=DEFAULT_RANGE) - parser.add_argument('-b', '--buffer_size', dest='buffer_size', type=int, \ + parser.add_argument('-b', '--buffer_size', dest='max_reads', type=int, \ + default=DEFAULT_MAX_READS) + parser.add_argument('-l', '--buffer_length', dest='buffer_size', type=int, \ default=DEFAULT_BUFFER_SIZE) - parser.add_argument('-e', '--files_estimate', dest='files_estimate', type=int, \ - default=DEFAULT_NUM_PARTITIONS) parser.add_argument('-o', '--output_prefix', dest='output_prefix', default=DEFAULT_OUT_PREF) parser.add_argument('-m', '--max_buffers', dest='max_buffers', type=int, \ @@ -200,10 +198,11 @@ def main(): max_buffers = args.max_buffers output_pref = args.output_prefix buf_size = args.buffer_size - est = args.files_estimate + max_reads = args.max_reads + input_files = args.input_files - output_buffer = ReadBufferManager(max_buffers, buf_size, est, output_pref, outdir) + output_buffer = ReadBufferManager(max_buffers, max_reads, buf_size, output_pref, outdir) # consume the partitioned fasta with which to label the graph ht = khmer.new_hashbits(K, HT_SIZE, N_HT) From d79a5ba7a130527f1a68217d73ed383c0bfdc1fb Mon Sep 17 00:00:00 2001 From: CS Welcher Date: Wed, 4 Dec 2013 12:45:35 -0500 Subject: [PATCH 099/140] fixed env line --- scripts/sweep-reads-by-partition-buffered.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py index d7d6fe0f7f..45046ef0ac 100755 --- a/scripts/sweep-reads-by-partition-buffered.py +++ b/scripts/sweep-reads-by-partition-buffered.py @@ -1,4 +1,4 @@ -#! 
/usr/bin/python +#!/usr/bin/env python # # This file is part of khmer, http://github.com/ged-lab/khmer/, and is # Copyright (C) Michigan State University, 2009-2013. It is licensed under From 7d80ee97d30f1caf7229d3940fc2af43d404782f Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Wed, 4 Dec 2013 17:25:34 -0500 Subject: [PATCH 100/140] fixed derped up merge from partition_on_abundance, properly 3 way merged that branch along with master --- khmer/_khmermodule.cc | 330 ++++++++++++++++++++++------------------- tests/test_filter.py | 26 ---- tests/test_hashbits.py | 1 + 3 files changed, 181 insertions(+), 176 deletions(-) diff --git a/khmer/_khmermodule.cc b/khmer/_khmermodule.cc index 7aaf268c1f..f8f7cb39b3 100644 --- a/khmer/_khmermodule.cc +++ b/khmer/_khmermodule.cc @@ -2608,26 +2608,6 @@ static PyObject * hashbits_repartition_largest_partition(PyObject * self, PyObje return PyInt_FromLong(next_largest); } -static PyObject * hashbits_hitraverse_to_stoptags(PyObject * self, PyObject * args) -{ - khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; - khmer::Hashbits * hashbits = me->hashbits; - - PyObject * counting_o = NULL; - unsigned int cutoff = 0; - const char * filename = NULL; - - if (!PyArg_ParseTuple(args, "sOI", &filename, &counting_o, &cutoff)) { - return NULL; - } - - khmer::CountingHash * counting = ((khmer_KCountingHashObject *) counting_o)->counting; - - hashbits->hitraverse_to_stoptags(filename, *counting, cutoff); - - Py_RETURN_NONE; -} - static PyObject * hashbits_get(PyObject * self, PyObject * args) { khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; @@ -2695,64 +2675,6 @@ static PyObject * hashbits_kmer_degree(PyObject * self, PyObject * args) return PyInt_FromLong(hashbits->kmer_degree(kmer_s)); } -static PyObject * hashbits_trim_on_degree(PyObject * self, PyObject * args) -{ - khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; - khmer::Hashbits * hashbits = me->hashbits; - - const char * seq = NULL; - unsigned 
int max_degree = 0; - - if (!PyArg_ParseTuple(args, "sI", &seq, &max_degree)) { - return NULL; - } - - unsigned int trim_at; - Py_BEGIN_ALLOW_THREADS - - trim_at = hashbits->trim_on_degree(seq, max_degree); - - Py_END_ALLOW_THREADS; - - PyObject * trim_seq = PyString_FromStringAndSize(seq, trim_at); - if (trim_seq == NULL) { - return NULL; - } - PyObject * ret = Py_BuildValue("OI", trim_seq, trim_at); - Py_DECREF(trim_seq); - - return ret; -} - -static PyObject * hashbits_trim_on_sodd(PyObject * self, PyObject * args) -{ - khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; - khmer::Hashbits * hashbits = me->hashbits; - - const char * seq = NULL; - unsigned int max_sodd = 0; - - if (!PyArg_ParseTuple(args, "sI", &seq, &max_sodd)) { - return NULL; - } - - unsigned int trim_at; - Py_BEGIN_ALLOW_THREADS - - trim_at = hashbits->trim_on_sodd(seq, max_sodd); - - Py_END_ALLOW_THREADS; - - PyObject * trim_seq = PyString_FromStringAndSize(seq, trim_at); - if (trim_seq == NULL) { - return NULL; - } - PyObject * ret = Py_BuildValue("OI", trim_seq, trim_at); - Py_DECREF(trim_seq); - - return ret; -} - static PyObject * hashbits_trim_on_stoptags(PyObject * self, PyObject * args) { khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; @@ -2808,12 +2730,6 @@ static PyObject * hashbits_identify_stoptags_by_position(PyObject * self, PyObje return x; } -void free_subset_partition_info(void * p) -{ - khmer::SubsetPartition * subset_p = (khmer::SubsetPartition *) p; - delete subset_p; -} - static PyObject * hashbits_do_subset_partition(PyObject * self, PyObject * args) { khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; @@ -2970,30 +2886,6 @@ static PyObject * hashbits_consume_fasta_with_reads_parser( return Py_BuildValue("IK", total_reads, n_consumed); } -static PyObject * hashbits_traverse_from_reads(PyObject * self, PyObject * args) -{ - khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; - khmer::Hashbits * hashbits = me->hashbits; - - const 
char * filename; - unsigned int radius, big_threshold, transfer_threshold; - PyObject * counting_o = NULL; - - if (!PyArg_ParseTuple(args, "sIIIO", &filename, - &radius, &big_threshold, &transfer_threshold, - &counting_o)) { - return NULL; - } - - khmer::CountingHash * counting = ((khmer_KCountingHashObject *) counting_o)->counting; - - hashbits->traverse_from_reads(filename, radius, big_threshold, - transfer_threshold, *counting); - - - Py_RETURN_NONE; -} - static PyObject * hashbits_consume_fasta_and_traverse(PyObject * self, PyObject * args) { khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; @@ -3139,12 +3031,6 @@ static PyObject * hashbits_consume_partitioned_fasta(PyObject * self, PyObject * return Py_BuildValue("IK", total_reads, n_consumed); } -void free_pre_partition_info(void * p) -{ - _pre_partition_info * ppi = (_pre_partition_info *) p; - delete ppi; -} - static PyObject * hashbits_find_all_tags(PyObject * self, PyObject *args) { khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; @@ -3854,37 +3740,6 @@ static PyObject * hashbits_count_kmers_on_radius(PyObject * self, PyObject * arg return PyLong_FromUnsignedLong(n); } -static PyObject * hashbits_trim_on_density_explosion(PyObject * self, PyObject * args) -{ - khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; - khmer::Hashbits * hashbits = me->hashbits; - - const char * seq = NULL; - unsigned int radius = 0; - unsigned int max_volume = 0; - - if (!PyArg_ParseTuple(args, "sII", &seq, &radius, &max_volume)) { - return NULL; - } - - unsigned int trim_at; - Py_BEGIN_ALLOW_THREADS - - trim_at = hashbits->trim_on_density_explosion(seq, radius, max_volume); - - Py_END_ALLOW_THREADS; - - PyObject * trim_seq = PyString_FromStringAndSize(seq, trim_at); - if (trim_seq == NULL) { - return NULL; - } - - PyObject * ret = Py_BuildValue("OI", trim_seq, trim_at); - Py_DECREF(trim_seq); - - return ret; -} - static PyObject * hashbits_find_radius_for_volume(PyObject * self, PyObject * args) 
{ khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; @@ -4273,11 +4128,8 @@ static PyMethodDef khmer_hashbits_methods[] = { { "get", hashbits_get, METH_VARARGS, "Get the count for the given k-mer" }, { "calc_connected_graph_size", hashbits_calc_connected_graph_size, METH_VARARGS, "" }, { "kmer_degree", hashbits_kmer_degree, METH_VARARGS, "" }, - { "trim_on_degree", hashbits_trim_on_degree, METH_VARARGS, "" }, - { "trim_on_sodd", hashbits_trim_on_sodd, METH_VARARGS, "" }, { "trim_on_stoptags", hashbits_trim_on_stoptags, METH_VARARGS, "" }, { "identify_stoptags_by_position", hashbits_identify_stoptags_by_position, METH_VARARGS, "" }, - { "trim_on_density_explosion", hashbits_trim_on_density_explosion, METH_VARARGS, "" }, { "do_subset_partition", hashbits_do_subset_partition, METH_VARARGS, "" }, { "find_all_tags", hashbits_find_all_tags, METH_VARARGS, "" }, { "assign_partition_id", hashbits_assign_partition_id, METH_VARARGS, "" }, @@ -4304,7 +4156,6 @@ static PyMethodDef khmer_hashbits_methods[] = { { "consume_fasta_and_tag", hashbits_consume_fasta_and_tag, METH_VARARGS, "Count all k-mers in a given file" }, { "consume_fasta_and_tag_with_reads_parser", hashbits_consume_fasta_and_tag_with_reads_parser, METH_VARARGS, "Count all k-mers using a given reads parser" }, - { "traverse_from_reads", hashbits_traverse_from_reads, METH_VARARGS, "" }, { "consume_fasta_and_traverse", hashbits_consume_fasta_and_traverse, METH_VARARGS, "" }, { "consume_fasta_and_tag_with_stoptags", hashbits_consume_fasta_and_tag_with_stoptags, METH_VARARGS, "Count all k-mers in a given file" }, { "consume_partitioned_fasta", hashbits_consume_partitioned_fasta, METH_VARARGS, "Count all k-mers in a given file" }, @@ -4326,7 +4177,6 @@ static PyMethodDef khmer_hashbits_methods[] = { { "count_kmers_within_radius", hashbits_count_kmers_within_radius, METH_VARARGS, "" }, { "count_kmers_on_radius", hashbits_count_kmers_on_radius, METH_VARARGS, "" }, { "find_radius_for_volume", 
hashbits_find_radius_for_volume, METH_VARARGS, "" }, - { "hitraverse_to_stoptags", hashbits_hitraverse_to_stoptags, METH_VARARGS, "" }, { "traverse_from_tags", hashbits_traverse_from_tags, METH_VARARGS, "" }, { "repartition_largest_partition", hashbits_repartition_largest_partition, METH_VARARGS, "" }, { "get_median_count", hashbits_get_median_count, METH_VARARGS, "Get the median, average, and stddev of the k-mer counts in the string" }, @@ -4348,6 +4198,174 @@ khmer_hashbits_getattr(PyObject * obj, char * name) return Py_FindMethod(khmer_hashbits_methods, obj, name); } +//////////////////////////////////////////////////////////////////////////// + +static PyObject * subset_count_partitions(PyObject * self, + PyObject * args) +{ + khmer_KSubsetPartitionObject * me = (khmer_KSubsetPartitionObject *) self; + khmer::SubsetPartition * subset_p = me->subset; + + if (!PyArg_ParseTuple(args, "")) { + return NULL; + } + + unsigned int n_partitions = 0, n_unassigned = 0; + subset_p->count_partitions(n_partitions, n_unassigned); + + return Py_BuildValue("ii", n_partitions, n_unassigned); +} + +static PyObject * subset_report_on_partitions(PyObject * self, + PyObject * args) +{ + khmer_KSubsetPartitionObject * me = (khmer_KSubsetPartitionObject *) self; + khmer::SubsetPartition * subset_p = me->subset; + + if (!PyArg_ParseTuple(args, "")) { + return NULL; + } + + subset_p->report_on_partitions(); + + Py_INCREF(Py_None); + return Py_None; +} + +static PyObject * subset_compare_partitions(PyObject * self, + PyObject * args) +{ + khmer_KSubsetPartitionObject * me = (khmer_KSubsetPartitionObject *) self; + khmer::SubsetPartition * subset1_p = me->subset; + + PyObject * subset2_obj = NULL; + unsigned int pid1, pid2; // @CTB ensure that these are unsigned? 
+ + if (!PyArg_ParseTuple(args, "iOi", + &pid1, &subset2_obj, &pid2)) { + return NULL; + } + + khmer_KSubsetPartitionObject *other = (khmer_KSubsetPartitionObject *) subset2_obj; + khmer::SubsetPartition * subset2_p = other->subset; + + unsigned int n_only1 = 0, n_only2 = 0, n_shared = 0; + subset1_p->compare_to_partition((PartitionID) pid1, + subset2_p, (PartitionID) pid2, + n_only1, n_only2, n_shared); + + return Py_BuildValue("iii", n_only1, n_only2, n_shared); +} + +static PyObject * subset_partition_size_distribution(PyObject * self, + PyObject * args) +{ + khmer_KSubsetPartitionObject * me = (khmer_KSubsetPartitionObject *) self; + khmer::SubsetPartition * subset_p = me->subset; + + if (!PyArg_ParseTuple(args, "")) { + return NULL; + } + + khmer::PartitionCountDistribution d; + + unsigned int n_unassigned = 0; + subset_p->partition_size_distribution(d, n_unassigned); + + PyObject * x = PyList_New(d.size()); + khmer::PartitionCountDistribution::const_iterator di; + + unsigned int i; + for (i = 0, di = d.begin(); di != d.end(); di++, i++) { + PyList_SET_ITEM(x, i, Py_BuildValue("LL", di->first, di->second)); + } + assert (i == d.size()); + + return Py_BuildValue("Oi", x, n_unassigned); +} + +static PyObject * subset_partition_sizes(PyObject * self, + PyObject * args) +{ + khmer_KSubsetPartitionObject * me = (khmer_KSubsetPartitionObject *) self; + khmer::SubsetPartition * subset_p = me->subset; + + unsigned int min_size = 0; + + if (!PyArg_ParseTuple(args, "|i", &min_size)) { + return NULL; + } + + khmer::PartitionCountMap cm; + unsigned int n_unassigned = 0; + subset_p->partition_sizes(cm, n_unassigned); + + unsigned int i; + khmer::PartitionCountMap::const_iterator mi; + for (i = 0, mi = cm.begin(); mi != cm.end(); mi++) { + if (mi->second >= min_size) i++; + } + + PyObject * x = PyList_New(i); + + // this should probably be a dict. 
@CTB + for (i = 0, mi = cm.begin(); mi != cm.end(); mi++) { + if (mi->second >= min_size) { + PyList_SET_ITEM(x, i, Py_BuildValue("LL", mi->first, mi->second)); + i++; + } + } + + return Py_BuildValue("Oi", x, n_unassigned); +} + +static PyObject * subset_partition_average_coverages(PyObject * self, + PyObject * args) +{ + khmer_KSubsetPartitionObject * me = (khmer_KSubsetPartitionObject *) self; + khmer::SubsetPartition * subset_p = me->subset; + + PyObject * counting_o; + + if (!PyArg_ParseTuple(args, "O", &counting_o)) { + return NULL; + } + + khmer::CountingHash * counting = ((khmer_KCountingHashObject *) counting_o)->counting; + + khmer::PartitionCountMap cm; + subset_p->partition_average_coverages(cm, counting); + + unsigned int i; + khmer::PartitionCountMap::const_iterator mi; + + PyObject * x = PyList_New(cm.size()); + + // this should probably be a dict. @CTB + for (i = 0, mi = cm.begin(); mi != cm.end(); mi++, i++) { + PyList_SET_ITEM(x, i, Py_BuildValue("LL", mi->first, mi->second)); + } + + return Py_BuildValue("O", x); +} + +static PyMethodDef khmer_subset_methods[] = { + { "count_partitions", subset_count_partitions, METH_VARARGS, "" }, + { "report_on_partitions", subset_report_on_partitions, METH_VARARGS, "" }, + { "compare_partitions", subset_compare_partitions, METH_VARARGS, "" }, + { "partition_size_distribution", subset_partition_size_distribution, METH_VARARGS, "" }, + { "partition_sizes", subset_partition_sizes, METH_VARARGS, "" }, + { "partition_average_coverages", subset_partition_average_coverages, METH_VARARGS, "" }, + {NULL, NULL, 0, NULL} /* sentinel */ +}; + +static PyObject * +khmer_subset_getattr(PyObject * obj, char * name) +{ + return Py_FindMethod(khmer_subset_methods, obj, name); +} + + // // GRAPHALIGN addition // @@ -4589,6 +4607,18 @@ static void khmer_hashbits_dealloc(PyObject* self) PyObject_Del((PyObject *) obj); } +// +// khmer_subset_dealloc -- clean up a hashbits object. 
+// + +static void khmer_subset_dealloc(PyObject* self) +{ + khmer_KSubsetPartitionObject * obj = (khmer_KSubsetPartitionObject *) self; + delete obj->subset; + obj->subset = NULL; + + PyObject_Del((PyObject *) obj); +} ////////////////////////////// // standalone functions diff --git a/tests/test_filter.py b/tests/test_filter.py index 6f85c52fd1..f832fb9860 100644 --- a/tests/test_filter.py +++ b/tests/test_filter.py @@ -44,29 +44,3 @@ def test_abund(self): assert ['1'] * (114 - 10 + 1) == output fd.close() - -@attr('highmem') -def test_filter_sodd(): - K = 32 - HASHTABLE_SIZE = int(8e7) - N_HT = 4 - MAX_SODD = 3 - - ht = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT) - filename = utils.get_test_data('../../data/high-sodd.fa') - - ht.consume_fasta(filename) - - seq = "CGTTAGTTGCGGTGCCGACCGGCAAACTTGGTTTTGCCAAAAATTTTTACAGTTAGAAATTATTC" \ - "ACAAAGTTGCACCGGAATTCGGTTACAAACGTCATTCTAACTAAT" - trim_seq, trim_at = ht.trim_on_sodd(seq, MAX_SODD) - assert trim_seq == "CGTTAGTTGCGGTGCCGACCGGCAAACTTGGT" - - seq = "ACAAAATTCCACATATAGTCATAATTGTGGGCAATTTTCGTCCCAAATTAGTTAGAATGACGTTT" \ - "GTAACCGAATTCCGGTGCAACTTTGTGAATAATTTCTAACTGTAAAAAT" - trim_seq, trim_at = ht.trim_on_sodd(seq, MAX_SODD) - assert trim_seq == "ACAAAATTCCACATATAGTCATAATTGTGGGCAATT" - - seq = "GCACGCAGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTG" - trim_seq, trim_at = ht.trim_on_sodd(seq, MAX_SODD) - assert trim_seq == seq diff --git a/tests/test_hashbits.py b/tests/test_hashbits.py index b7e4e01e65..2456d8d788 100644 --- a/tests/test_hashbits.py +++ b/tests/test_hashbits.py @@ -6,6 +6,7 @@ import khmer from screed.fasta import fasta_iter +import screed import khmer_tst_utils as utils from nose.plugins.attrib import attr From 72b945ee85a0e8b833b6ccac636059e58dfe1aec Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Wed, 4 Dec 2013 18:11:32 -0500 Subject: [PATCH 101/140] fixed error in buffer flushing --- scripts/sweep-reads-by-partition-buffered.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py index 59e75ac7a2..8e2cd7c6d9 100755 --- a/scripts/sweep-reads-by-partition-buffered.py +++ b/scripts/sweep-reads-by-partition-buffered.py @@ -39,7 +39,7 @@ MIN_KSIZE=21 def fmt_fasta(name, seq, labels=[]): - return '>{name}\t{labels}\n{seq}'.format(name=name, + return '>{name}\t{labels}\n{seq}\n'.format(name=name, labels='\t'.join([str(l) for l in labels]), seq=seq) def write_seq(fp, name, seq, labels=[]): @@ -67,7 +67,7 @@ def push(self, seq_str): self.buf.append(seq_str) def flush(self): - return '\n'.join(self.buf) + return ''.join(self.buf) def is_full(self, full): if len(self.buf) >= full: From 36b6a807b28da11e46a53e1f3a4041938acf232f Mon Sep 17 00:00:00 2001 From: Chris Welcher Date: Tue, 10 Dec 2013 16:43:06 -0500 Subject: [PATCH 102/140] beginning subclassing of label stuff --- lib/labelhash.hh | 149 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 lib/labelhash.hh diff --git a/lib/labelhash.hh b/lib/labelhash.hh new file mode 100644 index 0000000000..670b9f1c34 --- /dev/null +++ b/lib/labelhash.hh @@ -0,0 +1,149 @@ +// +// This file is part of khmer, http://github.com/ged-lab/khmer/, and is +// Copyright (C) Michigan State University, 2009-2013. It is licensed under +// the three-clause BSD license; see doc/LICENSE.txt. Contact: ctb@msu.edu +// + +#ifndef LABELHASH_HH +#define LABELHASH_HH + +#include "khmer.hh" + +namespace khmer { + + class LabelHash : public khmer:Hashbits { + protected: + LabelHash( WordLength ksize, std::vector ret; + ret = cmap.equal_range(kmer); + for (TagLabelPtrMap::const_iterator it=ret.first; it!=ret.second; ++it) { + if (*(it->second) == the_label) return true; + } + return false; + } + + // Does the given label already have a tag associated with it? 
+ bool _cmap_contains_tag(const LabelTagPtrMap& cmap, + Label& the_label, + HashIntoType& kmer) { + std::pair ret; + ret = cmap.equal_range(the_label); + for (LabelTagPtrMap::const_iterator it=ret.first; it!=ret.second; ++it) { + if(*(it->second) == kmer) return true; + } + return false; + } + + unsigned int _get_tag_labels(const HashIntoType& tag, + const TagLabelPtrMap& cmap, + LabelPtrSet& found_labels) { + unsigned int num_labels = 0; + std::pair ret; + ret = cmap.equal_range(tag); + for (TagLabelPtrMap::const_iterator it=ret.first; it!=ret.second; ++it) { + found_labels.insert(it->second); + ++num_labels; + } + return num_labels; + } + + unsigned int _get_tags_from_label(const Label& label, + const LabelTagPtrMap& cmap, + TagPtrSet& labeled_tags) { + unsigned int num_tags = 0; + std::pair ret; + ret = cmap.equal_range(label); + for (LabelTagPtrMap::const_iterator it=ret.first; it!=ret.second; ++it) { + labeled_tags.insert(it->second); + ++num_tags; + } + return num_tags; + } + + uint32_t _tag_labels_spin_lock; + + public: + TagLabelPtrMap tag_labels; + LabelTagPtrMap label_tag_ptrs; + LabelPtrMap label_ptrs; + + unsigned int n_labels() const { return label_ptrs.size(); } + + + Label * check_and_allocate_label(Label new_label) { + Label * c; + if (label_ptrs.count(new_label)) { + c = label_ptrs[new_label]; + } else { + c = new Label(new_label); + label_ptrs[*c] = c; + } + return c; + } + void consume_fasta_and_tag_with_labels( + std::string const &filename, + unsigned int &total_reads, + unsigned long long &n_consumed, + CallbackFn callback = NULL, + void * callback_data = NULL); + + void consume_fasta_and_tag_with_labels( + read_parsers:: IParser * parser, + unsigned int &total_reads, + unsigned long long &n_consumed, + CallbackFn callback = NULL, + void * callback_data = NULL); + + void consume_partitioned_fasta_and_tag_with_labels(const std::string &filename, + unsigned int &total_reads, + unsigned long long &n_consumed, + CallbackFn callback = NULL, + void 
* callback_datac = NULL); + + void consume_sequence_and_tag_with_labels(const std::string& seq, + unsigned long long& n_consumed, + Label& current_label, + SeenSet * new_tags = 0); + + LabelPtrSet get_tag_labels(const HashIntoType& tag); + TagPtrSet get_label_tags(const Label& label); + + void link_tag_and_label(HashIntoType& kmer, Label& label); + + unsigned int sweep_sequence_for_labels(const std::string& seq, + LabelPtrSet& found_labels, + bool break_on_stoptags, + bool stop_big_traversals); + + unsigned int sweep_label_neighborhood(const std::string & seq, + LabelPtrSet& found_labels, + unsigned int range, + bool break_on_stoptags, + bool stop_big_traversals); + + void traverse_labels_and_resolve(const SeenSet& tagged_kmers, + LabelPtrSet& found_labels); + + } +} + +#define ACQUIRE_TAG_COLORS_SPIN_LOCK \ + while(!__sync_bool_compare_and_swap( &_tag_labels_spin_lock, 0, 1)); + +#define RELEASE_TAG_COLORS_SPIN_LOCK \ + __sync_bool_compare_and_swap( &_tag_labels_spin_lock, 1, 0); + +#endif LABELHASH_HH From 902cc2b0fe62c2f06f0ce190cdd42d532469d85f Mon Sep 17 00:00:00 2001 From: CS Welcher Date: Wed, 11 Dec 2013 01:31:02 -0500 Subject: [PATCH 103/140] added labelhash.cc --- lib/khmer.hh | 3 +- lib/labelhash.cc | 357 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 359 insertions(+), 1 deletion(-) create mode 100644 lib/labelhash.cc diff --git a/lib/khmer.hh b/lib/khmer.hh index 9e11a65f6d..f0fc8d134c 100644 --- a/lib/khmer.hh +++ b/lib/khmer.hh @@ -84,7 +84,8 @@ namespace khmer { typedef std::map TagCountMap; typedef std::map PartitionCountMap; typedef std::map PartitionCountDistribution; - + + // types used in @camillescott's sparse labeling extension typedef unsigned long long int Label; typedef std::multimap TagLabelPtrMap; typedef std::multimap LabelTagPtrMap; diff --git a/lib/labelhash.cc b/lib/labelhash.cc new file mode 100644 index 0000000000..de57042695 --- /dev/null +++ b/lib/labelhash.cc @@ -0,0 +1,357 @@ +// +// This file is part of 
khmer, http://github.com/ged-lab/khmer/, and is +// Copyright (C) Michigan State University, 2009-2013. It is licensed under +// the three-clause BSD license; see doc/LICENSE.txt. Contact: ctb@msu.edu +// + +#include "labelhash.hh" + +using namespace std; +using namespace khmer; +using namespace khmer:: read_parsers; + +/* + * @camillescott + * Might be time for a refactor: could do a general consume_fasta + * function which accepts a consume_sequence function pointer as a parameter + */ + +void +LabelHash::consume_fasta_and_tag_with_labels( + std:: string const &filename, + unsigned int &total_reads, unsigned long long &n_consumed, + CallbackFn callback, void * callback_data +) +{ + khmer:: Config &the_config = khmer:: get_active_config( ); + + // Note: Always assume only 1 thread if invoked this way. + IParser * parser = + IParser::get_parser( + filename, 1, the_config.get_reads_input_buffer_size( ), + the_config.get_reads_parser_trace_level( ) + ); + + + consume_fasta_and_tag_with_labels( + parser, + total_reads, n_consumed, + callback, callback_data + ); + + delete parser; +} + +void +LabelHash::consume_fasta_and_tag_with_labels( + read_parsers:: IParser * parser, + unsigned int &total_reads, unsigned long long &n_consumed, + CallbackFn callback, void * callback_data + ) + { + Hasher &hasher = + _get_hasher( parser->uuid( ) ); + unsigned int total_reads_LOCAL = 0; + #if (0) // Note: Used with callback - currently disabled. + unsigned long long int n_consumed_LOCAL = 0; + #endif + Read read; + + // TODO? Delete the following assignments. + total_reads = 0; + n_consumed = 0; + + hasher.trace_logger( + TraceLogger:: TLVL_DEBUG2, + "Starting trace of 'consume_fasta_and_tag_with_labels'....\n" + ); + + Label _tag_label = 0; + + Label * the_label; + // Iterate through the reads and consume their k-mers. 
+ while (!parser->is_complete( )) + { + unsigned long long this_n_consumed = 0; + + read = parser->get_next_read( ); + + if (check_and_normalize_read( read.sequence )) + { + // TODO: make threadsafe! + the_label = check_and_allocate_label(_tag_label); + consume_sequence_and_tag_with_labels( read.sequence, + this_n_consumed, + *the_label ); + _tag_label++; + + #ifdef WITH_INTERNAL_METRICS + hasher.pmetrics.start_timers( ); + #endif + #if (0) // Note: Used with callback - currently disabled. + n_consumed_LOCAL = __sync_add_and_fetch( &n_consumed, this_n_consumed ); + #else + __sync_add_and_fetch( &n_consumed, this_n_consumed ); + #endif + total_reads_LOCAL = __sync_add_and_fetch( &total_reads, 1 ); + #ifdef WITH_INTERNAL_METRICS + hasher.pmetrics.stop_timers( ); + hasher.pmetrics.accumulate_timer_deltas( + (uint32_t)HashTablePerformanceMetrics:: MKEY_TIME_UPDATE_TALLIES + ); + #endif + } + + if (0 == (total_reads_LOCAL % 10000)) + hasher.trace_logger( + TraceLogger:: TLVL_DEBUG3, + "Total number of reads processed: %llu\n", + (unsigned long long int)total_reads_LOCAL + ); + + // TODO: Figure out alternative to callback into Python VM + // Cannot use in multi-threaded operation. + #if (0) + // run callback, if specified + if (total_reads_TL % CALLBACK_PERIOD == 0 && callback) { + std::cout << "n tags: " << all_tags.size() << "\n"; + try { + callback("consume_fasta_and_tag_with_labels", callback_data, total_reads_TL, + n_consumed); + } catch (...) 
{ + delete parser; + throw; + } + } + #endif // 0 + + } // while reads left for parser + + } + +void LabelHash::consume_partitioned_fasta_and_tag_with_labels(const std::string &filename, + unsigned int &total_reads, + unsigned long long &n_consumed, + CallbackFn callback, + void * callback_data) +{ + total_reads = 0; + n_consumed = 0; + + IParser* parser = IParser::get_parser(filename.c_str()); + Read read; + + string seq = ""; + + // reset the master subset partition + delete partition; + partition = new SubsetPartition(this); + + // + // iterate through the FASTA file & consume the reads. + // + Label * c; + PartitionID p; + while(!parser->is_complete()) { + read = parser->get_next_read(); + seq = read.sequence; + + if (check_and_normalize_read(seq)) { + // First, figure out what the partition is (if non-zero), and save that. + p = _parse_partition_id(read.name); + c = check_and_allocate_label(p); + + consume_sequence_and_tag_with_labels( seq, + n_consumed, + *c ); + } + + // reset the sequence info, increment read number + total_reads++; + + // run callback, if specified + if (total_reads % CALLBACK_PERIOD == 0 && callback) { + try { + callback("consume_partitioned_fasta_and_tag_with_labels", callback_data, + total_reads, n_consumed); + } catch (...) { + delete parser; + throw; + } + } + } + + // @cswelcher TODO: check that deallocate LabelPtrMap is correct + delete parser; +} + +// @cswelcher: double-check -- is it valid to pull the address from a reference? 
+void LabelHash::link_tag_and_label(HashIntoType& kmer, Label& kmer_label) { + tag_labels.insert(TagLabelPtrPair(kmer, &kmer_label)); + label_tag_ptrs.insert(LabelTagPtrPair(kmer_label, &kmer)); +} + +void LabelHash::consume_sequence_and_tag_with_labels(const std::string& seq, + unsigned long long& n_consumed, + Label& current_label, + SeenSet * found_tags) + { + bool is_new_kmer; + bool kmer_tagged; + + KMerIterator kmers(seq.c_str(), _ksize); + HashIntoType kmer; + + unsigned int since = _tag_density / 2 + 1; + + while(!kmers.done()) { + kmer = kmers.next(); + + if ((is_new_kmer = test_and_set_bits( kmer ))) + ++n_consumed; + + #if (1) + if (is_new_kmer) { + ++since; + } else { + ACQUIRE_ALL_TAGS_SPIN_LOCK + kmer_tagged = set_contains(all_tags, kmer); + RELEASE_ALL_TAGS_SPIN_LOCK + if (kmer_tagged) { + since = 1; + + // Labeling code + // TODO: MAKE THREADSAFE! + + if (!_cmap_contains_label(tag_labels, kmer, current_label)) { + ACQUIRE_TAG_COLORS_SPIN_LOCK + link_tag_and_label(kmer, current_label); + RELEASE_TAG_COLORS_SPIN_LOCK + } + if (found_tags) { + found_tags->insert(kmer); + } + } else ++since; + } + #else + if (!is_new_kmer && set_contains(all_tags, kmer)) { + since = 1; + if (found_tags) { found_tags->insert(kmer); } + } else { + since++; + } + #endif + // + if (since >= _tag_density) { + ACQUIRE_ALL_TAGS_SPIN_LOCK + all_tags.insert(kmer); + RELEASE_ALL_TAGS_SPIN_LOCK + + // Labeling code + // TODO: MAKE THREADSAFE! + ACQUIRE_TAG_COLORS_SPIN_LOCK + link_tag_and_label(kmer, current_label); + RELEASE_TAG_COLORS_SPIN_LOCK + + if (found_tags) { found_tags->insert(kmer); } + since = 1; + } + + } // iteration over kmers + + if (since >= _tag_density/2 - 1) { + ACQUIRE_ALL_TAGS_SPIN_LOCK + all_tags.insert(kmer); // insert the last k-mer, too. + RELEASE_ALL_TAGS_SPIN_LOCK + + // Label code: TODO: MAKE THREADSAFE! 
+ link_tag_and_label(kmer, current_label); + + if (found_tags) { found_tags->insert(kmer); } + } + } +/* + * Find all labels associated with the sequence + * For now, check /every/ k-mer with find_all_tags + * THIS SUCKS AND IT'S YOUR FAULT @CTB + */ +unsigned int LabelHash::sweep_sequence_for_labels(const std::string& seq, + LabelPtrSet& found_labels, + bool break_on_stoptags, + bool stop_big_traversals) { + + SeenSet tagged_kmers; + //LabelPtrSet found_labels; + + HashIntoType kmer_f, kmer_r, kmer; + + KMerIterator kmers(seq.c_str(), _ksize); + std::string kmer_s; + // keep a list of kmers which have already been traversed + SeenSet traversed_kmers; + while (!kmers.done()) { + kmer = kmers.next(); + kmer_s = _revhash(kmer, _ksize); + _hash(kmer_s.c_str(), _ksize, kmer_f, kmer_r); + + // don't even try traversing from k-mers not in the hashtable + //traversed_kmers.clear(); + if (get_count(uniqify_rc(kmer_f,kmer_r))) { + partition->find_all_tags(kmer_f, kmer_r, tagged_kmers, + all_tags, break_on_stoptags, stop_big_traversals); + traverse_labels_and_resolve(tagged_kmers, found_labels); + } + } + return traversed_kmers.size(); +} + +unsigned int LabelHash::sweep_label_neighborhood(const std::string& seq, + LabelPtrSet& found_labels, + unsigned int range, + bool break_on_stoptags, + bool stop_big_traversals) { + + SeenSet tagged_kmers; + unsigned int num_traversed; + num_traversed = partition->sweep_for_tags(seq, tagged_kmers, all_tags, + range, break_on_stoptags, stop_big_traversals); + traverse_labels_and_resolve(tagged_kmers, found_labels); + //printf("range=%u ", range); + if (range == 0) { + assert(num_traversed == seq.length()-ksize()+1); + } + tagged_kmers.clear(); + return num_traversed; +} + +LabelPtrSet LabelHash::get_tag_labels(const HashIntoType& tag) { + LabelPtrSet labels; + unsigned int num_labels; + _get_tag_labels(tag, tag_labels, labels); + return labels; +} + +TagPtrSet LabelHash::get_label_tags(const Label& label) { + TagPtrSet tags; + unsigned 
int num_tags; + _get_tags_from_label(label, label_tag_ptrs, tags); + return tags; +} + +void LabelHash::traverse_labels_and_resolve(const SeenSet& tagged_kmers, + LabelPtrSet& found_labels) { + + SeenSet::const_iterator si; + unsigned int num_labels = 0; + for (si=tagged_kmers.begin(); si!=tagged_kmers.end(); ++si) { + HashIntoType tag = *si; + // get the labels associated with this tag + num_labels = _get_tag_labels(tag, tag_labels, found_labels); + if (num_labels > 1) { + // reconcile labels + // for now do nothing ha + } + } +} + + From 605c04e7fe62de19dfaf5aadfd9fc1d8b1eb170f Mon Sep 17 00:00:00 2001 From: Camille Scott Date: Wed, 11 Dec 2013 17:01:11 -0500 Subject: [PATCH 104/140] moved parse pid to read_parsers file, fixed syntax errors in labehash, added labelhash to setup.py --- lib/hashtable.cc | 20 -------------------- lib/labelhash.cc | 4 ++-- lib/labelhash.hh | 11 +++++++---- lib/read_parsers.cc | 1 - lib/read_parsers.hh | 24 ++++++++++++++++++++++-- 5 files changed, 31 insertions(+), 29 deletions(-) diff --git a/lib/hashtable.cc b/lib/hashtable.cc index c8c075143c..7f4ea47d90 100644 --- a/lib/hashtable.cc +++ b/lib/hashtable.cc @@ -723,26 +723,6 @@ void Hashtable::divide_tags_into_subsets(unsigned int subset_size, } } -static PartitionID _parse_partition_id(string name) -{ - PartitionID p = 0; - const char * s = name.c_str() + name.length() - 1; - assert(*(s + 1) == (unsigned int) NULL); - - while(*s != '\t' && s >= name.c_str()) { - s--; - } - - if (*s == '\t') { - p = (PartitionID) atoi(s + 1); - } else { - cerr << "consume_partitioned_fasta barfed on read " << name << "\n"; - assert(0); - } - - return p; -} - // // consume_partitioned_fasta: consume a FASTA file of reads // diff --git a/lib/labelhash.cc b/lib/labelhash.cc index de57042695..b3d9a15ea8 100644 --- a/lib/labelhash.cc +++ b/lib/labelhash.cc @@ -326,14 +326,14 @@ unsigned int LabelHash::sweep_label_neighborhood(const std::string& seq, LabelPtrSet LabelHash::get_tag_labels(const 
HashIntoType& tag) { LabelPtrSet labels; - unsigned int num_labels; + //unsigned int num_labels; _get_tag_labels(tag, tag_labels, labels); return labels; } TagPtrSet LabelHash::get_label_tags(const Label& label) { TagPtrSet tags; - unsigned int num_tags; + //unsigned int num_tags; _get_tags_from_label(label, label_tag_ptrs, tags); return tags; } diff --git a/lib/labelhash.hh b/lib/labelhash.hh index 670b9f1c34..960d64158d 100644 --- a/lib/labelhash.hh +++ b/lib/labelhash.hh @@ -7,13 +7,16 @@ #ifndef LABELHASH_HH #define LABELHASH_HH +#include + #include "khmer.hh" +#include "hashbits.hh" namespace khmer { - class LabelHash : public khmer:Hashbits { + class LabelHash : public khmer::Hashbits { protected: - LabelHash( WordLength ksize, std::vector& tablesizes) : khmer::Hashbits(ksize, tablesizes) { // constructor @@ -137,7 +140,7 @@ namespace khmer { void traverse_labels_and_resolve(const SeenSet& tagged_kmers, LabelPtrSet& found_labels); - } + }; } #define ACQUIRE_TAG_COLORS_SPIN_LOCK \ @@ -146,4 +149,4 @@ namespace khmer { #define RELEASE_TAG_COLORS_SPIN_LOCK \ __sync_bool_compare_and_swap( &_tag_labels_spin_lock, 1, 0); -#endif LABELHASH_HH +#endif diff --git a/lib/read_parsers.cc b/lib/read_parsers.cc index e9449d526d..7d0f1aef56 100644 --- a/lib/read_parsers.cc +++ b/lib/read_parsers.cc @@ -1936,7 +1936,6 @@ _is_valid_read_pair( == the_read_pair.second.name.substr( 0, match_1.rm_so )); } - } // namespace read_parsers diff --git a/lib/read_parsers.hh b/lib/read_parsers.hh index c2a07f90c5..e8ca7e7968 100644 --- a/lib/read_parsers.hh +++ b/lib/read_parsers.hh @@ -10,7 +10,7 @@ #include #include - +#include #include #include @@ -28,7 +28,6 @@ extern "C" #include "trace_logger.hh" #include "perf_metrics.hh" - namespace khmer { @@ -544,6 +543,27 @@ struct FastqParser : public IParser }; +static PartitionID _parse_partition_id(std::string name) +{ + PartitionID p = 0; + const char * s = name.c_str() + name.length() - 1; + assert(*(s + 1) == (unsigned int) NULL); + + 
while(*s != '\t' && s >= name.c_str()) { + s--; + } + + if (*s == '\t') { + p = (PartitionID) atoi(s + 1); + } else { + std::cerr << "consume_partitioned_fasta barfed on read " << name << "\n"; + assert(0); + } + + return p; +} + + } // namespace read_parsers From 4127089cb8b4c44a809f6001b4b548aba1a1cabc Mon Sep 17 00:00:00 2001 From: Camille Scott Date: Thu, 12 Dec 2013 03:37:38 -0500 Subject: [PATCH 105/140] added labelhash defs to khmermodule, started stripping labeling methods from hashbits, definitely fails tests --- khmer/_khmermodule.cc | 819 ++++++++++++++++++++++++------------------ 1 file changed, 474 insertions(+), 345 deletions(-) diff --git a/khmer/_khmermodule.cc b/khmer/_khmermodule.cc index f8f7cb39b3..cb85fad1cc 100644 --- a/khmer/_khmermodule.cc +++ b/khmer/_khmermodule.cc @@ -21,6 +21,7 @@ #include "hashbits.hh" #include "storage.hh" #include "aligner.hh" +#include "labelhash.hh" // // Function necessary for Python loading: @@ -3853,265 +3854,6 @@ static PyObject * hashbits_get_median_count(PyObject * self, PyObject * args) return Py_BuildValue("iff", med, average, stddev); } -static PyObject * hashbits_get_label_dict(PyObject * self, PyObject * args) { - khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; - khmer::Hashbits * hb = me->hashbits; - - PyObject * d = PyDict_New(); - khmer::LabelPtrMap::iterator it; - - for (it = hb->label_ptrs.begin(); it!=hb->label_ptrs.end(); ++it) { - PyDict_SetItem(d, Py_BuildValue("K", it->first), Py_BuildValue("K", it->second)); - } - - return d; -} - -static PyObject * hashbits_consume_fasta_and_tag_with_labels(PyObject * self, PyObject * args) -{ - khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; - khmer::Hashbits * hb = me->hashbits; - - std::ofstream outfile; - - char * filename; - PyObject * callback_obj = NULL; - - if (!PyArg_ParseTuple(args, "s|O", &filename, &callback_obj)) { - return NULL; - } - - unsigned long long n_consumed; - unsigned int total_reads; - bool exc_raised = false; 
- - //Py_BEGIN_ALLOW_THREADS - try { - hb->consume_fasta_and_tag_with_labels(filename, total_reads, n_consumed, - _report_fn, callback_obj); - } catch (_khmer_signal &e) { - exc_raised = true; - } - //Py_END_ALLOW_THREADS - if (exc_raised) return NULL; - - return Py_BuildValue("iL", total_reads, n_consumed); - -} - -static PyObject * hashbits_consume_partitioned_fasta_and_tag_with_labels( - PyObject * self, PyObject * args) -{ - khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; - khmer::Hashbits * hashbits = me->hashbits; - - char * filename; - PyObject * callback_obj = NULL; - - if (!PyArg_ParseTuple(args, "s|O", &filename, &callback_obj)) { - return NULL; - } - - // call the C++ function, and trap signals => Python - - unsigned long long n_consumed; - unsigned int total_reads; - - try { - hashbits->consume_partitioned_fasta_and_tag_with_labels(filename, - total_reads, n_consumed, _report_fn, callback_obj); - } catch (_khmer_signal &e) { - return NULL; - } - - return Py_BuildValue("iK", total_reads, n_consumed); -} - -static PyObject * hashbits_consume_sequence_and_tag_with_labels(PyObject * self, PyObject * args) { - khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; - khmer::Hashbits * hb = me->hashbits; - - char * seq = NULL; - unsigned long long c = NULL; - if (!PyArg_ParseTuple(args, "sK", &seq, &c)) { - return NULL; - } - - unsigned long long n_consumed = 0; - khmer::Label * the_label = hb->check_and_allocate_label(c); - - try { - //if (hb->check_and_normalize_read(seq)) { - - hb->consume_sequence_and_tag_with_labels(seq, n_consumed, *the_label); - //} - } catch (_khmer_signal &e) { - return NULL; - } - return Py_BuildValue("L", n_consumed); -} - -static PyObject * hashbits_sweep_label_neighborhood(PyObject * self, PyObject * args) { - khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; - khmer::Hashbits * hb = me->hashbits; - - char * seq = NULL; - unsigned int r = NULL; - PyObject * break_on_stop_tags_o = NULL; - PyObject * 
stop_big_traversals_o = NULL; - - if (!PyArg_ParseTuple(args, "s|iOO", &seq, &r, - &break_on_stop_tags_o, - &stop_big_traversals_o)) { - return NULL; - } - - unsigned int range = (2 * hb->_get_tag_density()) + 1; - if (r >= 0) { - range = r; - } - - bool break_on_stop_tags = false; - if (break_on_stop_tags_o && PyObject_IsTrue(break_on_stop_tags_o)) { - break_on_stop_tags = true; - } - bool stop_big_traversals = false; - if (stop_big_traversals_o && PyObject_IsTrue(stop_big_traversals_o)) { - stop_big_traversals = true; - } - - if (strlen(seq) < hb->ksize()) { - return NULL; - } - - //std::pair ret; - LabelPtrSet found_labels; - - bool exc_raised = false; - unsigned int num_traversed = 0; - //Py_BEGIN_ALLOW_THREADS - try { - num_traversed = hb->sweep_label_neighborhood(seq, found_labels, range, break_on_stop_tags, stop_big_traversals); - } catch (_khmer_signal &e) { - exc_raised = true; - } - //Py_END_ALLOW_THREADS - - //printf("...%u kmers traversed\n", num_traversed); - - if (exc_raised) return NULL; - - PyObject * x = PyList_New(found_labels.size()); - khmer::LabelPtrSet::const_iterator si; - unsigned long long i = 0; - for (si=found_labels.begin(); si!=found_labels.end(); ++si) { - PyList_SET_ITEM(x, i, Py_BuildValue("K", *(*si))); - i++; - } - - return x; -} - - -// Similar to find_all_tags, but returns tags in a way actually useable by python -// need a tags_in_sequence iterator or function in c++ land for reuse in all -// these functions -static PyObject * hashbits_sweep_tag_neighborhood(PyObject * self, PyObject *args) -{ - khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; - khmer::Hashbits * hashbits = me->hashbits; - - char * seq = NULL; - unsigned int r = NULL; - PyObject * break_on_stop_tags_o = NULL; - PyObject * stop_big_traversals_o = NULL; - - if (!PyArg_ParseTuple(args, "s|iOO", &seq, &r, - &break_on_stop_tags_o, - &stop_big_traversals_o)) { - return NULL; - } - - unsigned int range = (2 * hashbits->_get_tag_density()) + 1; - if (r >= 0) 
{ - range = r; - } - - bool break_on_stop_tags = false; - if (break_on_stop_tags_o && PyObject_IsTrue(break_on_stop_tags_o)) { - break_on_stop_tags = true; - } - bool stop_big_traversals = false; - if (stop_big_traversals_o && PyObject_IsTrue(stop_big_traversals_o)) { - stop_big_traversals = true; - } - - if (strlen(seq) < hashbits->ksize()) { - return NULL; - } - - khmer::SeenSet tagged_kmers; - - //Py_BEGIN_ALLOW_THREADS - - hashbits->partition->sweep_for_tags(seq, tagged_kmers, - hashbits->all_tags, range, break_on_stop_tags, stop_big_traversals); - - //Py_END_ALLOW_THREADS - - PyObject * x = PyList_New(tagged_kmers.size()); - khmer::SeenSet::const_iterator si; - unsigned long long i = 0; - for (si=tagged_kmers.begin(); si!=tagged_kmers.end(); ++si) { - //std::string kmer_s = _revhash(*si, hashbits->ksize()); - // type K for python unsigned long long - PyList_SET_ITEM(x, i, Py_BuildValue("K", *si)); - i++; - } - - return x; -} - - -static PyObject * hashbits_get_tag_labels(PyObject * self, PyObject * args) { - - khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; - khmer::Hashbits * hashbits = me->hashbits; - - khmer::HashIntoType tag; - - if (!PyArg_ParseTuple(args, "K", &tag)) { - return NULL; - } - - khmer::LabelPtrSet labels; - - labels = hashbits->get_tag_labels(tag); - - PyObject * x = PyList_New(labels.size()); - khmer::LabelPtrSet::const_iterator si; - unsigned long long i = 0; - for (si=labels.begin(); si!=labels.end(); ++si) { - //std::string kmer_s = _revhash(*si, hashbits->ksize()); - PyList_SET_ITEM(x, i, Py_BuildValue("K", *(*si))); - i++; - } - - return x; -} - -static PyObject * hashbits_n_labels(PyObject * self, PyObject * args) -{ - khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; - khmer::Hashbits * hashbits = me->hashbits; - - if (!PyArg_ParseTuple(args, "")) { - return NULL; - } - - return PyInt_FromLong(hashbits->n_labels()); -} - static PyMethodDef khmer_hashbits_methods[] = { { "extract_unique_paths", 
hashbits_extract_unique_paths, METH_VARARGS, "" }, { "ksize", hashbits_get_ksize, METH_VARARGS, "" }, @@ -4180,15 +3922,6 @@ static PyMethodDef khmer_hashbits_methods[] = { { "traverse_from_tags", hashbits_traverse_from_tags, METH_VARARGS, "" }, { "repartition_largest_partition", hashbits_repartition_largest_partition, METH_VARARGS, "" }, { "get_median_count", hashbits_get_median_count, METH_VARARGS, "Get the median, average, and stddev of the k-mer counts in the string" }, - { "consume_fasta_and_tag_with_labels", hashbits_consume_fasta_and_tag_with_labels, METH_VARARGS, "" }, - { "sweep_label_neighborhood", hashbits_sweep_label_neighborhood, METH_VARARGS, "" }, - {"consume_partitioned_fasta_and_tag_with_labels", hashbits_consume_partitioned_fasta_and_tag_with_labels, METH_VARARGS, "" }, - {"sweep_tag_neighborhood", hashbits_sweep_tag_neighborhood, METH_VARARGS, "" }, - {"get_tag_labels", hashbits_get_tag_labels, METH_VARARGS, ""}, - {"consume_sequence_and_tag_with_labels", hashbits_consume_sequence_and_tag_with_labels, METH_VARARGS, "" }, - {"n_labels", hashbits_n_labels, METH_VARARGS, ""}, - {"get_label_dict", hashbits_get_label_dict, METH_VARARGS, "" }, - {NULL, NULL, 0, NULL} /* sentinel */ }; @@ -4238,134 +3971,484 @@ static PyObject * subset_compare_partitions(PyObject * self, khmer_KSubsetPartitionObject * me = (khmer_KSubsetPartitionObject *) self; khmer::SubsetPartition * subset1_p = me->subset; - PyObject * subset2_obj = NULL; - unsigned int pid1, pid2; // @CTB ensure that these are unsigned? + PyObject * subset2_obj = NULL; + unsigned int pid1, pid2; // @CTB ensure that these are unsigned? 
+ + if (!PyArg_ParseTuple(args, "iOi", + &pid1, &subset2_obj, &pid2)) { + return NULL; + } + + khmer_KSubsetPartitionObject *other = (khmer_KSubsetPartitionObject *) subset2_obj; + khmer::SubsetPartition * subset2_p = other->subset; + + unsigned int n_only1 = 0, n_only2 = 0, n_shared = 0; + subset1_p->compare_to_partition((PartitionID) pid1, + subset2_p, (PartitionID) pid2, + n_only1, n_only2, n_shared); + + return Py_BuildValue("iii", n_only1, n_only2, n_shared); +} + +static PyObject * subset_partition_size_distribution(PyObject * self, + PyObject * args) +{ + khmer_KSubsetPartitionObject * me = (khmer_KSubsetPartitionObject *) self; + khmer::SubsetPartition * subset_p = me->subset; + + if (!PyArg_ParseTuple(args, "")) { + return NULL; + } + + khmer::PartitionCountDistribution d; + + unsigned int n_unassigned = 0; + subset_p->partition_size_distribution(d, n_unassigned); + + PyObject * x = PyList_New(d.size()); + khmer::PartitionCountDistribution::const_iterator di; + + unsigned int i; + for (i = 0, di = d.begin(); di != d.end(); di++, i++) { + PyList_SET_ITEM(x, i, Py_BuildValue("LL", di->first, di->second)); + } + assert (i == d.size()); + + return Py_BuildValue("Oi", x, n_unassigned); +} + +static PyObject * subset_partition_sizes(PyObject * self, + PyObject * args) +{ + khmer_KSubsetPartitionObject * me = (khmer_KSubsetPartitionObject *) self; + khmer::SubsetPartition * subset_p = me->subset; + + unsigned int min_size = 0; + + if (!PyArg_ParseTuple(args, "|i", &min_size)) { + return NULL; + } + + khmer::PartitionCountMap cm; + unsigned int n_unassigned = 0; + subset_p->partition_sizes(cm, n_unassigned); + + unsigned int i; + khmer::PartitionCountMap::const_iterator mi; + for (i = 0, mi = cm.begin(); mi != cm.end(); mi++) { + if (mi->second >= min_size) i++; + } + + PyObject * x = PyList_New(i); + + // this should probably be a dict. 
@CTB + for (i = 0, mi = cm.begin(); mi != cm.end(); mi++) { + if (mi->second >= min_size) { + PyList_SET_ITEM(x, i, Py_BuildValue("LL", mi->first, mi->second)); + i++; + } + } + + return Py_BuildValue("Oi", x, n_unassigned); +} + +static PyObject * subset_partition_average_coverages(PyObject * self, + PyObject * args) +{ + khmer_KSubsetPartitionObject * me = (khmer_KSubsetPartitionObject *) self; + khmer::SubsetPartition * subset_p = me->subset; + + PyObject * counting_o; + + if (!PyArg_ParseTuple(args, "O", &counting_o)) { + return NULL; + } + + khmer::CountingHash * counting = ((khmer_KCountingHashObject *) counting_o)->counting; + + khmer::PartitionCountMap cm; + subset_p->partition_average_coverages(cm, counting); + + unsigned int i; + khmer::PartitionCountMap::const_iterator mi; + + PyObject * x = PyList_New(cm.size()); + + // this should probably be a dict. @CTB + for (i = 0, mi = cm.begin(); mi != cm.end(); mi++, i++) { + PyList_SET_ITEM(x, i, Py_BuildValue("LL", mi->first, mi->second)); + } + + return Py_BuildValue("O", x); +} + +static PyMethodDef khmer_subset_methods[] = { + { "count_partitions", subset_count_partitions, METH_VARARGS, "" }, + { "report_on_partitions", subset_report_on_partitions, METH_VARARGS, "" }, + { "compare_partitions", subset_compare_partitions, METH_VARARGS, "" }, + { "partition_size_distribution", subset_partition_size_distribution, METH_VARARGS, "" }, + { "partition_sizes", subset_partition_sizes, METH_VARARGS, "" }, + { "partition_average_coverages", subset_partition_average_coverages, METH_VARARGS, "" }, + {NULL, NULL, 0, NULL} /* sentinel */ +}; + +static PyObject * +khmer_subset_getattr(PyObject * obj, char * name) +{ + return Py_FindMethod(khmer_subset_methods, obj, name); +} + +///////////////// +// LabelHash +///////////////// + +// LabelHash addition +typedef struct { + PyObject_HEAD + + /* @camillescott late night notes: + need to experiment. 
might be able to call hashbits py methods + directly with the labelhash object, because they all instantiate + a new hashbits pointer on themselves to call the functions and labelhash + inherits from hashbits; or, we define a hashbits object as part of this struct + as called for in the c-api reference. need to grok that still. + If this is how it's done, remove PyObject_HEAD, which will already be included + in the base class struct. + See http://docs.python.org/2.7/extending/newtypes.html#subclassing-other-types + for details... + */ + LabelHash * labelhash; +} khmer_KLabelHashObject; + +static void khmer_labelhash_dealloc(PyObject *); +static PyObject * khmer_labelhash_getattr(PyObject * obj, char * name); + +static PyTypeObject khmer_KLabelHashType = { + PyObject_HEAD_INIT(NULL) + 0, + "KLabelHash", sizeof(khmer_KLabelHashObject), + 0, + khmer_labelhash_dealloc, /*tp_dealloc*/ + 0, /*tp_print*/ + khmer_labelhash_getattr, /*tp_getattr*/ + 0, /*tp_setattr*/ + 0, /*tp_compare*/ + 0, /*tp_repr*/ + 0, /*tp_as_number*/ + 0, /*tp_as_sequence*/ + 0, /*tp_as_mapping*/ + 0, /*tp_hash */ + 0, /*tp_call*/ + 0, /*tp_str*/ + 0, /*tp_getattro*/ + 0, /*tp_setattro*/ + 0, /*tp_as_buffer*/ + Py_TPFLAGS_DEFAULT, /*tp_flags*/ + "labelhash object", /* tp_doc */ +}; + +#define is_labelhash_obj(v) ((v)->ob_type == &khmer_KLabelHashType) + + +static PyObject * labelhash_get_label_dict(PyObject * self, PyObject * args) { + khmer_KLabelHashObject * me = (khmer_KLabelHashObject *) self; + khmer::LabelHash * hb = me->labelhash; + + PyObject * d = PyDict_New(); + khmer::LabelPtrMap::iterator it; + + for (it = hb->label_ptrs.begin(); it!=hb->label_ptrs.end(); ++it) { + PyDict_SetItem(d, Py_BuildValue("K", it->first), Py_BuildValue("K", it->second)); + } + + return d; +} + +static PyObject * labelhash_consume_fasta_and_tag_with_labels(PyObject * self, PyObject * args) +{ + khmer_KLabelHashObject * me = (khmer_KLabelHashObject *) self; + khmer::LabelHash * hb = me->labelhash; + + 
std::ofstream outfile; + + char * filename; + PyObject * callback_obj = NULL; + + if (!PyArg_ParseTuple(args, "s|O", &filename, &callback_obj)) { + return NULL; + } + + unsigned long long n_consumed; + unsigned int total_reads; + bool exc_raised = false; + + //Py_BEGIN_ALLOW_THREADS + try { + hb->consume_fasta_and_tag_with_labels(filename, total_reads, n_consumed, + _report_fn, callback_obj); + } catch (_khmer_signal &e) { + exc_raised = true; + } + //Py_END_ALLOW_THREADS + if (exc_raised) return NULL; + + return Py_BuildValue("iL", total_reads, n_consumed); + +} + +static PyObject * labelhash_consume_partitioned_fasta_and_tag_with_labels( + PyObject * self, PyObject * args) +{ + khmer_KLabelHashObject * me = (khmer_KLabelHashObject *) self; + khmer::LabelHash * labelhash = me->labelhash; + + char * filename; + PyObject * callback_obj = NULL; - if (!PyArg_ParseTuple(args, "iOi", - &pid1, &subset2_obj, &pid2)) { + if (!PyArg_ParseTuple(args, "s|O", &filename, &callback_obj)) { return NULL; } - khmer_KSubsetPartitionObject *other = (khmer_KSubsetPartitionObject *) subset2_obj; - khmer::SubsetPartition * subset2_p = other->subset; + // call the C++ function, and trap signals => Python - unsigned int n_only1 = 0, n_only2 = 0, n_shared = 0; - subset1_p->compare_to_partition((PartitionID) pid1, - subset2_p, (PartitionID) pid2, - n_only1, n_only2, n_shared); + unsigned long long n_consumed; + unsigned int total_reads; - return Py_BuildValue("iii", n_only1, n_only2, n_shared); -} + try { + labelhash->consume_partitioned_fasta_and_tag_with_labels(filename, + total_reads, n_consumed, _report_fn, callback_obj); + } catch (_khmer_signal &e) { + return NULL; + } -static PyObject * subset_partition_size_distribution(PyObject * self, - PyObject * args) -{ - khmer_KSubsetPartitionObject * me = (khmer_KSubsetPartitionObject *) self; - khmer::SubsetPartition * subset_p = me->subset; + return Py_BuildValue("iK", total_reads, n_consumed); +} - if (!PyArg_ParseTuple(args, "")) { 
+static PyObject * labelhash_consume_sequence_and_tag_with_labels(PyObject * self, PyObject * args) { + khmer_KLabelHashObject * me = (khmer_KLabelHashObject *) self; + khmer::LabelHash * hb = me->labelhash; + + char * seq = NULL; + unsigned long long c = NULL; + if (!PyArg_ParseTuple(args, "sK", &seq, &c)) { return NULL; } - khmer::PartitionCountDistribution d; + unsigned long long n_consumed = 0; + khmer::Label * the_label = hb->check_and_allocate_label(c); - unsigned int n_unassigned = 0; - subset_p->partition_size_distribution(d, n_unassigned); + try { + //if (hb->check_and_normalize_read(seq)) { + + hb->consume_sequence_and_tag_with_labels(seq, n_consumed, *the_label); + //} + } catch (_khmer_signal &e) { + return NULL; + } + return Py_BuildValue("L", n_consumed); +} - PyObject * x = PyList_New(d.size()); - khmer::PartitionCountDistribution::const_iterator di; +static PyObject * labelhash_sweep_label_neighborhood(PyObject * self, PyObject * args) { + khmer_KLabelHashObject * me = (khmer_KLabelHashObject *) self; + khmer::LabelHash * hb = me->labelhash; + + char * seq = NULL; + unsigned int r = NULL; + PyObject * break_on_stop_tags_o = NULL; + PyObject * stop_big_traversals_o = NULL; - unsigned int i; - for (i = 0, di = d.begin(); di != d.end(); di++, i++) { - PyList_SET_ITEM(x, i, Py_BuildValue("LL", di->first, di->second)); + if (!PyArg_ParseTuple(args, "s|iOO", &seq, &r, + &break_on_stop_tags_o, + &stop_big_traversals_o)) { + return NULL; } - assert (i == d.size()); - return Py_BuildValue("Oi", x, n_unassigned); + unsigned int range = (2 * hb->_get_tag_density()) + 1; + if (r >= 0) { + range = r; + } + + bool break_on_stop_tags = false; + if (break_on_stop_tags_o && PyObject_IsTrue(break_on_stop_tags_o)) { + break_on_stop_tags = true; + } + bool stop_big_traversals = false; + if (stop_big_traversals_o && PyObject_IsTrue(stop_big_traversals_o)) { + stop_big_traversals = true; + } + + if (strlen(seq) < hb->ksize()) { + return NULL; + } + + //std::pair ret; + 
LabelPtrSet found_labels; + + bool exc_raised = false; + unsigned int num_traversed = 0; + //Py_BEGIN_ALLOW_THREADS + try { + num_traversed = hb->sweep_label_neighborhood(seq, found_labels, range, break_on_stop_tags, stop_big_traversals); + } catch (_khmer_signal &e) { + exc_raised = true; + } + //Py_END_ALLOW_THREADS + + //printf("...%u kmers traversed\n", num_traversed); + + if (exc_raised) return NULL; + + PyObject * x = PyList_New(found_labels.size()); + khmer::LabelPtrSet::const_iterator si; + unsigned long long i = 0; + for (si=found_labels.begin(); si!=found_labels.end(); ++si) { + PyList_SET_ITEM(x, i, Py_BuildValue("K", *(*si))); + i++; + } + + return x; } -static PyObject * subset_partition_sizes(PyObject * self, - PyObject * args) + +// Similar to find_all_tags, but returns tags in a way actually useable by python +// need a tags_in_sequence iterator or function in c++ land for reuse in all +// these functions +static PyObject * labelhash_sweep_tag_neighborhood(PyObject * self, PyObject *args) { - khmer_KSubsetPartitionObject * me = (khmer_KSubsetPartitionObject *) self; - khmer::SubsetPartition * subset_p = me->subset; + khmer_KLabelHashObject * me = (khmer_KLabelHashObject *) self; + khmer::LabelHash * labelhash = me->labelhash; - unsigned int min_size = 0; + char * seq = NULL; + unsigned int r = NULL; + PyObject * break_on_stop_tags_o = NULL; + PyObject * stop_big_traversals_o = NULL; - if (!PyArg_ParseTuple(args, "|i", &min_size)) { + if (!PyArg_ParseTuple(args, "s|iOO", &seq, &r, + &break_on_stop_tags_o, + &stop_big_traversals_o)) { return NULL; } - - khmer::PartitionCountMap cm; - unsigned int n_unassigned = 0; - subset_p->partition_sizes(cm, n_unassigned); - unsigned int i; - khmer::PartitionCountMap::const_iterator mi; - for (i = 0, mi = cm.begin(); mi != cm.end(); mi++) { - if (mi->second >= min_size) i++; + unsigned int range = (2 * labelhash->_get_tag_density()) + 1; + if (r >= 0) { + range = r; } - PyObject * x = PyList_New(i); + bool 
break_on_stop_tags = false; + if (break_on_stop_tags_o && PyObject_IsTrue(break_on_stop_tags_o)) { + break_on_stop_tags = true; + } + bool stop_big_traversals = false; + if (stop_big_traversals_o && PyObject_IsTrue(stop_big_traversals_o)) { + stop_big_traversals = true; + } + + if (strlen(seq) < labelhash->ksize()) { + return NULL; + } - // this should probably be a dict. @CTB - for (i = 0, mi = cm.begin(); mi != cm.end(); mi++) { - if (mi->second >= min_size) { - PyList_SET_ITEM(x, i, Py_BuildValue("LL", mi->first, mi->second)); - i++; - } + khmer::SeenSet tagged_kmers; + + //Py_BEGIN_ALLOW_THREADS + + labelhash->partition->sweep_for_tags(seq, tagged_kmers, + labelhash->all_tags, range, break_on_stop_tags, stop_big_traversals); + + //Py_END_ALLOW_THREADS + + PyObject * x = PyList_New(tagged_kmers.size()); + khmer::SeenSet::const_iterator si; + unsigned long long i = 0; + for (si=tagged_kmers.begin(); si!=tagged_kmers.end(); ++si) { + //std::string kmer_s = _revhash(*si, labelhash->ksize()); + // type K for python unsigned long long + PyList_SET_ITEM(x, i, Py_BuildValue("K", *si)); + i++; } - return Py_BuildValue("Oi", x, n_unassigned); + return x; } -static PyObject * subset_partition_average_coverages(PyObject * self, - PyObject * args) -{ - khmer_KSubsetPartitionObject * me = (khmer_KSubsetPartitionObject *) self; - khmer::SubsetPartition * subset_p = me->subset; - - PyObject * counting_o; - if (!PyArg_ParseTuple(args, "O", &counting_o)) { +static PyObject * labelhash_get_tag_labels(PyObject * self, PyObject * args) { + + khmer_KLabelHashObject * me = (khmer_KLabelHashObject *) self; + khmer::LabelHash * labelhash = me->labelhash; + + khmer::HashIntoType tag; + + if (!PyArg_ParseTuple(args, "K", &tag)) { return NULL; } - khmer::CountingHash * counting = ((khmer_KCountingHashObject *) counting_o)->counting; + khmer::LabelPtrSet labels; - khmer::PartitionCountMap cm; - subset_p->partition_average_coverages(cm, counting); + labels = labelhash->get_tag_labels(tag); 
+ + PyObject * x = PyList_New(labels.size()); + khmer::LabelPtrSet::const_iterator si; + unsigned long long i = 0; + for (si=labels.begin(); si!=labels.end(); ++si) { + //std::string kmer_s = _revhash(*si, labelhash->ksize()); + PyList_SET_ITEM(x, i, Py_BuildValue("K", *(*si))); + i++; + } - unsigned int i; - khmer::PartitionCountMap::const_iterator mi; + return x; +} - PyObject * x = PyList_New(cm.size()); +static PyObject * labelhash_n_labels(PyObject * self, PyObject * args) +{ + khmer_KLabelHashObject * me = (khmer_KLabelHashObject *) self; + khmer::LabelHash * labelhash = me->labelhash; - // this should probably be a dict. @CTB - for (i = 0, mi = cm.begin(); mi != cm.end(); mi++, i++) { - PyList_SET_ITEM(x, i, Py_BuildValue("LL", mi->first, mi->second)); + if (!PyArg_ParseTuple(args, "")) { + return NULL; } - return Py_BuildValue("O", x); + return PyInt_FromLong(labelhash->n_labels()); } -static PyMethodDef khmer_subset_methods[] = { - { "count_partitions", subset_count_partitions, METH_VARARGS, "" }, - { "report_on_partitions", subset_report_on_partitions, METH_VARARGS, "" }, - { "compare_partitions", subset_compare_partitions, METH_VARARGS, "" }, - { "partition_size_distribution", subset_partition_size_distribution, METH_VARARGS, "" }, - { "partition_sizes", subset_partition_sizes, METH_VARARGS, "" }, - { "partition_average_coverages", subset_partition_average_coverages, METH_VARARGS, "" }, + +static PyMethodDef khmer_labelhash_methods[] = { + { "ksize", labelhash_get_ksize, METH_VARARGS, "" }, + { "hashsizes", labelhash_get_hashsizes, METH_VARARGS, "" }, + { "n_occupied", labelhash_n_occupied, METH_VARARGS, "Count the number of occupied bins" }, + { "n_unique_kmers", labelhash_n_unique_kmers, METH_VARARGS, "Count the number of unique kmers" }, + { "count", labelhash_count, METH_VARARGS, "Count the given kmer" }, + { "get", labelhash_get, METH_VARARGS, "Get the count for the given k-mer" }, + { "kmer_degree", labelhash_kmer_degree, METH_VARARGS, "" }, + { 
"load", labelhash_load, METH_VARARGS, "" }, + { "save", labelhash_save, METH_VARARGS, "" }, + { "load_tagset", labelhash_load_tagset, METH_VARARGS, "" }, + { "save_tagset", labelhash_save_tagset, METH_VARARGS, "" }, + { "n_tags", labelhash_n_tags, METH_VARARGS, "" }, + { "_get_tag_density", labelhash__get_tag_density, METH_VARARGS, "" }, + { "_set_tag_density", labelhash__set_tag_density, METH_VARARGS, "" }, + { "consume_fasta_and_tag", labelhash_consume_fasta_and_tag, METH_VARARGS, "Count all k-mers in a given file" }, + { "consume_fasta_and_tag_with_reads_parser", labelhash_consume_fasta_and_tag_with_reads_parser, + METH_VARARGS, "Count all k-mers using a given reads parser" }, + { "consume_partitioned_fasta", labelhash_consume_partitioned_fasta, METH_VARARGS, "Count all k-mers in a given file" }, + { "consume_fasta_and_tag_with_labels", labelhash_consume_fasta_and_tag_with_labels, METH_VARARGS, "" }, + { "sweep_label_neighborhood", labelhash_sweep_label_neighborhood, METH_VARARGS, "" }, + {"consume_partitioned_fasta_and_tag_with_labels", labelhash_consume_partitioned_fasta_and_tag_with_labels, METH_VARARGS, "" }, + {"sweep_tag_neighborhood", labelhash_sweep_tag_neighborhood, METH_VARARGS, "" }, + {"get_tag_labels", labelhash_get_tag_labels, METH_VARARGS, ""}, + {"consume_sequence_and_tag_with_labels", labelhash_consume_sequence_and_tag_with_labels, METH_VARARGS, "" }, + {"n_labels", labelhash_n_labels, METH_VARARGS, ""}, + {"get_label_dict", labelhash_get_label_dict, METH_VARARGS, "" }, + {NULL, NULL, 0, NULL} /* sentinel */ }; static PyObject * -khmer_subset_getattr(PyObject * obj, char * name) +khmer_labelhash_getattr(PyObject * obj, char * name) { - return Py_FindMethod(khmer_subset_methods, obj, name); + return Py_FindMethod(khmer_labelhash_methods, obj, name); } + // // GRAPHALIGN addition // @@ -4549,6 +4632,37 @@ static PyObject* _new_hashbits(PyObject * self, PyObject * args) return (PyObject *) khashbits_obj; } +// +// new_labelhash +// + +static 
PyObject* _new_labelhash(PyObject * self, PyObject * args) +{ + unsigned int k = 0; + PyObject* sizes_list_o = NULL; + + if (!PyArg_ParseTuple(args, "IO", &k, &sizes_list_o)) { + return NULL; + } + + std::vector sizes; + for (int i = 0; i < PyObject_Length(sizes_list_o); i++) { + PyObject * size_o = PyList_GET_ITEM(sizes_list_o, i); + sizes.push_back(PyLong_AsLongLong(size_o)); + } + + khmer_KLabelHash * klabelhash_obj = (khmer_KLabelHashObject *) \ + PyObject_New(khmer_KLabelHashsObject, &khmer_KLabelHashType); + + if (klabelhash_obj == NULL) { + return NULL; + } + + klabelhash_obj->labelhash = new khmer::LabelHash(k, sizes); + + return (PyObject *) klabelhash_obj; +} + static PyObject * hash_collect_high_abundance_kmers(PyObject * self, PyObject * args) { khmer_KCountingHashObject * me = (khmer_KCountingHashObject *) self; @@ -4607,6 +4721,21 @@ static void khmer_hashbits_dealloc(PyObject* self) PyObject_Del((PyObject *) obj); } + + +// +// khmer_labelhash_dealloc -- clean up a labelhash object. +// + +static void khmer_hashbits_dealloc(PyObject* self) +{ + khmer_KLabelHashObject * obj = (khmer_LabelHashObject *) self; + delete obj->labelhash; + obj->labelhash = NULL; + + PyObject_Del((PyObject *) obj); +} + // // khmer_subset_dealloc -- clean up a hashbits object. 
// From 09dd66a76e733fd67d670c1fdcdc62d1d31bdba7 Mon Sep 17 00:00:00 2001 From: Camille Scott Date: Tue, 17 Dec 2013 12:09:44 -0500 Subject: [PATCH 106/140] finished most of integration, added new tests, dealing with linker errors --- khmer/__init__.py | 1 + khmer/_khmermodule.cc | 326 ++++++++++++++++++++++++---------------- lib/hashbits.hh | 3 +- lib/hashtable.hh | 65 +------- lib/labelhash.hh | 27 ++-- tests/test_hashbits.py | 128 ---------------- tests/test_labelhash.py | 143 ++++++++++++++++++ 7 files changed, 362 insertions(+), 331 deletions(-) create mode 100644 tests/test_labelhash.py diff --git a/khmer/__init__.py b/khmer/__init__.py index b6031527e6..01683832e0 100644 --- a/khmer/__init__.py +++ b/khmer/__init__.py @@ -15,6 +15,7 @@ from _khmer import reverse_hash from _khmer import get_config from _khmer import ReadParser +from _khmer import LabelHash from ._version import get_versions __version__ = get_versions()['version'] diff --git a/khmer/_khmermodule.cc b/khmer/_khmermodule.cc index cb85fad1cc..cef9f7d724 100644 --- a/khmer/_khmermodule.cc +++ b/khmer/_khmermodule.cc @@ -1336,35 +1336,6 @@ typedef struct { khmer::Hashbits * hashbits; } khmer_KHashbitsObject; -static void khmer_hashbits_dealloc(PyObject *); -static PyObject * khmer_hashbits_getattr(PyObject * obj, char * name); - -static PyTypeObject khmer_KHashbitsType = { - PyObject_HEAD_INIT(NULL) - 0, - "KHashbits", sizeof(khmer_KHashbitsObject), - 0, - khmer_hashbits_dealloc, /*tp_dealloc*/ - 0, /*tp_print*/ - khmer_hashbits_getattr, /*tp_getattr*/ - 0, /*tp_setattr*/ - 0, /*tp_compare*/ - 0, /*tp_repr*/ - 0, /*tp_as_number*/ - 0, /*tp_as_sequence*/ - 0, /*tp_as_mapping*/ - 0, /*tp_hash */ - 0, /*tp_call*/ - 0, /*tp_str*/ - 0, /*tp_getattro*/ - 0, /*tp_setattro*/ - 0, /*tp_as_buffer*/ - Py_TPFLAGS_DEFAULT, /*tp_flags*/ - "hashbits object", /* tp_doc */ -}; - -#define is_hashbits_obj(v) ((v)->ob_type == &khmer_KHashbitsType) - static void khmer_subset_dealloc(PyObject *); static PyObject * 
khmer_subset_getattr(PyObject * obj, char * name); @@ -1394,6 +1365,8 @@ static PyTypeObject khmer_KSubsetPartitionType = { #define is_subset_obj(v) ((v)->ob_type == &khmer_KSubsetPartitionType) +// MOVED HASHBITS TYPE TO BELOW METHODS + /* GRAPHALIGN addition */ typedef struct { PyObject_HEAD @@ -3931,6 +3904,85 @@ khmer_hashbits_getattr(PyObject * obj, char * name) return Py_FindMethod(khmer_hashbits_methods, obj, name); } +static void khmer_hashbits_dealloc(PyObject *); +static PyObject* khmer_hashbits_new(PyTypeObject * type, PyObject * args, PyObject * kwds); +static int khmer_hashbits_init(khmer_KHashbitsObject * self, PyObject * args, PyObject * kwds); + +static PyTypeObject khmer_KHashbitsType = { + PyObject_HEAD_INIT(NULL) + 0, + "Hashbits", sizeof(khmer_KHashbitsObject), + 0, + khmer_hashbits_dealloc, /*tp_dealloc*/ + 0, /*tp_print*/ + khmer_hashbits_getattr, /*tp_getattr*/ + 0, /*tp_setattr*/ + 0, /*tp_compare*/ + 0, /*tp_repr*/ + 0, /*tp_as_number*/ + 0, /*tp_as_sequence*/ + 0, /*tp_as_mapping*/ + 0, /*tp_hash */ + 0, /*tp_call*/ + 0, /*tp_str*/ + 0, /*tp_getattro*/ + 0, /*tp_setattro*/ + 0, /*tp_as_buffer*/ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /*tp_flags*/ + "hashbits object", /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + khmer_hashbits_methods, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + (initproc)khmer_hashbits_init, /* tp_init */ + 0, /* tp_alloc */ +}; + +// __new__ for hashbits; necessary for proper subclassing +// This will essentially do what the old factory function did. 
Unlike many __new__ +// methods, we take our arguments here, because there's no "unitialized" hashbits +// object; we have to have k and the table sizes before creating the new objects +static PyObject* khmer_hashbits_new(PyTypeObject * type, PyObject * args, PyObject * kwds) +{ + khmer_KHashbitsObject * self; + self = (khmer_KHashbitsObject *)type->tp_alloc(type, 0); + + if (self != NULL) { + unsigned int k = 0; + PyObject* sizes_list_o = NULL; + + if (!PyArg_ParseTuple(args, "IO", &k, &sizes_list_o)) { + return NULL; + } + + std::vector sizes; + for (int i = 0; i < PyObject_Length(sizes_list_o); i++) { + PyObject * size_o = PyList_GET_ITEM(sizes_list_o, i); + sizes.push_back(PyLong_AsLongLong(size_o)); + } + + self->hashbits = new khmer::Hashbits(k, sizes); + } + return (PyObject *) self; +} + +// there are no attributes that we need at this time, so we'll just return 0 +static int khmer_hashbits_init(khmer_KHashbitsObject * self, PyObject * args, PyObject * kwds) { + return 0; +} + +#define is_hashbits_obj(v) ((v)->ob_type == &khmer_KHashbitsType) + //////////////////////////////////////////////////////////////////////////// static PyObject * subset_count_partitions(PyObject * self, @@ -4104,8 +4156,8 @@ khmer_subset_getattr(PyObject * obj, char * name) // LabelHash addition typedef struct { - PyObject_HEAD - + //PyObject_HEAD + khmer_KHashbitsObject khashbits; /* @camillescott late night notes: need to experiment. might be able to call hashbits py methods directly with the labelhash object, because they all instantiate @@ -4117,38 +4169,65 @@ typedef struct { See http://docs.python.org/2.7/extending/newtypes.html#subclassing-other-types for details... 
*/ - LabelHash * labelhash; + khmer::LabelHash * labelhash; } khmer_KLabelHashObject; static void khmer_labelhash_dealloc(PyObject *); -static PyObject * khmer_labelhash_getattr(PyObject * obj, char * name); - -static PyTypeObject khmer_KLabelHashType = { - PyObject_HEAD_INIT(NULL) - 0, - "KLabelHash", sizeof(khmer_KLabelHashObject), - 0, - khmer_labelhash_dealloc, /*tp_dealloc*/ - 0, /*tp_print*/ - khmer_labelhash_getattr, /*tp_getattr*/ - 0, /*tp_setattr*/ - 0, /*tp_compare*/ - 0, /*tp_repr*/ - 0, /*tp_as_number*/ - 0, /*tp_as_sequence*/ - 0, /*tp_as_mapping*/ - 0, /*tp_hash */ - 0, /*tp_call*/ - 0, /*tp_str*/ - 0, /*tp_getattro*/ - 0, /*tp_setattro*/ - 0, /*tp_as_buffer*/ - Py_TPFLAGS_DEFAULT, /*tp_flags*/ - "labelhash object", /* tp_doc */ -}; +static int khmer_labelhash_init(khmer_KLabelHashObject * self, PyObject *args, PyObject *kwds); +static PyObject * khmer_labelhash_new(PyTypeObject * type, PyObject *args, PyObject *kwds); #define is_labelhash_obj(v) ((v)->ob_type == &khmer_KLabelHashType) +// +// khmer_labelhash_dealloc -- clean up a labelhash object. +// + +static void khmer_labelhash_dealloc(PyObject* self) +{ + khmer_KLabelHashObject * obj = (khmer_KLabelHashObject *) self; + delete obj->labelhash; + obj->labelhash = NULL; + + PyObject_Del((PyObject *) obj); +} + +// a little wierd; we don't actually want to call Hashbits' new method. 
Rather, we +// define our own new method, and redirect the base's hashbits object to point to our +// labelhash object +static PyObject * khmer_labelhash_new(PyTypeObject *type, PyObject *args, PyObject *kwds) +{ + khmer_KLabelHashObject *self; + self = (khmer_KLabelHashObject*)type->tp_alloc(type, 0); + + if (self!=NULL) { + unsigned int k = 0; + PyObject* sizes_list_o = NULL; + + if (!PyArg_ParseTuple(args, "IO", &k, &sizes_list_o)) { + return NULL; + } + + std::vector sizes; + for (int i = 0; i < PyObject_Length(sizes_list_o); i++) { + PyObject * size_o = PyList_GET_ITEM(sizes_list_o, i); + sizes.push_back(PyLong_AsLongLong(size_o)); + } + + // We want the hashbits pointer in the base class to point to our labelhash, + // so that the KHashbits methods are called on the correct object (a LabelHash) + self->khashbits.hashbits = (khmer::Hashbits *)self->labelhash; + self->labelhash = new khmer::LabelHash(k, sizes); + } + + return (PyObject *) self; +} + +static int khmer_labelhash_init(khmer_KLabelHashObject * self, PyObject *args, PyObject *kwds) +{ + if (khmer_KHashbitsType.tp_init((PyObject *)self, args, kwds) < 0) + return -1; + return 0; +} static PyObject * labelhash_get_label_dict(PyObject * self, PyObject * args) { khmer_KLabelHashObject * me = (khmer_KLabelHashObject *) self; @@ -4309,7 +4388,6 @@ static PyObject * labelhash_sweep_label_neighborhood(PyObject * self, PyObject * return x; } - // Similar to find_all_tags, but returns tags in a way actually useable by python // need a tags_in_sequence iterator or function in c++ land for reuse in all // these functions @@ -4409,27 +4487,8 @@ static PyObject * labelhash_n_labels(PyObject * self, PyObject * args) return PyInt_FromLong(labelhash->n_labels()); } - static PyMethodDef khmer_labelhash_methods[] = { - { "ksize", labelhash_get_ksize, METH_VARARGS, "" }, - { "hashsizes", labelhash_get_hashsizes, METH_VARARGS, "" }, - { "n_occupied", labelhash_n_occupied, METH_VARARGS, "Count the number of occupied bins" 
}, - { "n_unique_kmers", labelhash_n_unique_kmers, METH_VARARGS, "Count the number of unique kmers" }, - { "count", labelhash_count, METH_VARARGS, "Count the given kmer" }, - { "get", labelhash_get, METH_VARARGS, "Get the count for the given k-mer" }, - { "kmer_degree", labelhash_kmer_degree, METH_VARARGS, "" }, - { "load", labelhash_load, METH_VARARGS, "" }, - { "save", labelhash_save, METH_VARARGS, "" }, - { "load_tagset", labelhash_load_tagset, METH_VARARGS, "" }, - { "save_tagset", labelhash_save_tagset, METH_VARARGS, "" }, - { "n_tags", labelhash_n_tags, METH_VARARGS, "" }, - { "_get_tag_density", labelhash__get_tag_density, METH_VARARGS, "" }, - { "_set_tag_density", labelhash__set_tag_density, METH_VARARGS, "" }, - { "consume_fasta_and_tag", labelhash_consume_fasta_and_tag, METH_VARARGS, "Count all k-mers in a given file" }, - { "consume_fasta_and_tag_with_reads_parser", labelhash_consume_fasta_and_tag_with_reads_parser, - METH_VARARGS, "Count all k-mers using a given reads parser" }, - { "consume_partitioned_fasta", labelhash_consume_partitioned_fasta, METH_VARARGS, "Count all k-mers in a given file" }, - { "consume_fasta_and_tag_with_labels", labelhash_consume_fasta_and_tag_with_labels, METH_VARARGS, "" }, + { "consume_fasta_and_tag_with_labels", labelhash_consume_fasta_and_tag_with_labels, METH_VARARGS, "" }, { "sweep_label_neighborhood", labelhash_sweep_label_neighborhood, METH_VARARGS, "" }, {"consume_partitioned_fasta_and_tag_with_labels", labelhash_consume_partitioned_fasta_and_tag_with_labels, METH_VARARGS, "" }, {"sweep_tag_neighborhood", labelhash_sweep_tag_neighborhood, METH_VARARGS, "" }, @@ -4441,13 +4500,53 @@ static PyMethodDef khmer_labelhash_methods[] = { {NULL, NULL, 0, NULL} /* sentinel */ }; +// still necessary? 
static PyObject * khmer_labelhash_getattr(PyObject * obj, char * name) { return Py_FindMethod(khmer_labelhash_methods, obj, name); } - +static PyTypeObject khmer_KLabelHashType = { + PyObject_HEAD_INIT(NULL) + 0, /* ob_size */ + "LabelHash", /* tp_name */ + sizeof(khmer_KLabelHashObject), /* tp_basicsize */ + 0, /* tp_itemsize */ + (destructor)khmer_labelhash_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* khmer_labelhash_getattr, tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_compare */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ + 0, /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + khmer_labelhash_methods, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + (initproc)khmer_labelhash_init, /* tp_init */ + 0, /* tp_alloc */ +}; // // GRAPHALIGN addition @@ -4632,37 +4731,6 @@ static PyObject* _new_hashbits(PyObject * self, PyObject * args) return (PyObject *) khashbits_obj; } -// -// new_labelhash -// - -static PyObject* _new_labelhash(PyObject * self, PyObject * args) -{ - unsigned int k = 0; - PyObject* sizes_list_o = NULL; - - if (!PyArg_ParseTuple(args, "IO", &k, &sizes_list_o)) { - return NULL; - } - - std::vector sizes; - for (int i = 0; i < PyObject_Length(sizes_list_o); i++) { - PyObject * size_o = PyList_GET_ITEM(sizes_list_o, i); - sizes.push_back(PyLong_AsLongLong(size_o)); - } - - khmer_KLabelHash * klabelhash_obj = (khmer_KLabelHashObject *) \ - PyObject_New(khmer_KLabelHashsObject, &khmer_KLabelHashType); - - if (klabelhash_obj == NULL) { - return NULL; - } - - klabelhash_obj->labelhash = new 
khmer::LabelHash(k, sizes); - - return (PyObject *) klabelhash_obj; -} - static PyObject * hash_collect_high_abundance_kmers(PyObject * self, PyObject * args) { khmer_KCountingHashObject * me = (khmer_KCountingHashObject *) self; @@ -4722,20 +4790,6 @@ static void khmer_hashbits_dealloc(PyObject* self) PyObject_Del((PyObject *) obj); } - -// -// khmer_labelhash_dealloc -- clean up a labelhash object. -// - -static void khmer_hashbits_dealloc(PyObject* self) -{ - khmer_KLabelHashObject * obj = (khmer_LabelHashObject *) self; - delete obj->labelhash; - obj->labelhash = NULL; - - PyObject_Del((PyObject *) obj); -} - // // khmer_subset_dealloc -- clean up a hashbits object. // @@ -4871,8 +4925,24 @@ init_khmer(void) khmer_KTableType.ob_type = &PyType_Type; khmer_KCountingHashType.ob_type = &PyType_Type; + // implemented __new__ for Hashbits; keeping factory func around as well + // for backwards compat with old scripts + khmer_KHashbitsType.tp_new = khmer_hashbits_new; + if (PyType_Ready(&khmer_KHashbitsType) < 0) { + std::cout << "_khmer.KHashbitsType failed PyType_Ready" << std::endl; + return; + } + // add LabelHash + khmer_KLabelHashType.tp_base = &khmer_KHashbitsType; + khmer_KLabelHashType.tp_new = khmer_labelhash_new; + if (PyType_Ready(&khmer_KLabelHashType) < 0) { + std::cout << "_khmer.KLabelHashType failed PyType_Ready" << std::endl; + return; + } + PyObject * m; - m = Py_InitModule( "_khmer", KhmerMethods ); + m = Py_InitModule3( "_khmer", KhmerMethods, + "interface for the khmer module low-level extensions" ); if (m == NULL) { return; } @@ -4901,7 +4971,11 @@ init_khmer(void) // TODO: Add other types here as their 'new' methods are implemented. // Then, remove the corresponding factory functions. 
- + Py_INCREF(&khmer_KHashbitsType); + PyModule_AddObject(m, "Hashbits", (PyObject *)&khmer_KHashbitsType); + + Py_INCREF(&khmer_KLabelHashType); + PyModule_AddObject(m, "LabelHash", (PyObject *)&khmer_KLabelHashType); } // vim: set ft=cpp sts=4 sw=4 tw=79: diff --git a/lib/hashbits.hh b/lib/hashbits.hh index 84c93d9aa3..9b1d5462db 100644 --- a/lib/hashbits.hh +++ b/lib/hashbits.hh @@ -12,6 +12,7 @@ namespace khmer { class CountingHash; + class LabelHash; class Hashbits : public khmer::Hashtable { protected: @@ -240,7 +241,7 @@ namespace khmer { }; #include "counting.hh" - +#include "labelhash.hh" #endif // HASHBITS_HH // vim: set sts=2 sw=2: diff --git a/lib/hashtable.hh b/lib/hashtable.hh index f13dcd51a2..a306816581 100644 --- a/lib/hashtable.hh +++ b/lib/hashtable.hh @@ -455,7 +455,6 @@ namespace khmer { // Partitioning stuff. unsigned int n_tags() const { return all_tags.size(); } - unsigned int n_labels() const { return label_ptrs.size(); } void divide_tags_into_subsets(unsigned int subset_size, SeenSet& divvy); @@ -485,18 +484,7 @@ namespace khmer { CallbackFn callback = NULL, void * callback_data = NULL ); - - Label * check_and_allocate_label(Label new_label) { - Label * c; - if (label_ptrs.count(new_label)) { - c = label_ptrs[new_label]; - } else { - c = new Label(new_label); - label_ptrs[*c] = c; - } - return c; - } - + void consume_sequence_and_tag(const std::string& seq, unsigned long long& n_consumed, SeenSet * new_tags = 0); @@ -507,51 +495,6 @@ namespace khmer { unsigned long long &n_consumed, CallbackFn callback = 0, void * callback_data = 0); - - void consume_fasta_and_tag_with_labels( - std::string const &filename, - unsigned int &total_reads, - unsigned long long &n_consumed, - CallbackFn callback = NULL, - void * callback_data = NULL); - - void consume_fasta_and_tag_with_labels( - read_parsers:: IParser * parser, - unsigned int &total_reads, - unsigned long long &n_consumed, - CallbackFn callback = NULL, - void * callback_data = NULL); - - void 
consume_partitioned_fasta_and_tag_with_labels(const std::string &filename, - unsigned int &total_reads, - unsigned long long &n_consumed, - CallbackFn callback = NULL, - void * callback_datac = NULL); - - void consume_sequence_and_tag_with_labels(const std::string& seq, - unsigned long long& n_consumed, - Label& current_label, - SeenSet * new_tags = 0); - - LabelPtrSet get_tag_labels(const HashIntoType& tag); - TagPtrSet get_label_tags(const Label& label); - - void link_tag_and_label(HashIntoType& kmer, Label& label); - - unsigned int sweep_sequence_for_labels(const std::string& seq, - LabelPtrSet& found_labels, - bool break_on_stoptags, - bool stop_big_traversals); - - unsigned int sweep_label_neighborhood(const std::string & seq, - LabelPtrSet& found_labels, - unsigned int range, - bool break_on_stoptags, - bool stop_big_traversals); - - void traverse_labels_and_resolve(const SeenSet& tagged_kmers, - LabelPtrSet& found_labels); - void consume_fasta_and_traverse(const std::string &filename, unsigned int distance, unsigned int big_threshold, @@ -660,10 +603,4 @@ namespace khmer { #define RELEASE_ALL_TAGS_SPIN_LOCK \ __sync_bool_compare_and_swap( &_all_tags_spin_lock, 1, 0 ); -#define ACQUIRE_TAG_COLORS_SPIN_LOCK \ - while(!__sync_bool_compare_and_swap( &_tag_labels_spin_lock, 0, 1)); - -#define RELEASE_TAG_COLORS_SPIN_LOCK \ - __sync_bool_compare_and_swap( &_tag_labels_spin_lock, 1, 0); - #endif // HASHTABLE_HH diff --git a/lib/labelhash.hh b/lib/labelhash.hh index 960d64158d..374ff02d78 100644 --- a/lib/labelhash.hh +++ b/lib/labelhash.hh @@ -11,21 +11,13 @@ #include "khmer.hh" #include "hashbits.hh" +#include "hashtable.hh" +#include "read_parsers.hh" namespace khmer { - + class LabelHash : public khmer::Hashbits { protected: - LabelHash( WordLength ksize, std::vector& tablesizes) - : khmer::Hashbits(ksize, tablesizes) - { - // constructor - _tag_labels_spin_lock = 0; - - } - - ~LabelHash(); - // Does the given tag already have the given label? 
bool _cmap_contains_label(const TagLabelPtrMap& cmap, HashIntoType& kmer, @@ -79,6 +71,17 @@ namespace khmer { uint32_t _tag_labels_spin_lock; public: + + LabelHash( WordLength ksize, std::vector& tablesizes) + : khmer::Hashbits(ksize, tablesizes) + { + // constructor + _tag_labels_spin_lock = 0; + + } + + ~LabelHash(); + TagLabelPtrMap tag_labels; LabelTagPtrMap label_tag_ptrs; LabelPtrMap label_ptrs; @@ -141,7 +144,7 @@ namespace khmer { LabelPtrSet& found_labels); }; -} +}; #define ACQUIRE_TAG_COLORS_SPIN_LOCK \ while(!__sync_bool_compare_and_swap( &_tag_labels_spin_lock, 0, 1)); diff --git a/tests/test_hashbits.py b/tests/test_hashbits.py index 2456d8d788..aca4dad430 100644 --- a/tests/test_hashbits.py +++ b/tests/test_hashbits.py @@ -11,11 +11,9 @@ import khmer_tst_utils as utils from nose.plugins.attrib import attr - def teardown(): utils.cleanup() - def test__get_set_tag_density(): ht = khmer.new_hashbits(32, 1, 1) @@ -547,129 +545,3 @@ def test_simple_median(): assert average == 1.0 assert stddev == 0.0 -# -# @cswelcher TODO: more tests! 
-# * thread-safety - -def test_n_labels(): - hb = khmer.new_hashbits(20, 1e7, 4) - filename = utils.get_test_data('test-labels.fa') - hb.consume_fasta_and_tag_with_labels(filename) - - print hb.n_labels() - assert hb.n_labels() == 4 - -def test_get_label_dict(): - hb = khmer.new_hashbits(20, 1e7, 4) - filename = utils.get_test_data('test-labels.fa') - hb.consume_fasta_and_tag_with_labels(filename) - - labels = hb.get_label_dict() - expected = [0L, 1L, 2L, 3L] - for e_label in expected: - assert e_label in labels - for a_label in labels: - assert a_label in expected - -def test_sweep_tag_neighborhood(): - hb = khmer.new_hashbits(20, 1e7, 4) - filename = utils.get_test_data('single-read.fq') - hb.consume_fasta_and_tag(filename) - - tags = hb.sweep_tag_neighborhood('CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT') - assert len(tags) == 1 - assert tags.pop() == 173473779682L - -def test_get_tag_labels(): - hb = khmer.new_hashbits(20, 1e7, 4) - filename = utils.get_test_data('single-read.fq') - hb.consume_fasta_and_tag_with_labels(filename) - tag = 173473779682L - - labels = hb.get_tag_labels(tag) - assert len(labels) == 1 - assert labels.pop() == 0L - -def test_sweep_sequence_for_labels(): - hb = khmer.new_hashbits(20, 1e7, 4) - filename = utils.get_test_data('single-read.fq') - hb.consume_fasta_and_tag_with_labels(filename) - - labels = hb.sweep_label_neighborhood('CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT') - assert len(labels) == 1 - assert labels.pop() == 0L - -def test_consume_partitioned_fasta_and_tag_with_labels(): - hb = khmer.new_hashbits(20, 1e7, 4) - filename = utils.get_test_data('real-partition-small.fa') - - total_reads, n_consumed = hb.consume_partitioned_fasta_and_tag_with_labels(filename) - labels = set() - for record in screed.open(filename): - seq = record.sequence - labels.update(hb.sweep_label_neighborhood(seq, False, False)) - #print hb.n_labels() - #print labels - assert len(labels) == 1 - assert labels.pop() == 2L - assert hb.n_labels() == 1 - -def 
test_consume_fasta_and_tag_with_labels(): - hb = khmer.new_hashbits(20, 1e7, 4) - read_1 = 'ACGTAACCGGTTAAACCCGGGTTTAAAACCCCGGGGTTTT' - filename = utils.get_test_data('test-transcript.fa') - - total_reads, n_consumed = hb.consume_fasta_and_tag_with_labels(filename) - - assert hb.get(read_1[:20]) - assert total_reads == 3 - print hb.n_labels() - print hb.get_label_dict() - for tag in hb.get_tagset(): - print tag, khmer.forward_hash(tag, 20) - for record in screed.open(filename): - print hb.sweep_tag_neighborhood(record.sequence, 40) - print hb.sweep_label_neighborhood(record.sequence, 40) - assert hb.n_labels() == 3 - -''' -* The test data set as four reads: A, B, C, and D -* Overlaps are A <-> B <-> C, with D on its own -* Thus, traversing from A should find labels from A and B, - traversing from B should find labels from A, B, and C, - and traversing from C should find labels from B and C -''' -def test_label_tag_correctness(): - hb = khmer.new_hashbits(20, 1e7, 4) - filename = utils.get_test_data('test-labels.fa') - hb.consume_fasta_and_tag_with_labels(filename) - - # read A - labels = hb.sweep_label_neighborhood('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAGCTAGGCTAGGTGTGCTCTGCCTAGAGCTAGGCTAGGTGT') - print hb.sweep_tag_neighborhood('TTCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAGCTAGGCTAGGTGTGCTCTGCCTAGAGCTAGGCTAGGTGT') - print labels - print len('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG')-19 - assert len(labels) == 2 - assert 0L in labels - assert 1L in labels - - # read B - labels = hb.sweep_label_neighborhood('GCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA') - print labels - assert len(labels) == 3 - assert 0L in labels - assert 1L in labels - assert 2L in labels - - # read C - labels = 
hb.sweep_label_neighborhood('TGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGAGTTGGGGATAGATAGATAGATGAGTGTAGATCCAACAACACATACA') - print labels - assert len(labels) == 2 - assert 1L in labels - assert 2L in labels - - # read D - labels = hb.sweep_label_neighborhood('TATATATATAGCTAGCTAGCTAACTAGCTAGCATCGATCGATCGATC') - print labels - assert len(labels) == 1 - assert 3L in labels diff --git a/tests/test_labelhash.py b/tests/test_labelhash.py new file mode 100644 index 0000000000..7993091d74 --- /dev/null +++ b/tests/test_labelhash.py @@ -0,0 +1,143 @@ +# +# This file is part of khmer, http://github.com/ged-lab/khmer/, and is +# Copyright (C) Michigan State University, 2009-2013. It is licensed under +# the three-clause BSD license; see doc/LICENSE.txt. Contact: ctb@msu.edu +# +import khmer +from khmer import LabelHash +from screed.fasta import fasta_iter +import screed + +import khmer_tst_utils as utils +from nose.plugins.attrib import attr + +def teardown(): + utils.cleanup() + +# +# @camillescott TODO: more tests! 
+# * thread-safety + +def test_n_labels(): + lh = LabelHash(20, 1e7, 4) + filename = utils.get_test_data('test-labels.fa') + lh.consume_fasta_and_tag_with_labels(filename) + + print lh.n_labels() + assert lh.n_labels() == 4 + +def test_get_label_dict(): + lb = LabelHash(20, 1e7, 4) + filename = utils.get_test_data('test-labels.fa') + lb.consume_fasta_and_tag_with_labels(filename) + + labels = lb.get_label_dict() + expected = [0L, 1L, 2L, 3L] + for e_label in expected: + assert e_label in labels + for a_label in labels: + assert a_label in expected + +def test_get_tag_labels(): + lb = LabelHash(20, 1e7, 4) + filename = utils.get_test_data('single-read.fq') + lb.consume_fasta_and_tag_with_labels(filename) + tag = 173473779682L + + labels = lb.get_tag_labels(tag) + assert len(labels) == 1 + assert labels.pop() == 0L + +def test_consume_partitioned_fasta_and_tag_with_labels(): + lb = LabelHash(20, 1e7, 4) + filename = utils.get_test_data('real-partition-small.fa') + + total_reads, n_consumed = lb.consume_partitioned_fasta_and_tag_with_labels(filename) + labels = set() + for record in screed.open(filename): + seq = record.sequence + labels.update(lb.sweep_label_neighborhood(seq, False, False)) + #print lb.n_labels() + #print labels + assert len(labels) == 1 + assert labels.pop() == 2L + assert lb.n_labels() == 1 + +def test_consume_fasta_and_tag_with_labels(): + lb = LabelHash(20, 1e7, 4) + read_1 = 'ACGTAACCGGTTAAACCCGGGTTTAAAACCCCGGGGTTTT' + filename = utils.get_test_data('test-transcript.fa') + + total_reads, n_consumed = lb.consume_fasta_and_tag_with_labels(filename) + + assert lb.get(read_1[:20]) + assert total_reads == 3 + print lb.n_labels() + print lb.get_label_dict() + for tag in lb.get_tagset(): + print tag, khmer.forward_hash(tag, 20) + for record in screed.open(filename): + print lb.sweep_tag_neighborhood(record.sequence, 40) + print lb.sweep_label_neighborhood(record.sequence, 40) + assert lb.n_labels() == 3 + +''' +* The test data set as four reads: A, B, 
C, and D +* Overlaps are A <-> B <-> C, with D on its own +* Thus, traversing from A should find labels from A and B, + traversing from B should find labels from A, B, and C, + and traversing from C should find labels from B and C +''' +def test_label_tag_correctness(): + lb = LabelHash(20, 1e7, 4) + filename = utils.get_test_data('test-labels.fa') + lb.consume_fasta_and_tag_with_labels(filename) + + # read A + labels = lb.sweep_label_neighborhood('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAGCTAGGCTAGGTGTGCTCTGCCTAGAGCTAGGCTAGGTGT') + print lb.sweep_tag_neighborhood('TTCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAGCTAGGCTAGGTGTGCTCTGCCTAGAGCTAGGCTAGGTGT') + print labels + print len('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG')-19 + assert len(labels) == 2 + assert 0L in labels + assert 1L in labels + + # read B + labels = lb.sweep_label_neighborhood('GCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA') + print labels + assert len(labels) == 3 + assert 0L in labels + assert 1L in labels + assert 2L in labels + + # read C + labels = lb.sweep_label_neighborhood('TGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGAGTTGGGGATAGATAGATAGATGAGTGTAGATCCAACAACACATACA') + print labels + assert len(labels) == 2 + assert 1L in labels + assert 2L in labels + + # read D + labels = lb.sweep_label_neighborhood('TATATATATAGCTAGCTAGCTAACTAGCTAGCATCGATCGATCGATC') + print labels + assert len(labels) == 1 + assert 3L in labels + +def test_sweep_tag_neighborhood(): + lb = LabelHash(20, 1e7, 4) + filename = utils.get_test_data('single-read.fq') + lb.consume_fasta_and_tag(filename) + + tags = lb.sweep_tag_neighborhood('CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT') + assert len(tags) == 1 + assert tags.pop() == 173473779682L + + +def test_sweep_label_neighborhood(): + lb = LabelHash(20, 1e7, 4) + filename = utils.get_test_data('single-read.fq') + 
lb.consume_fasta_and_tag_with_labels(filename) + + labels = lb.sweep_label_neighborhood('CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT') + assert len(labels) == 1 + assert labels.pop() == 0L From ebf8a9e8f5c5544e36a1ddd44e51e14b85a23d4f Mon Sep 17 00:00:00 2001 From: Camille Scott Date: Tue, 17 Dec 2013 12:11:08 -0500 Subject: [PATCH 107/140] stripped label stuff from hashtable.cc --- lib/hashtable.cc | 350 ----------------------------------------------- 1 file changed, 350 deletions(-) diff --git a/lib/hashtable.cc b/lib/hashtable.cc index 7f4ea47d90..16b463a879 100644 --- a/lib/hashtable.cc +++ b/lib/hashtable.cc @@ -1924,354 +1924,4 @@ void Hashtable::extract_unique_paths(std::string seq, } } } -/* - * Pretty much copy-pasta - * @cswelcher - * Might be time for a refactor: could do a general consume_fasta - * function which accepts a consume_sequence function pointer as a parameter - */ - -void -Hashtable::consume_fasta_and_tag_with_labels( - std:: string const &filename, - unsigned int &total_reads, unsigned long long &n_consumed, - CallbackFn callback, void * callback_data -) -{ - khmer:: Config &the_config = khmer:: get_active_config( ); - - // Note: Always assume only 1 thread if invoked this way. - IParser * parser = - IParser::get_parser( - filename, 1, the_config.get_reads_input_buffer_size( ), - the_config.get_reads_parser_trace_level( ) - ); - - - consume_fasta_and_tag_with_labels( - parser, - total_reads, n_consumed, - callback, callback_data - ); - - delete parser; -} - -void -Hashtable::consume_fasta_and_tag_with_labels( - read_parsers:: IParser * parser, - unsigned int &total_reads, unsigned long long &n_consumed, - CallbackFn callback, void * callback_data - ) - { - Hasher &hasher = - _get_hasher( parser->uuid( ) ); - unsigned int total_reads_LOCAL = 0; - #if (0) // Note: Used with callback - currently disabled. - unsigned long long int n_consumed_LOCAL = 0; - #endif - Read read; - - // TODO? Delete the following assignments. 
- total_reads = 0; - n_consumed = 0; - - hasher.trace_logger( - TraceLogger:: TLVL_DEBUG2, - "Starting trace of 'consume_fasta_and_tag'....\n" - ); - - Label _tag_label = 0; - - Label * the_label; - // Iterate through the reads and consume their k-mers. - while (!parser->is_complete( )) - { - unsigned long long this_n_consumed = 0; - - read = parser->get_next_read( ); - - if (check_and_normalize_read( read.sequence )) - { - // TODO: make threadsafe! - the_label = check_and_allocate_label(_tag_label); - consume_sequence_and_tag_with_labels( read.sequence, - this_n_consumed, - *the_label ); - _tag_label++; - - #ifdef WITH_INTERNAL_METRICS - hasher.pmetrics.start_timers( ); - #endif - #if (0) // Note: Used with callback - currently disabled. - n_consumed_LOCAL = __sync_add_and_fetch( &n_consumed, this_n_consumed ); - #else - __sync_add_and_fetch( &n_consumed, this_n_consumed ); - #endif - total_reads_LOCAL = __sync_add_and_fetch( &total_reads, 1 ); - #ifdef WITH_INTERNAL_METRICS - hasher.pmetrics.stop_timers( ); - hasher.pmetrics.accumulate_timer_deltas( - (uint32_t)HashTablePerformanceMetrics:: MKEY_TIME_UPDATE_TALLIES - ); - #endif - } - - if (0 == (total_reads_LOCAL % 10000)) - hasher.trace_logger( - TraceLogger:: TLVL_DEBUG3, - "Total number of reads processed: %llu\n", - (unsigned long long int)total_reads_LOCAL - ); - - // TODO: Figure out alternative to callback into Python VM - // Cannot use in multi-threaded operation. - #if (0) - // run callback, if specified - if (total_reads_TL % CALLBACK_PERIOD == 0 && callback) { - std::cout << "n tags: " << all_tags.size() << "\n"; - try { - callback("consume_fasta_and_tag", callback_data, total_reads_TL, - n_consumed); - } catch (...) 
{ - delete parser; - throw; - } - } - #endif // 0 - - } // while reads left for parser - - } - -void Hashtable::consume_partitioned_fasta_and_tag_with_labels(const std::string &filename, - unsigned int &total_reads, - unsigned long long &n_consumed, - CallbackFn callback, - void * callback_data) -{ - total_reads = 0; - n_consumed = 0; - - IParser* parser = IParser::get_parser(filename.c_str()); - Read read; - - string seq = ""; - - // reset the master subset partition - delete partition; - partition = new SubsetPartition(this); - - // - // iterate through the FASTA file & consume the reads. - // - Label * c; - PartitionID p; - while(!parser->is_complete()) { - read = parser->get_next_read(); - seq = read.sequence; - - if (check_and_normalize_read(seq)) { - // First, figure out what the partition is (if non-zero), and save that. - p = _parse_partition_id(read.name); - c = check_and_allocate_label(p); - - consume_sequence_and_tag_with_labels( seq, - n_consumed, - *c ); - } - - // reset the sequence info, increment read number - total_reads++; - - // run callback, if specified - if (total_reads % CALLBACK_PERIOD == 0 && callback) { - try { - callback("consume_partitioned_fasta_and_tag_with_labels", callback_data, - total_reads, n_consumed); - } catch (...) { - delete parser; - throw; - } - } - } - - // @cswelcher TODO: check that deallocate LabelPtrMap is correct - delete parser; -} - -// @cswelcher: double-check -- is it valid to pull the address from a reference? -void Hashtable::link_tag_and_label(HashIntoType& kmer, Label& kmer_label) { - tag_labels.insert(TagLabelPtrPair(kmer, &kmer_label)); - label_tag_ptrs.insert(LabelTagPtrPair(kmer_label, &kmer)); -} - -/* This is essentially the same code as above, only it assigns labels to the - * tags through multimap TagLabelMap defined in hashtable.hh, declared in - * hashbits.hh - * @cswelcher TODO: should I instead send in the pointer to the new label? 
- */ -void Hashtable::consume_sequence_and_tag_with_labels(const std::string& seq, - unsigned long long& n_consumed, - Label& current_label, - SeenSet * found_tags) - { - bool is_new_kmer; - bool kmer_tagged; - - KMerIterator kmers(seq.c_str(), _ksize); - HashIntoType kmer; - - unsigned int since = _tag_density / 2 + 1; - - while(!kmers.done()) { - kmer = kmers.next(); - - if ((is_new_kmer = test_and_set_bits( kmer ))) - ++n_consumed; - - #if (1) - if (is_new_kmer) { - ++since; - } else { - ACQUIRE_ALL_TAGS_SPIN_LOCK - kmer_tagged = set_contains(all_tags, kmer); - RELEASE_ALL_TAGS_SPIN_LOCK - if (kmer_tagged) { - since = 1; - - // Labeling code - // TODO: MAKE THREADSAFE! - - if (!_cmap_contains_label(tag_labels, kmer, current_label)) { - ACQUIRE_TAG_COLORS_SPIN_LOCK - link_tag_and_label(kmer, current_label); - RELEASE_TAG_COLORS_SPIN_LOCK - } - if (found_tags) { - found_tags->insert(kmer); - } - } else ++since; - } - #else - if (!is_new_kmer && set_contains(all_tags, kmer)) { - since = 1; - if (found_tags) { found_tags->insert(kmer); } - } else { - since++; - } - #endif - // - if (since >= _tag_density) { - ACQUIRE_ALL_TAGS_SPIN_LOCK - all_tags.insert(kmer); - RELEASE_ALL_TAGS_SPIN_LOCK - - // Labeling code - // TODO: MAKE THREADSAFE! - ACQUIRE_TAG_COLORS_SPIN_LOCK - link_tag_and_label(kmer, current_label); - RELEASE_TAG_COLORS_SPIN_LOCK - - if (found_tags) { found_tags->insert(kmer); } - since = 1; - } - - } // iteration over kmers - - if (since >= _tag_density/2 - 1) { - ACQUIRE_ALL_TAGS_SPIN_LOCK - all_tags.insert(kmer); // insert the last k-mer, too. - RELEASE_ALL_TAGS_SPIN_LOCK - - // Label code: TODO: MAKE THREADSAFE! 
- link_tag_and_label(kmer, current_label); - - if (found_tags) { found_tags->insert(kmer); } - } - } -/* - * Find all labels associated with the sequence - * For now, check /every/ k-mer with find_all_tags - * THIS SUCKS AND IT'S YOUR FAULT @CTB - */ -unsigned int Hashtable::sweep_sequence_for_labels(const std::string& seq, - LabelPtrSet& found_labels, - bool break_on_stoptags, - bool stop_big_traversals) { - - SeenSet tagged_kmers; - //LabelPtrSet found_labels; - - HashIntoType kmer_f, kmer_r, kmer; - - KMerIterator kmers(seq.c_str(), _ksize); - std::string kmer_s; - // keep a list of kmers which have already been traversed - SeenSet traversed_kmers; - while (!kmers.done()) { - kmer = kmers.next(); - kmer_s = _revhash(kmer, _ksize); - _hash(kmer_s.c_str(), _ksize, kmer_f, kmer_r); - - // don't even try traversing from k-mers not in the hashtable - //traversed_kmers.clear(); - if (get_count(uniqify_rc(kmer_f,kmer_r))) { - partition->find_all_tags(kmer_f, kmer_r, tagged_kmers, - all_tags, break_on_stoptags, stop_big_traversals); - traverse_labels_and_resolve(tagged_kmers, found_labels); - } - } - return traversed_kmers.size(); -} - -unsigned int Hashtable::sweep_label_neighborhood(const std::string& seq, - LabelPtrSet& found_labels, - unsigned int range, - bool break_on_stoptags, - bool stop_big_traversals) { - - SeenSet tagged_kmers; - unsigned int num_traversed; - num_traversed = partition->sweep_for_tags(seq, tagged_kmers, all_tags, - range, break_on_stoptags, stop_big_traversals); - traverse_labels_and_resolve(tagged_kmers, found_labels); - //printf("range=%u ", range); - if (range == 0) { - assert(num_traversed == seq.length()-ksize()+1); - } - tagged_kmers.clear(); - return num_traversed; -} - -LabelPtrSet Hashtable::get_tag_labels(const HashIntoType& tag) { - LabelPtrSet labels; - unsigned int num_labels; - _get_tag_labels(tag, tag_labels, labels); - return labels; -} - -TagPtrSet Hashtable::get_label_tags(const Label& label) { - TagPtrSet tags; - unsigned 
int num_tags; - _get_tags_from_label(label, label_tag_ptrs, tags); - return tags; -} - -void Hashtable::traverse_labels_and_resolve(const SeenSet& tagged_kmers, - LabelPtrSet& found_labels) { - - SeenSet::const_iterator si; - unsigned int num_labels = 0; - for (si=tagged_kmers.begin(); si!=tagged_kmers.end(); ++si) { - HashIntoType tag = *si; - // get the labels associated with this tag - num_labels = _get_tag_labels(tag, tag_labels, found_labels); - if (num_labels > 1) { - // reconcile labels - // for now do nothing ha - } - } -} - // vim: set sts=2 sw=2: From 6ee7f91c26e4e4414657f56da9f18e3ab1649ee5 Mon Sep 17 00:00:00 2001 From: Camille Scott Date: Tue, 17 Dec 2013 13:03:05 -0500 Subject: [PATCH 108/140] switched include ordering back --- khmer/_khmermodule.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/khmer/_khmermodule.cc b/khmer/_khmermodule.cc index cef9f7d724..4757df82d2 100644 --- a/khmer/_khmermodule.cc +++ b/khmer/_khmermodule.cc @@ -17,8 +17,8 @@ #include "khmer_config.hh" #include "ktable.hh" #include "hashtable.hh" -#include "counting.hh" #include "hashbits.hh" +#include "counting.hh" #include "storage.hh" #include "aligner.hh" #include "labelhash.hh" @@ -4933,13 +4933,14 @@ init_khmer(void) return; } // add LabelHash + khmer_KLabelHashType.tp_base = &khmer_KHashbitsType; khmer_KLabelHashType.tp_new = khmer_labelhash_new; if (PyType_Ready(&khmer_KLabelHashType) < 0) { std::cout << "_khmer.KLabelHashType failed PyType_Ready" << std::endl; return; } - + PyObject * m; m = Py_InitModule3( "_khmer", KhmerMethods, "interface for the khmer module low-level extensions" ); From 7e8a1ac297b39e42cbadc93fdc0682ed3e7e943c Mon Sep 17 00:00:00 2001 From: Camille Scott Date: Tue, 17 Dec 2013 14:16:07 -0500 Subject: [PATCH 109/140] added setup.py which had been left out (still doesn't work) --- setup.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index ed76f07837..827bd02077 100755 --- 
a/setup.py +++ b/setup.py @@ -61,7 +61,7 @@ build_depends.extend(map( lambda bn: path_join("lib", bn + ".hh"), [ - "storage", "khmer", "khmer_config", "ktable", "hashtable", "counting", + "storage", "khmer", "khmer_config", "ktable", "hashtable", "counting", "hashbits", ] )) @@ -70,9 +70,9 @@ lambda bn: path_join("lib", bn + ".cc"), [ "khmer_config", "thread_id_map", "trace_logger", "perf_metrics", - "read_parsers", "ktable", "hashtable", "hashbits", "counting", - "subset", "aligner", "scoringmatrix", "node", "kmer", - ] + "read_parsers", "ktable", "hashtable", "hashbits", "labelhash", "counting", + "subset", "aligner", "scoringmatrix", "node", "kmer", + ] )) extension_mod_DICT = \ @@ -103,7 +103,7 @@ "long_description": open("README.rst").read(), "author": 'Michael R. Crusoe, Greg Edvenson, Jordan Fish,' ' Adina Howe, Eric McDonald, Joshua Nahum, Kaben Nanlohy,' - ' Jason Pell, Jared Simpson, C. S. Welcher,' + ' Jason Pell, Jared Simpson, Camille Scott,' ' Qingpeng Zhang, and C. Titus Brown', "author_email": 'khmer-project@idyll.org', #"maintainer": 'Michael R. 
Crusoe', # this overrides the author field From 5bbb493e7a58b75ccdfe5cd0a67124d507da151e Mon Sep 17 00:00:00 2001 From: Camille Scott Date: Tue, 17 Dec 2013 14:47:12 -0500 Subject: [PATCH 110/140] commented out undefined destructor --- lib/labelhash.hh | 3 +-- setup.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/lib/labelhash.hh b/lib/labelhash.hh index 374ff02d78..3133a5c70b 100644 --- a/lib/labelhash.hh +++ b/lib/labelhash.hh @@ -75,12 +75,11 @@ namespace khmer { LabelHash( WordLength ksize, std::vector& tablesizes) : khmer::Hashbits(ksize, tablesizes) { - // constructor _tag_labels_spin_lock = 0; } - ~LabelHash(); + //~LabelHash(); TagLabelPtrMap tag_labels; LabelTagPtrMap label_tag_ptrs; diff --git a/setup.py b/setup.py index 827bd02077..bf8bc05272 100755 --- a/setup.py +++ b/setup.py @@ -61,7 +61,7 @@ build_depends.extend(map( lambda bn: path_join("lib", bn + ".hh"), [ - "storage", "khmer", "khmer_config", "ktable", "hashtable", "counting", "hashbits", + "storage", "khmer", "khmer_config", "ktable", "hashtable", "counting", "hashbits", "labelhash", ] )) From b07e3f0b169439cdc4df5686cc46986d5ef9eba8 Mon Sep 17 00:00:00 2001 From: Camille Scott Date: Tue, 17 Dec 2013 14:53:25 -0500 Subject: [PATCH 111/140] fixed namespaces --- lib/labelhash.cc | 2 -- lib/labelhash.hh | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/lib/labelhash.cc b/lib/labelhash.cc index b3d9a15ea8..d4e0a29627 100644 --- a/lib/labelhash.cc +++ b/lib/labelhash.cc @@ -6,9 +6,7 @@ #include "labelhash.hh" -using namespace std; using namespace khmer; -using namespace khmer:: read_parsers; /* * @camillescott diff --git a/lib/labelhash.hh b/lib/labelhash.hh index 3133a5c70b..55f0a43ac1 100644 --- a/lib/labelhash.hh +++ b/lib/labelhash.hh @@ -79,7 +79,7 @@ namespace khmer { } - //~LabelHash(); + ~LabelHash() {}; TagLabelPtrMap tag_labels; LabelTagPtrMap label_tag_ptrs; From a2fd39ae5f4f077970ddd91e6f1d9b53b571d295 Mon Sep 17 00:00:00 2001 From: Camille Scott 
Date: Tue, 17 Dec 2013 14:56:44 -0500 Subject: [PATCH 112/140] now the namespaces are nice fishjord --- lib/labelhash.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/labelhash.cc b/lib/labelhash.cc index d4e0a29627..7093760bcc 100644 --- a/lib/labelhash.cc +++ b/lib/labelhash.cc @@ -7,6 +7,7 @@ #include "labelhash.hh" using namespace khmer; +using namespace khmer:: read_parsers; /* * @camillescott @@ -139,7 +140,7 @@ void LabelHash::consume_partitioned_fasta_and_tag_with_labels(const std::string IParser* parser = IParser::get_parser(filename.c_str()); Read read; - string seq = ""; + std::string seq = ""; // reset the master subset partition delete partition; From 1ac3d6f59e9fdc207a71837ea11a5458b37617d0 Mon Sep 17 00:00:00 2001 From: Camille Scott Date: Wed, 18 Dec 2013 01:18:38 -0500 Subject: [PATCH 113/140] stripped final remnants of labeling code out of hashtable --- khmer/__init__.py | 13 +++++++++++-- khmer/_khmermodule.cc | 14 ++++++++------ lib/labelhash.cc | 3 ++- 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/khmer/__init__.py b/khmer/__init__.py index 01683832e0..04f2038f0f 100644 --- a/khmer/__init__.py +++ b/khmer/__init__.py @@ -15,7 +15,7 @@ from _khmer import reverse_hash from _khmer import get_config from _khmer import ReadParser -from _khmer import LabelHash +from _khmer import _LabelHash from ._version import get_versions __version__ = get_versions()['version'] @@ -29,7 +29,6 @@ def new_hashbits(k, starting_size, n_tables=2): return _new_hashbits(k, primes) - def new_counting_hash(k, starting_size, n_tables=2, n_threads=1): primes = get_n_primes_above_x(n_tables, starting_size) @@ -149,4 +148,14 @@ def get_n_primes_above_x(n, x): i += 2 return primes +class LabelHash(_LabelHash): + def __new__(cls, k, starting_size, n_tables): + print "** LabelHash __new__" + print "\t*** Getting primes..." + primes = get_n_primes_above_x(n_tables, starting_size) + print "\t*** Invoking parent..." 
+ c = _LabelHash.__new__(cls, k, primes) + print "\t*** Done with parent, returning class object" + c.primes = primes + return c diff --git a/khmer/_khmermodule.cc b/khmer/_khmermodule.cc index 4757df82d2..e1b85d3f54 100644 --- a/khmer/_khmermodule.cc +++ b/khmer/_khmermodule.cc @@ -4215,8 +4215,8 @@ static PyObject * khmer_labelhash_new(PyTypeObject *type, PyObject *args, PyObje // We want the hashbits pointer in the base class to point to our labelhash, // so that the KHashbits methods are called on the correct object (a LabelHash) - self->khashbits.hashbits = (khmer::Hashbits *)self->labelhash; self->labelhash = new khmer::LabelHash(k, sizes); + self->khashbits.hashbits = (khmer::Hashbits *)self->labelhash; } return (PyObject *) self; @@ -4226,6 +4226,7 @@ static int khmer_labelhash_init(khmer_KLabelHashObject * self, PyObject *args, P { if (khmer_KHashbitsType.tp_init((PyObject *)self, args, kwds) < 0) return -1; + std::cout << "testing my pointer ref to hashbits: " << self->khashbits.hashbits->n_tags() << std::endl; return 0; } @@ -4306,24 +4307,25 @@ static PyObject * labelhash_consume_partitioned_fasta_and_tag_with_labels( static PyObject * labelhash_consume_sequence_and_tag_with_labels(PyObject * self, PyObject * args) { khmer_KLabelHashObject * me = (khmer_KLabelHashObject *) self; khmer::LabelHash * hb = me->labelhash; - + std::cout << "inside labelhash consume cpython func, parsing args..." << std::endl; char * seq = NULL; unsigned long long c = NULL; if (!PyArg_ParseTuple(args, "sK", &seq, &c)) { return NULL; } - + std::cout << "parsed args, getting new label" << std::endl; unsigned long long n_consumed = 0; khmer::Label * the_label = hb->check_and_allocate_label(c); try { //if (hb->check_and_normalize_read(seq)) { - + std::cout << "calling low level consume func on labelhash..." 
<< std::endl; hb->consume_sequence_and_tag_with_labels(seq, n_consumed, *the_label); //} } catch (_khmer_signal &e) { return NULL; } + std::cout << "packaging return value and returning!" << std::endl; return Py_BuildValue("L", n_consumed); } @@ -4510,7 +4512,7 @@ khmer_labelhash_getattr(PyObject * obj, char * name) static PyTypeObject khmer_KLabelHashType = { PyObject_HEAD_INIT(NULL) 0, /* ob_size */ - "LabelHash", /* tp_name */ + "_LabelHash", /* tp_name */ sizeof(khmer_KLabelHashObject), /* tp_basicsize */ 0, /* tp_itemsize */ (destructor)khmer_labelhash_dealloc, /* tp_dealloc */ @@ -4976,7 +4978,7 @@ init_khmer(void) PyModule_AddObject(m, "Hashbits", (PyObject *)&khmer_KHashbitsType); Py_INCREF(&khmer_KLabelHashType); - PyModule_AddObject(m, "LabelHash", (PyObject *)&khmer_KLabelHashType); + PyModule_AddObject(m, "_LabelHash", (PyObject *)&khmer_KLabelHashType); } // vim: set ft=cpp sts=4 sw=4 tw=79: diff --git a/lib/labelhash.cc b/lib/labelhash.cc index 7093760bcc..7577abd453 100644 --- a/lib/labelhash.cc +++ b/lib/labelhash.cc @@ -195,6 +195,8 @@ void LabelHash::consume_sequence_and_tag_with_labels(const std::string& seq, Label& current_label, SeenSet * found_tags) { + + std::cout << "inside low-level labelhash consume sequence function" << std::endl; bool is_new_kmer; bool kmer_tagged; @@ -272,7 +274,6 @@ void LabelHash::consume_sequence_and_tag_with_labels(const std::string& seq, /* * Find all labels associated with the sequence * For now, check /every/ k-mer with find_all_tags - * THIS SUCKS AND IT'S YOUR FAULT @CTB */ unsigned int LabelHash::sweep_sequence_for_labels(const std::string& seq, LabelPtrSet& found_labels, From 7478d691fc1b374493abdb854b90908f7fea5b3c Mon Sep 17 00:00:00 2001 From: Camille Scott Date: Wed, 18 Dec 2013 01:19:04 -0500 Subject: [PATCH 114/140] some inlined functions in hashtable.hh removed --- lib/hashtable.hh | 59 +----------------------------------------------- 1 file changed, 1 insertion(+), 58 deletions(-) diff --git 
a/lib/hashtable.hh b/lib/hashtable.hh index a306816581..3d8ec0f72b 100644 --- a/lib/hashtable.hh +++ b/lib/hashtable.hh @@ -180,59 +180,7 @@ namespace khmer { WordLength _ksize; HashIntoType bitmask; - unsigned int _nbits_sub_1; - - // Does the given tag already have the given label? - bool _cmap_contains_label(const TagLabelPtrMap& cmap, - HashIntoType& kmer, - Label& the_label) - { - std::pair ret; - ret = cmap.equal_range(kmer); - for (TagLabelPtrMap::const_iterator it=ret.first; it!=ret.second; ++it) { - if (*(it->second) == the_label) return true; - } - return false; - } - - // Does the given label already have a tag associated with it? - bool _cmap_contains_tag(const LabelTagPtrMap& cmap, - Label& the_label, - HashIntoType& kmer) { - std::pair ret; - ret = cmap.equal_range(the_label); - for (LabelTagPtrMap::const_iterator it=ret.first; it!=ret.second; ++it) { - if(*(it->second) == kmer) return true; - } - return false; - } - - unsigned int _get_tag_labels(const HashIntoType& tag, - const TagLabelPtrMap& cmap, - LabelPtrSet& found_labels) { - unsigned int num_labels = 0; - std::pair ret; - ret = cmap.equal_range(tag); - for (TagLabelPtrMap::const_iterator it=ret.first; it!=ret.second; ++it) { - found_labels.insert(it->second); - ++num_labels; - } - return num_labels; - } - - unsigned int _get_tags_from_label(const Label& label, - const LabelTagPtrMap& cmap, - TagPtrSet& labeled_tags) { - unsigned int num_tags = 0; - std::pair ret; - ret = cmap.equal_range(label); - for (LabelTagPtrMap::const_iterator it=ret.first; it!=ret.second; ++it) { - labeled_tags.insert(it->second); - ++num_tags; - } - return num_tags; - } - + unsigned int _nbits_sub_1; Hashtable( WordLength ksize, @@ -253,7 +201,6 @@ namespace khmer { partition = new SubsetPartition(this); _init_bitstuff(); _all_tags_spin_lock = 0; - _tag_labels_spin_lock = 0; } @@ -367,15 +314,11 @@ namespace khmer { } uint32_t _all_tags_spin_lock; - uint32_t _tag_labels_spin_lock; public: SubsetPartition * partition; 
SeenSet all_tags; SeenSet stop_tags; SeenSet repart_small_tags; - TagLabelPtrMap tag_labels; - LabelTagPtrMap label_tag_ptrs; - LabelPtrMap label_ptrs; // accessor to get 'k' const WordLength ksize() const { return _ksize; } From e75a83816340977ccf9dde2b46c31b0c9323f167 Mon Sep 17 00:00:00 2001 From: Camille Scott Date: Fri, 20 Dec 2013 10:10:17 -0500 Subject: [PATCH 115/140] narrowing down freezup: all_tags spinlock in labelhash --- lib/labelhash.cc | 35 ++++++++++++---- lib/labelhash.hh | 1 + lib/test-Colors.cc | 101 +++++++-------------------------------------- 3 files changed, 42 insertions(+), 95 deletions(-) diff --git a/lib/labelhash.cc b/lib/labelhash.cc index 7577abd453..761b6bdeb8 100644 --- a/lib/labelhash.cc +++ b/lib/labelhash.cc @@ -6,6 +6,9 @@ #include "labelhash.hh" +#define LABEL_DBG 1 +#define printdbg(m) if(LABEL_DBG) std::cout << #m << std::endl; + using namespace khmer; using namespace khmer:: read_parsers; @@ -186,8 +189,10 @@ void LabelHash::consume_partitioned_fasta_and_tag_with_labels(const std::string // @cswelcher: double-check -- is it valid to pull the address from a reference? 
void LabelHash::link_tag_and_label(HashIntoType& kmer, Label& kmer_label) { - tag_labels.insert(TagLabelPtrPair(kmer, &kmer_label)); - label_tag_ptrs.insert(LabelTagPtrPair(kmer_label, &kmer)); + printdbg(linking tag and label) + tag_labels.insert(TagLabelPtrPair(kmer, &kmer_label)); + label_tag_ptrs.insert(LabelTagPtrPair(kmer_label, &kmer)); + printdbg(done linking tag and label) } void LabelHash::consume_sequence_and_tag_with_labels(const std::string& seq, @@ -196,7 +201,8 @@ void LabelHash::consume_sequence_and_tag_with_labels(const std::string& seq, SeenSet * found_tags) { - std::cout << "inside low-level labelhash consume sequence function" << std::endl; + printdbg(inside low-level labelhash consume sequence function) + bool is_new_kmer; bool kmer_tagged; @@ -204,35 +210,45 @@ void LabelHash::consume_sequence_and_tag_with_labels(const std::string& seq, HashIntoType kmer; unsigned int since = _tag_density / 2 + 1; - + + printdbg(entering while loop) while(!kmers.done()) { kmer = kmers.next(); if ((is_new_kmer = test_and_set_bits( kmer ))) ++n_consumed; + printdbg(test_and_set_bits) #if (1) if (is_new_kmer) { + printdbg(new kmer...) ++since; } else { + printdbg(entering tag spin lock) ACQUIRE_ALL_TAGS_SPIN_LOCK kmer_tagged = set_contains(all_tags, kmer); RELEASE_ALL_TAGS_SPIN_LOCK + printdbg(released tag spin lock) if (kmer_tagged) { since = 1; - + printdbg(kmer already in all_tags) // Labeling code // TODO: MAKE THREADSAFE! if (!_cmap_contains_label(tag_labels, kmer, current_label)) { + printdbg(tag was not labeled: adding to labels...) 
ACQUIRE_TAG_COLORS_SPIN_LOCK link_tag_and_label(kmer, current_label); RELEASE_TAG_COLORS_SPIN_LOCK + printdbg(released label spin lock) } if (found_tags) { found_tags->insert(kmer); } - } else ++since; + } else { + printdbg(inc since var) + ++since; + } } #else if (!is_new_kmer && set_contains(all_tags, kmer)) { @@ -244,9 +260,12 @@ void LabelHash::consume_sequence_and_tag_with_labels(const std::string& seq, #endif // if (since >= _tag_density) { + printdbg(exceeded tag density: drop a tag and label -- getting tag lock) ACQUIRE_ALL_TAGS_SPIN_LOCK + printdbg(in tag spin lock) all_tags.insert(kmer); RELEASE_ALL_TAGS_SPIN_LOCK + printdbg(released tag spin lock) // Labeling code // TODO: MAKE THREADSAFE! @@ -257,9 +276,9 @@ void LabelHash::consume_sequence_and_tag_with_labels(const std::string& seq, if (found_tags) { found_tags->insert(kmer); } since = 1; } - + printdbg(moving to next iter) } // iteration over kmers - + printdbg(finished iteration: dropping last tag) if (since >= _tag_density/2 - 1) { ACQUIRE_ALL_TAGS_SPIN_LOCK all_tags.insert(kmer); // insert the last k-mer, too. diff --git a/lib/labelhash.hh b/lib/labelhash.hh index 55f0a43ac1..755054d433 100644 --- a/lib/labelhash.hh +++ b/lib/labelhash.hh @@ -76,6 +76,7 @@ namespace khmer { : khmer::Hashbits(ksize, tablesizes) { _tag_labels_spin_lock = 0; + _all_tags_spin_lock = 0; } diff --git a/lib/test-Colors.cc b/lib/test-Colors.cc index 6da9e7e500..dbd8a2dc02 100644 --- a/lib/test-Colors.cc +++ b/lib/test-Colors.cc @@ -1,93 +1,20 @@ -// -// This file is part of khmer, http://github.com/ged-lab/khmer/, and is -// Copyright (C) Michigan State University, 2009-2013. It is licensed under -// the three-clause BSD license; see doc/LICENSE.txt. Contact: ctb@msu.edu -// +#include "khmer.hh" +#include "hashtable.hh" +#include "hashbits.hh" +#include "labelhash.hh" +#include -// Simple C++ implementation of the 'load-graph' Python script. 
- - -#include -#include -#include -#include -#include -#include -#include -#include - -//#define HASH_TYPE_TO_TEST 1 // Counting Hash -#define HASH_TYPE_TO_TEST 2 // Bit Hash - -// #define OUTPUT_HASHTABLE - - -#include "error.hh" -#include "read_parsers.hh" -#if HASH_TYPE_TO_TEST == 1 -# include "counting.hh" -#elif HASH_TYPE_TO_TEST == 2 -# include "hashbits.hh" -#else -# error "No HASH_TYPE_TO_TEST macro defined." -#endif -#include "primes.hh" - -using namespace std; using namespace khmer; -using namespace khmer:: read_parsers; - - +int main() { + HashIntoType sizes[] = { 100000003, 100000004, 100000007, 10000000011}; + std::vector sizes_vec (sizes, sizes + sizeof(sizes) / sizeof(HashIntoType) ); -int main( int argc, char * argv[ ] ) -{ - unsigned long kmer_length = 20; - float ht_size_FP = 1.0E8; - unsigned long ht_count = 4; - uint64_t cache_size = 4L * 1024 * 1024 * 1024; - unsigned int range = 82; - int rc = 0; - int opt = -1; - char * conv_residue = NULL; - string rfile_name = "/w/tag_coloring/test_reads.fq"; - string ifile_name = "/w/tag_coloring/petMar_test.fp"; - // FILE * ofile = NULL; - HashIntoType ht_size = (HashIntoType)ht_size_FP; - Primes primetab( ht_size ); - vector ht_sizes; - for ( unsigned int i = 0; i < ht_count; ++i ) - ht_sizes.push_back( primetab.get_next_prime( ) ); + khmer::LabelHash * lh_pointer = new khmer::LabelHash(20, sizes_vec); + khmer::Hashbits * hb_pointer = (khmer::Hashbits *)lh_pointer; - unsigned int reads_total = 0; - unsigned long long int n_consumed = 0; - printf("consuming test fastp...\n"); - Hashbits ht( kmer_length, ht_sizes ); - ht.consume_partitioned_fasta_and_tag_with_colors( ifile_name, reads_total, n_consumed ); - printf("consume %u sequences, graph has %u colors\n", reads_total, ht.n_colors()); - IParser * parser = IParser:: get_parser(rfile_name.c_str()); - Read read; - unsigned int num_traversed; - unsigned int num_reads = 0; - string seq = ""; - clock_t st = clock(); - while(!parser->is_complete()) { - read = 
parser->get_next_read(); - seq = read.sequence; - ColorPtrSet found_colors; - num_traversed = ht.sweep_color_neighborhood(seq, found_colors, range, false, false); - if (num_reads % 50000 == 0) { - st = clock() - st; - printf("traversed %u kmers in %d ticks (%f seconds)\n", num_traversed, - st, - ((float)st/CLOCKS_PER_SEC)); - st = clock(); - } - found_colors.clear(); - num_reads++; - } - return rc; + std::cout << "lh_pointer n_tags: " << lh_pointer->n_tags() << std::endl; + std::cout << "hb_pointer n_tags: " << hb_pointer->n_tags() << std::endl; + + return 0; } - - -// vim: set sts=4 sw=4 tw=80: From 13bff6ebca93f9f16017823b12769c5b23de9ffa Mon Sep 17 00:00:00 2001 From: Camille Scott Date: Fri, 20 Dec 2013 15:09:46 -0500 Subject: [PATCH 116/140] tracked down SIGBUS error to labelhash_dealloc function not calling tp_free, fixed --- khmer/_khmermodule.cc | 22 +++--- lib/labelhash.cc | 34 +++++---- lib/labelhash.hh | 2 +- scripts/sweep-reads-by-partition-buffered.py | 2 +- tests/test_labelhash.py | 72 +++++++++++--------- 5 files changed, 76 insertions(+), 56 deletions(-) diff --git a/khmer/_khmermodule.cc b/khmer/_khmermodule.cc index e1b85d3f54..0b7b31ba58 100644 --- a/khmer/_khmermodule.cc +++ b/khmer/_khmermodule.cc @@ -1835,7 +1835,7 @@ static PyObject * hash_abundance_distribution(PyObject * self, PyObject * args) return NULL; } - assert(is_hashbits_obj(tracking_obj)); + //assert(is_hashbits_obj(tracking_obj)); khmer_KHashbitsObject * tracking_o = (khmer_KHashbitsObject *) tracking_obj; khmer::Hashbits * hashbits = tracking_o->hashbits; @@ -1875,7 +1875,7 @@ static PyObject * hash_abundance_distribution_with_reads_parser(PyObject * self, khmer:: read_parsers:: IParser * rparser = _PyObject_to_khmer_ReadParser(rparser_obj); - assert(is_hashbits_obj(tracking_obj)); + //assert(is_hashbits_obj(tracking_obj)); khmer_KHashbitsObject * tracking_o = (khmer_KHashbitsObject *) tracking_obj; khmer::Hashbits * hashbits = tracking_o->hashbits; @@ -4182,13 +4182,15 @@ static 
PyObject * khmer_labelhash_new(PyTypeObject * type, PyObject *args, PyObj // khmer_labelhash_dealloc -- clean up a labelhash object. // -static void khmer_labelhash_dealloc(PyObject* self) +static void khmer_labelhash_dealloc(PyObject* obj) { - khmer_KLabelHashObject * obj = (khmer_KLabelHashObject *) self; - delete obj->labelhash; - obj->labelhash = NULL; + khmer_KLabelHashObject * self = (khmer_KLabelHashObject *) obj; + + delete self->labelhash; + self->labelhash = NULL; - PyObject_Del((PyObject *) obj); + obj->ob_type->tp_free((PyObject*)self); + //PyObject_Del((PyObject *) obj); } // a little wierd; we don't actually want to call Hashbits' new method. Rather, we @@ -4204,6 +4206,7 @@ static PyObject * khmer_labelhash_new(PyTypeObject *type, PyObject *args, PyObje PyObject* sizes_list_o = NULL; if (!PyArg_ParseTuple(args, "IO", &k, &sizes_list_o)) { + Py_DECREF(self); return NULL; } @@ -4227,6 +4230,8 @@ static int khmer_labelhash_init(khmer_KLabelHashObject * self, PyObject *args, P if (khmer_KHashbitsType.tp_init((PyObject *)self, args, kwds) < 0) return -1; std::cout << "testing my pointer ref to hashbits: " << self->khashbits.hashbits->n_tags() << std::endl; + std::cout << "hashbits: " << self->khashbits.hashbits << std::endl; + std::cout << "labelhash: " << self->labelhash << std::endl; return 0; } @@ -4298,9 +4303,10 @@ static PyObject * labelhash_consume_partitioned_fasta_and_tag_with_labels( labelhash->consume_partitioned_fasta_and_tag_with_labels(filename, total_reads, n_consumed, _report_fn, callback_obj); } catch (_khmer_signal &e) { + std::cout << "caught exception in consume_partitioned_fasta_and_tag_with_labels!" << std::endl; return NULL; } - + std::cout << "building value for return..." 
<< std::endl; return Py_BuildValue("iK", total_reads, n_consumed); } diff --git a/lib/labelhash.cc b/lib/labelhash.cc index 761b6bdeb8..ebeb6554b7 100644 --- a/lib/labelhash.cc +++ b/lib/labelhash.cc @@ -6,7 +6,7 @@ #include "labelhash.hh" -#define LABEL_DBG 1 +#define LABEL_DBG 0 #define printdbg(m) if(LABEL_DBG) std::cout << #m << std::endl; using namespace khmer; @@ -146,8 +146,8 @@ void LabelHash::consume_partitioned_fasta_and_tag_with_labels(const std::string std::string seq = ""; // reset the master subset partition - delete partition; - partition = new SubsetPartition(this); + //delete partition; + //partition = new SubsetPartition(this); // // iterate through the FASTA file & consume the reads. @@ -160,12 +160,15 @@ void LabelHash::consume_partitioned_fasta_and_tag_with_labels(const std::string if (check_and_normalize_read(seq)) { // First, figure out what the partition is (if non-zero), and save that. + printdbg(parsing partition id) p = _parse_partition_id(read.name); + printdbg(checking label and allocating if necessary) c = check_and_allocate_label(p); - + printdbg(consuming sequence and tagging) consume_sequence_and_tag_with_labels( seq, n_consumed, *c ); + printdbg(back in consume_partitioned) } // reset the sequence info, increment read number @@ -182,9 +185,11 @@ void LabelHash::consume_partitioned_fasta_and_tag_with_labels(const std::string } } } + printdbg(done with while loop in consume_partitioned) // @cswelcher TODO: check that deallocate LabelPtrMap is correct delete parser; + printdbg(deleted parser and exiting) } // @cswelcher: double-check -- is it valid to pull the address from a reference? 
@@ -225,9 +230,9 @@ void LabelHash::consume_sequence_and_tag_with_labels(const std::string& seq, ++since; } else { printdbg(entering tag spin lock) - ACQUIRE_ALL_TAGS_SPIN_LOCK + //ACQUIRE_ALL_TAGS_SPIN_LOCK kmer_tagged = set_contains(all_tags, kmer); - RELEASE_ALL_TAGS_SPIN_LOCK + //RELEASE_ALL_TAGS_SPIN_LOCK printdbg(released tag spin lock) if (kmer_tagged) { since = 1; @@ -237,9 +242,9 @@ void LabelHash::consume_sequence_and_tag_with_labels(const std::string& seq, if (!_cmap_contains_label(tag_labels, kmer, current_label)) { printdbg(tag was not labeled: adding to labels...) - ACQUIRE_TAG_COLORS_SPIN_LOCK + //ACQUIRE_TAG_COLORS_SPIN_LOCK link_tag_and_label(kmer, current_label); - RELEASE_TAG_COLORS_SPIN_LOCK + //RELEASE_TAG_COLORS_SPIN_LOCK printdbg(released label spin lock) } if (found_tags) { @@ -261,17 +266,17 @@ void LabelHash::consume_sequence_and_tag_with_labels(const std::string& seq, // if (since >= _tag_density) { printdbg(exceeded tag density: drop a tag and label -- getting tag lock) - ACQUIRE_ALL_TAGS_SPIN_LOCK + //ACQUIRE_ALL_TAGS_SPIN_LOCK printdbg(in tag spin lock) all_tags.insert(kmer); - RELEASE_ALL_TAGS_SPIN_LOCK + //RELEASE_ALL_TAGS_SPIN_LOCK printdbg(released tag spin lock) // Labeling code // TODO: MAKE THREADSAFE! - ACQUIRE_TAG_COLORS_SPIN_LOCK + //ACQUIRE_TAG_COLORS_SPIN_LOCK link_tag_and_label(kmer, current_label); - RELEASE_TAG_COLORS_SPIN_LOCK + //RELEASE_TAG_COLORS_SPIN_LOCK if (found_tags) { found_tags->insert(kmer); } since = 1; @@ -280,15 +285,16 @@ void LabelHash::consume_sequence_and_tag_with_labels(const std::string& seq, } // iteration over kmers printdbg(finished iteration: dropping last tag) if (since >= _tag_density/2 - 1) { - ACQUIRE_ALL_TAGS_SPIN_LOCK + //ACQUIRE_ALL_TAGS_SPIN_LOCK all_tags.insert(kmer); // insert the last k-mer, too. - RELEASE_ALL_TAGS_SPIN_LOCK + //RELEASE_ALL_TAGS_SPIN_LOCK // Label code: TODO: MAKE THREADSAFE! 
link_tag_and_label(kmer, current_label); if (found_tags) { found_tags->insert(kmer); } } + printdbg(done with low-level consume) } /* * Find all labels associated with the sequence diff --git a/lib/labelhash.hh b/lib/labelhash.hh index 755054d433..6abaa788b1 100644 --- a/lib/labelhash.hh +++ b/lib/labelhash.hh @@ -80,7 +80,7 @@ namespace khmer { } - ~LabelHash() {}; + //~LabelHash() {}; TagLabelPtrMap tag_labels; LabelTagPtrMap label_tag_ptrs; diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py index 8e2cd7c6d9..f153282319 100755 --- a/scripts/sweep-reads-by-partition-buffered.py +++ b/scripts/sweep-reads-by-partition-buffered.py @@ -205,7 +205,7 @@ def main(): output_buffer = ReadBufferManager(max_buffers, max_reads, buf_size, output_pref, outdir) # consume the partitioned fasta with which to label the graph - ht = khmer.new_hashbits(K, HT_SIZE, N_HT) + ht = khmer.LabelHash(K, HT_SIZE, N_HT) print >>sys.stderr, 'consuming fastp...' ht.consume_partitioned_fasta_and_tag_with_labels(input_fastp) diff --git a/tests/test_labelhash.py b/tests/test_labelhash.py index 7993091d74..f5a497edd9 100644 --- a/tests/test_labelhash.py +++ b/tests/test_labelhash.py @@ -48,6 +48,31 @@ def test_get_tag_labels(): assert len(labels) == 1 assert labels.pop() == 0L +def test_consume_fasta_and_tag_with_labels(): + lb = LabelHash(20, 1e7, 4) + read_1 = 'ACGTAACCGGTTAAACCCGGGTTTAAAACCCCGGGGTTTT' + filename = utils.get_test_data('test-transcript.fa') + + total_reads, n_consumed = lb.consume_fasta_and_tag_with_labels(filename) + print "doing get" + assert lb.get(read_1[:20]) + assert total_reads == 3 + print "doing n_labels" + print lb.n_labels() + print "doing label dict" + print lb.get_label_dict() + print "get tagset" + for tag in lb.get_tagset(): + print "forward hash" + print tag, khmer.forward_hash(tag, 20) + for record in screed.open(filename): + print "Sweeping tags" + print lb.sweep_tag_neighborhood(record.sequence, 40) + print 
"Sweeping labels..." + print lb.sweep_label_neighborhood(record.sequence, 40) + assert lb.n_labels() == 3 + + def test_consume_partitioned_fasta_and_tag_with_labels(): lb = LabelHash(20, 1e7, 4) filename = utils.get_test_data('real-partition-small.fa') @@ -63,23 +88,24 @@ def test_consume_partitioned_fasta_and_tag_with_labels(): assert labels.pop() == 2L assert lb.n_labels() == 1 -def test_consume_fasta_and_tag_with_labels(): +def test_sweep_tag_neighborhood(): lb = LabelHash(20, 1e7, 4) - read_1 = 'ACGTAACCGGTTAAACCCGGGTTTAAAACCCCGGGGTTTT' - filename = utils.get_test_data('test-transcript.fa') + filename = utils.get_test_data('single-read.fq') + lb.consume_fasta_and_tag(filename) + + tags = lb.sweep_tag_neighborhood('CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT') + assert len(tags) == 1 + assert tags.pop() == 173473779682L - total_reads, n_consumed = lb.consume_fasta_and_tag_with_labels(filename) - assert lb.get(read_1[:20]) - assert total_reads == 3 - print lb.n_labels() - print lb.get_label_dict() - for tag in lb.get_tagset(): - print tag, khmer.forward_hash(tag, 20) - for record in screed.open(filename): - print lb.sweep_tag_neighborhood(record.sequence, 40) - print lb.sweep_label_neighborhood(record.sequence, 40) - assert lb.n_labels() == 3 +def test_sweep_label_neighborhood(): + lb = LabelHash(20, 1e7, 4) + filename = utils.get_test_data('single-read.fq') + lb.consume_fasta_and_tag_with_labels(filename) + + labels = lb.sweep_label_neighborhood('CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT') + assert len(labels) == 1 + assert labels.pop() == 0L ''' * The test data set as four reads: A, B, C, and D @@ -123,21 +149,3 @@ def test_label_tag_correctness(): assert len(labels) == 1 assert 3L in labels -def test_sweep_tag_neighborhood(): - lb = LabelHash(20, 1e7, 4) - filename = utils.get_test_data('single-read.fq') - lb.consume_fasta_and_tag(filename) - - tags = lb.sweep_tag_neighborhood('CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT') - assert len(tags) == 1 - assert tags.pop() == 
173473779682L - - -def test_sweep_label_neighborhood(): - lb = LabelHash(20, 1e7, 4) - filename = utils.get_test_data('single-read.fq') - lb.consume_fasta_and_tag_with_labels(filename) - - labels = lb.sweep_label_neighborhood('CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT') - assert len(labels) == 1 - assert labels.pop() == 0L From 9ac22f072b9ddca2018d42406f9df02b8a234ad2 Mon Sep 17 00:00:00 2001 From: Camille Scott Date: Fri, 20 Dec 2013 15:34:46 -0500 Subject: [PATCH 117/140] swapped out tst-Colors in lib Makefile --- lib/Makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/Makefile b/lib/Makefile index 4a7a00def4..dbfb936414 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -74,7 +74,6 @@ WITH_INTERNAL_METRICS=false ### NOTE: No user-servicable parts below this line! ### - CXXFLAGS= CXX_WARNING_FLAGS=-Wall CXX_OPTIMIZATION_FLAGS=-O3 @@ -171,7 +170,7 @@ CORE_OBJS= error.o khmer_config.o thread_id_map.o trace_logger.o perf_metrics.o PARSERS_OBJS= read_parsers.o all: $(ZLIB_OBJS) $(BZIP2_OBJS) $(CORE_OBJS) $(PARSERS_OBJS) hashtable.o hashbits.o subset.o counting.o test aligner.o scoringmatrix.o node.o kmer.o - + echo ~~~~~~ YO WE IN THIS MAKEFILE ~~~~~~ clean: -(cd $(ZLIB_DIR) && make clean) (cd $(BZIP2_DIR) && make -f Makefile-libbz2_so clean) From 747dab0b5b3594c83b5d8d86df9e8133c6fd0617 Mon Sep 17 00:00:00 2001 From: Camille Scott Date: Fri, 20 Dec 2013 15:35:44 -0500 Subject: [PATCH 118/140] removed rogue test files? 
--- lib/color_tst.py | 73 ---------------------------------------- lib/color_tst_opt.py | 78 ------------------------------------------- lib/color_tst_slow.py | 78 ------------------------------------------- 3 files changed, 229 deletions(-) delete mode 100644 lib/color_tst.py delete mode 100644 lib/color_tst_opt.py delete mode 100644 lib/color_tst_slow.py diff --git a/lib/color_tst.py b/lib/color_tst.py deleted file mode 100644 index dcac725ec0..0000000000 --- a/lib/color_tst.py +++ /dev/null @@ -1,73 +0,0 @@ -import khmer -import screed - -def reverse_comp(s): - ret = '' - for i in range(len(s)-1,-1,-1): - c = s[i] - if c == 'A': - ret += 'T' - elif c == 'T': - ret += 'A' - elif c == 'G': - ret += 'C' - else: - ret += 'G' - return ret - -ht = khmer.new_hashbits(20,1e8,4) -print '#' * 200 -ht.consume_fasta_and_tag_with_colors('../tests/test-data/test-reads.fa') -#print ht.sweep_sequence_for_colors('CACACACGGACATCGGAGAGAGGCTGAGACAGCGAGACACACAGAGACAGAGCGGAGAGGGCACAGACAGACAAGAGCATGAGAGATCGGCAGAGCGGTG', False, False) -#print ht.sweep_sequence_for_colors('CGCCGTAGTCGTACTGGTTCTCCTCCGTGTACTCGTGCGCTGCCTCCACCTCTGGGCTGCTCATGCCCTCCATGTGACCTTCAGGCATGCCCTCGGAGAT', False, False) -#print ht.sweep_sequence_for_colors('GGAGAGCCTGGGGCCAAGCCCGAGGGCATGCCTGAAGGTCACATGGAGGGCATGAGCAGCCCAG', False, False) -#print ht.sweep_sequence_for_colors('TTTTTTGAATACGTTTAGTTAATATTTGTACTTCAATTAATAAAAATTTGCTATAATTTTTCCATTATCGCCAGTCACTCGCGTGATATAGGAAAAGGTT', False, False) -#print ht.sweep_sequence_for_colors('AAGCAGTGGTATCAACGCAGAGTACGCGGGGACTCTGTCGCTGCTCCTCTAGCACAGAGAGCCAGAGACGGCTTACAGCAGCAGCATCATATAGCCTC', False, False) - -t0 = 'CCATGTAGCGCCGCACACCTTTGTAGGTGTTGTAATAATCTTCGATGACTTTCTTCGCTTCCTGACGGCTTATGCC' -t1 = 'ACCGCGCGCGAATCGACGGTTGTCAGCCAAAGGCGTTCAACACCAGCACCGCCCTTAAGCCGCCCGCCCGCCGCCC' -N = 1000 - -for n, record in enumerate(screed.open('../tests/test-data/test-reads.fa')): - if n > N: - break - print '*' * 40 - seq = record.sequence - print seq - colors = ht.sweep_sequence_for_colors(seq, False, 
False) - print 'colors from sweep:', colors - tags = ht.get_all_tags(seq) - print 'tags from get_all_tags:', tags - print 'colors from get_tag_colors:' - t_colors = set() - for tag in tags: - t_colors.update(ht.get_tag_colors(tag)) - print t_colors - assert len(t_colors) == len(colors) - -''' -file_pointers = {} -for n, record in enumerate(screed.open('/w/2013-lamprey/syn_part/syn.sweep.fa')): - if n >= N: - break - if n % 1000 == 0: - print '...processed {} reads'.format(n) - colors = ht.sweep_sequence_for_colors(record.sequence, False, False) - for c in colors: - if c in file_pointers.viewkeys(): - file_pointers[c].write('>{}\n{}\n'.format(record.name, record.sequence)) - else: - file_pointers[c] = open('color_{}.fa'.format(c), 'wb') - file_pointers[c].write('>{}\n{}\n'.format(record.name, record.sequence))\ -''' -''' -ht = khmer.new_hashbits(25, 1e9,4) -ht.consume_partitioned_fasta_and_tag_with_colors('/w/2013-lamprey/test.fp') - -for n, record in enumerate(screed.open('/w/lamprey-mrnaseq/reads/single/L82-a.fq.gz')): - if n >= N: - break - colors = ht.sweep_sequence_for_colors(record.sequence, False, False) - if colors: - print colors -''' diff --git a/lib/color_tst_opt.py b/lib/color_tst_opt.py deleted file mode 100644 index 8c75fe2e5b..0000000000 --- a/lib/color_tst_opt.py +++ /dev/null @@ -1,78 +0,0 @@ -import khmer -import screed - -def reverse_comp(s): - ret = '' - for i in range(len(s)-1,-1,-1): - c = s[i] - if c == 'A': - ret += 'T' - elif c == 'T': - ret += 'A' - elif c == 'G': - ret += 'C' - else: - ret += 'G' - return ret - -ht = khmer.new_hashbits(20,1e8,4) -ht.consume_fasta_and_tag_with_colors('../tests/test-data/test-reads.fa') -N = 100 -for n, record in enumerate(screed.open('../tests/test-data/test-reads.fa')): - if n >= N: - break - ht.sweep_tag_neighborhood(record.sequence, 10) - -#print ht.sweep_sequence_for_colors('CACACACGGACATCGGAGAGAGGCTGAGACAGCGAGACACACAGAGACAGAGCGGAGAGGGCACAGACAGACAAGAGCATGAGAGATCGGCAGAGCGGTG', False, False) -#print 
ht.sweep_sequence_for_colors('CGCCGTAGTCGTACTGGTTCTCCTCCGTGTACTCGTGCGCTGCCTCCACCTCTGGGCTGCTCATGCCCTCCATGTGACCTTCAGGCATGCCCTCGGAGAT', False, False) -#print ht.sweep_sequence_for_colors('GGAGAGCCTGGGGCCAAGCCCGAGGGCATGCCTGAAGGTCACATGGAGGGCATGAGCAGCCCAG', False, False) -#print ht.sweep_sequence_for_colors('TTTTTTGAATACGTTTAGTTAATATTTGTACTTCAATTAATAAAAATTTGCTATAATTTTTCCATTATCGCCAGTCACTCGCGTGATATAGGAAAAGGTT', False, False) -#print ht.sweep_sequence_for_colors('AAGCAGTGGTATCAACGCAGAGTACGCGGGGACTCTGTCGCTGCTCCTCTAGCACAGAGAGCCAGAGACGGCTTACAGCAGCAGCATCATATAGCCTC', False, False) - -#t0 = 'CCATGTAGCGCCGCACACCTTTGTAGGTGTTGTAATAATCTTCGATGACTTTCTTCGCTTCCTGACGGCTTATGCC' -#t1 = 'ACCGCGCGCGAATCGACGGTTGTCAGCCAAAGGCGTTCAACACCAGCACCGCCCTTAAGCCGCCCGCCCGCCGCCC' -''' -N = 100 -for n, record in enumerate(screed.open('../tests/test-data/test-reads.fa')): - if n > N: - break - print '*' * 40 - seq = record.sequence - print seq - colors = ht.sweep_sequence_for_colors(seq, False, False) - print 'colors from sweep:', colors - tags = ht.get_all_tags(seq) - print 'tags from get_all_tags:', tags - print 'colors from get_tag_colors:' - t_colors = set() - for tag in tags: - t_colors.update(ht.get_tag_colors(tag)) - print t_colors - assert len(t_colors) == len(colors) -''' -''' -file_pointers = {} -for n, record in enumerate(screed.open('/w/2013-lamprey/syn_part/syn.sweep.fa')): - if n >= N: - break - if n % 1000 == 0: - print '...processed {} reads'.format(n) - colors = ht.sweep_sequence_for_colors(record.sequence, False, False) - for c in colors: - if c in file_pointers.viewkeys(): - file_pointers[c].write('>{}\n{}\n'.format(record.name, record.sequence)) - else: - file_pointers[c] = open('color_{}.fa'.format(c), 'wb') - file_pointers[c].write('>{}\n{}\n'.format(record.name, record.sequence))\ -''' -''' -ht = khmer.new_hashbits(25, 1e9,4) -ht.consume_partitioned_fasta_and_tag_with_colors('/w/2013-lamprey/test.fp') - -for n, record in 
enumerate(screed.open('/w/lamprey-mrnaseq/reads/single/L82-a.fq.gz')): - if n >= N: - break - colors = ht.sweep_sequence_for_colors(record.sequence, False, False) - if colors: - print colors -''' diff --git a/lib/color_tst_slow.py b/lib/color_tst_slow.py deleted file mode 100644 index 2f25f857e4..0000000000 --- a/lib/color_tst_slow.py +++ /dev/null @@ -1,78 +0,0 @@ -import khmer -import screed - -def reverse_comp(s): - ret = '' - for i in range(len(s)-1,-1,-1): - c = s[i] - if c == 'A': - ret += 'T' - elif c == 'T': - ret += 'A' - elif c == 'G': - ret += 'C' - else: - ret += 'G' - return ret - -ht = khmer.new_hashbits(20,1e8,4) -ht.consume_fasta_and_tag_with_colors('../tests/test-data/test-reads.fa') -N = 100 -for n, record in enumerate(screed.open('../tests/test-data/test-reads.fa')): - if n >= N: - break - ht.sweep_color_neighborhood(record.sequence) - -#print ht.sweep_sequence_for_colors('CACACACGGACATCGGAGAGAGGCTGAGACAGCGAGACACACAGAGACAGAGCGGAGAGGGCACAGACAGACAAGAGCATGAGAGATCGGCAGAGCGGTG', False, False) -#print ht.sweep_sequence_for_colors('CGCCGTAGTCGTACTGGTTCTCCTCCGTGTACTCGTGCGCTGCCTCCACCTCTGGGCTGCTCATGCCCTCCATGTGACCTTCAGGCATGCCCTCGGAGAT', False, False) -#print ht.sweep_sequence_for_colors('GGAGAGCCTGGGGCCAAGCCCGAGGGCATGCCTGAAGGTCACATGGAGGGCATGAGCAGCCCAG', False, False) -#print ht.sweep_sequence_for_colors('TTTTTTGAATACGTTTAGTTAATATTTGTACTTCAATTAATAAAAATTTGCTATAATTTTTCCATTATCGCCAGTCACTCGCGTGATATAGGAAAAGGTT', False, False) -#print ht.sweep_sequence_for_colors('AAGCAGTGGTATCAACGCAGAGTACGCGGGGACTCTGTCGCTGCTCCTCTAGCACAGAGAGCCAGAGACGGCTTACAGCAGCAGCATCATATAGCCTC', False, False) - -#t0 = 'CCATGTAGCGCCGCACACCTTTGTAGGTGTTGTAATAATCTTCGATGACTTTCTTCGCTTCCTGACGGCTTATGCC' -#t1 = 'ACCGCGCGCGAATCGACGGTTGTCAGCCAAAGGCGTTCAACACCAGCACCGCCCTTAAGCCGCCCGCCCGCCGCCC' -''' -N = 100 -for n, record in enumerate(screed.open('../tests/test-data/test-reads.fa')): - if n > N: - break - print '*' * 40 - seq = record.sequence - print seq - colors = ht.sweep_sequence_for_colors(seq, False, 
False) - print 'colors from sweep:', colors - tags = ht.get_all_tags(seq) - print 'tags from get_all_tags:', tags - print 'colors from get_tag_colors:' - t_colors = set() - for tag in tags: - t_colors.update(ht.get_tag_colors(tag)) - print t_colors - assert len(t_colors) == len(colors) -''' -''' -file_pointers = {} -for n, record in enumerate(screed.open('/w/2013-lamprey/syn_part/syn.sweep.fa')): - if n >= N: - break - if n % 1000 == 0: - print '...processed {} reads'.format(n) - colors = ht.sweep_sequence_for_colors(record.sequence, False, False) - for c in colors: - if c in file_pointers.viewkeys(): - file_pointers[c].write('>{}\n{}\n'.format(record.name, record.sequence)) - else: - file_pointers[c] = open('color_{}.fa'.format(c), 'wb') - file_pointers[c].write('>{}\n{}\n'.format(record.name, record.sequence))\ -''' -''' -ht = khmer.new_hashbits(25, 1e9,4) -ht.consume_partitioned_fasta_and_tag_with_colors('/w/2013-lamprey/test.fp') - -for n, record in enumerate(screed.open('/w/lamprey-mrnaseq/reads/single/L82-a.fq.gz')): - if n >= N: - break - colors = ht.sweep_sequence_for_colors(record.sequence, False, False) - if colors: - print colors -''' From 9289e0c0fad9d495d939172f4e03aefad6fea0e7 Mon Sep 17 00:00:00 2001 From: Camille Scott Date: Fri, 20 Dec 2013 15:46:36 -0500 Subject: [PATCH 119/140] added same tp_free call to dealloc for Hashbits --- khmer/_khmermodule.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/khmer/_khmermodule.cc b/khmer/_khmermodule.cc index 0b7b31ba58..2d2994dc30 100644 --- a/khmer/_khmermodule.cc +++ b/khmer/_khmermodule.cc @@ -4795,6 +4795,7 @@ static void khmer_hashbits_dealloc(PyObject* self) delete obj->hashbits; obj->hashbits = NULL; + self->ob_type->tp_free((PyObject*)obj); PyObject_Del((PyObject *) obj); } From 3ec384d40730466412d58c71f880254f2d786c63 Mon Sep 17 00:00:00 2001 From: Camille Scott Date: Fri, 20 Dec 2013 16:37:40 -0500 Subject: [PATCH 120/140] exposed Hashbits object to python with constructor providing primes --- 
khmer/__init__.py | 19 +++++++++++++++---- khmer/_khmermodule.cc | 2 +- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/khmer/__init__.py b/khmer/__init__.py index 04f2038f0f..59a4eb2d76 100644 --- a/khmer/__init__.py +++ b/khmer/__init__.py @@ -16,6 +16,7 @@ from _khmer import get_config from _khmer import ReadParser from _khmer import _LabelHash +from _khmer import _Hashbits from ._version import get_versions __version__ = get_versions()['version'] @@ -148,14 +149,24 @@ def get_n_primes_above_x(n, x): i += 2 return primes +''' +Expose the cpython objects with __new__ implementations. +These constructors add the functionality provided by the existing +factory methods to the constructors defined over in cpython land. +Additional functionality can be added to these classes as appropriate. +''' + class LabelHash(_LabelHash): def __new__(cls, k, starting_size, n_tables): - print "** LabelHash __new__" - print "\t*** Getting primes..." primes = get_n_primes_above_x(n_tables, starting_size) - print "\t*** Invoking parent..." c = _LabelHash.__new__(cls, k, primes) - print "\t*** Done with parent, returning class object" + c.primes = primes + return c + +class Hashbits(_Hashbits): + def __new__(cls, k, starting_size, n_tables): + primes = get_n_primes_above_x(n_tables, starting_size) + c = _Hashbits.__new__(cls, k, primes) c.primes = primes return c diff --git a/khmer/_khmermodule.cc b/khmer/_khmermodule.cc index 2d2994dc30..6eb251c7f9 100644 --- a/khmer/_khmermodule.cc +++ b/khmer/_khmermodule.cc @@ -4982,7 +4982,7 @@ init_khmer(void) // Then, remove the corresponding factory functions. 
Py_INCREF(&khmer_KHashbitsType); - PyModule_AddObject(m, "Hashbits", (PyObject *)&khmer_KHashbitsType); + PyModule_AddObject(m, "_Hashbits", (PyObject *)&khmer_KHashbitsType); Py_INCREF(&khmer_KLabelHashType); PyModule_AddObject(m, "_LabelHash", (PyObject *)&khmer_KLabelHashType); From a50061d835e33db469e3b11e049ba4c52b135b4d Mon Sep 17 00:00:00 2001 From: Camille Scott Date: Sun, 22 Dec 2013 00:44:55 -0500 Subject: [PATCH 121/140] fixed Hashbits dealloc by removing old PyObject_Del, added tests for Hashbits and for Hashbits methods available through Labelhash --- khmer/_khmermodule.cc | 18 +- tests/test_Hashbits.py | 554 ++++++++++++++++++++++++++++++++++++++++ tests/test_labelhash.py | 535 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 1104 insertions(+), 3 deletions(-) create mode 100644 tests/test_Hashbits.py diff --git a/khmer/_khmermodule.cc b/khmer/_khmermodule.cc index 6eb251c7f9..b32124012e 100644 --- a/khmer/_khmermodule.cc +++ b/khmer/_khmermodule.cc @@ -3904,7 +3904,7 @@ khmer_hashbits_getattr(PyObject * obj, char * name) return Py_FindMethod(khmer_hashbits_methods, obj, name); } -static void khmer_hashbits_dealloc(PyObject *); +static void khmer_hashbits_dealloc(PyObject * obj); static PyObject* khmer_hashbits_new(PyTypeObject * type, PyObject * args, PyObject * kwds); static int khmer_hashbits_init(khmer_KHashbitsObject * self, PyObject * args, PyObject * kwds); @@ -3913,7 +3913,7 @@ static PyTypeObject khmer_KHashbitsType = { 0, "Hashbits", sizeof(khmer_KHashbitsObject), 0, - khmer_hashbits_dealloc, /*tp_dealloc*/ + (destructor)khmer_hashbits_dealloc, /*tp_dealloc*/ 0, /*tp_print*/ khmer_hashbits_getattr, /*tp_getattr*/ 0, /*tp_setattr*/ @@ -4788,7 +4788,7 @@ static void khmer_counting_dealloc(PyObject* self) // // khmer_hashbits_dealloc -- clean up a hashbits object. 
// - +/* static void khmer_hashbits_dealloc(PyObject* self) { khmer_KHashbitsObject * obj = (khmer_KHashbitsObject *) self; @@ -4798,6 +4798,18 @@ static void khmer_hashbits_dealloc(PyObject* self) self->ob_type->tp_free((PyObject*)obj); PyObject_Del((PyObject *) obj); } +*/ +static void khmer_hashbits_dealloc(PyObject* obj) +{ + khmer_KHashbitsObject * self = (khmer_KHashbitsObject *) obj; + + delete self->hashbits; + self->hashbits = NULL; + + self->ob_type->tp_free((PyObject*)obj); + //PyObject_Del((PyObject *) obj); +} + // // khmer_subset_dealloc -- clean up a hashbits object. diff --git a/tests/test_Hashbits.py b/tests/test_Hashbits.py new file mode 100644 index 0000000000..761eb58c77 --- /dev/null +++ b/tests/test_Hashbits.py @@ -0,0 +1,554 @@ +# +# This file is part of khmer, http://github.com/ged-lab/khmer/, and is +# Copyright (C) Michigan State University, 2009-2013. It is licensed under +# the three-clause BSD license; see doc/LICENSE.txt. Contact: ctb@msu.edu +# + +# +# This is an exact copy of test_hashbits, with all invocations of +# khmer.new_hashbits replaced by khmer.Hashbits constructor calls +# + +import khmer +from khmer import Hashbits + +from screed.fasta import fasta_iter +import screed + +import khmer_tst_utils as utils +from nose.plugins.attrib import attr + +def teardown(): + utils.cleanup() + +def test__get_set_tag_density(): + ht = khmer.Hashbits(32, 1, 1) + + orig = ht._get_tag_density() + assert orig != 2 + ht._set_tag_density(2) + assert ht._get_tag_density() == 2 + + +def test_n_occupied_1(): + filename = utils.get_test_data('random-20-a.fa') + + K = 20 # size of kmer + HT_SIZE = 100000 # size of hashtable + N_HT = 1 # number of hashtables + + # test modified c++ n_occupied code + ht1 = khmer.Hashbits(K, HT_SIZE, N_HT) + + for n, record in enumerate(fasta_iter(open(filename))): + ht1.consume(record['sequence']) + + # this number calculated independently + assert ht1.n_occupied() == 3877 + + +def test_bloom_python_1(): + # test 
python code to count unique kmers using bloom filter + filename = utils.get_test_data('random-20-a.fa') + + K = 20 # size of kmer + HT_SIZE = 100000 # size of hashtable + N_HT = 3 # number of hashtables + + ht2 = khmer.Hashbits(K, HT_SIZE, N_HT) + + n_unique = 0 + for n, record in enumerate(fasta_iter(open(filename))): + sequence = record['sequence'] + seq_len = len(sequence) + for n in range(0, seq_len + 1 - K): + kmer = sequence[n:n + K] + if (not ht2.get(kmer)): + n_unique += 1 + ht2.count(kmer) + + assert n_unique == 3960 + assert ht2.n_occupied() == 3882 + assert ht2.n_unique_kmers() == 3960 # this number equals to n_unique + + +def test_bloom_c_1(): + # test c++ code to count unique kmers using bloom filter + + filename = utils.get_test_data('random-20-a.fa') + + K = 20 # size of kmer + HT_SIZE = 100000 # size of hashtable + N_HT = 3 # number of hashtables + + ht3 = khmer.Hashbits(K, HT_SIZE, N_HT) + + for n, record in enumerate(fasta_iter(open(filename))): + ht3.consume(record['sequence']) + + assert ht3.n_occupied() == 3882 + assert ht3.n_unique_kmers() == 3960 + + +def test_n_occupied_2(): # simple one + K = 4 + HT_SIZE = 10 # use 11 + N_HT = 1 + + ht1 = khmer.Hashbits(K, HT_SIZE, N_HT) + ht1.count('AAAA') # 00 00 00 00 = 0 + assert ht1.n_occupied() == 1 + + ht1.count('ACTG') # 00 10 01 11 = + assert ht1.n_occupied() == 2 + + ht1.count('AACG') # 00 00 10 11 = 11 # collision 1 + + assert ht1.n_occupied() == 2 + ht1.count('AGAC') # 00 11 00 10 # collision 2 + assert ht1.n_occupied() == 2 + + +def test_bloom_c_2(): # simple one + K = 4 + HT_SIZE = 10 # use 11 + N_HT1 = 1 # hashtable size = 11 + N_HT2 = 2 # hashtable size = 11,13 + + # use only 1 hashtable, no bloom filter + ht1 = khmer.Hashbits(K, HT_SIZE, N_HT1) + ht1.count('AAAA') # 00 00 00 00 = 0 + ht1.count('ACTG') # 00 10 01 11 = + assert ht1.n_unique_kmers() == 2 + ht1.count('AACG') # 00 00 10 11 = 11 # collision with 1st kmer + assert ht1.n_unique_kmers() == 2 + ht1.count('AGAC') # 00 11 00 10 # 
collision with 2nd kmer + assert ht1.n_unique_kmers() == 2 + + # use two hashtables with 11,13 + ht2 = khmer.Hashbits(K, HT_SIZE, N_HT2) + ht2.count('AAAA') # 00 00 00 00 = 0 + + ht2.count('ACTG') # 00 10 01 11 = 2*16 +4 +3 = 39 + assert ht2.n_unique_kmers() == 2 + ht2.count('AACG') # 00 00 10 11 = 11 # collision with only 1st kmer + assert ht2.n_unique_kmers() == 3 + ht2.count('AGAC') # 00 11 00 10 3*16 +2 = 50 + # collision with both 2nd and 3rd kmers + + assert ht2.n_unique_kmers() == 3 + + +@attr('highmem') +def test_filter_if_present(): + ht = khmer.Hashbits(32, 1e6, 2) + + maskfile = utils.get_test_data('filter-test-A.fa') + inputfile = utils.get_test_data('filter-test-B.fa') + outfile = utils.get_temp_filename('filter') + + ht.consume_fasta(maskfile) + ht.filter_if_present(inputfile, outfile) + + records = list(fasta_iter(open(outfile))) + assert len(records) == 1 + assert records[0]['name'] == '3' + + +@attr('highmem') +def test_combine_pe(): + inpfile = utils.get_test_data('combine_parts_1.fa') + ht = khmer.Hashbits(32, 1, 1) + + ht.consume_partitioned_fasta(inpfile) + assert ht.count_partitions() == (2, 0) + + s1 = "CATGCAGAAGTTCCGCAACCATACCGTTCAGT" + pid1 = ht.get_partition_id(s1) + + s2 = "CAAATGTACATGCACTTAAAATCATCCAGCCG" + pid2 = ht.get_partition_id(s2) + + assert pid1 == 2 + assert pid2 == 80293 + + ht.join_partitions(pid1, pid2) + + pid1 = ht.get_partition_id(s1) + pid2 = ht.get_partition_id(s2) + + assert pid1 == pid2 + assert ht.count_partitions() == (1, 0) + + +@attr('highmem') +def test_load_partitioned(): + inpfile = utils.get_test_data('combine_parts_1.fa') + ht = khmer.Hashbits(32, 1, 1) + + ht.consume_partitioned_fasta(inpfile) + assert ht.count_partitions() == (2, 0) + + s1 = "CATGCAGAAGTTCCGCAACCATACCGTTCAGT" + assert ht.get(s1) + + s2 = "CAAATGTACATGCACTTAAAATCATCCAGCCG" + assert ht.get(s2) + + s3 = "CATGCAGAAGTTCCGCAACCATACCGTTCAGTTCCTGGTGGCTA"[-32:] + assert ht.get(s3) + + +@attr('highmem') +def test_count_within_radius_simple(): + 
inpfile = utils.get_test_data('all-A.fa') + ht = khmer.Hashbits(4, 1e6, 2) + + print ht.consume_fasta(inpfile) + n = ht.count_kmers_within_radius('AAAA', 1) + assert n == 1 + + n = ht.count_kmers_within_radius('AAAA', 10) + assert n == 1 + + +@attr('highmem') +def test_count_within_radius_big(): + inpfile = utils.get_test_data('random-20-a.fa') + ht = khmer.Hashbits(20, 1e6, 4) + + ht.consume_fasta(inpfile) + n = ht.count_kmers_within_radius('CGCAGGCTGGATTCTAGAGG', int(1e6)) + assert n == 3960 + + ht = khmer.Hashbits(21, 1e6, 4) + ht.consume_fasta(inpfile) + n = ht.count_kmers_within_radius('CGCAGGCTGGATTCTAGAGGC', int(1e6)) + assert n == 39 + + +@attr('highmem') +def test_count_kmer_degree(): + inpfile = utils.get_test_data('all-A.fa') + ht = khmer.Hashbits(4, 1e6, 2) + ht.consume_fasta(inpfile) + + assert ht.kmer_degree('AAAA') == 2 + assert ht.kmer_degree('AAAT') == 1 + assert ht.kmer_degree('AATA') == 0 + assert ht.kmer_degree('TAAA') == 1 + + +@attr('highmem') +def test_find_radius_for_volume(): + inpfile = utils.get_test_data('all-A.fa') + ht = khmer.Hashbits(4, 1e6, 2) + ht.consume_fasta(inpfile) + + assert ht.find_radius_for_volume('AAAA', 0, 100) == 0 + assert ht.find_radius_for_volume('AAAA', 1, 100) == 0 + assert ht.find_radius_for_volume('AAAA', 2, 100) == 100 + + +def test_circumference(): + ht = khmer.Hashbits(4, 1e6, 2) + + ht.count('ATGC') + ht.count('GATG') + ht.count('ATGG') + + x = ht.count_kmers_on_radius('GATG', 1, 200) + assert x == 2 + + ht.count('ATGA') + x = ht.count_kmers_on_radius('GATG', 1, 200) + assert x == 3, x + + ht.count('TGAT') + x = ht.count_kmers_on_radius('GATG', 1, 200) + assert x == 4, x + + +def test_save_load_tagset(): + ht = khmer.Hashbits(32, 1, 1) + + outfile = utils.get_temp_filename('tagset') + + ht.add_tag('A' * 32) + ht.save_tagset(outfile) + + ht.add_tag('G' * 32) + + ht.load_tagset(outfile) # implicitly => clear_tags=True + ht.save_tagset(outfile) + + # if tags have been cleared, then the new tagfile will be larger 
(30 bytes) + # else smaller (22 bytes). + + fp = open(outfile, 'rb') + data = fp.read() + fp.close() + assert len(data) == 22, len(data) + + +def test_save_load_tagset_noclear(): + ht = khmer.Hashbits(32, 1, 1) + + outfile = utils.get_temp_filename('tagset') + + ht.add_tag('A' * 32) + ht.save_tagset(outfile) + + ht.add_tag('G' * 32) + + ht.load_tagset(outfile, False) # set clear_tags => False; zero tags + ht.save_tagset(outfile) + + # if tags have been cleared, then the new tagfile will be large (30 bytes); + # else small (22 bytes). + + fp = open(outfile, 'rb') + data = fp.read() + fp.close() + assert len(data) == 30, len(data) + + +@attr('highmem') +def test_stop_traverse(): + filename = utils.get_test_data('random-20-a.fa') + + K = 20 # size of kmer + HT_SIZE = 100000 # size of hashtable + N_HT = 3 # number of hashtables + + ht = khmer.Hashbits(K, HT_SIZE, N_HT) + + # without tagging/joining across consume, this breaks into two partition; + # with, it is one partition. + ht.add_stop_tag('TTGCATACGTTGAGCCAGCG') + + ht.consume_fasta_and_tag(filename) # DO NOT join reads across stoptags + subset = ht.do_subset_partition(0, 0, True) + ht.merge_subset(subset) + + n, _ = ht.count_partitions() + assert n == 2, n + + +@attr('highmem') +def test_tag_across_stoptraverse(): + filename = utils.get_test_data('random-20-a.fa') + + K = 20 # size of kmer + HT_SIZE = 100000 # size of hashtable + N_HT = 3 # number of hashtables + + ht = khmer.Hashbits(K, HT_SIZE, N_HT) + + # without tagging/joining across consume, this breaks into two partition; + # with, it is one partition. + ht.add_stop_tag('CCGAATATATAACAGCGACG') + + ht.consume_fasta_and_tag_with_stoptags(filename) # DO join reads across + + subset = ht.do_subset_partition(0, 0) + n, _ = ht.count_partitions() + assert n == 99 # reads only connected by traversal... + + n, _ = ht.subset_count_partitions(subset) + assert n == 2 # but need main to cross stoptags. 
+ + ht.merge_subset(subset) + + n, _ = ht.count_partitions() # ta-da! + assert n == 1, n + + +@attr('highmem') +def test_notag_across_stoptraverse(): + filename = utils.get_test_data('random-20-a.fa') + + K = 20 # size of kmer + HT_SIZE = 100000 # size of hashtable + N_HT = 3 # number of hashtables + + ht = khmer.Hashbits(K, HT_SIZE, N_HT) + + # connecting k-mer at the beginning/end of a read: breaks up into two. + ht.add_stop_tag('TTGCATACGTTGAGCCAGCG') + + ht.consume_fasta_and_tag_with_stoptags(filename) + + subset = ht.do_subset_partition(0, 0) + ht.merge_subset(subset) + + n, _ = ht.count_partitions() + assert n == 2, n + + +def test_find_stoptags(): + ht = khmer.Hashbits(5, 1, 1) + ht.add_stop_tag("AAAAA") + + assert ht.identify_stoptags_by_position("AAAAA") == [0] + assert ht.identify_stoptags_by_position("AAAAAA") == [0, 1] + assert ht.identify_stoptags_by_position("TTTTT") == [0] + assert ht.identify_stoptags_by_position("TTTTTT") == [0, 1] + + +def test_find_stoptags2(): + ht = khmer.Hashbits(4, 1, 1) + ht.add_stop_tag("ATGC") + + x = ht.identify_stoptags_by_position("ATGCATGCGCAT") + assert x == [0, 2, 4, 8], x + + +def test_get_ksize(): + kh = khmer.Hashbits(22, 1, 1) + assert kh.ksize() == 22 + + +def test_get_hashsizes(): + kh = khmer.Hashbits(22, 100, 4) + assert kh.hashsizes() == [101, 103, 107, 109], kh.hashsizes() + + +def test_extract_unique_paths_0(): + kh = khmer.Hashbits(10, 1e5, 4) + + x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1) + assert x == ['ATGGAGAGACACAGATAGACAGGAGTGGCGATG'] + + kh.consume('ATGGAGAGACACAGATAGACAGGAGTGGCGATG') + x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1) + assert not x + + +def test_extract_unique_paths_1(): + kh = khmer.Hashbits(10, 1e5, 4) + + kh.consume('AGTGGCGATG') + x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1) + print x + assert x == ['ATGGAGAGACACAGATAGACAGGAGTGGCGAT'] # all but the last k-mer + + +def test_extract_unique_paths_2(): 
+ kh = khmer.Hashbits(10, 1e5, 4) + + kh.consume('ATGGAGAGAC') + x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1) + print x + assert x == ['TGGAGAGACACAGATAGACAGGAGTGGCGATG'] # all but the 1st k-mer + + +def test_extract_unique_paths_3(): + kh = khmer.Hashbits(10, 1e5, 4) + + kh.consume('ATGGAGAGAC') + kh.consume('AGTGGCGATG') + x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1) + print x + # all but the 1st/last k-mer + assert x == ['TGGAGAGACACAGATAGACAGGAGTGGCGAT'] + + +def test_extract_unique_paths_4(): + kh = khmer.Hashbits(10, 1e5, 4) + + kh.consume('ATGGAGAGAC') + kh.consume('AGTGGCGATG') + + kh.consume('ATAGACAGGA') + + x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1) + print x + assert x == ['TGGAGAGACACAGATAGACAGG', 'TAGACAGGAGTGGCGAT'] + + +@attr('highmem') +def test_find_unpart(): + filename = utils.get_test_data('random-20-a.odd.fa') + filename2 = utils.get_test_data('random-20-a.even.fa') + + K = 20 # size of kmer + HT_SIZE = 100000 # size of hashtable + N_HT = 3 # number of hashtables + + ht = khmer.Hashbits(K, HT_SIZE, N_HT) + ht.consume_fasta_and_tag(filename) + + subset = ht.do_subset_partition(0, 0) + ht.merge_subset(subset) + + n, _ = ht.count_partitions() + assert n == 49 + + ht.find_unpart(filename2, True, False) + n, _ = ht.count_partitions() + assert n == 1, n # all sequences connect + + +@attr('highmem') +def test_find_unpart_notraverse(): + filename = utils.get_test_data('random-20-a.odd.fa') + filename2 = utils.get_test_data('random-20-a.even.fa') + + K = 20 # size of kmer + HT_SIZE = 100000 # size of hashtable + N_HT = 3 # number of hashtables + + ht = khmer.Hashbits(K, HT_SIZE, N_HT) + ht.consume_fasta_and_tag(filename) + + subset = ht.do_subset_partition(0, 0) + ht.merge_subset(subset) + + n, _ = ht.count_partitions() + assert n == 49 + + ht.find_unpart(filename2, False, False) # <-- don't traverse + n, _ = ht.count_partitions() + assert n == 99, n # all sequences 
disconnected + + +@attr('highmem') +def test_find_unpart_fail(): + filename = utils.get_test_data('random-20-a.odd.fa') + filename2 = utils.get_test_data('random-20-a.odd.fa') # <- switch to odd + + K = 20 # size of kmer + HT_SIZE = 100000 # size of hashtable + N_HT = 3 # number of hashtables + + ht = khmer.Hashbits(K, HT_SIZE, N_HT) + ht.consume_fasta_and_tag(filename) + + subset = ht.do_subset_partition(0, 0) + ht.merge_subset(subset) + + n, _ = ht.count_partitions() + assert n == 49 + + ht.find_unpart(filename2, True, False) + n, _ = ht.count_partitions() + assert n == 49, n # only 49 sequences worth of tags + + +def test_simple_median(): + hi = khmer.Hashbits(6, 1e6, 2) + + (median, average, stddev) = hi.get_median_count("AAAAAA") + print median, average, stddev + assert median == 0 + assert average == 0.0 + assert stddev == 0.0 + + hi.consume("AAAAAA") + (median, average, stddev) = hi.get_median_count("AAAAAA") + print median, average, stddev + assert median == 1 + assert average == 1.0 + assert stddev == 0.0 + diff --git a/tests/test_labelhash.py b/tests/test_labelhash.py index f5a497edd9..4ab73a8319 100644 --- a/tests/test_labelhash.py +++ b/tests/test_labelhash.py @@ -149,3 +149,538 @@ def test_label_tag_correctness(): assert len(labels) == 1 assert 3L in labels +# +# Begin Hashbits tests +# + +def test__get_set_tag_density(): + ht = khmer.LabelHash(32, 1, 1) + + orig = ht._get_tag_density() + assert orig != 2 + ht._set_tag_density(2) + assert ht._get_tag_density() == 2 + + +def test_n_occupied_1(): + filename = utils.get_test_data('random-20-a.fa') + + K = 20 # size of kmer + HT_SIZE = 100000 # size of hashtable + N_HT = 1 # number of hashtables + + # test modified c++ n_occupied code + ht1 = khmer.LabelHash(K, HT_SIZE, N_HT) + + for n, record in enumerate(fasta_iter(open(filename))): + ht1.consume(record['sequence']) + + # this number calculated independently + assert ht1.n_occupied() == 3877 + + +def test_bloom_python_1(): + # test python code to count 
unique kmers using bloom filter + filename = utils.get_test_data('random-20-a.fa') + + K = 20 # size of kmer + HT_SIZE = 100000 # size of hashtable + N_HT = 3 # number of hashtables + + ht2 = khmer.LabelHash(K, HT_SIZE, N_HT) + + n_unique = 0 + for n, record in enumerate(fasta_iter(open(filename))): + sequence = record['sequence'] + seq_len = len(sequence) + for n in range(0, seq_len + 1 - K): + kmer = sequence[n:n + K] + if (not ht2.get(kmer)): + n_unique += 1 + ht2.count(kmer) + + assert n_unique == 3960 + assert ht2.n_occupied() == 3882 + assert ht2.n_unique_kmers() == 3960 # this number equals to n_unique + + +def test_bloom_c_1(): + # test c++ code to count unique kmers using bloom filter + + filename = utils.get_test_data('random-20-a.fa') + + K = 20 # size of kmer + HT_SIZE = 100000 # size of hashtable + N_HT = 3 # number of hashtables + + ht3 = khmer.LabelHash(K, HT_SIZE, N_HT) + + for n, record in enumerate(fasta_iter(open(filename))): + ht3.consume(record['sequence']) + + assert ht3.n_occupied() == 3882 + assert ht3.n_unique_kmers() == 3960 + + +def test_n_occupied_2(): # simple one + K = 4 + HT_SIZE = 10 # use 11 + N_HT = 1 + + ht1 = khmer.LabelHash(K, HT_SIZE, N_HT) + ht1.count('AAAA') # 00 00 00 00 = 0 + assert ht1.n_occupied() == 1 + + ht1.count('ACTG') # 00 10 01 11 = + assert ht1.n_occupied() == 2 + + ht1.count('AACG') # 00 00 10 11 = 11 # collision 1 + + assert ht1.n_occupied() == 2 + ht1.count('AGAC') # 00 11 00 10 # collision 2 + assert ht1.n_occupied() == 2 + + +def test_bloom_c_2(): # simple one + K = 4 + HT_SIZE = 10 # use 11 + N_HT1 = 1 # hashtable size = 11 + N_HT2 = 2 # hashtable size = 11,13 + + # use only 1 hashtable, no bloom filter + ht1 = khmer.LabelHash(K, HT_SIZE, N_HT1) + ht1.count('AAAA') # 00 00 00 00 = 0 + ht1.count('ACTG') # 00 10 01 11 = + assert ht1.n_unique_kmers() == 2 + ht1.count('AACG') # 00 00 10 11 = 11 # collision with 1st kmer + assert ht1.n_unique_kmers() == 2 + ht1.count('AGAC') # 00 11 00 10 # collision with 2nd 
kmer + assert ht1.n_unique_kmers() == 2 + + # use two hashtables with 11,13 + ht2 = khmer.LabelHash(K, HT_SIZE, N_HT2) + ht2.count('AAAA') # 00 00 00 00 = 0 + + ht2.count('ACTG') # 00 10 01 11 = 2*16 +4 +3 = 39 + assert ht2.n_unique_kmers() == 2 + ht2.count('AACG') # 00 00 10 11 = 11 # collision with only 1st kmer + assert ht2.n_unique_kmers() == 3 + ht2.count('AGAC') # 00 11 00 10 3*16 +2 = 50 + # collision with both 2nd and 3rd kmers + + assert ht2.n_unique_kmers() == 3 + + +@attr('highmem') +def test_filter_if_present(): + ht = khmer.LabelHash(32, 1e6, 2) + + maskfile = utils.get_test_data('filter-test-A.fa') + inputfile = utils.get_test_data('filter-test-B.fa') + outfile = utils.get_temp_filename('filter') + + ht.consume_fasta(maskfile) + ht.filter_if_present(inputfile, outfile) + + records = list(fasta_iter(open(outfile))) + assert len(records) == 1 + assert records[0]['name'] == '3' + + +@attr('highmem') +def test_combine_pe(): + inpfile = utils.get_test_data('combine_parts_1.fa') + ht = khmer.LabelHash(32, 1, 1) + + ht.consume_partitioned_fasta(inpfile) + assert ht.count_partitions() == (2, 0) + + s1 = "CATGCAGAAGTTCCGCAACCATACCGTTCAGT" + pid1 = ht.get_partition_id(s1) + + s2 = "CAAATGTACATGCACTTAAAATCATCCAGCCG" + pid2 = ht.get_partition_id(s2) + + assert pid1 == 2 + assert pid2 == 80293 + + ht.join_partitions(pid1, pid2) + + pid1 = ht.get_partition_id(s1) + pid2 = ht.get_partition_id(s2) + + assert pid1 == pid2 + assert ht.count_partitions() == (1, 0) + + +@attr('highmem') +def test_load_partitioned(): + inpfile = utils.get_test_data('combine_parts_1.fa') + ht = khmer.LabelHash(32, 1, 1) + + ht.consume_partitioned_fasta(inpfile) + assert ht.count_partitions() == (2, 0) + + s1 = "CATGCAGAAGTTCCGCAACCATACCGTTCAGT" + assert ht.get(s1) + + s2 = "CAAATGTACATGCACTTAAAATCATCCAGCCG" + assert ht.get(s2) + + s3 = "CATGCAGAAGTTCCGCAACCATACCGTTCAGTTCCTGGTGGCTA"[-32:] + assert ht.get(s3) + + +@attr('highmem') +def test_count_within_radius_simple(): + inpfile = 
utils.get_test_data('all-A.fa') + ht = khmer.LabelHash(4, 1e6, 2) + + print ht.consume_fasta(inpfile) + n = ht.count_kmers_within_radius('AAAA', 1) + assert n == 1 + + n = ht.count_kmers_within_radius('AAAA', 10) + assert n == 1 + + +@attr('highmem') +def test_count_within_radius_big(): + inpfile = utils.get_test_data('random-20-a.fa') + ht = khmer.LabelHash(20, 1e6, 4) + + ht.consume_fasta(inpfile) + n = ht.count_kmers_within_radius('CGCAGGCTGGATTCTAGAGG', int(1e6)) + assert n == 3960 + + ht = khmer.LabelHash(21, 1e6, 4) + ht.consume_fasta(inpfile) + n = ht.count_kmers_within_radius('CGCAGGCTGGATTCTAGAGGC', int(1e6)) + assert n == 39 + + +@attr('highmem') +def test_count_kmer_degree(): + inpfile = utils.get_test_data('all-A.fa') + ht = khmer.LabelHash(4, 1e6, 2) + ht.consume_fasta(inpfile) + + assert ht.kmer_degree('AAAA') == 2 + assert ht.kmer_degree('AAAT') == 1 + assert ht.kmer_degree('AATA') == 0 + assert ht.kmer_degree('TAAA') == 1 + + +@attr('highmem') +def test_find_radius_for_volume(): + inpfile = utils.get_test_data('all-A.fa') + ht = khmer.LabelHash(4, 1e6, 2) + ht.consume_fasta(inpfile) + + assert ht.find_radius_for_volume('AAAA', 0, 100) == 0 + assert ht.find_radius_for_volume('AAAA', 1, 100) == 0 + assert ht.find_radius_for_volume('AAAA', 2, 100) == 100 + + +def test_circumference(): + ht = khmer.LabelHash(4, 1e6, 2) + + ht.count('ATGC') + ht.count('GATG') + ht.count('ATGG') + + x = ht.count_kmers_on_radius('GATG', 1, 200) + assert x == 2 + + ht.count('ATGA') + x = ht.count_kmers_on_radius('GATG', 1, 200) + assert x == 3, x + + ht.count('TGAT') + x = ht.count_kmers_on_radius('GATG', 1, 200) + assert x == 4, x + + +def test_save_load_tagset(): + ht = khmer.LabelHash(32, 1, 1) + + outfile = utils.get_temp_filename('tagset') + + ht.add_tag('A' * 32) + ht.save_tagset(outfile) + + ht.add_tag('G' * 32) + + ht.load_tagset(outfile) # implicitly => clear_tags=True + ht.save_tagset(outfile) + + # if tags have been cleared, then the new tagfile will be larger 
(30 bytes) + # else smaller (22 bytes). + + fp = open(outfile, 'rb') + data = fp.read() + fp.close() + assert len(data) == 22, len(data) + + +def test_save_load_tagset_noclear(): + ht = khmer.LabelHash(32, 1, 1) + + outfile = utils.get_temp_filename('tagset') + + ht.add_tag('A' * 32) + ht.save_tagset(outfile) + + ht.add_tag('G' * 32) + + ht.load_tagset(outfile, False) # set clear_tags => False; zero tags + ht.save_tagset(outfile) + + # if tags have been cleared, then the new tagfile will be large (30 bytes); + # else small (22 bytes). + + fp = open(outfile, 'rb') + data = fp.read() + fp.close() + assert len(data) == 30, len(data) + + +@attr('highmem') +def test_stop_traverse(): + filename = utils.get_test_data('random-20-a.fa') + + K = 20 # size of kmer + HT_SIZE = 100000 # size of hashtable + N_HT = 3 # number of hashtables + + ht = khmer.LabelHash(K, HT_SIZE, N_HT) + + # without tagging/joining across consume, this breaks into two partition; + # with, it is one partition. + ht.add_stop_tag('TTGCATACGTTGAGCCAGCG') + + ht.consume_fasta_and_tag(filename) # DO NOT join reads across stoptags + subset = ht.do_subset_partition(0, 0, True) + ht.merge_subset(subset) + + n, _ = ht.count_partitions() + assert n == 2, n + + +@attr('highmem') +def test_tag_across_stoptraverse(): + filename = utils.get_test_data('random-20-a.fa') + + K = 20 # size of kmer + HT_SIZE = 100000 # size of hashtable + N_HT = 3 # number of hashtables + + ht = khmer.LabelHash(K, HT_SIZE, N_HT) + + # without tagging/joining across consume, this breaks into two partition; + # with, it is one partition. + ht.add_stop_tag('CCGAATATATAACAGCGACG') + + ht.consume_fasta_and_tag_with_stoptags(filename) # DO join reads across + + subset = ht.do_subset_partition(0, 0) + n, _ = ht.count_partitions() + assert n == 99 # reads only connected by traversal... + + n, _ = ht.subset_count_partitions(subset) + assert n == 2 # but need main to cross stoptags. 
+ + ht.merge_subset(subset) + + n, _ = ht.count_partitions() # ta-da! + assert n == 1, n + + +@attr('highmem') +def test_notag_across_stoptraverse(): + filename = utils.get_test_data('random-20-a.fa') + + K = 20 # size of kmer + HT_SIZE = 100000 # size of hashtable + N_HT = 3 # number of hashtables + + ht = khmer.LabelHash(K, HT_SIZE, N_HT) + + # connecting k-mer at the beginning/end of a read: breaks up into two. + ht.add_stop_tag('TTGCATACGTTGAGCCAGCG') + + ht.consume_fasta_and_tag_with_stoptags(filename) + + subset = ht.do_subset_partition(0, 0) + ht.merge_subset(subset) + + n, _ = ht.count_partitions() + assert n == 2, n + + +def test_find_stoptags(): + ht = khmer.LabelHash(5, 1, 1) + ht.add_stop_tag("AAAAA") + + assert ht.identify_stoptags_by_position("AAAAA") == [0] + assert ht.identify_stoptags_by_position("AAAAAA") == [0, 1] + assert ht.identify_stoptags_by_position("TTTTT") == [0] + assert ht.identify_stoptags_by_position("TTTTTT") == [0, 1] + + +def test_find_stoptags2(): + ht = khmer.LabelHash(4, 1, 1) + ht.add_stop_tag("ATGC") + + x = ht.identify_stoptags_by_position("ATGCATGCGCAT") + assert x == [0, 2, 4, 8], x + + +def test_get_ksize(): + kh = khmer.LabelHash(22, 1, 1) + assert kh.ksize() == 22 + + +def test_get_hashsizes(): + kh = khmer.LabelHash(22, 100, 4) + assert kh.hashsizes() == [101, 103, 107, 109], kh.hashsizes() + + +def test_extract_unique_paths_0(): + kh = khmer.LabelHash(10, 1e5, 4) + + x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1) + assert x == ['ATGGAGAGACACAGATAGACAGGAGTGGCGATG'] + + kh.consume('ATGGAGAGACACAGATAGACAGGAGTGGCGATG') + x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1) + assert not x + + +def test_extract_unique_paths_1(): + kh = khmer.LabelHash(10, 1e5, 4) + + kh.consume('AGTGGCGATG') + x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1) + print x + assert x == ['ATGGAGAGACACAGATAGACAGGAGTGGCGAT'] # all but the last k-mer + + +def 
test_extract_unique_paths_2(): + kh = khmer.LabelHash(10, 1e5, 4) + + kh.consume('ATGGAGAGAC') + x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1) + print x + assert x == ['TGGAGAGACACAGATAGACAGGAGTGGCGATG'] # all but the 1st k-mer + + +def test_extract_unique_paths_3(): + kh = khmer.LabelHash(10, 1e5, 4) + + kh.consume('ATGGAGAGAC') + kh.consume('AGTGGCGATG') + x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1) + print x + # all but the 1st/last k-mer + assert x == ['TGGAGAGACACAGATAGACAGGAGTGGCGAT'] + + +def test_extract_unique_paths_4(): + kh = khmer.LabelHash(10, 1e5, 4) + + kh.consume('ATGGAGAGAC') + kh.consume('AGTGGCGATG') + + kh.consume('ATAGACAGGA') + + x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1) + print x + assert x == ['TGGAGAGACACAGATAGACAGG', 'TAGACAGGAGTGGCGAT'] + + +@attr('highmem') +def test_find_unpart(): + filename = utils.get_test_data('random-20-a.odd.fa') + filename2 = utils.get_test_data('random-20-a.even.fa') + + K = 20 # size of kmer + HT_SIZE = 100000 # size of hashtable + N_HT = 3 # number of hashtables + + ht = khmer.LabelHash(K, HT_SIZE, N_HT) + ht.consume_fasta_and_tag(filename) + + subset = ht.do_subset_partition(0, 0) + ht.merge_subset(subset) + + n, _ = ht.count_partitions() + assert n == 49 + + ht.find_unpart(filename2, True, False) + n, _ = ht.count_partitions() + assert n == 1, n # all sequences connect + + +@attr('highmem') +def test_find_unpart_notraverse(): + filename = utils.get_test_data('random-20-a.odd.fa') + filename2 = utils.get_test_data('random-20-a.even.fa') + + K = 20 # size of kmer + HT_SIZE = 100000 # size of hashtable + N_HT = 3 # number of hashtables + + ht = khmer.LabelHash(K, HT_SIZE, N_HT) + ht.consume_fasta_and_tag(filename) + + subset = ht.do_subset_partition(0, 0) + ht.merge_subset(subset) + + n, _ = ht.count_partitions() + assert n == 49 + + ht.find_unpart(filename2, False, False) # <-- don't traverse + n, _ = ht.count_partitions() + 
assert n == 99, n # all sequences disconnected + + +@attr('highmem') +def test_find_unpart_fail(): + filename = utils.get_test_data('random-20-a.odd.fa') + filename2 = utils.get_test_data('random-20-a.odd.fa') # <- switch to odd + + K = 20 # size of kmer + HT_SIZE = 100000 # size of hashtable + N_HT = 3 # number of hashtables + + ht = khmer.LabelHash(K, HT_SIZE, N_HT) + ht.consume_fasta_and_tag(filename) + + subset = ht.do_subset_partition(0, 0) + ht.merge_subset(subset) + + n, _ = ht.count_partitions() + assert n == 49 + + ht.find_unpart(filename2, True, False) + n, _ = ht.count_partitions() + assert n == 49, n # only 49 sequences worth of tags + + +def test_simple_median(): + hi = khmer.LabelHash(6, 1e6, 2) + + (median, average, stddev) = hi.get_median_count("AAAAAA") + print median, average, stddev + assert median == 0 + assert average == 0.0 + assert stddev == 0.0 + + hi.consume("AAAAAA") + (median, average, stddev) = hi.get_median_count("AAAAAA") + print median, average, stddev + assert median == 1 + assert average == 1.0 + assert stddev == 0.0 + From 10598e46231021b7691c410a628cf04c21dc15a6 Mon Sep 17 00:00:00 2001 From: Camille Scott Date: Fri, 10 Jan 2014 14:08:16 -0500 Subject: [PATCH 122/140] killed noisy init code --- khmer/_khmermodule.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/khmer/_khmermodule.cc b/khmer/_khmermodule.cc index b32124012e..4cf587cc2d 100644 --- a/khmer/_khmermodule.cc +++ b/khmer/_khmermodule.cc @@ -4229,9 +4229,9 @@ static int khmer_labelhash_init(khmer_KLabelHashObject * self, PyObject *args, P { if (khmer_KHashbitsType.tp_init((PyObject *)self, args, kwds) < 0) return -1; - std::cout << "testing my pointer ref to hashbits: " << self->khashbits.hashbits->n_tags() << std::endl; - std::cout << "hashbits: " << self->khashbits.hashbits << std::endl; - std::cout << "labelhash: " << self->labelhash << std::endl; + //std::cout << "testing my pointer ref to hashbits: " << self->khashbits.hashbits->n_tags() << 
std::endl; + //std::cout << "hashbits: " << self->khashbits.hashbits << std::endl; + //std::cout << "labelhash: " << self->labelhash << std::endl; return 0; } From acb63483350fca26dbb063e09161750178859ed4 Mon Sep 17 00:00:00 2001 From: "Michael R. Crusoe" Date: Fri, 10 Jan 2014 14:21:27 -0500 Subject: [PATCH 123/140] autopep8 --- khmer/__init__.py | 6 +- khmer/_version.py | 34 ++++---- scripts/extract-partitions.py | 5 +- scripts/sweep-reads-by-partition-buffered.py | 84 +++++++++++--------- setup.cfg | 4 +- setup.py | 19 ++--- tests/test_Hashbits.py | 3 +- tests/test_graph.py | 2 +- tests/test_hashbits.py | 3 +- tests/test_labelhash.py | 51 +++++++----- tests/test_scripts.py | 37 +++++---- tests/test_subset_graph.py | 28 ++++--- 12 files changed, 163 insertions(+), 113 deletions(-) diff --git a/khmer/__init__.py b/khmer/__init__.py index 59a4eb2d76..ba1a9ad359 100644 --- a/khmer/__init__.py +++ b/khmer/__init__.py @@ -30,6 +30,7 @@ def new_hashbits(k, starting_size, n_tables=2): return _new_hashbits(k, primes) + def new_counting_hash(k, starting_size, n_tables=2, n_threads=1): primes = get_n_primes_above_x(n_tables, starting_size) @@ -156,17 +157,20 @@ def get_n_primes_above_x(n, x): Additional functionality can be added to these classes as appropriate. 
''' + class LabelHash(_LabelHash): + def __new__(cls, k, starting_size, n_tables): primes = get_n_primes_above_x(n_tables, starting_size) c = _LabelHash.__new__(cls, k, primes) c.primes = primes return c + class Hashbits(_Hashbits): + def __new__(cls, k, starting_size, n_tables): primes = get_n_primes_above_x(n_tables, starting_size) c = _Hashbits.__new__(cls, k, primes) c.primes = primes return c - diff --git a/khmer/_version.py b/khmer/_version.py index 6e5155675f..cdd5a1359b 100644 --- a/khmer/_version.py +++ b/khmer/_version.py @@ -17,6 +17,7 @@ import subprocess import sys + def run_command(args, cwd=None, verbose=False, hide_stderr=False): try: # remember shell=False, so use git.cmd on windows, not just git @@ -42,6 +43,7 @@ def run_command(args, cwd=None, verbose=False, hide_stderr=False): import re import os.path + def get_expanded_variables(versionfile_source): # the code embedded in _version.py can just fetch the value of these # variables. When used from setup.py, we don't want to import @@ -49,7 +51,7 @@ def get_expanded_variables(versionfile_source): # used from _version.py. variables = {} try: - f = open(versionfile_source,"r") + f = open(versionfile_source, "r") for line in f.readlines(): if line.strip().startswith("git_refnames ="): mo = re.search(r'=\s*"(.*)"', line) @@ -64,12 +66,13 @@ def get_expanded_variables(versionfile_source): pass return variables + def versions_from_expanded_variables(variables, tag_prefix, verbose=False): refnames = variables["refnames"].strip() if refnames.startswith("$Format"): if verbose: print("variables are unexpanded, not using") - return {} # unexpanded, so not in an unpacked git-archive tarball + return {} # unexpanded, so not in an unpacked git-archive tarball refs = set([r.strip() for r in refnames.strip("()").split(",")]) # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of # just "foo-1.0". If we see a "tag: " prefix, prefer those. 
@@ -85,7 +88,7 @@ def versions_from_expanded_variables(variables, tag_prefix, verbose=False): # "stabilization", as well as "HEAD" and "master". tags = set([r for r in refs if re.search(r'\d', r)]) if verbose: - print("discarding '%s', no digits" % ",".join(refs-tags)) + print("discarding '%s', no digits" % ",".join(refs - tags)) if verbose: print("likely tags: %s" % ",".join(sorted(tags))) for ref in sorted(tags): @@ -94,13 +97,14 @@ def versions_from_expanded_variables(variables, tag_prefix, verbose=False): r = ref[len(tag_prefix):] if verbose: print("picking %s" % r) - return { "version": r, - "full": variables["full"].strip() } + return {"version": r, + "full": variables["full"].strip()} # no suitable tags, so we use the full revision id if verbose: print("no suitable tags, using full revision id") - return { "version": variables["full"].strip(), - "full": variables["full"].strip() } + return {"version": variables["full"].strip(), + "full": variables["full"].strip()} + def versions_from_vcs(tag_prefix, versionfile_source, verbose=False): # this runs 'git' from the root of the source tree. 
That either means @@ -117,7 +121,7 @@ def versions_from_vcs(tag_prefix, versionfile_source, verbose=False): here = os.path.abspath(__file__) except NameError: # some py2exe/bbfreeze/non-CPython implementations don't do __file__ - return {} # not always correct + return {} # not always correct GIT = "git" if sys.platform == "win32": @@ -145,7 +149,8 @@ def versions_from_vcs(tag_prefix, versionfile_source, verbose=False): return {} if not stdout.startswith(tag_prefix): if verbose: - print("tag '%s' doesn't start with prefix '%s'" % (stdout, tag_prefix)) + print("tag '%s' doesn't start with prefix '%s'" % + (stdout, tag_prefix)) return {} tag = stdout[len(tag_prefix):] stdout = run_command([GIT, "rev-parse", "HEAD"], cwd=root) @@ -167,7 +172,7 @@ def versions_from_parentdir(parentdir_prefix, versionfile_source, verbose=False) here = os.path.abspath(__file__) except NameError: # py2exe/bbfreeze/non-CPython don't have __file__ - return {} # without __file__, we have no hope + return {} # without __file__, we have no hope # versionfile_source is the relative path from the top of the source # tree to _version.py. Invert this to find the root from __file__. root = here @@ -184,8 +189,9 @@ def versions_from_parentdir(parentdir_prefix, versionfile_source, verbose=False) dirname = os.path.basename(root) if not dirname.startswith(parentdir_prefix): if verbose: - print("guessing rootdir is '%s', but '%s' doesn't start with prefix '%s'" % - (root, dirname, parentdir_prefix)) + print( + "guessing rootdir is '%s', but '%s' doesn't start with prefix '%s'" % + (root, dirname, parentdir_prefix)) return None return {"version": dirname[len(parentdir_prefix):], "full": ""} @@ -193,8 +199,9 @@ def versions_from_parentdir(parentdir_prefix, versionfile_source, verbose=False) parentdir_prefix = "." 
versionfile_source = "khmer/_version.py" + def get_versions(default={"version": "unknown", "full": ""}, verbose=False): - variables = { "refnames": git_refnames, "full": git_full } + variables = {"refnames": git_refnames, "full": git_full} ver = versions_from_expanded_variables(variables, tag_prefix, verbose) if not ver: ver = versions_from_vcs(tag_prefix, versionfile_source, verbose) @@ -204,4 +211,3 @@ def get_versions(default={"version": "unknown", "full": ""}, verbose=False): if not ver: ver = default return ver - diff --git a/scripts/extract-partitions.py b/scripts/extract-partitions.py index dd2225a574..e787a383f3 100755 --- a/scripts/extract-partitions.py +++ b/scripts/extract-partitions.py @@ -31,6 +31,7 @@ def read_partition_file(filename): name, partition_id = record.name.rsplit('\t', 1) yield n, record, int(partition_id) + def output_single(r): if hasattr(r, 'accuracy'): return "@%s\n%s\n+\n%s\n" % (r.name, r.sequence, r.accuracy) @@ -104,12 +105,12 @@ def main(): else: assert not hasattr(r, 'accuracy'), \ "all input files must be FASTA if the first one is" - + break if output_unassigned: unassigned_fp = open('%s.unassigned.%s' % (prefix, SUFFIX), 'w') - + count = {} for filename in args.part_filenames: for n, r, pid in read_partition_file(filename): diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py index f153282319..75996be973 100755 --- a/scripts/sweep-reads-by-partition-buffered.py +++ b/scripts/sweep-reads-by-partition-buffered.py @@ -29,25 +29,27 @@ from collections import namedtuple as nt -DEFAULT_NUM_BUFFERS=50000 -DEFAULT_MAX_READS=1000000 -DEFAULT_BUFFER_SIZE=10 -DEFAULT_OUT_PREF='reads_' -DEFAULT_RANGE=-1 - -MIN_HSIZE=4e7 -MIN_KSIZE=21 - +DEFAULT_NUM_BUFFERS = 50000 +DEFAULT_MAX_READS = 1000000 +DEFAULT_BUFFER_SIZE = 10 +DEFAULT_OUT_PREF = 'reads_' +DEFAULT_RANGE = -1 + +MIN_HSIZE = 4e7 +MIN_KSIZE = 21 + + def fmt_fasta(name, seq, labels=[]): - return 
'>{name}\t{labels}\n{seq}\n'.format(name=name, - labels='\t'.join([str(l) for l in labels]), seq=seq) + return '>{name}\t{labels}\n{seq}\n'.format(name=name, + labels='\t'.join([str(l) for l in labels]), seq=seq) + def write_seq(fp, name, seq, labels=[]): try: fp.write(fmt_fasta(name, seq, labels=labels)) except IOError: print >>sys.stderr, 'Error writing {read}'.format( - read=fmt_fasta(name, seq, labels=labels)) + read=fmt_fasta(name, seq, labels=labels)) return 1 else: return 0 @@ -58,8 +60,10 @@ def write_seq(fp, name, seq, labels=[]): # Hope that each file acrues, on average, BUFFER_SIZE / NUM_PARTS reads # ie, if we buffer 1000000 reads, and we have 100000 partitions or labels, # we should expect the mean buffer size to be 10 reads + + class ReadBuffer: - + def __init__(self): self.buf = [] @@ -78,6 +82,7 @@ def is_full(self, full): def __len__(self): return len(self.buf) + class ReadBufferManager: def __init__(self, max_buffers, max_reads, max_size, output_pref, outdir): @@ -128,13 +133,13 @@ def queue(self, seq_str, buf_id): new_buf = ReadBuffer() new_buf.push(seq_str) self.buffers[buf_id] = new_buf - + self.cur_reads += 1 if self.cur_reads > self.max_reads: print >>sys.stderr, '** Reached max num reads...' self.flush_all() if len(self.buffers) > self.max_buffers: - #self.clean_buffers(2) + # self.clean_buffers(2) print >>sys.stderr, '** Reached max num buffers...' 
self.flush_all() @@ -145,24 +150,25 @@ def flush_all(self): self.flush_buffer(buf_id) assert self.cur_reads == 0 + def main(): parser = build_construct_args() - parser.add_argument('-i', '--input_fastp',dest='input_fastp') - parser.add_argument('-r', '--traversal_range', type=int, dest='traversal_range', \ - default=DEFAULT_RANGE) - parser.add_argument('-b', '--buffer_size', dest='max_reads', type=int, \ + parser.add_argument('-i', '--input_fastp', dest='input_fastp') + parser.add_argument( + '-r', '--traversal_range', type=int, dest='traversal_range', + default=DEFAULT_RANGE) + parser.add_argument('-b', '--buffer_size', dest='max_reads', type=int, default=DEFAULT_MAX_READS) - parser.add_argument('-l', '--buffer_length', dest='buffer_size', type=int, \ + parser.add_argument('-l', '--buffer_length', dest='buffer_size', type=int, default=DEFAULT_BUFFER_SIZE) parser.add_argument('-o', '--output_prefix', dest='output_prefix', default=DEFAULT_OUT_PREF) - parser.add_argument('-m', '--max_buffers', dest='max_buffers', type=int, \ + parser.add_argument('-m', '--max_buffers', dest='max_buffers', type=int, default=DEFAULT_NUM_BUFFERS) parser.add_argument('input_files', nargs='+') args = parser.parse_args() - - + K = args.ksize HT_SIZE = args.min_hashsize if HT_SIZE < MIN_HSIZE: @@ -188,9 +194,9 @@ def main(): print >>sys.stderr, '' print >>sys.stderr, \ 'Estimated memory usage is {prod:.2g} bytes \ - (n_hashes x min_hashsize / 8)'.format(prod=args.n_hashes*HT_SIZE/8) + (n_hashes x min_hashsize / 8)'.format(prod=args.n_hashes * HT_SIZE / 8) print >>sys.stderr, '-' * 8 - + traversal_range = args.traversal_range input_fastp = args.input_fastp outdir = os.path.dirname(input_fastp) @@ -199,18 +205,19 @@ def main(): output_pref = args.output_prefix buf_size = args.buffer_size max_reads = args.max_reads - + input_files = args.input_files - output_buffer = ReadBufferManager(max_buffers, max_reads, buf_size, output_pref, outdir) + output_buffer = ReadBufferManager( + max_buffers, 
max_reads, buf_size, output_pref, outdir) - # consume the partitioned fasta with which to label the graph + # consume the partitioned fasta with which to label the graph ht = khmer.LabelHash(K, HT_SIZE, N_HT) print >>sys.stderr, 'consuming fastp...' ht.consume_partitioned_fasta_and_tag_with_labels(input_fastp) label_number_dist = [] - + n_orphaned = 0 n_labeled = 0 n_mlabeled = 0 @@ -218,7 +225,7 @@ def main(): total_t = time.clock() start_t = time.clock() for read_file in input_files: - print >>sys.stderr,'** sweeping {read_file} for labels...'.format(read_file=read_file) + print >>sys.stderr, '** sweeping {read_file} for labels...'.format(read_file=read_file) file_t = 0.0 try: read_fp = screed.open(read_file) @@ -261,23 +268,26 @@ def main(): # gotta output anything left in the buffers at the end! print >>sys.stderr, '** End of run...' - output_buffer.flush_all() + output_buffer.flush_all() total_t = time.clock() - total_t if output_buffer.num_write_errors > 0 or output_buffer.num_file_errors > 0: print >>sys.stderr, '! WARNING: Sweep finished with errors !' - print >>sys.stderr, '** {writee} reads not written'.format(writee=output_buffer.num_write_errors) - print >>sys.stderr, '** {filee} errors opening files'.format(filee=output_buffer.num_file_errors) + print >>sys.stderr, '** {writee} reads not written'.format( + writee=output_buffer.num_write_errors) + print >>sys.stderr, '** {filee} errors opening files'.format( + filee=output_buffer.num_file_errors) - print >>sys.stderr, 'swept {n_reads} for labels...'.format(n_reads=n_labeled+n_mlabeled+n_orphaned) + print >>sys.stderr, 'swept {n_reads} for labels...'.format( + n_reads=n_labeled + n_mlabeled + n_orphaned) print >>sys.stderr, '...with {nc} labeled and {no} orphaned'.format( - nc=n_labeled, no=n_orphaned) + nc=n_labeled, no=n_orphaned) print >>sys.stderr, '...and {nmc} multilabeled'.format(nmc=n_mlabeled) - + print >>sys.stderr, '** outputting label number distribution...' 
with open('label_dist.txt', 'wb') as outfp: for nc in label_number_dist: outfp.write('{nc}\n'.format(nc=nc)) - + if __name__ == '__main__': main() diff --git a/setup.cfg b/setup.cfg index a7c3a5e01a..51f93526b2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,8 +1,8 @@ [nosetests] verbosity = 2 stop = TRUE -attr = !known_failing -#attr = !known_failing,!highmem +#attr = !known_failing +attr = !known_failing,!highmem # where highmem > 0.5GiB memory [build_ext] diff --git a/setup.py b/setup.py index 78bff96df6..722be23337 100755 --- a/setup.py +++ b/setup.py @@ -71,8 +71,8 @@ [ "khmer_config", "thread_id_map", "trace_logger", "perf_metrics", "read_parsers", "ktable", "hashtable", "hashbits", "labelhash", "counting", - "subset", "aligner", "scoringmatrix", "node", "kmer", - ] + "subset", "aligner", "scoringmatrix", "node", "kmer", + ] )) extension_mod_DICT = \ @@ -106,18 +106,18 @@ ' Jason Pell, Jared Simpson, Camille Scott,' ' Qingpeng Zhang, and C. Titus Brown', "author_email": 'khmer-project@idyll.org', - #"maintainer": 'Michael R. Crusoe', # this overrides the author field - #"maintainer_email": 'mcrusoe@msu.edu', # so don't include it - #http://docs.python.org/2/distutils/setupscript.html - # #additiona-meta-data note #3 + # "maintainer": 'Michael R. 
Crusoe', # this overrides the author field + # "maintainer_email": 'mcrusoe@msu.edu', # so don't include it + # http://docs.python.org/2/distutils/setupscript.html + # additiona-meta-data note #3 "url": 'http://ged.msu.edu/', "packages": ['khmer'], "install_requires": ["screed >= 0.7.1", 'argparse >= 1.2.1', ], "setup_requires": ['nose >= 1.0', 'sphinx', ], "scripts": scripts, "ext_modules": [extension_mod, ], - #"platforms": '', # empty as is conveyed by the classifiers below - #"license": '', # empty as is conveyed by the classifier below + # "platforms": '', # empty as is conveyed by the classifiers below + # "license": '', # empty as is conveyed by the classifier below "include_package_data": True, "classifiers": [ "Development Status :: 4 - Beta", @@ -132,7 +132,7 @@ "Programming Language :: C++", "Programming Language :: Python :: 2.7", "Topic :: Scientific/Engineering :: Bio-Informatics", - ], + ], } # Only run lib setup when needed, not on every invocation @@ -140,6 +140,7 @@ class build_ext(_build_ext): + """Specialized Python extension builder.""" def run(self): diff --git a/tests/test_Hashbits.py b/tests/test_Hashbits.py index 761eb58c77..f0d09cf44e 100644 --- a/tests/test_Hashbits.py +++ b/tests/test_Hashbits.py @@ -18,9 +18,11 @@ import khmer_tst_utils as utils from nose.plugins.attrib import attr + def teardown(): utils.cleanup() + def test__get_set_tag_density(): ht = khmer.Hashbits(32, 1, 1) @@ -551,4 +553,3 @@ def test_simple_median(): assert median == 1 assert average == 1.0 assert stddev == 0.0 - diff --git a/tests/test_graph.py b/tests/test_graph.py index 3891df740e..a0df526b56 100644 --- a/tests/test_graph.py +++ b/tests/test_graph.py @@ -241,7 +241,7 @@ def test_output_fq(self): print open(output_file).read() - x = set([ r.accuracy for r in screed.open(output_file) ]) + x = set([r.accuracy for r in screed.open(output_file)]) assert x, x @attr('highmem') diff --git a/tests/test_hashbits.py b/tests/test_hashbits.py index aca4dad430..92d5484a1e 
100644 --- a/tests/test_hashbits.py +++ b/tests/test_hashbits.py @@ -11,9 +11,11 @@ import khmer_tst_utils as utils from nose.plugins.attrib import attr + def teardown(): utils.cleanup() + def test__get_set_tag_density(): ht = khmer.new_hashbits(32, 1, 1) @@ -544,4 +546,3 @@ def test_simple_median(): assert median == 1 assert average == 1.0 assert stddev == 0.0 - diff --git a/tests/test_labelhash.py b/tests/test_labelhash.py index 4ab73a8319..51431ccd56 100644 --- a/tests/test_labelhash.py +++ b/tests/test_labelhash.py @@ -11,26 +11,29 @@ import khmer_tst_utils as utils from nose.plugins.attrib import attr + def teardown(): utils.cleanup() # -# @camillescott TODO: more tests! +# @camillescott TODO: more tests! # * thread-safety + def test_n_labels(): lh = LabelHash(20, 1e7, 4) filename = utils.get_test_data('test-labels.fa') lh.consume_fasta_and_tag_with_labels(filename) - + print lh.n_labels() assert lh.n_labels() == 4 + def test_get_label_dict(): lb = LabelHash(20, 1e7, 4) filename = utils.get_test_data('test-labels.fa') lb.consume_fasta_and_tag_with_labels(filename) - + labels = lb.get_label_dict() expected = [0L, 1L, 2L, 3L] for e_label in expected: @@ -38,6 +41,7 @@ def test_get_label_dict(): for a_label in labels: assert a_label in expected + def test_get_tag_labels(): lb = LabelHash(20, 1e7, 4) filename = utils.get_test_data('single-read.fq') @@ -48,6 +52,7 @@ def test_get_tag_labels(): assert len(labels) == 1 assert labels.pop() == 0L + def test_consume_fasta_and_tag_with_labels(): lb = LabelHash(20, 1e7, 4) read_1 = 'ACGTAACCGGTTAAACCCGGGTTTAAAACCCCGGGGTTTT' @@ -77,22 +82,24 @@ def test_consume_partitioned_fasta_and_tag_with_labels(): lb = LabelHash(20, 1e7, 4) filename = utils.get_test_data('real-partition-small.fa') - total_reads, n_consumed = lb.consume_partitioned_fasta_and_tag_with_labels(filename) + total_reads, n_consumed = lb.consume_partitioned_fasta_and_tag_with_labels( + filename) labels = set() for record in screed.open(filename): seq = 
record.sequence labels.update(lb.sweep_label_neighborhood(seq, False, False)) - #print lb.n_labels() - #print labels + # print lb.n_labels() + # print labels assert len(labels) == 1 assert labels.pop() == 2L - assert lb.n_labels() == 1 + assert lb.n_labels() == 1 + def test_sweep_tag_neighborhood(): lb = LabelHash(20, 1e7, 4) filename = utils.get_test_data('single-read.fq') lb.consume_fasta_and_tag(filename) - + tags = lb.sweep_tag_neighborhood('CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT') assert len(tags) == 1 assert tags.pop() == 173473779682L @@ -102,7 +109,7 @@ def test_sweep_label_neighborhood(): lb = LabelHash(20, 1e7, 4) filename = utils.get_test_data('single-read.fq') lb.consume_fasta_and_tag_with_labels(filename) - + labels = lb.sweep_label_neighborhood('CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT') assert len(labels) == 1 assert labels.pop() == 0L @@ -114,37 +121,43 @@ def test_sweep_label_neighborhood(): traversing from B should find labels from A, B, and C, and traversing from C should find labels from B and C ''' + + def test_label_tag_correctness(): lb = LabelHash(20, 1e7, 4) filename = utils.get_test_data('test-labels.fa') lb.consume_fasta_and_tag_with_labels(filename) - + # read A - labels = lb.sweep_label_neighborhood('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAGCTAGGCTAGGTGTGCTCTGCCTAGAGCTAGGCTAGGTGT') + labels = lb.sweep_label_neighborhood( + 'ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAGCTAGGCTAGGTGTGCTCTGCCTAGAGCTAGGCTAGGTGT') print lb.sweep_tag_neighborhood('TTCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAGCTAGGCTAGGTGTGCTCTGCCTAGAGCTAGGCTAGGTGT') print labels - print len('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG')-19 + print len('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG') - 19 assert len(labels) == 2 assert 0L in labels assert 1L in labels - + # read B - labels = 
lb.sweep_label_neighborhood('GCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA') + labels = lb.sweep_label_neighborhood( + 'GCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA') print labels assert len(labels) == 3 assert 0L in labels assert 1L in labels assert 2L in labels - + # read C - labels = lb.sweep_label_neighborhood('TGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGAGTTGGGGATAGATAGATAGATGAGTGTAGATCCAACAACACATACA') + labels = lb.sweep_label_neighborhood( + 'TGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGAGTTGGGGATAGATAGATAGATGAGTGTAGATCCAACAACACATACA') print labels assert len(labels) == 2 assert 1L in labels assert 2L in labels - + # read D - labels = lb.sweep_label_neighborhood('TATATATATAGCTAGCTAGCTAACTAGCTAGCATCGATCGATCGATC') + labels = lb.sweep_label_neighborhood( + 'TATATATATAGCTAGCTAGCTAACTAGCTAGCATCGATCGATCGATC') print labels assert len(labels) == 1 assert 3L in labels @@ -153,6 +166,7 @@ def test_label_tag_correctness(): # Begin Hashbits tests # + def test__get_set_tag_density(): ht = khmer.LabelHash(32, 1, 1) @@ -683,4 +697,3 @@ def test_simple_median(): assert median == 1 assert average == 1.0 assert stddev == 0.0 - diff --git a/tests/test_scripts.py b/tests/test_scripts.py index 806e88d2ea..8f585888f6 100644 --- a/tests/test_scripts.py +++ b/tests/test_scripts.py @@ -3,7 +3,9 @@ # Copyright (C) Michigan State University, 2009-2013. It is licensed under # the three-clause BSD license; see doc/LICENSE.txt. 
Contact: ctb@msu.edu # -import sys, os, shutil +import sys +import os +import shutil from cStringIO import StringIO import traceback @@ -433,6 +435,7 @@ def test_normalize_by_median_force(): assert '*** Skipping' in err assert '** IOErrors' in err + def test_normalize_by_median_no_bigcount(): infile = utils.get_temp_filename('test.fa') hashfile = utils.get_temp_filename('test-out.kh') @@ -447,12 +450,13 @@ def test_normalize_by_median_no_bigcount(): (status, out, err) = runscript(script, args) assert status == 0, (out, err) print (out, err) - + assert os.path.exists(hashfile), hashfile kh = khmer.load_counting_hash(hashfile) - + assert kh.get('GGTTGACG') == 255 + def test_normalize_by_median_dumpfrequency(): CUTOFF = '1' @@ -864,9 +868,11 @@ def test_extract_partitions(): parts = set(parts) assert len(parts) == 1, len(parts) + def test_extract_partitions_fq(): seqfile = utils.get_test_data('random-20-a.fq') - graphbase = _make_graph(seqfile, do_partition=True, annotate_partitions=True) + graphbase = _make_graph( + seqfile, do_partition=True, annotate_partitions=True) in_dir = os.path.dirname(graphbase) # get the final part file @@ -875,7 +881,7 @@ def test_extract_partitions_fq(): # ok, now run extract-partitions. 
script = scriptpath('extract-partitions.py') args = ['extracted', partfile] - + runscript(script, args, in_dir) distfile = os.path.join(in_dir, 'extracted.dist') @@ -886,15 +892,16 @@ def test_extract_partitions_fq(): dist = open(distfile).readline() assert dist.strip() == '99 1 1 99' - parts = [ r.name.split('\t')[1] for r in screed.open(partfile) ] + parts = [r.name.split('\t')[1] for r in screed.open(partfile)] assert len(parts) == 99, len(parts) parts = set(parts) assert len(parts) == 1, len(parts) - quals = set([ r.accuracy for r in screed.open(partfile) ]) + quals = set([r.accuracy for r in screed.open(partfile)]) quals = list(quals) assert quals[0], quals + def test_abundance_dist(): infile = utils.get_temp_filename('test.fa') outfile = utils.get_temp_filename('test.dist') @@ -1222,18 +1229,20 @@ def test_sample_reads_randomly(): '895:1:1:1327:13028', '895:1:1:1368:4434', '895:1:1:1335:19932', '895:1:1:1340:19387']) + def test_sweep_reads_by_partition_buffered(): readfile = utils.get_temp_filename('reads.fa') contigfile = utils.get_temp_filename('contigs.fp') in_dir = os.path.dirname(contigfile) - + shutil.copyfile(utils.get_test_data('test-sweep-reads.fa'), readfile) - shutil.copyfile(utils.get_test_data('test-sweep-contigs.fp'), contigfile) + shutil.copyfile(utils.get_test_data('test-sweep-contigs.fp'), contigfile) script = scriptpath('sweep-reads-by-partition-buffered.py') - args = ['-k', '25', '-o', 'test', '-i', contigfile, readfile, 'junkfile.fa'] + args = ['-k', '25', '-o', 'test', '-i', + contigfile, readfile, 'junkfile.fa'] status, out, err = runscript(script, args, in_dir) - + # check if the bad file was skipped without issue assert 'ERROR' in err assert 'skipping' in err @@ -1242,13 +1251,13 @@ def test_sweep_reads_by_partition_buffered(): out2 = os.path.join(in_dir, 'test_1.fa') mout = os.path.join(in_dir, 'test_multi.fa') oout = os.path.join(in_dir, 'test_orphaned.fa') - + print os.listdir(in_dir) seqs1 = set([r.name for r in screed.open(out1)]) 
seqs2 = set([r.name for r in screed.open(out2)]) seqsm = set([r.name for r in screed.open(mout)]) - seqso = set([r.name for r in screed.open(oout)]) + seqso = set([r.name for r in screed.open(oout)]) print seqs1 print seqs2 @@ -1256,6 +1265,6 @@ def test_sweep_reads_by_partition_buffered(): print seqso assert seqs1 == set(['read1_p0\t0', 'read2_p0\t0']) assert seqs2 == set(['read3_p1\t1']) - assert (seqsm == set(['read4_multi\t0\t1']) or \ + assert (seqsm == set(['read4_multi\t0\t1']) or seqsm == set(['read4_multi\t1\t0'])) assert seqso == set(['read5_orphan']) diff --git a/tests/test_subset_graph.py b/tests/test_subset_graph.py index 0075630250..c38ced8c29 100644 --- a/tests/test_subset_graph.py +++ b/tests/test_subset_graph.py @@ -406,13 +406,14 @@ def test_small_real_partitions(): CCTCGGGCCTTTCCGTTCCGTTGCCGCCCAAGCTCTCTAGCATCGAATCGGTCAAGCGGT\ """ + def test_partition_on_abundance_1(): print (a,) print (b,) kh = khmer.new_counting_hash(20, 1e6, 4) for i in range(10): print kh.consume_and_tag(a) - + for i in range(10): print kh.consume_and_tag(b) @@ -421,11 +422,12 @@ def test_partition_on_abundance_1(): x = p.count_partitions() assert x == (1, 0) # one partition, no remainders + def test_partition_on_abundance_2(): kh = khmer.new_counting_hash(20, 1e6, 4) for i in range(10): print kh.consume_and_tag(a) - + for i in range(5): print kh.consume_and_tag(b) @@ -434,11 +436,12 @@ def test_partition_on_abundance_2(): x = p.count_partitions() assert x == (1, 6) # one partition, six disconnected + def test_partition_on_abundance_3(): kh = khmer.new_counting_hash(20, 1e6, 4) for i in range(10): print kh.consume_and_tag(a) - + for i in range(5): print kh.consume_and_tag(b) @@ -447,16 +450,17 @@ def test_partition_on_abundance_3(): # this will get paths only in 'b' p = kh.do_subset_partition_with_abundance(5, 10) - + x = p.count_partitions() print x assert x == (2, 2) # two partitions, two ignored tags + def test_partition_overlap_1(): kh = khmer.new_counting_hash(20, 1e6, 4) 
for i in range(10): kh.consume_and_tag(a) - + for i in range(10): kh.consume_and_tag(b) @@ -466,17 +470,18 @@ def test_partition_overlap_1(): # this will get paths only in 'a', again -- should be the same! p2 = kh.do_subset_partition_with_abundance(10, 50) - #p1.report_on_partitions() - #p2.report_on_partitions() + # p1.report_on_partitions() + # p2.report_on_partitions() x = p1.compare_partitions(3, p2, 3) assert x == (0, 0, 14), x + def test_partition_overlap_2(): kh = khmer.new_counting_hash(20, 1e6, 4) for i in range(10): kh.consume_and_tag(a) - + for i in range(5): kh.consume_and_tag(b) @@ -486,8 +491,8 @@ def test_partition_overlap_2(): # this will get paths only in 'b' p2 = kh.do_subset_partition_with_abundance(5, 10) - #p1.report_on_partitions() - #p2.report_on_partitions() + # p1.report_on_partitions() + # p2.report_on_partitions() x = p1.compare_partitions(3, p2, 3) assert x == (8, 6, 0), x @@ -497,7 +502,7 @@ def test_partition_overlap_2(): x = p1.partition_sizes() assert x == ([(3L, 8L)], 0), x - + x = p2.partition_sizes() assert x == ([(3L, 6L), (5L, 6L)], 2), x @@ -506,4 +511,3 @@ def test_partition_overlap_2(): x = p2.partition_average_coverages(kh) assert x == [(3L, 5L), (5L, 10L)], x - From 9a511c40e11ab93b77f45c96d8310d94946568ca Mon Sep 17 00:00:00 2001 From: "Michael R. 
Crusoe" Date: Fri, 10 Jan 2014 17:57:29 -0500 Subject: [PATCH 124/140] cpychecker fixes --- khmer/_khmermodule.cc | 253 +++++++++++++++++++++++------------------- lib/subset.cc | 2 +- 2 files changed, 141 insertions(+), 114 deletions(-) diff --git a/khmer/_khmermodule.cc b/khmer/_khmermodule.cc index 4cf587cc2d..50c011b1bb 100644 --- a/khmer/_khmermodule.cc +++ b/khmer/_khmermodule.cc @@ -159,9 +159,11 @@ void _report_fn(const char * info, void * data, unsigned long long n_reads, PyObject * obj = (PyObject *) data; if (obj != Py_None) { PyObject * args = Py_BuildValue("sKK", info, n_reads, other); - PyObject * r = PyObject_Call(obj, args, NULL); - Py_XDECREF(r); - Py_DECREF(args); + if (args != NULL) { + PyObject * r = PyObject_Call(obj, args, NULL); + Py_XDECREF(r); + } + Py_XDECREF(args); } } @@ -1970,10 +1972,11 @@ void _dump_report_fn(const char * info, unsigned int count, void * data) PyObject * obj = (PyObject *) data; if (obj != Py_None) { PyObject * args = Py_BuildValue("sI", info, count); - - PyObject * r = PyObject_Call(obj, args, NULL); - Py_XDECREF(r); - Py_DECREF(args); + if (args != NULL) { + PyObject * r = PyObject_Call(obj, args, NULL); + Py_XDECREF(r); + } + Py_XDECREF(args); } } @@ -2061,8 +2064,7 @@ static PyObject * hash_consume_and_tag(PyObject * self, PyObject * args) khmer_KCountingHashObject * me = (khmer_KCountingHashObject *) self; khmer::CountingHash * counting = me->counting; - char * seq; - PyObject * callback_obj = NULL; + const char * seq; if (!PyArg_ParseTuple(args, "s", &seq)) { return NULL; @@ -2078,7 +2080,7 @@ static PyObject * hash_consume_and_tag(PyObject * self, PyObject * args) return NULL; } - return Py_BuildValue("L", n_consumed); + return Py_BuildValue("K", n_consumed); } static PyObject * hash_consume_fasta_and_tag(PyObject * self, PyObject * args) @@ -2086,7 +2088,7 @@ static PyObject * hash_consume_fasta_and_tag(PyObject * self, PyObject * args) khmer_KCountingHashObject * me = (khmer_KCountingHashObject *) self; 
khmer::CountingHash * counting = me->counting; - char * filename; + const char * filename; PyObject * callback_obj = NULL; if (!PyArg_ParseTuple(args, "s|O", &filename, &callback_obj)) { @@ -2105,7 +2107,7 @@ static PyObject * hash_consume_fasta_and_tag(PyObject * self, PyObject * args) return NULL; } - return Py_BuildValue("iL", total_reads, n_consumed); + return Py_BuildValue("IK", total_reads, n_consumed); } static PyObject * hash_find_all_tags_truncate_on_abundance(PyObject * self, PyObject *args) @@ -2113,10 +2115,10 @@ static PyObject * hash_find_all_tags_truncate_on_abundance(PyObject * self, PyOb khmer_KCountingHashObject * me = (khmer_KCountingHashObject *) self; khmer::CountingHash * counting = me->counting; - char * kmer_s = NULL; - unsigned int min_count, max_count; + const char * kmer_s = NULL; + BoundedCounterType min_count, max_count; - if (!PyArg_ParseTuple(args, "sii", &kmer_s, &min_count, &max_count)) { + if (!PyArg_ParseTuple(args, "sHH", &kmer_s, &min_count, &max_count)) { return NULL; } @@ -2153,9 +2155,9 @@ static PyObject * hash_do_subset_partition_with_abundance(PyObject * self, PyObj khmer::HashIntoType start_kmer = 0, end_kmer = 0; PyObject * break_on_stop_tags_o = NULL; PyObject * stop_big_traversals_o = NULL; - unsigned int min_count, max_count; + BoundedCounterType min_count, max_count; - if (!PyArg_ParseTuple(args, "ii|KKOOO", + if (!PyArg_ParseTuple(args, "HH|KKOOO", &min_count, &max_count, &start_kmer, &end_kmer, &break_on_stop_tags_o, @@ -2190,6 +2192,10 @@ static PyObject * hash_do_subset_partition_with_abundance(PyObject * self, PyObj khmer_KSubsetPartitionObject * subset_obj = (khmer_KSubsetPartitionObject *)\ PyObject_New(khmer_KSubsetPartitionObject, &khmer_KSubsetPartitionType); + if (subset_obj == NULL) { + return NULL; + } + subset_obj->subset = subset_p; return (PyObject *) subset_obj; @@ -2309,7 +2315,7 @@ static PyObject* _new_counting_hash(PyObject * self, PyObject * args) std::vector sizes; Py_ssize_t 
sizes_list_o_length = PyObject_Length(sizes_list_o); if (sizes_list_o_length == -1) { - return NULL; + return NULL; } for (int i = 0; i < sizes_list_o_length; i++) { PyObject * size_o = PyList_GET_ITEM(sizes_list_o, i); @@ -2332,6 +2338,51 @@ static PyObject* _new_counting_hash(PyObject * self, PyObject * args) // hashbits stuff // +static void khmer_hashbits_dealloc(PyObject * obj); +static PyObject* khmer_hashbits_new(PyTypeObject * type, PyObject * args, PyObject * kwds); +static int khmer_hashbits_init(khmer_KHashbitsObject * self, PyObject * args, PyObject * kwds); +static PyObject * khmer_hashbits_getattr(PyObject * obj, char * name); + +static PyTypeObject khmer_KHashbitsType = { + PyObject_HEAD_INIT(NULL) + 0, + "Hashbits", sizeof(khmer_KHashbitsObject), + 0, + (destructor)khmer_hashbits_dealloc, /*tp_dealloc*/ + 0, /*tp_print*/ + khmer_hashbits_getattr, /*tp_getattr*/ + 0, /*tp_setattr*/ + 0, /*tp_compare*/ + 0, /*tp_repr*/ + 0, /*tp_as_number*/ + 0, /*tp_as_sequence*/ + 0, /*tp_as_mapping*/ + 0, /*tp_hash */ + 0, /*tp_call*/ + 0, /*tp_str*/ + 0, /*tp_getattro*/ + 0, /*tp_setattro*/ + 0, /*tp_as_buffer*/ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /*tp_flags*/ + "hashbits object", /* tp_doc */ + 0, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + 0, /* tp_iter */ + 0, /* tp_iternext */ + 0, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + (initproc)khmer_hashbits_init, /* tp_init */ + 0, /* tp_alloc */ +}; + static PyObject * hashbits_n_unique_kmers(PyObject * self, PyObject * args) { khmer_KHashbitsObject * me = (khmer_KHashbitsObject *) self; @@ -2358,7 +2409,7 @@ static PyObject * hashbits_count_overlap(PyObject * self, PyObject * args) PyObject * callback_obj = NULL; khmer::Hashbits * ht2; - if (!PyArg_ParseTuple(args, "sO|O", &filename, &ht2_argu, + if (!PyArg_ParseTuple(args, 
"sO!|O", &filename, &khmer_KHashbitsType, &ht2_argu, &callback_obj)) { return NULL; } @@ -3904,50 +3955,6 @@ khmer_hashbits_getattr(PyObject * obj, char * name) return Py_FindMethod(khmer_hashbits_methods, obj, name); } -static void khmer_hashbits_dealloc(PyObject * obj); -static PyObject* khmer_hashbits_new(PyTypeObject * type, PyObject * args, PyObject * kwds); -static int khmer_hashbits_init(khmer_KHashbitsObject * self, PyObject * args, PyObject * kwds); - -static PyTypeObject khmer_KHashbitsType = { - PyObject_HEAD_INIT(NULL) - 0, - "Hashbits", sizeof(khmer_KHashbitsObject), - 0, - (destructor)khmer_hashbits_dealloc, /*tp_dealloc*/ - 0, /*tp_print*/ - khmer_hashbits_getattr, /*tp_getattr*/ - 0, /*tp_setattr*/ - 0, /*tp_compare*/ - 0, /*tp_repr*/ - 0, /*tp_as_number*/ - 0, /*tp_as_sequence*/ - 0, /*tp_as_mapping*/ - 0, /*tp_hash */ - 0, /*tp_call*/ - 0, /*tp_str*/ - 0, /*tp_getattro*/ - 0, /*tp_setattro*/ - 0, /*tp_as_buffer*/ - Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /*tp_flags*/ - "hashbits object", /* tp_doc */ - 0, /* tp_traverse */ - 0, /* tp_clear */ - 0, /* tp_richcompare */ - 0, /* tp_weaklistoffset */ - 0, /* tp_iter */ - 0, /* tp_iternext */ - khmer_hashbits_methods, /* tp_methods */ - 0, /* tp_members */ - 0, /* tp_getset */ - 0, /* tp_base */ - 0, /* tp_dict */ - 0, /* tp_descr_get */ - 0, /* tp_descr_set */ - 0, /* tp_dictoffset */ - (initproc)khmer_hashbits_init, /* tp_init */ - 0, /* tp_alloc */ -}; - // __new__ for hashbits; necessary for proper subclassing // This will essentially do what the old factory function did. 
Unlike many __new__ // methods, we take our arguments here, because there's no "unitialized" hashbits @@ -3962,7 +3969,8 @@ static PyObject* khmer_hashbits_new(PyTypeObject * type, PyObject * args, PyObje PyObject* sizes_list_o = NULL; if (!PyArg_ParseTuple(args, "IO", &k, &sizes_list_o)) { - return NULL; + Py_DECREF(self); + return NULL; } std::vector sizes; @@ -3998,7 +4006,7 @@ static PyObject * subset_count_partitions(PyObject * self, unsigned int n_partitions = 0, n_unassigned = 0; subset_p->count_partitions(n_partitions, n_unassigned); - return Py_BuildValue("ii", n_partitions, n_unassigned); + return Py_BuildValue("II", n_partitions, n_unassigned); } static PyObject * subset_report_on_partitions(PyObject * self, @@ -4024,9 +4032,9 @@ static PyObject * subset_compare_partitions(PyObject * self, khmer::SubsetPartition * subset1_p = me->subset; PyObject * subset2_obj = NULL; - unsigned int pid1, pid2; // @CTB ensure that these are unsigned? + PartitionID pid1, pid2; // @CTB ensure that these are unsigned? 
- if (!PyArg_ParseTuple(args, "iOi", + if (!PyArg_ParseTuple(args, "IOI", &pid1, &subset2_obj, &pid2)) { return NULL; } @@ -4035,11 +4043,10 @@ static PyObject * subset_compare_partitions(PyObject * self, khmer::SubsetPartition * subset2_p = other->subset; unsigned int n_only1 = 0, n_only2 = 0, n_shared = 0; - subset1_p->compare_to_partition((PartitionID) pid1, - subset2_p, (PartitionID) pid2, + subset1_p->compare_to_partition(pid1, subset2_p, pid2, n_only1, n_only2, n_shared); - return Py_BuildValue("iii", n_only1, n_only2, n_shared); + return Py_BuildValue("III", n_only1, n_only2, n_shared); } static PyObject * subset_partition_size_distribution(PyObject * self, @@ -4058,15 +4065,24 @@ static PyObject * subset_partition_size_distribution(PyObject * self, subset_p->partition_size_distribution(d, n_unassigned); PyObject * x = PyList_New(d.size()); - khmer::PartitionCountDistribution::const_iterator di; + if (x == NULL) { + return NULL; + } + khmer::PartitionCountDistribution::iterator di; unsigned int i; for (i = 0, di = d.begin(); di != d.end(); di++, i++) { - PyList_SET_ITEM(x, i, Py_BuildValue("LL", di->first, di->second)); + PyObject * tup = Py_BuildValue("KK", di->first, di->second); + if (tup != NULL) { + PyList_SET_ITEM(x, i, tup); + } + Py_XDECREF(tup); } assert (i == d.size()); - return Py_BuildValue("Oi", x, n_unassigned); + PyObject * ret = Py_BuildValue("OI", x, n_unassigned); + Py_DECREF(x); + return ret; } static PyObject * subset_partition_sizes(PyObject * self, @@ -4077,7 +4093,7 @@ static PyObject * subset_partition_sizes(PyObject * self, unsigned int min_size = 0; - if (!PyArg_ParseTuple(args, "|i", &min_size)) { + if (!PyArg_ParseTuple(args, "|I", &min_size)) { return NULL; } @@ -4085,23 +4101,33 @@ static PyObject * subset_partition_sizes(PyObject * self, unsigned int n_unassigned = 0; subset_p->partition_sizes(cm, n_unassigned); - unsigned int i; + unsigned int i = 0; khmer::PartitionCountMap::const_iterator mi; - for (i = 0, mi = cm.begin(); 
mi != cm.end(); mi++) { + for (mi = cm.begin(); mi != cm.end(); mi++) { if (mi->second >= min_size) i++; } PyObject * x = PyList_New(i); + if (x == NULL) { + return NULL; + } // this should probably be a dict. @CTB for (i = 0, mi = cm.begin(); mi != cm.end(); mi++) { if (mi->second >= min_size) { - PyList_SET_ITEM(x, i, Py_BuildValue("LL", mi->first, mi->second)); + PyObject * tup = Py_BuildValue("II", mi->first, mi->second); + if (tup != NULL) { + PyList_SET_ITEM(x, i, tup); + } + Py_XDECREF(tup); i++; } } - return Py_BuildValue("Oi", x, n_unassigned); + PyObject * ret = Py_BuildValue("OI", x, n_unassigned); + Py_DECREF(x); + + return ret; } static PyObject * subset_partition_average_coverages(PyObject * self, @@ -4122,16 +4148,23 @@ static PyObject * subset_partition_average_coverages(PyObject * self, subset_p->partition_average_coverages(cm, counting); unsigned int i; - khmer::PartitionCountMap::const_iterator mi; + khmer::PartitionCountMap::iterator mi; PyObject * x = PyList_New(cm.size()); + if (x == NULL) { + return NULL; + } // this should probably be a dict. 
@CTB for (i = 0, mi = cm.begin(); mi != cm.end(); mi++, i++) { - PyList_SET_ITEM(x, i, Py_BuildValue("LL", mi->first, mi->second)); + PyObject * tup = Py_BuildValue("II", mi->first, mi->second); + if (tup != NULL) { + PyList_SET_ITEM(x, i, tup); + } + Py_XDECREF(tup); } - return Py_BuildValue("O", x); + return x; } static PyMethodDef khmer_subset_methods[] = { @@ -4240,12 +4273,21 @@ static PyObject * labelhash_get_label_dict(PyObject * self, PyObject * args) { khmer::LabelHash * hb = me->labelhash; PyObject * d = PyDict_New(); + if (d == NULL) { + return NULL; + } khmer::LabelPtrMap::iterator it; for (it = hb->label_ptrs.begin(); it!=hb->label_ptrs.end(); ++it) { - PyDict_SetItem(d, Py_BuildValue("K", it->first), Py_BuildValue("K", it->second)); + PyObject * key = Py_BuildValue("K", it->first); + Py_XDECREF(key); + PyObject * val = Py_BuildValue("K", it->second); + Py_XDECREF(val); + if (key != NULL && val != NULL) { + PyDict_SetItem(d, key, val); + } } - + return d; } @@ -4256,7 +4298,7 @@ static PyObject * labelhash_consume_fasta_and_tag_with_labels(PyObject * self, P std::ofstream outfile; - char * filename; + const char * filename; PyObject * callback_obj = NULL; if (!PyArg_ParseTuple(args, "s|O", &filename, &callback_obj)) { @@ -4277,7 +4319,7 @@ static PyObject * labelhash_consume_fasta_and_tag_with_labels(PyObject * self, P //Py_END_ALLOW_THREADS if (exc_raised) return NULL; - return Py_BuildValue("iL", total_reads, n_consumed); + return Py_BuildValue("IK", total_reads, n_consumed); } @@ -4287,7 +4329,7 @@ static PyObject * labelhash_consume_partitioned_fasta_and_tag_with_labels( khmer_KLabelHashObject * me = (khmer_KLabelHashObject *) self; khmer::LabelHash * labelhash = me->labelhash; - char * filename; + const char * filename; PyObject * callback_obj = NULL; if (!PyArg_ParseTuple(args, "s|O", &filename, &callback_obj)) { @@ -4306,41 +4348,34 @@ static PyObject * labelhash_consume_partitioned_fasta_and_tag_with_labels( std::cout << "caught exception in 
consume_partitioned_fasta_and_tag_with_labels!" << std::endl; return NULL; } - std::cout << "building value for return..." << std::endl; - return Py_BuildValue("iK", total_reads, n_consumed); + return Py_BuildValue("IK", total_reads, n_consumed); } static PyObject * labelhash_consume_sequence_and_tag_with_labels(PyObject * self, PyObject * args) { khmer_KLabelHashObject * me = (khmer_KLabelHashObject *) self; khmer::LabelHash * hb = me->labelhash; - std::cout << "inside labelhash consume cpython func, parsing args..." << std::endl; - char * seq = NULL; - unsigned long long c = NULL; + const char * seq = NULL; + unsigned long long c = 0; if (!PyArg_ParseTuple(args, "sK", &seq, &c)) { return NULL; } - std::cout << "parsed args, getting new label" << std::endl; unsigned long long n_consumed = 0; khmer::Label * the_label = hb->check_and_allocate_label(c); try { - //if (hb->check_and_normalize_read(seq)) { - std::cout << "calling low level consume func on labelhash..." << std::endl; hb->consume_sequence_and_tag_with_labels(seq, n_consumed, *the_label); - //} } catch (_khmer_signal &e) { return NULL; } - std::cout << "packaging return value and returning!" 
<< std::endl; - return Py_BuildValue("L", n_consumed); + return Py_BuildValue("K", n_consumed); } static PyObject * labelhash_sweep_label_neighborhood(PyObject * self, PyObject * args) { khmer_KLabelHashObject * me = (khmer_KLabelHashObject *) self; khmer::LabelHash * hb = me->labelhash; - char * seq = NULL; - unsigned int r = NULL; + const char * seq = NULL; + int r = -1; PyObject * break_on_stop_tags_o = NULL; PyObject * stop_big_traversals_o = NULL; @@ -4372,10 +4407,10 @@ static PyObject * labelhash_sweep_label_neighborhood(PyObject * self, PyObject * LabelPtrSet found_labels; bool exc_raised = false; - unsigned int num_traversed = 0; + //unsigned int num_traversed = 0; //Py_BEGIN_ALLOW_THREADS try { - num_traversed = hb->sweep_label_neighborhood(seq, found_labels, range, break_on_stop_tags, stop_big_traversals); + hb->sweep_label_neighborhood(seq, found_labels, range, break_on_stop_tags, stop_big_traversals); } catch (_khmer_signal &e) { exc_raised = true; } @@ -4404,8 +4439,8 @@ static PyObject * labelhash_sweep_tag_neighborhood(PyObject * self, PyObject *ar khmer_KLabelHashObject * me = (khmer_KLabelHashObject *) self; khmer::LabelHash * labelhash = me->labelhash; - char * seq = NULL; - unsigned int r = NULL; + const char * seq = NULL; + int r = -1; PyObject * break_on_stop_tags_o = NULL; PyObject * stop_big_traversals_o = NULL; @@ -4443,7 +4478,7 @@ static PyObject * labelhash_sweep_tag_neighborhood(PyObject * self, PyObject *ar //Py_END_ALLOW_THREADS PyObject * x = PyList_New(tagged_kmers.size()); - khmer::SeenSet::const_iterator si; + khmer::SeenSet::iterator si; unsigned long long i = 0; for (si=tagged_kmers.begin(); si!=tagged_kmers.end(); ++si) { //std::string kmer_s = _revhash(*si, labelhash->ksize()); @@ -4508,13 +4543,6 @@ static PyMethodDef khmer_labelhash_methods[] = { {NULL, NULL, 0, NULL} /* sentinel */ }; -// still necessary? 
-static PyObject * -khmer_labelhash_getattr(PyObject * obj, char * name) -{ - return Py_FindMethod(khmer_labelhash_methods, obj, name); -} - static PyTypeObject khmer_KLabelHashType = { PyObject_HEAD_INIT(NULL) 0, /* ob_size */ @@ -4949,8 +4977,8 @@ init_khmer(void) // implemented __new__ for Hashbits; keeping factory func around as well // for backwards compat with old scripts khmer_KHashbitsType.tp_new = khmer_hashbits_new; + khmer_KHashbitsType.tp_methods = khmer_hashbits_methods; if (PyType_Ready(&khmer_KHashbitsType) < 0) { - std::cout << "_khmer.KHashbitsType failed PyType_Ready" << std::endl; return; } // add LabelHash @@ -4958,7 +4986,6 @@ init_khmer(void) khmer_KLabelHashType.tp_base = &khmer_KHashbitsType; khmer_KLabelHashType.tp_new = khmer_labelhash_new; if (PyType_Ready(&khmer_KLabelHashType) < 0) { - std::cout << "_khmer.KLabelHashType failed PyType_Ready" << std::endl; return; } diff --git a/lib/subset.cc b/lib/subset.cc index 3d59244099..16b12576b9 100644 --- a/lib/subset.cc +++ b/lib/subset.cc @@ -1406,7 +1406,7 @@ const partition_sizes(cm, n_unassigned); - for (PartitionCountMap::const_iterator cmi = cm.begin(); cmi != cm.end(); + for (PartitionCountMap::iterator cmi = cm.begin(); cmi != cm.end(); cmi++) { d[cmi->second]++; } From ee8e819089a969a40df8e5710b8231444af16dce Mon Sep 17 00:00:00 2001 From: Camille Scott Date: Fri, 10 Jan 2014 18:19:51 -0500 Subject: [PATCH 125/140] fixed referenced count errors in pylist_setitem and pydict_setitem causing incorrect garbage collection --- khmer/_khmermodule.cc | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/khmer/_khmermodule.cc b/khmer/_khmermodule.cc index 50c011b1bb..987d5a73ba 100644 --- a/khmer/_khmermodule.cc +++ b/khmer/_khmermodule.cc @@ -4119,7 +4119,6 @@ static PyObject * subset_partition_sizes(PyObject * self, if (tup != NULL) { PyList_SET_ITEM(x, i, tup); } - Py_XDECREF(tup); i++; } } @@ -4161,7 +4160,6 @@ static PyObject * 
subset_partition_average_coverages(PyObject * self, if (tup != NULL) { PyList_SET_ITEM(x, i, tup); } - Py_XDECREF(tup); } return x; @@ -4279,13 +4277,13 @@ static PyObject * labelhash_get_label_dict(PyObject * self, PyObject * args) { khmer::LabelPtrMap::iterator it; for (it = hb->label_ptrs.begin(); it!=hb->label_ptrs.end(); ++it) { - PyObject * key = Py_BuildValue("K", it->first); - Py_XDECREF(key); + PyObject * key = Py_BuildValue("K", it->first); PyObject * val = Py_BuildValue("K", it->second); - Py_XDECREF(val); if (key != NULL && val != NULL) { - PyDict_SetItem(d, key, val); + PyDict_SetItem(d, key, val); } + Py_XDECREF(key); + Py_XDECREF(val); } return d; @@ -4375,7 +4373,7 @@ static PyObject * labelhash_sweep_label_neighborhood(PyObject * self, PyObject * khmer::LabelHash * hb = me->labelhash; const char * seq = NULL; - int r = -1; + int r = 0; PyObject * break_on_stop_tags_o = NULL; PyObject * stop_big_traversals_o = NULL; @@ -4440,7 +4438,7 @@ static PyObject * labelhash_sweep_tag_neighborhood(PyObject * self, PyObject *ar khmer::LabelHash * labelhash = me->labelhash; const char * seq = NULL; - int r = -1; + int r = 0; PyObject * break_on_stop_tags_o = NULL; PyObject * stop_big_traversals_o = NULL; From ab6760896cb5554bea7b975f8e781df55a96c573 Mon Sep 17 00:00:00 2001 From: "Michael R. Crusoe" Date: Mon, 13 Jan 2014 12:54:55 -0500 Subject: [PATCH 126/140] add some error messages --- khmer/_khmermodule.cc | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/khmer/_khmermodule.cc b/khmer/_khmermodule.cc index 50c011b1bb..266b55dcb3 100644 --- a/khmer/_khmermodule.cc +++ b/khmer/_khmermodule.cc @@ -31,7 +31,6 @@ extern "C" { void init_khmer(); } - // Configure module logging. 
//#define WITH_INTERNAL_TRACING namespace khmer @@ -2315,6 +2314,7 @@ static PyObject* _new_counting_hash(PyObject * self, PyObject * args) std::vector sizes; Py_ssize_t sizes_list_o_length = PyObject_Length(sizes_list_o); if (sizes_list_o_length == -1) { + PyErr_SetString(PyExc_ValueError, "error with hashtable primes!"); return NULL; } for (int i = 0; i < sizes_list_o_length; i++) { @@ -2381,7 +2381,7 @@ static PyTypeObject khmer_KHashbitsType = { 0, /* tp_dictoffset */ (initproc)khmer_hashbits_init, /* tp_init */ 0, /* tp_alloc */ -}; +} ; static PyObject * hashbits_n_unique_kmers(PyObject * self, PyObject * args) { @@ -4400,6 +4400,8 @@ static PyObject * labelhash_sweep_label_neighborhood(PyObject * self, PyObject * } if (strlen(seq) < hb->ksize()) { + PyErr_SetString(PyExc_ValueError, + "string length must >= the hashtable k-mer size"); return NULL; } @@ -4465,6 +4467,8 @@ static PyObject * labelhash_sweep_tag_neighborhood(PyObject * self, PyObject *ar } if (strlen(seq) < labelhash->ksize()) { + PyErr_SetString(PyExc_ValueError, + "string length must >= the hashtable k-mer size"); return NULL; } @@ -4478,6 +4482,9 @@ static PyObject * labelhash_sweep_tag_neighborhood(PyObject * self, PyObject *ar //Py_END_ALLOW_THREADS PyObject * x = PyList_New(tagged_kmers.size()); + if (x == NULL) { + return NULL; + } khmer::SeenSet::iterator si; unsigned long long i = 0; for (si=tagged_kmers.begin(); si!=tagged_kmers.end(); ++si) { From 9a0e71ea13f573cfd84114aab8a956b67cd98e8e Mon Sep 17 00:00:00 2001 From: Camille Scott Date: Mon, 13 Jan 2014 15:26:58 -0500 Subject: [PATCH 127/140] fixed typo and cleaned up old code comments --- khmer/_khmermodule.cc | 11 ----------- scripts/sweep-reads-by-partition-buffered.py | 9 +-------- 2 files changed, 1 insertion(+), 19 deletions(-) diff --git a/khmer/_khmermodule.cc b/khmer/_khmermodule.cc index 6139f9a87f..7f93188adb 100644 --- a/khmer/_khmermodule.cc +++ b/khmer/_khmermodule.cc @@ -4189,17 +4189,6 @@ khmer_subset_getattr(PyObject 
* obj, char * name) typedef struct { //PyObject_HEAD khmer_KHashbitsObject khashbits; - /* @camillescott late night notes: - need to experiment. might be able to call hashbits py methods - directly with the labelhash object, because they all instantiate - a new hashbits pointer on themselves to call the functions and labelhash - inherits from hashbits; or, we define a hashbits object as part of this struct - as called for in the c-api reference. need to grok that still. - If this is how it's done, remove PyObject_HEAD, which will already be included - in the base class struct. - See http://docs.python.org/2.7/extending/newtypes.html#subclassing-other-types - for details... - */ khmer::LabelHash * labelhash; } khmer_KLabelHashObject; diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py index 75996be973..0c621b16a3 100755 --- a/scripts/sweep-reads-by-partition-buffered.py +++ b/scripts/sweep-reads-by-partition-buffered.py @@ -12,7 +12,7 @@ ... This script is very lenient on IO errors, due to the large number of file -operations needed. Thus, errors opening a file for buffer flush or writeing +operations needed. Thus, errors opening a file for buffer flush or writing a read to a file will not crash the program; instead, if there were errors, the user will be warned at the end of execution. Errors with opening read files are also handled -- we move on to the next read file if there is an error opening. 
@@ -54,13 +54,6 @@ def write_seq(fp, name, seq, labels=[]): else: return 0 -# stores reads in memory and flushes them to their appropriate files -# when certain criteria are met -# Basic idea is to buffer some number of reads in memory, then dump them all at once -# Hope that each file acrues, on average, BUFFER_SIZE / NUM_PARTS reads -# ie, if we buffer 1000000 reads, and we have 100000 partitions or labels, -# we should expect the mean buffer size to be 10 reads - class ReadBuffer: From ad71becfb4498c1eb38738a16a1521d1e9afcb17 Mon Sep 17 00:00:00 2001 From: Michael Crusoe Date: Mon, 13 Jan 2014 15:36:24 -0500 Subject: [PATCH 128/140] one more error message --- khmer/_khmermodule.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/khmer/_khmermodule.cc b/khmer/_khmermodule.cc index 7f93188adb..9b2f7856e1 100644 --- a/khmer/_khmermodule.cc +++ b/khmer/_khmermodule.cc @@ -2121,7 +2121,9 @@ static PyObject * hash_find_all_tags_truncate_on_abundance(PyObject * self, PyOb return NULL; } - if (strlen(kmer_s) < counting->ksize()) { // @@ + if (strlen(kmer_s) < counting->ksize()) { + PyErr_SetString(PyExc_ValueError, + "kmer_s must be less than the k-mer size of the counting hash"); return NULL; } From e9afda7020bef5a4882d82182c7517aa5e336888 Mon Sep 17 00:00:00 2001 From: Camille Scott Date: Mon, 13 Jan 2014 15:37:13 -0500 Subject: [PATCH 129/140] added PyErr_Setstring usage to exception in consume_...tag_with_labels --- khmer/_khmermodule.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/khmer/_khmermodule.cc b/khmer/_khmermodule.cc index 7f93188adb..8987569ef0 100644 --- a/khmer/_khmermodule.cc +++ b/khmer/_khmermodule.cc @@ -4332,7 +4332,7 @@ static PyObject * labelhash_consume_partitioned_fasta_and_tag_with_labels( labelhash->consume_partitioned_fasta_and_tag_with_labels(filename, total_reads, n_consumed, _report_fn, callback_obj); } catch (_khmer_signal &e) { - std::cout << "caught exception in 
consume_partitioned_fasta_and_tag_with_labels!" << std::endl; + PyErr_SetString( PyExc_IOError, "error parsing in consume_partitioned_fasta_and_tag_with_labels"); return NULL; } return Py_BuildValue("IK", total_reads, n_consumed); From 9c08a5763b8f7b5ec0206699280f6b319090cb88 Mon Sep 17 00:00:00 2001 From: Camille Scott Date: Mon, 13 Jan 2014 15:37:40 -0500 Subject: [PATCH 130/140] reverted change in setup.cfg --- setup.cfg | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.cfg b/setup.cfg index 51f93526b2..a7c3a5e01a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,8 +1,8 @@ [nosetests] verbosity = 2 stop = TRUE -#attr = !known_failing -attr = !known_failing,!highmem +attr = !known_failing +#attr = !known_failing,!highmem # where highmem > 0.5GiB memory [build_ext] From 4b1c98ab6b69eb29e3c97f20a010c36541e15bc0 Mon Sep 17 00:00:00 2001 From: Camille Scott Date: Mon, 13 Jan 2014 15:37:55 -0500 Subject: [PATCH 131/140] inlined _parse_partition_id --- lib/read_parsers.hh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/read_parsers.hh b/lib/read_parsers.hh index e8ca7e7968..1357e994e8 100644 --- a/lib/read_parsers.hh +++ b/lib/read_parsers.hh @@ -543,7 +543,7 @@ struct FastqParser : public IParser }; -static PartitionID _parse_partition_id(std::string name) +inline PartitionID _parse_partition_id(std::string name) { PartitionID p = 0; const char * s = name.c_str() + name.length() - 1; From 5f385878058f4fbcf1dcb920487753be11b23330 Mon Sep 17 00:00:00 2001 From: Camille Scott Date: Mon, 13 Jan 2014 15:38:21 -0500 Subject: [PATCH 132/140] updated description in comments and argparse to match function --- scripts/sweep-reads-by-partition-buffered.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py index 0c621b16a3..25ecd05c88 100755 --- a/scripts/sweep-reads-by-partition-buffered.py +++ 
b/scripts/sweep-reads-by-partition-buffered.py @@ -8,7 +8,7 @@ """ Find all reads connected to the given contigs on a per-partition basis. -% python scripts/normalize-by-median.py -r -i \ +% sweep-reads-by-partition.py -r -i \ ... This script is very lenient on IO errors, due to the large number of file @@ -146,7 +146,8 @@ def flush_all(self): def main(): - parser = build_construct_args() + parser = build_construct_args('Takes a partitioned reference file and a list of reads, \ + and sorts reads by which partition they connect to') parser.add_argument('-i', '--input_fastp', dest='input_fastp') parser.add_argument( '-r', '--traversal_range', type=int, dest='traversal_range', From 62a6504638799da1dfb4047ec5a28ab13865e1d4 Mon Sep 17 00:00:00 2001 From: Camille Scott Date: Mon, 13 Jan 2014 15:47:18 -0500 Subject: [PATCH 133/140] moved sandboxy script to sandbox, removed explicit file path --- {lib => sandbox}/sweep_perf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename {lib => sandbox}/sweep_perf.py (86%) diff --git a/lib/sweep_perf.py b/sandbox/sweep_perf.py similarity index 86% rename from lib/sweep_perf.py rename to sandbox/sweep_perf.py index 923c2da8bb..4a091f69a4 100755 --- a/lib/sweep_perf.py +++ b/sandbox/sweep_perf.py @@ -1,4 +1,4 @@ -#! /w/khmer_dev/bin/python +#! 
/usr/bin/env python import khmer import screed @@ -8,7 +8,7 @@ R = int(sys.argv[1]) print R K = 20 -test_file = '/w/khmer/tests/test-data/biglump-random-20-a.fa' +test_file = '../tests/test-data/biglump-random-20-a.fa' ht = khmer.new_hashbits(K, 1e9, 4) ht.consume_fasta_and_tag_with_colors(test_file) From 769ae394f49f9fb363b2860c0226e8def0205eed Mon Sep 17 00:00:00 2001 From: Camille Scott Date: Mon, 13 Jan 2014 15:57:15 -0500 Subject: [PATCH 134/140] fixed long lines --- scripts/sweep-reads-by-partition-buffered.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py index 25ecd05c88..c9f7fd7059 100755 --- a/scripts/sweep-reads-by-partition-buffered.py +++ b/scripts/sweep-reads-by-partition-buffered.py @@ -15,8 +15,8 @@ operations needed. Thus, errors opening a file for buffer flush or writing a read to a file will not crash the program; instead, if there were errors, the user will be warned at the end of execution. Errors with opening read files -are also handled -- we move on to the next read file if there is an error opening. - +are also handled -- we move on to the next read file if there is an error +opening. 
""" import screed @@ -41,7 +41,7 @@ def fmt_fasta(name, seq, labels=[]): return '>{name}\t{labels}\n{seq}\n'.format(name=name, - labels='\t'.join([str(l) for l in labels]), seq=seq) + labels='\t'.join([str(l) for l in labels]), seq=seq) def write_seq(fp, name, seq, labels=[]): From c99a4718705fc03752b97af2f36c9612ebc07e2a Mon Sep 17 00:00:00 2001 From: Camille Scott Date: Mon, 13 Jan 2014 16:00:44 -0500 Subject: [PATCH 135/140] fixed more long lines --- scripts/sweep-reads-by-partition-buffered.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py index c9f7fd7059..2ad1c8e537 100755 --- a/scripts/sweep-reads-by-partition-buffered.py +++ b/scripts/sweep-reads-by-partition-buffered.py @@ -219,22 +219,27 @@ def main(): total_t = time.clock() start_t = time.clock() for read_file in input_files: - print >>sys.stderr, '** sweeping {read_file} for labels...'.format(read_file=read_file) + print >>sys.stderr, '** sweeping {read_file} for labels...'.format( + read_file=read_file) file_t = 0.0 try: read_fp = screed.open(read_file) except IOError as e: print >>sys.stderr, '!! 
ERROR: !!', e - print >>sys.stderr, '*** Could not open {fn}, skipping...'.format(fn=read_file) + print >>sys.stderr, '*** Could not open {fn}, skipping...'.format( + fn=read_file) else: for n, record in enumerate(read_fp): if n % 50000 == 0: end_t = time.clock() batch_t = end_t - start_t file_t += batch_t - print >>sys.stderr, '\tswept {n} reads [{nc} labeled, {no} orphaned] \ + print >>sys.stderr, '\tswept {n} reads [{nc} labeled, \ + {no} orphaned] \ ** {sec}s ({sect}s total)' \ - .format(n=n, nc=n_labeled, no=n_orphaned, sec=batch_t, sect=file_t) + .format(n=n, nc=n_labeled, + no=n_orphaned, + sec=batch_t, sect=file_t) start_t = time.clock() seq = record.sequence name = record.name From 7bbe46007137f2b94e45bc3f50a81a37cec0f96c Mon Sep 17 00:00:00 2001 From: Camille Scott Date: Mon, 13 Jan 2014 16:30:06 -0500 Subject: [PATCH 136/140] made fastp non-optional, shortened lines, removed includes --- scripts/sweep-reads-by-partition-buffered.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py index 2ad1c8e537..bc233694cc 100755 --- a/scripts/sweep-reads-by-partition-buffered.py +++ b/scripts/sweep-reads-by-partition-buffered.py @@ -8,7 +8,7 @@ """ Find all reads connected to the given contigs on a per-partition basis. -% sweep-reads-by-partition.py -r -i \ +% sweep-reads-by-partition.py -r \ ... 
This script is very lenient on IO errors, due to the large number of file @@ -22,11 +22,9 @@ import screed import sys import os -import argparse import time import khmer from khmer.counting_args import build_construct_args, DEFAULT_MIN_HASHSIZE -from collections import namedtuple as nt DEFAULT_NUM_BUFFERS = 50000 @@ -146,9 +144,9 @@ def flush_all(self): def main(): - parser = build_construct_args('Takes a partitioned reference file and a list of reads, \ - and sorts reads by which partition they connect to') - parser.add_argument('-i', '--input_fastp', dest='input_fastp') + parser = build_construct_args('Takes a partitioned reference file \ + and a list of reads, and sorts reads \ + by which partition they connect to') parser.add_argument( '-r', '--traversal_range', type=int, dest='traversal_range', default=DEFAULT_RANGE) @@ -160,6 +158,7 @@ def main(): default=DEFAULT_OUT_PREF) parser.add_argument('-m', '--max_buffers', dest='max_buffers', type=int, default=DEFAULT_NUM_BUFFERS) + parser.add_argument(dest='input_fastp') parser.add_argument('input_files', nargs='+') args = parser.parse_args() @@ -205,7 +204,7 @@ def main(): output_buffer = ReadBufferManager( max_buffers, max_reads, buf_size, output_pref, outdir) - # consume the partitioned fasta with which to label the graph + # consume the partitioned fasta with which to label the graph ht = khmer.LabelHash(K, HT_SIZE, N_HT) print >>sys.stderr, 'consuming fastp...' 
ht.consume_partitioned_fasta_and_tag_with_labels(input_fastp) From c35ef384535840e7fffad7f9168ec84b38a6c98e Mon Sep 17 00:00:00 2001 From: Camille Scott Date: Mon, 13 Jan 2014 16:30:32 -0500 Subject: [PATCH 137/140] updated tests to reflect new script option format --- tests/test_scripts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_scripts.py b/tests/test_scripts.py index 8f585888f6..49fac0ad42 100644 --- a/tests/test_scripts.py +++ b/tests/test_scripts.py @@ -1239,7 +1239,7 @@ def test_sweep_reads_by_partition_buffered(): shutil.copyfile(utils.get_test_data('test-sweep-contigs.fp'), contigfile) script = scriptpath('sweep-reads-by-partition-buffered.py') - args = ['-k', '25', '-o', 'test', '-i', + args = ['-k', '25', '-o', 'test', contigfile, readfile, 'junkfile.fa'] status, out, err = runscript(script, args, in_dir) From 69447b6ad00e27a139a121e8b2b241a38a64daa7 Mon Sep 17 00:00:00 2001 From: CS Welcher Date: Mon, 13 Jan 2014 17:09:29 -0500 Subject: [PATCH 138/140] removed deprecated sweep_sequence_for_labels, added test for consume_sequence_and_tag_with_labels --- lib/labelhash.cc | 33 --------------------------------- lib/labelhash.hh | 7 +------ tests/test_labelhash.py | 13 ++++++++++++- 3 files changed, 13 insertions(+), 40 deletions(-) diff --git a/lib/labelhash.cc b/lib/labelhash.cc index ebeb6554b7..be8605fc03 100644 --- a/lib/labelhash.cc +++ b/lib/labelhash.cc @@ -295,39 +295,6 @@ void LabelHash::consume_sequence_and_tag_with_labels(const std::string& seq, if (found_tags) { found_tags->insert(kmer); } } printdbg(done with low-level consume) - } -/* - * Find all labels associated with the sequence - * For now, check /every/ k-mer with find_all_tags - */ -unsigned int LabelHash::sweep_sequence_for_labels(const std::string& seq, - LabelPtrSet& found_labels, - bool break_on_stoptags, - bool stop_big_traversals) { - - SeenSet tagged_kmers; - //LabelPtrSet found_labels; - - HashIntoType kmer_f, kmer_r, kmer; - - KMerIterator 
kmers(seq.c_str(), _ksize); - std::string kmer_s; - // keep a list of kmers which have already been traversed - SeenSet traversed_kmers; - while (!kmers.done()) { - kmer = kmers.next(); - kmer_s = _revhash(kmer, _ksize); - _hash(kmer_s.c_str(), _ksize, kmer_f, kmer_r); - - // don't even try traversing from k-mers not in the hashtable - //traversed_kmers.clear(); - if (get_count(uniqify_rc(kmer_f,kmer_r))) { - partition->find_all_tags(kmer_f, kmer_r, tagged_kmers, - all_tags, break_on_stoptags, stop_big_traversals); - traverse_labels_and_resolve(tagged_kmers, found_labels); - } - } - return traversed_kmers.size(); } unsigned int LabelHash::sweep_label_neighborhood(const std::string& seq, diff --git a/lib/labelhash.hh b/lib/labelhash.hh index 6abaa788b1..cbc7b5aedc 100644 --- a/lib/labelhash.hh +++ b/lib/labelhash.hh @@ -128,12 +128,7 @@ namespace khmer { TagPtrSet get_label_tags(const Label& label); void link_tag_and_label(HashIntoType& kmer, Label& label); - - unsigned int sweep_sequence_for_labels(const std::string& seq, - LabelPtrSet& found_labels, - bool break_on_stoptags, - bool stop_big_traversals); - + unsigned int sweep_label_neighborhood(const std::string & seq, LabelPtrSet& found_labels, unsigned int range, diff --git a/tests/test_labelhash.py b/tests/test_labelhash.py index 51431ccd56..1195224e90 100644 --- a/tests/test_labelhash.py +++ b/tests/test_labelhash.py @@ -87,13 +87,24 @@ def test_consume_partitioned_fasta_and_tag_with_labels(): labels = set() for record in screed.open(filename): seq = record.sequence - labels.update(lb.sweep_label_neighborhood(seq, False, False)) + labels.update(lb.sweep_label_neighborhood(seq, 0, False, False)) # print lb.n_labels() # print labels assert len(labels) == 1 assert labels.pop() == 2L assert lb.n_labels() == 1 +def test_consume_sequence_and_tag_with_labels(): + lb = LabelHash(20, 1e6, 4) + label = 0L + sequence = 'ATGCATCGATCGATCGATCGATCGATCGATCGATCGATCG' + + n_consumed = 
lb.consume_sequence_and_tag_with_labels(sequence, label) + labels = set() + labels.update(lb.sweep_label_neighborhood(sequence)) + + assert label in labels + assert len(labels) == 1 def test_sweep_tag_neighborhood(): lb = LabelHash(20, 1e7, 4) From 037115db77412c14939cb8a603c218b1c0e1ac7d Mon Sep 17 00:00:00 2001 From: CS Welcher Date: Mon, 13 Jan 2014 17:12:19 -0500 Subject: [PATCH 139/140] shortened remaining long lines: --- scripts/sweep-reads-by-partition-buffered.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py index bc233694cc..07b5bd8530 100755 --- a/scripts/sweep-reads-by-partition-buffered.py +++ b/scripts/sweep-reads-by-partition-buffered.py @@ -106,7 +106,8 @@ def flush_buffer(self, buf_id): outfp = open(fpath, 'a') except IOError as e: print >>sys.stderr, '!! ERROR: {e} !!'.format(e=e) - print >>sys.stderr, '*** Failed to open {fn} for buffer flush'.format(fn=fpath) + print >>sys.stderr, '*** Failed to open {fn} for \ + buffer flush'.format(fn=fpath) self.num_file_errors += 1 else: buf = self.buffers[buf_id] @@ -187,7 +188,8 @@ def main(): print >>sys.stderr, '' print >>sys.stderr, \ 'Estimated memory usage is {prod:.2g} bytes \ - (n_hashes x min_hashsize / 8)'.format(prod=args.n_hashes * HT_SIZE / 8) + (n_hashes x min_hashsize / 8)'.format( + prod=args.n_hashes * HT_SIZE / 8) print >>sys.stderr, '-' * 8 traversal_range = args.traversal_range From 3c4158f8174c77cc8b858e34352ae3da59fa4ff3 Mon Sep 17 00:00:00 2001 From: CS Welcher Date: Mon, 13 Jan 2014 17:52:05 -0500 Subject: [PATCH 140/140] greatly expounding on help --- scripts/sweep-reads-by-partition-buffered.py | 25 +++++++++++++++----- 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/scripts/sweep-reads-by-partition-buffered.py b/scripts/sweep-reads-by-partition-buffered.py index 07b5bd8530..456127c3b6 100755 --- a/scripts/sweep-reads-by-partition-buffered.py +++ 
b/scripts/sweep-reads-by-partition-buffered.py @@ -10,6 +10,13 @@ % sweep-reads-by-partition.py -r \ ... +""" + +epilog = """ +Output will be a collection of files corresponding to the partitions; +each partition gets a file (prefixed with the output prefix option), +which means this could output many tens or hundreds of thousands of files. +Users should plan accordingly. This script is very lenient on IO errors, due to the large number of file operations needed. Thus, errors opening a file for buffer flush or writing @@ -148,19 +155,25 @@ def main(): parser = build_construct_args('Takes a partitioned reference file \ and a list of reads, and sorts reads \ by which partition they connect to') + parser.epilog = epilog parser.add_argument( '-r', '--traversal_range', type=int, dest='traversal_range', default=DEFAULT_RANGE) parser.add_argument('-b', '--buffer_size', dest='max_reads', type=int, - default=DEFAULT_MAX_READS) + default=DEFAULT_MAX_READS, + help='Max total reads to buffer before flushing') parser.add_argument('-l', '--buffer_length', dest='buffer_size', type=int, - default=DEFAULT_BUFFER_SIZE) + default=DEFAULT_BUFFER_SIZE, + help='Max length of an individual label buffer \ + before flushing') parser.add_argument('-o', '--output_prefix', dest='output_prefix', - default=DEFAULT_OUT_PREF) + default=DEFAULT_OUT_PREF, + help='Prefix for sorted read files') parser.add_argument('-m', '--max_buffers', dest='max_buffers', type=int, - default=DEFAULT_NUM_BUFFERS) - parser.add_argument(dest='input_fastp') - parser.add_argument('input_files', nargs='+') + default=DEFAULT_NUM_BUFFERS, + help='Max individual label buffers before flushing') + parser.add_argument(dest='input_fastp', help='Partitioned reference fasta') + parser.add_argument('input_files', nargs='+', help='Reads to be swept/sorted') args = parser.parse_args() K = args.ksize