From cea5ad37cada65d6f6892a647f1badb5c653ef33 Mon Sep 17 00:00:00 2001 From: Nick Ingolia Date: Sun, 11 Aug 2019 21:17:08 -0700 Subject: [PATCH 01/31] Started work on a tool to collapse similar barcodes --- src/barcode_assign/collapse.rs | 82 ++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 src/barcode_assign/collapse.rs diff --git a/src/barcode_assign/collapse.rs b/src/barcode_assign/collapse.rs new file mode 100644 index 0000000..c1ebee6 --- /dev/null +++ b/src/barcode_assign/collapse.rs @@ -0,0 +1,82 @@ +use std::cell::RefCell; +use std::collections::HashMap; +use std::rc::Rc; + +pub struct BarcodeEquivMap { + barcode_equivs: HashMap, Rc>>, + max_dist: usize, +} + +impl BarcodeEquivMap { + pub fn new(max_dist: usize) -> Self { + BarcodeEquivMap { + barcode_equivs: HashMap::new(), + max_dist: max_dist, + } + } + + pub fn insert(&mut self, bc: &[u8]) -> () { + unimplemented!(); + } +} + +struct BarcodeEquiv { + barcodes: Vec<(Vec, usize)>, +} + +impl BarcodeEquiv { + fn new() -> Self { + BarcodeEquiv { barcodes: Vec::new() } + } + + fn insert(&mut self, bc_new: &[u8]) -> () { + for &mut (ref bc, ref mut ct) in self.barcodes.iter_mut() { + if bc_new == bc.as_slice() { + *ct += 1; + return; + } + } + + self.barcodes.push((bc_new.to_vec(), 1)); + } +} + +const NTS_LEN: usize = 4; +static NTS: [u8; NTS_LEN] = [b'A', b'C', b'G', b'T']; + +struct Substitutions<'a> { + original: &'a [u8], + position: usize, + nt: usize, +} + +impl <'a> Substitutions<'a> { + pub fn new(original: &'a [u8]) -> Self { + Substitutions { original: original, position: 0, nt: 0 } + } +} + +impl <'a> Iterator for Substitutions<'a> { + type Item = Vec; + + fn next(&mut self) -> Option { + if self.position >= self.original.len() { + return None; + } + + if self.nt >= NTS_LEN { + self.position += 1; + self.nt = 0; + return self.next(); + } + + if self.original[self.position] == NTS[self.nt] { + self.nt += 1; + return self.next(); + } + + let mut variant = self.original.to_vec(); + variant[self.position] = NTS[self.nt]; + return Some(variant); + } +} From bf00f28672914a2e89d341c8aa2122178329c668 Mon Sep 17 00:00:00 2001 From: Nick Ingolia Date: Sun, 11 Aug 2019 21:17:08 -0700 Subject: [PATCH 02/31] Started work on a tool to collapse similar barcodes --- src/barcode_assign/collapse.rs | 82 ++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 src/barcode_assign/collapse.rs diff --git a/src/barcode_assign/collapse.rs b/src/barcode_assign/collapse.rs new file mode 100644 index 0000000..c1ebee6 --- /dev/null +++ b/src/barcode_assign/collapse.rs @@ -0,0 +1,82 @@ +use std::cell::RefCell; +use std::collections::HashMap; +use std::rc::Rc; + +pub struct BarcodeEquivMap { + barcode_equivs: HashMap, Rc>>, + max_dist: usize, +} + +impl BarcodeEquivMap { + pub fn new(max_dist: usize) -> Self { + BarcodeEquivMap { + barcode_equivs: HashMap::new(), + max_dist: max_dist, + } + } + + pub fn insert(&mut self, bc: &[u8]) -> () { + unimplemented!(); + } +} + +struct BarcodeEquiv { + barcodes: Vec<(Vec, usize)>, +} + +impl BarcodeEquiv { + fn new() -> Self { + BarcodeEquiv { barcodes: Vec::new() } + } + + fn insert(&mut self, bc_new: &[u8]) -> () { + for &mut (ref bc, ref mut ct) in self.barcodes.iter_mut() { + if bc_new == bc.as_slice() { + *ct += 1; + return; + } + } + + self.barcodes.push((bc_new.to_vec(), 1)); + } +} + +const NTS_LEN: usize = 4; +static NTS: [u8; NTS_LEN] = [b'A', b'C', b'G', b'T']; + +struct Substitutions<'a> { + original: &'a [u8], + position: usize, + nt: usize, +} + +impl <'a> Substitutions<'a> { + pub fn new(original: &'a [u8]) -> Self { + Substitutions { original: original, position: 0, nt: 0 } + } +} + +impl <'a> Iterator for Substitutions<'a> { + type Item = Vec; + + fn next(&mut self) -> Option { + if self.position >= self.original.len() { + return None; + } + + if self.nt >= NTS_LEN { + self.position += 1; + self.nt = 0; + return self.next(); + } + + if self.original[self.position] == NTS[self.nt] { + self.nt += 1; + return self.next(); + } + + let mut variant = self.original.to_vec(); + variant[self.position] = NTS[self.nt]; + return Some(variant); + } +} From e9bf46a41bbc643a946a4fd28e4057032fa428f2 Mon Sep 17 00:00:00 2001 From: Nick Ingolia Date: Sun, 11 Aug 2019 21:17:08 -0700 Subject: [PATCH 03/31] Started work on a tool to collapse similar barcodes --- src/barcode_assign/collapse.rs | 82 ++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 src/barcode_assign/collapse.rs diff --git a/src/barcode_assign/collapse.rs b/src/barcode_assign/collapse.rs new file mode 100644 index 0000000..c1ebee6 --- /dev/null +++ b/src/barcode_assign/collapse.rs @@ -0,0 +1,82 @@ +use std::cell::RefCell; +use std::collections::HashMap; +use std::rc::Rc; + +pub struct BarcodeEquivMap { + barcode_equivs: HashMap, Rc>>, + max_dist: usize, +} + +impl BarcodeEquivMap { + pub fn new(max_dist: usize) -> Self { + BarcodeEquivMap { + barcode_equivs: HashMap::new(), + max_dist: max_dist, + } + } + + pub fn insert(&mut self, bc: &[u8]) -> () { + unimplemented!(); + } +} + +struct BarcodeEquiv { + barcodes: Vec<(Vec, usize)>, +} + +impl BarcodeEquiv { + fn new() -> Self { + BarcodeEquiv { barcodes: Vec::new() } + } + + fn insert(&mut self, bc_new: &[u8]) -> () { + for &mut (ref bc, ref mut ct) in self.barcodes.iter_mut() { + if bc_new == bc.as_slice() { + *ct += 1; + return; + } + } + + self.barcodes.push((bc_new.to_vec(), 1)); + } +} + +const NTS_LEN: usize = 4; +static NTS: [u8; NTS_LEN] = [b'A', b'C', b'G', b'T']; + +struct Substitutions<'a> { + original: &'a [u8], + position: usize, + nt: usize, +} + +impl <'a> Substitutions<'a> { + pub fn new(original: &'a [u8]) -> Self { + Substitutions { original: original, position: 0, nt: 0 } + } +} + +impl <'a> Iterator for Substitutions<'a> { + type Item = Vec; + + fn next(&mut self) -> Option { + if self.position >= self.original.len() { + return None; + } + + if self.nt >= NTS_LEN { + self.position += 1; + self.nt = 0; + return self.next(); + } + + if self.original[self.position] == NTS[self.nt] { + self.nt += 1; + return self.next(); + } + + let mut variant = self.original.to_vec(); + variant[self.position] = NTS[self.nt]; + return Some(variant); + } +} From 3baba8056c2aa73b74208dcd2942d2661fb4a532 Mon Sep 17 00:00:00 2001 From: Nick Ingolia Date: Sun, 11 Aug 2019 21:58:29 -0700 Subject: [PATCH 04/31] Build module for barcode collapsing --- src/barcode_assign/lib.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/barcode_assign/lib.rs b/src/barcode_assign/lib.rs index b8e7b18..3286ef6 100644 --- a/src/barcode_assign/lib.rs +++ b/src/barcode_assign/lib.rs @@ -9,6 +9,7 @@ pub mod bc_count; pub mod bc_frag; pub mod bc_seqs; pub mod blasr; +pub mod collapse; pub mod depth; pub mod fastq_pair; pub mod flank_match; From 41f6b725cfb17ebea204fd6ae2d3659e1c63c0fa Mon Sep 17 00:00:00 2001 From: Nick Ingolia Date: Mon, 12 Aug 2019 00:20:55 -0700 Subject: [PATCH 05/31] HashMap-based barcode collapsing --- Cargo.toml | 3 + src/barcode_assign/collapse.rs | 105 +++++++++++++++++++++++++++++---- src/bc_collapse.rs | 98 ++++++++++++++++++++++++++++++ 3 files changed, 195 insertions(+), 11 deletions(-) create mode 100644 src/bc_collapse.rs diff --git a/Cargo.toml b/Cargo.toml index b430c3b..651435e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -54,3 +54,6 @@ path = "src/bc-pacbio/bc_pbj.rs" name = "bc-bam-chunk" path = "src/bc-pacbio/bc_bam_chunk.rs" +[[bin]] +name = "bc-collapse" +path = "src/bc_collapse.rs" diff --git a/src/barcode_assign/collapse.rs b/src/barcode_assign/collapse.rs index c1ebee6..7cae9df 100644 --- a/src/barcode_assign/collapse.rs +++ b/src/barcode_assign/collapse.rs @@ -1,32 +1,84 @@ use std::cell::RefCell; use std::collections::HashMap; +use std::io::Write; use std::rc::Rc; -pub struct BarcodeEquivMap { - barcode_equivs: HashMap, Rc>>, - max_dist: usize, +use bio::pattern_matching::myers::Myers; + +// New barcode -- check first for exact match, then scan all existing barcodes +// for near-matches +// Insert later barcodes with links to earlier barcodes? +// Or, create barcode equivalency sets? + +pub struct BarcodeNbhdMap { + barcode_nbhds: HashMap, Rc>>, + max_dist: u8, } -impl BarcodeEquivMap { +impl BarcodeNbhdMap { pub fn new(max_dist: usize) -> Self { - BarcodeEquivMap { - barcode_equivs: HashMap::new(), - max_dist: max_dist, + BarcodeNbhdMap { + barcode_nbhds: HashMap::new(), + max_dist: max_dist as u8, } } pub fn insert(&mut self, bc: &[u8]) -> () { - unimplemented!(); + if self.barcode_nbhds.contains_key(bc) { + self.barcode_nbhds.get(bc).unwrap().borrow_mut().insert(bc); + } else { + // let myers = Myers::::new(bc); + + // for key in self.barcode_nbhds.keys() { + // let (_, dist) = myers.find_best_end(key); + // if dist <= self.max_dist { + // println!("Found {} vs {} at {}", + // String::from_utf8_lossy(bc), + // String::from_utf8_lossy(key), + // dist); + // } + // } + + let mut subst_iter = Substitutions::new(bc).chain(Deletions::new(bc)); + for subst in subst_iter { + if self.barcode_nbhds.contains_key(&subst) { + let nbhd = self.barcode_nbhds.get(&subst).unwrap(); + nbhd.borrow_mut().insert(bc); + self.barcode_nbhds.insert(bc.to_vec(), Rc::clone(nbhd)); + return; + } + } + + let mut nbhd = BarcodeNbhd::new(); + nbhd.insert(bc); + self.barcode_nbhds.insert(bc.to_vec(), Rc::new(RefCell::new(nbhd))); + } + } + + pub fn write_nbhds(&self, mut out: W) -> Result<(), failure::Error> { + for (key, val) in self.barcode_nbhds.iter() { + write!(out, "{}", + String::from_utf8_lossy(key))?; + + for (bc, ct) in val.borrow().barcodes.iter() { + write!(out, "\t{}\t{}", + String::from_utf8_lossy(bc), ct)?; + } + + write!(out, "\n")?; + } + + Ok(()) } } -struct BarcodeEquiv { +struct BarcodeNbhd { barcodes: Vec<(Vec, usize)>, } -impl BarcodeEquiv { +impl BarcodeNbhd { fn new() -> Self { - BarcodeEquiv { barcodes: Vec::new() } + BarcodeNbhd { barcodes: Vec::new() } } fn insert(&mut self, bc_new: &[u8]) -> () { @@ -41,6 +93,9 @@ impl BarcodeEquiv { } } +// Switch to an interface where mutations (acting on a slice buffer) +// are returned to avoid allocation. + const NTS_LEN: usize = 4; static NTS: [u8; NTS_LEN] = [b'A', b'C', b'G', b'T']; @@ -77,6 +132,34 @@ impl <'a> Iterator for Substitutions<'a> { let mut variant = self.original.to_vec(); variant[self.position] = NTS[self.nt]; + self.nt += 1; + return Some(variant); + } +} + +struct Deletions<'a> { + original: &'a [u8], + position: usize, +} + +impl <'a> Deletions<'a> { + pub fn new(original: &'a [u8]) -> Self { + Deletions { original: original, position: 0 } + } +} + +impl <'a> Iterator for Deletions<'a> { + type Item = Vec; + + fn next(&mut self) -> Option { + if self.position >= self.original.len() { + return None; + } + + let mut variant = self.original.to_vec(); + variant.remove(self.position); + self.position += 1; return Some(variant); } } + diff --git a/src/bc_collapse.rs b/src/bc_collapse.rs new file mode 100644 index 0000000..41949ec --- /dev/null +++ b/src/bc_collapse.rs @@ -0,0 +1,98 @@ +extern crate barcode_assign; +extern crate bio; +extern crate clap; +extern crate failure; + +use std::io::Read; + +use bio::io::fastq; +use barcode_assign::collapse::*; +use clap::{App, Arg}; + +fn main() { + let matches = App::new("bc-collapse") + .version("1.0") + .author("Nick Ingolia ") + .about("Collapse barcode sequences") + .arg( + Arg::with_name("fastq") + .short("f") + .long("fastq") + .value_name("BARCODE-FQ") + .help("FastQ file of barcode sequences") + .takes_value(true) + .required(true), + ) + .arg( + Arg::with_name("output") + .short("o") + .long("output") + .value_name("OUTPUT-TXT") + .help("Tab-delimited text file of barcode counts") + .takes_value(true) + .required(true), + ) + .get_matches(); + + let cli = CLI { + barcode_fastq: matches.value_of("fastq").unwrap().to_string(), + output: matches.value_of("output").unwrap().to_string(), + }; + + match cli.run() { + Ok(_) => (), + Err(e) => panic!(e), + } +} + +struct CLI { + barcode_fastq: String, + output: String, +} + +impl CLI { + fn run(&self) -> Result<(), failure::Error> { + let mut nbhds = BarcodeNbhdMap::new(1); + + let reader: Box = if self.barcode_fastq == "-" { + Box::new(std::io::stdin()) + } else { + Box::new(std::fs::File::open(&self.barcode_fastq)?) + }; + let barcode_reader = fastq::Reader::new(reader); + + let mut last_report = std::time::Instant::now(); + let mut last_recno = 0; + + for (recno, rec_res) in barcode_reader.records().enumerate() { + let rec = rec_res?; + + if rec.seq().len() < 24 || rec.seq().len() > 26 { +// println!("Skipping {} because of length", +// String::from_utf8_lossy(rec.seq())); + continue; + } + + nbhds.insert(rec.seq()); + + let last_duration = std::time::Instant::now().duration_since(last_report); + if last_duration.as_secs() > 0 + { + let recs = recno - last_recno; + println!("{} usec, {} records, {} records / second ({} total)", + last_duration.as_micros(), + recs, + 1000000.0 * (recs as f64) / (last_duration.as_micros() as f64), + recno); + last_report = std::time::Instant::now(); + last_recno = recno; + } + + if recno > 10000000 { + break; + } + } + + nbhds.write_nbhds(std::fs::File::create(&self.output)?) + } +} From ec174fcd6a14caf9e500b83f24af4d0495f5a5ee Mon Sep 17 00:00:00 2001 From: Nick Ingolia Date: Mon, 12 Aug 2019 01:05:32 -0700 Subject: [PATCH 06/31] Write neighborhood-focused (vs barcode-focused) output --- src/barcode_assign/collapse.rs | 52 +++++++++++++++++++++++++--------- src/bc_collapse.rs | 4 --- 2 files changed, 39 insertions(+), 17 deletions(-) diff --git a/src/barcode_assign/collapse.rs b/src/barcode_assign/collapse.rs index 7cae9df..dad2e55 100644 --- a/src/barcode_assign/collapse.rs +++ b/src/barcode_assign/collapse.rs @@ -10,7 +10,21 @@ use bio::pattern_matching::myers::Myers; // Insert later barcodes with links to earlier barcodes? // Or, create barcode equivalency sets? +// No need for complexity during counting -- post-process key set in +// map to find neighborhoods. No need to handle insertions, can find +// all edges with substitutions and deletions. + +// 1. Pick a node +// a. remove from key set +// b. initialize work stack with node +// 2. Handle a node from work stack +// a. check key set for all near-neighbors +// i. remove near-neighbor from key set +// ii. push near-neighbor onto work stack +// b. add node to neighborhood + pub struct BarcodeNbhdMap { + nbhds: Vec>>, barcode_nbhds: HashMap, Rc>>, max_dist: u8, } @@ -18,6 +32,7 @@ pub struct BarcodeNbhdMap { impl BarcodeNbhdMap { pub fn new(max_dist: usize) -> Self { BarcodeNbhdMap { + nbhds: Vec::new(), barcode_nbhds: HashMap::new(), max_dist: max_dist as u8, } @@ -27,19 +42,9 @@ impl BarcodeNbhdMap { if self.barcode_nbhds.contains_key(bc) { self.barcode_nbhds.get(bc).unwrap().borrow_mut().insert(bc); } else { - // let myers = Myers::::new(bc); - - // for key in self.barcode_nbhds.keys() { - // let (_, dist) = myers.find_best_end(key); - // if dist <= self.max_dist { - // println!("Found {} vs {} at {}", - // String::from_utf8_lossy(bc), - // String::from_utf8_lossy(key), - // dist); - // } - // } - let mut subst_iter = Substitutions::new(bc).chain(Deletions::new(bc)); + let mut nbhd_found = false; + for subst in subst_iter { if self.barcode_nbhds.contains_key(&subst) { let nbhd = self.barcode_nbhds.get(&subst).unwrap(); @@ -51,11 +56,32 @@ impl BarcodeNbhdMap { let mut nbhd = BarcodeNbhd::new(); nbhd.insert(bc); - self.barcode_nbhds.insert(bc.to_vec(), Rc::new(RefCell::new(nbhd))); + let mut nbhd_rc = Rc::new(RefCell::new(nbhd)); + self.nbhds.push(Rc::clone(&nbhd_rc)); + self.barcode_nbhds.insert(bc.to_vec(), nbhd_rc); } } pub fn write_nbhds(&self, mut out: W) -> Result<(), failure::Error> { + for nbhd in self.nbhds.iter() { + let mut barcodes = nbhd.borrow().barcodes.clone(); + barcodes.sort_by_key(|(_, ct)| -(*ct as isize)); + let (keybc, keyct) = barcodes.first().unwrap(); + let total: usize = barcodes.iter().map(|(_, ct)| *ct).sum(); + write!(out, "{}\t{}\t{}\t{:0.3}", + String::from_utf8_lossy(keybc), + barcodes.len(), total, + (*keyct as f64) / (total as f64))?; + for (bc, ct) in barcodes.iter() { + write!(out, "\t{}\t{}", + String::from_utf8_lossy(bc), ct)?; + } + write!(out, "\n")?; + } + Ok(()) + } + + pub fn write_barcode_nbhds(&self, mut out: W) -> Result<(), failure::Error> { for (key, val) in self.barcode_nbhds.iter() { write!(out, "{}", String::from_utf8_lossy(key))?; diff --git a/src/bc_collapse.rs b/src/bc_collapse.rs index 41949ec..08b1cab 100644 --- a/src/bc_collapse.rs +++ b/src/bc_collapse.rs @@ -87,10 +87,6 @@ impl CLI { last_report = std::time::Instant::now(); last_recno = recno; } - - if recno > 10000000 { - break; - } } nbhds.write_nbhds(std::fs::File::create(&self.output)?) From c476c598d0e5c12ac09ac254048c6b787fdabcc8 Mon Sep 17 00:00:00 2001 From: Nick Ingolia Date: Mon, 12 Aug 2019 08:32:01 -0700 Subject: [PATCH 07/31] Started work on post-tabulation neighborhood discovery --- src/barcode_assign/collapse.rs | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/barcode_assign/collapse.rs b/src/barcode_assign/collapse.rs index dad2e55..d15d8a6 100644 --- a/src/barcode_assign/collapse.rs +++ b/src/barcode_assign/collapse.rs @@ -1,5 +1,5 @@ use std::cell::RefCell; -use std::collections::HashMap; +use std::collections::{HashMap,HashSet}; use std::io::Write; use std::rc::Rc; @@ -23,6 +23,19 @@ use bio::pattern_matching::myers::Myers; // ii. push near-neighbor onto work stack // b. add node to neighborhood +pub fn gather_neighborhoods(mut bc_set: HashSet>) -> Vec>> { + loop { + let work_stack = Vec::new(); + + if let Some(start_ref) = bc_set.iter().next() { + let start = bc_set.take(start_ref).unwrap(); + work_stack.push(start); + } else { + return Vec::new(); + } + } +} + pub struct BarcodeNbhdMap { nbhds: Vec>>, barcode_nbhds: HashMap, Rc>>, From 40a697a60187ac1f9ccc15cd4320da977e5f5b1d Mon Sep 17 00:00:00 2001 From: Nick Ingolia Date: Mon, 12 Aug 2019 08:34:07 -0700 Subject: [PATCH 08/31] Data file of barcodes for counting and collapse test --- data/test-barcodes.fastq | 20000 +++++++++++++++++++++++++++++++++++++ 1 file changed, 20000 insertions(+) create mode 100644 data/test-barcodes.fastq diff --git a/data/test-barcodes.fastq b/data/test-barcodes.fastq new file mode 100644 index 0000000..7e0541f --- /dev/null +++ b/data/test-barcodes.fastq @@ -0,0 +1,20000 @@ +@K00364:136:HYLVFBBXX:7:1101:26829:1279 1:N:0:CGCTCATT+NCTATCCT +TGNGGGCATGTACGCGAGGGAGGGGGATCCTGTAGCCCTANACTTNATNGC ++ +AA#AFJJJJJJJJJJJJJJJJ Date: Mon, 12 Aug 2019 13:28:49 -0700 Subject: [PATCH 09/31] Insertion mutations; post-tabulation neighborhood collection --- src/barcode_assign/collapse.rs | 80 +++++++++++++++++++++++++++++----- 1 file changed, 70 insertions(+), 10 deletions(-) diff --git a/src/barcode_assign/collapse.rs b/src/barcode_assign/collapse.rs index d15d8a6..4157be7 100644 --- a/src/barcode_assign/collapse.rs +++ b/src/barcode_assign/collapse.rs @@ -23,17 +23,41 @@ use bio::pattern_matching::myers::Myers; // ii. push near-neighbor onto work stack // b. add node to neighborhood -pub fn gather_neighborhoods(mut bc_set: HashSet>) -> Vec>> { +pub fn gather_neighborhoods(mut bc_set: HashSet>) -> Vec>> +{ + let mut neighborhoods = Vec::new(); + loop { - let work_stack = Vec::new(); + let mut work_stack = Vec::new(); + let mut neighborhood = Vec::new(); - if let Some(start_ref) = bc_set.iter().next() { - let start = bc_set.take(start_ref).unwrap(); - work_stack.push(start); - } else { - return Vec::new(); + let start_ref = match bc_set.iter().next() { + Some(start_ref) => start_ref, + None => { break; } + }; + + // work_stack.push(bc_set.take(start_ref).unwrap()); + let start = start_ref.to_vec(); + bc_set.take(&start); + work_stack.push(start); + + while work_stack.len() > 0 { + let curr = work_stack.pop().unwrap(); + + let neighbors = Substitutions::new(&curr).chain(Deletions::new(&curr)); + for neighbor in neighbors { + if bc_set.remove(&neighbor) { + work_stack.push(neighbor); + } + } + + neighborhood.push(curr); } + + neighborhoods.push(neighborhood); } + + neighborhoods } pub struct BarcodeNbhdMap { @@ -55,7 +79,7 @@ impl BarcodeNbhdMap { if self.barcode_nbhds.contains_key(bc) { self.barcode_nbhds.get(bc).unwrap().borrow_mut().insert(bc); } else { - let mut subst_iter = Substitutions::new(bc).chain(Deletions::new(bc)); + let mut subst_iter = Substitutions::new(bc).chain(Deletions::new(bc)).chain(Insertions::new(bc)); let mut nbhd_found = false; for subst in subst_iter { @@ -176,6 +200,41 @@ impl <'a> Iterator for Substitutions<'a> { } } +struct Insertions<'a> { + original: &'a [u8], + position: usize, + nt: usize, +} + +impl <'a> Insertions<'a> { + pub fn new(original: &'a [u8]) -> Self { + Insertions { original: original, position: 0, nt: 0 } + } +} + +impl <'a> Iterator for Insertions<'a> { + type Item = Vec; + + fn next(&mut self) -> Option { + if self.position > self.original.len() { + return None; + } + + if self.nt >= NTS_LEN { + self.position += 1; + self.nt = 0; + return self.next(); + } + + let mut variant = Vec::with_capacity(self.original.len() + 1); + variant.extend_from_slice(&self.original[..self.position]); + variant.push(NTS[self.nt]); + variant.extend_from_slice(&self.original[self.position..]); + self.nt += 1; + return Some(variant); + } +} + struct Deletions<'a> { original: &'a [u8], position: usize, @@ -195,8 +254,9 @@ impl <'a> Iterator for Deletions<'a> { return None; } - let mut variant = self.original.to_vec(); - variant.remove(self.position); + let mut variant = Vec::with_capacity(self.original.len() - 1); + variant.extend_from_slice(&self.original[..self.position]); + variant.extend_from_slice(&self.original[(self.position+1)..]); self.position += 1; return Some(variant); } From cd7abbe67592c14724938be1aca4d29cd8c522ba Mon Sep 17 00:00:00 2001 From: Nick Ingolia Date: Mon, 12 Aug 2019 17:01:45 -0700 Subject: [PATCH 10/31] Abstracted out barcode counting into a separate public function --- src/barcode_assign/bc_count.rs | 36 ++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/src/barcode_assign/bc_count.rs b/src/barcode_assign/bc_count.rs index 9024d56..2420a97 100644 --- a/src/barcode_assign/bc_count.rs +++ b/src/barcode_assign/bc_count.rs @@ -19,17 +19,7 @@ pub fn bc_count(config: Config) -> Result<(), failure::Error> { }; let barcode_reader = fastq::Reader::new(reader); - let mut barcode_counts = HashMap::new(); - - let records = barcode_reader.records(); - - for result in records { - let barcode_record = result?; - - let barcode = String::from_utf8(barcode_record.seq().to_vec())?; - let barcode_count = barcode_counts.entry(barcode.to_string()).or_insert(0); - *barcode_count += 1; - } + let barcode_counts = count_barcodes(barcode_reader)?; let writer: Box = if config.out_barcodes == "-" { Box::new(io::stdout()) @@ -46,9 +36,23 @@ pub fn bc_count(config: Config) -> Result<(), failure::Error> { Ok(()) } +pub fn count_barcodes(barcode_reader: fastq::Reader) -> Result, usize>, failure::Error> { + let mut barcode_counts = HashMap::new(); + + for (_recno, rec_res) in barcode_reader.records().enumerate() { + let rec = rec_res?; + + let barcode = rec.seq().to_vec(); + let barcode_count = barcode_counts.entry(barcode).or_insert(0); + *barcode_count += 1; + } + + Ok(barcode_counts) +} + fn write_barcode_table( barcode_out: W, - barcode_counts: &HashMap, + barcode_counts: &HashMap, usize>, ) -> Result<(), failure::Error> where W: std::io::Write, @@ -56,10 +60,8 @@ where let mut bcout = std::io::BufWriter::new(barcode_out); for (barcode, count) in barcode_counts.iter() { - bcout.write(barcode.as_bytes())?; - bcout.write("\t".as_bytes())?; - bcout.write(count.to_string().as_bytes())?; - bcout.write("\n".as_bytes())?; + write!(bcout, "{}\t{}\n", + String::from_utf8_lossy(barcode), count)?; } Ok(()) @@ -67,7 +69,7 @@ where fn write_freq_table( freq_out: W, - barcode_counts: &HashMap, + barcode_counts: &HashMap, usize>, ) -> Result<(), failure::Error> where W: std::io::Write, From 04ae4f9a2c6962132ca3a1088c61238b9912c30d Mon Sep 17 00:00:00 2001 From: Nick Ingolia Date: Mon, 12 Aug 2019 17:02:45 -0700 Subject: [PATCH 11/31] Counting tool with neighborhood collapse --- src/barcode_assign/collapse.rs | 249 ++++++++++++++++----------------- src/bc_collapse.rs | 70 ++------- 2 files changed, 134 insertions(+), 185 deletions(-) diff --git a/src/barcode_assign/collapse.rs b/src/barcode_assign/collapse.rs index 4157be7..0c0a8f3 100644 --- a/src/barcode_assign/collapse.rs +++ b/src/barcode_assign/collapse.rs @@ -1,158 +1,155 @@ -use std::cell::RefCell; -use std::collections::{HashMap,HashSet}; +use std::collections::HashMap; +use std::io::Read; use std::io::Write; -use std::rc::Rc; - -use bio::pattern_matching::myers::Myers; - -// New barcode -- check first for exact match, then scan all existing barcodes -// for near-matches -// Insert later barcodes with links to earlier barcodes? -// Or, create barcode equivalency sets? - -// No need for complexity during counting -- post-process key set in -// map to find neighborhoods. No need to handle insertions, can find -// all edges with substitutions and deletions. - -// 1. Pick a node -// a. remove from key set -// b. initialize work stack with node -// 2. Handle a node from work stack -// a. check key set for all near-neighbors -// i. remove near-neighbor from key set -// ii. push near-neighbor onto work stack -// b. add node to neighborhood - -pub fn gather_neighborhoods(mut bc_set: HashSet>) -> Vec>> -{ - let mut neighborhoods = Vec::new(); - - loop { - let mut work_stack = Vec::new(); - let mut neighborhood = Vec::new(); - - let start_ref = match bc_set.iter().next() { - Some(start_ref) => start_ref, - None => { break; } - }; - - // work_stack.push(bc_set.take(start_ref).unwrap()); - let start = start_ref.to_vec(); - bc_set.take(&start); - work_stack.push(start); - - while work_stack.len() > 0 { - let curr = work_stack.pop().unwrap(); - - let neighbors = Substitutions::new(&curr).chain(Deletions::new(&curr)); - for neighbor in neighbors { - if bc_set.remove(&neighbor) { - work_stack.push(neighbor); - } - } +use std::path::{Path,PathBuf}; - neighborhood.push(curr); - } +use bio::io::fastq; - neighborhoods.push(neighborhood); - } - - neighborhoods -} +use bc_count::count_barcodes; -pub struct BarcodeNbhdMap { - nbhds: Vec>>, - barcode_nbhds: HashMap, Rc>>, - max_dist: u8, +pub struct CLI { + pub fastq_in: String, + pub output_base: String, } -impl BarcodeNbhdMap { - pub fn new(max_dist: usize) -> Self { - BarcodeNbhdMap { - nbhds: Vec::new(), - barcode_nbhds: HashMap::new(), - max_dist: max_dist as u8, - } +impl CLI { + pub fn output_filename(&self, name: &str) -> PathBuf { + let base_ref: &Path = self.output_base.as_ref(); + let mut namebase = base_ref + .file_name() + .map_or(std::ffi::OsString::new(), std::ffi::OsStr::to_os_string); + namebase.push(name); + base_ref.with_file_name(namebase) } - pub fn insert(&mut self, bc: &[u8]) -> () { - if self.barcode_nbhds.contains_key(bc) { - self.barcode_nbhds.get(bc).unwrap().borrow_mut().insert(bc); + pub fn run(&self) -> Result<(), failure::Error> { + let reader: Box = if self.fastq_in == "-" { + Box::new(std::io::stdin()) } else { - let mut subst_iter = Substitutions::new(bc).chain(Deletions::new(bc)).chain(Insertions::new(bc)); - let mut nbhd_found = false; + Box::new(std::fs::File::open(&self.fastq_in)?) + }; + let barcode_reader = fastq::Reader::new(reader); + + let barcode_counts = count_barcodes(barcode_reader)?; + + let mut nbhds = Neighborhood::gather_neighborhoods(barcode_counts); + for nbhd in nbhds.iter_mut() { + nbhd.sort_by_counts(); + } + + let mut barcode_to_nbhd_out = std::fs::File::create(self.output_filename("-barcode-to-nbhd.txt"))?; + let mut nbhd_count_out = std::fs::File::create(self.output_filename("-nbhd-count.txt"))?; + let mut nbhds_out = std::fs::File::create(self.output_filename("-nbhds.txt"))?; + + for nbhd in nbhds.iter() { + let (keybc, keyct) = nbhd.key_barcode(); + let total = nbhd.total(); + write!(nbhd_count_out, "{}\t{}\n", String::from_utf8_lossy(keybc), nbhd.total())?; - for subst in subst_iter { - if self.barcode_nbhds.contains_key(&subst) { - let nbhd = self.barcode_nbhds.get(&subst).unwrap(); - nbhd.borrow_mut().insert(bc); - self.barcode_nbhds.insert(bc.to_vec(), Rc::clone(nbhd)); - return; - } + for (bc, ct) in nbhd.barcodes() { + write!(barcode_to_nbhd_out, "{}\t{}\t{}\t{}\t{:0.3}\n", + String::from_utf8_lossy(bc), + String::from_utf8_lossy(keybc), + ct, total, (*ct as f64) / (total as f64))?; } - let mut nbhd = BarcodeNbhd::new(); - nbhd.insert(bc); - let mut nbhd_rc = Rc::new(RefCell::new(nbhd)); - self.nbhds.push(Rc::clone(&nbhd_rc)); - self.barcode_nbhds.insert(bc.to_vec(), nbhd_rc); - } - } - - pub fn write_nbhds(&self, mut out: W) -> Result<(), failure::Error> { - for nbhd in self.nbhds.iter() { - let mut barcodes = nbhd.borrow().barcodes.clone(); - barcodes.sort_by_key(|(_, ct)| -(*ct as isize)); - let (keybc, keyct) = barcodes.first().unwrap(); - let total: usize = barcodes.iter().map(|(_, ct)| *ct).sum(); - write!(out, "{}\t{}\t{}\t{:0.3}", + write!(nbhds_out, "{}\t{}\t{}\t{:0.3}", String::from_utf8_lossy(keybc), - barcodes.len(), total, + nbhd.len(), total, (*keyct as f64) / (total as f64))?; - for (bc, ct) in barcodes.iter() { - write!(out, "\t{}\t{}", + for (bc, ct) in nbhd.barcodes() { + write!(nbhds_out, "\t{}\t{}", String::from_utf8_lossy(bc), ct)?; } - write!(out, "\n")?; + write!(nbhds_out, "\n")?; } - Ok(()) - } - - pub fn write_barcode_nbhds(&self, mut out: W) -> Result<(), failure::Error> { - for (key, val) in self.barcode_nbhds.iter() { - write!(out, "{}", - String::from_utf8_lossy(key))?; - - for (bc, ct) in val.borrow().barcodes.iter() { - write!(out, "\t{}\t{}", - String::from_utf8_lossy(bc), ct)?; - } - - write!(out, "\n")?; - } - + Ok(()) } } -struct BarcodeNbhd { - barcodes: Vec<(Vec, usize)>, +pub struct Neighborhood { + barcodes: Vec<(Vec, T)> } -impl BarcodeNbhd { +impl Neighborhood { fn new() -> Self { - BarcodeNbhd { barcodes: Vec::new() } + Neighborhood { + barcodes: Vec::new() + } + } + + fn insert(&mut self, barcode: Vec, value: T) -> () { + self.barcodes.push((barcode, value)); + } + + pub fn barcodes(&self) -> impl Iterator, T)> { + self.barcodes.iter() } - fn insert(&mut self, bc_new: &[u8]) -> () { - for &mut (ref bc, ref mut ct) in self.barcodes.iter_mut() { - if bc_new == bc.as_slice() { - *ct += 1; - return; + pub fn len(&self) -> usize { self.barcodes.len() } + + pub fn key_barcode(&self) -> (&[u8], &T) { + let (keybc, keyct) = self.barcodes.first().unwrap(); + (keybc, keyct) + } + + // Collecting a neighborhood: + // 1. Pick a node arbitrarily + // a. remove from key set + // b. initialize work stack with node + // 2. Handle a node from work stack + // a. check key set for all near-neighbors + // i. remove near-neighbor from key set + // ii. push near-neighbor onto work stack + // b. add node to neighborhood + // 3. Repeat handling nodes from work stack until empty + + pub fn gather_neighborhoods(mut bc_map: HashMap, T>) -> Vec> + { + let mut neighborhoods = Vec::new(); + + loop { + let mut work_stack = Vec::new(); + let mut neighborhood = Neighborhood::new(); + + let start_ref = match bc_map.iter().next() { + Some(start_ref) => start_ref, + None => { break; } + }; + + let start = start_ref.0.to_vec(); + let value = bc_map.remove(&start).unwrap(); + work_stack.push((start, value)); + + while work_stack.len() > 0 { + let (curr, curr_value) = work_stack.pop().unwrap(); + + let neighbors = Substitutions::new(&curr).chain(Deletions::new(&curr)).chain(Insertions::new(&curr)); + for neighbor in neighbors { + if bc_map.contains_key(&neighbor) { + let neighbor_value = bc_map.remove(&neighbor).unwrap(); + work_stack.push((neighbor, neighbor_value)); + } + } + + neighborhood.insert(curr, curr_value); } + + neighborhoods.push(neighborhood); } + + neighborhoods + } +} + +impl Neighborhood { + pub fn sort_by_counts(&mut self) -> () { + self.barcodes.sort_by_key(|(_, ct)| -(*ct as isize)); + } - self.barcodes.push((bc_new.to_vec(), 1)); + pub fn total(&self) -> usize { + self.barcodes().map(|(_, ct)| *ct).sum() } } diff --git a/src/bc_collapse.rs b/src/bc_collapse.rs index 08b1cab..77d2738 100644 --- a/src/bc_collapse.rs +++ b/src/bc_collapse.rs @@ -1,42 +1,41 @@ extern crate barcode_assign; extern crate bio; extern crate clap; +#[macro_use] extern crate failure; -use std::io::Read; - -use bio::io::fastq; -use barcode_assign::collapse::*; use clap::{App, Arg}; +use barcode_assign::collapse::CLI; + fn main() { let matches = App::new("bc-collapse") .version("1.0") .author("Nick Ingolia ") .about("Collapse barcode sequences") .arg( - Arg::with_name("fastq") + Arg::with_name("fastq_in") .short("f") .long("fastq") - .value_name("BARCODE-FQ") + .value_name("BARCODE.FASTQ") .help("FastQ file of barcode sequences") .takes_value(true) .required(true), ) .arg( - Arg::with_name("output") + Arg::with_name("output_base") .short("o") - .long("output") - .value_name("OUTPUT-TXT") - .help("Tab-delimited text file of barcode counts") + .long("outbase") + .value_name("OUTPUT_BASE") + .help("Base name for output files") .takes_value(true) .required(true), ) .get_matches(); let cli = CLI { - barcode_fastq: matches.value_of("fastq").unwrap().to_string(), - output: matches.value_of("output").unwrap().to_string(), + fastq_in: matches.value_of("fastq_in").unwrap().to_string(), + output_base: matches.value_of("output_base").unwrap().to_string(), }; match cli.run() { @@ -45,50 +44,3 @@ fn main() { } } -struct CLI { - barcode_fastq: String, - output: String, -} - -impl CLI { - fn run(&self) -> Result<(), failure::Error> { - let mut nbhds = BarcodeNbhdMap::new(1); - - let reader: Box = if self.barcode_fastq == "-" { - Box::new(std::io::stdin()) - } else { - Box::new(std::fs::File::open(&self.barcode_fastq)?) - }; - let barcode_reader = fastq::Reader::new(reader); - - let mut last_report = std::time::Instant::now(); - let mut last_recno = 0; - - for (recno, rec_res) in barcode_reader.records().enumerate() { - let rec = rec_res?; - - if rec.seq().len() < 24 || rec.seq().len() > 26 { -// println!("Skipping {} because of length", -// String::from_utf8_lossy(rec.seq())); - continue; - } - - nbhds.insert(rec.seq()); - - let last_duration = std::time::Instant::now().duration_since(last_report); - if last_duration.as_secs() > 0 - { - let recs = recno - last_recno; - println!("{} usec, {} records, {} records / second ({} total)", - last_duration.as_micros(), - recs, - 1000000.0 * (recs as f64) / (last_duration.as_micros() as f64), - recno); - last_report = std::time::Instant::now(); - last_recno = recno; - } - } - - nbhds.write_nbhds(std::fs::File::create(&self.output)?) - } -} From 49c213adcd349b90d8356f73c6f14a1741a2607d Mon Sep 17 00:00:00 2001 From: Nick Ingolia Date: Mon, 12 Aug 2019 20:13:22 -0700 Subject: [PATCH 12/31] Abstracted functions to write barcode neighborhood tables --- src/barcode_assign/collapse.rs | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/src/barcode_assign/collapse.rs b/src/barcode_assign/collapse.rs index 0c0a8f3..1c6524c 100644 --- a/src/barcode_assign/collapse.rs +++ b/src/barcode_assign/collapse.rs @@ -151,6 +151,39 @@ impl Neighborhood { pub fn total(&self) -> usize { self.barcodes().map(|(_, ct)| *ct).sum() } + + pub fn write_total_counts(&self, out: &mut W) -> Result<(), std::io::Error> { + write!(out, "{}\t{}\n", String::from_utf8_lossy(self.key_barcode().0), self.total()) + } + + pub fn write_barcode_counts(&self, out: &mut W) -> Result<(), std::io::Error> { + let (keybc, keyct) = self.key_barcode(); + let total = self.total(); + + for (bc, ct) in self.barcodes() { + write!(out, "{}\t{}\t{}\t{}\t{:0.3}\n", + String::from_utf8_lossy(bc), + String::from_utf8_lossy(keybc), + ct, total, (*ct as f64) / (total as f64))?; + } + + Ok(()) + } + + pub fn write_nbhd_counts(&self, out: &mut W) -> Result<(), std::io::Error> { + let (keybc, keyct) = self.key_barcode(); + let total = self.total(); + + write!(out, "{}\t{}\t{}\t{:0.3}", + String::from_utf8_lossy(keybc), + self.len(), total, + (*keyct as f64) / (total as f64))?; + for (bc, ct) in self.barcodes() { + write!(out, "\t{}\t{}", + String::from_utf8_lossy(bc), ct)?; + } + write!(out, "\n") + } } // Switch to an interface where mutations (acting on a slice buffer) From 8307d2b333123e5277c6f23c522c2a831b414fb7 Mon Sep 17 00:00:00 2001 From: Nick Ingolia Date: Mon, 12 Aug 2019 20:13:47 -0700 Subject: [PATCH 13/31] Added neighborhood option for barcode counting --- src/barcode_assign/bc_count.rs | 38 ++++++++++++++++++++++++++++++++++ src/bc_count.rs | 9 ++++++++ 2 files changed, 47 insertions(+) diff --git a/src/barcode_assign/bc_count.rs b/src/barcode_assign/bc_count.rs index 2420a97..a63e377 100644 --- a/src/barcode_assign/bc_count.rs +++ b/src/barcode_assign/bc_count.rs @@ -1,14 +1,18 @@ use std::collections::HashMap; use std::fs::File; use std::io::{self, Read, Write}; +use std::path::{Path,PathBuf}; use bio::io::fastq; +use collapse::Neighborhood; + #[derive(Debug)] pub struct Config { pub barcode_fastq: String, pub out_barcodes: String, pub freq_filename: Option, + pub neighborhood: Option, } pub fn bc_count(config: Config) -> Result<(), failure::Error> { @@ -33,6 +37,10 @@ pub fn bc_count(config: Config) -> Result<(), failure::Error> { write_freq_table(freq_writer, &barcode_counts)?; } + if let Some(nbhd_filename) = config.neighborhood { + neighborhood_counts(barcode_counts, &nbhd_filename)?; + } + Ok(()) } @@ -92,3 +100,33 @@ where Ok(()) } + +fn neighborhood_counts(barcode_counts: HashMap, usize>, nbhd_filename: &str) -> Result<(), failure::Error> +{ + let mut barcode_to_nbhd_out = std::fs::File::create(output_filename(nbhd_filename, "-barcode-to-nbhd.txt"))?; + let mut nbhd_count_out = std::fs::File::create(output_filename(nbhd_filename, "-nbhd-count.txt"))?; + let mut nbhds_out = std::fs::File::create(output_filename(nbhd_filename, "-nbhds.txt"))?; + + let mut nbhds = Neighborhood::gather_neighborhoods(barcode_counts); + for nbhd in nbhds.iter_mut() { + nbhd.sort_by_counts(); + } + + for nbhd in nbhds.iter() { + nbhd.write_total_counts(&mut nbhd_count_out)?; + nbhd.write_barcode_counts(&mut barcode_to_nbhd_out)?; + nbhd.write_nbhd_counts(&mut nbhds_out)?; + } + + Ok(()) +} + +fn output_filename(output_base: &str, name: &str) -> PathBuf { + let base_ref: &Path = output_base.as_ref(); + let mut namebase = base_ref + .file_name() + .map_or(std::ffi::OsString::new(), std::ffi::OsStr::to_os_string); + namebase.push(name); + base_ref.with_file_name(namebase) +} + diff --git a/src/bc_count.rs b/src/bc_count.rs index ae03131..f1145b8 100644 --- a/src/bc_count.rs +++ b/src/bc_count.rs @@ -27,12 +27,21 @@ fn main() { .takes_value(true) .required(true), ) + .arg( + Arg::with_name("neighborhood") + .short("n") + .long("neighborhood") + .value_name("NBHD_BASE") + .help("Analyze barcode mutation neighborhoods") + .takes_value(true), + ) .get_matches(); let config = Config { barcode_fastq: matches.value_of("fastq").unwrap().to_string(), out_barcodes: matches.value_of("output").unwrap().to_string(), freq_filename: None, + neighborhood: matches.value_of("neighborhood").map(|s| String::from(s)), }; match bc_count(config) { From 83ab20782522bdd5b2b4172a603546bfb033b6d3 Mon Sep 17 00:00:00 2001 From: Nicholas Ingolia Date: Mon, 12 Aug 2019 20:26:59 -0700 Subject: [PATCH 14/31] Changed collapse to run on tables of barcodes --- src/barcode_assign/collapse.rs | 57 ++++++++++++++++------------------ src/bc_collapse.rs | 12 +++---- 2 files changed, 32 insertions(+), 37 deletions(-) diff --git a/src/barcode_assign/collapse.rs b/src/barcode_assign/collapse.rs index 1c6524c..e4f4942 100644 --- a/src/barcode_assign/collapse.rs +++ b/src/barcode_assign/collapse.rs @@ -1,14 +1,12 @@ use std::collections::HashMap; +use std::io::BufRead; +use std::io::BufReader; use std::io::Read; use std::io::Write; use std::path::{Path,PathBuf}; -use bio::io::fastq; - -use bc_count::count_barcodes; - pub struct CLI { - pub fastq_in: String, + pub input: String, pub output_base: String, } @@ -23,14 +21,14 @@ impl CLI { } pub fn run(&self) -> Result<(), failure::Error> { - let reader: Box = if self.fastq_in == "-" { + let reader: Box = if self.input == "-" { Box::new(std::io::stdin()) } else { - Box::new(std::fs::File::open(&self.fastq_in)?) + Box::new(std::fs::File::open(&self.input)?) }; - let barcode_reader = fastq::Reader::new(reader); + let mut barcode_reader = BufReader::new(reader); - let barcode_counts = count_barcodes(barcode_reader)?; + let barcode_counts = CLI::count_barcodes(&mut barcode_reader)?; let mut nbhds = Neighborhood::gather_neighborhoods(barcode_counts); for nbhd in nbhds.iter_mut() { @@ -40,32 +38,29 @@ impl CLI { let mut barcode_to_nbhd_out = std::fs::File::create(self.output_filename("-barcode-to-nbhd.txt"))?; let mut nbhd_count_out = std::fs::File::create(self.output_filename("-nbhd-count.txt"))?; let mut nbhds_out = std::fs::File::create(self.output_filename("-nbhds.txt"))?; - + for nbhd in nbhds.iter() { - let (keybc, keyct) = nbhd.key_barcode(); - let total = nbhd.total(); - write!(nbhd_count_out, "{}\t{}\n", String::from_utf8_lossy(keybc), nbhd.total())?; - - for (bc, ct) in nbhd.barcodes() { - write!(barcode_to_nbhd_out, "{}\t{}\t{}\t{}\t{:0.3}\n", - String::from_utf8_lossy(bc), - String::from_utf8_lossy(keybc), - ct, total, (*ct as f64) / (total as f64))?; - } - - write!(nbhds_out, "{}\t{}\t{}\t{:0.3}", - String::from_utf8_lossy(keybc), - nbhd.len(), total, - (*keyct as f64) / (total as f64))?; - for (bc, ct) in nbhd.barcodes() { - write!(nbhds_out, "\t{}\t{}", - String::from_utf8_lossy(bc), ct)?; - } - write!(nbhds_out, "\n")?; + nbhd.write_total_counts(&mut nbhd_count_out)?; + nbhd.write_barcode_counts(&mut barcode_to_nbhd_out)?; + nbhd.write_nbhd_counts(&mut nbhds_out)?; } Ok(()) } + + pub fn count_barcodes(barcode_reader: &mut R) -> Result, usize>, failure::Error> { + let mut barcode_counts = HashMap::new(); + + for line_res in barcode_reader.lines() { + let line = line_res?; + + let barcode = line.into_bytes(); + let barcode_count = barcode_counts.entry(barcode).or_insert(0); + *barcode_count += 1; + } + + Ok(barcode_counts) + } } pub struct Neighborhood { @@ -157,7 +152,7 @@ impl Neighborhood { } pub fn write_barcode_counts(&self, out: &mut W) -> Result<(), std::io::Error> { - let (keybc, keyct) = self.key_barcode(); + let (keybc, _keyct) = self.key_barcode(); let total = self.total(); for (bc, ct) in self.barcodes() { diff --git a/src/bc_collapse.rs b/src/bc_collapse.rs index 77d2738..3a11aaf 100644 --- a/src/bc_collapse.rs +++ b/src/bc_collapse.rs @@ -14,11 +14,11 @@ fn main() { .author("Nick Ingolia ") .about("Collapse barcode sequences") .arg( - Arg::with_name("fastq_in") - .short("f") - .long("fastq") - .value_name("BARCODE.FASTQ") - .help("FastQ file of barcode sequences") + Arg::with_name("input") + .short("i") + .long("input") + .value_name("BARCODES.TXT") + .help("Text file of barcode sequences") .takes_value(true) .required(true), ) @@ -34,7 +34,7 @@ fn main() { .get_matches(); let cli = CLI { - fastq_in: matches.value_of("fastq_in").unwrap().to_string(), + input: matches.value_of("input").unwrap().to_string(), output_base: matches.value_of("output_base").unwrap().to_string(), }; From e89d0ee6cebaaab50052abbfa63e72255fb8b07e Mon Sep 17 00:00:00 2001 From: Nick Ingolia Date: Sun, 11 Aug 2019 21:17:08 -0700 Subject: [PATCH 15/31] Started work on a tool to collapse similar barcodes --- src/barcode_assign/collapse.rs | 82 ++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 src/barcode_assign/collapse.rs diff --git a/src/barcode_assign/collapse.rs b/src/barcode_assign/collapse.rs new file mode 100644 index 0000000..c1ebee6 --- /dev/null +++ b/src/barcode_assign/collapse.rs @@ -0,0 +1,82 @@ +use std::cell::RefCell; +use std::collections::HashMap; +use std::rc::Rc; + +pub struct BarcodeEquivMap { + barcode_equivs: HashMap, Rc>>, + max_dist: usize, +} + +impl BarcodeEquivMap { + pub fn new(max_dist: usize) -> Self { + BarcodeEquivMap { + barcode_equivs: HashMap::new(), + max_dist: max_dist, + } + } + + pub fn insert(&mut self, bc: &[u8]) -> () { + unimplemented!(); + } +} + +struct BarcodeEquiv { + barcodes: Vec<(Vec, usize)>, +} + +impl BarcodeEquiv { + fn new() -> Self { + BarcodeEquiv { barcodes: Vec::new() } + } + + fn insert(&mut self, bc_new: &[u8]) -> () { + for &mut (ref bc, ref mut ct) in self.barcodes.iter_mut() { + if bc_new == bc.as_slice() { + *ct += 1; + return; + } + } + + self.barcodes.push((bc_new.to_vec(), 1)); + } +} + +const NTS_LEN: usize = 4; +static NTS: [u8; NTS_LEN] = [b'A', b'C', b'G', b'T']; + +struct Substitutions<'a> { + original: &'a [u8], + position: usize, + nt: usize, +} + +impl <'a> Substitutions<'a> { + pub fn new(original: &'a [u8]) -> Self { + Substitutions { original: original, position: 0, nt: 0 } + } +} + +impl <'a> Iterator for Substitutions<'a> { + type Item = Vec; + + fn next(&mut self) -> Option { + if self.position >= self.original.len() { + return None; + } + + if self.nt >= NTS_LEN { + self.position += 1; + self.nt = 0; + return self.next(); + } + + if self.original[self.position] == NTS[self.nt] { + self.nt += 1; + return self.next(); + } + + let mut variant = self.original.to_vec(); + variant[self.position] = NTS[self.nt]; + return Some(variant); + } +} From b6af64b95c01ee272b40731fae413fc8e337596b Mon Sep 17 00:00:00 2001 From: Nick Ingolia Date: Sun, 11 Aug 2019 21:58:29 -0700 Subject: [PATCH 16/31] Build module for barcode collapsing --- src/barcode_assign/lib.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/barcode_assign/lib.rs b/src/barcode_assign/lib.rs index f5ad99b..eed9ebc 100644 --- a/src/barcode_assign/lib.rs +++ b/src/barcode_assign/lib.rs @@ -10,6 +10,7 @@ pub mod bc_frag; pub mod bc_seqs; pub mod bc_tabulate; pub mod counts; +pub mod collapse; pub mod depth; pub mod fastq_pair; pub mod flank_match; From 4d94be353e7b73e00ea8c4e695b598a2db6eb08e Mon Sep 17 00:00:00 2001 From: Nick Ingolia Date: Mon, 12 Aug 2019 00:20:55 -0700 Subject: [PATCH 17/31] HashMap-based barcode collapsing --- Cargo.toml | 6 +- src/barcode_assign/collapse.rs | 105 +++++++++++++++++++++++++++++---- src/bc_collapse.rs | 98 ++++++++++++++++++++++++++++++ 3 files changed, 197 insertions(+), 12 deletions(-) create mode 100644 src/bc_collapse.rs diff --git a/Cargo.toml b/Cargo.toml index 9faeea8..4e15d17 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -56,4 +56,8 @@ path = "src/bc-pacbio/bc_bam_chunk.rs" [[bin]] name = "bc-tabulate" -path = "src/bc_tabulate.rs" \ No newline at end of file +path = "src/bc_tabulate.rs" + +[[bin]] +name = "bc-collapse" +path = "src/bc_collapse.rs" diff --git a/src/barcode_assign/collapse.rs b/src/barcode_assign/collapse.rs index c1ebee6..7cae9df 100644 --- a/src/barcode_assign/collapse.rs +++ b/src/barcode_assign/collapse.rs @@ -1,32 +1,84 @@ use std::cell::RefCell; use std::collections::HashMap; +use std::io::Write; use std::rc::Rc; -pub struct BarcodeEquivMap { - barcode_equivs: HashMap, Rc>>, - max_dist: usize, +use bio::pattern_matching::myers::Myers; + +// New barcode -- check first for exact match, then scan all existing barcodes +// for near-matches +// Insert later barcodes with links to earlier barcodes? +// Or, create barcode equivalency sets? + +pub struct BarcodeNbhdMap { + barcode_nbhds: HashMap, Rc>>, + max_dist: u8, } -impl BarcodeEquivMap { +impl BarcodeNbhdMap { pub fn new(max_dist: usize) -> Self { - BarcodeEquivMap { - barcode_equivs: HashMap::new(), - max_dist: max_dist, + BarcodeNbhdMap { + barcode_nbhds: HashMap::new(), + max_dist: max_dist as u8, } } pub fn insert(&mut self, bc: &[u8]) -> () { - unimplemented!(); + if self.barcode_nbhds.contains_key(bc) { + self.barcode_nbhds.get(bc).unwrap().borrow_mut().insert(bc); + } else { + // let myers = Myers::::new(bc); + + // for key in self.barcode_nbhds.keys() { + // let (_, dist) = myers.find_best_end(key); + // if dist <= self.max_dist { + // println!("Found {} vs {} at {}", + // String::from_utf8_lossy(bc), + // String::from_utf8_lossy(key), + // dist); + // } + // } + + let mut subst_iter = Substitutions::new(bc).chain(Deletions::new(bc)); + for subst in subst_iter { + if self.barcode_nbhds.contains_key(&subst) { + let nbhd = self.barcode_nbhds.get(&subst).unwrap(); + nbhd.borrow_mut().insert(bc); + self.barcode_nbhds.insert(bc.to_vec(), Rc::clone(nbhd)); + return; + } + } + + let mut nbhd = BarcodeNbhd::new(); + nbhd.insert(bc); + self.barcode_nbhds.insert(bc.to_vec(), Rc::new(RefCell::new(nbhd))); + } + } + + pub fn write_nbhds(&self, mut out: W) -> Result<(), failure::Error> { + for (key, val) in self.barcode_nbhds.iter() { + write!(out, "{}", + String::from_utf8_lossy(key))?; + + for (bc, ct) in val.borrow().barcodes.iter() { + write!(out, "\t{}\t{}", + String::from_utf8_lossy(bc), ct)?; + } + + write!(out, "\n")?; + } + + Ok(()) } } -struct BarcodeEquiv { +struct BarcodeNbhd { barcodes: Vec<(Vec, usize)>, } -impl BarcodeEquiv { +impl BarcodeNbhd { fn new() -> Self { - BarcodeEquiv { barcodes: Vec::new() } + BarcodeNbhd { barcodes: Vec::new() } } fn insert(&mut self, bc_new: &[u8]) -> () { @@ -41,6 +93,9 @@ impl BarcodeEquiv { } } +// Switch to an interface where mutations (acting on a slice buffer) +// are returned to avoid allocation. + const NTS_LEN: usize = 4; static NTS: [u8; NTS_LEN] = [b'A', b'C', b'G', b'T']; @@ -77,6 +132,34 @@ impl <'a> Iterator for Substitutions<'a> { let mut variant = self.original.to_vec(); variant[self.position] = NTS[self.nt]; + self.nt += 1; + return Some(variant); + } +} + +struct Deletions<'a> { + original: &'a [u8], + position: usize, +} + +impl <'a> Deletions<'a> { + pub fn new(original: &'a [u8]) -> Self { + Deletions { original: original, position: 0 } + } +} + +impl <'a> Iterator for Deletions<'a> { + type Item = Vec; + + fn next(&mut self) -> Option { + if self.position >= self.original.len() { + return None; + } + + let mut variant = self.original.to_vec(); + variant.remove(self.position); + self.position += 1; return Some(variant); } } + diff --git a/src/bc_collapse.rs b/src/bc_collapse.rs new file mode 100644 index 0000000..41949ec --- /dev/null +++ b/src/bc_collapse.rs @@ -0,0 +1,98 @@ +extern crate barcode_assign; +extern crate bio; +extern crate clap; +extern crate failure; + +use std::io::Read; + +use bio::io::fastq; +use barcode_assign::collapse::*; +use clap::{App, Arg}; + +fn main() { + let matches = App::new("bc-collapse") + .version("1.0") + .author("Nick Ingolia ") + .about("Collapse barcode sequences") + .arg( + Arg::with_name("fastq") + .short("f") + .long("fastq") + .value_name("BARCODE-FQ") + .help("FastQ file of barcode sequences") + .takes_value(true) + .required(true), + ) + .arg( + Arg::with_name("output") + .short("o") + .long("output") + .value_name("OUTPUT-TXT") + .help("Tab-delimited text file of barcode counts") + .takes_value(true) + .required(true), + ) + .get_matches(); + + let cli = CLI { + barcode_fastq: matches.value_of("fastq").unwrap().to_string(), + output: matches.value_of("output").unwrap().to_string(), + }; + + match cli.run() { + Ok(_) => (), + Err(e) => panic!(e), + } +} + +struct CLI { + barcode_fastq: String, + output: String, +} + +impl CLI { + fn run(&self) -> Result<(), failure::Error> { + let mut nbhds = BarcodeNbhdMap::new(1); + + let reader: Box = if self.barcode_fastq == "-" { + Box::new(std::io::stdin()) + } else { + Box::new(std::fs::File::open(&self.barcode_fastq)?) + }; + let barcode_reader = fastq::Reader::new(reader); + + let mut last_report = std::time::Instant::now(); + let mut last_recno = 0; + + for (recno, rec_res) in barcode_reader.records().enumerate() { + let rec = rec_res?; + + if rec.seq().len() < 24 || rec.seq().len() > 26 { +// println!("Skipping {} because of length", +// String::from_utf8_lossy(rec.seq())); + continue; + } + + nbhds.insert(rec.seq()); + + let last_duration = std::time::Instant::now().duration_since(last_report); + if last_duration.as_secs() > 0 + { + let recs = recno - last_recno; + println!("{} usec, {} records, {} records / second ({} total)", + last_duration.as_micros(), + recs, + 1000000.0 * (recs as f64) / (last_duration.as_micros() as f64), + recno); + last_report = std::time::Instant::now(); + last_recno = recno; + } + + if recno > 10000000 { + break; + } + } + + nbhds.write_nbhds(std::fs::File::create(&self.output)?) + } +} From 1ef582a8332ba1f6445d8c5c69cd0587eae94b73 Mon Sep 17 00:00:00 2001 From: Nick Ingolia Date: Mon, 12 Aug 2019 01:05:32 -0700 Subject: [PATCH 18/31] Write neighborhood-focused (vs barcode-focused) output --- src/barcode_assign/collapse.rs | 52 +++++++++++++++++++++++++--------- src/bc_collapse.rs | 4 --- 2 files changed, 39 insertions(+), 17 deletions(-) diff --git a/src/barcode_assign/collapse.rs b/src/barcode_assign/collapse.rs index 7cae9df..dad2e55 100644 --- a/src/barcode_assign/collapse.rs +++ b/src/barcode_assign/collapse.rs @@ -10,7 +10,21 @@ use bio::pattern_matching::myers::Myers; // Insert later barcodes with links to earlier barcodes? // Or, create barcode equivalency sets? +// No need for complexity during counting -- post-process key set in +// map to find neighborhoods. No need to handle insertions, can find +// all edges with substitutions and deletions. + +// 1. Pick a node +// a. remove from key set +// b. initialize work stack with node +// 2. Handle a node from work stack +// a. check key set for all near-neighbors +// i. remove near-neighbor from key set +// ii. push near-neighbor onto work stack +// b. add node to neighborhood + pub struct BarcodeNbhdMap { + nbhds: Vec>>, barcode_nbhds: HashMap, Rc>>, max_dist: u8, } @@ -18,6 +32,7 @@ pub struct BarcodeNbhdMap { impl BarcodeNbhdMap { pub fn new(max_dist: usize) -> Self { BarcodeNbhdMap { + nbhds: Vec::new(), barcode_nbhds: HashMap::new(), max_dist: max_dist as u8, } @@ -27,19 +42,9 @@ impl BarcodeNbhdMap { if self.barcode_nbhds.contains_key(bc) { self.barcode_nbhds.get(bc).unwrap().borrow_mut().insert(bc); } else { - // let myers = Myers::::new(bc); - - // for key in self.barcode_nbhds.keys() { - // let (_, dist) = myers.find_best_end(key); - // if dist <= self.max_dist { - // println!("Found {} vs {} at {}", - // String::from_utf8_lossy(bc), - // String::from_utf8_lossy(key), - // dist); - // } - // } - let mut subst_iter = Substitutions::new(bc).chain(Deletions::new(bc)); + let mut nbhd_found = false; + for subst in subst_iter { if self.barcode_nbhds.contains_key(&subst) { let nbhd = self.barcode_nbhds.get(&subst).unwrap(); @@ -51,11 +56,32 @@ impl BarcodeNbhdMap { let mut nbhd = BarcodeNbhd::new(); nbhd.insert(bc); - self.barcode_nbhds.insert(bc.to_vec(), Rc::new(RefCell::new(nbhd))); + let mut nbhd_rc = Rc::new(RefCell::new(nbhd)); + self.nbhds.push(Rc::clone(&nbhd_rc)); + self.barcode_nbhds.insert(bc.to_vec(), nbhd_rc); } } pub fn write_nbhds(&self, mut out: W) -> Result<(), failure::Error> { + for nbhd in self.nbhds.iter() { + let mut barcodes = nbhd.borrow().barcodes.clone(); + barcodes.sort_by_key(|(_, ct)| -(*ct as isize)); + let (keybc, keyct) = barcodes.first().unwrap(); + let total: usize = barcodes.iter().map(|(_, ct)| *ct).sum(); + write!(out, "{}\t{}\t{}\t{:0.3}", + String::from_utf8_lossy(keybc), + barcodes.len(), total, + (*keyct as f64) / (total as f64))?; + for (bc, ct) in barcodes.iter() { + write!(out, "\t{}\t{}", + String::from_utf8_lossy(bc), ct)?; + } + write!(out, "\n")?; + } + Ok(()) + } + + pub fn write_barcode_nbhds(&self, mut out: W) -> Result<(), failure::Error> { for (key, val) in self.barcode_nbhds.iter() { write!(out, "{}", String::from_utf8_lossy(key))?; diff --git a/src/bc_collapse.rs b/src/bc_collapse.rs index 41949ec..08b1cab 100644 --- a/src/bc_collapse.rs +++ b/src/bc_collapse.rs @@ -87,10 +87,6 @@ impl CLI { last_report = std::time::Instant::now(); last_recno = recno; } - - if recno > 10000000 { - break; - } } nbhds.write_nbhds(std::fs::File::create(&self.output)?) From 872e55b46cdfb6680c2adee0daea3a554688381b Mon Sep 17 00:00:00 2001 From: Nick Ingolia Date: Mon, 12 Aug 2019 08:32:01 -0700 Subject: [PATCH 19/31] Started work on post-tabulation neighborhood discovery --- src/barcode_assign/collapse.rs | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/barcode_assign/collapse.rs b/src/barcode_assign/collapse.rs index dad2e55..d15d8a6 100644 --- a/src/barcode_assign/collapse.rs +++ b/src/barcode_assign/collapse.rs @@ -1,5 +1,5 @@ use std::cell::RefCell; -use std::collections::HashMap; +use std::collections::{HashMap,HashSet}; use std::io::Write; use std::rc::Rc; @@ -23,6 +23,19 @@ use bio::pattern_matching::myers::Myers; // ii. push near-neighbor onto work stack // b. add node to neighborhood +pub fn gather_neighborhoods(mut bc_set: HashSet>) -> Vec>> { + loop { + let work_stack = Vec::new(); + + if let Some(start_ref) = bc_set.iter().next() { + let start = bc_set.take(start_ref).unwrap(); + work_stack.push(start); + } else { + return Vec::new(); + } + } +} + pub struct BarcodeNbhdMap { nbhds: Vec>>, barcode_nbhds: HashMap, Rc>>, From f43dae31e799987955785506b3d2fb099b81690d Mon Sep 17 00:00:00 2001 From: Nick Ingolia Date: Mon, 12 Aug 2019 08:34:07 -0700 Subject: [PATCH 20/31] Data file of barcodes for counting and collapse test --- data/test-barcodes.fastq | 20000 +++++++++++++++++++++++++++++++++++++ 1 file changed, 20000 insertions(+) create mode 100644 data/test-barcodes.fastq diff --git a/data/test-barcodes.fastq b/data/test-barcodes.fastq new file mode 100644 index 0000000..7e0541f --- /dev/null +++ b/data/test-barcodes.fastq @@ -0,0 +1,20000 @@ +@K00364:136:HYLVFBBXX:7:1101:26829:1279 1:N:0:CGCTCATT+NCTATCCT +TGNGGGCATGTACGCGAGGGAGGGGGATCCTGTAGCCCTANACTTNATNGC ++ +AA#AFJJJJJJJJJJJJJJJJ Date: Mon, 12 Aug 2019 13:28:49 -0700 Subject: [PATCH 21/31] Insertion mutations; post-tabulation neighborhood collection --- src/barcode_assign/collapse.rs | 80 +++++++++++++++++++++++++++++----- 1 file changed, 70 insertions(+), 10 deletions(-) diff --git a/src/barcode_assign/collapse.rs b/src/barcode_assign/collapse.rs index d15d8a6..4157be7 100644 --- a/src/barcode_assign/collapse.rs +++ b/src/barcode_assign/collapse.rs @@ -23,17 +23,41 @@ use bio::pattern_matching::myers::Myers; // ii. push near-neighbor onto work stack // b. add node to neighborhood -pub fn gather_neighborhoods(mut bc_set: HashSet>) -> Vec>> { +pub fn gather_neighborhoods(mut bc_set: HashSet>) -> Vec>> +{ + let mut neighborhoods = Vec::new(); + loop { - let work_stack = Vec::new(); + let mut work_stack = Vec::new(); + let mut neighborhood = Vec::new(); - if let Some(start_ref) = bc_set.iter().next() { - let start = bc_set.take(start_ref).unwrap(); - work_stack.push(start); - } else { - return Vec::new(); + let start_ref = match bc_set.iter().next() { + Some(start_ref) => start_ref, + None => { break; } + }; + + // work_stack.push(bc_set.take(start_ref).unwrap()); + let start = start_ref.to_vec(); + bc_set.take(&start); + work_stack.push(start); + + while work_stack.len() > 0 { + let curr = work_stack.pop().unwrap(); + + let neighbors = Substitutions::new(&curr).chain(Deletions::new(&curr)); + for neighbor in neighbors { + if bc_set.remove(&neighbor) { + work_stack.push(neighbor); + } + } + + neighborhood.push(curr); } + + neighborhoods.push(neighborhood); } + + neighborhoods } pub struct BarcodeNbhdMap { @@ -55,7 +79,7 @@ impl BarcodeNbhdMap { if self.barcode_nbhds.contains_key(bc) { self.barcode_nbhds.get(bc).unwrap().borrow_mut().insert(bc); } else { - let mut subst_iter = Substitutions::new(bc).chain(Deletions::new(bc)); + let mut subst_iter = Substitutions::new(bc).chain(Deletions::new(bc)).chain(Insertions::new(bc)); let mut nbhd_found = false; for subst in subst_iter { @@ -176,6 +200,41 @@ impl <'a> Iterator for Substitutions<'a> { } } +struct Insertions<'a> { + original: &'a [u8], + position: usize, + nt: usize, +} + +impl <'a> Insertions<'a> { + pub fn new(original: &'a [u8]) -> Self { + Insertions { original: original, position: 0, nt: 0 } + } +} + +impl <'a> Iterator for Insertions<'a> { + type Item = Vec; + + fn next(&mut self) -> Option { + if self.position > self.original.len() { + return None; + } + + if self.nt >= NTS_LEN { + self.position += 1; + self.nt = 0; + return self.next(); + } + + let mut variant = Vec::with_capacity(self.original.len() + 1); + variant.extend_from_slice(&self.original[..self.position]); + variant.push(NTS[self.nt]); + variant.extend_from_slice(&self.original[self.position..]); + self.nt += 1; + return Some(variant); + } +} + struct Deletions<'a> { original: &'a [u8], position: usize, @@ -195,8 +254,9 @@ impl <'a> Iterator for Deletions<'a> { return None; } - let mut variant = self.original.to_vec(); - variant.remove(self.position); + let mut variant = Vec::with_capacity(self.original.len() - 1); + variant.extend_from_slice(&self.original[..self.position]); + variant.extend_from_slice(&self.original[(self.position+1)..]); self.position += 1; return Some(variant); } From 9427d500ce011380e3d1788089af54e66aab3449 Mon Sep 17 00:00:00 2001 From: Nick Ingolia Date: Mon, 12 Aug 2019 17:01:45 -0700 Subject: [PATCH 22/31] Abstracted out barcode counting into a separate public function --- src/barcode_assign/bc_count.rs | 36 ++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/src/barcode_assign/bc_count.rs b/src/barcode_assign/bc_count.rs index 9024d56..2420a97 100644 --- a/src/barcode_assign/bc_count.rs +++ b/src/barcode_assign/bc_count.rs @@ -19,17 +19,7 @@ pub fn bc_count(config: Config) -> Result<(), failure::Error> { }; let barcode_reader = fastq::Reader::new(reader); - let mut barcode_counts = HashMap::new(); - - let records = barcode_reader.records(); - - for result in records { - let barcode_record = result?; - - let barcode = String::from_utf8(barcode_record.seq().to_vec())?; - let barcode_count = barcode_counts.entry(barcode.to_string()).or_insert(0); - *barcode_count += 1; - } + let barcode_counts = count_barcodes(barcode_reader)?; let writer: Box = if config.out_barcodes == "-" { Box::new(io::stdout()) @@ -46,9 +36,23 @@ pub fn bc_count(config: Config) -> Result<(), failure::Error> { Ok(()) } +pub fn count_barcodes(barcode_reader: fastq::Reader) -> Result, usize>, failure::Error> { + let mut barcode_counts = HashMap::new(); + + for (_recno, rec_res) in barcode_reader.records().enumerate() { + let rec = rec_res?; + + let barcode = rec.seq().to_vec(); + let barcode_count = barcode_counts.entry(barcode).or_insert(0); + *barcode_count += 1; + } + + Ok(barcode_counts) +} + fn write_barcode_table( barcode_out: W, - barcode_counts: &HashMap, + barcode_counts: &HashMap, usize>, ) -> Result<(), failure::Error> where W: std::io::Write, @@ -56,10 +60,8 @@ where let mut bcout = std::io::BufWriter::new(barcode_out); for (barcode, count) in barcode_counts.iter() { - bcout.write(barcode.as_bytes())?; - bcout.write("\t".as_bytes())?; - bcout.write(count.to_string().as_bytes())?; - bcout.write("\n".as_bytes())?; + write!(bcout, "{}\t{}\n", + String::from_utf8_lossy(barcode), count)?; } Ok(()) @@ -67,7 +69,7 @@ where fn write_freq_table( freq_out: W, - barcode_counts: &HashMap, + barcode_counts: &HashMap, usize>, ) -> Result<(), failure::Error> where W: std::io::Write, From a1676225040b2ea26b1e381aa70a773d520524e3 Mon Sep 17 00:00:00 2001 From: Nick Ingolia Date: Mon, 12 Aug 2019 17:02:45 -0700 Subject: [PATCH 23/31] Counting tool with neighborhood collapse --- src/barcode_assign/collapse.rs | 249 ++++++++++++++++----------------- src/bc_collapse.rs | 70 ++------- 2 files changed, 134 insertions(+), 185 deletions(-) diff --git a/src/barcode_assign/collapse.rs b/src/barcode_assign/collapse.rs index 4157be7..0c0a8f3 100644 --- a/src/barcode_assign/collapse.rs +++ b/src/barcode_assign/collapse.rs @@ -1,158 +1,155 @@ -use std::cell::RefCell; -use std::collections::{HashMap,HashSet}; +use std::collections::HashMap; +use std::io::Read; use std::io::Write; -use std::rc::Rc; - -use bio::pattern_matching::myers::Myers; - -// New barcode -- check first for exact match, then scan all existing barcodes -// for near-matches -// Insert later barcodes with links to earlier barcodes? -// Or, create barcode equivalency sets? - -// No need for complexity during counting -- post-process key set in -// map to find neighborhoods. No need to handle insertions, can find -// all edges with substitutions and deletions. - -// 1. Pick a node -// a. remove from key set -// b. initialize work stack with node -// 2. Handle a node from work stack -// a. check key set for all near-neighbors -// i. remove near-neighbor from key set -// ii. push near-neighbor onto work stack -// b. add node to neighborhood - -pub fn gather_neighborhoods(mut bc_set: HashSet>) -> Vec>> -{ - let mut neighborhoods = Vec::new(); - - loop { - let mut work_stack = Vec::new(); - let mut neighborhood = Vec::new(); - - let start_ref = match bc_set.iter().next() { - Some(start_ref) => start_ref, - None => { break; } - }; - - // work_stack.push(bc_set.take(start_ref).unwrap()); - let start = start_ref.to_vec(); - bc_set.take(&start); - work_stack.push(start); - - while work_stack.len() > 0 { - let curr = work_stack.pop().unwrap(); - - let neighbors = Substitutions::new(&curr).chain(Deletions::new(&curr)); - for neighbor in neighbors { - if bc_set.remove(&neighbor) { - work_stack.push(neighbor); - } - } +use std::path::{Path,PathBuf}; - neighborhood.push(curr); - } +use bio::io::fastq; - neighborhoods.push(neighborhood); - } - - neighborhoods -} +use bc_count::count_barcodes; -pub struct BarcodeNbhdMap { - nbhds: Vec>>, - barcode_nbhds: HashMap, Rc>>, - max_dist: u8, +pub struct CLI { + pub fastq_in: String, + pub output_base: String, } -impl BarcodeNbhdMap { - pub fn new(max_dist: usize) -> Self { - BarcodeNbhdMap { - nbhds: Vec::new(), - barcode_nbhds: HashMap::new(), - max_dist: max_dist as u8, - } +impl CLI { + pub fn output_filename(&self, name: &str) -> PathBuf { + let base_ref: &Path = self.output_base.as_ref(); + let mut namebase = base_ref + .file_name() + .map_or(std::ffi::OsString::new(), std::ffi::OsStr::to_os_string); + namebase.push(name); + base_ref.with_file_name(namebase) } - pub fn insert(&mut self, bc: &[u8]) -> () { - if self.barcode_nbhds.contains_key(bc) { - self.barcode_nbhds.get(bc).unwrap().borrow_mut().insert(bc); + pub fn run(&self) -> Result<(), failure::Error> { + let reader: Box = if self.fastq_in == "-" { + Box::new(std::io::stdin()) } else { - let mut subst_iter = Substitutions::new(bc).chain(Deletions::new(bc)).chain(Insertions::new(bc)); - let mut nbhd_found = false; + Box::new(std::fs::File::open(&self.fastq_in)?) + }; + let barcode_reader = fastq::Reader::new(reader); + + let barcode_counts = count_barcodes(barcode_reader)?; + + let mut nbhds = Neighborhood::gather_neighborhoods(barcode_counts); + for nbhd in nbhds.iter_mut() { + nbhd.sort_by_counts(); + } + + let mut barcode_to_nbhd_out = std::fs::File::create(self.output_filename("-barcode-to-nbhd.txt"))?; + let mut nbhd_count_out = std::fs::File::create(self.output_filename("-nbhd-count.txt"))?; + let mut nbhds_out = std::fs::File::create(self.output_filename("-nbhds.txt"))?; + + for nbhd in nbhds.iter() { + let (keybc, keyct) = nbhd.key_barcode(); + let total = nbhd.total(); + write!(nbhd_count_out, "{}\t{}\n", String::from_utf8_lossy(keybc), nbhd.total())?; - for subst in subst_iter { - if self.barcode_nbhds.contains_key(&subst) { - let nbhd = self.barcode_nbhds.get(&subst).unwrap(); - nbhd.borrow_mut().insert(bc); - self.barcode_nbhds.insert(bc.to_vec(), Rc::clone(nbhd)); - return; - } + for (bc, ct) in nbhd.barcodes() { + write!(barcode_to_nbhd_out, "{}\t{}\t{}\t{}\t{:0.3}\n", + String::from_utf8_lossy(bc), + String::from_utf8_lossy(keybc), + ct, total, (*ct as f64) / (total as f64))?; } - let mut nbhd = BarcodeNbhd::new(); - nbhd.insert(bc); - let mut nbhd_rc = Rc::new(RefCell::new(nbhd)); - self.nbhds.push(Rc::clone(&nbhd_rc)); - self.barcode_nbhds.insert(bc.to_vec(), nbhd_rc); - } - } - - pub fn write_nbhds(&self, mut out: W) -> Result<(), failure::Error> { - for nbhd in self.nbhds.iter() { - let mut barcodes = nbhd.borrow().barcodes.clone(); - barcodes.sort_by_key(|(_, ct)| -(*ct as isize)); - let (keybc, keyct) = barcodes.first().unwrap(); - let total: usize = barcodes.iter().map(|(_, ct)| *ct).sum(); - write!(out, "{}\t{}\t{}\t{:0.3}", + write!(nbhds_out, "{}\t{}\t{}\t{:0.3}", String::from_utf8_lossy(keybc), - barcodes.len(), total, + nbhd.len(), total, (*keyct as f64) / (total as f64))?; - for (bc, ct) in barcodes.iter() { - write!(out, "\t{}\t{}", + for (bc, ct) in nbhd.barcodes() { + write!(nbhds_out, "\t{}\t{}", String::from_utf8_lossy(bc), ct)?; } - write!(out, "\n")?; + write!(nbhds_out, "\n")?; } - Ok(()) - } - - pub fn write_barcode_nbhds(&self, mut out: W) -> Result<(), failure::Error> { - for (key, val) in self.barcode_nbhds.iter() { - write!(out, "{}", - String::from_utf8_lossy(key))?; - - for (bc, ct) in val.borrow().barcodes.iter() { - write!(out, "\t{}\t{}", - String::from_utf8_lossy(bc), ct)?; - } - - write!(out, "\n")?; - } - + Ok(()) } } -struct BarcodeNbhd { - barcodes: Vec<(Vec, usize)>, +pub struct Neighborhood { + barcodes: Vec<(Vec, T)> } -impl BarcodeNbhd { +impl Neighborhood { fn new() -> Self { - BarcodeNbhd { barcodes: Vec::new() } + Neighborhood { + barcodes: Vec::new() + } + } + + fn insert(&mut self, barcode: Vec, value: T) -> () { + self.barcodes.push((barcode, value)); + } + + pub fn barcodes(&self) -> impl Iterator, T)> { + self.barcodes.iter() } - fn insert(&mut self, bc_new: &[u8]) -> () { - for &mut (ref bc, ref mut ct) in self.barcodes.iter_mut() { - if bc_new == bc.as_slice() { - *ct += 1; - return; + pub fn len(&self) -> usize { self.barcodes.len() } + + pub fn key_barcode(&self) -> (&[u8], &T) { + let (keybc, keyct) = self.barcodes.first().unwrap(); + (keybc, keyct) + } + + // Collecting a neighborhood: + // 1. Pick a node arbitrarily + // a. remove from key set + // b. initialize work stack with node + // 2. Handle a node from work stack + // a. check key set for all near-neighbors + // i. remove near-neighbor from key set + // ii. push near-neighbor onto work stack + // b. add node to neighborhood + // 3. Repeat handling nodes from work stack until empty + + pub fn gather_neighborhoods(mut bc_map: HashMap, T>) -> Vec> + { + let mut neighborhoods = Vec::new(); + + loop { + let mut work_stack = Vec::new(); + let mut neighborhood = Neighborhood::new(); + + let start_ref = match bc_map.iter().next() { + Some(start_ref) => start_ref, + None => { break; } + }; + + let start = start_ref.0.to_vec(); + let value = bc_map.remove(&start).unwrap(); + work_stack.push((start, value)); + + while work_stack.len() > 0 { + let (curr, curr_value) = work_stack.pop().unwrap(); + + let neighbors = Substitutions::new(&curr).chain(Deletions::new(&curr)).chain(Insertions::new(&curr)); + for neighbor in neighbors { + if bc_map.contains_key(&neighbor) { + let neighbor_value = bc_map.remove(&neighbor).unwrap(); + work_stack.push((neighbor, neighbor_value)); + } + } + + neighborhood.insert(curr, curr_value); } + + neighborhoods.push(neighborhood); } + + neighborhoods + } +} + +impl Neighborhood { + pub fn sort_by_counts(&mut self) -> () { + self.barcodes.sort_by_key(|(_, ct)| -(*ct as isize)); + } - self.barcodes.push((bc_new.to_vec(), 1)); + pub fn total(&self) -> usize { + self.barcodes().map(|(_, ct)| *ct).sum() } } diff --git a/src/bc_collapse.rs b/src/bc_collapse.rs index 08b1cab..77d2738 100644 --- a/src/bc_collapse.rs +++ b/src/bc_collapse.rs @@ -1,42 +1,41 @@ extern crate barcode_assign; extern crate bio; extern crate clap; +#[macro_use] extern crate failure; -use std::io::Read; - -use bio::io::fastq; -use barcode_assign::collapse::*; use clap::{App, Arg}; +use barcode_assign::collapse::CLI; + fn main() { let matches = App::new("bc-collapse") .version("1.0") .author("Nick Ingolia ") .about("Collapse barcode sequences") .arg( - Arg::with_name("fastq") + Arg::with_name("fastq_in") .short("f") .long("fastq") - .value_name("BARCODE-FQ") + .value_name("BARCODE.FASTQ") .help("FastQ file of barcode sequences") .takes_value(true) .required(true), ) .arg( - Arg::with_name("output") + Arg::with_name("output_base") .short("o") - .long("output") - .value_name("OUTPUT-TXT") - .help("Tab-delimited text file of barcode counts") + .long("outbase") + .value_name("OUTPUT_BASE") + .help("Base name for output files") .takes_value(true) .required(true), ) .get_matches(); let cli = CLI { - barcode_fastq: matches.value_of("fastq").unwrap().to_string(), - output: matches.value_of("output").unwrap().to_string(), + fastq_in: matches.value_of("fastq_in").unwrap().to_string(), + output_base: matches.value_of("output_base").unwrap().to_string(), }; match cli.run() { @@ -45,50 +44,3 @@ fn main() { } } -struct CLI { - barcode_fastq: String, - output: String, -} - -impl CLI { - fn run(&self) -> Result<(), failure::Error> { - let mut nbhds = BarcodeNbhdMap::new(1); - - let reader: Box = if self.barcode_fastq == "-" { - Box::new(std::io::stdin()) - } else { - Box::new(std::fs::File::open(&self.barcode_fastq)?) - }; - let barcode_reader = fastq::Reader::new(reader); - - let mut last_report = std::time::Instant::now(); - let mut last_recno = 0; - - for (recno, rec_res) in barcode_reader.records().enumerate() { - let rec = rec_res?; - - if rec.seq().len() < 24 || rec.seq().len() > 26 { -// println!("Skipping {} because of length", -// String::from_utf8_lossy(rec.seq())); - continue; - } - - nbhds.insert(rec.seq()); - - let last_duration = std::time::Instant::now().duration_since(last_report); - if last_duration.as_secs() > 0 - { - let recs = recno - last_recno; - println!("{} usec, {} records, {} records / second ({} total)", - last_duration.as_micros(), - recs, - 1000000.0 * (recs as f64) / (last_duration.as_micros() as f64), - recno); - last_report = std::time::Instant::now(); - last_recno = recno; - } - } - - nbhds.write_nbhds(std::fs::File::create(&self.output)?) - } -} From 0f4586e653df7b5393a924ae8f45ac76133c9252 Mon Sep 17 00:00:00 2001 From: Nick Ingolia Date: Mon, 12 Aug 2019 20:13:22 -0700 Subject: [PATCH 24/31] Abstracted functions to write barcode neighborhood tables --- src/barcode_assign/collapse.rs | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/src/barcode_assign/collapse.rs b/src/barcode_assign/collapse.rs index 0c0a8f3..1c6524c 100644 --- a/src/barcode_assign/collapse.rs +++ b/src/barcode_assign/collapse.rs @@ -151,6 +151,39 @@ impl Neighborhood { pub fn total(&self) -> usize { self.barcodes().map(|(_, ct)| *ct).sum() } + + pub fn write_total_counts(&self, out: &mut W) -> Result<(), std::io::Error> { + write!(out, "{}\t{}\n", String::from_utf8_lossy(self.key_barcode().0), self.total()) + } + + pub fn write_barcode_counts(&self, out: &mut W) -> Result<(), std::io::Error> { + let (keybc, keyct) = self.key_barcode(); + let total = self.total(); + + for (bc, ct) in self.barcodes() { + write!(out, "{}\t{}\t{}\t{}\t{:0.3}\n", + String::from_utf8_lossy(bc), + String::from_utf8_lossy(keybc), + ct, total, (*ct as f64) / (total as f64))?; + } + + Ok(()) + } + + pub fn write_nbhd_counts(&self, out: &mut W) -> Result<(), std::io::Error> { + let (keybc, keyct) = self.key_barcode(); + let total = self.total(); + + write!(out, "{}\t{}\t{}\t{:0.3}", + String::from_utf8_lossy(keybc), + self.len(), total, + (*keyct as f64) / (total as f64))?; + for (bc, ct) in self.barcodes() { + write!(out, "\t{}\t{}", + String::from_utf8_lossy(bc), ct)?; + } + write!(out, "\n") + } } // Switch to an interface where mutations (acting on a slice buffer) From fea91050c65210852eb335b321a6e8f1335649fe Mon Sep 17 00:00:00 2001 From: Nick Ingolia Date: Mon, 12 Aug 2019 20:13:47 -0700 Subject: [PATCH 25/31] Added neighborhood option for barcode counting --- src/barcode_assign/bc_count.rs | 38 ++++++++++++++++++++++++++++++++++ src/bc_count.rs | 9 ++++++++ 2 files changed, 47 insertions(+) diff --git a/src/barcode_assign/bc_count.rs b/src/barcode_assign/bc_count.rs index 2420a97..a63e377 100644 --- a/src/barcode_assign/bc_count.rs +++ b/src/barcode_assign/bc_count.rs @@ -1,14 +1,18 @@ use std::collections::HashMap; use std::fs::File; use std::io::{self, Read, Write}; +use std::path::{Path,PathBuf}; use bio::io::fastq; +use collapse::Neighborhood; + #[derive(Debug)] pub struct Config { pub barcode_fastq: String, pub out_barcodes: String, pub freq_filename: Option, + pub neighborhood: Option, } pub fn bc_count(config: Config) -> Result<(), failure::Error> { @@ -33,6 +37,10 @@ pub fn bc_count(config: Config) -> Result<(), failure::Error> { write_freq_table(freq_writer, &barcode_counts)?; } + if let Some(nbhd_filename) = config.neighborhood { + neighborhood_counts(barcode_counts, &nbhd_filename)?; + } + Ok(()) } @@ -92,3 +100,33 @@ where Ok(()) } + +fn neighborhood_counts(barcode_counts: HashMap, usize>, nbhd_filename: &str) -> Result<(), failure::Error> +{ + let mut barcode_to_nbhd_out = std::fs::File::create(output_filename(nbhd_filename, "-barcode-to-nbhd.txt"))?; + let mut nbhd_count_out = std::fs::File::create(output_filename(nbhd_filename, "-nbhd-count.txt"))?; + let mut nbhds_out = std::fs::File::create(output_filename(nbhd_filename, "-nbhds.txt"))?; + + let mut nbhds = Neighborhood::gather_neighborhoods(barcode_counts); + for nbhd in nbhds.iter_mut() { + nbhd.sort_by_counts(); + } + + for nbhd in nbhds.iter() { + nbhd.write_total_counts(&mut nbhd_count_out)?; + nbhd.write_barcode_counts(&mut barcode_to_nbhd_out)?; + nbhd.write_nbhd_counts(&mut nbhds_out)?; + } + + Ok(()) +} + +fn output_filename(output_base: &str, name: &str) -> PathBuf { + let base_ref: &Path = output_base.as_ref(); + let mut namebase = base_ref + .file_name() + .map_or(std::ffi::OsString::new(), std::ffi::OsStr::to_os_string); + namebase.push(name); + base_ref.with_file_name(namebase) +} + diff --git a/src/bc_count.rs b/src/bc_count.rs index ae03131..f1145b8 100644 --- a/src/bc_count.rs +++ b/src/bc_count.rs @@ -27,12 +27,21 @@ fn main() { .takes_value(true) .required(true), ) + .arg( + Arg::with_name("neighborhood") + .short("n") + .long("neighborhood") + .value_name("NBHD_BASE") + .help("Analyze barcode mutation neighborhoods") + .takes_value(true), + ) .get_matches(); let config = Config { barcode_fastq: matches.value_of("fastq").unwrap().to_string(), out_barcodes: matches.value_of("output").unwrap().to_string(), freq_filename: None, + neighborhood: matches.value_of("neighborhood").map(|s| String::from(s)), }; match bc_count(config) { From 7b9eb8617300718f67a346f79458ae888c240638 Mon Sep 17 00:00:00 2001 From: Nicholas Ingolia Date: Mon, 12 Aug 2019 20:26:59 -0700 Subject: [PATCH 26/31] Changed collapse to run on tables of barcodes --- src/barcode_assign/collapse.rs | 57 ++++++++++++++++------------------ src/bc_collapse.rs | 12 +++---- 2 files changed, 32 insertions(+), 37 deletions(-) diff --git a/src/barcode_assign/collapse.rs b/src/barcode_assign/collapse.rs index 1c6524c..e4f4942 100644 --- a/src/barcode_assign/collapse.rs +++ b/src/barcode_assign/collapse.rs @@ -1,14 +1,12 @@ use std::collections::HashMap; +use std::io::BufRead; +use std::io::BufReader; use std::io::Read; use std::io::Write; use std::path::{Path,PathBuf}; -use bio::io::fastq; - -use bc_count::count_barcodes; - pub struct CLI { - pub fastq_in: String, + pub input: String, pub output_base: String, } @@ -23,14 +21,14 @@ impl CLI { } pub fn run(&self) -> Result<(), failure::Error> { - let reader: Box = if self.fastq_in == "-" { + let reader: Box = if self.input == "-" { Box::new(std::io::stdin()) } else { - Box::new(std::fs::File::open(&self.fastq_in)?) + Box::new(std::fs::File::open(&self.input)?) }; - let barcode_reader = fastq::Reader::new(reader); + let mut barcode_reader = BufReader::new(reader); - let barcode_counts = count_barcodes(barcode_reader)?; + let barcode_counts = CLI::count_barcodes(&mut barcode_reader)?; let mut nbhds = Neighborhood::gather_neighborhoods(barcode_counts); for nbhd in nbhds.iter_mut() { @@ -40,32 +38,29 @@ impl CLI { let mut barcode_to_nbhd_out = std::fs::File::create(self.output_filename("-barcode-to-nbhd.txt"))?; let mut nbhd_count_out = std::fs::File::create(self.output_filename("-nbhd-count.txt"))?; let mut nbhds_out = std::fs::File::create(self.output_filename("-nbhds.txt"))?; - + for nbhd in nbhds.iter() { - let (keybc, keyct) = nbhd.key_barcode(); - let total = nbhd.total(); - write!(nbhd_count_out, "{}\t{}\n", String::from_utf8_lossy(keybc), nbhd.total())?; - - for (bc, ct) in nbhd.barcodes() { - write!(barcode_to_nbhd_out, "{}\t{}\t{}\t{}\t{:0.3}\n", - String::from_utf8_lossy(bc), - String::from_utf8_lossy(keybc), - ct, total, (*ct as f64) / (total as f64))?; - } - - write!(nbhds_out, "{}\t{}\t{}\t{:0.3}", - String::from_utf8_lossy(keybc), - nbhd.len(), total, - (*keyct as f64) / (total as f64))?; - for (bc, ct) in nbhd.barcodes() { - write!(nbhds_out, "\t{}\t{}", - String::from_utf8_lossy(bc), ct)?; - } - write!(nbhds_out, "\n")?; + nbhd.write_total_counts(&mut nbhd_count_out)?; + nbhd.write_barcode_counts(&mut barcode_to_nbhd_out)?; + nbhd.write_nbhd_counts(&mut nbhds_out)?; } Ok(()) } + + pub fn count_barcodes(barcode_reader: &mut R) -> Result, usize>, failure::Error> { + let mut barcode_counts = HashMap::new(); + + for line_res in barcode_reader.lines() { + let line = line_res?; + + let barcode = line.into_bytes(); + let barcode_count = barcode_counts.entry(barcode).or_insert(0); + *barcode_count += 1; + } + + Ok(barcode_counts) + } } pub struct Neighborhood { @@ -157,7 +152,7 @@ impl Neighborhood { } pub fn write_barcode_counts(&self, out: &mut W) -> Result<(), std::io::Error> { - let (keybc, keyct) = self.key_barcode(); + let (keybc, _keyct) = self.key_barcode(); let total = self.total(); for (bc, ct) in self.barcodes() { diff --git a/src/bc_collapse.rs b/src/bc_collapse.rs index 77d2738..3a11aaf 100644 --- a/src/bc_collapse.rs +++ b/src/bc_collapse.rs @@ -14,11 +14,11 @@ fn main() { .author("Nick Ingolia ") .about("Collapse barcode sequences") .arg( - Arg::with_name("fastq_in") - .short("f") - .long("fastq") - .value_name("BARCODE.FASTQ") - .help("FastQ file of barcode sequences") + Arg::with_name("input") + .short("i") + .long("input") + .value_name("BARCODES.TXT") + .help("Text file of barcode sequences") .takes_value(true) .required(true), ) @@ -34,7 +34,7 @@ fn main() { .get_matches(); let cli = CLI { - fastq_in: matches.value_of("fastq_in").unwrap().to_string(), + input: matches.value_of("input").unwrap().to_string(), output_base: matches.value_of("output_base").unwrap().to_string(), }; From 99c65fba45c04eccfd2c408356813ee22652cd5f Mon Sep 17 00:00:00 2001 From: Nick Ingolia Date: Fri, 16 Aug 2019 15:08:54 -0700 Subject: [PATCH 27/31] Changed to use Vec barcodes for fast indexing. Converted bc_count to use SampleCount structure --- src/barcode_assign/bc_count.rs | 81 ++++------------------ src/barcode_assign/bc_tabulate.rs | 6 +- src/barcode_assign/counts.rs | 110 ++++++++++++++++++++++++++++-- 3 files changed, 120 insertions(+), 77 deletions(-) diff --git a/src/barcode_assign/bc_count.rs b/src/barcode_assign/bc_count.rs index a63e377..3e61c73 100644 --- a/src/barcode_assign/bc_count.rs +++ b/src/barcode_assign/bc_count.rs @@ -6,6 +6,7 @@ use std::path::{Path,PathBuf}; use bio::io::fastq; use collapse::Neighborhood; +use counts::*; #[derive(Debug)] pub struct Config { @@ -23,91 +24,35 @@ pub fn bc_count(config: Config) -> Result<(), failure::Error> { }; let barcode_reader = fastq::Reader::new(reader); - let barcode_counts = count_barcodes(barcode_reader)?; - + let barcode_counts_res: Result + = barcode_reader.records().collect(); + let barcode_counts = barcode_counts_res?; + let writer: Box = if config.out_barcodes == "-" { Box::new(io::stdout()) } else { Box::new(File::create(&config.out_barcodes)?) }; - write_barcode_table(writer, &barcode_counts)?; - + barcode_counts.write(writer)?; + if let Some(freq_filename) = config.freq_filename { - let freq_writer = File::create(freq_filename)?; - write_freq_table(freq_writer, &barcode_counts)?; + barcode_counts.write_freq_table(File::create(freq_filename)?)?; } - if let Some(nbhd_filename) = config.neighborhood { - neighborhood_counts(barcode_counts, &nbhd_filename)?; - } - - Ok(()) -} - -pub fn count_barcodes(barcode_reader: fastq::Reader) -> Result, usize>, failure::Error> { - let mut barcode_counts = HashMap::new(); + // if let Some(nbhd_filename) = config.neighborhood { + // neighborhood_counts(barcode_counts, &nbhd_filename)?; + // } - for (_recno, rec_res) in barcode_reader.records().enumerate() { - let rec = rec_res?; - - let barcode = rec.seq().to_vec(); - let barcode_count = barcode_counts.entry(barcode).or_insert(0); - *barcode_count += 1; - } - - Ok(barcode_counts) -} - -fn write_barcode_table( - barcode_out: W, - barcode_counts: &HashMap, usize>, -) -> Result<(), failure::Error> -where - W: std::io::Write, -{ - let mut bcout = std::io::BufWriter::new(barcode_out); - - for (barcode, count) in barcode_counts.iter() { - write!(bcout, "{}\t{}\n", - String::from_utf8_lossy(barcode), count)?; - } - - Ok(()) -} - -fn write_freq_table( - freq_out: W, - barcode_counts: &HashMap, usize>, -) -> Result<(), failure::Error> -where - W: std::io::Write, -{ - let mut fout = std::io::BufWriter::new(freq_out); - - let mut freq_counts = HashMap::new(); - - for freq in barcode_counts.values() { - let freq_count = freq_counts.entry(freq).or_insert(0); - *freq_count += 1; - } - - let mut freqs: Vec = freq_counts.keys().map(|&&k| k).collect(); - freqs.sort(); - - for freq in freqs { - write!(fout, "{}\t{}\n", freq, freq_counts.get(&freq).unwrap_or(&0))?; - } - Ok(()) } -fn neighborhood_counts(barcode_counts: HashMap, usize>, nbhd_filename: &str) -> Result<(), failure::Error> +fn neighborhood_counts(barcode_counts: SampleCounts, nbhd_filename: &str) -> Result<(), failure::Error> { let mut barcode_to_nbhd_out = std::fs::File::create(output_filename(nbhd_filename, "-barcode-to-nbhd.txt"))?; let mut nbhd_count_out = std::fs::File::create(output_filename(nbhd_filename, "-nbhd-count.txt"))?; let mut nbhds_out = std::fs::File::create(output_filename(nbhd_filename, "-nbhds.txt"))?; - let mut nbhds = Neighborhood::gather_neighborhoods(barcode_counts); + let mut nbhds = Neighborhood::gather_neighborhoods(barcode_counts.count_map()); for nbhd in nbhds.iter_mut() { nbhd.sort_by_counts(); } diff --git a/src/barcode_assign/bc_tabulate.rs b/src/barcode_assign/bc_tabulate.rs index a2642f8..c4ed328 100644 --- a/src/barcode_assign/bc_tabulate.rs +++ b/src/barcode_assign/bc_tabulate.rs @@ -23,7 +23,7 @@ impl CLI { } let total_counts = SampleCounts::total_counts(counts.iter().map(|(_input, counts)| counts)); - let mut barcodes: Vec<(String, usize)> = total_counts.into_iter().collect(); + let mut barcodes: Vec<(Vec, usize)> = total_counts.into_iter().collect(); barcodes.sort_by_key(|(_barcode, counts)| -(*counts as isize)); let mut out = std::fs::File::create(&self.output)?; @@ -45,9 +45,9 @@ impl CLI { ); if self.is_omitted(&count_vec) { - write!(omit_out, "{}\n", barcode)?; + write!(omit_out, "{}\n", String::from_utf8_lossy(barcode))?; } else { - write!(out, "{}", barcode)?; + write!(out, "{}", String::from_utf8_lossy(barcode))?; for ct in count_vec.iter() { write!(out, "\t{}", ct)?; } diff --git a/src/barcode_assign/counts.rs b/src/barcode_assign/counts.rs index 348be22..eab0b95 100644 --- a/src/barcode_assign/counts.rs +++ b/src/barcode_assign/counts.rs @@ -3,13 +3,19 @@ use std::fmt::Display; use std::io::BufRead; use std::io::BufReader; use std::io::Read; +use std::io::Write; +use std::iter::FromIterator; use std::path::Path; +use bio::io::fasta; +use bio::io::fastq; use failure; -pub struct SampleCounts(HashMap); +pub struct SampleCounts(HashMap, usize>); impl SampleCounts { + pub fn count_map(self) -> HashMap, usize> { self.0 } + pub fn from_file + Display>(filename: P) -> Result { Self::read(std::fs::File::open(filename.as_ref())?) .map_err(|e| format_err!("Reading file {}: {}", filename, e)) @@ -46,16 +52,46 @@ impl SampleCounts { ); } - counts.insert(barcode.to_string(), count); + counts.insert(barcode.as_bytes().to_vec(), count); } Ok(SampleCounts(counts)) } + pub fn write(&self, barcode_out: W) -> Result<(), failure::Error> { + let mut bcout = std::io::BufWriter::new(barcode_out); + + for (barcode, count) in self.0.iter() { + write!(bcout, "{}\t{}\n", String::from_utf8_lossy(barcode), count)?; + } + + Ok(()) + } + + pub fn write_freq_table(&self, freq_out: W) -> Result<(), failure::Error> { + let mut fout = std::io::BufWriter::new(freq_out); + + let mut freq_counts = HashMap::new(); + + for freq in self.0.values() { + let freq_count = freq_counts.entry(freq).or_insert(0); + *freq_count += 1; + } + + let mut freqs: Vec = freq_counts.keys().map(|&&k| k).collect(); + freqs.sort(); + + for freq in freqs { + write!(fout, "{}\t{}\n", freq, freq_counts.get(&freq).unwrap_or(&0))?; + } + + Ok(()) + } + pub fn total_counts<'a, I: Iterator>(counts_iter: I) -> Self { let mut total_counts = HashMap::new(); for SampleCounts(count_map) in counts_iter { for (barcode, count) in count_map.iter() { - let barcode_count = total_counts.entry(barcode.to_string()).or_insert(0); + let barcode_count = total_counts.entry(barcode.to_vec()).or_insert(0); *barcode_count += *count; } } @@ -64,7 +100,7 @@ impl SampleCounts { pub fn barcode_count_vec<'a, I: Iterator>( counts_iter: I, - barcode: &'a str, + barcode: &'a [u8], ) -> Vec { counts_iter .map(|SampleCounts(count_table)| count_table.get(barcode).map_or(0, |ct| *ct)) @@ -73,10 +109,72 @@ impl SampleCounts { } impl IntoIterator for SampleCounts { - type Item = (String, usize); - type IntoIter = ::std::collections::hash_map::IntoIter; + type Item = (Vec, usize); + type IntoIter = ::std::collections::hash_map::IntoIter, usize>; fn into_iter(self) -> Self::IntoIter { self.0.into_iter() } } + +impl <'a> FromIterator<&'a str> for SampleCounts { + fn from_iter(iter: I) -> SampleCounts + where I: IntoIterator + { + let mut barcode_counts = HashMap::new(); + + for bc in iter { + if barcode_counts.contains_key(bc.as_bytes()) { + *(barcode_counts.get_mut(bc.as_bytes()).unwrap()) += 1; + } else { + barcode_counts.insert(bc.as_bytes().to_vec(), 1); + } + } + + SampleCounts(barcode_counts) + } +} + +impl <'a> FromIterator<&'a [u8]> for SampleCounts { + fn from_iter(iter: I) -> SampleCounts + where I: IntoIterator + { + let mut barcode_counts = HashMap::new(); + + for bc in iter { + if barcode_counts.contains_key(bc) { + *(barcode_counts.get_mut(bc).unwrap()) += 1; + } else { + barcode_counts.insert(bc.to_vec(), 1); + } + } + + SampleCounts(barcode_counts) + } +} + +impl <'a> FromIterator<&'a fastq::Record> for SampleCounts { + fn from_iter(iter: I) -> SampleCounts + where I: IntoIterator + { + iter.into_iter().map(fastq::Record::seq).collect() + } +} + +impl FromIterator for SampleCounts { + fn from_iter(iter: I) -> SampleCounts + where I: IntoIterator + { + let mut barcode_counts = HashMap::new(); + + for rec in iter { + if barcode_counts.contains_key(rec.seq()) { + *(barcode_counts.get_mut(rec.seq()).unwrap()) += 1; + } else { + barcode_counts.insert(rec.seq().to_vec(), 1); + } + } + + SampleCounts(barcode_counts) + } +} From 7e3d90bb2ef754dc37a7de659b969692df503f4d Mon Sep 17 00:00:00 2001 From: Nick Ingolia Date: Fri, 16 Aug 2019 15:17:18 -0700 Subject: [PATCH 28/31] Split out barcode neighborhood and collapse CLI. --- .../{collapse.rs => bc_collapse.rs} | 0 src/barcode_assign/bc_count.rs | 11 +- src/barcode_assign/counts.rs | 26 ++ src/barcode_assign/lib.rs | 3 +- src/barcode_assign/neighborhood.rs | 227 ++++++++++++++++++ src/bc_collapse.rs | 3 +- 6 files changed, 261 insertions(+), 9 deletions(-) rename src/barcode_assign/{collapse.rs => bc_collapse.rs} (100%) create mode 100644 src/barcode_assign/neighborhood.rs diff --git a/src/barcode_assign/collapse.rs b/src/barcode_assign/bc_collapse.rs similarity index 100% rename from src/barcode_assign/collapse.rs rename to src/barcode_assign/bc_collapse.rs diff --git a/src/barcode_assign/bc_count.rs b/src/barcode_assign/bc_count.rs index 3e61c73..b597a64 100644 --- a/src/barcode_assign/bc_count.rs +++ b/src/barcode_assign/bc_count.rs @@ -1,12 +1,11 @@ -use std::collections::HashMap; use std::fs::File; use std::io::{self, Read, Write}; use std::path::{Path,PathBuf}; use bio::io::fastq; -use collapse::Neighborhood; -use counts::*; +use neighborhood::Neighborhood; +use counts::SampleCounts; #[derive(Debug)] pub struct Config { @@ -39,9 +38,9 @@ pub fn bc_count(config: Config) -> Result<(), failure::Error> { barcode_counts.write_freq_table(File::create(freq_filename)?)?; } - // if let Some(nbhd_filename) = config.neighborhood { - // neighborhood_counts(barcode_counts, &nbhd_filename)?; - // } + if let Some(nbhd_filename) = config.neighborhood { + neighborhood_counts(barcode_counts, &nbhd_filename)?; + } Ok(()) } diff --git a/src/barcode_assign/counts.rs b/src/barcode_assign/counts.rs index eab0b95..afd5d34 100644 --- a/src/barcode_assign/counts.rs +++ b/src/barcode_assign/counts.rs @@ -178,3 +178,29 @@ impl FromIterator for SampleCounts { SampleCounts(barcode_counts) } } + +impl <'a> FromIterator<&'a fasta::Record> for SampleCounts { + fn from_iter(iter: I) -> SampleCounts + where I: IntoIterator + { + iter.into_iter().map(fasta::Record::seq).collect() + } +} + +impl FromIterator for SampleCounts { + fn from_iter(iter: I) -> SampleCounts + where I: IntoIterator + { + let mut barcode_counts = HashMap::new(); + + for rec in iter { + if barcode_counts.contains_key(rec.seq()) { + *(barcode_counts.get_mut(rec.seq()).unwrap()) += 1; + } else { + barcode_counts.insert(rec.seq().to_vec(), 1); + } + } + + SampleCounts(barcode_counts) + } +} diff --git a/src/barcode_assign/lib.rs b/src/barcode_assign/lib.rs index eed9ebc..648f472 100644 --- a/src/barcode_assign/lib.rs +++ b/src/barcode_assign/lib.rs @@ -5,16 +5,17 @@ extern crate rust_htslib; pub mod assign; pub mod barcode_group; +pub mod bc_collapse; pub mod bc_count; pub mod bc_frag; pub mod bc_seqs; pub mod bc_tabulate; pub mod counts; -pub mod collapse; pub mod depth; pub mod fastq_pair; pub mod flank_match; pub mod frag_purity; +pub mod neighborhood; pub mod pacbio_join; pub mod pacbio_reads; pub mod purity; diff --git a/src/barcode_assign/neighborhood.rs b/src/barcode_assign/neighborhood.rs new file mode 100644 index 0000000..5e00e5b --- /dev/null +++ b/src/barcode_assign/neighborhood.rs @@ -0,0 +1,227 @@ +use std::collections::HashMap; +use std::io::Write; + +pub struct Neighborhood { + barcodes: Vec<(Vec, T)> +} + +impl Neighborhood { + fn new() -> Self { + Neighborhood { + barcodes: Vec::new() + } + } + + fn insert(&mut self, barcode: Vec, value: T) -> () { + self.barcodes.push((barcode, value)); + } + + pub fn barcodes(&self) -> impl Iterator, T)> { + self.barcodes.iter() + } + + pub fn len(&self) -> usize { self.barcodes.len() } + + pub fn key_barcode(&self) -> (&[u8], &T) { + let (keybc, keyct) = self.barcodes.first().unwrap(); + (keybc, keyct) + } + + // Collecting a neighborhood: + // 1. Pick a node arbitrarily + // a. remove from key set + // b. initialize work stack with node + // 2. Handle a node from work stack + // a. check key set for all near-neighbors + // i. remove near-neighbor from key set + // ii. push near-neighbor onto work stack + // b. add node to neighborhood + // 3. Repeat handling nodes from work stack until empty + + pub fn gather_neighborhoods(mut bc_map: HashMap, T>) -> Vec> + { + let mut neighborhoods = Vec::new(); + + loop { + let mut work_stack = Vec::new(); + let mut neighborhood = Neighborhood::new(); + + let start_ref = match bc_map.iter().next() { + Some(start_ref) => start_ref, + None => { break; } + }; + + let start = start_ref.0.to_vec(); + let value = bc_map.remove(&start).unwrap(); + work_stack.push((start, value)); + + while work_stack.len() > 0 { + let (curr, curr_value) = work_stack.pop().unwrap(); + + let neighbors = Substitutions::new(&curr).chain(Deletions::new(&curr)).chain(Insertions::new(&curr)); + for neighbor in neighbors { + if bc_map.contains_key(&neighbor) { + let neighbor_value = bc_map.remove(&neighbor).unwrap(); + work_stack.push((neighbor, neighbor_value)); + } + } + + neighborhood.insert(curr, curr_value); + } + + neighborhoods.push(neighborhood); + } + + neighborhoods + } +} + +impl Neighborhood { + pub fn sort_by_counts(&mut self) -> () { + self.barcodes.sort_by_key(|(_, ct)| -(*ct as isize)); + } + + pub fn total(&self) -> usize { + self.barcodes().map(|(_, ct)| *ct).sum() + } + + pub fn write_total_counts(&self, out: &mut W) -> Result<(), std::io::Error> { + write!(out, "{}\t{}\n", String::from_utf8_lossy(self.key_barcode().0), self.total()) + } + + pub fn write_barcode_counts(&self, out: &mut W) -> Result<(), std::io::Error> { + let (keybc, _keyct) = self.key_barcode(); + let total = self.total(); + + for (bc, ct) in self.barcodes() { + write!(out, "{}\t{}\t{}\t{}\t{:0.3}\n", + String::from_utf8_lossy(bc), + String::from_utf8_lossy(keybc), + ct, total, (*ct as f64) / (total as f64))?; + } + + Ok(()) + } + + pub fn write_nbhd_counts(&self, out: &mut W) -> Result<(), std::io::Error> { + let (keybc, keyct) = self.key_barcode(); + let total = self.total(); + + write!(out, "{}\t{}\t{}\t{:0.3}", + String::from_utf8_lossy(keybc), + self.len(), total, + (*keyct as f64) / (total as f64))?; + for (bc, ct) in self.barcodes() { + write!(out, "\t{}\t{}", + String::from_utf8_lossy(bc), ct)?; + } + write!(out, "\n") + } +} + +// Switch to an interface where mutations (acting on a slice buffer) +// are returned to avoid allocation. + +const NTS_LEN: usize = 4; +static NTS: [u8; NTS_LEN] = [b'A', b'C', b'G', b'T']; + +struct Substitutions<'a> { + original: &'a [u8], + position: usize, + nt: usize, +} + +impl <'a> Substitutions<'a> { + pub fn new(original: &'a [u8]) -> Self { + Substitutions { original: original, position: 0, nt: 0 } + } +} + +impl <'a> Iterator for Substitutions<'a> { + type Item = Vec; + + fn next(&mut self) -> Option { + if self.position >= self.original.len() { + return None; + } + + if self.nt >= NTS_LEN { + self.position += 1; + self.nt = 0; + return self.next(); + } + + if self.original[self.position] == NTS[self.nt] { + self.nt += 1; + return self.next(); + } + + let mut variant = self.original.to_vec(); + variant[self.position] = NTS[self.nt]; + self.nt += 1; + return Some(variant); + } +} + +struct Insertions<'a> { + original: &'a [u8], + position: usize, + nt: usize, +} + +impl <'a> Insertions<'a> { + pub fn new(original: &'a [u8]) -> Self { + Insertions { original: original, position: 0, nt: 0 } + } +} + +impl <'a> Iterator for Insertions<'a> { + type Item = Vec; + + fn next(&mut self) -> Option { + if self.position > self.original.len() { + return None; + } + + if self.nt >= NTS_LEN { + self.position += 1; + self.nt = 0; + return self.next(); + } + + let mut variant = Vec::with_capacity(self.original.len() + 1); + variant.extend_from_slice(&self.original[..self.position]); + variant.push(NTS[self.nt]); + variant.extend_from_slice(&self.original[self.position..]); + self.nt += 1; + return Some(variant); + } +} + +struct Deletions<'a> { + original: &'a [u8], + position: usize, +} + +impl <'a> Deletions<'a> { + pub fn new(original: &'a [u8]) -> Self { + Deletions { original: original, position: 0 } + } +} + +impl <'a> Iterator for Deletions<'a> { + type Item = Vec; + + fn next(&mut self) -> Option { + if self.position >= self.original.len() { + return None; + } + + let mut variant = Vec::with_capacity(self.original.len() - 1); + variant.extend_from_slice(&self.original[..self.position]); + variant.extend_from_slice(&self.original[(self.position+1)..]); + self.position += 1; + return Some(variant); + } +} + diff --git a/src/bc_collapse.rs b/src/bc_collapse.rs index 3a11aaf..31b1677 100644 --- a/src/bc_collapse.rs +++ b/src/bc_collapse.rs @@ -1,12 +1,11 @@ extern crate barcode_assign; extern crate bio; extern crate clap; -#[macro_use] extern crate failure; use clap::{App, Arg}; -use barcode_assign::collapse::CLI; +use barcode_assign::bc_collapse::CLI; fn main() { let matches = App::new("bc-collapse") From 8c1d798b3f5a75a51889e58cf9c9d37627f1ecfc Mon Sep 17 00:00:00 2001 From: Nick Ingolia Date: Fri, 16 Aug 2019 15:38:17 -0700 Subject: [PATCH 29/31] Headers for barcode assignment output --- src/barcode_assign/bc_count.rs | 3 +++ src/barcode_assign/neighborhood.rs | 4 ++++ src/bc_count.rs | 2 +- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/barcode_assign/bc_count.rs b/src/barcode_assign/bc_count.rs index b597a64..03e10ed 100644 --- a/src/barcode_assign/bc_count.rs +++ b/src/barcode_assign/bc_count.rs @@ -55,6 +55,9 @@ fn neighborhood_counts(barcode_counts: SampleCounts, nbhd_filename: &str) -> Res for nbhd in nbhds.iter_mut() { nbhd.sort_by_counts(); } + + writeln!(barcode_to_nbhd_out, "{}", Neighborhood::barcode_counts_header())?; + writeln!(nbhds_out, "{}", Neighborhood::nbhd_counts_header())?; for nbhd in nbhds.iter() { nbhd.write_total_counts(&mut nbhd_count_out)?; diff --git a/src/barcode_assign/neighborhood.rs b/src/barcode_assign/neighborhood.rs index 5e00e5b..94ceed7 100644 --- a/src/barcode_assign/neighborhood.rs +++ b/src/barcode_assign/neighborhood.rs @@ -88,6 +88,8 @@ impl Neighborhood { pub fn write_total_counts(&self, out: &mut W) -> Result<(), std::io::Error> { write!(out, "{}\t{}\n", String::from_utf8_lossy(self.key_barcode().0), self.total()) } + + pub fn barcode_counts_header() -> &'static str { "barcode\tneighborhood\tcount\ttotal\tfraction" } pub fn write_barcode_counts(&self, out: &mut W) -> Result<(), std::io::Error> { let (keybc, _keyct) = self.key_barcode(); @@ -103,6 +105,8 @@ impl Neighborhood { Ok(()) } + pub fn nbhd_counts_header() -> &'static str { "neighborhood\tnum_barcodes\ttotal\tfract_nbhd" } + pub fn write_nbhd_counts(&self, out: &mut W) -> Result<(), std::io::Error> { let (keybc, keyct) = self.key_barcode(); let total = self.total(); diff --git a/src/bc_count.rs b/src/bc_count.rs index f1145b8..12a98b6 100644 --- a/src/bc_count.rs +++ b/src/bc_count.rs @@ -6,7 +6,7 @@ use clap::{App, Arg}; fn main() { let matches = App::new("bc-count") - .version("1.0") + .version("1.1") .author("Nick Ingolia ") .about("Count barcode sequences") .arg( From 7073ce4f5ac78ccc9a1f82014dcee14e04812acc Mon Sep 17 00:00:00 2001 From: Nick Ingolia Date: Fri, 16 Aug 2019 15:39:09 -0700 Subject: [PATCH 30/31] Updated for separate neighborhood module --- src/barcode_assign/bc_collapse.rs | 227 +----------------------------- 1 file changed, 2 insertions(+), 225 deletions(-) diff --git a/src/barcode_assign/bc_collapse.rs b/src/barcode_assign/bc_collapse.rs index e4f4942..d416914 100644 --- a/src/barcode_assign/bc_collapse.rs +++ b/src/barcode_assign/bc_collapse.rs @@ -2,9 +2,10 @@ use std::collections::HashMap; use std::io::BufRead; use std::io::BufReader; use std::io::Read; -use std::io::Write; use std::path::{Path,PathBuf}; +use neighborhood::Neighborhood; + pub struct CLI { pub input: String, pub output_base: String, @@ -63,227 +64,3 @@ impl CLI { } } -pub struct Neighborhood { - barcodes: Vec<(Vec, T)> -} - -impl Neighborhood { - fn new() -> Self { - Neighborhood { - barcodes: Vec::new() - } - } - - fn insert(&mut self, barcode: Vec, value: T) -> () { - self.barcodes.push((barcode, value)); - } - - pub fn barcodes(&self) -> impl Iterator, T)> { - self.barcodes.iter() - } - - pub fn len(&self) -> usize { self.barcodes.len() } - - pub fn key_barcode(&self) -> (&[u8], &T) { - let (keybc, keyct) = self.barcodes.first().unwrap(); - (keybc, keyct) - } - - // Collecting a neighborhood: - // 1. Pick a node arbitrarily - // a. remove from key set - // b. initialize work stack with node - // 2. Handle a node from work stack - // a. check key set for all near-neighbors - // i. remove near-neighbor from key set - // ii. push near-neighbor onto work stack - // b. add node to neighborhood - // 3. Repeat handling nodes from work stack until empty - - pub fn gather_neighborhoods(mut bc_map: HashMap, T>) -> Vec> - { - let mut neighborhoods = Vec::new(); - - loop { - let mut work_stack = Vec::new(); - let mut neighborhood = Neighborhood::new(); - - let start_ref = match bc_map.iter().next() { - Some(start_ref) => start_ref, - None => { break; } - }; - - let start = start_ref.0.to_vec(); - let value = bc_map.remove(&start).unwrap(); - work_stack.push((start, value)); - - while work_stack.len() > 0 { - let (curr, curr_value) = work_stack.pop().unwrap(); - - let neighbors = Substitutions::new(&curr).chain(Deletions::new(&curr)).chain(Insertions::new(&curr)); - for neighbor in neighbors { - if bc_map.contains_key(&neighbor) { - let neighbor_value = bc_map.remove(&neighbor).unwrap(); - work_stack.push((neighbor, neighbor_value)); - } - } - - neighborhood.insert(curr, curr_value); - } - - neighborhoods.push(neighborhood); - } - - neighborhoods - } -} - -impl Neighborhood { - pub fn sort_by_counts(&mut self) -> () { - self.barcodes.sort_by_key(|(_, ct)| -(*ct as isize)); - } - - pub fn total(&self) -> usize { - self.barcodes().map(|(_, ct)| *ct).sum() - } - - pub fn write_total_counts(&self, out: &mut W) -> Result<(), std::io::Error> { - write!(out, "{}\t{}\n", String::from_utf8_lossy(self.key_barcode().0), self.total()) - } - - pub fn write_barcode_counts(&self, out: &mut W) -> Result<(), std::io::Error> { - let (keybc, _keyct) = self.key_barcode(); - let total = self.total(); - - for (bc, ct) in self.barcodes() { - write!(out, "{}\t{}\t{}\t{}\t{:0.3}\n", - String::from_utf8_lossy(bc), - String::from_utf8_lossy(keybc), - ct, total, (*ct as f64) / (total as f64))?; - } - - Ok(()) - } - - pub fn write_nbhd_counts(&self, out: &mut W) -> Result<(), std::io::Error> { - let (keybc, keyct) = self.key_barcode(); - let total = self.total(); - - write!(out, "{}\t{}\t{}\t{:0.3}", - String::from_utf8_lossy(keybc), - self.len(), total, - (*keyct as f64) / (total as f64))?; - for (bc, ct) in self.barcodes() { - write!(out, "\t{}\t{}", - String::from_utf8_lossy(bc), ct)?; - } - write!(out, "\n") - } -} - -// Switch to an interface where mutations (acting on a slice buffer) -// are returned to avoid allocation. - -const NTS_LEN: usize = 4; -static NTS: [u8; NTS_LEN] = [b'A', b'C', b'G', b'T']; - -struct Substitutions<'a> { - original: &'a [u8], - position: usize, - nt: usize, -} - -impl <'a> Substitutions<'a> { - pub fn new(original: &'a [u8]) -> Self { - Substitutions { original: original, position: 0, nt: 0 } - } -} - -impl <'a> Iterator for Substitutions<'a> { - type Item = Vec; - - fn next(&mut self) -> Option { - if self.position >= self.original.len() { - return None; - } - - if self.nt >= NTS_LEN { - self.position += 1; - self.nt = 0; - return self.next(); - } - - if self.original[self.position] == NTS[self.nt] { - self.nt += 1; - return self.next(); - } - - let mut variant = self.original.to_vec(); - variant[self.position] = NTS[self.nt]; - self.nt += 1; - return Some(variant); - } -} - -struct Insertions<'a> { - original: &'a [u8], - position: usize, - nt: usize, -} - -impl <'a> Insertions<'a> { - pub fn new(original: &'a [u8]) -> Self { - Insertions { original: original, position: 0, nt: 0 } - } -} - -impl <'a> Iterator for Insertions<'a> { - type Item = Vec; - - fn next(&mut self) -> Option { - if self.position > self.original.len() { - return None; - } - - if self.nt >= NTS_LEN { - self.position += 1; - self.nt = 0; - return self.next(); - } - - let mut variant = Vec::with_capacity(self.original.len() + 1); - variant.extend_from_slice(&self.original[..self.position]); - variant.push(NTS[self.nt]); - variant.extend_from_slice(&self.original[self.position..]); - self.nt += 1; - return Some(variant); - } -} - -struct Deletions<'a> { - original: &'a [u8], - position: usize, -} - -impl <'a> Deletions<'a> { - pub fn new(original: &'a [u8]) -> Self { - Deletions { original: original, position: 0 } - } -} - -impl <'a> Iterator for Deletions<'a> { - type Item = Vec; - - fn next(&mut self) -> Option { - if self.position >= self.original.len() { - return None; - } - - let mut variant = Vec::with_capacity(self.original.len() - 1); - variant.extend_from_slice(&self.original[..self.position]); - variant.extend_from_slice(&self.original[(self.position+1)..]); - self.position += 1; - return Some(variant); - } -} - From ce830d00509c4074c35539a1476c8acfcc0ccecf Mon Sep 17 00:00:00 2001 From: Nick Ingolia Date: Fri, 16 Aug 2019 15:52:34 -0700 Subject: [PATCH 31/31] Change to Sum interface for totalling up SampleCounts --- src/barcode_assign/bc_tabulate.rs | 2 +- src/barcode_assign/counts.rs | 55 ++++++++++++++++++++++++------- 2 files changed, 45 insertions(+), 12 deletions(-) diff --git a/src/barcode_assign/bc_tabulate.rs b/src/barcode_assign/bc_tabulate.rs index c4ed328..cb983d6 100644 --- a/src/barcode_assign/bc_tabulate.rs +++ b/src/barcode_assign/bc_tabulate.rs @@ -22,7 +22,7 @@ impl CLI { counts.push((input.to_string(), input_counts)); } - let total_counts = SampleCounts::total_counts(counts.iter().map(|(_input, counts)| counts)); + let total_counts: SampleCounts = counts.iter().map(|(_input, counts)| counts).sum(); let mut barcodes: Vec<(Vec, usize)> = total_counts.into_iter().collect(); barcodes.sort_by_key(|(_barcode, counts)| -(*counts as isize)); diff --git a/src/barcode_assign/counts.rs b/src/barcode_assign/counts.rs index afd5d34..c104743 100644 --- a/src/barcode_assign/counts.rs +++ b/src/barcode_assign/counts.rs @@ -5,22 +5,37 @@ use std::io::BufReader; use std::io::Read; use std::io::Write; use std::iter::FromIterator; +use std::iter::Sum; use std::path::Path; use bio::io::fasta; use bio::io::fastq; use failure; +/// Tabulation of barcode counts in a sample pub struct SampleCounts(HashMap, usize>); impl SampleCounts { + /// Returns a `HashMap` of barcodes and their counts pub fn count_map(self) -> HashMap, usize> { self.0 } + /// Reads a barcode count table from a file. + /// + /// # Arguments + /// + /// * `filename` is the name of the file that will be read. pub fn from_file + Display>(filename: P) -> Result { Self::read(std::fs::File::open(filename.as_ref())?) .map_err(|e| format_err!("Reading file {}: {}", filename, e)) } + /// Reads and parses a barcode count table. + /// + /// A barcode count table is a tab-delimited table of barcodecount. + /// + /// # Arguments + /// + /// * `input` is an input source that will be read. pub fn read(input: R) -> Result { let mut counts = HashMap::new(); @@ -57,6 +72,11 @@ impl SampleCounts { Ok(SampleCounts(counts)) } + /// Writes a barcode count table. + /// + /// # Arguments + /// + /// * The table will be written to the output desntiation `barcode_out` pub fn write(&self, barcode_out: W) -> Result<(), failure::Error> { let mut bcout = std::io::BufWriter::new(barcode_out); @@ -67,6 +87,15 @@ impl SampleCounts { Ok(()) } + /// Writes a barcode frequency table. + /// + /// The frequency table lists the number of barcodes seen 1 time, + /// 2 times, etc. The format of the table is + /// times-seennumber-of-barcodes. + /// + /// # Arguments + /// + /// * The table will be written to the output desntiation `barcode_out` pub fn write_freq_table(&self, freq_out: W) -> Result<(), failure::Error> { let mut fout = std::io::BufWriter::new(freq_out); @@ -87,17 +116,6 @@ impl SampleCounts { Ok(()) } - pub fn total_counts<'a, I: Iterator>(counts_iter: I) -> Self { - let mut total_counts = HashMap::new(); - for SampleCounts(count_map) in counts_iter { - for (barcode, count) in count_map.iter() { - let barcode_count = total_counts.entry(barcode.to_vec()).or_insert(0); - *barcode_count += *count; - } - } - SampleCounts(total_counts) - } - pub fn barcode_count_vec<'a, I: Iterator>( counts_iter: I, barcode: &'a [u8], @@ -108,6 +126,21 @@ impl SampleCounts { } } +impl <'a> Sum<&'a SampleCounts> for SampleCounts { + fn sum(iter: I) -> Self + where I: Iterator + { + let mut total_counts = HashMap::new(); + for SampleCounts(count_map) in iter { + for (barcode, count) in count_map.iter() { + let barcode_count = total_counts.entry(barcode.to_vec()).or_insert(0); + *barcode_count += *count; + } + } + SampleCounts(total_counts) + } +} + impl IntoIterator for SampleCounts { type Item = (Vec, usize); type IntoIter = ::std::collections::hash_map::IntoIter, usize>;