Skip to content

Commit

Permalink
barebones bamCompare implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
WardDeb committed Nov 21, 2024
1 parent 659888f commit 5ced25d
Show file tree
Hide file tree
Showing 8 changed files with 179 additions and 101 deletions.
5 changes: 4 additions & 1 deletion pydeeptools/deeptools/bamCompare2.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,8 @@ def main(args=None):
args.scaleFactorsMethod = 'None'
else:
args.normalizeUsing = 'None'

print(args)
args.pseudocount = 1
r_bamcompare(
args.bamfile1, # bam file 1
args.bamfile2, # bam file 2
Expand All @@ -253,6 +254,8 @@ def main(args=None):
args.normalizeUsing, # normalization method
args.scaleFactorsMethod, # scaling method
args.effectiveGenomeSize, # effective genome size
args.operation,
args.pseudocount,
args.numberOfProcessors, # threads
args.binSize, # bin size
[], # regions
Expand Down
125 changes: 54 additions & 71 deletions src/bamcompare.rs
Original file line number Diff line number Diff line change
@@ -1,96 +1,79 @@
use pyo3::prelude::*;
use rayon::prelude::*;
use rayon::ThreadPoolBuilder;
use crate::filehandler::bam_stats;
use crate::filehandler::{bam_ispaired, write_file};
use crate::covcalc::{bam_pileup, parse_regions, collapse_bgvecs};
use crate::normalization::scale_factor_bamcompare;
use crate::covcalc::{bam_pileup, parse_regions};
use crate::calc::median;

#[pyfunction]
pub fn r_bamcompare(
bam_ifile: &str,
bam_ifile1: &str,
bam_ifile2: &str,
_ofile: &str,
_ofiletype: &str,
_norm: &str,
ofile: &str,
ofiletype: &str,
norm: &str,
scalefactorsmethod: &str,
effective_genome_size: u64,
operation: &str,
pseudocount: f64,
nproc: usize,
binsize: u32,
regions: Vec<(String, u64, u64)>,
verbose: bool
) -> PyResult<()> {
// put statistics into scope, this should probably be rewritten. (can't we always assume at least 2 threads ? )
// will need to be revisited for multiBamsummaries / computeMatrix.
let mut total_reads1: u64 = 0;
let mut total_reads2: u64 = 0;
let mut mapped_reads1: u64 = 0;
let mut mapped_reads2: u64 = 0;
let mut unmapped_reads1: u64 = 0;
let mut unmapped_reads2: u64 = 0;
let mut readlen1: f32 = 0.0;
let mut readlen2: f32 = 0.0;
let mut fraglen1: f32 = 0.0;
let mut fraglen2: f32 = 0.0;
// Get statistics of bam file.
if nproc > 1 {
let pool2 = ThreadPoolBuilder::new().num_threads(2).build().unwrap();
let bamstatvec: Vec<_> = pool2.install(|| {
vec![
(bam_ifile, &verbose),
(bam_ifile2, &verbose)
]
.par_iter()
.map(|(bam_ifile, verbose)| bam_stats(bam_ifile, verbose))
.collect()
});
let (_total_reads1, _mapped_reads1, _unmapped_reads1, _readlen1, _fraglen1) = bamstatvec[0];
let (_total_reads2, _mapped_reads2, _unmapped_reads2, _readlen2, _fraglen2) = bamstatvec[1];
total_reads1 = _total_reads1;
total_reads2 = _total_reads2;
mapped_reads1 = _mapped_reads1;
mapped_reads2 = _mapped_reads2;
unmapped_reads1 = _unmapped_reads1;
unmapped_reads2 = _unmapped_reads2;
readlen1 = _readlen1;
readlen2 = _readlen2;
fraglen1 = _fraglen1;
fraglen2 = _fraglen2;
let ispe1 = bam_ispaired(bam_ifile1);
let ispe2 = bam_ispaired(bam_ifile2);

} else {
let (_total_reads1, _mapped_reads1, _unmapped_reads1, _readlen1, _fraglen1) = bam_stats(bam_ifile, &verbose);
let (_total_reads2, _mapped_reads2, _unmapped_reads2, _readlen2, _fraglen2) = bam_stats(bam_ifile2, &verbose);
total_reads1 = _total_reads1;
total_reads2 = _total_reads2;
mapped_reads1 = _mapped_reads1;
mapped_reads2 = _mapped_reads2;
unmapped_reads1 = _unmapped_reads1;
unmapped_reads2 = _unmapped_reads2;
readlen1 = _readlen1;
readlen2 = _readlen2;
fraglen1 = _fraglen1;
fraglen2 = _fraglen2;
if verbose {
println!("Sample1: {} is-paired: {}", bam_ifile1, ispe1);
println!("Sample2: {} is-paired: {}", bam_ifile2, ispe2);
}

// Calculate scale factors
let (scale_factor1, scale_factor2) = scale_factor_bamcompare(
scalefactorsmethod,
mapped_reads1,
mapped_reads2,
binsize as u64,
effective_genome_size
);
println!("scalefactors = {} and {}", scale_factor1, scale_factor2);
let (regions, chromsizes) = parse_regions(&regions, bam_ifile);
// Parse regions & calculate coverage
let (regions, chromsizes) = parse_regions(&regions, bam_ifile1);
let pool = ThreadPoolBuilder::new().num_threads(nproc).build().unwrap();
let _bg1: Vec<(String, u64, u64, f64)> = pool.install(|| {

// Parse first bamfile
let (bg1, mapped1, _unmapped1, readlen1, fraglen1) = pool.install(|| {
regions.par_iter()
.flat_map(|i| bam_pileup(bam_ifile, &i, &binsize, scale_factor1))
.collect()
.map(|i| bam_pileup(bam_ifile1, &i, &binsize, &ispe1))
.reduce(
|| (vec![], 0, 0, vec![], vec![]),
|(mut _bg, mut _mapped, mut _unmapped, mut _readlen, mut _fraglen), (bg, mapped, unmapped, readlen, fraglen)| {
_bg.extend(bg);
_readlen.extend(readlen);
_fraglen.extend(fraglen);
_mapped += mapped;
_unmapped += unmapped;
(_bg, _mapped, _unmapped, _readlen, _fraglen)
}
)
});
let _bg2: Vec<(String, u64, u64, f64)> = pool.install(|| {
let _readlen1 = median(readlen1);
let _fraglen1 = median(fraglen1);

// Parse first bamfile
let (bg2, mapped2, _unmapped2, readlen2, fraglen2) = pool.install(|| {
regions.par_iter()
.flat_map(|i| bam_pileup(bam_ifile2, &i, &binsize, scale_factor2))
.collect()
.map(|i| bam_pileup(bam_ifile2, &i, &binsize, &ispe2))
.reduce(
|| (vec![], 0, 0, vec![], vec![]),
|(mut _bg, mut _mapped, mut _unmapped, mut _readlen, mut _fraglen), (bg, mapped, unmapped, readlen, fraglen)| {
_bg.extend(bg);
_readlen.extend(readlen);
_fraglen.extend(fraglen);
_mapped += mapped;
_unmapped += unmapped;
(_bg, _mapped, _unmapped, _readlen, _fraglen)
}
)
});
let _readlen2 = median(readlen2);
let _fraglen2 = median(fraglen2);
let (sf1, sf2) = scale_factor_bamcompare(scalefactorsmethod, mapped1, mapped2, binsize, effective_genome_size, norm);
println!("scale factor1 = {}, scale factor2 = {}", sf1, sf2);
let bge = collapse_bgvecs(bg1, bg2, sf1, sf2, pseudocount, operation);
write_file(ofile, ofiletype, bge, chromsizes);
Ok(())
}
3 changes: 2 additions & 1 deletion src/bamcoverage.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ pub fn r_bamcoverage(
// Parse regions & calculate coverage
let (regions, chromsizes) = parse_regions(&regions, bam_ifile);
let pool = ThreadPoolBuilder::new().num_threads(nproc).build().unwrap();
let (bg, mapped, unmapped, readlen, fraglen) = pool.install(|| {
let (bg, mapped, _unmapped, readlen, fraglen) = pool.install(|| {
regions.par_iter()
.map(|i| bam_pileup(bam_ifile, &i, &binsize, &ispe))
.reduce(
Expand All @@ -48,6 +48,7 @@ pub fn r_bamcoverage(
binsize,
effective_genome_size,
readlen,
fraglen,
&verbose
);
let bg_scaled = collapse_bgvec(bg, sf);
Expand Down
2 changes: 1 addition & 1 deletion src/calc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,4 @@ pub fn median(mut nvec: Vec<u32>) -> f32 {
return (nvec[len / 2] + nvec[len / 2 - 1]) as f32 / 2.0;
}
}
}
}
130 changes: 110 additions & 20 deletions src/covcalc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,32 @@ pub fn bam_pileup(
return (bg, mapped_reads, unmapped_reads, readlens, fraglens);
}

/// Extract contiguous reference-aligned blocks from a bam record.
///
/// A block is a run of CIGAR operations that consume the reference and align
/// the read to it (M/=/X). Deletions and reference skips (D/N) advance the
/// reference position and split blocks; insertions, padding and clips do not
/// consume the reference at all.
fn bam_blocks(rec: bam::Record) -> Vec<(i64, i64)> {
    let mut blocks: Vec<(i64, i64)> = Vec::new();
    // Current 0-based position on the reference.
    let mut pos = rec.pos();

    for c in rec.cigar().iter() {
        match c {
            // M/=/X consume the reference: emit an aligned block.
            Cigar::Match(len) | Cigar::Equal(len) | Cigar::Diff(len) => {
                let start = pos;
                let end = pos + *len as i64;
                blocks.push((start, end));
                pos = end;
            }
            // I consumes the query only and P consumes neither (SAM spec
            // CIGAR table): the reference position must NOT advance here.
            Cigar::Ins(_) | Cigar::Pad(_) => (),
            // D/N consume the reference but interrupt the aligned block.
            Cigar::Del(len) | Cigar::RefSkip(len) => {
                pos += *len as i64;
            }
            // Soft/hard clips consume no reference.
            _ => (),
        }
    }
    blocks
}

/// Takes a bedgraph vector, and combines adjacent blocks with equal coverage
#[allow(unused_assignments)]
pub fn collapse_bgvec(mut bg: Vec<(String, u64, u64, f64)>, scale_factor: f64) -> Vec<(String, u64, u64, f64)> {
Expand All @@ -259,28 +285,92 @@ pub fn collapse_bgvec(mut bg: Vec<(String, u64, u64, f64)>, scale_factor: f64) -
return cvec;
}

/// Extract contigious blocks from a bam record
/// Blocks are split per insertion, padding, deletion and ref skip
fn bam_blocks(rec: bam::Record) -> Vec<(i64, i64)> {
let mut blocks: Vec<(i64, i64)> = Vec::new();
let mut pos = rec.pos();
/// Takes two bedgraph vectors, and combines adjacent blocks with equal coverage
#[allow(unused_assignments)]
pub fn collapse_bgvecs(
mut bg1: Vec<(String, u64, u64, f64)>,
mut bg2: Vec<(String, u64, u64, f64)>,
sf1: f64,
sf2: f64,
pseudocount: f64,
operation: &str
) -> Vec<(String, u64, u64, f64)> {
assert_eq!(bg1.len(), bg2.len(), "Bedgraph vectors must be of equal length.");
let mut cvec: Vec<(String, u64, u64, f64)> = Vec::new();
// initialize values, assert equal bins
let (mut lchrom, mut lstart, mut lend, lcov1) = bg1.remove(0);
let (lchrom2, lstart2, lend2, lcov2) = bg2.remove(0);
assert_eq!(lchrom, lchrom2,"Chromosomes in bedgraph vectors must be equal.");
assert_eq!(lstart, lstart2,"Start positions in bedgraph vectors must be equal.");
assert_eq!(lend, lend2,"End positions in bedgraph vectors must be equal.");

for c in rec.cigar().iter() {
match c {
Cigar::Match(len) | Cigar::Equal(len) | Cigar::Diff(len) => {
let start = pos;
let end = pos + *len as i64;
blocks.push((start, end));
pos = end;
}
Cigar::Ins(len) | Cigar::Pad(len) => {
pos += *len as i64;
}
Cigar::Del(len) | Cigar::RefSkip(len) => {
pos += *len as i64;
let mut lcov: f64 = calc_ratio(lcov1, lcov2, &sf1, &sf2, &pseudocount, &operation);
for (
(chrom, start, end, cov1),
(chrom2, start2, end2, cov2)
) in bg1.into_iter().zip(bg2.into_iter()) {
// assert equal bins.
assert_eq!(chrom, chrom2,"Chromosomes in bedgraph vectors must be equal.");
assert_eq!(start, start2,"Start positions in bedgraph vectors must be equal.");
assert_eq!(end, end2,"End positions in bedgraph vectors must be equal.");
// Calculate coverage, depending on what operation is fed
let cov: f64 = calc_ratio(cov1, cov2, &sf1, &sf2, &pseudocount, &operation);
// Collapse bg vector properly
if chrom != lchrom {
cvec.push((lchrom, lstart, lend, lcov));
lchrom = chrom;
lstart = start;
lend = end;
lcov = cov;
} else if cov != lcov {
cvec.push((lchrom, lstart, lend, lcov));
lchrom = chrom;
lstart = start;
lend = end;
lcov = cov;
}
lend = end;
}
cvec.push((lchrom, lstart, lend, lcov));
return cvec;
}

/// Combine one pair of binned coverage values into a single track value.
///
/// Scale factors are applied first, then the pseudocount, then the operation:
/// * `"log2"`             -> log2((cov1*sf1 + p) / (cov2*sf2 + p))
/// * `"ratio"`            -> (cov1*sf1 + p) / (cov2*sf2 + p)
/// * `"reciprocal_ratio"` -> ratio when >= 1, otherwise -1/ratio
///   (the deepTools bamCompare convention, symmetric around +/-1)
///
/// Any other operation falls back to log2; "no operation" is rejected at the
/// Python argument-parsing level, so the fallback arm should be unreachable.
fn calc_ratio(
    cov1: f64,
    cov2: f64,
    sf1: &f64,
    sf2: &f64,
    pseudocount: &f64,
    operation: &str
) -> f64 {
    // Scale factor first, then pseudocount (same order as before for every
    // operation, so the computation is hoisted out of the match arms).
    let num: f64 = cov1 * *sf1 + *pseudocount;
    let den: f64 = cov2 * *sf2 + *pseudocount;
    match operation {
        "ratio" => num / den,
        "reciprocal_ratio" => {
            // deepTools: report the ratio itself when >= 1, else the negated
            // inverse. (The previous code had num/den swapped in both
            // branches, returning 1/ratio and -ratio instead.)
            let ratio: f64 = num / den;
            if ratio >= 1.0 { ratio } else { -den / num }
        }
        // "log2" and the (unreachable) fallback.
        _ => (num / den).log2(),
    }
}
5 changes: 2 additions & 3 deletions src/filehandler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ pub fn bam_ispaired(bam_ifile: &str) -> bool {
return false;
}

pub fn write_file(ofile: &str, filetype: &str, bg: Vec<(String, u64, u64, f64)>, chromsizes: HashMap<String, u32>) -> Result<(), std::string::String> {
pub fn write_file(ofile: &str, filetype: &str, bg: Vec<(String, u64, u64, f64)>, chromsizes: HashMap<String, u32>) {
if filetype == "bedgraph" {
// write output file, bedgraph
let mut writer = BufWriter::new(File::create(ofile).unwrap());
Expand All @@ -39,7 +39,6 @@ pub fn write_file(ofile: &str, filetype: &str, bg: Vec<(String, u64, u64, f64)>,
println!("Init writer");
let writer = BigWigWrite::create_file(ofile, chromsizes).unwrap();
println!("Start writer");
writer.write(vals, runtime);
let _ = writer.write(vals, runtime);
}
Ok(())
}
4 changes: 2 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use pyo3::prelude::*;
mod bamcoverage;
//mod bamcompare;
mod bamcompare;
mod covcalc;
mod alignmentsieve;
mod filehandler;
Expand All @@ -10,6 +10,6 @@ mod calc;
/// Python extension module: registers the Rust entry points exposed to the
/// deeptools Python layer.
#[pymodule]
fn hp(m: &Bound<'_, PyModule>) -> PyResult<()> {
    m.add_function(wrap_pyfunction!(bamcoverage::r_bamcoverage, m)?)?;
    m.add_function(wrap_pyfunction!(bamcompare::r_bamcompare, m)?)?;
    Ok(())
}
6 changes: 4 additions & 2 deletions src/normalization.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ pub fn scale_factor(
binsize: u32,
effective_genome_size: u64,
readlen: f32,
_fraglen: f32,
verbose: &bool
) -> f64 {
let mut scale_factor = 1.0;
Expand Down Expand Up @@ -44,8 +45,9 @@ pub fn scale_factor_bamcompare(
norm_method: &str,
mapped_bam1: u64,
mapped_bam2: u64,
_binsize: u64,
_effective_genome_size: u64
_binsize: u32,
_effective_genome_size: u64,
_norm: &str
) -> (f64, f64) {
return match norm_method {
"readCount" => {
Expand Down

0 comments on commit 5ced25d

Please sign in to comment.