Skip to content

Commit

Permalink
fix: affine gap penalties must be scaled by highest substitution cost.
Browse files Browse the repository at this point in the history
  • Loading branch information
nishaq503 committed Nov 16, 2024
1 parent 32a3e17 commit dbc4243
Show file tree
Hide file tree
Showing 4 changed files with 67 additions and 13 deletions.
15 changes: 15 additions & 0 deletions crates/abd-clam/src/msa/columnar.rs
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,11 @@ impl<'a, U: Number + Neg<Output = U>> Columnar<'a, U> {
D: Dataset<T, U>,
C: Cluster<T, U, D>,
{
ftlog::trace!(
"Adding cluster to MSA. Depth: {}, Cardinality: {}",
c.depth(),
c.cardinality()
);
let center = Self::new(self.aligner).with_sequence(data.get(c.arg_center()));
c.indices()
.filter(|&i| i != c.arg_center())
Expand Down Expand Up @@ -151,6 +156,11 @@ impl<'a, U: Number + Neg<Output = U>> Columnar<'a, U> {
/// Merge two MSAs.
#[must_use]
pub fn merge(mut self, self_center: usize, mut other: Self, other_center: usize) -> Self {
ftlog::trace!(
"Merging MSAs with cardinalities: {} and {}, and centers {self_center} and {other_center}",
self.len(),
other.len()
);
let x = self.get_sequence(self_center);
let y = other.get_sequence(other_center);
let table = self.aligner.dp_table(&x, &y);
Expand Down Expand Up @@ -267,6 +277,11 @@ impl<'a, U: Number + Neg<Output = U>> Columnar<'a, U> {
/// Parallel version of `merge`.
#[must_use]
pub fn par_merge(mut self, self_center: usize, mut other: Self, other_center: usize) -> Self {
ftlog::trace!(
"Parallel Merging MSAs with cardinalities: {} and {}, and centers {self_center} and {other_center}",
self.len(),
other.len()
);
let x = self.get_sequence(self_center);
let y = other.get_sequence(other_center);
let table = self.aligner.dp_table(&x, &y);
Expand Down
35 changes: 28 additions & 7 deletions crates/abd-clam/src/msa/needleman_wunsch/cost_matrix.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ impl<T: Number> CostMatrix<T> {
}

/// Create a new substitution matrix with affine gap penalties.
///
/// All substitution costs are set to 1, the gap opening cost is 10, and the
/// gap extension cost is 1.
#[must_use]
pub fn default_affine() -> Self {
Self::new(T::ONE, T::from(10), T::ONE)
Expand All @@ -55,14 +58,26 @@ impl<T: Number> CostMatrix<T> {
/// Add a constant to all substitution costs.
#[must_use]
pub fn shift(mut self, shift: T) -> Self {
self.sub_matrix.iter_mut().flatten().for_each(|cost| *cost += shift);
for i in 0..NUM_CHARS {
for j in 0..NUM_CHARS {
self.sub_matrix[i][j] += shift;
}
}
// self.gap_open += shift;
// self.gap_ext += shift;
self
}

/// Multiply all substitution costs by a constant.
#[must_use]
pub fn scale(mut self, scale: T) -> Self {
self.sub_matrix.iter_mut().flatten().for_each(|cost| *cost *= scale);
for i in 0..NUM_CHARS {
for j in 0..NUM_CHARS {
self.sub_matrix[i][j] *= scale;
}
}
// self.gap_open *= scale;
// self.gap_ext *= scale;
self
}

Expand Down Expand Up @@ -133,7 +148,13 @@ impl<T: Number + Neg<Output = T>> Neg for CostMatrix<T> {
type Output = Self;

fn neg(mut self) -> Self::Output {
self.sub_matrix.iter_mut().flatten().for_each(|cost| *cost = -*cost);
for i in 0..NUM_CHARS {
for j in 0..NUM_CHARS {
self.sub_matrix[i][j] = -self.sub_matrix[i][j];
}
}
self.gap_open = -self.gap_open;
self.gap_ext = -self.gap_ext;
self
}
}
Expand Down Expand Up @@ -237,8 +258,8 @@ impl<T: Number + Neg<Output = T>> CostMatrix<T> {
// Add the costs to the substitution matrix.
.fold(matrix, |matrix, (a, b, cost)| matrix.with_sub_cost(a, b, cost))
// Add affine gap penalties.
.with_gap_open(T::from(lcm))
.with_gap_ext(T::ONE)
.with_gap_open(T::from(lcm * 10))
.with_gap_ext(T::from(lcm))
}

/// The BLOSUM62 substitution matrix for proteins.
Expand Down Expand Up @@ -310,7 +331,7 @@ impl<T: Number + Neg<Output = T>> CostMatrix<T> {
.neg()
.normalize()
// Add affine gap penalties.
.with_gap_open(T::from(15))
.with_gap_ext(T::ONE)
.with_gap_open(T::from(15 * 10))
.with_gap_ext(T::from(15))
}
}
28 changes: 23 additions & 5 deletions crates/abd-clam/src/msa/quality/row_major.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,7 @@ impl<T: AsRef<[u8]>, U: Number, M> FlatVec<T, U, M> {
let rows = self.instances.iter().map(AsRef::as_ref).collect::<Vec<_>>();
let width = rows[0].len();

let mut instances = Vec::with_capacity(width);
for i in 0..width {
let col = rows.iter().map(|row| row[i]).collect();
instances.push(col);
}
let instances = (0..width).map(|i| rows.iter().map(|row| row[i]).collect()).collect();

let dimensionality_hint = (self.cardinality(), Some(self.cardinality()));
let name = format!("ColMajor({})", self.name);
Expand Down Expand Up @@ -193,6 +189,28 @@ impl<T: AsRef<[u8]>, U: Number, M> FlatVec<T, U, M> {

// Parallelized implementations here
impl<T: AsRef<[u8]> + Send + Sync, U: Number, M: Send + Sync> FlatVec<T, U, M> {
/// Parallelized version of `as_col_major`.
#[must_use]
pub fn par_as_col_major<Tn: FromIterator<u8> + Send + Sync>(&self, metric: Metric<Tn, U>) -> FlatVec<Tn, U, usize> {
let rows = self.instances.iter().map(AsRef::as_ref).collect::<Vec<_>>();
let width = rows[0].len();

let instances = (0..width)
.into_par_iter()
.map(|i| rows.iter().map(|row| row[i]).collect())
.collect();

let dimensionality_hint = (self.cardinality(), Some(self.cardinality()));
let name = format!("ColMajor({})", self.name);
FlatVec {
metric,
instances,
dimensionality_hint,
permutation: (0..width).collect(),
metadata: (0..width).collect(),
name,
}
}
/// Calculates the average and maximum `p-distance`s of all pairwise
/// alignments in the MSA.
#[must_use]
Expand Down
2 changes: 1 addition & 1 deletion crates/results/msa/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ fn main() -> Result<(), String> {

ftlog::info!("Converting to column-major format.");
let metric = Metric::default();
let col_msa_data = msa_data.as_col_major::<Vec<_>>(metric);
let col_msa_data = msa_data.par_as_col_major::<Vec<_>>(metric);
ftlog::info!("Finished converting to column-major format.");

let cs_quality = col_msa_data.par_scoring_columns(gap_char, gap_penalty, mismatch_penalty);
Expand Down

0 comments on commit dbc4243

Please sign in to comment.