Skip to content

Commit

Permalink
Allowed control over the alignment scoring and added option to recrea…
Browse files Browse the repository at this point in the history
…te an alignment based on the path #41
  • Loading branch information
douweschulte committed Sep 27, 2024
1 parent 2948403 commit beb5758
Show file tree
Hide file tree
Showing 9 changed files with 421 additions and 118 deletions.
7 changes: 3 additions & 4 deletions examples/de-novo-align/src/main.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
use std::{collections::HashMap, fs::File, io::BufWriter};

use align::AlignScoring;
use clap::Parser;
use itertools::Itertools;
use rayon::prelude::*;
use rustyms::{
align::{align, matrix, AlignType},
align::{align, AlignType},
csv::write_csv,
identification::{open_identified_peptides_file, FastaData},
system::da,
*,
};

Expand Down Expand Up @@ -45,8 +45,7 @@ fn main() {
align::<4, SemiAmbiguous, SemiAmbiguous>(
&db.peptide,
peptide.peptide().unwrap(),
matrix::BLOSUM62,
Tolerance::Absolute(da(0.1)),
AlignScoring::default(),
AlignType::EITHER_GLOBAL,
),
)
Expand Down
12 changes: 7 additions & 5 deletions rustyms-generate-imgt/src/combine.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use std::collections::HashMap;
use std::fmt::Write;

use itertools::Itertools;
use rustyms::system::{dalton, Mass};
use rustyms::align::AlignScoring;
use rustyms::LinearPeptide;
use rustyms::UnAmbiguous;

Expand Down Expand Up @@ -193,12 +193,15 @@ impl std::fmt::Display for TemporaryGermline {
cons.1.iter().map(|i| seq.acc[*i].clone()).join(" "),
)?;
}
let scoring = AlignScoring::<'_> {
matrix: rustyms::align::matrix::BLOSUM90,
..Default::default()
};
if let Some(first_allele) = first_allele {
let alignment = rustyms::align::align::<1, UnAmbiguous, UnAmbiguous>(
first_allele,
&seq.sequence,
rustyms::align::matrix::BLOSUM90,
crate::Tolerance::new_absolute(Mass::new::<dalton>(0.01)),
scoring,
rustyms::align::AlignType::GLOBAL,
)
.stats();
Expand All @@ -213,8 +216,7 @@ impl std::fmt::Display for TemporaryGermline {
let alignment = rustyms::align::align::<1, UnAmbiguous, UnAmbiguous>(
reference,
&seq.sequence,
rustyms::align::matrix::BLOSUM90,
crate::Tolerance::new_absolute(Mass::new::<dalton>(0.01)),
scoring,
rustyms::align::AlignType::GLOBAL,
)
.stats();
Expand Down
201 changes: 187 additions & 14 deletions rustyms/src/align/alignment.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ use super::align_type::*;
use super::piece::*;
use super::scoring::*;

use crate::align::mass_alignment::determine_final_score;
use crate::align::mass_alignment::score_pair;
use crate::helper_functions::next_num;
use crate::peptide::AtMax;
use crate::peptide::Linear;
use crate::system::Mass;
Expand All @@ -19,6 +22,7 @@ use crate::LinearPeptide;
use crate::MolecularFormula;
use crate::Multi;
use crate::SequencePosition;
use crate::SimpleLinear;

/// An alignment of two reads. It has either a reference to the two sequences to prevent overzealous use of memory, or if needed use [`Self::to_owned`] to get a variant that clones the sequences and so can be used in more places.
#[derive(Debug)]
Expand Down Expand Up @@ -109,8 +113,183 @@ impl<'lifetime, A, B> Ord for Alignment<'lifetime, A, B> {
}
}

/// A generalised alignment with all behaviour.
#[allow(private_bounds)] // Intended behaviour no one should build on the inner structure
impl<'lifetime, A: AtMax<SimpleLinear>, B: AtMax<SimpleLinear>> Alignment<'lifetime, A, B> {
/// Recreate an alignment from a path, the path is [`Self::short`].
#[allow(clippy::missing_panics_doc)]
pub fn create_from_path(
seq_a: &'lifetime LinearPeptide<A>,
seq_b: &'lifetime LinearPeptide<B>,
start_a: usize,
start_b: usize,
path: &str,
scoring: AlignScoring,
align_type: AlignType,
maximal_step: u16,
) -> Option<Self> {
#[derive(PartialEq, Eq, Debug)]
enum StepType {
Gap,
Match,
Rotation,
Isobaric,
}

let mut index = 0;
let mut steps = Vec::new();
while index < path.len() {
if let Some((offset, num)) = next_num(path.as_bytes(), index, false) {
let num = u16::try_from(num).unwrap();
index += offset + 1;
match path.as_bytes()[index - 1] {
b'I' => steps.push((StepType::Gap, 0, num)),
b'D' => steps.push((StepType::Gap, num, 0)),
b'=' | b'X' => steps.push((StepType::Match, num, num)),
b'r' => steps.push((StepType::Rotation, num, num)),
b'i' => steps.push((StepType::Isobaric, num, num)),
b':' => {
if let Some((offset, num2)) = next_num(path.as_bytes(), index, false) {
let num2 = u16::try_from(num2).unwrap();
index += offset + 1;
match path.as_bytes()[index - 1] {
b'i' => steps.push((StepType::Isobaric, num, num2)),
_ => return None,
}
} else {
return None;
}
}
_ => return None,
}
} else {
return None;
}
}

dbg!((&steps, &path));

let mut index_a = start_a;
let mut index_b = start_b;
let mut score = 0;
let path = steps
.into_iter()
.flat_map(|(step, a, b)| match step {
StepType::Gap => {
let step_a = u16::from(a != 0);
let step_b = u16::from(b != 0);
index_a += a as usize;
index_b += b as usize;

(0..a.max(b))
.map(|i| {
if i == 0 {
let r = Piece {
score,
local_score: scoring.gap_start as isize
+ scoring.gap_extend as isize,
match_type: MatchType::Gap,
step_a,
step_b,
};
score += scoring.gap_start as isize + scoring.gap_extend as isize;
r
} else {
let r = Piece {
score,
local_score: scoring.gap_extend as isize,
match_type: MatchType::Gap,
step_a,
step_b,
};
score += scoring.gap_extend as isize;
r
}
})
.collect_vec()
}
StepType::Match => (0..a)
.map(|_| {
let mass_a = seq_a[index_a]
.formulas_all(
&[],
&[],
&mut Vec::new(),
false,
SequencePosition::Index(index_a),
0,
)
.0
.iter()
.map(MolecularFormula::monoisotopic_mass)
.collect();
let mass_b = seq_b[index_b]
.formulas_all(
&[],
&[],
&mut Vec::new(),
false,
SequencePosition::Index(index_b),
0,
)
.0
.iter()
.map(MolecularFormula::monoisotopic_mass)
.collect();
let piece = score_pair(
(&seq_a[index_a], &mass_a),
(&seq_b[index_b], &mass_b),
scoring,
score,
);
index_a += 1;
index_b += 1;
score += piece.local_score;
piece
})
.collect_vec(),
StepType::Rotation => {
let local_score =
scoring.mass_base as isize + scoring.rotated as isize * a as isize;
score += local_score;
index_a += a as usize;
index_b += b as usize;
vec![Piece {
score,
local_score,
match_type: MatchType::Rotation,
step_a: a,
step_b: b,
}]
}
StepType::Isobaric => {
let local_score = scoring.mass_base as isize
+ scoring.isobaric as isize * (a as isize + b as isize) / 2;
score += local_score;
index_a += a as usize;
index_b += b as usize;
vec![Piece {
score,
local_score,
match_type: MatchType::Isobaric,
step_a: a,
step_b: b,
}]
}
})
.collect_vec();

Some(Self {
seq_a: Cow::Borrowed(seq_a),
seq_b: Cow::Borrowed(seq_b),
score: determine_final_score(seq_a, seq_b, start_a, start_b, &path, scoring),
path,
start_a,
start_b,
align_type,
maximal_step,
})
}
}

impl<'lifetime, A, B> Alignment<'lifetime, A, B> {
/// The first sequence
pub fn seq_a(&self) -> &LinearPeptide<A> {
Expand Down Expand Up @@ -305,8 +484,7 @@ impl<'lifetime, A: AtMax<Linear>, B: AtMax<Linear>> Alignment<'lifetime, A, B> {
Self::Deletion => String::from("D"),
Self::Match => String::from("="),
Self::Mismatch => String::from("X"),
Self::Special(MatchType::Rotation, a, b) if a == b => format!("{a}r"),
Self::Special(MatchType::Rotation, a, b) => format!("{a}:{b}r"),
Self::Special(MatchType::Rotation, a, _) => format!("{a}r"),
Self::Special(MatchType::Isobaric, a, b) if a == b => format!("{a}i"),
Self::Special(MatchType::Isobaric, a, b) => format!("{a}:{b}i"),
Self::Special(..) => panic!("A special match cannot be of this match type"),
Expand Down Expand Up @@ -404,9 +582,8 @@ pub struct Score {
#[allow(clippy::missing_panics_doc)]
mod tests {
use crate::{
align::{align, matrix::BLOSUM62, AlignType},
align::{align, AlignScoring, AlignType},
peptide::SimpleLinear,
system::da,
AminoAcid, LinearPeptide, MultiChemical,
};

Expand Down Expand Up @@ -435,8 +612,7 @@ mod tests {
align::<1, SimpleLinear, SimpleLinear>(
&a,
&b,
BLOSUM62,
crate::Tolerance::new_absolute(da(0.1)),
AlignScoring::default(),
AlignType::GLOBAL
)
.mass_difference()
Expand All @@ -448,8 +624,7 @@ mod tests {
align::<1, SimpleLinear, SimpleLinear>(
&a,
&c,
BLOSUM62,
crate::Tolerance::new_absolute(da(0.1)),
AlignScoring::default(),
AlignType::GLOBAL
)
.mass_difference()
Expand All @@ -461,8 +636,7 @@ mod tests {
align::<1, SimpleLinear, SimpleLinear>(
&a,
&d,
BLOSUM62,
crate::Tolerance::new_absolute(da(0.1)),
AlignScoring::default(),
AlignType::GLOBAL_B
)
.mass_difference()
Expand All @@ -477,8 +651,7 @@ mod tests {
let mass_diff_bc = align::<1, SimpleLinear, SimpleLinear>(
&b,
&c,
BLOSUM62,
crate::Tolerance::new_absolute(da(0.1)),
AlignScoring::default(),
AlignType::GLOBAL_B,
)
.mass_difference()
Expand Down
7 changes: 3 additions & 4 deletions rustyms/src/align/bad_alignments.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#![allow(clippy::missing_panics_doc)]

use crate::{
align::{align, matrix, AlignType},
LinearPeptide, SimpleLinear, Tolerance,
align::{align, scoring::AlignScoring, AlignType},
LinearPeptide, SimpleLinear,
};

#[test]
Expand All @@ -22,8 +22,7 @@ fn test_alignment(peptide_one: &str, peptide_two: &str, path: &str) {
let alignment = align::<4, SimpleLinear, SimpleLinear>(
&first_peptide,
&second_peptide,
matrix::BLOSUM62,
Tolerance::new_ppm(10.0),
AlignScoring::default(),
AlignType::GLOBAL,
);
assert_eq!(
Expand Down
Loading

0 comments on commit beb5758

Please sign in to comment.