From 2312e3552ab3168df7ed97add73b5814b35c7fe8 Mon Sep 17 00:00:00 2001 From: Douwe Schulte Date: Fri, 27 Oct 2023 14:49:07 +0200 Subject: [PATCH] Fixed more issues with ontologies --- src/build/gnome.rs | 20 ++++------- src/build/ontology_modification.rs | 16 +++++---- src/build/psi_mod.rs | 4 +-- src/build/unimod.rs | 4 +-- src/complex_peptide.rs | 29 ++++++++++++--- src/formula.rs | 4 +-- src/glycan.rs | 34 ++++++++++-------- src/isobaric_sets.rs | 57 +++++++++++++++++------------- src/linear_peptide.rs | 15 +++----- src/modification.rs | 9 +++-- src/ontologies.rs | 14 ++++---- src/pro_forma_parse_tests.rs | 4 +++ src/shared/glycan.rs | 10 ++++++ src/shared/modification.rs | 2 +- src/shared/ontology.rs | 2 ++ 15 files changed, 134 insertions(+), 90 deletions(-) diff --git a/src/build/gnome.rs b/src/build/gnome.rs index 2e266ce..40d9d34 100644 --- a/src/build/gnome.rs +++ b/src/build/gnome.rs @@ -2,7 +2,7 @@ use std::{collections::HashMap, ffi::OsString, io::Write, path::Path}; use crate::{build::csv::parse_csv, glycan::*}; -use super::{obo::OboOntology, GnoComposition, Modification}; +use super::{obo::OboOntology, ontology_modification::OntologyList, GnoComposition, Modification}; pub fn build_gnome_ontology(out_dir: &OsString, debug: bool) { // Get all the basic info @@ -26,21 +26,10 @@ pub fn build_gnome_ontology(out_dir: &OsString, debug: bool) { let final_mods = mods .into_values() .filter(|m| m.mass.is_some()) - .take(10) .map(|m| (0_usize, m.code_name.clone(), m.into_mod())) .collect::>(); - file.write_all(&bincode::serialize(&final_mods).unwrap()) + file.write_all(&bincode::serialize::(&final_mods).unwrap()) .unwrap(); - // let mut writer = BufWriter::new(file); - // writeln!( - // writer, - // "pub const GNOME_ONTOLOGY: &[(usize, &str, Modification)] = &[" - // ) - // .unwrap(); - // for modification in mods.values().filter(|m| m.mass.is_some()) { - // writeln!(writer, "{},", modification.to_code()).unwrap(); - // } - // writeln!(writer, "];").unwrap(); } fn find_mass(mods: &HashMap, mut name: String) -> Option { @@ -138,7 +127,10 @@ impl GNOmeModification { if let Some(structure) = self.structure { Modification::Gno(GnoComposition::Structure(structure), self.code_name) } else if let Some(mass) = self.mass { - Modification::Gno(GnoComposition::Mass(mass), self.code_name) + Modification::Gno( + GnoComposition::Mass(crate::system::f64::da(mass)), + self.code_name, + ) } else { panic!("unreachable") } diff --git a/src/build/ontology_modification.rs b/src/build/ontology_modification.rs index 91983b1..d0c0c2e 100644 --- a/src/build/ontology_modification.rs +++ b/src/build/ontology_modification.rs @@ -16,13 +16,17 @@ pub struct OntologyModification { } impl OntologyModification { - pub fn into_mod(self) -> Modification { - Modification::Predefined( - self.diff_formula, - self.rules, - self.ontology, - self.code_name, + pub fn into_mod(self) -> (usize, String, Modification) { + ( self.id, + self.code_name.to_ascii_lowercase(), + Modification::Predefined( + self.diff_formula, + self.rules, + self.ontology, + self.code_name, + self.id, + ), ) } } diff --git a/src/build/psi_mod.rs b/src/build/psi_mod.rs index 8e0b8f0..4cf4698 100644 --- a/src/build/psi_mod.rs +++ b/src/build/psi_mod.rs @@ -4,7 +4,7 @@ use crate::{formula::MolecularFormula, ELEMENT_PARSE_LIST}; use super::{ obo::OboOntology, - ontology_modification::{OntologyModification, PlacementRule, Position}, + ontology_modification::{OntologyList, OntologyModification, PlacementRule, Position}, }; pub fn build_psi_mod_ontology(out_dir: &OsString, debug: bool) { @@ -13,7 +13,7 @@ pub fn build_psi_mod_ontology(out_dir: &OsString, debug: bool) { let dest_path = Path::new(&out_dir).join("psimod.dat"); let mut file = std::fs::File::create(dest_path).unwrap(); let final_mods = mods.into_iter().map(|m| m.into_mod()).collect::>(); - file.write_all(&bincode::serialize(&final_mods).unwrap()) + file.write_all(&bincode::serialize::(&final_mods).unwrap()) .unwrap(); } diff --git a/src/build/unimod.rs b/src/build/unimod.rs index 8d2a487..e3b5925 100644 --- a/src/build/unimod.rs +++ b/src/build/unimod.rs @@ -6,7 +6,7 @@ use crate::{formula::MolecularFormula, glycan::MonoSaccharide, print, Element}; use super::{ obo::OboOntology, - ontology_modification::{OntologyModification, PlacementRule}, + ontology_modification::{OntologyList, OntologyModification, PlacementRule}, }; pub fn build_unimod_ontology(out_dir: &OsString, debug: bool) { @@ -15,7 +15,7 @@ pub fn build_unimod_ontology(out_dir: &OsString, debug: bool) { let dest_path = Path::new(&out_dir).join("unimod.dat"); let mut file = std::fs::File::create(dest_path).unwrap(); let final_mods = mods.into_iter().map(|m| m.into_mod()).collect::>(); - file.write_all(&bincode::serialize(&final_mods).unwrap()) + file.write_all(&bincode::serialize::(&final_mods).unwrap()) .unwrap(); } diff --git a/src/complex_peptide.rs b/src/complex_peptide.rs index 507c1a1..32bee0a 100644 --- a/src/complex_peptide.rs +++ b/src/complex_peptide.rs @@ -1,3 +1,5 @@ +use std::fmt::Display; + use itertools::Itertools; use crate::{ @@ -21,6 +23,25 @@ pub enum ComplexPeptide { Multimeric(Vec), } +impl Display for ComplexPeptide { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Singular(s) => write!(f, "{s}"), + Self::Multimeric(m) => { + let mut first = true; + for pep in m { + if !first { + write!(f, "+")?; + } + write!(f, "{pep}")?; + first = false; + } + Ok(()) + } + } + } +} + impl ComplexPeptide { /// [Pro Forma specification](https://github.com/HUPO-PSI/ProForma) /// Only supports a subset of the specification (see `proforma_grammar.md` for an overview of what is supported), some functions are not possible to be represented. @@ -134,7 +155,7 @@ impl ComplexPeptide { } ch => { peptide.sequence.push(SequenceElement::new( - ch.try_into().map_err(|_| { + ch.try_into().map_err(|()| { CustomError::error( "Invalid amino acid", "This character is not a valid amino acid", @@ -207,7 +228,7 @@ impl ComplexPeptide { .flat_err()?; for aa in line[at_index..end_index].split(',') { global_modifications.push(GlobalModification::Fixed( - aa.try_into().map_err(|_| { + aa.try_into().map_err(|()| { CustomError::error( "Invalid global modification", "The location could not be read as an amino acid", @@ -398,7 +419,7 @@ impl ComplexPeptide { chars[offset+count_len..].iter() .take_while(|c| c.is_ascii_alphabetic()) .count(); - let element: Element = std::str::from_utf8(&chars[offset+count_len..offset+count_len+element_len]).unwrap().try_into().map_err(|_| CustomError::error( + let element: Element = std::str::from_utf8(&chars[offset+count_len..offset+count_len+element_len]).unwrap().try_into().map_err(|()| CustomError::error( "Invalid adduct ion", "Invalid element symbol", Context::line(0, line, offset+count_len, element_len), @@ -489,7 +510,7 @@ impl ComplexPeptide { } (false, ch) => { peptide.sequence.push(SequenceElement::new( - ch.try_into().map_err(|_| CustomError::error( + ch.try_into().map_err(|()| CustomError::error( "Invalid amino acid", "This character is not a valid amino acid", Context::line(0, line, index, 1), diff --git a/src/formula.rs b/src/formula.rs index 1ef9196..dd64bed 100644 --- a/src/formula.rs +++ b/src/formula.rs @@ -216,7 +216,7 @@ fn to_subscript_num(input: isize) -> String { if *c == b'-' { output.push('\u{208B}'); } else { - output.push(char::from_u32(*c as u32 + 0x2080 - 0x30).unwrap()); + output.push(char::from_u32(u32::from(*c) + 0x2080 - 0x30).unwrap()); } } output @@ -235,7 +235,7 @@ fn to_superscript_num(input: isize) -> String { } else if *c == b'3' { output.push('\u{00B3}'); } else { - output.push(char::from_u32(*c as u32 + 0x2070 - 0x30).unwrap()); + output.push(char::from_u32(u32::from(*c) + 0x2070 - 0x30).unwrap()); } } output diff --git a/src/glycan.rs b/src/glycan.rs index 8bd4854..596e724 100644 --- a/src/glycan.rs +++ b/src/glycan.rs @@ -424,9 +424,9 @@ mod test { assert_eq!( GlycanStructure::from_str("Hep(Hex)").unwrap(), GlycanStructure { - sugar: MonoSaccharide::new(BaseSugar::Heptose(None), &[]), + sugar: MonoSaccharide::new(BaseSugar::Heptose(None), &[]).with_name("Hep"), branches: vec![GlycanStructure { - sugar: MonoSaccharide::new(BaseSugar::Hexose(None), &[]), + sugar: MonoSaccharide::new(BaseSugar::Hexose(None), &[]).with_name("Hex"), branches: Vec::new() }], } @@ -434,14 +434,14 @@ mod test { assert_eq!( GlycanStructure::from_str("Hex(Hex,Hep)").unwrap(), GlycanStructure { - sugar: MonoSaccharide::new(BaseSugar::Hexose(None), &[]), + sugar: MonoSaccharide::new(BaseSugar::Hexose(None), &[]).with_name("Hex"), branches: vec![ GlycanStructure { - sugar: MonoSaccharide::new(BaseSugar::Hexose(None), &[]), + sugar: MonoSaccharide::new(BaseSugar::Hexose(None), &[]).with_name("Hex"), branches: Vec::new() }, GlycanStructure { - sugar: MonoSaccharide::new(BaseSugar::Heptose(None), &[]), + sugar: MonoSaccharide::new(BaseSugar::Heptose(None), &[]).with_name("Hep"), branches: Vec::new() } ], @@ -450,17 +450,18 @@ mod test { assert_eq!( GlycanStructure::from_str("Hex(Hex(Hex),Hep)").unwrap(), GlycanStructure { - sugar: MonoSaccharide::new(BaseSugar::Hexose(None), &[]), + sugar: MonoSaccharide::new(BaseSugar::Hexose(None), &[]).with_name("Hex"), branches: vec![ GlycanStructure { - sugar: MonoSaccharide::new(BaseSugar::Hexose(None), &[]), + sugar: MonoSaccharide::new(BaseSugar::Hexose(None), &[]).with_name("Hex"), branches: vec![GlycanStructure { - sugar: MonoSaccharide::new(BaseSugar::Hexose(None), &[]), + sugar: MonoSaccharide::new(BaseSugar::Hexose(None), &[]) + .with_name("Hex"), branches: Vec::new() }] }, GlycanStructure { - sugar: MonoSaccharide::new(BaseSugar::Heptose(None), &[]), + sugar: MonoSaccharide::new(BaseSugar::Heptose(None), &[]).with_name("Hep"), branches: Vec::new() } ], @@ -469,21 +470,24 @@ mod test { assert_eq!( GlycanStructure::from_str("Hep(Hex(Hex(Hex(Hep),Hex)))").unwrap(), GlycanStructure { - sugar: MonoSaccharide::new(BaseSugar::Heptose(None), &[]), + sugar: MonoSaccharide::new(BaseSugar::Heptose(None), &[]).with_name("Hep"), branches: vec![GlycanStructure { - sugar: MonoSaccharide::new(BaseSugar::Hexose(None), &[]), + sugar: MonoSaccharide::new(BaseSugar::Hexose(None), &[]).with_name("Hex"), branches: vec![GlycanStructure { - sugar: MonoSaccharide::new(BaseSugar::Hexose(None), &[]), + sugar: MonoSaccharide::new(BaseSugar::Hexose(None), &[]).with_name("Hex"), branches: vec![ GlycanStructure { - sugar: MonoSaccharide::new(BaseSugar::Hexose(None), &[]), + sugar: MonoSaccharide::new(BaseSugar::Hexose(None), &[]) + .with_name("Hex"), branches: vec![GlycanStructure { - sugar: MonoSaccharide::new(BaseSugar::Heptose(None), &[]), + sugar: MonoSaccharide::new(BaseSugar::Heptose(None), &[]) + .with_name("Hep"), branches: Vec::new(), }], }, GlycanStructure { - sugar: MonoSaccharide::new(BaseSugar::Hexose(None), &[]), + sugar: MonoSaccharide::new(BaseSugar::Hexose(None), &[]) + .with_name("Hex"), branches: Vec::new(), }, ], diff --git a/src/isobaric_sets.rs b/src/isobaric_sets.rs index 059ad16..374c31e 100644 --- a/src/isobaric_sets.rs +++ b/src/isobaric_sets.rs @@ -94,14 +94,17 @@ pub fn find_isobaric_sets( .iter() .flat_map(|aa| { let mut options = vec![SequenceElement::new(*aa, None)]; - options.extend(modifications.iter().filter_map(|m| { - can_be_placed(m, *aa, 0, 1).then(|| SequenceElement { - aminoacid: *aa, - ambiguous: None, - modifications: vec![m.clone()], - possible_modifications: Vec::new(), - }) - })); + options.extend( + modifications + .iter() + .filter(|&m| can_be_placed(m, *aa, 0, 1)) + .map(|m| SequenceElement { + aminoacid: *aa, + ambiguous: None, + modifications: vec![m.clone()], + possible_modifications: Vec::new(), + }), + ); options }) .map(|s| { @@ -116,14 +119,17 @@ pub fn find_isobaric_sets( .iter() .flat_map(|aa| { let mut options = vec![SequenceElement::new(*aa, None)]; - options.extend(modifications.iter().filter_map(|m| { - can_be_placed(m, *aa, 1, 2).then(|| SequenceElement { - aminoacid: *aa, - ambiguous: None, - modifications: vec![m.clone()], - possible_modifications: Vec::new(), - }) - })); + options.extend( + modifications + .iter() + .filter(|&m| can_be_placed(m, *aa, 1, 2)) + .map(|m| SequenceElement { + aminoacid: *aa, + ambiguous: None, + modifications: vec![m.clone()], + possible_modifications: Vec::new(), + }), + ); options }) .map(|s| { @@ -138,14 +144,17 @@ pub fn find_isobaric_sets( .iter() .flat_map(|aa| { let mut options = vec![SequenceElement::new(*aa, None)]; - options.extend(modifications.iter().filter_map(|m| { - can_be_placed(m, *aa, 1, 1).then(|| SequenceElement { - aminoacid: *aa, - ambiguous: None, - modifications: vec![m.clone()], - possible_modifications: Vec::new(), - }) - })); + options.extend( + modifications + .iter() + .filter(|&m| can_be_placed(m, *aa, 1, 1)) + .map(|m| SequenceElement { + aminoacid: *aa, + ambiguous: None, + modifications: vec![m.clone()], + possible_modifications: Vec::new(), + }), + ); options }) .map(|s| { diff --git a/src/linear_peptide.rs b/src/linear_peptide.rs index ca5a358..e8a5cbd 100644 --- a/src/linear_peptide.rs +++ b/src/linear_peptide.rs @@ -134,11 +134,8 @@ impl LinearPeptide { + self.sequence[index] .possible_modifications .iter() - .filter_map(|am| { - ambiguous_local - .contains(&&am.id) - .then(|| am.modification.formula()) - }) + .filter(|&am| ambiguous_local.contains(&&am.id)) + .map(|am| am.modification.formula()) .sum::() }) .map(|m| { @@ -536,11 +533,8 @@ impl SequenceElement { + self .possible_modifications .iter() - .filter_map(|m| { - selected_ambiguous - .contains(&m.id) - .then(|| m.modification.formula()) - }) + .filter(|&m| selected_ambiguous.contains(&m.id)) + .map(|m| m.modification.formula()) .sum::(), ) } @@ -551,6 +545,7 @@ impl SequenceElement { if self.aminoacid == AminoAcid::B || self.aminoacid == AminoAcid::Z { None } else { + #[allow(clippy::filter_map_bool_then)] // otherwise crashes Some( self.aminoacid.formula() + self diff --git a/src/modification.rs b/src/modification.rs index 56425ee..874b09f 100644 --- a/src/modification.rs +++ b/src/modification.rs @@ -29,10 +29,13 @@ impl Chemical for Modification { .fold(MolecularFormula::default(), |acc, i| { acc + i.0.formula() * i.1 as i16 }), - Self::GlycanStructure(glycan) => glycan.formula(), + Self::GlycanStructure(glycan) | Self::Gno(GnoComposition::Structure(glycan), _) => { + glycan.formula() + } Self::Predefined(formula, _, _, _, _) => formula.clone(), - Self::Gno(GnoComposition::Mass(m), _) => MolecularFormula::with_additional_mass(*m), - Self::Gno(GnoComposition::Structure(glycan), _) => glycan.formula(), + Self::Gno(GnoComposition::Mass(m), _) => { + MolecularFormula::with_additional_mass(m.value) + } } } } diff --git a/src/ontologies.rs b/src/ontologies.rs index 0f5aab1..f71bbff 100644 --- a/src/ontologies.rs +++ b/src/ontologies.rs @@ -1,25 +1,25 @@ use std::sync::OnceLock; -use crate::modification::Modification; +use crate::OntologyList; /// Get the unimod ontology -pub fn unimod_ontology() -> &'static Vec<(usize, String, Modification)> { +pub fn unimod_ontology() -> &'static OntologyList { UNIMOD_CELL.get_or_init(|| { bincode::deserialize(include_bytes!(concat!(env!("OUT_DIR"), "/unimod.dat"))).unwrap() }) } /// Get the PSI-MOD ontology -pub fn psimod_ontology() -> &'static Vec<(usize, String, Modification)> { +pub fn psimod_ontology() -> &'static OntologyList { PSIMOD_CELL.get_or_init(|| { bincode::deserialize(include_bytes!(concat!(env!("OUT_DIR"), "/psimod.dat"))).unwrap() }) } /// Get the Gnome ontology -pub fn gnome_ontology() -> &'static Vec<(usize, String, Modification)> { +pub fn gnome_ontology() -> &'static OntologyList { GNOME_CELL.get_or_init(|| { bincode::deserialize(include_bytes!(concat!(env!("OUT_DIR"), "/gnome.dat"))).unwrap() }) } -static UNIMOD_CELL: OnceLock> = OnceLock::new(); -static PSIMOD_CELL: OnceLock> = OnceLock::new(); -static GNOME_CELL: OnceLock> = OnceLock::new(); +static UNIMOD_CELL: OnceLock = OnceLock::new(); +static PSIMOD_CELL: OnceLock = OnceLock::new(); +static GNOME_CELL: OnceLock = OnceLock::new(); diff --git a/src/pro_forma_parse_tests.rs b/src/pro_forma_parse_tests.rs index 440dcc7..098d454 100644 --- a/src/pro_forma_parse_tests.rs +++ b/src/pro_forma_parse_tests.rs @@ -8,6 +8,10 @@ macro_rules! parse_test { let res = ComplexPeptide::pro_forma($case); println!("{}\n{:?}", $case, res); assert!(res.is_ok()); + // TODO: when ready activate below and start debugging! + // let back = res.as_ref().unwrap().to_string(); + // let res_back = ComplexPeptide::pro_forma(&back); + // assert_eq!(res, res_back, "{} != {back}", $case); } }; } diff --git a/src/shared/glycan.rs b/src/shared/glycan.rs index 76d8e55..7d20d36 100644 --- a/src/shared/glycan.rs +++ b/src/shared/glycan.rs @@ -28,6 +28,16 @@ impl MonoSaccharide { } } + /// Get this same monosaccharide but now with the given pro forma name + #[must_use] + #[allow(dead_code)] + pub fn with_name(self, name: &str) -> Self { + Self { + proforma_name: Some(name.to_string()), + ..self + } + } + /// Set this saccharide up as to be a furanose #[must_use] #[allow(dead_code)] diff --git a/src/shared/modification.rs b/src/shared/modification.rs index 241eaf4..ca47298 100644 --- a/src/shared/modification.rs +++ b/src/shared/modification.rs @@ -29,7 +29,7 @@ pub enum Modification { #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub enum GnoComposition { /// Only the mass is known - Mass(f64), + Mass(Mass), /// The (full) structure is known Structure(GlycanStructure), } diff --git a/src/shared/ontology.rs b/src/shared/ontology.rs index 237b78d..2ea4241 100644 --- a/src/shared/ontology.rs +++ b/src/shared/ontology.rs @@ -18,3 +18,5 @@ impl Ontology { } } } + +pub type OntologyList = Vec<(usize, String, Modification)>;