From 57082b9dd4fcdd5f3087e7fae6ba1bf22953fcdd Mon Sep 17 00:00:00 2001 From: nbollis Date: Tue, 10 Dec 2024 22:35:48 -0600 Subject: [PATCH 1/4] Optimized PpmTolerance and ModFits --- mzLib/MzLibUtil/PpmTolerance.cs | 31 +++++------- .../Modifications/ModificationLocalization.cs | 48 +++++++++---------- 2 files changed, 35 insertions(+), 44 deletions(-) diff --git a/mzLib/MzLibUtil/PpmTolerance.cs b/mzLib/MzLibUtil/PpmTolerance.cs index 7bc88e33d..b355d8f76 100644 --- a/mzLib/MzLibUtil/PpmTolerance.cs +++ b/mzLib/MzLibUtil/PpmTolerance.cs @@ -25,40 +25,35 @@ namespace MzLibUtil /// public class PpmTolerance : Tolerance { + private readonly double _factor; + /// - /// Creates a new tolerance given a unit, value, and whether the tolerance is ± + /// Creates a new tolerance given value /// - /// The units for this tolerance /// The numerical value of the tolerance - public PpmTolerance(double value) + public PpmTolerance(double value) : base(value) { + _factor = value / 1e6; } - public override string ToString() - { - return $"{"±"}{Value.ToString("f4", System.Globalization.CultureInfo.InvariantCulture)} PPM"; - } + public override string ToString() => $"\u00b1{Value.ToString("f4", System.Globalization.CultureInfo.InvariantCulture)} PPM"; public override DoubleRange GetRange(double mean) { - double tol = Value * mean / 1e6; + double tol = _factor * mean; return new DoubleRange(mean - tol, mean + tol); } - public override double GetMinimumValue(double mean) - { - return mean * (1 - (Value / 1e6)); - } + public override double GetMinimumValue(double mean) => mean * (1 - _factor); - public override double GetMaximumValue(double mean) - { - return mean * (1 + (Value / 1e6)); - } + public override double GetMaximumValue(double mean) => mean * (1 + _factor); public override bool Within(double experimental, double theoretical) { - return Math.Abs((experimental - theoretical) / theoretical * 1e6) <= Value; + double diff = experimental - theoretical; + double scaledTolerance = theoretical * _factor; + return -scaledTolerance <= diff && diff <= scaledTolerance; } } -} \ No newline at end of file +} diff --git a/mzLib/Omics/Modifications/ModificationLocalization.cs b/mzLib/Omics/Modifications/ModificationLocalization.cs index bbf25d1a3..fcd43f7f3 100644 --- a/mzLib/Omics/Modifications/ModificationLocalization.cs +++ b/mzLib/Omics/Modifications/ModificationLocalization.cs @@ -5,42 +5,38 @@ public static class ModificationLocalization public static bool ModFits(Modification attemptToLocalize, string sequence, int digestionProductOneBasedIndex, int digestionProductLength, int bioPolymerOneBasedIndex) { // First find the capital letter... - var motif = attemptToLocalize.Target; - var motifStartLocation = motif.ToString().IndexOf(motif.ToString().First(b => char.IsUpper(b))); + var motif = attemptToLocalize.Target.ToString(); + var motifStartLocation = motif.IndexOf(motif.First(char.IsUpper)); // Look up starting at and including the capital letter var proteinToMotifOffset = bioPolymerOneBasedIndex - motifStartLocation - 1; - var indexUp = 0; - while (indexUp < motif.ToString().Length) + var motifLength = motif.Length; + + for (int indexUp = 0; indexUp < motifLength; indexUp++) { - if (indexUp + proteinToMotifOffset < 0 || indexUp + proteinToMotifOffset >= sequence.Length - || !MotifMatches(motif.ToString()[indexUp], sequence[indexUp + proteinToMotifOffset])) + int sequenceIndex = indexUp + proteinToMotifOffset; + if (sequenceIndex < 0 || sequenceIndex >= sequence.Length || !MotifMatches(motif[indexUp], sequence[sequenceIndex])) { return false; } - indexUp++; } - switch (attemptToLocalize.LocationRestriction) - { - case "N-terminal." when bioPolymerOneBasedIndex > 2: - case "Peptide N-terminal." when digestionProductOneBasedIndex > 1: - case "C-terminal." when bioPolymerOneBasedIndex < sequence.Length: - case "Peptide C-terminal." when digestionProductOneBasedIndex < digestionProductLength: - case "5'-terminal." when bioPolymerOneBasedIndex > 2: - // first residue in oligo but not first in nucleic acid - case "Oligo 5'-terminal." when digestionProductOneBasedIndex > 1 - || bioPolymerOneBasedIndex == 1: - case "3'-terminal." when bioPolymerOneBasedIndex < sequence.Length: - // not the last residue in oligo but not in nucleic acid - case "Oligo 3'-terminal." when digestionProductOneBasedIndex < digestionProductLength - || bioPolymerOneBasedIndex == sequence.Length: - return false; - default: + return attemptToLocalize.LocationRestriction switch + { + "N-terminal." when bioPolymerOneBasedIndex > 2 => false, + "Peptide N-terminal." when digestionProductOneBasedIndex > 1 => false, + "C-terminal." when bioPolymerOneBasedIndex < sequence.Length => false, + "Peptide C-terminal." when digestionProductOneBasedIndex < digestionProductLength => false, + "5'-terminal." when bioPolymerOneBasedIndex > 2 => false, + // first residue in oligo but not first in nucleic acid + "Oligo 5'-terminal." when digestionProductOneBasedIndex > 1 || bioPolymerOneBasedIndex == 1 => false, + "3'-terminal." when bioPolymerOneBasedIndex < sequence.Length => false, + // last residue in oligo but not in nucleic acid + "Oligo 3'-terminal." when digestionProductOneBasedIndex < digestionProductLength || bioPolymerOneBasedIndex == sequence.Length => false, // I guess Anywhere. and Unassigned. are true since how do you localize anywhere or unassigned. - - return true; - } + + _ => true, + }; } public static bool UniprotModExists(IBioPolymer bioPolymer, int i, Modification attemptToLocalize) From 13aa7666ba43b1bac8188f3cea166be2838ff8c0 Mon Sep 17 00:00:00 2001 From: nbollis Date: Tue, 10 Dec 2024 22:55:09 -0600 Subject: [PATCH 2/4] built basic initial capacity into several methods --- .../ClassicDeconvolutionAlgorithm.cs | 10 ++++++---- mzLib/Omics/BioPolymerWithSetModsExtensions.cs | 3 ++- .../Modifications/ModificationLocalization.cs | 18 +++++++++++++----- 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/mzLib/MassSpectrometry/Deconvolution/Algorithms/ClassicDeconvolutionAlgorithm.cs b/mzLib/MassSpectrometry/Deconvolution/Algorithms/ClassicDeconvolutionAlgorithm.cs index 8f7bb320b..26fedce39 100644 --- a/mzLib/MassSpectrometry/Deconvolution/Algorithms/ClassicDeconvolutionAlgorithm.cs +++ b/mzLib/MassSpectrometry/Deconvolution/Algorithms/ClassicDeconvolutionAlgorithm.cs @@ -32,7 +32,7 @@ internal override IEnumerable Deconvolute(MzSpectrum spectrumT yield break; } - var isolatedMassesAndCharges = new List(); + var isolatedMassesAndCharges = new List(10); (int start, int end) indexes = ExtractIndices(range.Minimum, range.Maximum); @@ -48,6 +48,7 @@ internal override IEnumerable Deconvolute(MzSpectrum spectrumT //go through each peak in the selected range and assume it is the most intense peak of its isotopic envelope (if it's not, it will hopefully get a low score) //cycle through possible charge states and select the one that has the best score (fit) with the averagine model + HashSet allPossibleChargeStates = new HashSet(); for (int candidateForMostIntensePeakIndex = indexes.start; candidateForMostIntensePeakIndex < indexes.end; candidateForMostIntensePeakIndex++) @@ -61,7 +62,7 @@ internal override IEnumerable Deconvolute(MzSpectrum spectrumT double candidateForMostIntensePeakMz = spectrum.XArray[candidateForMostIntensePeakIndex]; //Find what charge states this peak might be based on the spacing of nearby peaks (assumes isotopic resolution) - HashSet allPossibleChargeStates = new HashSet(); + allPossibleChargeStates.Clear(); for (int i = candidateForMostIntensePeakIndex + 1; i < spectrum.XArray.Length; i++) //look at peaks of higher m/z @@ -169,8 +170,9 @@ private IsotopicEnvelope FindIsotopicEnvelope(int massIndex, double candidateFor double[] theoreticalMasses = allMasses[massIndex]; double[] theoreticalIntensities = allIntensities[massIndex]; //add "most intense peak" - var listOfObservedPeaks = new List<(double, double)> { (candidateForMostIntensePeakMz, candidateForMostIntensePeakIntensity) }; - var listOfRatios = new List { theoreticalIntensities[0] / candidateForMostIntensePeakIntensity }; // theoreticalIntensities and theoreticalMasses are sorted by intensity, so first is most intense + int estimatedSize = theoreticalIntensities.Length; + var listOfObservedPeaks = new List<(double, double)>(estimatedSize) { (candidateForMostIntensePeakMz, candidateForMostIntensePeakIntensity) }; + var listOfRatios = new List(estimatedSize) { theoreticalIntensities[0] / candidateForMostIntensePeakIntensity }; // theoreticalIntensities and theoreticalMasses are sorted by intensity, so first is most intense // Assuming the test peak is most intense... // Try to find the rest of the isotopes! double differenceBetweenTheorAndActualMass = testMostIntenseMass - theoreticalMasses[0]; //mass difference actual-theoretical for the tallest peak (not necessarily the monoisotopic) diff --git a/mzLib/Omics/BioPolymerWithSetModsExtensions.cs b/mzLib/Omics/BioPolymerWithSetModsExtensions.cs index 2e5d29718..1a7193057 100644 --- a/mzLib/Omics/BioPolymerWithSetModsExtensions.cs +++ b/mzLib/Omics/BioPolymerWithSetModsExtensions.cs @@ -112,7 +112,8 @@ public static string EssentialSequence(this IBioPolymerWithSetMods withSetMods, /// public static string DetermineFullSequence(this IBioPolymerWithSetMods withSetMods) { - var subSequence = new StringBuilder(); + // start string builder with initial capacity to avoid resizing costs. + var subSequence = new StringBuilder(withSetMods.BaseSequence.Length + withSetMods.AllModsOneIsNterminus.Count * 20); // modification on peptide N-terminus if (withSetMods.AllModsOneIsNterminus.TryGetValue(1, out Modification mod)) diff --git a/mzLib/Omics/Modifications/ModificationLocalization.cs b/mzLib/Omics/Modifications/ModificationLocalization.cs index fcd43f7f3..692e7a745 100644 --- a/mzLib/Omics/Modifications/ModificationLocalization.cs +++ b/mzLib/Omics/Modifications/ModificationLocalization.cs @@ -52,11 +52,19 @@ public static bool UniprotModExists(IBioPolymer bioPolymer, int i, Modification private static bool MotifMatches(char motifChar, char sequenceChar) { char upperMotifChar = char.ToUpper(motifChar); - return upperMotifChar.Equals('X') - || upperMotifChar.Equals(sequenceChar) - || upperMotifChar.Equals('B') && new[] { 'D', 'N' }.Contains(sequenceChar) - || upperMotifChar.Equals('J') && new[] { 'I', 'L' }.Contains(sequenceChar) - || upperMotifChar.Equals('Z') && new[] { 'E', 'Q' }.Contains(sequenceChar); + switch (upperMotifChar) + { + case 'X': + return true; + case 'B': + return sequenceChar is 'D' or 'N'; + case 'J': + return sequenceChar is 'I' or 'L'; + case 'Z': + return sequenceChar is 'E' or 'Q'; + default: + return upperMotifChar == sequenceChar; + } } } } \ No newline at end of file From 191b4c976cf72f7b60ed546a39688dc254e37c9b Mon Sep 17 00:00:00 2001 From: nbollis Date: Tue, 10 Dec 2024 23:06:50 -0600 Subject: [PATCH 3/4] Optimized digestion product get modificaiton paterns --- mzLib/Omics/Digestion/DigestionProduct.cs | 41 +++++++++---------- .../Modifications/ModificationLocalization.cs | 1 + 2 files changed, 20 insertions(+), 22 deletions(-) diff --git a/mzLib/Omics/Digestion/DigestionProduct.cs b/mzLib/Omics/Digestion/DigestionProduct.cs index 55aed3255..5dcf59d0c 100644 --- a/mzLib/Omics/Digestion/DigestionProduct.cs +++ b/mzLib/Omics/Digestion/DigestionProduct.cs @@ -52,8 +52,17 @@ protected static IEnumerable> GetVariableModificat var possible_variable_modifications = new Dictionary>(possibleVariableModifications); int[] base_variable_modification_pattern = new int[peptideLength + 4]; - var totalAvailableMods = possible_variable_modifications.Sum(b => b.Value == null ? 0 : b.Value.Count); - for (int variable_modifications = 0; variable_modifications <= Math.Min(totalAvailableMods, maxModsForPeptide); variable_modifications++) + int totalAvailableMods = 0; + foreach (var kvp in possible_variable_modifications) + { + if (kvp.Value != null) + { + totalAvailableMods += kvp.Value.Count; + } + } + + int maxVariableMods = Math.Min(totalAvailableMods, maxModsForPeptide); + for (int variable_modifications = 0; variable_modifications <= maxVariableMods; variable_modifications++) { foreach (int[] variable_modification_pattern in GetVariableModificationPatterns(new List>>(possible_variable_modifications), possible_variable_modifications.Count - variable_modifications, base_variable_modification_pattern, 0)) @@ -77,17 +86,12 @@ protected Dictionary GetFixedModsOneIsNorFivePrimeTerminus(in case "N-terminal.": case "Peptide N-terminal.": //the modification is protease associated and is applied to the n-terminal cleaved residue, not at the beginign of the protein - if (mod.ModificationType == "Protease" && ModificationLocalization.ModFits(mod, Parent.BaseSequence, 1, length, OneBasedStartResidue)) + if (ModificationLocalization.ModFits(mod, Parent.BaseSequence, 1, length, OneBasedStartResidue)) { - if (OneBasedStartResidue != 1) - { + if (mod.ModificationType == "Protease" && OneBasedStartResidue != 1) fixedModsOneIsNterminus[2] = mod; - } - } - //Normal N-terminal peptide modification - else if (ModificationLocalization.ModFits(mod, Parent.BaseSequence, 1, length, OneBasedStartResidue)) - { - fixedModsOneIsNterminus[1] = mod; + else //Normal N-terminal peptide modification + fixedModsOneIsNterminus[1] = mod; } break; @@ -106,17 +110,12 @@ protected Dictionary GetFixedModsOneIsNorFivePrimeTerminus(in case "C-terminal.": case "Peptide C-terminal.": //the modification is protease associated and is applied to the c-terminal cleaved residue, not if it is at the end of the protein - if (mod.ModificationType == "Protease" && ModificationLocalization.ModFits(mod, Parent.BaseSequence, length, length, OneBasedStartResidue + length - 1)) + if (ModificationLocalization.ModFits(mod, Parent.BaseSequence, length, length, OneBasedStartResidue + length - 1)) { - if (OneBasedEndResidue != Parent.Length) - { + if (mod.ModificationType == "Protease" && OneBasedEndResidue != Parent.Length) fixedModsOneIsNterminus[length + 1] = mod; - } - } - //Normal C-terminal peptide modification - else if (ModificationLocalization.ModFits(mod, Parent.BaseSequence, length, length, OneBasedStartResidue + length - 1)) - { - fixedModsOneIsNterminus[length + 2] = mod; + else //Normal C-terminal peptide modification + fixedModsOneIsNterminus[length + 2] = mod; } break; @@ -188,7 +187,5 @@ private static Dictionary GetNewVariableModificationPattern(i return modification_pattern; } - - } } diff --git a/mzLib/Omics/Modifications/ModificationLocalization.cs b/mzLib/Omics/Modifications/ModificationLocalization.cs index 692e7a745..adb7b6c48 100644 --- a/mzLib/Omics/Modifications/ModificationLocalization.cs +++ b/mzLib/Omics/Modifications/ModificationLocalization.cs @@ -2,6 +2,7 @@ { public static class ModificationLocalization { + // This method is called a ton in MetaMorpheus. If changes are made, ensure they are efficient. public static bool ModFits(Modification attemptToLocalize, string sequence, int digestionProductOneBasedIndex, int digestionProductLength, int bioPolymerOneBasedIndex) { // First find the capital letter... From 532aaa762c8ccea256be8bb994de5371d638e4b4 Mon Sep 17 00:00:00 2001 From: nbollis Date: Tue, 10 Dec 2024 23:16:27 -0600 Subject: [PATCH 4/4] Adjusted Protease Specific fixed mod call --- mzLib/Omics/Digestion/DigestionProduct.cs | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/mzLib/Omics/Digestion/DigestionProduct.cs b/mzLib/Omics/Digestion/DigestionProduct.cs index 5dcf59d0c..3a473e680 100644 --- a/mzLib/Omics/Digestion/DigestionProduct.cs +++ b/mzLib/Omics/Digestion/DigestionProduct.cs @@ -85,11 +85,15 @@ protected Dictionary GetFixedModsOneIsNorFivePrimeTerminus(in case "Oligo 5'-terminal.": case "N-terminal.": case "Peptide N-terminal.": + //the modification is protease associated and is applied to the n-terminal cleaved residue, not at the beginign of the protein if (ModificationLocalization.ModFits(mod, Parent.BaseSequence, 1, length, OneBasedStartResidue)) { - if (mod.ModificationType == "Protease" && OneBasedStartResidue != 1) - fixedModsOneIsNterminus[2] = mod; + if (mod.ModificationType == "Protease") + { + if (OneBasedStartResidue != 1) + fixedModsOneIsNterminus[2] = mod; + } else //Normal N-terminal peptide modification fixedModsOneIsNterminus[1] = mod; } @@ -112,8 +116,11 @@ protected Dictionary GetFixedModsOneIsNorFivePrimeTerminus(in //the modification is protease associated and is applied to the c-terminal cleaved residue, not if it is at the end of the protein if (ModificationLocalization.ModFits(mod, Parent.BaseSequence, length, length, OneBasedStartResidue + length - 1)) { - if (mod.ModificationType == "Protease" && OneBasedEndResidue != Parent.Length) - fixedModsOneIsNterminus[length + 1] = mod; + if (mod.ModificationType == "Protease") + { + if (OneBasedEndResidue != Parent.Length) + fixedModsOneIsNterminus[length + 1] = mod; + } else //Normal C-terminal peptide modification fixedModsOneIsNterminus[length + 2] = mod; }