diff --git a/mzLib/Chemistry/IsotopicDistribution.cs b/mzLib/Chemistry/IsotopicDistribution.cs index ebb723b53..68e52466a 100644 --- a/mzLib/Chemistry/IsotopicDistribution.cs +++ b/mzLib/Chemistry/IsotopicDistribution.cs @@ -56,7 +56,29 @@ private IsotopicDistribution(int count) intensities = new double[count]; } - // Clone() produces shallow copies, but because double is a primitive type, this is acceptable
+        /// <summary>
+        /// Mass of the peak with the greatest intensity. When several peaks share the exact maximum,
+        /// the one with the lowest index is returned. Returns NaN when the distribution holds no peaks.
+        /// </summary>
+        public double MostAbundantMass
+        {
+            get
+            {
+                int mostAbundantIndex = -1;
+                double maxIntensity = double.NegativeInfinity;
+                for (int i = 0; i < masses.Length; i++)
+                {
+                    if (intensities[i] > maxIntensity)
+                    {
+                        maxIntensity = intensities[i];
+                        mostAbundantIndex = i;
+                    }
+                }
+                // Track the arg-max directly: a tolerance match could return a peak that merely sits close to the maximum
+                return mostAbundantIndex >= 0 ? masses[mostAbundantIndex] : Double.NaN;
+            }
+        }
+
 public double MonoIsotopicMass => masses[0]; public double[] Masses => (double[]) masses.Clone(); public double[] Intensities => (double[]) intensities.Clone(); diff --git a/mzLib/MassSpectrometry/Deconvolution/Algorithms/ClassicDeconvolutionAlgorithm.cs b/mzLib/MassSpectrometry/Deconvolution/Algorithms/ClassicDeconvolutionAlgorithm.cs index 5dd92db7a..5edc178ad 100644 --- a/mzLib/MassSpectrometry/Deconvolution/Algorithms/ClassicDeconvolutionAlgorithm.cs +++ b/mzLib/MassSpectrometry/Deconvolution/Algorithms/ClassicDeconvolutionAlgorithm.cs @@ -27,7 +27,12 @@ public ClassicDeconvolutionAlgorithm(DeconvolutionParameters deconParameters) : /// public override IEnumerable Deconvolute(MzSpectrum spectrumToDeconvolute, MzRange range) { - var deconParams = DeconvolutionParameters as ClassicDeconvolutionParameters ?? 
throw new MzLibException("Deconvolution params and algorithm do not match"); + var deconParams = DeconvolutionParameters as ClassicDeconvolutionParameters; + if (deconParams == null) + { + throw new MzLibException("Deconvolution params and algorithm do not match"); + } + spectrum = spectrumToDeconvolute; //if no peaks, stop if (spectrum.Size == 0) @@ -175,7 +180,9 @@ public override IEnumerable Deconvolute(MzSpectrum spectrumToD } } - private IsotopicEnvelope FindIsotopicEnvelope(int massIndex, double candidateForMostIntensePeakMz, double candidateForMostIntensePeakIntensity, double testMostIntenseMass, int chargeState, double deconvolutionTolerancePpm, double intensityRatioLimit, List monoisotopicMassPredictions) + private IsotopicEnvelope FindIsotopicEnvelope(int massIndex, double candidateForMostIntensePeakMz, double candidateForMostIntensePeakIntensity, + double testMostIntenseMass, int chargeState, double deconvolutionTolerancePpm, double intensityRatioLimit, + List monoisotopicMassPredictions) { double[] theoreticalMasses = allMasses[massIndex]; double[] theoreticalIntensities = allIntensities[massIndex]; @@ -216,7 +223,9 @@ private IsotopicEnvelope FindIsotopicEnvelope(int massIndex, double candidateFor return new IsotopicEnvelope(listOfObservedPeaks, monoisotopicMass, chargeState, totalIntensity, Statistics.StandardDeviation(listOfRatios), massIndex); } - private int ObserveAdjacentChargeStates(IsotopicEnvelope originalEnvelope, double mostIntensePeakMz, int massIndex, double deconvolutionTolerancePpm, double intensityRatioLimit, double minChargeToLookFor, double maxChargeToLookFor, List monoisotopicMassPredictions) + private int ObserveAdjacentChargeStates(IsotopicEnvelope originalEnvelope, double mostIntensePeakMz, int massIndex, + double deconvolutionTolerancePpm, double intensityRatioLimit, double minChargeToLookFor, double maxChargeToLookFor, + List monoisotopicMassPredictions) { //look for the higher and lower charge states using the proposed mass int 
numAdjacentChargeStatesObserved = 0; @@ -251,7 +260,8 @@ private int ObserveAdjacentChargeStates(IsotopicEnvelope originalEnvelope, doubl return numAdjacentChargeStatesObserved; } - private bool FindChargeStateOfMass(IsotopicEnvelope originalEnvelope, int zToInvestigate, double mostAbundantNeutralIsotopeToInvestigate, int massIndex, double deconvolutionTolerancePpm, double intensityRatioLimit, List monoisotopicMassPredictions) + private bool FindChargeStateOfMass(IsotopicEnvelope originalEnvelope, int zToInvestigate, double mostAbundantNeutralIsotopeToInvestigate, int massIndex, + double deconvolutionTolerancePpm, double intensityRatioLimit, List monoisotopicMassPredictions) { //we know the mass and the charge that we're looking for, just see if the expected m/z and its isotopes are there or not double mostAbundantIsotopeMzForThisZTheoretical = mostAbundantNeutralIsotopeToInvestigate.ToMz(zToInvestigate); diff --git a/mzLib/MassSpectrometry/Deconvolution/Algorithms/SpectralDeconvolutionAlgorithm.cs b/mzLib/MassSpectrometry/Deconvolution/Algorithms/SpectralDeconvolutionAlgorithm.cs new file mode 100644 index 000000000..3470971f0 --- /dev/null +++ b/mzLib/MassSpectrometry/Deconvolution/Algorithms/SpectralDeconvolutionAlgorithm.cs @@ -0,0 +1,281 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using Chemistry; +using Easy.Common.Extensions; +using MassSpectrometry.Deconvolution; +using MassSpectrometry.Deconvolution.Scoring; +using Proteomics; +using Proteomics.ProteolyticDigestion; +using MzLibUtil; + +namespace MassSpectrometry.Deconvolution.Algorithms +{ + public class SpectralDeconvolutionAlgorithm : DeconvolutionAlgorithm + { + // TODO: Make a charge state envelope class, complete with "MostAbundantChargeState" + public Dictionary> EnvelopeDictionary { get; private set; } + + // Consider defining this as a jagged array to increase performance + public List[,] IndexedLibrarySpectra { get; 
private set; }
+        // SpectrumIndexToPwsmMap maps the location of each spectrum within IndexedLibrarySpectra to its respective PeptideWithSetMods and charge
+        public Dictionary<(int, int, int), PeptideWithSetModifications> SpectrumIndexToPwsmMap { get; private set; }
+        public int MaxThreads; // This should be in the Parameters abstract
+        public SpectralDeconvolutionParameters SpectralParams { get; }
+        public PpmTolerance PpmTolerance { get; }
+        public Scorer Scorer { get; }
+
+        /// <summary>
+        /// Builds the library envelopes and their (mass bin, charge bin) index, then creates the scorer.
+        /// </summary>
+        /// <param name="parameters">Must be a SpectralDeconvolutionParameters instance</param>
+        /// <exception cref="MzLibException">Thrown when the parameters are of any other type</exception>
+        public SpectralDeconvolutionAlgorithm(DeconvolutionParameters parameters) : base(parameters)
+        {
+            if (DeconvolutionParameters is not SpectralDeconvolutionParameters deconvolutionParameters)
+            {
+                throw new MzLibException(
+                    "Improper Deconvolution Parameters were passed to the SpectralDeconvolutionAlgorithm");
+            }
+
+            SpectralParams = deconvolutionParameters;
+            PpmTolerance = new PpmTolerance(parameters.DeconvolutionTolerancePpm);
+
+            // NOTE(review): the index is built once, from the ScanRange held by the parameters at this moment.
+            // ScanRange has a public setter, so confirm whether reassigning it later requires a re-index.
+            FindLibraryEnvelopes();
+            IndexEnvelopes();
+            Scorer = new Scorer(Scorer.ScoringMethods.SpectralContrastAngle, PpmTolerance);
+        }
+
+        /// <summary>
+        /// Finds runs of peaks spaced like isotopic envelopes, looks up the library spectra stored in the
+        /// same (mass bin, charge bin) and yields an envelope for the best scoring library match of each run.
+        /// Runs whose most abundant peak falls outside the indexed range, or in an empty bin, yield nothing.
+        /// </summary>
+        /// <param name="spectrum">Spectrum to deconvolute; null or empty yields no envelopes</param>
+        public override IEnumerable<IsotopicEnvelope> Deconvolute(MzSpectrum spectrum)
+        {
+            if (spectrum == null || spectrum.Size == 0)
+            {
+                yield break;
+            }
+
+            // For each charge state (key) store every potential isotopic envelope (value)
+            Dictionary<int, List<MinimalSpectrum>> potentialEnvelopes = FindPotentialEnvelopes(spectrum);
+
+            // iterate through charge states (potentially not necessary/performant. Could flatten)
+            foreach (var keyValuePair in potentialEnvelopes)
+            {
+                int chargeBinIndex = keyValuePair.Key - SpectralParams.MinAssumedChargeState;
+                if (chargeBinIndex < 0 || chargeBinIndex >= IndexedLibrarySpectra.GetLength(1))
+                {
+                    continue;
+                }
+
+                // iterate through potential envelopes
+                foreach (var experimentalSpectrum in keyValuePair.Value)
+                {
+                    double mostAbundantMz = experimentalSpectrum.MostAbundantMz;
+                    int massBinIndex = (int)Math.Floor((mostAbundantMz - SpectralParams.ScanRange.Minimum) *
+                                                       SpectralParams.BinsPerDalton);
+                    // Observed peaks are not restricted to the indexed scan range; skip instead of indexing out of bounds
+                    if (massBinIndex < 0 || massBinIndex >= IndexedLibrarySpectra.GetLength(0))
+                    {
+                        continue;
+                    }
+
+                    List<MinimalSpectrum> librarySpectra = IndexedLibrarySpectra[massBinIndex, chargeBinIndex];
+                    if (librarySpectra == null || librarySpectra.Count == 0)
+                    {
+                        continue; // no corresponding library spectra
+                    }
+
+                    int? bestMatchListPosition = null;
+                    double bestFoundScore = Scorer.PoorScore;
+                    // Score against matching theoretical envelopes
+                    for (int listPosition = 0; listPosition < librarySpectra.Count; listPosition++)
+                    {
+                        // TODO: Rename to FindBestScore
+                        if (Scorer.TestForScoreImprovement(
+                                Scorer.Score(experimentalSpectrum, librarySpectra[listPosition]),
+                                bestFoundScore,
+                                out double betterScore))
+                        {
+                            bestMatchListPosition = listPosition;
+                            bestFoundScore = betterScore;
+                        }
+                    }
+
+                    if (bestMatchListPosition.HasValue &&
+                        SpectrumIndexToPwsmMap.TryGetValue((massBinIndex, chargeBinIndex, bestMatchListPosition.Value), out var pwsmMatch))
+                    {
+                        yield return new IsotopicEnvelope(experimentalSpectrum, pwsmMatch, bestFoundScore);
+                    }
+                    // TODO: fall back to an averagine-based envelope when no library spectrum matches
+                }
+            }
+        }
+
+        /// <summary>
+        /// Populates the EnvelopeDictionary by digesting each protein in the parameters into PeptideWithSetMods,
+        /// then calculating an isotopic envelope for each charge state from min to max assumed charge state
+        /// </summary>
+        private void FindLibraryEnvelopes()
+        {
+            EnvelopeDictionary = new();
+
+
+            //TODO: Parallelize this section of the code
+            foreach (Protein protein in SpectralParams.Proteins)
+            {
+                // I'm not sure if calling protein.Digest within the foreach statement would call the method anew for every 
loop
+                IEnumerable<PeptideWithSetModifications> uniquePeptides = protein.Digest(
+                    SpectralParams.DigestionParams, SpectralParams.FixedModifications,
+                    SpectralParams.VariableModifications, SpectralParams.SilacLabels,
+                    topDownTruncationSearch: SpectralParams.FindTopDownTruncationProducts);
+
+                foreach (PeptideWithSetModifications pwsm in uniquePeptides)
+                {
+                    List<IsotopicEnvelope> envelopesForPeptide = new();
+                    EnvelopeDictionary.Add(pwsm, envelopesForPeptide);
+                    IsotopicDistribution pwsmDistribution = IsotopicDistribution.GetDistribution(pwsm.FullChemicalFormula,
+                        fineResolution: SpectralParams.FineResolutionForIsotopicDistribution,
+                        minProbability: SpectralParams.MinProbabilityForIsotopicDistribution);
+
+                    // iterates through all possible charge states, from largest to smallest.
+                    // Any isotopic envelope whose most abundant peak would fall within the scan range is written to the envelope dictionary
+                    // Once the mass to charge ratio of the most abundant peak is greater than the scan range maximum, the loop breaks
+                    for (int charge = SpectralParams.MaxAssumedChargeState;
+                         charge >= SpectralParams.MinAssumedChargeState;
+                         charge--)
+                    {
+                        // NOTE(review): the mass is read from pwsm although pwsmDistribution was just computed — confirm both agree
+                        double theoreticalMz = pwsm.MostAbundantMass.ToMz(charge);
+                        if (SpectralParams.ScanRange.Contains(theoreticalMz))
+                        {
+                            envelopesForPeptide.Add(new
+                                IsotopicEnvelope(pwsmDistribution, charge, SpectralParams.AmbiguityThresholdForIsotopicDistribution));
+                        }
+                        else if (SpectralParams.ScanRange.CompareTo(theoreticalMz) < 0)
+                        {
+                            break;
+                        }
+                    }
+                }
+
+            }
+        }
+
+        /// <summary>
+        /// For each envelope in Envelope Dictionary, indexes it according to mass and charge,
+        /// resulting in a 2D array of lists of minimal spectra
+        /// </summary>
+        private void IndexEnvelopes()
+        {
+            // One extra bin keeps an m/z sitting exactly on ScanRange.Maximum inside the array
+            int numberOfBinsForIndexing = (int) (SpectralParams.ScanRange.Width * SpectralParams.BinsPerDalton).Ceiling(0) + 1;
+            IndexedLibrarySpectra = new List<MinimalSpectrum>[numberOfBinsForIndexing,
+                SpectralParams.MaxAssumedChargeState + 1 - SpectralParams.MinAssumedChargeState];
+            SpectrumIndexToPwsmMap = new();
+
+            foreach (var keyValuePair in EnvelopeDictionary)
+            {
+                foreach (IsotopicEnvelope envelope in keyValuePair.Value)
+                {
+                    int massBinIndex = GetMassBinIndex(envelope.MostAbundantObservedIsotopicMz);
+                    int chargeBinIndex = envelope.Charge - SpectralParams.MinAssumedChargeState;
+                    MinimalSpectrum envelopeMinimalSpectrum = new MinimalSpectrum(envelope.MzArray, envelope.IntensityArray, envelope.Charge);
+                    AddSpectrumToIndex(massBinIndex, chargeBinIndex, envelopeMinimalSpectrum, keyValuePair.Key);
+
+                    // In situations where the most abundant isotope frequency is close to the second most abundant isotope's frequency
+                    // ( ratio >= IsotopicEnvelope.AmbiguityRatioMinimum),
+                    // The Spectrum is stored in the index of the second most abundant isotope as well
+                    if (envelope.SecondMostAbundantObservedIsotopicMz > 0)
+                    {
+                        int secondBinIndex = GetMassBinIndex(envelope.SecondMostAbundantObservedIsotopicMz);
+                        if (secondBinIndex != massBinIndex)
+                        {
+                            AddSpectrumToIndex(secondBinIndex, chargeBinIndex, envelopeMinimalSpectrum, keyValuePair.Key);
+                        }
+                    }
+                }
+            }
+        }
+
+        // Maps an m/z onto its mass bin; floors so it agrees with the bin computed at lookup time in Deconvolute
+        private int GetMassBinIndex(double mz)
+        {
+            return (int)Math.Floor((mz - SpectralParams.ScanRange.Minimum) * SpectralParams.BinsPerDalton);
+        }
+
+        // Stores a library spectrum in one bin and records which peptide owns that list position.
+        // Bins outside the indexed array (e.g. a second-most-abundant peak beyond the scan range) are ignored.
+        private void AddSpectrumToIndex(int massBinIndex, int chargeBinIndex, MinimalSpectrum librarySpectrum,
+            PeptideWithSetModifications pwsm)
+        {
+            if (massBinIndex < 0 || massBinIndex >= IndexedLibrarySpectra.GetLength(0) ||
+                chargeBinIndex < 0 || chargeBinIndex >= IndexedLibrarySpectra.GetLength(1))
+            {
+                return;
+            }
+
+            IndexedLibrarySpectra[massBinIndex, chargeBinIndex] ??= new();
+            List<MinimalSpectrum> bin = IndexedLibrarySpectra[massBinIndex, chargeBinIndex];
+            bin.Add(librarySpectrum);
+            // key: bin index (mass, charge) and list position of the MinimalSpectrum; value: the peptide it came from
+            SpectrumIndexToPwsmMap.Add((massBinIndex, chargeBinIndex, bin.Count - 1), pwsm);
+        }
+
+        /// 
+        /// Iterates through all peaks in a spectrum to find all potential isotopic envelopes. 
+        /// It does this by examining the spacing of peaks in the m/z domain
+        /// e.g. for charge of 2, a peak at 200 m/z would result in a search for a peak at 200.5 and 201 m/z
+        /// if either is found, the process continues until SpectralParams.MaxConsecutiveMissedIsotopicPeaks number of consecutive
+        /// isotope peaks are missed
+        /// Anything consistent with an isotopic envelope in a given charge state is stored in the dictionary
+        /// 
+        /// 
+        /// 
+        private Dictionary<int, List<MinimalSpectrum>> FindPotentialEnvelopes(MzSpectrum spectrum)
+        {
+            // For each charge state (key) store every potential isotopic envelope (value)
+            Dictionary<int, List<MinimalSpectrum>> potentialEnvelopes = new();
+            int maxIsotopeSteps = 1 + SpectralParams.MaxConsecutiveMissedIsotopicPeaks;
+
+            for (int charge = SpectralParams.MinAssumedChargeState; charge <= SpectralParams.MaxAssumedChargeState; charge++)
+            {
+                // Peaks already assigned to an envelope at this charge; a set keeps the membership test O(1)
+                HashSet<int> indicesOfKnownPeaks = new();
+
+                // Spectrum Search Loop
+                for (int i = 0; i < spectrum.Size; i++)
+                {
+                    if (indicesOfKnownPeaks.Contains(i))
+                    {
+                        continue;
+                    }
+                    List<int> envelopeIndices = new() { i };
+
+                    // Envelope Search Loop
+                    for (int j = i + 1; j < spectrum.Size; j++)
+                    {
+                        double lastEnvelopeMz = spectrum.XArray[envelopeIndices[^1]];
+
+                        // Accept the next isotope, or one further along when up to
+                        // MaxConsecutiveMissedIsotopicPeaks intermediate isotopes are absent
+                        bool extendsEnvelope = false;
+                        for (int isotopeStep = 1; isotopeStep <= maxIsotopeSteps && !extendsEnvelope; isotopeStep++)
+                        {
+                            extendsEnvelope = PpmTolerance.Within(spectrum.XArray[j],
+                                lastEnvelopeMz + isotopeStep * Constants.C13MinusC12 / charge);
+                        }
+
+                        if (extendsEnvelope)
+                        {
+                            envelopeIndices.Add(j);
+                        }
+                        else if (spectrum.XArray[j] > PpmTolerance.GetMaximumValue(lastEnvelopeMz +
+                                     maxIsotopeSteps * Constants.C13MinusC12 / charge))
+                        {
+                            // exit the Envelope loop if we missed more consecutive isotopic peaks than were allowed
+                            break;
+                        }
+                    }
+
+                    // A lone peak is not an envelope
+                    if (envelopeIndices.Count > 1)
+                    {
+                        if (!potentialEnvelopes.TryGetValue(charge, out List<MinimalSpectrum> envelopesForCharge))
+                        {
+                            envelopesForCharge = new();
+                            potentialEnvelopes.Add(charge, envelopesForCharge);
+                        }
+                        envelopesForCharge.Add(GetMinimalSpectrumFromIndices(spectrum, envelopeIndices, charge));
+                        indicesOfKnownPeaks.UnionWith(envelopeIndices);
+                    }
+                }
+            }
+
+            return potentialEnvelopes;
+        }
+
+
+        // Copies the m/z and intensity values at the given peak indices into a new MinimalSpectrum
+        private static MinimalSpectrum GetMinimalSpectrumFromIndices(MzSpectrum spectrum, List<int> indices, int charge = 0)
+        {
+            double[] mzArray = new double[indices.Count];
+            double[] intensityArray = new double[indices.Count];
+            for (int i = 0; i < indices.Count; i++)
+            {
+                mzArray[i] = spectrum.XArray[indices[i]];
+                intensityArray[i] = spectrum.YArray[indices[i]];
+            }
+
+            return new MinimalSpectrum(mzArray, intensityArray, charge);
+        }
+
+    }
+}
diff --git a/mzLib/MassSpectrometry/Deconvolution/Deconvoluter.cs b/mzLib/MassSpectrometry/Deconvolution/Deconvoluter.cs index 70977ebe3..0e2ed3f2e 100644 --- a/mzLib/MassSpectrometry/Deconvolution/Deconvoluter.cs +++ b/mzLib/MassSpectrometry/Deconvolution/Deconvoluter.cs @@ -4,6 +4,7 @@ using System.Text; using System.Threading.Tasks; using Easy.Common.Extensions; +using MassSpectrometry.Deconvolution.Algorithms; using MzLibUtil; namespace MassSpectrometry @@ -11,7 +12,7 @@ namespace MassSpectrometry public enum DeconvolutionTypes { ClassicDeconvolution, - AlexDeconvolution, + SpectralDeconvolution, } /// @@ -44,9 +45,13 @@ public IEnumerable Deconvolute(MsDataScan scan, MzRange rangeT switch (DeconvolutionType) { case DeconvolutionTypes.ClassicDeconvolution: + ((ClassicDeconvolutionParameters)DeconvolutionParameters).Range = + new MzRange(scan.IsolationRange.Minimum - 8.5, scan.IsolationRange.Maximum + 8.5); break; - case DeconvolutionTypes.AlexDeconvolution: + case DeconvolutionTypes.SpectralDeconvolution: + ((SpectralDeconvolutionParameters)DeconvolutionParameters).ScanRange = + new MzRange(scan.IsolationRange.Minimum - 8.5, scan.IsolationRange.Maximum + 8.5); break; } @@ -69,8 +74,8 @@ private void 
ConstructDeconvolutionAlgorithm(DeconvolutionParameters deconParame DeconvolutionAlgorithm = new ClassicDeconvolutionAlgorithm(deconParameters); break; - case DeconvolutionTypes.AlexDeconvolution: - DeconvolutionAlgorithm = new ExampleNewDeconvolutionAlgorithm(deconParameters); + case DeconvolutionTypes.SpectralDeconvolution: + DeconvolutionAlgorithm = new SpectralDeconvolutionAlgorithm(deconParameters); break; default: throw new MzLibException("DeconvolutionType not yet supported"); diff --git a/mzLib/MassSpectrometry/Deconvolution/DeconvoluterExtensions.cs b/mzLib/MassSpectrometry/Deconvolution/DeconvoluterExtensions.cs index 75723b1ec..1b15858d5 100644 --- a/mzLib/MassSpectrometry/Deconvolution/DeconvoluterExtensions.cs +++ b/mzLib/MassSpectrometry/Deconvolution/DeconvoluterExtensions.cs @@ -34,5 +34,22 @@ public static IEnumerable ClassicDeconvoluteMzSpectra(this Dec return deconvoluter.DeconvolutionAlgorithm.Deconvolute(spectrum, range); } }
+
+        /// <summary>
+        /// Deconvolutes a bare spectrum with a spectral deconvoluter: stores the supplied range as the
+        /// parameters' ScanRange, then runs the deconvoluter's algorithm on the spectrum.
+        /// </summary>
+        public static IEnumerable SpectralDeconvoluteMzSpectra(this Deconvoluter deconvoluter,
+            MzSpectrum spectrum, MzRange range)
+        {
+            if (deconvoluter.DeconvolutionType != DeconvolutionTypes.SpectralDeconvolution)
+            {
+                throw new MzLibException("Deconvoluter is not of correct type for this extension method");
+            }
+
+            var spectralParameters = (SpectralDeconvolutionParameters)deconvoluter.DeconvolutionParameters;
+            spectralParameters.ScanRange = range;
+            return deconvoluter.DeconvolutionAlgorithm.Deconvolute(spectrum);
+        }
 } } diff --git a/mzLib/MassSpectrometry/Deconvolution/MinimalSpectrum.cs b/mzLib/MassSpectrometry/Deconvolution/MinimalSpectrum.cs new file mode 100644 index 000000000..514ed90ba --- /dev/null +++ b/mzLib/MassSpectrometry/Deconvolution/MinimalSpectrum.cs @@ -0,0 +1,65 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace MassSpectrometry.Deconvolution +{ + // Consider defining this as a struct to increase performance + public class MinimalSpectrum + { + 
public readonly double[] MzArray;
+        public readonly double[] IntensityArray;
+        public readonly double MostAbundantMz;
+        public readonly int Charge;
+
+        /// <summary>
+        /// Stores the given arrays (by reference, not copied) and records the m/z of the most intense peak.
+        /// </summary>
+        /// <param name="mzArray">m/z values</param>
+        /// <param name="intensityArray">Intensities, index-aligned with mzArray</param>
+        /// <param name="charge">Charge state, 0 when not assigned</param>
+        /// <exception cref="ArgumentNullException">Thrown when either array is null</exception>
+        public MinimalSpectrum(double[] mzArray, double[] intensityArray, int charge = 0)
+        {
+            MzArray = mzArray ?? throw new ArgumentNullException(nameof(mzArray));
+            IntensityArray = intensityArray ?? throw new ArgumentNullException(nameof(intensityArray));
+            MostAbundantMz = GetMostAbundantMz(mzArray, intensityArray);
+            Charge = charge;
+        }
+
+        /// <summary>
+        /// Returns a copy of the m/z array
+        /// </summary>
+        internal double[] GetMzs()
+        {
+            return (double[])MzArray.Clone();
+        }
+
+        /// <summary>
+        /// Returns a copy of the intensity array
+        /// </summary>
+        internal double[] GetIntensities()
+        {
+            // Copy from IntensityArray; copying MzArray here would hand back m/z values as intensities
+            return (double[])IntensityArray.Clone();
+        }
+
+        /// <summary>
+        /// Returns the charge, or 0 if charge was not assigned
+        /// </summary>
+        internal int GetCharge()
+        {
+            return Charge;
+        }
+
+        /// <summary>
+        /// Returns the m/z paired with the largest intensity (the first one on ties),
+        /// or 0 when there are no peaks or no intensity is greater than 0
+        /// </summary>
+        internal static double GetMostAbundantMz(double[] mzArray, double[] intensityArray)
+        {
+            double mostAbundantMz = 0;
+            double maxIntensity = 0;
+            for (int i = 0; i < mzArray.Length; i++)
+            {
+                if (intensityArray[i] > maxIntensity)
+                {
+                    maxIntensity = intensityArray[i];
+                    mostAbundantMz = mzArray[i];
+                }
+            }
+
+            return mostAbundantMz;
+        }
+    }
+}
diff --git a/mzLib/MassSpectrometry/Deconvolution/Parameters/ClassicDeconvolutionParameters.cs b/mzLib/MassSpectrometry/Deconvolution/Parameters/ClassicDeconvolutionParameters.cs index 0ab4a57af..a66c01735 100644 --- a/mzLib/MassSpectrometry/Deconvolution/Parameters/ClassicDeconvolutionParameters.cs +++ b/mzLib/MassSpectrometry/Deconvolution/Parameters/ClassicDeconvolutionParameters.cs @@ -12,6 +12,7 @@ namespace MassSpectrometry /// public class ClassicDeconvolutionParameters : DeconvolutionParameters { + public MzRange Range { get; set; } public int MinAssumedChargeState { get; set; } public int MaxAssumedChargeState { get; set; } public double DeconvolutionTolerancePpm { get; set; } @@ -25,12 +26,10 @@ public class ClassicDeconvolutionParameters : 
DeconvolutionParameters
         /// 
         /// 
         /// Isolation range of the scan to be deconvoluted
-        public ClassicDeconvolutionParameters(int minCharge, int maxCharge, double deconPpm, double intensityRatio) : base()
+        public ClassicDeconvolutionParameters(int minCharge, int maxCharge, double deconPpm, double intensityRatio, MzRange range = null) :
+            base (minCharge, maxCharge, deconPpm)
         {
             IntensityRatioLimit = intensityRatio;
+            // Store the optional range; it stays null unless the caller supplies one
+            Range = range;
+            // NOTE(review): this class still declares its own MinAssumedChargeState, MaxAssumedChargeState and
+            // DeconvolutionTolerancePpm, which hide the base-class properties of the same name. Keep the
+            // assignments below until those duplicates are deleted, otherwise the hiding copies stay at 0.
             DeconvolutionTolerancePpm = deconPpm;
             MinAssumedChargeState = minCharge;
             MaxAssumedChargeState = maxCharge;
         }
     }
 }
diff --git a/mzLib/MassSpectrometry/Deconvolution/Parameters/DeconvolutionParameters.cs b/mzLib/MassSpectrometry/Deconvolution/Parameters/DeconvolutionParameters.cs index d88cfbd20..0eecb0d87 100644 --- a/mzLib/MassSpectrometry/Deconvolution/Parameters/DeconvolutionParameters.cs +++ b/mzLib/MassSpectrometry/Deconvolution/Parameters/DeconvolutionParameters.cs @@ -12,12 +12,19 @@ namespace MassSpectrometry /// public abstract class DeconvolutionParameters { + public int MinAssumedChargeState { get; set; } + public int MaxAssumedChargeState { get; set; } + public double DeconvolutionTolerancePpm { get; set; } + /// /// Constructor should initialize all fields that are used by every deconvolution algorithm /// - public DeconvolutionParameters() + public DeconvolutionParameters(int minAssumedChargeState, int maxAssumedChargeState, + double deconvolutionTolerancePpm) { - + MinAssumedChargeState = minAssumedChargeState; + MaxAssumedChargeState = maxAssumedChargeState; + DeconvolutionTolerancePpm = deconvolutionTolerancePpm; } } } diff --git a/mzLib/MassSpectrometry/Deconvolution/Parameters/ExampleNewDeconvolutionParameters.cs b/mzLib/MassSpectrometry/Deconvolution/Parameters/ExampleNewDeconvolutionParameters.cs index 9d4e57455..abb6f39ec 100644 --- a/mzLib/MassSpectrometry/Deconvolution/Parameters/ExampleNewDeconvolutionParameters.cs +++ b/mzLib/MassSpectrometry/Deconvolution/Parameters/ExampleNewDeconvolutionParameters.cs @@ -10,7 +10,8 @@ 
namespace MassSpectrometry [ExcludeFromCodeCoverage] public class ExampleNewDeconvolutionParameters : DeconvolutionParameters { - public ExampleNewDeconvolutionParameters() : base() + public ExampleNewDeconvolutionParameters(int minCharge, int maxCharge, double deconPpm) : + base (minCharge, maxCharge, deconPpm) { } diff --git a/mzLib/MassSpectrometry/Deconvolution/Parameters/SpectralDeconvolutionParameters.cs b/mzLib/MassSpectrometry/Deconvolution/Parameters/SpectralDeconvolutionParameters.cs new file mode 100644 index 000000000..7e06c032c --- /dev/null +++ b/mzLib/MassSpectrometry/Deconvolution/Parameters/SpectralDeconvolutionParameters.cs @@ -0,0 +1,61 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+using Proteomics;
+using Proteomics.ProteolyticDigestion;
+using MathNet.Numerics.Optimization;
+using MzLibUtil;
+
+namespace MassSpectrometry
+{
+    /// <summary>
+    /// Parameters for the SpectralDeconvolutionAlgorithm: the protein library, digestion settings, scan range and binning
+    /// </summary>
+    public class SpectralDeconvolutionParameters : DeconvolutionParameters
+    {
+        public List<Protein> Proteins { get; }
+        public List<Modification> FixedModifications { get; }
+        public List<Modification> VariableModifications { get; }
+        public DigestionParams DigestionParams { get; }
+        public List<SilacLabel> SilacLabels { get; }
+        // TODO: convert double range to MzRange
+        public DoubleRange ScanRange { get; set; }
+        public bool FindTopDownTruncationProducts { get; }
+        public int BinsPerDalton { get; }
+        public double FineResolutionForIsotopicDistribution { get; }
+        public double MinProbabilityForIsotopicDistribution { get; }
+        public double AmbiguityThresholdForIsotopicDistribution { get; }
+        public int MaxConsecutiveMissedIsotopicPeaks { get; }
+        private bool FindNonDatabasePeaks { get; } // This should be linked to a method that generates Averagine envelopes
+
+        /// <summary>
+        /// Stores every argument in the matching property; the scan range is built from scanMinimumMz and scanMaximumMz
+        /// </summary>
+        public SpectralDeconvolutionParameters(int minAssumedChargeState, int maxAssumedChargeState,
+            double deconvolutionTolerancePpm, List<Protein> proteins, List<Modification> fixedModifications,
+            List<Modification> variableModifications, DigestionParams digestionParams,
+            List<SilacLabel> silacLabels, bool findTopDownTruncationProducts, double scanMinimumMz, double scanMaximumMz,
+            int binsPerDalton = 10, double fineResolutionForIsotopicDistribution = 0.125, double minProbabilityForIsotopicDistribution = 1e-8,
+            double ambiguityThresholdForIsotopicDistribution = 0.9, int maxConsecutiveMissedIsotopicPeaks = 1,
+            bool findNonDatabasePeaks = false) :
+            base(minAssumedChargeState, maxAssumedChargeState, deconvolutionTolerancePpm)
+        {
+            Proteins = proteins;
+            FixedModifications = fixedModifications;
+            VariableModifications = variableModifications;
+            DigestionParams = digestionParams;
+            SilacLabels = silacLabels;
+            FindTopDownTruncationProducts = findTopDownTruncationProducts;
+            ScanRange = new DoubleRange(scanMinimumMz, scanMaximumMz);
+            BinsPerDalton = binsPerDalton;
+            FineResolutionForIsotopicDistribution = fineResolutionForIsotopicDistribution;
+            MinProbabilityForIsotopicDistribution = minProbabilityForIsotopicDistribution;
+            AmbiguityThresholdForIsotopicDistribution = ambiguityThresholdForIsotopicDistribution;
+            // Without this assignment the get-only property stays 0 and the argument is silently ignored
+            MaxConsecutiveMissedIsotopicPeaks = maxConsecutiveMissedIsotopicPeaks;
+            FindNonDatabasePeaks = findNonDatabasePeaks;
+        }
+    }
+}
diff --git a/mzLib/MassSpectrometry/Deconvolution/Scoring/Scorer.cs b/mzLib/MassSpectrometry/Deconvolution/Scoring/Scorer.cs new file mode 100644 index 000000000..d7b78b231 --- /dev/null +++ b/mzLib/MassSpectrometry/Deconvolution/Scoring/Scorer.cs @@ -0,0 +1,123 @@ +using System; +using MzLibUtil; + +namespace MassSpectrometry.Deconvolution.Scoring; + +// Context class for scoring deconvolution hypotheses +public class Scorer +{ + public enum ScoringMethods + { + KullbackLeibler, + SpectralContrastAngle + } + public ScoringAlgorithm ScoringAlgorithm { get; private set; } + public ScoringMethods ScoringMethod { get; } + private double? 
_poorScore; + + public double PoorScore + { + get + { + if (_poorScore.HasValue) return (double)_poorScore; + switch (ScoringMethod) + { + case ScoringMethods.KullbackLeibler: + _poorScore = Double.MaxValue; + return (double)_poorScore; + case ScoringMethods.SpectralContrastAngle: + _poorScore = 0; + return (double)_poorScore; + default: + _poorScore = Double.MinValue; + return (double)_poorScore; + } + } + } + + public Scorer(ScoringMethods scoringMethod, PpmTolerance tolerance) + { + ScoringMethod = scoringMethod; + ConstructScoringAlgorithm(tolerance); + } + + public double Score(IScoreArgs args) + { + return ScoringAlgorithm.GetScore(args); + } + + public double Score(MinimalSpectrum experimentalSpectrum, MinimalSpectrum theoreticalSpectrum) + { + IScoreArgs args = new MinimalSpectraArgs(experimentalSpectrum, theoreticalSpectrum); + return ScoringAlgorithm.GetScore(args); + } + + /// + /// Compares two scores in a method specific fashion. Returns true if the instanceScore (first) + /// is better than the argumentScore (second). Outputs the better of the two. This method is necessary + /// because there are some metrics where lower scores are better. + /// + /// + /// + /// + /// + /// + public bool TestForScoreImprovement(double instanceScore, double argumentScore, out double betterScore) + { + switch (ScoringMethod) + { + case ScoringMethods.KullbackLeibler: + if (instanceScore < argumentScore) + { + betterScore = instanceScore; + return true; + } + else + { + betterScore = argumentScore; + return false; + } + case ScoringMethods.SpectralContrastAngle: + return DefaultCompare(instanceScore, argumentScore, out betterScore); + default: + return DefaultCompare(instanceScore, argumentScore, out betterScore); + } + } + + /// + /// The default score comparison, where higher scores are better. Compares two scores, returns true + /// if the instance score is higher than the argument score, returns false if instance score is lower. 
+ /// + /// + /// + /// The higher of the two scores + /// + private bool DefaultCompare(double instanceScore, double argumentScore, out double betterScore) + { + if (instanceScore > argumentScore) + { + betterScore = instanceScore; + return true; + } + else + { + betterScore = argumentScore; + return false; + } + } + + private void ConstructScoringAlgorithm(PpmTolerance tolerance) + { + switch (ScoringMethod) + { + case ScoringMethods.KullbackLeibler: + throw new NotImplementedException(); + case ScoringMethods.SpectralContrastAngle: + ScoringAlgorithm = new SpectralContrastAlgorithm(tolerance); + break; + default: + throw new NotImplementedException(); + } + } + +} \ No newline at end of file diff --git a/mzLib/MassSpectrometry/Deconvolution/Scoring/ScoringAlgorithm.cs b/mzLib/MassSpectrometry/Deconvolution/Scoring/ScoringAlgorithm.cs new file mode 100644 index 000000000..44e2f422d --- /dev/null +++ b/mzLib/MassSpectrometry/Deconvolution/Scoring/ScoringAlgorithm.cs @@ -0,0 +1,44 @@ +using System; +using System.Collections.Generic; +using System.Dynamic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using MzLibUtil; + + +namespace MassSpectrometry.Deconvolution.Scoring +{ + public abstract class ScoringAlgorithm + { + public PpmTolerance PpmTolerance { get; } + + public ScoringAlgorithm(PpmTolerance tolerance) + { + PpmTolerance = tolerance; + } + public abstract double GetScore(IScoreArgs args); + } + + public interface IScoreArgs + { + } + + public class MinimalSpectraArgs : IScoreArgs + { + public MinimalSpectrum ExperimentalSpectrum { get; set; } + public MinimalSpectrum TheoreticalSpectrum { get; set; } + + public MinimalSpectraArgs(MinimalSpectrum experimentalSpectrum, MinimalSpectrum theoreticalSpectrum) + { + ExperimentalSpectrum = experimentalSpectrum; + TheoreticalSpectrum = theoreticalSpectrum; + } + } + + public class IsotopicEnvelopeArgs : IScoreArgs + { + public IsotopicEnvelope ExperimentalEnvelope { get; set; } + public 
IsotopicEnvelope TheoreticalEnvelope { get; set; } + } +} diff --git a/mzLib/MassSpectrometry/Deconvolution/Scoring/SpectralContrastAlgorithm.cs b/mzLib/MassSpectrometry/Deconvolution/Scoring/SpectralContrastAlgorithm.cs new file mode 100644 index 000000000..aeee5e67d --- /dev/null +++ b/mzLib/MassSpectrometry/Deconvolution/Scoring/SpectralContrastAlgorithm.cs @@ -0,0 +1,43 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+using MassSpectrometry.MzSpectra;
+using MzLibUtil;
+
+namespace MassSpectrometry.Deconvolution.Scoring
+{
+    /// <summary>
+    /// Scores a pair of minimal spectra by their spectral contrast angle
+    /// </summary>
+    public class SpectralContrastAlgorithm : ScoringAlgorithm
+    {
+        public SpectralContrastAlgorithm(PpmTolerance tolerance) : base(tolerance)
+        {
+        }
+
+        /// <summary>
+        /// Returns the spectral contrast angle between the experimental and theoretical spectrum,
+        /// or 0 when SpectralSimilarity reports no angle.
+        /// </summary>
+        /// <param name="args">Must be a MinimalSpectraArgs instance</param>
+        /// <exception cref="ArgumentException">Thrown when args is null or of any other type</exception>
+        public override double GetScore(IScoreArgs args)
+        {
+            if (args is not MinimalSpectraArgs spectraArgs)
+            {
+                throw new ArgumentException(
+                    "SpectralContrastAlgorithm can only score MinimalSpectraArgs", nameof(args));
+            }
+
+            SpectralSimilarity spectralSimilarity =
+                new(spectraArgs.ExperimentalSpectrum.MzArray, spectraArgs.ExperimentalSpectrum.IntensityArray,
+                    spectraArgs.TheoreticalSpectrum.MzArray, spectraArgs.TheoreticalSpectrum.IntensityArray,
+                    SpectralSimilarity.SpectrumNormalizationScheme.spectrumSum, PpmTolerance.Value,
+                    allPeaks: true, filterOutBelowThisMz: 1);
+            return spectralSimilarity.SpectralContrastAngle() ?? 0;
+        }
+
+    }
+}
diff --git a/mzLib/MassSpectrometry/MassSpectrometry.csproj b/mzLib/MassSpectrometry/MassSpectrometry.csproj index d803402ea..7fd9f1309 100644 --- a/mzLib/MassSpectrometry/MassSpectrometry.csproj +++ b/mzLib/MassSpectrometry/MassSpectrometry.csproj @@ -19,4 +19,10 @@ + + + Always + + + diff --git a/mzLib/MassSpectrometry/MzSpectra/IsotopicEnvelope.cs b/mzLib/MassSpectrometry/MzSpectra/IsotopicEnvelope.cs index ab85e7228..cb9897791 100644 --- a/mzLib/MassSpectrometry/MzSpectra/IsotopicEnvelope.cs +++ b/mzLib/MassSpectrometry/MzSpectra/IsotopicEnvelope.cs @@ -2,6 +2,9 @@ using System; using System.Collections.Generic; using System.Linq; +using Chemistry; +using MassSpectrometry.Deconvolution; +using Proteomics.ProteolyticDigestion; namespace MassSpectrometry { @@ -9,29 +12,85 @@ public class IsotopicEnvelope { public readonly List<(double mz, double intensity)> Peaks; public double MonoisotopicMass { get; private set; } - public double MostAbundantObservedIsotopicMass { get; private set; } public readonly int Charge; + + // Legacy fields used in the ClassicDeconvolutionAlgorithm public readonly double TotalIntensity; public readonly double StDev; public readonly int MassIndex; + public double[] MzArray => Peaks.OrderBy(p => p.mz).Select(p => p.mz).ToArray(); + public double[] IntensityArray => Peaks.OrderBy(p => p.mz).Select(p => p.intensity).ToArray(); + + public double MostAbundantObservedIsotopicMz => _mostAbundantObservedIsotopicMz ?? 0; + public double MostAbundantObservedIsotopicMass => MostAbundantObservedIsotopicMz.ToMass(Charge); + public double SecondMostAbundantObservedIsotopicMz => _secondMostAbundantObservedIsotopicMz ?? 0; + private double? _mostAbundantObservedIsotopicMz; + private double? 
_secondMostAbundantObservedIsotopicMz; + public double AmbiguityRatioMinimum { get; } public double Score { get; private set; } + public PeptideWithSetModifications BestPwsmMatch { get; } - public IsotopicEnvelope(List<(double mz, double intensity)> bestListOfPeaks, double bestMonoisotopicMass, int bestChargeState, double bestTotalIntensity, double bestStDev, int bestMassIndex) + public IsotopicEnvelope(List<(double mz, double intensity)> bestListOfPeaks, double bestMonoisotopicMass, + int bestChargeState, double bestTotalIntensity, double bestStDev, int bestMassIndex) { Peaks = bestListOfPeaks; MonoisotopicMass = bestMonoisotopicMass; - MostAbundantObservedIsotopicMass = GetMostAbundantObservedIsotopicMass(bestListOfPeaks, bestChargeState); Charge = bestChargeState; + FindMostAbundantObservedIsotopicMz(); + TotalIntensity = bestTotalIntensity; StDev = bestStDev; MassIndex = bestMassIndex; Score = ScoreIsotopeEnvelope(); } - public double GetMostAbundantObservedIsotopicMass(List<(double mz, double intensity)> peaks, int charge) + /// + /// Takes in an Isotopic Distribution and a given charge state and converts it to an IsotopicEnvelope object + /// TODO: Test this function specifically + /// + /// An IsotopicDistribution generated from a ChemicalFormula + /// The charge state (corresponding to the z value of m/z) + public IsotopicEnvelope(IsotopicDistribution theoreticalDistribution, int charge, double ambiguityRatioMinimum = 0.9) + { + Peaks = theoreticalDistribution.Masses.Zip(theoreticalDistribution.Intensities, + (first, second) => (first.ToMz(charge), (double)second)).ToList(); + MonoisotopicMass = theoreticalDistribution.MonoIsotopicMass; // I think this is right, need to test it tho + Charge = charge; + AmbiguityRatioMinimum = ambiguityRatioMinimum; + FindMostAbundantObservedIsotopicMz(); + } + + public IsotopicEnvelope(MinimalSpectrum experimentalSpectrum, PeptideWithSetModifications bestPwsmMatch, double spectralScore = 0) + { + Peaks = 
experimentalSpectrum.MzArray.Zip(experimentalSpectrum.IntensityArray, (first, second) =>
+                (first, second)).ToList();
+            MonoisotopicMass = bestPwsmMatch.MonoisotopicMass;
+            Charge = experimentalSpectrum.Charge;
+            FindMostAbundantObservedIsotopicMz(); // reads Peaks and Charge, so it must run after both are assigned
+            BestPwsmMatch = bestPwsmMatch;
+            Score = spectralScore;
+        }
+
+        ///
+        /// Finds the m/z value of the most intense peak. If the second most intense peak is at
+        /// least AmbiguityRatioMinimum times as intense (and that ratio is positive), its m/z
+        /// value is stored in the _secondMostAbundantObservedIsotopicMz field
+        ///
+        ///
+        public void FindMostAbundantObservedIsotopicMz()
         {
-            return (peaks.OrderByDescending(p => p.intensity).ToList()[0].Item1)* charge;
+            if (!_mostAbundantObservedIsotopicMz.HasValue || MostAbundantObservedIsotopicMass == 0) // short-circuit: nothing cached yet, or the cached value converts to a mass of 0
+            {
+                List<(double mz, double intensity)> intensityOrderedPeaks = Peaks.OrderByDescending(p => p.intensity).ToList();
+                _mostAbundantObservedIsotopicMz = intensityOrderedPeaks.Select(p => p.mz).First(); // First() throws if Peaks is empty
+                if (intensityOrderedPeaks.Count > 1 &&
+                    intensityOrderedPeaks[1].intensity / intensityOrderedPeaks[0].intensity >= AmbiguityRatioMinimum &&
+                    AmbiguityRatioMinimum > 0) // a ratio of 0 (never assigned) disables the second-peak bookkeeping
+                {
+                    _secondMostAbundantObservedIsotopicMz = intensityOrderedPeaks[1].mz;
+                }
+            }
         }
 
         public override string ToString()
@@ -39,6 +98,7 @@ public override string ToString()
             return Charge + "\t" + Peaks[0].mz.ToString("G8") + "\t" + Peaks.Count + "\t" + TotalIntensity;
         }
 
+        // This should be done using a Strategy pattern
         private double ScoreIsotopeEnvelope() //likely created by Stefan Solntsev using peptide data
         {
             return Peaks.Count >= 2 ?
diff --git a/mzLib/MassSpectrometry/MzSpectra/SpectralSimilarity.cs b/mzLib/MassSpectrometry/MzSpectra/SpectralSimilarity.cs index be6ac6344..0febdec23 100644 --- a/mzLib/MassSpectrometry/MzSpectra/SpectralSimilarity.cs +++ b/mzLib/MassSpectrometry/MzSpectra/SpectralSimilarity.cs @@ -30,6 +30,17 @@ public SpectralSimilarity(MzSpectrum experimentalSpectrum, double[] theoreticalX _intensityPairs = IntensityPairs(allPeaks); } + /// + /// Constructs a spectral similarity object where the P arrays represent the experimental spectrum and the Q arrays represent the theoretical spectrum + /// + /// Experimental X Array (m/z) + /// Experimental Y Array (intensity) + /// Theoretical X Array (m/z) + /// Theoretical Y Array (intensity) + /// + /// + /// + /// public SpectralSimilarity(double[] P_XArray, double[] P_YArray, double[] Q_XArray, double[] Q_YArray, SpectrumNormalizationScheme scheme, double toleranceInPpm, bool allPeaks, double filterOutBelowThisMz = 300) { ExperimentalYArray = Normalize(FilterOutIonsBelowThisMz(P_XArray, P_YArray, filterOutBelowThisMz).Select(p => p.Item2).ToArray(), scheme); diff --git a/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/AminoAcidPolymer.cs b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/AminoAcidPolymer.cs new file mode 100644 index 000000000..8c6575ae1 --- /dev/null +++ b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/AminoAcidPolymer.cs @@ -0,0 +1,1200 @@ +// Copyright 2012, 2013, 2014 Derek J. Bailey +// Modified work copyright 2016 Stefan Solntsev +// +// This file (AminoAcidPolymer.cs) is part of Proteomics. +// +// Proteomics is free software: you can redistribute it and/or modify it +// under the terms of the GNU Lesser General Public License as published +// by the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. 
+// +// Proteomics is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +// License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with Proteomics. If not, see . + +using System; +using System.Collections.Generic; +using System.Collections.ObjectModel; +using System.Globalization; +using System.Linq; +using System.Text; +using Chemistry; +using MzLibUtil; + +namespace Proteomics.AminoAcidPolymer +{ + /// + /// A linear polymer of amino acids + /// + public abstract class AminoAcidPolymer : IEquatable, IHasMass + { + /// + /// The C-terminus chemical formula cap. This is different from the C-Terminus modification. + /// + private IHasChemicalFormula _cTerminus; + + /// + /// The N-terminus chemical formula cap. This is different from the N-Terminus modification. + /// + private IHasChemicalFormula _nTerminus; + + /// + /// All of the modifications indexed by position from N to C. This array is 2 bigger than the amino acid array + /// as index 0 and Count - 1 represent the N and C terminus, respectively + /// + private IHasMass[] _modifications; + + /// + /// All of the amino acid residues indexed by position from N to C. 
+ /// + private Residue[] residues; + + protected AminoAcidPolymer() + : this(string.Empty, new ChemicalFormulaTerminus(ChemicalFormula.ParseFormula("H")), new ChemicalFormulaTerminus(ChemicalFormula.ParseFormula("OH"))) + { + } + + protected AminoAcidPolymer(string sequence) + : this(sequence, new ChemicalFormulaTerminus(ChemicalFormula.ParseFormula("H")), new ChemicalFormulaTerminus(ChemicalFormula.ParseFormula("OH"))) + { + } + + protected AminoAcidPolymer(string sequence, IHasChemicalFormula nTerm, IHasChemicalFormula cTerm) + { + MonoisotopicMass = 0; + Length = sequence.Length; + residues = new Residue[Length]; + NTerminus = nTerm; + CTerminus = cTerm; + ParseSequence(sequence); + } + + protected AminoAcidPolymer(AminoAcidPolymer aminoAcidPolymer, bool includeModifications) + : this(aminoAcidPolymer, 0, aminoAcidPolymer.Length, includeModifications) + { + } + + protected AminoAcidPolymer(AminoAcidPolymer aminoAcidPolymer, int firstResidue, int length, bool includeModifications) + { + Length = length; + residues = new Residue[length]; + + bool isNterm = firstResidue == 0; + bool isCterm = length + firstResidue == aminoAcidPolymer.Length; + + _nTerminus = isNterm ? aminoAcidPolymer.NTerminus : new ChemicalFormulaTerminus(ChemicalFormula.ParseFormula("H")); + _cTerminus = isCterm ? 
aminoAcidPolymer.CTerminus : new ChemicalFormulaTerminus(ChemicalFormula.ParseFormula("OH"));
+
+            double monoMass = _nTerminus.MonoisotopicMass + _cTerminus.MonoisotopicMass;
+
+            Residue[] otherAminoAcids = aminoAcidPolymer.residues;
+
+            if (includeModifications && aminoAcidPolymer.ContainsModifications())
+            {
+                _modifications = new IHasMass[length + 2]; // slot 0 and slot length + 1 hold the terminal mods, slots 1..length the residue mods
+                for (int i = 0; i < length; i++)
+                {
+                    var aa = otherAminoAcids[i + firstResidue];
+                    residues[i] = aa;
+                    monoMass += aa.MonoisotopicMass;
+
+                    IHasMass mod = aminoAcidPolymer._modifications[i + firstResidue + 1]; // + 1 skips the source's N-terminal slot
+                    if (mod == null)
+                        continue;
+
+                    _modifications[i + 1] = mod;
+                    monoMass += mod.MonoisotopicMass;
+                }
+            }
+            else
+            {
+                for (int i = 0, j = firstResidue; i < length; i++, j++)
+                {
+                    var aa = otherAminoAcids[j];
+                    residues[i] = aa;
+                    monoMass += aa.MonoisotopicMass;
+                }
+            }
+
+            MonoisotopicMass = monoMass;
+
+            if (includeModifications)
+            {
+                if (isNterm)
+                    NTerminusModification = aminoAcidPolymer.NTerminusModification; // property setter also adds the mod's mass
+
+                if (isCterm)
+                    CTerminusModification = aminoAcidPolymer.CTerminusModification; // property setter also adds the mod's mass
+            }
+        }
+
+        public ReadOnlyCollection Modifications
+        {
+            get
+            {
+                return new ReadOnlyCollection(_modifications ?? new IHasMass[Length + 2]); // the array is allocated lazily; expose empty slots instead of passing null, which ReadOnlyCollection rejects
+            }
+        }
+
+        ///
+        /// Gets or sets the C terminus of this amino acid polymer
+        ///
+        public IHasChemicalFormula CTerminus
+        {
+            get { return _cTerminus; }
+            set { ReplaceTerminus(ref _cTerminus, value); }
+        }
+
+        ///
+        /// Gets or sets the N terminus of this amino acid polymer
+        ///
+        public IHasChemicalFormula NTerminus
+        {
+            get { return _nTerminus; }
+            set { ReplaceTerminus(ref _nTerminus, value); }
+        }
+
+        ///
+        /// Gets the number of amino acids in this amino acid polymer
+        ///
+        public int Length { get; private set; }
+
+        ///
+        /// The total monoisotopic mass of this peptide and all of its modifications
+        ///
+        public double MonoisotopicMass { get; private set; }
+
+        ///
+        /// Returns the amino acid sequence with all isoleucines (I) replaced with leucines (L);
+        ///
+        /// The amino acid
sequence with all I's into L's
+        public virtual string BaseLeucineSequence
+        {
+            get
+            {
+                return BaseSequence.Replace('I', 'L');
+            }
+        }
+
+        ///
+        /// Gets the base amino acid sequence
+        ///
+        public string BaseSequence
+        {
+            get
+            {
+                return new string(residues.Select(aa => aa.Letter).ToArray());
+            }
+        }
+
+        ///
+        /// Gets or sets the modification of the C terminus on this amino acid polymer
+        ///
+        public IHasMass CTerminusModification
+        {
+            get { return GetModification(Length + 1); }
+            set { ReplaceMod(Length + 1, value); }
+        }
+
+        ///
+        /// Gets or sets the modification of the N terminus on this amino acid polymer
+        ///
+        public IHasMass NTerminusModification
+        {
+            get { return GetModification(0); }
+            set { ReplaceMod(0, value); }
+        }
+
+        ///
+        /// Returns all fragments that are present in either fragmentation of A or B, but not in both
+        ///
+        public static IEnumerable GetSiteDeterminingFragments(AminoAcidPolymer peptideA, AminoAcidPolymer peptideB, FragmentTypes types)
+        {
+            HashSet aFrags = new HashSet(peptideA.Fragment(types));
+            aFrags.SymmetricExceptWith(peptideB.Fragment(types));
+            return aFrags;
+        }
+
+        ///
+        /// Gets the digestion points (starting index and length) of an amino acid sequence
+        ///
+        /// The sequence to cleave
+        /// The proteases to cleave with
+        /// The maximum number of missed cleavages to allow
+        /// The minimum amino acid length of the peptides
+        /// The maximum amino acid length of the peptides
+        ///
+        ///
+        /// A collection of cleavage points and the length of the cut (Index = start index, Length = length)
+        public static IEnumerable GetDigestionPointsAndLengths(string sequence, IEnumerable proteases, int maxMissedCleavages, int minLength, int maxLength, bool methionineInitiator, bool semiDigestion)
+        {
+            int[] indices = GetCleavageIndexes(sequence, proteases).ToArray();
+
+            bool includeMethionineCut = methionineInitiator && sequence.Length > 0 && sequence[0] == 'M'; // length guard: an empty sequence has no first residue to inspect
+
+            int indiciesCount = indices.Length - 1;
+
+            for (int missedCleavages = 0; missedCleavages <=
maxMissedCleavages; missedCleavages++)
+            {
+                int max = indiciesCount - missedCleavages; // start sites that still have an end site `offset` entries later
+                int offset = missedCleavages + 1;
+                for (int i = 0; i < max; i++)
+                {
+                    int len = indices[i + offset] - indices[i]; // distance between the two cleavage indexes = peptide length
+
+                    // Case for initiator methionine
+                    if (indices[i] == -1 && includeMethionineCut)
+                    {
+                        int newLength = len - 1; // same peptide without its first residue
+                        if (newLength >= minLength && newLength <= maxLength)
+                        {
+                            yield return new DigestionPointAndLength(1, newLength); if (semiDigestion)
+                            {
+                                for (int j = 1; j < newLength; j++)
+                                {
+                                    if (j >= minLength && j <= maxLength)
+                                    {
+                                        yield return new DigestionPointAndLength(1, j);
+                                    }
+                                }
+                            }
+                        }
+                    }
+
+                    if (len < minLength || len > maxLength)
+                        continue;
+
+                    yield return new DigestionPointAndLength(indices[i] + 1, len);
+                    if (semiDigestion)
+                    {
+                        for (int j = 1; j < len; j++)
+                        {
+                            if (len - j >= minLength && len - j <= maxLength)
+                            {
+                                yield return new DigestionPointAndLength(indices[i] + 1 + j, len - j); // same end, first j residues dropped
+                            }
+                            if (j >= minLength && j <= maxLength)
+                            {
+                                yield return new DigestionPointAndLength(indices[i] + 1, j); // same start, only the first j residues
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        public static IEnumerable GetCleavageIndexes(string sequence, IEnumerable proteases)
+        {
+            return GetCleavageIndexes(sequence, proteases, true);
+        }
+
+        ///
+        /// Gets the location of all the possible cleavage points for a given sequence and set of proteases
+        ///
+        /// The sequence to determine the cleavage points for
+        /// The proteases to cleave with
+        /// Include the N and C terminus (-1 and Length - 1)
+        /// A collection of all the sites where the proteases would cleave
+        public static IEnumerable GetCleavageIndexes(string sequence, IEnumerable proteases, bool includeTermini)
+        {
+            // Combine all the proteases digestion sites
+            SortedSet locations = new SortedSet();
+            foreach (IProtease protease in proteases.Where(protease => protease != null))
+            {
+                locations.UnionWith(protease.GetDigestionSites(sequence));
+            }
+
+            if (!includeTermini)
+                return locations;
+
+            locations.Add(-1); // callers start a peptide at index + 1, so -1 yields a start at residue 0
+            locations.Add(sequence.Length
- 1); + + return locations; + } + + public static IEnumerable Digest(string sequence, IEnumerable proteases, int maxMissedCleavages, int minLength, int maxLength, bool methionineInitiator, bool semiDigestion) + { + return GetDigestionPointsAndLengths(sequence, proteases, maxMissedCleavages, minLength, maxLength, methionineInitiator, semiDigestion).Select(points => sequence.Substring(points.Index, points.Length)); + } + + public static IEnumerable Digest(AminoAcidPolymer sequence, IProtease protease) + { + return Digest(sequence, protease, 3, 1, int.MaxValue, true, false); + } + + public static IEnumerable Digest(AminoAcidPolymer polymer, IProtease protease, int maxMissedCleavages, int minLength, int maxLength, bool methionineInitiator, bool semiDigestion) + { + return Digest(polymer.BaseSequence, new[] { protease }, maxMissedCleavages, minLength, maxLength, methionineInitiator, semiDigestion); + } + + public Residue GetResidue(int position) + { + if (position < 0 || position >= Length) + return null; + return residues[position]; + } + + /// + /// Checks if an amino acid residue with the value of 'residue' is contained in this polymer + /// + /// The character code for the amino acid residue + /// True if any amino acid residue is the same as the specified character + public bool Contains(char residue) + { + return residues.Any(aa => aa.Letter.Equals(residue)); + } + + /// + /// Checks if the amino acid residue is contained in this polymer + /// + /// The residue to check for + /// True if the polymer contains the specified residue, False otherwise + public bool Contains(Residue residue) + { + return residues.Contains(residue); + } + + public string GetSequenceWithModifications() + { + return GetSequenceWithModifications(false); + } + + public string GetSequenceWithModifications(bool leucineSequence) + { + if (_modifications == null) + return (leucineSequence) ? 
BaseLeucineSequence : BaseSequence; + + StringBuilder modSeqSb = new StringBuilder(Length); + + IHasMass mod; + + // Handle N-Terminus Modification + if ((mod = _modifications[0]) != null && mod.MonoisotopicMass > 0) + { + modSeqSb.Append('['); + modSeqSb.Append(mod); + modSeqSb.Append("]-"); + } + + // Handle Amino Acid Residues + for (int i = 0; i < Length; i++) + { + if (leucineSequence && residues[i].Letter == 'I') + modSeqSb.Append('L'); + else + modSeqSb.Append(residues[i].Letter); + + // Handle Amino Acid Modification (1-based) + if ((mod = _modifications[i + 1]) != null && mod.MonoisotopicMass > 0) + { + modSeqSb.Append('['); + modSeqSb.Append(mod); + modSeqSb.Append(']'); + } + } + + // Handle C-Terminus Modification + if ((mod = _modifications[Length + 1]) != null && mod.MonoisotopicMass > 0) + { + modSeqSb.Append("-["); + modSeqSb.Append(mod); + modSeqSb.Append(']'); + } + + return modSeqSb.ToString(); + } + + /// + /// Gets the total number of amino acid residues in this amino acid polymer + /// + /// The number of amino acid residues + public int ResidueCount() + { + return Length; + } + + public int ResidueCount(Residue aminoAcid) + { + return aminoAcid == null ? 
0 : residues.Count(aar => aar.Equals(aminoAcid)); + } + + /// + /// Gets the number of amino acids residues in this amino acid polymer that + /// has the specified residue letter + /// + /// The residue letter to search for + /// The number of amino acid residues that have the same letter in this polymer + public int ResidueCount(char residueLetter) + { + return residues.Count(aar => aar.Letter.Equals(residueLetter)); + } + + public int ResidueCount(char residueLetter, int index, int length) + { + return residues.SubArray(index, length).Count(aar => aar.Letter.Equals(residueLetter)); + } + + public int ResidueCount(Residue aminoAcid, int index, int length) + { + return residues.SubArray(index, length).Count(aar => aar.Equals(aminoAcid)); + } + + public int ElementCountWithIsotopes(string element) + { + // Residues count + int count = residues.Sum(aar => aar.ThisChemicalFormula.CountWithIsotopes(element)); + // Modifications count (if the mod is a IHasChemicalFormula) + if (_modifications != null) + count += _modifications.Where(mod => mod is IHasChemicalFormula).Cast().Sum(mod => mod.ThisChemicalFormula.CountWithIsotopes(element)); + + count += ChemicalFormula.ParseFormula("H2O").CountWithIsotopes(element); + return count; + } + + public int SpecificIsotopeCount(Isotope isotope) + { + // Residues count + int count = residues.Sum(aar => aar.ThisChemicalFormula.CountSpecificIsotopes(isotope)); + // Modifications count (if the mod is a IHasChemicalFormula) + if (_modifications != null) + count += _modifications.Where(mod => mod is IHasChemicalFormula).Cast().Sum(mod => mod.ThisChemicalFormula.CountSpecificIsotopes(isotope)); + return count; + } + + /// + /// Calculates the fragments that are different between this and another aminoacidpolymer + /// + /// + /// + /// + public IEnumerable GetSiteDeterminingFragments(AminoAcidPolymer other, FragmentTypes type) + { + return GetSiteDeterminingFragments(this, other, type); + } + + public IEnumerable Fragment(FragmentTypes 
types) + { + return Fragment(types, false); + } + + /// + /// Calculates all the fragments of the types you specify + /// + /// + /// + /// + public IEnumerable Fragment(FragmentTypes types, bool calculateChemicalFormula) + { + return Fragment(types, 1, Length - 1, calculateChemicalFormula); + } + + public IEnumerable Fragment(FragmentTypes types, int number) + { + return Fragment(types, number, false); + } + + public IEnumerable Fragment(FragmentTypes types, int number, bool calculateChemicalFormula) + { + return Fragment(types, number, number, calculateChemicalFormula); + } + + public IEnumerable Fragment(FragmentTypes types, int minIndex, int maxIndex) + { + return Fragment(types, minIndex, maxIndex, false); + } + + public IEnumerable Fragment(FragmentTypes types, int minIndex, int maxIndex, bool calculateChemicalFormula) + { + foreach (FragmentTypes type in types.GetIndividualFragmentTypes()) + { + bool isChemicalFormula = calculateChemicalFormula; + ChemicalFormula capFormula = type.GetIonCap(); + bool isCTerminal = type.GetTerminus() == Terminus.C; + + double monoMass = capFormula.MonoisotopicMass; + ChemicalFormula formula = new ChemicalFormula(capFormula); + + IHasChemicalFormula terminus = isCTerminal ? CTerminus : NTerminus; + monoMass += terminus.MonoisotopicMass; + if (isChemicalFormula) + formula.Add(terminus); + + bool first = true; + bool hasMod = _modifications != null; + + for (int i = 0; i <= maxIndex; i++) + { + int aaIndex = isCTerminal ? 
Length - i : i - 1; + + // Handle the terminus mods first in a special case + IHasMass mod; + if (first) + { + first = false; + if (hasMod) + { + mod = _modifications[aaIndex + 1]; + if (mod != null) + { + monoMass += mod.MonoisotopicMass; + if (isChemicalFormula) + { + if (mod is IHasChemicalFormula modFormula) + { + formula.Add(modFormula); + } + else + { + isChemicalFormula = false; + } + } + } + } + continue; + } + + monoMass += residues[aaIndex].MonoisotopicMass; + formula.Add(residues[aaIndex]); + + if (hasMod) + { + mod = _modifications[aaIndex + 1]; + + if (mod != null) + { + monoMass += mod.MonoisotopicMass; + if (isChemicalFormula) + { + if (mod is IHasChemicalFormula modFormula) + { + formula.Add(modFormula); + } + else + { + isChemicalFormula = false; + } + } + } + } + + if (i < minIndex) + continue; + + if (isChemicalFormula) + { + yield return new ChemicalFormulaFragment(type, i, formula, this); + } + else + { + yield return new Fragment(type, i, monoMass, this); + } + } + } + } + + public bool ContainsModifications() + { + return _modifications != null && _modifications.Any(m => m != null); + } + + public ISet GetUniqueModifications() where T : IHasMass + { + HashSet uniqueMods = new HashSet(); + + if (_modifications == null) + return uniqueMods; + + foreach (IHasMass mod in _modifications) + { + if (mod is T) + uniqueMods.Add((T)mod); + } + return uniqueMods; + } + + /// + /// Counts the total number of modifications on this polymer that are not null + /// + /// The number of modifications + public int ModificationCount() + { + return _modifications == null ? 
0 : _modifications.Count(mod => mod != null); + } + + /// + /// Get the modification at the given residue number + /// + /// The amino acid residue number + /// The modification at the site, null if there isn't any modification present + public IHasMass GetModification(int residueNumber) + { + return _modifications?[residueNumber]; + } + + /// + /// Sets the modification at the terminus of this amino acid polymer + /// + /// The modification to set + /// The termini to set the mod at + public virtual void SetModification(IHasMass modification, Terminus terminus) + { + if ((terminus & Terminus.N) == Terminus.N) + NTerminusModification = modification; + + if ((terminus & Terminus.C) == Terminus.C) + CTerminusModification = modification; + } + + /// + /// Sets the modification at specific sites on this amino acid polymer + /// + /// The modification to set + /// The sites to set the modification at + /// The number of modifications added to this amino acid polymer + public virtual int SetModification(IHasMass modification, ModificationSites sites) + { + int count = 0; + + if ((sites & ModificationSites.NPep) == ModificationSites.NPep) + { + NTerminusModification = modification; + count++; + } + + for (int i = 0; i < Length; i++) + { + ModificationSites site = residues[i].Site; + if ((sites & site) == site) + { + ReplaceMod(i + 1, modification); + count++; + } + } + + if ((sites & ModificationSites.PepC) == ModificationSites.PepC) + { + CTerminusModification = modification; + count++; + } + + return count; + } + + /// + /// Sets the modification at specific sites on this amino acid polymer + /// + /// The modification to set + /// The residue character to set the modification at + /// The number of modifications added to this amino acid polymer + public virtual int SetModification(IHasMass modification, char letter) + { + int count = 0; + for (int i = 0; i < Length; i++) + { + if (!letter.Equals(residues[i].Letter)) + continue; + + ReplaceMod(i + 1, modification); + 
count++; + } + + return count; + } + + /// + /// Sets the modification at specific sites on this amino acid polymer + /// + /// The modification to set + /// The residue to set the modification at + /// The number of modifications added to this amino acid polymer + public virtual int SetModification(IHasMass modification, Residue residue) + { + int count = 0; + for (int i = 0; i < Length; i++) + { + if (!residue.Letter.Equals(residues[i].Letter)) + continue; + ReplaceMod(i + 1, modification); + count++; + } + return count; + } + + /// + /// Sets the modification at specific sites on this amino acid polymer + /// + /// The modification to set + /// The residue number to set the modification at + public virtual void SetModification(IHasMass modification, int residueNumber) + { + if (residueNumber > Length || residueNumber < 1) + throw new MzLibException(string.Format(CultureInfo.InvariantCulture, "Residue number not in the correct range: [{0}-{1}] you specified: {2}", 1, Length, residueNumber)); + + ReplaceMod(residueNumber, modification); + } + + public void SetModifications(IEnumerable modifications) + { + foreach (OldSchoolModification mod in modifications) + { + SetModification(mod, mod.Sites); + } + } + + public void SetModification(OldSchoolModification mod) + { + SetModification(mod, mod.Sites); + } + + /// + /// + /// + /// + /// (1-based) residue number + public void SetModification(IHasMass mod, params int[] residueNumbers) + { + foreach (int residueNumber in residueNumbers) + { + SetModification(mod, residueNumber); + } + } + + /// + /// Replaces all instances of the old modification with the new modification in this polymer + /// + /// The modification to remove + /// The modification to replace it with + /// The number of modifications added to this amino acid polymer + public virtual int ReplaceModification(IHasMass oldMod, IHasMass newMod) + { + if (oldMod == null) + throw new MzLibException("Cannot replace a null modification"); + + int count = 0; + 
for (int i = 0; i < Length + 2; i++) + { + IHasMass mod = GetModification(i); + if (mod == null || !oldMod.Equals(mod)) + continue; + + ReplaceMod(i, newMod); + count++; + } + return count; + } + + /// + /// Adds the modification at the terminus of this amino acid polymer, combining modifications if a modification is already present + /// + /// The modification to set + /// The termini to set the mod at + public virtual int AddModification(IHasMass modification, Terminus terminus) + { + IHasMass currentMod; + int count = 0; + + if ((terminus & Terminus.N) == Terminus.N) + { + currentMod = NTerminusModification; + NTerminusModification = currentMod == null ? modification : new ModificationCollection(currentMod, modification); + count++; + } + + if ((terminus & Terminus.C) == Terminus.C) + { + currentMod = CTerminusModification; + CTerminusModification = currentMod == null ? modification : new ModificationCollection(currentMod, modification); + count++; + } + return count; + } + + public virtual int AddModification(OldSchoolModification modification) + { + return AddModification(modification, modification.Sites); + } + + public virtual int AddModification(IHasMass modification, ModificationSites sites) + { + if (_modifications == null) + _modifications = new IHasMass[Length + 2]; + + int count = 0; + IHasMass currentMod; + if ((sites & ModificationSites.NPep) == ModificationSites.NPep) + { + currentMod = NTerminusModification; + NTerminusModification = currentMod == null ? modification : new ModificationCollection(currentMod, modification); + count++; + } + + for (int i = 0; i < Length; i++) + { + ModificationSites site = residues[i].Site; + if ((sites & site) == site) + { + currentMod = _modifications[i + 1]; + ReplaceMod(i + 1, currentMod == null ? 
modification : new ModificationCollection(currentMod, modification));
+                    count++;
+                }
+            }
+
+            if ((sites & ModificationSites.PepC) == ModificationSites.PepC)
+            {
+                currentMod = CTerminusModification;
+                CTerminusModification = currentMod == null ? modification : new ModificationCollection(currentMod, modification);
+                count++;
+            }
+
+            return count;
+        }
+
+        ///
+        /// Adds the modification at specific sites on this amino acid polymer, combining modifications if a modification is already present
+        ///
+        /// The modification to set
+        /// The location to set the modification at
+        public virtual void AddModification(IHasMass modification, int location)
+        {
+            IHasMass currentMod = GetModification(location);
+            ReplaceMod(location, currentMod == null ? modification : new ModificationCollection(currentMod, modification));
+        }
+
+        ///
+        /// Clears the modification set at the terminus of this amino acid polymer back
+        /// to the default C or N modifications.
+        ///
+        /// The termini to clear the mod at
+        public void ClearModifications(Terminus terminus)
+        {
+            if ((terminus & Terminus.N) == Terminus.N)
+                NTerminusModification = null;
+
+            if ((terminus & Terminus.C) == Terminus.C)
+                CTerminusModification = null;
+        }
+
+        ///
+        /// Clear the modifications from the specified sites(s)
+        ///
+        /// The sites to remove modifications from
+        public void ClearModifications(ModificationSites sites)
+        {
+            if ((sites & ModificationSites.NPep) == ModificationSites.NPep || (sites & ModificationSites.NProt) == ModificationSites.NProt)
+            {
+                ReplaceMod(0, null);
+            }
+
+            for (int i = 0; i < Length; i++)
+            {
+                int modIndex = i + 1; // residue mods are stored 1-based; slot 0 is the N-terminus
+
+                if (_modifications?[modIndex] == null) // also covers a polymer whose modification array was never allocated
+                    continue;
+
+                ModificationSites curSite = residues[i].Site;
+
+                if ((curSite & sites) == curSite)
+                {
+                    ReplaceMod(modIndex, null);
+                }
+            }
+
+            if ((sites & ModificationSites.PepC) == ModificationSites.PepC || (sites & ModificationSites.ProtC) == ModificationSites.ProtC)
+            {
+                ReplaceMod(Length + 1, null);
+            }
+        }
+
+        ///
+        ///
Clear all modifications from this amino acid polymer.
+        /// Includes N and C terminus modifications.
+        ///
+        public void ClearModifications()
+        {
+            if (!ContainsModifications())
+                return;
+
+            for (int i = 0; i <= Length + 1; i++)
+            {
+                if (_modifications[i] == null)
+                    continue;
+
+                MonoisotopicMass -= _modifications[i].MonoisotopicMass;
+                _modifications[i] = null;
+            }
+        }
+
+        ///
+        /// Removes the specified mod from all locations on this polymer
+        ///
+        /// The modification to remove from this polymer
+        public void ClearModifications(IHasMass mod)
+        {
+            for (int i = 0; i <= Length + 1; i++)
+            {
+                if (_modifications?[i] == null || !mod.Equals(_modifications[i])) // skip empty slots; a missing modification array means nothing to clear
+                    continue;
+
+                MonoisotopicMass -= _modifications[i].MonoisotopicMass; // subtract the stored mod's own mass, matching ClearModifications()
+                _modifications[i] = null;
+            }
+        }
+
+        ///
+        /// Gets the chemical formula of this amino acid polymer.
+        ///
+        ///
+        public ChemicalFormula GetChemicalFormula()
+        {
+            var formula = new ChemicalFormula();
+
+            // Handle Modifications
+            if (ContainsModifications())
+            {
+                for (int i = 0; i < Length + 2; i++)
+                {
+                    if (_modifications[i] == null)
+                        continue;
+
+                    if (!(_modifications[i] is IHasChemicalFormula chemMod))
+                        throw new MzLibException("Modification " + _modifications[i] + " does not have a chemical formula!");
+
+                    formula.Add(chemMod.ThisChemicalFormula);
+                }
+            }
+
+            // Handle N-Terminus
+            formula.Add(NTerminus.ThisChemicalFormula);
+
+            // Handle C-Terminus
+            formula.Add(CTerminus.ThisChemicalFormula);
+
+            // Handle Amino Acid Residues
+            for (int i = 0; i < Length; i++)
+            {
+                formula.Add(residues[i].ThisChemicalFormula);
+            }
+
+            return formula;
+        }
+
+        public override string ToString()
+        {
+            return GetSequenceWithModifications();
+        }
+
+        public override int GetHashCode()
+        {
+            return BaseSequence.GetHashCode();
+        }
+
+        public override bool Equals(object obj)
+        {
+            AminoAcidPolymer aap = obj as AminoAcidPolymer;
+            return aap != null && Equals(aap);
+        }
+
+        public bool Equals(AminoAcidPolymer other)
+        {
+            if (other == null ||
+                Length != other.Length ||
+
!NTerminus.ThisChemicalFormula.Equals(other.NTerminus.ThisChemicalFormula) || + !CTerminus.ThisChemicalFormula.Equals(other.CTerminus.ThisChemicalFormula)) + return false; + + bool containsMod = ContainsModifications(); + + if (containsMod != other.ContainsModifications()) + return false; + + for (int i = 0; i <= Length + 1; i++) + { + if (containsMod && !Equals(_modifications[i], other._modifications[i])) + { + return false; + } + + if (i == 0 || i == Length + 1) + { + continue; // uneven arrays, so skip these two conditions + } + + if (!residues[i - 1].Equals(other.residues[i - 1])) + { + return false; + } + } + return true; + } + + private void ReplaceTerminus(ref IHasChemicalFormula terminus, IHasChemicalFormula value) + { + if (terminus != null) + MonoisotopicMass -= terminus.MonoisotopicMass; + + terminus = value; + + if (value != null) + MonoisotopicMass += value.MonoisotopicMass; + } + + /// + /// Replaces a modification (if present) at the specific index in the residue (0-based for N and C termini) + /// + /// The residue index to replace at + /// The modification to replace with + private void ReplaceMod(int index, IHasMass mod) + { + // No error checking here as all validation will occur before this method is call. 
This is to prevent + // unneeded bounds checking + + if (_modifications == null) + { + _modifications = new IHasMass[Length + 2]; + } + + IHasMass oldMod = _modifications[index]; // Get the mod at the index, if present + + if (Equals(mod, oldMod)) + return; // Same modifications, no change is required + + if (oldMod != null) + MonoisotopicMass -= oldMod.MonoisotopicMass; // remove the old mod mass + + _modifications[index] = mod; + + if (mod != null) + MonoisotopicMass += mod.MonoisotopicMass; // add the new mod mass + } + + /// + /// Parses a string sequence of amino acids characters into a peptide object + /// + /// + /// + private void ParseSequence(string sequence) + { + bool inMod = false; + bool cterminalMod = false; // n or c terminal modification + int index = 0; + + double monoMass = 0; + + StringBuilder modSb = new StringBuilder(10); + foreach (char letter in sequence) + { + if (inMod) + { + if (letter == ']') + { + inMod = false; // end the modification phase + + string modString = modSb.ToString(); + modSb.Clear(); + IHasMass modification; + try + { + modification = new OldSchoolChemicalFormulaModification(ChemicalFormula.ParseFormula(modString)); + } + catch (MzLibException) + { + if (double.TryParse(modString, out double mass)) + { + modification = new ModWithOnlyMass(mass); + } + else + { + throw new MzLibException("Unable to correctly parse the following modification: " + modString); + } + } + + monoMass += modification.MonoisotopicMass; + + if (_modifications == null) + _modifications = new IHasMass[Length + 2]; + + if (cterminalMod) + { + _modifications[index + 1] = modification; + } + else + { + _modifications[index] = modification; + } + + cterminalMod = false; + } + else + { + modSb.Append(letter); + } + } + else + { + //char upperletter = char.ToUpper(letter); // moved to amino acid dictionary + if (Residue.TryGetResidue(letter, out Residue residue)) + { + residues[index++] = residue; + monoMass += residue.MonoisotopicMass; + } + else + { + 
switch (letter) + { + case '[': // start of a modification + inMod = true; + break; + + case '-': // End of an n-terminus mod or start of a c-terminus mod + cterminalMod = (index > 0); + break; + + default: + throw new MzLibException(string.Format(CultureInfo.InvariantCulture, "Amino Acid Letter {0} does not exist in the Amino Acid Dictionary. {0} is also not a valid character", letter)); + } + } + } + } + + if (inMod) + throw new MzLibException("Couldn't find the closing ] for a modification in this sequence: " + sequence); + + Length = index; + MonoisotopicMass += monoMass; + Array.Resize(ref residues, Length); + if (_modifications != null) + { + Array.Resize(ref _modifications, Length + 2); + } + } + + private class ModWithOnlyMass : IHasMass + { + private readonly double mass; + + public ModWithOnlyMass(double mass) + { + this.mass = mass; + } + + public double MonoisotopicMass + { + get + { + return mass; + } + } + + public override string ToString() + { + return mass.ToString(CultureInfo.InvariantCulture); + } + } + } +} \ No newline at end of file diff --git a/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/AminoAcidPolymerExtensions.cs b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/AminoAcidPolymerExtensions.cs new file mode 100644 index 000000000..2bc13a1f0 --- /dev/null +++ b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/AminoAcidPolymerExtensions.cs @@ -0,0 +1,74 @@ +// Copyright 2012, 2013, 2014 Derek J. Bailey +// Modified work copyright 2016 Stefan Solntsev +// +// This file (AminoAcidPolymerExtensions.cs) is part of Proteomics. +// +// Proteomics is free software: you can redistribute it and/or modify it +// under the terms of the GNU Lesser General Public License as published +// by the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. 
//
// Proteomics is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
// License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with Proteomics. If not, see .

using System;
using System.Collections.Generic;
using System.Linq;

namespace Proteomics.AminoAcidPolymer
{
    /// <summary>
    /// Sequence-coverage helpers: how much of a base polymer is covered by a set of other polymers.
    /// </summary>
    public static class AminoAcidPolymerExtensions
    {
        /// <summary>
        /// Coverage fraction using the leucine form of the sequences (BaseLeucineSequence).
        /// </summary>
        public static double GetSequenceCoverageFraction(this AminoAcidPolymer baseSequence, IEnumerable<AminoAcidPolymer> sequences)
        {
            return GetSequenceCoverageFraction(baseSequence, sequences, true);
        }

        /// <summary>
        /// Fraction of positions in <paramref name="baseSequence"/> covered by at least one of <paramref name="sequences"/>.
        /// </summary>
        /// <param name="baseSequence">The polymer whose coverage is measured</param>
        /// <param name="sequences">The polymers to look for inside the base polymer</param>
        /// <param name="useLeucineSequence">Compare BaseLeucineSequence when true, BaseSequence otherwise</param>
        /// <returns>Covered positions divided by the base length (NaN for a zero-length base)</returns>
        public static double GetSequenceCoverageFraction(this AminoAcidPolymer baseSequence, IEnumerable<AminoAcidPolymer> sequences, bool useLeucineSequence)
        {
            int[] counts = baseSequence.GetSequenceCoverage(sequences, useLeucineSequence);
            return ((double)counts.Count(x => x > 0)) / baseSequence.Length;
        }

        /// <summary>
        /// Per-position coverage counts using the leucine form of the sequences.
        /// </summary>
        public static int[] GetSequenceCoverage(this AminoAcidPolymer baseSequence, IEnumerable<AminoAcidPolymer> sequences)
        {
            return GetSequenceCoverage(baseSequence, sequences, true);
        }

        /// <summary>
        /// Counts, for every position of <paramref name="baseSequence"/>, how many occurrences of the
        /// given polymers cover it. Overlapping occurrences of the same polymer are all counted.
        /// </summary>
        /// <param name="baseSequence">The polymer whose coverage is measured</param>
        /// <param name="allPolymers">The polymers to look for inside the base polymer</param>
        /// <param name="useLeucineSequence">Compare BaseLeucineSequence when true, BaseSequence otherwise</param>
        /// <returns>An array of length baseSequence.Length holding the hit count per position</returns>
        public static int[] GetSequenceCoverage(this AminoAcidPolymer baseSequence, IEnumerable<AminoAcidPolymer> allPolymers, bool useLeucineSequence)
        {
            int[] bits = new int[baseSequence.Length];

            string masterSequence = useLeucineSequence ? baseSequence.BaseLeucineSequence : baseSequence.BaseSequence;

            foreach (AminoAcidPolymer polymer in allPolymers)
            {
                string seq = useLeucineSequence ? polymer.BaseLeucineSequence : polymer.BaseSequence;

                // An empty sequence "matches" at every offset, so the search start would eventually
                // move past the end of masterSequence and IndexOf would throw
                // ArgumentOutOfRangeException. It covers no position, so skip it.
                if (string.IsNullOrEmpty(seq))
                {
                    continue;
                }

                int index = masterSequence.IndexOf(seq, StringComparison.Ordinal);
                while (index >= 0)
                {
                    for (int aa = index; aa < index + polymer.Length; aa++)
                    {
                        bits[aa]++;
                    }

                    // Resume one character after the hit so overlapping occurrences are found too.
                    index = masterSequence.IndexOf(seq, index + 1, StringComparison.Ordinal);
                }
            }
            return bits;
        }
    }
}
// --- patch scaffolding carried over verbatim from the replaced lines ---
// \ No newline at end of file
// diff --git a/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/ChemicalFormulaFragment.cs b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/ChemicalFormulaFragment.cs
// new file mode 100644
// index 000000000..655b73608
// --- /dev/null
// +++ b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/ChemicalFormulaFragment.cs
// @@ -0,0 +1,33 @@
// Copyright 2012, 2013, 2014 Derek J. Bailey
// Modified work copyright 2016 Stefan Solntsev
//
// This file (ChemicalFormulaFragment.cs) is part of Proteomics.
//
// Proteomics is free software: you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License as published
// by the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// Proteomics is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
// License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with Proteomics. If not, see .

using Chemistry;

namespace Proteomics.AminoAcidPolymer
{
    /// <summary>
    /// A <see cref="Fragment"/> that also carries a chemical formula.
    /// </summary>
    public class ChemicalFormulaFragment : Fragment, IHasChemicalFormula
    {
        /// <summary>
        /// Creates the fragment; the base mass is taken from <paramref name="formula"/>.
        /// </summary>
        /// <param name="type">The fragment ion type</param>
        /// <param name="number">The fragment number</param>
        /// <param name="formula">The chemical formula of the fragment</param>
        /// <param name="parent">The polymer this fragment comes from</param>
        public ChemicalFormulaFragment(FragmentTypes type, int number, ChemicalFormula formula, AminoAcidPolymer parent)
            : base(type, number, formula.MonoisotopicMass, parent)
        {
            // The stored formula is re-parsed from the formula text rather than
            // keeping the caller's instance.
            string formulaText = formula.Formula;
            ThisChemicalFormula = ChemicalFormula.ParseFormula(formulaText);
        }

        /// <summary>
        /// The chemical formula of this fragment.
        /// </summary>
        public ChemicalFormula ThisChemicalFormula { get; private set; }
    }
}
// --- patch scaffolding carried over verbatim from the replaced lines ---
// \ No newline at end of file
// diff --git a/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/ChemicalFormulaModification.cs b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/ChemicalFormulaModification.cs
// new file mode 100644
// index 000000000..7f4f93580
// --- /dev/null
// +++ b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/ChemicalFormulaModification.cs
// @@ -0,0 +1,57 @@
// Copyright 2012, 2013, 2014 Derek J. Bailey
// Modified work copyright 2016 Stefan Solntsev
//
// This file (ChemicalFormulaModification.cs) is part of Proteomics.
//
// Proteomics is free software: you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License as published
// by the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// Proteomics is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
// License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with Proteomics. If not, see .

using Chemistry;

namespace Proteomics.AminoAcidPolymer
{
    /// <summary>
    /// An <see cref="OldSchoolModification"/> whose mass is taken from a chemical formula.
    /// </summary>
    public class OldSchoolChemicalFormulaModification : OldSchoolModification, IHasChemicalFormula
    {
        /// <summary>
        /// Creates a modification for any site, named after the formula text.
        /// </summary>
        public OldSchoolChemicalFormulaModification(ChemicalFormula chemicalFormula)
            : this(chemicalFormula, ModificationSites.Any)
        {
        }

        /// <summary>
        /// Creates a modification for the given sites, named after the formula text.
        /// </summary>
        public OldSchoolChemicalFormulaModification(ChemicalFormula chemicalFormula, ModificationSites sites)
            : this(chemicalFormula, "", sites)
        {
            // The chained constructor stored chemicalFormula in ThisChemicalFormula,
            // so this reads the same formula text.
            Name = chemicalFormula.Formula;
        }

        /// <summary>
        /// Creates a named modification for any site.
        /// </summary>
        public OldSchoolChemicalFormulaModification(ChemicalFormula chemicalFormula, string name)
            : this(chemicalFormula, name, ModificationSites.Any)
        {
        }

        /// <summary>
        /// Creates a named modification for the given sites; every other constructor chains to this one.
        /// </summary>
        /// <param name="chemicalFormula">The formula that supplies the monoisotopic mass</param>
        /// <param name="name">The name of the modification</param>
        /// <param name="sites">The sites the modification may occupy</param>
        public OldSchoolChemicalFormulaModification(ChemicalFormula chemicalFormula, string name, ModificationSites sites)
            : base(chemicalFormula.MonoisotopicMass, name, sites)
        {
            ThisChemicalFormula = chemicalFormula;
        }

        /// <summary>
        /// Copy constructor: name and sites are taken from <paramref name="other"/>, and the
        /// formula is re-parsed from its formula text.
        /// </summary>
        public OldSchoolChemicalFormulaModification(OldSchoolChemicalFormulaModification other)
            : this(ChemicalFormula.ParseFormula(other.ThisChemicalFormula.Formula), other.Name, other.Sites)
        {
        }

        /// <summary>
        /// The Chemical Formula of this modification
        /// </summary>
        public ChemicalFormula ThisChemicalFormula { get; private set; }
    }
}
// --- patch scaffolding carried over verbatim from the replaced lines ---
// \ No newline at end of file
// diff --git a/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/ChemicalFormulaTerminus.cs b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/ChemicalFormulaTerminus.cs
// new file mode 100644
// index 000000000..3479a5068
// --- /dev/null
// +++ b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/ChemicalFormulaTerminus.cs
// @@ -0,0 +1,42 @@
// Copyright 2016 Stefan Solntsev
//
// This file (ChemicalFormulaTerminus.cs) is part of Proteomics.
//
// Proteomics is free software: you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License as published
// by the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// Proteomics is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
// License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with Proteomics. If not, see .

using Chemistry;

namespace Proteomics.AminoAcidPolymer
{
    /// <summary>
    /// A polymer terminus described by a chemical formula.
    /// </summary>
    public class ChemicalFormulaTerminus : IHasChemicalFormula
    {
        public ChemicalFormulaTerminus(ChemicalFormula chemicalFormula)
        {
            ThisChemicalFormula = chemicalFormula;
        }

        /// <summary>
        /// The monoisotopic mass reported by <see cref="ThisChemicalFormula"/>.
        /// </summary>
        public double MonoisotopicMass => ThisChemicalFormula.MonoisotopicMass;

        /// <summary>
        /// The formula passed to the constructor.
        /// </summary>
        public ChemicalFormula ThisChemicalFormula { get; private set; }
    }
}
// --- patch scaffolding carried over verbatim from the replaced lines ---
// \ No newline at end of file
// diff --git a/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/DigestionPoint.cs b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/DigestionPoint.cs
// new file mode 100644
// index 000000000..d906b736a
// --- /dev/null
// +++ b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/DigestionPoint.cs
// @@ -0,0 +1,14 @@
namespace Proteomics.AminoAcidPolymer
{
    /// <summary>
    /// An (index, length) pair.
    /// </summary>
    public class DigestionPointAndLength
    {
        public DigestionPointAndLength(int index, int length)
        {
            Length = length;
            Index = index;
        }

        public int Index { get; private set; }
        public int Length { get; private set; }
    }
}
// --- patch scaffolding carried over verbatim from the replaced lines ---
// \ No newline at end of file
// diff --git a/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/Fragment.cs b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/Fragment.cs
// new file mode 100644
// index 000000000..6d0c7806a
// --- /dev/null
// +++ b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/Fragment.cs
// @@ -0,0 +1,98 @@
// Copyright 2012, 2013, 2014 Derek J. Bailey
// Modified work copyright 2016 Stefan Solntsev
//
// This file (Fragment.cs) is part of Proteomics.
//
// Proteomics is free software: you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License as published
// by the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// Proteomics is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
// License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with Proteomics. If not, see .

using System;
using System.Collections.Generic;
using System.Globalization;
using Chemistry;

namespace Proteomics.AminoAcidPolymer
{
    /// <summary>
    /// A fragment of an <see cref="AminoAcidPolymer"/>: its ion type, its number, its mass and its parent.
    /// </summary>
    public class Fragment : IHasMass, IEquatable<Fragment>
    {
        public Fragment(FragmentTypes type, int number, double monoisotopicMass, AminoAcidPolymer parent)
        {
            FragmentType = type;
            Number = number;
            Parent = parent;
            MonoisotopicMass = monoisotopicMass;
        }

        public double MonoisotopicMass { get; private set; }

        public int Number { get; private set; }

        public AminoAcidPolymer Parent { get; private set; }

        public FragmentTypes FragmentType { get; private set; }

        /// <summary>
        /// The non-null parent modifications that fall inside this fragment.
        /// N-terminal fragments read parent slots 0..Number; C-terminal fragments read the
        /// last Number + 1 slots, ending at slot Parent.Length + 1.
        /// </summary>
        public IEnumerable<IHasMass> Modifications
        {
            get
            {
                var mods = Parent.Modifications;

                int first;
                int last;
                if (FragmentType.GetTerminus() == Terminus.N)
                {
                    first = 0;
                    last = Number;
                }
                else
                {
                    last = Parent.Length + 1;
                    first = last - Number;
                }

                for (int i = first; i <= last; i++)
                {
                    if (mods[i] != null)
                        yield return mods[i];
                }
            }
        }

        /// <summary>
        /// The first (N-terminal) or last (C-terminal) Number characters of the parent's base sequence.
        /// </summary>
        public string Sequence
        {
            get
            {
                string parentSeq = Parent.BaseSequence;
                if (FragmentType.GetTerminus() == Terminus.N)
                {
                    return parentSeq.Substring(0, Number);
                }

                return parentSeq.Substring(parentSeq.Length - Number, Number);
            }
        }

        public override string ToString()
        {
            return string.Format(CultureInfo.InvariantCulture, "{0}{1}", Enum.GetName(typeof(FragmentTypes), FragmentType), Number);
        }

        /// <summary>
        /// Hashes only the fields that <see cref="Equals(Fragment)"/> compares exactly. The mass is
        /// compared with a tolerance, so hashing it could give equal fragments different hash codes.
        /// </summary>
        public override int GetHashCode()
        {
            unchecked
            {
                return ((int)FragmentType * 397) ^ Number;
            }
        }

        /// <summary>
        /// Object-typed equality, kept consistent with <see cref="Equals(Fragment)"/>.
        /// </summary>
        public override bool Equals(object obj)
        {
            return Equals(obj as Fragment);
        }

        /// <summary>
        /// Fragments are equal when type and number match and the masses differ by less than 1e-9.
        /// </summary>
        /// <param name="other">The fragment to compare with; null is never equal</param>
        public bool Equals(Fragment other)
        {
            if (ReferenceEquals(other, null))
            {
                return false;
            }

            return FragmentType.Equals(other.FragmentType) && Number.Equals(other.Number) && Math.Abs(MonoisotopicMass - other.MonoisotopicMass) < 1e-9;
        }
    }
}
// --- patch scaffolding carried over verbatim from the replaced lines ---
// \ No newline at end of file
// diff --git a/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/FragmentTypes.cs b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/FragmentTypes.cs
// new file mode 100644
// index 000000000..5ef1f3c88
// --- /dev/null
// +++ b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/FragmentTypes.cs
// @@ -0,0 +1,98 @@
// Copyright 2012, 2013, 2014 Derek J. Bailey
// Modified work copyright 2016 Stefan Solntsev
//
// This file (FragmentTypes.cs) is part of Proteomics.
//
// Proteomics is free software: you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License as published
// by the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// Proteomics is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
// License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with Proteomics. If not, see .

using System;
using System.Collections.Generic;
using Chemistry;
using MzLibUtil;

namespace Proteomics.AminoAcidPolymer
{
    /// <summary>
    /// Fragment ion types, one bit per type so that several can be combined.
    /// </summary>
    [Flags]
    public enum FragmentTypes
    {
        None = 0,
        a = 1 << 0,
        adot = 1 << 1,
        b = 1 << 2,
        bdot = 1 << 3,
        c = 1 << 4,
        cdot = 1 << 5,
        x = 1 << 6,
        xdot = 1 << 7,
        y = 1 << 8,
        ydot = 1 << 9,
        z = 1 << 10,
        zdot = 1 << 11,
        Internal = 1 << 12,
        All = (1 << 12) - 1, // Handy way of setting all below the 12th bit (bits 0-11; Internal is not included)
    }

    public static class FragmentTypesExtension
    {
        // Formula adjustment ("ion cap") for each single ion type.
        private static readonly Dictionary<FragmentTypes, ChemicalFormula> FragmentIonCaps = new Dictionary<FragmentTypes, ChemicalFormula>
        {
            {FragmentTypes.a, ChemicalFormula.ParseFormula("C-1H-1O-1")},
            {FragmentTypes.adot, ChemicalFormula.ParseFormula("C-1O-1")},
            {FragmentTypes.b, ChemicalFormula.ParseFormula("H-1")},
            {FragmentTypes.bdot, new ChemicalFormula()},
            {FragmentTypes.c, ChemicalFormula.ParseFormula("NH2")},
            {FragmentTypes.cdot, ChemicalFormula.ParseFormula("NH3")},
            {FragmentTypes.x, ChemicalFormula.ParseFormula("COH-1")},
            {FragmentTypes.xdot, ChemicalFormula.ParseFormula("CO")},
            {FragmentTypes.y, ChemicalFormula.ParseFormula("H")},
            {FragmentTypes.ydot, ChemicalFormula.ParseFormula("H2")},
            {FragmentTypes.z, ChemicalFormula.ParseFormula("N-1H-2")},
            {FragmentTypes.zdot, ChemicalFormula.ParseFormula("N-1H-1")}
        };

        /// <summary>
        /// Enumerates the individual ion types set in <paramref name="fragmentTypes"/>.
        /// None, All and Internal are never returned.
        /// </summary>
        public static IEnumerable<FragmentTypes> GetIndividualFragmentTypes(this FragmentTypes fragmentTypes)
        {
            foreach (FragmentTypes site in Enum.GetValues(typeof(FragmentTypes)))
            {
                if (site == FragmentTypes.None || site == FragmentTypes.All || site == FragmentTypes.Internal)
                {
                    continue;
                }
                if ((fragmentTypes & site) == site)
                {
                    yield return site;
                }
            }
        }

        /// <summary>
        /// Returns the terminus a single fragment type belongs to: values from x upwards map to C,
        /// lower values to N.
        /// </summary>
        /// <exception cref="MzLibException">The value is None or has more than one bit set</exception>
        public static Terminus GetTerminus(this FragmentTypes fragmentType)
        {
            if (!IsSingleFragmentType(fragmentType))
            {
                throw new MzLibException("Fragment Type must be a single value to determine the terminus");
            }
            // NOTE(review): Internal is a single bit above x, so it is reported as Terminus.C - confirm this is intended.
            return fragmentType >= FragmentTypes.x ? Terminus.C : Terminus.N;
        }

        /// <summary>
        /// Returns the ion cap formula registered for a single fragment type.
        /// </summary>
        /// <exception cref="MzLibException">The value is None or has more than one bit set</exception>
        public static ChemicalFormula GetIonCap(this FragmentTypes fragmentType)
        {
            if (!IsSingleFragmentType(fragmentType))
            {
                throw new MzLibException("Fragment Type must be a single value to determine the ion cap");
            }
            // Internal has no entry in the table, so the dictionary lookup throws for it.
            return FragmentIonCaps[fragmentType];
        }

        // True when exactly one bit is set.
        // Super handy: http://stackoverflow.com/questions/4624248/c-logical-riddle-with-bit-operations-only-one-bit-is-set
        private static bool IsSingleFragmentType(FragmentTypes fragmentType)
        {
            return fragmentType != FragmentTypes.None && (fragmentType & (fragmentType - 1)) == FragmentTypes.None;
        }
    }
}
// --- patch scaffolding carried over verbatim from the replaced lines ---
// \ No newline at end of file
// diff --git a/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/IProtease.cs b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/IProtease.cs
// new file mode 100644
// index 000000000..60a27219c
// --- /dev/null
// +++ b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/IProtease.cs
// @@ -0,0 +1,42 @@
// Copyright 2012, 2013, 2014 Derek J. Bailey
// Modified work copyright 2016 Stefan Solntsev
//
// This file (IProtease.cs) is part of Proteomics.
//
// Proteomics is free software: you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License as published
// by the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// Proteomics is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
// License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with Proteomics. If not, see .
namespace Proteomics.AminoAcidPolymer
{
    using System.Collections.Generic;

    /// <summary>
    /// A proteolytic enzyme that cuts amino acids at specific residues.
    /// </summary>
    public interface IProtease
    {
        /// <summary>
        /// Finds the indices of where this protease would cut in
        /// the given amino acid sequence
        /// </summary>
        /// <param name="aminoAcidSequence">The Amino Acid Polymer to cut</param>
        /// <returns>A set of the 1-based indices to cut at</returns>
        // NOTE(review): the element type was lost in extraction; int is restored from the "indices" wording - confirm.
        IEnumerable<int> GetDigestionSites(string aminoAcidSequence);

        IEnumerable<int> GetDigestionSites(AminoAcidPolymer aminoAcidSequence);

        int MissedCleavages(string sequence);

        int MissedCleavages(AminoAcidPolymer aminoAcidSequence);
    }
}
// --- patch scaffolding carried over verbatim from the replaced lines ---
// \ No newline at end of file
// diff --git a/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/ModificationCollection.cs b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/ModificationCollection.cs
// new file mode 100644
// index 000000000..cb7939c39
// --- /dev/null
// +++ b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/ModificationCollection.cs
// @@ -0,0 +1,122 @@
// Copyright 2012, 2013, 2014 Derek J. Bailey
// Modified work copyright 2016 Stefan Solntsev
//
// This file (ModificationCollection.cs) is part of Proteomics.
//
// Proteomics is free software: you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License as published
// by the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// Proteomics is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
// License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with Proteomics. If not, see .

using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Chemistry;
using MzLibUtil;

namespace Proteomics.AminoAcidPolymer
{
    /// <summary>
    /// Several modifications treated as one: the mass is the sum of the members' masses.
    /// </summary>
    public class ModificationCollection : ICollection<IHasMass>, IEquatable<ModificationCollection>, IHasChemicalFormula
    {
        private readonly List<IHasMass> _modifications;

        public ModificationCollection(params IHasMass[] mods)
        {
            _modifications = mods.ToList();
            MonoisotopicMass = _modifications.Sum(m => m.MonoisotopicMass);
        }

        /// <summary>
        /// Running sum of the members' monoisotopic masses, maintained by Add, Remove and Clear.
        /// </summary>
        public double MonoisotopicMass { get; private set; }

        public int Count
        {
            get { return _modifications.Count; }
        }

        public bool IsReadOnly
        {
            get { return false; }
        }

        /// <summary>
        /// The combined chemical formula of all members, built on each call.
        /// </summary>
        /// <exception cref="MzLibException">A member does not implement IHasChemicalFormula</exception>
        public ChemicalFormula ThisChemicalFormula
        {
            get
            {
                ChemicalFormula chemicalFormula = new ChemicalFormula();
                foreach (IHasMass mod in _modifications)
                {
                    // Members are only required to have a mass; report a formula-less member
                    // explicitly instead of handing a null cast result to ChemicalFormula.Add.
                    if (!(mod is IHasChemicalFormula chemMod))
                        throw new MzLibException("Modification " + mod + " does not have a chemical formula!");

                    chemicalFormula.Add(chemMod);
                }
                return chemicalFormula;
            }
        }

        /// <summary>
        /// The members' string forms joined by " | ".
        /// </summary>
        public override string ToString()
        {
            return string.Join(" | ", _modifications);
        }

        public void Add(IHasMass item)
        {
            // Read the mass first so a null item fails before the list is changed.
            double itemMass = item.MonoisotopicMass;
            _modifications.Add(item);
            MonoisotopicMass += itemMass;
        }

        public void Clear()
        {
            _modifications.Clear();
            MonoisotopicMass = 0;
        }

        public bool Contains(IHasMass item)
        {
            return _modifications.Contains(item);
        }

        public void CopyTo(IHasMass[] array, int arrayIndex)
        {
            _modifications.CopyTo(array, arrayIndex);
        }

        public bool Remove(IHasMass item)
        {
            if (!_modifications.Remove(item))
                return false;
            MonoisotopicMass -= item.MonoisotopicMass;
            return true;
        }

        /// <summary>
        /// Collections are equal when they have the same count and ScrambledEquals reports their members equal.
        /// </summary>
        /// <param name="other">The collection to compare with; null is never equal</param>
        public bool Equals(ModificationCollection other)
        {
            if (ReferenceEquals(other, null))
                return false;

            return Count == other.Count && _modifications.ScrambledEquals(other._modifications);
        }

        public IEnumerator<IHasMass> GetEnumerator()
        {
            return _modifications.GetEnumerator();
        }

        IEnumerator IEnumerable.GetEnumerator()
        {
            return _modifications.GetEnumerator();
        }
    }
}
// --- patch scaffolding carried over verbatim from the replaced lines ---
// \ No newline at end of file
// diff --git a/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/ModificationSites.cs b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/ModificationSites.cs
// new file mode 100644
// index 000000000..e68c7e143
// --- /dev/null
// +++ b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/ModificationSites.cs
// @@ -0,0 +1,92 @@
// Copyright 2012, 2013, 2014 Derek J. Bailey
// Modified work copyright 2016 Stefan Solntsev
//
// This file (ModificationSites.cs) is part of Proteomics.
//
// Proteomics is free software: you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License as published
// by the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// Proteomics is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
// License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with Proteomics. If not, see .

using System;
using System.Collections.Generic;

namespace Proteomics.AminoAcidPolymer
{
    /// <summary>
    /// Sites a modification can occupy, one bit per residue letter or terminus.
    /// </summary>
    [Flags]
    public enum ModificationSites
    {
        None = 0,
        A = 1 << 0,
        R = 1 << 1,
        N = 1 << 2,
        D = 1 << 3,
        C = 1 << 4,
        E = 1 << 5,
        Q = 1 << 6,
        G = 1 << 7,
        H = 1 << 8,
        I = 1 << 9,
        L = 1 << 10,
        K = 1 << 11,
        M = 1 << 12,
        F = 1 << 13,
        P = 1 << 14,
        S = 1 << 15,
        T = 1 << 16,
        U = 1 << 17,
        W = 1 << 18,
        Y = 1 << 19,
        V = 1 << 20,
        NPep = 1 << 21,
        PepC = 1 << 22,
        NProt = 1 << 23,
        ProtC = 1 << 24,
        All = (1 << 25) - 1, // Handy way of setting all below the 25th bit (bits 0-24, i.e. every flag above)
        NTerminus = NPep | NProt,
        TerminusC = PepC | ProtC,
        Any = 1 << 31 // Acts like none, but is equal to all
    }

    public static class ModificationSiteExtensions
    {
        /// <summary>
        /// Enumerates every declared value (other than None) whose bits are all set in
        /// <paramref name="sites"/>. Combined values such as NTerminus, TerminusC and All are
        /// returned as well when all of their bits are set.
        /// </summary>
        public static IEnumerable<ModificationSites> EnumerateActiveSites(this ModificationSites sites)
        {
            foreach (ModificationSites site in Enum.GetValues(typeof(ModificationSites)))
            {
                if (site == ModificationSites.None)
                {
                    continue;
                }
                if ((sites & site) == site)
                {
                    yield return site;
                }
            }
        }

        /// <summary>
        /// Tests whether every bit of <paramref name="otherSites"/> is also set in <paramref name="sites"/>.
        /// </summary>
        /// <param name="sites">The sites to look in</param>
        /// <param name="otherSites">The sites to look for</param>
        /// <returns>
        /// Always true when otherSites is Any; true for None only when sites is None as well.
        /// </returns>
        public static bool ContainsSites(this ModificationSites sites, ModificationSites otherSites)
        {
            // By convention, if the other site is 'Any', they are always equal
            if (otherSites == ModificationSites.Any)
            {
                return true;
            }

            if (otherSites == ModificationSites.None)
            {
                return sites == ModificationSites.None;
            }

            // No bit of otherSites may be missing from sites.
            return (~sites & otherSites) == ModificationSites.None;
        }
    }
}
// --- patch scaffolding carried over verbatim from the replaced lines ---
// \ No newline at end of file
// diff --git a/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/OldSchoolModification.cs b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/OldSchoolModification.cs
// new file mode 100644
// index 000000000..2753fbe02
// --- /dev/null
// +++ b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/OldSchoolModification.cs
// @@ -0,0 +1,121 @@
// Copyright 2012, 2013, 2014 Derek J. Bailey
// Modified work copyright 2016 Stefan Solntsev
//
// This file (Modification.cs) is part of Proteomics.
//
// Proteomics is free software: you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License as published
// by the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// Proteomics is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
// License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with Proteomics. If not, see .

using System;
using System.Globalization;
using Chemistry;

namespace Proteomics.AminoAcidPolymer
{
    /// <summary>
    /// Represents a modification with a mass and name and default amino acid sites of modification
    /// </summary>
    public class OldSchoolModification : IHasMass, IEquatable<OldSchoolModification>
    {
        public OldSchoolModification(OldSchoolModification modification)
            : this(modification.MonoisotopicMass, modification.Name, modification.Sites)
        {
        }

        public OldSchoolModification()
            : this(0.0, "", ModificationSites.Any)
        {
        }

        public OldSchoolModification(double monoMass)
            : this(monoMass, "", ModificationSites.Any)
        {
        }

        public OldSchoolModification(double monoMass, string name)
            : this(monoMass, name, ModificationSites.Any)
        {
        }

        public OldSchoolModification(double monoMass, string name, ModificationSites sites)
        {
            MonoisotopicMass = monoMass;
            Name = name;
            Sites = sites;
        }

        /// <summary>
        /// The name of the modification
        /// </summary>
        public string Name { get; protected set; }

        /// <summary>
        /// The monoisotopic mass of the modification, commonly known as the delta mass
        /// </summary>
        public double MonoisotopicMass { get; protected set; }

        /// <summary>
        /// The potentially modified sites of this modification
        /// </summary>
        public ModificationSites Sites { get; set; }

        /// <summary>
        /// Displays the name of the mod and the sites it modified in a formatted string
        /// </summary>
        public string NameAndSites
        {
            get { return string.Format(CultureInfo.InvariantCulture, "{0} ({1})", Name, Sites); }
        }

        public override string ToString()
        {
            return Name;
        }

        // NOTE(review): Equals compares the mass with a 1e-9 tolerance while this hashes the exact
        // mass, so two equal modifications can have different hash codes. Left unchanged because
        // the other compared members (Name, Sites) are settable after construction.
        public override int GetHashCode()
        {
            return MonoisotopicMass.GetHashCode();
        }

        public override bool Equals(object obj)
        {
            return Equals(obj as OldSchoolModification);
        }

        /// <summary>
        /// Modifications are equal when their masses differ by no more than 1e-9 and their
        /// names and sites match.
        /// </summary>
        /// <param name="other">The modification to compare with; null is never equal</param>
        public bool Equals(OldSchoolModification other)
        {
            if (ReferenceEquals(other, null))
            {
                return false;
            }

            if (ReferenceEquals(this, other))
            {
                return true;
            }

            if (Math.Abs(MonoisotopicMass - other.MonoisotopicMass) > 1e-9)
            {
                return false;
            }

            // The static call is the same ordinal comparison but tolerates a null Name.
            if (!string.Equals(Name, other.Name, StringComparison.Ordinal))
            {
                return false;
            }

            if (!Sites.Equals(other.Sites))
            {
                return false;
            }

            return true;
        }
    }
}
// --- patch scaffolding carried over verbatim from the replaced lines ---
// \ No newline at end of file
// diff --git a/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/OldSchoolModificationWithMultiplePossibilities.cs b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/OldSchoolModificationWithMultiplePossibilities.cs
// new file mode 100644
// index 000000000..d1039281e
// --- /dev/null
// +++ b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/OldSchoolModificationWithMultiplePossibilities.cs
// @@ -0,0 +1,68 @@
// Copyright 2012, 2013, 2014 Derek J. Bailey
// Modified work copyright 2016 Stefan Solntsev
//
// This file (Isotopologue.cs) is part of Proteomics.
//
// Proteomics is free software: you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License as published
// by the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// Proteomics is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
// License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with Proteomics. If not, see .

using System.Collections;
using System.Collections.Generic;
using MzLibUtil;

namespace Proteomics.AminoAcidPolymer
{
    /// <summary>
    /// A named modification (base mass 0) that holds several alternative modifications,
    /// kept sorted by their monoisotopic mass.
    /// </summary>
    public class ModificationWithMultiplePossibilitiesCollection : OldSchoolModification, IEnumerable<OldSchoolModification>
    {
        // Keyed by monoisotopic mass, so two alternatives cannot share the exact same mass.
        private readonly SortedList<double, OldSchoolModification> _modifications;

        public ModificationWithMultiplePossibilitiesCollection(string name, ModificationSites sites)
            : base(0, name, sites)
        {
            _modifications = new SortedList<double, OldSchoolModification>();
        }

        public int Count
        {
            get { return _modifications.Count; }
        }

        /// <summary>
        /// The alternative at the given position in ascending mass order.
        /// </summary>
        public OldSchoolModification this[int index]
        {
            get { return _modifications.Values[index]; }
        }

        /// <summary>
        /// Adds an alternative modification.
        /// </summary>
        /// <param name="modification">The alternative to add</param>
        /// <exception cref="MzLibException">The alternative's sites are not contained in this collection's Sites</exception>
        /// <exception cref="System.ArgumentException">An alternative with the same monoisotopic mass is already present</exception>
        public void AddModification(OldSchoolModification modification)
        {
            if (!Sites.ContainsSites(modification.Sites))
                throw new MzLibException("Unable to add a modification with sites other than " + Sites);

            _modifications.Add(modification.MonoisotopicMass, modification);
        }

        public bool Contains(OldSchoolModification modification)
        {
            return _modifications.ContainsValue(modification);
        }

        public IEnumerator<OldSchoolModification> GetEnumerator()
        {
            return _modifications.Values.GetEnumerator();
        }

        IEnumerator IEnumerable.GetEnumerator()
        {
            return _modifications.Values.GetEnumerator();
        }
    }
}
// --- patch scaffolding carried over verbatim from the replaced lines ---
// \ No newline at end of file
// diff --git a/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/Peptide.cs b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/Peptide.cs
// new file mode 100644
// index 000000000..4ea4346e6
// --- /dev/null
// +++ b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/Peptide.cs
// @@ -0,0 +1,137 @@
// Copyright 2012, 2013, 2014 Derek J. Bailey
// Modified work copyright 2016 Stefan Solntsev
//
// This file (Peptide.cs) is part of Proteomics.
+//
+// Proteomics is free software: you can redistribute it and/or modify it
+// under the terms of the GNU Lesser General Public License as published
+// by the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// Proteomics is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+// License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with Proteomics. If not, see <http://www.gnu.org/licenses/>.
+
+using System.Collections.Generic;
+using System.Linq;
+
+namespace Proteomics.AminoAcidPolymer
+{
+    /// <summary>
+    /// An amino acid polymer that may have been cut out of a parent polymer.
+    /// </summary>
+    public class Peptide : AminoAcidPolymer
+    {
+        public Peptide()
+        {
+        }
+
+        public Peptide(string sequence) : base(sequence)
+        {
+        }
+
+        /// <summary>
+        /// Create a new peptide that copies another amino acid polymer, modifications included
+        /// </summary>
+        public Peptide(AminoAcidPolymer aminoAcidPolymer)
+            : this(aminoAcidPolymer, true)
+        {
+        }
+
+        /// <summary>
+        /// Create a new peptide based on another amino acid polymer
+        /// </summary>
+        /// <param name="aminoAcidPolymer">The other amino acid polymer to copy</param>
+        /// <param name="includeModifications">Whether to copy the modifications to the new peptide</param>
+        public Peptide(AminoAcidPolymer aminoAcidPolymer, bool includeModifications)
+            : base(aminoAcidPolymer, includeModifications)
+        {
+            Parent = aminoAcidPolymer;
+            StartResidue = 0;
+            EndResidue = Length - 1;
+        }
+
+        /// <summary>
+        /// Create a new peptide from a stretch of another amino acid polymer, modifications included
+        /// </summary>
+        public Peptide(AminoAcidPolymer aminoAcidPolymer, int firstResidue, int length)
+            : this(aminoAcidPolymer, firstResidue, length, true)
+        {
+        }
+
+        /// <summary>
+        /// Create a new peptide from a stretch of another amino acid polymer
+        /// </summary>
+        /// <param name="aminoAcidPolymer">The polymer the stretch is taken from; becomes <see cref="Parent"/></param>
+        /// <param name="firstResidue">Index of the first residue of the stretch in the parent</param>
+        /// <param name="length">Number of residues in the stretch</param>
+        /// <param name="includeModifications">Whether to copy the modifications to the new peptide</param>
+        public Peptide(AminoAcidPolymer aminoAcidPolymer, int firstResidue, int length, bool includeModifications)
+            : base(aminoAcidPolymer, firstResidue, length, includeModifications)
+        {
+            Parent = aminoAcidPolymer;
+            StartResidue = firstResidue;
+            EndResidue = firstResidue + length - 1;
+
+            // NOTE(review): relies on the parent's GetResidue handling indices just outside
+            // the sequence (-1 and Length) when the stretch touches either end - confirm.
+            PreviousResidue = aminoAcidPolymer.GetResidue(StartResidue - 1);
+            NextResidue = aminoAcidPolymer.GetResidue(EndResidue + 1);
+        }
+
+        /// <summary>
+        /// The index in the parent of this peptide's first amino acid
+        /// </summary>
+        public int StartResidue { get; set; }
+
+        /// <summary>
+        /// The index in the parent of this peptide's last amino acid
+        /// </summary>
+        public int EndResidue { get; set; }
+
+        /// <summary>
+        /// The amino acid polymer this peptide came from. Could be null
+        /// </summary>
+        public AminoAcidPolymer Parent { get; set; }
+
+        /// <summary>
+        /// The preceding amino acid in its parent
+        /// </summary>
+        public Residue PreviousResidue { get; set; }
+
+        /// <summary>
+        /// The next amino acid in its parent
+        /// </summary>
+        public Residue NextResidue { get; set; }
+
+        /// <summary>
+        /// Lazily yields one peptide for every combination of concrete choices for the
+        /// multiple-possibility modifications on this peptide. Yields nothing when the
+        /// peptide carries no such modification.
+        /// </summary>
+        public IEnumerable<Peptide> GenerateAllModificationCombinations()
+        {
+            // Get all the modifications that are isotopologues
+            var isotopologues = GetUniqueModifications<ModificationWithMultiplePossibilitiesCollection>().ToArray();
+
+            // Base condition, no more isotopologues to make, so just return
+            if (isotopologues.Length < 1)
+            {
+                yield break;
+            }
+
+            // Grab the first isotopologue
+            ModificationWithMultiplePossibilitiesCollection isotopologue = isotopologues[0];
+
+            // Loop over each modification in the isotopologue
+            foreach (OldSchoolModification mod in isotopologue)
+            {
+                // Create a clone of the peptide, cloning modifications as well.
+                Peptide peptide = new Peptide(this);
+
+                // Replace the base isotopologue mod with the specific version
+                peptide.ReplaceModification(isotopologue, mod);
+
+                // There was more than one isotopologue, so we must go deeper
+                if (isotopologues.Length > 1)
+                {
+                    // Call the same routine on the newly generated peptide that has one less isotopologue
+                    foreach (var subpeptide in peptide.GenerateAllModificationCombinations())
+                    {
+                        yield return subpeptide;
+                    }
+                }
+                else
+                {
+                    // Return this peptide
+                    yield return peptide;
+                }
+            }
+        }
+
+        /// <summary>
+        /// Returns a new peptide covering <paramref name="length"/> residues of this one,
+        /// starting at <paramref name="firstResidue"/>, with this peptide as its parent.
+        /// </summary>
+        public Peptide GetSubPeptide(int firstResidue, int length)
+        {
+            return new Peptide(this, firstResidue, length);
+        }
+    }
+}
\ No newline at end of file
diff --git a/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/Residue.cs b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/Residue.cs
new file mode 100644
index 000000000..4797bec7a
--- /dev/null
+++ b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/Residue.cs
@@ -0,0 +1,207 @@
+// Copyright 2012, 2013, 2014 Derek J. Bailey
+// Modified work copyright 2016 Stefan Solntsev
+//
+// This file (AminoAcid.cs) is part of Proteomics.
+//
+// Proteomics is free software: you can redistribute it and/or modify it
+// under the terms of the GNU Lesser General Public License as published
+// by the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// Proteomics is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+// License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with Proteomics. If not, see <http://www.gnu.org/licenses/>.
+
+using System.Collections.Generic;
+using Chemistry;
+
+namespace Proteomics.AminoAcidPolymer
+{
+    /// <summary>
+    /// An amino acid residue, plus static lookup tables from residue name and from
+    /// one-letter code to the residue and its monoisotopic mass.
+    /// </summary>
+    public class Residue : IHasChemicalFormula
+    {
+        /// <summary>
+        /// Monoisotopic mass indexed by the one-letter code's character value (0..122).
+        /// Entries with no residue are <see cref="double.NaN"/>.
+        /// </summary>
+        public static readonly double[] ResidueMonoisotopicMass;
+
+        // Both letter-indexed tables cover character values 0..122 ('z').
+        private const int LookupTableSize = 123;
+
+        private static readonly Dictionary<string, Residue> ResiduesDictionary;
+        private static readonly Residue[] ResiduesByLetter;
+
+        static Residue()
+        {
+            ResiduesDictionary = new Dictionary<string, Residue>
+            {
+                {"Alanine", new Residue("Alanine", 'A', "Ala",ChemicalFormula.ParseFormula("C3H5NO"), ModificationSites.A)},
+                {"Arginine", new Residue("Arginine", 'R', "Arg",ChemicalFormula.ParseFormula("C6H12N4O"), ModificationSites.R)},
+                {"Asparagine", new Residue("Asparagine", 'N', "Asn",ChemicalFormula.ParseFormula("C4H6N2O2"), ModificationSites.N)},
+                {"Aspartic Acid", new Residue("Aspartic Acid", 'D', "Asp",ChemicalFormula.ParseFormula("C4H5NO3"), ModificationSites.D)},
+                {"Cysteine", new Residue("Cysteine", 'C', "Cys",ChemicalFormula.ParseFormula("C3H5NOS"), ModificationSites.C)},
+                {"Glutamic Acid", new Residue("Glutamic Acid", 'E', "Glu",ChemicalFormula.ParseFormula("C5H7NO3"), ModificationSites.E)},
+                {"Glutamine", new Residue("Glutamine", 'Q', "Gln",ChemicalFormula.ParseFormula("C5H8N2O2"), ModificationSites.Q)},
+                {"Glycine", new Residue("Glycine", 'G', "Gly",ChemicalFormula.ParseFormula("C2H3NO"), ModificationSites.G)},
+                {"Histidine", new Residue("Histidine", 'H', "His",ChemicalFormula.ParseFormula("C6H7N3O"), ModificationSites.H)},
+                {"Isoleucine", new Residue("Isoleucine", 'I', "Ile",ChemicalFormula.ParseFormula("C6H11NO"), ModificationSites.I)},
+                {"Leucine", new Residue("Leucine", 'L', "Leu",ChemicalFormula.ParseFormula("C6H11NO"), ModificationSites.L)},
+                {"Lysine", new Residue("Lysine", 'K', "Lys",ChemicalFormula.ParseFormula("C6H12N2O"), ModificationSites.K)},
+                {"Methionine", new Residue("Methionine", 'M', "Met",ChemicalFormula.ParseFormula("C5H9NOS"), ModificationSites.M)},
+                {"Phenylalanine", new Residue("Phenylalanine", 'F', "Phe",ChemicalFormula.ParseFormula("C9H9NO"), ModificationSites.F)},
+                {"Proline", new Residue("Proline", 'P', "Pro",ChemicalFormula.ParseFormula("C5H7NO"), ModificationSites.P)},
+                // NOTE(review): Pyrrolysine is registered with the proline site flag (P) - confirm this is intended.
+                {"Pyrrolysine", new Residue("Pyrrolysine", 'O', "Pyl",ChemicalFormula.ParseFormula("C12H19N3O2"), ModificationSites.P)},
+                {"Selenocysteine", new Residue("Selenocysteine", 'U', "Sec",ChemicalFormula.ParseFormula("C3H5NOSe"), ModificationSites.U)},
+                {"Serine", new Residue("Serine", 'S', "Ser",ChemicalFormula.ParseFormula("C3H5NO2"), ModificationSites.S)},
+                {"Threonine", new Residue("Threonine", 'T', "Thr",ChemicalFormula.ParseFormula("C4H7NO2"), ModificationSites.T)},
+                {"Tryptophan", new Residue("Tryptophan", 'W', "Trp",ChemicalFormula.ParseFormula("C11H10N2O"), ModificationSites.W)},
+                // The three-letter symbol for tyrosine is "Tyr" (was misspelled "Try").
+                {"Tyrosine", new Residue("Tyrosine", 'Y', "Tyr",ChemicalFormula.ParseFormula("C9H9NO2"), ModificationSites.Y)},
+                {"Valine", new Residue("Valine", 'V', "Val",ChemicalFormula.ParseFormula("C5H9NO"), ModificationSites.V)}
+            };
+
+            // Build the letter-indexed tables from the dictionary instead of spelling out
+            // 123 literal entries twice. Unassigned slots stay null / NaN.
+            ResiduesByLetter = new Residue[LookupTableSize];
+            ResidueMonoisotopicMass = new double[LookupTableSize];
+            for (int i = 0; i < LookupTableSize; i++)
+            {
+                ResidueMonoisotopicMass[i] = double.NaN;
+            }
+
+            foreach (Residue residue in ResiduesDictionary.Values)
+            {
+                ResiduesByLetter[residue.Letter] = residue;
+                ResidueMonoisotopicMass[residue.Letter] = residue.MonoisotopicMass;
+            }
+
+            // J - SPECIAL CASE!!! It has a mass (that of isoleucine) but no Residue entry.
+            ResidueMonoisotopicMass['J'] = ResiduesDictionary["Isoleucine"].MonoisotopicMass;
+        }
+
+        /// <summary>
+        /// Adds a list of new residues to the dictionary at their specified index.
+        /// An existing entry with the same name or letter is overwritten.
+        /// </summary>
+        /// <param name="residuesToAdd">Residues to register; each letter must have a character value below 123.</param>
+        public static void AddNewResiduesToDictionary(List<Residue> residuesToAdd)
+        {
+            foreach (Residue residue in residuesToAdd)
+            {
+                ResiduesDictionary[residue.Name] = residue;
+                ResiduesByLetter[residue.Letter] = residue;
+                ResidueMonoisotopicMass[residue.Letter] = residue.MonoisotopicMass;
+            }
+        }
+
+
+        /// <summary>
+        /// Creates a residue; its monoisotopic mass is taken from <paramref name="chemicalFormula"/>.
+        /// </summary>
+        public Residue(string name, char oneLetterAbbreviation, string threeLetterAbbreviation, ChemicalFormula chemicalFormula, ModificationSites site)
+        {
+            Name = name;
+            Letter = oneLetterAbbreviation;
+            Symbol = threeLetterAbbreviation;
+            ThisChemicalFormula = chemicalFormula;
+            MonoisotopicMass = ThisChemicalFormula.MonoisotopicMass;
+            Site = site;
+        }
+
+        public ChemicalFormula ThisChemicalFormula { get; private set; }
+        public char Letter { get; private set; }
+        public ModificationSites Site { get; private set; }
+        public double MonoisotopicMass { get; private set; }
+        public string Name { get; private set; }
+        public string Symbol { get; private set; }
+
+        /// <summary>
+        /// Get the residue from a one-character string (looked up by letter) or,
+        /// for any other length, from its full name (e.g. "Alanine").
+        /// </summary>
+        /// <param name="symbol">A one-letter code or a full residue name</param>
+        /// <returns>The residue; null for a one-letter code with no residue assigned</returns>
+        public static Residue GetResidue(string symbol)
+        {
+            return symbol.Length == 1 ? ResiduesByLetter[symbol[0]] : ResiduesDictionary[symbol];
+        }
+
+        /// <summary>
+        /// Gets the residue based on the residue's one-character symbol
+        /// </summary>
+        /// <param name="letter">The one-letter code; must have a character value below 123</param>
+        /// <returns>The residue, or null when no residue is assigned to the letter</returns>
+        public static Residue GetResidue(char letter)
+        {
+            return ResiduesByLetter[letter];
+        }
+
+        /// <summary>
+        /// Looks up a residue by one-letter code without throwing for out-of-range letters.
+        /// </summary>
+        /// <returns>True when a residue is assigned to <paramref name="letter"/></returns>
+        public static bool TryGetResidue(char letter, out Residue residue)
+        {
+            // A char is never negative, so only the upper bound needs checking.
+            residue = letter < ResiduesByLetter.Length ? ResiduesByLetter[letter] : null;
+            return residue != null;
+        }
+
+        /// <summary>
+        /// Looks up a residue by its full name.
+        /// </summary>
+        public static bool TryGetResidue(string name, out Residue residue)
+        {
+            return ResiduesDictionary.TryGetValue(name, out residue);
+        }
+    }
+}
\ No newline at end of file
diff --git a/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/Terminus.cs b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/Terminus.cs
new file mode 100644
index 000000000..0785e835a
--- /dev/null
+++ b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/Terminus.cs
@@ -0,0 +1,39 @@
+// Copyright 2012, 2013, 2014 Derek J. Bailey
+// Modified work copyright 2016 Stefan Solntsev
+//
+// This file (Terminus.cs) is part of Proteomics.
+//
+// Proteomics is free software: you can redistribute it and/or modify it
+// under the terms of the GNU Lesser General Public License as published
+// by the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// Proteomics is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+// License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with Proteomics. If not, see <http://www.gnu.org/licenses/>.
+
+using System;
+
+namespace Proteomics.AminoAcidPolymer
+{
+    /// <summary>
+    /// The terminus of an amino acid polymer N-[Amino Acids]-C
+    /// </summary>
+    [Flags]
+    public enum Terminus
+    {
+        /// <summary>
+        /// The N-terminus (amino-terminus)
+        /// </summary>
+        N = 1,
+
+        /// <summary>
+        /// The C-terminus (carboxyl-terminus)
+        /// </summary>
+        C = 2
+    }
+}
\ No newline at end of file
diff --git a/mzLib/MassSpectrometry/Proteomics/Fragmentation/DissociationTypeCollection.cs b/mzLib/MassSpectrometry/Proteomics/Fragmentation/DissociationTypeCollection.cs
new file mode 100644
index 000000000..98293c4d9
--- /dev/null
+++ b/mzLib/MassSpectrometry/Proteomics/Fragmentation/DissociationTypeCollection.cs
@@ -0,0 +1,174 @@
+using System.Collections.Concurrent;
+using System.Collections.Generic;
+using System.Linq;
+using Chemistry;
+using MassSpectrometry;
+
+namespace Proteomics.Fragmentation
+{
+    /// <summary>
+    /// Static lookups from dissociation type to the product (fragment ion) types it yields
+    /// and to the neutral mass shift of each product type. Computed results are cached.
+    /// </summary>
+    public class DissociationTypeCollection
+    {
+        public static Dictionary<DissociationType, List<ProductType>> ProductsFromDissociationType = new Dictionary<DissociationType, List<ProductType>>
+        {
+            { DissociationType.Unknown, new List<ProductType>() },
+            { DissociationType.CID, new List<ProductType>{ ProductType.b, ProductType.y } },
+            { DissociationType.LowCID, new List<ProductType>{ ProductType.b, ProductType.y, ProductType.aStar, ProductType.bAmmoniaLoss, ProductType.yAmmoniaLoss, ProductType.aDegree, ProductType.bWaterLoss, ProductType.yWaterLoss } },
+            { DissociationType.IRMPD, new List<ProductType>{ ProductType.b, ProductType.y } },
+            { DissociationType.ECD, new List<ProductType>{ ProductType.c, ProductType.y, ProductType.zDot } },
+            { DissociationType.PQD, new List<ProductType>() },
+            { DissociationType.ETD, new List<ProductType>{ ProductType.c, ProductType.y, ProductType.zDot } },
+            { DissociationType.HCD, new List<ProductType>{ ProductType.b, ProductType.y } },//HCD often creates a-, aStar, and aDegree-ions and we should examine what other prominent algorithms do to see if that would benefit our search results
+            { DissociationType.AnyActivationType, new List<ProductType>{ ProductType.b, ProductType.y } },
+            { DissociationType.EThcD, new List<ProductType>{ ProductType.b, ProductType.y, ProductType.c, ProductType.zDot } },
+            { DissociationType.Custom, new List<ProductType>() },
+            { DissociationType.ISCID, new List<ProductType>() }
+        };
+
+        /// <summary>
+        /// Returns the product types of <paramref name="dissociationType"/> that belong to
+        /// <paramref name="fragmentationTerminus"/>. The list is computed once per pair and
+        /// the same cached instance is returned on later calls.
+        /// </summary>
+        public static List<ProductType> GetTerminusSpecificProductTypesFromDissociation(DissociationType dissociationType, FragmentationTerminus fragmentationTerminus)
+        {
+            // ConcurrentDictionary makes the lock-free read path safe; a plain Dictionary
+            // must not be read while another thread is adding to it.
+            return TerminusSpecificProductTypesFromDissociation.GetOrAdd(
+                (dissociationType, fragmentationTerminus),
+                key => TerminusSpecificProductTypes.ProductIonTypesFromSpecifiedTerminus[key.Item2]
+                    .Intersect(DissociationTypeCollection.ProductsFromDissociationType[key.Item1])
+                    .ToList());
+        }
+
+        /// <summary>
+        /// Returns the water-loss and ammonia-loss product types considered for the given
+        /// dissociation type and terminus; empty for dissociation types not listed below.
+        /// </summary>
+        public static List<ProductType> GetWaterAndAmmoniaLossProductTypesFromDissociation(DissociationType dissociationType, FragmentationTerminus fragmentationTerminus)
+        {
+            List<ProductType> productList = new();
+
+            switch (dissociationType)
+            {
+                case DissociationType.CID:
+                case DissociationType.IRMPD:
+                case DissociationType.HCD:
+                case DissociationType.AnyActivationType:
+                case DissociationType.EThcD:
+                    if (fragmentationTerminus == FragmentationTerminus.N || fragmentationTerminus == FragmentationTerminus.Both)
+                    {
+                        productList.Add(ProductType.bWaterLoss);
+                        productList.Add(ProductType.bAmmoniaLoss);
+                    }
+                    if (fragmentationTerminus == FragmentationTerminus.C || fragmentationTerminus == FragmentationTerminus.Both)
+                    {
+                        productList.Add(ProductType.yWaterLoss);
+                        productList.Add(ProductType.yAmmoniaLoss);
+                    }
+                    break;
+                case DissociationType.ECD:
+                case DissociationType.ETD:
+                    if (fragmentationTerminus == FragmentationTerminus.C || fragmentationTerminus == FragmentationTerminus.Both)
+                    {
+                        productList.Add(ProductType.yWaterLoss);
+                        productList.Add(ProductType.yAmmoniaLoss);
+                    }
+                    break;
+                default:
+                    break;
+            }
+            return productList;
+        }
+
+        // Caches. All three are filled lazily and may be hit from several threads at once.
+        private static readonly ConcurrentDictionary<(DissociationType, FragmentationTerminus), List<ProductType>> TerminusSpecificProductTypesFromDissociation
+            = new ConcurrentDictionary<(DissociationType, FragmentationTerminus), List<ProductType>>();
+
+        private static readonly ConcurrentDictionary<ProductType, double> NeutralMassShiftFromProductType
+            = new ConcurrentDictionary<ProductType, double>();
+
+        private static readonly ConcurrentDictionary<DissociationType, (double[], double[])> DissociationTypeToTerminusMassShift
+            = new ConcurrentDictionary<DissociationType, (double[], double[])>();
+
+        /// <summary>
+        /// This function is used in performance-critical functions, such as fragmenting peptides. The first double array is the N-terminal mass shifts for
+        /// the given dissociation type; the second array is the C-terminal mass shifts.
+        /// </summary>
+        public static (double[], double[]) GetNAndCTerminalMassShiftsForDissociationType(DissociationType dissociationType)
+        {
+            return DissociationTypeToTerminusMassShift.GetOrAdd(
+                dissociationType,
+                type => (
+                    GetTerminusSpecificProductTypesFromDissociation(type, FragmentationTerminus.N).Select(p => GetMassShiftFromProductType(p)).ToArray(),
+                    GetTerminusSpecificProductTypesFromDissociation(type, FragmentationTerminus.C).Select(p => GetMassShiftFromProductType(p)).ToArray()));
+        }
+
+        /// <summary>
+        /// Returns the neutral mass shift of <paramref name="productType"/>; the value is
+        /// computed from its chemical formula on first use and cached.
+        /// </summary>
+        /// <exception cref="MzLibUtil.MzLibException">The product type is not a known one.</exception>
+        public static double GetMassShiftFromProductType(ProductType productType)
+        {
+            return NeutralMassShiftFromProductType.GetOrAdd(productType, ComputeMassShiftFromProductType);
+        }
+
+        // Computes the shift from the formula; kept lazy so no formula is parsed until first needed.
+        private static double ComputeMassShiftFromProductType(ProductType productType)
+        {
+            switch (productType)
+            {
+                case ProductType.a: return ChemicalFormula.ParseFormula("C-1O-1").MonoisotopicMass; // -C -O
+                case ProductType.aStar: return ChemicalFormula.ParseFormula("C-1O-1N-1H-3").MonoisotopicMass; // -C -O -N -H3
+                case ProductType.aDegree: return ChemicalFormula.ParseFormula("C-1O-2H-2").MonoisotopicMass; // -46.0054793036,-C -O2 -H2
+                case ProductType.b: return 0; // 0, no change
+                case ProductType.bAmmoniaLoss: return ChemicalFormula.ParseFormula("N-1H-3").MonoisotopicMass; // -17.02654910112, -N -H3
+                case ProductType.bWaterLoss: return ChemicalFormula.ParseFormula("H-2O-1").MonoisotopicMass; // -18.01056468403, -H2 -O1
+                case ProductType.c: return ChemicalFormula.ParseFormula("N1H3").MonoisotopicMass; // 17.02654910112, +N1 +H3
+                case ProductType.x: return ChemicalFormula.ParseFormula("C1O2").MonoisotopicMass; // 43.98982923914, +C1 +O2
+                case ProductType.y: return ChemicalFormula.ParseFormula("H2O1").MonoisotopicMass; // 18.01056468403, +O +H2
+                case ProductType.yAmmoniaLoss: return ChemicalFormula.ParseFormula("O1H-1N-1").MonoisotopicMass; // 0.98401558291000057, +O -H -N
+                case ProductType.yWaterLoss: return 0; // 0, no change
+                case ProductType.zDot: return ChemicalFormula.ParseFormula("O1N-1H-1").MonoisotopicMass + Constants.ElectronMass + Constants.ProtonMass; // 1.991840552567, +O -NH + e- + p+
+                case ProductType.zPlusOne: return ChemicalFormula.ParseFormula("O1H1N-1").MonoisotopicMass; // 2.9996656473699996, +O +H -N: A Zdot ion is also known as z+1. It is not a z-ion in the Biemann nomenclature. It differs from a y-ion by N-1 H-1;
+                case ProductType.M: return 0; // neutral Molecular product can be used with neutral loss as fragment
+                case ProductType.D: return 0; // diagnostic ions are not shifted but added summarily
+                case ProductType.Ycore: return 0; // neutral Molecular product can be used with neutral loss as fragment
+                case ProductType.Y: return 0; // diagnostic ions are not shifted but added summarily
+                default: throw new MzLibUtil.MzLibException("Unknown product type!");
+            }
+        }
+
+        /// <summary>
+        /// Adds the product type's mass shift to <paramref name="mass"/> and rounds to 9 decimal places.
+        /// </summary>
+        public static double ProductTypeSpecificFragmentNeutralMass(double mass, ProductType p)
+        {
+            return (double)ClassExtensions.RoundedDouble(mass + GetMassShiftFromProductType(p), 9);
+        }
+    }
+}
\ No newline at end of file
diff --git a/mzLib/MassSpectrometry/Proteomics/Fragmentation/FragmentationTerminus.cs
b/mzLib/MassSpectrometry/Proteomics/Fragmentation/FragmentationTerminus.cs
new file mode 100644
index 000000000..84e8958f8
--- /dev/null
+++ b/mzLib/MassSpectrometry/Proteomics/Fragmentation/FragmentationTerminus.cs
@@ -0,0 +1,10 @@
+namespace Proteomics.Fragmentation
+{
+    public enum FragmentationTerminus
+    {
+        Both, //N- and C-terminus
+        N, //N-terminus only
+        C, //C-terminus only
+        None //used for internal fragments, could be used for top down intact mass?
+    }
+}
\ No newline at end of file
diff --git a/mzLib/MassSpectrometry/Proteomics/Fragmentation/MatchedFragmentIon.cs b/mzLib/MassSpectrometry/Proteomics/Fragmentation/MatchedFragmentIon.cs
new file mode 100644
index 000000000..0f8ade525
--- /dev/null
+++ b/mzLib/MassSpectrometry/Proteomics/Fragmentation/MatchedFragmentIon.cs
@@ -0,0 +1,91 @@
+using System.Text;
+using Chemistry;
+
+namespace Proteomics.Fragmentation
+{
+    /// <summary>
+    /// An experimental peak (m/z, intensity, charge) paired with the theoretical product it was matched to.
+    /// </summary>
+    public class MatchedFragmentIon
+    {
+        public readonly Product NeutralTheoreticalProduct;
+        public readonly double Mz;
+        public readonly double Intensity;
+        public readonly int Charge;
+
+        /// <summary>
+        /// Constructs a new MatchedFragmentIon given information about a theoretical and an experimental fragment mass spectral peak
+        /// </summary>
+        public MatchedFragmentIon(ref Product neutralTheoreticalProduct, double experMz, double experIntensity, int charge)
+        {
+            NeutralTheoreticalProduct = neutralTheoreticalProduct;
+            Mz = experMz;
+            Intensity = experIntensity;
+            Charge = charge;
+        }
+
+        /// <summary>
+        /// Experimental neutral mass minus theoretical neutral mass.
+        /// </summary>
+        public double MassErrorDa
+        {
+            get
+            {
+                return Mz.ToMass(Charge) - NeutralTheoreticalProduct.NeutralMass;
+            }
+        }
+
+        /// <summary>
+        /// <see cref="MassErrorDa"/> relative to the theoretical neutral mass, in parts per million.
+        /// </summary>
+        public double MassErrorPpm
+        {
+            get
+            {
+                return (MassErrorDa / NeutralTheoreticalProduct.NeutralMass) * 1e6;
+            }
+        }
+
+        /// <summary>
+        /// The product's annotation followed by "+charge"; the product annotation is
+        /// wrapped in parentheses when it carries a neutral loss.
+        /// </summary>
+        public string Annotation
+        {
+            get
+            {
+                StringBuilder sb = new StringBuilder();
+
+                bool containsNeutralLoss = NeutralTheoreticalProduct.NeutralLoss != 0;
+
+                if (containsNeutralLoss)
+                {
+                    sb.Append("(");
+                }
+
+                sb.Append(NeutralTheoreticalProduct.Annotation);
+
+                if (containsNeutralLoss)
+                {
+                    sb.Append(")");
+                }
+
+                sb.Append("+");
+                sb.Append(Charge);
+
+                return sb.ToString();
+            }
+        }
+
+        /// <summary>
+        /// Summarizes a TheoreticalFragmentIon into a string for debug purposes
+        /// </summary>
+        public override string ToString()
+        {
+            // we add the blank space in the tostring because the values are treated like integers and looked up as index in the enum instead of being converted to just string and concatenated
+            return NeutralTheoreticalProduct.ProductType + "" + NeutralTheoreticalProduct.FragmentNumber + "+" + Charge + "\t;" + NeutralTheoreticalProduct.NeutralMass;
+        }
+
+        /// <summary>
+        /// Equal when the other object is a MatchedFragmentIon with an equal product, charge,
+        /// m/z and intensity. Returns false (rather than throwing) for null or other types.
+        /// </summary>
+        public override bool Equals(object obj)
+        {
+            return obj is MatchedFragmentIon other
+                && this.NeutralTheoreticalProduct.Equals(other.NeutralTheoreticalProduct)
+                && this.Charge == other.Charge
+                && this.Mz == other.Mz
+                && this.Intensity == other.Intensity;
+        }
+
+        public override int GetHashCode()
+        {
+            return Mz.GetHashCode();
+        }
+    }
+}
diff --git a/mzLib/MassSpectrometry/Proteomics/Fragmentation/Product.cs b/mzLib/MassSpectrometry/Proteomics/Fragmentation/Product.cs
new file mode 100644
index 000000000..f8de52f04
--- /dev/null
+++ b/mzLib/MassSpectrometry/Proteomics/Fragmentation/Product.cs
@@ -0,0 +1,97 @@
+using System.Text;
+
+namespace Proteomics.Fragmentation
+{
+    public struct Product
+    {
+        public readonly double NeutralMass;
+        public readonly ProductType ProductType;
+        public readonly double NeutralLoss;
+        public readonly FragmentationTerminus Terminus;
+        public readonly int FragmentNumber;
+        public readonly int AminoAcidPosition;
+        public readonly ProductType? SecondaryProductType; //used for internal fragment ions
+        public readonly int SecondaryFragmentNumber; //used for internal fragment ions
+
+        /// <summary>
+        /// A product is the individual neutral fragment from an MS dissociation. A fragmentation product here contains one of the two termini (N- or C-).
+        /// The ProductType describes where along the backbone the fragmentation occurred (e.g. b-, y-, c-, zdot-). The neutral loss mass (if any) that
+        /// occurred from a mod on the fragment is listed as a mass. Finally the neutral mass of the whole fragment is provided.
+        /// </summary>
+        public Product(ProductType productType, FragmentationTerminus terminus, double neutralMass,
+            int fragmentNumber, int aminoAcidPosition, double neutralLoss, ProductType? secondaryProductType = null, int secondaryFragmentNumber = 0)
+        {
+            NeutralMass = neutralMass;
+            ProductType = productType;
+            NeutralLoss = neutralLoss;
+            Terminus = terminus;
+            FragmentNumber = fragmentNumber;
+            AminoAcidPosition = aminoAcidPosition;
+            SecondaryProductType = secondaryProductType;
+            SecondaryFragmentNumber = secondaryFragmentNumber;
+        }
+
+        /// <summary>
+        /// Short label for the product, e.g. "b3", or "yIb[18-36]" for an internal fragment,
+        /// followed by "-loss" (two decimals) when the neutral loss is non-zero.
+        /// </summary>
+        public string Annotation
+        {
+            get
+            {
+                StringBuilder sb = new StringBuilder();
+
+                if (SecondaryProductType == null)
+                {
+                    sb.Append(ProductType);
+
+                    // for "normal" fragments this is just the fragment number (e.g., the 3 in the b3 ion)
+                    // for diagnostic ions, it's the m/z assuming z=1
+                    // (e.g., a diagnostic ion with neutral mass 100 Da will be reported as the D101 fragment)
+                    sb.Append(FragmentNumber);
+                }
+                else
+                {
+                    //internal fragment ion, annotation used here: 10.1007/s13361-015-1078-1
+                    //example: yIb[18-36]
+                    sb.Append(ProductType + "I" + SecondaryProductType.Value + "[" + FragmentNumber + "-" + SecondaryFragmentNumber + "]");
+                }
+                if (NeutralLoss != 0)
+                {
+                    sb.Append("-");
+                    sb.Append(NeutralLoss.ToString("F2"));
+                }
+
+                return sb.ToString();
+            }
+        }
+
+        /// <summary>
+        /// Summarizes a Product into a string for debug purposes
+        /// </summary>
+        public override string ToString()
+        {
+            if (SecondaryProductType == null)
+            {
+                return ProductType + "" + FragmentNumber + ";" + NeutralMass.ToString("F5") + "-" + string.Format("{0:0.##}", NeutralLoss);
+            }
+            else
+            {
+                return ProductType + "I" + SecondaryProductType.Value + "[" + FragmentNumber + "-" + SecondaryFragmentNumber + "]" + ";" + NeutralMass.ToString("F5") + "-" + string.Format("{0:0.##}", NeutralLoss);
+            }
+        }
+
+        /// <summary>
+        /// Equal when the other object is a Product with the same product type, neutral mass,
+        /// fragment number, neutral loss and secondary (internal-fragment) type and number.
+        /// Returns false (rather than throwing) for null or other types.
+        /// </summary>
+        public override bool Equals(object obj)
+        {
+            return obj is Product other
+                && this.ProductType == other.ProductType
+                && this.NeutralMass == other.NeutralMass
+                && this.FragmentNumber == other.FragmentNumber
+                && this.NeutralLoss == other.NeutralLoss
+                && this.SecondaryFragmentNumber == other.SecondaryFragmentNumber
+                && this.SecondaryProductType == other.SecondaryProductType;
+        }
+
+        public override int GetHashCode()
+        {
+            return NeutralMass.GetHashCode();
+        }
+    }
+}
\ No newline at end of file
diff --git a/mzLib/MassSpectrometry/Proteomics/Fragmentation/ProductType.cs b/mzLib/MassSpectrometry/Proteomics/Fragmentation/ProductType.cs
new file mode 100644
index 000000000..10e9bcbab
--- /dev/null
+++ b/mzLib/MassSpectrometry/Proteomics/Fragmentation/ProductType.cs
@@ -0,0 +1,42 @@
+namespace Proteomics.Fragmentation
+{
+    public enum ProductType
+    {
+        //Ion Type Neutral Mr
+        //a [N]+[M]-CHO
+        //a* a-NH3
+        //a° a-H2O
+        //b [N]+[M]-H
+        //b* b-NH3
+        //b° b-H2O
+        //c [N]+[M]+NH2
+        //d a – partial side chain
+        //v y – complete side chain
+        //w z – partial side chain
+        //x [C]+[M]+CO-H
+        //y [C]+[M]+H
+        //y* y-NH3
+        //y° y-H2O
+        //z [C]+[M]-NH2
+
+        a,
+        aStar,
+        aDegree,
+        b,
+        bAmmoniaLoss,
+        bWaterLoss,
+        //BnoB1ions,
+        c,
+        x,
+        y,
+        yAmmoniaLoss,
+        yWaterLoss,
+        zPlusOne,//This is zDot plus H
+        zDot,
+        M, //this is the molecular ion // [M]
+        D, //this is a diagnostic ion // Modification loss mass
+        Ycore, //Glyco core Y ions // [pep] + Neutral core Glycan mass (such as: [pep] + [N]) //Which already consider the loss of H2O and H-transfer
+        Y //Glyco Y ions // [pep] + other Glycan mass
+    }
+
+}
\ No newline at end of file
diff --git a/mzLib/MassSpectrometry/Proteomics/Fragmentation/TerminusSpecificProductTypes.cs b/mzLib/MassSpectrometry/Proteomics/Fragmentation/TerminusSpecificProductTypes.cs
new file mode 100644
index 000000000..543ef482f
--- /dev/null
+++ b/mzLib/MassSpectrometry/Proteomics/Fragmentation/TerminusSpecificProductTypes.cs
@@ -0,0 +1,33 @@
using System.Collections.Generic;

namespace Proteomics.Fragmentation
{
    /// <summary>
    /// Lookup tables relating product (ion) types to fragmentation termini.
    /// </summary>
    public class TerminusSpecificProductTypes
    {
        /// <summary>
        /// For each terminus setting, the product types listed for it.
        /// The "Both" entry is the N-terminal list followed by the C-terminal list; "None" is empty.
        /// </summary>
        public static Dictionary<FragmentationTerminus, List<ProductType>> ProductIonTypesFromSpecifiedTerminus = new Dictionary<FragmentationTerminus, List<ProductType>>
        {
            // all ion types that include the N-terminus
            {
                FragmentationTerminus.N,
                new List<ProductType>
                {
                    ProductType.a, ProductType.aDegree, ProductType.aStar, ProductType.b,
                    ProductType.bWaterLoss, ProductType.bAmmoniaLoss, ProductType.c
                }
            },
            // all ion types that include the C-terminus
            {
                FragmentationTerminus.C,
                new List<ProductType>
                {
                    ProductType.x, ProductType.y, ProductType.yWaterLoss, ProductType.yAmmoniaLoss,
                    ProductType.zDot, ProductType.zPlusOne
                }
            },
            {
                FragmentationTerminus.Both,
                new List<ProductType>
                {
                    ProductType.a, ProductType.aDegree, ProductType.aStar, ProductType.b,
                    ProductType.bWaterLoss, ProductType.bAmmoniaLoss, ProductType.c,
                    ProductType.x, ProductType.y, ProductType.yWaterLoss, ProductType.yAmmoniaLoss,
                    ProductType.zDot, ProductType.zPlusOne
                }
            },
            { FragmentationTerminus.None, new List<ProductType>() }
        };

        /// <summary>
        /// Terminus assigned to each terminus-specific product type.
        /// Product types not listed here (M, D, Ycore, Y) have no entry, so use TryGetValue when the key may be absent.
        /// </summary>
        public static Dictionary<ProductType, FragmentationTerminus> ProductTypeToFragmentationTerminus = new Dictionary<ProductType, FragmentationTerminus>
        {
            { ProductType.a, FragmentationTerminus.N },
            { ProductType.aDegree, FragmentationTerminus.N },
            { ProductType.aStar, FragmentationTerminus.N },
            { ProductType.b, FragmentationTerminus.N },
            { ProductType.bWaterLoss, FragmentationTerminus.N },
            { ProductType.bAmmoniaLoss, FragmentationTerminus.N },
            { ProductType.c, FragmentationTerminus.N },
            { ProductType.x, FragmentationTerminus.C },
            { ProductType.y, FragmentationTerminus.C },
            { ProductType.yWaterLoss, FragmentationTerminus.C },
            { ProductType.yAmmoniaLoss, FragmentationTerminus.C },
            { ProductType.zDot, FragmentationTerminus.C },
            { ProductType.zPlusOne, FragmentationTerminus.C },
        };
    }
}
b/mzLib/MassSpectrometry/Proteomics/Modifications/ModLocationOnPeptideOrProtein.cs @@ -0,0 +1,11 @@ +namespace Proteomics +{ + public enum ModLocationOnPeptideOrProtein + { + NPep, + PepC, + NProt, + ProtC, + Any + } +} \ No newline at end of file diff --git a/mzLib/MassSpectrometry/Proteomics/Modifications/Modification.cs b/mzLib/MassSpectrometry/Proteomics/Modifications/Modification.cs new file mode 100644 index 000000000..711aaf32b --- /dev/null +++ b/mzLib/MassSpectrometry/Proteomics/Modifications/Modification.cs @@ -0,0 +1,303 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using Chemistry; +using MassSpectrometry; + +namespace Proteomics +{ + public class Modification + { + public string IdWithMotif { get; private set; } + public string OriginalId { get; private set; } + public string Accession { get; private set; } + public string ModificationType { get; private set; } + public string FeatureType { get; private set; } + public ModificationMotif Target { get; private set; } + public string LocationRestriction { get; private set; } + public ChemicalFormula ChemicalFormula { get; private set; } + private double? monoisotopicMass = null; + + public double? MonoisotopicMass + { + get + { + return ClassExtensions.RoundedDouble(monoisotopicMass); + } + private set + { + monoisotopicMass = value; + } + } + + public Dictionary> DatabaseReference { get; private set; } + public Dictionary> TaxonomicRange { get; private set; } + public List Keywords { get; private set; } + public Dictionary> NeutralLosses { get; private set; } + public Dictionary> DiagnosticIons { get; private set; } + public string FileOrigin { get; private set; } + protected const double tolForEquality = 1e-9; + + public bool ValidModification + { + get + { + return this.IdWithMotif != null + && (this.ChemicalFormula != null || this.MonoisotopicMass != null) + && this.Target != null + && this.LocationRestriction != "Unassigned." 
+ && this.ModificationType != null + && this.FeatureType != "CROSSLINK" + && !this.ModificationType.Contains(':'); + } + } + + public Modification(string _originalId = null, string _accession = null, string _modificationType = null, string _featureType = null, + ModificationMotif _target = null, string _locationRestriction = "Unassigned.", ChemicalFormula _chemicalFormula = null, + double? _monoisotopicMass = null, Dictionary> _databaseReference = null, + Dictionary> _taxonomicRange = null, List _keywords = null, + Dictionary> _neutralLosses = null, Dictionary> _diagnosticIons = null, + string _fileOrigin = null) + { + if (_originalId != null) + { + if (_originalId.Contains(" on ")) + { + this.IdWithMotif = _originalId; + this.OriginalId = _originalId.Split(new[] { " on " }, StringSplitOptions.None)[0]; + } + else if (_originalId.Contains(" of ")) + { + this.IdWithMotif = _originalId.Replace(" of ", " on "); + this.OriginalId = _originalId.Split(new[] { " of ", " on " }, StringSplitOptions.None)[0]; + } + else if (_target != null) + { + this.IdWithMotif = _originalId + " on " + _target.ToString(); + this.OriginalId = _originalId; + } + else + { + this.OriginalId = _originalId; + } + } + + this.Accession = _accession; + this.ModificationType = _modificationType; + this.FeatureType = _featureType; + this.Target = _target; + this.LocationRestriction = ModLocationOnPeptideOrProtein(_locationRestriction); + this.ChemicalFormula = _chemicalFormula; + this.MonoisotopicMass = _monoisotopicMass; + this.DatabaseReference = _databaseReference; + this.TaxonomicRange = _taxonomicRange; + this.Keywords = _keywords; + this.NeutralLosses = _neutralLosses; + this.DiagnosticIons = _diagnosticIons; + this.FileOrigin = _fileOrigin; + + if (this.MonoisotopicMass == null && this.ChemicalFormula != null) + { + this.MonoisotopicMass = this.ChemicalFormula.MonoisotopicMass; + } + } + + public static string ModLocationOnPeptideOrProtein(string _locationRestriction) + { + switch 
/// <summary>
/// Two modifications are equal when IdWithMotif, OriginalId and ModificationType match and their
/// monoisotopic masses are either identical (including both null) or differ by less than tolForEquality.
/// </summary>
/// <param name="o">Object to compare with; null or a non-Modification yields false.</param>
/// <returns>True when <paramref name="o"/> is an equivalent Modification.</returns>
public override bool Equals(object o)
{
    // Test the cast result, not the raw argument: a non-null object of another type
    // must compare unequal rather than be dereferenced through a null reference.
    if (!(o is Modification m))
    {
        return false;
    }

    if (IdWithMotif != m.IdWithMotif
        || OriginalId != m.OriginalId
        || ModificationType != m.ModificationType)
    {
        return false;
    }

    double? thisMass = MonoisotopicMass;
    double? otherMass = m.MonoisotopicMass;

    // Lifted == is true when both are null or both hold the same value.
    if (thisMass == otherMass)
    {
        return true;
    }

    return thisMass != null
        && otherMass != null
        && Math.Abs((double)otherMass - (double)thisMass) < tolForEquality;
}

/// <summary>
/// Hash built from the id (IdWithMotif, falling back to OriginalId, then empty) and the modification type.
/// </summary>
public override int GetHashCode()
{
    string id = IdWithMotif ?? OriginalId ?? string.Empty;
    string mt = ModificationType ?? string.Empty;
    return id.GetHashCode() ^ mt.GetHashCode();
}
List myValues = new List(this.DatabaseReference[myKey]); + myValues.Sort(); + foreach (string myValue in myValues) + { + sb.AppendLine("DR " + myKey + "; " + myValue); + } + } + } + } + if (this.TaxonomicRange != null) + { + if (this.TaxonomicRange.Count != 0) + { + List myKeys = new List(this.TaxonomicRange.Keys); + myKeys.Sort(); + foreach (string myKey in myKeys) + { + List myValues = new List(this.TaxonomicRange[myKey]); + myValues.Sort(); + foreach (string myValue in myValues) + { + sb.AppendLine("TR " + myKey + "; " + myValue); + } + } + } + } + if (this.NeutralLosses != null) + { + if (this.NeutralLosses.Count != 0) + { + List allDissociationTypes = this.NeutralLosses.Keys.ToList(); + allDissociationTypes.Sort(); + + foreach (DissociationType dissociationType in allDissociationTypes) + { + StringBuilder myLine = new StringBuilder(); + myLine.Append("NL "); + + List myValues = new List(this.NeutralLosses[dissociationType]); + myValues.Sort(); + for (int i = 0; i < myValues.Count; i++) + { + myLine.Append(dissociationType + ":" + ClassExtensions.RoundedDouble(myValues[i])); + if (i < myValues.Count - 1) + myLine.Append(" or "); + } + + sb.AppendLine(myLine.ToString()); + } + } + } + if (this.DiagnosticIons != null) + { + if (this.DiagnosticIons.Count != 0) + { + List allDissociationTypes = this.DiagnosticIons.Keys.ToList(); + allDissociationTypes.Sort(); + + foreach (DissociationType dissociationType in allDissociationTypes) + { + StringBuilder myLine = new StringBuilder(); + myLine.Append("DI "); + + List myValues = new List(this.DiagnosticIons[dissociationType]); + myValues.Sort(); + for (int i = 0; i < myValues.Count; i++) + { + myLine.Append(dissociationType + ":" + ClassExtensions.RoundedDouble(myValues[i])); + if (i < myValues.Count - 1) + myLine.Append(" or "); + } + + sb.AppendLine(myLine.ToString()); + } + } + } + + if (this.Keywords != null) + { + if (this.Keywords.Count != 0) + { + sb.AppendLine("KW " + String.Join(" or ", 
/// <summary>
/// Reports errors in required fields: the text of ToString() followed by one '#'-prefixed line
/// for each problem found, and finally a '#' line naming the file the modification came from.
/// </summary>
/// <returns>The modification text with the appended error report.</returns>
public string ModificationErrorsToString()
{
    StringBuilder report = new StringBuilder(ToString());

    if (IdWithMotif == null)
    {
        report.AppendLine("#Required field ID missing or malformed. Current value = " + IdWithMotif);
    }

    if (ModificationType == null)
    {
        report.AppendLine("#Required field MT missing or malformed. Current value = " + ModificationType);
    }

    // NOTE(review): the constructor visible in this file always stores a non-null LocationRestriction
    // ("Unassigned." when unrecognized), so this branch looks unreachable — confirm the intended check.
    if (LocationRestriction == null)
    {
        report.AppendLine("#Required field PP missing or malformed. Current value = " + LocationRestriction + ".");
    }

    bool hasNoMassInformation = ChemicalFormula == null && MonoisotopicMass == null;
    if (hasNoMassInformation)
    {
        report.AppendLine("#Required fields CF and MM are both missing or malformed. One of those two fields must be provided.");
    }

    if (ModificationType != null && ModificationType.Contains(':'))
    {
        report.AppendLine("#Modification type cannot contain ':'!");
    }

    report.Append("#This modification can be found in file " + FileOrigin);

    return report.ToString();
}
+ var motif = attemptToLocalize.Target; + var motifStartLocation = motif.ToString().IndexOf(motif.ToString().First(b => char.IsUpper(b))); + + // Look up starting at and including the capital letter + var proteinToMotifOffset = proteinOneBasedIndex - motifStartLocation - 1; + var indexUp = 0; + while (indexUp < motif.ToString().Length) + { + if (indexUp + proteinToMotifOffset < 0 || indexUp + proteinToMotifOffset >= proteinSequence.Length + || !MotifMatches(motif.ToString()[indexUp], proteinSequence[indexUp + proteinToMotifOffset])) + { + return false; + } + indexUp++; + } + if (attemptToLocalize.LocationRestriction == "N-terminal." && proteinOneBasedIndex > 2) + { + return false; + } + if (attemptToLocalize.LocationRestriction == "Peptide N-terminal." && peptideOneBasedIndex > 1) + { + return false; + } + if (attemptToLocalize.LocationRestriction == "C-terminal." && proteinOneBasedIndex < proteinSequence.Length) + { + return false; + } + if (attemptToLocalize.LocationRestriction == "Peptide C-terminal." && peptideOneBasedIndex < peptideLength) + { + return false; + } + + // I guess Anywhere. and Unassigned. are true since how do you localize anywhere or unassigned. 
/// <summary>
/// Reports whether the protein already carries, at the given one-based position, a modification of
/// type "UniProt" whose monoisotopic mass is within 0.001 of the candidate's mass.
/// </summary>
/// <param name="protein">Protein whose localized modifications are consulted.</param>
/// <param name="i">One-based position used as the lookup key.</param>
/// <param name="attemptToLocalize">Candidate modification being placed.</param>
/// <returns>
/// True when a matching UniProt modification exists; false when the position has no modifications,
/// no entry matches, or a mass needed for the comparison is missing.
/// </returns>
public static bool UniprotModExists(Protein protein, int i, Modification attemptToLocalize)
{
    // uniprot mods with same mass takes precedence over variable mods
    if (!protein.OneBasedPossibleLocalizedModifications.TryGetValue(i, out var modsAtThisLocation))
    {
        return false;
    }

    // A modification may have no mass at all; subtracting and casting a null double? would throw.
    double? candidateMass = attemptToLocalize.MonoisotopicMass;
    if (!candidateMass.HasValue)
    {
        return false;
    }

    return modsAtThisLocation.Any(p =>
        p.ModificationType == "UniProt"
        && p.MonoisotopicMass is double annotatedMass
        && Math.Abs(annotatedMass - candidateMass.Value) < 0.001);
}

/// <summary>
/// Tests one motif character (case-insensitively) against one sequence character.
/// 'X' matches anything; 'B' matches D or N; 'J' matches I or L; 'Z' matches E or Q;
/// any other motif character must equal the sequence character.
/// </summary>
private static bool MotifMatches(char motifChar, char sequenceChar)
{
    char upperMotifChar = char.ToUpper(motifChar);

    if (upperMotifChar == 'X' || upperMotifChar == sequenceChar)
    {
        return true;
    }

    switch (upperMotifChar)
    {
        case 'B':
            return sequenceChar == 'D' || sequenceChar == 'N';

        case 'J':
            return sequenceChar == 'I' || sequenceChar == 'L';

        case 'Z':
            return sequenceChar == 'E' || sequenceChar == 'Q';

        default:
            return false;
    }
}
/// <summary>
/// Silac labels used to modify unlabeled proteins
/// </summary>
public class SilacLabel
{
    public char OriginalAminoAcid { get; private set; }
    public char AminoAcidLabel { get; private set; }
    public string LabelChemicalFormula { get; private set; }

    /// <summary>
    /// Mass difference as text with three decimals and '.' as the decimal separator;
    /// values greater than zero carry a leading '+'.
    /// </summary>
    public string MassDifference { get; private set; }

    /// <summary>
    /// Further labels attached through <see cref="AddAdditionalSilacLabel"/>; null until the first one is added.
    /// </summary>
    public List<SilacLabel> AdditionalLabels { get; private set; }

    /// <summary>
    /// Creates a label and stores the mass difference as culture-independent text.
    /// </summary>
    /// <param name="originalAminoAcid">Stored as <see cref="OriginalAminoAcid"/>.</param>
    /// <param name="aminoAcidLabel">Stored as <see cref="AminoAcidLabel"/>.</param>
    /// <param name="labelChemicalFormula">Stored as <see cref="LabelChemicalFormula"/>.</param>
    /// <param name="massDifference">Rounded to three decimals and stored as <see cref="MassDifference"/>.</param>
    public SilacLabel(char originalAminoAcid, char aminoAcidLabel, string labelChemicalFormula, double massDifference)
    {
        OriginalAminoAcid = originalAminoAcid;
        AminoAcidLabel = aminoAcidLabel;
        LabelChemicalFormula = labelChemicalFormula;

        // Format with the invariant culture: ConvertMassDifferenceToDouble parses with the invariant culture,
        // so a culture-specific decimal comma here would be read back as a thousands separator.
        MassDifference = Math.Round(massDifference, 3).ToString("F3", CultureInfo.InvariantCulture);
        if (massDifference > 0)//if not negative, add a plus
        {
            MassDifference = "+" + MassDifference;
        }
    }

    /// <summary>
    /// Appends a label to <see cref="AdditionalLabels"/>, creating the list on first use.
    /// </summary>
    public void AddAdditionalSilacLabel(SilacLabel label)
    {
        AdditionalLabels ??= new List<SilacLabel>();
        AdditionalLabels.Add(label);
    }

    /// <summary>
    /// This method exists for conversion of Silac labels, which take double inputs
    /// Although a double object could be saved, it clutters tomls
    /// </summary>
    /// <returns>The numeric value of <see cref="MassDifference"/>, including its sign.</returns>
    public double ConvertMassDifferenceToDouble()
    {
        // The parser understands a leading '+' or '-' itself; parsing the whole string also keeps
        // the first digit of an unsigned value such as "0.000" or a hand-written "8.014".
        return double.Parse(MassDifference, CultureInfo.InvariantCulture);
    }

    /// this parameterless constructor needs to exist to read the toml.
    /// if you can figure out a way to get rid of it, feel free...
    /// this is also encountered in MetaMorpheus's "CommonParameters.cs" if you find a solution.
    public SilacLabel()
    {
    }
}
/// <summary>
/// Two database references are equal when their Type and Id strings match and their
/// properties contain the same entries regardless of order.
/// </summary>
/// <param name="obj">Object to compare with; null or a non-DatabaseReference yields false.</param>
/// <returns>True when <paramref name="obj"/> is an equivalent DatabaseReference.</returns>
public override bool Equals(object obj)
{
    // Test the typed reference, not the raw argument: a non-null object of another type
    // must compare unequal rather than be dereferenced through a null reference.
    return obj is DatabaseReference d
        && string.Equals(d.Type, Type)
        && string.Equals(d.Id, Id)
        && d.Properties.OrderBy(x => x).SequenceEqual(Properties.OrderBy(x => x));
}

/// <summary>
/// Hash built from the Type and Id strings.
/// </summary>
public override int GetHashCode()
{
    return Type.GetHashCode() ^ Id.GetHashCode(); // null handled in constructor
}
+ public DisulfideBond(int OneBasedPosition, string Description) + : this(OneBasedPosition, OneBasedPosition, Description) + { } + + /// + /// Beginning position of disulfide bond + /// + public int OneBasedBeginPosition { get; set; } + + /// + /// End position of disulfide bond + /// + public int OneBasedEndPosition { get; set; } + + /// + /// Description of this variation (optional) + /// + public string Description { get; set; } + + public override bool Equals(object obj) + { + DisulfideBond bond = obj as DisulfideBond; + return bond != null + && bond.OneBasedBeginPosition == OneBasedBeginPosition + && bond.OneBasedEndPosition == OneBasedEndPosition + && bond.Description == Description; + } + + public override int GetHashCode() + { + return OneBasedBeginPosition + ^ OneBasedEndPosition + ^ Description.GetHashCode(); // null handled in constructor + } + } +} \ No newline at end of file diff --git a/mzLib/MassSpectrometry/Proteomics/Protein/Protein.cs b/mzLib/MassSpectrometry/Proteomics/Protein/Protein.cs new file mode 100644 index 000000000..7ec98d9e4 --- /dev/null +++ b/mzLib/MassSpectrometry/Proteomics/Protein/Protein.cs @@ -0,0 +1,832 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text.RegularExpressions; +using Proteomics.Fragmentation; +using Proteomics; +using Proteomics.ProteolyticDigestion; + +namespace Proteomics +{ + public class Protein + { + private List _proteolysisProducts; + + /// + /// Protein. Filters out modifications that do not match their amino acid target site. + /// + /// Base sequence of the protein. + /// Unique accession for the protein. + /// Organism with this protein. + /// List of gene names as tuple of (nameType, name), e.g. (primary, HLA-A) + /// Modifications at positions along the sequence. 
+ /// + /// + /// + /// + /// + /// + /// + /// + /// + /// + public Protein(string sequence, string accession, string organism = null, List> geneNames = null, + IDictionary> oneBasedModifications = null, List proteolysisProducts = null, + string name = null, string fullName = null, bool isDecoy = false, bool isContaminant = false, List databaseReferences = null, + List sequenceVariations = null, List appliedSequenceVariations = null, string sampleNameForVariants = null, + List disulfideBonds = null, List spliceSites = null, string databaseFilePath = null, bool addTruncations = false) + { + // Mandatory + BaseSequence = sequence; + NonVariantProtein = this; + Accession = accession; + + Name = name; + Organism = organism; + FullName = fullName; + IsDecoy = isDecoy; + IsContaminant = isContaminant; + DatabaseFilePath = databaseFilePath; + SampleNameForVariants = sampleNameForVariants; + + GeneNames = geneNames ?? new List>(); + _proteolysisProducts = proteolysisProducts ?? new List(); + SequenceVariations = sequenceVariations ?? new List(); + AppliedSequenceVariations = appliedSequenceVariations ?? new List(); + OriginalNonVariantModifications = oneBasedModifications ?? new Dictionary>(); + if (oneBasedModifications != null) + { + OneBasedPossibleLocalizedModifications = SelectValidOneBaseMods(oneBasedModifications); + } + else + { + OneBasedPossibleLocalizedModifications = new Dictionary>(); + } + DatabaseReferences = databaseReferences ?? new List(); + DisulfideBonds = disulfideBonds ?? new List(); + SpliceSites = spliceSites ?? 
new List(); + + if (addTruncations) + { + this.AddTruncations(); + } + } + + /// + /// Protein construction that clones a protein but assigns a different base sequence + /// For use in SILAC experiments + /// + /// + /// + /// + public Protein(Protein originalProtein, string silacSequence) + { + BaseSequence = silacSequence; + Accession = originalProtein.Accession; + NonVariantProtein = originalProtein.NonVariantProtein; + Name = originalProtein.Name; + Organism = originalProtein.Organism; + FullName = originalProtein.FullName; + IsDecoy = originalProtein.IsDecoy; + IsContaminant = originalProtein.IsContaminant; + DatabaseFilePath = originalProtein.DatabaseFilePath; + SampleNameForVariants = originalProtein.SampleNameForVariants; + GeneNames = originalProtein.GeneNames; + _proteolysisProducts = originalProtein._proteolysisProducts; + SequenceVariations = originalProtein.SequenceVariations; + AppliedSequenceVariations = originalProtein.AppliedSequenceVariations; + OriginalNonVariantModifications = originalProtein.OriginalNonVariantModifications; + OneBasedPossibleLocalizedModifications = originalProtein.OneBasedPossibleLocalizedModifications; + DatabaseReferences = originalProtein.DatabaseReferences; + DisulfideBonds = originalProtein.DisulfideBonds; + SpliceSites = originalProtein.SpliceSites; + DatabaseFilePath = originalProtein.DatabaseFilePath; + } + + /// + /// Protein construction with applied variations + /// + /// + /// + /// + /// + /// + /// + public Protein(string variantBaseSequence, Protein protein, IEnumerable appliedSequenceVariations, + IEnumerable applicableProteolysisProducts, IDictionary> oneBasedModifications, string sampleNameForVariants) + : this(variantBaseSequence, + VariantApplication.GetAccession(protein, appliedSequenceVariations), + organism: protein.Organism, + geneNames: new List>(protein.GeneNames), + oneBasedModifications: oneBasedModifications != null ? 
oneBasedModifications.ToDictionary(x => x.Key, x => x.Value) : new Dictionary>(), + proteolysisProducts: new List(applicableProteolysisProducts ?? new List()), + name: GetName(appliedSequenceVariations, protein.Name), + fullName: GetName(appliedSequenceVariations, protein.FullName), + isDecoy: protein.IsDecoy, + isContaminant: protein.IsContaminant, + databaseReferences: new List(protein.DatabaseReferences), + sequenceVariations: new List(protein.SequenceVariations), + disulfideBonds: new List(protein.DisulfideBonds), + spliceSites: new List(protein.SpliceSites), + databaseFilePath: protein.DatabaseFilePath) + { + NonVariantProtein = protein.NonVariantProtein; + OriginalNonVariantModifications = NonVariantProtein.OriginalNonVariantModifications; + AppliedSequenceVariations = (appliedSequenceVariations ?? new List()).ToList(); + SampleNameForVariants = sampleNameForVariants; + } + + /// + /// Modifications (values) located at one-based protein positions (keys) + /// + public IDictionary> OneBasedPossibleLocalizedModifications { get; private set; } + + /// + /// The list of gene names consists of tuples, where Item1 is the type of gene name, and Item2 is the name. There may be many genes and names of a certain type produced when reading an XML protein database. + /// + public IEnumerable> GeneNames { get; } + + /// + /// Unique accession for this protein. + /// + public string Accession { get; } + + /// + /// Base sequence, which may contain applied sequence variations. 
+ /// + public string BaseSequence { get; } + + public string Organism { get; } + public bool IsDecoy { get; } + public IEnumerable SequenceVariations { get; } + public IEnumerable DisulfideBonds { get; } + public IEnumerable SpliceSites { get; } + + //TODO: Generate all the proteolytic products as distinct proteins during XML reading and delete the ProteolysisProducts parameter + public IEnumerable ProteolysisProducts + { get { return _proteolysisProducts; } } + + public IEnumerable DatabaseReferences { get; } + public string DatabaseFilePath { get; } + + /// + /// Protein before applying variations. + /// + public Protein NonVariantProtein { get; } + + /// + /// Sequence variations that have been applied to the base sequence. + /// + public List AppliedSequenceVariations { get; } + + /// + /// Sample name from which applied variants came, e.g. tumor or normal. + /// + public string SampleNameForVariants { get; } + + public double Probability { get; set; } // for protein pep project + + public int Length + { + get + { + return BaseSequence.Length; + } + } + + public string FullDescription + { + get + { + return Accession + "|" + Name + "|" + FullName; + } + } + + public string Name { get; } + public string FullName { get; } + public bool IsContaminant { get; } + internal IDictionary> OriginalNonVariantModifications { get; set; } + + public char this[int zeroBasedIndex] + { + get + { + return BaseSequence[zeroBasedIndex]; + } + } + + /// + /// Formats a string for a UniProt fasta header. See https://www.uniprot.org/help/fasta-headers. + /// Note that the db field isn't very applicable here, so mz is placed in to denote written by mzLib. + /// + public string GetUniProtFastaHeader() + { + var n = GeneNames.FirstOrDefault(); + string geneName = n == null ? 
/// <summary>
/// Gets peptides for digestion of a protein
/// TODO: Refactor to employ yield returns
/// </summary>
/// <param name="digestionParams">Digestion settings (protease, search mode, glycopeptide filters, fragmentation terminus).</param>
/// <param name="allKnownFixedModifications">Fixed modifications; null is treated as empty. The list passed in is not modified.</param>
/// <param name="variableModifications">Variable modifications; null is treated as empty.</param>
/// <param name="silacLabels">When not null, the result is expanded through GetSilacPeptides.</param>
/// <param name="turnoverLabels">Start/end labels forwarded to GetSilacPeptides.</param>
/// <param name="topDownTruncationSearch">Forwarded to the non-semi-specific digestion.</param>
/// <returns>The modified peptides produced for this protein.</returns>
public IEnumerable<PeptideWithSetModifications> Digest(DigestionParams digestionParams, List<Modification> allKnownFixedModifications,
    List<Modification> variableModifications, List<SilacLabel> silacLabels = null,
    (SilacLabel startLabel, SilacLabel endLabel)? turnoverLabels = null, bool topDownTruncationSearch = false)
{
    //can't be null
    allKnownFixedModifications ??= new List<Modification>();
    variableModifications ??= new List<Modification>();

    // add in any modifications that are caused by protease digestion
    // Work on a copy when the cleavage mod has to be added, so the caller's list
    // (which may be shared between many Digest calls) is never mutated.
    Modification cleavageMod = digestionParams.Protease.CleavageMod;
    if (cleavageMod != null && !allKnownFixedModifications.Contains(cleavageMod))
    {
        allKnownFixedModifications = new List<Modification>(allKnownFixedModifications) { cleavageMod };
    }

    CleavageSpecificity searchModeType = digestionParams.SearchModeType;

    ProteinDigestion digestion = new(digestionParams, allKnownFixedModifications, variableModifications);
    IEnumerable<ProteolyticPeptide> unmodifiedPeptides =
        searchModeType == CleavageSpecificity.Semi ?
        digestion.SpeedySemiSpecificDigestion(this) :
        digestion.Digestion(this, topDownTruncationSearch);

    if (digestionParams.KeepNGlycopeptide || digestionParams.KeepOGlycopeptide)
    {
        unmodifiedPeptides = GetGlycoPeptides(unmodifiedPeptides, digestionParams.KeepNGlycopeptide, digestionParams.KeepOGlycopeptide);
    }

    IEnumerable<PeptideWithSetModifications> modifiedPeptides = unmodifiedPeptides.SelectMany(peptide => peptide.GetModifiedPeptides(allKnownFixedModifications, digestionParams, variableModifications));

    //Remove terminal modifications (if needed)
    bool singleTerminusFragmentation =
        digestionParams.FragmentationTerminus == FragmentationTerminus.N
        || digestionParams.FragmentationTerminus == FragmentationTerminus.C;
    if (searchModeType == CleavageSpecificity.SingleN ||
        searchModeType == CleavageSpecificity.SingleC ||
        (searchModeType == CleavageSpecificity.None && singleTerminusFragmentation))
    {
        modifiedPeptides = RemoveTerminalModifications(modifiedPeptides, digestionParams.FragmentationTerminus, allKnownFixedModifications);
    }

    //add silac labels (if needed)
    if (silacLabels != null)
    {
        return GetSilacPeptides(modifiedPeptides, silacLabels, digestionParams.GeneratehUnlabeledProteinsForSilac, turnoverLabels);
    }

    return modifiedPeptides;
}
"C-terminal" : "N-terminal"; + List fixedTerminalMods = allFixedMods.Where(x => x.LocationRestriction.Contains(terminalStringToLookFor)).ToList(); + foreach (PeptideWithSetModifications pwsm in modifiedPeptides) + { + if (!pwsm.AllModsOneIsNterminus.Values.Any(x => x.LocationRestriction.Contains(terminalStringToLookFor) && !fixedTerminalMods.Contains(x))) + { + yield return pwsm; + } + } + } + + /// + /// Add additional peptides with SILAC amino acids + /// + internal IEnumerable GetSilacPeptides(IEnumerable originalPeptides, List silacLabels, bool generateUnlabeledProteins, (SilacLabel startLabel, SilacLabel endLabel)? turnoverLabels) + { + //if this is a multiplex experiment (pooling multiple samples, not a turnover), then only create the fully unlabeled/labeled peptides + if (turnoverLabels == null) + { + //unlabeled peptides + if (generateUnlabeledProteins) + { + foreach (PeptideWithSetModifications pwsm in originalPeptides) + { + yield return pwsm; + } + } + + //fully labeled peptides + foreach (SilacLabel label in silacLabels) + { + Protein silacProtein = GenerateFullyLabeledSilacProtein(label); + foreach (PeptideWithSetModifications pwsm in originalPeptides) + { + //duplicate the peptides with the updated protein sequence that contains only silac labels + yield return new PeptideWithSetModifications(silacProtein, pwsm.DigestionParams, pwsm.OneBasedStartResidueInProtein, pwsm.OneBasedEndResidueInProtein, pwsm.CleavageSpecificityForFdrCategory, pwsm.PeptideDescription, pwsm.MissedCleavages, pwsm.AllModsOneIsNterminus, pwsm.NumFixedMods); + } + } + } + else //if this is a turnover experiment, we want to be able to look for peptides containing mixtures of heavy and light amino acids (typically occurs for missed cleavages) + { + (SilacLabel startLabel, SilacLabel endLabel) turnoverLabelsValue = turnoverLabels.Value; + SilacLabel startLabel = turnoverLabelsValue.startLabel; + SilacLabel endLabel = turnoverLabelsValue.endLabel; + + //This allows you to move from 
one label to another (rather than unlabeled->labeled or labeled->unlabeled). Useful for when your lab is swimming in cash and you have stock in a SILAC company + if (startLabel != null && endLabel != null) //if neither the start nor end conditions are unlabeled, then generate fully labeled proteins using the "startLabel" (otherwise maintain the unlabeled) + { + Protein silacStartProtein = GenerateFullyLabeledSilacProtein(startLabel); + PeptideWithSetModifications[] originalPeptideArray = originalPeptides.ToArray(); + for (int i = 0; i < originalPeptideArray.Length; i++) + { + PeptideWithSetModifications pwsm = originalPeptideArray[i]; + //duplicate the peptides with the updated protein sequence that contains only silac labels + originalPeptideArray[i] = new PeptideWithSetModifications(silacStartProtein, pwsm.DigestionParams, pwsm.OneBasedStartResidueInProtein, pwsm.OneBasedEndResidueInProtein, pwsm.CleavageSpecificityForFdrCategory, pwsm.PeptideDescription, pwsm.MissedCleavages, pwsm.AllModsOneIsNterminus, pwsm.NumFixedMods); + } + originalPeptides = originalPeptideArray; + + //modify the end label amino acids to recognize the new "original" amino acid + //get the residues that were changed + List originalLabels = new List { startLabel }; + if (startLabel.AdditionalLabels != null) + { + originalLabels.AddRange(startLabel.AdditionalLabels); + } + SilacLabel startLabelWithSharedOriginalAminoAcid = originalLabels.Where(x => x.OriginalAminoAcid == endLabel.OriginalAminoAcid).FirstOrDefault(); + SilacLabel updatedEndLabel = startLabelWithSharedOriginalAminoAcid == null ? 
+ endLabel : + new SilacLabel(startLabelWithSharedOriginalAminoAcid.AminoAcidLabel, endLabel.AminoAcidLabel, endLabel.LabelChemicalFormula, endLabel.ConvertMassDifferenceToDouble()); + if (endLabel.AdditionalLabels != null) + { + foreach (SilacLabel additionalLabel in endLabel.AdditionalLabels) + { + startLabelWithSharedOriginalAminoAcid = originalLabels.Where(x => x.OriginalAminoAcid == additionalLabel.OriginalAminoAcid).FirstOrDefault(); + updatedEndLabel.AddAdditionalSilacLabel( + startLabelWithSharedOriginalAminoAcid == null ? + additionalLabel : + new SilacLabel(startLabelWithSharedOriginalAminoAcid.AminoAcidLabel, additionalLabel.AminoAcidLabel, additionalLabel.LabelChemicalFormula, additionalLabel.ConvertMassDifferenceToDouble())); + } + } + + //double check that all labeled amino acids can become unlabeled/relabeled + if (startLabel.AdditionalLabels != null) + { + foreach (SilacLabel originalLabel in originalLabels) + { + if (updatedEndLabel.OriginalAminoAcid != originalLabel.AminoAcidLabel && + (updatedEndLabel.AdditionalLabels == null || !updatedEndLabel.AdditionalLabels.Any(x => x.OriginalAminoAcid == originalLabel.AminoAcidLabel))) + { + updatedEndLabel.AddAdditionalSilacLabel(new SilacLabel(originalLabel.AminoAcidLabel, originalLabel.OriginalAminoAcid, originalLabel.LabelChemicalFormula, originalLabel.ConvertMassDifferenceToDouble())); + } + } + } + endLabel = updatedEndLabel; + } + + //add all unlabeled (or if no unlabeled, then the startLabeled) peptides + foreach (PeptideWithSetModifications pwsm in originalPeptides) + { + yield return pwsm; + } + + //the order (below) matters when neither labels are null, because the fully labeled "start" has already been created above, so we want to use the end label here if it's not unlabeled (null) + SilacLabel label = endLabel ?? startLabel; //pick the labeled (not the unlabeled). 
If no unlabeled, take the endLabel + + Protein silacEndProtein = GenerateFullyLabeledSilacProtein(label); + + //add all peptides containing any label (may also contain unlabeled) + if (label.AdditionalLabels == null) //if there's only one (which is common) + { + //get the residues to change + char originalResidue = label.OriginalAminoAcid; + char labeledResidue = label.AminoAcidLabel; + + //label peptides + foreach (PeptideWithSetModifications pwsm in originalPeptides) + { + //find the indexes in the base sequence for labeling + char[] baseSequenceArray = pwsm.BaseSequence.ToArray(); + List indexesOfResiduesToBeLabeled = new List(); + for (int c = 0; c < baseSequenceArray.Length; c++) + { + if (baseSequenceArray[c] == originalResidue) + { + indexesOfResiduesToBeLabeled.Add(c); + } + } + //if there's something to label + if (indexesOfResiduesToBeLabeled.Count != 0) + { + List pwsmsForCombinatorics = new List { pwsm }; + for (int a = 0; a < indexesOfResiduesToBeLabeled.Count; a++) + { + List localPwsmsForCombinatorics = new List(); + foreach (PeptideWithSetModifications pwsmCombination in pwsmsForCombinatorics) + { + char[] combinatoricBaseSequenceArray = pwsmCombination.BaseSequence.ToArray(); + combinatoricBaseSequenceArray[indexesOfResiduesToBeLabeled[a]] = labeledResidue; + string updatedBaseSequence = string.Concat(combinatoricBaseSequenceArray); + + PeptideWithSetModifications labeledPwsm = new PeptideWithSetModifications(silacEndProtein, pwsm.DigestionParams, + pwsm.OneBasedStartResidueInProtein, pwsm.OneBasedEndResidueInProtein, pwsm.CleavageSpecificityForFdrCategory, + pwsm.PeptideDescription, pwsm.MissedCleavages, pwsm.AllModsOneIsNterminus, pwsm.NumFixedMods, updatedBaseSequence); + yield return labeledPwsm; //return + localPwsmsForCombinatorics.Add(labeledPwsm); //add so it can be used again + } + pwsmsForCombinatorics.AddRange(localPwsmsForCombinatorics); + } + } + } + } + else //if there are more than one (i.e. 
K and R are labeled) + { + //get the residues to change + char[] originalResidues = new char[label.AdditionalLabels.Count + 1]; + char[] labeledResidues = new char[label.AdditionalLabels.Count + 1]; + originalResidues[0] = label.OriginalAminoAcid; + labeledResidues[0] = label.AminoAcidLabel; + for (int i = 0; i < label.AdditionalLabels.Count; i++) + { + originalResidues[i + 1] = label.AdditionalLabels[i].OriginalAminoAcid; + labeledResidues[i + 1] = label.AdditionalLabels[i].AminoAcidLabel; + } + + //label peptides + foreach (PeptideWithSetModifications pwsm in originalPeptides) + { + //find the indexes in the base sequence for labeling + char[] baseSequenceArray = pwsm.BaseSequence.ToArray(); + Dictionary indexesOfResiduesToBeLabeled = new Dictionary(); + for (int peptideResidueIndex = 0; peptideResidueIndex < baseSequenceArray.Length; peptideResidueIndex++) + { + for (int silacResidue = 0; silacResidue < originalResidues.Length; silacResidue++) + { + if (baseSequenceArray[peptideResidueIndex] == originalResidues[silacResidue]) + { + indexesOfResiduesToBeLabeled.Add(peptideResidueIndex, labeledResidues[silacResidue]); + } + } + } + //if there's something to label + if (indexesOfResiduesToBeLabeled.Count != 0) + { + List pwsmsForCombinatorics = new List { pwsm }; + foreach (KeyValuePair kvp in indexesOfResiduesToBeLabeled) + { + List localPwsmsForCombinatorics = new List(); + foreach (PeptideWithSetModifications pwsmCombination in pwsmsForCombinatorics) + { + char[] combinatoricBaseSequenceArray = pwsmCombination.BaseSequence.ToArray(); + combinatoricBaseSequenceArray[kvp.Key] = kvp.Value; + string updatedBaseSequence = string.Concat(combinatoricBaseSequenceArray); + + PeptideWithSetModifications labeledPwsm = new PeptideWithSetModifications(silacEndProtein, pwsm.DigestionParams, + pwsm.OneBasedStartResidueInProtein, pwsm.OneBasedEndResidueInProtein, pwsm.CleavageSpecificityForFdrCategory, + pwsm.PeptideDescription, pwsm.MissedCleavages, pwsm.AllModsOneIsNterminus, 
pwsm.NumFixedMods, updatedBaseSequence); + yield return labeledPwsm; //return + localPwsmsForCombinatorics.Add(labeledPwsm); //add so it can be used again + } + pwsmsForCombinatorics.AddRange(localPwsmsForCombinatorics); + } + } + } + } + } + } + + /// + /// Only keep glycopeptides by filtering the NGlycopeptide motif 'NxS || NxT' or OGlycopeptide motif 'S || T' + /// + internal IEnumerable GetGlycoPeptides(IEnumerable originalPeptides, bool keepNGlycopeptide, bool keepOGlycopeptide) + { + Regex rgx = new Regex("N[A-Z][ST]"); + foreach (ProteolyticPeptide pwsm in originalPeptides) + { + bool yielded = false; + if (keepNGlycopeptide) + { + if (rgx.IsMatch(pwsm.BaseSequence)) + { + yielded = true; + yield return pwsm; + } + } + + if (keepOGlycopeptide && !yielded) + { + if (pwsm.BaseSequence.Contains('S') || pwsm.BaseSequence.Contains('T')) + { + yield return pwsm; + } + } + } + } + + /// + /// Generates a protein that is fully labeled with the specified silac label + /// + private Protein GenerateFullyLabeledSilacProtein(SilacLabel label) + { + string updatedBaseSequence = BaseSequence.Replace(label.OriginalAminoAcid, label.AminoAcidLabel); + if (label.AdditionalLabels != null) //if there is more than one label per replicate (i.e both R and K were labeled in a sample before pooling) + { + foreach (SilacLabel additionalLabel in label.AdditionalLabels) + { + updatedBaseSequence = updatedBaseSequence.Replace(additionalLabel.OriginalAminoAcid, additionalLabel.AminoAcidLabel); + } + } + return new Protein(this, updatedBaseSequence); + } + + /// + /// Gets proteins with applied variants from this protein + /// + public List GetVariantProteins(int maxAllowedVariantsForCombinitorics = 4, int minAlleleDepth = 1) + { + return VariantApplication.ApplyVariants(this, SequenceVariations, maxAllowedVariantsForCombinitorics, minAlleleDepth); + } + + /// + /// Restore all modifications that were read in, including those that did not match their target amino acid. 
///
/// <remarks>
/// Assigns OriginalNonVariantModifications to OneBasedPossibleLocalizedModifications.
/// </remarks>
public void RestoreUnfilteredModifications()
{
    OneBasedPossibleLocalizedModifications = OriginalNonVariantModifications;
}

/// <summary>
/// Filters modifications that do not match their target amino acid.
/// </summary>
/// <param name="dict">
/// Candidate modifications grouped by position. The key is handed to
/// ModificationLocalization.ModFits as the protein index (presumably one-based, as the method name suggests -- confirm).
/// </param>
/// <returns>
/// A new dictionary that keeps, per position, only the modifications whose ValidModification flag is set
/// and that ModFits accepts against BaseSequence. Positions with no surviving modification are left out.
/// </returns>
private IDictionary<int, List<Modification>> SelectValidOneBaseMods(IDictionary<int, List<Modification>> dict)
{
    Dictionary<int, List<Modification>> validModDictionary = new Dictionary<int, List<Modification>>();
    foreach (KeyValuePair<int, List<Modification>> entry in dict)
    {
        //mod must be valid mod and the motif of the mod must be present in the protein at the specified location
        List<Modification> validMods = entry.Value
            .Where(m => m.ValidModification && ModificationLocalization.ModFits(m, BaseSequence, 0, BaseSequence.Length, entry.Key))
            .ToList();

        if (validMods.Count == 0)
        {
            continue;
        }

        // One lookup instead of Keys.Contains followed by the indexer.
        if (validModDictionary.TryGetValue(entry.Key, out List<Modification> alreadyKept))
        {
            alreadyKept.AddRange(validMods);
        }
        else
        {
            validModDictionary.Add(entry.Key, validMods);
        }
    }
    return validModDictionary;
}
///
/// Protein XML files contain annotated proteolysis products for many proteins (e.g. signal peptides, chain peptides).
/// This method adds N- and C-terminal truncations to these products.
///
/// <param name="fullProteinOneBasedBegin">one-based first residue of the product to truncate; a value of 1 means the product contains the protein N-terminus</param>
/// <param name="fullProteinOneBasedEnd">one-based last residue of the product to truncate</param>
/// <param name="addNterminalDigestionTruncations">when true, products shortened from the N-terminus are added</param>
/// <param name="addCterminalDigestionTruncations">when true, products shortened from the C-terminus are added</param>
/// <param name="minProductBaseSequenceLength">truncations shorter than this many residues are not added</param>
/// <param name="lengthOfProteolysis">number of residues that may be removed from a terminus</param>
/// <param name="proteolyisisProductName">type label given to every product added by this call</param>
public void AddTruncationsToExistingProteolysisProducts(int fullProteinOneBasedBegin, int fullProteinOneBasedEnd, bool addNterminalDigestionTruncations, bool addCterminalDigestionTruncations, int minProductBaseSequenceLength, int lengthOfProteolysis, string proteolyisisProductName)
{
    bool sequenceContainsNterminus = fullProteinOneBasedBegin == 1;

    if (!sequenceContainsNterminus)
    {
        // Product does not start at the protein N-terminus: plain truncations, C-terminal ones first
        // (the order in which products are appended is kept as it was).
        if (addCterminalDigestionTruncations)
        {
            AddCterminalTruncations(lengthOfProteolysis, fullProteinOneBasedEnd, fullProteinOneBasedBegin, minProductBaseSequenceLength, proteolyisisProductName);
        }
        if (addNterminalDigestionTruncations)
        {
            AddNterminalTruncations(lengthOfProteolysis, fullProteinOneBasedBegin, fullProteinOneBasedEnd, minProductBaseSequenceLength, proteolyisisProductName);
        }
        return;
    }

    // Checked once and without Substring, so an empty sequence no longer throws ArgumentOutOfRangeException.
    bool startsWithMethionine = !string.IsNullOrEmpty(BaseSequence) && BaseSequence[0] == 'M';

    //Digest N-terminus
    if (addNterminalDigestionTruncations)
    {
        // With a leading methionine one extra residue is removed
        // (presumably so the count starts from the methionine-cleaved form -- confirm).
        int nTerminalResiduesToRemove = startsWithMethionine ? lengthOfProteolysis + 1 : lengthOfProteolysis;
        AddNterminalTruncations(nTerminalResiduesToRemove, fullProteinOneBasedBegin, fullProteinOneBasedEnd, minProductBaseSequenceLength, proteolyisisProductName);
    }

    //Digest C-terminus -- not affected by variable N-terminus behavior
    if (addCterminalDigestionTruncations)
    {
        // if first residue is M, then we have to add c-terminal markers for both with and without the M
        if (startsWithMethionine)
        {
            //add sequences WITHOUT methionine
            AddCterminalTruncations(lengthOfProteolysis, fullProteinOneBasedEnd, fullProteinOneBasedBegin + 1, minProductBaseSequenceLength, proteolyisisProductName);
        }
        //add sequences with methionine
        AddCterminalTruncations(lengthOfProteolysis, fullProteinOneBasedEnd, fullProteinOneBasedBegin, minProductBaseSequenceLength, proteolyisisProductName);
    }
}

/// <summary>
/// Appends to the proteolysis product list the products that keep <paramref name="fullProteinOneBasedBegin"/>
/// and end 1, 2, ... up to <paramref name="lengthOfProteolysis"/> residues before
/// <paramref name="fullProteinOneBasedEnd"/>. Products shorter than
/// <paramref name="minProductBaseSequenceLength"/> are not added. Nothing is returned.
/// </summary>
private void AddCterminalTruncations(int lengthOfProteolysis, int fullProteinOneBasedEnd, int fullProteinOneBasedBegin, int minProductBaseSequenceLength, string proteolyisisProductName)
{
    for (int residuesRemoved = 1; residuesRemoved <= lengthOfProteolysis; residuesRemoved++)
    {
        int truncatedEnd = fullProteinOneBasedEnd - residuesRemoved;
        int truncatedLength = truncatedEnd - fullProteinOneBasedBegin + 1;
        if (truncatedLength < minProductBaseSequenceLength)
        {
            // Every further truncation is shorter still, so none of them can qualify.
            break;
        }
        _proteolysisProducts.Add(new ProteolysisProduct(fullProteinOneBasedBegin, truncatedEnd, proteolyisisProductName));
    }
}

/// <summary>
/// Appends to the proteolysis product list the products that keep <paramref name="fullProteinOneBasedEnd"/>
/// and begin 1, 2, ... up to <paramref name="lengthOfProteolysis"/> residues after
/// <paramref name="fullProteinOneBasedBegin"/>. Products shorter than
/// <paramref name="minProductBaseSequenceLength"/> are not added. Nothing is returned.
/// </summary>
private void AddNterminalTruncations(int lengthOfProteolysis, int fullProteinOneBasedBegin, int fullProteinOneBasedEnd, int minProductBaseSequenceLength, string proteolyisisProductName)
{
    for (int residuesRemoved = 1; residuesRemoved <= lengthOfProteolysis; residuesRemoved++)
    {
        int truncatedBegin = fullProteinOneBasedBegin + residuesRemoved;
        int truncatedLength = fullProteinOneBasedEnd - truncatedBegin + 1;
        if (truncatedLength < minProductBaseSequenceLength)
        {
            // Every further truncation is shorter still, so none of them can qualify.
            break;
        }
        _proteolysisProducts.Add(new ProteolysisProduct(truncatedBegin, fullProteinOneBasedEnd, proteolyisisProductName));
    }
}

///
/// This is the main entry point for adding sequences in a top-down truncation search.
/// The way this is designed is such that all base sequences to be searched end up in the list Protein.ProteolysisProducts
/// This includes the intact protein. IT DOES NOT INCLUDE ANY DOUBLY (BOTH ENDS) DIGESTED PRODUCTS.
/// The original proteolysis products (if any) are already in that list. These are annotated in protein.xml files.
/// The options to keep in mind are present in the following variables
///
/// <param name="addFullProtein">This needs to be added to the proteolysisProducts list to be searched</param>
/// <param name="addForEachOrigninalProteolysisProduct">the original products are there but those resulting from N- or C-terminal degradation still need to be added</param>
/// <param name="addNterminalDigestionTruncations">when true, N-terminal truncations are generated</param>
/// <param name="addCterminalDigestionTruncations">when true, C-terminal truncations are generated</param>
/// <param name="minProductBaseSequenceLength">the same as the min detectable peptide</param>
/// <param name="lengthOfProteolysis">the number of amino acids that can be removed from either end.</param>
public void AddTruncations(bool addFullProtein = true, bool addForEachOrigninalProteolysisProduct = true, bool addNterminalDigestionTruncations = true, bool addCterminalDigestionTruncations = true, int minProductBaseSequenceLength = 7, int lengthOfProteolysis = 5)
{
    //intact proteoform first, then its terminal truncations
    if (addFullProtein)
    {
        AddIntactProteoformToTruncationsProducts(minProductBaseSequenceLength);

        if (addNterminalDigestionTruncations)
        {
            AddTruncationsToExistingProteolysisProducts(1, BaseSequence.Length, true, false, minProductBaseSequenceLength, lengthOfProteolysis, "full-length proteoform N-terminal digestion truncation");
        }

        if (addCterminalDigestionTruncations)
        {
            AddTruncationsToExistingProteolysisProducts(1, BaseSequence.Length, false, true, minProductBaseSequenceLength, lengthOfProteolysis, "full-length proteoform C-terminal digestion truncation");
        }
    }

    // this does not include the original intact proteoform
    if (addForEachOrigninalProteolysisProduct)
    {
        // ToList() takes a snapshot before the loop; the calls below add products
        // (presumably to the same underlying list -- confirm), so the filter result must not stay lazy.
        List<ProteolysisProduct> annotatedProducts = ProteolysisProducts
            .Where(p => !(p.Type.Contains("truncation") || p.Type.Contains("full-length proteoform")))
            .ToList();

        foreach (ProteolysisProduct annotated in annotatedProducts)
        {
            // products without both positions cannot be truncated
            if (!annotated.OneBasedBeginPosition.HasValue || !annotated.OneBasedEndPosition.HasValue)
            {
                continue;
            }

            int begin = annotated.OneBasedBeginPosition.Value;
            int end = annotated.OneBasedEndPosition.Value;
            string truncationName = string.IsNullOrEmpty(annotated.Type)
                ? "truncation"
                : annotated.Type + " " + "truncation";

            //the original proteolysis product is already on the list so we don't need to duplicate
            if (addNterminalDigestionTruncations)
            {
                AddTruncationsToExistingProteolysisProducts(begin, end, true, false, minProductBaseSequenceLength, lengthOfProteolysis, truncationName);
            }

            if (addCterminalDigestionTruncations)
            {
                AddTruncationsToExistingProteolysisProducts(begin, end, false, true, minProductBaseSequenceLength, lengthOfProteolysis, truncationName);
            }
        }
    }

    // NOTE(review): called without arguments, so that method's own default minimum length applies
    // rather than minProductBaseSequenceLength -- confirm this is intended.
    CleaveOnceBetweenProteolysisProducts();
}

/// <summary>
/// Adds the intact proteoform (residue 1 through the end of BaseSequence, typed "full-length proteoform")
/// to the proteolysis product list, provided the sequence has at least
/// <paramref name="minProductBaseSequenceLength"/> residues.
/// </summary>
public void AddIntactProteoformToTruncationsProducts(int minProductBaseSequenceLength)
{
    int fullLength = BaseSequence.Length;
    if (fullLength < minProductBaseSequenceLength)
    {
        return;
    }

    _proteolysisProducts.Add(new ProteolysisProduct(1, fullLength, "full-length proteoform"));
}

///
/// proteins with multiple proteolysis products are not always fully cleaved. we observed proteolysis products w/ missed cleavages.
/// This method allows for one missed cleavage between proteolysis products.
+ /// + + public void CleaveOnceBetweenProteolysisProducts(int minimumProductLength = 7) + { + List cleavagePostions = new(); + List localProducts = _proteolysisProducts.Where(p => !p.Type.Contains("truncation") && !p.Type.Contains("full-length proteoform")).ToList(); + List proteolysisProductEndPositions = localProducts.Where(p => p.OneBasedEndPosition.HasValue).Select(p => p.OneBasedEndPosition.Value).ToList(); + if (proteolysisProductEndPositions.Count > 0) + { + foreach (int proteolysisProductEndPosition in proteolysisProductEndPositions) + { + if (localProducts.Any(p => p.OneBasedBeginPosition == (proteolysisProductEndPosition + 1))) + { + cleavagePostions.Add(proteolysisProductEndPosition); + } + } + } + + foreach (int position in cleavagePostions) + { + if (position - 1 >= minimumProductLength) + { + string leftType = $"N-terminal Portion of Singly Cleaved Protein(1-{position})"; + ProteolysisProduct leftProduct = new(1, position, leftType); + + //here we're making sure a product with these begin/end positions isn't already present + if (!_proteolysisProducts.Any(p => p.OneBasedBeginPosition == leftProduct.OneBasedBeginPosition && p.OneBasedEndPosition == leftProduct.OneBasedEndPosition)) + { + _proteolysisProducts.Add(leftProduct); + } + } + + if (BaseSequence.Length - position - 1 >= minimumProductLength) + { + string rightType = $"C-terminal Portion of Singly Cleaved Protein({position + 1}-{BaseSequence.Length})"; + ProteolysisProduct rightProduct = new(position + 1, BaseSequence.Length, rightType); + + //here we're making sure a product with these begin/end positions isn't already present + if (!_proteolysisProducts.Any(p => p.OneBasedBeginPosition == rightProduct.OneBasedBeginPosition && p.OneBasedEndPosition == rightProduct.OneBasedEndPosition)) + { + _proteolysisProducts.Add(rightProduct); + } + } + } + } + + private static string GetName(IEnumerable appliedVariations, string name) + { + bool emptyVars = appliedVariations == null || 
appliedVariations.Count() == 0; + if (name == null && emptyVars) + { + return null; + } + else + { + string variantTag = emptyVars ? "" : $" variant:{VariantApplication.CombineDescriptions(appliedVariations)}"; + return name + variantTag; + } + } + + public int CompareTo(Protein other) + { + //permits sorting of proteins + return this.Accession.CompareTo(other.Accession); + } + + //not sure if we require any additional fields for equality + public override bool Equals(object obj) + { + Protein otherProtein = (Protein)obj; + return otherProtein != null && otherProtein.Accession.Equals(Accession) && otherProtein.BaseSequence.Equals(BaseSequence); + } + + /// + /// The protein object uses the default hash code method for speed, + /// but note that two protein objects with the same information will give two different hash codes. + /// + /// + public override int GetHashCode() + { + return this.BaseSequence.GetHashCode(); + } + + public override string ToString() + { + return this.Accession.ToString(); + } + } +} \ No newline at end of file diff --git a/mzLib/MassSpectrometry/Proteomics/Protein/ProteoformLevelClassifier.cs b/mzLib/MassSpectrometry/Proteomics/Protein/ProteoformLevelClassifier.cs new file mode 100644 index 000000000..e89956e8c --- /dev/null +++ b/mzLib/MassSpectrometry/Proteomics/Protein/ProteoformLevelClassifier.cs @@ -0,0 +1,240 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using Proteomics.ProteolyticDigestion; + +namespace Proteomics +{ + public static class ProteoformLevelClassifier + { + + /// + /// All input strings are delimited with "|" + /// PTMs are annotated with [] + /// + /// All possible sequences (with modifications) for this PrSM + /// All possible genes for this PrSM + /// + public static string ClassifyPrSM(string fullSequenceString, string geneString) + { + //separate delimited input + string[] sequences = fullSequenceString.Split('|'); + string[] genes = geneString.Split('|'); + + + 
//determine sequence ambiguity + string firstBaseSequence = PeptideWithSetModifications.GetBaseSequenceFromFullSequence(sequences[0]).ToUpper(); //get first sequence with modifications removed + bool sequenceIdentified = !SequenceContainsUnknownAminoAcids(firstBaseSequence); //check if there are any ambiguous amino acids (i.e. B, J, X, Z) + //for every other sequence reported + if (sequenceIdentified) //if there weren't any unknown amino acids reported. + { + for (int i = 1; i < sequences.Length; i++) + { + //if the unmodified sequences don't match, then there's sequence ambiguity + if (!firstBaseSequence.Equals(PeptideWithSetModifications.GetBaseSequenceFromFullSequence(sequences[i]).ToUpper())) + { + sequenceIdentified = false; + break; + } + } + } + + + //determine PTM localization and identification + List<(int index, string ptm)> firstPTMsSortedByIndex = GetPTMs(sequences[0]); //get ptms from the first sequence reported + List firstPTMsSortedByPTM = firstPTMsSortedByIndex.Select(x => x.ptm).OrderBy(x => x).ToList(); //sort ptms alphabetically + //check if there are unknown mass shifts + bool ptmsIdentified = !PtmsContainUnknownMassShifts(firstPTMsSortedByPTM); + bool ptmsLocalized = true; //assume these are localized unless we determine otherwise + //for every other sequence reported + for (int seqIndex = 1; seqIndex < sequences.Length; seqIndex++) + { + List<(int index, string ptm)> currentPTMsSortedByIndex = GetPTMs(sequences[seqIndex]); //get ptms from this sequence + List currentPTMsSortedByPTM = currentPTMsSortedByIndex.Select(x => x.ptm).OrderBy(x => x).ToList(); //sort ptms alphabetically + + //are number of PTMs the same? + if (firstPTMsSortedByIndex.Count == currentPTMsSortedByIndex.Count) + { + //check localization (are indexes conserved?) 
+ for (int i = 0; i < firstPTMsSortedByIndex.Count; i++) + { + if (firstPTMsSortedByIndex[i].index != currentPTMsSortedByIndex[i].index) + { + ptmsLocalized = false; + break; + } + } + //check PTM identification + for (int i = 0; i < firstPTMsSortedByPTM.Count; i++) + { + if (!firstPTMsSortedByPTM[i].Equals(currentPTMsSortedByPTM[i])) + { + ptmsIdentified = false; + break; + } + } + } + else + { + ptmsIdentified = false; + ptmsLocalized = false; + } + } + //handle an edge case where two PTMs are identified and localized to two residues, but it's unclear which PTM is localized to which residue. + if (ptmsIdentified && ptmsLocalized) + { + for (int seqIndex = 1; seqIndex < sequences.Length; seqIndex++) + { + List<(int index, string ptm)> currentPTMsSortedByIndex = GetPTMs(sequences[seqIndex]); //get ptms from this sequence + //check that the mods are in the same position + for(int ptmIndex =0; ptmIndex + /// Determine proteoform level between 1 (know everything) and 5 (only know the mass) + /// as defined in the publication: + /// Smith, L.M., Thomas, P.M., Shortreed, M.R. et al. A five-level classification system for proteoform identifications. Nat Methods 16, 939–940 (2019). https://doi.org/10.1038/s41592-019-0573-x + /// + /// Is the PTM localized? + /// Do we know what the PTM is, or is it ambiguous (or an unknown mass shift?) + /// Do we know the proteoform sequence, or is it ambiguous? + /// Do we know which gene produced this proteoform? + /// + public static string GetProteoformClassification(bool ptmLocalized, bool ptmIdentified, bool sequenceIdentified, bool geneIdentified) + { + int sum = Convert.ToInt16(ptmLocalized) + Convert.ToInt16(ptmIdentified) + Convert.ToInt16(sequenceIdentified) + Convert.ToInt16(geneIdentified); + if (sum == 3) //level 2, but is it A, B, C, or D? 
+ { + if (!ptmLocalized) + { + return "2A"; + } + else if (!ptmIdentified) + { + return "2B"; + } + else if (!sequenceIdentified) + { + return "2C"; + } + else //if (!geneIdentified) + { + return "2D"; + } + } + else + { + return (5 - sum).ToString(); + } + } + + /// + /// Provided with an unmodified sequence, return if it contains ambiguous amino acids such as: + /// B: Aspartic acid or Asparagine + /// J: Leucine or Isoleucine + /// X: Any amino acid + /// Z: Glutamic acid or Glutamine + /// + /// + /// + private static bool SequenceContainsUnknownAminoAcids(string baseSequence) + { + char[] ambiguousAminoAcids = new char[] { 'B', 'J', 'X', 'Z' }; + foreach (char aa in ambiguousAminoAcids) + { + if (baseSequence.Contains(aa)) + { + return true; + } + } + return false; + } + + /// + /// Given a proteoform sequence (contains ptms), returns a list of all ptms and their one based index in order from N-terminus to C-terminus + /// + /// + /// + private static List<(int, string)> GetPTMs(string fullSequence) + { + List<(int, string)> ptmsToReturn = new List<(int, string)>(); + StringBuilder currentPTM = new StringBuilder(); + int currentIndex = 0; + int numLeftBrackets = 0; //PTMs are annotated with brackets. This object keeps track of how many brackets deep we are + + //iterate through the sequence + foreach (char c in fullSequence) + { + //if we found a right bracket + if (c == ']') + { + //record that we're stepping out of brackets + numLeftBrackets--; + //if we've finished the ptm + if (numLeftBrackets == 0) + { + //Add the ptm and clear the record + currentIndex--; //move back an index because we added one when we entered the bracket + ptmsToReturn.Add((currentIndex, currentPTM.ToString())); + currentPTM.Clear(); + } + } + else //if not a right bracket... 
+ { + //if we're already in a PTM, record it + if (numLeftBrackets > 0) + { + currentPTM.Append(c); + } + else //we're not in a PTM, so update where we are in the proteoform + { + currentIndex++; //this operation occurs when entering a PTM, so we need to substract when exiting the PTM + } + //if we're entering a PTM or a nested bracket, record it + if (c == '[') + { + numLeftBrackets++; + } + } + } + + return ptmsToReturn; + } + + /// + /// See if any of the reported PTMs are mass shifts, (e.g. [+15.99] or [-17.99]) or contain "?" + /// + /// + /// + private static bool PtmsContainUnknownMassShifts(List ptms) + { + foreach (string ptm in ptms) + { + if (ptm.Length > 1) //check length is appropriate + { + //remove sign with substring and try to parse into double. If it's a mass, tryparse returns true + if (double.TryParse(ptm.Substring(1), out double mass)) + { + return true; + } + } + } + return false; + } + } +} diff --git a/mzLib/MassSpectrometry/Proteomics/Protein/ProteolysisProduct.cs b/mzLib/MassSpectrometry/Proteomics/Protein/ProteolysisProduct.cs new file mode 100644 index 000000000..81eaebdc3 --- /dev/null +++ b/mzLib/MassSpectrometry/Proteomics/Protein/ProteolysisProduct.cs @@ -0,0 +1,32 @@ +namespace Proteomics +{ + public class ProteolysisProduct + { + public ProteolysisProduct(int? oneBasedBeginPosition, int? oneBasedEndPosition, string type) + { + OneBasedBeginPosition = oneBasedBeginPosition; + OneBasedEndPosition = oneBasedEndPosition; + Type = type ?? ""; + } + + public int? OneBasedBeginPosition { get; } + public int? 
OneBasedEndPosition { get; } + public string Type { get; } + + public override bool Equals(object obj) + { + ProteolysisProduct pp = obj as ProteolysisProduct; + return pp != null + && pp.OneBasedBeginPosition.Equals(OneBasedBeginPosition) + && pp.OneBasedEndPosition.Equals(OneBasedEndPosition) + && (pp.Type == null && Type == null || pp.Type.Equals(Type)); + } + + public override int GetHashCode() + { + return (OneBasedBeginPosition ?? 0).GetHashCode() + ^ (OneBasedEndPosition ?? 0).GetHashCode() + ^ Type.GetHashCode(); // null handled in constructor + } + } +} \ No newline at end of file diff --git a/mzLib/MassSpectrometry/Proteomics/Protein/SequenceVariantDescription.cs b/mzLib/MassSpectrometry/Proteomics/Protein/SequenceVariantDescription.cs new file mode 100644 index 000000000..d19493dd8 --- /dev/null +++ b/mzLib/MassSpectrometry/Proteomics/Protein/SequenceVariantDescription.cs @@ -0,0 +1,97 @@ +using System; +using System.Collections.Generic; +using System.Linq; + +namespace Proteomics +{ + public class SequenceVariantDescription + { + public SequenceVariantDescription(string description) + { + Description = description; + if (description == null) + { + return; + } + + // Parse description into + string[] vcfFields = description.Split(new[] { @"\t" }, StringSplitOptions.None); + if (vcfFields.Length < 10) { return; } + ReferenceAlleleString = vcfFields[3]; + AlternateAlleleString = vcfFields[4]; + Info = new SnpEffAnnotation(vcfFields[7]); + AlleleIndex = Info.Allele == null ? -1 : AlternateAlleleString.Split(',').ToList().IndexOf(Info.Allele) + 1; // reference is zero + Format = vcfFields[8]; + string[] genotypes = Enumerable.Range(9, vcfFields.Length - 9).Select(i => vcfFields[i]).ToArray(); + + // loop through genotypes for this variant (e.g. 
tumor and normal) + for (int individual = 0; individual < genotypes.Length; individual++) + { + var genotypeFields = GenotypeDictionary(Format.Trim(), genotypes[individual].Trim()); + + // parse genotype + string[] gt = null; + if (genotypeFields.TryGetValue("GT", out string gtString)) { gt = gtString.Split('/'); } + if (gt == null) { continue; } + + // parse allele depth (might be null, technically, but shouldn't be in most use cases) + string[] ad = null; + if (genotypeFields.TryGetValue("AD", out string adString)) { ad = adString.Split(','); } + + Genotypes.Add(individual.ToString(), gt); + AlleleDepths.Add(individual.ToString(), ad); + Homozygous.Add(individual.ToString(), gt.Distinct().Count() == 1); + Heterozygous.Add(individual.ToString(), gt.Distinct().Count() > 1); + } + } + + public string Description { get; } + public string ReferenceAlleleString { get; } + public string AlternateAlleleString { get; } + public SnpEffAnnotation Info { get; } + public string Format { get; } + public Dictionary Homozygous { get; } = new Dictionary(); + public Dictionary Heterozygous { get; } = new Dictionary(); + public Dictionary Genotypes { get; } = new Dictionary(); + public Dictionary AlleleDepths { get; } = new Dictionary(); + public int AlleleIndex { get; } + + /// + /// Returns original string for the description + /// + /// + public override string ToString() + { + return Description; + } + + public override bool Equals(object obj) + { + SequenceVariantDescription s = obj as SequenceVariantDescription; + return s != null && s.Description == Description; + } + + public override int GetHashCode() + { + return (Description ?? 
"").GetHashCode(); + } + + /// + /// Gets a dictionary of the format (key) and fields (value) for a genotype + /// + /// + /// + /// + internal static Dictionary GenotypeDictionary(string format, string genotype) + { + Dictionary genotypeDict = new Dictionary(); + string[] formatSplit = format.Split(':'); + string[] genotypeSplit = genotype.Split(':'); + if (formatSplit.Length != genotypeSplit.Length) + { + throw new ArgumentException("Genotype format: " + format + " and genotype: " + genotype + " do not match -- they're not the same length"); + } + return Enumerable.Range(0, formatSplit.Length).ToDictionary(x => formatSplit[x], x => genotypeSplit[x]); + } + } +} \ No newline at end of file diff --git a/mzLib/MassSpectrometry/Proteomics/Protein/SequenceVariation.cs b/mzLib/MassSpectrometry/Proteomics/Protein/SequenceVariation.cs new file mode 100644 index 000000000..84642db46 --- /dev/null +++ b/mzLib/MassSpectrometry/Proteomics/Protein/SequenceVariation.cs @@ -0,0 +1,162 @@ +using System.Collections.Generic; +using System.Linq; +using Proteomics; + +namespace Proteomics +{ + public class SequenceVariation + { + /// + /// For longer sequence variations, where a range of sequence is replaced. Point mutations should be specified with the same begin and end positions. + /// + /// + /// + /// + /// + /// + public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, string originalSequence, string variantSequence, string description, Dictionary> oneBasedModifications = null) + { + OneBasedBeginPosition = oneBasedBeginPosition; + OneBasedEndPosition = oneBasedEndPosition; + OriginalSequence = originalSequence ?? ""; + VariantSequence = variantSequence ?? ""; + Description = new SequenceVariantDescription(description); + OneBasedModifications = oneBasedModifications ?? new Dictionary>(); + } + + /// + /// For variations with only position information (not begin and end). 
+ /// Sets the end to the end of the original protein sequence to which this variation applies. + /// + /// + /// + /// + /// + /// + public SequenceVariation(int oneBasedPosition, string originalSequence, string variantSequence, string description, Dictionary> oneBasedModifications = null) + : this(oneBasedPosition, originalSequence == null ? oneBasedPosition : oneBasedPosition + originalSequence.Length - 1, originalSequence, variantSequence, description, oneBasedModifications) + { } + + /// + /// Beginning position of original sequence to be replaced + /// + public int OneBasedBeginPosition { get; } + + /// + /// End position of original sequence to be replaced + /// + public int OneBasedEndPosition { get; } + + /// + /// Original sequence information (optional) + /// + public string OriginalSequence { get; } + + /// + /// Variant sequence information (required) + /// + public string VariantSequence { get; } + + /// + /// Description of this variation (optional) + /// + public SequenceVariantDescription Description { get; } + + /// + /// Modifications specifically for this variant + /// + public Dictionary> OneBasedModifications { get; } + + public override bool Equals(object obj) + { + SequenceVariation s = obj as SequenceVariation; + return s != null + && OneBasedBeginPosition == s.OneBasedBeginPosition + && OneBasedEndPosition == s.OneBasedEndPosition + && (s.OriginalSequence == null && OriginalSequence == null || OriginalSequence.Equals(s.OriginalSequence)) + && (s.VariantSequence == null && VariantSequence == null || VariantSequence.Equals(s.VariantSequence)) + && (s.Description == null && Description == null || Description.Equals(s.Description)) + && (s.OneBasedModifications == null && OneBasedModifications == null || + s.OneBasedModifications.Keys.ToList().SequenceEqual(OneBasedModifications.Keys.ToList()) + && s.OneBasedModifications.Values.SelectMany(m => m).ToList().SequenceEqual(OneBasedModifications.Values.SelectMany(m => m).ToList())); + } + + public 
override int GetHashCode() + { + return OneBasedBeginPosition.GetHashCode() + ^ OneBasedEndPosition.GetHashCode() + ^ OriginalSequence.GetHashCode() // null handled in constructor + ^ VariantSequence.GetHashCode() // null handled in constructor + ^ Description.GetHashCode(); // always constructed in constructor + } + + /// + /// Returns a simple string represantation of this amino acid change + /// + /// + public string SimpleString() + { + return OriginalSequence + OneBasedBeginPosition.ToString() + VariantSequence; + } + + /// + /// Determines whether this interval overlaps the queried interval + /// + /// + /// + internal bool Intersects(SequenceVariation segment) + { + return segment.OneBasedEndPosition >= OneBasedBeginPosition && segment.OneBasedBeginPosition <= OneBasedEndPosition; + } + + /// + /// Determines whether this interval overlaps the queried interval + /// + /// + /// + internal bool Intersects(ProteolysisProduct segment) + { + return segment.OneBasedEndPosition >= OneBasedBeginPosition && segment.OneBasedBeginPosition <= OneBasedEndPosition; + } + + /// + /// Determines whether this interval overlaps the queried position + /// + /// + /// + internal bool Intersects(int pos) + { + return OneBasedBeginPosition <= pos && pos <= OneBasedEndPosition; + } + + /// + /// Determines whether this interval includes the queried interval + /// + /// + /// + internal bool Includes(SequenceVariation segment) + { + return OneBasedBeginPosition <= segment.OneBasedBeginPosition && OneBasedEndPosition >= segment.OneBasedEndPosition; + } + + /// + /// Determines whether this interval includes the queried interval + /// + /// + /// + internal bool Includes(ProteolysisProduct segment) + { + return OneBasedBeginPosition <= segment.OneBasedBeginPosition && OneBasedEndPosition >= segment.OneBasedEndPosition; + } + + /// + /// Determines whether this interval overlaps the queried position + /// + /// + /// + internal bool Includes(int pos) + { + return 
OneBasedBeginPosition <= pos && pos <= OneBasedEndPosition; + } + } +} \ No newline at end of file diff --git a/mzLib/MassSpectrometry/Proteomics/Protein/SnpEffAnnotation.cs b/mzLib/MassSpectrometry/Proteomics/Protein/SnpEffAnnotation.cs new file mode 100644 index 000000000..62330a9c3 --- /dev/null +++ b/mzLib/MassSpectrometry/Proteomics/Protein/SnpEffAnnotation.cs @@ -0,0 +1,236 @@ +using System.Collections.Generic; +using System.Linq; +using System.Text.RegularExpressions; + +namespace Proteomics +{ + /// + /// Specifications are described here: http://snpeff.sourceforge.net/VCFannotationformat_v1.0.pdf + /// + public class SnpEffAnnotation + { + private static readonly Regex HGVSProteinRegex = new Regex(@"(p\.)([A-Z][a-z][a-z])(\d+)([A-Z][a-z][a-z])"); + + /// + /// Original SnpEff annotation string. + /// + public string Annotation { get; } + + public string Allele { get; } + public string[] Effects { get; } + public string PutativeImpact { get; } + public string GeneName { get; } + public string GeneID { get; } + + /// + /// It looks like these are sometimes domains, like the ones annotated in UniProt, + /// Otherwise, this tends to just be "transcript" + /// + /// Some examples: + /// sequence_feature: can be initiator-methionine:Removed ... 
maybe not too helpful for proteomics, since this is assumed + /// sequence_feature: helix:combinatorial_evidence_used_in_manual_assertion + /// sequence_feature: nucleotide-phosphate-binding-region:ATP + /// sequence_feature: domain:EGF-like_2 + /// sequence_feature: transmembrane-region:Transmembrane_region + /// sequence_feature: topological-domain:Extracellular + /// sequence_feature: modified-residue:phosphoserine + /// + public string FeatureType { get; } + + /// + /// Always seems to be the transcriptID + /// + public string FeatureID { get; } + + public string TranscriptBiotype { get; } + public int ExonIntronRank { get; } + public int ExonIntronTotal { get; } + public string HGVSNotationDnaLevel { get; } // kind of bad for ins and del because they notation aligns to most 3' coordinate, rather than leftmost + public string HGVSNotationProteinLevel { get; } + public int OneBasedTranscriptCDNAPosition { get; } + public int TranscriptCDNALength { get; } + public int OneBasedCodingDomainSequencePosition { get; } + public int CodingDomainSequenceLengthIncludingStopCodon { get; } + public int OneBasedProteinPosition { get; } + public int ProteinLength { get; } + + /// + /// up/downstream: distance to first / last codon + /// intergenic: distance to closest gene + /// exonic: distance to closest intron boundary (+ is upstream, - is downstream) + /// intronic: distance to closest exon boundary (+ is upstream, - is downstream) + /// motif: distance to first base in MOTIF + /// miRNA: distance to first base in miRNA + /// splice_site: distance to exon-intron boundary + /// splice_region: distance to exon-intron boundary + /// chip seq peak: distance to summit or peak center + /// histone mark/state: distance to summit or peak center + /// + public int DistanceToFeature { get; } + + public string[] Warnings { get; } + + public int AminoAcidLocation { get; } + public char ReferenceAminoAcid { get; } + public char AlternateAminoAcid { get; } + public bool Missense { get; 
} + public bool Synonymous { get; } + public bool FrameshiftVariant { get; } + public bool BadTranscript { get; } + + public SnpEffAnnotation(string annotation) + { + bool isSnpEffAnnotation = annotation.StartsWith("ANN=") || annotation.StartsWith("EFF="); + Annotation = isSnpEffAnnotation ? annotation.Substring(4) : annotation; + if (!isSnpEffAnnotation) + { + return; + } + string[] a = Annotation.Split('|'); + Allele = a[0]; + Effects = a[1].Split('&'); + PutativeImpact = a[2]; + GeneName = a[3]; + GeneID = a[4]; + FeatureType = a[5]; + FeatureID = a[6]; + TranscriptBiotype = a[7]; + if (a[8].Split('/').Length > 0 && int.TryParse(a[8].Split('/')[0], out int x)) { ExonIntronRank = x; } + if (a[8].Split('/').Length > 1 && int.TryParse(a[8].Split('/')[1], out int y)) { ExonIntronTotal = y; } + HGVSNotationDnaLevel = a[9]; + HGVSNotationProteinLevel = a[10]; + if (a[11].Split('/').Length > 0 && int.TryParse(a[11].Split('/')[0], out x)) { OneBasedTranscriptCDNAPosition = x; } + if (a[11].Split('/').Length > 1 && int.TryParse(a[11].Split('/')[1], out y)) { TranscriptCDNALength = y; } + if (a[12].Split('/').Length > 0 && int.TryParse(a[12].Split('/')[0], out x)) { OneBasedCodingDomainSequencePosition = x; } + if (a[12].Split('/').Length > 1 && int.TryParse(a[12].Split('/')[1], out y)) { CodingDomainSequenceLengthIncludingStopCodon = y; } + if (a[13].Split('/').Length > 0 && int.TryParse(a[13].Split('/')[0], out x)) { OneBasedProteinPosition = x; } + if (a[13].Split('/').Length > 1 && int.TryParse(a[13].Split('/')[1], out y)) { ProteinLength = y; } + if (int.TryParse(a[14], out y)) DistanceToFeature = y; + Warnings = a[15].Split('&'); + + Missense = Effects.Any(eff => eff == "missense_variant"); + Synonymous = !Effects.Any(eff => NonSynonymousVariations.Contains(eff)); + FrameshiftVariant = Effects.Contains("frameshift_variant"); + BadTranscript = Warnings.Any(w => BadTranscriptWarnings.Contains(w)); + } + + private string[] HighPutativeImpactEffects = new string[] + { + 
"chromosome_number_variation", // rare... + "exon_loss_variant", // + "frameshift_variant", + "rare_amino_acid_variant", + "splice_acceptor_variant", // often with intron_variant, sometimes with splice_donor_variant + "splice_donor_variant", // often with intron_variant, sometimes with splice_acceptor_variant + "start_lost", + "stop_gained", + "stop_lost", + "transcript_ablation", + }; + + private string[] ModeratePutativeImpactEffects = new string[] + { + "3_prime_UTR_truncation", "exon_loss", // appear together + "5_prime_UTR_truncation", "exon_loss_variant", // appear together + "coding_sequence_variant", // not seen much? Probably because missense is used more often. + "conservative_inframe_insertion", + "conservative_inframe_deletion", + "disruptive_inframe_deletion", + "disruptive_inframe_insertion", + "inframe_deletion", // not common, in favor of more specific terms above + "inframe_insertion", // not common, in favor of more specific terms above + "missense_variant", + "regulatory_region_ablation", // not common? + "splice_region_variant", // often combined with intron_variant and non_coding_transcript_exon_variant + "TFBS_ablation", // not common? 
+ }; + + private string[] NonSynonymousVariations = new string[] + { + "exon_loss_variant", + "frameshift_variant", + "rare_amino_acid_variant", + "start_lost", + "stop_gained", + "stop_lost", + "conservative_inframe_insertion", + "conservative_inframe_deletion", + "disruptive_inframe_deletion", + "disruptive_inframe_insertion", + "inframe_deletion", // not common, in favor of more specific terms above + "inframe_insertion", // not common, in favor of more specific terms above + "missense_variant", + }; + + private string[] LowPutativeImpactEffects = new string[] + { + "5_prime_UTR_premature_start_codon_gain_variant", + "initiator_codon_variant", + "splice_region_variant", + "start_retained", // not used in human, with only one canonical start codon + "stop_retained_variant", // fairly common + "synonymous_variant", + "sequence_feature" + }; + + private string[] ModifierEffects = new string[] + { + "3_prime_UTR_variant", + "5_prime_UTR_variant", + "coding_sequence_variant", + "conserved_intergenic_variant", + "conserved_intron_variant", + "downstream_gene_variant", + "exon_variant", + "feature_elongation", + "feature_truncation", + "gene_variant", + "intergenic_region", + "intragenic_variant", + "intron_variant", + "mature_miRNA_variant", + "miRNA", + "NMD_transcript_variant", + "non_coding_transcript_exon_variant", + "non_coding_transcript_variant", + "regulatory_region_amplification", + "regulatory_region_variant", + "TF_binding_site_variant", + "TFBS_amplification", + "transcript_amplification", + "transcript_variant", + "upstream_gene_variant" + }; + + private string[] BadTranscriptWarnings = new string[] + { + "WARNING_TRANSCRIPT_INCOMPLETE", + "WARNING_TRANSCRIPT_MULTIPLE_STOP_CODONS", + "WARNING_TRANSCRIPT_NO_STOP_CODON", + "WARNING_TRANSCRIPT_NO_START_CODON" + }; + + /// + /// It looks like WARNING_TRANSCRIPT_INCOMPLETE, WARNING_TRANSCRIPT_MULTIPLE_STOP_CODONS, + /// WARNING_TRANSCRIPT_NO_STOP_CODON, and WARNING_TRANSCRIPT_NO_START_CODON are relevant to 
this program. + /// + /// These are the ones that I shouldn't be translating. + /// + /// Could also be used for error messages regarding certain transcripts. + /// + public Dictionary SnpEffWarningDescriptions = new Dictionary + { + { "ERROR_CHROMOSOME_NOT_FOUND", "Chromosome does not exists in reference genome database." }, + { "ERROR_OUT_OF_CHROMOSOME_RANGE", "The variant’s genomic coordinate is greater than chromosome's length." }, + { "WARNING_REF_DOES_NOT_MATCH_GENOME", "This means that the ‘REF’ field in the input VCF file does not match the reference genome." }, + { "WARNING_SEQUENCE_NOT_AVAILABLE", "Reference sequence is not available, thus no inference could be performed." }, + { "WARNING_TRANSCRIPT_INCOMPLETE", "A protein coding transcript having a non­multiple of 3 length, indicating that the reference genome has missing information about this trancript." }, + { "WARNING_TRANSCRIPT_MULTIPLE_STOP_CODONS", "A protein coding transcript has two or more STOP codons in the middle of the coding sequence (CDS). This should not happen and it usually means the reference genome may have an error in this transcript." }, + { "WARNING_TRANSCRIPT_NO_START_CODON", "A protein coding transcript does not have a proper START codon. It is rare that a real transcript does not have a START codon, so this probably indicates an error or missing information in the reference genome." }, + { "WARNING_TRANSCRIPT_NO_STOP_CODON", "A protein coding transcript does not have a proper STOP codon. It is rare that a real transcript does not have a STOP codon, so this probably indicates an error or missing information in the reference genome." }, + { "INFO_REALIGN_3_PRIME", "Variant has been realigned to the most 3­-prime position within the transcript. This is usually done to to comply with HGVS specification to always report the most 3-­prime annotation." }, + { "INFO_COMPOUND_ANNOTATION", "This effect is a result of combining more than one variants." 
}, + { "INFO_NON_REFERENCE_ANNOTATION", "An alternative reference sequence was used to calculate this annotation." }, + }; + } +} \ No newline at end of file diff --git a/mzLib/MassSpectrometry/Proteomics/Protein/SpliceSite.cs b/mzLib/MassSpectrometry/Proteomics/Protein/SpliceSite.cs new file mode 100644 index 000000000..c1c7cfadb --- /dev/null +++ b/mzLib/MassSpectrometry/Proteomics/Protein/SpliceSite.cs @@ -0,0 +1,37 @@ +namespace Proteomics +{ + public class SpliceSite + { + public SpliceSite(int oneBasedBegin, int oneBasedEnd, string description) + { + OneBasedBeginPosition = oneBasedBegin; + OneBasedEndPosition = oneBasedEnd; + Description = description ?? ""; + } + + public SpliceSite(int oneBasedPosition, string description) + : this(oneBasedPosition, oneBasedPosition, description) + { + } + + public int OneBasedBeginPosition { get; } + public int OneBasedEndPosition { get; } + public string Description { get; } + + public override bool Equals(object obj) + { + SpliceSite s = obj as SpliceSite; + return s != null + && s.OneBasedBeginPosition == OneBasedBeginPosition + && s.OneBasedEndPosition == OneBasedEndPosition + && s.Description == Description; + } + + public override int GetHashCode() + { + return OneBasedBeginPosition.GetHashCode() + ^ OneBasedEndPosition.GetHashCode() + ^ Description.GetHashCode(); // null handled in constructor + } + } +} \ No newline at end of file diff --git a/mzLib/MassSpectrometry/Proteomics/Protein/VariantApplication.cs b/mzLib/MassSpectrometry/Proteomics/Protein/VariantApplication.cs new file mode 100644 index 000000000..f2554694e --- /dev/null +++ b/mzLib/MassSpectrometry/Proteomics/Protein/VariantApplication.cs @@ -0,0 +1,397 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using Proteomics; + +namespace Proteomics +{ + public static class VariantApplication + { + /// + /// Gets the accession for a protein with applied variations + /// + /// + /// + public static string GetAccession(Protein 
protein,IEnumerable appliedSequenceVariations) + { + return protein.NonVariantProtein.Accession + + (appliedSequenceVariations == null || appliedSequenceVariations.Count() == 0 ? "" : $"_{CombineSimpleStrings(appliedSequenceVariations)}"); + } + + /// + /// Determines if the modification falls on a variant amino acid + /// + /// + /// + /// + public static bool IsSequenceVariantModification(SequenceVariation appliedVariant, int variantProteinIndex) + { + return appliedVariant != null && appliedVariant.Includes(variantProteinIndex); + } + + /// + /// Restores modification index on a variant protein to the index on the nonvariant protein, + /// or if it falls on a variant, this restores the position on the protein with only that variant + /// + /// + /// + /// + public static int RestoreModificationIndex(Protein protein, int variantProteinIndex) + { + return variantProteinIndex - protein.AppliedSequenceVariations + .Where(v => v.OneBasedEndPosition < variantProteinIndex) + .Sum(v => v.VariantSequence.Length - v.OriginalSequence.Length); + } + + /// + /// Format string to append to accession + /// + /// + /// + internal static string CombineSimpleStrings(IEnumerable variations) + { + return variations == null || variations.Count() == 0? "" : string.Join("_", variations.Select(v => v.SimpleString())); + } + + /// + /// Format string to append to protein names + /// + /// + /// + internal static string CombineDescriptions(IEnumerable variations) + { + return variations == null || variations.Count() == 0 ? 
"" : string.Join(", variant:", variations.Select(d => d.Description)); + } + + /// + /// Applies multiple variant changes to a protein sequence + /// + /// + /// + /// + internal static List ApplyVariants(Protein protein, IEnumerable sequenceVariations, int maxAllowedVariantsForCombinitorics, int minAlleleDepth) + { + List uniqueEffectsToApply = sequenceVariations + .GroupBy(v => v.SimpleString()) + .Select(x => x.First()) + .Where(v => v.Description.Genotypes.Count > 0) // this is a VCF line + .OrderByDescending(v => v.OneBasedBeginPosition) // apply variants at the end of the protein sequence first + .ToList(); + + Protein proteinCopy = new Protein(protein.BaseSequence, protein, null, protein.ProteolysisProducts, protein.OneBasedPossibleLocalizedModifications, null); + + // If there aren't any variants to apply, just return the base protein + if (uniqueEffectsToApply.Count == 0) + { + return new List { proteinCopy }; + } + + HashSet individuals = new HashSet(uniqueEffectsToApply.SelectMany(v => v.Description.Genotypes.Keys)); + List variantProteins = new List(); + + // loop through genotypes for each sample/individual (e.g. 
tumor and normal) + foreach (string individual in individuals) + { + bool tooManyHeterozygousVariants = uniqueEffectsToApply.Count(v => v.Description.Heterozygous[individual]) > maxAllowedVariantsForCombinitorics; + List newVariantProteins = new List { proteinCopy }; + foreach (var variant in uniqueEffectsToApply) + { + bool variantAlleleIsInTheGenotype = variant.Description.Genotypes[individual].Contains(variant.Description.AlleleIndex.ToString()); // should catch the case where it's -1 if the INFO isn't from SnpEff + if (!variantAlleleIsInTheGenotype) + { + continue; + } + bool isHomozygousAlternate = variant.Description.Homozygous[individual] && variant.Description.Genotypes[individual].All(d => d == variant.Description.AlleleIndex.ToString()); // note this isn't a great test for homozygosity, since the genotype could be 1/2 and this would still return true. But currently, alleles 1 and 2 will be included as separate variants, so this is fine for now. + bool isDeepReferenceAllele = int.TryParse(variant.Description.AlleleDepths[individual][0], out int depthRef) && depthRef >= minAlleleDepth; + bool isDeepAlternateAllele = int.TryParse(variant.Description.AlleleDepths[individual][variant.Description.AlleleIndex], out int depthAlt) && depthAlt >= minAlleleDepth; + + // homozygous alternate + if (isHomozygousAlternate && isDeepAlternateAllele) + { + newVariantProteins = newVariantProteins.Select(p => ApplySingleVariant(variant, p, individual)).ToList(); + } + + // heterozygous basic + // first protein with variants contains all homozygous variation, second contains all variations + else if (variant.Description.Heterozygous[individual] && tooManyHeterozygousVariants) + { + if (isDeepAlternateAllele && isDeepReferenceAllele) + { + if (newVariantProteins.Count == 1 && maxAllowedVariantsForCombinitorics > 0) + { + Protein variantProtein = ApplySingleVariant(variant, newVariantProteins[0], individual); + newVariantProteins.Add(variantProtein); + } + else if 
(maxAllowedVariantsForCombinitorics > 0) + { + newVariantProteins[1] = ApplySingleVariant(variant, newVariantProteins[1], individual); + } + else + { + // no heterozygous variants + } + } + else if (isDeepAlternateAllele && maxAllowedVariantsForCombinitorics > 0) + { + newVariantProteins = newVariantProteins.Select(p => ApplySingleVariant(variant, p, individual)).ToList(); + } + else + { + // keep reference only + } + } + + // heterozygous combinitorics + else if (variant.Description.Heterozygous[individual] && isDeepAlternateAllele && !tooManyHeterozygousVariants) + { + List combinitoricProteins = new List(); + + foreach (Protein ppp in newVariantProteins) + { + if (isDeepAlternateAllele && maxAllowedVariantsForCombinitorics > 0 && isDeepReferenceAllele) + { + // keep reference allele + if (variant.Description.Genotypes[individual].Contains("0")) + { + combinitoricProteins.Add(ppp); + } + + // alternate allele (replace all, since in heterozygous with two alternates, both alternates are included) + combinitoricProteins.Add(ApplySingleVariant(variant, ppp, individual)); + } + else if (isDeepAlternateAllele && maxAllowedVariantsForCombinitorics > 0) + { + combinitoricProteins.Add(ApplySingleVariant(variant, ppp, individual)); + } + else if (variant.Description.Genotypes[individual].Contains("0")) + { + combinitoricProteins.Add(ppp); + } + else + { + // must be two alternate alleles with not enough depth + } + } + newVariantProteins = combinitoricProteins; + } + } + variantProteins.AddRange(newVariantProteins); + } + + return variantProteins.GroupBy(x => x.BaseSequence).Select(x => x.First()).ToList(); + } + + /// + /// Applies a single variant to a protein sequence + /// + /// + /// + internal static Protein ApplySingleVariant(SequenceVariation variantGettingApplied, Protein protein, string individual) + { + string seqBefore = protein.BaseSequence.Substring(0, variantGettingApplied.OneBasedBeginPosition - 1); + string seqVariant = 
variantGettingApplied.VariantSequence; + int afterIdx = variantGettingApplied.OneBasedBeginPosition + variantGettingApplied.OriginalSequence.Length - 1; + + SequenceVariation variantAfterApplication = new SequenceVariation( + variantGettingApplied.OneBasedBeginPosition, + variantGettingApplied.OneBasedBeginPosition + variantGettingApplied.VariantSequence.Length - 1, + variantGettingApplied.OriginalSequence, + variantGettingApplied.VariantSequence, + variantGettingApplied.Description.Description, + variantGettingApplied.OneBasedModifications.ToDictionary(kv => kv.Key, kv => kv.Value)); + + // check to see if there is incomplete indel overlap, which would lead to weird variant sequences + // complete overlap is okay, since it will be overwritten; this can happen if there are two alternate alleles, + // e.g. reference sequence is wrong at that point + bool intersectsAppliedRegionIncompletely = protein.AppliedSequenceVariations.Any(x => variantGettingApplied.Intersects(x) && !variantGettingApplied.Includes(x)); + IEnumerable appliedVariations = new[] { variantAfterApplication }; + string seqAfter = null; + if (intersectsAppliedRegionIncompletely) + { + // use original protein sequence for the remaining sequence + seqAfter = protein.BaseSequence.Length - afterIdx <= 0 ? "" : protein.NonVariantProtein.BaseSequence.Substring(afterIdx); + } + else + { + // use this variant protein sequence for the remaining sequence + seqAfter = protein.BaseSequence.Length - afterIdx <= 0 ? 
"" : protein.BaseSequence.Substring(afterIdx); + appliedVariations = appliedVariations + .Concat(protein.AppliedSequenceVariations.Where(x => !variantGettingApplied.Includes(x))) + .ToList(); + } + string variantSequence = (seqBefore + seqVariant + seqAfter).Split('*')[0]; // there may be a stop gained + + // adjust indices + List adjustedProteolysisProducts = AdjustProteolysisProductIndices(variantGettingApplied, variantSequence, protein, protein.ProteolysisProducts); + Dictionary> adjustedModifications = AdjustModificationIndices(variantGettingApplied, variantSequence, protein); + List adjustedAppliedVariations = AdjustSequenceVariationIndices(variantGettingApplied, variantSequence, appliedVariations); + + return new Protein(variantSequence, protein, adjustedAppliedVariations, adjustedProteolysisProducts, adjustedModifications, individual); + } + + /// + /// Adjusts the indices of sequence variations due to applying a single additional variant + /// + /// + /// + /// + internal static List AdjustSequenceVariationIndices(SequenceVariation variantGettingApplied, string variantAppliedProteinSequence, IEnumerable alreadyAppliedVariations) + { + List variations = new List(); + if (alreadyAppliedVariations == null) { return variations; } + foreach (SequenceVariation v in alreadyAppliedVariations) + { + int addedIdx = alreadyAppliedVariations + .Where(applied => applied.OneBasedEndPosition < v.OneBasedBeginPosition) + .Sum(applied => applied.VariantSequence.Length - applied.OriginalSequence.Length); + + // variant was entirely before the one being applied (shouldn't happen because of order of applying variants) + // or it's the current variation + if (v.Description.Equals(variantGettingApplied.Description) || v.OneBasedEndPosition - addedIdx < variantGettingApplied.OneBasedBeginPosition) + { + variations.Add(v); + } + + // adjust indices based on new included sequence, minding possible overlaps to be filtered later + else + { + int intersectOneBasedStart = 
Math.Max(variantGettingApplied.OneBasedBeginPosition, v.OneBasedBeginPosition); + int intersectOneBasedEnd = Math.Min(variantGettingApplied.OneBasedEndPosition, v.OneBasedEndPosition); + int overlap = intersectOneBasedEnd < intersectOneBasedStart ? 0 : // no overlap + intersectOneBasedEnd - intersectOneBasedStart + 1; // there's some overlap + int sequenceLengthChange = variantGettingApplied.VariantSequence.Length - variantGettingApplied.OriginalSequence.Length; + int begin = v.OneBasedBeginPosition + sequenceLengthChange - overlap; + if (begin > variantAppliedProteinSequence.Length) + { + continue; // cut out by a stop gain + } + int end = v.OneBasedEndPosition + sequenceLengthChange - overlap; + if (end > variantAppliedProteinSequence.Length) + { + end = variantAppliedProteinSequence.Length; // end shortened by a stop gain + } + variations.Add(new SequenceVariation( + begin, + end, + v.OriginalSequence, + v.VariantSequence, + v.Description.Description, + v.OneBasedModifications.ToDictionary(kv => kv.Key, kv => kv.Value))); + } + } + return variations; + } + + /// + /// Eliminates proteolysis products that overlap sequence variations. + /// Since frameshift indels are written across the remaining sequence, + /// this eliminates proteolysis products that conflict with large deletions and other structural variations. 
+ /// + /// + /// + /// + internal static List AdjustProteolysisProductIndices(SequenceVariation variant, string variantAppliedProteinSequence, Protein protein, IEnumerable proteolysisProducts) + { + List products = new List(); + if (proteolysisProducts == null) { return products; } + int sequenceLengthChange = variant.VariantSequence.Length - variant.OriginalSequence.Length; + foreach (ProteolysisProduct p in proteolysisProducts.Where(p => p.OneBasedEndPosition.HasValue && p.OneBasedBeginPosition.HasValue)) + { + // proteolysis product is entirely before the variant + if (variant.OneBasedBeginPosition > p.OneBasedEndPosition) + { + products.Add(p); + } + // proteolysis product straddles the variant, but the cleavage site(s) are still intact; the ends aren't considered cleavage sites + else if ((p.OneBasedBeginPosition < variant.OneBasedBeginPosition || p.OneBasedBeginPosition == 1 || p.OneBasedBeginPosition == 2) + && (p.OneBasedEndPosition > variant.OneBasedEndPosition || p.OneBasedEndPosition == protein.NonVariantProtein.BaseSequence.Length)) + { + if (variant.VariantSequence.EndsWith("*")) + { + products.Add(new ProteolysisProduct(p.OneBasedBeginPosition, variantAppliedProteinSequence.Length, p.Type)); + } + else if (p.OneBasedEndPosition + sequenceLengthChange <= variantAppliedProteinSequence.Length) + { + products.Add(new ProteolysisProduct(p.OneBasedBeginPosition, p.OneBasedEndPosition + sequenceLengthChange, p.Type)); + } + else + { + // cleavage site is not intact + } + } + // proteolysis product is after the variant and there is no stop gain + else if (p.OneBasedBeginPosition > variant.OneBasedEndPosition + && p.OneBasedBeginPosition + sequenceLengthChange <= variantAppliedProteinSequence.Length + && p.OneBasedEndPosition + sequenceLengthChange <= variantAppliedProteinSequence.Length + && !variant.VariantSequence.EndsWith("*")) + { + products.Add(new ProteolysisProduct(p.OneBasedBeginPosition + sequenceLengthChange, p.OneBasedEndPosition + 
sequenceLengthChange, p.Type)); + } + else // sequence variant conflicts with proteolysis cleavage site (cleavage site was lost) + { + continue; + } + } + return products; + } + + /// + /// Adjusts modification indices. + /// + /// + /// + /// + internal static Dictionary> AdjustModificationIndices(SequenceVariation variant, string variantAppliedProteinSequence, Protein protein) + { + IDictionary> modificationDictionary = protein.OneBasedPossibleLocalizedModifications; + IDictionary> variantModificationDictionary = variant.OneBasedModifications; + Dictionary> mods = new Dictionary>(); + int sequenceLengthChange = variant.VariantSequence.Length - variant.OriginalSequence.Length; + + // change modification indices for variant sequence + if (modificationDictionary != null) + { + foreach (KeyValuePair> kv in modificationDictionary) + { + if (kv.Key > variantAppliedProteinSequence.Length) + { + continue; // it was cut out by a stop gain + } + // mod is before the variant + else if (kv.Key < variant.OneBasedBeginPosition) + { + mods.Add(kv.Key, kv.Value); + } + // mod is after the variant and not affected by a stop gain + else if (variant.OneBasedEndPosition < kv.Key && kv.Key + sequenceLengthChange <= variantAppliedProteinSequence.Length) + { + mods.Add(kv.Key + sequenceLengthChange, kv.Value); + } + else // sequence variant conflicts with modification site (modification site substitution) + { + continue; + } + } + } + + // sequence variant modifications are indexed to the variant sequence + // NOTE: this code assumes variants are added from end to beginning of protein, so that previously added variant mods are adjusted above + if (variantModificationDictionary != null) + { + foreach (var kv in variantModificationDictionary) + { + if (mods.TryGetValue(kv.Key, out var modsAtPos)) + { + modsAtPos.AddRange(kv.Value); + } + else + { + mods.Add(kv.Key, kv.Value); + } + } + } + + return mods; + } + } +} \ No newline at end of file diff --git 
/// <summary>
/// One protease cleavage rule: the residues that induce a cut, an optional residue pattern that
/// prevents it, the cut position inside the inducing pattern, and an optional exclusion for the
/// 'X' wildcard.
/// </summary>
public class DigestionMotif
{
    // Ambiguity codes accepted in a motif and the residues each one stands for.
    private static readonly char[] B = new char[] { 'D', 'N' };
    private static readonly char[] J = new char[] { 'I', 'L' };
    private static readonly char[] Z = new char[] { 'E', 'Q' };

    public readonly string InducingCleavage;
    public readonly string PreventingCleavage;
    public readonly int CutIndex;
    public readonly string ExcludeFromWildcard;

    public DigestionMotif(string inducingCleavage, string preventingCleavage, int cutIndex, string excludeFromWildcard)
    {
        this.InducingCleavage = inducingCleavage;
        this.PreventingCleavage = preventingCleavage;
        this.CutIndex = cutIndex;
        this.ExcludeFromWildcard = excludeFromWildcard;
    }

    /// <summary>
    /// Parses a comma-separated list of motifs. Double quotes and spaces are removed first.
    /// </summary>
    /// <param name="motifsString">Motif list, e.g. several motifs separated by commas.</param>
    /// <returns>One <see cref="DigestionMotif"/> per comma-separated entry, in input order.</returns>
    /// <exception cref="MzLibException">
    /// The text contains unsupported characters, or a comma inside [] or {}, or unbalanced brackets,
    /// or a wildcard exclusion whose length differs from the number of X characters in the motif.
    /// </exception>
    public static List<DigestionMotif> ParseDigestionMotifsFromString(string motifsString)
    {
        motifsString = motifsString.Replace("\"", string.Empty).Replace(" ", string.Empty);

        // throws exception if non-supported characters are used
        if (Regex.IsMatch(motifsString, @"[^a-zA-Z0-9|,[\]{}]+"))
        {
            throw new MzLibException("Unrecognized protease syntax. The digestion motif can only contain letters and {}[]|");
        }
        // throws exception if user attempts separate multiple preventing cleavages using commas
        if (Regex.IsMatch(motifsString, @"\[([\w]*,+[\w]*)*\]"))
        {
            throw new MzLibException("Unrecognized protease syntax. Please create a separate motif for each sequence preventing cleavage (comma separated).");
        }
        // throws exception if user attempts separate multiple wildcard exclusions
        if (Regex.IsMatch(motifsString, @"\{([\w]*,+[\w]*)*\}"))
        {
            throw new MzLibException("Unrecognized protease syntax. Please create a separate motif for each wildcard exclusion (comma separated).");
        }

        var motifs = new List<DigestionMotif>();
        foreach (string motifString in motifsString.Split(','))
        {
            motifs.Add(ParseDigestionMotifFromString(motifString));
        }
        return motifs;
    }

    /// <summary>
    /// Parses a single motif: extracts the [preventing] and {wildcard exclusion} parts, records the
    /// position of '|' in what remains as the cut index, and keeps the rest as the inducing pattern.
    /// </summary>
    private static DigestionMotif ParseDigestionMotifFromString(string motifString)
    {
        string preventingCleavage = null;
        string excludingWC = null;

        // an opening bracket without its closing partner (or the reverse) is a syntax error
        if (motifString.Contains("{") != motifString.Contains("}")
            || motifString.Contains("[") != motifString.Contains("]"))
        {
            throw new MzLibException("Unrecognized protease syntax. Please close any brackets used.");
        }

        // find preventing cleavage
        if (motifString.Contains("["))
        {
            int start = motifString.IndexOf('[') + 1;
            int end = motifString.IndexOf(']');

            preventingCleavage = motifString.Substring(start, end - start);
            motifString = Regex.Replace(motifString, @"\[[a-zA-Z]+\]", string.Empty);
        }

        // finds wildcard exceptions
        if (motifString.Contains("{"))
        {
            int start = motifString.IndexOf('{') + 1;
            int end = motifString.IndexOf('}');

            excludingWC = motifString.Substring(start, end - start);

            // counted on the text that still contains the {...} part, case-insensitively
            int wildcardCount = motifString.ToUpperInvariant().Count(c => c == 'X');
            if (wildcardCount != excludingWC.Length)
            {
                throw new MzLibException("Unrecognized protease syntax. Please have equal number of wildcards for multi-letter wildcard exclusions.");
            }
            motifString = Regex.Replace(motifString, @"\{[a-zA-Z]+\}", string.Empty);
        }

        // finds motif cut index; a motif without '|' keeps the default of 0
        int cutIndex = Math.Max(motifString.IndexOf('|'), 0);

        string inducingCleavage = motifString.Replace("|", string.Empty);

        return new DigestionMotif(inducingCleavage, preventingCleavage, cutIndex, excludingWC);
    }

    /// <summary>
    /// Tests whether this motif applies at <paramref name="location"/> in <paramref name="sequence"/>.
    /// </summary>
    /// <param name="sequence">Residue sequence being examined.</param>
    /// <param name="location">Zero-based index at which the inducing pattern must start.</param>
    /// <returns>
    /// Item1: the inducing pattern matches and is not cancelled by the preventing pattern.
    /// Item2: the inducing pattern matched but the preventing pattern also matched.
    /// </returns>
    public (bool, bool) Fits(string sequence, int location)
    {
        bool fits = true;
        int m;

        // check for inducing cleavage
        for (m = 0; m < InducingCleavage.Length && fits; m++) // handle patterns
        {
            if (location + m >= sequence.Length || !MotifMatches(InducingCleavage[m], sequence[location + m]))
            {
                fits = false;
            }
        }

        bool prevents = false;
        // check for preventing cleavage
        if (fits && PreventingCleavage != null)
        {
            prevents = true;
            for (int n = 0; n < PreventingCleavage.Length && prevents; n++)
            {
                // With a non-zero cut index the preventing residues are read after the inducing
                // pattern; with a zero cut index they are read from the window ending at `location`.
                int index = CutIndex != 0
                    ? location + m + n
                    : location - PreventingCleavage.Length + 1 + n;

                // Only the index that is actually read is range-checked. Checking both windows at once
                // would reject valid positions near either end of the sequence.
                if (index < 0 || index >= sequence.Length || !MotifMatches(PreventingCleavage[n], sequence[index]))
                {
                    prevents = false;
                }
            }

            fits = !prevents;
        }

        return (fits, prevents);
    }

    /// <summary>
    /// True when <paramref name="sequenceChar"/> satisfies <paramref name="motifChar"/>: an 'X' wildcard
    /// (unless the residue, as a string, equals <see cref="ExcludeFromWildcard"/>), an exact match, or
    /// membership in the B, J or Z ambiguity sets.
    /// </summary>
    private bool MotifMatches(char motifChar, char sequenceChar)
    {
        return motifChar.Equals('X') && !sequenceChar.ToString().Equals(ExcludeFromWildcard)
            || motifChar.Equals(sequenceChar)
            || motifChar.Equals('B') && B.Contains(sequenceChar)
            || motifChar.Equals('J') && J.Contains(sequenceChar)
            || motifChar.Equals('Z') && Z.Contains(sequenceChar);
    }
}
/// <summary>
/// Parameterless constructor kept so the settings can be read from toml; delegates to the main
/// constructor with "trypsin".
/// </summary>
public DigestionParams() : this("trypsin")
{
}

/// <summary>
/// Builds a parameter set. The protease is looked up by name in <c>ProteaseDictionary.Dictionary</c>.
/// </summary>
public DigestionParams(string protease = "trypsin", int maxMissedCleavages = 2, int minPeptideLength = 7, int maxPeptideLength = int.MaxValue,
    int maxModificationIsoforms = 1024, InitiatorMethionineBehavior initiatorMethionineBehavior = InitiatorMethionineBehavior.Variable,
    int maxModsForPeptides = 2, CleavageSpecificity searchModeType = CleavageSpecificity.Full, FragmentationTerminus fragmentationTerminus = FragmentationTerminus.Both,
    bool generateUnlabeledProteinsForSilac = true, bool keepNGlycopeptide = false, bool keepOGlycopeptide = false)
{
    // the three values RecordSpecificProtease reads are assigned first
    Protease = ProteaseDictionary.Dictionary[protease];
    SearchModeType = searchModeType;
    FragmentationTerminus = fragmentationTerminus;
    RecordSpecificProtease();

    MaxMissedCleavages = maxMissedCleavages;
    MinPeptideLength = minPeptideLength;
    MaxPeptideLength = maxPeptideLength;
    MaxModificationIsoforms = maxModificationIsoforms;
    MaxModsForPeptide = maxModsForPeptides;
    InitiatorMethionineBehavior = initiatorMethionineBehavior;

    GeneratehUnlabeledProteinsForSilac = generateUnlabeledProteinsForSilac;
    KeepNGlycopeptide = keepNGlycopeptide;
    KeepOGlycopeptide = keepOGlycopeptide;
}

public int MaxMissedCleavages { get; private set; }
public InitiatorMethionineBehavior InitiatorMethionineBehavior { get; private set; }
public int MinPeptideLength { get; private set; }
public int MaxPeptideLength { get; private set; }
public int MaxModificationIsoforms { get; private set; }
public int MaxModsForPeptide { get; private set; }
public Protease Protease { get; private set; }
public CleavageSpecificity SearchModeType { get; private set; } //for fast semi and nonspecific searching of proteases
public FragmentationTerminus FragmentationTerminus { get; private set; } //for fast semi searching of proteases
public Protease SpecificProtease { get; private set; } //for fast semi and nonspecific searching of proteases
public bool GeneratehUnlabeledProteinsForSilac { get; private set; } //used to look for unlabeled proteins (in addition to labeled proteins) for SILAC experiments
public bool KeepNGlycopeptide { get; private set; }
public bool KeepOGlycopeptide { get; private set; }

/// <summary>
/// Value equality over every setting except <see cref="SpecificProtease"/>.
/// </summary>
public override bool Equals(object obj)
{
    if (!(obj is DigestionParams other))
    {
        return false;
    }

    return MaxMissedCleavages.Equals(other.MaxMissedCleavages)
        && MinPeptideLength.Equals(other.MinPeptideLength)
        && MaxPeptideLength.Equals(other.MaxPeptideLength)
        && InitiatorMethionineBehavior.Equals(other.InitiatorMethionineBehavior)
        && MaxModificationIsoforms.Equals(other.MaxModificationIsoforms)
        && MaxModsForPeptide.Equals(other.MaxModsForPeptide)
        && Protease.Equals(other.Protease)
        && SearchModeType.Equals(other.SearchModeType)
        && FragmentationTerminus.Equals(other.FragmentationTerminus)
        && GeneratehUnlabeledProteinsForSilac.Equals(other.GeneratehUnlabeledProteinsForSilac)
        && KeepNGlycopeptide.Equals(other.KeepNGlycopeptide)
        && KeepOGlycopeptide.Equals(other.KeepOGlycopeptide);
}

/// <summary>
/// Hash built from four of the settings compared by <see cref="Equals(object)"/>, so equal
/// instances always hash alike.
/// </summary>
public override int GetHashCode()
{
    int hash = MaxMissedCleavages.GetHashCode();
    hash ^= InitiatorMethionineBehavior.GetHashCode();
    hash ^= MaxModificationIsoforms.GetHashCode();
    hash ^= MaxModsForPeptide.GetHashCode();
    return hash;
}

/// <summary>
/// Comma-separated settings; the protease is reported by the name of <see cref="SpecificProtease"/>.
/// </summary>
public override string ToString()
{
    return string.Join(",",
        MaxMissedCleavages,
        InitiatorMethionineBehavior,
        MinPeptideLength,
        MaxPeptideLength,
        MaxModificationIsoforms,
        MaxModsForPeptide,
        SpecificProtease.Name,
        SearchModeType,
        FragmentationTerminus,
        GeneratehUnlabeledProteinsForSilac,
        KeepNGlycopeptide,
        KeepOGlycopeptide);
}

/// <summary>
/// Remembers the requested protease in <see cref="SpecificProtease"/>. For a search mode of
/// <c>CleavageSpecificity.None</c>, <see cref="Protease"/> is then replaced by the "singleN" entry
/// (N fragmentation terminus) or the "singleC" entry (any other terminus).
/// </summary>
private void RecordSpecificProtease()
{
    SpecificProtease = Protease;

    if (SearchModeType != CleavageSpecificity.None)
    {
        return;
    }

    //nonspecific searches, which might have a specific protease
    string singleTerminusKey = FragmentationTerminus == FragmentationTerminus.N ? "singleN" : "singleC";
    Protease = ProteaseDictionary.Dictionary[singleTerminusKey];
}
/// <summary>
/// Dictionary of modifications on the peptide. The N terminus is index 1.
/// The key indicates which residue modification is on (with 1 being N terminus).
/// </summary>
[NonSerialized] private Dictionary<int, Modification> _allModsOneIsNterminus; //we currently only allow one mod per position
[NonSerialized] private bool? _hasChemicalFormulas;
[NonSerialized] private string _sequenceWithChemicalFormulas;
[NonSerialized] private double? _monoisotopicMass;
[NonSerialized] private double? _mostAbundantMass;
[NonSerialized] private ChemicalFormula _fullChemicalFormula;
[NonSerialized] private DigestionParams _digestionParams;
private static readonly double WaterMonoisotopicMass = PeriodicTable.GetElement("H").PrincipalIsotope.AtomicMass * 2 + PeriodicTable.GetElement("O").PrincipalIsotope.AtomicMass;
private readonly string ProteinAccession; // used to get protein object after deserialization

/// <summary>
/// Creates a PeptideWithSetModifications object from a protein. Used when a Protein is digested.
/// </summary>
/// <param name="allModsOneIsNterminus">Modifications keyed by position, with key 1 being the N terminus. Stored by reference, not copied.</param>
/// <param name="numFixedMods">Number of the supplied modifications that are fixed.</param>
/// <param name="pairedTargetDecoyHash">Optional hash linking a target peptide to its decoy, or a decoy to its target.</param>
public PeptideWithSetModifications(Protein protein, DigestionParams digestionParams, int oneBasedStartResidueInProtein,
    int oneBasedEndResidueInProtein, CleavageSpecificity cleavageSpecificity, string peptideDescription, int missedCleavages,
    Dictionary<int, Modification> allModsOneIsNterminus, int numFixedMods, string baseSequence = null, int? pairedTargetDecoyHash = null)
    : base(protein, oneBasedStartResidueInProtein, oneBasedEndResidueInProtein, missedCleavages, cleavageSpecificity, peptideDescription, baseSequence)
{
    // the modification dictionary is assigned before the full sequence is determined
    _allModsOneIsNterminus = allModsOneIsNterminus;
    NumFixedMods = numFixedMods;
    _digestionParams = digestionParams;
    DetermineFullSequence();
    ProteinAccession = protein.Accession;
    UpdateCleavageSpecificity();
    PairedTargetDecoyHash = pairedTargetDecoyHash; // Added PairedTargetDecoyHash as a nullable integer
}
/// <summary>Digestion parameters this peptide was created with (may be null).</summary>
public DigestionParams DigestionParams
{
    get { return _digestionParams; }
}

/// <summary>
/// Modifications on the peptide keyed by position, with key 1 being the N terminus.
/// Returns the backing dictionary itself, not a copy.
/// </summary>
public Dictionary<int, Modification> AllModsOneIsNterminus
{
    get { return _allModsOneIsNterminus; }
}

/// <summary>Total number of modifications on the peptide.</summary>
public int NumMods
{
    get { return AllModsOneIsNterminus.Count; }
}

/// <summary>Number of modifications that are not fixed (<see cref="NumMods"/> minus <c>NumFixedMods</c>).</summary>
public int NumVariableMods
{
    get { return NumMods - NumFixedMods; }
}

/// <summary>
/// Monoisotopic mass of the peptide: water, plus every modification, plus every residue of the base
/// sequence. The unrounded sum is cached on first access; the rounded value is returned.
/// </summary>
public double MonoisotopicMass
{
    get
    {
        if (!_monoisotopicMass.HasValue)
        {
            double monoMass = WaterMonoisotopicMass;

            // the order of these additions is kept as-is so the floating-point result does not change
            foreach (var mod in AllModsOneIsNterminus.Values)
            {
                monoMass += mod.MonoisotopicMass.Value;
            }
            monoMass += BaseSequence.Sum(b => Residue.ResidueMonoisotopicMass[b]);

            _monoisotopicMass = monoMass;
        }
        return (double)ClassExtensions.RoundedDouble(_monoisotopicMass.Value);
    }
}
/// <summary>
/// Mass of the most intense peak of the isotopic distribution computed from
/// <c>FullChemicalFormula</c>. Cached after the first access; returned rounded.
/// </summary>
public double MostAbundantMass
{
    get
    {
        if (!_mostAbundantMass.HasValue)
        {
            IsotopicDistribution dist = IsotopicDistribution.GetDistribution(this.FullChemicalFormula);

            // Masses and Intensities hand back a fresh copy on every access, so read each one once
            double[] intensities = dist.Intensities;
            double[] masses = dist.Masses;

            // first index holding the maximum intensity
            int apexIndex = Array.IndexOf(intensities, intensities.Max());
            _mostAbundantMass = (double)ClassExtensions.RoundedDouble(masses[apexIndex]);
        }
        return (double)ClassExtensions.RoundedDouble(_mostAbundantMass.Value);
    }
}

/// <summary>
/// Base sequence with the chemical formula of each modification written in square brackets: the
/// N-terminal modification (key 1) first, then each residue followed by its modification
/// (key r + 2), then the C-terminal modification (key Length + 2).
/// Built once; later reads return the stored text. If a stored modification is null the getter
/// returns null, and keeps returning null afterwards.
/// </summary>
public string SequenceWithChemicalFormulas
{
    get
    {
        if (!_hasChemicalFormulas.HasValue)
        {
            _hasChemicalFormulas = true;
            var subsequence = new StringBuilder();

            // Appends "[formula]" for the modification stored under modKey, if there is one.
            // Returns false only when the key is present but holds a null modification.
            bool TryAppendFormula(int modKey)
            {
                if (!AllModsOneIsNterminus.TryGetValue(modKey, out Modification mod))
                {
                    return true;
                }
                if (mod is null)
                {
                    return false;
                }
                subsequence.Append('[' + mod.ChemicalFormula.Formula + ']');
                return true;
            }

            // variable modification on peptide N-terminus
            if (!TryAppendFormula(1))
            {
                return null;
            }

            for (int r = 0; r < Length; r++)
            {
                subsequence.Append(this[r]);

                // variable modification on this residue
                if (!TryAppendFormula(r + 2))
                {
                    return null;
                }
            }

            // variable modification on peptide C-terminus
            if (!TryAppendFormula(Length + 2))
            {
                return null;
            }

            _sequenceWithChemicalFormulas = subsequence.ToString();
        }
        return _sequenceWithChemicalFormulas;
    }
}
/// <summary>
/// Generates theoretical fragments for given dissociation type for this peptide.
/// The "products" parameter is filled with these fragments.
/// </summary>
/// <param name="dissociationType">Selects the product types, their mass shifts, and which modification neutral losses and diagnostic ions apply.</param>
/// <param name="fragmentationTerminus">N, C or Both; decides which of the two terminal ion series is generated.</param>
/// <param name="products">Output list. It is cleared first and then filled in place.</param>
public void Fragment(DissociationType dissociationType, FragmentationTerminus fragmentationTerminus, List<Product> products)
{
    // This code is specifically written to be memory- and CPU -efficient because it is
    // called millions of times for a typical search (i.e., at least once per peptide).
    // If you modify this code, BE VERY CAREFUL about allocating new memory, especially
    // for new collections. This code also deliberately avoids using "yield return", again
    // for performance reasons. Be sure to benchmark any changes with a parallelized
    // fragmentation of every peptide in a database (i.e., test for speed decreases and
    // memory issues).

    products.Clear();

    var massCaps = DissociationTypeCollection.GetNAndCTerminalMassShiftsForDissociationType(dissociationType);

    double cTermMass = 0;
    double nTermMass = 0;

    List<ProductType> nTermProductTypes = DissociationTypeCollection.GetTerminusSpecificProductTypesFromDissociation(dissociationType, FragmentationTerminus.N);
    List<ProductType> cTermProductTypes = DissociationTypeCollection.GetTerminusSpecificProductTypesFromDissociation(dissociationType, FragmentationTerminus.C);

    bool calculateNTermFragments = fragmentationTerminus == FragmentationTerminus.N
        || fragmentationTerminus == FragmentationTerminus.Both;

    bool calculateCTermFragments = fragmentationTerminus == FragmentationTerminus.C
        || fragmentationTerminus == FragmentationTerminus.Both;

    //From http://www.matrixscience.com/help/fragmentation_help.html
    //Low Energy CID -- In low energy CID(i.e.collision induced dissociation in a triple quadrupole or an ion trap) a peptide carrying a positive charge fragments mainly along its backbone,
    //generating predominantly b and y ions. In addition, for fragments containing RKNQ, peaks are seen for ions that have lost ammonia (-17 Da) denoted a*, b* and y*. For fragments containing
    //STED, loss of water(-18 Da) is denoted a°, b° and y°. Satellite ions from side chain cleavage are not observed.
    bool haveSeenNTermDegreeIon = false;
    bool haveSeenNTermStarIon = false;
    bool haveSeenCTermDegreeIon = false;
    bool haveSeenCTermStarIon = false;

    // these two collections keep track of the neutral losses observed so far on the n-term or c-term.
    // they are apparently necessary, but allocating memory for collections in this function results in
    // inefficient memory usage and thus frequent garbage collection.
    // TODO: If you can think of a way to remove these collections and still maintain correct
    // fragmentation, please do so.
    HashSet<double> nTermNeutralLosses = null;
    HashSet<double> cTermNeutralLosses = null;

    // n-terminus mod
    if (calculateNTermFragments)
    {
        if (AllModsOneIsNterminus.TryGetValue(1, out Modification mod))
        {
            nTermMass += mod.MonoisotopicMass.Value;

            // n-term mod neutral loss
            nTermNeutralLosses = AddNeutralLossesFromMods(mod, nTermNeutralLosses, dissociationType);
        }
    }

    // c-terminus mod
    if (calculateCTermFragments)
    {
        if (AllModsOneIsNterminus.TryGetValue(BaseSequence.Length + 2, out Modification mod))
        {
            cTermMass += mod.MonoisotopicMass.Value;

            // c-term mod neutral loss
            cTermNeutralLosses = AddNeutralLossesFromMods(mod, cTermNeutralLosses, dissociationType);
        }
    }

    // Each pass extends the N-terminal running mass by residue r and the C-terminal running mass
    // by the residue r places from the end, so both series grow together.
    for (int r = 0; r < BaseSequence.Length - 1; r++)
    {
        // n-term fragments
        if (calculateNTermFragments)
        {
            char nTermResidue = BaseSequence[r];

            // get n-term residue mass
            if (Residue.TryGetResidue(nTermResidue, out Residue residue))
            {
                nTermMass += residue.MonoisotopicMass;
            }
            else
            {
                // unknown residue: the running mass, and every later fragment built on it, becomes NaN
                nTermMass = double.NaN;
            }

            // add side-chain mod
            if (AllModsOneIsNterminus.TryGetValue(r + 2, out Modification mod))
            {
                nTermMass += mod.MonoisotopicMass.Value;
            }

            // handle star and degree ions for low-res CID
            if (dissociationType == DissociationType.LowCID)
            {
                if (nTermResidue == 'R' || nTermResidue == 'K' || nTermResidue == 'N' || nTermResidue == 'Q')
                {
                    haveSeenNTermStarIon = true;
                }

                if (nTermResidue == 'S' || nTermResidue == 'T' || nTermResidue == 'E' || nTermResidue == 'D')
                {
                    haveSeenNTermDegreeIon = true;
                }
            }

            // skip first N-terminal fragment (b1, aDegree1, ...) for CID
            if (r == 0 && (dissociationType == DissociationType.CID || dissociationType == DissociationType.LowCID))
            {
                goto CTerminusFragments;
            }

            // generate products
            for (int i = 0; i < nTermProductTypes.Count; i++)
            {
                if (dissociationType == DissociationType.LowCID)
                {
                    if (!haveSeenNTermStarIon && (nTermProductTypes[i] == ProductType.aStar || nTermProductTypes[i] == ProductType.bAmmoniaLoss))
                    {
                        continue;
                    }

                    if (!haveSeenNTermDegreeIon && (nTermProductTypes[i] == ProductType.aDegree || nTermProductTypes[i] == ProductType.bWaterLoss))
                    {
                        continue;
                    }
                }

                products.Add(new Product(
                    nTermProductTypes[i],
                    FragmentationTerminus.N,
                    nTermMass + massCaps.Item1[i],
                    r + 1,
                    r + 1,
                    0));

                // mod is null here when residue r carries no modification
                nTermNeutralLosses = AddNeutralLossesFromMods(mod, nTermNeutralLosses, dissociationType);

                if (nTermNeutralLosses != null)
                {
                    foreach (double neutralLoss in nTermNeutralLosses)
                    {
                        products.Add(new Product(
                            nTermProductTypes[i],
                            FragmentationTerminus.N,
                            nTermMass + massCaps.Item1[i] - neutralLoss,
                            r + 1,
                            r + 1,
                            neutralLoss));
                    }
                }
            }
        }

        // c-term fragments
        CTerminusFragments:
        if (calculateCTermFragments)
        {
            char cTermResidue = BaseSequence[BaseSequence.Length - r - 1];

            // get c-term residue mass
            if (Residue.TryGetResidue(cTermResidue, out Residue residue))
            {
                cTermMass += residue.MonoisotopicMass;
            }
            else
            {
                cTermMass = double.NaN;
            }

            // add side-chain mod
            if (AllModsOneIsNterminus.TryGetValue(BaseSequence.Length - r + 1, out Modification mod))
            {
                cTermMass += mod.MonoisotopicMass.Value;
            }

            // handle star and degree ions for low-res CID
            if (dissociationType == DissociationType.LowCID)
            {
                if (cTermResidue == 'R' || cTermResidue == 'K' || cTermResidue == 'N' || cTermResidue == 'Q')
                {
                    haveSeenCTermStarIon = true;
                }

                if (cTermResidue == 'S' || cTermResidue == 'T' || cTermResidue == 'E' || cTermResidue == 'D')
                {
                    haveSeenCTermDegreeIon = true;
                }
            }

            // generate products
            for (int i = 0; i < cTermProductTypes.Count; i++)
            {
                // skip zDot ions for proline residues for ETD/ECD/EThcD
                if (cTermResidue == 'P'
                    && (dissociationType == DissociationType.ECD || dissociationType == DissociationType.ETD || dissociationType == DissociationType.EThcD)
                    && cTermProductTypes[i] == ProductType.zDot)
                {
                    continue;
                }

                if (dissociationType == DissociationType.LowCID)
                {
                    if (!haveSeenCTermStarIon && cTermProductTypes[i] == ProductType.yAmmoniaLoss)
                    {
                        continue;
                    }

                    if (!haveSeenCTermDegreeIon && cTermProductTypes[i] == ProductType.yWaterLoss)
                    {
                        continue;
                    }
                }

                products.Add(new Product(
                    cTermProductTypes[i],
                    FragmentationTerminus.C,
                    cTermMass + massCaps.Item2[i],
                    r + 1,
                    BaseSequence.Length - r,
                    0));

                cTermNeutralLosses = AddNeutralLossesFromMods(mod, cTermNeutralLosses, dissociationType);

                if (cTermNeutralLosses != null)
                {
                    foreach (double neutralLoss in cTermNeutralLosses)
                    {
                        products.Add(new Product(
                            cTermProductTypes[i],
                            FragmentationTerminus.C,
                            cTermMass + massCaps.Item2[i] - neutralLoss,
                            r + 1,
                            BaseSequence.Length - r,
                            neutralLoss));
                    }
                }
            }
        }
    }

    // zDot generates one more ion...
    //ETD will cleave between N - C bond.So ETD will remove a NH3 from the N-terminal amino acid, and generate(MH + minus NH3) ion
    // NOTE(review): this block is not gated by calculateCTermFragments, so it also runs when only
    // N-terminal fragments were requested (cTermMass then holds no accumulated residues) - confirm intent.
    if (cTermProductTypes.Contains(ProductType.zDot) && BaseSequence[0] != 'P')
    {
        // get c-term residue mass
        if (Residue.TryGetResidue(BaseSequence[0], out Residue residue))
        {
            cTermMass += residue.MonoisotopicMass;
        }
        else
        {
            cTermMass = double.NaN;
        }

        // add side-chain mod
        if (AllModsOneIsNterminus.TryGetValue(2, out Modification mod))
        {
            cTermMass += mod.MonoisotopicMass.Value;
        }

        // generate zDot product
        products.Add(new Product(
            ProductType.zDot,
            FragmentationTerminus.C,
            cTermMass + DissociationTypeCollection.GetMassShiftFromProductType(ProductType.zDot),
            BaseSequence.Length,
            1,
            0));

        cTermNeutralLosses = AddNeutralLossesFromMods(mod, cTermNeutralLosses, dissociationType);

        if (cTermNeutralLosses != null)
        {
            foreach (double neutralLoss in cTermNeutralLosses)
            {
                products.Add(new Product(
                    ProductType.zDot,
                    FragmentationTerminus.C,
                    cTermMass + DissociationTypeCollection.GetMassShiftFromProductType(ProductType.zDot) - neutralLoss,
                    BaseSequence.Length,
                    1,
                    neutralLoss));
            }
        }
    }

    foreach (var mod in AllModsOneIsNterminus.Where(p => p.Value.NeutralLosses != null))
    {
        // molecular ion minus neutral losses (zero-valued losses are filtered out by the Where)
        if (mod.Value.NeutralLosses.TryGetValue(dissociationType, out List<double> losses))
        {
            foreach (double neutralLoss in losses.Where(p => p != 0))
            {
                products.Add(new Product(ProductType.M, FragmentationTerminus.Both, MonoisotopicMass - neutralLoss, 0, 0, neutralLoss));
            }
        }

        if (mod.Value.NeutralLosses.TryGetValue(DissociationType.AnyActivationType, out losses))
        {
            foreach (double neutralLoss in losses.Where(p => p != 0))
            {
                products.Add(new Product(ProductType.M, FragmentationTerminus.Both, MonoisotopicMass - neutralLoss, 0, 0, neutralLoss));
            }
        }
    }

    // generate diagnostic ions
    // TODO: this code is memory-efficient but sort of CPU inefficient; it can be further optimized.
    // however, diagnostic ions are fairly rare so it's probably OK for now
    foreach (double diagnosticIon in AllModsOneIsNterminus
        .Where(p => p.Value.DiagnosticIons != null)
        .SelectMany(p => p.Value.DiagnosticIons.Where(v => v.Key == dissociationType || v.Key == DissociationType.AnyActivationType))
        .SelectMany(p => p.Value)
        .Distinct())
    {
        int diagnosticIonLabel = (int)Math.Round(diagnosticIon.ToMz(1), 0);

        // the diagnostic ion is assumed to be annotated in the mod info as the *neutral mass* of the diagnostic ion, not the ionized species
        products.Add(new Product(ProductType.D, FragmentationTerminus.Both, diagnosticIon, diagnosticIonLabel, 0, 0));
    }
}
/// <summary>
/// Generates theoretical internal fragments for given dissociation type for this peptide.
/// The "products" parameter is filled with these fragments.
/// The "minLengthOfFragments" parameter is the minimum number of amino acids for an internal fragment to be included
/// TODO: Implement neutral losses (e.g. phospho)
/// TODO: Implement Star/Degree ions from CID
/// </summary>
/// <param name="dissociationType">Selects the N- and C-terminal product types and their mass shifts.</param>
/// <param name="minLengthOfFragments">Minimum number of residues in a reported internal fragment.</param>
/// <param name="products">Output list. It is cleared first and then filled in place.</param>
public void FragmentInternally(DissociationType dissociationType, int minLengthOfFragments, List<Product> products)
{
    products.Clear();

    var massCaps = DissociationTypeCollection.GetNAndCTerminalMassShiftsForDissociationType(dissociationType);

    List<ProductType> nTermProductTypes = DissociationTypeCollection.GetTerminusSpecificProductTypesFromDissociation(dissociationType, FragmentationTerminus.N);
    List<ProductType> cTermProductTypes = DissociationTypeCollection.GetTerminusSpecificProductTypesFromDissociation(dissociationType, FragmentationTerminus.C);

    //foreach start (N-term) index possible
    for (int n = 1; n <= BaseSequence.Length - minLengthOfFragments - 1; n++)
    {
        double fragmentMass = 0;

        //populate with smallest possible fragment (minus 1) from this starting residue
        for (int i = 0; i < minLengthOfFragments - 1; i++)
        {
            if (Residue.TryGetResidue(BaseSequence[n + i], out Residue residue))
            {
                fragmentMass += residue.MonoisotopicMass;

                // add side-chain mod
                if (AllModsOneIsNterminus.TryGetValue(n + i + 2, out Modification mod))
                {
                    fragmentMass += mod.MonoisotopicMass.Value;
                }
            }
            else
            {
                // unknown residue: every fragment starting at n will carry a NaN mass
                fragmentMass = double.NaN;
            }
        }

        //expand length of fragment, adding each new length as a new fragment ion, until we reach the C1 residue.
        for (int c = n + minLengthOfFragments - 1; c < BaseSequence.Length - 1; c++)
        {
            if (Residue.TryGetResidue(BaseSequence[c], out Residue residue))
            {
                fragmentMass += residue.MonoisotopicMass;

                // add side-chain mod
                if (AllModsOneIsNterminus.TryGetValue(c + 2, out Modification mod))
                {
                    fragmentMass += mod.MonoisotopicMass.Value;
                }

                //add new fragment
                //loop to accomodate EThcD
                for (int i = 0; i < nTermProductTypes.Count; i++)
                {
                    double massCap = massCaps.Item1[i];
                    for (int j = 0; j < cTermProductTypes.Count; j++)
                    {
                        double massCap2 = massCaps.Item2[j];

                        //do c, then n terminal ions
                        products.Add(new Product(cTermProductTypes[j], FragmentationTerminus.None, fragmentMass + massCap + massCap2 - WaterMonoisotopicMass,
                            n + 1, c - n + 1, 0, nTermProductTypes[i], c + 1));
                    }
                }
            }
            else
            {
                // unknown residue: no product for this length, and longer fragments from n become NaN
                fragmentMass = double.NaN;
            }
        }
    }
}
/// <summary>
/// Returns a copy of this peptide in which the modification stored under key <c>j + 2</c> is replaced
/// by a single "Anywhere." modification whose mass is <paramref name="massToLocalize"/> plus the
/// mass of whatever modification was already there.
/// </summary>
/// <param name="j">Index offset; the modification is stored under key <c>j + 2</c>.</param>
/// <param name="massToLocalize">Mass to place at that position.</param>
/// <returns>A new peptide; this instance and its modification dictionary are left untouched.</returns>
public PeptideWithSetModifications Localize(int j, double massToLocalize)
{
    var dictWithLocalizedMass = new Dictionary<int, Modification>(AllModsOneIsNterminus);
    int modKey = j + 2;

    double massOfExistingMod = 0;
    if (dictWithLocalizedMass.TryGetValue(modKey, out Modification modToReplace))
    {
        massOfExistingMod = (double)modToReplace.MonoisotopicMass;
    }

    // the indexer overwrites an existing entry or adds a new one
    dictWithLocalizedMass[modKey] = new Modification(_locationRestriction: "Anywhere.", _monoisotopicMass: massToLocalize + massOfExistingMod);

    return new PeptideWithSetModifications(Protein, _digestionParams, OneBasedStartResidueInProtein, OneBasedEndResidueInProtein,
        CleavageSpecificityForFdrCategory, PeptideDescription, MissedCleavages, dictWithLocalizedMass, NumFixedMods);
}

/// <summary>
/// Determines whether a peptide includes a splice site
/// </summary>
/// <param name="site">Splice site to test.</param>
/// <returns>True when the peptide starts at or before the site's begin position and ends at or after its end position.</returns>
public bool IncludesSpliceSite(SpliceSite site)
{
    return OneBasedStartResidueInProtein <= site.OneBasedBeginPosition && OneBasedEndResidueInProtein >= site.OneBasedEndPosition;
}
+ //possible locations for variant start site + bool VariantStartsBeforePeptide = appliedVariation.OneBasedBeginPosition < OneBasedStartResidueInProtein; + bool VariantStartsAtPeptideStart = appliedVariation.OneBasedBeginPosition == OneBasedStartResidueInProtein; + bool VariantStartsInsidePeptide = appliedVariation.OneBasedBeginPosition >= OneBasedStartResidueInProtein && appliedVariation.OneBasedBeginPosition < OneBasedEndResidueInProtein; + bool VariantStartsAtPeptideEnd = appliedVariation.OneBasedBeginPosition == OneBasedEndResidueInProtein; + //possibe locations for variant end stite + bool VariantEndsAtPeptideStart = appliedVariation.OneBasedEndPosition == OneBasedStartResidueInProtein; + bool VariantEndsInsidePeptide = appliedVariation.OneBasedEndPosition > OneBasedStartResidueInProtein && appliedVariation.OneBasedEndPosition <= OneBasedEndResidueInProtein; + bool VariantEndsAtPeptideEnd = appliedVariation.OneBasedEndPosition == OneBasedEndResidueInProtein; + bool VariantEndsAfterPeptide = appliedVariation.OneBasedEndPosition > OneBasedEndResidueInProtein; + + bool intersects = false; + bool identifies = false; + //start and end combinations that lead to variants being intersected by the peptide sequnce + if (VariantStartsBeforePeptide || VariantStartsAtPeptideStart) + { + if (VariantEndsAtPeptideStart || VariantEndsInsidePeptide || VariantEndsAtPeptideEnd || VariantEndsAfterPeptide) + { + intersects = true; + } + } + else if (VariantStartsInsidePeptide) + { + if (VariantEndsInsidePeptide || VariantEndsAfterPeptide || VariantEndsAtPeptideEnd) + { + intersects = true; + } + } + else if (VariantStartsAtPeptideEnd) + { + if (VariantEndsAfterPeptide || VariantEndsAtPeptideEnd) + { + intersects = true; + } + } + + if (intersects == true) + { + int lengthDiff = appliedVariation.VariantSequence.Length - appliedVariation.OriginalSequence.Length; + int intersectOneBasedStart = Math.Max(OneBasedStartResidueInProtein, appliedVariation.OneBasedBeginPosition); + int 
intersectOneBasedEnd = Math.Min(OneBasedEndResidueInProtein, appliedVariation.OneBasedEndPosition + lengthDiff); + int intersectSize = intersectOneBasedEnd - intersectOneBasedStart + 1; + + // if the original sequence within the peptide is shorter or longer than the variant sequence within the peptide, there is a sequence change + int variantZeroBasedStartInPeptide = intersectOneBasedStart - appliedVariation.OneBasedBeginPosition; + bool origSeqIsShort = appliedVariation.OriginalSequence.Length - variantZeroBasedStartInPeptide < intersectSize; + bool origSeqIsLong = appliedVariation.OriginalSequence.Length > intersectSize && OneBasedEndResidueInProtein > intersectOneBasedEnd; + if (origSeqIsShort || origSeqIsLong) + { + identifies = true; + } + else + { + // crosses the entire variant sequence (needed to identify truncations and certain deletions, like KAAAAAAAAA -> K, but also catches synonymous variations A -> A) + bool crossesEntireVariant = intersectSize == appliedVariation.VariantSequence.Length; + + if (crossesEntireVariant == true) + { + // is the variant sequence intersecting the peptide different than the original sequence? + string originalAtIntersect = appliedVariation.OriginalSequence.Substring(intersectOneBasedStart - appliedVariation.OneBasedBeginPosition, intersectSize); + string variantAtIntersect = appliedVariation.VariantSequence.Substring(intersectOneBasedStart - appliedVariation.OneBasedBeginPosition, intersectSize); + identifies = originalAtIntersect != variantAtIntersect; + } + } + } + //checks to see if the variant causes a cleavage event creating the peptide. This is how a variant can be identified without intersecting + //with the peptide itself + else + { + //We need to account for any variants that occur in the protien prior to the variant in question. 
+ //This information is used to calculate a scaling factor to calculate the AA that proceeds the peptide seqeunce in the original (variant free) protein + List VariantsThatAffectPreviousAAPosition = Protein.AppliedSequenceVariations.Where(v => v.OneBasedEndPosition <= OneBasedStartResidueInProtein).ToList(); + int totalLengthDifference = 0; + foreach (var variant in VariantsThatAffectPreviousAAPosition) + { + totalLengthDifference += variant.VariantSequence.Length - variant.OriginalSequence.Length; + } + + //need to determine what the cleavage sites are for the protease used (will allow us to determine if new cleavage sites were made by variant) + List proteasesCleavageSites = DigestionParams.Protease.DigestionMotifs; + //if the variant ends the AA before the peptide starts then it may have caused c-terminal cleavage + //see if the protease used for digestion has C-terminal cleavage sites + List cTerminalResidue = proteasesCleavageSites.Where(dm => dm.CutIndex == 1).Select(d => d.InducingCleavage).ToList(); + + if (appliedVariation.OneBasedEndPosition == (OneBasedStartResidueInProtein - 1)) + { + if (cTerminalResidue.Count > 0) + { + // get the AA that proceeds the peptide from the variant protein (AKA the last AA in the variant) + PeptideWithSetModifications previousAA_Variant = new PeptideWithSetModifications(Protein, DigestionParams, OneBasedStartResidueInProtein - 1, OneBasedStartResidueInProtein - 1, CleavageSpecificity.Full, "full", 0, AllModsOneIsNterminus, NumFixedMods); + + // get the AA that proceeds the peptide sequence in the original protein (wihtout any applied variants) + PeptideWithSetModifications previousAA_Original = new PeptideWithSetModifications(Protein.NonVariantProtein, DigestionParams, (OneBasedStartResidueInProtein - 1) - totalLengthDifference, (OneBasedStartResidueInProtein - 1) - totalLengthDifference, CleavageSpecificity.Full, "full", 0, AllModsOneIsNterminus, NumFixedMods); + bool newSite = 
cTerminalResidue.Contains(previousAA_Variant.BaseSequence); + bool oldSite = cTerminalResidue.Contains(previousAA_Original.BaseSequence); + // if the new AA causes a cleavage event, and that cleavage event would not have occurred without the variant then it is identified + if (newSite == true && oldSite == false) + { + identifies = true; + } + } + } + //if the variant begins the AA after the peptide ends then it may have caused n-terminal cleavage + else if (appliedVariation.OneBasedBeginPosition == (OneBasedEndResidueInProtein + 1)) + { + //see if the protease used for digestion has N-terminal cleavage sites + List nTerminalResidue = proteasesCleavageSites.Where(dm => dm.CutIndex == 0).Select(d => d.InducingCleavage).ToList(); + // stop gain variation can create a peptide this checks for this with cTerminal cleavage proteases + if (cTerminalResidue.Count > 0) + { + if (appliedVariation.VariantSequence == "*") + { + PeptideWithSetModifications lastAAofPeptide = new PeptideWithSetModifications(Protein, DigestionParams, OneBasedEndResidueInProtein, OneBasedEndResidueInProtein, CleavageSpecificity.Full, "full", 0, AllModsOneIsNterminus, NumFixedMods); + bool oldSite = cTerminalResidue.Contains(lastAAofPeptide.BaseSequence); + if (oldSite == false) + { + identifies = true; + } + } + } + + if (nTerminalResidue.Count > 0) + { + if (Protein.Length >= OneBasedEndResidueInProtein + 1) + { + //get the AA that follows the peptide sequence fromt he variant protein (AKA the first AA of the varaint) + PeptideWithSetModifications nextAA_Variant = new PeptideWithSetModifications(Protein, DigestionParams, OneBasedEndResidueInProtein + 1, OneBasedEndResidueInProtein + 1, CleavageSpecificity.Full, "full", 0, AllModsOneIsNterminus, NumFixedMods); + + // checks to make sure the original protein has an amino acid following the peptide (an issue with stop loss variants or variatns that add AA after the previous stop residue) + // no else statement because if the peptide end residue was 
the previous protein stop site, there is no way to truly identify the variant. + // if the peptide were to extend into the stop loss region then the peptide would intesect the variant and this code block would not be triggered. + if (Protein.NonVariantProtein.Length >= OneBasedEndResidueInProtein + 1) + { + // get the AA that follows the peptide sequence in the original protein (without any applied variants) + PeptideWithSetModifications nextAA_Original = new PeptideWithSetModifications(Protein.NonVariantProtein, DigestionParams, (OneBasedEndResidueInProtein + 1) - totalLengthDifference, (OneBasedEndResidueInProtein + 1) - totalLengthDifference, CleavageSpecificity.Full, "full", 0, AllModsOneIsNterminus, NumFixedMods); + bool newSite = nTerminalResidue.Contains(nextAA_Variant.BaseSequence); + bool oldSite = nTerminalResidue.Contains(nextAA_Original.BaseSequence); + // if the new AA causes a cleavage event, and that cleavage event would not have occurred without the variant then it is identified + if (newSite == true && oldSite == false) + { + identifies = true; + } + } + + } + //for stop gain varations that cause peptide + else + { + // get the AA that follows the peptide sequence in the original protein (without any applied variants) + PeptideWithSetModifications nextAA_Original = new PeptideWithSetModifications(Protein.NonVariantProtein, DigestionParams, (OneBasedEndResidueInProtein + 1) - totalLengthDifference, (OneBasedEndResidueInProtein + 1) - totalLengthDifference, CleavageSpecificity.Full, "full", 0, AllModsOneIsNterminus, NumFixedMods); + bool oldSite = nTerminalResidue.Contains(nextAA_Original.BaseSequence); + // if the new AA causes a cleavage event, and that cleavage event would not have occurred without the variant then it is identified + if (oldSite == false) + { + identifies = true; + } + } + } + } + } + + return (intersects, identifies); + } + + /// + /// Makes the string representing a detected sequence variation, including any modifications on a 
variant amino acid. + /// takes in the variant as well as the bool value of wheter the peptid eintersects the variant. (this allows for identified + /// variants that cause the cleavage site for the peptide. + /// + /// + /// + /// + public string SequenceVariantString(SequenceVariation applied, bool intersects) + { + if (intersects == true) + { + bool startAtNTerm = applied.OneBasedBeginPosition == 1 && OneBasedStartResidueInProtein == 1; + bool onlyPeptideStartAtNTerm = OneBasedStartResidueInProtein == 1 && applied.OneBasedBeginPosition != 1; + int modResidueScale = 0; + if (startAtNTerm) + { + modResidueScale = 1; + } + else if (onlyPeptideStartAtNTerm) + { + modResidueScale = 2; + } + else + { + modResidueScale = 3; + } + int lengthDiff = applied.VariantSequence.Length - applied.OriginalSequence.Length; + var modsOnVariantOneIsNTerm = AllModsOneIsNterminus + .Where(kv => kv.Key == 1 && applied.OneBasedBeginPosition == 1 || applied.OneBasedBeginPosition <= kv.Key - 2 + OneBasedStartResidueInProtein && kv.Key - 2 + OneBasedStartResidueInProtein <= applied.OneBasedEndPosition) + .ToDictionary(kv => kv.Key - applied.OneBasedBeginPosition + (modResidueScale), kv => kv.Value); + PeptideWithSetModifications variantWithAnyMods = new PeptideWithSetModifications(Protein, DigestionParams, applied.OneBasedBeginPosition == 1 ? applied.OneBasedBeginPosition : applied.OneBasedBeginPosition - 1, applied.OneBasedEndPosition, CleavageSpecificityForFdrCategory, PeptideDescription, MissedCleavages, modsOnVariantOneIsNTerm, NumFixedMods); + return $"{applied.OriginalSequence}{applied.OneBasedBeginPosition}{variantWithAnyMods.FullSequence.Substring(applied.OneBasedBeginPosition == 1 ? 
0 : 1)}"; + } + //if the variant caused a cleavage site leading the the peptide sequence (variant does not intersect but is identified) + else + { + return $"{applied.OriginalSequence}{ applied.OneBasedBeginPosition}{applied.VariantSequence}"; + } + } + + /// + /// Takes an individual peptideWithSetModifications and determines if applied variations from the protein are found within its length + /// + /// + public bool IsVariantPeptide() + { + bool identifiedVariant = false; + if (this.Protein.AppliedSequenceVariations.Count() > 0) + { + foreach (var variant in this.Protein.AppliedSequenceVariations) + { + if (this.IntersectsAndIdentifiesVariation(variant).identifies) + { + identifiedVariant = true; + break; + } + } + } + return identifiedVariant; + } + + public override string ToString() + { + return FullSequence + string.Join("\t", AllModsOneIsNterminus.Select(m => m.ToString())); + } + + public override bool Equals(object obj) + { + var q = obj as PeptideWithSetModifications; + + if (Protein == null && q.Protein == null) + { + return q.FullSequence.Equals(this.FullSequence); + } + + return q != null + && q.FullSequence.Equals(this.FullSequence) + && q.OneBasedStartResidueInProtein == this.OneBasedStartResidueInProtein + && (q.Protein.Accession == null && this.Protein.Accession == null || q.Protein.Accession.Equals(this.Protein.Accession)) + && q.DigestionParams.Protease.Equals(this.DigestionParams.Protease); + } + + public override int GetHashCode() + { + if (DigestionParams == null) + { + return FullSequence.GetHashCode(); + } + else + { + return FullSequence.GetHashCode() + DigestionParams.Protease.GetHashCode(); + } + } + + /// + /// This should be run after deserialization of a PeptideWithSetModifications, in order to set its Protein and Modification objects, which were not serialized + /// + public void SetNonSerializedPeptideInfo(Dictionary idToMod, Dictionary accessionToProtein, DigestionParams dp) + { + GetModsAfterDeserialization(idToMod); + 
GetProteinAfterDeserialization(accessionToProtein); + _digestionParams = dp; + } + + private void GetModsAfterDeserialization(Dictionary idToMod) + { + _allModsOneIsNterminus = new Dictionary(); + int currentModStart = 0; + int currentModificationLocation = 1; + bool currentlyReadingMod = false; + int bracketCount = 0; + + for (int r = 0; r < FullSequence.Length; r++) + { + char c = FullSequence[r]; + if (c == '[') + { + currentlyReadingMod = true; + if (bracketCount == 0) + { + currentModStart = r + 1; + } + bracketCount++; + } + else if (c == ']') + { + string modId = null; + bracketCount--; + if (bracketCount == 0) + { + try + { + //remove the beginning section (e.g. "Fixed", "Variable", "Uniprot") + string modString = FullSequence.Substring(currentModStart, r - currentModStart); + int splitIndex = modString.IndexOf(':'); + string modType = modString.Substring(0, splitIndex); + modId = modString.Substring(splitIndex + 1, modString.Length - splitIndex - 1); + } + catch (Exception e) + { + throw new MzLibUtil.MzLibException( + "Error while trying to parse string into peptide: " + e.Message); + } + if (!idToMod.TryGetValue(modId, out Modification mod)) + { + throw new MzLibUtil.MzLibException( + "Could not find modification while reading string: " + FullSequence); + } + if (mod.LocationRestriction.Contains("C-terminal.") && r == FullSequence.Length - 1) + { + currentModificationLocation = BaseSequence.Length + 2; + } + _allModsOneIsNterminus.Add(currentModificationLocation, mod); + currentlyReadingMod = false; + } + } + else if (!currentlyReadingMod) + { + currentModificationLocation++; + } + //else do nothing + } + } + + private void GetProteinAfterDeserialization(Dictionary idToProtein) + { + Protein protein = null; + + if (ProteinAccession != null && !idToProtein.TryGetValue(ProteinAccession, out protein)) + { + throw new MzLibUtil.MzLibException("Could not find protein accession after deserialization! 
" + ProteinAccession); + } + + Protein = protein; + } + + public static string GetBaseSequenceFromFullSequence(string fullSequence) + { + StringBuilder sb = new StringBuilder(); + int bracketCount = 0; + foreach (char c in fullSequence) + { + if (c == '[') + { + bracketCount++; + } + else if (c == ']') + { + bracketCount--; + } + else if (bracketCount == 0) + { + sb.Append(c); + } + } + return sb.ToString(); + } + + private void DetermineFullSequence() + { + var subsequence = new StringBuilder(); + + // modification on peptide N-terminus + if (AllModsOneIsNterminus.TryGetValue(1, out Modification mod)) + { + subsequence.Append('[' + mod.ModificationType + ":" + mod.IdWithMotif + ']'); + } + + for (int r = 0; r < Length; r++) + { + subsequence.Append(this[r]); + + // modification on this residue + if (AllModsOneIsNterminus.TryGetValue(r + 2, out mod)) + { + subsequence.Append('[' + mod.ModificationType + ":" + mod.IdWithMotif + ']'); + } + } + + // modification on peptide C-terminus + if (AllModsOneIsNterminus.TryGetValue(Length + 2, out mod)) + { + subsequence.Append('[' + mod.ModificationType + ":" + mod.IdWithMotif + ']'); + } + + FullSequence = subsequence.ToString(); + } + + private void UpdateCleavageSpecificity() + { + if (CleavageSpecificityForFdrCategory == CleavageSpecificity.Unknown) + { + CleavageSpecificityForFdrCategory = DigestionParams.SpecificProtease.GetCleavageSpecificity(Protein, OneBasedStartResidueInProtein, OneBasedEndResidueInProtein, DigestionParams.InitiatorMethionineBehavior == InitiatorMethionineBehavior.Retain); + PeptideDescription = CleavageSpecificityForFdrCategory.ToString(); + } + } + + private HashSet AddNeutralLossesFromMods(Modification mod, HashSet allNeutralLossesSoFar, DissociationType dissociationType) + { + // add neutral losses specific to this dissociation type + if (mod != null + && mod.NeutralLosses != null + && mod.NeutralLosses.TryGetValue(dissociationType, out List neutralLossesFromMod)) + { + foreach (double 
neutralLoss in neutralLossesFromMod.Where(p => p != 0)) + { + if (allNeutralLossesSoFar == null) + { + allNeutralLossesSoFar = new HashSet(); + } + + allNeutralLossesSoFar.Add(neutralLoss); + } + } + + // add neutral losses that are generic to any dissociation type + if (mod != null + && mod.NeutralLosses != null + && mod.NeutralLosses.TryGetValue(DissociationType.AnyActivationType, out neutralLossesFromMod)) + { + foreach (double neutralLoss in neutralLossesFromMod.Where(p => p != 0)) + { + if (allNeutralLossesSoFar == null) + { + allNeutralLossesSoFar = new HashSet(); + } + + allNeutralLossesSoFar.Add(neutralLoss); + } + } + + return allNeutralLossesSoFar; + } + + //This function maintains the amino acids associated with the protease motif and reverses all other amino acids. + //N-terminal modificatons are preserved. Other modifications travel with their respective amino acids. this results + //in a decoy peptide composed the same amino acids and modifications as the original. + //Occasionally, this process results in peptide with exactly the same sequence. Therefore, there is a stop-gap measure + //the returns the mirror image of the original. N-terminal mods are preserved, but other mods are also reversed. + //this should yield a unique decoy for each target sequence. + //This function also adds a hash code to both the original PeptideWithSetModifications and the decoy + //generated by this function pairing the two together by eachother's FullSequence. + //The original taget peptide is given a hash code corresponding to the decoy's full sequence, + //and the decoy is given a hash code corresponding to the original target peptide's sequence. + //This hash code is stored in the PairedTargetDecoyHash parameter of PeptideWithSetModifications. 
+ public PeptideWithSetModifications GetReverseDecoyFromTarget(int[] revisedAminoAcidOrder) + { + Dictionary newModificationsDictionary = new Dictionary(); + //Copy N-terminal modifications from target dictionary to decoy dictionary. + if (this.AllModsOneIsNterminus.ContainsKey(1)) + { + newModificationsDictionary.Add(1, this.AllModsOneIsNterminus[1]); + } + char[] newBase = new char[this.BaseSequence.Length]; + Array.Fill(newBase, '0'); + char[] evaporatingBase = this.BaseSequence.ToCharArray(); + List motifs = this.DigestionParams.Protease.DigestionMotifs; + if (motifs != null && motifs.Count > 0) + { + foreach (var motif in motifs.Where(m => m.InducingCleavage != ""))//check the empty "" for topdown + { + string cleavingMotif = motif.InducingCleavage; + List cleavageMotifLocations = new List(); + + for (int i = 0; i < BaseSequence.Length; i++) + { + bool fits; + bool prevents; + (fits, prevents) = motif.Fits(BaseSequence, i); + + if (fits && !prevents) + { + cleavageMotifLocations.Add(i); + } + } + + foreach (int location in cleavageMotifLocations) + { + char[] motifArray = BaseSequence.Substring(location, cleavingMotif.Length).ToCharArray(); + + for (int i = 0; i < cleavingMotif.Length; i++) + { + newBase[location + i] = motifArray[i]; + revisedAminoAcidOrder[location + i] = location + i;// + //directly copy mods that were on amino acids in the motif. Those amino acids don't change position. + if (this.AllModsOneIsNterminus.ContainsKey(location + i + 2)) + { + newModificationsDictionary.Add(location + i + 2, this.AllModsOneIsNterminus[location + i + 2]); + } + + evaporatingBase[location + i] = '0';//can null a char so i use a number which doesnt' appear in peptide string + } + } + } + } + + // We've kept amino acids in the digestion motif in the same position in the decoy peptide. + // Now we will fill the remaining open positions in the decoy with the reverse of amino acids from the target. 
+ // Part to change to scramble + int fillPosition = 0; + int extractPosition = this.BaseSequence.Length - 1; + while (fillPosition < this.BaseSequence.Length && extractPosition >= 0) + { + if (evaporatingBase[extractPosition] != '0') + { + while (newBase[fillPosition] != '0') + { + fillPosition++; + } + newBase[fillPosition] = evaporatingBase[extractPosition]; + revisedAminoAcidOrder[fillPosition] = extractPosition; + if (this.AllModsOneIsNterminus.ContainsKey(extractPosition + 2)) + { + newModificationsDictionary.Add(fillPosition + 2, this.AllModsOneIsNterminus[extractPosition + 2]); + } + fillPosition++; + } + extractPosition--; + } + + string newBaseString = new string(newBase); + + var proteinSequence = this.Protein.BaseSequence; + var aStringBuilder = new StringBuilder(proteinSequence); + aStringBuilder.Remove(this.OneBasedStartResidueInProtein - 1, this.BaseSequence.Length); + aStringBuilder.Insert(this.OneBasedStartResidueInProtein - 1, newBaseString); + proteinSequence = aStringBuilder.ToString(); + + Protein decoyProtein = new Protein(proteinSequence, "DECOY_" + this.Protein.Accession, null, new List>(), new Dictionary>(), null, null, null, true); + DigestionParams d = this.DigestionParams; + + // Creates a hash code corresponding to the target's sequence + int targetHash = GetHashCode(); + PeptideWithSetModifications decoyPeptide; + //Make the "peptideDescription" store the corresponding target's sequence + if (newBaseString != this.BaseSequence) + { + decoyPeptide = new PeptideWithSetModifications(decoyProtein, d, this.OneBasedStartResidueInProtein, this.OneBasedEndResidueInProtein, this.CleavageSpecificityForFdrCategory, this.FullSequence, this.MissedCleavages, newModificationsDictionary, this.NumFixedMods, newBaseString); + // Sets PairedTargetDecoyHash of the original target peptie to the hash hode of the decoy sequence + PairedTargetDecoyHash = decoyPeptide.GetHashCode(); + // Sets PairedTargetDecoyHash of the decoy peptide to the hash code of the 
target sequence + decoyPeptide.PairedTargetDecoyHash = targetHash; + return decoyPeptide; + + } + else + { + //The reverse decoy procedure failed to create a PeptideWithSetModificatons with a different sequence. Therefore, + //we retrun the mirror image peptide. + decoyPeptide = this.GetPeptideMirror(revisedAminoAcidOrder); + PairedTargetDecoyHash = decoyPeptide.GetHashCode(); + decoyPeptide.PairedTargetDecoyHash = targetHash; + return decoyPeptide; + } + + } + /// + /// This function generates a decoy peptide from a target by scrambling the target peptide's amino acid sequence + /// This preserves any digestion motifs and keeps modifications with their amino acids + /// To help generate only high quality decoys, a homology cutoff of 30 % sequence similarity is used + /// If after 10 attempts no sufficient decoy is generated, the mirror sequence is returned + /// + /// Array to store the new amino acid order in + /// Parameter specifying the homology cutoff to be used + /// + public PeptideWithSetModifications GetScrambledDecoyFromTarget(int[] revisedAminoAcidOrder, double maximumHomology = 0.3) + { + Dictionary newModificationsDictionary = new Dictionary(); + //Copy N-terminal modifications from target dictionary to decoy dictionary. 
+ if (this.AllModsOneIsNterminus.ContainsKey(1)) + { + newModificationsDictionary.Add(1, this.AllModsOneIsNterminus[1]); + } + char[] newBase = new char[this.BaseSequence.Length]; + Array.Fill(newBase, '0'); + char[] evaporatingBase = this.BaseSequence.ToCharArray(); + List motifs = this.DigestionParams.Protease.DigestionMotifs; + if (motifs != null && motifs.Count > 0) + { + foreach (var motif in motifs.Where(m => m.InducingCleavage != ""))//check the empty "" for topdown + { + string cleavingMotif = motif.InducingCleavage; + List cleavageMotifLocations = new List(); + + for (int i = 0; i < BaseSequence.Length; i++) + { + bool fits; + bool prevents; + (fits, prevents) = motif.Fits(BaseSequence, i); + + if (fits && !prevents) + { + cleavageMotifLocations.Add(i); + } + } + + foreach (int location in cleavageMotifLocations) + { + char[] motifArray = BaseSequence.Substring(location, cleavingMotif.Length).ToCharArray(); + + for (int i = 0; i < cleavingMotif.Length; i++) + { + newBase[location + i] = motifArray[i]; + revisedAminoAcidOrder[location + i] = location + i; + //directly copy mods that were on amino acids in the motif. Those amino acids don't change position. + if (this.AllModsOneIsNterminus.ContainsKey(location + i + 2)) + { + newModificationsDictionary.Add(location + i + 2, this.AllModsOneIsNterminus[location + i + 2]); + } + + evaporatingBase[location + i] = '0';//can null a char so i use a number which doesnt' appear in peptide string + } + } + } + } + + //We've kept amino acids in the digestion motif in the same position in the decoy peptide. + //Now we will fill the remaining open positions in the decoy with the scrambled amino acids from the target. 
+ int extractPosition; + int fillPosition; + int residueNumsIndex; + // Specify seed to ensure that the same decoy sequence is always generated from the target + Random rand = new(56); + double percentIdentity = 1; + int scrambleAttempt = 0; + int maxScrambles = 10; + double maxIdentity = maximumHomology; + int characterCounter; + + while(scrambleAttempt < maxScrambles && percentIdentity > maxIdentity) + { + // Copies the newModificationsDictionary for the scramble attempt + Dictionary tempModificationsDictionary = new(newModificationsDictionary); + fillPosition = 0; + // residueNums is a list containing array indices for each element of evaporatingBase + // Once each amino acid is added, its index is removed from residueNums to prevent the same AA from being added 2x + var residueNums = Enumerable.Range(0, evaporatingBase.Length).ToList(); + characterCounter = 0; + char[] tempNewBase = new char[newBase.Length]; + // Create a copy of the newBase character array for the scrambling attempt + Array.Copy(newBase, tempNewBase, newBase.Length); + + // I am not sure why I need the second counter, but it always works when I have it + int seqLength = this.BaseSequence.Length; + while (fillPosition < seqLength && characterCounter < seqLength) + { + residueNumsIndex = rand.Next(residueNums.Count); + extractPosition = residueNums[residueNumsIndex]; + char targetAA = evaporatingBase[extractPosition]; + residueNums.RemoveAt(residueNumsIndex); + if (targetAA != '0') + { + while (tempNewBase[fillPosition] != '0') + { + fillPosition++; + } + tempNewBase[fillPosition] = targetAA; + revisedAminoAcidOrder[fillPosition] = extractPosition; + if (this.AllModsOneIsNterminus.ContainsKey(extractPosition + 2)) + { + tempModificationsDictionary.Add(fillPosition + 2, this.AllModsOneIsNterminus[extractPosition + 2]); + } + fillPosition++; + } + characterCounter ++; + } + scrambleAttempt++; + /* + * Any homology scoring mechanism can go here, percent identity is probably not the best + * In 
terms of generating a decoy sequence that will have a different mass spectrum than + * the original, it is far more important to vary the amino acids on the edges than + * those in the middle. Changes on the edges will offset the entire b and y sequences + * leading to an effective decoy spectrum even if there is high identity in the middle of + * the sequence. Additionally, for peptides with a large amount of a certain amino acid, + * it will be very difficult to generate a low homology sequence. + */ + percentIdentity = GetPercentIdentity(tempNewBase, evaporatingBase, tempModificationsDictionary, this.AllModsOneIsNterminus); + // Check that the percent identity is below the maximum identity threshold and set actual values to the temporary values + if (percentIdentity < maxIdentity) + { + newBase = tempNewBase; + newModificationsDictionary = tempModificationsDictionary; + // Code checking similarity between theoretical spectra could go here + } + + // If max scrambles are reached, make the new sequence identical to the original to trigger mirroring + else if (scrambleAttempt == maxScrambles) + { + for(int j = 0; j < newBase.Length; j++) + { + if (newBase[j] == '0') + { + newBase[j] = evaporatingBase[j]; + } + } + } + } + + + string newBaseString = new string(newBase); + + var proteinSequence = this.Protein.BaseSequence; + var aStringBuilder = new StringBuilder(proteinSequence); + aStringBuilder.Remove(this.OneBasedStartResidueInProtein - 1, this.BaseSequence.Length); + aStringBuilder.Insert(this.OneBasedStartResidueInProtein - 1, newBaseString); + proteinSequence = aStringBuilder.ToString(); + + Protein decoyProtein = new Protein(proteinSequence, "DECOY_" + this.Protein.Accession, null, new List>(), new Dictionary>(), null, null, null, true); + DigestionParams d = this.DigestionParams; + // Creates a hash code corresponding to the target's sequence + int targetHash = GetHashCode(); + PeptideWithSetModifications decoyPeptide; + //Make the "peptideDescription" store 
the corresponding target's sequence + if (newBaseString != this.BaseSequence) + { + decoyPeptide = new PeptideWithSetModifications(decoyProtein, d, this.OneBasedStartResidueInProtein, this.OneBasedEndResidueInProtein, this.CleavageSpecificityForFdrCategory, this.FullSequence, this.MissedCleavages, newModificationsDictionary, this.NumFixedMods, newBaseString); + // Sets PairedTargetDecoyHash of the original target peptie to the hash hode of the decoy sequence + PairedTargetDecoyHash = decoyPeptide.GetHashCode(); + // Sets PairedTargetDecoyHash of the decoy peptide to the hash code of the target sequence + decoyPeptide.PairedTargetDecoyHash = targetHash; + return decoyPeptide; + + } + else + { + //The reverse decoy procedure failed to create a PeptideWithSetModificatons with a different sequence. Therefore, + //we retrun the mirror image peptide. + decoyPeptide = this.GetPeptideMirror(revisedAminoAcidOrder); + PairedTargetDecoyHash = decoyPeptide.GetHashCode(); + decoyPeptide.PairedTargetDecoyHash = targetHash; + return decoyPeptide; + } + } + + /// + /// Method to get the percent identity between two peptide sequences stored as char[] + /// + /// Character array of the scrambled sequence + /// Character array of the unscrambled sequence + /// Dictionary containing the scrambled sequence's modifications + /// Dictionary containing the unscrambled sequence's modifications + /// + private static double GetPercentIdentity(char[] scrambledSequence, char[] unscrambledSequence, Dictionary scrambledMods, Dictionary unscrambledMods) + { + double rawScore = 0; + int seqLength = scrambledSequence.Length; + for(int i = 0; i < seqLength; i++) + { + if (scrambledSequence[i] == unscrambledSequence[i] || unscrambledSequence[i] == '0') + { + Modification scrambledMod; + if (scrambledMods.TryGetValue(i + 2, out scrambledMod) && unscrambledSequence[i] != '0') + { + Modification unscrambledMod; + if (unscrambledMods.TryGetValue(i + 2, out unscrambledMod)) + { + if (scrambledMod == 
unscrambledMod) + { + rawScore += 1; + } + } + } + else + { + rawScore += 1; + } + + } + } + return rawScore / seqLength; + } + + //Returns a PeptideWithSetModifications mirror image. Used when reverse decoy sequence is same as target sequence + public PeptideWithSetModifications GetPeptideMirror(int[] revisedOrderNisOne) + { + Dictionary newModificationsDictionary = new Dictionary(); + //Copy N-terminal modifications from target dictionary to decoy dictionary. + if (this.AllModsOneIsNterminus.ContainsKey(1)) + { + newModificationsDictionary.Add(1, this.AllModsOneIsNterminus[1]); + } + + //First step is to reverse the position of all modifications except the mod on the peptide N-terminus. + if (this.AllModsOneIsNterminus.Any()) + { + foreach (var kvp in this.AllModsOneIsNterminus.Where(p => p.Key != 1).ToList()) + { + newModificationsDictionary.Add(this.BaseSequence.Length - kvp.Key + 3, kvp.Value); + } + } + + //Second step is to reverse the sequence. + string newBaseString = new string(this.BaseSequence.Reverse().ToArray()); + + var proteinSequence = this.Protein.BaseSequence; + var aStringBuilder = new StringBuilder(proteinSequence); + aStringBuilder.Remove(this.OneBasedStartResidueInProtein - 1, this.BaseSequence.Length); + aStringBuilder.Insert(this.OneBasedStartResidueInProtein - 1, newBaseString); + proteinSequence = aStringBuilder.ToString(); + + Protein decoyProtein = new Protein(proteinSequence, "DECOY_" + this.Protein.Accession, null, new List>(), new Dictionary>(), null, null, null, true); + + DigestionParams d = this.DigestionParams; + + //now fill in the revised amino acid order + int oldStringPosition = this.BaseSequence.Length - 1; + for (int i = 0; i < newBaseString.Length; i++) + { + revisedOrderNisOne[i] = oldStringPosition; + oldStringPosition--; + } + + //Make the "peptideDescription" store the corresponding target's sequence + return new PeptideWithSetModifications(decoyProtein, d, this.OneBasedStartResidueInProtein, 
this.OneBasedEndResidueInProtein, this.CleavageSpecificityForFdrCategory, this.FullSequence, this.MissedCleavages, newModificationsDictionary, this.NumFixedMods, newBaseString); + } + } +} diff --git a/mzLib/MassSpectrometry/Proteomics/ProteolyticDigestion/ProductTypeMethods.cs b/mzLib/MassSpectrometry/Proteomics/ProteolyticDigestion/ProductTypeMethods.cs new file mode 100644 index 000000000..2606ce068 --- /dev/null +++ b/mzLib/MassSpectrometry/Proteomics/ProteolyticDigestion/ProductTypeMethods.cs @@ -0,0 +1,61 @@ +using System; +using System.Collections.Generic; +using Proteomics.Fragmentation; + +namespace Proteomics.ProteolyticDigestion +{ + public static class ProductTypeMethods + { + public static FragmentationTerminus IdentifyTerminusType(List productTypes) + { + if ((productTypes.Contains(ProductType.b) || productTypes.Contains(ProductType.c) || productTypes.Contains(ProductType.aDegree)) + && (productTypes.Contains(ProductType.y) || productTypes.Contains(ProductType.zDot) || productTypes.Contains(ProductType.zPlusOne) + || productTypes.Contains(ProductType.x))) + { + return FragmentationTerminus.Both; + } + else if (productTypes.Contains(ProductType.y) || productTypes.Contains(ProductType.zDot) + || productTypes.Contains(ProductType.zPlusOne) || productTypes.Contains(ProductType.x)) + { + return FragmentationTerminus.C; + } + else //"lp.Contains(ProductType.B) || lp.Contains(ProductType.BnoB1ions) || lp.Contains(ProductType.C) || lp.Contains(ProductType.Adot))" + { + return FragmentationTerminus.N; + } + } + + public static List> SeparateIonsByTerminus(List ionTypes) + { + List nIons = new List(); + List cIons = new List(); + foreach (ProductType productType in ionTypes) + { + if (productType == ProductType.b || productType == ProductType.c) + { + nIons.Add(productType); + } + else // Y and Z + { + cIons.Add(productType); + } + } + if (nIons.Count != 0 && cIons.Count != 0) + { + return new List> { nIons, cIons }; + } + else if (nIons.Count != 0) + { + 
return new List> { nIons }; + } + else if (cIons.Count != 0) + { + return new List> { cIons }; + } + else + { + throw new ArgumentException("No ions types were selected."); + } + } + } +} \ No newline at end of file diff --git a/mzLib/MassSpectrometry/Proteomics/ProteolyticDigestion/Protease.cs b/mzLib/MassSpectrometry/Proteomics/ProteolyticDigestion/Protease.cs new file mode 100644 index 000000000..93b649cbe --- /dev/null +++ b/mzLib/MassSpectrometry/Proteomics/ProteolyticDigestion/Protease.cs @@ -0,0 +1,740 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using Proteomics; + +namespace Proteomics.ProteolyticDigestion +{ + public class Protease + { + public Protease(string name, CleavageSpecificity cleavageSpecificity, string psiMSAccessionNumber, string psiMSName, List motifList, Modification modDetails = null) + { + Name = name; + CleavageSpecificity = cleavageSpecificity; + PsiMsAccessionNumber = psiMSAccessionNumber; + PsiMsName = psiMSName; + DigestionMotifs = motifList ?? new List(); + CleavageMod = modDetails; + } + + public string Name { get; } + public CleavageSpecificity CleavageSpecificity { get; } + public string PsiMsAccessionNumber { get; } + public string PsiMsName { get; } + public List DigestionMotifs { get; } + public Modification CleavageMod { get; set; } + + public override string ToString() + { + return Name; + } + + public override bool Equals(object obj) + { + return obj is Protease a + && string.Equals(a.Name, Name); /* static string.Equals is null-safe; the previous a.Name.Equals(Name) threw NullReferenceException when only a.Name was null */ + } + + public override int GetHashCode() + { + return (Name ??
"").GetHashCode(); + } + + /// <summary> + /// This method is used to determine cleavage specificity if the cleavage specificity is unknown + /// This occurs in the speedy nonspecific/semispecific searches when digesting post-search + /// </summary> + /// <param name="protein">Protein whose base sequence and proteolysis products are checked for cleavage sites</param> + /// <param name="startIndex">One-based index of the first residue of the peptide</param> + /// <param name="endIndex">One-based index of the last residue of the peptide</param> + /// <param name="retainMethionine">When false, a start index of 2 directly after an initial 'M' counts as a cleavable start</param> + /// <returns>None when neither end is cleavable (or this protease is SingleN/SingleC), Semi when exactly one end is cleavable, Full when both are</returns> + public CleavageSpecificity GetCleavageSpecificity(Protein protein, int startIndex, int endIndex, bool retainMethionine) + { + int cleavableMatches = 0; + if (CleavageSpecificity != CleavageSpecificity.SingleN && CleavageSpecificity != CleavageSpecificity.SingleC) //if it's single protease, don't bother + { + List indicesToCleave = GetDigestionSiteIndices(protein.BaseSequence); + //if the start index is a cleavable index (-1 because one based) OR if the start index is after a cleavable methionine + if (indicesToCleave.Contains(startIndex - 1) || + (startIndex == 2 && protein.BaseSequence[0] == 'M' && !retainMethionine) || + protein.ProteolysisProducts.Any(x => x.OneBasedBeginPosition == startIndex)) + { + cleavableMatches++; + } + //if the end index is a cleavable index + if (indicesToCleave.Contains(endIndex) || + protein.ProteolysisProducts.Any(x => x.OneBasedEndPosition == endIndex)) + { + cleavableMatches++; + } + } + if (cleavableMatches == 0) //if neither were cleavable, (or it's singleN/C) then it's nonspecific + { + return CleavageSpecificity.None; + } + else if (cleavableMatches == 1) //if one index was cleavable, then it's semi specific + { + return CleavageSpecificity.Semi; + } + else //2 if both, then it's fully specific + { + return CleavageSpecificity.Full; + } + } + + /// + /// Gets intervals of a protein sequence that will result from digestion by this protease.
+ /// + /// + /// + /// + /// + /// + /// + internal List GetUnmodifiedPeptides(Protein protein, int maximumMissedCleavages, InitiatorMethionineBehavior initiatorMethionineBehavior, + int minPeptideLength, int maxPeptideLength, Protease specificProtease, bool topDownTruncationSearch = false) + { + List peptides = new List(); + + // proteolytic cleavage in one spot (N) + if (CleavageSpecificity == CleavageSpecificity.SingleN) + { + peptides = SingleN_Digestion(protein, initiatorMethionineBehavior, maximumMissedCleavages, minPeptideLength, maxPeptideLength, specificProtease); + } + + // proteolytic cleavage in one spot (C) + else if (CleavageSpecificity == CleavageSpecificity.SingleC) + { + peptides = SingleC_Digestion(protein, initiatorMethionineBehavior, maximumMissedCleavages, minPeptideLength, maxPeptideLength, specificProtease); + } + + //top-down + else if (CleavageSpecificity == CleavageSpecificity.None) + { + if (!topDownTruncationSearch)//standard top-down + { + // retain methionine + if ((initiatorMethionineBehavior != InitiatorMethionineBehavior.Cleave || protein[0] != 'M') + && OkayLength(protein.Length, minPeptideLength, maxPeptideLength)) + { + peptides.Add(new ProteolyticPeptide(protein, 1, protein.Length, 0, CleavageSpecificity.Full, "full")); + } + + // cleave methionine + if ((initiatorMethionineBehavior != InitiatorMethionineBehavior.Retain && protein[0] == 'M') + && OkayLength(protein.Length - 1, minPeptideLength, maxPeptideLength)) + { + peptides.Add(new ProteolyticPeptide(protein, 2, protein.Length, 0, CleavageSpecificity.Full, "full:M cleaved")); + } + } + + // Also digest using the proteolysis product start/end indices + peptides.AddRange( + protein.ProteolysisProducts + .Where(proteolysisProduct => proteolysisProduct.OneBasedEndPosition.HasValue && proteolysisProduct.OneBasedBeginPosition.HasValue + && OkayLength(proteolysisProduct.OneBasedEndPosition.Value - proteolysisProduct.OneBasedBeginPosition.Value + 1, minPeptideLength, 
maxPeptideLength)) + .Select(proteolysisProduct => + new ProteolyticPeptide(protein, proteolysisProduct.OneBasedBeginPosition.Value, proteolysisProduct.OneBasedEndPosition.Value, 0, CleavageSpecificity.None, proteolysisProduct.Type))); + } + + // Full proteolytic cleavage + else if (CleavageSpecificity == CleavageSpecificity.Full) + { + peptides.AddRange(FullDigestion(protein, initiatorMethionineBehavior, maximumMissedCleavages, minPeptideLength, maxPeptideLength)); + } + + // Cleavage rules for semi-specific search + else if (CleavageSpecificity == CleavageSpecificity.Semi) + { + peptides.AddRange(SemiProteolyticDigestion(protein, initiatorMethionineBehavior, maximumMissedCleavages, minPeptideLength, maxPeptideLength)); + } + else + { + throw new NotImplementedException(); + } + + return peptides; + } + + /// + /// Gets the indices after which this protease will cleave a given protein sequence + /// + /// + /// + internal List GetDigestionSiteIndices(string proteinSequence) + { + var indices = new List(); + + for (int r = 0; r < proteinSequence.Length; r++) + { + var cutSiteIndex = -1; + bool cleavagePrevented = false; + + foreach (DigestionMotif motif in DigestionMotifs) + { + var motifResults = motif.Fits(proteinSequence, r); + bool motifFits = motifResults.Item1; + bool motifPreventsCleavage = motifResults.Item2; + + if (motifFits && r + motif.CutIndex < proteinSequence.Length) + { + cutSiteIndex = Math.Max(r + motif.CutIndex, cutSiteIndex); + } + + if (motifPreventsCleavage) // if any motif prevents cleave + { + cleavagePrevented = true; + } + } + + // if no motif prevents cleave + if (!cleavagePrevented && cutSiteIndex != -1) + { + indices.Add(cutSiteIndex); + } + } + + indices.Add(0); // The start of the protein is treated as a cleavage site to retain the n-terminal peptide + indices.Add(proteinSequence.Length); // The end of the protein is treated as a cleavage site to retain the c-terminal peptide + return indices.Distinct().OrderBy(i => i).ToList(); + } + 
+ /// <summary> + /// Retain N-terminal residue? + /// </summary> + /// <param name="oneBasedCleaveAfter">Cleave-after position being considered; 0 denotes the protein n-terminus, any other value always returns true</param> + /// <param name="initiatorMethionineBehavior">Initiator methionine setting; any value other than Cleave returns true</param> + /// <param name="nTerminus">First residue of the protein; any residue other than 'M' returns true</param> + /// <returns>False only when the position is 0, the behavior is Cleave and the residue is 'M'; true otherwise</returns> + internal static bool Retain(int oneBasedCleaveAfter, InitiatorMethionineBehavior initiatorMethionineBehavior, char nTerminus) + { + return oneBasedCleaveAfter != 0 // this only pertains to the n-terminus + || initiatorMethionineBehavior != InitiatorMethionineBehavior.Cleave + || nTerminus != 'M'; + } + + /// <summary> + /// Cleave N-terminal residue? + /// </summary> + /// <param name="oneBasedCleaveAfter">Cleave-after position being considered; must be 0 (the protein n-terminus) for a true result</param> + /// <param name="initiatorMethionineBehavior">Initiator methionine setting; must not be Retain for a true result</param> + /// <param name="nTerminus">First residue of the protein; must be 'M' for a true result</param> + /// <returns>True only when the position is 0, the behavior is not Retain and the residue is 'M'</returns> + internal static bool Cleave(int oneBasedCleaveAfter, InitiatorMethionineBehavior initiatorMethionineBehavior, char nTerminus) + { + return oneBasedCleaveAfter == 0 // this only pertains to the n-terminus + && initiatorMethionineBehavior != InitiatorMethionineBehavior.Retain + && nTerminus == 'M'; + } + + /// <summary> + /// Is length of given peptide okay, given minimum and maximum? + /// </summary> + /// <param name="peptideLength">Length of the candidate peptide</param> + /// <param name="minPeptideLength">Minimum length, checked by OkayMinLength</param> + /// <param name="maxPeptideLength">Maximum length, checked by OkayMaxLength</param> + /// <returns>True when both the minimum and the maximum length checks pass</returns> + internal static bool OkayLength(int peptideLength, int minPeptideLength, int maxPeptideLength) + { + return OkayMinLength(peptideLength, minPeptideLength) && OkayMaxLength(peptideLength, maxPeptideLength); + } + + /// + /// Gets protein intervals for digestion by this specific protease.
+ /// + /// + /// + /// + /// + /// + /// + private IEnumerable FullDigestion(Protein protein, InitiatorMethionineBehavior initiatorMethionineBehavior, + int maximumMissedCleavages, int minPeptideLength, int maxPeptideLength) + { + List oneBasedIndicesToCleaveAfter = GetDigestionSiteIndices(protein.BaseSequence); + char firstResidueInProtein = protein[0]; + + for (int missedCleavages = 0; missedCleavages <= maximumMissedCleavages; missedCleavages++) + { + for (int i = 0; i < oneBasedIndicesToCleaveAfter.Count - missedCleavages - 1; i++) + { + if (Retain(i, initiatorMethionineBehavior, firstResidueInProtein) + && OkayLength(oneBasedIndicesToCleaveAfter[i + missedCleavages + 1] - oneBasedIndicesToCleaveAfter[i], minPeptideLength, maxPeptideLength)) + { + yield return new ProteolyticPeptide(protein, oneBasedIndicesToCleaveAfter[i] + 1, oneBasedIndicesToCleaveAfter[i + missedCleavages + 1], + missedCleavages, CleavageSpecificity.Full, "full"); + } + if (Cleave(i, initiatorMethionineBehavior, firstResidueInProtein) && oneBasedIndicesToCleaveAfter[1] != 1 //prevent duplicates if that bond is cleaved by the protease + && OkayLength(oneBasedIndicesToCleaveAfter[i + missedCleavages + 1] - 1, minPeptideLength, maxPeptideLength)) + { + yield return new ProteolyticPeptide(protein, 2, oneBasedIndicesToCleaveAfter[i + missedCleavages + 1], + missedCleavages, CleavageSpecificity.Full, "full:M cleaved"); + } + } + + //TODO: Generate all the proteolytic products as distinct proteins during XML reading and delete all of the code below + // Also digest using the proteolysis product start/end indices + foreach (var proteolysisProduct in protein.ProteolysisProducts) + { + //if the proteolysis product contains something other than just the start AND end residues of the protein + if (proteolysisProduct.OneBasedBeginPosition != 1 || proteolysisProduct.OneBasedEndPosition != protein.Length) + { + int cleavageIndexWithinProteolysisProduct = 0; + //get the first cleavage index after the 
start of the proteolysis product + while (oneBasedIndicesToCleaveAfter[cleavageIndexWithinProteolysisProduct] < proteolysisProduct.OneBasedBeginPosition) + { + cleavageIndexWithinProteolysisProduct++; + } + + bool startPeptide = cleavageIndexWithinProteolysisProduct + missedCleavages < oneBasedIndicesToCleaveAfter.Count //if the current missed cleavages doesn't hit the end + && oneBasedIndicesToCleaveAfter[cleavageIndexWithinProteolysisProduct + missedCleavages] <= proteolysisProduct.OneBasedEndPosition //and the cleavage occurs before the proteolytic end + && proteolysisProduct.OneBasedBeginPosition.HasValue //and the proteolytic peptide even has a beginning + && !oneBasedIndicesToCleaveAfter.Contains(proteolysisProduct.OneBasedBeginPosition.Value - 1) //and we haven't already cleaved here + && (proteolysisProduct.OneBasedBeginPosition.Value != 1 || !Cleave(0, initiatorMethionineBehavior, firstResidueInProtein)) //and it's not the initiator methionine + && OkayLength(oneBasedIndicesToCleaveAfter[cleavageIndexWithinProteolysisProduct + missedCleavages] - proteolysisProduct.OneBasedBeginPosition.Value + 1, minPeptideLength, maxPeptideLength); //and it's the correct size + if (startPeptide) + { + yield return new ProteolyticPeptide(protein, proteolysisProduct.OneBasedBeginPosition.Value, oneBasedIndicesToCleaveAfter[cleavageIndexWithinProteolysisProduct + missedCleavages], + missedCleavages, CleavageSpecificity.Full, proteolysisProduct.Type + " start"); + } + + //get the cleavage index before the end of the proteolysis product + while (oneBasedIndicesToCleaveAfter[cleavageIndexWithinProteolysisProduct] < proteolysisProduct.OneBasedEndPosition) + { + cleavageIndexWithinProteolysisProduct++; + } + + bool endPeptide = cleavageIndexWithinProteolysisProduct - missedCleavages - 1 >= 0 //if we're not going to go out of bounds (-1 to get in front of the end) + && oneBasedIndicesToCleaveAfter[cleavageIndexWithinProteolysisProduct - missedCleavages - 1] + 1 >= 
proteolysisProduct.OneBasedBeginPosition //and it's not before the beginning + && proteolysisProduct.OneBasedEndPosition.HasValue //and the proteolytic peptide even has an end + && !oneBasedIndicesToCleaveAfter.Contains(proteolysisProduct.OneBasedEndPosition.Value) //and we haven't already cleaved here + && OkayLength(proteolysisProduct.OneBasedEndPosition.Value - oneBasedIndicesToCleaveAfter[cleavageIndexWithinProteolysisProduct - missedCleavages - 1] + 1 - 1, minPeptideLength, maxPeptideLength); //and it's the correct size + if (endPeptide) + { + yield return new ProteolyticPeptide(protein, oneBasedIndicesToCleaveAfter[cleavageIndexWithinProteolysisProduct - missedCleavages - 1] + 1, proteolysisProduct.OneBasedEndPosition.Value, + missedCleavages, CleavageSpecificity.Full, proteolysisProduct.Type + " end"); + } + } + } + } + + //add intact proteolysis products (if acceptable) + foreach (var proteolysisProduct in protein.ProteolysisProducts) + { + if (proteolysisProduct.OneBasedBeginPosition.HasValue //begin has value + && proteolysisProduct.OneBasedEndPosition.HasValue //and end has value + && (proteolysisProduct.OneBasedBeginPosition.Value != 1 || !Cleave(0, initiatorMethionineBehavior, firstResidueInProtein)) //and it's not the initiator methionine + && !oneBasedIndicesToCleaveAfter.Contains(proteolysisProduct.OneBasedBeginPosition.Value - 1) //and we haven't already cleaved here + && !oneBasedIndicesToCleaveAfter.Contains(proteolysisProduct.OneBasedEndPosition.Value)) //and we haven't already cleaved there + { + int firstCleavage = 0; + //get the first cleavage index after the start of the proteolysis product + while (oneBasedIndicesToCleaveAfter[firstCleavage] < proteolysisProduct.OneBasedBeginPosition) + { + firstCleavage++; + } + + int lastCleavage = firstCleavage; + //get the last cleavage index before the end of the proteolysis product + while (oneBasedIndicesToCleaveAfter[lastCleavage] < proteolysisProduct.OneBasedEndPosition) + { + lastCleavage++; + } + 
if (lastCleavage - firstCleavage < maximumMissedCleavages && //if there aren't too many missed cleavages + OkayLength(proteolysisProduct.OneBasedEndPosition.Value - proteolysisProduct.OneBasedBeginPosition.Value, minPeptideLength, maxPeptideLength)) //and it's the correct size + { + yield return new ProteolyticPeptide(protein, proteolysisProduct.OneBasedBeginPosition.Value, proteolysisProduct.OneBasedEndPosition.Value, + lastCleavage - firstCleavage, CleavageSpecificity.Full, proteolysisProduct.Type + " end"); + } + } + } + } + + /// + /// Gets the protein intervals based on semiSpecific digestion rules + /// This is the classic, slow semi-specific digestion that generates each semi-specific peptide pre-search + /// + /// + /// + /// + /// + /// + /// + private IEnumerable SemiProteolyticDigestion(Protein protein, InitiatorMethionineBehavior initiatorMethionineBehavior, + int maximumMissedCleavages, int minPeptideLength, int maxPeptideLength) + { + List intervals = new List(); + List oneBasedIndicesToCleaveAfter = GetDigestionSiteIndices(protein.BaseSequence); + + // It's possible not to go through this loop (maxMissedCleavages+1>number of indexes), and that's okay. 
It will get digested in the next loops (finish C/N termini) + for (int i = 0; i < oneBasedIndicesToCleaveAfter.Count - maximumMissedCleavages - 1; i++) + { + bool retain = Retain(i, initiatorMethionineBehavior, protein[0]); + bool cleave = Cleave(i, initiatorMethionineBehavior, protein[0]) && oneBasedIndicesToCleaveAfter[1] != 1; + int cTerminusProtein = oneBasedIndicesToCleaveAfter[i + maximumMissedCleavages + 1]; + HashSet localOneBasedIndicesToCleaveAfter = new HashSet(); + for (int j = i; j < i + maximumMissedCleavages + 1; j++) + { + localOneBasedIndicesToCleaveAfter.Add(oneBasedIndicesToCleaveAfter[j]); + } + if (retain) + { + intervals.AddRange(FixedTermini(oneBasedIndicesToCleaveAfter[i], cTerminusProtein, protein, cleave, retain, minPeptideLength, maxPeptideLength, localOneBasedIndicesToCleaveAfter)); + } + + if (cleave) + { + intervals.AddRange(FixedTermini(1, cTerminusProtein, protein, cleave, retain, minPeptideLength, maxPeptideLength, localOneBasedIndicesToCleaveAfter)); + } + } + + // Finish C-term of protein caused by loop being "i < oneBasedIndicesToCleaveAfter.Count - maximumMissedCleavages - 1" + int last = oneBasedIndicesToCleaveAfter.Count - 1; + int maxIndexSemi = maximumMissedCleavages < last ? maximumMissedCleavages : last; + // Fringe C-term peptides + for (int i = 1; i <= maxIndexSemi; i++) + { + // FixedN + int nTerminusProtein = oneBasedIndicesToCleaveAfter[last - i]; + int cTerminusProtein = oneBasedIndicesToCleaveAfter[last]; + HashSet localOneBasedIndicesToCleaveAfter = new HashSet(); + for (int j = 0; j < i; j++) //include zero, the c terminus + { + localOneBasedIndicesToCleaveAfter.Add(oneBasedIndicesToCleaveAfter[last - j]); + } + for (int j = cTerminusProtein; j > nTerminusProtein; j--)//We are hitting the c-terminus here + { + if (OkayLength(j - nTerminusProtein, minPeptideLength, maxPeptideLength)) + { + intervals.Add(localOneBasedIndicesToCleaveAfter.Contains(j) ? 
+ new ProteolyticPeptide(protein, nTerminusProtein + 1, j, j - nTerminusProtein, CleavageSpecificity.Full, "full") : + new ProteolyticPeptide(protein, nTerminusProtein + 1, j, j - nTerminusProtein, CleavageSpecificity.Semi, "semi")); + } + } + } + + // Fringe N-term peptides + for (int i = 1; i <= maxIndexSemi; i++) + { + bool retain = initiatorMethionineBehavior == InitiatorMethionineBehavior.Retain; + // FixedC + int nTerminusProtein = retain ? oneBasedIndicesToCleaveAfter[0] : oneBasedIndicesToCleaveAfter[0] + 1; // +1 start after M (since already covered earlier) + int cTerminusProtein = oneBasedIndicesToCleaveAfter[i]; + HashSet localOneBasedIndicesToCleaveAfter = new HashSet(); + for (int j = 1; j < i; j++)//j starts at 1, because zero is n terminus + { + localOneBasedIndicesToCleaveAfter.Add(oneBasedIndicesToCleaveAfter[j]); + } + int start = nTerminusProtein + 1;//plus one to not doublecount the n terminus (in addition to the M term skip) + for (int j = start; j < cTerminusProtein; j++) + { + if (OkayLength(cTerminusProtein - j, minPeptideLength, maxPeptideLength) + && !localOneBasedIndicesToCleaveAfter.Contains(j)) + { + intervals.Add(new ProteolyticPeptide(protein, j + 1, cTerminusProtein, cTerminusProtein - j, CleavageSpecificity.Semi, "semi")); + } + } + } + + // Also digest using the proteolysis product start/end indices + // This should only be things where the proteolysis is not K/R and the + foreach (var proteolysisProduct in protein.ProteolysisProducts) + { + if (proteolysisProduct.OneBasedEndPosition.HasValue && proteolysisProduct.OneBasedBeginPosition.HasValue + && (proteolysisProduct.OneBasedBeginPosition != 1 || proteolysisProduct.OneBasedEndPosition != protein.Length)) //if at least one side is not a terminus + { + int start = proteolysisProduct.OneBasedBeginPosition.Value; + int end = proteolysisProduct.OneBasedEndPosition.Value; + int i = 0; + while (oneBasedIndicesToCleaveAfter[i] < start)//"<" to prevent additions if same index as residues 
+ { + i++; // Last position in protein is an index to cleave after + } + + // Start peptide + for (int j = start; j < oneBasedIndicesToCleaveAfter[i]; j++) + { + if (OkayLength(j - start + 1, minPeptideLength, maxPeptideLength)) + { + intervals.Add(new ProteolyticPeptide(protein, start, j, j - start, CleavageSpecificity.Full, proteolysisProduct.Type + " start")); + } + } + while (oneBasedIndicesToCleaveAfter[i] < end) //"<" to prevent additions if same index as residues, since i-- is below + { + i++; + } + + // Now that we've obtained an index to cleave after that is past the proteolysis product + // we need to backtrack to get the index to cleave that is immediately before the proteolysis product + // to do this, we will do i-- + // In the niche case that the proteolysis product is already an index to cleave + // no new peptides will be generated using this, so we will forgo i-- + // this makes peptides of length 0, which are not generated due to the for loop + // removing this if statement will result in crashes from c-terminal proteolysis product end positions + if (oneBasedIndicesToCleaveAfter[i] != end) + { + i--; + } + + // Fin (End) + for (int j = oneBasedIndicesToCleaveAfter[i] + 1; j < end; j++) + { + if (OkayLength(end - j + 1, minPeptideLength, maxPeptideLength)) + { + intervals.Add(new ProteolyticPeptide(protein, j, end, end - j, CleavageSpecificity.Full, proteolysisProduct.Type + " end")); + } + } + } + } + return intervals; + } + + /// + /// Get protein intervals for fixed termini. + /// This is used for the classic, slow semi-proteolytic cleavage that generates each semi-specific peptide pre-search.
+ /// + /// + /// + /// + /// + /// + /// + /// + private static IEnumerable FixedTermini(int nTerminusProtein, int cTerminusProtein, Protein protein, bool cleave, bool retain, int minPeptideLength, int maxPeptideLength, HashSet localOneBasedIndicesToCleaveAfter) + { + bool preventMethionineFromBeingDuplicated = nTerminusProtein == 1 && cleave && retain; //prevents duplicate sequences containing N-terminal methionine + List intervals = new List(); + if (!preventMethionineFromBeingDuplicated && OkayLength(cTerminusProtein - nTerminusProtein, minPeptideLength, maxPeptideLength)) //adds the full length maximum cleavages, no semi + { + intervals.Add(new ProteolyticPeptide(protein, nTerminusProtein + 1, cTerminusProtein, + cTerminusProtein - nTerminusProtein, CleavageSpecificity.Full, "full" + (cleave ? ":M cleaved" : ""))); // Maximum sequence length + } + + // Fixed termini at each internal index + IEnumerable internalIndices = Enumerable.Range(nTerminusProtein + 1, cTerminusProtein - nTerminusProtein - 1); //every residue between them, +1 so we don't double count the original full + + List fixedCTermIntervals = new List(); + if (!preventMethionineFromBeingDuplicated) + { + var indexesOfAcceptableLength = internalIndices.Where(j => OkayLength(cTerminusProtein - j, minPeptideLength, maxPeptideLength)); + foreach (var j in indexesOfAcceptableLength) + { + if (localOneBasedIndicesToCleaveAfter.Contains(j) || (j == 1 && cleave)) //if cleaved on cleavable index or after initiator methionine, record as full + { + if (j == 1 && cleave) //check we're not doubling it up + { + fixedCTermIntervals.Add(new ProteolyticPeptide(protein, j + 1, cTerminusProtein, cTerminusProtein - j, CleavageSpecificity.Full, "full:M cleaved")); + } + //else //don't allow full unless cleaved, since they're covered by Cterm + } + else //record it as a semi + { + fixedCTermIntervals.Add(new ProteolyticPeptide(protein, j + 1, cTerminusProtein, cTerminusProtein - j, CleavageSpecificity.Semi, "semi" + 
(cleave ? ":M cleaved" : ""))); + } + } + } + IEnumerable fixedNTermIntervals = + internalIndices + .Where(j => OkayLength(j - nTerminusProtein, minPeptideLength, maxPeptideLength)) + .Select(j => localOneBasedIndicesToCleaveAfter.Contains(j) ? + new ProteolyticPeptide(protein, nTerminusProtein + 1, j, j - nTerminusProtein, CleavageSpecificity.Full, "full" + (cleave ? ":M cleaved" : "")) : + new ProteolyticPeptide(protein, nTerminusProtein + 1, j, j - nTerminusProtein, CleavageSpecificity.Semi, "semi" + (cleave ? ":M cleaved" : ""))); + + return intervals.Concat(fixedCTermIntervals).Concat(fixedNTermIntervals); + } + + /// + /// Gets peptides for the singleN protease + /// + /// + /// + /// + /// + /// + /// + /// + private List SingleN_Digestion(Protein protein, InitiatorMethionineBehavior initiatorMethionineBehavior, + int maximumMissedCleavages, int minPeptideLength, int maxPeptideLength, Protease specificProtease) + { + List peptides = new List(); + int proteinStart = Retain(0, initiatorMethionineBehavior, protein[0]) ? 1 : 2; //where does the protein start? + + if (Equals(specificProtease)) + { + bool maxTooBig = protein.Length + maxPeptideLength < 0; //when maxPeptideLength is too large, it becomes negative and causes issues + //This happens when maxPeptideLength == int.MaxValue or something close to it + for (; proteinStart <= protein.Length; proteinStart++) + { + if (OkayMinLength(protein.Length - proteinStart + 1, minPeptideLength)) + { + //need Math.Max if max length is int.MaxLength, since +proteinStart will make it negative + //if the max length is too big to be an int (ie infinity), just do the protein length. + //if it's not too big to be an int, it might still be too big. Take the minimum of the protein length or the maximum length (-1, because the index is inclusive. Without -1, peptides will be one AA too long) + peptides.Add(new ProteolyticPeptide(protein, proteinStart, maxTooBig ? 
protein.Length : Math.Min(protein.Length, proteinStart + maxPeptideLength - 1), 0, CleavageSpecificity.SingleN, "SingleN")); + } + } + } + else //if there's a specific protease, then we need to adhere to the specified missed cleavage rules + { + //generate only peptides with the maximum number of missed cleavages, unless the protein has fewer than the max or we're near the unselected terminus (where we run to the end of the protein) + List oneBasedIndicesToCleaveAfter = specificProtease.GetDigestionSiteIndices(protein.BaseSequence); //get peptide bonds to cleave SPECIFICALLY (termini included) + oneBasedIndicesToCleaveAfter[0] = proteinStart - 1;//update the first cleavage to represent the initiator methionine rules + int maximumMissedCleavagesIndexShift = maximumMissedCleavages + 1; + + for (int i = 0; i < oneBasedIndicesToCleaveAfter.Count - maximumMissedCleavagesIndexShift; i++) + { + int startIndex = oneBasedIndicesToCleaveAfter[i]; + int endProteaseIndex = oneBasedIndicesToCleaveAfter[i + maximumMissedCleavagesIndexShift]; + int peptideLength = endProteaseIndex - startIndex; + if (peptideLength >= minPeptideLength) //if bigger than min + { + int endActualIndex = endProteaseIndex; + if (peptideLength > maxPeptideLength) //if the next cleavage is too far away, crop it to the max length + { + endActualIndex = startIndex + maxPeptideLength; + } + int nextStartIndex = oneBasedIndicesToCleaveAfter[i + 1] + 1; + + //make SingleN peptides until we reach the next index to cleave at or until the peptides are too small + for (; (startIndex + 1 < nextStartIndex) && (endActualIndex - startIndex >= minPeptideLength); startIndex++) + { + peptides.Add(new ProteolyticPeptide(protein, startIndex + 1, endActualIndex, maximumMissedCleavages, CleavageSpecificity.SingleN, "SingleN")); + + //update endIndex if needed + if (endActualIndex != endProteaseIndex) + { + endActualIndex++; + } + } + } + } + //wrap up the terminus + if (oneBasedIndicesToCleaveAfter.Count < 
maximumMissedCleavagesIndexShift) + { + maximumMissedCleavagesIndexShift = oneBasedIndicesToCleaveAfter.Count; + } + int lastStartIndex = oneBasedIndicesToCleaveAfter[oneBasedIndicesToCleaveAfter.Count - maximumMissedCleavagesIndexShift] + 1; + int proteinEndIndex = oneBasedIndicesToCleaveAfter[oneBasedIndicesToCleaveAfter.Count - 1]; //end of protein + int lastEndIndex = Math.Min(proteinEndIndex, lastStartIndex + maxPeptideLength - 1); //end of protein + for (; lastStartIndex + minPeptideLength - 1 <= lastEndIndex; lastStartIndex++) + { + peptides.Add(new ProteolyticPeptide(protein, lastStartIndex, lastEndIndex, maximumMissedCleavages, CleavageSpecificity.SingleN, "SingleN")); + + //update the end if needed + if (lastEndIndex != proteinEndIndex) + { + lastEndIndex++; + } + } + } + return peptides; + } + + /// + /// Gets peptides for the singleC protease + /// + /// + /// + /// + /// + /// + /// + /// + private List SingleC_Digestion(Protein protein, InitiatorMethionineBehavior initiatorMethionineBehavior, + int maximumMissedCleavages, int minPeptideLength, int maxPeptideLength, Protease specificProtease) + { + List peptides = new List(); + int proteinStart = Retain(0, initiatorMethionineBehavior, protein[0]) ? 1 : 2; //where does the protein start? + if (Equals(specificProtease)) + { + int lengthDifference = proteinStart - 1; //take it back one for zero based index + for (int proteinEnd = 1; proteinEnd <= protein.Length; proteinEnd++) + { + //length of peptide will be at least the start index + if (OkayMinLength(proteinEnd - lengthDifference, minPeptideLength)) //is the maximum possible length longer than the minimum? 
+ { + //use the start index as the max of the N-terminus or the c-terminus minus the max (+1 because inclusive, otherwise peptides will be one AA too long) + peptides.Add(new ProteolyticPeptide(protein, Math.Max(proteinStart, proteinEnd - maxPeptideLength + 1), proteinEnd, 0, CleavageSpecificity.SingleC, "SingleC")); + } + } + } + else //if there's a specific protease, then we need to adhere to the specified missed cleavage rules + { + //generate only peptides with the maximum number of missed cleavages, unless the protein has fewer than the max or we're near the unselected terminus (where we run to the end of the protein) + List oneBasedIndicesToCleaveAfter = specificProtease.GetDigestionSiteIndices(protein.BaseSequence); //get peptide bonds to cleave SPECIFICALLY (termini included) + oneBasedIndicesToCleaveAfter[0] = proteinStart - 1;//update the first cleavage to represent the initiator methionine rules + int maximumMissedCleavagesIndexShift = maximumMissedCleavages + 1; + + for (int i = oneBasedIndicesToCleaveAfter.Count - 1; i > maximumMissedCleavagesIndexShift; i--) + { + int endProteaseIndex = oneBasedIndicesToCleaveAfter[i]; + int startProteaseIndex = oneBasedIndicesToCleaveAfter[i - maximumMissedCleavagesIndexShift]; + int peptideLength = endProteaseIndex - startProteaseIndex; + if (peptideLength >= minPeptideLength) //if bigger than min + { + int startActualIndex = startProteaseIndex; + if (peptideLength > maxPeptideLength) //if the next cleavage is too far away, crop it to the max length + { + startActualIndex = endProteaseIndex - maxPeptideLength; + } + int nextEndIndex = oneBasedIndicesToCleaveAfter[i - 1]; + //make SingleC peptides until we reach the next index to cleave at or until the peptides are too small + for (; (endProteaseIndex > nextEndIndex) && (endProteaseIndex - startActualIndex >= minPeptideLength); endProteaseIndex--) + { + peptides.Add(new ProteolyticPeptide(protein, startActualIndex + 1, endProteaseIndex, maximumMissedCleavages, 
CleavageSpecificity.SingleC, "SingleC")); + + //update startIndex if needed + if (startActualIndex != startProteaseIndex) + { + startActualIndex--; + } + } + } + } + //wrap up the terminus + //if there are more missed cleavages allowed than there are cleavages to cleave, change the effective number of missed cleavages to the max + if (oneBasedIndicesToCleaveAfter.Count <= maximumMissedCleavagesIndexShift) + { + maximumMissedCleavagesIndexShift = oneBasedIndicesToCleaveAfter.Count - 1; + } + int lastEndIndex = oneBasedIndicesToCleaveAfter[maximumMissedCleavagesIndexShift]; + int startIndex = Math.Max(proteinStart, lastEndIndex - maxPeptideLength + 1); + int minPeptideLengthOneBasedResidueShift = minPeptideLength - 1; + for (; lastEndIndex >= startIndex + minPeptideLengthOneBasedResidueShift; lastEndIndex--) + { + peptides.Add(new ProteolyticPeptide(protein, startIndex, lastEndIndex, maximumMissedCleavages, CleavageSpecificity.SingleC, "SingleC")); + + //update the start if needed + if (startIndex != proteinStart) + { + startIndex--; + } + } + } + return peptides; + } + + /// + /// Is length of given peptide okay, given minimum? + /// + /// + /// + /// + private static bool OkayMinLength(int peptideLength, int minPeptideLength) + { + return peptideLength >= minPeptideLength; + } + + /// + /// Is length of given peptide okay, given maximum? + /// + /// + /// + /// + private static bool OkayMaxLength(int? 
peptideLength, int maxPeptideLength) + { + return !peptideLength.HasValue || peptideLength <= maxPeptideLength; + } + } +} \ No newline at end of file diff --git a/mzLib/MassSpectrometry/Proteomics/ProteolyticDigestion/ProteaseDictionary.cs b/mzLib/MassSpectrometry/Proteomics/ProteolyticDigestion/ProteaseDictionary.cs new file mode 100644 index 000000000..a667653a8 --- /dev/null +++ b/mzLib/MassSpectrometry/Proteomics/ProteolyticDigestion/ProteaseDictionary.cs @@ -0,0 +1,100 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using MzLibUtil; +using Proteomics; + +namespace Proteomics.ProteolyticDigestion +{ + public static class ProteaseDictionary + { + static ProteaseDictionary() + { + var pathToProgramFiles = Environment.GetFolderPath(Environment.SpecialFolder.ProgramFiles); + string dataDirectory = !String.IsNullOrWhiteSpace(pathToProgramFiles) && AppDomain.CurrentDomain.BaseDirectory.Contains(pathToProgramFiles) + && !AppDomain.CurrentDomain.BaseDirectory.Contains("Jenkins") ? 
+ Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.LocalApplicationData), "MetaMorpheus") : + AppDomain.CurrentDomain.BaseDirectory; + + string path = Path.Combine(dataDirectory, "Proteomics", "ProteolyticDigestion", "proteases.tsv"); + Dictionary = LoadProteaseDictionary(path); + } + + public static Dictionary Dictionary { get; set; } + + public static Dictionary LoadProteaseDictionary(string path, List proteaseMods = null) + { + + Dictionary dict = new Dictionary(); + + string[] myLines = File.ReadAllLines(path); + myLines = myLines.Skip(1).ToArray(); + + foreach (string line in myLines) + { + if (line.Trim() != string.Empty) // skip empty lines + { + string[] fields = line.Split('\t'); + List motifList = DigestionMotif.ParseDigestionMotifsFromString(fields[1]); + string name = fields[0]; + var cleavageSpecificity = ((CleavageSpecificity)Enum.Parse(typeof(CleavageSpecificity), fields[4], true)); + string psiMsAccessionNumber = fields[5]; + string psiMsName = fields[6]; + //name of the modification that is associated with proteolytic cleavage + string proteaseModDetails = fields[8]; + //if this protease has an associated modification, look it up in the list of mods loaded fro the protease mods file + if (proteaseModDetails != "" && proteaseMods != null) + { + if (proteaseMods.Select(p => p.IdWithMotif).ToList().Contains(proteaseModDetails)) + { + Modification proteaseModification = proteaseMods.Where(p => p.IdWithMotif == proteaseModDetails).First(); + var protease = new Protease(name, cleavageSpecificity, psiMsAccessionNumber, psiMsName, motifList, proteaseModification); + if (!dict.ContainsKey(protease.Name)) + { + dict.Add(protease.Name, protease); + } + else + { + throw new MzLibException("More than one protease named "+ protease.Name +" exists"); + } + + } + else + { + var protease = new Protease(name, cleavageSpecificity, psiMsAccessionNumber, psiMsName, motifList); + if (!dict.ContainsKey(protease.Name)) + { + dict.Add(protease.Name, 
protease); + } + else + { + throw new MzLibException("More than one protease named " + protease.Name + " exists"); + } + throw new MzLibException(proteaseModDetails + " is not a valid modification"); + } + + } + else + { + var protease = new Protease(name, cleavageSpecificity, psiMsAccessionNumber, psiMsName, motifList); + if (!dict.ContainsKey(protease.Name)) + { + dict.Add(protease.Name, protease); + } + else + { + throw new MzLibException("More than one protease named " + protease.Name + " exists"); + } + } + + } + } + + return dict; + + } + + + } +} \ No newline at end of file diff --git a/mzLib/MassSpectrometry/Proteomics/ProteolyticDigestion/ProteinDigestion.cs b/mzLib/MassSpectrometry/Proteomics/ProteolyticDigestion/ProteinDigestion.cs new file mode 100644 index 000000000..cf5d9ce0b --- /dev/null +++ b/mzLib/MassSpectrometry/Proteomics/ProteolyticDigestion/ProteinDigestion.cs @@ -0,0 +1,239 @@ +using System.Collections.Generic; +using Proteomics.Fragmentation; + +namespace Proteomics.ProteolyticDigestion +{ + public class ProteinDigestion + { + /// + /// Initializes digestion object + /// + /// + /// + /// + public ProteinDigestion(DigestionParams digestionParams, IEnumerable allKnownFixedModifications, List variableModifications) + { + DigestionParams = digestionParams; + Protease = digestionParams.Protease; + MaximumMissedCleavages = digestionParams.MaxMissedCleavages; + InitiatorMethionineBehavior = digestionParams.InitiatorMethionineBehavior; + MinPeptideLength = digestionParams.MinPeptideLength; + MaxPeptideLength = digestionParams.MaxPeptideLength; + AllKnownFixedModifications = allKnownFixedModifications; + VariableModifications = variableModifications; + } + + public Protease Protease { get; set; } + public int MaximumMissedCleavages { get; set; } + public DigestionParams DigestionParams { get; set; } + public InitiatorMethionineBehavior InitiatorMethionineBehavior { get; set; } + public int MinPeptideLength { get; set; } + public int 
MaxPeptideLength { get; set; } + public IEnumerable AllKnownFixedModifications { get; set; } + public List VariableModifications { get; set; } + + /// + /// Gets peptides for speedy semispecific digestion of a protein + /// This generates specific peptides of maximum missed cleavages + /// These peptides need to be digested post search to their actual sequences + /// semi-specific search enters here... + /// + /// + /// + public IEnumerable SpeedySemiSpecificDigestion(Protein protein) //We are only getting fully specific peptides of the maximum cleaved residues here + { + List peptides = new List(); + List oneBasedIndicesToCleaveAfter = Protease.GetDigestionSiteIndices(protein.BaseSequence); //get peptide bonds to cleave SPECIFICALLY (termini included) + int maximumMissedCleavagesIndexShift = MaximumMissedCleavages + 1; + + //it's possible not to go through this loop (maxMissedCleavages+1>number of indexes), and that's okay. It will get digested in the next loops (finish C/N termini) + for (int i = 0; i < oneBasedIndicesToCleaveAfter.Count - maximumMissedCleavagesIndexShift; i++) + { + bool retain = Protease.Retain(i, InitiatorMethionineBehavior, protein[0]); + if (retain) //it's okay to use i instead of oneBasedIndicesToCleaveAfter[i], because the index of zero is zero and it only checks if it's the N-terminus or not + { + int peptideLength = oneBasedIndicesToCleaveAfter[i + maximumMissedCleavagesIndexShift] - oneBasedIndicesToCleaveAfter[i]; + if (peptideLength >= MinPeptideLength) //if bigger than min + { + if (peptideLength <= MaxPeptideLength) //if an acceptable length (bigger than min, smaller than max), add it + { + peptides.Add(new ProteolyticPeptide(protein, oneBasedIndicesToCleaveAfter[i] + 1, oneBasedIndicesToCleaveAfter[i + maximumMissedCleavagesIndexShift], + MaximumMissedCleavages, CleavageSpecificity.Full, "full")); + } + else if (DigestionParams.FragmentationTerminus == FragmentationTerminus.N) //make something with the maximum length and fixed N + 
{ + int startIndex = oneBasedIndicesToCleaveAfter[i]; + peptides.Add(new ProteolyticPeptide(protein, startIndex + 1, startIndex + MaxPeptideLength, MaximumMissedCleavages, CleavageSpecificity.Semi, "semi")); + } + else //It has to be FragmentationTerminus.C //make something with the maximum length and fixed C + { + int endIndex = oneBasedIndicesToCleaveAfter[i + maximumMissedCleavagesIndexShift]; + peptides.Add(new ProteolyticPeptide(protein, endIndex - MaxPeptideLength + 1, endIndex, MaximumMissedCleavages, CleavageSpecificity.Semi, "semi")); + } + } + } + + if (Protease.Cleave(i, InitiatorMethionineBehavior, protein[0]) && (DigestionParams.FragmentationTerminus == FragmentationTerminus.N || !retain)) //it's okay to use i instead of oneBasedIndicesToCleaveAfter[i], because the index of zero is zero and it only checks if it's the N-terminus or not + { + int peptideLength = oneBasedIndicesToCleaveAfter[i + maximumMissedCleavagesIndexShift] - 1; + if (peptideLength >= MinPeptideLength) + { + if (peptideLength <= MaxPeptideLength) + { + peptides.Add(new ProteolyticPeptide(protein, 2, oneBasedIndicesToCleaveAfter[i + maximumMissedCleavagesIndexShift], //two is hardcoded, since M=1, so the next aa is 2 (one based) + MaximumMissedCleavages, CleavageSpecificity.Full, "full:M cleaved")); + } + else if (DigestionParams.FragmentationTerminus == FragmentationTerminus.N) + { + peptides.Add(new ProteolyticPeptide(protein, 2, 2 + MaxPeptideLength - 1, MaximumMissedCleavages, CleavageSpecificity.Semi, "semi")); + } + else //It has to be FragmentationTerminus.C //make something with the maximum length and fixed C + { + //kinda tricky, because we'll be creating a duplication if cleavage is variable + if (!Protease.Retain(i, InitiatorMethionineBehavior, protein[0])) //only if cleave, because then not made earlier during retain + { + int tempIndex = oneBasedIndicesToCleaveAfter[i + maximumMissedCleavagesIndexShift]; + peptides.Add(new ProteolyticPeptide(protein, tempIndex - 
MaxPeptideLength + 1, tempIndex, MaximumMissedCleavages, CleavageSpecificity.Semi, "semi")); + } + } + } + } + } + + //wrap up the termini that weren't hit earlier + int lastIndex = oneBasedIndicesToCleaveAfter.Count - 1; //last cleavage index (the c-terminus) + int maxIndexDifference = MaximumMissedCleavages < lastIndex ? MaximumMissedCleavages : lastIndex; //the number of index differences allowed. + //If the protein has fewer cleavage sites than allowed missed cleavages, just use the number of cleavage sites (lastIndex) + bool nTerminusFragmentation = DigestionParams.FragmentationTerminus == FragmentationTerminus.N; + for (int i = 1; i <= maxIndexDifference; i++) //i is the difference (in indexes) between indexes (cleavages), so it needs to start at 1, or the peptide would have length = 0 + { + int startIndex = nTerminusFragmentation ? + oneBasedIndicesToCleaveAfter[lastIndex - i] : + oneBasedIndicesToCleaveAfter[0]; + int endIndex = nTerminusFragmentation ? + oneBasedIndicesToCleaveAfter[lastIndex] : + oneBasedIndicesToCleaveAfter[i]; + + int peptideLength = endIndex - startIndex; + if (peptideLength >= MinPeptideLength) + { + if (peptideLength <= MaxPeptideLength) //if okay length, add it up to the terminus + { + peptides.Add(new ProteolyticPeptide(protein, startIndex + 1, endIndex, i - 1, CleavageSpecificity.Full, "full")); + } + else //update so that not the end of terminus + { + if (nTerminusFragmentation) + { + endIndex = startIndex + MaxPeptideLength; + } + else + { + startIndex = endIndex - MaxPeptideLength; + } + peptides.Add(new ProteolyticPeptide(protein, startIndex + 1, endIndex, i - 1, CleavageSpecificity.Semi, "semi")); + } + } + } + + // Also digest using the proteolysis product start/end indices + foreach (ProteolysisProduct product in protein.ProteolysisProducts) + { + //if fixed N, we care if the start position is novel + if (DigestionParams.FragmentationTerminus == FragmentationTerminus.N) + { + //if has value and not a duplicate + if 
(product.OneBasedBeginPosition.HasValue && !oneBasedIndicesToCleaveAfter.Contains(product.OneBasedBeginPosition.Value - 1)) + { + int proteaseClevageIndex = 0; + + //get the first cleavage index after the start of the proteolysis product + while (oneBasedIndicesToCleaveAfter[proteaseClevageIndex] < product.OneBasedBeginPosition.Value) + { + proteaseClevageIndex++; + } + //add max missed cleavages + proteaseClevageIndex += MaximumMissedCleavages; + + //set to the end if we overshot + if (proteaseClevageIndex >= oneBasedIndicesToCleaveAfter.Count) + { + proteaseClevageIndex = oneBasedIndicesToCleaveAfter.Count - 1; + } + int endIndex = oneBasedIndicesToCleaveAfter[proteaseClevageIndex]; + + //set to product end value if cleavages extend past + if (product.OneBasedEndPosition.HasValue && product.OneBasedEndPosition.Value < endIndex) + { + endIndex = product.OneBasedEndPosition.Value; + } + + //limit length to the maximum allowed if necessary + if (endIndex - product.OneBasedBeginPosition.Value >= MaxPeptideLength) + { + endIndex = product.OneBasedBeginPosition.Value + MaxPeptideLength - 1; + } + + //if it's bigger than the minimum allowed, then add it + if (endIndex - product.OneBasedBeginPosition.Value + 1 >= MinPeptideLength) + { + peptides.Add(new ProteolyticPeptide(protein, product.OneBasedBeginPosition.Value, endIndex, MaximumMissedCleavages, CleavageSpecificity.Full, product.Type + " start")); + } + } + } + else //if fixed C, we care if the end position is novel + { + //if has value and not a duplicate + if (product.OneBasedEndPosition.HasValue && !oneBasedIndicesToCleaveAfter.Contains(product.OneBasedEndPosition.Value)) + { + int proteaseClevageIndex = 0; + + //get the first cleavage index after the start of the proteolysis product + while (oneBasedIndicesToCleaveAfter[proteaseClevageIndex] < product.OneBasedEndPosition.Value) + { + proteaseClevageIndex++; + } + //subtract max missed cleavages + proteaseClevageIndex -= (MaximumMissedCleavages + 1); //+1 because 
we overshot in the while loop + + //set to the beginning if we overshot + if (proteaseClevageIndex < 0) + { + proteaseClevageIndex = 0; + } + int beginIndex = oneBasedIndicesToCleaveAfter[proteaseClevageIndex] + 1; + + //set to product end value if cleavages extend past + if (product.OneBasedBeginPosition.HasValue && product.OneBasedBeginPosition.Value > beginIndex) + { + beginIndex = product.OneBasedBeginPosition.Value; + } + + //limit length to the maximum allowed if necessary + if (product.OneBasedEndPosition.Value - beginIndex >= MaxPeptideLength) + { + beginIndex = product.OneBasedEndPosition.Value - MaxPeptideLength + 1; + } + //if it's bigger than the minimum allowed, then add it + if (product.OneBasedEndPosition.Value - beginIndex + 1 >= MinPeptideLength) + { + peptides.Add(new ProteolyticPeptide(protein, beginIndex, product.OneBasedEndPosition.Value, MaximumMissedCleavages, CleavageSpecificity.Full, product.Type + " start")); + } + } + } + } + + return peptides; + } + + /// + /// Gets peptides for specific protease digestion of a protein + /// + /// + /// + public IEnumerable Digestion(Protein protein, bool topDownTruncationSearch = false) + { + return Protease.GetUnmodifiedPeptides(protein, MaximumMissedCleavages, InitiatorMethionineBehavior, MinPeptideLength, MaxPeptideLength, DigestionParams.SpecificProtease, topDownTruncationSearch); + } + } +} \ No newline at end of file diff --git a/mzLib/MassSpectrometry/Proteomics/ProteolyticDigestion/ProteolyticPeptide.cs b/mzLib/MassSpectrometry/Proteomics/ProteolyticDigestion/ProteolyticPeptide.cs new file mode 100644 index 000000000..4ef3f5150 --- /dev/null +++ b/mzLib/MassSpectrometry/Proteomics/ProteolyticDigestion/ProteolyticPeptide.cs @@ -0,0 +1,368 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using Proteomics; + +namespace Proteomics.ProteolyticDigestion +{ + /// + /// Product of digesting a protein + /// Contains methods for modified peptide combinitorics + /// + 
[Serializable] + public class ProteolyticPeptide + { + protected string _baseSequence; + + internal ProteolyticPeptide(Protein protein, int oneBasedStartResidueInProtein, int oneBasedEndResidueInProtein, int missedCleavages, CleavageSpecificity cleavageSpecificityForFdrCategory, string peptideDescription = null, string baseSequence = null) + { + _protein = protein; + OneBasedStartResidueInProtein = oneBasedStartResidueInProtein; + OneBasedEndResidueInProtein = oneBasedEndResidueInProtein; + MissedCleavages = missedCleavages; + CleavageSpecificityForFdrCategory = cleavageSpecificityForFdrCategory; + PeptideDescription = peptideDescription; + _baseSequence = baseSequence; + } + + [NonSerialized] private Protein _protein; // protein that this peptide is a digestion product of + public int OneBasedStartResidueInProtein { get; } // the residue number at which the peptide begins (the first residue in a protein is 1) + public int OneBasedEndResidueInProtein { get; } // the residue number at which the peptide ends + public int MissedCleavages { get; } // the number of missed cleavages this peptide has with respect to the digesting protease + public string PeptideDescription { get; internal set; } //unstructured explanation of source + public CleavageSpecificity CleavageSpecificityForFdrCategory { get; internal set; } //structured explanation of source + public int Length { get { return BaseSequence.Length; } } //how many residues long the peptide is + + public virtual char PreviousAminoAcid + { + get + { + return OneBasedStartResidueInProtein > 1 ? Protein[OneBasedStartResidueInProtein - 2] : '-'; + } + } + + public virtual char NextAminoAcid + { + get + { + return OneBasedEndResidueInProtein < Protein.Length ? 
Protein[OneBasedEndResidueInProtein] : '-'; + } + } + + public Protein Protein + { + get { return _protein; } + protected set { _protein = value; } + } + + public string BaseSequence + { + get + { + if (_baseSequence == null) + { + _baseSequence = Protein.BaseSequence.Substring(OneBasedStartResidueInProtein - 1, OneBasedEndResidueInProtein - OneBasedStartResidueInProtein + 1); + } + return _baseSequence; + } + } + + public char this[int zeroBasedIndex] + { + get + { + return BaseSequence[zeroBasedIndex]; + } + } + + /// + /// Gets the peptides for a specific protein interval + /// + /// + /// + /// + /// + /// + internal IEnumerable GetModifiedPeptides(IEnumerable allKnownFixedModifications, + DigestionParams digestionParams, List variableModifications) + { + int peptideLength = OneBasedEndResidueInProtein - OneBasedStartResidueInProtein + 1; + int maximumVariableModificationIsoforms = digestionParams.MaxModificationIsoforms; + int maxModsForPeptide = digestionParams.MaxModsForPeptide; + var twoBasedPossibleVariableAndLocalizeableModifications = new Dictionary>(peptideLength + 4); + + var pepNTermVariableMods = new List(); + twoBasedPossibleVariableAndLocalizeableModifications.Add(1, pepNTermVariableMods); + + var pepCTermVariableMods = new List(); + twoBasedPossibleVariableAndLocalizeableModifications.Add(peptideLength + 2, pepCTermVariableMods); + + foreach (Modification variableModification in variableModifications) + { + // Check if can be a n-term mod + if (CanBeNTerminalMod(variableModification, peptideLength) && !ModificationLocalization.UniprotModExists(Protein, 1, variableModification)) + { + pepNTermVariableMods.Add(variableModification); + } + + for (int r = 0; r < peptideLength; r++) + { + if (ModificationLocalization.ModFits(variableModification, Protein.BaseSequence, r + 1, peptideLength, OneBasedStartResidueInProtein + r) + && variableModification.LocationRestriction == "Anywhere." 
&& !ModificationLocalization.UniprotModExists(Protein, r + 1, variableModification)) + { + if (!twoBasedPossibleVariableAndLocalizeableModifications.TryGetValue(r + 2, out List residueVariableMods)) + { + residueVariableMods = new List { variableModification }; + twoBasedPossibleVariableAndLocalizeableModifications.Add(r + 2, residueVariableMods); + } + else + { + residueVariableMods.Add(variableModification); + } + } + } + // Check if can be a c-term mod + if (CanBeCTerminalMod(variableModification, peptideLength) && !ModificationLocalization.UniprotModExists(Protein, peptideLength, variableModification)) + { + pepCTermVariableMods.Add(variableModification); + } + } + + // LOCALIZED MODS + foreach (var kvp in Protein.OneBasedPossibleLocalizedModifications) + { + bool inBounds = kvp.Key >= OneBasedStartResidueInProtein && kvp.Key <= OneBasedEndResidueInProtein; + if (!inBounds) + { + continue; + } + + int locInPeptide = kvp.Key - OneBasedStartResidueInProtein + 1; + foreach (Modification modWithMass in kvp.Value) + { + if (modWithMass is Modification variableModification) + { + // Check if can be a n-term mod + if (locInPeptide == 1 && CanBeNTerminalMod(variableModification, peptideLength) && !Protein.IsDecoy) + { + pepNTermVariableMods.Add(variableModification); + } + + int r = locInPeptide - 1; + if (r >= 0 && r < peptideLength + && (Protein.IsDecoy || + (ModificationLocalization.ModFits(variableModification, Protein.BaseSequence, r + 1, peptideLength, OneBasedStartResidueInProtein + r) + && variableModification.LocationRestriction == "Anywhere."))) + { + if (!twoBasedPossibleVariableAndLocalizeableModifications.TryGetValue(r + 2, out List residueVariableMods)) + { + residueVariableMods = new List { variableModification }; + twoBasedPossibleVariableAndLocalizeableModifications.Add(r + 2, residueVariableMods); + } + else + { + residueVariableMods.Add(variableModification); + } + } + + // Check if can be a c-term mod + if (locInPeptide == peptideLength && 
CanBeCTerminalMod(variableModification, peptideLength) && !Protein.IsDecoy) + { + pepCTermVariableMods.Add(variableModification); + } + } + } + } + + int variable_modification_isoforms = 0; + + foreach (Dictionary kvp in GetVariableModificationPatterns(twoBasedPossibleVariableAndLocalizeableModifications, maxModsForPeptide, peptideLength)) + { + int numFixedMods = 0; + foreach (var ok in GetFixedModsOneIsNterminus(peptideLength, allKnownFixedModifications)) + { + if (!kvp.ContainsKey(ok.Key)) + { + numFixedMods++; + kvp.Add(ok.Key, ok.Value); + } + } + yield return new PeptideWithSetModifications(Protein, digestionParams, OneBasedStartResidueInProtein, OneBasedEndResidueInProtein, + CleavageSpecificityForFdrCategory, PeptideDescription, MissedCleavages, kvp, numFixedMods); + variable_modification_isoforms++; + if (variable_modification_isoforms == maximumVariableModificationIsoforms) + { + yield break; + } + } + } + + /// + /// Determines whether given modification can be an N-terminal modification + /// + /// + /// + /// + private bool CanBeNTerminalMod(Modification variableModification, int peptideLength) + { + return ModificationLocalization.ModFits(variableModification, Protein.BaseSequence, 1, peptideLength, OneBasedStartResidueInProtein) + && (variableModification.LocationRestriction == "N-terminal." || variableModification.LocationRestriction == "Peptide N-terminal."); + } + + /// + /// Determines whether given modification can be a C-terminal modification + /// + /// + /// + /// + private bool CanBeCTerminalMod(Modification variableModification, int peptideLength) + { + return ModificationLocalization.ModFits(variableModification, Protein.BaseSequence, peptideLength, peptideLength, OneBasedStartResidueInProtein + peptideLength - 1) + && (variableModification.LocationRestriction == "C-terminal." 
|| variableModification.LocationRestriction == "Peptide C-terminal."); + } + + private static IEnumerable> GetVariableModificationPatterns(Dictionary> possibleVariableModifications, int maxModsForPeptide, int peptideLength) + { + if (possibleVariableModifications.Count == 0) + { + yield return null; + } + else + { + var possible_variable_modifications = new Dictionary>(possibleVariableModifications); + + int[] base_variable_modification_pattern = new int[peptideLength + 4]; + var totalAvailableMods = possible_variable_modifications.Sum(b => b.Value == null ? 0 : b.Value.Count); + for (int variable_modifications = 0; variable_modifications <= Math.Min(totalAvailableMods, maxModsForPeptide); variable_modifications++) + { + foreach (int[] variable_modification_pattern in GetVariableModificationPatterns(new List>>(possible_variable_modifications), + possible_variable_modifications.Count - variable_modifications, base_variable_modification_pattern, 0)) + { + yield return GetNewVariableModificationPattern(variable_modification_pattern, possible_variable_modifications); + } + } + } + } + + private static IEnumerable GetVariableModificationPatterns(List>> possibleVariableModifications, + int unmodifiedResiduesDesired, int[] variableModificationPattern, int index) + { + if (index < possibleVariableModifications.Count - 1) + { + if (unmodifiedResiduesDesired > 0) + { + variableModificationPattern[possibleVariableModifications[index].Key] = 0; + foreach (int[] new_variable_modification_pattern in GetVariableModificationPatterns(possibleVariableModifications, + unmodifiedResiduesDesired - 1, variableModificationPattern, index + 1)) + { + yield return new_variable_modification_pattern; + } + } + if (unmodifiedResiduesDesired < possibleVariableModifications.Count - index) + { + for (int i = 1; i <= possibleVariableModifications[index].Value.Count; i++) + { + variableModificationPattern[possibleVariableModifications[index].Key] = i; + foreach (int[] 
new_variable_modification_pattern in GetVariableModificationPatterns(possibleVariableModifications, + unmodifiedResiduesDesired, variableModificationPattern, index + 1)) + { + yield return new_variable_modification_pattern; + } + } + } + } + else + { + if (unmodifiedResiduesDesired > 0) + { + variableModificationPattern[possibleVariableModifications[index].Key] = 0; + yield return variableModificationPattern; + } + else + { + for (int i = 1; i <= possibleVariableModifications[index].Value.Count; i++) + { + variableModificationPattern[possibleVariableModifications[index].Key] = i; + yield return variableModificationPattern; + } + } + } + } + + private static Dictionary GetNewVariableModificationPattern(int[] variableModificationArray, + IEnumerable>> possibleVariableModifications) + { + var modification_pattern = new Dictionary(); + + foreach (KeyValuePair> kvp in possibleVariableModifications) + { + if (variableModificationArray[kvp.Key] > 0) + { + modification_pattern.Add(kvp.Key, kvp.Value[variableModificationArray[kvp.Key] - 1]); + } + } + + return modification_pattern; + } + + private Dictionary GetFixedModsOneIsNterminus(int peptideLength, + IEnumerable allKnownFixedModifications) + { + var fixedModsOneIsNterminus = new Dictionary(peptideLength + 3); + foreach (Modification mod in allKnownFixedModifications) + { + switch (mod.LocationRestriction) + { + case "N-terminal.": + case "Peptide N-terminal.": + //the modification is protease associated and is applied to the n-terminal cleaved residue, not at the beginign of the protein + if (mod.ModificationType == "Protease" && ModificationLocalization.ModFits(mod, Protein.BaseSequence, 1, peptideLength, OneBasedStartResidueInProtein)) + { + if (OneBasedStartResidueInProtein != 1) + { + fixedModsOneIsNterminus[2] = mod; + } + } + //Normal N-terminal peptide modification + else if (ModificationLocalization.ModFits(mod, Protein.BaseSequence, 1, peptideLength, OneBasedStartResidueInProtein)) + { + 
fixedModsOneIsNterminus[1] = mod; + } + break; + + case "Anywhere.": + for (int i = 2; i <= peptideLength + 1; i++) + { + if (ModificationLocalization.ModFits(mod, Protein.BaseSequence, i - 1, peptideLength, OneBasedStartResidueInProtein + i - 2)) + { + fixedModsOneIsNterminus[i] = mod; + } + } + break; + + case "C-terminal.": + case "Peptide C-terminal.": + //the modification is protease associated and is applied to the c-terminal cleaved residue, not if it is at the end of the protein + if (mod.ModificationType == "Protease" && ModificationLocalization.ModFits(mod, Protein.BaseSequence, peptideLength, peptideLength, OneBasedStartResidueInProtein + peptideLength - 1)) + { + if (OneBasedEndResidueInProtein != Protein.Length) + { + fixedModsOneIsNterminus[peptideLength+1] = mod; + } + + } + //Normal C-terminal peptide modification + else if (ModificationLocalization.ModFits(mod, Protein.BaseSequence, peptideLength, peptideLength, OneBasedStartResidueInProtein + peptideLength - 1)) + { + fixedModsOneIsNterminus[peptideLength + 2] = mod; + } + break; + + default: + throw new NotSupportedException("This terminus localization is not supported."); + } + } + return fixedModsOneIsNterminus; + } + } +} \ No newline at end of file diff --git a/mzLib/MassSpectrometry/Proteomics/ProteolyticDigestion/proteases.tsv b/mzLib/MassSpectrometry/Proteomics/ProteolyticDigestion/proteases.tsv new file mode 100644 index 000000000..a18aff8d1 --- /dev/null +++ b/mzLib/MassSpectrometry/Proteomics/ProteolyticDigestion/proteases.tsv @@ -0,0 +1,25 @@ +Name Sequences Inducing Cleavage Sequences Preventing Cleavage Cleavage Terminus Cleavage Specificity PSI-MS Accession Number PSI-MS Name Site Regular Expression Cleavage Mass Shifts Notes +Arg-C R| full MS:1001303 Arg-C (?<=R)(?!P) +Asp-N |D full MS:1001304 Asp-N (?=[BD]) +chymotrypsin (don't cleave before proline) "F[P]|,W[P]|,Y[P]|" full MS:1001306 Chymotrypsin (?<=[FYWL])(?!P) +chymotrypsin (cleave before proline) "F|,W|,Y|" full MS:1001306 
Chymotrypsin (?<=[FYWL]) +CNBr M| full MS:1001307 CNBr (?<=M) Homoserine lactone on M +elastase "A|,V|,S|,G|,L|,I|" full Elastase (?<=[AVSGLI]) +Glu-C E| full +Glu-C (with asp) "E|,D|" full +Lys-C (don't cleave before proline) K[P]| full MS:1001309 Lys-C (?<=K)(?!P) +Lys-C (cleave before proline) K| full MS:1001310 Lys-C/P (?<=K) +Lys-N |K full +semi-trypsin "K|,R|" semi MS:1001313 Trypsin/P (?<=[KR]) +trypsin "K|,R|" full MS:1001313 Trypsin/P (?<=[KR]) +tryptophan oxidation W| full +non-specific X| full MS:1001956 unspecific cleavage +top-down none MS:1001955 no cleavage +singleN SingleN MS:1001957 single cleavage +singleC SingleC MS:1001958 single cleavage +peptidomics none no cleavage +collagenase GPX|GPX full +StcE-trypsin "TX|T,TX|S,SX|T,SX|S,K|,R|" full StcE/Trpsin +CNBr_old M| full MS:1001307 CNBr (?<=M) +CNBr_N |M full MS:1001307 CNBr (?<=M) Test on M +ProAlanase "P|,A|" full diff --git a/mzLib/MassSpectrometry/Proteomics/RetentionTimePrediction/CZE.cs b/mzLib/MassSpectrometry/Proteomics/RetentionTimePrediction/CZE.cs new file mode 100644 index 000000000..846291ced --- /dev/null +++ b/mzLib/MassSpectrometry/Proteomics/RetentionTimePrediction/CZE.cs @@ -0,0 +1,258 @@ +using System; + +namespace Proteomics.RetentionTimePrediction +{ + /// + /// This class will return theoretical retention times, hydrobphobicites, electrophoretic mobilities and etc. for peptides. + /// These values would be useful for comparision with experimentally observed retention times. This information might be + /// informative for evaluation of false positives and also for discerning the prescence of certain PTMs that would + /// alter the experimental chromatographic behavior. + /// + /// This class returns calculated electrophoretic mobility for an observed peptide. 
/// <remarks>
/// Electrophoretic-mobility helper for peptides detected in a CZE-MS/MS experiment.
/// An instance is configured with the total capillary length (m) and the applied field
/// strength (V/m); the instance methods convert between an observed migration time (min)
/// and a mobility for that configuration. The static prediction method needs no instance.
/// </remarks>
public class CZE
{
    private readonly double ColumnLength; // in meters
    private readonly double VoltsPerMeter; // in volts/meter

    /// <summary>
    /// Creates a calculator for one capillary / field-strength configuration.
    /// </summary>
    /// <param name="columnLength">Total capillary length in meters.</param>
    /// <param name="voltsPerMeter">Applied field strength in volts per meter.</param>
    public CZE(double columnLength, double voltsPerMeter)
    {
        ColumnLength = columnLength;
        VoltsPerMeter = voltsPerMeter;
    }

    /// <summary>
    /// Returns the experimental electrophoretic mobility for an observed migration time,
    /// computed as ColumnLength / (60 * timeMin * VoltsPerMeter) * 1e9.
    /// </summary>
    /// <param name="timeMin">Observed migration time in minutes; must be greater than zero.</param>
    /// <returns>
    /// The mobility, or -1 when the column length is negative or <paramref name="timeMin"/>
    /// is zero or negative.
    /// </returns>
    public double ExperimentalElectrophoreticMobility(double timeMin)
    {
        // A zero time used to pass the check and divide by zero, yielding Infinity/NaN
        // instead of the -1 sentinel that signals unusable input.
        if (ColumnLength >= 0 && timeMin > 0)
        {
            return ColumnLength / (60 * timeMin * VoltsPerMeter) * 1e9;
        }

        return -1;
    }

    /// <summary>
    /// Returns the expected migration time (min) for a given electrophoretic mobility,
    /// computed as (ColumnLength * 1e9) / (60 * VoltsPerMeter * electrophoreticMobility).
    /// </summary>
    /// <param name="electrophoreticMobility">Mobility on the same 1e9 scale returned by
    /// <see cref="ExperimentalElectrophoreticMobility"/>.</param>
    /// <returns>The migration time in minutes, or -1 when the column length is negative.</returns>
    public double TheoreticalElutionTime(double electrophoreticMobility)
    {
        if (ColumnLength >= 0)
        {
            return (ColumnLength * 1e9) / (60 * VoltsPerMeter * electrophoreticMobility);
        }

        return -1;
    }

    /// <summary>
    /// Calculates the predicted electrophoretic mobility of a peptide.
    ///
    /// See for reference
    /// Anal Chem. 2017 Feb 7;89(3):2000-2008. doi: 10.1021/acs.analchem.6b04544. Epub 2017 Jan 19.
    /// Predicting Electrophoretic Mobility of Tryptic Peptides for High-Throughput CZE-MS Analysis.
    /// Krokhin OV, Anderson G, Spicer V, Sun L, Dovichi NJ.
    /// https://www.ncbi.nlm.nih.gov/pubmed/28208305
    /// </summary>
    /// <param name="peptideSequence">Unmodified amino-acid sequence (one-letter codes).</param>
    /// <param name="observedMass">Observed peptide mass.</param>
    /// <returns>The predicted mobility.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="peptideSequence"/> is null.</exception>
    public static double PredictedElectrophoreticMobility(string peptideSequence, double observedMass)
    {
        if (peptideSequence == null)
        {
            throw new ArgumentNullException(nameof(peptideSequence));
        }

        // Evaluate the corrected charge once; it feeds both the log term and the offset.
        double correctedCharge = PredictedChargeCorrected(peptideSequence);

        // Calculation described in Anal Chem. 2017 Feb 7;89(3):2000-2008.
        // 3.069 and 386 are coefficients applied to align the output with experimentally measured
        // values (slope 1, intercept 0). The original author notes they may need to be re-fitted.
        // Other values come from the best-fit model of Cifuentes and Poppe
        // (J. Chromatogr. A 1994, 680, 321-340) as described in the Anal Chem paper.
        return 3.069 + 386 * Math.Log(1d + 0.35 * correctedCharge) /
               (Math.Pow(observedMass, 0.411) + Offset(correctedCharge, peptideSequence.Length));
    }

    /// <summary>
    /// The predicted charge is +1 for the N-terminus plus one for every lysine (K),
    /// arginine (R) and histidine (H) in the sequence.
    /// </summary>
    private static double PredictedCharge(string peptideSequence)
    {
        int basicResidues = 0;
        foreach (char aa in peptideSequence)
        {
            if (aa == 'R' || aa == 'K' || aa == 'H')
            {
                basicResidues++;
            }
        }

        return 1d + basicResidues;
    }

    /// <summary>
    /// Applies the position-dependent charge corrections for D, E, N and Q (predominantly at the
    /// peptide termini, plus a presence/absence term for the interior) and adds the basic
    /// <see cref="PredictedCharge"/>.
    ///
    /// The coefficients are taken from a table in Anal Chem. 2017 Feb 7;89(3):2000-2008.
    /// doi: 10.1021/acs.analchem.6b04544. The original author notes they could be re-estimated
    /// per dataset in the future.
    ///
    /// Peptides shorter than five residues previously threw (index/substring out of range).
    /// They are now handled by correcting each residue at most once: the N-terminal window
    /// (positions 1-3) takes precedence over the C-terminal window (last two positions), and the
    /// interior is empty. Results for peptides of five or more residues are unchanged.
    /// </summary>
    private static double PredictedChargeCorrected(string peptideSequence)
    {
        int length = peptideSequence.Length;
        double runningSum = 0;

        // N-terminal window: first, second and third residue (no Q term for the first residue).
        if (length > 0)
        {
            runningSum += PositionalCorrection(peptideSequence[0], -0.26741, -0.06852, 0.011699, 0);
        }
        if (length > 1)
        {
            runningSum += PositionalCorrection(peptideSequence[1], -0.10947, -0.04011, 0.012535, 0.011699);
        }
        if (length > 2)
        {
            runningSum += PositionalCorrection(peptideSequence[2], -0.08022, -0.03426, 0.016713, 0.00585);
        }

        // C-terminal window: only residues that are not already part of the N-terminal window.
        if (length - 2 > 2)
        {
            runningSum += PositionalCorrection(peptideSequence[length - 2], -0.03844, -0.01337, 0.026741, -0.00084);
        }
        if (length - 1 > 2)
        {
            runningSum += PositionalCorrection(peptideSequence[length - 1], -0.02256, -0.00418, 0.010864, -0.0117);
        }

        // Interior residues (between the two windows): one correction per residue type present.
        string internalString = length > 5 ? peptideSequence.Substring(3, length - 5) : string.Empty;
        if (internalString.Contains("D"))
        {
            runningSum -= 0.05014;
        }
        if (internalString.Contains("E"))
        {
            runningSum -= 0.01922;
        }
        if (internalString.Contains("N"))
        {
            runningSum += 0.012535;
        }
        if (internalString.Contains("Q"))
        {
            runningSum -= 0.000251;
        }

        runningSum += PredictedCharge(peptideSequence);

        return runningSum;
    }

    /// <summary>
    /// Selects the signed correction for one residue at one position; residues other than
    /// D, E, N and Q contribute nothing.
    /// </summary>
    private static double PositionalCorrection(char residue, double ifD, double ifE, double ifN, double ifQ)
    {
        switch (residue)
        {
            case 'D':
                return ifD;
            case 'E':
                return ifE;
            case 'N':
                return ifN;
            case 'Q':
                return ifQ;
            default:
                return 0;
        }
    }

    /// <summary>
    /// Placeholder for the offset term of the Anal Chem model. In the paper the offset is a
    /// 5th-order polynomial best fit to a plot of Zc/N versus the difference between experimental
    /// and predicted electrophoretic mobility, where N is the peptide length. The original author
    /// distrusted that fit and left it out for now, so both arguments are currently ignored and
    /// the offset is always 0.
    /// </summary>
    private static double Offset(double correctedCharge, int length)
    {
        return 0;
    }
}
+ /* URL, http://hs2.proteome.ca/SSRCalc/SSRCalc.html + /* + /* + /* These subroutines are based on web version SSRCalculator of the Copyright holder listed as in the following: + /* + /* Version 3.0 2005.02.28 + /* Copyright (c) 2005 John Wilkins + /* Sequence Specific Retention Calculator + /* Authors: Oleg Krokhin, Vic Spicer, John Cortens + */ + + /* Translated from perl to C, Ted Holzman FHCRC, 6/2006 */ + /* Retranslated from C to Java, Ted Holzman FHCRC 7/2006 */ + /* Translated from Java to C#, Brendan MacLean UW 10/2008 */ + /* NB: This is a version 0.1 direct translation. + /* An attempt has been made to keep function names, variable names, and algorithms + /* as close as possible to the original perl. + */ + + + // ReSharper disable InconsistentNaming + // ReSharper disable CharImplicitlyConvertedToNumeric + public class SSRCalc3 + { + /* Lookup table data. These are translations of the .h table in C which is a */ + /* translation of the ReadParmFile perl routine. This does not read a parameter */ + /* file; it makes static initializers for the parameter data. 
*/ + + public const String VERSION = "Krokhin,3.0"; // Not L10N + + public IEnumerable ChooseRegressionPeptides(IEnumerable peptides, out int minCount) + { + minCount = 0; + return peptides; + } + + public IEnumerable GetStandardPeptides(IEnumerable peptides) + { + return new PeptideWithSetModifications[] { }; + } + + //public RetentionScoreCalculatorSpec Initialize(IProgressMonitor loadMonitor) + //{ + // return null; + //} + + private static readonly CLUSTCOMB_List CLUSTCOMB = new CLUSTCOMB_List(); + private static readonly Dictionary HlxScore4 = new Dictionary(); + private static readonly Dictionary HlxScore5 = new Dictionary(); + private static readonly Dictionary HlxScore6 = new Dictionary(); + private static readonly int[] EMap = new int[128]; + + private sealed class CLUSTCOMB_List : List> + { + public void Add(string pattern, double value) + { + Add(new KeyValuePair(new Regex(pattern), value)); + } + } + + static SSRCalc3() + { + + /* + Translator1 note: For the Java version we are prepending and appending 0s to the "pick" (key) column. This + is done dynamically and repeatedly in the perl code. As far as I can tell, pick is never used + without the surrounding 0s. 
+ */ + + // ReSharper disable NonLocalizedString + CLUSTCOMB.Add("0110", 0.3); + CLUSTCOMB.Add("0150", 0.4); + CLUSTCOMB.Add("0510", 0.4); + CLUSTCOMB.Add("0550", 1.3); + CLUSTCOMB.Add("01110", 0.5); + CLUSTCOMB.Add("01150", 0.7); + CLUSTCOMB.Add("01510", 0.7); + CLUSTCOMB.Add("01550", 2.1); + CLUSTCOMB.Add("05110", 0.7); + CLUSTCOMB.Add("05150", 2.1); + CLUSTCOMB.Add("05510", 2.1); + CLUSTCOMB.Add("05550", 2.8); + CLUSTCOMB.Add("011110", 0.7); + CLUSTCOMB.Add("011150", 0.9); + CLUSTCOMB.Add("011510", 0.9); + CLUSTCOMB.Add("011550", 2.2); + CLUSTCOMB.Add("015110", 0.9); + CLUSTCOMB.Add("015150", 2.2); + CLUSTCOMB.Add("015510", 0.9); + CLUSTCOMB.Add("015550", 3.0); + CLUSTCOMB.Add("051110", 0.9); + CLUSTCOMB.Add("051150", 2.2); + CLUSTCOMB.Add("051510", 2.2); + CLUSTCOMB.Add("051550", 3.0); + CLUSTCOMB.Add("055110", 2.2); + CLUSTCOMB.Add("055150", 3.0); + CLUSTCOMB.Add("055510", 3.0); + CLUSTCOMB.Add("055550", 3.5); + CLUSTCOMB.Add("0111110", 0.9); + CLUSTCOMB.Add("0111150", 1.0); + CLUSTCOMB.Add("0111510", 1.0); + CLUSTCOMB.Add("0111550", 2.3); + CLUSTCOMB.Add("0115110", 1.0); + CLUSTCOMB.Add("0115150", 2.3); + CLUSTCOMB.Add("0115510", 2.3); + CLUSTCOMB.Add("0115550", 3.1); + CLUSTCOMB.Add("0151110", 1.0); + CLUSTCOMB.Add("0151150", 2.3); + CLUSTCOMB.Add("0151510", 2.3); + CLUSTCOMB.Add("0151550", 3.1); + CLUSTCOMB.Add("0155110", 2.3); + CLUSTCOMB.Add("0155150", 3.1); + CLUSTCOMB.Add("0155510", 3.1); + CLUSTCOMB.Add("0155550", 3.6); + CLUSTCOMB.Add("0511110", 1.0); + CLUSTCOMB.Add("0511150", 2.3); + CLUSTCOMB.Add("0511510", 2.3); + CLUSTCOMB.Add("0511550", 3.1); + CLUSTCOMB.Add("0515110", 3.6); + CLUSTCOMB.Add("0515150", 2.3); + CLUSTCOMB.Add("0515510", 3.1); + CLUSTCOMB.Add("0515550", 3.6); + CLUSTCOMB.Add("0551110", 2.3); + CLUSTCOMB.Add("0551150", 3.1); + CLUSTCOMB.Add("0551510", 3.1); + CLUSTCOMB.Add("0551550", 3.6); + CLUSTCOMB.Add("0555110", 3.1); + CLUSTCOMB.Add("0555150", 3.6); + CLUSTCOMB.Add("0555510", 3.6); + CLUSTCOMB.Add("0555550", 4.0); + 
CLUSTCOMB.Add("01111110", 1.1); + CLUSTCOMB.Add("01111150", 1.7); + CLUSTCOMB.Add("01111510", 1.7); + CLUSTCOMB.Add("01111550", 2.5); + CLUSTCOMB.Add("01115110", 1.7); + CLUSTCOMB.Add("01115150", 2.5); + CLUSTCOMB.Add("01115510", 2.5); + CLUSTCOMB.Add("01115550", 3.3); + CLUSTCOMB.Add("01151110", 1.7); + CLUSTCOMB.Add("01151150", 2.5); + CLUSTCOMB.Add("01151510", 2.5); + CLUSTCOMB.Add("01151550", 3.3); + CLUSTCOMB.Add("01155110", 2.5); + CLUSTCOMB.Add("01155150", 3.3); + CLUSTCOMB.Add("01155510", 3.3); + CLUSTCOMB.Add("01155550", 3.7); + CLUSTCOMB.Add("01511110", 1.7); + CLUSTCOMB.Add("01511150", 2.5); + CLUSTCOMB.Add("01511510", 2.5); + CLUSTCOMB.Add("01511550", 3.3); + CLUSTCOMB.Add("01515110", 2.5); + CLUSTCOMB.Add("01515150", 3.3); + CLUSTCOMB.Add("01515510", 3.3); + CLUSTCOMB.Add("01515550", 3.7); + CLUSTCOMB.Add("01551110", 2.5); + CLUSTCOMB.Add("01551150", 3.3); + CLUSTCOMB.Add("01551510", 3.3); + CLUSTCOMB.Add("01551550", 3.7); + CLUSTCOMB.Add("01555110", 3.3); + CLUSTCOMB.Add("01555150", 3.7); + CLUSTCOMB.Add("01555510", 3.7); + CLUSTCOMB.Add("01555550", 4.1); + CLUSTCOMB.Add("05111110", 1.7); + CLUSTCOMB.Add("05111150", 2.5); + CLUSTCOMB.Add("05111510", 2.5); + CLUSTCOMB.Add("05111550", 3.3); + CLUSTCOMB.Add("05115110", 2.5); + CLUSTCOMB.Add("05115150", 3.3); + CLUSTCOMB.Add("05115510", 3.3); + CLUSTCOMB.Add("05115550", 3.7); + CLUSTCOMB.Add("05151110", 2.5); + CLUSTCOMB.Add("05151150", 3.3); + CLUSTCOMB.Add("05151510", 3.3); + CLUSTCOMB.Add("05151550", 3.7); + CLUSTCOMB.Add("05155110", 3.3); + CLUSTCOMB.Add("05155150", 3.7); + CLUSTCOMB.Add("05155510", 3.7); + CLUSTCOMB.Add("05155550", 4.1); + CLUSTCOMB.Add("05511110", 2.5); + CLUSTCOMB.Add("05511150", 3.3); + CLUSTCOMB.Add("05511510", 3.3); + CLUSTCOMB.Add("05511550", 3.7); + CLUSTCOMB.Add("05515110", 3.3); + CLUSTCOMB.Add("05515150", 3.7); + CLUSTCOMB.Add("05515510", 3.7); + CLUSTCOMB.Add("05515550", 4.1); + CLUSTCOMB.Add("05551110", 3.3); + CLUSTCOMB.Add("05551150", 3.7); + CLUSTCOMB.Add("05551510", 
3.7); + CLUSTCOMB.Add("05551550", 4.1); + CLUSTCOMB.Add("05555110", 3.7); + CLUSTCOMB.Add("05555150", 4.1); + CLUSTCOMB.Add("05555510", 4.1); + CLUSTCOMB.Add("05555550", 4.5); + + HlxScore4.Add("XXUX", 0.8); + HlxScore4.Add("XZOX", 0.8); + HlxScore4.Add("XUXX", 0.8); + HlxScore4.Add("XXOX", 0.7); + HlxScore4.Add("XOXX", 0.7); + HlxScore4.Add("XZUX", 0.7); + HlxScore4.Add("XXOZ", 0.7); + HlxScore4.Add("ZXOX", 0.7); + HlxScore4.Add("XOZZ", 0.7); + HlxScore4.Add("ZOXX", 0.7); + HlxScore4.Add("ZOZX", 0.7); + HlxScore4.Add("ZUXX", 0.7); + HlxScore4.Add("ZXUX", 0.5); + HlxScore4.Add("XOZX", 0.5); + HlxScore4.Add("XZOZ", 0.5); + HlxScore4.Add("XUZX", 0.5); + HlxScore4.Add("ZZOX", 0.2); + HlxScore4.Add("ZXOZ", 0.2); + HlxScore4.Add("ZOXZ", 0.2); + HlxScore4.Add("XOXZ", 0.2); + HlxScore4.Add("ZZUZ", 0.2); + HlxScore4.Add("XUXZ", 0.2); + HlxScore4.Add("ZUXZ", 0.2); + HlxScore4.Add("XZUZ", 0.2); + HlxScore4.Add("XUZZ", 0.2); + HlxScore4.Add("ZXUZ", 0.2); + HlxScore4.Add("ZOZZ", 0.2); + HlxScore4.Add("ZZOZ", 0.2); + HlxScore4.Add("ZZUX", 0.2); + HlxScore4.Add("ZUZX", 0.2); + HlxScore4.Add("XXUZ", 0.2); + HlxScore4.Add("ZUZZ", 0.2); + + HlxScore5.Add("XXOXX", 3.75); + HlxScore5.Add("XXOXZ", 3.75); + HlxScore5.Add("XXOZX", 3.75); + HlxScore5.Add("XZOXX", 3.75); + HlxScore5.Add("ZXOXX", 3.75); + HlxScore5.Add("XXOZZ", 2.7); + HlxScore5.Add("XZOXZ", 2.7); + HlxScore5.Add("XZOZX", 2.7); + HlxScore5.Add("ZXOXZ", 2.7); + HlxScore5.Add("ZXOZX", 2.7); + HlxScore5.Add("ZZOXX", 2.7); + HlxScore5.Add("ZXOZZ", 1.3); + HlxScore5.Add("XZOZZ", 1.3); + HlxScore5.Add("ZZOXZ", 1.3); + HlxScore5.Add("ZZOZX", 1.3); + HlxScore5.Add("ZZOZZ", 1.3); + HlxScore5.Add("XXUXX", 3.75); + HlxScore5.Add("XXUXZ", 3.75); + HlxScore5.Add("XXUZX", 3.75); + HlxScore5.Add("XZUXX", 3.75); + HlxScore5.Add("ZXUXX", 3.75); + HlxScore5.Add("XXUZZ", 1.1); + HlxScore5.Add("XZUXZ", 1.1); + HlxScore5.Add("XZUZX", 1.1); + HlxScore5.Add("ZXUZX", 1.1); + HlxScore5.Add("ZXUXZ", 1.1); + HlxScore5.Add("ZZUXX", 1.1); + 
HlxScore5.Add("XZUZZ", 1.3); + HlxScore5.Add("ZXUZZ", 1.3); + HlxScore5.Add("ZZUXZ", 1.3); + HlxScore5.Add("ZZUZX", 1.3); + HlxScore5.Add("ZZUZZ", 1.3); + HlxScore5.Add("XXOOX", 1.25); + HlxScore5.Add("ZXOOX", 1.25); + HlxScore5.Add("XZOOX", 1.25); + HlxScore5.Add("XOOXX", 1.25); + HlxScore5.Add("XOOXZ", 1.25); + HlxScore5.Add("XOOZX", 1.25); + HlxScore5.Add("XXOOZ", 1.25); + HlxScore5.Add("ZXOOZ", 1.25); + HlxScore5.Add("XZOOZ", 1.25); + HlxScore5.Add("ZZOOX", 1.25); + HlxScore5.Add("ZZOOZ", 1.25); + HlxScore5.Add("ZOOXX", 1.25); + HlxScore5.Add("ZOOXZ", 1.25); + HlxScore5.Add("ZOOZX", 1.25); + HlxScore5.Add("XOOZZ", 1.25); + HlxScore5.Add("ZOOZZ", 1.25); + HlxScore5.Add("XXOUX", 1.25); + HlxScore5.Add("ZXOUX", 1.25); + HlxScore5.Add("XXUOX", 1.25); + HlxScore5.Add("ZXUOX", 1.25); + HlxScore5.Add("XOUXX", 1.25); + HlxScore5.Add("XOUXZ", 1.25); + HlxScore5.Add("XUOXX", 1.25); + HlxScore5.Add("XUOXZ", 1.25); + HlxScore5.Add("XXOUZ", 0.75); + HlxScore5.Add("ZXOUZ", 0.75); + HlxScore5.Add("XZOUX", 0.75); + HlxScore5.Add("XZOUZ", 0.75); + HlxScore5.Add("ZZOUX", 0.75); + HlxScore5.Add("ZZOUZ", 0.75); + HlxScore5.Add("XXUOZ", 0.75); + HlxScore5.Add("ZXUOZ", 0.75); + HlxScore5.Add("XZUOX", 0.75); + HlxScore5.Add("XZUOZ", 0.75); + HlxScore5.Add("ZZUOX", 0.75); + HlxScore5.Add("ZZUOZ", 0.75); + HlxScore5.Add("ZOUXX", 0.75); + HlxScore5.Add("ZOUXZ", 0.75); + HlxScore5.Add("XOUZX", 0.75); + HlxScore5.Add("ZOUZX", 0.75); + HlxScore5.Add("XOUZZ", 0.75); + HlxScore5.Add("ZOUZZ", 0.75); + HlxScore5.Add("ZUOXX", 0.75); + HlxScore5.Add("ZUOXZ", 0.75); + HlxScore5.Add("XUOZX", 0.75); + HlxScore5.Add("ZUOZX", 0.75); + HlxScore5.Add("XUOZZ", 0.75); + HlxScore5.Add("ZUOZZ", 0.75); + HlxScore5.Add("XUUXX", 1.25); + HlxScore5.Add("XXUUX", 1.25); + HlxScore5.Add("XXUUZ", 0.6); + HlxScore5.Add("ZXUUX", 0.6); + HlxScore5.Add("ZXUUZ", 0.6); + HlxScore5.Add("XZUUX", 0.6); + HlxScore5.Add("XZUUZ", 0.6); + HlxScore5.Add("ZZUUX", 0.6); + HlxScore5.Add("ZZUUZ", 0.6); + HlxScore5.Add("ZUUXX", 
0.6); + HlxScore5.Add("XUUXZ", 0.6); + HlxScore5.Add("ZUUXZ", 0.6); + HlxScore5.Add("XUUZX", 0.6); + HlxScore5.Add("ZUUZX", 0.6); + HlxScore5.Add("XUUZZ", 0.6); + HlxScore5.Add("ZUUZZ", 0.6); + + HlxScore6.Add("XXOOXX", 3.0); + HlxScore6.Add("XXOOXZ", 3.0); + HlxScore6.Add("ZXOOXX", 3.0); + HlxScore6.Add("ZXOOXZ", 3.0); + HlxScore6.Add("XXOUXX", 3.0); + HlxScore6.Add("XXOUXZ", 3.0); + HlxScore6.Add("XXUOXX", 3.0); + HlxScore6.Add("XXUOXZ", 3.0); + HlxScore6.Add("ZXUOXX", 3.0); + HlxScore6.Add("ZXOUXX", 3.0); + HlxScore6.Add("XXOOZX", 1.6); + HlxScore6.Add("XXOOZZ", 1.6); + HlxScore6.Add("XZOOXX", 1.6); + HlxScore6.Add("XZOOXZ", 1.6); + HlxScore6.Add("XZOOZX", 1.6); + HlxScore6.Add("XZOOZZ", 1.6); + HlxScore6.Add("ZXOOZX", 1.6); + HlxScore6.Add("ZXOOZZ", 1.6); + HlxScore6.Add("ZZOOXX", 1.6); + HlxScore6.Add("ZZOOXZ", 1.6); + HlxScore6.Add("ZXOUXZ", 1.6); + HlxScore6.Add("XZUOXX", 1.6); + HlxScore6.Add("ZXUOXZ", 1.6); + HlxScore6.Add("ZZOOZX", 1.5); + HlxScore6.Add("ZZOOZZ", 1.5); + HlxScore6.Add("XXOUZX", 1.5); + HlxScore6.Add("XXOUZZ", 1.5); + HlxScore6.Add("XZOUXX", 1.5); + HlxScore6.Add("XZOUXZ", 1.5); + HlxScore6.Add("ZXOUZX", 1.5); + HlxScore6.Add("ZXOUZZ", 1.5); + HlxScore6.Add("ZZOUXX", 1.5); + HlxScore6.Add("ZZOUXZ", 1.5); + HlxScore6.Add("XXUOZX", 1.5); + HlxScore6.Add("XXUOZZ", 1.5); + HlxScore6.Add("XZUOXZ", 1.5); + HlxScore6.Add("ZXUOZX", 1.5); + HlxScore6.Add("ZXUOZZ", 1.5); + HlxScore6.Add("ZZUOXX", 1.5); + HlxScore6.Add("ZZUOXZ", 1.5); + HlxScore6.Add("ZZUOZX", 1.25); + HlxScore6.Add("ZZUOZZ", 1.25); + HlxScore6.Add("ZZOUZX", 1.25); + HlxScore6.Add("ZZOUZZ", 1.25); + HlxScore6.Add("XZOUZX", 1.25); + HlxScore6.Add("XZOUZZ", 1.25); + HlxScore6.Add("XZUOZX", 1.25); + HlxScore6.Add("XZUOZZ", 1.25); + HlxScore6.Add("XXUUXX", 1.25); + HlxScore6.Add("XXUUXZ", 1.25); + HlxScore6.Add("ZXUUXX", 1.25); + HlxScore6.Add("XXUUZX", 1.25); + HlxScore6.Add("XXUUZZ", 1.25); + HlxScore6.Add("XZUUXX", 1.25); + HlxScore6.Add("XZUUXZ", 1.25); + HlxScore6.Add("XZUUZX", 
0.75); + HlxScore6.Add("XZUUZZ", 0.75); + HlxScore6.Add("ZXUUXZ", 1.25); + HlxScore6.Add("ZXUUZX", 1.25); + HlxScore6.Add("ZXUUZZ", 1.25); + HlxScore6.Add("ZZUUXX", 1.25); + HlxScore6.Add("ZZUUXZ", 1.25); + HlxScore6.Add("ZZUUZX", 0.75); + HlxScore6.Add("ZZUUZZ", 0.75); + // ReSharper restore NonLocalizedString + + // populate eMap + for (int i = 0; i < EMap.Length; i++) + { + EMap[i] = -1; //default + } + EMap['K'] = 0; + EMap['R'] = 1; + EMap['H'] = 2; + EMap['D'] = 3; + EMap['E'] = 4; + EMap['C'] = 5; + EMap['Y'] = 6; + } + + public enum Column { A300, A100 } + + public AAParams[] AAPARAMS = new AAParams[128]; + + public SSRCalc3(string name, Column column) + { + Name = name; + + AAParams NULLPARAM = new AAParams(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + for (int i = 0; i < AAPARAMS.Length; i++) + { + AAPARAMS[i] = NULLPARAM; + } + + switch (column) + { + case Column.A300: + A300Column(); + break; + case Column.A100: + A100Column(); + break; + } + } + + public string Name { get; private set; } + + private void A300Column() + { + // a | Weights for reg peptide | weights for short peptide | | | iso-elec vals | heli2 + // a | RC | RC1 | RC2 | RN | RN-1 | RCs | RC1s | RC2s | RNs |RN-1s| krh | mass | Ctrm| Ntrm| pk1 | bsc| cmu + AAPARAMS['A'] = new AAParams(01.10, 00.35, 00.50, 00.80, -0.10, 00.80, -0.30, 00.10, 00.80, -0.50, 00.00, 071.0370, 3.55, 7.59, 00.00, 1.0, 1.2); + AAPARAMS['C'] = new AAParams(00.45, 00.90, 00.20, -0.80, -0.50, 00.50, 00.40, 00.00, -0.80, -0.50, 00.00, 103.0090, 3.55, 7.50, 00.00, 0.0, 1.0); + AAPARAMS['D'] = new AAParams(00.15, 00.50, 00.40, -0.50, -0.50, 00.30, 00.30, 00.70, -0.50, -0.50, 00.00, 115.0270, 4.55, 7.50, 04.05, 0.0, 1.1); + AAPARAMS['E'] = new AAParams(00.95, 01.00, 00.00, 00.00, -0.10, 00.50, 00.10, 00.00, 00.00, -0.10, 00.00, 129.0430, 4.75, 7.70, 04.45, 0.0, 1.1); + AAPARAMS['F'] = new AAParams(10.90, 07.50, 09.50, 10.50, 10.30, 11.10, 08.10, 09.50, 10.50, 10.30, -0.10, 147.0638, 3.55, 7.50, 00.00, 0.5, 1.0); + 
AAPARAMS['G'] = new AAParams(-0.35, 00.20, 00.15, -0.90, -0.70, 00.00, 00.00, 00.10, -0.90, -0.70, 00.00, 057.0210, 3.55, 7.50, 00.00, 0.0, 0.3); + AAPARAMS['H'] = new AAParams(-1.45, -0.10, -0.20, -1.30, -1.70, -1.00, 00.10, -0.20, -1.30, -1.70, 00.00, 137.0590, 3.55, 7.50, 05.98, 0.0, 0.6); + AAPARAMS['I'] = new AAParams(08.00, 05.20, 06.60, 08.40, 07.70, 07.70, 05.00, 06.80, 08.40, 07.70, 00.15, 113.0840, 3.55, 7.50, 00.00, 3.5, 1.4); + AAPARAMS['K'] = new AAParams(-2.05, -0.60, -1.50, -1.90, -1.45, -0.20, -1.40, -1.30, -2.20, -1.45, 00.00, 128.0950, 3.55, 7.50, 10.00, 0.0, 1.0); + AAPARAMS['L'] = new AAParams(09.30, 05.55, 07.40, 09.60, 09.30, 09.20, 06.00, 07.90, 09.60, 08.70, 00.30, 113.0840, 3.55, 7.50, 00.00, 1.6, 1.6); + AAPARAMS['M'] = new AAParams(06.20, 04.40, 05.70, 05.80, 06.00, 06.20, 05.00, 05.70, 05.80, 06.00, 00.00, 131.0400, 3.55, 7.00, 00.00, 1.8, 1.0); + AAPARAMS['N'] = new AAParams(-0.85, 00.20, -0.20, -1.20, -1.10, -0.85, 00.20, -0.20, -1.20, -1.10, 00.00, 114.0430, 3.55, 7.50, 00.00, 0.0, 0.4); + AAPARAMS['P'] = new AAParams(02.10, 02.10, 02.10, 00.20, 02.10, 03.00, 01.00, 01.50, 00.20, 02.10, 00.00, 097.0530, 3.55, 8.36, 00.00, 0.0, 0.3); + AAPARAMS['Q'] = new AAParams(-0.40, -0.70, -0.20, -0.90, -1.10, -0.40, -0.80, -0.20, -0.90, -1.10, 00.00, 128.0590, 3.55, 7.50, 00.00, 0.0, 1.0); + AAPARAMS['R'] = new AAParams(-1.40, 00.50, -1.10, -1.30, -1.10, -0.20, 00.50, -1.10, -1.20, -1.10, 00.00, 156.1010, 3.55, 7.50, 12.00, 0.0, 1.0); + AAPARAMS['S'] = new AAParams(-0.15, 00.80, -0.10, -0.80, -1.20, -0.50, 00.40, 00.10, -0.80, -1.20, 00.00, 087.0320, 3.55, 6.93, 00.00, 0.0, 1.0); + AAPARAMS['T'] = new AAParams(00.65, 00.80, 00.60, 00.40, 00.00, 00.60, 00.80, 00.40, 00.40, 00.00, 00.00, 101.0480, 3.55, 6.82, 00.00, 0.0, 1.0); + AAPARAMS['V'] = new AAParams(05.00, 02.90, 03.40, 05.00, 04.20, 05.10, 02.70, 03.40, 05.00, 04.20, -0.30, 099.0680, 3.55, 7.44, 00.00, 1.4, 1.2); + AAPARAMS['W'] = new AAParams(12.25, 11.10, 11.80, 11.00, 12.10, 12.40, 
11.60, 11.80, 11.00, 12.10, 00.15, 186.0790, 3.55, 7.50, 00.00, 1.6, 1.0); + AAPARAMS['Y'] = new AAParams(04.85, 03.70, 04.50, 04.00, 04.40, 05.10, 04.20, 04.50, 04.00, 04.40, -0.20, 163.0630, 3.55, 7.50, 10.00, 0.2, 1.0); + + AAPARAMS['B'] = new AAParams(00.15, 00.50, 00.40, -0.50, -0.50, 00.30, 00.30, 00.70, -0.50, -0.50, 00.00, 115.0270, 4.55, 7.50, 04.05, 0.0, 1.1); //? + AAPARAMS['X'] = new AAParams(00.00, 00.00, 00.00, 00.00, 00.00, 00.00, 00.00, 00.00, 00.00, 00.00, 00.00, 000.0000, 0.00, 0.00, 00.00, 0.0, 1.0); //? + AAPARAMS['Z'] = new AAParams(00.95, 01.00, 00.00, 00.00, -0.10, 00.50, 00.10, 00.00, 00.00, -0.10, 00.00, 129.0430, 4.75, 7.70, 04.45, 0.0, 1.1); //? + } + + // Note: The 100 A version is not yet verified. + private void A100Column() + { + // a | Weights for reg peptide | weights for short peptide | | | iso-elec vals | heli2 + // a | RC | RC1 | RC2 | RN | RN-1 | RCs | RC1s | RC2s | RNs |RN-1s| krh | mass | Ctrm| Ntrm| pk1 | bsc| cmu + AAPARAMS['A'] = new AAParams(01.02, -0.35, 00.35, 01.02, -0.20, 00.50, -0.05, 00.10, 00.50, -0.30, 00.00, 071.0370, 3.55, 7.59, 00.00, 1.0, 1.2); + AAPARAMS['C'] = new AAParams(00.10, 00.40, 00.20, 00.10, -0.40, 00.60, 00.60, 01.00, 00.60, -0.50, 00.00, 103.0090, 3.55, 7.50, 00.00, 0.0, 1.0); + AAPARAMS['D'] = new AAParams(00.15, 00.90, 00.60, 00.15, -0.40, 00.60, 00.30, 00.20, 00.60, -0.50, 00.00, 115.0270, 4.55, 7.50, 04.05, 0.0, 1.1); + AAPARAMS['E'] = new AAParams(01.00, 01.00, -0.20, 01.00, -0.10, 00.70, 00.45, 00.50, 00.00, 00.25, 00.00, 129.0430, 4.75, 7.70, 04.45, 0.0, 1.1); + AAPARAMS['F'] = new AAParams(11.67, 07.60, 09.70, 11.67, 11.50, 11.30, 08.40, 10.00, 11.30, 10.85, -0.10, 147.0638, 3.55, 7.50, 00.00, 0.5, 1.0); + AAPARAMS['G'] = new AAParams(-0.35, 00.15, 00.15, -0.35, -0.40, 00.00, 00.15, 00.20, 00.00, -0.70, 00.00, 057.0210, 3.55, 7.50, 00.00, 0.0, 0.3); + AAPARAMS['H'] = new AAParams(-3.00, -1.40, -1.00, -3.00, -1.90, -1.30, -1.30, -1.10, -1.30, -1.70, 00.00, 137.0590, 3.55, 7.50, 05.98, 0.0, 
0.6); + AAPARAMS['I'] = new AAParams(07.96, 04.95, 06.30, 07.96, 06.60, 07.25, 04.50, 06.50, 07.25, 07.20, 00.15, 113.0840, 3.55, 7.50, 00.00, 3.5, 1.4); + AAPARAMS['K'] = new AAParams(-3.40, -1.85, -2.30, -2.10, -2.10, -1.75, -1.50, -1.75, -2.30, -2.50, 00.00, 128.0950, 3.55, 7.50, 10.00, 0.0, 1.0); + AAPARAMS['L'] = new AAParams(09.40, 05.57, 07.40, 09.40, 09.30, 08.70, 05.50, 07.70, 08.70, 08.50, 00.30, 113.0840, 3.55, 7.50, 00.00, 1.6, 1.6); + AAPARAMS['M'] = new AAParams(06.27, 05.20, 05.70, 06.27, 05.80, 06.25, 04.20, 05.70, 06.25, 05.60, 00.00, 131.0400, 3.55, 7.00, 00.00, 1.8, 1.0); + AAPARAMS['N'] = new AAParams(-0.95, 01.20, -0.10, -0.95, -1.30, -0.65, 00.40, -0.05, -0.65, -1.20, 00.00, 114.0430, 3.55, 7.50, 00.00, 0.0, 0.4); + AAPARAMS['P'] = new AAParams(01.85, 01.70, 01.75, 01.85, 01.20, 02.50, 01.70, 02.10, 02.50, 01.90, 00.00, 097.0530, 3.55, 8.36, 00.00, 0.0, 0.3); + AAPARAMS['Q'] = new AAParams(-0.60, -0.50, -0.20, -0.60, -1.10, -0.40, -0.20, -0.70, -0.40, -1.30, 00.00, 128.0590, 3.55, 7.50, 00.00, 0.0, 1.0); + AAPARAMS['R'] = new AAParams(-2.55, -1.40, -1.50, -1.10, -1.30, -1.00, 00.40, -1.00, -1.10, -1.90, 00.00, 156.1010, 3.55, 7.50, 12.00, 0.0, 1.0); + AAPARAMS['S'] = new AAParams(-0.14, 01.10, -0.10, -0.14, -1.00, -0.40, 00.20, -0.30, -0.40, -1.20, 00.00, 087.0320, 3.55, 6.93, 00.00, 0.0, 1.0); + AAPARAMS['T'] = new AAParams(00.64, 00.95, 00.60, 00.64, -0.10, 00.40, 00.30, 00.40, 00.40, -0.50, 00.00, 101.0480, 3.55, 6.82, 00.00, 0.0, 1.0); + AAPARAMS['V'] = new AAParams(04.68, 02.10, 03.40, 04.68, 03.90, 04.40, 02.10, 03.00, 04.40, 04.40, -0.30, 099.0680, 3.55, 7.44, 00.00, 1.4, 1.2); + AAPARAMS['W'] = new AAParams(13.35, 11.50, 11.80, 13.35, 13.00, 13.90, 11.80, 13.00, 13.90, 12.90, 00.15, 186.0790, 3.55, 7.50, 00.00, 1.6, 1.0); + AAPARAMS['Y'] = new AAParams(05.35, 04.30, 05.10, 05.35, 05.00, 05.70, 05.00, 05.40, 05.70, 05.30, -0.20, 163.0630, 3.55, 7.50, 10.00, 0.2, 1.0); + + AAPARAMS['B'] = new AAParams(00.15, 00.50, 00.40, -0.50, -0.50, 
00.30, 00.30, 00.70, -0.50, -0.50, 00.00, 115.0270, 4.55, 7.50, 04.05, 0.0, 1.1); //? + AAPARAMS['X'] = new AAParams(00.00, 00.00, 00.00, 00.00, 00.00, 00.00, 00.00, 00.00, 00.00, 00.00, 00.00, 000.0000, 0.00, 0.00, 00.00, 0.0, 1.0); //? + AAPARAMS['Z'] = new AAParams(00.95, 01.00, 00.00, 00.00, -0.10, 00.50, 00.10, 00.00, 00.00, -0.10, 00.00, 129.0430, 4.75, 7.70, 04.45, 0.0, 1.1); //? + } + + // control variables, 0 means leaving them ON, 1 means turning them OFF + // Translator1 note: Some day these may be turned into options. For the + // time being they are unchanging, and the tests for them in each function + // are superfluous and absurd. + // Translator2 note: To avoid warnings on unreachable code, these were changed + // to auto-implemented properties, which means they can now be set. + + public int NOELECTRIC { get; set; } + public int NOCLUSTER { get; set; } + public int NODIGEST { get; set; } + public int NOSMALL { get; set; } + public int NOHELIX1 { get; set; } + public int NOHELIX2 { get; set; } + public int NOEHEL { get; set; } + + //Translator1 note: This constant controls whether "bugs" in the original + //perl code are maintained. A conversation with the developers has revealed + //that the constant data in the static initialization blocks has been "tuned" + //to the algorithm in its undebugged state. In other words, using a correct + //algorithm would invalidate the results. + private const bool DUPLICATE_ORIGINAL_CODE = true; + //Translator1 note: Some code is supposed to be executed only when + // $SSRCVERSION==3. SSRCVERSION was commented out in my version of the perl + // code. This may need some reworking. Speaking with the developers, it + // was determined that it ought not to have been commented out. 
So -- + // ALGORITHM_VERSION may be used to choose the older or newer code + private const int ALGORITHM_VERSION = 3; + + // Length Scaling length limits and scaling factors + private const int LPLim = 20; + private const int SPLim = 8; + private const double LPSFac = 0.0270; + private const double SPSFac = -0.055; + + // UnDigested (missed cuts) scaling Factors + private const double UDF21 = 0.0, UDF22 = 0.0; // rightmost + private const double UDF31 = 1.0, UDF32 = 0.0; // inside string + + // total correction values, 20..30 / 30..40 / 40..50 /50..500 + private const double SUMSCALE1 = 0.27, SUMSCALE2 = 0.33, SUMSCALE3 = 0.38, SUMSCALE4 = 0.447; + + // clusterness scaling: i.e. weight to give cluster correction. + private const double KSCALE = 0.4; + + // isoelectric scaling factors + private const double Z01 = -0.03, Z02 = 0.60, NDELTAWT = 0.8; // negative delta values + private const double Z03 = 0.00, Z04 = 0.00, PDELTAWT = 1.0; // positive delta values + + // proline chain scores + private const double PPSCORE = 1.2, PPPSCORE = 3.5, PPPPSCORE = 5.0; + + // helix scaling factors + private const double HELIX1SCALE = 1.6, HELIX2SCALE = 0.255; + + /// + /// No such thing as an unkown score for this calculator. ScoreSequence + /// always returns a value. 
+ /// + public double UnknownScore + { + get { return 0; } + } + + public double ScoreSequence(PeptideWithSetModifications item) + { + var seq = item.BaseSequence; //PTMs are not yet implemented + double tsum3 = 0.0; + int i; + + // Core summation + + int sze = seq.Length; + if (sze < 4) // peptide is too short ot have any retention + { + return tsum3; + } + if (sze < 10) // short peptides use short peptide retention weights + { + tsum3 = + AAPARAMS[seq[0]].RC1S + // Sum weights for 1st + AAPARAMS[seq[1]].RC2S + // second, + AAPARAMS[seq[sze - 1]].RCNS + // ultimate + AAPARAMS[seq[sze - 2]].RCN2S; // and penultimate aa + + for (i = 2; i < sze - 2; i++) // add weights for aa's in the middle + { + tsum3 += AAPARAMS[seq[i]].RCS; + } + } + else // longer peptides use regular retention weights + { + tsum3 = + AAPARAMS[seq[0]].RC1 + // Sum weights for 1st + AAPARAMS[seq[1]].RC2 + // second, + AAPARAMS[seq[sze - 1]].RCN + // ultimate + AAPARAMS[seq[sze - 2]].RCN2; // and penultimate aa + + for (i = 2; i < sze - 2; i++) // add weights for aa's in the middle + { + tsum3 += AAPARAMS[seq[i]].RC; + } + } + //_log.debug("Core = "+tsum3); + + // 1- smallness - adjust based on tsum score of peptides shorter than 20 aa's. + tsum3 += Smallness(sze, tsum3); + //_log.debug("smallness = "+tsum3); + // 2- undigested parts + tsum3 -= Undigested(seq); + //_log.debug("undigested = "+tsum3); + // 3- clusterness # NB:weighting of v1 is now done in subrtn. 
+ tsum3 -= Clusterness(seq); + //_log.debug("clusterness = "+tsum3); + // 4- proline fix + tsum3 -= Proline(seq); + //_log.debug("proline = "+tsum3); + // 5- length scaling correction + tsum3 *= Length_scale(sze); + //_log.debug("length_scale = "+tsum3); + // 6- total sum correction + if (tsum3 >= 20 && tsum3 < 30) tsum3 -= ((tsum3 - 18) * SUMSCALE1); + if (tsum3 >= 30 && tsum3 < 40) tsum3 -= ((tsum3 - 18) * SUMSCALE2); + if (tsum3 >= 40 && tsum3 < 50) tsum3 -= ((tsum3 - 18) * SUMSCALE3); + if (tsum3 >= 50) tsum3 -= ((tsum3 - 18) * SUMSCALE4); + //_log.debug("total sum = "+tsum3); + // 7- isoelectric change + tsum3 += NewIso(seq, tsum3); + //_log.debug("isoelectric = "+tsum3); + // 8- helicity corrections #NB: HELIX#SCALE-ing is now done in subrtn. + tsum3 += Helicity1(seq); + //_log.debug("helicity1 = "+tsum3); + tsum3 += Helicity2(seq); + //_log.debug("helicity2 = "+tsum3); + tsum3 += Helectric(seq); + //_log.debug("helectric = "+tsum3); + return tsum3; + } + + private double Smallness(int sqlen, double tsum) + { + if (NOSMALL == 1) + { + return 0.0; + } + if (sqlen < 20 && (tsum / sqlen) < 0.9) + { + return 3.5 * (0.9 - (tsum / sqlen)); + } + if (sqlen < 15 && (tsum / sqlen) > 2.8) + { + return 2.6 * ((tsum / sqlen) - 2.8); + } + return 0.0; + } + + private double Undigested(String sq) + { + if (NODIGEST == 1) + return 0.0; + + char op1, op2; + + int xx = sq.Length - 1; + char re = sq[xx]; + double csum = 0.0; + + // rightmost + if (re == 'R' || re == 'K' || re == 'H') + { + op1 = sq[xx - 1]; // left by 1 + op2 = sq[xx - 2]; // left by 2 + csum = UDF21 * AAPARAMS[op1].UndKRH + UDF22 * AAPARAMS[op2].UndKRH; + } + // scan through string, starting at second and ending two before left + // --Translator1 note: + // the perl code does not jibe with the comment above, and will probably need repair + // possibly dd should start out as 2, not 0; and should loop to xx-2, not xx. 
+ + // Negative indices on the perl substr function make substrings offset from right + // (instead of left) end of string. The perl loop gets negative indices. This may be + // a problem. + for (int dd = 0; dd < xx; dd++) + { + re = sq[dd]; + if (re == 'K' || re == 'R' || re == 'H') + { + char op3, op4; + op1 = op2 = op3 = op4 = '\0'; + if (dd - 1 >= 0 && dd - 1 <= xx) + op1 = sq[dd - 1]; //left by 1 + if (dd - 2 >= 0 && dd - 2 <= xx) + op2 = sq[dd - 2]; //left by 2 + // ReSharper disable ConditionIsAlwaysTrueOrFalse + if (DUPLICATE_ORIGINAL_CODE) + // ReSharper restore ConditionIsAlwaysTrueOrFalse + { + if (dd - 1 < 0 && (-(dd - 1)) <= xx) + op1 = sq[xx + (dd - 1) + 1]; + if (dd - 2 < 0 && (-(dd - 2)) <= xx) + op2 = sq[xx + (dd - 2) + 1]; + } + if (dd + 1 >= 0 && dd + 1 <= xx) + op3 = sq[dd + 1]; //right by 1 + if (dd + 2 >= 0 && dd + 2 <= xx) + op4 = sq[dd + 2]; //right by 2; + + csum = csum + + (UDF31 * (AAPARAMS[op1].UndKRH + AAPARAMS[op3].UndKRH)) + + (UDF32 * (AAPARAMS[op2].UndKRH + AAPARAMS[op4].UndKRH)); + } + } + return csum; + } + + // ============================================================ + // compute clusterness of a string - v 2,3 algorithm + // code W,L,F,I as 5 + // code M,Y,V as 1 + // code all others as 0 + + private double Clusterness(String sq) + { + if (NOCLUSTER == 1) + return 0.0; + + string cc = "0" + sq + "0"; // Not L10N + // ReSharper disable ConditionIsAlwaysTrueOrFalse + if (ALGORITHM_VERSION == 3) + // ReSharper restore ConditionIsAlwaysTrueOrFalse + { + cc = cc.ReplaceAAs("LIW", "5"); // Not L10N + cc = cc.ReplaceAAs("AMYV", "1"); // Not L10N + cc = cc.ReplaceAAs("A-Z", "0"); // Not L10N + } + else + // Suppress the unreachable code warning +#pragma warning disable 162 + // ReSharper disable HeuristicUnreachableCode + { + cc = cc.ReplaceAAs("LIWF", "5"); // Not L10N + cc = cc.ReplaceAAs("MYV", "1"); // Not L10N + cc = cc.ReplaceAAs("A-Z", "0"); // Not L10N + } + // ReSharper restore HeuristicUnreachableCode +#pragma warning 
restore 162 + + double score = 0.0; + // + // Translator1 note: check on true meaning of the algorithm that defines 'occurs' + // Should an encoded aa string such as 015101510 match pick "01510" once or twice? + // The perl code seems to match once. 0151001510 would match twice. + + foreach (var pair in CLUSTCOMB) + { + int occurs = 0; + Match m = pair.Key.Match(cc); + while (m.Success) + { + occurs++; + m = m.NextMatch(); + } + if (occurs > 0) + { + double sk = pair.Value; + double addit = sk * occurs; + score += addit; + } + } + return score * KSCALE; + } + + // ============================================================ + // process based on proline - v 2,3 algorithm + private static double Proline(String sq) + { + if (sq.Contains("PPPP")) // Not L10N + { + return PPPPSCORE; + } + else if (sq.Contains("PPP")) // Not L10N + { + return PPPSCORE; + } + else if (sq.Contains("PP")) // Not L10N + { + return PPSCORE; + } + else + { + return 0.0; + } + } + + // ============================================================ + // scaling based on length - v 1,2,3 algorithms + private static double Length_scale(int sqlen) + { + if (sqlen < SPLim) + { + return 1.0 + SPSFac * (SPLim - sqlen); + } + else if (sqlen > LPLim) + { + return 1.0 / (1.0 + LPSFac * (sqlen - LPLim)); + } + else + { + return 1.0; + } + } + + // ============================================================ + // compute partial charge - v 2,3 algorithms + private static double Partial_charge(double pK, double pH) + { + double cr = Math.Pow(10.0, (pK - pH)); + return cr / (cr + 1.0); + } + + // ============================================================ + // - v 2,3 algorithms + private double Electric(String sq) + { + int[] aaCNT = { 0, 0, 0, 0, 0, 0, 0 }; + + // Translator1 Note: this is commented out in the perl source + // if (NOELECTRIC == 1) { return 1.0; } + + // get c and n terminus acids + int ss = sq.Length; + char s1 = sq[0]; + char s2 = sq[ss - 1]; + double pk0 = AAPARAMS[s1].CT; + double pk1 = 
AAPARAMS[s2].NT; + + // count them up + for (int i = 0; i < ss; i++) + { + int index = EMap[sq[i]]; + if (index >= 0) + { + aaCNT[index]++; + } + } + + // cycle through pH values looking for closest to zero + // coarse pass + double best = 0.0; double min = 100000; const double step1 = 0.3; + + for (double z = 0.01; z <= 14.0; z = z + step1) + { + double check = CalcR(z, pk0, pk1, aaCNT); + if (check < 0) + check = 0 - check; + if (check < min) + { + min = check; + best = z; + } + } + + double best1 = best; + + // fine pass + min = 100000; + for (double z = best1 - step1; z <= best1 + step1; z = z + 0.01) + { + double check = CalcR(z, pk0, pk1, aaCNT); + if (check < 0) + check = 0 - check; + if (check < min) + { + min = check; + best = z; + } + } + return best; + } + + // ============================================================ + // compute R - v 2,3 algorithms + private double CalcR(double pH, double PK0, double PK1, int[] CNTref) + { + double cr0 = + Partial_charge(PK0, pH) // n terminus + + CNTref[EMap['K']] * Partial_charge(AAPARAMS['K'].PK, pH) // lys // Not L10N + + CNTref[EMap['R']] * Partial_charge(AAPARAMS['R'].PK, pH) // arg // Not L10N + + CNTref[EMap['H']] * Partial_charge(AAPARAMS['H'].PK, pH) // his // Not L10N + - CNTref[EMap['D']] * Partial_charge(pH, AAPARAMS['D'].PK) // asp // Not L10N + - CNTref[EMap['E']] * Partial_charge(pH, AAPARAMS['E'].PK) // glu // Not L10N + - CNTref[EMap['Y']] * Partial_charge(pH, AAPARAMS['Y'].PK) // tyr // Not L10N + - Partial_charge(pH, PK1); // c terminus + /* + // The following was taken out of the formula for R + // - $CNTref->{C} * _partial_charge( $pH, $PK{C} ) // cys + */ + return cr0; + } + + private double NewIso(string sq, double tsum) + { + if (NOELECTRIC == 1) + return 0.0; + + // compute mass + double mass = 0.0; + foreach (char cf1 in sq) + { + mass += AAPARAMS[cf1].AMASS; + } + // compute isoelectric value + double pi1 = Electric(sq); + double lmass = 1.8014 * Math.Log(mass); + + // make mass 
correction + double delta1 = pi1 - 19.107 + lmass; + //apply corrected value as scaling factor + + double corr01 = 0.0; + if (delta1 < 0.0) + { + corr01 = (tsum * Z01 + Z02) * NDELTAWT * delta1; + } + else if (delta1 > 0.0) + { + corr01 = (tsum * Z03 + Z04) * PDELTAWT * delta1; + } + return corr01; + } + + // ============================================================ + // called by helicity1 - v 3 algorithm + private static double Heli1TermAdj(string ss1, int ix2, int sqlen) + { + int where = 0; + + for (int i = 0; i < ss1.Length; i++) + { + char m = ss1[i]; + if (m == 'O' || m == 'U') + { + where = i; + // Suppress unreachable code warning +#pragma warning disable 162 + // ReSharper disable ConditionIsAlwaysTrueOrFalse + if (!DUPLICATE_ORIGINAL_CODE) + // ReSharper restore ConditionIsAlwaysTrueOrFalse + // ReSharper disable HeuristicUnreachableCode + break; + // ReSharper restore HeuristicUnreachableCode +#pragma warning restore 162 + } + } + + where += ix2; + + if (where < 2) { return 0.20; } + if (where < 3) { return 0.25; } + if (where < 4) { return 0.45; } + + if (where > sqlen - 3) { return 0.2; } + if (where > sqlen - 4) { return 0.75; } + if (where > sqlen - 5) { return 0.65; } + + return 1.0; + } + + // ============================================================ + // helicity1 adjust for short helices or sections - v 3 algorithm + // + private double Helicity1(string sq) + { + if (NOHELIX1 == 1) + return 0.0; + + string hc = sq; //helicity coded sq + + /* Translator1 note: notice lowercase 'z'. This never appears in any patterns to which this + string is compared, and will never match any helicity patterns. 
+ */ + hc = hc.ReplaceAAs("PHRK", "z"); // Not L10N + hc = hc.ReplaceAAs("WFIL", "X"); // Not L10N + hc = hc.ReplaceAAs("YMVA", "Z"); // Not L10N + hc = hc.ReplaceAAs("DE", "O"); // Not L10N + hc = hc.ReplaceAAs("GSPCNKQHRT", "U"); // Not L10N + + double sum = 0.0; + int sqlen = hc.Length; + + // Translator1 note: this loop should be reviewed carefully + + for (int i = 0; i < sqlen - 3; i++) + { + string hc4 = string.Empty, hc5 = string.Empty, hc6 = string.Empty; + double sc4 = 0.0, sc5 = 0.0, sc6 = 0.0; + + if (hc.Substring(i).Length >= 6) + { + hc6 = hc.Substring(i, 6); + sc6 = 0.0; + if (HlxScore6.ContainsKey(hc6)) + { + sc6 = HlxScore6[hc6]; + } + } + if (sc6 > 0) + { + double trmAdj6 = Heli1TermAdj(hc6, i, sqlen); + sum += (sc6 * trmAdj6); + i = i + 1; //?? + continue; + } + + if (hc.Substring(i).Length >= 5) + { + hc5 = hc.Substring(i, 5); + sc5 = 0.0; + if (HlxScore5.ContainsKey(hc5)) + { + sc5 = HlxScore5[hc5]; + } + } + if (sc5 > 0) + { + double trmAdj5 = Heli1TermAdj(hc5, i, sqlen); + sum += (sc5 * trmAdj5); + i = i + 1; //?? + continue; + } + + if (hc.Substring(i).Length >= 4) + { + hc4 = hc.Substring(i, 4); + sc4 = 0.0; + if (HlxScore4.ContainsKey(hc4)) + { + sc4 = HlxScore4[hc4]; + } + } + if (sc4 > 0) + { + double trmAdj4 = Heli1TermAdj(hc4, i, sqlen); + sum += (sc4 * trmAdj4); + i = i + 1; //?? 
+ } + } + return HELIX1SCALE * sum; + } + + // ============================================================ + // called by heli2calc - v 3 algorithm + private double EvalH2pattern(String pattern, String testsq, int posn, char etype) + { + char f01 = pattern[0]; + double prod1 = AAPARAMS[f01].H2BASCORE; + int iss = 0; + const int OFF1 = 2; + int acount = 1; + char far1 = '\0'; + char far2 = '\0'; + + char testAAl = testsq[OFF1 + posn]; + char testAAr = testsq[OFF1 + posn + 2]; + string testsqCopy = testsq.Substring(OFF1 + posn + 1); + double mult = Connector(f01, testAAl, testAAr, "--", far1, far2); // Not L10N + prod1 = prod1 * mult; + if (etype == '*') // Not L10N + prod1 = prod1 * 25.0; + if (mult == 0.0) + { + return 0.0; + } + for (int i = 1; i < pattern.Length - 2; i = i + 3) + { + string fpart = pattern.Substring(i, 2); + char gpart = (i + 2) < pattern.Length ? pattern[i + 2] : '\0'; // Not L10N + double s3 = AAPARAMS[gpart].H2BASCORE; + if (fpart.Equals("--")) // Not L10N + { + iss = 0; far1 = '\0'; far2 = '\0'; // Not L10N + } + if (fpart.Equals("<-")) // Not L10N + { + iss = 1; far1 = testsqCopy[i + 1]; far2 = '\0'; // Not L10N + } + if (fpart.Equals("->")) // Not L10N + { + iss = -1; far1 = '\0'; far2 = testsqCopy[i + 3]; // Not L10N + } + + testAAl = testsqCopy[i + 1 + iss]; + testAAr = testsqCopy[i + 3 + iss]; + + mult = Connector(gpart, testAAl, testAAr, fpart, far1, far2); + + if (etype == '*') // Not L10N + { + if (mult != 0.0 || acount < 3) + { + prod1 = prod1 * 25.0 * s3 * mult; + } + } + + if (etype == '+') // Not L10N + { + prod1 = prod1 + s3 * mult; + } + + if (mult == 0.0) + { + return prod1; + } + + acount++; + } + return prod1; + } + + // ============================================================ + // called by evalH2pattern - v 3 algorithm + private double Connector(char acid, char lp, char rp, String ct, char far1, char far2) + { + double mult = 1.0; + + if (ct.Contains("<-")) { mult *= 0.2; } // Not L10N + if (ct.Contains("->")) { mult 
*= 0.1; } // Not L10N + + mult *= AAPARAMS[lp].H2CMULT; + if (lp != rp) mult *= AAPARAMS[rp].H2CMULT; + + if (acid == 'A' || acid == 'Y' || acid == 'V' || acid == 'M') // Not L10N + { + if (lp == 'P' || lp == 'G' || rp == 'P' || rp == 'G') mult = 0.0; // Not L10N + if (ct.Contains("->") || ct.Contains("<-")) mult = 0.0; // Not L10N + } + + if (acid == 'L' || acid == 'W' || acid == 'F' || acid == 'I') // Not L10N + { + if (((lp == 'P' || lp == 'G') || (rp == 'P' || rp == 'G')) && (!ct.Contains("--"))) mult = 0.0; // Not L10N + if (((far1 == 'P' || far1 == 'G') || (far2 == 'P' || far2 == 'G')) && (ct.Contains("<-") || ct.Contains("->"))) mult = 0.0; // Not L10N + } + return mult; + } + + private const int HISC = 0; + private const int GSC = 1; + + // ============================================================ + // called by helicity2 - v 3 algorithm + private double[] Heli2Calc(String sq) + { + // Translator1 note: in the original perl and translated C, this function + // was void and returned values through double pointer arguments. Like this: + // + // void heli2Calc(char *sq, double *hisc, double *gsc) + // + + double[] ret = new double[2]; + string traps; //not my()'ed in perl source + string best = string.Empty; + const int llim = 50; + double hiscore = 0.0; + int best_pos = 0; + + if (sq.Length < 11) + { + ret[HISC] = 0.0; + ret[GSC] = 0.0; + return ret; + } + + string prechop = sq; + string sqCopy = sq.Substring(2, sq.Length - 4); + + string pass1 = sqCopy.ReplaceAAs("WFILYMVA", "1"); // Not L10N + pass1 = pass1.ReplaceAAs("GSPCNKQHRTDE", "0"); // Not L10N + + for (int i = 0; i < pass1.Length; i++) + { + char m = pass1[i]; + if (m == '1') // Not L10N + { + string lc = pass1.Substring(i); + string sq2 = sqCopy.Substring(i); + string pat = string.Empty; + int zap = 0; + int subt = 0; + + while (zap <= llim && subt < 2) + { + char f1 = (zap < 0 || zap >= lc.Length ? '0' : lc[zap]); + char f2 = (zap - 1 < 0 || zap - 1 >= lc.Length ? 
'0' : lc[zap - 1]); // Not L10N + char f3 = (zap + 1 < 0 || zap + 1 >= lc.Length ? '0' : lc[zap + 1]); // Not L10N + + if (f1 == '1') // Not L10N + { + if (zap > 0) + pat += "--"; // Not L10N + pat += sq2.Substring(zap, 1); + } + else + { + if (f2 == '1' && f1 == '0') // Not L10N + { + subt++; + if (subt < 2) + { + pat += "->"; // Not L10N + pat += sq2.Substring(zap - 1, 1); + } + } + else + { + if (f3 == '1' && f1 == '0') // Not L10N + { + subt++; + if (subt < 2) + { + pat += "<-"; // Not L10N + pat += sq2.Substring(zap + 1, 1); + } + } + } + } + + if (f1 == '0' && f2 == '0' && f3 == '0') // Not L10N + zap = 1000; + zap += 3; + } + + if (pat.Length > 4) + { + traps = prechop; + double skore = EvalH2pattern(pat, traps, i - 1, '*'); // Not L10N + if (skore >= hiscore) + { + hiscore = skore; + best = pat; + best_pos = i; + } + } + } + } + + if (hiscore > 0.0) + { + double gscore = hiscore; //not my()'ed in perl source + traps = prechop; + hiscore = EvalH2pattern(best, traps, best_pos - 1, '+'); // Not L10N + + ret[HISC] = hiscore; + ret[GSC] = gscore; + return ret; + } + + ret[HISC] = 0.0; + ret[GSC] = 0.0; + return ret; + } + + // ============================================================ + // helicity2 adjust for long helices - v 3 algorithm + private double Helicity2(string sq) + { + if (NOHELIX2 == 1) + return 0.0; + string Bksq = sq.Backwards(); + double[] fhg = Heli2Calc(sq); + double FwHiscor = fhg[HISC]; + double FwGscor = fhg[GSC]; + double[] rhg = Heli2Calc(Bksq); + double BkHiscor = rhg[HISC]; + double BkGscor = rhg[GSC]; + double h2FwBk = BkGscor > FwGscor ? 
BkHiscor : FwHiscor; + double lenMult = 0.0; + if (sq.Length > 30) + { + lenMult = 1; + } + double NoPMult = 0.75; + if (sq.Contains("P")) // Not L10N + NoPMult = 0.0; + double h2mult = 1.0 + lenMult + NoPMult; + return HELIX2SCALE * h2mult * h2FwBk; + } + + private double Helectric(String sq) + { + if (NOEHEL == 1 || sq.Length > 14 || sq.Length < 4) + return 0.0; + string mpart = sq.Substring(sq.Length - 4); + + if (mpart[0] == 'D' || mpart[0] == 'E') // Not L10N + { + mpart = mpart.Substring(1, 2); + if (mpart.ContainsAA("PGKRH")) // Not L10N + return 0.0; + mpart = mpart.ReplaceAAs("LI", "X"); // Not L10N + mpart = mpart.ReplaceAAs("AVYFWM", "Z"); // Not L10N + mpart = mpart.ReplaceAAs("GSPCNKQHRTDE", "U"); // Not L10N + + switch (mpart) + { + // ReSharper disable NonLocalizedString + case "XX": return 1.0; + case "ZX": return 0.5; + case "XZ": return 0.5; + case "ZZ": return 0.4; + case "XU": return 0.4; + case "UX": return 0.4; + case "ZU": return 0.2; + case "UZ": return 0.2; + // ReSharper restore NonLocalizedString + } + } + return 0; + } + + public class AAParams + { + //Retention Factors + public double RC { get; private set; } + public double RC1 { get; private set; } + public double RC2 { get; private set; } + public double RCN { get; private set; } + public double RCN2 { get; private set; } + //Short peptide retention factors + public double RCS { get; private set; } + public double RC1S { get; private set; } + public double RC2S { get; private set; } + public double RCNS { get; private set; } + public double RCN2S { get; private set; } + + public double UndKRH { get; private set; } //Factors for aa's near undigested KRH + public double AMASS { get; private set; } //aa masses in Daltons + //isoelectric factors + public double CT { get; private set; } + public double NT { get; private set; } + public double PK { get; private set; } + //helicity2 bascore & connector multiplier + public double H2BASCORE { get; private set; } + public double H2CMULT { get; 
private set; } + + public AAParams( + double rc, double rc1, double rc2, double rcn, double rcn2, + double rcs, double rc1s, double rc2s, double rcns, double rcn2s, + double undkrh, double amass, + double ct, double nt, double pk, + double h2bascore, double h2cmult + ) + { + RC = rc; + RC1 = rc1; + RC2 = rc2; + RCN = rcn; + RCN2 = rcn2; + RCS = rcs; + RC1S = rc1s; + RC2S = rc2s; + RCNS = rcns; + RCN2S = rcn2s; + UndKRH = undkrh; + AMASS = amass; + CT = ct; + NT = nt; + PK = pk; + H2BASCORE = h2bascore; + H2CMULT = h2cmult; + } + } + /* + * Translator2 note: The code for the Isoparams array was found in + * the Java version, but never used. Referring to the Perl + * version showed that the only place these values were used + * was in the electric_scale() function, which in turn was never + * used. Both the array and function are included here for + * completeness, but commented out, since they are never used. + * + private class Isoparams + { + public double emin { get; private set; } + public double emax { get; private set; } + public double eK { get; private set; } + + public Isoparams(double EMIN, double EMAX, double EK) + { + emin = EMIN; emax = EMAX; eK = EK; + } + } + + private static readonly Isoparams[] ISOPARAMS = new[] + { + new Isoparams(3.8, 4.0, 0.880), + new Isoparams(4.0, 4.2, 0.900), + new Isoparams(4.2, 4.4, 0.920), + new Isoparams(4.4, 4.6, 0.940), + new Isoparams(4.6, 4.8, 0.960), + new Isoparams(4.8, 5.0, 0.980), + new Isoparams(5.0, 6.0, 0.990), + new Isoparams(6.0, 7.0, 0.995), + new Isoparams(7.0, 8.0, 1.005), + new Isoparams(8.0, 9.0, 1.010), + new Isoparams(9.0, 9.2, 1.020), + new Isoparams(9.2, 9.4, 1.030), + new Isoparams(9.4, 9.6, 1.040), + new Isoparams(9.6, 9.8, 1.060), + new Isoparams(9.8, 10.0, 1.080) + }; + + // convert electric to scaler - v 2,3 algorithms + private static double electric_scale(double v) + { + double best=1.0; + + // Translator2 Note: this is commented out in the perl source + // if (NOELECTRIC==1) { return 1.0; } + 
+ foreach (Isoparams p in ISOPARAMS) + { + if (v > p.emin && v < p.emax) + best= p.eK; + } + + return best; + } + */ + } + + internal static class HelpersLocal + { + /// + /// Replace amino acids in a sequence string with some other value. + /// + /// The sequence string with AAs in uppercase + /// The amino acid characters, or A-Z for all, to replace + /// The value to use as a replacement + /// Modified string with specified AAs replaced + public static string ReplaceAAs(this IEnumerable s, string aas, string newValue) + { + StringBuilder sb = new StringBuilder(); + bool allAAs = (aas == "A-Z"); // Not L10N + foreach (char c in s) + { + if (!allAAs && aas.IndexOf(c) != -1) + { + sb.Append(newValue); + } + else if (allAAs && char.IsLetter(c) && char.IsUpper(c)) + { + sb.Append(newValue); + } + else + { + sb.Append(c); + } + } + + return sb.ToString(); + } + + /// + /// Inspects a sequence of amino acids, and returns true if it contains + /// any of the designated amino acid characters. 
+ /// + /// Amino acid sequence + /// List of characters to search for + /// True if any of the amino acid characters are found + public static bool ContainsAA(this IEnumerable s, string aas) + { + foreach (char c in s) + { + if (aas.IndexOf(c) != -1) + { + return true; + } + } + return false; + } + + public static string Backwards(this IEnumerable s) + { + StringBuilder sb = new StringBuilder(); + foreach (char c in s.Reverse()) + { + sb.Append(c); + } + return sb.ToString(); + } + } + // ReSharper restore CharImplicitlyConvertedToNumeric + // ReSharper restore InconsistentNaming +} + + diff --git a/mzLib/MassSpectrometry/Proteomics/RetentionTimePrediction/SeparationType.cs b/mzLib/MassSpectrometry/Proteomics/RetentionTimePrediction/SeparationType.cs new file mode 100644 index 000000000..abc460533 --- /dev/null +++ b/mzLib/MassSpectrometry/Proteomics/RetentionTimePrediction/SeparationType.cs @@ -0,0 +1,8 @@ +namespace Proteomics.RetentionTimePrediction +{ + public enum SeparationType + { + HPLC, //this is for all reverse phase separations at this time + CZE //this is for capillary electrophoresis separations + } +} diff --git a/mzLib/MzLibUtil/DoubleRange.cs b/mzLib/MzLibUtil/DoubleRange.cs index 939292b0d..4618a0bb5 100644 --- a/mzLib/MzLibUtil/DoubleRange.cs +++ b/mzLib/MzLibUtil/DoubleRange.cs @@ -79,14 +79,6 @@ public virtual string ToString(string format) return $"[{Minimum.ToString(format, System.Globalization.CultureInfo.InvariantCulture)};{Maximum.ToString(format, System.Globalization.CultureInfo.InvariantCulture)}]"; } - /// - /// Compares the DoubleRange to a double 'item' passed in. 
- /// If the 'item' falls below the range, 1 is returned (the range is greater than the item) - /// If the 'item' falls above the range, -1 is returned (the range is less than the item) - /// If the 'item' falls within the range, 0 is returned - /// - /// A double the range will be compared against - /// 1, 0, or -1 public int CompareTo(double item) { if (Minimum.CompareTo(item) > 0) diff --git a/mzLib/Test/Test.csproj b/mzLib/Test/Test.csproj index b79f512c2..5a43d2e89 100644 --- a/mzLib/Test/Test.csproj +++ b/mzLib/Test/Test.csproj @@ -31,8 +31,6 @@ - - diff --git a/mzLib/Test/TestDeconvolution.cs b/mzLib/Test/TestDeconvolution.cs index c21a048db..fbc3efc5d 100644 --- a/mzLib/Test/TestDeconvolution.cs +++ b/mzLib/Test/TestDeconvolution.cs @@ -11,6 +11,12 @@ using System.Globalization; using System.IO; using System.Linq; +using Easy.Common.Extensions; +using MassSpectrometry.Deconvolution; +using MassSpectrometry.Deconvolution.Algorithms; +using MassSpectrometry.Deconvolution.Scoring; +using TopDownProteomics.MassSpectrometry; +using IsotopicDistribution = Chemistry.IsotopicDistribution; namespace Test { @@ -71,7 +77,7 @@ public static void CheckGetMostAbundantObservedIsotopicMass(string peptide, stri Protein test1 = new Protein(peptide, "Accession"); DigestionParams d = new DigestionParams(); PeptideWithSetModifications pw = new PeptideWithSetModifications(test1, d, 1, test1.Length, CleavageSpecificity.None, "", 0, new Dictionary(), 0); - double m = pw.MostAbundantMonoisotopicMass.ToMz(charge); + double mostAbundantMz = pw.MostAbundantMass.ToMz(charge); string singleScan = Path.Combine(TestContext.CurrentContext.TestDirectory, "DataFiles", file); Mzml singleMZML = Mzml.LoadAllStaticData(singleScan); @@ -88,7 +94,7 @@ public static void CheckGetMostAbundantObservedIsotopicMass(string peptide, stri //check assigned correctly List lie2 = singlespec.Deconvolute(singleRange, minAssumedChargeState, maxAssumedChargeState, deconvolutionTolerancePpm, 
intensityRatioLimit).ToList(); List lie2_charge = lie2.Where(p => p.Charge == charge).ToList(); - Assert.That(lie2_charge[0].MostAbundantObservedIsotopicMass / charge, Is.EqualTo(m).Within(0.1)); + Assert.That(lie2_charge[0].MostAbundantObservedIsotopicMz, Is.EqualTo(mostAbundantMz).Within(0.1)); //check that if already assigned, skips assignment and just recalls same value List lie3 = singlespec.Deconvolute(singleRange, minAssumedChargeState, maxAssumedChargeState, deconvolutionTolerancePpm, intensityRatioLimit).ToList(); @@ -157,7 +163,7 @@ public static void CheckClassicGetMostAbundantObservedIsotopicMass(string peptid Protein test1 = new Protein(peptide, "Accession"); DigestionParams d = new DigestionParams(); PeptideWithSetModifications pw = new PeptideWithSetModifications(test1, d, 1, test1.Length, CleavageSpecificity.None, "", 0, new Dictionary(), 0); - double m = pw.MostAbundantMonoisotopicMass.ToMz(charge); + double pwsmMonoisotopicMass = pw.MostAbundantMass; string singleScan = Path.Combine(TestContext.CurrentContext.TestDirectory, "DataFiles", file); Mzml singleMZML = Mzml.LoadAllStaticData(singleScan); @@ -181,7 +187,7 @@ public static void CheckClassicGetMostAbundantObservedIsotopicMass(string peptid List lie2 = deconvoluter.ClassicDeconvoluteMzSpectra(singlespec, singleRange).ToList(); List lie2_charge = lie2.Where(p => p.Charge == charge).ToList(); - Assert.That(lie2_charge[0].MostAbundantObservedIsotopicMass / charge, Is.EqualTo(m).Within(0.1)); + Assert.That(lie2_charge[0].MostAbundantObservedIsotopicMass, Is.EqualTo(pwsmMonoisotopicMass).Within(0.05)); //check that if already assigned, skips assignment and just recalls same value List lie3 = deconvoluter.ClassicDeconvoluteMzSpectra(singlespec, singleRange).ToList(); @@ -189,5 +195,166 @@ public static void CheckClassicGetMostAbundantObservedIsotopicMass(string peptid } #endregion + + #region SpectralDecon + + [Test] + public static void TestGetSecondMostAbundantSpecies() + { + Protein testProtein 
= new Protein("PEPTIDEFPEPTIDEK", "Accession"); + DigestionParams digestionParams = new DigestionParams(); + PeptideWithSetModifications pwsm = new PeptideWithSetModifications(testProtein, digestionParams, 1, testProtein.Length, CleavageSpecificity.None, "", 0, new Dictionary(), 0); + IsotopicDistribution isotopicDistribution = IsotopicDistribution.GetDistribution(pwsm.FullChemicalFormula, + fineResolution: 0.125, minProbability: 0.001); + + int charge = 1; + IsotopicEnvelope testEnvelope = new IsotopicEnvelope(isotopicDistribution, charge); + Assert.That(testEnvelope.MostAbundantObservedIsotopicMz, Is.EqualTo(pwsm.MonoisotopicMass.ToMz(charge)).Within(0.1)); + Assert.That(testEnvelope.SecondMostAbundantObservedIsotopicMz, Is.EqualTo(isotopicDistribution.Masses[1].ToMz(charge)).Within(0.1)); + } + + [Test] + + public void TestIndexingForSpectralDecon() + { + //PEPTIDEK vs PEPTIDEFPEPTIDEK (the longer peptide has ~ twice the mass of the shorter, enabling a test of the indexing system + Protein myProtein = new Protein("PEPTIDEKPEPTIDEFPEPTIDEK", "accession"); + DigestionParams digest1 = new DigestionParams(protease: "trypsin", maxMissedCleavages: 0, initiatorMethionineBehavior: InitiatorMethionineBehavior.Retain); + + List pep = myProtein.Digest(digest1, new List(), new List()).ToList(); + + int minAssumedChargeState = 1; + int maxAssumedChargeState = 60; + double deconvolutionTolerancePpm = 20; + int binsPerDalton = 1; + int scanMinimum = 460; + + SpectralDeconvolutionParameters spectralDeconParams = new SpectralDeconvolutionParameters( + minAssumedChargeState, maxAssumedChargeState, deconvolutionTolerancePpm, + new List() { myProtein }, + new List(), new List(), digest1, + new List(), false, scanMinimumMz: scanMinimum, scanMaximumMz: 2000, + ambiguityThresholdForIsotopicDistribution: 0.9, binsPerDalton: binsPerDalton); + + SpectralDeconvolutionAlgorithm spectralDecon = new SpectralDeconvolutionAlgorithm(spectralDeconParams); + + PeptideWithSetModifications peptidek 
= pep.Where(p => p.FullSequence.Equals("PEPTIDEK")).First(); + PeptideWithSetModifications doublePeptidek = pep.Where(p => p.FullSequence.Equals("PEPTIDEFPEPTIDEK")).First(); + Assert.That(spectralDecon.EnvelopeDictionary.ContainsKey(peptidek) & spectralDecon.EnvelopeDictionary.ContainsKey(doublePeptidek)); + Assert.That(spectralDecon.EnvelopeDictionary[peptidek].Count == 2 & spectralDecon.EnvelopeDictionary[doublePeptidek].Count == 4); + + List> indexedSpectra = new(); + //Iterate over 2d Array + foreach (var listOfSpectra in spectralDecon.IndexedLibrarySpectra) + { + if (listOfSpectra.IsNotNullOrEmpty()) indexedSpectra.Add(listOfSpectra); + } + + // For the longer peptide, the first and second isotopes have extremely similar abundances, + // so they should be stored in different bins for the +1, +2, and +4 charge states (the +3 charge state masses fall within the same bin [619 Thomsons]) + // The shorter peptide is approximately 1/2 the mass of the longer peptide. With bin sizes of one dalton, + // every spectrum for the shorter peptide should share a bin with a spectrum from the longer peptide. + // This assertion does a lot of heavy lifting in testing the indexing engine. 
+            // DO NOT CHANGE unless you understand what is being tested here
+            //Assert.That(indexedSpectra.Count == 7);
+
+            int peptideCharge = 2;
+            int binIndex = (int)Math.Floor(binsPerDalton * (peptidek.MonoisotopicMass.ToMz(charge: peptideCharge) - scanMinimum));
+            int chargeIndex = peptideCharge - minAssumedChargeState;
+            Assert.That(spectralDecon.SpectrumIndexToPwsmMap.TryGetValue((binIndex, chargeIndex, 0), out var peptidek2Charge));
+            Assert.That(spectralDecon.SpectrumIndexToPwsmMap.TryGetValue((binIndex, 3, 0), out var doublePeptideK4Charge));
+            Assert.That(peptidek2Charge.BaseSequence.Equals(peptidek.BaseSequence));
+            Assert.That(doublePeptideK4Charge.BaseSequence.Equals(doublePeptidek.BaseSequence));
+
+            peptideCharge = 1;
+            chargeIndex = peptideCharge - minAssumedChargeState;
+            binIndex = (int)Math.Floor(binsPerDalton * (doublePeptidek.MonoisotopicMass.ToMz(charge: 1) - scanMinimum));
+            Assert.That(spectralDecon.SpectrumIndexToPwsmMap.TryGetValue((binIndex, chargeIndex, 0), out var doublePeptideK1Charge) &&
+                        !spectralDecon.SpectrumIndexToPwsmMap.TryGetValue((binIndex, chargeIndex, 1), out var doesNotExist));
+            Assert.That(doublePeptideK1Charge == doublePeptideK4Charge);
+
+        }
+
+        /// <summary>Spectrally deconvolutes the first scan of <paramref name="file"/> and checks that the first envelope with charge <paramref name="charge"/> has a most abundant observed mass within 0.05 of the most abundant mass of the peptide built from <paramref name="peptide"/>.</summary>
+        [Test]
+        [TestCase("APSGGKK", "12-18-17_frac7_calib_ms1_663_665.mzML", 2)]
+        [TestCase("PKRKAEGDAKGDKAKVKDEPQRRSARLSAKPAPPKPEPKPKKAPAKKGEKVPKGKKGKADAGKEGNNPAENGDAKTDQAQKAEGAGDAK", "FXN11_tr1_032017-calib_ms1_scans716_718.mzML", 8)]
+        [TestCase("PKRKVSSAEGAAKEEPKRRSARLSAKPPAKVEAKPKKAAAKDKSSDKKVQTKGKRGAKGKQAEVANQETKEDLPAENGETKTEESPASDEAGEKEAKSD", "FXN11_tr1_032017-calib_ms1_scans781_783.mzML", 16)]
+        public static void CheckSpectralGetMostAbundantObservedIsotopicMass(string peptide, string file, int charge)
+        {
+            Protein myProtein = new Protein(peptide, "Accession");
+            DigestionParams digest1 = new DigestionParams("top-down");
+            PeptideWithSetModifications pw = new PeptideWithSetModifications(myProtein, digest1, 1, myProtein.Length, CleavageSpecificity.None, "", 0, new Dictionary(), 0);
+            double expectedMostAbundantMass = pw.MostAbundantMass;
+
+            string singleScan = Path.Combine(TestContext.CurrentContext.TestDirectory, "DataFiles", file);
+            Mzml singleMZML = Mzml.LoadAllStaticData(singleScan);
+
+            List singlescan = singleMZML.GetAllScansList();
+
+            MzSpectrum singlespec = singlescan[0].MassSpectrum;
+            MzRange singleRange = new MzRange(singlespec.XArray.Min(), singlespec.XArray.Max());
+
+            int minAssumedChargeState = 1;
+            int maxAssumedChargeState = 60;
+            double deconvolutionTolerancePpm = 20;
+            int binsPerDalton = 1;
+
+            DeconvolutionParameters deconParams = new SpectralDeconvolutionParameters(
+                minAssumedChargeState, maxAssumedChargeState, deconvolutionTolerancePpm,
+                new List() { myProtein },
+                new List(), new List(), digest1,
+                new List(), false, scanMinimumMz: singleRange.Minimum, scanMaximumMz: singleRange.Maximum,
+                ambiguityThresholdForIsotopicDistribution: 0.9, binsPerDalton: binsPerDalton);
+
+            Deconvoluter deconvoluter = new Deconvoluter(DeconvolutionTypes.SpectralDeconvolution, deconParams);
+
+            //check assigned correctly
+
+            List lie2 = deconvoluter.SpectralDeconvoluteMzSpectra(singlespec, singleRange).ToList();
+
+            List lie2_charge = lie2.Where(p => p.Charge == charge).ToList();
+            Assert.That(lie2_charge, Is.Not.Empty); // fail with a clear assertion instead of an index-out-of-range error
+            Assert.That(lie2_charge[0].MostAbundantObservedIsotopicMass, Is.EqualTo(expectedMostAbundantMass).Within(0.05));
+
+            //check that if already assigned, skips assignment and just recalls same value
+            //List lie3 = deconvoluter.ClassicDeconvoluteMzSpectra(singlespec, singleRange).ToList();
+            //Assert.AreEqual(lie2.Select(p => p.MostAbundantObservedIsotopicMass), lie3.Select(p => p.MostAbundantObservedIsotopicMass));
+        }
+        #endregion
+
+        #region scorerTests
+
+        /// <summary>Checks that constructing a KullbackLeibler Scorer throws, and that a SpectralContrastAngle Scorer reports the expected poor score, score-improvement result, and score for two small spectra.</summary>
+        [Test]
+        public static void ScorerTest()
+        {
+            Scorer.ScoringMethods kullbackMethod = Scorer.ScoringMethods.KullbackLeibler;
+            Scorer.ScoringMethods spectralContrastMethod = Scorer.ScoringMethods.SpectralContrastAngle;
+
+            // KullbackLeibler hasn't been implemented yet
+            Assert.That(() => new Scorer(kullbackMethod, new PpmTolerance(5.0)), Throws.Exception.TypeOf());
+            // Assert.That(kullbackScorer.PoorScore > 10);
+            // In kullback leibler, low scores are better. In spectral contrast angle, high scores are better
+            // Assert.That(kullbackScorer.TestForScoreImprovement(0.01, 0.03, out var better));
+
+            Scorer spectralScorer = new Scorer(spectralContrastMethod, new PpmTolerance(5.0));
+            Assert.That(spectralScorer.PoorScore <= 0);
+            Assert.That(!spectralScorer.TestForScoreImprovement(0.01, 0.03, out var betterScore));
+
+            MinimalSpectrum testSpectrum =
+                new MinimalSpectrum(new double[] { 1.0, 2.0, 3.0, 4.0 }, new double[] { 1.0, 2.0, 3.0, 4.0 });
+            MinimalSpectrum comparisonSpectrum =
+                new MinimalSpectrum(new double[] { 1.0, 2.0, 3.0, 4.0 }, new double[] { 2.0, 1.0, 4.0, 3.0 });
+
+            double spectralScore = spectralScorer.Score(testSpectrum, comparisonSpectrum);
+            Assert.That(spectralScore, Is.EqualTo(0.766).Within(0.001));
+            // Swapping the argument order should give (nearly) the same score
+            Assert.That(spectralScore, Is.EqualTo(spectralScorer.Score(comparisonSpectrum, testSpectrum) ).Within(0.01));
+
+        }
+
+        #endregion
+    }
 }
\ No newline at end of file
diff --git a/mzLib/Test/TestIsotopicEnvelope.cs b/mzLib/Test/TestIsotopicEnvelope.cs
new file mode 100644
index 000000000..10f8fb192
--- /dev/null
+++ b/mzLib/Test/TestIsotopicEnvelope.cs
@@ -0,0 +1,37 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+using Chemistry;
+using MassSpectrometry;
+using Proteomics.ProteolyticDigestion;
+using Proteomics;
+using NUnit.Framework;
+
+namespace Test
+{
+    [TestFixture]
+    public static class TestIsotopicEnvelope
+    {
+        /// <summary>Checks that an IsotopicEnvelope built from an IsotopicDistribution at charge 2 reports the same most abundant mass as the distribution it was built from.</summary>
+        [Test]
+        public static void TestIsotopicDistributionToEnvelope()
+        {
+            Protein myProtein = new Protein("PEPTIDEK", "accession");
+
+            DigestionParams digest = new DigestionParams(protease: "trypsin", maxMissedCleavages: 0,
+                initiatorMethionineBehavior: InitiatorMethionineBehavior.Retain);
+
+            PeptideWithSetModifications pwsm = myProtein.Digest(digest, new List(),
+                new List()).First();
+
+            IsotopicDistribution distribution = IsotopicDistribution.GetDistribution(pwsm.FullChemicalFormula, 0.125, 1e-8);
+            IsotopicEnvelope envelope = new(distribution, charge: 2);
+            double distributionMostAbundant = distribution.MostAbundantMass;
+            double envelopeMostAbundant = envelope.MostAbundantObservedIsotopicMass;
+            // NOTE(review): assumes MostAbundantObservedIsotopicMass is a neutral mass directly comparable to the distribution value - confirm.
+            Assert.That(envelopeMostAbundant, Is.EqualTo(distributionMostAbundant).Within(0.001)); // previously compared the envelope value with itself
+        }
+    }
+}
diff --git a/mzLib/Test/TestPeptideWithSetMods.cs b/mzLib/Test/TestPeptideWithSetMods.cs
index a425d7549..b97729050 100644
--- a/mzLib/Test/TestPeptideWithSetMods.cs
+++ b/mzLib/Test/TestPeptideWithSetMods.cs
@@ -1065,11 +1065,11 @@ public static void CheckMostAbundantMonoisotopicMass()
         {
             PeptideWithSetModifications small_pep = new PeptideWithSetModifications(new Protein("PEPTIDE", "ACCESSION"), new DigestionParams(protease: "trypsin"), 1, 7, CleavageSpecificity.Full, null, 0, new Dictionary(), 0, null);
             double small_pep_most_abundant_mass_prospector = 800.36724 - 1.0079;
-            Assert.That(small_pep.MostAbundantMonoisotopicMass, Is.EqualTo(small_pep_most_abundant_mass_prospector).Within(0.01));
+            Assert.That(small_pep.MostAbundantMass, Is.EqualTo(small_pep_most_abundant_mass_prospector).Within(0.01));
 
             PeptideWithSetModifications large_pep = new PeptideWithSetModifications(new Protein("PEPTIDEPEPTIDEPEPTIDEPEPTIDEPEPTIDEPEPTIDE", "ACCESSION"), new DigestionParams(protease: "trypsin"), 1, 42, CleavageSpecificity.Full, null, 0, new Dictionary(), 0, null);
             double large_pep_most_abundant_mass_prospector = 4709.12020 - 1.0079;
-            Assert.That(large_pep.MostAbundantMonoisotopicMass, Is.EqualTo(large_pep_most_abundant_mass_prospector).Within(0.01));
+            Assert.That(large_pep.MostAbundantMass, Is.EqualTo(large_pep_most_abundant_mass_prospector).Within(0.01));
         }
     }
 }
\ No newline at end of file
diff --git a/mzLib/Test/TestProteinDigestion.cs b/mzLib/Test/TestProteinDigestion.cs index 46520cfd8..d808588bb 100644 --- 
a/mzLib/Test/TestProteinDigestion.cs +++ b/mzLib/Test/TestProteinDigestion.cs @@ -59,7 +59,7 @@ public static void CNBrProteinDigestion() Assert.That(File.Exists(path)); var proteaseDict = ProteaseDictionary.LoadProteaseDictionary(path, proteaseMods); - ProteaseDictionary.Dictionary = ProteaseDictionary.LoadProteaseDictionary(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "ProteolyticDigestion", "proteases.tsv"), proteaseMods); + ProteaseDictionary.Dictionary = ProteaseDictionary.LoadProteaseDictionary(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "Proteomics", "ProteolyticDigestion", "proteases.tsv"), proteaseMods); var protease1 = proteaseDict["CNBr"]; DigestionParams digestionParams1 = new DigestionParams( protease: protease1.Name, diff --git a/mzLib/TestFlashLFQ/TestFlashLFQ.csproj b/mzLib/TestFlashLFQ/TestFlashLFQ.csproj index ce938d91a..8987c2d06 100644 --- a/mzLib/TestFlashLFQ/TestFlashLFQ.csproj +++ b/mzLib/TestFlashLFQ/TestFlashLFQ.csproj @@ -26,7 +26,6 @@ - diff --git a/mzLib/UsefulProteomicsDatabases/DecoyType.cs b/mzLib/UsefulProteomicsDatabases/DecoyType.cs index 946fcf6a3..b098589b3 100644 --- a/mzLib/UsefulProteomicsDatabases/DecoyType.cs +++ b/mzLib/UsefulProteomicsDatabases/DecoyType.cs @@ -1,7 +1,4 @@ -using Proteomics.ProteolyticDigestion; -using System; - -namespace UsefulProteomicsDatabases +namespace UsefulProteomicsDatabases { public enum DecoyType { diff --git a/mzLib/UsefulProteomicsDatabases/UsefulProteomicsDatabases.csproj b/mzLib/UsefulProteomicsDatabases/UsefulProteomicsDatabases.csproj index 590324de2..c631c7fb4 100644 --- a/mzLib/UsefulProteomicsDatabases/UsefulProteomicsDatabases.csproj +++ b/mzLib/UsefulProteomicsDatabases/UsefulProteomicsDatabases.csproj @@ -16,8 +16,8 @@ + - diff --git a/mzLib/mzLib.nuspec b/mzLib/mzLib.nuspec index 89d5c162e..7a803c19d 100644 --- a/mzLib/mzLib.nuspec +++ b/mzLib/mzLib.nuspec @@ -42,35 +42,33 @@ + - - - - - - - - - + + + + + + + + - - diff --git a/mzLib/mzLib.sln b/mzLib/mzLib.sln index 
8b736b822..b4cbd2fed 100644 --- a/mzLib/mzLib.sln +++ b/mzLib/mzLib.sln @@ -21,8 +21,6 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "MzML", "MzML\MzML.csproj", EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "PepXML", "PepXML\PepXML.csproj", "{9697BC5E-1F2C-4E3C-BC53-3532CBDFFCC7}" EndProject -Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Proteomics", "Proteomics\Proteomics.csproj", "{AD3D126D-6359-481B-BE17-69DFB0BC4E40}" -EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "ThermoRawFileReader", "ThermoRawFileReader\ThermoRawFileReader.csproj", "{21FD9C16-733F-444A-B4D5-E062C4420F12}" EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "UsefulProteomicsDatabases", "UsefulProteomicsDatabases\UsefulProteomicsDatabases.csproj", "{6A510911-5F37-4314-8176-82772B7E5AE3}" @@ -96,12 +94,6 @@ Global {9697BC5E-1F2C-4E3C-BC53-3532CBDFFCC7}.Release|x64.Build.0 = Release|x64 {9697BC5E-1F2C-4E3C-BC53-3532CBDFFCC7}.TestAndRelease|x64.ActiveCfg = Release|x64 {9697BC5E-1F2C-4E3C-BC53-3532CBDFFCC7}.TestAndRelease|x64.Build.0 = Release|x64 - {AD3D126D-6359-481B-BE17-69DFB0BC4E40}.Debug|x64.ActiveCfg = Debug|x64 - {AD3D126D-6359-481B-BE17-69DFB0BC4E40}.Debug|x64.Build.0 = Debug|x64 - {AD3D126D-6359-481B-BE17-69DFB0BC4E40}.Release|x64.ActiveCfg = Release|x64 - {AD3D126D-6359-481B-BE17-69DFB0BC4E40}.Release|x64.Build.0 = Release|x64 - {AD3D126D-6359-481B-BE17-69DFB0BC4E40}.TestAndRelease|x64.ActiveCfg = Release|x64 - {AD3D126D-6359-481B-BE17-69DFB0BC4E40}.TestAndRelease|x64.Build.0 = Release|x64 {21FD9C16-733F-444A-B4D5-E062C4420F12}.Debug|x64.ActiveCfg = Debug|x64 {21FD9C16-733F-444A-B4D5-E062C4420F12}.Debug|x64.Build.0 = Debug|x64 {21FD9C16-733F-444A-B4D5-E062C4420F12}.Release|x64.ActiveCfg = Release|x64