diff --git a/mzLib/Chemistry/IsotopicDistribution.cs b/mzLib/Chemistry/IsotopicDistribution.cs
index ebb723b53..68e52466a 100644
--- a/mzLib/Chemistry/IsotopicDistribution.cs
+++ b/mzLib/Chemistry/IsotopicDistribution.cs
@@ -56,7 +56,22 @@ private IsotopicDistribution(int count)
intensities = new double[count];
}
- // Clone() produces shallow copies, but because double is a primitive type, this is acceptable
+        /// <summary>
+        /// Mass of the peak with the highest intensity in this distribution.
+        /// Ties resolve to the first such peak; an empty distribution yields <see cref="double.NaN"/>.
+        /// </summary>
+        public double MostAbundantMass
+        {
+            get
+            {
+                // A single arg-max pass selects the true maximum. Comparing every peak against Max()
+                // with an absolute tolerance could return an earlier, slightly less intense peak.
+                int mostIntenseIndex = -1;
+                for (int i = 0; i < masses.Length; i++)
+                {
+                    if (mostIntenseIndex < 0 || intensities[i] > intensities[mostIntenseIndex])
+                    {
+                        mostIntenseIndex = i;
+                    }
+                }
+
+                return mostIntenseIndex < 0 ? double.NaN : masses[mostIntenseIndex];
+            }
+        }
+ // Returns the first element of the masses array; indexing throws if the array is empty.
+ // NOTE(review): presumably masses are ordered so that index 0 is the monoisotopic peak - confirm.
+ public double MonoIsotopicMass => masses[0];
public double[] Masses => (double[]) masses.Clone();
public double[] Intensities => (double[]) intensities.Clone();
diff --git a/mzLib/MassSpectrometry/Deconvolution/Algorithms/ClassicDeconvolutionAlgorithm.cs b/mzLib/MassSpectrometry/Deconvolution/Algorithms/ClassicDeconvolutionAlgorithm.cs
index 5dd92db7a..5edc178ad 100644
--- a/mzLib/MassSpectrometry/Deconvolution/Algorithms/ClassicDeconvolutionAlgorithm.cs
+++ b/mzLib/MassSpectrometry/Deconvolution/Algorithms/ClassicDeconvolutionAlgorithm.cs
@@ -27,7 +27,12 @@ public ClassicDeconvolutionAlgorithm(DeconvolutionParameters deconParameters) :
///
public override IEnumerable Deconvolute(MzSpectrum spectrumToDeconvolute, MzRange range)
{
- var deconParams = DeconvolutionParameters as ClassicDeconvolutionParameters ?? throw new MzLibException("Deconvolution params and algorithm do not match");
+ var deconParams = DeconvolutionParameters as ClassicDeconvolutionParameters;
+ if (deconParams == null)
+ {
+ throw new MzLibException("Deconvolution params and algorithm do not match");
+ }
+
spectrum = spectrumToDeconvolute;
//if no peaks, stop
if (spectrum.Size == 0)
@@ -175,7 +180,9 @@ public override IEnumerable Deconvolute(MzSpectrum spectrumToD
}
}
- private IsotopicEnvelope FindIsotopicEnvelope(int massIndex, double candidateForMostIntensePeakMz, double candidateForMostIntensePeakIntensity, double testMostIntenseMass, int chargeState, double deconvolutionTolerancePpm, double intensityRatioLimit, List monoisotopicMassPredictions)
+ private IsotopicEnvelope FindIsotopicEnvelope(int massIndex, double candidateForMostIntensePeakMz, double candidateForMostIntensePeakIntensity,
+ double testMostIntenseMass, int chargeState, double deconvolutionTolerancePpm, double intensityRatioLimit,
+ List monoisotopicMassPredictions)
{
double[] theoreticalMasses = allMasses[massIndex];
double[] theoreticalIntensities = allIntensities[massIndex];
@@ -216,7 +223,9 @@ private IsotopicEnvelope FindIsotopicEnvelope(int massIndex, double candidateFor
return new IsotopicEnvelope(listOfObservedPeaks, monoisotopicMass, chargeState, totalIntensity, Statistics.StandardDeviation(listOfRatios), massIndex);
}
- private int ObserveAdjacentChargeStates(IsotopicEnvelope originalEnvelope, double mostIntensePeakMz, int massIndex, double deconvolutionTolerancePpm, double intensityRatioLimit, double minChargeToLookFor, double maxChargeToLookFor, List monoisotopicMassPredictions)
+ private int ObserveAdjacentChargeStates(IsotopicEnvelope originalEnvelope, double mostIntensePeakMz, int massIndex,
+ double deconvolutionTolerancePpm, double intensityRatioLimit, double minChargeToLookFor, double maxChargeToLookFor,
+ List monoisotopicMassPredictions)
{
//look for the higher and lower charge states using the proposed mass
int numAdjacentChargeStatesObserved = 0;
@@ -251,7 +260,8 @@ private int ObserveAdjacentChargeStates(IsotopicEnvelope originalEnvelope, doubl
return numAdjacentChargeStatesObserved;
}
- private bool FindChargeStateOfMass(IsotopicEnvelope originalEnvelope, int zToInvestigate, double mostAbundantNeutralIsotopeToInvestigate, int massIndex, double deconvolutionTolerancePpm, double intensityRatioLimit, List monoisotopicMassPredictions)
+ private bool FindChargeStateOfMass(IsotopicEnvelope originalEnvelope, int zToInvestigate, double mostAbundantNeutralIsotopeToInvestigate, int massIndex,
+ double deconvolutionTolerancePpm, double intensityRatioLimit, List monoisotopicMassPredictions)
{
//we know the mass and the charge that we're looking for, just see if the expected m/z and its isotopes are there or not
double mostAbundantIsotopeMzForThisZTheoretical = mostAbundantNeutralIsotopeToInvestigate.ToMz(zToInvestigate);
diff --git a/mzLib/MassSpectrometry/Deconvolution/Algorithms/SpectralDeconvolutionAlgorithm.cs b/mzLib/MassSpectrometry/Deconvolution/Algorithms/SpectralDeconvolutionAlgorithm.cs
new file mode 100644
index 000000000..3470971f0
--- /dev/null
+++ b/mzLib/MassSpectrometry/Deconvolution/Algorithms/SpectralDeconvolutionAlgorithm.cs
@@ -0,0 +1,281 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+using Chemistry;
+using Easy.Common.Extensions;
+using MassSpectrometry.Deconvolution;
+using MassSpectrometry.Deconvolution.Scoring;
+using Proteomics;
+using Proteomics.ProteolyticDigestion;
+using MzLibUtil;
+
+namespace MassSpectrometry.Deconvolution.Algorithms
+{
+ public class SpectralDeconvolutionAlgorithm : DeconvolutionAlgorithm
+ {
+ // TODO: Make a charge state envelope class, complete with "MostAbundantChargeState"
+ public Dictionary> EnvelopeDictionary { get; private set; }
+
+ // Consider defining this as a jagged array to increase performance
+ public List[,] IndexedLibrarySpectra { get; private set; }
+ // SpectrumIndexToPwsmMap maps the location of each spectrum within IndexedLibrarySpectra to its respective PeptideWithSetMods and charge
+ public Dictionary<(int, int, int), PeptideWithSetModifications> SpectrumIndexToPwsmMap { get; private set; }
+ public int MaxThreads; // This should be in the Parameters abstract
+ public SpectralDeconvolutionParameters SpectralParams { get; }
+ public PpmTolerance PpmTolerance { get; }
+ public Scorer Scorer { get; }
+
+        /// <summary>
+        /// Validates the parameter type, then generates and indexes the library envelopes
+        /// and creates the scorer used to compare experimental and library spectra.
+        /// </summary>
+        /// <param name="parameters">Must be a <see cref="SpectralDeconvolutionParameters"/> instance.</param>
+        /// <exception cref="MzLibException">Thrown when the parameters are of any other type.</exception>
+        public SpectralDeconvolutionAlgorithm(DeconvolutionParameters parameters) : base(parameters)
+        {
+            if (DeconvolutionParameters is not SpectralDeconvolutionParameters spectralParameters)
+            {
+                throw new MzLibException(
+                    "Improper Deconvolution Parameters were passed to the SpectralDeconvolutionAlgorithm");
+            }
+
+            SpectralParams = spectralParameters;
+            PpmTolerance = new PpmTolerance(parameters.DeconvolutionTolerancePpm);
+
+            // The library envelopes must exist before they can be indexed.
+            FindLibraryEnvelopes();
+            IndexEnvelopes();
+            Scorer = new Scorer(Scorer.ScoringMethods.SpectralContrastAngle, PpmTolerance);
+        }
+
+        /// <summary>
+        /// Finds candidate isotopic envelopes in the spectrum, scores each one against the library spectra
+        /// stored in the same mass/charge bin, and yields the best-scoring match for every candidate.
+        /// </summary>
+        /// <param name="spectrum">Spectrum to deconvolute; a null or empty spectrum yields nothing.</param>
+        /// <returns>One envelope per candidate for which a library match was found.</returns>
+        public override IEnumerable<IsotopicEnvelope> Deconvolute(MzSpectrum spectrum)
+        {
+            if (spectrum == null || spectrum.Size == 0)
+            {
+                yield break;
+            }
+
+            int massBinCount = IndexedLibrarySpectra.GetLength(0);
+            int chargeBinCount = IndexedLibrarySpectra.GetLength(1);
+
+            // Key: charge state. Value: every candidate envelope found at that charge.
+            foreach (var chargeAndCandidates in FindPotentialEnvelopes(spectrum))
+            {
+                int chargeBinIndex = chargeAndCandidates.Key - SpectralParams.MinAssumedChargeState;
+                if (chargeBinIndex < 0 || chargeBinIndex >= chargeBinCount)
+                {
+                    continue;
+                }
+
+                foreach (var experimentalSpectrum in chargeAndCandidates.Value)
+                {
+                    int massBinIndex = (int)Math.Floor(
+                        (experimentalSpectrum.MostAbundantMz - SpectralParams.ScanRange.Minimum) *
+                        SpectralParams.BinsPerDalton);
+
+                    // Experimental peaks are not restricted to the indexed range, so the computed bin
+                    // may fall outside the array; such candidates cannot have a library match.
+                    if (massBinIndex < 0 || massBinIndex >= massBinCount)
+                    {
+                        continue;
+                    }
+
+                    var librarySpectra = IndexedLibrarySpectra[massBinIndex, chargeBinIndex];
+                    if (librarySpectra == null || librarySpectra.Count == 0)
+                    {
+                        continue; // no corresponding library spectra
+                    }
+
+                    // Score against every library spectrum in the bin, remembering the best one.
+                    int? bestMatchListPosition = null;
+                    double bestFoundScore = Scorer.PoorScore;
+                    for (int listPosition = 0; listPosition < librarySpectra.Count; listPosition++)
+                    {
+                        double score = Scorer.Score(experimentalSpectrum, librarySpectra[listPosition]);
+                        if (Scorer.TestForScoreImprovement(score, bestFoundScore, out double betterScore))
+                        {
+                            bestMatchListPosition = listPosition;
+                            bestFoundScore = betterScore;
+                        }
+                    }
+
+                    // TODO: candidates without a library match are currently dropped (no averagine fallback yet).
+                    if (bestMatchListPosition.HasValue &&
+                        SpectrumIndexToPwsmMap.TryGetValue(
+                            (massBinIndex, chargeBinIndex, bestMatchListPosition.Value), out var pwsmMatch))
+                    {
+                        yield return new IsotopicEnvelope(experimentalSpectrum, pwsmMatch, bestFoundScore);
+                    }
+                }
+            }
+        }
+
+        /// <summary>
+        /// Populates the EnvelopeDictionary by digesting each protein in the parameters into PeptideWithSetMods,
+        /// then calculating an isotopic envelope for every assumed charge state whose most abundant peak
+        /// falls inside the scan range.
+        /// </summary>
+        private void FindLibraryEnvelopes()
+        {
+            EnvelopeDictionary = new();
+
+            //TODO: Parallelize this section of the code
+            foreach (Protein protein in SpectralParams.Proteins)
+            {
+                // Digest is called once per protein here; its result is enumerated by the loop below.
+                var uniquePeptides = protein.Digest(
+                    SpectralParams.DigestionParams, SpectralParams.FixedModifications,
+                    SpectralParams.VariableModifications, SpectralParams.SilacLabels,
+                    topDownTruncationSearch: SpectralParams.FindTopDownTruncationProducts);
+
+                foreach (PeptideWithSetModifications pwsm in uniquePeptides)
+                {
+                    var envelopes = new List<IsotopicEnvelope>();
+                    EnvelopeDictionary.Add(pwsm, envelopes);
+                    IsotopicDistribution pwsmDistribution = IsotopicDistribution.GetDistribution(pwsm.FullChemicalFormula,
+                        fineResolution: SpectralParams.FineResolutionForIsotopicDistribution,
+                        minProbability: SpectralParams.MinProbabilityForIsotopicDistribution);
+
+                    // Iterate from the largest charge to the smallest, i.e. from the lowest m/z upwards
+                    // (assuming positive charge states). Envelopes whose most abundant peak lies inside
+                    // the scan range are stored.
+                    for (int charge = SpectralParams.MaxAssumedChargeState;
+                         charge >= SpectralParams.MinAssumedChargeState;
+                         charge--)
+                    {
+                        double theoreticalMz = pwsm.MostAbundantMass.ToMz(charge);
+                        if (SpectralParams.ScanRange.Contains(theoreticalMz))
+                        {
+                            envelopes.Add(new IsotopicEnvelope(pwsmDistribution, charge,
+                                SpectralParams.AmbiguityThresholdForIsotopicDistribution));
+                        }
+                        else if (theoreticalMz > SpectralParams.ScanRange.Maximum)
+                        {
+                            // Stop only once the peak is above the range: every remaining (lower) charge
+                            // would land at an even higher m/z. A peak below the range must not end the
+                            // search, because lower charges can still move it into range.
+                            break;
+                        }
+                    }
+                }
+            }
+        }
+
+        /// <summary>
+        /// For each envelope in Envelope Dictionary, indexes it according to mass and charge,
+        /// resulting in a 2D array of lists of minimal spectra. Each stored spectrum is also recorded in
+        /// SpectrumIndexToPwsmMap under (mass bin, charge bin, list position).
+        /// </summary>
+        private void IndexEnvelopes()
+        {
+            int numberOfBinsForIndexing = (int) (SpectralParams.ScanRange.Width * SpectralParams.BinsPerDalton).Ceiling(0);
+            int numberOfChargeBins = SpectralParams.MaxAssumedChargeState + 1 - SpectralParams.MinAssumedChargeState;
+            IndexedLibrarySpectra = new List<MinimalSpectrum>[numberOfBinsForIndexing, numberOfChargeBins];
+            SpectrumIndexToPwsmMap = new();
+
+            // Converts an m/z value to its mass bin relative to the start of the scan range.
+            int ToMassBin(double mz) =>
+                (int)Math.Floor((mz - SpectralParams.ScanRange.Minimum) * SpectralParams.BinsPerDalton);
+
+            // Stores one spectrum in a bin and maps its position back to the originating peptide.
+            // Bins outside the array (a peak on or beyond the edge of the scan range) are skipped
+            // instead of throwing IndexOutOfRangeException.
+            void AddToIndex(int massBin, int chargeBin, MinimalSpectrum librarySpectrum, PeptideWithSetModifications peptide)
+            {
+                if (massBin < 0 || massBin >= numberOfBinsForIndexing)
+                {
+                    return;
+                }
+
+                var bin = IndexedLibrarySpectra[massBin, chargeBin] ??= new List<MinimalSpectrum>();
+                bin.Add(librarySpectrum);
+                SpectrumIndexToPwsmMap.Add((massBin, chargeBin, bin.Count - 1), peptide);
+            }
+
+            foreach (var peptideAndEnvelopes in EnvelopeDictionary)
+            {
+                foreach (IsotopicEnvelope envelope in peptideAndEnvelopes.Value)
+                {
+                    int massBinIndex = ToMassBin(envelope.MostAbundantObservedIsotopicMz);
+                    int chargeBinIndex = envelope.Charge - SpectralParams.MinAssumedChargeState;
+                    var envelopeMinimalSpectrum = new MinimalSpectrum(envelope.MzArray, envelope.IntensityArray, envelope.Charge);
+                    AddToIndex(massBinIndex, chargeBinIndex, envelopeMinimalSpectrum, peptideAndEnvelopes.Key);
+
+                    // When the envelope reports a second most abundant peak (value > 0), the same spectrum
+                    // is also stored under that peak's bin if it differs from the primary bin.
+                    if (envelope.SecondMostAbundantObservedIsotopicMz > 0)
+                    {
+                        int secondBinIndex = ToMassBin(envelope.SecondMostAbundantObservedIsotopicMz);
+                        if (secondBinIndex != massBinIndex)
+                        {
+                            AddToIndex(secondBinIndex, chargeBinIndex, envelopeMinimalSpectrum, peptideAndEnvelopes.Key);
+                        }
+                    }
+                }
+            }
+        }
+
+        /// <summary>
+        /// Iterates through all peaks in a spectrum to find all potential isotopic envelopes.
+        /// It does this by examining the spacing of peaks in the m/z domain
+        /// e.g. for charge of 2, a peak at 200 m/z would result in a search for a peak at 200.5 and 201 m/z
+        /// if either is found, the process continues until SpectralParams.MaxConsecutiveMissedIsotopicPeaks number of consecutive
+        /// isotope peaks are missed
+        /// Anything consistent with an isotopic envelope in a given charge state is stored in the dictionary
+        /// </summary>
+        /// <param name="spectrum">Spectrum to search. The early exit of the inner loop assumes XArray is sorted ascending.</param>
+        /// <returns>For each charge state (key), the candidate envelopes found at that charge (value).</returns>
+        private Dictionary<int, List<MinimalSpectrum>> FindPotentialEnvelopes(MzSpectrum spectrum)
+        {
+            Dictionary<int, List<MinimalSpectrum>> potentialEnvelopes = new();
+
+            // Number of isotope positions examined after the last accepted peak:
+            // the adjacent one plus every position that is allowed to be missing.
+            int maxIsotopeSteps = Math.Max(1, 1 + SpectralParams.MaxConsecutiveMissedIsotopicPeaks);
+
+            for (int charge = SpectralParams.MinAssumedChargeState; charge <= SpectralParams.MaxAssumedChargeState; charge++)
+            {
+                double isotopeSpacing = Constants.C13MinusC12 / charge;
+                double searchWindow = maxIsotopeSteps * Constants.C13MinusC12 / charge;
+
+                // Peaks already assigned to an envelope at this charge; a set keeps the lookup O(1).
+                HashSet<int> indicesOfKnownPeaks = new();
+
+                // Spectrum Search Loop
+                for (int i = 0; i < spectrum.Size; i++)
+                {
+                    if (indicesOfKnownPeaks.Contains(i))
+                    {
+                        continue;
+                    }
+
+                    List<int> envelopeIndices = new() { i };
+                    double lastEnvelopeMz = spectrum.XArray[i];
+
+                    // Envelope Search Loop
+                    for (int j = i + 1; j < spectrum.Size; j++)
+                    {
+                        double candidateMz = spectrum.XArray[j];
+
+                        // Accept the peak if it sits on any allowed isotope position after the last accepted peak.
+                        bool matchesIsotopePosition = false;
+                        for (int step = 1; step <= maxIsotopeSteps && !matchesIsotopePosition; step++)
+                        {
+                            matchesIsotopePosition = PpmTolerance.Within(candidateMz, lastEnvelopeMz + step * isotopeSpacing);
+                        }
+
+                        if (matchesIsotopePosition)
+                        {
+                            envelopeIndices.Add(j);
+                            lastEnvelopeMz = candidateMz;
+                        }
+                        else if (candidateMz > PpmTolerance.GetMaximumValue(lastEnvelopeMz + searchWindow))
+                        {
+                            // exit the Envelope loop if we missed more consecutive isotopic peaks than were allowed
+                            break;
+                        }
+                    }
+
+                    // A single peak is not an envelope.
+                    if (envelopeIndices.Count > 1)
+                    {
+                        if (!potentialEnvelopes.TryGetValue(charge, out var envelopesAtCharge))
+                        {
+                            envelopesAtCharge = new List<MinimalSpectrum>();
+                            potentialEnvelopes.Add(charge, envelopesAtCharge);
+                        }
+
+                        envelopesAtCharge.Add(GetMinimalSpectrumFromIndices(spectrum, envelopeIndices, charge));
+                        indicesOfKnownPeaks.UnionWith(envelopeIndices);
+                    }
+                }
+            }
+
+            return potentialEnvelopes;
+        }
+
+
+        /// <summary>
+        /// Builds a MinimalSpectrum from the m/z (XArray) and intensity (YArray) values found at the
+        /// given peak indices, in the order the indices are listed.
+        /// </summary>
+        private static MinimalSpectrum GetMinimalSpectrumFromIndices(MzSpectrum spectrum, List<int> indices, int charge = 0)
+        {
+            double[] selectedMzs = indices.Select(peakIndex => spectrum.XArray[peakIndex]).ToArray();
+            double[] selectedIntensities = indices.Select(peakIndex => spectrum.YArray[peakIndex]).ToArray();
+            return new MinimalSpectrum(selectedMzs, selectedIntensities, charge);
+        }
+
+ }
+}
diff --git a/mzLib/MassSpectrometry/Deconvolution/Deconvoluter.cs b/mzLib/MassSpectrometry/Deconvolution/Deconvoluter.cs
index 70977ebe3..0e2ed3f2e 100644
--- a/mzLib/MassSpectrometry/Deconvolution/Deconvoluter.cs
+++ b/mzLib/MassSpectrometry/Deconvolution/Deconvoluter.cs
@@ -4,6 +4,7 @@
using System.Text;
using System.Threading.Tasks;
using Easy.Common.Extensions;
+using MassSpectrometry.Deconvolution.Algorithms;
using MzLibUtil;
namespace MassSpectrometry
@@ -11,7 +12,7 @@ namespace MassSpectrometry
public enum DeconvolutionTypes
{
ClassicDeconvolution,
- AlexDeconvolution,
+ SpectralDeconvolution,
}
///
@@ -44,9 +45,13 @@ public IEnumerable Deconvolute(MsDataScan scan, MzRange rangeT
switch (DeconvolutionType)
{
case DeconvolutionTypes.ClassicDeconvolution:
+ ((ClassicDeconvolutionParameters)DeconvolutionParameters).Range =
+ new MzRange(scan.IsolationRange.Minimum - 8.5, scan.IsolationRange.Maximum + 8.5);
break;
- case DeconvolutionTypes.AlexDeconvolution:
+ case DeconvolutionTypes.SpectralDeconvolution:
+ ((SpectralDeconvolutionParameters)DeconvolutionParameters).ScanRange =
+ new MzRange(scan.IsolationRange.Minimum - 8.5, scan.IsolationRange.Maximum + 8.5);
break;
}
@@ -69,8 +74,8 @@ private void ConstructDeconvolutionAlgorithm(DeconvolutionParameters deconParame
DeconvolutionAlgorithm = new ClassicDeconvolutionAlgorithm(deconParameters);
break;
- case DeconvolutionTypes.AlexDeconvolution:
- DeconvolutionAlgorithm = new ExampleNewDeconvolutionAlgorithm(deconParameters);
+ case DeconvolutionTypes.SpectralDeconvolution:
+ DeconvolutionAlgorithm = new SpectralDeconvolutionAlgorithm(deconParameters);
break;
default: throw new MzLibException("DeconvolutionType not yet supported");
diff --git a/mzLib/MassSpectrometry/Deconvolution/DeconvoluterExtensions.cs b/mzLib/MassSpectrometry/Deconvolution/DeconvoluterExtensions.cs
index 75723b1ec..1b15858d5 100644
--- a/mzLib/MassSpectrometry/Deconvolution/DeconvoluterExtensions.cs
+++ b/mzLib/MassSpectrometry/Deconvolution/DeconvoluterExtensions.cs
@@ -34,5 +34,20 @@ public static IEnumerable ClassicDeconvoluteMzSpectra(this Dec
return deconvoluter.DeconvolutionAlgorithm.Deconvolute(spectrum, range);
}
}
+
+        /// <summary>
+        /// Sets the spectral parameters' ScanRange to <paramref name="range"/> and deconvolutes the spectrum
+        /// with the deconvoluter's algorithm.
+        /// </summary>
+        /// <exception cref="MzLibException">Thrown when the deconvoluter is not of type SpectralDeconvolution.</exception>
+        public static IEnumerable<IsotopicEnvelope> SpectralDeconvoluteMzSpectra(this Deconvoluter deconvoluter,
+            MzSpectrum spectrum, MzRange range)
+        {
+            bool isSpectralDeconvoluter = deconvoluter.DeconvolutionType == DeconvolutionTypes.SpectralDeconvolution;
+            if (!isSpectralDeconvoluter)
+            {
+                throw new MzLibException("Deconvoluter is not of correct type for this extension method");
+            }
+
+            var spectralParameters = (SpectralDeconvolutionParameters)deconvoluter.DeconvolutionParameters;
+            spectralParameters.ScanRange = range;
+            return deconvoluter.DeconvolutionAlgorithm.Deconvolute(spectrum);
+        }
}
}
diff --git a/mzLib/MassSpectrometry/Deconvolution/MinimalSpectrum.cs b/mzLib/MassSpectrometry/Deconvolution/MinimalSpectrum.cs
new file mode 100644
index 000000000..514ed90ba
--- /dev/null
+++ b/mzLib/MassSpectrometry/Deconvolution/MinimalSpectrum.cs
@@ -0,0 +1,65 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace MassSpectrometry.Deconvolution
+{
+ // Consider defining this as a struct to increase performance
+    /// <summary>
+    /// Lightweight pairing of parallel m/z and intensity arrays with an optional charge state.
+    /// The arrays handed to the constructor are stored by reference, not copied.
+    /// </summary>
+    public class MinimalSpectrum
+    {
+        public readonly double[] MzArray;
+        public readonly double[] IntensityArray;
+        // m/z of the most intense peak, computed once in the constructor
+        public readonly double MostAbundantMz;
+        // Charge state, or 0 if none was assigned
+        public readonly int Charge;
+
+        /// <summary>
+        /// Stores the arrays and charge and pre-computes <see cref="MostAbundantMz"/>.
+        /// </summary>
+        /// <param name="mzArray">m/z values; element i pairs with intensityArray[i].</param>
+        /// <param name="intensityArray">Intensities; must be at least as long as mzArray.</param>
+        /// <param name="charge">Charge state, 0 when unknown.</param>
+        public MinimalSpectrum(double[] mzArray, double[] intensityArray, int charge = 0)
+        {
+            MzArray = mzArray;
+            IntensityArray = intensityArray;
+            MostAbundantMz = GetMostAbundantMz(mzArray, intensityArray);
+            Charge = charge;
+        }
+
+        /// <summary>Returns a copy of the m/z values.</summary>
+        internal double[] GetMzs()
+        {
+            return (double[])MzArray.Clone();
+        }
+
+        /// <summary>Returns a copy of the intensity values.</summary>
+        internal double[] GetIntensities()
+        {
+            // The copy must be taken from IntensityArray, not MzArray.
+            return (double[])IntensityArray.Clone();
+        }
+
+        /// <summary>
+        /// Returns the charge, or 0 if charge was not assigned
+        /// </summary>
+        internal int GetCharge()
+        {
+            return Charge;
+        }
+
+        /// <summary>
+        /// Returns the m/z paired with the largest intensity (first one on ties).
+        /// Returns 0 when the arrays are empty or no intensity is greater than 0.
+        /// </summary>
+        internal static double GetMostAbundantMz(double[] mzArray, double[] intensityArray)
+        {
+            double mostAbundantMz = 0;
+            double maxIntensity = 0;
+            for (int i = 0; i < mzArray.Length; i++)
+            {
+                if (intensityArray[i] > maxIntensity)
+                {
+                    maxIntensity = intensityArray[i];
+                    mostAbundantMz = mzArray[i];
+                }
+            }
+
+            return mostAbundantMz;
+        }
+    }
+}
diff --git a/mzLib/MassSpectrometry/Deconvolution/Parameters/ClassicDeconvolutionParameters.cs b/mzLib/MassSpectrometry/Deconvolution/Parameters/ClassicDeconvolutionParameters.cs
index 0ab4a57af..a66c01735 100644
--- a/mzLib/MassSpectrometry/Deconvolution/Parameters/ClassicDeconvolutionParameters.cs
+++ b/mzLib/MassSpectrometry/Deconvolution/Parameters/ClassicDeconvolutionParameters.cs
@@ -12,6 +12,7 @@ namespace MassSpectrometry
///
public class ClassicDeconvolutionParameters : DeconvolutionParameters
{
+ public MzRange Range { get; set; }
public int MinAssumedChargeState { get; set; }
public int MaxAssumedChargeState { get; set; }
public double DeconvolutionTolerancePpm { get; set; }
@@ -25,12 +26,10 @@ public class ClassicDeconvolutionParameters : DeconvolutionParameters
///
///
/// Isolation range of the scan to be deconvoluted
- public ClassicDeconvolutionParameters(int minCharge, int maxCharge, double deconPpm, double intensityRatio) : base()
+ public ClassicDeconvolutionParameters(int minCharge, int maxCharge, double deconPpm, double intensityRatio, MzRange range = null) :
+ base (minCharge, maxCharge, deconPpm)
{
IntensityRatioLimit = intensityRatio;
- DeconvolutionTolerancePpm = deconPpm;
- MinAssumedChargeState = minCharge;
- MaxAssumedChargeState = maxCharge;
+ // Store the optional range argument; when omitted, Range stays null.
+ Range = range;
}
}
}
diff --git a/mzLib/MassSpectrometry/Deconvolution/Parameters/DeconvolutionParameters.cs b/mzLib/MassSpectrometry/Deconvolution/Parameters/DeconvolutionParameters.cs
index d88cfbd20..0eecb0d87 100644
--- a/mzLib/MassSpectrometry/Deconvolution/Parameters/DeconvolutionParameters.cs
+++ b/mzLib/MassSpectrometry/Deconvolution/Parameters/DeconvolutionParameters.cs
@@ -12,12 +12,19 @@ namespace MassSpectrometry
///
public abstract class DeconvolutionParameters
{
+ public int MinAssumedChargeState { get; set; }
+ public int MaxAssumedChargeState { get; set; }
+ public double DeconvolutionTolerancePpm { get; set; }
+
///
/// Constructor should initialize all fields that are used by every deconvolution algorithm
///
- public DeconvolutionParameters()
+ public DeconvolutionParameters(int minAssumedChargeState, int maxAssumedChargeState,
+ double deconvolutionTolerancePpm)
{
-
+ MinAssumedChargeState = minAssumedChargeState;
+ MaxAssumedChargeState = maxAssumedChargeState;
+ DeconvolutionTolerancePpm = deconvolutionTolerancePpm;
}
}
}
diff --git a/mzLib/MassSpectrometry/Deconvolution/Parameters/ExampleNewDeconvolutionParameters.cs b/mzLib/MassSpectrometry/Deconvolution/Parameters/ExampleNewDeconvolutionParameters.cs
index 9d4e57455..abb6f39ec 100644
--- a/mzLib/MassSpectrometry/Deconvolution/Parameters/ExampleNewDeconvolutionParameters.cs
+++ b/mzLib/MassSpectrometry/Deconvolution/Parameters/ExampleNewDeconvolutionParameters.cs
@@ -10,7 +10,8 @@ namespace MassSpectrometry
[ExcludeFromCodeCoverage]
public class ExampleNewDeconvolutionParameters : DeconvolutionParameters
{
- public ExampleNewDeconvolutionParameters() : base()
+ public ExampleNewDeconvolutionParameters(int minCharge, int maxCharge, double deconPpm) :
+ base (minCharge, maxCharge, deconPpm)
{
}
diff --git a/mzLib/MassSpectrometry/Deconvolution/Parameters/SpectralDeconvolutionParameters.cs b/mzLib/MassSpectrometry/Deconvolution/Parameters/SpectralDeconvolutionParameters.cs
new file mode 100644
index 000000000..7e06c032c
--- /dev/null
+++ b/mzLib/MassSpectrometry/Deconvolution/Parameters/SpectralDeconvolutionParameters.cs
@@ -0,0 +1,55 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+using Proteomics;
+using Proteomics.ProteolyticDigestion;
+using MathNet.Numerics.Optimization;
+using MzLibUtil;
+
+namespace MassSpectrometry
+{
+
+ public class SpectralDeconvolutionParameters : DeconvolutionParameters
+ {
+ public List Proteins { get; }
+ public List FixedModifications { get; }
+ public List VariableModifications { get; }
+ public DigestionParams DigestionParams { get; }
+ public List SilacLabels { get; }
+ // TODO: convert double range to MzRange
+ public DoubleRange ScanRange { get; set; }
+ public bool FindTopDownTruncationProducts { get; }
+ public int BinsPerDalton { get; }
+ public double FineResolutionForIsotopicDistribution { get; }
+ public double MinProbabilityForIsotopicDistribution { get; }
+ public double AmbiguityThresholdForIsotopicDistribution { get; }
+ public int MaxConsecutiveMissedIsotopicPeaks { get; }
+ private bool FindNonDatabasePeaks { get; } // This should be linked to a method that generates Averagine envelopes
+
+
+        /// <summary>
+        /// Stores the charge/tolerance settings in the base class and records the search space
+        /// (proteins, modifications, digestion settings), the scan range built from
+        /// scanMinimumMz..scanMaximumMz, and the envelope-generation settings.
+        /// </summary>
+        public SpectralDeconvolutionParameters(int minAssumedChargeState, int maxAssumedChargeState,
+            double deconvolutionTolerancePpm, List proteins, List fixedModifications,
+            List variableModifications, DigestionParams digestionParams,
+            List silacLabels, bool findTopDownTruncationProducts, double scanMinimumMz, double scanMaximumMz,
+            int binsPerDalton = 10, double fineResolutionForIsotopicDistribution = 0.125, double minProbabilityForIsotopicDistribution = 1e-8,
+            double ambiguityThresholdForIsotopicDistribution = 0.9, int maxConsecutiveMissedIsotopicPeaks = 1,
+            bool findNonDatabasePeaks = false) :
+            base(minAssumedChargeState, maxAssumedChargeState, deconvolutionTolerancePpm)
+        {
+            Proteins = proteins;
+            FixedModifications = fixedModifications;
+            VariableModifications = variableModifications;
+            DigestionParams = digestionParams;
+            SilacLabels = silacLabels;
+            FindTopDownTruncationProducts = findTopDownTruncationProducts;
+            ScanRange = new DoubleRange(scanMinimumMz, scanMaximumMz);
+            BinsPerDalton = binsPerDalton;
+            FineResolutionForIsotopicDistribution = fineResolutionForIsotopicDistribution;
+            MinProbabilityForIsotopicDistribution = minProbabilityForIsotopicDistribution;
+            AmbiguityThresholdForIsotopicDistribution = ambiguityThresholdForIsotopicDistribution;
+            // Without this assignment the property silently stays 0 regardless of the argument.
+            MaxConsecutiveMissedIsotopicPeaks = maxConsecutiveMissedIsotopicPeaks;
+            FindNonDatabasePeaks = findNonDatabasePeaks;
+        }
+ }
+}
diff --git a/mzLib/MassSpectrometry/Deconvolution/Scoring/Scorer.cs b/mzLib/MassSpectrometry/Deconvolution/Scoring/Scorer.cs
new file mode 100644
index 000000000..d7b78b231
--- /dev/null
+++ b/mzLib/MassSpectrometry/Deconvolution/Scoring/Scorer.cs
@@ -0,0 +1,123 @@
+using System;
+using MzLibUtil;
+
+namespace MassSpectrometry.Deconvolution.Scoring;
+
+// Context class for scoring deconvolution hypotheses
+/// <summary>
+/// Chooses a scoring algorithm for deconvolution hypotheses and knows how scores produced by
+/// that algorithm are ranked (for some metrics a lower score is the better one).
+/// </summary>
+public class Scorer
+{
+    public enum ScoringMethods
+    {
+        KullbackLeibler,
+        SpectralContrastAngle
+    }
+
+    public ScoringAlgorithm ScoringAlgorithm { get; private set; }
+    public ScoringMethods ScoringMethod { get; }
+    private double? _poorScore;
+
+    /// <summary>
+    /// Starting value for a best-score search under the selected method; computed on first access and cached.
+    /// </summary>
+    public double PoorScore
+    {
+        get
+        {
+            _poorScore ??= ScoringMethod switch
+            {
+                ScoringMethods.KullbackLeibler => double.MaxValue,
+                ScoringMethods.SpectralContrastAngle => 0,
+                _ => double.MinValue
+            };
+            return _poorScore.Value;
+        }
+    }
+
+    /// <summary>
+    /// Records the scoring method and creates the matching scoring algorithm.
+    /// </summary>
+    /// <exception cref="NotImplementedException">Thrown for any method other than SpectralContrastAngle.</exception>
+    public Scorer(ScoringMethods scoringMethod, PpmTolerance tolerance)
+    {
+        ScoringMethod = scoringMethod;
+        ConstructScoringAlgorithm(tolerance);
+    }
+
+    /// <summary>Scores pre-built arguments with the selected algorithm.</summary>
+    public double Score(IScoreArgs args) => ScoringAlgorithm.GetScore(args);
+
+    /// <summary>Wraps the two spectra in MinimalSpectraArgs and scores them with the selected algorithm.</summary>
+    public double Score(MinimalSpectrum experimentalSpectrum, MinimalSpectrum theoreticalSpectrum)
+    {
+        var spectraArgs = new MinimalSpectraArgs(experimentalSpectrum, theoreticalSpectrum);
+        return ScoringAlgorithm.GetScore(spectraArgs);
+    }
+
+    /// <summary>
+    /// Compares two scores in a method specific fashion. Returns true if the instanceScore (first)
+    /// is better than the argumentScore (second). Outputs the better of the two. This method is necessary
+    /// because there are some metrics where lower scores are better.
+    /// </summary>
+    /// <param name="instanceScore">Newly computed score.</param>
+    /// <param name="argumentScore">Score to compare against.</param>
+    /// <param name="betterScore">instanceScore when it is strictly better, otherwise argumentScore.</param>
+    /// <returns>True only when instanceScore is strictly better than argumentScore.</returns>
+    public bool TestForScoreImprovement(double instanceScore, double argumentScore, out double betterScore)
+    {
+        // Kullback-Leibler ranks lower values as better; every other method ranks higher values as better.
+        bool lowerIsBetter = ScoringMethod == ScoringMethods.KullbackLeibler;
+        bool isImprovement = lowerIsBetter
+            ? instanceScore < argumentScore
+            : instanceScore > argumentScore;
+
+        betterScore = isImprovement ? instanceScore : argumentScore;
+        return isImprovement;
+    }
+
+    // Instantiates the algorithm for ScoringMethod; only SpectralContrastAngle is implemented.
+    private void ConstructScoringAlgorithm(PpmTolerance tolerance)
+    {
+        if (ScoringMethod != ScoringMethods.SpectralContrastAngle)
+        {
+            throw new NotImplementedException();
+        }
+
+        ScoringAlgorithm = new SpectralContrastAlgorithm(tolerance);
+    }
+}
\ No newline at end of file
diff --git a/mzLib/MassSpectrometry/Deconvolution/Scoring/ScoringAlgorithm.cs b/mzLib/MassSpectrometry/Deconvolution/Scoring/ScoringAlgorithm.cs
new file mode 100644
index 000000000..44e2f422d
--- /dev/null
+++ b/mzLib/MassSpectrometry/Deconvolution/Scoring/ScoringAlgorithm.cs
@@ -0,0 +1,44 @@
+using System;
+using System.Collections.Generic;
+using System.Dynamic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+using MzLibUtil;
+
+
+namespace MassSpectrometry.Deconvolution.Scoring
+{
+    /// <summary>
+    /// Base class for scoring algorithms; holds the ppm tolerance available to derived classes.
+    /// </summary>
+    public abstract class ScoringAlgorithm
+    {
+        public PpmTolerance PpmTolerance { get; }
+
+        public ScoringAlgorithm(PpmTolerance tolerance) => PpmTolerance = tolerance;
+
+        /// <summary>Computes a score from the supplied arguments.</summary>
+        public abstract double GetScore(IScoreArgs args);
+    }
+
+ /// <summary>
+ /// Member-less interface used as the argument type of ScoringAlgorithm.GetScore.
+ /// </summary>
+ public interface IScoreArgs
+ {
+ }
+
+    /// <summary>
+    /// Score arguments pairing an experimental spectrum with a theoretical one.
+    /// </summary>
+    public class MinimalSpectraArgs : IScoreArgs
+    {
+        public MinimalSpectrum ExperimentalSpectrum { get; set; }
+        public MinimalSpectrum TheoreticalSpectrum { get; set; }
+
+        public MinimalSpectraArgs(MinimalSpectrum experimentalSpectrum, MinimalSpectrum theoreticalSpectrum) =>
+            (ExperimentalSpectrum, TheoreticalSpectrum) = (experimentalSpectrum, theoreticalSpectrum);
+    }
+
+ /// <summary>
+ /// Score arguments pairing an experimental isotopic envelope with a theoretical one.
+ /// Both properties are left unset by default; there is no constructor.
+ /// </summary>
+ public class IsotopicEnvelopeArgs : IScoreArgs
+ {
+ public IsotopicEnvelope ExperimentalEnvelope { get; set; }
+ public IsotopicEnvelope TheoreticalEnvelope { get; set; }
+ }
+}
diff --git a/mzLib/MassSpectrometry/Deconvolution/Scoring/SpectralContrastAlgorithm.cs b/mzLib/MassSpectrometry/Deconvolution/Scoring/SpectralContrastAlgorithm.cs
new file mode 100644
index 000000000..aeee5e67d
--- /dev/null
+++ b/mzLib/MassSpectrometry/Deconvolution/Scoring/SpectralContrastAlgorithm.cs
@@ -0,0 +1,35 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+using MassSpectrometry.MzSpectra;
+using MzLibUtil;
+
+namespace MassSpectrometry.Deconvolution.Scoring
+{
+    /// <summary>
+    /// Scoring algorithm that compares two minimal spectra via SpectralSimilarity and reports
+    /// their spectral contrast angle.
+    /// </summary>
+    public class SpectralContrastAlgorithm : ScoringAlgorithm
+    {
+        public SpectralContrastAlgorithm(PpmTolerance tolerance) : base(tolerance)
+        {
+        }
+
+        /// <summary>
+        /// Returns the spectral contrast angle of the two spectra in <paramref name="args"/>,
+        /// or 0 when SpectralSimilarity reports no angle.
+        /// </summary>
+        /// <exception cref="ArgumentException">Thrown when args is not a MinimalSpectraArgs.</exception>
+        public override double GetScore(IScoreArgs args)
+        {
+            if (args is not MinimalSpectraArgs spectraArgs)
+            {
+                throw new ArgumentException();
+            }
+
+            var experimental = spectraArgs.ExperimentalSpectrum;
+            var theoretical = spectraArgs.TheoreticalSpectrum;
+            var spectralSimilarity = new SpectralSimilarity(
+                experimental.MzArray, experimental.IntensityArray,
+                theoretical.MzArray, theoretical.IntensityArray,
+                SpectralSimilarity.SpectrumNormalizationScheme.spectrumSum, PpmTolerance.Value,
+                allPeaks: true, filterOutBelowThisMz: 1);
+
+            return spectralSimilarity.SpectralContrastAngle() ?? 0;
+        }
+    }
+}
diff --git a/mzLib/MassSpectrometry/MassSpectrometry.csproj b/mzLib/MassSpectrometry/MassSpectrometry.csproj
index d803402ea..7fd9f1309 100644
--- a/mzLib/MassSpectrometry/MassSpectrometry.csproj
+++ b/mzLib/MassSpectrometry/MassSpectrometry.csproj
@@ -19,4 +19,10 @@
+
+
+ Always
+
+
+
diff --git a/mzLib/MassSpectrometry/MzSpectra/IsotopicEnvelope.cs b/mzLib/MassSpectrometry/MzSpectra/IsotopicEnvelope.cs
index ab85e7228..cb9897791 100644
--- a/mzLib/MassSpectrometry/MzSpectra/IsotopicEnvelope.cs
+++ b/mzLib/MassSpectrometry/MzSpectra/IsotopicEnvelope.cs
@@ -2,6 +2,9 @@
using System;
using System.Collections.Generic;
using System.Linq;
+using Chemistry;
+using MassSpectrometry.Deconvolution;
+using Proteomics.ProteolyticDigestion;
namespace MassSpectrometry
{
@@ -9,29 +12,85 @@ public class IsotopicEnvelope
{
public readonly List<(double mz, double intensity)> Peaks;
public double MonoisotopicMass { get; private set; }
- public double MostAbundantObservedIsotopicMass { get; private set; }
public readonly int Charge;
+
+ // Legacy fields used in the ClassicDeconvolutionAlgorithm
public readonly double TotalIntensity;
public readonly double StDev;
public readonly int MassIndex;
+ public double[] MzArray => Peaks.OrderBy(p => p.mz).Select(p => p.mz).ToArray();
+ public double[] IntensityArray => Peaks.OrderBy(p => p.mz).Select(p => p.intensity).ToArray();
+
+ public double MostAbundantObservedIsotopicMz => _mostAbundantObservedIsotopicMz ?? 0;
+ public double MostAbundantObservedIsotopicMass => MostAbundantObservedIsotopicMz.ToMass(Charge);
+ public double SecondMostAbundantObservedIsotopicMz => _secondMostAbundantObservedIsotopicMz ?? 0;
+ private double? _mostAbundantObservedIsotopicMz;
+ private double? _secondMostAbundantObservedIsotopicMz;
+ public double AmbiguityRatioMinimum { get; }
public double Score { get; private set; }
+ public PeptideWithSetModifications BestPwsmMatch { get; }
- public IsotopicEnvelope(List<(double mz, double intensity)> bestListOfPeaks, double bestMonoisotopicMass, int bestChargeState, double bestTotalIntensity, double bestStDev, int bestMassIndex)
+ public IsotopicEnvelope(List<(double mz, double intensity)> bestListOfPeaks, double bestMonoisotopicMass,
+ int bestChargeState, double bestTotalIntensity, double bestStDev, int bestMassIndex)
{
Peaks = bestListOfPeaks;
MonoisotopicMass = bestMonoisotopicMass;
- MostAbundantObservedIsotopicMass = GetMostAbundantObservedIsotopicMass(bestListOfPeaks, bestChargeState);
Charge = bestChargeState;
+ FindMostAbundantObservedIsotopicMz();
+
TotalIntensity = bestTotalIntensity;
StDev = bestStDev;
MassIndex = bestMassIndex;
Score = ScoreIsotopeEnvelope();
}
- public double GetMostAbundantObservedIsotopicMass(List<(double mz, double intensity)> peaks, int charge)
+ ///
+ /// Takes in an Isotopic Distribution and a given charge state and converts it to an IsotopicEnvelope object
+ /// TODO: Test this function specifically
+ ///
+ /// An IsotopicDistribution generated from a ChemicalFormula
+ /// The charge state (corresponding to the z value of m/z)
+ public IsotopicEnvelope(IsotopicDistribution theoreticalDistribution, int charge, double ambiguityRatioMinimum = 0.9)
+ {
+     // Pair every theoretical mass, converted to m/z at the requested charge, with its intensity.
+     double[] theoreticalMasses = theoreticalDistribution.Masses;
+     double[] theoreticalIntensities = theoreticalDistribution.Intensities;
+     Peaks = theoreticalMasses
+         .Zip(theoreticalIntensities, (mass, abundance) => (mass.ToMz(charge), abundance))
+         .ToList();
+
+     // NOTE(review): the original author flagged this assignment as unverified — confirm with a test.
+     MonoisotopicMass = theoreticalDistribution.MonoIsotopicMass;
+     Charge = charge;
+
+     // The ratio must be assigned before the search below, which reads it.
+     AmbiguityRatioMinimum = ambiguityRatioMinimum;
+     FindMostAbundantObservedIsotopicMz();
+ }
+
+ public IsotopicEnvelope(MinimalSpectrum experimentalSpectrum, PeptideWithSetModifications bestPwsmMatch, double spectralScore = 0)
+ {
+     // Zip the spectrum's parallel m/z and intensity arrays into peak tuples.
+     var mzValues = experimentalSpectrum.MzArray;
+     var intensityValues = experimentalSpectrum.IntensityArray;
+     Peaks = mzValues.Zip(intensityValues, (mz, intensity) => (mz, intensity)).ToList();
+
+     // Mass comes from the matched peptide; charge comes from the spectrum.
+     MonoisotopicMass = bestPwsmMatch.MonoisotopicMass;
+     Charge = experimentalSpectrum.Charge;
+     BestPwsmMatch = bestPwsmMatch;
+     Score = spectralScore;
+
+     // Runs after Charge is set because the search reads the charge-dependent mass property.
+     FindMostAbundantObservedIsotopicMz();
+ }
+
+ ///
+ /// Finds the m/z value of the most intense peak. If the intensity of the second most intense peak
+ /// is at least AmbiguityRatioMinimum times that of the most intense peak (and AmbiguityRatioMinimum
+ /// is greater than zero), its m/z value is stored in the _secondMostAbundantObservedIsotopicMz field
+ ///
+ ///
+ public void FindMostAbundantObservedIsotopicMz()
{
- return (peaks.OrderByDescending(p => p.intensity).ToList()[0].Item1)* charge;
+     // Already resolved to a non-zero mass: nothing to recompute.
+     if (_mostAbundantObservedIsotopicMz.HasValue && MostAbundantObservedIsotopicMass != 0)
+     {
+         return;
+     }
+
+     // Without peaks there is nothing to rank; the public properties then report 0.
+     if (Peaks == null || Peaks.Count == 0)
+     {
+         return;
+     }
+
+     List<(double mz, double intensity)> intensityOrderedPeaks = Peaks.OrderByDescending(p => p.intensity).ToList();
+     _mostAbundantObservedIsotopicMz = intensityOrderedPeaks[0].mz;
+
+     // Record the runner-up only when ambiguity tracking is enabled (ratio > 0)
+     // and its intensity reaches the configured fraction of the top peak.
+     if (AmbiguityRatioMinimum > 0 &&
+         intensityOrderedPeaks.Count > 1 &&
+         intensityOrderedPeaks[1].intensity / intensityOrderedPeaks[0].intensity >= AmbiguityRatioMinimum)
+     {
+         _secondMostAbundantObservedIsotopicMz = intensityOrderedPeaks[1].mz;
+     }
}
public override string ToString()
@@ -39,6 +98,7 @@ public override string ToString()
return Charge + "\t" + Peaks[0].mz.ToString("G8") + "\t" + Peaks.Count + "\t" + TotalIntensity;
}
+ // This should be done using a Strategy pattern
private double ScoreIsotopeEnvelope() //likely created by Stefan Solntsev using peptide data
{
return Peaks.Count >= 2 ?
diff --git a/mzLib/MassSpectrometry/MzSpectra/SpectralSimilarity.cs b/mzLib/MassSpectrometry/MzSpectra/SpectralSimilarity.cs
index be6ac6344..0febdec23 100644
--- a/mzLib/MassSpectrometry/MzSpectra/SpectralSimilarity.cs
+++ b/mzLib/MassSpectrometry/MzSpectra/SpectralSimilarity.cs
@@ -30,6 +30,17 @@ public SpectralSimilarity(MzSpectrum experimentalSpectrum, double[] theoreticalX
_intensityPairs = IntensityPairs(allPeaks);
}
+ ///
+ /// Constructs a spectral similarity object where the P arrays represent the experimental spectrum and the Q arrays represent the theoretical spectrum
+ ///
+ /// Experimental X Array (m/z)
+ /// Experimental Y Array (intensity)
+ /// Theoretical X Array (m/z)
+ /// Theoretical Y Array (intensity)
+ ///
+ ///
+ ///
+ ///
public SpectralSimilarity(double[] P_XArray, double[] P_YArray, double[] Q_XArray, double[] Q_YArray, SpectrumNormalizationScheme scheme, double toleranceInPpm, bool allPeaks, double filterOutBelowThisMz = 300)
{
ExperimentalYArray = Normalize(FilterOutIonsBelowThisMz(P_XArray, P_YArray, filterOutBelowThisMz).Select(p => p.Item2).ToArray(), scheme);
diff --git a/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/AminoAcidPolymer.cs b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/AminoAcidPolymer.cs
new file mode 100644
index 000000000..8c6575ae1
--- /dev/null
+++ b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/AminoAcidPolymer.cs
@@ -0,0 +1,1200 @@
+// Copyright 2012, 2013, 2014 Derek J. Bailey
+// Modified work copyright 2016 Stefan Solntsev
+//
+// This file (AminoAcidPolymer.cs) is part of Proteomics.
+//
+// Proteomics is free software: you can redistribute it and/or modify it
+// under the terms of the GNU Lesser General Public License as published
+// by the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// Proteomics is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+// License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with Proteomics. If not, see .
+
+using System;
+using System.Collections.Generic;
+using System.Collections.ObjectModel;
+using System.Globalization;
+using System.Linq;
+using System.Text;
+using Chemistry;
+using MzLibUtil;
+
+namespace Proteomics.AminoAcidPolymer
+{
+ ///
+ /// A linear polymer of amino acids
+ ///
+ public abstract class AminoAcidPolymer : IEquatable, IHasMass
+ {
+ ///
+ /// The C-terminus chemical formula cap. This is different from the C-Terminus modification.
+ ///
+ private IHasChemicalFormula _cTerminus;
+
+ ///
+ /// The N-terminus chemical formula cap. This is different from the N-Terminus modification.
+ ///
+ private IHasChemicalFormula _nTerminus;
+
+ ///
+ /// All of the modifications indexed by position from N to C. This array is 2 bigger than the amino acid array
+ /// as index 0 and Count - 1 represent the N and C terminus, respectively
+ ///
+ private IHasMass[] _modifications;
+
+ ///
+ /// All of the amino acid residues indexed by position from N to C.
+ ///
+ private Residue[] residues;
+
+ protected AminoAcidPolymer()
+     : this(
+         sequence: string.Empty,
+         nTerm: new ChemicalFormulaTerminus(ChemicalFormula.ParseFormula("H")),
+         cTerm: new ChemicalFormulaTerminus(ChemicalFormula.ParseFormula("OH")))
+ {
+     // Empty polymer capped with H on the N-terminus and OH on the C-terminus.
+ }
+
+ protected AminoAcidPolymer(string sequence)
+     : this(
+         sequence,
+         nTerm: new ChemicalFormulaTerminus(ChemicalFormula.ParseFormula("H")),
+         cTerm: new ChemicalFormulaTerminus(ChemicalFormula.ParseFormula("OH")))
+ {
+     // Uses the default caps: H on the N-terminus and OH on the C-terminus.
+ }
+
+ // Builds a polymer from a residue string with explicit N- and C-terminus caps.
+ protected AminoAcidPolymer(string sequence, IHasChemicalFormula nTerm, IHasChemicalFormula cTerm)
+ {
+ // Start from zero mass before the terminus setters and ParseSequence run.
+ MonoisotopicMass = 0;
+ Length = sequence.Length;
+ residues = new Residue[Length];
+ // Assigned through the properties (not the backing fields) so ReplaceTerminus runs for each cap.
+ NTerminus = nTerm;
+ CTerminus = cTerm;
+ // NOTE(review): ParseSequence is defined outside this view; presumably it fills residues and adds their mass — confirm.
+ ParseSequence(sequence);
+ }
+
+ protected AminoAcidPolymer(AminoAcidPolymer aminoAcidPolymer, bool includeModifications)
+     : this(aminoAcidPolymer, firstResidue: 0, length: aminoAcidPolymer.Length, includeModifications: includeModifications)
+ {
+     // Full-length copy: delegates to the sub-range copy constructor.
+ }
+
+ protected AminoAcidPolymer(AminoAcidPolymer aminoAcidPolymer, int firstResidue, int length, bool includeModifications)
+ {
+     Length = length;
+     residues = new Residue[length];
+
+     bool startsAtNTerminus = firstResidue == 0;
+     bool endsAtCTerminus = firstResidue + length == aminoAcidPolymer.Length;
+
+     // A range that touches an end of the source keeps that end's cap; an interior cut gets the default H / OH.
+     if (startsAtNTerminus)
+     {
+         _nTerminus = aminoAcidPolymer.NTerminus;
+     }
+     else
+     {
+         _nTerminus = new ChemicalFormulaTerminus(ChemicalFormula.ParseFormula("H"));
+     }
+
+     if (endsAtCTerminus)
+     {
+         _cTerminus = aminoAcidPolymer.CTerminus;
+     }
+     else
+     {
+         _cTerminus = new ChemicalFormulaTerminus(ChemicalFormula.ParseFormula("OH"));
+     }
+
+     double runningMass = _nTerminus.MonoisotopicMass + _cTerminus.MonoisotopicMass;
+     Residue[] sourceResidues = aminoAcidPolymer.residues;
+
+     // Residue-level modifications are copied only when requested and when the source has any.
+     bool copyResidueMods = includeModifications && aminoAcidPolymer.ContainsModifications();
+     if (copyResidueMods)
+     {
+         _modifications = new IHasMass[length + 2];
+     }
+
+     for (int target = 0; target < length; target++)
+     {
+         int source = target + firstResidue;
+         Residue aa = sourceResidues[source];
+         residues[target] = aa;
+         runningMass += aa.MonoisotopicMass;
+
+         if (!copyResidueMods)
+         {
+             continue;
+         }
+
+         // Modification slots are shifted by one because slot 0 is the N-terminus.
+         IHasMass mod = aminoAcidPolymer._modifications[source + 1];
+         if (mod != null)
+         {
+             _modifications[target + 1] = mod;
+             runningMass += mod.MonoisotopicMass;
+         }
+     }
+
+     MonoisotopicMass = runningMass;
+
+     // Terminal modifications are carried over only for the ends this range actually includes.
+     if (includeModifications)
+     {
+         if (startsAtNTerminus)
+         {
+             NTerminusModification = aminoAcidPolymer.NTerminusModification;
+         }
+
+         if (endsAtCTerminus)
+         {
+             CTerminusModification = aminoAcidPolymer.CTerminusModification;
+         }
+     }
+ }
+
+ // Read-only wrapper over the modification array (N-terminus slot, one slot per residue, C-terminus slot).
+ public ReadOnlyCollection Modifications
+     => new ReadOnlyCollection<IHasMass>(_modifications);
+
+ ///
+ /// Gets or sets the C terminus of this amino acid polymer
+ ///
+ public IHasChemicalFormula CTerminus
+ {
+     get => _cTerminus;
+     // Routed through ReplaceTerminus rather than assigning the field directly.
+     set => ReplaceTerminus(ref _cTerminus, value);
+ }
+
+ ///
+ /// Gets or sets the N terminus of this amino acid polymer
+ ///
+ public IHasChemicalFormula NTerminus
+ {
+     get => _nTerminus;
+     // Routed through ReplaceTerminus rather than assigning the field directly.
+     set => ReplaceTerminus(ref _nTerminus, value);
+ }
+
+ ///
+ /// Gets the number of amino acids in this amino acid polymer
+ ///
+ public int Length { get; private set; }
+
+ ///
+ /// The total monoisotopic mass of this peptide and all of its modifications
+ ///
+ public double MonoisotopicMass { get; private set; }
+
+ ///
+ /// Returns the amino acid sequence with all isoleucines (I) replaced with leucines (L);
+ ///
+ /// The amino acid sequence with all I's into L's
+ // Base sequence with every 'I' written as 'L'.
+ public virtual string BaseLeucineSequence => BaseSequence.Replace('I', 'L');
+
+ ///
+ /// Gets the base amino acid sequence
+ ///
+ public string BaseSequence
+ {
+     get
+     {
+         // One letter per residue, in N-to-C order.
+         char[] letters = new char[residues.Length];
+         for (int i = 0; i < letters.Length; i++)
+         {
+             letters[i] = residues[i].Letter;
+         }
+         return new string(letters);
+     }
+ }
+
+ ///
+ /// Gets or sets the modification of the C terminus on this amino acid polymer
+ ///
+ public IHasMass CTerminusModification
+ {
+     // The C-terminal modification occupies the last slot, index Length + 1.
+     get => GetModification(Length + 1);
+     set => ReplaceMod(Length + 1, value);
+ }
+
+ ///
+ /// Gets or sets the modification of the N terminus on this amino acid polymer
+ ///
+ public IHasMass NTerminusModification
+ {
+     // The N-terminal modification occupies slot 0.
+     get => GetModification(0);
+     set => ReplaceMod(0, value);
+ }
+
+ ///
+ /// Returns all fragments that are present in either fragmentation of A or B, but not in both
+ ///
+ public static IEnumerable GetSiteDeterminingFragments(AminoAcidPolymer peptideA, AminoAcidPolymer peptideB, FragmentTypes types)
+ {
+     // Seed with A's fragments, then keep only what occurs in exactly one of the two sets.
+     var exclusiveFragments = new HashSet<Fragment>(peptideA.Fragment(types));
+     var fragmentsOfB = peptideB.Fragment(types);
+     exclusiveFragments.SymmetricExceptWith(fragmentsOfB);
+     return exclusiveFragments;
+ }
+
+ ///
+ /// Gets the digestion points (starting index and length) of an amino acid sequence
+ ///
+ /// The sequence to cleave
+ /// The proteases to cleave with
+ /// The maximum number of missed cleavages to allow
+ /// The minimum amino acid length of the peptides
+ /// The maximum amino acid length of the peptides
+ ///
+ ///
+ /// A collection of cleavage points and the length of the cut (Item1 = index, Item2 = length)
+ public static IEnumerable GetDigestionPointsAndLengths(string sequence, IEnumerable proteases, int maxMissedCleavages, int minLength, int maxLength, bool methionineInitiator, bool semiDigestion)
+ {
+     // True when a peptide of this length falls inside the caller's [minLength, maxLength] window.
+     bool IsAllowedLength(int candidate) => candidate >= minLength && candidate <= maxLength;
+
+     // Sorted cleavage positions, including the two terminus sentinels.
+     int[] indices = GetCleavageIndexes(sequence, proteases).ToArray();
+
+     // An empty sequence has no first residue, so it cannot carry an initiator methionine.
+     bool includeMethionineCut = methionineInitiator && sequence.Length > 0 && sequence[0] == 'M';
+
+     int indiciesCount = indices.Length - 1;
+
+     for (int missedCleavages = 0; missedCleavages <= maxMissedCleavages; missedCleavages++)
+     {
+         int max = indiciesCount - missedCleavages;
+         int offset = missedCleavages + 1;
+         for (int i = 0; i < max; i++)
+         {
+             int len = indices[i + offset] - indices[i];
+
+             // Initiator methionine: also emit the N-terminal peptide with its leading M removed.
+             if (indices[i] == -1 && includeMethionineCut)
+             {
+                 int newLength = len - 1;
+                 if (IsAllowedLength(newLength))
+                 {
+                     yield return new DigestionPointAndLength(1, newLength);
+                     if (semiDigestion)
+                     {
+                         // Shorter peptides that keep the same start (index 1).
+                         for (int j = 1; j < newLength; j++)
+                         {
+                             if (IsAllowedLength(j))
+                             {
+                                 yield return new DigestionPointAndLength(1, j);
+                             }
+                         }
+                     }
+                 }
+             }
+
+             if (!IsAllowedLength(len))
+             {
+                 continue;
+             }
+
+             yield return new DigestionPointAndLength(indices[i] + 1, len);
+             if (semiDigestion)
+             {
+                 for (int j = 1; j < len; j++)
+                 {
+                     // Trim j residues from the front, keeping the original end.
+                     if (IsAllowedLength(len - j))
+                     {
+                         yield return new DigestionPointAndLength(indices[i] + 1 + j, len - j);
+                     }
+                     // Keep the original start and only the first j residues.
+                     if (IsAllowedLength(j))
+                     {
+                         yield return new DigestionPointAndLength(indices[i] + 1, j);
+                     }
+                 }
+             }
+         }
+     }
+ }
+
+ // Convenience overload: cleavage positions with both terminus sentinels included.
+ public static IEnumerable GetCleavageIndexes(string sequence, IEnumerable proteases)
+     => GetCleavageIndexes(sequence, proteases, includeTermini: true);
+
+ ///
+ /// Gets the location of all the possible cleavage points for a given sequence and set of proteases
+ ///
+ /// The sequence to determine the cleavage points for
+ /// The proteases to cleave with
+ /// Include the N and C terminus (-1 and Length - 1)
+ /// A collection of all the sites where the proteases would cleave
+ public static IEnumerable GetCleavageIndexes(string sequence, IEnumerable proteases, bool includeTermini)
+ {
+     // The sorted set merges the sites of all proteases and drops duplicates.
+     var cutSites = new SortedSet<int>();
+     foreach (IProtease enzyme in proteases)
+     {
+         // Null entries in the protease list are skipped.
+         if (enzyme == null)
+         {
+             continue;
+         }
+         cutSites.UnionWith(enzyme.GetDigestionSites(sequence));
+     }
+
+     if (includeTermini)
+     {
+         // Sentinels for the two ends of the sequence.
+         cutSites.Add(-1);
+         cutSites.Add(sequence.Length - 1);
+     }
+
+     return cutSites;
+ }
+
+ public static IEnumerable Digest(string sequence, IEnumerable proteases, int maxMissedCleavages, int minLength, int maxLength, bool methionineInitiator, bool semiDigestion)
+ {
+     // Compute the cut positions, then slice each one out of the sequence as the result is enumerated.
+     var cuts = GetDigestionPointsAndLengths(sequence, proteases, maxMissedCleavages, minLength, maxLength, methionineInitiator, semiDigestion);
+     return cuts.Select(cut => sequence.Substring(cut.Index, cut.Length));
+ }
+
+ // Digests with defaults: up to 3 missed cleavages, any length from 1 up,
+ // initiator-methionine handling on, semi-digestion off.
+ public static IEnumerable Digest(AminoAcidPolymer sequence, IProtease protease)
+     => Digest(sequence, protease, maxMissedCleavages: 3, minLength: 1, maxLength: int.MaxValue, methionineInitiator: true, semiDigestion: false);
+
+ public static IEnumerable Digest(AminoAcidPolymer polymer, IProtease protease, int maxMissedCleavages, int minLength, int maxLength, bool methionineInitiator, bool semiDigestion)
+ {
+     // Wrap the single protease in an array and digest the polymer's unmodified base sequence.
+     IProtease[] singleProtease = new[] { protease };
+     return Digest(polymer.BaseSequence, singleProtease, maxMissedCleavages, minLength, maxLength, methionineInitiator, semiDigestion);
+ }
+
+ // Residue at a zero-based position, or null when the position is outside [0, Length).
+ public Residue GetResidue(int position)
+     => position >= 0 && position < Length ? residues[position] : null;
+
+ ///
+ /// Checks if an amino acid residue with the value of 'residue' is contained in this polymer
+ ///
+ /// The character code for the amino acid residue
+ /// True if any amino acid residue is the same as the specified character
+ public bool Contains(char residue)
+ {
+     // Linear scan; stops at the first residue whose letter matches.
+     foreach (Residue aa in residues)
+     {
+         if (aa.Letter == residue)
+         {
+             return true;
+         }
+     }
+     return false;
+ }
+
+ ///
+ /// Checks if the amino acid residue is contained in this polymer
+ ///
+ /// The residue to check for
+ /// True if the polymer contains the specified residue, False otherwise
+ // True when the residue array holds an element equal to the given residue.
+ public bool Contains(Residue residue)
+     => Array.IndexOf(residues, residue) >= 0;
+
+ // Annotated sequence with isoleucines left as they are.
+ public string GetSequenceWithModifications()
+     => GetSequenceWithModifications(leucineSequence: false);
+
+ public string GetSequenceWithModifications(bool leucineSequence)
+ {
+     // Without a modification array the plain sequence is the whole answer.
+     if (_modifications == null)
+     {
+         if (leucineSequence)
+         {
+             return BaseLeucineSequence;
+         }
+         return BaseSequence;
+     }
+
+     StringBuilder annotated = new StringBuilder(Length);
+
+     // N-terminal modification is written as "[mod]-" ahead of the residues.
+     // Only modifications with a positive mass are printed.
+     IHasMass nTermMod = _modifications[0];
+     if (nTermMod != null && nTermMod.MonoisotopicMass > 0)
+     {
+         annotated.Append('[').Append(nTermMod).Append("]-");
+     }
+
+     for (int position = 0; position < Length; position++)
+     {
+         char letter = residues[position].Letter;
+         annotated.Append(leucineSequence && letter == 'I' ? 'L' : letter);
+
+         // A residue's modification lives one slot to the right of the residue index.
+         IHasMass residueMod = _modifications[position + 1];
+         if (residueMod != null && residueMod.MonoisotopicMass > 0)
+         {
+             annotated.Append('[').Append(residueMod).Append(']');
+         }
+     }
+
+     // C-terminal modification is written as "-[mod]" after the residues.
+     IHasMass cTermMod = _modifications[Length + 1];
+     if (cTermMod != null && cTermMod.MonoisotopicMass > 0)
+     {
+         annotated.Append("-[").Append(cTermMod).Append(']');
+     }
+
+     return annotated.ToString();
+ }
+
+ ///
+ /// Gets the total number of amino acid residues in this amino acid polymer
+ ///
+ /// The number of amino acid residues
+ // Same value as Length.
+ public int ResidueCount() => Length;
+
+ public int ResidueCount(Residue aminoAcid)
+ {
+     // A null query matches nothing.
+     if (aminoAcid == null)
+     {
+         return 0;
+     }
+     return residues.Count(candidate => candidate.Equals(aminoAcid));
+ }
+
+ ///
+ /// Gets the number of amino acids residues in this amino acid polymer that
+ /// has the specified residue letter
+ ///
+ /// The residue letter to search for
+ /// The number of amino acid residues that have the same letter in this polymer
+ public int ResidueCount(char residueLetter)
+ {
+     // Tally residues whose one-letter code matches.
+     int matches = 0;
+     foreach (Residue aa in residues)
+     {
+         if (aa.Letter == residueLetter)
+         {
+             matches++;
+         }
+     }
+     return matches;
+ }
+
+ public int ResidueCount(char residueLetter, int index, int length)
+ {
+     // Count only inside the requested window of the residue array.
+     var window = residues.SubArray(index, length);
+     return window.Count(aa => aa.Letter == residueLetter);
+ }
+
+ public int ResidueCount(Residue aminoAcid, int index, int length)
+ {
+     // Count only inside the requested window of the residue array.
+     var window = residues.SubArray(index, length);
+     return window.Count(aa => aa.Equals(aminoAcid));
+ }
+
+ public int ElementCountWithIsotopes(string element)
+ {
+     // Contribution of the residues.
+     int fromResidues = residues.Sum(aa => aa.ThisChemicalFormula.CountWithIsotopes(element));
+
+     // Contribution of modifications that expose a chemical formula; others are ignored.
+     int fromModifications = 0;
+     if (_modifications != null)
+     {
+         fromModifications = _modifications
+             .OfType<IHasChemicalFormula>()
+             .Sum(mod => mod.ThisChemicalFormula.CountWithIsotopes(element));
+     }
+
+     // A fixed H2O is added on top of the residues and modifications.
+     int fromWater = ChemicalFormula.ParseFormula("H2O").CountWithIsotopes(element);
+
+     return fromResidues + fromModifications + fromWater;
+ }
+
+ public int SpecificIsotopeCount(Isotope isotope)
+ {
+     // Contribution of the residues.
+     int fromResidues = residues.Sum(aa => aa.ThisChemicalFormula.CountSpecificIsotopes(isotope));
+
+     // Contribution of modifications that expose a chemical formula; others are ignored.
+     int fromModifications = 0;
+     if (_modifications != null)
+     {
+         fromModifications = _modifications
+             .OfType<IHasChemicalFormula>()
+             .Sum(mod => mod.ThisChemicalFormula.CountSpecificIsotopes(isotope));
+     }
+
+     return fromResidues + fromModifications;
+ }
+
+ ///
+ /// Calculates the fragments that are different between this and another aminoacidpolymer
+ ///
+ ///
+ ///
+ ///
+ // Instance form of the static overload, with this polymer as peptide A.
+ public IEnumerable GetSiteDeterminingFragments(AminoAcidPolymer other, FragmentTypes type)
+     => GetSiteDeterminingFragments(peptideA: this, peptideB: other, types: type);
+
+ // All fragments of the given types, without chemical formulas.
+ public IEnumerable Fragment(FragmentTypes types)
+     => Fragment(types, calculateChemicalFormula: false);
+
+ ///
+ /// Calculates all the fragments of the types you specify
+ ///
+ ///
+ ///
+ ///
+ // Fragment numbers 1 through Length - 1 for the given types.
+ public IEnumerable Fragment(FragmentTypes types, bool calculateChemicalFormula)
+     => Fragment(types, minIndex: 1, maxIndex: Length - 1, calculateChemicalFormula: calculateChemicalFormula);
+
+ // A single fragment number, without chemical formulas.
+ public IEnumerable Fragment(FragmentTypes types, int number)
+     => Fragment(types, number, calculateChemicalFormula: false);
+
+ // A single fragment number: the range collapses to [number, number].
+ public IEnumerable Fragment(FragmentTypes types, int number, bool calculateChemicalFormula)
+     => Fragment(types, minIndex: number, maxIndex: number, calculateChemicalFormula: calculateChemicalFormula);
+
+ // A range of fragment numbers, without chemical formulas.
+ public IEnumerable Fragment(FragmentTypes types, int minIndex, int maxIndex)
+     => Fragment(types, minIndex, maxIndex, calculateChemicalFormula: false);
+
+ public IEnumerable Fragment(FragmentTypes types, int minIndex, int maxIndex, bool calculateChemicalFormula)
+ {
+     foreach (FragmentTypes type in types.GetIndividualFragmentTypes())
+     {
+         // Formula tracking is switched off for this type as soon as a modification without a formula is met.
+         bool trackFormula = calculateChemicalFormula;
+         ChemicalFormula capFormula = type.GetIonCap();
+         bool fromCTerminus = type.GetTerminus() == Terminus.C;
+
+         double runningMass = capFormula.MonoisotopicMass;
+         ChemicalFormula runningFormula = new ChemicalFormula(capFormula);
+
+         IHasChemicalFormula terminus = fromCTerminus ? CTerminus : NTerminus;
+         runningMass += terminus.MonoisotopicMass;
+         if (trackFormula)
+         {
+             runningFormula.Add(terminus);
+         }
+
+         bool hasModArray = _modifications != null;
+
+         for (int i = 0; i <= maxIndex; i++)
+         {
+             // Step 0 addresses the terminus slot; later steps walk inward one residue at a time.
+             int aaIndex = fromCTerminus ? Length - i : i - 1;
+
+             if (i > 0)
+             {
+                 runningMass += residues[aaIndex].MonoisotopicMass;
+                 runningFormula.Add(residues[aaIndex]);
+             }
+
+             // The modification slot for this step is always aaIndex + 1.
+             IHasMass mod = hasModArray ? _modifications[aaIndex + 1] : null;
+             if (mod != null)
+             {
+                 runningMass += mod.MonoisotopicMass;
+                 if (trackFormula)
+                 {
+                     if (mod is IHasChemicalFormula modFormula)
+                     {
+                         runningFormula.Add(modFormula);
+                     }
+                     else
+                     {
+                         trackFormula = false;
+                     }
+                 }
+             }
+
+             // Step 0 only accounts for the terminus modification and never yields;
+             // steps below minIndex accumulate without yielding.
+             if (i == 0 || i < minIndex)
+             {
+                 continue;
+             }
+
+             if (trackFormula)
+             {
+                 yield return new ChemicalFormulaFragment(type, i, runningFormula, this);
+             }
+             else
+             {
+                 yield return new Fragment(type, i, runningMass, this);
+             }
+         }
+     }
+ }
+
+ public bool ContainsModifications()
+ {
+     // No array means no modifications at all.
+     if (_modifications == null)
+     {
+         return false;
+     }
+
+     // An allocated array counts only if at least one slot is filled.
+     foreach (IHasMass mod in _modifications)
+     {
+         if (mod != null)
+         {
+             return true;
+         }
+     }
+     return false;
+ }
+
+ public ISet GetUniqueModifications() where T : IHasMass
+ {
+     // The set removes duplicates; slots that are null or of another type are skipped.
+     var distinctMods = new HashSet<T>();
+     if (_modifications != null)
+     {
+         distinctMods.UnionWith(_modifications.OfType<T>());
+     }
+     return distinctMods;
+ }
+
+ ///
+ /// Counts the total number of modifications on this polymer that are not null
+ ///
+ /// The number of modifications
+ public int ModificationCount()
+ {
+     // No array allocated means nothing has been set.
+     if (_modifications == null)
+     {
+         return 0;
+     }
+     return _modifications.Count(mod => mod != null);
+ }
+
+ ///
+ /// Get the modification at the given residue number
+ ///
+ /// The amino acid residue number
+ /// The modification at the site, null if there isn't any modification present
+ public IHasMass GetModification(int residueNumber)
+ {
+     // Null when the modification array has not been allocated.
+     if (_modifications == null)
+     {
+         return null;
+     }
+     return _modifications[residueNumber];
+ }
+
+ ///
+ /// Sets the modification at the terminus of this amino acid polymer
+ ///
+ /// The modification to set
+ /// The termini to set the mod at
+ public virtual void SetModification(IHasMass modification, Terminus terminus)
+ {
+     // Terminus is tested as a bit mask, so a value carrying both bits sets both ends.
+     bool touchesNTerminus = (terminus & Terminus.N) == Terminus.N;
+     bool touchesCTerminus = (terminus & Terminus.C) == Terminus.C;
+
+     if (touchesNTerminus)
+     {
+         NTerminusModification = modification;
+     }
+
+     if (touchesCTerminus)
+     {
+         CTerminusModification = modification;
+     }
+ }
+
+ ///
+ /// Sets the modification at specific sites on this amino acid polymer
+ ///
+ /// The modification to set
+ /// The sites to set the modification at
+ /// The number of modifications added to this amino acid polymer
+ public virtual int SetModification(IHasMass modification, ModificationSites sites)
+ {
+     bool wantsNTerminus = (sites & ModificationSites.NPep) == ModificationSites.NPep;
+     bool wantsCTerminus = (sites & ModificationSites.PepC) == ModificationSites.PepC;
+     int applied = 0;
+
+     if (wantsNTerminus)
+     {
+         NTerminusModification = modification;
+         applied++;
+     }
+
+     // Every residue whose own site bits are all contained in the mask gets the modification.
+     for (int position = 0; position < Length; position++)
+     {
+         ModificationSites residueSite = residues[position].Site;
+         if ((sites & residueSite) != residueSite)
+         {
+             continue;
+         }
+
+         ReplaceMod(position + 1, modification);
+         applied++;
+     }
+
+     if (wantsCTerminus)
+     {
+         CTerminusModification = modification;
+         applied++;
+     }
+
+     return applied;
+ }
+
+ ///
+ /// Sets the modification at specific sites on this amino acid polymer
+ ///
+ /// The modification to set
+ /// The residue character to set the modification at
+ /// The number of modifications added to this amino acid polymer
+ public virtual int SetModification(IHasMass modification, char letter)
+ {
+     int applied = 0;
+     for (int position = 0; position < Length; position++)
+     {
+         if (residues[position].Letter == letter)
+         {
+             // Modification slots are 1-based relative to the residue index.
+             ReplaceMod(position + 1, modification);
+             applied++;
+         }
+     }
+     return applied;
+ }
+
+ ///
+ /// Sets the modification at specific sites on this amino acid polymer
+ ///
+ /// The modification to set
+ /// The residue to set the modification at
+ /// The number of modifications added to this amino acid polymer
+ public virtual int SetModification(IHasMass modification, Residue residue)
+ {
+     int applied = 0;
+     for (int position = 0; position < Length; position++)
+     {
+         // Residues are matched by their one-letter code.
+         if (residue.Letter == residues[position].Letter)
+         {
+             ReplaceMod(position + 1, modification);
+             applied++;
+         }
+     }
+     return applied;
+ }
+
+ ///
+ /// Sets the modification at specific sites on this amino acid polymer
+ ///
+ /// The modification to set
+ /// The residue number to set the modification at
+ public virtual void SetModification(IHasMass modification, int residueNumber)
+ {
+     // Residue numbers are 1-based; the terminus slots 0 and Length + 1 are rejected here.
+     bool inRange = residueNumber >= 1 && residueNumber <= Length;
+     if (!inRange)
+     {
+         string message = string.Format(CultureInfo.InvariantCulture, "Residue number not in the correct range: [{0}-{1}] you specified: {2}", 1, Length, residueNumber);
+         throw new MzLibException(message);
+     }
+
+     ReplaceMod(residueNumber, modification);
+ }
+
+ public void SetModifications(IEnumerable modifications)
+ {
+     // Each modification is applied at the sites it declares for itself.
+     foreach (OldSchoolModification current in modifications)
+     {
+         ModificationSites declaredSites = current.Sites;
+         SetModification(current, declaredSites);
+     }
+ }
+
+ // Applies the modification at the sites it declares for itself.
+ public void SetModification(OldSchoolModification mod)
+     => SetModification(mod, mod.Sites);
+
+ ///
+ ///
+ ///
+ ///
+ /// (1-based) residue number
+ public void SetModification(IHasMass mod, params int[] residueNumbers)
+ {
+     // Forward each 1-based residue number to the single-position overload, in the order given.
+     for (int i = 0; i < residueNumbers.Length; i++)
+     {
+         SetModification(mod, residueNumbers[i]);
+     }
+ }
+
+ ///
+ /// Replaces all instances of the old modification with the new modification in this polymer
+ ///
+ /// The modification to remove
+ /// The modification to replace it with
+ /// The number of modifications added to this amino acid polymer
+ public virtual int ReplaceModification(IHasMass oldMod, IHasMass newMod)
+ {
+     if (oldMod == null)
+     {
+         throw new MzLibException("Cannot replace a null modification");
+     }
+
+     // Scan every slot, both terminus slots included.
+     int replaced = 0;
+     for (int slot = 0; slot < Length + 2; slot++)
+     {
+         IHasMass existing = GetModification(slot);
+         if (existing != null && oldMod.Equals(existing))
+         {
+             ReplaceMod(slot, newMod);
+             replaced++;
+         }
+     }
+     return replaced;
+ }
+
+ ///
+ /// Adds the modification at the terminus of this amino acid polymer, combining modifications if a modification is already present
+ ///
+ /// The modification to set
+ /// The termini to set the mod at
+ public virtual int AddModification(IHasMass modification, Terminus terminus)
+ {
+     // An occupied slot gets a ModificationCollection of old + new; an empty slot gets the new one as is.
+     IHasMass Merge(IHasMass existing)
+     {
+         if (existing == null)
+         {
+             return modification;
+         }
+         return new ModificationCollection(existing, modification);
+     }
+
+     bool touchesNTerminus = (terminus & Terminus.N) == Terminus.N;
+     bool touchesCTerminus = (terminus & Terminus.C) == Terminus.C;
+     int added = 0;
+
+     if (touchesNTerminus)
+     {
+         NTerminusModification = Merge(NTerminusModification);
+         added++;
+     }
+
+     if (touchesCTerminus)
+     {
+         CTerminusModification = Merge(CTerminusModification);
+         added++;
+     }
+
+     return added;
+ }
+
+ // Adds the modification at the sites it declares for itself.
+ public virtual int AddModification(OldSchoolModification modification)
+     => AddModification(modification, modification.Sites);
+
+ public virtual int AddModification(IHasMass modification, ModificationSites sites)
+ {
+     // An occupied slot gets a ModificationCollection of old + new; an empty slot gets the new one as is.
+     IHasMass Merge(IHasMass existing)
+     {
+         if (existing == null)
+         {
+             return modification;
+         }
+         return new ModificationCollection(existing, modification);
+     }
+
+     // The residue loop below indexes the array directly, so it must exist first.
+     if (_modifications == null)
+     {
+         _modifications = new IHasMass[Length + 2];
+     }
+
+     bool wantsNTerminus = (sites & ModificationSites.NPep) == ModificationSites.NPep;
+     bool wantsCTerminus = (sites & ModificationSites.PepC) == ModificationSites.PepC;
+     int added = 0;
+
+     if (wantsNTerminus)
+     {
+         NTerminusModification = Merge(NTerminusModification);
+         added++;
+     }
+
+     for (int position = 0; position < Length; position++)
+     {
+         ModificationSites residueSite = residues[position].Site;
+         if ((sites & residueSite) != residueSite)
+         {
+             continue;
+         }
+
+         ReplaceMod(position + 1, Merge(_modifications[position + 1]));
+         added++;
+     }
+
+     if (wantsCTerminus)
+     {
+         CTerminusModification = Merge(CTerminusModification);
+         added++;
+     }
+
+     return added;
+ }
+
+ ///
+ /// Adds the modification at specific sites on this amino acid polymer, combining modifications if a modification is already present
+ ///
+ /// The modification to set
+ /// The location to set the modification at
+ public virtual void AddModification(IHasMass modification, int location)
+ {
+     IHasMass existing = GetModification(location);
+
+     // Combine with whatever already occupies the slot instead of overwriting it.
+     IHasMass combined;
+     if (existing == null)
+     {
+         combined = modification;
+     }
+     else
+     {
+         combined = new ModificationCollection(existing, modification);
+     }
+
+     ReplaceMod(location, combined);
+ }
+
+ ///
+ /// Clears the modification set at the terminus of this amino acid polymer back
+ /// to the default C or N modifications.
+ ///
+ /// The termini to clear the mod at
+ public void ClearModifications(Terminus terminus)
+ {
+     // Terminus is tested as a bit mask, so a value carrying both bits clears both ends.
+     bool clearNTerminus = (terminus & Terminus.N) == Terminus.N;
+     bool clearCTerminus = (terminus & Terminus.C) == Terminus.C;
+
+     if (clearNTerminus)
+     {
+         NTerminusModification = null;
+     }
+
+     if (clearCTerminus)
+     {
+         CTerminusModification = null;
+     }
+ }
+
+ ///
+ /// Clear the modifications from the specified sites(s)
+ ///
+ /// The sites to remove modifications from
+ public void ClearModifications(ModificationSites sites)
+ {
+     // Either the peptide-level or the protein-level flag clears the N-terminus slot.
+     if ((sites & ModificationSites.NPep) == ModificationSites.NPep || (sites & ModificationSites.NProt) == ModificationSites.NProt)
+     {
+         ReplaceMod(0, null);
+     }
+
+     for (int i = 0; i < Length; i++)
+     {
+         int modIndex = i + 1;
+
+         // GetModification tolerates a missing modification array, so a polymer that was
+         // never modified is skipped here instead of raising a NullReferenceException.
+         if (GetModification(modIndex) == null)
+         {
+             continue;
+         }
+
+         ModificationSites curSite = residues[i].Site;
+
+         // Clear only residues whose own site bits are all contained in the mask.
+         if ((curSite & sites) == curSite)
+         {
+             ReplaceMod(modIndex, null);
+         }
+     }
+
+     // Either the peptide-level or the protein-level flag clears the C-terminus slot.
+     if ((sites & ModificationSites.PepC) == ModificationSites.PepC || (sites & ModificationSites.ProtC) == ModificationSites.ProtC)
+     {
+         ReplaceMod(Length + 1, null);
+     }
+ }
+
+ ///
+ /// Clear all modifications from this amino acid polymer.
+ /// Includes N and C terminus modifications.
+ ///
+ public void ClearModifications()
+ {
+     if (!ContainsModifications())
+     {
+         return;
+     }
+
+     // Empty every slot, both terminus slots included, removing each modification's mass from the total.
+     for (int slot = 0; slot < Length + 2; slot++)
+     {
+         IHasMass existing = _modifications[slot];
+         if (existing != null)
+         {
+             MonoisotopicMass -= existing.MonoisotopicMass;
+             _modifications[slot] = null;
+         }
+     }
+ }
+
+ ///
+ /// Removes the specified mod from all locations on this polymer
+ ///
+ /// The modification to remove from this polymer
+ public void ClearModifications(IHasMass mod)
+ {
+     // Nothing to remove when no modification was given or none has ever been set on this
+     // polymer; both cases previously raised a NullReferenceException.
+     if (mod == null || _modifications == null)
+     {
+         return;
+     }
+
+     // Scan every slot, both terminus slots included.
+     for (int i = 0; i <= Length + 1; i++)
+     {
+         if (!mod.Equals(_modifications[i]))
+         {
+             continue;
+         }
+
+         MonoisotopicMass -= mod.MonoisotopicMass;
+         _modifications[i] = null;
+     }
+ }
+
+ /// <summary>
+ /// Builds the chemical formula of this amino acid polymer by summing the formulas of
+ /// its modifications, both termini and all residues.
+ /// </summary>
+ /// <returns>A new formula holding the accumulated total</returns>
+ /// <exception cref="MzLibException">A modification is set that does not expose a chemical formula</exception>
+ public ChemicalFormula GetChemicalFormula()
+ {
+     ChemicalFormula total = new ChemicalFormula();
+
+     // Modifications (slots 0 .. Length + 1)
+     if (ContainsModifications())
+     {
+         int slotCount = Length + 2;
+         for (int slot = 0; slot < slotCount; slot++)
+         {
+             IHasMass candidate = _modifications[slot];
+             if (candidate != null)
+             {
+                 IHasChemicalFormula withFormula = candidate as IHasChemicalFormula;
+                 if (withFormula == null)
+                 {
+                     throw new MzLibException("Modification " + candidate + " does not have a chemical formula!");
+                 }
+
+                 total.Add(withFormula.ThisChemicalFormula);
+             }
+         }
+     }
+
+     // Termini
+     total.Add(NTerminus.ThisChemicalFormula);
+     total.Add(CTerminus.ThisChemicalFormula);
+
+     // Residues
+     for (int position = 0; position < Length; position++)
+     {
+         total.Add(residues[position].ThisChemicalFormula);
+     }
+
+     return total;
+ }
+
+ /// <summary>
+ /// Returns the sequence including its modifications.
+ /// </summary>
+ public override string ToString() => GetSequenceWithModifications();
+
+ /// <summary>
+ /// Hashes on the base sequence only.
+ /// </summary>
+ public override int GetHashCode() => BaseSequence.GetHashCode();
+
+ /// <summary>
+ /// Compares this polymer with an arbitrary object; anything that is not an
+ /// <see cref="AminoAcidPolymer"/> is unequal.
+ /// </summary>
+ public override bool Equals(object obj)
+ {
+     AminoAcidPolymer candidate = obj as AminoAcidPolymer;
+     if (candidate == null)
+     {
+         return false;
+     }
+
+     return Equals(candidate);
+ }
+
+ /// <summary>
+ /// Compares two polymers by length, terminus formulas, modifications (slot by slot)
+ /// and residues (position by position).
+ /// </summary>
+ /// <param name="other">The polymer to compare against; null is never equal</param>
+ public bool Equals(AminoAcidPolymer other)
+ {
+     if (other == null)
+     {
+         return false;
+     }
+
+     if (Length != other.Length)
+     {
+         return false;
+     }
+
+     if (!NTerminus.ThisChemicalFormula.Equals(other.NTerminus.ThisChemicalFormula))
+     {
+         return false;
+     }
+
+     if (!CTerminus.ThisChemicalFormula.Equals(other.CTerminus.ThisChemicalFormula))
+     {
+         return false;
+     }
+
+     bool hasMods = ContainsModifications();
+     if (hasMods != other.ContainsModifications())
+     {
+         return false;
+     }
+
+     for (int slot = 0; slot <= Length + 1; slot++)
+     {
+         if (hasMods && !Equals(_modifications[slot], other._modifications[slot]))
+         {
+             return false;
+         }
+
+         // The modification array has two extra terminal slots; only slots 1..Length map to a residue.
+         bool isResidueSlot = slot >= 1 && slot <= Length;
+         if (isResidueSlot && !residues[slot - 1].Equals(other.residues[slot - 1]))
+         {
+             return false;
+         }
+     }
+
+     return true;
+ }
+
+ /// <summary>
+ /// Swaps a terminus for a new value while keeping the monoisotopic mass in step:
+ /// the old terminus mass (if any) is removed first, then the new one (if any) is added.
+ /// </summary>
+ /// <param name="terminus">The terminus field to overwrite</param>
+ /// <param name="value">The replacement terminus, or null</param>
+ private void ReplaceTerminus(ref IHasChemicalFormula terminus, IHasChemicalFormula value)
+ {
+     if (terminus != null)
+     {
+         MonoisotopicMass -= terminus.MonoisotopicMass;
+     }
+
+     terminus = value;
+
+     if (value != null)
+     {
+         MonoisotopicMass += value.MonoisotopicMass;
+     }
+ }
+
+ /// <summary>
+ /// Stores a modification in the given slot of the modification array and adjusts the
+ /// monoisotopic mass by the difference between the old and the new entry.
+ /// </summary>
+ /// <param name="index">The slot to write; no bounds validation is done here, callers validate beforehand</param>
+ /// <param name="mod">The modification to store, or null to clear the slot</param>
+ private void ReplaceMod(int index, IHasMass mod)
+ {
+     // The array is created on first use, sized for every residue plus the two termini.
+     if (_modifications == null)
+     {
+         _modifications = new IHasMass[Length + 2];
+     }
+
+     IHasMass previous = _modifications[index];
+
+     if (!Equals(mod, previous))
+     {
+         if (previous != null)
+         {
+             MonoisotopicMass -= previous.MonoisotopicMass; // drop the mass of the entry being replaced
+         }
+
+         _modifications[index] = mod;
+
+         if (mod != null)
+         {
+             MonoisotopicMass += mod.MonoisotopicMass; // account for the new entry
+         }
+     }
+ }
+
+ /// <summary>
+ /// Parses a sequence string of amino acid letters, with optional bracketed
+ /// modifications, into this polymer's residues and modifications.
+ /// </summary>
+ /// <remarks>
+ /// Text between '[' and ']' is read as a modification: first as a chemical formula and, if
+ /// that fails with an <see cref="MzLibException"/>, as a numeric mass. Numeric masses are
+ /// parsed with the invariant culture so that a '.' decimal separator is read the same way on
+ /// every machine (the mass-only modification also prints itself with the invariant culture).
+ /// A modification is stored in the slot equal to the number of residues read so far, or one
+ /// slot further when a '-' was seen after at least one residue (C-terminal modification).
+ /// </remarks>
+ /// <param name="sequence">The sequence string to parse</param>
+ /// <exception cref="MzLibException">
+ /// A modification can be read neither as a formula nor as a number, a character is neither a
+ /// known residue nor '[' or '-', or a '[' is never closed.
+ /// </exception>
+ private void ParseSequence(string sequence)
+ {
+     bool inMod = false;
+     bool cterminalMod = false; // n or c terminal modification
+     int index = 0;
+
+     double monoMass = 0;
+
+     StringBuilder modSb = new StringBuilder(10);
+     foreach (char letter in sequence)
+     {
+         if (inMod)
+         {
+             if (letter == ']')
+             {
+                 inMod = false; // end the modification phase
+
+                 string modString = modSb.ToString();
+                 modSb.Clear();
+                 IHasMass modification;
+                 try
+                 {
+                     modification = new OldSchoolChemicalFormulaModification(ChemicalFormula.ParseFormula(modString));
+                 }
+                 catch (MzLibException)
+                 {
+                     // Same number styles as the culture-sensitive overload, but culture-independent.
+                     if (double.TryParse(modString, NumberStyles.Float | NumberStyles.AllowThousands, CultureInfo.InvariantCulture, out double mass))
+                     {
+                         modification = new ModWithOnlyMass(mass);
+                     }
+                     else
+                     {
+                         throw new MzLibException("Unable to correctly parse the following modification: " + modString);
+                     }
+                 }
+
+                 monoMass += modification.MonoisotopicMass;
+
+                 if (_modifications == null)
+                 {
+                     _modifications = new IHasMass[Length + 2];
+                 }
+
+                 if (cterminalMod)
+                 {
+                     _modifications[index + 1] = modification;
+                 }
+                 else
+                 {
+                     _modifications[index] = modification;
+                 }
+
+                 cterminalMod = false;
+             }
+             else
+             {
+                 modSb.Append(letter);
+             }
+         }
+         else
+         {
+             //char upperletter = char.ToUpper(letter); // moved to amino acid dictionary
+             if (Residue.TryGetResidue(letter, out Residue residue))
+             {
+                 residues[index++] = residue;
+                 monoMass += residue.MonoisotopicMass;
+             }
+             else
+             {
+                 switch (letter)
+                 {
+                     case '[': // start of a modification
+                         inMod = true;
+                         break;
+
+                     case '-': // End of an n-terminus mod or start of a c-terminus mod
+                         cterminalMod = (index > 0);
+                         break;
+
+                     default:
+                         throw new MzLibException(string.Format(CultureInfo.InvariantCulture, "Amino Acid Letter {0} does not exist in the Amino Acid Dictionary. {0} is also not a valid character", letter));
+                 }
+             }
+         }
+     }
+
+     if (inMod)
+     {
+         throw new MzLibException("Couldn't find the closing ] for a modification in this sequence: " + sequence);
+     }
+
+     // Shrink the storage to the number of residues actually read.
+     Length = index;
+     MonoisotopicMass += monoMass;
+     Array.Resize(ref residues, Length);
+     if (_modifications != null)
+     {
+         Array.Resize(ref _modifications, Length + 2);
+     }
+ }
+
+ /// <summary>
+ /// A modification that carries nothing but a mass value.
+ /// </summary>
+ private class ModWithOnlyMass : IHasMass
+ {
+     private readonly double mass;
+
+     public ModWithOnlyMass(double mass)
+     {
+         this.mass = mass;
+     }
+
+     /// <summary>
+     /// The mass this modification was created with
+     /// </summary>
+     public double MonoisotopicMass => mass;
+
+     /// <summary>
+     /// The mass formatted with the invariant culture
+     /// </summary>
+     public override string ToString() => mass.ToString(CultureInfo.InvariantCulture);
+ }
+ }
+}
\ No newline at end of file
diff --git a/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/AminoAcidPolymerExtensions.cs b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/AminoAcidPolymerExtensions.cs
new file mode 100644
index 000000000..2bc13a1f0
--- /dev/null
+++ b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/AminoAcidPolymerExtensions.cs
@@ -0,0 +1,74 @@
+// Copyright 2012, 2013, 2014 Derek J. Bailey
+// Modified work copyright 2016 Stefan Solntsev
+//
+// This file (AminoAcidPolymerExtensions.cs) is part of Proteomics.
+//
+// Proteomics is free software: you can redistribute it and/or modify it
+// under the terms of the GNU Lesser General Public License as published
+// by the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// Proteomics is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+// License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with Proteomics. If not, see <http://www.gnu.org/licenses/>.
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+
+namespace Proteomics.AminoAcidPolymer
+{
+ /// <summary>
+ /// Sequence-coverage helpers for <see cref="AminoAcidPolymer"/>.
+ /// </summary>
+ public static class AminoAcidPolymerExtensions
+ {
+     /// <summary>
+     /// Fraction of positions in <paramref name="baseSequence"/> covered by at least one of
+     /// <paramref name="sequences"/>, comparing leucine-substituted sequences.
+     /// </summary>
+     public static double GetSequenceCoverageFraction(this AminoAcidPolymer baseSequence, IEnumerable<AminoAcidPolymer> sequences)
+     {
+         return GetSequenceCoverageFraction(baseSequence, sequences, true);
+     }
+
+     /// <summary>
+     /// Fraction of positions in <paramref name="baseSequence"/> covered by at least one of
+     /// <paramref name="sequences"/>.
+     /// </summary>
+     /// <param name="baseSequence">The polymer whose coverage is measured</param>
+     /// <param name="sequences">The polymers that may cover it</param>
+     /// <param name="useLeucineSequence">Compare BaseLeucineSequence instead of BaseSequence</param>
+     /// <returns>Covered positions divided by the base length (NaN for a zero-length base, since that is 0 / 0)</returns>
+     public static double GetSequenceCoverageFraction(this AminoAcidPolymer baseSequence, IEnumerable<AminoAcidPolymer> sequences, bool useLeucineSequence)
+     {
+         int[] counts = baseSequence.GetSequenceCoverage(sequences, useLeucineSequence);
+         return ((double)counts.Count(x => x > 0)) / baseSequence.Length;
+     }
+
+     /// <summary>
+     /// Per-position coverage counts of <paramref name="baseSequence"/>, comparing
+     /// leucine-substituted sequences.
+     /// </summary>
+     public static int[] GetSequenceCoverage(this AminoAcidPolymer baseSequence, IEnumerable<AminoAcidPolymer> sequences)
+     {
+         return GetSequenceCoverage(baseSequence, sequences, true);
+     }
+
+     /// <summary>
+     /// Counts, for every position of <paramref name="baseSequence"/>, how many occurrences of
+     /// the given polymers overlap that position. Overlapping occurrences of the same polymer
+     /// are all counted.
+     /// </summary>
+     /// <param name="baseSequence">The polymer whose coverage is measured</param>
+     /// <param name="allPolymers">The polymers to search for inside the base sequence</param>
+     /// <param name="useLeucineSequence">Compare BaseLeucineSequence instead of BaseSequence</param>
+     /// <returns>An array with one count per position of the base sequence</returns>
+     public static int[] GetSequenceCoverage(this AminoAcidPolymer baseSequence, IEnumerable<AminoAcidPolymer> allPolymers, bool useLeucineSequence)
+     {
+         int[] bits = new int[baseSequence.Length];
+
+         string masterSequence = useLeucineSequence ? baseSequence.BaseLeucineSequence : baseSequence.BaseSequence;
+
+         foreach (AminoAcidPolymer polymer in allPolymers)
+         {
+             string seq = useLeucineSequence ? polymer.BaseLeucineSequence : polymer.BaseSequence;
+
+             // An empty sequence "matches" at every offset and covers nothing; searching for it
+             // would eventually push the start index past the end of the string and throw.
+             if (string.IsNullOrEmpty(seq))
+             {
+                 continue;
+             }
+
+             int startIndex = 0;
+             while (true)
+             {
+                 int index = masterSequence.IndexOf(seq, startIndex, StringComparison.Ordinal);
+
+                 if (index < 0)
+                 {
+                     break;
+                 }
+
+                 for (int aa = index; aa < index + polymer.Length; aa++)
+                 {
+                     bits[aa]++;
+                 }
+
+                 // Advance by one so overlapping occurrences are found as well.
+                 startIndex = index + 1;
+             }
+         }
+         return bits;
+     }
+ }
+}
\ No newline at end of file
diff --git a/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/ChemicalFormulaFragment.cs b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/ChemicalFormulaFragment.cs
new file mode 100644
index 000000000..655b73608
--- /dev/null
+++ b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/ChemicalFormulaFragment.cs
@@ -0,0 +1,33 @@
+// Copyright 2012, 2013, 2014 Derek J. Bailey
+// Modified work copyright 2016 Stefan Solntsev
+//
+// This file (ChemicalFormulaFragment.cs) is part of Proteomics.
+//
+// Proteomics is free software: you can redistribute it and/or modify it
+// under the terms of the GNU Lesser General Public License as published
+// by the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// Proteomics is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+// License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with Proteomics. If not, see <http://www.gnu.org/licenses/>.
+
+using Chemistry;
+
+namespace Proteomics.AminoAcidPolymer
+{
+ /// <summary>
+ /// A fragment that also carries its chemical formula.
+ /// </summary>
+ public class ChemicalFormulaFragment : Fragment, IHasChemicalFormula
+ {
+     /// <summary>
+     /// The formula of this fragment; a separate instance re-parsed from the formula text
+     /// handed to the constructor.
+     /// </summary>
+     public ChemicalFormula ThisChemicalFormula { get; private set; }
+
+     /// <summary>
+     /// Creates the fragment, passing the formula's monoisotopic mass to the base fragment.
+     /// </summary>
+     public ChemicalFormulaFragment(FragmentTypes type, int number, ChemicalFormula formula, AminoAcidPolymer parent)
+         : base(type, number, formula.MonoisotopicMass, parent)
+     {
+         string formulaText = formula.Formula;
+         ThisChemicalFormula = ChemicalFormula.ParseFormula(formulaText);
+     }
+ }
+}
\ No newline at end of file
diff --git a/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/ChemicalFormulaModification.cs b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/ChemicalFormulaModification.cs
new file mode 100644
index 000000000..7f4f93580
--- /dev/null
+++ b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/ChemicalFormulaModification.cs
@@ -0,0 +1,57 @@
+// Copyright 2012, 2013, 2014 Derek J. Bailey
+// Modified work copyright 2016 Stefan Solntsev
+//
+// This file (ChemicalFormulaModification.cs) is part of Proteomics.
+//
+// Proteomics is free software: you can redistribute it and/or modify it
+// under the terms of the GNU Lesser General Public License as published
+// by the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// Proteomics is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+// License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with Proteomics. If not, see <http://www.gnu.org/licenses/>.
+
+using Chemistry;
+
+namespace Proteomics.AminoAcidPolymer
+{
+ /// <summary>
+ /// A modification whose mass is taken from a chemical formula.
+ /// </summary>
+ public class OldSchoolChemicalFormulaModification : OldSchoolModification, IHasChemicalFormula
+ {
+     /// <summary>
+     /// Full constructor: the base modification receives the formula's monoisotopic mass.
+     /// </summary>
+     public OldSchoolChemicalFormulaModification(ChemicalFormula chemicalFormula, string name, ModificationSites sites)
+         : base(chemicalFormula.MonoisotopicMass, name, sites)
+     {
+         ThisChemicalFormula = chemicalFormula;
+     }
+
+     /// <summary>
+     /// Named modification applicable to <see cref="ModificationSites.Any"/>.
+     /// </summary>
+     public OldSchoolChemicalFormulaModification(ChemicalFormula chemicalFormula, string name)
+         : this(chemicalFormula, name, ModificationSites.Any)
+     {
+     }
+
+     /// <summary>
+     /// Unnamed modification for the given sites; the formula text becomes the name.
+     /// </summary>
+     public OldSchoolChemicalFormulaModification(ChemicalFormula chemicalFormula, ModificationSites sites)
+         : this(chemicalFormula, "", sites)
+     {
+         Name = ThisChemicalFormula.Formula;
+     }
+
+     /// <summary>
+     /// Unnamed modification applicable to <see cref="ModificationSites.Any"/>.
+     /// </summary>
+     public OldSchoolChemicalFormulaModification(ChemicalFormula chemicalFormula)
+         : this(chemicalFormula, ModificationSites.Any)
+     {
+     }
+
+     /// <summary>
+     /// Copy constructor; the formula is re-parsed from the other modification's formula text.
+     /// </summary>
+     public OldSchoolChemicalFormulaModification(OldSchoolChemicalFormulaModification other)
+         : this(ChemicalFormula.ParseFormula(other.ThisChemicalFormula.Formula), other.Name, other.Sites)
+     {
+     }
+
+     /// <summary>
+     /// The chemical formula of this modification
+     /// </summary>
+     public ChemicalFormula ThisChemicalFormula { get; private set; }
+ }
+}
\ No newline at end of file
diff --git a/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/ChemicalFormulaTerminus.cs b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/ChemicalFormulaTerminus.cs
new file mode 100644
index 000000000..3479a5068
--- /dev/null
+++ b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/ChemicalFormulaTerminus.cs
@@ -0,0 +1,42 @@
+// Copyright 2016 Stefan Solntsev
+//
+// This file (ChemicalFormulaTerminus.cs) is part of Proteomics.
+//
+// Proteomics is free software: you can redistribute it and/or modify it
+// under the terms of the GNU Lesser General Public License as published
+// by the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// Proteomics is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+// License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with Proteomics. If not, see <http://www.gnu.org/licenses/>.
+
+using Chemistry;
+
+namespace Proteomics.AminoAcidPolymer
+{
+ /// <summary>
+ /// A terminus described by a chemical formula.
+ /// </summary>
+ public class ChemicalFormulaTerminus : IHasChemicalFormula
+ {
+     public ChemicalFormulaTerminus(ChemicalFormula chemicalFormula)
+     {
+         ThisChemicalFormula = chemicalFormula;
+     }
+
+     /// <summary>
+     /// The formula given at construction
+     /// </summary>
+     public ChemicalFormula ThisChemicalFormula { get; private set; }
+
+     /// <summary>
+     /// The monoisotopic mass of <see cref="ThisChemicalFormula"/>
+     /// </summary>
+     public double MonoisotopicMass => ThisChemicalFormula.MonoisotopicMass;
+ }
+}
\ No newline at end of file
diff --git a/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/DigestionPoint.cs b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/DigestionPoint.cs
new file mode 100644
index 000000000..d906b736a
--- /dev/null
+++ b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/DigestionPoint.cs
@@ -0,0 +1,14 @@
+namespace Proteomics.AminoAcidPolymer
+{
+ /// <summary>
+ /// An index paired with a length.
+ /// </summary>
+ public class DigestionPointAndLength
+ {
+     /// <summary>
+     /// The index given at construction
+     /// </summary>
+     public int Index { get; private set; }
+
+     /// <summary>
+     /// The length given at construction
+     /// </summary>
+     public int Length { get; private set; }
+
+     public DigestionPointAndLength(int index, int length)
+     {
+         Length = length;
+         Index = index;
+     }
+ }
+}
\ No newline at end of file
diff --git a/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/Fragment.cs b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/Fragment.cs
new file mode 100644
index 000000000..6d0c7806a
--- /dev/null
+++ b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/Fragment.cs
@@ -0,0 +1,98 @@
+// Copyright 2012, 2013, 2014 Derek J. Bailey
+// Modified work copyright 2016 Stefan Solntsev
+//
+// This file (Fragment.cs) is part of Proteomics.
+//
+// Proteomics is free software: you can redistribute it and/or modify it
+// under the terms of the GNU Lesser General Public License as published
+// by the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// Proteomics is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+// License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with Proteomics. If not, see <http://www.gnu.org/licenses/>.
+
+using System;
+using System.Collections.Generic;
+using System.Globalization;
+using Chemistry;
+
+namespace Proteomics.AminoAcidPolymer
+{
+ /// <summary>
+ /// A terminal fragment of an amino acid polymer: a fragment type, the number of residues it
+ /// spans, its mass and the polymer it came from.
+ /// </summary>
+ public class Fragment : IHasMass, IEquatable<Fragment>
+ {
+     public Fragment(FragmentTypes type, int number, double monoisotopicMass, AminoAcidPolymer parent)
+     {
+         FragmentType = type;
+         Number = number;
+         Parent = parent;
+         MonoisotopicMass = monoisotopicMass;
+     }
+
+     public double MonoisotopicMass { get; private set; }
+
+     /// <summary>
+     /// Number of residues in the fragment, counted from its terminus
+     /// </summary>
+     public int Number { get; private set; }
+
+     public AminoAcidPolymer Parent { get; private set; }
+
+     public FragmentTypes FragmentType { get; private set; }
+
+     /// <summary>
+     /// The non-null parent modifications that fall inside this fragment. For an N-terminal
+     /// fragment these are slots 0..Number of the parent's modifications; otherwise the last
+     /// Number + 1 slots, ending at slot Parent.Length + 1.
+     /// </summary>
+     public IEnumerable<IHasMass> Modifications
+     {
+         get
+         {
+             var mods = Parent.Modifications;
+             if (FragmentType.GetTerminus() == Terminus.N)
+             {
+                 for (int i = 0; i <= Number; i++)
+                 {
+                     if (mods[i] != null)
+                     {
+                         yield return mods[i];
+                     }
+                 }
+             }
+             else
+             {
+                 int length = Parent.Length + 1;
+                 for (int i = length - Number; i <= length; i++)
+                 {
+                     if (mods[i] != null)
+                     {
+                         yield return mods[i];
+                     }
+                 }
+             }
+         }
+     }
+
+     /// <summary>
+     /// The part of the parent's base sequence covered by this fragment: the first Number
+     /// letters for an N-terminal fragment, the last Number letters otherwise.
+     /// </summary>
+     public string Sequence
+     {
+         get
+         {
+             string parentSeq = Parent.BaseSequence;
+             if (FragmentType.GetTerminus() == Terminus.N)
+             {
+                 return parentSeq.Substring(0, Number);
+             }
+
+             return parentSeq.Substring(parentSeq.Length - Number, Number);
+         }
+     }
+
+     /// <summary>
+     /// The fragment type name immediately followed by the number, e.g. type b with number 3 gives "b3"
+     /// </summary>
+     public override string ToString()
+     {
+         return string.Format(CultureInfo.InvariantCulture, "{0}{1}", Enum.GetName(typeof(FragmentTypes), FragmentType), Number);
+     }
+
+     // NOTE(review): the hash is derived from the mass while Equals compares masses with a
+     // tolerance, so two fragments that are equal within the tolerance can hash differently.
+     public override int GetHashCode()
+     {
+         return MonoisotopicMass.GetHashCode();
+     }
+
+     /// <summary>
+     /// Object-typed equality, kept consistent with <see cref="Equals(Fragment)"/>.
+     /// </summary>
+     public override bool Equals(object obj)
+     {
+         return Equals(obj as Fragment);
+     }
+
+     /// <summary>
+     /// Two fragments are equal when type and number match and the masses differ by less than 1e-9.
+     /// </summary>
+     /// <param name="other">The fragment to compare against; null is never equal</param>
+     public bool Equals(Fragment other)
+     {
+         if (ReferenceEquals(other, null))
+         {
+             return false;
+         }
+
+         return FragmentType.Equals(other.FragmentType) && Number.Equals(other.Number) && Math.Abs(MonoisotopicMass - other.MonoisotopicMass) < 1e-9;
+     }
+ }
+}
\ No newline at end of file
diff --git a/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/FragmentTypes.cs b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/FragmentTypes.cs
new file mode 100644
index 000000000..5ef1f3c88
--- /dev/null
+++ b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/FragmentTypes.cs
@@ -0,0 +1,98 @@
+// Copyright 2012, 2013, 2014 Derek J. Bailey
+// Modified work copyright 2016 Stefan Solntsev
+//
+// This file (FragmentTypes.cs) is part of Proteomics.
+//
+// Proteomics is free software: you can redistribute it and/or modify it
+// under the terms of the GNU Lesser General Public License as published
+// by the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// Proteomics is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+// License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with Proteomics. If not, see <http://www.gnu.org/licenses/>.
+
+using System;
+using System.Collections.Generic;
+using Chemistry;
+using MzLibUtil;
+
+namespace Proteomics.AminoAcidPolymer
+{
+ /// <summary>
+ /// Bit flags naming the fragment ion types. Every named type occupies its own bit, so
+ /// several types can be combined in a single value.
+ /// </summary>
+ [Flags]
+ public enum FragmentTypes
+ {
+ None = 0,
+ a = 1 << 0,
+ adot = 1 << 1,
+ b = 1 << 2,
+ bdot = 1 << 3,
+ c = 1 << 4,
+ cdot = 1 << 5,
+ x = 1 << 6,
+ xdot = 1 << 7,
+ y = 1 << 8,
+ ydot = 1 << 9,
+ z = 1 << 10,
+ zdot = 1 << 11,
+ Internal = 1 << 12,
+ All = (1 << 12) - 1, // Bits 0 through 11: every type from a to zdot, but not Internal
+ }
+
+ /// <summary>
+ /// Helpers for working with <see cref="FragmentTypes"/> values.
+ /// </summary>
+ public static class FragmentTypesExtension
+ {
+     // Formula attached to each single fragment type; there is no entry for None, Internal or All.
+     private static readonly Dictionary<FragmentTypes, ChemicalFormula> FragmentIonCaps = new Dictionary<FragmentTypes, ChemicalFormula>
+     {
+         {FragmentTypes.a, ChemicalFormula.ParseFormula("C-1H-1O-1")},
+         {FragmentTypes.adot, ChemicalFormula.ParseFormula("C-1O-1")},
+         {FragmentTypes.b, ChemicalFormula.ParseFormula("H-1")},
+         {FragmentTypes.bdot, new ChemicalFormula()},
+         {FragmentTypes.c, ChemicalFormula.ParseFormula("NH2")},
+         {FragmentTypes.cdot, ChemicalFormula.ParseFormula("NH3")},
+         {FragmentTypes.x, ChemicalFormula.ParseFormula("COH-1")},
+         {FragmentTypes.xdot, ChemicalFormula.ParseFormula("CO")},
+         {FragmentTypes.y, ChemicalFormula.ParseFormula("H")},
+         {FragmentTypes.ydot, ChemicalFormula.ParseFormula("H2")},
+         {FragmentTypes.z, ChemicalFormula.ParseFormula("N-1H-2")},
+         {FragmentTypes.zdot, ChemicalFormula.ParseFormula("N-1H-1")}
+     };
+
+     /// <summary>
+     /// Enumerates the single fragment types whose bit is set in <paramref name="fragmentTypes"/>.
+     /// None, All and Internal are never returned.
+     /// </summary>
+     public static IEnumerable<FragmentTypes> GetIndividualFragmentTypes(this FragmentTypes fragmentTypes)
+     {
+         foreach (FragmentTypes site in Enum.GetValues(typeof(FragmentTypes)))
+         {
+             if (site == FragmentTypes.None || site == FragmentTypes.All || site == FragmentTypes.Internal)
+             {
+                 continue;
+             }
+             if ((fragmentTypes & site) == site)
+             {
+                 yield return site;
+             }
+         }
+     }
+
+     /// <summary>
+     /// Returns the terminus of a single fragment type: values from x upwards map to the
+     /// C terminus, everything below x to the N terminus.
+     /// </summary>
+     /// <exception cref="MzLibException">The value is None or has more than one bit set</exception>
+     public static Terminus GetTerminus(this FragmentTypes fragmentType)
+     {
+         // Super handy: http://stackoverflow.com/questions/4624248/c-logical-riddle-with-bit-operations-only-one-bit-is-set
+         // v & (v - 1) clears the lowest set bit, so a non-zero result means several bits are set.
+         if (fragmentType == FragmentTypes.None || (fragmentType & (fragmentType - 1)) != FragmentTypes.None)
+         {
+             throw new MzLibException("Fragment Type must be a single value to determine the terminus");
+         }
+         return fragmentType >= FragmentTypes.x ? Terminus.C : Terminus.N;
+     }
+
+     /// <summary>
+     /// Returns the ion cap formula registered for a single fragment type.
+     /// </summary>
+     /// <exception cref="MzLibException">The value is None or has more than one bit set</exception>
+     /// <exception cref="KeyNotFoundException">The single type has no registered cap (e.g. Internal)</exception>
+     public static ChemicalFormula GetIonCap(this FragmentTypes fragmentType)
+     {
+         if (fragmentType == FragmentTypes.None || (fragmentType & (fragmentType - 1)) != FragmentTypes.None)
+         {
+             throw new MzLibException("Fragment Type must be a single value to determine the ion cap");
+         }
+         return FragmentIonCaps[fragmentType];
+     }
+ }
+}
\ No newline at end of file
diff --git a/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/IProtease.cs b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/IProtease.cs
new file mode 100644
index 000000000..60a27219c
--- /dev/null
+++ b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/IProtease.cs
@@ -0,0 +1,42 @@
+// Copyright 2012, 2013, 2014 Derek J. Bailey
+// Modified work copyright 2016 Stefan Solntsev
+//
+// This file (IProtease.cs) is part of Proteomics.
+//
+// Proteomics is free software: you can redistribute it and/or modify it
+// under the terms of the GNU Lesser General Public License as published
+// by the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// Proteomics is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+// License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with Proteomics. If not, see <http://www.gnu.org/licenses/>.
+
+using System.Collections.Generic;
+
+namespace Proteomics.AminoAcidPolymer
+{
+ /// <summary>
+ /// A proteolytic enzyme that cuts amino acids at specific residues.
+ /// </summary>
+ // NOTE(review): the element type of the IEnumerable return values is not visible here;
+ // presumably IEnumerable<int> given the "1-based indices" wording below - confirm.
+ public interface IProtease
+ {
+ /// <summary>
+ /// Finds the indices of where this protease would cut in
+ /// the given amino acid sequence
+ /// </summary>
+ /// <param name="aminoAcidSequence">The amino acid sequence to cut</param>
+ /// <returns>A set of the 1-based indices to cut at</returns>
+ IEnumerable GetDigestionSites(string aminoAcidSequence);
+
+ /// <summary>
+ /// Same as the string overload, but takes the polymer itself
+ /// </summary>
+ IEnumerable GetDigestionSites(AminoAcidPolymer aminoAcidSequence);
+
+ /// <summary>
+ /// Missed-cleavage count for a sequence given as text
+ /// </summary>
+ int MissedCleavages(string sequence);
+
+ /// <summary>
+ /// Missed-cleavage count for a sequence given as a polymer
+ /// </summary>
+ int MissedCleavages(AminoAcidPolymer aminoAcidSequence);
+ }
+}
\ No newline at end of file
diff --git a/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/ModificationCollection.cs b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/ModificationCollection.cs
new file mode 100644
index 000000000..cb7939c39
--- /dev/null
+++ b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/ModificationCollection.cs
@@ -0,0 +1,122 @@
+// Copyright 2012, 2013, 2014 Derek J. Bailey
+// Modified work copyright 2016 Stefan Solntsev
+//
+// This file (ModificationCollection.cs) is part of Proteomics.
+//
+// Proteomics is free software: you can redistribute it and/or modify it
+// under the terms of the GNU Lesser General Public License as published
+// by the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// Proteomics is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+// License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with Proteomics. If not, see <http://www.gnu.org/licenses/>.
+
+using System;
+using System.Collections;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using Chemistry;
+using MzLibUtil;
+
+namespace Proteomics.AminoAcidPolymer
+{
+ /// <summary>
+ /// A group of modifications that behaves as a single modification whose mass is the sum
+ /// of its members' masses.
+ /// </summary>
+ /// <remarks>
+ /// Equality is value based and ignores member order. <see cref="Equals(object)"/> is
+ /// overridden as well, so callers that compare modifications through the static
+ /// object.Equals(a, b) see the same result as the typed comparison.
+ /// </remarks>
+ public class ModificationCollection : ICollection<IHasMass>, IEquatable<ModificationCollection>, IHasChemicalFormula
+ {
+     private readonly List<IHasMass> _modifications;
+
+     public ModificationCollection(params IHasMass[] mods)
+     {
+         _modifications = mods.ToList();
+         MonoisotopicMass = _modifications.Sum(m => m.MonoisotopicMass);
+     }
+
+     /// <summary>
+     /// Running total of the members' masses, kept up to date by Add, Remove and Clear
+     /// </summary>
+     public double MonoisotopicMass { get; private set; }
+
+     public int Count
+     {
+         get { return _modifications.Count; }
+     }
+
+     public bool IsReadOnly
+     {
+         get { return false; }
+     }
+
+     /// <summary>
+     /// A new formula built by adding every member as an IHasChemicalFormula.
+     /// </summary>
+     public ChemicalFormula ThisChemicalFormula
+     {
+         get
+         {
+             ChemicalFormula chemicalFormula = new ChemicalFormula();
+             foreach (var ok in _modifications)
+             {
+                 // NOTE(review): a member without a chemical formula is passed on as null here;
+                 // what ChemicalFormula.Add does with null is not visible from this file - confirm.
+                 chemicalFormula.Add(ok as IHasChemicalFormula);
+             }
+             return chemicalFormula;
+         }
+     }
+
+     /// <summary>
+     /// The members' string forms joined with " | "
+     /// </summary>
+     public override string ToString()
+     {
+         StringBuilder sb = new StringBuilder();
+         foreach (IHasMass mod in _modifications)
+         {
+             sb.Append(mod);
+             sb.Append(" | ");
+         }
+         if (sb.Length > 0)
+         {
+             sb.Remove(sb.Length - 3, 3); // drop the trailing separator
+         }
+         return sb.ToString();
+     }
+
+     public void Add(IHasMass item)
+     {
+         _modifications.Add(item);
+         MonoisotopicMass += item.MonoisotopicMass;
+     }
+
+     public void Clear()
+     {
+         _modifications.Clear();
+         MonoisotopicMass = 0;
+     }
+
+     public bool Contains(IHasMass item)
+     {
+         return _modifications.Contains(item);
+     }
+
+     public void CopyTo(IHasMass[] array, int arrayIndex)
+     {
+         _modifications.CopyTo(array, arrayIndex);
+     }
+
+     /// <summary>
+     /// Removes one occurrence of <paramref name="item"/> and subtracts its mass.
+     /// </summary>
+     /// <returns>False when the item was not a member</returns>
+     public bool Remove(IHasMass item)
+     {
+         if (!_modifications.Remove(item))
+         {
+             return false;
+         }
+         MonoisotopicMass -= item.MonoisotopicMass;
+         return true;
+     }
+
+     /// <summary>
+     /// Object-typed equality, kept consistent with <see cref="Equals(ModificationCollection)"/>.
+     /// </summary>
+     public override bool Equals(object obj)
+     {
+         return Equals(obj as ModificationCollection);
+     }
+
+     /// <summary>
+     /// Hash based on the member count only: equal collections always have equal counts, so the
+     /// hash agrees with equality regardless of member order. The value changes when members
+     /// are added or removed.
+     /// </summary>
+     public override int GetHashCode()
+     {
+         return Count;
+     }
+
+     /// <summary>
+     /// Two collections are equal when they have the same count and ScrambledEquals reports
+     /// their member lists as equal.
+     /// </summary>
+     /// <param name="other">The collection to compare against; null is never equal</param>
+     public bool Equals(ModificationCollection other)
+     {
+         if (ReferenceEquals(other, null))
+         {
+             return false;
+         }
+
+         if (ReferenceEquals(this, other))
+         {
+             return true;
+         }
+
+         return Count == other.Count && _modifications.ScrambledEquals(other._modifications);
+     }
+
+     public IEnumerator<IHasMass> GetEnumerator()
+     {
+         return _modifications.GetEnumerator();
+     }
+
+     IEnumerator IEnumerable.GetEnumerator()
+     {
+         return _modifications.GetEnumerator();
+     }
+ }
+}
\ No newline at end of file
diff --git a/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/ModificationSites.cs b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/ModificationSites.cs
new file mode 100644
index 000000000..e68c7e143
--- /dev/null
+++ b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/ModificationSites.cs
@@ -0,0 +1,92 @@
+// Copyright 2012, 2013, 2014 Derek J. Bailey
+// Modified work copyright 2016 Stefan Solntsev
+//
+// This file (ModificationSites.cs) is part of Proteomics.
+//
+// Proteomics is free software: you can redistribute it and/or modify it
+// under the terms of the GNU Lesser General Public License as published
+// by the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// Proteomics is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+// License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with Proteomics. If not, see <http://www.gnu.org/licenses/>.
+
+using System;
+using System.Collections.Generic;
+
+namespace Proteomics.AminoAcidPolymer
+{
+ /// <summary>
+ /// Bit flags naming the places a modification can sit: one bit per amino acid letter
+ /// plus four bits for the peptide and protein termini.
+ /// </summary>
+ [Flags]
+ public enum ModificationSites
+ {
+ None = 0,
+ A = 1 << 0,
+ R = 1 << 1,
+ N = 1 << 2,
+ D = 1 << 3,
+ C = 1 << 4,
+ E = 1 << 5,
+ Q = 1 << 6,
+ G = 1 << 7,
+ H = 1 << 8,
+ I = 1 << 9,
+ L = 1 << 10,
+ K = 1 << 11,
+ M = 1 << 12,
+ F = 1 << 13,
+ P = 1 << 14,
+ S = 1 << 15,
+ T = 1 << 16,
+ U = 1 << 17,
+ W = 1 << 18,
+ Y = 1 << 19,
+ V = 1 << 20,
+ NPep = 1 << 21,
+ PepC = 1 << 22,
+ NProt = 1 << 23,
+ ProtC = 1 << 24,
+ All = (1 << 25) - 1, // Bits 0 through 24: every residue and terminus flag listed above
+ NTerminus = NPep | NProt,
+ TerminusC = PepC | ProtC,
+ Any = 1 << 31 // Separate top bit, not part of All; ContainsSites returns true whenever the other operand is Any
+ }
+
+ /// <summary>
+ /// Helpers for working with <see cref="ModificationSites"/> values.
+ /// </summary>
+ public static class ModificationSiteExtensions
+ {
+     /// <summary>
+     /// Enumerates every declared enum value other than None whose bits are all set in
+     /// <paramref name="sites"/>. Combined values such as All or NTerminus are declared
+     /// values too, so they are returned when all of their bits are set.
+     /// </summary>
+     public static IEnumerable<ModificationSites> EnumerateActiveSites(this ModificationSites sites)
+     {
+         foreach (ModificationSites site in Enum.GetValues(typeof(ModificationSites)))
+         {
+             if (site == ModificationSites.None)
+             {
+                 continue;
+             }
+             if ((sites & site) == site)
+             {
+                 yield return site;
+             }
+         }
+     }
+
+     /// <summary>
+     /// Tells whether every bit of <paramref name="otherSites"/> is also set in <paramref name="sites"/>.
+     /// </summary>
+     /// <returns>
+     /// True when <paramref name="otherSites"/> is Any; for None, true only when
+     /// <paramref name="sites"/> is None as well; otherwise true when no bit of
+     /// <paramref name="otherSites"/> is missing from <paramref name="sites"/>.
+     /// </returns>
+     public static bool ContainsSites(this ModificationSites sites, ModificationSites otherSites)
+     {
+         // By convention, if the other site is 'Any', they are always equal
+         if (otherSites == ModificationSites.Any)
+         {
+             return true;
+         }
+
+         if (otherSites == ModificationSites.None)
+         {
+             return sites == ModificationSites.None;
+         }
+
+         // Bits requested by otherSites but absent from sites survive the mask.
+         return (~sites & otherSites) == ModificationSites.None;
+     }
+ }
+}
\ No newline at end of file
diff --git a/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/OldSchoolModification.cs b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/OldSchoolModification.cs
new file mode 100644
index 000000000..2753fbe02
--- /dev/null
+++ b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/OldSchoolModification.cs
@@ -0,0 +1,121 @@
+// Copyright 2012, 2013, 2014 Derek J. Bailey
+// Modified work copyright 2016 Stefan Solntsev
+//
+// This file (OldSchoolModification.cs) is part of Proteomics.
+//
+// Proteomics is free software: you can redistribute it and/or modify it
+// under the terms of the GNU Lesser General Public License as published
+// by the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// Proteomics is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+// License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with Proteomics. If not, see <http://www.gnu.org/licenses/>.
+
+using System;
+using System.Globalization;
+using Chemistry;
+
+namespace Proteomics.AminoAcidPolymer
+{
+ /// <summary>
+ /// Represents a modification with a mass and name and default amino acid sites of modification
+ /// </summary>
+ public class OldSchoolModification : IHasMass, IEquatable<OldSchoolModification>
+ {
+     /// <summary>
+     /// Copy constructor: takes mass, name and sites from <paramref name="modification"/>.
+     /// </summary>
+     public OldSchoolModification(OldSchoolModification modification)
+         : this(modification.MonoisotopicMass, modification.Name, modification.Sites)
+     {
+     }
+
+     /// <summary>
+     /// Zero-mass, unnamed modification for <see cref="ModificationSites.Any"/>.
+     /// </summary>
+     public OldSchoolModification()
+         : this(0.0, "", ModificationSites.Any)
+     {
+     }
+
+     public OldSchoolModification(double monoMass)
+         : this(monoMass, "", ModificationSites.Any)
+     {
+     }
+
+     public OldSchoolModification(double monoMass, string name)
+         : this(monoMass, name, ModificationSites.Any)
+     {
+     }
+
+     public OldSchoolModification(double monoMass, string name, ModificationSites sites)
+     {
+         MonoisotopicMass = monoMass;
+         Name = name;
+         Sites = sites;
+     }
+
+     /// <summary>
+     /// The name of the modification
+     /// </summary>
+     public string Name { get; protected set; }
+
+     /// <summary>
+     /// The monoisotopic mass of the modification, commonly known as the delta mass
+     /// </summary>
+     public double MonoisotopicMass { get; protected set; }
+
+     /// <summary>
+     /// The potentially modified sites of this modification
+     /// </summary>
+     public ModificationSites Sites { get; set; }
+
+     /// <summary>
+     /// Displays the name of the mod and the sites it modified in a formatted string
+     /// </summary>
+     public string NameAndSites
+     {
+         get { return string.Format(CultureInfo.InvariantCulture, "{0} ({1})", Name, Sites); }
+     }
+
+     public override string ToString()
+     {
+         return Name;
+     }
+
+     public override int GetHashCode()
+     {
+         return MonoisotopicMass.GetHashCode();
+     }
+
+     public override bool Equals(object obj)
+     {
+         OldSchoolModification modObj = obj as OldSchoolModification;
+         return modObj != null && Equals(modObj);
+     }
+
+     /// <summary>
+     /// Two modifications are equal when their masses differ by at most 1e-9 and their names
+     /// (ordinal comparison) and sites match.
+     /// </summary>
+     /// <param name="other">The modification to compare against; null is never equal</param>
+     public bool Equals(OldSchoolModification other)
+     {
+         if (ReferenceEquals(other, null))
+         {
+             return false;
+         }
+
+         if (ReferenceEquals(this, other))
+         {
+             return true;
+         }
+
+         if (Math.Abs(MonoisotopicMass - other.MonoisotopicMass) > 1e-9)
+         {
+             return false;
+         }
+
+         // Static comparison so that a null Name on either side cannot throw.
+         if (!string.Equals(Name, other.Name, StringComparison.Ordinal))
+         {
+             return false;
+         }
+
+         return Sites == other.Sites;
+     }
+ }
+}
\ No newline at end of file
diff --git a/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/OldSchoolModificationWithMultiplePossibilities.cs b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/OldSchoolModificationWithMultiplePossibilities.cs
new file mode 100644
index 000000000..d1039281e
--- /dev/null
+++ b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/OldSchoolModificationWithMultiplePossibilities.cs
@@ -0,0 +1,68 @@
+// Copyright 2012, 2013, 2014 Derek J. Bailey
+// Modified work copyright 2016 Stefan Solntsev
+//
+// This file (Isotopologue.cs) is part of Proteomics.
+//
+// Proteomics is free software: you can redistribute it and/or modify it
+// under the terms of the GNU Lesser General Public License as published
+// by the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// Proteomics is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+// License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with Proteomics. If not, see <http://www.gnu.org/licenses/>.
+
+using System.Collections;
+using System.Collections.Generic;
+using MzLibUtil;
+
+namespace Proteomics.AminoAcidPolymer
+{
+ /// <summary>
+ /// A named modification that holds several alternative modifications in a list sorted by monoisotopic mass.
+ /// The collection itself is constructed with a base monoisotopic mass of 0.
+ /// </summary>
+ public class ModificationWithMultiplePossibilitiesCollection : OldSchoolModification, IEnumerable
+ {
+ // Alternatives keyed (and therefore ordered) by their monoisotopic mass
+ private readonly SortedList _modifications;
+
+ /// <summary>
+ /// Creates an empty collection with the given name and allowed sites
+ /// </summary>
+ /// <param name="name">The name of the collection</param>
+ /// <param name="sites">The sites that added modifications must fall within</param>
+ public ModificationWithMultiplePossibilitiesCollection(string name, ModificationSites sites)
+ : base(0, name, sites)
+ {
+ _modifications = new SortedList();
+ }
+
+ /// <summary>
+ /// The number of alternative modifications currently stored
+ /// </summary>
+ public int Count
+ {
+ get { return _modifications.Count; }
+ }
+
+ /// <summary>
+ /// Gets the alternative at the given position in the mass-sorted list
+ /// </summary>
+ /// <param name="index">Position within the sorted values</param>
+ public OldSchoolModification this[int index]
+ {
+ get { return _modifications.Values[index]; }
+ }
+
+ /// <summary>
+ /// Adds an alternative modification, keyed by its monoisotopic mass
+ /// </summary>
+ /// <param name="modification">The modification to add; it is dereferenced without a null check</param>
+ /// <exception cref="MzLibException">Thrown when Sites.ContainsSites(modification.Sites) is false</exception>
+ public void AddModification(OldSchoolModification modification)
+ {
+ if (!Sites.ContainsSites(modification.Sites))
+ throw new MzLibException("Unable to add a modification with sites other than " + Sites);
+
+ // The mass is the sort key; SortedList.Add rejects a key that is already present,
+ // so two alternatives with exactly the same mass cannot both be stored
+ _modifications.Add(modification.MonoisotopicMass, modification);
+ }
+
+ /// <summary>
+ /// Reports whether the given modification is among the stored values
+ /// </summary>
+ /// <param name="modification">The modification to look for</param>
+ public bool Contains(OldSchoolModification modification)
+ {
+ return _modifications.ContainsValue(modification);
+ }
+
+ /// <summary>
+ /// Enumerates the stored modifications through the sorted list's value collection
+ /// </summary>
+ public IEnumerator GetEnumerator()
+ {
+ return _modifications.Values.GetEnumerator();
+ }
+
+ // Explicit interface implementation; enumerates the same value collection
+ IEnumerator IEnumerable.GetEnumerator()
+ {
+ return _modifications.Values.GetEnumerator();
+ }
+ }
+}
\ No newline at end of file
diff --git a/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/Peptide.cs b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/Peptide.cs
new file mode 100644
index 000000000..4ea4346e6
--- /dev/null
+++ b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/Peptide.cs
@@ -0,0 +1,137 @@
+// Copyright 2012, 2013, 2014 Derek J. Bailey
+// Modified work copyright 2016 Stefan Solntsev
+//
+// This file (Peptide.cs) is part of Proteomics.
+//
+// Proteomics is free software: you can redistribute it and/or modify it
+// under the terms of the GNU Lesser General Public License as published
+// by the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// Proteomics is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+// License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with Proteomics. If not, see <http://www.gnu.org/licenses/>.
+
+using System.Collections.Generic;
+using System.Linq;
+
+namespace Proteomics.AminoAcidPolymer
+{
+ /// <summary>
+ /// An amino acid polymer that additionally records where it sits inside a parent polymer
+ /// (start/end residue indices and the neighbouring residues).
+ /// </summary>
+ public class Peptide : AminoAcidPolymer
+ {
+ /// <summary>
+ /// Creates an empty peptide via the base class default constructor
+ /// </summary>
+ public Peptide()
+ {
+ }
+
+ /// <summary>
+ /// Creates a peptide from a sequence string, which is passed to the base class constructor
+ /// </summary>
+ /// <param name="sequence">The sequence to build the peptide from</param>
+ public Peptide(string sequence) : base(sequence)
+ {
+ }
+
+ /// <summary>
+ /// Create a new peptide based on another amino acid polymer, including its modifications
+ /// </summary>
+ /// <param name="aminoAcidPolymer">The other amino acid polymer to copy</param>
+ public Peptide(AminoAcidPolymer aminoAcidPolymer)
+ : this(aminoAcidPolymer, true)
+ {
+ }
+
+ /// <summary>
+ /// Create a new peptide based on another amino acid polymer
+ /// </summary>
+ /// <param name="aminoAcidPolymer">The other amino acid polymer to copy</param>
+ /// <param name="includeModifications">Whether to copy the modifications to the new peptide</param>
+ public Peptide(AminoAcidPolymer aminoAcidPolymer, bool includeModifications)
+ : base(aminoAcidPolymer, includeModifications)
+ {
+ // The copy spans the whole source polymer, so it covers residues 0 .. Length - 1
+ Parent = aminoAcidPolymer;
+ StartResidue = 0;
+ EndResidue = Length - 1;
+ }
+
+ /// <summary>
+ /// Create a new peptide from a section of another amino acid polymer, including its modifications
+ /// </summary>
+ /// <param name="aminoAcidPolymer">The amino acid polymer to take the section from</param>
+ /// <param name="firstResidue">The index of the first residue of the section</param>
+ /// <param name="length">The number of residues in the section</param>
+ public Peptide(AminoAcidPolymer aminoAcidPolymer, int firstResidue, int length)
+ : this(aminoAcidPolymer, firstResidue, length, true)
+ {
+ }
+
+ /// <summary>
+ /// Create a new peptide from a section of another amino acid polymer
+ /// </summary>
+ /// <param name="aminoAcidPolymer">The amino acid polymer to take the section from</param>
+ /// <param name="firstResidue">The index of the first residue of the section</param>
+ /// <param name="length">The number of residues in the section</param>
+ /// <param name="includeModifications">Passed to the base constructor together with the section bounds</param>
+ public Peptide(AminoAcidPolymer aminoAcidPolymer, int firstResidue, int length, bool includeModifications)
+ : base(aminoAcidPolymer, firstResidue, length, includeModifications)
+ {
+ Parent = aminoAcidPolymer;
+ StartResidue = firstResidue;
+ EndResidue = firstResidue + length - 1;
+ // Neighbours are requested one position outside the section on either side
+ // NOTE(review): relies on GetResidue coping with indices outside the parent (e.g. -1) — confirm
+ PreviousResidue = aminoAcidPolymer.GetResidue(StartResidue - 1);
+ NextResidue = aminoAcidPolymer.GetResidue(EndResidue + 1);
+ }
+
+ /// <summary>
+ /// The index within its parent of the first residue of this peptide
+ /// </summary>
+ public int StartResidue { get; set; }
+
+ /// <summary>
+ /// The index within its parent of the last residue of this peptide (inclusive)
+ /// </summary>
+ public int EndResidue { get; set; }
+
+ /// <summary>
+ /// The amino acid polymer this peptide came from. Could be null
+ /// </summary>
+ public AminoAcidPolymer Parent { get; set; }
+
+ /// <summary>
+ /// The preceding amino acid in its parent
+ /// </summary>
+ public Residue PreviousResidue { get; set; }
+
+ /// <summary>
+ /// The next amino acid in its parent
+ /// </summary>
+ public Residue NextResidue { get; set; }
+
+ /// <summary>
+ /// Lazily yields one peptide for every combination of concrete modifications, obtained by replacing each
+ /// multiple-possibility modification on this peptide with each of its alternatives in turn.
+ /// Yields nothing when the peptide carries no such modification.
+ /// </summary>
+ public IEnumerable GenerateAllModificationCombinations()
+ {
+ // Get all the modifications that are isotopologues
+ var isotopologues = GetUniqueModifications().ToArray();
+
+ // Base condition, no more isotopologues to make, so just return
+ if (isotopologues.Length < 1)
+ {
+ yield break;
+ }
+
+ // Grab the first isotopologue
+ ModificationWithMultiplePossibilitiesCollection isotopologue = isotopologues[0];
+
+ // Loop over each modification in the isotopologue
+ foreach (OldSchoolModification mod in isotopologue)
+ {
+ // Create a clone of the peptide, cloning modifications as well.
+ Peptide peptide = new Peptide(this);
+
+ // Replace the base isotopologue mod with the specific version
+ peptide.ReplaceModification(isotopologue, mod);
+
+ // There was more than one isotopologue, so we must go deeper
+ if (isotopologues.Length > 1)
+ {
+ // Call the same routine on the newly generated peptide that has one less isotopologue
+ foreach (var subpeptide in peptide.GenerateAllModificationCombinations())
+ {
+ yield return subpeptide;
+ }
+ }
+ else
+ {
+ // Return this peptide
+ yield return peptide;
+ }
+ }
+ }
+
+ /// <summary>
+ /// Creates a new peptide from a section of this peptide, with this peptide as its parent
+ /// </summary>
+ /// <param name="firstResidue">The index of the first residue of the section</param>
+ /// <param name="length">The number of residues in the section</param>
+ public Peptide GetSubPeptide(int firstResidue, int length)
+ {
+ return new Peptide(this, firstResidue, length);
+ }
+ }
+}
\ No newline at end of file
diff --git a/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/Residue.cs b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/Residue.cs
new file mode 100644
index 000000000..4797bec7a
--- /dev/null
+++ b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/Residue.cs
@@ -0,0 +1,207 @@
+// Copyright 2012, 2013, 2014 Derek J. Bailey
+// Modified work copyright 2016 Stefan Solntsev
+//
+// This file (AminoAcid.cs) is part of Proteomics.
+//
+// Proteomics is free software: you can redistribute it and/or modify it
+// under the terms of the GNU Lesser General Public License as published
+// by the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// Proteomics is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+// License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with Proteomics. If not, see <http://www.gnu.org/licenses/>.
+
+using System.Collections.Generic;
+using Chemistry;
+
+namespace Proteomics.AminoAcidPolymer
+{
+ public class Residue : IHasChemicalFormula
+ {
+ /// <summary>
+ /// Monoisotopic masses indexed by the character code of the one-letter abbreviation.
+ /// Positions without a residue hold double.NaN; 'J' is initialised with the mass of Isoleucine.
+ /// </summary>
+ public static readonly double[] ResidueMonoisotopicMass;
+
+ // Residues keyed by their full name (e.g. "Alanine")
+ private static readonly Dictionary ResiduesDictionary;
+ // Residues indexed by the character code of the one-letter abbreviation; null where no residue is assigned
+ private static readonly Residue[] ResiduesByLetter;
+
+        /// <summary>
+        /// Builds the three static lookup tables: residues by full name, residues by one-letter code,
+        /// and monoisotopic masses by one-letter code. Both arrays have 123 entries (character codes 0 through 122).
+        /// </summary>
+        static Residue()
+        {
+            // NOTE(review): Pyrrolysine is registered with ModificationSites.P, the same site flag as Proline — confirm this is intended
+            ResiduesDictionary = new Dictionary
+            {
+                {"Alanine", new Residue("Alanine", 'A', "Ala",ChemicalFormula.ParseFormula("C3H5NO"), ModificationSites.A)},
+                {"Arginine", new Residue("Arginine", 'R', "Arg",ChemicalFormula.ParseFormula("C6H12N4O"), ModificationSites.R)},
+                {"Asparagine", new Residue("Asparagine", 'N', "Asn",ChemicalFormula.ParseFormula("C4H6N2O2"), ModificationSites.N)},
+                {"Aspartic Acid", new Residue("Aspartic Acid", 'D', "Asp",ChemicalFormula.ParseFormula("C4H5NO3"), ModificationSites.D)},
+                {"Cysteine", new Residue("Cysteine", 'C', "Cys",ChemicalFormula.ParseFormula("C3H5NOS"), ModificationSites.C)},
+                {"Glutamic Acid", new Residue("Glutamic Acid", 'E', "Glu",ChemicalFormula.ParseFormula("C5H7NO3"), ModificationSites.E)},
+                {"Glutamine", new Residue("Glutamine", 'Q', "Gln",ChemicalFormula.ParseFormula("C5H8N2O2"), ModificationSites.Q)},
+                {"Glycine", new Residue("Glycine", 'G', "Gly",ChemicalFormula.ParseFormula("C2H3NO"), ModificationSites.G)},
+                {"Histidine", new Residue("Histidine", 'H', "His",ChemicalFormula.ParseFormula("C6H7N3O"), ModificationSites.H)},
+                {"Isoleucine", new Residue("Isoleucine", 'I', "Ile",ChemicalFormula.ParseFormula("C6H11NO"), ModificationSites.I)},
+                {"Leucine", new Residue("Leucine", 'L', "Leu",ChemicalFormula.ParseFormula("C6H11NO"), ModificationSites.L)},
+                {"Lysine", new Residue("Lysine", 'K', "Lys",ChemicalFormula.ParseFormula("C6H12N2O"), ModificationSites.K)},
+                {"Methionine", new Residue("Methionine", 'M', "Met",ChemicalFormula.ParseFormula("C5H9NOS"), ModificationSites.M)},
+                {"Phenylalanine", new Residue("Phenylalanine", 'F', "Phe",ChemicalFormula.ParseFormula("C9H9NO"), ModificationSites.F)},
+                {"Proline", new Residue("Proline", 'P', "Pro",ChemicalFormula.ParseFormula("C5H7NO"), ModificationSites.P)},
+                {"Pyrrolysine", new Residue("Pyrrolysine", 'O', "Pyl",ChemicalFormula.ParseFormula("C12H19N3O2"), ModificationSites.P)},
+                {"Selenocysteine", new Residue("Selenocysteine", 'U', "Sec",ChemicalFormula.ParseFormula("C3H5NOSe"), ModificationSites.U)},
+                {"Serine", new Residue("Serine", 'S', "Ser",ChemicalFormula.ParseFormula("C3H5NO2"), ModificationSites.S)},
+                {"Threonine", new Residue("Threonine", 'T', "Thr",ChemicalFormula.ParseFormula("C4H7NO2"), ModificationSites.T)},
+                {"Tryptophan", new Residue("Tryptophan", 'W', "Trp",ChemicalFormula.ParseFormula("C11H10N2O"), ModificationSites.W)},
+                // The standard three-letter code for Tyrosine is "Tyr" (was misspelled "Try")
+                {"Tyrosine", new Residue("Tyrosine", 'Y', "Tyr",ChemicalFormula.ParseFormula("C9H9NO2"), ModificationSites.Y)},
+                {"Valine", new Residue("Valine", 'V', "Val",ChemicalFormula.ParseFormula("C5H9NO"), ModificationSites.V)}
+            };
+
+            // Indexed by character code: 65 empty slots (0-64), then 'A' (65) through 'Z' (90), then empty slots 91-122.
+            // The trailing number on each row of nulls is the last index that row fills.
+            ResiduesByLetter = new Residue[]
+            {
+                null,null,null,null,null,null,null,null,null,null,null,null,null, //12
+                null,null,null,null,null,null,null,null,null,null,null,null,null, //25
+                null,null,null,null,null,null,null,null,null,null,null,null,null, //38
+                null,null,null,null,null,null,null,null,null,null,null,null,null, //51
+                null,null,null,null,null,null,null,null,null,null,null,null,null, //64
+                ResiduesDictionary["Alanine"], //65
+                null, // B
+                ResiduesDictionary["Cysteine"],
+                ResiduesDictionary["Aspartic Acid"],
+                ResiduesDictionary["Glutamic Acid"],
+                ResiduesDictionary["Phenylalanine"],
+                ResiduesDictionary["Glycine"],
+                ResiduesDictionary["Histidine"],
+                ResiduesDictionary["Isoleucine"],
+                null, // J
+                ResiduesDictionary["Lysine"],
+                ResiduesDictionary["Leucine"],
+                ResiduesDictionary["Methionine"],
+                ResiduesDictionary["Asparagine"],
+                ResiduesDictionary["Pyrrolysine"], // O
+                ResiduesDictionary["Proline"],
+                ResiduesDictionary["Glutamine"],
+                ResiduesDictionary["Arginine"],
+                ResiduesDictionary["Serine"],
+                ResiduesDictionary["Threonine"],
+                ResiduesDictionary["Selenocysteine"],
+                ResiduesDictionary["Valine"],
+                ResiduesDictionary["Tryptophan"],
+                null, // X
+                ResiduesDictionary["Tyrosine"],
+                null, // Z //90
+                null,null,null,null,null,null,null,null,null,null,null,null,null, //103
+                null,null,null,null,null,null,null,null,null,null,null,null,null, //116
+                null,null,null,null,null,null //122
+            };
+
+            // Same layout as ResiduesByLetter, holding masses; unassigned positions are NaN.
+            // Unlike ResiduesByLetter, 'J' is filled here with the Isoleucine mass.
+            ResidueMonoisotopicMass = new double[]
+            {
+                double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,
+                double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,
+                double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,
+                double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,
+                double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,
+                ResiduesDictionary["Alanine"].MonoisotopicMass,
+                double.NaN, // B
+                ResiduesDictionary["Cysteine"].MonoisotopicMass,
+                ResiduesDictionary["Aspartic Acid"].MonoisotopicMass,
+                ResiduesDictionary["Glutamic Acid"].MonoisotopicMass,
+                ResiduesDictionary["Phenylalanine"].MonoisotopicMass,
+                ResiduesDictionary["Glycine"].MonoisotopicMass,
+                ResiduesDictionary["Histidine"].MonoisotopicMass,
+                ResiduesDictionary["Isoleucine"].MonoisotopicMass,
+                ResiduesDictionary["Isoleucine"].MonoisotopicMass, // J - SPECIAL CASE!!!
+                ResiduesDictionary["Lysine"].MonoisotopicMass,
+                ResiduesDictionary["Leucine"].MonoisotopicMass,
+                ResiduesDictionary["Methionine"].MonoisotopicMass,
+                ResiduesDictionary["Asparagine"].MonoisotopicMass,
+                ResiduesDictionary["Pyrrolysine"].MonoisotopicMass, // O
+                ResiduesDictionary["Proline"].MonoisotopicMass,
+                ResiduesDictionary["Glutamine"].MonoisotopicMass,
+                ResiduesDictionary["Arginine"].MonoisotopicMass,
+                ResiduesDictionary["Serine"].MonoisotopicMass,
+                ResiduesDictionary["Threonine"].MonoisotopicMass,
+                ResiduesDictionary["Selenocysteine"].MonoisotopicMass,
+                ResiduesDictionary["Valine"].MonoisotopicMass,
+                ResiduesDictionary["Tryptophan"].MonoisotopicMass,
+                double.NaN, // X
+                ResiduesDictionary["Tyrosine"].MonoisotopicMass,
+                double.NaN, // Z
+                double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,
+                double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,
+                double.NaN,double.NaN,double.NaN,double.NaN,double.NaN,double.NaN
+            };
+        }
+
+ /// <summary>
+ /// Registers each given residue in all three static lookup tables: by name in the dictionary, and by the
+ /// character code of its one-letter abbreviation in the residue and mass arrays.
+ /// An existing entry with the same name or letter is overwritten.
+ /// </summary>
+ /// <param name="residuesToAdd">The residues to register</param>
+ /// <remarks>
+ /// The arrays are indexed directly by <c>residue.Letter</c> with no range check; the static constructor
+ /// allocates 123 entries, so a letter with a character code above 122 is out of range.
+ /// </remarks>
+ public static void AddNewResiduesToDictionary(List residuesToAdd)
+ {
+ foreach (Residue residue in residuesToAdd)
+ {
+ ResiduesDictionary[residue.Name] = residue;
+ ResiduesByLetter[residue.Letter] = residue;
+ ResidueMonoisotopicMass[residue.Letter] = residue.MonoisotopicMass;
+ }
+ }
+
+
+        /// <summary>
+        /// Creates a residue; its monoisotopic mass is taken from the supplied chemical formula
+        /// </summary>
+        /// <param name="name">The full name of the residue</param>
+        /// <param name="oneLetterAbbreviation">The one-letter code</param>
+        /// <param name="threeLetterAbbreviation">The three-letter code</param>
+        /// <param name="chemicalFormula">The chemical formula of the residue</param>
+        /// <param name="site">The modification site associated with the residue</param>
+        public Residue(string name, char oneLetterAbbreviation, string threeLetterAbbreviation, ChemicalFormula chemicalFormula, ModificationSites site)
+        {
+            ThisChemicalFormula = chemicalFormula;
+            MonoisotopicMass = chemicalFormula.MonoisotopicMass;
+
+            Name = name;
+            Symbol = threeLetterAbbreviation;
+            Letter = oneLetterAbbreviation;
+            Site = site;
+        }
+
+ /// <summary>The chemical formula supplied at construction</summary>
+ public ChemicalFormula ThisChemicalFormula { get; private set; }
+ /// <summary>The one-letter abbreviation</summary>
+ public char Letter { get; private set; }
+ /// <summary>The modification site associated with this residue</summary>
+ public ModificationSites Site { get; private set; }
+ /// <summary>The monoisotopic mass, copied from the chemical formula at construction</summary>
+ public double MonoisotopicMass { get; private set; }
+ /// <summary>The full name (e.g. "Alanine")</summary>
+ public string Name { get; private set; }
+ /// <summary>The three-letter abbreviation (e.g. "Ala")</summary>
+ public string Symbol { get; private set; }
+
+        /// <summary>
+        /// Looks up a residue from a string. A single-character string is treated as a one-letter code and
+        /// resolved through the by-letter table; any other string is used as a key into the residue
+        /// dictionary, whose keys are full names such as "Alanine".
+        /// </summary>
+        /// <param name="symbol">A one-letter code, or the dictionary key of the residue</param>
+        /// <returns>
+        /// The matching residue. For a one-letter code inside the table that has no residue assigned, null.
+        /// The dictionary indexer throws when the key is not present.
+        /// </returns>
+        public static Residue GetResidue(string symbol)
+        {
+            if (symbol.Length == 1)
+            {
+                return ResiduesByLetter[symbol[0]];
+            }
+
+            return ResiduesDictionary[symbol];
+        }
+
+        /// <summary>
+        /// Gets the residue based on the residue's one-character symbol
+        /// </summary>
+        /// <param name="letter">The one-letter code, used directly as an index into the by-letter table</param>
+        /// <returns>The residue stored at that position, or null when none is assigned there</returns>
+        public static Residue GetResidue(char letter) => ResiduesByLetter[letter];
+
+        /// <summary>
+        /// Attempts to get the residue for a one-letter code without throwing for letters outside the table
+        /// </summary>
+        /// <param name="letter">The one-letter code</param>
+        /// <param name="residue">The residue found, or null when the letter is outside the table or unassigned</param>
+        /// <returns>True when a residue was found</returns>
+        public static bool TryGetResidue(char letter, out Residue residue)
+        {
+            // A char is never negative, so only the upper bound of the table needs checking
+            residue = letter < ResiduesByLetter.Length ? ResiduesByLetter[letter] : null;
+            return residue != null;
+        }
+
+        /// <summary>
+        /// Attempts to get a residue by its key in the residue dictionary
+        /// </summary>
+        /// <param name="name">The dictionary key, i.e. the name the residue was registered under</param>
+        /// <param name="residue">The residue found, or the default value when the key is absent</param>
+        /// <returns>True when the dictionary contains the key</returns>
+        public static bool TryGetResidue(string name, out Residue residue)
+            => ResiduesDictionary.TryGetValue(name, out residue);
+ }
+}
\ No newline at end of file
diff --git a/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/Terminus.cs b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/Terminus.cs
new file mode 100644
index 000000000..0785e835a
--- /dev/null
+++ b/mzLib/MassSpectrometry/Proteomics/AminoAcidPolymer/Terminus.cs
@@ -0,0 +1,39 @@
+// Copyright 2012, 2013, 2014 Derek J. Bailey
+// Modified work copyright 2016 Stefan Solntsev
+//
+// This file (Terminus.cs) is part of Proteomics.
+//
+// Proteomics is free software: you can redistribute it and/or modify it
+// under the terms of the GNU Lesser General Public License as published
+// by the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// Proteomics is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+// License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with Proteomics. If not, see <http://www.gnu.org/licenses/>.
+
+using System;
+
+namespace Proteomics.AminoAcidPolymer
+{
+    /// <summary>
+    /// The terminus of an amino acid polymer N-[Amino Acids]-C
+    /// </summary>
+    [Flags]
+    public enum Terminus
+    {
+        /// <summary>
+        /// The N-terminus (amino-terminus)
+        /// </summary>
+        N = 1 << 0,
+
+        /// <summary>
+        /// The C-terminus (carboxyl-terminus)
+        /// </summary>
+        C = 1 << 1
+    }
+}
\ No newline at end of file
diff --git a/mzLib/MassSpectrometry/Proteomics/Fragmentation/DissociationTypeCollection.cs b/mzLib/MassSpectrometry/Proteomics/Fragmentation/DissociationTypeCollection.cs
new file mode 100644
index 000000000..98293c4d9
--- /dev/null
+++ b/mzLib/MassSpectrometry/Proteomics/Fragmentation/DissociationTypeCollection.cs
@@ -0,0 +1,174 @@
+using System.Collections.Generic;
+using System.Linq;
+using Chemistry;
+using MassSpectrometry;
+
+namespace Proteomics.Fragmentation
+{
+ public class DissociationTypeCollection
+ {
+ /// <summary>
+ /// The product (ion) types generated for each dissociation type. Several dissociation types
+ /// (Unknown, PQD, Custom, ISCID) start with an empty list.
+ /// </summary>
+ public static Dictionary> ProductsFromDissociationType = new Dictionary>
+ {
+ { DissociationType.Unknown, new List() },
+ { DissociationType.CID, new List{ ProductType.b, ProductType.y } },
+ { DissociationType.LowCID, new List{ ProductType.b, ProductType.y, ProductType.aStar, ProductType.bAmmoniaLoss, ProductType.yAmmoniaLoss, ProductType.aDegree, ProductType.bWaterLoss, ProductType.yWaterLoss } },
+ { DissociationType.IRMPD, new List{ ProductType.b, ProductType.y } },
+ { DissociationType.ECD, new List{ ProductType.c, ProductType.y, ProductType.zDot } },
+ { DissociationType.PQD, new List() },
+ { DissociationType.ETD, new List{ ProductType.c, ProductType.y, ProductType.zDot } },
+ { DissociationType.HCD, new List{ ProductType.b, ProductType.y } },//HCD often creates a-, aStar, and aDegree-ions and we should examine what other prominent algorithms do to see if that would benefit our search results
+ { DissociationType.AnyActivationType, new List{ ProductType.b, ProductType.y } },
+ { DissociationType.EThcD, new List{ ProductType.b, ProductType.y, ProductType.c, ProductType.zDot } },
+ { DissociationType.Custom, new List() },
+ { DissociationType.ISCID, new List() }
+ };
+
+ /// <summary>
+ /// Returns the product types of a dissociation type that belong to the given terminus, i.e. the intersection of
+ /// the terminus-specific product types with the products of the dissociation type. Results are stored in a
+ /// static dictionary keyed by (dissociation type, terminus) and reused on later calls.
+ /// </summary>
+ /// <param name="dissociationType">The dissociation type whose products are wanted</param>
+ /// <param name="fragmentationTerminus">The terminus to restrict the products to</param>
+ /// <returns>The stored list for this key; the same list instance is handed out on every call</returns>
+ public static List GetTerminusSpecificProductTypesFromDissociation(DissociationType dissociationType, FragmentationTerminus fragmentationTerminus)
+ {
+ // NOTE(review): this first lookup runs outside the lock while another thread may be adding to the same
+ // dictionary inside it — confirm this is acceptable for the dictionary type in use
+ if (!TerminusSpecificProductTypesFromDissociation.TryGetValue((dissociationType, fragmentationTerminus), out List productTypes))
+ {
+ lock (TerminusSpecificProductTypesFromDissociation)
+ {
+ // Both indexers throw when their key is absent; the query itself is only evaluated by ToList() below
+ var productCollection = TerminusSpecificProductTypes.ProductIonTypesFromSpecifiedTerminus[fragmentationTerminus]
+ .Intersect(DissociationTypeCollection.ProductsFromDissociationType[dissociationType]);
+
+ // Re-check under the lock: another thread may have stored the entry in the meantime
+ if (!TerminusSpecificProductTypesFromDissociation.TryGetValue((dissociationType, fragmentationTerminus), out productTypes))
+ {
+ productTypes = productCollection.ToList();
+ TerminusSpecificProductTypesFromDissociation.Add((dissociationType, fragmentationTerminus), productTypes);
+ }
+ }
+ }
+
+ return productTypes;
+ }
+
+ /// <summary>
+ /// Builds the list of water-loss and ammonia-loss product types for a dissociation type and terminus.
+ /// CID, IRMPD, HCD, AnyActivationType and EThcD contribute b-ion losses for the N-terminus and y-ion losses for the
+ /// C-terminus; ECD and ETD contribute y-ion losses for the C-terminus only; every other dissociation type yields
+ /// an empty list. <see cref="FragmentationTerminus.Both"/> counts as both termini.
+ /// </summary>
+ /// <param name="dissociationType">The dissociation type</param>
+ /// <param name="fragmentationTerminus">The terminus (N, C or Both) to produce loss types for</param>
+ /// <returns>A newly created list on every call</returns>
+ public static List GetWaterAndAmmoniaLossProductTypesFromDissociation(DissociationType dissociationType, FragmentationTerminus fragmentationTerminus)
+ {
+ List productList = new();
+
+ switch (dissociationType)
+ {
+ case DissociationType.CID:
+ case DissociationType.IRMPD:
+ case DissociationType.HCD:
+ case DissociationType.AnyActivationType:
+ case DissociationType.EThcD:
+ if (fragmentationTerminus == FragmentationTerminus.N || fragmentationTerminus == FragmentationTerminus.Both)
+ {
+ productList.Add(ProductType.bWaterLoss);
+ productList.Add(ProductType.bAmmoniaLoss);
+ }
+ if (fragmentationTerminus == FragmentationTerminus.C || fragmentationTerminus == FragmentationTerminus.Both)
+ {
+ productList.Add(ProductType.yWaterLoss);
+ productList.Add(ProductType.yAmmoniaLoss);
+ }
+ break;
+ case DissociationType.ECD:
+ case DissociationType.ETD:
+ // Only C-terminal (y-ion) losses are added for these two types
+ if (fragmentationTerminus == FragmentationTerminus.C || fragmentationTerminus == FragmentationTerminus.Both)
+ {
+ productList.Add(ProductType.yWaterLoss);
+ productList.Add(ProductType.yAmmoniaLoss);
+ }
+ break;
+ default:
+ break;
+ }
+ return productList;
+ }
+
+ // Stored results of GetTerminusSpecificProductTypesFromDissociation, keyed by (dissociation type, terminus)
+ private static Dictionary<(DissociationType, FragmentationTerminus), List> TerminusSpecificProductTypesFromDissociation
+ = new Dictionary<(DissociationType, FragmentationTerminus), List>();
+
+ // Mass shift per product type. Every entry starts as null and is filled in by GetMassShiftFromProductType
+ // the first time that product type is requested. The trailing comments describe the elemental change.
+ private static Dictionary NeutralMassShiftFromProductType = new Dictionary
+ {
+ { ProductType.a, null},//-C -O
+ { ProductType.aStar, null},//-C -O -N -H3
+ { ProductType.aDegree, null},//-C -O2 -H2
+ { ProductType.b, null},//no change
+ { ProductType.bAmmoniaLoss, null},//-N -H3
+ { ProductType.bWaterLoss, null},//-H2 -O1
+ { ProductType.c, null},//+N1 +H3
+ { ProductType.x, null},//+C1 +O2
+ { ProductType.y, null},//+O +H2
+ { ProductType.yAmmoniaLoss, null},//+O -H -N
+ { ProductType.yWaterLoss, null},//no change
+ { ProductType.zDot, null },// +O -NH + e- + p+
+ { ProductType.zPlusOne, null},//+O +H -N: A Zdot ion is also known as z+1. It is not a z-ion in the Biemann nomenclature. It differs from a y-ion by N-1 H-1;
+ { ProductType.M, null},// neutral Molecular product can be used with neutral loss as fragment
+ { ProductType.D, null},// diagnostic ions are not shifted but added summarily
+ { ProductType.Ycore, null},// neutral Molecular product can be used with neutral loss as fragment
+ { ProductType.Y, null},// diagnostic ions are not shifted but added summarily
+ };
+
+ // Stored results of GetNAndCTerminalMassShiftsForDissociationType, keyed by dissociation type
+ private static Dictionary DissociationTypeToTerminusMassShift = new Dictionary();
+
+        /// <summary>
+        /// This function is used in performance-critical functions, such as fragmenting peptides. The first double array is the N-terminal mass shifts for
+        /// the given dissociation type; the second array is the C-terminal mass shifts.
+        /// The pair is computed once per dissociation type and stored for later calls.
+        /// </summary>
+        /// <param name="dissociationType">The dissociation type to get mass shifts for</param>
+        /// <returns>The stored (N-terminal shifts, C-terminal shifts) pair; the arrays are shared between callers</returns>
+        public static (double[], double[]) GetNAndCTerminalMassShiftsForDissociationType(DissociationType dissociationType)
+        {
+            // Fast path: the pair has already been stored
+            if (DissociationTypeToTerminusMassShift.TryGetValue(dissociationType, out var storedShifts))
+            {
+                return storedShifts;
+            }
+
+            lock (DissociationTypeToTerminusMassShift)
+            {
+                // Re-check under the lock: another thread may have stored the pair in the meantime
+                if (!DissociationTypeToTerminusMassShift.TryGetValue(dissociationType, out storedShifts))
+                {
+                    double[] nTerminalShifts = GetTerminusSpecificProductTypesFromDissociation(dissociationType, FragmentationTerminus.N)
+                        .Select(type => GetMassShiftFromProductType(type))
+                        .ToArray();
+                    double[] cTerminalShifts = GetTerminusSpecificProductTypesFromDissociation(dissociationType, FragmentationTerminus.C)
+                        .Select(type => GetMassShiftFromProductType(type))
+                        .ToArray();
+
+                    storedShifts = (nTerminalShifts, cTerminalShifts);
+                    DissociationTypeToTerminusMassShift.Add(dissociationType, storedShifts);
+                }
+            }
+
+            return storedShifts;
+        }
+
+        /// <summary>
+        /// Returns the neutral mass shift for a product type. The shift is computed the first time a product type is
+        /// requested and written back to the static table so later calls can return it directly.
+        /// </summary>
+        /// <param name="productType">The product (ion) type</param>
+        /// <returns>The mass shift for <paramref name="productType"/></returns>
+        /// <exception cref="MzLibUtil.MzLibException">The product type has no entry in the mass-shift table</exception>
+        public static double GetMassShiftFromProductType(ProductType productType)
+        {
+            if (!NeutralMassShiftFromProductType.TryGetValue(productType, out double? knownShift))
+            {
+                throw new MzLibUtil.MzLibException("Unknown product type!");
+            }
+
+            if (knownShift.HasValue)
+            {
+                return knownShift.Value;
+            }
+
+            // First request for this product type: compute the shift from its elemental change
+            double? computedShift = productType switch
+            {
+                ProductType.a => ChemicalFormula.ParseFormula("C-1O-1").MonoisotopicMass,
+                ProductType.aStar => ChemicalFormula.ParseFormula("C-1O-1N-1H-3").MonoisotopicMass,
+                ProductType.aDegree => ChemicalFormula.ParseFormula("C-1O-2H-2").MonoisotopicMass, // -46.0054793036, -C -O2 -H2
+                ProductType.b => 0.0, // no change
+                ProductType.bAmmoniaLoss => ChemicalFormula.ParseFormula("N-1H-3").MonoisotopicMass, // -17.02654910112, -N -H3
+                ProductType.bWaterLoss => ChemicalFormula.ParseFormula("H-2O-1").MonoisotopicMass, // -18.01056468403, -H2 -O1
+                ProductType.c => ChemicalFormula.ParseFormula("N1H3").MonoisotopicMass, // 17.02654910112, +N1 +H3
+                ProductType.x => ChemicalFormula.ParseFormula("C1O2").MonoisotopicMass, // 43.98982923914, +C1 +O2
+                ProductType.y => ChemicalFormula.ParseFormula("H2O1").MonoisotopicMass, // 18.01056468403, +O +H2
+                ProductType.yAmmoniaLoss => ChemicalFormula.ParseFormula("O1H-1N-1").MonoisotopicMass, // 0.98401558291000057, +O -H -N
+                ProductType.yWaterLoss => 0.0, // no change
+                ProductType.zDot => ChemicalFormula.ParseFormula("O1N-1H-1").MonoisotopicMass + Constants.ElectronMass + Constants.ProtonMass, // 1.991840552567, +O -NH + e- + p+
+                ProductType.zPlusOne => ChemicalFormula.ParseFormula("O1H1N-1").MonoisotopicMass, // 2.9996656473699996, +O +H -N
+                ProductType.M => 0.0, // no change
+                ProductType.D => 0.0, // no change
+                ProductType.Ycore => 0.0, // no change
+                ProductType.Y => 0.0, // no change
+                _ => (double?)null
+            };
+
+            if (computedShift.HasValue)
+            {
+                NeutralMassShiftFromProductType[productType] = computedShift;
+            }
+
+            // For a table entry without a matching case above, the shift stays null and .Value throws
+            return computedShift.Value;
+        }
+
+        /// <summary>
+        /// Adds the mass shift of the given product type to a mass and passes the sum through
+        /// ClassExtensions.RoundedDouble with an argument of 9
+        /// </summary>
+        /// <param name="mass">The mass to shift</param>
+        /// <param name="p">The product type whose mass shift is added</param>
+        /// <returns>The shifted mass after rounding</returns>
+        public static double ProductTypeSpecificFragmentNeutralMass(double mass, ProductType p)
+        {
+            double shiftedMass = mass + GetMassShiftFromProductType(p);
+            return (double)ClassExtensions.RoundedDouble(shiftedMass, 9);
+        }
+ }
+}
\ No newline at end of file
diff --git a/mzLib/MassSpectrometry/Proteomics/Fragmentation/FragmentationTerminus.cs b/mzLib/MassSpectrometry/Proteomics/Fragmentation/FragmentationTerminus.cs
new file mode 100644
index 000000000..84e8958f8
--- /dev/null
+++ b/mzLib/MassSpectrometry/Proteomics/Fragmentation/FragmentationTerminus.cs
@@ -0,0 +1,10 @@
+namespace Proteomics.Fragmentation
+{
+    /// <summary>
+    /// The terminus (or termini) a fragment is associated with
+    /// </summary>
+    public enum FragmentationTerminus
+    {
+        /// <summary>N- and C-terminus</summary>
+        Both = 0,
+
+        /// <summary>N-terminus only</summary>
+        N = 1,
+
+        /// <summary>C-terminus only</summary>
+        C = 2,
+
+        /// <summary>used for internal fragments, could be used for top down intact mass?</summary>
+        None = 3
+    }
+}
\ No newline at end of file
diff --git a/mzLib/MassSpectrometry/Proteomics/Fragmentation/MatchedFragmentIon.cs b/mzLib/MassSpectrometry/Proteomics/Fragmentation/MatchedFragmentIon.cs
new file mode 100644
index 000000000..0f8ade525
--- /dev/null
+++ b/mzLib/MassSpectrometry/Proteomics/Fragmentation/MatchedFragmentIon.cs
@@ -0,0 +1,91 @@
+using System.Text;
+using Chemistry;
+
+namespace Proteomics.Fragmentation
+{
+    /// <summary>
+    /// Pairs a theoretical neutral fragment (<see cref="Product"/>) with the experimental peak
+    /// (m/z, intensity and charge) it was matched to.
+    /// </summary>
+    public class MatchedFragmentIon
+    {
+        public readonly Product NeutralTheoreticalProduct;
+        public readonly double Mz;
+        public readonly double Intensity;
+        public readonly int Charge;
+
+        /// <summary>
+        /// Constructs a new MatchedFragmentIon given information about a theoretical and an experimental fragment mass spectral peak
+        /// </summary>
+        /// <param name="neutralTheoreticalProduct">The theoretical product; passed by reference and copied into this instance</param>
+        /// <param name="experMz">The experimental m/z of the matched peak</param>
+        /// <param name="experIntensity">The experimental intensity of the matched peak</param>
+        /// <param name="charge">The charge of the matched peak</param>
+        public MatchedFragmentIon(ref Product neutralTheoreticalProduct, double experMz, double experIntensity, int charge)
+        {
+            NeutralTheoreticalProduct = neutralTheoreticalProduct;
+            Mz = experMz;
+            Intensity = experIntensity;
+            Charge = charge;
+        }
+
+        /// <summary>
+        /// The experimental mass (m/z converted with the charge) minus the theoretical neutral mass
+        /// </summary>
+        public double MassErrorDa
+        {
+            get
+            {
+                return Mz.ToMass(Charge) - NeutralTheoreticalProduct.NeutralMass;
+            }
+        }
+
+        /// <summary>
+        /// The mass error relative to the theoretical neutral mass, multiplied by 1e6
+        /// </summary>
+        public double MassErrorPpm
+        {
+            get
+            {
+                return (MassErrorDa / NeutralTheoreticalProduct.NeutralMass) * 1e6;
+            }
+        }
+
+        /// <summary>
+        /// The product annotation followed by "+" and the charge. The product annotation is wrapped in
+        /// parentheses when the product has a non-zero neutral loss.
+        /// </summary>
+        public string Annotation
+        {
+            get
+            {
+                StringBuilder sb = new StringBuilder();
+
+                bool containsNeutralLoss = NeutralTheoreticalProduct.NeutralLoss != 0;
+
+                if (containsNeutralLoss)
+                {
+                    sb.Append("(");
+                }
+
+                sb.Append(NeutralTheoreticalProduct.Annotation);
+
+                if (containsNeutralLoss)
+                {
+                    sb.Append(")");
+                }
+
+                sb.Append("+");
+                sb.Append(Charge);
+
+                return sb.ToString();
+            }
+        }
+
+        /// <summary>
+        /// Summarizes a TheoreticalFragmentIon into a string for debug purposes
+        /// </summary>
+        public override string ToString()
+        {
+            // we add the blank space in the tostring because the values are treated like integers and looked up as index in the enum instead of being converted to just string and concatenated
+            return NeutralTheoreticalProduct.ProductType + "" + NeutralTheoreticalProduct.FragmentNumber + "+" + Charge + "\t;" + NeutralTheoreticalProduct.NeutralMass;
+        }
+
+        /// <summary>
+        /// Two matched ions are equal when their theoretical products are equal and charge, m/z and intensity match exactly
+        /// </summary>
+        /// <param name="obj">The object to compare with</param>
+        /// <returns>False when <paramref name="obj"/> is null or not a MatchedFragmentIon; otherwise the field-wise comparison</returns>
+        public override bool Equals(object obj)
+        {
+            // Pattern match instead of a hard cast, so null and unrelated types compare unequal instead of throwing
+            return obj is MatchedFragmentIon other
+                && this.NeutralTheoreticalProduct.Equals(other.NeutralTheoreticalProduct)
+                && this.Charge == other.Charge
+                && this.Mz == other.Mz
+                && this.Intensity == other.Intensity;
+        }
+
+        /// <summary>
+        /// Hash code derived solely from the experimental m/z
+        /// </summary>
+        public override int GetHashCode()
+        {
+            return Mz.GetHashCode();
+        }
+    }
+}
diff --git a/mzLib/MassSpectrometry/Proteomics/Fragmentation/Product.cs b/mzLib/MassSpectrometry/Proteomics/Fragmentation/Product.cs
new file mode 100644
index 000000000..f8de52f04
--- /dev/null
+++ b/mzLib/MassSpectrometry/Proteomics/Fragmentation/Product.cs
@@ -0,0 +1,97 @@
+using System.Text;
+
+namespace Proteomics.Fragmentation
+{
+    /// <summary>
+    /// An immutable description of a single neutral fragment: its mass, product type, terminus, position and,
+    /// for internal fragment ions, a secondary product type and fragment number.
+    /// </summary>
+    public struct Product
+    {
+        public readonly double NeutralMass;
+        public readonly ProductType ProductType;
+        public readonly double NeutralLoss;
+        public readonly FragmentationTerminus Terminus;
+        public readonly int FragmentNumber;
+        public readonly int AminoAcidPosition;
+        public readonly ProductType? SecondaryProductType; //used for internal fragment ions
+        public readonly int SecondaryFragmentNumber; //used for internal fragment ions
+
+        /// <summary>
+        /// A product is the individual neutral fragment from an MS dissociation. A fragmentation product here contains one of the two termini (N- or C-).
+        /// The ProductType describes where along the backbone the fragmentation occurred (e.g. b-, y-, c-, zdot-). The neutral loss mass (if any) that
+        /// occurred from a mod on the fragment is listed as a mass. Finally the neutral mass of the whole fragment is provided.
+        /// </summary>
+        /// <param name="productType">The product (ion) type</param>
+        /// <param name="terminus">The terminus the fragment contains</param>
+        /// <param name="neutralMass">The neutral mass of the whole fragment</param>
+        /// <param name="fragmentNumber">The fragment number (e.g. the 3 in the b3 ion)</param>
+        /// <param name="aminoAcidPosition">The amino acid position of the fragment</param>
+        /// <param name="neutralLoss">The neutral loss mass, 0 when there is none</param>
+        /// <param name="secondaryProductType">The second product type of an internal fragment ion; null for terminal fragments</param>
+        /// <param name="secondaryFragmentNumber">The second fragment number of an internal fragment ion</param>
+        public Product(ProductType productType, FragmentationTerminus terminus, double neutralMass,
+            int fragmentNumber, int aminoAcidPosition, double neutralLoss, ProductType? secondaryProductType = null, int secondaryFragmentNumber = 0)
+        {
+            NeutralMass = neutralMass;
+            ProductType = productType;
+            NeutralLoss = neutralLoss;
+            Terminus = terminus;
+            FragmentNumber = fragmentNumber;
+            AminoAcidPosition = aminoAcidPosition;
+            SecondaryProductType = secondaryProductType;
+            SecondaryFragmentNumber = secondaryFragmentNumber;
+        }
+
+        /// <summary>
+        /// The annotation of this product: product type and fragment number for terminal fragments, or the
+        /// internal-fragment form (e.g. yIb[18-36]); a non-zero neutral loss is appended as "-" plus the loss with two decimals.
+        /// </summary>
+        public string Annotation
+        {
+            get
+            {
+                StringBuilder sb = new StringBuilder();
+
+                if (SecondaryProductType == null)
+                {
+                    sb.Append(ProductType);
+
+                    // for "normal" fragments this is just the fragment number (e.g., the 3 in the b3 ion)
+                    // for diagnostic ions, it's the m/z assuming z=1
+                    // (e.g., a diagnostic ion with neutral mass 100 Da will be reported as the D101 fragment)
+                    sb.Append(FragmentNumber);
+                }
+                else
+                {
+                    //internal fragment ion, annotation used here: 10.1007/s13361-015-1078-1
+                    //example: yIb[18-36]
+                    sb.Append(ProductType + "I" + SecondaryProductType.Value + "[" + FragmentNumber + "-" + SecondaryFragmentNumber + "]");
+                }
+                if (NeutralLoss != 0)
+                {
+                    sb.Append("-");
+                    sb.Append(NeutralLoss.ToString("F2"));
+                }
+
+                return sb.ToString();
+            }
+        }
+
+        /// <summary>
+        /// Summarizes a Product into a string for debug purposes
+        /// </summary>
+        public override string ToString()
+        {
+            if (SecondaryProductType == null)
+            {
+                return ProductType + "" + FragmentNumber + ";" + NeutralMass.ToString("F5") + "-" + string.Format("{0:0.##}", NeutralLoss);
+            }
+            else
+            {
+                return ProductType + "I" + SecondaryProductType.Value + "[" + FragmentNumber + "-" + SecondaryFragmentNumber + "]" + ";" + NeutralMass.ToString("F5") + "-" + string.Format("{0:0.##}", NeutralLoss);
+            }
+        }
+
+        /// <summary>
+        /// Two products are equal when product type, neutral mass, fragment number, neutral loss, secondary fragment
+        /// number and secondary product type all match exactly. Terminus and amino acid position are not compared.
+        /// </summary>
+        /// <param name="obj">The object to compare with</param>
+        /// <returns>False when <paramref name="obj"/> is null or not a Product; otherwise the field-wise comparison</returns>
+        public override bool Equals(object obj)
+        {
+            // Pattern match instead of unboxing unconditionally, so null and unrelated types compare unequal instead of throwing
+            return obj is Product other
+                && this.ProductType == other.ProductType
+                && this.NeutralMass == other.NeutralMass
+                && this.FragmentNumber == other.FragmentNumber
+                && this.NeutralLoss == other.NeutralLoss
+                && this.SecondaryFragmentNumber == other.SecondaryFragmentNumber
+                && this.SecondaryProductType == other.SecondaryProductType;
+        }
+
+        /// <summary>
+        /// Hash code derived solely from the neutral mass
+        /// </summary>
+        public override int GetHashCode()
+        {
+            return NeutralMass.GetHashCode();
+        }
+    }
+}
\ No newline at end of file
diff --git a/mzLib/MassSpectrometry/Proteomics/Fragmentation/ProductType.cs b/mzLib/MassSpectrometry/Proteomics/Fragmentation/ProductType.cs
new file mode 100644
index 000000000..10e9bcbab
--- /dev/null
+++ b/mzLib/MassSpectrometry/Proteomics/Fragmentation/ProductType.cs
@@ -0,0 +1,42 @@
+namespace Proteomics.Fragmentation
+{
+ /// <summary>
+ /// Kinds of product (fragment) ions. The comment table inside the enum gives the neutral-mass
+ /// composition of the classic ion series; the trailing members cover the molecular ion,
+ /// diagnostic ions and glyco Y ions, as noted next to each member.
+ /// </summary>
+ public enum ProductType
+ {
+ //Ion Type Neutral Mr
+ //a [N]+[M]-CHO
+ //a* a-NH3
+ //a° a-H2O
+ //b [N]+[M]-H
+ //b* b-NH3
+ //b° b-H2O
+ //c [N]+[M]+NH2
+ //d a – partial side chain
+ //v y – complete side chain
+ //w z – partial side chain
+ //x [C]+[M]+CO-H
+ //y [C]+[M]+H
+ //y* y-NH3
+ //y° y-H2O
+ //z [C]+[M]-NH2
+ // NOTE: d, v and w appear in the table above but have no member in this enum.
+
+ a,
+ aStar,
+ aDegree,
+ b,
+ bAmmoniaLoss,
+ bWaterLoss,
+ //BnoB1ions,
+ c,
+ x,
+ y,
+ yAmmoniaLoss,
+ yWaterLoss,
+ zPlusOne,//This is zDot plus H
+ zDot,
+ M, //this is the molecular ion // [M]
+ D, //this is a diagnostic ion // Modification loss mass
+ Ycore, //Glyco core Y ions // [pep] + Neutral core Glycan mass (such as: [pep] + [N]) //Which already consider the loss of H2O and H-transfer
+ Y //Glyco Y ions // [pep] + other Glycan mass
+ }
+
+}
\ No newline at end of file
diff --git a/mzLib/MassSpectrometry/Proteomics/Fragmentation/TerminusSpecificProductTypes.cs b/mzLib/MassSpectrometry/Proteomics/Fragmentation/TerminusSpecificProductTypes.cs
new file mode 100644
index 000000000..543ef482f
--- /dev/null
+++ b/mzLib/MassSpectrometry/Proteomics/Fragmentation/TerminusSpecificProductTypes.cs
@@ -0,0 +1,33 @@
+using System.Collections.Generic;
+
+namespace Proteomics.Fragmentation
+{
+ /// <summary>
+ /// Static lookup tables relating product (fragment ion) types to the terminus they contain.
+ /// Both tables are public, static and not readonly, so any caller can replace or mutate them.
+ /// </summary>
+ public class TerminusSpecificProductTypes
+ {
+ // Product types grouped by terminus. The Both entry lists the N entries followed by the C entries; None maps to an empty list.
+ public static Dictionary> ProductIonTypesFromSpecifiedTerminus = new Dictionary>
+ {
+ {FragmentationTerminus.N, new List{ ProductType.a, ProductType.aDegree, ProductType.aStar, ProductType.b, ProductType.bWaterLoss, ProductType.bAmmoniaLoss, ProductType.c } }, //all ion types that include the N-terminus
+ {FragmentationTerminus.C, new List{ ProductType.x, ProductType.y, ProductType.yWaterLoss, ProductType.yAmmoniaLoss, ProductType.zDot, ProductType.zPlusOne } }, //all ion types that include the C-terminus
+ {FragmentationTerminus.Both, new List{ ProductType.a, ProductType.aDegree, ProductType.aStar, ProductType.b, ProductType.bWaterLoss, ProductType.bAmmoniaLoss, ProductType.c, ProductType.x, ProductType.y, ProductType.yWaterLoss, ProductType.yAmmoniaLoss, ProductType.zDot, ProductType.zPlusOne} },
+ {FragmentationTerminus.None, new List() }
+ };
+
+ // Reverse lookup from a product type to the terminus it contains.
+ // NOTE(review): ProductType.M, D, Ycore and Y have no entry, so indexing with them throws KeyNotFoundException.
+ public static Dictionary ProductTypeToFragmentationTerminus = new Dictionary
+ {
+ { ProductType.a, FragmentationTerminus.N },
+ { ProductType.aDegree, FragmentationTerminus.N },
+ { ProductType.aStar, FragmentationTerminus.N },
+ { ProductType.b, FragmentationTerminus.N },
+ { ProductType.bWaterLoss, FragmentationTerminus.N },
+ { ProductType.bAmmoniaLoss, FragmentationTerminus.N },
+ { ProductType.c, FragmentationTerminus.N },
+ { ProductType.x, FragmentationTerminus.C },
+ { ProductType.y, FragmentationTerminus.C },
+ { ProductType.yWaterLoss, FragmentationTerminus.C },
+ { ProductType.yAmmoniaLoss, FragmentationTerminus.C },
+ { ProductType.zDot, FragmentationTerminus.C },
+ { ProductType.zPlusOne, FragmentationTerminus.C },
+ };
+
+ }
+}
diff --git a/mzLib/MassSpectrometry/Proteomics/Modifications/ModLocationOnPeptideOrProtein.cs b/mzLib/MassSpectrometry/Proteomics/Modifications/ModLocationOnPeptideOrProtein.cs
new file mode 100644
index 000000000..4a143a3fa
--- /dev/null
+++ b/mzLib/MassSpectrometry/Proteomics/Modifications/ModLocationOnPeptideOrProtein.cs
@@ -0,0 +1,11 @@
+namespace Proteomics
+{
+ /// <summary>
+ /// Positions at which a modification may be located.
+ /// NOTE(review): the member names presumably stand for peptide N-terminus, peptide C-terminus,
+ /// protein N-terminus, protein C-terminus and anywhere -- confirm against usage.
+ /// </summary>
+ public enum ModLocationOnPeptideOrProtein
+ {
+ NPep,
+ PepC,
+ NProt,
+ ProtC,
+ Any
+ }
+}
\ No newline at end of file
diff --git a/mzLib/MassSpectrometry/Proteomics/Modifications/Modification.cs b/mzLib/MassSpectrometry/Proteomics/Modifications/Modification.cs
new file mode 100644
index 000000000..711aaf32b
--- /dev/null
+++ b/mzLib/MassSpectrometry/Proteomics/Modifications/Modification.cs
@@ -0,0 +1,303 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using Chemistry;
+using MassSpectrometry;
+
+namespace Proteomics
+{
+ public class Modification
+ {
+ public string IdWithMotif { get; private set; }
+ public string OriginalId { get; private set; }
+ public string Accession { get; private set; }
+ public string ModificationType { get; private set; }
+ public string FeatureType { get; private set; }
+ public ModificationMotif Target { get; private set; }
+ public string LocationRestriction { get; private set; }
+ public ChemicalFormula ChemicalFormula { get; private set; }
+ private double? monoisotopicMass = null;
+
+ /// <summary>
+ /// Monoisotopic mass of the modification. Reads return the backing field passed through
+ /// ClassExtensions.RoundedDouble; writes store the supplied value unchanged.
+ /// </summary>
+ public double? MonoisotopicMass
+ {
+     get => ClassExtensions.RoundedDouble(monoisotopicMass);
+     private set => monoisotopicMass = value;
+ }
+
+ public Dictionary> DatabaseReference { get; private set; }
+ public Dictionary> TaxonomicRange { get; private set; }
+ public List Keywords { get; private set; }
+ public Dictionary> NeutralLosses { get; private set; }
+ public Dictionary> DiagnosticIons { get; private set; }
+ public string FileOrigin { get; private set; }
+ protected const double tolForEquality = 1e-9;
+
+ /// <summary>
+ /// True when the modification has an id with motif, a chemical formula or a mass, a target,
+ /// an assigned location restriction, a modification type without ':' and is not a CROSSLINK feature.
+ /// </summary>
+ public bool ValidModification =>
+     IdWithMotif != null
+     && (ChemicalFormula != null || MonoisotopicMass != null)
+     && Target != null
+     && LocationRestriction != "Unassigned."
+     && ModificationType != null // must precede the Contains call below
+     && FeatureType != "CROSSLINK"
+     && !ModificationType.Contains(':');
+
+ /// <summary>
+ /// Creates a modification. Every argument is optional; the id, location restriction and mass are normalized as described below.
+ /// </summary>
+ /// <remarks>
+ /// Id handling: an id containing " on " is kept as IdWithMotif and the text before it becomes OriginalId;
+ /// an id containing " of " has that separator rewritten to " on "; otherwise, when a target is given,
+ /// IdWithMotif is built as id + " on " + target. With neither separator nor target, IdWithMotif stays null.
+ /// The location restriction is passed through ModLocationOnPeptideOrProtein, and the monoisotopic mass
+ /// falls back to the chemical formula's mass when no mass was supplied.
+ /// </remarks>
+ public Modification(string _originalId = null, string _accession = null, string _modificationType = null, string _featureType = null,
+ ModificationMotif _target = null, string _locationRestriction = "Unassigned.", ChemicalFormula _chemicalFormula = null,
+ double? _monoisotopicMass = null, Dictionary> _databaseReference = null,
+ Dictionary> _taxonomicRange = null, List _keywords = null,
+ Dictionary> _neutralLosses = null, Dictionary> _diagnosticIons = null,
+ string _fileOrigin = null)
+ {
+ if (_originalId != null)
+ {
+ if (_originalId.Contains(" on "))
+ {
+ this.IdWithMotif = _originalId;
+ this.OriginalId = _originalId.Split(new[] { " on " }, StringSplitOptions.None)[0];
+ }
+ else if (_originalId.Contains(" of "))
+ {
+ this.IdWithMotif = _originalId.Replace(" of ", " on ");
+ this.OriginalId = _originalId.Split(new[] { " of ", " on " }, StringSplitOptions.None)[0];
+ }
+ else if (_target != null)
+ {
+ this.IdWithMotif = _originalId + " on " + _target.ToString();
+ this.OriginalId = _originalId;
+ }
+ else
+ {
+ // no motif in the id and no target: IdWithMotif is left null
+ this.OriginalId = _originalId;
+ }
+ }
+
+ this.Accession = _accession;
+ this.ModificationType = _modificationType;
+ this.FeatureType = _featureType;
+ this.Target = _target;
+ // unrecognized restriction strings (including null) become "Unassigned."
+ this.LocationRestriction = ModLocationOnPeptideOrProtein(_locationRestriction);
+ this.ChemicalFormula = _chemicalFormula;
+ this.MonoisotopicMass = _monoisotopicMass;
+ this.DatabaseReference = _databaseReference;
+ this.TaxonomicRange = _taxonomicRange;
+ this.Keywords = _keywords;
+ this.NeutralLosses = _neutralLosses;
+ this.DiagnosticIons = _diagnosticIons;
+ this.FileOrigin = _fileOrigin;
+
+ // derive the mass from the formula only when no explicit mass was given
+ if (this.MonoisotopicMass == null && this.ChemicalFormula != null)
+ {
+ this.MonoisotopicMass = this.ChemicalFormula.MonoisotopicMass;
+ }
+ }
+
+ /// <summary>
+ /// Normalizes a location restriction string: the five recognized values are returned unchanged,
+ /// anything else (including null) becomes "Unassigned.".
+ /// </summary>
+ /// <param name="_locationRestriction">Restriction text to validate.</param>
+ /// <returns>The input when it is a recognized restriction, otherwise "Unassigned.".</returns>
+ public static string ModLocationOnPeptideOrProtein(string _locationRestriction)
+ {
+     switch (_locationRestriction)
+     {
+         case "N-terminal.":
+         case "C-terminal.":
+         case "Peptide N-terminal.":
+         case "Peptide C-terminal.":
+         case "Anywhere.":
+             return _locationRestriction;
+
+         default:
+             return "Unassigned.";
+     }
+ }
+
+ /// <summary>
+ /// Two modifications are equal when IdWithMotif, OriginalId and ModificationType match and their
+ /// monoisotopic masses are both null, identical, or within tolForEquality of each other.
+ /// </summary>
+ /// <param name="o">Object to compare; null or anything that is not a Modification yields false.</param>
+ public override bool Equals(object o)
+ {
+     // Test the cast result rather than the argument: checking 'o' let any non-Modification
+     // object through to a NullReferenceException on the first member access.
+     Modification m = o as Modification;
+     if (m == null)
+     {
+         return false;
+     }
+
+     double? mine = MonoisotopicMass;
+     double? theirs = m.MonoisotopicMass;
+     bool massesAgree = mine == theirs
+         || mine != null && theirs != null && Math.Abs(theirs.Value - mine.Value) < tolForEquality;
+
+     return IdWithMotif == m.IdWithMotif
+         && OriginalId == m.OriginalId
+         && ModificationType == m.ModificationType
+         && massesAgree;
+ }
+
+ /// <summary>
+ /// Hash built from the id (IdWithMotif, falling back to OriginalId, then empty) XOR the modification type (empty when null).
+ /// </summary>
+ public override int GetHashCode()
+ {
+     int idHash = (IdWithMotif ?? OriginalId ?? string.Empty).GetHashCode();
+     int typeHash = (ModificationType ?? string.Empty).GetHashCode();
+     return idHash ^ typeHash;
+ }
+
+ /// <summary>
+ /// Serializes the modification as one line per field, each starting with a two-letter tag
+ /// (ID, AC, MT, FT, TG, PP, CF, MM, DR, TR, NL, DI, KW). Null fields and empty collections are skipped;
+ /// dictionary keys and their values are sorted before being written.
+ /// </summary>
+ public override string ToString()
+ {
+ StringBuilder sb = new StringBuilder();
+ if (this.IdWithMotif != null)
+ { sb.AppendLine("ID " + this.IdWithMotif); }
+ if (this.Accession != null)
+ { sb.AppendLine("AC " + this.Accession); }
+ if (this.ModificationType != null)
+ { sb.AppendLine("MT " + this.ModificationType); }
+ if (this.FeatureType != null)
+ { sb.AppendLine("FT " + this.FeatureType); }
+ if (this.Target != null)
+ { sb.AppendLine("TG " + this.Target); } // at this stage, each mod has only one target though many may have the same Id
+ if (this.LocationRestriction != null)
+ { sb.AppendLine("PP " + this.LocationRestriction); }
+ if (this.ChemicalFormula != null)
+ { sb.AppendLine("CF " + this.ChemicalFormula.Formula); }
+ if (this.MonoisotopicMass != null)
+ { sb.AppendLine("MM " + this.MonoisotopicMass); }
+ // DR: one line per (key, value) pair, keys and values each sorted
+ if (this.DatabaseReference != null)
+ {
+ if (this.DatabaseReference.Count != 0)
+ {
+ List myKeys = new List(this.DatabaseReference.Keys);
+ myKeys.Sort();
+ foreach (string myKey in myKeys)
+ {
+ List myValues = new List(this.DatabaseReference[myKey]);
+ myValues.Sort();
+ foreach (string myValue in myValues)
+ {
+ sb.AppendLine("DR " + myKey + "; " + myValue);
+ }
+ }
+ }
+ }
+ // TR: same layout as DR
+ if (this.TaxonomicRange != null)
+ {
+ if (this.TaxonomicRange.Count != 0)
+ {
+ List myKeys = new List(this.TaxonomicRange.Keys);
+ myKeys.Sort();
+ foreach (string myKey in myKeys)
+ {
+ List myValues = new List(this.TaxonomicRange[myKey]);
+ myValues.Sort();
+ foreach (string myValue in myValues)
+ {
+ sb.AppendLine("TR " + myKey + "; " + myValue);
+ }
+ }
+ }
+ }
+ // NL: one line per dissociation type, its sorted values joined with " or "
+ if (this.NeutralLosses != null)
+ {
+ if (this.NeutralLosses.Count != 0)
+ {
+ List allDissociationTypes = this.NeutralLosses.Keys.ToList();
+ allDissociationTypes.Sort();
+
+ foreach (DissociationType dissociationType in allDissociationTypes)
+ {
+ StringBuilder myLine = new StringBuilder();
+ myLine.Append("NL ");
+
+ List myValues = new List(this.NeutralLosses[dissociationType]);
+ myValues.Sort();
+ for (int i = 0; i < myValues.Count; i++)
+ {
+ myLine.Append(dissociationType + ":" + ClassExtensions.RoundedDouble(myValues[i]));
+ if (i < myValues.Count - 1)
+ myLine.Append(" or ");
+ }
+
+ sb.AppendLine(myLine.ToString());
+ }
+ }
+ }
+ // DI: same layout as NL
+ if (this.DiagnosticIons != null)
+ {
+ if (this.DiagnosticIons.Count != 0)
+ {
+ List allDissociationTypes = this.DiagnosticIons.Keys.ToList();
+ allDissociationTypes.Sort();
+
+ foreach (DissociationType dissociationType in allDissociationTypes)
+ {
+ StringBuilder myLine = new StringBuilder();
+ myLine.Append("DI ");
+
+ List myValues = new List(this.DiagnosticIons[dissociationType]);
+ myValues.Sort();
+ for (int i = 0; i < myValues.Count; i++)
+ {
+ myLine.Append(dissociationType + ":" + ClassExtensions.RoundedDouble(myValues[i]));
+ if (i < myValues.Count - 1)
+ myLine.Append(" or ");
+ }
+
+ sb.AppendLine(myLine.ToString());
+ }
+ }
+ }
+
+ // KW: a single line with the keywords ordered and joined with " or "
+ if (this.Keywords != null)
+ {
+ if (this.Keywords.Count != 0)
+ {
+ sb.AppendLine("KW " + String.Join(" or ", this.Keywords.ToList().OrderBy(b => b)));
+ }
+ }
+
+ return sb.ToString();
+ }
+
+ /// <summary>
+ /// Reports errors in required fields: returns ToString() followed by one "#" line per problem found,
+ /// and finally a line naming the file this modification came from.
+ /// </summary>
+ public string ModificationErrorsToString()
+ {
+     StringBuilder report = new StringBuilder(ToString());
+
+     if (IdWithMotif == null)
+     {
+         report.AppendLine("#Required field ID missing or malformed. Current value = " + IdWithMotif);
+     }
+
+     if (ModificationType == null)
+     {
+         report.AppendLine("#Required field MT missing or malformed. Current value = " + ModificationType);
+     }
+
+     // NOTE(review): the constructor always stores the result of ModLocationOnPeptideOrProtein, which
+     // never returns null, so this branch looks unreachable; an "Unassigned." location goes unreported.
+     if (LocationRestriction == null)
+     {
+         report.AppendLine("#Required field PP missing or malformed. Current value = " + LocationRestriction + ".");
+     }
+
+     bool hasFormula = ChemicalFormula != null;
+     bool hasMass = MonoisotopicMass != null;
+     if (!hasFormula && !hasMass)
+     {
+         report.AppendLine("#Required fields CF and MM are both missing or malformed. One of those two fields must be provided.");
+     }
+
+     if (ModificationType != null && ModificationType.Contains(':'))
+     {
+         report.AppendLine("#Modification type cannot contain ':'!");
+     }
+
+     report.Append("#This modification can be found in file " + FileOrigin);
+
+     return report.ToString();
+ }
+
+
+ }
+}
\ No newline at end of file
diff --git a/mzLib/MassSpectrometry/Proteomics/Modifications/ModificationLocalization.cs b/mzLib/MassSpectrometry/Proteomics/Modifications/ModificationLocalization.cs
new file mode 100644
index 000000000..b12c0e19c
--- /dev/null
+++ b/mzLib/MassSpectrometry/Proteomics/Modifications/ModificationLocalization.cs
@@ -0,0 +1,69 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+
+namespace Proteomics
+{
+ public static class ModificationLocalization
+ {
+ /// <summary>
+ /// Decides whether a modification can sit at the given position: the motif must match the protein sequence
+ /// around that position, and any terminal location restriction must be satisfied.
+ /// </summary>
+ /// <param name="attemptToLocalize">Modification whose Target motif and LocationRestriction are checked.</param>
+ /// <param name="proteinSequence">Sequence the motif is aligned against.</param>
+ /// <param name="peptideOneBasedIndex">Position within the peptide, used for the peptide-terminal restrictions.</param>
+ /// <param name="peptideLength">Peptide length, used for the "Peptide C-terminal." restriction.</param>
+ /// <param name="proteinOneBasedIndex">Position within the protein that the motif's capital letter is aligned to.</param>
+ /// <returns>False on any motif mismatch or violated restriction; true otherwise.</returns>
+ public static bool ModFits(Modification attemptToLocalize, string proteinSequence, int peptideOneBasedIndex, int peptideLength, int proteinOneBasedIndex)
+ {
+     string motifText = attemptToLocalize.Target.ToString();
+
+     // Align the motif so that its first capital letter lands on the candidate residue.
+     int capitalIndex = motifText.IndexOf(motifText.First(b => char.IsUpper(b)));
+     int offset = proteinOneBasedIndex - capitalIndex - 1;
+
+     for (int i = 0; i < motifText.Length; i++)
+     {
+         int proteinIndex = i + offset;
+         bool insideProtein = proteinIndex >= 0 && proteinIndex < proteinSequence.Length;
+         if (!insideProtein || !MotifMatches(motifText[i], proteinSequence[proteinIndex]))
+         {
+             return false;
+         }
+     }
+
+     switch (attemptToLocalize.LocationRestriction)
+     {
+         case "N-terminal.":
+             return proteinOneBasedIndex <= 2;
+
+         case "Peptide N-terminal.":
+             return peptideOneBasedIndex <= 1;
+
+         case "C-terminal.":
+             return proteinOneBasedIndex >= proteinSequence.Length;
+
+         case "Peptide C-terminal.":
+             return peptideOneBasedIndex >= peptideLength;
+
+         default:
+             // "Anywhere." and "Unassigned." (and anything else) impose no positional constraint
+             return true;
+     }
+ }
+
+ /// <summary>
+ /// Returns true when the protein already lists, under key <paramref name="i"/> of
+ /// OneBasedPossibleLocalizedModifications, a modification of type "UniProt" whose monoisotopic mass
+ /// is within 0.001 of the candidate's. Returns false when the protein has no entry for that key.
+ /// </summary>
+ /// <remarks>
+ /// NOTE(review): MonoisotopicMass is nullable and the difference is cast to double, so a modification
+ /// without a mass on either side throws here -- confirm callers only pass mods that have a mass.
+ /// </remarks>
+ public static bool UniprotModExists(Protein protein, int i, Modification attemptToLocalize)
+ {
+ // UniProt mods with the same mass take precedence over variable mods
+ if (protein.OneBasedPossibleLocalizedModifications.TryGetValue(i, out List modsAtThisLocation)) {
+ return modsAtThisLocation.Any(p => Math.Abs((double)(p.MonoisotopicMass - attemptToLocalize.MonoisotopicMass)) < 0.001 && p.ModificationType == "UniProt");
+ }
+
+ return false;
+ }
+
+ /// <summary>
+ /// Compares one motif character (upper-cased first) with one sequence character.
+ /// X matches anything; B matches D or N; J matches I or L; Z matches E or Q; otherwise the characters must be identical.
+ /// </summary>
+ private static bool MotifMatches(char motifChar, char sequenceChar)
+ {
+     char upperMotifChar = char.ToUpper(motifChar);
+     if (upperMotifChar == 'X' || upperMotifChar == sequenceChar)
+     {
+         return true;
+     }
+
+     switch (upperMotifChar)
+     {
+         case 'B':
+             return sequenceChar == 'D' || sequenceChar == 'N';
+
+         case 'J':
+             return sequenceChar == 'I' || sequenceChar == 'L';
+
+         case 'Z':
+             return sequenceChar == 'E' || sequenceChar == 'Q';
+
+         default:
+             return false;
+     }
+ }
+ }
+}
\ No newline at end of file
diff --git a/mzLib/MassSpectrometry/Proteomics/Modifications/ModificationMotif.cs b/mzLib/MassSpectrometry/Proteomics/Modifications/ModificationMotif.cs
new file mode 100644
index 000000000..204ed16a4
--- /dev/null
+++ b/mzLib/MassSpectrometry/Proteomics/Modifications/ModificationMotif.cs
@@ -0,0 +1,50 @@
+using System.Linq;
+using System.Text.RegularExpressions;
+
+namespace Proteomics
+{
+ /// <summary>
+ /// An amino-acid motif made of letters only, with exactly one upper-case letter.
+ /// Instances are created through <see cref="TryGetMotif"/> and compare by their motif string.
+ /// </summary>
+ public class ModificationMotif
+ {
+     // \z anchors at the true end of input; '$' would also match just before a trailing newline,
+     // letting a string such as "K\n" through as a valid motif.
+     private static readonly Regex ModificationMotifRegex = new Regex(@"^[A-Za-z]+\z", RegexOptions.Compiled);
+     private readonly string motifString;
+
+     private ModificationMotif(string motif)
+     {
+         motifString = motif;
+     }
+
+     /// <summary>
+     /// Only upper and lower case letters allowed, must have a single upper case letter.
+     /// </summary>
+     /// <param name="motifString">Candidate motif text; null is rejected rather than throwing.</param>
+     /// <param name="motif">The parsed motif on success, otherwise null.</param>
+     /// <returns>True when <paramref name="motifString"/> is a valid motif.</returns>
+     public static bool TryGetMotif(string motifString, out ModificationMotif motif)
+     {
+         motif = null;
+
+         // A Try-method reports failure through its return value; Regex.IsMatch would throw on null.
+         if (motifString == null)
+         {
+             return false;
+         }
+
+         if (ModificationMotifRegex.IsMatch(motifString) && motifString.Count(b => char.IsUpper(b)) == 1)
+         {
+             motif = new ModificationMotif(motifString);
+             return true;
+         }
+         return false;
+     }
+
+     /// <summary>Motifs are equal when their motif strings are equal.</summary>
+     public override bool Equals(object o)
+     {
+         ModificationMotif m = o as ModificationMotif;
+         return m != null
+             && m.motifString == motifString;
+     }
+
+     /// <summary>Hash of the motif string, consistent with <see cref="Equals(object)"/>.</summary>
+     public override int GetHashCode()
+     {
+         return motifString.GetHashCode();
+     }
+
+     /// <summary>Returns the motif string exactly as it was parsed.</summary>
+     public override string ToString()
+     {
+         return motifString;
+     }
+ }
+}
\ No newline at end of file
diff --git a/mzLib/MassSpectrometry/Proteomics/Modifications/SilacLabel.cs b/mzLib/MassSpectrometry/Proteomics/Modifications/SilacLabel.cs
new file mode 100644
index 000000000..d6376a3f7
--- /dev/null
+++ b/mzLib/MassSpectrometry/Proteomics/Modifications/SilacLabel.cs
@@ -0,0 +1,65 @@
+using System;
+using System.Collections.Generic;
+using System.Globalization;
+
+namespace Proteomics
+{
+ ///
+ /// Silac labels used to modify unlabeled proteins
+ ///
+ public class SilacLabel
+ {
+ public char OriginalAminoAcid { get; private set; }
+ public char AminoAcidLabel { get; private set; }
+ public string LabelChemicalFormula { get; private set; }
+ public string MassDifference { get; private set; }
+ public List AdditionalLabels { get; private set; }
+
+ /// <summary>
+ /// Creates a label and stores the mass difference as text: rounded to three decimals,
+ /// with a leading "+" for positive values.
+ /// </summary>
+ /// <param name="originalAminoAcid">Residue letter stored in OriginalAminoAcid.</param>
+ /// <param name="aminoAcidLabel">Letter stored in AminoAcidLabel.</param>
+ /// <param name="labelChemicalFormula">Formula text stored in LabelChemicalFormula.</param>
+ /// <param name="massDifference">Mass difference to format into MassDifference.</param>
+ public SilacLabel(char originalAminoAcid, char aminoAcidLabel, string labelChemicalFormula, double massDifference)
+ {
+     OriginalAminoAcid = originalAminoAcid;
+     AminoAcidLabel = aminoAcidLabel;
+     LabelChemicalFormula = labelChemicalFormula;
+
+     // Format with the invariant culture: ConvertMassDifferenceToDouble parses with the invariant culture,
+     // so a culture-specific decimal separator (e.g. "8,014") would be read back as a different number.
+     MassDifference = Math.Round(massDifference, 3).ToString("F3", CultureInfo.InvariantCulture);
+     if (massDifference > 0) //if not negative, add a plus
+     {
+         MassDifference = "+" + MassDifference;
+     }
+ }
+
+ /// <summary>
+ /// Appends a label to AdditionalLabels, creating the list on first use.
+ /// </summary>
+ public void AddAdditionalSilacLabel(SilacLabel label)
+ {
+     AdditionalLabels ??= new();
+     AdditionalLabels.Add(label);
+ }
+
+ /// <summary>
+ /// This method exists for conversion of Silac labels, which take double inputs.
+ /// Although a double object could be saved, it clutters tomls.
+ /// </summary>
+ /// <returns>The numeric value of MassDifference, parsed with the invariant culture.</returns>
+ public double ConvertMassDifferenceToDouble()
+ {
+     // Parse the whole string and let the parser handle an optional leading '+' or '-'.
+     // Unconditionally dropping the first character lost a digit whenever the text had no sign
+     // (e.g. "8.014" was read as 0.014).
+     return double.Parse(MassDifference, NumberStyles.Float | NumberStyles.AllowThousands, CultureInfo.InvariantCulture);
+ }
+
+ /// <summary>
+ /// This parameterless constructor needs to exist to read the toml.
+ /// If you can figure out a way to get rid of it, feel free...
+ /// This is also encountered in MetaMorpheus's "CommonParameters.cs" if you find a solution.
+ /// Every property is left at its default value.
+ /// </summary>
+ public SilacLabel()
+ {
+ }
+ }
+}
diff --git a/mzLib/MassSpectrometry/Proteomics/Protein/DatabaseReference.cs b/mzLib/MassSpectrometry/Proteomics/Protein/DatabaseReference.cs
new file mode 100644
index 000000000..b9945a142
--- /dev/null
+++ b/mzLib/MassSpectrometry/Proteomics/Protein/DatabaseReference.cs
@@ -0,0 +1,51 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+
+namespace Proteomics
+{
+ public class DatabaseReference
+ {
+ /// <summary>
+ /// DatabaseReference constructor, which takes the type and ID strings of the reference, and a list of properties. Each property contains the "type" and "value" of the property as Item1 and Item2 of the Tuple.
+ /// </summary>
+ /// <param name="type">dbRef type; null is stored as the empty string.</param>
+ /// <param name="id">dbRef ID string; null is stored as the empty string.</param>
+ /// <param name="properties">Properties of the reference; null is stored as an empty list.</param>
+ public DatabaseReference(string type, string id, IEnumerable> properties)
+ {
+ Type = type ?? "";
+ Id = id ?? "";
+ Properties = properties ?? new List>();
+ }
+
+ ///
+ /// dbRef type, e.g. "GO" for GO terms
+ ///
+ public string Type { get; }
+
+ ///
+ /// dbRef ID string
+ ///
+ public string Id { get; }
+
+ ///
+ /// Each database reference contains a list of properties. Item1 of this Tuple is the "type", and Item2 is the "value" of the property.
+ ///
+ public IEnumerable> Properties { get; }
+
+ /// <summary>
+ /// Two references are equal when Type and Id match and their properties contain the same
+ /// entries regardless of order.
+ /// </summary>
+ /// <param name="obj">Object to compare; null or anything that is not a DatabaseReference yields false.</param>
+ public override bool Equals(object obj)
+ {
+     // Test the cast result rather than the argument: checking 'obj' let any other type
+     // through to a NullReferenceException on d.Type.
+     DatabaseReference d = obj as DatabaseReference;
+     if (d == null)
+     {
+         return false;
+     }
+
+     // Type and Id are never null (the constructor replaces null with ""), so == is sufficient.
+     return d.Type == Type
+         && d.Id == Id
+         && d.Properties.OrderBy(x => x).SequenceEqual(Properties.OrderBy(x => x));
+ }
+
+ /// <summary>
+ /// Hash of Type XOR hash of Id; neither can be null because the constructor replaces null with "".
+ /// </summary>
+ public override int GetHashCode() => Type.GetHashCode() ^ Id.GetHashCode();
+ }
+}
\ No newline at end of file
diff --git a/mzLib/MassSpectrometry/Proteomics/Protein/DisulfideBond.cs b/mzLib/MassSpectrometry/Proteomics/Protein/DisulfideBond.cs
new file mode 100644
index 000000000..6bd4fa1af
--- /dev/null
+++ b/mzLib/MassSpectrometry/Proteomics/Protein/DisulfideBond.cs
@@ -0,0 +1,48 @@
+namespace Proteomics
+{
+ /// <summary>
+ /// A disulfide bond annotation: one-based begin and end positions plus an optional description.
+ /// </summary>
+ public class DisulfideBond
+ {
+     /// <summary>Creates a bond between two one-based positions.</summary>
+     /// <param name="OneBasedBeginPosition">Beginning position of the bond.</param>
+     /// <param name="OneBasedEndPosition">End position of the bond.</param>
+     /// <param name="Description">Optional description; null is stored as the empty string.</param>
+     public DisulfideBond(int OneBasedBeginPosition, int OneBasedEndPosition, string Description)
+     {
+         this.OneBasedBeginPosition = OneBasedBeginPosition;
+         this.OneBasedEndPosition = OneBasedEndPosition;
+         this.Description = Description ?? "";
+     }
+
+     /// <summary>For interchain disulfide bonds, sets begin and end to the same position.</summary>
+     public DisulfideBond(int OneBasedPosition, string Description)
+         : this(OneBasedPosition, OneBasedPosition, Description)
+     { }
+
+     /// <summary>
+     /// Beginning position of disulfide bond
+     /// </summary>
+     public int OneBasedBeginPosition { get; set; }
+
+     /// <summary>
+     /// End position of disulfide bond
+     /// </summary>
+     public int OneBasedEndPosition { get; set; }
+
+     /// <summary>
+     /// Description of this bond (optional)
+     /// </summary>
+     public string Description { get; set; }
+
+     /// <summary>Bonds are equal when both positions and the description match.</summary>
+     public override bool Equals(object obj)
+     {
+         DisulfideBond bond = obj as DisulfideBond;
+         return bond != null
+             && bond.OneBasedBeginPosition == OneBasedBeginPosition
+             && bond.OneBasedEndPosition == OneBasedEndPosition
+             && bond.Description == Description;
+     }
+
+     /// <summary>Hash combining both positions and the description.</summary>
+     public override int GetHashCode()
+     {
+         // The constructor replaces a null description with "", but the public setter can still
+         // assign null afterwards; coalesce so hashing never throws.
+         return OneBasedBeginPosition
+             ^ OneBasedEndPosition
+             ^ (Description ?? "").GetHashCode();
+     }
+ }
+}
\ No newline at end of file
diff --git a/mzLib/MassSpectrometry/Proteomics/Protein/Protein.cs b/mzLib/MassSpectrometry/Proteomics/Protein/Protein.cs
new file mode 100644
index 000000000..7ec98d9e4
--- /dev/null
+++ b/mzLib/MassSpectrometry/Proteomics/Protein/Protein.cs
@@ -0,0 +1,832 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text.RegularExpressions;
+using Proteomics.Fragmentation;
+using Proteomics;
+using Proteomics.ProteolyticDigestion;
+
+namespace Proteomics
+{
+ public class Protein
+ {
+ private List _proteolysisProducts;
+
+ /// <summary>
+ /// Protein. Filters out modifications that do not match their amino acid target site.
+ /// </summary>
+ /// <param name="sequence">Base sequence of the protein.</param>
+ /// <param name="accession">Unique accession for the protein.</param>
+ /// <param name="organism">Organism with this protein.</param>
+ /// <param name="geneNames">List of gene names as tuple of (nameType, name), e.g. (primary, HLA-A)</param>
+ /// <param name="oneBasedModifications">Modifications at positions along the sequence. Kept as given in OriginalNonVariantModifications; the result of SelectValidOneBaseMods is exposed as OneBasedPossibleLocalizedModifications.</param>
+ /// <param name="proteolysisProducts">Stored as given; an empty list when null.</param>
+ /// <param name="name">Stored in Name.</param>
+ /// <param name="fullName">Stored in FullName.</param>
+ /// <param name="isDecoy">Stored in IsDecoy.</param>
+ /// <param name="isContaminant">Stored in IsContaminant.</param>
+ /// <param name="databaseReferences">Stored as given; an empty list when null.</param>
+ /// <param name="sequenceVariations">Stored as given; an empty list when null.</param>
+ /// <param name="appliedSequenceVariations">Stored as given; an empty list when null.</param>
+ /// <param name="sampleNameForVariants">Stored in SampleNameForVariants.</param>
+ /// <param name="disulfideBonds">Stored as given; an empty list when null.</param>
+ /// <param name="spliceSites">Stored as given; an empty list when null.</param>
+ /// <param name="databaseFilePath">Stored in DatabaseFilePath.</param>
+ /// <param name="addTruncations">When true, AddTruncations() is called at the end of construction.</param>
+ public Protein(string sequence, string accession, string organism = null, List> geneNames = null,
+ IDictionary> oneBasedModifications = null, List proteolysisProducts = null,
+ string name = null, string fullName = null, bool isDecoy = false, bool isContaminant = false, List databaseReferences = null,
+ List sequenceVariations = null, List appliedSequenceVariations = null, string sampleNameForVariants = null,
+ List disulfideBonds = null, List spliceSites = null, string databaseFilePath = null, bool addTruncations = false)
+ {
+ // Mandatory
+ BaseSequence = sequence;
+ NonVariantProtein = this; // a protein built this way is its own non-variant form
+ Accession = accession;
+
+ Name = name;
+ Organism = organism;
+ FullName = fullName;
+ IsDecoy = isDecoy;
+ IsContaminant = isContaminant;
+ DatabaseFilePath = databaseFilePath;
+ SampleNameForVariants = sampleNameForVariants;
+
+ // collections default to empty so that none of them is ever null
+ GeneNames = geneNames ?? new List>();
+ _proteolysisProducts = proteolysisProducts ?? new List();
+ SequenceVariations = sequenceVariations ?? new List();
+ AppliedSequenceVariations = appliedSequenceVariations ?? new List();
+ OriginalNonVariantModifications = oneBasedModifications ?? new Dictionary>();
+ if (oneBasedModifications != null)
+ {
+ OneBasedPossibleLocalizedModifications = SelectValidOneBaseMods(oneBasedModifications);
+ }
+ else
+ {
+ OneBasedPossibleLocalizedModifications = new Dictionary>();
+ }
+ DatabaseReferences = databaseReferences ?? new List();
+ DisulfideBonds = disulfideBonds ?? new List();
+ SpliceSites = spliceSites ?? new List();
+
+ if (addTruncations)
+ {
+ this.AddTruncations();
+ }
+ }
+
+ /// <summary>
+ /// Protein construction that clones a protein but assigns a different base sequence.
+ /// For use in SILAC experiments.
+ /// </summary>
+ /// <param name="originalProtein">Protein whose metadata is copied. Collections are shared by reference, not cloned, and Probability is not copied.</param>
+ /// <param name="silacSequence">Base sequence assigned to the new protein.</param>
+ public Protein(Protein originalProtein, string silacSequence)
+ {
+     BaseSequence = silacSequence;
+     Accession = originalProtein.Accession;
+     NonVariantProtein = originalProtein.NonVariantProtein;
+     Name = originalProtein.Name;
+     Organism = originalProtein.Organism;
+     FullName = originalProtein.FullName;
+     IsDecoy = originalProtein.IsDecoy;
+     IsContaminant = originalProtein.IsContaminant;
+     // assigned once; the original assigned DatabaseFilePath a second time at the end of the constructor
+     DatabaseFilePath = originalProtein.DatabaseFilePath;
+     SampleNameForVariants = originalProtein.SampleNameForVariants;
+     GeneNames = originalProtein.GeneNames;
+     _proteolysisProducts = originalProtein._proteolysisProducts;
+     SequenceVariations = originalProtein.SequenceVariations;
+     AppliedSequenceVariations = originalProtein.AppliedSequenceVariations;
+     OriginalNonVariantModifications = originalProtein.OriginalNonVariantModifications;
+     OneBasedPossibleLocalizedModifications = originalProtein.OneBasedPossibleLocalizedModifications;
+     DatabaseReferences = originalProtein.DatabaseReferences;
+     DisulfideBonds = originalProtein.DisulfideBonds;
+     SpliceSites = originalProtein.SpliceSites;
+ }
+
+ /// <summary>
+ /// Protein construction with applied variations. Delegates to the main constructor with copies of the
+ /// source protein's collections, then overrides the non-variant protein, the original modifications,
+ /// the applied variations and the sample name.
+ /// </summary>
+ /// <param name="variantBaseSequence">Base sequence of the variant protein.</param>
+ /// <param name="protein">Source protein whose metadata and collections are copied.</param>
+ /// <param name="appliedSequenceVariations">Variations applied to the sequence; an empty list is stored when null.</param>
+ /// <param name="applicableProteolysisProducts">Proteolysis products for the variant; treated as empty when null.</param>
+ /// <param name="oneBasedModifications">Modifications for the variant; copied into a new dictionary, or empty when null.</param>
+ /// <param name="sampleNameForVariants">Stored in SampleNameForVariants.</param>
+ public Protein(string variantBaseSequence, Protein protein, IEnumerable appliedSequenceVariations,
+ IEnumerable applicableProteolysisProducts, IDictionary> oneBasedModifications, string sampleNameForVariants)
+ : this(variantBaseSequence,
+ VariantApplication.GetAccession(protein, appliedSequenceVariations),
+ organism: protein.Organism,
+ geneNames: new List>(protein.GeneNames),
+ oneBasedModifications: oneBasedModifications != null ? oneBasedModifications.ToDictionary(x => x.Key, x => x.Value) : new Dictionary>(),
+ proteolysisProducts: new List(applicableProteolysisProducts ?? new List()),
+ name: GetName(appliedSequenceVariations, protein.Name),
+ fullName: GetName(appliedSequenceVariations, protein.FullName),
+ isDecoy: protein.IsDecoy,
+ isContaminant: protein.IsContaminant,
+ databaseReferences: new List(protein.DatabaseReferences),
+ sequenceVariations: new List(protein.SequenceVariations),
+ disulfideBonds: new List(protein.DisulfideBonds),
+ spliceSites: new List(protein.SpliceSites),
+ databaseFilePath: protein.DatabaseFilePath)
+ {
+ // the main constructor set NonVariantProtein to this and OriginalNonVariantModifications to the variant's mods; point both back at the source
+ NonVariantProtein = protein.NonVariantProtein;
+ OriginalNonVariantModifications = NonVariantProtein.OriginalNonVariantModifications;
+ AppliedSequenceVariations = (appliedSequenceVariations ?? new List()).ToList();
+ SampleNameForVariants = sampleNameForVariants;
+ }
+
+ ///
+ /// Modifications (values) located at one-based protein positions (keys)
+ ///
+ public IDictionary> OneBasedPossibleLocalizedModifications { get; private set; }
+
+ ///
+ /// The list of gene names consists of tuples, where Item1 is the type of gene name, and Item2 is the name. There may be many genes and names of a certain type produced when reading an XML protein database.
+ ///
+ public IEnumerable> GeneNames { get; }
+
+ ///
+ /// Unique accession for this protein.
+ ///
+ public string Accession { get; }
+
+ ///
+ /// Base sequence, which may contain applied sequence variations.
+ ///
+ public string BaseSequence { get; }
+
+ public string Organism { get; }
+ public bool IsDecoy { get; }
+ public IEnumerable SequenceVariations { get; }
+ public IEnumerable DisulfideBonds { get; }
+ public IEnumerable SpliceSites { get; }
+
+ //TODO: Generate all the proteolytic products as distinct proteins during XML reading and delete the ProteolysisProducts parameter
+ public IEnumerable ProteolysisProducts
+ { get { return _proteolysisProducts; } }
+
+ public IEnumerable DatabaseReferences { get; }
+ public string DatabaseFilePath { get; }
+
+ ///
+ /// Protein before applying variations.
+ ///
+ public Protein NonVariantProtein { get; }
+
+ ///
+ /// Sequence variations that have been applied to the base sequence.
+ ///
+ public List AppliedSequenceVariations { get; }
+
+ ///
+ /// Sample name from which applied variants came, e.g. tumor or normal.
+ ///
+ public string SampleNameForVariants { get; }
+
+ public double Probability { get; set; } // for protein pep project
+
+ /// <summary>
+ /// Number of characters in the base sequence.
+ /// </summary>
+ public int Length => BaseSequence.Length;
+
+ /// <summary>
+ /// Accession, name and full name joined with '|'.
+ /// </summary>
+ public string FullDescription => Accession + "|" + Name + "|" + FullName;
+
+ public string Name { get; }
+ public string FullName { get; }
+ public bool IsContaminant { get; }
+ internal IDictionary> OriginalNonVariantModifications { get; set; }
+
+ /// <summary>
+ /// Character of the base sequence at the given zero-based index.
+ /// </summary>
+ public char this[int zeroBasedIndex] => BaseSequence[zeroBasedIndex];
+
+ /// <summary>
+ /// Formats a string for a UniProt fasta header. See https://www.uniprot.org/help/fasta-headers.
+ /// Note that the db field isn't very applicable here, so mz is placed in to denote written by mzLib.
+ /// </summary>
+ /// <returns>The header text; the GN field uses Item2 of the first gene name, or is empty when there is none.</returns>
+ public string GetUniProtFastaHeader()
+ {
+     string geneName = "";
+     var firstGene = GeneNames.FirstOrDefault();
+     if (firstGene != null)
+     {
+         geneName = firstGene.Item2;
+     }
+
+     return string.Format("mz|{0}|{1} {2} OS={3} GN={4}", Accession, Name, FullName, Organism, geneName);
+ }
+
+ /// <summary>
+ /// Formats a string for an ensembl header: the accession, a space, then the full name.
+ /// </summary>
+ public string GetEnsemblFastaHeader() => string.Format("{0} {1}", Accession, FullName);
+
+ /// <summary>
+ /// Gets peptides for digestion of a protein: digests the sequence, optionally keeps only glycopeptides,
+ /// expands each peptide into its modified forms, optionally drops redundant terminal modifications,
+ /// and optionally adds SILAC-labeled peptides.
+ /// TODO: Refactor to employ yield returns
+ /// </summary>
+ /// <remarks>
+ /// NOTE(review): when the protease has a CleavageMod that is not yet in <paramref name="allKnownFixedModifications"/>,
+ /// it is added to the list instance the caller passed in, so the caller's list is modified.
+ /// </remarks>
+ public IEnumerable Digest(DigestionParams digestionParams, List allKnownFixedModifications,
+ List variableModifications, List silacLabels = null,
+ (SilacLabel startLabel, SilacLabel endLabel)? turnoverLabels = null, bool topDownTruncationSearch = false)
+ {
+ //can't be null
+ allKnownFixedModifications = allKnownFixedModifications ?? new List();
+ // add in any modifications that are caused by protease digestion
+ if (digestionParams.Protease.CleavageMod != null && !allKnownFixedModifications.Contains(digestionParams.Protease.CleavageMod))
+ {
+ allKnownFixedModifications.Add(digestionParams.Protease.CleavageMod);
+ }
+ variableModifications = variableModifications ?? new List();
+ CleavageSpecificity searchModeType = digestionParams.SearchModeType;
+
+ // semi-specific searches use the dedicated semi-specific digestion; everything else uses the regular one
+ ProteinDigestion digestion = new(digestionParams, allKnownFixedModifications, variableModifications);
+ IEnumerable unmodifiedPeptides =
+ searchModeType == CleavageSpecificity.Semi ?
+ digestion.SpeedySemiSpecificDigestion(this) :
+ digestion.Digestion(this, topDownTruncationSearch);
+
+ if (digestionParams.KeepNGlycopeptide || digestionParams.KeepOGlycopeptide)
+ {
+ unmodifiedPeptides = GetGlycoPeptides(unmodifiedPeptides, digestionParams.KeepNGlycopeptide, digestionParams.KeepOGlycopeptide);
+ }
+
+ IEnumerable modifiedPeptides = unmodifiedPeptides.SelectMany(peptide => peptide.GetModifiedPeptides(allKnownFixedModifications, digestionParams, variableModifications));
+
+ //Remove terminal modifications (if needed)
+ if (searchModeType == CleavageSpecificity.SingleN ||
+ searchModeType == CleavageSpecificity.SingleC ||
+ (searchModeType == CleavageSpecificity.None && (digestionParams.FragmentationTerminus == FragmentationTerminus.N || digestionParams.FragmentationTerminus == FragmentationTerminus.C)))
+ {
+ modifiedPeptides = RemoveTerminalModifications(modifiedPeptides, digestionParams.FragmentationTerminus, allKnownFixedModifications);
+ }
+
+ //add silac labels (if needed)
+ if (silacLabels != null)
+ {
+ return GetSilacPeptides(modifiedPeptides, silacLabels, digestionParams.GeneratehUnlabeledProteinsForSilac, turnoverLabels);
+ }
+
+ return modifiedPeptides;
+ }
+
+ /// <summary>
+ /// Remove terminal modifications from the C-terminus of SingleN peptides and the N-terminus of SingleC peptides.
+ /// These terminal modifications create redundant entries and increase search time.
+ /// </summary>
+ /// <remarks>
+ /// Implemented as an iterator (yield return), so peptides are filtered as the result is enumerated.
+ /// A peptide is dropped when it carries a modification whose location restriction contains the
+ /// looked-for terminal string and that modification is not one of the fixed terminal modifications.
+ /// </remarks>
+ internal static IEnumerable RemoveTerminalModifications(IEnumerable modifiedPeptides, FragmentationTerminus fragmentationTerminus, IEnumerable allFixedMods)
+ {
+ // for terminus N the C-terminal mods are the redundant ones; every other terminus value looks for N-terminal mods
+ string terminalStringToLookFor = fragmentationTerminus == FragmentationTerminus.N ? "C-terminal" : "N-terminal";
+ List fixedTerminalMods = allFixedMods.Where(x => x.LocationRestriction.Contains(terminalStringToLookFor)).ToList();
+ foreach (PeptideWithSetModifications pwsm in modifiedPeptides)
+ {
+ if (!pwsm.AllModsOneIsNterminus.Values.Any(x => x.LocationRestriction.Contains(terminalStringToLookFor) && !fixedTerminalMods.Contains(x)))
+ {
+ yield return pwsm;
+ }
+ }
+ }
+
+ /// <summary>
+ /// Add additional peptides with SILAC amino acids. Without turnoverLabels, yields the (optional) unlabeled peptides plus one fully labeled copy per label; with turnoverLabels, yields the start-state peptides plus every partially/fully relabeled combination.
+ /// </summary>
+ internal IEnumerable GetSilacPeptides(IEnumerable originalPeptides, List silacLabels, bool generateUnlabeledProteins, (SilacLabel startLabel, SilacLabel endLabel)? turnoverLabels)
+ {
+ //if this is a multiplex experiment (pooling multiple samples, not a turnover), then only create the fully unlabeled/labeled peptides
+ if (turnoverLabels == null)
+ {
+ //unlabeled peptides
+ if (generateUnlabeledProteins)
+ {
+ foreach (PeptideWithSetModifications pwsm in originalPeptides)
+ {
+ yield return pwsm;
+ }
+ }
+
+ //fully labeled peptides
+ foreach (SilacLabel label in silacLabels)
+ {
+ Protein silacProtein = GenerateFullyLabeledSilacProtein(label);
+ foreach (PeptideWithSetModifications pwsm in originalPeptides) //NOTE: originalPeptides is enumerated again for every label
+ {
+ //duplicate the peptides with the updated protein sequence that contains only silac labels
+ yield return new PeptideWithSetModifications(silacProtein, pwsm.DigestionParams, pwsm.OneBasedStartResidueInProtein, pwsm.OneBasedEndResidueInProtein, pwsm.CleavageSpecificityForFdrCategory, pwsm.PeptideDescription, pwsm.MissedCleavages, pwsm.AllModsOneIsNterminus, pwsm.NumFixedMods);
+ }
+ }
+ }
+ else //if this is a turnover experiment, we want to be able to look for peptides containing mixtures of heavy and light amino acids (typically occurs for missed cleavages)
+ {
+ (SilacLabel startLabel, SilacLabel endLabel) turnoverLabelsValue = turnoverLabels.Value;
+ SilacLabel startLabel = turnoverLabelsValue.startLabel;
+ SilacLabel endLabel = turnoverLabelsValue.endLabel;
+
+ //This allows you to move from one label to another (rather than unlabeled->labeled or labeled->unlabeled). Useful for when your lab is swimming in cash and you have stock in a SILAC company
+ if (startLabel != null && endLabel != null) //if neither the start nor end conditions are unlabeled, then generate fully labeled proteins using the "startLabel" (otherwise maintain the unlabeled)
+ {
+ Protein silacStartProtein = GenerateFullyLabeledSilacProtein(startLabel);
+ PeptideWithSetModifications[] originalPeptideArray = originalPeptides.ToArray(); //materialize so the entries can be replaced in place
+ for (int i = 0; i < originalPeptideArray.Length; i++)
+ {
+ PeptideWithSetModifications pwsm = originalPeptideArray[i];
+ //duplicate the peptides with the updated protein sequence that contains only silac labels
+ originalPeptideArray[i] = new PeptideWithSetModifications(silacStartProtein, pwsm.DigestionParams, pwsm.OneBasedStartResidueInProtein, pwsm.OneBasedEndResidueInProtein, pwsm.CleavageSpecificityForFdrCategory, pwsm.PeptideDescription, pwsm.MissedCleavages, pwsm.AllModsOneIsNterminus, pwsm.NumFixedMods);
+ }
+ originalPeptides = originalPeptideArray; //from here on, the "original" peptides are the start-labeled ones
+
+ //modify the end label amino acids to recognize the new "original" amino acid
+ //get the residues that were changed
+ List originalLabels = new List { startLabel };
+ if (startLabel.AdditionalLabels != null)
+ {
+ originalLabels.AddRange(startLabel.AdditionalLabels);
+ }
+ SilacLabel startLabelWithSharedOriginalAminoAcid = originalLabels.Where(x => x.OriginalAminoAcid == endLabel.OriginalAminoAcid).FirstOrDefault();
+ SilacLabel updatedEndLabel = startLabelWithSharedOriginalAminoAcid == null ?
+ endLabel :
+ new SilacLabel(startLabelWithSharedOriginalAminoAcid.AminoAcidLabel, endLabel.AminoAcidLabel, endLabel.LabelChemicalFormula, endLabel.ConvertMassDifferenceToDouble()); //end label re-expressed relative to the start-labeled residue
+ if (endLabel.AdditionalLabels != null)
+ {
+ foreach (SilacLabel additionalLabel in endLabel.AdditionalLabels)
+ {
+ startLabelWithSharedOriginalAminoAcid = originalLabels.Where(x => x.OriginalAminoAcid == additionalLabel.OriginalAminoAcid).FirstOrDefault();
+ updatedEndLabel.AddAdditionalSilacLabel(
+ startLabelWithSharedOriginalAminoAcid == null ?
+ additionalLabel :
+ new SilacLabel(startLabelWithSharedOriginalAminoAcid.AminoAcidLabel, additionalLabel.AminoAcidLabel, additionalLabel.LabelChemicalFormula, additionalLabel.ConvertMassDifferenceToDouble()));
+ }
+ }
+
+ //double check that all labeled amino acids can become unlabeled/relabeled
+ if (startLabel.AdditionalLabels != null)
+ {
+ foreach (SilacLabel originalLabel in originalLabels)
+ {
+ if (updatedEndLabel.OriginalAminoAcid != originalLabel.AminoAcidLabel &&
+ (updatedEndLabel.AdditionalLabels == null || !updatedEndLabel.AdditionalLabels.Any(x => x.OriginalAminoAcid == originalLabel.AminoAcidLabel)))
+ {
+ updatedEndLabel.AddAdditionalSilacLabel(new SilacLabel(originalLabel.AminoAcidLabel, originalLabel.OriginalAminoAcid, originalLabel.LabelChemicalFormula, originalLabel.ConvertMassDifferenceToDouble())); //reverse label: start-labeled residue back to its original residue
+ }
+ }
+ }
+ endLabel = updatedEndLabel;
+ }
+
+ //add all unlabeled (or if no unlabeled, then the startLabeled) peptides
+ foreach (PeptideWithSetModifications pwsm in originalPeptides)
+ {
+ yield return pwsm;
+ }
+
+ //the order (below) matters when neither labels are null, because the fully labeled "start" has already been created above, so we want to use the end label here if it's not unlabeled (null)
+ SilacLabel label = endLabel ?? startLabel; //pick the labeled (not the unlabeled). If no unlabeled, take the endLabel
+
+ Protein silacEndProtein = GenerateFullyLabeledSilacProtein(label);
+
+ //add all peptides containing any label (may also contain unlabeled)
+ if (label.AdditionalLabels == null) //if there's only one (which is common)
+ {
+ //get the residues to change
+ char originalResidue = label.OriginalAminoAcid;
+ char labeledResidue = label.AminoAcidLabel;
+
+ //label peptides
+ foreach (PeptideWithSetModifications pwsm in originalPeptides)
+ {
+ //find the indexes in the base sequence for labeling
+ char[] baseSequenceArray = pwsm.BaseSequence.ToArray();
+ List indexesOfResiduesToBeLabeled = new List();
+ for (int c = 0; c < baseSequenceArray.Length; c++)
+ {
+ if (baseSequenceArray[c] == originalResidue)
+ {
+ indexesOfResiduesToBeLabeled.Add(c);
+ }
+ }
+ //if there's something to label
+ if (indexesOfResiduesToBeLabeled.Count != 0)
+ {
+ List pwsmsForCombinatorics = new List { pwsm }; //seeded with the start-state peptide; grows as each site is processed
+ for (int a = 0; a < indexesOfResiduesToBeLabeled.Count; a++)
+ {
+ List localPwsmsForCombinatorics = new List();
+ foreach (PeptideWithSetModifications pwsmCombination in pwsmsForCombinatorics)
+ {
+ char[] combinatoricBaseSequenceArray = pwsmCombination.BaseSequence.ToArray();
+ combinatoricBaseSequenceArray[indexesOfResiduesToBeLabeled[a]] = labeledResidue;
+ string updatedBaseSequence = string.Concat(combinatoricBaseSequenceArray);
+
+ PeptideWithSetModifications labeledPwsm = new PeptideWithSetModifications(silacEndProtein, pwsm.DigestionParams,
+ pwsm.OneBasedStartResidueInProtein, pwsm.OneBasedEndResidueInProtein, pwsm.CleavageSpecificityForFdrCategory,
+ pwsm.PeptideDescription, pwsm.MissedCleavages, pwsm.AllModsOneIsNterminus, pwsm.NumFixedMods, updatedBaseSequence);
+ yield return labeledPwsm; //return
+ localPwsmsForCombinatorics.Add(labeledPwsm); //add so it can be used again
+ }
+ pwsmsForCombinatorics.AddRange(localPwsmsForCombinatorics); //appended after the inner loop so the list is not modified while it is enumerated
+ }
+ }
+ }
+ }
+ else //if there are more than one (i.e. K and R are labeled)
+ {
+ //get the residues to change
+ char[] originalResidues = new char[label.AdditionalLabels.Count + 1];
+ char[] labeledResidues = new char[label.AdditionalLabels.Count + 1];
+ originalResidues[0] = label.OriginalAminoAcid;
+ labeledResidues[0] = label.AminoAcidLabel;
+ for (int i = 0; i < label.AdditionalLabels.Count; i++)
+ {
+ originalResidues[i + 1] = label.AdditionalLabels[i].OriginalAminoAcid;
+ labeledResidues[i + 1] = label.AdditionalLabels[i].AminoAcidLabel;
+ }
+
+ //label peptides
+ foreach (PeptideWithSetModifications pwsm in originalPeptides)
+ {
+ //find the indexes in the base sequence for labeling
+ char[] baseSequenceArray = pwsm.BaseSequence.ToArray();
+ Dictionary indexesOfResiduesToBeLabeled = new Dictionary(); //peptide residue index -> labeled residue to substitute there
+ for (int peptideResidueIndex = 0; peptideResidueIndex < baseSequenceArray.Length; peptideResidueIndex++)
+ {
+ for (int silacResidue = 0; silacResidue < originalResidues.Length; silacResidue++)
+ {
+ if (baseSequenceArray[peptideResidueIndex] == originalResidues[silacResidue])
+ {
+ indexesOfResiduesToBeLabeled.Add(peptideResidueIndex, labeledResidues[silacResidue]); //NOTE(review): Add would throw if two labels shared an original residue - presumably never the case; confirm
+ }
+ }
+ }
+ //if there's something to label
+ if (indexesOfResiduesToBeLabeled.Count != 0)
+ {
+ List pwsmsForCombinatorics = new List { pwsm };
+ foreach (KeyValuePair kvp in indexesOfResiduesToBeLabeled)
+ {
+ List localPwsmsForCombinatorics = new List();
+ foreach (PeptideWithSetModifications pwsmCombination in pwsmsForCombinatorics)
+ {
+ char[] combinatoricBaseSequenceArray = pwsmCombination.BaseSequence.ToArray();
+ combinatoricBaseSequenceArray[kvp.Key] = kvp.Value;
+ string updatedBaseSequence = string.Concat(combinatoricBaseSequenceArray);
+
+ PeptideWithSetModifications labeledPwsm = new PeptideWithSetModifications(silacEndProtein, pwsm.DigestionParams,
+ pwsm.OneBasedStartResidueInProtein, pwsm.OneBasedEndResidueInProtein, pwsm.CleavageSpecificityForFdrCategory,
+ pwsm.PeptideDescription, pwsm.MissedCleavages, pwsm.AllModsOneIsNterminus, pwsm.NumFixedMods, updatedBaseSequence);
+ yield return labeledPwsm; //return
+ localPwsmsForCombinatorics.Add(labeledPwsm); //add so it can be used again
+ }
+ pwsmsForCombinatorics.AddRange(localPwsmsForCombinatorics);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ /// <summary>
+ /// Only keep glycopeptides by filtering the NGlycopeptide motif 'NxS || NxT' or OGlycopeptide motif 'S || T'. A peptide matching both filters is yielded only once.
+ /// </summary>
+ internal IEnumerable GetGlycoPeptides(IEnumerable originalPeptides, bool keepNGlycopeptide, bool keepOGlycopeptide)
+ {
+ Regex rgx = new Regex("N[A-Z][ST]"); // N, then any uppercase letter, then S or T
+ foreach (ProteolyticPeptide pwsm in originalPeptides)
+ {
+ bool yielded = false; // set once the N-glyco filter has returned this peptide
+ if (keepNGlycopeptide)
+ {
+ if (rgx.IsMatch(pwsm.BaseSequence))
+ {
+ yielded = true;
+ yield return pwsm;
+ }
+ }
+
+ if (keepOGlycopeptide && !yielded) // O-glyco filter only considers peptides not already returned above
+ {
+ if (pwsm.BaseSequence.Contains('S') || pwsm.BaseSequence.Contains('T'))
+ {
+ yield return pwsm;
+ }
+ }
+ }
+ }
+
+ /// <summary>
+ /// Builds a copy of this protein in which every residue targeted by the given SILAC label
+ /// (and by each of its additional labels, if any) is replaced with the labeled residue.
+ /// </summary>
+ private Protein GenerateFullyLabeledSilacProtein(SilacLabel label)
+ {
+     // primary label first, then the additional labels in their listed order
+     string labeledSequence = BaseSequence.Replace(label.OriginalAminoAcid, label.AminoAcidLabel);
+     if (label.AdditionalLabels != null) // more than one residue labeled per sample (e.g. both R and K)
+     {
+         labeledSequence = label.AdditionalLabels.Aggregate(
+             labeledSequence,
+             (sequence, extraLabel) => sequence.Replace(extraLabel.OriginalAminoAcid, extraLabel.AminoAcidLabel));
+     }
+     return new Protein(this, labeledSequence);
+ }
+
+ /// <summary>
+ /// Gets proteins with applied variants from this protein by forwarding SequenceVariations and both limits to VariantApplication.ApplyVariants
+ /// </summary>
+ public List GetVariantProteins(int maxAllowedVariantsForCombinitorics = 4, int minAlleleDepth = 1)
+ {
+ return VariantApplication.ApplyVariants(this, SequenceVariations, maxAllowedVariantsForCombinitorics, minAlleleDepth);
+ }
+
+ /// <summary>
+ /// Puts back every modification that was read in, including those that did not match their target amino acid,
+ /// by pointing OneBasedPossibleLocalizedModifications at OriginalNonVariantModifications.
+ /// </summary>
+ public void RestoreUnfilteredModifications() => OneBasedPossibleLocalizedModifications = OriginalNonVariantModifications;
+
+ /// <summary>
+ /// Filters modifications that do not match their target amino acid.
+ /// </summary>
+ /// <param name="dict">Candidate modifications per key; each key is handed to ModificationLocalization.ModFits as the location to test</param>
+ /// <returns>A new dictionary holding, per key, only the modifications that are valid and fit BaseSequence; keys with no surviving modification are omitted</returns>
+ private IDictionary> SelectValidOneBaseMods(IDictionary> dict)
+ {
+ Dictionary> validModDictionary = new Dictionary>();
+ foreach (KeyValuePair> entry in dict)
+ {
+ List validMods = new List();
+ foreach (Modification m in entry.Value)
+ {
+ //mod must be valid mod and the motif of the mod must be present in the protein at the specified location
+ if (m.ValidModification && ModificationLocalization.ModFits(m, BaseSequence, 0, BaseSequence.Length, entry.Key))
+ {
+ validMods.Add(m);
+ }
+ }
+
+ if (validMods.Any()) // positions left without any valid mod get no entry
+ {
+ if (validModDictionary.Keys.Contains(entry.Key)) // merge if the key was already recorded
+ {
+ validModDictionary[entry.Key].AddRange(validMods);
+ }
+ else
+ {
+ validModDictionary.Add(entry.Key, validMods);
+ }
+ }
+ }
+ return validModDictionary;
+ }
+ /// <summary>
+ /// Protein XML files contain annotated proteolysis products for many proteins (e.g. signal peptides, chain peptides).
+ /// This method adds N- and/or C-terminal truncations of one such product (given by its one-based begin/end) to the proteolysis product list.
+ /// When the product starts at residue 1 and the protein begins with "M", the N-terminal truncations reach one residue deeper,
+ /// and the C-terminal truncations are generated both without and with that leading methionine.
+ /// </summary>
+ public void AddTruncationsToExistingProteolysisProducts(int fullProteinOneBasedBegin, int fullProteinOneBasedEnd, bool addNterminalDigestionTruncations, bool addCterminalDigestionTruncations, int minProductBaseSequenceLength, int lengthOfProteolysis, string proteolyisisProductName)
+ {
+     // product does not include the protein N-terminus: plain truncations, C-terminal ones first
+     if (fullProteinOneBasedBegin != 1)
+     {
+         if (addCterminalDigestionTruncations)
+         {
+             AddCterminalTruncations(lengthOfProteolysis, fullProteinOneBasedEnd, fullProteinOneBasedBegin, minProductBaseSequenceLength, proteolyisisProductName);
+         }
+         if (addNterminalDigestionTruncations)
+         {
+             AddNterminalTruncations(lengthOfProteolysis, fullProteinOneBasedBegin, fullProteinOneBasedEnd, minProductBaseSequenceLength, proteolyisisProductName);
+         }
+         return;
+     }
+
+     // product starts at the protein N-terminus: N-terminal truncations first
+     if (addNterminalDigestionTruncations)
+     {
+         // one extra residue of truncation when the sequence starts with methionine
+         int nTerminalDepth = BaseSequence.Substring(0, 1) == "M" ? lengthOfProteolysis + 1 : lengthOfProteolysis;
+         AddNterminalTruncations(nTerminalDepth, fullProteinOneBasedBegin, fullProteinOneBasedEnd, minProductBaseSequenceLength, proteolyisisProductName);
+     }
+
+     // C-terminal truncations are not affected by the variable N-terminus depth
+     if (addCterminalDigestionTruncations)
+     {
+         if (BaseSequence.Substring(0, 1) == "M")
+         {
+             // sequences WITHOUT the leading methionine
+             AddCterminalTruncations(lengthOfProteolysis, fullProteinOneBasedEnd, fullProteinOneBasedBegin + 1, minProductBaseSequenceLength, proteolyisisProductName);
+         }
+         // sequences starting at the original begin position
+         AddCterminalTruncations(lengthOfProteolysis, fullProteinOneBasedEnd, fullProteinOneBasedBegin, minProductBaseSequenceLength, proteolyisisProductName);
+     }
+ }
+ /// <summary>
+ /// Appends to _proteolysisProducts one product for each of 1..lengthOfProteolysis residues removed from the C-terminus,
+ /// skipping truncations shorter than minProductBaseSequenceLength. Nothing is returned.
+ /// </summary>
+ private void AddCterminalTruncations(int lengthOfProteolysis, int fullProteinOneBasedEnd, int fullProteinOneBasedBegin, int minProductBaseSequenceLength, string proteolyisisProductName)
+ {
+     for (int residuesRemoved = 1; residuesRemoved <= lengthOfProteolysis; residuesRemoved++)
+     {
+         int truncatedEnd = fullProteinOneBasedEnd - residuesRemoved;
+         int truncatedLength = truncatedEnd - fullProteinOneBasedBegin + 1;
+         if (truncatedLength < minProductBaseSequenceLength)
+         {
+             continue; // too short to keep
+         }
+         _proteolysisProducts.Add(new ProteolysisProduct(fullProteinOneBasedBegin, truncatedEnd, proteolyisisProductName));
+     }
+ }
+ /// <summary>
+ /// Appends to _proteolysisProducts one product for each of 1..lengthOfProteolysis residues removed from the N-terminus,
+ /// skipping truncations shorter than minProductBaseSequenceLength. Nothing is returned.
+ /// </summary>
+ private void AddNterminalTruncations(int lengthOfProteolysis, int fullProteinOneBasedBegin, int fullProteinOneBasedEnd, int minProductBaseSequenceLength, string proteolyisisProductName)
+ {
+     for (int residuesRemoved = 1; residuesRemoved <= lengthOfProteolysis; residuesRemoved++)
+     {
+         int truncatedBegin = fullProteinOneBasedBegin + residuesRemoved;
+         int truncatedLength = fullProteinOneBasedEnd - truncatedBegin + 1;
+         if (truncatedLength < minProductBaseSequenceLength)
+         {
+             continue; // too short to keep
+         }
+         _proteolysisProducts.Add(new ProteolysisProduct(truncatedBegin, fullProteinOneBasedEnd, proteolyisisProductName));
+     }
+ }
+
+ /// <summary>
+ /// Main entry point for adding sequences in a top-down truncation search.
+ /// Every base sequence to be searched ends up in the proteolysis products list, including the intact protein.
+ /// IT DOES NOT INCLUDE ANY DOUBLY (BOTH ENDS) DIGESTED PRODUCTS.
+ /// Products annotated in protein.xml files are already in that list; this method adds their terminal truncations
+ /// and finishes by calling CleaveOnceBetweenProteolysisProducts with its default minimum length.
+ /// </summary>
+ /// <param name="addFullProtein">add the intact proteoform (and its truncations) to the list to be searched</param>
+ /// <param name="addForEachOrigninalProteolysisProduct">add truncations for each originally annotated product that has both a begin and an end position</param>
+ /// <param name="addNterminalDigestionTruncations">generate N-terminal truncations</param>
+ /// <param name="addCterminalDigestionTruncations">generate C-terminal truncations</param>
+ /// <param name="minProductBaseSequenceLength">the same as the min detectable peptide</param>
+ /// <param name="lengthOfProteolysis">the number of amino acids that can be removed from either end.</param>
+ public void AddTruncations(bool addFullProtein = true, bool addForEachOrigninalProteolysisProduct = true, bool addNterminalDigestionTruncations = true, bool addCterminalDigestionTruncations = true, int minProductBaseSequenceLength = 7, int lengthOfProteolysis = 5)
+ {
+     // N-terminal truncations first, then C-terminal ones, each only when requested
+     void TruncateSpan(int oneBasedBegin, int oneBasedEnd, string nTerminalName, string cTerminalName)
+     {
+         if (addNterminalDigestionTruncations)
+         {
+             AddTruncationsToExistingProteolysisProducts(oneBasedBegin, oneBasedEnd, true, false, minProductBaseSequenceLength, lengthOfProteolysis, nTerminalName);
+         }
+         if (addCterminalDigestionTruncations)
+         {
+             AddTruncationsToExistingProteolysisProducts(oneBasedBegin, oneBasedEnd, false, true, minProductBaseSequenceLength, lengthOfProteolysis, cTerminalName);
+         }
+     }
+
+     if (addFullProtein) // intact proteoform plus its terminal truncations
+     {
+         AddIntactProteoformToTruncationsProducts(minProductBaseSequenceLength);
+         TruncateSpan(1, BaseSequence.Length,
+             "full-length proteoform N-terminal digestion truncation",
+             "full-length proteoform C-terminal digestion truncation");
+     }
+
+     if (addForEachOrigninalProteolysisProduct) // this does not include the original intact proteoform
+     {
+         // snapshot taken up front because the loop below adds to the product list
+         var annotatedProducts = ProteolysisProducts
+             .Where(p => !(p.Type.Contains("truncation") || p.Type.Contains("full-length proteoform")))
+             .ToList();
+         foreach (ProteolysisProduct product in annotatedProducts)
+         {
+             if (!product.OneBasedBeginPosition.HasValue || !product.OneBasedEndPosition.HasValue)
+             {
+                 continue;
+             }
+
+             // the original product is already on the list, so only its truncations are added
+             string truncationName = string.IsNullOrEmpty(product.Type)
+                 ? "truncation"
+                 : product.Type + " " + "truncation";
+             TruncateSpan(product.OneBasedBeginPosition.Value, product.OneBasedEndPosition.Value, truncationName, truncationName);
+         }
+     }
+
+     CleaveOnceBetweenProteolysisProducts();
+ }
+ /// <summary>
+ /// Adds the intact, full-length proteoform (residues 1 through BaseSequence.Length) to the proteolysis products,
+ /// provided the sequence is at least minProductBaseSequenceLength long.
+ /// </summary>
+ public void AddIntactProteoformToTruncationsProducts(int minProductBaseSequenceLength)
+ {
+     if (BaseSequence.Length < minProductBaseSequenceLength)
+     {
+         return; // too short to be searched
+     }
+     _proteolysisProducts.Add(new ProteolysisProduct(1, BaseSequence.Length, "full-length proteoform"));
+ }
+
+ /// <summary>
+ /// Proteins with multiple proteolysis products are not always fully cleaved; products with missed cleavages are observed.
+ /// For every annotated product whose end is immediately followed by the start of another annotated product,
+ /// this method adds the N-terminal portion (1..end) and the C-terminal portion (end+1..sequence end) of the protein,
+ /// unless a product with the same begin/end positions is already present.
+ /// </summary>
+ /// <param name="minimumProductLength">a portion is added only when (its length - 1) is at least this value</param>
+ public void CleaveOnceBetweenProteolysisProducts(int minimumProductLength = 7)
+ {
+     // only originally annotated products define cleavage sites
+     var annotatedProducts = _proteolysisProducts
+         .Where(p => !(p.Type.Contains("truncation") || p.Type.Contains("full-length proteoform")))
+         .ToList();
+
+     // end positions directly adjacent to the begin position of another annotated product
+     var cleavageSites = annotatedProducts
+         .Where(p => p.OneBasedEndPosition.HasValue)
+         .Select(p => p.OneBasedEndPosition.Value)
+         .Where(end => annotatedProducts.Any(p => p.OneBasedBeginPosition == (end + 1)))
+         .ToList();
+
+     // adds the candidate unless a product with the same begin/end positions already exists
+     void AddUnlessSpanPresent(ProteolysisProduct candidate)
+     {
+         bool spanPresent = _proteolysisProducts.Any(p =>
+             p.OneBasedBeginPosition == candidate.OneBasedBeginPosition &&
+             p.OneBasedEndPosition == candidate.OneBasedEndPosition);
+         if (!spanPresent)
+         {
+             _proteolysisProducts.Add(candidate);
+         }
+     }
+
+     foreach (int site in cleavageSites)
+     {
+         if (site - 1 >= minimumProductLength)
+         {
+             AddUnlessSpanPresent(new ProteolysisProduct(1, site, $"N-terminal Portion of Singly Cleaved Protein(1-{site})"));
+         }
+
+         if (BaseSequence.Length - site - 1 >= minimumProductLength)
+         {
+             AddUnlessSpanPresent(new ProteolysisProduct(site + 1, BaseSequence.Length, $"C-terminal Portion of Singly Cleaved Protein({site + 1}-{BaseSequence.Length})"));
+         }
+     }
+ }
+
+ private static string GetName(IEnumerable appliedVariations, string name)
+ { // Builds a display name: the given name followed by a " variant:..." tag when variations were applied
+ bool emptyVars = appliedVariations == null || appliedVariations.Count() == 0; // null and empty are treated alike
+ if (name == null && emptyVars)
+ {
+ return null; // nothing to describe at all
+ }
+ else
+ {
+ string variantTag = emptyVars ? "" : $" variant:{VariantApplication.CombineDescriptions(appliedVariations)}";
+ return name + variantTag; // a null name concatenates as an empty string
+ }
+ }
+
+ /// <summary>
+ /// Orders proteins by accession, which permits sorting of proteins.
+ /// </summary>
+ /// <param name="other">The protein to compare against; may be null.</param>
+ /// <returns>A positive value when other is null; otherwise the result of comparing the two accessions.</returns>
+ public int CompareTo(Protein other)
+ {
+     // .NET comparison convention: any instance sorts after null (previously this threw a NullReferenceException)
+     if (other is null)
+     {
+         return 1;
+     }
+     return Accession.CompareTo(other.Accession);
+ }
+
+ /// <summary>
+ /// Two proteins are equal when both their accession and their base sequence match.
+ /// Not sure if we require any additional fields for equality.
+ /// </summary>
+ /// <param name="obj">Object to compare with; null or a non-Protein object yields false.</param>
+ public override bool Equals(object obj)
+ {
+     // type test instead of a hard cast: comparing against a non-Protein must return false, not throw InvalidCastException
+     return obj is Protein otherProtein
+         && otherProtein.Accession.Equals(Accession)
+         && otherProtein.BaseSequence.Equals(BaseSequence);
+ }
+
+ /// <summary>
+ /// Hash code derived solely from the base sequence, so two protein objects with the same base sequence
+ /// produce the same hash code regardless of their other fields.
+ /// </summary>
+ /// <returns>The hash code of BaseSequence.</returns>
+ public override int GetHashCode() => BaseSequence.GetHashCode();
+
+ /// <summary>
+ /// A protein is displayed as its accession.
+ /// </summary>
+ public override string ToString() => Accession.ToString();
+ }
+}
\ No newline at end of file
diff --git a/mzLib/MassSpectrometry/Proteomics/Protein/ProteoformLevelClassifier.cs b/mzLib/MassSpectrometry/Proteomics/Protein/ProteoformLevelClassifier.cs
new file mode 100644
index 000000000..e89956e8c
--- /dev/null
+++ b/mzLib/MassSpectrometry/Proteomics/Protein/ProteoformLevelClassifier.cs
@@ -0,0 +1,240 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using Proteomics.ProteolyticDigestion;
+
+namespace Proteomics
+{
+ public static class ProteoformLevelClassifier
+ {
+
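+ /// <summary>
+ /// All input strings are delimited with "|".
+ /// PTMs are annotated with [].
+ /// </summary>
+ /// <param name="fullSequenceString">All possible sequences (with modifications) for this PrSM</param>
+ /// <param name="geneString">All possible genes for this PrSM</param>
+ /// <returns></returns>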
+ public static string ClassifyPrSM(string fullSequenceString, string geneString)
+ {
+ //separate delimited input
+ string[] sequences = fullSequenceString.Split('|');
+ string[] genes = geneString.Split('|');
+
+
+ //determine sequence ambiguity
+ string firstBaseSequence = PeptideWithSetModifications.GetBaseSequenceFromFullSequence(sequences[0]).ToUpper(); //get first sequence with modifications removed
+ bool sequenceIdentified = !SequenceContainsUnknownAminoAcids(firstBaseSequence); //check if there are any ambiguous amino acids (i.e. B, J, X, Z)
+ //for every other sequence reported
+ if (sequenceIdentified) //if there weren't any unknown amino acids reported.
+ {
+ for (int i = 1; i < sequences.Length; i++)
+ {
+ //if the unmodified sequences don't match, then there's sequence ambiguity
+ if (!firstBaseSequence.Equals(PeptideWithSetModifications.GetBaseSequenceFromFullSequence(sequences[i]).ToUpper()))
+ {
+ sequenceIdentified = false;
+ break;
+ }
+ }
+ }
+
+
+ //determine PTM localization and identification
+ List<(int index, string ptm)> firstPTMsSortedByIndex = GetPTMs(sequences[0]); //get ptms from the first sequence reported
+ List firstPTMsSortedByPTM = firstPTMsSortedByIndex.Select(x => x.ptm).OrderBy(x => x).ToList(); //sort ptms alphabetically
+ //check if there are unknown mass shifts
+ bool ptmsIdentified = !PtmsContainUnknownMassShifts(firstPTMsSortedByPTM);
+ bool ptmsLocalized = true; //assume these are localized unless we determine otherwise
+ //for every other sequence reported
+ for (int seqIndex = 1; seqIndex < sequences.Length; seqIndex++)
+ {
+ List<(int index, string ptm)> currentPTMsSortedByIndex = GetPTMs(sequences[seqIndex]); //get ptms from this sequence
+ List currentPTMsSortedByPTM = currentPTMsSortedByIndex.Select(x => x.ptm).OrderBy(x => x).ToList(); //sort ptms alphabetically
+
+ //are number of PTMs the same?
+ if (firstPTMsSortedByIndex.Count == currentPTMsSortedByIndex.Count)
+ {
+ //check localization (are indexes conserved?)
+ for (int i = 0; i < firstPTMsSortedByIndex.Count; i++)
+ {
+ if (firstPTMsSortedByIndex[i].index != currentPTMsSortedByIndex[i].index)
+ {
+ ptmsLocalized = false;
+ break;
+ }
+ }
+ //check PTM identification
+ for (int i = 0; i < firstPTMsSortedByPTM.Count; i++)
+ {
+ if (!firstPTMsSortedByPTM[i].Equals(currentPTMsSortedByPTM[i]))
+ {
+ ptmsIdentified = false;
+ break;
+ }
+ }
+ }
+ else
+ {
+ ptmsIdentified = false;
+ ptmsLocalized = false;
+ }
+ }
+ //handle an edge case where two PTMs are identified and localized to two residues, but it's unclear which PTM is localized to which residue.
+ if (ptmsIdentified && ptmsLocalized)
+ {
+ for (int seqIndex = 1; seqIndex < sequences.Length; seqIndex++)
+ {
+ List<(int index, string ptm)> currentPTMsSortedByIndex = GetPTMs(sequences[seqIndex]); //get ptms from this sequence
+ //check that the mods are in the same position
+ for(int ptmIndex =0; ptmIndex
+ /// <summary>
+ /// Determine proteoform level between 1 (know everything) and 5 (only know the mass)
+ /// as defined in the publication:
+ /// Smith, L.M., Thomas, P.M., Shortreed, M.R. et al. A five-level classification system for proteoform identifications. Nat Methods 16, 939–940 (2019). https://doi.org/10.1038/s41592-019-0573-x
+ /// </summary>
+ /// <param name="ptmLocalized">Is the PTM localized?</param>
+ /// <param name="ptmIdentified">Do we know what the PTM is, or is it ambiguous (or an unknown mass shift?)</param>
+ /// <param name="sequenceIdentified">Do we know the proteoform sequence, or is it ambiguous?</param>
+ /// <param name="geneIdentified">Do we know which gene produced this proteoform?</param>
+ /// <returns>"1", "3", "4" or "5", or one of "2A".."2D" when exactly one criterion is unmet</returns>
+ public static string GetProteoformClassification(bool ptmLocalized, bool ptmIdentified, bool sequenceIdentified, bool geneIdentified)
+ {
+     bool[] criteria = { ptmLocalized, ptmIdentified, sequenceIdentified, geneIdentified };
+     int unmetCount = criteria.Count(known => !known);
+
+     if (unmetCount != 1)
+     {
+         // level is one more than the number of unmet criteria
+         return (unmetCount + 1).ToString();
+     }
+
+     // level 2: the letter identifies which single criterion is unmet, in the order listed above
+     switch (Array.IndexOf(criteria, false))
+     {
+         case 0:
+             return "2A";
+         case 1:
+             return "2B";
+         case 2:
+             return "2C";
+         default: // gene not identified
+             return "2D";
+     }
+ }
+
+ /// <summary>
+ /// Provided with an unmodified sequence, return if it contains ambiguous amino acids such as:
+ /// B: Aspartic acid or Asparagine
+ /// J: Leucine or Isoleucine
+ /// X: Any amino acid
+ /// Z: Glutamic acid or Glutamine
+ /// </summary>
+ /// <param name="baseSequence">Sequence without modifications; the check is case-sensitive (uppercase letters only)</param>
+ /// <returns>true when at least one of B, J, X or Z occurs in the sequence</returns>
+ private static bool SequenceContainsUnknownAminoAcids(string baseSequence)
+ {
+     return baseSequence.IndexOfAny(new[] { 'B', 'J', 'X', 'Z' }) >= 0;
+ }
+
+ /// <summary>
+ /// Given a proteoform sequence (contains ptms), returns a list of all ptms and their one based index in order from N-terminus to C-terminus.
+ /// A PTM preceding the first residue gets index 0. For nested brackets, inner '[' characters are kept in the PTM text while inner ']' characters are not.
+ /// </summary>
+ /// <param name="fullSequence">Sequence with PTMs annotated in square brackets</param>
+ /// <returns>(index, ptm text) pairs in order of appearance</returns>
+ private static List<(int, string)> GetPTMs(string fullSequence)
+ {
+     var foundPtms = new List<(int, string)>();
+     var ptmText = new StringBuilder();
+     int residuePosition = 0;
+     int bracketDepth = 0; // how many '[' are currently open
+
+     for (int i = 0; i < fullSequence.Length; i++)
+     {
+         char symbol = fullSequence[i];
+
+         if (symbol == ']')
+         {
+             bracketDepth--;
+             if (bracketDepth != 0)
+             {
+                 continue; // still inside an outer PTM
+             }
+             // the opening '[' was counted as a position, so step back before recording
+             residuePosition--;
+             foundPtms.Add((residuePosition, ptmText.ToString()));
+             ptmText.Clear();
+             continue;
+         }
+
+         if (bracketDepth > 0)
+         {
+             ptmText.Append(symbol); // part of the PTM annotation (including nested '[')
+         }
+         else
+         {
+             residuePosition++; // a residue, or the '[' that opens a PTM (undone when the PTM closes)
+         }
+
+         if (symbol == '[')
+         {
+             bracketDepth++;
+         }
+     }
+
+     return foundPtms;
+ }
+
+ /// <summary>
+ /// See if any of the reported PTMs are mass shifts (e.g. [+15.99] or [-17.99]).
+ /// A PTM counts as a mass shift when the text after its first character parses as a number.
+ /// Parsing uses the invariant culture so that "15.99" is recognized regardless of the machine's regional settings.
+ /// Note: a literal "?" annotation is not detected by this check.
+ /// </summary>
+ /// <param name="ptms">PTM annotations without the surrounding brackets</param>
+ /// <returns>true when at least one PTM is a numeric mass shift</returns>
+ private static bool PtmsContainUnknownMassShifts(List<string> ptms)
+ {
+     foreach (string ptm in ptms)
+     {
+         if (ptm.Length <= 1) // need a sign character plus at least one digit
+         {
+             continue;
+         }
+
+         // drop the leading sign and try to parse the remainder; same number styles as the default double.TryParse
+         if (double.TryParse(
+                 ptm.Substring(1),
+                 System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
+                 System.Globalization.CultureInfo.InvariantCulture,
+                 out _))
+         {
+             return true;
+         }
+     }
+     return false;
+ }
+ }
+}
diff --git a/mzLib/MassSpectrometry/Proteomics/Protein/ProteolysisProduct.cs b/mzLib/MassSpectrometry/Proteomics/Protein/ProteolysisProduct.cs
new file mode 100644
index 000000000..81eaebdc3
--- /dev/null
+++ b/mzLib/MassSpectrometry/Proteomics/Protein/ProteolysisProduct.cs
@@ -0,0 +1,32 @@
+ namespace Proteomics
+ {
+     /// <summary>
+     /// A span of a protein sequence (one-based, either bound may be unknown) together with a free-text type label.
+     /// Equality and hashing consider the begin position, the end position and the type.
+     /// </summary>
+     public class ProteolysisProduct
+     {
+         /// <param name="oneBasedBeginPosition">one-based first residue, or null when unknown</param>
+         /// <param name="oneBasedEndPosition">one-based last residue, or null when unknown</param>
+         /// <param name="type">label of the product; null is stored as an empty string</param>
+         public ProteolysisProduct(int? oneBasedBeginPosition, int? oneBasedEndPosition, string type)
+         {
+             Type = type ?? "";
+             OneBasedBeginPosition = oneBasedBeginPosition;
+             OneBasedEndPosition = oneBasedEndPosition;
+         }
+
+         public int? OneBasedBeginPosition { get; }
+         public int? OneBasedEndPosition { get; }
+         public string Type { get; }
+
+         public override bool Equals(object obj)
+         {
+             if (!(obj is ProteolysisProduct other))
+             {
+                 return false;
+             }
+
+             // Type is never null (see constructor), so a plain ordinal comparison suffices
+             return other.OneBasedBeginPosition == OneBasedBeginPosition
+                 && other.OneBasedEndPosition == OneBasedEndPosition
+                 && string.Equals(other.Type, Type);
+         }
+
+         public override int GetHashCode()
+         {
+             // unknown positions hash as 0; Type is non-null thanks to the constructor
+             int beginHash = OneBasedBeginPosition.GetValueOrDefault().GetHashCode();
+             int endHash = OneBasedEndPosition.GetValueOrDefault().GetHashCode();
+             return beginHash ^ endHash ^ Type.GetHashCode();
+         }
+     }
+ }
\ No newline at end of file
diff --git a/mzLib/MassSpectrometry/Proteomics/Protein/SequenceVariantDescription.cs b/mzLib/MassSpectrometry/Proteomics/Protein/SequenceVariantDescription.cs
new file mode 100644
index 000000000..d19493dd8
--- /dev/null
+++ b/mzLib/MassSpectrometry/Proteomics/Protein/SequenceVariantDescription.cs
@@ -0,0 +1,97 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+
+namespace Proteomics
+{
+ /// <summary>
+ /// Wraps the free-text description of a sequence variant and, when that text has at least ten
+ /// "\t"-separated fields, parses it as a VCF-style record (alleles, annotation, format, genotypes).
+ /// </summary>
+ public class SequenceVariantDescription
+ {
+     /// <summary>
+     /// Stores <paramref name="description"/> and attempts to parse it.
+     /// Nothing beyond <see cref="Description"/> is populated when the description is null or has
+     /// fewer than ten fields.
+     /// </summary>
+     /// <param name="description">Original description text; may be null</param>
+     /// <exception cref="ArgumentException">A genotype column does not have the same number of ':'-separated entries as the format column</exception>
+     public SequenceVariantDescription(string description)
+     {
+         Description = description;
+         if (description == null)
+         {
+             return;
+         }
+
+         // Parse description into fields.
+         // The separator is a verbatim string, i.e. the two characters backslash and 't', not a tab character.
+         string[] vcfFields = description.Split(new[] { @"\t" }, StringSplitOptions.None);
+         if (vcfFields.Length < 10) { return; }
+         ReferenceAlleleString = vcfFields[3];
+         AlternateAlleleString = vcfFields[4];
+         Info = new SnpEffAnnotation(vcfFields[7]);
+         AlleleIndex = Info.Allele == null ? -1 : AlternateAlleleString.Split(',').ToList().IndexOf(Info.Allele) + 1; // reference is zero
+         Format = vcfFields[8];
+
+         // Every field from index 9 onwards is one genotype column.
+         string[] genotypes = vcfFields.Skip(9).ToArray();
+
+         // loop through genotypes for this variant (e.g. tumor and normal)
+         for (int individual = 0; individual < genotypes.Length; individual++)
+         {
+             Dictionary<string, string> genotypeFields = GenotypeDictionary(Format.Trim(), genotypes[individual].Trim());
+
+             // parse genotype; columns without a GT entry are skipped entirely
+             if (!genotypeFields.TryGetValue("GT", out string gtString))
+             {
+                 continue;
+             }
+             string[] gt = gtString.Split('/');
+
+             // parse allele depth (might be null, technically, but shouldn't be in most use cases)
+             string[] ad = genotypeFields.TryGetValue("AD", out string adString) ? adString.Split(',') : null;
+
+             // The dictionaries are keyed by the zero-based index of the genotype column, as a string.
+             string key = individual.ToString();
+             int distinctAlleleCount = gt.Distinct().Count();
+             Genotypes.Add(key, gt);
+             AlleleDepths.Add(key, ad);
+             Homozygous.Add(key, distinctAlleleCount == 1);
+             Heterozygous.Add(key, distinctAlleleCount > 1);
+         }
+     }
+
+     public string Description { get; }
+     public string ReferenceAlleleString { get; }
+     public string AlternateAlleleString { get; }
+     public SnpEffAnnotation Info { get; }
+     public string Format { get; }
+     public Dictionary<string, bool> Homozygous { get; } = new Dictionary<string, bool>();
+     public Dictionary<string, bool> Heterozygous { get; } = new Dictionary<string, bool>();
+     public Dictionary<string, string[]> Genotypes { get; } = new Dictionary<string, string[]>();
+     public Dictionary<string, string[]> AlleleDepths { get; } = new Dictionary<string, string[]>();
+     public int AlleleIndex { get; }
+
+     /// <summary>
+     /// Returns original string for the description
+     /// </summary>
+     /// <returns>The text passed to the constructor (may be null)</returns>
+     public override string ToString()
+     {
+         return Description;
+     }
+
+     /// <summary>
+     /// Two descriptions are equal when their original description strings are equal.
+     /// </summary>
+     public override bool Equals(object obj)
+     {
+         SequenceVariantDescription s = obj as SequenceVariantDescription;
+         return s != null && s.Description == Description;
+     }
+
+     /// <summary>
+     /// Hash of the description string; a null description hashes like the empty string.
+     /// </summary>
+     public override int GetHashCode()
+     {
+         return (Description ?? "").GetHashCode();
+     }
+
+     /// <summary>
+     /// Gets a dictionary of the format (key) and fields (value) for a genotype
+     /// </summary>
+     /// <param name="format">':'-separated format keys</param>
+     /// <param name="genotype">':'-separated genotype values, positionally matching <paramref name="format"/></param>
+     /// <returns>Dictionary mapping each format key to the genotype value at the same position</returns>
+     /// <exception cref="ArgumentException">The two strings split into a different number of entries</exception>
+     internal static Dictionary<string, string> GenotypeDictionary(string format, string genotype)
+     {
+         string[] formatSplit = format.Split(':');
+         string[] genotypeSplit = genotype.Split(':');
+         if (formatSplit.Length != genotypeSplit.Length)
+         {
+             throw new ArgumentException("Genotype format: " + format + " and genotype: " + genotype + " do not match -- they're not the same length");
+         }
+         return Enumerable.Range(0, formatSplit.Length).ToDictionary(x => formatSplit[x], x => genotypeSplit[x]);
+     }
+ }
+}
\ No newline at end of file
diff --git a/mzLib/MassSpectrometry/Proteomics/Protein/SequenceVariation.cs b/mzLib/MassSpectrometry/Proteomics/Protein/SequenceVariation.cs
new file mode 100644
index 000000000..84642db46
--- /dev/null
+++ b/mzLib/MassSpectrometry/Proteomics/Protein/SequenceVariation.cs
@@ -0,0 +1,162 @@
+using System.Collections.Generic;
+using System.Linq;
+using Proteomics;
+
+namespace Proteomics
+{
+ /// <summary>
+ /// A sequence variation: the span OneBasedBeginPosition..OneBasedEndPosition of an original sequence
+ /// that is replaced by VariantSequence, with an optional description and variant-specific modifications.
+ /// </summary>
+ public class SequenceVariation
+ {
+ /// <summary>
+ /// For longer sequence variations, where a range of sequence is replaced. Point mutations should be specified with the same begin and end positions.
+ /// </summary>
+ /// <param name="oneBasedBeginPosition">Beginning position of the original sequence to be replaced</param>
+ /// <param name="oneBasedEndPosition">End position of the original sequence to be replaced</param>
+ /// <param name="originalSequence">Original sequence; null is stored as the empty string</param>
+ /// <param name="variantSequence">Variant sequence; null is stored as the empty string</param>
+ /// <param name="description">Description text, wrapped in a SequenceVariantDescription (which accepts null)</param>
+ /// <param name="oneBasedModifications">Modifications for this variant; null is replaced by an empty dictionary</param>
+ // NOTE(review): the generic type arguments of the Dictionary below appear to have been lost in this listing -- confirm against the real source.
+ public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, string originalSequence, string variantSequence, string description, Dictionary> oneBasedModifications = null)
+ {
+ OneBasedBeginPosition = oneBasedBeginPosition;
+ OneBasedEndPosition = oneBasedEndPosition;
+ OriginalSequence = originalSequence ?? "";
+ VariantSequence = variantSequence ?? "";
+ Description = new SequenceVariantDescription(description);
+ OneBasedModifications = oneBasedModifications ?? new Dictionary>();
+ }
+
+ /// <summary>
+ /// For variations with only position information (not begin and end).
+ /// The end position is computed as oneBasedPosition + originalSequence.Length - 1,
+ /// or is oneBasedPosition itself when originalSequence is null.
+ /// </summary>
+ /// <param name="oneBasedPosition">Beginning position of the original sequence to be replaced</param>
+ /// <param name="originalSequence">Original sequence; its length determines the end position</param>
+ /// <param name="variantSequence">Variant sequence</param>
+ /// <param name="description">Description text</param>
+ /// <param name="oneBasedModifications">Modifications for this variant; may be null</param>
+ public SequenceVariation(int oneBasedPosition, string originalSequence, string variantSequence, string description, Dictionary> oneBasedModifications = null)
+ : this(oneBasedPosition, originalSequence == null ? oneBasedPosition : oneBasedPosition + originalSequence.Length - 1, originalSequence, variantSequence, description, oneBasedModifications)
+ { }
+
+ /// <summary>
+ /// Beginning position of original sequence to be replaced
+ /// </summary>
+ public int OneBasedBeginPosition { get; }
+
+ /// <summary>
+ /// End position of original sequence to be replaced
+ /// </summary>
+ public int OneBasedEndPosition { get; }
+
+ /// <summary>
+ /// Original sequence information (optional)
+ /// </summary>
+ public string OriginalSequence { get; }
+
+ /// <summary>
+ /// Variant sequence information (required)
+ /// </summary>
+ public string VariantSequence { get; }
+
+ /// <summary>
+ /// Description of this variation (optional)
+ /// </summary>
+ public SequenceVariantDescription Description { get; }
+
+ /// <summary>
+ /// Modifications specifically for this variant
+ /// </summary>
+ public Dictionary> OneBasedModifications { get; }
+
+ /// <summary>
+ /// Value equality over positions, sequences, description and modifications.
+ /// </summary>
+ /// <param name="obj">Object to compare with; anything that is not a SequenceVariation is unequal</param>
+ /// <returns>True when every compared member matches</returns>
+ public override bool Equals(object obj)
+ {
+ SequenceVariation s = obj as SequenceVariation;
+ // Modifications are compared as two sequences in enumeration order: first the keys,
+ // then all values flattened with SelectMany.
+ return s != null
+ && OneBasedBeginPosition == s.OneBasedBeginPosition
+ && OneBasedEndPosition == s.OneBasedEndPosition
+ && (s.OriginalSequence == null && OriginalSequence == null || OriginalSequence.Equals(s.OriginalSequence))
+ && (s.VariantSequence == null && VariantSequence == null || VariantSequence.Equals(s.VariantSequence))
+ && (s.Description == null && Description == null || Description.Equals(s.Description))
+ && (s.OneBasedModifications == null && OneBasedModifications == null ||
+ s.OneBasedModifications.Keys.ToList().SequenceEqual(OneBasedModifications.Keys.ToList())
+ && s.OneBasedModifications.Values.SelectMany(m => m).ToList().SequenceEqual(OneBasedModifications.Values.SelectMany(m => m).ToList()));
+ }
+
+ /// <summary>
+ /// XOR of the hashes of the positions, both sequences and the description.
+ /// OneBasedModifications does not contribute to the hash.
+ /// </summary>
+ public override int GetHashCode()
+ {
+ return OneBasedBeginPosition.GetHashCode()
+ ^ OneBasedEndPosition.GetHashCode()
+ ^ OriginalSequence.GetHashCode() // null handled in constructor
+ ^ VariantSequence.GetHashCode() // null handled in constructor
+ ^ Description.GetHashCode(); // always constructed in constructor
+ }
+
+ /// <summary>
+ /// Returns a simple string representation of this amino acid change
+ /// </summary>
+ /// <returns>OriginalSequence, then OneBasedBeginPosition, then VariantSequence, concatenated</returns>
+ public string SimpleString()
+ {
+ return OriginalSequence + OneBasedBeginPosition.ToString() + VariantSequence;
+ }
+
+ /// <summary>
+ /// Determines whether this interval overlaps the queried interval
+ /// </summary>
+ /// <param name="segment">Variation whose begin/end interval is tested (both ends inclusive)</param>
+ /// <returns>True when the two intervals share at least one position</returns>
+ internal bool Intersects(SequenceVariation segment)
+ {
+ return segment.OneBasedEndPosition >= OneBasedBeginPosition && segment.OneBasedBeginPosition <= OneBasedEndPosition;
+ }
+
+ /// <summary>
+ /// Determines whether this interval overlaps the queried interval
+ /// </summary>
+ /// <param name="segment">Proteolysis product whose begin/end interval is tested</param>
+ /// <returns>True when the two intervals share at least one position</returns>
+ internal bool Intersects(ProteolysisProduct segment)
+ {
+ // The product's positions are nullable ints; a comparison against a null position evaluates to false.
+ return segment.OneBasedEndPosition >= OneBasedBeginPosition && segment.OneBasedBeginPosition <= OneBasedEndPosition;
+ }
+
+ /// <summary>
+ /// Determines whether this interval overlaps the queried position
+ /// </summary>
+ /// <param name="pos">Position to test</param>
+ /// <returns>True when pos lies between the begin and end positions, inclusive</returns>
+ internal bool Intersects(int pos)
+ {
+ return OneBasedBeginPosition <= pos && pos <= OneBasedEndPosition;
+ }
+
+ /// <summary>
+ /// Determines whether this interval includes the queried interval
+ /// </summary>
+ /// <param name="segment">Variation whose begin/end interval is tested</param>
+ /// <returns>True when the queried interval lies entirely within this one</returns>
+ internal bool Includes(SequenceVariation segment)
+ {
+ return OneBasedBeginPosition <= segment.OneBasedBeginPosition && OneBasedEndPosition >= segment.OneBasedEndPosition;
+ }
+
+ /// <summary>
+ /// Determines whether this interval includes the queried interval
+ /// </summary>
+ /// <param name="segment">Proteolysis product whose begin/end interval is tested</param>
+ /// <returns>True when the queried interval lies entirely within this one</returns>
+ internal bool Includes(ProteolysisProduct segment)
+ {
+ // The product's positions are nullable ints; a comparison against a null position evaluates to false.
+ return OneBasedBeginPosition <= segment.OneBasedBeginPosition && OneBasedEndPosition >= segment.OneBasedEndPosition;
+ }
+
+ /// <summary>
+ /// Determines whether this interval includes the queried position
+ /// (same test as Intersects(int))
+ /// </summary>
+ /// <param name="pos">Position to test</param>
+ /// <returns>True when pos lies between the begin and end positions, inclusive</returns>
+ internal bool Includes(int pos)
+ {
+ return OneBasedBeginPosition <= pos && pos <= OneBasedEndPosition;
+ }
+ }
+}
\ No newline at end of file
diff --git a/mzLib/MassSpectrometry/Proteomics/Protein/SnpEffAnnotation.cs b/mzLib/MassSpectrometry/Proteomics/Protein/SnpEffAnnotation.cs
new file mode 100644
index 000000000..62330a9c3
--- /dev/null
+++ b/mzLib/MassSpectrometry/Proteomics/Protein/SnpEffAnnotation.cs
@@ -0,0 +1,236 @@
+using System.Collections.Generic;
+using System.Linq;
+using System.Text.RegularExpressions;
+
+namespace Proteomics
+{
+ ///
+ /// Specifications are described here: http://snpeff.sourceforge.net/VCFannotationformat_v1.0.pdf
+ ///
+ public class SnpEffAnnotation
+ {
+     private static readonly Regex HGVSProteinRegex = new Regex(@"(p\.)([A-Z][a-z][a-z])(\d+)([A-Z][a-z][a-z])");
+
+     /// <summary>
+     /// Original SnpEff annotation string.
+     /// </summary>
+     public string Annotation { get; }
+
+     public string Allele { get; }
+     public string[] Effects { get; }
+     public string PutativeImpact { get; }
+     public string GeneName { get; }
+     public string GeneID { get; }
+
+     /// <summary>
+     /// It looks like these are sometimes domains, like the ones annotated in UniProt,
+     /// Otherwise, this tends to just be "transcript"
+     ///
+     /// Some examples:
+     /// sequence_feature: can be initiator-methionine:Removed ... maybe not too helpful for proteomics, since this is assumed
+     /// sequence_feature: helix:combinatorial_evidence_used_in_manual_assertion
+     /// sequence_feature: nucleotide-phosphate-binding-region:ATP
+     /// sequence_feature: domain:EGF-like_2
+     /// sequence_feature: transmembrane-region:Transmembrane_region
+     /// sequence_feature: topological-domain:Extracellular
+     /// sequence_feature: modified-residue:phosphoserine
+     /// </summary>
+     public string FeatureType { get; }
+
+     /// <summary>
+     /// Always seems to be the transcriptID
+     /// </summary>
+     public string FeatureID { get; }
+
+     public string TranscriptBiotype { get; }
+     public int ExonIntronRank { get; }
+     public int ExonIntronTotal { get; }
+     public string HGVSNotationDnaLevel { get; } // kind of bad for ins and del because they notation aligns to most 3' coordinate, rather than leftmost
+     public string HGVSNotationProteinLevel { get; }
+     public int OneBasedTranscriptCDNAPosition { get; }
+     public int TranscriptCDNALength { get; }
+     public int OneBasedCodingDomainSequencePosition { get; }
+     public int CodingDomainSequenceLengthIncludingStopCodon { get; }
+     public int OneBasedProteinPosition { get; }
+     public int ProteinLength { get; }
+
+     /// <summary>
+     /// up/downstream: distance to first / last codon
+     /// intergenic: distance to closest gene
+     /// exonic: distance to closest intron boundary (+ is upstream, - is downstream)
+     /// intronic: distance to closest exon boundary (+ is upstream, - is downstream)
+     /// motif: distance to first base in MOTIF
+     /// miRNA: distance to first base in miRNA
+     /// splice_site: distance to exon-intron boundary
+     /// splice_region: distance to exon-intron boundary
+     /// chip seq peak: distance to summit or peak center
+     /// histone mark/state: distance to summit or peak center
+     /// </summary>
+     public int DistanceToFeature { get; }
+
+     public string[] Warnings { get; }
+
+     // NOTE(review): the next three properties are never assigned by the constructor below,
+     // so they keep their default values.
+     public int AminoAcidLocation { get; }
+     public char ReferenceAminoAcid { get; }
+     public char AlternateAminoAcid { get; }
+     public bool Missense { get; }
+     public bool Synonymous { get; }
+     public bool FrameshiftVariant { get; }
+     public bool BadTranscript { get; }
+
+     /// <summary>
+     /// Parses an annotation string.
+     /// A string that does not start with "ANN=" or "EFF=" is stored unchanged in <see cref="Annotation"/>
+     /// and nothing else is populated. Otherwise the four-character prefix is removed, the remainder is
+     /// split on '|', and fields 0 through 15 are read.
+     /// </summary>
+     /// <param name="annotation">Annotation text; must not be null</param>
+     /// <exception cref="System.IndexOutOfRangeException">The annotation has fewer than 16 '|'-separated fields</exception>
+     public SnpEffAnnotation(string annotation)
+     {
+         // Ordinal comparison: the prefixes are fixed ASCII markers, not linguistic text.
+         bool isSnpEffAnnotation = annotation.StartsWith("ANN=", System.StringComparison.Ordinal)
+             || annotation.StartsWith("EFF=", System.StringComparison.Ordinal);
+         Annotation = isSnpEffAnnotation ? annotation.Substring(4) : annotation;
+         if (!isSnpEffAnnotation)
+         {
+             return;
+         }
+         string[] a = Annotation.Split('|');
+         Allele = a[0];
+         Effects = a[1].Split('&');
+         PutativeImpact = a[2];
+         GeneName = a[3];
+         GeneID = a[4];
+         FeatureType = a[5];
+         FeatureID = a[6];
+         TranscriptBiotype = a[7];
+
+         // Fields 8 and 11-13 hold two numbers separated by '/'; parts that are missing or not numeric become 0.
+         ExonIntronRank = ParseSlashSeparatedInt(a[8], 0);
+         ExonIntronTotal = ParseSlashSeparatedInt(a[8], 1);
+         HGVSNotationDnaLevel = a[9];
+         HGVSNotationProteinLevel = a[10];
+         OneBasedTranscriptCDNAPosition = ParseSlashSeparatedInt(a[11], 0);
+         TranscriptCDNALength = ParseSlashSeparatedInt(a[11], 1);
+         OneBasedCodingDomainSequencePosition = ParseSlashSeparatedInt(a[12], 0);
+         CodingDomainSequenceLengthIncludingStopCodon = ParseSlashSeparatedInt(a[12], 1);
+         OneBasedProteinPosition = ParseSlashSeparatedInt(a[13], 0);
+         ProteinLength = ParseSlashSeparatedInt(a[13], 1);
+         DistanceToFeature = int.TryParse(a[14], out int distance) ? distance : 0;
+         Warnings = a[15].Split('&');
+
+         Missense = Effects.Contains("missense_variant");
+         Synonymous = !Effects.Any(eff => NonSynonymousVariations.Contains(eff));
+         FrameshiftVariant = Effects.Contains("frameshift_variant");
+         BadTranscript = Warnings.Any(w => BadTranscriptWarnings.Contains(w));
+     }
+
+     /// <summary>
+     /// Splits <paramref name="field"/> on '/' and parses the part at <paramref name="index"/> as an integer.
+     /// Returns 0 when that part does not exist or is not a valid integer.
+     /// </summary>
+     private static int ParseSlashSeparatedInt(string field, int index)
+     {
+         string[] parts = field.Split('/');
+         return index < parts.Length && int.TryParse(parts[index], out int value) ? value : 0;
+     }
+
+     // The effect/warning tables below are constant lookup data, so they are shared (static readonly)
+     // instead of being re-allocated for every annotation instance.
+
+     private static readonly string[] HighPutativeImpactEffects = new string[]
+     {
+         "chromosome_number_variation", // rare...
+         "exon_loss_variant", //
+         "frameshift_variant",
+         "rare_amino_acid_variant",
+         "splice_acceptor_variant", // often with intron_variant, sometimes with splice_donor_variant
+         "splice_donor_variant", // often with intron_variant, sometimes with splice_acceptor_variant
+         "start_lost",
+         "stop_gained",
+         "stop_lost",
+         "transcript_ablation",
+     };
+
+     private static readonly string[] ModeratePutativeImpactEffects = new string[]
+     {
+         "3_prime_UTR_truncation", "exon_loss", // appear together
+         "5_prime_UTR_truncation", "exon_loss_variant", // appear together
+         "coding_sequence_variant", // not seen much? Probably because missense is used more often.
+         "conservative_inframe_insertion",
+         "conservative_inframe_deletion",
+         "disruptive_inframe_deletion",
+         "disruptive_inframe_insertion",
+         "inframe_deletion", // not common, in favor of more specific terms above
+         "inframe_insertion", // not common, in favor of more specific terms above
+         "missense_variant",
+         "regulatory_region_ablation", // not common?
+         "splice_region_variant", // often combined with intron_variant and non_coding_transcript_exon_variant
+         "TFBS_ablation", // not common?
+     };
+
+     // Effects that make a variant count as non-synonymous (see Synonymous in the constructor).
+     private static readonly string[] NonSynonymousVariations = new string[]
+     {
+         "exon_loss_variant",
+         "frameshift_variant",
+         "rare_amino_acid_variant",
+         "start_lost",
+         "stop_gained",
+         "stop_lost",
+         "conservative_inframe_insertion",
+         "conservative_inframe_deletion",
+         "disruptive_inframe_deletion",
+         "disruptive_inframe_insertion",
+         "inframe_deletion", // not common, in favor of more specific terms above
+         "inframe_insertion", // not common, in favor of more specific terms above
+         "missense_variant",
+     };
+
+     private static readonly string[] LowPutativeImpactEffects = new string[]
+     {
+         "5_prime_UTR_premature_start_codon_gain_variant",
+         "initiator_codon_variant",
+         "splice_region_variant",
+         "start_retained", // not used in human, with only one canonical start codon
+         "stop_retained_variant", // fairly common
+         "synonymous_variant",
+         "sequence_feature"
+     };
+
+     private static readonly string[] ModifierEffects = new string[]
+     {
+         "3_prime_UTR_variant",
+         "5_prime_UTR_variant",
+         "coding_sequence_variant",
+         "conserved_intergenic_variant",
+         "conserved_intron_variant",
+         "downstream_gene_variant",
+         "exon_variant",
+         "feature_elongation",
+         "feature_truncation",
+         "gene_variant",
+         "intergenic_region",
+         "intragenic_variant",
+         "intron_variant",
+         "mature_miRNA_variant",
+         "miRNA",
+         "NMD_transcript_variant",
+         "non_coding_transcript_exon_variant",
+         "non_coding_transcript_variant",
+         "regulatory_region_amplification",
+         "regulatory_region_variant",
+         "TF_binding_site_variant",
+         "TFBS_amplification",
+         "transcript_amplification",
+         "transcript_variant",
+         "upstream_gene_variant"
+     };
+
+     // Warnings that set BadTranscript in the constructor.
+     private static readonly string[] BadTranscriptWarnings = new string[]
+     {
+         "WARNING_TRANSCRIPT_INCOMPLETE",
+         "WARNING_TRANSCRIPT_MULTIPLE_STOP_CODONS",
+         "WARNING_TRANSCRIPT_NO_STOP_CODON",
+         "WARNING_TRANSCRIPT_NO_START_CODON"
+     };
+
+     /// <summary>
+     /// It looks like WARNING_TRANSCRIPT_INCOMPLETE, WARNING_TRANSCRIPT_MULTIPLE_STOP_CODONS,
+     /// WARNING_TRANSCRIPT_NO_STOP_CODON, and WARNING_TRANSCRIPT_NO_START_CODON are relevant to this program.
+     ///
+     /// These are the ones that I shouldn't be translating.
+     ///
+     /// Could also be used for error messages regarding certain transcripts.
+     /// </summary>
+     public Dictionary<string, string> SnpEffWarningDescriptions = new Dictionary<string, string>
+     {
+         { "ERROR_CHROMOSOME_NOT_FOUND", "Chromosome does not exists in reference genome database." },
+         { "ERROR_OUT_OF_CHROMOSOME_RANGE", "The variant’s genomic coordinate is greater than chromosome's length." },
+         { "WARNING_REF_DOES_NOT_MATCH_GENOME", "This means that the ‘REF’ field in the input VCF file does not match the reference genome." },
+         { "WARNING_SEQUENCE_NOT_AVAILABLE", "Reference sequence is not available, thus no inference could be performed." },
+         { "WARNING_TRANSCRIPT_INCOMPLETE", "A protein coding transcript having a non-multiple of 3 length, indicating that the reference genome has missing information about this trancript." },
+         { "WARNING_TRANSCRIPT_MULTIPLE_STOP_CODONS", "A protein coding transcript has two or more STOP codons in the middle of the coding sequence (CDS). This should not happen and it usually means the reference genome may have an error in this transcript." },
+         { "WARNING_TRANSCRIPT_NO_START_CODON", "A protein coding transcript does not have a proper START codon. It is rare that a real transcript does not have a START codon, so this probably indicates an error or missing information in the reference genome." },
+         { "WARNING_TRANSCRIPT_NO_STOP_CODON", "A protein coding transcript does not have a proper STOP codon. It is rare that a real transcript does not have a STOP codon, so this probably indicates an error or missing information in the reference genome." },
+         { "INFO_REALIGN_3_PRIME", "Variant has been realigned to the most 3-prime position within the transcript. This is usually done to to comply with HGVS specification to always report the most 3-prime annotation." },
+         { "INFO_COMPOUND_ANNOTATION", "This effect is a result of combining more than one variants." },
+         { "INFO_NON_REFERENCE_ANNOTATION", "An alternative reference sequence was used to calculate this annotation." },
+     };
+ }
+}
\ No newline at end of file
diff --git a/mzLib/MassSpectrometry/Proteomics/Protein/SpliceSite.cs b/mzLib/MassSpectrometry/Proteomics/Protein/SpliceSite.cs
new file mode 100644
index 000000000..c1c7cfadb
--- /dev/null
+++ b/mzLib/MassSpectrometry/Proteomics/Protein/SpliceSite.cs
@@ -0,0 +1,37 @@
+namespace Proteomics
+{
+ /// <summary>
+ /// A described splice site spanning the one-based positions begin..end.
+ /// </summary>
+ public class SpliceSite
+ {
+     /// <summary>
+     /// Creates a splice site over a range. A null <paramref name="description"/> is stored as the empty string.
+     /// </summary>
+     public SpliceSite(int oneBasedBegin, int oneBasedEnd, string description)
+     {
+         Description = description ?? "";
+         OneBasedBeginPosition = oneBasedBegin;
+         OneBasedEndPosition = oneBasedEnd;
+     }
+
+     /// <summary>
+     /// Creates a splice site at a single position (begin and end are both <paramref name="oneBasedPosition"/>).
+     /// </summary>
+     public SpliceSite(int oneBasedPosition, string description)
+         : this(oneBasedPosition, oneBasedPosition, description)
+     {
+     }
+
+     public int OneBasedBeginPosition { get; }
+     public int OneBasedEndPosition { get; }
+     public string Description { get; }
+
+     /// <summary>
+     /// Two splice sites are equal when begin, end and description all match.
+     /// </summary>
+     public override bool Equals(object obj)
+     {
+         if (!(obj is SpliceSite other))
+         {
+             return false;
+         }
+         bool samePositions = other.OneBasedBeginPosition == OneBasedBeginPosition
+             && other.OneBasedEndPosition == OneBasedEndPosition;
+         return samePositions && other.Description == Description;
+     }
+
+     /// <summary>
+     /// XOR of the hashes of begin, end and description.
+     /// </summary>
+     public override int GetHashCode()
+     {
+         int positionHash = OneBasedBeginPosition.GetHashCode() ^ OneBasedEndPosition.GetHashCode();
+         return positionHash ^ Description.GetHashCode(); // null handled in constructor
+     }
+ }
+}
\ No newline at end of file
diff --git a/mzLib/MassSpectrometry/Proteomics/Protein/VariantApplication.cs b/mzLib/MassSpectrometry/Proteomics/Protein/VariantApplication.cs
new file mode 100644
index 000000000..f2554694e
--- /dev/null
+++ b/mzLib/MassSpectrometry/Proteomics/Protein/VariantApplication.cs
@@ -0,0 +1,397 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using Proteomics;
+
+namespace Proteomics
+{
+ public static class VariantApplication
+ {
+ /// <summary>
+ /// Gets the accession for a protein with applied variations
+ /// </summary>
+ /// <param name="protein">Protein whose NonVariantProtein supplies the base accession</param>
+ /// <param name="appliedSequenceVariations">Variations applied to the protein; may be null or empty</param>
+ /// <returns>
+ /// The base accession alone when there are no variations; otherwise the base accession followed by
+ /// "_" and the result of CombineSimpleStrings for the variations.
+ /// </returns>
+ public static string GetAccession(Protein protein,IEnumerable appliedSequenceVariations)
+ {
+     // Any() stops at the first element, whereas Count() walks the whole sequence just to test for emptiness.
+     bool noVariations = appliedSequenceVariations == null || !appliedSequenceVariations.Any();
+     return protein.NonVariantProtein.Accession +
+         (noVariations ? "" : $"_{CombineSimpleStrings(appliedSequenceVariations)}");
+ }
+
+ ///