Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

better emperical null hypothesis for FlashLFQ's bayesian protein quant #510

Merged
merged 1 commit into from
Aug 26, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
164 changes: 54 additions & 110 deletions FlashLFQ/Bayesian Protein Quantification/ProteinQuantificationEngine.cs
Original file line number Diff line number Diff line change
Expand Up @@ -102,18 +102,9 @@ public void Run()
var conditions = results.SpectraFiles.Select(p => p.Condition).Distinct().ToList();
conditions.Remove(BaseCondition);

double baseConditionExperimentalNull = 0;
if (FoldChangeCutoff == null)
{
baseConditionExperimentalNull = DetermineExperimentalNull(BaseCondition);
}

// calculate fold-change for each protein
foreach (string condition in conditions)
{
double nullHyp = FoldChangeCutoff == null ?
Math.Max(DetermineExperimentalNull(condition), baseConditionExperimentalNull) : FoldChangeCutoff.Value;

Parallel.ForEach(Partitioner.Create(0, proteinList.Count), new ParallelOptions { MaxDegreeOfParallelism = MaxThreads }, partitionRange =>
{
for (int i = partitionRange.Item1; i < partitionRange.Item2; i++)
Expand All @@ -131,7 +122,44 @@ public void Run()
// run the Bayesian analysis
result = RunBayesianProteinQuant(peptideFoldChanges, protein, condition,
randomSeedsForEachProtein[protein][0], null, false, BurnInSteps, NSteps, out var mus);
}
else if (measurementCount == 1)
{
result = new ProteinQuantificationEngineResult(protein, BaseCondition, condition,
new double[] { peptideFoldChanges.SelectMany(p => p.Item2).First() }, new double[] { double.NaN },
new double[] { double.NaN }, peptideFoldChanges, ProteinToBaseConditionIntensity[protein]);
}
else
{
result = new ProteinQuantificationEngineResult(protein, BaseCondition, condition,
new double[] { 0.0 }, new double[] { double.NaN }, new double[] { double.NaN },
peptideFoldChanges, ProteinToBaseConditionIntensity[protein]);
}

protein.ConditionToQuantificationResults.Add(condition, result);
}
});
}

// calculate PEPs for each protein
foreach (string condition in conditions)
{
double nullHyp = FoldChangeCutoff == null ? DetermineExperimentalNull(condition) : FoldChangeCutoff.Value;

Parallel.ForEach(Partitioner.Create(0, proteinList.Count), new ParallelOptions { MaxDegreeOfParallelism = MaxThreads }, partitionRange =>
{
for (int i = partitionRange.Item1; i < partitionRange.Item2; i++)
{
ProteinGroup protein = proteinList[i].Key;

// get the fold-change measurements for the peptides assigned to this protein
List<(Peptide, List<double>)> peptideFoldChanges = GetPeptideFoldChangeMeasurements(protein, BaseCondition, condition, null, null);
int measurementCount = peptideFoldChanges.SelectMany(p => p.Item2).Count();

ProteinQuantificationEngineResult result = protein.ConditionToQuantificationResults[condition];

if (measurementCount > 1)
{
RunBayesianProteinQuant(peptideFoldChanges, protein, condition,
randomSeedsForEachProtein[protein][1], nullHyp, true, BurnInSteps, NSteps, out var skepticalMus);

Expand All @@ -140,24 +168,14 @@ public void Run()
}
else if (measurementCount == 1)
{
result = new ProteinQuantificationEngineResult(protein, BaseCondition, condition,
new double[] { peptideFoldChanges.SelectMany(p => p.Item2).First() }, new double[] { double.NaN },
new double[] { double.NaN }, peptideFoldChanges, ProteinToBaseConditionIntensity[protein]);

result.NullHypothesisCutoff = nullHyp;
result.CalculatePosteriorErrorProbability(new double[] { nullHyp });
}
else
{
result = new ProteinQuantificationEngineResult(protein, BaseCondition, condition,
new double[] { 0.0 }, new double[] { double.NaN }, new double[] { double.NaN },
peptideFoldChanges, ProteinToBaseConditionIntensity[protein]);

result.NullHypothesisCutoff = nullHyp;
result.CalculatePosteriorErrorProbability(new double[] { nullHyp });
}

protein.ConditionToQuantificationResults.Add(condition, result);
}
});
}
Expand Down Expand Up @@ -500,108 +518,34 @@ private ProteinQuantificationEngineResult RunBayesianProteinQuant(List<(Peptide,
}

/// <summary>
/// This function determines a fold-change cutoff from the data. The cutoff is the estimate of the biological variance.
/// If bioreps are defined, the standard deviation of the protein fold-change between bioreps within a condition is used
/// as the cutoff. If the condition only has 1 biorep, then the cutoff is the standard deviation of the protein fold-change
/// between conditions.
/// This function determines a fold-change cutoff (a null hypothesis) from the data. Each protein has an estimate of the
/// standard deviation of its peptide fold-changes. The protein's standard deviations are pooled and the null hypothesis
/// is estimated as the most common standard deviation. This is difficult to do because the distribution is usually skewed,
/// so median and average are pretty poor estimates of the mode. Instead, the smallest interval with 10% of the distribution's
/// density is found, and the median of this range is an estimate of the mode. This estimate of the mode standard deviation
/// is used as the null hypothesis.
/// </summary>
private double DetermineExperimentalNull(string treatmentCondition)
{
double experimentalNull;
double experimentalNull = 1.0;
var res = results.ProteinGroups.Values.Select(p => p.ConditionToQuantificationResults[treatmentCondition]).ToList();

int biorepCount = results.SpectraFiles.Where(p => p.Condition == treatmentCondition).Select(p => p.BiologicalReplicate).Distinct().Count();
List<double> stdDevs = res.Where(p => p.PeptideFoldChangeMeasurements
.SelectMany(v => v.foldChanges).Count() > 1).Select(p => p.StandardDeviationPointEstimate).ToList();

if ((PairedSamples || biorepCount == 1) && treatmentCondition == BaseCondition)
if (stdDevs.Any())
{
return 0;
}

var proteinList = ProteinsWithConstituentPeptides.ToList();

// generate a random seed for each protein for the MCMC sampler
var randomSeedsForEachProtein = new Dictionary<ProteinGroup, List<int>>();
foreach (var protein in ProteinsWithConstituentPeptides)
{
randomSeedsForEachProtein.Add(protein.Key, new List<int> { Rng.NextFullRangeInt32() });
}

var experimentalNullResults = new Dictionary<string, List<ProteinQuantificationEngineResult>>();

Parallel.ForEach(Partitioner.Create(0, proteinList.Count), new ParallelOptions { MaxDegreeOfParallelism = MaxThreads }, partitionRange =>
{
for (int i = partitionRange.Item1; i < partitionRange.Item2; i++)
{
ProteinGroup protein = proteinList[i].Key;

List<(Peptide, List<double>)> peptideFoldChanges = new List<(Peptide, List<double>)>();

if (biorepCount > 1)
{
if (!PairedSamples)
{
// get the fold-change measurements for the peptides assigned to this protein between bioreps of this condition
for (int b = 0; b < biorepCount - 1; b++)
{
peptideFoldChanges.AddRange(GetPeptideFoldChangeMeasurements(protein, treatmentCondition, treatmentCondition, b, b + 1));
}
}
else if (PairedSamples)
{
peptideFoldChanges.AddRange(GetPeptideFoldChangeMeasurements(protein, BaseCondition, treatmentCondition, null, null));
}
}
else
{
peptideFoldChanges.AddRange(GetPeptideFoldChangeMeasurements(protein, BaseCondition, treatmentCondition, null, null));
}

ProteinQuantificationEngineResult result;
int measurementCount = peptideFoldChanges.SelectMany(p => p.Item2).Count();

if (measurementCount > 1)
{
// run the Bayesian analysis
result = RunBayesianProteinQuant(peptideFoldChanges, protein, treatmentCondition,
randomSeedsForEachProtein[protein][0], null, false, 500, 500, out var mus);

lock (experimentalNullResults)
{
experimentalNullResults.Add(protein.ProteinGroupName, new List<ProteinQuantificationEngineResult> { result });
}
}
else
{
continue;
}
}
});
var tenPercentHdi = Util.GetHighestDensityInterval(stdDevs.ToArray(), 0.1);
var sdsWithinHdi = stdDevs.Where(p => p <= tenPercentHdi.hdi_end && p >= tenPercentHdi.hdi_start).ToList();

double stdDev = 0;
var modeStdDevPointEstimate = sdsWithinHdi.Median();
experimentalNull = modeStdDevPointEstimate;

// this is a weird case where there are very few proteins in the data...
if (experimentalNullResults.Count < 10)
{
if (experimentalNullResults.Any())
{
stdDev = experimentalNullResults.SelectMany(p => p.Value).Select(p => p.StandardDeviationPointEstimate).Average();
}
else
if (double.IsNaN(experimentalNull))
{
// TODO: not sure what to do here.. there were no protein quant results for this condition
// for now the cutoff is just made to be a fold-change of 1.0, but maybe an exception should be thrown...

// the experimental null is 3 * stdDev so the fold-change cutoff will end up being 1.0
stdDev = 1.0 / 3.0;

//throw new MzLibException("Could not determine experimental null for condition " + treatmentCondition);
experimentalNull = stdDevs.Median();
}
}
else
{
stdDev = experimentalNullResults.SelectMany(p => p.Value).Select(p => p.FoldChangePointEstimate).InterquartileRange() / 1.34896;
}

experimentalNull = 3.0 * stdDev;

return experimentalNull;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ public override string ToString()
TreatmentCondition + "\t" +
NullHypothesisCutoff.Value + "\t" +
FoldChangePointEstimate + "\t" +
StandardDeviationPointEstimate + "\t" +
ConditionIntensityPointEstimate + "\t" +
nPeptides + "\t" +
nMeasurements + "\t" +
Expand All @@ -122,10 +123,11 @@ public static string TabSeparatedHeader()
"Protein Group" + "\t" +
"Gene" + "\t" +
"Organism" + "\t" +
"Base Condition" + "\t" +
"Control Condition" + "\t" +
"Treatment Condition" + "\t" +
"Log2 Fold-Change Cutoff" + "\t" +
"Protein Log2 Fold-Change" + "\t" +
"Standard Deviation of Peptide Log2 Fold-Changes" + "\t" +
"Protein Intensity for Treatment Condition" + "\t" +
"Number of Peptides" + "\t" +
"Number of Fold-Change Measurements" + "\t" +
Expand Down
8 changes: 4 additions & 4 deletions TestFlashLFQ/TestFlashLFQ.cs
Original file line number Diff line number Diff line change
Expand Up @@ -787,8 +787,8 @@ public static void TestBayesianProteinQuantification()

var quantResult = proteinGroup.ConditionToQuantificationResults["b"];

Assert.That(Math.Round(quantResult.NullHypothesisCutoff.Value, 3) == 0.120);
Assert.That(Math.Round(quantResult.PosteriorErrorProbability, 3) == 0.229);
Assert.That(Math.Round(quantResult.NullHypothesisCutoff.Value, 3) == 0.202);
Assert.That(Math.Round(quantResult.PosteriorErrorProbability, 3) == 0.176);
Assert.That(Math.Round(quantResult.FoldChangePointEstimate, 3) == 1.007);
Assert.That(quantResult.PeptideFoldChangeMeasurements.Count == 1);
Assert.That(quantResult.PeptideFoldChangeMeasurements.SelectMany(v => v.foldChanges).Count() == 3);
Expand All @@ -799,7 +799,7 @@ public static void TestBayesianProteinQuantification()
var textResults = File.ReadAllLines(filepath);
Assert.That(textResults.Length == 2);
var line = textResults[1].Split(new char[] { '\t' });
Assert.That(Math.Round(double.Parse(line[11]), 3) == 0.229);
Assert.That(Math.Round(double.Parse(line[12]), 3) == 0.176);
File.Delete(filepath);

// try with defined fold-change cutoff
Expand Down Expand Up @@ -844,7 +844,7 @@ public static void TestBayesianProteinQuantification()

quantResult = proteinGroup.ConditionToQuantificationResults["b"];

Assert.That(Math.Round(quantResult.PosteriorErrorProbability, 3) == 0.139);
Assert.That(Math.Round(quantResult.PosteriorErrorProbability, 3) == 0.098);
Assert.That(Math.Round(quantResult.FoldChangePointEstimate, 3) == 1.103);
Assert.That(quantResult.PeptideFoldChangeMeasurements.Count == 1);
Assert.That(quantResult.PeptideFoldChangeMeasurements.SelectMany(v => v.foldChanges).Count() == 3);
Expand Down