
Dev - maxlfq and sum protein quantification strategies and optimization #66

Merged
6 commits merged on Sep 19, 2023
2 changes: 1 addition & 1 deletion src/csodiaq/csodiaq.py
@@ -109,7 +109,7 @@ def run_scoring(args):
if len(proteinDfs) > 0:
printer("Begin Quantifying Common Proteins")
commonProteinDf = compile_common_protein_quantification_file(
proteinDfs, commonPeptideDf, args["proteinQuantMethod"]
proteinDfs, commonPeptideDf, args["proteinQuantMethod"], args["minNumDifferences"]
)
commonProteinDf.to_csv(os.path.join(outputDir, "commonProteins.csv"))
printer("Finish Scoring")
14 changes: 13 additions & 1 deletion src/csodiaq/csodiaqParser.py
@@ -4,6 +4,7 @@
import numpy as np
from abc import ABC, abstractmethod
import re
import warnings


def set_args_from_command_line_input():
@@ -97,10 +98,17 @@ def add_score_parser(commandParser):
scoringParser.add_argument(
"-p",
"--proteinQuantMethod",
choices=["maxlfq", "average"],
choices=["maxlfq", "sum"],
default="maxlfq",
help="Method by which protein quantification metric is calculated (based on peptide quantities). \nOptional, default is 'maxlfq' method. Choices are 'maxlfq' or 'sum'.",
)
scoringParser.add_argument(
"-min",
"--minNumDifferences",
type=_RestrictedInt("minNumDifferences", minValue=1, maxValue=2),
default=2,
help="Specific to the maxlfq protein quantification method. Requires at least the given number of shared peptide matches before a sample-to-sample ratio (log difference) is accepted.\nOptional, default is 2. Only 1 or 2 is accepted.\nThis flag will raise a warning when paired with a protein quantification method other than 'maxlfq'.",
)
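
For reference, a hypothetical invocation combining the two new flags might look like the following (other scoring arguments omitted; the exact command layout depends on the rest of the parser):

    csodiaq score -p maxlfq -min 1

Pairing -min with a method other than 'maxlfq' (for example -p sum -min 1) triggers the warning added to check_for_conflicting_args below.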


def add_reanalysis_parser(commandParser):
@@ -147,6 +155,10 @@ def check_for_conflicting_args(args):
raise argparse.ArgumentTypeError(
"The correctionDegree parameter is invalidated by the noCorrection flag. Please inspect your input and remove one of them."
)
if args["command"] == "score" and args["proteinQuantMethod"] != "maxlfq" and args["minNumDifferences"] != 2:
warnings.warn(
f"The minNumDifferences flag will only have an effect when paired with the 'maxlfq' proteinQuantMethod flag. You used it with the '{args['proteinQuantMethod']}' method, so this flag will be ignored.", UserWarning
)
if (
args["command"] == "targetedReanalysis"
and args["protein"]
2 changes: 1 addition & 1 deletion src/csodiaq/identification/matchingFunctions.py
@@ -221,7 +221,7 @@ def calculate_ppm_offset_tolerance_using_tallest_bin_peak(ppms):
binHeights, tallestBinIdx
)
offset = bins[tallestBinIdx]
tolerance = abs(offset - bins[nearestNoiseBinIdx])
tolerance = abs(offset - bins[nearestNoiseBinIdx]) + 1
return offset, tolerance


81 changes: 34 additions & 47 deletions src/csodiaq/scoring/quantificationFunctions.py
@@ -54,21 +54,33 @@ def extract_all_ion_counts_from_df(df, allValuesToCompare, columnName):


def compile_common_protein_quantification_file(
proteinDfs, commonPeptidesDf, proteinQuantificationMethod
proteinDfs, commonPeptidesDf, proteinQuantificationMethod, minNumDifferences
):
if proteinQuantificationMethod == "average":
return compile_ion_count_comparison_across_runs_df(
{
key: calculate_ion_count_for_each_protein_in_protein_fdr_df(value)
for key, value in proteinDfs.items()
},
"protein",
)
elif proteinQuantificationMethod == "maxlfq":
return run_maxlfq_on_all_proteins_found_across_runs(
proteinDfs, commonPeptidesDf
headerToProteinPresenceDict = {header: set(proteinDf["leadingProtein"]) for header, proteinDf in proteinDfs.items()}
proteinPeptideDict = (
pd.concat(list(proteinDfs.values()))
.groupby("leadingProtein")["peptide"]
.apply(set)
.to_dict()
)
proteinQuantitiesDict = {}
for protein, peptideSet in proteinPeptideDict.items():
peptideQuantityDf = commonPeptidesDf[list(peptideSet)]
peptideQuantityDf = set_non_present_protein_levels_to_zero(
peptideQuantityDf, protein, headerToProteinPresenceDict
)
if proteinQuantificationMethod == "sum":
proteinSampleQuantities = ion_count_sum(peptideQuantityDf)
elif proteinQuantificationMethod == "maxlfq":
proteinSampleQuantities = run_maxlfq_with_normalizations(peptideQuantityDf, minNumDifferences)
proteinQuantitiesDict[protein] = proteinSampleQuantities
commonProteinsDf = pd.DataFrame.from_dict(proteinQuantitiesDict)
commonProteinsDf.index = commonPeptidesDf.index
return commonProteinsDf

def ion_count_sum(sampleByPeptideMatrix):
sampleByPeptideMatrix = sampleByPeptideMatrix.loc[:, (sampleByPeptideMatrix != 0).all(axis=0)]
return np.array(sampleByPeptideMatrix.sum(axis=1))
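
A quick illustration of the new 'sum' strategy (a toy sketch with made-up numbers, not part of the module): peptide columns containing a zero in any sample are dropped before the row-wise sum, exactly as ion_count_sum does above.

    import numpy as np
    import pandas as pd

    # Toy sample-by-peptide matrix; peptideB is missing (0) in sample2,
    # so its column is dropped before summing across peptides.
    sampleByPeptideMatrix = pd.DataFrame(
        {"peptideA": [100.0, 110.0], "peptideB": [50.0, 0.0]},
        index=["sample1", "sample2"],
    )
    kept = sampleByPeptideMatrix.loc[:, (sampleByPeptideMatrix != 0).all(axis=0)]
    print(np.array(kept.sum(axis=1)))  # -> [100. 110.]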

def set_non_present_protein_levels_to_zero(
peptideQuantityDf, protein, headerToProteinPresenceDict
@@ -82,40 +94,15 @@ def set_non_present_protein_levels_to_zero(
)
peptideQuantityDf.loc[
peptideQuantityDf.index.isin(indicesWithoutProtein), :
] = -np.inf
] = 0
return peptideQuantityDf


def run_maxlfq_on_all_proteins_found_across_runs(proteinDfs, commonPeptidesDf):
peptideProteinConnections = []
headerToProteinPresenceDict = {}
for header, proteinDf in proteinDfs.items():
peptideProteinConnectionsForSample = (
initialize__format_peptide_protein_connections(
proteinDf, proteinColumn="leadingProtein"
)
)
peptideProteinConnections.append(peptideProteinConnectionsForSample)
headerToProteinPresenceDict[header] = set(
peptideProteinConnectionsForSample["protein"]
)
proteinPeptideDict = (
pd.concat(peptideProteinConnections)
.groupby("protein")["peptide"]
.apply(set)
.to_dict()
)
normalizedCommonPeptideDf = np.log(commonPeptidesDf)
proteinQuantitiesDict = {}
for protein, peptideSet in proteinPeptideDict.items():
peptideQuantityDf = normalizedCommonPeptideDf[list(peptideSet)]
peptideQuantityDf = set_non_present_protein_levels_to_zero(
peptideQuantityDf, protein, headerToProteinPresenceDict
)
proteinQuantitiesDict[protein] = maxlfq(peptideQuantityDf.to_numpy())
commonProteinsDf = pd.DataFrame.from_dict(proteinQuantitiesDict)
commonProteinsDf.index = commonPeptidesDf.index
return np.exp(commonProteinsDf).replace(1, 0)
def run_maxlfq_with_normalizations(peptideQuantityDf, minNumDifferences):
proteinSampleQuantities = maxlfq(np.log(peptideQuantityDf).to_numpy(), minNumDifferences)
proteinSampleQuantities = np.exp(proteinSampleQuantities)
proteinSampleQuantities[proteinSampleQuantities == 1] = 0
return proteinSampleQuantities
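
A minimal sketch of the log/exp round trip this wrapper performs (illustrative values only): samples that maxlfq leaves at 0 in log space become exp(0) = 1 after exponentiation, so they are reset to 0.

    import numpy as np

    logQuantities = np.array([4.6, 5.3, 0.0])  # hypothetical maxlfq output in log space
    quantities = np.exp(logQuantities)         # third sample becomes exp(0) = 1.0
    quantities[quantities == 1] = 0            # unquantified sample mapped back to 0
    print(quantities)                          # -> [ 99.48  200.34    0.  ] (rounded)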


def prepare_matrices_for_cholesky_factorization(
@@ -162,13 +149,13 @@ def calculate_protein_intensities_across_samples_from_cholesky_factorization(A,
return linalg.cho_solve(a, B)


def maxlfq(sampleByPeptideMatrix, tolerance=-10.0):
def maxlfq(sampleByPeptideMatrix, minNumDifferences, tolerance=-10.0):
sampleByPeptideMatrix[sampleByPeptideMatrix < tolerance] = 0
A, B = prepare_matrices_for_cholesky_factorization(sampleByPeptideMatrix)
oneOrFewerMatchIdx = np.diagonal(A) == 0
A, B = prepare_matrices_for_cholesky_factorization(sampleByPeptideMatrix, minNumDifferences)
unmatchedIdx = np.diagonal(A) == 0
A, B = apply_regularization_to_matrices(sampleByPeptideMatrix, A, B)
proteinScoreAcrossSamples = (
calculate_protein_intensities_across_samples_from_cholesky_factorization(A, B)
)
proteinScoreAcrossSamples[oneOrFewerMatchIdx] = 0
proteinScoreAcrossSamples[unmatchedIdx] = 0
return proteinScoreAcrossSamples
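
For context, a simplified sketch of the MaxLFQ idea and of how minNumDifferences gates pairwise comparisons: build a median log-ratio for each sample pair that shares enough peptides, then solve the resulting least-squares system for per-sample log intensities. This illustrates the general algorithm only; it is not the repository's Cholesky-based implementation above.

    import numpy as np

    def maxlfq_sketch(logMatrix, minNumDifferences=2):
        # logMatrix: samples x peptides log intensities, with 0 meaning "missing".
        nSamples = logMatrix.shape[0]
        A = np.zeros((nSamples, nSamples))
        b = np.zeros(nSamples)
        for i in range(nSamples):
            for j in range(i + 1, nSamples):
                shared = (logMatrix[i] != 0) & (logMatrix[j] != 0)
                if shared.sum() < minNumDifferences:
                    continue  # too few shared peptides; skip this sample pair
                ratio = np.median(logMatrix[j, shared] - logMatrix[i, shared])
                A[i, i] += 1
                A[j, j] += 1
                A[i, j] -= 1
                A[j, i] -= 1
                b[i] -= ratio
                b[j] += ratio
        # A small ridge term keeps the otherwise singular system solvable;
        # the result is a set of relative (mean-centered) log intensities.
        return np.linalg.solve(A + 1e-6 * np.eye(nSamples), b)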
2 changes: 1 addition & 1 deletion src/csodiaq/scoring/scoringFunctions.py
@@ -8,7 +8,7 @@ def score_library_to_query_matches(matches):
matches.groupby(["libraryIdx", "queryIdx"])
.apply(
lambda x: calculate_cosine_similarity_score(
x["libraryIntensity"], x["queryIntensity"]
np.sqrt(x["libraryIntensity"]), np.sqrt(x["queryIntensity"])
)
)
.reset_index(name="cosineScore")
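
The effect of the change above, sketched with hypothetical intensities: square-root transforming both vectors before the cosine score damps the dominance of very tall peaks, so mismatches among weaker peaks count for more.

    import numpy as np

    def cosine(a, b):
        return a.dot(b) / (np.linalg.norm(a) * np.linalg.norm(b))

    library = np.array([10000.0, 400.0, 100.0])  # hypothetical library peak intensities
    query = np.array([9000.0, 350.0, 5.0])       # hypothetical query peak intensities
    print(cosine(library, query))                    # ~0.99995, dominated by the tallest peak
    print(cosine(np.sqrt(library), np.sqrt(query)))  # ~0.997, weak-peak mismatch now matters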
@@ -250,14 +250,15 @@ def find_number_of_total_query_peaks_in_scan(spectra):


def create_vector_that_can_be_used_to_create_cosine_score(vectorA, cosineScore):
vectorA = np.sqrt(vectorA)
unitVector = vectorA / np.linalg.norm(vectorA)
positiveVector = np.array(np.full((len(vectorA)), 1000.0))
perpendicularVector = positiveVector - positiveVector.dot(unitVector) * unitVector
perpendicularVector = perpendicularVector / np.linalg.norm(perpendicularVector)
vectorB = (
cosineScore * unitVector + np.sqrt(1 - cosineScore**2) * perpendicularVector
)
return vectorB
return vectorB ** 2
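
A quick sanity check of the updated helper (illustrative; assumes the function above is in scope): the squared vector it returns should reproduce the requested cosine score once both vectors are square-root transformed, matching the new scoring behavior.

    import numpy as np

    vectorA = np.array([400.0, 100.0, 25.0])  # hypothetical peak intensities
    vectorB = create_vector_that_can_be_used_to_create_cosine_score(vectorA, 0.9)
    a, b = np.sqrt(vectorA), np.sqrt(vectorB)
    print(round(a.dot(b) / (np.linalg.norm(a) * np.linalg.norm(b)), 3))  # -> 0.9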


def calculate_mz_value_from_ppm(mz, ppm):
@@ -10,7 +10,7 @@ def format_query_mz_values(self, libraryMzs, mzWindowGroup):
numBins = 200
ppmCenterBin = 10.0
binWidth = 0.1
emptyBinNum = 4
emptyBinNum = 20
centerBinDistribution = np.random.uniform(
ppmCenterBin - (binWidth / 2), ppmCenterBin + (binWidth / 2), size=(4,)
)
15 changes: 12 additions & 3 deletions tests/system/system_tests/identification/test_identification.py
@@ -277,9 +277,7 @@ def test__identification__baseline_no_matches_raises_error(
outputDir,
baselineSpectraBreakdown,
)
assert_no_matches_after_correction_raises_warning(
inputQueryFile, libraryFile, outputDir, baselineSpectraBreakdown
)



def assert_no_matches_before_correction_raises_warning_and_skips_offending_file_only(
@@ -321,6 +319,17 @@ def assert_no_matches_before_correction_raises_warning_and_skips_offending_file_
def assert_no_matches_after_correction_raises_warning(
inputQueryFile, libraryFile, outputDir, baselineSpectraBreakdown
):
"""
Currently unused. The applicability of this warning check is questionable, and it may be removed entirely in the future.

Would be added to 'test__identification__baseline_no_matches_raises_error', though it would require using
input data with wider differences between m/z values in each spectrum.
'''
assert_no_matches_after_correction_raises_warning(
inputQueryFile, libraryFile, outputDir, baselineSpectraBreakdown
)
'''
"""
args = [
"csodiaq",
"id",
6 changes: 3 additions & 3 deletions tests/system/system_tests/scoring/__init__.py
@@ -15,7 +15,7 @@
OneMatchSample1And2Breakdown,
OneMatchSample3Breakdown,
)
from .testFileContentCreators.OverlapCommonProteinScoresBreakdown import (
OverlapSample1And2Breakdown,
OvelapSample3Breakdown,
from .testFileContentCreators.ClusteredCommonProteinScoresBreakdown import (
ClusterSample1And2Breakdown,
ClusterSample3And4Breakdown,
)
@@ -0,0 +1,60 @@
import pandas as pd
from .BaselineCommonProteinScoresBreakdown import BaselineCommonProteinScoresBreakdown

"""
NOTE: this breakdown is specific to the maxlfq method. It tests the viability
of quantifying a protein whose peptides fall into two unconnected clusters
across samples.

protein quantification profile:
           peptide1  peptide2  peptide3  peptide4
sample1       100.0     100.0         -         -
sample2       100.0     100.0         -         -
sample3           -         -     200.0     200.0
sample4           -         -     200.0     200.0

protein quantification:
protein
sample1 100.0
sample2 100.0
sample3 200.0
sample4 200.0
"""

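Spelled out, the profile above corresponds to a common-peptides matrix like the following (an illustrative sketch; the actual tests assemble it through the breakdown classes below):

    import pandas as pd

    commonPeptidesDf = pd.DataFrame(
        {
            "peptide1": [100.0, 100.0, 0.0, 0.0],
            "peptide2": [100.0, 100.0, 0.0, 0.0],
            "peptide3": [0.0, 0.0, 200.0, 200.0],
            "peptide4": [0.0, 0.0, 200.0, 200.0],
        },
        index=["sample1", "sample2", "sample3", "sample4"],
    )
    # With minNumDifferences=2, samples 1-2 share two peptides and samples 3-4 share two,
    # but no cross-cluster sample pair shares any, so each cluster must be quantified on its own.
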

class ClusterSample1And2Breakdown(BaselineCommonProteinScoresBreakdown):
def _create_input_for_quantifiable_proteins(self):
df = pd.DataFrame(
[
["peptide1", "1/protein", 1, 0, 0, 100.0],
["peptide2", "1/protein", 1, 0, 0, 100.0],
["peptide3", "1/protein", 1, 0, 0, 0.0],
["peptide4", "1/protein", 1, 0, 0, 0.0],
],
columns=["peptide", "protein", "zLIB", "isDecoy", "rank", "ionCount"],
)
return pd.concat([df, self._create_generic_peptide_line()])

def _make_expected_protein_output(self, peptideDf):
proteinDf = super()._make_expected_protein_output(peptideDf)
proteinDf["uniquePeptide"] = [0, 0, 0, 0, 1]
return proteinDf


class ClusterSample3And4Breakdown(BaselineCommonProteinScoresBreakdown):
def _create_input_for_quantifiable_proteins(self):
df = pd.DataFrame(
[
["peptide1", "1/protein", 1, 0, 0, 0.0],
["peptide2", "1/protein", 1, 0, 0, 0.0],
["peptide3", "1/protein", 1, 0, 0, 200.0],
["peptide4", "1/protein", 1, 0, 0, 200.0],
],
columns=["peptide", "protein", "zLIB", "isDecoy", "rank", "ionCount"],
)
return pd.concat([df, self._create_generic_peptide_line()])

def _make_expected_protein_output(self, peptideDf):
proteinDf = super()._make_expected_protein_output(peptideDf)
proteinDf["uniquePeptide"] = [0, 0, 0, 0, 1]
return proteinDf
@@ -11,9 +11,9 @@

protein quantification for 'sum' method:
protein
sample1 200.0
sample2 233.3
sample3 266.7
sample1 600.0
sample2 700.0
sample3 800.0

protein quantification for 'maxlfq' method:
protein

This file was deleted.
