
Dev - maxlfq and sum protein quantification strategies and optimization #66

Merged
6 commits merged on Sep 19, 2023
2 changes: 1 addition & 1 deletion src/csodiaq/csodiaq.py
@@ -109,7 +109,7 @@ def run_scoring(args):
if len(proteinDfs) > 0:
printer("Begin Quantifying Common Proteins")
commonProteinDf = compile_common_protein_quantification_file(
proteinDfs, commonPeptideDf, args["proteinQuantMethod"]
proteinDfs, commonPeptideDf, args["proteinQuantMethod"], args["minNumDifferences"]
)
commonProteinDf.to_csv(os.path.join(outputDir, "commonProteins.csv"))
printer("Finish Scoring")
14 changes: 13 additions & 1 deletion src/csodiaq/csodiaqParser.py
@@ -4,6 +4,7 @@
import numpy as np
from abc import ABC, abstractmethod
import re
import warnings


def set_args_from_command_line_input():
@@ -97,10 +98,17 @@ def add_score_parser(commandParser):
scoringParser.add_argument(
"-p",
"--proteinQuantMethod",
choices=["maxlfq", "average"],
choices=["maxlfq", "sum"],
default="maxlfq",
help="Method by which protein quantification metric is calculated (based on peptide quantities). \nOptional, default is 'maxlfq' method. Choices are 'maxlfq' or 'sum'.",
)
scoringParser.add_argument(
"-min",
"--minNumDifferences",
type=_RestrictedInt("minNumDifferences", minValue=1, maxValue=2),
default=2,
help="Specific to the maxlfq protein quantification method. Requires at least the given number of shared peptide matches before a sample-to-sample ratio (log difference) is accepted.\nOptional, default is 2. Only 1 or 2 is accepted.\nThis flag will raise a warning when paired with a protein quantification method other than 'maxlfq'.",
)
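
For reference, a hypothetical invocation combining the two new flags might look like the following (other scoring arguments omitted; the exact command layout depends on the rest of the parser):

    csodiaq score -p maxlfq -min 1

Pairing -min with a method other than 'maxlfq' (for example -p sum -min 1) triggers the warning added to check_for_conflicting_args below.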


def add_reanalysis_parser(commandParser):
@@ -147,6 +155,10 @@ def check_for_conflicting_args(args):
raise argparse.ArgumentTypeError(
"The correctionDegree parameter is invalidated by the noCorrection flag. Please inspect your input and remove one of them."
)
if args["command"] == "score" and args["proteinQuantMethod"] != "maxlfq" and args["minNumDifferences"] != 2:
warnings.warn(
f"The minNumDifferences flag will only have an effect when paired with the 'maxlfq' proteinQuantMethod flag. You used it with the '{args['proteinQuantMethod']}' method, so this flag will be ignored.", UserWarning
)
if (
args["command"] == "targetedReanalysis"
and args["protein"]
2 changes: 1 addition & 1 deletion src/csodiaq/identification/matchingFunctions.py
@@ -221,7 +221,7 @@ def calculate_ppm_offset_tolerance_using_tallest_bin_peak(ppms):
binHeights, tallestBinIdx
)
offset = bins[tallestBinIdx]
tolerance = abs(offset - bins[nearestNoiseBinIdx])
tolerance = abs(offset - bins[nearestNoiseBinIdx]) + 1
return offset, tolerance


81 changes: 34 additions & 47 deletions src/csodiaq/scoring/quantificationFunctions.py
@@ -54,21 +54,33 @@ def extract_all_ion_counts_from_df(df, allValuesToCompare, columnName):


def compile_common_protein_quantification_file(
proteinDfs, commonPeptidesDf, proteinQuantificationMethod
proteinDfs, commonPeptidesDf, proteinQuantificationMethod, minNumDifferences
):
if proteinQuantificationMethod == "average":
return compile_ion_count_comparison_across_runs_df(
{
key: calculate_ion_count_for_each_protein_in_protein_fdr_df(value)
for key, value in proteinDfs.items()
},
"protein",
)
elif proteinQuantificationMethod == "maxlfq":
return run_maxlfq_on_all_proteins_found_across_runs(
proteinDfs, commonPeptidesDf
headerToProteinPresenceDict = {header: set(proteinDf["leadingProtein"]) for header, proteinDf in proteinDfs.items()}
proteinPeptideDict = (
pd.concat(list(proteinDfs.values()))
.groupby("leadingProtein")["peptide"]
.apply(set)
.to_dict()
)
proteinQuantitiesDict = {}
for protein, peptideSet in proteinPeptideDict.items():
peptideQuantityDf = commonPeptidesDf[list(peptideSet)]
peptideQuantityDf = set_non_present_protein_levels_to_zero(
peptideQuantityDf, protein, headerToProteinPresenceDict
)
if proteinQuantificationMethod == "sum":
proteinSampleQuantities = ion_count_sum(peptideQuantityDf)
elif proteinQuantificationMethod == "maxlfq":
proteinSampleQuantities = run_maxlfq_with_normalizations(peptideQuantityDf, minNumDifferences)
proteinQuantitiesDict[protein] = proteinSampleQuantities
commonProteinsDf = pd.DataFrame.from_dict(proteinQuantitiesDict)
commonProteinsDf.index = commonPeptidesDf.index
return commonProteinsDf

def ion_count_sum(sampleByPeptideMatrix):
sampleByPeptideMatrix = sampleByPeptideMatrix.loc[:, (sampleByPeptideMatrix != 0).all(axis=0)]
return np.array(sampleByPeptideMatrix.sum(axis=1))
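
A quick illustration of the new 'sum' strategy (a toy sketch with made-up numbers, not part of the module): peptide columns containing a zero in any sample are dropped before the row-wise sum, exactly as ion_count_sum does above.

    import numpy as np
    import pandas as pd

    # Toy sample-by-peptide matrix; peptideB is missing (0) in sample2,
    # so its column is dropped before summing across peptides.
    sampleByPeptideMatrix = pd.DataFrame(
        {"peptideA": [100.0, 110.0], "peptideB": [50.0, 0.0]},
        index=["sample1", "sample2"],
    )
    kept = sampleByPeptideMatrix.loc[:, (sampleByPeptideMatrix != 0).all(axis=0)]
    print(np.array(kept.sum(axis=1)))  # -> [100. 110.]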

def set_non_present_protein_levels_to_zero(
peptideQuantityDf, protein, headerToProteinPresenceDict
@@ -82,40 +94,15 @@ def set_non_present_protein_levels_to_zero(
)
peptideQuantityDf.loc[
peptideQuantityDf.index.isin(indicesWithoutProtein), :
] = -np.inf
] = 0
return peptideQuantityDf


def run_maxlfq_on_all_proteins_found_across_runs(proteinDfs, commonPeptidesDf):
peptideProteinConnections = []
headerToProteinPresenceDict = {}
for header, proteinDf in proteinDfs.items():
peptideProteinConnectionsForSample = (
initialize__format_peptide_protein_connections(
proteinDf, proteinColumn="leadingProtein"
)
)
peptideProteinConnections.append(peptideProteinConnectionsForSample)
headerToProteinPresenceDict[header] = set(
peptideProteinConnectionsForSample["protein"]
)
proteinPeptideDict = (
pd.concat(peptideProteinConnections)
.groupby("protein")["peptide"]
.apply(set)
.to_dict()
)
normalizedCommonPeptideDf = np.log(commonPeptidesDf)
proteinQuantitiesDict = {}
for protein, peptideSet in proteinPeptideDict.items():
peptideQuantityDf = normalizedCommonPeptideDf[list(peptideSet)]
peptideQuantityDf = set_non_present_protein_levels_to_zero(
peptideQuantityDf, protein, headerToProteinPresenceDict
)
proteinQuantitiesDict[protein] = maxlfq(peptideQuantityDf.to_numpy())
commonProteinsDf = pd.DataFrame.from_dict(proteinQuantitiesDict)
commonProteinsDf.index = commonPeptidesDf.index
return np.exp(commonProteinsDf).replace(1, 0)
def run_maxlfq_with_normalizations(peptideQuantityDf, minNumDifferences):
proteinSampleQuantities = maxlfq(np.log(peptideQuantityDf).to_numpy(), minNumDifferences)
proteinSampleQuantities = np.exp(proteinSampleQuantities)
proteinSampleQuantities[proteinSampleQuantities == 1] = 0
return proteinSampleQuantities
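
A minimal sketch of the log/exp round trip this wrapper performs (illustrative values only): samples that maxlfq leaves at 0 in log space become exp(0) = 1 after exponentiation, so they are reset to 0.

    import numpy as np

    logQuantities = np.array([4.6, 5.3, 0.0])  # hypothetical maxlfq output in log space
    quantities = np.exp(logQuantities)         # third sample becomes exp(0) = 1.0
    quantities[quantities == 1] = 0            # unquantified sample mapped back to 0
    print(quantities)                          # -> [ 99.48  200.34    0.  ] (rounded)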


def prepare_matrices_for_cholesky_factorization(
@@ -162,13 +149,13 @@ def calculate_protein_intensities_across_samples_from_cholesky_factorization(A,
return linalg.cho_solve(a, B)


def maxlfq(sampleByPeptideMatrix, tolerance=-10.0):
def maxlfq(sampleByPeptideMatrix, minNumDifferences, tolerance=-10.0):
sampleByPeptideMatrix[sampleByPeptideMatrix < tolerance] = 0
A, B = prepare_matrices_for_cholesky_factorization(sampleByPeptideMatrix)
oneOrFewerMatchIdx = np.diagonal(A) == 0
A, B = prepare_matrices_for_cholesky_factorization(sampleByPeptideMatrix, minNumDifferences)
unmatchedIdx = np.diagonal(A) == 0
A, B = apply_regularization_to_matrices(sampleByPeptideMatrix, A, B)
proteinScoreAcrossSamples = (
calculate_protein_intensities_across_samples_from_cholesky_factorization(A, B)
)
proteinScoreAcrossSamples[oneOrFewerMatchIdx] = 0
proteinScoreAcrossSamples[unmatchedIdx] = 0
return proteinScoreAcrossSamples
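
For context, a simplified sketch of the MaxLFQ idea and of how minNumDifferences gates pairwise comparisons: build a median log-ratio for each sample pair that shares enough peptides, then solve the resulting least-squares system for per-sample log intensities. This illustrates the general algorithm only; it is not the repository's Cholesky-based implementation above.

    import numpy as np

    def maxlfq_sketch(logMatrix, minNumDifferences=2):
        # logMatrix: samples x peptides log intensities, with 0 meaning "missing".
        nSamples = logMatrix.shape[0]
        A = np.zeros((nSamples, nSamples))
        b = np.zeros(nSamples)
        for i in range(nSamples):
            for j in range(i + 1, nSamples):
                shared = (logMatrix[i] != 0) & (logMatrix[j] != 0)
                if shared.sum() < minNumDifferences:
                    continue  # too few shared peptides; skip this sample pair
                ratio = np.median(logMatrix[j, shared] - logMatrix[i, shared])
                A[i, i] += 1
                A[j, j] += 1
                A[i, j] -= 1
                A[j, i] -= 1
                b[i] -= ratio
                b[j] += ratio
        # A small ridge term keeps the otherwise singular system solvable;
        # the result is a set of relative (mean-centered) log intensities.
        return np.linalg.solve(A + 1e-6 * np.eye(nSamples), b)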
2 changes: 1 addition & 1 deletion src/csodiaq/scoring/scoringFunctions.py
@@ -8,7 +8,7 @@ def score_library_to_query_matches(matches):
matches.groupby(["libraryIdx", "queryIdx"])
.apply(
lambda x: calculate_cosine_similarity_score(
x["libraryIntensity"], x["queryIntensity"]
np.sqrt(x["libraryIntensity"]), np.sqrt(x["queryIntensity"])
)
)
.reset_index(name="cosineScore")
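
The effect of the change above, sketched with hypothetical intensities: square-root transforming both vectors before the cosine score damps the dominance of very tall peaks, so mismatches among weaker peaks count for more.

    import numpy as np

    def cosine(a, b):
        return a.dot(b) / (np.linalg.norm(a) * np.linalg.norm(b))

    library = np.array([10000.0, 400.0, 100.0])  # hypothetical library peak intensities
    query = np.array([9000.0, 350.0, 5.0])       # hypothetical query peak intensities
    print(cosine(library, query))                    # ~0.99995, dominated by the tallest peak
    print(cosine(np.sqrt(library), np.sqrt(query)))  # ~0.997, weak-peak mismatch now matters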
@@ -250,14 +250,15 @@ def find_number_of_total_query_peaks_in_scan(spectra):


def create_vector_that_can_be_used_to_create_cosine_score(vectorA, cosineScore):
vectorA = np.sqrt(vectorA)
unitVector = vectorA / np.linalg.norm(vectorA)
positiveVector = np.array(np.full((len(vectorA)), 1000.0))
perpendicularVector = positiveVector - positiveVector.dot(unitVector) * unitVector
perpendicularVector = perpendicularVector / np.linalg.norm(perpendicularVector)
vectorB = (
cosineScore * unitVector + np.sqrt(1 - cosineScore**2) * perpendicularVector
)
return vectorB
return vectorB ** 2
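
A quick sanity check of the updated helper (illustrative; assumes the function above is in scope): the squared vector it returns should reproduce the requested cosine score once both vectors are square-root transformed, matching the new scoring behavior.

    import numpy as np

    vectorA = np.array([400.0, 100.0, 25.0])  # hypothetical peak intensities
    vectorB = create_vector_that_can_be_used_to_create_cosine_score(vectorA, 0.9)
    a, b = np.sqrt(vectorA), np.sqrt(vectorB)
    print(round(a.dot(b) / (np.linalg.norm(a) * np.linalg.norm(b)), 3))  # -> 0.9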


def calculate_mz_value_from_ppm(mz, ppm):
@@ -10,7 +10,7 @@ def format_query_mz_values(self, libraryMzs, mzWindowGroup):
numBins = 200
ppmCenterBin = 10.0
binWidth = 0.1
emptyBinNum = 4
emptyBinNum = 20
centerBinDistribution = np.random.uniform(
ppmCenterBin - (binWidth / 2), ppmCenterBin + (binWidth / 2), size=(4,)
)
15 changes: 12 additions & 3 deletions tests/system/system_tests/identification/test_identification.py
@@ -277,9 +277,7 @@ def test__identification__baseline_no_matches_raises_error(
outputDir,
baselineSpectraBreakdown,
)
assert_no_matches_after_correction_raises_warning(
inputQueryFile, libraryFile, outputDir, baselineSpectraBreakdown
)



def assert_no_matches_before_correction_raises_warning_and_skips_offending_file_only(
@@ -321,6 +319,17 @@ def assert_no_matches_before_correction_raises_warning_and_skips_offending_file_
def assert_no_matches_after_correction_raises_warning(
inputQueryFile, libraryFile, outputDir, baselineSpectraBreakdown
):
"""
Currently unused. The applicability of this warning check is questionable, and it may be removed entirely in the future.

Would be added to 'test__identification__baseline_no_matches_raises_error', though it would require using
input data with wider differences between m/z values in each spectrum.
'''
assert_no_matches_after_correction_raises_warning(
inputQueryFile, libraryFile, outputDir, baselineSpectraBreakdown
)
'''
"""
args = [
"csodiaq",
"id",
6 changes: 3 additions & 3 deletions tests/system/system_tests/scoring/__init__.py
@@ -15,7 +15,7 @@
OneMatchSample1And2Breakdown,
OneMatchSample3Breakdown,
)
from .testFileContentCreators.OverlapCommonProteinScoresBreakdown import (
OverlapSample1And2Breakdown,
OvelapSample3Breakdown,
from .testFileContentCreators.ClusteredCommonProteinScoresBreakdown import (
ClusterSample1And2Breakdown,
ClusterSample3And4Breakdown,
)
@@ -0,0 +1,60 @@
import pandas as pd
from .BaselineCommonProteinScoresBreakdown import BaselineCommonProteinScoresBreakdown

"""
NOTE: this breakdown is specific to the maxlfq method. It tests the viability
of quantifying a protein whose peptides fall into two unconnected clusters
across samples.

protein quantification profile:
           peptide1  peptide2  peptide3  peptide4
sample1       100.0     100.0         -         -
sample2       100.0     100.0         -         -
sample3           -         -     200.0     200.0
sample4           -         -     200.0     200.0

protein quantification:
protein
sample1 100.0
sample2 100.0
sample3 200.0
sample4 200.0
"""

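Spelled out, the profile above corresponds to a common-peptides matrix like the following (an illustrative sketch; the actual tests assemble it through the breakdown classes below):

    import pandas as pd

    commonPeptidesDf = pd.DataFrame(
        {
            "peptide1": [100.0, 100.0, 0.0, 0.0],
            "peptide2": [100.0, 100.0, 0.0, 0.0],
            "peptide3": [0.0, 0.0, 200.0, 200.0],
            "peptide4": [0.0, 0.0, 200.0, 200.0],
        },
        index=["sample1", "sample2", "sample3", "sample4"],
    )
    # With minNumDifferences=2, samples 1-2 share two peptides and samples 3-4 share two,
    # but no cross-cluster sample pair shares any, so each cluster must be quantified on its own.
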

class ClusterSample1And2Breakdown(BaselineCommonProteinScoresBreakdown):
def _create_input_for_quantifiable_proteins(self):
df = pd.DataFrame(
[
["peptide1", "1/protein", 1, 0, 0, 100.0],
["peptide2", "1/protein", 1, 0, 0, 100.0],
["peptide3", "1/protein", 1, 0, 0, 0.0],
["peptide4", "1/protein", 1, 0, 0, 0.0],
],
columns=["peptide", "protein", "zLIB", "isDecoy", "rank", "ionCount"],
)
return pd.concat([df, self._create_generic_peptide_line()])

def _make_expected_protein_output(self, peptideDf):
proteinDf = super()._make_expected_protein_output(peptideDf)
proteinDf["uniquePeptide"] = [0, 0, 0, 0, 1]
return proteinDf


class ClusterSample3And4Breakdown(BaselineCommonProteinScoresBreakdown):
def _create_input_for_quantifiable_proteins(self):
df = pd.DataFrame(
[
["peptide1", "1/protein", 1, 0, 0, 0.0],
["peptide2", "1/protein", 1, 0, 0, 0.0],
["peptide3", "1/protein", 1, 0, 0, 200.0],
["peptide4", "1/protein", 1, 0, 0, 200.0],
],
columns=["peptide", "protein", "zLIB", "isDecoy", "rank", "ionCount"],
)
return pd.concat([df, self._create_generic_peptide_line()])

def _make_expected_protein_output(self, peptideDf):
proteinDf = super()._make_expected_protein_output(peptideDf)
proteinDf["uniquePeptide"] = [0, 0, 0, 0, 1]
return proteinDf
@@ -11,9 +11,9 @@

protein quantification for 'sum' method:
protein
sample1 200.0
sample2 233.3
sample3 266.7
sample1 600.0
sample2 700.0
sample3 800.0

protein quantification for 'maxlfq' method:
protein

This file was deleted.
