diff --git a/src/csodiaq/csodiaq.py b/src/csodiaq/csodiaq.py
index 8a466ee..9ab37b7 100644
--- a/src/csodiaq/csodiaq.py
+++ b/src/csodiaq/csodiaq.py
@@ -109,7 +109,7 @@ def run_scoring(args):
     if len(proteinDfs) > 0:
         printer("Begin Quantifying Common Proteins")
         commonProteinDf = compile_common_protein_quantification_file(
-            proteinDfs, commonPeptideDf, args["proteinQuantMethod"]
+            proteinDfs, commonPeptideDf, args["proteinQuantMethod"], args["minNumDifferences"]
         )
         commonProteinDf.to_csv(os.path.join(outputDir, "commonProteins.csv"))
     printer("Finish Scoring")
diff --git a/src/csodiaq/csodiaqParser.py b/src/csodiaq/csodiaqParser.py
index 0491beb..537ff21 100644
--- a/src/csodiaq/csodiaqParser.py
+++ b/src/csodiaq/csodiaqParser.py
@@ -4,6 +4,7 @@ import numpy as np
 from abc import ABC, abstractmethod
 import re
+import warnings
 
 
 def set_args_from_command_line_input():
@@ -97,10 +98,17 @@ def add_score_parser(commandParser):
     scoringParser.add_argument(
         "-p",
         "--proteinQuantMethod",
-        choices=["maxlfq", "average"],
+        choices=["maxlfq", "sum"],
         default="maxlfq",
-        help="Method by which protein quantification metric is calculated (based on peptide quantities). \nOptional, default is 'maxlfq' method. Choices are 'maxlfq' or 'average'.",
+        help="Method by which protein quantification metric is calculated (based on peptide quantities). \nOptional, default is 'maxlfq' method. Choices are 'maxlfq' or 'sum'.",
     )
+    scoringParser.add_argument(
+        "-min",
+        "--minNumDifferences",
+        type=_RestrictedInt("minNumDifferences", minValue=1, maxValue=2),
+        default=2,
+        help="Specific to the maxLFQ protein quantification method. Requires at minimum the given number of shared peptide matches before a sample-to-sample ratio or difference is accepted.\nOptional, default is 2. Only 1 or 2 is accepted.\nThis flag will emit a warning when paired with a protein quantification method other than 'maxlfq'.",
+    )
 
 
 def add_reanalysis_parser(commandParser):
@@ -147,6 +155,10 @@ def check_for_conflicting_args(args):
         raise argparse.ArgumentTypeError(
             "The correctionDegree parameter is invalidated by the noCorrection flag. Please inspect your input and remove one of them."
         )
+    if args["command"] == "score" and args["proteinQuantMethod"] != "maxlfq" and args["minNumDifferences"] != 2:
+        warnings.warn(
+            f"The minNumDifferences flag will only have an effect when paired with the 'maxlfq' proteinQuantMethod flag. You used it with the '{args['proteinQuantMethod']}' method, so this flag will be ignored.", UserWarning
+        )
     if (
         args["command"] == "targetedReanalysis"
         and args["protein"]
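A note on the new `--minNumDifferences` flag: it delegates validation to `_RestrictedInt`, which the unit tests below import but whose body is not part of this diff. The following is a minimal sketch of such an argparse type callable, written to be consistent with the error messages the tests assert; the internals here are assumptions, not the project's actual implementation.

```python
import argparse

class _RestrictedInt:
    """Hypothetical sketch: an argparse type= callable restricting an int to [minValue, maxValue]."""

    def __init__(self, name, minValue=None, maxValue=None):
        self.name = name
        self.minValue = minValue
        self.maxValue = maxValue

    def __call__(self, value):
        try:
            intValue = int(value)
        except ValueError:
            # non-integer input such as "1.2" lands here
            raise argparse.ArgumentTypeError(
                f"The {self.name} argument must be an integer."
            )
        if self.minValue is not None and intValue < self.minValue:
            raise argparse.ArgumentTypeError(
                f"The {self.name} argument must be an integer greater than or equal to {self.minValue}."
            )
        if self.maxValue is not None and intValue > self.maxValue:
            raise argparse.ArgumentTypeError(
                f"The {self.name} argument must be an integer less than or equal to {self.maxValue}."
            )
        return intValue
```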
diff --git a/src/csodiaq/identification/matchingFunctions.py b/src/csodiaq/identification/matchingFunctions.py
index 1f3ba31..b216a9b 100644
--- a/src/csodiaq/identification/matchingFunctions.py
+++ b/src/csodiaq/identification/matchingFunctions.py
@@ -221,7 +221,7 @@ def calculate_ppm_offset_tolerance_using_tallest_bin_peak(ppms):
         binHeights, tallestBinIdx
     )
     offset = bins[tallestBinIdx]
-    tolerance = abs(offset - bins[nearestNoiseBinIdx])
+    tolerance = abs(offset - bins[nearestNoiseBinIdx]) + 1
     return offset, tolerance
 
 
diff --git a/src/csodiaq/scoring/quantificationFunctions.py b/src/csodiaq/scoring/quantificationFunctions.py
index 008308d..1c1e8f3 100644
--- a/src/csodiaq/scoring/quantificationFunctions.py
+++ b/src/csodiaq/scoring/quantificationFunctions.py
@@ -54,21 +54,33 @@ def extract_all_ion_counts_from_df(df, allValuesToCompare, columnName):
 
 
 def compile_common_protein_quantification_file(
-    proteinDfs, commonPeptidesDf, proteinQuantificationMethod
+    proteinDfs, commonPeptidesDf, proteinQuantificationMethod, minNumDifferences
 ):
-    if proteinQuantificationMethod == "average":
-        return compile_ion_count_comparison_across_runs_df(
-            {
-                key: calculate_ion_count_for_each_protein_in_protein_fdr_df(value)
-                for key, value in proteinDfs.items()
-            },
-            "protein",
-        )
-    elif proteinQuantificationMethod == "maxlfq":
-        return run_maxlfq_on_all_proteins_found_across_runs(
-            proteinDfs, commonPeptidesDf
-        )
+    headerToProteinPresenceDict = {
+        header: set(proteinDf["leadingProtein"]) for header, proteinDf in proteinDfs.items()
+    }
+    proteinPeptideDict = (
+        pd.concat(list(proteinDfs.values()))
+        .groupby("leadingProtein")["peptide"]
+        .apply(set)
+        .to_dict()
+    )
+    proteinQuantitiesDict = {}
+    for protein, peptideSet in proteinPeptideDict.items():
+        peptideQuantityDf = commonPeptidesDf[list(peptideSet)]
+        peptideQuantityDf = set_non_present_protein_levels_to_zero(
+            peptideQuantityDf, protein, headerToProteinPresenceDict
+        )
+        if proteinQuantificationMethod == "sum":
+            proteinSampleQuantities = ion_count_sum(peptideQuantityDf)
+        elif proteinQuantificationMethod == "maxlfq":
+            proteinSampleQuantities = run_maxlfq_with_normalizations(peptideQuantityDf, minNumDifferences)
+        proteinQuantitiesDict[protein] = proteinSampleQuantities
+    commonProteinsDf = pd.DataFrame.from_dict(proteinQuantitiesDict)
+    commonProteinsDf.index = commonPeptidesDf.index
+    return commonProteinsDf
+
+
+def ion_count_sum(sampleByPeptideMatrix):
+    sampleByPeptideMatrix = sampleByPeptideMatrix.loc[:, (sampleByPeptideMatrix != 0).all(axis=0)]
+    return np.array(sampleByPeptideMatrix.sum(axis=1))
 
 
 def set_non_present_protein_levels_to_zero(
     peptideQuantityDf, protein, headerToProteinPresenceDict
@@ -82,40 +94,15 @@
     )
     peptideQuantityDf.loc[
         peptideQuantityDf.index.isin(indicesWithoutProtein), :
-    ] = -np.inf
+    ] = 0
     return peptideQuantityDf
 
 
-def run_maxlfq_on_all_proteins_found_across_runs(proteinDfs, commonPeptidesDf):
-    peptideProteinConnections = []
-    headerToProteinPresenceDict = {}
-    for header, proteinDf in proteinDfs.items():
-        peptideProteinConnectionsForSample = (
-            initialize__format_peptide_protein_connections(
-                proteinDf, proteinColumn="leadingProtein"
-            )
-        )
-        peptideProteinConnections.append(peptideProteinConnectionsForSample)
-        headerToProteinPresenceDict[header] = set(
-            peptideProteinConnectionsForSample["protein"]
-        )
-    proteinPeptideDict = (
-        pd.concat(peptideProteinConnections)
-        .groupby("protein")["peptide"]
-        .apply(set)
-        .to_dict()
-    )
-    normalizedCommonPeptideDf = np.log(commonPeptidesDf)
-    proteinQuantitiesDict = {}
-    for protein, peptideSet in proteinPeptideDict.items():
-        peptideQuantityDf = normalizedCommonPeptideDf[list(peptideSet)]
-        peptideQuantityDf = set_non_present_protein_levels_to_zero(
-            peptideQuantityDf, protein, headerToProteinPresenceDict
-        )
-        proteinQuantitiesDict[protein] = maxlfq(peptideQuantityDf.to_numpy())
-    commonProteinsDf = pd.DataFrame.from_dict(proteinQuantitiesDict)
-    commonProteinsDf.index = commonPeptidesDf.index
-    return np.exp(commonProteinsDf).replace(1, 0)
+def run_maxlfq_with_normalizations(peptideQuantityDf, minNumDifferences):
+    proteinSampleQuantities = maxlfq(np.log(peptideQuantityDf).to_numpy(), minNumDifferences)
+    proteinSampleQuantities = np.exp(proteinSampleQuantities)
+    proteinSampleQuantities[proteinSampleQuantities == 1] = 0
+    return proteinSampleQuantities
 
 
 def prepare_matrices_for_cholesky_factorization(
@@ -162,13 +149,13 @@ def calculate_protein_intensities_across_samples_from_cholesky_factorization(A,
     return linalg.cho_solve(a, B)
 
 
-def maxlfq(sampleByPeptideMatrix, tolerance=-10.0):
+def maxlfq(sampleByPeptideMatrix, minNumDifferences, tolerance=-10.0):
     sampleByPeptideMatrix[sampleByPeptideMatrix < tolerance] = 0
-    A, B = prepare_matrices_for_cholesky_factorization(sampleByPeptideMatrix)
-    oneOrFewerMatchIdx = np.diagonal(A) == 0
+    A, B = prepare_matrices_for_cholesky_factorization(sampleByPeptideMatrix, minNumDifferences)
+    unmatchedIdx = np.diagonal(A) == 0
     A, B = apply_regularization_to_matrices(sampleByPeptideMatrix, A, B)
     proteinScoreAcrossSamples = (
         calculate_protein_intensities_across_samples_from_cholesky_factorization(A, B)
     )
-    proteinScoreAcrossSamples[oneOrFewerMatchIdx] = 0
+    proteinScoreAcrossSamples[unmatchedIdx] = 0
    return proteinScoreAcrossSamples
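The `minNumDifferences` parameter is threaded down into `prepare_matrices_for_cholesky_factorization`, whose body is not shown in this diff. For orientation, here is a rough sketch of how such a threshold typically gates pairwise comparisons when assembling the maxLFQ least-squares system `A x = B` from a samples-by-peptides matrix of log intensities (0 marking a missing value); the function name and details below are assumptions, not this codebase's code:

```python
import numpy as np

def build_maxlfq_system(logIntensities, minNumDifferences=2):
    """Sketch: accumulate pairwise median log-differences between samples,
    skipping sample pairs that share fewer than minNumDifferences peptides."""
    numSamples = logIntensities.shape[0]
    A = np.zeros((numSamples, numSamples))
    B = np.zeros(numSamples)
    for i in range(numSamples):
        for j in range(i + 1, numSamples):
            shared = (logIntensities[i] != 0) & (logIntensities[j] != 0)
            if shared.sum() < minNumDifferences:
                continue  # too few shared peptides; no ratio accepted for this pair
            medianDifference = np.median(
                logIntensities[i, shared] - logIntensities[j, shared]
            )
            A[i, i] += 1
            A[j, j] += 1
            A[i, j] -= 1
            A[j, i] -= 1
            B[i] += medianDifference
            B[j] -= medianDifference
    return A, B
```

A sample that contributes to no accepted pair ends up with a zero diagonal entry in `A`, which is exactly what the renamed `unmatchedIdx = np.diagonal(A) == 0` check in `maxlfq` uses to zero out that sample's protein quantity.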
diff --git a/src/csodiaq/scoring/scoringFunctions.py b/src/csodiaq/scoring/scoringFunctions.py
index dd2280c..4790a8b 100644
--- a/src/csodiaq/scoring/scoringFunctions.py
+++ b/src/csodiaq/scoring/scoringFunctions.py
@@ -8,7 +8,7 @@ def score_library_to_query_matches(matches):
         matches.groupby(["libraryIdx", "queryIdx"])
         .apply(
             lambda x: calculate_cosine_similarity_score(
-                x["libraryIntensity"], x["queryIntensity"]
+                np.sqrt(x["libraryIntensity"]), np.sqrt(x["queryIntensity"])
             )
         )
         .reset_index(name="cosineScore")
diff --git a/tests/system/system_tests/identification/testFileContentCreators/BaselineSpectraBreakdown.py b/tests/system/system_tests/identification/testFileContentCreators/BaselineSpectraBreakdown.py
index 0bf747d..d34997a 100644
--- a/tests/system/system_tests/identification/testFileContentCreators/BaselineSpectraBreakdown.py
+++ b/tests/system/system_tests/identification/testFileContentCreators/BaselineSpectraBreakdown.py
@@ -250,6 +250,7 @@ def find_number_of_total_query_peaks_in_scan(spectra):
 
 
 def create_vector_that_can_be_used_to_create_cosine_score(vectorA, cosineScore):
+    vectorA = np.sqrt(vectorA)
     unitVector = vectorA / np.linalg.norm(vectorA)
     positiveVector = np.array(np.full((len(vectorA)), 1000.0))
     perpendicularVector = positiveVector - positiveVector.dot(unitVector) * unitVector
@@ -257,7 +258,7 @@ def create_vector_that_can_be_used_to_create_cosine_score(vectorA, cosineScore):
     vectorB = (
         cosineScore * unitVector + np.sqrt(1 - cosineScore**2) * perpendicularVector
     )
-    return vectorB
+    return vectorB ** 2
 
 
 def calculate_mz_value_from_ppm(mz, ppm):
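Both the scorer and this test helper now apply a square-root transform to intensities before computing cosine similarity, a common variance-stabilizing step that keeps a few dominant peaks from dictating the score; the helper consequently returns `vectorB ** 2` so that the `np.sqrt` applied during scoring recovers the vector constructed for the target cosine. A small standalone illustration (the `cosine` helper below is not the project's code):

```python
import numpy as np

def cosine(a, b):
    # plain cosine similarity between two vectors
    return a.dot(b) / (np.linalg.norm(a) * np.linalg.norm(b))

library = np.array([1.0, 10.0, 100.0, 1000.0])
query = np.array([2.0, 8.0, 120.0, 900.0])

rawScore = cosine(library, query)
sqrtScore = cosine(np.sqrt(library), np.sqrt(query))
# sqrtScore weighs the smaller peaks more evenly than rawScore does
print(rawScore, sqrtScore)
```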
diff --git a/tests/system/system_tests/identification/testFileContentCreators/CustomCorrectionSpectraBreakdown.py b/tests/system/system_tests/identification/testFileContentCreators/CustomCorrectionSpectraBreakdown.py
index 54ce398..0891224 100644
--- a/tests/system/system_tests/identification/testFileContentCreators/CustomCorrectionSpectraBreakdown.py
+++ b/tests/system/system_tests/identification/testFileContentCreators/CustomCorrectionSpectraBreakdown.py
@@ -10,7 +10,7 @@ def format_query_mz_values(self, libraryMzs, mzWindowGroup):
     numBins = 200
     ppmCenterBin = 10.0
     binWidth = 0.1
-    emptyBinNum = 4
+    emptyBinNum = 20
     centerBinDistribution = np.random.uniform(
         ppmCenterBin - (binWidth / 2), ppmCenterBin + (binWidth / 2), size=(4,)
     )
diff --git a/tests/system/system_tests/identification/test_identification.py b/tests/system/system_tests/identification/test_identification.py
index 25334f5..07892d2 100644
--- a/tests/system/system_tests/identification/test_identification.py
+++ b/tests/system/system_tests/identification/test_identification.py
@@ -277,9 +277,7 @@ def test__identification__baseline_no_matches_raises_error(
         outputDir,
         baselineSpectraBreakdown,
     )
-    assert_no_matches_after_correction_raises_warning(
-        inputQueryFile, libraryFile, outputDir, baselineSpectraBreakdown
-    )
+
 
 
 def assert_no_matches_before_correction_raises_warning_and_skips_offending_file_only(
@@ -321,6 +319,17 @@ def assert_no_matches_before_correction_raises_warning_and_skips_offending_file_only(
 def assert_no_matches_after_correction_raises_warning(
     inputQueryFile, libraryFile, outputDir, baselineSpectraBreakdown
 ):
+    """
+    Currently unused. The applicability of this error is questionable, and the check may be removed entirely in the future.
+
+    If restored, the following call would be added back to 'test__identification__baseline_no_matches_raises_error',
+    though it would require input data with wider differences between the m/z values in each spectrum:
+    '''
+    assert_no_matches_after_correction_raises_warning(
+        inputQueryFile, libraryFile, outputDir, baselineSpectraBreakdown
+    )
+    '''
+    """
     args = [
         "csodiaq",
         "id",
diff --git a/tests/system/system_tests/scoring/__init__.py b/tests/system/system_tests/scoring/__init__.py
index 389bbc5..445659a 100644
--- a/tests/system/system_tests/scoring/__init__.py
+++ b/tests/system/system_tests/scoring/__init__.py
@@ -15,7 +15,7 @@
     OneMatchSample1And2Breakdown,
     OneMatchSample3Breakdown,
 )
-from .testFileContentCreators.OverlapCommonProteinScoresBreakdown import (
-    OverlapSample1And2Breakdown,
-    OvelapSample3Breakdown,
+from .testFileContentCreators.ClusteredCommonProteinScoresBreakdown import (
+    ClusterSample1And2Breakdown,
+    ClusterSample3And4Breakdown,
 )
diff --git a/tests/system/system_tests/scoring/testFileContentCreators/ClusteredCommonProteinScoresBreakdown.py b/tests/system/system_tests/scoring/testFileContentCreators/ClusteredCommonProteinScoresBreakdown.py
new file mode 100644
index 0000000..a201d77
--- /dev/null
+++ b/tests/system/system_tests/scoring/testFileContentCreators/ClusteredCommonProteinScoresBreakdown.py
@@ -0,0 +1,60 @@
+import pandas as pd
+from .BaselineCommonProteinScoresBreakdown import BaselineCommonProteinScoresBreakdown
+
+"""
+NOTE: this breakdown is specific to the maxlfq method. It tests the viability
+    of having two unconnected clusters of peptides that correspond to the
+    same protein.
+
+protein quantification profile:
+        peptide1 peptide2 peptide3 peptide4
+sample1 100.0    100.0    -        -
+sample2 100.0    100.0    -        -
+sample3 -        -        200.0    200.0
+sample4 -        -        200.0    200.0
+
+protein quantification:
+        protein
+sample1 100.0
+sample2 100.0
+sample3 200.0
+sample4 200.0
+"""
+
+
+class ClusterSample1And2Breakdown(BaselineCommonProteinScoresBreakdown):
+    def _create_input_for_quantifiable_proteins(self):
+        df = pd.DataFrame(
+            [
+                ["peptide1", "1/protein", 1, 0, 0, 100.0],
+                ["peptide2", "1/protein", 1, 0, 0, 100.0],
+                ["peptide3", "1/protein", 1, 0, 0, 0.0],
+                ["peptide4", "1/protein", 1, 0, 0, 0.0],
+            ],
+            columns=["peptide", "protein", "zLIB", "isDecoy", "rank", "ionCount"],
+        )
+        return pd.concat([df, self._create_generic_peptide_line()])
+
+    def _make_expected_protein_output(self, peptideDf):
+        proteinDf = super()._make_expected_protein_output(peptideDf)
+        proteinDf["uniquePeptide"] = [0, 0, 0, 0, 1]
+        return proteinDf
+
+
+class ClusterSample3And4Breakdown(BaselineCommonProteinScoresBreakdown):
+    def _create_input_for_quantifiable_proteins(self):
+        df = pd.DataFrame(
+            [
+                ["peptide1", "1/protein", 1, 0, 0, 0.0],
+                ["peptide2", "1/protein", 1, 0, 0, 0.0],
+                ["peptide3", "1/protein", 1, 0, 0, 200.0],
+                ["peptide4", "1/protein", 1, 0, 0, 200.0],
+            ],
+            columns=["peptide", "protein", "zLIB", "isDecoy", "rank", "ionCount"],
+        )
+        return pd.concat([df, self._create_generic_peptide_line()])
+
+    def _make_expected_protein_output(self, peptideDf):
+        proteinDf = super()._make_expected_protein_output(peptideDf)
+        proteinDf["uniquePeptide"] = [0, 0, 0, 0, 1]
+        return proteinDf
diff --git a/tests/system/system_tests/scoring/testFileContentCreators/MethodComparisonCommonProteinScoresBreakdown.py b/tests/system/system_tests/scoring/testFileContentCreators/MethodComparisonCommonProteinScoresBreakdown.py
index 445437a..4bc30d3 100644
--- a/tests/system/system_tests/scoring/testFileContentCreators/MethodComparisonCommonProteinScoresBreakdown.py
+++ b/tests/system/system_tests/scoring/testFileContentCreators/MethodComparisonCommonProteinScoresBreakdown.py
@@ -11,9 +11,9 @@
-protein quantification for 'average' method:
+protein quantification for 'sum' method:
         protein
-sample1 200.0
-sample2 233.3
-sample3 266.7
+sample1 600.0
+sample2 700.0
+sample3 800.0
 
 protein quantification for 'maxlfq' method:
         protein
diff --git a/tests/system/system_tests/scoring/testFileContentCreators/OverlapCommonProteinScoresBreakdown.py b/tests/system/system_tests/scoring/testFileContentCreators/OverlapCommonProteinScoresBreakdown.py
deleted file mode 100644
index 2ac1a05..0000000
--- a/tests/system/system_tests/scoring/testFileContentCreators/OverlapCommonProteinScoresBreakdown.py
+++ /dev/null
@@ -1,73 +0,0 @@
-import pandas as pd
-from .BaselineCommonProteinScoresBreakdown import BaselineCommonProteinScoresBreakdown
-
-"""
-NOTE: this is specific to the maxlfq method. Samples with 1 or 0 peptides
-    shared with other samples are considered 0.
-
-protein quantification profile for protein 1:
-        peptide1 peptide2 peptide3
-sample1 X        X        X
-sample2 X        X        X
-sample3 -        X        X
-
-protein quantification profile for protein 2:
-        peptide2 peptide3 peptide4
-sample1 X        X        -
-sample2 X        X        -
-sample3 X        X        X
-
-protein 1 quantification:
-        protein
-sample1 X
-sample2 X
-sample3 0
-
-protein 2 quantification for average method:
-        protein
-sample1 0
-sample2 0
-sample3 X
-
-protein 2 quantification for maxlfq method:
-        protein
-sample1 0
-sample2 0
-sample3 0
-"""
-
-
-class OverlapSample1And2Breakdown(BaselineCommonProteinScoresBreakdown):
-    def _create_input_for_quantifiable_proteins(self):
-        df = pd.DataFrame(
-            [
-                ["peptide1", "1/protein1", 1, 0, 0, 100.0],
-                ["peptide2", "2/protein1/protein2", 1, 0, 0, 100.0],
-                ["peptide3", "2/protein1/protein2", 1, 0, 0, 100.0],
-            ],
-            columns=["peptide", "protein", "zLIB", "isDecoy", "rank", "ionCount"],
-        )
-        return pd.concat([df, self._create_generic_peptide_line()])
-
-    def _make_expected_protein_output(self, peptideDf):
-        proteinDf = super()._make_expected_protein_output(peptideDf)
-        proteinDf["uniquePeptide"] = [1, 0, 0, 1]
-        return proteinDf
-
-
-class OvelapSample3Breakdown(BaselineCommonProteinScoresBreakdown):
-    def _create_input_for_quantifiable_proteins(self):
-        df = pd.DataFrame(
-            [
-                ["peptide2", "2/protein1/protein2", 1, 0, 0, 100.0],
-                ["peptide3", "2/protein1/protein2", 1, 0, 0, 100.0],
-                ["peptide4", "1/protein2", 1, 0, 0, 100.0],
-            ],
-            columns=["peptide", "protein", "zLIB", "isDecoy", "rank", "ionCount"],
-        )
-        return pd.concat([df, self._create_generic_peptide_line()])
-
-    def _make_expected_protein_output(self, peptideDf):
-        proteinDf = super()._make_expected_protein_output(peptideDf)
-        proteinDf["uniquePeptide"] = [0, 0, 1, 1]
-        return proteinDf
diff --git a/tests/system/system_tests/scoring/testFileContentCreators/StandardCommonProteinScoresBreakdown.py b/tests/system/system_tests/scoring/testFileContentCreators/StandardCommonProteinScoresBreakdown.py
index 483c146..5d3e12f 100644
--- a/tests/system/system_tests/scoring/testFileContentCreators/StandardCommonProteinScoresBreakdown.py
+++ b/tests/system/system_tests/scoring/testFileContentCreators/StandardCommonProteinScoresBreakdown.py
@@ -8,11 +8,18 @@
 sample2 X X
 sample3 - -
 
-protein quantification:
+protein quantification for the maxlfq method:
         protein
 sample1 X
 sample2 X
 sample3 0
+
+protein quantification for the sum method (only peptides found in all samples are counted):
+        protein
+sample1 0
+sample2 0
+sample3 0
+
 """
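The all-zero expectation for the sum method above follows directly from `ion_count_sum`, which drops any peptide column containing a zero before summing. A quick demonstration using the function exactly as added in quantificationFunctions.py:

```python
import numpy as np
import pandas as pd

def ion_count_sum(sampleByPeptideMatrix):
    # copied from the diff above: keep only peptides present (nonzero) in every sample
    sampleByPeptideMatrix = sampleByPeptideMatrix.loc[:, (sampleByPeptideMatrix != 0).all(axis=0)]
    return np.array(sampleByPeptideMatrix.sum(axis=1))

# the profile from the docstring above: the protein is missing from sample3
profile = pd.DataFrame(
    {"peptide1": [100.0, 100.0, 0.0], "peptide2": [100.0, 100.0, 0.0]},
    index=["sample1", "sample2", "sample3"],
)
print(ion_count_sum(profile))  # [0. 0. 0.] -- no peptide survives the filter
```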
diff --git a/tests/system/system_tests/scoring/test_scoring.py b/tests/system/system_tests/scoring/test_scoring.py
index 3aa3806..dd23d5e 100644
--- a/tests/system/system_tests/scoring/test_scoring.py
+++ b/tests/system/system_tests/scoring/test_scoring.py
@@ -14,8 +14,8 @@
     MethodSample3Breakdown,
     OneMatchSample1And2Breakdown,
     OneMatchSample3Breakdown,
-    OverlapSample1And2Breakdown,
-    OvelapSample3Breakdown,
+    ClusterSample1And2Breakdown,
+    ClusterSample3And4Breakdown,
 )
 
 
@@ -276,16 +276,16 @@ def test__scoring__evaluate_common_proteins_has_zero_quantity_in_missing_sample_
 ):
     expectedCommonProteinDf = pd.DataFrame(
         [
-            [100.0, 100.0],
-            [100.0, 100.0],
+            [0.0, 100.0],
+            [0.0, 100.0],
             [0.0, 100.0],
         ],
-        columns=["protein", "proteinX"],
+        columns=["1/protein", "1/proteinX"],
     )
     evaluate_standard_protein_quant_comparison(
         inputFileDirectory,
         expectedOutputDirectory,
-        method="average",
+        method="sum",
         expectedCommonProteinDf=expectedCommonProteinDf,
     )
@@ -299,7 +299,7 @@ def test__scoring__evaluate_common_proteins_has_zero_quantity_in_missing_sample_
             [100.0, 0.0],
             [0.0, 0.0],
         ],
-        columns=["protein", "proteinX"],
+        columns=["1/protein", "1/proteinX"],
     )
     evaluate_standard_protein_quant_comparison(
         inputFileDirectory,
@@ -378,16 +378,16 @@ def test__scoring__evaluate_common_proteins_average_method_works_correctly(
 ):
     expectedCommonProteinDf = pd.DataFrame(
         [
-            [200.0, 100.0],
-            [233.333333333333, 100.0],
-            [266.666666666667, 100.0],
+            [600.0, 100.0],
+            [700.0, 100.0],
+            [800.0, 100.0],
         ],
-        columns=["protein", "proteinX"],
+        columns=["1/protein", "1/proteinX"],
     )
     evaluate_method_protein_quant_comparison(
         inputFileDirectory,
         expectedOutputDirectory,
-        method="average",
+        method="sum",
         expectedCommonProteinDf=expectedCommonProteinDf,
     )
@@ -401,7 +401,7 @@ def test__scoring__evaluate_common_proteins_maxlfq_method_works_correctly(
         [391.487326, 0.0],
         [391.493149, 0.0],
     ],
-        columns=["protein", "proteinX"],
+        columns=["1/protein", "1/proteinX"],
     )
     evaluate_method_protein_quant_comparison(
         inputFileDirectory,
@@ -410,11 +410,10 @@ def test__scoring__evaluate_common_proteins_maxlfq_method_works_correctly(
         expectedCommonProteinDf=expectedCommonProteinDf,
     )
 
-
-def test__scoring__evaluate_common_proteins_one_peptide_match_excludes_protein_in_maxlfq(
-    inputFileDirectory, expectedOutputDirectory
+def evaluate_common_proteins_one_peptide_match_in_maxlfq(
+    inputFileDirectory, expectedOutputDirectory, expectedCommonProteinDf, minNumDifferences
 ):
-    inputHeader = f"one_match_common_proteins"
+    inputHeader = f"one_match_common_proteins_{minNumDifferences}minNumDifferences"
     inputFileDirectoryChild = os.path.join(inputFileDirectory, inputHeader)
     os.mkdir(inputFileDirectoryChild)
@@ -443,6 +442,8 @@
         "score",
         "-i",
         inputFileDirectoryChild,
+        "-min",
+        str(minNumDifferences),
     ]
     subprocess.run(args, capture_output=True)
     assert_common_peptide_outputs_are_correct(
@@ -457,59 +458,84 @@
     outputDirPath = os.path.join(inputFileDirectoryChild, "fdrScores-macc-maxlfq")
     outputDirContents = os.listdir(outputDirPath)
     assert "commonProteins.csv" in outputDirContents
+    expectedCommonProteinDf = expectedCommonProteinDf.set_index(
+        pd.Index(
+            [
+                f"CsoDIAq-file_{sample1Header}_fullOutput",
+                f"CsoDIAq-file_{sample2Header}_fullOutput",
+                f"CsoDIAq-file_{sample3Header}_fullOutput",
+            ]
+        )
+    )
+    commonProteinDf = pd.read_csv(
+        os.path.join(outputDirPath, "commonProteins.csv"), index_col=0
+    ).sort_index()
+    assert_pandas_dataframes_are_equal(expectedCommonProteinDf, commonProteinDf)
+
+
+def test__scoring__evaluate_common_proteins_one_peptide_match__excludes_protein_in_maxlfq_when_minNumDifferences_2(
+    inputFileDirectory, expectedOutputDirectory
+):
     expectedCommonProteinDf = pd.DataFrame(
         [
             [100.0, 0.0],
             [100.0, 0.0],
             [0.0, 0.0],
         ],
-        columns=["protein", "proteinX"],
-        index=[
-            f"CsoDIAq-file_{sample1Header}_fullOutput",
-            f"CsoDIAq-file_{sample2Header}_fullOutput",
-            f"CsoDIAq-file_{sample3Header}_fullOutput",
-        ],
+        columns=["1/protein", "1/proteinX"],
     )
-    commonProteinDf = pd.read_csv(
-        os.path.join(outputDirPath, "commonProteins.csv"), index_col=0
-    ).sort_index()
-    assert_pandas_dataframes_are_equal(expectedCommonProteinDf, commonProteinDf)
+    evaluate_common_proteins_one_peptide_match_in_maxlfq(
+        inputFileDirectory, expectedOutputDirectory, expectedCommonProteinDf, minNumDifferences=2
+    )
+
+
+def test__scoring__evaluate_common_proteins_one_peptide_match__includes_protein_in_maxlfq_when_minNumDifferences_1(
+    inputFileDirectory, expectedOutputDirectory
+):
+    expectedCommonProteinDf = pd.DataFrame(
+        [
+            [100.0, 100.0],
+            [100.0, 100.0],
+            [100.0, 100.0],
+        ],
+        columns=["1/protein", "1/proteinX"],
+    )
+    evaluate_common_proteins_one_peptide_match_in_maxlfq(
+        inputFileDirectory, expectedOutputDirectory, expectedCommonProteinDf, minNumDifferences=1
+    )
 
 
-def evaluate_overlap_protein_quant_comparison(
-    inputFileDirectory, expectedOutputDirectory, method, expectedCommonProteinDf
+def test__scoring__evaluate_protein_quantities_are_calculated_even_with_multiple_peptide_clusters(
+    inputFileDirectory, expectedOutputDirectory
 ):
-    inputHeader = f"overlap_common_proteins_{method}_method"
+    inputHeader = "maxlfq_multiple_peptide_clusters"
     inputFileDirectoryChild = os.path.join(inputFileDirectory, inputHeader)
     os.mkdir(inputFileDirectoryChild)
-    sample1Header = "overlap_common_protein_eval_sample_1"
-    sample1Breakdown = OverlapSample1And2Breakdown(expectedOutputDirectory)
+    sample1Header = "maxlfq_multiple_peptide_clusters_sample_1"
+    sample1Breakdown = ClusterSample1And2Breakdown(expectedOutputDirectory)
     inputFilePath = os.path.join(
         inputFileDirectoryChild, f"CsoDIAq-file_{sample1Header}_fullOutput.csv"
     )
     sample1Breakdown.inputDf.to_csv(inputFilePath, index=False)
-    sample2Header = "overlap_common_protein_eval_sample_2"
-    sample2Breakdown = OverlapSample1And2Breakdown(expectedOutputDirectory)
+    sample2Header = "maxlfq_multiple_peptide_clusters_sample_2"
+    sample2Breakdown = ClusterSample1And2Breakdown(expectedOutputDirectory)
     inputFilePath = os.path.join(
         inputFileDirectoryChild, f"CsoDIAq-file_{sample2Header}_fullOutput.csv"
     )
     sample2Breakdown.inputDf.to_csv(inputFilePath, index=False)
-    sample3Header = "overlap_common_protein_eval_sample_3"
-    sample3Breakdown = OvelapSample3Breakdown(expectedOutputDirectory)
+    sample3Header = "maxlfq_multiple_peptide_clusters_sample_3"
+    sample3Breakdown = ClusterSample3And4Breakdown(expectedOutputDirectory)
     inputFilePath = os.path.join(
         inputFileDirectoryChild, f"CsoDIAq-file_{sample3Header}_fullOutput.csv"
     )
     sample3Breakdown.inputDf.to_csv(inputFilePath, index=False)
+    sample4Header = "maxlfq_multiple_peptide_clusters_sample_4"
+    sample4Breakdown = ClusterSample3And4Breakdown(expectedOutputDirectory)
+    inputFilePath = os.path.join(
+        inputFileDirectoryChild, f"CsoDIAq-file_{sample4Header}_fullOutput.csv"
+    )
+    sample4Breakdown.inputDf.to_csv(inputFilePath, index=False)
     args = [
         "csodiaq",
         "score",
         "-i",
         inputFileDirectoryChild,
-        "-p",
-        method,
     ]
     subprocess.run(args, capture_output=True)
     assert_common_peptide_outputs_are_correct(
@@ -518,61 +544,30 @@
             sample1Header: sample1Breakdown,
             sample2Header: sample2Breakdown,
             sample3Header: sample3Breakdown,
+            sample4Header: sample4Breakdown,
         },
-        method=method,
+        method="maxlfq",
     )
-    outputDirPath = os.path.join(inputFileDirectoryChild, f"fdrScores-macc-{method}")
+    outputDirPath = os.path.join(inputFileDirectoryChild, "fdrScores-macc-maxlfq")
     outputDirContents = os.listdir(outputDirPath)
     assert "commonProteins.csv" in outputDirContents
-    expectedCommonProteinDf = expectedCommonProteinDf.set_index(
-        pd.Index(
-            [
+    expectedCommonProteinDf = pd.DataFrame(
+        [
+            [100.0, 0.0],
+            [100.0, 0.0],
+            [200.0, 0.0],
+            [200.0, 0.0],
+        ],
+        columns=["1/protein", "1/proteinX"],
+        index=[
+            f"CsoDIAq-file_{sample1Header}_fullOutput",
+            f"CsoDIAq-file_{sample2Header}_fullOutput",
f"CsoDIAq-file_{sample3Header}_fullOutput", + f"CsoDIAq-file_{sample4Header}_fullOutput", ] - ) ) commonProteinDf = pd.read_csv( os.path.join(outputDirPath, "commonProteins.csv"), index_col=0 ).sort_index() - assert_pandas_dataframes_are_equal(expectedCommonProteinDf, commonProteinDf) - - -def test__scoring__evaluate_common_proteins_work_correctly_with_overlapping_proteins__average_method( - inputFileDirectory, expectedOutputDirectory -): - expectedCommonProteinDf = pd.DataFrame( - [ - [100.0, 0.0, 100.0], - [100.0, 0.0, 100.0], - [0.0, 100.0, 100.0], - ], - columns=["protein1", "protein2", "proteinX"], - ) - evaluate_overlap_protein_quant_comparison( - inputFileDirectory, - expectedOutputDirectory, - method="average", - expectedCommonProteinDf=expectedCommonProteinDf, - ) - - -def test__scoring__evaluate_common_proteins_work_correctly_with_overlapping_proteins__maxlfq_method( - inputFileDirectory, expectedOutputDirectory -): - expectedCommonProteinDf = pd.DataFrame( - [ - [100.0, 0.0, 0.0], - [100.0, 0.0, 0.0], - [0.0, 0.0, 0.0], - ], - columns=["protein1", "protein2", "proteinX"], - ) - evaluate_overlap_protein_quant_comparison( - inputFileDirectory, - expectedOutputDirectory, - method="maxlfq", - expectedCommonProteinDf=expectedCommonProteinDf, - ) + assert_pandas_dataframes_are_equal(expectedCommonProteinDf, commonProteinDf) \ No newline at end of file diff --git a/tests/system/v1_to_v2/test_compareToV1.py b/tests/system/v1_to_v2/test_compareToV1.py index 9970758..5d34102 100644 --- a/tests/system/v1_to_v2/test_compareToV1.py +++ b/tests/system/v1_to_v2/test_compareToV1.py @@ -90,43 +90,6 @@ def get_columns_that_should_match(type): "leadingProteinFDR", ] - -def test__identifier__main_workflow(commandLineArgs): - identifier = Identifier(commandLineArgs, isTesting=True) - queryFile = commandLineArgs["input"][0] - identifier._queryContext = QueryLoaderContext(queryFile) - - expectedMatchDf = pd.read_csv( - get_file_from_system_test_folder("matchDf_precorrected.csv.gz"), - compression="gzip", - ) - matchDf = identifier._match_library_to_query_spectra() - assert_numeric_pandas_dataframes_are_equal(expectedMatchDf, matchDf, "match") - - expectedCorrectedMatchDf = pd.read_csv( - get_file_from_system_test_folder("matchDf_postcorrected.csv.gz"), - compression="gzip", - ) - matchDf = identifier._apply_correction_to_match_dataframe(matchDf) - assert_numeric_pandas_dataframes_are_equal( - expectedCorrectedMatchDf, matchDf, "match" - ) - - expectedScoreDf = pd.read_csv( - get_file_from_system_test_folder("scoreDf.csv.gz"), - compression="gzip", - ) - scoreDf = identifier._score_spectra_matches(matchDf) - assert_numeric_pandas_dataframes_are_equal(expectedScoreDf, scoreDf, "score") - - expectedFullDf = pd.read_csv( - get_file_from_system_test_folder("v1FullOutput.csv.gz"), compression="gzip" - ) - - fullDf = identifier._format_identifications_as_dataframe(matchDf, scoreDf) - assert_numeric_pandas_dataframes_are_equal(expectedFullDf, fullDf, "full") - - def test__scoring_workflow(): fullDf = pd.read_csv( get_file_from_system_test_folder("v2FullOutput.csv.gz"), compression="gzip" diff --git a/tests/system/v1_to_v2/test_files/v1FullOutput.csv.gz b/tests/system/v1_to_v2/test_files/v1FullOutput.csv.gz deleted file mode 100644 index 0e57a36..0000000 Binary files a/tests/system/v1_to_v2/test_files/v1FullOutput.csv.gz and /dev/null differ diff --git a/tests/unit/identification/test_matchingFunctions.py b/tests/unit/identification/test_matchingFunctions.py index d136be2..3059d9a 100644 --- 
diff --git a/tests/unit/identification/test_matchingFunctions.py b/tests/unit/identification/test_matchingFunctions.py
index d136be2..3059d9a 100644
--- a/tests/unit/identification/test_matchingFunctions.py
+++ b/tests/unit/identification/test_matchingFunctions.py
@@ -151,7 +151,7 @@ def test__score_functions__calculate_ppm_offset_tolerance_using_tallest_bin_peak
     numbers += [mediumLeftNeighboringBin] * (tallestBinQuantity // 2 - 1)
     numbers += [mediumRightNeighboringBin] * (tallestBinQuantity // 2 - 1)
     expectedOffset = tallestBin
-    expectedTolerance = 2
+    expectedTolerance = 3
     offset, tolerance = calculate_ppm_offset_tolerance_using_tallest_bin_peak(numbers)
     assert abs(offset - expectedOffset) < 0.5
     assert abs(tolerance - expectedTolerance) < 0.5
diff --git a/tests/unit/scoring/test_quantificationFunctions.py b/tests/unit/scoring/test_quantificationFunctions.py
index d617c17..696b84b 100644
--- a/tests/unit/scoring/test_quantificationFunctions.py
+++ b/tests/unit/scoring/test_quantificationFunctions.py
@@ -9,6 +9,7 @@
     compile_common_protein_quantification_file,
     set_non_present_protein_levels_to_zero,
     maxlfq,
+    ion_count_sum,
 )
 
 
@@ -96,19 +97,26 @@
-def test__quantification_functions__compile_common_protein_quantification_file__averaging_method():
+def test__quantification_functions__compile_common_protein_quantification_file__sum_method():
     inputData = [
-        ["1/protein1", 100.0],
-        ["1/protein1", 200.0],
-        ["1/protein1", 300.0],
-        ["2/protein2/protein3", 400.0],
-        ["2/protein2/protein3", 500.0],
+        ["peptide1", "1/protein1"],
+        ["peptide2", "1/protein1"],
+        ["peptide3", "1/protein1"],
+        ["peptide4", "2/protein2/protein3"],
+        ["peptide5", "2/protein2/protein3"],
     ]
-    inputDf1 = pd.DataFrame(inputData, columns=["leadingProtein", "ionCount"])
+    inputDf1 = pd.DataFrame(inputData, columns=["peptide", "leadingProtein"])
     inputDf2 = inputDf1.copy()
     fileHeader1 = "test1"
     fileHeader2 = "test2"
+    commonPeptidesDf = pd.DataFrame(
+        [[100.0, 200.0, 300.0, 400.0, 500.0] for _ in range(2)],
+        columns=["peptide1", "peptide2", "peptide3", "peptide4", "peptide5"],
+        index=[fileHeader1, fileHeader2],
+    )
     expectedOutputDf = pd.DataFrame(
-        [[200.0, 450.0, 450.0], [200.0, 450.0, 450.0]],
-        columns=["protein1", "protein2", "protein3"],
+        [[600.0, 900.0], [600.0, 900.0]],
+        columns=["1/protein1", "2/protein2/protein3"],
         index=[fileHeader1, fileHeader2],
     )
     outputDf = compile_common_protein_quantification_file(
@@ -116,8 +124,9 @@
         {
             fileHeader1: inputDf1,
             fileHeader2: inputDf2,
         },
-        commonPeptidesDf=None,
-        proteinQuantificationMethod="average",
+        commonPeptidesDf=commonPeptidesDf,
+        proteinQuantificationMethod="sum",
+        minNumDifferences=None,
     )
     assert expectedOutputDf.equals(outputDf)
@@ -144,7 +153,7 @@ def test__set_non_present_protein_levels_to_zero():
         [
             [100.0, 100.0],
             [100.0, 100.0],
-            [-np.inf, -np.inf],
+            [0, 0],
         ],
         columns=peptides,
         index=samples,
@@ -155,7 +164,70 @@
     assert expectedOutputDf.equals(outputDf)
 
 
-def test__maxlfq__from_paper():
+def test__ion_count_sum():
+    """
+    Tests the ion count sum method of protein quantification.
+
+    The test is based on the following protein profile (peptide columns, sample rows):
+       P1 P2 P3 P4 P5 P6
+    A  -  X  -  -  X  -
+    B  -  X  X  -  X  -
+    C  X  X  X  X  X  X
+    D  X  X  -  X  X  X
+
+    Peptides missing from any sample are expected to be filtered out:
+       P2 P5
+    A  X  X
+    B  X  X
+    C  X  X
+    D  X  X
+
+    The remaining peptides are then summed across samples. So with the following example:
+
+       P1   P2    P3     P4
+    A  4.0  40.0  400.0  0
+    B  3.0  30.0  300.0  0
+    C  0    20.0  200.0  2000.0
+    D  0    10.0  100.0  1000.0
+
+    it would filter down to the following:
+       P2    P3
+    A  40.0  400.0
+    B  30.0  300.0
+    C  20.0  200.0
+    D  10.0  100.0
+
+    and result in the following protein quantities across samples:
+       protein
+    A  440.0
+    B  330.0
+    C  220.0
+    D  110.0
+    """
+    numPeptides = 4
+    numSamples = 4
+    peptideIntensityIncrease = [10**i for i in range(numPeptides)]
+    sampleIntensityIncrease = list(range(numSamples, 0, -1))
+    inputDf = pd.DataFrame(
+        np.multiply.outer(sampleIntensityIncrease, peptideIntensityIncrease).astype(float),
+        columns=[f"P{i}" for i in range(1, numPeptides + 1)],
+        index=["A", "B", "C", "D"],
+    )
+    cellsToDelete = [
+        (0, 3),
+        (1, 3),
+        (2, 0),
+        (3, 0),
+    ]
+    for cell in cellsToDelete:
+        inputDf.iloc[cell[0], cell[1]] = 0
+
+    expectedOutput = np.array([440.0, 330.0, 220.0, 110.0])
+    output = ion_count_sum(inputDf)
+    np.testing.assert_array_almost_equal(expectedOutput, output)
+
+
+def test__maxlfq__from_paper__default_2_peptide_connections_required_per_sample():
     """
     Tests the maxLFQ algorithm for protein quantification, as summarized in
     figure 2 of the original paper.
@@ -200,6 +272,11 @@
     F    6:1  5:1  4:1  3:1  2:1  -
 
     Applying the maxLFQ function to this input, you'd expect quantities that reflect these ratios.
+
+    NOTE: By default, this algorithm only uses differences/ratios calculated between samples that
+    share at least 2 peptides, and excludes all others. Sample F, which has no such
+    differences/ratios, is therefore quantified as 0. See the next test for relaxing this setting
+    to require only 1 shared peptide, which then yields a quantity for sample F.
     """
     numPeptides = 7
     numSamples = 6
@@ -241,5 +318,51 @@
         [7.83589964, 7.65357809, 7.43065038, 7.14296831, 6.73758953, 0]
     )
-    output = maxlfq(normalizedInputDf.to_numpy())
+    output = maxlfq(normalizedInputDf.to_numpy(), minNumDifferences=2)
     np.testing.assert_array_almost_equal(expectedOutput, output)
+
+
+def test__maxlfq__from_paper__only_1_peptide_connection_required_per_sample():
+    """
+    Evaluates the example above, but requires only 1 shared peptide between samples.
+ """ + numPeptides = 7 + numSamples = 6 + peptideIntensityIncrease = [10**i for i in range(-3, 4)] + sampleIntensityIncrease = list(range(numSamples, 0, -1)) + inputDf = pd.DataFrame( + np.multiply.outer(sampleIntensityIncrease, peptideIntensityIncrease), + columns=[f"P{i}" for i in range(1, numPeptides + 1)], + index=["A", "B", "C", "D", "E", "F"], + ) + cellsToDelete = [ + (0, 0), + (0, 2), + (0, 3), + (0, 4), + (0, 6), + (1, 0), + (1, 3), + (1, 4), + (1, 6), + (2, 4), + (3, 2), + (3, 4), + (4, 0), + (4, 2), + (4, 4), + (4, 5), + (5, 0), + (5, 2), + (5, 3), + (5, 5), + (5, 6), + ] + for cell in cellsToDelete: + inputDf.iloc[cell[0], cell[1]] = 0 + + normalizedInputDf = np.log(inputDf) + expectedOutput = np.array( + [7.164394, 6.982073, 6.759121, 6.471439, 6.065974, 5.372443] + ) + output = maxlfq(normalizedInputDf.to_numpy(), minNumDifferences=1) + np.testing.assert_array_almost_equal(expectedOutput, output) \ No newline at end of file diff --git a/tests/unit/scoring/test_scoringFunctions.py b/tests/unit/scoring/test_scoringFunctions.py index 22c35bf..9b6beba 100644 --- a/tests/unit/scoring/test_scoringFunctions.py +++ b/tests/unit/scoring/test_scoringFunctions.py @@ -54,7 +54,7 @@ def test__score_functions__score_library_to_query_matches(vectorA, vectorB): matchesDf["libraryIntensity"] = vectorA matchesDf["queryIdx"] = [queryIdx for i in vectorA.index] matchesDf["queryIntensity"] = vectorB - cosineScore = calculate_cosine_similarity_score(vectorA, vectorB) + cosineScore = calculate_cosine_similarity_score(np.sqrt(vectorA), np.sqrt(vectorB)) expectedOutputDf = pd.DataFrame( data=[[libraryIdx, queryIdx, cosineScore]], columns=["libraryIdx", "queryIdx", "cosineScore"], @@ -66,7 +66,7 @@ def test__score_functions__score_library_to_query_matches(vectorA, vectorB): lowScoreMatchesDf["libraryIdx"] = [libraryIdx - 1 for i in vectorA.index] reverseVectorA = pd.Series(list(vectorA)[::-1]) lowScoreMatchesDf["libraryIntensity"] = reverseVectorA - lowCosineScore = calculate_cosine_similarity_score(reverseVectorA, vectorB) + lowCosineScore = calculate_cosine_similarity_score(np.sqrt(reverseVectorA), np.sqrt(vectorB)) unsortedMatchesDf = pd.concat([lowScoreMatchesDf, matchesDf]) expectedOutputDf = pd.DataFrame( data=[ diff --git a/tests/unit/test_csodiaqParser.py b/tests/unit/test_csodiaqParser.py index 80065f2..adb34da 100644 --- a/tests/unit/test_csodiaqParser.py +++ b/tests/unit/test_csodiaqParser.py @@ -2,6 +2,7 @@ import argparse import re import os +import warnings from csodiaq import set_args_from_command_line_input, check_for_conflicting_args from csodiaq.csodiaqParser import ( _OutputDirectory, @@ -670,6 +671,7 @@ def test__csodiaq_parser__set_args_from_command_line_input__initialize_scoring( assert len(parsedScoreArgs["input"]["idFiles"]) == 2 assert parsedScoreArgs["score"] == "macc" assert parsedScoreArgs["proteinQuantMethod"] == "maxlfq" + assert parsedScoreArgs["minNumDifferences"] == 2 def test__csodiaq_parser__set_args_from_command_line_input__score_succeeds_with_macc_input( @@ -713,10 +715,87 @@ def test__csodiaq_parser__set_args_from_command_line_input__score_suceeds_with_m def test__csodiaq_parser__set_args_from_command_line_input__score_suceeds_with_maxlfq_protein_quant_method( parser, scoreArgs ): - scoreArgs += ["-p", "average"] + scoreArgs += ["-p", "sum"] + args = vars(parser.parse_args(scoreArgs)) + assert args["proteinQuantMethod"] == "sum" + +def test__csodiaq_parser__set_args_from_command_line_input__score_fails_with_invalid_protein_quant_method( + parser, scoreArgs +): + 
scoreArgs += ["-p", "shouldFail"] + errorOutput = ( + "argument -p/--proteinQuantMethod: invalid choice: 'shouldFail' (choose from 'maxlfq', 'sum')" + ) + with pytest.raises(argparse.ArgumentTypeError, match=re.escape(errorOutput)): + args = vars(parser.parse_args(scoreArgs)) + +def test__csodiaq_parser__set_args_from_command_line_input__score_suceeds_with_min_num_differences_equal_to_1( + parser, scoreArgs +): + scoreArgs += ["-min", "1"] args = vars(parser.parse_args(scoreArgs)) - assert args["proteinQuantMethod"] == "average" + assert args["minNumDifferences"] == 1 +def test__csodiaq_parser__set_args_from_command_line_input__score_suceeds_with_min_num_differences_equal_to_2( + parser, scoreArgs +): + scoreArgs += ["-min", "2"] + args = vars(parser.parse_args(scoreArgs)) + assert args["minNumDifferences"] == 2 + +def test__csodiaq_parser__set_args_from_command_line_input__score_fails_with_float_min_num_differences( + parser, scoreArgs +): + scoreArgs += ["-min", "1.2"] + errorOutput = ( + "The minNumDifferences argument must be an integer." + ) + with pytest.raises(argparse.ArgumentTypeError, match=re.escape(errorOutput)): + args = vars(parser.parse_args(scoreArgs)) + +def test__csodiaq_parser__set_args_from_command_line_input__score_fails_when_min_num_differences_less_than_1( + parser, scoreArgs +): + scoreArgs += ["-min", "0"] + errorOutput = ( + "The minNumDifferences argument must be an integer greater than or equal to 1." + ) + with pytest.raises(argparse.ArgumentTypeError, match=re.escape(errorOutput)): + args = vars(parser.parse_args(scoreArgs)) + +def test__csodiaq_parser__set_args_from_command_line_input__score_fails_when_min_num_differences_greater_than_2( + parser, scoreArgs +): + scoreArgs += ["-min", "3"] + errorOutput = ( + "The minNumDifferences argument must be an integer less than or equal to 2." + ) + with pytest.raises(argparse.ArgumentTypeError, match=re.escape(errorOutput)): + args = vars(parser.parse_args(scoreArgs)) + +def test__csodiaq_parser__set_args_from_command_line_input__score_fails_when_min_num_differences_greater_than_2( + parser, scoreArgs +): + scoreArgs += ["-min", "3"] + errorOutput = ( + "The minNumDifferences argument must be an integer less than or equal to 2." + ) + with pytest.raises(argparse.ArgumentTypeError, match=re.escape(errorOutput)): + args = vars(parser.parse_args(scoreArgs)) + +@pytest.mark.skip( + "This test would require a huge setup for a minor warning message (in test_csodiaq.py). Skipping" +) +def test__csodiaq_parser__check_for_conflicting_args__score_throws_error_when_non_maxlfq_method_used_with_min_num_difference_flag( + parser, scoreArgs +): + scoreArgs += ["-p", "average"] + scoreArgs += ["-min", "1"] + errorOutput = ( + "The minNumDifferences flag will only have an effect when paired with the 'maxlfq' proteinQuantMethod flag. You used it with the 'average' method, so this flag will be ignored." + ) + with pytest.raises(argparse.ArgumentTypeError, match=re.escape(errorOutput)): + args = vars(parser.parse_args(scoreArgs)) @pytest.fixture def reanalysisFiles():