diff --git a/csodiaq/identifier/__init__.py b/csodiaq/identifier/__init__.py index 4aeb005..f753dc6 100644 --- a/csodiaq/identifier/__init__.py +++ b/csodiaq/identifier/__init__.py @@ -1,4 +1,6 @@ from .identifier import Identifier from .idpickerFunctions import identify_high_confidence_proteins from .scoringFunctions import calculate_fdr_rates_of_decoy_array -from .targetedReanalysisFunctions import create_mass_spec_input_dataframes_for_targeted_reanalysis_of_identified_peptides \ No newline at end of file +from .targetedReanalysisFunctions import ( + create_mass_spec_input_dataframes_for_targeted_reanalysis_of_identified_peptides, +) diff --git a/csodiaq/identifier/outputFormattingFunctions.py b/csodiaq/identifier/outputFormattingFunctions.py index 861cc1e..858e54a 100644 --- a/csodiaq/identifier/outputFormattingFunctions.py +++ b/csodiaq/identifier/outputFormattingFunctions.py @@ -257,4 +257,3 @@ def determine_if_peptides_are_unique_to_leading_protein(proteinDf): uniquePeptides = np.array([0] * len(proteinDf.index)) uniquePeptides[uniqueValuesDf.index] = 1 return list(uniquePeptides) - diff --git a/csodiaq/identifier/targetedReanalysisFunctions.py b/csodiaq/identifier/targetedReanalysisFunctions.py index 5fb3cd4..d93ee8a 100644 --- a/csodiaq/identifier/targetedReanalysisFunctions.py +++ b/csodiaq/identifier/targetedReanalysisFunctions.py @@ -2,21 +2,26 @@ import pandas as pd import numpy as np + def calculate_mz_of_heavy_version_of_peptide(peptide, lightMz, z): lightAndHeavyLysKMassDiff = 8.014199 lightAndHeavyArgRMassDiff = 10.00827 - numLysK = peptide.count('K') - numArgR = peptide.count('R') + numLysK = peptide.count("K") + numArgR = peptide.count("R") + + return ( + lightMz + + (numLysK * lightAndHeavyLysKMassDiff) / z + + (numArgR * lightAndHeavyArgRMassDiff) / z + ) - return (lightMz + - (numLysK * lightAndHeavyLysKMassDiff) / z + - (numArgR * lightAndHeavyArgRMassDiff) / z) def 
filter_to_only_keep_peptides_with_possibly_heavy_K_or_R_terminal_residue(fdrDf): return fdrDf[ fdrDf["peptide"].str.endswith("R") | fdrDf["peptide"].str.endswith("K") - ].reset_index(drop=True) + ].reset_index(drop=True) + def filter_to_only_keep_top_peptides_unique_to_protein(fdrDf, maxPeptidesPerProtein): fdrDf = ( @@ -24,83 +29,204 @@ def filter_to_only_keep_top_peptides_unique_to_protein(fdrDf, maxPeptidesPerProt .sort_values("ionCount", ascending=False) .reset_index(drop=True) ) - return fdrDf.groupby(["leadingProtein"]).head(maxPeptidesPerProtein).reset_index(drop=True) + return ( + fdrDf.groupby(["leadingProtein"]) + .head(maxPeptidesPerProtein) + .reset_index(drop=True) + ) + def calculate_mz_of_heavy_isotope_of_each_peptide(fdrDf): - return fdrDf.apply(lambda x: calculate_mz_of_heavy_version_of_peptide(x["peptide"],x["MzLIB"],x["zLIB"]), axis=1) + return fdrDf.apply( + lambda x: calculate_mz_of_heavy_version_of_peptide( + x["peptide"], x["MzLIB"], x["zLIB"] + ), + axis=1, + ) + def make_bin_assignments_for_mz_values(mzValues, binWidth=0.75): - bins = np.arange(int(min(mzValues)) - 1, int(max(mzValues) + binWidth*3)+1, binWidth*2) + bins = np.arange( + int(min(mzValues)) - 1, int(max(mzValues) + binWidth * 3) + 1, binWidth * 2 + ) values = np.digitize(mzValues, bins) mzBinValues = bins[values] mzBinValues -= binWidth return mzBinValues -def filter_out_peptides_based_on_user_settings(fdrDf, isIncludeHeavy, maximumPeptidesPerProtein): - if isIncludeHeavy: - fdrDf = filter_to_only_keep_peptides_with_possibly_heavy_K_or_R_terminal_residue(fdrDf) + +def filter_out_peptides_based_on_user_settings( + fdrDf, isIncludeHeavyIsotopes, maximumPeptidesPerProtein +): + if isIncludeHeavyIsotopes: + fdrDf = ( + filter_to_only_keep_peptides_with_possibly_heavy_K_or_R_terminal_residue( + fdrDf + ) + ) if maximumPeptidesPerProtein: - fdrDf = filter_to_only_keep_top_peptides_unique_to_protein(fdrDf, maximumPeptidesPerProtein) + fdrDf = 
filter_to_only_keep_top_peptides_unique_to_protein( + fdrDf, maximumPeptidesPerProtein + ) return fdrDf -def calculate_binning_information_by_compensation_voltage(fdrDf, isIncludeHeavy): + +def calculate_binning_information_by_compensation_voltage( + fdrDf, isIncludeHeavyIsotopes +): dfs = [ - organize_for_targeted_reanalysis_of_identified_peptides(df, isIncludeHeavy) for _, df in fdrDf.groupby(["CompensationVoltage"]) + organize_for_targeted_reanalysis_of_identified_peptides( + df, isIncludeHeavyIsotopes + ) + for _, df in fdrDf.groupby(["CompensationVoltage"]) ] return pd.concat(dfs) -def organize_for_targeted_reanalysis_of_identified_peptides(fdrDf, isIncludeHeavy): + +def organize_for_targeted_reanalysis_of_identified_peptides( + fdrDf, isIncludeHeavyIsotopes +): fdrDf["lightMzBin"] = make_bin_assignments_for_mz_values(fdrDf["MzLIB"]) - if isIncludeHeavy: + if isIncludeHeavyIsotopes: fdrDf["heavyMz"] = calculate_mz_of_heavy_isotope_of_each_peptide(fdrDf) fdrDf["heavyMzBin"] = make_bin_assignments_for_mz_values(fdrDf["heavyMz"]) return fdrDf + def make_targeted_reanalysis_line(peptide, mz, id): formula = "" adduct = "(no adduct)" genericCharge = 2 - return [peptide, formula, adduct, mz, genericCharge, id+1] + return [peptide, formula, adduct, mz, genericCharge, id + 1] -def consolidate_peptides_by_bin_values(df, isIncludeHeavy): + +def consolidate_peptides_by_bin_values(df, isIncludeHeavyIsotopes): bins = ["lightMzBin"] - if isIncludeHeavy: + if isIncludeHeavyIsotopes: bins.append("heavyMzBin") - return df.groupby(bins).apply(lambda x: format_protein_list_to_string(x["peptide"])).reset_index(name="peptide").sort_values(bins) + return ( + df.groupby(bins) + .apply(lambda x: format_protein_list_to_string(x["peptide"])) + .reset_index(name="peptide") + .sort_values(bins) + ) + -def organize_binned_data_for_targeted_reanalysis(condensedDf, isIncludeHeavy): +def organize_binned_data_for_targeted_reanalysis(condensedDf, isIncludeHeavyIsotopes): + """ + Compiles data 
into a format that can be read by a mass spectrometer. Note that, in cases where the SILAC protocol + is being used, the mass spectrometer can automatically match windows with related peptides provided they have + the same MSXID (the last column of the output). In this case the output will have paired rows with identical + MSXIDs but differing m.z column values, one for the light mz value and one for the heavy mz value. + """ data = [ - make_targeted_reanalysis_line(condensedDf.loc[i]["peptide"], condensedDf.loc[i]["lightMzBin"], i) for i in range(len(condensedDf.index)) + make_targeted_reanalysis_line( + condensedDf.loc[i]["peptide"], condensedDf.loc[i]["lightMzBin"], i + ) + for i in range(len(condensedDf.index)) ] - if isIncludeHeavy: + if isIncludeHeavyIsotopes: data.extend( [ - make_targeted_reanalysis_line(condensedDf.loc[i]["peptide"], condensedDf.loc[i]["heavyMzBin"], i) for i - in range(len(condensedDf.index)) - + make_targeted_reanalysis_line( + condensedDf.loc[i]["peptide"], condensedDf.loc[i]["heavyMzBin"], i + ) + for i in range(len(condensedDf.index)) ] ) return data -def create_targeted_reanalysis_dataframe(df, isIncludeHeavy): - consolidatedDf = consolidate_peptides_by_bin_values(df, isIncludeHeavy) - targetedReanalysisData = organize_binned_data_for_targeted_reanalysis(consolidatedDf, isIncludeHeavy) - return pd.DataFrame(targetedReanalysisData, columns=["Compound","Formula","Adduct","m.z","z","MSXID"]).sort_values(["MSXID","m.z"]).reset_index(drop=True) + +def create_targeted_reanalysis_dataframe(df, isIncludeHeavyIsotopes): + consolidatedDf = consolidate_peptides_by_bin_values(df, isIncludeHeavyIsotopes) + targetedReanalysisData = organize_binned_data_for_targeted_reanalysis( + consolidatedDf, isIncludeHeavyIsotopes + ) + return ( + pd.DataFrame( + targetedReanalysisData, + columns=["Compound", "Formula", "Adduct", "m.z", "z", "MSXID"], + ) + .sort_values(["MSXID", "m.z"]) + .reset_index(drop=True) + ) + def make_cv_header(cvValue): if cvValue: -
return f'CV_{str(abs(int(cvValue)))}' + return f"CV_{str(abs(int(cvValue)))}" else: - return 'noCV' + return "noCV" -def create_targeted_reanalysis_dataframe_by_compensation_voltage(fdrDf, isIncludeHeavy): + +def create_targeted_reanalysis_dataframes_by_compensation_voltage( + fdrDf, isIncludeHeavyIsotopes +): return { - make_cv_header(cv[0]): create_targeted_reanalysis_dataframe(df, isIncludeHeavy) for cv, df in fdrDf.groupby(["CompensationVoltage"]) + make_cv_header(cv[0]): create_targeted_reanalysis_dataframe( + df, isIncludeHeavyIsotopes + ) + for cv, df in fdrDf.groupby(["CompensationVoltage"]) } -def create_mass_spec_input_dataframes_for_targeted_reanalysis_of_identified_peptides(fdrDf, isIncludeHeavy=False, maximumPeptidesPerProtein=0): - fdrDf = filter_out_peptides_based_on_user_settings(fdrDf, isIncludeHeavy, maximumPeptidesPerProtein) - fdrDf = organize_for_targeted_reanalysis_of_identified_peptides(fdrDf, isIncludeHeavy) - outputDfDict = create_targeted_reanalysis_dataframe_by_compensation_voltage(fdrDf, isIncludeHeavy) + +def create_mass_spec_input_dataframes_for_targeted_reanalysis_of_identified_peptides( + fdrDf, isIncludeHeavyIsotopes=False, maximumPeptidesPerProtein=0 +): + """ + Creates dataframes with data to be fed into a mass spectrometer for targeted reanalysis of identified peptides. + + Extended Summary + ---------------- + After peptide identification, additional experimentation is often required to verify or expand upon the + significance of what was identified. For instance, some researchers would want to identify the relative + quantity of peptides under different experimental conditions. Using Stable Isotope Labeling by Amino acids + in Cell culture (SILAC), one can develop a cell culture where proteins use a heavier isotope of lysine and + arginine.
When targeting these peptides of interest, one can then compare the intensity of the "lighter" + peptide isotope in one condition to the "heavier" peptide isotope in another, determining relative quantity + changes between conditions. Thus, after identifying specific proteins and/or peptides using the identification + workflow, one would need to prime the next mass spectrometer run to target those specific peptides only, one run + for each category (light and heavy). This function generates tables that can be fed into a mass spectrometer + to target specific peptides of interest. + + Note that as an aid to the researchers running the mass spectrometer, all targeted m/z values within a given + m/z window are binned to reduce redundant window analysis. + + Also note that, for targeted reanalysis of data that utilized compensation voltage, a separate mass spectrometer + input file must be generated for each compensation voltage value. As such, a different dataset is made for each one. When no compensation + voltage is provided, a single file will be generated. To keep these potentially variable compensation voltage + tables separate, the return value is a dictionary highlighting the exact compensation voltage (or lack thereof) + in the key name with the dataframe as the value. A summary file of all tables is also provided. + + Parameters + ---------- + fdrDf : pandas DataFrame + A dataframe containing peptides of interest. Note that this could be peptides belonging to specific identified + proteins or just identified peptides generally, as indicated by the maximumPeptidesPerProtein variable. + + isIncludeHeavyIsotopes : boolean + A boolean indicating if the experiment is to be primed for a SILAC-based mass spectrometry run. + + maximumPeptidesPerProtein : int + The maximum number of peptides to be identified per protein of interest. In the case where peptides generally are + being targeted this parameter is set to 0.
+ + Returns + ------- + outputDfDict : dict + A dictionary containing the targeted reanalysis files in addition to a summary file. + 'fullDf' : summary dataframe + all other keys: target reanalysis dataframes, broken down by compensation voltage. + + """ + fdrDf = filter_out_peptides_based_on_user_settings( + fdrDf, isIncludeHeavyIsotopes, maximumPeptidesPerProtein + ) + fdrDf = organize_for_targeted_reanalysis_of_identified_peptides( + fdrDf, isIncludeHeavyIsotopes + ) + outputDfDict = create_targeted_reanalysis_dataframes_by_compensation_voltage( + fdrDf, isIncludeHeavyIsotopes + ) outputDfDict["fullDf"] = fdrDf return outputDfDict diff --git a/tests/system/test_identifier.py b/tests/system/test_identifier.py index 25edd9b..9e1bb31 100644 --- a/tests/system/test_identifier.py +++ b/tests/system/test_identifier.py @@ -1,13 +1,16 @@ import pandas as pd import numpy as np -from csodiaq.identifier import Identifier, create_mass_spec_input_dataframes_for_targeted_reanalysis_of_identified_peptides +from csodiaq.identifier import ( + Identifier, + create_mass_spec_input_dataframes_for_targeted_reanalysis_of_identified_peptides, +) from csodiaq.loaders.query import QueryLoaderContext import os import pickle import pytest -pd.set_option("display.max_columns",None) -pd.set_option("display.max_rows",None) +pd.set_option("display.max_columns", None) +pd.set_option("display.max_rows", None) def get_parent_dir(): @@ -148,17 +151,53 @@ def test__identifier__main_workflow(commandLineArgs): expectedProteinDf, outputDict["proteinFDR"], "protein" ) - targetedOutputDict = create_mass_spec_input_dataframes_for_targeted_reanalysis_of_identified_peptides(outputDict["proteinFDR"], isIncludeHeavy=True, maximumPeptidesPerProtein=1) - targetedOutputDict["fullDf"] = targetedOutputDict["fullDf"].drop(["fileName"], axis=1) + targetedOutputDict = create_mass_spec_input_dataframes_for_targeted_reanalysis_of_identified_peptides( + outputDict["proteinFDR"], + isIncludeHeavyIsotopes=True, + 
maximumPeptidesPerProtein=1, + ) + targetedOutputDict["fullDf"] = targetedOutputDict["fullDf"].drop( + ["fileName"], axis=1 + ) expectedAllCVDf = pd.read_csv(get_file_from_system_test_folder("allCVs.csv")) expectedAllCVDf = expectedAllCVDf.drop(["fileName"], axis=1) - expected30Df = pd.read_csv(get_file_from_system_test_folder("targetedReanalysis_mostIntenseTargs_CV_30.txt"), sep='\t').fillna('') - expected40Df = pd.read_csv(get_file_from_system_test_folder("targetedReanalysis_mostIntenseTargs_CV_40.txt"), sep='\t').fillna('') - expected50Df = pd.read_csv(get_file_from_system_test_folder("targetedReanalysis_mostIntenseTargs_CV_50.txt"), sep='\t').fillna('') - expected60Df = pd.read_csv(get_file_from_system_test_folder("targetedReanalysis_mostIntenseTargs_CV_60.txt"), sep='\t').fillna('') - expected70Df = pd.read_csv(get_file_from_system_test_folder("targetedReanalysis_mostIntenseTargs_CV_70.txt"), sep='\t').fillna('') - expected80Df = pd.read_csv(get_file_from_system_test_folder("targetedReanalysis_mostIntenseTargs_CV_80.txt"), sep='\t').fillna('') + expected30Df = pd.read_csv( + get_file_from_system_test_folder( + "targetedReanalysis_mostIntenseTargs_CV_30.txt" + ), + sep="\t", + ).fillna("") + expected40Df = pd.read_csv( + get_file_from_system_test_folder( + "targetedReanalysis_mostIntenseTargs_CV_40.txt" + ), + sep="\t", + ).fillna("") + expected50Df = pd.read_csv( + get_file_from_system_test_folder( + "targetedReanalysis_mostIntenseTargs_CV_50.txt" + ), + sep="\t", + ).fillna("") + expected60Df = pd.read_csv( + get_file_from_system_test_folder( + "targetedReanalysis_mostIntenseTargs_CV_60.txt" + ), + sep="\t", + ).fillna("") + expected70Df = pd.read_csv( + get_file_from_system_test_folder( + "targetedReanalysis_mostIntenseTargs_CV_70.txt" + ), + sep="\t", + ).fillna("") + expected80Df = pd.read_csv( + get_file_from_system_test_folder( + "targetedReanalysis_mostIntenseTargs_CV_80.txt" + ), + sep="\t", + ).fillna("") for column in 
targetedOutputDict["fullDf"].columns: expectedColumn = np.array(expectedAllCVDf[column]) diff --git a/tests/unit/identifier/test_outputFormattingFunctions.py b/tests/unit/identifier/test_outputFormattingFunctions.py index 623fb22..52ca002 100644 --- a/tests/unit/identifier/test_outputFormattingFunctions.py +++ b/tests/unit/identifier/test_outputFormattingFunctions.py @@ -13,6 +13,7 @@ import pytest import numpy as np + @pytest.fixture def identifierOutputData(): return [ @@ -340,5 +341,3 @@ def test__output_formatting_functions__determine_if_peptides_are_unique_to_leadi ] output = determine_if_peptides_are_unique_to_leading_protein(inputDf) assert expectedOutput == output - - diff --git a/tests/unit/identifier/test_targetedReanalysisFunctions.py b/tests/unit/identifier/test_targetedReanalysisFunctions.py index 16c5c11..ef2c8f4 100644 --- a/tests/unit/identifier/test_targetedReanalysisFunctions.py +++ b/tests/unit/identifier/test_targetedReanalysisFunctions.py @@ -11,7 +11,7 @@ create_targeted_reanalysis_dataframe, organize_for_targeted_reanalysis_of_identified_peptides, filter_out_peptides_based_on_user_settings, - create_targeted_reanalysis_dataframe_by_compensation_voltage, + create_targeted_reanalysis_dataframes_by_compensation_voltage, create_mass_spec_input_dataframes_for_targeted_reanalysis_of_identified_peptides, ) @@ -21,46 +21,61 @@ def test__output_formatting_functions__calculate_mz_of_heavy_version_of_peptide( charge = 1 numLys = 1 numArg = 1 - peptide = numLys * 'K' + numArg * 'R' + peptide = numLys * "K" + numArg * "R" expectedOneChargeOneLysOneArg = 18.022469 - oneChargeOneLysOneArg = calculate_mz_of_heavy_version_of_peptide(peptide, testMz, z=charge) + oneChargeOneLysOneArg = calculate_mz_of_heavy_version_of_peptide( + peptide, testMz, z=charge + ) assert expectedOneChargeOneLysOneArg == oneChargeOneLysOneArg charge = 2 expectedTwoChargeOneLysOneArg = expectedOneChargeOneLysOneArg / 2 - twoChargeOneLysOneArg = 
calculate_mz_of_heavy_version_of_peptide(peptide, testMz, z=charge) + twoChargeOneLysOneArg = calculate_mz_of_heavy_version_of_peptide( + peptide, testMz, z=charge + ) assert expectedTwoChargeOneLysOneArg == twoChargeOneLysOneArg numLys = 2 charge = 1 - peptide = numLys * 'K' + numArg * 'R' + peptide = numLys * "K" + numArg * "R" expectedOneChargeTwoLysOneArg = 26.036668 - oneChargeTwoLysOneArg = calculate_mz_of_heavy_version_of_peptide(peptide, testMz, z=charge) + oneChargeTwoLysOneArg = calculate_mz_of_heavy_version_of_peptide( + peptide, testMz, z=charge + ) assert expectedOneChargeTwoLysOneArg == oneChargeTwoLysOneArg charge = 2 expectedTwoChargeTwoLysOneArg = expectedOneChargeTwoLysOneArg / 2 - twoChargeTwoLysOneArg = calculate_mz_of_heavy_version_of_peptide(peptide, testMz, z=charge) + twoChargeTwoLysOneArg = calculate_mz_of_heavy_version_of_peptide( + peptide, testMz, z=charge + ) assert expectedTwoChargeTwoLysOneArg == twoChargeTwoLysOneArg numLys = 1 numArg = 2 charge = 1 - peptide = numLys * 'K' + numArg * 'R' + peptide = numLys * "K" + numArg * "R" expectedOneChargeOneLysTwoArg = 28.030738999999997 - oneChargeOneLysTwoArg = calculate_mz_of_heavy_version_of_peptide(peptide, testMz, z=charge) + oneChargeOneLysTwoArg = calculate_mz_of_heavy_version_of_peptide( + peptide, testMz, z=charge + ) assert expectedOneChargeOneLysTwoArg == oneChargeOneLysTwoArg charge = 2 expectedTwoChargeOneLysTwoArg = expectedOneChargeOneLysTwoArg / 2 - twoChargeOneLysTwoArg = calculate_mz_of_heavy_version_of_peptide(peptide, testMz, z=charge) + twoChargeOneLysTwoArg = calculate_mz_of_heavy_version_of_peptide( + peptide, testMz, z=charge + ) assert expectedTwoChargeOneLysTwoArg == twoChargeOneLysTwoArg testMz = 100.0 expectedTwoChargeOneLysTwoArgHundredMz = expectedTwoChargeOneLysTwoArg + testMz - twoChargeOneLysTwoArgHundredMz = calculate_mz_of_heavy_version_of_peptide(peptide, testMz, z=charge) + twoChargeOneLysTwoArgHundredMz = calculate_mz_of_heavy_version_of_peptide( + 
peptide, testMz, z=charge + ) assert expectedTwoChargeOneLysTwoArgHundredMz == twoChargeOneLysTwoArgHundredMz + def test__output_formatting_functions__filter_to_only_keep_peptides_with_possibly_heavy_K_or_R_terminal_residue(): data = [ ["A"], @@ -75,10 +90,13 @@ def test__output_formatting_functions__filter_to_only_keep_peptides_with_possibl df = pd.DataFrame(data, columns=["peptide"]) expectedData = data[3:] expectedOutput = pd.DataFrame(expectedData, columns=["peptide"]) - output = filter_to_only_keep_peptides_with_possibly_heavy_K_or_R_terminal_residue(df) + output = filter_to_only_keep_peptides_with_possibly_heavy_K_or_R_terminal_residue( + df + ) assert expectedOutput.equals(output) pass + def test__output_formatting_functions__filter_to_only_keep_top_peptides_unique_to_protein(): inputData = [ ["protein1", 100.0, 1], @@ -93,7 +111,9 @@ def test__output_formatting_functions__filter_to_only_keep_top_peptides_unique_t ["protein4", 100.0, 1], ["protein5", 100.0, 0], ] - inputDf = pd.DataFrame(inputData, columns=["leadingProtein","ionCount", "uniquePeptide"]) + inputDf = pd.DataFrame( + inputData, columns=["leadingProtein", "ionCount", "uniquePeptide"] + ) topProteinsToKeep = 2 expectedOutputData = [ ["protein1", 400.0, 1], @@ -104,10 +124,15 @@ def test__output_formatting_functions__filter_to_only_keep_top_peptides_unique_t ["protein3", 100.0, 1], ["protein4", 100.0, 1], ] - expectedOutputDf = pd.DataFrame(expectedOutputData, columns=["leadingProtein","ionCount", "uniquePeptide"]) - outputDf = filter_to_only_keep_top_peptides_unique_to_protein(inputDf, topProteinsToKeep) + expectedOutputDf = pd.DataFrame( + expectedOutputData, columns=["leadingProtein", "ionCount", "uniquePeptide"] + ) + outputDf = filter_to_only_keep_top_peptides_unique_to_protein( + inputDf, topProteinsToKeep + ) assert expectedOutputDf.equals(outputDf) + @pytest.fixture def inputFilteringDf(): inputData = [ @@ -125,17 +150,29 @@ def inputFilteringDf(): ["peptide12K", "protein6", 100.0, 0], 
["peptide13", "protein7", 100.0, 1], ] - return pd.DataFrame(inputData, columns=["peptide","leadingProtein","ionCount","uniquePeptide"]) + return pd.DataFrame( + inputData, columns=["peptide", "leadingProtein", "ionCount", "uniquePeptide"] + ) + -def test__output_formatting_functions__filter_out_peptides_based_on_user_settings__no_heavy_no_proteins(inputFilteringDf): - isIncludeHeavy = False +def test__output_formatting_functions__filter_out_peptides_based_on_user_settings__no_heavy_no_proteins( + inputFilteringDf, +): + isIncludeHeavyIsotopes = False maximumPeptidesPerProtein = 0 expectedOutputDf = inputFilteringDf.copy() - outputDf = filter_out_peptides_based_on_user_settings(inputFilteringDf, isIncludeHeavy=isIncludeHeavy, maximumPeptidesPerProtein=maximumPeptidesPerProtein) + outputDf = filter_out_peptides_based_on_user_settings( + inputFilteringDf, + isIncludeHeavyIsotopes=isIncludeHeavyIsotopes, + maximumPeptidesPerProtein=maximumPeptidesPerProtein, + ) assert expectedOutputDf.equals(outputDf) -def test__output_formatting_functions__filter_out_peptides_based_on_user_settings__is_heavy_no_proteins(inputFilteringDf): - isIncludeHeavy = True + +def test__output_formatting_functions__filter_out_peptides_based_on_user_settings__is_heavy_no_proteins( + inputFilteringDf, +): + isIncludeHeavyIsotopes = True maximumPeptidesPerProtein = 0 expectedOutputData = [ @@ -152,12 +189,21 @@ def test__output_formatting_functions__filter_out_peptides_based_on_user_setting ["peptide11R", "protein5", 100.0, 0], ["peptide12K", "protein6", 100.0, 0], ] - expectedOutputDf = pd.DataFrame(expectedOutputData, columns=inputFilteringDf.columns) - outputDf = filter_out_peptides_based_on_user_settings(inputFilteringDf, isIncludeHeavy=isIncludeHeavy, maximumPeptidesPerProtein=maximumPeptidesPerProtein) + expectedOutputDf = pd.DataFrame( + expectedOutputData, columns=inputFilteringDf.columns + ) + outputDf = filter_out_peptides_based_on_user_settings( + inputFilteringDf, + 
isIncludeHeavyIsotopes=isIncludeHeavyIsotopes, + maximumPeptidesPerProtein=maximumPeptidesPerProtein, + ) assert expectedOutputDf.equals(outputDf) -def test__output_formatting_functions__filter_out_peptides_based_on_user_settings__no_heavy_has_proteins(inputFilteringDf): - isIncludeHeavy = False + +def test__output_formatting_functions__filter_out_peptides_based_on_user_settings__no_heavy_has_proteins( + inputFilteringDf, +): + isIncludeHeavyIsotopes = False maximumPeptidesPerProtein = 2 expectedOutputData = [ ["peptide04K", "protein1", 400.0, 1], @@ -169,12 +215,21 @@ def test__output_formatting_functions__filter_out_peptides_based_on_user_setting ["peptide10K", "protein4", 100.0, 1], ["peptide13", "protein7", 100.0, 1], ] - expectedOutputDf = pd.DataFrame(expectedOutputData, columns=inputFilteringDf.columns) - outputDf = filter_out_peptides_based_on_user_settings(inputFilteringDf, isIncludeHeavy=isIncludeHeavy, maximumPeptidesPerProtein=maximumPeptidesPerProtein) + expectedOutputDf = pd.DataFrame( + expectedOutputData, columns=inputFilteringDf.columns + ) + outputDf = filter_out_peptides_based_on_user_settings( + inputFilteringDf, + isIncludeHeavyIsotopes=isIncludeHeavyIsotopes, + maximumPeptidesPerProtein=maximumPeptidesPerProtein, + ) assert expectedOutputDf.equals(outputDf) -def test__output_formatting_functions__filter_out_peptides_based_on_user_settings__is_heavy_has_proteins(inputFilteringDf): - isIncludeHeavy = True + +def test__output_formatting_functions__filter_out_peptides_based_on_user_settings__is_heavy_has_proteins( + inputFilteringDf, +): + isIncludeHeavyIsotopes = True maximumPeptidesPerProtein = 2 expectedOutputData = [ ["peptide04K", "protein1", 400.0, 1], @@ -185,13 +240,20 @@ def test__output_formatting_functions__filter_out_peptides_based_on_user_setting ["peptide08K", "protein3", 100.0, 1], ["peptide10K", "protein4", 100.0, 1], ] - expectedOutputDf = pd.DataFrame(expectedOutputData, columns=inputFilteringDf.columns) - outputDf = 
filter_out_peptides_based_on_user_settings(inputFilteringDf, isIncludeHeavy=isIncludeHeavy, maximumPeptidesPerProtein=maximumPeptidesPerProtein) + expectedOutputDf = pd.DataFrame( + expectedOutputData, columns=inputFilteringDf.columns + ) + outputDf = filter_out_peptides_based_on_user_settings( + inputFilteringDf, + isIncludeHeavyIsotopes=isIncludeHeavyIsotopes, + maximumPeptidesPerProtein=maximumPeptidesPerProtein, + ) assert expectedOutputDf.equals(outputDf) + def test__output_formatting_functions__calculate_mz_of_heavy_isotope_of_each_peptide(): data = [ - ["KR",100.0, 1], + ["KR", 100.0, 1], ["KKR", 100.0, 1], ["KRR", 100.0, 1], ] @@ -200,6 +262,7 @@ def test__output_formatting_functions__calculate_mz_of_heavy_isotope_of_each_pep output = calculate_mz_of_heavy_isotope_of_each_peptide(inputDf) np.testing.assert_array_almost_equal(np.array(expectedOutput), np.array(output)) + @pytest.fixture def inputBinningDf(): inputData = [ @@ -213,29 +276,41 @@ def inputBinningDf(): ] return pd.DataFrame(inputData, columns=["peptide", "MzLIB", "zLIB"]) + @pytest.fixture def expectedLightMzBins(): - return np.array([ - 99.75, - 99.75, - 99.75, - 101.25, - 102.75, - 114.75, - 114.75, - ]) - -def test__output_formatting_functions__make_bin_assignments_for_mz_values(inputBinningDf, expectedLightMzBins): + return np.array( + [ + 99.75, + 99.75, + 99.75, + 101.25, + 102.75, + 114.75, + 114.75, + ] + ) + + +def test__output_formatting_functions__make_bin_assignments_for_mz_values( + inputBinningDf, expectedLightMzBins +): bins = make_bin_assignments_for_mz_values(inputBinningDf["MzLIB"]) - np.testing.assert_array_equal(expectedLightMzBins,bins) + np.testing.assert_array_equal(expectedLightMzBins, bins) + -def test__output_formatting_functions__organize_for_targeted_reanalysis_of_identified_peptides__no_heavy(inputBinningDf, expectedLightMzBins): - isIncludeHeavy = False +def test__output_formatting_functions__organize_for_targeted_reanalysis_of_identified_peptides__no_heavy( + 
inputBinningDf, expectedLightMzBins +): + isIncludeHeavyIsotopes = False expectedOutputDf = inputBinningDf.copy() expectedOutputDf["lightMzBin"] = expectedLightMzBins - outputDf = organize_for_targeted_reanalysis_of_identified_peptides(inputBinningDf, isIncludeHeavy=isIncludeHeavy) + outputDf = organize_for_targeted_reanalysis_of_identified_peptides( + inputBinningDf, isIncludeHeavyIsotopes=isIncludeHeavyIsotopes + ) assert expectedOutputDf.equals(outputDf) + @pytest.fixture def expectedHeavyMzColumn(inputBinningDf): lightAndHeavyLysKMassDiff = 8.014199 @@ -249,32 +324,45 @@ def expectedHeavyMzColumn(inputBinningDf): mzValues[3] + lightAndHeavyLysKMassDiff / chargeValues[3], mzValues[4] + lightAndHeavyArgRMassDiff / chargeValues[4], mzValues[5] + lightAndHeavyLysKMassDiff / chargeValues[5], - mzValues[6] + lightAndHeavyArgRMassDiff / chargeValues[6] + lightAndHeavyLysKMassDiff / chargeValues[6], + mzValues[6] + + lightAndHeavyArgRMassDiff / chargeValues[6] + + lightAndHeavyLysKMassDiff / chargeValues[6], ] + @pytest.fixture def expectedHeavyMzBinColumn(): - return np.array([ - 109.75, - 109.75, - 103.75, - 105.25, - 112.75, - 123.25, - 133.75, - ]) - -def test__output_formatting_functions__organize_for_targeted_reanalysis_of_identified_peptides__has_heavy(inputBinningDf, expectedLightMzBins, expectedHeavyMzColumn, expectedHeavyMzBinColumn): - isIncludeHeavy = True + return np.array( + [ + 109.75, + 109.75, + 103.75, + 105.25, + 112.75, + 123.25, + 133.75, + ] + ) + + +def test__output_formatting_functions__organize_for_targeted_reanalysis_of_identified_peptides__has_heavy( + inputBinningDf, expectedLightMzBins, expectedHeavyMzColumn, expectedHeavyMzBinColumn +): + isIncludeHeavyIsotopes = True expectedOutputDf = inputBinningDf.copy() expectedOutputDf["lightMzBin"] = expectedLightMzBins expectedOutputDf["heavyMz"] = expectedHeavyMzColumn expectedOutputDf["heavyMzBin"] = expectedHeavyMzBinColumn - outputDf = 
organize_for_targeted_reanalysis_of_identified_peptides(inputBinningDf, isIncludeHeavy=isIncludeHeavy) + outputDf = organize_for_targeted_reanalysis_of_identified_peptides( + inputBinningDf, isIncludeHeavyIsotopes=isIncludeHeavyIsotopes + ) assert expectedOutputDf.equals(outputDf) -def test__output_formatting_functions__calculate_binning_information_by_compensation_voltage__no_heavy(inputBinningDf, expectedLightMzBins): - isIncludeHeavy = False + +def test__output_formatting_functions__calculate_binning_information_by_compensation_voltage__no_heavy( + inputBinningDf, expectedLightMzBins +): + isIncludeHeavyIsotopes = False inputBinningDfCV30 = inputBinningDf.copy() inputBinningDfCV30["CompensationVoltage"] = [-30] * len(inputBinningDfCV30.index) inputBinningDfCV40 = inputBinningDf.copy() @@ -282,11 +370,16 @@ def test__output_formatting_functions__calculate_binning_information_by_compensa inputDf = pd.concat([inputBinningDfCV40, inputBinningDfCV30]) expectedOutputDf = inputDf.copy() expectedOutputDf["lightMzBin"] = np.append(expectedLightMzBins, expectedLightMzBins) - outputDf = calculate_binning_information_by_compensation_voltage(inputDf, isIncludeHeavy=isIncludeHeavy) + outputDf = calculate_binning_information_by_compensation_voltage( + inputDf, isIncludeHeavyIsotopes=isIncludeHeavyIsotopes + ) assert expectedOutputDf.equals(outputDf) -def test__output_formatting_functions__calculate_binning_information_by_compensation_voltage__with_heavy(inputBinningDf, expectedLightMzBins, expectedHeavyMzColumn, expectedHeavyMzBinColumn): - isIncludeHeavy = True + +def test__output_formatting_functions__calculate_binning_information_by_compensation_voltage__with_heavy( + inputBinningDf, expectedLightMzBins, expectedHeavyMzColumn, expectedHeavyMzBinColumn +): + isIncludeHeavyIsotopes = True inputBinningDfCV30 = inputBinningDf.copy() inputBinningDfCV30["CompensationVoltage"] = [-30] * len(inputBinningDfCV30.index) inputBinningDfCV40 = inputBinningDf.copy() @@ -295,19 +388,24 @@ 
def test__output_formatting_functions__calculate_binning_information_by_compensa expectedOutputDf = inputDf.copy() expectedOutputDf["lightMzBin"] = np.append(expectedLightMzBins, expectedLightMzBins) expectedOutputDf["heavyMz"] = expectedHeavyMzColumn + expectedHeavyMzColumn - expectedOutputDf["heavyMzBin"] = np.append(expectedHeavyMzBinColumn, expectedHeavyMzBinColumn) - outputDf = calculate_binning_information_by_compensation_voltage(inputDf, isIncludeHeavy=isIncludeHeavy) + expectedOutputDf["heavyMzBin"] = np.append( + expectedHeavyMzBinColumn, expectedHeavyMzBinColumn + ) + outputDf = calculate_binning_information_by_compensation_voltage( + inputDf, isIncludeHeavyIsotopes=isIncludeHeavyIsotopes + ) assert expectedOutputDf.equals(outputDf) @pytest.fixture def inputFormattedDf(): inputData = [ - ["peptide1",20.0, 30.0], - ["peptide2",20.0, 30.0], - ["peptide3",10.0, 20.0], + ["peptide1", 20.0, 30.0], + ["peptide2", 20.0, 30.0], + ["peptide3", 10.0, 20.0], ] - return pd.DataFrame(inputData, columns=["peptide","lightMzBin", "heavyMzBin"]) + return pd.DataFrame(inputData, columns=["peptide", "lightMzBin", "heavyMzBin"]) + @pytest.fixture def targetedReanalysisNoHeavyDf(): @@ -318,13 +416,22 @@ def targetedReanalysisNoHeavyDf(): ["1/peptide3", formula, adduct, 10.0, charge, 1], ["2/peptide1/peptide2", formula, adduct, 20.0, charge, 2], ] - return pd.DataFrame(expectedOutputData, columns=["Compound","Formula","Adduct","m.z","z","MSXID"]) - -def test__output_formatting_functions__create_targeted_reanalysis_dataframe__no_heavy(inputFormattedDf, targetedReanalysisNoHeavyDf): - isIncludeHeavy = False - outputDf = create_targeted_reanalysis_dataframe(inputFormattedDf, isIncludeHeavy=isIncludeHeavy) + return pd.DataFrame( + expectedOutputData, + columns=["Compound", "Formula", "Adduct", "m.z", "z", "MSXID"], + ) + + +def test__output_formatting_functions__create_targeted_reanalysis_dataframe__no_heavy( + inputFormattedDf, targetedReanalysisNoHeavyDf +): + 
isIncludeHeavyIsotopes = False + outputDf = create_targeted_reanalysis_dataframe( + inputFormattedDf, isIncludeHeavyIsotopes=isIncludeHeavyIsotopes + ) assert targetedReanalysisNoHeavyDf.equals(outputDf) + @pytest.fixture def targetedReanalysisWithHeavyDf(): formula = "" @@ -336,68 +443,108 @@ def targetedReanalysisWithHeavyDf(): ["2/peptide1/peptide2", formula, adduct, 20.0, charge, 2], ["2/peptide1/peptide2", formula, adduct, 30.0, charge, 2], ] - return pd.DataFrame(expectedOutputData, columns=["Compound","Formula","Adduct","m.z","z","MSXID"]) - -def test__output_formatting_functions__create_targeted_reanalysis_dataframe__with_heavy(inputFormattedDf, targetedReanalysisWithHeavyDf): - isIncludeHeavy = True - outputDf = create_targeted_reanalysis_dataframe(inputFormattedDf, isIncludeHeavy=isIncludeHeavy) + return pd.DataFrame( + expectedOutputData, + columns=["Compound", "Formula", "Adduct", "m.z", "z", "MSXID"], + ) + + +def test__output_formatting_functions__create_targeted_reanalysis_dataframe__with_heavy( + inputFormattedDf, targetedReanalysisWithHeavyDf +): + isIncludeHeavyIsotopes = True + outputDf = create_targeted_reanalysis_dataframe( + inputFormattedDf, isIncludeHeavyIsotopes=isIncludeHeavyIsotopes + ) assert targetedReanalysisWithHeavyDf.equals(outputDf) -def test__output_formatting_functions__create_targeted_reanalysis_dataframe_by_compensation_voltage__no_heavy(inputFormattedDf, targetedReanalysisNoHeavyDf): - isIncludeHeavy = False + +def test__output_formatting_functions__create_targeted_reanalysis_dataframes_by_compensation_voltage__no_heavy( + inputFormattedDf, targetedReanalysisNoHeavyDf +): + isIncludeHeavyIsotopes = False inputFormattedDfCV30 = inputFormattedDf.copy() - inputFormattedDfCV30["CompensationVoltage"] = [-30] * len(inputFormattedDfCV30.index) + inputFormattedDfCV30["CompensationVoltage"] = [-30] * len( + inputFormattedDfCV30.index + ) inputFormattedDfCV40 = inputFormattedDf.copy() - inputFormattedDfCV40["CompensationVoltage"] = 
[-40] * len(inputFormattedDfCV40.index) + inputFormattedDfCV40["CompensationVoltage"] = [-40] * len( + inputFormattedDfCV40.index + ) inputDf = pd.concat([inputFormattedDfCV30, inputFormattedDfCV40]) expectedOutput = { "CV_30": targetedReanalysisNoHeavyDf, "CV_40": targetedReanalysisNoHeavyDf, } - output = create_targeted_reanalysis_dataframe_by_compensation_voltage(inputDf, isIncludeHeavy) + output = create_targeted_reanalysis_dataframes_by_compensation_voltage( + inputDf, isIncludeHeavyIsotopes + ) for cv, expectedTargetedReanalysisDf in expectedOutput.items(): assert cv in output assert expectedTargetedReanalysisDf.equals(output[cv]) -def test__output_formatting_functions__create_targeted_reanalysis_dataframe_by_compensation_voltage__with_heavy(inputFormattedDf, targetedReanalysisWithHeavyDf): - isIncludeHeavy = True + +def test__output_formatting_functions__create_targeted_reanalysis_dataframes_by_compensation_voltage__with_heavy( + inputFormattedDf, targetedReanalysisWithHeavyDf +): + isIncludeHeavyIsotopes = True inputFormattedDfCV30 = inputFormattedDf.copy() - inputFormattedDfCV30["CompensationVoltage"] = [-30] * len(inputFormattedDfCV30.index) + inputFormattedDfCV30["CompensationVoltage"] = [-30] * len( + inputFormattedDfCV30.index + ) inputFormattedDfCV40 = inputFormattedDf.copy() - inputFormattedDfCV40["CompensationVoltage"] = [-40] * len(inputFormattedDfCV40.index) + inputFormattedDfCV40["CompensationVoltage"] = [-40] * len( + inputFormattedDfCV40.index + ) inputDf = pd.concat([inputFormattedDfCV30, inputFormattedDfCV40]) expectedOutput = { "CV_30": targetedReanalysisWithHeavyDf, "CV_40": targetedReanalysisWithHeavyDf, } - output = create_targeted_reanalysis_dataframe_by_compensation_voltage(inputDf, isIncludeHeavy) + output = create_targeted_reanalysis_dataframes_by_compensation_voltage( + inputDf, isIncludeHeavyIsotopes + ) for cv, expectedTargetedReanalysisDf in expectedOutput.items(): assert cv in output assert 
expectedTargetedReanalysisDf.equals(output[cv]) + @pytest.fixture def inputProteinFdrDf(): genericMz = 100.0 inputData = [ - ["peptide01R", "protein1", 100.0, 1, genericMz,1,""], - ["peptide02R", "protein1", 200.0, 1, genericMz*2,2,""], - ["peptide03K", "protein1", 300.0, 1, genericMz*3,1,""], - ["peptide04K", "protein1", 400.0, 1, genericMz*4,1,""], - ["peptide05R", "protein2", 100.0, 1, genericMz*5,1,""], - ["peptide06R", "protein2", 200.0, 1, genericMz,2,""], - ["peptide07K", "protein2", 300.0, 1, genericMz*2,1,""], - ["peptide08KK", "protein3", 100.0, 1, genericMz*3,2,""], - ["peptide09R", "protein3", 200.0, 1, genericMz*4,1,""], - ["peptide10RR", "protein4", 100.0, 1, genericMz*5,1,""], - ["peptide11K", "protein5", 100.0, 0, genericMz,1,""], - ["peptide12K", "protein6", 100.0, 0, genericMz*2,2,""], - ["peptide13", "protein7", 100.0, 1, genericMz*3,1,""], + ["peptide01R", "protein1", 100.0, 1, genericMz, 1, ""], + ["peptide02R", "protein1", 200.0, 1, genericMz * 2, 2, ""], + ["peptide03K", "protein1", 300.0, 1, genericMz * 3, 1, ""], + ["peptide04K", "protein1", 400.0, 1, genericMz * 4, 1, ""], + ["peptide05R", "protein2", 100.0, 1, genericMz * 5, 1, ""], + ["peptide06R", "protein2", 200.0, 1, genericMz, 2, ""], + ["peptide07K", "protein2", 300.0, 1, genericMz * 2, 1, ""], + ["peptide08KK", "protein3", 100.0, 1, genericMz * 3, 2, ""], + ["peptide09R", "protein3", 200.0, 1, genericMz * 4, 1, ""], + ["peptide10RR", "protein4", 100.0, 1, genericMz * 5, 1, ""], + ["peptide11K", "protein5", 100.0, 0, genericMz, 1, ""], + ["peptide12K", "protein6", 100.0, 0, genericMz * 2, 2, ""], + ["peptide13", "protein7", 100.0, 1, genericMz * 3, 1, ""], ] - return pd.DataFrame(inputData, columns=["peptide","leadingProtein","ionCount","uniquePeptide","MzLIB","zLIB","CompensationVoltage"]) - - -def test__output_formatting_functions__create_mass_spec_input_dataframes_for_targeted_reanalysis_of_identified_peptides__no_heavy_no_protein_no_cv(inputProteinFdrDf): - isIncludeHeavy = 
False + return pd.DataFrame( + inputData, + columns=[ + "peptide", + "leadingProtein", + "ionCount", + "uniquePeptide", + "MzLIB", + "zLIB", + "CompensationVoltage", + ], + ) + + +def test__output_formatting_functions__create_mass_spec_input_dataframes_for_targeted_reanalysis_of_identified_peptides__no_heavy_no_protein_no_cv( + inputProteinFdrDf, +): + isIncludeHeavyIsotopes = False maximumPeptidesPerProtein = 0 fullDf = inputProteinFdrDf.copy() fullDf["lightMzBin"] = [ @@ -425,25 +572,35 @@ def test__output_formatting_functions__create_mass_spec_input_dataframes_for_tar ["2/peptide04K/peptide09R", formula, adduct, 399.75, charge, 4], ["2/peptide05R/peptide10RR", formula, adduct, 500.25, charge, 5], ] - targetedReanalysisDf = pd.DataFrame(targetedReanalysisData, columns=["Compound","Formula","Adduct","m.z","z","MSXID"]) + targetedReanalysisDf = pd.DataFrame( + targetedReanalysisData, + columns=["Compound", "Formula", "Adduct", "m.z", "z", "MSXID"], + ) expectedOutputDict = { "noCV": targetedReanalysisDf, "fullDf": fullDf, } - outputDict = create_mass_spec_input_dataframes_for_targeted_reanalysis_of_identified_peptides(inputProteinFdrDf,isIncludeHeavy=isIncludeHeavy, maximumPeptidesPerProtein=maximumPeptidesPerProtein) + outputDict = create_mass_spec_input_dataframes_for_targeted_reanalysis_of_identified_peptides( + inputProteinFdrDf, + isIncludeHeavyIsotopes=isIncludeHeavyIsotopes, + maximumPeptidesPerProtein=maximumPeptidesPerProtein, + ) for type, df in expectedOutputDict.items(): assert type in outputDict assert df.equals(outputDict[type]) -def test__output_formatting_functions__create_mass_spec_input_dataframes_for_targeted_reanalysis_of_identified_peptides__with_heavy_with_protein_with_cv(inputProteinFdrDf): - isIncludeHeavy = True + +def test__output_formatting_functions__create_mass_spec_input_dataframes_for_targeted_reanalysis_of_identified_peptides__with_heavy_with_protein_with_cv( + inputProteinFdrDf, +): + isIncludeHeavyIsotopes = True 
maximumPeptidesPerProtein = 2 - filteredSortedIdxs = [3,2,6,5,8,7,9] + filteredSortedIdxs = [3, 2, 6, 5, 8, 7, 9] filteredDf = inputProteinFdrDf.loc[filteredSortedIdxs].reset_index(drop=True) inputDfCV30 = inputProteinFdrDf.copy() - inputDfCV30["CompensationVoltage"] = [-30]*len(inputDfCV30.index) + inputDfCV30["CompensationVoltage"] = [-30] * len(inputDfCV30.index) inputDfCV40 = inputProteinFdrDf.copy() - inputDfCV40["CompensationVoltage"] = [-40]*len(inputDfCV40.index) + inputDfCV40["CompensationVoltage"] = [-40] * len(inputDfCV40.index) inputDfCV40["leadingProtein"] = [ "protein08", "protein08", @@ -487,10 +644,12 @@ def test__output_formatting_functions__create_mass_spec_input_dataframes_for_tar 308.75, 520.25, ] - filteredDf["CompensationVoltage"] = [-30]*len(filteredDf.index) + filteredDf["CompensationVoltage"] = [-30] * len(filteredDf.index) secondCvFilteredDf = filteredDf.copy() - secondCvFilteredDf["leadingProtein"] = list(inputDfCV40.loc[filteredSortedIdxs]["leadingProtein"]) - secondCvFilteredDf["CompensationVoltage"] = [-40]*len(secondCvFilteredDf.index) + secondCvFilteredDf["leadingProtein"] = list( + inputDfCV40.loc[filteredSortedIdxs]["leadingProtein"] + ) + secondCvFilteredDf["CompensationVoltage"] = [-40] * len(secondCvFilteredDf.index) filteredDf = pd.concat([secondCvFilteredDf, filteredDf]) formula = "" @@ -510,17 +669,33 @@ def test__output_formatting_functions__create_mass_spec_input_dataframes_for_tar ["1/peptide10RR", formula, adduct, 500.25, charge, 6], ["1/peptide10RR", formula, adduct, 520.25, charge, 6], ] - targetedReanalysisDf = pd.DataFrame(targetedReanalysisData, columns=["Compound","Formula","Adduct","m.z","z","MSXID"]) + targetedReanalysisDf = pd.DataFrame( + targetedReanalysisData, + columns=["Compound", "Formula", "Adduct", "m.z", "z", "MSXID"], + ) expectedOutputDict = { "CV_30": targetedReanalysisDf, "CV_40": targetedReanalysisDf, "fullDf": filteredDf, } - outputDict = 
create_mass_spec_input_dataframes_for_targeted_reanalysis_of_identified_peptides(inputDf, isIncludeHeavy=isIncludeHeavy, maximumPeptidesPerProtein=maximumPeptidesPerProtein) - expectedFullDf = expectedOutputDict["fullDf"].sort_values(["peptide","leadingProtein","MzLIB","CompensationVoltage"]).reset_index(drop=True) - fullDf = outputDict["fullDf"].sort_values(["peptide","leadingProtein","MzLIB","CompensationVoltage"]).reset_index(drop=True) + outputDict = create_mass_spec_input_dataframes_for_targeted_reanalysis_of_identified_peptides( + inputDf, + isIncludeHeavyIsotopes=isIncludeHeavyIsotopes, + maximumPeptidesPerProtein=maximumPeptidesPerProtein, + ) + expectedFullDf = ( + expectedOutputDict["fullDf"] + .sort_values(["peptide", "leadingProtein", "MzLIB", "CompensationVoltage"]) + .reset_index(drop=True) + ) + fullDf = ( + outputDict["fullDf"] + .sort_values(["peptide", "leadingProtein", "MzLIB", "CompensationVoltage"]) + .reset_index(drop=True) + ) assert expectedFullDf.equals(fullDf) for type, df in expectedOutputDict.items(): - if type=="fullDf": continue + if type == "fullDf": + continue assert type in outputDict - assert df.equals(outputDict[type]) \ No newline at end of file + assert df.equals(outputDict[type])