Skip to content

Commit

Permalink
targeted reanalysis dataframe system function and tests implemented
Browse files Browse the repository at this point in the history
  • Loading branch information
CCranney committed Aug 3, 2023
1 parent d5f81bf commit 9a86f6b
Show file tree
Hide file tree
Showing 16 changed files with 927 additions and 459 deletions.
12 changes: 10 additions & 2 deletions csodiaq/csodiaq.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from . import csodiaq_quantification_functions as cqf
from . import csodiaq_mgf_cleaning_functions as cmf
from . import peptide_quantification
from csodiaq.identifier import Identifier, identify_high_confidence_proteins, calculate_fdr_rates_of_decoy_array
from csodiaq.identifier import Identifier, identify_high_confidence_proteins, calculate_fdr_rates_of_decoy_array, create_mass_spec_input_dataframes_for_targeted_reanalysis_of_identified_peptides
from csodiaq.utils import create_outfile_header
import pandas as pd

Expand All @@ -39,7 +39,15 @@ def main():
if dfType != "fullOutput":
dfType += "FDR"
df.to_csv(f'{outFileHeader}_{dfType}.csv', index=False)

if args["proteinTargets"]:
targetedReanalysisDict = create_mass_spec_input_dataframes_for_targeted_reanalysis_of_identified_peptides(outputDict["proteinFDR"], isIncludeHeavy=args['heavyMz'], maximumPeptidesPerProtein=args['proteinTargets'])
else:
targetedReanalysisDict = create_mass_spec_input_dataframes_for_targeted_reanalysis_of_identified_peptides(outputDict["peptideFDR"], isIncludeHeavy=args['heavyMz'], maximumPeptidesPerProtein=args['proteinTargets'])
for dfType, df in targetedReanalysisDict.items():
if "CV" in dfType:
df.to_csv(f'{outFileHeader}_mostIntenseTargs_{dfType}.txt', index=False, sep='\t')
else:
df.to_csv(f'{outFileHeader}_{dfType}.csv', index=False)
#lib = cif.library_file_to_dict(args['library'])
#maxQuerySpectraToPool = queryPooling = args['query']
#if not maxQuerySpectraToPool:
Expand Down
1 change: 1 addition & 0 deletions csodiaq/identifier/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .identifier import Identifier
from .idpickerFunctions import identify_high_confidence_proteins
from .scoringFunctions import calculate_fdr_rates_of_decoy_array
from .targetedReanalysisFunctions import create_mass_spec_input_dataframes_for_targeted_reanalysis_of_identified_peptides
90 changes: 0 additions & 90 deletions csodiaq/identifier/outputFormattingFunctions.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,93 +258,3 @@ def determine_if_peptides_are_unique_to_leading_protein(proteinDf):
uniquePeptides[uniqueValuesDf.index] = 1
return list(uniquePeptides)

def calculate_mz_of_heavy_version_of_peptide(peptide, lightMz, z):
lightAndHeavyLysKMassDiff = 8.014199
lightAndHeavyArgRMassDiff = 10.00827

numLysK = peptide.count('K')
numArgR = peptide.count('R')

return (lightMz +
(numLysK * lightAndHeavyLysKMassDiff) / z +
(numArgR * lightAndHeavyArgRMassDiff) / z)

def filter_to_only_keep_peptides_with_possibly_heavy_K_or_R_terminal_residue(fdrDf):
return fdrDf[
fdrDf["peptide"].str.endswith("R") | fdrDf["peptide"].str.endswith("K")
].reset_index(drop=True)

def filter_to_only_keep_top_peptides_unique_to_protein(fdrDf, maxPeptidesPerProtein):
fdrDf = (
fdrDf[fdrDf["uniquePeptide"] == 1]
.sort_values("ionCount", ascending=False)
.reset_index(drop=True)
)
return fdrDf.groupby(["leadingProtein"]).head(maxPeptidesPerProtein).reset_index(drop=True)

def calculate_mz_of_heavy_isotope_of_each_peptide(fdrDf):
return fdrDf.apply(lambda x: calculate_mz_of_heavy_version_of_peptide(x["peptide"],x["MzLIB"],x["zLIB"]), axis=1)

def make_bin_assignments_for_mz_values(mzValues, binWidth=0.75):
bins = np.arange(int(min(mzValues)) - 1, int(max(mzValues) + binWidth*3)+1, binWidth*2)
values = np.digitize(mzValues, bins)
mzBinValues = bins[values]
mzBinValues -= binWidth
return mzBinValues

def filter_out_peptides_based_on_user_settings(fdrDf, isIncludeHeavy, maximumPeptidesPerProtein=0):
if isIncludeHeavy:
fdrDf = filter_to_only_keep_peptides_with_possibly_heavy_K_or_R_terminal_residue(fdrDf)
if maximumPeptidesPerProtein:
fdrDf = filter_to_only_keep_top_peptides_unique_to_protein(fdrDf, maximumPeptidesPerProtein)
return fdrDf

def calculate_binning_information_by_compensation_voltage(fdrDf, isIncludeHeavy):
dfs = [
organize_for_targeted_reanalysis_of_identified_peptides(df, isIncludeHeavy) for _, df in fdrDf.groupby(["CompensationVoltage"])
]
return pd.concat(dfs)

def organize_for_targeted_reanalysis_of_identified_peptides(fdrDf, isIncludeHeavy):
fdrDf["lightMzBin"] = make_bin_assignments_for_mz_values(fdrDf["MzLIB"])
if isIncludeHeavy:
fdrDf["heavyMz"] = calculate_mz_of_heavy_isotope_of_each_peptide(fdrDf)
fdrDf["heavyMzBin"] = make_bin_assignments_for_mz_values(fdrDf["heavyMz"])
return fdrDf

def make_targeted_reanalysis_line(peptide, mz, id):
formula = ""
adduct = "(no adduct)"
genericCharge = 2
return [peptide, formula, adduct, mz, genericCharge, id]

def consolidate_peptides_by_bin_values(df, isIncludeHeavy):
bins = ["lightMzBin"]
if isIncludeHeavy:
bins.append("heavyMzBin")
return df.groupby(bins).apply(lambda x: format_protein_list_to_string(x["peptide"])).reset_index(name="peptide").sort_values(bins)

def organize_binned_data_for_targeted_reanalysis(condensedDf, isIncludeHeavy):
data = [
make_targeted_reanalysis_line(condensedDf.loc[i]["peptide"], condensedDf.loc[i]["lightMzBin"], i) for i in range(len(condensedDf.index))
]
if isIncludeHeavy:
data.extend(
[
make_targeted_reanalysis_line(condensedDf.loc[i]["peptide"], condensedDf.loc[i]["heavyMzBin"], i) for i
in range(len(condensedDf.index))

]
)
return data

def create_targeted_reanalysis_dataframe(df, isIncludeHeavy):
consolidatedDf = consolidate_peptides_by_bin_values(df, isIncludeHeavy)
targetedReanalysisData = organize_binned_data_for_targeted_reanalysis(consolidatedDf, isIncludeHeavy)
return pd.DataFrame(targetedReanalysisData, columns=["Compound","Formula","Adduct","m.z","z","MSXID"]).sort_values(["MSXID","m.z"]).reset_index(drop=True)

def create_targeted_reanalysis_dataframe_by_compensation_voltage__no_heavy(fdrDf, isIncludeHeavy):
return {
str(cv[0]): create_targeted_reanalysis_dataframe(df, isIncludeHeavy) for cv, df in fdrDf.groupby(["CompensationVoltage"])
}

106 changes: 106 additions & 0 deletions csodiaq/identifier/targetedReanalysisFunctions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
from csodiaq.utils import format_protein_list_to_string
import pandas as pd
import numpy as np

def calculate_mz_of_heavy_version_of_peptide(peptide, lightMz, z):
lightAndHeavyLysKMassDiff = 8.014199
lightAndHeavyArgRMassDiff = 10.00827

numLysK = peptide.count('K')
numArgR = peptide.count('R')

return (lightMz +
(numLysK * lightAndHeavyLysKMassDiff) / z +
(numArgR * lightAndHeavyArgRMassDiff) / z)

def filter_to_only_keep_peptides_with_possibly_heavy_K_or_R_terminal_residue(fdrDf):
return fdrDf[
fdrDf["peptide"].str.endswith("R") | fdrDf["peptide"].str.endswith("K")
].reset_index(drop=True)

def filter_to_only_keep_top_peptides_unique_to_protein(fdrDf, maxPeptidesPerProtein):
fdrDf = (
fdrDf[fdrDf["uniquePeptide"] == 1]
.sort_values("ionCount", ascending=False)
.reset_index(drop=True)
)
return fdrDf.groupby(["leadingProtein"]).head(maxPeptidesPerProtein).reset_index(drop=True)

def calculate_mz_of_heavy_isotope_of_each_peptide(fdrDf):
return fdrDf.apply(lambda x: calculate_mz_of_heavy_version_of_peptide(x["peptide"],x["MzLIB"],x["zLIB"]), axis=1)

def make_bin_assignments_for_mz_values(mzValues, binWidth=0.75):
bins = np.arange(int(min(mzValues)) - 1, int(max(mzValues) + binWidth*3)+1, binWidth*2)
values = np.digitize(mzValues, bins)
mzBinValues = bins[values]
mzBinValues -= binWidth
return mzBinValues

def filter_out_peptides_based_on_user_settings(fdrDf, isIncludeHeavy, maximumPeptidesPerProtein):
if isIncludeHeavy:
fdrDf = filter_to_only_keep_peptides_with_possibly_heavy_K_or_R_terminal_residue(fdrDf)
if maximumPeptidesPerProtein:
fdrDf = filter_to_only_keep_top_peptides_unique_to_protein(fdrDf, maximumPeptidesPerProtein)
return fdrDf

def calculate_binning_information_by_compensation_voltage(fdrDf, isIncludeHeavy):
dfs = [
organize_for_targeted_reanalysis_of_identified_peptides(df, isIncludeHeavy) for _, df in fdrDf.groupby(["CompensationVoltage"])
]
return pd.concat(dfs)

def organize_for_targeted_reanalysis_of_identified_peptides(fdrDf, isIncludeHeavy):
fdrDf["lightMzBin"] = make_bin_assignments_for_mz_values(fdrDf["MzLIB"])
if isIncludeHeavy:
fdrDf["heavyMz"] = calculate_mz_of_heavy_isotope_of_each_peptide(fdrDf)
fdrDf["heavyMzBin"] = make_bin_assignments_for_mz_values(fdrDf["heavyMz"])
return fdrDf

def make_targeted_reanalysis_line(peptide, mz, id):
formula = ""
adduct = "(no adduct)"
genericCharge = 2
return [peptide, formula, adduct, mz, genericCharge, id+1]

def consolidate_peptides_by_bin_values(df, isIncludeHeavy):
bins = ["lightMzBin"]
if isIncludeHeavy:
bins.append("heavyMzBin")
return df.groupby(bins).apply(lambda x: format_protein_list_to_string(x["peptide"])).reset_index(name="peptide").sort_values(bins)

def organize_binned_data_for_targeted_reanalysis(condensedDf, isIncludeHeavy):
data = [
make_targeted_reanalysis_line(condensedDf.loc[i]["peptide"], condensedDf.loc[i]["lightMzBin"], i) for i in range(len(condensedDf.index))
]
if isIncludeHeavy:
data.extend(
[
make_targeted_reanalysis_line(condensedDf.loc[i]["peptide"], condensedDf.loc[i]["heavyMzBin"], i) for i
in range(len(condensedDf.index))

]
)
return data

def create_targeted_reanalysis_dataframe(df, isIncludeHeavy):
consolidatedDf = consolidate_peptides_by_bin_values(df, isIncludeHeavy)
targetedReanalysisData = organize_binned_data_for_targeted_reanalysis(consolidatedDf, isIncludeHeavy)
return pd.DataFrame(targetedReanalysisData, columns=["Compound","Formula","Adduct","m.z","z","MSXID"]).sort_values(["MSXID","m.z"]).reset_index(drop=True)

def make_cv_header(cvValue):
if cvValue:
return f'CV_{str(abs(int(cvValue)))}'
else:
return 'noCV'

def create_targeted_reanalysis_dataframe_by_compensation_voltage(fdrDf, isIncludeHeavy):
return {
make_cv_header(cv[0]): create_targeted_reanalysis_dataframe(df, isIncludeHeavy) for cv, df in fdrDf.groupby(["CompensationVoltage"])
}

def create_mass_spec_input_dataframes_for_targeted_reanalysis_of_identified_peptides(fdrDf, isIncludeHeavy=False, maximumPeptidesPerProtein=0):
fdrDf = filter_out_peptides_based_on_user_settings(fdrDf, isIncludeHeavy, maximumPeptidesPerProtein)
fdrDf = organize_for_targeted_reanalysis_of_identified_peptides(fdrDf, isIncludeHeavy)
outputDfDict = create_targeted_reanalysis_dataframe_by_compensation_voltage(fdrDf, isIncludeHeavy)
outputDfDict["fullDf"] = fdrDf
return outputDfDict
Loading

0 comments on commit 9a86f6b

Please sign in to comment.