docstrings added and python black formatting run

xomicsdatascience · Aug 3, 2023 · 28e2b5e · 28e2b5e
1 parent 9a86f6b
commit 28e2b5e
Show file tree

Hide file tree

Showing 6 changed files with 520 additions and 180 deletions.
diff --git a/csodiaq/identifier/__init__.py b/csodiaq/identifier/__init__.py
@@ -1,4 +1,6 @@
 from .identifier import Identifier
 from .idpickerFunctions import identify_high_confidence_proteins
 from .scoringFunctions import calculate_fdr_rates_of_decoy_array
-from .targetedReanalysisFunctions import create_mass_spec_input_dataframes_for_targeted_reanalysis_of_identified_peptides
+from .targetedReanalysisFunctions import (
+    create_mass_spec_input_dataframes_for_targeted_reanalysis_of_identified_peptides,
+)
diff --git a/csodiaq/identifier/outputFormattingFunctions.py b/csodiaq/identifier/outputFormattingFunctions.py
@@ -257,4 +257,3 @@ def determine_if_peptides_are_unique_to_leading_protein(proteinDf):
     uniquePeptides = np.array([0] * len(proteinDf.index))
     uniquePeptides[uniqueValuesDf.index] = 1
     return list(uniquePeptides)
-
diff --git a/csodiaq/identifier/targetedReanalysisFunctions.py b/csodiaq/identifier/targetedReanalysisFunctions.py
@@ -2,105 +2,231 @@
 import pandas as pd
 import numpy as np
 
+
 def calculate_mz_of_heavy_version_of_peptide(peptide, lightMz, z):
     lightAndHeavyLysKMassDiff = 8.014199
     lightAndHeavyArgRMassDiff = 10.00827
 
-    numLysK = peptide.count('K')
-    numArgR = peptide.count('R')
+    numLysK = peptide.count("K")
+    numArgR = peptide.count("R")
+
+    return (
+        lightMz
+        + (numLysK * lightAndHeavyLysKMassDiff) / z
+        + (numArgR * lightAndHeavyArgRMassDiff) / z
+    )
 
-    return (lightMz +
-            (numLysK * lightAndHeavyLysKMassDiff) / z +
-            (numArgR * lightAndHeavyArgRMassDiff) /  z)
 
 def filter_to_only_keep_peptides_with_possibly_heavy_K_or_R_terminal_residue(fdrDf):
     return fdrDf[
         fdrDf["peptide"].str.endswith("R") | fdrDf["peptide"].str.endswith("K")
-        ].reset_index(drop=True)
+    ].reset_index(drop=True)
+
 
 def filter_to_only_keep_top_peptides_unique_to_protein(fdrDf, maxPeptidesPerProtein):
     fdrDf = (
         fdrDf[fdrDf["uniquePeptide"] == 1]
         .sort_values("ionCount", ascending=False)
         .reset_index(drop=True)
     )
-    return fdrDf.groupby(["leadingProtein"]).head(maxPeptidesPerProtein).reset_index(drop=True)
+    return (
+        fdrDf.groupby(["leadingProtein"])
+        .head(maxPeptidesPerProtein)
+        .reset_index(drop=True)
+    )
+
 
 def calculate_mz_of_heavy_isotope_of_each_peptide(fdrDf):
-    return fdrDf.apply(lambda x: calculate_mz_of_heavy_version_of_peptide(x["peptide"],x["MzLIB"],x["zLIB"]), axis=1)
+    return fdrDf.apply(
+        lambda x: calculate_mz_of_heavy_version_of_peptide(
+            x["peptide"], x["MzLIB"], x["zLIB"]
+        ),
+        axis=1,
+    )
+
 
 def make_bin_assignments_for_mz_values(mzValues, binWidth=0.75):
-    bins = np.arange(int(min(mzValues)) - 1, int(max(mzValues) + binWidth*3)+1, binWidth*2)
+    bins = np.arange(
+        int(min(mzValues)) - 1, int(max(mzValues) + binWidth * 3) + 1, binWidth * 2
+    )
     values = np.digitize(mzValues, bins)
     mzBinValues = bins[values]
     mzBinValues -= binWidth
     return mzBinValues
 
-def filter_out_peptides_based_on_user_settings(fdrDf, isIncludeHeavy, maximumPeptidesPerProtein):
-    if isIncludeHeavy:
-        fdrDf = filter_to_only_keep_peptides_with_possibly_heavy_K_or_R_terminal_residue(fdrDf)
+
+def filter_out_peptides_based_on_user_settings(
+    fdrDf, isIncludeHeavyIsotopes, maximumPeptidesPerProtein
+):
+    if isIncludeHeavyIsotopes:
+        fdrDf = (
+            filter_to_only_keep_peptides_with_possibly_heavy_K_or_R_terminal_residue(
+                fdrDf
+            )
+        )
     if maximumPeptidesPerProtein:
-        fdrDf = filter_to_only_keep_top_peptides_unique_to_protein(fdrDf, maximumPeptidesPerProtein)
+        fdrDf = filter_to_only_keep_top_peptides_unique_to_protein(
+            fdrDf, maximumPeptidesPerProtein
+        )
     return fdrDf
 
-def calculate_binning_information_by_compensation_voltage(fdrDf, isIncludeHeavy):
+
+def calculate_binning_information_by_compensation_voltage(
+    fdrDf, isIncludeHeavyIsotopes
+):
     dfs = [
-        organize_for_targeted_reanalysis_of_identified_peptides(df, isIncludeHeavy) for _, df in fdrDf.groupby(["CompensationVoltage"])
+        organize_for_targeted_reanalysis_of_identified_peptides(
+            df, isIncludeHeavyIsotopes
+        )
+        for _, df in fdrDf.groupby(["CompensationVoltage"])
     ]
     return pd.concat(dfs)
 
-def organize_for_targeted_reanalysis_of_identified_peptides(fdrDf, isIncludeHeavy):
+
+def organize_for_targeted_reanalysis_of_identified_peptides(
+    fdrDf, isIncludeHeavyIsotopes
+):
     fdrDf["lightMzBin"] = make_bin_assignments_for_mz_values(fdrDf["MzLIB"])
-    if isIncludeHeavy:
+    if isIncludeHeavyIsotopes:
         fdrDf["heavyMz"] = calculate_mz_of_heavy_isotope_of_each_peptide(fdrDf)
         fdrDf["heavyMzBin"] = make_bin_assignments_for_mz_values(fdrDf["heavyMz"])
     return fdrDf
 
+
 def make_targeted_reanalysis_line(peptide, mz, id):
     formula = ""
     adduct = "(no adduct)"
     genericCharge = 2
-    return [peptide, formula, adduct, mz, genericCharge, id+1]
+    return [peptide, formula, adduct, mz, genericCharge, id + 1]
 
-def consolidate_peptides_by_bin_values(df, isIncludeHeavy):
+
+def consolidate_peptides_by_bin_values(df, isIncludeHeavyIsotopes):
     bins = ["lightMzBin"]
-    if isIncludeHeavy:
+    if isIncludeHeavyIsotopes:
         bins.append("heavyMzBin")
-    return df.groupby(bins).apply(lambda x: format_protein_list_to_string(x["peptide"])).reset_index(name="peptide").sort_values(bins)
+    return (
+        df.groupby(bins)
+        .apply(lambda x: format_protein_list_to_string(x["peptide"]))
+        .reset_index(name="peptide")
+        .sort_values(bins)
+    )
+
 
-def organize_binned_data_for_targeted_reanalysis(condensedDf, isIncludeHeavy):
+def organize_binned_data_for_targeted_reanalysis(condensedDf, isIncludeHeavyIsotopes):
+    """
+    Compiles data into a format that can be read by a mass spectrometer. Note that, in cases where the SILAC protocol
+        is being used, the mass spectrometer can automatically match windows with related peptides provided they have
+        same MSXID (the last column of the output). In this case the output will have paired rows with identical
+        MSXIDs but differing m.z column values, one for the light mz value and one for the heavy mz value.
+    """
     data = [
-         make_targeted_reanalysis_line(condensedDf.loc[i]["peptide"], condensedDf.loc[i]["lightMzBin"], i) for i in range(len(condensedDf.index))
+        make_targeted_reanalysis_line(
+            condensedDf.loc[i]["peptide"], condensedDf.loc[i]["lightMzBin"], i
+        )
+        for i in range(len(condensedDf.index))
     ]
-    if isIncludeHeavy:
+    if isIncludeHeavyIsotopes:
         data.extend(
             [
-                make_targeted_reanalysis_line(condensedDf.loc[i]["peptide"], condensedDf.loc[i]["heavyMzBin"], i) for i
-                in range(len(condensedDf.index))
-
+                make_targeted_reanalysis_line(
+                    condensedDf.loc[i]["peptide"], condensedDf.loc[i]["heavyMzBin"], i
+                )
+                for i in range(len(condensedDf.index))
             ]
         )
     return data
 
-def create_targeted_reanalysis_dataframe(df, isIncludeHeavy):
-    consolidatedDf = consolidate_peptides_by_bin_values(df, isIncludeHeavy)
-    targetedReanalysisData = organize_binned_data_for_targeted_reanalysis(consolidatedDf, isIncludeHeavy)
-    return pd.DataFrame(targetedReanalysisData, columns=["Compound","Formula","Adduct","m.z","z","MSXID"]).sort_values(["MSXID","m.z"]).reset_index(drop=True)
+
+def create_targeted_reanalysis_dataframe(df, isIncludeHeavyIsotopes):
+    consolidatedDf = consolidate_peptides_by_bin_values(df, isIncludeHeavyIsotopes)
+    targetedReanalysisData = organize_binned_data_for_targeted_reanalysis(
+        consolidatedDf, isIncludeHeavyIsotopes
+    )
+    return (
+        pd.DataFrame(
+            targetedReanalysisData,
+            columns=["Compound", "Formula", "Adduct", "m.z", "z", "MSXID"],
+        )
+        .sort_values(["MSXID", "m.z"])
+        .reset_index(drop=True)
+    )
+
 
 def make_cv_header(cvValue):
     if cvValue:
-        return f'CV_{str(abs(int(cvValue)))}'
+        return f"CV_{str(abs(int(cvValue)))}"
     else:
-        return 'noCV'
+        return "noCV"
 
-def create_targeted_reanalysis_dataframe_by_compensation_voltage(fdrDf, isIncludeHeavy):
+
+def create_targeted_reanalysis_dataframes_by_compensation_voltage(
+    fdrDf, isIncludeHeavyIsotopes
+):
     return {
-        make_cv_header(cv[0]): create_targeted_reanalysis_dataframe(df, isIncludeHeavy) for cv, df in fdrDf.groupby(["CompensationVoltage"])
+        make_cv_header(cv[0]): create_targeted_reanalysis_dataframe(
+            df, isIncludeHeavyIsotopes
+        )
+        for cv, df in fdrDf.groupby(["CompensationVoltage"])
     }
 
-def create_mass_spec_input_dataframes_for_targeted_reanalysis_of_identified_peptides(fdrDf, isIncludeHeavy=False, maximumPeptidesPerProtein=0):
-    fdrDf = filter_out_peptides_based_on_user_settings(fdrDf, isIncludeHeavy, maximumPeptidesPerProtein)
-    fdrDf = organize_for_targeted_reanalysis_of_identified_peptides(fdrDf, isIncludeHeavy)
-    outputDfDict = create_targeted_reanalysis_dataframe_by_compensation_voltage(fdrDf, isIncludeHeavy)
+
+def create_mass_spec_input_dataframes_for_targeted_reanalysis_of_identified_peptides(
+    fdrDf, isIncludeHeavyIsotopes=False, maximumPeptidesPerProtein=0
+):
+    """
+    Creates dataframes with data to be fed into a mass spectrometer for targetted reanalysis of identified peptides.
+
+    Extended Summary
+    ----------------
+    After peptide identification, additional experimentation is often required to verify or expand upon the
+        significance of what was identified. For instance, some researchers would want to identify the relative
+        quantity of peptides under different experimental conditions. Using Stable Isotope Labeling by Amino acids
+        in Cell culture (SILAC), one can develop a cell culture where proteins use a heavier isotope of lysine and
+        arginine. When targeting these peptides of interest, one can then compare the intensity of the "lighter"
+        peptide isotope in one condition to the "heavier" peptide isotope in another, determining relative quantity
+        changes between conditions. Thus, after identifying specific proteins and/or peptides using the identification
+        workflow, one would need to prime the next mass spectrometer run to target those specific peptides only, one run
+        for each category (light and heavy). This function generates tables that can be fed into a mass spectrometer
+        to target specific peptides of interest.
+
+    Note that as an aid to the researchers running the mass spectrometer, all targeted m/z values within a given
+        m/z window are binned to reduce redundant window analysis.
+
+    Also note that, for targeted reanalysis of data that utilized compensation voltage, a separate mass spectrometer
+        input file must be generated for each. As such, a different dataset is made for each one. When no compensation
+        voltage is provided, a single file will be generated. To keep these potentially variable compensation voltage
+        tables separate, the return value is a dictionary highlighting the exact compensation voltage (or lack thereof)
+        in the key name with the dataframe as the value. A summary file of all tables is also provided.
+
+    Parameters
+    ----------
+    fdrDf : pandas DataFrame
+    A dataframe containing peptides of interest. Note that this could be peptides belonging to specific identified
+        proteins or just identified peptides generally, as indicated by the maximumPeptidesPerProtein variable.
+
+    isIncludeHeavyIsotopes : boolean
+    A boolean indicating if the experiment is to be primed for a SILAC-based mass spectrometry run.
+
+    maximumPeptidesPerProtein : int
+    The maximum number of peptides to be identified per protein of interest. In the case where peptides generally are
+        being targetted this parameter is set to 0.
+
+    Returns
+    -------
+    outputDfDict : dict
+        A dictionary containing the targeted reanalysis files in addition to a summary file.
+        'fullDf' : summary dataframe
+        all other keys: target reanalysis dataframes, broken down by compensation voltage.
+
+    """
+    fdrDf = filter_out_peptides_based_on_user_settings(
+        fdrDf, isIncludeHeavyIsotopes, maximumPeptidesPerProtein
+    )
+    fdrDf = organize_for_targeted_reanalysis_of_identified_peptides(
+        fdrDf, isIncludeHeavyIsotopes
+    )
+    outputDfDict = create_targeted_reanalysis_dataframes_by_compensation_voltage(
+        fdrDf, isIncludeHeavyIsotopes
+    )
     outputDfDict["fullDf"] = fdrDf
     return outputDfDict
diff --git a/tests/system/test_identifier.py b/tests/system/test_identifier.py
@@ -1,13 +1,16 @@
 import pandas as pd
 import numpy as np
-from csodiaq.identifier import Identifier, create_mass_spec_input_dataframes_for_targeted_reanalysis_of_identified_peptides
+from csodiaq.identifier import (
+    Identifier,
+    create_mass_spec_input_dataframes_for_targeted_reanalysis_of_identified_peptides,
+)
 from csodiaq.loaders.query import QueryLoaderContext
 import os
 import pickle
 import pytest
 
-pd.set_option("display.max_columns",None)
-pd.set_option("display.max_rows",None)
+pd.set_option("display.max_columns", None)
+pd.set_option("display.max_rows", None)
 
 
 def get_parent_dir():
@@ -148,17 +151,53 @@ def test__identifier__main_workflow(commandLineArgs):
         expectedProteinDf, outputDict["proteinFDR"], "protein"
     )
 
-    targetedOutputDict = create_mass_spec_input_dataframes_for_targeted_reanalysis_of_identified_peptides(outputDict["proteinFDR"], isIncludeHeavy=True, maximumPeptidesPerProtein=1)
-    targetedOutputDict["fullDf"] = targetedOutputDict["fullDf"].drop(["fileName"], axis=1)
+    targetedOutputDict = create_mass_spec_input_dataframes_for_targeted_reanalysis_of_identified_peptides(
+        outputDict["proteinFDR"],
+        isIncludeHeavyIsotopes=True,
+        maximumPeptidesPerProtein=1,
+    )
+    targetedOutputDict["fullDf"] = targetedOutputDict["fullDf"].drop(
+        ["fileName"], axis=1
+    )
 
     expectedAllCVDf = pd.read_csv(get_file_from_system_test_folder("allCVs.csv"))
     expectedAllCVDf = expectedAllCVDf.drop(["fileName"], axis=1)
-    expected30Df = pd.read_csv(get_file_from_system_test_folder("targetedReanalysis_mostIntenseTargs_CV_30.txt"), sep='\t').fillna('')
-    expected40Df = pd.read_csv(get_file_from_system_test_folder("targetedReanalysis_mostIntenseTargs_CV_40.txt"), sep='\t').fillna('')
-    expected50Df = pd.read_csv(get_file_from_system_test_folder("targetedReanalysis_mostIntenseTargs_CV_50.txt"), sep='\t').fillna('')
-    expected60Df = pd.read_csv(get_file_from_system_test_folder("targetedReanalysis_mostIntenseTargs_CV_60.txt"), sep='\t').fillna('')
-    expected70Df = pd.read_csv(get_file_from_system_test_folder("targetedReanalysis_mostIntenseTargs_CV_70.txt"), sep='\t').fillna('')
-    expected80Df = pd.read_csv(get_file_from_system_test_folder("targetedReanalysis_mostIntenseTargs_CV_80.txt"), sep='\t').fillna('')
+    expected30Df = pd.read_csv(
+        get_file_from_system_test_folder(
+            "targetedReanalysis_mostIntenseTargs_CV_30.txt"
+        ),
+        sep="\t",
+    ).fillna("")
+    expected40Df = pd.read_csv(
+        get_file_from_system_test_folder(
+            "targetedReanalysis_mostIntenseTargs_CV_40.txt"
+        ),
+        sep="\t",
+    ).fillna("")
+    expected50Df = pd.read_csv(
+        get_file_from_system_test_folder(
+            "targetedReanalysis_mostIntenseTargs_CV_50.txt"
+        ),
+        sep="\t",
+    ).fillna("")
+    expected60Df = pd.read_csv(
+        get_file_from_system_test_folder(
+            "targetedReanalysis_mostIntenseTargs_CV_60.txt"
+        ),
+        sep="\t",
+    ).fillna("")
+    expected70Df = pd.read_csv(
+        get_file_from_system_test_folder(
+            "targetedReanalysis_mostIntenseTargs_CV_70.txt"
+        ),
+        sep="\t",
+    ).fillna("")
+    expected80Df = pd.read_csv(
+        get_file_from_system_test_folder(
+            "targetedReanalysis_mostIntenseTargs_CV_80.txt"
+        ),
+        sep="\t",
+    ).fillna("")
 
     for column in targetedOutputDict["fullDf"].columns:
         expectedColumn = np.array(expectedAllCVDf[column])

diff --git a/tests/unit/identifier/test_outputFormattingFunctions.py b/tests/unit/identifier/test_outputFormattingFunctions.py
@@ -13,6 +13,7 @@
 import pytest
 import numpy as np
 
+
 @pytest.fixture
 def identifierOutputData():
     return [
@@ -340,5 +341,3 @@ def test__output_formatting_functions__determine_if_peptides_are_unique_to_leadi
     ]
     output = determine_if_peptides_are_unique_to_leading_protein(inputDf)
     assert expectedOutput == output
-
-