Skip to content

Commit

Permalink
abandon "openms.tsv"
Browse files Browse the repository at this point in the history
  • Loading branch information
WangHong007 committed Jul 4, 2022
1 parent abbd2c1 commit 430ae79
Show file tree
Hide file tree
Showing 5 changed files with 34 additions and 49 deletions.
53 changes: 18 additions & 35 deletions bin/diann_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,14 @@ def cli():
@click.option("--pg_matrix", "-pg")
@click.option("--pr_matrix", "-pr")
@click.option("--unique_matrix", "-un")
@click.option("--openms", "-o")
@click.option("--dia_params", "-p")
@click.option("--fasta", "-f")
@click.option("--charge", "-c")
@click.option("--missed_cleavages", "-m")
@click.option("--qvalue_threshold", "-q", type=float)
@click.pass_context

def convert(ctx, diann_report, exp_design, pg_matrix, pr_matrix, unique_matrix, openms, fasta, charge, missed_cleavages, qvalue_threshold):
def convert(ctx, diann_report, exp_design, pg_matrix, pr_matrix, unique_matrix, dia_params, fasta, charge, missed_cleavages, qvalue_threshold):
"""This function is designed to convert the DIA-NN output into three standard formats: MSstats, Triqler and mzTab. These documents are
used for quality control and downstream analysis.
Expand All @@ -41,8 +41,8 @@ def convert(ctx, diann_report, exp_design, pg_matrix, pr_matrix, unique_matrix,
:type pr_matrix: str
:param unique_matrix: Path to a DIA-NN matrix file containing unique genes
:type unique_matrix: str
:param openms: Path to OpenMS, one of the workflow configuration files
:type openms: str
:param dia_params: A list contains DIA parameters
:type dia_params: list
:param fasta: Path to the fasta file
:type fasta: str
:param charge: The charge assigned by DIA-NN(max_precursor_charge)
Expand All @@ -52,12 +52,9 @@ def convert(ctx, diann_report, exp_design, pg_matrix, pr_matrix, unique_matrix,
:param qvalue_threshold: Threshold for filtering q value
:type qvalue_threshold: float
"""

pg = pd.read_csv(pg_matrix, sep = "\t", header = 0, dtype = "str")
pr = pd.read_csv(pr_matrix, sep = "\t", header = 0, dtype = "str")
unique = pd.read_csv(unique_matrix, sep = "\t", header = 0, dtype = "str")
opms = pd.read_csv(openms, sep = "\t", header = 0, dtype = "str")
opms.fillna("null", inplace=True)
report = pd.read_csv(diann_report, sep = "\t", header = 0, dtype = "str")
report["Calculate.Precursor.Mz"] = report.apply(lambda x: molecular_weight(x["Stripped.Sequence"], 'protein', monoisotopic=True) / int(x["Precursor.Charge"]), axis=1)
precursor_list = list(report["Precursor.Id"].unique())
Expand Down Expand Up @@ -125,7 +122,7 @@ def convert(ctx, diann_report, exp_design, pg_matrix, pr_matrix, unique_matrix,
index_ref.loc[:, "study_variable"] = index_ref.loc[:, "study_variable"].astype("int")
report[["ms_run", "study_variable"]] = report.apply(lambda x: add_info(x["Run"], index_ref), axis = 1, result_type = "expand")

(MTD, database) = mztab_MTD(index_ref, opms, unimod_data, fasta, charge, missed_cleavages)
(MTD, database) = mztab_MTD(index_ref, dia_params, unimod_data, fasta, charge, missed_cleavages)
PRH = mztab_PRH(report, pg, unique, index_ref, database, fasta_pd)
PEH = mztab_PEH(report, pr, unique, unimod_data, precursor_list, index_ref, database)
PSH = mztab_PSH(unique, unimod_data, report, database)
Expand All @@ -152,7 +149,6 @@ def query_expdesign_value(reference, f_table, s_table):
:return: A tuple contains Fraction, BioReplicate and Condition
:rtype: tuple
"""

query_reference = f_table[f_table["run"] == reference]
Fraction = query_reference["Fraction"].values[0]
row = s_table[s_table["Sample"] == query_reference["Sample"].values[0]]
Expand All @@ -172,7 +168,6 @@ def convert_modification(peptide, unimod_data):
:return: Peptide contains the modification names
:rtype: str
"""

pattern = re.compile(r"\((.*?)\)")
origianl_mods = re.findall(pattern, peptide)
for mod in set(origianl_mods):
Expand All @@ -188,14 +183,13 @@ def MTD_mod_info(unimod_database, fix_mod, var_mod):
:param unimod_database: An instance of class UnimodDatabase
:type unimod_database: sdrf_pipelines.openms.unimod.UnimodDatabase
:param fix_mod: Fixed modifications from openms.tsv
:param fix_mod: Fixed modifications from DIA parameter list
:type fix_mod: str
:param var_mod: Variable modifications from openms.tsv
:param var_mod: Variable modifications from DIA parameter list
:type var_mod: str
:return: A tuple contains fixed and variable modifications, and flags indicating whether they are null
:rtype: tuple
"""

pattern = re.compile("\((.*?)\)")
var_ptm = []
fix_ptm = []
Expand Down Expand Up @@ -231,13 +225,13 @@ def MTD_mod_info(unimod_database, fix_mod, var_mod):
return fix_ptm, var_ptm, fix_flag, var_flag


def mztab_MTD(index_ref, opms, unimod_data, fasta, charge, missed_cleavages):
def mztab_MTD(index_ref, dia_params, unimod_data, fasta, charge, missed_cleavages):
"""Construct MTD sub-table.
:param index_ref: On the basis of f_table, two columns "MS_run" and "study_variable" are added for matching
:type indx_ref: pandas.core.frame.DataFrame
:param opms: Dataframe for openms.tsv
:type opms: pandas.core.frame.DataFrame
:param dia_params: A list contains DIA parameters
:type dia_params: list
:param unimod_data: An instance of class UnimodDatabase
:type unimod_data: sdrf_pipelines.openms.unimod.UnimodDatabase
:param fasta: Fasta file path
Expand All @@ -249,15 +243,14 @@ def mztab_MTD(index_ref, opms, unimod_data, fasta, charge, missed_cleavages):
:return: MTD sub-table
:rtype: pandas.core.frame.DataFrame
"""

FragmentMassTolerance = opms["FragmentMassTolerance"].values[0]
FragmentMassToleranceUnit = opms["FragmentMassToleranceUnit"].values[0]
PrecursorMassTolerance = opms["PrecursorMassTolerance"].values[0]
PrecursorMassToleranceUnit = opms["PrecursorMassToleranceUnit"].values[0]
Enzyme = opms["Enzyme"].values[0]
FixedModifications = opms["FixedModifications"].values[0]
VariableModifications = opms["VariableModifications"].values[0]

dia_params_list = dia_params.split(";")
FragmentMassTolerance = dia_params_list[0]
FragmentMassToleranceUnit = dia_params_list[1]
PrecursorMassTolerance = dia_params_list[2]
PrecursorMassToleranceUnit = dia_params_list[3]
Enzyme = dia_params_list[4]
FixedModifications = dia_params_list[5]
VariableModifications = dia_params_list[6]
out_mztab_MTD = pd.DataFrame()
out_mztab_MTD.loc[1, "mzTab-version"] = "1.0.0"
out_mztab_MTD.loc[1, "mzTab-mode"] = "Summary"
Expand Down Expand Up @@ -350,7 +343,6 @@ def mztab_PRH(report, pg, unique, index_ref, database, fasta_pd):
:return: PRH sub-table
:rtype: pandas.core.frame.DataFrame
"""

file = list(pg.columns[5:])
col = {}
for i in file:
Expand Down Expand Up @@ -422,7 +414,6 @@ def mztab_PEH(report, pr, unique, unimod_data, precursor_list, index_ref, databa
:return: PEH sub-table
:rtype: pandas.core.frame.DataFrame
"""

out_mztab_PEH = pd.DataFrame()
out_mztab_PEH = pr.iloc[:, 0:10]
out_mztab_PEH = out_mztab_PEH.drop(["Protein.Group", "Protein.Names", "First.Protein.Description", "Proteotypic"], axis = 1)
Expand Down Expand Up @@ -491,7 +482,6 @@ def mztab_PSH(unique, unimod_data, report, database):
:return: PSH sub-table
:rtype: pandas.core.frame.DataFrame
"""

out_mztab_PSH = pd.DataFrame()
## Score at PSM level: Q.Value
out_mztab_PSH = report[["Stripped.Sequence", "Protein.Ids", "Genes", "Q.Value", "RT",
Expand Down Expand Up @@ -536,7 +526,6 @@ def add_info(target, index_ref):
:return: A tuple contains ms_run and study_variable
:rtype: tuple
"""

match = index_ref[index_ref["run"] == target]
ms_run = match["ms_run"].values[0]
study_variable = match["study_variable"].values[0]
Expand All @@ -558,7 +547,6 @@ def classify_protein(target, unique, t, f):
:return: The signature of genes
:rtype: str
"""

if any(unique == target):
return t
else:
Expand All @@ -577,7 +565,6 @@ def calculate_protein_coverage(report, target, fasta_pd):
:return: Protein coverage
:rtype: str
"""

protein_coverage = ""
len_current = len(max(report[report["Protein.Ids"] == target]["Stripped.Sequence"].values, key = len))
for i in target.split(";"):
Expand All @@ -603,7 +590,6 @@ def match_in_report(report, target, max, flag, level):
:return: A tuple contains multiple messages
:rtype: tuple
"""

if flag == 1 and level == 'pep':
result = report[report['precursor.Index'] == target]
PEH_params = []
Expand Down Expand Up @@ -642,7 +628,6 @@ def PRH_match_report(report, target):
:return: A tuple contains multiple information to construct PRH sub-table
:rtype: tuple
"""

match = report[report["Protein.Ids"] == target]
modSeq = match["Modified.Sequence"].values[0] if match["Modified.Sequence"].values.size > 0 else np.nan
## Score at protein level: Global.PG.Q.Value (without MBR)
Expand All @@ -661,7 +646,6 @@ def PEH_match_report(report, target):
:return: A tuple contains multiple information to construct PEH sub-table
:rtype: tuple
"""

match = report[report["precursor.Index"] == target]
## Score at peptide level: the minimum of the respective precursor q-values (minimum of Q.Value per group)
search_score = match["Q.Value"].min()
Expand All @@ -681,7 +665,6 @@ def find_modification(peptide):
:return: Modification sites
:rtype: str
"""

pattern = re.compile(r"\((.*?)\)")
original_mods = re.findall(pattern, peptide)
peptide = re.sub('\(.*?\)','.',peptide)
Expand Down
22 changes: 12 additions & 10 deletions modules/local/diannconvert/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -15,32 +15,34 @@ process DIANNCONVERT {
path(report_pg)
path(report_pr)
path(report_unique_gene)
path(openms)
val(meta)
path(fasta)
val(charge)
val(missed_cleavages)

output:
path "*msstats_in.csv", emit: out_msstats
path "*triqler_in.tsv", emit: out_triqler
path "*out.mztab", emit: out_mztab
path "*.mztab", emit: out_mztab
path "versions.yml", emit: version

script:
def args = task.ext.args ?: ''
def dia_params = [meta.fragmentmasstolerance, meta.fragmentmasstoleranceunit, meta.precursormasstolerance,
meta.precursormasstoleranceunit, meta.enzyme, meta.fixedmodifications, meta.variablemodifications].join(';')

"""
diann_convert.py convert \\
--diann_report ${report} \\
--exp_design ${exp_design} \\
--pg_matrix ${report_pg} \\
--pr_matrix ${report_pr} \\
--unique_matrix ${report_unique_gene} \\
--openms ${openms} \\
--fasta ${fasta} \\
--diann_report "${report}" \\
--exp_design "${exp_design}" \\
--pg_matrix "${report_pg}" \\
--pr_matrix "${report_pr}" \\
--unique_matrix "${report_unique_gene}" \\
--dia_params "${dia_params}" \\
--fasta "${fasta}" \\
--charge ${charge} \\
--missed_cleavages ${missed_cleavages} \\
--qvalue_threshold $params.protein_level_fdr_cutoff \\
--qvalue_threshold ${params.protein_level_fdr_cutoff} \\
|& tee convert_report.log
cat <<-END_VERSIONS > versions.yml
Expand Down
2 changes: 1 addition & 1 deletion subworkflows/local/create_input_channel.nf
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ workflow CREATE_INPUT_CHANNEL {
ch_meta_config_iso // [meta, [spectra_files ]]
ch_meta_config_lfq // [meta, [spectra_files ]]
ch_meta_config_dia // [meta, [spectra files ]]
ch_config

ch_expdesign

version = ch_versions
Expand Down
4 changes: 2 additions & 2 deletions workflows/dia.nf
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ workflow DIA {
take:
ch_file_preparation_results
ch_expdesign
ch_config

main:

Expand Down Expand Up @@ -86,7 +85,8 @@ workflow DIA {
//
// MODULE: DIANNCONVERT
//
DIANNCONVERT(DIANNSUMMARY.out.main_report, ch_expdesign, DIANNSUMMARY.out.pg_matrix, DIANNSUMMARY.out.pr_matrix, DIANNSUMMARY.out.unique_gene_matrix, ch_config, params.database, params.max_precursor_charge, params.allowed_missed_cleavages)
DIANNCONVERT(DIANNSUMMARY.out.main_report, ch_expdesign, DIANNSUMMARY.out.pg_matrix, DIANNSUMMARY.out.pr_matrix, DIANNSUMMARY.out.unique_gene_matrix,
ch_result.meta.first(), params.database, params.max_precursor_charge, params.allowed_missed_cleavages)
ch_software_versions = ch_software_versions.mix(DIANNCONVERT.out.version.ifEmpty(null))

//
Expand Down
2 changes: 1 addition & 1 deletion workflows/quantms.nf
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ workflow QUANTMS {
ch_msstats_in = ch_msstats_in.mix(LFQ.out.msstats_in)
ch_versions = ch_versions.mix(LFQ.out.versions.ifEmpty(null))

DIA(ch_fileprep_result.dia, CREATE_INPUT_CHANNEL.out.ch_expdesign, CREATE_INPUT_CHANNEL.out.ch_config)
DIA(ch_fileprep_result.dia, CREATE_INPUT_CHANNEL.out.ch_expdesign)
ch_pipeline_results = ch_pipeline_results.mix(DIA.out.diann_report)
ch_msstats_in = ch_msstats_in.mix(DIA.out.msstats_in)
ch_versions = ch_versions.mix(DIA.out.versions.ifEmpty(null))
Expand Down

0 comments on commit 430ae79

Please sign in to comment.