Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dev #264

Merged
merged 23 commits into from
Mar 17, 2023
Merged

Dev #264

Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
e069f8e
small changes for review PR https://github.com/nf-core/quantms/pull/48
ypriverol Mar 16, 2023
4f905e7
small changes for review PR https://github.com/nf-core/quantms/pull/48
ypriverol Mar 16, 2023
29a2d24
small changes for review PR https://github.com/nf-core/quantms/pull/48
ypriverol Mar 16, 2023
0ca2cb5
small changes for review PR https://github.com/nf-core/quantms/pull/48
ypriverol Mar 16, 2023
f044239
sort imports
fabianegli Mar 17, 2023
908484e
remove unused variable assignment
fabianegli Mar 17, 2023
1883fc1
unobfuscate direct use of logging.error
fabianegli Mar 17, 2023
dc728c4
do not shadow the builtin function max
fabianegli Mar 17, 2023
c438127
do not shadow builtin function id
fabianegli Mar 17, 2023
0528816
simplify path operations with pathlib
fabianegli Mar 17, 2023
71042e1
remove unnecessary close after context
fabianegli Mar 17, 2023
ec49df9
black
fabianegli Mar 17, 2023
2ceed76
map protease to specificity with a dict
fabianegli Mar 17, 2023
c816abf
replace one letter variable names with longer ones
fabianegli Mar 17, 2023
0e062f5
fix Trypsin/P cleavage specificity
fabianegli Mar 17, 2023
4df3260
add some type annotations
fabianegli Mar 17, 2023
0bb7431
mzmlstatistics to tuple input
ypriverol Mar 17, 2023
2c319cd
small changes.
ypriverol Mar 17, 2023
9b3fc0c
fix type annotation in docstring
fabianegli Mar 17, 2023
61a14ba
test_lfq used also comet
ypriverol Mar 17, 2023
766bec5
Merge pull request #76 from fabianegli/contribution-to-v1-1
ypriverol Mar 17, 2023
7848567
Merge branch 'dev' of https://github.com/nf-core/quantms into dev
ypriverol Mar 17, 2023
2306abe
small change.
ypriverol Mar 17, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .prettierignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,5 @@ testing/
testing*
*.pyc
bin/
venv/

41 changes: 22 additions & 19 deletions bin/diann_convert.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
#!/usr/bin/env python

"""
This script converts the output from DIA-NN into three standard formats: MSstats, Triqler and mzTab.
License: Apache 2.0
Authors: Hong Wong, Yasset Perez-Riverol
"""
import logging
import os
import re

import click
import numpy as np
import pandas as pd
import logging as log
from pyopenms import AASequence, FASTAFile, ModificationsDB

CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])
Expand Down Expand Up @@ -65,22 +70,20 @@ def convert(ctx, folder, dia_params, diann_version, charge, missed_cleavages, qv

for item in pathdict.items():
if item[0] != "mzml_info" and len(item[1]) > 1:
log.error(f"{item[0]} is duplicate, check whether the file is redundant or change the file name!")
logging.error(f"{item[0]} is duplicate, check whether the file is redundant or change the file name!")

diann_report = folder + pathdict["report"][0]
exp_design = folder + pathdict["exp_design"][0]
pg_matrix = folder + pathdict["pg_matrix"][0]
pr_matrix = folder + pathdict["pr_matrix"][0]
fasta = folder + pathdict["fasta"][0]
diann_version_file = diann_version
mzml_info = pathdict["mzml_info"]

with open(diann_version_file) as f:
for line in f:
if "DIA-NN" in line:
diann_version_id = line.rstrip("\n").split(": ")[1]
break
f.close()

remain_cols = [
"File.Name",
Expand Down Expand Up @@ -774,15 +777,16 @@ def findstr(basestr, s, resultlist):
return protein_coverage


def match_in_report(report, target, max, flag, level):
"""This function is used to match the columns "ms_run" and "study_variable" in the report to get the information.
def match_in_report(report, target, max_, flag, level):
"""This function is used to match the columns "ms_run" and "study_variable" from the report and
get the corresponding information for the mztab ms_run and study_values metadata values.

:param report: Dataframe for Dia-NN main report
:type report: pandas.core.frame.DataFrame
:param target: The value of "pr_id" column in out_mztab_PEH(level="peptide") or the "accession" column in out_mztab_PRH(level="protein")
:type target: str
:param max: max_assay or max_study_variable
:type max: int
:param max_: max_assay or max_study_variable
:type max_: int
:param flag: Match the "study_variable" column(flag=1) or the "ms_run" column(flag=0) in the filter result
:type flag: int
:param level: "pep" or "protein"
Expand All @@ -793,7 +797,7 @@ def match_in_report(report, target, max, flag, level):
if flag == 1 and level == "pep":
result = report[report["precursor.Index"] == target]
PEH_params = []
for i in range(1, max + 1):
for i in range(1, max_ + 1):
match = result[result["study_variable"] == i]
PEH_params.extend([match["Precursor.Normalised"].mean(), "null", "null", "null", match["RT.Start"].mean()])

Expand All @@ -802,7 +806,7 @@ def match_in_report(report, target, max, flag, level):
if flag == 0 and level == "pep":
result = report[report["precursor.Index"] == target]
q_value = []
for i in range(1, max + 1):
for i in range(1, max_ + 1):
match = result[result["ms_run"] == i]
q_value.append(match["Q.Value"].values[0] if match["Q.Value"].values.size > 0 else np.nan)

Expand All @@ -811,7 +815,7 @@ def match_in_report(report, target, max, flag, level):
if flag == 1 and level == "protein":
result = report[report["Protein.Ids"] == target]
PRH_params = []
for i in range(1, max + 1):
for i in range(1, max_ + 1):
match = result[result["study_variable"] == i]
PRH_params.extend([match["PG.MaxLFQ"].mean(), "null", "null"])

Expand Down Expand Up @@ -882,14 +886,13 @@ def find_modification(peptide):


def calculate_mz(seq, charge):
"""Remove unknown aminoacids and calculate mz

:param seq: Sequences of peptides
:type seq: str
:param charge: charge of peptides
"""
Calculate the precursor m/z based on the peptide sequence and charge state.
:param seq: Sequence peptide
:type seq: str
:return: mz
:rtype: float or NoneType
:param charge: charge state
:type charge: int
:return:
"""
ref = "ARNDBCEQZGHILKMFPSTWYV"
seq = "".join([i for i in seq if i in ref])
Expand Down
35 changes: 18 additions & 17 deletions bin/mzml_statistics.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
#!/usr/bin/env python

from pyopenms import MzMLFile, MSExperiment
import os
import pandas as pd
import sys
from pathlib import Path

import pandas as pd
from pyopenms import MSExperiment, MzMLFile


def mzml_dataframe(mzml_path):
def mzml_dataframe(mzml_path: str) -> None:
file_columns = [
"SpectrumID",
"MSLevel",
Expand All @@ -17,34 +18,34 @@ def mzml_dataframe(mzml_path):
"Exp_Mass_To_Charge",
]

def parse_mzml(file_name, file_columns):
def parse_mzml(file_name: str, file_columns: list):
info = []
exp = MSExperiment()
MzMLFile().load(file_name, exp)
for i in exp:
id = i.getNativeID()
MSLevel = i.getMSLevel()
rt = i.getRT() if i.getRT() else None
for spectrum in exp:
id_ = spectrum.getNativeID()
MSLevel = spectrum.getMSLevel()
rt = spectrum.getRT() if spectrum.getRT() else None
if MSLevel == 2:
charge_state = i.getPrecursors()[0].getCharge()
emz = i.getPrecursors()[0].getMZ() if i.getPrecursors()[0].getMZ() else None
peaks_tuple = i.get_peaks()
charge_state = spectrum.getPrecursors()[0].getCharge()
emz = spectrum.getPrecursors()[0].getMZ() if spectrum.getPrecursors()[0].getMZ() else None
peaks_tuple = spectrum.get_peaks()
peak_per_ms2 = len(peaks_tuple[0])
if i.getMetaValue("base peak intensity"):
base_peak_intensity = i.getMetaValue("base peak intensity")
if spectrum.getMetaValue("base peak intensity"):
base_peak_intensity = spectrum.getMetaValue("base peak intensity")
else:
base_peak_intensity = max(peaks_tuple[1]) if len(peaks_tuple[1]) > 0 else None
info_list = [id, 2, charge_state, peak_per_ms2, base_peak_intensity, rt, emz]
info_list = [id_, 2, charge_state, peak_per_ms2, base_peak_intensity, rt, emz]
else:
info_list = [id, MSLevel, None, None, None, rt, None]
info_list = [id_, MSLevel, None, None, None, rt, None]

info.append(info_list)

return pd.DataFrame(info, columns=file_columns)

mzml_df = parse_mzml(mzml_path, file_columns)
mzml_df.to_csv(
"{}_mzml_info.tsv".format(os.path.splitext(os.path.split(mzml_path)[1])[0]),
f"{Path(mzml_path).stem}_mzml_info.tsv",
mode="a",
sep="\t",
index=False,
Expand Down
47 changes: 22 additions & 25 deletions bin/prepare_diann_parameters.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#!/usr/bin/env python

import re
from typing import List, Tuple

import click
from sdrf_pipelines.openms.unimod import UnimodDatabase
Expand Down Expand Up @@ -32,21 +33,21 @@ def generate_cfg(ctx, enzyme, fix_mod, var_mod):
for mod in var_ptm:
diann_var_ptm += var_ptm_str + mod

with open("diann_config.cfg", "w") as f:
f.write("--cut " + cut + diann_fix_ptm + diann_var_ptm)
with open("diann_config.cfg", "w") as file:
file.write("--cut " + cut + diann_fix_ptm + diann_var_ptm)


def convert_mod(unimod_database, fix_mod, var_mod):
def convert_mod(unimod_database, fix_mod: str, var_mod: str) -> Tuple[List, List]:
pattern = re.compile(r"\((.*?)\)")
var_ptm = []
fix_ptm = []

if fix_mod != "":
for mod in fix_mod.split(","):
tag = 0
for m in unimod_database.modifications:
if m.get_name() == mod.split(" ")[0]:
diann_mod = m.get_name() + "," + str(m._delta_mono_mass)
for modification in unimod_database.modifications:
if modification.get_name() == mod.split(" ")[0]:
diann_mod = modification.get_name() + "," + str(modification._delta_mono_mass)
tag = 1
break
if tag == 0:
Expand All @@ -66,9 +67,9 @@ def convert_mod(unimod_database, fix_mod, var_mod):
if var_mod != "":
for mod in var_mod.split(","):
tag = 0
for m in unimod_database.modifications:
if m.get_name() == mod.split(" ")[0]:
diann_mod = m.get_name() + "," + str(m._delta_mono_mass)
for modification in unimod_database.modifications:
if modification.get_name() == mod.split(" ")[0]:
diann_mod = modification.get_name() + "," + str(modification._delta_mono_mass)
tag = 1
break
if tag == 0:
Expand All @@ -88,22 +89,18 @@ def convert_mod(unimod_database, fix_mod, var_mod):
return fix_ptm, var_ptm


def enzyme_cut(enzyme):
if enzyme == "Trypsin":
cut = "K*,R*,!*P"
elif enzyme == "Trypsin/P":
cut = "K*,R*,*P"
elif enzyme == "Arg-C":
cut = "R*,!*P"
elif enzyme == "Asp-N":
cut = "*B,*D"
elif enzyme == "Chymotrypsin":
cut = "F*,W*,Y*,L*,!*P"
elif enzyme == "Lys-C":
cut = "K*,!*P"
else:
cut = "--cut"
return cut
# DIA-NN cleavage-specificity strings keyed by enzyme name.
# "!*P" forbids cleavage before proline; Trypsin/P cleaves K/R regardless of
# a following proline (hence no "!*P" entry for it).
_ENZYME_SPECIFICITY = {
    "Trypsin": "K*,R*,!*P",
    "Trypsin/P": "K*,R*",
    "Arg-C": "R*,!*P",
    "Asp-N": "*B,*D",
    "Chymotrypsin": "F*,W*,Y*,L*,!*P",
    "Lys-C": "K*,!*P",
}


def enzyme_cut(enzyme: str) -> str:
    """Return the DIA-NN ``--cut`` specificity string for the given enzyme.

    :param enzyme: enzyme name (e.g. "Trypsin", "Lys-C")
    :type enzyme: str
    :return: cleavage specificity string; the literal "--cut" for unknown enzymes
        (NOTE(review): the caller prepends "--cut ", so an unknown enzyme yields
        "--cut --cut" — confirm this sentinel is intentional)
    :rtype: str
    """
    # dict.get with an explicit default instead of `.get(...) or "--cut"`:
    # the `or` form would also fall back on a falsy (empty-string) mapping value.
    return _ENZYME_SPECIFICITY.get(enzyme, "--cut")


cli.add_command(generate_cfg)
Expand Down
2 changes: 1 addition & 1 deletion conf/test_lfq.config
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ params {
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/quantms/testdata/lfq_ci/BSA/BSA_design_urls.tsv'
database = 'https://raw.githubusercontent.com/nf-core/test-datasets/quantms/testdata/lfq_ci/BSA/18Protein_SoCe_Tr_detergents_trace_target_decoy.fasta'
posterior_probabilities = "fit_distributions"
search_engines = "msgf"
search_engines = "msgf,comet"
decoy_string= "rev"
add_triqler_output = true
protein_level_fdr_cutoff = 1.0
Expand Down
6 changes: 3 additions & 3 deletions docs/output.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ As an example, a rough visualization of the DDA identification subworkflow can b

## Output structure

Output is by default written to the $NXF_WORKSPACE/results folder. Each step of the workflow export different files and reports with the specific data, peptide identifications, protein quantifications, etc. Most of the pipeline outputs are [HUPO-PSI](https://www.psidev.info/) standard file formats:
Output will be saved to the folder defined by parameter `--outdir`. Each step of the workflow export different files and reports with the specific data, peptide identifications, protein quantifications, etc. Most of the pipeline outputs are [HUPO-PSI](https://www.psidev.info/) standard file formats:

- [mzML](https://www.psidev.info/mzML): The mzML format is an open, XML-based format for mass spectrometer output files, developed with the full participation of vendors and researchers in order to create a single open format that would be supported by all software.
- [mzTab](https://www.psidev.info/mztab>): mzTab is intended as a lightweight supplement to the existing standard mzML to store and represent peptide and protein and identifications together with experimental metadata and basic quantitative information.
Expand Down Expand Up @@ -88,7 +88,7 @@ results

#### Spectra

Quantms main format for spectra is the open [mzML](https://www.psidev.info/mzML) format. However it also supports Thermo raw files through conversion with
Quantms main format for spectra is the open [mzML](https://www.psidev.info/mzML) format. However, it also supports Thermo raw files through conversion with
ThermoRawFileParser. Mixed inputs should be possible but are untested. Conversion results can be cached if run locally or outputted to results.
Mismatches between file extensions in the design and on disk can be corrected through parameters.

Expand Down Expand Up @@ -133,7 +133,7 @@ decoy identifications and search engine scores.

The mzTab is exported for all three workflows DDA-LFQ, DDA-ISO and DIA-LFQ. It is a complete [mzTab](https://github.com/HUPO-PSI/mzTab) file
ready for submission to [PRIDE](https://www.ebi.ac.uk/pride/). It contains both identifications (only those responsible for a quantification),
quantities as well as some metadata about both the experiment and the quantification.
quantities and some metadata about both the experiment and the quantification.

#### MSstats-processed mzTab

Expand Down
7 changes: 4 additions & 3 deletions modules/local/mzmlstatistics/main.nf
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
process MZMLSTATISTICS {
tag "$meta.mzml_id"
label 'process_medium'
// TODO could be easily parallelized
label 'process_single_thread'

conda "bioconda::pyopenms=2.8.0"
Expand All @@ -11,7 +11,7 @@ process MZMLSTATISTICS {
}

input:
path mzml_path
tuple val(meta), path(mzml)

output:
path "*_mzml_info.tsv", emit: mzml_statistics
Expand All @@ -20,9 +20,10 @@ process MZMLSTATISTICS {

script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.mzml_id}"

"""
mzml_statistics.py "${mzml_path}" \\
mzml_statistics.py "${mzml}" \\
2>&1 | tee mzml_statistics.log

cat <<-END_VERSIONS > versions.yml
Expand Down
2 changes: 1 addition & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ params {

// ConsensusID
consensusid_algorithm = 'best'
min_consensus_support = 1
min_consensus_support = 0
consensusid_considered_top_hits = 0

// Luciphor options
Expand Down
7 changes: 2 additions & 5 deletions subworkflows/local/file_preparation.nf
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,9 @@ workflow FILE_PREPARATION {
ch_versions = ch_versions.mix(MZMLINDEXING.out.version)
ch_results = ch_results.mix(MZMLINDEXING.out.mzmls_indexed)

ch_results.multiMap{
meta: it[0]
mzml: it[1]
}.set{ ch_mzml }
ch_results.map{ it -> [it[0], it[1]] }.set{ ch_mzml }

MZMLSTATISTICS( ch_mzml.mzml )
MZMLSTATISTICS( ch_mzml )
ch_statistics = ch_statistics.mix(MZMLSTATISTICS.out.mzml_statistics.collect())
ch_versions = ch_versions.mix(MZMLSTATISTICS.out.version)

Expand Down