Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dev #264

Merged
merged 23 commits into from
Mar 17, 2023
Merged

Dev #264

Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
e069f8e
small changes for review PR https://github.com/nf-core/quantms/pull/48
ypriverol Mar 16, 2023
4f905e7
small changes for review PR https://github.com/nf-core/quantms/pull/48
ypriverol Mar 16, 2023
29a2d24
small changes for review PR https://github.com/nf-core/quantms/pull/48
ypriverol Mar 16, 2023
0ca2cb5
small changes for review PR https://github.com/nf-core/quantms/pull/48
ypriverol Mar 16, 2023
f044239
sort imports
fabianegli Mar 17, 2023
908484e
remove unused variable assignment
fabianegli Mar 17, 2023
1883fc1
unobfuscate direct use of logging.error
fabianegli Mar 17, 2023
dc728c4
do not shadow the builtin function max
fabianegli Mar 17, 2023
c438127
do not shadow builtin function id
fabianegli Mar 17, 2023
0528816
simplify path operations with pathlib
fabianegli Mar 17, 2023
71042e1
remove unnecessary close after context
fabianegli Mar 17, 2023
ec49df9
black
fabianegli Mar 17, 2023
2ceed76
map protease to specificity with a dict
fabianegli Mar 17, 2023
c816abf
replace one letter variable names with longer ones
fabianegli Mar 17, 2023
0e062f5
fix Trypsin/P cleavage specificity
fabianegli Mar 17, 2023
4df3260
add some type annotations
fabianegli Mar 17, 2023
0bb7431
mzmlstatistics to tuple input
ypriverol Mar 17, 2023
2c319cd
small changes.
ypriverol Mar 17, 2023
9b3fc0c
fix type annotation in docstring
fabianegli Mar 17, 2023
61a14ba
test_lfq used also comet
ypriverol Mar 17, 2023
766bec5
Merge pull request #76 from fabianegli/contribution-to-v1-1
ypriverol Mar 17, 2023
7848567
Merge branch 'dev' of https://github.com/nf-core/quantms into dev
ypriverol Mar 17, 2023
2306abe
small change.
ypriverol Mar 17, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .prettierignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,5 @@ testing/
testing*
*.pyc
bin/
venv/

41 changes: 22 additions & 19 deletions bin/diann_convert.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
#!/usr/bin/env python

"""
This script converts the output from DIA-NN into three standard formats: MSstats, Triqler and mzTab.
License: Apache 2.0
Authors: Hong Wong, Yasset Perez-Riverol
"""
import logging
import os
import re

import click
import numpy as np
import pandas as pd
import logging as log
from pyopenms import AASequence, FASTAFile, ModificationsDB

CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])
Expand Down Expand Up @@ -65,22 +70,20 @@ def convert(ctx, folder, dia_params, diann_version, charge, missed_cleavages, qv

for item in pathdict.items():
if item[0] != "mzml_info" and len(item[1]) > 1:
log.error(f"{item[0]} is duplicate, check whether the file is redundant or change the file name!")
logging.error(f"{item[0]} is duplicate, check whether the file is redundant or change the file name!")

diann_report = folder + pathdict["report"][0]
exp_design = folder + pathdict["exp_design"][0]
pg_matrix = folder + pathdict["pg_matrix"][0]
pr_matrix = folder + pathdict["pr_matrix"][0]
fasta = folder + pathdict["fasta"][0]
diann_version_file = diann_version
mzml_info = pathdict["mzml_info"]

with open(diann_version_file) as f:
for line in f:
if "DIA-NN" in line:
diann_version_id = line.rstrip("\n").split(": ")[1]
break
f.close()

remain_cols = [
"File.Name",
Expand Down Expand Up @@ -774,15 +777,16 @@ def findstr(basestr, s, resultlist):
return protein_coverage


def match_in_report(report, target, max, flag, level):
"""This function is used to match the columns "ms_run" and "study_variable" in the report to get the information.
def match_in_report(report, target, max_, flag, level):
"""This function is used to match the columns "ms_run" and "study_variable" from the report and
get the corresponding information for the mztab ms_run and study_values metadata values.

:param report: Dataframe for Dia-NN main report
:type report: pandas.core.frame.DataFrame
:param target: The value of "pr_id" column in out_mztab_PEH(level="peptide") or the "accession" column in out_mztab_PRH(level="protein")
:type target: str
:param max: max_assay or max_study_variable
:type max: int
:param max_: max_assay or max_study_variable
:type max_: int
:param flag: Match the "study_variable" column(flag=1) or the "ms_run" column(flag=0) in the filter result
:type flag: int
:param level: "pep" or "protein"
Expand All @@ -793,7 +797,7 @@ def match_in_report(report, target, max, flag, level):
if flag == 1 and level == "pep":
result = report[report["precursor.Index"] == target]
PEH_params = []
for i in range(1, max + 1):
for i in range(1, max_ + 1):
match = result[result["study_variable"] == i]
PEH_params.extend([match["Precursor.Normalised"].mean(), "null", "null", "null", match["RT.Start"].mean()])

Expand All @@ -802,7 +806,7 @@ def match_in_report(report, target, max, flag, level):
if flag == 0 and level == "pep":
result = report[report["precursor.Index"] == target]
q_value = []
for i in range(1, max + 1):
for i in range(1, max_ + 1):
match = result[result["ms_run"] == i]
q_value.append(match["Q.Value"].values[0] if match["Q.Value"].values.size > 0 else np.nan)

Expand All @@ -811,7 +815,7 @@ def match_in_report(report, target, max, flag, level):
if flag == 1 and level == "protein":
result = report[report["Protein.Ids"] == target]
PRH_params = []
for i in range(1, max + 1):
for i in range(1, max_ + 1):
match = result[result["study_variable"] == i]
PRH_params.extend([match["PG.MaxLFQ"].mean(), "null", "null"])

Expand Down Expand Up @@ -882,14 +886,13 @@ def find_modification(peptide):


def calculate_mz(seq, charge):
"""Remove unknown aminoacids and calculate mz

:param seq: Sequences of peptides
:type seq: str
:param charge: charge of peptides
"""
Calculate the precursor m/z based on the peptide sequence and charge state.
:param seq: Sequence peptide
:type seq: str
:return: mz
:rtype: float or NoneType
:param charge: charge state
:type charge: int
:return:
"""
ref = "ARNDBCEQZGHILKMFPSTWYV"
seq = "".join([i for i in seq if i in ref])
Expand Down
35 changes: 18 additions & 17 deletions bin/mzml_statistics.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
#!/usr/bin/env python

from pyopenms import MzMLFile, MSExperiment
import os
import pandas as pd
import sys
from pathlib import Path

import pandas as pd
from pyopenms import MSExperiment, MzMLFile


def mzml_dataframe(mzml_path):
def mzml_dataframe(mzml_path: str) -> None:
file_columns = [
"SpectrumID",
"MSLevel",
Expand All @@ -17,34 +18,34 @@ def mzml_dataframe(mzml_path):
"Exp_Mass_To_Charge",
]

def parse_mzml(file_name, file_columns):
def parse_mzml(file_name: str, file_columns: list):
info = []
exp = MSExperiment()
MzMLFile().load(file_name, exp)
for i in exp:
id = i.getNativeID()
MSLevel = i.getMSLevel()
rt = i.getRT() if i.getRT() else None
for spectrum in exp:
id_ = spectrum.getNativeID()
MSLevel = spectrum.getMSLevel()
rt = spectrum.getRT() if spectrum.getRT() else None
if MSLevel == 2:
charge_state = i.getPrecursors()[0].getCharge()
emz = i.getPrecursors()[0].getMZ() if i.getPrecursors()[0].getMZ() else None
peaks_tuple = i.get_peaks()
charge_state = spectrum.getPrecursors()[0].getCharge()
emz = spectrum.getPrecursors()[0].getMZ() if spectrum.getPrecursors()[0].getMZ() else None
peaks_tuple = spectrum.get_peaks()
peak_per_ms2 = len(peaks_tuple[0])
if i.getMetaValue("base peak intensity"):
base_peak_intensity = i.getMetaValue("base peak intensity")
if spectrum.getMetaValue("base peak intensity"):
base_peak_intensity = spectrum.getMetaValue("base peak intensity")
else:
base_peak_intensity = max(peaks_tuple[1]) if len(peaks_tuple[1]) > 0 else None
info_list = [id, 2, charge_state, peak_per_ms2, base_peak_intensity, rt, emz]
info_list = [id_, 2, charge_state, peak_per_ms2, base_peak_intensity, rt, emz]
else:
info_list = [id, MSLevel, None, None, None, rt, None]
info_list = [id_, MSLevel, None, None, None, rt, None]

info.append(info_list)

return pd.DataFrame(info, columns=file_columns)

mzml_df = parse_mzml(mzml_path, file_columns)
mzml_df.to_csv(
"{}_mzml_info.tsv".format(os.path.splitext(os.path.split(mzml_path)[1])[0]),
f"{Path(mzml_path).stem}_mzml_info.tsv",
mode="a",
sep="\t",
index=False,
Expand Down
47 changes: 22 additions & 25 deletions bin/prepare_diann_parameters.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#!/usr/bin/env python

import re
from typing import List, Tuple

import click
from sdrf_pipelines.openms.unimod import UnimodDatabase
Expand Down Expand Up @@ -32,21 +33,21 @@ def generate_cfg(ctx, enzyme, fix_mod, var_mod):
for mod in var_ptm:
diann_var_ptm += var_ptm_str + mod

with open("diann_config.cfg", "w") as f:
f.write("--cut " + cut + diann_fix_ptm + diann_var_ptm)
with open("diann_config.cfg", "w") as file:
file.write("--cut " + cut + diann_fix_ptm + diann_var_ptm)


def convert_mod(unimod_database, fix_mod, var_mod):
def convert_mod(unimod_database, fix_mod: str, var_mod: str) -> Tuple[List, List]:
pattern = re.compile(r"\((.*?)\)")
var_ptm = []
fix_ptm = []

if fix_mod != "":
for mod in fix_mod.split(","):
tag = 0
for m in unimod_database.modifications:
if m.get_name() == mod.split(" ")[0]:
diann_mod = m.get_name() + "," + str(m._delta_mono_mass)
for modification in unimod_database.modifications:
if modification.get_name() == mod.split(" ")[0]:
diann_mod = modification.get_name() + "," + str(modification._delta_mono_mass)
tag = 1
break
if tag == 0:
Expand All @@ -66,9 +67,9 @@ def convert_mod(unimod_database, fix_mod, var_mod):
if var_mod != "":
for mod in var_mod.split(","):
tag = 0
for m in unimod_database.modifications:
if m.get_name() == mod.split(" ")[0]:
diann_mod = m.get_name() + "," + str(m._delta_mono_mass)
for modification in unimod_database.modifications:
if modification.get_name() == mod.split(" ")[0]:
diann_mod = modification.get_name() + "," + str(modification._delta_mono_mass)
tag = 1
break
if tag == 0:
Expand All @@ -88,22 +89,18 @@ def convert_mod(unimod_database, fix_mod, var_mod):
return fix_ptm, var_ptm


def enzyme_cut(enzyme):
if enzyme == "Trypsin":
cut = "K*,R*,!*P"
elif enzyme == "Trypsin/P":
cut = "K*,R*,*P"
elif enzyme == "Arg-C":
cut = "R*,!*P"
elif enzyme == "Asp-N":
cut = "*B,*D"
elif enzyme == "Chymotrypsin":
cut = "F*,W*,Y*,L*,!*P"
elif enzyme == "Lys-C":
cut = "K*,!*P"
else:
cut = "--cut"
return cut
# DIA-NN cleavage-specificity strings keyed by enzyme name.
# "!*P" forbids cleavage before proline; Trypsin/P cleaves K/R regardless of
# a following proline (hence no "!*P" entry for it).
_ENZYME_SPECIFICITY = {
    "Trypsin": "K*,R*,!*P",
    "Trypsin/P": "K*,R*",
    "Arg-C": "R*,!*P",
    "Asp-N": "*B,*D",
    "Chymotrypsin": "F*,W*,Y*,L*,!*P",
    "Lys-C": "K*,!*P",
}


def enzyme_cut(enzyme: str) -> str:
    """Return the DIA-NN ``--cut`` specificity string for the given enzyme.

    :param enzyme: enzyme name (e.g. "Trypsin", "Lys-C")
    :type enzyme: str
    :return: cleavage specificity string; the literal "--cut" for unknown enzymes
        (NOTE(review): the caller prepends "--cut ", so an unknown enzyme yields
        "--cut --cut" — confirm this sentinel is intentional)
    :rtype: str
    """
    # dict.get with an explicit default instead of `.get(...) or "--cut"`:
    # the `or` form would also fall back on a falsy (empty-string) mapping value.
    return _ENZYME_SPECIFICITY.get(enzyme, "--cut")


cli.add_command(generate_cfg)
Expand Down
2 changes: 1 addition & 1 deletion conf/test_lfq.config
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ params {
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/quantms/testdata/lfq_ci/BSA/BSA_design_urls.tsv'
database = 'https://raw.githubusercontent.com/nf-core/test-datasets/quantms/testdata/lfq_ci/BSA/18Protein_SoCe_Tr_detergents_trace_target_decoy.fasta'
posterior_probabilities = "fit_distributions"
search_engines = "msgf"
search_engines = "msgf,comet"
decoy_string= "rev"
add_triqler_output = true
protein_level_fdr_cutoff = 1.0
Expand Down
6 changes: 3 additions & 3 deletions docs/output.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ As an example, a rough visualization of the DDA identification subworkflow can b

## Output structure

Output is by default written to the $NXF_WORKSPACE/results folder. Each step of the workflow export different files and reports with the specific data, peptide identifications, protein quantifications, etc. Most of the pipeline outputs are [HUPO-PSI](https://www.psidev.info/) standard file formats:
Output will be saved to the folder defined by parameter `--outdir`. Each step of the workflow export different files and reports with the specific data, peptide identifications, protein quantifications, etc. Most of the pipeline outputs are [HUPO-PSI](https://www.psidev.info/) standard file formats:

- [mzML](https://www.psidev.info/mzML): The mzML format is an open, XML-based format for mass spectrometer output files, developed with the full participation of vendors and researchers in order to create a single open format that would be supported by all software.
- [mzTab](https://www.psidev.info/mztab>): mzTab is intended as a lightweight supplement to the existing standard mzML to store and represent peptide and protein and identifications together with experimental metadata and basic quantitative information.
Expand Down Expand Up @@ -88,7 +88,7 @@ results

#### Spectra

Quantms main format for spectra is the open [mzML](https://www.psidev.info/mzML) format. However it also supports Thermo raw files through conversion with
Quantms main format for spectra is the open [mzML](https://www.psidev.info/mzML) format. However, it also supports Thermo raw files through conversion with
ThermoRawFileParser. Mixed inputs should be possible but are untested. Conversion results can be cached if run locally or outputted to results.
Mismatches between file extensions in the design and on disk can be corrected through parameters.

Expand Down Expand Up @@ -133,7 +133,7 @@ decoy identifications and search engine scores.

The mzTab is exported for all three workflows DDA-LFQ, DDA-ISO and DIA-LFQ. It is a complete [mzTab](https://github.com/HUPO-PSI/mzTab) file
ready for submission to [PRIDE](https://www.ebi.ac.uk/pride/). It contains both identifications (only those responsible for a quantification),
quantities as well as some metadata about both the experiment and the quantification.
quantities and some metadata about both the experiment and the quantification.

#### MSstats-processed mzTab

Expand Down
7 changes: 4 additions & 3 deletions modules/local/mzmlstatistics/main.nf
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
process MZMLSTATISTICS {
tag "$meta.mzml_id"
label 'process_medium'
// TODO could be easily parallelized
label 'process_single_thread'

conda "bioconda::pyopenms=2.8.0"
Expand All @@ -11,7 +11,7 @@ process MZMLSTATISTICS {
}

input:
path mzml_path
tuple val(meta), path(mzml)

output:
path "*_mzml_info.tsv", emit: mzml_statistics
Expand All @@ -20,9 +20,10 @@ process MZMLSTATISTICS {

script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.mzml_id}"

"""
mzml_statistics.py "${mzml_path}" \\
mzml_statistics.py "${mzml}" \\
2>&1 | tee mzml_statistics.log

cat <<-END_VERSIONS > versions.yml
Expand Down
2 changes: 1 addition & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ params {

// ConsensusID
consensusid_algorithm = 'best'
min_consensus_support = 1
min_consensus_support = 0
consensusid_considered_top_hits = 0

// Luciphor options
Expand Down
7 changes: 2 additions & 5 deletions subworkflows/local/file_preparation.nf
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,9 @@ workflow FILE_PREPARATION {
ch_versions = ch_versions.mix(MZMLINDEXING.out.version)
ch_results = ch_results.mix(MZMLINDEXING.out.mzmls_indexed)

ch_results.multiMap{
meta: it[0]
mzml: it[1]
}.set{ ch_mzml }
ch_results.map{ it -> [it[0], it[1]] }.set{ ch_mzml }

MZMLSTATISTICS( ch_mzml.mzml )
MZMLSTATISTICS( ch_mzml )
ch_statistics = ch_statistics.mix(MZMLSTATISTICS.out.mzml_statistics.collect())
ch_versions = ch_versions.mix(MZMLSTATISTICS.out.version)

Expand Down