Merge branch 'dev' into nf-template2.13.1

bigbio · Mar 2, 2024 · 67357d1 · 67357d1
2 parents fbf50c6 + 6d6626f
commit 67357d1
Show file tree

Hide file tree

Showing 156 changed files with 16,550 additions and 1,247 deletions.
diff --git a/.github/workflows/awsfulltest.yml b/.github/workflows/awsfulltest.yml
@@ -1,17 +1,22 @@
 name: nf-core AWS full size tests
 # This workflow is triggered on published releases.
 # It can be additionally triggered manually with GitHub actions workflow dispatch button.
-# It runs the -profile 'test_full' on AWS batch
+# It runs the -profiles 'test_lfq' 'test_tmt' and 'test_dia' on AWS batch
 
 on:
   release:
     types: [published]
   workflow_dispatch:
+
 jobs:
   run-tower:
     name: Run AWS full tests
     if: github.repository == 'nf-core/quantms'
     runs-on: ubuntu-latest
+    # Do a full-scale run with data from each acquisition/quantification mode
+    strategy:
+      matrix:
+        mode: ["lfq", "tmt", "dia"]
     steps:
       - name: Launch workflow via tower
         uses: seqeralabs/action-tower-launch@v2
@@ -27,9 +32,9 @@ jobs:
           parameters: |
             {
               "hook_url": "${{ secrets.MEGATESTS_ALERTS_SLACK_HOOK_URL }}",
-              "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/quantms/results-${{ github.sha }}"
+              "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/quantms/results-${{ github.sha }}/mode_${{ matrix.mode }}"
             }
-          profiles: test_full
+          profiles: test_${{ matrix.mode }}
 
       - uses: actions/upload-artifact@v4
         with:

diff --git a/.github/workflows/awstest.yml b/.github/workflows/awstest.yml
@@ -1,6 +1,6 @@
 name: nf-core AWS test
 # This workflow can be triggered manually with the GitHub actions workflow dispatch button.
-# It runs the -profile 'test' on AWS batch
+# It runs the -profile 'test_tmt' on AWS batch
 
 on:
   workflow_dispatch:
@@ -23,7 +23,7 @@ jobs:
             {
               "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/quantms/results-test-${{ github.sha }}"
             }
-          profiles: test
+          profiles: test_tmt
 
       - uses: actions/upload-artifact@v4
         with:

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -17,18 +17,35 @@ concurrency:
 
 jobs:
   test:
+    env:
+      NXF_ANSI_LOG: false
+      CAPSULE_LOG: none
+      TEST_PROFILE: ${{ matrix.test_profile }}
+      EXEC_PROFILE: ${{ matrix.exec_profile }}
+
     name: Run pipeline with test data
     # Only run on push if this is the nf-core dev branch (merged PRs)
-    if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/quantms') }}"
+    if: ${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/quantms') }}
     runs-on: ubuntu-latest
     strategy:
+      fail-fast: false
       matrix:
+        # Nextflow versions
         NXF_VER:
           - "23.04.0"
           - "latest-everything"
+        test_profile: ["test_lfq", "test_lfq_sage", "test_dia", "test_localize", "test_tmt"]
+        exec_profile: ["docker", "conda"]
+        exclude:
+          - test_profile: test_dia
+            exec_profile: conda
+          - test_profile: test_localize
+            exec_profile: conda
+          - NXF_VER: "latest-everything"
+            exec_profile: "conda"
     steps:
       - name: Check out pipeline code
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4
+        uses: actions/checkout@v4
 
       - name: Install Nextflow
         uses: nf-core/setup-nextflow@v1
@@ -38,9 +55,50 @@ jobs:
       - name: Disk space cleanup
         uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1
 
+      - name: Install micromamba
+        if: matrix.exec_profile == 'conda'
+        run: |
+          wget -qO- https://micromamba.snakepit.net/api/micromamba/linux-64/latest | tar -xvj bin/micromamba
+          echo "$(pwd)/bin" >> $GITHUB_PATH
+          echo "$(pwd)/micromamba/bin" >> $GITHUB_PATH
+          ./bin/micromamba shell init -s bash -p ./micromamba
+          echo $'channels:\n  - conda-forge\n  - bioconda\n  - defaults\nuse_lockfiles: false' >> ~/.mambarc
+
       - name: Run pipeline with test data
+        if: matrix.exec_profile != 'conda'
         # TODO nf-core: You can customise CI pipeline run tests as required
         # For example: adding multiple test runs with different parameters
         # Remember that you can parallelise this by using strategy.matrix
         run: |
-          nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results
+          nextflow run ${GITHUB_WORKSPACE} -profile $TEST_PROFILE,$EXEC_PROFILE --outdir ${TEST_PROFILE}_${EXEC_PROFILE}_results
+      - name: Run pipeline with test data in conda profile (and single-threaded)
+        if: matrix.exec_profile == 'conda'
+        # TODO nf-core: You can customise CI pipeline run tests as required
+        # For example: adding multiple test runs with different parameters
+        # Remember that you can parallelise this by using strategy.matrix
+        run: |
+          nextflow run ${GITHUB_WORKSPACE} -profile $TEST_PROFILE,micromamba --outdir ${TEST_PROFILE}_${EXEC_PROFILE}_results
+      - name: Gather failed logs
+        if: failure() || cancelled()
+        run: |
+          mkdir failed_logs
+          failed=$(grep "FAILED" ${TEST_PROFILE}_${EXEC_PROFILE}_results/pipeline_info/execution_trace.txt | cut -f 2)
+          while read -r line ; do cp $(ls work/${line}*/*.log) failed_logs/ | true ; done <<< "$failed"
+      - uses: actions/upload-artifact@v1
+        if: failure() || cancelled()
+        name: Upload failed logs
+        with:
+          name: failed_logs
+          path: failed_logs
+      - uses: actions/upload-artifact@v1
+        if: always()
+        name: Upload results
+        with:
+          name: ${{ env.TEST_PROFILE }}_${{ env.EXEC_PROFILE }}_results
+          path: ${{ env.TEST_PROFILE }}_${{ env.EXEC_PROFILE }}_results
+      - uses: actions/upload-artifact@v1
+        if: always()
+        name: Upload log
+        with:
+          name: nextflow.log
+          path: .nextflow.log
diff --git a/.gitignore b/.gitignore
@@ -6,3 +6,16 @@ results/
 testing/
 testing*
 *.pyc
+.idea/
+.idea/*
+*.log
+/build/
+results*/
+venv/
+node_modules
+conversion_inputs
+debug_dir
+test_out
+
+lint_log.txt
+node_modules
diff --git a/.nf-core.yml b/.nf-core.yml
@@ -1 +1,8 @@
 repository_type: pipeline
+lint:
+  files_exist:
+    - conf/igenomes.config
+    - conf/test_full.config
+    - conf/test.config
+  files_unchanged:
+    - .github/PULL_REQUEST_TEMPLATE.md
diff --git a/.prettierignore b/.prettierignore
@@ -10,3 +10,5 @@ testing/
 testing*
 *.pyc
 bin/
+venv/
+
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,14 +3,139 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## v1.3.0dev - [date]
+## [1.3.0dev] nfcore/quantms - [TBD] - Vaduz
+
+### `Added`
+
+### `Changed`
+
+### `Fixed`
+
+### `Dependencies`
+
+### `Parameters`
+
+### `Deprecations`
+
+## [1.2.0] nfcore/quantms - [11/02/2023] - Thimphu
+
+### `Added`
+
+- [#275 BigBio](https://github.com/bigbio/quantms/pull/275) Added support for Bruker data in the DIA branch.
+- [#275 BigBio](https://github.com/bigbio/quantms/pull/275) Added speed-ups to the DIA-NN pipeline.
+- [#275 BigBio](https://github.com/bigbio/quantms/pull/275) Support for library-based search in the DIA-NN pipeline.
+- [#300 BigBio](https://github.com/bigbio/quantms/pull/300) Major refactoring of LFQ-DDA MBR algorithm.
+- [#279 BigBio](https://github.com/bigbio/quantms/pull/279) Support for SAGE search engine.
+
+### `Changed`
+
+- [#314](https://github.com/bigbio/quantms/pull/314) Update for pmultiqc to pmultiqc=0.0.23
+- [#308](https://github.com/bigbio/quantms/pull/308) Update for openms to openms=3.1.0
+- Update for sdrf-pipelines to sdrf-pipelines=0.0.24
+- Update for msstats to msstats=4.2.1
+
+### `Fixed`
+
+- [#316](https://github.com/bigbio/quantms/pull/316) Fixed jar path selection of luciphoradapter and msgf+
+- Fixed bug where modification masses were not calculated correctly in DIA-NN conversion.
+- Fixed multiple bugs in pull requests [#293 BigBio](https://github.com/bigbio/quantms/pull/293), [#279 BigBio](https://github.com/bigbio/quantms/pull/279), [#265 BigBio](https://github.com/bigbio/quantms/pull/265), [#260 BigBio](https://github.com/bigbio/quantms/pull/260), [#257 BigBio](https://github.com/bigbio/quantms/pull/257)
+
+### `Dependencies`
+
+- New dependency on `sage` search engine.
+
+### `Parameters`
+
+- feature_with_id_min_score: Minimum score of a feature with a peptide identification (default: 0.10)
+- feature_without_id_min_score: Minimum score of a feature without peptide identification (transfer feature, default: 0.75)
+- lfq_intensity_threshold: Minimum intensity of a feature to be considered in the MBR algorithm (default: 1000)
+- sage_processes: Number of processes to use in SAGE search engine (default: 1)
+- diann_speclib: Path to the spectral library to use in DIA-NN (default: null)
+- convert_dotd: Whether to convert .d files to mzML (default: false)
+
+## [1.1.1] nfcore/quantms - [03/27/2023] - Berlin-Bern
+
+### `Added`
+
+- [#92](https://github.com/nf-core/quantms/pull/92) Improved output docs for mzTab
+- [#91](https://github.com/nf-core/quantms/pull/91) Added dev profile for nightly versions of OpenMS tools
+
+### `Changed`
+
+- [#88](https://github.com/nf-core/quantms/pull/88) Updated Comet version to latest release (2023010)
+
+### `Fixed`
+
+- [#93](https://github.com/nf-core/quantms/pull/93) Fixed bug in docker vs. singularity container logic in some processes.
+
+## [1.1.0] nfcore/quantms - [03/20/2023] - Berlin
+
+- Bugfixes and speed increases in the OpenMS tools due to version update to 2.9.1
+- Improvements in logging by adding many more process.ids
+- Large restructuring of DIA branch to increase parallelizability
+- Better error handling in MSstats step plus new parameter to filter for MSstats' adjusted p-value in the plots
+- More efficient parsing of mzML statistics in a separate step
+- A clearer distinction between per-run and experiment-wide FDRs with one parameter for each
+- More test profiles including larger "full" tests
+
+### `Added`
+
+- [#176](https://github.com/bigbio/quantms/pull/176) - Add name of each ID step
+- [#205](https://github.com/bigbio/quantms/pull/205) - mzTab export for DIANN outputs
+
+### `Changed`
+
+- [#169](https://github.com/bigbio/quantms/pull/169) - Restructure DIA-NN step 1: Generate an in silico predicted spectral library
+- [#178](https://github.com/bigbio/quantms/pull/178) - Restructure DIA-NN step 2: Preliminary analysis of individual raw files
+- [#179](https://github.com/bigbio/quantms/pull/179) - Restructure DIA-NN steps 3-5 to be as parallel as possible
+- [#200](https://github.com/bigbio/quantms/pull/200) - Rename MSstats/Triqler/mzTab input and output
+
+### `Fixed`
+
+- [#187](https://github.com/bigbio/quantms/pull/187) - Bug fixing in proteomicsLFQ applying FDR at PSM level
+- [#207](https://github.com/bigbio/quantms/pull/207) - Bug fixing in dissociation method translation for Luciphor
+
+### `Dependencies`
+
+- [#203](https://github.com/bigbio/quantms/pull/203) - update openms dependency -> 3.0.0dev
+- [#208](https://github.com/bigbio/quantms/pull/208) - update pmultiqc dependency -> 0.0.13. Support for DIANN in pmultiqc and enable the generation of search engine scores distributions/peptide and protein table by pmultiqc.
+
+### `Parameters`
+
+- [#193](https://github.com/bigbio/quantms/pull/193) - Set the `local_input_type` default parameter to `mzML`
+- [#212](https://github.com/bigbio/quantms/pull/212) - Set the `min_consensus_support` default parameter to `1` to filter in ConsensusID for peptides identified with both search engines
+- [#200](https://github.com/bigbio/quantms/pull/200) - Add `export_mztab` parameter to allow to run PROTEINQUANTIFIER TMT without exporting to mzTab
+
+## [1.0] nfcore/quantms - [05/02/2022] - Havana
 
 Initial release of nf-core/quantms, created with the [nf-core](https://nf-co.re/) template.
 
 ### `Added`
 
+- New pipeline for DDA-LFQ data analysis
+- New pipeline for DDA-ISO data analysis
+- New datasets for DDA-LFQ and DDA-ISO data analysis
+- Documentation added for DDA pipeline
+- First pipeline for DIA-LFQ data analysis
+
 ### `Fixed`
 
+- This is the first release - no reported issues
+
 ### `Dependencies`
 
-### `Deprecated`
+The pipeline is using Nextflow DSL2, each process will be run with its own [Biocontainer](https://biocontainers.pro/#/registry). This means that on occasion it is entirely possible for the pipeline to be using different versions of the same tool. However, the overall software dependency changes compared to the last release have been listed below for reference.
+
+| Dependency            | Version    |
+| --------------------- | ---------- |
+| `thermorawfileparser` | 1.3.4      |
+| `comet`               | 2021010    |
+| `msgf+`               | 2022.01.07 |
+| `openms`              | 2.9.1      |
+| `sdrf-pipelines`      | 0.0.22     |
+| `percolator`          | 3.5        |
+| `pmultiqc`            | 0.0.11     |
+| `luciphor`            | 2020_04_03 |
+| `dia-nn`              | 1.8.1      |
+| `msstats`             | 4.2.0      |
+| `msstatstmt`          | 2.2.0      |
diff --git a/CITATIONS.md b/CITATIONS.md
@@ -1,5 +1,23 @@
 # nf-core/quantms: Citations
 
+## [Pipeline](https://www.researchsquare.com/article/rs-3002027/v1)
+
+> Chengxin Dai, Julianus Pfeuffer, Hong Wang et al. quantms: A cloud-based pipeline for proteomics reanalysis enables the quantification of 17521 proteins in 9,502 human samples. 01 June 2023, PREPRINT (Version 1) available at Research Square [https://doi.org/10.21203/rs.3.rs-3002027/v1]
+
+## Pipeline research manuscripts
+
+- [proteogenomics](https://pubmed.ncbi.nlm.nih.gov/34904638/)
+
+  > Umer HM, Audain E, Zhu Y, Pfeuffer J, Sachsenberg T, Lehtiö J, Branca RM, Perez-Riverol Y. Generation of ENSEMBL-based proteogenomics databases boosts the identification of non-canonical peptides. Bioinformatics. 2022 Feb 7;38(5):1470-1472. doi: 10.1093/bioinformatics/btab838. PMID: 34904638; PMCID: PMC8825679.
+
+- [lfq dda benchmark](https://pubmed.ncbi.nlm.nih.gov/37220883/)
+
+  > Bai M, Deng J, Dai C, Pfeuffer J, Sachsenberg T, Perez-Riverol Y. LFQ-Based Peptide and Protein Intensity Differential Expression Analysis. J Proteome Res. 2023 Jun 2;22(6):2114-2123. doi: 10.1021/acs.jproteome.2c00812. Epub 2023 May 23. PMID: 37220883; PMCID: PMC10243145.
+
+- [tissue absolute expression](https://pubmed.ncbi.nlm.nih.gov/37488995/)
+
+  > Wang H, Dai C, Pfeuffer J, Sachsenberg T, Sanchez A, Bai M, Perez-Riverol Y. Tissue-based absolute quantification using large-scale TMT and LFQ experiments. Proteomics. 2023 Jul 24:e2300188. doi: 10.1002/pmic.202300188. Epub ahead of print. PMID: 37488995.
+
 ## [nf-core](https://pubmed.ncbi.nlm.nih.gov/32055031/)
 
 > Ewels PA, Peltzer A, Fillinger S, Patel H, Alneberg J, Wilm A, Garcia MU, Di Tommaso P, Nahnsen S. The nf-core framework for community-curated bioinformatics pipelines. Nat Biotechnol. 2020 Mar;38(3):276-278. doi: 10.1038/s41587-020-0439-x. PubMed PMID: 32055031.
@@ -10,9 +28,49 @@
 
 ## Pipeline tools
 
-- [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)
+- [thermorawfileparser](https://pubmed.ncbi.nlm.nih.gov/31755270/)
+
+  > Hulstaert N, Shofstahl J, Sachsenberg T, Walzer M, Barsnes H, Martens L, Perez-Riverol Y. ThermoRawFileParser: Modular, Scalable, and Cross-Platform RAW File Conversion. J Proteome Res. 2020 Jan 3;19(1):537-542. doi: 10.1021/acs.jproteome.9b00328. Epub 2019 Dec 6. PMID: 31755270; PMCID: PMC7116465.
+
+- [sdrf-pipelines](https://pubmed.ncbi.nlm.nih.gov/34615866/)
+
+  > Dai C, Füllgrabe A, Pfeuffer J, Solovyeva EM, Deng J, Moreno P, Kamatchinathan S, Kundu DJ, George N, Fexova S, Grüning B, Föll MC, Griss J, Vaudel M, Audain E, Locard-Paulet M, Turewicz M, Eisenacher M, Uszkoreit J, Van Den Bossche T, Schwämmle V, Webel H, Schulze S, Bouyssié D, Jayaram S, Duggineni VK, Samaras P, Wilhelm M, Choi M, Wang M, Kohlbacher O, Brazma A, Papatheodorou I, Bandeira N, Deutsch EW, Vizcaíno JA, Bai M, Sachsenberg T, Levitsky LI, Perez-Riverol Y. A proteomics sample metadata representation for multiomics integration and big data analysis. Nat Commun. 2021 Oct 6;12(1):5854. doi: 10.1038/s41467-021-26111-3. PMID: 34615866; PMCID: PMC8494749.
+
+- [OpenMS](https://pubmed.ncbi.nlm.nih.gov/27575624/)
+
+  > Röst HL., Sachsenberg T., Aiche S., Bielow C., Weisser H., Aicheler F., Andreotti S., Ehrlich HC., Gutenbrunner P., Kenar E., Liang X., Nahnsen S., Nilse L., Pfeuffer J., Rosenberger G., Rurik M., Schmitt U., Veit J., Walzer M., Wojnar D., Wolski WE., Schilling O., Choudhary JS, Malmström L., Aebersold R., Reinert K., Kohlbacher O. (2016). OpenMS: a flexible open-source software platform for mass spectrometry data analysis. Nature methods, 13(9), 741–748. doi: 10.1038/nmeth.3959. PubMed PMID: 27575624; PubMed Central PMCID: PMC5617107.
+
+- [DIA-NN](https://pubmed.ncbi.nlm.nih.gov/31768060/)
+
+  > Demichev V, Messner CB, Vernardis SI, Lilley KS, Ralser M. DIA-NN: neural networks and interference correction enable deep proteome coverage in high throughput. Nat Methods. 2020 Jan;17(1):41-44. doi: 10.1038/s41592-019-0638-x. Epub 2019 Nov 25. PMID: 31768060; PMCID: PMC6949130.
+
+- [MSstats](https://www.ncbi.nlm.nih.gov/pubmed/24794931/)
+
+  > Choi M., Chang CY., Clough T., Broudy D., Killeen T., MacLean B., Vitek O. (2014). MSstats: an R package for statistical analysis of quantitative mass spectrometry-based proteomic experiments. Bioinformatics (Oxford, England), 30(17), 2524–2526. doi: 10.1093/bioinformatics/btu305. PubMed PMID: 24794931.
+
+- [Comet](https://www.ncbi.nlm.nih.gov/pubmed/23148064/)
+
+  > Eng JK., Jahan TA., Hoopmann MR. (2013). Comet: an open-source MS/MS sequence database search tool. Proteomics, 13(1), 22–24. doi: 10.1002/pmic.201200439. PubMed PMID: 23148064
+
+- [MS-GF+](https://www.ncbi.nlm.nih.gov/pubmed/25358478/)
+
+  > Kim S., Pevzner PA. (2014). MS-GF+ makes progress towards a universal database search tool for proteomics. Nature communications, 5, 5277. doi: 10.1038/ncomms6277. PubMed PMID: 25358478; PubMed Central PMCID: PMC5036525
+
+- [Sage](https://pubmed.ncbi.nlm.nih.gov/37819886/)
+
+  > Lazear MR. Sage: An Open-Source Tool for Fast Proteomics Searching and Quantification at Scale. J Proteome Res. 2023 Oct 11. doi: 10.1021/acs.jproteome.3c00486. Epub ahead of print. PMID: 37819886.
+
+- [Epifany](https://pubmed.ncbi.nlm.nih.gov/31975601/)
+
+  > Pfeuffer J, Sachsenberg T, Dijkstra TMH, Serang O, Reinert K, Kohlbacher O. EPIFANY: A Method for Efficient High-Confidence Protein Inference. J Proteome Res. 2020 Mar 6;19(3):1060-1072. doi: 10.1021/acs.jproteome.9b00566. Epub 2020 Feb 13. PMID: 31975601; PMCID: PMC7583457.
+
+- [Triqler](https://pubmed.ncbi.nlm.nih.gov/30482846/)
+
+  > The M, Käll L. Integrated Identification and Quantification Error Probabilities for Shotgun Proteomics. Mol Cell Proteomics. 2019 Mar;18(3):561-570. doi: 10.1074/mcp.RA118.001018. Epub 2018 Nov 27. PMID: 30482846; PMCID: PMC6398204.
+
+- [luciphor](https://pubmed.ncbi.nlm.nih.gov/23918812/)
 
-  > Andrews, S. (2010). FastQC: A Quality Control Tool for High Throughput Sequence Data [Online].
+  > Fermin D, Walmsley SJ, Gingras AC, Choi H, Nesvizhskii AI. LuciPHOr: algorithm for phosphorylation site localization with false localization rate estimation using modified target-decoy approach. Mol Cell Proteomics. 2013 Nov;12(11):3409-19. doi: 10.1074/mcp.M113.028928. Epub 2013 Aug 5. PMID: 23918812; PMCID: PMC3820951.
 
 - [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/)
-Original file line number
+Diff line change
@@ Expand Up / @@ -10,3 +10,5 @@ testing/ @@
     testing*
     *.pyc
     bin/
+    venv/