diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 0000000000..903e7aa141 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,3 @@ +* @FriederikeHanssen @maxulysse +*.nf.test* @nf-core/nf-test +.github/workflows/ @nf-core/a-team diff --git a/.github/RELEASE_CHECKLIST.md b/.github/RELEASE_CHECKLIST.md new file mode 100644 index 0000000000..fd15eb72a5 --- /dev/null +++ b/.github/RELEASE_CHECKLIST.md @@ -0,0 +1,43 @@ +# Release checklist + +> This checklist is for our own reference, to help us prepare a new release + +1. Check that everything is ready to go + - Desired [PRs](https://github.com/nf-core/sarek/pulls) are merged + - [GHA CI](https://github.com/nf-core/sarek/actions/workflows/ci.yml) is passing on `dev` + - [GHA linting](https://github.com/nf-core/sarek/actions/workflows/linting.yml) is passing on `dev` +2. Increase version number following [semantic versioning](http://semver.org/spec/v2.0.0.html) +3. Choose an appropriate codename for the release (if major or minor) + - e.g. peaks in [Sarek National Park](https://en.wikipedia.org/wiki/Sarek_National_Park#Topography) +4. Sync `dev` and check out a new branch for the release +5. Bump version: + - `nf-core bump-version . ` + - edit `docs/images/sarek_indices_subway.svg` + - generate a new `docs/images/sarek_indices_subway.png` + - edit `docs/images/sarek_subway.svg` + - generate a new `docs/images/sarek_subway.png` + - edit `docs/images/sarek_workflow.svg` + - generate a new `docs/images/sarek_workflow.png` + - edit `CHANGELOG` +6. Make a PR to `master` +7. Wait for reviews +8. Merge said PR +9. Make a [release](https://github.com/nf-core/sarek/releases) on GitHub +10. Update [bio.tools](https://bio.tools/nf-core-sarek) with the new release details +11. RT the nf-core automated tweet about the newly released version +12. Make a new branch from `dev` +13. Check out the `CHANGELOG.md` from `master` + - `git checkout upstream/master -- CHANGELOG.md` +14. Add a new `Unreleased` section in `CHANGELOG.md` for the `dev` version +15. Check out figures from `master` + - `git checkout upstream/master -- docs/images/sarek_indices_subway.svg` + - `git checkout upstream/master -- docs/images/sarek_indices_subway.png` + - `git checkout upstream/master -- docs/images/sarek_subway.svg` + - `git checkout upstream/master -- docs/images/sarek_subway.png` + - `git checkout upstream/master -- docs/images/sarek_workflow.svg` + - `git checkout upstream/master -- docs/images/sarek_workflow.png` +16. Make a PR to `dev` +17. Wait for review +18. Merge said PR +19. Commit and push. Continue making more awesome :metal: +20. Have fika :cake: diff --git a/.github/workflows/awsfulltest.yml b/.github/workflows/awsfulltest.yml deleted file mode 100644 index 24b83a47af..0000000000 --- a/.github/workflows/awsfulltest.yml +++ /dev/null @@ -1,39 +0,0 @@ -name: nf-core AWS full size tests -# This workflow is triggered on published releases. -# It can be additionally triggered manually with GitHub actions workflow dispatch button. 
-# It runs the -profile 'test_full' on AWS batch - -on: - release: - types: [published] - workflow_dispatch: -jobs: - run-tower: - name: Run AWS full tests - if: github.repository == 'nf-core/sarek' - runs-on: ubuntu-latest - steps: - - name: Launch workflow via tower - uses: seqeralabs/action-tower-launch@v2 - # TODO nf-core: You can customise AWS full pipeline tests as required - # Add full size test data (but still relatively small datasets for few samples) - # on the `test_full.config` test runs with only one set of parameters - with: - workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} - access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} - compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} - revision: ${{ github.sha }} - workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/sarek/work-${{ github.sha }} - parameters: | - { - "hook_url": "${{ secrets.MEGATESTS_ALERTS_SLACK_HOOK_URL }}", - "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/sarek/results-${{ github.sha }}" - } - profiles: test_full - - - uses: actions/upload-artifact@v3 - with: - name: Tower debug log file - path: | - tower_action_*.log - tower_action_*.json diff --git a/.github/workflows/awstest.yml b/.github/workflows/awstest.yml deleted file mode 100644 index e5d375fdc3..0000000000 --- a/.github/workflows/awstest.yml +++ /dev/null @@ -1,33 +0,0 @@ -name: nf-core AWS test -# This workflow can be triggered manually with the GitHub actions workflow dispatch button. -# It runs the -profile 'test' on AWS batch - -on: - workflow_dispatch: -jobs: - run-tower: - name: Run AWS tests - if: github.repository == 'nf-core/sarek' - runs-on: ubuntu-latest - steps: - # Launch workflow using Tower CLI tool action - - name: Launch workflow via tower - uses: seqeralabs/action-tower-launch@v2 - with: - workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} - access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} - compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} - revision: ${{ github.sha }} - workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/sarek/work-${{ github.sha }} - parameters: | - { - "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/sarek/results-test-${{ github.sha }}" - } - profiles: test - - - uses: actions/upload-artifact@v3 - with: - name: Tower debug log file - path: | - tower_action_*.log - tower_action_*.json diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b2f909ca0c..0bf047d57f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,43 +1,309 @@ -name: nf-core CI +name: test # This workflow runs the pipeline with the minimal test dataset to check that it completes without any syntax errors on: - push: - branches: - - dev pull_request: release: types: [published] + merge_group: + types: + - checks_requested + branches: + - master + - dev env: NXF_ANSI_LOG: false + NFTEST_VER: "0.8.1" +# Cancel if a newer run is started concurrency: - group: "${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}" + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true jobs: - test: - name: Run pipeline with test data - # Only run on push if this is the nf-core dev branch (merged PRs) - if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/sarek') }}" + pytest-changes: + name: Check for changes (pytest) + runs-on: ubuntu-latest + outputs: + # Expose matched filters as job 'tags' output variable + tags: ${{ steps.filter.outputs.changes }} + steps: + - uses: actions/checkout@v4 + + - uses: frouioui/paths-filter@main + id: 
filter + with: + filters: "tests/config/pytesttags.yml" + token: "" + + pytest: + name: ${{ matrix.tags }} ${{ matrix.profile }} NF ${{ matrix.NXF_VER }} runs-on: ubuntu-latest + needs: pytest-changes + if: needs.pytest-changes.outputs.tags != '[]' strategy: + fail-fast: false matrix: + tags: ["${{ fromJson(needs.pytest-changes.outputs.tags) }}"] + profile: ["docker"] + # profile: ["docker", "singularity", "conda"] + TEST_DATA_BASE: + - "test-datasets/data" NXF_VER: - "23.04.0" - "latest-everything" + env: + NXF_ANSI_LOG: false + TEST_DATA_BASE: "${{ github.workspace }}/test-datasets" + SENTIEON_LICENSE_BASE64: ${{ secrets.SENTIEON_LICENSE_BASE64 }} + steps: - name: Check out pipeline code uses: actions/checkout@v4 - - name: Install Nextflow + - name: Hash Github Workspace + id: hash_workspace + run: | + echo "digest=$(echo sarek3_${{ github.workspace }} | md5sum | cut -c 1-25)" >> $GITHUB_OUTPUT + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.x" + cache: "pip" + cache-dependency-path: | + **/requirements.txt + + - name: Install Python dependencies + run: pip install --upgrade -r tests/requirements.txt + + - name: Install Nextflow ${{ matrix.NXF_VER }} uses: nf-core/setup-nextflow@v1 with: version: "${{ matrix.NXF_VER }}" - - name: Run pipeline with test data - # TODO nf-core: You can customise CI pipeline run tests as required - # For example: adding multiple test runs with different parameters - # Remember that you can parallelise this by using strategy.matrix + - name: Set up Singularity + if: matrix.profile == 'singularity' + uses: eWaterCycle/setup-singularity@v5 + with: + singularity-version: 3.7.1 + + - name: Set up miniconda + if: matrix.profile == 'conda' + uses: conda-incubator/setup-miniconda@v2 + with: + auto-update-conda: true + channels: conda-forge,bioconda,defaults + python-version: ${{ matrix.python-version }} + + - name: Cache test data + id: cache-testdata + uses: actions/cache@v3 + with: + path: test-datasets/ + key: ${{ steps.hash_workspace.outputs.digest }} + + - name: Check out test data + if: steps.cache-testdata.outputs.cache-hit != 'true' + uses: actions/checkout@v4 + with: + repository: nf-core/test-datasets + ref: sarek3 + path: test-datasets/ + + - name: Replace remote paths in samplesheets + run: | + for f in tests/csv/3.0/*csv; do + sed -i "s=https://raw.githubusercontent.com/nf-core/test-datasets/modules/=${{ github.workspace }}/test-datasets/=g" $f + echo "========== $f ============" + cat $f + echo "========================================" + done; + + # Set up secrets + - name: Set up nextflow secrets + if: env.SENTIEON_LICENSE_BASE64 != null + run: | + nextflow secrets set SENTIEON_LICENSE_BASE64 ${{ secrets.SENTIEON_LICENSE_BASE64 }} + nextflow secrets set SENTIEON_AUTH_MECH_BASE64 ${{ secrets.SENTIEON_AUTH_MECH_BASE64 }} + SENTIEON_ENCRYPTION_KEY=$(echo -n "${{ secrets.ENCRYPTION_KEY_BASE64 }}" | base64 -d) + SENTIEON_LICENSE_MESSAGE=$(echo -n "${{ secrets.LICENSE_MESSAGE_BASE64 }}" | base64 -d) + SENTIEON_AUTH_DATA=$(python bin/license_message.py encrypt --key "$SENTIEON_ENCRYPTION_KEY" --message "$SENTIEON_LICENSE_MESSAGE") + SENTIEON_AUTH_DATA_BASE64=$(echo -n "$SENTIEON_AUTH_DATA" | base64 -w 0) + nextflow secrets set SENTIEON_AUTH_DATA_BASE64 $SENTIEON_AUTH_DATA_BASE64 + + - name: Conda clean + if: matrix.profile == 'conda' + run: conda clean -a + + - name: Run pytest-workflow + uses: Wandalen/wretry.action@v1 + with: + command: TMPDIR=~ PROFILE=${{ matrix.profile }} pytest --tag ${{ matrix.tags }} --symlink 
--kwdof --git-aware --color=yes + attempt_limit: 3 + + - name: Output log on failure + if: failure() + run: | + sudo apt install bat > /dev/null + batcat --decorations=always --color=always /home/runner/pytest_workflow_*/*/log.{out,err} + + - name: Upload logs on failure + if: failure() + uses: actions/upload-artifact@v2 + with: + name: logs-${{ matrix.profile }} + path: | + /home/runner/pytest_workflow_*/*/.nextflow.log + /home/runner/pytest_workflow_*/*/log.out + /home/runner/pytest_workflow_*/*/log.err + /home/runner/pytest_workflow_*/*/work + !/home/runner/pytest_workflow_*/*/work/conda + !/home/runner/pytest_workflow_*/*/work/singularity + + nftest-changes: + name: Check for changes (nf-test) + runs-on: ubuntu-latest + outputs: + tags: ${{ steps.filter.outputs.changes }} + + steps: + - uses: actions/checkout@v4 + + - name: Combine all tags.yml files + id: get_tags + run: find . -name "tags.yml" -not -path "./.github/*" -exec cat {} + > .github/tags.yml + + - name: debug + run: cat .github/tags.yml + + - uses: frouioui/paths-filter@main + id: filter + with: + filters: ".github/tags.yml" + token: "" + + nftest: + name: ${{ matrix.tags }} ${{ matrix.profile }} NF ${{ matrix.NXF_VER }} + runs-on: ubuntu-latest + needs: nftest-changes + if: needs.nftest-changes.outputs.tags != '[]' + strategy: + fail-fast: false + matrix: + tags: ["${{ fromJson(needs.nftest-changes.outputs.tags) }}"] + profile: ["docker"] + # profile: ["docker", "singularity", "conda"] + TEST_DATA_BASE: + - "test-datasets/data" + NXF_VER: + - "23.04.0" + - "latest-everything" + exclude: + - tags: "bwa/index" + - tags: "bwa/mem" + - tags: "cat/cat" + - tags: "cat/fastq" + - tags: "custom/dumpsoftwareversions" + - tags: "dragmap/align" + - tags: "fastp" + - tags: "fastqc" + - tags: "multiqc" + - tags: "samtools/stats" + - tags: "untar" + env: + NXF_ANSI_LOG: false + TEST_DATA_BASE: "${{ github.workspace }}/test-datasets" + SENTIEON_LICENSE_BASE64: ${{ secrets.SENTIEON_LICENSE_BASE64 }} + + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-java@v3 + with: + distribution: "temurin" + java-version: "17" + - name: Setup Nextflow + uses: nf-core/setup-nextflow@v1 + + - name: Cache nf-test installation + id: cache-software + uses: actions/cache@v3 + with: + path: | + /usr/local/bin/nf-test + /home/runner/.nf-test/nf-test.jar + key: ${{ runner.os }}-${{ env.NFTEST_VER }}-nftest + + - name: Install nf-test + if: steps.cache-software.outputs.cache-hit != 'true' + run: | + wget -qO- https://code.askimed.com/install/nf-test | bash + sudo mv nf-test /usr/local/bin/ + + - name: Setup apptainer + if: matrix.profile == 'singularity' + uses: eWaterCycle/setup-apptainer@main + + - name: Set up Singularity + if: matrix.profile == 'singularity' + run: | + mkdir -p $NXF_SINGULARITY_CACHEDIR + mkdir -p $NXF_SINGULARITY_LIBRARYDIR + + - name: Set up miniconda + uses: conda-incubator/setup-miniconda@v2 + with: + miniconda-version: "latest" + auto-update-conda: true + channels: conda-forge,bioconda,defaults + python-version: ${{ matrix.python-version }} + + - name: Conda setup + run: | + conda clean -a + conda install -n base conda-libmamba-solver + conda config --set solver libmamba + echo $(realpath $CONDA)/condabin >> $GITHUB_PATH + echo $(realpath python) >> $GITHUB_PATH + + # Set up secrets + - name: Set up nextflow secrets + if: env.SENTIEON_LICENSE_BASE64 != null + run: | + nextflow secrets set SENTIEON_LICENSE_BASE64 ${{ secrets.SENTIEON_LICENSE_BASE64 }} + nextflow secrets set SENTIEON_AUTH_MECH_BASE64 ${{ 
secrets.SENTIEON_AUTH_MECH_BASE64 }} + SENTIEON_ENCRYPTION_KEY=$(echo -n "${{ secrets.ENCRYPTION_KEY_BASE64 }}" | base64 -d) + SENTIEON_LICENSE_MESSAGE=$(echo -n "${{ secrets.LICENSE_MESSAGE_BASE64 }}" | base64 -d) + SENTIEON_AUTH_DATA=$(python3 bin/license_message.py encrypt --key "$SENTIEON_ENCRYPTION_KEY" --message "$SENTIEON_LICENSE_MESSAGE") + SENTIEON_AUTH_DATA_BASE64=$(echo -n "$SENTIEON_AUTH_DATA" | base64 -w 0) + nextflow secrets set SENTIEON_AUTH_DATA_BASE64 $SENTIEON_AUTH_DATA_BASE64 + + # Test the module + - name: Run nf-test + run: | + nf-test test \ + --profile=${{ matrix.profile }} \ + --tag ${{ matrix.tags }} \ + --tap=test.tap \ + --verbose + + confirm-pass: + runs-on: ubuntu-latest + needs: + - pytest + - nftest + if: always() + steps: + - name: All tests ok + if: ${{ success() || !contains(needs.*.result, 'failure') }} + run: exit 0 + - name: One or more tests failed + if: ${{ contains(needs.*.result, 'failure') }} + run: exit 1 + + - name: debug-print + if: always() run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results + echo "toJSON(needs) = ${{ toJSON(needs) }}" + echo "toJSON(needs.*.result) = ${{ toJSON(needs.*.result) }}" diff --git a/.github/workflows/cloudtest.yml b/.github/workflows/cloudtest.yml new file mode 100644 index 0000000000..1ad196281f --- /dev/null +++ b/.github/workflows/cloudtest.yml @@ -0,0 +1,103 @@ +name: nf-core cloud test + +on: + release: + types: [created] + workflow_dispatch: + inputs: + test: + description: "-profile test (smaller)" + type: boolean + default: true + somatic: + description: "Somatic full test" + type: boolean + default: false + germline: + description: "Germline full test" + type: boolean + default: false + aws: + description: "AWS Batch" + type: boolean + default: true + azure: + description: "Azure Batch" + type: boolean + default: true + +jobs: + trigger-profile-test: + name: Run AWS tests + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + include: + - profile: test_aws + enabled: ${{ ( github.repository == 'nf-core/sarek' ) && ( github.event_name != 'workflow_dispatch' || ( inputs.test && inputs.aws ) ) }} + - profile: test_azure + enabled: ${{ ( github.repository == 'nf-core/sarek' ) && ( github.event_name != 'workflow_dispatch' || ( inputs.test && inputs.azure ) ) }} + - profile: test_full_aws + enabled: ${{ ( github.repository == 'nf-core/sarek' ) && ( github.event_name != 'workflow_dispatch' || ( inputs.somatic && inputs.aws ) ) }} + - profile: test_full_azure + enabled: ${{ ( github.repository == 'nf-core/sarek' ) && ( github.event_name != 'workflow_dispatch' || ( inputs.somatic && inputs.azure ) ) }} + - profile: test_full_germline_aws + enabled: ${{ ( github.repository == 'nf-core/sarek' ) && ( github.event_name != 'workflow_dispatch' || ( inputs.germline && inputs.aws ) ) }} + - profile: test_full_germline_azure + enabled: ${{ ( github.repository == 'nf-core/sarek' ) && ( github.event_name != 'workflow_dispatch' || ( inputs.germline && inputs.azure ) ) }} + + steps: + # Launch workflow on AWS Batch + - name: AWS Launch + uses: seqeralabs/action-tower-launch@v2 + if: ${{ matrix.enabled && ( github.event_name != 'workflow_dispatch' || inputs.aws ) }} + with: + run_name: sarek_${{ matrix.profile }} + workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} + access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} + compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} + revision: ${{ github.sha }} + workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/sarek/work-${{ github.sha }}/${{ matrix.profile }} 
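+          # The 'parameters' block below is rendered as JSON and handed to the launched run;
+          # judging by the secret names, hook_url is a Slack webhook for megatest alerts and outdir is the S3 results prefix.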
+ parameters: | + { + "hook_url": "${{ secrets.MEGATESTS_ALERTS_SLACK_HOOK_URL }}", + "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/sarek/results-${{ github.sha }}/${{ matrix.profile }}/" + } + profiles: ${{ matrix.profile }} + + - uses: actions/upload-artifact@v3 + name: Save AWS Logs + if: success() || failure() + with: + name: tower-aws-${{ matrix.profile }}-log + path: | + tower_action_*.log + tower_action_*.json + + # Launch workflow using Tower CLI tool action + - name: Azure Launch + uses: seqeralabs/action-tower-launch@v2 + if: ${{ matrix.enabled && ( github.event_name != 'workflow_dispatch' || inputs.azure ) }} + with: + run_name: sarek_${{ matrix.profile }} + workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} + access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} + compute_env: ${{ secrets.TOWER_CE_AZURE_CPU }} + revision: ${{ github.sha }} + workdir: ${{ secrets.TOWER_BUCKET_AZURE }}/sarek/work-${{ github.sha }}/${{ matrix.profile }} + parameters: | + { + "hook_url": "${{ secrets.MEGATESTS_ALERTS_SLACK_HOOK_URL }}", + "outdir": "${{ secrets.TOWER_BUCKET_AZURE }}/sarek/results-${{ github.sha }}/${{ matrix.profile }}/" + } + profiles: ${{ matrix.profile }} + + - uses: actions/upload-artifact@v3 + name: Save Azure Logs + if: success() || failure() + with: + name: tower-azure-${{ matrix.profile }}-log + path: | + tower_action_*.log + tower_action_*.json diff --git a/.gitignore b/.gitignore index 5124c9ac77..f3bc0d64f3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,12 @@ +*.code-workspace +*.pyc .nextflow* -work/ -data/ -results/ .DS_Store -testing/ +.nf-test* +.nf-test/ +data/ testing* -*.pyc +testing/ +test-datasets/ +results/ +work/ diff --git a/.nf-core.yml b/.nf-core.yml index 3805dc81c1..36189e1fcb 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -1 +1,14 @@ repository_type: pipeline +lint: + actions_ci: False + files_exist: + - .github/workflows/awsfulltest.yml + - conf/modules.config + files_unchanged: + - .gitignore + - assets/nf-core-sarek_logo_light.png + - docs/images/nf-core-sarek_logo_dark.png + - docs/images/nf-core-sarek_logo_light.png + - lib/NfcoreTemplate.groovy + - lib/NfcoreSchema.groovy + template_strings: False diff --git a/CHANGELOG.md b/CHANGELOG.md index 37417ceb38..d1e27f7698 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,16 +1,1614 @@ # nf-core/sarek: Changelog +All notable changes to this project will be documented in this file. + The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## v3.5dev - [date] +## [dev] + +### Added + +- [#1333](https://github.com/nf-core/sarek/pull/1333) - Back to dev +- [#1335](https://github.com/nf-core/sarek/pull/1335) - Add index computation of `bcftools_annotations`, if not provided (see the sketch below) +- [#1340](https://github.com/nf-core/sarek/pull/1340) - Add Azure test profiles and megatests 
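+
+> A minimal launch sketch for the annotation-index behaviour above. File names are placeholders, and `--tools bcfann` is assumed from the bcftools annotate module added in [#1244](https://github.com/nf-core/sarek/pull/1244); since [#1335](https://github.com/nf-core/sarek/pull/1335) the `.tbi` index is computed on the fly when it is not supplied.
+
+```bash
+# Sketch: bcftools-based annotation; the index parameter is optional
+nextflow run nf-core/sarek -r dev -profile docker \
+    --input samplesheet.csv \
+    --outdir results \
+    --tools bcfann \
+    --bcftools_annotations annotations.vcf.gz
+# add --bcftools_annotations_tbi annotations.vcf.gz.tbi to skip the on-the-fly indexing
+```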
+ +### Changed + +- [#1339](https://github.com/nf-core/sarek/pull/1339) - Update sentieon-modules +- [#1344](https://github.com/nf-core/sarek/pull/1344) - Enable CRAM QC when starting from variant calling +- [#1360](https://github.com/nf-core/sarek/pull/1360) - Sync `TEMPLATE` with `tools` `2.11` + +### Fixed + +- [#1334](https://github.com/nf-core/sarek/pull/1334) - Remove extra `v` when reporting Tower runs on Slack +- [#1335](https://github.com/nf-core/sarek/pull/1335) - Add docs and validation for bcftools annotation parameters +- [#1345](https://github.com/nf-core/sarek/pull/1345) - Preserve STDERR for easier debugging +- [#1351](https://github.com/nf-core/sarek/pull/1351) - Fix params name for test profiles (`bcftools_annotations`) +- [#1357](https://github.com/nf-core/sarek/pull/1357) - Fixed bug where samples were dropped while reconstituting BAM files + +### Removed + +### Dependencies + +| Dependency | Old version | New version | | ---------- | ----------- | ----------- | | multiqc | 1.17 | 1.18 | + +### Modules / Subworkflows + +| script | Old name | New name | | ------ | -------- | -------- | + +### Parameter + +| Old name | New name | | -------------------------- | ------------------------ | | bcftools_annotations_index | bcftools_annotations_tbi | + +## [3.4.0](https://github.com/nf-core/sarek/releases/tag/3.4.0) - Pårtetjåkko + +Pårtetjåkko is a mountain in the south of the park. + +### Added + +- [#1113](https://github.com/nf-core/sarek/pull/1113) - Adding CNVkit genemetrics module +- [#1193](https://github.com/nf-core/sarek/pull/1193) - Adding support for Sentieon's DnaScope for germline variant-calling including joint-germline +- [#1244](https://github.com/nf-core/sarek/pull/1244) - Add bcf annotate module +- [#1252](https://github.com/nf-core/sarek/pull/1252) - Added NGSCheckMate tool for checking that samples come from the same individual +- [#1271](https://github.com/nf-core/sarek/pull/1271) - Back to dev +- [#1288](https://github.com/nf-core/sarek/pull/1288) - Add nf-test continuous integration (but no tests) +- [#1290](https://github.com/nf-core/sarek/pull/1290) - Add nf-test for whole pipeline + +### Changed + +- [#1278](https://github.com/nf-core/sarek/pull/1278) - Hide sentieon parameters similar to other variant callers +- [#1280](https://github.com/nf-core/sarek/pull/1280) - Replacing link to `SentieonDNAscopeModel1.1.model` in Sentieon's S3 with link to same file in igenomes' S3 +- [#1303](https://github.com/nf-core/sarek/pull/1303) - Resurrect `vep_version` params and change its scope to pipeline to enable usage for the vep loftee plugin +- [#1304](https://github.com/nf-core/sarek/pull/1304) - Update modules +- [#1311](https://github.com/nf-core/sarek/pull/1311) - Update local modules with an `environment.yml` file +- [#1317](https://github.com/nf-core/sarek/pull/1317) - Add new tools to subway map +- [#1325](https://github.com/nf-core/sarek/pull/1325) - Move `sentieon_dnascope_model` params into `igenomes.config` +- [#1325](https://github.com/nf-core/sarek/pull/1325) - Refactor config files +- [#1327](https://github.com/nf-core/sarek/pull/1327) - Update modules to have a conda environment name + +### Fixed + +- [#1277](https://github.com/nf-core/sarek/pull/1277) - Fix null value issue for Mutect2 joint calling +- [#1287](https://github.com/nf-core/sarek/pull/1287) - Adding label `process_single` to local modules +- [#1298](https://github.com/nf-core/sarek/pull/1298) - Fix annotation cache usage +- [#1301](https://github.com/nf-core/sarek/pull/1301) - Fix nf-prov usage +- [#1315](https://github.com/nf-core/sarek/pull/1315) - Avoid clash of configs of `FILTERVARIANTTRANCHES` in the Sentieon-Haplotyper and GATK-Haplotypecaller subworkflows +- [#1318](https://github.com/nf-core/sarek/pull/1318) - Fix writing of params.json on S3 +- [#1324](https://github.com/nf-core/sarek/pull/1324) - Fix various typos & code formatting +- [#1325](https://github.com/nf-core/sarek/pull/1325) - Update bcfannotate tests and related config files +- [#1328](https://github.com/nf-core/sarek/pull/1328) - Fix links to docs in `nextflow_schema.json` and `docs/output.md` +- [#1328](https://github.com/nf-core/sarek/pull/1328) - Add missing icons in `nextflow_schema.json` +- [#1330](https://github.com/nf-core/sarek/pull/1330) - Add SnpEff to full sized tests + +### Removed + +- [#1298](https://github.com/nf-core/sarek/pull/1298) - Remove `--use_annotation_cache_keys` params + +### Dependencies + +| Dependency | Old version | New version | | ---------- | ----------- | ----------- | | fastqc | 0.11.9 | 0.12.1 | | multiqc | 1.15 | 1.17 | + +### Modules / Subworkflows + +| script | Old name | New name | | ----------------------------- | ----------------------------- | ----------------------------- | | `gatk4spark/applybqsr` | `GATK4_APPLYBQSRSPARK` | `GATK4SPARK_APPLYBQSR` | | `gatk4spark/baserecalibrator` | `GATK4_BASERECALIBRATORSPARK` | `GATK4SPARK_BASERECALIBRATOR` | | `gatk4spark/markduplicates` | `GATK4_MARKDUPLICATESSPARK` | `GATK4SPARK_MARKDUPLICATES` | + +## [3.3.2](https://github.com/nf-core/sarek/releases/tag/3.3.2) - Ráhpajávvre + +Ráhpajávvre is the Lule Sámi spelling of Rapaselet. + +### Added + +- [#1246](https://github.com/nf-core/sarek/pull/1246) - Back to dev +- [#1259](https://github.com/nf-core/sarek/pull/1259) - nf-prov plugin +- [#1288](https://github.com/nf-core/sarek/pull/1288) - Add nf-test continuous integration + +### Changed + +- [#1248](https://github.com/nf-core/sarek/pull/1248) - Improve annotation-cache docs +- [#1261](https://github.com/nf-core/sarek/pull/1261) - Enable cache for annotation generation when using 'merge' + +### Fixed + +- [#1247](https://github.com/nf-core/sarek/pull/1247) - Fix result paths for full size test to be correctly displayed on the website +- [#1256](https://github.com/nf-core/sarek/pull/1256) - Fix issue with controlfreec container declaration +- [#1270](https://github.com/nf-core/sarek/pull/1270) - Revert controlfreec/assesssignificance module to 11.6 + +### Dependencies + +| Dependency | Old version | New version | | -------------------------------- | ----------- | ----------- | | Control-FREEC/assesssignificance | 11.6b | 11.6 | + +## [3.3.1](https://github.com/nf-core/sarek/releases/tag/3.3.1) - Biellorippjávrre + +A lake near the Rapaselet delta. 
+ +### Added + +- [#1231](https://github.com/nf-core/sarek/pull/1231) - Back to dev + +### Changed + +- [#1242](https://github.com/nf-core/sarek/pull/1242) - Simplify sentieon nf-core test license usage +- [#1243](https://github.com/nf-core/sarek/pull/1243) - Improve json schema usage for input + +### Fixed + +- [#1232](https://github.com/nf-core/sarek/pull/1232) - Fix Zenodo IDs in manifest +- [#1236](https://github.com/nf-core/sarek/pull/1236) - Fix annotation cache folder verification when no annotation +- [#1240](https://github.com/nf-core/sarek/pull/1240) - Disable JVM Hotspot in all modules/gatk4 ([#1030](https://github.com/nf-core/sarek/issues/1030)) +- [#1241](https://github.com/nf-core/sarek/pull/1241) - Fix axis text of controlfreec plots closing [#921](https://github.com/nf-core/sarek/issues/921) + +### Dependencies + +| Dependency | Old version | New version | +| ------------- | ----------- | ----------- | +| Control-FREEC | 11.6 | 11.6b | + +## [3.3.0](https://github.com/nf-core/sarek/releases/tag/3.3.0) - Rapaselet + +Rapaselet is a delta formed by the Rapaätno river between the Bielloriehppe massif (formerly written Piellorieppe) and the Skårki massif. + +### Added + +- [#930](https://github.com/nf-core/sarek/pull/930) - Add more manual tests +- [#1130](https://github.com/nf-core/sarek/pull/1130) - Back to dev +- [#1013](https://github.com/nf-core/sarek/pull/1013) - Mutect2 multi sample mode with `--joint_mutect2` +- [#1153](https://github.com/nf-core/sarek/pull/1153) - Add input validation for Sentieon & FGBio UMI incompatibility +- [#1158](https://github.com/nf-core/sarek/pull/1158) - Add preprint +- [#1159](https://github.com/nf-core/sarek/pull/1159) - ISMB Poster +- [#1173](https://github.com/nf-core/sarek/pull/1173) - CI tests for VQSR track with stub runs +- [#1122](https://github.com/nf-core/sarek/pull/1122), [#1196](https://github.com/nf-core/sarek/pull/1196) - Add `annotation cache` functionality +- [#1184](https://github.com/nf-core/sarek/pull/1184) - Stub-based CI-test of Sentieon joint-germline variant-calling with VQSR + +### Changed + +- [#1151](https://github.com/nf-core/sarek/pull/1151) - Refactor codebase +- [#1157](https://github.com/nf-core/sarek/pull/1157) - Move all vep args from `ext.args` to `params.vep_custom_args` to allow easier modifications +- [#1059](https://github.com/nf-core/sarek/pull/1059) - Add `nf-validation` for samplesheet validation +- [#1160](https://github.com/nf-core/sarek/pull/1160) - Updating tiddit to v3.6.1 +- [#1166](https://github.com/nf-core/sarek/pull/1166) - More info about `--tools` +- [#1173](https://github.com/nf-core/sarek/pull/1173) - Refactor single sample filtering of Haplotypecaller generated VCFs ([#1053](https://github.com/nf-core/sarek/pull/1053)) +- [#1174](https://github.com/nf-core/sarek/pull/1174) - Updating multiqc to v1.15 +- [#1179](https://github.com/nf-core/sarek/pull/1179) - Unhide params `trim_fastq`, `umi_read_structure`, and `aligner` +- [#1180](https://github.com/nf-core/sarek/pull/1180) - Updating the nf-core modules +- [#1198](https://github.com/nf-core/sarek/pull/1198) - Prepare release `3.3.0` +- [#1200](https://github.com/nf-core/sarek/pull/1200) - Streamline Github Actions workflows +- [#1212](https://github.com/nf-core/sarek/pull/1212) - Use matrix for AWS megatests +- [#1218](https://github.com/nf-core/sarek/pull/1218) - Remove Singularity tests for GHA +- [#1227](https://github.com/nf-core/sarek/pull/1227) - Update modules + +### Fixed + +- [#1143](https://github.com/nf-core/sarek/pull/1143) 
- `snpeff_db` is now a string +- [#1145](https://github.com/nf-core/sarek/pull/1145) - Fixed Zenodo links in `README.md` and in `WorkflowMain.groovy` +- [#1149](https://github.com/nf-core/sarek/pull/1149) - Update `Manta` modules and fix usage of `--exome` flag +- [#1155](https://github.com/nf-core/sarek/pull/1155) - Restore proper rendering in `usage.md` +- [#1163](https://github.com/nf-core/sarek/pull/1163) - Correcting location of output folder for joint variant calling with GATK's haplotypecaller +- [#1169](https://github.com/nf-core/sarek/pull/1169) - Updating Sentieon-modules. (The conda-check in the Sentieon-modules was moved to the script-section. The version of Sentieon remains unchanged.) +- [#1171](https://github.com/nf-core/sarek/pull/1171) - Fix channel logic for germline resource to skip GetPileupSummary if not provided +- [#1172](https://github.com/nf-core/sarek/pull/1172) - Publish gvcf files when all intervals are processed at once ([#764](https://github.com/nf-core/sarek/issues/764)) +- [#1173](https://github.com/nf-core/sarek/pull/1173) - Fixed duplicated entries in joint germline recalibrated VCF ([#966](https://github.com/nf-core/sarek/pull/966), [#1102](https://github.com/nf-core/sarek/pull/1102)), + fixed grouping of joint germline recalibrated VCF ([#1137](https://github.com/nf-core/sarek/pull/1137)) +- [#1177](https://github.com/nf-core/sarek/pull/1177) - Fix status inference when using nf-validation plugin +- [#1181](https://github.com/nf-core/sarek/pull/1181) - Fix join mismatch error in Mutect2 tumor only subworkflow +- [#1183](https://github.com/nf-core/sarek/pull/1183) - Add docs for concatenated germline variants +- [#1184](https://github.com/nf-core/sarek/pull/1184) - Fix issue with duplicated variants in VCF from Sentieon-based joint-germline variant-calling with VQSR. (Corresponding to [#966](https://github.com/nf-core/sarek/issues/966) for GATK.) +- [#1192](https://github.com/nf-core/sarek/pull/1192) - Add `ASCATprofile.png` to ASCAT output docs +- [#1197](https://github.com/nf-core/sarek/pull/1197) - Improve `tower.yml` file to display reports in Tower ([#1190](https://github.com/nf-core/sarek/issues/1190)) +- [#1202](https://github.com/nf-core/sarek/pull/1202) - Remove GHA step that caches Nextflow and bump other out of date actions +- [#1203](https://github.com/nf-core/sarek/pull/1203) - Fix issue with Singularity containers on test profiles +- [#1204](https://github.com/nf-core/sarek/pull/1204) - Fix issue with nf-validation: lane can be a requirement of bam too now +- [#1205](https://github.com/nf-core/sarek/pull/1205) - Fewer tests triggered +- [#1214](https://github.com/nf-core/sarek/pull/1214) - Don't pass in intervals file to ControlFREEC for WGS analysis +- [#1215](https://github.com/nf-core/sarek/pull/1215) - Fix `meta.id` for mutect2 tumor_only subworkflows +- [#1216](https://github.com/nf-core/sarek/pull/1216) - Better test coverage for variant calling `*_all` subworkflows +- [#1217](https://github.com/nf-core/sarek/pull/1217) - Fix `groupTuple` statement for mutect2 tumor_only subworkflows +- [#1220](https://github.com/nf-core/sarek/pull/1220) - Fix channel and meta logic for `joint_mutect2` feature +- [#1221](https://github.com/nf-core/sarek/pull/1221) - Remove `lane` meta field after samplesheet validation to ensure proper merging after mapping +- [#1222](https://github.com/nf-core/sarek/pull/1222) - Better documentation for annotation cache +- [#1224](https://github.com/nf-core/sarek/pull/1224) - Update BCFTOOLS_SORT module with `--temp-dir .` added as option, which was required for Singularity +- [#1225](https://github.com/nf-core/sarek/pull/1225) - Better test coverage for all tests +- [#1227](https://github.com/nf-core/sarek/pull/1227) - Lint warning fix +- [#1229](https://github.com/nf-core/sarek/pull/1229) - Fix md5sum for gatk4_spark tests +- [#1230](https://github.com/nf-core/sarek/pull/1230) - Fix md5sum for sentieon aligner tests + +### Dependencies + +| Dependency | Old version | New version | | ------------- | ------------------------- | ------------------------ | | `cnvkit` | 0.9.9 (`samtools` 1.16.1) | 0.9.10 (`samtools` 1.17) | | `ensembl-vep` | 108 | 110 | | `grep` | 3.4 | 3.11 | | `multiqc` | 1.14 | 1.15 | | `tiddit` | 3.3.2 | 3.6.1 | + +## [3.2.3](https://github.com/nf-core/sarek/releases/tag/3.2.3) - Gällivare + +Gällivare is a small lake next to Pierikjaure. + +### Added + +- [#1112](https://github.com/nf-core/sarek/pull/1112) - Back to dev +- [#1119](https://github.com/nf-core/sarek/pull/1119) - Added `help_text` for `input_output_options` group in schema +- [#1044](https://github.com/nf-core/sarek/pull/1044) - Adding support for several tools from Sentieon's DNAseq package. The standard fastq-to-vcf processing can now be done using Sentieon's DNAseq tools `ApplyVarCal`, `bwa mem`, `Dedup`, `GVCFtyper`, `Haplotyper`, `LocusCollector` and `VarCal`. + +### Changed + +- [#1119](https://github.com/nf-core/sarek/pull/1119) - Remove `null` by default in schema +- [#1128](https://github.com/nf-core/sarek/pull/1128) - Prepare release `3.2.3` + +### Fixed + +- [#1118](https://github.com/nf-core/sarek/pull/1118) - Remove `public_aws_ecr` profile + +## [3.2.2](https://github.com/nf-core/sarek/releases/tag/3.2.2) - Vuoinesluobbalah + +Vuoinesluobbalah is a lake close to Bierikjávrre. + +### Added + +- [#1106](https://github.com/nf-core/sarek/pull/1106) - Add Slack integration to Megatests +- [#1107](https://github.com/nf-core/sarek/pull/1107) - Add `singularity.registry` to `public_aws_ecr` + +### Changed + +- [#1087](https://github.com/nf-core/sarek/pull/1087) - Back to dev +- [#1087](https://github.com/nf-core/sarek/pull/1087) - Minor modules update +- [#1088](https://github.com/nf-core/sarek/pull/1088) - Replace profile `test` by `test_cache` and add a `test` profile without hidden files +- [#1095](https://github.com/nf-core/sarek/pull/1095) - Prepare release `3.2.2` + +### Fixed + +- [#1087](https://github.com/nf-core/sarek/pull/1087) - Fix wrong default memory in GATK4_CREATESEQUENCEDICTIONARY [#1085](https://github.com/nf-core/sarek/pull/1085) +- [#1089](https://github.com/nf-core/sarek/pull/1089) - Remove duplicated code +- [#1093](https://github.com/nf-core/sarek/pull/1093) - Fixing Ascat by reverting meta.id in channels allele_files, loci_files, gc_file and rt_file to baseName. +- [#1098](https://github.com/nf-core/sarek/pull/1098) - Fix Channel issue in Mutect2 subworkflow [#1094](https://github.com/nf-core/sarek/pull/1094) +- [#1100](https://github.com/nf-core/sarek/pull/1100) - Remove duplicate index with deepvariant when no_intervals [#1069](https://github.com/nf-core/sarek/pull/1069) +- [#1101](https://github.com/nf-core/sarek/pull/1101) - Remove duplicate index computation for GATK4 Markduplicates & [#1065](https://github.com/nf-core/sarek/issues/1065) +- [#1101](https://github.com/nf-core/sarek/pull/1101) - Fix GATK4 version for GATK4 MarkduplicatesSpark [#1068](https://github.com/nf-core/sarek/issues/1068) +- [#1105](https://github.com/nf-core/sarek/pull/1105) - Remove `params.tracedir` +- [#1108](https://github.com/nf-core/sarek/pull/1108) - Refactor bad prefix definition for vcf files [#938](https://github.com/nf-core/sarek/issues/938) +- [#1109](https://github.com/nf-core/sarek/pull/1109) - Fix `mpileup` for variantcalling: only `bcftools` run and file publishing + +## [3.2.1](https://github.com/nf-core/sarek/releases/tag/3.2.1) - Pierikjaure + +Pierikjaure is a previous spelling of Bierikjávrre. + +### Changed + +- [#1073](https://github.com/nf-core/sarek/pull/1073) - Back to dev +- [#1080](https://github.com/nf-core/sarek/pull/1080) - Prepare release `3.2.1` +- [#1082](https://github.com/nf-core/sarek/pull/1082) - Bump minimal Nextflow version to 23.04.0 + +### Fixed + +- [#1078](https://github.com/nf-core/sarek/pull/1078) - Update tabix/bgziptabix module to fix typo +- [#1079](https://github.com/nf-core/sarek/pull/1079) - Fixed typo in profile name for tower aws megatests +- [#1082](https://github.com/nf-core/sarek/pull/1082) - Patch more modules to use quay.io registry +- [#1082](https://github.com/nf-core/sarek/pull/1082) - Update `public_aws_ecr` profile +- [#1082](https://github.com/nf-core/sarek/pull/1082) - Add quay.io as singularity default registry + +## [3.2.0](https://github.com/nf-core/sarek/releases/tag/3.2.0) - Bierikjávrre + +Bierikjávrre is one of the largest lakes in Sarek. + +### Added + +- [#864](https://github.com/nf-core/sarek/pull/864) - Added possibilities to export assembled haplotypes and locally realigned reads +- [#792](https://github.com/nf-core/sarek/pull/792) - Added the option `--concatenate_vcfs` for concatenating the germline VCF files. By default, the resulting VCF files will be placed under `/variant_calling/concat` +- [#889](https://github.com/nf-core/sarek/pull/889) - Added possibilities to skip variant filtering after Haplotypecaller +- [#945](https://github.com/nf-core/sarek/pull/945) - Adding Adam Talbot to contributor list +- [#954](https://github.com/nf-core/sarek/pull/954) - Adding keys for annotation with snpeff and ensemblvep for `hg19`, `hg38` and `mm10` +- [#967](https://github.com/nf-core/sarek/pull/967) - Adding new `outdir_cache` params +- [#971](https://github.com/nf-core/sarek/pull/971) - Subtle bugfix to correct mutation of FASTP output channel objects +- [#978](https://github.com/nf-core/sarek/pull/978) - Validate that patient/sample does not contain spaces +- [#981](https://github.com/nf-core/sarek/pull/981) - Added documentation on generating ASCAT resources for exome and targeted sequencing +- [#1041](https://github.com/nf-core/sarek/pull/1041) - Add params `vep_custom_args` to let user specify custom params more easily for `VEP` +- [#1045](https://github.com/nf-core/sarek/pull/1045) - Add `public_aws_ecr` for using ECR hosted containers + +### Changed + +- [#859](https://github.com/nf-core/sarek/pull/859) - Back to dev +- [#860](https://github.com/nf-core/sarek/pull/860) - Replace local subworkflow with nf-core version - `vcf_annotate_snpeff` +- [#865](https://github.com/nf-core/sarek/pull/865) - Replace local subworkflow with nf-core version - `vcf_annotate_ensemblvep` +- [#874](https://github.com/nf-core/sarek/pull/874) - Update all modules +- [#882](https://github.com/nf-core/sarek/pull/882) - Remove exit strategy for `Manta`/`Strelka` +- [#890](https://github.com/nf-core/sarek/pull/890) - Sync `TEMPLATE` with `tools` `2.7.1` +- [#896](https://github.com/nf-core/sarek/pull/896) - Code refactoring +- [#898](https://github.com/nf-core/sarek/pull/898) - Nextflow minimal version is now `22.10.1` +- [#898](https://github.com/nf-core/sarek/pull/898) - Sync `TEMPLATE` with `tools` `2.7.2` +- [#909](https://github.com/nf-core/sarek/pull/909) - Cache test data on GHA +- [#928](https://github.com/nf-core/sarek/pull/928) - No need for BAI when starting from uBAM +- [#935](https://github.com/nf-core/sarek/pull/935) - Add params `build_only_index` to only build index +- [#936](https://github.com/nf-core/sarek/pull/936) - Add params `download_cache` to download annotation cache +- [#942](https://github.com/nf-core/sarek/pull/942) - Update `README.md` +- [#967](https://github.com/nf-core/sarek/pull/967) - Update and detail extensively how to use annotation cache +- [#968](https://github.com/nf-core/sarek/pull/968) - Update all modules +- [#1011](https://github.com/nf-core/sarek/pull/1011) - Sync `TEMPLATE` with `tools` `2.8` +- [#1012](https://github.com/nf-core/sarek/pull/1012) - Better handling of meta maps in `bam_variant_calling_somatic_mutect2` +- [#1014](https://github.com/nf-core/sarek/pull/1014) - `snpeff_db` is now only the `db` version and not `genome.db` +- [#1015](https://github.com/nf-core/sarek/pull/1015) - Increase default value for `--nucleotides_per_second` to `200000` resulting in 21 groups for `GATK.GRCh38` +- [#1019](https://github.com/nf-core/sarek/pull/1019) - Set a default registry outside of profile scope +- [#1031](https://github.com/nf-core/sarek/pull/1031) - Update pipeline summary +- [#1032](https://github.com/nf-core/sarek/pull/1032) - Update all modules +- [#1051](https://github.com/nf-core/sarek/pull/1051) - Update more modules +- 
[#1056](https://github.com/nf-core/sarek/pull/1056) - Bump pipeline version to `3.2.0` + +### Fixed + +- [#870](https://github.com/nf-core/sarek/pull/870) - Fix output for locally realigned reads from haplotypecaller +- [#874](https://github.com/nf-core/sarek/pull/874) - Remove `CITATION.cff` +- [#893](https://github.com/nf-core/sarek/pull/893) - Fix logic of when to execute tabix on dbsnp +- [#894](https://github.com/nf-core/sarek/pull/894) - Add description to `--cnvkit_reference` +- [#894](https://github.com/nf-core/sarek/pull/894) - Remove methods description TODO prompt +- [#927](https://github.com/nf-core/sarek/pull/927) - Fix tumor only variant calling issues with freebayes following [#896](https://github.com/nf-core/sarek/pull/896) +- [#928](https://github.com/nf-core/sarek/pull/928) - Fix [#700](https://github.com/nf-core/sarek/issues/700) +- [#929](https://github.com/nf-core/sarek/pull/929) - Fix somatic variant calling issues with msisensor following [#896](https://github.com/nf-core/sarek/pull/896) +- [#941](https://github.com/nf-core/sarek/pull/941) - Fix json validation for `tools`, `skip_tools` and `use_gatk_spark` [#892](https://github.com/nf-core/sarek/issues/892) +- [#954](https://github.com/nf-core/sarek/pull/954) - Fix missing annotation keys with `snpeff` and `ensemblvep` for `hg19` +- [#957](https://github.com/nf-core/sarek/pull/957) - Add `failOnDuplicate` and `failOnMismatch` options to all `join()` operators where possible +- [#982](https://github.com/nf-core/sarek/pull/982) - Remove usage of exit statements, using `Nextflow.error` instead +- [#985](https://github.com/nf-core/sarek/pull/985) - Cache correctly identifies when it needs to be updated +- [#988](https://github.com/nf-core/sarek/pull/988) - Updated ascat module to fix seed for reproducibility +- [#998](https://github.com/nf-core/sarek/pull/998) - Remove parallelization within a sample for `Manta` +- [#1014](https://github.com/nf-core/sarek/pull/1014) - Fix calls to `ensemblvep` and `snpeff` containers +- [#1022](https://github.com/nf-core/sarek/pull/1022) - Fix call to variantrecalibrator. (Making sure that dbsnp_vqsr, known_indels_vqsr and known_snps_vqsr are channels, and not strings.) 
+- [#1039](https://github.com/nf-core/sarek/pull/1039) - Remove concatenate_vcfs tests with singularity, as they are failing due to not enough space on GHA runners +- [#1040](https://github.com/nf-core/sarek/pull/1040) - Fix dict channel issue due to [#1032](https://github.com/nf-core/sarek/pull/1032) +- [#1043](https://github.com/nf-core/sarek/pull/1043) - Fix typo in the tags.yml files from [#978](https://github.com/nf-core/sarek/pull/978) +- [#1048](https://github.com/nf-core/sarek/pull/1048) - Skip tool validation on annotation to fix [#949](https://github.com/nf-core/sarek/issues/949), check that bam is bam and cram is cram [#895](https://github.com/nf-core/sarek/issues/895) +- [#1050](https://github.com/nf-core/sarek/pull/1050) - Disable GATK VCF filters when joint calling to fix [#1025](https://github.com/nf-core/sarek/issues/1025) +- [#1055](https://github.com/nf-core/sarek/pull/1055) - Fix pattern for fasta file in the json schema +- [#1058](https://github.com/nf-core/sarek/pull/1058) - Fix container declaration for VCFTOOLS as it has been updated in the registry +- [#1061](https://github.com/nf-core/sarek/pull/1061) - Fix GenomicsDB so it also works with one interval file, fix results publishing of GenomicsDB +- [#1062](https://github.com/nf-core/sarek/pull/1062) - Fix automatic restart from steps +- [#1063](https://github.com/nf-core/sarek/pull/1063) - Fix join duplication for manta/strelka + +### Removed + +- [#898](https://github.com/nf-core/sarek/pull/898) - Param `enable_conda` was removed +- [#1070](https://github.com/nf-core/sarek/pull/1070) - Remove Sarek version from workflow and subway map pictures + +### Dependencies + +| Dependency | Old version | New version | | ------------- | ----------- | ----------- | | `ascat` | 3.0.0 | 3.1.1 | | `bcftools` | 1.15.1 | 1.17 | | `deepvariant` | 1.4.0 | 1.5.0 | | `ensembl-vep` | 106.1 | 108.2 | | `fastp` | 0.23.2 | 0.23.4 | | `multiqc` | 1.13a | 1.14 | | `samtools` | 1.16 | 1.17 | | `svdb` | 2.6.1 | 2.8.1 | + +### Modules / Subworkflows + +| script | Old name | New name | | --------------------- | ------------ | --------------------- | | `ensemblvep/download` | | `ENSEMBLVEP_DOWNLOAD` | | `ensemblvep/vep` | `ENSEMBLVEP` | `ENSEMBLVEP_VEP` | | `snpeff/download` | | `SNPEFF_DOWNLOAD` | | `snpeff/snpeff` | `SNPEFF` | `SNPEFF_SNPEFF` | + +## [3.1.2](https://github.com/nf-core/sarek/releases/tag/3.1.2) - Lesser Lule River + +Lesser Lule River is English for Lilla Luleälven. + +### Added + +### Changed + +### Fixed + +- [#906](https://github.com/nf-core/sarek/pull/906) - Remove usages of deprecated `Channel.from` method + +### Deprecated + +### Removed + +### Dependencies + +## [3.1.1](https://github.com/nf-core/sarek/releases/tag/3.1.1) - Lilla Luleälven + +Lilla Luleälven river's main tributary is Rapaätno. + +### Added + +- [#856](https://github.com/nf-core/sarek/pull/856) - Add annotation for `R64-1-1` and `UMD3.1` + +### Changed + +- [#855](https://github.com/nf-core/sarek/pull/855) - Speed up duplicate marking by using `samtools` for CRAM conversion +- [#858](https://github.com/nf-core/sarek/pull/858) - Prepare release `3.1.1` + +### Fixed + +- [#851](https://github.com/nf-core/sarek/pull/851) - Fix `schema` definition `None` for `cf_chrom_len` + +### Deprecated + +### Removed + +### Dependencies + +## [3.1](https://github.com/nf-core/sarek/releases/tag/3.1) - Rapaätno + +Rapaätno is the river you can see from the Skierfe mountain. 
+ +### Added + +- [#735](https://github.com/nf-core/sarek/pull/735) - GATK Markduplicates now natively supports CRAM output +- [#774](https://github.com/nf-core/sarek/pull/774) - Add logo for Danish National Genome Center +- [#783](https://github.com/nf-core/sarek/pull/783) - Add paths for chr length used by controlfreec to GRCh38 config +- [#820](https://github.com/nf-core/sarek/pull/820) - Improve documentation on scatter/gather effects +- [#833](https://github.com/nf-core/sarek/pull/833) - Add name to CI tests to avoid confusion between runs + +### Changed + +- [#735](https://github.com/nf-core/sarek/pull/735) - `--save_mapped` now saves mapping output in CRAM format +- [#762](https://github.com/nf-core/sarek/pull/762) - Back to dev +- [#762](https://github.com/nf-core/sarek/pull/762) - Update deepvariant module +- [#773](https://github.com/nf-core/sarek/pull/773) - Sync `TEMPLATE` with `tools` `2.6` +- [#782](https://github.com/nf-core/sarek/pull/782) - Reduce scatter/gather for full size tests on AWS +- [#785](https://github.com/nf-core/sarek/pull/785) - Update description of `bcftools stats` +- [#784](https://github.com/nf-core/sarek/pull/784) - Update all subworkflow names thanks to @scorreard +- [#806](https://github.com/nf-core/sarek/pull/806) - Refactor all tests +- [#806](https://github.com/nf-core/sarek/pull/806) - Split up `modules.config` file +- [#810](https://github.com/nf-core/sarek/pull/810) - Update CHANGELOG +- [#821](https://github.com/nf-core/sarek/pull/821) - Change `replace` to `putIfAbsent` for automatic search of `input` if none is provided to avoid overwriting values +- [#822](https://github.com/nf-core/sarek/pull/822) - Update modules with `nf-core modules update -a`: Update GATK version to 4.3.0 +- [#827](https://github.com/nf-core/sarek/pull/827) - Add `--genomicsdb-shared-posixfs-optimizations true --bypass-feature-reader` to `GenomicsDB` parameters to speed up the analysis +- [#842](https://github.com/nf-core/sarek/pull/842) - Increase default memory for samtools stats +- [#844](https://github.com/nf-core/sarek/pull/844) - All small scale tests are run on PR to `master` + +### Fixed + +- [#762](https://github.com/nf-core/sarek/pull/762) - Polish CHANGELOG + figures +- [#766](https://github.com/nf-core/sarek/pull/766) - Align box description in subway map +- [#768](https://github.com/nf-core/sarek/pull/768) - Use double quotes to fix import of singularity images for deepvariant module +- [#770](https://github.com/nf-core/sarek/pull/770) - Use double quotes to fix import of singularity images for gatk4/cnnscorevariants module +- [#771](https://github.com/nf-core/sarek/pull/771) - Update to new modules syntax +- [#777](https://github.com/nf-core/sarek/pull/777) - Fix mixed-up AWS full size test output paths +- [#790](https://github.com/nf-core/sarek/pull/790) - Fix issue [#789](https://github.com/nf-core/sarek/issues/789) somatic mutect2 test +- [#793](https://github.com/nf-core/sarek/pull/793) - Remove DeepVariant GVCF from annotation +- [#794](https://github.com/nf-core/sarek/pull/794) - Fix publishing for unzipped reference files +- [#807](https://github.com/nf-core/sarek/pull/807) - Fix read group when uBAMs are provided (see issue [#732](https://github.com/nf-core/sarek/issues/732)) +- [#813](https://github.com/nf-core/sarek/pull/813) - Fix input validation when launching from website (see issue [#694](https://github.com/nf-core/sarek/issues/694)) +- [#814](https://github.com/nf-core/sarek/pull/814) - Fix readgroups when using DragMap together with FreeBayes or Mutect2 (see issue [#780](https://github.com/nf-core/sarek/issues/780)) +- [#817](https://github.com/nf-core/sarek/pull/817) - Fix CNVKit run on tumor-only sample to be run on all samples +- [#828](https://github.com/nf-core/sarek/pull/828) - Fix issue [#763](https://github.com/nf-core/sarek/issues/763) to run variant calling when starting from step recalibration +- [#837](https://github.com/nf-core/sarek/pull/837) - Fix Freebayes config selector after subworkflow renaming +- [#839](https://github.com/nf-core/sarek/pull/839) - Remove `copyTo` method that fails on S3 when the source and destination buckets are in different regions +- [#841](https://github.com/nf-core/sarek/pull/841) - Fix path priority for `cf_chrom_len` + +### Deprecated + +### Removed + +### Dependencies + +| Dependency | Old version | New version | | ------------- | ----------- | ----------- | | `bcftools` | 1.15.1 | 1.16 | | `deepvariant` | 1.3.0 | 1.4.0 | | `freebayes` | 1.3.5 | 1.3.6 | | `gatk4` | 4.2.6.1 | 4.3.0.0 | | `samtools` | 1.15.1 | 1.16.1 | | `tiddit` | 3.1.0 | 3.3.2 | + +## [3.0.2](https://github.com/nf-core/sarek/releases/tag/3.0.2) - Lájtávrre + +Lájtávrre is a lake you can see from the Skierfe mountain, formed by the Rapaätno river. + +### Added + +- [#691](https://github.com/nf-core/sarek/pull/691) - Enable `PROFILE=conda`, `PROFILE=docker` and `PROFILE=singularity` for pytest +- [#716](https://github.com/nf-core/sarek/pull/716) - Add documentation for Azure recommended config vm_size +- [#752](https://github.com/nf-core/sarek/pull/752) - Add tracking of all dependencies starting with 3.0 + +### Changed + +- [#679](https://github.com/nf-core/sarek/pull/679) - Back to `dev` +- [#685](https://github.com/nf-core/sarek/pull/685) - Updating the nf-core modules used by Sarek +- [#691](https://github.com/nf-core/sarek/pull/691) - To run the same pytest as before locally, use `PROFILE=docker` +- [#692](https://github.com/nf-core/sarek/pull/692) - Use `params.tools=strelka` in profile `test` +- [#696](https://github.com/nf-core/sarek/pull/696) - Adding check of md5-sums in CI-tests +- [#719](https://github.com/nf-core/sarek/pull/719) - Added boxes to subway map +- [#720](https://github.com/nf-core/sarek/pull/720) - Sync `TEMPLATE` with `tools` `2.5` +- [#723](https://github.com/nf-core/sarek/pull/723) - Sync `TEMPLATE` with `tools` `2.5.1` +- [#726](https://github.com/nf-core/sarek/pull/726) - Adapt resource requests +- [#730](https://github.com/nf-core/sarek/pull/730) - Reduce number of tests +- [#731](https://github.com/nf-core/sarek/pull/731) - Run the somatic test as default on `-profile test_full`, the germline can be tested with `-profile test_full_germline` +- [#733](https://github.com/nf-core/sarek/pull/733) - Add description for `params.cf_chrom_len` +- [#734](https://github.com/nf-core/sarek/pull/734) - `nf-core modules update -a` +- [#736](https://github.com/nf-core/sarek/pull/736) - More extensive CI for default test +- [#742](https://github.com/nf-core/sarek/pull/742) - Requiring the Haplotypecaller to be specified as one of the tools for joint germline genotyping +- [#752](https://github.com/nf-core/sarek/pull/752) - Code polishing + +### Fixed + +- [#679](https://github.com/nf-core/sarek/pull/679) - Fixed typos in subway maps +- [#681](https://github.com/nf-core/sarek/pull/681) - Fixed intermediate files published cf. [#680](https://github.com/nf-core/sarek/issues/680) +- [#688](https://github.com/nf-core/sarek/pull/688) - Fixed VEP plugins issue cf. 
[#687](https://github.com/nf-core/sarek/issues/687) +- [#689](https://github.com/nf-core/sarek/pull/689) - Fixed `when` clause for building mapping indexes with aligners other than `BWA mem` +- [#704](https://github.com/nf-core/sarek/pull/704) - Fixed `cf_ploidy` to string instead of number +- [#705](https://github.com/nf-core/sarek/pull/705) - Fix publishing for processes in `alignment_to_fastq` subworkflow; prevent tabix computation for `known_snps` when present; publish `umi` processed files into `preprocessing/umi` subdirectory +- [#706](https://github.com/nf-core/sarek/pull/706) - Fixed `vep_version` not found error when running `--vep_loftee` +- [#724](https://github.com/nf-core/sarek/pull/724) - Fixed prettier issue +- [#727](https://github.com/nf-core/sarek/pull/727) - Allow `.list` interval files; remove `seconds` from GRCh38 file to allow `--nucleotides_per_second` to be used +- [#728](https://github.com/nf-core/sarek/pull/728) - Circumvent issue with controlfreec and length file containing regions not in intervals file +- [#729](https://github.com/nf-core/sarek/pull/729) - Trailing commas in `--tools`, `--skip_tools` and `--use_gatk_spark` now raise failure cf. [#722](https://github.com/nf-core/sarek/issues/722) +- [#741](https://github.com/nf-core/sarek/pull/741) - Fix prefix for `bcftools sort` for joint germline variant calling +- [#743](https://github.com/nf-core/sarek/pull/743) - Remove profile definitions in profile to avoid issues with Tower +- [#758](https://github.com/nf-core/sarek/pull/758) - Fix Zenodo batch +- [#760](https://github.com/nf-core/sarek/pull/760) - Fix CHANGELOG dependencies +- [#761](https://github.com/nf-core/sarek/pull/761) - Fix font in subway map and workflow image + +### Deprecated + +### Removed + +- [#742](https://github.com/nf-core/sarek/pull/742) - Removed some lines from the usage doc as Sarek no longer supports input supplied as a list of multiple csv-files +- [#757](https://github.com/nf-core/sarek/pull/757) - Remove `errorStrategy` in `conf/modules.config` + +## [3.0.1](https://github.com/nf-core/sarek/releases/tag/3.0.1) - Saiva + +Saiva is a lake in the Sarek National Park, just below the Skierfe mountain. + +### Fixed + +- [#708](https://github.com/nf-core/sarek/pull/708) - Fix mpileup bug: update nf-core module `samtools/mpileup` to subset CRAM file by intervals + +## [3.0](https://github.com/nf-core/sarek/releases/tag/3.0) - Skierfe + +Skierfe is a mountain in the Sarek National Park, and the inspiration for the logo. 
+
+### Added
+
+- [#388](https://github.com/nf-core/sarek/pull/388) - Add cram support + read splitting with `SeqKit` for speedup
+- [#394](https://github.com/nf-core/sarek/pull/394) - Add `DeepVariant`
+- [#411](https://github.com/nf-core/sarek/pull/411) - Add CRAM support in the CSV samplesheet
+- [#448](https://github.com/nf-core/sarek/pull/448) - Allow to skip base quality recalibration with `--skip_bqsr`
+- [#449](https://github.com/nf-core/sarek/pull/449) - [@FriederikeHanssen](https://github.com/FriederikeHanssen) is now a `CODEOWNERS`
+- [#460](https://github.com/nf-core/sarek/pull/460) - Add posters
+- [#463](https://github.com/nf-core/sarek/pull/463) - Add dark/light logo versions
+- [#464](https://github.com/nf-core/sarek/pull/464), [#514](https://github.com/nf-core/sarek/pull/514) - Add `DRAGMAP` as a possible aligner
+- [#479](https://github.com/nf-core/sarek/pull/479) - Add more subworkflows
+- [#485](https://github.com/nf-core/sarek/pull/485) - `--skip_qc`, `--skip_markduplicates` and `--skip_bqsr` are now `--skip_tools`
+- [#507](https://github.com/nf-core/sarek/pull/507), [#537](https://github.com/nf-core/sarek/pull/537) - Subway map for building indexes
+- [#512](https://github.com/nf-core/sarek/pull/512), [#531](https://github.com/nf-core/sarek/pull/531), [#537](https://github.com/nf-core/sarek/pull/537) - Subway map for pipeline
+- [#522](https://github.com/nf-core/sarek/pull/522) - Add QC for vcf files & MultiQC
+- [#533](https://github.com/nf-core/sarek/pull/533) - Add param `--only_paired_variant_calling` to allow skipping of germline variant calling for paired samples
+- [#536](https://github.com/nf-core/sarek/pull/536) - Add `--step markduplicates` to start from duplicate marking, `--step prepare_recalibration` now ONLY starts at process `BaseRecalibrator` & adding `bam` and `cram` input support for `--step` `markduplicates`, `prepare_recalibration`, `recalibrate`, and `variant_calling`
+- [#538](https://github.com/nf-core/sarek/pull/538) - Add param `--seq_platform`, default: `ILLUMINA`
+- [#545](https://github.com/nf-core/sarek/pull/545) - Add modules and subworkflows for `cnvkit` tumor_only mode
+- [#540](https://github.com/nf-core/sarek/pull/540) - Add modules and subworkflows for `cnvkit` somatic mode
+- [#557](https://github.com/nf-core/sarek/pull/557) - Add `Haplotypecaller` single sample mode together with `CNNScoreVariants` and `FilterVariantTranches`
+- [#576](https://github.com/nf-core/sarek/pull/576) - Add modules and subworkflows for `cnvkit` germline mode
+- [#582](https://github.com/nf-core/sarek/pull/582) - Added option `--vep_out_format` for setting the format of the output file from VEP to `json`, `tab` or `vcf` (default)
+- [#594](https://github.com/nf-core/sarek/pull/594) - Add parameter `--save_output_as_bam` to allow output of result files in BAM format
+- [#595](https://github.com/nf-core/sarek/pull/595) - Added Haplotypecaller joint germline calling
+- [#597](https://github.com/nf-core/sarek/pull/597) - Added tiddit for tumor variant calling
+- [#600](https://github.com/nf-core/sarek/pull/600) - Added description for UMI related params in schema
+- [#604](https://github.com/nf-core/sarek/pull/604), [#617](https://github.com/nf-core/sarek/pull/617) - Added full size tests WGS 30x NA12878
+- [#613](https://github.com/nf-core/sarek/pull/613) - Added params `--dbnsfp_fields` to allow configuration of fields for the `dbnsfp` `VEP` plugin
+- [#613](https://github.com/nf-core/sarek/pull/613) - Added params `--dbnsfp_consequence` to allow configuration of consequence for the `dbnsfp` `VEP` plugin
+- [#613](https://github.com/nf-core/sarek/pull/613) - Added params `--vep_version` to allow more configuration on the vep container definition
+- [#620](https://github.com/nf-core/sarek/pull/620) - Added checks for sex information when running CNV tools
+- [#623](https://github.com/nf-core/sarek/pull/623) - Additional checks of data in the input sample sheet
+- [#629](https://github.com/nf-core/sarek/pull/629) - Added checks to catch inconsistency between supplied samples and requested tools
+- [#632](https://github.com/nf-core/sarek/pull/632) - Added params `--snpeff_version` to allow more configuration on the snpeff container definition
+- [#632](https://github.com/nf-core/sarek/pull/632) - Added params `--vep_include_fasta` to use the fasta file for annotation
+- [#639](https://github.com/nf-core/sarek/pull/639) - Add genes TXT file and summary HTML file to the published output from snpEff
+- [#647](https://github.com/nf-core/sarek/pull/647) - Update resource requests for preprocessing based on what worked for 5 ICGC matched WGS samples
+- [#652](https://github.com/nf-core/sarek/pull/652) - Added full size somatic test profile
+
+### Changed
+
+- [#580](https://github.com/nf-core/sarek/pull/580) - Changed the `test_full` config to real public WXS data: one germline WXS sample and one tumor/normal pair (https://doi.org/10.1038/sdata.2016.25 and https://doi.org/10.1038/s41587-021-00994-5)
+- [#383](https://github.com/nf-core/sarek/pull/383), [#528](https://github.com/nf-core/sarek/pull/528) - Update `CHANGELOG`
+- [#390](https://github.com/nf-core/sarek/pull/390) - Update `nextflow_schema.json`
+- [#408](https://github.com/nf-core/sarek/pull/408) - Sync `TEMPLATE` with `tools` `2.0.1`
+- [#416](https://github.com/nf-core/sarek/pull/416) - Sync `TEMPLATE` with `tools` `2.1`
+- [#417](https://github.com/nf-core/sarek/pull/417) - Merge `dsl2` and `dev` branches
+- [#419](https://github.com/nf-core/sarek/pull/419) - Improve preprocessing
+- [#420](https://github.com/nf-core/sarek/pull/420), [#455](https://github.com/nf-core/sarek/pull/455), [#459](https://github.com/nf-core/sarek/pull/459), [#633](https://github.com/nf-core/sarek/pull/633) - `nf-core modules update --all`
+- [#427](https://github.com/nf-core/sarek/pull/427) - Update `DeepVariant`
+- [#462](https://github.com/nf-core/sarek/pull/462) - Update modules and `modules.config`
+- [#465](https://github.com/nf-core/sarek/pull/465) - Improve `test_data.config`
+- [#466](https://github.com/nf-core/sarek/pull/466), [#478](https://github.com/nf-core/sarek/pull/478), [#492](https://github.com/nf-core/sarek/pull/492), [#521](https://github.com/nf-core/sarek/pull/521) - Move some local modules to `nf-core/modules`
+- [#466](https://github.com/nf-core/sarek/pull/466), [#485](https://github.com/nf-core/sarek/pull/485), [#492](https://github.com/nf-core/sarek/pull/492), [#494](https://github.com/nf-core/sarek/pull/494), [#515](https://github.com/nf-core/sarek/pull/515) - Improve preprocessing subworkflows
+- [#474](https://github.com/nf-core/sarek/pull/474), [#475](https://github.com/nf-core/sarek/pull/475) - Sync `TEMPLATE` with `tools` `2.2`
+- [#487](https://github.com/nf-core/sarek/pull/487), [#489](https://github.com/nf-core/sarek/pull/489), [#492](https://github.com/nf-core/sarek/pull/492), [#497](https://github.com/nf-core/sarek/pull/497), [#522](https://github.com/nf-core/sarek/pull/522), [#583](https://github.com/nf-core/sarek/pull/583) - Improve variant calling subworkflows
+- [#498](https://github.com/nf-core/sarek/pull/498) - Update docs
+- [#501](https://github.com/nf-core/sarek/pull/501) - Sync `TEMPLATE` with `tools` `2.3`
+- [#511](https://github.com/nf-core/sarek/pull/511) - Sync `TEMPLATE` with `tools` `2.3.2`
+- [#520](https://github.com/nf-core/sarek/pull/520) - Improve annotation subworkflows
+- [#537](https://github.com/nf-core/sarek/pull/537) - Update workflow figure
+- [#539](https://github.com/nf-core/sarek/pull/539) - Update `CITATIONS.md`
+- [#544](https://github.com/nf-core/sarek/pull/544) - `Mutect2` is no longer compatible with `--no_intervals`
+- [#551](https://github.com/nf-core/sarek/pull/551) - Sync `TEMPLATE` with `tools` `2.4`
+- [#562](https://github.com/nf-core/sarek/pull/562) - Restart from `--step annotate` is now also requiring a CSV file
+- [#563](https://github.com/nf-core/sarek/pull/563) - Updated subway map
+- [#570](https://github.com/nf-core/sarek/pull/570) - Extract mpileup into its own subworkflow; zip mpileup files
+- [#571](https://github.com/nf-core/sarek/pull/571) - Include and use GATK4's `MergeVcfs`
+- [#572](https://github.com/nf-core/sarek/pull/572) - Adjusted subway map svg for Firefox compatibility
+- [#577](https://github.com/nf-core/sarek/pull/577) - Update `RELEASE_CHECKLIST`
+- [#578](https://github.com/nf-core/sarek/pull/578) - Updated module deeptools/bamcoverage
+- [#585](https://github.com/nf-core/sarek/pull/585) - Remove explicit BAM to CRAM conversion after `MarkDuplicatesSpark`; the tool does it internally
+- [#581](https://github.com/nf-core/sarek/pull/581) - `TIDDIT` is updated to `3.1.0`
+- [#593](https://github.com/nf-core/sarek/pull/593) - Update `ensembl-vep` cache version and module
+- [#600](https://github.com/nf-core/sarek/pull/600) - Remove `TODO` in awsfulltest
+- [#606](https://github.com/nf-core/sarek/pull/606) - Updated `ASCAT` to version `3.0` as module
+- [#608](https://github.com/nf-core/sarek/pull/608) - Prevent candidate VCFs from getting published in manta
+- [#618](https://github.com/nf-core/sarek/pull/618) - Update `multiqc` module
+- [#618](https://github.com/nf-core/sarek/pull/618) - Update test yml files
+- [#620](https://github.com/nf-core/sarek/pull/620) - `gender` is now `sex` in the samplesheet
+- [#630](https://github.com/nf-core/sarek/pull/630) - Update citations file
+- [#632](https://github.com/nf-core/sarek/pull/632) - Update `snpEff` version to `5.1` and cache up to `105`
+- [#632](https://github.com/nf-core/sarek/pull/632) - Update `VEP` version to `106.1` and cache up to `106`
+- [#633](https://github.com/nf-core/sarek/pull/633) - Update `BCFTOOLS` version to `1.15.1`
+- [#644](https://github.com/nf-core/sarek/pull/644) - Use `-Y` for `bwa-mem(2)` and remove `-M`
+- [#645](https://github.com/nf-core/sarek/pull/645) - Merge `tests/nextflow.config` in `conf/test.config`
+- [#646](https://github.com/nf-core/sarek/pull/646) - Update `nextflow_schema.json` to reflect new parameters and functions, removes `--annotation_cache`, removes `--ascat_chromosomes`
+- [#649](https://github.com/nf-core/sarek/pull/649) - Update, simplify and add more files to all `test_*.yml` files
+- [#651](https://github.com/nf-core/sarek/pull/651) - Added TIDDIT_SOMATIC subworkflow
+- [#653](https://github.com/nf-core/sarek/pull/653) - Coherent results subfolder structure between preprocessing, variant calling and reporting
+- [#659](https://github.com/nf-core/sarek/pull/659) - Update usage.md documentation section on `How to run ASCAT with WES`
+- [#661](https://github.com/nf-core/sarek/pull/661) - Add cnvkit reference creation to index subway map
+- [#662](https://github.com/nf-core/sarek/pull/662) - Add bgzipped and indexed GATKBundle reference files for `GATK.GRCh37` and replace germline-resources with the GATKBundle one
+- [#663](https://github.com/nf-core/sarek/pull/663) - Add separate parameters for `ASCAT` and `ControlFREEC` back in
+- [#668](https://github.com/nf-core/sarek/pull/668) - Update annotation documentation
+- [#674](https://github.com/nf-core/sarek/pull/674) - Default value for splitting is `50000000`
+
+### Fixed
+
+- [#234](https://github.com/nf-core/sarek/pull/234) - Switching to DSL2
+- [#234](https://github.com/nf-core/sarek/pull/234), [#238](https://github.com/nf-core/sarek/pull/238) - Add modules and sub workflow for building indices
+- [#234](https://github.com/nf-core/sarek/pull/234), [#252](https://github.com/nf-core/sarek/pull/252), [#256](https://github.com/nf-core/sarek/pull/256), [#283](https://github.com/nf-core/sarek/pull/283), [#334](https://github.com/nf-core/sarek/pull/334) - Update Nextflow `19.10.0` -> `20.11.0-edge`
+- [#239](https://github.com/nf-core/sarek/pull/239) - Restore Sarek ascii art to header
+- [#241](https://github.com/nf-core/sarek/pull/241), [#248](https://github.com/nf-core/sarek/pull/248), [#250](https://github.com/nf-core/sarek/pull/250), [#257](https://github.com/nf-core/sarek/pull/257), [#259](https://github.com/nf-core/sarek/pull/259) - Add modules and sub workflow for preprocessing
+- [#242](https://github.com/nf-core/sarek/pull/242), [#244](https://github.com/nf-core/sarek/pull/244), [#245](https://github.com/nf-core/sarek/pull/245), [#246](https://github.com/nf-core/sarek/pull/246), [#247](https://github.com/nf-core/sarek/pull/247), [#249](https://github.com/nf-core/sarek/pull/249), [#252](https://github.com/nf-core/sarek/pull/252), [#256](https://github.com/nf-core/sarek/pull/256), [#263](https://github.com/nf-core/sarek/pull/263), [#264](https://github.com/nf-core/sarek/pull/264), [#283](https://github.com/nf-core/sarek/pull/283), [#285](https://github.com/nf-core/sarek/pull/285), [#338](https://github.com/nf-core/sarek/pull/338) - Refactor `dsl2` branch
+- [#257](https://github.com/nf-core/sarek/pull/257) - Use a params modules config file
+- [#266](https://github.com/nf-core/sarek/pull/266), [#285](https://github.com/nf-core/sarek/pull/285), [#297](https://github.com/nf-core/sarek/pull/297) - Add modules and sub workflow for variant calling
+- [#333](https://github.com/nf-core/sarek/pull/333) - Bump `Sarek` version to `3.0dev`
+- [#334](https://github.com/nf-core/sarek/pull/334) - Sync `dsl2` and `dev` branches
+- [#342](https://github.com/nf-core/sarek/pull/342) - Update `README.md`
+- [#386](https://github.com/nf-core/sarek/pull/386) - Annotation is back
+- [#410](https://github.com/nf-core/sarek/pull/410), [#412](https://github.com/nf-core/sarek/pull/412), [#584](https://github.com/nf-core/sarek/pull/584) - Update `CI` tests
+- [#418](https://github.com/nf-core/sarek/pull/418) - Fix `known_sites` channels
+- [#432](https://github.com/nf-core/sarek/pull/432), [#457](https://github.com/nf-core/sarek/pull/457) - Sort before `tabix index`
+- [#454](https://github.com/nf-core/sarek/pull/454) - Input is optional (can actually be found automatically by `Sarek` if previously run)
+- [#463](https://github.com/nf-core/sarek/pull/463), [#468](https://github.com/nf-core/sarek/pull/468) - Fix `nf-core lint`
+- [#513](https://github.com/nf-core/sarek/pull/513), [#527](https://github.com/nf-core/sarek/pull/527) - CNV is back
+- [#529](https://github.com/nf-core/sarek/pull/529) - Do not save `versions.yml` files
+- [#524](https://github.com/nf-core/sarek/pull/524) - Fix intervals usage by counting the actual list of scatter/gather files produced and not overall number of intervals
+- [#549](https://github.com/nf-core/sarek/pull/549) - Fix unique lanes required for Freebayes: issue [#311](https://github.com/nf-core/sarek/issues/311), replaces `meta.clone()` with an actual copy of the map to avoid issues
+- [#567](https://github.com/nf-core/sarek/pull/567) - Fix interval name resolving during scatter/gather by moving logic to modules.config, so that names are correctly resolved on process execution; also fixed duplicate naming when variant callers produce multiple vcf files by adding field `type` to `meta` map
+- [#585](https://github.com/nf-core/sarek/pull/585) - Fix Spark usage for GATK4 modules
+- [#587](https://github.com/nf-core/sarek/pull/587) - Fix issue with VEP extra files
+- [#581](https://github.com/nf-core/sarek/pull/581) - `TIDDIT` is back
+- [#590](https://github.com/nf-core/sarek/pull/590) - Fix empty folders during scatter/gather
+- [#592](https://github.com/nf-core/sarek/pull/592) - Fix optional resources for Mutect2, GetPileupSummaries, and HaplotypeCaller: issues [#299](https://github.com/nf-core/sarek/issues/299), [#359](https://github.com/nf-core/sarek/issues/359), [#367](https://github.com/nf-core/sarek/issues/367)
+- [#598](https://github.com/nf-core/sarek/pull/598), [#614](https://github.com/nf-core/sarek/pull/614), [#626](https://github.com/nf-core/sarek/pull/626) - Remove WARNING message for config selector not matching
+- [#599](https://github.com/nf-core/sarek/pull/599) - Add checks for correct data type for `params.step`
+- [#599](https://github.com/nf-core/sarek/pull/599) - Add checks for no empty `--tools` with `--step variant_calling` or `--step annotate`
+- [#600](https://github.com/nf-core/sarek/pull/600) - Remove `nf-core lint` warnings
+- [#602](https://github.com/nf-core/sarek/pull/602) - Fixed bug in `alignment_to_fastq` and added tests
+- [#609](https://github.com/nf-core/sarek/pull/609) - Remove unused intervals code, reorganize combined intervals file
+- [#613](https://github.com/nf-core/sarek/pull/613) - Fixed filenames for `dbnsfp` and `SpliceAI` `VEP` plugin
+- [#615](https://github.com/nf-core/sarek/pull/615) - Fix ASCAT igenomes file paths
+- [#619](https://github.com/nf-core/sarek/pull/619) - Fix issue with checking samplesheet content with AWS
+- [#628](https://github.com/nf-core/sarek/pull/628) - Fix issue with value converting to string before schema validation
+- [#628](https://github.com/nf-core/sarek/pull/628) - Fix dbsnp check issue with `--step annotate`
+- [#618](https://github.com/nf-core/sarek/pull/618) - Fix `bcftools/vcftools` sample labelling in multiqc report
+- [#618](https://github.com/nf-core/sarek/pull/618) - Fix issue with tiddit [#621](https://github.com/nf-core/sarek/issues/621)
+- [#618](https://github.com/nf-core/sarek/pull/618) - Fix channel issue with `targets.bed` in prepare_intervals
+- [#634](https://github.com/nf-core/sarek/pull/634) - Fix issue with samtools/mosdepth plots in multiqc_report
+- [#641](https://github.com/nf-core/sarek/pull/641) - Fix issue with duplicate substring in tools and skip_tools
+- [#642](https://github.com/nf-core/sarek/pull/642) - Only unzip ref files if tool is run, only publish ref files if `--save_reference` and simplify CNVKit logic
+- [#650](https://github.com/nf-core/sarek/pull/650) - Fix intervals checks
+- [#654](https://github.com/nf-core/sarek/pull/654) - Allow any step but annotation to start from BAM files
+- [#655](https://github.com/nf-core/sarek/pull/655) - Fix `--intervals false` logic & add versioning for local modules
+- [#658](https://github.com/nf-core/sarek/pull/658) - Fix split fastq names in multiqc-report
+- [#666](https://github.com/nf-core/sarek/pull/666) - Simplify multiqc config channel input
+- [#668](https://github.com/nf-core/sarek/pull/668) - Add `snpeff_version` and `vep_version` to `schema_ignore_params` to avoid issue when specifying on command line
+- [#669](https://github.com/nf-core/sarek/pull/669) - Fix path to files when creating csv files
+
+### Dependencies
+
+| Dependency             | Old version | New version |
+| ---------------------- | ----------- | ----------- |
+| `ascat`                | 2.5.2       | 3.0.0       |
+| `bcftools`             | 1.9         | 1.15.1      |
+| `bwa-mem2`             | 2.0         | 2.2.1       |
+| `bwa`                  | 0.7.17      | unchanged   |
+| `cancerit-allelecount` | 4.0.2       | 4.3.0       |
+| `cnvkit`               | 0.9.6       | 0.9.9       |
+| `control-freec`        | 11.6        | unchanged   |
+| `deepvariant`          | added       | 1.3.0       |
+| `dragmap`              | added       | 1.2.1       |
+| `ensembl-vep`          | 99.2        | 106.1       |
+| `fastp`                | added       | 0.23.2      |
+| `fastqc`               | 0.11.9      | unchanged   |
+| `fgbio`                | 1.1.0       | 2.0.2       |
+| `freebayes`            | 1.3.2       | 1.3.5       |
+| `gatk4`                | 4.1.7.0     | 4.2.6.1     |
+| `gawk`                 | added       | 5.1.0       |
+| `genesplicer`          | 1.0         | removed     |
+| `htslib`               | 1.9         | removed     |
+| `llvm-openmp`          | 8.0.1       | removed     |
+| `manta`                | 1.6.0       | unchanged   |
+| `markdown`             | 3.1.1       | removed     |
+| `mosdepth`             | 0.3.3       | unchanged   |
+| `msisensor-pro`        | 1.1.a       | 1.2.0       |
+| `msisensor`            | 0.5         | removed     |
+| `multiqc`              | 1.8         | 1.13a       |
+| `openjdk`              | added       | 8.0.312     |
+| `openmp`               | 8.0.1       | removed     |
+| `p7zip`                | added       | 15.09       |
+| `pigz`                 | 2.3.4       | unchanged   |
+| `pygments`             | 2.5.2       | removed     |
+| `pymdown-extensions`   | 6.0         | removed     |
+| `qualimap`             | 2.2.2d      | removed     |
+| `r-ggplot2`            | 3.3.0       | removed     |
+| `samblaster`           | 0.1.24      | 0.1.26      |
+| `samtools`             | 1.9         | 1.15.1      |
+| `sed`                  | added       | 4.7         |
+| `snpeff`               | 4.3.1t      | 5.1         |
+| `strelka`              | 2.9.10      | unchanged   |
+| `svdb`                 | added       | 2.6.1       |
+| `tabix`                | added       | 1.11        |
+| `tiddit`               | 2.7.1       | 3.1.0       |
+| `trim-galore`          | 0.6.5       | removed     |
+| `vcfanno`              | 0.3.2       | removed     |
+| `vcftools`             | 0.1.16      | unchanged   |
+
+### Deprecated
+
+### Removed
+
+- [#485](https://github.com/nf-core/sarek/pull/485) - `--skip_qc`, `--skip_markduplicates` and `--skip_bqsr` are now `--skip_tools`
+- [#538](https://github.com/nf-core/sarek/pull/538) - `--sequencing_center` is now `--seq_center`
+- [#538](https://github.com/nf-core/sarek/pull/538) - `--markdup_java_options` has been removed
+- [#539](https://github.com/nf-core/sarek/pull/539) - `--annotate_tools` has been removed
+- [#539](https://github.com/nf-core/sarek/pull/539) - `--cadd_cache`, `--cadd_indels`, `--cadd_indels_tbi`, `--cadd_wg_snvs`, `--cadd_wg_snvs_tbi` have been removed
+- [#539](https://github.com/nf-core/sarek/pull/539) - `--genesplicer` has been removed
+- [#539](https://github.com/nf-core/sarek/pull/539) - `conf/genomes.config` and `params.genomes_base` have been removed
+- [#562](https://github.com/nf-core/sarek/pull/562) - Restart from `--step annotate` from folder is removed. Use a `csv` file instead
+- [#571](https://github.com/nf-core/sarek/pull/571) - Removed the local module `concat_vcf`
+- [#605](https://github.com/nf-core/sarek/pull/605) - Removed Scatter/gather from GATK_SINGLE_SAMPLE_GERMLINE_VARIANT_CALLING, all intervals are processed together
+- [#643](https://github.com/nf-core/sarek/pull/643) - Removed Sentieon parameters
+
+## [2.7.2](https://github.com/nf-core/sarek/releases/tag/2.7.2) - Áhkká
+
+Áhkká is one of the massifs just outside of the Sarek National Park.
+
+### Fixed
+
+- [#566](https://github.com/nf-core/sarek/pull/566) - Fix caching bug affecting a variable number of `MapReads` jobs due to non-deterministic state of `statusMap` during caching evaluation
+
+## [2.7.1](https://github.com/nf-core/sarek/releases/tag/2.7.1) - Pårtejekna
+
+Pårtejekna is one of the glaciers of the Pårte Massif.
+
+### Added
+
+- [#353](https://github.com/nf-core/sarek/pull/353) - Add support for task retries with exit code 247 (exhibited by `Picard MarkDuplicates`)
+- [#354](https://github.com/nf-core/sarek/pull/354) - Add tumor only mode for `Mutect2` and `MSIsensor`
+- [#356](https://github.com/nf-core/sarek/pull/356) - Add `--cf_contamination_adjustment` params to adjust contamination with `Control-FREEC`
+- [#372](https://github.com/nf-core/sarek/pull/372) - Add `--cf_contamination` params to specify contamination value with `Control-FREEC`
+
+### Changed
+
+- [#373](https://github.com/nf-core/sarek/pull/373) - Sync `TEMPLATE` with `tools` 1.14
+- [#376](https://github.com/nf-core/sarek/pull/376) - Better logo on GitHub dark mode
+- [#387](https://github.com/nf-core/sarek/pull/387) - Fix tables for TSV file content
+
+### Fixed
+
+- [#375](https://github.com/nf-core/sarek/pull/375), [#381](https://github.com/nf-core/sarek/pull/381), [#382](https://github.com/nf-core/sarek/pull/382), [#385](https://github.com/nf-core/sarek/pull/385) - Fix bugs due to `TEMPLATE` sync from [#373](https://github.com/nf-core/sarek/pull/373)
+- [#378](https://github.com/nf-core/sarek/pull/378) - Fix `Spark` related issue due to `Docker` settings in `nextflow.config`
+
+### Deprecated
+
+### Removed
+
+- [#368](https://github.com/nf-core/sarek/pull/368) - Remove social preview image to use GitHub OpenGraph
+
+## [2.7](https://github.com/nf-core/sarek/releases/tag/2.7) - Pårte
+
+Pårte is one of the main massifs in the Sarek National Park.
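+
+A minimal, illustrative sketch of the aligner selection added in this release (see #237/#282 below; the value shown is an example only):
+
+```nextflow
+// Illustrative nextflow.config fragment; per the changelog, --aligner chooses
+// between 'bwa' and 'bwa-mem2'.
+params {
+    aligner = 'bwa-mem2'
+}
+```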
+
+### Added
+
+- [#145](https://github.com/nf-core/sarek/pull/145) - Add `UMI annotation and consensus` functionality to `Sarek`
+- [#230](https://github.com/nf-core/sarek/pull/230) - Add `ignore_soft_clipped_bases` option for `GATK Mutect2` [#218](https://github.com/nf-core/sarek/issues/218)
+- [#253](https://github.com/nf-core/sarek/pull/253) - Add `UMI` `CI` testing
+- [#262](https://github.com/nf-core/sarek/pull/262) - Add `nextflow_schema.json`
+- [#237](https://github.com/nf-core/sarek/pull/237), [#282](https://github.com/nf-core/sarek/pull/282) - Add `--aligner` to choose between `bwa` and `bwa-mem2`
+- [#294](https://github.com/nf-core/sarek/pull/294) - Add `Troubleshooting` section to `docs/usage.md`
+- [#302](https://github.com/nf-core/sarek/pull/302), [#304](https://github.com/nf-core/sarek/pull/304) - Add WES and tumor-only mode for `Control-FREEC`
+
+### Changed
+
+- [#253](https://github.com/nf-core/sarek/pull/253), [#255](https://github.com/nf-core/sarek/pull/255), [#326](https://github.com/nf-core/sarek/pull/326), [#329](https://github.com/nf-core/sarek/pull/329) - Update docs
+- [#260](https://github.com/nf-core/sarek/pull/260), [#262](https://github.com/nf-core/sarek/pull/262), [#278](https://github.com/nf-core/sarek/pull/278), [#322](https://github.com/nf-core/sarek/pull/322) - Sync with `TEMPLATE` updated from [nf-core/tools](https://github.com/nf-core/tools) [`1.10.2`](https://github.com/nf-core/tools/releases/tag/1.10.2)
+- [#262](https://github.com/nf-core/sarek/pull/262) - Update issue templates to fit the recommended community standards
+- [#278](https://github.com/nf-core/sarek/pull/278), [#322](https://github.com/nf-core/sarek/pull/322) - Refactor docs
+- [#284](https://github.com/nf-core/sarek/pull/284) - Update F1000Research publication to version 2
+- [#284](https://github.com/nf-core/sarek/pull/284) - Update Scilifelab logo
+- [#317](https://github.com/nf-core/sarek/pull/317) - Update `README.md` (Add: QBiC + Friederike/Gisela)
+- [#320](https://github.com/nf-core/sarek/pull/320) - Set `MarkDuplicates MAX_RECORDS_IN_RAM` to default value
+
+### Fixed
+
+- [#229](https://github.com/nf-core/sarek/pull/229) - Fix `Control-FREEC` restart issue [#225](https://github.com/nf-core/sarek/issues/225)
+- [#236](https://github.com/nf-core/sarek/pull/236) - Fix `GATK Mutect2` typo issue [#227](https://github.com/nf-core/sarek/issues/227)
+- [#271](https://github.com/nf-core/sarek/pull/271) - Fix `ConcatVCF_Mutect2` `SIGPIPE` issue [#268](https://github.com/nf-core/sarek/issues/268)
+- [#272](https://github.com/nf-core/sarek/pull/272) - Fix annotation `--tools merge` issue
+- [#279](https://github.com/nf-core/sarek/pull/279) - Fix issue with `--step prepare_recalibration` [#267](https://github.com/nf-core/sarek/issues/267)
+- [#280](https://github.com/nf-core/sarek/pull/280) - Use HTML codes instead of `<` and `>` in docs
+- [#288](https://github.com/nf-core/sarek/pull/288) - Fix `test_annotation` profile
+- [#289](https://github.com/nf-core/sarek/pull/289) - Random string added to `extractFastqFromDir` to avoid name collision
+- [#290](https://github.com/nf-core/sarek/pull/290), [#323](https://github.com/nf-core/sarek/pull/323) - Faster solving of `Conda` environment
+- [#293](https://github.com/nf-core/sarek/pull/293) - Fix typo issue when printing infos [#292](https://github.com/nf-core/sarek/issues/292)
+- [#309](https://github.com/nf-core/sarek/pull/309) - Fixed concatenation of many VCF files
+- [#310](https://github.com/nf-core/sarek/pull/310) - Fix GitHub Actions not running after November 16, 2020 (deprecated GitHub Actions API [#739](https://github.com/nf-core/tools/issues/739))
+- [#329](https://github.com/nf-core/sarek/pull/329) - Simplify `Control-FREEC` usage
+- [#331](https://github.com/nf-core/sarek/pull/331) - Replace `spread` operator by `combine` to remove `Nextflow` deprecation warning
+
+### Removed
+
+- [#243](https://github.com/nf-core/sarek/pull/243) - Removing obsolete script [#92](https://github.com/nf-core/sarek/issues/92)
+- [#262](https://github.com/nf-core/sarek/pull/262) - Removing deprecated params: `annotateTools`, `annotateVCF`, `cadd_InDels`, `cadd_InDels_tbi`, `cadd_WG_SNVs`, `cadd_WG_SNVs_tbi`, `maxMultiqcEmailFileSize`, `noGVCF`, `noReports`, `noStrelkaBP`, `nucleotidesPerSecond`, `publishDirMode`, `sample`, `sampleDir`, `saveGenomeIndex`, `skipQC`, `snpEff_cache`, `targetBed`
+- [#262](https://github.com/nf-core/sarek/pull/262) - Removing warning message about deprecated and obsolete params
+- [#324](https://github.com/nf-core/sarek/pull/324) - `--no_gatk_spark` is now removed, use `--use_gatk_spark` instead
+- [#324](https://github.com/nf-core/sarek/pull/324) - `--no_gvcf` is now removed, use `--generate_gvcf` instead
+
+## [2.6.1](https://github.com/nf-core/sarek/releases/tag/2.6.1) - Gådokgaskatjåhkkå
+
+Gådokgaskatjåhkkå is the highest peak in the Piellorieppe massif.
+
+### Changed
+
+- [#208](https://github.com/nf-core/sarek/pull/208) - Merge changes from the release PR
+- [#208](https://github.com/nf-core/sarek/pull/208) - Bump version to `3.0dev`
+- [#214](https://github.com/nf-core/sarek/pull/214) - Update `GATK` from `4.1.6.0` to `4.1.7.0`
+- [#219](https://github.com/nf-core/sarek/pull/219) - Added `awsfulltest.yml` GitHub Actions workflow
+- [#222](https://github.com/nf-core/sarek/pull/222) - Bump version to `2.6.1` and minor release
+- [#223](https://github.com/nf-core/sarek/pull/223) - Apply comments from the release PR
+
+### Fixed
+
+- [#211](https://github.com/nf-core/sarek/pull/211) - Extend timeout for pushing to DockerHub for VEP containers
+- [#212](https://github.com/nf-core/sarek/pull/212) - No AWS test on forks
+- [#214](https://github.com/nf-core/sarek/pull/214) - Fix channels collision between `Freebayes` and `GATK Mutect2` [#200](https://github.com/nf-core/sarek/issues/200)
+- [#214](https://github.com/nf-core/sarek/pull/214) - Fix warning Invalid tag value for `CreateIntervalBeds` [#209](https://github.com/nf-core/sarek/issues/209)
+- [#214](https://github.com/nf-core/sarek/pull/214) - Fix `GATK Mutect2` issue [#210](https://github.com/nf-core/sarek/issues/210)
+- [#219](https://github.com/nf-core/sarek/pull/219) - Updated `awstest.yml` GitHub actions workflow
+- [#221](https://github.com/nf-core/sarek/pull/221) - Fix issue with `tmp_dir` in `BaseRecalibrator` process
+
+## [2.6](https://github.com/nf-core/sarek/releases/tag/2.6) - Piellorieppe
+
+Piellorieppe is one of the main massifs in the Sarek National Park.
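+
+A minimal, illustrative sketch of the restart options added in this release (see #195, #197 and #205 below; parameter values are examples only):
+
+```nextflow
+// Illustrative nextflow.config fragment; values are examples only.
+params {
+    save_bam_mapped = true                    // keep mapped BAMs so later restarts are possible (#195, #202)
+    step            = 'prepare_recalibration' // restart from duplicate-marked BAMs (#197)
+}
+```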
+
+### Added
+
+- [#76](https://github.com/nf-core/sarek/pull/76) - Add `GATK Spark` possibilities to Sarek
+- [#87](https://github.com/nf-core/sarek/pull/87) - Add `GATK BaseRecalibrator` plot to `MultiQC` report
+- [#115](https://github.com/nf-core/sarek/pull/115) - Add [@szilvajuhos](https://github.com/szilvajuhos) abstract for ESHG2020
+- [#117](https://github.com/nf-core/sarek/pull/117) - Add `Trim Galore` possibilities to Sarek
+- [#141](https://github.com/nf-core/sarek/pull/141) - Add containers for `WBcel235`
+- [#150](https://github.com/nf-core/sarek/pull/150), [#151](https://github.com/nf-core/sarek/pull/151), [#154](https://github.com/nf-core/sarek/pull/154) - Add AWS mega test GitHub Actions
+- [#153](https://github.com/nf-core/sarek/pull/153) - Add `CNVkit` possibilities to Sarek
+- [#158](https://github.com/nf-core/sarek/pull/158) - Added `ggplot2` version `3.3.0`
+- [#163](https://github.com/nf-core/sarek/pull/163) - Add [MSIsensor](https://github.com/ding-lab/msisensor) in tools and container
+- [#164](https://github.com/nf-core/sarek/pull/164) - Add `--no_gatk_spark` params and tests
+- [#167](https://github.com/nf-core/sarek/pull/167) - Add `--markdup_java_options` documentation
+- [#169](https://github.com/nf-core/sarek/pull/169) - Add `RELEASE_CHECKLIST.md` document
+- [#174](https://github.com/nf-core/sarek/pull/174) - Add `variant_calling.md` documentation
+- [#175](https://github.com/nf-core/sarek/pull/175) - Add `Sentieon` documentation
+- [#176](https://github.com/nf-core/sarek/pull/176) - Add empty `custom` genome in `genomes.config` to allow genomes that are not in `AWS iGenomes`
+- [#179](https://github.com/nf-core/sarek/pull/179), [#201](https://github.com/nf-core/sarek/pull/201) - Add `FreeBayes` germline variant calling
+- [#180](https://github.com/nf-core/sarek/pull/180) - Now saving Mapped BAMs (and creating TSV) in minimal setting
+- [#182](https://github.com/nf-core/sarek/pull/182) - Add possibility to run `HaplotypeCaller` without `dbsnp` so it can be used to actually generate vcfs to build a set of known sites (cf [gatkforums](https://gatkforums.broadinstitute.org/gatk/discussion/1247/what-should-i-use-as-known-variants-sites-for-running-tool-x))
+- [#195](https://github.com/nf-core/sarek/pull/195) - Now creating TSV for duplicates marked BAMs in minimal setting
+- [#195](https://github.com/nf-core/sarek/pull/195), [#202](https://github.com/nf-core/sarek/pull/202) - Add `--save_bam_mapped` params to save mapped BAMs
+- [#197](https://github.com/nf-core/sarek/pull/197) - Add step `prepare_recalibration` to allow restart from DuplicatesMarked BAMs
+- [#204](https://github.com/nf-core/sarek/pull/204) - Add step `Control-FREEC` to allow restart from pileup files
+- [#205](https://github.com/nf-core/sarek/pull/205) - Add `--skip_markduplicates` to allow skipping the `MarkDuplicates` process
+
+### Changed
+
+- [#76](https://github.com/nf-core/sarek/pull/76) - Use `MarkDuplicatesSpark` instead of `MarkDuplicates`
+- [#76](https://github.com/nf-core/sarek/pull/76) - Use `gatk4-spark` instead of `gatk4` in `environment.yml`
+- [#80](https://github.com/nf-core/sarek/pull/80) - Re-bump `dev` branch
+- [#85](https://github.com/nf-core/sarek/pull/85) - Use new merged vcf files for known indels to simplify setting up channel
+- [#104](https://github.com/nf-core/sarek/pull/104) - Update Figure 1
+- [#107](https://github.com/nf-core/sarek/pull/107) - Switch params to snake_case
+- [#109](https://github.com/nf-core/sarek/pull/109) - Update publication with F1000Research preprint
+- [#113](https://github.com/nf-core/sarek/pull/113) - Move social preview image
+- [#120](https://github.com/nf-core/sarek/pull/120) - Sync TEMPLATE
+- [#121](https://github.com/nf-core/sarek/pull/121) - Update `MultiQC` to `1.8`
+- [#126](https://github.com/nf-core/sarek/pull/126), [#131](https://github.com/nf-core/sarek/pull/131) - Update docs
+- [#131](https://github.com/nf-core/sarek/pull/131) - Use `nfcore/base:1.9` as base for containers
+- [#131](https://github.com/nf-core/sarek/pull/131) - Update `Control-FREEC` to `11.5`
+- [#131](https://github.com/nf-core/sarek/pull/131) - Update `FastQC` to `0.11.9`
+- [#131](https://github.com/nf-core/sarek/pull/131) - Update `FreeBayes` to `1.3.2`
+- [#131](https://github.com/nf-core/sarek/pull/131) - Update `Manta` to `1.6.0`
+- [#131](https://github.com/nf-core/sarek/pull/131) - Update `Qualimap` to `2.2.2d`
+- [#131](https://github.com/nf-core/sarek/pull/131) - Update `VEP` to `99.2`
+- [#141](https://github.com/nf-core/sarek/pull/141) - Update `snpEff` cache version from `75` to `87` for `GRCh37`
+- [#141](https://github.com/nf-core/sarek/pull/141) - Update `snpEff` cache version from `86` to `92` for `GRCh38`
+- [#141](https://github.com/nf-core/sarek/pull/141) - Update `VEP` databases to `99`
+- [#143](https://github.com/nf-core/sarek/pull/143) - Revert `snpEff` cache version to `75` for `GRCh37`
+- [#143](https://github.com/nf-core/sarek/pull/143) - Revert `snpEff` cache version to `86` for `GRCh38`
+- [#152](https://github.com/nf-core/sarek/pull/152), [#158](https://github.com/nf-core/sarek/pull/158), [#164](https://github.com/nf-core/sarek/pull/164), [#174](https://github.com/nf-core/sarek/pull/174), [#194](https://github.com/nf-core/sarek/pull/194), [#198](https://github.com/nf-core/sarek/pull/198), [#204](https://github.com/nf-core/sarek/pull/204) - Update docs
+- [#164](https://github.com/nf-core/sarek/pull/164) - Update `gatk4-spark` from `4.1.4.1` to `4.1.6.0`
+- [#180](https://github.com/nf-core/sarek/pull/180), [#195](https://github.com/nf-core/sarek/pull/195) - Improve minimal setting
+- [#183](https://github.com/nf-core/sarek/pull/183), [#204](https://github.com/nf-core/sarek/pull/204) - Update `input.md` documentation
+- [#197](https://github.com/nf-core/sarek/pull/197) - Output directory `DuplicateMarked` is now replaced by `DuplicatesMarked`
+- [#204](https://github.com/nf-core/sarek/pull/204) - Output directory `controlFREEC` is now replaced by `Control-FREEC`
+
+### Fixed
+
+- [#83](https://github.com/nf-core/sarek/pull/83) - Fix some typos in `docs/input.md`
+- [#107](https://github.com/nf-core/sarek/pull/107) - Fix linting
+- [#110](https://github.com/nf-core/sarek/pull/110) - Fix `snpEff` report issue cf [#106](https://github.com/nf-core/sarek/issues/106)
+- [#126](https://github.com/nf-core/sarek/pull/126) - Fix `iGenomes` paths
+- [#127](https://github.com/nf-core/sarek/pull/127), [#128](https://github.com/nf-core/sarek/pull/128) - Fix `ASCAT`
+- [#129](https://github.com/nf-core/sarek/pull/129) - Fix issue with channel `ch_software_versions_yaml`
+- [#129](https://github.com/nf-core/sarek/pull/129) - Apply @drpatelh fix for `markdown_to_html.py` compatibility with Python 2
+- [#129](https://github.com/nf-core/sarek/pull/129) - Removed `Python` `3.7.3` from conda environment due to incompatibility
+- [#129](https://github.com/nf-core/sarek/pull/129) - Replace ASCII characters that were not supported in the `output.md` docs
+- [#140](https://github.com/nf-core/sarek/pull/140) - Fix extra T/N combinations for `ASCAT` cf [#136](https://github.com/nf-core/sarek/issues/136)
+- [#141](https://github.com/nf-core/sarek/pull/141) - Fix `download_cache.nf` script to download cache for `snpEff` and `VEP`
+- [#143](https://github.com/nf-core/sarek/pull/143) - Fix annotation CI testing with `snpEff` and `VEP`
+- [#144](https://github.com/nf-core/sarek/pull/144) - Fix CircleCI for building `VEP` containers
+- [#146](https://github.com/nf-core/sarek/pull/146) - Fix `--no_intervals` for `GATK Mutect2` cf [#135](https://github.com/nf-core/sarek/issues/135)
+- [#156](https://github.com/nf-core/sarek/pull/156) - Fix typos
+- [#156](https://github.com/nf-core/sarek/pull/156) - Fix issues with `dbsnp` files while using only `Sentieon` tools
+- [#158](https://github.com/nf-core/sarek/pull/158) - Fix typo with `params.snpeff_cache` to decide containers for `snpEff`
+- [#164](https://github.com/nf-core/sarek/pull/164) - Fix issues when running with `Sentieon`
+- [#164](https://github.com/nf-core/sarek/pull/164) - Add more VCFs to annotation
+- [#167](https://github.com/nf-core/sarek/pull/167) - Add `--markdup_java_options` documentation to fix [#166](https://github.com/nf-core/sarek/issues/166)
+- [#178](https://github.com/nf-core/sarek/pull/178) - Fix `Sentieon` variant calling, now using deduped bam files
+- [#188](https://github.com/nf-core/sarek/pull/188) - Fix input/output channels for process `IndexBamFile` to match actual files in the `mapped.tsv` files
+- [#189](https://github.com/nf-core/sarek/pull/189) - Fix `no_intervals` for process `HaplotypeCaller` (the file just needs to actually exist)
+- [#197](https://github.com/nf-core/sarek/pull/197) - Fix issue with `--step recalibrate`
+- [#197](https://github.com/nf-core/sarek/pull/197) - Fix typo in output directory `DuplicateMarked` -> `DuplicatesMarked`
+
+### Deprecated
+
+- [#107](https://github.com/nf-core/sarek/pull/107) - `--annotateTools` is now deprecated, use `--annotate_tools` instead
+- [#107](https://github.com/nf-core/sarek/pull/107) - `--cadd_InDels` is now deprecated, use `--cadd_indels` instead
+- [#107](https://github.com/nf-core/sarek/pull/107) - `--cadd_InDels_tbi` is now deprecated, use `--cadd_indels_tbi` instead
+- [#107](https://github.com/nf-core/sarek/pull/107) - `--cadd_WG_SNVs` is now deprecated, use `--cadd_wg_snvs` instead
+- [#107](https://github.com/nf-core/sarek/pull/107) - `--cadd_WG_SNVs_tbi` is now deprecated, use `--cadd_wg_snvs_tbi` instead
+- [#107](https://github.com/nf-core/sarek/pull/107) - `--maxMultiqcEmailFileSize` is now deprecated, use `--max_multiqc_email_size` instead
+- [#107](https://github.com/nf-core/sarek/pull/107) - `--noGVCF` is now deprecated, use `--no_gvcf` instead
+- [#107](https://github.com/nf-core/sarek/pull/107) - `--noStrelkaBP` is now deprecated, use `--no_strelka_bp` instead
+- [#107](https://github.com/nf-core/sarek/pull/107) - `--nucleotidesPerSecond` is now deprecated, use `--nucleotides_per_second` instead
+- [#107](https://github.com/nf-core/sarek/pull/107) - `--publishDirMode` is now deprecated, use `--publish_dir_mode` instead
+- [#107](https://github.com/nf-core/sarek/pull/107) - `--saveGenomeIndex` is now deprecated, use `--save_reference` instead
+- [#107](https://github.com/nf-core/sarek/pull/107) - `--skipQC` is now deprecated, use `--skip_qc` instead
+- [#107](https://github.com/nf-core/sarek/pull/107) - `--snpEff_cache` is now deprecated, use `--snpeff_cache` instead
+- [#107](https://github.com/nf-core/sarek/pull/107) - `--targetBed` is now deprecated, use `--target_bed` instead
+
+### Removed
+
+- [#107](https://github.com/nf-core/sarek/pull/107) - `--acLociGC` is now removed, use `--ac_loci_gc` instead
+- [#107](https://github.com/nf-core/sarek/pull/107) - `--acLoci` is now removed, use `--ac_loci` instead
+- [#107](https://github.com/nf-core/sarek/pull/107) - `--bwaIndex` is now removed, use `--bwa` instead
+- [#107](https://github.com/nf-core/sarek/pull/107) - `--chrDir` is now removed, use `--chr_dir` instead
+- [#107](https://github.com/nf-core/sarek/pull/107) - `--chrLength` is now removed, use `--chr_length` instead
+- [#107](https://github.com/nf-core/sarek/pull/107) - `--dbsnpIndex` is now removed, use `--dbsnp_index` instead
+- [#107](https://github.com/nf-core/sarek/pull/107) - `--fastaFai` is now removed, use `--fasta_fai` instead
+- [#107](https://github.com/nf-core/sarek/pull/107) - `--genomeDict` is now removed, use `--dict` instead
+- [#107](https://github.com/nf-core/sarek/pull/107) - `--genomeFile` is now removed, use `--fasta` instead
+- [#107](https://github.com/nf-core/sarek/pull/107) - `--genomeIndex` is now removed, use `--fasta_fai` instead
+- [#107](https://github.com/nf-core/sarek/pull/107) - `--germlineResourceIndex` is now removed, use `--germline_resource_index` instead
+- [#107](https://github.com/nf-core/sarek/pull/107) - `--germlineResource` is now removed, use `--germline_resource` instead
+- [#107](https://github.com/nf-core/sarek/pull/107) - `--igenomesIgnore` is now removed, use `--igenomes_ignore` instead
+- [#107](https://github.com/nf-core/sarek/pull/107) - `--knownIndelsIndex` is now removed, use `--known_indels_index` instead
+- [#107](https://github.com/nf-core/sarek/pull/107) - `--knownIndels` is now removed, use `--known_indels` instead
+- [#107](https://github.com/nf-core/sarek/pull/107) - `--singleCPUMem` is now removed, use `--single_cpu_mem` instead
+- [#107](https://github.com/nf-core/sarek/pull/107) - `--snpeffDb` is now removed, use `--snpeff_db` instead
+- [#107](https://github.com/nf-core/sarek/pull/107) - `--vepCacheVersion` is now removed, use `--vep_cache_version` instead
+- [#152](https://github.com/nf-core/sarek/pull/152) - Removed `Jenkinsfile`
+- [#169](https://github.com/nf-core/sarek/pull/169) - Removed omicX from README
+- [#181](https://github.com/nf-core/sarek/pull/181) - Remove duplicate code in `nextflow.config`
+
+## [2.5.2](https://github.com/nf-core/sarek/releases/tag/2.5.2) - Jåkkåtjkaskajekna
+
+Jåkkåtjkaskajekna is one of the two glaciers of the Ålkatj Massif.
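+
+A minimal, illustrative sketch of the interval and FastQ-splitting options added in this release (see #60 and #61 below; parameter values are examples only):
+
+```nextflow
+// Illustrative nextflow.config fragment; values are examples only.
+params {
+    no_intervals = true     // run without interval files (#60)
+    split_fastq  = 10000000 // split FastQ files for parallel mapping (#61)
+}
+```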
+
+### Added
+
+- [#45](https://github.com/nf-core/sarek/pull/45) - Include Workflow figure in `README.md`
+- [#46](https://github.com/nf-core/sarek/pull/46) - Add location to abstracts
+- [#52](https://github.com/nf-core/sarek/pull/52) - Add support for mouse data `GRCm38`
+- [#60](https://github.com/nf-core/sarek/pull/60) - Add `no_intervals` params
+- [#60](https://github.com/nf-core/sarek/pull/60) - Add automatic generation of `intervals` file with `BuildIntervals` process
+- [#60](https://github.com/nf-core/sarek/pull/60) - Add minimal support for minimal genome (only `fasta`, or `fasta` + `knownIndels`)
+- [#60](https://github.com/nf-core/sarek/pull/60) - Add new processes (`IndexBamFile`, `IndexBamRecal`) to deal with optional usage of interval files and minimal genome
+- [#60](https://github.com/nf-core/sarek/pull/60) - Add tests for minimal genome usage
+- [#60](https://github.com/nf-core/sarek/pull/60) - Add new minimal genomes (`TAIR10`, `EB2`, `UMD3.1`, `bosTau8`, `WBcel235`, `ce10`, `CanFam3.1`, `canFam3`, `GRCz10`, `danRer10`, `BDGP6`, `dm6`, `EquCab2`, `equCab2`, `EB1`, `Galgal4`, `galGal4`, `Gm01`, `hg38`, `hg19`, `Mmul_1`, `mm10`, `IRGSP-1.0`, `CHIMP2.1.4`, `panTro4`, `Rnor_6.0`, `rn6`, `R64-1-1`, `sacCer3`, `EF2`, `Sbi1`, `Sscrofa10.2`, `susScr3`, `AGPv3`) to `igenomes.config`
+- [#61](https://github.com/nf-core/sarek/pull/61) - Add params `split_fastq`
+- [#61](https://github.com/nf-core/sarek/pull/61) - Add test `SPLITFASTQ`
+- [#66](https://github.com/nf-core/sarek/pull/66) - Add `Sentieon` possibilities to Sarek
+
+### Changed
+
+- [#54](https://github.com/nf-core/sarek/pull/54) - Bump version to `2.5.2dev`
+- [#60](https://github.com/nf-core/sarek/pull/60) - Some processes (`BaseRecalibrator`, `ApplyBQSR`, `Mpileup`) now have optional usage of interval files
+- [#60](https://github.com/nf-core/sarek/pull/60) - Update documentation
+- [#71](https://github.com/nf-core/sarek/pull/71) - Update `README`
+- [#71](https://github.com/nf-core/sarek/pull/71) - Update `CHANGELOG`
+- [#74](https://github.com/nf-core/sarek/pull/74) - Update docs
+- [#74](https://github.com/nf-core/sarek/pull/74) - Improve CI tests (both Jenkins and GitHub actions tests)
+- [#74](https://github.com/nf-core/sarek/pull/74) - Move all CI from `ci-extra.yml` to `ci.yml`
+
+### Removed
+
+- [#46](https://github.com/nf-core/sarek/pull/46) - Remove mention of old `build.nf` script which was included in `main.nf`
+- [#74](https://github.com/nf-core/sarek/pull/74) - Remove `download_image.sh` and `run_tests.sh` scripts
+- [#76](https://github.com/nf-core/sarek/pull/76) - Remove `runOptions = "-u \$(id -u):\$(id -g)"` in `nextflow.config` to enable `Spark` possibilities
+
+### Fixed
+
+- [#40](https://github.com/nf-core/sarek/pull/40) - Fix issue with `publishDirMode` within `test` profile
+- [#42](https://github.com/nf-core/sarek/pull/42) - Fix typos, and minor updates in `README.md`
+- [#43](https://github.com/nf-core/sarek/pull/43) - Fix automated `VEP` builds with CircleCI
+- [#54](https://github.com/nf-core/sarek/pull/54) - Apply fixes from release `2.5.1`
+- [#58](https://github.com/nf-core/sarek/pull/58) - Fix issue with `.interval_list` file from the `GATK` bundle [#56](https://github.com/nf-core/sarek/issues/56) that was not recognized in the `CreateIntervalsBed` process
+- [#71](https://github.com/nf-core/sarek/pull/71) - Fix typos in `CHANGELOG`
+- [#73](https://github.com/nf-core/sarek/pull/73) - Fix issue with label `memory_max` for `BaseRecalibrator` process [#72](https://github.com/nf-core/sarek/issues/72)
+
+## [2.5.1](https://github.com/nf-core/sarek/releases/tag/2.5.1) - Årjep-Ålkatjjekna
+
+Årjep-Ålkatjjekna is one of the two glaciers of the Ålkatj Massif.
+
+### Added
+
+- [#53](https://github.com/nf-core/sarek/pull/53) - Release `2.5.1`
+
+### Fixed
+
+- [#48](https://github.com/nf-core/sarek/issues/48) - Fix `singularity.autoMounts` issue
+- [#49](https://github.com/nf-core/sarek/issues/49) - Use correct tag for annotation containers
+- [#50](https://github.com/nf-core/sarek/issues/50) - Fix paths for scripts
+
+## [2.5](https://github.com/nf-core/sarek/releases/tag/2.5) - Ålkatj
+
+Ålkatj is one of the main massifs in the Sarek National Park.
+
+Initial release of `nf-core/sarek`, created with the [nf-core](http://nf-co.re/) template.
+
+### Added
+
+- [#2](https://github.com/nf-core/sarek/pull/2) - Create `nf-core/sarek` `environment.yml` file
+- [#2](https://github.com/nf-core/sarek/pull/2), [#3](https://github.com/nf-core/sarek/pull/3), [#4](https://github.com/nf-core/sarek/pull/4), [#5](https://github.com/nf-core/sarek/pull/5), [#7](https://github.com/nf-core/sarek/pull/7), [#9](https://github.com/nf-core/sarek/pull/9), [#10](https://github.com/nf-core/sarek/pull/10), [#11](https://github.com/nf-core/sarek/pull/11), [#12](https://github.com/nf-core/sarek/pull/12) - Add CI for `nf-core/sarek`
+- [#3](https://github.com/nf-core/sarek/pull/3) - Add preprocessing to `nf-core/sarek`
+- [#4](https://github.com/nf-core/sarek/pull/4) - Add variant calling to `nf-core/sarek` with `HaplotypeCaller`, and single mode `Manta` and `Strelka`
+- [#5](https://github.com/nf-core/sarek/pull/5), [#34](https://github.com/nf-core/sarek/pull/34) - Add variant calling to `nf-core/sarek` with `Manta`, `Strelka`, `Strelka Best Practices`, `GATK Mutect2`, `FreeBayes`, `ASCAT`, `ControlFREEC`
+- [#6](https://github.com/nf-core/sarek/pull/6) - Add default containers for annotation to `nf-core/sarek`
+- [#7](https://github.com/nf-core/sarek/pull/7) - Add `MultiQC`
+- [#7](https://github.com/nf-core/sarek/pull/7) - Add annotation
+- [#7](https://github.com/nf-core/sarek/pull/7) - Add social preview image in `png` and `svg` format
+- [#7](https://github.com/nf-core/sarek/pull/7), [#8](https://github.com/nf-core/sarek/pull/8), [#11](https://github.com/nf-core/sarek/pull/11), [#21](https://github.com/nf-core/sarek/pull/21) - Add helper script `run_tests.sh` to run different tests
+- [#7](https://github.com/nf-core/sarek/pull/7), [#8](https://github.com/nf-core/sarek/pull/8), [#9](https://github.com/nf-core/sarek/pull/9) - Add automatic build of specific containers for annotation for `GRCh37`, `GRCh38` and `GRCm38` using `CircleCI`
+- [#7](https://github.com/nf-core/sarek/pull/7), [#8](https://github.com/nf-core/sarek/pull/8), [#9](https://github.com/nf-core/sarek/pull/9), [#11](https://github.com/nf-core/sarek/pull/11) - Add helper script `build_reference.sh` to build small reference from [nf-core/test-datasets:sarek](https://github.com/nf-core/test-datasets/tree/sarek)
+- [#7](https://github.com/nf-core/sarek/pull/7), [#9](https://github.com/nf-core/sarek/pull/9), [#11](https://github.com/nf-core/sarek/pull/11), [#12](https://github.com/nf-core/sarek/pull/12) - Add helper script `download_image.sh` to download containers for testing
+- [#8](https://github.com/nf-core/sarek/pull/8) - Add test configuration for easier testing
+- [#9](https://github.com/nf-core/sarek/pull/9), [#11](https://github.com/nf-core/sarek/pull/11) - Add scripts for `ASCAT`
+- [#10](https://github.com/nf-core/sarek/pull/10) - Add `TIDDIT` to detect structural variants
+- [#11](https://github.com/nf-core/sarek/pull/11) - Add automatic build of specific containers for annotation for `CanFam3.1` using `CircleCI`
+- [#11](https://github.com/nf-core/sarek/pull/11), [#12](https://github.com/nf-core/sarek/pull/12) - Add posters and abstracts
+- [#12](https://github.com/nf-core/sarek/pull/12) - Add helper script `make_snapshot.sh` to make an archive for usage on a secure cluster
+- [#12](https://github.com/nf-core/sarek/pull/12) - Add helper scripts `filter_locifile.py` and `selectROI.py`
+- [#12](https://github.com/nf-core/sarek/pull/12) - Use `label` for processes configuration
+- [#13](https://github.com/nf-core/sarek/pull/13) - Add Citation documentation
+- [#13](https://github.com/nf-core/sarek/pull/13) - Add `BamQC` process
+- [#13](https://github.com/nf-core/sarek/pull/13) - Add `CompressVCFsnpEff` and `CompressVCFvep` processes
+- [#18](https://github.com/nf-core/sarek/pull/18) - Add `--no-reports` option for tests + add snpEff,VEP,merge to MULTIPLE test
+- [#18](https://github.com/nf-core/sarek/pull/18) - Add logo to `MultiQC` report
+- [#18](https://github.com/nf-core/sarek/pull/18), [#29](https://github.com/nf-core/sarek/pull/29) - Add params `--skipQC` to skip specified QC tools
+- [#18](https://github.com/nf-core/sarek/pull/18) - Add possibility to download other genomes for `sareksnpeff` and `sarekvep` containers
+- [#20](https://github.com/nf-core/sarek/pull/20) - Add `markdownlint` config file
+- [#21](https://github.com/nf-core/sarek/pull/21) - Add tests for latest `Nextflow` version as well
+- [#21](https://github.com/nf-core/sarek/pull/21) - Add `genomes.config` for genomes without `AWS iGenomes`
+- [#24](https://github.com/nf-core/sarek/pull/24) - Added `GATK4 Mutect2` calling and filtering
+- [#27](https://github.com/nf-core/sarek/pull/27), [#30](https://github.com/nf-core/sarek/pull/30) - Use GitHub Actions for CI, linting and branch protection
+- [#31](https://github.com/nf-core/sarek/pull/31) - Add `nf-core lint`
+- [#31](https://github.com/nf-core/sarek/pull/31) - Add extra CI to `GitHub Actions` nf-core extra CI
+- [#35](https://github.com/nf-core/sarek/pull/35) - Building indexes from [nf-core/test-datasets:sarek](https://github.com/nf-core/test-datasets/tree/sarek) for CI and small tests
+
+### Changed
+
+- [#1](https://github.com/nf-core/sarek/pull/1), [#2](https://github.com/nf-core/sarek/pull/2), [#3](https://github.com/nf-core/sarek/pull/3), [#4](https://github.com/nf-core/sarek/pull/4), [#5](https://github.com/nf-core/sarek/pull/5), [#6](https://github.com/nf-core/sarek/pull/6), [#7](https://github.com/nf-core/sarek/pull/7), [#8](https://github.com/nf-core/sarek/pull/8), [#9](https://github.com/nf-core/sarek/pull/9), [#10](https://github.com/nf-core/sarek/pull/10), [#11](https://github.com/nf-core/sarek/pull/11), [#12](https://github.com/nf-core/sarek/pull/12), [#18](https://github.com/nf-core/sarek/pull/18), [#20](https://github.com/nf-core/sarek/pull/20), [#21](https://github.com/nf-core/sarek/pull/21), [#23](https://github.com/nf-core/sarek/pull/23), [#29](https://github.com/nf-core/sarek/pull/29) - Update docs
+- [#4](https://github.com/nf-core/sarek/pull/4) - Update `cancerit-allelecount` from `2.1.2` to `4.0.2`
+- [#4](https://github.com/nf-core/sarek/pull/4) - Update `gatk4` from `4.1.1.0` to `4.1.2.0`
+- [#7](https://github.com/nf-core/sarek/pull/7), [#23](https://github.com/nf-core/sarek/pull/23) - `--sampleDir` is now deprecated, use `--input` instead
+- [#8](https://github.com/nf-core/sarek/pull/8), [#23](https://github.com/nf-core/sarek/pull/23) - `--annotateVCF` is now deprecated, use `--input` instead
+- [#8](https://github.com/nf-core/sarek/pull/8), [#12](https://github.com/nf-core/sarek/pull/12) - Improve helper script `build.nf` for downloading and building reference files
+- [#9](https://github.com/nf-core/sarek/pull/9) - `ApplyBQSR` is now parallelized
+- [#9](https://github.com/nf-core/sarek/pull/9) - Fastq files are named following "${idRun}\_R1.fastq.gz" in the `FastQC` output for easier reporting
+- [#9](https://github.com/nf-core/sarek/pull/9) - Status is now a map with `idpatient`, `idsample` as keys (ie: `status = statusMap[idPatient, idSample]`)
+- [#9](https://github.com/nf-core/sarek/pull/9) - Use `ensembl-vep` `95.2` instead of `96.0`
+- [#11](https://github.com/nf-core/sarek/pull/11) - Summary HTML from `VEP` is now in the `Reports` directory
+- [#12](https://github.com/nf-core/sarek/pull/12) - Update configuration files
+- [#12](https://github.com/nf-core/sarek/pull/12) - Disable `Docker` in `singularity` profile
+- [#12](https://github.com/nf-core/sarek/pull/12) - Disable `Singularity` in `docker` profile
+- [#12](https://github.com/nf-core/sarek/pull/12) - Disable `Docker` and `Singularity` in `conda` profile
+- [#12](https://github.com/nf-core/sarek/pull/12) - Simplify `check_max()` function
+- [#13](https://github.com/nf-core/sarek/pull/13) - Merge `BamQCmapped` and `BamQCrecalibrated` processes into `BamQC` process
+- [#13](https://github.com/nf-core/sarek/pull/13) - Split `CompressVCF` process into `CompressVCFsnpEff` and `CompressVCFvep` processes
+- [#16](https://github.com/nf-core/sarek/pull/16) - Make scripts in `bin/` and `scripts/` executable
+- [#18](https://github.com/nf-core/sarek/pull/18) - Use `--no-reports` for TravisCI testing
+- [#18](https://github.com/nf-core/sarek/pull/18) - Add `--no-reports` for all tests but MULTIPLE in Jenkins
+- [#18](https://github.com/nf-core/sarek/pull/18), [#29](https://github.com/nf-core/sarek/pull/29) - `--noReports` is now `--skipQC all`
+- [#18](https://github.com/nf-core/sarek/pull/18), [#21](https://github.com/nf-core/sarek/pull/21) - Update logo
+- [#21](https://github.com/nf-core/sarek/pull/21) - Moved `smallGRCh37` path to `genomes.config`
+- [#23](https://github.com/nf-core/sarek/pull/23) - Rename `genomeFile`, `genomeIndex` and `genomeDict` by `fasta`, `fastaFai` and `dict`
+- [#23](https://github.com/nf-core/sarek/pull/23) - `--sample` is now deprecated, use `--input` instead
+- [#23](https://github.com/nf-core/sarek/pull/23) - `--genomeFile` is now deprecated, use `--fasta` instead
+- [#23](https://github.com/nf-core/sarek/pull/23) - `--genomeIndex` is now deprecated, use `--fastaFai` instead
+- [#23](https://github.com/nf-core/sarek/pull/23) - `--genomeDict` is now deprecated, use `--dict` instead
+- [#24](https://github.com/nf-core/sarek/pull/24) - `AWS iGenomes` config now contains germline resource for `GATK4 Mutect2`
+- [#30](https://github.com/nf-core/sarek/pull/30) - Simplify code for `MapReads` process
+- [#31](https://github.com/nf-core/sarek/pull/31) - Move extra CI to `GitHub Actions` nf-core extra CI
+- [#32](https://github.com/nf-core/sarek/pull/32), [#33](https://github.com/nf-core/sarek/pull/33) - Install `ASCAT` with `conda` in the `environment.yml` file
[#33](https://github.com/nf-core/sarek/pull/33) - Use `workflow.manifest.version` to specify workflow version in path to scripts for `ControlFREEC` and `VEP` processes +- [#35](https://github.com/nf-core/sarek/pull/35) - Building indexes is now done in `main.nf` +- [#35](https://github.com/nf-core/sarek/pull/35) - `build.nf` script now only download cache, so renamed to `downloadcache.nf` +- [#35](https://github.com/nf-core/sarek/pull/35) - Use `tabix` instead of `IGVtools` to build vcf indexes +- [#35](https://github.com/nf-core/sarek/pull/35) - Refactor references handling +- [#35](https://github.com/nf-core/sarek/pull/35) - Use Channel values instead of `referenceMap` +- [#37](https://github.com/nf-core/sarek/pull/37) - Bump version for Release +- [#38](https://github.com/nf-core/sarek/pull/38) - File names before merge is based on `${idSample}_${idRun}` instead of `${idRun}` + +### Removed + +- [#9](https://github.com/nf-core/sarek/pull/9) - Removed `relatedness2` graph from `vcftools stats` +- [#13](https://github.com/nf-core/sarek/pull/13) - Removed `BamQCmapped` and `BamQCrecalibrated` processes +- [#13](https://github.com/nf-core/sarek/pull/13) - Removed `CompressVCF` +- [#18](https://github.com/nf-core/sarek/pull/18) - Removed params `--noReports` +- [#24](https://github.com/nf-core/sarek/pull/18) - Removed `GATK3.X Mutect2` +- [#31](https://github.com/nf-core/sarek/pull/31) - Remove extra CI from `Travis CI` and `GitHub Actions` nf-core CI +- [#32](https://github.com/nf-core/sarek/pull/32), [#35](https://github.com/nf-core/sarek/pull/35) - Clean up `environment.yml` file +- [#35](https://github.com/nf-core/sarek/pull/35) - Remove building indexes from `build.nf` script +- [#35](https://github.com/nf-core/sarek/pull/35) - Remove helper script `build_reference.sh` +- [#35](https://github.com/nf-core/sarek/pull/35) - Remove `IGVtools` +- [#35](https://github.com/nf-core/sarek/pull/35) - Remove `GATK Mutect2` from `MULTIPLE` test +- [#35](https://github.com/nf-core/sarek/pull/35) - Remove `referenceMap` and `defineReferenceMap()` and use Channel values instead + +### Fixed + +- [#3](https://github.com/nf-core/sarek/pull/3) - Fix `Docker` ownership +- [#11](https://github.com/nf-core/sarek/pull/11) - Fix `MergeMpileup` PublishDir +- [#13](https://github.com/nf-core/sarek/pull/13) - Fix merge in annotation +- [#14](https://github.com/nf-core/sarek/pull/14) - Fix output name for vcf files +- [#16](https://github.com/nf-core/sarek/pull/16) - Fix path to `Rscript` +- [#18](https://github.com/nf-core/sarek/pull/18) - Improve cpu usage +- [#18](https://github.com/nf-core/sarek/pull/18) - Use same font for `nf-core` and `sarek` in ascii art +- [#20](https://github.com/nf-core/sarek/pull/20) - Use new logo in README +- [#20](https://github.com/nf-core/sarek/pull/20) - Fix path to references genomes +- [#22](https://github.com/nf-core/sarek/pull/22) - Fix `--singleCPUMem` issue +- [#30](https://github.com/nf-core/sarek/pull/30) - Fix choice between `inputPairReadsFastQC` and `inputBAMFastQC` channels +- [#31](https://github.com/nf-core/sarek/pull/31) - Fix badges according to nf-core lint +- [#31](https://github.com/nf-core/sarek/pull/31) - Fix `rcolorbrewer` version according to nf-core lint +- [#33](https://github.com/nf-core/sarek/pull/33) - Fix MD Linting +- [#38](https://github.com/nf-core/sarek/pull/38) - Avoid collision in `MultiQC` +- [#39](https://github.com/nf-core/sarek/pull/39) - Fix `ch_dbsnp` channel + +### Deprecated + +- [#23](https://github.com/nf-core/sarek/pull/23) - 
`--sample` is now deprecated, use `--input` instead +- [#23](https://github.com/nf-core/sarek/pull/23) - `--genomeFile` is now deprecated, use `--fasta` instead +- [#23](https://github.com/nf-core/sarek/pull/23) - `--genomeIndex` is now deprecated, use `--fastaFai` instead +- [#23](https://github.com/nf-core/sarek/pull/23) - `--genomeDict` is now deprecated, use `--dict` instead +- [#29](https://github.com/nf-core/sarek/pull/29) - `--noReports` is now deprecated, use `--skipQC all` + +## [2.3.FIX1](https://github.com/SciLifeLab/Sarek/releases/tag/2.3.FIX1) - 2019-03-04 + +### Fixed + +- [#742](https://github.com/SciLifeLab/Sarek/pull/742) - Fix output dirs (`HaplotypeCaller` that was not recognized by `annotate.nf` introduced by [#728](https://github.com/SciLifeLab/Sarek/pull/728)) + +## [2.3](https://github.com/SciLifeLab/Sarek/releases/tag/2.3) - Äpar - 2019-02-27 + +Äpar is one of the main massif in the Sarek National Park. + +### Added + +- [#628](https://github.com/SciLifeLab/Sarek/pull/628), [#722](https://github.com/SciLifeLab/Sarek/pull/722) - `ASCAT` now use `.gc` file +- [#712](https://github.com/SciLifeLab/Sarek/pull/712), [#718](https://github.com/SciLifeLab/Sarek/pull/718) - Added possibilities to run Sarek with `conda` +- [#719](https://github.com/SciLifeLab/Sarek/pull/719) - Annotation documentation +- [#719](https://github.com/SciLifeLab/Sarek/pull/719) - Helper script to download `snpeff` and `VEP` cache files +- [#719](https://github.com/SciLifeLab/Sarek/pull/719) - New `--annotation_cache`, `--snpEff_cache`, `--vep_cache` parameters +- [#719](https://github.com/SciLifeLab/Sarek/pull/719) - Possibility to use cache wen annotating with `snpEff` and `VEP` +- [#722](https://github.com/SciLifeLab/Sarek/pull/722) - Add path to ASCAT `.gc` file in `igenomes.config` +- [#728](https://github.com/SciLifeLab/Sarek/pull/728) - Update `Sarek-data` submodule with multiple patients TSV file +- [#732](https://github.com/SciLifeLab/Sarek/pull/732) - Add `cadd_WG_SNVs`, `cadd_WG_SNVs_tbi`, `cadd_InDels`, `cadd_InDels_tbi` and `cadd_cache` params +- [#732](https://github.com/SciLifeLab/Sarek/pull/732) - Add `tabix` indexed cache for `VEP` +- [#732](https://github.com/SciLifeLab/Sarek/pull/732) - New `DownloadCADD` process to download CADD files +- [#732](https://github.com/SciLifeLab/Sarek/pull/732) - Specify values for `cadd_WG_SNVs`, `cadd_WG_SNVs_tbi`, `cadd_InDels`, `cadd_InDels_tbi` and `cadd_cache` params in `munin.conf` file +- [#732](https://github.com/SciLifeLab/Sarek/pull/732) - Use `cadd_cache` param for optional use of CADD VEP plugin in `annotate.nf` +- [#732](https://github.com/SciLifeLab/Sarek/pull/732) - `VEP` cache has now fasta files for `--HGVS` +- [#735](https://github.com/SciLifeLab/Sarek/pull/735) - Added `--exome` for `Manta`, and for `StrelkaBP` +- [#735](https://github.com/SciLifeLab/Sarek/pull/735) - Added `Travis CI` test for targeted + +### Changed + +- [#710](https://github.com/SciLifeLab/Sarek/pull/710) - Improve release checklist and script +- [#711](https://github.com/SciLifeLab/Sarek/pull/711) - Improve configuration priorities +- [#716](https://github.com/SciLifeLab/Sarek/pull/716) - Update paths to containers and `AWS iGenomes` +- [#717](https://github.com/SciLifeLab/Sarek/pull/717) - `checkFileExtension` has changed to `hasExtension`, and now only verify if file has extension +- [#717](https://github.com/SciLifeLab/Sarek/pull/717) - `fastqFiles` renamed to `inputFiles` +- [#717](https://github.com/SciLifeLab/Sarek/pull/717) - `mapping` step can now map 
BAM files too +- [#717](https://github.com/SciLifeLab/Sarek/pull/717) - `MapReads` can now convert BAM to FASTQ and feed it to BWA on the fly +- [#717](https://github.com/SciLifeLab/Sarek/pull/717), [#732](https://github.com/SciLifeLab/Sarek/pull/732) - Update documentation +- [#719](https://github.com/SciLifeLab/Sarek/pull/719) - `snpeff` and `vep` containers are now built with `conda` +- [#719](https://github.com/SciLifeLab/Sarek/pull/719) - `vepCacheVersion` is now defined in `conf/genomes.config` or `conf/igenomes.config` +- [#722](https://github.com/SciLifeLab/Sarek/pull/722) - Add path to ASCAT `.gc` file in `igenomes.config` +- [#722](https://github.com/SciLifeLab/Sarek/pull/722) - Update `Sarek-data` submodule +- [#723](https://github.com/SciLifeLab/Sarek/pull/723), [#725](https://github.com/SciLifeLab/Sarek/pull/725) - Update docs +- [#724](https://github.com/SciLifeLab/Sarek/pull/724) - Improved `AWS batch` configuration +- [#728](https://github.com/SciLifeLab/Sarek/pull/728) - Improved usage of `targetBED` params +- [#728](https://github.com/SciLifeLab/Sarek/pull/728) - `Strelka` Best Practices output is now prefixed with `StrelkaBP_` +- [#728](https://github.com/SciLifeLab/Sarek/pull/728) - VCFs and Annotated VCFs are now ordered by Patient, then tools +- [#732](https://github.com/SciLifeLab/Sarek/pull/732) - Merge `buildContainers.nf` and `buildReferences.nf` in `build.nf` +- [#732](https://github.com/SciLifeLab/Sarek/pull/732) - Reduce number of CPUs for `RunVEP` to `4` cf: [VEP docs](https://www.ensembl.org/info/docs/tools/vep/script/vep_other.html#faster) +- [#732](https://github.com/SciLifeLab/Sarek/pull/732) - Update `VEP` from `95.1` to `95.2` + +### Removed + +- [#715](https://github.com/SciLifeLab/Sarek/pull/715) - Remove `defReferencesFiles` function from `buildReferences.nf` +- [#719](https://github.com/SciLifeLab/Sarek/pull/719) - `snpEff` base container is no longer used +- [#721](https://github.com/SciLifeLab/Sarek/pull/721) - Remove `COSMIC` docs +- [#728](https://github.com/SciLifeLab/Sarek/pull/728) - Remove `defineDirectoryMap()` +- [#732](https://github.com/SciLifeLab/Sarek/pull/732) - Remove `--database` option for VEP cf: [VEP docs](https://www.ensembl.org/info/docs/tools/vep/script/vep_other.html#faster) + +### Fixed + +- [#720](https://github.com/SciLifeLab/Sarek/pull/720) - `bamQC` is now run on the recalibrated BAMs, and not after `MarkDuplicates` +- [#726](https://github.com/SciLifeLab/Sarek/pull/726) - Fix `Ascat` ref file input (one file can't be a set) +- [#727](https://github.com/SciLifeLab/Sarek/pull/727) - `bamQC` outputs are no longer overwritten (name of dir is now the file instead of sample) +- [#728](https://github.com/SciLifeLab/Sarek/pull/728) - Fix issue with annotation that was consuming `cache` channels +- [#728](https://github.com/SciLifeLab/Sarek/pull/728) - Fix multi sample TSV file [#691](https://github.com/SciLifeLab/Sarek/issues/691) +- [#733](https://github.com/SciLifeLab/Sarek/pull/733) - Fix the possibility to specify reference files on the command line + +## [2.2.2](https://github.com/SciLifeLab/Sarek/releases/tag/2.2.2) - 2018-12-19 + +### Added + +- [#671](https://github.com/SciLifeLab/Sarek/pull/671) - New `publishDirMode` param and docs +- [#673](https://github.com/SciLifeLab/Sarek/pull/673), [#675](https://github.com/SciLifeLab/Sarek/pull/675), [#676](https://github.com/SciLifeLab/Sarek/pull/676) - Profiles for BinAC and CFC clusters in Tübingen +- [#679](https://github.com/SciLifeLab/Sarek/pull/679) - Add container for 
`CreateIntervalBeds`
+- [#692](https://github.com/SciLifeLab/Sarek/pull/692), [#697](https://github.com/SciLifeLab/Sarek/pull/697) - Add `AWS iGenomes` possibilities (within `conf/igenomes.conf`)
+- [#694](https://github.com/SciLifeLab/Sarek/pull/694) - Add monochrome and grey logos for light or dark background
+- [#698](https://github.com/SciLifeLab/Sarek/pull/698) - Add btb profile for munin server
+- [#702](https://github.com/SciLifeLab/Sarek/pull/702) - Add `font-ttf-dejavu-sans-mono` `2.37` and `fontconfig` `2.1dev` to container
+
+### Changed
+
+- [#663](https://github.com/SciLifeLab/Sarek/pull/663) - Update `do_release.sh` script
+- [#671](https://github.com/SciLifeLab/Sarek/pull/671) - `publishDir` modes are now params
+- [#677](https://github.com/SciLifeLab/Sarek/pull/677), [#698](https://github.com/SciLifeLab/Sarek/pull/698), [#703](https://github.com/SciLifeLab/Sarek/pull/703) - Update docs
+- [#678](https://github.com/SciLifeLab/Sarek/pull/678) - Changing `VEP` to `v92` and adjusting CPUs for `VEP`
+- [#679](https://github.com/SciLifeLab/Sarek/pull/679) - Update old `awsbatch` configuration
+- [#682](https://github.com/SciLifeLab/Sarek/pull/682) - Specifications for memory and cpus for `awsbatch`
+- [#693](https://github.com/SciLifeLab/Sarek/pull/693) - `Qualimap bamQC` is now run after mapping and after recalibration for better QC
+- [#700](https://github.com/SciLifeLab/Sarek/pull/700) - Update `GATK` to `4.0.9.0`
+- [#702](https://github.com/SciLifeLab/Sarek/pull/702) - Update `FastQC` to `0.11.8`
+- [#705](https://github.com/SciLifeLab/Sarek/pull/705) - Change `--TMP_DIR` to `--tmp-dir` for `GATK` `4.0.9.0` `BaseRecalibrator`
+- [#706](https://github.com/SciLifeLab/Sarek/pull/706) - Update `Travis CI` testing
+
+### Fixed
+
+- [#665](https://github.com/SciLifeLab/Sarek/pull/665) - Input bam file now always has the same name (whether it is from a single fastq pair or multiple) in the `MarkDuplicates` process, so metrics too
+- [#672](https://github.com/SciLifeLab/Sarek/pull/672) - Process `PullSingularityContainers` from `buildContainers.nf` now expects a file with the correct `.simg` extension for singularity images, and no longer the `.img` one
+- [#679](https://github.com/SciLifeLab/Sarek/pull/679) - Add `publishDirMode` for `germlineVC.nf`
+- [#700](https://github.com/SciLifeLab/Sarek/pull/700) - Fix [#699](https://github.com/SciLifeLab/Sarek/issues/699): missing DP in the FORMAT column of VCFs for Mutect2
+- [#702](https://github.com/SciLifeLab/Sarek/pull/702) - Fix [#701](https://github.com/SciLifeLab/Sarek/issues/701)
+- [#705](https://github.com/SciLifeLab/Sarek/pull/705) - Fix [#704](https://github.com/SciLifeLab/Sarek/issues/704)
+
+## [2.2.1](https://github.com/SciLifeLab/Sarek/releases/tag/2.2.1) - 2018-10-04
+
+### Changed
+
+- [#646](https://github.com/SciLifeLab/Sarek/pull/646) - Update [`pathfindr`](https://github.com/NBISweden/pathfindr) submodule
+- [#659](https://github.com/SciLifeLab/Sarek/pull/659) - Update `Nextflow` to `0.32.0`
+- [#660](https://github.com/SciLifeLab/Sarek/pull/660) - Update docs
+
+### Fixed
+
+- [#657](https://github.com/SciLifeLab/Sarek/pull/657) - Fix `RunMultiQC.nf` bug
+- [#659](https://github.com/SciLifeLab/Sarek/pull/659) - Fix bugs due to updating `Nextflow`
+
+## [2.2.0](https://github.com/SciLifeLab/Sarek/releases/tag/2.2.0) - Skårki - 2018-09-21
+
+Skårki is one of the main massifs in the Sarek National Park.
+
+### Added
+
+- [#613](https://github.com/SciLifeLab/Sarek/pull/613) - Add Issue Templates (bug report and feature request)
+- [#614](https://github.com/SciLifeLab/Sarek/pull/614) - Add PR Template
+- [#615](https://github.com/SciLifeLab/Sarek/pull/615) - Add presentation
+- [#616](https://github.com/SciLifeLab/Sarek/pull/616) - Update documentation
+- [#620](https://github.com/SciLifeLab/Sarek/pull/620) - Add `tmp/` to `.gitignore`
+- [#625](https://github.com/SciLifeLab/Sarek/pull/625) - Add [`pathfindr`](https://github.com/NBISweden/pathfindr) as a submodule
+- [#635](https://github.com/SciLifeLab/Sarek/pull/635) - Add support for processing targeted sequencing with a target BED
+- [#639](https://github.com/SciLifeLab/Sarek/pull/639) - Add a complete example analysis to docs
+- [#640](https://github.com/SciLifeLab/Sarek/pull/640), [#642](https://github.com/SciLifeLab/Sarek/pull/642) - Add helper script for changing version number
+
+### Changed
+
+- [#608](https://github.com/SciLifeLab/Sarek/pull/608) - Update `Nextflow` required version
+- [#615](https://github.com/SciLifeLab/Sarek/pull/615) - Use `splitCsv` instead of `readlines`
+- [#616](https://github.com/SciLifeLab/Sarek/pull/616) - Update `CHANGELOG`
+- [#621](https://github.com/SciLifeLab/Sarek/pull/621), [#638](https://github.com/SciLifeLab/Sarek/pull/638) - Improve install script
+- [#621](https://github.com/SciLifeLab/Sarek/pull/621), [#638](https://github.com/SciLifeLab/Sarek/pull/638) - Simplify tests
+- [#627](https://github.com/SciLifeLab/Sarek/pull/627), [#629](https://github.com/SciLifeLab/Sarek/pull/629), [#637](https://github.com/SciLifeLab/Sarek/pull/637) - Refactor docs
+- [#629](https://github.com/SciLifeLab/Sarek/pull/629) - Refactor config
+- [#632](https://github.com/SciLifeLab/Sarek/pull/632) - Use 2 threads and 2 cpus for `FastQC` processes
+- [#637](https://github.com/SciLifeLab/Sarek/pull/637) - Update tool version gathering
+- [#638](https://github.com/SciLifeLab/Sarek/pull/638) - Use correct `.simg` extension for Singularity images
+- [#639](https://github.com/SciLifeLab/Sarek/pull/639) - Smaller refactoring of the docs
+- [#640](https://github.com/SciLifeLab/Sarek/pull/640) - Update RELEASE_CHECKLIST
+- [#642](https://github.com/SciLifeLab/Sarek/pull/642) - `MultiQC` 1.5 -> 1.6
+- [#642](https://github.com/SciLifeLab/Sarek/pull/642) - `Qualimap` 2.2.2a -> 2.2.2b
+- [#642](https://github.com/SciLifeLab/Sarek/pull/642) - Update `conda` channel order priorities
+- [#642](https://github.com/SciLifeLab/Sarek/pull/642) - `VCFanno` 0.2.8 -> 0.3.0
+- [#642](https://github.com/SciLifeLab/Sarek/pull/642) - `VCFtools` 0.1.15 -> 0.1.16
+
+### Removed
+
+- [#616](https://github.com/SciLifeLab/Sarek/pull/616) - Remove old Issue Template
+- [#629](https://github.com/SciLifeLab/Sarek/pull/629) - Remove old Dockerfiles
+- [#637](https://github.com/SciLifeLab/Sarek/pull/637) - Remove old comments
+
+### Fixed
+
+- [#621](https://github.com/SciLifeLab/Sarek/pull/621) - Fix `VEP` tests
+- [#637](https://github.com/SciLifeLab/Sarek/pull/637) - Fix links in MD files
+
+## [2.1.0](https://github.com/SciLifeLab/Sarek/releases/tag/2.1.0) - Ruotes - 2018-08-14
+
+Ruotes is one of the main massifs in the Sarek National Park.
+
+### Added
+
+- [#555](https://github.com/SciLifeLab/Sarek/pull/555) - `snpEff` output into `VEP`
+- [#556](https://github.com/SciLifeLab/Sarek/pull/556) - `Strelka` Best Practices
+- [#563](https://github.com/SciLifeLab/Sarek/pull/563) - Use `SnpEFF` reports in `MultiQC`
+- [#568](https://github.com/SciLifeLab/Sarek/pull/568) - `VCFTools` process `RunVcftools` for QC
+- [#574](https://github.com/SciLifeLab/Sarek/pull/574), [#580](https://github.com/SciLifeLab/Sarek/pull/580) - Abstracts for `NPMI`, `JOBIM` and `EACR25`
+- [#577](https://github.com/SciLifeLab/Sarek/pull/577) - New repository for testing: [Sarek-data](https://github.com/SciLifeLab/Sarek-data)
+- [#595](https://github.com/SciLifeLab/Sarek/pull/595) - New library `QC` for functions `bamQC`, `bcftools`, `samtoolsStats`, `vcftools`, `getVersionBCFtools`, `getVersionGATK`, `getVersionManta`, `getVersionSnpEFF`, `getVersionStrelka`, `getVersionVCFtools`, `getVersionVEP`
+- [#595](https://github.com/SciLifeLab/Sarek/pull/595) - New Processes `GetVersionBCFtools`, `GetVersionGATK`, `GetVersionManta`, `GetVersionSnpEFF`, `GetVersionStrelka`, `GetVersionVCFtools`, `GetVersionVEP`
+- [#595](https://github.com/SciLifeLab/Sarek/pull/595) - New `Python` script `bin/scrape_tool_versions.py` inspired by @ewels and @apeltzer
+- [#595](https://github.com/SciLifeLab/Sarek/pull/595) - New QC Process `RunVcftools`
+- [#596](https://github.com/SciLifeLab/Sarek/pull/596) - New profile for `BinAC` cluster
+- [#597](https://github.com/SciLifeLab/Sarek/pull/597) - New function `sarek_ascii()` in `SarekUtils`
+- [#599](https://github.com/SciLifeLab/Sarek/pull/599), [#602](https://github.com/SciLifeLab/Sarek/pull/602) - New Process `CompressVCF`
+- [#601](https://github.com/SciLifeLab/Sarek/pull/601), [#603](https://github.com/SciLifeLab/Sarek/pull/603) - Container for `GATK4`
+- [#606](https://github.com/SciLifeLab/Sarek/pull/606) - Add test data as a submodule from [`Sarek-data`](https://github.com/SciLifeLab/Sarek-data)
+- [#608](https://github.com/SciLifeLab/Sarek/pull/608) - Add documentation on how to install Nextflow on `bianca`
+
+### Changed
+
+- [#557](https://github.com/SciLifeLab/Sarek/pull/557), [#583](https://github.com/SciLifeLab/Sarek/pull/583), [#585](https://github.com/SciLifeLab/Sarek/pull/585), [#588](https://github.com/SciLifeLab/Sarek/pull/588) - Update help
+- [#560](https://github.com/SciLifeLab/Sarek/pull/560) - `GitHub` language for the repository is now `Nextflow`
+- [#561](https://github.com/SciLifeLab/Sarek/pull/561) - `do_all.sh` now builds containers for only one genome reference (default `GRCh38`)
+- [#571](https://github.com/SciLifeLab/Sarek/pull/571) - Only one container for all QC tools
+- [#582](https://github.com/SciLifeLab/Sarek/pull/582), [#587](https://github.com/SciLifeLab/Sarek/pull/587) - Update figures
+- [#595](https://github.com/SciLifeLab/Sarek/pull/595) - Function `defineDirectoryMap()` is now part of `SarekUtils`
+- [#595](https://github.com/SciLifeLab/Sarek/pull/595) - Process `GenerateMultiQCconfig` replaced by function `createMultiQCconfig()`
+- [#597](https://github.com/SciLifeLab/Sarek/pull/597) - `extractBams()` now takes an extra parameter
+- [#597](https://github.com/SciLifeLab/Sarek/pull/597) - Move `checkFileExtension()`, `checkParameterExistence()`, `checkParameterList()`, `checkReferenceMap()`, `checkRefExistence()`, `extractBams()`, `extractGenders()`, `returnFile()`, `returnStatus()` and `returnTSV()` functions to `SarekUtils`
+- [#597](https://github.com/SciLifeLab/Sarek/pull/597) - Reduce data footprint for Process `CreateRecalibrationTable`
+- [#597](https://github.com/SciLifeLab/Sarek/pull/597) - Replace deprecated operator `phase` with `join`
+- [#599](https://github.com/SciLifeLab/Sarek/pull/599) - Merge is tested with `ANNOTATEALL`
+- [#604](https://github.com/SciLifeLab/Sarek/pull/604) - Syncing `GRCh38` `wgs_calling_regions` bedfiles
+- [#607](https://github.com/SciLifeLab/Sarek/pull/607) - One container approach
+- [#607](https://github.com/SciLifeLab/Sarek/pull/607) - Update to `GATK4`
+- [#608](https://github.com/SciLifeLab/Sarek/pull/608) - Update `Nextflow` required version
+- [#616](https://github.com/SciLifeLab/Sarek/pull/616) - Update `CHANGELOG`
+- [#617](https://github.com/SciLifeLab/Sarek/pull/617) - Replace deprecated `Nextflow` `$name` syntax with `withName`
+
+### Fixed
+
+- [#560](https://github.com/SciLifeLab/Sarek/pull/560) - Display message for `repository` and `containerPath`
+- [#566](https://github.com/SciLifeLab/Sarek/pull/566) - `slurmDownload` profile
+- [#579](https://github.com/SciLifeLab/Sarek/pull/579), [#584](https://github.com/SciLifeLab/Sarek/pull/584) - `Manta` output reorganized after modification for `Strelka Best Practices` process
+- [#585](https://github.com/SciLifeLab/Sarek/pull/585) - Trace file is plain txt
+- [#590](https://github.com/SciLifeLab/Sarek/pull/590), [#593](https://github.com/SciLifeLab/Sarek/pull/593) - Fix `Singularity` installation in `Travis CI` testing
+- [#598](https://github.com/SciLifeLab/Sarek/pull/598), [#601](https://github.com/SciLifeLab/Sarek/pull/601) - Fixes for `Python` script `selectROI.py` to work with `CLC` viewer
+
+### Removed
+
+- [#607](https://github.com/SciLifeLab/Sarek/pull/607) - Remove `Mutect1`
+
+## [2.0.0](https://github.com/SciLifeLab/Sarek/releases/tag/2.0.0) - 2018-03-23
+
+First release under the `Sarek` name, from the National Park in Northern Sweden.
+
+### Added
+
+- Basic wrapper script
+- Abstract, posters and figures
+- ROI selector and `FreeBayes` sanitizer scripts
+- New logo and icon for the project
+- Check for existing tumor/normal channel
+- `SarekUtils` with `checkParams()`, `checkParameterList()`, `checkParameterExistence()` and `isAllowedParams()` functions
+- Some `runOptions` for `docker` (to prevent some user rights problems)
+- This `CHANGELOG`
+
+### Changed
+
+- `CAW` is now `Sarek`
+- Dissect workflow into 5 new scripts: `annotate.nf`, `main.nf`, `germlineVC.nf`, `runMultiQC.nf` and `somaticVC.nf`
+- `report.html`, `timeline.html` and `trace.html` are generated in `Reports/`
+- `--version` is now used to define the workflow version
+- Most params are now defined in the `base.config` file instead of in the scripts
+- Update `RELEASE_CHECKLIST.md`
+- `checkParams()`, `checkParameterList()`, `checkParameterExistence()` and `isAllowedParams()` script functions are now called within `SarekUtils`
+- `nf_required_version` is now `params.nfRequiredVersion`
+- In the `buildReferences.nf` script, channels now begin with `ch_`, and files with `f_`
+- Use `PublishDir mode: 'link'` instead of `copy`
+- `directoryMap` now contains `params.outDir`
+- [#539](https://github.com/SciLifeLab/Sarek/issues/539) - Use Nextflow support of scratch
+- Reordered `Travis CI` tests
+- Update documentation
+- `MultiQC` version in container from v`1.4` to v`1.5`
+- `vepgrch37` container base image from `release_90.6` to `release_92`
+- `vepgrch38` container base image from `release_90.6` to `release_92`
+- `VEP` version in containers from v`90` to v`91`
+- `nucleotidesPerSecond` is now `params.nucleotidesPerSecond`
+- Default `params.tag` is now `latest` instead of the current version, so `--tag` needs to be specified with the right version to be sure of using the corresponding containers
+
+### Deprecated
+
+- `standard` profile
+- `uppmax-localhost.config` file
+
+### Removed
+
+- `scripts/skeleton_batch.sh`
+- Old data and tsv files
+- `UPPMAX` directories from containers
+- `--step` in `annotate.nf`, `germlineVC.nf` and `somatic.nf`
+- Some `runOptions` for `Singularity` (binding not needed anymore on `UPPMAX`)
+- `download` profile
+
+### Fixed
+
+- [#530](https://github.com/SciLifeLab/Sarek/issues/530) - Use `$PWD` for default `outDir`
+- [#533](https://github.com/SciLifeLab/Sarek/issues/533) - Replace `VEP` `--pick` option by `--per_gene`
+
+## [1.2.5](https://github.com/SciLifeLab/Sarek/releases/tag/1.2.5) - 2018-01-18
+
+### Added
+
+- `Zenodo` for DOI
+- Delivery README
+- Document use of the `--sampleDir` option
+- Contributing Guidelines
+- Issue Templates
+- Release Checklist
+- `--outDir`
+- `awsbatch` profile
+- `aws-batch.config` config file
+- `--noBAMQC` params (failing sometimes on `Bianca`)
+
+### Changed
+
+- Update `Nextflow` to `0.26.0` (new fancy report + `AWS Batch`)
+- Extra time on `Travis CI` testing
+- Replace `bundleDir` by `params.genome_base`
+- Update `MultiQC` to `1.3` (`MEGAQC` FTW)
+- Move and rename some test files
+
+### Fixed
+
+- Version of `COSMIC` `GRCh37` `v83`
+- Write an error message when `--sampleDir` does not find any FASTQ files
+- `base.config` for `ConcatVCF` process
+- File specification for `recalibrationReport` in `RecalibrateBam` process (got error on `AWS Batch`)
+
+## [1.2.4](https://github.com/SciLifeLab/Sarek/releases/tag/1.2.4) - 2017-10-27
+
+### Fixed
+
+- [#488](https://github.com/SciLifeLab/Sarek/issues/488) - Better CPU requirements for `ConcatVCF`
+- 
[#489](https://github.com/SciLifeLab/Sarek/issues/489) - Exception handling for `ASCAT` +- [#490](https://github.com/SciLifeLab/Sarek/issues/490) - CPU requirements for `runSingleStrelka` and `runSingleManta` + +## [1.2.3](https://github.com/SciLifeLab/Sarek/releases/tag/1.2.3) - 2017-10-18 + +### Fixed + +- [#357](https://github.com/SciLifeLab/Sarek/issues/357) - `ASCAT` works for `GRCh38` +- [#471](https://github.com/SciLifeLab/Sarek/issues/471) - Running `Singularity` on `/scratch` +- [#475](https://github.com/SciLifeLab/Sarek/issues/475) - 16 cpus for local executor +- [#480](https://github.com/SciLifeLab/Sarek/issues/480) - No `tsv` file needed for step `annotate` + +## [1.2.2](https://github.com/SciLifeLab/Sarek/releases/tag/1.2.2) - 2017-10-06 + +### Fixed + +- [#479](https://github.com/SciLifeLab/Sarek/issues/479) - Typo in `uppmax-localhost.config` + +## [1.2.1](https://github.com/SciLifeLab/Sarek/releases/tag/1.2.1) - 2017-10-06 + +### Changed + +- `runascat` and `runconvertallelecounts` containers are now replaced by `r-base` +- `willmclaren/ensembl-vep:release_90.5` is now base for `vepgrch37` and `vepgrch38` + +### Removed + +- `vep` container +- `strelka_config.ini` file + +### Fixed + +- [#471](https://github.com/SciLifeLab/Sarek/issues/471) - Running `Singularity` on /scratch +- [#472](https://github.com/SciLifeLab/Sarek/issues/472) - Update function to check `Nextflow` version +- [#473](https://github.com/SciLifeLab/Sarek/issues/473) - Remove `returnMin()` function + +## [1.2.0](https://github.com/SciLifeLab/Sarek/releases/tag/1.2.0) - 2017-10-02 + +### Changed + +- Fix version for Manuscript + +## [1.1](https://github.com/SciLifeLab/Sarek/releases/tag/1.1) - 2017-09-15 + +### Added + +- `Singularity` possibilities + +### Changed + +- Reports made by default +- Intervals file can be a bed file +- Normal sample preprocessing + `HaplotypeCaller` is possible +- Better `Travis CI` tests + +### Fixed + +- Memory requirements + +## [1.0](https://github.com/SciLifeLab/Sarek/releases/tag/1.0) - 2017-02-16 -Initial release of nf-core/sarek, created with the [nf-core](https://nf-co.re/) template. +### Added -### `Added` +- `Docker` possibilities -### `Fixed` +## [0.9](https://github.com/SciLifeLab/Sarek/releases/tag/0.9) - 2016-11-16 -### `Dependencies` +## [0.8](https://github.com/SciLifeLab/Sarek/releases/tag/0.8) - 2016-11-16 -### `Deprecated` +## [0.1] - 2016-04-05 diff --git a/CITATIONS.md b/CITATIONS.md index 9621da5bfa..632a65b795 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -1,5 +1,13 @@ # nf-core/sarek: Citations +## [nf-core/sarek 3](https://www.biorxiv.org/content/10.1101/2023.07.19.549462v2) + +> Hanssen F, Garcia MU, Folkersen L, Pedersen AS, Lescai F, Jodoin S, Miller E, Wacker O, Smith N, nf-core community, Gabernet G, Nahnsen S. Scalable and efficient DNA sequencing analysis on different compute infrastructures aiding variant discovery. bioRxiv. 2023 Jul 19:2023-07. + +## [nf-core/sarek](https://pubmed.ncbi.nlm.nih.gov/32269765/) + +> Garcia MU, Juhos S, Larsson M, Olason PI, Martin M, Eisfeldt J, DiLorenzo S, Sandgren J, Díaz De Ståhl T, Ewels PA, Wirta V, Nistér M, Käller M, Nystedt B. Sarek: A portable workflow for whole-genome sequencing analysis of germline and somatic variants. F1000Res. 2020 Jan 29;9:63. eCollection 2020. doi: 10.12688/f1000research.16665.2. PubMed PMID: 32269765. + ## [nf-core](https://pubmed.ncbi.nlm.nih.gov/32055031/) > Ewels PA, Peltzer A, Fillinger S, Patel H, Alneberg J, Wilm A, Garcia MU, Di Tommaso P, Nahnsen S. 
The nf-core framework for community-curated bioinformatics pipelines. Nat Biotechnol. 2020 Mar;38(3):276-278. doi: 10.1038/s41587-020-0439-x. PubMed PMID: 32055031. @@ -10,14 +18,142 @@ ## Pipeline tools +- [ASCAT](https://pubmed.ncbi.nlm.nih.gov/20837533/) + + > Van Loo P, Nordgard SH, Lingjærde OC, et al.: Allele-specific copy number analysis of tumors. Proc Natl Acad Sci USA . 2010 Sep 28;107(39):16910-5. doi: 10.1073/pnas.1009843107. PubMed PMID: 20837533; PubMed Central PMCID: PMC2947907. + +- [alleleCount](https://github.com/cancerit/alleleCount) + +- [BCFTools](https://pubmed.ncbi.nlm.nih.gov/21903627/) + + > Li H: A statistical framework for SNP calling, mutation discovery, association mapping and population genetical parameter estimation from sequencing data. Bioinformatics. 2011 Nov 1;27(21):2987-93. doi: 10.1093/bioinformatics/btr509. PubMed PMID: 21903627; PubMed Central PMCID: PMC3198575. + +- [BGZip](https://github.com/madler/pigz) + +- [BWA-MEM](https://arxiv.org/abs/1303.3997v2) + + > Li H: Aligning sequence reads, clone sequences and assembly contigs with BWA-MEM. arXiv 2013. doi: 10.48550/arXiv.1303.3997 + +- [BWA-MEM2](https://ieeexplore.ieee.org/document/8820962) + + > M. Vasimuddin, S. Misra, H. Li and S. Aluru, "Efficient Architecture-Aware Acceleration of BWA-MEM for Multicore Systems," 2019 IEEE International Parallel and Distributed Processing Symposium (IPDPS), 2019, pp. 314-324. doi: 10.1109/IPDPS.2019.00041. + +- [CNVKIT](https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1004873) + + > Talevich E, Shain AH, Botton T, Bastian BC (2016) CNVkit: Genome-Wide Copy Number Detection and Visualization from Targeted DNA Sequencing. PLoS Comput Biol 12(4): e1004873. doi: 10.1371/journal.pcbi.1004873. PubMed PMID: 27100738. PubMed Central PMCID: PMC4839673. + +- [Control-FREEC](https://pubmed.ncbi.nlm.nih.gov/22155870/) + + > Boeva V, Popova T, Bleakley K, et al.: Control-FREEC: a tool for assessing copy number and allelic content using next-generation sequencing data. Bioinformatics. 2012; 28(3): 423–5. doi: 10.1093/bioinformatics/btr670. Epub 2011 Dec 6. PubMed PMID: 22155870; PubMed Central PMCID: PMC3268243. + +- [dbNSFP](https://pubmed.ncbi.nlm.nih.gov/33261662/) + + > Liu X, et al.: dbNSFP v4: a comprehensive database of transcript-specific functional predictions and annotations for human nonsynonymous and splice-site SNVs. Genome Med. 2020 Dec 2;12(1):103. doi: 10.1186/s13073-020-00803-9. PubMed PMID: 33261662; PubMed Central PMCID: PMC7709417. + +- [DeepVariant](https://www.nature.com/articles/nbt.4235) + + > Poplin, R., Chang, PC., Alexander, D. et al. A universal SNP and small-indel variant caller using deep neural networks. Nat Biotechnol 36, 983–987 (2018). doi: 10.1038/nbt.4235. + +- [DragMap](https://github.com/Illumina/DRAGMAP) + +- [EnsemblVEP](https://pubmed.ncbi.nlm.nih.gov/27268795/) + + > McLaren W, Gil L, Hunt SE, et al.: The Ensembl Variant Effect Predictor. Genome Biol. 2016 Jun 6;17(1):122. doi: 10.1186/s13059-016-0974-4. PubMed PMID: 27268795; PubMed Central PMCID: PMC4893825. + +- [FastP](https://academic.oup.com/bioinformatics/article/34/17/i884/5093234) + + > Shifu Chen, Yanqing Zhou, Yaru Chen, Jia Gu, fastp: an ultra-fast all-in-one FASTQ preprocessor, Bioinformatics, Volume 34, Issue 17, 01 September 2018, Pages i884–i890, doi: 10.1093/bioinformatics/bty560. PubMed PMID: 30423086. PubMed Central PMCID: PMC6129281 + - [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) > Andrews, S. (2010). 
FastQC: A Quality Control Tool for High Throughput Sequence Data [Online]. +- [FGBio](https://github.com/fulcrumgenomics/fgbio) + +- [FreeBayes](https://arxiv.org/abs/1207.3907) + + > Garrison E, Marth G. Haplotype-based variant detection from short-read sequencing. arXiv preprint arXiv:1207.3907 [q-bio.GN] 2012. doi: 10.48550/arXiv.1207.3907 + +- [GATK](https://pubmed.ncbi.nlm.nih.gov/20644199/) + + > McKenna A, Hanna M, Banks E, et al.: The Genome Analysis Toolkit: a MapReduce framework for analyzing next-generation DNA sequencing data. Genome Res. 2010 Sep;20(9):1297-303. doi: 10.1101/gr.107524.110. Epub 2010 Jul 19. PubMed PMID: 20644199; PubMed Central PMCID: PMC2928508. + +- [GNU sed](http://www.gnu.org/software/sed/) + +- [HaplotypeCaller Joint Germline](https://www.biorxiv.org/content/10.1101/201178v3) + + > Poplin R. et al, Scaling accurate genetic variant discovery to tens of thousands of samples, bioRxiv 2018. doi: 10.1101/201178 + +- [LOFTEE](https://pubmed.ncbi.nlm.nih.gov/32461654/) + + > Karczewski KJ, et al.: The mutational constraint spectrum quantified from variation in 141,456 humans. Nature. 2020 May;581(7809):434-443. doi: 10.1038/s41586-020-2308-7. PubMed PMID: 32461654; PubMed Central PMCID: PMC7334197. + +- [Manta](https://pubmed.ncbi.nlm.nih.gov/26647377/) + + > Chen X, Schulz-Trieglaff O, Shaw R, et al.: Manta: rapid detection of structural variants and indels for germline and cancer sequencing applications. Bioinformatics. 2016 Apr 15;32(8):1220-2. doi: 10.1093/bioinformatics/btv710. PubMed PMID: 26647377. + +- [Mosdepth](https://academic.oup.com/bioinformatics/article/34/5/867/4583630) + + > Brent S Pedersen, Aaron R Quinlan, Mosdepth: quick coverage calculation for genomes and exomes, Bioinformatics, Volume 34, Issue 5, 01 March 2018, Pages 867–868. doi: 10.1093/bioinformatics/btx699. PubMed PMID: 29096012. PubMed Central PMCID: PMC6030888. + +- [MSISensorPro](https://www.sciencedirect.com/science/article/pii/S1672022920300218) + + > Peng Jia, Xiaofei Yang, Li Guo, Bowen Liu, Jiadong Lin, Hao Liang, et al. MSIsensor-pro: fast, accurate, and matched-normal-sample-free detection of microsatellite instability. Genomics Proteomics Bioinformatics 2020,18(1). doi: 10.1016/j.gpb.2020.02.001. PubMed PMID: 32171661. PubMed Central PMCID: PMC7393535. + - [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/) > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924. +- [PIGZ](https://zlib.net/pigz/) + +- [P7Zip](http://p7zip.sourceforge.net/) + +- [Samblaster](https://academic.oup.com/bioinformatics/article/30/17/2503/2748175) + + > Gregory G. Faust, Ira M. Hall, SAMBLASTER: fast duplicate marking and structural variant read extraction, Bioinformatics, Volume 30, Issue 17, 1 September 2014, Pages 2503–2505. doi: 10.1093/bioinformatics/btu314. PubMed PMID: 24812344. PubMed Central PMCID: PMC4147885. + +- [SAMtools](https://pubmed.ncbi.nlm.nih.gov/19505943/) + + > Li H, Handsaker B, Wysoker A, Fennell T, Ruan J, Homer N, Marth G, Abecasis G, Durbin R; 1000 Genome Project Data Processing Subgroup. The Sequence Alignment/Map format and SAMtools. Bioinformatics. 2009 Aug 15;25(16):2078-9. doi: 10.1093/bioinformatics/btp352. Epub 2009 Jun 8. PubMed PMID: 19505943; PubMed Central PMCID: PMC2723002. 
+ +- [snpEff](https://pubmed.ncbi.nlm.nih.gov/22728672/) + + > Cingolani P, Platts A, Wang le L, et al.: A program for annotating and predicting the effects of single nucleotide polymorphisms, SnpEff: SNPs in the genome of Drosophila melanogaster strain w1118; iso-2; iso-3. Fly (Austin). Apr-Jun 2012;6(2):80-92. doi: 10.4161/fly.19695. PubMed PMID: 22728672; PubMed Central PMCID: PMC3679285. + +- [SpliceAI](https://pubmed.ncbi.nlm.nih.gov/30661751/) + + > Jaganathan K, et al.: Predicting Splicing from Primary Sequence with Deep Learning. Cell. 2019 Jan 24;176(3):535-548.e24. doi: 10.1016/j.cell.2018.12.015. PubMed PMID: 30661751. + +- [SpliceRegion](https://github.com/Ensembl/VEP_plugins/blob/release/106/SpliceRegion.pm) + +- [Strelka2](https://pubmed.ncbi.nlm.nih.gov/30013048/) + + > Kim S, Scheffler K, Halpern AL, et al.: Strelka2: fast and accurate calling of germline and somatic variants. Nat Methods. 2018 Aug;15(8):591-594. doi: 10.1038/s41592-018-0051-x. Epub 2018 Jul 16. PubMed PMID: 30013048. + +- [SVDB](https://github.com/J35P312/SVDB) + +- [Tabix](https://academic.oup.com/bioinformatics/article/27/5/718/262743) + + > Li H, Tabix: fast retrieval of sequence features from generic TAB-delimited files, Bioinformatics, Volume 27, Issue 5, 1 March 2011, Pages 718–719, doi: 10.1093/bioinformatics/btq671. PubMed PMID: 21208982. PubMed Central PMCID: PMC3042176. + +- [TIDDIT](https://pubmed.ncbi.nlm.nih.gov/28781756/) + + > Eisfeldt J, Vezzi F, Olason P, et al.: TIDDIT, an efficient and comprehensive structural variant caller for massive parallel sequencing data. F1000Res. 2017 May 10;6:664. doi: 10.12688/f1000research.11168.2. eCollection 2017. PubMed PMID: 28781756; PubMed Central PMCID: PMC5521161. + +- [VCFTools](https://pubmed.ncbi.nlm.nih.gov/21653522/) + + > Danecek P, Auton A, Abecasis G, et al.: The variant call format and VCFtools. Bioinformatics. 2011 Aug 1;27(15):2156-8. doi: 10.1093/bioinformatics/btr330. Epub 2011 Jun 7. PubMed PMID: 21653522; PubMed Central PMCID: PMC3137218. + +## R packages + +- [R](https://www.R-project.org/) + + > R Core Team (2017). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. + +- [RColorBrewer](https://CRAN.R-project.org/package=RColorBrewer) + + > Erich Neuwirth (2014). RColorBrewer: ColorBrewer Palettes. + ## Software packaging/containerisation tools - [Anaconda](https://anaconda.com) @@ -34,7 +170,7 @@ - [Docker](https://dl.acm.org/doi/10.5555/2600239.2600241) - > Merkel, D. (2014). Docker: lightweight linux containers for consistent development and deployment. Linux Journal, 2014(239), 2. doi: 10.5555/2600239.2600241. + > Merkel, D. 2014. Docker: lightweight linux containers for consistent development and deployment. Linux Journal, 2014(239), 2. doi: 10.5555/2600239.2600241. 
- [Singularity](https://pubmed.ncbi.nlm.nih.gov/28494014/)
diff --git a/README.md b/README.md
index f3a70672db..6e4688e1b1 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,9 @@
# ![nf-core/sarek](docs/images/nf-core-sarek_logo_light.png#gh-light-mode-only) ![nf-core/sarek](docs/images/nf-core-sarek_logo_dark.png#gh-dark-mode-only)

[![GitHub Actions CI Status](https://github.com/nf-core/sarek/workflows/nf-core%20CI/badge.svg)](https://github.com/nf-core/sarek/actions?query=workflow%3A%22nf-core+CI%22)
-[![GitHub Actions Linting Status](https://github.com/nf-core/sarek/workflows/nf-core%20linting/badge.svg)](https://github.com/nf-core/sarek/actions?query=workflow%3A%22nf-core+linting%22)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/sarek/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX)
+[![GitHub Actions Linting Status](https://github.com/nf-core/sarek/workflows/nf-core%20linting/badge.svg)](https://github.com/nf-core/sarek/actions?query=workflow%3A%22nf-core+linting%22)
+[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/sarek/results)
+[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.3476425-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.3476425)

[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A523.04.0-23aa62.svg)](https://www.nextflow.io/)
[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)
@@ -13,46 +15,66 @@

## Introduction

-**nf-core/sarek** is a bioinformatics pipeline that ...
+**nf-core/sarek** is a workflow designed to detect variants in whole-genome or targeted sequencing data. Initially designed for human and mouse, it can work on any species with a reference genome. Sarek can also handle tumour/normal pairs and can include additional relapses.

-

+The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers, making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process, which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community!

-
-

+On release, automated continuous integration tests run the pipeline on a full-sized dataset on the AWS cloud infrastructure. This ensures that the pipeline runs on AWS, has sensible resource allocation defaults set to run on real-world datasets, and permits the persistent storage of results to benchmark between pipeline releases and other analysis sources. The results obtained from the full-sized test can be viewed on the [nf-core website](https://nf-co.re/sarek/results).

-1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/))
-2. 
Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) +It's listed on [Elixir - Tools and Data Services Registry](https://bio.tools/nf-core-sarek) and [Dockstore](https://dockstore.org/workflows/github.com/nf-core/sarek). + +
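A new setup can be sanity-checked before any real analysis. The command below is a minimal sketch, assuming Docker is available; `docker` can be swapped for any other supported container profile, and `test_results` is a hypothetical output directory:

```bash
# Smoke test on the bundled minimal test dataset (illustrative values only).
# Replace `docker` with `singularity` or `conda` as appropriate for your system.
nextflow run nf-core/sarek -profile test,docker --outdir test_results
```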
+
+## Pipeline summary
+
+Depending on the options and samples provided, the pipeline can currently perform the following:
+
+- Form consensus reads from UMI sequences (`fgbio`)
+- Sequencing quality control and trimming (enabled by `--trim_fastq`) (`FastQC`, `fastp`)
+- Map reads to reference (`BWA-mem`, `BWA-mem2`, `dragmap` or `Sentieon BWA-mem`)
+- Process BAM file (`GATK MarkDuplicates`, `GATK BaseRecalibrator` and `GATK ApplyBQSR` or `Sentieon LocusCollector` and `Sentieon Dedup`)
+- Summarise alignment statistics (`samtools stats`, `mosdepth`)
+- Variant calling (enabled by `--tools`, see [compatibility](#which-variant-calling-tool-is-implemented-for-which-data-type) and the sketch after this list):
+  - `ASCAT`
+  - `CNVkit`
+  - `Control-FREEC`
+  - `DeepVariant`
+  - `freebayes`
+  - `GATK HaplotypeCaller`
+  - `Manta`
+  - `mpileup`
+  - `MSIsensor-pro`
+  - `Mutect2`
+  - `Sentieon Haplotyper`
+  - `Strelka2`
+  - `TIDDIT`
+- Variant filtering and annotation (`SnpEff`, `Ensembl VEP`, `BCFtools annotate`)
+- Summarise and represent QC (`MultiQC`)
+
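As an illustration of how these switches combine, a run that enables trimming plus a subset of callers and annotators might look like the sketch below. The tool list and paths are assumed example values, not prescribed settings:

```bash
# Hypothetical invocation: trimming on, three tools selected via --tools.
# Tool names passed to --tools are lower-case and comma-separated.
nextflow run nf-core/sarek \
    -profile docker \
    --input samplesheet.csv \
    --outdir results \
    --trim_fastq \
    --tools strelka,manta,snpeff
```

Steps whose tools are not requested are simply skipped, so the same pipeline covers germline-only and tumour/normal designs.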
## Usage > [!NOTE] > If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data. - +Each row represents a pair of fastq files (paired end). Now, you can run the pipeline using: - - ```bash nextflow run nf-core/sarek \ -profile \ @@ -74,24 +96,88 @@ For more details about the output files and reports, please refer to the ## Credits -nf-core/sarek was originally written by Maxime Garcia, Szilveszter Juhos, Friederike Hanssen. +Sarek was originally written by Maxime U Garcia and Szilveszter Juhos at the [National Genomics Infastructure](https://ngisweden.scilifelab.se) and [National Bioinformatics Infastructure Sweden](https://nbis.se) which are both platforms at [SciLifeLab](https://scilifelab.se), with the support of [The Swedish Childhood Tumor Biobank (Barntumörbanken)](https://ki.se/forskning/barntumorbanken). +Friederike Hanssen and Gisela Gabernet at [QBiC](https://www.qbic.uni-tuebingen.de/) later joined and helped with further development. -We thank the following people for their extensive assistance in the development of this pipeline: +The Nextflow DSL2 conversion of the pipeline was lead by Friederike Hanssen and Maxime U Garcia. - +Maintenance is now lead by Friederike Hanssen and Maxime U Garcia (now at [Seqera Labs](https://seqera/io)) -## Contributions and Support +Main developers: + +- [Maxime U Garcia](https://github.com/maxulysse) +- [Friederike Hanssen](https://github.com/FriederikeHanssen) + +We thank the following people for their extensive assistance in the development of this pipeline: + +- [Abhinav Sharma](https://github.com/abhi18av) +- [Adam Talbot](https://github.com/adamrtalbot) +- [Adrian Lärkeryd](https://github.com/adrlar) +- [Alexander Peltzer](https://github.com/apeltzer) +- [Alison Meynert](https://github.com/ameynert) +- [Anders Sune Pedersen](https://github.com/asp8200) +- [arontommi](https://github.com/arontommi) +- [BarryDigby](https://github.com/BarryDigby) +- [Bekir Ergüner](https://github.com/berguner) +- [bjornnystedt](https://github.com/bjornnystedt) +- [cgpu](https://github.com/cgpu) +- [Chela James](https://github.com/chelauk) +- [David Mas-Ponte](https://github.com/davidmasp) +- [Francesco Lescai](https://github.com/lescai) +- [Gavin Mackenzie](https://github.com/GCJMackenzie) +- [Gisela Gabernet](https://github.com/ggabernet) +- [Grant Neilson](https://github.com/grantn5) +- [gulfshores](https://github.com/gulfshores) +- [Harshil Patel](https://github.com/drpatelh) +- [James A. 
Fellows Yates](https://github.com/jfy133)
+- [Jesper Eisfeldt](https://github.com/J35P312)
+- [Johannes Alneberg](https://github.com/alneberg)
+- [José Fernández Navarro](https://github.com/jfnavarro)
+- [Júlia Mir Pedrol](https://github.com/mirpedrol)
+- [Lasse Westergaard Folkersen](https://github.com/lassefolkersen)
+- [Lucia Conde](https://github.com/lconde-ucl)
+- [Malin Larsson](https://github.com/malinlarsson)
+- [Marcel Martin](https://github.com/marcelm)
+- [Nick Smith](https://github.com/nickhsmith)
+- [Nilesh Tawari](https://github.com/nilesh-tawari)
+- [Olga Botvinnik](https://github.com/olgabot)
+- [Oskar Wacker](https://github.com/WackerO)
+- [pallolason](https://github.com/pallolason)
+- [Paul Cantalupo](https://github.com/pcantalupo)
+- [Phil Ewels](https://github.com/ewels)
+- [Sabrina Krakau](https://github.com/skrakau)
+- [Sam Minot](https://github.com/sminot)
+- [Sebastian-D](https://github.com/Sebastian-D)
+- [Silvia Morini](https://github.com/silviamorins)
+- [Solenne Correard](https://github.com/scorreard)
+- [Susanne Jodoin](https://github.com/SusiJo)
+- [Szilveszter Juhos](https://github.com/szilvajuhos)
+- [Tobias Koch](https://github.com/KochTobi)
+- [Winni Kretzschmar](https://github.com/winni2k)
+
+## Acknowledgements
+
+| [![Barntumörbanken](docs/images/BTB_logo.png)](https://ki.se/forskning/barntumorbanken) | [![SciLifeLab](docs/images/SciLifeLab_logo.png)](https://scilifelab.se) |
+| :-----------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------: |
+| [![National Genomics Infrastructure](docs/images/NGI_logo.png)](https://ngisweden.scilifelab.se/) | [![National Bioinformatics Infrastructure Sweden](docs/images/NBIS_logo.png)](https://nbis.se) |
+| [![QBiC](docs/images/QBiC_logo.png)](https://www.qbic.uni-tuebingen.de) | [![GHGA](docs/images/GHGA_logo.png)](https://www.ghga.de/) |
+| [![DNGC](docs/images/DNGC_logo.png)](https://eng.ngc.dk/) | |
+
+## Contributions & Support

If you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).

-For further information or help, don't hesitate to get in touch on the [Slack `#sarek` channel](https://nfcore.slack.com/channels/sarek) (you can join with [this invite](https://nf-co.re/join/slack)).
+For further information or help, don't hesitate to get in touch on the [Slack `#sarek` channel](https://nfcore.slack.com/channels/sarek) (you can join with [this invite](https://nf-co.re/join/slack)), or contact us: [Maxime U Garcia](mailto:maxime.garcia@seqera.io?subject=[GitHub]%20nf-core/sarek), [Friederike Hanssen](mailto:friederike.hanssen@qbic.uni-tuebingen.de?subject=[GitHub]%20nf-core/sarek).

## Citations

-
-
+If you use `nf-core/sarek` for your analysis, please cite the `Sarek` article as follows:

-
+> Friederike Hanssen, Maxime U Garcia, Lasse Folkersen, Anders Sune Pedersen, Francesco Lescai, Susanne Jodoin, Edmund Miller, Oskar Wacker, Nicholas Smith, nf-core community, Gisela Gabernet, Sven Nahnsen. **Scalable and efficient DNA sequencing analysis on different compute infrastructures aiding variant discovery** _bioRxiv_ [doi: 10.1101/2023.07.19.549462](https://doi.org/10.1101/2023.07.19.549462).
+
+> Garcia M, Juhos S, Larsson M et al. 
**Sarek: A portable workflow for whole-genome sequencing analysis of germline and somatic variants [version 2; peer review: 2 approved]** _F1000Research_ 2020, 9:63 [doi: 10.12688/f1000research.16665.2](http://dx.doi.org/10.12688/f1000research.16665.2).
+
+You can cite the `sarek` Zenodo record for a specific version using the following DOI: [10.5281/zenodo.3476425](https://doi.org/10.5281/zenodo.3476425).

An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.

@@ -102,3 +188,7 @@ You can cite the `nf-core` publication as follows:

> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.
>
> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).
+
+## CHANGELOG
+
+- [CHANGELOG](CHANGELOG.md)
diff --git a/assets/dummy_file.txt b/assets/dummy_file.txt
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/assets/methods_description_template.yml b/assets/methods_description_template.yml
index 54239f67c9..fec6e66d3f 100644
--- a/assets/methods_description_template.yml
+++ b/assets/methods_description_template.yml
@@ -3,8 +3,6 @@ description: "Suggested text and references to use when describing pipeline usag
section_name: "nf-core/sarek Methods Description"
section_href: "https://github.com/nf-core/sarek"
plot_type: "html"
-## TODO nf-core: Update the HTML below to your preferred methods description, e.g. add publication citation for this pipeline
-## You inject any metadata in the Nextflow '${workflow}' object
data: |

  <h4>Methods</h4>
  <p>Data was processed using nf-core/sarek v${workflow.manifest.version} ${doi_text} of the nf-core collection of workflows (Ewels et al., 2020), utilising reproducible software environments from the Bioconda (Grüning et al., 2018) and Biocontainers (da Veiga Leprevost et al., 2017) projects.</p>

diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index be42ee2088..c92052b67c 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -1,3 +1,7 @@ +custom_logo: "nf-core-sarek_logo_light.png" +custom_logo_url: https://github.com/nf-core/sarek/ +custom_logo_title: "nf-core/sarek" + report_comment: > This report has been generated by the nf-core/sarek analysis pipeline. For information about how to interpret these results, please see the @@ -11,3 +15,89 @@ report_section_order: order: -1002 export_plots: true + +# Run only these modules +run_modules: + - custom_content + - fastqc + - fastp + - picard + - samtools + - mosdepth + - gatk + - bcftools + - vcftools + - snpeff + - vep + +module_order: + - fastqc: + name: "FastQC (raw)" + path_filters_exclude: + - "*_val_*.zip" + - fastp: + name: "FastP (Read preprocessing)" + - picard: + name: "GATK4 MarkDuplicates" + info: " metrics generated either by GATK4 MarkDuplicates or EstimateLibraryComplexity (with --use_gatk_spark)." + - samtools: + name: "Samtools Flagstat" + - mosdepth: + name: "Mosdepth" + - gatk: + name: "GATK4 BQSR" + - bcftools: + name: "Bcftools" + - vcftools: + name: "Vcftools" + - snpeff: + name: "SNPeff" + - vep: + name: "VEP" + +extra_fn_clean_exts: + - "_val" + - type: regex_keep + pattern: "^.*.(md|recal).mosdepth.(global|region).dist" + module: mosdepth + +sample_names_replace_regex: True +sample_names_replace: + "\\.[0-9]{4}$": ".md" # should match ".0001" but only at the end of strings for module Markduplicates/EstimateLibraryComplexity + module: picard + +custom_data: + dedup_metrics: + id: "dedup_metrics" + section_name: "Sentieon Dedup Metrics" + plot_type: "table" + pconfig: + id: "dedup_metrics" + namespace: "Sentieon Dedup Metrics" + headers: + LIBRARY: + description: "LIBRARY" + UNPAIRED_READS_EXAMINED: + description: "UNPAIRED_READS_EXAMINE" + READ_PAIRS_EXAMINED: + description: "READ_PAIRS_EXAMINED" + SECONDARY_OR_SUPPLEMENTARY_RDS: + description: "SECONDARY_OR_SUPPLEMENTARY_RDS" + UNMAPPED_READS: + description: "UNMAPPED_READS" + UNPAIRED_READ_DUPLICATES: + description: "UNPAIRED_READ_DUPLICATES" + READ_PAIR_DUPLICATES: + description: "READ_PAIR_DUPLICATES" + READ_PAIR_OPTICAL_DUPLICATES: + description: "READ_PAIR_OPTICAL_DUPLICATES" + PERCENT_DUPLICATION: + description: "PERCENT_DUPLICATION" + ESTIMATED_LIBRARY_SIZE: + description: "ESTIMATED_LIBRARY_SIZE" +sp: + snpeff: + contents: "SnpEff_version" + max_filesize: 5000000 + dedup_metrics: + fn: "*.metrics.multiqc.tsv" diff --git a/assets/nf-core-sarek_logo_light.png b/assets/nf-core-sarek_logo_light.png index 89581a852a..61aa1c81d5 100644 Binary files a/assets/nf-core-sarek_logo_light.png and b/assets/nf-core-sarek_logo_light.png differ diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index 5f653ab7bf..171d070180 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,3 +1,2 @@ -sample,fastq_1,fastq_2 -SAMPLE_PAIRED_END,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz -SAMPLE_SINGLE_END,/path/to/fastq/files/AEG588A4_S4_L003_R1_001.fastq.gz, +patient,sample,lane,fastq_1,fastq_2 +PATIENT_ID,SAMPLE_PAIRED_END,LANE,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz diff --git a/assets/schema_input.json b/assets/schema_input.json index 9941cf8a5f..ad0b39f586 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -7,18 +7,73 @@ "items": { "type": "object", "properties": { 
+ "patient": { + "type": "string", + "pattern": "^\\S+$", + "errorMessage": "Patient ID must be provided and cannot contain spaces", + "meta": ["patient"] + }, "sample": { "type": "string", "pattern": "^\\S+$", - "errorMessage": "Sample name must be provided and cannot contain spaces" + "errorMessage": "Sample ID must be provided and cannot contain spaces", + "meta": ["sample"] }, - "fastq_1": { + "sex": { + "errorMessage": "Sex cannot contain spaces", + "meta": ["sex"], + "default": "NA", + "anyOf": [ + { + "type": "string", + "pattern": "^\\S+$" + }, + { + "type": "string", + "maxLength": 0 + } + ] + }, + "status": { + "type": "integer", + "errorMessage": "Status can only be 0 (normal) or 1 (tumor). Defaults to 0, if none is supplied.", + "meta": ["status"], + "default": "0", + "minimum": 0, + "maximum": 1 + }, + "lane": { "type": "string", - "pattern": "^\\S+\\.f(ast)?q\\.gz$", - "errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" + "pattern": "^\\S+$", + "unique": ["patient", "sample"], + "anyOf": [ + { + "dependentRequired": ["fastq_1"] + }, + { + "dependentRequired": ["bam"] + } + ], + "meta": ["lane"] + }, + "fastq_1": { + "errorMessage": "FastQ file for reads 1 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'", + "anyOf": [ + { + "type": "string", + "pattern": "^\\S+\\.f(ast)?q\\.gz$" + }, + { + "type": "string", + "maxLength": 0 + } + ], + "format": "file-path", + "exists": true }, "fastq_2": { "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'", + "dependentRequired": ["fastq_1"], "anyOf": [ { "type": "string", @@ -28,9 +83,104 @@ "type": "string", "maxLength": 0 } - ] + ], + "format": "file-path", + "exists": true + }, + "table": { + "errorMessage": "Recalibration table cannot contain spaces and must have extension '.table'", + "anyOf": [ + { + "type": "string", + "pattern": "^\\S+\\.table$" + }, + { + "type": "string", + "maxLength": 0 + } + ], + "format": "file-path", + "exists": true + }, + "cram": { + "errorMessage": "CRAM file cannot contain spaces and must have extension '.cram'", + "anyOf": [ + { + "type": "string", + "pattern": "^\\S+\\.cram$" + }, + { + "type": "string", + "maxLength": 0 + } + ], + "format": "file-path", + "exists": true + }, + "crai": { + "errorMessage": "CRAM index file cannot contain spaces and must have extension '.crai'", + "anyOf": [ + { + "type": "string", + "pattern": "^\\S+\\.crai$" + }, + { + "type": "string", + "maxLength": 0 + } + ], + "format": "file-path", + "exists": true + }, + "bam": { + "errorMessage": "BAM file cannot contain spaces and must have extension '.bam'", + "anyOf": [ + { + "type": "string", + "pattern": "^\\S+\\.bam$" + }, + { + "type": "string", + "maxLength": 0 + } + ], + "format": "file-path", + "exists": true + }, + "bai": { + "errorMessage": "BAM index file cannot contain spaces and must have extension '.bai'", + "anyOf": [ + { + "type": "string", + "pattern": "^\\S+\\.bai$" + }, + { + "type": "string", + "maxLength": 0 + } + ], + "format": "file-path", + "exists": true + }, + "vcf": { + "errorMessage": "VCF file for reads 1 cannot contain spaces and must have extension '.vcf' or '.vcf.gz'", + "anyOf": [ + { + "type": "string", + "pattern": "^\\S+\\.vcf(\\.gz)?$" + }, + { + "type": "string", + "maxLength": 0 + } + ], + "format": "file-path", + "exists": true + }, + "variantcaller": { + "type": "string" } }, - "required": ["sample", "fastq_1"] + "required": 
["patient", "sample"] } } diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py deleted file mode 100755 index 4a758fe003..0000000000 --- a/bin/check_samplesheet.py +++ /dev/null @@ -1,259 +0,0 @@ -#!/usr/bin/env python - - -"""Provide a command line tool to validate and transform tabular samplesheets.""" - - -import argparse -import csv -import logging -import sys -from collections import Counter -from pathlib import Path - -logger = logging.getLogger() - - -class RowChecker: - """ - Define a service that can validate and transform each given row. - - Attributes: - modified (list): A list of dicts, where each dict corresponds to a previously - validated and transformed row. The order of rows is maintained. - - """ - - VALID_FORMATS = ( - ".fq.gz", - ".fastq.gz", - ) - - def __init__( - self, - sample_col="sample", - first_col="fastq_1", - second_col="fastq_2", - single_col="single_end", - **kwargs, - ): - """ - Initialize the row checker with the expected column names. - - Args: - sample_col (str): The name of the column that contains the sample name - (default "sample"). - first_col (str): The name of the column that contains the first (or only) - FASTQ file path (default "fastq_1"). - second_col (str): The name of the column that contains the second (if any) - FASTQ file path (default "fastq_2"). - single_col (str): The name of the new column that will be inserted and - records whether the sample contains single- or paired-end sequencing - reads (default "single_end"). - - """ - super().__init__(**kwargs) - self._sample_col = sample_col - self._first_col = first_col - self._second_col = second_col - self._single_col = single_col - self._seen = set() - self.modified = [] - - def validate_and_transform(self, row): - """ - Perform all validations on the given row and insert the read pairing status. - - Args: - row (dict): A mapping from column headers (keys) to elements of that row - (values). - - """ - self._validate_sample(row) - self._validate_first(row) - self._validate_second(row) - self._validate_pair(row) - self._seen.add((row[self._sample_col], row[self._first_col])) - self.modified.append(row) - - def _validate_sample(self, row): - """Assert that the sample name exists and convert spaces to underscores.""" - if len(row[self._sample_col]) <= 0: - raise AssertionError("Sample input is required.") - # Sanitize samples slightly. - row[self._sample_col] = row[self._sample_col].replace(" ", "_") - - def _validate_first(self, row): - """Assert that the first FASTQ entry is non-empty and has the right format.""" - if len(row[self._first_col]) <= 0: - raise AssertionError("At least the first FASTQ file is required.") - self._validate_fastq_format(row[self._first_col]) - - def _validate_second(self, row): - """Assert that the second FASTQ entry has the right format if it exists.""" - if len(row[self._second_col]) > 0: - self._validate_fastq_format(row[self._second_col]) - - def _validate_pair(self, row): - """Assert that read pairs have the same file extension. 
Report pair status.""" - if row[self._first_col] and row[self._second_col]: - row[self._single_col] = False - first_col_suffix = Path(row[self._first_col]).suffixes[-2:] - second_col_suffix = Path(row[self._second_col]).suffixes[-2:] - if first_col_suffix != second_col_suffix: - raise AssertionError("FASTQ pairs must have the same file extensions.") - else: - row[self._single_col] = True - - def _validate_fastq_format(self, filename): - """Assert that a given filename has one of the expected FASTQ extensions.""" - if not any(filename.endswith(extension) for extension in self.VALID_FORMATS): - raise AssertionError( - f"The FASTQ file has an unrecognized extension: {filename}\n" - f"It should be one of: {', '.join(self.VALID_FORMATS)}" - ) - - def validate_unique_samples(self): - """ - Assert that the combination of sample name and FASTQ filename is unique. - - In addition to the validation, also rename all samples to have a suffix of _T{n}, where n is the - number of times the same sample exist, but with different FASTQ files, e.g., multiple runs per experiment. - - """ - if len(self._seen) != len(self.modified): - raise AssertionError("The pair of sample name and FASTQ must be unique.") - seen = Counter() - for row in self.modified: - sample = row[self._sample_col] - seen[sample] += 1 - row[self._sample_col] = f"{sample}_T{seen[sample]}" - - -def read_head(handle, num_lines=10): - """Read the specified number of lines from the current position in the file.""" - lines = [] - for idx, line in enumerate(handle): - if idx == num_lines: - break - lines.append(line) - return "".join(lines) - - -def sniff_format(handle): - """ - Detect the tabular format. - - Args: - handle (text file): A handle to a `text file`_ object. The read position is - expected to be at the beginning (index 0). - - Returns: - csv.Dialect: The detected tabular format. - - .. _text file: - https://docs.python.org/3/glossary.html#term-text-file - - """ - peek = read_head(handle) - handle.seek(0) - sniffer = csv.Sniffer() - dialect = sniffer.sniff(peek) - return dialect - - -def check_samplesheet(file_in, file_out): - """ - Check that the tabular samplesheet has the structure expected by nf-core pipelines. - - Validate the general shape of the table, expected columns, and each row. Also add - an additional column which records whether one or two FASTQ reads were found. - - Args: - file_in (pathlib.Path): The given tabular samplesheet. The format can be either - CSV, TSV, or any other format automatically recognized by ``csv.Sniffer``. - file_out (pathlib.Path): Where the validated and transformed samplesheet should - be created; always in CSV format. - - Example: - This function checks that the samplesheet follows the following structure, - see also the `viral recon samplesheet`_:: - - sample,fastq_1,fastq_2 - SAMPLE_PE,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz - SAMPLE_PE,SAMPLE_PE_RUN2_1.fastq.gz,SAMPLE_PE_RUN2_2.fastq.gz - SAMPLE_SE,SAMPLE_SE_RUN1_1.fastq.gz, - - .. _viral recon samplesheet: - https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv - - """ - required_columns = {"sample", "fastq_1", "fastq_2"} - # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. - with file_in.open(newline="") as in_handle: - reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle)) - # Validate the existence of the expected header columns. 
- if not required_columns.issubset(reader.fieldnames): - req_cols = ", ".join(required_columns) - logger.critical(f"The sample sheet **must** contain these column headers: {req_cols}.") - sys.exit(1) - # Validate each row. - checker = RowChecker() - for i, row in enumerate(reader): - try: - checker.validate_and_transform(row) - except AssertionError as error: - logger.critical(f"{str(error)} On line {i + 2}.") - sys.exit(1) - checker.validate_unique_samples() - header = list(reader.fieldnames) - header.insert(1, "single_end") - # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. - with file_out.open(mode="w", newline="") as out_handle: - writer = csv.DictWriter(out_handle, header, delimiter=",") - writer.writeheader() - for row in checker.modified: - writer.writerow(row) - - -def parse_args(argv=None): - """Define and immediately parse command line arguments.""" - parser = argparse.ArgumentParser( - description="Validate and transform a tabular samplesheet.", - epilog="Example: python check_samplesheet.py samplesheet.csv samplesheet.valid.csv", - ) - parser.add_argument( - "file_in", - metavar="FILE_IN", - type=Path, - help="Tabular input samplesheet in CSV or TSV format.", - ) - parser.add_argument( - "file_out", - metavar="FILE_OUT", - type=Path, - help="Transformed output samplesheet in CSV format.", - ) - parser.add_argument( - "-l", - "--log-level", - help="The desired log level (default WARNING).", - choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"), - default="WARNING", - ) - return parser.parse_args(argv) - - -def main(argv=None): - """Coordinate argument parsing and program execution.""" - args = parse_args(argv) - logging.basicConfig(level=args.log_level, format="[%(levelname)s] %(message)s") - if not args.file_in.is_file(): - logger.error(f"The given input file {args.file_in} was not found!") - sys.exit(2) - args.file_out.parent.mkdir(parents=True, exist_ok=True) - check_samplesheet(args.file_in, args.file_out) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/bin/license_message.py b/bin/license_message.py new file mode 100644 index 0000000000..9ba2abba36 --- /dev/null +++ b/bin/license_message.py @@ -0,0 +1,113 @@ +#!/usr/bin/env python3 + +######################################### +# Author: [DonFreed](https://github.com/DonFreed) +# File: license_message.py +# Source: https://github.com/DonFreed/docker-actions-test/blob/main/.github/scripts/license_message.py +# Source+commit: https://github.com/DonFreed/docker-actions-test/blob/aa1051a9f53b3a1e801953748d062cad74dca9a9/.github/scripts/license_message.py +# Download Date: 2023-07-04, commit: aa1051a +# This source code is licensed under the BSD 2-Clause license +######################################### + +""" +Functions for generating and sending license messages +""" + +# Modified from - https://stackoverflow.com/a/59835994 + +import argparse +import base64 +import calendar +import re +import secrets +import sys + +from cryptography.hazmat.primitives.ciphers.aead import AESGCM +from datetime import datetime as dt + +MESSAGE_TIMEOUT = 60 * 60 * 24 # Messages are valid for 1 day +NONCE_BYTES = 12 + + +class DecryptionTimeout(Exception): + # Decrypting a message that is too old + pass + + +def generate_key(): + key = secrets.token_bytes(32) + return key + + +def handle_generate_key(args): + key = generate_key() + key_b64 = base64.b64encode(key) + print(key_b64.decode("utf-8"), file=args.outfile) + + +def encrypt_message(key, message): + nonce = 
secrets.token_bytes(NONCE_BYTES)
+    timestamp = calendar.timegm(dt.now().utctimetuple())
+    data = timestamp.to_bytes(10, byteorder="big") + b"__" + message
+    ciphertext = nonce + AESGCM(key).encrypt(nonce, data, b"")
+    return ciphertext
+
+
+def handle_encrypt_message(args):
+    key = base64.b64decode(args.key.encode("utf-8"))
+    message = args.message.encode("utf-8")
+    ciphertext = encrypt_message(key, message)
+    ciphertext_b64 = base64.b64encode(ciphertext)
+    print(ciphertext_b64.decode("utf-8"), file=args.outfile)
+
+
+def decrypt_message(key, ciphertext, timeout=MESSAGE_TIMEOUT):
+    nonce, ciphertext = ciphertext[:NONCE_BYTES], ciphertext[NONCE_BYTES:]
+    message = AESGCM(key).decrypt(nonce, ciphertext, b"")
+
+    msg_timestamp, message = re.split(b"__", message, maxsplit=1)
+    msg_timestamp = int.from_bytes(msg_timestamp, byteorder="big")
+    timestamp = calendar.timegm(dt.now().utctimetuple())
+    if (timestamp - msg_timestamp) > timeout:
+        raise DecryptionTimeout("The message has an expired timeout")
+    return message.decode("utf-8")
+
+
+def handle_decrypt_message(args):
+    key = base64.b64decode(args.key.encode("utf-8"))
+    ciphertext = base64.b64decode(args.message.encode("utf-8"))
+    message = decrypt_message(key, ciphertext, timeout=args.timeout)
+    print(str(message), file=args.outfile)
+
+
+def parse_args(argv=None):
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--outfile", default=sys.stdout, type=argparse.FileType("w"), help="The output file")
+
+    subparsers = parser.add_subparsers(help="Available sub-commands")
+
+    gen_parser = subparsers.add_parser("generate_key", help="Generate a random key string")
+    gen_parser.set_defaults(func=handle_generate_key)
+
+    encrypt_parser = subparsers.add_parser("encrypt", help="Encrypt a message")
+    encrypt_parser.add_argument("--key", required=True, help="The encryption key")
+    encrypt_parser.add_argument("--message", required=True, help="Message to encrypt")
+    encrypt_parser.set_defaults(func=handle_encrypt_message)
+
+    decrypt_parser = subparsers.add_parser("decrypt", help="Decrypt a message")
+    decrypt_parser.add_argument("--key", required=True, help="The encryption key")
+    decrypt_parser.add_argument("--message", required=True, help="Message to decrypt")
+    decrypt_parser.add_argument(
+        "--timeout",
+        default=MESSAGE_TIMEOUT,
+        type=int,
+        help="A message timeout. Decryption will fail for older messages",
+    )
+    decrypt_parser.set_defaults(func=handle_decrypt_message)
+
+    return parser.parse_args(argv)
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    args.func(args)
diff --git a/conf/base.config b/conf/base.config
index fd61a5d231..d371e9407f 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -9,23 +9,25 @@
 */

 process {
-
-    // TODO nf-core: Check the defaults for all processes
     cpus   = { check_max( 1    * task.attempt, 'cpus'   ) }
     memory = { check_max( 6.GB * task.attempt, 'memory' ) }
     time   = { check_max( 4.h  * task.attempt, 'time'   ) }

+    shell  = ['/bin/bash', '-euo', 'pipefail']
+
+    // Memory errors which should be retried; otherwise error out
     errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' }
     maxRetries    = 1
     maxErrors     = '-1'

     // Process-specific resource requirements
-    // NOTE - Please try and re-use the labels below as much as possible.
-    //        These labels are used and recognised by default in DSL2 files hosted on nf-core/modules.
-    //        If possible, it would be nice to keep the same label naming convention when
-    //        adding in your local modules too.
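As an aside on the new `bin/license_message.py` above: the wire format is simply the nonce followed by the AES-GCM ciphertext of `timestamp || b"__" || payload`, so the round trip can be sanity-checked without going through the CLI. Below is a minimal sketch using the same primitives; it assumes the `cryptography` package is installed, and the key and payload values are purely illustrative:

```python
import base64
import calendar
import secrets
from datetime import datetime as dt

from cryptography.hazmat.primitives.ciphers.aead import AESGCM

NONCE_BYTES = 12  # same constant as bin/license_message.py

key = secrets.token_bytes(32)  # what `license_message.py generate_key` produces

# Encrypt: embed a big-endian timestamp so the receiver can reject stale messages
nonce = secrets.token_bytes(NONCE_BYTES)
timestamp = calendar.timegm(dt.now().utctimetuple())
data = timestamp.to_bytes(10, byteorder="big") + b"__" + b"example payload"
ciphertext = nonce + AESGCM(key).encrypt(nonce, data, b"")
print(base64.b64encode(ciphertext).decode("utf-8"))

# Decrypt: split off the nonce, authenticate and decrypt, then enforce the timeout
nonce, body = ciphertext[:NONCE_BYTES], ciphertext[NONCE_BYTES:]
plaintext = AESGCM(key).decrypt(nonce, body, b"")
msg_ts, _, payload = plaintext.partition(b"__")
age = calendar.timegm(dt.now().utctimetuple()) - int.from_bytes(msg_ts, byteorder="big")
assert age <= 60 * 60 * 24, "message has expired"  # MESSAGE_TIMEOUT in the script
print(payload.decode("utf-8"))
```

The expiry check mirrors `MESSAGE_TIMEOUT` (one day): in the script, older messages raise `DecryptionTimeout`; the assert above stands in for that.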
- // TODO nf-core: Customise requirements for specific processes. // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors + withLabel:error_ignore { + errorStrategy = 'ignore' + } + withLabel:error_retry { + errorStrategy = 'retry' + maxRetries = 2 + } withLabel:process_single { cpus = { check_max( 1 , 'cpus' ) } memory = { check_max( 6.GB * task.attempt, 'memory' ) } @@ -52,14 +54,51 @@ process { withLabel:process_high_memory { memory = { check_max( 200.GB * task.attempt, 'memory' ) } } - withLabel:error_ignore { - errorStrategy = 'ignore' + withName: 'UNZIP.*|UNTAR.*|TABIX.*|BUILD_INTERVALS|CREATE_INTERVALS_BED|CUSTOM_DUMPSOFTWAREVERSIONS|VCFTOOLS|BCFTOOLS.*|SAMTOOLS_INDEX' { + cpus = { check_max( 1 * task.attempt, 'cpus' ) } + memory = { check_max( 1.GB * task.attempt, 'memory' ) } } - withLabel:error_retry { - errorStrategy = 'retry' - maxRetries = 2 + withName: 'FASTQC'{ + cpus = { check_max( 4 * task.attempt, 'cpus' ) } + memory = { check_max( 4.GB * task.attempt, 'memory' ) } + } + withName: 'FASTP'{ + cpus = { check_max( 12 * task.attempt, 'cpus' ) } + memory = { check_max( 4.GB * task.attempt, 'memory' ) } + } + withName: 'BWAMEM1_MEM|BWAMEM2_MEM' { + cpus = { check_max( 24 * task.attempt, 'cpus' ) } + memory = { check_max( 30.GB * task.attempt, 'memory' ) } + } + withName:'CNVKIT_BATCH' { + label = "process_high" + memory = { check_max( 36.GB * task.attempt, 'memory' ) } + } + withName: 'GATK4_MARKDUPLICATES|GATK4SPARK_MARKDUPLICATES' { + cpus = { check_max( 6 * task.attempt, 'cpus' ) } + memory = { check_max( 30.GB * task.attempt, 'memory' ) } + } + withName:'GATK4_APPLYBQSR|GATK4SPARK_APPLYBQSR|GATK4_BASERECALIBRATOR|GATK4SPARK_BASERECALIBRATOR|GATK4_GATHERBQSRREPORTS'{ + cpus = { check_max( 2 * task.attempt, 'cpus' ) } + memory = { check_max( 4.GB * task.attempt, 'memory' ) } + } + withName:'MOSDEPTH'{ + cpus = { check_max( 4 * task.attempt, 'cpus' ) } + memory = { check_max( 4.GB * task.attempt, 'memory' ) } + } + withName:'STRELKA.*|MANTA.*' { + cpus = { check_max( 10 * task.attempt, 'cpus' ) } + memory = { check_max( 8.GB * task.attempt, 'memory' ) } + } + withName:'SAMTOOLS_CONVERT'{ + memory = { check_max( 4.GB * task.attempt, 'memory' ) } + } + withName:'GATK4_MERGEVCFS'{ + cpus = { check_max( 2 * task.attempt, 'cpus' ) } + memory = { check_max( 4.GB * task.attempt, 'memory' ) } } - withName:CUSTOM_DUMPSOFTWAREVERSIONS { - cache = false + withName: 'MULTIQC' { + cpus = { check_max( 4 * task.attempt, 'cpus' ) } + memory = { check_max( 12.GB * task.attempt, 'memory' ) } } } diff --git a/conf/igenomes.config b/conf/igenomes.config index 3f11437759..1ae02673d7 100644 --- a/conf/igenomes.config +++ b/conf/igenomes.config @@ -11,430 +11,314 @@ params { // illumina iGenomes reference file paths genomes { - 'GRCh37' { - fasta = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/README.txt" - mito_name = "MT" - 
macs_gsize = "2.7e9" - blacklist = "${projectDir}/assets/blacklists/GRCh37-blacklist.bed" - } - 'GRCh38' { - fasta = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Annotation/Genes/genes.bed" - mito_name = "chrM" - macs_gsize = "2.7e9" - blacklist = "${projectDir}/assets/blacklists/hg38-blacklist.bed" + 'GATK.GRCh37' { + ascat_alleles = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/ASCAT/G1000_alleles_hg19.zip" + ascat_genome = 'hg19' + ascat_loci = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/ASCAT/G1000_loci_hg19.zip" + ascat_loci_gc = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/ASCAT/GC_G1000_hg19.zip" + ascat_loci_rt = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/ASCAT/RT_G1000_hg19.zip" + bwa = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Sequence/BWAIndex/" + chr_dir = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Sequence/Chromosomes" + dbsnp = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/GATKBundle/dbsnp_138.b37.vcf.gz" + dbsnp_tbi = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/GATKBundle/dbsnp_138.b37.vcf.gz.tbi" + dbsnp_vqsr = '--resource:dbsnp,known=false,training=true,truth=false,prior=2.0 dbsnp_138.b37.vcf.gz' + dict = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Sequence/WholeGenomeFasta/human_g1k_v37_decoy.dict" + fasta = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Sequence/WholeGenomeFasta/human_g1k_v37_decoy.fasta" + fasta_fai = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Sequence/WholeGenomeFasta/human_g1k_v37_decoy.fasta.fai" + germline_resource = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/GATKBundle/af-only-gnomad.raw.sites.vcf.gz" + germline_resource_tbi = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/GATKBundle/af-only-gnomad.raw.sites.vcf.gz.tbi" + intervals = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/intervals/wgs_calling_regions_Sarek.list" + known_snps = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/GATKBundle/1000G_phase1.snps.high_confidence.b37.vcf.gz" + known_snps_tbi = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/GATKBundle/1000G_phase1.snps.high_confidence.b37.vcf.gz.tbi" + known_snps_vqsr = '--resource:1000G,known=false,training=true,truth=true,prior=10.0 1000G_phase1.snps.high_confidence.b37.vcf.gz' + known_indels = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/GATKBundle/{1000G_phase1,Mills_and_1000G_gold_standard}.indels.b37.vcf.gz" + known_indels_tbi = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/GATKBundle/{1000G_phase1,Mills_and_1000G_gold_standard}.indels.b37.vcf.gz.tbi" + known_indels_vqsr = '--resource:1000G,known=false,training=true,truth=true,prior=10.0 1000G_phase1.indels.b37.vcf.gz --resource:mills,known=false,training=true,truth=true,prior=10.0 Mills_and_1000G_gold_standard.indels.b37.vcf.gz' + mappability = 
"${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/Control-FREEC/out100m2_hg19.gem" + ngscheckmate_bed = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh37/Annotation/NGSCheckMate/SNP_GRCh37_hg19_wChr.bed" + snpeff_db = 87 + snpeff_genome = 'GRCh37' + vep_cache_version = 110 + vep_genome = 'GRCh37' + vep_species = 'homo_sapiens' + } + 'GATK.GRCh38' { + ascat_alleles = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/ASCAT/G1000_alleles_hg38.zip" + ascat_genome = 'hg38' + ascat_loci = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/ASCAT/G1000_loci_hg38.zip" + ascat_loci_gc = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/ASCAT/GC_G1000_hg38.zip" + ascat_loci_rt = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/ASCAT/RT_G1000_hg38.zip" + bwa = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Sequence/BWAIndex/" + bwamem2 = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Sequence/BWAmem2Index/" + cf_chrom_len = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Sequence/Length/Homo_sapiens_assembly38.len" + chr_dir = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Sequence/Chromosomes" + dbsnp = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle/dbsnp_146.hg38.vcf.gz" + dbsnp_tbi = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle/dbsnp_146.hg38.vcf.gz.tbi" + dbsnp_vqsr = '--resource:dbsnp,known=false,training=true,truth=false,prior=2.0 dbsnp_146.hg38.vcf.gz' + dict = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.dict" + dragmap = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Sequence/dragmap/" + fasta = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta" + fasta_fai = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta.fai" + germline_resource = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle/af-only-gnomad.hg38.vcf.gz" + germline_resource_tbi = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle/af-only-gnomad.hg38.vcf.gz.tbi" + intervals = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/intervals/wgs_calling_regions_noseconds.hg38.bed" + known_indels = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle/{Mills_and_1000G_gold_standard.indels.hg38,beta/Homo_sapiens_assembly38.known_indels}.vcf.gz" + known_indels_tbi = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle/{Mills_and_1000G_gold_standard.indels.hg38,beta/Homo_sapiens_assembly38.known_indels}.vcf.gz.tbi" + known_indels_vqsr = '--resource:gatk,known=false,training=true,truth=true,prior=10.0 Homo_sapiens_assembly38.known_indels.vcf.gz --resource:mills,known=false,training=true,truth=true,prior=10.0 Mills_and_1000G_gold_standard.indels.hg38.vcf.gz' + known_snps = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle/1000G_omni2.5.hg38.vcf.gz" + known_snps_tbi = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle/1000G_omni2.5.hg38.vcf.gz.tbi" + known_snps_vqsr = '--resource:1000G,known=false,training=true,truth=true,prior=10.0 1000G_omni2.5.hg38.vcf.gz' + mappability = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/Control-FREEC/out100m2_hg38.gem" + ngscheckmate_bed = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/NGSCheckMate/SNP_GRCh38_hg38_wChr.bed" + pon = 
"${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle/1000g_pon.hg38.vcf.gz" + pon_tbi = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle/1000g_pon.hg38.vcf.gz.tbi" + sentieon_dnascope_model = "${params.igenomes_base}/Homo_sapiens/GATK/GRCh38/Annotation/Sentieon/SentieonDNAscopeModel1.1.model" + snpeff_db = 105 + snpeff_genome = 'GRCh38' + vep_cache_version = 110 + vep_genome = 'GRCh38' + vep_species = 'homo_sapiens' + } + 'Ensembl.GRCh37' { + bwa = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/BWAIndex/version0.6.0/" + fasta = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/WholeGenomeFasta/genome.fa" + ngscheckmate_bed = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/NGSCheckMate/SNP_GRCh37_hg19_woChr.bed" + readme = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/README.txt" + snpeff_db = 87 + snpeff_genome = 'GRCh37' + vep_cache_version = 110 + vep_genome = 'GRCh37' + vep_species = 'homo_sapiens' + } + 'NCBI.GRCh38' { + bwa = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/BWAIndex/version0.6.0/" + fasta = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/WholeGenomeFasta/genome.fa" + ngscheckmate_bed ="${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Annotation/NGSCheckMate/SNP_GRCh38_hg38_wChr.bed" + snpeff_db = 105 + snpeff_genome = 'GRCh38' + vep_cache_version = 110 + vep_genome = 'GRCh38' + vep_species = 'homo_sapiens' } 'CHM13' { - fasta = "${params.igenomes_base}/Homo_sapiens/UCSC/CHM13/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Homo_sapiens/UCSC/CHM13/Sequence/BWAIndex/" - bwamem2 = "${params.igenomes_base}/Homo_sapiens/UCSC/CHM13/Sequence/BWAmem2Index/" - gtf = "${params.igenomes_base}/Homo_sapiens/NCBI/CHM13/Annotation/Genes/genes.gtf" - gff = "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/009/914/755/GCF_009914755.1_T2T-CHM13v2.0/GCF_009914755.1_T2T-CHM13v2.0_genomic.gff.gz" - mito_name = "chrM" + fasta = "${params.igenomes_base}/Homo_sapiens/UCSC/CHM13/Sequence/WholeGenomeFasta/genome.fa" + bwa = "${params.igenomes_base}/Homo_sapiens/UCSC/CHM13/Sequence/BWAIndex/" + bwamem2 = "${params.igenomes_base}/Homo_sapiens/UCSC/CHM13/Sequence/BWAmem2Index/" } 'GRCm38' { - fasta = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Annotation/README.txt" - mito_name = "MT" - macs_gsize = "1.87e9" - blacklist = "${projectDir}/assets/blacklists/GRCm38-blacklist.bed" + bwa = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/BWAIndex/version0.6.0/" + chr_dir = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/Chromosomes" + dbsnp = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/MouseGenomeProject/mgp.v5.merged.snps_all.dbSNP142.vcf.gz" + dbsnp_tbi = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/MouseGenomeProject/mgp.v5.merged.snps_all.dbSNP142.vcf.gz.tbi" + dict = 
"${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/WholeGenomeFasta/genome.dict" + fasta = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/WholeGenomeFasta/genome.fa" + fasta_fai = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/WholeGenomeFasta/genome.fa.fai" + intervals = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Annotation/intervals/GRCm38_calling_list.bed" + known_indels = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/MouseGenomeProject/mgp.v5.merged.indels.dbSNP142.normed.vcf.gz" + known_indels_tbi = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/MouseGenomeProject/mgp.v5.merged.indels.dbSNP142.normed.vcf.gz.tbi" + mappability = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Annotation/Control-FREEC/GRCm38_68_mm10.gem" + readme = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Annotation/README.txt" + snpeff_db = 99 + snpeff_genome = 'GRCm38' + vep_cache_version = 102 + vep_genome = 'GRCm38' + vep_species = 'mus_musculus' } 'TAIR10' { - fasta = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Annotation/README.txt" - mito_name = "Mt" + bwa = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/BWAIndex/version0.6.0/" + fasta = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/WholeGenomeFasta/genome.fa" + readme = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Annotation/README.txt" } 'EB2' { - fasta = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Annotation/README.txt" + bwa = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/BWAIndex/version0.6.0/" + fasta = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/WholeGenomeFasta/genome.fa" + readme = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Annotation/README.txt" } 'UMD3.1' { - fasta = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/Bowtie2Index/" - star = 
"${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Annotation/README.txt" - mito_name = "MT" + bwa = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/BWAIndex/version0.6.0/" + fasta = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/WholeGenomeFasta/genome.fa" + readme = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Annotation/README.txt" + snpeff_db = 75 + snpeff_genome = 'UMD3.1' + vep_cache_version = 94 + vep_genome = 'UMD3.1' + vep_species = 'bos_taurus' } 'WBcel235' { - fasta = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Annotation/Genes/genes.bed" - mito_name = "MtDNA" - macs_gsize = "9e7" + bwa = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/BWAIndex/version0.6.0/" + fasta = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/WholeGenomeFasta/genome.fa" + snpeff_db = 105 + snpeff_genome = 'WBcel235' + vep_cache_version = 110 + vep_genome = 'WBcel235' + vep_species = 'caenorhabditis_elegans' } 'CanFam3.1' { - fasta = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Annotation/README.txt" - mito_name = "MT" + bwa = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/BWAIndex/version0.6.0/" + fasta = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/WholeGenomeFasta/genome.fa" + readme = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Annotation/README.txt" + snpeff_db = 99 + snpeff_genome = 'CanFam3.1' + vep_cache_version = 104 + vep_genome = 'CanFam3.1' + vep_species = 'canis_lupus_familiaris' } 'GRCz10' { - fasta = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/Bowtie2Index/" - star = 
"${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Annotation/Genes/genes.bed" - mito_name = "MT" + bwa = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/BWAIndex/version0.6.0/" + fasta = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/WholeGenomeFasta/genome.fa" } 'BDGP6' { - fasta = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Annotation/Genes/genes.bed" - mito_name = "M" - macs_gsize = "1.2e8" + bwa = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/BWAIndex/version0.6.0/" + fasta = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/WholeGenomeFasta/genome.fa" } 'EquCab2' { - fasta = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Annotation/README.txt" - mito_name = "MT" + bwa = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/BWAIndex/version0.6.0/" + fasta = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/WholeGenomeFasta/genome.fa" + readme = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Annotation/README.txt" } 'EB1' { - fasta = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Annotation/README.txt" + bwa = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/BWAIndex/version0.6.0/" + fasta = 
"${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/WholeGenomeFasta/genome.fa" + readme = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Annotation/README.txt" } 'Galgal4' { - fasta = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Annotation/Genes/genes.bed" - mito_name = "MT" + bwa = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/BWAIndex/version0.6.0/" + fasta = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/WholeGenomeFasta/genome.fa" } 'Gm01' { - fasta = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Annotation/README.txt" + bwa = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/BWAIndex/version0.6.0/" + fasta = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/WholeGenomeFasta/genome.fa" + readme = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Annotation/README.txt" } 'Mmul_1' { - fasta = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Annotation/README.txt" - mito_name = "MT" + bwa = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/BWAIndex/version0.6.0/" + fasta = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/WholeGenomeFasta/genome.fa" + readme = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Annotation/README.txt" } 'IRGSP-1.0' { - fasta = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/STARIndex/" - 
bismark = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Annotation/Genes/genes.bed" - mito_name = "Mt" + bwa = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/BWAIndex/version0.6.0/" + fasta = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/WholeGenomeFasta/genome.fa" } 'CHIMP2.1.4' { - fasta = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Annotation/README.txt" - mito_name = "MT" + bwa = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/BWAIndex/version0.6.0/" + fasta = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/WholeGenomeFasta/genome.fa" + readme = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Annotation/README.txt" } 'Rnor_5.0' { - fasta = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Annotation/Genes/genes.bed" - mito_name = "MT" + bwa = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/BWAIndex/version0.6.0/" + fasta = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/WholeGenomeFasta/genome.fa" } 'Rnor_6.0' { - fasta = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Annotation/Genes/genes.bed" - mito_name = "MT" + bwa = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/BWAIndex/version0.6.0/" + fasta = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/WholeGenomeFasta/genome.fa" } 'R64-1-1' { - fasta = 
"${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Annotation/Genes/genes.bed" - mito_name = "MT" - macs_gsize = "1.2e7" + bwa = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/BWAIndex/version0.6.0/" + fasta = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/WholeGenomeFasta/genome.fa" + snpeff_db = 105 + snpeff_genome = 'R64-1-1' + vep_cache_version = 110 + vep_genome = 'R64-1-1' + vep_species = 'saccharomyces_cerevisiae' } 'EF2' { - fasta = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Annotation/README.txt" - mito_name = "MT" - macs_gsize = "1.21e7" + bwa = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/BWAIndex/version0.6.0/" + fasta = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/WholeGenomeFasta/genome.fa" + readme = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Annotation/README.txt" } 'Sbi1' { - fasta = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Annotation/README.txt" + bwa = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/BWAIndex/version0.6.0/" + fasta = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/WholeGenomeFasta/genome.fa" + readme = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Annotation/README.txt" } 'Sscrofa10.2' { - fasta = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/BWAIndex/version0.6.0/" - bowtie2 = 
"${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Annotation/README.txt" - mito_name = "MT" + bwa = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/BWAIndex/version0.6.0/" + fasta = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/WholeGenomeFasta/genome.fa" + readme = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Annotation/README.txt" } 'AGPv3' { - fasta = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Annotation/Genes/genes.bed" - mito_name = "Mt" + bwa = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/BWAIndex/version0.6.0/" + fasta = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/WholeGenomeFasta/genome.fa" } 'hg38' { - fasta = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.bed" - mito_name = "chrM" - macs_gsize = "2.7e9" - blacklist = "${projectDir}/assets/blacklists/hg38-blacklist.bed" + bwa = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/BWAIndex/version0.6.0/" + fasta = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa" + snpeff_db = 105 + snpeff_genome = 'GRCh38' + vep_cache_version = 110 + vep_genome = 'GRCh38' + vep_species = 'homo_sapiens' } 'hg19' { - fasta = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Annotation/README.txt" - mito_name = "chrM" - macs_gsize = "2.7e9" - blacklist = "${projectDir}/assets/blacklists/hg19-blacklist.bed" + bwa = 
"${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/BWAIndex/version0.6.0/" + fasta = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa" + readme = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Annotation/README.txt" + snpeff_db = 87 + snpeff_genome = 'GRCh37' + vep_cache_version = 110 + vep_genome = 'GRCh37' + vep_species = 'homo_sapiens' } 'mm10' { - fasta = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Annotation/README.txt" - mito_name = "chrM" - macs_gsize = "1.87e9" - blacklist = "${projectDir}/assets/blacklists/mm10-blacklist.bed" + bwa = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/BWAIndex/version0.6.0/" + fasta = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/WholeGenomeFasta/genome.fa" + readme = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Annotation/README.txt" + snpeff_db = 99 + snpeff_genome = 'GRCm38' + vep_cache_version = 102 + vep_genome = 'GRCm38' + vep_species = 'mus_musculus' } 'bosTau8' { - fasta = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Annotation/Genes/genes.bed" - mito_name = "chrM" + bwa = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/BWAIndex/version0.6.0/" + fasta = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/WholeGenomeFasta/genome.fa" } 'ce10' { - fasta = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Annotation/README.txt" - mito_name = "chrM" - macs_gsize = "9e7" + bwa = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/BWAIndex/version0.6.0/" + fasta = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/WholeGenomeFasta/genome.fa" + readme = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Annotation/README.txt" } 'canFam3' { - fasta = 
"${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Annotation/README.txt" - mito_name = "chrM" + bwa = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/BWAIndex/version0.6.0/" + fasta = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/WholeGenomeFasta/genome.fa" + readme = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Annotation/README.txt" } 'danRer10' { - fasta = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Annotation/Genes/genes.bed" - mito_name = "chrM" - macs_gsize = "1.37e9" + bwa = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/BWAIndex/version0.6.0/" + fasta = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/WholeGenomeFasta/genome.fa" } 'dm6' { - fasta = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Annotation/Genes/genes.bed" - mito_name = "chrM" - macs_gsize = "1.2e8" + bwa = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/BWAIndex/version0.6.0/" + fasta = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/WholeGenomeFasta/genome.fa" } 'equCab2' { - fasta = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Annotation/Genes/genes.bed" - readme = 
"${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Annotation/README.txt" - mito_name = "chrM" + bwa = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/BWAIndex/version0.6.0/" + fasta = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/WholeGenomeFasta/genome.fa" + readme = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Annotation/README.txt" } 'galGal4' { - fasta = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Annotation/README.txt" - mito_name = "chrM" + bwa = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/BWAIndex/version0.6.0/" + fasta = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/WholeGenomeFasta/genome.fa" + readme = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Annotation/README.txt" } 'panTro4' { - fasta = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Annotation/README.txt" - mito_name = "chrM" + bwa = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/BWAIndex/version0.6.0/" + fasta = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/WholeGenomeFasta/genome.fa" + readme = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Annotation/README.txt" } 'rn6' { - fasta = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Annotation/Genes/genes.bed" - mito_name = "chrM" + bwa = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/BWAIndex/version0.6.0/" + fasta = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/WholeGenomeFasta/genome.fa" } 'sacCer3' { - fasta = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/BWAIndex/version0.6.0/" - bowtie2 = 
"${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/BismarkIndex/" - readme = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Annotation/README.txt" - mito_name = "chrM" - macs_gsize = "1.2e7" + bwa = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/BWAIndex/version0.6.0/" + fasta = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/WholeGenomeFasta/genome.fa" + readme = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Annotation/README.txt" } 'susScr3' { - fasta = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Annotation/README.txt" - mito_name = "chrM" + bwa = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/BWAIndex/version0.6.0/" + fasta = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/WholeGenomeFasta/genome.fa" + readme = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Annotation/README.txt" } } } diff --git a/conf/modules/aligner.config b/conf/modules/aligner.config new file mode 100644 index 0000000000..5f44e199b0 --- /dev/null +++ b/conf/modules/aligner.config @@ -0,0 +1,85 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ext.when = When to run the module. +---------------------------------------------------------------------------------------- +*/ + +// MAPPING + +process { + + if (params.step == 'mapping') { + withName: 'BWAMEM1_MEM' { + ext.when = { params.aligner == 'bwa-mem' } + } + + withName: 'BWAMEM2_MEM' { + ext.when = { params.aligner == 'bwa-mem2' } + } + + withName: 'DRAGMAP_ALIGN' { + ext.args = { "--RGSM ${meta.patient}_${meta.sample} --RGID ${meta.read_group}" } + ext.when = { params.aligner == 'dragmap' } + } + + withName: 'SENTIEON_BWAMEM' { + ext.when = { params.aligner == 'sentieon-bwamem' } + } + + withName: 'BWAMEM.*_MEM|DRAGMAP_ALIGN|SENTIEON_BWAMEM' { + ext.prefix = { params.split_fastq > 1 ? 
"${meta.id}".concat('.').concat(reads.get(0).name.tokenize('.')[0]) : "${meta.id}.sorted" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/preprocessing/" }, + pattern: "*bam", + // Only save if save_output_as_bam AND + // (save_mapped OR no_markduplicates OR sentieon_dedup) AND + // only a single BAM file per sample + saveAs: { + if (params.save_output_as_bam && + ( + params.save_mapped || + (params.skip_tools && params.skip_tools.split(',').contains('markduplicates')) && + !(params.tools && params.tools.split(',').contains('sentieon_dedup')) + ) && (meta.size * meta.num_lanes == 1) + ) { "mapped/${meta.id}/${it}" } + else { null } + } + ] + } + + withName: 'BWAMEM.*_MEM|DRAGMAP_ALIGN' { + // Markduplicates Spark NEEDS name-sorted reads or runtime goes through the roof + // However if it's skipped, reads need to be coordinate-sorted + // Only name sort if Spark for Markduplicates + duplicate marking is not skipped + // Currently SENTIEON_BWAMEM only supports coordinate sorting the reads. + ext.args2 = { params.use_gatk_spark && params.use_gatk_spark.contains('markduplicates') && (!params.skip_tools || (params.skip_tools && !params.skip_tools.split(',').contains('markduplicates'))) ? '-n' : '' } + } + + withName: 'BWAMEM.*_MEM|SENTIEON_BWAMEM' { + // Using -B 3 for tumor samples + ext.args = { meta.status == 1 ? "-K 100000000 -Y -B 3 -R ${meta.read_group}" : "-K 100000000 -Y -R ${meta.read_group}" } + } + } + + withName: 'MERGE_BAM|INDEX_MERGE_BAM' { + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/preprocessing/" }, + pattern: "*{bam,bai}", + // Only save if (save_output_as_bam AND (no_markduplicates OR save_mapped )) + saveAs: { (params.save_output_as_bam && (params.save_mapped || params.skip_tools && params.skip_tools.split(',').contains('markduplicates'))) ? "mapped/${meta.id}/${it}" : null } + ] + } + + withName: 'MERGE_BAM' { + ext.prefix = { "${meta.id}.sorted" } + } +} diff --git a/conf/modules/alignment_to_fastq.config b/conf/modules/alignment_to_fastq.config new file mode 100644 index 0000000000..32878e5342 --- /dev/null +++ b/conf/modules/alignment_to_fastq.config @@ -0,0 +1,85 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ext.when = When to run the module. 
+---------------------------------------------------------------------------------------- +*/ + +// BAM TO FASTQ + +process { + + withName: 'COLLATE_FASTQ_MAP' { + ext.args2 = { '-N' } + ext.prefix = { "${meta.id}.mapped" } + publishDir = [ + //specify to avoid publishing, overwritten otherwise + enabled: false + ] + } + + withName: 'COLLATE_FASTQ_UNMAP' { + ext.args2 = { '-N' } + ext.prefix = { "${meta.id}.unmapped" } + publishDir = [ + //specify to avoid publishing, overwritten otherwise + enabled: false + ] + } + + withName: 'SAMTOOLS_VIEW_MAP_MAP' { + ext.args = { '-b -f1 -F12' } + ext.prefix = { "${meta.id}.map_map" } + publishDir = [ + //specify to avoid publishing, overwritten otherwise + enabled: false + ] + } + + withName: 'SAMTOOLS_VIEW_MAP_UNMAP' { + ext.args = { '-b -f8 -F260' } + ext.prefix = { "${meta.id}.map_unmap" } + publishDir = [ + //specify to avoid publishing, overwritten otherwise + enabled: false + ] + } + + withName: 'SAMTOOLS_VIEW_UNMAP_MAP' { + ext.args = { '-b -f4 -F264' } + ext.prefix = { "${meta.id}.unmap_map" } + publishDir = [ + //specify to avoid publishing, overwritten otherwise + enabled: false + ] + } + + withName: 'SAMTOOLS_VIEW_UNMAP_UNMAP' { + ext.args = { '-b -f12 -F256' } + ext.prefix = { "${meta.id}.unmap_unmap" } + publishDir = [ + //specify to avoid publishing, overwritten otherwise + enabled: false + ] + } + + withName: 'SAMTOOLS_MERGE_UNMAP' { + ext.prefix = { "${meta.id}.merged_unmap" } + publishDir = [ + //specify to avoid publishing, overwritten otherwise + enabled: false + ] + } + withName: 'CAT_FASTQ' { + publishDir = [ + //specify to avoid publishing, overwritten otherwise + enabled: false + ] + } +} diff --git a/conf/modules/annotate.config b/conf/modules/annotate.config new file mode 100644 index 0000000000..ff046ca843 --- /dev/null +++ b/conf/modules/annotate.config @@ -0,0 +1,106 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ext.when = When to run the module. +---------------------------------------------------------------------------------------- +*/ + +// ANNOTATE + +process { + + // SNPEFF + if (params.tools && (params.tools.split(',').contains('snpeff') || params.tools.split(',').contains('merge'))) { + withName: 'SNPEFF_SNPEFF' { + ext.args = { '-nodownload -canon -v' } + ext.prefix = { vcf.baseName - '.vcf' + '_snpEff' } + publishDir = [ + [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/reports/snpeff/${meta.variantcaller}/${meta.id}/" }, + pattern: "*{csv,html,genes.txt}", + saveAs: { params.tools.split(',').contains('snpeff') ? it : null } + ] + ] + } + } + + // VEP + if (params.tools && (params.tools.split(',').contains('vep') || params.tools.split(',').contains('merge'))) { + withName: 'ENSEMBLVEP_VEP' { + ext.args = { [ + (params.vep_dbnsfp && params.dbnsfp && !params.dbnsfp_consequence) ? "--plugin dbNSFP,${params.dbnsfp.split("/")[-1]},${params.dbnsfp_fields}" : '', + (params.vep_dbnsfp && params.dbnsfp && params.dbnsfp_consequence) ? 
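+                // When dbnsfp_consequence is set, restrict the dbNSFP plugin to those consequence terms: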
"--plugin dbNSFP,'consequence=${params.dbnsfp_consequence}',${params.dbnsfp.split("/")[-1]},${params.dbnsfp_fields}" : '', + (params.vep_loftee) ? "--plugin LoF,loftee_path:/usr/local/share/ensembl-vep-${params.vep_version}" : '', + (params.vep_spliceai && params.spliceai_snv && params.spliceai_indel) ? "--plugin SpliceAI,snv=${params.spliceai_snv.split("/")[-1]},indel=${params.spliceai_indel.split("/")[-1]}" : '', + (params.vep_spliceregion) ? '--plugin SpliceRegion' : '', + (params.vep_out_format) ? "--${params.vep_out_format}" : '--vcf', + (params.vep_custom_args) ?: '' + ].join(' ').trim() } + // If just VEP: _VEP.ann.vcf + ext.prefix = { vcf.baseName - '.vcf' + '_VEP.ann' } + publishDir = [ + [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/reports/EnsemblVEP/${meta.variantcaller}/${meta.id}/" }, + pattern: "*html" + ], + [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/annotation/${meta.variantcaller}/${meta.id}/" }, + pattern: "*{gz}" + ] + ] + } + } + + // BCFTOOLS ANNOTATE + if (params.tools && params.tools.split(',').contains('bcfann')) { + withName: 'NFCORE_SAREK:SAREK:VCF_ANNOTATE_ALL:VCF_ANNOTATE_BCFTOOLS:BCFTOOLS_ANNOTATE' { + ext.args = { '--output-type z' } + ext.prefix = { input.baseName - '.vcf' + '_BCF.ann' } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/annotation/${meta.variantcaller}/${meta.id}/" }, + pattern: "*{gz}" + ] + } + } + + // SNPEFF THEN VEP + if (params.tools && params.tools.split(',').contains('merge')) { + withName: 'NFCORE_SAREK:SAREK:VCF_ANNOTATE_ALL:VCF_ANNOTATE_MERGE:ENSEMBLVEP_VEP' { + // If merge: Output file will have format *_snpEff_VEP.ann.vcf, *_snpEff_VEP.ann.json or *_snpEff_VEP.ann.tab + ext.prefix = { vcf.baseName - '.ann.vcf' + '_VEP.ann' } + } + } + + // ALL ANNOTATION TOOLS + if (params.tools && (params.tools.split(',').contains('snpeff') || params.tools.split(',').contains('vep') || params.tools.split(',').contains('merge') || params.tools.split(',').contains('bcfann'))) { + withName: 'NFCORE_SAREK:SAREK:VCF_ANNOTATE_ALL:.*:(TABIX_BGZIPTABIX|TABIX_TABIX)' { + ext.prefix = { input.name - '.vcf' } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/annotation/${meta.variantcaller}/${meta.id}/" }, + pattern: "*{gz.tbi}" + ] + } + } + + if (params.tools && (params.tools.split(',').contains('snpeff') || params.tools.split(',').contains('merge'))) { + withName: 'NFCORE_SAREK:SAREK:VCF_ANNOTATE_ALL:VCF_ANNOTATE_SNPEFF:TABIX_BGZIPTABIX' { + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/annotation/${meta.variantcaller}/${meta.id}/" }, + pattern: "*{gz,gz.tbi}", + saveAs: { params.tools.split(',').contains('snpeff') ? it : null } + ] + } + } +} diff --git a/conf/modules/ascat.config b/conf/modules/ascat.config new file mode 100644 index 0000000000..4df9824274 --- /dev/null +++ b/conf/modules/ascat.config @@ -0,0 +1,37 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. 
+ ext.when = When to run the module. +---------------------------------------------------------------------------------------- +*/ + +// ASCAT + +process { + + withName: 'ASCAT' { + ext.args = { [ + "gender": meta.sex, + "genomeVersion": params.ascat_genome, + "purity": params.ascat_purity, + "ploidy": params.ascat_ploidy, + "minCounts": params.ascat_min_counts, + "chrom_names": meta.sex == 'XX' ? "c(1:22, 'X')" : "c(1:22, 'X', 'Y')", // for faster testing use "c('21', '22')" + "min_base_qual": params.ascat_min_base_qual, + "min_map_qual": params.ascat_min_map_qual + ] + } + ext.when = { params.tools && params.tools.split(',').contains('ascat') } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/ascat/${meta.id}/" }, + pattern: "*{png,cnvs.txt,metrics.txt,purityploidy.txt,segments.txt,LogR.txt,BAF.txt}" + ] + } +} diff --git a/conf/modules/cnvkit.config b/conf/modules/cnvkit.config new file mode 100644 index 0000000000..afeed0bdf6 --- /dev/null +++ b/conf/modules/cnvkit.config @@ -0,0 +1,37 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ext.when = When to run the module. +---------------------------------------------------------------------------------------- +*/ + +// CNVKIT + +process { + + withName: 'CNVKIT_BATCH' { + ext.args = { params.wes ? "--method hybrid --diagram --scatter" : "--method wgs --diagram --scatter" } + ext.when = { params.tools && params.tools.split(',').contains('cnvkit') } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/cnvkit/${meta.id}/" }, + pattern: "*{bed,cnn,cnr,cns,pdf,png}" + ] + } + + withName: 'CNVKIT_GENEMETRICS' { + ext.prefix = { "${cnr.baseName}.genemetrics" } + ext.when = { params.tools && params.tools.split(',').contains('cnvkit') } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/cnvkit/${meta.id}/" }, + pattern: "*{tsv}" + ] + } +} diff --git a/conf/modules/controlfreec.config b/conf/modules/controlfreec.config new file mode 100644 index 0000000000..8ed3920c43 --- /dev/null +++ b/conf/modules/controlfreec.config @@ -0,0 +1,131 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ext.when = When to run the module. 
+---------------------------------------------------------------------------------------- +*/ + +// CONTROLFREEC + +process { + + withName: 'ASSESS_SIGNIFICANCE' { + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/controlfreec/${meta.id}/" }, + pattern: "*{.p.value.txt}" + ] + } + + withName: 'FREEC_.*' { + ext.when = { params.tools && params.tools.split(',').contains('controlfreec') } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/controlfreec/${meta.id}/" }, + pattern: "*{BedGraph,cpn,txt,_CNVs}" + ] + } + + withName: 'FREEC2BED' { + ext.args = { "${params.cf_ploidy}" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/controlfreec/${meta.id}/" }, + pattern: "*bed" + ] + } + + withName: 'FREEC2CIRCOS' { + ext.args = { "${params.cf_ploidy}" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/controlfreec/${meta.id}/" }, + pattern: "*circos.txt" + ] + } + + withName: 'MAKEGRAPH' { + ext.args = { "${params.cf_ploidy}" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/controlfreec/${meta.id}/" }, + pattern: "*png" + ] + } + +// TUMOR_ONLY_VARIANT_CALLING + withName: 'FREEC_TUMORONLY' { + ext.args = { [ + "sample":[ + inputformat: 'pileup', + mateorientation: 'FR' + ], + "general" :[ + bedgraphoutput: "TRUE", + breakpointthreshold: params.wes ? "1.2" : "0.8", //Values taken from Freec example configs + breakpointtype: params.wes ? "4" : "2", // Values taken from Freec example configs + coefficientofvariation: params.cf_coeff, + contamination: params.cf_contamination ?: "", + contaminationadjustment: params.cf_contamination_adjustment ? "TRUE" : "", + forcegccontentnormalization: params.wes ? "1" : "0", + minimalsubclonepresence: params.wes ? "30" : "20", + noisydata: params.wes ? "TRUE" : "FALSE", + ploidy: params.cf_ploidy, + printNA: params.wes ? "FALSE" : "TRUE", + readcountthreshold: params.wes ? "50" : "10", + sex: meta.sex, + //uniquematch: not set + window: params.cf_window ?: "" + ], + "BAF":[ + minimalcoverageperposition: params.cf_mincov ?: "", + minimalqualityperposition: params.cf_minqual ?: "", + //"shiftinquality": (optional)not set + ] + ] + } + } + +// PAIR_VARIANT_CALLING + withName: 'FREEC_SOMATIC' { + ext.args = { [ + "sample":[ + inputformat: 'pileup', + mateorientation: 'FR' + ], + "control":[ + inputformat: "pileup", + mateorientation: "FR" + ], + "general" :[ + bedgraphoutput: "TRUE", + breakpointthreshold: params.wes ? "1.2" : "0.8", //Values taken from Freec example configs + breakpointtype: params.wes ? "4" : "2", // Values taken from Freec example configs + coefficientofvariation: params.cf_coeff, + contamination: params.cf_contamination ?: "", + contaminationadjustment: params.cf_contamination_adjustment ? "TRUE" : "", + forcegccontentnormalization: params.wes ? "1" : "0", + minimalsubclonepresence: params.wes ? "30" : "20", + noisydata: params.wes ? "TRUE" : "FALSE", + ploidy: params.cf_ploidy, + printNA: params.wes ? "FALSE" : "TRUE", + readcountthreshold: params.wes ? 
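+                // WES gets a stricter read-count threshold (50) than WGS (10), following the Control-FREEC example configs: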
"50" : "10", + sex: meta.sex, + //uniquematch: not set + window: params.cf_window ?: "" + ], + "BAF":[ + minimalcoverageperposition: params.cf_mincov ?: "", + minimalqualityperposition: params.cf_minqual ?: "", + //"shiftinquality": (optional)not set + ] + ] + } + } +} diff --git a/conf/modules/deepvariant.config b/conf/modules/deepvariant.config new file mode 100644 index 0000000000..ff67bc1a4b --- /dev/null +++ b/conf/modules/deepvariant.config @@ -0,0 +1,43 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ext.when = When to run the module. +---------------------------------------------------------------------------------------- +*/ + +// DEEPVARIANT + +process { + + withName: 'DEEPVARIANT' { + ext.args = { params.wes ? "--model_type WES" : "--model_type WGS" } + ext.prefix = { meta.num_intervals <= 1 ? "${meta.id}.deepvariant" : "${meta.id}.deepvariant.${intervals.simpleName}" } + ext.when = { params.tools && params.tools.split(',').contains('deepvariant') } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/" }, + pattern: "*{vcf.gz,vcf.gz.tbi}", + saveAs: { meta.num_intervals > 1 ? null : "deepvariant/${meta.id}/${it}" } + ] + } + + withName: 'MERGE_DEEPVARIANT_.*' { + ext.prefix = { "${meta.id}.deepvariant" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/deepvariant/${meta.id}/" }, + pattern: "*{vcf.gz,vcf.gz.tbi}" + ] + } + + withName: 'MERGE_DEEPVARIANT_GVCF' { + ext.prefix = { "${meta.id}.deepvariant.g" } + } + +} diff --git a/conf/modules/download_cache.config b/conf/modules/download_cache.config new file mode 100644 index 0000000000..5b36ab4cc5 --- /dev/null +++ b/conf/modules/download_cache.config @@ -0,0 +1,36 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ext.when = When to run the module. +---------------------------------------------------------------------------------------- +*/ + +// PREPARE_CACHE + +process { + + // SNPEFF + withName: 'SNPEFF_DOWNLOAD' { + ext.when = { params.tools && (params.tools.split(',').contains('snpeff') || params.tools.split(',').contains('merge')) } + publishDir = [ + mode: params.publish_dir_mode, + path: { params.outdir_cache ? 
"${params.outdir_cache}/": "${params.outdir}/cache/" } + ] + } + + // VEP + withName: 'ENSEMBLVEP_DOWNLOAD' { + ext.when = { params.tools && (params.tools.split(',').contains('vep') || params.tools.split(',').contains('merge')) } + ext.args = { '--AUTO c --CONVERT --NO_BIOPERL --NO_HTSLIB --NO_TEST --NO_UPDATE' } + publishDir = [ + mode: params.publish_dir_mode, + path: { params.outdir_cache ? "${params.outdir_cache}/": "${params.outdir}/cache/" } + ] + } +} diff --git a/conf/modules/freebayes.config b/conf/modules/freebayes.config new file mode 100644 index 0000000000..a30ec62308 --- /dev/null +++ b/conf/modules/freebayes.config @@ -0,0 +1,68 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ext.when = When to run the module. +---------------------------------------------------------------------------------------- +*/ + +// FREEBAYES + +process { + + withName: 'MERGE_FREEBAYES' { + ext.prefix = { "${meta.id}.freebayes" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/freebayes/${meta.id}/" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'FREEBAYES' { + ext.args = { '--min-alternate-fraction 0.1 --min-mapping-quality 1' } + //To make sure no naming conflicts ensure with module BCFTOOLS_SORT & the naming being correct in the output folder + ext.prefix = { meta.num_intervals <= 1 ? "${meta.id}" : "${meta.id}.${target_bed.simpleName}" } + ext.when = { params.tools && params.tools.split(',').contains('freebayes') } + publishDir = [ + enabled: false + ] + } + + withName: 'BCFTOOLS_SORT' { + ext.prefix = { meta.num_intervals <= 1 ? meta.id + ".freebayes" : vcf.name - ".vcf" + ".sort" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/" }, + pattern: "*vcf.gz", + saveAs: { meta.num_intervals > 1 ? null : "freebayes/${meta.id}/${it}" } + ] + } + + withName : 'TABIX_VC_FREEBAYES' { + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/freebayes/${meta.id}/" }, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + + // PAIR_VARIANT_CALLING + if (params.tools && params.tools.split(',').contains('freebayes')) { + withName: '.*:BAM_VARIANT_CALLING_SOMATIC_ALL:BAM_VARIANT_CALLING_FREEBAYES:FREEBAYES' { + ext.args = { "--pooled-continuous \ + --pooled-discrete \ + --genotype-qualities \ + --report-genotype-likelihood-max \ + --allele-balance-priors-off \ + --min-alternate-fraction 0.03 \ + --min-repeat-entropy 1 \ + --min-alternate-count 2 " } + } + } +} diff --git a/conf/modules/haplotypecaller.config b/conf/modules/haplotypecaller.config new file mode 100644 index 0000000000..f376f4f41d --- /dev/null +++ b/conf/modules/haplotypecaller.config @@ -0,0 +1,67 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ext.when = When to run the module. +---------------------------------------------------------------------------------------- +*/ + +// HAPLOTYPECALLER + +process { + + withName: 'GATK4_HAPLOTYPECALLER' { + ext.args = { params.joint_germline ? "-ERC GVCF" : "" } + ext.prefix = { meta.num_intervals <= 1 ? ( params.joint_germline ? "${meta.id}.haplotypecaller.g" : "${meta.id}.haplotypecaller" ) : ( params.joint_germline ? "${meta.id}.haplotypecaller.${intervals.simpleName}.g" :"${meta.id}.haplotypecaller.${intervals.simpleName}" ) } + ext.when = { params.tools && params.tools.split(',').contains('haplotypecaller') } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/"}, + pattern: "*{vcf.gz,vcf.gz.tbi}", + saveAs: { meta.num_intervals > 1 ? null : "haplotypecaller/${meta.id}/${it}" } + ] + } + + withName: 'MERGE_HAPLOTYPECALLER' { + ext.prefix = { params.joint_germline ? "${meta.id}.haplotypecaller.g" : "${meta.id}.haplotypecaller" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/haplotypecaller/${meta.id}/" }, + saveAs: { filename -> filename.equals('versions.yml') ? 
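+                // Scattered per-interval VCFs are only published for single-interval runs, so this merged file is the published result otherwise: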
null : filename } + ] + } + + withName: 'CNNSCOREVARIANTS' { + publishDir = [ + // Otherwise it gets published + enabled: false + ] + } + + withName: '.*:VCF_VARIANT_FILTERING_GATK:FILTERVARIANTTRANCHES' { + ext.args = { "--info-key CNN_1D" } + ext.prefix = { "${meta.id}.haplotypecaller" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/haplotypecaller/${meta.id}/"}, + pattern: "*{vcf.gz,vcf.gz.tbi}" + ] + } + + if (params.tools && params.tools.split(',').contains('haplotypecaller')) { + withName: 'NFCORE_SAREK:SAREK:BAM_VARIANT_CALLING_GERMLINE_ALL:BAM_VARIANT_CALLING_HAPLOTYPECALLER:BAM_MERGE_INDEX_SAMTOOLS:(MERGE_BAM|INDEX_MERGE_BAM)' { + ext.prefix = { "${meta.id}.realigned" } + publishDir = [ + enabled: true, + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/haplotypecaller/${meta.id}/" }, + pattern: "*{bam,bai}" + ] + } + } +} diff --git a/conf/modules/joint_germline.config b/conf/modules/joint_germline.config new file mode 100644 index 0000000000..61a296721d --- /dev/null +++ b/conf/modules/joint_germline.config @@ -0,0 +1,84 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ext.when = When to run the module. +---------------------------------------------------------------------------------------- +*/ + +// JOINT_GERMLINE + +process { + + withName: 'GATK4_GENOMICSDBIMPORT' { + ext.args = { '--genomicsdb-shared-posixfs-optimizations true --bypass-feature-reader' } + ext.prefix = { "${meta.intervals_name}.joint" } + publishDir = [ + enabled: false + ] + } + + withName: 'GATK4_GENOTYPEGVCFS' { + ext.prefix = { meta.intervals_name } + publishDir = [ + enabled: false + ] + } + + if (params.tools && params.tools.contains('haplotypecaller') && params.joint_germline) { + withName: 'NFCORE_SAREK:SAREK:BAM_VARIANT_CALLING_GERMLINE_ALL:BAM_JOINT_CALLING_GERMLINE_GATK:BCFTOOLS_SORT' { + ext.prefix = { vcf.baseName - ".vcf" + ".sort" } + publishDir = [ + enabled: false + ] + } + } + + withName: 'MERGE_GENOTYPEGVCFS' { + ext.prefix = { 'joint_germline' } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/haplotypecaller/joint_variant_calling/" }, + pattern: "*{vcf.gz,vcf.gz.tbi}" + ] + } + + withName: 'VARIANTRECALIBRATOR_INDEL' { + ext.args = { '-an QD -an MQRankSum -an ReadPosRankSum -an FS -an SOR -an DP -mode INDEL' } + ext.prefix = { "${meta.id}_INDEL" } + publishDir = [ + enabled: false + ] + } + + withName: 'VARIANTRECALIBRATOR_SNP' { + ext.args = { '-an QD -an MQ -an MQRankSum -an ReadPosRankSum -an FS -an SOR -mode SNP' } + ext.prefix = { "${meta.id}_SNP" } + publishDir = [ + enabled: false + ] + } + + withName: 'GATK4_APPLYVQSR_SNP' { + ext.args = { '--truth-sensitivity-filter-level 99.9 -mode SNP' } + ext.prefix = { "${meta.id}_SNP" } + publishDir = [ + enabled: false + ] + } + + withName: 'GATK4_APPLYVQSR_INDEL' { + ext.args = { '--truth-sensitivity-filter-level 99.9 -mode INDEL' } + ext.prefix = { 'joint_germline_recalibrated' } + 
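+        // Indel recalibration is applied on top of the SNP-recalibrated VCF, so unlike GATK4_APPLYVQSR_SNP this final joint-germline output is published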
publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/haplotypecaller/joint_variant_calling/"}, + pattern: "*{vcf.gz,vcf.gz.tbi}" + ] + } +} diff --git a/conf/modules/manta.config b/conf/modules/manta.config new file mode 100644 index 0000000000..71a1c43299 --- /dev/null +++ b/conf/modules/manta.config @@ -0,0 +1,28 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ext.when = When to run the module. +---------------------------------------------------------------------------------------- +*/ + +// MANTA + +process { + if (params.tools && params.tools.split(',').contains('manta')) { + withName: 'MANTA_GERMLINE|MANTA_TUMORONLY|MANTA_SOMATIC' { + ext.args = { params.wes ? "--exome" : '' } + ext.prefix = { "${meta.id}.manta" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/manta/${meta.id}" }, + pattern: "*{diploid_sv,tumor_sv,somatic_sv}.{vcf.gz,vcf.gz.tbi}" + ] + } + } +} diff --git a/conf/modules/markduplicates.config b/conf/modules/markduplicates.config new file mode 100644 index 0000000000..c33b6a3a2a --- /dev/null +++ b/conf/modules/markduplicates.config @@ -0,0 +1,133 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ext.when = When to run the module. +---------------------------------------------------------------------------------------- +*/ + +// MARKDUPLICATES + +process { + + withName: 'CRAM_TO_BAM' { + ext.args = { '-b' } + } + + withName: 'BAM_TO_CRAM' { + // BAM provided for step Markduplicates either run through MD or Convert -> then saved as sorted.cram (convert) or md.cram (md directly) + // BAM files provided for step prepare_recal are converted and run through BQSR -> then saved as md.cram + // BAM files provided for step recal are converted and run through BQSR II -> then saved as md.cram + ext.args = { '-C' } + ext.prefix = { "${meta.id}.converted" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/preprocessing/converted/${meta.id}" }, + pattern: "*{cram,crai}", + saveAs: { !params.save_output_as_bam ? 
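+                // Publish the converted CRAM only when output was not requested as BAM: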
it : null } + ] + } + + withName: 'NFCORE_SAREK:SAREK:(BAM_MARKDUPLICATES|BAM_MARKDUPLICATES_SPARK):CRAM_QC_MOSDEPTH_SAMTOOLS:SAMTOOLS_STATS' { + ext.prefix = { "${meta.id}.md.cram" } + ext.when = { !(params.skip_tools && params.skip_tools.split(',').contains('samtools')) } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/reports/samtools/${meta.id}" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'BAM_TO_CRAM_MAPPING' { + ext.prefix = { "${meta.id}.sorted" } + // Run only when mapping should be saved as CRAM or when no MD is done + ext.when = (params.save_mapped && !params.save_output_as_bam) || + ( + (params.skip_tools && params.skip_tools.split(',').contains('markduplicates')) && + !(params.tools && params.tools.split(',').contains('sentieon_dedup')) + ) + publishDir = [ + // Never publish if BAM only should be published + mode: params.publish_dir_mode, + path: { "${params.outdir}/preprocessing/mapped/${meta.id}/" }, + pattern: "*{cram,crai}", + saveAs: { !params.save_output_as_bam ? it : null } + ] + } + + withName: 'GATK4_ESTIMATELIBRARYCOMPLEXITY' { + ext.prefix = { "${meta.id}.md.cram" } + ext.when = { !(params.skip_tools && params.skip_tools.split(',').contains('markduplicates_report')) } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/reports/markduplicates/${meta.id}" }, + pattern: "*metrics" + ] + } + + withName: 'GATK4_MARKDUPLICATES' { + ext.args = '-REMOVE_DUPLICATES false -VALIDATION_STRINGENCY LENIENT' + ext.prefix = { "${meta.id}.md.cram" } + ext.when = { !(params.skip_tools && params.skip_tools.split(',').contains('markduplicates')) } + publishDir = [ + [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/preprocessing/markduplicates/${meta.id}/" }, + pattern: "*{cram,crai}", + saveAs: { !params.save_output_as_bam ? it : null } + ], + [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/reports/" }, + pattern: "*metrics", + saveAs: { !(params.skip_tools && params.skip_tools.split(',').contains('markduplicates_report')) ? "markduplicates/${meta.id}/${it}" : null} + ] + ] + } + + withName: 'GATK4SPARK_MARKDUPLICATES' { + ext.args = { '--remove-sequencing-duplicates false -VS LENIENT' } + ext.prefix = { "${meta.id}.md.cram" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/preprocessing/markduplicates/${meta.id}/" }, + pattern: "*{cram,crai}", + saveAs: { !params.save_output_as_bam ? it : null } + ] + } + + withName: 'INDEX_MARKDUPLICATES' { + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/preprocessing/markduplicates/${meta.id}/" }, + pattern: "*{cram,crai}", + saveAs: { !params.save_output_as_bam ? it : null } + ] + } + + withName: 'NFCORE_SAREK:SAREK:CRAM_TO_BAM' { + ext.when = { params.save_output_as_bam } + if (params.tools && params.tools.split(',').contains('sentieon_dedup')) { + ext.prefix = { "${meta.id}.dedup" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/preprocessing/sentieon_dedup/${meta.id}/" }, + pattern: "*{dedup.bam,dedup.bam.bai}", + saveAs: { params.save_output_as_bam ? it : null } + ] + } else { + ext.prefix = { "${meta.id}.md" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/preprocessing/markduplicates/${meta.id}/" }, + pattern: "*{md.bam,md.bam.bai}", + saveAs: { params.save_output_as_bam ? 
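+                    // Converse of the CRAM rules above: BAMs are only written out when save_output_as_bam is set: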
it : null } + ] + } + } +} diff --git a/conf/modules/modules.config b/conf/modules/modules.config new file mode 100644 index 0000000000..f24cb481b7 --- /dev/null +++ b/conf/modules/modules.config @@ -0,0 +1,129 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ext.when = When to run the module. +---------------------------------------------------------------------------------------- +*/ + +process { + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + + withName: CUSTOM_DUMPSOFTWAREVERSIONS { + cache = false + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/pipeline_info" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + +// QC + withName: 'FASTQC' { + ext.args = { '--quiet' } + ext.when = { !(params.skip_tools && params.skip_tools.split(',').contains('fastqc')) } + publishDir = [ + [ + path: { "${params.outdir}/reports/fastqc/${meta.id}" }, + mode: params.publish_dir_mode, + pattern: "*{html,zip}" + ] + ] + } + + withName: 'MULTIQC' { + ext.args = { params.multiqc_title ? "--title \"$params.multiqc_title\"" : '' } + publishDir = [ + path: { "${params.outdir}/multiqc" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'NFCORE_SAREK:SAREK:CRAM_QC_NO_MD:SAMTOOLS_STATS' { + ext.prefix = { "${meta.id}.sorted.cram" } + ext.when = { !(params.skip_tools && params.skip_tools.split(',').contains('samtools')) } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/reports/samtools/${meta.id}" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'MOSDEPTH' { + ext.args = { !params.wes ? "-n --fast-mode --by 500" : ""} + ext.prefix = { + if (params.tools && params.tools.split(',').contains('sentieon_dedup')) { + "${meta.id}.dedup" + } else if (params.skip_tools && params.skip_tools.split(',').contains('markduplicates')) { + "${meta.id}.sorted" + } else { + "${meta.id}.md" + } + } + ext.when = { !(params.skip_tools && params.skip_tools.split(',').contains('mosdepth')) } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/reports/mosdepth/${meta.id}" }, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + + if (!(params.skip_tools && params.skip_tools.split(',').contains('baserecalibrator'))) { + withName: 'NFCORE_SAREK:SAREK:CRAM_SAMPLEQC:CRAM_QC_RECAL:MOSDEPTH' { + ext.prefix = { "${meta.id}.recal" } + } + + withName: 'NFCORE_SAREK:SAREK:CRAM_SAMPLEQC:CRAM_QC_RECAL:SAMTOOLS_STATS' { + ext.prefix = { "${meta.id}.recal.cram" } + ext.when = { !(params.skip_tools && params.skip_tools.split(',').contains('samtools')) } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/reports/samtools/${meta.id}" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + } + + // VCF + withName: 'BCFTOOLS_STATS' { + ext.prefix = { vcf.baseName - ".vcf" } + ext.when = { !(params.skip_tools && params.skip_tools.split(',').contains('bcftools')) } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/reports/bcftools/${meta.variantcaller}/${meta.id}/" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'VCFTOOLS_.*' { + ext.prefix = { variant_file.baseName - ".vcf" } + ext.when = { !(params.skip_tools && params.skip_tools.split(',').contains('vcftools')) } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/reports/vcftools/${meta.variantcaller}/${meta.id}/" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'VCFTOOLS_TSTV_COUNT' { + ext.args = { '--TsTv-by-count' } + } + + withName: 'VCFTOOLS_TSTV_QUAL' { + ext.args = { '--TsTv-by-qual' } + } + + withName: 'VCFTOOLS_SUMMARY' { + ext.args = { '--FILTER-summary' } + } +} diff --git a/conf/modules/mpileup.config b/conf/modules/mpileup.config new file mode 100644 index 0000000000..43cae7f1bd --- /dev/null +++ b/conf/modules/mpileup.config @@ -0,0 +1,88 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ext.when = When to run the module. +---------------------------------------------------------------------------------------- +*/ + +// MPILEUP + +process { + + withName: 'CAT_MPILEUP' { + publishDir = [ + enabled: false + ] + } + + withName: 'BCFTOOLS_MPILEUP' { + ext.args2 = { '--multiallelic-caller' } + ext.args3 = { "-i 'count(GT==\"RR\")==0'" } // only report non homozygous reference variants + ext.prefix = { meta.num_intervals <= 1 ? "${meta.id}.bcftools" : "${meta.id}_${intervals.simpleName}.bcftools" } + ext.when = { params.tools && params.tools.split(',').contains('mpileup') } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/bcftools/${meta.id}/" }, + pattern: "*{vcf.gz,vcf.gz.tbi}", + saveAs: { meta.num_intervals > 1 ? 
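+                // Per-interval VCFs are withheld here; MERGE_BCFTOOLS_MPILEUP below publishes the combined file: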
null : it } + ] + } + + withName: 'MERGE_BCFTOOLS_MPILEUP' { + ext.prefix = {"${meta.id}.bcftools"} + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/bcftools/${meta.id}/" }, + pattern: "*{vcf.gz,vcf.gz.tbi}" + ] + } + + withName: 'SAMTOOLS_MPILEUP' { + ext.when = { params.tools && params.tools.split(',').contains('controlfreec') } + publishDir = [ + enabled: false + ] + + } + +// PAIR_VARIANT_CALLING + if (params.tools && params.tools.split(',').contains('controlfreec')) { + withName: 'NFCORE_SAREK:SAREK:BAM_VARIANT_CALLING_GERMLINE_ALL:BAM_VARIANT_CALLING_MPILEUP:SAMTOOLS_MPILEUP' { + ext.prefix = { meta.num_intervals <= 1 ? "${meta.id}.normal" : "${meta.id}_${intervals.simpleName}.normal" } + } + + withName: 'NFCORE_SAREK:SAREK:BAM_VARIANT_CALLING_GERMLINE_ALL:BAM_VARIANT_CALLING_MPILEUP:CAT_MPILEUP' { + ext.prefix = { "${meta.id}.normal.mpileup.gz" } + } + + withName: 'NFCORE_SAREK:SAREK:BAM_VARIANT_CALLING_SOMATIC_ALL:MPILEUP_NORMAL:SAMTOOLS_MPILEUP' { + ext.prefix = { meta.num_intervals <= 1 ? "${meta.id}.normal" : "${meta.id}_${intervals.simpleName}.normal" } + } + + withName: 'NFCORE_SAREK:SAREK:BAM_VARIANT_CALLING_SOMATIC_ALL:MPILEUP_NORMAL:CAT_MPILEUP' { + ext.prefix = { "${meta.id}.normal.mpileup.gz" } + } + + withName: 'NFCORE_SAREK:SAREK:BAM_VARIANT_CALLING_SOMATIC_ALL:MPILEUP_TUMOR:SAMTOOLS_MPILEUP' { + ext.prefix = { meta.num_intervals <= 1 ? "${meta.id}.tumor" : "${meta.id}_${intervals.simpleName}.tumor" } + } + + withName: 'NFCORE_SAREK:SAREK:BAM_VARIANT_CALLING_SOMATIC_ALL:MPILEUP_TUMOR:CAT_MPILEUP' { + ext.prefix = { "${meta.id}.tumor.mpileup.gz" } + } + + withName: 'NFCORE_SAREK:SAREK:BAM_VARIANT_CALLING_TUMOR_ONLY_ALL:BAM_VARIANT_CALLING_MPILEUP:SAMTOOLS_MPILEUP' { + ext.prefix = { meta.num_intervals <= 1 ? "${meta.id}.tumor" : "${meta.id}_${intervals.simpleName}.tumor" } + } + + withName: 'NFCORE_SAREK:SAREK:BAM_VARIANT_CALLING_TUMOR_ONLY_ALL:BAM_VARIANT_CALLING_MPILEUP:CAT_MPILEUP' { + ext.prefix = { "${meta.id}.tumor.mpileup.gz" } + } + } +} diff --git a/conf/modules/msisensorpro.config b/conf/modules/msisensorpro.config new file mode 100644 index 0000000000..8253cccc50 --- /dev/null +++ b/conf/modules/msisensorpro.config @@ -0,0 +1,25 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ext.when = When to run the module. +---------------------------------------------------------------------------------------- +*/ + +// MSISENSORPRO + +process { + + withName: 'MSISENSORPRO_MSISOMATIC' { + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/msisensorpro/${meta.id}/" }, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } +} diff --git a/conf/modules/mutect2.config b/conf/modules/mutect2.config new file mode 100644 index 0000000000..2f74ee6327 --- /dev/null +++ b/conf/modules/mutect2.config @@ -0,0 +1,113 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ext.when = When to run the module. +---------------------------------------------------------------------------------------- +*/ + +// MUTECT2 + +process { + if (params.tools && params.tools.split(',').contains('mutect2')) { + + withName: 'GATK4_MUTECT2' { + ext.args = { params.ignore_soft_clipped_bases ? "--dont-use-soft-clipped-bases true --f1r2-tar-gz ${task.ext.prefix}.f1r2.tar.gz" : "--f1r2-tar-gz ${task.ext.prefix}.f1r2.tar.gz" } + ext.prefix = { meta.num_intervals <= 1 ? "${meta.id}.mutect2" : "${meta.id}.mutect2.${intervals.simpleName}" } + ext.when = { params.tools && params.tools.split(',').contains('mutect2') } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/" }, + pattern: "*{vcf.gz,vcf.gz.tbi,stats}", + saveAs: { meta.num_intervals > 1 ? null : "mutect2/${meta.id}/${it}" } + ] + } + + // PAIR_VARIANT_CALLING + withName: 'MUTECT2_PAIRED' { + ext.args = { params.ignore_soft_clipped_bases ? + "--dont-use-soft-clipped-bases true --f1r2-tar-gz ${task.ext.prefix}.f1r2.tar.gz --normal-sample ${meta.patient}_${meta.normal_id}" : + "--f1r2-tar-gz ${task.ext.prefix}.f1r2.tar.gz --normal-sample ${meta.patient}_${meta.normal_id}" } + } + + withName: 'MERGE_MUTECT2.*' { + ext.prefix = { "${meta.id}.mutect2" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/mutect2/${meta.id}" }, + pattern: "*{vcf.gz,vcf.gz.tbi}" + ] + } + + withName: 'FILTERMUTECTCALLS.*' { + ext.prefix = {"${meta.id}.mutect2.filtered"} + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/mutect2/${meta.id}" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'CALCULATECONTAMINATION' { + ext.args = { "-tumor-segmentation ${meta.id}.mutect2.segmentation.table" } + ext.prefix = { "${meta.id}.mutect2" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/mutect2/${meta.id}" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'LEARNREADORIENTATIONMODEL' { + ext.prefix = { "${meta.id}.mutect2.artifactprior" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/mutect2/${meta.id}/" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'MERGEMUTECTSTATS' { + ext.prefix = { "${meta.id}.mutect2" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/mutect2/${meta.id}/" }, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + + withName: 'GATHERPILEUPSUMMARIES.*' { + ext.prefix = { "${meta.id}.mutect2" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/mutect2/${meta.id}/" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'GETPILEUPSUMMARIES.*' { + ext.prefix = { meta.num_intervals <= 1 ? "${meta.id}.mutect2" : "${meta.id}.mutect2.${intervals.simpleName}" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/" }, + pattern: "*.table", + saveAs: { meta.num_intervals > 1 ? null : "mutect2/${meta.id}/${it}" } + ] + } + + if (params.joint_mutect2) { + withName: 'CALCULATECONTAMINATION' { + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/mutect2/${meta.patient}" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + } + } +} diff --git a/conf/modules/ngscheckmate.config b/conf/modules/ngscheckmate.config new file mode 100644 index 0000000000..4d35c94468 --- /dev/null +++ b/conf/modules/ngscheckmate.config @@ -0,0 +1,24 @@ +process { + + withName: '.*BAM_NGSCHECKMATE:BCFTOOLS_MPILEUP' { + ext.args2 = { '--no-version --ploidy 1 -c' } + ext.args3 = { '--no-version' } + ext.prefix = { "${meta.id}.ngscheckmate" } + ext.when = { params.tools && params.tools.split(',').contains('ngscheckmate') } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/reports/ngscheckmate/vcfs" }, + pattern: "*{vcf.gz}" + ] + } + + withName: '.*BAM_NGSCHECKMATE:NGSCHECKMATE_NCM' { + ext.args = { '-V' } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/reports/ngscheckmate/" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + +} diff --git a/conf/modules.config b/conf/modules/post_variant_calling.config similarity index 50% rename from conf/modules.config rename to conf/modules/post_variant_calling.config index d91c6aba0b..ee29a656ce 100644 --- a/conf/modules.config +++ b/conf/modules/post_variant_calling.config @@ -7,44 +7,43 @@ ext.args2 = Second set of arguments appended to command in module (multi-tool modules). ext.args3 = Third set of arguments appended to command in module (multi-tool modules). ext.prefix = File name prefix for output files. + ext.when = When to run the module. ---------------------------------------------------------------------------------------- */ -process { +// POSTPROCESSING VCFS +// Like, for instance, concatenating the unannotated, germline vcf-files - publishDir = [ - path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] +process { - withName: SAMPLESHEET_CHECK { + withName: 'GERMLINE_VCFS_CONCAT'{ + ext.when = { params.concatenate_vcfs } publishDir = [ - path: { "${params.outdir}/pipeline_info" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + //specify to avoid publishing, overwritten otherwise + enabled: false ] } - withName: FASTQC { - ext.args = '--quiet' - } - - withName: CUSTOM_DUMPSOFTWAREVERSIONS { + withName: 'GERMLINE_VCFS_CONCAT_SORT'{ + ext.prefix = { "${meta.id}.germline" } + ext.when = { params.concatenate_vcfs } publishDir = [ - path: { "${params.outdir}/pipeline_info" }, mode: params.publish_dir_mode, - pattern: '*_versions.yml' + path: { "${params.outdir}/variant_calling/concat/${meta.id}/" } ] } - withName: 'MULTIQC' { - ext.args = { params.multiqc_title ? "--title \"$params.multiqc_title\"" : '' } + withName: 'TABIX_EXT_VCF' { + ext.prefix = { "${input.baseName}" } + ext.when = { params.concatenate_vcfs } + } + + withName: 'TABIX_GERMLINE_VCFS_CONCAT_SORT'{ + ext.prefix = { "${meta.id}.germline" } + ext.when = { params.concatenate_vcfs } publishDir = [ - path: { "${params.outdir}/multiqc" }, mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + path: { "${params.outdir}/variant_calling/concat/${meta.id}/" } ] } - } diff --git a/conf/modules/prepare_genome.config b/conf/modules/prepare_genome.config new file mode 100644 index 0000000000..e948e1eea5 --- /dev/null +++ b/conf/modules/prepare_genome.config @@ -0,0 +1,169 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ext.when = When to run the module. +---------------------------------------------------------------------------------------- +*/ + +// PREPARE_GENOME + +process { + + withName: 'BWAMEM1_INDEX' { + ext.when = { !params.bwa && params.step == "mapping" && (params.aligner == "bwa-mem" || params.aligner == "sentieon-bwamem")} + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/reference" }, + pattern: "bwa", + saveAs: { params.save_reference || params.build_only_index ? it : null } + ] + } + + withName: 'BWAMEM2_INDEX' { + ext.when = { !params.bwamem2 && params.step == "mapping" && params.aligner == "bwa-mem2" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/reference" }, + pattern: "bwamem2", + saveAs: { params.save_reference || params.build_only_index ? it : null } + ] + } + + withName: 'CNVKIT_ANTITARGET' { + ext.when = { params.tools && params.tools.split(',').contains('cnvkit') } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/reference/cnvkit" }, + pattern: "*{bed}", + saveAs: { params.save_reference || params.build_only_index ? it : null } + ] + } + + withName: 'CNVKIT_REFERENCE' { + ext.prefix = { 'cnvkit' } + ext.when = { params.tools && params.tools.split(',').contains('cnvkit') && !params.cnvkit_reference } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/reference/cnvkit" }, + pattern: "*{cnn}", + saveAs: { params.save_reference || params.build_only_index ? 
it : null } + ] + } + + withName: 'DRAGMAP_HASHTABLE' { + ext.when = { !params.dragmap && params.step == "mapping" && params.aligner == "dragmap" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/reference" }, + pattern: "dragmap", + saveAs: { params.save_reference || params.build_only_index ? it : null } + ] + } + + withName: 'GATK4_CREATESEQUENCEDICTIONARY' { + ext.when = { !params.dict && params.step != "annotate" && params.step != "controlfreec" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/reference/dict" }, + pattern: "*dict", + saveAs: { params.save_reference || params.build_only_index ? it : null } + ] + } + + withName: 'MSISENSORPRO_SCAN' { + ext.when = { params.tools && params.tools.split(',').contains('msisensorpro') } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/reference/msi" }, + pattern: "*list", + saveAs: { params.save_reference || params.build_only_index ? it : null } + ] + } + + withName: 'SAMTOOLS_FAIDX' { + ext.when = { !params.fasta_fai && params.step != "annotate" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/reference/fai" }, + pattern: "*fai", + saveAs: { params.save_reference || params.build_only_index ? it : null } + ] + } + + withName: 'TABIX_BCFTOOLS_ANNOTATIONS' { + ext.when = { !params.bcftools_annotations_tbi && params.bcftools_annotations && params.tools && params.tools.split(',').contains('bcfann') } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/reference/bcfann" }, + pattern: "*vcf.gz.tbi", + saveAs: { params.save_reference || params.build_only_index ? it : null } + ] + } + + withName: 'TABIX_DBSNP' { + ext.when = { !params.dbsnp_tbi && params.dbsnp && ((params.step == "mapping" || params.step == "markduplicates" || params.step == "prepare_recalibration") || params.tools && (params.tools.split(',').contains('controlfreec') || params.tools.split(',').contains('haplotypecaller') || params.tools.split(',').contains('sentieon_haplotyper') || params.tools.split(',').contains('sentieon_dnascope') || params.tools.split(',').contains('mutect2'))) } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/reference/dbsnp" }, + pattern: "*vcf.gz.tbi", + saveAs: { params.save_reference || params.build_only_index ? it : null } + ] + } + + withName: 'TABIX_GERMLINE_RESOURCE' { + ext.when = { !params.germline_resource_tbi && params.germline_resource && params.tools && params.tools.split(',').contains('mutect2') } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/reference/germline_resource" }, + pattern: "*vcf.gz.tbi", + saveAs: { params.save_reference || params.build_only_index ? it : null } + ] + } + + withName: 'TABIX_KNOWN_INDELS' { + ext.when = { !params.known_indels_tbi && params.known_indels && (params.step == 'mapping' || params.step == "markduplicates" || params.step == 'prepare_recalibration' || (params.tools && (params.tools.split(',').contains('haplotypecaller') || params.tools.split(',').contains('sentieon_haplotyper') || params.tools.split(',').contains('sentieon_dnascope'))) ) } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/reference/known_indels" }, + pattern: "*vcf.gz.tbi", + saveAs: { params.save_reference || params.build_only_index ? 
it : null } + ] + } + + withName: 'TABIX_KNOWN_SNPS' { + ext.when = { !params.known_snps_tbi && params.known_snps && (params.step == 'mapping' || params.step == "markduplicates" || params.step == 'prepare_recalibration' || (params.tools && (params.tools.split(',').contains('haplotypecaller') || params.tools.split(',').contains('sentieon_haplotyper') )) ) } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/reference/known_snps" }, + pattern: "*vcf.gz.tbi", + saveAs: { params.save_reference || params.build_only_index ? it : null } + ] + } + + withName: 'TABIX_PON' { + ext.when = { !params.pon_tbi && params.pon && params.tools && params.tools.split(',').contains('mutect2') } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/reference/pon" }, + pattern: "*vcf.gz.tbi", + saveAs: { params.save_reference || params.build_only_index ? it : null } + ] + } + + withName: 'UNZIP_ALLELES|UNZIP_LOCI|UNZIP_GC|UNZIP_RT' { + ext.when = { params.tools && params.tools.split(',').contains('ascat')} + publishDir = [ + enabled: false + ] + } + + withName: 'UNTAR_CHR_DIR' { + ext.when = { params.tools && params.tools.split(',').contains('controlfreec')} + } +} diff --git a/conf/modules/prepare_intervals.config b/conf/modules/prepare_intervals.config new file mode 100644 index 0000000000..655a227f8a --- /dev/null +++ b/conf/modules/prepare_intervals.config @@ -0,0 +1,45 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ext.when = When to run the module. +---------------------------------------------------------------------------------------- +*/ + +// PREPARE INTERVALS + +process { + + withName: 'CREATE_INTERVALS_BED' { + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/reference/intervals" }, + pattern: "*bed", + saveAs: { params.save_reference || params.build_only_index ? it : null } + ] + } + + withName: 'GATK4_INTERVALLISTTOBED' { + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/reference/intervals" }, + pattern: "*bed", + saveAs: { params.save_reference || params.build_only_index ? it : null } + ] + } + + withName: 'TABIX_BGZIPTABIX_INTERVAL_SPLIT' { + ext.prefix = {"${meta.id}"} + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/reference/intervals" }, + pattern: "*bed.gz", + saveAs: { params.save_reference || params.build_only_index ? 
it : null } + ] + } +} diff --git a/conf/modules/prepare_recalibration.config b/conf/modules/prepare_recalibration.config new file mode 100644 index 0000000000..915075d5b4 --- /dev/null +++ b/conf/modules/prepare_recalibration.config @@ -0,0 +1,37 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ext.when = When to run the module. +---------------------------------------------------------------------------------------- +*/ + +// PREPARE_RECALIBRATION + +process { + + withName: 'GATK4_BASERECALIBRATOR|GATK4SPARK_BASERECALIBRATOR' { + ext.prefix = { meta.num_intervals <= 1 ? "${meta.id}.recal" : "${meta.id}_${intervals.simpleName}.recal" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/preprocessing/" }, + pattern: "*table", + saveAs: { meta.num_intervals > 1 ? null : "recal_table/${meta.id}/${it}" } + ] + } + + withName: 'GATK4_GATHERBQSRREPORTS' { + ext.prefix = {"${meta.id}.recal"} + ext.when = { meta.num_intervals > 1 } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/preprocessing/recal_table/${meta.id}/" }, + pattern: "*table", + ] + } +} diff --git a/conf/modules/recalibrate.config b/conf/modules/recalibrate.config new file mode 100644 index 0000000000..7ca0a476f1 --- /dev/null +++ b/conf/modules/recalibrate.config @@ -0,0 +1,60 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ext.when = When to run the module. +---------------------------------------------------------------------------------------- +*/ + +// RECALIBRATE + +process { + + withName: 'GATK4_APPLYBQSR|GATK4SPARK_APPLYBQSR' { + ext.prefix = { meta.num_intervals <= 1 ? "${meta.id}.recal" : "${meta.id}_${intervals.simpleName}.recal" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/preprocessing/" }, + pattern: "*cram", + saveAs: { !params.save_output_as_bam ? meta.num_intervals > 1 ? 
null : "recalibrated/${meta.id}/${it}" : null } + ] + } + + if ((params.step == 'mapping' || params.step == 'markduplicates'|| params.step == 'prepare_recalibration'|| params.step == 'recalibrate') && (!(params.skip_tools && params.skip_tools.split(',').contains('baserecalibrator')))) { + withName: 'NFCORE_SAREK:SAREK:(BAM_APPLYBQSR|BAM_APPLYBQSR_SPARK):CRAM_MERGE_INDEX_SAMTOOLS:MERGE_CRAM' { + ext.prefix = { "${meta.id}.recal" } + ext.when = { meta.num_intervals > 1 } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/preprocessing/recalibrated/${meta.id}/" }, + pattern: "*cram", + saveAs: { !params.save_output_as_bam ? it : null } + ] + } + + withName: 'NFCORE_SAREK:SAREK:(BAM_APPLYBQSR|BAM_APPLYBQSR_SPARK):CRAM_MERGE_INDEX_SAMTOOLS:INDEX_CRAM' { + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/preprocessing/recalibrated/${meta.id}/" }, + pattern: "*{recal.cram,recal.cram.crai}", + saveAs: { !params.save_output_as_bam ? it : null } + ] + } + } + + withName: 'CRAM_TO_BAM_RECAL' { + ext.prefix = { "${meta.id}.recal" } + ext.when = { params.save_output_as_bam} + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/preprocessing/recalibrated/${meta.id}/" }, + pattern: "*{recal.bam,recal.bam.bai}", + saveAs: { params.save_output_as_bam ? it : null } + ] + } +} diff --git a/conf/modules/sentieon_dedup.config b/conf/modules/sentieon_dedup.config new file mode 100644 index 0000000000..df52c3bb95 --- /dev/null +++ b/conf/modules/sentieon_dedup.config @@ -0,0 +1,49 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ext.when = When to run the module. +---------------------------------------------------------------------------------------- +*/ + +// SENTIEON_DEDUP + +process { + + withName: 'SENTIEON_DEDUP' { + ext.prefix = { "${meta.id}.dedup" } + ext.when = { params.tools && params.tools.split(',').contains('sentieon_dedup') } + publishDir = [ + [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/preprocessing/sentieon_dedup/${meta.id}/" }, + pattern: "*{cram,crai}", + saveAs: { !params.save_output_as_bam ? it : null } + ], + [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/reports/" }, + pattern: "*{metrics,metrics.multiqc.tsv}", + saveAs: { !(params.skip_tools && params.skip_tools.split(',').contains('sentieon_dedup_report')) ? "sentieon_dedup/${meta.id}/${it}" : null} + ] + ] + } + + if (params.tools && params.tools.contains('sentieon_dedup')) { + withName: 'NFCORE_SAREK:SAREK:BAM_SENTIEON_DEDUP:CRAM_QC_MOSDEPTH_SAMTOOLS:SAMTOOLS_STATS' { + ext.when = { !(params.skip_tools && params.skip_tools.split(',').contains('samtools')) } + ext.prefix = { "${meta.id}.dedup.cram" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/reports/samtools/${meta.id}" }, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + } + +} diff --git a/conf/modules/sentieon_dnascope.config b/conf/modules/sentieon_dnascope.config new file mode 100644 index 0000000000..224c33c89c --- /dev/null +++ b/conf/modules/sentieon_dnascope.config @@ -0,0 +1,56 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ext.when = When to run the module. +---------------------------------------------------------------------------------------- +*/ + +// SENTIEON DNASCOPE + +process { + + withName: 'SENTIEON_DNASCOPE' { + ext.prefix = { meta.num_intervals <= 1 ? "${meta.id}.dnascope" : "${meta.id}.dnascope.${intervals.simpleName}" } + ext.when = { params.tools && params.tools.split(',').contains('sentieon_dnascope') } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/"}, + pattern: "*{vcf.gz,vcf.gz.tbi}", + saveAs: { meta.num_intervals > 1 ? null : "sentieon_dnascope/${meta.id}/${it}" } + ] + } + + withName: 'MERGE_SENTIEON_DNASCOPE_VCFS' { + ext.prefix = { params.joint_germline ? "${meta.id}.dnascope.g" : "${meta.id}.dnascope.unfiltered" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/sentieon_dnascope/${meta.id}/" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'MERGE_SENTIEON_DNASCOPE_GVCFS' { + ext.prefix = { "${meta.id}.dnascope.g" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/sentieon_dnascope/${meta.id}/" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'SENTIEON_DNAMODELAPPLY' { + ext.prefix = { "${meta.id}.dnascope.filtered" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/sentieon_dnascope/${meta.id}/"}, + pattern: "*{vcf.gz,vcf.gz.tbi}" + ] + } + +} diff --git a/conf/modules/sentieon_dnascope_joint_germline.config b/conf/modules/sentieon_dnascope_joint_germline.config new file mode 100644 index 0000000000..1b57c0d5ed --- /dev/null +++ b/conf/modules/sentieon_dnascope_joint_germline.config @@ -0,0 +1,44 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ext.when = When to run the module. +---------------------------------------------------------------------------------------- +*/ + +// SENTIEON DNASCOPE JOINT_GERMLINE + +process { + + // TO-DO: duplicate!! 
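+ // The unscoped SENTIEON_GVCFTYPER block below also appears verbatim in
+ // sentieon_haplotyper_joint_germline.config; since both config files are loaded
+ // unconditionally, the two copies need to be kept in sync (or factored into a
+ // shared include) so that one does not silently override the other.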
+ withName: 'SENTIEON_GVCFTYPER' { + ext.args = { '--allow-old-rms-mapping-quality-annotation-data' } + ext.prefix = { meta.intervals_name } + publishDir = [ + enabled: false + ] + } + + if (params.tools && params.tools.contains('sentieon_dnascope') && params.joint_germline) { + withName: 'NFCORE_SAREK:SAREK:BAM_VARIANT_CALLING_GERMLINE_ALL:BAM_JOINT_CALLING_GERMLINE_SENTIEON:BCFTOOLS_SORT' { + ext.prefix = { vcf.baseName - ".vcf" + ".sort" } + publishDir = [ + enabled: false + ] + } + + withName: 'NFCORE_SAREK:SAREK:BAM_VARIANT_CALLING_GERMLINE_ALL:BAM_JOINT_CALLING_GERMLINE_SENTIEON:MERGE_GENOTYPEGVCFS' { + ext.prefix = { 'joint_germline' } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/sentieon_dnascope/joint_variant_calling/" }, + pattern: "*{vcf.gz,vcf.gz.tbi}" + ] + } + } +} diff --git a/conf/modules/sentieon_haplotyper.config b/conf/modules/sentieon_haplotyper.config new file mode 100644 index 0000000000..8b01b04c22 --- /dev/null +++ b/conf/modules/sentieon_haplotyper.config @@ -0,0 +1,59 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ext.when = When to run the module. +---------------------------------------------------------------------------------------- +*/ + +// SENTIEON HAPLOTYPER + +process { + + withName: 'SENTIEON_HAPLOTYPER' { + ext.prefix = { meta.num_intervals <= 1 ? "${meta.id}.haplotyper" : "${meta.id}.haplotyper.${intervals.simpleName}" } + ext.when = { params.tools && params.tools.split(',').contains('sentieon_haplotyper') } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/"}, + pattern: "*{vcf.gz,vcf.gz.tbi}", + saveAs: { meta.num_intervals > 1 ? null : "sentieon_haplotyper/${meta.id}/${it}" } + ] + } + + withName: 'MERGE_SENTIEON_HAPLOTYPER_VCFS' { + ext.prefix = { params.joint_germline ? "${meta.id}.haplotyper.g" : "${meta.id}.haplotyper.unfiltered" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/sentieon_haplotyper/${meta.id}/" }, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'MERGE_SENTIEON_HAPLOTYPER_GVCFS' { + ext.prefix = { "${meta.id}.haplotyper.g" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/sentieon_haplotyper/${meta.id}/" }, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + + if (params.tools && params.tools.contains('sentieon_haplotyper')) { + withName: '.*:SENTIEON_HAPLOTYPER_VCF_VARIANT_FILTERING_GATK:FILTERVARIANTTRANCHES' { + ext.args = { "--info-key CNN_1D" } + ext.prefix = { "${meta.id}.haplotyper" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/sentieon_haplotyper/${meta.id}/"}, + pattern: "*{vcf.gz,vcf.gz.tbi}" + ] + } + } + +} diff --git a/conf/modules/sentieon_haplotyper_joint_germline.config b/conf/modules/sentieon_haplotyper_joint_germline.config new file mode 100644 index 0000000000..1f7bd4ba5a --- /dev/null +++ b/conf/modules/sentieon_haplotyper_joint_germline.config @@ -0,0 +1,75 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ext.when = When to run the module. +---------------------------------------------------------------------------------------- +*/ + +// SENTIEON JOINT_GERMLINE + +process { + + withName: 'SENTIEON_GVCFTYPER' { + ext.args = { '--allow-old-rms-mapping-quality-annotation-data' } + ext.prefix = { meta.intervals_name } + publishDir = [ + enabled: false + ] + } + + if (params.tools && params.tools.contains('sentieon_haplotyper') && params.joint_germline) { + withName: 'NFCORE_SAREK:SAREK:BAM_VARIANT_CALLING_GERMLINE_ALL:BAM_JOINT_CALLING_GERMLINE_SENTIEON:BCFTOOLS_SORT' { + ext.prefix = { vcf.baseName - ".vcf" + ".sort" } + publishDir = [ + enabled: false + ] + } + + withName: 'NFCORE_SAREK:SAREK:BAM_VARIANT_CALLING_GERMLINE_ALL:BAM_JOINT_CALLING_GERMLINE_SENTIEON:MERGE_GENOTYPEGVCFS' { + ext.prefix = { 'joint_germline' } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/sentieon_haplotyper/joint_variant_calling/" }, + pattern: "*{vcf.gz,vcf.gz.tbi}" + ] + } + } + + withName: 'SENTIEON_VARCAL_INDEL' { + ext.args = { '--annotation QD --annotation MQRankSum --annotation ReadPosRankSum --annotation FS --annotation SOR --annotation DP --var_type INDEL' } + ext.prefix = { "${meta.id}_INDEL" } + publishDir = [ + enabled: false + ] + } + + withName: 'SENTIEON_APPLYVARCAL_INDEL' { + ext.args = { '--sensitivity 99.9 --var_type INDEL' } + ext.prefix = { 'joint_germline_recalibrated' } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/sentieon_haplotyper/joint_variant_calling/"}, + pattern: "*{vcf.gz,vcf.gz.tbi}" + ] + } + + withName: 'SENTIEON_VARCAL_SNP' { + ext.args = { '--annotation QD --annotation MQRankSum --annotation ReadPosRankSum --annotation FS --annotation SOR --annotation DP --var_type SNP' } + ext.prefix = { "${meta.id}_SNP" } + publishDir = [ + enabled: false + ] + } + + withName: 'SENTIEON_APPLYVARCAL_SNP' { + ext.args = { '--sensitivity 99.9 --var_type SNP' } + ext.prefix = { "${meta.id}_SNP" } + } + +} diff --git a/conf/modules/strelka.config b/conf/modules/strelka.config new file mode 100644 index 0000000000..23620dfdaf --- /dev/null +++ b/conf/modules/strelka.config @@ -0,0 +1,53 @@ +/* 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ext.when = When to run the module. +---------------------------------------------------------------------------------------- +*/ + +// STRELKA + +process { + + withName: 'STRELKA_.*' { + ext.args = { params.wes ? '--exome' : '' } + ext.prefix = { meta.num_intervals <= 1 ? "${meta.id}.strelka" : "${meta.id}.strelka.${target_bed.simpleName}" } + ext.when = { params.tools && params.tools.split(',').contains('strelka') } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/" }, + pattern: "*{vcf.gz,vcf.gz.tbi}", + saveAs: { meta.num_intervals > 1 ? null : "strelka/${meta.id}/${it}" } + ] + } + + withName: 'MERGE_STRELKA.*' { + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/strelka/${meta.id}/" }, + pattern: "*{vcf.gz,vcf.gz.tbi}" + ] + } + + withName: 'MERGE_STRELKA' { + ext.prefix = { "${meta.id}.strelka.variants" } + } + + withName: 'MERGE_STRELKA_GENOME' { + ext.prefix = { "${meta.id}.strelka.genome" } + } + + // PAIR_VARIANT_CALLING + withName: 'MERGE_STRELKA_INDELS' { + ext.prefix = { "${meta.id}.strelka.somatic_indels" } + } + withName: 'MERGE_STRELKA_SNVS' { + ext.prefix = { "${meta.id}.strelka.somatic_snvs" } + } +} diff --git a/conf/modules/tiddit.config b/conf/modules/tiddit.config new file mode 100644 index 0000000000..6a6060ba61 --- /dev/null +++ b/conf/modules/tiddit.config @@ -0,0 +1,58 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ext.when = When to run the module. +---------------------------------------------------------------------------------------- +*/ + +// TIDDIT + +process { + + withName: 'TIDDIT_SV' { + ext.args = { bwa_index ? 
'' : '--skip_assembly' } + ext.prefix = { "${meta.id}.tiddit" } + ext.when = { params.tools && params.tools.split(',').contains('tiddit') } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/tiddit/${meta.id}/" }, + pattern: "*tab" + ] + } + + withName : 'TABIX_BGZIP_TIDDIT_SV' { + ext.prefix = { "${meta.id}.tiddit" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/tiddit/${meta.id}/" }, + pattern: "*{vcf.gz,vcf.gz.tbi}" + ] + } + + // PAIR_VARIANT_CALLING + if (params.tools && params.tools.split(',').contains('tiddit')) { + withName: 'NFCORE_SAREK:SAREK:BAM_VARIANT_CALLING_SOMATIC_ALL:BAM_VARIANT_CALLING_SOMATIC_TIDDIT:TIDDIT_NORMAL:TABIX_BGZIP_TIDDIT_SV' { + ext.prefix = {"${meta.id}.tiddit.normal"} + } + + withName: 'NFCORE_SAREK:SAREK:BAM_VARIANT_CALLING_SOMATIC_ALL:BAM_VARIANT_CALLING_SOMATIC_TIDDIT:TIDDIT_TUMOR:TABIX_BGZIP_TIDDIT_SV' { + ext.prefix = {"${meta.id}.tiddit.tumor"} + } + + // SVDB + withName: 'NFCORE_SAREK:SAREK:BAM_VARIANT_CALLING_SOMATIC_ALL:BAM_VARIANT_CALLING_SOMATIC_TIDDIT:SVDB_MERGE' { + ext.prefix = { "${meta.id}.tiddit" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/tiddit/${meta.id}/" }, + pattern: "*vcf.gz" + ] + } + } +} diff --git a/conf/modules/trimming.config b/conf/modules/trimming.config new file mode 100644 index 0000000000..58be3b2539 --- /dev/null +++ b/conf/modules/trimming.config @@ -0,0 +1,42 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ext.when = When to run the module. +---------------------------------------------------------------------------------------- +*/ + +// TRIMMING + +process { + + withName: 'FASTP' { + ext.args = [ '', + !params.trim_fastq ? '--disable_adapter_trimming' : '', // Disable adapter trimming + params.clip_r1 > 0 ? "--trim_front1 ${params.clip_r1}" : '', // Remove bp from the 5' end of read 1 + params.clip_r2 > 0 ? "--trim_front2 ${params.clip_r2}" : '', // Remove bp from the 5' end of read 2 + params.three_prime_clip_r1 > 0 ? "--trim_tail1 ${params.three_prime_clip_r1}" : '', // Remove bp from the 3' end of read 1 AFTER adapter/quality trimming has been performed + params.three_prime_clip_r2 > 0 ? "--trim_tail2 ${params.three_prime_clip_r2}" : '', // Remove bp from the 3' end of read 2 AFTER adapter/quality trimming has been performed + params.trim_nextseq ? '--trim_poly_g' : '', // Apply the --nextseq=X option, to trim based on quality after removing poly-G tails + params.split_fastq > 0 ? "--split_by_lines ${params.split_fastq * 4}" : '' + ].join(' ').trim() + publishDir = [ + [ + path: { "${params.outdir}/reports/fastp/${meta.sample}" }, + mode: params.publish_dir_mode, + pattern: "*.{html,json,log}" + ], + [ + path: { "${params.outdir}/preprocessing/fastp/${meta.sample}/" }, + mode: params.publish_dir_mode, + pattern: "*.fastp.fastq.gz", + saveAs: { params.save_trimmed || params.save_split_fastqs ? 
it : null } + ] + ] + } +} diff --git a/conf/modules/umi.config b/conf/modules/umi.config new file mode 100644 index 0000000000..7973dd16d8 --- /dev/null +++ b/conf/modules/umi.config @@ -0,0 +1,84 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ext.when = When to run the module. +---------------------------------------------------------------------------------------- +*/ + +// UMI + +process { + + withName: 'FASTQTOBAM' { + ext.args = { "--read-structures $params.umi_read_structure" } + ext.prefix = {"${meta.id}"} + publishDir = [ + //specify to avoid publishing, overwritten otherwise + enabled: false + ] + } + + withName: 'BAM2FASTQ' { + ext.args = '-T RX' + ext.when = { params.umi_read_structure } + publishDir = [ + //specify to avoid publishing, overwritten otherwise + enabled: false + ] + } + + if (params.umi_read_structure) { + withName: 'NFCORE_SAREK:SAREK:FASTQ_CREATE_UMI_CONSENSUS_FGBIO:ALIGN_UMI:BWAMEM.*_MEM' { + ext.args = { "-K 100000000 -p -C -Y -R ${meta.read_group}" } + ext.args2 = { '-bS' } + ext.prefix = { "${meta.id}.umi_unsorted" } + publishDir = [ + //specify to avoid publishing, overwritten otherwise + enabled: false + ] + } + + withName: 'NFCORE_SAREK:SAREK:FASTQ_CREATE_UMI_CONSENSUS_FGBIO:ALIGN_UMI:DRAGMAP_ALIGN' { + ext.args2 = { '-bS' } + ext.prefix = { "${meta.id}.umi_unsorted" } + publishDir = [ + //specify to avoid publishing, overwritten otherwise + enabled: false + ] + } + } + + withName: 'SAMBLASTER' { + ext.args = { '-M --addMateTags' } + ext.prefix = { "${meta.id}_unsorted_tagged" } + publishDir = [ + //specify to avoid publishing, overwritten otherwise + enabled: false + ] + } + + withName: 'GROUPREADSBYUMI' { + publishDir = [ + [ path: { "${params.outdir}/reports/umi/" }, + mode: params.publish_dir_mode, + pattern: "*.{txt}" + ] + ] + } + + withName: 'CALLUMICONSENSUS' { + ext.args = { '-M 1 -S Coordinate' } + ext.prefix = { "${meta.id}_umi-consensus" } + publishDir = [ + path: { "${params.outdir}/preprocessing/umi/${meta.sample}" }, + mode: params.publish_dir_mode, + pattern: "*.{bam}" + ] + } +} diff --git a/conf/test.config b/conf/test.config index 9f40765fa6..2612b92cd0 100644 --- a/conf/test.config +++ b/conf/test.config @@ -5,9 +5,8 @@ Defines input files and everything required to run a fast and simple pipeline test. 
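 This base profile pins a single-sample FASTQ samplesheet and small remote
 reference files so the whole pipeline can finish on minimal CI resources; the
 focused profiles under conf/test/ are combined with it to cover the other
 entry points.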
Use as follows: - nextflow run nf-core/sarek -profile test, --outdir - ----------------------------------------------------------------------------------------- + nextflow run nf-core/sarek -profile test,, --outdir +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ params { @@ -16,14 +15,108 @@ params { // Limit resources so that this can run on GitHub Actions max_cpus = 2 - max_memory = '6.GB' - max_time = '6.h' + max_memory = '6.5GB' + max_time = '8.h' // Input data - // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets - // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv' + input = "${projectDir}/tests/csv/3.0/fastq_single.csv" + + // No AWS iGenomes + genome = null + igenomes_ignore = true + + // Small reference genome + bcftools_annotations = "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/sarscov2/illumina/vcf/test2.vcf.gz" + bcftools_annotations_tbi = "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/sarscov2/illumina/vcf/test2.vcf.gz.tbi" + bcftools_header_lines = "${projectDir}/tests/config/bcfann_test_header.txt" + dbsnp = "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/dbsnp_146.hg38.vcf.gz" + fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta" + germline_resource = "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/gnomAD.r2.1.1.vcf.gz" + intervals = "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.interval_list" + known_indels = "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/vcf/mills_and_1000G.indels.vcf.gz" + ngscheckmate_bed = "https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/chr21/germlineresources/SNP_GRCh38_hg38_wChr.bed" + snpeff_cache = null + snpeff_db = 105 + snpeff_genome = 'WBcel235' + vep_cache = null + vep_cache_version = 110 + vep_genome = 'WBcel235' + vep_species = 'caenorhabditis_elegans' + + // Sentieon + sentieon_dnascope_model = "s3://ngi-igenomes/igenomes/Homo_sapiens/GATK/GRCh38/Annotation/Sentieon/SentieonDNAscopeModel1.1.model" + + // default params + split_fastq = 0 // no FASTQ splitting + tools = 'strelka' // Variant calling with Strelka + + // Ignore params that will throw warning through params validation + validationSchemaIgnoreParams = 'genomes' +} + +process { + + withName:'.*:FREEC_SOMATIC'{ + ext.args = { + [ + "sample":[ + inputformat: "pileup", + mateorientation: "FR" + ], + "general":[ + bedgraphoutput: "TRUE", + noisydata: "TRUE", + minexpectedgc: "0", + readcountthreshold: "1", + sex: meta.sex, + window: "10", + ], + "control":[ + inputformat: "pileup", + mateorientation: "FR" + ] + ] + } + } + + withName: '.*:MUTECT2_PAIRED'{ + //sample name from when the test data was generated + ext.args = { "--f1r2-tar-gz ${task.ext.prefix}.f1r2.tar.gz --normal-sample normal" } + } + + withName: '.*:FILTERVARIANTTRANCHES'{ + ext.args = { "--info-key CNN_1D --indel-tranche 0" } + } +} + - // Genome references - genome = 'R64-1-1' +// Enable container engines/virtualisation envs for CI testing +// only works when specified with 
the profile ENV +// otherwise tests can be done with the regular provided profiles +if (System.getenv('PROFILE')) { + if ("$PROFILE" == "conda") { + conda.createTimeout = "120 min" + conda.enabled = true + charliecloud.enabled = false + docker.enabled = false + podman.enabled = false + shifter.enabled = false + singularity.enabled = false + } else if ("$PROFILE" == "docker") { + conda.enabled = false + docker.enabled = true + docker.userEmulation = { params.use_gatk_spark ? false : true }.call() + charliecloud.enabled = false + podman.enabled = false + shifter.enabled = false + singularity.enabled = false + } else if ("$PROFILE" == "singularity") { + conda.enabled = false + singularity.autoMounts = true + singularity.enabled = true + charliecloud.enabled = false + docker.enabled = false + podman.enabled = false + shifter.enabled = false + } } diff --git a/conf/test/alignment_to_fastq.config b/conf/test/alignment_to_fastq.config new file mode 100644 index 0000000000..22937bdee0 --- /dev/null +++ b/conf/test/alignment_to_fastq.config @@ -0,0 +1,15 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/sarek -profile test,, --outdir +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +params { + input = "${projectDir}/tests/csv/3.0/bam_for_remapping.csv" + tools = null +} diff --git a/conf/test/annotation.config b/conf/test/annotation.config new file mode 100644 index 0000000000..d4ac853ea2 --- /dev/null +++ b/conf/test/annotation.config @@ -0,0 +1,16 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/sarek -profile test,, --outdir +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +params { + input = "${projectDir}/tests/csv/3.0/vcf_single.csv" + step = 'annotate' + tools = null // vep, snpeff and/or merge should be specified on the command line +} diff --git a/conf/test/cache.config b/conf/test/cache.config new file mode 100644 index 0000000000..12853ad129 --- /dev/null +++ b/conf/test/cache.config @@ -0,0 +1,133 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
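+ This variant of the base test profile resolves its reference files through the
+ nf-core/modules test-data map (params.test_data) instead of hard-coded URLs,
+ and honours the TEST_DATA_BASE environment variable for re-pointing the test data.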
+ + Use as follows: + nextflow run nf-core/sarek -profile test,, --outdir +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +if (System.getenv('TEST_DATA_BASE')) { + if ("$TEST_DATA_BASE") { + params.test_data_base = "$TEST_DATA_BASE" + } +} + +try { + includeConfig "https://raw.githubusercontent.com/nf-core/modules/master/tests/config/test_data.config" +} catch (Exception e) { + System.err.println("WARNING: Could not load nf-core/modules test data config") +} + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.5GB' + max_time = '8.h' + + // Input data + input = "${projectDir}/tests/csv/3.0/fastq_single.csv" + + // No AWS iGenomes + genome = null + igenomes_ignore = true + + // Small reference genome + bcftools_annotations = params.test_data['sarscov2']['illumina']['test2_vcf_gz'] + bcftools_annotations_tbi = params.test_data['sarscov2']['illumina']['test2_vcf_gz_tbi'] + bcftools_header_lines = "${projectDir}/tests/config/bcfann_test_header.txt" + dbsnp = params.test_data['homo_sapiens']['genome']['dbsnp_146_hg38_vcf_gz'] + fasta = params.test_data['homo_sapiens']['genome']['genome_fasta'] + germline_resource = params.test_data['homo_sapiens']['genome']['gnomad_r2_1_1_vcf_gz'] + intervals = params.test_data['homo_sapiens']['genome']['genome_interval_list'] + known_indels = params.test_data['homo_sapiens']['genome']['mills_and_1000g_indels_vcf_gz'] + ngscheckmate_bed = params.test_data['homo_sapiens']['genome']['ngscheckmate_bed'] + snpeff_cache = null + snpeff_db = 105 + snpeff_genome = 'WBcel235' + vep_cache = null + vep_cache_version = 110 + vep_genome = 'WBcel235' + vep_species = 'caenorhabditis_elegans' + + // Sentieon + sentieon_dnascope_model = "s3://ngi-igenomes/igenomes/Homo_sapiens/GATK/GRCh38/Annotation/Sentieon/SentieonDNAscopeModel1.1.model" + + // default params + split_fastq = 0 // no FASTQ splitting + tools = 'strelka' // Variant calling with Strelka + + // Ignore params that will throw warning through params validation + validationSchemaIgnoreParams = 'genomes,test_data' +} + +process { + // This must contain .* in order to properly overwrite the standard config in test cases + withName:'.*:FREEC_SOMATIC'{ + ext.args = { + [ + "sample":[ + inputformat: "pileup", + mateorientation: "FR" + ], + "general":[ + bedgraphoutput: "TRUE", + noisydata: "TRUE", + minexpectedgc: "0", + readcountthreshold: "1", + sex: meta.sex, + window: "10", + ], + "control":[ + inputformat: "pileup", + mateorientation: "FR" + ] + ] + } + } + + withName: '.*:MUTECT2_PAIRED'{ + //sample name from when the test data was generated + ext.args = { "--f1r2-tar-gz ${task.ext.prefix}.f1r2.tar.gz --normal-sample normal" } + } + + withName: '.*:FILTERVARIANTTRANCHES'{ + ext.args = { "--info-key CNN_1D --indel-tranche 0" } + } +} + +// Enable container engines/virtualisation envs for CI testing +// only works when specified with the profile ENV +// otherwise tests can be done with the regular provided profiles +if (System.getenv('PROFILE')) { + if ("$PROFILE" == "conda") { + conda.createTimeout = "120 min" + conda.enabled = true + charliecloud.enabled = false + docker.enabled = false + podman.enabled = false + shifter.enabled = false + singularity.enabled = false + } else if ("$PROFILE" == "docker") { + conda.enabled = false + docker.enabled = true + docker.userEmulation = { 
params.use_gatk_spark ? false : true }.call() + charliecloud.enabled = false + podman.enabled = false + shifter.enabled = false + singularity.enabled = false + } else if ("$PROFILE" == "singularity") { + conda.enabled = false + singularity.autoMounts = true + singularity.enabled = true + charliecloud.enabled = false + docker.enabled = false + podman.enabled = false + shifter.enabled = false + } +} diff --git a/conf/test/markduplicates_bam.config b/conf/test/markduplicates_bam.config new file mode 100644 index 0000000000..16060a2ba8 --- /dev/null +++ b/conf/test/markduplicates_bam.config @@ -0,0 +1,16 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/sarek -profile test,, --outdir +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +params { + input = "${projectDir}/tests/csv/3.0/mapped_single_bam.csv" + step = 'markduplicates' + tools = null +} diff --git a/conf/test/markduplicates_cram.config b/conf/test/markduplicates_cram.config new file mode 100644 index 0000000000..e8f1d7c6f3 --- /dev/null +++ b/conf/test/markduplicates_cram.config @@ -0,0 +1,16 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/sarek -profile test,, --outdir +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +params { + input = "${projectDir}/tests/csv/3.0/mapped_single_cram.csv" + step = 'markduplicates' + tools = null +} diff --git a/conf/test/no_intervals.config b/conf/test/no_intervals.config new file mode 100644 index 0000000000..a82f837e4b --- /dev/null +++ b/conf/test/no_intervals.config @@ -0,0 +1,16 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/sarek -profile test,, --outdir +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +params { + intervals = null + no_intervals = true + tools = null +} diff --git a/conf/test/pair.config b/conf/test/pair.config new file mode 100644 index 0000000000..d514fa04a7 --- /dev/null +++ b/conf/test/pair.config @@ -0,0 +1,15 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
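+ This profile layers on the base test profile and runs the default preprocessing
+ route on the paired-sample FASTQ samplesheet (tests/csv/3.0/fastq_pair.csv),
+ with no variant callers enabled.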
+ + Use as follows: + nextflow run nf-core/sarek -profile test,, --outdir +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +params { + input = "${projectDir}/tests/csv/3.0/fastq_pair.csv" + tools = null +} diff --git a/conf/test/prepare_recalibration_bam.config b/conf/test/prepare_recalibration_bam.config new file mode 100644 index 0000000000..20a209b438 --- /dev/null +++ b/conf/test/prepare_recalibration_bam.config @@ -0,0 +1,16 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/sarek -profile test,, --outdir +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +params { + input = "${projectDir}/tests/csv/3.0/mapped_single_bam.csv" + step = 'prepare_recalibration' + tools = null +} diff --git a/conf/test/prepare_recalibration_cram.config b/conf/test/prepare_recalibration_cram.config new file mode 100644 index 0000000000..ccab4977c9 --- /dev/null +++ b/conf/test/prepare_recalibration_cram.config @@ -0,0 +1,16 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/sarek -profile test,, --outdir +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +params { + input = "${projectDir}/tests/csv/3.0/mapped_single_cram.csv" + step = 'prepare_recalibration' + tools = null +} diff --git a/conf/test/recalibrate_bam.config b/conf/test/recalibrate_bam.config new file mode 100644 index 0000000000..fd26476e3c --- /dev/null +++ b/conf/test/recalibrate_bam.config @@ -0,0 +1,16 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/sarek -profile test,, --outdir +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +params { + input = "${projectDir}/tests/csv/3.0/prepare_recalibration_single_bam.csv" + step = 'recalibrate' + tools = null +} diff --git a/conf/test/recalibrate_cram.config b/conf/test/recalibrate_cram.config new file mode 100644 index 0000000000..cad0611765 --- /dev/null +++ b/conf/test/recalibrate_cram.config @@ -0,0 +1,16 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
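+ This profile restarts the pipeline at the 'recalibrate' step from CRAM input
+ (tests/csv/3.0/prepare_recalibration_single_cram.csv), the CRAM counterpart of
+ recalibrate_bam.config.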
+ + Use as follows: + nextflow run nf-core/sarek -profile test,, --outdir +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +params { + input = "${projectDir}/tests/csv/3.0/prepare_recalibration_single_cram.csv" + step = 'recalibrate' + tools = null +} diff --git a/conf/test/save_bam_mapped.config b/conf/test/save_bam_mapped.config new file mode 100644 index 0000000000..e80aba6844 --- /dev/null +++ b/conf/test/save_bam_mapped.config @@ -0,0 +1,15 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/sarek -profile test,, --outdir +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +params { + save_bam_mapped = true + tools = null +} diff --git a/conf/test/sentieon_dedup_bam.config b/conf/test/sentieon_dedup_bam.config new file mode 100644 index 0000000000..8b8179debc --- /dev/null +++ b/conf/test/sentieon_dedup_bam.config @@ -0,0 +1,16 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/sarek -profile test,, --outdir +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +params { + input = "${projectDir}/tests/csv/3.0/mapped_single_bam.csv" + step = 'markduplicates' + tools = 'sentieon_dedup' +} diff --git a/conf/test/sentieon_dedup_cram.config b/conf/test/sentieon_dedup_cram.config new file mode 100644 index 0000000000..7476c37f1f --- /dev/null +++ b/conf/test/sentieon_dedup_cram.config @@ -0,0 +1,16 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/sarek -profile test,, --outdir +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +params { + input = "${projectDir}/tests/csv/3.0/mapped_single_cram.csv" + step = 'markduplicates' + tools = 'sentieon_dedup' +} diff --git a/conf/test/skip_bqsr.config b/conf/test/skip_bqsr.config new file mode 100644 index 0000000000..3e1444b31b --- /dev/null +++ b/conf/test/skip_bqsr.config @@ -0,0 +1,15 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
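+ This profile exercises the preprocessing route with base quality score
+ recalibration disabled via skip_tools = "baserecalibrator".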
+ + Use as follows: + nextflow run nf-core/sarek -profile test,, --outdir +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +params { + skip_tools = "baserecalibrator" + tools = null +} diff --git a/conf/test/skip_markduplicates.config b/conf/test/skip_markduplicates.config new file mode 100644 index 0000000000..fbb677837a --- /dev/null +++ b/conf/test/skip_markduplicates.config @@ -0,0 +1,15 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/sarek -profile test,, --outdir +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +params { + skip_tools = "markduplicates" + tools = null +} diff --git a/conf/test/split_fastq.config b/conf/test/split_fastq.config new file mode 100644 index 0000000000..3c78ecc652 --- /dev/null +++ b/conf/test/split_fastq.config @@ -0,0 +1,16 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/sarek -profile test,, --outdir +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +params { + save_split_fastqs = true + split_fastq = 150000 + tools = null +} diff --git a/conf/test/targeted.config b/conf/test/targeted.config new file mode 100644 index 0000000000..80fa1759d1 --- /dev/null +++ b/conf/test/targeted.config @@ -0,0 +1,17 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/sarek -profile test,, --outdir +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +params { + intervals = params.test_data['homo_sapiens']['genome']['genome_multi_interval_bed'] + nucleotides_per_second = 20 + tools = null + wes = true +} diff --git a/conf/test/tools.config b/conf/test/tools.config new file mode 100644 index 0000000000..a91389e27b --- /dev/null +++ b/conf/test/tools.config @@ -0,0 +1,23 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
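+ This profile starts at the 'variant_calling' step from recalibrated CRAMs on a
+ chromosome-21 mini-reference; the caller itself is expected to be supplied with
+ --tools on the command line.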
+
+    Use as follows:
+        nextflow run nf-core/sarek -profile test,<extra_test_profiles>,<docker/singularity> --outdir <OUTDIR>
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
+
+params {
+    input                  = "${projectDir}/tests/csv/3.0/recalibrated.csv"
+    dbsnp                  = params.test_data['homo_sapiens']['genome']['dbsnp_138_hg38_21_vcf_gz']
+    fasta                  = params.test_data['homo_sapiens']['genome']['genome_21_fasta']
+    germline_resource      = params.test_data['homo_sapiens']['genome']['gnomad_r2_1_1_21_vcf_gz']
+    intervals              = params.test_data['homo_sapiens']['genome']['genome_21_multi_interval_bed']
+    pon                    = params.test_data['homo_sapiens']['genome']['mills_and_1000g_indels_21_vcf_gz']
+    nucleotides_per_second = 20
+    step                   = 'variant_calling'
+    tools                  = null
+    wes                    = true
+}
diff --git a/conf/test/tools_germline.config b/conf/test/tools_germline.config
new file mode 100644
index 0000000000..31cb79cd93
--- /dev/null
+++ b/conf/test/tools_germline.config
@@ -0,0 +1,25 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running minimal tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a fast and simple pipeline test.
+
+    Use as follows:
+        nextflow run nf-core/sarek -profile test,<extra_test_profiles>,<docker/singularity> --outdir <OUTDIR>
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
+
+params {
+    input                  = "${projectDir}/tests/csv/3.0/recalibrated_germline.csv"
+    dbsnp                  = params.test_data['homo_sapiens']['genome']['dbsnp_138_hg38_21_vcf_gz']
+    fasta                  = params.test_data['homo_sapiens']['genome']['genome_21_fasta']
+    intervals              = params.test_data['homo_sapiens']['genome']['genome_21_multi_interval_bed']
+    known_indels           = params.test_data['homo_sapiens']['genome']['mills_and_1000g_indels_21_vcf_gz']
+    known_indels_vqsr      = "--resource:1000G,known=false,training=true,truth=true,prior=10.0 mills_and_1000G.indels.hg38.vcf.gz"
+    known_snps             = params.test_data['homo_sapiens']['genome']['hapmap_3_3_hg38_21_vcf_gz']
+    known_snps_vqsr        = "--resource:hapmap,known=false,training=true,truth=true,prior=10.0 hapmap_3.3.hg38.vcf.gz"
+    nucleotides_per_second = 20
+    step                   = 'variant_calling'
+    tools                  = null
+    wes                    = true
+}
diff --git a/conf/test/tools_somatic.config b/conf/test/tools_somatic.config
new file mode 100644
index 0000000000..93d4df8c19
--- /dev/null
+++ b/conf/test/tools_somatic.config
@@ -0,0 +1,24 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running minimal tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a fast and simple pipeline test.
+
+    Use as follows:
+        nextflow run nf-core/sarek -profile test,<extra_test_profiles>,<docker/singularity> --outdir <OUTDIR>
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
+
+params {
+    input                  = "${projectDir}/tests/csv/3.0/recalibrated_somatic.csv"
+    chr_dir                = params.test_data['homo_sapiens']['genome']['genome_21_chromosomes_dir']
+    dbsnp                  = params.test_data['homo_sapiens']['genome']['dbsnp_138_hg38_21_vcf_gz']
+    fasta                  = params.test_data['homo_sapiens']['genome']['genome_21_fasta']
+    germline_resource      = params.test_data['homo_sapiens']['genome']['gnomad_r2_1_1_21_vcf_gz']
+    intervals              = params.test_data['homo_sapiens']['genome']['genome_21_multi_interval_bed']
+    pon                    = params.test_data['homo_sapiens']['genome']['mills_and_1000g_indels_21_vcf_gz']
+    nucleotides_per_second = 20
+    step                   = 'variant_calling'
+    tools                  = null
+    wes                    = true
+}
diff --git a/conf/test/tools_somatic_ascat.config b/conf/test/tools_somatic_ascat.config
new file mode 100644
index 0000000000..3f37bd4b3f
--- /dev/null
+++ b/conf/test/tools_somatic_ascat.config
@@ -0,0 +1,24 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running minimal tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a fast and simple pipeline test.
+
+    Use as follows:
+        nextflow run nf-core/sarek -profile test,<extra_test_profiles>,<docker/singularity> --outdir <OUTDIR>
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
+
+params {
+    input                = "${projectDir}/tests/csv/3.0/ascat_somatic.csv"
+    genome               = 'GATK.GRCh37'
+    igenomes_ignore      = false
+    ascat_loci           = "G1000_loci_hg19.zip"
+    ascat_min_base_qual  = 30
+    chr_dir              = params.test_data['homo_sapiens']['genome']['genome_21_chromosomes_dir']
+    germline_resource    = params.test_data['homo_sapiens']['genome']['gnomad_r2_1_1_21_vcf_gz']
+    intervals            = params.test_data['homo_sapiens']['genome']['genome_21_multi_interval_bed']
+    step                 = 'variant_calling'
+    tools                = 'ascat'
+    wes                  = false
+}
diff --git a/conf/test/tools_tumoronly.config b/conf/test/tools_tumoronly.config
new file mode 100644
index 0000000000..b113fa76fd
--- /dev/null
+++ b/conf/test/tools_tumoronly.config
@@ -0,0 +1,23 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running minimal tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a fast and simple pipeline test.
+
+    Use as follows:
+        nextflow run nf-core/sarek -profile test,<extra_test_profiles>,<docker/singularity> --outdir <OUTDIR>
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
+
+params {
+    input                  = "${projectDir}/tests/csv/3.0/recalibrated_tumoronly.csv"
+    dbsnp                  = params.test_data['homo_sapiens']['genome']['dbsnp_138_hg38_21_vcf_gz']
+    fasta                  = params.test_data['homo_sapiens']['genome']['genome_21_fasta']
+    germline_resource      = params.test_data['homo_sapiens']['genome']['gnomad_r2_1_1_21_vcf_gz']
+    intervals              = params.test_data['homo_sapiens']['genome']['genome_21_multi_interval_bed']
+    pon                    = params.test_data['homo_sapiens']['genome']['mills_and_1000g_indels_21_vcf_gz']
+    nucleotides_per_second = 20
+    step                   = 'variant_calling'
+    tools                  = null
+    wes                    = true
+}
diff --git a/conf/test/trimming.config b/conf/test/trimming.config
new file mode 100644
index 0000000000..d904d17660
--- /dev/null
+++ b/conf/test/trimming.config
@@ -0,0 +1,19 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running minimal tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a fast and simple pipeline test.
+
+    Use as follows:
+        nextflow run nf-core/sarek -profile test,<extra_test_profiles>,<docker/singularity> --outdir <OUTDIR>
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
+
+params {
+    clip_r1             = 1
+    clip_r2             = 1
+    three_prime_clip_r1 = 1
+    three_prime_clip_r2 = 1
+    tools               = null
+    trim_fastq          = true
+}
diff --git a/conf/test/umi.config b/conf/test/umi.config
new file mode 100644
index 0000000000..6d538c4145
--- /dev/null
+++ b/conf/test/umi.config
@@ -0,0 +1,16 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running minimal tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a fast and simple pipeline test.
+
+    Use as follows:
+        nextflow run nf-core/sarek -profile test,<extra_test_profiles>,<docker/singularity> --outdir <OUTDIR>
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
+
+params {
+    input              = "${projectDir}/tests/csv/3.0/fastq_umi.csv"
+    tools              = null
+    umi_read_structure = '+T 7M1S+T'
+}
diff --git a/conf/test/use_gatk_spark.config b/conf/test/use_gatk_spark.config
new file mode 100644
index 0000000000..bd40af1b77
--- /dev/null
+++ b/conf/test/use_gatk_spark.config
@@ -0,0 +1,16 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running minimal tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a fast and simple pipeline test.
+
+    Use as follows:
+        nextflow run nf-core/sarek -profile test,<extra_test_profiles>,<docker/singularity> --outdir <OUTDIR>
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
+
+params {
+    tools          = null
+    use_gatk_spark = 'baserecalibrator,markduplicates'
+    input          = "${projectDir}/tests/csv/3.0/fastq_tumor_only.csv"
+}
diff --git a/conf/test/variantcalling_channels.config b/conf/test/variantcalling_channels.config
new file mode 100644
index 0000000000..a6ff70387a
--- /dev/null
+++ b/conf/test/variantcalling_channels.config
@@ -0,0 +1,20 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running minimal tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a fast and simple pipeline test.
+
+    Use as follows:
+        nextflow run nf-core/sarek -profile test,<extra_test_profiles>,<docker/singularity> --outdir <OUTDIR>
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
+
+params {
+    input                  = "${projectDir}/tests/csv/3.0/recalibrated.csv"
+    fasta                  = params.test_data['homo_sapiens']['genome']['genome_21_fasta']
+    intervals              = params.test_data['homo_sapiens']['genome']['genome_21_multi_interval_bed']
+    nucleotides_per_second = 20
+    step                   = 'variant_calling'
+    tools                  = null
+    wes                    = true
+}
diff --git a/conf/test_full.config b/conf/test_full.config
index af9296e668..1ba5ad2c78 100644
--- a/conf/test_full.config
+++ b/conf/test_full.config
@@ -15,10 +15,11 @@ params {
     config_profile_description = 'Full test dataset to check pipeline function'
 
     // Input data for full size test
-    // TODO nf-core: Specify the paths to your full test data ( on nf-core/test-datasets or directly in repositories, e.g. SRA)
-    // TODO nf-core: Give any required params for the test so that command line flags are not needed
-    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_full_illumina_amplicon.csv'
+    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/sarek/testdata/csv/HCC1395_WXS_somatic_full_test.csv'
 
-    // Genome references
-    genome = 'R64-1-1'
+    // Other params
+    tools       = 'ngscheckmate,strelka,mutect2,freebayes,ascat,manta,cnvkit,tiddit,controlfreec,vep,snpeff'
+    split_fastq = 20000000
+    intervals   = 's3://ngi-igenomes/test-data/sarek/S07604624_Padded_Agilent_SureSelectXT_allexons_V6_UTR.bed'
+    wes         = true
 }
diff --git a/conf/test_full_germline.config b/conf/test_full_germline.config
new file mode 100644
index 0000000000..d731a25709
--- /dev/null
+++ b/conf/test_full_germline.config
@@ -0,0 +1,23 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running full-size tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a full size pipeline test.
+
+    Use as follows:
+        nextflow run nf-core/sarek -profile test_full_germline,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
+
+params {
+    config_profile_name        = 'Full test profile for germline VC'
+    config_profile_description = 'Full test dataset to check germline VC pipeline function'
+
+    // Input data for full size test
+    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/sarek/testdata/csv/NA12878_WGS_30x_full_test.csv'
+
+    // Other params
+    tools       = 'strelka,freebayes,haplotypecaller,deepvariant,manta,tiddit,cnvkit,vep,snpeff'
+    split_fastq = 50000000
+}
diff --git a/docs/abstracts/2016-09-KICR.md b/docs/abstracts/2016-09-KICR.md
new file mode 100644
index 0000000000..a5758c5288
--- /dev/null
+++ b/docs/abstracts/2016-09-KICR.md
@@ -0,0 +1,44 @@
+# The XVth KICancer Retreat - Djurö, Sweden, 2016/09
+
+## Cancer Analysis Workflow Of Tumor/Normal Pairs At The National Genomics Infrastructure Of SciLifeLab
+
+Maxime Garcia,
+Pelin Akan,
+Teresita Díaz de Ståhl,
+Jesper Eisfeldt,
+Szilveszter Juhos,
+Malin Larsson,
+Björn Nystedt,
+Pall Olason,
+Monica Nistér,
+Max Käller
+
+BarnTumörBanken, Department of Oncology Pathology, Karolinska Institutet, Science for Life Laboratory
+
+One of the most prominent uses of NGS is whole-genome sequencing (WGS). The
+National Genomics Infrastructure (NGI) at Science for Life Laboratory is today
+providing WGS and germline variant analysis. However, building a robust and
+reliable bioinformatics workflow to find somatic mutations is challenging:
+tumor samples are heterogeneous, and likely contain structural variants and
+multiple sub-clones besides the normal tissue.
+
+We are presenting our workflow, which is designed to analyze WGS tumor/normal
+data in a high-throughput environment. The framework is based on the Nextflow
+domain-specific language on top of Java/Groovy. Using Nextflow we are able to
+utilize both the Slurm load-balancing environment and local execution,
+implementing data flow forks and joins, calling external software, etc. Individual
+sub-steps of a complex flow can be connected and restarted after failure from
+the last execution point.
+
+The actual preprocessing workflow is based on BWA as an aligner and GATK best
+practice steps. To achieve a consensus variant call, different variant callers
+can be added; currently MuTect2, Strelka and VarDict are supported, with more to be
+added. Structural variants are going to be estimated by Manta, and ploidy and
+sample heterogeneity are measured by ASCAT. The expected output of the workflow
+is a VCF file presenting filtered, prioritized and annotated polymorphisms.
+
+As the Nextflow environment is flexible, we can add other tools or remove
+obsolete ones as development progresses. The goal is to build a workflow for
+cancer genome analysis that can be deployed to both research and clinical
+environments and is going to be included as a standard workflow at NGI during
+the fall of 2016.
diff --git a/docs/abstracts/2017-05-ESHG.md b/docs/abstracts/2017-05-ESHG.md
new file mode 100644
index 0000000000..8098600c34
--- /dev/null
+++ b/docs/abstracts/2017-05-ESHG.md
@@ -0,0 +1,31 @@
+# European Human Genetics Conference - Copenhagen, Denmark, 2017/05
+
+## CAW - Cancer Analysis Workflow to process normal/tumor WGS data
+
+Maxime Garcia 1,
+Szilveszter Juhos 2,
+Malin Larsson 3,
+Teresita Díaz de Ståhl 4,
+Jesper Eisfeldt 5,
+Sebastian DiLorenzo 6,
+Pall Olason 7,
+Björn Nystedt 7,
+Monica Nistér 4,
+Max Käller 8
+
+1. BarnTumörBanken, Department of Oncology Pathology, Science for Life Laboratory, Karolinska Institutet
+2. Department of Biochemistry and Biophysics, Science for Life Laboratory, Stockholm University
+3. Science for Life Laboratory, Department of Physics, Chemistry and Biology, Linköping University
+4. BarnTumörBanken, Department of Oncology Pathology, Karolinska Institutet
+5. Clinical Genetics, Department of Molecular Medicine and Surgery, Karolinska Institutet
+6. Department of Medical Sciences, National Bioinformatics Infrastructure Sweden, Science for Life Laboratory, Uppsala University
+7. Science for Life Laboratory, Department of Cell and Molecular Biology, Uppsala University
+8. Science for Life Laboratory, School of Biotechnology, Division of Gene Technology, Royal Institute of Technology
+
+As whole-genome sequencing is getting cheaper, it is viable to compare NGS data from normal and tumor samples of numerous patients. There are still many challenges, mostly regarding bioinformatics: datasets are huge, workflows are complex, and there are multiple tools to choose from for somatic and structural variants and quality control.
+
+We are presenting CAW (Cancer Analysis Workflow), a complete open-source pipeline to resolve somatic variants from WGS data: it is written in Nextflow, a domain-specific language for workflow building. We are utilizing GATK best practices to align, realign and recalibrate short-read data in parallel for both the tumor and the normal sample. After these preprocessing steps, several somatic variant callers scan the resulting BAM files; MuTect1, MuTect2 and Strelka are used to find somatic SNVs and small indels. For structural variants we use Manta. Furthermore, we are applying ASCAT to estimate sample heterogeneity, ploidy and CNVs.
+
+The software can start the analysis from raw FASTQ files, from the realignment step, or directly with any subset of variant callers. At the end of the analysis the resulting VCF files are merged to facilitate further downstream processing, though the individual results are also retained. The flow is capable of accommodating further variant calling software or CNV callers. It is also prepared to process normal, tumor and several relapse samples.
+
+Besides variant calls, the workflow provides quality controls presented by MultiQC. A Docker image is also available, and the open-source software can be downloaded from .
diff --git a/docs/abstracts/2018-05-PMC.md b/docs/abstracts/2018-05-PMC.md
new file mode 100644
index 0000000000..22300860f5
--- /dev/null
+++ b/docs/abstracts/2018-05-PMC.md
@@ -0,0 +1,48 @@
+# Keystone Symposia - Precision Medicine in Cancer - Stockholm, Sweden, 2018/05
+
+## Sarek, a workflow for WGS analysis of germline and somatic mutations
+
+Maxime Garcia 123*,
+Szilveszter Juhos 123*,
+Malin Larsson 456,
+Teresita Díaz de Ståhl 13,
+Johanna Sandgren 13,
+Jesper Eisfeldt 73,
+Sebastian DiLorenzo 85A,
+Marcel Martin B3C,
+Pall Olason 95A,
+Phil Ewels B2C,
+Björn Nystedt 95A*,
+Monica Nistér 13,
+Max Käller 2D,
+*Corresponding Author
+
+1. Barntumörbanken, Dept. of Oncology Pathology;
+2. Science for Life Laboratory;
+3. Karolinska Institutet;
+4. Dept. of Physics, Chemistry and Biology;
+5. National Bioinformatics Infrastructure Sweden, Science for Life Laboratory;
+6. Linköping University;
+7. Clinical Genetics, Dept. of Molecular Medicine and Surgery;
+8. Dept. of Medical Sciences;
+9. Dept. of Cell and Molecular Biology;
+   A. Uppsala University;
+   B. Dept. of Biochemistry and Biophysics;
+   C. Stockholm University;
+   D. School of Biotechnology, Division of Gene Technology, Royal Institute of Technology
+
+We present Sarek, a complete Open Source pipeline to resolve germline and somatic variants from WGS data: it is written in Nextflow, a domain-specific language for workflow building.
+Sarek is based on GATK best practices to prepare short-read data, in parallel for a tumor/normal sample pair.
+After these preprocessing steps, several variant callers scan the resulting BAM files: for structural variants we use Manta.
+Strelka and GATK HaplotypeCaller are used to find germline variants, and for somatic calls we use MuTect2 and Strelka.
+Finally, we apply ASCAT to estimate sample heterogeneity, ploidy and CNVs.
+Checkpoints allow the software to be started from different states.
+At the end of the analysis the resulting VCF files are annotated to facilitate further downstream processing.
+The flow is capable of accommodating further variant callers.
+It can also process only the normal sample, tumor/normal pairs or even normal, tumor and several relapse samples.
+Besides variant calls, the workflow provides quality controls presented by MultiQC.
+For easy sharing and installation, and to ensure reproducibility, containers (Docker and Singularity) are available.
+The MIT-licensed open-source code can be downloaded from GitHub.
+
+The authors thank the Swedish Childhood Cancer Foundation for the funding of Barntumörbanken.
+We would like to acknowledge support from Science for Life Laboratory, the National Genomics Infrastructure, NGI, and UPPMAX for providing assistance in massive parallel sequencing and computational infrastructure.
diff --git a/docs/abstracts/2018-06-EACR25.md b/docs/abstracts/2018-06-EACR25.md
new file mode 100644
index 0000000000..afe49fcfd5
--- /dev/null
+++ b/docs/abstracts/2018-06-EACR25.md
@@ -0,0 +1,56 @@
+# 25th Biennial Congress Of The European Association For Cancer Research - Amsterdam, Netherlands, 2018/06-07
+
+## Somatic and germline calls from tumour/normal whole genome data: bioinformatics workflow for reproducible research
+
+Szilveszter Juhos 1,
+Maxime Garcia 2,
+Teresita Díaz de Ståhl 3,
+Johanna Sandgren 3,
+Markus Mayrhofer 4,
+Max Käller 5,
+Björn Nystedt 6,
+Monica Nistér 3
+
+1. Karolinska Institutet, Department of Oncology Pathology, Stockholm, Sweden.
+2. Karolinska Institutet- Science for Life Laboratory, Department of Oncology-Pathology, Stockholm, Sweden.
+3. Karolinska Institutet, Department of Oncology-Pathology, Stockholm, Sweden.
+4. Science for Life Laboratory, Uppsala University, Uppsala, Sweden.
+5. Science for Life Laboratory, Royal Institute of Technology- School of Biotechnology- Division of Gene Technology, Stockholm, Sweden.
+6. Science for Life Laboratory, Department of Cell and Molecular Biology- National Bioinformatics Infrastructure Sweden- Uppsala University, Uppsala, Sweden.
+
+### Introduction
+
+Whole-genome sequencing of cancer tumours is mostly a research tool nowadays, but it is going to be used in clinical settings in
+the near future to facilitate precision medicine. While large institutions have built up in-house bioinformatics
+solutions for their own data analysis, robust and portable workflows combining multiple software tools have been lacking,
+making it difficult for individual research groups to utilise the potential of this research field. Here we present
+Sarek, a robust, easy-to-install workflow for identification of both somatic and germline mutations from paired
+tumour/normal/relapse samples.
+
+### Material and Methods
+
+Sarek is open source and implemented in Nextflow, a domain-specific programming language, to enable portability and
+reproducibility. With the help of Docker containers, the versions of the underlying software can be maintained.
+Furthermore, with Singularity it is possible to run the workflow on protected clusters with no internet connection.
+
+The workflow starts from raw FASTQ files, and follows the GATK best practices to prepare the recalibrated files with
+joint realignment around indels for both the tumour and the normal data. Reads are aligned to the GRCh38 human
+reference in an ALT-aware setting using BWA; however, it is possible to assign other references. HaplotypeCaller and
+Strelka2 germline calls are collected for both the tumour and the normal sample, and Manta provides germline structural
+variants. The somatic variations are calculated by running MuTect2, Strelka and FreeBayes (and MuTect1 optionally).
+Somatic structural variants are delivered by Manta, and ASCAT estimates ploidy, tumour heterogeneity and CNVs. The
+resulting variant call files are annotated by SnpEff and Ensembl-VEP. The annotated calls are further filtered and
+prioritised by our custom methods. While the workflow is running, quality-control metrics are also calculated and
+aggregated by MultiQC.
+
+### Results and Discussions
+
+Sarek was validated on a real dataset with a known golden set of somatic mutations. In a real setting, whole-genome
+sequencing (WGS, 45-60x coverage) of patient-matched tumor and blood-derived DNA is being performed on a set of 80
+pediatric brain tumor samples of the Swedish Childhood Tumor Biobank. The workflow helps to produce, filter, prioritise
+and characterise both germline and somatic variations.
+
+### Conclusion
+
+Sarek is a portable bioinformatics pipeline for WGS normal/tumour matched samples, aiding precision medicine through improved
+subtyping and novel functional insights in a reproducible framework.
diff --git a/docs/abstracts/2018-06-NPMI.md b/docs/abstracts/2018-06-NPMI.md
new file mode 100644
index 0000000000..7de2d1d533
--- /dev/null
+++ b/docs/abstracts/2018-06-NPMI.md
@@ -0,0 +1,50 @@
+# The Nordic Precision Medicine Initiative - Meeting No 5 - Reykjavík, Iceland, 2018/06
+
+## Sarek, a portable workflow for WGS analysis of germline and somatic mutations
+
+Maxime Garcia 123*,
+Szilveszter Juhos 123*,
+Malin Larsson 456,
+Teresita Díaz de Ståhl 13,
+Johanna Sandgren 13,
+Jesper Eisfeldt 73,
+Sebastian DiLorenzo 85A,
+Marcel Martin B5C,
+Pall Olason 95A,
+Phil Ewels B2C,
+Björn Nystedt 95A*,
+Monica Nistér 13,
+Max Käller 2D,
+*Corresponding Author
+
+1. Barntumörbanken, Dept. of Oncology Pathology;
+2. Science for Life Laboratory;
+3. Karolinska Institutet;
+4. Dept. of Physics, Chemistry and Biology;
+5. National Bioinformatics Infrastructure Sweden, Science for Life Laboratory;
+6. Linköping University;
+7. Clinical Genetics, Dept. of Molecular Medicine and Surgery;
+8. Dept. of Medical Sciences;
+9. Dept. of Cell and Molecular Biology;
+   A. Uppsala University;
+   B. Dept. of Biochemistry and Biophysics;
+   C. Stockholm University;
+   D. School of Biotechnology, Division of Gene Technology, Royal Institute of Technology
+
+We present Sarek, a portable Open Source pipeline to resolve germline and somatic variants from WGS data: it is written in Nextflow, a domain-specific language for workflow building.
+It processes normal samples or normal/tumor pairs (with the option to include matched relapses).
+
+Sarek is based on GATK best practices to prepare short-read data, which is done in parallel for a tumor/normal sample pair.
+After these preprocessing steps, several variant callers scan the resulting BAM files: Manta for structural variants; Strelka and GATK HaplotypeCaller for germline variants; Freebayes, MuTect1, MuTect2 and Strelka for somatic variants; ASCAT to estimate sample heterogeneity, ploidy and CNVs.
+At the end of the analysis the resulting VCF files can be annotated by SNPEff and/or VEP to facilitate further downstream processing.
+Our ongoing effort focuses on filtering and prioritizing the annotated variants.
+
+Sarek is based on Docker and Singularity containers, enabling version tracking, reproducibility and handling sensitive data.
+It is designed with flexible environments in mind, like running on a local fat node, an HTC cluster or in a cloud environment like AWS.
+The workflow is capable of accommodating further variant callers.
+Besides variant calls, the workflow provides quality controls presented by MultiQC.
+Checkpoints allow the software to be started from FastQ, BAM or VCF.
+Besides WGS data, it is capable of processing inputs from WES or gene panels.
+The pipeline currently uses GRCh37 or GRCh38 as a reference genome; it is also possible to add custom genomes.
+It has been successfully used to analyze more than two hundred WGS samples sent to the National Genomics Infrastructure (Science for Life Laboratory) from different users.
+The MIT-licensed Open Source code can be downloaded from GitHub.
diff --git a/docs/abstracts/2018-07-JOBIM.md b/docs/abstracts/2018-07-JOBIM.md
new file mode 100644
index 0000000000..aa902caeb4
--- /dev/null
+++ b/docs/abstracts/2018-07-JOBIM.md
@@ -0,0 +1,43 @@
+# Journées Ouvertes en Biologie, Informatique et Mathématiques - Marseille, France, 2018/07
+
+## Sarek, a portable workflow for WGS analysis of germline and somatic mutations
+
+Maxime Garcia 123,
+Szilveszter Juhos 123,
+Malin Larsson 456,
+Teresita Díaz de Ståhl 13,
+Johanna Sandgren 13,
+Jesper Eisfeldt 73,
+Sebastian DiLorenzo 85A,
+Marcel Martin B5C,
+Pall Olason 95A,
+Phil Ewels B2C,
+Björn Nystedt 95A,
+Monica Nistér 13,
+Max Käller 2D
+
+1. Barntumörbanken, Dept. of Oncology Pathology;
+2. Science for Life Laboratory;
+3. Karolinska Institutet;
+4. Dept. of Physics, Chemistry and Biology;
+5. National Bioinformatics Infrastructure Sweden, Science for Life Laboratory;
+6. Linköping University;
+7. Clinical Genetics, Dept. of Molecular Medicine and Surgery;
+8. Dept. of Medical Sciences;
+9. Dept. of Cell and Molecular Biology;
+   A. Uppsala University;
+   B. Dept. of Biochemistry and Biophysics;
+   C. Stockholm University;
+   D. School of Biotechnology, Division of Gene Technology, Royal Institute of Technology
+
+We present Sarek, a portable Open Source pipeline to resolve germline and somatic variants from WGS data: it is written in Nextflow, a domain-specific language for workflow building. It processes normal samples or normal/tumor pairs (with the option to include matched relapses).
+
+Sarek is based on GATK best practices to prepare short-read data, which is done in parallel for a tumor/normal sample pair. After these preprocessing steps, several variant callers scan the resulting BAM files: Manta for structural variants; Strelka and GATK HaplotypeCaller for germline variants; Freebayes, MuTect2 and Strelka for somatic variants; ASCAT and Control-FREEC to estimate sample heterogeneity, ploidy and CNVs.
At the end of the analysis the resulting VCF files can be annotated by SNPEff and/or VEP to facilitate further downstream processing. Our ongoing effort focuses on filtering and prioritizing the annotated variants.
+
+Sarek is based on Docker and Singularity containers, enabling version tracking, reproducibility and handling sensitive data. It is designed with flexible environments in mind, like running on a local fat node, an HTC cluster or in a cloud environment like AWS. The workflow is modular and capable of accommodating further variant callers. Besides variant calls, the workflow provides quality controls presented by MultiQC. Checkpoints allow the software to be started from FastQ, BAM or VCF. Besides WGS data, it is capable of processing inputs from WES or gene panels.
+
+The pipeline currently uses GRCh37 or GRCh38 as a reference genome; it is also possible to add custom genomes. It has been successfully used to analyze more than two hundred WGS samples sent to the National Genomics Infrastructure (Science for Life Laboratory) from different users. The MIT-licensed Open Source code can be downloaded from GitHub.
+
+The authors thank the Swedish Childhood Cancer Foundation for the funding of Barntumörbanken. We would like to acknowledge support from Science for Life Laboratory, the National Genomics Infrastructure, NGI, and UPPMAX for providing assistance in massive parallel sequencing and computational infrastructure.
diff --git a/docs/abstracts/2020-06-ESHG.md b/docs/abstracts/2020-06-ESHG.md
new file mode 100644
index 0000000000..b754dfeb6b
--- /dev/null
+++ b/docs/abstracts/2020-06-ESHG.md
@@ -0,0 +1,39 @@
+# European Society of Human Genetics - European Human Genetics Conference - Berlin, Germany, 2020-06
+
+## Reproduce easily: analysis of matching tumor-normal NGS data with the Sarek workflow
+
+Szilveszter Juhos,
+Maxime Garcia,
+Teresita Díaz de Ståhl,
+Markus Mayrhofer,
+Johanna Sandgren,
+Monica Nistér
+
+### Introduction
+
+High-throughput sequencing for precision medicine is now a routine method.
+Numerous tools have to be used, and analysis is time-consuming.
+We propose Sarek, an open-source, container-based bioinformatics workflow for germline or matching tumor-normal pairs, written in Nextflow, to process WGS, whole-exome or gene-panel samples.
+
+#### Materials and methods
+
+Sarek is part of nf-core, a collection of peer-reviewed workflows; supported environments are Conda, Docker and Singularity.
+It is system-agnostic: it can be used on single machines, clusters (HPC) or in a cloud such as AWS, with little difference between setups.
+Additional software can be included as new modules.
+Several model organism references are available (including Human GRCh37 and GRCh38).
+The pipeline reports germline and somatic SNVs and SVs (by HaplotypeCaller, Strelka, Mutect2, Manta and TIDDIT).
+CNVs, purity and ploidy are estimated by ASCAT and Control-FREEC.
+Furthermore, a broad set of QC metrics is reported at the end of the workflow with MultiQC.
+
+#### Results
+
+From FASTQs to annotated VCFs it takes three days for a 90X/90X sample on a 48-core node.
+Sarek is used in production at the National Genomics Infrastructure Sweden for germline and cancer samples for the Swedish Childhood Tumor Biobank and other research groups.
+
+#### Conclusions
+
+Sarek is an easy-to-use tool for germline or cancer NGS samples, to be downloaded from [nf-co.re/sarek](https://nf-co.re/sarek) under MIT license.
+
+#### Supporting grants
+
+Swedish Research Council (2017-00630, 2017-00656), the Swedish Childhood Cancer Fund (BTB: BB2017-0001; BB2018-0001; BB2019-0001), and the Knut and Alice Wallenberg Foundation (KAW 2014.0278).
diff --git a/docs/abstracts/2020-10-VCBS.md b/docs/abstracts/2020-10-VCBS.md
new file mode 100644
index 0000000000..ca4b00314c
--- /dev/null
+++ b/docs/abstracts/2020-10-VCBS.md
@@ -0,0 +1,36 @@
+# Victorian Cancer Bioinformatics Symposium - online, 2020-10-23
+
+## Sarek, a reproducible and portable workflow for analysis of matching tumor-normal NGS data
+
+Maxime Garcia [1], Szilveszter Juhos [1], Teresita Díaz de Ståhl [1], Markus Mayrhofer [2], Johanna Sandgren [1], Björn Nystedt [2], Monica Nistér [1]
+
+[1] Dept. of Oncology Pathology, The Swedish Childhood Tumor Biobank (Barntumörbanken, BTB); Karolinska Institutet
+[2] Dept. of Cell and Molecular Biology; National Bioinformatics Infrastructure Sweden, Science for Life Laboratory; Uppsala University
+
+### Introduction
+
+High-throughput sequencing for precision medicine is a routine method.
+Numerous tools have to be used, and analysis is time-consuming.
+We propose Sarek, an open-source, container-based bioinformatics workflow for germline or tumor/normal pairs (can include matched relapses), written in Nextflow, to process WGS, whole-exome or gene-panel samples.
+
+### Methods
+
+Sarek is part of nf-core, a collection of high-quality, peer-reviewed workflows; supported environments are Docker, Singularity and Conda, enabling version tracking and reproducibility.
+It is designed with flexible environments in mind: local fat node, HTC cluster or cloud environment like AWS.
+Several model organism references are available (including Human GRCh37 and GRCh38).
+Sarek is based on GATK best practices to prepare short-read data.
+The pipeline then reports germline and somatic SNVs and SVs (HaplotypeCaller, Strelka, Mutect2, Manta and TIDDIT).
+CNVs, purity and ploidy are estimated with ASCAT and Control-FREEC.
+At the end of the analysis the resulting VCF files can be annotated by SNPEff and/or VEP to facilitate further downstream processing.
+Furthermore, a broad set of QC metrics is reported as a final step of the workflow with MultiQC.
+Additional software can be included as new modules.
+
+### Results
+
+From FASTQs to annotated VCFs it takes four days for a paired 90X/90X WGS-sample on a 48-core node, with the complete set of tools.
+Processing can be sped up with the optional use of Sentieon (C).
+Sarek is used in production at the National Genomics Infrastructure Sweden for germline and cancer samples for the Swedish Childhood Tumor Biobank and other research groups.
+
+### Conclusion
+
+Sarek is an easy-to-use tool for germline or cancer NGS samples, to be downloaded from [nf-co.re/sarek](https://nf-co.re/sarek) under MIT license.
diff --git a/docs/images/BTB_logo.png b/docs/images/BTB_logo.png
new file mode 100644
index 0000000000..6a197b8075
Binary files /dev/null and b/docs/images/BTB_logo.png differ
diff --git a/docs/images/BTB_logo.svg b/docs/images/BTB_logo.svg
new file mode 100644
index 0000000000..099f1101f6
--- /dev/null
+++ b/docs/images/BTB_logo.svg
@@ -0,0 +1,184 @@
+[SVG markup omitted]
\ No newline at end of file
diff --git a/docs/images/DNGC_logo.png b/docs/images/DNGC_logo.png
new file mode 100644
index 0000000000..51106970ff
Binary files /dev/null and b/docs/images/DNGC_logo.png differ
diff --git a/docs/images/DNGC_logo.svg b/docs/images/DNGC_logo.svg
new file mode 100644
index 0000000000..4a0da06c7f
--- /dev/null
+++ b/docs/images/DNGC_logo.svg
@@ -0,0 +1,163 @@
+[SVG markup omitted]
\ No newline at end of file
diff --git a/docs/images/GHGA_logo.png b/docs/images/GHGA_logo.png
new file mode 100644
index 0000000000..551973b788
Binary files /dev/null and b/docs/images/GHGA_logo.png differ
diff --git a/docs/images/GHGA_logo.svg b/docs/images/GHGA_logo.svg
new file mode 100644
index 0000000000..a831ac8f85
--- /dev/null
+++ b/docs/images/GHGA_logo.svg
@@ -0,0 +1,1 @@
+[SVG markup omitted]
diff --git a/docs/images/NBIS_logo.png b/docs/images/NBIS_logo.png
new file mode 100644
index 0000000000..6e5c303168
Binary files /dev/null and b/docs/images/NBIS_logo.png differ
diff --git a/docs/images/NBIS_logo.svg b/docs/images/NBIS_logo.svg
new file mode 100644
index 0000000000..337e22cf68
--- /dev/null
+++ b/docs/images/NBIS_logo.svg
@@ -0,0 +1,287 @@
+[SVG markup omitted]
\ No newline at end of file
diff --git a/docs/images/NGI_logo.png b/docs/images/NGI_logo.png
new file mode 100644
index 0000000000..3f4b769e55
Binary files /dev/null and b/docs/images/NGI_logo.png differ
diff --git a/docs/images/NGI_logo.svg b/docs/images/NGI_logo.svg
new file mode 100644
index 0000000000..aef40fd811
--- /dev/null
+++ b/docs/images/NGI_logo.svg
@@ -0,0 +1,333 @@
+[SVG markup omitted]
\ No newline at end of file
diff --git a/docs/images/QBiC_logo.png b/docs/images/QBiC_logo.png
new file mode 100644
index 0000000000..840f24ca17
Binary files /dev/null and b/docs/images/QBiC_logo.png differ
diff --git a/docs/images/QBiC_logo.svg b/docs/images/QBiC_logo.svg
new file mode 100644
index 0000000000..6bc6bd6289
--- /dev/null
+++ b/docs/images/QBiC_logo.svg
@@ -0,0 +1,103 @@
+[SVG markup omitted]
\ No newline at end of file
diff --git a/docs/images/SciLifeLab_logo.png b/docs/images/SciLifeLab_logo.png
new file mode 100644
index 0000000000..e71d44b931
Binary files /dev/null and b/docs/images/SciLifeLab_logo.png differ
diff --git a/docs/images/SciLifeLab_logo.svg b/docs/images/SciLifeLab_logo.svg
new file mode 100644
index 0000000000..3602a3b855
--- /dev/null
+++ b/docs/images/SciLifeLab_logo.svg
@@ -0,0 +1,143 @@
+[SVG markup omitted]
\ No newline at end of file
diff --git a/docs/images/logos/nf-core_sarek/nf-core_sarek_color.svg b/docs/images/logos/nf-core_sarek/nf-core_sarek_color.svg
new file mode 100644
index 0000000000..1a11a859e4
--- /dev/null
+++ b/docs/images/logos/nf-core_sarek/nf-core_sarek_color.svg
@@ -0,0 +1,355 @@
+[SVG markup omitted]
diff --git a/docs/images/logos/nf-core_sarek/nf-core_sarek_dark_color.svg b/docs/images/logos/nf-core_sarek/nf-core_sarek_dark_color.svg
new file mode 100644
index 0000000000..e22ded4fac
--- /dev/null
+++ b/docs/images/logos/nf-core_sarek/nf-core_sarek_dark_color.svg
@@ -0,0 +1,352 @@
+[SVG markup omitted]
diff --git a/docs/images/logos/nf-core_sarek/nf-core_sarek_dark_grey.svg b/docs/images/logos/nf-core_sarek/nf-core_sarek_dark_grey.svg
new file mode 100644
index 0000000000..f64eb12904
--- /dev/null
+++ b/docs/images/logos/nf-core_sarek/nf-core_sarek_dark_grey.svg
@@ -0,0 +1,376 @@
+[SVG markup omitted]
diff --git a/docs/images/logos/nf-core_sarek/nf-core_sarek_dark_mono.svg b/docs/images/logos/nf-core_sarek/nf-core_sarek_dark_mono.svg
new file mode 100644
index 0000000000..392bd8d5af
--- /dev/null
+++ b/docs/images/logos/nf-core_sarek/nf-core_sarek_dark_mono.svg
@@ -0,0 +1,292 @@
+[SVG markup omitted]
diff --git a/docs/images/logos/nf-core_sarek/nf-core_sarek_grey.svg b/docs/images/logos/nf-core_sarek/nf-core_sarek_grey.svg
new file mode 100644
index 0000000000..4dfe2b1a91
--- /dev/null
+++ b/docs/images/logos/nf-core_sarek/nf-core_sarek_grey.svg
@@ -0,0 +1,364 @@
+[SVG markup omitted]
diff --git a/docs/images/logos/nf-core_sarek/nf-core_sarek_mono.svg b/docs/images/logos/nf-core_sarek/nf-core_sarek_mono.svg
new file mode 100644
index 0000000000..e93a485d59
--- /dev/null
+++ b/docs/images/logos/nf-core_sarek/nf-core_sarek_mono.svg
@@ -0,0 +1,348 @@
+[SVG markup omitted]
diff --git a/docs/images/logos/sarek-germline/sarek-germline.svg b/docs/images/logos/sarek-germline/sarek-germline.svg
new file mode 100644
index 0000000000..f02c125528
--- /dev/null
+++ b/docs/images/logos/sarek-germline/sarek-germline.svg
@@ -0,0 +1,315 @@
+[SVG markup omitted]
diff --git a/docs/images/logos/sarek-somatic/sarek-somatic_logo.svg b/docs/images/logos/sarek-somatic/sarek-somatic_logo.svg
new file mode 100644
index 0000000000..8f580298e8
--- /dev/null
+++ b/docs/images/logos/sarek-somatic/sarek-somatic_logo.svg
@@ -0,0 +1,308 @@
+[SVG markup omitted]
diff --git a/docs/images/logos/sarek/sarek_color.svg b/docs/images/logos/sarek/sarek_color.svg
new file mode 100644
index 0000000000..7a8ccdfe9b
--- /dev/null
+++ b/docs/images/logos/sarek/sarek_color.svg
@@ -0,0 +1,247 @@
+[SVG markup omitted]
diff --git a/docs/images/logos/sarek/sarek_dark_color.svg b/docs/images/logos/sarek/sarek_dark_color.svg
new file mode 100644
index 0000000000..40f3b88e00
--- /dev/null
+++ b/docs/images/logos/sarek/sarek_dark_color.svg
@@ -0,0 +1,248 @@
+[SVG markup omitted]
diff --git a/docs/images/logos/sarek/sarek_dark_grey.svg b/docs/images/logos/sarek/sarek_dark_grey.svg
new file mode 100644
index 0000000000..a503af1444
--- /dev/null
+++ b/docs/images/logos/sarek/sarek_dark_grey.svg
@@ -0,0 +1,288 @@
+[SVG markup omitted]
diff --git a/docs/images/logos/sarek/sarek_dark_mono.svg b/docs/images/logos/sarek/sarek_dark_mono.svg
new file mode 100644
index 0000000000..d14308f6b2
--- /dev/null
+++ b/docs/images/logos/sarek/sarek_dark_mono.svg
@@ -0,0 +1,243 @@
+[SVG markup omitted]
diff --git a/docs/images/logos/sarek/sarek_grey.svg b/docs/images/logos/sarek/sarek_grey.svg
new file mode 100644
index 0000000000..68e2b116d2
--- /dev/null
+++ b/docs/images/logos/sarek/sarek_grey.svg
@@ -0,0 +1,256 @@
+[SVG markup omitted]
diff --git a/docs/images/logos/sarek/sarek_mono.svg b/docs/images/logos/sarek/sarek_mono.svg
new file mode 100644
index 0000000000..be66c6fe84
--- /dev/null
+++ b/docs/images/logos/sarek/sarek_mono.svg
@@ -0,0 +1,248 @@
+[SVG markup omitted]
diff --git a/docs/images/nf-core-sarek-germline_logo.svg b/docs/images/nf-core-sarek-germline_logo.svg
new file mode 100644
index 0000000000..bf5fb0d09e
--- /dev/null
+++ b/docs/images/nf-core-sarek-germline_logo.svg
@@ -0,0 +1,423 @@
+[SVG markup omitted]
diff --git a/docs/images/nf-core-sarek-somatic_logo.svg b/docs/images/nf-core-sarek-somatic_logo.svg
new file mode 100644
index 0000000000..86a3b16715
--- /dev/null
+++ b/docs/images/nf-core-sarek-somatic_logo.svg
@@ -0,0 +1,416 @@
+[SVG markup omitted]
diff --git a/docs/images/nf-core-sarek_logo_dark.png b/docs/images/nf-core-sarek_logo_dark.png
index 7d57ac346b..a91c2025a0 100644
Binary files a/docs/images/nf-core-sarek_logo_dark.png and b/docs/images/nf-core-sarek_logo_dark.png differ
diff --git a/docs/images/nf-core-sarek_logo_dark.svg b/docs/images/nf-core-sarek_logo_dark.svg
new file mode 100644
index 0000000000..f91b8a7e36
--- /dev/null
+++ b/docs/images/nf-core-sarek_logo_dark.svg
@@ -0,0 +1,352 @@
+[SVG markup omitted]
diff --git a/docs/images/nf-core-sarek_logo_light.png b/docs/images/nf-core-sarek_logo_light.png
index 89581a852a..61aa1c81d5 100644
Binary files a/docs/images/nf-core-sarek_logo_light.png and b/docs/images/nf-core-sarek_logo_light.png differ
diff --git a/docs/images/nf-core-sarek_logo_light.svg b/docs/images/nf-core-sarek_logo_light.svg
new file mode 100644
index 0000000000..7a3bb9bbf6
--- /dev/null
+++ b/docs/images/nf-core-sarek_logo_light.svg
@@ -0,0 +1,262 @@
+[SVG markup omitted]
diff --git a/docs/images/nf-core_logo.png b/docs/images/nf-core_logo.png
new file mode 100644
index 0000000000..a68747357d
Binary files /dev/null and b/docs/images/nf-core_logo.png differ
diff --git a/docs/images/nf-core_logo.svg b/docs/images/nf-core_logo.svg
new file mode 100644
index 0000000000..2998cddb8e
--- /dev/null
+++ b/docs/images/nf-core_logo.svg
@@ -0,0 +1,217 @@
+[SVG text only: "nf-core"]
\ No newline at end of file
diff --git a/docs/images/sarek_icon.svg b/docs/images/sarek_icon.svg
new file mode 100644
index 0000000000..4202555569
--- /dev/null
+++ b/docs/images/sarek_icon.svg
@@ -0,0 +1,215 @@
+[SVG markup omitted]
diff --git a/docs/images/sarek_indices_subway.png b/docs/images/sarek_indices_subway.png
new file mode 100644
index 0000000000..eb7d95bc99
Binary files /dev/null and b/docs/images/sarek_indices_subway.png differ
diff --git a/docs/images/sarek_indices_subway.svg b/docs/images/sarek_indices_subway.svg
new file mode 100644
index 0000000000..3d515ca5f0
--- /dev/null
+++ b/docs/images/sarek_indices_subway.svg
@@ -0,0 +1,4936 @@
+[SVG text only: "Index preparation" subway map. Inputs: dbsnp, panel of normals, known indels, germline resource (vcf; tabix index produces tbi); fasta (fasta dictionary via gatk produces dict; fasta index via samtools produces fai; build indices via bwa and bwamem2 produce bwa indices; dragmap produces hashtable ht; msisensorpro produces microsatellites msi; cnvkit produces cnn); intervals (bed; convert). Adapted from: Fellows Yates, James A., et al. PeerJ 9 (2021).]
diff --git a/docs/images/sarek_logo.svg b/docs/images/sarek_logo.svg
new file mode 100644
index 0000000000..7a8ccdfe9b
--- /dev/null
+++ b/docs/images/sarek_logo.svg
@@ -0,0 +1,247 @@
+[SVG markup omitted]
diff --git a/docs/images/sarek_subway.png b/docs/images/sarek_subway.png
new file mode 100644
index 0000000000..02937f57e5
Binary files /dev/null and b/docs/images/sarek_subway.png differ
diff --git a/docs/images/sarek_subway.svg b/docs/images/sarek_subway.svg
new file mode 100644
index 0000000000..2a7831b02b
--- /dev/null
+++ b/docs/images/sarek_subway.svg
@@ -0,0 +1,3121 @@
+[SVG text only: pipeline subway map. Core workflow: fastq (fastqc, fastp, UMI) to mapping (bam/cram) to markduplicates to prepare recalibration to applybqsr (cram) to variant calling (vcf) to annotation. Germline variant calling: deepvariant, freebayes, haplotypecaller, manta, strelka2, tiddit, mpileup, Sentieon haplotyper, Sentieon dnascope. Tumor only variant calling: mutect2, controlfreec, cnvkit. Tumor-normal pair variant calling: ascat, msisensorpro. Annotation: ensemblvep, snpeff, bcftools annotate. QC: mosdepth, samtools, bcftools, vcftools, ngscheckmate, concatenate (germline), multiqc. Legend: Mandatory, Optional, Optionally Sentieon accelerated, Example analysis pathway; SNPs & Indels, SV & CNV, MSI. Adapted from: Fellows Yates, James A., et al. PeerJ 9 (2021).]
diff --git a/docs/images/sarek_workflow.png b/docs/images/sarek_workflow.png
new file mode 100644
index 0000000000..7fb4cd52c2
Binary files /dev/null and b/docs/images/sarek_workflow.png differ
diff --git a/docs/images/sarek_workflow.svg b/docs/images/sarek_workflow.svg
new file mode 100644
index 0000000000..5f4cbd2ddd
--- /dev/null
+++ b/docs/images/sarek_workflow.svg
@@ -0,0 +1,3659 @@
+[SVG text only: fastq to Preprocessing (based on GATK4 Best Practices, optionally accelerated with Sentieon) to cram to Variant Calling to vcf to Annotation (bcftools annotate, snpeff, vep) and Reports. Somatic: freebayes, mutect2, strelka2; manta, tiddit; ascat, cnvkit, controlfreec; msisensorpro. Germline: deepvariant, freebayes, GATK haplotypecaller, mpileup, strelka2, Sentieon haplotyper; manta, tiddit; cnvkit.]
diff --git a/docs/output.md b/docs/output.md
index 1cc990a100..ff6445e89b 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -1,45 +1,1151 @@
-# nf-core/sarek: Output
+# nf-core/sarek: Output
-## Introduction
+## Introduction

This document describes the output produced by the pipeline. Most of the plots are taken from the MultiQC report, which summarises results at the end of the pipeline.

The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory.
-
-
-## Pipeline overview
+## Pipeline overview

The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps:

-- [FastQC](#fastqc) - Raw read QC
-- [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline
-- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution
+- [Directory Structure](#directory-structure)
+- [Preprocessing](#preprocessing)
+  - [Preparation of input files (FastQ or (u)BAM)](#preparation-of-input-files-fastq-or-ubam)
+    - [Trim adapters](#trim-adapters)
+    - [Split FastQ files](#split-fastq-files)
+    - [UMI consensus](#umi-consensus)
+  - [Map to Reference](#map-to-reference)
+    - [BWA](#bwa)
+    - [BWA-mem2](#bwa-mem2)
+    - [DragMap](#dragmap)
+    - [Sentieon BWA mem](#sentieon-bwa-mem)
+  - [Mark Duplicates](#mark-duplicates)
+    - [GATK MarkDuplicates (Spark)](#gatk-markduplicates-spark)
+    - [Sentieon LocusCollector and Dedup](#sentieon-locuscollector-and-dedup)
+  - [Base Quality Score Recalibration](#base-quality-score-recalibration)
+    - [GATK BaseRecalibrator (Spark)](#gatk-baserecalibrator-spark)
+    - [GATK ApplyBQSR (Spark)](#gatk-applybqsr-spark)
+  - [CSV files](#csv-files)
+- [Variant Calling](#variant-calling)
+  - [SNVs and small indels](#snvs-and-small-indels)
+    - [bcftools](#bcftools)
+    - [DeepVariant](#deepvariant)
+    - [FreeBayes](#freebayes)
+    - [GATK HaplotypeCaller](#gatk-haplotypecaller)
+      - [GATK Germline Single Sample Variant Calling](#gatk-germline-single-sample-variant-calling)
+      - [GATK Joint Germline Variant Calling](#gatk-joint-germline-variant-calling)
+    - [GATK Mutect2](#gatk-mutect2)
+    - [Sentieon DNAscope](#sentieon-dnascope)
+      - [Sentieon DNAscope joint germline variant calling](#sentieon-dnascope-joint-germline-variant-calling)
+    - [Sentieon Haplotyper](#sentieon-haplotyper)
+      - [Sentieon Haplotyper joint germline variant calling](#sentieon-haplotyper-joint-germline-variant-calling)
+    - [Strelka2](#strelka2)
+  - [Structural Variants](#structural-variants)
+    - [Manta](#manta)
+    - [TIDDIT](#tiddit)
+  - [Sample heterogeneity, ploidy and CNVs](#sample-heterogeneity-ploidy-and-cnvs)
+    - [ASCAT](#ascat)
+    - [CNVKit](#cnvkit)
+    - [Control-FREEC](#control-freec)
+  - [Microsatellite instability (MSI)](#microsatellite-instability-msi)
+    - [MSIsensorPro](#msisensorpro)
+  - [Concatenation](#concatenation)
+- [Variant annotation](#variant-annotation)
+  - [snpEff](#snpeff)
+  - [VEP](#vep)
+  - [BCFtools annotate](#bcftools-annotate)
+- [Quality control and reporting](#quality-control-and-reporting)
+  - [Quality control](#quality-control)
+    - [FastQC](#fastqc)
+    - [FastP](#fastp)
+    - [Mosdepth](#mosdepth)
+    - [NGSCheckMate](#ngscheckmate)
+    - [GATK MarkDuplicates reports](#gatk-markduplicates-reports)
+    - [Sentieon Dedup reports](#sentieon-dedup-reports)
+    - [samtools stats](#samtools-stats)
+    - [bcftools stats](#bcftools-stats)
+    - [VCFtools](#vcftools)
+    - [snpEff reports](#snpeff-reports)
+    - [VEP reports](#vep-reports)
+  - [Reporting](#reporting)
+    - [MultiQC](#multiqc)
+    - [Pipeline information](#pipeline-information)
+- [Reference files](#reference-files)
+
+## Directory Structure
+
+The default directory structure is as follows:
+
+```
+{outdir}
+├── csv
+├── multiqc
+├── pipeline_info
+├── preprocessing
+│   ├── markduplicates
+│   │   └── <sample>
+│   ├── recal_table
+│   │   └── <sample>
+│   └── recalibrated
+│       └── <sample>
+├── reference
+└── reports
+    ├── <tool1>
+    └── <tool2>
+work/
+.nextflow.log
+```
+
+## Preprocessing
+
+Sarek pre-processes raw FastQ files or unmapped BAM files, based on
[GATK best practices](https://gatk.broadinstitute.org/hc/en-us/sections/360007226651-Best-Practices-Workflows).
+
+### Preparation of input files (FastQ or (u)BAM)
-### FastQC
+
+[FastP](https://github.com/OpenGene/fastp) is a tool designed to provide all-in-one preprocessing for FastQ files and as such is used for trimming and splitting. By default, these files are not published. However, if publishing is enabled, please be aware that these files are only published once, meaning that if trimming and splitting are both enabled, the resulting files will be sharded FastQ files with trimmed reads. If only one of them is enabled, the files contain either trimmed or split reads, respectively.
+
+#### Trim adapters
+
+[FastP](https://github.com/OpenGene/fastp) supports global trimming, which means it trims all reads in the front or the tail. This function is useful since sometimes you want to drop some cycles of a sequencing run. In the current implementation in Sarek, `--detect_adapter_for_pe` is set by default, which enables auto-detection of adapter sequences. For more information on how to fine-tune adapter trimming, take a look at the parameter docs.
+
+The resulting files are intermediate and by default not kept in the final files delivered to users. Set `--save_trimmed` to enable publishing of the files in:
+
+<details markdown="1">
Output files for all samples

**Output directory: `{outdir}/preprocessing/fastp/<sample>/`**

- `<sample>_<lane>_{1,2}.fastp.fastq.gz`
  - Bgzipped FastQ file
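For example, a run with trimming and publishing enabled might look like the following sketch. The samplesheet name and output directory are placeholders, and `--trim_fastq` is an assumed parameter name; check the parameter docs for the exact switch.

```bash
# Enable adapter trimming with FastP and publish the trimmed FastQ files.
# --trim_fastq is an assumption; adapter auto-detection is on by default.
nextflow run nf-core/sarek -profile docker \
    --input samplesheet.csv --outdir results \
    --trim_fastq --save_trimmed
```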
#### Split FastQ files

[FastP](https://github.com/OpenGene/fastp) supports splitting of one FastQ file into multiple files, allowing parallel alignment of the sharded FastQ files. To enable splitting, the number of reads per output file can be specified. For more information, take a look at the parameter `--split_fastq` in the parameter docs.

These files are intermediate and by default not kept in the final files delivered to users. Set `--save_split` to enable publishing of these files to:
Output files for all samples

**Output directory: `{outdir}/preprocessing/fastp/<sample>/`**

- `<sample>_<lane>_{1,2}.fastp.fastq.gz`
  - Bgzipped sharded FastQ files
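A minimal sketch enabling splitting, assuming the standard nf-core invocation pattern (the read count of 50 million per shard is an arbitrary example value):

```bash
# Shard each FastQ file into chunks of 50M reads and publish the shards.
nextflow run nf-core/sarek -profile docker \
    --input samplesheet.csv --outdir results \
    --split_fastq 50000000 --save_split
```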
#### UMI consensus

Sarek can process UMI reads, using [fgbio](http://fulcrumgenomics.github.io/fgbio/tools/latest/) tools.

These files are intermediate and by default not kept in the final files delivered to users. Set `--save_split` to enable publishing of these files to:
Output files for all samples

**Output directory: `{outdir}/preprocessing/umi/<sample>/`**

- UMI consensus files

**Output directory: `{outdir}/reports/umi/`**

- UMI reports
### Map to Reference

#### BWA

[BWA](https://github.com/lh3/bwa) is a software package for mapping low-divergent sequences against a large reference genome. The aligned reads are then coordinate-sorted (or name-sorted if [`GATK MarkDuplicatesSpark`](https://gatk.broadinstitute.org/hc/en-us/articles/5358833264411-MarkDuplicatesSpark) is used for duplicate marking) with [samtools](https://www.htslib.org/doc/samtools.html).

#### BWA-mem2

[BWA-mem2](https://github.com/bwa-mem2/bwa-mem2) is a software package for mapping low-divergent sequences against a large reference genome. The aligned reads are then coordinate-sorted (or name-sorted if [`GATK MarkDuplicatesSpark`](https://gatk.broadinstitute.org/hc/en-us/articles/5358833264411-MarkDuplicatesSpark) is used for duplicate marking) with [samtools](https://www.htslib.org/doc/samtools.html).

#### DragMap

[DragMap](https://github.com/Illumina/dragmap) is an open-source software implementation of the DRAGEN mapper, created by Illumina to provide an open-source way to produce the same results as their proprietary DRAGEN hardware. The aligned reads are then coordinate-sorted (or name-sorted if [`GATK MarkDuplicatesSpark`](https://gatk.broadinstitute.org/hc/en-us/articles/5358833264411-MarkDuplicatesSpark) is used for duplicate marking) with [samtools](https://www.htslib.org/doc/samtools.html).

These files are intermediate and by default not kept in the final files delivered to users. Set `--save_mapped` to enable publishing; in addition, set the flag `--save_output_as_bam` for publishing in BAM format.

#### Sentieon BWA mem

Sentieon [bwa mem](https://support.sentieon.com/manual/usages/general/#bwa-mem-syntax) is a subroutine for mapping low-divergent sequences against a large reference genome. It is part of the proprietary software package [DNAseq](https://www.sentieon.com/detailed-description-of-pipelines/#dnaseq) from [Sentieon](https://www.sentieon.com/).

The aligned reads are coordinate-sorted with Sentieon.
Output files for all mappers and samples

The alignment files (BAM or CRAM) produced by the chosen aligner are not published by default. CRAM output files will not be saved in the output-folder (`outdir`), unless the flag `--save_mapped` is used. BAM output can be selected by setting the flag `--save_output_as_bam`.

**Output directory: `{outdir}/preprocessing/mapped/<sample>/`**

- if `--save_mapped`: `<sample>.sorted.cram` and `<sample>.sorted.cram.crai`
  - CRAM file and index
- if `--save_mapped --save_output_as_bam`: `<sample>.sorted.bam` and `<sample>.sorted.bam.bai`
  - BAM file and index
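As a sketch, publishing the alignments in both CRAM and BAM could look like this (`--aligner` and its `bwa-mem2` value are assumptions; BWA is the default mapper):

```bash
# Publish the coordinate-sorted alignments, additionally converted to BAM.
nextflow run nf-core/sarek -profile docker \
    --input samplesheet.csv --outdir results \
    --aligner bwa-mem2 --save_mapped --save_output_as_bam
```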
+ +### Mark Duplicates + +During duplicate marking, read pairs that are likely to have originated from duplicates of the same original DNA fragments through some artificial processes are identified. These are considered to be non-independent observations, so all but a single read pair within each set of duplicates are marked, causing the marked pairs to be ignored by default during the variant discovery process. + +For further reading and documentation see the [data pre-processing for variant discovery from the GATK best practices](https://gatk.broadinstitute.org/hc/en-us/articles/360035535912-Data-pre-processing-for-variant-discovery). + +#### GATK MarkDuplicates (Spark) + +By default, Sarek will use [GATK MarkDuplicates](https://gatk.broadinstitute.org/hc/en-us/articles/5358880192027-MarkDuplicates-Picard-). + +To use the corresponding spark implementation [`GATK MarkDuplicatesSpark`](https://gatk.broadinstitute.org/hc/en-us/articles/5358833264411-MarkDuplicatesSpark), please specify `--use_gatk_spark markduplicates`. The resulting files are converted to CRAM with either [samtools](https://www.htslib.org/doc/samtools.html), when GATK MarkDuplicates is used, or, implicitly, by GATK MarkDuplicatesSpark. + +The resulting CRAM files are delivered to the users. + +
Output files for all samples

**Output directory: `{outdir}/preprocessing/markduplicates/<sample>/`**

- `<sample>.md.cram` and `<sample>.md.cram.crai`
  - CRAM file and index
- if `--save_output_as_bam`:
  - `<sample>.md.bam` and `<sample>.md.bam.bai`
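A sketch of switching to the Spark implementation, assuming the standard nf-core invocation (samplesheet and output paths are placeholders):

```bash
# Use GATK MarkDuplicatesSpark instead of the default GATK MarkDuplicates.
nextflow run nf-core/sarek -profile docker \
    --input samplesheet.csv --outdir results \
    --use_gatk_spark markduplicates
```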
#### Sentieon LocusCollector and Dedup

The subroutines LocusCollector and Dedup are part of Sentieon's DNAseq package, which provides speedup versions of the standard GATK tools; together, these two subroutines correspond to GATK's MarkDuplicates.

The subroutine [LocusCollector](https://support.sentieon.com/manual/usages/general/#driver-algorithm-syntax) collects read information that will be used for removing or tagging duplicate reads; its output is the score file indicating which reads are likely duplicates.

The subroutine [Dedup](https://support.sentieon.com/manual/usages/general/#dedup-algorithm) marks or removes duplicate reads based on the score file supplied by LocusCollector, and produces a BAM or CRAM file.
Output files for all samples

**Output directory: `{outdir}/preprocessing/sentieon_dedup/<sample>/`**

- `<sample>.dedup.cram` and `<sample>.dedup.cram.crai`
  - CRAM file and index
- if `--save_output_as_bam`:
  - `<sample>.dedup.bam` and `<sample>.dedup.bam.bai`
### Base Quality Score Recalibration

During Base Quality Score Recalibration, machine learning is applied to detect and correct systematic errors in the base quality scores. This is important for evaluating the correct call of a variant during the variant discovery process. However, this is not needed for all combinations of tools in Sarek. Notably, it should be turned off when using UMI-tagged reads or DragMap (see [here](https://gatk.broadinstitute.org/hc/en-us/articles/4407897446939--How-to-Run-germline-single-sample-short-variant-discovery-in-DRAGEN-mode)) as mapper.

For further reading and documentation see the [technical documentation by GATK](https://gatk.broadinstitute.org/hc/en-us/articles/360035890531-Base-Quality-Score-Recalibration-BQSR-).

#### GATK BaseRecalibrator (Spark)

[GATK BaseRecalibrator](https://gatk.broadinstitute.org/hc/en-us/articles/360042477672-BaseRecalibrator) generates a recalibration table based on various co-variates.

To use the corresponding spark implementation [GATK BaseRecalibratorSpark](https://gatk.broadinstitute.org/hc/en-us/articles/5358896138011-BaseRecalibrator), please specify `--use_gatk_spark baserecalibrator`.
Output files for all samples

**Output directory: `{outdir}/preprocessing/recal_table/<sample>/`**

- `<sample>.recal.table`
  - Recalibration table associated with the duplicate-marked CRAM file.
#### GATK ApplyBQSR (Spark)

[GATK ApplyBQSR](https://gatk.broadinstitute.org/hc/en-us/articles/5358826654875-ApplyBQSR) recalibrates the base qualities of the input reads based on the recalibration table produced by the [GATK BaseRecalibrator](#gatk-baserecalibrator-spark) tool.

To use the corresponding spark implementation [GATK ApplyBQSRSpark](https://gatk.broadinstitute.org/hc/en-us/articles/5358898266011-ApplyBQSRSpark-BETA-), please specify `--use_gatk_spark baserecalibrator`.

The resulting recalibrated CRAM files are delivered to the user. Recalibrated CRAM files are usually 2-3 times larger than the duplicate-marked CRAM files.
Output files for all samples

**Output directory: `{outdir}/preprocessing/recalibrated/<sample>/`**

- `<sample>.recal.cram` and `<sample>.recal.cram.crai`
  - CRAM file and index
- if `--save_output_as_bam`:
  - `<sample>.recal.bam` and `<sample>.recal.bam.bai`
    - BAM file and index
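Both Spark implementations, BaseRecalibratorSpark and ApplyBQSRSpark, are enabled by the same flag; a minimal sketch, assuming the standard invocation:

```bash
# Run both BQSR steps via their Spark implementations.
nextflow run nf-core/sarek -profile docker \
    --input samplesheet.csv --outdir results \
    --use_gatk_spark baserecalibrator
```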
+ +### CSV files + +The CSV files are auto-generated and can be used by Sarek for further processing and/or variant calling. + +See the [`input`](usage#input-sample-sheet-configurations) section in the usage documentation for further reading and documentation on how to make the most of them. + +
Output files:

**Output directory: `{outdir}/preprocessing/csv`**

- `mapped.csv`
  - if `--save_mapped`
  - CSV containing an entry for each sample with the columns `patient,sample,sex,status,bam,bai`
- `markduplicates_no_table.csv`
  - CSV containing an entry for each sample with the columns `patient,sample,sex,status,cram,crai`
- `markduplicates.csv`
  - CSV containing an entry for each sample with the columns `patient,sample,sex,status,cram,crai,table`
- `recalibrated.csv`
  - CSV containing an entry for each sample with the columns `patient,sample,sex,status,cram,crai`
- `variantcalled.csv`
  - CSV containing an entry for each sample with the columns `patient,sample,vcf`
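For example, one of these CSV files can be fed back to Sarek to resume downstream of mapping. In this sketch, `--step` and its `prepare_recalibration` value are assumptions; see the usage docs referenced above for the supported entry points.

```bash
# Resume from the duplicate-marked CRAMs of a previous run instead of re-mapping.
nextflow run nf-core/sarek -profile docker \
    --input results/preprocessing/csv/markduplicates.csv \
    --step prepare_recalibration --outdir results
```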
## Variant Calling

The results regarding variant calling are collected in `{outdir}/variantcalling/`.
If some results from a variant caller do not appear here, please check out the `--tools` section in the parameter [documentation](https://nf-co.re/sarek/latest/parameters).

(Recalibrated) CRAM files can be used as an input to start the variant calling.

### SNVs and small indels

For single nucleotide variants (SNVs) and small indels, multiple tools are available for normal (germline), tumor-only, and tumor-normal (somatic) paired data. For a list of the appropriate tool(s) for the data and sequencing type at hand, please check [here](usage#which-tool).

#### bcftools

[bcftools mpileup](https://samtools.github.io/bcftools/bcftools.html#mpileup) generates a pileup of a CRAM file, which is followed by [bcftools call](https://samtools.github.io/bcftools/bcftools.html#call) and filtering with `-i 'count(GT=="RR")==0'`.
For further reading and documentation see the [bcftools manual](https://samtools.github.io/bcftools/howtos/variant-calling.html).
Output files for all samples

**Output directory: `{outdir}/variantcalling/bcftools/<sample>/`**

- `<sample>.bcftools.vcf.gz` and `<sample>.bcftools.vcf.gz.tbi`
  - VCF with tabix index
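A conceptual sketch of this call chain outside the pipeline; file names are placeholders, and the exact flags Sarek passes may differ:

```bash
# Pile up, call, and filter as described above (simplified).
bcftools mpileup -f genome.fasta sample.recal.cram \
  | bcftools call -mv -Oz -o sample.bcftools.vcf.gz
bcftools index -t sample.bcftools.vcf.gz
# keep only sites where no genotype is homozygous-reference
bcftools filter -i 'count(GT=="RR")==0' sample.bcftools.vcf.gz
```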
+ +#### DeepVariant + +[DeepVariant](https://github.com/google/deepvariant) is a deep learning-based variant caller that takes aligned reads, produces pileup image tensors from them, classifies each tensor using a convolutional neural network and finally reports the results in a standard VCF or gVCF file. For further documentation take a look [here](https://github.com/google/deepvariant/tree/r1.4/docs). + +
Output files for normal samples

**Output directory: `{outdir}/variantcalling/deepvariant/<sample>/`**

- `<sample>.deepvariant.vcf.gz` and `<sample>.deepvariant.vcf.gz.tbi`
  - VCF with tabix index
- `<sample>.deepvariant.g.vcf.gz` and `<sample>.deepvariant.g.vcf.gz.tbi`
  - gVCF with tabix index
+ +#### FreeBayes + +[FreeBayes](https://github.com/ekg/freebayes) is a Bayesian genetic variant detector designed to find small polymorphisms, specifically SNPs, indels, MNPs, and complex events smaller than the length of a short-read sequencing alignment. For further reading and documentation see the [FreeBayes manual](https://github.com/ekg/freebayes/blob/master/README.md#user-manual-and-guide). + +
Output files for all samples

**Output directory: `{outdir}/variantcalling/freebayes/{sample,tumorsample_vs_normalsample}/`**

- `{sample,tumorsample_vs_normalsample}.freebayes.vcf.gz` and `{sample,tumorsample_vs_normalsample}.freebayes.vcf.gz.tbi`
  - VCF with tabix index
+ +#### GATK HaplotypeCaller + +[GATK HaplotypeCaller](https://gatk.broadinstitute.org/hc/en-us/articles/5358864757787-HaplotypeCaller) calls germline SNPs and indels via local re-assembly of haplotypes. + +
Output files for normal samples

**Output directory: `{outdir}/variantcalling/haplotypecaller/<sample>/`**

- `<sample>.haplotypecaller.vcf.gz` and `<sample>.haplotypecaller.vcf.gz.tbi`
  - VCF with tabix index
##### GATK Germline Single Sample Variant Calling

[GATK Single Sample Variant Calling](https://gatk.broadinstitute.org/hc/en-us/articles/360035535932-Germline-short-variant-discovery-SNPs-Indels-)
uses HaplotypeCaller in its default single-sample mode to call variants. The VCF that HaplotypeCaller emits errs on the side of sensitivity, so the calls are filtered by first running the [CNNScoreVariants](https://gatk.broadinstitute.org/hc/en-us/articles/5358904862107-CNNScoreVariants) tool. This tool annotates each variant with a score indicating the model's prediction of the quality of each variant. To apply filters based on those scores, run the [FilterVariantTranches](https://gatk.broadinstitute.org/hc/en-us/articles/5358928898971-FilterVariantTranches) tool with SNP and INDEL sensitivity tranches appropriate for your task.

If the haplotype-called VCF files are not filtered, make sure Sarek is run with at least one of the options `--dbsnp` or `--known_indels`.
Output files for normal samples

**Output directory: `{outdir}/variantcalling/haplotypecaller/<sample>/`**

- `<sample>.haplotypecaller.filtered.vcf.gz` and `<sample>.haplotypecaller.filtered.vcf.gz.tbi`
  - VCF with tabix index
+ +##### GATK Joint Germline Variant Calling + +[GATK Joint germline Variant Calling](https://gatk.broadinstitute.org/hc/en-us/articles/360035535932-Germline-short-variant-discovery-SNPs-Indels-) uses Haplotypecaller per sample in `gvcf` mode. Next, the gVCFs are consolidated from multiple samples into a [GenomicsDB](https://gatk.broadinstitute.org/hc/en-us/articles/5358869876891-GenomicsDBImport) datastore. After joint [genotyping](https://gatk.broadinstitute.org/hc/en-us/articles/5358906861083-GenotypeGVCFs), [VQSR](https://gatk.broadinstitute.org/hc/en-us/articles/5358906115227-VariantRecalibrator) is applied for filtering to produce the final multisample callset with the desired balance of precision and sensitivity. + +
Output files from joint germline variant calling

**Output directory: `{outdir}/variantcalling/haplotypecaller/<sample>/`**

- `<sample>.haplotypecaller.g.vcf.gz` and `<sample>.haplotypecaller.g.vcf.gz.tbi`
  - gVCF with tabix index

**Output directory: `{outdir}/variantcalling/haplotypecaller/joint_variant_calling/`**

- `joint_germline.vcf.gz` and `joint_germline.vcf.gz.tbi`
  - VCF with tabix index
- `joint_germline_recalibrated.vcf.gz` and `joint_germline_recalibrated.vcf.gz.tbi`
  - variant recalibrated VCF with tabix index (if VQSR is applied)
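A sketch of enabling the joint workflow across all normal samples (`--joint_germline` is an assumed parameter name; check the parameter docs):

```bash
# Joint germline calling: per-sample gVCFs, GenomicsDB consolidation, joint genotyping.
nextflow run nf-core/sarek -profile docker \
    --input samplesheet.csv --outdir results \
    --tools haplotypecaller --joint_germline
```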
+ +#### GATK Mutect2 + +[GATK Mutect2](https://gatk.broadinstitute.org/hc/en-us/articles/5358911630107-Mutect2) calls somatic SNVs and indels via local assembly of haplotypes. +When `--joint_mutect2` is used, Mutect2 subworkflow outputs will be saved in a subfolder named with the patient ID and `{patient}.mutect2.vcf.gz` file will contain variant calls from all of the normal and tumor samples of the patient. +For further reading and documentation see the [Mutect2 manual](https://gatk.broadinstitute.org/hc/en-us/articles/360035531132). +It is not required, but recommended to have a [panel of normals (PON)](https://gatk.broadinstitute.org/hc/en-us/articles/360035890631-Panel-of-Normals-PON) using at least 40 normal samples to get filtered somatic calls. When using `--genome GATK.GRCh38`, a panel-of-normals file is available. However, it is _highly_ recommended to create one matching your tumor samples. Creating your own panel-of-normals is currently not natively supported by the pipeline. See [here](https://gatk.broadinstitute.org/hc/en-us/articles/360035531132) for how to create one manually. + +
+Output files for tumor-only and tumor/normal paired samples + +**Output directory: `{outdir}/variantcalling/mutect2/{sample,tumorsample_vs_normalsample,patient}/`** + +Files created: + +- `{sample,tumorsample_vs_normalsample,patient}.mutect2.vcf.gz` and `{sample,tumorsample_vs_normalsample,patient}.mutect2.vcf.gz.tbi` + - unfiltered (raw) Mutect2 calls VCF with tabix index +- `{sample,tumorsample_vs_normalsample,patient}.mutect2.vcf.gz.stats` + - a stats file generated during calling of raw variants (needed for filtering) +- `{sample,tumorsample_vs_normalsample}.mutect2.contamination.table` + - table calculating the fraction of reads coming from cross-sample contamination +- `{sample,tumorsample_vs_normalsample}.mutect2.segmentation.table` + - table containing segmentation of the tumor by minor allele fraction +- `{sample,tumorsample_vs_normalsample,patient}.mutect2.artifactprior.tar.gz` + - prior probabilities for read orientation artifacts +- `{sample,tumorsample,normalsample}.mutect2.pileups.table` + - tabulates pileup metrics for inferring contamination +- `{sample,tumorsample_vs_normalsample,patient}.mutect2.filtered.vcf.gz` and `{sample,tumorsample_vs_normalsample,patient}.mutect2.filtered.vcf.gz.tbi` + - filtered Mutect2 calls VCF with tabix index based on the probability that a variant is somatic +- `{sample,tumorsample_vs_normalsample,patient}.mutect2.filtered.vcf.gz.filteringStats.tsv` + - a stats file generated during the filtering of Mutect2 called variants + +
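A sketch combining the options mentioned above; `--pon` is an assumed parameter name for supplying a custom panel-of-normals file:

```bash
# Somatic calling with Mutect2, a custom panel of normals, and per-patient joint calling.
nextflow run nf-core/sarek -profile docker \
    --input samplesheet.csv --outdir results \
    --tools mutect2 --joint_mutect2 --pon my_pon.vcf.gz
```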
+ +#### Sentieon DNAscope + +[Sentieon DNAscope](https://support.sentieon.com/appnotes/dnascope_ml/#dnascope-germline-variant-calling-with-a-machine-learning-model) is a variant-caller which aims at outperforming GATK's Haplotypecaller in terms of both speed and accuracy. DNAscope allows you to use a machine learning model to perform variant calling with higher accuracy by improving the candidate detection and filtering. + +
Unfiltered VCF-files for normal samples

**Output directory: `{outdir}/variantcalling/sentieon_dnascope/<sample>/`**

- `<sample>.dnascope.unfiltered.vcf.gz` and `<sample>.dnascope.unfiltered.vcf.gz.tbi`
  - VCF with tabix index
The output from Sentieon's DNAscope can be controlled through the option `--sentieon_dnascope_emit_mode` for Sarek, see [Basic usage of Sentieon functions](#basic-usage-of-sentieon-functions).

Unless `dnascope_filter` is listed under `--skip_tools` in the Nextflow command, Sentieon's [DNAModelApply](https://support.sentieon.com/manual/usages/general/#dnamodelapply-algorithm) is applied to the unfiltered VCF-files in order to obtain filtered VCF-files.
Filtered VCF-files for normal samples

**Output directory: `{outdir}/variantcalling/sentieon_dnascope/<sample>/`**

- `<sample>.dnascope.filtered.vcf.gz` and `<sample>.dnascope.filtered.vcf.gz.tbi`
  - VCF with tabix index
##### Sentieon DNAscope joint germline variant calling

In Sentieon's package DNAscope, joint germline variant calling is done by first running Sentieon's DNAscope in emit-mode `gvcf` for each sample and then running Sentieon's [GVCFtyper](https://support.sentieon.com/manual/usages/general/#gvcftyper-algorithm) on the set of gVCF-files. See [Basic usage of Sentieon functions](#basic-usage-of-sentieon-functions) for information on how joint germline variant calling can be done in Sarek using Sentieon's DNAscope.
Output files from joint germline variant calling

**Output directory: `{outdir}/variantcalling/sentieon_dnascope/<sample>/`**

- `<sample>.dnascope.g.vcf.gz` and `<sample>.dnascope.g.vcf.gz.tbi`
  - gVCF with tabix index

**Output directory: `{outdir}/variantcalling/sentieon_dnascope/joint_variant_calling/`**

- `joint_germline.vcf.gz` and `joint_germline.vcf.gz.tbi`
  - VCF with tabix index
#### Sentieon Haplotyper

[Sentieon Haplotyper](https://support.sentieon.com/manual/usages/general/#haplotyper-algorithm) is Sentieon's speedup version of GATK's HaplotypeCaller (see above).
Unfiltered VCF-files for normal samples

**Output directory: `{outdir}/variantcalling/sentieon_haplotyper/<sample>/`**

- `<sample>.haplotyper.unfiltered.vcf.gz` and `<sample>.haplotyper.unfiltered.vcf.gz.tbi`
  - VCF with tabix index
The output from Sentieon's Haplotyper can be controlled through the option `--sentieon_haplotyper_emit_mode` for Sarek, see [Basic usage of Sentieon functions](#basic-usage-of-sentieon-functions).

Unless `haplotyper_filter` is listed under `--skip_tools` in the Nextflow command, GATK's CNNScoreVariants and FilterVariantTranches (see above) are applied to the unfiltered VCF-files in order to obtain filtered VCF-files.
Filtered VCF-files for normal samples

**Output directory: `{outdir}/variantcalling/sentieon_haplotyper/<sample>/`**

- `<sample>.haplotyper.filtered.vcf.gz` and `<sample>.haplotyper.filtered.vcf.gz.tbi`
  - VCF with tabix index
+ +##### Sentieon Haplotyper joint germline variant calling + +In Sentieon's package DNAseq, joint germline variant calling is done by first running Sentieon's Haplotyper in emit-mode `gvcf` for each sample and then running Sentieon's [GVCFtyper](https://support.sentieon.com/manual/usages/general/#gvcftyper-algorithm) on the set of gVCF-files. See [Basic usage of Sentieon functions](#basic-usage-of-sentieon-functions) for information on how joint germline variant calling can be done in Sarek using Sentieon's DNAseq. After joint genotyping, Sentieon's version of VQSR ([VarCal](https://support.sentieon.com/manual/usages/general/#varcal-algorithm) and [ApplyVarCal](https://support.sentieon.com/manual/usages/general/#applyvarcal-algorithm)) is applied for filtering to produce the final multisample callset with the desired balance of precision and sensitivity. + +
Output files from joint germline variant calling

**Output directory: `{outdir}/variantcalling/sentieon_haplotyper/<sample>/`**

- `<sample>.haplotyper.g.vcf.gz` and `<sample>.haplotyper.g.vcf.gz.tbi`
  - gVCF with tabix index

**Output directory: `{outdir}/variantcalling/sentieon_haplotyper/joint_variant_calling/`**

- `joint_germline.vcf.gz` and `joint_germline.vcf.gz.tbi`
  - VCF with tabix index
- `joint_germline_recalibrated.vcf.gz` and `joint_germline_recalibrated.vcf.gz.tbi`
  - variant recalibrated VCF with tabix index (if VarCal is applied)
+ +#### Strelka2 + +[Strelka2](https://github.com/Illumina/strelka) is a fast and accurate small variant caller optimized for analysis of germline variation in small cohorts and somatic variation in tumor/normal sample pairs. For further reading and documentation see the [Strelka2 user guide](https://github.com/Illumina/strelka/blob/master/docs/userGuide/README.md). If [Strelka2](https://github.com/Illumina/strelka) is used for somatic variant calling and [Manta](https://github.com/Illumina/manta) is also specified in tools, the output candidate indels from [Manta](https://github.com/Illumina/manta) are used according to [Strelka Best Practices](https://github.com/Illumina/strelka/blob/master/docs/userGuide/README.md#somatic-configuration-example). +For further downstream analysis, take a look [here](https://github.com/Illumina/strelka/blob/v2.9.x/docs/userGuide/README.md#interpreting-the-germline-multi-sample-variants-vcf). + +
Output files for all single samples (normal or tumor-only)

**Output directory: `{outdir}/variantcalling/strelka/<sample>/`**

- `<sample>.strelka.genome.vcf.gz` and `<sample>.strelka.genome.vcf.gz.tbi`
  - genome VCF with tabix index
- `<sample>.strelka.variants.vcf.gz` and `<sample>.strelka.variants.vcf.gz.tbi`
  - VCF with tabix index with all potential variant loci across the sample. Note this file includes non-variant loci if they have a non-trivial level of variant evidence or contain one or more alleles for which genotyping has been forced.
+ +
Output files for tumor/normal paired samples

**Output directory: `{outdir}/variantcalling/strelka/<tumorsample_vs_normalsample>/`**

- `<tumorsample_vs_normalsample>.strelka.somatic_indels.vcf.gz` and `<tumorsample_vs_normalsample>.strelka.somatic_indels.vcf.gz.tbi`
  - VCF with tabix index with all somatic indels inferred in the tumor sample.
- `<tumorsample_vs_normalsample>.strelka.somatic_snvs.vcf.gz` and `<tumorsample_vs_normalsample>.strelka.somatic_snvs.vcf.gz.tbi`
  - VCF with tabix index with all somatic SNVs inferred in the tumor sample.
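Since the two callers cooperate, a typical somatic run requests both; a minimal sketch, assuming the standard invocation:

```bash
# Run Strelka2 together with Manta so Manta's candidate indels feed into Strelka2.
nextflow run nf-core/sarek -profile docker \
    --input samplesheet.csv --outdir results \
    --tools strelka,manta
```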
+ +### Structural Variants + +#### Manta + +[Manta](https://github.com/Illumina/manta) calls structural variants (SVs) and indels from mapped paired-end sequencing reads. +It is optimized for analysis of germline variation in small sets of individuals and somatic variation in tumor/normal sample pairs. +[Manta](https://github.com/Illumina/manta) provides a candidate list for small indels that can be fed to [Strelka2](https://github.com/Illumina/strelka) following [Strelka Best Practices](https://github.com/Illumina/strelka/blob/master/docs/userGuide/README.md#somatic-configuration-example). For further reading and documentation see the [Manta user guide](https://github.com/Illumina/manta/blob/master/docs/userGuide/README.md). + +
Output files for normal samples

**Output directory: `{outdir}/variantcalling/manta/<sample>/`**

- `<sample>.manta.diploid_sv.vcf.gz` and `<sample>.manta.diploid_sv.vcf.gz.tbi`
  - VCF with tabix index containing SVs and indels scored and genotyped under a diploid model for the sample.
+ +
Output files for tumor-only samples

**Output directory: `{outdir}/variantcalling/manta/<sample>/`**

- `<sample>.manta.tumor_sv.vcf.gz` and `<sample>.manta.tumor_sv.vcf.gz.tbi`
  - VCF with tabix index containing a subset of the candidateSV.vcf.gz file after removing redundant candidates and small indels less than the minimum scored variant size (50 by default). The SVs are not scored, but include additional details: (1) paired and split read supporting evidence counts for each allele (2) a subset of the filters from the scored tumor-normal model are applied to the single tumor case to improve precision.
+ +
Output files for tumor/normal paired samples

**Output directory: `{outdir}/variantcalling/manta/<tumorsample_vs_normalsample>/`**

- `<tumorsample_vs_normalsample>.manta.diploid_sv.vcf.gz` and `<tumorsample_vs_normalsample>.manta.diploid_sv.vcf.gz.tbi`
  - VCF with tabix index containing SVs and indels scored and genotyped under a diploid model for the sample. In the case of a tumor/normal subtraction, the scores in this file do not reflect any information from the tumor sample.
- `<tumorsample_vs_normalsample>.manta.somatic_sv.vcf.gz` and `<tumorsample_vs_normalsample>.manta.somatic_sv.vcf.gz.tbi`
  - VCF with tabix index containing SVs and indels scored under a somatic variant model.
#### TIDDIT

[TIDDIT](https://github.com/SciLifeLab/TIDDIT) identifies intra- and inter-chromosomal translocations, deletions, tandem-duplications and inversions. For further reading and documentation see the [TIDDIT manual](https://github.com/SciLifeLab/TIDDIT/blob/master/README.md).
Output files for normal and tumor-only samples

**Output directory: `{outdir}/variantcalling/tiddit/<sample>/`**

- `<sample>.tiddit.vcf.gz` and `<sample>.tiddit.vcf.gz.tbi`
  - VCF with tabix index containing SV calls
- `<sample>.tiddit.ploidies.tab`
  - tab file describing the estimated ploidy and coverage across each contig
+ +
Output files for tumor/normal paired samples

**Output directory: `{outdir}/variantcalling/tiddit/<tumorsample_vs_normalsample>/`**

- `<normalsample>.tiddit.normal.vcf.gz` and `<normalsample>.tiddit.normal.vcf.gz.tbi`
  - VCF with tabix index containing SV calls
- `<tumorsample>.tiddit.tumor.vcf.gz` and `<tumorsample>.tiddit.tumor.vcf.gz.tbi`
  - VCF with tabix index containing SV calls
- `<tumorsample_vs_normalsample>_sv_merge.tiddit.vcf.gz` and `<tumorsample_vs_normalsample>_sv_merge.tiddit.vcf.gz.tbi`
  - merged tumor/normal VCF with tabix index
- `<tumorsample_vs_normalsample>.tiddit.ploidies.tab`
  - tab file describing the estimated ploidy and coverage across each contig
+ +### Sample heterogeneity, ploidy and CNVs + +#### ASCAT + +[ASCAT](https://github.com/VanLoo-lab/ascat) is a software for performing allele-specific copy number analysis of tumor samples and for estimating tumor ploidy and purity (normal contamination). +It infers tumor purity and ploidy and calculates whole-genome allele-specific copy number profiles. +The [ASCAT](https://github.com/VanLoo-lab/ascat) process gives several images as output, described in detail in this [book chapter](http://www.ncbi.nlm.nih.gov/pubmed/22130873). +Running ASCAT on NGS data requires that the BAM files are converted into BAF and LogR values. +This is done internally using the software [AlleleCount](https://github.com/cancerit/alleleCount). For further reading and documentation see the [ASCAT manual](https://www.crick.ac.uk/research/labs/peter-van-loo/software). + +
Output files for tumor/normal paired samples

**Output directory: `{outdir}/variantcalling/ascat/<tumorsample_vs_normalsample>/`**

- `<tumorsample_vs_normalsample>.tumour.ASCATprofile.png`
  - image with information about allele-specific copy number profile
- `<tumorsample_vs_normalsample>.tumour.ASPCF.png`
  - image with information about allele-specific copy number segmentation
- `<tumorsample_vs_normalsample>.before_correction_Tumour.<tumorsample_vs_normalsample>.tumour.png`
  - image with information about raw profile of tumor sample of logR and BAF values before GC correction
- `<tumorsample_vs_normalsample>.before_correction_Tumour.<tumorsample_vs_normalsample>.germline.png`
  - image with information about raw profile of normal sample of logR and BAF values before GC correction
- `<tumorsample_vs_normalsample>.after_correction_GC_Tumour.<tumorsample_vs_normalsample>.tumour.png`
  - image with information about GC and RT corrected logR and BAF values of tumor sample after GC correction
- `<tumorsample_vs_normalsample>.after_correction_GC_Tumour.<tumorsample_vs_normalsample>.germline.png`
  - image with information about GC and RT corrected logR and BAF values of normal sample after GC correction
- `<tumorsample_vs_normalsample>.tumour.sunrise.png`
  - image visualising the range of ploidy and tumor percentage values
- `<tumorsample_vs_normalsample>.metrics.txt`
  - file with information about different metrics from ASCAT profiles
- `<tumorsample_vs_normalsample>.cnvs.txt`
  - file with information about CNVs
- `<tumorsample_vs_normalsample>.purityploidy.txt`
  - file with information about purity and ploidy
- `<tumorsample_vs_normalsample>.segments.txt`
  - file with information about copy number segments
- `<tumorsample_vs_normalsample>.tumour_tumourBAF.txt` and `<tumorsample_vs_normalsample>.tumour_normalBAF.txt`
  - file with beta allele frequencies
- `<tumorsample_vs_normalsample>.tumour_tumourLogR.txt` and `<tumorsample_vs_normalsample>.tumour_normalLogR.txt`
  - file with total copy number on a logarithmic scale

The text file `<tumorsample_vs_normalsample>.cnvs.txt` contains predictions about copy number state for all the segments.
The output is a tab-delimited text file with the following columns:

- _chr_: chromosome number
- _startpos_: start position of the segment
- _endpos_: end position of the segment
- _nMajor_: number of copies of one of the alleles (for example the chromosome inherited from one parent)
- _nMinor_: number of copies of the other allele (for example the chromosome inherited from the other parent)

The file `<tumorsample_vs_normalsample>.cnvs.txt` contains all segments predicted by ASCAT, both those with normal copy number (nMinor = 1 and nMajor = 1) and those corresponding to copy number aberrations.
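Given the column layout above, downstream filtering is straightforward; for instance, a small sketch listing candidate LOH segments (the `tumor_vs_normal` prefix and path are placeholders):

```bash
# List segments with complete loss of one allele (nMinor == 0), i.e. candidate LOH regions.
awk -F'\t' 'NR > 1 && $5 == 0 { print $1, $2, $3 }' \
    results/variantcalling/ascat/tumor_vs_normal/tumor_vs_normal.cnvs.txt
```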
+ +#### CNVKit + +[CNVKit](https://cnvkit.readthedocs.io/en/stable/) is a toolkit to infer and visualize copy number from high-throughput DNA sequencing data. It is designed for use with hybrid capture, including both whole-exome and custom target panels, and short-read sequencing platforms such as Illumina. For further reading and documentation, see the [CNVKit Documentation](https://cnvkit.readthedocs.io/en/stable/plots.html) + +
Output files for normal and tumor-only samples

**Output directory: `{outdir}/variantcalling/cnvkit/<sample>/`**

- `<sample>.antitargetcoverage.cnn`
  - file containing coverage information
- `<sample>.targetcoverage.cnn`
  - file containing coverage information
- `<sample>-diagram.pdf`
  - file with plot of copy numbers or segments on chromosomes
- `<sample>-scatter.png`
  - file with plot of bin-level log2 coverages and segmentation calls
- `<sample>.bintest.cns`
  - file containing copy number segment information
- `<sample>.cnr`
  - file containing copy number ratio information
- `<sample>.cns`
  - file containing copy number segment information
- `<sample>.call.cns`
  - file containing copy number segment information
- `<sample>.genemetrics.tsv`
  - file containing per gene copy number information (if input files are annotated)
+ +
Output files for tumor/normal samples

**Output directory: `{outdir}/variantcalling/cnvkit/<tumorsample_vs_normalsample>/`**

- `<normalsample>.antitargetcoverage.cnn`
  - file containing coverage information
- `<normalsample>.targetcoverage.cnn`
  - file containing coverage information
- `<tumorsample>.antitargetcoverage.cnn`
  - file containing coverage information
- `<tumorsample>.targetcoverage.cnn`
  - file containing coverage information
- `<tumorsample>.bintest.cns`
  - file containing copy number segment information
- `<tumorsample>-scatter.png`
  - file with plot of bin-level log2 coverages and segmentation calls
- `<tumorsample>-diagram.pdf`
  - file with plot of copy numbers or segments on chromosomes
- `<tumorsample>.cnr`
  - file containing copy number ratio information
- `<tumorsample>.cns`
  - file containing copy number segment information
- `<tumorsample>.call.cns`
  - file containing copy number segment information
- `<tumorsample>.genemetrics.tsv`
  - file containing per gene copy number information (if input files are annotated)
#### Control-FREEC

[Control-FREEC](https://github.com/BoevaLab/FREEC) is a tool for detection of copy-number changes and allelic imbalances (including loss of heterozygosity (LOH)) using deep-sequencing data.
[Control-FREEC](https://github.com/BoevaLab/FREEC) automatically computes, normalizes and segments copy number and beta allele frequency profiles, then calls copy number alterations and LOH.
It also detects subclonal gains and losses and evaluates the most likely average ploidy of the sample. For further reading and documentation see the [Control-FREEC Documentation](http://boevalab.inf.ethz.ch/FREEC/tutorial.html).
Output files for tumor-only and tumor/normal paired samples

**Output directory: `{outdir}/variantcalling/controlfreec/{tumorsample,tumorsample_vs_normalsample}/`**

- `config.txt`
  - Configuration file used to run Control-FREEC
- `<tumorsample>_BAF.png` and `<tumorsample_vs_normalsample>_BAF.png`
  - image of BAF plot
- `<tumorsample>_ratio.log2.png` and `<tumorsample_vs_normalsample>_ratio.log2.png`
  - image of ratio log2 plot
- `<tumorsample>_ratio.png` and `<tumorsample_vs_normalsample>_ratio.png`
  - image of ratio plot
- `<tumorsample>.bed` and `<tumorsample_vs_normalsample>.bed`
  - translated output to a .BED file (so to view it in the UCSC Genome Browser)
- `<tumorsample>.circos.txt` and `<tumorsample_vs_normalsample>.circos.txt`
  - translated output to the Circos format
- `<tumorsample>.p.value.txt` and `<tumorsample_vs_normalsample>.p.value.txt`
  - CNV file containing p-values for each call
- `<tumorsample>_BAF.txt` and `<tumorsample>.mpileup.gz_BAF.txt`
  - file with beta allele frequencies for each possibly heterozygous SNP position
- `<tumorsample>.tumor.mpileup.gz_CNVs`
  - file with coordinates of predicted copy number alterations
- `<tumorsample>_info.txt` and `<tumorsample>.tumor.mpileup.gz_info.txt`
  - parsable file with information about the FREEC run
- `<tumorsample>_ratio.BedGraph` and `<tumorsample>.tumor.mpileup.gz_ratio.BedGraph`
  - file with ratios in BedGraph format for visualization in the UCSC genome browser. The file contains tracks for normal copy number, gains and losses, and copy neutral LOH (\*).
- `<tumorsample>_ratio.txt` and `<tumorsample>.tumor.mpileup.gz_ratio.txt`
  - file with ratios and predicted copy number alterations for each window
- `<tumorsample>_sample.cpn` and `<tumorsample>.tumor.mpileup.gz_sample.cpn`
  - files with raw copy number profiles for the tumor sample
- `<normalsample>.normal.mpileup.gz_control.cpn`
  - files with raw copy number profiles for the control sample
- `<GC_profile>.cpn`
  - file with GC-content profile
+ +### Microsatellite instability (MSI) + +[Microsatellite instability](https://en.wikipedia.org/wiki/Microsatellite_instability) is a genetic condition associated with deficiencies in the mismatch repair (MMR) system which causes a tendency to accumulate a high number of mutations (SNVs and indels). +An altered distribution of microsatellite length is associated with a missed replication slippage which would be corrected under normal MMR conditions. + +#### MSIsensorPro + +[MSIsensorPro](https://github.com/xjtu-omics/msisensor-pro) is a tool to detect the MSI status of a tumor scanning the length of the microsatellite regions. +It requires a normal sample for each tumour to differentiate the somatic and germline cases. For further reading see the [MSIsensor paper](https://www.ncbi.nlm.nih.gov/pubmed/24371154). + +
Output files for tumor/normal paired samples

**Output directory: `{outdir}/variantcalling/msisensor/<tumorsample_vs_normalsample>/`**

- `<tumorsample_vs_normalsample>`
  - MSI score output, contains information about the number of somatic sites.
- `<tumorsample_vs_normalsample>_dis`
  - The normal and tumor length distribution for each microsatellite position.
- `<tumorsample_vs_normalsample>_germline`
  - Germline sites detected.
- `<tumorsample_vs_normalsample>_somatic`
  - Somatic sites detected.
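A sketch of enabling MSI calling, assuming the standard invocation; remember that only tumor/normal pairs in the samplesheet produce output:

```bash
# MSIsensorPro needs a matched normal for each tumor sample.
nextflow run nf-core/sarek -profile docker \
    --input tumor_normal_pairs.csv --outdir results \
    --tools msisensorpro
```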
+ +### Concatenation + +Germline VCFs from `DeepVariant`, `FreeBayes`, `HaplotypeCaller`, `Haplotyper`, `Manta`, `bcftools mpileup`, `Strelka2`, or `Tiddit` are concatenated with `bcftools concat`. The field `SOURCE` is added to the VCF header to report the variant caller. + +
Concatenated VCF-files for normal samples

**Output directory: `{outdir}/variantcalling/concat/<sample>/`**

- `<sample>.germline.vcf.gz` and `<sample>.germline.vcf.gz.tbi`
  - VCF with tabix index
+ +## Variant annotation + +This directory contains results from the final annotation steps: two tools are used for annotation, [snpEff](http://snpeff.sourceforge.net/) and [VEP](https://www.ensembl.org/info/docs/tools/vep/index.html). Both results can also be combined by setting `--tools merge`. +All variants present in the called VCF files are annotated. For some variant callers this can mean that the variants are already filtered by `PASS`, for some this needs to be done during post-processing. + +### snpEff + +[snpeff](http://snpeff.sourceforge.net/) is a genetic variant annotation and effect prediction toolbox. +It annotates and predicts the effects of variants on genes (such as amino acid changes) using multiple databases for annotations. +The generated VCF header contains the software version and the used command line. For further reading and documentation see the [snpEff manual](http://snpeff.sourceforge.net/SnpEff_manual.html#outputSummary). + +
Output files for all samples

**Output directory: `{outdir}/annotation/{sample,tumorsample_vs_normalsample}`**

- `{sample,tumorsample_vs_normalsample}.<snpeff_db>_snpEff.ann.vcf.gz` and `{sample,tumorsample_vs_normalsample}.<snpeff_db>_snpEff.ann.vcf.gz.tbi`
  - VCF with tabix index
### VEP

[VEP (Variant Effect Predictor)](https://www.ensembl.org/info/docs/tools/vep/index.html), based on `Ensembl`, is a tool to determine the effects of all sorts of variants, including SNPs, indels, structural variants and CNVs.
The generated VCF header contains the software version, as well as the version numbers of additional databases such as [Clinvar](https://www.ncbi.nlm.nih.gov/clinvar/) or [dbSNP](https://www.ncbi.nlm.nih.gov/snp/) used in the [VEP](https://www.ensembl.org/info/docs/tools/vep/index.html) line.
The format of the [consequence annotations](https://www.ensembl.org/info/genome/variation/prediction/predicted_data.html) is described in the VCF header in the `INFO` field.
For further reading and documentation see the [VEP manual](https://www.ensembl.org/info/docs/tools/vep/index.html).

Currently, it contains:

- _Consequence_: impact of the variation, if there is any
- _Codons_: the codon change, i.e. cGt/cAt
- _Amino_acids_: change in amino acids, i.e. R/H if there is any
- _Gene_: ENSEMBL gene name
- _SYMBOL_: gene symbol
- _Feature_: actual transcript name
- _EXON_: affected exon
- _PolyPhen_: prediction based on [PolyPhen](http://genetics.bwh.harvard.edu/pph2/)
- _SIFT_: prediction by [SIFT](http://sift.bii.a-star.edu.sg/)
- _Protein_position_: relative position of the amino acid in the protein
- _BIOTYPE_: biotype of transcript or regulatory feature

plus any additional fields selected via the plugins: [dbNSFP](https://sites.google.com/site/jpopgen/dbNSFP), [LOFTEE](https://github.com/konradjk/loftee), [SpliceAI](https://spliceailookup.broadinstitute.org/), [SpliceRegion](https://www.ensembl.info/2018/10/26/cool-stuff-the-vep-can-do-splice-site-variant-annotation/).
Output files for all samples

**Output directory: `{outdir}/annotation/{sample,tumorsample_vs_normalsample}`**

- `{sample,tumorsample_vs_normalsample}.<vep_db>_VEP.ann.vcf.gz` and `{sample,tumorsample_vs_normalsample}.<vep_db>_VEP.ann.vcf.gz.tbi`
  - VCF with tabix index
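A sketch requesting both annotators plus the merged output mentioned above; `--step annotate` is an assumption for running annotation on existing VCF files:

```bash
# Annotate with both snpEff and VEP and additionally produce the combined VCF.
nextflow run nf-core/sarek -profile docker \
    --input samplesheet.csv --outdir results \
    --step annotate --tools snpeff,vep,merge
```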
### BCFtools annotate

[BCFtools annotate](https://samtools.github.io/bcftools/bcftools.html#annotate) is used to add annotations to VCF files. The annotations are added to the `INFO` column of the VCF, and the VCF header is updated accordingly. For further reading and documentation see the [BCFtools annotate manual](https://samtools.github.io/bcftools/bcftools.html#annotate).
Output files for all samples

- `{sample,tumorsample_vs_normalsample}.<db>_bcf.ann.vcf.gz` and `{sample,tumorsample_vs_normalsample}.<db>_bcf.ann.vcf.gz.tbi`
  - VCF with tabix index
## Quality control and reporting

### Quality control

#### FastQC

[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/).

The plots display:

- Sequence counts for each sample.
- Sequence Quality Histograms: the mean quality value across each base position in the read.
- Per Sequence Quality Scores: the number of reads with average quality scores. Shows if a subset of reads has poor quality.
- Per Base Sequence Content: the proportion of each base position for which each of the four normal DNA bases has been called.
- Per Sequence GC Content: the average GC content of reads. Normal random libraries typically have a roughly normal distribution of GC content.
- Per Base N Content: the percentage of base calls at each position for which an N was called.
- Sequence Length Distribution.
- Sequence Duplication Levels: the relative level of duplication found for each sequence.
- Overrepresented sequences: the total amount of overrepresented sequences found in each library.
- Adapter Content: the cumulative percentage count of the proportion of your library which has seen each of the adapter sequences at each position.
Output files for all samples

:::note
The FastQC plots displayed in the [MultiQC](https://multiqc.info/) report show _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality.
:::

**Output directory: `{outdir}/reports/fastqc/<sample>/`**

- `<sample>_1_fastqc.html` and `<sample>_2_fastqc.html`
  - [FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) report containing quality metrics for your untrimmed raw FastQ files
- `<sample>_1_fastqc.zip` and `<sample>_2_fastqc.zip`
  - Zip archive containing the [FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) report, tab-delimited data file and plot images
+ +#### FastP + +[FastP](https://github.com/OpenGene/fastp) is a tool designed to provide all-in-one preprocessing for FastQ files and is used for trimming and splitting. The tool then determines QC metrics for the processed reads. + +
Output files for all samples

**Output directory: `{outdir}/reports/fastp/<sample>/`**

- `<sample>_fastp.html`
  - report in HTML format
- `<sample>_fastp.json`
  - report in JSON format
- `<sample>_fastp.log`
  - fastp log file
+ +#### Mosdepth + +[Mosdepth](https://github.com/brentp/mosdepth) reports information for the evaluation of the quality of the provided alignment data. +In short, the basic statistics of the alignment (number of reads, coverage, GC-content, etc.) are summarized and a number of useful graphs are produced. +For further reading and documentation see the [Mosdepth documentation](https://github.com/brentp/mosdepth). + +Plots will show: + +- cumulative coverage distribution +- absolute coverage distribution +- average coverage per contig/chromosome + +
Output files for all samples

**Output directory: `{outdir}/reports/mosdepth/`**

- `<sample>.{sorted,md,recal}.mosdepth.global.dist.txt`
  - file used by [MultiQC](https://multiqc.info/), if the `.region` file does not exist
- `<sample>.{sorted,md,recal}.mosdepth.region.dist.txt`
  - file used by [MultiQC](https://multiqc.info/)
- `<sample>.{sorted,md,recal}.mosdepth.summary.txt`
  - a summary of mean depths per chromosome and within specified regions per chromosome
- `<sample>.{sorted,md,recal}.{per-base,regions}.bed.gz`
  - per-base depth for targeted data, per-window (500bp) depth of WGS
- `<sample>.{sorted,md,recal}.regions.bed.gz.csi`
  - CSI index for per-base depth for targeted data, per-window (500bp) depth of WGS
#### NGSCheckMate

[NGSCheckMate](https://github.com/parklab/NGSCheckMate) is a tool for determining whether samples come from the same genetic individual, using a set of commonly heterozygous SNPs. This enables the detection of sample mislabelling events. The output includes a text file indicating whether samples have matched or not according to the algorithm, as well as a dendrogram visualising these results.
Output files for all samples

**Output directory: `{outdir}/reports/ngscheckmate/`**

- `ngscheckmate_all.txt`
  - Tab-delimited text file listing all the comparisons made, whether they were considered a match, with the correlation and a normalised depth.
- `ngscheckmate_matched.txt`
  - Tab-delimited text file listing only the comparisons that were considered a match, with the correlation and a normalised depth.
- `ngscheckmate_output_corr_matrix.txt`
  - Tab-delimited text file containing a matrix of all correlations for all comparisons made.
- `vcfs/<sample>.vcf.gz`
  - Set of VCF files, one per sample, containing calls for the set of SNP positions used to calculate sample relatedness.
+ +#### GATK MarkDuplicates reports + +More information in the [GATK MarkDuplicates section](#gatk-markduplicates) + +Duplicates can arise during sample preparation _e.g._ library construction using PCR. +Duplicate reads can also result from a single amplification cluster, incorrectly detected as multiple clusters by the optical sensor of the sequencing instrument. +These duplication artifacts are referred to as optical duplicates. If [GATK MarkDuplicates](https://gatk.broadinstitute.org/hc/en-us/articles/5358880192027-MarkDuplicates-Picard-) is used, the metrics file generated by the tool is used, if [`GATK MarkDuplicatesSpark`](https://gatk.broadinstitute.org/hc/en-us/articles/5358833264411-MarkDuplicatesSpark) is used the report is generated by [GATK4 EstimateLibraryComplexity](https://gatk.broadinstitute.org/hc/en-us/articles/5358838684187-EstimateLibraryComplexity-Picard-) on the mapped BAM files. +For further reading and documentation see the [MarkDuplicates manual](https://software.broadinstitute.org/gatk/documentation/tooldocs/4.1.2.0/picard_sam_markduplicates_MarkDuplicates.php). + +The plot will show: + +- duplication statistics + +
Output files for all samples

**Output directory: `{outdir}/reports/markduplicates/`**

- `<sample>.md.cram.metrics`
  - file used by [MultiQC](https://multiqc.info/)
+ +#### Sentieon Dedup reports + +Sentieon's DNAseq subroutine Dedup produces a metrics report much like the one produced by GATK's MarkDuplicates. The Dedup metrics are imported into MultiQC as custom content and displayed in a table. + +
Output files for all samples

**Output directory: `{outdir}/reports/sentieon_dedup/`**

- `<sample>.dedup.cram.metrics`
  - file used by [MultiQC](https://multiqc.info/).
+ +#### samtools stats + +[samtools stats](https://www.htslib.org/doc/samtools.html) collects statistics from CRAM files and outputs in a text format. +For further reading and documentation see the [`samtools` manual](https://www.htslib.org/doc/samtools.html#COMMANDS_AND_OPTIONS). + +The plots will show: + +- Alignment metrics. + +
Output files for all samples

**Output directory: `{outdir}/reports/samtools/`**

- `<sample>.{sorted,md,recal}.samtools.stats.out`
  - Raw statistics used by `MultiQC`
+ +#### bcftools stats + +[bcftools stats](https://samtools.github.io/bcftools/bcftools.html#stats) produces a statistics text file which is suitable for machine processing and can be plotted using plot-vcfstats. +For further reading and documentation see the [bcftools stats manual](https://samtools.github.io/bcftools/bcftools.html#stats). + +Plots will show: + +- Stats by non-reference allele frequency, depth distribution, stats by quality and per-sample counts, singleton stats, etc. +- Note: When using [Strelka2](https://github.com/Illumina/strelka), there will be no depth distribution plot, as Strelka2 does not report the INFO/DP field + +
Output files for all samples

**Output directory: `{outdir}/reports/bcftools/`**

- `<sample>.<variantcaller>.bcftools_stats.txt`
  - Raw statistics used by `MultiQC`
+ +#### VCFtools + +[VCFtools](https://vcftools.github.io/) is a program package designed for working with VCF files. For further reading and documentation see the [VCFtools manual](https://vcftools.github.io/man_latest.html#OUTPUT%20OPTIONS). + +Plots will show: + +- the summary counts of each type of transition to transversion ratio for each `FILTER` category. +- the transition to transversion ratio as a function of alternative allele count (using only bi-allelic SNPs). +- the transition to transversion ratio as a function of SNP quality threshold (using only bi-allelic SNPs). + +
Output files for all samples

**Output directory: `{outdir}/reports/vcftools/`**

- `<sample>.<variantcaller>.FILTER.summary`
  - Raw statistics used by `MultiQC` with a summary of the number of SNPs and Ts/Tv ratio for each FILTER category
- `<sample>.<variantcaller>.TsTv.count`
  - Raw statistics used by `MultiQC` with the Transition / Transversion ratio as a function of alternative allele count. Only uses bi-allelic SNPs.
- `<sample>.<variantcaller>.TsTv.qual`
  - Raw statistics used by `MultiQC` with the Transition / Transversion ratio as a function of SNP quality threshold. Only uses bi-allelic SNPs.
#### snpEff reports

[snpeff](http://snpeff.sourceforge.net/) is a genetic variant annotation and effect prediction toolbox.
It annotates and predicts the effects of variants on genes (such as amino acid changes) using multiple databases for annotations. For further reading and documentation see the [snpEff manual](http://snpeff.sourceforge.net/SnpEff_manual.html#outputSummary).

The plots will show:

- locations of detected variants in the genome and the number of variants for each location.
- the putative impact of detected variants and the number of variants for each impact.
- the effect of variants at protein level and the number of variants for each effect type.
- the quantity as a function of the variant quality score.
Output files for all samples

**Output directory: `{outdir}/reports/SnpEff/{sample,tumorsample_vs_normalsample}/<snpeff_db>/`**

- `{sample,tumorsample_vs_normalsample}.<snpeff_db>_snpEff.csv`
  - Raw statistics used by [MultiQC](http://multiqc.info)
- `{sample,tumorsample_vs_normalsample}.<snpeff_db>_snpEff.html`
  - Statistics to be visualised with a web browser
- `{sample,tumorsample_vs_normalsample}.<snpeff_db>_snpEff.genes.txt`
  - TXT (tab separated) summary counts for variants affecting each transcript and gene
+ +#### VEP reports + +[VEP (Variant Effect Predictor)](https://www.ensembl.org/info/docs/tools/vep/index.html), based on `Ensembl`, is a tool to determine the effects of all sorts of variants, including SNPs, indels, structural variants, CNVs. For further reading and documentation see the [VEP manual](https://www.ensembl.org/info/docs/tools/vep/index.html) + +
Output files for all samples

**Output directory: `{outdir}/reports/EnsemblVEP/{sample,tumorsample_vs_normalsample}/<vep_db>/`**

- `{sample,tumorsample_vs_normalsample}.<vep_db>_VEP.summary.html`
  - Summary of the VEP run to be visualised with a web browser
### Reporting

#### MultiQC

[MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarizing all samples in your project.
Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory.
Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see the [MultiQC documentation](http://multiqc.info).
Output files

**Output directory: `{outdir}/multiqc/`**

- `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser.
- `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline.
- `multiqc_plots/`: directory containing static images from the report in various formats.
### Pipeline information

Output files

- `pipeline_info/`
  - Reports generated by Nextflow: `execution_report_<timestamp>.html`, `execution_timeline_<timestamp>.html`, `execution_trace_<timestamp>.txt`, `pipeline_dag_<timestamp>.dot`/`pipeline_dag_<timestamp>.svg` and `manifest_<timestamp>.bco.json`.
  - Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.yml`. The `pipeline_report*` files will only be present if the `--email` / `--email_on_fail` parameters are used when running the pipeline.
  - Parameters used by the pipeline run: `params_<timestamp>.json`.

[Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage.

## Reference files

Contains reference folders generated by the pipeline. These files are only published if `--save_reference` is set.
Output files

- `bwa/`
  - Index corresponding to the [BWA](https://github.com/lh3/bwa) aligner
- `bwamem2/`
  - Index corresponding to the [BWA-mem2](https://github.com/bwa-mem2/bwa-mem2) aligner
- `cnvkit/`
  - Reference files generated by [CNVKit](https://cnvkit.readthedocs.io/en/stable/)
- `dragmap/`
  - Index corresponding to the [DragMap](https://github.com/Illumina/dragmap) aligner
- `dbsnp/`
  - Tabix index generated by [Tabix](http://www.htslib.org/doc/tabix.html) from the given dbsnp file
- `dict/`
  - Sequence dictionary generated by [GATK4 CreateSequenceDictionary](https://gatk.broadinstitute.org/hc/en-us/articles/5358872471963-CreateSequenceDictionary-Picard-) from the given fasta
- `fai/`
  - Fasta index generated with [samtools faidx](http://www.htslib.org/doc/samtools-faidx.html) from the given fasta
- `germline_resource/`
  - Tabix index generated by [Tabix](http://www.htslib.org/doc/tabix.html) from the given germline resource file
- `intervals/`
  - Bed files in various stages: .bed, .bed.gz, .bed per chromosome, .bed.gz per chromosome
- `known_indels/`
  - Tabix index generated by [Tabix](http://www.htslib.org/doc/tabix.html) from the given known indels file
- `msi/`
  - [MSIsensorPro](https://github.com/xjtu-omics/msisensor-pro) scan of the reference genome to get microsatellites information
- `pon/`
  - Tabix index generated by [Tabix](http://www.htslib.org/doc/tabix.html) from the given panel-of-normals file
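A sketch of a run that publishes these folders for later re-use, assuming the standard invocation:

```bash
# Publish the generated indices and auxiliary reference files under {outdir}/reference.
nextflow run nf-core/sarek -profile docker \
    --input samplesheet.csv --outdir results \
    --save_reference
```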
diff --git a/docs/posters/EMBO_2022_FHanssen.pdf b/docs/posters/EMBO_2022_FHanssen.pdf new file mode 100644 index 0000000000..068e837c29 Binary files /dev/null and b/docs/posters/EMBO_2022_FHanssen.pdf differ
diff --git a/docs/posters/ESHG_2017_Mgarcia.pdf b/docs/posters/ESHG_2017_Mgarcia.pdf new file mode 100644 index 0000000000..618fc37a36 Binary files /dev/null and b/docs/posters/ESHG_2017_Mgarcia.pdf differ
diff --git a/docs/posters/ESHG_2017_Mgarcia.svg b/docs/posters/ESHG_2017_Mgarcia.svg new file mode 100644 index 0000000000..d8be275f2f --- /dev/null +++ b/docs/posters/ESHG_2017_Mgarcia.svg @@ -0,0 +1,11017 @@
+[SVG poster, 11,017 lines of vector source omitted: "Cancer Analysis Workflow to process normal/tumor WGS data" (CAW), Maxime Garcia, Szilveszter Juhos, Malin Larsson, et al., ESHG 2017]
diff --git a/docs/posters/ESHG_2019.pdf b/docs/posters/ESHG_2019.pdf new file mode 100644 index 0000000000..d65b0415f4 Binary files /dev/null and b/docs/posters/ESHG_2019.pdf differ
diff --git a/docs/posters/ESHG_2019.svg b/docs/posters/ESHG_2019.svg new file mode 100644 index 0000000000..3aaf7052f5 --- /dev/null +++ b/docs/posters/ESHG_2019.svg @@ -0,0 +1,5580 @@
+[SVG poster, 5,580 lines of vector source omitted: "Analysis of genome sequencing data with a minimal investment IT-infrastructure", Johannes Alneberg, Maxime Ulysse Garcia, Alexander Peltzer, et al., ESHG 2019]
diff --git a/docs/posters/ISMB_ECCB_2023_FHanssen.pdf b/docs/posters/ISMB_ECCB_2023_FHanssen.pdf new file mode 100644 index 0000000000..e99c617057 Binary files /dev/null and b/docs/posters/ISMB_ECCB_2023_FHanssen.pdf differ
diff --git a/docs/posters/NextflowSummit_2022_FHanssen.pdf b/docs/posters/NextflowSummit_2022_FHanssen.pdf new file mode 100644 index 0000000000..aa3f91d072 Binary files /dev/null and b/docs/posters/NextflowSummit_2022_FHanssen.pdf differ
diff --git a/docs/posters/PMC_2018_Mgarcia.pdf b/docs/posters/PMC_2018_Mgarcia.pdf new file mode 100644 index 0000000000..cb7686ffd0 Binary files /dev/null and b/docs/posters/PMC_2018_Mgarcia.pdf differ
diff --git a/docs/posters/PMC_2018_Mgarcia.svg b/docs/posters/PMC_2018_Mgarcia.svg new file mode 100644 index 0000000000..9201e86332 --- /dev/null +++ b/docs/posters/PMC_2018_Mgarcia.svg @@ -0,0 +1,12095 @@
+[SVG poster, 12,095 lines of vector source omitted: "Sarek, a workflow for WGS analysis of germline and somatic mutations", Maxime Garcia, Szilveszter Juhos, Malin Larsson, et al., PMC 2018]
diff --git a/docs/posters/QBiC_Symposium_2022_FHanssen.pdf b/docs/posters/QBiC_Symposium_2022_FHanssen.pdf new file mode 100644 index 0000000000..c981734fac Binary files /dev/null and b/docs/posters/QBiC_Symposium_2022_FHanssen.pdf differ
diff --git a/docs/posters/iFIT_Poster_2021_FHanssen.pdf b/docs/posters/iFIT_Poster_2021_FHanssen.pdf new file mode 100644 index 0000000000..96f57bea78 Binary files /dev/null and b/docs/posters/iFIT_Poster_2021_FHanssen.pdf differ
diff --git a/docs/posters/poster_tubit_2021_FHanssen.pdf b/docs/posters/poster_tubit_2021_FHanssen.pdf new file mode 100644 index 0000000000..6ad3a6be50 Binary files /dev/null and b/docs/posters/poster_tubit_2021_FHanssen.pdf differ
diff --git a/docs/posters/poster_tubit_2021_FHanssen.svg b/docs/posters/poster_tubit_2021_FHanssen.svg new file mode 100644 index 0000000000..e3b87ad176 --- /dev/null +++ b/docs/posters/poster_tubit_2021_FHanssen.svg @@ -0,0 +1,3598 @@
+[SVG poster, 3,598 lines of vector source omitted: "Optimization of nf-core/sarek for large-scale analysis of public cancer data in the cloud", Friederike Hanssen, Maxime Garcia, Gisela Gabernet, Sven Nahnsen, 2021]
diff --git a/docs/usage.md b/docs/usage.md index a33f46329d..1937daa301 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -4,63 +4,31 @@ > _Documentation of pipeline parameters is generated automatically from the pipeline schema and can no longer be found in markdown files._
-## Introduction
+# Introduction
-
+Sarek is a workflow designed to detect germline and somatic variants on whole genome, whole exome, or targeted sequencing data.
-## Samplesheet input
+Initially designed for human and mouse, it can work on any species if a reference genome is available.
+Sarek is designed to handle single samples, such as single-normal or single-tumor samples, and tumor-normal pairs including additional relapses.

-You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below.
+# Running the pipeline

-```bash
---input '[path to samplesheet file]'
-```
-
-### Multiple runs of the same sample
-
-The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 3 lanes:
-
-```csv title="samplesheet.csv"
-sample,fastq_1,fastq_2
-CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz
-CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz
-CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz
-```
-
-### Full samplesheet
+## Quickstart

-The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 3 columns to match those defined in the table below.
-
-A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 6 samples, where `TREATMENT_REP3` has been sequenced twice.
+The typical command for running the pipeline is as follows:

-```csv title="samplesheet.csv"
-sample,fastq_1,fastq_2
-CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz
-CONTROL_REP2,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz
-CONTROL_REP3,AEG588A3_S3_L002_R1_001.fastq.gz,AEG588A3_S3_L002_R2_001.fastq.gz
-TREATMENT_REP1,AEG588A4_S4_L003_R1_001.fastq.gz,
-TREATMENT_REP2,AEG588A5_S5_L003_R1_001.fastq.gz,
-TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz,
-TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz,
+```bash
+nextflow run nf-core/sarek -r <VERSION> -profile <PROFILE> --input ./samplesheet.csv --outdir ./results --genome GATK.GRCh38 --tools <TOOLS>
```

-| Column | Description |
-| --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). |
-| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". |
-| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". |
+`-r <VERSION>` is optional but strongly recommended for reproducibility and should match the latest version.

-An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.
+`-profile <PROFILE>` is mandatory and should reflect either your own institutional profile or any pipeline profile specified in the [profile section](##-profile).
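+For example, a filled-in invocation could look like the following sketch (the version number, profile, and tool selection are illustrative placeholders, not recommendations):
+
+```bash
+# Illustrative only: pin a release, pick a container profile, and select tools.
+nextflow run nf-core/sarek -r 3.3.2 -profile docker \
+  --input ./samplesheet.csv \
+  --outdir ./results \
+  --genome GATK.GRCh38 \
+  --tools strelka,mutect2,vep
+```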
-## Running the pipeline
+This documentation implies that any `nextflow run nf-core/sarek` command is run with the appropriate `-r` and `-profile` options.

-The typical command for running the pipeline is as follows:
-
-```bash
-nextflow run nf-core/sarek --input ./samplesheet.csv --outdir ./results --genome GRCh37 -profile docker
-```
-
-This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles.
+This will launch the pipeline and perform variant calling with the tools specified in `--tools`; see the [parameter section](https://nf-co.re/sarek/latest/parameters#tools) for details on variant calling tools.
+With `-profile docker`, for example, the pipeline runs with the `docker` configuration profile. See below for more information about profiles.

Note that the pipeline will create the following files in your working directory:

@@ -82,7 +50,7 @@ Do not use `-c <file>` to specify parameters as this will result in errors. Cust

The above pipeline run specified with a params file in yaml format:

```bash
-nextflow run nf-core/sarek -profile docker -params-file params.yaml
+nextflow run nf-core/sarek -params-file params.yaml
```

with `params.yaml` containing:

@@ -90,25 +58,278 @@

```yaml
input: './samplesheet.csv'
outdir: './results/'
-genome: 'GRCh37'
+genome: 'GATK.GRCh38'
<...>
```

You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch).

-### Updating the pipeline
+## Input: Sample sheet configurations
+
+You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use the parameter `--input` to specify its location. It has to be a comma-separated file with at least 3 columns, and a header row as shown in the examples below.
+
+It is recommended to use the absolute path of the files, but a relative path should also work.
+
+If necessary, a tumor sample can be associated with a normal sample as a pair, if specified with the same `patient` ID, a different `sample`, and the respective `status`.
+An additional tumor sample (such as a relapse, for example) can be added if specified with the same `patient` ID, a different `sample`, and the `status` value `1`.
+
+Sarek will output results in a different directory for _each sample_.
+If multiple sample IDs are specified in the CSV file, Sarek will consider all files to be from different samples.
+
+Output from Variant Calling and/or Annotation will be in a specific directory for each sample and tool configuration (or normal/tumor pair if applicable).
+
+### Overview: Samplesheet Columns
+
+| Column | Description |
+| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `patient` | **Custom patient ID**; designates the patient/subject; must be unique for each patient, but one patient can have multiple samples (e.g. normal and tumor). <br /> _Required_ |
+| `sex` | **Sex chromosomes of the patient**; i.e. XX, XY, etc.; only used for Copy-Number Variation analysis in a tumor/normal pair. <br /> _Optional, Default: `NA`_ |
+| `status` | **Normal/tumor status of sample**; can be `0` (normal) or `1` (tumor). <br /> _Optional, Default: `0`_ |
+| `sample` | **Custom sample ID** for each tumor and normal sample; more than one tumor sample for each subject is possible, i.e. a tumor and a relapse; samples can have multiple lanes for which the _same_ ID must be used to merge them later (see also `lane`). Sample IDs must be unique for unique biological samples. <br /> _Required_ |
+| `lane` | Lane ID, used when the `sample` is multiplexed on several lanes. Must be unique for each lane in the same sample (but does not need to be the original lane name), and must contain at least one character. <br /> _Required for `--step mapping`_ |
+| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension `.fastq.gz` or `.fq.gz`. |
+| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension `.fastq.gz` or `.fq.gz`. |
+| `bam` | Full path to (u)BAM file |
+| `bai` | Full path to BAM index file |
+| `cram` | Full path to CRAM file |
+| `crai` | Full path to CRAM index file |
+| `table` | Full path to recalibration table file |
+| `vcf` | Full path to VCF file |
+
+An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.
+
+### Start with mapping (`--step mapping` [default])
+
+This step can be started either from FastQ files or (u)BAMs. The CSV must contain at least the columns `patient`, `sample`, `lane`, and either `fastq_1`/`fastq_2` or `bam`.
+
+#### Examples
+
+Minimal config file:
+
+```bash
+patient,sample,lane,fastq_1,fastq_2
+patient1,test_sample,lane_1,test_1.fastq.gz,test_2.fastq.gz
+```
+
+```bash
+patient,sample,lane,bam
+patient1,test_sample,lane_1,test.bam
+```
+
+In this example, the sample is multiplexed over three lanes:
+
+```bash
+patient,sample,lane,fastq_1,fastq_2
+patient1,test_sample,lane_1,test_L001_1.fastq.gz,test_L001_2.fastq.gz
+patient1,test_sample,lane_2,test_L002_1.fastq.gz,test_L002_2.fastq.gz
+patient1,test_sample,lane_3,test_L003_1.fastq.gz,test_L003_2.fastq.gz
+```
+
+```bash
+patient,sample,lane,bam
+patient1,test_sample,1,test_L001.bam
+patient1,test_sample,2,test_L002.bam
+patient1,test_sample,3,test_L003.bam
+```
+
+#### Full samplesheet
+
+In this example, all possible columns are used. There are three lanes for the normal sample, two for the tumor sample, and one for the relapse sample, including the `sex` and `status` information per patient:
+
+```bash
+patient,sex,status,sample,lane,fastq_1,fastq_2
+patient1,XX,0,normal_sample,lane_1,test_L001_1.fastq.gz,test_L001_2.fastq.gz
+patient1,XX,0,normal_sample,lane_2,test_L002_1.fastq.gz,test_L002_2.fastq.gz
+patient1,XX,0,normal_sample,lane_3,test_L003_1.fastq.gz,test_L003_2.fastq.gz
+patient1,XX,1,tumor_sample,lane_1,test2_L001_1.fastq.gz,test2_L001_2.fastq.gz
+patient1,XX,1,tumor_sample,lane_2,test2_L002_1.fastq.gz,test2_L002_2.fastq.gz
+patient1,XX,1,relapse_sample,lane_1,test3_L001_1.fastq.gz,test3_L001_2.fastq.gz
+```
+
+```bash
+patient,sex,status,sample,lane,bam
+patient1,XX,0,normal_sample,lane_1,test_L001.bam
+patient1,XX,0,normal_sample,lane_2,test_L002.bam
+patient1,XX,0,normal_sample,lane_3,test_L003.bam
+patient1,XX,1,tumor_sample,lane_1,test2_L001.bam
+patient1,XX,1,tumor_sample,lane_2,test2_L002.bam
+patient1,XX,1,relapse_sample,lane_1,test3_L001.bam
+```
+
+### Start with duplicate marking (`--step markduplicates`)
+
+#### Duplicate Marking
+
+For starting from duplicate marking, the CSV file must contain at least the columns `patient`, `sample`, `bam`, `bai` or `patient`, `sample`, `cram`, `crai`.
+
+> **NB:** When using [GATK4 MarkduplicatesSpark](https://gatk.broadinstitute.org/hc/en-us/articles/5358833264411-MarkDuplicatesSpark), reads should be name-sorted for efficient execution.
+
+Example:
+
+```bash
+patient,sample,bam,bai
+patient1,test_sample,test_mapped.bam,test_mapped.bam.bai
+```
+
+```bash
+patient,sample,cram,crai
+patient1,test_sample,test_mapped.cram,test_mapped.cram.crai
+```
+
+The Sarek-generated CSV file is stored under `results/csv/mapped.csv` if `--save_mapped` was set in a previous run, and will automatically be used as an input when
specifying the parameter `--step markduplicates`. Otherwise this file will need to be generated manually.
+
+#### Full samplesheet
+
+In this example, all possible columns are used including the `sex` and `status` information per patient:
+
+```bash
+patient,sex,status,sample,bam,bai
+patient1,XX,0,test_sample,test_mapped.bam,test_mapped.bam.bai
+patient1,XX,1,tumor_sample,test2_mapped.bam,test2_mapped.bam.bai
+patient1,XX,1,relapse_sample,test3_mapped.bam,test3_mapped.bam.bai
+```
+
+```bash
+patient,sex,status,sample,cram,crai
+patient1,XX,0,normal_sample,test_mapped.cram,test_mapped.cram.crai
+patient1,XX,1,tumor_sample,test2_mapped.cram,test2_mapped.cram.crai
+patient1,XX,1,relapse_sample,test3_mapped.cram,test3_mapped.cram.crai
+```
+
+### Start with preparing the recalibration tables (`--step prepare_recalibration`)
+
+For starting directly from preparing the recalibration tables, the CSV file must contain at least the columns `patient`, `sample`, `bam`, `bai` or `patient`, `sample`, `cram`, `crai`.
+
+Example:
+
+```bash
+patient,sample,bam,bai
+patient1,test_sample,test_md.bam,test_md.bam.bai
+```
+
+```bash
+patient,sample,cram,crai
+patient1,test_sample,test_md.cram,test_md.cram.crai
+```
+
+The Sarek-generated CSV file is stored under `results/csv/markduplicates_no_table.csv` and will automatically be used as an input when specifying the parameter `--step prepare_recalibration`.
+
+#### Full samplesheet
+
+In this example, all possible columns are used including the `sex` and `status` information per patient:
+
+```bash
+patient,sex,status,sample,bam,bai
+patient1,XX,0,test_sample,test_md.bam,test_md.bam.bai
+patient1,XX,1,tumor_sample,test2_md.bam,test2_md.bam.bai
+patient1,XX,1,relapse_sample,test3_md.bam,test3_md.bam.bai
+```
+
+```bash
+patient,sex,status,sample,cram,crai
+patient1,XX,0,normal_sample,test_md.cram,test_md.cram.crai
+patient1,XX,1,tumor_sample,test2_md.cram,test2_md.cram.crai
+patient1,XX,1,relapse_sample,test3_md.cram,test3_md.cram.crai
+```
+
+### Start with base quality score recalibration (`--step recalibrate`)
+
+For starting from base quality score recalibration, the CSV file must contain at least the columns `patient`, `sample`, `bam`, `bai`, `table` or `patient`, `sample`, `cram`, `crai`, `table` containing the paths to _non-recalibrated CRAM/BAM_ files and the associated recalibration table.
+
+Example:
+
+```bash
+patient,sample,bam,bai,table
+patient1,test_sample,test_mapped.bam,test_mapped.bam.bai,test.table
+```
+
+```bash
+patient,sample,cram,crai,table
+patient1,test_sample,test_mapped.cram,test_mapped.cram.crai,test.table
+```
+
+The Sarek-generated CSV file is stored under `results/csv/markduplicates.csv` and will automatically be used as an input when specifying the parameter `--step recalibrate`.
+
+#### Full samplesheet
-When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since.
To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline:
+In this example, all possible columns are used including the `sex` and `status` information per patient:
+
+```bash
+patient,sex,status,sample,cram,crai,table
+patient1,XX,0,test_sample,test_mapped.cram,test_mapped.cram.crai,test.table
+patient1,XX,1,tumor_sample,test2_mapped.cram,test2_mapped.cram.crai,test2.table
+patient1,XX,1,relapse_sample,test3_mapped.cram,test3_mapped.cram.crai,test3.table
+```
+
+### Start with variant calling (`--step variant_calling`)
+
+For starting from the variant calling step, the CSV file must contain at least the columns `patient`, `sample`, `bam`, `bai` or `patient`, `sample`, `cram`, `crai`.
+
+Example:
+
+```bash
+patient,sample,bam,bai
+patient1,test_sample,test_mapped.bam,test_mapped.bam.bai
+```
+
+```bash
+patient,sample,cram,crai
+patient1,test_sample,test_mapped.cram,test_mapped.cram.crai
+```
+
+The Sarek-generated CSV file is stored under `results/csv/recalibrated.csv` and will automatically be used as an input when specifying the parameter `--step variant_calling`.
+
+#### Full samplesheet
+
+In this example, all possible columns are used including the `sex` and `status` information per patient:
+
+```bash
+patient,sex,status,sample,cram,crai
+patient1,XX,0,normal_sample,test_mapped.cram,test_mapped.cram.crai
+patient1,XX,1,tumor_sample,test2_mapped.cram,test2_mapped.cram.crai
+patient1,XX,1,relapse_sample,test3_mapped.cram,test3_mapped.cram.crai
+```
+
+### Start with annotation (`--step annotate`)
+
+For starting from the annotation step, the CSV file must contain at least the columns `patient`, `sample`, `vcf`.
+
+As Sarek will use [bgzip](http://www.htslib.org/doc/bgzip.html) and [tabix](http://www.htslib.org/doc/tabix.html) to compress and index the annotated VCF files, it expects the input VCF files to be sorted and compressed.
+
+Example:
+
+```bash
+patient,sample,vcf
+patient1,test_sample,test.vcf.gz
+```
+
+The Sarek-generated CSV file is stored under `results/csv/variantcalled.csv` and will automatically be used as an input when specifying the parameter `--step annotate`.
+
+#### Full samplesheet
+
+In this example, all possible columns are used including the `variantcaller` information per sample:
+
+```bash
+patient,sample,variantcaller,vcf
+test,sample3,strelka,sample3.variants.vcf.gz
+test,sample4_vs_sample3,manta,sample4_vs_sample3.diploid_sv.vcf.gz
+test,sample4_vs_sample3,manta,sample4_vs_sample3.somatic_sv.vcf.gz
+```
+
+## Updating the pipeline
+
+When you launch a pipeline from the command line with `nextflow run nf-core/sarek -params-file params.yaml`, Nextflow will automatically pull the pipeline code from GitHub and store it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline:

```bash
nextflow pull nf-core/sarek
```

-### Reproducibility
+## Reproducibility

It is a good idea to specify a pipeline version when running the pipeline on your data. This ensures that a specific version of the pipeline code and software are used when you run your pipeline. If you keep using the same tag, you'll be running the same version of the pipeline, even if there have been changes to the code since.
-First, go to the [nf-core/sarek releases page](https://github.com/nf-core/sarek/releases) and find the latest pipeline version - numeric only (eg. `1.3.1`). Then specify this when running the pipeline with `-r` (one hyphen) - eg. `-r 1.3.1`. Of course, you can switch to another version by changing the number after the `-r` flag.
+First, go to the [nf-core/sarek releases page](https://github.com/nf-core/sarek/releases) and find the latest version number - numeric only (eg. `3.3.2`).
+Then specify this when running the pipeline with `-r` (one hyphen) - eg. `-r 3.3.2`. Of course, you can switch to another version by changing the number after the `-r` flag.

This version number will be logged in reports when you run the pipeline, so that you'll know what you used when you look back in the future. For example, at the bottom of the MultiQC reports.

@@ -118,15 +339,16 @@ To further assist in reproducibility, you can share and re-use [parameter fil

If you wish to share such a profile (such as uploading it as supplementary material for academic publications), make sure to NOT include cluster-specific paths to files, nor institution-specific profiles.
:::

-## Core Nextflow arguments
+# Core Nextflow arguments

:::note
These options are part of Nextflow and use a _single_ hyphen (pipeline parameters use a double-hyphen).
:::

-### `-profile`
+## `-profile`

-Use this parameter to choose a configuration profile. Profiles can give configuration presets for different compute environments.
+Use this parameter to choose a configuration profile.
+Profiles can give configuration presets for different compute environments.

Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Shifter, Charliecloud, Apptainer, Conda) - see below.

@@ -134,7 +356,8 @@ We highly recommend the use of Docker or Singularity containers for full pipeline reproducibility, however when this is not possible, Conda is also supported.
:::

-The pipeline also dynamically loads configurations from [https://github.com/nf-core/configs](https://github.com/nf-core/configs) when it runs, making multiple config profiles for various institutional clusters available at run time. For more information and to see if your system is available in these configs please see the [nf-core/configs documentation](https://github.com/nf-core/configs#documentation).
+The pipeline also dynamically loads configurations from [github.com/nf-core/configs](https://github.com/nf-core/configs) when it runs, making multiple config profiles for various institutional clusters available at run time.
+For more information and to see if your system is available in these configs please see the [nf-core/configs documentation](https://github.com/nf-core/configs#documentation).

Note that multiple profiles can be loaded, for example: `-profile test,docker` - the order of arguments is important! They are loaded in sequence, so later profiles can overwrite earlier profiles.

@@ -159,19 +382,37 @@ If `-profile` is not specified, the pipeline will run locally and expect all sof

- `conda`
  - A generic configuration profile to be used with [Conda](https://conda.io/docs/). Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker, Singularity, Podman, Shifter, Charliecloud, or Apptainer.

-### `-resume`
+## `-resume`

Specify this when restarting a pipeline.
Nextflow will use cached results from any pipeline steps where the inputs are the same, continuing from where it got to previously. For input to be considered the same, not only the names must be identical but the files' contents as well. For more info about this parameter, see [this blog post](https://www.nextflow.io/blog/2019/demystifying-nextflow-resume.html).

You can also supply a run name to resume a specific run: `-resume [run-name]`. Use the `nextflow log` command to show previous run names.

-### `-c`
+## `-c`

Specify the path to a specific config file (this is a core Nextflow command). See the [nf-core website documentation](https://nf-co.re/usage/configuration) for more information.

-## Custom configuration
+## Nextflow memory requirements
+
+In some cases, the Nextflow Java virtual machines can start to request a large amount of memory.
+We recommend adding the following line to your environment to limit this (typically in `~/.bashrc` or `~/.bash_profile`):
+
+```bash
+NXF_OPTS='-Xms1g -Xmx4g'
+```
+
+## Running in the background
+
+Nextflow handles job submissions and supervises the running jobs. The Nextflow process must run until the pipeline is finished.
+
+The Nextflow `-bg` flag launches Nextflow in the background, detached from your terminal so that the workflow does not stop if you log out of your session. The logs are saved to a file.
+
+Alternatively, you can use `screen` / `tmux` or a similar tool to create a detached session which you can log back into at a later time.
+Some HPC setups also allow you to run Nextflow within a cluster job submitted to your job scheduler (from where it submits more jobs).

-### Resource requests
+# Custom configuration
+
+## Resource requests

Whilst the default requirements set within the pipeline will hopefully work for most people and with most input data, you may find that you want to customise the compute resources that the pipeline requests. Each step in the pipeline has a default set of requirements for number of CPUs, memory and time. For most of the steps in the pipeline, if the job exits with any of the error codes specified [here](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/conf/base.config#L18) it will automatically be resubmitted with higher requests (2 x original, then 3 x original). If it still fails after the third attempt then the pipeline execution is stopped.

@@ -189,7 +430,7 @@ A pipeline might not always support every possible argument or option of a parti

To learn how to provide additional arguments to a particular tool of the pipeline, please see the [customising tool arguments](https://nf-co.re/docs/usage/configuration#customising-tool-arguments) section of the nf-core website.

-### nf-core/configs
+## nf-core/configs

In most cases, you will only need to create a custom config as a one-off, but if you and others within your organisation are likely to be running nf-core pipelines regularly and need to use the same settings, it may be a good idea to request that your custom config file is uploaded to the `nf-core/configs` git repository. Before you do this, please test that the config file works with your pipeline of choice using the `-c` parameter.
You can then create a pull request to the `nf-core/configs` repository with the addition of your config file, associated documentation file (see examples in [`nf-core/configs/docs`](https://github.com/nf-core/configs/tree/master/docs)), and amending [`nfcore_custom.config`](https://github.com/nf-core/configs/blob/master/nfcore_custom.config) to include your custom profile.

@@ -205,20 +446,695 @@ We recommend providing a compute `params.vm_type` of `Standard_D16_v3` VMs by de

Note that the choice of VM size depends on your quota and the overall workload during the analysis. For a thorough list, please refer to the [Azure Sizes for virtual machines in Azure](https://docs.microsoft.com/en-us/azure/virtual-machines/sizes).

-## Running in the background
+# Troubleshooting & FAQ

-Nextflow handles job submissions and supervises the running jobs. The Nextflow process must run until the pipeline is finished.
+## How to test the pipeline

-The Nextflow `-bg` flag launches Nextflow in the background, detached from your terminal so that the workflow does not stop if you log out of your session. The logs are saved to a file.
+When using default parameters only, Sarek runs preprocessing and `Strelka2`.
+This is reflected in the default test profile:

-Alternatively, you can use `screen` / `tmux` or a similar tool to create a detached session which you can log back into at a later time.
-Some HPC setups also allow you to run Nextflow within a cluster job submitted to your job scheduler (from where it submits more jobs).
+```bash
+nextflow run nf-core/sarek -profile test,<docker/singularity/conda> --outdir results
+```

-## Nextflow memory requirements
+Expected run output:

-In some cases, the Nextflow Java virtual machines can start to request a large amount of memory.
-We recommend adding the following line to your environment to limit this (typically in `~/.bashrc` or `~/.bash_profile`):
+```bash
+[85/6b7739] process > NFCORE_SAREK:SAREK:PREPARE_GENOME:BWAMEM1_INDEX (genome.fasta) [100%] 1 of 1 ✔
+[- ] process > NFCORE_SAREK:SAREK:PREPARE_GENOME:BWAMEM2_INDEX -
+[- ] process > NFCORE_SAREK:SAREK:PREPARE_GENOME:DRAGMAP_HASHTABLE -
+[22/cf54a8] process > NFCORE_SAREK:SAREK:PREPARE_GENOME:GATK4_CREATESEQUENCEDICTIONARY (genome.fasta) [100%] 1 of 1 ✔
+[- ] process > NFCORE_SAREK:SAREK:PREPARE_GENOME:MSISENSORPRO_SCAN -
+[28/dad25a] process > NFCORE_SAREK:SAREK:PREPARE_GENOME:SAMTOOLS_FAIDX (genome.fasta) [100%] 1 of 1 ✔
+[23/3fe964] process > NFCORE_SAREK:SAREK:PREPARE_GENOME:TABIX_DBSNP (dbsnp_146.hg38.vcf) [100%] 1 of 1 ✔
+[- ] process > NFCORE_SAREK:SAREK:PREPARE_GENOME:TABIX_GERMLINE_RESOURCE -
+[- ] process > NFCORE_SAREK:SAREK:PREPARE_GENOME:TABIX_KNOWN_SNPS -
+[14/26e286] process > NFCORE_SAREK:SAREK:PREPARE_GENOME:TABIX_KNOWN_INDELS (mills_and_1000G.indels.vcf) [100%] 1 of 1 ✔
+[- ] process > NFCORE_SAREK:SAREK:PREPARE_GENOME:TABIX_PON -
+[76/04d107] process > NFCORE_SAREK:SAREK:PREPARE_INTERVALS:CREATE_INTERVALS_BED (genome.interval_list) [100%] 1 of 1 ✔
+[d4/f97174] process > NFCORE_SAREK:SAREK:PREPARE_INTERVALS:GATK4_INTERVALLISTTOBED (genome) [100%] 1 of 1 ✔
+[70/82ba3c] process > NFCORE_SAREK:SAREK:PREPARE_INTERVALS:TABIX_BGZIPTABIX_INTERVAL_SPLIT (chr22_1-40001) [100%] 1 of 1 ✔
+[d4/c2d0c4] process > NFCORE_SAREK:SAREK:PREPARE_INTERVALS:TABIX_BGZIPTABIX_INTERVAL_COMBINED (genome) [100%] 1 of 1 ✔
+[- ] process > NFCORE_SAREK:SAREK:CONVERT_FASTQ_INPUT:SAMTOOLS_VIEW_MAP_MAP -
+[- ] process > NFCORE_SAREK:SAREK:CONVERT_FASTQ_INPUT:SAMTOOLS_VIEW_UNMAP_UNMAP -
+[- ] process >
NFCORE_SAREK:SAREK:CONVERT_FASTQ_INPUT:SAMTOOLS_VIEW_UNMAP_MAP - +[- ] process > NFCORE_SAREK:SAREK:CONVERT_FASTQ_INPUT:SAMTOOLS_VIEW_MAP_UNMAP - +[- ] process > NFCORE_SAREK:SAREK:CONVERT_FASTQ_INPUT:SAMTOOLS_MERGE_UNMAP - +[- ] process > NFCORE_SAREK:SAREK:CONVERT_FASTQ_INPUT:COLLATE_FASTQ_UNMAP - +[- ] process > NFCORE_SAREK:SAREK:CONVERT_FASTQ_INPUT:COLLATE_FASTQ_MAP - +[- ] process > NFCORE_SAREK:SAREK:CONVERT_FASTQ_INPUT:CAT_FASTQ - +[c4/f59e5a] process > NFCORE_SAREK:SAREK:FASTQC (test-test_L1) [100%] 1 of 1 ✔ +[0b/c5a999] process > NFCORE_SAREK:SAREK:FASTQ_ALIGN_BWAMEM_MEM2_DRAGMAP:BWAMEM1_MEM (test) [100%] 1 of 1 ✔ +[- ] process > NFCORE_SAREK:SAREK:FASTQ_ALIGN_BWAMEM_MEM2_DRAGMAP:BWAMEM2_MEM - +[- ] process > NFCORE_SAREK:SAREK:FASTQ_ALIGN_BWAMEM_MEM2_DRAGMAP:DRAGMAP_ALIGN - +[c7/664cd1] process > NFCORE_SAREK:SAREK:BAM_MARKDUPLICATES:GATK4_MARKDUPLICATES (test) [100%] 1 of 1 ✔ +[13/bc73b6] process > NFCORE_SAREK:SAREK:BAM_MARKDUPLICATES:INDEX_MARKDUPLICATES (test) [100%] 1 of 1 ✔ +[2a/99608e] process > NFCORE_SAREK:SAREK:BAM_MARKDUPLICATES:CRAM_QC_MOSDEPTH_SAMTOOLS:SAMTOOLS_STATS (test) [100%] 1 of 1 ✔ +[f2/0420ca] process > NFCORE_SAREK:SAREK:BAM_MARKDUPLICATES:CRAM_QC_MOSDEPTH_SAMTOOLS:MOSDEPTH (test) [100%] 1 of 1 ✔ +[- ] process > NFCORE_SAREK:SAREK:CRAM_TO_BAM - +[eb/46945a] process > NFCORE_SAREK:SAREK:BAM_BASERECALIBRATOR:GATK4_BASERECALIBRATOR (test) [100%] 1 of 1 ✔ +[- ] process > NFCORE_SAREK:SAREK:BAM_BASERECALIBRATOR:GATK4_GATHERBQSRREPORTS - +[ec/2377d4] process > NFCORE_SAREK:SAREK:BAM_APPLYBQSR:GATK4_APPLYBQSR (test) [100%] 1 of 1 ✔ +[- ] process > NFCORE_SAREK:SAREK:BAM_APPLYBQSR:CRAM_MERGE_INDEX_SAMTOOLS:MERGE_CRAM - +[88/3af664] process > NFCORE_SAREK:SAREK:BAM_APPLYBQSR:CRAM_MERGE_INDEX_SAMTOOLS:INDEX_CRAM (test) [100%] 1 of 1 ✔ +[f4/828fde] process > NFCORE_SAREK:SAREK:CRAM_QC_RECAL:SAMTOOLS_STATS (test) [100%] 1 of 1 ✔ +[fb/a9d66f] process > NFCORE_SAREK:SAREK:CRAM_QC_RECAL:MOSDEPTH (test) [100%] 1 of 1 ✔ +[- ] process > NFCORE_SAREK:SAREK:CRAM_TO_BAM_RECAL - +[ef/026185] process > NFCORE_SAREK:SAREK:BAM_VARIANT_CALLING_GERMLINE_ALL:BAM_VARIANT_CALLING_SINGLE_STRELKA:STRELKA_SINGLE (test) [100%] 1 of 1 ✔ +[- ] process > NFCORE_SAREK:SAREK:BAM_VARIANT_CALLING_GERMLINE_ALL:BAM_VARIANT_CALLING_SINGLE_STRELKA:MERGE_STRELKA - +[- ] process > NFCORE_SAREK:SAREK:BAM_VARIANT_CALLING_GERMLINE_ALL:BAM_VARIANT_CALLING_SINGLE_STRELKA:MERGE_STRELKA_GENOME - +[- ] process > NFCORE_SAREK:SAREK:BAM_VARIANT_CALLING_TUMOR_ONLY_ALL:BAM_VARIANT_CALLING_SINGLE_STRELKA:STRELKA_SINGLE - +[- ] process > NFCORE_SAREK:SAREK:BAM_VARIANT_CALLING_TUMOR_ONLY_ALL:BAM_VARIANT_CALLING_SINGLE_STRELKA:MERGE_STRELKA - +[- ] process > NFCORE_SAREK:SAREK:BAM_VARIANT_CALLING_TUMOR_ONLY_ALL:BAM_VARIANT_CALLING_SINGLE_STRELKA:MERGE_STRELKA_GENOME - +[- ] process > NFCORE_SAREK:SAREK:BAM_VARIANT_CALLING_SOMATIC_ALL:BAM_VARIANT_CALLING_SOMATIC_STRELKA:STRELKA_SOMATIC - +[- ] process > NFCORE_SAREK:SAREK:BAM_VARIANT_CALLING_SOMATIC_ALL:BAM_VARIANT_CALLING_SOMATIC_STRELKA:MERGE_STRELKA_INDELS - +[- ] process > NFCORE_SAREK:SAREK:BAM_VARIANT_CALLING_SOMATIC_ALL:BAM_VARIANT_CALLING_SOMATIC_STRELKA:MERGE_STRELKA_SNVS - +[bc/f3f5cf] process > NFCORE_SAREK:SAREK:VCF_QC_BCFTOOLS_VCFTOOLS:BCFTOOLS_STATS (test) [100%] 1 of 1 ✔ +[21/8d4f02] process > NFCORE_SAREK:SAREK:VCF_QC_BCFTOOLS_VCFTOOLS:VCFTOOLS_TSTV_COUNT (test) [100%] 1 of 1 ✔ +[36/957fba] process > NFCORE_SAREK:SAREK:VCF_QC_BCFTOOLS_VCFTOOLS:VCFTOOLS_TSTV_QUAL (test) [100%] 1 of 1 ✔ +[70/a8e064] process > 
NFCORE_SAREK:SAREK:VCF_QC_BCFTOOLS_VCFTOOLS:VCFTOOLS_SUMMARY (test) [100%] 1 of 1 ✔
+[36/e35b1b] process > NFCORE_SAREK:SAREK:CUSTOM_DUMPSOFTWAREVERSIONS (1) [100%] 1 of 1 ✔
+[3f/3c3356] process > NFCORE_SAREK:SAREK:MULTIQC [100%] 1 of 1 ✔
+-[nf-core/sarek] Pipeline completed successfully-
+Completed at: 09-Jun-2023 13:46:31
+Duration : 1m 50s
+CPU hours : (a few seconds)
+Succeeded : 27
+```
+
+The pipeline comes with a number of possible paths and tools that can be used.
+
+Due to the small test data size, unfortunately not everything can be tested from top to bottom; much of it is instead exercised by utilizing the pipeline's `--step` parameter.
+
+For more extensive testing purposes, we provide the `test_cache` profile, which contains the same data but allows the paths to the reference and input files to be changed using the `--test_data_base` parameter.
+
+Annotation is generally tested separately from the remaining workflow, since we use references for `C.elegans`, while the remaining tests are run on downsampled human data.

```bash
-NXF_OPTS='-Xms1g -Xmx4g'
+nextflow run nf-core/sarek -profile test_cache,<docker/singularity/conda> --outdir results --tools snpeff --step annotation
```
+
+If you are interested in any of the other tests that are run on every code change or would like to run them yourself, you can take a look at the YAML files under `tests/`.
+For each entry, the respective Nextflow command and the expected output are specified.
+
+Some of the currently available test profiles:
+
+| Test profile    | Run command                                                                            |
+| :-------------- | :------------------------------------------------------------------------------------ |
+| annotation      | `nextflow run main.nf -profile test_cache,annotation,docker --tools snpeff,vep,merge`  |
+| no_intervals    | `nextflow run main.nf -profile test_cache,no_intervals,docker`                         |
+| targeted        | `nextflow run main.nf -profile test_cache,targeted,docker`                             |
+| tools_germline  | `nextflow run main.nf -profile test_cache,tools_germline,docker --tools strelka`       |
+| tools_tumoronly | `nextflow run main.nf -profile test_cache,tools_tumoronly,docker --tools strelka`      |
+| tools_somatic   | `nextflow run main.nf -profile test_cache,tools_somatic,docker --tools strelka`        |
+| trimming        | `nextflow run main.nf -profile test_cache,trim_fastq,docker`                           |
+| umi             | `nextflow run main.nf -profile test_cache,umi,docker`                                  |
+| use_gatk_spark  | `nextflow run main.nf -profile test_cache,use_gatk_spark,docker`                       |
+
+If you are interested in any of the other profiles that are used, you can take a look at the config files under `conf/test/`.
+
+## How can the different steps be used?
+
+Sarek can be started at different points in the analysis by setting the parameter `--step`. Once started at a certain point, the pipeline runs through all the following steps without additional intervention. For example, when starting from `--step mapping` (the default) with `--tools strelka,vep`, the input reads will be aligned, duplicate marked, recalibrated, variant called with Strelka, and finally VEP will annotate the called variants.
+
+## Which variant calling tool is implemented for which data type?
+
+This list is by no means exhaustive, and it will depend on the specific analysis you would like to run. This is a suggestion based on the individual docs of the tools specifically for human genomes and a garden-variety sequencing run, as well as what has been added to the pipeline.
+
+| Tool                                                                                                     | WGS | WES | Panel | Normal | Tumor | Somatic |
+| :------------------------------------------------------------------------------------------------------ | :-: | :-: | :---: | :----: | :---: | :-----: |
+| [DeepVariant](https://github.com/google/deepvariant)                                                     |  x  |  x  |   x   |   x    |   -   |    -    |
+| [FreeBayes](https://github.com/ekg/freebayes)                                                            |  x  |  x  |   x   |   x    |   x   |    x    |
+| [GATK HaplotypeCaller](https://gatk.broadinstitute.org/hc/en-us/articles/5358864757787-HaplotypeCaller)  |  x  |  x  |   x   |   x    |   -   |    -    |
+| [GATK Mutect2](https://gatk.broadinstitute.org/hc/en-us/articles/5358911630107-Mutect2)                  |  x  |  x  |   x   |   -    |   x   |    x    |
+| [mpileup](https://www.htslib.org/doc/samtools-mpileup.html)                                              |  x  |  x  |   x   |   x    |   x   |    -    |
+| [Strelka2](https://github.com/Illumina/strelka)                                                          |  x  |  x  |   x   |   x    |   x   |    x    |
+| [Manta](https://github.com/Illumina/manta)                                                               |  x  |  x  |   x   |   x    |   x   |    x    |
+| [TIDDIT](https://github.com/SciLifeLab/TIDDIT)                                                           |  x  |  x  |   x   |   x    |   x   |    x    |
+| [ASCAT](https://github.com/VanLoo-lab/ascat)                                                             |  x  |  x  |   -   |   -    |   -   |    x    |
+| [CNVKit](https://cnvkit.readthedocs.io/en/stable/)                                                       |  x  |  x  |   -   |   x    |   x   |    x    |
+| [Control-FREEC](https://github.com/BoevaLab/FREEC)                                                       |  x  |  x  |   x   |   -    |   x   |    x    |
+| [MSIsensorPro](https://github.com/xjtu-omics/msisensor-pro)                                              |  x  |  x  |   x   |   -    |   -   |    x    |
+
+## How to run ASCAT with whole-exome sequencing data?
+
+ASCAT runs out of the box on whole genome sequencing data using iGenomes resources. While the ASCAT implementation in Sarek is capable of running with whole-exome sequencing data, the needed references are currently not provided with `igenomes.config`. According to the [developers](https://github.com/VanLoo-lab/ascat/issues/97) of ASCAT, loci and allele files (one file per chromosome) can be downloaded directly from the [Battenberg repository](https://ora.ox.ac.uk/objects/uuid:08e24957-7e76-438a-bd38-66c48008cf52).
+
+Please note that:
+
+- Row names (for GC and RT correction files) should be `${chr}_${position}` (there is no SNP/probe ID for HTS data).
+- All row names in GC and RT correction files should also appear in the loci files.
+- Loci and allele files must contain the same set of SNPs.
+- ASCAT developers strongly recommend using a BED file for WES/TS data. This prevents considering SNPs covered by off-target reads that would add noise to logR/BAF tracks.
+- The total number of GC correction loci in a sample must be at least 10% of the number of loci with logR values. If the number of GC correction loci is too small compared to the total number of loci, ASCAT will throw an error.
+
+From the 'Reference files' section of https://github.com/VanLoo-lab/ascat:
+
+> For WES and targeted sequencing, we recommend using the reference files (loci, allele and logR correction files) as part of the Battenberg package. Because they require a high-resolution input, our reference files for WGS are not suitable for WES and targeted sequencing. For WES, loci and allele files from the Battenberg package can be fed into ascat.prepareHTS. For targeted sequencing, allele files from the Battenberg package can be fed into ascat.prepareTargetedSeq, which will generate cleaned loci and allele files that can be fed into ascat.prepareHTS.
+
+### How to generate ASCAT resources for exome or targeted sequencing
+
+1. Fetch the GC content correction and replication timing (RT) correction files from the [Dropbox links provided by the ASCAT developers](https://github.com/VanLoo-lab/ascat/tree/master/ReferenceFiles/WGS) and intersect the SNP coordinates with the exome target coordinates.
If the target file has 'chr' prefixes, make a copy with these removed first. Extract the GC and RT information for only the on-target SNPs and zip the results.
+
+```bash
+sed -e 's/chr//' targets_with_chr.bed > targets.bed
+
+for t in GC RT
+do
+  unzip ${t}_G1000_hg38.zip
+
+  cut -f 1-3 ${t}_G1000_hg38.txt > ascat_${t}_snps_hg38.txt
+  tail -n +2 ascat_${t}_snps_hg38.txt | awk '{ print $2 "\t" $3-1 "\t" $3 "\t" $1 }' > ascat_${t}_snps_hg38.bed
+  bedtools intersect -a ascat_${t}_snps_hg38.bed -b targets.bed | awk '{ print $1 "_" $3 }' > ascat_${t}_snps_on_target_hg38.txt
+
+  head -n 1 ${t}_G1000_hg38.txt > ${t}_G1000_on_target_hg38.txt
+  grep -f ascat_${t}_snps_on_target_hg38.txt ${t}_G1000_hg38.txt >> ${t}_G1000_on_target_hg38.txt
+  zip ${t}_G1000_on_target_hg38.zip ${t}_G1000_on_target_hg38.txt
+
+  rm ${t}_G1000_hg38.zip
+done
+```
+
+2. Download the Battenberg 1000G loci and alleles files. The steps below follow downloading from the [Battenberg repository at the Oxford University Research Archive](https://ora.ox.ac.uk/objects/uuid:08e24957-7e76-438a-bd38-66c48008cf52). The files are also available via Dropbox links from the same page as the GC and RT correction files above.
+
+```bash
+wget https://ora.ox.ac.uk/objects/uuid:08e24957-7e76-438a-bd38-66c48008cf52/files/rt435gd52w
+mv rt435gd52w battenberg.zip
+tar xf battenberg.zip
+
+unzip 1000G_loci_hg38_chr.zip
+cd 1000G_loci_hg38
+mkdir battenberg_alleles_on_target_hg38
+mv *allele* battenberg_alleles_on_target_hg38/
+mkdir battenberg_loci_on_target_hg38
+mv *loci* battenberg_loci_on_target_hg38/
+```
+
+3. Copy the `targets_with_chr.bed` and `GC_G1000_on_target_hg38.txt` files into the newly created `battenberg_loci_on_target_hg38` folder before running the next set of steps. ASCAT generates a list of GC correction loci with sufficient coverage in a sample, then intersects that with the list of all loci with tumour logR values in that sample. If the intersection is <10% the size of the latter, it will fail with an error. Because the Battenberg loci/allele sets are very dense, subsetting to on-target regions still leaves too many loci. This script ensures that all SNPs with GC correction information are included in the loci list, plus a random sample of another 30% of all on-target loci. You may need to vary this proportion depending on your set of targets. A good rule of thumb is that the size of your GC correction loci list should be about 15% of the size of your total loci list. This allows for a margin of error.
+
+```bash
+cd battenberg_loci_on_target_hg38/
+rm *chrstring*
+rm 1kg.phase3.v5a_GRCh38nounref_loci_chr23.txt
+for i in {1..22} X
+do
+  awk '{ print $1 "\t" $2-1 "\t" $2 }' 1kg.phase3.v5a_GRCh38nounref_loci_chr${i}.txt > chr${i}.bed
+  grep "^${i}_" GC_G1000_on_target_hg38.txt | awk '{ print "chr" $1 }' > chr${i}.txt
+  bedtools intersect -a chr${i}.bed -b targets_with_chr.bed | awk '{ print $1 "_" $3 }' > chr${i}_on_target.txt
+  n=`wc -l chr${i}_on_target.txt | awk '{ print $1 }'`
+  count=$((n * 3 / 10))
+  grep -xf chr${i}.txt chr${i}_on_target.txt > chr${i}.temp
+  shuf -n $count chr${i}_on_target.txt >> chr${i}.temp
+  sort -n -k2 -t '_' chr${i}.temp | uniq | awk 'BEGIN { FS="_" } ; { print $1 "\t" $2 }' > battenberg_loci_on_target_hg38_chr${i}.txt
+done
+zip battenberg_loci_on_target_hg38.zip battenberg_loci_on_target_hg38_chr*.txt
+```
+
+4. Extract the alleles for the same set of SNPs, using the short R script defined below.

```bash
cd ../battenberg_alleles_on_target_hg38/
rm 1kg.phase3.v5a_GRCh38nounref_allele_index_chr23.txt
for i in {1..22} X
do
    Rscript intersect_ascat_alleles.R ../battenberg_loci_on_target_hg38/battenberg_loci_on_target_hg38_chr${i}.txt \
        1kg.phase3.v5a_GRCh38nounref_allele_index_chr${i}.txt battenberg_alleles_on_target_hg38_chr${i}.txt
done
zip battenberg_alleles_on_target_hg38.zip battenberg_alleles_on_target_hg38_chr*.txt
```

Rscript `intersect_ascat_alleles.R`:

```r
#!/usr/bin/env Rscript

# Subset an allele file to the positions present in the matching loci file
args = commandArgs(trailingOnly=TRUE)

loci = read.table(args[1], header=F, sep="\t", stringsAsFactors=F)
alleles = read.table(args[2], header=T, sep="\t", stringsAsFactors=F)

i = intersect(loci$V2, alleles$position)

out = subset(alleles, alleles$position %in% i)
write.table(out, args[3], col.names=T, row.names=F, quote=F, sep="\t")
```

5. Move or copy all of the zip files you've created to a suitable location and specify these in your parameters, e.g.:

```json
{
  "ascat_alleles": "/path/to/battenberg_alleles_on_target_hg38.zip",
  "ascat_loci": "/path/to/battenberg_loci_on_target_hg38.zip",
  "ascat_loci_gc": "/path/to/GC_G1000_on_target_hg38.zip",
  "ascat_loci_rt": "/path/to/RT_G1000_on_target_hg38.zip"
}
```

## What are the bwa, bwa-mem2 and sentieon bwa mem parameters?

For mapping, Sarek follows the parameter suggestions provided in this [paper](https://www.nature.com/articles/s41467-018-06159-4):

`-K 100000000`: for deterministic pipeline results; for more info, see [here](https://github.com/CCDG/Pipeline-Standardization/issues/2)

`-Y`: force soft-clipping rather than default hard-clipping of supplementary alignments

In addition, reads with tumor status in the sample sheet are currently mapped with a mismatch penalty of `-B 3`.

## How to manage scatter/gathering (parallelization within each sample)

While Nextflow ensures all samples are run in parallel, the pipeline can split the input files for each sample into smaller chunks, which are processed in parallel.
This speeds up analysis for individual chunks, but might occupy more storage space.

The different scatter/gather options can be set by the user:

### Split FastQ files

By default, the input FastQ files are split into smaller chunks with fastp, mapped in parallel, and then merged and duplicate-marked. This can be customized by setting the parameter `--split_fastq`, which determines how many reads are within each split. Setting it to `0` will turn off any splitting, and only one mapping process is run per input FastQ file.

> fastp creates as many chunks as there are CPUs specified (by default 12) and subdivides them further if the number of reads in a chunk is larger than the value specified in `--split_fastq`. Thus, the parameter `--split_fastq` is an upper bound: e.g. if 1/12th of the FastQ file exceeds the provided value, another FastQ file will be generated.

### Intervals for Base Quality Score Recalibration and Variant Calling

The pipeline can parallelize base quality score recalibration and variant calling across genomic chunks of roughly similar sizes.
For this, a BED file containing the genomic regions of interest, the intervals file, is used.
By default, the intervals file used for WGS is the one provided by GATK (details [here](https://gatk.broadinstitute.org/hc/en-us/articles/360035889551-When-should-I-restrict-my-analysis-to-specific-intervals-)).
When running targeted analysis, it is recommended to use the BED file containing the targeted regions.
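
For example, a minimal sketch of such a targeted run might look like this (the samplesheet and BED paths are placeholders):

```bash
nextflow run nf-core/sarek \
    --input samplesheet.csv \
    --wes \
    --intervals targets.bed \
    --outdir results
```

Here `--wes` flags the data as exome/panel data, and `--intervals` points the chunking at the target regions.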
The amount of scatter/gathering can be customized by adjusting the parameter `--nucleotides_per_second`.

> **NB:** The _same_ intervals are processed regardless of the number of groups. The number of groups, however, determines over how many compute nodes the analysis is scattered.

The default value is `200000`; increasing this value will _reduce_ the number of groups that are processed in parallel.
Generally, the fewer the groups (with each group containing more regions), the slower the processing and the less storage space consumed.
In particular, in a cloud computing setting it is often advisable to reduce the number of groups run in parallel, to reduce data staging steps.

## How to create a panel-of-normals for Mutect2

For a detailed tutorial on how to create a panel-of-normals, see [here](https://gatk.broadinstitute.org/hc/en-us/articles/360035531132).

## Spark related issues

If you have problems running processes that make use of Spark, such as `MarkDuplicates`, you are probably experiencing issues with the limit of open files in your system.
You can check your current limit by typing the following:

```bash
ulimit -n
```

The default limit size is usually 1024, which is quite low for running Spark jobs.
In order to increase the size limit permanently you can:

Edit the file `/etc/security/limits.conf` and add the lines:

```bash
* soft nofile 65535
* hard nofile 65535
```

Edit the file `/etc/sysctl.conf` and add the line:

```bash
fs.file-max = 65535
```

Edit the file `/etc/sysconfig/docker` and add the new limits to OPTIONS like this:

```bash
OPTIONS="--default-ulimit nofile=65535:65535"
```

Restart your session.

Note that the way to increase the open file limit in your system may be slightly different or require additional steps.

### Cannot delete work folder when using docker + Spark

Currently, when running Spark-based tools in combination with Docker, it is required to set `docker.userEmulation = false`. This can unfortunately cause permission issues when `work/` is being written with root permissions. In case this happens, you might need to configure Docker to run without `userEmulation` (see [here](https://github.com/Midnighter/nf-core-adr/blob/main/docs/adr/0008-refrain-from-using-docker-useremulation-in-nextflow.md)).

## How to handle UMIs

Sarek can process reads containing UMIs, using [fgbio](http://fulcrumgenomics.github.io/fgbio/tools/latest/) tools.

In order to use reads containing UMI tags as your initial input, you need to include `--umi_read_structure [structure]` in your parameters.

This enables pre-processing of the reads and calling of UMI consensus reads, which are then used to continue the workflow from the mapping steps. Depending on the experimental setup, duplicate marking and base quality recalibration can be skipped for the post-UMI processing with `--skip_tools`.

### UMI Read Structure

This parameter is a string, which follows a [convention](https://github.com/fulcrumgenomics/fgbio/wiki/Read-Structures) to describe the structure of the UMI.

As an example: if your reads contain a UMI only on the forward read, the string will represent only one structure (e.g. `2M11S+T`); should your reads contain a UMI on both reads, the string will contain two structures separated by a blank space (e.g. `2M11S+T 2M11S+T`); should your reads contain a UMI only on the reverse read, your structure must represent the template only for the forward read and template plus UMI for the reverse read (e.g. `+T 12M11S+T`). Please do refer to the fgbio documentation for more details, as providing the correct structure is essential and specific to the UMI kit used.
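
For instance, a run with UMIs on both reads might be launched like this (a minimal sketch; the read structure is the example from above and must be adapted to your UMI kit, and the paths are placeholders):

```bash
nextflow run nf-core/sarek \
    --input samplesheet.csv \
    --umi_read_structure '2M11S+T 2M11S+T' \
    --outdir results
```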
"2M11S+T 2M11S+T"); should your reads contain a UMI only on the reverse read, your structure must represent the template only for the forward read and template plus UMI for the reverse read (i.e. +T 12M11S+T). Please do refer to FGBIO documentation for more details, as providing the correct structure is essential and specific to the UMI kit used. + +### Limitations and future updates + +Recent updates to Samtools have been introduced, which can speed-up performance of fgbio tools used in this workflow. +The current workflow does not handle duplex UMIs (i.e. where opposite strands of a duplex molecule have been tagged with a different UMI), and best practices have been proposed to process this type of data. +Both changes will be implemented in a future release. + +## How to run sarek when no(t all) reference files are in igenomes + +For common genomes, such as GRCh38 and GRCh37, the pipeline is shipped with (almost) all necessary reference files. However, sometimes it is necessary to use custom references for some or all files: + +### No igenomes reference files are used + +If none of your required genome files are in igenomes, `--igenomes_ignore` must be set to ignore any igenomes input and `--genome null`. The `fasta` file is the only required input file and must be provided to run the pipeline. All other possible reference file can be provided in addition. For details, see the paramter documentation. + +Minimal example for custom genomes: + +```bash +nextflow run nf-core/sarek --genome null --igenomes_ignore --fasta +``` + +### Overwrite specific reference files + +If you don't want to use some of the provided reference genomes, they can be overwritten by either providing a new file or setting the respective file parameter to `false`, if it should be ignored: + +Example for using a custom known indels file: + +```bash +nextflow run nf-core/sarek --known_indels --genome GRCh38.GATK +``` + +Example for not using known indels, but all other provided reference file: + +```bash +nextflow run nf-core/sarek --known_indels false --genome GRCh38.GATK +``` + +### Where do the used reference genomes originate from + +For GATK.GRCh38 the links for each reference file and the corresponding processes that use them is listed below. 
For GATK.GRCh37, the files originate from the same sources:

| File | Tools | Origin | Docs |
| :--- | :---- | :----- | :--- |
| ascat_alleles | ASCAT | https://www.dropbox.com/s/uouszfktzgoqfy7/G1000_alleles_hg38.zip | https://github.com/VanLoo-lab/ascat/tree/master/ReferenceFiles/WGS |
| ascat_loci | ASCAT | https://www.dropbox.com/s/80cq0qgao8l1inj/G1000_loci_hg38.zip | https://github.com/VanLoo-lab/ascat/tree/master/ReferenceFiles/WGS |
| ascat_loci_gc | ASCAT | https://www.dropbox.com/s/80cq0qgao8l1inj/G1000_loci_hg38.zip | https://github.com/VanLoo-lab/ascat/tree/master/ReferenceFiles/WGS |
| ascat_loci_rt | ASCAT | https://www.dropbox.com/s/xlp99uneqh6nh6p/RT_G1000_hg38.zip | https://github.com/VanLoo-lab/ascat/tree/master/ReferenceFiles/WGS |
| bwa | bwa-mem | `bwa index -p bwa/${fasta.baseName} $fasta` | |
| bwamem2 | bwa-mem2 | `bwa-mem2 index -p bwamem2/${fasta} $fasta` | |
| dragmap | DragMap | `dragen-os --build-hash-table true --ht-reference $fasta --output-directory dragmap` | |
| dbsnp | BaseRecalibrator, ControlFREEC, GenotypeGVCF, HaplotypeCaller | [GATKBundle](https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/) | https://gatk.broadinstitute.org/hc/en-us/articles/360035890811-Resource-bundle |
| dbsnp_tbi | BaseRecalibrator, ControlFREEC, GenotypeGVCF, HaplotypeCaller | [GATKBundle](https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/) | |
| dict | BaseRecalibrator(Spark), CNNScoreVariants, EstimateLibraryComplexity, FilterMutectCalls, FilterVariantTranches, GatherPileupSummaries, GenotypeGVCF, GetPileupSummaries, HaplotypeCaller, MarkDuplicates(Spark), MergeVCFs, Mutect2, VariantRecalibrator | [GATKBundle](https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/) | https://gatk.broadinstitute.org/hc/en-us/articles/360035890811-Resource-bundle |
| fasta | ApplyBQSR(Spark), ApplyVQSR, ASCAT, BaseRecalibrator(Spark), BWA, BWAMem2, CNNScoreVariants, CNVKit, ControlFREEC, DragMap, DeepVariant, EnsemblVEP, EstimateLibraryComplexity, FilterMutectCalls, FilterVariantTranches, FreeBayes, GatherPileupSummaries, GenotypeGVCF, GetPileupSummaries, HaplotypeCaller, interval building, Manta, MarkDuplicates(Spark), MergeVCFs, MSIsensorPro, Mutect2, Samtools, SnpEff, Strelka, TIDDIT, VariantRecalibrator | [GATKBundle](https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/) | https://gatk.broadinstitute.org/hc/en-us/articles/360035890811-Resource-bundle |
| fasta_fai | ApplyBQSR(Spark), ApplyVQSR, ASCAT, BaseRecalibrator(Spark), BWA, BWAMem2, CNNScoreVariants, CNVKit, ControlFREEC, DragMap, DeepVariant, EnsemblVEP, EstimateLibraryComplexity, FilterMutectCalls, FilterVariantTranches, FreeBayes, GatherPileupSummaries, GenotypeGVCF, GetPileupSummaries, HaplotypeCaller, interval building, Manta, MarkDuplicates(Spark), MergeVCFs, MSIsensorPro, Mutect2, Samtools, SnpEff, Strelka, TIDDIT, VariantRecalibrator | [GATKBundle](https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/) | https://gatk.broadinstitute.org/hc/en-us/articles/360035890811-Resource-bundle |
| germline_resource | GetPileupSummaries, Mutect2 | [GATKBundle](https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/) | |
| germline_resource_tbi | GetPileupSummaries, Mutect2 | [GATKBundle](https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/) | |
| intervals | ApplyBQSR(Spark), ASCAT, BaseRecalibrator(Spark), BCFTools, CNNScoreVariants, ControlFREEC, DeepVariant, FilterVariantTranches, FreeBayes, GenotypeGVCF, GetPileupSummaries, HaplotypeCaller, Strelka, mpileup, MSIsensorPro, Mutect2, VCFTools | [GATKBundle](https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/) | |
| known_indels | BaseRecalibrator(Spark), FilterVariantTranches | [GATKBundle](https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/) | |
| known_indels_tbi | BaseRecalibrator(Spark), FilterVariantTranches | [GATKBundle](https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/) | |
| known_snps | BaseRecalibrator(Spark), FilterVariantTranches, VariantRecalibrator | [GATKBundle](https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/) | |
| known_snps_tbi | BaseRecalibrator(Spark), FilterVariantTranches, VariantRecalibrator | [GATKBundle](https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/) | |
| mappability | ControlFREEC | http://xfer.curie.fr/get/vyIi4w8EONl/out100m2_hg38.zip | http://boevalab.inf.ethz.ch/FREEC/tutorial.html |
| pon | Mutect2 | [GATKBundle](https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/) | https://gatk.broadinstitute.org/hc/en-us/articles/360035890631-Panel-of-Normals-PON- |
| pon_tbi | Mutect2 | [GATKBundle](https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/) | https://gatk.broadinstitute.org/hc/en-us/articles/360035890631-Panel-of-Normals-PON- |

## How to customise SnpEff and VEP annotation

SnpEff and VEP both require a large resource of files known as a cache.
These are folders composed of multiple gigabytes of files which need to be available for the software to function properly.
To use these, supply the parameters `--vep_cache` and/or `--snpeff_cache` with the location of the root of the annotation cache folder for each tool.

### Specify the cache location

Params `--snpeff_cache` and `--vep_cache` are used to specify the locations of the root of the annotation cache folders.
The cache will be located within a subfolder with the path `${snpeff_species}.${snpeff_version}` for SnpEff and `${vep_species}/${vep_genome}_${vep_cache_version}` for VEP.
If this directory is missing, Sarek will raise an error.

For example, this is a typical folder structure for `GRCh38` and `WBcel235`, with SnpEff cache version 105 and VEP cache version 110:

```text
/data/
├─ snpeff_cache/
│ ├─ GRCh38.105/
│ ├─ WBcel235.105/
├─ vep_cache/
│ ├─ caenorhabditis_elegans/
│ │ ├─ 110_WBcel235/
│ ├─ homo_sapiens/
│ │ ├─ 110_GRCh38/
```

For this example, the parameters `--snpeff_cache /data/snpeff_cache` and `--vep_cache /data/vep_cache` would be used.
Both SnpEff and VEP will internally figure out the path to the specific cache version and species for which the annotation should be performed, given the parameters specified to Sarek.

### Change cache version and species

By default, all of this is specified in the [igenomes.config](https://github.com/nf-core/sarek/blob/master/conf/igenomes.config) file.
Explanations for all params can be found in the documentation:

- [snpeff_db](https://nf-co.re/sarek/parameters#snpeff_db)
- [snpeff_genome](https://nf-co.re/sarek/parameters#snpeff_genome)
- [vep_genome](https://nf-co.re/sarek/parameters#vep_genome)
- [vep_species](https://nf-co.re/sarek/parameters#vep_species)
- [vep_cache_version](https://nf-co.re/sarek/parameters#vep_cache_version)

With the previous example of `GRCh38`, these are the values that were used for these params:

```bash
snpeff_db         = '105'
snpeff_genome     = 'GRCh38'
vep_cache_version = '110'
vep_genome        = 'GRCh38'
vep_species       = 'homo_sapiens'
```

### Usage recommendation with AWS iGenomes

The cache for each of these annotation tools has its own structure and is frequently updated; therefore, it is kept separate from AWS iGenomes, and it is not recommended to put the cache for either of these annotation tools in your local AWS iGenomes folder.

A classical organisation on a shared storage area might be:

```bash
/data/igenomes/
/data/cache/snpeff_cache
/data/cache/vep_cache
```

Which can then be used this way in Sarek:

```bash
nextflow run nf-core/sarek \
    --igenomes_base /data/igenomes/ \
    --snpeff_cache /data/cache/snpeff_cache/ \
    --vep_cache /data/cache/vep_cache/ \
    ...
```

Alternatively, the data may be stored on AWS S3, in which case the parameters might be:

```bash
s3://my-reference-data/igenomes/
s3://my-reference-data/cache/snpeff_cache/
s3://my-reference-data/cache/vep_cache/
```

Which can then be used this way in Sarek:

```bash
nextflow run nf-core/sarek \
    --igenomes_base s3://my-reference-data/igenomes/ \
    --snpeff_cache s3://my-reference-data/cache/snpeff_cache/ \
    --vep_cache s3://my-reference-data/cache/vep_cache/ \
    ...
```

These params can be specified in a config file or in a profile using the params scope, or even in a JSON or YAML file using the `-params-file` Nextflow option.

Note: we recommend storing each annotation cache in a separate directory so that each cache version is handled separately.
This may mean you have many similar directories, but it will dramatically reduce the storage burden on machines running the SnpEff or VEP process.

### Use annotation-cache for SnpEff and VEP

[Annotation-cache](https://annotation-cache.github.io) is an open AWS registry resource that stores a mirror of some cache files on AWS S3, which can be used with Sarek.
It contains some genome builds, which can be found by checking the contents of the S3 bucket.

The SnpEff and VEP caches are stored at the following locations on S3:

```bash
snpeff_cache = s3://annotation-cache/snpeff_cache/
vep_cache    = s3://annotation-cache/vep_cache/
```

The contents of said caches can be listed with the following commands using the AWS CLI:

```bash
aws s3 --no-sign-request ls s3://annotation-cache/snpeff_cache/
aws s3 --no-sign-request ls s3://annotation-cache/vep_cache/
```

Since both SnpEff and VEP internally figure out the path to the specific cache version and species, annotation-cache uses an extra set of keys to specify the species and genome build, which is handled internally by Sarek.

Please refer to the [annotation-cache documentation](https://annotation-cache.github.io) for more details.

### Use Sarek to download cache and annotate in one go

Both VEP and SnpEff come with built-in download functionality to download the cache prior to use.
Sarek includes these as optional processes.
Use the param `--download_cache` and specify the tool(s) with `--tools` (`snpeff` and/or `vep`), and Sarek will download the relevant cache using each tool's respective download function.
It is recommended to save the cache somewhere highly accessible for subsequent runs of Sarek, so the cache does not have to be re-downloaded.

Sarek will automatically download the cache using each tool (SnpEff and/or VEP) to your work directory, and will subsequently perform the annotation of the VCF files specified as input in a samplesheet or produced by Sarek.

### Only download cache

Using the param `--build_only_index` allows downloading only the cache for the specified tools.

### Location for the cache

The cache is downloaded to the location specified with `--outdir_cache`.
Otherwise, it is downloaded to `cache/` within the specified `--outdir` location.

This command could be used to download the cache for both tools in the specified `--outdir_cache` location:

```bash
nextflow run nf-core/sarek --outdir results --outdir_cache /path_to/my-own-cache --tools vep,snpeff --download_cache --build_only_index --input false
```

This command could be used to point to the recently downloaded cache and run SnpEff and VEP:

```bash
nextflow run nf-core/sarek --outdir results --vep_cache /path_to/my-own-cache/vep_cache --snpeff_cache /path_to/my-own-cache/snpeff_cache --tools vep,snpeff --input samplesheet_vcf.csv
```

### Create containers with pre-downloaded cache

nf-core is no longer maintaining containers with pre-downloaded cache. Hosting the cache within the container is not recommended, as it can cause a number of problems. Instead, we recommend using an external cache. The following is left for legacy reasons.

For each of these tools, a helper script `build.sh` can be found at the root of the tool folder in the nf-core modules repo ([snpeff](https://github.com/nf-core/modules/tree/master/modules/nf-core/snpeff) and [ensemblvep](https://github.com/nf-core/modules/tree/master/modules/nf-core/ensemblvep)), and can be adapted for your usage.

Overwriting the container declaration is then possible, to accommodate the new container.

### Using VEP plugins

#### dbNSFP

Enable with `--vep_dbnsfp`. The following parameters are mandatory:

- `--dbnsfp`, to specify the path to the dbNSFP processed file.
- `--dbnsfp_tbi`, to specify the path to the dbNSFP tabix indexed file.

The following parameters are optional:

- `--dbnsfp_consequence`, to filter/limit outputs to a specific effect of the variant.
  - The set of consequence terms is defined by the Sequence Ontology, and an overview of those used in VEP can be found [here](https://www.ensembl.org/info/genome/variation/prediction/predicted_data.html).
  - If one wants to filter using several consequences, then separate those using '&' (e.g. `--dbnsfp_consequence '3_prime_UTR_variant&intron_variant'`).
- `--dbnsfp_fields`, to retrieve individual values from the dbNSFP file.
  - The values correspond to the names of the columns in the dbNSFP file and are separated by commas.
  - The column names might differ between the different dbNSFP versions. Please check the Readme.txt file, which is provided with the dbNSFP file, to obtain the correct column names. The Readme file also contains a short description of the provided values and the versions of the tools used to generate them.

For more details, see [here](https://www.ensembl.org/info/docs/tools/vep/script/vep_plugins.html#dbnsfp).
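
A hypothetical invocation enabling the plugin might look like this (the dbNSFP paths and the field names are placeholders that must match your dbNSFP version):

```bash
nextflow run nf-core/sarek \
    --input samplesheet_vcf.csv \
    --tools vep \
    --vep_dbnsfp \
    --dbnsfp /path/to/dbNSFP.gz \
    --dbnsfp_tbi /path/to/dbNSFP.gz.tbi \
    --dbnsfp_fields 'SIFT_pred,Polyphen2_HDIV_pred' \
    --outdir results
```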

#### LOFTEE

Enable with `--vep_loftee`.

For more details, see [here](https://github.com/konradjk/loftee).

#### SpliceAI

Enable with `--vep_spliceai`. The following parameters are mandatory:

- `--spliceai_snv`, to specify the path to the SpliceAI raw scores SNV file.
- `--spliceai_snv_tbi`, to specify the path to the SpliceAI raw scores SNV tabix indexed file.
- `--spliceai_indel`, to specify the path to the SpliceAI raw scores indel file.
- `--spliceai_indel_tbi`, to specify the path to the SpliceAI raw scores indel tabix indexed file.

For more details, see [here](https://www.ensembl.org/info/docs/tools/vep/script/vep_plugins.html#spliceai).

#### SpliceRegions

Enable with `--vep_spliceregion`.

For more details, see [here](https://www.ensembl.org/info/docs/tools/vep/script/vep_plugins.html#spliceregion) and [here](https://www.ensembl.info/2018/10/26/cool-stuff-the-vep-can-do-splice-site-variant-annotation/).

### BCFTOOLS Annotate

It is possible to annotate a VCF file with a custom annotation file using [BCFTOOLS Annotate](https://samtools.github.io/bcftools/bcftools.html#annotate). This can be done by adding `bcfann` to the tools list and setting the following parameters:

- `annotations`: path to the VCF annotation file
- `annotations_index`: path to the VCF annotation index file
- `header_lines`: path to the header lines file

## MultiQC related issues

### Plots for SnpEff are missing

When plots are missing, it is possible that the fasta and the custom SnpEff database do not match (see https://pcingola.github.io/SnpEff/se_faq/#error_chromosome_not_found-details).
SnpEff completes without throwing an error, causing Nextflow to complete successfully. An indication of the error are lines like these in the `.command` files:

```text
ERRORS: Some errors were detected
Error type Number of errors
ERROR_CHROMOSOME_NOT_FOUND 17522411
```

## Sentieon

[Sentieon](https://www.sentieon.com/) is a commercial solution to process genomics data with high computing efficiency, fast turnaround time, exceptionally high accuracy, and 100% consistency.

In particular, Sentieon contains what may be viewed as sped-up versions of some standard GATK tools, like bwa mem and HaplotypeCaller. Sarek contains support for some of the functions in Sentieon. In order to use those functions, the user will need to supply Sarek with a license for Sentieon.

### Setup of Sentieon license

Sentieon supplies licenses in the form of a string value (a URL) or a file. Either should be base64-encoded and stored in a Nextflow secret named `SENTIEON_LICENSE_BASE64`.
If a license string (URL) is supplied, then the Nextflow secret should be set like this (`<license_url>` being a placeholder for the actual license string):

```bash
nextflow secrets set SENTIEON_LICENSE_BASE64 $(echo -n <license_url> | base64 -w 0)
```

If a license file is supplied, then the Nextflow secret should be set like this (`<license_file>` being a placeholder for the actual file):

```bash
nextflow secrets set SENTIEON_LICENSE_BASE64 $(cat <license_file> | base64 -w 0)
```

### Available Sentieon functions

Sarek contains the following Sentieon functions from [DNAseq](https://support.sentieon.com/manual/DNAseq_usage/dnaseq/): [bwa mem](https://support.sentieon.com/manual/usages/general/#bwa-mem-syntax), [LocusCollector](https://support.sentieon.com/manual/usages/general/#locuscollector-algorithm) + [Dedup](https://support.sentieon.com/manual/usages/general/#dedup-algorithm), [Haplotyper](https://support.sentieon.com/manual/usages/general/#haplotyper-algorithm), [GVCFtyper](https://support.sentieon.com/manual/usages/general/#gvcftyper-algorithm) and [VarCal](https://support.sentieon.com/manual/usages/general/#varcal-algorithm) + [ApplyVarCal](https://support.sentieon.com/manual/usages/general/#applyvarcal-algorithm), so the basic processing from alignment of FastQ files to VCF files can be done using sped-up Sentieon functions.

Sarek also contains the Sentieon functions [DNAscope](https://support.sentieon.com/manual/usages/general/?highlight=dnamodelapply#dnascope-algorithm) and [DNAModelApply](https://support.sentieon.com/manual/usages/general/?highlight=dnamodelapply#dnamodelapply-algorithm).

### Basic usage of Sentieon functions

To use Sentieon's aligner `bwa mem`, set the aligner option to `sentieon-bwamem`.
(This can, for example, be done by adding `--aligner sentieon-bwamem` to the `nextflow run` command.)

To use Sentieon's function `Dedup`, specify `sentieon_dedup` as one of the tools.
(This can, for example, be done by adding `--tools sentieon_dedup` to the `nextflow run` command.)

To use Sentieon's function `DNAscope`, specify `sentieon_dnascope` as one of the tools.
This can, for example, be done by adding `--tools sentieon_dnascope` to the `nextflow run` command.
In order to skip Sentieon's variant-filter `DNAModelApply`, one may add `--skip_tools dnascope_filter` to the `nextflow run` command.
Sarek also provides the option `sentieon_dnascope_emit_mode`, which can be used to set the [emit-mode](https://support.sentieon.com/manual/usages/general/#dnascope-algorithm) of Sentieon's DNAscope.
Sentieon's DNAscope can output both a VCF file and a gVCF file in the same run; this is achieved by setting `sentieon_dnascope_emit_mode` to `<vcf_emit_mode>,gvcf`, where `<vcf_emit_mode>` is `variant`, `confident` or `all`.

Sentieon's function `Haplotyper` is used in much the same way as `DNAscope`.
To use Sentieon's function `Haplotyper`, specify `sentieon_haplotyper` as one of the tools.
This can, for example, be done by adding `--tools sentieon_haplotyper` to the `nextflow run` command.
In order to skip the GATK-based variant-filter, one may add `--skip_tools haplotyper_filter` to the `nextflow run` command.
Sarek also provides the option `sentieon_haplotyper_emit_mode`, which can be used to set the [emit-mode](https://support.sentieon.com/manual/usages/general/#haplotyper-algorithm) of Sentieon's Haplotyper.
Sentieon's Haplotyper can output both a VCF file and a gVCF file in the same run; this is achieved by setting `sentieon_haplotyper_emit_mode` to `<vcf_emit_mode>,gvcf`, where `<vcf_emit_mode>` is `variant`, `confident` or `all`.
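
As an example, the following is a minimal sketch combining Sentieon alignment with Haplotyper, emitting both a VCF and a gVCF (the samplesheet path is a placeholder):

```bash
nextflow run nf-core/sarek \
    --input samplesheet.csv \
    --aligner sentieon-bwamem \
    --tools sentieon_haplotyper \
    --sentieon_haplotyper_emit_mode variant,gvcf \
    --outdir results
```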

To use Sentieon's function `GVCFtyper` along with Sentieon's version of VQSR (`VarCal` and `ApplyVarCal`) for joint-germline genotyping, specify `sentieon_haplotyper` as one of the tools, set the option `sentieon_haplotyper_emit_mode` to `gvcf`, and add the option `joint_germline`.
This can, for example, be done by adding `--tools sentieon_haplotyper --joint_germline --sentieon_haplotyper_emit_mode gvcf` to the `nextflow run` command.
If `sentieon_dnascope` is chosen instead of `sentieon_haplotyper`, then Sentieon's version of VQSR is skipped, as recommended by Sentieon.

### Joint germline variant calling

Sentieon's [GVCFtyper](https://support.sentieon.com/manual/usages/general/#gvcftyper-algorithm) does not support the [GenomicsDB](https://gatk.broadinstitute.org/hc/en-us/articles/5358869876891-GenomicsDBImport) datastore format. This means that, in contrast to the GATK-based joint germline variant calling subworkflow in Sarek, the Sentieon/DNAseq-based joint germline variant calling subworkflow does not use the GenomicsDB datastore format.

### QualCal (BQSR)

Currently, Sentieon's version of BQSR, QualCal, is not available in Sarek. Recent Illumina sequencers tend to provide well-calibrated base qualities, so BQSR may not provide much benefit. By default, Sarek runs GATK's BQSR; that can be skipped by adding the option `--skip_tools baserecalibrator`.

## Requested resources for the tools

Resource requests are difficult to generalize and are often dependent on input data size. Currently, the number of CPUs and the amount of memory requested by default were adapted from tests on 5 ICGC paired whole-genome sequencing samples with approximately 40X and 80X depth.
For targeted data analysis, this is a large overestimate. In this case, resources for each process can be limited by either setting `--max_memory` and `--max_cpus`, or by tailoring the request by process name as described [here](#resource-requests) and sketched below. If you are using Sarek for a certain data type regularly, and would like to make these requests available to others on your system, an institution-specific, pipeline-specific config file can be added [here](https://github.com/nf-core/configs/tree/master/conf/pipeline/sarek).
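
As an illustration of per-process tailoring, a custom config passed with `-c` might lower the defaults for a small targeted panel; the process selector pattern and values below are assumptions and should be matched against the process names in your own run:

```bash
// custom_resources.config -- example values for small targeted panels
process {
    // the selector pattern is hypothetical; check .nextflow.log or the trace file for exact process names
    withName: '.*MARKDUPLICATES.*' {
        cpus   = 4
        memory = 16.GB
    }
}
```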
diff --git a/lib/NfcoreTemplate.groovy b/lib/NfcoreTemplate.groovy index e248e4c3f7..755ee64d44 100755 --- a/lib/NfcoreTemplate.groovy +++ b/lib/NfcoreTemplate.groovy @@ -242,6 +242,7 @@ class NfcoreTemplate { temp_pf.delete() } + // // Print pipeline summary on completion // @@ -348,6 +349,13 @@ class NfcoreTemplate { ${colors.blue} |\\ | |__ __ / ` / \\ |__) |__ ${colors.yellow}} {${colors.reset} ${colors.blue} | \\| | \\__, \\__/ | \\ |___ ${colors.green}\\`-._,-`-,${colors.reset} ${colors.green}`._,._,\'${colors.reset} + ${colors.white} ____${colors.reset} + ${colors.white} .´ _ `.${colors.reset} + ${colors.white} / ${colors.green}|\\${colors.reset}`-_ \\${colors.reset} ${colors.blue} __ __ ___ ${colors.reset} + ${colors.white} | ${colors.green}| \\${colors.reset} `-|${colors.reset} ${colors.blue}|__` /\\ |__) |__ |__/${colors.reset} + ${colors.white} \\ ${colors.green}| \\${colors.reset} /${colors.reset} ${colors.blue}.__| /¯¯\\ | \\ |___ | \\${colors.reset} + ${colors.white} `${colors.green}|${colors.reset}____${colors.green}\\${colors.reset}´${colors.reset} + ${colors.purple} ${workflow.manifest.name} ${workflow_version}${colors.reset} ${dashedLine(monochrome_logs)} """.stripIndent() diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index 96af61fd17..bd412e58c8 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -11,16 +11,15 @@ class WorkflowMain { // public static String citation(workflow) { return "If you use ${workflow.manifest.name} for your analysis please cite:\n\n" + - // TODO nf-core: Add Zenodo DOI for pipeline after first release - //"* The pipeline\n" + - //" https://doi.org/10.5281/zenodo.XXXXXXX\n\n" + + "* The pipeline\n" + + " https://doi.org/10.12688/f1000research.16665.2\n" + + " https://doi.org/10.5281/zenodo.3476425\n\n" + "* The nf-core framework\n" + " https://doi.org/10.1038/s41587-020-0439-x\n\n" + "* Software dependencies\n" + " https://github.com/${workflow.manifest.name}/blob/master/CITATIONS.md" } - // // Validate parameters and print summary to screen // @@ -44,9 +43,9 @@ class WorkflowMain { // Check AWS batch settings NfcoreTemplate.awsBatch(workflow, params) - // Check input has been provided - if (!params.input) { - Nextflow.error("Please provide an input samplesheet to the pipeline e.g. '--input samplesheet.csv'") + // Warn that no input was provided + if (!params.input && !params.build_only_index) { + log.warn "No samplesheet specified, attempting to restart from csv files present in ${params.outdir}" } } // diff --git a/lib/WorkflowSarek.groovy b/lib/WorkflowSarek.groovy index 01d2fc2870..12f001c251 100755 --- a/lib/WorkflowSarek.groovy +++ b/lib/WorkflowSarek.groovy @@ -14,8 +14,7 @@ class WorkflowSarek { genomeExistsError(params, log) - - if (!params.fasta) { + if (!params.fasta && params.step == 'annotate') { Nextflow.error "Genome fasta file not specified with e.g. '--fasta genome.fa' or via a detectable config file." } } @@ -97,7 +96,6 @@ class WorkflowSarek { //meta["tool_citations"] = toolCitationText(params).replaceAll(", \\.", ".").replaceAll("\\. 
\\.", ".").replaceAll(", \\.", ".") //meta["tool_bibliography"] = toolBibliographyText(params) - def methods_text = mqc_methods_yaml.text def engine = new SimpleTemplateEngine() @@ -119,4 +117,33 @@ class WorkflowSarek { Nextflow.error(error_string) } } + + public static String retrieveInput(params, log){ + def input = null + if (!params.input && !params.build_only_index) { + switch (params.step) { + case 'mapping': Nextflow.error("Can't start with step $params.step without samplesheet") + break + case 'markduplicates': log.warn("Using file ${params.outdir}/csv/mapped.csv"); + input = params.outdir + "/csv/mapped.csv" + break + case 'prepare_recalibration': log.warn("Using file ${params.outdir}/csv/markduplicates_no_table.csv"); + input = params.outdir + "/csv/markduplicates_no_table.csv" + break + case 'recalibrate': log.warn("Using file ${params.outdir}/csv/markduplicates.csv"); + input = params.outdir + "/csv/markduplicates.csv" + break + case 'variant_calling': log.warn("Using file ${params.outdir}/csv/recalibrated.csv"); + input = params.outdir + "/csv/recalibrated.csv" + break + // case 'controlfreec': csv_file = file("${params.outdir}/variant_calling/csv/control-freec_mpileup.csv", checkIfExists: true); break + case 'annotate': log.warn("Using file ${params.outdir}/csv/variantcalled.csv"); + input = params.outdir + "/csv/variantcalled.csv" + break + default: log.warn("Please provide an input samplesheet to the pipeline e.g. '--input samplesheet.csv'") + Nextflow.error("Unknown step $params.step") + } + } + return input + } } diff --git a/main.nf b/main.nf index 2fc844bfeb..101db87a36 100644 --- a/main.nf +++ b/main.nf @@ -1,12 +1,22 @@ #!/usr/bin/env nextflow + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ nf-core/sarek +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Started March 2016. + Ported to nf-core May 2019. + Ported to DSL 2 July 2020. 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + nf-core/sarek: + An open-source analysis pipeline to detect germline or somatic variants + from whole genome or targeted sequencing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Github : https://github.com/nf-core/sarek Website: https://nf-co.re/sarek + Docs : https://nf-co.re/sarek/usage Slack : https://nfcore.slack.com/channels/sarek ----------------------------------------------------------------------------------------- +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ nextflow.enable.dsl = 2 @@ -16,11 +26,49 @@ nextflow.enable.dsl = 2 GENOME PARAMETER VALUES ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ +params.ascat_alleles = WorkflowMain.getGenomeAttribute(params, 'ascat_alleles') +params.ascat_genome = WorkflowMain.getGenomeAttribute(params, 'ascat_genome') +params.ascat_loci = WorkflowMain.getGenomeAttribute(params, 'ascat_loci') +params.ascat_loci_gc = WorkflowMain.getGenomeAttribute(params, 'ascat_loci_gc') +params.ascat_loci_rt = WorkflowMain.getGenomeAttribute(params, 'ascat_loci_rt') +params.bwa = WorkflowMain.getGenomeAttribute(params, 'bwa') +params.bwamem2 = WorkflowMain.getGenomeAttribute(params, 'bwamem2') +params.cf_chrom_len = WorkflowMain.getGenomeAttribute(params, 'cf_chrom_len') +params.chr_dir = WorkflowMain.getGenomeAttribute(params, 'chr_dir') +params.dbsnp = WorkflowMain.getGenomeAttribute(params, 'dbsnp') +params.dbsnp_tbi = WorkflowMain.getGenomeAttribute(params, 'dbsnp_tbi') +params.dbsnp_vqsr = WorkflowMain.getGenomeAttribute(params, 'dbsnp_vqsr') +params.dict = WorkflowMain.getGenomeAttribute(params, 'dict') +params.dragmap = WorkflowMain.getGenomeAttribute(params, 'dragmap') +params.fasta = WorkflowMain.getGenomeAttribute(params, 'fasta') +params.fasta_fai = WorkflowMain.getGenomeAttribute(params, 'fasta_fai') +params.germline_resource = WorkflowMain.getGenomeAttribute(params, 'germline_resource') +params.germline_resource_tbi = WorkflowMain.getGenomeAttribute(params, 'germline_resource_tbi') +params.intervals = WorkflowMain.getGenomeAttribute(params, 'intervals') +params.known_indels = WorkflowMain.getGenomeAttribute(params, 'known_indels') +params.known_indels_tbi = WorkflowMain.getGenomeAttribute(params, 'known_indels_tbi') +params.known_indels_vqsr = WorkflowMain.getGenomeAttribute(params, 'known_indels_vqsr') +params.known_snps = WorkflowMain.getGenomeAttribute(params, 'known_snps') +params.known_snps_tbi = WorkflowMain.getGenomeAttribute(params, 'known_snps_tbi') +params.known_snps_vqsr = WorkflowMain.getGenomeAttribute(params, 'known_snps_vqsr') +params.mappability = WorkflowMain.getGenomeAttribute(params, 'mappability') +params.ngscheckmate_bed = WorkflowMain.getGenomeAttribute(params, 'ngscheckmate_bed') +params.pon = WorkflowMain.getGenomeAttribute(params, 'pon') +params.pon_tbi = WorkflowMain.getGenomeAttribute(params, 'pon_tbi') +params.sentieon_dnascope_model = WorkflowMain.getGenomeAttribute(params, 'sentieon_dnascope_model') +params.snpeff_db = WorkflowMain.getGenomeAttribute(params, 'snpeff_db') +params.snpeff_genome = WorkflowMain.getGenomeAttribute(params, 'snpeff_genome') +params.vep_cache_version = WorkflowMain.getGenomeAttribute(params, 'vep_cache_version') +params.vep_genome = WorkflowMain.getGenomeAttribute(params, 'vep_genome') +params.vep_species = WorkflowMain.getGenomeAttribute(params, 'vep_species') + +/* 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ALTERNATIVE INPUT FILE ON RESTART +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ -// TODO nf-core: Remove this line if you don't need a FASTA file -// This is an example of how to use getGenomeAttribute() to fetch parameters -// from igenomes.config using `--genome` -params.fasta = WorkflowMain.getGenomeAttribute(params, 'fasta') +params.input_restart = WorkflowSarek.retrieveInput(params, log) /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -34,7 +82,7 @@ include { validateParameters; paramsHelp } from 'plugin/nf-validation' if (params.help) { def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs) def citation = '\n' + WorkflowMain.citation(workflow) + '\n' - def String command = "nextflow run ${workflow.manifest.name} --input samplesheet.csv --genome GRCh37 -profile docker" + def String command = "nextflow run ${workflow.manifest.name} --input samplesheet.csv --genome GATK.GRCh38 -profile docker --outdir results" log.info logo + paramsHelp(command) + citation + NfcoreTemplate.dashedLine(params.monochrome_logs) System.exit(0) } @@ -54,9 +102,7 @@ WorkflowMain.initialise(workflow, params, log) include { SAREK } from './workflows/sarek' -// // WORKFLOW: Run main nf-core/sarek analysis pipeline -// workflow NFCORE_SAREK { SAREK () } @@ -67,10 +113,8 @@ workflow NFCORE_SAREK { ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -// // WORKFLOW: Execute a single named workflow for the pipeline // See: https://github.com/nf-core/rnaseq/issues/619 -// workflow { NFCORE_SAREK () } diff --git a/modules.json b/modules.json index 4c0ace7a32..e2440fa692 100644 --- a/modules.json +++ b/modules.json @@ -5,20 +5,504 @@ "https://github.com/nf-core/modules.git": { "modules": { "nf-core": { + "ascat": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "bcftools/annotate": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"], + "patch": "modules/nf-core/bcftools/annotate/bcftools-annotate.diff" + }, + "bcftools/concat": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "bcftools/mpileup": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["bam_ngscheckmate"] + }, + "bcftools/sort": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "bcftools/stats": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "bwa/index": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "bwa/mem": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "bwamem2/index": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "bwamem2/mem": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "cat/cat": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "cat/fastq": { + "branch": "master", + "git_sha": 
"3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "cnvkit/antitarget": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "cnvkit/batch": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "cnvkit/genemetrics": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "cnvkit/reference": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "controlfreec/assesssignificance": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"], + "patch": "modules/nf-core/controlfreec/assesssignificance/controlfreec-assesssignificance.diff" + }, + "controlfreec/freec": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "controlfreec/freec2bed": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "controlfreec/freec2circos": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "controlfreec/makegraph": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"], + "patch": "modules/nf-core/controlfreec/makegraph/controlfreec-makegraph.diff" + }, "custom/dumpsoftwareversions": { "branch": "master", "git_sha": "bba7e362e4afead70653f84d8700588ea28d0f9e", "installed_by": ["modules"] }, + "deepvariant": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "dragmap/align": { + "branch": "master", + "git_sha": "3c77ca9aac783e76c3614a06db3bfe4fef619bde", + "installed_by": ["modules"], + "patch": "modules/nf-core/dragmap/align/dragmap-align.diff" + }, + "dragmap/hashtable": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"], + "patch": "modules/nf-core/dragmap/hashtable/dragmap-hashtable.diff" + }, + "ensemblvep/download": { + "branch": "master", + "git_sha": "214d575774c172062924ad3564b4f66655600730", + "installed_by": ["modules"] + }, + "ensemblvep/vep": { + "branch": "master", + "git_sha": "214d575774c172062924ad3564b4f66655600730", + "installed_by": ["vcf_annotate_ensemblvep", "modules"] + }, + "fastp": { + "branch": "master", + "git_sha": "3c77ca9aac783e76c3614a06db3bfe4fef619bde", + "installed_by": ["modules"] + }, "fastqc": { "branch": "master", "git_sha": "65ad3e0b9a4099592e1102e92e10455dc661cf53", "installed_by": ["modules"] }, + "fgbio/callmolecularconsensusreads": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "fgbio/fastqtobam": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "fgbio/groupreadsbyumi": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "freebayes": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "gatk4/applybqsr": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "gatk4/applyvqsr": { + "branch": "master", + "git_sha": 
"3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "gatk4/baserecalibrator": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "gatk4/calculatecontamination": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "gatk4/cnnscorevariants": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "gatk4/createsequencedictionary": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "gatk4/estimatelibrarycomplexity": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "gatk4/filtermutectcalls": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "gatk4/filtervarianttranches": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "gatk4/gatherbqsrreports": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "gatk4/gatherpileupsummaries": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "gatk4/genomicsdbimport": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "gatk4/genotypegvcfs": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "gatk4/getpileupsummaries": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "gatk4/haplotypecaller": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "gatk4/intervallisttobed": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "gatk4/learnreadorientationmodel": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "gatk4/markduplicates": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "gatk4/mergemutectstats": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "gatk4/mergevcfs": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "gatk4/mutect2": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "gatk4/variantrecalibrator": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "gatk4spark/applybqsr": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "gatk4spark/baserecalibrator": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "gatk4spark/markduplicates": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "manta/germline": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": 
["modules"] + }, + "manta/somatic": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "manta/tumoronly": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "mosdepth": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "msisensorpro/msisomatic": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "msisensorpro/scan": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, "multiqc": { "branch": "master", "git_sha": "4ab13872435962dadc239979554d13709e20bf29", "installed_by": ["modules"] + }, + "ngscheckmate/ncm": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["bam_ngscheckmate"] + }, + "samblaster": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "samtools/bam2fq": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "samtools/collatefastq": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "samtools/convert": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "samtools/faidx": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "samtools/index": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "samtools/merge": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "samtools/mpileup": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "samtools/stats": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "samtools/view": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "sentieon/applyvarcal": { + "branch": "master", + "git_sha": "89b6873f15dd31ed17f4d10ede2fa623e2a128ff", + "installed_by": ["modules"] + }, + "sentieon/bwamem": { + "branch": "master", + "git_sha": "89b6873f15dd31ed17f4d10ede2fa623e2a128ff", + "installed_by": ["modules"] + }, + "sentieon/dedup": { + "branch": "master", + "git_sha": "89b6873f15dd31ed17f4d10ede2fa623e2a128ff", + "installed_by": ["modules"] + }, + "sentieon/dnamodelapply": { + "branch": "master", + "git_sha": "89b6873f15dd31ed17f4d10ede2fa623e2a128ff", + "installed_by": ["modules"] + }, + "sentieon/dnascope": { + "branch": "master", + "git_sha": "89b6873f15dd31ed17f4d10ede2fa623e2a128ff", + "installed_by": ["modules"] + }, + "sentieon/gvcftyper": { + "branch": "master", + "git_sha": "89b6873f15dd31ed17f4d10ede2fa623e2a128ff", + "installed_by": ["modules"] + }, + "sentieon/haplotyper": { + "branch": "master", + "git_sha": "89b6873f15dd31ed17f4d10ede2fa623e2a128ff", + "installed_by": ["modules"] + }, + "sentieon/varcal": { + "branch": "master", + "git_sha": "89b6873f15dd31ed17f4d10ede2fa623e2a128ff", + "installed_by": ["modules"] + }, + "snpeff/download": { + "branch": "master", + "git_sha": 
"214d575774c172062924ad3564b4f66655600730", + "installed_by": ["modules"] + }, + "snpeff/snpeff": { + "branch": "master", + "git_sha": "214d575774c172062924ad3564b4f66655600730", + "installed_by": ["modules", "vcf_annotate_snpeff"] + }, + "strelka/germline": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "strelka/somatic": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "svdb/merge": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "tabix/bgziptabix": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules", "vcf_annotate_snpeff"] + }, + "tabix/tabix": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["vcf_annotate_ensemblvep", "modules"] + }, + "tiddit/sv": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "untar": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "unzip": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "vcftools": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + } + } + }, + "subworkflows": { + "nf-core": { + "bam_ngscheckmate": { + "branch": "master", + "git_sha": "cfd937a668919d948f6fcbf4218e79de50c2f36f", + "installed_by": ["subworkflows"] + }, + "vcf_annotate_ensemblvep": { + "branch": "master", + "git_sha": "cfd937a668919d948f6fcbf4218e79de50c2f36f", + "installed_by": ["subworkflows"] + }, + "vcf_annotate_snpeff": { + "branch": "master", + "git_sha": "cfd937a668919d948f6fcbf4218e79de50c2f36f", + "installed_by": ["subworkflows"] } } } diff --git a/modules/local/add_info_to_vcf/environment.yml b/modules/local/add_info_to_vcf/environment.yml new file mode 100644 index 0000000000..34513c7f4a --- /dev/null +++ b/modules/local/add_info_to_vcf/environment.yml @@ -0,0 +1,7 @@ +name: gawk +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - anaconda::gawk=5.1.0 diff --git a/modules/local/add_info_to_vcf/main.nf b/modules/local/add_info_to_vcf/main.nf new file mode 100644 index 0000000000..a55cf14b56 --- /dev/null +++ b/modules/local/add_info_to_vcf/main.nf @@ -0,0 +1,41 @@ +process ADD_INFO_TO_VCF { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gawk:5.1.0' : + 'biocontainers/gawk:5.1.0' }" + + input: + tuple val(meta), path(vcf_gz) + + output: + tuple val(meta), path("*.added_info.vcf"), emit: vcf + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + input="input.vcf" + output="${vcf_gz.baseName.minus(".vcf")}.added_info.vcf" + zcat $vcf_gz > \$input + ## Add info header lines + grep -E "^##" \$input > \$output + ## Add description of new INFO value + echo '##INFO=' >> \$output + ## Add column header + grep -E "^#CHROM" \$input >> \$output + ## Add SOURCE value to INFO column of variant calls + if grep -Ev "^#" \$input; then + grep -Ev "^#" \$input | awk 'BEGIN{FS=OFS="\t"} { \$8=="." ? 
\$8="SOURCE=$vcf_gz" : \$8=\$8";SOURCE=$vcf_gz"; print }' >> \$output + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//') + END_VERSIONS + """ +} diff --git a/modules/local/build_intervals/environment.yml b/modules/local/build_intervals/environment.yml new file mode 100644 index 0000000000..34513c7f4a --- /dev/null +++ b/modules/local/build_intervals/environment.yml @@ -0,0 +1,7 @@ +name: gawk +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - anaconda::gawk=5.1.0 diff --git a/modules/local/build_intervals/main.nf b/modules/local/build_intervals/main.nf new file mode 100644 index 0000000000..9c562f5c17 --- /dev/null +++ b/modules/local/build_intervals/main.nf @@ -0,0 +1,29 @@ +process BUILD_INTERVALS { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gawk:5.1.0' : + 'biocontainers/gawk:5.1.0' }" + + input: + tuple val(meta), path(fasta_fai) + + output: + tuple val(meta), path("${fasta_fai.baseName}.bed") , emit: bed + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + awk -v FS='\t' -v OFS='\t' '{ print \$1, \"0\", \$2 }' ${fasta_fai} > ${fasta_fai.baseName}.bed + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//') + END_VERSIONS + """ +} diff --git a/modules/local/create_intervals_bed/environment.yml b/modules/local/create_intervals_bed/environment.yml new file mode 100644 index 0000000000..34513c7f4a --- /dev/null +++ b/modules/local/create_intervals_bed/environment.yml @@ -0,0 +1,7 @@ +name: gawk +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - anaconda::gawk=5.1.0 diff --git a/modules/local/create_intervals_bed/main.nf b/modules/local/create_intervals_bed/main.nf new file mode 100644 index 0000000000..88160ccbf4 --- /dev/null +++ b/modules/local/create_intervals_bed/main.nf @@ -0,0 +1,74 @@ +process CREATE_INTERVALS_BED { + tag "$intervals" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/gawk:5.1.0' : + 'biocontainers/gawk:5.1.0' }" + + input: + path(intervals) + + output: + path("*.bed") , emit: bed + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // If intervals file is in BED format, + // Fifth column is interpreted to contain runtime estimates + // Which is then used to combine short-running jobs + if (intervals.toString().toLowerCase().endsWith("bed")) { + """ + awk -vFS="\t" '{ + t = \$5 # runtime estimate + if (t == "") { + # no runtime estimate in this row, assume default value + t = (\$3 - \$2) / ${params.nucleotides_per_second} + } + if (name == "" || (chunk > 600 && (chunk + t) > longest * 1.05)) { + # start a new chunk + name = sprintf("%s_%d-%d.bed", \$1, \$2+1, \$3) + chunk = 0 + longest = 0 + } + if (t > longest) + longest = t + chunk += t + print \$0 > name + }' ${intervals} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//') + END_VERSIONS + """ + } else if (intervals.toString().toLowerCase().endsWith("interval_list")) { + """ + grep -v '^@' ${intervals} | awk -vFS="\t" '{ + name = sprintf("%s_%d-%d", \$1, \$2, \$3); + printf("%s\\t%d\\t%d\\n", \$1, \$2-1, \$3) > name ".bed" + }' + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//') + END_VERSIONS + """ + } else { + """ + awk -vFS="[:-]" '{ + name = sprintf("%s_%d-%d", \$1, \$2, \$3); + printf("%s\\t%d\\t%d\\n", \$1, \$2-1, \$3) > name ".bed" + }' ${intervals} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//') + END_VERSIONS + """ + } +} diff --git a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf deleted file mode 100644 index de79fb1f74..0000000000 --- a/modules/local/samplesheet_check.nf +++ /dev/null @@ -1,31 +0,0 @@ -process SAMPLESHEET_CHECK { - tag "$samplesheet" - label 'process_single' - - conda "conda-forge::python=3.8.3" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/python:3.8.3' : - 'biocontainers/python:3.8.3' }" - - input: - path samplesheet - - output: - path '*.csv' , emit: csv - path "versions.yml", emit: versions - - when: - task.ext.when == null || task.ext.when - - script: // This script is bundled with the pipeline, in nf-core/sarek/bin/ - """ - check_samplesheet.py \\ - $samplesheet \\ - samplesheet.valid.csv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python --version | sed 's/Python //g') - END_VERSIONS - """ -} diff --git a/modules/nf-core/ascat/environment.yml b/modules/nf-core/ascat/environment.yml new file mode 100644 index 0000000000..52935f0978 --- /dev/null +++ b/modules/nf-core/ascat/environment.yml @@ -0,0 +1,8 @@ +name: ascat +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::ascat=3.1.1 + - bioconda::cancerit-allelecount=4.3.0 diff --git a/modules/nf-core/ascat/main.nf b/modules/nf-core/ascat/main.nf new file mode 100644 index 0000000000..8aeb9847b5 --- /dev/null +++ b/modules/nf-core/ascat/main.nf @@ -0,0 +1,200 @@ +process ASCAT { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
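The BED branch of CREATE_INTERVALS_BED above packs intervals into per-chunk files by estimated runtime. A hedged Groovy sketch of that grouping rule, assuming intervals are maps with start, end and an optional runtime estimate; the thresholds mirror the awk, everything else is illustrative:

```
// Illustrative restatement of the awk grouping in CREATE_INTERVALS_BED:
// the fifth BED column is a runtime estimate in seconds; missing values
// default to interval length / nucleotides_per_second. A new chunk
// starts once the running total exceeds 600s and the next interval
// would push it past 1.05x the longest single estimate in the chunk.
def chunkIntervals(List<Map> intervals, double nucleotidesPerSecond) {
    def groups = []
    def current = null
    double chunk = 0, longest = 0
    intervals.each { iv ->
        double t = iv.runtime != null ? iv.runtime
                                      : (iv.end - iv.start) / nucleotidesPerSecond
        if (current == null || (chunk > 600 && chunk + t > longest * 1.05)) {
            current = []
            groups << current
            chunk = 0
            longest = 0
        }
        if (t > longest) longest = t
        chunk += t
        current << iv
    }
    return groups
}
```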
+ 'https://depot.galaxyproject.org/singularity/mulled-v2-c278c7398beb73294d78639a864352abef2931ce:ba3e6d2157eac2d38d22e62ec87675e12adb1010-0': + 'biocontainers/mulled-v2-c278c7398beb73294d78639a864352abef2931ce:ba3e6d2157eac2d38d22e62ec87675e12adb1010-0' }" + + input: + tuple val(meta), path(input_normal), path(index_normal), path(input_tumor), path(index_tumor) + path(allele_files) + path(loci_files) + path(bed_file) // optional + path(fasta) // optional + path(gc_file) // optional + path(rt_file) // optional + + output: + tuple val(meta), path("*alleleFrequencies_chr*.txt"), emit: allelefreqs + tuple val(meta), path("*BAF.txt"), emit: bafs + tuple val(meta), path("*cnvs.txt"), emit: cnvs + tuple val(meta), path("*LogR.txt"), emit: logrs + tuple val(meta), path("*metrics.txt"), emit: metrics + tuple val(meta), path("*png"), emit: png + tuple val(meta), path("*purityploidy.txt"), emit: purityploidy + tuple val(meta), path("*segments.txt"), emit: segments + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def gender = args.gender ? "$args.gender" : "NULL" + def genomeVersion = args.genomeVersion ? "$args.genomeVersion" : "NULL" + def purity = args.purity ? "$args.purity" : "NULL" + def ploidy = args.ploidy ? "$args.ploidy" : "NULL" + def gc_input = gc_file ? "$gc_file" : "NULL" + def rt_input = rt_file ? "$rt_file" : "NULL" + + def minCounts_arg = args.minCounts ? ",minCounts = $args.minCounts" : "" + def bed_file_arg = bed_file ? ",BED_file = '$bed_file'": "" + def chrom_names_arg = args.chrom_names ? ",chrom_names = $args.chrom_names" : "" + def min_base_qual_arg = args.min_base_qual ? ",min_base_qual = $args.min_base_qual" : "" + def min_map_qual_arg = args.min_map_qual ? ",min_map_qual = $args.min_map_qual" : "" + def fasta_arg = fasta ? ",ref.fasta = '$fasta'" : "" + def skip_allele_counting_tumour_arg = args.skip_allele_counting_tumour ? ",skip_allele_counting_tumour = $args.skip_allele_counting_tumour" : "" + def skip_allele_counting_normal_arg = args.skip_allele_counting_normal ? 
",skip_allele_counting_normal = $args.skip_allele_counting_normal" : "" + + """ + #!/usr/bin/env Rscript + library(RColorBrewer) + library(ASCAT) + options(bitmapType='cairo') + + #build prefixes: + allele_path = normalizePath("$allele_files") + allele_prefix = paste0(allele_path, "/", "$allele_files", "_chr") + + loci_path = normalizePath("$loci_files") + loci_prefix = paste0(loci_path, "/", "$loci_files", "_chr") + + #prepare from BAM files + ascat.prepareHTS( + tumourseqfile = "$input_tumor", + normalseqfile = "$input_normal", + tumourname = paste0("$prefix", ".tumour"), + normalname = paste0("$prefix", ".normal"), + allelecounter_exe = "alleleCounter", + alleles.prefix = allele_prefix, + loci.prefix = loci_prefix, + gender = "$gender", + genomeVersion = "$genomeVersion", + nthreads = $task.cpus + $minCounts_arg + $bed_file_arg + $chrom_names_arg + $min_base_qual_arg + $min_map_qual_arg + $fasta_arg + $skip_allele_counting_tumour_arg + $skip_allele_counting_normal_arg, + seed = 42 + ) + + + #Load the data + ascat.bc = ascat.loadData( + Tumor_LogR_file = paste0("$prefix", ".tumour_tumourLogR.txt"), + Tumor_BAF_file = paste0("$prefix", ".tumour_tumourBAF.txt"), + Germline_LogR_file = paste0("$prefix", ".tumour_normalLogR.txt"), + Germline_BAF_file = paste0("$prefix", ".tumour_normalBAF.txt"), + genomeVersion = "$genomeVersion", + gender = "$gender" + ) + + #Plot the raw data + ascat.plotRawData(ascat.bc, img.prefix = paste0("$prefix", ".before_correction.")) + + # optional LogRCorrection + if("$gc_input" != "NULL") { + gc_input = paste0(normalizePath("$gc_input"), "/", "$gc_input", ".txt") + + if("$rt_input" != "NULL"){ + rt_input = paste0(normalizePath("$rt_input"), "/", "$rt_input", ".txt") + ascat.bc = ascat.correctLogR(ascat.bc, GCcontentfile = gc_input, replictimingfile = rt_input) + #Plot raw data after correction + ascat.plotRawData(ascat.bc, img.prefix = paste0("$prefix", ".after_correction_gc_rt.")) + } + else { + ascat.bc = ascat.correctLogR(ascat.bc, GCcontentfile = gc_input, replictimingfile = $rt_input) + #Plot raw data after correction + ascat.plotRawData(ascat.bc, img.prefix = paste0("$prefix", ".after_correction_gc.")) + } + } + + #Segment the data + ascat.bc = ascat.aspcf(ascat.bc, seed=42) + + #Plot the segmented data + ascat.plotSegmentedData(ascat.bc) + + #Run ASCAT to fit every tumor to a model, inferring ploidy, normal cell contamination, and discrete copy numbers + #If psi and rho are manually set: + if (!is.null($purity) && !is.null($ploidy)){ + ascat.output <- ascat.runAscat(ascat.bc, gamma=1, rho_manual=$purity, psi_manual=$ploidy) + } else if(!is.null($purity) && is.null($ploidy)){ + ascat.output <- ascat.runAscat(ascat.bc, gamma=1, rho_manual=$purity) + } else if(!is.null($ploidy) && is.null($purity)){ + ascat.output <- ascat.runAscat(ascat.bc, gamma=1, psi_manual=$ploidy) + } else { + ascat.output <- ascat.runAscat(ascat.bc, gamma=1) + } + + #Extract metrics from ASCAT profiles + QC = ascat.metrics(ascat.bc,ascat.output) + + #Write out segmented regions (including regions with one copy of each allele) + write.table(ascat.output[["segments"]], file=paste0("$prefix", ".segments.txt"), sep="\t", quote=F, row.names=F) + + #Write out CNVs in bed format + cnvs=ascat.output[["segments"]][2:6] + write.table(cnvs, file=paste0("$prefix",".cnvs.txt"), sep="\t", quote=F, row.names=F, col.names=T) + + #Write out purity and ploidy info + summary <- tryCatch({ + matrix(c(ascat.output[["aberrantcellfraction"]], ascat.output[["ploidy"]]), ncol=2, byrow=TRUE)}, error = 
function(err) { + # error handler picks up where error was generated + print(paste("Could not find optimal solution: ",err)) + return(matrix(c(0,0),nrow=1,ncol=2,byrow = TRUE)) + } + ) + colnames(summary) <- c("AberrantCellFraction","Ploidy") + write.table(summary, file=paste0("$prefix",".purityploidy.txt"), sep="\t", quote=F, row.names=F, col.names=T) + + write.table(QC, file=paste0("$prefix", ".metrics.txt"), sep="\t", quote=F, row.names=F) + + # version export + f <- file("versions.yml","w") + alleleCounter_version = system(paste("alleleCounter --version"), intern = T) + ascat_version = sessionInfo()\$otherPkgs\$ASCAT\$Version + writeLines(paste0('"', "$task.process", '"', ":"), f) + writeLines(paste(" alleleCounter:", alleleCounter_version), f) + writeLines(paste(" ascat:", ascat_version), f) + close(f) + + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + echo stub > ${prefix}.after_correction.gc_rt.test.tumour.germline.png + echo stub > ${prefix}.after_correction.gc_rt.test.tumour.tumour.png + echo stub > ${prefix}.before_correction.test.tumour.germline.png + echo stub > ${prefix}.before_correction.test.tumour.tumour.png + echo stub > ${prefix}.cnvs.txt + echo stub > ${prefix}.metrics.txt + echo stub > ${prefix}.normal_alleleFrequencies_chr21.txt + echo stub > ${prefix}.normal_alleleFrequencies_chr22.txt + echo stub > ${prefix}.purityploidy.txt + echo stub > ${prefix}.segments.txt + echo stub > ${prefix}.tumour.ASPCF.png + echo stub > ${prefix}.tumour.sunrise.png + echo stub > ${prefix}.tumour_alleleFrequencies_chr21.txt + echo stub > ${prefix}.tumour_alleleFrequencies_chr22.txt + echo stub > ${prefix}.tumour_normalBAF.txt + echo stub > ${prefix}.tumour_normalLogR.txt + echo stub > ${prefix}.tumour_tumourBAF.txt + echo stub > ${prefix}.tumour_tumourLogR.txt + + echo "${task.process}:" > versions.yml + echo ' alleleCounter: 4.3.0' >> versions.yml + echo ' ascat: 3.0.0' >> versions.yml + + """ + + +} diff --git a/modules/nf-core/ascat/meta.yml b/modules/nf-core/ascat/meta.yml new file mode 100644 index 0000000000..34ea2e51d9 --- /dev/null +++ b/modules/nf-core/ascat/meta.yml @@ -0,0 +1,119 @@ +name: ascat +description: copy number profiles of tumour cells. +keywords: + - bam + - copy number + - cram +tools: + - ascat: + description: ASCAT is a method to derive copy number profiles of tumour cells, accounting for normal cell admixture and tumour aneuploidy. ASCAT infers tumour purity (the fraction of tumour cells) and ploidy (the amount of DNA per tumour cell), expressed as multiples of haploid genomes from SNP array or massively parallel sequencing data, and calculates whole-genome allele-specific copy number profiles (the number of copies of both parental alleles for all SNP loci across the genome). + documentation: https://github.com/VanLoo-lab/ascat/tree/master/man + tool_dev_url: https://github.com/VanLoo-lab/ascat + doi: "10.1093/bioinformatics/btaa538" + licence: ["GPL v3"] +input: + - args: + type: map + description: | + Groovy Map containing tool parameters. MUST follow the structure/keywords below and be provided via modules.config. Parameters must be set between quotes. (optional) parameters can be removed from the map, if they are not set. For default values, please check the documentation above. 
+ + ``` + { + [ + "gender": "XX", + "genomeVersion": "hg19" + "purity": (optional), + "ploidy": (optional), + "gc_files": (optional), + "minCounts": (optional), + "BED_file": (optional) but recommended for WES, + "chrom_names": (optional), + "min_base_qual": (optional), + "min_map_qual": (optional), + "ref_fasta": (optional), + "skip_allele_counting_tumour": (optional), + "skip_allele_counting_normal": (optional) + ] + } + ``` + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input_normal: + type: file + description: BAM/CRAM file, must adhere to chr1, chr2, ...chrX notation For modifying chromosome notation in bam files please follow https://josephcckuo.wordpress.com/2016/11/17/modify-chromosome-notation-in-bam-file/. + pattern: "*.{bam,cram}" + - index_normal: + type: file + description: index for normal_bam/cram + pattern: "*.{bai,crai}" + - input_tumor: + type: file + description: BAM/CRAM file, must adhere to chr1, chr2, ...chrX notation + pattern: "*.{bam,cram}" + - index_tumor: + type: file + description: index for tumor_bam/cram + pattern: "*.{bai,crai}" + - allele_files: + type: file + description: allele files for ASCAT WGS. Can be downloaded here https://github.com/VanLoo-lab/ascat/tree/master/ReferenceFiles/WGS + - loci_files: + type: file + description: loci files for ASCAT WGS. Loci files without chromosome notation can be downloaded here https://github.com/VanLoo-lab/ascat/tree/master/ReferenceFiles/WGS Make sure the chromosome notation matches the bam/cram input files. To add the chromosome notation to loci files (hg19/hg38) if necessary, you can run this command `if [[ $(samtools view | head -n1 | cut -f3)\" == *\"chr\"* ]]; then for i in {1..22} X; do sed -i 's/^/chr/' G1000_loci_hg19_chr_${i}.txt; done; fi` + - bed_file: + type: file + description: Bed file for ASCAT WES (optional, but recommended for WES) + - fasta: + type: file + description: Reference fasta file (optional) + - gc_file: + type: file + description: GC correction file (optional) - Used to do logR correction of the tumour sample(s) with genomic GC content + - rt_file: + type: file + description: replication timing correction file (optional, provide only in combination with gc_file) +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
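Complementing the args map documented above, a hypothetical modules.config fragment showing how such a map might be supplied; the parameter values are examples only, not defaults:

```
// Hypothetical configuration: ASCAT reads its tool settings from the
// ext.args map described in this meta.yml (values are placeholders).
process {
    withName: 'ASCAT' {
        ext.args = [
            gender       : 'XX',
            genomeVersion: 'hg38',
            purity       : '0.8'   // optional; omit to let ASCAT estimate it
        ]
    }
}
```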
[ id:'test', single_end:false ] + - allelefreqs: + type: file + description: Files containing allele frequencies per chromosome + pattern: "*{alleleFrequencies_chr*.txt}" + - metrics: + type: file + description: File containing quality metrics + pattern: "*.{metrics.txt}" + - png: + type: file + description: ASCAT plots + pattern: "*.{png}" + - purityploidy: + type: file + description: File with purity and ploidy data + pattern: "*.{purityploidy.txt}" + - segments: + type: file + description: File with segments data + pattern: "*.{segments.txt}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@aasNGC" + - "@lassefolkersen" + - "@FriederikeHanssen" + - "@maxulysse" + - "@SusiJo" +maintainers: + - "@aasNGC" + - "@lassefolkersen" + - "@FriederikeHanssen" + - "@maxulysse" + - "@SusiJo" diff --git a/modules/nf-core/bcftools/annotate/bcftools-annotate.diff b/modules/nf-core/bcftools/annotate/bcftools-annotate.diff new file mode 100644 index 0000000000..4376af5172 --- /dev/null +++ b/modules/nf-core/bcftools/annotate/bcftools-annotate.diff @@ -0,0 +1,28 @@ +Changes in module 'nf-core/bcftools/annotate' +--- modules/nf-core/bcftools/annotate/main.nf ++++ modules/nf-core/bcftools/annotate/main.nf +@@ -8,7 +8,10 @@ + 'biocontainers/bcftools:1.17--haef29d1_0' }" + + input: +- tuple val(meta), path(input), path(index), path(annotations), path(annotations_index), path(header_lines) ++ tuple val(meta), path(input) ++ path annotations ++ path annotations_index ++ path header_lines + + output: + tuple val(meta), path("*.{vcf,vcf.gz,bcf,bcf.gz}"), emit: vcf +@@ -29,6 +32,10 @@ + "vcf" + if ("$input" == "${prefix}.${extension}") error "Input and output names are the same, set prefix in module configuration to disambiguate!" + """ ++ bcftools \\ ++ index \\ ++ $input ++ + bcftools \\ + annotate \\ + $args \\ + +************************************************************ diff --git a/modules/nf-core/bcftools/annotate/environment.yml b/modules/nf-core/bcftools/annotate/environment.yml new file mode 100644 index 0000000000..273ffff4c2 --- /dev/null +++ b/modules/nf-core/bcftools/annotate/environment.yml @@ -0,0 +1,7 @@ +name: bcftools_annotate +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bcftools=1.17 diff --git a/modules/nf-core/bcftools/annotate/main.nf b/modules/nf-core/bcftools/annotate/main.nf new file mode 100644 index 0000000000..f00c5fe2d7 --- /dev/null +++ b/modules/nf-core/bcftools/annotate/main.nf @@ -0,0 +1,70 @@ +process BCFTOOLS_ANNOTATE { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bcftools:1.17--haef29d1_0': + 'biocontainers/bcftools:1.17--haef29d1_0' }" + + input: + tuple val(meta), path(input) + path annotations + path annotations_index + path header_lines + + output: + tuple val(meta), path("*.{vcf,vcf.gz,bcf,bcf.gz}"), emit: vcf + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def header_file = header_lines ? "--header-lines ${header_lines}" : '' + def annotations_file = annotations ? "--annotations ${annotations}" : '' + def extension = args.contains("--output-type b") || args.contains("-Ob") ? "bcf.gz" : + args.contains("--output-type u") || args.contains("-Ou") ?
"bcf" : + args.contains("--output-type z") || args.contains("-Oz") ? "vcf.gz" : + args.contains("--output-type v") || args.contains("-Ov") ? "vcf" : + "vcf" + if ("$input" == "${prefix}.${extension}") error "Input and output names are the same, set prefix in module configuration to disambiguate!" + """ + bcftools \\ + index \\ + $input + + bcftools \\ + annotate \\ + $args \\ + $annotations_file \\ + $header_file \\ + --output ${prefix}.${extension} \\ + --threads $task.cpus \\ + $input + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$( bcftools --version |& sed '1!d; s/^.*bcftools //' ) + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def extension = args.contains("--output-type b") || args.contains("-Ob") ? "bcf.gz" : + args.contains("--output-type u") || args.contains("-Ou") ? "bcf" : + args.contains("--output-type z") || args.contains("-Oz") ? "vcf.gz" : + args.contains("--output-type v") || args.contains("-Ov") ? "vcf" : + "vcf" + """ + touch ${prefix}.${extension} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$( bcftools --version |& sed '1!d; s/^.*bcftools //' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/bcftools/annotate/meta.yml b/modules/nf-core/bcftools/annotate/meta.yml new file mode 100644 index 0000000000..f3aa463bf5 --- /dev/null +++ b/modules/nf-core/bcftools/annotate/meta.yml @@ -0,0 +1,56 @@ +name: bcftools_annotate +description: Add or remove annotations. +keywords: + - bcftools + - annotate + - vcf + - remove + - add +tools: + - annotate: + description: Add or remove annotations. + homepage: http://samtools.github.io/bcftools/bcftools.html + documentation: https://samtools.github.io/bcftools/bcftools.html#annotate + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: Query VCF or BCF file, can be either uncompressed or compressed + - index: + type: file + description: Index of the query VCF or BCF file + - annotations: + type: file + description: Bgzip-compressed file with annotations + - annotations_index: + type: file + description: Index of the annotations file + - header_lines: + type: file + description: Contains lines to append to the output VCF header +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - vcf: + type: file + description: Compressed annotated VCF file + pattern: "*{vcf,vcf.gz,bcf,bcf.gz}" +authors: + - "@projectoriented" + - "@ramprasadn" +maintainers: + - "@projectoriented" + - "@ramprasadn" diff --git a/modules/nf-core/bcftools/concat/environment.yml b/modules/nf-core/bcftools/concat/environment.yml new file mode 100644 index 0000000000..be2539990b --- /dev/null +++ b/modules/nf-core/bcftools/concat/environment.yml @@ -0,0 +1,7 @@ +name: bcftools_concat +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bcftools=1.17 diff --git a/modules/nf-core/bcftools/concat/main.nf b/modules/nf-core/bcftools/concat/main.nf new file mode 100644 index 0000000000..2ff690b1be --- /dev/null +++ b/modules/nf-core/bcftools/concat/main.nf @@ -0,0 +1,46 @@ +process BCFTOOLS_CONCAT { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bcftools:1.17--haef29d1_0': + 'biocontainers/bcftools:1.17--haef29d1_0' }" + + input: + tuple val(meta), path(vcfs), path(tbi) + + output: + tuple val(meta), path("*.gz"), emit: vcf + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + bcftools concat \\ + --output ${prefix}.vcf.gz \\ + $args \\ + --threads $task.cpus \\ + ${vcfs} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.vcf.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/bcftools/concat/meta.yml b/modules/nf-core/bcftools/concat/meta.yml new file mode 100644 index 0000000000..8731b17bc8 --- /dev/null +++ b/modules/nf-core/bcftools/concat/meta.yml @@ -0,0 +1,51 @@ +name: bcftools_concat +description: Concatenate VCF files +keywords: + - variant calling + - concat + - bcftools + - VCF +tools: + - concat: + description: | + Concatenate VCF files. + homepage: http://samtools.github.io/bcftools/bcftools.html + documentation: http://www.htslib.org/doc/bcftools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - vcfs: + type: files + description: | + List containing 2 or more vcf files + e.g. [ 'file1.vcf', 'file2.vcf' ] + - tbi: + type: files + description: | + List containing 2 or more index files (optional) + e.g. [ 'file1.tbi', 'file2.tbi' ] +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
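A hedged sketch of how BCFTOOLS_CONCAT might be fed from scattered per-interval calls; the channel contents and file names are illustrative, and the include path assumes the standard nf-core module layout:

```
include { BCFTOOLS_CONCAT } from './modules/nf-core/bcftools/concat/main'

// Per-interval VCFs and indices for the same sample are grouped into
// lists so that one concat task runs per sample.
workflow {
    scattered = Channel.of(
        [ [id:'sample1'], file('chr1.vcf.gz'), file('chr1.vcf.gz.tbi') ],
        [ [id:'sample1'], file('chr2.vcf.gz'), file('chr2.vcf.gz.tbi') ]
    )
    BCFTOOLS_CONCAT(scattered.groupTuple())   // [ meta, [vcfs], [tbis] ]
}
```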
[ id:'test', single_end:false ] + - vcf: + type: file + description: VCF concatenated output file + pattern: "*.{vcf.gz}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@abhi18av" + - "@nvnieuwk" +maintainers: + - "@abhi18av" + - "@nvnieuwk" diff --git a/modules/nf-core/bcftools/mpileup/environment.yml b/modules/nf-core/bcftools/mpileup/environment.yml new file mode 100644 index 0000000000..346d187fec --- /dev/null +++ b/modules/nf-core/bcftools/mpileup/environment.yml @@ -0,0 +1,7 @@ +name: bcftools_mpileup +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bcftools=1.17 diff --git a/modules/nf-core/bcftools/mpileup/main.nf b/modules/nf-core/bcftools/mpileup/main.nf new file mode 100644 index 0000000000..83bec8ef5f --- /dev/null +++ b/modules/nf-core/bcftools/mpileup/main.nf @@ -0,0 +1,58 @@ +process BCFTOOLS_MPILEUP { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bcftools:1.17--haef29d1_0': + 'biocontainers/bcftools:1.17--haef29d1_0' }" + + input: + tuple val(meta), path(bam), path(intervals) + tuple val(meta2), path(fasta) + val save_mpileup + + output: + tuple val(meta), path("*vcf.gz") , emit: vcf + tuple val(meta), path("*vcf.gz.tbi") , emit: tbi + tuple val(meta), path("*stats.txt") , emit: stats + tuple val(meta), path("*.mpileup.gz"), emit: mpileup, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def args3 = task.ext.args3 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def mpileup = save_mpileup ? "| tee ${prefix}.mpileup" : "" + def bgzip_mpileup = save_mpileup ? "bgzip ${prefix}.mpileup" : "" + def intervals = intervals ? "-T ${intervals}" : "" + """ + echo "${meta.id}" > sample_name.list + + bcftools \\ + mpileup \\ + --fasta-ref $fasta \\ + $args \\ + $bam \\ + $intervals \\ + $mpileup \\ + | bcftools call --output-type v $args2 \\ + | bcftools reheader --samples sample_name.list \\ + | bcftools view --output-file ${prefix}.vcf.gz --output-type z $args3 + + $bgzip_mpileup + + tabix -p vcf -f ${prefix}.vcf.gz + + bcftools stats ${prefix}.vcf.gz > ${prefix}.bcftools_stats.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/bcftools/mpileup/meta.yml b/modules/nf-core/bcftools/mpileup/meta.yml new file mode 100644 index 0000000000..65410ddd66 --- /dev/null +++ b/modules/nf-core/bcftools/mpileup/meta.yml @@ -0,0 +1,70 @@ +name: bcftools_mpileup +description: Generates genotype likelihoods and calls variants from alignments +keywords: + - variant calling + - mpileup + - VCF +tools: + - mpileup: + description: | + Generates genotype likelihoods at each genomic position with coverage. + homepage: http://samtools.github.io/bcftools/bcftools.html + documentation: http://www.htslib.org/doc/bcftools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: Input BAM file + pattern: "*.{bam}" + - intervals: + type: file + description: Input intervals file.
A file (commonly '.bed') containing regions to subset + - meta: + type: map + description: | + Groovy Map containing information about the genome fasta, e.g. [ id: 'sarscov2' ] + - fasta: + type: file + description: FASTA reference file + pattern: "*.{fasta,fa}" + - save_mpileup: + type: boolean + description: Save mpileup file generated by bcftools mpileup +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - vcf: + type: file + description: VCF gzipped output file + pattern: "*.{vcf.gz}" + - tbi: + type: file + description: tabix index file + pattern: "*.{vcf.gz.tbi}" + - stats: + type: file + description: Text output file containing stats + pattern: "*{stats.txt}" + - mpileup: + type: file + description: mpileup gzipped output for all positions + pattern: "{*.mpileup.gz}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" +maintainers: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/bcftools/sort/environment.yml b/modules/nf-core/bcftools/sort/environment.yml new file mode 100644 index 0000000000..26d3644501 --- /dev/null +++ b/modules/nf-core/bcftools/sort/environment.yml @@ -0,0 +1,7 @@ +name: bcftools_sort +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bcftools=1.17 diff --git a/modules/nf-core/bcftools/sort/main.nf b/modules/nf-core/bcftools/sort/main.nf new file mode 100644 index 0000000000..c842daf232 --- /dev/null +++ b/modules/nf-core/bcftools/sort/main.nf @@ -0,0 +1,61 @@ +process BCFTOOLS_SORT { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bcftools:1.17--haef29d1_0': + 'biocontainers/bcftools:1.17--haef29d1_0' }" + + input: + tuple val(meta), path(vcf) + + output: + tuple val(meta), path("*.{vcf,vcf.gz,bcf,bcf.gz}") , emit: vcf + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '--output-type z' + def prefix = task.ext.prefix ?: "${meta.id}" + def extension = args.contains("--output-type b") || args.contains("-Ob") ? "bcf.gz" : + args.contains("--output-type u") || args.contains("-Ou") ? "bcf" : + args.contains("--output-type z") || args.contains("-Oz") ? "vcf.gz" : + args.contains("--output-type v") || args.contains("-Ov") ? "vcf" : + "vcf" + + """ + bcftools \\ + sort \\ + --output ${prefix}.${extension} \\ + --temp-dir . \\ + $args \\ + $vcf + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '--output-type z' + def prefix = task.ext.prefix ?: "${meta.id}" + + def extension = args.contains("--output-type b") || args.contains("-Ob") ? "bcf.gz" : + args.contains("--output-type u") || args.contains("-Ou") ? "bcf" : + args.contains("--output-type z") || args.contains("-Oz") ? "vcf.gz" : + args.contains("--output-type v") || args.contains("-Ov") ? 
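For the BCFTOOLS_MPILEUP module above, the three ext.args slots map onto the three piped bcftools invocations; a hypothetical configuration sketch (the flag choices are illustrative, not pipeline defaults):

```
// Hypothetical modules.config fragment: args feeds bcftools mpileup,
// args2 the piped bcftools call, args3 the final bcftools view.
process {
    withName: 'BCFTOOLS_MPILEUP' {
        ext.args  = '--max-depth 200 --min-BQ 20'
        ext.args2 = '--multiallelic-caller --variants-only'
        ext.args3 = ''
    }
}
```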
"vcf" : + "vcf" + + """ + touch ${prefix}.${extension} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/bcftools/sort/meta.yml b/modules/nf-core/bcftools/sort/meta.yml new file mode 100644 index 0000000000..84747c6d89 --- /dev/null +++ b/modules/nf-core/bcftools/sort/meta.yml @@ -0,0 +1,42 @@ +name: bcftools_sort +description: Sorts VCF files +keywords: + - sorting + - VCF + - variant calling +tools: + - sort: + description: Sort VCF files by coordinates. + homepage: http://samtools.github.io/bcftools/bcftools.html + documentation: http://www.htslib.org/doc/bcftools.html + tool_dev_url: https://github.com/samtools/bcftools + doi: "10.1093/bioinformatics/btp352" + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - vcf: + type: file + description: The VCF/BCF file to be sorted + pattern: "*.{vcf.gz,vcf,bcf}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - vcf: + type: file + description: Sorted VCF file + pattern: "*.{vcf.gz}" +authors: + - "@Gwennid" +maintainers: + - "@Gwennid" diff --git a/modules/nf-core/bcftools/stats/environment.yml b/modules/nf-core/bcftools/stats/environment.yml new file mode 100644 index 0000000000..a937a10a7f --- /dev/null +++ b/modules/nf-core/bcftools/stats/environment.yml @@ -0,0 +1,7 @@ +name: bcftools_stats +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bcftools=1.17 diff --git a/modules/nf-core/bcftools/stats/main.nf b/modules/nf-core/bcftools/stats/main.nf new file mode 100644 index 0000000000..b3a5f23ba6 --- /dev/null +++ b/modules/nf-core/bcftools/stats/main.nf @@ -0,0 +1,60 @@ +process BCFTOOLS_STATS { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bcftools:1.17--haef29d1_0': + 'biocontainers/bcftools:1.17--haef29d1_0' }" + + input: + tuple val(meta), path(vcf), path(tbi) + tuple val(meta2), path(regions) + tuple val(meta3), path(targets) + tuple val(meta4), path(samples) + tuple val(meta5), path(exons) + tuple val(meta6), path(fasta) + + output: + tuple val(meta), path("*stats.txt"), emit: stats + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def regions_file = regions ? "--regions-file ${regions}" : "" + def targets_file = targets ? "--targets-file ${targets}" : "" + def samples_file = samples ? "--samples-file ${samples}" : "" + def reference_fasta = fasta ? "--fasta-ref ${fasta}" : "" + def exons_file = exons ? 
"--exons ${exons}" : "" + """ + bcftools stats \\ + $args \\ + $regions_file \\ + $targets_file \\ + $samples_file \\ + $reference_fasta \\ + $exons_file \\ + $vcf > ${prefix}.bcftools_stats.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + touch ${prefix}.bcftools_stats.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/bcftools/stats/meta.yml b/modules/nf-core/bcftools/stats/meta.yml new file mode 100644 index 0000000000..7ea2103e3b --- /dev/null +++ b/modules/nf-core/bcftools/stats/meta.yml @@ -0,0 +1,77 @@ +name: bcftools_stats +description: Generates stats from VCF files +keywords: + - variant calling + - stats + - VCF +tools: + - stats: + description: | + Parses VCF or BCF and produces text file stats which is suitable for + machine processing and can be plotted using plot-vcfstats. + homepage: http://samtools.github.io/bcftools/bcftools.html + documentation: http://www.htslib.org/doc/bcftools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - vcf: + type: file + description: VCF input file + pattern: "*.{vcf}" + - tbi: + type: file + description: | + The tab index for the VCF file to be inspected. Optional: only required when parameter regions is chosen. + pattern: "*.tbi" + - regions: + type: file + description: | + Optionally, restrict the operation to regions listed in this file. (VCF, BED or tab-delimited) + - targets: + type: file + description: | + Optionally, restrict the operation to regions listed in this file (doesn't rely upon tbi index files) + - samples: + type: file + description: | + Optional, file of sample names to be included or excluded. + e.g. 'file.tsv' + - exons: + type: file + description: | + Tab-delimited file with exons for indel frameshifts (chr,beg,end; 1-based, inclusive, optionally bgzip compressed). + e.g. 'exons.tsv.gz' + - fasta: + type: file + description: | + Faidx indexed reference sequence file to determine INDEL context. + e.g. 'reference.fa' +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - stats: + type: file + description: Text output file containing stats + pattern: "*_{stats.txt}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" + - "@SusiJo" + - "@TCLamnidis" +maintainers: + - "@joseespinosa" + - "@drpatelh" + - "@SusiJo" + - "@TCLamnidis" diff --git a/modules/nf-core/bwa/index/environment.yml b/modules/nf-core/bwa/index/environment.yml new file mode 100644 index 0000000000..5d3cb3231b --- /dev/null +++ b/modules/nf-core/bwa/index/environment.yml @@ -0,0 +1,7 @@ +name: bwa_index +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bwa=0.7.17 diff --git a/modules/nf-core/bwa/index/main.nf b/modules/nf-core/bwa/index/main.nf new file mode 100644 index 0000000000..24b5a2ea99 --- /dev/null +++ b/modules/nf-core/bwa/index/main.nf @@ -0,0 +1,53 @@ +process BWA_INDEX { + tag "$fasta" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bwa:0.7.17--hed695b0_7' : + 'biocontainers/bwa:0.7.17--hed695b0_7' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path(bwa) , emit: index + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${fasta.baseName}" + def args = task.ext.args ?: '' + """ + mkdir bwa + bwa \\ + index \\ + $args \\ + -p bwa/${prefix} \\ + $fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bwa: \$(echo \$(bwa 2>&1) | sed 's/^.*Version: //; s/Contact:.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${fasta.baseName}" + """ + mkdir bwa + + touch bwa/${prefix}.amb + touch bwa/${prefix}.ann + touch bwa/${prefix}.bwt + touch bwa/${prefix}.pac + touch bwa/${prefix}.sa + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bwa: \$(echo \$(bwa 2>&1) | sed 's/^.*Version: //; s/Contact:.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/bwa/index/meta.yml b/modules/nf-core/bwa/index/meta.yml new file mode 100644 index 0000000000..730628d005 --- /dev/null +++ b/modules/nf-core/bwa/index/meta.yml @@ -0,0 +1,45 @@ +name: bwa_index +description: Create BWA index for reference genome +keywords: + - index + - fasta + - genome + - reference +tools: + - bwa: + description: | + BWA is a software package for mapping DNA sequences against + a large reference genome, such as the human genome. + homepage: http://bio-bwa.sourceforge.net/ + documentation: http://www.htslib.org/doc/samtools.html + arxiv: arXiv:1303.3997 + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: Input genome fasta file +output: + - meta: + type: map + description: | + Groovy Map containing reference information. + e.g. 
[ id:'test', single_end:false ] + - index: + type: file + description: BWA genome index files + pattern: "*.{amb,ann,bwt,pac,sa}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@maxulysse" +maintainers: + - "@drpatelh" + - "@maxulysse" diff --git a/modules/nf-core/bwa/index/tests/main.nf.test b/modules/nf-core/bwa/index/tests/main.nf.test new file mode 100644 index 0000000000..5fc8d49662 --- /dev/null +++ b/modules/nf-core/bwa/index/tests/main.nf.test @@ -0,0 +1,33 @@ +nextflow_process { + + name "Test Process BWA_INDEX" + tag "modules_nfcore" + tag "modules" + tag "bwa" + tag "bwa/index" + script "../main.nf" + process "BWA_INDEX" + + test("BWA index") { + + when { + process { + """ + input[0] = [ + [id: 'test'], + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/bwa/index/tests/main.nf.test.snap b/modules/nf-core/bwa/index/tests/main.nf.test.snap new file mode 100644 index 0000000000..e51ad5bf2b --- /dev/null +++ b/modules/nf-core/bwa/index/tests/main.nf.test.snap @@ -0,0 +1,43 @@ +{ + "BWA index": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + [ + "genome.amb:md5,3a68b8b2287e07dd3f5f95f4344ba76e", + "genome.ann:md5,c32e11f6c859f166c7525a9c1d583567", + "genome.bwt:md5,0469c30a1e239dd08f68afe66fde99da", + "genome.pac:md5,983e3d2cd6f36e2546e6d25a0da78d66", + "genome.sa:md5,ab3952cabf026b48cd3eb5bccbb636d1" + ] + ] + ], + "1": [ + "versions.yml:md5,0f20525da90e7489a7ebb02adca3265f" + ], + "index": [ + [ + { + "id": "test" + }, + [ + "genome.amb:md5,3a68b8b2287e07dd3f5f95f4344ba76e", + "genome.ann:md5,c32e11f6c859f166c7525a9c1d583567", + "genome.bwt:md5,0469c30a1e239dd08f68afe66fde99da", + "genome.pac:md5,983e3d2cd6f36e2546e6d25a0da78d66", + "genome.sa:md5,ab3952cabf026b48cd3eb5bccbb636d1" + ] + ] + ], + "versions": [ + "versions.yml:md5,0f20525da90e7489a7ebb02adca3265f" + ] + } + ], + "timestamp": "2023-10-17T17:20:20.180927714" + } +} \ No newline at end of file diff --git a/modules/nf-core/bwa/index/tests/tags.yml b/modules/nf-core/bwa/index/tests/tags.yml new file mode 100644 index 0000000000..28bb483c4e --- /dev/null +++ b/modules/nf-core/bwa/index/tests/tags.yml @@ -0,0 +1,2 @@ +bwa/index: + - modules/nf-core/bwa/index/** diff --git a/modules/nf-core/bwa/mem/environment.yml b/modules/nf-core/bwa/mem/environment.yml new file mode 100644 index 0000000000..401a0d06ce --- /dev/null +++ b/modules/nf-core/bwa/mem/environment.yml @@ -0,0 +1,9 @@ +name: bwa_mem +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bwa=0.7.17 + # renovate: datasource=conda depName=bioconda/samtools + - samtools=1.16.1 diff --git a/modules/nf-core/bwa/mem/main.nf b/modules/nf-core/bwa/mem/main.nf new file mode 100644 index 0000000000..17e6fbd06c --- /dev/null +++ b/modules/nf-core/bwa/mem/main.nf @@ -0,0 +1,55 @@ +process BWA_MEM { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
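In the BWA_MEM script below, the index prefix is recovered at runtime by locating the staged .amb file; an illustrative Groovy equivalent of that find/sed derivation (the helper name is hypothetical):

```
// Mirrors the shell `find -L ./ -name "*.amb" | sed 's/\.amb$//'` trick:
// find the .amb member of the staged index files and strip the suffix
// to obtain the prefix handed to `bwa mem`.
def indexPrefix(List<String> stagedFiles) {
    def amb = stagedFiles.find { it.endsWith('.amb') }
    amb ? amb[0..<(amb.length() - 4)] : null
}

assert indexPrefix(['bwa/genome.amb', 'bwa/genome.ann', 'bwa/genome.bwt']) == 'bwa/genome'
```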
+ 'https://depot.galaxyproject.org/singularity/mulled-v2-fe8faa35dbf6dc65a0f7f5d4ea12e31a79f73e40:219b6c272b25e7e642ae3ff0bf0c5c81a5135ab4-0' : + 'biocontainers/mulled-v2-fe8faa35dbf6dc65a0f7f5d4ea12e31a79f73e40:219b6c272b25e7e642ae3ff0bf0c5c81a5135ab4-0' }" + + input: + tuple val(meta), path(reads) + tuple val(meta2), path(index) + val sort_bam + + output: + tuple val(meta), path("*.bam"), emit: bam + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def samtools_command = sort_bam ? 'sort' : 'view' + """ + INDEX=`find -L ./ -name "*.amb" | sed 's/\\.amb\$//'` + + bwa mem \\ + $args \\ + -t $task.cpus \\ + \$INDEX \\ + $reads \\ + | samtools $samtools_command $args2 --threads $task.cpus -o ${prefix}.bam - + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bwa: \$(echo \$(bwa 2>&1) | sed 's/^.*Version: //; s/Contact:.*\$//') + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.bam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bwa: \$(echo \$(bwa 2>&1) | sed 's/^.*Version: //; s/Contact:.*\$//') + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/bwa/mem/meta.yml b/modules/nf-core/bwa/mem/meta.yml new file mode 100644 index 0000000000..440fb1f9cf --- /dev/null +++ b/modules/nf-core/bwa/mem/meta.yml @@ -0,0 +1,58 @@ +name: bwa_mem +description: Performs fastq alignment to a fasta reference using BWA +keywords: + - mem + - bwa + - alignment + - map + - fastq + - bam + - sam +tools: + - bwa: + description: | + BWA is a software package for mapping DNA sequences against + a large reference genome, such as the human genome. + homepage: http://bio-bwa.sourceforge.net/ + documentation: http://www.htslib.org/doc/samtools.html + arxiv: arXiv:1303.3997 + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - meta2: + type: map + description: | + Groovy Map containing reference information. + e.g. 
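A hedged end-to-end sketch wiring BWA_INDEX into BWA_MEM as documented above; sample names and file paths are illustrative, and sort_bam = true selects samtools sort over samtools view:

```
include { BWA_INDEX } from './modules/nf-core/bwa/index/main'
include { BWA_MEM   } from './modules/nf-core/bwa/mem/main'

// Hypothetical wiring: build the index once, then align paired-end
// reads and coordinate-sort the resulting BAM.
workflow {
    fasta = Channel.value([ [id:'genome'], file('genome.fasta') ])
    reads = Channel.of([
        [id:'sample1', single_end:false],
        [ file('sample1_R1.fastq.gz'), file('sample1_R2.fastq.gz') ]
    ])
    BWA_INDEX(fasta)
    BWA_MEM(reads, BWA_INDEX.out.index, true)
}
```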
[ id:'test', single_end:false ] + - index: + type: file + description: BWA genome index files + pattern: "Directory containing BWA index *.{amb,ann,bwt,pac,sa}" + - sort_bam: + type: boolean + description: use samtools sort (true) or samtools view (false) + pattern: "true or false" +output: + - bam: + type: file + description: Output BAM file containing read alignments + pattern: "*.{bam}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@jeremy1805" +maintainers: + - "@drpatelh" + - "@jeremy1805" diff --git a/modules/nf-core/bwa/mem/tests/main.nf.test b/modules/nf-core/bwa/mem/tests/main.nf.test new file mode 100644 index 0000000000..b199bb70f6 --- /dev/null +++ b/modules/nf-core/bwa/mem/tests/main.nf.test @@ -0,0 +1,172 @@ +nextflow_process { + + name "Test Process BWA_MEM" + tag "modules_nfcore" + tag "modules" + tag "bwa" + tag "bwa/mem" + script "../main.nf" + process "BWA_MEM" + + test("Single-End") { + + setup { + run("BWA_INDEX") { + script "../../index/main.nf" + process { + """ + input[0] = [ + [id: 'test'], + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + } + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) + ] + ] + input[1] = BWA_INDEX.out.index + input[2] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Single-End Sort") { + + setup { + run("BWA_INDEX") { + script "../../index/main.nf" + process { + """ + input[0] = [ + [id: 'test'], + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + } + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) + ] + ] + input[1] = BWA_INDEX.out.index + input[2] = true + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Paired-End") { + + setup { + run("BWA_INDEX") { + script "../../index/main.nf" + process { + """ + input[0] = [ + [id: 'test'], + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + } + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) + ] + ] + input[1] = BWA_INDEX.out.index + input[2] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("Paired-End Sort") { + + setup { + run("BWA_INDEX") { + script "../../index/main.nf" + process { + """ + input[0] = [ + [id: 'test'], + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + } + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) + ] + ] + input[1] = BWA_INDEX.out.index + input[2] = true + """ + } + } + + then { + assertAll( + { assert 
process.success }, + { assert snapshot(process.out).match() } + ) + } + + } +} diff --git a/modules/nf-core/bwa/mem/tests/main.nf.test.snap b/modules/nf-core/bwa/mem/tests/main.nf.test.snap new file mode 100644 index 0000000000..ea3bfed4fd --- /dev/null +++ b/modules/nf-core/bwa/mem/tests/main.nf.test.snap @@ -0,0 +1,126 @@ +{ + "Single-End": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.bam:md5,3d43027d4163ada97911b814001511e5" + ] + ], + "1": [ + "versions.yml:md5,809f4a8c7f0c8497a9099dab9d6cc71e" + ], + "bam": [ + [ + { + "id": "test", + "single_end": true + }, + "test.bam:md5,3d43027d4163ada97911b814001511e5" + ] + ], + "versions": [ + "versions.yml:md5,809f4a8c7f0c8497a9099dab9d6cc71e" + ] + } + ], + "timestamp": "2023-10-18T11:02:55.420631681" + }, + "Single-End Sort": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.bam:md5,61eac1213d2bf5e88e225e545010e9b8" + ] + ], + "1": [ + "versions.yml:md5,809f4a8c7f0c8497a9099dab9d6cc71e" + ], + "bam": [ + [ + { + "id": "test", + "single_end": true + }, + "test.bam:md5,61eac1213d2bf5e88e225e545010e9b8" + ] + ], + "versions": [ + "versions.yml:md5,809f4a8c7f0c8497a9099dab9d6cc71e" + ] + } + ], + "timestamp": "2023-10-18T11:03:02.646869498" + }, + "Paired-End": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.bam:md5,809ccfe4300fa5005a9d0d4dc09b1a36" + ] + ], + "1": [ + "versions.yml:md5,809f4a8c7f0c8497a9099dab9d6cc71e" + ], + "bam": [ + [ + { + "id": "test", + "single_end": false + }, + "test.bam:md5,809ccfe4300fa5005a9d0d4dc09b1a36" + ] + ], + "versions": [ + "versions.yml:md5,809f4a8c7f0c8497a9099dab9d6cc71e" + ] + } + ], + "timestamp": "2023-10-18T11:03:09.793041294" + }, + "Paired-End Sort": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.bam:md5,2622f4380f992c505af7dab8c256313f" + ] + ], + "1": [ + "versions.yml:md5,809f4a8c7f0c8497a9099dab9d6cc71e" + ], + "bam": [ + [ + { + "id": "test", + "single_end": false + }, + "test.bam:md5,2622f4380f992c505af7dab8c256313f" + ] + ], + "versions": [ + "versions.yml:md5,809f4a8c7f0c8497a9099dab9d6cc71e" + ] + } + ], + "timestamp": "2023-10-18T11:04:43.662093286" + } +} \ No newline at end of file diff --git a/modules/nf-core/bwa/mem/tests/tags.yml b/modules/nf-core/bwa/mem/tests/tags.yml new file mode 100644 index 0000000000..82992d1f0b --- /dev/null +++ b/modules/nf-core/bwa/mem/tests/tags.yml @@ -0,0 +1,3 @@ +bwa/mem: + - modules/nf-core/bwa/index/** + - modules/nf-core/bwa/mem/** diff --git a/modules/nf-core/bwamem2/index/environment.yml b/modules/nf-core/bwamem2/index/environment.yml new file mode 100644 index 0000000000..26b439172a --- /dev/null +++ b/modules/nf-core/bwamem2/index/environment.yml @@ -0,0 +1,7 @@ +name: bwamem2_index +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bwa-mem2=2.2.1 diff --git a/modules/nf-core/bwamem2/index/main.nf b/modules/nf-core/bwamem2/index/main.nf new file mode 100644 index 0000000000..b7688285d7 --- /dev/null +++ b/modules/nf-core/bwamem2/index/main.nf @@ -0,0 +1,52 @@ +process BWAMEM2_INDEX { + tag "$fasta" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/bwa-mem2:2.2.1--he513fc3_0' : + 'biocontainers/bwa-mem2:2.2.1--he513fc3_0' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path("bwamem2"), emit: index + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${fasta}" + def args = task.ext.args ?: '' + """ + mkdir bwamem2 + bwa-mem2 \\ + index \\ + $args \\ + $fasta -p bwamem2/${prefix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bwamem2: \$(echo \$(bwa-mem2 version 2>&1) | sed 's/.* //') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${fasta}" + + """ + mkdir bwamem2 + touch bwamem2/${prefix}.0123 + touch bwamem2/${prefix}.ann + touch bwamem2/${prefix}.pac + touch bwamem2/${prefix}.amb + touch bwamem2/${prefix}.bwt.2bit.64 + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bwamem2: \$(echo \$(bwa-mem2 version 2>&1) | sed 's/.* //') + END_VERSIONS + """ +} diff --git a/modules/nf-core/bwamem2/index/meta.yml b/modules/nf-core/bwamem2/index/meta.yml new file mode 100644 index 0000000000..c14a109252 --- /dev/null +++ b/modules/nf-core/bwamem2/index/meta.yml @@ -0,0 +1,42 @@ +name: bwamem2_index +description: Create BWA-mem2 index for reference genome +keywords: + - index + - fasta + - genome + - reference +tools: + - bwamem2: + description: | + BWA-mem2 is a software package for mapping DNA sequences against + a large reference genome, such as the human genome. + homepage: https://github.com/bwa-mem2/bwa-mem2 + documentation: https://github.com/bwa-mem2/bwa-mem2#usage + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: Input genome fasta file +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - index: + type: file + description: BWA genome index files + pattern: "*.{0123,amb,ann,bwt.2bit.64,pac}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@maxulysse" +maintainers: + - "@maxulysse" diff --git a/modules/nf-core/bwamem2/mem/environment.yml b/modules/nf-core/bwamem2/mem/environment.yml new file mode 100644 index 0000000000..67989071e3 --- /dev/null +++ b/modules/nf-core/bwamem2/mem/environment.yml @@ -0,0 +1,9 @@ +name: bwamem2_mem +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bwa-mem2=2.2.1 + # renovate: datasource=conda depName=bioconda/samtools + - samtools=1.16.1 diff --git a/modules/nf-core/bwamem2/mem/main.nf b/modules/nf-core/bwamem2/mem/main.nf new file mode 100644 index 0000000000..cbec7eef50 --- /dev/null +++ b/modules/nf-core/bwamem2/mem/main.nf @@ -0,0 +1,55 @@ +process BWAMEM2_MEM { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/mulled-v2-e5d375990341c5aef3c9aff74f96f66f65375ef6:2cdf6bf1e92acbeb9b2834b1c58754167173a410-0' : + 'biocontainers/mulled-v2-e5d375990341c5aef3c9aff74f96f66f65375ef6:2cdf6bf1e92acbeb9b2834b1c58754167173a410-0' }" + + input: + tuple val(meta), path(reads) + tuple val(meta2), path(index) + val sort_bam + + output: + tuple val(meta), path("*.bam"), emit: bam + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def samtools_command = sort_bam ? 'sort' : 'view' + """ + INDEX=`find -L ./ -name "*.amb" | sed 's/\\.amb\$//'` + + bwa-mem2 \\ + mem \\ + $args \\ + -t $task.cpus \\ + \$INDEX \\ + $reads \\ + | samtools $samtools_command $args2 -@ $task.cpus -o ${prefix}.bam - + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bwamem2: \$(echo \$(bwa-mem2 version 2>&1) | sed 's/.* //') + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.bam + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bwamem2: \$(echo \$(bwa-mem2 version 2>&1) | sed 's/.* //') + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/bwamem2/mem/meta.yml b/modules/nf-core/bwamem2/mem/meta.yml new file mode 100644 index 0000000000..04891b26a9 --- /dev/null +++ b/modules/nf-core/bwamem2/mem/meta.yml @@ -0,0 +1,61 @@ +name: bwamem2_mem +description: Performs fastq alignment to a fasta reference using BWA-mem2 +keywords: + - mem + - bwa + - alignment + - map + - fastq + - bam + - sam +tools: + - bwamem2: + description: | + BWA-mem2 is a software package for mapping DNA sequences against + a large reference genome, such as the human genome. + homepage: https://github.com/bwa-mem2/bwa-mem2 + documentation: https://github.com/bwa-mem2/bwa-mem2#usage + arxiv: arXiv:1303.3997 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - meta2: + type: map + description: | + Groovy Map containing reference/index information + e.g. [ id:'test' ] + - index: + type: file + description: BWA genome index files + pattern: "Directory containing BWA index *.{0123,amb,ann,bwt.2bit.64,pac}" + - sort_bam: + type: boolean + description: use samtools sort (true) or samtools view (false) + pattern: "true or false" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g.
[ id:'test', single_end:false ] + - bam: + type: file + description: Output BAM file containing read alignments + pattern: "*.{bam}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@maxulysse" +maintainers: + - "@maxulysse" diff --git a/modules/nf-core/cat/cat/environment.yml b/modules/nf-core/cat/cat/environment.yml new file mode 100644 index 0000000000..17a04ef232 --- /dev/null +++ b/modules/nf-core/cat/cat/environment.yml @@ -0,0 +1,7 @@ +name: cat_cat +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - conda-forge::pigz=2.3.4 diff --git a/modules/nf-core/cat/cat/main.nf b/modules/nf-core/cat/cat/main.nf new file mode 100644 index 0000000000..4264a92ccc --- /dev/null +++ b/modules/nf-core/cat/cat/main.nf @@ -0,0 +1,62 @@ +process CAT_CAT { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/pigz:2.3.4' : + 'biocontainers/pigz:2.3.4' }" + + input: + tuple val(meta), path(files_in) + + output: + tuple val(meta), path("${prefix}"), emit: file_out + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def file_list = files_in.collect { it.toString() } + + // | input | output | command1 | command2 | + // |-----------|------------|----------|----------| + // | gzipped | gzipped | cat | | + // | ungzipped | ungzipped | cat | | + // | gzipped | ungzipped | zcat | | + // | ungzipped | gzipped | cat | pigz | + + // Use input file ending as default + prefix = task.ext.prefix ?: "${meta.id}${file_list[0].substring(file_list[0].lastIndexOf('.'))}" + out_zip = prefix.endsWith('.gz') + in_zip = file_list[0].endsWith('.gz') + command1 = (in_zip && !out_zip) ? 'zcat' : 'cat' + command2 = (!in_zip && out_zip) ? "| pigz -c -p $task.cpus $args2" : '' + """ + $command1 \\ + $args \\ + ${file_list.join(' ')} \\ + $command2 \\ + > ${prefix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ + + stub: + def file_list = files_in.collect { it.toString() } + prefix = task.ext.prefix ?: "${meta.id}${file_list[0].substring(file_list[0].lastIndexOf('.'))}" + """ + touch $prefix + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/cat/cat/meta.yml b/modules/nf-core/cat/cat/meta.yml new file mode 100644 index 0000000000..00a8db0bca --- /dev/null +++ b/modules/nf-core/cat/cat/meta.yml @@ -0,0 +1,36 @@ +name: cat_cat +description: A module for concatenation of gzipped or uncompressed files +keywords: + - concatenate + - gzip + - cat +tools: + - cat: + description: Just concatenation + documentation: https://man7.org/linux/man-pages/man1/cat.1.html + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - files_in: + type: file + description: List of compressed / uncompressed files + pattern: "*" +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - file_out: + type: file + description: Concatenated file. 
Will be gzipped if file_out ends with ".gz" + pattern: "${file_out}" +authors: + - "@erikrikarddaniel" + - "@FriederikeHanssen" +maintainers: + - "@erikrikarddaniel" + - "@FriederikeHanssen" diff --git a/modules/nf-core/cat/cat/tests/main.nf.test b/modules/nf-core/cat/cat/tests/main.nf.test new file mode 100644 index 0000000000..5766daafbe --- /dev/null +++ b/modules/nf-core/cat/cat/tests/main.nf.test @@ -0,0 +1,153 @@ +nextflow_process { + + name "Test Process CAT_CAT" + script "../main.nf" + process "CAT_CAT" + tag "modules" + tag "modules_nfcore" + tag "cat" + tag "cat/cat" + + test("test_cat_unzipped_unzipped") { + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true), + file(params.test_data['sarscov2']['genome']['genome_sizes'], checkIfExists: true) + ] + ] + """ + } + } + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + + test("test_cat_zipped_zipped") { + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_gff3_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['genome']['contigs_genome_maf_gz'], checkIfExists: true) + ] + ] + """ + } + } + then { + def lines = path(process.out.file_out.get(0).get(1)).linesGzip + assertAll( + { assert process.success }, + { assert snapshot(lines[0..5]).match("test_cat_zipped_zipped_lines") }, + { assert snapshot(lines.size()).match("test_cat_zipped_zipped_size")} + ) + } + } + + test("test_cat_zipped_unzipped") { + config './nextflow_zipped_unzipped.config' + + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_gff3_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['genome']['contigs_genome_maf_gz'], checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("test_cat_unzipped_zipped") { + config './nextflow_unzipped_zipped.config' + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true), + file(params.test_data['sarscov2']['genome']['genome_sizes'], checkIfExists: true) + ] + ] + """ + } + } + then { + def lines = path(process.out.file_out.get(0).get(1)).linesGzip + assertAll( + { assert process.success }, + { assert snapshot(lines[0..5]).match("test_cat_unzipped_zipped_lines") }, + { assert snapshot(lines.size()).match("test_cat_unzipped_zipped_size")} + ) + } + } + + test("test_cat_one_file_unzipped_zipped") { + config './nextflow_unzipped_zipped.config' + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + ] + """ + } + } + then { + def lines = path(process.out.file_out.get(0).get(1)).linesGzip + assertAll( + { assert process.success }, + { assert snapshot(lines[0..5]).match("test_cat_one_file_unzipped_zipped_lines") }, + { assert snapshot(lines.size()).match("test_cat_one_file_unzipped_zipped_size")} + ) + } + } +} + diff --git 
a/modules/nf-core/cat/cat/tests/main.nf.test.snap b/modules/nf-core/cat/cat/tests/main.nf.test.snap new file mode 100644 index 0000000000..423571ba27 --- /dev/null +++ b/modules/nf-core/cat/cat/tests/main.nf.test.snap @@ -0,0 +1,121 @@ +{ + "test_cat_unzipped_zipped_size": { + "content": [ + 375 + ], + "timestamp": "2023-10-16T14:33:08.049445686" + }, + "test_cat_unzipped_unzipped": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.fasta:md5,f44b33a0e441ad58b2d3700270e2dbe2" + ] + ], + "1": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ], + "file_out": [ + [ + { + "id": "test", + "single_end": true + }, + "test.fasta:md5,f44b33a0e441ad58b2d3700270e2dbe2" + ] + ], + "versions": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ] + } + ], + "timestamp": "2023-10-16T14:32:18.500464399" + }, + "test_cat_zipped_unzipped": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "cat.txt:md5,c439d3b60e7bc03e8802a451a0d9a5d9" + ] + ], + "1": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ], + "file_out": [ + [ + { + "id": "test", + "single_end": true + }, + "cat.txt:md5,c439d3b60e7bc03e8802a451a0d9a5d9" + ] + ], + "versions": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ] + } + ], + "timestamp": "2023-10-16T14:32:49.642741302" + }, + "test_cat_zipped_zipped_lines": { + "content": [ + [ + "MT192765.1\tGenbank\ttranscript\t259\t29667\t.\t+\t.\tID=unknown_transcript_1;geneID=orf1ab;gene_name=orf1ab", + "MT192765.1\tGenbank\tgene\t259\t21548\t.\t+\t.\tParent=unknown_transcript_1", + "MT192765.1\tGenbank\tCDS\t259\t13461\t.\t+\t0\tParent=unknown_transcript_1;exception=\"ribosomal slippage\";gbkey=CDS;gene=orf1ab;note=\"pp1ab;translated=by -1 ribosomal frameshift\";product=\"orf1ab polyprotein\";protein_id=QIK50426.1", + "MT192765.1\tGenbank\tCDS\t13461\t21548\t.\t+\t0\tParent=unknown_transcript_1;exception=\"ribosomal slippage\";gbkey=CDS;gene=orf1ab;note=\"pp1ab;translated=by -1 ribosomal frameshift\";product=\"orf1ab polyprotein\";protein_id=QIK50426.1", + "MT192765.1\tGenbank\tCDS\t21556\t25377\t.\t+\t0\tParent=unknown_transcript_1;gbkey=CDS;gene=S;note=\"structural protein\";product=\"surface glycoprotein\";protein_id=QIK50427.1", + "MT192765.1\tGenbank\tgene\t21556\t25377\t.\t+\t.\tParent=unknown_transcript_1" + ] + ], + "timestamp": "2023-10-16T14:32:33.629048645" + }, + "test_cat_unzipped_zipped_lines": { + "content": [ + [ + ">MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/PC00101P/2020, complete genome", + "GTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGT", + "GTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAG", + "TAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTTGTCCGG", + "GTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTT", + "ACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAG" + ] + ], + "timestamp": "2023-10-16T14:33:08.038830506" + }, + "test_cat_one_file_unzipped_zipped_lines": { + "content": [ + [ + ">MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/PC00101P/2020, complete genome", + "GTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGT", + "GTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAG", + "TAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTTGTCCGG", + 
"GTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTT", + "ACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAG" + ] + ], + "timestamp": "2023-10-16T14:33:21.39642399" + }, + "test_cat_zipped_zipped_size": { + "content": [ + 78 + ], + "timestamp": "2023-10-16T14:32:33.641869244" + }, + "test_cat_one_file_unzipped_zipped_size": { + "content": [ + 374 + ], + "timestamp": "2023-10-16T14:33:21.4094373" + } +} \ No newline at end of file diff --git a/modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config b/modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config new file mode 100644 index 0000000000..ec26b0fdc6 --- /dev/null +++ b/modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config @@ -0,0 +1,6 @@ + +process { + withName: CAT_CAT { + ext.prefix = 'cat.txt.gz' + } +} diff --git a/modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config b/modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config new file mode 100644 index 0000000000..fbc79783d5 --- /dev/null +++ b/modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config @@ -0,0 +1,8 @@ + +process { + + withName: CAT_CAT { + ext.prefix = 'cat.txt' + } + +} diff --git a/modules/nf-core/cat/cat/tests/tags.yml b/modules/nf-core/cat/cat/tests/tags.yml new file mode 100644 index 0000000000..37b578f523 --- /dev/null +++ b/modules/nf-core/cat/cat/tests/tags.yml @@ -0,0 +1,2 @@ +cat/cat: + - modules/nf-core/cat/cat/** diff --git a/modules/nf-core/cat/fastq/environment.yml b/modules/nf-core/cat/fastq/environment.yml new file mode 100644 index 0000000000..bff93add01 --- /dev/null +++ b/modules/nf-core/cat/fastq/environment.yml @@ -0,0 +1,7 @@ +name: cat_fastq +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - conda-forge::sed=4.7 diff --git a/modules/nf-core/cat/fastq/main.nf b/modules/nf-core/cat/fastq/main.nf new file mode 100644 index 0000000000..3d963784cb --- /dev/null +++ b/modules/nf-core/cat/fastq/main.nf @@ -0,0 +1,80 @@ +process CAT_FASTQ { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + tuple val(meta), path(reads, stageAs: "input*/*") + + output: + tuple val(meta), path("*.merged.fastq.gz"), emit: reads + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def readList = reads instanceof List ? reads.collect{ it.toString() } : [reads.toString()] + if (meta.single_end) { + if (readList.size >= 1) { + """ + cat ${readList.join(' ')} > ${prefix}.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } else { + if (readList.size >= 2) { + def read1 = [] + def read2 = [] + readList.eachWithIndex{ v, ix -> ( ix & 1 ? read2 : read1 ) << v } + """ + cat ${read1.join(' ')} > ${prefix}_1.merged.fastq.gz + cat ${read2.join(' ')} > ${prefix}_2.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + def readList = reads instanceof List ? 
reads.collect{ it.toString() } : [reads.toString()] + if (meta.single_end) { + if (readList.size >= 1) { + """ + touch ${prefix}.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } else { + if (readList.size >= 2) { + """ + touch ${prefix}_1.merged.fastq.gz + touch ${prefix}_2.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } + +} diff --git a/modules/nf-core/cat/fastq/meta.yml b/modules/nf-core/cat/fastq/meta.yml new file mode 100644 index 0000000000..db4ac3c79a --- /dev/null +++ b/modules/nf-core/cat/fastq/meta.yml @@ -0,0 +1,42 @@ +name: cat_fastq +description: Concatenates fastq files +keywords: + - cat + - fastq + - concatenate +tools: + - cat: + description: | + The cat utility reads files sequentially, writing them to the standard output. + documentation: https://www.gnu.org/software/coreutils/manual/html_node/cat-invocation.html + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files to be concatenated. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: Merged fastq file + pattern: "*.{merged.fastq.gz}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" +maintainers: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/cat/fastq/tests/main.nf.test b/modules/nf-core/cat/fastq/tests/main.nf.test new file mode 100644 index 0000000000..f5f941825c --- /dev/null +++ b/modules/nf-core/cat/fastq/tests/main.nf.test @@ -0,0 +1,143 @@ +nextflow_process { + + name "Test Process CAT_FASTQ" + script "../main.nf" + process "CAT_FASTQ" + tag "modules" + tag "modules_nfcore" + tag "cat" + tag "cat/fastq" + + test("test_cat_fastq_single_end") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test2_1_fastq_gz'], checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.reads).match() }, + { assert path(process.out.versions.get(0)).getText().contains("cat") } + ) + } + } + + test("test_cat_fastq_paired_end") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test2_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test2_2_fastq_gz'], checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.reads).match() }, + { assert path(process.out.versions.get(0)).getText().contains("cat") } + ) + } + } + + test("test_cat_fastq_single_end_same_name") { + + when { + params
{ + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.reads).match() }, + { assert path(process.out.versions.get(0)).getText().contains("cat") } + ) + } + } + + test("test_cat_fastq_paired_end_same_name") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.reads).match() }, + { assert path(process.out.versions.get(0)).getText().contains("cat") } + ) + } + } + + test("test_cat_fastq_single_end_single_file") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true)] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.reads).match() }, + { assert path(process.out.versions.get(0)).getText().contains("cat") } + ) + } + } +} diff --git a/modules/nf-core/cat/fastq/tests/main.nf.test.snap b/modules/nf-core/cat/fastq/tests/main.nf.test.snap new file mode 100644 index 0000000000..ec2342e549 --- /dev/null +++ b/modules/nf-core/cat/fastq/tests/main.nf.test.snap @@ -0,0 +1,78 @@ +{ + "test_cat_fastq_single_end": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.merged.fastq.gz:md5,f9cf5e375f7de81a406144a2c70cc64d" + ] + ] + ], + "timestamp": "2023-10-17T23:19:12.990284837" + }, + "test_cat_fastq_single_end_same_name": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.merged.fastq.gz:md5,63f817db7a29a03eb538104495556f66" + ] + ] + ], + "timestamp": "2023-10-17T23:19:31.554568147" + }, + "test_cat_fastq_single_end_single_file": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.merged.fastq.gz:md5,e325ef7deb4023447a1f074e285761af" + ] + ] + ], + "timestamp": "2023-10-17T23:19:49.629360033" + }, + "test_cat_fastq_paired_end_same_name": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.merged.fastq.gz:md5,63f817db7a29a03eb538104495556f66", + "test_2.merged.fastq.gz:md5,fe9f266f43a6fc3dcab690a18419a56e" + ] + ] + ] + ], + "timestamp": "2023-10-17T23:19:40.711617539" + }, + "test_cat_fastq_paired_end": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.merged.fastq.gz:md5,f9cf5e375f7de81a406144a2c70cc64d", + "test_2.merged.fastq.gz:md5,77c8e966e130d8c6b6ec9be52fcb2bda" + ] + ] + ] + ], + "timestamp": "2023-10-18T07:53:20.923560211" + } +} \ No newline at end of file diff --git a/modules/nf-core/cat/fastq/tests/tags.yml b/modules/nf-core/cat/fastq/tests/tags.yml new file mode 100644 index 0000000000..6ac4361405 --- /dev/null +++ b/modules/nf-core/cat/fastq/tests/tags.yml 
@@ -0,0 +1,2 @@ +cat/fastq: + - modules/nf-core/cat/fastq/** diff --git a/modules/nf-core/cnvkit/antitarget/environment.yml b/modules/nf-core/cnvkit/antitarget/environment.yml new file mode 100644 index 0000000000..08a0b27cea --- /dev/null +++ b/modules/nf-core/cnvkit/antitarget/environment.yml @@ -0,0 +1,8 @@ +name: cnvkit_antitarget +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::cnvkit=0.9.10 + - bioconda::samtools=1.17 diff --git a/modules/nf-core/cnvkit/antitarget/main.nf b/modules/nf-core/cnvkit/antitarget/main.nf new file mode 100644 index 0000000000..795145451b --- /dev/null +++ b/modules/nf-core/cnvkit/antitarget/main.nf @@ -0,0 +1,36 @@ +process CNVKIT_ANTITARGET { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/cnvkit:0.9.10--pyhdfd78af_0': + 'biocontainers/cnvkit:0.9.10--pyhdfd78af_0' }" + + input: + tuple val(meta), path(targets) + + output: + tuple val(meta), path("*.bed"), emit: bed + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + cnvkit.py \\ + antitarget \\ + $targets \\ + --output ${prefix}.antitarget.bed \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cnvkit: \$(cnvkit.py version | sed -e "s/cnvkit v//g") + END_VERSIONS + """ +} diff --git a/modules/nf-core/cnvkit/antitarget/meta.yml b/modules/nf-core/cnvkit/antitarget/meta.yml new file mode 100644 index 0000000000..d879092d33 --- /dev/null +++ b/modules/nf-core/cnvkit/antitarget/meta.yml @@ -0,0 +1,49 @@ +name: cnvkit_antitarget +description: Derive off-target (“antitarget”) bins from target regions. +keywords: + - cnvkit + - antitarget + - cnv + - copy number +tools: + - cnvkit: + description: | + CNVkit is a Python library and command-line software toolkit to infer and visualize copy number from high-throughput DNA sequencing data. + It is designed for use with hybrid capture, including both whole-exome and custom target panels, and short-read sequencing platforms such as Illumina and Ion Torrent. + homepage: https://cnvkit.readthedocs.io/en/stable/index.html + documentation: https://cnvkit.readthedocs.io/en/stable/index.html + tool_dev_url: "https://github.com/etal/cnvkit" + doi: 10.1371/journal.pcbi.1004873 + licence: ["Apache-2.0"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - targets: + type: file + description: File containing genomic regions + pattern: "*.{bed}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g.
[ id:'test', single_end:false ] + - bed: + type: file + description: File containing off-target regions + pattern: "*.{bed}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@adamrtalbot" + - "@priesgo" + - "@SusiJo" +maintainers: + - "@adamrtalbot" + - "@priesgo" + - "@SusiJo" diff --git a/modules/nf-core/cnvkit/batch/environment.yml b/modules/nf-core/cnvkit/batch/environment.yml new file mode 100644 index 0000000000..eb9ed375b7 --- /dev/null +++ b/modules/nf-core/cnvkit/batch/environment.yml @@ -0,0 +1,8 @@ +name: cnvkit_batch +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::cnvkit=0.9.10 + - bioconda::samtools=1.17 diff --git a/modules/nf-core/cnvkit/batch/main.nf b/modules/nf-core/cnvkit/batch/main.nf new file mode 100644 index 0000000000..3ccc9faa44 --- /dev/null +++ b/modules/nf-core/cnvkit/batch/main.nf @@ -0,0 +1,105 @@ +process CNVKIT_BATCH { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-780d630a9bb6a0ff2e7b6f730906fd703e40e98f:c94363856059151a2974dc501fb07a0360cc60a3-0' : + 'biocontainers/mulled-v2-780d630a9bb6a0ff2e7b6f730906fd703e40e98f:c94363856059151a2974dc501fb07a0360cc60a3-0' }" + + input: + tuple val(meta), path(tumor), path(normal) + tuple val(meta2), path(fasta) + tuple val(meta3), path(fasta_fai) + tuple val(meta4), path(targets) + tuple val(meta5), path(reference) + val panel_of_normals + + output: + tuple val(meta), path("*.bed"), emit: bed + tuple val(meta), path("*.cnn"), emit: cnn, optional: true + tuple val(meta), path("*.cnr"), emit: cnr, optional: true + tuple val(meta), path("*.cns"), emit: cns, optional: true + tuple val(meta), path("*.pdf"), emit: pdf, optional: true + tuple val(meta), path("*.png"), emit: png, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + + def tumor_exists = tumor ? true : false + def normal_exists = normal ? true : false + + // execute samtools only when cram files are input, cnvkit runs natively on bam but is prohibitively slow + def tumor_cram = tumor_exists && tumor.Extension == "cram" ? true : false + def normal_cram = normal_exists && normal.Extension == "cram" ? true : false + def tumor_bam = tumor_exists && tumor.Extension == "bam" ? true : false + def normal_bam = normal_exists && normal.Extension == "bam" ? true : false + + def tumor_out = tumor_cram ? tumor.BaseName + ".bam" : "${tumor}" + + // tumor_only mode does not need fasta & target + // instead it requires a pre-computed reference.cnn which is built from fasta & target + def (normal_out, normal_args, fasta_args) = ["", "", ""] + def fai_reference = fasta_fai ? "--fai-reference ${fasta_fai}" : "" + + if (normal_exists){ + def normal_prefix = normal.BaseName + normal_out = normal_cram ? "${normal_prefix}" + ".bam" : "${normal}" + fasta_args = fasta ? "--fasta $fasta" : "" + + // germline mode + // normal samples must be input without a flag + // requires flag --normal to be empty [] + if(!tumor_exists){ + tumor_out = "${normal_prefix}" + ".bam" + normal_args = "--normal " + } + // somatic mode + else { + normal_args = normal_prefix ? "--normal $normal_out" : "" + } + } + + // generation of panel of normals + def generate_pon = panel_of_normals ? 
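+ // illustrative sketch of the resulting cnvkit.py batch calls per mode (file names are placeholders, not module outputs): + // somatic: cnvkit.py batch tumor.bam --normal normal.bam --fasta genome.fasta --targets targets.bed + // germline: cnvkit.py batch normal.bam --normal --fasta genome.fasta --targets targets.bed + // tumor-only: cnvkit.py batch tumor.bam --reference reference.cnn + // panel-of-normals: cnvkit.py batch --normal normal1.bam normal2.bam --fasta genome.fasta --targets targets.bed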
true : false + + if (generate_pon && !tumor_exists){ + def pon_input = normal.join(' ') + normal_args = "--normal $pon_input" + tumor_out = "" + } + + def target_args = targets ? "--targets $targets" : "" + def reference_args = reference ? "--reference $reference" : "" + + def samtools_cram_convert = '' + samtools_cram_convert += normal_cram ? " samtools view -T $fasta $fai_reference $normal -@ $task.cpus -o $normal_out\n" : '' + samtools_cram_convert += normal_cram ? " samtools index $normal_out\n" : '' + samtools_cram_convert += tumor_cram ? " samtools view -T $fasta $fai_reference $tumor -@ $task.cpus -o $tumor_out\n" : '' + samtools_cram_convert += tumor_cram ? " samtools index $tumor_out\n" : '' + def versions = normal_cram || tumor_cram ? + "samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')\n cnvkit: \$(cnvkit.py version | sed -e 's/cnvkit v//g')" : + "cnvkit: \$(cnvkit.py version | sed -e 's/cnvkit v//g')" + """ + $samtools_cram_convert + + cnvkit.py \\ + batch \\ + $tumor_out \\ + $normal_args \\ + $fasta_args \\ + $reference_args \\ + $target_args \\ + --processes $task.cpus \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + ${versions} + END_VERSIONS + """ +} diff --git a/modules/nf-core/cnvkit/batch/meta.yml b/modules/nf-core/cnvkit/batch/meta.yml new file mode 100644 index 0000000000..4f88ba3f9b --- /dev/null +++ b/modules/nf-core/cnvkit/batch/meta.yml @@ -0,0 +1,118 @@ +name: cnvkit_batch +description: Copy number variant detection from high-throughput sequencing data +keywords: + - cnvkit + - bam + - fasta + - copy number +tools: + - cnvkit: + description: | + CNVkit is a Python library and command-line software toolkit to infer and visualize copy number from high-throughput DNA sequencing data. It is designed for use with hybrid capture, including both whole-exome and custom target panels, and short-read sequencing platforms such as Illumina and Ion Torrent. + homepage: https://cnvkit.readthedocs.io/en/stable/index.html + documentation: https://cnvkit.readthedocs.io/en/stable/index.html + licence: ["Apache-2.0"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - tumour: + type: file + description: | + Input tumour sample bam file (or cram) + - normal: + type: file + description: | + Input normal sample bam file (or cram) + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - fasta: + type: file + description: | + Input reference genome fasta file (only needed for cram_input and/or when normal_samples are provided) + - meta3: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - fasta_fai: + type: file + description: | + Input reference genome fasta index (optional, but recommended for cram_input) + - meta4: + type: map + description: | + Groovy Map containing information about target file + e.g. [ id:'test' ] + - targetfile: + type: file + description: | + Input target bed file + - meta5: + type: map + description: | + Groovy Map containing information about reference file + e.g. [ id:'test' ] + - reference: + type: file + description: | + Input reference cnn-file (only for germline and tumor-only running) +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - bed: + type: file + description: File containing genomic regions + pattern: "*.{bed}" + - cnn: + type: file + description: File containing coverage information + pattern: "*.{cnn}" + - cnr: + type: file + description: File containing copy number ratio information + pattern: "*.{cnr}" + - cns: + type: file + description: File containing copy number segment information + pattern: "*.{cns}" + - pdf: + type: file + description: File with plot of copy numbers or segments on chromosomes + pattern: "*.{pdf}" + - png: + type: file + description: File with plot of bin-level log2 coverages and segmentation calls + pattern: "*.{png}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@adamrtalbot" + - "@drpatelh" + - "@fbdtemme" + - "@kaurravneet4123" + - "@KevinMenden" + - "@lassefolkersen" + - "@MaxUlysse" + - "@priesgo" + - "@SusiJo" +maintainers: + - "@adamrtalbot" + - "@drpatelh" + - "@fbdtemme" + - "@kaurravneet4123" + - "@KevinMenden" + - "@lassefolkersen" + - "@MaxUlysse" + - "@priesgo" + - "@SusiJo" diff --git a/modules/nf-core/cnvkit/genemetrics/environment.yml b/modules/nf-core/cnvkit/genemetrics/environment.yml new file mode 100644 index 0000000000..2d00f762e2 --- /dev/null +++ b/modules/nf-core/cnvkit/genemetrics/environment.yml @@ -0,0 +1,8 @@ +name: cnvkit_genemetrics +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::cnvkit=0.9.10 + - bioconda::samtools=1.17 diff --git a/modules/nf-core/cnvkit/genemetrics/main.nf b/modules/nf-core/cnvkit/genemetrics/main.nf new file mode 100755 index 0000000000..825b12bdac --- /dev/null +++ b/modules/nf-core/cnvkit/genemetrics/main.nf @@ -0,0 +1,39 @@ +process CNVKIT_GENEMETRICS { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/cnvkit:0.9.10--pyhdfd78af_0': + 'biocontainers/cnvkit:0.9.10--pyhdfd78af_0' }" + + input: + tuple val(meta), path(cnr), path(cns) + + output: + tuple val(meta), path("*.tsv"), emit: tsv + //tuple val(meta), path("*.cnn"), emit: cnn + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def segments = cns ? "--segment ${cns}" : "" + + """ + cnvkit.py \\ + genemetrics \\ + $cnr \\ + $segments \\ + --output ${prefix}.tsv \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cnvkit: \$(cnvkit.py version | sed -e "s/cnvkit v//g") + END_VERSIONS + """ +} diff --git a/modules/nf-core/cnvkit/genemetrics/meta.yml b/modules/nf-core/cnvkit/genemetrics/meta.yml new file mode 100755 index 0000000000..4bef28c7d5 --- /dev/null +++ b/modules/nf-core/cnvkit/genemetrics/meta.yml @@ -0,0 +1,50 @@ +name: cnvkit_genemetrics +description: Copy number variant detection from high-throughput sequencing data +keywords: + - cnvkit + - bam + - fasta + - copy number +tools: + - cnvkit: + description: | + CNVkit is a Python library and command-line software toolkit to infer and visualize copy number from high-throughput DNA sequencing data. It is designed for use with hybrid capture, including both whole-exome and custom target panels, and short-read sequencing platforms such as Illumina and Ion Torrent. 
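+ In this module, genemetrics reads the input .cnr file (and the optional .cns segments) and writes a per-gene, tab-separated table (.tsv).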
+ homepage: https://cnvkit.readthedocs.io/en/stable/index.html + documentation: https://cnvkit.readthedocs.io/en/stable/index.html + licence: ["Apache-2.0"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - cnr: + type: file + description: CNR file + pattern: "*.cnr" + - cns: + type: file + description: CNS file [Optional] + pattern: "*.cns" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - tsv: + type: file + description: TSV file + pattern: "*.tsv" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@adamrtalbot" + - "@marrip" + - "@priesgo" +maintainers: + - "@adamrtalbot" + - "@marrip" + - "@priesgo" diff --git a/modules/nf-core/cnvkit/reference/environment.yml b/modules/nf-core/cnvkit/reference/environment.yml new file mode 100644 index 0000000000..6bb62f9054 --- /dev/null +++ b/modules/nf-core/cnvkit/reference/environment.yml @@ -0,0 +1,8 @@ +name: cnvkit_reference +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::cnvkit=0.9.10 + - bioconda::samtools=1.17 diff --git a/modules/nf-core/cnvkit/reference/main.nf b/modules/nf-core/cnvkit/reference/main.nf new file mode 100644 index 0000000000..0e0b20a01e --- /dev/null +++ b/modules/nf-core/cnvkit/reference/main.nf @@ -0,0 +1,40 @@ +process CNVKIT_REFERENCE { + tag "$fasta" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/cnvkit:0.9.10--pyhdfd78af_0': + 'biocontainers/cnvkit:0.9.10--pyhdfd78af_0' }" + + input: + path fasta + path targets + path antitargets + + output: + path "*.cnn" , emit: cnn + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: targets.BaseName + + """ + cnvkit.py \\ + reference \\ + --fasta $fasta \\ + --targets $targets \\ + --antitargets $antitargets \\ + --output ${prefix}.reference.cnn \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cnvkit: \$(cnvkit.py version | sed -e "s/cnvkit v//g") + END_VERSIONS + """ +} diff --git a/modules/nf-core/cnvkit/reference/meta.yml b/modules/nf-core/cnvkit/reference/meta.yml new file mode 100644 index 0000000000..8c561c9953 --- /dev/null +++ b/modules/nf-core/cnvkit/reference/meta.yml @@ -0,0 +1,52 @@ +name: cnvkit_reference +description: Compile a coverage reference from the given files (normal samples). +keywords: + - cnvkit + - reference + - cnv + - copy number +tools: + - cnvkit: + description: | + CNVkit is a Python library and command-line software toolkit to infer and visualize copy number from high-throughput DNA sequencing data. + It is designed for use with hybrid capture, including both whole-exome and custom target panels, and short-read sequencing platforms such as Illumina and Ion Torrent.
+ homepage: https://cnvkit.readthedocs.io/en/stable/index.html + documentation: https://cnvkit.readthedocs.io/en/stable/index.html + tool_dev_url: https://github.com/etal/cnvkit + doi: 10.1371/journal.pcbi.1004873 + licence: ["Apache-2.0"] +input: + - fasta: + type: file + description: File containing reference genome + pattern: "*.{fasta}" + - targets: + type: file + description: File containing genomic regions + pattern: "*.{bed}" + - antitargets: + type: file + description: File containing off-target genomic regions + pattern: "*.{bed}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reference: + type: file + description: File containing a copy-number reference (required for CNV calling in tumor_only mode) + pattern: "*.{cnn}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@adamrtalbot" + - "@priesgo" + - "@SusiJo" +maintainers: + - "@adamrtalbot" + - "@priesgo" + - "@SusiJo" diff --git a/modules/nf-core/controlfreec/assesssignificance/controlfreec-assesssignificance.diff b/modules/nf-core/controlfreec/assesssignificance/controlfreec-assesssignificance.diff new file mode 100644 index 0000000000..c8dc392666 --- /dev/null +++ b/modules/nf-core/controlfreec/assesssignificance/controlfreec-assesssignificance.diff @@ -0,0 +1,25 @@ +Changes in module 'nf-core/controlfreec/assesssignificance' +--- modules/nf-core/controlfreec/assesssignificance/environment.yml ++++ modules/nf-core/controlfreec/assesssignificance/environment.yml +@@ -4,4 +4,4 @@ + - bioconda + - defaults + dependencies: +- - bioconda::control-freec=11.6b ++ - bioconda::control-freec=11.6 + +--- modules/nf-core/controlfreec/assesssignificance/main.nf ++++ modules/nf-core/controlfreec/assesssignificance/main.nf +@@ -4,8 +4,8 @@ + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? +- 'https://depot.galaxyproject.org/singularity/control-freec:11.6b--hdbdd923_0' : +- 'biocontainers/control-freec:11.6b--hdbdd923_0' }" ++ 'https://depot.galaxyproject.org/singularity/control-freec:11.6--h1b792b2_1' : ++ 'biocontainers/control-freec:11.6--h1b792b2_1' }" + + input: + tuple val(meta), path(cnvs), path(ratio) + +************************************************************ diff --git a/modules/nf-core/controlfreec/assesssignificance/environment.yml b/modules/nf-core/controlfreec/assesssignificance/environment.yml new file mode 100644 index 0000000000..cb0b9c17c3 --- /dev/null +++ b/modules/nf-core/controlfreec/assesssignificance/environment.yml @@ -0,0 +1,7 @@ +name: controlfreec_assesssignificance +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::control-freec=11.6 diff --git a/modules/nf-core/controlfreec/assesssignificance/main.nf b/modules/nf-core/controlfreec/assesssignificance/main.nf new file mode 100644 index 0000000000..4be70b176d --- /dev/null +++ b/modules/nf-core/controlfreec/assesssignificance/main.nf @@ -0,0 +1,44 @@ +process CONTROLFREEC_ASSESSSIGNIFICANCE { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/control-freec:11.6--h1b792b2_1' : + 'biocontainers/control-freec:11.6--h1b792b2_1' }" + + input: + tuple val(meta), path(cnvs), path(ratio) + + output: + tuple val(meta), path("*.p.value.txt"), emit: p_value_txt + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + cat \$(which assess_significance.R) | R --slave --args ${cnvs} ${ratio} + + mv *.p.value.txt ${prefix}.p.value.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + controlfreec: \$(echo \$(freec -version 2>&1) | sed 's/^.*Control-FREEC //; s/:.*\$//' | sed -e "s/Control-FREEC v//g" ) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.p.value.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + controlfreec: \$(echo \$(freec -version 2>&1) | sed 's/^.*Control-FREEC //; s/:.*\$//' | sed -e "s/Control-FREEC v//g" ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/controlfreec/assesssignificance/meta.yml b/modules/nf-core/controlfreec/assesssignificance/meta.yml new file mode 100644 index 0000000000..b8cda6dd50 --- /dev/null +++ b/modules/nf-core/controlfreec/assesssignificance/meta.yml @@ -0,0 +1,49 @@ +name: controlfreec_assesssignificance +description: Add both Wilcoxon test and Kolmogorov-Smirnov test p-values to each CNV output of FREEC +keywords: + - cna + - cnv + - somatic + - single + - tumor-only +tools: + - controlfreec/assesssignificance: + description: Copy number and genotype annotation from whole genome and whole exome sequencing data. + homepage: http://boevalab.inf.ethz.ch/FREEC + documentation: http://boevalab.inf.ethz.ch/FREEC/tutorial.html + tool_dev_url: https://github.com/BoevaLab/FREEC/ + doi: "10.1093/bioinformatics/btq635" + licence: ["GPL >=2"] +input: + # Only when we have meta + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - cnvs: + type: file + description: _CNVs file generated by FREEC + pattern: "*._CNVs" + - ratio: + type: file + description: ratio file generated by FREEC + pattern: "*.ratio.txt" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - p_value_txt: + type: file + description: CNV file containing p_values for each call + pattern: "*.p.value.txt" +authors: + - "@FriederikeHanssen" +maintainers: + - "@FriederikeHanssen" diff --git a/modules/nf-core/controlfreec/freec/environment.yml b/modules/nf-core/controlfreec/freec/environment.yml new file mode 100644 index 0000000000..cb76c6ba93 --- /dev/null +++ b/modules/nf-core/controlfreec/freec/environment.yml @@ -0,0 +1,7 @@ +name: controlfreec_freec +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::control-freec=11.6b diff --git a/modules/nf-core/controlfreec/freec/main.nf b/modules/nf-core/controlfreec/freec/main.nf new file mode 100644 index 0000000000..65cae5cba8 --- /dev/null +++ b/modules/nf-core/controlfreec/freec/main.nf @@ -0,0 +1,176 @@ +process CONTROLFREEC_FREEC { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/control-freec:11.6b--hdbdd923_0' : + 'biocontainers/control-freec:11.6b--hdbdd923_0' }" + + input: + tuple val(meta), path(mpileup_normal), path(mpileup_tumor), path(cpn_normal), path(cpn_tumor), path(minipileup_normal), path(minipileup_tumor) + path fasta + path fai + path snp_position + path known_snps + path known_snps_tbi + path chr_directory + path mappability + path target_bed + path gccontent_profile + + output: + tuple val(meta), path("*_ratio.BedGraph") , emit: bedgraph, optional: true + tuple val(meta), path("*_control.cpn") , emit: control_cpn, optional: true + tuple val(meta), path("*_sample.cpn") , emit: sample_cpn + tuple val(meta), path("GC_profile.*.cpn") , emit: gcprofile_cpn, optional:true + tuple val(meta), path("*_BAF.txt") , emit: BAF + tuple val(meta), path("*_CNVs") , emit: CNV + tuple val(meta), path("*_info.txt") , emit: info + tuple val(meta), path("*_ratio.txt") , emit: ratio + tuple val(meta), path("config.txt") , emit: config + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + //"General" configurations + def bedgraphoutput = task.ext.args?["general"]?["bedgraphoutput"] ? "BedGraphOutput = ${task.ext.args["general"]["bedgraphoutput"]}" : "" + def chr_files = chr_directory ? "chrFiles =\${PWD}/${chr_directory}" : "" + def chr_length = fai ? "chrLenFile = \${PWD}/${fai}" : "" + def breakpointthreshold = task.ext.args?["general"]?["breakpointthreshold"] ? "breakPointThreshold = ${task.ext.args["general"]["breakpointthreshold"]}" : "" + def breakpointtype = task.ext.args?["general"]?["breakpointtype"] ? "breakPointType = ${task.ext.args["general"]["breakpointtype"]}" : "" + def coefficientofvariation = task.ext.args?["general"]?["coefficientofvariation"] ? "coefficientOfVariation = ${task.ext.args["general"]["coefficientofvariation"]}" : "" + def contamination = task.ext.args?["general"]?["contamination"] ? "contamination = ${task.ext.args["general"]["contamination"]}" : "" + def contaminationadjustment = task.ext.args?["general"]?["contaminationadjustment"] ? "contaminationAdjustment = ${task.ext.args["general"]["contaminationadjustment"]}" : "" + def degree = task.ext.args?["general"]?["degree"] ? "degree = ${task.ext.args["general"]["degree"]}" : "" + def forcegccontentnormalization = task.ext.args?["general"]?["forcegccontentnormalization"] ? "forceGCcontentNormalization = ${task.ext.args["general"]["forcegccontentnormalization"]}" : "" + def gccontentprofile = gccontent_profile ? "GCcontentProfile = ${gccontent_profile}" : "" + def mappability = mappability ? "gemMappabilityFile = \${PWD}/${mappability}" : "" + def intercept = task.ext.args?["general"]?["intercept"] ? "intercept = ${task.ext.args["general"]["intercept"]}" : "" + def mincnalength = task.ext.args?["general"]?["mincnalength"] ? "minCNAlength = ${task.ext.args["general"]["mincnalength"]}" : "" + def minmappabilityperwindow = task.ext.args?["general"]?["minmappabilityperwindow"] ? "minMappabilityPerWindow = ${task.ext.args["general"]["minmappabilityperwindow"]}" : "" + def minexpectedgc = task.ext.args?["general"]?["minexpectedgc"] ? "minExpectedGC = ${task.ext.args["general"]["minexpectedgc"]}" : "" + def maxexpectedgc = task.ext.args?["general"]?["maxexpectedgc"] ? "maxExpectedGC = ${task.ext.args["general"]["maxexpectedgc"]}" : "" + def minimalsubclonepresence = task.ext.args?["general"]?["minimalsubclonepresence"] ? 
"minimalSubclonePresence = ${task.ext.args["general"]["minimalsubclonepresence"]}" : "" + def noisydata = task.ext.args?["general"]?["noisydata"] ? "noisyData = ${task.ext.args["general"]["noisydata"]}" : "" + def output = task.ext.prefix ? "outputDir = \${PWD}/${task.ext.prefix}" : "" + def ploidy = task.ext.args?["general"]?["ploidy"] ? "ploidy = ${task.ext.args["general"]["ploidy"]}" : "" + def printNA = task.ext.args?["general"]?["printNA"] ? "printNA = ${task.ext.args["general"]["printNA"]}" : "" + def readcountthreshold = task.ext.args?["general"]?["readcountthreshold"] ? "readCountThreshold = ${task.ext.args["general"]["readcountthreshold"]}" : "" + def sex = task.ext.args?["general"]?["sex"] ? "sex = ${task.ext.args["general"]["sex"]}" : "" + def step = task.ext.args?["general"]?["step"] ? "step = ${task.ext.args["general"]["step"]}" : "" + def telocentromeric = task.ext.args?["general"]?["telocentromeric"] ? "telocentromeric = ${task.ext.args["general"]["telocentromeric"]} " : "" + def uniquematch = task.ext.args?["general"]?["uniquematch"] ? "uniqueMatch = ${task.ext.args["general"]["uniquematch"]}" : "" + def window = task.ext.args?["general"]?["window"] ? "window = ${task.ext.args["general"]["window"]}" : "" + + //"Control" configurations + def matefile_normal = mpileup_normal ? "mateFile = \${PWD}/${mpileup_normal}" : "" + def matecopynumberfile_normal = cpn_normal ? "mateCopyNumberFile = \${PWD}/${cpn_normal}" : "" + def minipileup_normal = minipileup_normal ? "miniPileup = \${PWD}/${minipileup_normal}" : "" + def inputformat_normal = task.ext.args?["control"]?["inputformat"] ? "inputFormat = ${task.ext.args["control"]["inputformat"]}" : "" + def mateorientation_normal = task.ext.args?["control"]?["mateorientation"] ? "mateOrientation = ${task.ext.args["control"]["mateorientation"]}" : "" + + //"Sample" configuration + def matefile_tumor = mpileup_tumor ? "mateFile = \${PWD}/${mpileup_tumor}" : "" + def matecopynumberfile_tumor = cpn_tumor ? "mateCopyNumberFile = \${PWD}/${cpn_tumor}" : "" + def minipileup_tumor = minipileup_tumor ? "miniPileup = \${PWD}/${minipileup_tumor}" : "" + def inputformat_tumor = task.ext.args?["sample"]?["inputformat"] ? "inputFormat = ${task.ext.args["sample"]["inputformat"]}" : "" + def mateorientation_tumor = task.ext.args?["sample"]?["mateorientation"] ? "mateOrientation = ${task.ext.args["sample"]["mateorientation"]}" : "" + + //"BAF" configuration + def makepileup = snp_position ? "makePileup = \${PWD}/${snp_position}" : "" + def fastafile = fasta ? "fastaFile = \${PWD}/${fasta}" : "" + def minimalcoverageperposition = task.ext.args?["BAF"]?["minimalcoverageperposition"] ? "minimalCoveragePerPosition = ${task.ext.args["BAF"]["minimalcoverageperposition"]}" : "" + def minimalqualityperposition = task.ext.args?["BAF"]?["minimalqualityperposition"] ? "minimalQualityPerPosition = ${task.ext.args["BAF"]["minimalqualityperposition"]}" : "" + def shiftinquality = task.ext.args?["BAF"]?["shiftinquality"] ? "shiftInQuality = ${task.ext.args["BAF"]["shiftinquality"]}" : "" + def snpfile = known_snps ? "SNPfile = \$PWD/${known_snps}" : "" + + //"Target" configuration + def target_bed = target_bed ? 
"captureRegions = ${target_bed}" : "" + """ + touch config.txt + + echo "[general]" >> config.txt + echo ${bedgraphoutput} >> config.txt + echo ${breakpointthreshold} >> config.txt + echo ${breakpointtype} >> config.txt + echo ${chr_files} >> config.txt + echo ${chr_length} >> config.txt + echo ${coefficientofvariation} >> config.txt + echo ${contamination} >> config.txt + echo ${contaminationadjustment} >> config.txt + echo ${degree} >> config.txt + echo ${forcegccontentnormalization} >> config.txt + echo ${gccontentprofile} >> config.txt + echo ${mappability} >> config.txt + echo ${intercept} >> config.txt + echo ${mincnalength} >> config.txt + echo ${minmappabilityperwindow} >> config.txt + echo ${minexpectedgc} >> config.txt + echo ${maxexpectedgc} >> config.txt + echo ${minimalsubclonepresence} >> config.txt + echo "maxThreads = ${task.cpus}" >> config.txt + echo ${noisydata} >> config.txt + echo ${output} >> config.txt + echo ${ploidy} >> config.txt + echo ${printNA} >> config.txt + echo ${readcountthreshold} >> config.txt + echo ${sex} >> config.txt + echo ${step} >> config.txt + echo ${telocentromeric} >> config.txt + echo ${uniquematch} >> config.txt + echo ${window} >> config.txt + + echo "[control]" >> config.txt + echo ${matefile_normal} >> config.txt + echo ${matecopynumberfile_normal} >> config.txt + echo ${minipileup_normal} >> config.txt + echo ${inputformat_normal} >> config.txt + echo ${mateorientation_normal} >> config.txt + + echo "[sample]" >> config.txt + echo ${matefile_tumor} >> config.txt + echo ${matecopynumberfile_tumor} >> config.txt + echo ${minipileup_tumor} >> config.txt + echo ${inputformat_tumor} >> config.txt + echo ${mateorientation_tumor} >> config.txt + + echo "[BAF]" >> config.txt + echo ${makepileup} >> config.txt + echo ${fastafile} >> config.txt + echo ${minimalcoverageperposition} >> config.txt + echo ${minimalqualityperposition} >> config.txt + echo ${shiftinquality} >> config.txt + echo ${snpfile} >> config.txt + + echo "[target]" >> config.txt + echo ${target_bed} >> config.txt + + freec -conf config.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + controlfreec: \$(echo \$(freec -version 2>&1) | sed 's/^.*Control-FREEC //; s/:.*\$//' | sed -e "s/Control-FREEC v//g" ) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}_ratio.BedGraph + touch ${prefix}_sample.cpn + touch GC_profile.${prefix}.cpn + touch ${prefix}_BAF.txt + touch ${prefix}_CNVs + touch ${prefix}_info.txt + touch ${prefix}_ratio.txt + touch config.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + controlfreec: \$(echo \$(freec -version 2>&1) | sed 's/^.*Control-FREEC //; s/:.*\$//' | sed -e "s/Control-FREEC v//g" ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/controlfreec/freec/meta.yml b/modules/nf-core/controlfreec/freec/meta.yml new file mode 100644 index 0000000000..1442bbe44a --- /dev/null +++ b/modules/nf-core/controlfreec/freec/meta.yml @@ -0,0 +1,180 @@ +name: controlfreec_freec +description: Copy number and genotype annotation from whole genome and whole exome sequencing data +keywords: + - cna + - cnv + - somatic + - single + - tumor-only +tools: + - controlfreec/freec: + description: Copy number and genotype annotation from whole genome and whole exome sequencing data. 
+ homepage: http://boevalab.inf.ethz.ch/FREEC + documentation: http://boevalab.inf.ethz.ch/FREEC/tutorial.html + tool_dev_url: https://github.com/BoevaLab/FREEC/ + doi: "10.1093/bioinformatics/btq635" + licence: ["GPL >=2"] +input: + - args: + type: map + description: | + Groovy Map containing tool parameters. It MUST follow the structure/keywords below and be provided via modules.config; a configuration sketch is shown after this module's documentation. + Parameters marked as (optional) can be removed from the map if they are not set. All values must be surrounded by quotes; meta map parameters can be used as values, e.g. `sex = meta.sex`. + For default values, please check the documentation above. + + ``` + { + [ + "general" :[ + "bedgraphoutput": (optional), + "breakpointthreshold": (optional), + "breakpointtype": (optional), + "coefficientofvariation": (optional), + "contamination": (optional), + "contaminationadjustment": (optional), + "degree": (optional), + "forcegccontentnormalization": (optional), + "gccontentprofile": (optional), + "intercept": (optional), + "mincnalength": (optional), + "minmappabilityperwindow": (optional), + "minexpectedgc": (optional), + "maxexpectedgc": (optional), + "minimalsubclonepresence": (optional), + "noisydata": (optional), + "ploidy": (optional), + "printNA": (optional), + "readcountthreshold": (optional), + "sex": (optional), + "step": (optional), + "telocentromeric": (optional), + "uniquematch": (optional), + "window": (optional) + ], + "control":[ + "inputformat": (required), + "mateorientation": (optional) + ], + "sample":[ + "inputformat": (required), + "mateorientation": (optional) + ], + "BAF":[ + "minimalcoverageperposition": (optional), + "minimalqualityperposition": (optional), + "shiftinquality": (optional) + ] + ] + } + ``` + - meta: + type: map + description: | + Groovy Map containing sample information + e.g.
[ id:'test', single_end:false ] + - mateFile_normal: + type: file + description: File with mapped reads + pattern: "*.{sam,bam,pileup(.gz),bowtie(.gz),eland(.gz),arachne(.gz),psl(.gz),bed(.gz)}" + - mateFile_tumor: + type: file + description: File with mapped reads + pattern: "*.{sam,bam,pileup(.gz),bowtie(.gz),eland(.gz),arachne(.gz),psl(.gz),bed(.gz)}" + - cpn_normal: + type: file + description: Raw copy number profiles (optional) + pattern: "*.cpn" + - cpn_tumor: + type: file + description: Raw copy number profiles (optional) + pattern: "*.cpn" + - minipileup_normal: + type: file + description: miniPileup file from previous run (optional) + pattern: "*.pileup" + - minipileup_tumor: + type: file + description: miniPileup file from previous run (optional) + pattern: "*.pileup" + - fasta: + type: file + description: Reference file (optional; required if args 'makePileup' is set) + pattern: "*.{fasta,fna,fa}" + - fai: + type: file + description: Fasta index + pattern: "*.fai" + - snp_position: + type: file + description: Path to a BED or VCF file with SNP positions to create a mini pileup file from the initial BAM file provided in mateFile (optional) + pattern: "*.{bed,vcf}" + - known_snps: + type: file + description: File with known SNPs + pattern: "*.{vcf,vcf.gz}" + - known_snps_tbi: + type: file + description: Index of known_snps + pattern: "*.tbi" + - chr_directory: + type: file + description: Path to directory with chromosome fasta files (optional, required if gccontentprofile is not provided) + pattern: "*/" + - mappability: + type: file + description: Contains information about mappable positions (optional) + pattern: "*.gem" + - target_bed: + type: file + description: Sorted bed file containing capture regions (optional) + pattern: "*.bed" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - bedgraph: + type: file + description: Bedgraph format for the UCSC genome browser + pattern: "*.bedgraph" + - control_cpn: + type: file + description: files with raw copy number profiles + pattern: "*_control.cpn" + - sample_cpn: + type: file + description: files with raw copy number profiles + pattern: "*_sample.cpn" + - gcprofile_cpn: + type: file + description: file with GC-content profile. + pattern: "GC_profile.*.cpn" + - BAF: + type: file + description: file with B-allele frequencies for each possibly heterozygous SNP position + pattern: "*_BAF.txt" + - CNV: + type: file + description: file with coordinates of predicted copy number alterations.
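A minimal `modules.config` sketch for the nested `args` map documented above. All keys and values here are illustrative placeholders, not Sarek's shipped defaults; only the map structure is prescribed by the module.

```
process {
    withName: 'CONTROLFREEC_FREEC' {
        // Hypothetical example values; any key marked (optional) above may be omitted
        ext.args = {
            [
                "general": [
                    "ploidy": "2",
                    "window": "50000",
                    "sex"   : meta.sex
                ],
                "control": [
                    "inputformat"    : "pileup",
                    "mateorientation": "FR"
                ],
                "sample": [
                    "inputformat"    : "pileup",
                    "mateorientation": "FR"
                ]
            ]
        }
    }
}
```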
+ pattern: "*_CNVs" + - info: + type: file + description: parsable file with information about FREEC run + pattern: "*_info.txt" + - ratio: + type: file + description: file with ratios and predicted copy number alterations for each window + pattern: "*_ratio.txt" + - config: + type: file + description: Config file used to run Control-FREEC + pattern: "config.txt" +authors: + - "@FriederikeHanssen" +maintainers: + - "@FriederikeHanssen" diff --git a/modules/nf-core/controlfreec/freec2bed/environment.yml b/modules/nf-core/controlfreec/freec2bed/environment.yml new file mode 100644 index 0000000000..12601ffa55 --- /dev/null +++ b/modules/nf-core/controlfreec/freec2bed/environment.yml @@ -0,0 +1,7 @@ +name: controlfreec_freec2bed +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::control-freec=11.6b diff --git a/modules/nf-core/controlfreec/freec2bed/main.nf b/modules/nf-core/controlfreec/freec2bed/main.nf new file mode 100644 index 0000000000..d2649cf95b --- /dev/null +++ b/modules/nf-core/controlfreec/freec2bed/main.nf @@ -0,0 +1,42 @@ +process CONTROLFREEC_FREEC2BED { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/control-freec:11.6b--hdbdd923_0' : + 'biocontainers/control-freec:11.6b--hdbdd923_0' }" + + input: + tuple val(meta), path(ratio) + + output: + tuple val(meta), path("*.bed"), emit: bed + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + freec2bed.pl -f ${ratio} ${args} > ${prefix}.bed + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + controlfreec: \$(echo \$(freec -version 2>&1) | sed 's/^.*Control-FREEC //; s/:.*\$//' | sed -e "s/Control-FREEC v//g" ) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.bed + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + controlfreec: \$(echo \$(freec -version 2>&1) | sed 's/^.*Control-FREEC //; s/:.*\$//' | sed -e "s/Control-FREEC v//g" ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/controlfreec/freec2bed/meta.yml b/modules/nf-core/controlfreec/freec2bed/meta.yml new file mode 100644 index 0000000000..b10c8ab377 --- /dev/null +++ b/modules/nf-core/controlfreec/freec2bed/meta.yml @@ -0,0 +1,44 @@ +name: controlfreec_freec2bed +description: Plot Freec output +keywords: + - cna + - cnv + - somatic + - single + - tumor-only +tools: + - controlfreec: + description: Copy number and genotype annotation from whole genome and whole exome sequencing data. + homepage: http://boevalab.inf.ethz.ch/FREEC + documentation: http://boevalab.inf.ethz.ch/FREEC/tutorial.html + tool_dev_url: https://github.com/BoevaLab/FREEC/ + doi: "10.1093/bioinformatics/btq635" + licence: ["GPL >=2"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - ratio: + type: file + description: ratio file generated by FREEC + pattern: "*.ratio.txt" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - bed: + type: file + description: Bed file + pattern: "*.bed" +authors: + - "@FriederikeHanssen" +maintainers: + - "@FriederikeHanssen" diff --git a/modules/nf-core/controlfreec/freec2circos/environment.yml b/modules/nf-core/controlfreec/freec2circos/environment.yml new file mode 100644 index 0000000000..1915abfd7f --- /dev/null +++ b/modules/nf-core/controlfreec/freec2circos/environment.yml @@ -0,0 +1,7 @@ +name: controlfreec_freec2circos +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::control-freec=11.6b diff --git a/modules/nf-core/controlfreec/freec2circos/main.nf b/modules/nf-core/controlfreec/freec2circos/main.nf new file mode 100644 index 0000000000..f80116799c --- /dev/null +++ b/modules/nf-core/controlfreec/freec2circos/main.nf @@ -0,0 +1,42 @@ +process CONTROLFREEC_FREEC2CIRCOS { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/control-freec:11.6b--hdbdd923_0' : + 'biocontainers/control-freec:11.6b--hdbdd923_0' }" + + input: + tuple val(meta), path(ratio) + + output: + tuple val(meta), path("*.circos.txt"), emit: circos + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + freec2circos.pl -f ${ratio} ${args} > ${prefix}.circos.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + controlfreec: \$(echo \$(freec -version 2>&1) | sed 's/^.*Control-FREEC //; s/:.*\$//' | sed -e "s/Control-FREEC v//g" ) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.circos.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + controlfreec: \$(echo \$(freec -version 2>&1) | sed 's/^.*Control-FREEC //; s/:.*\$//' | sed -e "s/Control-FREEC v//g" ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/controlfreec/freec2circos/meta.yml b/modules/nf-core/controlfreec/freec2circos/meta.yml new file mode 100644 index 0000000000..2c6b77d611 --- /dev/null +++ b/modules/nf-core/controlfreec/freec2circos/meta.yml @@ -0,0 +1,44 @@ +name: controlfreec_freec2circos +description: Format Freec output to circos input format +keywords: + - cna + - cnv + - somatic + - single + - tumor-only +tools: + - controlfreec: + description: Copy number and genotype annotation from whole genome and whole exome sequencing data. + homepage: http://boevalab.inf.ethz.ch/FREEC + documentation: http://boevalab.inf.ethz.ch/FREEC/tutorial.html + tool_dev_url: https://github.com/BoevaLab/FREEC/ + doi: "10.1093/bioinformatics/btq635" + licence: ["GPL >=2"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - ratio: + type: file + description: ratio file generated by FREEC + pattern: "*.ratio.txt" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - circos: + type: file + description: Txt file + pattern: "*.circos.txt" +authors: + - "@FriederikeHanssen" +maintainers: + - "@FriederikeHanssen" diff --git a/modules/nf-core/controlfreec/makegraph/controlfreec-makegraph.diff b/modules/nf-core/controlfreec/makegraph/controlfreec-makegraph.diff new file mode 100644 index 0000000000..13f4208a05 --- /dev/null +++ b/modules/nf-core/controlfreec/makegraph/controlfreec-makegraph.diff @@ -0,0 +1,23 @@ +Changes in module 'nf-core/controlfreec/makegraph' +--- modules/nf-core/controlfreec/makegraph/main.nf ++++ modules/nf-core/controlfreec/makegraph/main.nf +@@ -8,7 +8,7 @@ + 'biocontainers/control-freec:11.6b--hdbdd923_0' }" + + input: +- tuple val(meta), path(ratio), path(baf), val(ploidy) ++ tuple val(meta), path(ratio), path(baf) + + output: + tuple val(meta), path("*_BAF.png") , emit: png_baf +@@ -25,7 +25,7 @@ + def prefix = task.ext.prefix ?: "${meta.id}" + def baf = baf ?: "" + """ +- cat \$(which makeGraph.R) | R --slave --args ${ploidy} ${args} ${ratio} ${baf} ++ cat \$(which makeGraph.R) | R --slave --args ${args} ${ratio} ${baf} + + mv *_BAF.txt.png ${prefix}_BAF.png + mv *_ratio.txt.log2.png ${prefix}_ratio.log2.png + +************************************************************ diff --git a/modules/nf-core/controlfreec/makegraph/environment.yml b/modules/nf-core/controlfreec/makegraph/environment.yml new file mode 100644 index 0000000000..897eadf3bd --- /dev/null +++ b/modules/nf-core/controlfreec/makegraph/environment.yml @@ -0,0 +1,7 @@ +name: controlfreec_makegraph +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::control-freec=11.6b diff --git a/modules/nf-core/controlfreec/makegraph/main.nf b/modules/nf-core/controlfreec/makegraph/main.nf new file mode 100644 index 0000000000..8d489f71f7 --- /dev/null +++ b/modules/nf-core/controlfreec/makegraph/main.nf @@ -0,0 +1,52 @@ +process CONTROLFREEC_MAKEGRAPH { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/control-freec:11.6b--hdbdd923_0' : + 'biocontainers/control-freec:11.6b--hdbdd923_0' }" + + input: + tuple val(meta), path(ratio), path(baf) + + output: + tuple val(meta), path("*_BAF.png") , emit: png_baf + tuple val(meta), path("*_ratio.log2.png"), emit: png_ratio_log2 + tuple val(meta), path("*_ratio.png") , emit: png_ratio + + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: "" + def prefix = task.ext.prefix ?: "${meta.id}" + def baf = baf ?: "" + """ + cat \$(which makeGraph.R) | R --slave --args ${args} ${ratio} ${baf} + + mv *_BAF.txt.png ${prefix}_BAF.png + mv *_ratio.txt.log2.png ${prefix}_ratio.log2.png + mv *_ratio.txt.png ${prefix}_ratio.png + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + controlfreec: \$(echo \$(freec -version 2>&1) | sed 's/^.*Control-FREEC //; s/:.*\$//' | sed -e "s/Control-FREEC v//g" ) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}_BAF.png + touch ${prefix}_ratio.log2.png + touch ${prefix}_ratio.png + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + controlfreec: \$(echo \$(freec -version 2>&1) | sed 's/^.*Control-FREEC //; s/:.*\$//' | sed -e "s/Control-FREEC v//g" ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/controlfreec/makegraph/meta.yml b/modules/nf-core/controlfreec/makegraph/meta.yml new file mode 100644 index 0000000000..6f91db38aa --- /dev/null +++ b/modules/nf-core/controlfreec/makegraph/meta.yml @@ -0,0 +1,60 @@ +name: controlfreec_makegraph +description: Plot Freec output +keywords: + - cna + - cnv + - somatic + - single + - tumor-only +tools: + - controlfreec: + description: Copy number and genotype annotation from whole genome and whole exome sequencing data. + homepage: http://boevalab.inf.ethz.ch/FREEC + documentation: http://boevalab.inf.ethz.ch/FREEC/tutorial.html + tool_dev_url: https://github.com/BoevaLab/FREEC/ + doi: "10.1093/bioinformatics/btq635" + licence: ["GPL >=2"] +input: + # Only when we have meta + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - ratio: + type: file + description: ratio file generated by FREEC + pattern: "*.ratio.txt" + - baf: + type: file + description: .BAF file generated by FREEC + pattern: "*.BAF" + - ploidy: + type: integer + description: Ploidy value for which graph should be created +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - png_baf: + type: file + description: Image of BAF plot + pattern: "*_BAF.png" + - png_ratio_log2: + type: file + description: Image of ratio log2 plot + pattern: "*_ratio.log2.png" + - png_ratio: + type: file + description: Image of ratio plot + pattern: "*_ratio.png" +authors: + - "@FriederikeHanssen" +maintainers: + - "@FriederikeHanssen" diff --git a/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py index e55b8d43a9..da03340857 100755 --- a/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py +++ b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py @@ -4,11 +4,10 @@ """Provide functions to merge multiple versions.yml files.""" +import yaml import platform from textwrap import dedent -import yaml - def _make_versions_html(versions): """Generate a tabular HTML output of all versions for MultiQC.""" diff --git a/modules/nf-core/deepvariant/environment.yml b/modules/nf-core/deepvariant/environment.yml new file mode 100644 index 0000000000..648a76dea6 --- /dev/null +++ b/modules/nf-core/deepvariant/environment.yml @@ -0,0 +1,5 @@ +name: deepvariant +channels: + - conda-forge + - bioconda + - defaults diff --git a/modules/nf-core/deepvariant/main.nf b/modules/nf-core/deepvariant/main.nf new file mode 100644 index 0000000000..2d5c480c4d --- /dev/null +++ b/modules/nf-core/deepvariant/main.nf @@ -0,0 +1,66 @@ +process DEEPVARIANT { + tag "$meta.id" + label 'process_high' + + container "nf-core/deepvariant:1.5.0" + + input: + tuple val(meta), path(input), path(index), path(intervals) + tuple val(meta2), path(fasta) + tuple val(meta3), path(fai) + tuple val(meta4), path(gzi) + + output: + tuple val(meta), path("${prefix}.vcf.gz") , emit: vcf + tuple val(meta), path("${prefix}.vcf.gz.tbi") , emit: vcf_tbi + tuple val(meta), path("${prefix}.g.vcf.gz") , emit: gvcf + tuple val(meta), path("${prefix}.g.vcf.gz.tbi"), emit: gvcf_tbi + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error "DEEPVARIANT module does not support Conda. Please use Docker / Singularity / Podman instead." + } + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + def regions = intervals ? "--regions=${intervals}" : "" + + """ + /opt/deepvariant/bin/run_deepvariant \\ + --ref=${fasta} \\ + --reads=${input} \\ + --output_vcf=${prefix}.vcf.gz \\ + --output_gvcf=${prefix}.g.vcf.gz \\ + ${args} \\ + ${regions} \\ + --intermediate_results_dir=. \\ + --num_shards=${task.cpus} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + deepvariant: \$(echo \$(/opt/deepvariant/bin/run_deepvariant --version) | sed 's/^.*version //; s/ .*\$//' ) + END_VERSIONS + """ + + stub: + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error "DEEPVARIANT module does not support Conda. Please use Docker / Singularity / Podman instead." 
+ } + prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.vcf.gz + touch ${prefix}.vcf.gz.tbi + touch ${prefix}.g.vcf.gz + touch ${prefix}.g.vcf.gz.tbi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + deepvariant: \$(echo \$(/opt/deepvariant/bin/run_deepvariant --version) | sed 's/^.*version //; s/ .*\$//' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/deepvariant/meta.yml b/modules/nf-core/deepvariant/meta.yml new file mode 100644 index 0000000000..a50dc57d9a --- /dev/null +++ b/modules/nf-core/deepvariant/meta.yml @@ -0,0 +1,83 @@ +name: deepvariant +description: DeepVariant is an analysis pipeline that uses a deep neural network to call genetic variants from next-generation DNA sequencing data +keywords: + - variant calling + - machine learning + - neural network +tools: + - deepvariant: + description: DeepVariant is an analysis pipeline that uses a deep neural network to call genetic variants from next-generation DNA sequencing data + homepage: https://github.com/google/deepvariant + documentation: https://github.com/google/deepvariant + tool_dev_url: https://github.com/google/deepvariant + doi: "10.1038/nbt.4235" + licence: ["BSD-3-clause"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM file + pattern: "*.bam/cram" + - index: + type: file + description: Index of BAM/CRAM file + pattern: "*.bai/crai" + - interval: + type: file + description: Interval file for targeted regions + pattern: "*.bed" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fasta: + type: file + description: The reference fasta file + pattern: "*.fasta" + - meta3: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fai: + type: file + description: Index of reference fasta file + pattern: "*.fai" + - meta4: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - gzi: + type: file + description: GZI index of reference fasta file + pattern: "*.gzi" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - vcf: + type: file + description: Compressed VCF file + pattern: "*.vcf.gz" + - gvcf: + type: file + description: Compressed GVCF file + pattern: "*.g.vcf.gz" + - version: + type: file + description: File containing software version + pattern: "*.{version.txt}" +authors: + - "@abhi18av" + - "@ramprasadn" +maintainers: + - "@abhi18av" + - "@ramprasadn" diff --git a/modules/nf-core/dragmap/align/dragmap-align.diff b/modules/nf-core/dragmap/align/dragmap-align.diff new file mode 100644 index 0000000000..b789f815e3 --- /dev/null +++ b/modules/nf-core/dragmap/align/dragmap-align.diff @@ -0,0 +1,30 @@ +Changes in module 'nf-core/dragmap/align' +--- modules/nf-core/dragmap/align/main.nf ++++ modules/nf-core/dragmap/align/main.nf +@@ -4,8 +4,8 @@ + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+- 'https://depot.galaxyproject.org/singularity/mulled-v2-580d344d9d4a496cd403932da8765f9e0187774d:7eed251370ac7f3537c3d9472cdb2f9f5d8da1c5-0': +- 'biocontainers/mulled-v2-580d344d9d4a496cd403932da8765f9e0187774d:7eed251370ac7f3537c3d9472cdb2f9f5d8da1c5-0' }" ++ 'https://depot.galaxyproject.org/singularity/mulled-v2-580d344d9d4a496cd403932da8765f9e0187774d:5ebebbc128cd624282eaa37d2c7fe01505a91a69-0': ++ 'biocontainers/mulled-v2-580d344d9d4a496cd403932da8765f9e0187774d:5ebebbc128cd624282eaa37d2c7fe01505a91a69-0' }" + + input: + tuple val(meta) , path(reads) + +--- modules/nf-core/dragmap/align/environment.yml ++++ modules/nf-core/dragmap/align/environment.yml +@@ -4,7 +4,7 @@ + - bioconda + - defaults + dependencies: +- - dragmap=1.3.0 ++ - dragmap=1.2.1 + # renovate: datasource=conda depName=bioconda/samtools +- - samtools=1.18 +- - pigz=2.8 ++ - samtools=1.15.1 ++ - pigz=2.3.4 + +************************************************************ diff --git a/modules/nf-core/dragmap/align/environment.yml b/modules/nf-core/dragmap/align/environment.yml new file mode 100644 index 0000000000..b00e62f435 --- /dev/null +++ b/modules/nf-core/dragmap/align/environment.yml @@ -0,0 +1,10 @@ +name: dragmap_align +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - dragmap=1.2.1 + # renovate: datasource=conda depName=bioconda/samtools + - samtools=1.15.1 + - pigz=2.3.4 diff --git a/modules/nf-core/dragmap/align/main.nf b/modules/nf-core/dragmap/align/main.nf new file mode 100644 index 0000000000..eafa8697e4 --- /dev/null +++ b/modules/nf-core/dragmap/align/main.nf @@ -0,0 +1,60 @@ +process DRAGMAP_ALIGN { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-580d344d9d4a496cd403932da8765f9e0187774d:5ebebbc128cd624282eaa37d2c7fe01505a91a69-0': + 'biocontainers/mulled-v2-580d344d9d4a496cd403932da8765f9e0187774d:5ebebbc128cd624282eaa37d2c7fe01505a91a69-0' }" + + input: + tuple val(meta) , path(reads) + tuple val(meta2), path(hashmap) + val sort_bam + + output: + tuple val(meta), path("*.bam"), emit: bam + tuple val(meta), path('*.log'), emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def reads_command = meta.single_end ? "-1 $reads" : "-1 ${reads[0]} -2 ${reads[1]}" + def samtools_command = sort_bam ? 
'sort' : 'view' + + """ + dragen-os \\ + -r $hashmap \\ + $args \\ + --num-threads $task.cpus \\ + $reads_command \\ + 2> >(tee ${prefix}.dragmap.log >&2) \\ + | samtools $samtools_command $args2 --threads $task.cpus -o ${prefix}.bam - + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + dragmap: \$(echo \$(dragen-os --version 2>&1)) + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.bam + touch ${prefix}.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + dragmap: \$(echo \$(dragen-os --version 2>&1)) + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/dragmap/align/meta.yml b/modules/nf-core/dragmap/align/meta.yml new file mode 100644 index 0000000000..f0def75567 --- /dev/null +++ b/modules/nf-core/dragmap/align/meta.yml @@ -0,0 +1,48 @@ +name: dragmap_align +description: Performs fastq alignment to a reference using DRAGMAP +keywords: + - alignment + - map + - fastq + - bam + - sam +tools: + - dragmap: + description: Dragmap is the Dragen mapper/aligner Open Source Software. + homepage: https://github.com/Illumina/dragmap + documentation: https://github.com/Illumina/dragmap + tool_dev_url: https://github.com/Illumina/dragmap#basic-command-line-usage + licence: ["GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. 
[ id:'test', single_end:false ] + - hashmap: + type: file + description: DRAGMAP hash table + pattern: "Directory containing DRAGMAP hash table *.{cmp,.bin,.txt}" +output: + - bam: + type: file + description: Output BAM file containing read alignments + pattern: "*.{bam}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@Emiller88" +maintainers: + - "@Emiller88" diff --git a/modules/nf-core/dragmap/align/tests/main.nf.test b/modules/nf-core/dragmap/align/tests/main.nf.test new file mode 100644 index 0000000000..c90c78f314 --- /dev/null +++ b/modules/nf-core/dragmap/align/tests/main.nf.test @@ -0,0 +1,276 @@ +nextflow_process { + + name "Test Process DRAGMAP_ALIGN" + script "../main.nf" + process "DRAGMAP_ALIGN" + tag "modules" + tag "modules_nfcore" + tag "dragmap" + tag "dragmap/align" + + test("sarscov2 - fastq, hashtable, false") { + + setup { + run("DRAGMAP_HASHTABLE") { + script "../../hashtable/main.nf" + process { + """ + input[0] = [ + [id:'test'], + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + } + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) + ] + input[1] = DRAGMAP_HASHTABLE.out.hashmap + input[2] = false //sort + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + file(process.out.log[0][1]).readLines().findAll { it.startsWith("decompHash") }, + file(process.out.versions[0]).name + ).match() } + ) + } + + } + + test("sarscov2 - fastq, hashtable, true") { + + setup { + run("DRAGMAP_HASHTABLE") { + script "../../hashtable/main.nf" + process { + """ + input[0] = [ + [id:'test'], + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + } + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) + ] + input[1] = DRAGMAP_HASHTABLE.out.hashmap + input[2] = true //sort + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + file(process.out.log[0][1]).readLines().findAll { it.startsWith("decompHash") }, + file(process.out.versions[0]).name + ).match() } + ) + } + + } + + test("sarscov2 - [fastq1, fastq2], hashtable, false") { + + setup { + run("DRAGMAP_HASHTABLE") { + script "../../hashtable/main.nf" + process { + """ + input[0] = [ + [id:'test'], + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + } + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) + ] + ] + input[1] = DRAGMAP_HASHTABLE.out.hashmap + input[2] = false //sort + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + file(process.out.log[0][1]).readLines().findAll { it.startsWith("decompHash") }, + file(process.out.versions[0]).name + ).match() } + ) + } + + } + + test("sarscov2 - [fastq1, fastq2], hashtable, true") { + + setup { + run("DRAGMAP_HASHTABLE") { + script "../../hashtable/main.nf" + process { + """ + input[0] = 
[ + [id:'test'], + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + } + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) + ] + ] + input[1] = DRAGMAP_HASHTABLE.out.hashmap + input[2] = true //sort + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + file(process.out.log[0][1]).readLines().findAll { it.startsWith("decompHash") }, + file(process.out.versions[0]).name + ).match() } + ) + } + + } + + test("homo_sapiens - [fastq1, fastq2], hashtable, true") { + + setup { + run("DRAGMAP_HASHTABLE") { + script "../../hashtable/main.nf" + process { + """ + input[0] = [ + [id:'test'], + file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + } + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.test_data['homo_sapiens']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['homo_sapiens']['illumina']['test_2_fastq_gz'], checkIfExists: true) + ] + ] + input[1] = DRAGMAP_HASHTABLE.out.hashmap + input[2] = true //sort + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + file(process.out.log[0][1]).readLines().findAll { it.startsWith("decompHash") }, + file(process.out.versions[0]).name + ).match() } + ) + } + + } + + test("sarscov2 - [fastq1, fastq2], hashtable, true - stub") { + + options "-stub" + setup { + run("DRAGMAP_HASHTABLE") { + script "../../hashtable/main.nf" + process { + """ + input[0] = [ + [id:'test'], + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + } + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) + ] + ] + input[1] = DRAGMAP_HASHTABLE.out.hashmap + input[2] = true //sort + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot( + file(process.out.bam[0][1]).name, + file(process.out.log[0][1]).name, + file(process.out.versions[0]).name + ).match() } + ) + } + + } + +} diff --git a/modules/nf-core/dragmap/align/tests/main.nf.test.snap b/modules/nf-core/dragmap/align/tests/main.nf.test.snap new file mode 100644 index 0000000000..32d88a2796 --- /dev/null +++ b/modules/nf-core/dragmap/align/tests/main.nf.test.snap @@ -0,0 +1,85 @@ +{ + "homo_sapiens - [fastq1, fastq2], hashtable, true": { + "content": [ + "test.bam", + [ + "decompHashTableCtxInit...", + "decompHashTableHeader...", + "decompHashTableLiterals...", + "decompHashTableExtIndex...", + "decompHashTableAutoHits...", + "decompHashTableSetFlags..." 
+ ], + "versions.yml" + ], + "timestamp": "2023-11-23T10:12:03.844836279" + }, + "sarscov2 - [fastq1, fastq2], hashtable, true - stub": { + "content": [ + "test.bam", + "test.log", + "versions.yml" + ], + "timestamp": "2023-11-22T13:41:18.016853266" + }, + "sarscov2 - fastq, hashtable, false": { + "content": [ + "test.bam", + [ + "decompHashTableCtxInit...", + "decompHashTableHeader...", + "decompHashTableLiterals...", + "decompHashTableExtIndex...", + "decompHashTableAutoHits...", + "decompHashTableSetFlags..." + ], + "versions.yml" + ], + "timestamp": "2023-11-23T10:11:33.956661024" + }, + "sarscov2 - fastq, hashtable, true": { + "content": [ + "test.bam", + [ + "decompHashTableCtxInit...", + "decompHashTableHeader...", + "decompHashTableLiterals...", + "decompHashTableExtIndex...", + "decompHashTableAutoHits...", + "decompHashTableSetFlags..." + ], + "versions.yml" + ], + "timestamp": "2023-11-23T10:11:40.270598375" + }, + "sarscov2 - [fastq1, fastq2], hashtable, false": { + "content": [ + "test.bam", + [ + "decompHashTableCtxInit...", + "decompHashTableHeader...", + "decompHashTableLiterals...", + "decompHashTableExtIndex...", + "decompHashTableAutoHits...", + "decompHashTableSetFlags..." + ], + "versions.yml" + ], + "timestamp": "2023-11-23T10:11:46.928978876" + }, + "sarscov2 - [fastq1, fastq2], hashtable, true": { + "content": [ + "test.bam", + [ + "decompHashTableCtxInit...", + "decompHashTableHeader...", + "decompHashTableLiterals...", + "decompHashTableExtIndex...", + "decompHashTableAutoHits...", + "decompHashTableSetFlags..." + ], + "versions.yml" + ], + "timestamp": "2023-11-23T10:11:53.506727278" + } +} \ No newline at end of file diff --git a/modules/nf-core/dragmap/align/tests/tags.yml b/modules/nf-core/dragmap/align/tests/tags.yml new file mode 100644 index 0000000000..a2a388af37 --- /dev/null +++ b/modules/nf-core/dragmap/align/tests/tags.yml @@ -0,0 +1,2 @@ +dragmap/align: + - modules/nf-core/dragmap/align/** diff --git a/modules/nf-core/dragmap/hashtable/dragmap-hashtable.diff b/modules/nf-core/dragmap/hashtable/dragmap-hashtable.diff new file mode 100644 index 0000000000..a5823ce8bc --- /dev/null +++ b/modules/nf-core/dragmap/hashtable/dragmap-hashtable.diff @@ -0,0 +1,25 @@ +Changes in module 'nf-core/dragmap/hashtable' +--- modules/nf-core/dragmap/hashtable/environment.yml ++++ modules/nf-core/dragmap/hashtable/environment.yml +@@ -4,4 +4,4 @@ + - bioconda + - defaults + dependencies: +- - bioconda::dragmap=1.3.0 ++ - bioconda::dragmap=1.2.1 + +--- modules/nf-core/dragmap/hashtable/main.nf ++++ modules/nf-core/dragmap/hashtable/main.nf +@@ -4,8 +4,8 @@ + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+- 'https://depot.galaxyproject.org/singularity/dragmap:1.3.0--h72d16da_1': +- 'biocontainers/dragmap:1.3.0--h72d16da_1' }" ++ 'https://depot.galaxyproject.org/singularity/dragmap:1.2.1--h72d16da_1': ++ 'biocontainers/dragmap:1.2.1--h72d16da_1' }" + + input: + tuple val(meta), path(fasta) + +************************************************************ diff --git a/modules/nf-core/dragmap/hashtable/environment.yml b/modules/nf-core/dragmap/hashtable/environment.yml new file mode 100644 index 0000000000..3c3d1404f4 --- /dev/null +++ b/modules/nf-core/dragmap/hashtable/environment.yml @@ -0,0 +1,7 @@ +name: dragmap_hashtable +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::dragmap=1.2.1 diff --git a/modules/nf-core/dragmap/hashtable/main.nf b/modules/nf-core/dragmap/hashtable/main.nf new file mode 100644 index 0000000000..604052f33f --- /dev/null +++ b/modules/nf-core/dragmap/hashtable/main.nf @@ -0,0 +1,36 @@ +process DRAGMAP_HASHTABLE { + tag "$fasta" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/dragmap:1.2.1--h72d16da_1': + 'biocontainers/dragmap:1.2.1--h72d16da_1' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path("dragmap") , emit: hashmap + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + mkdir dragmap + dragen-os \\ + --build-hash-table true \\ + --ht-reference $fasta \\ + --output-directory dragmap \\ + $args \\ + --ht-num-threads $task.cpus + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + dragmap: \$(echo \$(dragen-os --version 2>&1)) + END_VERSIONS + """ +} diff --git a/modules/nf-core/dragmap/hashtable/meta.yml b/modules/nf-core/dragmap/hashtable/meta.yml new file mode 100644 index 0000000000..8daca83273 --- /dev/null +++ b/modules/nf-core/dragmap/hashtable/meta.yml @@ -0,0 +1,41 @@ +name: dragmap_hashtable +description: Create DRAGEN hashtable for reference genome +keywords: + - index + - fasta + - genome + - reference +tools: + - dragmap: + description: Dragmap is the Dragen mapper/aligner Open Source Software. + homepage: https://github.com/Illumina/dragmap + documentation: https://github.com/Illumina/dragmap + tool_dev_url: https://github.com/Illumina/dragmap#basic-command-line-usage + licence: ["GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: Input genome fasta file +output: + - meta: + type: map + description: | + Groovy Map containing reference information + e.g. 
[ id:'test', single_end:false ] + - hashmap: + type: file + description: DRAGMAP hash table + pattern: "*.{cmp,.bin,.txt}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@Emiller88" +maintainers: + - "@Emiller88" diff --git a/modules/nf-core/ensemblvep/download/environment.yml b/modules/nf-core/ensemblvep/download/environment.yml new file mode 100644 index 0000000000..beebaca634 --- /dev/null +++ b/modules/nf-core/ensemblvep/download/environment.yml @@ -0,0 +1,7 @@ +name: ensemblvep_download +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::ensembl-vep=110.0 diff --git a/modules/nf-core/ensemblvep/download/main.nf b/modules/nf-core/ensemblvep/download/main.nf new file mode 100644 index 0000000000..a770cbfc6f --- /dev/null +++ b/modules/nf-core/ensemblvep/download/main.nf @@ -0,0 +1,45 @@ +process ENSEMBLVEP_DOWNLOAD { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ensembl-vep:110.0--pl5321h2a3209d_0' : + 'biocontainers/ensembl-vep:110.0--pl5321h2a3209d_0' }" + + input: + tuple val(meta), val(assembly), val(species), val(cache_version) + + output: + tuple val(meta), path("vep_cache"), emit: cache + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + vep_install \\ + --CACHEDIR vep_cache \\ + --SPECIES $species \\ + --ASSEMBLY $assembly \\ + --CACHE_VERSION $cache_version \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + ensemblvep: \$( echo \$(vep --help 2>&1) | sed 's/^.*Versions:.*ensembl-vep : //;s/ .*\$//') + END_VERSIONS + """ + + stub: + """ + mkdir vep_cache + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + ensemblvep: \$( echo \$(vep --help 2>&1) | sed 's/^.*Versions:.*ensembl-vep : //;s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/ensemblvep/download/meta.yml b/modules/nf-core/ensemblvep/download/meta.yml new file mode 100644 index 0000000000..a4277ad7a7 --- /dev/null +++ b/modules/nf-core/ensemblvep/download/meta.yml @@ -0,0 +1,45 @@ +name: ensemblvep_download +description: Ensembl Variant Effect Predictor (VEP). The cache downloading options are controlled through `task.ext.args`. +keywords: + - annotation + - cache + - download +tools: + - ensemblvep: + description: | + VEP determines the effect of your variants (SNPs, insertions, deletions, CNVs + or structural variants) on genes, transcripts, and protein sequence, as well as regulatory regions. + homepage: https://www.ensembl.org/info/docs/tools/vep/index.html + documentation: https://www.ensembl.org/info/docs/tools/vep/script/index.html + licence: ["Apache-2.0"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - assembly: + type: string + description: | + Genome assembly + - species: + type: string + description: | + Species + - cache_version: + type: string + description: | + cache version +output: + - cache: + type: file + description: cache + pattern: "*" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@maxulysse" +maintainers: + - "@maxulysse" diff --git a/modules/nf-core/ensemblvep/vep/environment.yml b/modules/nf-core/ensemblvep/vep/environment.yml new file mode 100644 index 0000000000..7a12774608 --- /dev/null +++ b/modules/nf-core/ensemblvep/vep/environment.yml @@ -0,0 +1,7 @@ +name: ensemblvep_vep +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::ensembl-vep=110.0 diff --git a/modules/nf-core/ensemblvep/vep/main.nf b/modules/nf-core/ensemblvep/vep/main.nf new file mode 100644 index 0000000000..3a2b742348 --- /dev/null +++ b/modules/nf-core/ensemblvep/vep/main.nf @@ -0,0 +1,71 @@ +process ENSEMBLVEP_VEP { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ensembl-vep:110.0--pl5321h2a3209d_0' : + 'biocontainers/ensembl-vep:110.0--pl5321h2a3209d_0' }" + + input: + tuple val(meta), path(vcf), path(custom_extra_files) + val genome + val species + val cache_version + path cache + tuple val(meta2), path(fasta) + path extra_files + + output: + tuple val(meta), path("*.vcf.gz") , optional:true, emit: vcf + tuple val(meta), path("*.tab.gz") , optional:true, emit: tab + tuple val(meta), path("*.json.gz") , optional:true, emit: json + path "*.summary.html" , emit: report + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def file_extension = args.contains("--vcf") ? 'vcf' : args.contains("--json")? 'json' : args.contains("--tab")? 'tab' : 'vcf' + def compress_cmd = args.contains("--compress_output") ? '' : '--compress_output bgzip' + def prefix = task.ext.prefix ?: "${meta.id}" + def dir_cache = cache ? "\${PWD}/${cache}" : "/.vep" + def reference = fasta ? "--fasta $fasta" : "" + """ + vep \\ + -i $vcf \\ + -o ${prefix}.${file_extension}.gz \\ + $args \\ + $compress_cmd \\ + $reference \\ + --assembly $genome \\ + --species $species \\ + --cache \\ + --cache_version $cache_version \\ + --dir_cache $dir_cache \\ + --fork $task.cpus \\ + --stats_file ${prefix}.summary.html \\ + + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + ensemblvep: \$( echo \$(vep --help 2>&1) | sed 's/^.*Versions:.*ensembl-vep : //;s/ .*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.vcf.gz + touch ${prefix}.tab.gz + touch ${prefix}.json.gz + touch ${prefix}.summary.html + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + ensemblvep: \$( echo \$(vep --help 2>&1) | sed 's/^.*Versions:.*ensembl-vep : //;s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/ensemblvep/vep/meta.yml b/modules/nf-core/ensemblvep/vep/meta.yml new file mode 100644 index 0000000000..d8ff8d1443 --- /dev/null +++ b/modules/nf-core/ensemblvep/vep/meta.yml @@ -0,0 +1,92 @@ +name: ensemblvep_vep +description: Ensembl Variant Effect Predictor (VEP). The output file format is controlled through `task.ext.args`.
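Since the module's `file_extension` logic above keys on `--vcf`/`--tab`/`--json` in `ext.args`, the output format is selected entirely in configuration. A minimal sketch; the prefix pattern is an illustrative assumption, not a Sarek convention:

```
process {
    withName: 'ENSEMBLVEP_VEP' {
        // '--tab' makes the module emit ${prefix}.tab.gz; with none of
        // --vcf/--tab/--json present it falls back to vcf
        ext.args   = '--tab'
        ext.prefix = { "${meta.id}_vep" } // hypothetical naming scheme
    }
}
```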
+keywords: + - annotation + - vcf + - json + - tab +tools: + - ensemblvep: + description: | + VEP determines the effect of your variants (SNPs, insertions, deletions, CNVs + or structural variants) on genes, transcripts, and protein sequence, as well as regulatory regions. + homepage: https://www.ensembl.org/info/docs/tools/vep/index.html + documentation: https://www.ensembl.org/info/docs/tools/vep/script/index.html + licence: ["Apache-2.0"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - vcf: + type: file + description: | + vcf to annotate + - custom_extra_files: + type: file + description: | + extra sample-specific files to be used with the `--custom` flag to be configured with ext.args + (optional) + - genome: + type: string + description: | + which genome to annotate with + - species: + type: string + description: | + which species to annotate with + - cache_version: + type: integer + description: | + which version of the cache to annotate with + - cache: + type: file + description: | + path to VEP cache (optional) + - meta2: + type: map + description: | + Groovy Map containing fasta reference information + e.g. [ id:'test' ] + - fasta: + type: file + description: | + reference FASTA file (optional) + pattern: "*.{fasta,fa}" + - extra_files: + type: file + description: | + path to file(s) needed for plugins (optional) +output: + - vcf: + type: file + description: | + annotated vcf (optional) + pattern: "*.ann.vcf.gz" + - tab: + type: file + description: | + tab file with annotated variants (optional) + pattern: "*.ann.tab.gz" + - json: + type: file + description: | + json file with annotated variants (optional) + pattern: "*.ann.json.gz" + - report: + type: file + description: VEP report file + pattern: "*.html" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@maxulysse" + - "@matthdsm" + - "@nvnieuwk" +maintainers: + - "@maxulysse" + - "@matthdsm" + - "@nvnieuwk" diff --git a/modules/nf-core/fastp/environment.yml b/modules/nf-core/fastp/environment.yml new file mode 100644 index 0000000000..70389e664c --- /dev/null +++ b/modules/nf-core/fastp/environment.yml @@ -0,0 +1,7 @@ +name: fastp +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::fastp=0.23.4 diff --git a/modules/nf-core/fastp/main.nf b/modules/nf-core/fastp/main.nf new file mode 100644 index 0000000000..5fac3c1adb --- /dev/null +++ b/modules/nf-core/fastp/main.nf @@ -0,0 +1,102 @@ +process FASTP { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/fastp:0.23.4--h5f740d0_0' : + 'biocontainers/fastp:0.23.4--h5f740d0_0' }" + + input: + tuple val(meta), path(reads) + path adapter_fasta + val save_trimmed_fail + val save_merged + + output: + tuple val(meta), path('*.fastp.fastq.gz') , optional:true, emit: reads + tuple val(meta), path('*.json') , emit: json + tuple val(meta), path('*.html') , emit: html + tuple val(meta), path('*.log') , emit: log + path "versions.yml" , emit: versions + tuple val(meta), path('*.fail.fastq.gz') , optional:true, emit: reads_fail + tuple val(meta), path('*.merged.fastq.gz'), optional:true, emit: reads_merged + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def adapter_list = adapter_fasta ? "--adapter_fasta ${adapter_fasta}" : "" + def fail_fastq = save_trimmed_fail && meta.single_end ? "--failed_out ${prefix}.fail.fastq.gz" : save_trimmed_fail && !meta.single_end ? "--unpaired1 ${prefix}_1.fail.fastq.gz --unpaired2 ${prefix}_2.fail.fastq.gz" : '' + // Added soft-links to original fastqs for consistent naming in MultiQC + // Use single ended for interleaved. Add --interleaved_in in config. + if ( task.ext.args?.contains('--interleaved_in') ) { + """ + [ ! -f ${prefix}.fastq.gz ] && ln -sf $reads ${prefix}.fastq.gz + + fastp \\ + --stdout \\ + --in1 ${prefix}.fastq.gz \\ + --thread $task.cpus \\ + --json ${prefix}.fastp.json \\ + --html ${prefix}.fastp.html \\ + $adapter_list \\ + $fail_fastq \\ + $args \\ + 2> >(tee ${prefix}.fastp.log >&2) \\ + | gzip -c > ${prefix}.fastp.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g") + END_VERSIONS + """ + } else if (meta.single_end) { + """ + [ ! -f ${prefix}.fastq.gz ] && ln -sf $reads ${prefix}.fastq.gz + + fastp \\ + --in1 ${prefix}.fastq.gz \\ + --out1 ${prefix}.fastp.fastq.gz \\ + --thread $task.cpus \\ + --json ${prefix}.fastp.json \\ + --html ${prefix}.fastp.html \\ + $adapter_list \\ + $fail_fastq \\ + $args \\ + 2> >(tee ${prefix}.fastp.log >&2) + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g") + END_VERSIONS + """ + } else { + def merge_fastq = save_merged ? "-m --merged_out ${prefix}.merged.fastq.gz" : '' + """ + [ ! -f ${prefix}_1.fastq.gz ] && ln -sf ${reads[0]} ${prefix}_1.fastq.gz + [ ! -f ${prefix}_2.fastq.gz ] && ln -sf ${reads[1]} ${prefix}_2.fastq.gz + fastp \\ + --in1 ${prefix}_1.fastq.gz \\ + --in2 ${prefix}_2.fastq.gz \\ + --out1 ${prefix}_1.fastp.fastq.gz \\ + --out2 ${prefix}_2.fastp.fastq.gz \\ + --json ${prefix}.fastp.json \\ + --html ${prefix}.fastp.html \\ + $adapter_list \\ + $fail_fastq \\ + $merge_fastq \\ + --thread $task.cpus \\ + --detect_adapter_for_pe \\ + $args \\ + 2> >(tee ${prefix}.fastp.log >&2) + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g") + END_VERSIONS + """ + } +} diff --git a/modules/nf-core/fastp/meta.yml b/modules/nf-core/fastp/meta.yml new file mode 100644 index 0000000000..c22a16abd9 --- /dev/null +++ b/modules/nf-core/fastp/meta.yml @@ -0,0 +1,75 @@ +name: fastp +description: Perform adapter/quality trimming on sequencing reads +keywords: + - trimming + - quality control + - fastq +tools: + - fastp: + description: | + A tool designed to provide fast all-in-one preprocessing for FastQ files. 
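As the comment in the script block above notes, interleaved FASTQs are supplied as single-end data with `--interleaved_in` added via configuration. A minimal sketch, assuming the module is addressed by its default process name:

```
process {
    withName: 'FASTP' {
        // Pair this with meta.single_end = true on the input channel so the
        // interleaved file takes the interleaved branch of the script
        ext.args = '--interleaved_in'
    }
}
```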
This tool is developed in C++ with multithreading supported to afford high performance. + documentation: https://github.com/OpenGene/fastp + doi: 10.1093/bioinformatics/bty560 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information. Use 'single_end: true' to specify single-end or interleaved FASTQs. Use 'single_end: false' for paired-end reads. + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. If you wish to run interleaved paired-end data, supply as single-end data + but with `--interleaved_in` in your `modules.config`'s `ext.args` for the module. + - adapter_fasta: + type: file + description: File in FASTA format containing possible adapters to remove. + pattern: "*.{fasta,fna,fas,fa}" + - save_trimmed_fail: + type: boolean + description: Specify true to save files that failed to pass trimming thresholds ending in `*.fail.fastq.gz` + - save_merged: + type: boolean + description: Specify true to save all merged reads to a file ending in `*.merged.fastq.gz` +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: The trimmed/modified/unmerged fastq reads + pattern: "*fastp.fastq.gz" + - json: + type: file + description: Results in JSON format + pattern: "*.json" + - html: + type: file + description: Results in HTML format + pattern: "*.html" + - log: + type: file + description: fastp log file + pattern: "*.log" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - reads_fail: + type: file + description: Reads that failed the preprocessing + pattern: "*fail.fastq.gz" + - reads_merged: + type: file + description: Reads that were successfully merged + pattern: "*.{merged.fastq.gz}" +authors: + - "@drpatelh" + - "@kevinmenden" +maintainers: + - "@drpatelh" + - "@kevinmenden" diff --git a/modules/nf-core/fastp/tests/main.nf.test b/modules/nf-core/fastp/tests/main.nf.test new file mode 100644 index 0000000000..f610b735e2 --- /dev/null +++ b/modules/nf-core/fastp/tests/main.nf.test @@ -0,0 +1,485 @@ +nextflow_process { + + name "Test Process FASTP" + script "../main.nf" + process "FASTP" + tag "modules" + tag "modules_nfcore" + tag "fastp" + + test("test_fastp_single_end") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + adapter_fasta = [] + save_trimmed_fail = false + save_merged = false + + input[0] = [ + [ id:'test', single_end:true ], + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ] + ] + + input[1] = adapter_fasta + input[2] = save_trimmed_fail + input[3] = save_merged + """ + } + } + + then { + def html_text = [ "Q20 bases:12.922000 K (92.984097%)", + "single end (151 cycles)" ] + def log_text = [ "Q20 bases: 12922(92.9841%)", + "reads passed filter: 99" ] + def read_lines = ["@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/1", + "TCATAAACCAAAGCACTCACAGTGTCAACAATTTCAGCAGGACAACGCCGACAAGTTCCGAGGAACATGTCTGGACCTATAGTTTTCATAAGTCTACACACTGAATTGAAATATTCTGGTTCTAGTGTGCCCTTAGTTAGCAATGTGCGT", + "AAAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEE + { assert path(process.out.reads.get(0).get(1)).linesGzip.contains(read_line) } + } + }, + { html_text.each { html_part -> + { assert
path(process.out.html.get(0).get(1)).getText().contains(html_part) } + } + }, + { assert snapshot(process.out.json).match("test_fastp_single_end_json") }, + { log_text.each { log_part -> + { assert path(process.out.log.get(0).get(1)).getText().contains(log_part) } + } + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("test_fastp_paired_end") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + adapter_fasta = [] + save_trimmed_fail = false + save_merged = false + + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ] + ] + + input[1] = adapter_fasta + input[2] = save_trimmed_fail + input[3] = save_merged + """ + } + } + + then { + def html_text = [ "Q20 bases:25.719000 K (93.033098%)", + "The input has little adapter percentage (~0.000000%), probably it's trimmed before."] + def log_text = [ "No adapter detected for read1", + "Q30 bases: 12281(88.3716%)"] + def json_text = ['"passed_filter_reads": 198'] + def read1_lines = ["@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/1", + "TCATAAACCAAAGCACTCACAGTGTCAACAATTTCAGCAGGACAACGCCGACAAGTTCCGAGGAACATGTCTGGACCTATAGTTTTCATAAGTCTACACACTGAATTGAAATATTCTGGTTCTAGTGTGCCCTTAGTTAGCAATGTGCGT", + "AAAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEE + { assert path(process.out.reads.get(0).get(1).get(0)).linesGzip.contains(read1_line) } + } + }, + { read2_lines.each { read2_line -> + { assert path(process.out.reads.get(0).get(1).get(1)).linesGzip.contains(read2_line) } + } + }, + { html_text.each { html_part -> + { assert path(process.out.html.get(0).get(1)).getText().contains(html_part) } + } + }, + { json_text.each { json_part -> + { assert path(process.out.json.get(0).get(1)).getText().contains(json_part) } + } + }, + { log_text.each { log_part -> + { assert path(process.out.log.get(0).get(1)).getText().contains(log_part) } + } + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("fastp test_fastp_interleaved") { + config './nextflow.config' + when { + params { + outdir = "$outputDir" + } + process { + """ + adapter_fasta = [] + save_trimmed_fail = false + save_merged = false + + input[0] = [ [ id:'test', single_end:true ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_interleaved_fastq_gz'], checkIfExists: true) ] + ] + + input[1] = adapter_fasta + input[2] = save_trimmed_fail + input[3] = save_merged + """ + } + } + + then { + def html_text = [ "Q20 bases:25.719000 K (93.033098%)", + "paired end (151 cycles + 151 cycles)"] + def log_text = [ "Q20 bases: 12922(92.9841%)", + "reads passed filter: 198"] + def read_lines = [ "@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/1", + "TCATAAACCAAAGCACTCACAGTGTCAACAATTTCAGCAGGACAACGCCGACAAGTTCCGAGGAACATGTCTGGACCTATAGTTTTCATAAGTCTACACACTGAATTGAAATATTCTGGTTCTAGTGTGCCCTTAGTTAGCAATGTGCGT", + "AAAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEE + { assert path(process.out.reads.get(0).get(1)).linesGzip.contains(read_line) } + } + }, + { html_text.each { html_part -> + { assert path(process.out.html.get(0).get(1)).getText().contains(html_part) } + } + }, + { assert snapshot(process.out.json).match("fastp test_fastp_interleaved_json") }, + { log_text.each { log_part 
-> + { assert path(process.out.log.get(0).get(1)).getText().contains(log_part) } + } + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("test_fastp_single_end_trim_fail") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + adapter_fasta = [] + save_trimmed_fail = true + save_merged = false + + input[0] = [ [ id:'test', single_end:true ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ] + ] + input[1] = adapter_fasta + input[2] = save_trimmed_fail + input[3] = save_merged + """ + } + } + + then { + def html_text = [ "Q20 bases:12.922000 K (92.984097%)", + "single end (151 cycles)"] + def log_text = [ "Q20 bases: 12922(92.9841%)", + "reads passed filter: 99" ] + def read_lines = [ "@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/1", + "TCATAAACCAAAGCACTCACAGTGTCAACAATTTCAGCAGGACAACGCCGACAAGTTCCGAGGAACATGTCTGGACCTATAGTTTTCATAAGTCTACACACTGAATTGAAATATTCTGGTTCTAGTGTGCCCTTAGTTAGCAATGTGCGT", + "AAAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEE + { assert path(process.out.reads.get(0).get(1)).linesGzip.contains(read_line) } + } + }, + { failed_read_lines.each { failed_read_line -> + { assert path(process.out.reads_fail.get(0).get(1)).linesGzip.contains(failed_read_line) } + } + }, + { html_text.each { html_part -> + { assert path(process.out.html.get(0).get(1)).getText().contains(html_part) } + } + }, + { assert snapshot(process.out.json).match("test_fastp_single_end_trim_fail_json") }, + { log_text.each { log_part -> + { assert path(process.out.log.get(0).get(1)).getText().contains(log_part) } + } + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("test_fastp_paired_end_trim_fail") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + adapter_fasta = [] + save_trimmed_fail = true + save_merged = false + + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) + ] + ] + input[1] = adapter_fasta + input[2] = save_trimmed_fail + input[3] = save_merged + """ + } + } + + then { + def html_text = [ "Q20 bases:25.719000 K (93.033098%)", + "The input has little adapter percentage (~0.000000%), probably it's trimmed before."] + def log_text = [ "No adapter detected for read1", + "Q30 bases: 12281(88.3716%)"] + def json_text = ['"passed_filter_reads": 198'] + def read1_lines = ["@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/1", + "TCATAAACCAAAGCACTCACAGTGTCAACAATTTCAGCAGGACAACGCCGACAAGTTCCGAGGAACATGTCTGGACCTATAGTTTTCATAAGTCTACACACTGAATTGAAATATTCTGGTTCTAGTGTGCCCTTAGTTAGCAATGTGCGT", + "AAAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEE + { assert path(process.out.reads.get(0).get(1).get(0)).linesGzip.contains(read1_line) } + } + }, + { read2_lines.each { read2_line -> + { assert path(process.out.reads.get(0).get(1).get(1)).linesGzip.contains(read2_line) } + } + }, + { failed_read2_lines.each { failed_read2_line -> + { assert path(process.out.reads_fail.get(0).get(1).get(1)).linesGzip.contains(failed_read2_line) } + } + }, + { html_text.each { html_part -> + { assert path(process.out.html.get(0).get(1)).getText().contains(html_part) } + } + }, + { json_text.each { json_part -> + { assert 
path(process.out.json.get(0).get(1)).getText().contains(json_part) } + } + }, + { log_text.each { log_part -> + { assert path(process.out.log.get(0).get(1)).getText().contains(log_part) } + } + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("test_fastp_paired_end_merged") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + adapter_fasta = [] + save_trimmed_fail = false + save_merged = true + + input[0] = [ [ id:'test', single_end:false ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ] + ] + input[1] = adapter_fasta + input[2] = save_trimmed_fail + input[3] = save_merged + """ + } + } + + then { + def html_text = [ "
"] + def log_text = [ "Merged and filtered:", + "total reads: 75", + "total bases: 13683"] + def json_text = ['"merged_and_filtered": {', '"total_reads": 75', '"total_bases": 13683'] + def read1_lines = [ "@ERR5069949.1066259 NS500628:121:HK3MMAFX2:1:11312:18369:8333/1", + "CCTTATGACAGCAAGAACTGTGTATGATGATGGTGCTAGGAGAGTGTGGACACTTATGAATGTCTTGACACTCGTTTATAAAGTTTATTATGGTAATGCTTTAGATCAAGCCATTTCCATGTGGGCTCTTATAATCTCTGTTACTTC", + "AAAAAEAEEAEEEEEEEEEEEEEEEEAEEEEAEEEEEEEEAEEEEEEEEEEEEEEEEE/EAEEEEEE/6EEEEEEEEEEAEEAEEE/EE/AEEAEEEEEAEEEA/EEAAEAE + { assert path(process.out.reads.get(0).get(1).get(0)).linesGzip.contains(read1_line) } + } + }, + { read2_lines.each { read2_line -> + { assert path(process.out.reads.get(0).get(1).get(1)).linesGzip.contains(read2_line) } + } + }, + { read_merged_lines.each { read_merged_line -> + { assert path(process.out.reads_merged.get(0).get(1)).linesGzip.contains(read_merged_line) } + } + }, + { html_text.each { html_part -> + { assert path(process.out.html.get(0).get(1)).getText().contains(html_part) } + } + }, + { json_text.each { json_part -> + { assert path(process.out.json.get(0).get(1)).getText().contains(json_part) } + } + }, + { log_text.each { log_part -> + { assert path(process.out.log.get(0).get(1)).getText().contains(log_part) } + } + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + + test("test_fastp_paired_end_merged_adapterlist") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + adapter_fasta = file("https://github.com/nf-core/test-datasets/raw/modules/data/delete_me/fastp/adapters.fasta", checkIfExists: true) + save_trimmed_fail = false + save_merged = true + + input[0] = [ [ id:'test', single_end:false ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ] + ] + input[1] = adapter_fasta + input[2] = save_trimmed_fail + input[3] = save_merged + """ + } + } + + then { + def html_text = [ "
"] + def log_text = [ "Merged and filtered:", + "total reads: 75", + "total bases: 13683"] + def json_text = ['"merged_and_filtered": {', '"total_reads": 75', '"total_bases": 13683',"--adapter_fasta"] + def read1_lines = ["@ERR5069949.1066259 NS500628:121:HK3MMAFX2:1:11312:18369:8333/1", + "CCTTATGACAGCAAGAACTGTGTATGATGATGGTGCTAGGAGAGTGTGGACACTTATGAATGTCTTGACACTCGTTTATAAAGTTTATTATGGTAATGCTTTAGATCAAGCCATTTCCATGTGGGCTCTTATAATCTCTGTTACTTC", + "AAAAAEAEEAEEEEEEEEEEEEEEEEAEEEEAEEEEEEEEAEEEEEEEEEEEEEEEEE/EAEEEEEE/6EEEEEEEEEEAEEAEEE/EE/AEEAEEEEEAEEEA/EEAAEAE + { assert path(process.out.reads.get(0).get(1).get(0)).linesGzip.contains(read1_line) } + } + }, + { read2_lines.each { read2_line -> + { assert path(process.out.reads.get(0).get(1).get(1)).linesGzip.contains(read2_line) } + } + }, + { read_merged_lines.each { read_merged_line -> + { assert path(process.out.reads_merged.get(0).get(1)).linesGzip.contains(read_merged_line) } + } + }, + { html_text.each { html_part -> + { assert path(process.out.html.get(0).get(1)).getText().contains(html_part) } + } + }, + { json_text.each { json_part -> + { assert path(process.out.json.get(0).get(1)).getText().contains(json_part) } + } + }, + { log_text.each { log_part -> + { assert path(process.out.log.get(0).get(1)).getText().contains(log_part) } + } + }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } +} diff --git a/modules/nf-core/fastp/tests/main.nf.test.snap b/modules/nf-core/fastp/tests/main.nf.test.snap new file mode 100644 index 0000000000..0fa68c7d71 --- /dev/null +++ b/modules/nf-core/fastp/tests/main.nf.test.snap @@ -0,0 +1,52 @@ +{ + "fastp test_fastp_interleaved_json": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.fastp.json:md5,168f516f7bd4b7b6c32da7cba87299a4" + ] + ] + ], + "timestamp": "2023-10-17T11:04:45.794175881" + }, + "test_fastp_single_end_json": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.fastp.json:md5,c852d7a6dba5819e4ac8d9673bedcacc" + ] + ] + ], + "timestamp": "2023-10-17T11:04:10.566343705" + }, + "versions": { + "content": [ + [ + "versions.yml:md5,48ffc994212fb1fc9f83a74fa69c9f02" + ] + ], + "timestamp": "2023-10-17T11:04:10.582076024" + }, + "test_fastp_single_end_trim_fail_json": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.fastp.json:md5,9a7ee180f000e8d00c7fb67f06293eb5" + ] + ] + ], + "timestamp": "2023-10-17T11:05:00.379878948" + } +} \ No newline at end of file diff --git a/modules/nf-core/fastp/tests/nextflow.config b/modules/nf-core/fastp/tests/nextflow.config new file mode 100644 index 0000000000..0f7849ad96 --- /dev/null +++ b/modules/nf-core/fastp/tests/nextflow.config @@ -0,0 +1,6 @@ +process { + + withName: FASTP { + ext.args = "--interleaved_in" + } +} diff --git a/modules/nf-core/fastp/tests/tags.yml b/modules/nf-core/fastp/tests/tags.yml new file mode 100644 index 0000000000..c1afcce75f --- /dev/null +++ b/modules/nf-core/fastp/tests/tags.yml @@ -0,0 +1,2 @@ +fastp: + - modules/nf-core/fastp/** diff --git a/modules/nf-core/fgbio/callmolecularconsensusreads/environment.yml b/modules/nf-core/fgbio/callmolecularconsensusreads/environment.yml new file mode 100644 index 0000000000..1429e478ec --- /dev/null +++ b/modules/nf-core/fgbio/callmolecularconsensusreads/environment.yml @@ -0,0 +1,7 @@ +name: fgbio_callmolecularconsensusreads +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::fgbio=2.0.2 diff --git 
a/modules/nf-core/fgbio/callmolecularconsensusreads/main.nf b/modules/nf-core/fgbio/callmolecularconsensusreads/main.nf new file mode 100644 index 0000000000..e9f209ef16 --- /dev/null +++ b/modules/nf-core/fgbio/callmolecularconsensusreads/main.nf @@ -0,0 +1,37 @@ +process FGBIO_CALLMOLECULARCONSENSUSREADS { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/fgbio:2.0.2--hdfd78af_0' : + 'biocontainers/fgbio:2.0.2--hdfd78af_0' }" + + input: + tuple val(meta), path(bam) + + output: + tuple val(meta), path("*.bam"), emit: bam + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + fgbio \\ + --tmp-dir=. \\ + CallMolecularConsensusReads \\ + --input $bam \\ + --threads ${task.cpus} \\ + $args \\ + --output ${prefix}.bam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fgbio: \$( echo \$(fgbio --version 2>&1 | tr -d '[:cntrl:]' ) | sed -e 's/^.*Version: //;s/\\[.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/fgbio/callmolecularconsensusreads/meta.yml b/modules/nf-core/fgbio/callmolecularconsensusreads/meta.yml new file mode 100644 index 0000000000..f4a6ab1bb8 --- /dev/null +++ b/modules/nf-core/fgbio/callmolecularconsensusreads/meta.yml @@ -0,0 +1,43 @@ +name: fgbio_callmolecularconsensusreads +description: Calls consensus sequences from reads with the same unique molecular tag. +keywords: + - UMIs + - consensus sequence + - bam + - sam +tools: + - fgbio: + description: Tools for working with genomic and high throughput sequencing data. + homepage: https://github.com/fulcrumgenomics/fgbio + documentation: http://fulcrumgenomics.github.io/fgbio/ + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false, collapse:false ] + - bam: + type: file + description: | + The input SAM or BAM file. + pattern: "*.{bam,sam}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: | + Output SAM or BAM file to write consensus reads. + pattern: "*.{bam,sam}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@sruthipsuresh" +maintainers: + - "@sruthipsuresh" diff --git a/modules/nf-core/fgbio/fastqtobam/environment.yml b/modules/nf-core/fgbio/fastqtobam/environment.yml new file mode 100644 index 0000000000..f5f1992581 --- /dev/null +++ b/modules/nf-core/fgbio/fastqtobam/environment.yml @@ -0,0 +1,7 @@ +name: fgbio_fastqtobam +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::fgbio=2.0.2 diff --git a/modules/nf-core/fgbio/fastqtobam/main.nf b/modules/nf-core/fgbio/fastqtobam/main.nf new file mode 100644 index 0000000000..f7302171b8 --- /dev/null +++ b/modules/nf-core/fgbio/fastqtobam/main.nf @@ -0,0 +1,43 @@ +process FGBIO_FASTQTOBAM { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/fgbio:2.0.2--hdfd78af_0' : + 'biocontainers/fgbio:2.0.2--hdfd78af_0' }" + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("*.bam") , emit: bam , optional: true + tuple val(meta), path("*.cram"), emit: cram, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def sample_name = args.contains("--sample") ? "" : "--sample ${prefix}" + def library_name = args.contains("--library") ? "" : "--library ${prefix}" + def output = prefix =~ /\.(bam|cram)$/ ? prefix : "${prefix}.bam" + """ + + fgbio \\ + --tmp-dir=. \\ + FastqToBam \\ + ${args} \\ + --input ${reads} \\ + --output ${output} \\ + ${sample_name} \\ + ${library_name} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fgbio: \$( echo \$(fgbio --version 2>&1 | tr -d '[:cntrl:]' ) | sed -e 's/^.*Version: //;s/\\[.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/fgbio/fastqtobam/meta.yml b/modules/nf-core/fgbio/fastqtobam/meta.yml new file mode 100644 index 0000000000..4b37cd530f --- /dev/null +++ b/modules/nf-core/fgbio/fastqtobam/meta.yml @@ -0,0 +1,42 @@ +name: fgbio_fastqtobam +description: | + Converts FASTQ files into unaligned BAM or CRAM files using fgbio FastqToBam, optionally moving the UMI barcode into the RX tag of the reads +keywords: + - fastqtobam + - fgbio +tools: + - fgbio: + description: A set of tools for working with genomic and high throughput sequencing data, including UMIs + homepage: http://fulcrumgenomics.github.io/fgbio/ + documentation: http://fulcrumgenomics.github.io/fgbio/tools/latest/ + tool_dev_url: https://github.com/fulcrumgenomics/fgbio + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: pair of reads to be converted into BAM file + pattern: "*.{fastq.gz}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - bam: + type: file + description: Unaligned, unsorted BAM file + pattern: "*.{bam}" + - cram: + type: file + description: Unaligned, unsorted CRAM file + pattern: "*.{cram}" +authors: + - "@lescai" + - "@matthdsm" +maintainers: + - "@lescai" + - "@matthdsm"
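Taken together, the fgbio modules in this changeset form a UMI consensus path: FastqToBam writes an unaligned BAM with the UMI in the RX tag, GroupReadsByUmi (added just below) clusters aligned reads by UMI, and CallMolecularConsensusReads collapses each group into one consensus read. A rough wiring sketch follows; the subworkflow name, channel names, and the 'Adjacency' strategy are assumptions, and the alignment step that belongs between the first two calls is elided:

// Hypothetical subworkflow; not part of this diff.
include { FGBIO_FASTQTOBAM                  } from '../modules/nf-core/fgbio/fastqtobam/main'
include { FGBIO_GROUPREADSBYUMI             } from '../modules/nf-core/fgbio/groupreadsbyumi/main'
include { FGBIO_CALLMOLECULARCONSENSUSREADS } from '../modules/nf-core/fgbio/callmolecularconsensusreads/main'

workflow UMI_CONSENSUS {
    take:
    reads // channel: [ meta, [ fastq_1, fastq_2 ] ]

    main:
    FGBIO_FASTQTOBAM(reads) // FASTQ -> unaligned BAM, UMI moved into the RX tag

    // In a real pipeline the uBAM is aligned first: GroupReadsByUmi expects mapped
    // reads carrying the MQ tag on mates (see the note in its meta.yml below).
    aligned = FGBIO_FASTQTOBAM.out.bam

    FGBIO_GROUPREADSBYUMI(aligned, 'Adjacency') // strategy is one of Identity, Edit, Adjacency, Paired
    FGBIO_CALLMOLECULARCONSENSUSREADS(FGBIO_GROUPREADSBYUMI.out.bam)

    emit:
    consensus = FGBIO_CALLMOLECULARCONSENSUSREADS.out.bam
}

diff --git a/modules/nf-core/fgbio/groupreadsbyumi/environment.yml b/modules/nf-core/fgbio/groupreadsbyumi/environment.yml new file mode 100644 index 0000000000..58e37bf6bd --- /dev/null +++ b/modules/nf-core/fgbio/groupreadsbyumi/environment.yml @@ -0,0 +1,7 @@ +name: fgbio_groupreadsbyumi +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::fgbio=2.0.2 diff --git a/modules/nf-core/fgbio/groupreadsbyumi/main.nf b/modules/nf-core/fgbio/groupreadsbyumi/main.nf new file mode 100644 index 0000000000..7179290c91 --- /dev/null +++ b/modules/nf-core/fgbio/groupreadsbyumi/main.nf @@ -0,0 +1,42 @@ +process FGBIO_GROUPREADSBYUMI { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?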
+ 'https://depot.galaxyproject.org/singularity/fgbio:2.0.2--hdfd78af_0' : + 'biocontainers/fgbio:2.0.2--hdfd78af_0' }" + + input: + tuple val(meta), path(taggedbam) + val(strategy) + + output: + tuple val(meta), path("*_umi-grouped.bam") , emit: bam + tuple val(meta), path("*_umi_histogram.txt"), emit: histogram + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + + fgbio \\ + --tmp-dir=. \\ + GroupReadsByUmi \\ + -s $strategy \\ + $args \\ + -i $taggedbam \\ + -o ${prefix}_umi-grouped.bam \\ + -f ${prefix}_umi_histogram.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fgbio: \$( echo \$(fgbio --version 2>&1 | tr -d '[:cntrl:]' ) | sed -e 's/^.*Version: //;s/\\[.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/fgbio/groupreadsbyumi/meta.yml b/modules/nf-core/fgbio/groupreadsbyumi/meta.yml new file mode 100644 index 0000000000..02ca91f19f --- /dev/null +++ b/modules/nf-core/fgbio/groupreadsbyumi/meta.yml @@ -0,0 +1,57 @@ +name: fgbio_groupreadsbyumi +description: | + Groups reads together that appear to have come from the same original molecule. + Reads are grouped by template, and then templates are sorted by the 5’ mapping positions + of the reads from the template, used from earliest mapping position to latest. + Reads that have the same end positions are then sub-grouped by UMI sequence. + (!) Note: the MQ tag is required on reads with mapped mates (!) + This can be added using samblaster with the optional argument --addMateTags. +keywords: + - UMI + - groupreads + - fgbio +tools: + - fgbio: + description: A set of tools for working with genomic and high throughput sequencing data, including UMIs + homepage: http://fulcrumgenomics.github.io/fgbio/ + documentation: http://fulcrumgenomics.github.io/fgbio/tools/latest/ + tool_dev_url: https://github.com/fulcrumgenomics/fgbio + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: | + BAM file. Note: the MQ tag is required on reads with mapped mates (!) + pattern: "*.bam" + - strategy: + type: value + description: | + Required argument: defines the UMI assignment strategy. + Must be one of: Identity, Edit, Adjacency, Paired. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g.
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - bam: + type: file + description: UMI-grouped BAM + pattern: "*.bam" + - histogram: + type: file + description: A text file containing the tag family size counts + pattern: "*.txt" +authors: + - "@lescai" +maintainers: + - "@lescai" diff --git a/modules/nf-core/freebayes/environment.yml b/modules/nf-core/freebayes/environment.yml new file mode 100644 index 0000000000..6846080a2f --- /dev/null +++ b/modules/nf-core/freebayes/environment.yml @@ -0,0 +1,7 @@ +name: freebayes +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::freebayes=1.3.6 diff --git a/modules/nf-core/freebayes/main.nf b/modules/nf-core/freebayes/main.nf new file mode 100644 index 0000000000..8a1c641ded --- /dev/null +++ b/modules/nf-core/freebayes/main.nf @@ -0,0 +1,51 @@ +process FREEBAYES { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/freebayes:1.3.6--hbfe0e7f_2' : + 'biocontainers/freebayes:1.3.6--hbfe0e7f_2' }" + + input: + tuple val(meta), path(input_1), path(input_1_index), path(input_2), path(input_2_index), path(target_bed) + path fasta + path fasta_fai + path samples + path populations + path cnv + + output: + tuple val(meta), path("*.vcf.gz"), emit: vcf + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def input = input_2 ? "${input_1} ${input_2}" : "${input_1}" + def targets_file = target_bed ? "--target ${target_bed}" : "" + def samples_file = samples ? "--samples ${samples}" : "" + def populations_file = populations ? "--populations ${populations}" : "" + def cnv_file = cnv ? "--cnv-map ${cnv}" : "" + + """ + freebayes \\ + -f $fasta \\ + $targets_file \\ + $samples_file \\ + $populations_file \\ + $cnv_file \\ + $args \\ + $input > ${prefix}.vcf + + bgzip ${prefix}.vcf + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + freebayes: \$(echo \$(freebayes --version 2>&1) | sed 's/version:\s*v//g' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/freebayes/meta.yml b/modules/nf-core/freebayes/meta.yml new file mode 100644 index 0000000000..e2cf1a175c --- /dev/null +++ b/modules/nf-core/freebayes/meta.yml @@ -0,0 +1,82 @@ +name: freebayes +description: A haplotype-based variant detector +keywords: + - variant caller + - SNP + - genotyping + - somatic variant calling + - germline variant calling + - bacterial variant calling + - bayesian +tools: + - freebayes: + description: Bayesian haplotype-based polymorphism discovery and genotyping + homepage: https://github.com/freebayes/freebayes + documentation: https://github.com/freebayes/freebayes + tool_dev_url: https://github.com/freebayes/freebayes + doi: "10.48550/arXiv.1207.3907" + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - input_index: + type: file + description: BAM/CRAM/SAM index file + pattern: "*.{bai,crai}" + - target_bed: + type: file + description: Optional - Limit analysis to targets listed in this BED-format FILE. 
+ pattern: "*.bed" + - fasta: + type: file + description: reference fasta file + pattern: "*.{fa,fa.gz,fasta,fasta.gz}" + - fasta_fai: + type: file + description: reference fasta file index + pattern: "*.{fa,fasta}.fai" + - samples: + type: file + description: Optional - Limit analysis to samples listed (one per line) in the FILE. + pattern: "*.txt" + - populations: + type: file + description: Optional - Each line of FILE should list a sample and a population which it is part of. + pattern: "*.txt" + - cnv: + type: file + description: | + A copy number map BED file, which has either a sample-level ploidy: + sample_name copy_number + or a region-specific format: + seq_name start end sample_name copy_number + pattern: "*.bed" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - vcf: + type: file + description: Compressed VCF file + pattern: "*.vcf.gz" +authors: + - "@maxibor" + - "@FriederikeHanssen" + - "@maxulysse" +maintainers: + - "@maxibor" + - "@FriederikeHanssen" + - "@maxulysse"
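Because the FREEBAYES input tuple bundles six path slots (two possible sample inputs with their indices plus an optional BED), a minimal invocation sketch may help; the file names and the single-sample setup are assumptions, not content from this diff:

// Hypothetical single-sample call; empty lists mark unused optional slots.
include { FREEBAYES } from '../modules/nf-core/freebayes/main'

workflow {
    ch_input = Channel.of([
        [ id:'sample1' ],                                // meta map
        file('sample1.cram'), file('sample1.cram.crai'), // input_1 + input_1_index
        [], [],                                          // input_2 + input_2_index, empty when not pairing samples
        []                                               // target_bed, empty for whole-genome calling
    ])
    // samples / populations / cnv are optional and left empty here
    FREEBAYES(ch_input, file('genome.fasta'), file('genome.fasta.fai'), [], [], [])
}

diff --git a/modules/nf-core/gatk4/applybqsr/environment.yml b/modules/nf-core/gatk4/applybqsr/environment.yml new file mode 100644 index 0000000000..a690099123 --- /dev/null +++ b/modules/nf-core/gatk4/applybqsr/environment.yml @@ -0,0 +1,7 @@ +name: gatk4_applybqsr +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::gatk4=4.4.0.0 diff --git a/modules/nf-core/gatk4/applybqsr/main.nf b/modules/nf-core/gatk4/applybqsr/main.nf new file mode 100644 index 0000000000..7e49563739 --- /dev/null +++ b/modules/nf-core/gatk4/applybqsr/main.nf @@ -0,0 +1,51 @@ +process GATK4_APPLYBQSR { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gatk4:4.4.0.0--py36hdfd78af_0': + 'biocontainers/gatk4:4.4.0.0--py36hdfd78af_0' }" + + input: + tuple val(meta), path(input), path(input_index), path(bqsr_table), path(intervals) + path fasta + path fai + path dict + + output: + tuple val(meta), path("*.bam") , emit: bam, optional: true + tuple val(meta), path("*.cram"), emit: cram, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def interval_command = intervals ? "--intervals $intervals" : "" + + def avail_mem = 3072 + if (!task.memory) { + log.info '[GATK ApplyBQSR] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' + } else { + avail_mem = (task.memory.mega*0.8).intValue() + } + """ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + ApplyBQSR \\ + --input $input \\ + --output ${prefix}.${input.getExtension()} \\ + --reference $fasta \\ + --bqsr-recal-file $bqsr_table \\ + $interval_command \\ + --tmp-dir .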
\\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/gatk4/applybqsr/meta.yml b/modules/nf-core/gatk4/applybqsr/meta.yml new file mode 100644 index 0000000000..ab9efea3f4 --- /dev/null +++ b/modules/nf-core/gatk4/applybqsr/meta.yml @@ -0,0 +1,74 @@ +name: gatk4_applybqsr +description: Apply base quality score recalibration (BQSR) to a bam file +keywords: + - bam + - base quality score recalibration + - bqsr + - cram + - gatk4 +tools: + - gatk4: + description: | + Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools + with a primary focus on variant discovery and genotyping. Its powerful processing engine + and high-performance computing features make it capable of taking on projects of any size. + homepage: https://gatk.broadinstitute.org/hc/en-us + documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672s + doi: 10.1158/1538-7445.AM2017-3590 + licence: ["Apache-2.0"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM file from alignment + pattern: "*.{bam,cram}" + - input_index: + type: file + description: BAI/CRAI file from alignment + pattern: "*.{bai,crai}" + - bqsr_table: + type: file + description: Recalibration table from gatk4_baserecalibrator + - intervals: + type: file + description: Bed file with the genomic regions included in the library (optional) + - fasta: + type: file + description: The reference fasta file + pattern: "*.fasta" + - fai: + type: file + description: Index of reference fasta file + pattern: "*.fasta.fai" + - dict: + type: file + description: GATK sequence dictionary + pattern: "*.dict" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - bam: + type: file + description: Recalibrated BAM file + pattern: "*.{bam}" + - cram: + type: file + description: Recalibrated CRAM file + pattern: "*.{cram}" +authors: + - "@yocra3" + - "@FriederikeHanssen" +maintainers: + - "@yocra3" + - "@FriederikeHanssen" diff --git a/modules/nf-core/gatk4/applyvqsr/environment.yml b/modules/nf-core/gatk4/applyvqsr/environment.yml new file mode 100644 index 0000000000..e640768957 --- /dev/null +++ b/modules/nf-core/gatk4/applyvqsr/environment.yml @@ -0,0 +1,7 @@ +name: gatk4_applyvqsr +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::gatk4=4.4.0.0 diff --git a/modules/nf-core/gatk4/applyvqsr/main.nf b/modules/nf-core/gatk4/applyvqsr/main.nf new file mode 100644 index 0000000000..21afe9a528 --- /dev/null +++ b/modules/nf-core/gatk4/applyvqsr/main.nf @@ -0,0 +1,63 @@ +process GATK4_APPLYVQSR { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/gatk4:4.4.0.0--py36hdfd78af_0': + 'biocontainers/gatk4:4.4.0.0--py36hdfd78af_0' }" + + input: + tuple val(meta), path(vcf), path(vcf_tbi), path(recal), path(recal_index), path(tranches) + path fasta + path fai + path dict + + output: + tuple val(meta), path("*.vcf.gz"), emit: vcf + tuple val(meta), path("*.tbi") , emit: tbi + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def reference_command = fasta ? "--reference $fasta" : '' + + def avail_mem = 3072 + if (!task.memory) { + log.info '[GATK ApplyVQSR] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' + } else { + avail_mem = (task.memory.mega*0.8).intValue() + } + """ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + ApplyVQSR \\ + --variant ${vcf} \\ + --output ${prefix}.vcf.gz \\ + $reference_command \\ + --tranches-file $tranches \\ + --recal-file $recal \\ + --tmp-dir . \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.vcf.gz + touch ${prefix}.vcf.gz.tbi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/gatk4/applyvqsr/meta.yml b/modules/nf-core/gatk4/applyvqsr/meta.yml new file mode 100644 index 0000000000..de5d6d067a --- /dev/null +++ b/modules/nf-core/gatk4/applyvqsr/meta.yml @@ -0,0 +1,76 @@ +name: gatk4_applyvqsr +description: | + Apply a score cutoff to filter variants based on a recalibration table. + ApplyVQSR performs the second pass in a two-stage process called Variant Quality Score Recalibration (VQSR). + Specifically, it applies filtering to the input variants based on the recalibration table produced + in the first step by VariantRecalibrator and a target sensitivity value. +keywords: + - gatk4 + - variant quality score recalibration + - vcf + - vqsr +tools: + - gatk4: + description: | + Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools + with a primary focus on variant discovery and genotyping. Its powerful processing engine + and high-performance computing features make it capable of taking on projects of any size. + homepage: https://gatk.broadinstitute.org/hc/en-us + documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672s + doi: 10.1158/1538-7445.AM2017-3590 + licence: ["Apache-2.0"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test'] + - vcf: + type: file + description: VCF file to be recalibrated; this should be the same file as used for the first stage VariantRecalibrator. + pattern: "*.vcf" + - vcf_tbi: + type: file + description: tabix index for the input vcf file. + pattern: "*.vcf.tbi" + - recal: + type: file + description: Recalibration file produced when the input vcf was run through VariantRecalibrator in stage 1. + pattern: "*.recal" + - recal_index: + type: file + description: Index file for the recalibration file. + pattern: "*.recal.idx" + - tranches: + type: file + description: Tranches file produced when the input vcf was run through VariantRecalibrator in stage 1.
+ pattern: ".tranches" + - fasta: + type: file + description: The reference fasta file + pattern: "*.fasta" + - fai: + type: file + description: Index of reference fasta file + pattern: "*.fasta.fai" + - dict: + type: file + description: GATK sequence dictionary + pattern: "*.dict" +output: + - vcf: + type: file + description: compressed vcf file containing the recalibrated variants. + pattern: "*.vcf.gz" + - tbi: + type: file + description: Index of recalibrated vcf file. + pattern: "*vcf.gz.tbi" + - versions: + type: file + description: File containing software versions. + pattern: "versions.yml" +authors: + - "@GCJMackenzie" +maintainers: + - "@GCJMackenzie" diff --git a/modules/nf-core/gatk4/baserecalibrator/environment.yml b/modules/nf-core/gatk4/baserecalibrator/environment.yml new file mode 100644 index 0000000000..6863fb1712 --- /dev/null +++ b/modules/nf-core/gatk4/baserecalibrator/environment.yml @@ -0,0 +1,7 @@ +name: gatk4_baserecalibrator +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::gatk4=4.4.0.0 diff --git a/modules/nf-core/gatk4/baserecalibrator/main.nf b/modules/nf-core/gatk4/baserecalibrator/main.nf new file mode 100644 index 0000000000..e893b65036 --- /dev/null +++ b/modules/nf-core/gatk4/baserecalibrator/main.nf @@ -0,0 +1,53 @@ +process GATK4_BASERECALIBRATOR { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gatk4:4.4.0.0--py36hdfd78af_0': + 'biocontainers/gatk4:4.4.0.0--py36hdfd78af_0' }" + + input: + tuple val(meta), path(input), path(input_index), path(intervals) + path fasta + path fai + path dict + path known_sites + path known_sites_tbi + + output: + tuple val(meta), path("*.table"), emit: table + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def interval_command = intervals ? "--intervals $intervals" : "" + def sites_command = known_sites.collect{"--known-sites $it"}.join(' ') + + def avail_mem = 3072 + if (!task.memory) { + log.info '[GATK BaseRecalibrator] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' + } else { + avail_mem = (task.memory.mega*0.8).intValue() + } + """ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + BaseRecalibrator \\ + --input $input \\ + --output ${prefix}.table \\ + --reference $fasta \\ + $interval_command \\ + $sites_command \\ + --tmp-dir . \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/gatk4/baserecalibrator/meta.yml b/modules/nf-core/gatk4/baserecalibrator/meta.yml new file mode 100644 index 0000000000..8252b8c290 --- /dev/null +++ b/modules/nf-core/gatk4/baserecalibrator/meta.yml @@ -0,0 +1,77 @@ +name: gatk4_baserecalibrator +description: Generate recalibration table for Base Quality Score Recalibration (BQSR) +keywords: + - base quality score recalibration + - table + - bqsr + - gatk4 + - sort +tools: + - gatk4: + description: | + Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools + with a primary focus on variant discovery and genotyping. 
Its powerful processing engine + and high-performance computing features make it capable of taking on projects of any size. + homepage: https://gatk.broadinstitute.org/hc/en-us + documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672s + doi: 10.1158/1538-7445.AM2017-3590 + licence: ["Apache-2.0"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM file from alignment + pattern: "*.{bam,cram}" + - input_index: + type: file + description: BAI/CRAI file from alignment + pattern: "*.{bai,crai}" + - intervals: + type: file + description: Bed file with the genomic regions included in the library (optional) + - fasta: + type: file + description: The reference fasta file + pattern: "*.fasta" + - fai: + type: file + description: Index of reference fasta file + pattern: "*.fasta.fai" + - dict: + type: file + description: GATK sequence dictionary + pattern: "*.dict" + - known_sites: + type: file + description: VCF files with known sites for indels / snps (optional) + pattern: "*.vcf.gz" + - known_sites_tbi: + type: file + description: Tabix index of the known_sites (optional) + pattern: "*.vcf.gz.tbi" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - table: + type: file + description: Recalibration table from BaseRecalibrator + pattern: "*.{table}" +authors: + - "@yocra3" + - "@FriederikeHanssen" + - "@maxulysse" +maintainers: + - "@yocra3" + - "@FriederikeHanssen" + - "@maxulysse"
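BaseRecalibrator and ApplyBQSR are meant to be chained: the table emitted here is the bqsr_table consumed by GATK4_APPLYBQSR above. A sketch of that handoff under assumed file names; the meta-keyed join is one conventional way to reunite per-sample outputs, not the pipeline's literal code:

// Hypothetical BQSR wiring; not part of this diff.
include { GATK4_BASERECALIBRATOR } from '../modules/nf-core/gatk4/baserecalibrator/main'
include { GATK4_APPLYBQSR        } from '../modules/nf-core/gatk4/applybqsr/main'

workflow {
    fasta = file('genome.fasta')
    fai   = file('genome.fasta.fai')
    dict  = file('genome.dict')

    ch_cram = Channel.of([ [ id:'sample1' ], file('sample1.cram'), file('sample1.cram.crai'), [] ]) // [] = no intervals

    GATK4_BASERECALIBRATOR(ch_cram, fasta, fai, dict, file('known.vcf.gz'), file('known.vcf.gz.tbi'))

    // join the recalibration table back onto the reads by meta, then recalibrate
    ch_recal = ch_cram
        .join(GATK4_BASERECALIBRATOR.out.table)
        .map { meta, cram, crai, intervals, table -> [ meta, cram, crai, table, intervals ] }

    GATK4_APPLYBQSR(ch_recal, fasta, fai, dict)
}

diff --git a/modules/nf-core/gatk4/calculatecontamination/environment.yml b/modules/nf-core/gatk4/calculatecontamination/environment.yml new file mode 100644 index 0000000000..d5e45ebe42 --- /dev/null +++ b/modules/nf-core/gatk4/calculatecontamination/environment.yml @@ -0,0 +1,7 @@ +name: gatk4_calculatecontamination +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::gatk4=4.4.0.0 diff --git a/modules/nf-core/gatk4/calculatecontamination/main.nf b/modules/nf-core/gatk4/calculatecontamination/main.nf new file mode 100644 index 0000000000..8d43c4ee6b --- /dev/null +++ b/modules/nf-core/gatk4/calculatecontamination/main.nf @@ -0,0 +1,46 @@ +process GATK4_CALCULATECONTAMINATION { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gatk4:4.4.0.0--py36hdfd78af_0': + 'biocontainers/gatk4:4.4.0.0--py36hdfd78af_0' }" + + input: + tuple val(meta), path(pileup), path(matched) + + output: + tuple val(meta), path('*.contamination.table'), emit: contamination + tuple val(meta), path('*.segmentation.table') , emit: segmentation, optional:true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def matched_command = matched ? "--matched-normal $matched" : '' + + def avail_mem = 3072 + if (!task.memory) { + log.info '[GATK CalculateContamination] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.'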
+ } else { + avail_mem = (task.memory.mega*0.8).intValue() + } + """ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + CalculateContamination \\ + --input $pileup \\ + --output ${prefix}.contamination.table \\ + $matched_command \\ + --tmp-dir . \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/gatk4/calculatecontamination/meta.yml b/modules/nf-core/gatk4/calculatecontamination/meta.yml new file mode 100644 index 0000000000..b0ffe814c5 --- /dev/null +++ b/modules/nf-core/gatk4/calculatecontamination/meta.yml @@ -0,0 +1,52 @@ +name: gatk4_calculatecontamination +description: | + Calculates the fraction of reads from cross-sample contamination based on summary tables from getpileupsummaries. Output to be used with filtermutectcalls. +keywords: + - gatk4 + - calculatecontamination + - cross-samplecontamination + - getpileupsummaries + - filtermutectcalls +tools: + - gatk4: + description: | + Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools + with a primary focus on variant discovery and genotyping. Its powerful processing engine + and high-performance computing features make it capable of taking on projects of any size. + homepage: https://gatk.broadinstitute.org/hc/en-us + documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672s + doi: 10.1158/1538-7445.AM2017-3590 + licence: ["Apache-2.0"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - pileup: + type: file + description: File containing the pileups summary table of a tumor sample to be used to calculate contamination. + pattern: "*.pileups.table" + - matched: + type: file + description: File containing the pileups summary table of a normal sample that matches with the tumor sample specified in pileup argument. This is an optional input. + pattern: "*.pileups.table" +output: + - contamination: + type: file + description: File containing the contamination table. 
+ pattern: "*.contamination.table" + - segmentation: + type: file + description: output table containing segmentation of tumor minor allele fractions (optional) + pattern: "*.segmentation.table" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@GCJMackenzie" + - "@maxulysse" +maintainers: + - "@GCJMackenzie" + - "@maxulysse" diff --git a/modules/nf-core/gatk4/cnnscorevariants/environment.yml b/modules/nf-core/gatk4/cnnscorevariants/environment.yml new file mode 100644 index 0000000000..12cc34ba66 --- /dev/null +++ b/modules/nf-core/gatk4/cnnscorevariants/environment.yml @@ -0,0 +1,5 @@ +name: gatk4_cnnscorevariants +channels: + - conda-forge + - bioconda + - defaults diff --git a/modules/nf-core/gatk4/cnnscorevariants/main.nf b/modules/nf-core/gatk4/cnnscorevariants/main.nf new file mode 100644 index 0000000000..71efe9b12d --- /dev/null +++ b/modules/nf-core/gatk4/cnnscorevariants/main.nf @@ -0,0 +1,60 @@ +process GATK4_CNNSCOREVARIANTS { + tag "$meta.id" + label 'process_low' + + //Conda is not supported at the moment: https://github.com/broadinstitute/gatk/issues/7811 + container "nf-core/gatk:4.4.0.0" //Biocontainers is missing a package + + input: + tuple val(meta), path(vcf), path(tbi), path(aligned_input), path(intervals) + path fasta + path fai + path dict + path architecture + path weights + + output: + tuple val(meta), path("*cnn.vcf.gz") , emit: vcf + tuple val(meta), path("*cnn.vcf.gz.tbi"), emit: tbi + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error "GATK4_CNNSCOREVARIANTS module does not support Conda. Please use Docker / Singularity / Podman instead." + } + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def aligned_input = aligned_input ? "--input $aligned_input" : "" + def interval_command = intervals ? "--intervals $intervals" : "" + def architecture = architecture ? "--architecture $architecture" : "" + def weights = weights ? "--weights $weights" : "" + + def avail_mem = 3072 + if (!task.memory) { + log.info '[GATK CnnScoreVariants] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' + } else { + avail_mem = (task.memory.mega*0.8).intValue() + } + """ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + CNNScoreVariants \\ + --variant $vcf \\ + --output ${prefix}.cnn.vcf.gz \\ + --reference $fasta \\ + $interval_command \\ + $aligned_input \\ + $architecture \\ + $weights \\ + --tmp-dir . \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/gatk4/cnnscorevariants/meta.yml b/modules/nf-core/gatk4/cnnscorevariants/meta.yml new file mode 100644 index 0000000000..8a9d0f51c2 --- /dev/null +++ b/modules/nf-core/gatk4/cnnscorevariants/meta.yml @@ -0,0 +1,79 @@ +name: "gatk4_cnnscorevariants" +description: Apply a Convolutional Neural Net to filter annotated variants +keywords: + - cnnscorevariants + - gatk4 + - variants +tools: + - gatk4: + description: | + Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools + with a primary focus on variant discovery and genotyping. 
Its powerful processing engine + and high-performance computing features make it capable of taking on projects of any size. + homepage: https://gatk.broadinstitute.org/hc/en-us + documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672s + doi: 10.1158/1538-7445.AM2017-3590 + licence: ["Apache-2.0"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - vcf: + type: file + description: VCF file + pattern: "*.vcf.gz" + - tbi: + type: file + description: VCF index file + pattern: "*.vcf.gz.tbi" + - aligned_input: + type: file + description: BAM/CRAM file from alignment (optional) + pattern: "*.{bam,cram}" + - intervals: + type: file + description: Bed file with the genomic regions included in the library (optional) + - fasta: + type: file + description: The reference fasta file + pattern: "*.fasta" + - fai: + type: file + description: Index of reference fasta file + pattern: "*.fasta.fai" + - dict: + type: file + description: GATK sequence dictionary + pattern: "*.dict" + - architecture: + type: file + description: Neural Net architecture configuration json file (optional) + pattern: "*.json" + - weights: + type: file + description: Keras model HD5 file with neural net weights. (optional) + pattern: "*.hd5" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - vcf: + type: file + description: Annotated VCF file + pattern: "*.vcf.gz" + - tbi: + type: file + description: VCF index file + pattern: "*.vcf.gz.tbi" +authors: + - "@FriederikeHanssen" +maintainers: + - "@FriederikeHanssen"
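CNNScoreVariants is the one container-only module in this set (no Conda environment), and its scored VCF is what FilterVariantTranches, added further down in this diff, consumes. A minimal invocation sketch with assumed file names:

// Hypothetical call; not part of this diff.
include { GATK4_CNNSCOREVARIANTS } from '../modules/nf-core/gatk4/cnnscorevariants/main'

workflow {
    ch_vcf = Channel.of([
        [ id:'sample1' ],
        file('sample1.vcf.gz'), file('sample1.vcf.gz.tbi'),
        [], // aligned_input: supplying reads enables the 2D scoring model
        []  // intervals (optional)
    ])
    // empty architecture/weights fall back to the default bundled model
    GATK4_CNNSCOREVARIANTS(ch_vcf, file('genome.fasta'), file('genome.fasta.fai'), file('genome.dict'), [], [])
}

diff --git a/modules/nf-core/gatk4/createsequencedictionary/environment.yml b/modules/nf-core/gatk4/createsequencedictionary/environment.yml new file mode 100644 index 0000000000..db663e148f --- /dev/null +++ b/modules/nf-core/gatk4/createsequencedictionary/environment.yml @@ -0,0 +1,7 @@ +name: gatk4_createsequencedictionary +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::gatk4=4.4.0.0 diff --git a/modules/nf-core/gatk4/createsequencedictionary/main.nf b/modules/nf-core/gatk4/createsequencedictionary/main.nf new file mode 100644 index 0000000000..b47ad16221 --- /dev/null +++ b/modules/nf-core/gatk4/createsequencedictionary/main.nf @@ -0,0 +1,52 @@ +process GATK4_CREATESEQUENCEDICTIONARY { + tag "$fasta" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gatk4:4.4.0.0--py36hdfd78af_0': + 'biocontainers/gatk4:4.4.0.0--py36hdfd78af_0' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path('*.dict') , emit: dict + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + + def avail_mem = 6144 + if (!task.memory) { + log.info '[GATK CreateSequenceDictionary] Available memory not known - defaulting to 6GB. Specify process memory requirements to change this.' + } else { + avail_mem = (task.memory.mega*0.8).intValue() + } + """ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + CreateSequenceDictionary \\ + --REFERENCE $fasta \\ + --URI $fasta \\ + --TMP_DIR .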
\\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + END_VERSIONS + """ + + stub: + """ + touch ${fasta.baseName}.dict + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/gatk4/createsequencedictionary/meta.yml b/modules/nf-core/gatk4/createsequencedictionary/meta.yml new file mode 100644 index 0000000000..f9d70be098 --- /dev/null +++ b/modules/nf-core/gatk4/createsequencedictionary/meta.yml @@ -0,0 +1,42 @@ +name: gatk4_createsequencedictionary +description: Creates a sequence dictionary for a reference sequence +keywords: + - createsequencedictionary + - dictionary + - fasta + - gatk4 +tools: + - gatk: + description: | + Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools + with a primary focus on variant discovery and genotyping. Its powerful processing engine + and high-performance computing features make it capable of taking on projects of any size. + homepage: https://gatk.broadinstitute.org/hc/en-us + documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672s + doi: 10.1158/1538-7445.AM2017-3590 + licence: ["Apache-2.0"] +input: + - meta: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fasta: + type: file + description: Input fasta file + pattern: "*.{fasta,fa}" +output: + - dict: + type: file + description: gatk dictionary file + pattern: "*.{dict}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@maxulysse" + - "@ramprasadn" +maintainers: + - "@maxulysse" + - "@ramprasadn"
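Unlike most gatk4 modules in this diff, CreateSequenceDictionary takes its reference through a meta tuple; a one-channel usage sketch (the meta id is an assumption):

// Hypothetical call; not part of this diff.
include { GATK4_CREATESEQUENCEDICTIONARY } from '../modules/nf-core/gatk4/createsequencedictionary/main'

workflow {
    ch_fasta = Channel.of([ [ id:'genome' ], file('genome.fasta') ]) // [ meta, fasta ]
    GATK4_CREATESEQUENCEDICTIONARY(ch_fasta)
    GATK4_CREATESEQUENCEDICTIONARY.out.dict.view() // -> [ [id:'genome'], genome.dict ]
}

diff --git a/modules/nf-core/gatk4/estimatelibrarycomplexity/environment.yml b/modules/nf-core/gatk4/estimatelibrarycomplexity/environment.yml new file mode 100644 index 0000000000..fabb6f2ba4 --- /dev/null +++ b/modules/nf-core/gatk4/estimatelibrarycomplexity/environment.yml @@ -0,0 +1,7 @@ +name: gatk4_estimatelibrarycomplexity +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::gatk4=4.4.0.0 diff --git a/modules/nf-core/gatk4/estimatelibrarycomplexity/main.nf b/modules/nf-core/gatk4/estimatelibrarycomplexity/main.nf new file mode 100644 index 0000000000..c0eef7b327 --- /dev/null +++ b/modules/nf-core/gatk4/estimatelibrarycomplexity/main.nf @@ -0,0 +1,48 @@ +process GATK4_ESTIMATELIBRARYCOMPLEXITY { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gatk4:4.4.0.0--py36hdfd78af_0': + 'biocontainers/gatk4:4.4.0.0--py36hdfd78af_0' }" + + input: + tuple val(meta), path(input) + path fasta + path fai + path dict + + output: + tuple val(meta), path('*.metrics'), emit: metrics + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def input_list = input.collect(){"--INPUT $it"}.join(" ") + + def avail_mem = 3072 + if (!task.memory) { + log.info '[GATK EstimateLibraryComplexity] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.'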
+ } else { + avail_mem = (task.memory.mega*0.8).intValue() + } + """ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + EstimateLibraryComplexity \\ + $input_list \\ + --OUTPUT ${prefix}.metrics \\ + --REFERENCE_SEQUENCE ${fasta} \\ + --TMP_DIR . \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/gatk4/estimatelibrarycomplexity/meta.yml b/modules/nf-core/gatk4/estimatelibrarycomplexity/meta.yml new file mode 100644 index 0000000000..2d5bddf6c9 --- /dev/null +++ b/modules/nf-core/gatk4/estimatelibrarycomplexity/meta.yml @@ -0,0 +1,57 @@ +name: gatk4_estimatelibrarycomplexity +description: Estimates the numbers of unique molecules in a sequencing library. +keywords: + - duplication metrics + - estimatelibrarycomplexity + - gatk4 + - reporting +tools: + - gatk4: + description: Genome Analysis Toolkit (GATK4) + homepage: https://gatk.broadinstitute.org/hc/en-us + documentation: https://gatk.broadinstitute.org/hc/en-us + tool_dev_url: https://github.com/broadinstitute/gatk + doi: "10.1158/1538-7445.AM2017-3590" + licence: ["Apache-2.0"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - fasta: + type: file + description: The reference fasta file + pattern: "*.fasta" + - fai: + type: file + description: Index of reference fasta file + pattern: "fasta.fai" + - dict: + type: file + description: GATK sequence dictionary + pattern: "*.dict" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - metrics: + type: file + description: File containing metrics on the input files + pattern: "*.{metrics}" +authors: + - "@FriederikeHanssen" + - "@maxulysse" +maintainers: + - "@FriederikeHanssen" + - "@maxulysse" diff --git a/modules/nf-core/gatk4/filtermutectcalls/environment.yml b/modules/nf-core/gatk4/filtermutectcalls/environment.yml new file mode 100644 index 0000000000..8057d765d5 --- /dev/null +++ b/modules/nf-core/gatk4/filtermutectcalls/environment.yml @@ -0,0 +1,7 @@ +name: gatk4_filtermutectcalls +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::gatk4=4.4.0.0 diff --git a/modules/nf-core/gatk4/filtermutectcalls/main.nf b/modules/nf-core/gatk4/filtermutectcalls/main.nf new file mode 100644 index 0000000000..fa6b46ab3c --- /dev/null +++ b/modules/nf-core/gatk4/filtermutectcalls/main.nf @@ -0,0 +1,71 @@ +process GATK4_FILTERMUTECTCALLS { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/gatk4:4.4.0.0--py36hdfd78af_0': + 'biocontainers/gatk4:4.4.0.0--py36hdfd78af_0' }" + + input: + tuple val(meta), path(vcf), path(vcf_tbi), path(stats), path(orientationbias), path(segmentation), path(table), val(estimate) + tuple val(meta2), path(fasta) + tuple val(meta3), path(fai) + tuple val(meta4), path(dict) + + output: + tuple val(meta), path("*.vcf.gz") , emit: vcf + tuple val(meta), path("*.vcf.gz.tbi") , emit: tbi + tuple val(meta), path("*.filteringStats.tsv"), emit: stats + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + def orientationbias_command = orientationbias ? orientationbias.collect{"--orientation-bias-artifact-priors $it"}.join(' ') : '' + def segmentation_command = segmentation ? segmentation.collect{"--tumor-segmentation $it"}.join(' ') : '' + def estimate_command = estimate ? " --contamination-estimate ${estimate} " : '' + def table_command = table ? table.collect{"--contamination-table $it"}.join(' ') : '' + + def avail_mem = 3072 + if (!task.memory) { + log.info '[GATK FilterMutectCalls] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' + } else { + avail_mem = (task.memory.mega*0.8).intValue() + } + """ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + FilterMutectCalls \\ + --variant $vcf \\ + --output ${prefix}.vcf.gz \\ + --reference $fasta \\ + $orientationbias_command \\ + $segmentation_command \\ + $estimate_command \\ + $table_command \\ + --tmp-dir . \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.vcf.gz + touch ${prefix}.vcf.gz.tbi + touch ${prefix}.vcf.gz.filteringStats.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/gatk4/filtermutectcalls/meta.yml b/modules/nf-core/gatk4/filtermutectcalls/meta.yml new file mode 100644 index 0000000000..736c838625 --- /dev/null +++ b/modules/nf-core/gatk4/filtermutectcalls/meta.yml @@ -0,0 +1,103 @@ +name: gatk4_filtermutectcalls +description: | + Filters the raw output of mutect2, can optionally use outputs of calculatecontamination and learnreadorientationmodel to improve filtering. +keywords: + - filtermutectcalls + - filter + - gatk4 + - mutect2 + - vcf +tools: + - gatk4: + description: | + Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools + with a primary focus on variant discovery and genotyping. Its powerful processing engine + and high-performance computing features make it capable of taking on projects of any size. + homepage: https://gatk.broadinstitute.org/hc/en-us + documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672s + doi: 10.1158/1538-7445.AM2017-3590 +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test' ] + - vcf: + type: file + description: compressed vcf file of mutect2calls + pattern: "*.vcf.gz" + - vcf_tbi: + type: file + description: Tabix index of vcf file + pattern: "*vcf.gz.tbi" + - stats: + type: file + description: Stats file that pairs with output vcf file + pattern: "*vcf.gz.stats" + - orientationbias: + type: file + description: files containing artifact priors for input vcf. Optional input. + pattern: "*.artifact-prior.tar.gz" + - segmentation: + type: file + description: tables containing segmentation information for input vcf. Optional input. + pattern: "*.segmentation.table" + - table: + type: file + description: table(s) containing contamination data for input vcf. Optional input, takes priority over estimate. + pattern: "*.contamination.table" + - estimate: + type: float + description: estimation of contamination value as a double. Optional input, will only be used if table is not specified. + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fasta: + type: file + description: The reference fasta file + pattern: "*.fasta" + - meta3: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fai: + type: file + description: Index of reference fasta file + pattern: "*.fasta.fai" + - meta4: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - dict: + type: file + description: GATK sequence dictionary + pattern: "*.dict" +output: + - vcf: + type: file + description: file containing filtered mutect2 calls. + pattern: "*.vcf.gz" + - tbi: + type: file + description: tbi file that pairs with vcf. + pattern: "*.vcf.gz.tbi" + - stats: + type: file + description: file containing statistics of the filtermutectcalls run. + pattern: "*.filteringStats.tsv" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@GCJMackenzie" + - "@maxulysse" + - "@ramprasadn" +maintainers: + - "@GCJMackenzie" + - "@maxulysse" + - "@ramprasadn" diff --git a/modules/nf-core/gatk4/filtervarianttranches/environment.yml b/modules/nf-core/gatk4/filtervarianttranches/environment.yml new file mode 100644 index 0000000000..faeea8ddb0 --- /dev/null +++ b/modules/nf-core/gatk4/filtervarianttranches/environment.yml @@ -0,0 +1,7 @@ +name: gatk4_filtervarianttranches +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::gatk4=4.4.0.0 diff --git a/modules/nf-core/gatk4/filtervarianttranches/main.nf b/modules/nf-core/gatk4/filtervarianttranches/main.nf new file mode 100644 index 0000000000..9da47ab739 --- /dev/null +++ b/modules/nf-core/gatk4/filtervarianttranches/main.nf @@ -0,0 +1,52 @@ +process GATK4_FILTERVARIANTTRANCHES { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+        'https://depot.galaxyproject.org/singularity/gatk4:4.4.0.0--py36hdfd78af_0':
+        'biocontainers/gatk4:4.4.0.0--py36hdfd78af_0' }"
+
+    input:
+    tuple val(meta), path(vcf), path(tbi), path(intervals)
+    path resources
+    path resources_index
+    path fasta
+    path fai
+    path dict
+
+    output:
+    tuple val(meta), path("*.vcf.gz")    , emit: vcf
+    tuple val(meta), path("*.vcf.gz.tbi"), emit: tbi
+    path "versions.yml"                  , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def resources = resources.collect{"--resource $it"}.join(' ')
+
+    def avail_mem = 3072
+    if (!task.memory) {
+        log.info '[GATK FilterVariantTranches] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.'
+    } else {
+        avail_mem = (task.memory.mega*0.8).intValue()
+    }
+    """
+    gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\
+        FilterVariantTranches \\
+        --variant $vcf \\
+        $resources \\
+        --output ${prefix}.filtered.vcf.gz \\
+        --tmp-dir . \\
+        $args
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/gatk4/filtervarianttranches/meta.yml b/modules/nf-core/gatk4/filtervarianttranches/meta.yml
new file mode 100644
index 0000000000..9346d2b4a4
--- /dev/null
+++ b/modules/nf-core/gatk4/filtervarianttranches/meta.yml
@@ -0,0 +1,72 @@
+name: "gatk4_filtervarianttranches"
+description: Apply tranche filtering
+keywords:
+  - filtervarianttranches
+  - gatk4
+  - tranche filtering
+tools:
+  - "gatk4":
+      description: |
+        Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools
+        with a primary focus on variant discovery and genotyping. Its powerful processing engine
+        and high-performance computing features make it capable of taking on projects of any size.
+      homepage: https://gatk.broadinstitute.org/hc/en-us
+      documentation: https://gatk.broadinstitute.org/hc/en-us/articles/360051308071-FilterVariantTranches
+      doi: 10.1158/1538-7445.AM2017-3590
+      licence: ["Apache-2.0"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - vcf:
+      type: file
+      description: a VCF file containing variants; must have the info key CNN_2D
+      pattern: "*.vcf.gz"
+  - tbi:
+      type: file
+      description: tbi file matching the input vcf
+      pattern: "*.vcf.gz.tbi"
+  - resources:
+      type: list
+      description: A resource VCF containing known SNP and/or INDEL sites. Can be supplied as many times as necessary
+      pattern: "*.vcf.gz"
+  - resources_index:
+      type: list
+      description: Index of the resource VCF containing known SNP and/or INDEL sites. Can be supplied as many times as necessary
+      pattern: "*.vcf.gz.tbi"
+  - fasta:
+      type: file
+      description: The reference fasta file
+      pattern: "*.fasta"
+  - fai:
+      type: file
+      description: Index of reference fasta file
+      pattern: "*.fasta.fai"
+  - dict:
+      type: file
+      description: GATK sequence dictionary
+      pattern: "*.dict"
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g.
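A sketch of how this module might be invoked, for reviewers only; the resource file names are assumptions. Note the input VCF must already carry CNN_2D annotations (e.g. from CNNScoreVariants), per the meta.yml above.

// Hypothetical wiring for GATK4_FILTERVARIANTTRANCHES; all paths are placeholders.
include { GATK4_FILTERVARIANTTRANCHES } from '../modules/nf-core/gatk4/filtervarianttranches/main'

workflow {
    ch_vcf = Channel.of([
        [ id:'test' ],
        file('cnn_scored.vcf.gz'), file('cnn_scored.vcf.gz.tbi'), file('targets.interval_list')
    ])

    GATK4_FILTERVARIANTTRANCHES(
        ch_vcf,
        [ file('hapmap.vcf.gz'), file('mills.vcf.gz') ],          // resources (known sites)
        [ file('hapmap.vcf.gz.tbi'), file('mills.vcf.gz.tbi') ],  // resources_index
        file('genome.fasta'),
        file('genome.fasta.fai'),
        file('genome.dict')
    )
}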
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - vcf: + type: file + description: VCF file + pattern: "*.vcf.gz" + - tbi: + type: file + description: VCF index file + pattern: "*.vcf.gz.tbi" +authors: + - "@FriederikeHanssen" +maintainers: + - "@FriederikeHanssen" diff --git a/modules/nf-core/gatk4/gatherbqsrreports/environment.yml b/modules/nf-core/gatk4/gatherbqsrreports/environment.yml new file mode 100644 index 0000000000..928ac76e6b --- /dev/null +++ b/modules/nf-core/gatk4/gatherbqsrreports/environment.yml @@ -0,0 +1,7 @@ +name: gatk4_gatherbqsrreports +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::gatk4=4.4.0.0 diff --git a/modules/nf-core/gatk4/gatherbqsrreports/main.nf b/modules/nf-core/gatk4/gatherbqsrreports/main.nf new file mode 100644 index 0000000000..e783701017 --- /dev/null +++ b/modules/nf-core/gatk4/gatherbqsrreports/main.nf @@ -0,0 +1,44 @@ +process GATK4_GATHERBQSRREPORTS { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gatk4:4.4.0.0--py36hdfd78af_0': + 'biocontainers/gatk4:4.4.0.0--py36hdfd78af_0' }" + + input: + tuple val(meta), path(table) + + output: + tuple val(meta), path("*.table"), emit: table + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def input_list = table.collect{"--input $it"}.join(' ') + + def avail_mem = 3072 + if (!task.memory) { + log.info '[GATK GatherBQSRReports] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' + } else { + avail_mem = (task.memory.mega*0.8).intValue() + } + """ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + GatherBQSRReports \\ + $input_list \\ + --output ${prefix}.table \\ + --tmp-dir . \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/gatk4/gatherbqsrreports/meta.yml b/modules/nf-core/gatk4/gatherbqsrreports/meta.yml new file mode 100644 index 0000000000..b9f5bf5f8b --- /dev/null +++ b/modules/nf-core/gatk4/gatherbqsrreports/meta.yml @@ -0,0 +1,43 @@ +name: gatk4_gatherbqsrreports +description: Gathers scattered BQSR recalibration reports into a single file +keywords: + - base quality score recalibration + - bqsr + - gatherbqsrreports + - gatk4 +tools: + - gatk4: + description: Genome Analysis Toolkit (GATK4) + homepage: https://gatk.broadinstitute.org/hc/en-us + documentation: https://gatk.broadinstitute.org/hc/en-us + tool_dev_url: https://github.com/broadinstitute/gatk + doi: "10.1158/1538-7445.AM2017-3590" + licence: ["BSD-3-clause"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - table: + type: file + description: File(s) containing BQSR table(s) + pattern: "*.table" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ]
+  - table:
+      type: file
+      description: File containing joined BQSR table
+      pattern: "*.table"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@FriederikeHanssen"
+maintainers:
+  - "@FriederikeHanssen"
diff --git a/modules/nf-core/gatk4/gatherpileupsummaries/environment.yml b/modules/nf-core/gatk4/gatherpileupsummaries/environment.yml
new file mode 100644
index 0000000000..1a2ebf5761
--- /dev/null
+++ b/modules/nf-core/gatk4/gatherpileupsummaries/environment.yml
@@ -0,0 +1,7 @@
+name: gatk4_gatherpileupsummaries
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::gatk4=4.4.0.0
diff --git a/modules/nf-core/gatk4/gatherpileupsummaries/main.nf b/modules/nf-core/gatk4/gatherpileupsummaries/main.nf
new file mode 100644
index 0000000000..1863133d2c
--- /dev/null
+++ b/modules/nf-core/gatk4/gatherpileupsummaries/main.nf
@@ -0,0 +1,47 @@
+process GATK4_GATHERPILEUPSUMMARIES {
+    tag "$meta.id"
+    label 'process_low'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/gatk4:4.4.0.0--py36hdfd78af_0':
+        'biocontainers/gatk4:4.4.0.0--py36hdfd78af_0' }"
+
+    input:
+    tuple val(meta), path(pileup)
+    path dict
+
+    output:
+    tuple val(meta), path("*.pileups.table"), emit: table
+    path "versions.yml"                     , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def input_list = pileup.collect{ "--I $it" }.join(' ')
+
+    def avail_mem = 3072
+    if (!task.memory) {
+        log.info '[GATK GatherPileupSummaries] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.'
+    } else {
+        avail_mem = (task.memory.mega*0.8).intValue()
+    }
+    """
+    gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\
+        GatherPileupSummaries \\
+        $input_list \\
+        --O ${prefix}.pileups.table \\
+        --sequence-dictionary $dict \\
+        --tmp-dir . \\
+        $args
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/gatk4/gatherpileupsummaries/meta.yml b/modules/nf-core/gatk4/gatherpileupsummaries/meta.yml
new file mode 100644
index 0000000000..35381a3b51
--- /dev/null
+++ b/modules/nf-core/gatk4/gatherpileupsummaries/meta.yml
@@ -0,0 +1,44 @@
+name: gatk4_gatherpileupsummaries
+description: Gathers pileup summary tables from scattered intervals into a single file
+keywords:
+  - gatk4
+  - gatherpileupsummaries
+  - pileup
+tools:
+  - gatk4:
+      description: Genome Analysis Toolkit (GATK4)
+      homepage: https://gatk.broadinstitute.org/hc/en-us
+      documentation: https://gatk.broadinstitute.org/hc/en-us
+      tool_dev_url: https://github.com/broadinstitute/gatk
+      doi: "10.1158/1538-7445.AM2017-3590"
+      licence: ["BSD-3-clause"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - pileup:
+      type: file
+      description: Pileup files from gatk4/getpileupsummaries
+      pattern: "*.pileups.table"
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g.
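A quick usage sketch for the gather step, with assumed file names: the per-interval pileup tables emitted by GetPileupSummaries are collected under one meta and concatenated.

// Hypothetical wiring for GATK4_GATHERPILEUPSUMMARIES; all paths are placeholders.
include { GATK4_GATHERPILEUPSUMMARIES } from '../modules/nf-core/gatk4/gatherpileupsummaries/main'

workflow {
    // One meta with the list of per-interval pileup tables to merge
    ch_pileups = Channel.of([
        [ id:'tumour' ],
        [ file('tumour.chr21.pileups.table'), file('tumour.chr22.pileups.table') ]
    ])

    GATK4_GATHERPILEUPSUMMARIES(ch_pileups, file('genome.dict'))
}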
[ id:'test', single_end:false ] + - table: + type: file + description: pileup summaries table file + pattern: "*.pileups.table" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@FriederikeHanssen" + - "@maxulysse" +maintainers: + - "@FriederikeHanssen" + - "@maxulysse" diff --git a/modules/nf-core/gatk4/genomicsdbimport/environment.yml b/modules/nf-core/gatk4/genomicsdbimport/environment.yml new file mode 100644 index 0000000000..ce3f941694 --- /dev/null +++ b/modules/nf-core/gatk4/genomicsdbimport/environment.yml @@ -0,0 +1,7 @@ +name: gatk4_genomicsdbimport +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::gatk4=4.4.0.0 diff --git a/modules/nf-core/gatk4/genomicsdbimport/main.nf b/modules/nf-core/gatk4/genomicsdbimport/main.nf new file mode 100644 index 0000000000..916037ebef --- /dev/null +++ b/modules/nf-core/gatk4/genomicsdbimport/main.nf @@ -0,0 +1,104 @@ +process GATK4_GENOMICSDBIMPORT { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gatk4:4.4.0.0--py36hdfd78af_0': + 'biocontainers/gatk4:4.4.0.0--py36hdfd78af_0' }" + + input: + tuple val(meta), path(vcf), path(tbi), path(interval_file), val(interval_value), path(wspace) + val run_intlist + val run_updatewspace + val input_map + + output: + tuple val(meta), path("$prefix") , optional:true, emit: genomicsdb + tuple val(meta), path("$updated_db") , optional:true, emit: updatedb + tuple val(meta), path("*.interval_list"), optional:true, emit: intervallist + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + + // settings for running default create gendb mode + input_command = input_map ? "--sample-name-map ${vcf[0]}" : vcf.collect(){"--variant $it"}.join(' ') + + genomicsdb_command = "--genomicsdb-workspace-path ${prefix}" + interval_command = interval_file ? "--intervals ${interval_file}" : "--intervals ${interval_value}" + updated_db = "" + + // settings changed for running get intervals list mode if run_intlist is true + if (run_intlist) { + genomicsdb_command = "--genomicsdb-update-workspace-path ${wspace}" + interval_command = "--output-interval-list-to-file ${prefix}.interval_list" + } + + // settings changed for running update gendb mode. input_command same as default, update_db forces module to emit the updated gendb + if (run_updatewspace) { + genomicsdb_command = "--genomicsdb-update-workspace-path ${wspace}" + interval_command = '' + updated_db = "${wspace}" + } + + def avail_mem = 3072 + if (!task.memory) { + log.info '[GATK GenomicsDBImport] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' + } else { + avail_mem = (task.memory.mega*0.8).intValue() + } + """ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + GenomicsDBImport \\ + $input_command \\ + $genomicsdb_command \\ + $interval_command \\ + --tmp-dir . \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + + genomicsdb_command = "--genomicsdb-workspace-path ${prefix}" + interval_command = interval_file ? 
"--intervals ${interval_file}" : "--intervals ${interval_value}" + updated_db = "" + + // settings changed for running get intervals list mode if run_intlist is true + if (run_intlist) { + genomicsdb_command = "--genomicsdb-update-workspace-path ${wspace}" + interval_command = "--output-interval-list-to-file ${prefix}.interval_list" + } + + // settings changed for running update gendb mode. input_command same as default, update_db forces module to emit the updated gendb + if (run_updatewspace) { + genomicsdb_command = "--genomicsdb-update-workspace-path ${wspace}" + interval_command = '' + updated_db = "${wspace}" + } + + def stub_genomicsdb = genomicsdb_command == "--genomicsdb-workspace-path ${prefix}" ? "touch ${prefix}" : "" + def stub_interval = interval_command == "--output-interval-list-to-file ${prefix}.interval_list" ? "touch ${prefix}.interval_list" : "" + def stub_update = updated_db != "" ? "touch ${wspace}" : "" + + """ + ${stub_genomicsdb} + ${stub_interval} + ${stub_update} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/gatk4/genomicsdbimport/meta.yml b/modules/nf-core/gatk4/genomicsdbimport/meta.yml new file mode 100644 index 0000000000..ca8fe3d076 --- /dev/null +++ b/modules/nf-core/gatk4/genomicsdbimport/meta.yml @@ -0,0 +1,76 @@ +name: gatk4_genomicsdbimport +description: merge GVCFs from multiple samples. For use in joint genotyping or somatic panel of normal creation. +keywords: + - gatk4 + - genomicsdb + - genomicsdbimport + - jointgenotyping + - panelofnormalscreation +tools: + - gatk4: + description: | + Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools + with a primary focus on variant discovery and genotyping. Its powerful processing engine + and high-performance computing features make it capable of taking on projects of any size. + homepage: https://gatk.broadinstitute.org/hc/en-us + documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672s + doi: 10.1158/1538-7445.AM2017-3590 +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test'] + - vcf: + type: list + description: either a list of vcf files to be used to create or update a genomicsdb, or a file that contains a map to vcf files to be used. + pattern: "*.vcf.gz" + - tbi: + type: list + description: list of tbi files that match with the input vcf files + pattern: "*.vcf.gz_tbi" + - wspace: + type: file + description: path to an existing genomicsdb to be used in update db mode or get intervals mode. This WILL NOT specify name of a new genomicsdb in create db mode. + pattern: "/path/to/existing/gendb" + - intervalfile: + type: file + description: file containing the intervals to be used when creating the genomicsdb + pattern: "*.interval_list" + - intervalval: + type: string + description: if an intervals file has not been spcified, the value enetered here will be used as an interval via the "-L" argument + pattern: "example: chr1:1000-10000" + - run_intlist: + type: boolean + description: Specify whether to run get interval list mode, this option cannot be specified at the same time as run_updatewspace. + pattern: "true/false" + - run_updatewspace: + type: boolean + description: Specify whether to run update genomicsdb mode, this option takes priority over run_intlist. 
+ pattern: "true/false" + - input_map: + type: boolean + description: Specify whether the vcf input is providing a list of vcf file(s) or a single file containing a map of paths to vcf files to be used to create or update a genomicsdb. + pattern: "*.sample_map" +output: + - genomicsdb: + type: directory + description: Directory containing the files that compose the genomicsdb workspace, this is only output for create mode, as update changes an existing db + pattern: "*/$prefix" + - updatedb: + type: directory + description: Directory containing the files that compose the updated genomicsdb workspace, this is only output for update mode, and should be the same path as the input wspace. + pattern: "same/path/as/wspace" + - intervallist: + type: file + description: File containing the intervals used to generate the genomicsdb, only created by get intervals mode. + pattern: "*.interval_list" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@GCJMackenzie" +maintainers: + - "@GCJMackenzie" diff --git a/modules/nf-core/gatk4/genotypegvcfs/environment.yml b/modules/nf-core/gatk4/genotypegvcfs/environment.yml new file mode 100644 index 0000000000..49f213790d --- /dev/null +++ b/modules/nf-core/gatk4/genotypegvcfs/environment.yml @@ -0,0 +1,7 @@ +name: gatk4_genotypegvcfs +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::gatk4=4.4.0.0 diff --git a/modules/nf-core/gatk4/genotypegvcfs/main.nf b/modules/nf-core/gatk4/genotypegvcfs/main.nf new file mode 100644 index 0000000000..c6c0ba501d --- /dev/null +++ b/modules/nf-core/gatk4/genotypegvcfs/main.nf @@ -0,0 +1,68 @@ +process GATK4_GENOTYPEGVCFS { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gatk4:4.4.0.0--py36hdfd78af_0': + 'biocontainers/gatk4:4.4.0.0--py36hdfd78af_0' }" + + input: + tuple val(meta), path(gvcf), path(gvcf_index), path(intervals), path(intervals_index) + path fasta + path fai + path dict + path dbsnp + path dbsnp_tbi + + output: + tuple val(meta), path("*.vcf.gz"), emit: vcf + tuple val(meta), path("*.tbi") , emit: tbi + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def gvcf_command = gvcf.name.endsWith(".vcf") || gvcf.name.endsWith(".vcf.gz") ? "$gvcf" : "gendb://$gvcf" + def dbsnp_command = dbsnp ? "--dbsnp $dbsnp" : "" + def interval_command = intervals ? "--intervals $intervals" : "" + + def avail_mem = 3072 + if (!task.memory) { + log.info '[GATK GenotypeGVCFs] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' + } else { + avail_mem = (task.memory.mega*0.8).intValue() + } + """ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + GenotypeGVCFs \\ + --variant $gvcf_command \\ + --output ${prefix}.vcf.gz \\ + --reference $fasta \\ + $interval_command \\ + $dbsnp_command \\ + --tmp-dir . 
\\
+        $args
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//')
+    END_VERSIONS
+    """
+
+    stub:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+
+    """
+    touch ${prefix}.vcf.gz
+    touch ${prefix}.vcf.gz.tbi
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/gatk4/genotypegvcfs/meta.yml b/modules/nf-core/gatk4/genotypegvcfs/meta.yml
new file mode 100644
index 0000000000..8f1e377eb9
--- /dev/null
+++ b/modules/nf-core/gatk4/genotypegvcfs/meta.yml
@@ -0,0 +1,82 @@
+name: gatk4_genotypegvcfs
+description: |
+  Perform joint genotyping on one or more samples pre-called with HaplotypeCaller.
+keywords:
+  - gatk4
+  - genotype
+  - gvcf
+  - joint genotyping
+tools:
+  - gatk4:
+      description: Genome Analysis Toolkit (GATK4)
+      homepage: https://gatk.broadinstitute.org/hc/en-us
+      documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672s
+      tool_dev_url: https://github.com/broadinstitute/gatk
+      doi: "10.1158/1538-7445.AM2017-3590"
+      licence: ["BSD-3-clause"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - gvcf:
+      type: file
+      description: |
+        gVCF(.gz) file or a GenomicsDB workspace
+      pattern: "*.{vcf,vcf.gz}"
+  - gvcf_index:
+      type: file
+      description: |
+        index of gvcf file, or empty when providing a GenomicsDB
+      pattern: "*.{idx,tbi}"
+  - intervals:
+      type: file
+      description: Interval file with the genomic regions included in the library (optional)
+  - intervals_index:
+      type: file
+      description: Interval index file (optional)
+  - fasta:
+      type: file
+      description: Reference fasta file
+      pattern: "*.fasta"
+  - fai:
+      type: file
+      description: Reference fasta index file
+      pattern: "*.fai"
+  - dict:
+      type: file
+      description: Reference fasta sequence dict file
+      pattern: "*.dict"
+  - dbsnp:
+      type: file
+      description: dbSNP VCF file
+      pattern: "*.vcf.gz"
+  - dbsnp_tbi:
+      type: file
+      description: dbSNP VCF index file
+      pattern: "*.tbi"
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - vcf:
+      type: file
+      description: Genotyped VCF file
+      pattern: "*.vcf.gz"
+  - tbi:
+      type: file
+      description: Tbi index for VCF file
+      pattern: "*.vcf.gz.tbi"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@santiagorevale"
+  - "@maxulysse"
maintainers:
+  - "@santiagorevale"
+  - "@maxulysse"
diff --git a/modules/nf-core/gatk4/getpileupsummaries/environment.yml b/modules/nf-core/gatk4/getpileupsummaries/environment.yml
new file mode 100644
index 0000000000..d650467cf8
--- /dev/null
+++ b/modules/nf-core/gatk4/getpileupsummaries/environment.yml
@@ -0,0 +1,7 @@
+name: gatk4_getpileupsummaries
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::gatk4=4.4.0.0
diff --git a/modules/nf-core/gatk4/getpileupsummaries/main.nf b/modules/nf-core/gatk4/getpileupsummaries/main.nf
new file mode 100644
index 0000000000..d509cdf3bb
--- /dev/null
+++ b/modules/nf-core/gatk4/getpileupsummaries/main.nf
@@ -0,0 +1,53 @@
+process GATK4_GETPILEUPSUMMARIES {
+    tag "$meta.id"
+    label 'process_low'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
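A sketch of calling GenotypeGVCFs on a GenomicsDB workspace, with assumed paths; as the module's script block shows, anything whose name does not end in .vcf or .vcf.gz is automatically passed as gendb://, so the index slot can stay empty.

// Hypothetical wiring for GATK4_GENOTYPEGVCFS; placeholders throughout.
include { GATK4_GENOTYPEGVCFS } from '../modules/nf-core/gatk4/genotypegvcfs/main'

workflow {
    // gendb workspace directory instead of a gVCF; no gvcf_index or intervals_index
    ch_gendb = Channel.of([
        [ id:'joint' ],
        file('genomicsdb_workspace'), [], file('chr21.interval_list'), []
    ])

    GATK4_GENOTYPEGVCFS(
        ch_gendb,
        file('genome.fasta'), file('genome.fasta.fai'), file('genome.dict'),
        file('dbsnp.vcf.gz'), file('dbsnp.vcf.gz.tbi')
    )
}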
+        'https://depot.galaxyproject.org/singularity/gatk4:4.4.0.0--py36hdfd78af_0':
+        'biocontainers/gatk4:4.4.0.0--py36hdfd78af_0' }"
+
+    input:
+    tuple val(meta), path(input), path(index), path(intervals)
+    tuple val(meta2), path(fasta)
+    tuple val(meta3), path(fai)
+    tuple val(meta4), path(dict)
+    path variants
+    path variants_tbi
+
+    output:
+    tuple val(meta), path('*.pileups.table'), emit: table
+    path "versions.yml"                     , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def interval_command = intervals ? "--intervals $intervals" : "--intervals $variants"
+    def reference_command = fasta ? "--reference $fasta" : ''
+
+    def avail_mem = 3072
+    if (!task.memory) {
+        log.info '[GATK GetPileupSummaries] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.'
+    } else {
+        avail_mem = (task.memory.mega*0.8).intValue()
+    }
+    """
+    gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\
+        GetPileupSummaries \\
+        --input $input \\
+        --variant $variants \\
+        --output ${prefix}.pileups.table \\
+        $reference_command \\
+        $interval_command \\
+        --tmp-dir . \\
+        $args
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/gatk4/getpileupsummaries/meta.yml b/modules/nf-core/gatk4/getpileupsummaries/meta.yml
new file mode 100644
index 0000000000..fab3c1435e
--- /dev/null
+++ b/modules/nf-core/gatk4/getpileupsummaries/meta.yml
@@ -0,0 +1,84 @@
+name: gatk4_getpileupsummaries
+description: |
+  Summarizes counts of reads that support reference, alternate and other alleles for given sites. Results can be used with CalculateContamination. Requires a common germline variant sites file, such as from gnomAD.
+keywords:
+  - gatk4
+  - germlinevariantsites
+  - getpileupsummaries
+  - readcountssummary
+tools:
+  - gatk4:
+      description: |
+        Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools
+        with a primary focus on variant discovery and genotyping. Its powerful processing engine
+        and high-performance computing features make it capable of taking on projects of any size.
+      homepage: https://gatk.broadinstitute.org/hc/en-us
+      documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672s
+      doi: 10.1158/1538-7445.AM2017-3590
+      licence: ["Apache-2.0"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test' ]
+  - input:
+      type: file
+      description: BAM/CRAM file to be summarised.
+      pattern: "*.{bam,cram}"
+  - input_index:
+      type: file
+      description: BAM/CRAM file index.
+      pattern: "*.{bai,crai}"
+  - intervals:
+      type: file
+      description: File containing specified sites to be used for the summary. If this option is not specified, the variants file is used instead automatically.
+      pattern: "*.interval_list"
+  - meta2:
+      type: map
+      description: |
+        Groovy Map containing reference information
+        e.g. [ id:'genome' ]
+  - fasta:
+      type: file
+      description: The reference fasta file
+      pattern: "*.fasta"
+  - meta3:
+      type: map
+      description: |
+        Groovy Map containing reference information
+        e.g. [ id:'genome' ]
+  - fai:
+      type: file
+      description: Index of reference fasta file
+      pattern: "*.fasta.fai"
+  - meta4:
+      type: map
+      description: |
+        Groovy Map containing reference information
+        e.g.
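A usage sketch with assumed file names; leaving the intervals slot empty exercises the fallback in the script block above, where the germline variants file doubles as the sites list.

// Hypothetical wiring for GATK4_GETPILEUPSUMMARIES; placeholders throughout.
include { GATK4_GETPILEUPSUMMARIES } from '../modules/nf-core/gatk4/getpileupsummaries/main'

workflow {
    ch_cram = Channel.of([ [ id:'tumour' ], file('tumour.cram'), file('tumour.cram.crai'), [] ])

    GATK4_GETPILEUPSUMMARIES(
        ch_cram,
        [ [ id:'genome' ], file('genome.fasta') ],
        [ [ id:'genome' ], file('genome.fasta.fai') ],
        [ [ id:'genome' ], file('genome.dict') ],
        file('gnomad.af-only.vcf.gz'),
        file('gnomad.af-only.vcf.gz.tbi')
    )
}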
[ id:'genome' ] + - dict: + type: file + description: GATK sequence dictionary + pattern: "*.dict" + - variants: + type: file + description: Population vcf of germline sequencing, containing allele fractions. Is also used as sites file if no separate sites file is specified. + pattern: "*.vcf.gz" + - variants_tbi: + type: file + description: Index file for the germline resource. + pattern: "*.vcf.gz.tbi" +output: + - pileup: + type: file + description: File containing the pileup summary table. + pattern: "*.pileups.table" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@GCJMackenzie" +maintainers: + - "@GCJMackenzie" diff --git a/modules/nf-core/gatk4/haplotypecaller/environment.yml b/modules/nf-core/gatk4/haplotypecaller/environment.yml new file mode 100644 index 0000000000..0c8f32fa63 --- /dev/null +++ b/modules/nf-core/gatk4/haplotypecaller/environment.yml @@ -0,0 +1,7 @@ +name: gatk4_haplotypecaller +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::gatk4=4.4.0.0 diff --git a/modules/nf-core/gatk4/haplotypecaller/main.nf b/modules/nf-core/gatk4/haplotypecaller/main.nf new file mode 100644 index 0000000000..fdecf5f830 --- /dev/null +++ b/modules/nf-core/gatk4/haplotypecaller/main.nf @@ -0,0 +1,76 @@ +process GATK4_HAPLOTYPECALLER { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gatk4:4.4.0.0--py36hdfd78af_0': + 'biocontainers/gatk4:4.4.0.0--py36hdfd78af_0' }" + + input: + tuple val(meta), path(input), path(input_index), path(intervals), path(dragstr_model) + path fasta + path fai + path dict + path dbsnp + path dbsnp_tbi + + output: + tuple val(meta), path("*.vcf.gz") , emit: vcf + tuple val(meta), path("*.tbi") , optional:true, emit: tbi + tuple val(meta), path("*.realigned.bam"), optional:true, emit: bam + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def dbsnp_command = dbsnp ? "--dbsnp $dbsnp" : "" + def interval_command = intervals ? "--intervals $intervals" : "" + def dragstr_command = dragstr_model ? "--dragstr-params-path $dragstr_model" : "" + def bamout_command = args.contains("--bam-writer-type") ? "--bam-output ${prefix.replaceAll('.g\\s*$', '')}.realigned.bam" : "" + + def avail_mem = 3072 + if (!task.memory) { + log.info '[GATK HaplotypeCaller] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' + } else { + avail_mem = (task.memory.mega*0.8).intValue() + } + """ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + HaplotypeCaller \\ + --input $input \\ + --output ${prefix}.vcf.gz \\ + --reference $fasta \\ + $dbsnp_command \\ + $interval_command \\ + $dragstr_command \\ + $bamout_command \\ + --tmp-dir . \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def bamout_command = args.contains("--bam-writer-type") ? "--bam-output ${prefix.replaceAll('.g\\s*$', '')}.realigned.bam" : "" + + def stub_realigned_bam = bamout_command ? 
"touch ${prefix.replaceAll('.g\\s*$', '')}.realigned.bam" : "" + """ + touch ${prefix}.vcf.gz + touch ${prefix}.vcf.gz.tbi + ${stub_realigned_bam} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/gatk4/haplotypecaller/meta.yml b/modules/nf-core/gatk4/haplotypecaller/meta.yml new file mode 100644 index 0000000000..f38dc37dd0 --- /dev/null +++ b/modules/nf-core/gatk4/haplotypecaller/meta.yml @@ -0,0 +1,83 @@ +name: gatk4_haplotypecaller +description: Call germline SNPs and indels via local re-assembly of haplotypes +keywords: + - gatk4 + - haplotype + - haplotypecaller +tools: + - gatk4: + description: | + Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools + with a primary focus on variant discovery and genotyping. Its powerful processing engine + and high-performance computing features make it capable of taking on projects of any size. + homepage: https://gatk.broadinstitute.org/hc/en-us + documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672s + doi: 10.1158/1538-7445.AM2017-3590 + licence: ["Apache-2.0"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM file from alignment + pattern: "*.{bam,cram}" + - input_index: + type: file + description: BAI/CRAI file from alignment + pattern: "*.{bai,crai}" + - intervals: + type: file + description: Bed file with the genomic regions included in the library (optional) + - dragstr_model: + type: file + description: Text file containing the DragSTR model of the used BAM/CRAM file (optional) + pattern: "*.txt" + - fasta: + type: file + description: The reference fasta file + pattern: "*.fasta" + - fai: + type: file + description: Index of reference fasta file + pattern: "fasta.fai" + - dict: + type: file + description: GATK sequence dictionary + pattern: "*.dict" + - dbsnp: + type: file + description: VCF file containing known sites (optional) + - dbsnp_tbi: + type: file + description: VCF index of dbsnp (optional) +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ]
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - vcf:
+      type: file
+      description: Compressed VCF file
+      pattern: "*.vcf.gz"
+  - tbi:
+      type: file
+      description: Index of VCF file
+      pattern: "*.vcf.gz.tbi"
+  - bam:
+      type: file
+      description: Assembled haplotypes and locally realigned reads
+      pattern: "*.realigned.bam"
+authors:
+  - "@suzannejin"
+  - "@FriederikeHanssen"
+maintainers:
+  - "@suzannejin"
+  - "@FriederikeHanssen"
diff --git a/modules/nf-core/gatk4/intervallisttobed/environment.yml b/modules/nf-core/gatk4/intervallisttobed/environment.yml
new file mode 100644
index 0000000000..06d9f0e9c7
--- /dev/null
+++ b/modules/nf-core/gatk4/intervallisttobed/environment.yml
@@ -0,0 +1,7 @@
+name: gatk4_intervallisttobed
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::gatk4=4.4.0.0
diff --git a/modules/nf-core/gatk4/intervallisttobed/main.nf b/modules/nf-core/gatk4/intervallisttobed/main.nf
new file mode 100644
index 0000000000..89772081e0
--- /dev/null
+++ b/modules/nf-core/gatk4/intervallisttobed/main.nf
@@ -0,0 +1,43 @@
+process GATK4_INTERVALLISTTOBED {
+    tag "$meta.id"
+    label 'process_low'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/gatk4:4.4.0.0--py36hdfd78af_0':
+        'biocontainers/gatk4:4.4.0.0--py36hdfd78af_0' }"
+
+    input:
+    tuple val(meta), path(intervals)
+
+    output:
+    tuple val(meta), path("*.bed"), emit: bed
+    path "versions.yml"           , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+
+    def avail_mem = 3072
+    if (!task.memory) {
+        log.info '[GATK IntervalListToBed] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.'
+    } else {
+        avail_mem = (task.memory.mega*0.8).intValue()
+    }
+    """
+    gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\
+        IntervalListToBed \\
+        --INPUT $intervals \\
+        --OUTPUT ${prefix}.bed \\
+        --TMP_DIR . \\
+        $args
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/gatk4/intervallisttobed/meta.yml b/modules/nf-core/gatk4/intervallisttobed/meta.yml
new file mode 100644
index 0000000000..28d264dfef
--- /dev/null
+++ b/modules/nf-core/gatk4/intervallisttobed/meta.yml
@@ -0,0 +1,43 @@
+name: gatk4_intervallisttobed
+description: Converts a Picard IntervalList file to a BED file.
+keywords:
+  - bed
+  - conversion
+  - gatk4
+  - interval
+tools:
+  - gatk4:
+      description: Genome Analysis Toolkit (GATK4)
+      homepage: https://gatk.broadinstitute.org/hc/en-us
+      documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672s
+      tool_dev_url: https://github.com/broadinstitute/gatk
+      doi: "10.1158/1538-7445.AM2017-3590"
+      licence: ["BSD-3-clause"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - interval:
+      type: file
+      description: Interval list
+      pattern: "*.{interval,interval_list}"
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g.
[ id:'test', single_end:false ]
+  - bed:
+      type: file
+      description: BED file
+      pattern: "*.bed"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@FriederikeHanssen"
+maintainers:
+  - "@FriederikeHanssen"
diff --git a/modules/nf-core/gatk4/learnreadorientationmodel/environment.yml b/modules/nf-core/gatk4/learnreadorientationmodel/environment.yml
new file mode 100644
index 0000000000..d1c35caf83
--- /dev/null
+++ b/modules/nf-core/gatk4/learnreadorientationmodel/environment.yml
@@ -0,0 +1,7 @@
+name: gatk4_learnreadorientationmodel
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::gatk4=4.4.0.0
diff --git a/modules/nf-core/gatk4/learnreadorientationmodel/main.nf b/modules/nf-core/gatk4/learnreadorientationmodel/main.nf
new file mode 100644
index 0000000000..c4e39db74a
--- /dev/null
+++ b/modules/nf-core/gatk4/learnreadorientationmodel/main.nf
@@ -0,0 +1,44 @@
+process GATK4_LEARNREADORIENTATIONMODEL {
+    tag "$meta.id"
+    label 'process_low'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/gatk4:4.4.0.0--py36hdfd78af_0':
+        'biocontainers/gatk4:4.4.0.0--py36hdfd78af_0' }"
+
+    input:
+    tuple val(meta), path(f1r2)
+
+    output:
+    tuple val(meta), path("*.tar.gz"), emit: artifactprior
+    path "versions.yml"              , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def input_list = f1r2.collect{"--input $it"}.join(' ')
+
+    def avail_mem = 3072
+    if (!task.memory) {
+        log.info '[GATK LearnReadOrientationModel] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.'
+    } else {
+        avail_mem = (task.memory.mega*0.8).intValue()
+    }
+    """
+    gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\
+        LearnReadOrientationModel \\
+        $input_list \\
+        --output ${prefix}.tar.gz \\
+        --tmp-dir . \\
+        $args
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/gatk4/learnreadorientationmodel/meta.yml b/modules/nf-core/gatk4/learnreadorientationmodel/meta.yml
new file mode 100644
index 0000000000..4b73a51adb
--- /dev/null
+++ b/modules/nf-core/gatk4/learnreadorientationmodel/meta.yml
@@ -0,0 +1,41 @@
+name: gatk4_learnreadorientationmodel
+description: |
+  Uses f1r2 counts collected during mutect2 to learn the prior probability of read orientation artifacts
+keywords:
+  - gatk4
+  - learnreadorientationmodel
+  - mutect2
+  - readorientationartifacts
+tools:
+  - gatk4:
+      description: |
+        Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools
+        with a primary focus on variant discovery and genotyping. Its powerful processing engine
+        and high-performance computing features make it capable of taking on projects of any size.
+      homepage: https://gatk.broadinstitute.org/hc/en-us
+      documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672s
+      doi: 10.1158/1538-7445.AM2017-3590
+      licence: ["Apache-2.0"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test' ]
+  - f1r2:
+      type: list
+      description: list of f1r2 files to be used as input.
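A short sketch, with assumed names, of feeding Mutect2's per-interval f1r2 tarballs into this module under a single meta.

// Hypothetical wiring for GATK4_LEARNREADORIENTATIONMODEL; placeholders throughout.
include { GATK4_LEARNREADORIENTATIONMODEL } from '../modules/nf-core/gatk4/learnreadorientationmodel/main'

workflow {
    ch_f1r2 = Channel.of([
        [ id:'tumour_vs_normal' ],
        [ file('chr21.f1r2.tar.gz'), file('chr22.f1r2.tar.gz') ]
    ])

    GATK4_LEARNREADORIENTATIONMODEL(ch_f1r2)
}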
+ pattern: "*.f1r2.tar.gz" +output: + - artifactprior: + type: file + description: file containing artifact-priors to be used by filtermutectcalls + pattern: "*.tar.gz" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@GCJMackenzie" +maintainers: + - "@GCJMackenzie" diff --git a/modules/nf-core/gatk4/markduplicates/environment.yml b/modules/nf-core/gatk4/markduplicates/environment.yml new file mode 100644 index 0000000000..9adad104d8 --- /dev/null +++ b/modules/nf-core/gatk4/markduplicates/environment.yml @@ -0,0 +1,8 @@ +name: gatk4_markduplicates +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::gatk4=4.4.0.0 + - bioconda::samtools=1.17 diff --git a/modules/nf-core/gatk4/markduplicates/main.nf b/modules/nf-core/gatk4/markduplicates/main.nf new file mode 100644 index 0000000000..564b86d3dd --- /dev/null +++ b/modules/nf-core/gatk4/markduplicates/main.nf @@ -0,0 +1,85 @@ +process GATK4_MARKDUPLICATES { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-d9e7bad0f7fbc8f4458d5c3ab7ffaaf0235b59fb:f857e2d6cc88d35580d01cf39e0959a68b83c1d9-0': + 'biocontainers/mulled-v2-d9e7bad0f7fbc8f4458d5c3ab7ffaaf0235b59fb:f857e2d6cc88d35580d01cf39e0959a68b83c1d9-0' }" + + input: + tuple val(meta), path(bam) + path fasta + path fasta_fai + + output: + tuple val(meta), path("*cram"), emit: cram, optional: true + tuple val(meta), path("*bam"), emit: bam, optional: true + tuple val(meta), path("*.crai"), emit: crai, optional: true + tuple val(meta), path("*.bai"), emit: bai, optional: true + tuple val(meta), path("*.metrics"), emit: metrics + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}.bam" + + // If the extension is CRAM, then change it to BAM + prefix_bam = prefix.tokenize('.')[-1] == 'cram' ? "${prefix.substring(0, prefix.lastIndexOf('.'))}.bam" : prefix + + def input_list = bam.collect{"--INPUT $it"}.join(' ') + def reference = fasta ? "--REFERENCE_SEQUENCE ${fasta}" : "" + + def avail_mem = 3072 + if (!task.memory) { + log.info '[GATK MarkDuplicates] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' + } else { + avail_mem = (task.memory.mega*0.8).intValue() + } + + // Using samtools and not Markduplicates to compress to CRAM speeds up computation: + // https://medium.com/@acarroll.dna/looking-at-trade-offs-in-compression-levels-for-genomics-tools-eec2834e8b94 + """ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + MarkDuplicates \\ + $input_list \\ + --OUTPUT ${prefix_bam} \\ + --METRICS_FILE ${prefix}.metrics \\ + --TMP_DIR . \\ + ${reference} \\ + $args + + # If cram files are wished as output, the run samtools for conversion + if [[ ${prefix} == *.cram ]]; then + samtools view -Ch -T ${fasta} -o ${prefix} ${prefix_bam} + rm ${prefix_bam} + samtools index ${prefix} + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}.bam" + prefix_no_suffix = task.ext.prefix ? 
prefix.tokenize('.')[0] : "${meta.id}" + """ + touch ${prefix_no_suffix}.bam + touch ${prefix_no_suffix}.cram + touch ${prefix_no_suffix}.cram.crai + touch ${prefix_no_suffix}.bai + touch ${prefix}.metrics + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/gatk4/markduplicates/meta.yml b/modules/nf-core/gatk4/markduplicates/meta.yml new file mode 100644 index 0000000000..b0f09d4b84 --- /dev/null +++ b/modules/nf-core/gatk4/markduplicates/meta.yml @@ -0,0 +1,71 @@ +name: gatk4_markduplicates +description: This tool locates and tags duplicate reads in a BAM or SAM file, where duplicate reads are defined as originating from a single fragment of DNA. +keywords: + - bam + - gatk4 + - markduplicates + - sort +tools: + - gatk4: + description: Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools with a primary focus on variant discovery and genotyping. Its powerful processing engine and high-performance computing features make it capable of taking on projects of any size. + homepage: https://gatk.broadinstitute.org/hc/en-us + documentation: https://gatk.broadinstitute.org/hc/en-us/articles/360037052812-MarkDuplicates-Picard- + tool_dev_url: https://github.com/broadinstitute/gatk + doi: 10.1158/1538-7445.AM2017-3590 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: Sorted BAM file + pattern: "*.{bam}" + - fasta: + type: file + description: Fasta file + pattern: "*.{fasta}" + - fasta_fai: + type: file + description: Fasta index file + pattern: "*.{fai}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
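A usage sketch with assumed paths. The design choice worth noting here is that the output format is driven entirely by ext.prefix: a prefix ending in .cram triggers the samtools conversion branch in the script above, and the fasta/fai inputs are only consumed in that case.

// Hypothetical wiring for GATK4_MARKDUPLICATES; placeholders throughout.
include { GATK4_MARKDUPLICATES } from '../modules/nf-core/gatk4/markduplicates/main'

workflow {
    ch_bam = Channel.of([ [ id:'sample1' ], [ file('sample1.sorted.bam') ] ])

    // To get CRAM output, a config would set, for example:
    //   withName: 'GATK4_MARKDUPLICATES' { ext.prefix = { "${meta.id}.md.cram" } }
    GATK4_MARKDUPLICATES(ch_bam, file('genome.fasta'), file('genome.fasta.fai'))
}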
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - bam: + type: file + description: Marked duplicates BAM file + pattern: "*.{bam}" + - cram: + type: file + description: Marked duplicates CRAM file + pattern: "*.{cram}" + - bai: + type: file + description: BAM index file + pattern: "*.{bam.bai}" + - crai: + type: file + description: CRAM index file + pattern: "*.{cram.crai}" + - metrics: + type: file + description: Duplicate metrics file generated by GATK + pattern: "*.{metrics.txt}" +authors: + - "@ajodeh-juma" + - "@FriederikeHanssen" + - "@maxulysse" +maintainers: + - "@ajodeh-juma" + - "@FriederikeHanssen" + - "@maxulysse" diff --git a/modules/nf-core/gatk4/mergemutectstats/environment.yml b/modules/nf-core/gatk4/mergemutectstats/environment.yml new file mode 100644 index 0000000000..dd132c3a3d --- /dev/null +++ b/modules/nf-core/gatk4/mergemutectstats/environment.yml @@ -0,0 +1,7 @@ +name: gatk4_mergemutectstats +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::gatk4=4.4.0.0 diff --git a/modules/nf-core/gatk4/mergemutectstats/main.nf b/modules/nf-core/gatk4/mergemutectstats/main.nf new file mode 100644 index 0000000000..3a4913220c --- /dev/null +++ b/modules/nf-core/gatk4/mergemutectstats/main.nf @@ -0,0 +1,44 @@ +process GATK4_MERGEMUTECTSTATS { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gatk4:4.4.0.0--py36hdfd78af_0': + 'biocontainers/gatk4:4.4.0.0--py36hdfd78af_0' }" + + input: + tuple val(meta), path(stats) + + output: + tuple val(meta), path("*.vcf.gz.stats"), emit: stats + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + def input_list = stats.collect{ "--stats ${it}"}.join(' ') + + def avail_mem = 3072 + if (!task.memory) { + log.info '[GATK MergeMutectStats] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' + } else { + avail_mem = (task.memory.mega*0.8).intValue() + } + """ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + MergeMutectStats \\ + $input_list \\ + --output ${prefix}.vcf.gz.stats \\ + --tmp-dir . \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/gatk4/mergemutectstats/meta.yml b/modules/nf-core/gatk4/mergemutectstats/meta.yml new file mode 100644 index 0000000000..1269525657 --- /dev/null +++ b/modules/nf-core/gatk4/mergemutectstats/meta.yml @@ -0,0 +1,43 @@ +name: gatk4_mergemutectstats +description: Merges mutect2 stats generated on different intervals/regions +keywords: + - gatk4 + - merge + - mutect2 + - mutectstats +tools: + - gatk4: + description: Genome Analysis Toolkit (GATK4) + homepage: https://gatk.broadinstitute.org/hc/en-us + documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672s + tool_dev_url: https://github.com/broadinstitute/gatk + doi: "10.1158/1538-7445.AM2017-3590" + licence: ["BSD-3-clause"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - stats: + type: file + description: Stats file + pattern: "*.{stats}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - stats: + type: file + description: Stats file + pattern: "*.vcf.gz.stats" +authors: + - "@FriederikeHanssen" +maintainers: + - "@FriederikeHanssen" diff --git a/modules/nf-core/gatk4/mergevcfs/environment.yml b/modules/nf-core/gatk4/mergevcfs/environment.yml new file mode 100644 index 0000000000..d6c3e51a9f --- /dev/null +++ b/modules/nf-core/gatk4/mergevcfs/environment.yml @@ -0,0 +1,7 @@ +name: gatk4_mergevcfs +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::gatk4=4.4.0.0 diff --git a/modules/nf-core/gatk4/mergevcfs/main.nf b/modules/nf-core/gatk4/mergevcfs/main.nf new file mode 100644 index 0000000000..3362c2bdad --- /dev/null +++ b/modules/nf-core/gatk4/mergevcfs/main.nf @@ -0,0 +1,60 @@ +process GATK4_MERGEVCFS { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gatk4:4.4.0.0--py36hdfd78af_0': + 'biocontainers/gatk4:4.4.0.0--py36hdfd78af_0' }" + + input: + tuple val(meta), path(vcf) + tuple val(meta2), path(dict) + + output: + tuple val(meta), path('*.vcf.gz'), emit: vcf + tuple val(meta), path("*.tbi") , emit: tbi + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def input_list = vcf.collect{ "--INPUT $it"}.join(' ') + def reference_command = dict ? "--SEQUENCE_DICTIONARY $dict" : "" + + def avail_mem = 3072 + if (!task.memory) { + log.info '[GATK MergeVcfs] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' + } else { + avail_mem = (task.memory.mega*0.8).intValue() + } + """ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + MergeVcfs \\ + $input_list \\ + --OUTPUT ${prefix}.vcf.gz \\ + $reference_command \\ + --TMP_DIR . \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.vcf.gz + touch ${prefix}.vcf.gz.tbi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/gatk4/mergevcfs/meta.yml b/modules/nf-core/gatk4/mergevcfs/meta.yml new file mode 100644 index 0000000000..30290a854f --- /dev/null +++ b/modules/nf-core/gatk4/mergevcfs/meta.yml @@ -0,0 +1,52 @@ +name: gatk4_mergevcfs +description: Merges several vcf files +keywords: + - gatk4 + - merge + - vcf +tools: + - gatk4: + description: | + Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools + with a primary focus on variant discovery and genotyping. Its powerful processing engine + and high-performance computing features make it capable of taking on projects of any size. 
+ homepage: https://gatk.broadinstitute.org/hc/en-us + documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672s + doi: 10.1158/1538-7445.AM2017-3590 + licence: ["Apache-2.0"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test'] + - vcf: + type: list + description: Two or more VCF files + pattern: "*.{vcf,vcf.gz}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome'] + - ref_dict: + type: file + description: Optional Sequence Dictionary as input + pattern: "*.dict" +output: + - vcf: + type: file + description: merged vcf file + pattern: "*.vcf.gz" + - tbi: + type: file + description: index files for the merged vcf files + pattern: "*.tbi" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@kevinmenden" +maintainers: + - "@kevinmenden" diff --git a/modules/nf-core/gatk4/mutect2/environment.yml b/modules/nf-core/gatk4/mutect2/environment.yml new file mode 100644 index 0000000000..54da66ce57 --- /dev/null +++ b/modules/nf-core/gatk4/mutect2/environment.yml @@ -0,0 +1,7 @@ +name: gatk4_mutect2 +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::gatk4=4.4.0.0 diff --git a/modules/nf-core/gatk4/mutect2/main.nf b/modules/nf-core/gatk4/mutect2/main.nf new file mode 100644 index 0000000000..721e94f3e4 --- /dev/null +++ b/modules/nf-core/gatk4/mutect2/main.nf @@ -0,0 +1,75 @@ +process GATK4_MUTECT2 { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gatk4:4.4.0.0--py36hdfd78af_0': + 'biocontainers/gatk4:4.4.0.0--py36hdfd78af_0' }" + + input: + tuple val(meta), path(input), path(input_index), path(intervals) + tuple val(meta2), path(fasta) + tuple val(meta3), path(fai) + tuple val(meta4), path(dict) + path(germline_resource) + path(germline_resource_tbi) + path(panel_of_normals) + path(panel_of_normals_tbi) + + output: + tuple val(meta), path("*.vcf.gz") , emit: vcf + tuple val(meta), path("*.tbi") , emit: tbi + tuple val(meta), path("*.stats") , emit: stats + tuple val(meta), path("*.f1r2.tar.gz"), optional:true, emit: f1r2 + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def inputs = input.collect{ "--input $it"}.join(" ") + def interval_command = intervals ? "--intervals $intervals" : "" + def pon_command = panel_of_normals ? "--panel-of-normals $panel_of_normals" : "" + def gr_command = germline_resource ? "--germline-resource $germline_resource" : "" + + def avail_mem = 3072 + if (!task.memory) { + log.info '[GATK Mutect2] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' + } else { + avail_mem = (task.memory.mega*0.8).intValue() + } + """ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + Mutect2 \\ + $inputs \\ + --output ${prefix}.vcf.gz \\ + --reference $fasta \\ + $pon_command \\ + $gr_command \\ + $interval_command \\ + --tmp-dir . 
+        $args
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//')
+    END_VERSIONS
+    """
+
+    stub:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    touch ${prefix}.vcf.gz
+    touch ${prefix}.vcf.gz.tbi
+    touch ${prefix}.vcf.gz.stats
+    touch ${prefix}.f1r2.tar.gz
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/gatk4/mutect2/meta.yml b/modules/nf-core/gatk4/mutect2/meta.yml
new file mode 100644
index 0000000000..21c928ed96
--- /dev/null
+++ b/modules/nf-core/gatk4/mutect2/meta.yml
@@ -0,0 +1,107 @@
+name: gatk4_mutect2
+description: Call somatic SNVs and indels via local assembly of haplotypes.
+keywords:
+  - gatk4
+  - haplotype
+  - indels
+  - mutect2
+  - snvs
+  - somatic
+tools:
+  - gatk4:
+      description: |
+        Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools
+        with a primary focus on variant discovery and genotyping. Its powerful processing engine
+        and high-performance computing features make it capable of taking on projects of any size.
+      homepage: https://gatk.broadinstitute.org/hc/en-us
+      documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672
+      doi: 10.1158/1538-7445.AM2017-3590
+      licence: ["Apache-2.0"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test']
+  - input:
+      type: list
+      description: list of BAM files, also able to take CRAM as an input
+      pattern: "*.{bam,cram}"
+  - input_index:
+      type: list
+      description: list of BAM file indexes, also able to take CRAM indexes as an input
+      pattern: "*.{bam.bai,cram.crai}"
+  - intervals:
+      type: file
+      description: Specify the regions the tool is run on.
+      pattern: "*.{bed,interval_list}"
+  - meta2:
+      type: map
+      description: |
+        Groovy Map containing reference information
+        e.g. [ id:'genome' ]
+  - fasta:
+      type: file
+      description: The reference fasta file
+      pattern: "*.fasta"
+  - meta3:
+      type: map
+      description: |
+        Groovy Map containing reference information
+        e.g. [ id:'genome' ]
+  - fai:
+      type: file
+      description: Index of reference fasta file
+      pattern: "*.fasta.fai"
+  - meta4:
+      type: map
+      description: |
+        Groovy Map containing reference information
+        e.g. [ id:'genome' ]
+  - dict:
+      type: file
+      description: GATK sequence dictionary
+      pattern: "*.dict"
+  - germline_resource:
+      type: file
+      description: Population vcf of germline sequencing, containing allele fractions.
+      pattern: "*.vcf.gz"
+  - germline_resource_tbi:
+      type: file
+      description: Index file for the germline resource.
+      pattern: "*.vcf.gz.tbi"
+  - panel_of_normals:
+      type: file
+      description: vcf file to be used as a panel of normals.
+      pattern: "*.vcf.gz"
+  - panel_of_normals_tbi:
+      type: file
+      description: Index for the panel of normals.
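+# An illustrative tumor/normal invocation (sample ids, file names, and channel
+# shapes here are assumptions, not part of this module; pairing is conveyed to
+# Mutect2 via ext.args, e.g. '--normal-sample <normal id>'):
+#
+#   GATK4_MUTECT2(
+#       [ [ id:'patient1' ], [ normal.cram, tumor.cram ], [ normal.crai, tumor.crai ], [] ],
+#       [ [ id:'genome' ], fasta ],
+#       [ [ id:'genome' ], fai ],
+#       [ [ id:'genome' ], dict ],
+#       gnomad.vcf.gz, gnomad.vcf.gz.tbi,
+#       pon.vcf.gz, pon.vcf.gz.tbi
+#   )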
+ pattern: "*.vcf.gz.tbi" +output: + - vcf: + type: file + description: compressed vcf file + pattern: "*.vcf.gz" + - tbi: + type: file + description: Index of vcf file + pattern: "*vcf.gz.tbi" + - stats: + type: file + description: Stats file that pairs with output vcf file + pattern: "*vcf.gz.stats" + - f1r2: + type: file + description: file containing information to be passed to LearnReadOrientationModel (only outputted when tumor_normal_pair mode is run) + pattern: "*.f1r2.tar.gz" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@GCJMackenzie" + - "@ramprasadn" +maintainers: + - "@GCJMackenzie" + - "@ramprasadn" diff --git a/modules/nf-core/gatk4/variantrecalibrator/environment.yml b/modules/nf-core/gatk4/variantrecalibrator/environment.yml new file mode 100644 index 0000000000..619208a56d --- /dev/null +++ b/modules/nf-core/gatk4/variantrecalibrator/environment.yml @@ -0,0 +1,7 @@ +name: gatk4_variantrecalibrator +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::gatk4=4.4.0.0 diff --git a/modules/nf-core/gatk4/variantrecalibrator/main.nf b/modules/nf-core/gatk4/variantrecalibrator/main.nf new file mode 100644 index 0000000000..f9cd45ac94 --- /dev/null +++ b/modules/nf-core/gatk4/variantrecalibrator/main.nf @@ -0,0 +1,71 @@ +process GATK4_VARIANTRECALIBRATOR { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gatk4:4.4.0.0--py36hdfd78af_0': + 'biocontainers/gatk4:4.4.0.0--py36hdfd78af_0' }" + + input: + tuple val(meta), path(vcf), path(tbi) // input vcf and tbi of variants to recalibrate + path resource_vcf // resource vcf + path resource_tbi // resource tbi + val labels // string (or list of strings) containing dedicated resource labels already formatted with '--resource:' tag + path fasta + path fai + path dict + + output: + tuple val(meta), path("*.recal") , emit: recal + tuple val(meta), path("*.idx") , emit: idx + tuple val(meta), path("*.tranches"), emit: tranches + tuple val(meta), path("*plots.R") , emit: plots, optional:true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def reference_command = fasta ? "--reference $fasta " : '' + def labels_command = labels.join(' ') + + def avail_mem = 3072 + if (!task.memory) { + log.info '[GATK VariantRecalibrator] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' + } else { + avail_mem = (task.memory.mega*0.8).intValue() + } + """ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + VariantRecalibrator \\ + --variant $vcf \\ + --output ${prefix}.recal \\ + --tranches-file ${prefix}.tranches \\ + $reference_command \\ + --tmp-dir . 
\\
+        $labels_command \\
+        $args
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//')
+    END_VERSIONS
+    """
+
+    stub:
+    prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    touch ${prefix}.recal
+    touch ${prefix}.idx
+    touch ${prefix}.tranches
+    touch ${prefix}plots.R
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/gatk4/variantrecalibrator/meta.yml b/modules/nf-core/gatk4/variantrecalibrator/meta.yml
new file mode 100644
index 0000000000..39a415b61c
--- /dev/null
+++ b/modules/nf-core/gatk4/variantrecalibrator/meta.yml
@@ -0,0 +1,84 @@
+name: gatk4_variantrecalibrator
+description: |
+  Build a recalibration model to score variant quality for filtering purposes.
+  It is highly recommended to follow GATK best practices when using this module:
+  the Gaussian mixture model requires a large number of samples to produce
+  optimal results (for example, 30 samples for exome data). For more details see
+  https://gatk.broadinstitute.org/hc/en-us/articles/4402736812443-Which-training-sets-arguments-should-I-use-for-running-VQSR-
+keywords:
+  - gatk4
+  - recalibration model
+  - variantrecalibrator
+tools:
+  - gatk4:
+      description: |
+        Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools
+        with a primary focus on variant discovery and genotyping. Its powerful processing engine
+        and high-performance computing features make it capable of taking on projects of any size.
+      homepage: https://gatk.broadinstitute.org/hc/en-us
+      documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672
+      doi: 10.1158/1538-7445.AM2017-3590
+      licence: ["Apache-2.0"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test' ]
+  - vcf:
+      type: file
+      description: input vcf file containing the variants to be recalibrated
+      pattern: "*.vcf.gz"
+  - tbi:
+      type: file
+      description: tbi file matching with -vcf
+      pattern: "*.vcf.gz.tbi"
+  - resource_vcf:
+      type: file
+      description: all resource vcf files that are used with the corresponding '--resource' label
+      pattern: "*.vcf.gz"
+  - resource_tbi:
+      type: file
+      description: all resource tbi files that are used with the corresponding '--resource' label
+      pattern: "*.vcf.gz.tbi"
+  - labels:
+      type: string
+      description: necessary arguments for GATK VariantRecalibrator. Specified to directly match the resources provided. More information can be found at https://gatk.broadinstitute.org/hc/en-us/articles/5358906115227-VariantRecalibrator
+  - fasta:
+      type: file
+      description: The reference fasta file
+      pattern: "*.fasta"
+  - fai:
+      type: file
+      description: Index of reference fasta file
+      pattern: "*.fasta.fai"
+  - dict:
+      type: file
+      description: GATK sequence dictionary
+      pattern: "*.dict"
+output:
+  - recal:
+      type: file
+      description: Output recal file used by ApplyVQSR
+      pattern: "*.recal"
+  - idx:
+      type: file
+      description: Index file for the recal output file
+      pattern: "*.idx"
+  - tranches:
+      type: file
+      description: Output tranches file used by ApplyVQSR
+      pattern: "*.tranches"
+  - plots:
+      type: file
+      description: Optional output R script file to aid in visualization of the input data and learned model.
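+# For reference, a typical entry in the 'labels' input follows the GATK
+# '--resource' syntax documented for VQSR (the file name here is an example,
+# not a module default):
+#
+#   --resource:hapmap,known=false,training=true,truth=true,prior=15.0 hapmap_3.3.vcf.gz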
+      pattern: "*plots.R"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@GCJMackenzie"
+  - "@nickhsmith"
+maintainers:
+  - "@GCJMackenzie"
+  - "@nickhsmith"
diff --git a/modules/nf-core/gatk4spark/applybqsr/environment.yml b/modules/nf-core/gatk4spark/applybqsr/environment.yml
new file mode 100644
index 0000000000..709dd488c9
--- /dev/null
+++ b/modules/nf-core/gatk4spark/applybqsr/environment.yml
@@ -0,0 +1,7 @@
+name: gatk4spark_applybqsr
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::gatk4-spark=4.4.0.0
diff --git a/modules/nf-core/gatk4spark/applybqsr/main.nf b/modules/nf-core/gatk4spark/applybqsr/main.nf
new file mode 100644
index 0000000000..170dbeeafd
--- /dev/null
+++ b/modules/nf-core/gatk4spark/applybqsr/main.nf
@@ -0,0 +1,53 @@
+process GATK4SPARK_APPLYBQSR {
+    tag "$meta.id"
+    label 'process_low'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/gatk4-spark:4.4.0.0--hdfd78af_0':
+        'biocontainers/gatk4-spark:4.4.0.0--hdfd78af_0' }"
+
+    input:
+    tuple val(meta), path(input), path(input_index), path(bqsr_table), path(intervals)
+    path fasta
+    path fai
+    path dict
+
+    output:
+    tuple val(meta), path("*.bam") , emit: bam, optional: true
+    tuple val(meta), path("*.cram"), emit: cram, optional: true
+    path "versions.yml"            , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def interval_command = intervals ? "--intervals $intervals" : ""
+
+    def avail_mem = 3072
+    if (!task.memory) {
+        log.info '[GATK ApplyBQSRSpark] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.'
+    } else {
+        avail_mem = (task.memory.mega*0.8).intValue()
+    }
+    """
+    gatk \\
+        --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\
+        ApplyBQSRSpark \\
+        --input $input \\
+        --output ${prefix}.${input.getExtension()} \\
+        --reference $fasta \\
+        --bqsr-recal-file $bqsr_table \\
+        $interval_command \\
+        --spark-master local[${task.cpus}] \\
+        --tmp-dir . \\
+        $args
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/gatk4spark/applybqsr/meta.yml b/modules/nf-core/gatk4spark/applybqsr/meta.yml
new file mode 100644
index 0000000000..4904568d2e
--- /dev/null
+++ b/modules/nf-core/gatk4spark/applybqsr/meta.yml
@@ -0,0 +1,76 @@
+name: gatk4spark_applybqsr
+description: Apply base quality score recalibration (BQSR) to a bam file
+keywords:
+  - bam
+  - base quality score recalibration
+  - bqsr
+  - cram
+  - gatk4spark
tools:
+  - gatk4:
+      description: |
+        Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools
+        with a primary focus on variant discovery and genotyping. Its powerful processing engine
+        and high-performance computing features make it capable of taking on projects of any size.
+      homepage: https://gatk.broadinstitute.org/hc/en-us
+      documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672
+      doi: 10.1158/1538-7445.AM2017-3590
+      licence: ["Apache-2.0"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g.
[ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM file from alignment + pattern: "*.{bam,cram}" + - input_index: + type: file + description: BAI/CRAI file from alignment + pattern: "*.{bai,crai}" + - bqsr_table: + type: file + description: Recalibration table from gatk4_baserecalibrator + - intervals: + type: file + description: Bed file with the genomic regions included in the library (optional) + - fasta: + type: file + description: The reference fasta file + pattern: "*.fasta" + - fai: + type: file + description: Index of reference fasta file + pattern: "*.fasta.fai" + - dict: + type: file + description: GATK sequence dictionary + pattern: "*.dict" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - bam: + type: file + description: Recalibrated BAM file + pattern: "*.{bam}" + - cram: + type: file + description: Recalibrated CRAM file + pattern: "*.{cram}" +authors: + - "@yocra3" + - "@FriederikeHanssen" + - "@maxulysse" +maintainers: + - "@yocra3" + - "@FriederikeHanssen" + - "@maxulysse" diff --git a/modules/nf-core/gatk4spark/baserecalibrator/environment.yml b/modules/nf-core/gatk4spark/baserecalibrator/environment.yml new file mode 100644 index 0000000000..bf2568a2a8 --- /dev/null +++ b/modules/nf-core/gatk4spark/baserecalibrator/environment.yml @@ -0,0 +1,7 @@ +name: gatk4spark_baserecalibrator +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::gatk4-spark=4.4.0.0 diff --git a/modules/nf-core/gatk4spark/baserecalibrator/main.nf b/modules/nf-core/gatk4spark/baserecalibrator/main.nf new file mode 100644 index 0000000000..ee44bf7d66 --- /dev/null +++ b/modules/nf-core/gatk4spark/baserecalibrator/main.nf @@ -0,0 +1,54 @@ +process GATK4SPARK_BASERECALIBRATOR { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gatk4-spark:4.4.0.0--hdfd78af_0': + 'biocontainers/gatk4-spark:4.4.0.0--hdfd78af_0' }" + + input: + tuple val(meta), path(input), path(input_index), path(intervals) + path fasta + path fai + path dict + path known_sites + path known_sites_tbi + + output: + tuple val(meta), path("*.table"), emit: table + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def interval_command = intervals ? "--intervals $intervals" : "" + def sites_command = known_sites.collect{"--known-sites $it"}.join(' ') + + def avail_mem = 3072 + if (!task.memory) { + log.info '[GATK BaseRecalibratorSpark] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' + } else { + avail_mem = (task.memory.mega*0.8).intValue() + } + """ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + BaseRecalibratorSpark \\ + --input $input \\ + --output ${prefix}.table \\ + --reference $fasta \\ + $interval_command \\ + $sites_command \\ + --spark-master local[${task.cpus}] \\ + --tmp-dir . 
\\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/gatk4spark/baserecalibrator/meta.yml b/modules/nf-core/gatk4spark/baserecalibrator/meta.yml new file mode 100644 index 0000000000..dd334a225f --- /dev/null +++ b/modules/nf-core/gatk4spark/baserecalibrator/meta.yml @@ -0,0 +1,77 @@ +name: gatk4spark_baserecalibrator +description: Generate recalibration table for Base Quality Score Recalibration (BQSR) +keywords: + - base quality score recalibration + - table + - bqsr + - gatk4spark + - sort +tools: + - gatk4: + description: | + Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools + with a primary focus on variant discovery and genotyping. Its powerful processing engine + and high-performance computing features make it capable of taking on projects of any size. + homepage: https://gatk.broadinstitute.org/hc/en-us + documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672s + doi: 10.1158/1538-7445.AM2017-3590 + licence: ["Apache-2.0"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM file from alignment + pattern: "*.{bam,cram}" + - input_index: + type: file + description: BAI/CRAI file from alignment + pattern: "*.{bai,crai}" + - intervals: + type: file + description: Bed file with the genomic regions included in the library (optional) + - fasta: + type: file + description: The reference fasta file + pattern: "*.fasta" + - fai: + type: file + description: Index of reference fasta file + pattern: "*.fasta.fai" + - dict: + type: file + description: GATK sequence dictionary + pattern: "*.dict" + - known_sites: + type: file + description: VCF files with known sites for indels / snps (optional) + pattern: "*.vcf.gz" + - known_sites_tbi: + type: file + description: Tabix index of the known_sites (optional) + pattern: "*.vcf.gz.tbi" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - table: + type: file + description: Recalibration table from BaseRecalibrator + pattern: "*.{table}" +authors: + - "@yocra3" + - "@FriederikeHanssen" + - "@maxulysse" +maintainers: + - "@yocra3" + - "@FriederikeHanssen" + - "@maxulysse" diff --git a/modules/nf-core/gatk4spark/markduplicates/environment.yml b/modules/nf-core/gatk4spark/markduplicates/environment.yml new file mode 100644 index 0000000000..3e33d7fe3d --- /dev/null +++ b/modules/nf-core/gatk4spark/markduplicates/environment.yml @@ -0,0 +1,7 @@ +name: gatk4spark_markduplicates +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::gatk4-spark=4.4.0.0 diff --git a/modules/nf-core/gatk4spark/markduplicates/main.nf b/modules/nf-core/gatk4spark/markduplicates/main.nf new file mode 100644 index 0000000000..61e295c839 --- /dev/null +++ b/modules/nf-core/gatk4spark/markduplicates/main.nf @@ -0,0 +1,52 @@ +process GATK4SPARK_MARKDUPLICATES { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/gatk4-spark:4.4.0.0--hdfd78af_0': + 'biocontainers/gatk4-spark:4.4.0.0--hdfd78af_0' }" + + input: + tuple val(meta), path(bam) + path fasta + path fasta_fai + path dict + + output: + tuple val(meta), path("${prefix}"), emit: output + tuple val(meta), path("${prefix}.bai"), emit: bam_index, optional:true + tuple val(meta), path("*.metrics"), emit: metrics, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + def input_list = bam.collect{"--input $it"}.join(' ') + + def avail_mem = 3072 + if (!task.memory) { + log.info '[GATK MarkDuplicatesSpark] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.' + } else { + avail_mem = (task.memory.mega*0.8).intValue() + } + """ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + MarkDuplicatesSpark \\ + $input_list \\ + --output $prefix \\ + --reference $fasta \\ + --spark-master local[${task.cpus}] \\ + --tmp-dir . \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + openjdk: \$(echo \$(java -version 2>&1) | grep version | sed 's/\"//g' | cut -f3 -d ' ') + END_VERSIONS + """ +} diff --git a/modules/nf-core/gatk4spark/markduplicates/meta.yml b/modules/nf-core/gatk4spark/markduplicates/meta.yml new file mode 100644 index 0000000000..016a215b25 --- /dev/null +++ b/modules/nf-core/gatk4spark/markduplicates/meta.yml @@ -0,0 +1,65 @@ +name: gatk4spark_markduplicates +description: This tool locates and tags duplicate reads in a BAM or SAM file, where duplicate reads are defined as originating from a single fragment of DNA. +keywords: + - bam + - gatk4spark + - markduplicates + - sort +tools: + - gatk4: + description: Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools with a primary focus on variant discovery and genotyping. Its powerful processing engine and high-performance computing features make it capable of taking on projects of any size. + homepage: https://gatk.broadinstitute.org/hc/en-us + documentation: https://gatk.broadinstitute.org/hc/en-us/articles/360037052812-MarkDuplicates-Picard- + tool_dev_url: https://github.com/broadinstitute/gatk + doi: 10.1158/1538-7445.AM2017-3590 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: Sorted BAM file + pattern: "*.{bam}" + - fasta: + type: file + description: The reference fasta file + pattern: "*.fasta" + - fai: + type: file + description: Index of reference fasta file + pattern: "*.fasta.fai" + - dict: + type: file + description: GATK sequence dictionary + pattern: "*.dict" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - output: + type: file + description: Marked duplicates BAM/CRAM file + pattern: "*.{bam,cram}" + - bam_index: + type: file + description: Optional BAM index file + pattern: "*.bai" +authors: + - "@ajodeh-juma" + - "@FriederikeHanssen" + - "@maxulysse" + - "@SusiJo" +maintainers: + - "@ajodeh-juma" + - "@FriederikeHanssen" + - "@maxulysse" + - "@SusiJo" diff --git a/modules/nf-core/manta/germline/environment.yml b/modules/nf-core/manta/germline/environment.yml new file mode 100644 index 0000000000..4a63d3084b --- /dev/null +++ b/modules/nf-core/manta/germline/environment.yml @@ -0,0 +1,7 @@ +name: manta_germline +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::manta=1.6.0 diff --git a/modules/nf-core/manta/germline/main.nf b/modules/nf-core/manta/germline/main.nf new file mode 100644 index 0000000000..5d5666c6e5 --- /dev/null +++ b/modules/nf-core/manta/germline/main.nf @@ -0,0 +1,81 @@ +process MANTA_GERMLINE { + tag "$meta.id" + label 'process_medium' + label 'error_retry' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/manta:1.6.0--h9ee0642_1' : + 'biocontainers/manta:1.6.0--h9ee0642_1' }" + + input: + //Matching the target bed with the input sample allows to parallelize the same sample run across different intervals or a single bed file + tuple val(meta), path(input), path(index), path(target_bed), path(target_bed_tbi) + tuple val(meta2), path(fasta) + tuple val(meta3), path(fai) + path(config) + + output: + tuple val(meta), path("*candidate_small_indels.vcf.gz") , emit: candidate_small_indels_vcf + tuple val(meta), path("*candidate_small_indels.vcf.gz.tbi"), emit: candidate_small_indels_vcf_tbi + tuple val(meta), path("*candidate_sv.vcf.gz") , emit: candidate_sv_vcf + tuple val(meta), path("*candidate_sv.vcf.gz.tbi") , emit: candidate_sv_vcf_tbi + tuple val(meta), path("*diploid_sv.vcf.gz") , emit: diploid_sv_vcf + tuple val(meta), path("*diploid_sv.vcf.gz.tbi") , emit: diploid_sv_vcf_tbi + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def input_files = input.collect{"--bam ${it}"}.join(' ') + def options_manta = target_bed ? "--callRegions $target_bed" : "" + def config_option = config ? 
"--config ${config}" : "" + """ + configManta.py \\ + ${input_files} \\ + ${config_option} \\ + --reference $fasta \\ + --runDir manta \\ + $options_manta \\ + $args + + python manta/runWorkflow.py -m local -j $task.cpus + + mv manta/results/variants/candidateSmallIndels.vcf.gz \\ + ${prefix}.candidate_small_indels.vcf.gz + mv manta/results/variants/candidateSmallIndels.vcf.gz.tbi \\ + ${prefix}.candidate_small_indels.vcf.gz.tbi + mv manta/results/variants/candidateSV.vcf.gz \\ + ${prefix}.candidate_sv.vcf.gz + mv manta/results/variants/candidateSV.vcf.gz.tbi \\ + ${prefix}.candidate_sv.vcf.gz.tbi + mv manta/results/variants/diploidSV.vcf.gz \\ + ${prefix}.diploid_sv.vcf.gz + mv manta/results/variants/diploidSV.vcf.gz.tbi \\ + ${prefix}.diploid_sv.vcf.gz.tbi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + manta: \$( configManta.py --version ) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.candidate_small_indels.vcf.gz + touch ${prefix}.candidate_small_indels.vcf.gz.tbi + touch ${prefix}.candidate_sv.vcf.gz + touch ${prefix}.candidate_sv.vcf.gz.tbi + touch ${prefix}.diploid_sv.vcf.gz + touch ${prefix}.diploid_sv.vcf.gz.tbi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + manta: \$( configManta.py --version ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/manta/germline/meta.yml b/modules/nf-core/manta/germline/meta.yml new file mode 100644 index 0000000000..72ed15f8bc --- /dev/null +++ b/modules/nf-core/manta/germline/meta.yml @@ -0,0 +1,104 @@ +name: manta_germline +description: Manta calls structural variants (SVs) and indels from mapped paired-end sequencing reads. It is optimized for analysis of germline variation in small sets of individuals and somatic variation in tumor/normal sample pairs. +keywords: + - somatic + - wgs + - wxs + - panel + - vcf + - structural variants + - small indels +tools: + - manta: + description: Structural variant and indel caller for mapped sequencing data + homepage: https://github.com/Illumina/manta + documentation: https://github.com/Illumina/manta/blob/v1.6.0/docs/userGuide/README.md + tool_dev_url: https://github.com/Illumina/manta + doi: "10.1093/bioinformatics/btv710" + licence: ["GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM/SAM file. For joint calling use a list of files. + pattern: "*.{bam,cram,sam}" + - index: + type: file + description: BAM/CRAM/SAM index file. For joint calling use a list of files. + pattern: "*.{bai,crai,sai}" + - target_bed: + type: file + description: BED file containing target regions for variant calling + pattern: "*.{bed}" + - target_bed_tbi: + type: file + description: Index for BED file containing target regions for variant calling + pattern: "*.{bed.tbi}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fasta: + type: file + description: Genome reference FASTA file + pattern: "*.{fa,fasta}" + - meta3: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fai: + type: file + description: Genome reference FASTA index file + pattern: "*.{fa.fai,fasta.fai}" + - config: + type: file + description: Manta configuration file + pattern: "*.{ini,conf,config}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - candidate_small_indels_vcf: + type: file + description: Gzipped VCF file containing variants + pattern: "*.{vcf.gz}" + - candidate_small_indels_vcf_tbi: + type: file + description: Index for gzipped VCF file containing variants + pattern: "*.{vcf.gz.tbi}" + - candidate_sv_vcf: + type: file + description: Gzipped VCF file containing variants + pattern: "*.{vcf.gz}" + - candidate_sv_vcf_tbi: + type: file + description: Index for gzipped VCF file containing variants + pattern: "*.{vcf.gz.tbi}" + - diploid_sv_vcf: + type: file + description: Gzipped VCF file containing variants + pattern: "*.{vcf.gz}" + - diploid_sv_vcf_tbi: + type: file + description: Index for gzipped VCF file containing variants + pattern: "*.{vcf.gz.tbi}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@maxulysse" + - "@ramprasadn" + - "@nvnieuwk" +maintainers: + - "@maxulysse" + - "@ramprasadn" + - "@nvnieuwk" diff --git a/modules/nf-core/manta/somatic/environment.yml b/modules/nf-core/manta/somatic/environment.yml new file mode 100644 index 0000000000..aac8827dfc --- /dev/null +++ b/modules/nf-core/manta/somatic/environment.yml @@ -0,0 +1,7 @@ +name: manta_somatic +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::manta=1.6.0 diff --git a/modules/nf-core/manta/somatic/main.nf b/modules/nf-core/manta/somatic/main.nf new file mode 100644 index 0000000000..07511b2f0b --- /dev/null +++ b/modules/nf-core/manta/somatic/main.nf @@ -0,0 +1,88 @@ +process MANTA_SOMATIC { + tag "$meta.id" + label 'process_medium' + label 'error_retry' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/manta:1.6.0--h9ee0642_1' : + 'biocontainers/manta:1.6.0--h9ee0642_1' }" + + input: + tuple val(meta), path(input_normal), path(input_index_normal), path(input_tumor), path(input_index_tumor), path(target_bed), path(target_bed_tbi) + tuple val(meta2), path(fasta) + tuple val(meta3), path(fai) + path(config) + + output: + tuple val(meta), path("*.candidate_small_indels.vcf.gz") , emit: candidate_small_indels_vcf + tuple val(meta), path("*.candidate_small_indels.vcf.gz.tbi") , emit: candidate_small_indels_vcf_tbi + tuple val(meta), path("*.candidate_sv.vcf.gz") , emit: candidate_sv_vcf + tuple val(meta), path("*.candidate_sv.vcf.gz.tbi") , emit: candidate_sv_vcf_tbi + tuple val(meta), path("*.diploid_sv.vcf.gz") , emit: diploid_sv_vcf + tuple val(meta), path("*.diploid_sv.vcf.gz.tbi") , emit: diploid_sv_vcf_tbi + tuple val(meta), path("*.somatic_sv.vcf.gz") , emit: somatic_sv_vcf + tuple val(meta), path("*.somatic_sv.vcf.gz.tbi") , emit: somatic_sv_vcf_tbi + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def options_manta = target_bed ? "--callRegions $target_bed" : "" + def config_option = config ? 
"--config ${config}" : "" + """ + configManta.py \\ + --tumorBam $input_tumor \\ + --normalBam $input_normal \\ + --reference $fasta \\ + ${config_option} \\ + --runDir manta \\ + $options_manta \\ + $args + + python manta/runWorkflow.py -m local -j $task.cpus + + mv manta/results/variants/candidateSmallIndels.vcf.gz \\ + ${prefix}.candidate_small_indels.vcf.gz + mv manta/results/variants/candidateSmallIndels.vcf.gz.tbi \\ + ${prefix}.candidate_small_indels.vcf.gz.tbi + mv manta/results/variants/candidateSV.vcf.gz \\ + ${prefix}.candidate_sv.vcf.gz + mv manta/results/variants/candidateSV.vcf.gz.tbi \\ + ${prefix}.candidate_sv.vcf.gz.tbi + mv manta/results/variants/diploidSV.vcf.gz \\ + ${prefix}.diploid_sv.vcf.gz + mv manta/results/variants/diploidSV.vcf.gz.tbi \\ + ${prefix}.diploid_sv.vcf.gz.tbi + mv manta/results/variants/somaticSV.vcf.gz \\ + ${prefix}.somatic_sv.vcf.gz + mv manta/results/variants/somaticSV.vcf.gz.tbi \\ + ${prefix}.somatic_sv.vcf.gz.tbi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + manta: \$( configManta.py --version ) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.candidate_small_indels.vcf.gz + touch ${prefix}.candidate_small_indels.vcf.gz.tbi + touch ${prefix}.candidate_sv.vcf.gz + touch ${prefix}.candidate_sv.vcf.gz.tbi + touch ${prefix}.diploid_sv.vcf.gz + touch ${prefix}.diploid_sv.vcf.gz.tbi + touch ${prefix}.somatic_sv.vcf.gz + touch ${prefix}.somatic_sv.vcf.gz.tbi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + manta: \$( configManta.py --version ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/manta/somatic/meta.yml b/modules/nf-core/manta/somatic/meta.yml new file mode 100644 index 0000000000..e658edaaa4 --- /dev/null +++ b/modules/nf-core/manta/somatic/meta.yml @@ -0,0 +1,118 @@ +name: manta_somatic +description: Manta calls structural variants (SVs) and indels from mapped paired-end sequencing reads. It is optimized for analysis of germline variation in small sets of individuals and somatic variation in tumor/normal sample pairs. +keywords: + - somatic + - wgs + - wxs + - panel + - vcf + - structural variants + - small indels +tools: + - manta: + description: Structural variant and indel caller for mapped sequencing data + homepage: https://github.com/Illumina/manta + documentation: https://github.com/Illumina/manta/blob/v1.6.0/docs/userGuide/README.md + tool_dev_url: https://github.com/Illumina/manta + doi: "10.1093/bioinformatics/btv710" + licence: ["GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input_normal: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - input_index_normal: + type: file + description: BAM/CRAM/SAM index file + pattern: "*.{bai,crai,sai}" + - input_tumor: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - input_index_tumor: + type: file + description: BAM/CRAM/SAM index file + pattern: "*.{bai,crai,sai}" + - target_bed: + type: file + description: BED file containing target regions for variant calling + pattern: "*.{bed}" + - target_bed_tbi: + type: file + description: Index for BED file containing target regions for variant calling + pattern: "*.{bed.tbi}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. 
[ id:'genome' ] + - fasta: + type: file + description: Genome reference FASTA file + pattern: "*.{fa,fasta}" + - meta3: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fai: + type: file + description: Genome reference FASTA index file + pattern: "*.{fa.fai,fasta.fai}" + - config: + type: file + description: Manta configuration file + pattern: "*.{ini,conf,config}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - candidate_small_indels_vcf: + type: file + description: Gzipped VCF file containing variants + pattern: "*.{vcf.gz}" + - candidate_small_indels_vcf_tbi: + type: file + description: Index for gzipped VCF file containing variants + pattern: "*.{vcf.gz.tbi}" + - candidate_sv_vcf: + type: file + description: Gzipped VCF file containing variants + pattern: "*.{vcf.gz}" + - candidate_sv_vcf_tbi: + type: file + description: Index for gzipped VCF file containing variants + pattern: "*.{vcf.gz.tbi}" + - diploid_sv_vcf: + type: file + description: Gzipped VCF file containing variants + pattern: "*.{vcf.gz}" + - diploid_sv_vcf_tbi: + type: file + description: Index for gzipped VCF file containing variants + pattern: "*.{vcf.gz.tbi}" + - somatic_sv_vcf: + type: file + description: Gzipped VCF file containing variants + pattern: "*.{vcf.gz}" + - somatic_sv_vcf_tbi: + type: file + description: Index for gzipped VCF file containing variants + pattern: "*.{vcf.gz.tbi}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@FriederikeHanssen" + - "@nvnieuwk" +maintainers: + - "@FriederikeHanssen" + - "@nvnieuwk" diff --git a/modules/nf-core/manta/tumoronly/environment.yml b/modules/nf-core/manta/tumoronly/environment.yml new file mode 100644 index 0000000000..cf5db361e0 --- /dev/null +++ b/modules/nf-core/manta/tumoronly/environment.yml @@ -0,0 +1,7 @@ +name: manta_tumoronly +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::manta=1.6.0 diff --git a/modules/nf-core/manta/tumoronly/main.nf b/modules/nf-core/manta/tumoronly/main.nf new file mode 100644 index 0000000000..b047299571 --- /dev/null +++ b/modules/nf-core/manta/tumoronly/main.nf @@ -0,0 +1,79 @@ +process MANTA_TUMORONLY { + tag "$meta.id" + label 'process_medium' + label 'error_retry' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/manta:1.6.0--h9ee0642_1' : + 'biocontainers/manta:1.6.0--h9ee0642_1' }" + + input: + tuple val(meta), path(input), path(input_index), path(target_bed), path(target_bed_tbi) + tuple val(meta2), path(fasta) + tuple val(meta3), path(fai) + path(config) + + output: + tuple val(meta), path("*candidate_small_indels.vcf.gz") , emit: candidate_small_indels_vcf + tuple val(meta), path("*candidate_small_indels.vcf.gz.tbi"), emit: candidate_small_indels_vcf_tbi + tuple val(meta), path("*candidate_sv.vcf.gz") , emit: candidate_sv_vcf + tuple val(meta), path("*candidate_sv.vcf.gz.tbi") , emit: candidate_sv_vcf_tbi + tuple val(meta), path("*tumor_sv.vcf.gz") , emit: tumor_sv_vcf + tuple val(meta), path("*tumor_sv.vcf.gz.tbi") , emit: tumor_sv_vcf_tbi + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def options_manta = target_bed ? "--callRegions $target_bed" : "" + def config_option = config ? "--config ${config}" : "" + """ + configManta.py \\ + --tumorBam $input \\ + --reference $fasta \\ + ${config_option} \\ + --runDir manta \\ + $options_manta \\ + $args + + python manta/runWorkflow.py -m local -j $task.cpus + + mv manta/results/variants/candidateSmallIndels.vcf.gz \\ + ${prefix}.candidate_small_indels.vcf.gz + mv manta/results/variants/candidateSmallIndels.vcf.gz.tbi \\ + ${prefix}.candidate_small_indels.vcf.gz.tbi + mv manta/results/variants/candidateSV.vcf.gz \\ + ${prefix}.candidate_sv.vcf.gz + mv manta/results/variants/candidateSV.vcf.gz.tbi \\ + ${prefix}.candidate_sv.vcf.gz.tbi + mv manta/results/variants/tumorSV.vcf.gz \\ + ${prefix}.tumor_sv.vcf.gz + mv manta/results/variants/tumorSV.vcf.gz.tbi \\ + ${prefix}.tumor_sv.vcf.gz.tbi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + manta: \$( configManta.py --version ) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.candidate_small_indels.vcf.gz + touch ${prefix}.candidate_small_indels.vcf.gz.tbi + touch ${prefix}.candidate_sv.vcf.gz + touch ${prefix}.candidate_sv.vcf.gz.tbi + touch ${prefix}.tumor_sv.vcf.gz + touch ${prefix}.tumor_sv.vcf.gz.tbi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + manta: \$( configManta.py --version ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/manta/tumoronly/meta.yml b/modules/nf-core/manta/tumoronly/meta.yml new file mode 100644 index 0000000000..63556c59b4 --- /dev/null +++ b/modules/nf-core/manta/tumoronly/meta.yml @@ -0,0 +1,102 @@ +name: manta_tumoronly +description: Manta calls structural variants (SVs) and indels from mapped paired-end sequencing reads. It is optimized for analysis of germline variation in small sets of individuals and somatic variation in tumor/normal sample pairs. +keywords: + - somatic + - wgs + - wxs + - panel + - vcf + - structural variants + - small indels +tools: + - manta: + description: Structural variant and indel caller for mapped sequencing data + homepage: https://github.com/Illumina/manta + documentation: https://github.com/Illumina/manta/blob/v1.6.0/docs/userGuide/README.md + tool_dev_url: https://github.com/Illumina/manta + doi: "10.1093/bioinformatics/btv710" + licence: ["GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - input_index: + type: file + description: BAM/CRAM/SAM index file + pattern: "*.{bai,crai,sai}" + - target_bed: + type: file + description: BED file containing target regions for variant calling + pattern: "*.{bed}" + - target_bed_tbi: + type: file + description: Index for BED file containing target regions for variant calling + pattern: "*.{bed.tbi}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fasta: + type: file + description: Genome reference FASTA file + pattern: "*.{fa,fasta}" + - meta3: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fai: + type: file + description: Genome reference FASTA index file + pattern: "*.{fa.fai,fasta.fai}" + - config: + type: file + description: Manta configuration file + pattern: "*.{ini,conf,config}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - candidate_small_indels_vcf: + type: file + description: Gzipped VCF file containing variants + pattern: "*.{vcf.gz}" + - candidate_small_indels_vcf_tbi: + type: file + description: Index for gzipped VCF file containing variants + pattern: "*.{vcf.gz.tbi}" + - candidate_sv_vcf: + type: file + description: Gzipped VCF file containing variants + pattern: "*.{vcf.gz}" + - candidate_sv_vcf_tbi: + type: file + description: Index for gzipped VCF file containing variants + pattern: "*.{vcf.gz.tbi}" + - tumor_sv_vcf: + type: file + description: Gzipped VCF file containing variants + pattern: "*.{vcf.gz}" + - tumor_sv_vcf_tbi: + type: file + description: Index for gzipped VCF file containing variants + pattern: "*.{vcf.gz.tbi}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@maxulysse" + - "@nvnieuwk" +maintainers: + - "@maxulysse" + - "@nvnieuwk" diff --git a/modules/nf-core/mosdepth/environment.yml b/modules/nf-core/mosdepth/environment.yml new file mode 100644 index 0000000000..b12e3cb127 --- /dev/null +++ b/modules/nf-core/mosdepth/environment.yml @@ -0,0 +1,8 @@ +name: mosdepth +channels: + - conda-forge + - bioconda + - defaults +dependencies: + # renovate: datasource=conda depName=bioconda/mosdepth + - mosdepth=0.3.3 diff --git a/modules/nf-core/mosdepth/main.nf b/modules/nf-core/mosdepth/main.nf new file mode 100644 index 0000000000..7dd13ffb51 --- /dev/null +++ b/modules/nf-core/mosdepth/main.nf @@ -0,0 +1,80 @@ +process MOSDEPTH { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/mosdepth:0.3.3--hdfd78af_1' : + 'biocontainers/mosdepth:0.3.3--hdfd78af_1'}" + + input: + tuple val(meta), path(bam), path(bai), path(bed) + tuple val(meta2), path(fasta) + + output: + tuple val(meta), path('*.global.dist.txt') , emit: global_txt + tuple val(meta), path('*.summary.txt') , emit: summary_txt + tuple val(meta), path('*.region.dist.txt') , optional:true, emit: regions_txt + tuple val(meta), path('*.per-base.d4') , optional:true, emit: per_base_d4 + tuple val(meta), path('*.per-base.bed.gz') , optional:true, emit: per_base_bed + tuple val(meta), path('*.per-base.bed.gz.csi') , optional:true, emit: per_base_csi + tuple val(meta), path('*.regions.bed.gz') , optional:true, emit: regions_bed + tuple val(meta), path('*.regions.bed.gz.csi') , optional:true, emit: regions_csi + tuple val(meta), path('*.quantized.bed.gz') , optional:true, emit: quantized_bed + tuple val(meta), path('*.quantized.bed.gz.csi') , optional:true, emit: quantized_csi + tuple val(meta), path('*.thresholds.bed.gz') , optional:true, emit: thresholds_bed + tuple val(meta), path('*.thresholds.bed.gz.csi'), optional:true, emit: thresholds_csi + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def reference = fasta ? "--fasta ${fasta}" : "" + def interval = bed ? "--by ${bed}" : "" + if (bed && args.contains("--by")) { + error "'--by' can only be specified once when running mosdepth! Either remove input BED file definition or remove '--by' from 'ext.args' definition" + } + if (!bed && args.contains("--thresholds")) { + error "'--thresholds' can only be specified in conjunction with '--by'" + } + + """ + mosdepth \\ + --threads $task.cpus \\ + $interval \\ + $reference \\ + $args \\ + $prefix \\ + $bam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + mosdepth: \$(mosdepth --version 2>&1 | sed 's/^.*mosdepth //; s/ .*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.global.dist.txt + touch ${prefix}.region.dist.txt + touch ${prefix}.summary.txt + touch ${prefix}.per-base.d4 + touch ${prefix}.per-base.bed.gz + touch ${prefix}.per-base.bed.gz.csi + touch ${prefix}.regions.bed.gz + touch ${prefix}.regions.bed.gz.csi + touch ${prefix}.quantized.bed.gz + touch ${prefix}.quantized.bed.gz.csi + touch ${prefix}.thresholds.bed.gz + touch ${prefix}.thresholds.bed.gz.csi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + mosdepth: \$(mosdepth --version 2>&1 | sed 's/^.*mosdepth //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/mosdepth/meta.yml b/modules/nf-core/mosdepth/meta.yml new file mode 100644 index 0000000000..76263b5af9 --- /dev/null +++ b/modules/nf-core/mosdepth/meta.yml @@ -0,0 +1,114 @@ +name: mosdepth +description: Calculates genome-wide sequencing coverage. +keywords: + - mosdepth + - bam + - cram + - coverage +tools: + - mosdepth: + description: | + Fast BAM/CRAM depth calculation for WGS, exome, or targeted sequencing. + documentation: https://github.com/brentp/mosdepth + doi: 10.1093/bioinformatics/btx699 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ]
+  - bam:
+      type: file
+      description: Input BAM/CRAM file
+      pattern: "*.{bam,cram}"
+  - bai:
+      type: file
+      description: Index for BAM/CRAM file
+      pattern: "*.{bai,crai}"
+  - bed:
+      type: file
+      description: BED file with intersected intervals
+      pattern: "*.{bed}"
+  - meta2:
+      type: map
+      description: |
+        Groovy Map containing reference information
+        e.g. [ id:'genome' ]
+  - fasta:
+      type: file
+      description: Reference genome FASTA file
+      pattern: "*.{fa,fasta}"
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - global_txt:
+      type: file
+      description: Text file with global cumulative coverage distribution
+      pattern: "*.{global.dist.txt}"
+  - regions_txt:
+      type: file
+      description: Text file with region cumulative coverage distribution
+      pattern: "*.{region.dist.txt}"
+  - summary_txt:
+      type: file
+      description: Text file with summary mean depths per chromosome and regions
+      pattern: "*.{summary.txt}"
+  - per_base_bed:
+      type: file
+      description: BED file with per-base coverage
+      pattern: "*.{per-base.bed.gz}"
+  - per_base_csi:
+      type: file
+      description: Index file for BED file with per-base coverage
+      pattern: "*.{per-base.bed.gz.csi}"
+  - per_base_d4:
+      type: file
+      description: D4 file with per-base coverage
+      pattern: "*.{per-base.d4}"
+  - regions_bed:
+      type: file
+      description: BED file with per-region coverage
+      pattern: "*.{regions.bed.gz}"
+  - regions_csi:
+      type: file
+      description: Index file for BED file with per-region coverage
+      pattern: "*.{regions.bed.gz.csi}"
+  - quantized_bed:
+      type: file
+      description: BED file with binned coverage
+      pattern: "*.{quantized.bed.gz}"
+  - quantized_csi:
+      type: file
+      description: Index file for BED file with binned coverage
+      pattern: "*.{quantized.bed.gz.csi}"
+  - thresholds_bed:
+      type: file
+      description: BED file with the number of bases in each region that are covered at or above each threshold
+      pattern: "*.{thresholds.bed.gz}"
+  - thresholds_csi:
+      type: file
+      description: Index file for BED file with threshold coverage
+      pattern: "*.{thresholds.bed.gz.csi}"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@joseespinosa"
+  - "@drpatelh"
+  - "@ramprasadn"
+  - "@matthdsm"
+maintainers:
+  - "@joseespinosa"
+  - "@drpatelh"
+  - "@ramprasadn"
+  - "@matthdsm"
diff --git a/modules/nf-core/msisensorpro/msisomatic/environment.yml b/modules/nf-core/msisensorpro/msisomatic/environment.yml
new file mode 100644
index 0000000000..147a9d6b85
--- /dev/null
+++ b/modules/nf-core/msisensorpro/msisomatic/environment.yml
@@ -0,0 +1,7 @@
+name: msisensorpro_msisomatic
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::msisensor-pro=1.2.0
diff --git a/modules/nf-core/msisensorpro/msisomatic/main.nf b/modules/nf-core/msisensorpro/msisomatic/main.nf
new file mode 100644
index 0000000000..9b0084d949
--- /dev/null
+++ b/modules/nf-core/msisensorpro/msisomatic/main.nf
@@ -0,0 +1,47 @@
+process MSISENSORPRO_MSISOMATIC {
+    tag "$meta.id"
+    label 'process_low'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/msisensor-pro:1.2.0--hfc31af2_0' :
+        'biocontainers/msisensor-pro:1.2.0--hfc31af2_0' }"
+
+    input:
+    tuple val(meta), path(normal), path(normal_index), path(tumor), path(tumor_index), path(intervals)
+    path (fasta)
+    path (msisensor_scan)
+
+    output:
+    tuple val(meta), path("${prefix}")         , emit: output_report
+    tuple val(meta), path("${prefix}_dis")     , emit: output_dis
+    tuple val(meta), path("${prefix}_germline"), emit: output_germline
+    tuple val(meta), path("${prefix}_somatic") , emit: output_somatic
+    path "versions.yml"                        , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    prefix = task.ext.prefix ?: "${meta.id}"
+    def fasta = fasta ? "-g ${fasta}" : ""
+    def intervals = intervals ? " -e ${intervals} " : ""
+    """
+    msisensor-pro \\
+        msi \\
+        -d ${msisensor_scan} \\
+        -n ${normal} \\
+        -t ${tumor} \\
+        ${fasta} \\
+        -o $prefix \\
+        -b ${task.cpus} \\
+        ${intervals} \\
+        $args
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        msisensor-pro: \$(msisensor-pro 2>&1 | sed -nE 's/Version:\\sv([0-9]\\.[0-9])/\\1/ p')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/msisensorpro/msisomatic/meta.yml b/modules/nf-core/msisensorpro/msisomatic/meta.yml
new file mode 100644
index 0000000000..a6dda66ff2
--- /dev/null
+++ b/modules/nf-core/msisensorpro/msisomatic/meta.yml
@@ -0,0 +1,79 @@
+name: msisensorpro_msisomatic
+description: MSIsensor-pro evaluates Microsatellite Instability (MSI) for cancer patients with next generation sequencing data. It accepts whole genome sequencing, whole exome sequencing, and target region (panel) sequencing data as input.
+keywords:
+  - micro-satellite-scan
+  - msisensor-pro
+  - msi
+  - somatic
+tools:
+  - msisensorpro:
+      description: Microsatellite Instability (MSI) detection using high-throughput sequencing data.
+      homepage: https://github.com/xjtu-omics/msisensor-pro
+      documentation: https://github.com/xjtu-omics/msisensor-pro/wiki
+      tool_dev_url: https://github.com/xjtu-omics/msisensor-pro
+      doi: "10.1016/j.gpb.2020.02.001"
+      licence: ["Custom Licence"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - normal:
+      type: file
+      description: BAM/CRAM/SAM file
+      pattern: "*.{bam,cram,sam}"
+  - normal_index:
+      type: file
+      description: BAM/CRAM/SAM index file
+      pattern: "*.{bai,crai,sai}"
+  - tumor:
+      type: file
+      description: BAM/CRAM/SAM file
+      pattern: "*.{bam,cram,sam}"
+  - tumor_index:
+      type: file
+      description: BAM/CRAM/SAM index file
+      pattern: "*.{bai,crai,sai}"
+  - intervals:
+      type: file
+      description: BED file containing interval information (optional)
+      pattern: "*.{bed}"
+  - fasta:
+      type: file
+      description: Reference genome
+      pattern: "*.{fasta}"
+  - msisensor_scan:
+      type: file
+      description: Output from msisensor-pro/scan, containing a list of MSI regions
+      pattern: "*.list"
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - output_report:
+      type: file
+      description: File containing final report with all detected microsatellites, unstable somatic microsatellites, msi score
+  - output_dis:
+      type: file
+      description: File containing distribution results
+  - output_germline:
+      type: file
+      description: File containing germline results
+  - output_somatic:
+      type: file
+      description: File containing somatic results
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@FriederikeHanssen"
+maintainers:
+  - "@FriederikeHanssen"
diff --git a/modules/nf-core/msisensorpro/scan/environment.yml b/modules/nf-core/msisensorpro/scan/environment.yml
new file mode 100644
index 0000000000..377c28a61b
--- /dev/null
+++ b/modules/nf-core/msisensorpro/scan/environment.yml
@@ -0,0 +1,7 @@
+name: msisensorpro_scan
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::msisensor-pro=1.2.0
diff --git a/modules/nf-core/msisensorpro/scan/main.nf b/modules/nf-core/msisensorpro/scan/main.nf
new file mode 100644
index 0000000000..9c7dce2596
--- /dev/null
+++ b/modules/nf-core/msisensorpro/scan/main.nf
@@ -0,0 +1,35 @@
+process MSISENSORPRO_SCAN {
+    tag "$meta.id"
+    label 'process_low'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/msisensor-pro:1.2.0--hfc31af2_0' :
+        'biocontainers/msisensor-pro:1.2.0--hfc31af2_0' }"
+
+    input:
+    tuple val(meta), path(fasta)
+
+    output:
+    tuple val(meta), path("*.list"), emit: list
+    path "versions.yml"            , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    msisensor-pro \\
+        scan \\
+        -d $fasta \\
+        -o ${prefix}.msisensor_scan.list \\
+        $args
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        msisensor-pro: \$(msisensor-pro 2>&1 | sed -nE 's/Version:\\sv([0-9]\\.[0-9])/\\1/ p')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/msisensorpro/scan/meta.yml b/modules/nf-core/msisensorpro/scan/meta.yml
new file mode 100644
index 0000000000..aec743ede5
--- /dev/null
+++ b/modules/nf-core/msisensorpro/scan/meta.yml
@@ -0,0 +1,42 @@
+name: msisensorpro_scan
+description: MSIsensor-pro evaluates Microsatellite Instability (MSI) for cancer patients with next generation sequencing data. It accepts whole genome sequencing, whole exome sequencing, and target region (panel) sequencing data as input.
+keywords:
+  - micro-satellite-scan
+  - msisensor-pro
+  - scan
+tools:
+  - msisensorpro:
+      description: Microsatellite Instability (MSI) detection using high-throughput sequencing data.
+      homepage: https://github.com/xjtu-omics/msisensor-pro
+      documentation: https://github.com/xjtu-omics/msisensor-pro/wiki
+      tool_dev_url: https://github.com/xjtu-omics/msisensor-pro
+      doi: "10.1016/j.gpb.2020.02.001"
+      licence: ["Custom Licence"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - fasta:
+      type: file
+      description: Reference genome
+      pattern: "*.{fasta}"
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - list: + type: file + description: File containing microsatellite list + pattern: "*.{list}" +authors: + - "@FriederikeHanssen" +maintainers: + - "@FriederikeHanssen" diff --git a/modules/nf-core/ngscheckmate/ncm/environment.yml b/modules/nf-core/ngscheckmate/ncm/environment.yml new file mode 100644 index 0000000000..bf185fc23e --- /dev/null +++ b/modules/nf-core/ngscheckmate/ncm/environment.yml @@ -0,0 +1,7 @@ +name: ngscheckmate_ncm +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::ngscheckmate=1.0.1 diff --git a/modules/nf-core/ngscheckmate/ncm/main.nf b/modules/nf-core/ngscheckmate/ncm/main.nf new file mode 100644 index 0000000000..99921ddcc5 --- /dev/null +++ b/modules/nf-core/ngscheckmate/ncm/main.nf @@ -0,0 +1,64 @@ +process NGSCHECKMATE_NCM { + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ngscheckmate:1.0.1--py27pl5321r40hdfd78af_1': + 'biocontainers/ngscheckmate:1.0.1--py27pl5321r40hdfd78af_1' }" + + input: + tuple val(meta) , path(files) + tuple val(meta2), path(snp_bed) + tuple val(meta3), path(fasta) + + output: + tuple val(meta), path("*_corr_matrix.txt"), emit: corr_matrix + tuple val(meta), path("*_matched.txt") , emit: matched + tuple val(meta), path("*_all.txt") , emit: all + tuple val(meta), path("*.pdf") , emit: pdf, optional: true + tuple val(meta), path("*.vcf") , emit: vcf, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "$meta.id" + def unzip = files.any { it.toString().endsWith(".vcf.gz") } + """ + if $unzip + then + for VCFGZ in *.vcf.gz; do + gunzip -cdf \$VCFGZ > \$( basename \$VCFGZ .gz ); + done + fi + + NCM_REF="./"${fasta} ncm.py -d . -bed ${snp_bed} -O . -N ${prefix} $args + + if $unzip + then + rm -f *.vcf # clean up decompressed vcfs + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + ngscheckmate: \$(ncm.py --help | sed "7!d;s/ *Ensuring Sample Identity v//g") + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "$meta.id" + """ + touch ${prefix}_output_corr_matrix.txt + touch ${prefix}_matched.txt + touch ${prefix}_all.txt + touch ${prefix}.pdf + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + ngscheckmate: \$(ncm.py --help | sed "7!d;s/ *Ensuring Sample Identity v//g") + END_VERSIONS + """ + +} diff --git a/modules/nf-core/ngscheckmate/ncm/meta.yml b/modules/nf-core/ngscheckmate/ncm/meta.yml new file mode 100644 index 0000000000..0defad0064 --- /dev/null +++ b/modules/nf-core/ngscheckmate/ncm/meta.yml @@ -0,0 +1,71 @@ +name: ngscheckmate_ncm +description: Determining whether sequencing data comes from the same individual by using SNP matching. Designed for humans on vcf or bam files. +keywords: + - ngscheckmate + - matching + - snp +tools: + - ngscheckmate: + description: NGSCheckMate is a software package for identifying next generation sequencing (NGS) data files from the same individual, including matching between DNA and RNA. 
+      homepage: https://github.com/parklab/NGSCheckMate
+      documentation: https://github.com/parklab/NGSCheckMate
+      tool_dev_url: https://github.com/parklab/NGSCheckMate
+      doi: "10.1093/nar/gkx193"
+      licence: ["MIT"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test']
+  - files:
+      type: file
+      description: VCF or BAM files for each sample, in a merged channel (possibly gzipped). BAM files require an index too.
+      pattern: "*.{vcf,vcf.gz,bam,bai}"
+  - meta2:
+      type: map
+      description: |
+        Groovy Map containing SNP information
+        e.g. [ id:'test' ]
+  - snp_bed:
+      type: file
+      description: BED file containing the SNPs to analyse
+      pattern: "*.{bed}"
+  - meta3:
+      type: map
+      description: |
+        Groovy Map containing reference fasta index information
+        e.g. [ id:'test' ]
+  - fasta:
+      type: file
+      description: fasta file for the genome, only used in bam mode
+      pattern: "*.{fasta,fa}"
+output:
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - pdf:
+      type: file
+      description: A pdf containing a dendrogram showing how the samples match up
+      pattern: "*.{pdf}"
+  - corr_matrix:
+      type: file
+      description: A text file containing the correlation matrix between each sample
+      pattern: "*corr_matrix.txt"
+  - matched:
+      type: file
+      description: A txt file containing only the samples that match with each other
+      pattern: "*matched.txt"
+  - all:
+      type: file
+      description: A txt file containing all the sample comparisons, whether they match or not
+      pattern: "*all.txt"
+  - vcf:
+      type: file
+      description: If run in bam mode, VCF files for each sample giving the SNP calls used
+      pattern: "*.vcf"
+authors:
+  - "@sppearce"
maintainers:
+  - "@sppearce"
diff --git a/modules/nf-core/samblaster/environment.yml b/modules/nf-core/samblaster/environment.yml
new file mode 100644
index 0000000000..f956283ec0
--- /dev/null
+++ b/modules/nf-core/samblaster/environment.yml
@@ -0,0 +1,8 @@
+name: samblaster
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::samblaster=0.1.26
+  - bioconda::samtools=1.16.1
diff --git a/modules/nf-core/samblaster/main.nf b/modules/nf-core/samblaster/main.nf
new file mode 100644
index 0000000000..4622d3691d
--- /dev/null
+++ b/modules/nf-core/samblaster/main.nf
@@ -0,0 +1,42 @@
+process SAMBLASTER {
+    tag "$meta.id"
+    label 'process_low'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/mulled-v2-19fa9f1a5c3966b63a24166365e81da35738c5ab:cee56b506ceb753d4bbef7e05b81e1bfc25d937f-0' :
+        'biocontainers/mulled-v2-19fa9f1a5c3966b63a24166365e81da35738c5ab:cee56b506ceb753d4bbef7e05b81e1bfc25d937f-0' }"
+
+    input:
+    tuple val(meta), path(bam)
+
+    output:
+    tuple val(meta), path("*.bam"), emit: bam
+    path "versions.yml"           , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args   = task.ext.args   ?: ''
+    def args2  = task.ext.args2  ?: ''
+    def args3  = task.ext.args3  ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    if( "$bam" == "${prefix}.bam" ) error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!"
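+    // Hedged note (not part of the original module): args/args2/args3 above are normally
+    // populated from a pipeline's modules.config. For example, to make samblaster also add
+    // MC/MQ mate tags, one might set:
+    //   process { withName: 'SAMBLASTER' { ext.args = '--addMateTags' } }
+    // '--addMateTags' is a documented samblaster flag; the withName selector is an assumption.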
+ """ + samtools view -h $args2 $bam | \\ + samblaster $args | \\ + samtools view $args3 -Sb - >${prefix}.bam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samblaster: \$( samblaster -h 2>&1 | head -n 1 | sed 's/^samblaster: Version //' ) + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samblaster/meta.yml b/modules/nf-core/samblaster/meta.yml new file mode 100644 index 0000000000..ccb48320c1 --- /dev/null +++ b/modules/nf-core/samblaster/meta.yml @@ -0,0 +1,53 @@ +name: samblaster +description: | + This module combines samtools and samblaster in order to use + samblaster capability to filter or tag SAM files, with the advantage + of maintaining both input and output in BAM format. + Samblaster input must contain a sequence header: for this reason it has been piped + with the "samtools view -h" command. + Additional desired arguments for samtools can be passed using: + options.args2 for the input bam file + options.args3 for the output bam file +keywords: + - sort + - duplicate marking + - bam +tools: + - samblaster: + description: | + samblaster is a fast and flexible program for marking duplicates in read-id grouped paired-end SAM files. + It can also optionally output discordant read pairs and/or split read mappings to separate SAM files, + and/or unmapped/clipped reads to a separate FASTQ file. + By default, samblaster reads SAM input from stdin and writes SAM to stdout. + documentation: https://github.com/GregoryFaust/samblaster + tool_dev_url: https://github.com/GregoryFaust/samblaster + doi: "10.1093/bioinformatics/btu314" + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM file + pattern: "*.bam" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - bam: + type: file + description: Tagged or filtered BAM file + pattern: "*.bam" +authors: + - "@lescai" +maintainers: + - "@lescai" diff --git a/modules/nf-core/samtools/bam2fq/environment.yml b/modules/nf-core/samtools/bam2fq/environment.yml new file mode 100644 index 0000000000..b59ea8e050 --- /dev/null +++ b/modules/nf-core/samtools/bam2fq/environment.yml @@ -0,0 +1,7 @@ +name: samtools_bam2fq +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.17 diff --git a/modules/nf-core/samtools/bam2fq/main.nf b/modules/nf-core/samtools/bam2fq/main.nf new file mode 100644 index 0000000000..016d91d992 --- /dev/null +++ b/modules/nf-core/samtools/bam2fq/main.nf @@ -0,0 +1,56 @@ +process SAMTOOLS_BAM2FQ { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(inputbam) + val split + + output: + tuple val(meta), path("*.fq.gz"), emit: reads + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + if (split){ + """ + samtools \\ + bam2fq \\ + $args \\ + -@ $task.cpus \\ + -1 ${prefix}_1.fq.gz \\ + -2 ${prefix}_2.fq.gz \\ + -0 ${prefix}_other.fq.gz \\ + -s ${prefix}_singleton.fq.gz \\ + $inputbam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + } else { + """ + samtools \\ + bam2fq \\ + $args \\ + -@ $task.cpus \\ + $inputbam | gzip --no-name > ${prefix}_interleaved.fq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + } +} diff --git a/modules/nf-core/samtools/bam2fq/meta.yml b/modules/nf-core/samtools/bam2fq/meta.yml new file mode 100644 index 0000000000..7769046b54 --- /dev/null +++ b/modules/nf-core/samtools/bam2fq/meta.yml @@ -0,0 +1,51 @@ +name: samtools_bam2fq +description: | + The module uses bam2fq method from samtools to + convert a SAM, BAM or CRAM file to FASTQ format +keywords: + - bam2fq + - samtools + - fastq +tools: + - samtools: + description: Tools for dealing with SAM, BAM and CRAM files + documentation: http://www.htslib.org/doc/1.1/samtools.html + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - inputbam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - split: + type: boolean + description: | + TRUE/FALSE value to indicate if reads should be separated into + /1, /2 and if present other, or singleton. + Note: choosing TRUE will generate 4 different files. + Choosing FALSE will produce a single file, which will be interleaved in case + the input contains paired reads. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - reads: + type: file + description: | + FASTQ files, which will be either a group of 4 files (read_1, read_2, other and singleton) + or a single interleaved .fq.gz file if the user chooses not to split the reads. 
+ pattern: "*.fq.gz" +authors: + - "@lescai" +maintainers: + - "@lescai" diff --git a/modules/nf-core/samtools/collatefastq/environment.yml b/modules/nf-core/samtools/collatefastq/environment.yml new file mode 100644 index 0000000000..ec3faa9cdc --- /dev/null +++ b/modules/nf-core/samtools/collatefastq/environment.yml @@ -0,0 +1,7 @@ +name: samtools_collatefastq +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.17 diff --git a/modules/nf-core/samtools/collatefastq/main.nf b/modules/nf-core/samtools/collatefastq/main.nf new file mode 100644 index 0000000000..537b88cca4 --- /dev/null +++ b/modules/nf-core/samtools/collatefastq/main.nf @@ -0,0 +1,55 @@ +process SAMTOOLS_COLLATEFASTQ { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(input) + tuple val(meta2), path(fasta) + val(interleave) + + output: + tuple val(meta), path("*_{1,2}.fq.gz") , optional:true, emit: fastq + tuple val(meta), path("*_interleaved.fq.gz") , optional:true, emit: fastq_interleaved + tuple val(meta), path("*_other.fq.gz") , emit: fastq_other + tuple val(meta), path("*_singleton.fq.gz") , optional:true, emit: fastq_singleton + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def reference = fasta ? "--reference ${fasta}" : "" + def output = (interleave && ! meta.single_end) ? "> ${prefix}_interleaved.fq.gz" : + meta.single_end ? "-1 ${prefix}_1.fq.gz -s ${prefix}_singleton.fq.gz" : + "-1 ${prefix}_1.fq.gz -2 ${prefix}_2.fq.gz -s ${prefix}_singleton.fq.gz" + + """ + samtools collate \\ + $args \\ + --threads $task.cpus \\ + ${reference} \\ + -O \\ + $input \\ + . | + + samtools fastq \\ + $args2 \\ + --threads $task.cpus \\ + ${reference} \\ + -0 ${prefix}_other.fq.gz \\ + $output + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/collatefastq/meta.yml b/modules/nf-core/samtools/collatefastq/meta.yml new file mode 100644 index 0000000000..898cdbdad7 --- /dev/null +++ b/modules/nf-core/samtools/collatefastq/meta.yml @@ -0,0 +1,76 @@ +name: samtools_collatefastq +description: | + The module uses collate and then fastq methods from samtools to + convert a SAM, BAM or CRAM file to FASTQ format +keywords: + - bam2fq + - samtools + - fastq +tools: + - samtools: + description: Tools for dealing with SAM, BAM and CRAM files + documentation: http://www.htslib.org/doc/1.1/samtools.html + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. 
[ id:'test' ]
+  - fasta:
+      type: file
+      description: Reference genome fasta file
+      pattern: "*.{fasta,fa}"
+  - interleave:
+      type: boolean
+      description: |
+        If true, the output is a single interleaved paired-end FASTQ
+        If false, the output is split paired-end FASTQ files
+      default: false
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - fastq:
+      type: file
+      description: |
+        R1 and R2 FASTQ files
+      pattern: "*_{1,2}.fq.gz"
+  - fastq_interleaved:
+      type: file
+      description: |
+        Interleaved paired end FASTQ files
+      pattern: "*_interleaved.fq.gz"
+  - fastq_other:
+      type: file
+      description: |
+        FASTQ files with reads where the READ1 and READ2 FLAG bits are either both set or both unset.
+      pattern: "*_other.fq.gz"
+  - fastq_singleton:
+      type: file
+      description: |
+        FASTQ files with singleton reads.
+      pattern: "*_singleton.fq.gz"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@lescai"
+  - "@maxulysse"
+  - "@matthdsm"
+maintainers:
+  - "@lescai"
+  - "@maxulysse"
+  - "@matthdsm"
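A minimal invocation sketch for this module (hypothetical channel names; the empty meta/file pair stands in for the optional reference, which is only needed when the input is CRAM):

// split mode (interleave = false): *_1.fq.gz / *_2.fq.gz plus *_singleton.fq.gz and *_other.fq.gz;
// with interleave = true and paired-end input, a single *_interleaved.fq.gz is written instead
SAMTOOLS_COLLATEFASTQ(bam_ch, [ [:], [] ], false)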
+      homepage: http://www.htslib.org/
+      documentation: http://www.htslib.org/doc/samtools.html
+      doi: 10.1093/bioinformatics/btp352
+      licence: ["MIT"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - input:
+      type: file
+      description: BAM/CRAM file
+      pattern: "*.{bam,cram}"
+  - index:
+      type: file
+      description: BAM/CRAM index file
+      pattern: "*.{bai,crai}"
+  - fasta:
+      type: file
+      description: Reference file to create the CRAM file
+      pattern: "*.{fasta,fa}"
+  - fai:
+      type: file
+      description: Index of the reference file
+      pattern: "*.fai"
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - alignment_index:
+      type: file
+      description: filtered/converted BAM/CRAM file + index
+      pattern: "*{.bam/cram,.bai/crai}"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@FriederikeHanssen"
+  - "@maxulysse"
+maintainers:
+  - "@FriederikeHanssen"
+  - "@maxulysse"
diff --git a/modules/nf-core/samtools/faidx/environment.yml b/modules/nf-core/samtools/faidx/environment.yml
new file mode 100644
index 0000000000..73badedb18
--- /dev/null
+++ b/modules/nf-core/samtools/faidx/environment.yml
@@ -0,0 +1,7 @@
+name: samtools_faidx
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::samtools=1.17
diff --git a/modules/nf-core/samtools/faidx/main.nf b/modules/nf-core/samtools/faidx/main.nf
new file mode 100644
index 0000000000..3aa988224e
--- /dev/null
+++ b/modules/nf-core/samtools/faidx/main.nf
@@ -0,0 +1,50 @@
+process SAMTOOLS_FAIDX {
+    tag "$fasta"
+    label 'process_single'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' :
+        'biocontainers/samtools:1.17--h00cdaf9_0' }"
+
+    input:
+    tuple val(meta), path(fasta)
+    tuple val(meta2), path(fai)
+
+    output:
+    tuple val(meta), path ("*.{fa,fasta}"), emit: fa , optional: true
+    tuple val(meta), path ("*.fai")       , emit: fai, optional: true
+    tuple val(meta), path ("*.gzi")       , emit: gzi, optional: true
+    path "versions.yml"                   , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    """
+    samtools \\
+        faidx \\
+        $fasta \\
+        $args
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
+    END_VERSIONS
+    """
+
+    stub:
+    def match = (task.ext.args =~ /-o(?:utput)?\s(.*)\s?/).findAll()
+    def fastacmd = match[0] ? "touch ${match[0][1]}" : ''
+    """
+    ${fastacmd}
+    touch ${fasta}.fai
+
+    cat <<-END_VERSIONS > versions.yml
+
+    "${task.process}":
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/samtools/faidx/meta.yml b/modules/nf-core/samtools/faidx/meta.yml
new file mode 100644
index 0000000000..e189af28fd
--- /dev/null
+++ b/modules/nf-core/samtools/faidx/meta.yml
@@ -0,0 +1,61 @@
+name: samtools_faidx
+description: Index FASTA file
+keywords:
+  - index
+  - fasta
+  - faidx
+tools:
+  - samtools:
+      description: |
+        SAMtools is a set of utilities for interacting with and post-processing
+        short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li.
+        These files are generated as output by short read aligners like BWA.
+ homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - fasta: + type: file + description: FASTA file + pattern: "*.{fa,fasta}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - fai: + type: file + description: FASTA index file + pattern: "*.{fai}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fai: + type: file + description: FASTA index file + pattern: "*.{fai}" + - gzi: + type: file + description: Optional gzip index file for compressed inputs + pattern: "*.gzi" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@ewels" + - "@phue" +maintainers: + - "@drpatelh" + - "@ewels" + - "@phue" diff --git a/modules/nf-core/samtools/index/environment.yml b/modules/nf-core/samtools/index/environment.yml new file mode 100644 index 0000000000..3c6f95b25a --- /dev/null +++ b/modules/nf-core/samtools/index/environment.yml @@ -0,0 +1,7 @@ +name: samtools_index +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.17 diff --git a/modules/nf-core/samtools/index/main.nf b/modules/nf-core/samtools/index/main.nf new file mode 100644 index 0000000000..256bd7c469 --- /dev/null +++ b/modules/nf-core/samtools/index/main.nf @@ -0,0 +1,48 @@ +process SAMTOOLS_INDEX { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(input) + + output: + tuple val(meta), path("*.bai") , optional:true, emit: bai + tuple val(meta), path("*.csi") , optional:true, emit: csi + tuple val(meta), path("*.crai"), optional:true, emit: crai + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + samtools \\ + index \\ + -@ ${task.cpus-1} \\ + $args \\ + $input + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + """ + touch ${input}.bai + touch ${input}.crai + touch ${input}.csi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/index/meta.yml b/modules/nf-core/samtools/index/meta.yml new file mode 100644 index 0000000000..01a4ee03eb --- /dev/null +++ b/modules/nf-core/samtools/index/meta.yml @@ -0,0 +1,57 @@ +name: samtools_index +description: Index SAM/BAM/CRAM file +keywords: + - index + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. 
+ homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bai: + type: file + description: BAM/CRAM/SAM index file + pattern: "*.{bai,crai,sai}" + - crai: + type: file + description: BAM/CRAM/SAM index file + pattern: "*.{bai,crai,sai}" + - csi: + type: file + description: CSI index file + pattern: "*.{csi}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@ewels" + - "@maxulysse" +maintainers: + - "@drpatelh" + - "@ewels" + - "@maxulysse" diff --git a/modules/nf-core/samtools/merge/environment.yml b/modules/nf-core/samtools/merge/environment.yml new file mode 100644 index 0000000000..0d437d8a88 --- /dev/null +++ b/modules/nf-core/samtools/merge/environment.yml @@ -0,0 +1,7 @@ +name: samtools_merge +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.17 diff --git a/modules/nf-core/samtools/merge/main.nf b/modules/nf-core/samtools/merge/main.nf new file mode 100644 index 0000000000..21f785cfdc --- /dev/null +++ b/modules/nf-core/samtools/merge/main.nf @@ -0,0 +1,57 @@ +process SAMTOOLS_MERGE { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(input_files, stageAs: "?/*") + tuple val(meta2), path(fasta) + tuple val(meta3), path(fai) + + output: + tuple val(meta), path("${prefix}.bam") , optional:true, emit: bam + tuple val(meta), path("${prefix}.cram"), optional:true, emit: cram + tuple val(meta), path("*.csi") , optional:true, emit: csi + tuple val(meta), path("*.crai") , optional:true, emit: crai + path "versions.yml" , emit: versions + + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + def file_type = input_files instanceof List ? input_files[0].getExtension() : input_files.getExtension() + def reference = fasta ? "--reference ${fasta}" : "" + """ + samtools \\ + merge \\ + --threads ${task.cpus-1} \\ + $args \\ + ${reference} \\ + ${prefix}.${file_type} \\ + $input_files + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + prefix = task.ext.suffix ? "${meta.id}${task.ext.suffix}" : "${meta.id}" + def file_type = input_files instanceof List ? 
input_files[0].getExtension() : input_files.getExtension() + """ + touch ${prefix}.${file_type} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/merge/meta.yml b/modules/nf-core/samtools/merge/meta.yml new file mode 100644 index 0000000000..2e8f3dbbb5 --- /dev/null +++ b/modules/nf-core/samtools/merge/meta.yml @@ -0,0 +1,83 @@ +name: samtools_merge +description: Merge BAM or CRAM file +keywords: + - merge + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input_files: + type: file + description: BAM/CRAM file + pattern: "*.{bam,cram,sam}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fasta: + type: file + description: Reference file the CRAM was created with (optional) + pattern: "*.{fasta,fa}" + - meta3: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fai: + type: file + description: Index of the reference file the CRAM was created with (optional) + pattern: "*.fai" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM file + pattern: "*.{bam}" + - cram: + type: file + description: CRAM file + pattern: "*.{cram}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - csi: + type: file + description: BAM index file (optional) + pattern: "*.csi" + - crai: + type: file + description: CRAM index file (optional) + pattern: "*.crai" +authors: + - "@drpatelh" + - "@yuukiiwa " + - "@maxulysse" + - "@FriederikeHanssen" + - "@ramprasadn" +maintainers: + - "@drpatelh" + - "@yuukiiwa " + - "@maxulysse" + - "@FriederikeHanssen" + - "@ramprasadn" diff --git a/modules/nf-core/samtools/mpileup/environment.yml b/modules/nf-core/samtools/mpileup/environment.yml new file mode 100644 index 0000000000..5f06050d19 --- /dev/null +++ b/modules/nf-core/samtools/mpileup/environment.yml @@ -0,0 +1,7 @@ +name: samtools_mpileup +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.17 diff --git a/modules/nf-core/samtools/mpileup/main.nf b/modules/nf-core/samtools/mpileup/main.nf new file mode 100644 index 0000000000..ed102582c4 --- /dev/null +++ b/modules/nf-core/samtools/mpileup/main.nf @@ -0,0 +1,37 @@ +process SAMTOOLS_MPILEUP { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+        'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' :
+        'biocontainers/samtools:1.17--h00cdaf9_0' }"
+    input:
+    tuple val(meta), path(input), path(intervals)
+    path fasta
+
+    output:
+    tuple val(meta), path("*.mpileup.gz"), emit: mpileup
+    path "versions.yml"                  , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def intervals = intervals ? "-l ${intervals}" : ""
+    """
+    samtools mpileup \\
+        --fasta-ref $fasta \\
+        --output ${prefix}.mpileup \\
+        $args \\
+        $intervals \\
+        $input
+    bgzip ${prefix}.mpileup
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/samtools/mpileup/meta.yml b/modules/nf-core/samtools/mpileup/meta.yml
new file mode 100644
index 0000000000..13038fbc9b
--- /dev/null
+++ b/modules/nf-core/samtools/mpileup/meta.yml
@@ -0,0 +1,55 @@
+name: samtools_mpileup
+description: Generate a text pileup from a BAM/CRAM file with samtools mpileup
+keywords:
+  - mpileup
+  - bam
+  - sam
+  - cram
+tools:
+  - samtools:
+      description: |
+        SAMtools is a set of utilities for interacting with and post-processing
+        short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li.
+        These files are generated as output by short read aligners like BWA.
+      homepage: http://www.htslib.org/
+      documentation: http://www.htslib.org/doc/samtools.html
+      doi: 10.1093/bioinformatics/btp352
+      licence: ["MIT"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - input:
+      type: file
+      description: BAM/CRAM/SAM file
+      pattern: "*.{bam,cram,sam}"
+  - fasta:
+      type: file
+      description: FASTA reference file
+      pattern: "*.{fasta,fa}"
+  - intervals:
+      type: file
+      description: Interval file
+      pattern: "*.bed"
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - mpileup:
+      type: file
+      description: bgzipped mpileup file
+      pattern: "*.mpileup.gz"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@drpatelh"
+  - "@joseespinosa"
+maintainers:
+  - "@drpatelh"
+  - "@joseespinosa"
diff --git a/modules/nf-core/samtools/stats/environment.yml b/modules/nf-core/samtools/stats/environment.yml
new file mode 100644
index 0000000000..ed4e8961af
--- /dev/null
+++ b/modules/nf-core/samtools/stats/environment.yml
@@ -0,0 +1,7 @@
+name: samtools_stats
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::samtools=1.17
diff --git a/modules/nf-core/samtools/stats/main.nf b/modules/nf-core/samtools/stats/main.nf
new file mode 100644
index 0000000000..07286ef410
--- /dev/null
+++ b/modules/nf-core/samtools/stats/main.nf
@@ -0,0 +1,49 @@
+process SAMTOOLS_STATS {
+    tag "$meta.id"
+    label 'process_single'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(input), path(input_index) + tuple val(meta2), path(fasta) + + output: + tuple val(meta), path("*.stats"), emit: stats + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def reference = fasta ? "--reference ${fasta}" : "" + """ + samtools \\ + stats \\ + --threads ${task.cpus} \\ + ${reference} \\ + ${input} \\ + > ${prefix}.stats + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.stats + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/stats/meta.yml b/modules/nf-core/samtools/stats/meta.yml new file mode 100644 index 0000000000..735ff8122a --- /dev/null +++ b/modules/nf-core/samtools/stats/meta.yml @@ -0,0 +1,63 @@ +name: samtools_stats +description: Produces comprehensive statistics from SAM/BAM/CRAM file +keywords: + - statistics + - counts + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM file from alignment + pattern: "*.{bam,cram}" + - input_index: + type: file + description: BAI/CRAI file from alignment + pattern: "*.{bai,crai}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fasta: + type: file + description: Reference file the CRAM was created with (optional) + pattern: "*.{fasta,fa}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - stats: + type: file + description: File containing samtools stats output + pattern: "*.{stats}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@FriederikeHanssen" + - "@ramprasadn" +maintainers: + - "@drpatelh" + - "@FriederikeHanssen" + - "@ramprasadn" diff --git a/modules/nf-core/samtools/stats/tests/main.nf.test b/modules/nf-core/samtools/stats/tests/main.nf.test new file mode 100644 index 0000000000..e037132ca2 --- /dev/null +++ b/modules/nf-core/samtools/stats/tests/main.nf.test @@ -0,0 +1,78 @@ +nextflow_process { + + name "Test Process SAMTOOLS_STATS" + script "../main.nf" + process "SAMTOOLS_STATS" + tag "modules" + tag "modules/nf-core" + tag "samtools" + tag "samtools/stats" + + test("SAMTOOLS STATS Should run without failures") { + + when { + params { + + outdir = "$outputDir" + } + process { + """ + // define inputs of the process here. 
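+                // input[0] is the module's [ meta, bam, bai ] tuple; input[1] maps to the
+                // optional [ meta2, fasta ] reference channel, and the empty lists below
+                // simply leave it unset for this BAM test (the CRAM test further down
+                // passes a real fasta, since CRAM decoding needs the reference).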
+ input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam_bai'], checkIfExists: true) + + ] + input[1] = [[],[]] + """ + + } + } + + then { + assertAll( + {assert process.success}, + {assert snapshot(process.out).match()} + ) + } + + } + + test("SAMTOOLS CRAM Should run without failures") { + + when { + params { + + outdir = "$outputDir" + } + process { + """ + // define inputs of the process here + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data['homo_sapiens']['illumina']['test_paired_end_recalibrated_sorted_cram'], checkIfExists: true), + file(params.test_data['homo_sapiens']['illumina']['test_paired_end_recalibrated_sorted_cram_crai'], checkIfExists: true) + + ] + input[1] = [ + [ id:'genome' ], + file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + + + } + + then { + assertAll( + {assert process.success}, + {assert snapshot(process.out).match()} + ) + } + + } + + +} diff --git a/modules/nf-core/samtools/stats/tests/main.nf.test.snap b/modules/nf-core/samtools/stats/tests/main.nf.test.snap new file mode 100644 index 0000000000..516b2b0192 --- /dev/null +++ b/modules/nf-core/samtools/stats/tests/main.nf.test.snap @@ -0,0 +1,64 @@ +{ + "SAMTOOLS STATS Should run without failures": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.stats:md5,6e768486d5df0257351c5419a79f9c9b" + ] + ], + "1": [ + "versions.yml:md5,08035f3409d934d47a416150884bb0df" + ], + "stats": [ + [ + { + "id": "test", + "single_end": false + }, + "test.stats:md5,6e768486d5df0257351c5419a79f9c9b" + ] + ], + "versions": [ + "versions.yml:md5,08035f3409d934d47a416150884bb0df" + ] + } + ], + "timestamp": "2023-10-18T12:12:42.998746" + }, + "SAMTOOLS CRAM Should run without failures": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.stats:md5,7c9ee5747793cceb9d6f4d733345641a" + ] + ], + "1": [ + "versions.yml:md5,08035f3409d934d47a416150884bb0df" + ], + "stats": [ + [ + { + "id": "test", + "single_end": false + }, + "test.stats:md5,7c9ee5747793cceb9d6f4d733345641a" + ] + ], + "versions": [ + "versions.yml:md5,08035f3409d934d47a416150884bb0df" + ] + } + ], + "timestamp": "2023-10-18T12:13:30.747222" + } +} \ No newline at end of file diff --git a/modules/nf-core/samtools/stats/tests/tags.yml b/modules/nf-core/samtools/stats/tests/tags.yml new file mode 100644 index 0000000000..7c28e30f3f --- /dev/null +++ b/modules/nf-core/samtools/stats/tests/tags.yml @@ -0,0 +1,2 @@ +samtools/stats: + - modules/nf-core/samtools/stats/** diff --git a/modules/nf-core/samtools/view/environment.yml b/modules/nf-core/samtools/view/environment.yml new file mode 100644 index 0000000000..141e7bd829 --- /dev/null +++ b/modules/nf-core/samtools/view/environment.yml @@ -0,0 +1,7 @@ +name: samtools_view +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.17 diff --git a/modules/nf-core/samtools/view/main.nf b/modules/nf-core/samtools/view/main.nf new file mode 100644 index 0000000000..ddf3f88ae5 --- /dev/null +++ b/modules/nf-core/samtools/view/main.nf @@ -0,0 +1,66 @@ +process SAMTOOLS_VIEW { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container 
?
+        'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' :
+        'biocontainers/samtools:1.17--h00cdaf9_0' }"
+
+    input:
+    tuple val(meta), path(input), path(index)
+    tuple val(meta2), path(fasta)
+    path qname
+
+    output:
+    tuple val(meta), path("*.bam"),  emit: bam,  optional: true
+    tuple val(meta), path("*.cram"), emit: cram, optional: true
+    tuple val(meta), path("*.sam"),  emit: sam,  optional: true
+    tuple val(meta), path("*.bai"),  emit: bai,  optional: true
+    tuple val(meta), path("*.csi"),  emit: csi,  optional: true
+    tuple val(meta), path("*.crai"), emit: crai, optional: true
+    path "versions.yml", emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def args2 = task.ext.args2 ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def reference = fasta ? "--reference ${fasta}" : ""
+    def readnames = qname ? "--qname-file ${qname}": ""
+    def file_type = args.contains("--output-fmt sam") ? "sam" :
+                    args.contains("--output-fmt bam") ? "bam" :
+                    args.contains("--output-fmt cram") ? "cram" :
+                    input.getExtension()
+    if ("$input" == "${prefix}.${file_type}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!"
+    """
+    samtools \\
+        view \\
+        --threads ${task.cpus-1} \\
+        ${reference} \\
+        ${readnames} \\
+        $args \\
+        -o ${prefix}.${file_type} \\
+        $input \\
+        $args2
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
+    END_VERSIONS
+    """
+
+    stub:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    touch ${prefix}.bam
+    touch ${prefix}.cram
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
+    END_VERSIONS
+    """
+}
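The output format above is driven entirely by ext.args: the script scans the supplied arguments for a samtools `--output-fmt` option and otherwise falls back to the input's own extension. A hedged modules.config sketch (the withName selector and prefix are assumptions about how the module is included):

process {
    withName: 'SAMTOOLS_VIEW' {
        // convert to CRAM and ask samtools to also write the .crai index,
        // which matches the module's optional crai output
        ext.args   = '--output-fmt cram --write-index'
        ext.prefix = { "${meta.id}.conv" }
    }
}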
diff --git a/modules/nf-core/samtools/view/meta.yml b/modules/nf-core/samtools/view/meta.yml
new file mode 100644
index 0000000000..3dadafae75
--- /dev/null
+++ b/modules/nf-core/samtools/view/meta.yml
@@ -0,0 +1,89 @@
+name: samtools_view
+description: filter/convert SAM/BAM/CRAM file
+keywords:
+  - view
+  - bam
+  - sam
+  - cram
+tools:
+  - samtools:
+      description: |
+        SAMtools is a set of utilities for interacting with and post-processing
+        short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li.
+        These files are generated as output by short read aligners like BWA.
+      homepage: http://www.htslib.org/
+      documentation: http://www.htslib.org/doc/samtools.html
+      doi: 10.1093/bioinformatics/btp352
+      licence: ["MIT"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - input:
+      type: file
+      description: BAM/CRAM/SAM file
+      pattern: "*.{bam,cram,sam}"
+  - index:
+      type: file
+      description: BAM.BAI/BAM.CSI/CRAM.CRAI file (optional)
+      pattern: "*.{bai,csi,crai}"
+  - meta2:
+      type: map
+      description: |
+        Groovy Map containing reference information
+        e.g. [ id:'test' ]
+  - fasta:
+      type: file
+      description: Reference file the CRAM was created with (optional)
+      pattern: "*.{fasta,fa}"
+  - qname:
+      type: file
+      description: Optional file with read names to output only select alignments
+      pattern: "*.{txt,list}"
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - bam:
+      type: file
+      description: optional filtered/converted BAM file
+      pattern: "*.{bam}"
+  - cram:
+      type: file
+      description: optional filtered/converted CRAM file
+      pattern: "*.{cram}"
+  - sam:
+      type: file
+      description: optional filtered/converted SAM file
+      pattern: "*.{sam}"
+  # bai, csi, and crai are created with `--write-index`
+  - bai:
+      type: file
+      description: optional BAM file index
+      pattern: "*.{bai}"
+  - csi:
+      type: file
+      description: optional CSI BAM file index
+      pattern: "*.{csi}"
+  - crai:
+      type: file
+      description: optional CRAM file index
+      pattern: "*.{crai}"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@drpatelh"
+  - "@joseespinosa"
+  - "@FriederikeHanssen"
+  - "@priyanka-surana"
+maintainers:
+  - "@drpatelh"
+  - "@joseespinosa"
+  - "@FriederikeHanssen"
+  - "@priyanka-surana"
diff --git a/modules/nf-core/sentieon/applyvarcal/environment.yml b/modules/nf-core/sentieon/applyvarcal/environment.yml
new file mode 100644
index 0000000000..c4c11b1f85
--- /dev/null
+++ b/modules/nf-core/sentieon/applyvarcal/environment.yml
@@ -0,0 +1,7 @@
+name: sentieon_applyvarcal
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::sentieon=202308.01
diff --git a/modules/nf-core/sentieon/applyvarcal/main.nf b/modules/nf-core/sentieon/applyvarcal/main.nf
new file mode 100644
index 0000000000..304d0a0431
--- /dev/null
+++ b/modules/nf-core/sentieon/applyvarcal/main.nf
@@ -0,0 +1,99 @@
+process SENTIEON_APPLYVARCAL {
+    tag "$meta.id"
+    label 'process_low'
+    label 'sentieon'
+
+    secret 'SENTIEON_LICENSE_BASE64'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/sentieon:202308.01--h43eeafb_0' :
+        'biocontainers/sentieon:202308.01--h43eeafb_0' }"
+
+    input:
+    tuple val(meta), path(vcf), path(vcf_tbi), path(recal), path(recal_index), path(tranches)
+    tuple val(meta2), path(fasta)
+    tuple val(meta3), path(fai)
+
+    output:
+    tuple val(meta), path("*.vcf.gz"), emit: vcf
+    tuple val(meta), path("*.tbi")   , emit: tbi
+    path "versions.yml"              , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    // The following code sets LD_LIBRARY_PATH in the script-section when the module is run by Singularity.
+    // That turned out to be one way of overcoming the following issue with the Singularity-Sentieon-containers from galaxy, Sentieon (LD_LIBRARY_PATH) and the way Nextflow runs Singularity-containers.
+    // The galaxy container uses a runscript which is responsible for setting LD_PRELOAD properly. Nextflow executes singularity containers using `singularity exec`, which avoids the run script, leading to the LD_LIBRARY_PATH/libstdc++.so.6 error.
+    if (workflow.containerEngine == 'singularity') {
+        fix_ld_library_path = 'LD_LIBRARY_PATH=/usr/local/lib/:\$LD_LIBRARY_PATH;export LD_LIBRARY_PATH'
+    } else {
+        fix_ld_library_path = ''
+    }
+
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def sentieon_auth_mech_base64 = task.ext.sentieon_auth_mech_base64 ?: ''
+    def sentieon_auth_data_base64 = task.ext.sentieon_auth_data_base64 ?: ''
+
+    """
+    if [ "\${#SENTIEON_LICENSE_BASE64}" -lt "1500" ]; then # If the string SENTIEON_LICENSE_BASE64 is short, then it is an encrypted url.
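+        # Heuristic (as per the comment above): a value this short cannot hold a whole
+        # base64-encoded license file, so it is taken to be an encoded license-server
+        # address and decoded directly; longer values are written to a temporary
+        # license file in the else-branch below.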
+        export SENTIEON_LICENSE=\$(echo -e "\$SENTIEON_LICENSE_BASE64" | base64 -d)
+    else # Localhost license file
+        # The license file is stored as a Nextflow secret, set for instance like this:
+        # nextflow secrets set SENTIEON_LICENSE_BASE64 \$(cat <license_file> | base64 -w 0)
+        export SENTIEON_LICENSE=\$(mktemp)
+        echo -e "\$SENTIEON_LICENSE_BASE64" | base64 -d > \$SENTIEON_LICENSE
+    fi
+
+    if [ ${sentieon_auth_mech_base64} ] && [ ${sentieon_auth_data_base64} ]; then
+        # If sentieon_auth_mech_base64 and sentieon_auth_data_base64 are non-empty strings, then Sentieon is most likely being run with some test-license.
+        export SENTIEON_AUTH_MECH=\$(echo -n "${sentieon_auth_mech_base64}" | base64 -d)
+        export SENTIEON_AUTH_DATA=\$(echo -n "${sentieon_auth_data_base64}" | base64 -d)
+        echo "Decoded and exported Sentieon test-license system environment variables"
+    fi
+
+    $fix_ld_library_path
+
+    sentieon driver -r ${fasta} --algo ApplyVarCal \\
+        -v $vcf \\
+        --recal $recal \\
+        --tranches_file $tranches \\
+        $args \\
+        ${prefix}.vcf.gz
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        sentieon: \$(echo \$(sentieon driver --version 2>&1) | sed -e "s/sentieon-genomics-//g")
+    END_VERSIONS
+    """
+
+    stub:
+    // The following code sets LD_LIBRARY_PATH in the script-section when the module is run by Singularity.
+    // That turned out to be one way of overcoming the following issue with the Singularity-Sentieon-containers from galaxy, Sentieon (LD_LIBRARY_PATH) and the way Nextflow runs Singularity-containers.
+    // The galaxy container uses a runscript which is responsible for setting LD_PRELOAD properly. Nextflow executes singularity containers using `singularity exec`, which avoids the run script, leading to the LD_LIBRARY_PATH/libstdc++.so.6 error.
+    if (workflow.containerEngine == 'singularity') {
+        fix_ld_library_path = 'LD_LIBRARY_PATH=/usr/local/lib/:\$LD_LIBRARY_PATH;export LD_LIBRARY_PATH'
+    } else {
+        fix_ld_library_path = ''
+    }
+
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    $fix_ld_library_path
+
+    touch ${prefix}.vcf.gz
+    touch ${prefix}.vcf.gz.tbi
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        sentieon: \$(echo \$(sentieon driver --version 2>&1) | sed -e "s/sentieon-genomics-//g")
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/sentieon/applyvarcal/meta.yml b/modules/nf-core/sentieon/applyvarcal/meta.yml
new file mode 100644
index 0000000000..da92ce3436
--- /dev/null
+++ b/modules/nf-core/sentieon/applyvarcal/meta.yml
@@ -0,0 +1,85 @@
+name: sentieon_applyvarcal
+description: |
+  Apply a score cutoff to filter variants based on a recalibration table.
+  Sentieon's ApplyVarCal performs the second pass in a two-stage process called Variant Quality Score Recalibration (VQSR).
+  Specifically, it applies filtering to the input variants based on the recalibration table produced
+  in the previous step VarCal and a target sensitivity value.
+  https://support.sentieon.com/manual/usages/general/#applyvarcal-algorithm
+keywords:
+  - sentieon
+  - applyvarcal
+  - varcal
+  - VQSR
+tools:
+  - sentieon:
+      description: |
+        Sentieon® provides complete solutions for secondary DNA/RNA analysis for a variety of sequencing platforms, including short and long reads.
+        Our software improves upon BWA, STAR, Minimap2, GATK, HaplotypeCaller, Mutect, and Mutect2 based pipelines and is deployable on any generic-CPU-based computing system.
+ homepage: https://www.sentieon.com/ + documentation: https://www.sentieon.com/ +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test'] + - vcf: + type: file + description: VCF file to be recalibrated, this should be the same file as used for the first stage VariantRecalibrator. + pattern: "*.vcf" + - vcf_tbi: + type: file + description: tabix index for the input vcf file. + pattern: "*.vcf.tbi" + - recal: + type: file + description: Recalibration file produced when the input vcf was run through VariantRecalibrator in stage 1. + pattern: "*.recal" + - recal_index: + type: file + description: Index file for the recalibration file. + pattern: ".recal.idx" + - tranches: + type: file + description: Tranches file produced when the input vcf was run through VariantRecalibrator in stage 1. + pattern: ".tranches" + - meta2: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test'] + - fasta: + type: file + description: The reference fasta file + pattern: "*.fasta" + - meta3: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test'] + - fai: + type: file + description: Index of reference fasta file + pattern: "*.fasta.fai" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test'] + - vcf: + type: file + description: compressed vcf file containing the recalibrated variants. + pattern: "*.vcf.gz" + - tbi: + type: file + description: Index of recalibrated vcf file. + pattern: "*vcf.gz.tbi" + - versions: + type: file + description: File containing software versions. + pattern: "versions.yml" +authors: + - "@assp8200" +maintainers: + - "@assp8200" diff --git a/modules/nf-core/sentieon/bwamem/environment.yml b/modules/nf-core/sentieon/bwamem/environment.yml new file mode 100644 index 0000000000..c090bfa5ae --- /dev/null +++ b/modules/nf-core/sentieon/bwamem/environment.yml @@ -0,0 +1,7 @@ +name: sentieon_bwamem +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::sentieon=202308.01 diff --git a/modules/nf-core/sentieon/bwamem/main.nf b/modules/nf-core/sentieon/bwamem/main.nf new file mode 100644 index 0000000000..e25515e7eb --- /dev/null +++ b/modules/nf-core/sentieon/bwamem/main.nf @@ -0,0 +1,99 @@ +process SENTIEON_BWAMEM { + tag "$meta.id" + label 'process_high' + label 'sentieon' + + secret 'SENTIEON_LICENSE_BASE64' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/sentieon:202308.01--h43eeafb_0' : + 'biocontainers/sentieon:202308.01--h43eeafb_0' }" + + input: + tuple val(meta), path(reads) + tuple val(meta2), path(index) + tuple val(meta3), path(fasta) + tuple val(meta4), path(fasta_fai) + + output: + tuple val(meta), path("*.bam"), path("*.bai"), emit: bam_and_bai + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // The following code sets LD_LIBRARY_PATH in the script-section when the module is run by Singularity. + // That turned out to be one way of overcoming the following issue with the Singularity-Sentieon-containers from galaxy, Sentieon (LD_LIBRARY_PATH) and the way Nextflow runs Singularity-containers. + // The galaxy container uses a runscript which is responsible for setting LD_PRELOAD properly. 
Nextflow executes singularity containers using `singularity exec`, which avoids the run script, leading to the LD_LIBRARY_PATH/libstdc++.so.6 error.
+    if (workflow.containerEngine == 'singularity') {
+        fix_ld_library_path = 'LD_LIBRARY_PATH=/usr/local/lib/:\$LD_LIBRARY_PATH;export LD_LIBRARY_PATH'
+    } else {
+        fix_ld_library_path = ''
+    }
+
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def sentieon_auth_mech_base64 = task.ext.sentieon_auth_mech_base64 ?: ''
+    def sentieon_auth_data_base64 = task.ext.sentieon_auth_data_base64 ?: ''
+
+    """
+    if [ "\${#SENTIEON_LICENSE_BASE64}" -lt "1500" ]; then # If the string SENTIEON_LICENSE_BASE64 is short, then it is an encrypted url.
+        export SENTIEON_LICENSE=\$(echo -e "\$SENTIEON_LICENSE_BASE64" | base64 -d)
+    else # Localhost license file
+        # The license file is stored as a Nextflow secret, set for instance like this:
+        # nextflow secrets set SENTIEON_LICENSE_BASE64 \$(cat <license_file> | base64 -w 0)
+        export SENTIEON_LICENSE=\$(mktemp)
+        echo -e "\$SENTIEON_LICENSE_BASE64" | base64 -d > \$SENTIEON_LICENSE
+    fi
+
+    if [ ${sentieon_auth_mech_base64} ] && [ ${sentieon_auth_data_base64} ]; then
+        # If sentieon_auth_mech_base64 and sentieon_auth_data_base64 are non-empty strings, then Sentieon is most likely being run with some test-license.
+        export SENTIEON_AUTH_MECH=\$(echo -n "${sentieon_auth_mech_base64}" | base64 -d)
+        export SENTIEON_AUTH_DATA=\$(echo -n "${sentieon_auth_data_base64}" | base64 -d)
+        echo "Decoded and exported Sentieon test-license system environment variables"
+    fi
+
+    $fix_ld_library_path
+
+    INDEX=`find -L ./ -name "*.amb" | sed 's/.amb//'`
+
+    sentieon bwa mem \\
+        $args \\
+        -t $task.cpus \\
+        \$INDEX \\
+        $reads \\
+        | sentieon util sort -r $fasta -t $task.cpus -o ${prefix}.bam --sam2bam -
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        sentieon: \$(echo \$(sentieon driver --version 2>&1) | sed -e "s/sentieon-genomics-//g")
+        bwa: \$(echo \$(sentieon bwa 2>&1) | sed 's/^.*Version: //; s/Contact:.*\$//')
+    END_VERSIONS
+    """
+
+    stub:
+    // The following code sets LD_LIBRARY_PATH in the script-section when the module is run by Singularity.
+    // That turned out to be one way of overcoming the following issue with the Singularity-Sentieon-containers from galaxy, Sentieon (LD_LIBRARY_PATH) and the way Nextflow runs Singularity-containers.
+    // The galaxy container uses a runscript which is responsible for setting LD_PRELOAD properly. Nextflow executes singularity containers using `singularity exec`, which avoids the run script, leading to the LD_LIBRARY_PATH/libstdc++.so.6 error.
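+    // Concretely, under Singularity this prepends one line to the generated script:
+    //   LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH;export LD_LIBRARY_PATH
+    // so the container's libraries take precedence again, as the runscript would have arranged.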
+ if (workflow.containerEngine == 'singularity') { + fix_ld_library_path = 'LD_LIBRARY_PATH=/usr/local/lib/:\$LD_LIBRARY_PATH;export LD_LIBRARY_PATH' + } else { + fix_ld_library_path = '' + } + + def prefix = task.ext.prefix ?: "${meta.id}" + """ + $fix_ld_library_path + + touch ${prefix}.bam + touch ${prefix}.bam.bai + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + sentieon: \$(echo \$(sentieon driver --version 2>&1) | sed -e "s/sentieon-genomics-//g") + bwa: \$(echo \$(sentieon bwa 2>&1) | sed 's/^.*Version: //; s/Contact:.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/sentieon/bwamem/meta.yml b/modules/nf-core/sentieon/bwamem/meta.yml new file mode 100644 index 0000000000..0859a923ca --- /dev/null +++ b/modules/nf-core/sentieon/bwamem/meta.yml @@ -0,0 +1,75 @@ +name: sentieon_bwamem +description: Performs fastq alignment to a fasta reference using Sentieon's BWA MEM +keywords: + - mem + - bwa + - alignment + - map + - fastq + - bam + - sentieon +tools: + - sentieon: + description: | + Sentieon® provides complete solutions for secondary DNA/RNA analysis for a variety of sequencing platforms, including short and long reads. + Our software improves upon BWA, STAR, Minimap2, GATK, HaplotypeCaller, Mutect, and Mutect2 based pipelines and is deployable on any generic-CPU-based computing system. + homepage: https://www.sentieon.com/ + documentation: https://www.sentieon.com/ +input: + - meta: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: Genome fastq files (single-end or paired-end) + - meta2: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test', single_end:false ] + - index: + type: file + description: BWA genome index files + pattern: "*.{amb,ann,bwt,pac,sa}" + - meta3: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: Genome fasta file + pattern: "*.{fa,fasta}" + - meta4: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test', single_end:false ] + - fasta_fai: + type: file + description: The index of the FASTA reference. + pattern: "*.fai" +output: + - meta: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM file. 
+ pattern: "*.bam" + - bai: + type: file + description: BAI file + pattern: "*.bai" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@asp8200" +maintainers: + - "@asp8200" diff --git a/modules/nf-core/sentieon/dedup/environment.yml b/modules/nf-core/sentieon/dedup/environment.yml new file mode 100644 index 0000000000..622cf73909 --- /dev/null +++ b/modules/nf-core/sentieon/dedup/environment.yml @@ -0,0 +1,7 @@ +name: sentieon_dedup +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::sentieon=202308.01 diff --git a/modules/nf-core/sentieon/dedup/main.nf b/modules/nf-core/sentieon/dedup/main.nf new file mode 100644 index 0000000000..01ee885a17 --- /dev/null +++ b/modules/nf-core/sentieon/dedup/main.nf @@ -0,0 +1,107 @@ +process SENTIEON_DEDUP { + tag "$meta.id" + label 'process_medium' + label 'sentieon' + + secret 'SENTIEON_LICENSE_BASE64' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/sentieon:202308.01--h43eeafb_0' : + 'biocontainers/sentieon:202308.01--h43eeafb_0' }" + + input: + tuple val(meta), path(bam), path(bai) + tuple val(meta2), path(fasta) + tuple val(meta3), path(fasta_fai) + + output: + tuple val(meta), path("*.cram") , emit: cram, optional: true + tuple val(meta), path("*.crai") , emit: crai, optional: true + tuple val(meta), path("*.bam") , emit: bam , optional: true + tuple val(meta), path("*.bai") , emit: bai + tuple val(meta), path("*.score") , emit: score + tuple val(meta), path("*.metrics") , emit: metrics + tuple val(meta), path("*.metrics.multiqc.tsv"), emit: metrics_multiqc_tsv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // The following code sets LD_LIBRARY_PATH in the script-section when the module is run by Singularity. + // That turned out to be one way of overcoming the following issue with the Singularity-Sentieon-containers from galaxy, Sentieon (LD_LIBRARY_PATH) and the way Nextflow runs Singularity-containers. + // The galaxy container uses a runscript which is responsible for setting LD_PRELOAD properly. Nextflow executes singularity containers using `singularity exec`, which avoids the run script, leading to the LD_LIBRARY_PATH/libstdc++.so.6 error. + if (workflow.containerEngine == 'singularity') { + fix_ld_library_path = 'LD_LIBRARY_PATH=/usr/local/lib/:\$LD_LIBRARY_PATH;export LD_LIBRARY_PATH' + } else { + fix_ld_library_path = '' + } + + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def args3 = task.ext.args3 ?: '' + def args4 = task.ext.args4 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def suffix = task.ext.suffix ?: ".cram" // The suffix should be either ".cram" or ".bam". + def metrics = task.ext.metrics ?: "${prefix}${suffix}.metrics" + def sentieon_auth_mech_base64 = task.ext.sentieon_auth_mech_base64 ?: '' + def sentieon_auth_data_base64 = task.ext.sentieon_auth_data_base64 ?: '' + def input_list = bam.collect{"-i $it"}.join(' ') + + """ + if [ "\${#SENTIEON_LICENSE_BASE64}" -lt "1500" ]; then # If the string SENTIEON_LICENSE_BASE64 is short, then it is an encrypted url. 
+        export SENTIEON_LICENSE=\$(echo -e "\$SENTIEON_LICENSE_BASE64" | base64 -d)
+    else # Localhost license file
+        # The license file is stored as a nextflow variable like, for instance, this:
+        # nextflow secrets set SENTIEON_LICENSE_BASE64 \$(cat  | base64 -w 0)
+        export SENTIEON_LICENSE=\$(mktemp)
+        echo -e "\$SENTIEON_LICENSE_BASE64" | base64 -d > \$SENTIEON_LICENSE
+    fi
+
+    if [ ${sentieon_auth_mech_base64} ] && [ ${sentieon_auth_data_base64} ]; then
+        # If sentieon_auth_mech_base64 and sentieon_auth_data_base64 are non-empty strings, then Sentieon is most likely being run with some test-license.
+        export SENTIEON_AUTH_MECH=\$(echo -n "${sentieon_auth_mech_base64}" | base64 -d)
+        export SENTIEON_AUTH_DATA=\$(echo -n "${sentieon_auth_data_base64}" | base64 -d)
+        echo "Decoded and exported Sentieon test-license system environment variables"
+    fi
+
+    $fix_ld_library_path
+
+    sentieon driver $args $input_list -r ${fasta} --algo LocusCollector $args2 --fun score_info ${prefix}.score
+    sentieon driver $args3 -t $task.cpus $input_list -r ${fasta} --algo Dedup $args4 --score_info ${prefix}.score --metrics ${metrics} ${prefix}${suffix}
+    # The following tsv-file is produced in order to get a proper tsv-file with Dedup-metrics for importing in MultiQC as "custom content".
+    # It should be removed once MultiQC has a module for displaying Dedup-metrics.
+    head -3 ${metrics} > ${metrics}.multiqc.tsv
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        sentieon: \$(echo \$(sentieon driver --version 2>&1) | sed -e "s/sentieon-genomics-//g")
+    END_VERSIONS
+    """
+
+    stub:
+    // The following code sets LD_LIBRARY_PATH in the script-section when the module is run by Singularity.
+    // That turned out to be one way of overcoming the following issue with the Singularity-Sentieon-containers from galaxy, Sentieon (LD_LIBRARY_PATH) and the way Nextflow runs Singularity-containers.
+    // The galaxy container uses a runscript which is responsible for setting LD_PRELOAD properly. Nextflow executes singularity containers using `singularity exec`, which avoids the run script, leading to the LD_LIBRARY_PATH/libstdc++.so.6 error.
+    if (workflow.containerEngine == 'singularity') {
+        fix_ld_library_path = 'LD_LIBRARY_PATH=/usr/local/lib/:\$LD_LIBRARY_PATH;export LD_LIBRARY_PATH'
+    } else {
+        fix_ld_library_path = ''
+    }
+
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    $fix_ld_library_path
+
+    touch ${prefix}.cram
+    touch ${prefix}.cram.crai
+    touch ${prefix}.metrics
+    touch ${prefix}.score
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        sentieon: \$(echo \$(sentieon driver --version 2>&1) | sed -e "s/sentieon-genomics-//g")
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/sentieon/dedup/meta.yml b/modules/nf-core/sentieon/dedup/meta.yml
new file mode 100644
index 0000000000..0efbb96c22
--- /dev/null
+++ b/modules/nf-core/sentieon/dedup/meta.yml
@@ -0,0 +1,90 @@
+name: sentieon_dedup
+description: Runs the sentieon tool LocusCollector followed by Dedup. LocusCollector collects read information that is used by Dedup, which in turn marks or removes duplicate reads.
+keywords:
+  - mem
+  - dedup
+  - map
+  - bam
+  - cram
+  - sentieon
+tools:
+  - sentieon:
+      description: |
+        Sentieon® provides complete solutions for secondary DNA/RNA analysis for a variety of sequencing platforms, including short and long reads.
+        Our software improves upon BWA, STAR, Minimap2, GATK, HaplotypeCaller, Mutect, and Mutect2 based pipelines and is deployable on any generic-CPU-based computing system.
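Because SENTIEON_DEDUP derives its output container format from `task.ext.suffix` (defaulting to `.cram`, with `.bam` as the only other supported value), pipelines typically pin this in their modules configuration. A minimal sketch; the file path, selector and prefix convention are assumptions:

```nextflow
// conf/modules.config (hypothetical)
process {
    withName: 'SENTIEON_DEDUP' {
        ext.prefix = { "${meta.id}.md" } // name stem of the duplicate-marked output
        ext.suffix = '.bam'              // must be '.bam' or '.cram' per the module
    }
}
```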
+ homepage: https://www.sentieon.com/ + documentation: https://www.sentieon.com/ +input: + - meta: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM file. + pattern: "*.bam" + - bai: + type: file + description: BAI file + pattern: "*.bai" + - meta2: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: Genome fasta file + pattern: "*.{fa,fasta}" + - meta3: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test', single_end:false ] + - fasta_fai: + type: file + description: The index of the FASTA reference. + pattern: "*.fai" +output: + - meta: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test', single_end:false ] + - cram: + type: file + description: CRAM file + pattern: "*.cram" + - crai: + type: file + description: CRAM index file + pattern: "*.crai" + - bam: + type: file + description: BAM file. + pattern: "*.bam" + - bai: + type: file + description: BAI file + pattern: "*.bai" + - score: + type: file + description: The score file indicates which reads LocusCollector finds are likely duplicates. + pattern: "*.score" + - metrics: + type: file + description: Output file containing Dedup metrics incl. histogram data. + pattern: "*.metrics" + - metrics_multiqc_tsv: + type: file + description: Output tsv-file containing Dedup metrics excl. histogram data. + pattern: "*.metrics.multiqc.tsv" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@asp8200" +maintainers: + - "@asp8200" diff --git a/modules/nf-core/sentieon/dnamodelapply/environment.yml b/modules/nf-core/sentieon/dnamodelapply/environment.yml new file mode 100644 index 0000000000..6d27d44a1d --- /dev/null +++ b/modules/nf-core/sentieon/dnamodelapply/environment.yml @@ -0,0 +1,7 @@ +name: sentieon_dnamodelapply +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::sentieon=202308.01 diff --git a/modules/nf-core/sentieon/dnamodelapply/main.nf b/modules/nf-core/sentieon/dnamodelapply/main.nf new file mode 100644 index 0000000000..1cbb02e3b3 --- /dev/null +++ b/modules/nf-core/sentieon/dnamodelapply/main.nf @@ -0,0 +1,99 @@ +process SENTIEON_DNAMODELAPPLY { + tag "$meta.id" + label 'process_high' + label 'sentieon' + + secret 'SENTIEON_LICENSE_BASE64' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/sentieon:202308.01--h43eeafb_0' : + 'biocontainers/sentieon:202308.01--h43eeafb_0' }" + + input: + tuple val(meta), path(vcf), path(idx) + tuple val(meta2), path(fasta) + tuple val(meta3), path(fai) + tuple val(meta4), path(ml_model) + + output: + tuple val(meta), path("*.vcf.gz") , emit: vcf + tuple val(meta), path("*.vcf.gz.tbi"), emit: index + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // The following code sets LD_LIBRARY_PATH in the script-section when the module is run by Singularity. + // That turned out to be one way of overcoming the following issue with the Singularity-Sentieon-containers from galaxy, Sentieon (LD_LIBRARY_PATH) and the way Nextflow runs Singularity-containers. 
+    // The galaxy container uses a runscript which is responsible for setting LD_PRELOAD properly. Nextflow executes singularity containers using `singularity exec`, which avoids the run script, leading to the LD_LIBRARY_PATH/libstdc++.so.6 error.
+    if (workflow.containerEngine == 'singularity') {
+        fix_ld_library_path = 'LD_LIBRARY_PATH=/usr/local/lib/:\$LD_LIBRARY_PATH;export LD_LIBRARY_PATH'
+    } else {
+        fix_ld_library_path = ''
+    }
+
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def sentieon_auth_mech_base64 = task.ext.sentieon_auth_mech_base64 ?: ''
+    def sentieon_auth_data_base64 = task.ext.sentieon_auth_data_base64 ?: ''
+
+    """
+    if [ "\${#SENTIEON_LICENSE_BASE64}" -lt "1500" ]; then # If the string SENTIEON_LICENSE_BASE64 is short, then it is an encrypted url.
+        export SENTIEON_LICENSE=\$(echo -e "\$SENTIEON_LICENSE_BASE64" | base64 -d)
+    else # Localhost license file
+        # The license file is stored as a nextflow variable like, for instance, this:
+        # nextflow secrets set SENTIEON_LICENSE_BASE64 \$(cat  | base64 -w 0)
+        export SENTIEON_LICENSE=\$(mktemp)
+        echo -e "\$SENTIEON_LICENSE_BASE64" | base64 -d > \$SENTIEON_LICENSE
+    fi
+
+    if [ ${sentieon_auth_mech_base64} ] && [ ${sentieon_auth_data_base64} ]; then
+        # If sentieon_auth_mech_base64 and sentieon_auth_data_base64 are non-empty strings, then Sentieon is most likely being run with some test-license.
+        export SENTIEON_AUTH_MECH=\$(echo -n "${sentieon_auth_mech_base64}" | base64 -d)
+        export SENTIEON_AUTH_DATA=\$(echo -n "${sentieon_auth_data_base64}" | base64 -d)
+        echo "Decoded and exported Sentieon test-license system environment variables"
+    fi
+
+    $fix_ld_library_path
+
+    sentieon driver \\
+        -t $task.cpus \\
+        -r $fasta \\
+        $args \\
+        --algo DNAModelApply \\
+        --model $ml_model \\
+        -v $vcf \\
+        ${prefix}.vcf.gz
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        sentieon: \$(echo \$(sentieon driver --version 2>&1) | sed -e "s/sentieon-genomics-//g")
+    END_VERSIONS
+    """
+
+    stub:
+    // The following code sets LD_LIBRARY_PATH in the script-section when the module is run by Singularity.
+    // That turned out to be one way of overcoming the following issue with the Singularity-Sentieon-containers from galaxy, Sentieon (LD_LIBRARY_PATH) and the way Nextflow runs Singularity-containers.
+    // The galaxy container uses a runscript which is responsible for setting LD_PRELOAD properly. Nextflow executes singularity containers using `singularity exec`, which avoids the run script, leading to the LD_LIBRARY_PATH/libstdc++.so.6 error.
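As a hypothetical usage sketch (the include path, file names and meta maps are placeholders), DNAModelApply consumes the indexed calls together with the reference and the trained model:

```nextflow
include { SENTIEON_DNAMODELAPPLY } from '../modules/nf-core/sentieon/dnamodelapply/main'

workflow {
    vcf_tbi  = Channel.of([ [id:'sample1'], file('calls.vcf.gz'), file('calls.vcf.gz.tbi') ])
    fasta    = Channel.value([ [id:'fasta'], file('genome.fasta') ])
    fai      = Channel.value([ [id:'fai'],   file('genome.fasta.fai') ])
    ml_model = Channel.value([ [id:'model'], file('dnascope.model') ])

    SENTIEON_DNAMODELAPPLY(vcf_tbi, fasta, fai, ml_model)
}
```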
+ if (workflow.containerEngine == 'singularity') { + fix_ld_library_path = 'LD_LIBRARY_PATH=/usr/local/lib/:\$LD_LIBRARY_PATH;export LD_LIBRARY_PATH' + } else { + fix_ld_library_path = '' + } + + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + $fix_ld_library_path + + touch ${prefix}.vcf.gz + touch ${prefix}.vcf.gz.tbi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + sentieon: \$(echo \$(sentieon driver --version 2>&1) | sed -e "s/sentieon-genomics-//g" ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/sentieon/dnamodelapply/meta.yml b/modules/nf-core/sentieon/dnamodelapply/meta.yml new file mode 100644 index 0000000000..2507654577 --- /dev/null +++ b/modules/nf-core/sentieon/dnamodelapply/meta.yml @@ -0,0 +1,77 @@ +name: sentieon_dnamodelapply +description: modifies the input VCF file by adding the MLrejected FILTER to the variants +keywords: + - dnamodelapply + - vcf + - filter + - sentieon +tools: + - sentieon: + description: | + Sentieon® provides complete solutions for secondary DNA/RNA analysis for a variety of sequencing platforms, including short and long reads. + Our software improves upon BWA, STAR, Minimap2, GATK, HaplotypeCaller, Mutect, and Mutect2 based pipelines and is deployable on any generic-CPU-based computing system. + homepage: https://www.sentieon.com/ + documentation: https://www.sentieon.com/ +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test', single_end:false ]` + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. `[ id:'test' ]` + - meta3: + type: map + description: | + Groovy Map containing reference information + e.g. `[ id:'test' ]` + - meta4: + type: map + description: | + Groovy Map containing reference information + e.g. `[ id:'test' ]` + - vcf: + type: file + description: INPUT VCF file + pattern: "*.{vcf,vcf.gz}" + - idx: + type: file + description: Index of the input VCF file + pattern: "*.{tbi}" + - fasta: + type: file + description: Genome fasta file + pattern: "*.{fa,fasta}" + - fai: + type: file + description: Index of the genome fasta file + pattern: "*.fai" + - ml_model: + type: file + description: machine learning model file + pattern: "*.model" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'test', single_end:false ]`
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - vcf:
+      type: file
+      description: Output VCF file with the ML-model FILTER annotations applied
+      pattern: "*.{vcf,vcf.gz}"
+  - index:
+      type: file
+      description: Index of the output VCF file
+      pattern: "*.{tbi}"
+authors:
+  - "@ramprasadn"
+maintainers:
+  - "@ramprasadn"
diff --git a/modules/nf-core/sentieon/dnascope/environment.yml b/modules/nf-core/sentieon/dnascope/environment.yml
new file mode 100644
index 0000000000..45c2116c04
--- /dev/null
+++ b/modules/nf-core/sentieon/dnascope/environment.yml
@@ -0,0 +1,7 @@
+name: sentieon_dnascope
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::sentieon=202308.01
diff --git a/modules/nf-core/sentieon/dnascope/main.nf b/modules/nf-core/sentieon/dnascope/main.nf
new file mode 100644
index 0000000000..7e5adefc7c
--- /dev/null
+++ b/modules/nf-core/sentieon/dnascope/main.nf
@@ -0,0 +1,118 @@
+process SENTIEON_DNASCOPE {
+    tag "$meta.id"
+    label 'process_high'
+    label 'sentieon'
+
+    secret 'SENTIEON_LICENSE_BASE64'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/sentieon:202308.01--h43eeafb_0' :
+        'biocontainers/sentieon:202308.01--h43eeafb_0' }"
+
+    input:
+    tuple val(meta), path(bam), path(bai), path(intervals)
+    tuple val(meta2), path(fasta)
+    tuple val(meta3), path(fai)
+    tuple val(meta4), path(dbsnp)
+    tuple val(meta5), path(dbsnp_tbi)
+    tuple val(meta6), path(ml_model)
+    val(pcr_indel_model)
+    val(emit_vcf)
+    val(emit_gvcf)
+
+    output:
+    tuple val(meta), path("*.unfiltered.vcf.gz") , optional:true, emit: vcf // added the substring ".unfiltered" in the filename of the vcf-files since without that the g.vcf.gz-files were ending up in the vcf-channel
+    tuple val(meta), path("*.unfiltered.vcf.gz.tbi"), optional:true, emit: vcf_tbi
+    tuple val(meta), path("*.g.vcf.gz") , optional:true, emit: gvcf // these output-files have to have the extension ".vcf.gz", otherwise the subsequent GATK-MergeVCFs will fail.
+    tuple val(meta), path("*.g.vcf.gz.tbi") , optional:true, emit: gvcf_tbi
+    path "versions.yml" , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    // The following code sets LD_LIBRARY_PATH in the script-section when the module is run by Singularity.
+    // That turned out to be one way of overcoming the following issue with the Singularity-Sentieon-containers from galaxy, Sentieon (LD_LIBRARY_PATH) and the way Nextflow runs Singularity-containers.
+    // The galaxy container uses a runscript which is responsible for setting LD_PRELOAD properly. Nextflow executes singularity containers using `singularity exec`, which avoids the run script, leading to the LD_LIBRARY_PATH/libstdc++.so.6 error.
+    if (workflow.containerEngine == 'singularity') {
+        fix_ld_library_path = 'LD_LIBRARY_PATH=/usr/local/lib/:\$LD_LIBRARY_PATH;export LD_LIBRARY_PATH'
+    } else {
+        fix_ld_library_path = ''
+    }
+
+    def args = task.ext.args ?: ''   // options for the driver
+    def args2 = task.ext.args2 ?: '' // options for the vcf generation
+    def args3 = task.ext.args3 ?: '' // options for the gvcf generation
+    def interval = intervals ? "--interval ${intervals}" : ''
+    def dbsnp_cmd = dbsnp ? "-d ${dbsnp}" : ''
+    def model_cmd = ml_model ? " --model ${ml_model}" : ''
+    def pcr_indel_model_cmd = pcr_indel_model ? " --pcr_indel_model ${pcr_indel_model}" : ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def sentieon_auth_mech_base64 = task.ext.sentieon_auth_mech_base64 ?: ''
+    def sentieon_auth_data_base64 = task.ext.sentieon_auth_data_base64 ?: ''
+    def vcf_cmd = ""
+    def gvcf_cmd = ""
+    def base_cmd = '--algo DNAscope ' + dbsnp_cmd + ' '
+
+    if (emit_vcf) { // emit_vcf can be the empty string, 'variant', 'confident' or 'all' but NOT 'gvcf'
+        vcf_cmd = base_cmd + args2 + ' ' + model_cmd + pcr_indel_model_cmd + ' --emit_mode ' + emit_vcf + ' ' + prefix + '.unfiltered.vcf.gz'
+    }
+
+    if (emit_gvcf) { // emit_gvcf can be either true or false
+        gvcf_cmd = base_cmd + args3 + ' ' + model_cmd + pcr_indel_model_cmd + ' --emit_mode gvcf ' + prefix + '.g.vcf.gz'
+    }
+
+    """
+    if [ "\${#SENTIEON_LICENSE_BASE64}" -lt "1500" ]; then # If the string SENTIEON_LICENSE_BASE64 is short, then it is an encrypted url.
+        export SENTIEON_LICENSE=\$(echo -e "\$SENTIEON_LICENSE_BASE64" | base64 -d)
+    else # Localhost license file
+        # The license file is stored as a nextflow variable like, for instance, this:
+        # nextflow secrets set SENTIEON_LICENSE_BASE64 \$(cat  | base64 -w 0)
+        export SENTIEON_LICENSE=\$(mktemp)
+        echo -e "\$SENTIEON_LICENSE_BASE64" | base64 -d > \$SENTIEON_LICENSE
+    fi
+
+    if [ ${sentieon_auth_mech_base64} ] && [ ${sentieon_auth_data_base64} ]; then
+        # If sentieon_auth_mech_base64 and sentieon_auth_data_base64 are non-empty strings, then Sentieon is most likely being run with some test-license.
+        export SENTIEON_AUTH_MECH=\$(echo -n "${sentieon_auth_mech_base64}" | base64 -d)
+        export SENTIEON_AUTH_DATA=\$(echo -n "${sentieon_auth_data_base64}" | base64 -d)
+        echo "Decoded and exported Sentieon test-license system environment variables"
+    fi
+
+    $fix_ld_library_path
+
+    sentieon driver $args -r $fasta -t $task.cpus -i $bam $interval $vcf_cmd $gvcf_cmd
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        sentieon: \$(echo \$(sentieon driver --version 2>&1) | sed -e "s/sentieon-genomics-//g")
+    END_VERSIONS
+    """
+
+    stub:
+    // The following code sets LD_LIBRARY_PATH in the script-section when the module is run by Singularity.
+    // That turned out to be one way of overcoming the following issue with the Singularity-Sentieon-containers from galaxy, Sentieon (LD_LIBRARY_PATH) and the way Nextflow runs Singularity-containers.
+    // The galaxy container uses a runscript which is responsible for setting LD_PRELOAD properly. Nextflow executes singularity containers using `singularity exec`, which avoids the run script, leading to the LD_LIBRARY_PATH/libstdc++.so.6 error.
+    if (workflow.containerEngine == 'singularity') {
+        fix_ld_library_path = 'LD_LIBRARY_PATH=/usr/local/lib/:\$LD_LIBRARY_PATH;export LD_LIBRARY_PATH'
+    } else {
+        fix_ld_library_path = ''
+    }
+
+    def prefix = task.ext.prefix ?: "${meta.id}"
+
+    """
+    $fix_ld_library_path
+
+    touch ${prefix}.unfiltered.vcf.gz
+    touch ${prefix}.unfiltered.vcf.gz.tbi
+    touch ${prefix}.g.vcf.gz
+    touch ${prefix}.g.vcf.gz.tbi
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        sentieon: \$(echo \$(sentieon driver --version 2>&1) | sed -e "s/sentieon-genomics-//g" )
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/sentieon/dnascope/meta.yml b/modules/nf-core/sentieon/dnascope/meta.yml
new file mode 100644
index 0000000000..6b61cee828
--- /dev/null
+++ b/modules/nf-core/sentieon/dnascope/meta.yml
@@ -0,0 +1,120 @@
+name: sentieon_dnascope
+description: DNAscope algorithm performs an improved version of Haplotype variant calling.
+keywords:
+  - dnascope
+  - sentieon
+  - variant_calling
+tools:
+  - sentieon:
+      description: |
+        Sentieon® provides complete solutions for secondary DNA/RNA analysis for a variety of sequencing platforms, including short and long reads.
+        Our software improves upon BWA, STAR, Minimap2, GATK, HaplotypeCaller, Mutect, and Mutect2 based pipelines and is deployable on any generic-CPU-based computing system.
+      homepage: https://www.sentieon.com/
+      documentation: https://www.sentieon.com/
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information.
+        e.g. [ id:'test', single_end:false ]
+  - bam:
+      type: file
+      description: BAM file.
+      pattern: "*.bam"
+  - bai:
+      type: file
+      description: BAI file
+      pattern: "*.bai"
+  - intervals:
+      type: file
+      description: bed or interval_list file containing interval in the reference that will be used in the analysis
+      pattern: "*.{bed,interval_list}"
+  - meta2:
+      type: map
+      description: |
+        Groovy Map containing meta information for fasta.
+  - fasta:
+      type: file
+      description: Genome fasta file
+      pattern: "*.{fa,fasta}"
+  - meta3:
+      type: map
+      description: |
+        Groovy Map containing meta information for fasta index.
+  - fai:
+      type: file
+      description: Index of the genome fasta file
+      pattern: "*.fai"
+  - meta4:
+      type: map
+      description: |
+        Groovy Map containing meta information for dbsnp.
+  - dbsnp:
+      type: file
+      description: Single Nucleotide Polymorphism database (dbSNP) file
+      pattern: "*.vcf.gz"
+  - meta5:
+      type: map
+      description: |
+        Groovy Map containing meta information for dbsnp_tbi.
+  - dbsnp_tbi:
+      type: file
+      description: Index of the Single Nucleotide Polymorphism database (dbSNP) file
+      pattern: "*.vcf.gz.tbi"
+  - meta6:
+      type: map
+      description: |
+        Groovy Map containing meta information for machine learning model for Dnascope.
+  - ml_model:
+      type: file
+      description: machine learning model file
+      pattern: "*.model"
+  - pcr_indel_model:
+      type: string
+      description: |
+        Controls the option pcr_indel_model for Dnascope.
+        The possible options are "NONE" (used for PCR free samples), and "HOSTILE", "AGGRESSIVE" and "CONSERVATIVE".
+        See Sentieon's documentation for further explanation.
+  - emit_vcf:
+      type: string
+      description: |
+        Controls the vcf output from Dnascope.
+        Possible options are "all", "confident" and "variant".
+        See Sentieon's documentation for further explanation.
+  - emit_gvcf:
+      type: boolean
+      description: If true, DNAscope will output a gvcf
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information.
+        e.g.
[ id:'test', single_end:false ] + - vcf: + type: file + description: Compressed VCF file + pattern: "*.unfiltered.vcf.gz" + - vcf_tbi: + type: file + description: Index of VCF file + pattern: "*.unfiltered.vcf.gz.tbi" + - gvcf: + type: file + description: Compressed GVCF file + pattern: "*.g.vcf.gz" + - gvcf_tbi: + type: file + description: Index of GVCF file + pattern: "*.g.vcf.gz.tbi" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@ramprasadn" +maintainers: + - "@ramprasadn" diff --git a/modules/nf-core/sentieon/gvcftyper/environment.yml b/modules/nf-core/sentieon/gvcftyper/environment.yml new file mode 100644 index 0000000000..9a8143068a --- /dev/null +++ b/modules/nf-core/sentieon/gvcftyper/environment.yml @@ -0,0 +1,7 @@ +name: sentieon_gvcftyper +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::sentieon=202308.01 diff --git a/modules/nf-core/sentieon/gvcftyper/main.nf b/modules/nf-core/sentieon/gvcftyper/main.nf new file mode 100644 index 0000000000..d2be759fe3 --- /dev/null +++ b/modules/nf-core/sentieon/gvcftyper/main.nf @@ -0,0 +1,94 @@ +process SENTIEON_GVCFTYPER { + tag "$meta.id" + label 'process_high' + label 'sentieon' + + secret 'SENTIEON_LICENSE_BASE64' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/sentieon:202308.01--h43eeafb_0' : + 'biocontainers/sentieon:202308.01--h43eeafb_0' }" + + input: + tuple val(meta), path(gvcfs), path(tbis), path(intervals) + path fasta + path fai + path dbsnp + path dbsnp_tbi + + output: + tuple val(meta), path("*.vcf.gz") , emit: vcf_gz + tuple val(meta), path("*.vcf.gz.tbi"), emit: vcf_gz_tbi + path("versions.yml") , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // The following code sets LD_LIBRARY_PATH in the script-section when the module is run by Singularity. + // That turned out to be one way of overcoming the following issue with the Singularity-Sentieon-containers from galaxy, Sentieon (LD_LIBRARY_PATH) and the way Nextflow runs Singularity-containers. + // The galaxy container uses a runscript which is responsible for setting LD_PRELOAD properly. Nextflow executes singularity containers using `singularity exec`, which avoids the run script, leading to the LD_LIBRARY_PATH/libstdc++.so.6 error. + if (workflow.containerEngine == 'singularity') { + fix_ld_library_path = 'LD_LIBRARY_PATH=/usr/local/lib/:\$LD_LIBRARY_PATH;export LD_LIBRARY_PATH' + } else { + fix_ld_library_path = '' + } + + def prefix = task.ext.prefix ?: "${meta.id}" + def sentieon_auth_mech_base64 = task.ext.sentieon_auth_mech_base64 ?: '' + def sentieon_auth_data_base64 = task.ext.sentieon_auth_data_base64 ?: '' + def gvcfs_input = '-v ' + gvcfs.join(' -v ') + def dbsnp_cmd = dbsnp ? "--dbsnp $dbsnp" : "" + + """ + if [ "\${#SENTIEON_LICENSE_BASE64}" -lt "1500" ]; then # If the string SENTIEON_LICENSE_BASE64 is short, then it is an encrypted url. 
+        export SENTIEON_LICENSE=\$(echo -e "\$SENTIEON_LICENSE_BASE64" | base64 -d)
+    else # Localhost license file
+        # The license file is stored as a nextflow variable like, for instance, this:
+        # nextflow secrets set SENTIEON_LICENSE_BASE64 \$(cat  | base64 -w 0)
+        export SENTIEON_LICENSE=\$(mktemp)
+        echo -e "\$SENTIEON_LICENSE_BASE64" | base64 -d > \$SENTIEON_LICENSE
+    fi
+
+    if [ ${sentieon_auth_mech_base64} ] && [ ${sentieon_auth_data_base64} ]; then
+        # If sentieon_auth_mech_base64 and sentieon_auth_data_base64 are non-empty strings, then Sentieon is most likely being run with some test-license.
+        export SENTIEON_AUTH_MECH=\$(echo -n "${sentieon_auth_mech_base64}" | base64 -d)
+        export SENTIEON_AUTH_DATA=\$(echo -n "${sentieon_auth_data_base64}" | base64 -d)
+        echo "Decoded and exported Sentieon test-license system environment variables"
+    fi
+
+    $fix_ld_library_path
+
+    sentieon driver -r ${fasta} --algo GVCFtyper ${gvcfs_input} ${dbsnp_cmd} ${prefix}.vcf.gz
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        sentieon: \$(echo \$(sentieon driver --version 2>&1) | sed -e "s/sentieon-genomics-//g")
+    END_VERSIONS
+    """
+
+    stub:
+    // The following code sets LD_LIBRARY_PATH in the script-section when the module is run by Singularity.
+    // That turned out to be one way of overcoming the following issue with the Singularity-Sentieon-containers from galaxy, Sentieon (LD_LIBRARY_PATH) and the way Nextflow runs Singularity-containers.
+    // The galaxy container uses a runscript which is responsible for setting LD_PRELOAD properly. Nextflow executes singularity containers using `singularity exec`, which avoids the run script, leading to the LD_LIBRARY_PATH/libstdc++.so.6 error.
+    if (workflow.containerEngine == 'singularity') {
+        fix_ld_library_path = 'LD_LIBRARY_PATH=/usr/local/lib/:\$LD_LIBRARY_PATH;export LD_LIBRARY_PATH'
+    } else {
+        fix_ld_library_path = ''
+    }
+
+    def prefix = task.ext.prefix ?: "${meta.id}"
+
+    """
+    $fix_ld_library_path
+
+    touch ${prefix}.vcf.gz
+    touch ${prefix}.vcf.gz.tbi
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        sentieon: \$(echo \$(sentieon driver --version 2>&1) | sed -e "s/sentieon-genomics-//g")
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/sentieon/gvcftyper/meta.yml b/modules/nf-core/sentieon/gvcftyper/meta.yml
new file mode 100644
index 0000000000..5a83eb0308
--- /dev/null
+++ b/modules/nf-core/sentieon/gvcftyper/meta.yml
@@ -0,0 +1,71 @@
+name: sentieon_gvcftyper
+description: |
+  Perform joint genotyping on one or more samples pre-called with Sentieon's Haplotyper.
+keywords:
+  - joint genotyping
+  - genotype
+  - gvcf
+tools:
+  - sentieon:
+      description: |
+        Sentieon® provides complete solutions for secondary DNA/RNA analysis for a variety of sequencing platforms, including short and long reads.
+        Our software improves upon BWA, STAR, Minimap2, GATK, HaplotypeCaller, Mutect, and Mutect2 based pipelines and is deployable on any generic-CPU-based computing system.
+      homepage: https://www.sentieon.com/
+      documentation: https://www.sentieon.com/
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - gvcfs:
+      type: file
+      description: |
+        gVCF(.gz) file
+      pattern: "*.{vcf,vcf.gz}"
+  - tbis:
+      type: file
+      description: |
+        index of gvcf file
+      pattern: "*.tbi"
+  - intervals:
+      type: file
+      description: Interval file with the genomic regions included in the library (optional)
+  - fasta:
+      type: file
+      description: Reference fasta file
+      pattern: "*.fasta"
+  - fai:
+      type: file
+      description: Reference fasta index file
+      pattern: "*.fai"
+  - dbsnp:
+      type: file
+      description: dbSNP VCF file
+      pattern: "*.vcf.gz"
+  - dbsnp_tbi:
+      type: file
+      description: dbSNP VCF index file
+      pattern: "*.tbi"
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - vcf:
+      type: file
+      description: Genotyped VCF file
+      pattern: "*.vcf.gz"
+  - tbi:
+      type: file
+      description: Tabix index for the genotyped VCF file
+      pattern: "*.vcf.gz.tbi"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@asp8200"
+maintainers:
+  - "@asp8200"
diff --git a/modules/nf-core/sentieon/haplotyper/environment.yml b/modules/nf-core/sentieon/haplotyper/environment.yml
new file mode 100644
index 0000000000..a3a721cf1d
--- /dev/null
+++ b/modules/nf-core/sentieon/haplotyper/environment.yml
@@ -0,0 +1,7 @@
+name: sentieon_haplotyper
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::sentieon=202308.01
diff --git a/modules/nf-core/sentieon/haplotyper/main.nf b/modules/nf-core/sentieon/haplotyper/main.nf
new file mode 100644
index 0000000000..d349525dea
--- /dev/null
+++ b/modules/nf-core/sentieon/haplotyper/main.nf
@@ -0,0 +1,113 @@
+process SENTIEON_HAPLOTYPER {
+    tag "$meta.id"
+    label 'process_medium'
+    label 'sentieon'
+
+    secret 'SENTIEON_LICENSE_BASE64'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/sentieon:202308.01--h43eeafb_0' :
+        'biocontainers/sentieon:202308.01--h43eeafb_0' }"
+
+    input:
+    tuple val(meta), path(input), path(input_index), path(intervals)
+    path fasta
+    path fai
+    path dbsnp
+    path dbsnp_tbi
+    val(emit_vcf)
+    val(emit_gvcf)
+
+    output:
+    tuple val(meta), path("*.unfiltered.vcf.gz") , optional:true, emit: vcf // added the substring ".unfiltered" in the filename of the vcf-files since without that the g.vcf.gz-files were ending up in the vcf-channel
+    tuple val(meta), path("*.unfiltered.vcf.gz.tbi"), optional:true, emit: vcf_tbi
+    tuple val(meta), path("*.g.vcf.gz") , optional:true, emit: gvcf // these output-files have to have the extension ".vcf.gz", otherwise the subsequent GATK-MergeVCFs will fail.
+    tuple val(meta), path("*.g.vcf.gz.tbi") , optional:true, emit: gvcf_tbi
+    path "versions.yml" , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    // The following code sets LD_LIBRARY_PATH in the script-section when the module is run by Singularity.
+    // That turned out to be one way of overcoming the following issue with the Singularity-Sentieon-containers from galaxy, Sentieon (LD_LIBRARY_PATH) and the way Nextflow runs Singularity-containers.
+    // The galaxy container uses a runscript which is responsible for setting LD_PRELOAD properly. Nextflow executes singularity containers using `singularity exec`, which avoids the run script, leading to the LD_LIBRARY_PATH/libstdc++.so.6 error.
+    if (workflow.containerEngine == 'singularity') {
+        fix_ld_library_path = 'LD_LIBRARY_PATH=/usr/local/lib/:\$LD_LIBRARY_PATH;export LD_LIBRARY_PATH'
+    } else {
+        fix_ld_library_path = ''
+    }
+
+    def args = task.ext.args ?: ''   // options for the driver
+    def args2 = task.ext.args2 ?: '' // options for the vcf generation
+    def args3 = task.ext.args3 ?: '' // options for the gvcf generation
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def dbsnp_command = dbsnp ? "-d $dbsnp " : ""
+    def interval_command = intervals ? "--interval $intervals" : ""
+    def sentieon_auth_mech_base64 = task.ext.sentieon_auth_mech_base64 ?: ''
+    def sentieon_auth_data_base64 = task.ext.sentieon_auth_data_base64 ?: ''
+    def vcf_cmd = ""
+    def gvcf_cmd = ""
+    def base_cmd = '--algo Haplotyper ' + dbsnp_command
+
+    if (emit_vcf) { // emit_vcf can be the empty string, 'variant', 'confident' or 'all' but NOT 'gvcf'
+        vcf_cmd = base_cmd + args2 + ' --emit_mode ' + emit_vcf + ' ' + prefix + '.unfiltered.vcf.gz'
+    }
+
+    if (emit_gvcf) { // emit_gvcf can be either true or false
+        gvcf_cmd = base_cmd + args3 + ' --emit_mode gvcf ' + prefix + '.g.vcf.gz'
+    }
+
+    """
+    if [ "\${#SENTIEON_LICENSE_BASE64}" -lt "1500" ]; then # If the string SENTIEON_LICENSE_BASE64 is short, then it is an encrypted url.
+        export SENTIEON_LICENSE=\$(echo -e "\$SENTIEON_LICENSE_BASE64" | base64 -d)
+    else # Localhost license file
+        # The license file is stored as a nextflow variable like, for instance, this:
+        # nextflow secrets set SENTIEON_LICENSE_BASE64 \$(cat  | base64 -w 0)
+        export SENTIEON_LICENSE=\$(mktemp)
+        echo -e "\$SENTIEON_LICENSE_BASE64" | base64 -d > \$SENTIEON_LICENSE
+    fi
+
+    if [ ${sentieon_auth_mech_base64} ] && [ ${sentieon_auth_data_base64} ]; then
+        # If sentieon_auth_mech_base64 and sentieon_auth_data_base64 are non-empty strings, then Sentieon is most likely being run with some test-license.
+        export SENTIEON_AUTH_MECH=\$(echo -n "${sentieon_auth_mech_base64}" | base64 -d)
+        export SENTIEON_AUTH_DATA=\$(echo -n "${sentieon_auth_data_base64}" | base64 -d)
+        echo "Decoded and exported Sentieon test-license system environment variables"
+    fi
+
+    $fix_ld_library_path
+
+    sentieon driver $args -r $fasta -t $task.cpus -i $input $interval_command $vcf_cmd $gvcf_cmd
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        sentieon: \$(echo \$(sentieon driver --version 2>&1) | sed -e "s/sentieon-genomics-//g")
+    END_VERSIONS
+    """
+
+    stub:
+    // The following code sets LD_LIBRARY_PATH in the script-section when the module is run by Singularity.
+    // That turned out to be one way of overcoming the following issue with the Singularity-Sentieon-containers from galaxy, Sentieon (LD_LIBRARY_PATH) and the way Nextflow runs Singularity-containers.
+    // The galaxy container uses a runscript which is responsible for setting LD_PRELOAD properly. Nextflow executes singularity containers using `singularity exec`, which avoids the run script, leading to the LD_LIBRARY_PATH/libstdc++.so.6 error.
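Per the comments above, `ext.args`, `ext.args2` and `ext.args3` feed the driver, the vcf run and the gvcf run respectively, so emit-mode-specific options can be injected from configuration. A sketch only; the selector and the specific Sentieon option shown are illustrative assumptions, not part of this changeset:

```nextflow
process {
    withName: 'SENTIEON_HAPLOTYPER' {
        ext.prefix = { "${meta.id}.haplotyper" }
        ext.args2  = '--genotype_model multinomial' // hypothetical vcf-generation option
        ext.args3  = '--genotype_model multinomial' // hypothetical gvcf-generation option
    }
}
```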
+    if (workflow.containerEngine == 'singularity') {
+        fix_ld_library_path = 'LD_LIBRARY_PATH=/usr/local/lib/:\$LD_LIBRARY_PATH;export LD_LIBRARY_PATH'
+    } else {
+        fix_ld_library_path = ''
+    }
+
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    $fix_ld_library_path
+
+    touch ${prefix}.unfiltered.vcf.gz
+    touch ${prefix}.unfiltered.vcf.gz.tbi
+    touch ${prefix}.g.vcf.gz
+    touch ${prefix}.g.vcf.gz.tbi
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        sentieon: \$(echo \$(sentieon driver --version 2>&1) | sed -e "s/sentieon-genomics-//g")
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/sentieon/haplotyper/meta.yml b/modules/nf-core/sentieon/haplotyper/meta.yml
new file mode 100644
index 0000000000..c248db3fca
--- /dev/null
+++ b/modules/nf-core/sentieon/haplotyper/meta.yml
@@ -0,0 +1,84 @@
+name: sentieon_haplotyper
+description: Runs Sentieon's haplotyper for germline variant calling.
+keywords:
+  - sentieon
+  - haplotypecaller
+  - haplotype
+tools:
+  - sentieon:
+      description: |
+        Sentieon® provides complete solutions for secondary DNA/RNA analysis for a variety of sequencing platforms, including short and long reads.
+        Our software improves upon BWA, STAR, Minimap2, GATK, HaplotypeCaller, Mutect, and Mutect2 based pipelines and is deployable on any generic-CPU-based computing system.
+      homepage: https://www.sentieon.com/
+      documentation: https://www.sentieon.com/
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information.
+        e.g. [ id:'test', single_end:false ]
+  - input:
+      type: file
+      description: BAM/CRAM file from alignment
+      pattern: "*.{bam,cram}"
+  - input_index:
+      type: file
+      description: BAI/CRAI file from alignment
+      pattern: "*.{bai,crai}"
+  - intervals:
+      type: file
+      description: Bed file with the genomic regions included in the library (optional)
+  - fasta:
+      type: file
+      description: Genome fasta file
+      pattern: "*.{fa,fasta}"
+  - fai:
+      type: file
+      description: The index of the FASTA reference.
+      pattern: "*.fai"
+  - dbsnp:
+      type: file
+      description: VCF file containing known sites (optional)
+  - dbsnp_tbi:
+      type: file
+      description: VCF index of dbsnp (optional)
+  - emit_vcf:
+      type: string
+      description: |
+        Controls the vcf output from the haplotyper.
+        If emit_vcf is set to "all" then the haplotyper will output a vcf generated by the haplotyper in emit-mode "all".
+        If emit_vcf is set to "confident" then the haplotyper will output a vcf generated by the haplotyper in emit-mode "confident".
+        If emit_vcf is set to "variant" then the haplotyper will output a vcf generated by the haplotyper in emit-mode "variant".
+  - emit_gvcf:
+      type: boolean
+      description: If true, the haplotyper will output a gvcf
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information.
+        e.g.
[ id:'test', single_end:false ] + - vcf: + type: file + description: Compressed VCF file + pattern: "*.unfiltered.vcf.gz" + - vcf_tbi: + type: file + description: Index of VCF file + pattern: "*.unfiltered.vcf.gz.tbi" + - gvcf: + type: file + description: Compressed GVCF file + pattern: "*.g.vcf.gz" + - gvcf_tbi: + type: file + description: Index of GVCF file + pattern: "*.g.vcf.gz.tbi" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@asp8200" +maintainers: + - "@asp8200" diff --git a/modules/nf-core/sentieon/varcal/environment.yml b/modules/nf-core/sentieon/varcal/environment.yml new file mode 100644 index 0000000000..93921ff046 --- /dev/null +++ b/modules/nf-core/sentieon/varcal/environment.yml @@ -0,0 +1,7 @@ +name: sentieon_varcal +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::sentieon=202308.01 diff --git a/modules/nf-core/sentieon/varcal/main.nf b/modules/nf-core/sentieon/varcal/main.nf new file mode 100644 index 0000000000..7a0c807127 --- /dev/null +++ b/modules/nf-core/sentieon/varcal/main.nf @@ -0,0 +1,119 @@ +process SENTIEON_VARCAL { + tag "$meta.id" + label 'process_low' + label 'sentieon' + + secret 'SENTIEON_LICENSE_BASE64' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/sentieon:202308.01--h43eeafb_0' : + 'biocontainers/sentieon:202308.01--h43eeafb_0' }" + + input: + tuple val(meta), path(vcf), path(tbi) // input vcf and tbi of variants to recalibrate + path resource_vcf // resource vcf + path resource_tbi // resource tbi + val labels // string (or list of strings) containing dedicated resource labels already formatted with '--resource:' tag + path fasta + path fai + + output: + tuple val(meta), path("*.recal") , emit: recal + tuple val(meta), path("*.idx") , emit: idx + tuple val(meta), path("*.tranches"), emit: tranches + tuple val(meta), path("*plots.R") , emit: plots, optional:true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // The following code sets LD_LIBRARY_PATH in the script-section when the module is run by Singularity. + // That turned out to be one way of overcoming the following issue with the Singularity-Sentieon-containers from galaxy, Sentieon (LD_LIBRARY_PATH) and the way Nextflow runs Singularity-containers. + // The galaxy container uses a runscript which is responsible for setting LD_PRELOAD properly. Nextflow executes singularity containers using `singularity exec`, which avoids the run script, leading to the LD_LIBRARY_PATH/libstdc++.so.6 error. + if (workflow.containerEngine == 'singularity') { + fix_ld_library_path = 'LD_LIBRARY_PATH=/usr/local/lib/:\$LD_LIBRARY_PATH;export LD_LIBRARY_PATH' + } else { + fix_ld_library_path = '' + } + + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def reference_command = fasta ? "--reference $fasta " : '' + def labels_command = '' + + // labels is a list. 
Here is an example of what labels might look like:
+    // ['--resource:dbsnp,known=false,training=true,truth=false,prior=2.0 dbsnp_146.hg38.vcf.gz', '--resource:gatk,known=false,training=true,truth=true,prior=10.0 Homo_sapiens_assembly38.known_indels.vcf.gz --resource:mills,known=false,training=true,truth=true,prior=10.0 Mills_and_1000G_gold_standard.indels.hg38.vcf.gz']
+    for(label in labels){
+        for(gatk_resource_string in label.split('--resource:').findAll()){ // The findAll cmd is there to remove any empty string elements in the list
+            def items = gatk_resource_string.split(' ')
+            // Here is an example of what the list items might look like:
+            // ['dbsnp,known=false,training=true,truth=false,prior=2.0', 'dbsnp_146.hg38.vcf.gz']
+            if (items.size() != 2) {
+                error("Expected the list '${items}' to contain two elements.")
+            }
+            labels_command += "--resource ${items[1]} --resource_param ${items[0]} "
+        }
+    }
+
+    def sentieon_auth_mech_base64 = task.ext.sentieon_auth_mech_base64 ?: ''
+    def sentieon_auth_data_base64 = task.ext.sentieon_auth_data_base64 ?: ''
+
+    """
+    if [ "\${#SENTIEON_LICENSE_BASE64}" -lt "1500" ]; then # If the string SENTIEON_LICENSE_BASE64 is short, then it is an encrypted url.
+        export SENTIEON_LICENSE=\$(echo -e "\$SENTIEON_LICENSE_BASE64" | base64 -d)
+    else # Localhost license file
+        # The license file is stored as a nextflow variable like, for instance, this:
+        # nextflow secrets set SENTIEON_LICENSE_BASE64 \$(cat  | base64 -w 0)
+        export SENTIEON_LICENSE=\$(mktemp)
+        echo -e "\$SENTIEON_LICENSE_BASE64" | base64 -d > \$SENTIEON_LICENSE
+    fi
+
+    if [ ${sentieon_auth_mech_base64} ] && [ ${sentieon_auth_data_base64} ]; then
+        # If sentieon_auth_mech_base64 and sentieon_auth_data_base64 are non-empty strings, then Sentieon is most likely being run with some test-license.
+        export SENTIEON_AUTH_MECH=\$(echo -n "${sentieon_auth_mech_base64}" | base64 -d)
+        export SENTIEON_AUTH_DATA=\$(echo -n "${sentieon_auth_data_base64}" | base64 -d)
+        echo "Decoded and exported Sentieon test-license system environment variables"
+    fi
+
+    $fix_ld_library_path
+
+    sentieon driver -r ${fasta} --algo VarCal \\
+        -v $vcf \\
+        --tranches_file ${prefix}.tranches \\
+        $labels_command \\
+        $args \\
+        ${prefix}.recal
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        sentieon: \$(echo \$(sentieon driver --version 2>&1) | sed -e "s/sentieon-genomics-//g")
+    END_VERSIONS
+    """
+
+    stub:
+    // The following code sets LD_LIBRARY_PATH in the script-section when the module is run by Singularity.
+    // That turned out to be one way of overcoming the following issue with the Singularity-Sentieon-containers from galaxy, Sentieon (LD_LIBRARY_PATH) and the way Nextflow runs Singularity-containers.
+    // The galaxy container uses a runscript which is responsible for setting LD_PRELOAD properly. Nextflow executes singularity containers using `singularity exec`, which avoids the run script, leading to the LD_LIBRARY_PATH/libstdc++.so.6 error.
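To make the label parsing concrete, here is a self-contained Groovy rendering of the same loop with the first example resource from the comment; `labels_command` is what ends up on the VarCal command line:

```groovy
def labels = ['--resource:dbsnp,known=false,training=true,truth=false,prior=2.0 dbsnp_146.hg38.vcf.gz']
def labels_command = ''
for (label in labels) {
    for (gatk_resource_string in label.split('--resource:').findAll()) { // findAll drops empty strings
        def items = gatk_resource_string.split(' ')
        assert items.size() == 2 // [ resource params, resource file ]
        labels_command += "--resource ${items[1]} --resource_param ${items[0]} "
    }
}
assert labels_command == '--resource dbsnp_146.hg38.vcf.gz --resource_param dbsnp,known=false,training=true,truth=false,prior=2.0 '
```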
+ if (workflow.containerEngine == 'singularity') { + fix_ld_library_path = 'LD_LIBRARY_PATH=/usr/local/lib/:\$LD_LIBRARY_PATH;export LD_LIBRARY_PATH' + } else { + fix_ld_library_path = '' + } + + def prefix = task.ext.prefix ?: "${meta.id}" + """ + $fix_ld_library_path + + touch ${prefix}.recal + touch ${prefix}.idx + touch ${prefix}.tranches + touch ${prefix}plots.R + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + sentieon: \$(echo \$(sentieon driver --version 2>&1) | sed -e "s/sentieon-genomics-//g") + END_VERSIONS + """ +} diff --git a/modules/nf-core/sentieon/varcal/meta.yml b/modules/nf-core/sentieon/varcal/meta.yml new file mode 100644 index 0000000000..cad7ee106f --- /dev/null +++ b/modules/nf-core/sentieon/varcal/meta.yml @@ -0,0 +1,74 @@ +name: sentieon_varcal +description: | + Module for Sentieons VarCal. The VarCal algorithm calculates the Variant Quality Score Recalibration (VQSR). + VarCal builds a recalibration model for scoring variant quality. + https://support.sentieon.com/manual/usages/general/#varcal-algorithm +keywords: + - sentieon + - varcal + - variant recalibration +tools: + - sentieon: + description: | + Sentieon® provides complete solutions for secondary DNA/RNA analysis for a variety of sequencing platforms, including short and long reads. + Our software improves upon BWA, STAR, Minimap2, GATK, HaplotypeCaller, Mutect, and Mutect2 based pipelines and is deployable on any generic-CPU-based computing system. + homepage: https://www.sentieon.com/ + documentation: https://www.sentieon.com/ +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - vcf: + type: file + description: input vcf file containing the variants to be recalibrated + pattern: "*.vcf.gz" + - tbi: + type: file + description: tbi file matching with -vcf + pattern: "*.vcf.gz.tbi" + - resource_vcf: + type: file + description: all resource vcf files that are used with the corresponding '--resource' label + pattern: "*.vcf.gz" + - resource_tbi: + type: file + description: all resource tbi files that are used with the corresponding '--resource' label + pattern: "*.vcf.gz.tbi" + - labels: + type: string + description: necessary arguments for Sentieon's VarCal. Specified to directly match the resources provided. More information can be found at https://support.sentieon.com/manual/usages/general/#varcal-algorithm + - fasta: + type: file + description: The reference fasta file + pattern: "*.fasta" + - fai: + type: file + description: Index of reference fasta file + pattern: "fasta.fai" +output: + - recal: + type: file + description: Output recal file used by ApplyVQSR + pattern: "*.recal" + - idx: + type: file + description: Index file for the recal output file + pattern: "*.idx" + - tranches: + type: file + description: Output tranches file used by ApplyVQSR + pattern: "*.tranches" + - plots: + type: file + description: Optional output rscript file to aid in visualization of the input data and learned model. 
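Downstream, the recal, idx and tranches trio produced here is exactly what the applyvarcal module documented at the top of this section consumes alongside the original calls. A hypothetical wiring sketch; the process name SENTIEON_APPLYVARCAL and its tuple layout are assumed from that meta.yml:

```nextflow
vqsr_input = vcf_tbi                    // [ meta, vcf, tbi ]
    .join(SENTIEON_VARCAL.out.recal)    // + recal
    .join(SENTIEON_VARCAL.out.idx)      // + recal index
    .join(SENTIEON_VARCAL.out.tranches) // + tranches

SENTIEON_APPLYVARCAL(vqsr_input, fasta_map, fai_map) // [ meta2, fasta ], [ meta3, fai ]
```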
+      pattern: "*plots.R"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@asp8200"
+maintainers:
+  - "@asp8200"
diff --git a/modules/nf-core/snpeff/download/environment.yml b/modules/nf-core/snpeff/download/environment.yml
new file mode 100644
index 0000000000..62f3d5aad6
--- /dev/null
+++ b/modules/nf-core/snpeff/download/environment.yml
@@ -0,0 +1,7 @@
+name: snpeff_download
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::snpeff=5.1
diff --git a/modules/nf-core/snpeff/download/main.nf b/modules/nf-core/snpeff/download/main.nf
new file mode 100644
index 0000000000..f1fc4cc395
--- /dev/null
+++ b/modules/nf-core/snpeff/download/main.nf
@@ -0,0 +1,51 @@
+process SNPEFF_DOWNLOAD {
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/snpeff:5.1--hdfd78af_2' :
+        'biocontainers/snpeff:5.1--hdfd78af_2' }"
+
+    input:
+    tuple val(meta), val(genome), val(cache_version)
+
+    output:
+    tuple val(meta), path('snpeff_cache'), emit: cache
+    path "versions.yml" , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def avail_mem = 6144
+    if (!task.memory) {
+        log.info '[snpEff] Available memory not known - defaulting to 6GB. Specify process memory requirements to change this.'
+    } else {
+        avail_mem = (task.memory.mega*0.8).intValue()
+    }
+    """
+    snpEff \\
+        -Xmx${avail_mem}M \\
+        download ${genome}.${cache_version} \\
+        -dataDir \${PWD}/snpeff_cache \\
+        ${args}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        snpeff: \$(echo \$(snpEff -version 2>&1) | cut -f 2 -d ' ')
+    END_VERSIONS
+    """
+
+    stub:
+    """
+    mkdir -p snpeff_cache/${genome}.${cache_version} # the stub must create the 'snpeff_cache' directory declared as output
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        snpeff: \$(echo \$(snpEff -version 2>&1) | cut -f 2 -d ' ')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/snpeff/download/meta.yml b/modules/nf-core/snpeff/download/meta.yml
new file mode 100644
index 0000000000..f367c69664
--- /dev/null
+++ b/modules/nf-core/snpeff/download/meta.yml
@@ -0,0 +1,43 @@
+name: snpeff_download
+description: Downloads a snpEff annotation cache (genome database) for use with snpEff
+keywords:
+  - annotation
+  - effect prediction
+  - snpeff
+  - variant
+  - vcf
+tools:
+  - snpeff:
+      description: |
+        SnpEff is a variant annotation and effect prediction tool.
+        It annotates and predicts the effects of genetic variants on genes and proteins (such as amino acid changes).
+      homepage: https://pcingola.github.io/SnpEff/
+      documentation: https://pcingola.github.io/SnpEff/se_introduction/
+      licence: ["MIT"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - genome:
+      type: string
+      description: |
+        snpEff genome to download
+  - cache_version:
+      type: string
+      description: |
+        version of the snpEff cache to download
+output:
+  - cache:
+      type: file
+      description: |
+        snpEff cache
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@maxulysse"
+maintainers:
+  - "@maxulysse"
diff --git a/modules/nf-core/snpeff/snpeff/environment.yml b/modules/nf-core/snpeff/snpeff/environment.yml
new file mode 100644
index 0000000000..b492e6a88e
--- /dev/null
+++ b/modules/nf-core/snpeff/snpeff/environment.yml
@@ -0,0 +1,7 @@
+name: snpeff_snpeff
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::snpeff=5.1
diff --git a/modules/nf-core/snpeff/snpeff/main.nf b/modules/nf-core/snpeff/snpeff/main.nf
new file mode 100644
index 0000000000..cc4f2ccb36
--- /dev/null
+++ b/modules/nf-core/snpeff/snpeff/main.nf
@@ -0,0 +1,62 @@
+process SNPEFF_SNPEFF {
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/snpeff:5.1--hdfd78af_2' :
+        'biocontainers/snpeff:5.1--hdfd78af_2' }"
+
+    input:
+    tuple val(meta), path(vcf)
+    val db
+    tuple val(meta2), path(cache)
+
+    output:
+    tuple val(meta), path("*.ann.vcf"), emit: vcf
+    path "*.csv" , emit: report
+    path "*.html" , emit: summary_html
+    path "*.genes.txt" , emit: genes_txt
+    path "versions.yml" , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def avail_mem = 6144
+    if (!task.memory) {
+        log.info '[snpEff] Available memory not known - defaulting to 6GB. Specify process memory requirements to change this.'
+    } else {
+        avail_mem = (task.memory.mega*0.8).intValue()
+    }
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def cache_command = cache ? "-dataDir \${PWD}/${cache}" : ""
+    """
+    snpEff \\
+        -Xmx${avail_mem}M \\
+        $db \\
+        $args \\
+        -csvStats ${prefix}.csv \\
+        $cache_command \\
+        $vcf \\
+        > ${prefix}.ann.vcf
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        snpeff: \$(echo \$(snpEff -version 2>&1) | cut -f 2 -d ' ')
+    END_VERSIONS
+    """
+
+    stub:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    touch ${prefix}.ann.vcf
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        snpeff: \$(echo \$(snpEff -version 2>&1) | cut -f 2 -d ' ')
+    END_VERSIONS
+    """
+
+}
diff --git a/modules/nf-core/snpeff/snpeff/meta.yml b/modules/nf-core/snpeff/snpeff/meta.yml
new file mode 100644
index 0000000000..7559c3de08
--- /dev/null
+++ b/modules/nf-core/snpeff/snpeff/meta.yml
@@ -0,0 +1,60 @@
+name: snpeff_snpeff
+description: Genetic variant annotation and functional effect prediction toolbox
+keywords:
+  - annotation
+  - effect prediction
+  - snpeff
+  - variant
+  - vcf
+tools:
+  - snpeff:
+      description: |
+        SnpEff is a variant annotation and effect prediction tool.
+        It annotates and predicts the effects of genetic variants on genes and proteins (such as amino acid changes).
+      homepage: https://pcingola.github.io/SnpEff/
+      documentation: https://pcingola.github.io/SnpEff/se_introduction/
+      licence: ["MIT"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g.
[ id:'test', single_end:false ] + - vcf: + type: file + description: | + vcf to annotate + - db: + type: string + description: | + which db to annotate with + - cache: + type: file + description: | + path to snpEff cache (optional) +output: + - vcf: + type: file + description: | + annotated vcf + pattern: "*.ann.vcf" + - report: + type: file + description: snpEff report csv file + pattern: "*.csv" + - summary_html: + type: file + description: snpEff summary statistics in html file + pattern: "*.html" + - genes_txt: + type: file + description: txt (tab separated) file having counts of the number of variants affecting each transcript and gene + pattern: "*.genes.txt" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@maxulysse" +maintainers: + - "@maxulysse" diff --git a/modules/nf-core/strelka/germline/environment.yml b/modules/nf-core/strelka/germline/environment.yml new file mode 100644 index 0000000000..23bd165b21 --- /dev/null +++ b/modules/nf-core/strelka/germline/environment.yml @@ -0,0 +1,7 @@ +name: strelka_germline +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::strelka=2.9.10 diff --git a/modules/nf-core/strelka/germline/main.nf b/modules/nf-core/strelka/germline/main.nf new file mode 100644 index 0000000000..8f93356160 --- /dev/null +++ b/modules/nf-core/strelka/germline/main.nf @@ -0,0 +1,51 @@ +process STRELKA_GERMLINE { + tag "$meta.id" + label 'process_medium' + label 'error_retry' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/strelka:2.9.10--h9ee0642_1' : + 'biocontainers/strelka:2.9.10--h9ee0642_1' }" + + input: + tuple val(meta), path(input), path(input_index), path (target_bed), path (target_bed_tbi) + path fasta + path fai + + output: + tuple val(meta), path("*variants.vcf.gz") , emit: vcf + tuple val(meta), path("*variants.vcf.gz.tbi"), emit: vcf_tbi + tuple val(meta), path("*genome.vcf.gz") , emit: genome_vcf + tuple val(meta), path("*genome.vcf.gz.tbi") , emit: genome_vcf_tbi + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def regions = target_bed ? 
"--callRegions ${target_bed}" : "" + """ + configureStrelkaGermlineWorkflow.py \\ + --bam $input \\ + --referenceFasta $fasta \\ + $regions \\ + $args \\ + --runDir strelka + + sed -i s/"isEmail = isLocalSmtp()"/"isEmail = False"/g strelka/runWorkflow.py + + python strelka/runWorkflow.py -m local -j $task.cpus + mv strelka/results/variants/genome.*.vcf.gz ${prefix}.genome.vcf.gz + mv strelka/results/variants/genome.*.vcf.gz.tbi ${prefix}.genome.vcf.gz.tbi + mv strelka/results/variants/variants.vcf.gz ${prefix}.variants.vcf.gz + mv strelka/results/variants/variants.vcf.gz.tbi ${prefix}.variants.vcf.gz.tbi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + strelka: \$( configureStrelkaGermlineWorkflow.py --version ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/strelka/germline/meta.yml b/modules/nf-core/strelka/germline/meta.yml new file mode 100644 index 0000000000..6ee656683e --- /dev/null +++ b/modules/nf-core/strelka/germline/meta.yml @@ -0,0 +1,64 @@ +name: strelka_germline +description: Strelka2 is a fast and accurate small variant caller optimized for analysis of germline variation +keywords: + - variantcalling + - germline + - wgs + - vcf + - variants +tools: + - strelka: + description: Strelka calls somatic and germline small variants from mapped sequencing reads + homepage: https://github.com/Illumina/strelka + documentation: https://github.com/Illumina/strelka/blob/v2.9.x/docs/userGuide/README.md + tool_dev_url: https://github.com/Illumina/strelka + doi: 10.1038/s41592-018-0051-x + licence: ["GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test'] + - input: + type: file + description: BAM/CRAM file + pattern: "*.{bam,cram}" + - input_index: + type: file + description: BAM/CRAI index file + pattern: "*.{bai,crai}" + - target_bed: + type: file + description: An optional bed file + pattern: "*.{bed}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test'] + - vcf: + type: file + description: gzipped germline variant file + pattern: "*.{vcf.gz}" + - vcf_tbi: + type: file + description: index file for the vcf file + pattern: "*.vcf.gz.tbi" + - genome_vcf: + type: file + description: variant records and compressed non-variant blocks + pattern: "*_genome.vcf.gz" + - genome_vcf_tbi: + type: file + description: index file for the genome_vcf file + pattern: "*_genome.vcf.gz.tbi" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@arontommi" +maintainers: + - "@arontommi" diff --git a/modules/nf-core/strelka/somatic/environment.yml b/modules/nf-core/strelka/somatic/environment.yml new file mode 100644 index 0000000000..ecbc865ec9 --- /dev/null +++ b/modules/nf-core/strelka/somatic/environment.yml @@ -0,0 +1,7 @@ +name: strelka_somatic +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::strelka=2.9.10 diff --git a/modules/nf-core/strelka/somatic/main.nf b/modules/nf-core/strelka/somatic/main.nf new file mode 100644 index 0000000000..dd975bd563 --- /dev/null +++ b/modules/nf-core/strelka/somatic/main.nf @@ -0,0 +1,55 @@ +process STRELKA_SOMATIC { + tag "$meta.id" + label 'process_medium' + label 'error_retry' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/strelka:2.9.10--h9ee0642_1' : + 'biocontainers/strelka:2.9.10--h9ee0642_1' }" + + input: + tuple val(meta), path(input_normal), path(input_index_normal), path(input_tumor), path(input_index_tumor), path(manta_candidate_small_indels), path(manta_candidate_small_indels_tbi), path(target_bed), path(target_bed_index) + path fasta + path fai + + output: + tuple val(meta), path("*.somatic_indels.vcf.gz") , emit: vcf_indels + tuple val(meta), path("*.somatic_indels.vcf.gz.tbi"), emit: vcf_indels_tbi + tuple val(meta), path("*.somatic_snvs.vcf.gz") , emit: vcf_snvs + tuple val(meta), path("*.somatic_snvs.vcf.gz.tbi") , emit: vcf_snvs_tbi + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def options_target_bed = target_bed ? "--callRegions ${target_bed}" : "" + def options_manta = manta_candidate_small_indels ? "--indelCandidates ${manta_candidate_small_indels}" : "" + """ + + configureStrelkaSomaticWorkflow.py \\ + --tumor $input_tumor \\ + --normal $input_normal \\ + --referenceFasta $fasta \\ + ${options_target_bed} \\ + ${options_manta} \\ + $args \\ + --runDir strelka + + sed -i s/"isEmail = isLocalSmtp()"/"isEmail = False"/g strelka/runWorkflow.py + + python strelka/runWorkflow.py -m local -j $task.cpus + mv strelka/results/variants/somatic.indels.vcf.gz ${prefix}.somatic_indels.vcf.gz + mv strelka/results/variants/somatic.indels.vcf.gz.tbi ${prefix}.somatic_indels.vcf.gz.tbi + mv strelka/results/variants/somatic.snvs.vcf.gz ${prefix}.somatic_snvs.vcf.gz + mv strelka/results/variants/somatic.snvs.vcf.gz.tbi ${prefix}.somatic_snvs.vcf.gz.tbi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + strelka: \$( configureStrelkaSomaticWorkflow.py --version ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/strelka/somatic/meta.yml b/modules/nf-core/strelka/somatic/meta.yml new file mode 100644 index 0000000000..6032cd6d5f --- /dev/null +++ b/modules/nf-core/strelka/somatic/meta.yml @@ -0,0 +1,92 @@ +name: strelka_somatic +description: Strelka2 is a fast and accurate small variant caller optimized for analysis of germline variation in small cohorts and somatic variation in tumor/normal sample pairs +keywords: + - variant calling + - germline + - wgs + - vcf + - variants +tools: + - strelka: + description: Strelka calls somatic and germline small variants from mapped sequencing reads + homepage: https://github.com/Illumina/strelka + documentation: https://github.com/Illumina/strelka/blob/v2.9.x/docs/userGuide/README.md + tool_dev_url: https://github.com/Illumina/strelka + doi: 10.1038/s41592-018-0051-x + licence: ["GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - input_normal: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - input_index_normal: + type: file + description: BAM/CRAM/SAM index file + pattern: "*.{bai,crai,sai}" + - input_tumor: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - input_index_tumor: + type: file + description: BAM/CRAM/SAM index file + pattern: "*.{bai,crai,sai}" + - manta_candidate_small_indels: + type: file + description: VCF.gz file + pattern: "*.{vcf.gz}" + - manta_candidate_small_indels_tbi: + type: file + description: VCF.gz index file + pattern: "*.tbi" + - fasta: + type: file + description: Genome reference FASTA file + pattern: "*.{fa,fasta}" + - fai: + type: file + description: Genome reference FASTA index file + pattern: "*.{fa.fai,fasta.fai}" + - target_bed: + type: file + description: BED file containing target regions for variant calling + pattern: "*.{bed}" + - target_bed_tbi: + type: file + description: Index for BED file containing target regions for variant calling + pattern: "*.{bed.tbi}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - vcf_indels: + type: file + description: Gzipped VCF file containing variants + pattern: "*.{vcf.gz}" + - vcf_indels_tbi: + type: file + description: Index for gzipped VCF file containing variants + pattern: "*.{vcf.gz.tbi}" + - vcf_snvs: + type: file + description: Gzipped VCF file containing variants + pattern: "*.{vcf.gz}" + - vcf_snvs_tbi: + type: file + description: Index for gzipped VCF file containing variants + pattern: "*.{vcf.gz.tbi}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" +maintainers: + - "@drpatelh" diff --git a/modules/nf-core/svdb/merge/environment.yml b/modules/nf-core/svdb/merge/environment.yml new file mode 100644 index 0000000000..4aad50da52 --- /dev/null +++ b/modules/nf-core/svdb/merge/environment.yml @@ -0,0 +1,9 @@ +name: svdb_merge +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - svdb=2.8.1 + # renovate: datasource=conda depName=bioconda/samtools + - samtools=1.16.1 diff --git a/modules/nf-core/svdb/merge/main.nf b/modules/nf-core/svdb/merge/main.nf new file mode 100644 index 0000000000..0d9967dda9 --- /dev/null +++ b/modules/nf-core/svdb/merge/main.nf @@ -0,0 +1,59 @@ +process SVDB_MERGE { + tag "$meta.id" + label 'process_medium' + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/mulled-v2-c8daa8f9d69d3c5a1a4ff08283a166c18edb0000:af6f8534cd538a85ff43a2eae1b52b143e7abd05-0': + 'biocontainers/mulled-v2-c8daa8f9d69d3c5a1a4ff08283a166c18edb0000:af6f8534cd538a85ff43a2eae1b52b143e7abd05-0' }" + + input: + tuple val(meta), path(vcfs) + val (priority) + + output: + tuple val(meta), path("*_sv_merge.vcf.gz"), emit: vcf + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def input = "${vcfs.join(" ")}" + def prio = "" + if(priority) { + prio = "--priority ${priority.join(',')}" + input = "" + for (int index = 0; index < vcfs.size(); index++) { + input += " ${vcfs[index]}:${priority[index]}" + } + } + """ + svdb \\ + --merge \\ + $args \\ + $prio \\ + --vcf $input \\ + > ${prefix}_sv_merge.vcf + bgzip ${prefix}_sv_merge.vcf + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + svdb: \$( echo \$(svdb) | head -1 | sed 's/usage: SVDB-\\([0-9]\\.[0-9]\\.[0-9]\\).*/\\1/' ) + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}_sv_merge.vcf.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + svdb: \$( echo \$(svdb) | head -1 | sed 's/usage: SVDB-\\([0-9]\\.[0-9]\\.[0-9]\\).*/\\1/' ) + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/svdb/merge/meta.yml b/modules/nf-core/svdb/merge/meta.yml new file mode 100644 index 0000000000..84265acb84 --- /dev/null +++ b/modules/nf-core/svdb/merge/meta.yml @@ -0,0 +1,43 @@ +name: svdb_merge +description: The merge module merges structural variants within one or more vcf files. +keywords: + - structural variants + - vcf + - merge +tools: + - svdb: + description: structural variant database software + homepage: https://github.com/J35P312/SVDB + documentation: https://github.com/J35P312/SVDB/blob/master/README.md + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - priority: + type: list + description: prioritise the input vcf files according to this list, e.g ['tiddit','cnvnator'] + - vcfs: + type: list + description: Two or more VCF files. Order of files should correspond to the order of tags used for priority. + pattern: "*.{vcf,vcf.gz}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test' ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - vcf: + type: file + description: merged VCF file + pattern: "*_sv_merge.vcf.gz" +authors: + - "@ramprasadn" +maintainers: + - "@ramprasadn" diff --git a/modules/nf-core/tabix/bgziptabix/environment.yml b/modules/nf-core/tabix/bgziptabix/environment.yml new file mode 100644 index 0000000000..028461c987 --- /dev/null +++ b/modules/nf-core/tabix/bgziptabix/environment.yml @@ -0,0 +1,7 @@ +name: tabix_bgziptabix +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::tabix=1.11 diff --git a/modules/nf-core/tabix/bgziptabix/main.nf b/modules/nf-core/tabix/bgziptabix/main.nf new file mode 100644 index 0000000000..f9482690ba --- /dev/null +++ b/modules/nf-core/tabix/bgziptabix/main.nf @@ -0,0 +1,47 @@ +process TABIX_BGZIPTABIX { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/tabix:1.11--hdfd78af_0' : + 'biocontainers/tabix:1.11--hdfd78af_0' }" + + input: + tuple val(meta), path(input) + + output: + tuple val(meta), path("*.gz"), path("*.tbi"), optional: true, emit: gz_tbi + tuple val(meta), path("*.gz"), path("*.csi"), optional: true, emit: gz_csi + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + bgzip --threads ${task.cpus} -c $args $input > ${prefix}.${input.getExtension()}.gz + tabix $args2 ${prefix}.${input.getExtension()}.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + tabix: \$(echo \$(tabix -h 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.${input.getExtension()}.gz + touch ${prefix}.${input.getExtension()}.gz.tbi + touch ${prefix}.${input.getExtension()}.gz.csi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + tabix: \$(echo \$(tabix -h 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/tabix/bgziptabix/meta.yml b/modules/nf-core/tabix/bgziptabix/meta.yml new file mode 100644 index 0000000000..438aba4d18 --- /dev/null +++ b/modules/nf-core/tabix/bgziptabix/meta.yml @@ -0,0 +1,53 @@ +name: tabix_bgziptabix +description: bgzip a sorted tab-delimited genome file and then create tabix index +keywords: + - bgzip + - compress + - index + - tabix + - vcf +tools: + - tabix: + description: Generic indexer for TAB-delimited genome position files. + homepage: https://www.htslib.org/doc/tabix.html + documentation: https://www.htslib.org/doc/tabix.1.html + doi: 10.1093/bioinformatics/btq671 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - tab: + type: file + description: TAB-delimited genome position file + pattern: "*.{bed,gff,sam,vcf}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - gz: + type: file + description: Output compressed file + pattern: "*.{gz}" + - tbi: + type: file + description: tabix index file + pattern: "*.{gz.tbi}" + - csi: + type: file + description: tabix alternate index file + pattern: "*.{gz.csi}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@maxulysse" + - "@DLBPointon" +maintainers: + - "@maxulysse" + - "@DLBPointon" diff --git a/modules/nf-core/tabix/tabix/environment.yml b/modules/nf-core/tabix/tabix/environment.yml new file mode 100644 index 0000000000..7167fb87d6 --- /dev/null +++ b/modules/nf-core/tabix/tabix/environment.yml @@ -0,0 +1,7 @@ +name: tabix_tabix +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::tabix=1.11 diff --git a/modules/nf-core/tabix/tabix/main.nf b/modules/nf-core/tabix/tabix/main.nf new file mode 100644 index 0000000000..c304a8a34b --- /dev/null +++ b/modules/nf-core/tabix/tabix/main.nf @@ -0,0 +1,42 @@ +process TABIX_TABIX { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/tabix:1.11--hdfd78af_0' : + 'biocontainers/tabix:1.11--hdfd78af_0' }" + + input: + tuple val(meta), path(tab) + + output: + tuple val(meta), path("*.tbi"), optional:true, emit: tbi + tuple val(meta), path("*.csi"), optional:true, emit: csi + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + tabix $args $tab + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + tabix: \$(echo \$(tabix -h 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${tab}.tbi + cat <<-END_VERSIONS > versions.yml + + "${task.process}": + tabix: \$(echo \$(tabix -h 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/tabix/tabix/meta.yml b/modules/nf-core/tabix/tabix/meta.yml new file mode 100644 index 0000000000..ae5b4f439f --- /dev/null +++ b/modules/nf-core/tabix/tabix/meta.yml @@ -0,0 +1,49 @@ +name: tabix_tabix +description: create tabix index from a sorted bgzip tab-delimited genome file +keywords: + - index + - tabix + - vcf +tools: + - tabix: + description: Generic indexer for TAB-delimited genome position files. + homepage: https://www.htslib.org/doc/tabix.html + documentation: https://www.htslib.org/doc/tabix.1.html + doi: 10.1093/bioinformatics/btq671 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - tab: + type: file + description: TAB-delimited genome position file compressed with bgzip + pattern: "*.{bed.gz,gff.gz,sam.gz,vcf.gz}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - tbi: + type: file + description: tabix index file + pattern: "*.{tbi}" + - csi: + type: file + description: coordinate sorted index file + pattern: "*.{csi}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" + - "@maxulysse" +maintainers: + - "@joseespinosa" + - "@drpatelh" + - "@maxulysse" diff --git a/modules/nf-core/tiddit/sv/environment.yml b/modules/nf-core/tiddit/sv/environment.yml new file mode 100644 index 0000000000..d0367f1717 --- /dev/null +++ b/modules/nf-core/tiddit/sv/environment.yml @@ -0,0 +1,7 @@ +name: tiddit_sv +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::tiddit=3.6.1 diff --git a/modules/nf-core/tiddit/sv/main.nf b/modules/nf-core/tiddit/sv/main.nf new file mode 100644 index 0000000000..0f4bc7cb52 --- /dev/null +++ b/modules/nf-core/tiddit/sv/main.nf @@ -0,0 +1,55 @@ +process TIDDIT_SV { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/tiddit:3.6.1--py38h24c8ff8_0' : + 'biocontainers/tiddit:3.6.1--py38h24c8ff8_0' }" + + input: + tuple val(meta), path(input), path(input_index) + tuple val(meta2), path(fasta) + tuple val(meta3), path(bwa_index) + + output: + tuple val(meta), path("*.vcf") , emit: vcf + tuple val(meta), path("*.ploidies.tab"), emit: ploidy + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def bwa_command = bwa_index ? "[[ -d $bwa_index ]] && for i in $bwa_index/*; do [[ -f $fasta && ! \"\$i\" =~ .*\"$fasta.\".* ]] && ln -s \$i ${fasta}.\${i##*.} || ln -s \$i .; done" : "" + + """ + $bwa_command + + tiddit \\ + --sv \\ + $args \\ + --bam $input \\ + --ref $fasta \\ + -o $prefix + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + tiddit: \$(echo \$(tiddit 2>&1) | sed 's/^.*tiddit-//; s/ .*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.vcf + touch ${prefix}.ploidies.tab + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + tiddit: \$(echo \$(tiddit 2>&1) | sed 's/^.*tiddit-//; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/tiddit/sv/meta.yml b/modules/nf-core/tiddit/sv/meta.yml new file mode 100644 index 0000000000..b13ae5cdcb --- /dev/null +++ b/modules/nf-core/tiddit/sv/meta.yml @@ -0,0 +1,57 @@ +name: tiddit_sv +description: Identify chromosomal rearrangements. +keywords: + - structural + - variants + - vcf +tools: + - sv: + description: Search for structural variants. + homepage: https://github.com/SciLifeLab/TIDDIT + documentation: https://github.com/SciLifeLab/TIDDIT/blob/master/README.md + doi: 10.12688/f1000research.11168.1 + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM file + pattern: "*.{bam,cram}" + - index: + type: file + description: BAM/CRAM index file + pattern: "*.{bai,crai}" + - fasta: + type: file + description: Input FASTA file + pattern: "*.{fasta,fa}" + - bwa_index: + type: file + description: BWA genome index files + pattern: "Directory containing BWA index *.{amb,ann,bwt,pac,sa}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - vcf: + type: file + description: vcf + pattern: "*.{vcf}" + - ploidy: + type: file + description: tab + pattern: "*.{ploidies.tab}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@maxulysse" +maintainers: + - "@maxulysse" diff --git a/modules/nf-core/untar/environment.yml b/modules/nf-core/untar/environment.yml new file mode 100644 index 0000000000..d6917da326 --- /dev/null +++ b/modules/nf-core/untar/environment.yml @@ -0,0 +1,9 @@ +name: untar +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - conda-forge::sed=4.7 + - conda-forge::grep=3.11 + - conda-forge::tar=1.34 diff --git a/modules/nf-core/untar/main.nf b/modules/nf-core/untar/main.nf new file mode 100644 index 0000000000..8a75bb957d --- /dev/null +++ b/modules/nf-core/untar/main.nf @@ -0,0 +1,63 @@ +process UNTAR { + tag "$archive" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + tuple val(meta), path(archive) + + output: + tuple val(meta), path("$prefix"), emit: untar + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + prefix = task.ext.prefix ?: ( meta.id ? "${meta.id}" : archive.baseName.toString().replaceFirst(/\.tar$/, "")) + + """ + mkdir $prefix + + ## Ensures --strip-components only applied when top level of tar contents is a directory + ## If just files or multiple directories, place all in prefix + if [[ \$(tar -taf ${archive} | grep -o -P "^.*?\\/" | uniq | wc -l) -eq 1 ]]; then + tar \\ + -C $prefix --strip-components 1 \\ + -xavf \\ + $args \\ + $archive \\ + $args2 + else + tar \\ + -C $prefix \\ + -xavf \\ + $args \\ + $archive \\ + $args2 + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + untar: \$(echo \$(tar --version 2>&1) | sed 's/^.*(GNU tar) //; s/ Copyright.*\$//') + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: ( meta.id ? "${meta.id}" : archive.toString().replaceFirst(/\.[^\.]+(.gz)?$/, "")) + """ + mkdir $prefix + touch ${prefix}/file.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + untar: \$(echo \$(tar --version 2>&1) | sed 's/^.*(GNU tar) //; s/ Copyright.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/untar/meta.yml b/modules/nf-core/untar/meta.yml new file mode 100644 index 0000000000..a9a2110f55 --- /dev/null +++ b/modules/nf-core/untar/meta.yml @@ -0,0 +1,46 @@ +name: untar +description: Extract files. +keywords: + - untar + - uncompress + - extract +tools: + - untar: + description: | + Extract tar.gz files. 
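+        In practice (a rough paraphrase of the main.nf above, with <prefix> and
+        <archive> standing in for the actual task values) this is equivalent to
+        `tar -C <prefix> --strip-components 1 -xavf <archive>` when the tarball has a
+        single top-level directory, and `tar -C <prefix> -xavf <archive>` otherwise.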
+ documentation: https://www.gnu.org/software/tar/manual/ + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - archive: + type: file + description: File to be untar + pattern: "*.{tar}.{gz}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - untar: + type: directory + description: Directory containing contents of archive + pattern: "*/" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" + - "@matthdsm" + - "@jfy133" +maintainers: + - "@joseespinosa" + - "@drpatelh" + - "@matthdsm" + - "@jfy133" diff --git a/modules/nf-core/untar/tests/main.nf.test b/modules/nf-core/untar/tests/main.nf.test new file mode 100644 index 0000000000..d40db13d82 --- /dev/null +++ b/modules/nf-core/untar/tests/main.nf.test @@ -0,0 +1,77 @@ +nextflow_process { + + name "Test Process UNTAR" + script "../main.nf" + process "UNTAR" + + tag "modules" + tag "modules_nfcore" + tag "untar" + + test("test_untar") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ [], file(params.test_data['sarscov2']['genome']['kraken2_tar_gz'], checkIfExists: true) ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out.untar).match("test_untar") }, + ) + } + + } + + test("test_untar_different_output_path") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ [], file(params.test_data['homo_sapiens']['illumina']['test_flowcell'], checkIfExists: true) ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out.untar).match("test_untar_different_output_path") }, + ) + } + + } + + test("test_untar_onlyfiles") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ [], file(params.test_data['generic']['tar']['tar_gz'], checkIfExists: true) ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out.untar).match("test_untar_onlyfiles") }, + ) + } + + } + +} diff --git a/modules/nf-core/untar/tests/main.nf.test.snap b/modules/nf-core/untar/tests/main.nf.test.snap new file mode 100644 index 0000000000..146c8678a7 --- /dev/null +++ b/modules/nf-core/untar/tests/main.nf.test.snap @@ -0,0 +1,513 @@ +{ + "test_untar_different_output_path": { + "content": [ + [ + [ + [ + + ], + [ + [ + [ + [ + [ + [ + "s_1_1101.bcl:md5,ad01889e2ff43e2f194224e20bdb600c", + "s_1_1101.stats:md5,4bbbf103454b37fbc3138fadf1b4446b" + ], + [ + "s_1_1101.bcl:md5,565384bbe67a694dfd690bae6d1d30c2", + "s_1_1101.stats:md5,55e5abd8f129ff38ef169873547abdb8" + ], + [ + "s_1_1101.bcl:md5,650fa58a630a9148835ba79e323d4237", + "s_1_1101.stats:md5,77403669ca1b05340c390dff64425c1e" + ], + [ + "s_1_1101.bcl:md5,54471c9e97299cd141e202e204637702", + "s_1_1101.stats:md5,67b14c9a89b7f8556674a7524d5cfb2d" + ], + [ + "s_1_1101.bcl:md5,74e4f929fc7476c380fd9d741ddb6700", + "s_1_1101.stats:md5,5730a4c35463eaa12a06b6758710b98c" + ], + [ + "s_1_1101.bcl:md5,c785f472f4350c120c02c888c8189590", + "s_1_1101.stats:md5,fee4ec63895ea81007e06ee6a36ba5e0" + ], + [ + "s_1_1101.bcl:md5,b7ea50bb25f08d43c301741d77050a9b", + "s_1_1101.stats:md5,fa7c68f3122c74d14364e6f7b011af70" + ], + [ + "s_1_1101.bcl:md5,9d5087dc4bcae39d66486363d4f68ecf", + "s_1_1101.stats:md5,23cdceee4d82c4b8e7c60018b9276ace" + ], + [ 
+ "s_1_1101.bcl:md5,581e0c5ee94e8f2de14b2b1d8e777530", + "s_1_1101.stats:md5,9a3536d573c97f66bb56b49463612607" + ], + [ + "s_1_1101.bcl:md5,296fc026bb34c67bbe2b44845fe0d1de", + "s_1_1101.stats:md5,a7f57a7770fb9c5ae2a0fb1ef403ec4f" + ], + [ + "s_1_1101.bcl:md5,2a3ca15531556c36d10d132a9e051de8", + "s_1_1101.stats:md5,2d0bcdb0a1b51d3d79e415db2ab2d3b1" + ], + [ + "s_1_1101.bcl:md5,1150d46a2ccd4ac58aee0585d3e4ffd7", + "s_1_1101.stats:md5,2e97550bd5b5864ffd0565bb7a3f6d40" + ], + [ + "s_1_1101.bcl:md5,0b85c4b3da0de95e7b862d849c5333ae", + "s_1_1101.stats:md5,6eab9746fbeb783b0cd70398f44e0c1a" + ], + [ + "s_1_1101.bcl:md5,e0e9c91f4698804d7a6d1058ef68b34f", + "s_1_1101.stats:md5,790022cdc7878a02b2ebd166e1ddf0a7" + ], + [ + "s_1_1101.bcl:md5,38cd0ad4de359e651c8ac0d5777ea625", + "s_1_1101.stats:md5,a1b1d5ea5371d326abb029774483c5e6" + ], + [ + "s_1_1101.bcl:md5,b0ddc05c4012ccba24e712a1cfec748f", + "s_1_1101.stats:md5,af3d232f839d720f76f40ba06caa2987" + ], + [ + "s_1_1101.bcl:md5,af32fcc5dc3b836cf7a5ba3db85a75dd", + "s_1_1101.stats:md5,f93f2c09bd4e486c74a5f6e2040f7296" + ], + [ + "s_1_1101.bcl:md5,54b7428e037ca87816107647d4a3d9db", + "s_1_1101.stats:md5,e5ac77a72cd7bed5e9bf03cccda0e48c" + ], + [ + "s_1_1101.bcl:md5,fc8b4eacd493bf3d0b20bc23998dc7ff", + "s_1_1101.stats:md5,190315e159e2f4bc4c057ded7470dc52" + ], + [ + "s_1_1101.bcl:md5,9484ecffda489927fce424ac6a44fa9d", + "s_1_1101.stats:md5,0825feeb457ecc9efcf6f8526ba32311" + ], + [ + "s_1_1101.bcl:md5,eec59e21036e31c95ce1e847bfb0a9c4", + "s_1_1101.stats:md5,9acc13f63c98e5a8445e7be70d49222b" + ], + [ + "s_1_1101.bcl:md5,a9fb24476f87cba4fba68e2b3c3f2c07", + "s_1_1101.stats:md5,dc0aa7db9790733291c3e6480ca2a0fc" + ], + [ + "s_1_1101.bcl:md5,ed950b3e82c500927c2e236c9df005c6", + "s_1_1101.stats:md5,dccb71ec47d1f9d33a192da6d5660a45" + ], + [ + "s_1_1101.bcl:md5,b3e992025e995ca56b5ea2820144ef47", + "s_1_1101.stats:md5,a6a829bf2cffb26ac5d9dc3012057699" + ], + [ + "s_1_1101.bcl:md5,89edc726a5a4e0b4ff8ca3899ed0232b", + "s_1_1101.stats:md5,5b9b4fd8110577a59b82d0c419519d29" + ], + [ + "s_1_1101.bcl:md5,4dc696149169f232c451225f563cb5cd", + "s_1_1101.stats:md5,d3514a71ea3adc60e2943c6b8f6e2598" + ], + [ + "s_1_1101.bcl:md5,35b992d0318afb7c825ceaa31b0755e6", + "s_1_1101.stats:md5,2826093acc175c16c3795de7c4ca8f07" + ], + [ + "s_1_1101.bcl:md5,7bc927f56a362e49c00b5d76ee048901", + "s_1_1101.stats:md5,e47d862b795fd6b88a31d7d482ab22f6" + ], + [ + "s_1_1101.bcl:md5,84742233ff2a651626fe9036f27f7cb2", + "s_1_1101.stats:md5,b78fad11d3c50bc76b722cdc03e3028b" + ], + [ + "s_1_1101.bcl:md5,3935341c86263a7938e8c49620ef39f8", + "s_1_1101.stats:md5,cc6585b2daac5354073d150874da9704" + ], + [ + "s_1_1101.bcl:md5,3627f4fd548bf6e64aaf08fba3a342be", + "s_1_1101.stats:md5,120ae4831ae004ff7d16728aef36e82f" + ], + [ + "s_1_1101.bcl:md5,07631014bc35124149fabd80ef19f933", + "s_1_1101.stats:md5,eadd63d91f47cc6db6b6f0a967a23927" + ], + [ + "s_1_1101.bcl:md5,a1149c80415dc2f34d768eeb397c43fb", + "s_1_1101.stats:md5,ca89a9def67611a9151c6ce685b7cce1" + ], + [ + "s_1_1101.bcl:md5,eb5f71d4741d2f40618756bc72eaf8b4", + "s_1_1101.stats:md5,90f48501e735e5915b843478e23d1ae2" + ], + [ + "s_1_1101.bcl:md5,9bf270fe3f6add1a591ebc24fff10078", + "s_1_1101.stats:md5,a4e429671d4098034293c638aa655e16" + ], + [ + "s_1_1101.bcl:md5,219bedcbd24bae54fe4cf05dae05282c", + "s_1_1101.stats:md5,dd97525b65b68207137d51fcf19132c7" + ], + [ + "s_1_1101.bcl:md5,5163bc00a68fd57ae50cae0b76350892", + "s_1_1101.stats:md5,b606a5368eff1f012f3ea5d11ccdf2e0" + ], + [ + "s_1_1101.bcl:md5,fc429195a5af59a59e0cc4c48e6c05ea", + 
"s_1_1101.stats:md5,d809aa19698053f90d639da4dcad8008" + ], + [ + "s_1_1101.bcl:md5,383340219a1dd77076a092a64a71a7e4", + "s_1_1101.stats:md5,b204a5cf256378679ffc906c15cc1bae" + ], + [ + "s_1_1101.bcl:md5,0c369540d3e24696cf1f9c55bab69315", + "s_1_1101.stats:md5,a2bc69a4031a22ce9621dcc623a0bf4b" + ], + [ + "s_1_1101.bcl:md5,3127abc8016ba8eb954f8f8015dff387", + "s_1_1101.stats:md5,5deafff31150b7bf757f814e49a53bc2" + ], + [ + "s_1_1101.bcl:md5,045f40c82de676bafec3d59f91376a7a", + "s_1_1101.stats:md5,890700edc20687c090ef52248c7884b1" + ], + [ + "s_1_1101.bcl:md5,78af269aa2b39a1d765703f0a4739a86", + "s_1_1101.stats:md5,303cf457aa1543a8208544f694cbc531" + ], + [ + "s_1_1101.bcl:md5,0ab8c781959b783b62888e9274364a46", + "s_1_1101.stats:md5,2605b0e8322f83aa4d0dae5da4ec7a7a" + ], + [ + "s_1_1101.bcl:md5,d0cf823ffe352e8b3f75d589544ab617", + "s_1_1101.stats:md5,efa3c0e01e3db71e12fd961cb2d03739" + ], + [ + "s_1_1101.bcl:md5,db4ca4ab7a01e03c246f9160c3758d82", + "s_1_1101.stats:md5,f61550d9e4a90df6b860e68f41f82f60" + ], + [ + "s_1_1101.bcl:md5,1af39a2c7e5ff20ece91cb8160b51d17", + "s_1_1101.stats:md5,d0e20879afcaf6dfcd88c73f1c5c78cf" + ], + [ + "s_1_1101.bcl:md5,4cf7123bb0fffcd79266df03aef01665", + "s_1_1101.stats:md5,29bff4075109a121b087116b58d7e927" + ], + [ + "s_1_1101.bcl:md5,aa9980428cb60cd6320f4b48f4dd0d74", + "s_1_1101.stats:md5,6b0e20bde93133117a8d1a6df3d6f37b" + ], + [ + "s_1_1101.bcl:md5,0f6e440374e15b9b491d52fb83a8adfe", + "s_1_1101.stats:md5,55cb5eb0ecdabd23dca39ab8c4607598" + ], + [ + "s_1_1101.bcl:md5,2c645d7bdaddaa403f6e304d36df9e4b", + "s_1_1101.stats:md5,53acf33d21f832779b400c2447386ce4" + ], + [ + "s_1_1101.bcl:md5,3bbf0863b423b770c879203644420206", + "s_1_1101.stats:md5,579bdc7293cac8c3d7407249cacf4c25" + ], + [ + "s_1_1101.bcl:md5,6658a08409e81d29cfeb2d096b491985", + "s_1_1101.stats:md5,bb559ffbea46d612f9933cefa84c4c03" + ], + [ + "s_1_1101.bcl:md5,1700d9a13d3d4f7643af2943ef838acb", + "s_1_1101.stats:md5,f01cb6050ebfb15da1e0399ebd791eb4" + ], + [ + "s_1_1101.bcl:md5,1ac7aa9ffae25eb103f755f33e4a39c6", + "s_1_1101.stats:md5,0b9d45d7929ccf336d5e5b95373ed3c2" + ], + [ + "s_1_1101.bcl:md5,812a97af2e983a53226e18c75190b06c", + "s_1_1101.stats:md5,d2410c7b0e506dab2972e77e2398de1e" + ], + [ + "s_1_1101.bcl:md5,c981e8e4dcc434956c2b86159da268bc", + "s_1_1101.stats:md5,e9c826e85361ce673f1f248786c9a611" + ], + [ + "s_1_1101.bcl:md5,88e09e99a0a4ef3357b203a41b22f77c", + "s_1_1101.stats:md5,ef06f2e5ad667bbd383f9ed6a05b7b42" + ], + [ + "s_1_1101.bcl:md5,461c8b146fc8a7938be38689978ecd09", + "s_1_1101.stats:md5,65115693935da66f9791b27136e22fb0" + ], + [ + "s_1_1101.bcl:md5,c7b827df5ce20e0f21916fe60860ca3f", + "s_1_1101.stats:md5,87be73613aeb507847f94d3cac5bb30a" + ], + [ + "s_1_1101.bcl:md5,7c4cc3dc9c8a1b0f15917b282dfb40ce", + "s_1_1101.stats:md5,bdd9181fa89debbfafe7b6ea3e064065" + ], + [ + "s_1_1101.bcl:md5,19f4debaf91e118aca8934517179ac33", + "s_1_1101.stats:md5,1143082719e136241d21b14a6b19b8a2" + ], + [ + "s_1_1101.bcl:md5,38aa256ad2d697d84b0b2c0e876a3eba", + "s_1_1101.stats:md5,64dd82f03df23f7f437eede2671ed4fe" + ], + [ + "s_1_1101.bcl:md5,b7929970378949571fed922c1b8cab32", + "s_1_1101.stats:md5,3d6d7985a41629fe196e4342d7fe36aa" + ], + [ + "s_1_1101.bcl:md5,fb2ed0bf6e89d79624ee78754e773491", + "s_1_1101.stats:md5,f34940810ff255aee79953496a12716d" + ], + [ + "s_1_1101.bcl:md5,4f8a8311f5f9c3a7629c1a973a7b280e", + "s_1_1101.stats:md5,4fd7cd28c09f4e152e7c2ad1ab541cd2" + ], + [ + "s_1_1101.bcl:md5,9eb46c903d0344e25af51f88cc311d60", + "s_1_1101.stats:md5,df3abd5f620d9e7f99496098d9fd3f7f" + ], + [ + 
"s_1_1101.bcl:md5,3ecbc17f3660e2014b58d7fe70ae62d5", + "s_1_1101.stats:md5,8e89a13c85a6d6ab3ccd251b66d1f165" + ], + [ + "s_1_1101.bcl:md5,5d59cc2499a77791233a64f73fe82894", + "s_1_1101.stats:md5,32ec99cd400f4b80cb26e2fa8e07ece0" + ], + [ + "s_1_1101.bcl:md5,1c052da47b9ae8554388f0fa3aade482", + "s_1_1101.stats:md5,d23f438772673688aa7bc92421dc6dce" + ], + [ + "s_1_1101.bcl:md5,1a52bd4f23130c0c96bc967ccd448a2b", + "s_1_1101.stats:md5,9b597e3388d59ef1f61aba30ac90ea79" + ], + [ + "s_1_1101.bcl:md5,8a1e84b79cf3f80794c20e3a0cc84688", + "s_1_1101.stats:md5,9561f7b6ef4b1849afc72b2bb49792bd" + ], + [ + "s_1_1101.bcl:md5,75c00111051f3fa95d04286823cb9109", + "s_1_1101.stats:md5,1fe786cdf8181767deafbd60b3c76610" + ], + [ + "s_1_1101.bcl:md5,529255d8deee0873ed5565e6d1a2ebda", + "s_1_1101.stats:md5,3fa7f467e97a75880f32d17b7429d316" + ], + [ + "s_1_1101.bcl:md5,ea4d960e3d9355d2149da71b88a21df4", + "s_1_1101.stats:md5,2540fe65586e8e800c1ddd8cddd1e8cd" + ], + [ + "s_1_1101.bcl:md5,0dfe1fd92a2dce2f23119aa483429744", + "s_1_1101.stats:md5,78257b2169fb9f0cf40966e06e847e86" + ], + [ + "s_1_1101.bcl:md5,f692ddc9aa3ab849271d07c666d0b3b9", + "s_1_1101.stats:md5,aa2ec6a3e3a9c116e34fe74a21e6459e" + ], + [ + "s_1_1101.bcl:md5,29cc4c239eae7c871c9a1adf92ebdb98", + "s_1_1101.stats:md5,263184813090acd740a5bf25304aed3a" + ], + [ + "s_1_1101.bcl:md5,e005af6a84925e326afbfe264241f047", + "s_1_1101.stats:md5,b6fb20868eebaffcc19daa694a449795" + ], + [ + "s_1_1101.bcl:md5,02f1a699b1ba9967accccf99a7af3d24", + "s_1_1101.stats:md5,4f007efacecaf26dc0e0231aede28754" + ], + [ + "s_1_1101.bcl:md5,df308c72a2dcc655cd95e98f5457187a", + "s_1_1101.stats:md5,130c4b07f4c14030bab012824cbe34da" + ], + [ + "s_1_1101.bcl:md5,f3ce10d8d2406b72355023bfa8c96822", + "s_1_1101.stats:md5,2638f4db393ed5b699ec2ce59ff0ec19" + ], + [ + "s_1_1101.bcl:md5,cc2f6d675ad1593ff96f734b172d249e", + "s_1_1101.stats:md5,f5b13f1e1ababc9e1a7a73b0b993cbf1" + ], + [ + "s_1_1101.bcl:md5,7938a0b21448305a951b023b1845b3a7", + "s_1_1101.stats:md5,fcd57511adabfc3ba1ac045165330006" + ], + [ + "s_1_1101.bcl:md5,44879bc6a38df1fee8def61868115041", + "s_1_1101.stats:md5,517e20e4b58a8023a37f9af62e0e2036" + ], + [ + "s_1_1101.bcl:md5,8749611e62406a7d2f34c610a55e56af", + "s_1_1101.stats:md5,8ccf24b3676ef84f2e513be8f2a9f3d1" + ], + [ + "s_1_1101.bcl:md5,a9846a037611cda3721958088f714c0e", + "s_1_1101.stats:md5,6438fa5a1892f328cab1605a95d80a3b" + ], + [ + "s_1_1101.bcl:md5,d6c4a2a726496476eb826532f974ed5f", + "s_1_1101.stats:md5,8c2c65b5e8b00dbf61ada65252aeb266" + ], + [ + "s_1_1101.bcl:md5,be3dde6cae7dd85855a6bf295ebfacfe", + "s_1_1101.stats:md5,93bc13f3b0749b2b8d8bcb0b1199f4f0" + ], + [ + "s_1_1101.bcl:md5,7c64514735a6cf1565b60647edd17d20", + "s_1_1101.stats:md5,4a0aa6c49b24f876415e5878cef7f805" + ], + [ + "s_1_1101.bcl:md5,3983b4043bc9df4b505202a5134ccf03", + "s_1_1101.stats:md5,1c9d9a8558adc1279ca27c96bc1b9758" + ], + [ + "s_1_1101.bcl:md5,a0b8d77f116ec95975f9253dcb768136", + "s_1_1101.stats:md5,c3992b786756e7ec42f65ef4b13b50d4" + ], + [ + "s_1_1101.bcl:md5,43c95ba35d06bb7c57fbd16f3d1cfd6c", + "s_1_1101.stats:md5,3cb69d04698c39f97f962e5bf1eea7f0" + ], + [ + "s_1_1101.bcl:md5,3dbeea0cad7052f19f53ff6f19dd4d90", + "s_1_1101.stats:md5,58bbc8254f0f5f4a244531e8e9c12a04" + ], + [ + "s_1_1101.bcl:md5,da56d088996376c898d855b6cd0a7dfc", + "s_1_1101.stats:md5,9f2d78af6908ce1576b89cdc059844ff" + ], + [ + "s_1_1101.bcl:md5,7b641a5565f095e9a6ffcad9e4305033", + "s_1_1101.stats:md5,3ada06c59b4fb41b83ab6abd0979e9fc" + ], + [ + "s_1_1101.bcl:md5,a3843d397a01d51657825bb652c191e5", + 
"s_1_1101.stats:md5,19341e52a4bfc7d9d48e9d2acc68c519" + ], + [ + "s_1_1101.bcl:md5,048e3ebfc8efeb8012def6b741c9060d", + "s_1_1101.stats:md5,88bd38deca1e87d700effab1fd099565" + ], + [ + "s_1_1101.bcl:md5,b340db0e07e829dd5da22371916a1a9e", + "s_1_1101.stats:md5,e44cfaddcc4ffb968e5b1a2f41ac48a5" + ], + [ + "s_1_1101.bcl:md5,e6011ec6eabbc2b8792deb283c621ce0", + "s_1_1101.stats:md5,090875dcd1a431af24bc631333f089c4" + ], + [ + "s_1_1101.bcl:md5,a08f216e3352345031ed100ec4245082", + "s_1_1101.stats:md5,97b949ef4b96219e1369f673cf5f8a6c" + ], + [ + "s_1_1101.bcl:md5,b43337c76fb037dfcf5f8f7bcb3618e5", + "s_1_1101.stats:md5,ddef585805e79951f69d23ab7354f69b" + ], + [ + "s_1_1101.bcl:md5,8c61fd004104397b360855e058bbf1bf", + "s_1_1101.stats:md5,0f8d253816d594dcfea3ccf48c826401" + ], + [ + "s_1_1101.bcl:md5,594d06310d328b188aa0b3edfff22cb2", + "s_1_1101.stats:md5,3160bf271b39aeb7590e4fd2984710ba" + ], + [ + "s_1_1101.bcl:md5,4c9eada67c9d55437211d83e111961d5", + "s_1_1101.stats:md5,2901b46ab16ec4863d30e4c84ec29c97" + ], + [ + "s_1_1101.bcl:md5,e03971ae5282f0accc0c1b7374d9ef1b", + "s_1_1101.stats:md5,60d2a19ce59bf70a21a28555484cead8" + ], + [ + "s_1_1101.bcl:md5,e1c6f7a06e63d149895d3e48e63df155", + "s_1_1101.stats:md5,44beb10af847ea3dddaf06dda7031126" + ], + [ + "s_1_1101.bcl:md5,960a99bf29a8f9d936e9b8582d46c9c6", + "s_1_1101.stats:md5,544cd1a7aaaa841914b40ece43399334" + ], + [ + "s_1_1101.bcl:md5,5706679f349fd4a6b6313bc2c41c7a42", + "s_1_1101.stats:md5,627eea844b26dae033848c2f9f69177b" + ], + [ + "s_1_1101.bcl:md5,21da5abc4b0402bbac14b5ab998b0b4f", + "s_1_1101.stats:md5,515bd140b095ad90473ca7a9a69877ab" + ], + "s_1_1101.control:md5,08a72e2198ae95150718e8adf011d105", + "s_1_1101.filter:md5,3a72bc73b323c8cb0ac5bfeb62d98989" + ] + ], + [ + "s_1_1101.locs:md5,0827ea802e5257cc5b20e757a33d4c98" + ], + "RTAConfiguration.xml:md5,c7d6e257bc374f142dc64b9d2281d4c9", + "config.xml:md5,9a4cc7ec01fefa2f1ce9bcb45bbad6e9" + ] + ], + [ + "ControlMetricsOut.bin:md5,6d77b38d0793a6e1ce1e85706e488953", + "CorrectedIntMetricsOut.bin:md5,2bbf84d3be72734addaa2fe794711434", + "ErrorMetricsOut.bin:md5,38c88def138e9bb832539911affdb286", + "ExtractionMetricsOut.bin:md5,7497c3178837eea8f09350b5cd252e99", + "IndexMetricsOut.bin:md5,d41d8cd98f00b204e9800998ecf8427e", + "QMetricsOut.bin:md5,7e9f198d53ebdfbb699a5f94cf1ed51c", + "TileMetricsOut.bin:md5,83891751ec1c91a425a524b476b6ca3c" + ], + "RunInfo.xml:md5,03038959f4dd181c86bc97ae71fe270a" + ] + ] + ] + ], + "timestamp": "2023-10-18T11:56:39.562418" + }, + "test_untar_onlyfiles": { + "content": [ + [ + [ + [ + + ], + [ + "hello.txt:md5,e59ff97941044f85df5297e1c302d260" + ] + ] + ] + ], + "timestamp": "2023-10-18T11:56:46.878844" + }, + "test_untar": { + "content": [ + [ + [ + [ + + ], + [ + "hash.k2d:md5,8b8598468f54a7087c203ad0190555d9", + "opts.k2d:md5,a033d00cf6759407010b21700938f543", + "taxo.k2d:md5,094d5891cdccf2f1468088855c214b2c" + ] + ] + ] + ], + "timestamp": "2023-10-18T11:56:08.16574" + } +} \ No newline at end of file diff --git a/modules/nf-core/untar/tests/tags.yml b/modules/nf-core/untar/tests/tags.yml new file mode 100644 index 0000000000..feb6f15c0c --- /dev/null +++ b/modules/nf-core/untar/tests/tags.yml @@ -0,0 +1,2 @@ +untar: + - modules/nf-core/untar/** diff --git a/modules/nf-core/unzip/environment.yml b/modules/nf-core/unzip/environment.yml new file mode 100644 index 0000000000..d3a535f170 --- /dev/null +++ b/modules/nf-core/unzip/environment.yml @@ -0,0 +1,7 @@ +name: unzip +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - 
conda-forge::p7zip=16.02 diff --git a/modules/nf-core/unzip/main.nf b/modules/nf-core/unzip/main.nf new file mode 100644 index 0000000000..08cfc3c406 --- /dev/null +++ b/modules/nf-core/unzip/main.nf @@ -0,0 +1,37 @@ +process UNZIP { + tag "$archive" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/p7zip:16.02' : + 'biocontainers/p7zip:16.02' }" + + input: + tuple val(meta), path(archive) + + output: + tuple val(meta), path("${prefix}/"), emit: unzipped_archive + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + if ( archive instanceof List && archive.name.size > 1 ) { error "[UNZIP] error: 7za only accepts a single archive as input. Please check module input." } + + prefix = task.ext.prefix ?: ( meta.id ? "${meta.id}" : archive.baseName) + """ + 7za \\ + x \\ + -o"${prefix}"/ \\ + $args \\ + $archive + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + 7za: \$(echo \$(7za --help) | sed 's/.*p7zip Version //; s/(.*//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/unzip/meta.yml b/modules/nf-core/unzip/meta.yml new file mode 100644 index 0000000000..e8e377e2af --- /dev/null +++ b/modules/nf-core/unzip/meta.yml @@ -0,0 +1,42 @@ +name: unzip +description: Unzip ZIP archive files +keywords: + - unzip + - decompression + - zip + - archiving +tools: + - unzip: + description: p7zip is a quick port of 7z.exe and 7za.exe (command line version of 7zip, see www.7-zip.org) for Unix. + homepage: https://sourceforge.net/projects/p7zip/ + documentation: https://sourceforge.net/projects/p7zip/ + tool_dev_url: https://sourceforge.net/projects/p7zip" + licence: ["LGPL-2.1-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - archive: + type: file + description: ZIP file + pattern: "*.zip" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - unzipped_archive: + type: directory + description: Directory contents of the unzipped archive + pattern: "${archive.baseName}/" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@jfy133" +maintainers: + - "@jfy133" diff --git a/modules/nf-core/vcftools/environment.yml b/modules/nf-core/vcftools/environment.yml new file mode 100644 index 0000000000..503449e833 --- /dev/null +++ b/modules/nf-core/vcftools/environment.yml @@ -0,0 +1,7 @@ +name: vcftools +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::vcftools=0.1.16 diff --git a/modules/nf-core/vcftools/main.nf b/modules/nf-core/vcftools/main.nf new file mode 100644 index 0000000000..0153a60891 --- /dev/null +++ b/modules/nf-core/vcftools/main.nf @@ -0,0 +1,123 @@ +process VCFTOOLS { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/vcftools:0.1.16--he513fc3_4' : + 'biocontainers/vcftools:0.1.16--he513fc3_4' }" + + input: + // Owing to the nature of vcftools we here provide solutions to working with optional bed files and optional + // alternative variant files, for use with the 'diff' suite of tools. 
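+    // As a rough sketch of the wiring (hypothetical configuration; '--bed' and '--freq'
+    // are real vcftools flags): supplying a bed file on this input and setting, say,
+    //     ext.args = '--bed --freq'
+    // in the pipeline config makes the script block below rewrite the bare '--bed'
+    // into '--bed <the staged bed file>' before vcftools is invoked.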
+    // Other optional input files can be utilised in a similar way to the inputs below, but we do not exhaustively iterate through all
+    // possible options; instead we leave that to the user.
+    tuple val(meta), path(variant_file)
+    path bed
+    path diff_variant_file
+
+    output:
+    tuple val(meta), path("*.vcf")                    , optional:true, emit: vcf
+    tuple val(meta), path("*.bcf")                    , optional:true, emit: bcf
+    tuple val(meta), path("*.frq")                    , optional:true, emit: frq
+    tuple val(meta), path("*.frq.count")              , optional:true, emit: frq_count
+    tuple val(meta), path("*.idepth")                 , optional:true, emit: idepth
+    tuple val(meta), path("*.ldepth")                 , optional:true, emit: ldepth
+    tuple val(meta), path("*.ldepth.mean")            , optional:true, emit: ldepth_mean
+    tuple val(meta), path("*.gdepth")                 , optional:true, emit: gdepth
+    tuple val(meta), path("*.hap.ld")                 , optional:true, emit: hap_ld
+    tuple val(meta), path("*.geno.ld")                , optional:true, emit: geno_ld
+    tuple val(meta), path("*.geno.chisq")             , optional:true, emit: geno_chisq
+    tuple val(meta), path("*.list.hap.ld")            , optional:true, emit: list_hap_ld
+    tuple val(meta), path("*.list.geno.ld")           , optional:true, emit: list_geno_ld
+    tuple val(meta), path("*.interchrom.hap.ld")      , optional:true, emit: interchrom_hap_ld
+    tuple val(meta), path("*.interchrom.geno.ld")     , optional:true, emit: interchrom_geno_ld
+    tuple val(meta), path("*.TsTv")                   , optional:true, emit: tstv
+    tuple val(meta), path("*.TsTv.summary")           , optional:true, emit: tstv_summary
+    tuple val(meta), path("*.TsTv.count")             , optional:true, emit: tstv_count
+    tuple val(meta), path("*.TsTv.qual")              , optional:true, emit: tstv_qual
+    tuple val(meta), path("*.FILTER.summary")         , optional:true, emit: filter_summary
+    tuple val(meta), path("*.sites.pi")               , optional:true, emit: sites_pi
+    tuple val(meta), path("*.windowed.pi")            , optional:true, emit: windowed_pi
+    tuple val(meta), path("*.weir.fst")               , optional:true, emit: weir_fst
+    tuple val(meta), path("*.het")                    , optional:true, emit: heterozygosity
+    tuple val(meta), path("*.hwe")                    , optional:true, emit: hwe
+    tuple val(meta), path("*.Tajima.D")               , optional:true, emit: tajima_d
+    tuple val(meta), path("*.ifreqburden")            , optional:true, emit: freq_burden
+    tuple val(meta), path("*.LROH")                   , optional:true, emit: lroh
+    tuple val(meta), path("*.relatedness")            , optional:true, emit: relatedness
+    tuple val(meta), path("*.relatedness2")           , optional:true, emit: relatedness2
+    tuple val(meta), path("*.lqual")                  , optional:true, emit: lqual
+    tuple val(meta), path("*.imiss")                  , optional:true, emit: missing_individual
+    tuple val(meta), path("*.lmiss")                  , optional:true, emit: missing_site
+    tuple val(meta), path("*.snpden")                 , optional:true, emit: snp_density
+    tuple val(meta), path("*.kept.sites")             , optional:true, emit: kept_sites
+    tuple val(meta), path("*.removed.sites")          , optional:true, emit: removed_sites
+    tuple val(meta), path("*.singletons")             , optional:true, emit: singeltons
+    tuple val(meta), path("*.indel.hist")             , optional:true, emit: indel_hist
+    tuple val(meta), path("*.hapcount")               , optional:true, emit: hapcount
+    tuple val(meta), path("*.mendel")                 , optional:true, emit: mendel
+    tuple val(meta), path("*.FORMAT")                 , optional:true, emit: format
+    tuple val(meta), path("*.INFO")                   , optional:true, emit: info
+    tuple val(meta), path("*.012")                    , optional:true, emit: genotypes_matrix
+    tuple val(meta), path("*.012.indv")               , optional:true, emit: genotypes_matrix_individual
+    tuple val(meta), path("*.012.pos")                , optional:true, emit: genotypes_matrix_position
+    tuple val(meta), path("*.impute.hap")             ,
optional:true, emit: impute_hap + tuple val(meta), path("*.impute.hap.legend") , optional:true, emit: impute_hap_legend + tuple val(meta), path("*.impute.hap.indv") , optional:true, emit: impute_hap_indv + tuple val(meta), path("*.ldhat.sites") , optional:true, emit: ldhat_sites + tuple val(meta), path("*.ldhat.locs") , optional:true, emit: ldhat_locs + tuple val(meta), path("*.BEAGLE.GL") , optional:true, emit: beagle_gl + tuple val(meta), path("*.BEAGLE.PL") , optional:true, emit: beagle_pl + tuple val(meta), path("*.ped") , optional:true, emit: ped + tuple val(meta), path("*.map") , optional:true, emit: map_ + tuple val(meta), path("*.tped") , optional:true, emit: tped + tuple val(meta), path("*.tfam") , optional:true, emit: tfam + tuple val(meta), path("*.diff.sites_in_files") , optional:true, emit: diff_sites_in_files + tuple val(meta), path("*.diff.indv_in_files") , optional:true, emit: diff_indv_in_files + tuple val(meta), path("*.diff.sites") , optional:true, emit: diff_sites + tuple val(meta), path("*.diff.indv") , optional:true, emit: diff_indv + tuple val(meta), path("*.diff.discordance.matrix"), optional:true, emit: diff_discd_matrix + tuple val(meta), path("*.diff.switch") , optional:true, emit: diff_switch_error + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def args_list = args.tokenize() + + def bed_arg = (args.contains('--bed')) ? "--bed ${bed}" : + (args.contains('--exclude-bed')) ? "--exclude-bed ${bed}" : + (args.contains('--hapcount')) ? "--hapcount ${bed}" : '' + args_list.removeIf { it.contains('--bed') } + args_list.removeIf { it.contains('--exclude-bed') } + args_list.removeIf { it.contains('--hapcount') } + + def diff_variant_arg = (args.contains('--diff')) ? "--diff ${diff_variant_file}" : + (args.contains('--gzdiff')) ? "--gzdiff ${diff_variant_file}" : + (args.contains('--diff-bcf')) ? "--diff-bcf ${diff_variant_file}" : '' + args_list.removeIf { it.contains('--diff') } + args_list.removeIf { it.contains('--gzdiff') } + args_list.removeIf { it.contains('--diff-bcf') } + + def input_file = ("$variant_file".endsWith(".vcf")) ? "--vcf ${variant_file}" : + ("$variant_file".endsWith(".vcf.gz")) ? "--gzvcf ${variant_file}" : + ("$variant_file".endsWith(".bcf")) ? "--bcf ${variant_file}" : '' + + """ + vcftools \\ + $input_file \\ + --out $prefix \\ + ${args_list.join(' ')} \\ + $bed_arg \\ + $diff_variant_arg + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + vcftools: \$(echo \$(vcftools --version 2>&1) | sed 's/^.*VCFtools (//;s/).*//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/vcftools/meta.yml b/modules/nf-core/vcftools/meta.yml new file mode 100644 index 0000000000..f361db4a8f --- /dev/null +++ b/modules/nf-core/vcftools/meta.yml @@ -0,0 +1,292 @@ +name: vcftools +description: A set of tools written in Perl and C++ for working with VCF files +keywords: + - VCF + - sort +tools: + - vcftools: + description: A set of tools written in Perl and C++ for working with VCF files. This package only contains the C++ libraries whereas the package perl-vcftools-vcf contains the perl libraries + homepage: http://vcftools.sourceforge.net/ + documentation: http://vcftools.sourceforge.net/man_latest.html + licence: ["LGPL"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ]
+  - variant_file:
+      type: file
+      description: variant input file, which can be in vcf, vcf.gz, or bcf format.
+  - bed:
+      type: file
+      description: bed file which can be used with different arguments in vcftools (optional)
+  - diff_variant_file:
+      type: file
+      description: secondary variant file which can be used with the 'diff' suite of tools (optional)
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - vcf:
+      type: file
+      description: vcf file (optional)
+      pattern: "*.vcf"
+  - bcf:
+      type: file
+      description: bcf file (optional)
+      pattern: "*.bcf"
+  - frq:
+      type: file
+      description: Allele frequency for each site (optional)
+      pattern: "*.frq"
+  - frq_count:
+      type: file
+      description: Allele counts for each site (optional)
+      pattern: "*.frq.count"
+  - idepth:
+      type: file
+      description: mean depth per individual (optional)
+      pattern: "*.idepth"
+  - ldepth:
+      type: file
+      description: depth per site summed across individuals (optional)
+      pattern: "*.ldepth"
+  - ldepth_mean:
+      type: file
+      description: mean depth per site calculated across individuals (optional)
+      pattern: "*.ldepth.mean"
+  - gdepth:
+      type: file
+      description: depth for each genotype in vcf file (optional)
+      pattern: "*.gdepth"
+  - hap_ld:
+      type: file
+      description: r2, D, and D’ statistics using phased haplotypes (optional)
+      pattern: "*.hap.ld"
+  - geno_ld:
+      type: file
+      description: squared correlation coefficient between genotypes encoded as 0, 1 and 2 to represent the number of non-reference alleles in each individual (optional)
+      pattern: "*.geno.ld"
+  - geno_chisq:
+      type: file
+      description: test for genotype independence via the chi-squared statistic (optional)
+      pattern: "*.geno.chisq"
+  - list_hap_ld:
+      type: file
+      description: r2 statistics of the sites contained in the provided input file versus all other sites (optional)
+      pattern: "*.list.hap.ld"
+  - list_geno_ld:
+      type: file
+      description: r2 statistics of the sites contained in the provided input file versus all other sites (optional)
+      pattern: "*.list.geno.ld"
+  - interchrom_hap_ld:
+      type: file
+      description: r2 statistics for sites (haplotypes) on different chromosomes (optional)
+      pattern: "*.interchrom.hap.ld"
+  - interchrom_geno_ld:
+      type: file
+      description: r2 statistics for sites (genotypes) on different chromosomes (optional)
+      pattern: "*.interchrom.geno.ld"
+  - tstv:
+      type: file
+      description: Transition / Transversion ratio in bins of size defined in options (optional)
+      pattern: "*.TsTv"
+  - tstv_summary:
+      type: file
+      description: Summary of all Transitions and Transversions (optional)
+      pattern: "*.TsTv.summary"
+  - tstv_count:
+      type: file
+      description: Transition / Transversion ratio as a function of alternative allele count (optional)
+      pattern: "*.TsTv.count"
+  - tstv_qual:
+      type: file
+      description: Transition / Transversion ratio as a function of SNP quality threshold (optional)
+      pattern: "*.TsTv.qual"
+  - filter_summary:
+      type: file
+      description: Summary of the number of SNPs and Ts/Tv ratio for each FILTER category (optional)
+      pattern: "*.FILTER.summary"
+  - sites_pi:
+      type: file
+      description: Nucleotide diversity on a per-site basis (optional)
+      pattern: "*.sites.pi"
+  - windowed_pi:
+      type: file
+      description: Nucleotide diversity in windows, with window size determined by options (optional)
+      pattern:
"*windowed.pi" + - weir_fst: + type: file + description: Fst estimate from Weir and Cockerham’s 1984 paper (optional) + pattern: "*.weir.fst" + - heterozygosity: + type: file + description: Heterozygosity on a per-individual basis (optional) + pattern: "*.het" + - hwe: + type: file + description: Contains the Observed numbers of Homozygotes and Heterozygotes and the corresponding Expected numbers under HWE (optional) + pattern: "*.hwe" + - tajima_d: + type: file + description: Tajima’s D statistic in bins with size of the specified number in options (optional) + pattern: "*.Tajima.D" + - freq_burden: + type: file + description: Number of variants within each individual of a specific frequency in options (optional) + pattern: "*.ifreqburden" + - lroh: + type: file + description: Long Runs of Homozygosity (optional) + pattern: "*.LROH" + - relatedness: + type: file + description: Relatedness statistic based on the method of Yang et al, Nature Genetics 2010 (doi:10.1038/ng.608) (optional) + pattern: "*.relatedness" + - relatedness2: + type: file + description: Relatedness statistic based on the method of Manichaikul et al., BIOINFORMATICS 2010 (doi:10.1093/bioinformatics/btq559) (optional) + pattern: "*.relatedness2" + - lqual: + type: file + description: per-site SNP quality (optional) + pattern: "*.lqual" + - missing_individual: + type: file + description: Missingness on a per-individual basis (optional) + pattern: "*.imiss" + - missing_site: + type: file + description: Missingness on a per-site basis (optional) + pattern: "*.lmiss" + - snp_density: + type: file + description: Number and density of SNPs in bins of size defined by option (optional) + pattern: "*.snpden" + - kept_sites: + type: file + description: All sites that have been kept after filtering (optional) + pattern: "*.kept.sites" + - removed_sites: + type: file + description: All sites that have been removed after filtering (optional) + pattern: "*.removed.sites" + - singeltons: + type: file + description: Location of singletons, and the individual they occur in (optional) + pattern: "*.singeltons" + - indel_hist: + type: file + description: Histogram file of the length of all indels (including SNPs) (optional) + pattern: "*.indel_hist" + - hapcount: + type: file + description: Unique haplotypes within user specified bins (optional) + pattern: "*.hapcount" + - mendel: + type: file + description: Mendel errors identified in trios (optional) + pattern: "*.mendel" + - format: + type: file + description: Extracted information from the genotype fields in the VCF file relating to a specfied FORMAT identifier (optional) + pattern: "*.FORMAT" + - info: + type: file + description: Extracted information from the INFO field in the VCF file (optional) + pattern: "*.INFO" + - genotypes_matrix: + type: file + description: | + Genotypes output as large matrix. + Genotypes of each individual on a separate line. + Genotypes are represented as 0, 1 and 2, where the number represent that number of non-reference alleles. 
+ Missing genotypes are represented by -1 (optional) + pattern: "*.012" + - genotypes_matrix_individual: + type: file + description: Details the individuals included in the main genotypes_matrix file (optional) + pattern: "*.012.indv" + - genotypes_matrix_position: + type: file + description: Details the site locations included in the main genotypes_matrix file (optional) + pattern: "*.012.pos" + - impute_hap: + type: file + description: Phased haplotypes in IMPUTE reference-panel format (optional) + pattern: "*.impute.hap" + - impute_hap_legend: + type: file + description: Impute haplotype legend file (optional) + pattern: "*.impute.hap.legend" + - impute_hap_indv: + type: file + description: Impute haplotype individuals file (optional) + pattern: "*.impute.hap.indv" + - ldhat_sites: + type: file + description: Output data in LDhat format, sites (optional) + pattern: "*.ldhat.sites" + - ldhat_locs: + type: file + description: Output data in LDhat format, locations (optional) + pattern: "*.ldhat.locs" + - beagle_gl: + type: file + description: Genotype likelihoods for biallelic sites (optional) + pattern: "*.BEAGLE.GL" + - beagle_pl: + type: file + description: Genotype likelihoods for biallelic sites (optional) + pattern: "*.BEAGLE.PL" + - ped: + type: file + description: output the genotype data in PLINK PED format (optional) + pattern: "*.ped" + - map_: + type: file + description: output the genotype data in PLINK MAP format (optional) + pattern: "*.map" + - tped: + type: file + description: output the genotype data in PLINK TPED (transposed PED) format (optional) + pattern: "*.tped" + - tfam: + type: file + description: output the genotype data in PLINK TFAM format (optional) + pattern: "*.tfam" + - diff_sites_in_files: + type: file + description: Sites that are common / unique to each file specified in optional inputs (optional) + pattern: "*.diff.sites.in.files" + - diff_indv_in_files: + type: file + description: Individuals that are common / unique to each file specified in optional inputs (optional) + pattern: "*.diff.indv.in.files" + - diff_sites: + type: file + description: Discordance on a site-by-site basis, specified in optional inputs (optional) + pattern: "*.diff.sites" + - diff_indv: + type: file + description: Discordance on an individual-by-individual basis, specified in optional inputs (optional) + pattern: "*.diff.indv" + - diff_discd_matrix: + type: file + description: Discordance matrix between files specified in optional inputs (optional) + pattern: "*.diff.discordance.matrix" + - diff_switch_error: + type: file + description: Switch errors found between sites (optional) + pattern: "*.diff.switch" +authors: + - "@Mark-S-Hill" +maintainers: + - "@Mark-S-Hill"
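Which of the optional outputs above actually appear depends entirely on which flags are passed to vcftools at runtime; in nf-core DSL2 pipelines this is typically driven through `ext.args`. A minimal, illustrative modules-config sketch (the `VCFTOOLS` selector name and the flag choice are assumptions, not taken from this diff):

```nextflow
// Illustrative only: each vcftools flag enables one of the optional outputs above,
// e.g. --TsTv-summary -> tstv_summary ("*.TsTv.summary"), --het -> heterozygosity ("*.het")
process {
    withName: 'VCFTOOLS' {
        ext.args = '--TsTv-summary --het --relatedness2'
    }
}
```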
diff --git a/nextflow.config b/nextflow.config index 4066884fea..ad2bd87e3d 100644 --- a/nextflow.config +++ b/nextflow.config @@ -6,17 +6,97 @@ ---------------------------------------------------------------------------------------- */ -// Global default params, used in configs params { + // Workflow flags: + + // Mandatory arguments + input = null // No default input + step = 'mapping' // Starts with mapping - // TODO nf-core: Specify your pipeline's command line flags - // Input options - input = null // References - genome = null - igenomes_base = 's3://ngi-igenomes/igenomes/' - igenomes_ignore = false - + genome = 'GATK.GRCh38' + igenomes_base = 's3://ngi-igenomes/igenomes/' + snpeff_cache = 's3://annotation-cache/snpeff_cache/' + vep_cache = 's3://annotation-cache/vep_cache/' + igenomes_ignore = false + save_reference = false // Built references not saved + build_only_index = false // Only build the reference indexes + download_cache = false // Do not download annotation cache + + // Main options + no_intervals = false // Intervals will be built from the fasta file + nucleotides_per_second = 200000 // Default interval size + tools = null // No default Variant_Calling or Annotation tools + skip_tools = null // All tools (markduplicates + baserecalibrator + QC) are used by default + split_fastq = 50000000 // FASTQ files will not be split by default by FASTP + + // Modify fastqs (trim/split) with FASTP + trim_fastq = false // No trimming + clip_r1 = 0 + clip_r2 = 0 + three_prime_clip_r1 = 0 + three_prime_clip_r2 = 0 + trim_nextseq = 0 + save_trimmed = false + save_split_fastqs = false + + // UMI tagged reads + umi_read_structure = null // no UMI + group_by_umi_strategy = 'Adjacency' // default strategy when running with UMI for GROUPREADSBYUMI + + // Preprocessing + aligner = 'bwa-mem' // Default is bwa-mem, bwa-mem2 and dragmap can be used too + use_gatk_spark = null // GATK Spark implementation of their tools in local mode not used by default + save_mapped = false // Mapped BAMs not saved + save_output_as_bam = false // Output files from preprocessing are saved as bam and not as cram files + seq_center = null // No sequencing center to be written in read group CN field by aligner + seq_platform = 'ILLUMINA' // Default platform written in read group PL field by aligner + + // Variant Calling + ascat_ploidy = null // default value for ASCAT + ascat_min_base_qual = 20 // default value for ASCAT + ascat_min_counts = 10 // default value for ASCAT + ascat_min_map_qual = 35 // default value for ASCAT + ascat_purity = null // default value for ASCAT + cf_ploidy = "2" // default value for Control-FREEC + cf_coeff = 0.05 // default value for Control-FREEC + cf_contamination = 0 // default value for Control-FREEC + cf_contamination_adjustment = false // by default we are not using this in Control-FREEC + cf_mincov = 0 // Control-FREEC default values + cf_minqual = 0 // Control-FREEC default values + cf_window = null // by default we are not using this in Control-FREEC + cnvkit_reference = null // by default the reference is built from the fasta file + concatenate_vcfs = false // by default we don't concatenate the germline VCF files + ignore_soft_clipped_bases = false // no --dont-use-soft-clipped-bases for GATK Mutect2 + joint_germline = false // g.vcf & joint germline calling are not run by default if HaplotypeCaller is selected + joint_mutect2 = false // if true, enables patient-wise multi-sample somatic variant calling + only_paired_variant_calling = false // if true, skips germline variant calling for paired normal samples + sentieon_dnascope_emit_mode = 'variant' // default value for Sentieon DNAscope + sentieon_dnascope_pcr_indel_model = 'CONSERVATIVE' + sentieon_haplotyper_emit_mode = 'variant' // default value for Sentieon haplotyper + wes = false // Set to true if data is exome/targeted sequencing data. Used to select the correct models in various variant callers
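All of the defaults above are ordinary Nextflow params, so they can be overridden per run on the command line (e.g. `--tools strelka,manta`) or in a small user config passed with `-c`. A hedged sketch, with purely illustrative values:

```nextflow
// my_run.config -- hypothetical user override file
params {
    tools       = 'strelka,manta' // comma-separated tool selection, see --tools
    wes         = true            // exome data: switches tools to targeted models
    split_fastq = 10000000        // smaller FASTQ shards => more parallel mapping jobs
}
```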
+ + // Annotation + bcftools_annotations = null // No extra annotation file + bcftools_annotations_tbi = null // No extra annotation file index + bcftools_header_lines = null // No header lines to be added to the VCF file + dbnsfp = null // No dbnsfp processed file + dbnsfp_consequence = null // No default consequence for dbnsfp plugin + dbnsfp_fields = "rs_dbSNP,HGVSc_VEP,HGVSp_VEP,1000Gp3_EAS_AF,1000Gp3_AMR_AF,LRT_score,GERP++_RS,gnomAD_exomes_AF" // Default fields for dbnsfp plugin + dbnsfp_tbi = null // No dbnsfp processed file index + outdir_cache = null // No default outdir cache + spliceai_indel = null // No spliceai_indel file + spliceai_indel_tbi = null // No spliceai_indel file index + spliceai_snv = null // No spliceai_snv file + spliceai_snv_tbi = null // No spliceai_snv file index + vep_custom_args = "--everything --filter_common --per_gene --total_length --offline --format vcf" // Default arguments for VEP + vep_dbnsfp = null // dbnsfp plugin disabled within VEP + vep_include_fasta = false // Don't use fasta file for annotation with VEP + vep_loftee = null // loftee plugin disabled within VEP + vep_out_format = "vcf" + vep_spliceai = null // spliceai plugin disabled within VEP + vep_spliceregion = null // spliceregion plugin disabled within VEP + vep_version = "110.0-0" // Should be updated when we update VEP; needed to get the full path to some plugins // MultiQC options multiqc_config = null @@ -43,7 +123,7 @@ params { custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" config_profile_contact = null config_profile_url = null - + test_data_base = 'https://raw.githubusercontent.com/nf-core/test-datasets/sarek3' // Max resource options // Defaults only, expecting to be overwritten @@ -53,11 +133,10 @@ params { // Schema validation default options validationFailUnrecognisedParams = false - validationLenientMode = false - validationSchemaIgnoreParams = 'genomes,igenomes_base' + validationLenientMode = true + validationSchemaIgnoreParams = 'cf_ploidy,genomes,igenomes_base' validationShowHiddenParams = false validate_params = true - } // Load base.config by default for all pipelines @@ -71,105 +150,164 @@ try { } // Load nf-core/sarek custom profiles from different institutions. -// Warning: Uncomment only if a pipeline-specific instititutional config already exists on nf-core/configs!
-// try { -// includeConfig "${params.custom_config_base}/pipeline/sarek.config" -// } catch (Exception e) { -// System.err.println("WARNING: Could not load nf-core/config/sarek profiles: ${params.custom_config_base}/pipeline/sarek.config") -// } +try { + includeConfig "${params.custom_config_base}/pipeline/sarek.config" +} catch (Exception e) { + System.err.println("WARNING: Could not load nf-core/config/sarek profiles: ${params.custom_config_base}/pipeline/sarek.config") +} + profiles { debug { + cleanup = false dumpHashes = true process.beforeScript = 'echo $HOSTNAME' cleanup = false nextflow.enable.configProcessNamesValidation = true } conda { + apptainer.enabled = false + charliecloud.enabled = false conda.enabled = true docker.enabled = false - singularity.enabled = false podman.enabled = false shifter.enabled = false - charliecloud.enabled = false - apptainer.enabled = false + singularity.enabled = false } mamba { + apptainer.enabled = false + charliecloud.enabled = false conda.enabled = true conda.useMamba = true + charliecloud.enabled = false docker.enabled = false - singularity.enabled = false podman.enabled = false shifter.enabled = false - charliecloud.enabled = false - apptainer.enabled = false + singularity.enabled = false } docker { - docker.enabled = true + apptainer.enabled = false + charliecloud.enabled = false conda.enabled = false - singularity.enabled = false + docker.enabled = true + docker.userEmulation = { params.use_gatk_spark ? false : true }.call() podman.enabled = false shifter.enabled = false - charliecloud.enabled = false - apptainer.enabled = false docker.runOptions = '-u $(id -u):$(id -g)' + singularity.enabled = false } arm { docker.runOptions = '-u $(id -u):$(id -g) --platform=linux/amd64' } singularity { - singularity.enabled = true - singularity.autoMounts = true + apptainer.enabled = false + charliecloud.enabled = false conda.enabled = false docker.enabled = false podman.enabled = false shifter.enabled = false - charliecloud.enabled = false - apptainer.enabled = false + singularity.autoMounts = true + singularity.enabled = true } podman { - podman.enabled = true + apptainer.enabled = false + charliecloud.enabled = false conda.enabled = false docker.enabled = false - singularity.enabled = false + podman.enabled = true shifter.enabled = false - charliecloud.enabled = false - apptainer.enabled = false + singularity.enabled = false } shifter { - shifter.enabled = true + apptainer.enabled = false + charliecloud.enabled = false conda.enabled = false docker.enabled = false - singularity.enabled = false podman.enabled = false - charliecloud.enabled = false - apptainer.enabled = false + shifter.enabled = true + singularity.enabled = false } charliecloud { + apptainer.enabled = false charliecloud.enabled = true conda.enabled = false docker.enabled = false - singularity.enabled = false podman.enabled = false shifter.enabled = false - apptainer.enabled = false + singularity.enabled = false } apptainer { apptainer.enabled = true apptainer.autoMounts = true + charliecloud.enabled = false conda.enabled = false docker.enabled = false - singularity.enabled = false podman.enabled = false shifter.enabled = false - charliecloud.enabled = false + singularity.enabled = false } gitpod { - executor.name = 'local' executor.cpus = 4 executor.memory = 8.GB + executor.name = 'local' + } + // Basic test profile for CI + test { includeConfig 'conf/test.config' } + test_aws { + includeConfig 'conf/test.config' + params.sentieon_dnascope_model = 
"s3://ngi-igenomes/igenomes/Homo_sapiens/GATK/GRCh38/Annotation/Sentieon/SentieonDNAscopeModel1.1.model" + } + test_azure { + includeConfig 'conf/test.config' + params.sentieon_dnascope_model = "az://igenomes/Homo_sapiens/GATK/GRCh38/Annotation/Sentieon/SentieonDNAscopeModel1.1.model" } - test { includeConfig 'conf/test.config' } - test_full { includeConfig 'conf/test_full.config' } + test_cache { includeConfig 'conf/test/cache.config' } + // Extra test profiles for full tests on AWS + test_full { includeConfig 'conf/test_full.config' } + test_full_aws { + includeConfig 'conf/test_full.config' + } + test_full_azure { + includeConfig 'conf/test_full.config' + params.input = 'https://raw.githubusercontent.com/nf-core/test-datasets/sarek/testdata/csv/HCC1395_WXS_somatic_full_test_azure.csv' + params.intervals = 'az://test-data/sarek/S07604624_Padded_Agilent_SureSelectXT_allexons_V6_UTR.bed' + params.igenomes_base = "az://igenomes" + } + test_full_germline { includeConfig 'conf/test_full_germline.config' } + test_full_germline_aws { + includeConfig 'conf/test_full_germline.config' + } + test_full_germline_azure { + includeConfig 'conf/test_full_germline.config' + params.input = 'https://raw.githubusercontent.com/nf-core/test-datasets/sarek/testdata/csv/NA12878_WGS_30x_full_test_azure.csv' + params.igenomes_base = "az://igenomes" + } + // Extra test profiles for more complete CI + alignment_to_fastq { includeConfig 'conf/test/alignment_to_fastq.config' } + annotation { includeConfig 'conf/test/annotation.config' } + markduplicates_bam { includeConfig 'conf/test/markduplicates_bam.config' } + markduplicates_cram { includeConfig 'conf/test/markduplicates_cram.config' } + no_intervals { includeConfig 'conf/test/no_intervals.config' } + pair { includeConfig 'conf/test/pair.config' } + prepare_recalibration_bam { includeConfig 'conf/test/prepare_recalibration_bam.config' } + prepare_recalibration_cram { includeConfig 'conf/test/prepare_recalibration_cram.config' } + recalibrate_bam { includeConfig 'conf/test/recalibrate_bam.config' } + recalibrate_cram { includeConfig 'conf/test/recalibrate_cram.config' } + save_bam_mapped { includeConfig 'conf/test/save_bam_mapped.config' } + sentieon_dedup_bam { includeConfig 'conf/test/sentieon_dedup_bam.config' } + sentieon_dedup_cram { includeConfig 'conf/test/sentieon_dedup_cram.config' } + skip_bqsr { includeConfig 'conf/test/skip_bqsr.config' } + skip_markduplicates { includeConfig 'conf/test/skip_markduplicates.config' } + split_fastq { includeConfig 'conf/test/split_fastq.config' } + targeted { includeConfig 'conf/test/targeted.config' } + tools { includeConfig 'conf/test/tools.config' } + tools_germline { includeConfig 'conf/test/tools_germline.config' } + tools_somatic { includeConfig 'conf/test/tools_somatic.config' } + tools_somatic_ascat { includeConfig 'conf/test/tools_somatic_ascat.config' } + tools_tumoronly { includeConfig 'conf/test/tools_tumoronly.config' } + trimming { includeConfig 'conf/test/trimming.config' } + umi { includeConfig 'conf/test/umi.config' } + use_gatk_spark { includeConfig 'conf/test/use_gatk_spark.config' } + variantcalling_channels { includeConfig 'conf/test/variantcalling_channels.config' } } // Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile @@ -183,6 +321,7 @@ singularity.registry = 'quay.io' // Nextflow plugins plugins { id 'nf-validation@1.1.3' // Validation of pipeline parameters and creation of an input channel from a sample sheet + id 'nf-prov' // Provenance reports for 
pipeline runs } // Load igenomes.config if required @@ -191,6 +330,7 @@ if (!params.igenomes_ignore) { } else { params.genomes = [:] } + // Export these variables to prevent local Python/R libraries from conflicting with those in the container // The JULIA depot path has been adjusted to a fixed path `/usr/local/share/julia` that needs to be used for packages in the container. // See https://apeltzer.github.io/post/03-julia-lang-nextflow/ for details on that. Once we have a common agreement on where to keep Julia packages, this is adjustable. @@ -225,6 +365,14 @@ dag { enabled = true file = "${params.outdir}/pipeline_info/pipeline_dag_${trace_timestamp}.html" } +prov { + enabled = true + formats { + bco { + file = "${params.outdir}/pipeline_info/manifest_${trace_timestamp}.bco.json" + } + } +} manifest { name = 'nf-core/sarek' @@ -234,11 +382,54 @@ manifest { mainScript = 'main.nf' nextflowVersion = '!>=23.04.0' version = '3.5dev' - doi = '' + doi = '10.12688/f1000research.16665.2, 10.1101/2023.07.19.549462, 10.5281/zenodo.3476425' } // Load modules.config for DSL2 module specific options -includeConfig 'conf/modules.config' +includeConfig 'conf/modules/modules.config' + +// Load more modules specific config for DSL2 module specific options + +// prepare reference +includeConfig 'conf/modules/download_cache.config' +includeConfig 'conf/modules/prepare_genome.config' +includeConfig 'conf/modules/prepare_intervals.config' + +// preprocessing +includeConfig 'conf/modules/aligner.config' +includeConfig 'conf/modules/alignment_to_fastq.config' +includeConfig 'conf/modules/markduplicates.config' +includeConfig 'conf/modules/sentieon_dedup.config' +includeConfig 'conf/modules/prepare_recalibration.config' +includeConfig 'conf/modules/recalibrate.config' +includeConfig 'conf/modules/trimming.config' +includeConfig 'conf/modules/umi.config' + +//ngscheckmate +includeConfig 'conf/modules/ngscheckmate.config' + +// variant calling +includeConfig 'conf/modules/ascat.config' +includeConfig 'conf/modules/cnvkit.config' +includeConfig 'conf/modules/controlfreec.config' +includeConfig 'conf/modules/deepvariant.config' +includeConfig 'conf/modules/freebayes.config' +includeConfig 'conf/modules/haplotypecaller.config' +includeConfig 'conf/modules/joint_germline.config' +includeConfig 'conf/modules/manta.config' +includeConfig 'conf/modules/mpileup.config' +includeConfig 'conf/modules/msisensorpro.config' +includeConfig 'conf/modules/mutect2.config' +includeConfig 'conf/modules/sentieon_dnascope.config' +includeConfig 'conf/modules/sentieon_dnascope_joint_germline.config' +includeConfig 'conf/modules/sentieon_haplotyper.config' +includeConfig 'conf/modules/sentieon_haplotyper_joint_germline.config' +includeConfig 'conf/modules/strelka.config' +includeConfig 'conf/modules/tiddit.config' +includeConfig 'conf/modules/post_variant_calling.config' + +//annotate +includeConfig 'conf/modules/annotate.config' // Function to ensure that resource requirements don't go beyond // a maximum limit diff --git a/nextflow_schema.json b/nextflow_schema.json index fff86f52ee..a7f7ed7065 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,35 +10,551 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir"], + "help_text": "Specify input samplesheet, step and output folder.", + "required": ["step", "outdir"], "properties": { "input": { + "description": "Path to comma-separated file containing 
information about the samples in the experiment.", + "help_text": "A design file with information about the samples in your experiment. Use this parameter to specify the location of the input files. It has to be a comma-separated file with a header row. See [usage docs](https://nf-co.re/sarek/usage#input).\n\nIf no input file is specified, sarek will attempt to locate one in the `{outdir}` directory. If no input should be supplied, i.e. when --step is supplied or --build_only_index is set, then set --input false", + "fa_icon": "fas fa-file-csv", + "schema": "assets/schema_input.json", + "type": "string", + "format": "file-path", + "exists": true, + "mimetype": "text/csv", + "pattern": "^\\S+\\.csv$" + }, + "input_restart": { "type": "string", + "description": "Automatic samplesheet retrieval for restart", + "fa_icon": "fas fa-file-csv", "format": "file-path", "exists": true, "mimetype": "text/csv", "pattern": "^\\S+\\.csv$", - "description": "Path to comma-separated file containing information about the samples in the experiment.", - "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. See [usage docs](https://nf-co.re/sarek/usage#samplesheet-input).", - "fa_icon": "fas fa-file-csv" + "hidden": true, + "schema": "assets/schema_input.json" + }, + "step": { + "type": "string", + "default": "mapping", + "fa_icon": "fas fa-play", + "description": "Starting step", + "help_text": "The pipeline starts from this step and then runs through the possible subsequent steps.", + "enum": [ + "mapping", + "markduplicates", + "prepare_recalibration", + "recalibrate", + "variant_calling", + "annotate" + ] }, "outdir": { "type": "string", "format": "directory-path", "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.", "fa_icon": "fas fa-folder-open" + } + } + },
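Because `--step` controls where the pipeline enters the chain of steps enumerated above, a typical restart pairs it with a samplesheet pointing at the intermediate files; a minimal sketch (the CSV path is hypothetical):

```nextflow
// Hypothetical restart: enter at variant calling with already-recalibrated samples
params {
    step  = 'variant_calling'
    input = 'results/csv/recalibrated.csv' // illustrative samplesheet path
}
```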
+ "main_options": { + "title": "Main options", + "type": "object", + "description": "Most common options used for the pipeline", + "default": "", + "properties": { + "split_fastq": { + "oneOf": [ + { + "type": "integer", + "minimum": 250 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 0 + } + ], + "type": "integer", + "default": 50000000, + "fa_icon": "fas fa-clock", + "description": "Specify how many reads each split of a FastQ file contains. Set to 0 to turn off splitting entirely.", + "help_text": "Use the tool FastP to split FASTQ files by number of reads. This parallelizes across FASTQ file shards, speeding up mapping. Note that although the minimum value is 250 reads, if you have fewer than 250 reads a single FASTQ shard will still be created." }, - "email": { + "wes": { + "type": "boolean", + "fa_icon": "fas fa-dna", + "description": "Enable when exome or panel data is provided.", + "help_text": "With this parameter, flags in various tools are set for targeted sequencing data. It is recommended to enable it for whole-exome and panel data analysis." + }, + "intervals": { "type": "string", - "description": "Email address for completion summary.", - "fa_icon": "fas fa-envelope", - "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.", - "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" + "fa_icon": "fas fa-file-alt", + "help_text": "To speed up preprocessing and variant calling processes, the execution is parallelized across a reference chopped into smaller pieces.\n\nParts of preprocessing and variant calling are done per interval, and the different resulting files are then merged.\nThis can parallelize processes and push down wall-clock time significantly.\n\nWe align to the whole genome, and then run Base Quality Score Recalibration and Variant Calling on the supplied regions.\n\n**Whole Genome Sequencing:**\n\nThe (provided) intervals are chromosomes cut at their centromeres (so each chromosome arm is processed separately), plus additional unassigned contigs.\n\nWe are ignoring the `hs37d5` contig that contains concatenated decoy sequences.\n\nThe calling intervals can be defined using a .list or a BED file.\nA .list file contains one interval per line in the format `chromosome:start-end` (1-based coordinates).\nA BED file must be a tab-separated text file with one interval per line.\nThere must be at least three columns: chromosome, start, and end (0-based coordinates).\nAdditionally, the score column of the BED file can be used to provide an estimate of how many seconds it will take to call variants on that interval.\nThe fourth column remains unused.\n\n```\n|chr1|10000|207666|NA|47.3|\n```\nThis indicates that variant calling on the interval chr1:10001-207666 takes approximately 47.3 seconds.\n\nThe runtime estimate is used in two different ways.\nFirst, when there are multiple consecutive intervals in the file that take little time to compute, they are processed as a single job, thus reducing the number of processes that need to be spawned.\nSecond, the jobs with the largest processing time are started first, which reduces wall-clock time.\nIf no runtime is given, a time of 200000 nucleotides per second is assumed. See `--nucleotides_per_second` on how to customize this.\nActual figures vary from 2 nucleotides/second to 30000 nucleotides/second.\nIf you prefer, you can specify the full path to your reference genome when you run the pipeline:\n\n> **NB** If none provided, will be generated automatically from the FASTA reference\n> **NB** Use --no_intervals to disable automatic generation.\n\n**Targeted Sequencing:**\n\nThe recommended flow for targeted sequencing data is to use the workflow as it is, but also provide a `BED` file containing targets for all steps using the `--intervals` option. In addition, the parameter `--wes` should be set.\nIt is advised to pad the variant calling regions (exons or target) to some extent before submitting to the workflow.\n\nThe procedure is similar to whole genome sequencing, except that only BED files are accepted. See above for formatting description.\nAdding every exon as an interval in case of `WES` can generate >200K processes or jobs, many more forks, and a similar number of directories in the Nextflow work directory. These are appropriately grouped together to reduce the number of processes run in parallel (see above and `--nucleotides_per_second` for details).\nFurthermore, primers and/or baits are not 100% specific (certainly not for MHC and KIR, etc.), so quite likely there are going to be reads mapping to multiple locations.\nIf you are certain that the target is unique for your genome (all the reads will certainly map to only one location), and aligning to the whole genome is overkill, it is actually better to change the reference itself.", + "description": "Path to target bed file in case of whole exome or targeted sequencing or intervals file." + },
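To make the default runtime estimate concrete, here is the arithmetic described above as a small Groovy sketch (the numbers are illustrative):

```nextflow
// Without a per-interval estimate in column 5, duration is derived from interval length
def interval_length = 8000000                          // an 8 Mb chromosome-arm chunk
def nt_per_second   = 200000                           // default --nucleotides_per_second
def estimated_s     = interval_length / nt_per_second  // = 40 seconds for this interval
```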
+ "nucleotides_per_second": { + "type": "integer", + "fa_icon": "fas fa-clock", + "description": "Estimate interval size.", + "help_text": "Intervals are parts of the chopped up genome used to speed up preprocessing and variant calling. See `--intervals` for more info. \n\nChanging this parameter changes the number of intervals that are grouped and processed together. BED files from targeted sequencing can contain thousands of small intervals. Spinning up a new process for each can be quite resource intensive. Instead, it can be desirable to process small intervals together on larger nodes. \nIn order to make use of this parameter, no runtime estimate can be present in the bed file (column 5). ", + "default": 200000 + }, + "no_intervals": { + "type": "boolean", + "fa_icon": "fas fa-ban", + "description": "Disable usage of intervals.", + "help_text": "Intervals are parts of the chopped up genome used to speed up preprocessing and variant calling. See `--intervals` for more info. \n\nIf `--no_intervals` is set, no intervals will be taken into account for speed-up or data processing." + }, + "tools": { + "type": "string", + "fa_icon": "fas fa-toolbox", + "description": "Tools to use for duplicate marking, variant calling and/or for annotation.", + "help_text": "Multiple tools separated with commas.\n\n**Variant Calling:**\n\nGermline variant calling can currently be performed with the following variant callers:\n- SNPs/Indels: DeepVariant, FreeBayes, GATK HaplotypeCaller, mpileup, Sentieon Haplotyper, Strelka\n- Structural Variants: Manta, TIDDIT\n- Copy-number: CNVKit\n\nTumor-only somatic variant calling can currently be performed with the following variant callers:\n- SNPs/Indels: FreeBayes, mpileup, Mutect2, Strelka\n- Structural Variants: Manta, TIDDIT\n- Copy-number: CNVKit, Control-FREEC\n\nSomatic variant calling can currently only be performed with the following variant callers:\n- SNPs/Indels: FreeBayes, Mutect2, Strelka2\n- Structural variants: Manta, TIDDIT\n- Copy-Number: ASCAT, CNVKit, Control-FREEC\n- Microsatellite Instability: MSIsensorpro\n\n> **NB** Mutect2 for somatic variant calling cannot be combined with `--no_intervals`\n\n**Annotation:**\n \n- snpEff, VEP, merge (both consecutively), and bcftools annotate (needs `--bcftools_annotations`).\n\n> **NB** As Sarek will use bgzip and tabix to compress and index the annotated VCF files, it expects VCF files to be sorted when starting from `--step annotate`.", + "pattern": "^((ascat|bcfann|cnvkit|controlfreec|deepvariant|freebayes|haplotypecaller|sentieon_dnascope|sentieon_haplotyper|manta|merge|mpileup|msisensorpro|mutect2|ngscheckmate|sentieon_dedup|snpeff|strelka|tiddit|vep)?,?)*(?
**NB** `--skip_tools baserecalibrator_report` is actually just not saving the reports.\n> **NB** `--skip_tools markduplicates_report` does not skip `MarkDuplicates` but prevents the collection of duplicate metrics that slows down performance.", + "pattern": "^((baserecalibrator|baserecalibrator_report|bcftools|dnascope_filter|documentation|fastqc|haplotypecaller_filter|haplotyper_filter|markduplicates|markduplicates_report|mosdepth|multiqc|samtools|vcftools|versions)?,?)*(? The GATK4 Base Quality Score recalibration tools `Baserecalibrator` and `ApplyBQSR` are currently available as a Beta release. Use with caution!", + "pattern": "^((baserecalibrator|markduplicates)?,?)*(? **NB** If none provided, will be generated automatically from the FASTA reference. Combine with `--save_reference` to save for future runs.", + "hidden": true + }, + "bwamem2": { + "type": "string", + "fa_icon": "fas fa-copy", + "description": "Path to bwa-mem2 mem indices.", + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\n\nIf you wish to recompute indices available on igenomes, set `--bwamem2 false`.\n\n> **NB** If none provided, will be generated automatically from the FASTA reference, if `--aligner bwa-mem2` is specified. Combine with `--save_reference` to save for future runs.", + "hidden": true + }, + "chr_dir": { + "type": "string", + "fa_icon": "fas fa-folder-open", + "description": "Path to chromosomes folder used with Control-FREEC.", + "hidden": true, + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately." + }, + "dbsnp": { + "type": "string", + "fa_icon": "fas fa-file", + "description": "Path to dbsnp file.", + "hidden": true, + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately." + }, + "dbsnp_tbi": { + "type": "string", + "fa_icon": "fas fa-file", + "description": "Path to dbsnp index.", + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\n\n> **NB** If none provided, will be generated automatically from the dbsnp file. Combine with `--save_reference` to save for future runs.", + "hidden": true + }, + "dbsnp_vqsr": { + "type": "string", + "fa_icon": "fas fa-copy", + "description": "Label string for VariantRecalibrator (haplotypecaller joint variant calling)" + }, + "dict": { + "type": "string", + "fa_icon": "fas fa-file", + "description": "Path to FASTA dictionary file.", + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\n\n> **NB** If none provided, will be generated automatically from the FASTA reference. Combine with `--save_reference` to save for future runs.", + "hidden": true + }, + "dragmap": { + "type": "string", + "fa_icon": "fas fa-copy", + "description": "Path to dragmap indices.", + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\n\nIf you wish to recompute indices available on igenomes, set `--dragmap false`.\n\n> **NB** If none provided, will be generated automatically from the FASTA reference, if `--aligner dragmap` is specified. Combine with `--save_reference` to save for future runs.", + "hidden": true }, "fasta": { "type": "string", @@ -61,17 +668,177 @@ "mimetype": "text/plain", "pattern": "^\\S+\\.fn?a(sta)?(\\.gz)?$", "description": "Path to FASTA genome file.", - "help_text": "This parameter is *mandatory* if `--genome` is not specified. If you don't have a BWA index available this will be generated for you automatically. Combine with `--save_reference` to save BWA index for future runs.", + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\n\nThis parameter is *mandatory* if `--genome` is not specified.", "fa_icon": "far fa-file-code" },
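When not relying on iGenomes, these reference params are usually set together; a minimal sketch with illustrative local paths (`fasta_fai` and the other companion files are documented just below):

```nextflow
params {
    genome         = null                  // skip the iGenomes lookup entirely
    fasta          = '/refs/genome.fasta'  // illustrative path
    dict           = '/refs/genome.dict'   // illustrative path
    save_reference = true                  // keep indexes built on the fly for reuse
}
```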
+ "fasta_fai": { + "type": "string", + "fa_icon": "fas fa-file", + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\n\n> **NB** If none provided, will be generated automatically from the FASTA reference. Combine with `--save_reference` to save for future runs.", + "description": "Path to FASTA reference index." + }, + "germline_resource": { + "type": "string", + "fa_icon": "fas fa-file", + "description": "Path to GATK Mutect2 Germline Resource File.", + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\n\nThe germline resource VCF file (bgzipped and tabixed) needed by GATK4 Mutect2 is a collection of calls that are likely present in the sample, with allele frequencies.\nThe AF info field must be present.\nYou can find a smaller, stripped gnomAD VCF file (most of the annotation is removed and only calls marked as PASS are stored) in the AWS iGenomes Annotation/GermlineResource folder.", + "hidden": true + }, + "germline_resource_tbi": { + "type": "string", + "fa_icon": "fas fa-file", + "description": "Path to GATK Mutect2 Germline Resource Index.", + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\n\n> **NB** If none provided, will be generated automatically from the Germline Resource file, if provided. Combine with `--save_reference` to save for future runs.", + "hidden": true + }, + "known_indels": { + "type": "string", + "fa_icon": "fas fa-copy", + "description": "Path to known indels file.", + "hidden": true, + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately." + }, + "known_indels_tbi": { + "type": "string", + "fa_icon": "fas fa-copy", + "description": "Path to known indels file index.", + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\n\n> **NB** If none provided, will be generated automatically from the known indels file, if provided. Combine with `--save_reference` to save for future runs.", + "hidden": true + }, + "known_indels_vqsr": { + "type": "string", + "fa_icon": "fas fa-copy", + "description": "If you use AWS iGenomes, this has already been set for you appropriately.\n\n1st label string for VariantRecalibrator (haplotypecaller joint variant calling)" + }, + "known_snps": { + "type": "string", + "fa_icon": "fas fa-copy", + "description": "If you use AWS iGenomes, this has already been set for you appropriately.\n\nPath to known snps file." + }, + "known_snps_tbi": { + "type": "string", + "fa_icon": "fas fa-copy", + "description": "Path to known snps file index.", + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\n\n> **NB** If none provided, will be generated automatically from the known snps file, if provided. Combine with `--save_reference` to save for future runs." + }, + "known_snps_vqsr": { + "type": "string", + "fa_icon": "fas fa-copy", + "description": "If you use AWS iGenomes, this has already been set for you appropriately.\n\nLabel string for VariantRecalibrator (haplotypecaller joint variant calling)" + },
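The `*_vqsr` fields hold GATK VariantRecalibrator `--resource` label strings; a sketch of their shape (resource names, priors and file names are illustrative, the real values for `--genome GATK.GRCh38` come from igenomes.config):

```nextflow
params {
    dbsnp_vqsr      = '--resource:dbsnp,known=false,training=true,truth=false,prior=2.0 dbsnp.vcf.gz'
    known_snps_vqsr = '--resource:1000G,known=false,training=true,truth=false,prior=10.0 1000G_snps.vcf.gz'
}
```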
+ "mappability": { + "type": "string", + "fa_icon": "fas fa-file", + "description": "Path to Control-FREEC mappability file.", + "hidden": true, + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately." + }, + "ngscheckmate_bed": { + "type": "string", + "fa_icon": "fas fa-file", + "description": "Path to SNP bed file for sample checking with NGSCheckMate", + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately." + }, + "pon": { + "type": "string", + "fa_icon": "fas fa-file", + "description": "Panel-of-normals VCF (bgzipped) for GATK Mutect2", + "help_text": "Without PON, there will be no calls with PASS in the FILTER field, only an unfiltered VCF is written.\nIt is highly recommended to make your own PON, as it depends on sequencer and library preparation.\n\nThe pipeline is shipped with a panel-of-normals for `--genome GATK.GRCh38` provided by [GATK](https://gatk.broadinstitute.org/hc/en-us/articles/360035890631-Panel-of-Normals-PON-). \n\nSee [PON documentation](https://gatk.broadinstitute.org/hc/en-us/articles/360042479112-CreateSomaticPanelOfNormals-BETA)\n> **NB** PON file should be bgzipped.", + "hidden": true + }, + "pon_tbi": { + "type": "string", + "fa_icon": "fas fa-file", + "description": "Index of PON panel-of-normals VCF.", + "help_text": "If none provided, will be generated automatically from the PON bgzipped VCF file.", + "hidden": true + }, + "sentieon_dnascope_model": { + "type": "string", + "fa_icon": "fas fa-database", + "hidden": true, + "description": "Machine learning model for Sentieon Dnascope.", + "help_text": "It is recommended to use DNAscope with a machine learning model to perform variant calling with higher accuracy by improving the candidate detection and filtering. Sentieon can provide you with a model trained using a subset of the data from the GIAB truth-set found in https://github.com/genome-in-a-bottle. In addition, Sentieon can assist you in the creation of models using your own data, which will calibrate the specifics of your sequencing and bio-informatics processing." + }, + "snpeff_db": { + "type": "string", + "fa_icon": "fas fa-database", + "description": "snpEff DB version.", + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\nThis is used to specify the database to be used to annotate with.\nAlternatively, database names can be listed with the `snpEff databases` command." + }, + "snpeff_genome": { + "type": "string", + "fa_icon": "fas fa-microscope", + "description": "snpEff genome.", + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\nThis is used to specify the genome when looking for local cache, or cloud based cache." + }, + "vep_genome": { + "type": "string", + "fa_icon": "fas fa-microscope", + "description": "VEP genome.", + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\nThis is used to specify the genome when looking for local cache, or cloud based cache." + },
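These genome/database fields combine with the cache params (`--vep_cache`, `--snpeff_cache`, plus `vep_species` and `vep_cache_version`, all documented just below) to resolve the cache subdirectory; an illustrative GRCh38 combination (the version numbers are assumptions, not taken from this diff):

```nextflow
params {
    snpeff_genome     = 'GRCh38'       // cache dir: ${snpeff_genome}.${snpeff_db}
    snpeff_db         = '105'          // illustrative version
    vep_genome        = 'GRCh38'
    vep_species       = 'homo_sapiens' // cache dir: ${vep_species}/${vep_genome}_${vep_cache_version}
    vep_cache_version = 110            // illustrative version
}
```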
+ "vep_species": { + "type": "string", + "fa_icon": "fas fa-microscope", + "description": "VEP species.", + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\nAlternatively, species listed in Ensembl Genomes caches can be used." + }, + "vep_cache_version": { + "type": "number", + "fa_icon": "fas fa-tag", + "description": "VEP cache version.", + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\nAlternatively, the cache version can be used to specify the correct Ensembl Genomes version number, as these differ from the concurrent Ensembl/VEP version numbers." + }, + "save_reference": { + "type": "boolean", + "fa_icon": "fas fa-download", + "description": "Save built references.", + "help_text": "Set this parameter if you wish to save all computed reference files. This is useful to avoid re-computation on future runs." + }, + "build_only_index": { + "type": "boolean", + "fa_icon": "fas fa-download", + "description": "Only build references.", + "help_text": "Set this parameter if you wish to compute and save all computed reference files. No alignment or any other downstream steps will be performed." + }, + "download_cache": { + "type": "boolean", + "fa_icon": "fas fa-download", + "description": "Download annotation cache.", + "help_text": "Set this parameter if you wish to download the annotation cache.\nUsing this parameter will download the cache even if --snpeff_cache and --vep_cache are provided." + }, + "igenomes_base": { + "type": "string", + "format": "directory-path", + "description": "Directory / URL base for iGenomes references.", + "default": "s3://ngi-igenomes/igenomes/", + "fa_icon": "fas fa-cloud-download-alt" + }, "igenomes_ignore": { "type": "boolean", "description": "Do not load the iGenomes reference config.", "fa_icon": "fas fa-ban", - "hidden": true, - "help_text": "Do not load `igenomes.config` when running the pipeline. You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`." + "help_text": "Do not load `igenomes.config` when running the pipeline.\nYou may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`.\n\n> **NB** You can then run `Sarek` by specifying at least a FASTA genome file." + }, + "vep_cache": { + "type": "string", + "format": "directory-path", + "fa_icon": "fas fa-cloud-download-alt", + "default": "s3://annotation-cache/vep_cache/", + "description": "Path to VEP cache.", + "help_text": "Path to VEP cache which should contain the relevant species, genome and build directories at the path ${vep_species}/${vep_genome}_${vep_cache_version}" + }, + "snpeff_cache": { + "type": "string", + "format": "directory-path", + "fa_icon": "fas fa-cloud-download-alt", + "default": "s3://annotation-cache/snpeff_cache/", + "description": "Path to snpEff cache.", + "help_text": "Path to snpEff cache which should contain the relevant genome and build directory in the path ${snpeff_genome}.${snpeff_db}" + } - } + }, + "help_text": "The pipeline config files come bundled with paths to the Illumina iGenomes reference index files.\nThe configuration is set up to use the AWS-iGenomes resource\ncf. https://ewels.github.io/AWS-iGenomes/."
}, "institutional_config_options": { "title": "Institutional config options", @@ -118,6 +885,27 @@ "description": "Institutional config URL link.", "hidden": true, "fa_icon": "fas fa-users-cog" + }, + "test_data_base": { + "type": "string", + "default": "https://raw.githubusercontent.com/nf-core/test-datasets/sarek3", + "description": "Base path / URL for data used in the test profiles", + "help_text": "Warning: The `-profile test` samplesheet file itself contains remote paths. Setting this parameter does not alter the contents of that file.", + "hidden": true + }, + "seq_center": { + "type": "string", + "fa_icon": "fas fa-university", + "description": "Sequencing center information to be added to read group (CN field).", + "hidden": true + }, + "seq_platform": { + "type": "string", + "fa_icon": "fas fa-university", + "default": "ILLUMINA", + "description": "Sequencing platform information to be added to read group (PL field).", + "help_text": "Default: ILLUMINA. Will be used to create a proper header for further GATK4 downstream analysis.", + "hidden": true } } }, @@ -134,7 +922,7 @@ "default": 16, "fa_icon": "fas fa-microchip", "hidden": true, - "help_text": "Use to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. `--max_cpus 1`" + "help_text": "Use to set an upper-limit for the CPU requirement for each process. Should be an integer e.g. `--max_cpus 1`." }, "max_memory": { "type": "string", @@ -143,7 +931,7 @@ "fa_icon": "fas fa-memory", "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", "hidden": true, - "help_text": "Use to set an upper-limit for the memory requirement for each process. Should be a string in the format integer-unit e.g. `--max_memory '8.GB'`" + "help_text": "Use to set an upper-limit for the memory requirement for each process. Should be a string in the format integer-unit e.g. `--max_memory '8.GB'`." }, "max_time": { "type": "string", @@ -152,7 +940,7 @@ "fa_icon": "far fa-clock", "pattern": "^(\\d+\\.?\\s*(s|m|h|d|day)\\s*)+$", "hidden": true, - "help_text": "Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'`" + "help_text": "Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'`." } } }, @@ -184,6 +972,13 @@ "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], "hidden": true }, + "email": { + "type": "string", + "description": "Email address for completion summary.", + "fa_icon": "fas fa-envelope", + "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.", + "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" + }, "email_on_fail": { "type": "string", "description": "Email address for completion summary, only when pipeline fails.", @@ -212,12 +1007,10 @@ "fa_icon": "fas fa-palette", "hidden": true }, - "hook_url": { + "multiqc_title": { "type": "string", - "description": "Incoming hook URL for messaging service", - "fa_icon": "fas fa-people-group", - "help_text": "Incoming hook URL for messaging service. Currently, MS Teams and Slack are supported.", - "hidden": true + "description": "MultiQC report title. 
Printed as page header, used for filename if not otherwise specified.", + "fa_icon": "fas fa-file-signature" }, "multiqc_config": { "type": "string", @@ -264,6 +1057,13 @@ "description": "Validation of parameters in lenient more.", "hidden": true, "help_text": "Allows string values that are parseable as numbers or booleans. For further information see [JSONSchema docs](https://github.com/everit-org/json-schema#lenient-mode)." + }, + "hook_url": { + "type": "string", + "description": "Incoming hook URL for messaging service", + "fa_icon": "fas fa-people-group", + "help_text": "Incoming hook URL for messaging service. Currently, MS Teams and Slack are supported.", + "hidden": true } } } @@ -272,6 +1072,21 @@ { "$ref": "#/definitions/input_output_options" }, + { + "$ref": "#/definitions/main_options" + }, + { + "$ref": "#/definitions/fastq_preprocessing" + }, + { + "$ref": "#/definitions/preprocessing" + }, + { + "$ref": "#/definitions/variant_calling" + }, + { + "$ref": "#/definitions/annotation" + }, { "$ref": "#/definitions/reference_genome_options" }, diff --git a/nf-test.config b/nf-test.config new file mode 100644 index 0000000000..c60f901961 --- /dev/null +++ b/nf-test.config @@ -0,0 +1,6 @@ +config { + testsDir "." + workDir ".nf-test" + configFile "conf/test.config" + profile "test" +} diff --git a/subworkflows/local/bam_applybqsr/main.nf b/subworkflows/local/bam_applybqsr/main.nf new file mode 100644 index 0000000000..667b349a27 --- /dev/null +++ b/subworkflows/local/bam_applybqsr/main.nf @@ -0,0 +1,47 @@ +// +// RECALIBRATE +// +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include { GATK4_APPLYBQSR } from '../../../modules/nf-core/gatk4/applybqsr/main' +include { CRAM_MERGE_INDEX_SAMTOOLS } from '../cram_merge_index_samtools/main' + +workflow BAM_APPLYBQSR { + take: + cram // channel: [mandatory] [ meta, cram, crai, recal ] + dict // channel: [mandatory] [ dict ] + fasta // channel: [mandatory] [ fasta ] + fasta_fai // channel: [mandatory] [ fasta_fai ] + intervals // channel: [mandatory] [ intervals, num_intervals ] or [ [], 0 ] if no intervals + + main: + versions = Channel.empty() + + // Combine cram and intervals for spread and gather strategy + cram_intervals = cram.combine(intervals) + // Move num_intervals to meta map + .map{ meta, cram, crai, recal, intervals, num_intervals -> [ meta + [ num_intervals:num_intervals ], cram, crai, recal, intervals ] } + + // RUN APPLYBQSR + GATK4_APPLYBQSR(cram_intervals, fasta, fasta_fai, dict.map{ meta, it -> [ it ] }) + + // Gather the recalibrated cram files + cram_to_merge = GATK4_APPLYBQSR.out.cram.map{ meta, cram -> [ groupKey(meta, meta.num_intervals), cram ] }.groupTuple() + + // Merge and index the recalibrated cram files + CRAM_MERGE_INDEX_SAMTOOLS(cram_to_merge, fasta, fasta_fai) + + cram_recal = CRAM_MERGE_INDEX_SAMTOOLS.out.cram_crai + // Remove no longer necessary field: num_intervals + .map{ meta, cram, crai -> [ meta - meta.subMap('num_intervals'), cram, crai ] } + + // Gather versions of all tools used + versions = versions.mix(GATK4_APPLYBQSR.out.versions) + versions = versions.mix(CRAM_MERGE_INDEX_SAMTOOLS.out.versions) + + emit: + cram = cram_recal // channel: [ meta, cram, crai ] + + versions // channel: [ versions.yml ] +} diff --git a/subworkflows/local/bam_applybqsr_spark/main.nf b/subworkflows/local/bam_applybqsr_spark/main.nf new file mode 100644 index 0000000000..cfb86ef6a4 --- /dev/null +++ 
b/subworkflows/local/bam_applybqsr_spark/main.nf @@ -0,0 +1,47 @@ +// +// RECALIBRATE SPARK +// +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include { GATK4SPARK_APPLYBQSR } from '../../../modules/nf-core/gatk4spark/applybqsr/main' +include { CRAM_MERGE_INDEX_SAMTOOLS } from '../cram_merge_index_samtools/main' + +workflow BAM_APPLYBQSR_SPARK { + take: + cram // channel: [mandatory] [ meta, cram, crai, recal ] + dict // channel: [mandatory] [ dict ] + fasta // channel: [mandatory] [ fasta ] + fasta_fai // channel: [mandatory] [ fasta_fai ] + intervals // channel: [mandatory] [ intervals, num_intervals ] or [ [], 0 ] if no intervals + + main: + versions = Channel.empty() + + // Combine cram and intervals for spread and gather strategy + cram_intervals = cram.combine(intervals) + // Move num_intervals to meta map + .map{ meta, cram, crai, recal, intervals, num_intervals -> [ meta + [ num_intervals:num_intervals ], cram, crai, recal, intervals ] } + + // RUN APPLYBQSR SPARK + GATK4SPARK_APPLYBQSR(cram_intervals, fasta, fasta_fai, dict.map{ meta, it -> [ it ] }) + + // Gather the recalibrated cram files + cram_to_merge = GATK4SPARK_APPLYBQSR.out.cram.map{ meta, cram -> [ groupKey(meta, meta.num_intervals), cram ] }.groupTuple() + + // Merge and index the recalibrated cram files + CRAM_MERGE_INDEX_SAMTOOLS(cram_to_merge, fasta, fasta_fai) + + cram_recal = CRAM_MERGE_INDEX_SAMTOOLS.out.cram_crai + // Remove no longer necessary field: num_intervals + .map{ meta, cram, crai -> [ meta - meta.subMap('num_intervals'), cram, crai ] } + + // Gather versions of all tools used + versions = versions.mix(GATK4SPARK_APPLYBQSR.out.versions) + versions = versions.mix(CRAM_MERGE_INDEX_SAMTOOLS.out.versions) + + emit: + cram = cram_recal // channel: [ meta, cram, crai ] + + versions // channel: [ versions.yml ] +} diff --git a/subworkflows/local/bam_baserecalibrator/main.nf b/subworkflows/local/bam_baserecalibrator/main.nf new file mode 100644 index 0000000000..198b96e4ea --- /dev/null +++ b/subworkflows/local/bam_baserecalibrator/main.nf @@ -0,0 +1,54 @@ +// +// PREPARE RECALIBRATION +// +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include { GATK4_BASERECALIBRATOR } from '../../../modules/nf-core/gatk4/baserecalibrator/main' +include { GATK4_GATHERBQSRREPORTS } from '../../../modules/nf-core/gatk4/gatherbqsrreports/main' + +workflow BAM_BASERECALIBRATOR { + take: + cram // channel: [mandatory] [ meta, cram_markduplicates, crai ] + dict // channel: [mandatory] [ dict ] + fasta // channel: [mandatory] [ fasta ] + fasta_fai // channel: [mandatory] [ fasta_fai ] + intervals // channel: [mandatory] [ intervals, num_intervals ] (or [ [], 0 ] if no intervals) + known_sites // channel: [optional] [ known_sites ] + known_sites_tbi // channel: [optional] [ known_sites_tbi ] + + main: + versions = Channel.empty() + + // Combine cram and intervals for spread and gather strategy + cram_intervals = cram.combine(intervals) + // Move num_intervals to meta map + .map{ meta, cram, crai, intervals, num_intervals -> [ meta + [ num_intervals:num_intervals ], cram, crai, intervals ] } + + // RUN BASERECALIBRATOR + GATK4_BASERECALIBRATOR(cram_intervals, fasta, fasta_fai, dict.map{ meta, it -> [ it ] }, known_sites, known_sites_tbi) + + // Figuring out if there is one or more table(s) from the same sample + table_to_merge = 
GATK4_BASERECALIBRATOR.out.table.map{ meta, table -> [ groupKey(meta, meta.num_intervals), table ] }.groupTuple().branch{ + // Use meta.num_intervals to assess number of intervals + single: it[0].num_intervals <= 1 + multiple: it[0].num_intervals > 1 + } + + // Only when using intervals + GATK4_GATHERBQSRREPORTS(table_to_merge.multiple) + + // Mix intervals and no_intervals channels together + table_bqsr = GATK4_GATHERBQSRREPORTS.out.table.mix(table_to_merge.single.map{ meta, table -> [ meta, table[0] ] }) + // Remove no longer necessary field: num_intervals + .map{ meta, table -> [ meta - meta.subMap('num_intervals'), table ] } + + // Gather versions of all tools used + versions = versions.mix(GATK4_BASERECALIBRATOR.out.versions) + versions = versions.mix(GATK4_GATHERBQSRREPORTS.out.versions) + + emit: + table_bqsr // channel: [ meta, table ] + + versions // channel: [ versions.yml ] +} diff --git a/subworkflows/local/bam_baserecalibrator_spark/main.nf b/subworkflows/local/bam_baserecalibrator_spark/main.nf new file mode 100644 index 0000000000..d6e12c39e0 --- /dev/null +++ b/subworkflows/local/bam_baserecalibrator_spark/main.nf @@ -0,0 +1,54 @@ +// +// PREPARE RECALIBRATION SPARK +// +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include { GATK4SPARK_BASERECALIBRATOR } from '../../../modules/nf-core/gatk4spark/baserecalibrator/main' +include { GATK4_GATHERBQSRREPORTS } from '../../../modules/nf-core/gatk4/gatherbqsrreports/main' + +workflow BAM_BASERECALIBRATOR_SPARK { + take: + cram // channel: [mandatory] [ meta, cram_markduplicates, crai ] + dict // channel: [mandatory] [ dict ] + fasta // channel: [mandatory] [ fasta ] + fasta_fai // channel: [mandatory] [ fasta_fai ] + intervals // channel: [mandatory] [ intervals, num_intervals ] (or [ [], 0 ] if no intervals) + known_sites // channel: [optional] [ known_sites ] + known_sites_tbi // channel: [optional] [ known_sites_tbi ] + + main: + versions = Channel.empty() + + // Combine cram and intervals for spread and gather strategy + cram_intervals = cram.combine(intervals) + // Move num_intervals to meta map + .map{ meta, cram, crai, intervals, num_intervals -> [ meta + [ num_intervals:num_intervals ], cram, crai, intervals ] } + + // RUN BASERECALIBRATOR SPARK + GATK4SPARK_BASERECALIBRATOR(cram_intervals, fasta, fasta_fai, dict.map{ meta, it -> [ it ] }, known_sites, known_sites_tbi) + + // Figuring out if there is one or more table(s) from the same sample + table_to_merge = GATK4SPARK_BASERECALIBRATOR.out.table.map{ meta, table -> [ groupKey(meta, meta.num_intervals), table ] }.groupTuple().branch{ + // Use meta.num_intervals to assess number of intervals + single: it[0].num_intervals <= 1 + multiple: it[0].num_intervals > 1 + } + + // Only when using intervals + GATK4_GATHERBQSRREPORTS(table_to_merge.multiple) + + // Mix intervals and no_intervals channels together + table_bqsr = GATK4_GATHERBQSRREPORTS.out.table.mix(table_to_merge.single.map{ meta, table -> [ meta, table[0] ] }) + // Remove no longer necessary field: num_intervals + .map{ meta, table -> [ meta - meta.subMap('num_intervals'), table ] } + + // Gather versions of all tools used + versions = versions.mix(GATK4SPARK_BASERECALIBRATOR.out.versions) + versions = versions.mix(GATK4_GATHERBQSRREPORTS.out.versions) + + emit: + table_bqsr // channel: [ meta, table ] + + versions // channel: [ versions.yml ] +} diff --git a/subworkflows/local/bam_convert_samtools/main.nf
b/subworkflows/local/bam_convert_samtools/main.nf new file mode 100644 index 0000000000..5b057e45ae --- /dev/null +++ b/subworkflows/local/bam_convert_samtools/main.nf @@ -0,0 +1,76 @@ +// +// BAM/CRAM to FASTQ conversion, paired-end only +// + +include { SAMTOOLS_VIEW as SAMTOOLS_VIEW_MAP_MAP } from '../../../modules/nf-core/samtools/view/main' +include { SAMTOOLS_VIEW as SAMTOOLS_VIEW_UNMAP_UNMAP } from '../../../modules/nf-core/samtools/view/main' +include { SAMTOOLS_VIEW as SAMTOOLS_VIEW_UNMAP_MAP } from '../../../modules/nf-core/samtools/view/main' +include { SAMTOOLS_VIEW as SAMTOOLS_VIEW_MAP_UNMAP } from '../../../modules/nf-core/samtools/view/main' +include { SAMTOOLS_MERGE as SAMTOOLS_MERGE_UNMAP } from '../../../modules/nf-core/samtools/merge/main' +include { SAMTOOLS_COLLATEFASTQ as COLLATE_FASTQ_UNMAP } from '../../../modules/nf-core/samtools/collatefastq/main' +include { SAMTOOLS_COLLATEFASTQ as COLLATE_FASTQ_MAP } from '../../../modules/nf-core/samtools/collatefastq/main' +include { CAT_FASTQ } from '../../../modules/nf-core/cat/fastq/main' + +workflow BAM_CONVERT_SAMTOOLS { + take: + input // channel: [meta, alignment (BAM or CRAM), index (optional)] + fasta // optional: reference file if CRAM format and reference not in header + fasta_fai + interleaved // value: true/false + + main: + versions = Channel.empty() + + // TODO: index the input if no index is provided -> this would also require updates to the samtools view calls + + // MAP - MAP + SAMTOOLS_VIEW_MAP_MAP(input, fasta, []) + + // UNMAP - UNMAP + SAMTOOLS_VIEW_UNMAP_UNMAP(input, fasta, []) + + // UNMAP - MAP + SAMTOOLS_VIEW_UNMAP_MAP(input, fasta, []) + + // MAP - UNMAP + SAMTOOLS_VIEW_MAP_UNMAP(input, fasta, []) + + // Merge UNMAP + all_unmapped_bam = SAMTOOLS_VIEW_UNMAP_UNMAP.out.bam + .join(SAMTOOLS_VIEW_UNMAP_MAP.out.bam, failOnDuplicate: true, remainder: true) + .join(SAMTOOLS_VIEW_MAP_UNMAP.out.bam, failOnDuplicate: true, remainder: true) + .map{ meta, unmap_unmap, unmap_map, map_unmap -> [ meta, [ unmap_unmap, unmap_map, map_unmap ] ] } + + SAMTOOLS_MERGE_UNMAP(all_unmapped_bam, fasta, fasta_fai) + + // Collate & convert unmapped + COLLATE_FASTQ_UNMAP(SAMTOOLS_MERGE_UNMAP.out.bam, fasta, interleaved) + + // Collate & convert mapped + COLLATE_FASTQ_MAP(SAMTOOLS_VIEW_MAP_MAP.out.bam, fasta, interleaved) + + // Join mapped & unmapped fastq + reads_to_concat = COLLATE_FASTQ_MAP.out.fastq + .join(COLLATE_FASTQ_UNMAP.out.fastq, failOnDuplicate: true, failOnMismatch: true) + .map{ meta, mapped_reads, unmapped_reads -> [ meta, [ mapped_reads[0], mapped_reads[1], unmapped_reads[0], unmapped_reads[1] ] ] } + + // Concatenate Mapped_R1 with Unmapped_R1 and Mapped_R2 with Unmapped_R2 + CAT_FASTQ(reads_to_concat) + reads = CAT_FASTQ.out.reads + + // Gather versions of all tools used + versions = versions.mix(CAT_FASTQ.out.versions) + versions = versions.mix(COLLATE_FASTQ_MAP.out.versions) + versions = versions.mix(COLLATE_FASTQ_UNMAP.out.versions) + versions = versions.mix(SAMTOOLS_MERGE_UNMAP.out.versions) + versions = versions.mix(SAMTOOLS_VIEW_MAP_MAP.out.versions) + versions = versions.mix(SAMTOOLS_VIEW_MAP_UNMAP.out.versions) + versions = versions.mix(SAMTOOLS_VIEW_UNMAP_MAP.out.versions) + versions = versions.mix(SAMTOOLS_VIEW_UNMAP_UNMAP.out.versions) + + emit: + reads + + versions +}
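The four SAMTOOLS_VIEW aliases above split reads by the mapping state of each mate; the actual command-line arguments are configured per alias in conf/modules.config, not in this file. For orientation, a commonly used flag recipe for this split (an assumption for illustration, not a quote from this pipeline's config), where -f requires flags to be set and -F requires them to be unset:

// Hypothetical per-alias samtools view arguments
def view_args = [
    map_map    : '-b -f 1 -F 12', // paired; neither read nor mate unmapped
    unmap_unmap: '-b -f 12',      // read and mate both unmapped
    unmap_map  : '-b -f 4 -F 8',  // read unmapped, mate mapped
    map_unmap  : '-b -f 8 -F 4',  // read mapped, mate unmapped
]

diff --git a/subworkflows/local/bam_joint_calling_germline_gatk/main.nf b/subworkflows/local/bam_joint_calling_germline_gatk/main.nf new file mode 100644 index 0000000000..f0d9148c07 --- /dev/null +++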
b/subworkflows/local/bam_joint_calling_germline_gatk/main.nf @@ -0,0 +1,160 @@ +// +// JOINT GERMLINE CALLING +// +// Merge samples with GenomicsDBImport, perform joint genotyping with GenotypeGVCFs +// + +include { BCFTOOLS_SORT } from '../../../modules/nf-core/bcftools/sort/main' +include { GATK4_APPLYVQSR as GATK4_APPLYVQSR_INDEL } from '../../../modules/nf-core/gatk4/applyvqsr/main' +include { GATK4_APPLYVQSR as GATK4_APPLYVQSR_SNP } from '../../../modules/nf-core/gatk4/applyvqsr/main' +include { GATK4_GENOMICSDBIMPORT } from '../../../modules/nf-core/gatk4/genomicsdbimport/main' +include { GATK4_GENOTYPEGVCFS } from '../../../modules/nf-core/gatk4/genotypegvcfs/main' +include { GATK4_MERGEVCFS as MERGE_GENOTYPEGVCFS } from '../../../modules/nf-core/gatk4/mergevcfs/main' +include { GATK4_MERGEVCFS as MERGE_VQSR } from '../../../modules/nf-core/gatk4/mergevcfs/main' +include { GATK4_VARIANTRECALIBRATOR as VARIANTRECALIBRATOR_INDEL } from '../../../modules/nf-core/gatk4/variantrecalibrator/main' +include { GATK4_VARIANTRECALIBRATOR as VARIANTRECALIBRATOR_SNP } from '../../../modules/nf-core/gatk4/variantrecalibrator/main' + +workflow BAM_JOINT_CALLING_GERMLINE_GATK { + take: + input // channel: [ meta, [ input ], [ input_index ], intervals ] + fasta // channel: [ fasta ] + fai // channel: [ fasta_fai ] + dict // channel: [ dict ] + dbsnp + dbsnp_tbi + dbsnp_vqsr + resource_indels_vcf + resource_indels_tbi + known_indels_vqsr + resource_snps_vcf + resource_snps_tbi + known_snps_vqsr + + main: + versions = Channel.empty() + + // Map input for GenomicsDBImport + // Rework the meta, then group all samples by their interval file and restructure the channel + // Grouping by the interval file (element 3) avoids a list of metas and ensures each GenomicsDBImport call receives every sample's gVCF for one interval + gendb_input = input + .map{ meta, gvcf, tbi, intervals -> [ [ id:'joint_variant_calling', intervals_name:intervals.simpleName, num_intervals:meta.num_intervals ], gvcf, tbi, intervals ] } + .groupTuple(by:3) //join on interval file + .map{ meta_list, gvcf, tbi, intervals -> + // meta is now a list of [meta1, meta2] but they are all the same. So take the first element.
+ [ meta_list[0], gvcf, tbi, intervals, [], [] ] + } + + // Convert all sample vcfs into a genomicsdb workspace using genomicsdbimport + GATK4_GENOMICSDBIMPORT(gendb_input, false, false, false) + + genotype_input = GATK4_GENOMICSDBIMPORT.out.genomicsdb.map{ meta, genomicsdb -> [ meta, genomicsdb, [], [], [] ] } + + // Joint genotyping performed using GenotypeGVCFs + // Sort the per-interval VCFs before merging + GATK4_GENOTYPEGVCFS(genotype_input, fasta, fai, dict.map{ meta, dict -> [ dict ] }, dbsnp, dbsnp_tbi) + + BCFTOOLS_SORT(GATK4_GENOTYPEGVCFS.out.vcf) + gvcf_to_merge = BCFTOOLS_SORT.out.vcf.map{ meta, vcf -> [ meta.subMap('num_intervals') + [ id:'joint_variant_calling', patient:'all_samples', variantcaller:'haplotypecaller' ], vcf ]}.groupTuple() + + // Merge scatter/gather vcfs & index + // Rework meta for variantscalled.csv and annotation tools + MERGE_GENOTYPEGVCFS(gvcf_to_merge, dict) + + vqsr_input = MERGE_GENOTYPEGVCFS.out.vcf.join(MERGE_GENOTYPEGVCFS.out.tbi, failOnDuplicate: true) + indels_resource_label = known_indels_vqsr.mix(dbsnp_vqsr).collect() + snps_resource_label = known_snps_vqsr.mix(dbsnp_vqsr).collect() + + // Recalibrate INDELs and SNPs separately + VARIANTRECALIBRATOR_INDEL( + vqsr_input, + resource_indels_vcf, + resource_indels_tbi, + indels_resource_label, + fasta, + fai, + dict.map{ meta, dict -> [ dict ] }) + + VARIANTRECALIBRATOR_SNP( + vqsr_input, + resource_snps_vcf, + resource_snps_tbi, + snps_resource_label, + fasta, + fai, + dict.map{ meta, dict -> [ dict ] }) + + // Prepare SNPs and INDELs for ApplyVQSR + // Step 1: ApplyVQSR on the SNPs + // Step 2: use the ApplyVQSR_SNP output as input for ApplyVQSR_INDEL. This avoids duplicate entries in the vcf as described here: https://hpc.nih.gov/training/gatk_tutorial/vqsr.html + + // Join results of variant recalibration into a single channel tuple + // Rework meta for variantscalled.csv and annotation tools + vqsr_input_snp = vqsr_input.join(VARIANTRECALIBRATOR_SNP.out.recal, failOnDuplicate: true) + .join(VARIANTRECALIBRATOR_SNP.out.idx, failOnDuplicate: true) + .join(VARIANTRECALIBRATOR_SNP.out.tranches, failOnDuplicate: true) + .map{ meta, vcf, tbi, recal, index, tranche -> [ meta + [ id:'recalibrated_joint_variant_calling' ], vcf, tbi, recal, index, tranche ] } + + GATK4_APPLYVQSR_SNP( + vqsr_input_snp, + fasta, + fai, + dict.map{ meta, dict -> [ dict ] }) + + // Join results of ApplyVQSR_SNP and use them as input for the INDELs to avoid duplicate entries in the result + // Rework meta for variantscalled.csv and annotation tools + vqsr_input_indel = GATK4_APPLYVQSR_SNP.out.vcf.join(GATK4_APPLYVQSR_SNP.out.tbi).map{ meta, vcf, tbi -> [ meta + [ id:'joint_variant_calling' ], vcf, tbi ]} + .join(VARIANTRECALIBRATOR_INDEL.out.recal, failOnDuplicate: true) + .join(VARIANTRECALIBRATOR_INDEL.out.idx, failOnDuplicate: true) + .join(VARIANTRECALIBRATOR_INDEL.out.tranches, failOnDuplicate: true) + .map{ meta, vcf, tbi, recal, index, tranche -> [ meta + [ id:'recalibrated_joint_variant_calling' ], vcf, tbi, recal, index, tranche ] } + + GATK4_APPLYVQSR_INDEL( + vqsr_input_indel, + fasta, + fai, + dict.map{ meta, dict -> [ dict ] }) + + // Select the final callset: + // When both MERGE_GENOTYPEGVCFS and GATK4_APPLYVQSR have run, use the output from APPLYVQSR + // When only MERGE_GENOTYPEGVCFS has run (VQSR skipped), fall back to the output from MERGE_GENOTYPEGVCFS
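The selection below hinges on join(remainder: true) plus Groovy's Elvis operator: an empty list is falsy, so the merged VCF wins whenever VQSR produced nothing. A minimal sketch with hypothetical values:

merged = Channel.of([[id:'joint_variant_calling'], 'merged.vcf.gz'])
recal  = Channel.empty()                                     // VQSR was skipped
    .ifEmpty([[:], []])                                      // placeholder element
    .map{ meta, vcf -> [[id:'joint_variant_calling'], vcf] } // same join key as merged
merged.join(recal, remainder: true)
    .map{ meta, joint_vcf, recal_vcf -> [ meta, recal_vcf ?: joint_vcf ] }
    .view() // => [[id:joint_variant_calling], merged.vcf.gz]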
merge_vcf_for_join = MERGE_GENOTYPEGVCFS.out.vcf.map{meta, vcf -> [[id: 'joint_variant_calling'] , vcf]} + merge_tbi_for_join = MERGE_GENOTYPEGVCFS.out.tbi.map{meta, tbi -> [[id: 'joint_variant_calling'] , tbi]} + + // Remap both to have the same key; if ApplyVQSR was not run, the channel is empty --> populate with empty elements + vqsr_vcf_for_join = GATK4_APPLYVQSR_INDEL.out.vcf.ifEmpty([[:], []]).map{meta, vcf -> [[id: 'joint_variant_calling'] , vcf]} + vqsr_tbi_for_join = GATK4_APPLYVQSR_INDEL.out.tbi.ifEmpty([[:], []]).map{meta, tbi -> [[id: 'joint_variant_calling'] , tbi]} + + // Join on the meta map + // If both --> meta, vcf_merged, vcf_vqsr + // If no VQSR --> meta, vcf_merged, [] + // If the second element is empty, use the first + genotype_vcf = merge_vcf_for_join.join(vqsr_vcf_for_join, remainder: true).map{ + meta, joint_vcf, recal_vcf -> + + vcf_out = recal_vcf ?: joint_vcf + + [[id:"joint_variant_calling", patient:"all_samples", variantcaller:"haplotypecaller"], vcf_out] + } + + genotype_index = merge_tbi_for_join.join(vqsr_tbi_for_join, remainder: true).map{ + meta, joint_tbi, recal_tbi -> + + tbi_out = recal_tbi ?: joint_tbi + + [[id:"joint_variant_calling", patient:"all_samples", variantcaller:"haplotypecaller"], tbi_out] + } + + versions = versions.mix(GATK4_GENOMICSDBIMPORT.out.versions) + versions = versions.mix(GATK4_GENOTYPEGVCFS.out.versions) + versions = versions.mix(VARIANTRECALIBRATOR_INDEL.out.versions) + versions = versions.mix(VARIANTRECALIBRATOR_SNP.out.versions) + versions = versions.mix(GATK4_APPLYVQSR_INDEL.out.versions) + versions = versions.mix(GATK4_APPLYVQSR_SNP.out.versions) + + emit: + genotype_index // channel: [ val(meta), [ tbi ] ] + genotype_vcf // channel: [ val(meta), [ vcf ] ] + + versions // channel: [ versions.yml ] +} diff --git a/subworkflows/local/bam_joint_calling_germline_gatk/meta.yml b/subworkflows/local/bam_joint_calling_germline_gatk/meta.yml new file mode 100644 index 0000000000..e69de29bb2 diff --git a/subworkflows/local/bam_joint_calling_germline_sentieon/main.nf b/subworkflows/local/bam_joint_calling_germline_sentieon/main.nf new file mode 100644 index 0000000000..3f19b33d52 --- /dev/null +++ b/subworkflows/local/bam_joint_calling_germline_sentieon/main.nf @@ -0,0 +1,150 @@ +// +// JOINT GERMLINE CALLING +// +// Merge samples and perform joint genotyping with SENTIEON_GVCFTYPER +// + +include { BCFTOOLS_SORT } from '../../../modules/nf-core/bcftools/sort/main' +include { GATK4_MERGEVCFS as MERGE_GENOTYPEGVCFS } from '../../../modules/nf-core/gatk4/mergevcfs/main' +include { SENTIEON_APPLYVARCAL as SENTIEON_APPLYVARCAL_INDEL } from '../../../modules/nf-core/sentieon/applyvarcal/main' +include { SENTIEON_APPLYVARCAL as SENTIEON_APPLYVARCAL_SNP } from '../../../modules/nf-core/sentieon/applyvarcal/main' +include { SENTIEON_GVCFTYPER } from '../../../modules/nf-core/sentieon/gvcftyper/main' +include { SENTIEON_VARCAL as SENTIEON_VARCAL_INDEL } from '../../../modules/nf-core/sentieon/varcal/main' +include { SENTIEON_VARCAL as SENTIEON_VARCAL_SNP } from '../../../modules/nf-core/sentieon/varcal/main' + +workflow BAM_JOINT_CALLING_GERMLINE_SENTIEON { + take: + input // channel: [ meta, [ input ], [ input_index ], intervals ] + fasta // channel: [ fasta ] + fai // channel: [ fasta_fai ] + dict // channel: [ dict ] + dbsnp + dbsnp_tbi + dbsnp_vqsr + resource_indels_vcf + resource_indels_tbi + known_indels_vqsr + resource_snps_vcf + resource_snps_tbi + known_snps_vqsr + variant_caller + + main: + versions = Channel.empty() + + sentieon_input = input + .map{ meta, gvcf, tbi, intervals -> [ [ id:'joint_variant_calling', intervals_name:intervals.simpleName, num_intervals:meta.num_intervals ], gvcf, tbi, intervals ] } + .groupTuple(by:[0, 3])
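groupTuple(by:[0, 3]) groups on both the reworked meta and the interval file, so the per-sample gVCFs and indices are collected into lists while one tuple per interval is preserved. A sketch with hypothetical values (the real meta also carries num_intervals):

key = [id:'joint_variant_calling', intervals_name:'chr1']
Channel.of(
        [key, 's1.g.vcf.gz', 's1.g.vcf.gz.tbi', 'chr1.bed'],
        [key, 's2.g.vcf.gz', 's2.g.vcf.gz.tbi', 'chr1.bed'])
    .groupTuple(by: [0, 3])
    .view() // => [key, [s1.g.vcf.gz, s2.g.vcf.gz], [s1.g.vcf.gz.tbi, s2.g.vcf.gz.tbi], chr1.bed]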
SENTIEON_GVCFTYPER(sentieon_input, fasta, fai, dbsnp, dbsnp_tbi) + + BCFTOOLS_SORT(SENTIEON_GVCFTYPER.out.vcf_gz) + + gvcf_to_merge = BCFTOOLS_SORT.out.vcf.map{ meta, vcf -> [ meta.subMap('num_intervals') + [ id:'joint_variant_calling', patient:'all_samples', variantcaller:variant_caller ], vcf ]}.groupTuple() + + // Merge scatter/gather vcfs & index + // Rework meta for variantscalled.csv and annotation tools + MERGE_GENOTYPEGVCFS(gvcf_to_merge, dict) + + merged_vcf = MERGE_GENOTYPEGVCFS.out.vcf.map{meta, vcf -> [[id: 'joint_variant_calling'] , vcf]} + merged_tbi = MERGE_GENOTYPEGVCFS.out.tbi.map{meta, tbi -> [[id: 'joint_variant_calling'] , tbi]} + + if (variant_caller == 'sentieon_dnascope') { + // As advised by Don Freed (Sentieon), VQSR is skipped for DnaScope + genotype_vcf = merged_vcf.map{ + meta, vcf -> [ meta + [ patient:"all_samples", variantcaller:'sentieon_dnascope'], vcf ] + } + genotype_index = merged_tbi.map{ + meta, tbi -> [ meta + [ patient:"all_samples", variantcaller:'sentieon_dnascope'], tbi ] + } + } else { + vqsr_input = MERGE_GENOTYPEGVCFS.out.vcf.join(MERGE_GENOTYPEGVCFS.out.tbi, failOnDuplicate: true) + indels_resource_label = known_indels_vqsr.mix(dbsnp_vqsr).collect() + snps_resource_label = known_snps_vqsr.mix(dbsnp_vqsr).collect() + + // Recalibrate INDELs and SNPs separately + SENTIEON_VARCAL_INDEL( + vqsr_input, + resource_indels_vcf, + resource_indels_tbi, + indels_resource_label, + fasta, + fai) + + SENTIEON_VARCAL_SNP( + vqsr_input, + resource_snps_vcf, + resource_snps_tbi, + snps_resource_label, + fasta, + fai) + + // Prepare SNPs and INDELs for Sentieon's applyvarcal + // Step 1: applyvarcal on the SNPs + // Step 2: use the SENTIEON_APPLYVARCAL_SNP output as input for SENTIEON_APPLYVARCAL_INDEL. This avoids duplicate entries in the vcf as described here: https://hpc.nih.gov/training/gatk_tutorial/vqsr.html + + // Join results of variant recalibration into a single channel tuple + // Rework meta for variantscalled.csv and annotation tools + vqsr_input_snp = vqsr_input.join(SENTIEON_VARCAL_SNP.out.recal, failOnDuplicate: true) + .join(SENTIEON_VARCAL_SNP.out.idx, failOnDuplicate: true) + .join(SENTIEON_VARCAL_SNP.out.tranches, failOnDuplicate: true) + .map{ meta, vcf, tbi, recal, index, tranche -> [ meta + [ id:'recalibrated_joint_variant_calling' ], vcf, tbi, recal, index, tranche ] } + + SENTIEON_APPLYVARCAL_SNP( + vqsr_input_snp, + fasta.map{ fasta -> [ [ id:fasta.baseName ], fasta ] }, + fai.map{ fai -> [ [ id:fai.baseName ], fai ] }) + + // Join results of SENTIEON_APPLYVARCAL_SNP and use them as input for SENTIEON_APPLYVARCAL_INDEL to avoid duplicate entries in the result + // Rework meta for variantscalled.csv and annotation tools + vqsr_input_indel = SENTIEON_APPLYVARCAL_SNP.out.vcf.join(SENTIEON_APPLYVARCAL_SNP.out.tbi).map{ meta, vcf, tbi -> [ meta + [ id:'joint_variant_calling' ], vcf, tbi ]} + .join(SENTIEON_VARCAL_INDEL.out.recal, failOnDuplicate: true) + .join(SENTIEON_VARCAL_INDEL.out.idx, failOnDuplicate: true) + .join(SENTIEON_VARCAL_INDEL.out.tranches, failOnDuplicate: true) + .map{ meta, vcf, tbi, recal, index, tranche -> [ meta + [ id:'recalibrated_joint_variant_calling' ], vcf, tbi, recal, index, tranche ] } + + SENTIEON_APPLYVARCAL_INDEL( + vqsr_input_indel, + fasta.map{ fasta -> [ [ id:fasta.baseName ], fasta ] }, + fai.map{ fai -> [ [ id:fai.baseName ], fai ] })
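The SNP-then-INDEL chaining above is the channel-level form of recalibrating the same VCF in two sequential passes, which is what avoids the duplicated records a parallel run would produce. For orientation only, the equivalent two GATK command lines with hypothetical paths (Sentieon's varcal/applyvarcal pair follows the same pattern):

// gatk ApplyVQSR -V joint.vcf.gz --recal-file snps.recal --tranches-file snps.tranches -mode SNP -O snps_recal.vcf.gz
// gatk ApplyVQSR -V snps_recal.vcf.gz --recal-file indels.recal --tranches-file indels.tranches -mode INDEL -O recalibrated.vcf.gz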
// Select the final callset: + // When both MERGE_GENOTYPEGVCFS and SENTIEON_APPLYVARCAL have run, use the output from SENTIEON_APPLYVARCAL + // When only MERGE_GENOTYPEGVCFS has run (VQSR skipped), fall back to the output from MERGE_GENOTYPEGVCFS + + // Remap both to have the same key; if SENTIEON_APPLYVARCAL was not run, the channel is empty --> populate with empty elements + vqsr_vcf_for_join = SENTIEON_APPLYVARCAL_INDEL.out.vcf.ifEmpty([[:], []]).map{meta, vcf -> [[id: 'joint_variant_calling'] , vcf]} + vqsr_tbi_for_join = SENTIEON_APPLYVARCAL_INDEL.out.tbi.ifEmpty([[:], []]).map{meta, tbi -> [[id: 'joint_variant_calling'] , tbi]} + + // Join on the meta map + // If both --> meta, vcf_merged, vcf_vqsr + // If no VQSR --> meta, vcf_merged, [] + // If the second element is empty, use the first + genotype_vcf = merged_vcf.join(vqsr_vcf_for_join, remainder: true).map{ + meta, joint_vcf, recal_vcf -> + + vcf_out = recal_vcf ?: joint_vcf + + [[id:"joint_variant_calling", patient:"all_samples", variantcaller:"sentieon_haplotyper"], vcf_out] + } + + genotype_index = merged_tbi.join(vqsr_tbi_for_join, remainder: true).map{ + meta, joint_tbi, recal_tbi -> + + tbi_out = recal_tbi ?: joint_tbi + [[id:"joint_variant_calling", patient:"all_samples", variantcaller:"sentieon_haplotyper"], tbi_out] + } + + versions = versions.mix(SENTIEON_VARCAL_SNP.out.versions) + versions = versions.mix(SENTIEON_VARCAL_INDEL.out.versions) + versions = versions.mix(SENTIEON_APPLYVARCAL_SNP.out.versions) + versions = versions.mix(SENTIEON_APPLYVARCAL_INDEL.out.versions) + } + + versions = versions.mix(SENTIEON_GVCFTYPER.out.versions) + + emit: + genotype_index // channel: [ val(meta), [ tbi ] ] + genotype_vcf // channel: [ val(meta), [ vcf ] ] + + versions // channel: [ versions.yml ] +} diff --git a/subworkflows/local/bam_markduplicates/main.nf b/subworkflows/local/bam_markduplicates/main.nf new file mode 100644 index 0000000000..b1084f9ce4 --- /dev/null +++ b/subworkflows/local/bam_markduplicates/main.nf @@ -0,0 +1,43 @@ +// +// MARKDUPLICATES AND QC after mapping +// +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include { CRAM_QC_MOSDEPTH_SAMTOOLS } from '../cram_qc_mosdepth_samtools/main' +include { GATK4_MARKDUPLICATES } from '../../../modules/nf-core/gatk4/markduplicates/main' + +workflow BAM_MARKDUPLICATES { + take: + bam // channel: [mandatory] [ meta, bam ] + fasta // channel: [mandatory] [ fasta ] + fasta_fai // channel: [mandatory] [ fasta_fai ] + intervals_bed_combined // channel: [optional] [ intervals_bed ] + + main: + versions = Channel.empty() + reports = Channel.empty() + + // RUN MARKDUPLICATES + GATK4_MARKDUPLICATES(bam, fasta, fasta_fai) + + // Join with the crai file + cram = GATK4_MARKDUPLICATES.out.cram.join(GATK4_MARKDUPLICATES.out.crai, failOnDuplicate: true, failOnMismatch: true) + + // QC on CRAM + CRAM_QC_MOSDEPTH_SAMTOOLS(cram, fasta, intervals_bed_combined) + + // Gather all reports generated + reports = reports.mix(GATK4_MARKDUPLICATES.out.metrics) + reports = reports.mix(CRAM_QC_MOSDEPTH_SAMTOOLS.out.reports) + + // Gather versions of all tools used + versions = versions.mix(GATK4_MARKDUPLICATES.out.versions) + versions = versions.mix(CRAM_QC_MOSDEPTH_SAMTOOLS.out.versions) + + emit: + cram + reports + + versions // channel: [ versions.yml ] +} diff --git a/subworkflows/local/bam_markduplicates_spark/main.nf b/subworkflows/local/bam_markduplicates_spark/main.nf new file mode 100644 index 0000000000..8e7d0ee023 --- /dev/null +++ b/subworkflows/local/bam_markduplicates_spark/main.nf @@ -0,0 +1,54 @@ +// +// MARKDUPLICATES SPARK AND QC after mapping +// +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include {
CRAM_QC_MOSDEPTH_SAMTOOLS } from '../cram_qc_mosdepth_samtools/main' +include { GATK4_ESTIMATELIBRARYCOMPLEXITY } from '../../../modules/nf-core/gatk4/estimatelibrarycomplexity/main' +include { GATK4SPARK_MARKDUPLICATES } from '../../../modules/nf-core/gatk4spark/markduplicates/main' +include { SAMTOOLS_INDEX as INDEX_MARKDUPLICATES } from '../../../modules/nf-core/samtools/index/main' + +workflow BAM_MARKDUPLICATES_SPARK { + take: + bam // channel: [mandatory] meta, bam + dict // channel: [mandatory] dict + fasta // channel: [mandatory] fasta + fasta_fai // channel: [mandatory] fasta_fai + intervals_bed_combined // channel: [optional] intervals_bed + + main: + versions = Channel.empty() + reports = Channel.empty() + + // RUN MARKDUPLICATES SPARK + GATK4SPARK_MARKDUPLICATES(bam, fasta, fasta_fai, dict) + + // Index cram + INDEX_MARKDUPLICATES(GATK4SPARK_MARKDUPLICATES.out.output) + + // Join with the crai file + cram = GATK4SPARK_MARKDUPLICATES.out.output.join(INDEX_MARKDUPLICATES.out.crai, failOnDuplicate: true, failOnMismatch: true) + + // QC on CRAM + CRAM_QC_MOSDEPTH_SAMTOOLS(cram, fasta, intervals_bed_combined) + + // When running Markduplicates spark and saving reports + GATK4_ESTIMATELIBRARYCOMPLEXITY(bam, fasta, fasta_fai, dict) + + // Gather all reports generated + reports = reports.mix(GATK4_ESTIMATELIBRARYCOMPLEXITY.out.metrics) + reports = reports.mix(CRAM_QC_MOSDEPTH_SAMTOOLS.out.reports) + + // Gather versions of all tools used + versions = versions.mix(GATK4_ESTIMATELIBRARYCOMPLEXITY.out.versions) + versions = versions.mix(GATK4SPARK_MARKDUPLICATES.out.versions) + versions = versions.mix(INDEX_MARKDUPLICATES.out.versions) + versions = versions.mix(CRAM_QC_MOSDEPTH_SAMTOOLS.out.versions) + + emit: + cram + reports + + versions // channel: [ versions.yml ] +} diff --git a/subworkflows/local/bam_merge_index_samtools/main.nf b/subworkflows/local/bam_merge_index_samtools/main.nf new file mode 100644 index 0000000000..f615b1c3d2 --- /dev/null +++ b/subworkflows/local/bam_merge_index_samtools/main.nf @@ -0,0 +1,45 @@ +// +// MERGE INDEX BAM +// +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include { SAMTOOLS_INDEX as INDEX_MERGE_BAM } from '../../../modules/nf-core/samtools/index/main' +include { SAMTOOLS_MERGE as MERGE_BAM } from '../../../modules/nf-core/samtools/merge/main' + +workflow BAM_MERGE_INDEX_SAMTOOLS { + take: + bam // channel: [mandatory] meta, bam + + main: + versions = Channel.empty() + + // Figuring out if there is one or more bam(s) from the same sample + bam_to_merge = bam.branch{ meta, bam -> + // bam is a list, so use bam.size() to assess the number of intervals + single: bam.size() <= 1 + return [ meta, bam[0] ] + multiple: bam.size() > 1 + } + + // Only when using intervals + MERGE_BAM(bam_to_merge.multiple, [ [ id:'null' ], []], [ [ id:'null' ], []]) + + // Mix intervals and no_intervals channels together + bam_all = MERGE_BAM.out.bam.mix(bam_to_merge.single) + + // Index bam + INDEX_MERGE_BAM(bam_all) + + // Join with the bai file + bam_bai = bam_all.join(INDEX_MERGE_BAM.out.bai, failOnDuplicate: true, failOnMismatch: true) + + // Gather versions of all tools used + versions = versions.mix(INDEX_MERGE_BAM.out.versions) + versions = versions.mix(MERGE_BAM.out.versions) + + emit: + bam_bai + + versions +}
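The single/multiple branch above is a recurring idiom in these subworkflows: singletons are routed straight through (unwrapping the one-element list) while real groups go to the merge step. A sketch with hypothetical data:

Channel.of([[id:'s1'], ['a.bam']], [[id:'s2'], ['b.bam', 'c.bam']])
    .branch{ meta, bams ->
        single: bams.size() <= 1
            return [ meta, bams[0] ] // unwrap the singleton list
        multiple: bams.size() > 1
    }
    .set{ result }
// result.single   => [[id:s1], a.bam]
// result.multiple => [[id:s2], [b.bam, c.bam]]

diff --git a/subworkflows/local/bam_sentieon_dedup/main.nf b/subworkflows/local/bam_sentieon_dedup/main.nf new file mode 100644 index 0000000000..b75ba00cbc --- /dev/null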
+++ b/subworkflows/local/bam_sentieon_dedup/main.nf @@ -0,0 +1,45 @@ +// +// SENTIEON DEDUP +// + +include { CRAM_QC_MOSDEPTH_SAMTOOLS } from '../cram_qc_mosdepth_samtools/main' +include { SENTIEON_DEDUP } from '../../../modules/nf-core/sentieon/dedup/main' + +workflow BAM_SENTIEON_DEDUP { + take: + bam // channel: [mandatory] [ meta, bam ] // Although the channel is named "bam", it may contain CRAM files. + bai // channel: [mandatory] [ meta, bai ] + fasta // channel: [mandatory] [ fasta ] + fasta_fai // channel: [mandatory] [ fasta_fai ] + intervals_bed_combined // channel: [optional] [ intervals_bed ] + + main: + versions = Channel.empty() + reports = Channel.empty() + + bam = bam.map{ meta, bam -> [ meta - meta.subMap('data_type'), bam ] } + bai = bai.map{ meta, bai -> [ meta - meta.subMap('data_type'), bai ] } + bam_bai = bam.join(bai, failOnMismatch:true, failOnDuplicate:true) + SENTIEON_DEDUP(bam_bai, fasta.map{fa -> [[:], fa]}, fasta_fai.map{fai -> [[:], fai]}) + + // Join with the crai file + cram = SENTIEON_DEDUP.out.cram.join(SENTIEON_DEDUP.out.crai, failOnDuplicate: true, failOnMismatch: true) + + // QC on CRAM + CRAM_QC_MOSDEPTH_SAMTOOLS(cram, fasta, intervals_bed_combined) + + // Gather all reports generated + reports = reports.mix(SENTIEON_DEDUP.out.metrics) + reports = reports.mix(SENTIEON_DEDUP.out.metrics_multiqc_tsv) + reports = reports.mix(SENTIEON_DEDUP.out.score) + reports = reports.mix(CRAM_QC_MOSDEPTH_SAMTOOLS.out.reports) + + // Gather versions of all tools used + versions = versions.mix(SENTIEON_DEDUP.out.versions) + versions = versions.mix(CRAM_QC_MOSDEPTH_SAMTOOLS.out.versions) + + emit: + cram + reports + + versions // channel: [ versions.yml ] +} diff --git a/subworkflows/local/bam_variant_calling_cnvkit/main.nf b/subworkflows/local/bam_variant_calling_cnvkit/main.nf new file mode 100644 index 0000000000..161c2dc21a --- /dev/null +++ b/subworkflows/local/bam_variant_calling_cnvkit/main.nf @@ -0,0 +1,32 @@ +// +// CNVKIT calling +// +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include { CNVKIT_BATCH } from '../../../modules/nf-core/cnvkit/batch/main' +include { CNVKIT_GENEMETRICS } from '../../../modules/nf-core/cnvkit/genemetrics/main' + +workflow BAM_VARIANT_CALLING_CNVKIT { + take: + cram // channel: [mandatory] meta, cram + fasta // channel: [mandatory] meta, fasta + fasta_fai // channel: [optional] meta, fasta_fai + targets // channel: [mandatory] meta, bed + reference // channel: [optional] meta, cnn + + main: + versions = Channel.empty() + generate_pon = false + + CNVKIT_BATCH(cram, fasta, fasta_fai, targets, reference, generate_pon) + + ch_genemetrics = CNVKIT_BATCH.out.cnr.join(CNVKIT_BATCH.out.cns).map{ meta, cnr, cns -> [meta, cnr, cns[2]]} + CNVKIT_GENEMETRICS(ch_genemetrics) + + versions = versions.mix(CNVKIT_BATCH.out.versions) + versions = versions.mix(CNVKIT_GENEMETRICS.out.versions) + + emit: + versions // channel: [ versions.yml ] +} diff --git a/subworkflows/local/bam_variant_calling_deepvariant/main.nf b/subworkflows/local/bam_variant_calling_deepvariant/main.nf new file mode 100644 index 0000000000..5c25f24114 --- /dev/null +++ b/subworkflows/local/bam_variant_calling_deepvariant/main.nf @@ -0,0 +1,70 @@ +// +// DEEPVARIANT germline calling +// +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include { DEEPVARIANT } from '../../../modules/nf-core/deepvariant/main' +include { GATK4_MERGEVCFS as
MERGE_DEEPVARIANT_GVCF } from '../../../modules/nf-core/gatk4/mergevcfs/main' +include { GATK4_MERGEVCFS as MERGE_DEEPVARIANT_VCF } from '../../../modules/nf-core/gatk4/mergevcfs/main' + +// Deepvariant: https://github.com/google/deepvariant/issues/510 +workflow BAM_VARIANT_CALLING_DEEPVARIANT { + take: + cram // channel: [mandatory] [ meta, cram, crai ] + dict // channel: [optional] [ meta, dict ] + fasta // channel: [mandatory] [ fasta ] + fasta_fai // channel: [mandatory] [ fasta_fai ] + intervals // channel: [mandatory] [ intervals, num_intervals ] or [ [], 0 ] if no intervals + + main: + versions = Channel.empty() + + // Combine cram and intervals for spread and gather strategy + cram_intervals = cram.combine(intervals) + // Move num_intervals to meta map + .map{ meta, cram, crai, intervals, num_intervals -> [ meta + [ num_intervals:num_intervals ], cram, crai, intervals ]} + + DEEPVARIANT(cram_intervals, fasta.map{ fasta -> [ [ id:fasta.baseName ], fasta ] }, fasta_fai.map{ fasta_fai -> [ [ id:fasta_fai.baseName ], fasta_fai ] }, [ [ id:'null' ], [] ]) + + // Figuring out if there is one or more vcf(s) from the same sample + vcf_out = DEEPVARIANT.out.vcf.branch{ + // Use meta.num_intervals to assess the number of intervals + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 + } + + // Figuring out if there is one or more gvcf(s) from the same sample + gvcf_out = DEEPVARIANT.out.gvcf.branch{ + // Use meta.num_intervals to assess the number of intervals + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 + } + + // Only when using intervals + gvcf_to_merge = gvcf_out.intervals.map{ meta, vcf -> [ groupKey(meta, meta.num_intervals), vcf ]}.groupTuple() + vcf_to_merge = vcf_out.intervals.map{ meta, vcf -> [ groupKey(meta, meta.num_intervals), vcf ]}.groupTuple() + + MERGE_DEEPVARIANT_GVCF(gvcf_to_merge, dict) + MERGE_DEEPVARIANT_VCF(vcf_to_merge, dict) + + // Mix intervals and no_intervals channels together + gvcf = Channel.empty().mix(MERGE_DEEPVARIANT_GVCF.out.vcf, gvcf_out.no_intervals) + // add variantcaller to meta map and remove no longer necessary field: num_intervals + .map{ meta, vcf -> [ meta - meta.subMap('num_intervals') + [ variantcaller:'deepvariant' ], vcf ] } + + // Mix intervals and no_intervals channels together + vcf = Channel.empty().mix(MERGE_DEEPVARIANT_VCF.out.vcf, vcf_out.no_intervals) + // add variantcaller to meta map and remove no longer necessary field: num_intervals + .map{ meta, vcf -> [ meta - meta.subMap('num_intervals') + [ variantcaller:'deepvariant' ], vcf ] } + + versions = versions.mix(DEEPVARIANT.out.versions) + versions = versions.mix(MERGE_DEEPVARIANT_GVCF.out.versions) + versions = versions.mix(MERGE_DEEPVARIANT_VCF.out.versions) + + emit: + gvcf + vcf + + versions +}
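The meta-map bookkeeping used throughout these callers (enrich the key on the way into the scatter, tag the caller and strip the helper field on the way out) is plain Groovy map arithmetic. A self-contained sketch:

def meta = [id:'sample1']
meta = meta + [num_intervals:24]            // spread: enrich the key
meta = meta + [variantcaller:'deepvariant'] // tag the caller
meta = meta - meta.subMap('num_intervals')  // gather: drop the helper field
assert meta == [id:'sample1', variantcaller:'deepvariant']

diff --git a/subworkflows/local/bam_variant_calling_freebayes/main.nf b/subworkflows/local/bam_variant_calling_freebayes/main.nf new file mode 100644 index 0000000000..81538cd3e0 --- /dev/null +++ b/subworkflows/local/bam_variant_calling_freebayes/main.nf @@ -0,0 +1,60 @@ +// +// FREEBAYES variant calling +// +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include { BCFTOOLS_SORT } from '../../../modules/nf-core/bcftools/sort/main' +include { FREEBAYES } from '../../../modules/nf-core/freebayes/main' +include { GATK4_MERGEVCFS as MERGE_FREEBAYES } from '../../../modules/nf-core/gatk4/mergevcfs/main' +include { TABIX_TABIX as TABIX_VC_FREEBAYES } from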
'../../../modules/nf-core/tabix/tabix/main' + +workflow BAM_VARIANT_CALLING_FREEBAYES { + take: + cram // channel: [mandatory] [ meta, cram1, crai1, cram2, crai2 ] or [ meta, cram, crai, [], [] ] + dict // channel: [mandatory] [ meta, dict ] + fasta // channel: [mandatory] [ fasta ] + fasta_fai // channel: [mandatory] [ fasta_fai ] + intervals // channel: [mandatory] [ intervals, num_intervals ] or [ [], 0 ] if no intervals + + main: + versions = Channel.empty() + + // Combine cram and intervals for spread and gather strategy + cram_intervals = cram.combine(intervals) + // Move num_intervals to meta map and reorganize channel for FREEBAYES module + .map{ meta, cram1, crai1, cram2, crai2, intervals, num_intervals -> [ meta + [ num_intervals:num_intervals ], cram1, crai1, cram2, crai2, intervals ]} + + FREEBAYES(cram_intervals, fasta, fasta_fai, [], [], []) + + BCFTOOLS_SORT(FREEBAYES.out.vcf) + + // Figuring out if there is one or more vcf(s) from the same sample + bcftools_vcf_out = BCFTOOLS_SORT.out.vcf.branch{ + // Use meta.num_intervals to assess the number of intervals + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 + } + + // Only when using intervals + vcf_to_merge = bcftools_vcf_out.intervals.map{ meta, vcf -> [ groupKey(meta, meta.num_intervals), vcf ]}.groupTuple() + MERGE_FREEBAYES(vcf_to_merge, dict) + + // Only when no_intervals + TABIX_VC_FREEBAYES(bcftools_vcf_out.no_intervals) + + // Mix intervals and no_intervals channels together + vcf = MERGE_FREEBAYES.out.vcf.mix(bcftools_vcf_out.no_intervals) + // add variantcaller to meta map and remove no longer necessary field: num_intervals + .map{ meta, vcf -> [ meta - meta.subMap('num_intervals') + [ variantcaller:'freebayes' ], vcf ] } + + versions = versions.mix(BCFTOOLS_SORT.out.versions) + versions = versions.mix(MERGE_FREEBAYES.out.versions) + versions = versions.mix(FREEBAYES.out.versions) + versions = versions.mix(TABIX_VC_FREEBAYES.out.versions) + + emit: + vcf + + versions +} diff --git a/subworkflows/local/bam_variant_calling_germline_all/main.nf b/subworkflows/local/bam_variant_calling_germline_all/main.nf new file mode 100644 index 0000000000..1f751c2263 --- /dev/null +++ b/subworkflows/local/bam_variant_calling_germline_all/main.nf @@ -0,0 +1,369 @@ +// +// GERMLINE VARIANT CALLING +// + +include { BAM_JOINT_CALLING_GERMLINE_GATK } from '../bam_joint_calling_germline_gatk/main' +include { BAM_JOINT_CALLING_GERMLINE_SENTIEON } from '../bam_joint_calling_germline_sentieon/main' +include { BAM_VARIANT_CALLING_CNVKIT } from '../bam_variant_calling_cnvkit/main' +include { BAM_VARIANT_CALLING_DEEPVARIANT } from '../bam_variant_calling_deepvariant/main' +include { BAM_VARIANT_CALLING_FREEBAYES } from '../bam_variant_calling_freebayes/main' +include { BAM_VARIANT_CALLING_GERMLINE_MANTA } from '../bam_variant_calling_germline_manta/main' +include { BAM_VARIANT_CALLING_HAPLOTYPECALLER } from '../bam_variant_calling_haplotypecaller/main' +include { BAM_VARIANT_CALLING_SENTIEON_DNASCOPE } from '../bam_variant_calling_sentieon_dnascope/main' +include { BAM_VARIANT_CALLING_SENTIEON_HAPLOTYPER } from '../bam_variant_calling_sentieon_haplotyper/main' +include { BAM_VARIANT_CALLING_MPILEUP } from '../bam_variant_calling_mpileup/main' +include { BAM_VARIANT_CALLING_SINGLE_STRELKA } from '../bam_variant_calling_single_strelka/main' +include { BAM_VARIANT_CALLING_SINGLE_TIDDIT } from '../bam_variant_calling_single_tiddit/main' +include { SENTIEON_DNAMODELAPPLY } from
'../../../modules/nf-core/sentieon/dnamodelapply/main' +include { VCF_VARIANT_FILTERING_GATK } from '../vcf_variant_filtering_gatk/main' +include { VCF_VARIANT_FILTERING_GATK as SENTIEON_HAPLOTYPER_VCF_VARIANT_FILTERING_GATK } from '../vcf_variant_filtering_gatk/main' + + + +workflow BAM_VARIANT_CALLING_GERMLINE_ALL { + take: + tools // Mandatory, list of tools to apply + skip_tools // Mandatory, list of tools to skip + cram // channel: [mandatory] cram + bwa // channel: [mandatory] bwa + dbsnp // channel: [mandatory] dbsnp + dbsnp_tbi // channel: [mandatory] dbsnp_tbi + dbsnp_vqsr + dict // channel: [mandatory] dict + fasta // channel: [mandatory] fasta + fasta_fai // channel: [mandatory] fasta_fai + intervals // channel: [mandatory] [ intervals, num_intervals ] or [ [], 0 ] if no intervals + intervals_bed_combined // channel: [mandatory] intervals/target regions in one file unzipped + intervals_bed_gz_tbi_combined // channel: [mandatory] intervals/target regions in one file zipped + intervals_bed_combined_haplotypec // channel: [mandatory] intervals/target regions in one file unzipped, no_intervals.bed if no_intervals + intervals_bed_gz_tbi // channel: [mandatory] [ interval.bed.gz, interval.bed.gz.tbi, num_intervals ] or [ [], [], 0 ] if no intervals + known_indels_vqsr + known_sites_indels + known_sites_indels_tbi + known_sites_snps + known_sites_snps_tbi + known_snps_vqsr + joint_germline // boolean: [mandatory] [default: false] joint calling of germline variants + skip_haplotypecaller_filter // boolean: [mandatory] [default: false] whether to filter haplotypecaller single sample vcfs + sentieon_haplotyper_emit_mode // channel: [mandatory] value channel with string + sentieon_dnascope_emit_mode // channel: [mandatory] value channel with string + sentieon_dnascope_pcr_indel_model // channel: [mandatory] value channel with string + sentieon_dnascope_model // channel: [mandatory] value channel with string + + main: + versions = Channel.empty() + + //TODO: Temporary until the if's can be removed and printing to terminal is prevented with "when" in the modules.config + gvcf_sentieon_dnascope = Channel.empty() + gvcf_sentieon_haplotyper = Channel.empty() + + vcf_deepvariant = Channel.empty() + vcf_freebayes = Channel.empty() + vcf_haplotypecaller = Channel.empty() + vcf_manta = Channel.empty() + vcf_mpileup = Channel.empty() + vcf_sentieon_dnascope = Channel.empty() + vcf_sentieon_haplotyper = Channel.empty() + vcf_strelka = Channel.empty() + vcf_tiddit = Channel.empty() + + // BCFTOOLS MPILEUP + if (tools.split(',').contains('mpileup')) { + BAM_VARIANT_CALLING_MPILEUP( + cram, + dict, + fasta, + intervals + ) + vcf_mpileup = BAM_VARIANT_CALLING_MPILEUP.out.vcf + versions = versions.mix(BAM_VARIANT_CALLING_MPILEUP.out.versions) + } + + // CNVKIT + if (tools.split(',').contains('cnvkit')) { + BAM_VARIANT_CALLING_CNVKIT( + // Remap channel to match module/subworkflow + cram.map{ meta, cram, crai -> [ meta, [], cram ] }, + fasta.map{ it -> [[id:it[0].baseName], it] }, + fasta_fai.map{ it -> [[id:it[0].baseName], it] }, + intervals_bed_combined.map{ it -> [[id:it[0].baseName], it] }, + [[id:"null"], []] + ) + versions = versions.mix(BAM_VARIANT_CALLING_CNVKIT.out.versions) + } + + // DEEPVARIANT + if (tools.split(',').contains('deepvariant')) { + BAM_VARIANT_CALLING_DEEPVARIANT( + cram, + dict, + fasta, + fasta_fai, + intervals + ) + + vcf_deepvariant = BAM_VARIANT_CALLING_DEEPVARIANT.out.vcf + versions = versions.mix(BAM_VARIANT_CALLING_DEEPVARIANT.out.versions) + } + + // FREEBAYES + if 
(tools.split(',').contains('freebayes')) { + // Input channel is remapped to match input of module/subworkflow + BAM_VARIANT_CALLING_FREEBAYES( + // Remap channel to match module/subworkflow + cram.map{ meta, cram, crai -> [ meta, cram, crai, [], [] ] }, + dict, + fasta, + fasta_fai, + intervals + ) + + vcf_freebayes = BAM_VARIANT_CALLING_FREEBAYES.out.vcf + versions = versions.mix(BAM_VARIANT_CALLING_FREEBAYES.out.versions) + } + + // HAPLOTYPECALLER + if (tools.split(',').contains('haplotypecaller')) { + BAM_VARIANT_CALLING_HAPLOTYPECALLER( + cram, + fasta, + fasta_fai, + dict, + dbsnp, + dbsnp_tbi, + dbsnp_vqsr, + intervals) + + vcf_haplotypecaller = BAM_VARIANT_CALLING_HAPLOTYPECALLER.out.vcf + tbi_haplotypecaller = BAM_VARIANT_CALLING_HAPLOTYPECALLER.out.tbi + + versions = versions.mix(BAM_VARIANT_CALLING_HAPLOTYPECALLER.out.versions) + + if (joint_germline) { + BAM_JOINT_CALLING_GERMLINE_GATK( + BAM_VARIANT_CALLING_HAPLOTYPECALLER.out.gvcf_tbi_intervals, + fasta, + fasta_fai, + dict, + dbsnp, + dbsnp_tbi, + dbsnp_vqsr, + known_sites_indels, + known_sites_indels_tbi, + known_indels_vqsr, + known_sites_snps, + known_sites_snps_tbi, + known_snps_vqsr) + + vcf_haplotypecaller = BAM_JOINT_CALLING_GERMLINE_GATK.out.genotype_vcf + versions = versions.mix(BAM_JOINT_CALLING_GERMLINE_GATK.out.versions) + } else { + + // If single sample track, check if filtering should be done + if (!skip_haplotypecaller_filter) { + + VCF_VARIANT_FILTERING_GATK( + vcf_haplotypecaller.join(tbi_haplotypecaller, failOnDuplicate: true, failOnMismatch: true), + fasta, + fasta_fai, + dict.map{ meta, dict -> [ dict ] }, + intervals_bed_combined_haplotypec, + known_sites_indels.concat(known_sites_snps).flatten().unique().collect(), + known_sites_indels_tbi.concat(known_sites_snps_tbi).flatten().unique().collect()) + + vcf_haplotypecaller = VCF_VARIANT_FILTERING_GATK.out.filtered_vcf + + versions = versions.mix(VCF_VARIANT_FILTERING_GATK.out.versions) + } + } + } + + // MANTA + if (tools.split(',').contains('manta')) { + BAM_VARIANT_CALLING_GERMLINE_MANTA ( + cram, + fasta.map{ it -> [ [ id:'fasta' ], it ] }, + fasta_fai.map{ it -> [ [ id:'fasta_fai' ], it ] }, + intervals_bed_gz_tbi_combined + ) + + vcf_manta = BAM_VARIANT_CALLING_GERMLINE_MANTA.out.vcf + versions = versions.mix(BAM_VARIANT_CALLING_GERMLINE_MANTA.out.versions) + } + + // SENTIEON DNASCOPE + if (tools.split(',').contains('sentieon_dnascope')) { + BAM_VARIANT_CALLING_SENTIEON_DNASCOPE( + cram, + fasta, + fasta_fai, + dict, + dbsnp, + dbsnp_tbi, + dbsnp_vqsr, + intervals, + joint_germline, + sentieon_dnascope_emit_mode, + sentieon_dnascope_pcr_indel_model, + sentieon_dnascope_model) + + versions = versions.mix(BAM_VARIANT_CALLING_SENTIEON_DNASCOPE.out.versions) + + vcf_sentieon_dnascope = BAM_VARIANT_CALLING_SENTIEON_DNASCOPE.out.vcf + vcf_tbi_sentieon_dnascope = BAM_VARIANT_CALLING_SENTIEON_DNASCOPE.out.vcf_tbi + gvcf_sentieon_dnascope = BAM_VARIANT_CALLING_SENTIEON_DNASCOPE.out.gvcf + gvcf_tbi_sentieon_dnascope = BAM_VARIANT_CALLING_SENTIEON_DNASCOPE.out.gvcf_tbi + + if (joint_germline) { + BAM_JOINT_CALLING_GERMLINE_SENTIEON( + BAM_VARIANT_CALLING_SENTIEON_DNASCOPE.out.genotype_intervals, + fasta, + fasta_fai, + dict, + dbsnp, + dbsnp_tbi, + dbsnp_vqsr, + known_sites_indels, + known_sites_indels_tbi, + known_indels_vqsr, + known_sites_snps, + known_sites_snps_tbi, + known_snps_vqsr, + 'sentieon_dnascope') + + vcf_sentieon_dnascope = BAM_JOINT_CALLING_GERMLINE_SENTIEON.out.genotype_vcf + versions = 
versions.mix(BAM_JOINT_CALLING_GERMLINE_SENTIEON.out.versions) + } else { + // If single sample track, check if filtering should be done + if (!(skip_tools && skip_tools.split(',').contains('dnascope_filter'))) { + + SENTIEON_DNAMODELAPPLY( + vcf_sentieon_dnascope.join(vcf_tbi_sentieon_dnascope, failOnDuplicate: true, failOnMismatch: true), + fasta.map{ fasta -> [ [ id:fasta.baseName ], fasta ] }, + fasta_fai.map{ fai -> [ [ id:fai.baseName ], fai ] }, + sentieon_dnascope_model.map{ model -> [ [ id:model.baseName ], model ] }) + + vcf_sentieon_dnascope = SENTIEON_DNAMODELAPPLY.out.vcf + versions = versions.mix(SENTIEON_DNAMODELAPPLY.out.versions) + + } + + } + } + + // SENTIEON HAPLOTYPER + if (tools.split(',').contains('sentieon_haplotyper')) { + BAM_VARIANT_CALLING_SENTIEON_HAPLOTYPER( + cram, + fasta, + fasta_fai, + dict, + dbsnp, + dbsnp_tbi, + dbsnp_vqsr, + intervals, + joint_germline, + sentieon_haplotyper_emit_mode) + + versions = versions.mix(BAM_VARIANT_CALLING_SENTIEON_HAPLOTYPER.out.versions) + + vcf_sentieon_haplotyper = BAM_VARIANT_CALLING_SENTIEON_HAPLOTYPER.out.vcf + vcf_tbi_sentieon_haplotyper = BAM_VARIANT_CALLING_SENTIEON_HAPLOTYPER.out.vcf_tbi + gvcf_sentieon_haplotyper = BAM_VARIANT_CALLING_SENTIEON_HAPLOTYPER.out.gvcf + gvcf_tbi_sentieon_haplotyper = BAM_VARIANT_CALLING_SENTIEON_HAPLOTYPER.out.gvcf_tbi + + if (joint_germline) { + BAM_JOINT_CALLING_GERMLINE_SENTIEON( + BAM_VARIANT_CALLING_SENTIEON_HAPLOTYPER.out.genotype_intervals, + fasta, + fasta_fai, + dict, + dbsnp, + dbsnp_tbi, + dbsnp_vqsr, + known_sites_indels, + known_sites_indels_tbi, + known_indels_vqsr, + known_sites_snps, + known_sites_snps_tbi, + known_snps_vqsr, + 'sentieon_haplotyper') + + vcf_sentieon_haplotyper = BAM_JOINT_CALLING_GERMLINE_SENTIEON.out.genotype_vcf + versions = versions.mix(BAM_JOINT_CALLING_GERMLINE_SENTIEON.out.versions) + } else { + + // If single sample track, check if filtering should be done + if (!(skip_tools && skip_tools.split(',').contains('haplotyper_filter'))) { + + SENTIEON_HAPLOTYPER_VCF_VARIANT_FILTERING_GATK( + vcf_sentieon_haplotyper.join(vcf_tbi_sentieon_haplotyper, failOnDuplicate: true, failOnMismatch: true), + fasta, + fasta_fai, + dict.map{ meta, dict -> [ dict ] }, + intervals_bed_combined_haplotypec, + known_sites_indels.concat(known_sites_snps).flatten().unique().collect(), + known_sites_indels_tbi.concat(known_sites_snps_tbi).flatten().unique().collect()) + + vcf_sentieon_haplotyper = SENTIEON_HAPLOTYPER_VCF_VARIANT_FILTERING_GATK.out.filtered_vcf + + versions = versions.mix(SENTIEON_HAPLOTYPER_VCF_VARIANT_FILTERING_GATK.out.versions) + } + } + } + + // STRELKA + if (tools.split(',').contains('strelka')) { + BAM_VARIANT_CALLING_SINGLE_STRELKA( + cram, + dict, + fasta, + fasta_fai, + intervals_bed_gz_tbi + ) + + vcf_strelka = BAM_VARIANT_CALLING_SINGLE_STRELKA.out.vcf + versions = versions.mix(BAM_VARIANT_CALLING_SINGLE_STRELKA.out.versions) + } + + // TIDDIT + if (tools.split(',').contains('tiddit')) { + BAM_VARIANT_CALLING_SINGLE_TIDDIT( + cram, + // Remap channel to match module/subworkflow + fasta.map{ it -> [ [ id:'fasta' ], it ] }, + bwa + ) + + vcf_tiddit = BAM_VARIANT_CALLING_SINGLE_TIDDIT.out.vcf + versions = versions.mix(BAM_VARIANT_CALLING_SINGLE_TIDDIT.out.versions) + } + + vcf_all = Channel.empty().mix( + vcf_deepvariant, + vcf_freebayes, + vcf_sentieon_dnascope, + vcf_haplotypecaller, + vcf_manta, + vcf_mpileup, + vcf_sentieon_haplotyper, + vcf_strelka, + vcf_tiddit + ) + + emit: + gvcf_sentieon_dnascope + gvcf_sentieon_haplotyper + vcf_all + 
vcf_deepvariant + vcf_freebayes + vcf_haplotypecaller + vcf_manta + vcf_mpileup + vcf_strelka + vcf_sentieon_dnascope + vcf_sentieon_haplotyper + vcf_tiddit + + versions +} diff --git a/subworkflows/local/bam_variant_calling_germline_manta/main.nf b/subworkflows/local/bam_variant_calling_germline_manta/main.nf new file mode 100644 index 0000000000..d27a999a68 --- /dev/null +++ b/subworkflows/local/bam_variant_calling_germline_manta/main.nf @@ -0,0 +1,44 @@ +// +// Manta germline variant calling +// +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include { MANTA_GERMLINE } from '../../../modules/nf-core/manta/germline/main' + +// This mirrors the consensus implementation used by the upstream nf-core modules +workflow BAM_VARIANT_CALLING_GERMLINE_MANTA { + take: + cram // channel: [mandatory] [ meta, cram, crai ] + fasta // channel: [mandatory] [ meta, fasta ] + fasta_fai // channel: [mandatory] [ meta, fasta_fai ] + intervals // channel: [mandatory] [ interval.bed.gz, interval.bed.gz.tbi] or [ [], []] if no intervals; intervals file contains all intervals + + main: + versions = Channel.empty() + + // Combine cram and intervals, account for 0 intervals + cram_intervals = cram.combine(intervals).map{ it -> + bed_gz = it.size() > 3 ? it[3] : [] + bed_tbi = it.size() > 3 ? it[4] : [] + + [it[0], it[1], it[2], bed_gz, bed_tbi] + } + + MANTA_GERMLINE(cram_intervals, fasta, fasta_fai, []) + + small_indels_vcf = MANTA_GERMLINE.out.candidate_small_indels_vcf + sv_vcf = MANTA_GERMLINE.out.candidate_sv_vcf + diploid_sv_vcf = MANTA_GERMLINE.out.diploid_sv_vcf + + // Only diploid SV should get annotated + // add variantcaller to meta map + vcf = diploid_sv_vcf.map{ meta, vcf -> [ meta + [ variantcaller:'manta' ], vcf ] } + + versions = versions.mix(MANTA_GERMLINE.out.versions) + + emit: + vcf + + versions +}
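The size() > 3 guard in the Manta subworkflow keeps the tuple shape stable whether or not a BED file came through the combine. In plain Groovy, assuming the no-intervals case yields a three-element tuple:

def pad = { tuple ->
    def bed_gz  = tuple.size() > 3 ? tuple[3] : []
    def bed_tbi = tuple.size() > 3 ? tuple[4] : []
    [ tuple[0], tuple[1], tuple[2], bed_gz, bed_tbi ]
}
assert pad([ [id:'s1'], 'a.cram', 'a.crai' ]).size() == 5
assert pad([ [id:'s1'], 'a.cram', 'a.crai', 'x.bed.gz', 'x.bed.gz.tbi' ])[3] == 'x.bed.gz'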
diff --git a/subworkflows/local/bam_variant_calling_haplotypecaller/main.nf b/subworkflows/local/bam_variant_calling_haplotypecaller/main.nf new file mode 100644 index 0000000000..1dbef4c613 --- /dev/null +++ b/subworkflows/local/bam_variant_calling_haplotypecaller/main.nf @@ -0,0 +1,103 @@ +// +// GATK4 HAPLOTYPECALLER germline variant calling +// +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include { BAM_MERGE_INDEX_SAMTOOLS } from '../bam_merge_index_samtools/main' +include { GATK4_HAPLOTYPECALLER } from '../../../modules/nf-core/gatk4/haplotypecaller/main' +include { GATK4_MERGEVCFS as MERGE_HAPLOTYPECALLER } from '../../../modules/nf-core/gatk4/mergevcfs/main' + +workflow BAM_VARIANT_CALLING_HAPLOTYPECALLER { + take: + cram // channel: [mandatory] [ meta, cram, crai, interval.bed ] + fasta // channel: [mandatory] + fasta_fai // channel: [mandatory] + dict // channel: [mandatory] + dbsnp // channel: [optional] + dbsnp_tbi // channel: [optional] + dbsnp_vqsr // channel: [optional] + intervals // channel: [mandatory] [ intervals, num_intervals ] or [ [], 0 ] if no intervals + + main: + versions = Channel.empty() + + vcf = Channel.empty() + realigned_bam = Channel.empty() + + // Combine cram and intervals for spread and gather strategy + cram_intervals = cram.combine(intervals) + // Move num_intervals to meta map + // Add interval_name to allow correct merging with interval files + .map{ meta, cram, crai, intervals, num_intervals -> [ meta + [ interval_name:intervals.simpleName, num_intervals:num_intervals, variantcaller:'haplotypecaller' ], cram, crai, intervals, [] ] } + + GATK4_HAPLOTYPECALLER(cram_intervals, fasta, fasta_fai, dict.map{ meta, dict -> [ dict ] }, dbsnp, dbsnp_tbi) + + // For joint genotyping + gvcf_tbi_intervals = GATK4_HAPLOTYPECALLER.out.vcf + .join(GATK4_HAPLOTYPECALLER.out.tbi, failOnMismatch: true) + .join(cram_intervals, failOnMismatch: true) + .map{ meta, gvcf, tbi, cram, crai, intervals, dragstr_model -> [ meta, gvcf, tbi, intervals ] } + + // Figuring out if there is one or more vcf(s) from the same sample + haplotypecaller_vcf = GATK4_HAPLOTYPECALLER.out.vcf.map{ + meta, vcf -> [ meta - meta.subMap('interval_name'), vcf] + } + .branch{ + // Use meta.num_intervals to assess the number of intervals + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 + } + + // Figuring out if there is one or more tbi(s) from the same sample + haplotypecaller_tbi = GATK4_HAPLOTYPECALLER.out.tbi.map{ + meta, tbi -> [ meta - meta.subMap('interval_name'), tbi] + }.branch{ + // Use meta.num_intervals to assess the number of intervals + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 + } + + // Figuring out if there is one or more bam(s) from the same sample + haplotypecaller_bam = GATK4_HAPLOTYPECALLER.out.bam.map{ + meta, bam -> [ meta - meta.subMap('interval_name'), bam] + }.branch{ + // Use meta.num_intervals to assess the number of intervals + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 + } + + // Only when using intervals + MERGE_HAPLOTYPECALLER(haplotypecaller_vcf.intervals.map{ meta, vcf -> [ groupKey(meta, meta.num_intervals), vcf ] }.groupTuple(), dict) + + haplotypecaller_vcf = Channel.empty().mix( + MERGE_HAPLOTYPECALLER.out.vcf, + haplotypecaller_vcf.no_intervals) + + haplotypecaller_tbi = Channel.empty().mix( + MERGE_HAPLOTYPECALLER.out.tbi, + haplotypecaller_tbi.no_intervals) + + // BAM output + BAM_MERGE_INDEX_SAMTOOLS(haplotypecaller_bam.intervals + .map{ meta, bam -> [ groupKey(meta, meta.num_intervals), bam ] } + .groupTuple() + .mix(haplotypecaller_bam.no_intervals)) + + realigned_bam = BAM_MERGE_INDEX_SAMTOOLS.out.bam_bai + + versions = versions.mix(GATK4_HAPLOTYPECALLER.out.versions) + versions = versions.mix(MERGE_HAPLOTYPECALLER.out.versions) + + // Remove no longer necessary field: num_intervals + vcf = haplotypecaller_vcf.map{ meta, vcf -> [ meta - meta.subMap('num_intervals'), vcf ] } + tbi = haplotypecaller_tbi.map{ meta, tbi -> [ meta - meta.subMap('num_intervals'), tbi ] } + + emit: + gvcf_tbi_intervals // For joint genotyping + realigned_bam // Optional + vcf // vcf + tbi // tbi + + versions +}
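Rebuilding the [gvcf, tbi, intervals] tuple for joint genotyping works because interval_name was folded into the meta map, which turns every per-interval emission into a unique join key. A sketch with hypothetical values:

vcfs = Channel.of([ [id:'s1', interval_name:'chr1'], 'chr1.g.vcf.gz' ])
tbis = Channel.of([ [id:'s1', interval_name:'chr1'], 'chr1.g.vcf.gz.tbi' ])
vcfs.join(tbis, failOnMismatch: true)
    .view() // => [[id:s1, interval_name:chr1], chr1.g.vcf.gz, chr1.g.vcf.gz.tbi]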
diff --git a/subworkflows/local/bam_variant_calling_mpileup/main.nf b/subworkflows/local/bam_variant_calling_mpileup/main.nf new file mode 100644 index 0000000000..663ed6a0bc --- /dev/null +++ b/subworkflows/local/bam_variant_calling_mpileup/main.nf @@ -0,0 +1,74 @@ +// +// MPILEUP variant calling: BCFTOOLS for variant calling, SAMtools for Control-FREEC input +// +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include { BCFTOOLS_MPILEUP } from '../../../modules/nf-core/bcftools/mpileup/main' +include { CAT_CAT as CAT_MPILEUP } from '../../../modules/nf-core/cat/cat/main' +include { GATK4_MERGEVCFS as MERGE_BCFTOOLS_MPILEUP } from '../../../modules/nf-core/gatk4/mergevcfs/main' +include { SAMTOOLS_MPILEUP } from '../../../modules/nf-core/samtools/mpileup/main' + +workflow BAM_VARIANT_CALLING_MPILEUP { + take: + cram // channel: [mandatory] [ meta, cram, crai ] + dict // channel: [mandatory] [ meta, dict ] + fasta // channel: [mandatory] [ fasta ] + intervals // channel: [mandatory] [ intervals, num_intervals ] or [ [], 0 ] if no intervals + + main: + versions = Channel.empty() + + // Combine cram and intervals for spread and gather strategy + cram_intervals = cram.combine(intervals) + // Move num_intervals to meta map and reorganize channel for BCFTOOLS_MPILEUP/SAMTOOLS_MPILEUP modules + .map{ meta, cram, crai, intervals, num_intervals -> [ meta + [ num_intervals:num_intervals ], cram, intervals ] } + + // Only run if --tools contains mpileup + keep_bcftools_mpileup = false + BCFTOOLS_MPILEUP(cram_intervals, fasta.map{ it -> [[id:it[0].baseName], it] }, keep_bcftools_mpileup) + + // Only run if --tools contains controlfreec + SAMTOOLS_MPILEUP(cram_intervals, fasta) + + // Figuring out if there is one or more vcf(s) from the same sample + vcf_mpileup = BCFTOOLS_MPILEUP.out.vcf.branch{ + // Use meta.num_intervals to assess the number of intervals + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 + } + + // Figuring out if there is one or more mpileup(s) from the same sample + mpileup_samtools = SAMTOOLS_MPILEUP.out.mpileup.branch{ + // Use meta.num_intervals to assess the number of intervals + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 + } + + // Merge the mpileup files and sort them in natural order + mpileup_to_merge = mpileup_samtools.intervals.map{ meta, pileup -> [ groupKey(meta, meta.num_intervals), pileup ] }.groupTuple(sort:true) + CAT_MPILEUP(mpileup_to_merge) + + // Merge VCF + vcf_to_merge = vcf_mpileup.intervals.map{ meta, vcf -> [ groupKey(meta, meta.num_intervals), vcf ] }.groupTuple() + MERGE_BCFTOOLS_MPILEUP(vcf_to_merge, dict) + + // Mix intervals and no_intervals channels together + mpileup = CAT_MPILEUP.out.file_out.mix(mpileup_samtools.no_intervals) + // add variantcaller to meta map and remove no longer necessary field: num_intervals + .map{ meta, mpileup -> [ meta - meta.subMap('num_intervals') + [ variantcaller:'samtools' ], mpileup ] } + vcf = MERGE_BCFTOOLS_MPILEUP.out.vcf.mix(vcf_mpileup.no_intervals) + // add variantcaller to meta map and remove no longer necessary field: num_intervals + .map{ meta, vcf -> [ meta - meta.subMap('num_intervals') + [ variantcaller:'bcftools' ], vcf ] } + + versions = versions.mix(SAMTOOLS_MPILEUP.out.versions) + versions = versions.mix(BCFTOOLS_MPILEUP.out.versions) + versions = versions.mix(CAT_MPILEUP.out.versions) + versions = versions.mix(MERGE_BCFTOOLS_MPILEUP.out.versions) + + emit: + mpileup + vcf + + versions +}
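CAT_MPILEUP depends on the pileup chunks arriving in a deterministic order, hence groupTuple(sort: true), which sorts each collected list by the items' natural order. A sketch with hypothetical file names:

key = groupKey([id:'s1'], 2)
Channel.of([key, 'chr2.mpileup'], [key, 'chr1.mpileup'])
    .groupTuple(sort: true)
    .view() // => [[id:s1], [chr1.mpileup, chr2.mpileup]]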
diff --git a/subworkflows/local/bam_variant_calling_sentieon_dnascope/main.nf b/subworkflows/local/bam_variant_calling_sentieon_dnascope/main.nf new file mode 100644 index 0000000000..9eea9b2d61 --- /dev/null +++ b/subworkflows/local/bam_variant_calling_sentieon_dnascope/main.nf @@ -0,0 +1,157 @@ +// +// SENTIEON DNASCOPE germline variant calling +// +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include { GATK4_MERGEVCFS as MERGE_SENTIEON_DNASCOPE_GVCFS } from '../../../modules/nf-core/gatk4/mergevcfs/main' +include { GATK4_MERGEVCFS as MERGE_SENTIEON_DNASCOPE_VCFS } from '../../../modules/nf-core/gatk4/mergevcfs/main' +include { SENTIEON_DNASCOPE } from '../../../modules/nf-core/sentieon/dnascope/main' + +workflow BAM_VARIANT_CALLING_SENTIEON_DNASCOPE { + take: + cram // channel: [mandatory] [ meta, cram, crai, interval.bed ] + fasta // channel: [mandatory] + fasta_fai // channel: [mandatory] + dict // channel: [mandatory] + dbsnp // channel: [optional] + dbsnp_tbi // channel: [optional] + dbsnp_vqsr // channel: [optional] + intervals // channel: [mandatory] [ intervals, num_intervals ] or [ [], 0 ] if no intervals + joint_germline // boolean: [mandatory] [default: false] joint calling of germline variants + sentieon_dnascope_emit_mode // string + sentieon_dnascope_pcr_indel_model // string + sentieon_dnascope_model // channel + + main: + versions = Channel.empty() + + gvcf = Channel.empty() + vcf = Channel.empty() + genotype_intervals = Channel.empty() + + // Combine cram and intervals for spread and gather strategy + cram_intervals_for_sentieon = cram.combine(intervals) + // Move num_intervals to meta map + .map{ meta, cram, crai, intervals, num_intervals -> [ + meta + [ + num_intervals:num_intervals, + intervals_name:intervals.simpleName, + variantcaller:'sentieon_dnascope'], + cram, + crai, + intervals + ] + } + + emit_mode_items = sentieon_dnascope_emit_mode.split(',').collect{ it -> it.toLowerCase().trim() } + lst = emit_mode_items - 'gvcf' + emit_vcf = lst.size() > 0 ? lst[0] : '' + + SENTIEON_DNASCOPE( + cram_intervals_for_sentieon, + fasta.map{it -> [[:], it]}, + fasta_fai.map{it -> [[:], it]}, + dbsnp.map{it -> [[:], it]}, + dbsnp_tbi.map{it -> [[:], it]}, + sentieon_dnascope_model.map{it -> [[:], it]}, + sentieon_dnascope_pcr_indel_model, + emit_vcf, + emit_mode_items.any{ it.equals('gvcf') }) + + if (joint_germline) { + genotype_intervals = SENTIEON_DNASCOPE.out.gvcf + .join(SENTIEON_DNASCOPE.out.gvcf_tbi, failOnMismatch: true) + .join(cram_intervals_for_sentieon, failOnMismatch: true) + .map{ meta, gvcf, tbi, cram, crai, intervals -> [ meta, gvcf, tbi, intervals ] } + } + + // Figure out if using intervals or no_intervals + dnascope_vcf_branch = SENTIEON_DNASCOPE.out.vcf.map{ + meta, vcf -> [ meta - meta.subMap('interval_name'), vcf] + } + .branch{ + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 + } + + dnascope_vcf_tbi_branch = SENTIEON_DNASCOPE.out.vcf_tbi.map{ + meta, vcf_tbi -> [ meta - meta.subMap('interval_name'), vcf_tbi] + } + .branch{ + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 + } + + haplotyper_gvcf_branch = SENTIEON_DNASCOPE.out.gvcf.map{ + meta, gvcf -> [ meta - meta.subMap('interval_name'), gvcf] + } + .branch{ + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 + } + + haplotyper_gvcf_tbi_branch = SENTIEON_DNASCOPE.out.gvcf_tbi.map{ + meta, gvcf_tbi -> [ meta - meta.subMap('interval_name'), gvcf_tbi] + } + .branch{ + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 + } + + vcfs_for_merging = dnascope_vcf_branch.intervals.map{ + meta, vcf -> [ groupKey(meta, meta.num_intervals), vcf ]} + + vcfs_for_merging = vcfs_for_merging.map{ + meta, vcf -> [ + meta - meta.subMap('intervals_name'), + vcf]}.groupTuple() + + // VCFs + // Only when using intervals + MERGE_SENTIEON_DNASCOPE_VCFS(vcfs_for_merging, dict) + + dnascope_vcf = Channel.empty().mix( + MERGE_SENTIEON_DNASCOPE_VCFS.out.vcf, + dnascope_vcf_branch.no_intervals) + + haplotyper_tbi = Channel.empty().mix( + MERGE_SENTIEON_DNASCOPE_VCFS.out.tbi, + dnascope_vcf_tbi_branch.no_intervals) + + // Remove no longer necessary field: num_intervals + vcf = dnascope_vcf.map{ meta, vcf -> [ meta - meta.subMap('num_intervals'), vcf ] } + vcf_tbi = haplotyper_tbi.map{ meta, tbi -> [ meta - meta.subMap('num_intervals'),
+
+    // GVCFs
+    // Only when using intervals
+    gvcfs_for_merging = dnascope_gvcf_branch.intervals.map{
+        meta, vcf -> [ groupKey(meta, meta.num_intervals), vcf ] }
+
+    gvcfs_for_merging = gvcfs_for_merging.map{
+        meta, vcf -> [ meta - meta.subMap('intervals_name'), vcf ]
+    }.groupTuple()
+
+    MERGE_SENTIEON_DNASCOPE_GVCFS(gvcfs_for_merging, dict)
+
+    gvcf = Channel.empty().mix(
+        MERGE_SENTIEON_DNASCOPE_GVCFS.out.vcf,
+        dnascope_gvcf_branch.no_intervals)
+
+    gvcf_tbi = Channel.empty().mix(
+        MERGE_SENTIEON_DNASCOPE_GVCFS.out.tbi,
+        dnascope_gvcf_tbi_branch.no_intervals)
+
+    versions = versions.mix(SENTIEON_DNASCOPE.out.versions)
+    versions = versions.mix(MERGE_SENTIEON_DNASCOPE_VCFS.out.versions)
+    versions = versions.mix(MERGE_SENTIEON_DNASCOPE_GVCFS.out.versions)
+
+    emit:
+    versions
+    vcf
+    vcf_tbi
+    gvcf
+    gvcf_tbi
+    genotype_intervals // For joint genotyping
+
+}
diff --git a/subworkflows/local/bam_variant_calling_sentieon_haplotyper/main.nf b/subworkflows/local/bam_variant_calling_sentieon_haplotyper/main.nf
new file mode 100644
index 0000000000..4b280d271c
--- /dev/null
+++ b/subworkflows/local/bam_variant_calling_sentieon_haplotyper/main.nf
@@ -0,0 +1,154 @@
+//
+// SENTIEON HAPLOTYPER germline variant calling
+//
+// For all modules here:
+// A when clause condition is defined in the conf/modules.config to determine if the module should be run
+
+include { GATK4_MERGEVCFS as MERGE_SENTIEON_HAPLOTYPER_GVCFS } from '../../../modules/nf-core/gatk4/mergevcfs/main'
+include { GATK4_MERGEVCFS as MERGE_SENTIEON_HAPLOTYPER_VCFS  } from '../../../modules/nf-core/gatk4/mergevcfs/main'
+include { SENTIEON_HAPLOTYPER                                } from '../../../modules/nf-core/sentieon/haplotyper/main'
+
+workflow BAM_VARIANT_CALLING_SENTIEON_HAPLOTYPER {
+    take:
+    cram                          // channel: [mandatory] [ meta, cram, crai, interval.bed ]
+    fasta                         // channel: [mandatory]
+    fasta_fai                     // channel: [mandatory]
+    dict                          // channel: [mandatory]
+    dbsnp                         // channel: [optional]
+    dbsnp_tbi                     // channel: [optional]
+    dbsnp_vqsr                    // channel: [optional]
+    intervals                     // channel: [mandatory] [ intervals, num_intervals ] or [ [], 0 ] if no intervals
+    joint_germline                // boolean: [mandatory] [default: false] joint calling of germline variants
+    sentieon_haplotyper_emit_mode
+
+    main:
+    versions = Channel.empty()
+
+    gvcf               = Channel.empty()
+    vcf                = Channel.empty()
+    genotype_intervals = Channel.empty()
+
+    // Combine cram and intervals for spread and gather strategy
+    cram_intervals_for_sentieon = cram.combine(intervals)
+        // Move num_intervals to meta map
+        .map{ meta, cram, crai, intervals, num_intervals -> [
+            meta + [
+                num_intervals:  num_intervals,
+                intervals_name: intervals.simpleName,
+                variantcaller:  'sentieon_haplotyper'],
+            cram,
+            crai,
+            intervals
+        ] }
+
+    // collect (not each) so that the lower-cased, trimmed items are actually kept
+    emit_mode_items = sentieon_haplotyper_emit_mode.split(',').collect{ it -> it.toLowerCase().trim() }
+    lst = emit_mode_items - 'gvcf'
+    emit_vcf = lst.size() > 0 ? lst[0] : ''
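+
+    // For illustration: with an input such as 'GVCF, variant', collect yields
+    // emit_mode_items == ['gvcf', 'variant']; subtracting 'gvcf' leaves the plain VCF
+    // emit mode ('variant') for emit_vcf, while the any{ it.equals('gvcf') } test
+    // passed to the module below switches gVCF emission on.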
+
+    SENTIEON_HAPLOTYPER(
+        cram_intervals_for_sentieon,
+        fasta,
+        fasta_fai,
+        dbsnp,
+        dbsnp_tbi,
+        emit_vcf,
+        emit_mode_items.any{ it.equals('gvcf') })
+
+    if (joint_germline) {
+        genotype_intervals = SENTIEON_HAPLOTYPER.out.gvcf
+            .join(SENTIEON_HAPLOTYPER.out.gvcf_tbi, failOnMismatch: true)
+            .join(cram_intervals_for_sentieon, failOnMismatch: true)
+            .map{ meta, gvcf, tbi, cram, crai, intervals -> [ meta, gvcf, tbi, intervals ] }
+    }
+
+    // Figure out if using intervals or no_intervals
+    haplotyper_vcf_branch = SENTIEON_HAPLOTYPER.out.vcf.map{
+        meta, vcf -> [ meta - meta.subMap('intervals_name'), vcf ]
+    }
+    .branch{
+        intervals:    it[0].num_intervals > 1
+        no_intervals: it[0].num_intervals <= 1
+    }
+
+    haplotyper_vcf_tbi_branch = SENTIEON_HAPLOTYPER.out.vcf_tbi.map{
+        meta, vcf_tbi -> [ meta - meta.subMap('intervals_name'), vcf_tbi ]
+    }
+    .branch{
+        intervals:    it[0].num_intervals > 1
+        no_intervals: it[0].num_intervals <= 1
+    }
+
+    haplotyper_gvcf_branch = SENTIEON_HAPLOTYPER.out.gvcf.map{
+        meta, gvcf -> [ meta - meta.subMap('intervals_name'), gvcf ]
+    }
+    .branch{
+        intervals:    it[0].num_intervals > 1
+        no_intervals: it[0].num_intervals <= 1
+    }
+
+    haplotyper_gvcf_tbi_branch = SENTIEON_HAPLOTYPER.out.gvcf_tbi.map{
+        meta, gvcf_tbi -> [ meta - meta.subMap('intervals_name'), gvcf_tbi ]
+    }
+    .branch{
+        intervals:    it[0].num_intervals > 1
+        no_intervals: it[0].num_intervals <= 1
+    }
+
+    vcfs_for_merging = haplotyper_vcf_branch.intervals.map{
+        meta, vcf -> [ groupKey(meta, meta.num_intervals), vcf ] }
+
+    vcfs_for_merging = vcfs_for_merging.map{
+        meta, vcf -> [ meta - meta.subMap('intervals_name'), vcf ] }.groupTuple()
+
+    // VCFs
+    // Only when using intervals
+    MERGE_SENTIEON_HAPLOTYPER_VCFS(vcfs_for_merging, dict)
+
+    haplotyper_vcf = Channel.empty().mix(
+        MERGE_SENTIEON_HAPLOTYPER_VCFS.out.vcf,
+        haplotyper_vcf_branch.no_intervals)
+
+    haplotyper_tbi = Channel.empty().mix(
+        MERGE_SENTIEON_HAPLOTYPER_VCFS.out.tbi,
+        haplotyper_vcf_tbi_branch.no_intervals)
+
+    // Remove no longer necessary field: num_intervals
+    vcf     = haplotyper_vcf.map{ meta, vcf -> [ meta - meta.subMap('num_intervals'), vcf ] }
+    vcf_tbi = haplotyper_tbi.map{ meta, tbi -> [ meta - meta.subMap('num_intervals'), tbi ] }
+
+    // GVCFs
+    // Only when using intervals
+    gvcfs_for_merging = haplotyper_gvcf_branch.intervals.map{
+        meta, vcf -> [ groupKey(meta, meta.num_intervals), vcf ] }
+
+    gvcfs_for_merging = gvcfs_for_merging.map{
+        meta, vcf -> [ meta - meta.subMap('intervals_name'), vcf ]
+    }.groupTuple()
+
+    MERGE_SENTIEON_HAPLOTYPER_GVCFS(gvcfs_for_merging, dict)
+
+    gvcf = Channel.empty().mix(
+        MERGE_SENTIEON_HAPLOTYPER_GVCFS.out.vcf,
+        haplotyper_gvcf_branch.no_intervals)
+
+    gvcf_tbi = Channel.empty().mix(
+        MERGE_SENTIEON_HAPLOTYPER_GVCFS.out.tbi,
+        haplotyper_gvcf_tbi_branch.no_intervals)
+
+    versions = versions.mix(SENTIEON_HAPLOTYPER.out.versions)
+    versions = versions.mix(MERGE_SENTIEON_HAPLOTYPER_VCFS.out.versions)
+    versions = versions.mix(MERGE_SENTIEON_HAPLOTYPER_GVCFS.out.versions)
+
+    emit:
+    versions
+    vcf
+    vcf_tbi
+    gvcf
+    gvcf_tbi
+    genotype_intervals // For joint genotyping
+
+}
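The two Sentieon subworkflows above share the same scatter/merge shape and differ mainly in the wrapped module and its extra model inputs. A hedged wiring sketch for the Haplotyper variant follows; the include path matches this changeset, while the file names and channel contents are placeholders:

```nextflow
// Hypothetical caller for BAM_VARIANT_CALLING_SENTIEON_HAPLOTYPER (illustrative, not part of this diff)
include { BAM_VARIANT_CALLING_SENTIEON_HAPLOTYPER } from './subworkflows/local/bam_variant_calling_sentieon_haplotyper/main'

workflow {
    cram      = Channel.of([ [ id:'s1', patient:'p1' ], file('s1.cram'), file('s1.cram.crai') ])
    intervals = Channel.of([ [], 0 ]) // no scattering

    BAM_VARIANT_CALLING_SENTIEON_HAPLOTYPER(
        cram,
        Channel.value(file('genome.fasta')),
        Channel.value(file('genome.fasta.fai')),
        Channel.value([ [ id:'dict' ], file('genome.dict') ]),
        Channel.value(file('dbsnp.vcf.gz')),
        Channel.value(file('dbsnp.vcf.gz.tbi')),
        [],             // dbsnp_vqsr, unused in this sketch
        intervals,
        false,          // joint_germline
        'variant,gvcf') // emit both a VCF and a gVCF

    BAM_VARIANT_CALLING_SENTIEON_HAPLOTYPER.out.vcf.view()
    BAM_VARIANT_CALLING_SENTIEON_HAPLOTYPER.out.gvcf.view()
}
```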
diff --git a/subworkflows/local/bam_variant_calling_single_strelka/main.nf b/subworkflows/local/bam_variant_calling_single_strelka/main.nf
new file mode 100644
index 0000000000..ab6b3373c3
--- /dev/null
+++ b/subworkflows/local/bam_variant_calling_single_strelka/main.nf
@@ -0,0 +1,64 @@
+//
+// STRELKA2 single sample variant calling
+//
+// For all modules here:
+// A when clause condition is defined in the conf/modules.config to determine if the module should be run
+
+include { GATK4_MERGEVCFS as MERGE_STRELKA        } from '../../../modules/nf-core/gatk4/mergevcfs/main'
+include { GATK4_MERGEVCFS as MERGE_STRELKA_GENOME } from '../../../modules/nf-core/gatk4/mergevcfs/main'
+include { STRELKA_GERMLINE as STRELKA_SINGLE      } from '../../../modules/nf-core/strelka/germline/main'
+
+workflow BAM_VARIANT_CALLING_SINGLE_STRELKA {
+    take:
+    cram      // channel: [mandatory] [ meta, cram, crai ]
+    dict      // channel: [optional]  [ meta, dict ]
+    fasta     // channel: [mandatory] [ fasta ]
+    fasta_fai // channel: [mandatory] [ fasta_fai ]
+    intervals // channel: [mandatory] [ interval.bed.gz, interval.bed.gz.tbi, num_intervals ] or [ [], [], 0 ] if no intervals
+
+    main:
+    versions = Channel.empty()
+
+    // Combine cram and intervals for spread and gather strategy
+    cram_intervals = cram.combine(intervals)
+        // Move num_intervals to meta map
+        .map{ meta, cram, crai, intervals, intervals_index, num_intervals -> [ meta + [ num_intervals:num_intervals ], cram, crai, intervals, intervals_index ] }
+
+    STRELKA_SINGLE(cram_intervals, fasta, fasta_fai)
+
+    // Figuring out if there is one or more vcf(s) from the same sample
+    genome_vcf = STRELKA_SINGLE.out.genome_vcf.branch{
+        // Use meta.num_intervals to assess number of intervals
+        intervals:    it[0].num_intervals > 1
+        no_intervals: it[0].num_intervals <= 1
+    }
+
+    // Figuring out if there is one or more vcf(s) from the same sample
+    vcf = STRELKA_SINGLE.out.vcf.branch{
+        // Use meta.num_intervals to assess number of intervals
+        intervals:    it[0].num_intervals > 1
+        no_intervals: it[0].num_intervals <= 1
+    }
+
+    // Only when using intervals
+    genome_vcf_to_merge = genome_vcf.intervals.map{ meta, vcf -> [ groupKey(meta, meta.num_intervals), vcf ] }.groupTuple()
+    vcf_to_merge        = vcf.intervals.map{ meta, vcf -> [ groupKey(meta, meta.num_intervals), vcf ] }.groupTuple()
+
+    MERGE_STRELKA(vcf_to_merge, dict)
+    MERGE_STRELKA_GENOME(genome_vcf_to_merge, dict)
+
+    // Mix intervals and no_intervals channels together
+    // Only strelka variant vcf should get annotated
+    vcf = Channel.empty().mix(MERGE_STRELKA.out.vcf, vcf.no_intervals)
+        // add variantcaller to meta map and remove no longer necessary field: num_intervals
+        .map{ meta, vcf -> [ meta - meta.subMap('num_intervals') + [ variantcaller:'strelka' ], vcf ] }
+
+    versions = versions.mix(MERGE_STRELKA.out.versions)
+    versions = versions.mix(MERGE_STRELKA_GENOME.out.versions)
+    versions = versions.mix(STRELKA_SINGLE.out.versions)
+
+    emit:
+    vcf
+
+    versions
+}
diff --git a/subworkflows/local/bam_variant_calling_single_tiddit/main.nf b/subworkflows/local/bam_variant_calling_single_tiddit/main.nf
new file mode 100644
index 0000000000..356ce7c2fa
--- /dev/null
+++ b/subworkflows/local/bam_variant_calling_single_tiddit/main.nf
@@ -0,0 +1,34 @@
+//
+// TIDDIT single sample variant calling
+//
+// For all modules here:
+// A when clause condition is defined in the conf/modules.config to determine if the module should be run
+
+include { TABIX_BGZIPTABIX as TABIX_BGZIP_TIDDIT_SV } from '../../../modules/nf-core/tabix/bgziptabix/main'
+include { TIDDIT_SV                                 } from '../../../modules/nf-core/tiddit/sv/main'
+
+workflow BAM_VARIANT_CALLING_SINGLE_TIDDIT {
+    take:
+    cram
+    fasta
+    bwa
+
+    main:
+    versions = Channel.empty()
+
+    TIDDIT_SV(cram, fasta, bwa)
+
+    TABIX_BGZIP_TIDDIT_SV(TIDDIT_SV.out.vcf)
+
+    ploidy = TIDDIT_SV.out.ploidy
+    vcf    = TABIX_BGZIP_TIDDIT_SV.out.gz_tbi.map{ meta, gz, tbi -> [ meta + [ variantcaller:'tiddit' ], gz ] }
+
versions = versions.mix(TABIX_BGZIP_TIDDIT_SV.out.versions) + versions = versions.mix(TIDDIT_SV.out.versions) + + emit: + ploidy + vcf + + versions +} diff --git a/subworkflows/local/bam_variant_calling_somatic_all/main.nf b/subworkflows/local/bam_variant_calling_somatic_all/main.nf new file mode 100644 index 0000000000..f561ea420c --- /dev/null +++ b/subworkflows/local/bam_variant_calling_somatic_all/main.nf @@ -0,0 +1,246 @@ +// +// PAIRED VARIANT CALLING +// + +include { BAM_VARIANT_CALLING_CNVKIT } from '../bam_variant_calling_cnvkit/main' +include { BAM_VARIANT_CALLING_FREEBAYES } from '../bam_variant_calling_freebayes/main' +include { BAM_VARIANT_CALLING_MPILEUP as MPILEUP_NORMAL } from '../bam_variant_calling_mpileup/main' +include { BAM_VARIANT_CALLING_MPILEUP as MPILEUP_TUMOR } from '../bam_variant_calling_mpileup/main' +include { BAM_VARIANT_CALLING_SOMATIC_ASCAT } from '../bam_variant_calling_somatic_ascat/main' +include { BAM_VARIANT_CALLING_SOMATIC_CONTROLFREEC } from '../bam_variant_calling_somatic_controlfreec/main' +include { BAM_VARIANT_CALLING_SOMATIC_MANTA } from '../bam_variant_calling_somatic_manta/main' +include { BAM_VARIANT_CALLING_SOMATIC_MUTECT2 } from '../bam_variant_calling_somatic_mutect2/main' +include { BAM_VARIANT_CALLING_SOMATIC_STRELKA } from '../bam_variant_calling_somatic_strelka/main' +include { BAM_VARIANT_CALLING_SOMATIC_TIDDIT } from '../bam_variant_calling_somatic_tiddit/main' +include { MSISENSORPRO_MSISOMATIC } from '../../../modules/nf-core/msisensorpro/msisomatic/main' + +workflow BAM_VARIANT_CALLING_SOMATIC_ALL { + take: + tools // Mandatory, list of tools to apply + cram // channel: [mandatory] cram + bwa // channel: [optional] bwa + cf_chrom_len // channel: [optional] controlfreec length file + chr_files + dbsnp // channel: [mandatory] dbsnp + dbsnp_tbi // channel: [mandatory] dbsnp_tbi + dict // channel: [mandatory] dict + fasta // channel: [mandatory] fasta + fasta_fai // channel: [mandatory] fasta_fai + germline_resource // channel: [optional] germline_resource + germline_resource_tbi // channel: [optional] germline_resource_tbi + intervals // channel: [mandatory] [ intervals, num_intervals ] or [ [], 0 ] if no intervals + intervals_bed_gz_tbi // channel: [mandatory] intervals/target regions index zipped and indexed + intervals_bed_combined // channel: [mandatory] intervals/target regions in one file unzipped + intervals_bed_gz_tbi_combined // channel: [mandatory] intervals/target regions in one file zipped + mappability + msisensorpro_scan // channel: [optional] msisensorpro_scan + panel_of_normals // channel: [optional] panel_of_normals + panel_of_normals_tbi // channel: [optional] panel_of_normals_tbi + allele_files // channel: [optional] ascat allele files + loci_files // channel: [optional] ascat loci files + gc_file // channel: [optional] ascat gc content file + rt_file // channel: [optional] ascat rt file + joint_mutect2 // boolean: [mandatory] [default: false] run mutect2 in joint mode + wes // boolean: [mandatory] [default: false] whether targeted data is processed + + main: + versions = Channel.empty() + + //TODO: Temporary until the if's can be removed and printing to terminal is prevented with "when" in the modules.config + vcf_freebayes = Channel.empty() + vcf_manta = Channel.empty() + vcf_strelka = Channel.empty() + out_msisensorpro = Channel.empty() + vcf_mutect2 = Channel.empty() + vcf_tiddit = Channel.empty() + + if (tools.split(',').contains('ascat')) { + BAM_VARIANT_CALLING_SOMATIC_ASCAT( + cram, + allele_files, + 
loci_files, + intervals_bed_combined, + fasta, + gc_file, + rt_file + ) + + versions = versions.mix(BAM_VARIANT_CALLING_SOMATIC_ASCAT.out.versions) + } + + // CONTROLFREEC + if (tools.split(',').contains('controlfreec')) { + // Remap channels to match module/subworkflow + cram_normal = cram.map{ meta, normal_cram, normal_crai, tumor_cram, tumor_crai -> [ meta, normal_cram, normal_crai ] } + cram_tumor = cram.map{ meta, normal_cram, normal_crai, tumor_cram, tumor_crai -> [ meta, tumor_cram, tumor_crai ] } + + MPILEUP_NORMAL( + cram_normal, + dict, + fasta, + intervals + ) + + MPILEUP_TUMOR( + cram_tumor, + dict, + fasta, + intervals + ) + + mpileup_normal = MPILEUP_NORMAL.out.mpileup + mpileup_tumor = MPILEUP_TUMOR.out.mpileup + // Remap channel to match module/subworkflow + mpileup_pair = mpileup_normal.cross(mpileup_tumor).map{ normal, tumor -> [ normal[0], normal[1], tumor[1], [], [], [], [] ] } + + length_file = cf_chrom_len ?: fasta_fai + + intervals_controlfreec = wes ? intervals_bed_combined : [] + + BAM_VARIANT_CALLING_SOMATIC_CONTROLFREEC( + mpileup_pair, + fasta, + length_file, + dbsnp, + dbsnp_tbi, + chr_files, + mappability, + intervals_controlfreec + ) + + versions = versions.mix(MPILEUP_NORMAL.out.versions) + versions = versions.mix(MPILEUP_TUMOR.out.versions) + versions = versions.mix(BAM_VARIANT_CALLING_SOMATIC_CONTROLFREEC.out.versions) + } + + // CNVKIT + if (tools.split(',').contains('cnvkit')) { + BAM_VARIANT_CALLING_CNVKIT( + // Remap channel to match module/subworkflow + cram.map{ meta, normal_cram, normal_crai, tumor_cram, tumor_crai -> [ meta, tumor_cram, normal_cram ] }, + fasta.map{ it -> [[id:it[0].baseName], it] }, + fasta_fai.map{ it -> [[id:it[0].baseName], it] }, + intervals_bed_combined.map{ it -> [[id:it[0].baseName], it] }, + [[id:"null"], []] + ) + + versions = versions.mix(BAM_VARIANT_CALLING_CNVKIT.out.versions) + } + + // FREEBAYES + if (tools.split(',').contains('freebayes')) { + BAM_VARIANT_CALLING_FREEBAYES( + cram, + dict, + fasta, + fasta_fai, + intervals + ) + + vcf_freebayes = BAM_VARIANT_CALLING_FREEBAYES.out.vcf + versions = versions.mix(BAM_VARIANT_CALLING_FREEBAYES.out.versions) + } + + // MANTA + if (tools.split(',').contains('manta')) { + BAM_VARIANT_CALLING_SOMATIC_MANTA( + cram, + fasta.map{ it -> [ [ id:'fasta' ], it ] }, + fasta_fai.map{ it -> [ [ id:'fasta_fai' ], it ] }, + intervals_bed_gz_tbi_combined + ) + + vcf_manta = BAM_VARIANT_CALLING_SOMATIC_MANTA.out.vcf + versions = versions.mix(BAM_VARIANT_CALLING_SOMATIC_MANTA.out.versions) + } + + // STRELKA + if (tools.split(',').contains('strelka')) { + // Remap channel to match module/subworkflow + cram_strelka = (tools.split(',').contains('manta')) ? 
+ cram.join(BAM_VARIANT_CALLING_SOMATIC_MANTA.out.candidate_small_indels_vcf, failOnDuplicate: true, failOnMismatch: true).join(BAM_VARIANT_CALLING_SOMATIC_MANTA.out.candidate_small_indels_vcf_tbi, failOnDuplicate: true, failOnMismatch: true) : + cram.map{ meta, normal_cram, normal_crai, tumor_cram, tumor_crai -> [ meta, normal_cram, normal_crai, tumor_cram, tumor_crai, [], [] ] } + + BAM_VARIANT_CALLING_SOMATIC_STRELKA( + cram_strelka, + // Remap channel to match module/subworkflow + dict, + fasta, + fasta_fai, + intervals_bed_gz_tbi + ) + + vcf_strelka = Channel.empty().mix(BAM_VARIANT_CALLING_SOMATIC_STRELKA.out.vcf) + versions = versions.mix(BAM_VARIANT_CALLING_SOMATIC_STRELKA.out.versions) + } + + // MSISENSOR + if (tools.split(',').contains('msisensorpro')) { + MSISENSORPRO_MSISOMATIC(cram.combine(intervals_bed_combined), fasta, msisensorpro_scan) + + versions = versions.mix(MSISENSORPRO_MSISOMATIC.out.versions) + out_msisensorpro = out_msisensorpro.mix(MSISENSORPRO_MSISOMATIC.out.output_report) + } + + // MUTECT2 + if (tools.split(',').contains('mutect2')) { + BAM_VARIANT_CALLING_SOMATIC_MUTECT2( + // Remap channel to match module/subworkflow + // Adjust meta.map to simplify joining channels + // joint_mutect2 mode needs different meta.map than regular mode + cram.map{ meta, normal_cram, normal_crai, tumor_cram, tumor_crai -> + joint_mutect2 ? + //we need to keep all fields and then remove on a per-tool-basis to ensure proper joining at the filtering step + [ meta + [ id:meta.patient ], [ normal_cram, tumor_cram ], [ normal_crai, tumor_crai ] ] : + [ meta, [ normal_cram, tumor_cram ], [ normal_crai, tumor_crai ] ] + }, + // Remap channel to match module/subworkflow + fasta.map{ it -> [ [ id:'fasta' ], it ] }, + // Remap channel to match module/subworkflow + fasta_fai.map{ it -> [ [ id:'fasta_fai' ], it ] }, + dict, + germline_resource, + germline_resource_tbi, + panel_of_normals, + panel_of_normals_tbi, + intervals, + joint_mutect2 + ) + + vcf_mutect2 = BAM_VARIANT_CALLING_SOMATIC_MUTECT2.out.vcf_filtered + versions = versions.mix(BAM_VARIANT_CALLING_SOMATIC_MUTECT2.out.versions) + } + + // TIDDIT + if (tools.split(',').contains('tiddit')) { + BAM_VARIANT_CALLING_SOMATIC_TIDDIT( + // Remap channel to match module/subworkflow + cram.map{ meta, normal_cram, normal_crai, tumor_cram, tumor_crai -> [ meta, normal_cram, normal_crai ] }, + // Remap channel to match module/subworkflow + cram.map{ meta, normal_cram, normal_crai, tumor_cram, tumor_crai -> [ meta, tumor_cram, tumor_crai ] }, + // Remap channel to match module/subworkflow + fasta.map{ it -> [ [ id:'fasta' ], it ] }, + bwa) + vcf_tiddit = BAM_VARIANT_CALLING_SOMATIC_TIDDIT.out.vcf + versions = versions.mix(BAM_VARIANT_CALLING_SOMATIC_TIDDIT.out.versions) + } + + vcf_all = Channel.empty().mix( + vcf_freebayes, + vcf_manta, + vcf_mutect2, + vcf_strelka, + vcf_tiddit + ) + + emit: + out_msisensorpro + vcf_all + vcf_freebayes + vcf_manta + vcf_mutect2 + vcf_strelka + vcf_tiddit + + versions +} diff --git a/subworkflows/local/bam_variant_calling_somatic_ascat/main.nf b/subworkflows/local/bam_variant_calling_somatic_ascat/main.nf new file mode 100644 index 0000000000..64f45508ab --- /dev/null +++ b/subworkflows/local/bam_variant_calling_somatic_ascat/main.nf @@ -0,0 +1,31 @@ +// +// ASCAT variant calling +// +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include { ASCAT } from '../../../modules/nf-core/ascat/main' + +workflow 
BAM_VARIANT_CALLING_SOMATIC_ASCAT {
+
+    take:
+    cram_pair     // channel: [mandatory] [ meta, normal_cram, normal_crai, tumor_cram, tumor_crai ]
+    allele_files  // channel: [mandatory] zip
+    loci_files    // channel: [mandatory] zip
+    intervals_bed // channel: [optional]  bed for WES
+    fasta         // channel: [optional]  fasta needed for cram
+    gc_file       // channel: [optional]  txt for LogRCorrection
+    rt_file       // channel: [optional]  txt for LogRCorrection
+
+    main:
+
+    ch_versions = Channel.empty()
+
+    if (!params.wes) intervals_bed = [] // No intervals needed if not WES
+    ASCAT(cram_pair, allele_files, loci_files, intervals_bed, fasta, gc_file, rt_file)
+
+    ch_versions = ch_versions.mix(ASCAT.out.versions)
+
+    emit:
+    versions = ch_versions
+}
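The cram_pair channel above carries both members of a tumor/normal pair in a single element. A small illustrative sketch (placeholder sample names, not this pipeline's actual pairing logic) of assembling that shape from separate normal and tumor channels keyed on the same meta map:

```nextflow
// Hypothetical construction of an ASCAT-style pair channel (illustrative, not part of this diff)
workflow {
    normal = Channel.of([ [ patient:'p1' ], file('p1_normal.cram'), file('p1_normal.cram.crai') ])
    tumor  = Channel.of([ [ patient:'p1' ], file('p1_tumor.cram'),  file('p1_tumor.cram.crai')  ])

    // join matches on the first element (the shared meta map) and yields
    // [ meta, normal_cram, normal_crai, tumor_cram, tumor_crai ]
    cram_pair = normal.join(tumor)

    cram_pair.view()
}
```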
diff --git a/subworkflows/local/bam_variant_calling_somatic_controlfreec/main.nf b/subworkflows/local/bam_variant_calling_somatic_controlfreec/main.nf
new file mode 100644
index 0000000000..a2e7e17cff
--- /dev/null
+++ b/subworkflows/local/bam_variant_calling_somatic_controlfreec/main.nf
@@ -0,0 +1,43 @@
+//
+// CONTROLFREEC somatic variant calling
+//
+// For all modules here:
+// A when clause condition is defined in the conf/modules.config to determine if the module should be run
+
+include { CONTROLFREEC_FREEC as FREEC_SOMATIC                    } from '../../../modules/nf-core/controlfreec/freec/main'
+include { CONTROLFREEC_ASSESSSIGNIFICANCE as ASSESS_SIGNIFICANCE } from '../../../modules/nf-core/controlfreec/assesssignificance/main'
+include { CONTROLFREEC_FREEC2BED as FREEC2BED                    } from '../../../modules/nf-core/controlfreec/freec2bed/main'
+include { CONTROLFREEC_FREEC2CIRCOS as FREEC2CIRCOS              } from '../../../modules/nf-core/controlfreec/freec2circos/main'
+include { CONTROLFREEC_MAKEGRAPH as MAKEGRAPH                    } from '../../../modules/nf-core/controlfreec/makegraph/main'
+
+workflow BAM_VARIANT_CALLING_SOMATIC_CONTROLFREEC {
+    take:
+    controlfreec_input // channel: [mandatory] [ meta, pileup_normal, pileup_tumor, [], [], [], [] ]
+    fasta              // channel: [mandatory]
+    fasta_fai          // channel: [mandatory]
+    dbsnp              // channel: [mandatory]
+    dbsnp_tbi          // channel: [mandatory]
+    chr_files          // channel: [mandatory]
+    mappability        // channel: [mandatory]
+    intervals_bed      // channel: [optional] Contains a bed file of all intervals combined provided with the cram input(s). Should be empty for WGS
+
+    main:
+
+    ch_versions = Channel.empty()
+
+    FREEC_SOMATIC(controlfreec_input, fasta, fasta_fai, [], dbsnp, dbsnp_tbi, chr_files, mappability, intervals_bed, [])
+
+    ASSESS_SIGNIFICANCE(FREEC_SOMATIC.out.CNV.join(FREEC_SOMATIC.out.ratio, failOnDuplicate: true, failOnMismatch: true))
+    FREEC2BED(FREEC_SOMATIC.out.ratio)
+    FREEC2CIRCOS(FREEC_SOMATIC.out.ratio)
+    MAKEGRAPH(FREEC_SOMATIC.out.ratio.join(FREEC_SOMATIC.out.BAF, failOnDuplicate: true, failOnMismatch: true))
+
+    ch_versions = ch_versions.mix(FREEC_SOMATIC.out.versions)
+    ch_versions = ch_versions.mix(ASSESS_SIGNIFICANCE.out.versions)
+    ch_versions = ch_versions.mix(FREEC2BED.out.versions)
+    ch_versions = ch_versions.mix(FREEC2CIRCOS.out.versions)
+    ch_versions = ch_versions.mix(MAKEGRAPH.out.versions)
+
+    emit:
+    versions = ch_versions
+}
diff --git a/subworkflows/local/bam_variant_calling_somatic_manta/main.nf b/subworkflows/local/bam_variant_calling_somatic_manta/main.nf
new file mode 100644
index 0000000000..f6720c5406
--- /dev/null
+++ b/subworkflows/local/bam_variant_calling_somatic_manta/main.nf
@@ -0,0 +1,47 @@
+//
+// MANTA somatic variant calling
+//
+// For all modules here:
+// A when clause condition is defined in the conf/modules.config to determine if the module should be run
+
+include { MANTA_SOMATIC } from '../../../modules/nf-core/manta/somatic/main'
+
+workflow BAM_VARIANT_CALLING_SOMATIC_MANTA {
+    take:
+    cram      // channel: [mandatory] [ meta, cram1, crai1, cram2, crai2 ]
+    fasta     // channel: [mandatory] [ meta, fasta ]
+    fasta_fai // channel: [mandatory] [ meta, fasta_fai ]
+    intervals // channel: [mandatory] [ interval.bed.gz, interval.bed.gz.tbi ] or [ [], [] ] if no intervals
+
+    main:
+    versions = Channel.empty()
+
+    // Combine cram and intervals, account for 0 intervals
+    cram_intervals = cram.combine(intervals).map{ it ->
+        bed_gz  = it.size() > 5 ? it[5] : []
+        bed_tbi = it.size() > 5 ?
it[6] : [] + + [it[0], it[1], it[2], it[3], it[4], bed_gz, bed_tbi] + } + + MANTA_SOMATIC(cram_intervals, fasta, fasta_fai, []) + + candidate_small_indels_vcf = MANTA_SOMATIC.out.candidate_small_indels_vcf + candidate_small_indels_vcf_tbi = MANTA_SOMATIC.out.candidate_small_indels_vcf_tbi + candidate_sv_vcf = MANTA_SOMATIC.out.candidate_sv_vcf + diploid_sv_vcf = MANTA_SOMATIC.out.diploid_sv_vcf + somatic_sv_vcf = MANTA_SOMATIC.out.somatic_sv_vcf + + // Only diploid and somatic SV should get annotated + // add variantcaller to meta map + vcf = Channel.empty().mix(diploid_sv_vcf, somatic_sv_vcf).map{ meta, vcf -> [ meta + [ variantcaller:'manta' ], vcf ] } + + versions = versions.mix(MANTA_SOMATIC.out.versions) + + emit: + candidate_small_indels_vcf + candidate_small_indels_vcf_tbi + vcf + + versions +} diff --git a/subworkflows/local/bam_variant_calling_somatic_mutect2/main.nf b/subworkflows/local/bam_variant_calling_somatic_mutect2/main.nf new file mode 100644 index 0000000000..3f5b7d53ca --- /dev/null +++ b/subworkflows/local/bam_variant_calling_somatic_mutect2/main.nf @@ -0,0 +1,236 @@ +// +// +// MUTECT2: tumor-normal mode variantcalling: getpileupsummaries, calculatecontamination, learnreadorientationmodel and filtermutectcalls +// + +include { GATK4_CALCULATECONTAMINATION as CALCULATECONTAMINATION } from '../../../modules/nf-core/gatk4/calculatecontamination/main' +include { GATK4_FILTERMUTECTCALLS as FILTERMUTECTCALLS } from '../../../modules/nf-core/gatk4/filtermutectcalls/main' +include { GATK4_GATHERPILEUPSUMMARIES as GATHERPILEUPSUMMARIES_NORMAL } from '../../../modules/nf-core/gatk4/gatherpileupsummaries/main' +include { GATK4_GATHERPILEUPSUMMARIES as GATHERPILEUPSUMMARIES_TUMOR } from '../../../modules/nf-core/gatk4/gatherpileupsummaries/main' +include { GATK4_GETPILEUPSUMMARIES as GETPILEUPSUMMARIES_NORMAL } from '../../../modules/nf-core/gatk4/getpileupsummaries/main' +include { GATK4_GETPILEUPSUMMARIES as GETPILEUPSUMMARIES_TUMOR } from '../../../modules/nf-core/gatk4/getpileupsummaries/main' +include { GATK4_LEARNREADORIENTATIONMODEL as LEARNREADORIENTATIONMODEL } from '../../../modules/nf-core/gatk4/learnreadorientationmodel/main' +include { GATK4_MERGEMUTECTSTATS as MERGEMUTECTSTATS } from '../../../modules/nf-core/gatk4/mergemutectstats/main' +include { GATK4_MERGEVCFS as MERGE_MUTECT2 } from '../../../modules/nf-core/gatk4/mergevcfs/main' +include { GATK4_MUTECT2 as MUTECT2_PAIRED } from '../../../modules/nf-core/gatk4/mutect2/main' + +workflow BAM_VARIANT_CALLING_SOMATIC_MUTECT2 { + take: + input // channel: [ meta, [ input ], [ input_index ] ] + fasta // channel: /path/to/reference/fasta + fai // channel: /path/to/reference/fasta/index + dict // channel: /path/to/reference/fasta/dictionary + germline_resource // channel: /path/to/germline/resource + germline_resource_tbi // channel: /path/to/germline/index + panel_of_normals // channel: /path/to/panel/of/normals + panel_of_normals_tbi // channel: /path/to/panel/of/normals/index + intervals // channel: [mandatory] [ intervals, num_intervals ] or [ [], 0 ] if no intervals + joint_mutect2 // boolean: [mandatory] [default: false] run mutect2 in joint mode + + main: + versions = Channel.empty() + + //If no germline resource is provided, then create an empty channel to avoid GetPileupsummaries from being run + germline_resource_pileup = (germline_resource && germline_resource_tbi) ? 
germline_resource : Channel.empty()
+    germline_resource_pileup_tbi = germline_resource_tbi ?: Channel.empty()
+
+    // Combine input and intervals for spread and gather strategy
+    input_intervals = input.combine(intervals)
+        // Move num_intervals to meta map and reorganize channel for MUTECT2_PAIRED module
+        .map{ meta, input_list, input_index_list, intervals, num_intervals -> [ meta + [ num_intervals:num_intervals ], input_list, input_index_list, intervals ] }
+
+    if (joint_mutect2) {
+
+        // Separate out normal cram files and extract tumor cram files
+        ch_cram = input.multiMap{ meta, cram, crai ->
+            normal: [ meta - meta.subMap('tumor_id'), cram[0], crai[0] ]
+            tumor:  [ meta - meta.subMap('tumor_id'), cram[1], crai[1] ]
+        }
+
+        // Remove duplicates from normal channel and merge normal and tumor crams by patient
+        ch_tn_cram = ch_cram.normal.unique().mix(ch_cram.tumor).groupTuple()
+        // Combine input and intervals for scatter and gather strategy
+        ch_tn_intervals = ch_tn_cram.combine(intervals)
+            // Move num_intervals to meta map and reorganize channel for MUTECT2_PAIRED module
+            // meta: [id:patient_id, num_intervals, patient, sex]
+            .map{ meta, cram, crai, intervals, num_intervals -> [ meta + [ num_intervals:num_intervals ], cram, crai, intervals ] }
+
+        MUTECT2_PAIRED(ch_tn_intervals, fasta, fai, dict, germline_resource, germline_resource_tbi, panel_of_normals, panel_of_normals_tbi)
+    }
+    else {
+
+        // Perform variant calling using mutect2 module pair mode
+        // meta: [id:tumor_id_vs_normal_id, normal_id, num_intervals, patient, sex, tumor_id]
+        MUTECT2_PAIRED(input_intervals, fasta, fai, dict, germline_resource, germline_resource_tbi, panel_of_normals, panel_of_normals_tbi)
+    }
+
+    // Figuring out if there is one or more vcf(s) from the same sample
+    vcf_branch = MUTECT2_PAIRED.out.vcf.branch{
+        // Use meta.num_intervals to assess number of intervals
+        intervals:    it[0].num_intervals > 1
+        no_intervals: it[0].num_intervals <= 1
+    }
+
+    // Figuring out if there is one or more tbi(s) from the same sample
+    tbi_branch = MUTECT2_PAIRED.out.tbi.branch{
+        // Use meta.num_intervals to assess number of intervals
+        intervals:    it[0].num_intervals > 1
+        no_intervals: it[0].num_intervals <= 1
+    }
+
+    // Figuring out if there is one or more stats(s) from the same sample
+    stats_branch = MUTECT2_PAIRED.out.stats.branch{
+        // Use meta.num_intervals to assess number of intervals
+        intervals:    it[0].num_intervals > 1
+        no_intervals: it[0].num_intervals <= 1
+    }
+
+    // Figuring out if there is one or more f1r2(s) from the same sample
+    f1r2_branch = MUTECT2_PAIRED.out.f1r2.branch{
+        // Use meta.num_intervals to assess number of intervals
+        intervals:    it[0].num_intervals > 1
+        no_intervals: it[0].num_intervals <= 1
+    }
+
+    // Only when using intervals
+    vcf_to_merge   = vcf_branch.intervals.map{ meta, vcf -> [ groupKey(meta, meta.num_intervals), vcf ] }.groupTuple()
+    stats_to_merge = stats_branch.intervals.map{ meta, stats -> [ groupKey(meta, meta.num_intervals), stats ] }.groupTuple()
+    f1r2_to_merge  = f1r2_branch.intervals.map{ meta, f1r2 -> [ groupKey(meta, meta.num_intervals), f1r2 ] }.groupTuple()
+
+    MERGE_MUTECT2(vcf_to_merge, dict)
+    MERGEMUTECTSTATS(stats_to_merge)
+
+    // Mix intervals and no_intervals channels together and remove no longer necessary fields: normal_id, num_intervals
+    vcf = Channel.empty().mix(MERGE_MUTECT2.out.vcf, vcf_branch.no_intervals).map{ meta, vcf ->
+        [ joint_mutect2 ? meta - meta.subMap('normal_id', 'num_intervals') : meta - meta.subMap('num_intervals'), vcf ]
+    }
+    tbi = Channel.empty().mix(MERGE_MUTECT2.out.tbi, tbi_branch.no_intervals).map{ meta, tbi ->
+        [ joint_mutect2 ? meta - meta.subMap('normal_id', 'num_intervals') : meta - meta.subMap('num_intervals'), tbi ]
+    }
+    stats = Channel.empty().mix(MERGEMUTECTSTATS.out.stats, stats_branch.no_intervals).map{ meta, stats ->
+        [ joint_mutect2 ? meta - meta.subMap('normal_id', 'num_intervals') : meta - meta.subMap('num_intervals'), stats ]
+    }
+    f1r2 = Channel.empty().mix(f1r2_to_merge, f1r2_branch.no_intervals).map{ meta, f1r2 ->
+        [ joint_mutect2 ? meta - meta.subMap('normal_id', 'num_intervals') : meta - meta.subMap('num_intervals'), f1r2 ]
+    }
+
+    // Generate artifactpriors using learnreadorientationmodel on the f1r2 output of mutect2
+    LEARNREADORIENTATIONMODEL(f1r2)
+
+    pileup = input_intervals.multiMap{ meta, input_list, input_index_list, intervals ->
+        tumor:  [ meta, input_list[1], input_index_list[1], intervals ]
+        normal: [ meta, input_list[0], input_index_list[0], intervals ]
+    }
+
+    // Prepare input channel for normal pileup summaries.
+    // Remember, the input channel contains tumor-normal pairs, so there will be multiple copies of the normal sample for each tumor for a given patient.
+    // Therefore, we use the unique function to generate normal pileup summaries only once per patient, for better efficiency.
+    pileup_normal = pileup.normal.map{ meta, cram, crai, intervals -> [ meta - meta.subMap('tumor_id') + [ id:meta.normal_id ], cram, crai, intervals ] }.unique()
+    // Prepare input channel for tumor pileup summaries.
+    pileup_tumor = pileup.tumor.map{ meta, cram, crai, intervals -> [ meta - meta.subMap('normal_id') + [ id:meta.tumor_id ], cram, crai, intervals ] }
+
+    // Generate pileup summary tables using getpileupsummaries. The tumor sample should always be passed in as the first input, matching the order of the input list entries and of vcf_to_filter.
+    GETPILEUPSUMMARIES_NORMAL(pileup_normal, fasta, fai, dict, germline_resource_pileup, germline_resource_pileup_tbi)
+    GETPILEUPSUMMARIES_TUMOR(pileup_tumor, fasta, fai, dict, germline_resource_pileup, germline_resource_pileup_tbi)
+
+    // Figuring out if there is one or more table(s) from the same sample
+    pileup_table_normal_branch = GETPILEUPSUMMARIES_NORMAL.out.table.branch{
+        // Use meta.num_intervals to assess number of intervals
+        intervals:    it[0].num_intervals > 1
+        no_intervals: it[0].num_intervals <= 1
+    }
+
+    // Figuring out if there is one or more table(s) from the same sample
+    pileup_table_tumor_branch = GETPILEUPSUMMARIES_TUMOR.out.table.branch{
+        // Use meta.num_intervals to assess number of intervals
+        intervals:    it[0].num_intervals > 1
+        no_intervals: it[0].num_intervals <= 1
+    }
+
+    // Only when using intervals
+    pileup_table_normal_to_merge = pileup_table_normal_branch.intervals.map{ meta, table -> [ groupKey(meta, meta.num_intervals), table ] }.groupTuple()
+    pileup_table_tumor_to_merge  = pileup_table_tumor_branch.intervals.map{ meta, table -> [ groupKey(meta, meta.num_intervals), table ] }.groupTuple()
+
+    // Merge Pileup Summaries
+    GATHERPILEUPSUMMARIES_NORMAL(pileup_table_normal_to_merge, dict.map{ meta, dict -> [ dict ] })
+    GATHERPILEUPSUMMARIES_TUMOR(pileup_table_tumor_to_merge, dict.map{ meta, dict -> [ dict ] })
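+
+    // For illustration: combine(..., by:0) below behaves like an inner join on the
+    // first element (the patient-level meta), pairing every gathered tumor table with
+    // the single normal table of the same patient, e.g.
+    //   [ meta, 'tumorA', tA.table ] + [ meta, 'normal', n.table ]
+    //     -> [ meta, 'tumorA', tA.table, 'normal', n.table ]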
+    // Do some channel magic to generate tumor-normal pairs again.
+    // This is necessary because we generated one normal pileup summary per patient, but we need to run calculatecontamination for each tumor-normal pair.
+    pileup_table_tumor  = Channel.empty().mix(GATHERPILEUPSUMMARIES_TUMOR.out.table, pileup_table_tumor_branch.no_intervals).map{ meta, table -> [ meta - meta.subMap('normal_id', 'tumor_id', 'num_intervals') + [ id:meta.patient ], meta.id, table ] }
+    pileup_table_normal = Channel.empty().mix(GATHERPILEUPSUMMARIES_NORMAL.out.table, pileup_table_normal_branch.no_intervals).map{ meta, table -> [ meta - meta.subMap('normal_id', 'tumor_id', 'num_intervals') + [ id:meta.patient ], meta.id, table ] }
+
+    ch_calculatecontamination_in_tables = pileup_table_tumor.combine(pileup_table_normal, by:0).map{
+        meta, tumor_id, tumor_table, normal_id, normal_table ->
+            if (joint_mutect2) {
+                [ meta + [ id:tumor_id + "_vs_" + normal_id ], tumor_table, normal_table ]
+            } else {
+                // we need tumor and normal ID for further post processing
+                [ meta + [ id:tumor_id + "_vs_" + normal_id, normal_id:normal_id, tumor_id:tumor_id ], tumor_table, normal_table ]
+            }
+    }
+
+    CALCULATECONTAMINATION(ch_calculatecontamination_in_tables)
+
+    // Initialize empty channels: contamination calculation runs on the pileup tables, which are not produced when no germline resource is provided
+    calculatecontamination_out_seg  = Channel.empty()
+    calculatecontamination_out_cont = Channel.empty()
+
+    if (joint_mutect2) {
+        // Reduce the meta to only the patient name
+        calculatecontamination_out_seg  = CALCULATECONTAMINATION.out.segmentation.map{ meta, seg -> [ meta + [ id:meta.patient ], seg ] }.groupTuple()
+        calculatecontamination_out_cont = CALCULATECONTAMINATION.out.contamination.map{ meta, cont -> [ meta + [ id:meta.patient ], cont ] }.groupTuple()
+    }
+    else {
+        // Keep tumor_vs_normal ID
+        calculatecontamination_out_seg  = CALCULATECONTAMINATION.out.segmentation
+        calculatecontamination_out_cont = CALCULATECONTAMINATION.out.contamination
+    }
+
+    // Mutect2 calls filtered by filtermutectcalls using the artifactpriors, contamination and segmentation tables
+    // meta joint calling:  [id:patient_id, patient, sex]
+    // meta paired calling: [id:tumorID_vs_normalID, normal_ID, patient, sex, tumorID]
+    vcf_to_filter = vcf.join(tbi, failOnDuplicate: true, failOnMismatch: true)
+        .join(stats, failOnDuplicate: true, failOnMismatch: true)
+        .join(LEARNREADORIENTATIONMODEL.out.artifactprior, failOnDuplicate: true, failOnMismatch: true)
+        .join(calculatecontamination_out_seg)
+        .join(calculatecontamination_out_cont)
+        .map{ meta, vcf, tbi, stats, orientation, seg, cont -> [ meta, vcf, tbi, stats, orientation, seg, cont, [] ] }
+
+    FILTERMUTECTCALLS(vcf_to_filter, fasta, fai, dict)
+
+    vcf_filtered = FILTERMUTECTCALLS.out.vcf
+        // add variantcaller to meta map
+        .map{ meta, vcf -> [ meta + [ variantcaller:'mutect2' ], vcf ] }
+
+    versions = versions.mix(MERGE_MUTECT2.out.versions)
+    versions = versions.mix(CALCULATECONTAMINATION.out.versions)
+    versions = versions.mix(FILTERMUTECTCALLS.out.versions)
+    versions = versions.mix(GETPILEUPSUMMARIES_NORMAL.out.versions)
+    versions = versions.mix(GETPILEUPSUMMARIES_TUMOR.out.versions)
+    versions = versions.mix(GATHERPILEUPSUMMARIES_NORMAL.out.versions)
+    versions = versions.mix(GATHERPILEUPSUMMARIES_TUMOR.out.versions)
+    versions = versions.mix(LEARNREADORIENTATIONMODEL.out.versions)
+    versions = versions.mix(MERGEMUTECTSTATS.out.versions)
+    versions = versions.mix(MUTECT2_PAIRED.out.versions)
+
+    emit:
+    vcf   // channel: [ meta, vcf ]
+    stats // channel: [ meta, stats ]
+
+    vcf_filtered                                 // channel: [ meta, vcf ]
+    index_filtered = FILTERMUTECTCALLS.out.tbi   // channel: [ meta, tbi ]
+    stats_filtered = FILTERMUTECTCALLS.out.stats //
channel: [ meta, stats ] + + artifact_priors = LEARNREADORIENTATIONMODEL.out.artifactprior // channel: [ meta, artifactprior ] + + pileup_table_normal // channel: [ meta, table_normal ] + pileup_table_tumor // channel: [ meta, table_tumor ] + + contamination_table = calculatecontamination_out_cont // channel: [ meta, contamination ] + segmentation_table = calculatecontamination_out_seg // channel: [ meta, segmentation ] + + versions // channel: [ versions.yml ] +} diff --git a/subworkflows/local/bam_variant_calling_somatic_mutect2/meta.yml b/subworkflows/local/bam_variant_calling_somatic_mutect2/meta.yml new file mode 100644 index 0000000000..d5abdca939 --- /dev/null +++ b/subworkflows/local/bam_variant_calling_somatic_mutect2/meta.yml @@ -0,0 +1,127 @@ +name: gatk_tumor_normal_somatic_variant_calling +description: | + Perform variant calling on a paired tumor normal set of samples using mutect2 tumor normal mode. + f1r2 output of mutect2 is run through learnreadorientationmodel to get the artifact priors. + Run the input bam files through getpileupsummarries and then calculatecontamination to get the contamination and segmentation tables. + Filter the mutect2 output vcf using filtermutectcalls, artifact priors and the contamination & segmentation tables for additional filtering. +keywords: + - gatk4 + - mutect2 + - learnreadorientationmodel + - getpileupsummaries + - calculatecontamination + - filtermutectcalls + - variant_calling + - tumor_only + - filtered_vcf +modules: + - gatk4/mutect2 + - gatk4/learnreadorientationmodel + - gatk4/getpileupsummaries + - gatk4/calculatecontamination + - gatk4/filtermutectcalls +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - input: + type: list + description: list containing the tumor and normal BAM files, in that order, also able to take CRAM as an input + pattern: "[ *.{bam/cram} ]" + - input_index: + type: list + description: list containing the tumor and normal BAM file indexes, in that order, also able to take CRAM index as an input + pattern: "[ *.{bam.bai/cram.crai} ]" + - which_norm: + type: list + description: optional list of sample headers contained in the normal sample input file. + pattern: "testN" + - fasta: + type: file + description: The reference fasta file + pattern: "*.fasta" + - fai: + type: file + description: Index of reference fasta file + pattern: "*.fasta.fai" + - dict: + type: file + description: GATK sequence dictionary + pattern: "*.dict" + - germline_resource: + type: file + description: Population vcf of germline sequencing, containing allele fractions. + pattern: "*.vcf.gz" + - germline_resource_tbi: + type: file + description: Index file for the germline resource. + pattern: "*.vcf.gz.tbi" + - panel_of_normals: + type: file + description: vcf file to be used as a panel of normals. + pattern: "*.vcf.gz" + - panel_of_normals_tbi: + type: file + description: Index for the panel of normals. + pattern: "*.vcf.gz.tbi" + - interval_file: + type: file + description: File containing intervals. + pattern: "*.interval_list" +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - mutect2_vcf: + type: file + description: Compressed vcf file to be used for variant_calling. 
+ pattern: "[ *.vcf.gz ]" + - mutect2_tbi: + type: file + description: Indexes of the mutect2_vcf file + pattern: "[ *vcf.gz.tbi ]" + - mutect2_stats: + type: file + description: Stats files for the mutect2 vcf + pattern: "[ *vcf.gz.stats ]" + - mutect2_f1r2: + type: file + description: file containing information to be passed to LearnReadOrientationModel. + pattern: "*.f1r2.tar.gz" + - artifact_priors: + type: file + description: file containing artifact-priors to be used by filtermutectcalls. + pattern: "*.tar.gz" + - pileup_table_tumor: + type: file + description: File containing the tumor pileup summary table, kept separate as calculatecontamination needs them individually specified. + pattern: "*_tumor.pileups.table" + - pileup_table_normal: + type: file + description: File containing the normal pileup summary table, kept separate as calculatecontamination needs them individually specified. + pattern: "*_normal.pileups.table" + - contamination_table: + type: file + description: File containing the contamination table. + pattern: "*.contamination.table" + - segmentation_table: + type: file + description: Output table containing segmentation of tumor minor allele fractions. + pattern: "*.segmentation.table" + - filtered_vcf: + type: file + description: file containing filtered mutect2 calls. + pattern: "*.vcf.gz" + - filtered_tbi: + type: file + description: tbi file that pairs with filtered vcf. + pattern: "*.vcf.gz.tbi" + - filtered_stats: + type: file + description: file containing statistics of the filtermutectcalls run. + pattern: "*.filteringStats.tsv" +authors: + - "@GCJMackenzie" diff --git a/subworkflows/local/bam_variant_calling_somatic_strelka/main.nf b/subworkflows/local/bam_variant_calling_somatic_strelka/main.nf new file mode 100644 index 0000000000..02c729f93e --- /dev/null +++ b/subworkflows/local/bam_variant_calling_somatic_strelka/main.nf @@ -0,0 +1,63 @@ +// +// STRELKA2 tumor-normal variant calling +// +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include { GATK4_MERGEVCFS as MERGE_STRELKA_INDELS } from '../../../modules/nf-core/gatk4/mergevcfs/main' +include { GATK4_MERGEVCFS as MERGE_STRELKA_SNVS } from '../../../modules/nf-core/gatk4/mergevcfs/main' +include { STRELKA_SOMATIC } from '../../../modules/nf-core/strelka/somatic/main' + +workflow BAM_VARIANT_CALLING_SOMATIC_STRELKA { + take: + cram // channel: [mandatory] [ meta, normal_cram, normal_crai, tumor_cram, tumor_crai, manta_vcf, manta_tbi ] manta* are optional + dict // channel: [optional] [ meta, dict ] + fasta // channel: [mandatory] [ fasta ] + fasta_fai // channel: [mandatory] [ fasta_fai ] + intervals // channel: [mandatory] [ interval.bed.gz, interval.bed.gz.tbi, num_intervals ] or [ [], [], 0 ] if no intervals + + main: + versions = Channel.empty() + + // Combine cram and intervals for spread and gather strategy + cram_intervals = cram.combine(intervals) + // Move num_intervals to meta map + .map{ meta, normal_cram, normal_crai, tumor_cram, tumor_crai, manta_vcf, manta_tbi, intervals, intervals_index, num_intervals -> [ meta + [ num_intervals:num_intervals ], normal_cram, normal_crai, tumor_cram, tumor_crai, manta_vcf, manta_tbi, intervals, intervals_index ] } + + STRELKA_SOMATIC(cram_intervals, fasta, fasta_fai ) + + // Figuring out if there is one or more vcf(s) from the same sample + vcf_indels = STRELKA_SOMATIC.out.vcf_indels.branch{ + // Use meta.num_intervals to asses number of intervals + intervals: 
it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 + } + + // Figuring out if there is one or more vcf(s) from the same sample + vcf_snvs = STRELKA_SOMATIC.out.vcf_snvs.branch{ + // Use meta.num_intervals to asses number of intervals + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 + } + + // Only when using intervals + vcf_indels_to_merge = vcf_indels.intervals.map{ meta, vcf -> [ groupKey(meta, meta.num_intervals), vcf ]}.groupTuple() + vcf_snvs_to_merge = vcf_snvs.intervals.map{ meta, vcf -> [ groupKey(meta, meta.num_intervals), vcf ]}.groupTuple() + + MERGE_STRELKA_INDELS(vcf_indels_to_merge, dict) + MERGE_STRELKA_SNVS(vcf_snvs_to_merge, dict) + + // Mix intervals and no_intervals channels together + vcf = Channel.empty().mix(MERGE_STRELKA_INDELS.out.vcf, MERGE_STRELKA_SNVS.out.vcf, vcf_indels.no_intervals, vcf_snvs.no_intervals) + // add variantcaller to meta map and remove no longer necessary field: num_intervals + .map{ meta, vcf -> [ meta - meta.subMap('num_intervals') + [ variantcaller:'strelka' ], vcf ] } + + versions = versions.mix(MERGE_STRELKA_SNVS.out.versions) + versions = versions.mix(MERGE_STRELKA_INDELS.out.versions) + versions = versions.mix(STRELKA_SOMATIC.out.versions) + + emit: + vcf + + versions +} diff --git a/subworkflows/local/bam_variant_calling_somatic_tiddit/main.nf b/subworkflows/local/bam_variant_calling_somatic_tiddit/main.nf new file mode 100644 index 0000000000..259520fce1 --- /dev/null +++ b/subworkflows/local/bam_variant_calling_somatic_tiddit/main.nf @@ -0,0 +1,36 @@ +// +// TIDDIT single sample variant calling +// +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include { BAM_VARIANT_CALLING_SINGLE_TIDDIT as TIDDIT_NORMAL } from '../bam_variant_calling_single_tiddit/main.nf' +include { BAM_VARIANT_CALLING_SINGLE_TIDDIT as TIDDIT_TUMOR } from '../bam_variant_calling_single_tiddit/main.nf' +include { SVDB_MERGE } from '../../../modules/nf-core/svdb/merge/main.nf' + +workflow BAM_VARIANT_CALLING_SOMATIC_TIDDIT { + take: + cram_normal + cram_tumor + fasta + bwa + + main: + + versions = Channel.empty() + + TIDDIT_NORMAL(cram_normal, fasta, bwa) + TIDDIT_TUMOR(cram_tumor, fasta, bwa) + + SVDB_MERGE(TIDDIT_NORMAL.out.vcf.join(TIDDIT_TUMOR.out.vcf, failOnDuplicate: true, failOnMismatch: true).map{ meta, vcf_normal, vcf_tumor -> [ meta, [vcf_normal, vcf_tumor] ] }, false) + + vcf = SVDB_MERGE.out.vcf + + versions = versions.mix(TIDDIT_NORMAL.out.versions) + versions = versions.mix(TIDDIT_TUMOR.out.versions) + versions = versions.mix(SVDB_MERGE.out.versions) + + emit: + versions + vcf +} diff --git a/subworkflows/local/bam_variant_calling_tumor_only_all/main.nf b/subworkflows/local/bam_variant_calling_tumor_only_all/main.nf new file mode 100644 index 0000000000..31d968a245 --- /dev/null +++ b/subworkflows/local/bam_variant_calling_tumor_only_all/main.nf @@ -0,0 +1,200 @@ +// +// TUMOR ONLY VARIANT CALLING +// Should be only run on patients without normal sample +// + +include { BAM_VARIANT_CALLING_CNVKIT } from '../bam_variant_calling_cnvkit/main' +include { BAM_VARIANT_CALLING_FREEBAYES } from '../bam_variant_calling_freebayes/main' +include { BAM_VARIANT_CALLING_MPILEUP } from '../bam_variant_calling_mpileup/main' +include { BAM_VARIANT_CALLING_SINGLE_STRELKA } from '../bam_variant_calling_single_strelka/main' +include { BAM_VARIANT_CALLING_SINGLE_TIDDIT } from '../bam_variant_calling_single_tiddit/main' +include { 
BAM_VARIANT_CALLING_TUMOR_ONLY_CONTROLFREEC } from '../bam_variant_calling_tumor_only_controlfreec/main' +include { BAM_VARIANT_CALLING_TUMOR_ONLY_MANTA } from '../bam_variant_calling_tumor_only_manta/main' +include { BAM_VARIANT_CALLING_TUMOR_ONLY_MUTECT2 } from '../bam_variant_calling_tumor_only_mutect2/main' + +workflow BAM_VARIANT_CALLING_TUMOR_ONLY_ALL { + take: + tools // Mandatory, list of tools to apply + cram // channel: [mandatory] cram + bwa // channel: [optional] bwa + cf_chrom_len // channel: [optional] controlfreec length file + chr_files + cnvkit_reference + dbsnp // channel: [mandatory] dbsnp + dbsnp_tbi // channel: [mandatory] dbsnp_tbi + dict // channel: [mandatory] dict + fasta // channel: [mandatory] fasta + fasta_fai // channel: [mandatory] fasta_fai + germline_resource // channel: [optional] germline_resource + germline_resource_tbi // channel: [optional] germline_resource_tbi + intervals // channel: [mandatory] [ intervals, num_intervals ] or [ [], 0 ] if no intervals + intervals_bed_gz_tbi // channel: [mandatory] [ interval.bed.gz, interval.bed.gz.tbi, num_intervals ] or [ [], [], 0 ] if no intervals + intervals_bed_combined // channel: [mandatory] intervals/target regions in one file unzipped + intervals_bed_gz_tbi_combined // channel: [mandatory] intervals/target regions in one file zipped + mappability + panel_of_normals // channel: [optional] panel_of_normals + panel_of_normals_tbi // channel: [optional] panel_of_normals_tbi + joint_mutect2 // boolean: [mandatory] [default: false] run mutect2 in joint mode + wes // boolean: [mandatory] [default: false] whether targeted data is processed + + main: + versions = Channel.empty() + + //TODO: Temporary until the if's can be removed and printing to terminal is prevented with "when" in the modules.config + vcf_freebayes = Channel.empty() + vcf_manta = Channel.empty() + vcf_mpileup = Channel.empty() + vcf_mutect2 = Channel.empty() + vcf_strelka = Channel.empty() + vcf_tiddit = Channel.empty() + + // MPILEUP + if (tools.split(',').contains('mpileup') || tools.split(',').contains('controlfreec')) { + BAM_VARIANT_CALLING_MPILEUP( + cram, + dict, + fasta, + intervals + ) + vcf_mpileup = BAM_VARIANT_CALLING_MPILEUP.out.vcf + versions = versions.mix(BAM_VARIANT_CALLING_MPILEUP.out.versions) + } + + // CONTROLFREEC (depends on MPILEUP) + if (tools.split(',').contains('controlfreec')) { + length_file = cf_chrom_len ?: fasta_fai + intervals_controlfreec = wes ? 
intervals_bed_combined : [] + + BAM_VARIANT_CALLING_TUMOR_ONLY_CONTROLFREEC( + // Remap channel to match module/subworkflow + BAM_VARIANT_CALLING_MPILEUP.out.mpileup.map{ meta, pileup_tumor -> [ meta, [], pileup_tumor, [], [], [], [] ] }, + fasta, + length_file, + dbsnp, + dbsnp_tbi, + chr_files, + mappability, + intervals_controlfreec + ) + + versions = versions.mix(BAM_VARIANT_CALLING_TUMOR_ONLY_CONTROLFREEC.out.versions) + } + + // CNVKIT + if (tools.split(',').contains('cnvkit')) { + BAM_VARIANT_CALLING_CNVKIT ( + // Remap channel to match module/subworkflow + cram.map{ meta, cram, crai -> [ meta, cram, [] ] }, + fasta.map{ it -> [[id:it[0].baseName], it] }, + fasta_fai.map{ it -> [[id:it[0].baseName], it] }, + [[id:"null"], []], + cnvkit_reference.map{ it -> [[id:it[0].baseName], it] } + ) + + versions = versions.mix(BAM_VARIANT_CALLING_CNVKIT.out.versions) + } + + // FREEBAYES + if (tools.split(',').contains('freebayes')) { + BAM_VARIANT_CALLING_FREEBAYES( + // Remap channel to match module/subworkflow + cram.map{ meta, cram, crai -> [ meta, cram, crai, [], [] ] }, + dict, + fasta, + fasta_fai, + intervals + ) + + vcf_freebayes = BAM_VARIANT_CALLING_FREEBAYES.out.vcf + versions = versions.mix(BAM_VARIANT_CALLING_FREEBAYES.out.versions) + } + + // MUTECT2 + if (tools.split(',').contains('mutect2')) { + BAM_VARIANT_CALLING_TUMOR_ONLY_MUTECT2( + // Adjust meta.map to simplify joining channels + cram.map{ meta, cram, crai -> + joint_mutect2 ? + //we need to keep all fields and then remove on a per-tool-basis to ensure proper joining at the filtering step + [ meta - meta.subMap('data_type', 'status') + [ id:meta.patient ], cram, crai ] : + [ meta - meta.subMap('data_type', 'status'), cram, crai ] + }, + // Remap channel to match module/subworkflow + fasta.map{ it -> [ [ id:'fasta' ], it ] }, + // Remap channel to match module/subworkflow + fasta_fai.map{ it -> [ [ id:'fasta_fai' ], it ] }, + dict, + germline_resource, + germline_resource_tbi, + panel_of_normals, + panel_of_normals_tbi, + intervals, + joint_mutect2 + ) + + vcf_mutect2 = BAM_VARIANT_CALLING_TUMOR_ONLY_MUTECT2.out.vcf_filtered + versions = versions.mix(BAM_VARIANT_CALLING_TUMOR_ONLY_MUTECT2.out.versions) + } + + // MANTA + if (tools.split(',').contains('manta')) { + BAM_VARIANT_CALLING_TUMOR_ONLY_MANTA( + cram, + // Remap channel to match module/subworkflow + fasta.map{ it -> [ [ id:'fasta' ], it ] }, + fasta_fai.map{ it -> [ [ id:'fasta_fai' ], it ] }, + intervals_bed_gz_tbi_combined + + ) + + vcf_manta = BAM_VARIANT_CALLING_TUMOR_ONLY_MANTA.out.vcf + versions = versions.mix(BAM_VARIANT_CALLING_TUMOR_ONLY_MANTA.out.versions) + } + + // STRELKA + if (tools.split(',').contains('strelka')) { + BAM_VARIANT_CALLING_SINGLE_STRELKA( + cram, + dict, + fasta, + fasta_fai, + intervals_bed_gz_tbi + ) + + vcf_strelka = BAM_VARIANT_CALLING_SINGLE_STRELKA.out.vcf + versions = versions.mix(BAM_VARIANT_CALLING_SINGLE_STRELKA.out.versions) + } + + // TIDDIT + if (tools.split(',').contains('tiddit')) { + BAM_VARIANT_CALLING_SINGLE_TIDDIT( + cram, + // Remap channel to match module/subworkflow + fasta.map{ it -> [ [ id:'fasta' ], it ] }, + bwa + ) + + vcf_tiddit = BAM_VARIANT_CALLING_SINGLE_TIDDIT.out.vcf + versions = versions.mix(BAM_VARIANT_CALLING_SINGLE_TIDDIT.out.versions) + } + + vcf_all = Channel.empty().mix( + vcf_freebayes, + vcf_manta, + vcf_mutect2, + vcf_mpileup, + vcf_strelka, + vcf_tiddit + ) + + emit: + vcf_all + vcf_freebayes + vcf_manta + vcf_mpileup + vcf_mutect2 + vcf_strelka + vcf_tiddit + + versions = versions +} diff 
--git a/subworkflows/local/bam_variant_calling_tumor_only_controlfreec/main.nf b/subworkflows/local/bam_variant_calling_tumor_only_controlfreec/main.nf new file mode 100644 index 0000000000..993faf127c --- /dev/null +++ b/subworkflows/local/bam_variant_calling_tumor_only_controlfreec/main.nf @@ -0,0 +1,43 @@ +// +// CONTROLFREEC tumor-only variant calling +// +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include { CONTROLFREEC_FREEC as FREEC_TUMORONLY } from '../../../modules/nf-core/controlfreec/freec/main' +include { CONTROLFREEC_ASSESSSIGNIFICANCE as ASSESS_SIGNIFICANCE } from '../../../modules/nf-core/controlfreec/assesssignificance/main' +include { CONTROLFREEC_FREEC2BED as FREEC2BED } from '../../../modules/nf-core/controlfreec/freec2bed/main' +include { CONTROLFREEC_FREEC2CIRCOS as FREEC2CIRCOS } from '../../../modules/nf-core/controlfreec/freec2circos/main' +include { CONTROLFREEC_MAKEGRAPH as MAKEGRAPH } from '../../../modules/nf-core/controlfreec/makegraph/main' + +workflow BAM_VARIANT_CALLING_TUMOR_ONLY_CONTROLFREEC { + take: + controlfreec_input // channel: [mandatory] [meta, [], pileup_tumor, [], [], [], []] + fasta // channel: [mandatory] + fasta_fai // channel: [mandatory] + dbsnp // channel: [mandatory] + dbsnp_tbi // channel: [mandatory] + chr_files // channel: [mandatory] + mappability // channel: [mandatory] + intervals_bed // channel: [optional] Contains a bed file of all intervals combined provided with the cram input(s). Should be empty for WGS + + main: + + ch_versions = Channel.empty() + + FREEC_TUMORONLY(controlfreec_input, fasta, fasta_fai, [], dbsnp, dbsnp_tbi, chr_files, mappability, intervals_bed, []) + + ASSESS_SIGNIFICANCE(FREEC_TUMORONLY.out.CNV.join(FREEC_TUMORONLY.out.ratio, failOnDuplicate: true, failOnMismatch: true)) + FREEC2BED(FREEC_TUMORONLY.out.ratio) + FREEC2CIRCOS(FREEC_TUMORONLY.out.ratio) + MAKEGRAPH(FREEC_TUMORONLY.out.ratio.join(FREEC_TUMORONLY.out.BAF, failOnDuplicate: true, failOnMismatch: true)) + + ch_versions = ch_versions.mix(FREEC_TUMORONLY.out.versions) + ch_versions = ch_versions.mix(ASSESS_SIGNIFICANCE.out.versions) + ch_versions = ch_versions.mix(FREEC2BED.out.versions) + ch_versions = ch_versions.mix(FREEC2CIRCOS.out.versions) + ch_versions = ch_versions.mix(MAKEGRAPH.out.versions) + + emit: + versions = ch_versions +} diff --git a/subworkflows/local/bam_variant_calling_tumor_only_manta/main.nf b/subworkflows/local/bam_variant_calling_tumor_only_manta/main.nf new file mode 100644 index 0000000000..38f5d4366c --- /dev/null +++ b/subworkflows/local/bam_variant_calling_tumor_only_manta/main.nf @@ -0,0 +1,44 @@ +// +// MANTA single sample variant calling +// +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include { MANTA_TUMORONLY } from '../../../modules/nf-core/manta/tumoronly/main' + +// Seems to be the consensus on upstream modules implementation too +workflow BAM_VARIANT_CALLING_TUMOR_ONLY_MANTA { + take: + cram // channel: [mandatory] [ meta, cram, crai ] + fasta // channel: [mandatory] [ meta, fasta ] + fasta_fai // channel: [mandatory] [ meta, fasta_fai ] + intervals // channel: [mandatory] [ interval.bed.gz, interval.bed.gz.tbi ] or [ [], [] ] if no intervals + + main: + versions = Channel.empty() + + // Combine cram and intervals, account for 0 intervals + cram_intervals = cram.combine(intervals).map{ it -> + bed_gz = it.size() > 3 ? 
it[3] : [] + bed_tbi = it.size() > 3 ? it[4] : [] + + [it[0], it[1], it[2], bed_gz, bed_tbi] + } + + MANTA_TUMORONLY(cram_intervals, fasta, fasta_fai, []) + + small_indels_vcf = MANTA_TUMORONLY.out.candidate_small_indels_vcf + candidate_sv_vcf = MANTA_TUMORONLY.out.candidate_sv_vcf + tumor_sv_vcf = MANTA_TUMORONLY.out.tumor_sv_vcf + + // Only tumor sv should get annotated + // add variantcaller to meta map + vcf = tumor_sv_vcf.map{ meta, vcf -> [ meta + [ variantcaller:'manta' ], vcf ] } + + versions = versions.mix(MANTA_TUMORONLY.out.versions) + + emit: + vcf + + versions +} diff --git a/subworkflows/local/bam_variant_calling_tumor_only_mutect2/main.nf b/subworkflows/local/bam_variant_calling_tumor_only_mutect2/main.nf new file mode 100644 index 0000000000..9da171a4c1 --- /dev/null +++ b/subworkflows/local/bam_variant_calling_tumor_only_mutect2/main.nf @@ -0,0 +1,181 @@ +// +// GATK MUTECT2 in tumor only mode: getpileupsummaries, calculatecontamination and filtermutectcalls +// +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include { GATK4_MERGEVCFS as MERGE_MUTECT2 } from '../../../modules/nf-core/gatk4/mergevcfs/main' +include { GATK4_CALCULATECONTAMINATION as CALCULATECONTAMINATION } from '../../../modules/nf-core/gatk4/calculatecontamination/main' +include { GATK4_FILTERMUTECTCALLS as FILTERMUTECTCALLS } from '../../../modules/nf-core/gatk4/filtermutectcalls/main' +include { GATK4_GETPILEUPSUMMARIES as GETPILEUPSUMMARIES } from '../../../modules/nf-core/gatk4/getpileupsummaries/main' +include { GATK4_GATHERPILEUPSUMMARIES as GATHERPILEUPSUMMARIES } from '../../../modules/nf-core/gatk4/gatherpileupsummaries/main' +include { GATK4_LEARNREADORIENTATIONMODEL as LEARNREADORIENTATIONMODEL } from '../../../modules/nf-core/gatk4/learnreadorientationmodel/main' +include { GATK4_MERGEMUTECTSTATS as MERGEMUTECTSTATS } from '../../../modules/nf-core/gatk4/mergemutectstats/main' +include { GATK4_MUTECT2 as MUTECT2 } from '../../../modules/nf-core/gatk4/mutect2/main' + +workflow BAM_VARIANT_CALLING_TUMOR_ONLY_MUTECT2 { + take: + input // channel: [ meta, [ input ], [ input_index ] ] + fasta // channel: /path/to/reference/fasta + fai // channel: /path/to/reference/fasta/index + dict // channel: /path/to/reference/fasta/dictionary + germline_resource // channel: /path/to/germline/resource + germline_resource_tbi // channel: /path/to/germline/index + panel_of_normals // channel: /path/to/panel/of/normals + panel_of_normals_tbi // channel: /path/to/panel/of/normals/index + intervals // channel: [mandatory] [ intervals, num_intervals ] or [ [], 0 ] if no intervals + joint_mutect2 // boolean: [mandatory] [default: false] run mutect2 in joint mode + + main: + versions = Channel.empty() + + // If no germline resource is provided, create an empty channel so that GetPileupSummaries is not run + germline_resource_pileup = germline_resource_tbi ?
germline_resource : Channel.empty() + germline_resource_pileup_tbi = germline_resource_tbi ?: Channel.empty() + + // Combine input and intervals for spread and gather strategy + input_intervals = input.combine(intervals) + // Move num_intervals to meta map and reorganize channel for MUTECT2 module + .map{ meta, input, index, intervals, num_intervals -> [ meta + [ num_intervals:num_intervals ], input, index, intervals ] } + + if (joint_mutect2) { + // Perform variant calling using mutect2 module in joint (multi-sample) mode + // Group cram files by patient + input_joint = input + .map{ meta, input, index -> [ meta - meta.subMap('sample') + [ id:meta.patient ], input, index ] } + .groupTuple() + + // Add intervals for scatter-gather scaling + input_joint_intervals = input_joint.combine(intervals) + // Move num_intervals to meta map and reorganize channel for MUTECT2 module + .map{ meta, cram, crai, intervals, num_intervals -> [ meta + [ num_intervals:num_intervals ], cram, crai, intervals ] } + MUTECT2(input_joint_intervals, fasta, fai, dict, germline_resource, germline_resource_tbi, panel_of_normals, panel_of_normals_tbi) + } + else { + // Perform variant calling using mutect2 module in tumor single mode + MUTECT2(input_intervals, fasta, fai, dict, germline_resource, germline_resource_tbi, panel_of_normals, panel_of_normals_tbi) + } + + // Figuring out if there is one or more vcf(s) from the same sample + vcf_branch = MUTECT2.out.vcf.branch{ + // Use meta.num_intervals to assess the number of intervals + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 + } + + // Figuring out if there is one or more tbi(s) from the same sample + tbi_branch = MUTECT2.out.tbi.branch{ + // Use meta.num_intervals to assess the number of intervals + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 + } + + // Figuring out if there is one or more stats(s) from the same sample + stats_branch = MUTECT2.out.stats.branch{ + // Use meta.num_intervals to assess the number of intervals + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 + } + + // Figuring out if there is one or more f1r2(s) from the same sample + f1r2_branch = MUTECT2.out.f1r2.branch{ + // Use meta.num_intervals to assess the number of intervals + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 + } + + // Only when using intervals + vcf_to_merge = vcf_branch.intervals.map{ meta, vcf -> [ groupKey(meta, meta.num_intervals), vcf ] }.groupTuple() + stats_to_merge = stats_branch.intervals.map{ meta, stats -> [ groupKey(meta, meta.num_intervals), stats ] }.groupTuple() + f1r2_to_merge = f1r2_branch.intervals.map{ meta, f1r2 -> [ groupKey(meta, meta.num_intervals), f1r2 ] }.groupTuple() + + MERGE_MUTECT2(vcf_to_merge, dict) + MERGEMUTECTSTATS(stats_to_merge) + + // Mix intervals and no_intervals channels together + // Remove unnecessary metadata + vcf = Channel.empty().mix(MERGE_MUTECT2.out.vcf, vcf_branch.no_intervals).map{ meta, vcf -> [ meta - meta.subMap('num_intervals'), vcf ] } + tbi = Channel.empty().mix(MERGE_MUTECT2.out.tbi, tbi_branch.no_intervals).map{ meta, tbi -> [ meta - meta.subMap('num_intervals'), tbi ] } + stats = Channel.empty().mix(MERGEMUTECTSTATS.out.stats, stats_branch.no_intervals).map{ meta, stats -> [ meta - meta.subMap('num_intervals'), stats ] } + f1r2 = Channel.empty().mix(f1r2_to_merge, f1r2_branch.no_intervals).map{ meta, f1r2 -> [ meta - meta.subMap('num_intervals'), f1r2 ] }
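Reviewer note: the `groupKey`/`groupTuple` idiom above is what lets the gather step emit each sample as soon as all of its interval shards have arrived, instead of waiting for the whole channel to complete. A minimal, runnable sketch with made-up sample names and shard counts:

```nextflow
workflow {
    // pretend each sample was scattered over meta.num_intervals shards
    shards = Channel.of(
        [ [ id:'sampleA', num_intervals:2 ], 'A.shard1.vcf' ],
        [ [ id:'sampleA', num_intervals:2 ], 'A.shard2.vcf' ],
        [ [ id:'sampleB', num_intervals:1 ], 'B.vcf' ] )

    branched = shards.branch{
        intervals:    it[0].num_intervals > 1
        no_intervals: it[0].num_intervals <= 1
    }

    // groupKey(meta, size) tells groupTuple how many items to expect,
    // so each group is emitted as soon as it is complete
    branched.intervals
        .map{ meta, vcf -> [ groupKey(meta, meta.num_intervals), vcf ] }
        .groupTuple()
        .mix(branched.no_intervals)
        .view()
}
```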
// Generate artifact priors using learnreadorientationmodel on the f1r2 output of mutect2 + LEARNREADORIENTATIONMODEL(f1r2) + + pileup_input = input_intervals.map{ meta, cram, crai, intervals -> [ meta + [ id:meta.sample ], cram, crai, intervals] }.unique() + + // Generate pileup summary table using getpileupsummaries + GETPILEUPSUMMARIES(pileup_input, fasta, fai, dict, germline_resource_pileup, germline_resource_pileup_tbi) + + // Figuring out if there is one or more table(s) from the same sample + pileup_table_branch = GETPILEUPSUMMARIES.out.table.branch{ + // Use meta.num_intervals to assess the number of intervals + intervals: it[0].num_intervals > 1 + no_intervals: it[0].num_intervals <= 1 + } + + // Only when using intervals + pileup_table_to_merge = pileup_table_branch.intervals.map{ meta, table -> [ groupKey(meta, meta.num_intervals), table ] }.groupTuple() + + GATHERPILEUPSUMMARIES(pileup_table_to_merge, dict.map{ meta, dict -> [ dict ] }) + + // Mix intervals and no_intervals channels together + pileup_table = Channel.empty().mix(GATHERPILEUPSUMMARIES.out.table, pileup_table_branch.no_intervals).map{meta, table -> [ meta - meta.subMap('num_intervals') + [id:meta.sample], table ] } + + // Contamination and segmentation tables created using calculatecontamination on the pileup summary table + CALCULATECONTAMINATION(pileup_table.map{ meta, table -> [ meta, table, [] ] }) + + // Initialize empty channels: contamination calculation runs on the pileup table, which is not produced if no germline resource is provided + calculatecontamination_out_seg = Channel.empty() + calculatecontamination_out_cont = Channel.empty() + + if (joint_mutect2) { + // Group tables by samples + calculatecontamination_out_seg = CALCULATECONTAMINATION.out.segmentation.map{ meta, seg -> [ meta - meta.subMap('sample', 'num_intervals') + [id:meta.patient], seg ] }.groupTuple() + calculatecontamination_out_cont = CALCULATECONTAMINATION.out.contamination.map{ meta, cont -> [ meta - meta.subMap('sample', 'num_intervals') + [id:meta.patient], cont ] }.groupTuple() + } else { + // Regular single sample mode + calculatecontamination_out_seg = CALCULATECONTAMINATION.out.segmentation.map{ meta, seg -> [ meta - meta.subMap('num_intervals'), seg ] } + calculatecontamination_out_cont = CALCULATECONTAMINATION.out.contamination.map{ meta, cont -> [ meta - meta.subMap('num_intervals'), cont ] } + } + + // Mutect2 calls filtered by filtermutectcalls using the contamination and segmentation tables + vcf_to_filter = vcf.join(tbi, failOnDuplicate: true, failOnMismatch: true) + .join(stats, failOnDuplicate: true, failOnMismatch: true) + .join(LEARNREADORIENTATIONMODEL.out.artifactprior, failOnDuplicate: true, failOnMismatch: true) + .join(calculatecontamination_out_seg) + .join(calculatecontamination_out_cont) + .map{ meta, vcf, tbi, stats, artifactprior, seg, cont -> [ meta, vcf, tbi, stats, artifactprior, seg, cont, [] ] } + + FILTERMUTECTCALLS(vcf_to_filter, fasta, fai, dict) + + vcf_filtered = FILTERMUTECTCALLS.out.vcf + // add variantcaller to meta map + .map{ meta, vcf -> [ meta + [ variantcaller:'mutect2' ], vcf ] } + + versions = versions.mix(MERGE_MUTECT2.out.versions) + versions = versions.mix(CALCULATECONTAMINATION.out.versions) + versions = versions.mix(FILTERMUTECTCALLS.out.versions) + versions = versions.mix(GETPILEUPSUMMARIES.out.versions) + versions = versions.mix(GATHERPILEUPSUMMARIES.out.versions) + versions = versions.mix(LEARNREADORIENTATIONMODEL.out.versions) + versions = versions.mix(MERGEMUTECTSTATS.out.versions) + versions = versions.mix(MUTECT2.out.versions)
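Reviewer note: the `failOnDuplicate`/`failOnMismatch` options on the join chain building `vcf_to_filter` above turn silent channel mismatches into hard errors. A small, runnable illustration (values invented):

```nextflow
workflow {
    vcf   = Channel.of( [ [ id:'s1' ], 's1.vcf.gz' ] )
    tbi   = Channel.of( [ [ id:'s1' ], 's1.vcf.gz.tbi' ] )
    stats = Channel.of( [ [ id:'s1' ], 's1.stats' ] )

    // joins are keyed on the first element (the meta map);
    // an unpaired or duplicated key aborts the run instead of dropping data
    vcf.join(tbi, failOnDuplicate: true, failOnMismatch: true)
        .join(stats, failOnDuplicate: true, failOnMismatch: true)
        .view() // -> [ [id:'s1'], 's1.vcf.gz', 's1.vcf.gz.tbi', 's1.stats' ]
}
```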
+ emit: + vcf // channel: [ meta, vcf ] + stats // channel: [ meta, stats ] + + vcf_filtered // channel: [ meta, vcf ] + index_filtered = FILTERMUTECTCALLS.out.tbi // channel: [ meta, tbi ] + stats_filtered = FILTERMUTECTCALLS.out.stats // channel: [ meta, stats ] + + artifact_priors = LEARNREADORIENTATIONMODEL.out.artifactprior // channel: [ meta, artifactprior ] + + pileup_table // channel: [ meta, table ] + + contamination_table = calculatecontamination_out_cont // channel: [ meta, contamination ] + segmentation_table = calculatecontamination_out_seg // channel: [ meta, segmentation ] + + versions // channel: [ versions.yml ] +} diff --git a/subworkflows/local/bam_variant_calling_tumor_only_mutect2/meta.yml b/subworkflows/local/bam_variant_calling_tumor_only_mutect2/meta.yml new file mode 100644 index 0000000000..4c41f1f261 --- /dev/null +++ b/subworkflows/local/bam_variant_calling_tumor_only_mutect2/meta.yml @@ -0,0 +1,108 @@ +name: gatk_tumor_only_somatic_variant_calling +description: | + Perform variant calling on a single tumor sample using mutect2 tumor only mode. + Run the input bam file through getpileupsummaries and then calculatecontamination to get the contamination and segmentation tables. + Filter the mutect2 output vcf using filtermutectcalls and the contamination & segmentation tables for additional filtering. +keywords: + - gatk4 + - mutect2 + - getpileupsummaries + - calculatecontamination + - filtermutectcalls + - variant_calling + - tumor_only + - filtered_vcf +modules: + - gatk4/mutect2 + - gatk4/getpileupsummaries + - gatk4/calculatecontamination + - gatk4/filtermutectcalls +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - input: + type: list + description: list containing one BAM file, also able to take CRAM as an input + pattern: "[ *.{bam/cram} ]" + - input_index: + type: list + description: list containing one BAM file index, also able to take a CRAM index as an input + pattern: "[ *.{bam.bai/cram.crai} ]" + - fasta: + type: file + description: The reference fasta file + pattern: "*.fasta" + - fai: + type: file + description: Index of reference fasta file + pattern: "*.fasta.fai" + - dict: + type: file + description: GATK sequence dictionary + pattern: "*.dict" + - germline_resource: + type: file + description: Population vcf of germline sequencing, containing allele fractions. + pattern: "*.vcf.gz" + - germline_resource_tbi: + type: file + description: Index file for the germline resource. + pattern: "*.vcf.gz.tbi" + - panel_of_normals: + type: file + description: vcf file to be used as a panel of normals. + pattern: "*.vcf.gz" + - panel_of_normals_tbi: + type: file + description: Index for the panel of normals. + pattern: "*.vcf.gz.tbi" + - interval_file: + type: file + description: File containing intervals. + pattern: "*.interval_list" +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - mutect2_vcf: + type: file + description: Compressed vcf file to be used for variant_calling. + pattern: "[ *.vcf.gz ]" + - mutect2_tbi: + type: file + description: Indexes of the mutect2_vcf file + pattern: "[ *vcf.gz.tbi ]" + - mutect2_stats: + type: file + description: Stats files for the mutect2 vcf + pattern: "[ *vcf.gz.stats ]" + - pileup_table: + type: file + description: File containing the pileup summary table.
+ pattern: "*.pileups.table" + - contamination_table: + type: file + description: File containing the contamination table. + pattern: "*.contamination.table" + - segmentation_table: + type: file + description: Output table containing segmentation of tumor minor allele fractions. + pattern: "*.segmentation.table" + - filtered_vcf: + type: file + description: file containing filtered mutect2 calls. + pattern: "*.vcf.gz" + - filtered_tbi: + type: file + description: tbi file that pairs with filtered vcf. + pattern: "*.vcf.gz.tbi" + - filtered_stats: + type: file + description: file containing statistics of the filtermutectcalls run. + pattern: "*.filteringStats.tsv" +authors: + - "@GCJMackenzie" diff --git a/subworkflows/local/channel_align_create_csv/main.nf b/subworkflows/local/channel_align_create_csv/main.nf new file mode 100644 index 0000000000..692ffa0ef4 --- /dev/null +++ b/subworkflows/local/channel_align_create_csv/main.nf @@ -0,0 +1,24 @@ +// +// CHANNEL_ALIGN_CREATE_CSV +// + +workflow CHANNEL_ALIGN_CREATE_CSV { + take: + bam_indexed // channel: [mandatory] meta, bam, bai + + main: + // Creating csv files to restart from this step + bam_indexed.collectFile(keepHeader: true, skip: 1, sort: true, storeDir: "${params.outdir}/csv") { meta, bam, bai -> + patient = meta.patient + sample = meta.sample + sex = meta.sex + status = meta.status + bam = "${params.outdir}/preprocessing/mapped/${sample}/${bam.name}" + bai = "${params.outdir}/preprocessing/mapped/${sample}/${bai.name}" + + type = params.save_output_as_bam ? "bam" : "cram" + type_index = params.save_output_as_bam ? "bai" : "crai" + + ["mapped.csv", "patient,sex,status,sample,${type},${type_index}\n${patient},${sex},${status},${sample},${bam},${bai}\n"] + } +} diff --git a/subworkflows/local/channel_applybqsr_create_csv/main.nf b/subworkflows/local/channel_applybqsr_create_csv/main.nf new file mode 100644 index 0000000000..2396574ced --- /dev/null +++ b/subworkflows/local/channel_applybqsr_create_csv/main.nf @@ -0,0 +1,24 @@ +// +// CHANNEL_APPLYBQSR_CREATE_CSV +// + +workflow CHANNEL_APPLYBQSR_CREATE_CSV { + take: + cram_recalibrated_index // channel: [mandatory] meta, cram, crai + + main: + // Creating csv files to restart from this step + cram_recalibrated_index.collectFile(keepHeader: true, skip: 1, sort: true, storeDir: "${params.outdir}/csv") { meta, file, index -> + patient = meta.patient + sample = meta.sample + sex = meta.sex + status = meta.status + file = "${params.outdir}/preprocessing/recalibrated/${sample}/${file.name}" + index = "${params.outdir}/preprocessing/recalibrated/${sample}/${index.name}" + + type = params.save_output_as_bam ? "bam" : "cram" + type_index = params.save_output_as_bam ? 
"bai" : "crai" + + ["recalibrated.csv", "patient,sex,status,sample,${type},${type_index}\n${patient},${sex},${status},${sample},${file},${index}\n"] + } +} diff --git a/subworkflows/local/channel_baserecalibrator_create_csv/main.nf b/subworkflows/local/channel_baserecalibrator_create_csv/main.nf new file mode 100644 index 0000000000..8bbfacc85c --- /dev/null +++ b/subworkflows/local/channel_baserecalibrator_create_csv/main.nf @@ -0,0 +1,69 @@ +// +// CHANNEL_BASERECALIBRATOR_CREATE_CSV +// + +workflow CHANNEL_BASERECALIBRATOR_CREATE_CSV { + take: + cram_table_bqsr // channel: [mandatory] meta, cram, crai, table + tools + skip_tools + save_output_as_bam + outdir + + main: + // Creating csv files to restart from this step + if ( tools && tools.split(',').contains('sentieon_dedup') ) { + cram_table_bqsr.collectFile(keepHeader: true, skip: 1, sort: true, storeDir: "${outdir}/csv") { meta, cram, crai, table -> + + patient = meta.patient + sample = meta.sample + sex = meta.sex + status = meta.status + suffix_aligned = save_output_as_bam ? "bam" : "cram" + suffix_index = save_output_as_bam ? "bam.bai" : "cram.crai" + cram = "${outdir}/preprocessing/sentieon_dedup/${sample}/${cram.baseName}.${suffix_aligned}" + crai = "${outdir}/preprocessing/sentieon_dedup/${sample}/${crai.baseName.minus(".cram")}.${suffix_index}" + table = "${outdir}/preprocessing/recal_table/${sample}/${sample}.recal.table" + + type = save_output_as_bam ? "bam" : "cram" + type_index = save_output_as_bam ? "bai" : "crai" + + ["markduplicates.csv", "patient,sex,status,sample,${type},${type_index},table\n${patient},${sex},${status},${sample},${cram},${crai},${table}\n"] + } + } else if (!(skip_tools && (skip_tools.split(',').contains('markduplicates')))) { + cram_table_bqsr.collectFile(keepHeader: true, skip: 1, sort: true, storeDir: "${outdir}/csv") { meta, cram, crai, table -> + + patient = meta.patient + sample = meta.sample + sex = meta.sex + status = meta.status + suffix_aligned = save_output_as_bam ? "bam" : "cram" + suffix_index = save_output_as_bam ? "bam.bai" : "cram.crai" + cram = "${outdir}/preprocessing/markduplicates/${sample}/${cram.baseName}.${suffix_aligned}" + crai = "${outdir}/preprocessing/markduplicates/${sample}/${crai.baseName.minus(".cram")}.${suffix_index}" + table = "${outdir}/preprocessing/recal_table/${sample}/${sample}.recal.table" + + type = save_output_as_bam ? "bam" : "cram" + type_index = save_output_as_bam ? "bai" : "crai" + + ["markduplicates.csv", "patient,sex,status,sample,${type},${type_index},table\n${patient},${sex},${status},${sample},${cram},${crai},${table}\n"] + } + } else { + cram_table_bqsr.collectFile(keepHeader: true, skip: 1, sort: true, storeDir: "${outdir}/csv") { meta, cram, crai, table -> + patient = meta.patient + sample = meta.sample + sex = meta.sex + status = meta.status + suffix_aligned = save_output_as_bam ? "bam" : "cram" + suffix_index = save_output_as_bam ? "bam.bai" : "cram.crai" + cram = "${outdir}/preprocessing/${sample}/mapped/${cram.baseName}.${suffix_aligned}" + crai = "${outdir}/preprocessing/${sample}/mapped/${crai.baseName.minus(".cram")}.${suffix_index}" + table = "${outdir}/preprocessing/${sample}/recal_table/${sample}.recal.table" + + type = save_output_as_bam ? "bam" : "cram" + type_index = save_output_as_bam ? 
"bai" : "crai" + + ["sorted.csv", "patient,sex,status,sample,${type},${type_index},table\n${patient},${sex},${status},${sample},${cram},${crai},${table}\n"] + } + } +} diff --git a/subworkflows/local/channel_markduplicates_create_csv/main.nf b/subworkflows/local/channel_markduplicates_create_csv/main.nf new file mode 100644 index 0000000000..06e9a9826b --- /dev/null +++ b/subworkflows/local/channel_markduplicates_create_csv/main.nf @@ -0,0 +1,29 @@ +// +// CHANNEL_MARKDUPLICATES_CREATE_CSV +// + +workflow CHANNEL_MARKDUPLICATES_CREATE_CSV { + take: + cram_markduplicates // channel: [mandatory] meta, cram, crai + csv_subfolder + outdir + save_output_as_bam + + main: + // Creating csv files to restart from this step + cram_markduplicates.collectFile(keepHeader: true, skip: 1, sort: true, storeDir: "${outdir}/csv") { meta, file, index -> + patient = meta.patient + sample = meta.sample + sex = meta.sex + status = meta.status + suffix_aligned = save_output_as_bam ? "bam" : "cram" + suffix_index = save_output_as_bam ? "bam.bai" : "cram.crai" + file = "${outdir}/preprocessing/${csv_subfolder}/${sample}/${file.baseName}.${suffix_aligned}" + index = "${outdir}/preprocessing/${csv_subfolder}/${sample}/${index.baseName.minus(".cram")}.${suffix_index}" + + type = save_output_as_bam ? "bam" : "cram" + type_index = save_output_as_bam ? "bai" : "crai" + + ["markduplicates_no_table.csv", "patient,sex,status,sample,${type},${type_index}\n${patient},${sex},${status},${sample},${file},${index}\n"] + } +} diff --git a/subworkflows/local/channel_variant_calling_create_csv/main.nf b/subworkflows/local/channel_variant_calling_create_csv/main.nf new file mode 100644 index 0000000000..b8de11bf8b --- /dev/null +++ b/subworkflows/local/channel_variant_calling_create_csv/main.nf @@ -0,0 +1,18 @@ +// +// CHANNEL_VARIANT_CALLING_CREATE_CSV +// + +workflow CHANNEL_VARIANT_CALLING_CREATE_CSV { + take: + vcf_to_annotate // channel: [mandatory] meta, vcf + + main: + // Creating csv files to restart from this step + vcf_to_annotate.collectFile(keepHeader: true, skip: 1,sort: true, storeDir: "${params.outdir}/csv"){ meta, vcf -> + patient = meta.patient + sample = meta.id + variantcaller = meta.variantcaller + vcf = "${params.outdir}/variant_calling/${variantcaller}/${meta.id}/${vcf.getName()}" + ["variantcalled.csv", "patient,sample,variantcaller,vcf\n${patient},${sample},${variantcaller},${vcf}\n"] + } +} diff --git a/subworkflows/local/cram_merge_index_samtools/main.nf b/subworkflows/local/cram_merge_index_samtools/main.nf new file mode 100644 index 0000000000..b808c8edc6 --- /dev/null +++ b/subworkflows/local/cram_merge_index_samtools/main.nf @@ -0,0 +1,47 @@ +// +// MERGE INDEX CRAM +// +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include { SAMTOOLS_INDEX as INDEX_CRAM } from '../../../modules/nf-core/samtools/index/main' +include { SAMTOOLS_MERGE as MERGE_CRAM } from '../../../modules/nf-core/samtools/merge/main' + +workflow CRAM_MERGE_INDEX_SAMTOOLS { + take: + cram // channel: [mandatory] meta, cram + fasta // channel: [mandatory] fasta + fasta_fai // channel: [mandatory] fai for fasta + + main: + versions = Channel.empty() + + // Figuring out if there is one or more cram(s) from the same sample + cram_to_merge = cram.branch{ meta, cram -> + // cram is a list, so use cram.size() to asses number of intervals + single: cram.size() <= 1 + return [ meta, cram[0] ] + multiple: cram.size() > 1 + } + + // Only when using intervals + 
MERGE_CRAM(cram_to_merge.multiple, fasta.map{ it -> [ [ id:'fasta' ], it ] }, fasta_fai.map{ it -> [ [ id:'fasta_fai' ], it ] }) + + // Mix merged and single-cram channels together + cram_all = MERGE_CRAM.out.cram.mix(cram_to_merge.single) + + // Index cram + INDEX_CRAM(cram_all) + + // Join with the crai file + cram_crai = cram_all.join(INDEX_CRAM.out.crai, failOnDuplicate: true, failOnMismatch: true) + + // Gather versions of all tools used + versions = versions.mix(INDEX_CRAM.out.versions.first()) + versions = versions.mix(MERGE_CRAM.out.versions.first()) + + emit: + cram_crai + + versions +} diff --git a/subworkflows/local/cram_qc_mosdepth_samtools/main.nf b/subworkflows/local/cram_qc_mosdepth_samtools/main.nf new file mode 100644 index 0000000000..fd070a6817 --- /dev/null +++ b/subworkflows/local/cram_qc_mosdepth_samtools/main.nf @@ -0,0 +1,38 @@ +// +// QC on CRAM +// +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include { SAMTOOLS_STATS } from '../../../modules/nf-core/samtools/stats/main' +include { MOSDEPTH } from '../../../modules/nf-core/mosdepth/main' + +workflow CRAM_QC_MOSDEPTH_SAMTOOLS { + take: + cram // channel: [mandatory] [ meta, cram, crai ] + fasta // channel: [mandatory] [ fasta ] + intervals + + main: + versions = Channel.empty() + reports = Channel.empty() + + // Reports run on cram + SAMTOOLS_STATS(cram, fasta.map{ it -> [ [ id:'fasta' ], it ] }) + + MOSDEPTH(cram.combine(intervals.map{ meta, bed -> [ bed?:[] ] }), fasta.map{ it -> [ [ id:'fasta' ], it ] }) + + // Gather all reports generated + reports = reports.mix(SAMTOOLS_STATS.out.stats) + reports = reports.mix(MOSDEPTH.out.global_txt) + reports = reports.mix(MOSDEPTH.out.regions_txt) + + // Gather versions of all tools used + versions = versions.mix(MOSDEPTH.out.versions) + versions = versions.mix(SAMTOOLS_STATS.out.versions.first()) + + emit: + reports + + versions // channel: [ versions.yml ] +} diff --git a/subworkflows/local/cram_sampleqc/main.nf b/subworkflows/local/cram_sampleqc/main.nf new file mode 100644 index 0000000000..f00e18aafe --- /dev/null +++ b/subworkflows/local/cram_sampleqc/main.nf @@ -0,0 +1,48 @@ +include { BAM_NGSCHECKMATE } from '../../../subworkflows/nf-core/bam_ngscheckmate/main' +include { CRAM_QC_MOSDEPTH_SAMTOOLS as CRAM_QC_RECAL } from '../../../subworkflows/local/cram_qc_mosdepth_samtools/main' +workflow CRAM_SAMPLEQC { + + take: + ch_cram // channel: [ val(meta), cram, crai ] + ngscheckmate_bed // channel: [ ngscheckmate_bed ] + fasta // channel: [ fasta ] + skip_baserecalibration // boolean: + intervals_for_preprocessing // channel: + + main: + + ch_versions = Channel.empty() + reports = Channel.empty() + + if(!skip_baserecalibration){ + + CRAM_QC_RECAL( + ch_cram, + fasta, + intervals_for_preprocessing) + + // Gather QC reports + reports = CRAM_QC_RECAL.out.reports.collect{ meta, report -> report } + + // Gather used software versions + ch_versions = ch_versions.mix(CRAM_QC_RECAL.out.versions) + } + + ch_ngscheckmate_bed = ngscheckmate_bed.map{bed -> [[id: "ngscheckmate"], bed]} + + ch_fasta = fasta.map{fasta -> [[id: "genome"], fasta]} + + BAM_NGSCHECKMATE ( ch_cram.map{meta, cram, crai -> [meta, cram]}, ch_ngscheckmate_bed, ch_fasta) + ch_versions = ch_versions.mix(BAM_NGSCHECKMATE.out.versions.first()) + + emit: + corr_matrix = BAM_NGSCHECKMATE.out.corr_matrix // channel: [ meta, corr_matrix ] + matched = BAM_NGSCHECKMATE.out.matched // channel: [ meta, matched ] + all =
BAM_NGSCHECKMATE.out.all // channel: [ meta, all ] + vcf = BAM_NGSCHECKMATE.out.vcf // channel: [ meta, vcf ] + pdf = BAM_NGSCHECKMATE.out.pdf // channel: [ meta, pdf ] + reports + + versions = ch_versions // channel: [ versions.yml ] +} + diff --git a/subworkflows/local/download_cache_snpeff_vep/main.nf b/subworkflows/local/download_cache_snpeff_vep/main.nf new file mode 100644 index 0000000000..f9f776db7c --- /dev/null +++ b/subworkflows/local/download_cache_snpeff_vep/main.nf @@ -0,0 +1,34 @@ +// +// DOWNLOAD CACHE SNPEFF VEP +// + +// Initialize channels based on params or indices that were just built +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run +// Condition is based on params.step and params.tools +// If an extra condition exists, it's specified in comments + +include { ENSEMBLVEP_DOWNLOAD } from '../../../modules/nf-core/ensemblvep/download/main' +include { SNPEFF_DOWNLOAD } from '../../../modules/nf-core/snpeff/download/main' + +workflow DOWNLOAD_CACHE_SNPEFF_VEP { + take: + ensemblvep_info + snpeff_info + + main: + versions = Channel.empty() + + ENSEMBLVEP_DOWNLOAD(ensemblvep_info) + SNPEFF_DOWNLOAD(snpeff_info) + + // Gather versions of all tools used + versions = versions.mix(ENSEMBLVEP_DOWNLOAD.out.versions) + versions = versions.mix(SNPEFF_DOWNLOAD.out.versions) + + emit: + ensemblvep_cache = ENSEMBLVEP_DOWNLOAD.out.cache.collect() // channel: [ meta, cache ] + snpeff_cache = SNPEFF_DOWNLOAD.out.cache.collect() // channel: [ meta, cache ] + + versions // channel: [ versions.yml ] +} diff --git a/subworkflows/local/fastq_align_bwamem_mem2_dragmap_sentieon/main.nf b/subworkflows/local/fastq_align_bwamem_mem2_dragmap_sentieon/main.nf new file mode 100644 index 0000000000..c61ac7f4cc --- /dev/null +++ b/subworkflows/local/fastq_align_bwamem_mem2_dragmap_sentieon/main.nf @@ -0,0 +1,56 @@ +// +// MAPPING +// +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include { BWAMEM2_MEM } from '../../../modules/nf-core/bwamem2/mem/main' +include { BWA_MEM as BWAMEM1_MEM } from '../../../modules/nf-core/bwa/mem/main' +include { DRAGMAP_ALIGN } from '../../../modules/nf-core/dragmap/align/main' +include { SENTIEON_BWAMEM } from '../../../modules/nf-core/sentieon/bwamem/main' + +workflow FASTQ_ALIGN_BWAMEM_MEM2_DRAGMAP_SENTIEON { + take: + reads // channel: [mandatory] meta, reads + index // channel: [mandatory] index + sort // boolean: [mandatory] true -> sort, false -> don't sort + fasta + fasta_fai + + main: + + versions = Channel.empty() + reports = Channel.empty() + + // Only one of the following should be run + BWAMEM1_MEM(reads, index.map{ it -> [ [ id:'index' ], it ] }, sort) // If aligner is bwa-mem + BWAMEM2_MEM(reads, index.map{ it -> [ [ id:'index' ], it ] }, sort) // If aligner is bwa-mem2 + DRAGMAP_ALIGN(reads, index.map{ it -> [ [ id:'index' ], it ] }, sort) // If aligner is dragmap + // The sentieon-bwamem-module does sorting as part of the conversion from sam to bam.
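// Reviewer note: all four aligner calls here are unconditional; which one actually
// executes is decided by ext.when selectors in conf/modules.config, along the lines
// of this hypothetical excerpt (not the literal config):
// withName: 'BWAMEM1_MEM' { ext.when = { params.aligner == 'bwa-mem' } }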
+ SENTIEON_BWAMEM(reads, index.map{ it -> [ [ id:'index' ], it ] }, fasta.map{fa -> [[:], fa]}, fasta_fai.map{fai -> [[:], fai]}) // If aligner is sentieon-bwamem + + // Get the bam files from the aligner + // Only one aligner is run + bam = Channel.empty() + bam = bam.mix(BWAMEM1_MEM.out.bam) + bam = bam.mix(BWAMEM2_MEM.out.bam) + bam = bam.mix(DRAGMAP_ALIGN.out.bam) + bam = bam.mix(SENTIEON_BWAMEM.out.bam_and_bai.map{ meta, bam, bai -> [ meta, bam ] }) + + bai = SENTIEON_BWAMEM.out.bam_and_bai.map{ meta, bam, bai -> [ meta, bai ] } + + // Gather reports of all tools used + reports = reports.mix(DRAGMAP_ALIGN.out.log) + + // Gather versions of all tools used + versions = versions.mix(BWAMEM1_MEM.out.versions) + versions = versions.mix(BWAMEM2_MEM.out.versions) + versions = versions.mix(DRAGMAP_ALIGN.out.versions) + versions = versions.mix(SENTIEON_BWAMEM.out.versions) + + emit: + bam // channel: [ [meta], bam ] + bai // channel: [ [meta], bai ] + reports + versions // channel: [ versions.yml ] +} diff --git a/subworkflows/local/fastq_create_umi_consensus_fgbio/main.nf b/subworkflows/local/fastq_create_umi_consensus_fgbio/main.nf new file mode 100644 index 0000000000..c237e64014 --- /dev/null +++ b/subworkflows/local/fastq_create_umi_consensus_fgbio/main.nf @@ -0,0 +1,67 @@ +// +// Runs FGBIO tools to remove UMI tags from FASTQ reads +// Convert them to unmapped BAM file, map them to the reference genome, +// use the mapped information to group UMIs and generate consensus reads +// +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include { FGBIO_CALLMOLECULARCONSENSUSREADS as CALLUMICONSENSUS } from '../../../modules/nf-core/fgbio/callmolecularconsensusreads/main.nf' +include { FGBIO_FASTQTOBAM as FASTQTOBAM } from '../../../modules/nf-core/fgbio/fastqtobam/main' +include { FGBIO_GROUPREADSBYUMI as GROUPREADSBYUMI } from '../../../modules/nf-core/fgbio/groupreadsbyumi/main' +include { FASTQ_ALIGN_BWAMEM_MEM2_DRAGMAP_SENTIEON as ALIGN_UMI } from '../fastq_align_bwamem_mem2_dragmap_sentieon/main' +include { SAMBLASTER } from '../../../modules/nf-core/samblaster/main' +include { SAMTOOLS_BAM2FQ as BAM2FASTQ } from '../../../modules/nf-core/samtools/bam2fq/main.nf' + +workflow FASTQ_CREATE_UMI_CONSENSUS_FGBIO { + take: + reads // channel: [mandatory] [ val(meta), [ reads ] ] + fasta // channel: [mandatory] /path/to/reference/fasta + fai // channel: [optional] /path/to/reference/fasta_fai, needed for Sentieon + map_index // channel: [mandatory] Pre-computed mapping index + groupreadsbyumi_strategy // string: [mandatory] grouping strategy - default: "Adjacency" + + main: + ch_versions = Channel.empty() + + // params.umi_read_structure is passed out as ext.args + // FASTQ reads are converted into a tagged unmapped BAM file (uBAM) + FASTQTOBAM(reads) + + // in order to map uBAM using BWA MEM, we need to convert uBAM to FASTQ + // TODO check if DRAGMAP works well with BAM inputs + // but keep the appropriate UMI tags in the FASTQ comment field and produce + // an interleaved FASTQ file (hence, split = false) + split = false + BAM2FASTQ(FASTQTOBAM.out.bam, split) + + // appropriately tagged interleaved FASTQ reads are mapped to the reference + // bams will not be sorted (hence, sort = false) + sort = false + ALIGN_UMI(BAM2FASTQ.out.reads, map_index, sort, fasta, fai) + + // samblaster is used in order to tag mates information in the BAM file + // this is used in order to group reads by UMI +
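// Reviewer note (behaviour summarised from the fgbio GroupReadsByUmi documentation,
// not from this diff): with the default 'Adjacency' strategy, UMIs within a small
// edit distance of a more frequent UMI are clustered into the same group, so
// sequencing errors in the UMI itself do not split read families.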
SAMBLASTER(ALIGN_UMI.out.bam) + + // appropriately tagged reads are now grouped by UMI information + GROUPREADSBYUMI(SAMBLASTER.out.bam, groupreadsbyumi_strategy) + + // Using newly created groups + // To call a consensus across reads in the same group + // And emit a consensus BAM file + CALLUMICONSENSUS(GROUPREADSBYUMI.out.bam) + + ch_versions = ch_versions.mix(BAM2FASTQ.out.versions) + ch_versions = ch_versions.mix(ALIGN_UMI.out.versions) + ch_versions = ch_versions.mix(CALLUMICONSENSUS.out.versions) + ch_versions = ch_versions.mix(FASTQTOBAM.out.versions) + ch_versions = ch_versions.mix(GROUPREADSBYUMI.out.versions) + ch_versions = ch_versions.mix(SAMBLASTER.out.versions) + + emit: + umibam = FASTQTOBAM.out.bam // channel: [ val(meta), [ bam ] ] + groupbam = GROUPREADSBYUMI.out.bam // channel: [ val(meta), [ bam ] ] + consensusbam = CALLUMICONSENSUS.out.bam // channel: [ val(meta), [ bam ] ] + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/subworkflows/local/initialize_annotation_cache/main.nf b/subworkflows/local/initialize_annotation_cache/main.nf new file mode 100644 index 0000000000..d2c6fcb7d6 --- /dev/null +++ b/subworkflows/local/initialize_annotation_cache/main.nf @@ -0,0 +1,57 @@ +// +// INITIALIZE ANNOTATION CACHE +// + +// Initialize channels based on params or indices that were just built +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run +// Condition is based on params.step and params.tools +// If an extra condition exists, it's specified in comments + +workflow INITIALIZE_ANNOTATION_CACHE { + take: + snpeff_enabled + snpeff_cache + snpeff_genome + snpeff_db + vep_enabled + vep_cache + vep_species + vep_cache_version + vep_genome + help_message + + main: + if (snpeff_enabled) { + def snpeff_annotation_cache_key = (snpeff_cache == "s3://annotation-cache/snpeff_cache/") ? "${snpeff_genome}.${snpeff_db}/" : "" + def snpeff_cache_dir = "${snpeff_annotation_cache_key}${snpeff_genome}.${snpeff_db}" + def snpeff_cache_path_full = file("$snpeff_cache/$snpeff_cache_dir", type: 'dir') + if ( !snpeff_cache_path_full.exists() || !snpeff_cache_path_full.isDirectory() ) { + if (snpeff_cache == "s3://annotation-cache/snpeff_cache/") { + error("This path is not available within annotation-cache.\nPlease check https://annotation-cache.github.io/ to create a request for it.") + } else { + error("Path provided with SnpEff cache is invalid.\nMake sure there is a directory named ${snpeff_cache_dir} in ${snpeff_cache}.\n${help_message}") + } + } + snpeff_cache = Channel.fromPath(file("${snpeff_cache}/${snpeff_annotation_cache_key}"), checkIfExists: true).collect() + .map{ cache -> [ [ id:"${snpeff_genome}.${snpeff_db}" ], cache ] } + } else snpeff_cache = [] + + if (vep_enabled) { + def vep_annotation_cache_key = (vep_cache == "s3://annotation-cache/vep_cache/") ? "${vep_cache_version}_${vep_genome}/" : ""
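// Reviewer note (values illustrative): with vep_cache_version = 110,
// vep_genome = GRCh38 and vep_species = homo_sapiens, the directory checked
// below resolves to 110_GRCh38/homo_sapiens/110_GRCh38 under the cache root.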
"${vep_cache_version}_${vep_genome}/" : "" + def vep_cache_dir = "${vep_annotation_cache_key}${vep_species}/${vep_cache_version}_${vep_genome}" + def vep_cache_path_full = file("$vep_cache/$vep_cache_dir", type: 'dir') + if ( !vep_cache_path_full.exists() || !vep_cache_path_full.isDirectory() ) { + if (vep_cache == "s3://annotation-cache/vep_cache/") { + error("This path is not available within annotation-cache.\nPlease check https://annotation-cache.github.io/ to create a request for it.") + } else { + error("Path provided with VEP cache is invalid.\nMake sure there is a directory named ${vep_cache_dir} in ${vep_cache}./n${help_message}") + } + } + ensemblvep_cache = Channel.fromPath(file("${vep_cache}/${vep_annotation_cache_key}"), checkIfExists: true).collect() + } else ensemblvep_cache = [] + + emit: + ensemblvep_cache // channel: [ meta, cache ] + snpeff_cache // channel: [ meta, cache ] +} diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf deleted file mode 100644 index 0aecf87fb7..0000000000 --- a/subworkflows/local/input_check.nf +++ /dev/null @@ -1,44 +0,0 @@ -// -// Check input samplesheet and get read channels -// - -include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check' - -workflow INPUT_CHECK { - take: - samplesheet // file: /path/to/samplesheet.csv - - main: - SAMPLESHEET_CHECK ( samplesheet ) - .csv - .splitCsv ( header:true, sep:',' ) - .map { create_fastq_channel(it) } - .set { reads } - - emit: - reads // channel: [ val(meta), [ reads ] ] - versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ] -} - -// Function to get list of [ meta, [ fastq_1, fastq_2 ] ] -def create_fastq_channel(LinkedHashMap row) { - // create meta map - def meta = [:] - meta.id = row.sample - meta.single_end = row.single_end.toBoolean() - - // add path(s) of the fastq file(s) to the meta map - def fastq_meta = [] - if (!file(row.fastq_1).exists()) { - exit 1, "ERROR: Please check input samplesheet -> Read 1 FastQ file does not exist!\n${row.fastq_1}" - } - if (meta.single_end) { - fastq_meta = [ meta, [ file(row.fastq_1) ] ] - } else { - if (!file(row.fastq_2).exists()) { - exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}" - } - fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ] - } - return fastq_meta -} diff --git a/subworkflows/local/post_variantcalling/main.nf b/subworkflows/local/post_variantcalling/main.nf new file mode 100644 index 0000000000..bf23ff13d4 --- /dev/null +++ b/subworkflows/local/post_variantcalling/main.nf @@ -0,0 +1,27 @@ +// +// POST VARIANT CALLING: processes run on variantcalled but not annotated VCFs +// + +include { CONCATENATE_GERMLINE_VCFS } from '../vcf_concatenate_germline/main' + +workflow POST_VARIANTCALLING { + + take: + vcfs + concatenate_vcfs + + main: + versions = Channel.empty() + + if(concatenate_vcfs){ + CONCATENATE_GERMLINE_VCFS(vcfs) + + vcfs = vcfs.mix(CONCATENATE_GERMLINE_VCFS.out.vcfs) + versions = versions.mix(CONCATENATE_GERMLINE_VCFS.out.versions) + } + + emit: + vcfs // post processed vcfs + + versions // channel: [ versions.yml ] +} diff --git a/subworkflows/local/prepare_genome/main.nf b/subworkflows/local/prepare_genome/main.nf new file mode 100644 index 0000000000..f9b9e62c95 --- /dev/null +++ b/subworkflows/local/prepare_genome/main.nf @@ -0,0 +1,138 @@ +// +// PREPARE GENOME +// + +// Initialize channels based on params or indices that were just built +// For all modules here: +// A when clause condition is defined 
in the conf/modules.config to determine if the module should be run +// Condition is based on params.step and params.tools +// If an extra condition exists, it's specified in comments + +include { BWA_INDEX as BWAMEM1_INDEX } from '../../../modules/nf-core/bwa/index/main' +include { BWAMEM2_INDEX } from '../../../modules/nf-core/bwamem2/index/main' +include { DRAGMAP_HASHTABLE } from '../../../modules/nf-core/dragmap/hashtable/main' +include { GATK4_CREATESEQUENCEDICTIONARY } from '../../../modules/nf-core/gatk4/createsequencedictionary/main' +include { MSISENSORPRO_SCAN } from '../../../modules/nf-core/msisensorpro/scan/main' +include { SAMTOOLS_FAIDX } from '../../../modules/nf-core/samtools/faidx/main' +include { TABIX_TABIX as TABIX_BCFTOOLS_ANNOTATIONS } from '../../../modules/nf-core/tabix/tabix/main' +include { TABIX_TABIX as TABIX_DBSNP } from '../../../modules/nf-core/tabix/tabix/main' +include { TABIX_TABIX as TABIX_GERMLINE_RESOURCE } from '../../../modules/nf-core/tabix/tabix/main' +include { TABIX_TABIX as TABIX_KNOWN_INDELS } from '../../../modules/nf-core/tabix/tabix/main' +include { TABIX_TABIX as TABIX_KNOWN_SNPS } from '../../../modules/nf-core/tabix/tabix/main' +include { TABIX_TABIX as TABIX_PON } from '../../../modules/nf-core/tabix/tabix/main' +include { UNTAR as UNTAR_CHR_DIR } from '../../../modules/nf-core/untar/main' +include { UNZIP as UNZIP_ALLELES } from '../../../modules/nf-core/unzip/main' +include { UNZIP as UNZIP_GC } from '../../../modules/nf-core/unzip/main' +include { UNZIP as UNZIP_LOCI } from '../../../modules/nf-core/unzip/main' +include { UNZIP as UNZIP_RT } from '../../../modules/nf-core/unzip/main' + +workflow PREPARE_GENOME { + take: + ascat_alleles // channel: [optional] ascat allele files + ascat_loci // channel: [optional] ascat loci files + ascat_loci_gc // channel: [optional] ascat gc content file + ascat_loci_rt // channel: [optional] ascat replication timing file + bcftools_annotations // channel: [optional] bcftools annotations file + chr_dir // channel: [optional] chromosome files + dbsnp // channel: [optional] dbsnp + fasta // channel: [mandatory] fasta + fasta_fai // channel: [optional] fasta_fai + germline_resource // channel: [optional] germline_resource + known_indels // channel: [optional] known_indels + known_snps // channel: [optional] known_snps + pon // channel: [optional] pon + + + main: + fasta = fasta.map{ fasta -> [ [ id:fasta.baseName ], fasta ] } + versions = Channel.empty() + + BWAMEM1_INDEX(fasta) // If aligner is bwa-mem + BWAMEM2_INDEX(fasta) // If aligner is bwa-mem2 + DRAGMAP_HASHTABLE(fasta) // If aligner is dragmap + + GATK4_CREATESEQUENCEDICTIONARY(fasta) + MSISENSORPRO_SCAN(fasta) + SAMTOOLS_FAIDX(fasta, [['id':null], []]) + + // the following are flattened and mapped in case the user supplies more than one value for the param + // written for KNOWN_INDELS, but preemptively applied to the rest + // [ file1, file2 ] becomes [ [ meta1, file1 ], [ meta2, file2 ] ] + // outputs are collected to maintain a single channel for relevant TBI files + TABIX_BCFTOOLS_ANNOTATIONS(bcftools_annotations.flatten().map{ it -> [ [ id:it.baseName ], it ] }) + TABIX_DBSNP(dbsnp.flatten().map{ it -> [ [ id:it.baseName ], it ] }) + TABIX_GERMLINE_RESOURCE(germline_resource.flatten().map{ it -> [ [ id:it.baseName ], it ] }) + TABIX_KNOWN_SNPS(known_snps.flatten().map{ it -> [ [ id:it.baseName ], it ] } ) + TABIX_KNOWN_INDELS(known_indels.flatten().map{ it -> [ [ id:it.baseName ], it ] } ) + TABIX_PON(pon.flatten().map{ it -> [ [ id:it.baseName ], it ] })
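Reviewer note: the flatten/map pattern above (written for KNOWN_INDELS, applied to all the tabix calls) turns a possibly multi-file parameter into one `[ meta, file ]` pair per file so each file gets its own indexing task. A runnable sketch with invented file names (the real code operates on file objects and uses `it.baseName`; plain strings and `tokenize()` keep this sketch self-contained):

```nextflow
workflow {
    // a parameter that may carry one or several files
    known_indels = Channel.of( [ 'mills.vcf.gz', '1000g.vcf.gz' ] )

    known_indels
        .flatten()                                        // -> 'mills.vcf.gz', '1000g.vcf.gz'
        .map{ it -> [ [ id:it.tokenize('.')[0] ], it ] }  // -> [ [id:'mills'], 'mills.vcf.gz' ], ...
        .view()
}
```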
+ + // prepare ascat reference files + allele_files = ascat_alleles + if (params.ascat_alleles && params.ascat_alleles.endsWith('.zip')) { + UNZIP_ALLELES(ascat_alleles.map{ it -> [[id:it[0].baseName], it]}) + allele_files = UNZIP_ALLELES.out.unzipped_archive.map{ it[1] } + versions = versions.mix(UNZIP_ALLELES.out.versions) + } + + loci_files = ascat_loci + if (params.ascat_loci && params.ascat_loci.endsWith('.zip')) { + UNZIP_LOCI(ascat_loci.map{ it -> [[id:it[0].baseName], it]}) + loci_files = UNZIP_LOCI.out.unzipped_archive.map{ it[1] } + versions = versions.mix(UNZIP_LOCI.out.versions) + } + gc_file = ascat_loci_gc + if (params.ascat_loci_gc && params.ascat_loci_gc.endsWith('.zip')) { + UNZIP_GC(ascat_loci_gc.map{ it -> [[id:it[0].baseName], it]}) + gc_file = UNZIP_GC.out.unzipped_archive.map{ it[1] } + versions = versions.mix(UNZIP_GC.out.versions) + } + rt_file = ascat_loci_rt + if (params.ascat_loci_rt && params.ascat_loci_rt.endsWith('.zip')) { + UNZIP_RT(ascat_loci_rt.map{ it -> [[id:it[0].baseName], it]}) + rt_file = UNZIP_RT.out.unzipped_archive.map{ it[1] } + versions = versions.mix(UNZIP_RT.out.versions) + } + + + chr_files = chr_dir + if (params.chr_dir && params.chr_dir.endsWith('tar.gz')) { + UNTAR_CHR_DIR(chr_dir.map{ it -> [ [ id:'chr_dir' ], it ] }) + chr_files = UNTAR_CHR_DIR.out.untar.map{ it[1] } + versions = versions.mix(UNTAR_CHR_DIR.out.versions) + } + + // Gather versions of all tools used + versions = versions.mix(SAMTOOLS_FAIDX.out.versions) + versions = versions.mix(BWAMEM1_INDEX.out.versions) + versions = versions.mix(BWAMEM2_INDEX.out.versions) + versions = versions.mix(DRAGMAP_HASHTABLE.out.versions) + versions = versions.mix(GATK4_CREATESEQUENCEDICTIONARY.out.versions) + versions = versions.mix(MSISENSORPRO_SCAN.out.versions) + versions = versions.mix(TABIX_BCFTOOLS_ANNOTATIONS.out.versions) + versions = versions.mix(TABIX_DBSNP.out.versions) + versions = versions.mix(TABIX_GERMLINE_RESOURCE.out.versions) + versions = versions.mix(TABIX_KNOWN_SNPS.out.versions) + versions = versions.mix(TABIX_KNOWN_INDELS.out.versions) + versions = versions.mix(TABIX_PON.out.versions) + + emit: + bcftools_annotations_tbi = TABIX_BCFTOOLS_ANNOTATIONS.out.tbi.map{ meta, tbi -> [tbi] }.collect() // bcftools_annotations.vcf.gz.tbi + bwa = BWAMEM1_INDEX.out.index.map{ meta, index -> [index] }.collect() // path: bwa/* + bwamem2 = BWAMEM2_INDEX.out.index.map{ meta, index -> [index] }.collect() // path: bwamem2/* + hashtable = DRAGMAP_HASHTABLE.out.hashmap.map{ meta, index -> [index] }.collect() // path: dragmap/* + dbsnp_tbi = TABIX_DBSNP.out.tbi.map{ meta, tbi -> [tbi] }.collect() // path: dbsnp.vcf.gz.tbi + dict = GATK4_CREATESEQUENCEDICTIONARY.out.dict // path: genome.fasta.dict + fasta_fai = SAMTOOLS_FAIDX.out.fai.map{ meta, fai -> [fai] } // path: genome.fasta.fai + germline_resource_tbi = TABIX_GERMLINE_RESOURCE.out.tbi.map{ meta, tbi -> [tbi] }.collect() // path: germline_resource.vcf.gz.tbi + known_snps_tbi = TABIX_KNOWN_SNPS.out.tbi.map{ meta, tbi -> [tbi] }.collect() // path: {known_snps*}.vcf.gz.tbi + known_indels_tbi = TABIX_KNOWN_INDELS.out.tbi.map{ meta, tbi -> [tbi] }.collect() // path: {known_indels*}.vcf.gz.tbi + msisensorpro_scan = MSISENSORPRO_SCAN.out.list.map{ meta, list -> [list] } // path: genome_msi.list + pon_tbi = TABIX_PON.out.tbi.map{ meta, tbi -> [tbi] }.collect() // path: pon.vcf.gz.tbi + allele_files + chr_files + gc_file + loci_files + rt_file + + versions // channel: [ versions.yml ] +} diff --git
a/subworkflows/local/prepare_intervals/main.nf b/subworkflows/local/prepare_intervals/main.nf new file mode 100644 index 0000000000..f4079e3e81 --- /dev/null +++ b/subworkflows/local/prepare_intervals/main.nf @@ -0,0 +1,113 @@ +// +// PREPARE INTERVALS +// + +// Initialize channels based on params or indices that were just built +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include { BUILD_INTERVALS } from '../../../modules/local/build_intervals/main' +include { CREATE_INTERVALS_BED } from '../../../modules/local/create_intervals_bed/main' +include { GATK4_INTERVALLISTTOBED } from '../../../modules/nf-core/gatk4/intervallisttobed/main' +include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_INTERVAL_SPLIT } from '../../../modules/nf-core/tabix/bgziptabix/main' +include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_INTERVAL_COMBINED } from '../../../modules/nf-core/tabix/bgziptabix/main' + +workflow PREPARE_INTERVALS { + take: + fasta_fai // mandatory [ fasta_fai ] + intervals // [ params.intervals ] + no_intervals // [ params.no_intervals ] + + main: + versions = Channel.empty() + + intervals_bed = Channel.empty() // List of [ bed, num_intervals ], one for each region + intervals_bed_gz_tbi = Channel.empty() // List of [ bed.gz, bed.gz.tbi, num_intervals ], one for each region + intervals_combined = Channel.empty() // Single bed file containing all intervals + + if (no_intervals) { + file("${params.outdir}/no_intervals.bed").text = "no_intervals\n" + file("${params.outdir}/no_intervals.bed.gz").text = "no_intervals\n" + file("${params.outdir}/no_intervals.bed.gz.tbi").text = "no_intervals\n" + + intervals_bed = Channel.fromPath(file("${params.outdir}/no_intervals.bed")).map{ it -> [ it, 0 ] } + intervals_bed_gz_tbi = Channel.fromPath(file("${params.outdir}/no_intervals.bed.{gz,gz.tbi}")).collect().map{ it -> [ it, 0 ] } + intervals_combined = Channel.fromPath(file("${params.outdir}/no_intervals.bed")).map{ it -> [ [ id:it.simpleName ], it ] } + } else if (params.step != 'annotate' && params.step != 'controlfreec') { + // If no interval/target file is provided, then generate intervals from the FASTA file + if (!intervals) { + BUILD_INTERVALS(fasta_fai.map{it -> [ [ id:it.baseName ], it ] }) + + intervals_combined = BUILD_INTERVALS.out.bed + + CREATE_INTERVALS_BED(intervals_combined.map{ meta, path -> path }).bed + + intervals_bed = CREATE_INTERVALS_BED.out.bed + + versions = versions.mix(BUILD_INTERVALS.out.versions) + versions = versions.mix(CREATE_INTERVALS_BED.out.versions) + } else { + intervals_combined = Channel.fromPath(file(intervals)).map{it -> [ [ id:it.baseName ], it ] } + intervals_bed = CREATE_INTERVALS_BED(file(intervals)).bed + + versions = versions.mix(CREATE_INTERVALS_BED.out.versions) + + // If interval file is not provided as .bed, but e.g. as .interval_list then convert to BED format + if (intervals.endsWith(".interval_list")) { + GATK4_INTERVALLISTTOBED(intervals_combined) + intervals_combined = GATK4_INTERVALLISTTOBED.out.bed + versions = versions.mix(GATK4_INTERVALLISTTOBED.out.versions) + } + } + + // Now for the intervals.bed the following operations are done: + // 1. Intervals file is split up into multiple bed files for scatter/gather + // 2. Each bed file is indexed
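// Reviewer note (numbers invented): the weighting below estimates a runtime for
// each interval file. A BED line 'chr1 0 1200000' without a fifth column and
// nucleotides_per_second = 1000 contributes (1200000 - 0) / 1000 = 1200 'seconds';
// files are then sorted by this weight, largest first, so the slowest shard is
// scheduled earliest.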
+ + // 1. Intervals file is split up into multiple bed files for scatter/gather & grouping together small intervals + intervals_bed = intervals_bed.flatten() + .map{ intervalFile -> + def duration = 0.0 + for (line in intervalFile.readLines()) { + final fields = line.split('\t') + if (fields.size() >= 5) duration += fields[4].toFloat() + else { + start = fields[1].toInteger() + end = fields[2].toInteger() + duration += (end - start) / params.nucleotides_per_second + } + } + [ duration, intervalFile ] + }.toSortedList({ a, b -> b[0] <=> a[0] }) + .flatten().collate(2).map{ duration, intervalFile -> intervalFile }.collect() + // Adding number of intervals as elements + .map{ it -> [ it, it.size() ] } + .transpose() + + // 2. Create bed.gz and bed.gz.tbi for each interval file. They are split by region (see above) + TABIX_BGZIPTABIX_INTERVAL_SPLIT(intervals_bed.map{ file, num_intervals -> [ [ id:file.baseName], file ] }) + + intervals_bed_gz_tbi = TABIX_BGZIPTABIX_INTERVAL_SPLIT.out.gz_tbi.map{ meta, bed, tbi -> [ bed, tbi ] }.toList() + // Adding number of intervals as elements + .map{ it -> [ it, it.size() ] } + .transpose() + + versions = versions.mix(TABIX_BGZIPTABIX_INTERVAL_SPLIT.out.versions) + } + + TABIX_BGZIPTABIX_INTERVAL_COMBINED(intervals_combined) + versions = versions.mix(TABIX_BGZIPTABIX_INTERVAL_COMBINED.out.versions) + + intervals_bed_combined = intervals_combined.map{meta, bed -> bed }.collect() + intervals_bed_gz_tbi_combined = TABIX_BGZIPTABIX_INTERVAL_COMBINED.out.gz_tbi.map{meta, gz, tbi -> [gz, tbi] }.collect() + + emit: + // Intervals split for parallel execution + intervals_bed // [ intervals.bed, num_intervals ] + intervals_bed_gz_tbi // [ intervals.bed.gz, intervals.bed.gz.tbi, num_intervals ] + // All intervals in one file + intervals_bed_combined // [ intervals.bed ] + intervals_bed_gz_tbi_combined // [ intervals.bed.gz, intervals.bed.gz.tbi ] + + versions // [ versions.yml ] +} diff --git a/subworkflows/local/prepare_reference_cnvkit/main.nf b/subworkflows/local/prepare_reference_cnvkit/main.nf new file mode 100644 index 0000000000..87b943dff1 --- /dev/null +++ b/subworkflows/local/prepare_reference_cnvkit/main.nf @@ -0,0 +1,31 @@ +// +// PREPARE_REFERENCE_CNVKIT +// + +// Initialize channels based on params or indices that were just built +// For all modules here: +// A when clause condition is defined in the conf/modules.config to determine if the module should be run + +include { CNVKIT_ANTITARGET } from '../../../modules/nf-core/cnvkit/antitarget/main' +include { CNVKIT_REFERENCE } from '../../../modules/nf-core/cnvkit/reference/main' + +workflow PREPARE_REFERENCE_CNVKIT { + take: + fasta // channel: [mandatory] fasta + intervals_bed_combined // channel: [] + + main: + versions = Channel.empty() + + // prepare an antitarget reference file for the tumor_only mode of cnvkit + CNVKIT_ANTITARGET(intervals_bed_combined.flatten().map{ it -> [ [ id:'intervals' ], it ] }) + CNVKIT_REFERENCE(fasta, intervals_bed_combined, CNVKIT_ANTITARGET.out.bed.map{ meta, bed -> [ bed ] } ) + + versions = versions.mix(CNVKIT_ANTITARGET.out.versions) + versions = versions.mix(CNVKIT_REFERENCE.out.versions) + + emit: + cnvkit_reference = CNVKIT_REFERENCE.out.cnn.collect() + + versions +} diff --git a/subworkflows/local/samplesheet_to_channel/main.nf b/subworkflows/local/samplesheet_to_channel/main.nf new file mode 100644 index 0000000000..6784b4616b --- /dev/null +++ b/subworkflows/local/samplesheet_to_channel/main.nf @@ -0,0 +1,296 @@ +workflow SAMPLESHEET_TO_CHANNEL{ + + take: +
ch_from_samplesheet + + main: + ch_from_samplesheet.dump(tag:"ch_from_samplesheet") + input_sample = ch_from_samplesheet.map{ meta, fastq_1, fastq_2, table, cram, crai, bam, bai, vcf, variantcaller -> + // generate patient_sample key to group lanes together + [ meta.patient + meta.sample, [meta, fastq_1, fastq_2, table, cram, crai, bam, bai, vcf, variantcaller] ] + }.tap{ ch_with_patient_sample } // save the channel + .groupTuple() //group by patient_sample to get all lanes + .map { patient_sample, ch_items -> + // get number of lanes per sample + [ patient_sample, ch_items.size() ] + }.combine(ch_with_patient_sample, by: 0) // for each entry add numLanes + .map { patient_sample, num_lanes, ch_items -> + (meta, fastq_1, fastq_2, table, cram, crai, bam, bai, vcf, variantcaller) = ch_items + if (meta.lane && fastq_2) { + meta = meta + [id: "${meta.sample}-${meta.lane}".toString()] + def CN = params.seq_center ? "CN:${params.seq_center}\\t" : '' + + def flowcell = flowcellLaneFromFastq(fastq_1) + // Don't use a random element for ID, it breaks resuming + def read_group = "\"@RG\\tID:${flowcell}.${meta.sample}.${meta.lane}\\t${CN}PU:${meta.lane}\\tSM:${meta.patient}_${meta.sample}\\tLB:${meta.sample}\\tDS:${params.fasta}\\tPL:${params.seq_platform}\"" + + meta = meta - meta.subMap('lane') + [num_lanes: num_lanes.toInteger(), read_group: read_group.toString(), data_type: 'fastq', size: 1] + + if (params.step == 'mapping') return [ meta, [ fastq_1, fastq_2 ] ] + else { + error("Samplesheet contains fastq files but step is `$params.step`. Please check your samplesheet or adjust the step parameter.\nhttps://nf-co.re/sarek/usage#input-samplesheet-configurations") + } + + // start from BAM + } else if (meta.lane && bam) { + if (params.step != 'mapping' && !bai) { + error("BAM index (bai) should be provided.") + } + meta = meta + [id: "${meta.sample}-${meta.lane}".toString()] + def CN = params.seq_center ? "CN:${params.seq_center}\\t" : '' + def read_group = "\"@RG\\tID:${meta.sample}_${meta.lane}\\t${CN}PU:${meta.lane}\\tSM:${meta.patient}_${meta.sample}\\tLB:${meta.sample}\\tDS:${params.fasta}\\tPL:${params.seq_platform}\"" + + meta = meta - meta.subMap('lane') + [num_lanes: num_lanes.toInteger(), read_group: read_group.toString(), data_type: 'bam', size: 1] + + if (params.step != 'annotate') return [ meta - meta.subMap('lane'), bam, bai ] + else { + error("Samplesheet contains bam files but step is `annotate`. The pipeline is expecting vcf files for the annotation. Please check your samplesheet or adjust the step parameter.\nhttps://nf-co.re/sarek/usage#input-samplesheet-configurations") + } + + // recalibration + } else if (table && cram) { + meta = meta + [id: meta.sample, data_type: 'cram'] + + if (!(params.step == 'mapping' || params.step == 'annotate')) return [ meta - meta.subMap('lane'), cram, crai, table ] + else { + error("Samplesheet contains cram files but step is `$params.step`. Please check your samplesheet or adjust the step parameter.\nhttps://nf-co.re/sarek/usage#input-samplesheet-configurations") + } + + // recalibration when skipping MarkDuplicates + } else if (table && bam) { + meta = meta + [id: meta.sample, data_type: 'bam'] + + if (!(params.step == 'mapping' || params.step == 'annotate')) return [ meta - meta.subMap('lane'), bam, bai, table ] + else { + error("Samplesheet contains bam files but step is `$params.step`. 
Please check your samplesheet or adjust the step parameter.\nhttps://nf-co.re/sarek/usage#input-samplesheet-configurations") + } + + // prepare_recalibration or variant_calling + } else if (cram) { + meta = meta + [id: meta.sample, data_type: 'cram'] + + if (!(params.step == 'mapping' || params.step == 'annotate')) return [ meta - meta.subMap('lane'), cram, crai ] + else { + error("Samplesheet contains cram files but step is `$params.step`. Please check your samplesheet or adjust the step parameter.\nhttps://nf-co.re/sarek/usage#input-samplesheet-configurations") + } + + // prepare_recalibration when skipping MarkDuplicates or `--step markduplicates` + } else if (bam) { + meta = meta + [id: meta.sample, data_type: 'bam'] + + if (!(params.step == 'mapping' || params.step == 'annotate')) return [ meta - meta.subMap('lane'), bam, bai ] + else { + error("Samplesheet contains bam files but step is `$params.step`. Please check your samplesheet or adjust the step parameter.\nhttps://nf-co.re/sarek/usage#input-samplesheet-configurations") + } + + // annotation + } else if (vcf) { + meta = meta + [id: meta.sample, data_type: 'vcf', variantcaller: variantcaller ?: ''] + + if (params.step == 'annotate') return [ meta - meta.subMap('lane'), vcf ] + else { + error("Samplesheet contains vcf files but step is `$params.step`. Please check your samplesheet or adjust the step parameter.\nhttps://nf-co.re/sarek/usage#input-samplesheet-configurations") + } + } else { + error("Missing or unknown field in csv file header. Please check your samplesheet") + } + } + + if (params.step != 'annotate' && params.tools && !params.build_only_index) { + // Two checks for ensuring that the pipeline stops with a meaningful error message if + // 1. the sample-sheet only contains normal-samples, but some of the requested tools require tumor-samples, and + // 2. the sample-sheet only contains tumor-samples, but some of the requested tools require normal-samples. 
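+        // For example, a samplesheet whose rows all have status 0 (normal) combined with
+        // `--tools mutect2` should fail fast here with a message naming mutect2,
+        // rather than failing later mid-run.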
+        input_sample.filter{ it[0].status == 1 }.ifEmpty{ // In this case, the sample-sheet contains no tumor-samples
+            if (!params.build_only_index) {
+                def tools_tumor = ['ascat', 'controlfreec', 'mutect2', 'msisensorpro']
+                def tools_tumor_asked = []
+                tools_tumor.each{ tool ->
+                    if (params.tools.split(',').contains(tool)) tools_tumor_asked.add(tool)
+                }
+                if (!tools_tumor_asked.isEmpty()) {
+                    error('The sample-sheet only contains normal-samples, but the following tools, which were requested with "--tools", expect at least one tumor-sample: ' + tools_tumor_asked.join(", "))
+                }
+            }
+        }
+
+        input_sample.filter{ it[0].status == 0 }.ifEmpty{ // In this case, the sample-sheet contains no normal/germline-samples
+            def tools_requiring_normal_samples = ['ascat', 'deepvariant', 'haplotypecaller', 'msisensorpro']
+            def requested_tools_requiring_normal_samples = []
+            tools_requiring_normal_samples.each{ tool_requiring_normal_samples ->
+                if (params.tools.split(',').contains(tool_requiring_normal_samples)) requested_tools_requiring_normal_samples.add(tool_requiring_normal_samples)
+            }
+            if (!requested_tools_requiring_normal_samples.isEmpty()) {
+                error('The sample-sheet only contains tumor-samples, but the following tools, which were requested with the option "--tools", expect at least one normal-sample: ' + requested_tools_requiring_normal_samples.join(", "))
+            }
+        }
+    }
+
+    // Fails when a wrong extension is used for the intervals file
+    if (params.wes && params.step != 'annotate') {
+        if (params.intervals && !params.intervals.endsWith("bed")) error("Target file specified with `--intervals` must be in BED format for targeted data")
+    } else if (params.intervals && !params.intervals.endsWith("bed") && !params.intervals.endsWith("list")) {
+        error("Intervals file must end with .bed, .list, or .interval_list")
+    } else if (params.intervals && params.intervals.endsWith("bed")) {
+        log.warn("Intervals file was provided without parameter `--wes`: Pipeline will assume this is Whole-Genome-Sequencing data.")
+    }
+
+    if (params.step == 'mapping' && params.aligner.contains("dragmap") && !(params.skip_tools && params.skip_tools.split(',').contains("baserecalibrator"))) {
+        log.warn("DragMap was specified as aligner. Base recalibration is not contained in --skip_tools. It is recommended to skip base recalibration when using DragMap\nhttps://gatk.broadinstitute.org/hc/en-us/articles/4407897446939--How-to-Run-germline-single-sample-short-variant-discovery-in-DRAGEN-mode")
+    }
+
+    if (params.step == 'mapping' && params.aligner.contains("sentieon-bwamem") && params.umi_read_structure) {
+        error("Sentieon BWA is currently not compatible with FGBio UMI handling. Please choose a different aligner.")
+    }
+
+    if (params.tools && params.tools.split(',').contains("sentieon_haplotyper") && params.joint_germline && (!params.sentieon_haplotyper_emit_mode || !(params.sentieon_haplotyper_emit_mode.contains('gvcf')))) {
+        error("When setting the option `--joint_germline` and including `sentieon_haplotyper` among the requested tools, please set `--sentieon_haplotyper_emit_mode` to include `gvcf`.")
+    }
+
+    // Fails or warns when missing files or params for ascat
+    if (params.tools && params.tools.split(',').contains('ascat')) {
+        if (!params.ascat_alleles) {
+            error("No allele files were provided for running ASCAT. Please provide a zip folder with allele files.")
+        }
+        if (!params.ascat_loci) {
+            error("No loci files were provided for running ASCAT. Please provide a zip folder with loci files.")
+        }
+        if (!params.ascat_loci_gc && !params.ascat_loci_rt) {
+            log.warn("No LogRCorrection performed in ASCAT.
For LogRCorrection to run, please provide either loci gc files or both loci gc files and loci rt files.")
+        }
+        if (params.wes) {
+            log.warn("Default reference files are not suited for running ASCAT on WES data. It's recommended to use the reference files provided here: https://github.com/Wedge-lab/battenberg#required-reference-files")
+        }
+    }
+
+    // Warns when missing files or params for mutect2
+    if (params.tools && params.tools.split(',').contains('mutect2')) {
+        if (!params.pon) {
+            log.warn("No Panel-of-Normals was specified for Mutect2.\nIt is highly recommended to use one: https://gatk.broadinstitute.org/hc/en-us/articles/5358911630107-Mutect2\nFor more information on how to create one: https://gatk.broadinstitute.org/hc/en-us/articles/5358921041947-CreateSomaticPanelOfNormals-BETA-")
+        }
+        if (!params.germline_resource) {
+            log.warn("If Mutect2 is specified without a germline resource, no filtering will be done.\nIt is recommended to use one: https://gatk.broadinstitute.org/hc/en-us/articles/5358911630107-Mutect2")
+        }
+        if (params.pon && params.pon.contains("/Homo_sapiens/GATK/GRCh38/Annotation/GATKBundle/1000g_pon.hg38.vcf.gz")) {
+            log.warn("The default Panel-of-Normals provided by GATK is used for Mutect2.\nIt is highly recommended to generate one from normal samples that are technically similar to the tumor ones.\nFor more information: https://gatk.broadinstitute.org/hc/en-us/articles/360035890631-Panel-of-Normals-PON-")
+        }
+    }
+
+    // Fails when missing resources for baserecalibrator
+    // Warns when missing resources for haplotypecaller
+    if (!params.dbsnp && !params.known_indels) {
+        if (params.step in ['mapping', 'markduplicates', 'prepare_recalibration', 'recalibrate'] && (!params.skip_tools || (params.skip_tools && !params.skip_tools.split(',').contains('baserecalibrator')))) {
+            error("Base quality score recalibration requires at least one resource file. Please provide at least one of `--dbsnp` or `--known_indels`\nYou can skip this step in the workflow by adding `--skip_tools baserecalibrator` to the command.")
+        }
+        if (params.tools && (params.tools.split(',').contains('haplotypecaller') || params.tools.split(',').contains('sentieon_haplotyper') || params.tools.split(',').contains('sentieon_dnascope'))) {
+            log.warn "If GATK's Haplotypecaller, Sentieon's Dnascope or Sentieon's Haplotyper is specified without `--dbsnp` or `--known_indels`, no filtering will be done. For filtering, please provide at least one of `--dbsnp` or `--known_indels`.\nFor more information see FilterVariantTranches (single-sample, default): https://gatk.broadinstitute.org/hc/en-us/articles/5358928898971-FilterVariantTranches\nFor more information see VariantRecalibration (--joint_germline): https://gatk.broadinstitute.org/hc/en-us/articles/5358906115227-VariantRecalibrator\nFor more information on GATK Best practice germline variant calling: https://gatk.broadinstitute.org/hc/en-us/articles/360035535932-Germline-short-variant-discovery-SNPs-Indels-"
+        }
+    }
+    if (params.joint_germline && (!params.tools || !(params.tools.split(',').contains('haplotypecaller') || params.tools.split(',').contains('sentieon_haplotyper') || params.tools.split(',').contains('sentieon_dnascope')))) {
+        error("GATK's Haplotypecaller, Sentieon's Dnascope or Sentieon's Haplotyper should be specified as one of the tools when doing joint germline variant calling.
") + } + + if ( + params.tools && + ( + params.tools.split(',').contains('haplotypecaller') || + params.tools.split(',').contains('sentieon_haplotyper') || + params.tools.split(',').contains('sentieon_dnascope') + ) && + params.joint_germline && + ( + !params.dbsnp || + !params.known_indels || + !params.known_snps || + params.no_intervals + ) + ) { + log.warn("""If GATK's Haplotypecaller, Sentieon's Dnascope and/or Sentieon's Haplotyper is specified, \ + but without `--dbsnp`, `--known_snps`, `--known_indels` or the associated resource labels (ie `known_snps_vqsr`), \ + no variant recalibration will be done. For recalibration you must provide all of these resources.\nFor more information \ + see VariantRecalibration: https://gatk.broadinstitute.org/hc/en-us/articles/5358906115227-VariantRecalibrator \n\ + Joint germline variant calling also requires intervals in order to genotype the samples. \ + As a result, if `--no_intervals` is set to `true` the joint germline variant calling will not be performed.""") + } + + if (params.tools && + params.tools.split(',').contains('sentieon_dnascope') && + params.joint_germline && + ( + !params.sentieon_dnascope_emit_mode || + !params.sentieon_dnascope_emit_mode.split(',').contains('gvcf') + ) + ) { + error("When using Sentieon Dnascope for joint-germline variant-calling the option `--sentieon_dnascope_emit_mode` has to include `gvcf`.") + } + + if (params.tools && + params.tools.split(',').contains('sentieon_haplotyper') && + params.joint_germline && + ( + !params.sentieon_haplotyper_emit_mode || + !params.sentieon_haplotyper_emit_mode.split(',').contains('gvcf') + ) + ) { + error("When using Sentieon Haplotyper for joint-germline variant-calling the option `--sentieon_haplotyper_emit_mode` has to include `gvcf`.") + } + + + // Fails when --joint_mutect2 is used without enabling mutect2 + if (params.joint_mutect2 && (!params.tools || !params.tools.split(',').contains('mutect2'))) { + error("The mutect2 should be specified as one of the tools when doing joint somatic variant calling with Mutect2. 
(Enable it by adding `--tools mutect2` to the nextflow command.)")
+    }
+
+    // Fails when missing tools for variant_calling or annotate
+    if ((params.step == 'variant_calling' || params.step == 'annotate') && !params.tools) {
+        error("Please specify at least one tool when using `--step ${params.step}`.\nhttps://nf-co.re/sarek/parameters#tools")
+    }
+
+    // Fails when missing sex information for CNV tools
+    if (params.tools && (params.tools.split(',').contains('ascat') || params.tools.split(',').contains('controlfreec'))) {
+        input_sample.map{
+            if (it[0].sex == 'NA') {
+                error("Please specify sex information for each sample in your samplesheet when using '--tools' with 'ascat' or 'controlfreec'.\nhttps://nf-co.re/sarek/usage#input-samplesheet-configurations")
+            }
+        }
+    }
+
+    // Fails when bcftools annotate is used but no files are supplied
+    if (params.tools && params.tools.split(',').contains('bcfann') && !(params.bcftools_annotations && params.bcftools_annotations_tbi && params.bcftools_header_lines)) {
+        error("Please specify --bcftools_annotations, --bcftools_annotations_tbi, and --bcftools_header_lines, when using BCFTools annotations")
+    }
+
+    emit:
+    input_sample
+}
+
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    FUNCTIONS
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
+// Parse the first line of a FASTQ file and return the flowcell id it contains.
+def flowcellLaneFromFastq(path) {
+    // expected format:
+    // xx:yy:FLOWCELLID:LANE:... (seven fields)
+    // or
+    // FLOWCELLID:LANE:xx:... (five fields)
+    def line
+    path.withInputStream {
+        InputStream gzipStream = new java.util.zip.GZIPInputStream(it)
+        Reader decoder = new InputStreamReader(gzipStream, 'ASCII')
+        BufferedReader buffered = new BufferedReader(decoder)
+        line = buffered.readLine()
+    }
+    assert line.startsWith('@')
+    line = line.substring(1)
+    def fields = line.split(':')
+    String fcid
+
+    if (fields.size() >= 7) {
+        // CASAVA 1.8+ format, from https://support.illumina.com/help/BaseSpace_OLH_009008/Content/Source/Informatics/BS/FileFormat_FASTQ-files_swBS.htm
+        // "@<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos> <read>:<is filtered>:<control number>:<index>"
+        fcid = fields[2]
+    } else if (fields.size() == 5) {
+        fcid = fields[0]
+    }
+    return fcid
+}
diff --git a/subworkflows/local/samplesheet_to_channel/main.nf.test b/subworkflows/local/samplesheet_to_channel/main.nf.test
new file mode 100644
index 0000000000..49eeb2a132
--- /dev/null
+++ b/subworkflows/local/samplesheet_to_channel/main.nf.test
@@ -0,0 +1,34 @@
+nextflow_workflow {
+
+    name "Test Workflow SAMPLESHEET_TO_CHANNEL"
+    script "subworkflows/local/samplesheet_to_channel/main.nf"
+    workflow "SAMPLESHEET_TO_CHANNEL"
+
+    test("Should run without failures") {
+
+        when {
+            params {
+                // define parameters here. Example:
+                skip_tools = 'baserecalibrator'
+
+            }
+            workflow {
+                """
+                // define inputs of the workflow here.
Example: + input[0] = Channel.of([['patient':'test', 'sample':'test', + 'sex':'XX', 'status':0, 'lane':'test_L1'], + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true), + [], [], [], [], [], [], []]) + """ + } + } + + then { + assert workflow.success + assert snapshot(workflow.out).match() + } + + } + +} diff --git a/subworkflows/local/samplesheet_to_channel/main.nf.test.snap b/subworkflows/local/samplesheet_to_channel/main.nf.test.snap new file mode 100644 index 0000000000..fa440f539b --- /dev/null +++ b/subworkflows/local/samplesheet_to_channel/main.nf.test.snap @@ -0,0 +1,47 @@ +{ + "Should run without failures": { + "content": [ + { + "0": [ + [ + { + "patient": "test", + "sample": "test", + "sex": "XX", + "status": 0, + "id": "test-test_L1", + "num_lanes": 1, + "read_group": "\"@RG\\tID:null.test.test_L1\\tPU:test_L1\\tSM:test_test\\tLB:test\\tDS:null\\tPL:ILLUMINA\"", + "data_type": "fastq", + "size": 1 + }, + [ + "/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/fastq/test_1.fastq.gz", + "/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/fastq/test_2.fastq.gz" + ] + ] + ], + "input_sample": [ + [ + { + "patient": "test", + "sample": "test", + "sex": "XX", + "status": 0, + "id": "test-test_L1", + "num_lanes": 1, + "read_group": "\"@RG\\tID:null.test.test_L1\\tPU:test_L1\\tSM:test_test\\tLB:test\\tDS:null\\tPL:ILLUMINA\"", + "data_type": "fastq", + "size": 1 + }, + [ + "/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/fastq/test_1.fastq.gz", + "/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/fastq/test_2.fastq.gz" + ] + ] + ] + } + ], + "timestamp": "2023-10-16T14:12:54.640503" + } +} \ No newline at end of file diff --git a/subworkflows/local/vcf_annotate_all/main.nf b/subworkflows/local/vcf_annotate_all/main.nf new file mode 100644 index 0000000000..89d4b696c3 --- /dev/null +++ b/subworkflows/local/vcf_annotate_all/main.nf @@ -0,0 +1,75 @@ +// +// ANNOTATION +// + +include { VCF_ANNOTATE_BCFTOOLS } from '../vcf_annotate_bcftools/main' +include { VCF_ANNOTATE_ENSEMBLVEP } from '../../nf-core/vcf_annotate_ensemblvep/main' +include { VCF_ANNOTATE_ENSEMBLVEP as VCF_ANNOTATE_MERGE } from '../../nf-core/vcf_annotate_ensemblvep/main' +include { VCF_ANNOTATE_SNPEFF } from '../../nf-core/vcf_annotate_snpeff/main' + +workflow VCF_ANNOTATE_ALL { + take: + vcf // channel: [ val(meta), vcf ] + fasta + tools // Mandatory, list of tools to apply + snpeff_db + snpeff_cache + vep_genome + vep_species + vep_cache_version + vep_cache + vep_extra_files + bcftools_annotations + bcftools_annotations_index + bcftools_header_lines + + main: + reports = Channel.empty() + vcf_ann = Channel.empty() + tab_ann = Channel.empty() + json_ann = Channel.empty() + versions = Channel.empty() + + if (tools.split(',').contains('bcfann')) { + VCF_ANNOTATE_BCFTOOLS(vcf, bcftools_annotations, bcftools_annotations_index, bcftools_header_lines) + + vcf_ann = vcf_ann.mix(VCF_ANNOTATE_BCFTOOLS.out.vcf_tbi) + versions = versions.mix(VCF_ANNOTATE_BCFTOOLS.out.versions) + } + + + if (tools.split(',').contains('merge') || tools.split(',').contains('snpeff')) { + VCF_ANNOTATE_SNPEFF(vcf, snpeff_db, snpeff_cache) + + reports = reports.mix(VCF_ANNOTATE_SNPEFF.out.reports) + vcf_ann = vcf_ann.mix(VCF_ANNOTATE_SNPEFF.out.vcf_tbi) + versions = versions.mix(VCF_ANNOTATE_SNPEFF.out.versions) + } + + if (tools.split(',').contains('merge')) { + 
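+        // 'merge' chains the two annotators: the snpEff-annotated VCF produced above is passed
+        // through VEP as well; the empty third tuple element stands in for the optional extra files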
vcf_ann_for_merge = VCF_ANNOTATE_SNPEFF.out.vcf_tbi.map{ meta, vcf, tbi -> [ meta, vcf, [] ] } + VCF_ANNOTATE_MERGE(vcf_ann_for_merge, fasta, vep_genome, vep_species, vep_cache_version, vep_cache, vep_extra_files) + + reports = reports.mix(VCF_ANNOTATE_MERGE.out.reports) + vcf_ann = vcf_ann.mix(VCF_ANNOTATE_MERGE.out.vcf_tbi) + versions = versions.mix(VCF_ANNOTATE_MERGE.out.versions) + } + + if (tools.split(',').contains('vep')) { + vcf_for_vep = vcf.map{ meta, vcf -> [ meta, vcf, [] ] } + VCF_ANNOTATE_ENSEMBLVEP(vcf_for_vep, fasta, vep_genome, vep_species, vep_cache_version, vep_cache, vep_extra_files) + + reports = reports.mix(VCF_ANNOTATE_ENSEMBLVEP.out.reports) + vcf_ann = vcf_ann.mix(VCF_ANNOTATE_ENSEMBLVEP.out.vcf_tbi) + tab_ann = tab_ann.mix(VCF_ANNOTATE_ENSEMBLVEP.out.tab) + json_ann = json_ann.mix(VCF_ANNOTATE_ENSEMBLVEP.out.json) + versions = versions.mix(VCF_ANNOTATE_ENSEMBLVEP.out.versions) + } + + emit: + vcf_ann // channel: [ val(meta), vcf.gz, vcf.gz.tbi ] + tab_ann + json_ann + reports // path: *.html + versions // path: versions.yml +} diff --git a/subworkflows/local/vcf_annotate_bcftools/main.nf b/subworkflows/local/vcf_annotate_bcftools/main.nf new file mode 100644 index 0000000000..e54c52aa7c --- /dev/null +++ b/subworkflows/local/vcf_annotate_bcftools/main.nf @@ -0,0 +1,33 @@ + +// +// Run BCFtools to annotate VCF files +// + +include { BCFTOOLS_ANNOTATE } from '../../../modules/nf-core/bcftools/annotate/main' +include { TABIX_TABIX } from '../../../modules/nf-core/tabix/tabix/main' + +workflow VCF_ANNOTATE_BCFTOOLS { + take: + vcf // channel: [ val(meta), vcf ] + annotations // + annotations_index // + header_lines // + + + main: + ch_versions = Channel.empty() + + BCFTOOLS_ANNOTATE(vcf, annotations, annotations_index, header_lines) + TABIX_TABIX(BCFTOOLS_ANNOTATE.out.vcf) + + ch_vcf_tbi = BCFTOOLS_ANNOTATE.out.vcf.join(TABIX_TABIX.out.tbi, failOnDuplicate: true, failOnMismatch: true) + + + // Gather versions of all tools used + ch_versions = ch_versions.mix(BCFTOOLS_ANNOTATE.out.versions) + ch_versions = ch_versions.mix(TABIX_TABIX.out.versions) + + emit: + vcf_tbi = ch_vcf_tbi // channel: [ val(meta), vcf.gz, vcf.gz.tbi ] + versions = ch_versions // path: versions.yml +} diff --git a/subworkflows/local/vcf_concatenate_germline/main.nf b/subworkflows/local/vcf_concatenate_germline/main.nf new file mode 100644 index 0000000000..87f46b22e1 --- /dev/null +++ b/subworkflows/local/vcf_concatenate_germline/main.nf @@ -0,0 +1,42 @@ +// +// CONCATENATE Germline VCFs +// + +// Concatenation of germline vcf-files +include { ADD_INFO_TO_VCF } from '../../../modules/local/add_info_to_vcf/main' +include { TABIX_BGZIPTABIX as TABIX_EXT_VCF } from '../../../modules/nf-core/tabix/bgziptabix/main' +include { BCFTOOLS_CONCAT as GERMLINE_VCFS_CONCAT } from '../../../modules/nf-core/bcftools/concat/main' +include { BCFTOOLS_SORT as GERMLINE_VCFS_CONCAT_SORT } from '../../../modules/nf-core/bcftools/sort/main' +include { TABIX_TABIX as TABIX_GERMLINE_VCFS_CONCAT_SORT } from '../../../modules/nf-core/tabix/tabix/main' + +workflow CONCATENATE_GERMLINE_VCFS { + + take: + vcfs + + main: + versions = Channel.empty() + + // Concatenate vcf-files + ADD_INFO_TO_VCF(vcfs) + TABIX_EXT_VCF(ADD_INFO_TO_VCF.out.vcf) + + // Gather vcfs and vcf-tbis for concatenating germline-vcfs + germline_vcfs_with_tbis = TABIX_EXT_VCF.out.gz_tbi.map{ meta, vcf, tbi -> [ meta.subMap('id'), vcf, tbi ] }.groupTuple() + + GERMLINE_VCFS_CONCAT(germline_vcfs_with_tbis) + 
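+    // The per-sample concatenated VCF is then coordinate-sorted and tabix-indexed below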
GERMLINE_VCFS_CONCAT_SORT(GERMLINE_VCFS_CONCAT.out.vcf)
+    TABIX_GERMLINE_VCFS_CONCAT_SORT(GERMLINE_VCFS_CONCAT_SORT.out.vcf)
+
+    // Gather versions of all tools used
+    versions = versions.mix(ADD_INFO_TO_VCF.out.versions)
+    versions = versions.mix(TABIX_EXT_VCF.out.versions)
+    versions = versions.mix(GERMLINE_VCFS_CONCAT.out.versions)
+    versions = versions.mix(GERMLINE_VCFS_CONCAT_SORT.out.versions)
+    versions = versions.mix(TABIX_GERMLINE_VCFS_CONCAT_SORT.out.versions)
+
+    emit:
+    vcfs = germline_vcfs_with_tbis // post processed vcfs
+
+    versions // channel: [ versions.yml ]
+}
diff --git a/subworkflows/local/vcf_qc_bcftools_vcftools/main.nf b/subworkflows/local/vcf_qc_bcftools_vcftools/main.nf
new file mode 100644
index 0000000000..bcdc34e30d
--- /dev/null
+++ b/subworkflows/local/vcf_qc_bcftools_vcftools/main.nf
@@ -0,0 +1,30 @@
+include { BCFTOOLS_STATS                } from '../../../modules/nf-core/bcftools/stats/main'
+include { VCFTOOLS as VCFTOOLS_SUMMARY    } from '../../../modules/nf-core/vcftools/main'
+include { VCFTOOLS as VCFTOOLS_TSTV_COUNT } from '../../../modules/nf-core/vcftools/main'
+include { VCFTOOLS as VCFTOOLS_TSTV_QUAL  } from '../../../modules/nf-core/vcftools/main'
+
+workflow VCF_QC_BCFTOOLS_VCFTOOLS {
+    take:
+    vcf
+    target_bed
+
+    main:
+
+    versions = Channel.empty()
+
+    BCFTOOLS_STATS(vcf.map{ meta, vcf -> [ meta, vcf, [] ] }, [[:],[]], [[:],[]], [[:],[]], [[:],[]], [[:],[]])
+    VCFTOOLS_TSTV_COUNT(vcf, target_bed, [])
+    VCFTOOLS_TSTV_QUAL(vcf, target_bed, [])
+    VCFTOOLS_SUMMARY(vcf, target_bed, [])
+
+    versions = versions.mix(BCFTOOLS_STATS.out.versions)
+    versions = versions.mix(VCFTOOLS_TSTV_COUNT.out.versions)
+
+    emit:
+    bcftools_stats = BCFTOOLS_STATS.out.stats
+    vcftools_tstv_counts = VCFTOOLS_TSTV_COUNT.out.tstv_count
+    vcftools_tstv_qual = VCFTOOLS_TSTV_QUAL.out.tstv_qual
+    vcftools_filter_summary = VCFTOOLS_SUMMARY.out.filter_summary
+
+    versions
+}
diff --git a/subworkflows/local/vcf_variant_filtering_gatk/main.nf b/subworkflows/local/vcf_variant_filtering_gatk/main.nf
new file mode 100644
index 0000000000..1e0cbd210b
--- /dev/null
+++ b/subworkflows/local/vcf_variant_filtering_gatk/main.nf
@@ -0,0 +1,37 @@
+include { GATK4_CNNSCOREVARIANTS as CNNSCOREVARIANTS           } from '../../../modules/nf-core/gatk4/cnnscorevariants/main'
+include { GATK4_FILTERVARIANTTRANCHES as FILTERVARIANTTRANCHES } from '../../../modules/nf-core/gatk4/filtervarianttranches/main'
+
+workflow VCF_VARIANT_FILTERING_GATK {
+
+    take:
+    vcf // channel: [ meta, vcf, tbi ]
+    fasta
+    fasta_fai
+    dict
+    intervals_bed_combined
+    known_sites
+    known_sites_tbi
+
+    main:
+
+    versions = Channel.empty()
+
+    // Don't scatter/gather by intervals, because especially for small regions (targeted or WGS), it easily fails with 0 SNPS in region
+    cnn_in = vcf.combine(intervals_bed_combined).map{ meta, vcf, tbi, intervals -> [ meta, vcf, tbi, [], intervals ] }
+
+    CNNSCOREVARIANTS(cnn_in, fasta, fasta_fai, dict, [], [])
+
+    FILTERVARIANTTRANCHES(CNNSCOREVARIANTS.out.vcf.join(CNNSCOREVARIANTS.out.tbi, failOnDuplicate: true, failOnMismatch: true).combine(intervals_bed_combined), known_sites, known_sites_tbi, fasta, fasta_fai, dict)
+
+    filtered_vcf = FILTERVARIANTTRANCHES.out.vcf
+        // remove no longer necessary field: num_intervals
+        .map{ meta, vcf -> [ meta - meta.subMap('num_intervals'), vcf ] }
+
+    versions = versions.mix(CNNSCOREVARIANTS.out.versions)
+    versions = versions.mix(FILTERVARIANTTRANCHES.out.versions)
+
+    emit:
+    filtered_vcf
+
+    versions
+}
diff --git a/subworkflows/nf-core/bam_ngscheckmate/main.nf
b/subworkflows/nf-core/bam_ngscheckmate/main.nf
new file mode 100644
index 0000000000..4dd106f327
--- /dev/null
+++ b/subworkflows/nf-core/bam_ngscheckmate/main.nf
@@ -0,0 +1,49 @@
+include { BCFTOOLS_MPILEUP } from '../../../modules/nf-core/bcftools/mpileup/main'
+include { NGSCHECKMATE_NCM } from '../../../modules/nf-core/ngscheckmate/ncm/main'
+
+workflow BAM_NGSCHECKMATE {
+
+    take:
+    ch_input   // channel: [ val(meta1), bam/cram ]
+    ch_snp_bed // channel: [ val(meta2), bed ]
+    ch_fasta   // channel: [ val(meta3), fasta ]
+
+    main:
+
+    ch_versions = Channel.empty()
+
+    ch_input_bed = ch_input.combine(ch_snp_bed.collect())
+        // keep the input meta; the bed meta is not needed downstream
+        .map{ input_meta, input_file, bed_meta, bed_file ->
+            [input_meta, input_file, bed_file]
+        }
+
+    BCFTOOLS_MPILEUP (ch_input_bed, ch_fasta.collect(), false)
+    ch_versions = ch_versions.mix(BCFTOOLS_MPILEUP.out.versions)
+
+    BCFTOOLS_MPILEUP
+        .out
+        .vcf
+        .map{meta, vcf -> vcf} // discard individual metas
+        .collect()             // gather all vcfs into a single list
+        .map{files -> [files]} // wrap the list so it is added as one element: [ [vcf1, vcf2, ...] ]
+        .set {ch_collected_vcfs}
+
+    ch_snp_bed
+        .map{meta, bed -> meta}     // use the snp_bed file meta as the meta for the merged channel
+        .combine(ch_collected_vcfs) // add the vcf files after the meta, now looks like [meta, [vcf1, vcf2, ...]]
+        .set {ch_vcfs}
+
+    NGSCHECKMATE_NCM (ch_vcfs, ch_snp_bed, ch_fasta)
+    ch_versions = ch_versions.mix(NGSCHECKMATE_NCM.out.versions)
+
+    emit:
+    corr_matrix = NGSCHECKMATE_NCM.out.corr_matrix // channel: [ meta, corr_matrix ]
+    matched = NGSCHECKMATE_NCM.out.matched         // channel: [ meta, matched ]
+    all = NGSCHECKMATE_NCM.out.all                 // channel: [ meta, all ]
+    vcf = BCFTOOLS_MPILEUP.out.vcf                 // channel: [ meta, vcf ]
+    pdf = NGSCHECKMATE_NCM.out.pdf                 // channel: [ meta, pdf ]
+    versions = ch_versions                         // channel: [ versions.yml ]
+
+}
diff --git a/subworkflows/nf-core/bam_ngscheckmate/meta.yml b/subworkflows/nf-core/bam_ngscheckmate/meta.yml
new file mode 100644
index 0000000000..7de0a114d4
--- /dev/null
+++ b/subworkflows/nf-core/bam_ngscheckmate/meta.yml
@@ -0,0 +1,68 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json
+name: "bam_ngscheckmate"
+description: Take a set of bam files and run NGSCheckMate to determine whether samples match with each other, using a set of SNPs.
+keywords:
+  - ngscheckmate
+  - qc
+  - bam
+  - snp
+components:
+  - bcftools/mpileup
+  - ngscheckmate/ncm
+input:
+  - meta1:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test' ]
+  - bam:
+      type: file
+      description: BAM files for each sample
+      pattern: "*.{bam}"
+  - meta2:
+      type: map
+      description: |
+        Groovy Map containing bed file information
+        e.g. [ id:'sarscov2' ]
+  - snp_bed:
+      type: file
+      description: BED file containing the SNPs to analyse. NGSCheckMate provides some default ones for hg19/hg38.
+      pattern: "*.{bed}"
+  - meta3:
+      type: map
+      description: |
+        Groovy Map containing reference genome meta information
+        e.g.
[ id:'sarscov2' ]
+  - fasta:
+      type: file
+      description: fasta file for the genome
+      pattern: "*.{fasta}"
+output:
+  - pdf:
+      type: file
+      description: A pdf containing a dendrogram showing how the samples match up
+      pattern: "*.{pdf}"
+  - corr_matrix:
+      type: file
+      description: A text file containing the correlation matrix between each sample
+      pattern: "*corr_matrix.txt"
+  - matched:
+      type: file
+      description: A txt file containing only the samples that match with each other
+      pattern: "*matched.txt"
+  - all:
+      type: file
+      description: A txt file containing all the sample comparisons, whether they match or not
+      pattern: "*all.txt"
+  - vcf:
+      type: file
+      description: vcf files for each sample giving the SNP calls
+      pattern: "*.vcf"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@SPPearce"
+maintainers:
+  - "@SPPearce"
diff --git a/subworkflows/nf-core/bam_ngscheckmate/nextflow.config b/subworkflows/nf-core/bam_ngscheckmate/nextflow.config
new file mode 100644
index 0000000000..cad9f57cc7
--- /dev/null
+++ b/subworkflows/nf-core/bam_ngscheckmate/nextflow.config
@@ -0,0 +1,13 @@
+// IMPORTANT: Add this configuration to your modules.config
+
+process {
+    withName: ".*BAM_NGSCHECKMATE:BCFTOOLS_MPILEUP" {
+        ext.args2 = '--no-version --ploidy 1 -c'
+        ext.args3 = '--no-version'
+    }
+
+    withName: ".*BAM_NGSCHECKMATE:NGSCHECKMATE_NCM" {
+        ext.args = '-V'
+    }
+
+}
diff --git a/subworkflows/nf-core/vcf_annotate_ensemblvep/main.nf b/subworkflows/nf-core/vcf_annotate_ensemblvep/main.nf
new file mode 100644
index 0000000000..291eddc11b
--- /dev/null
+++ b/subworkflows/nf-core/vcf_annotate_ensemblvep/main.nf
@@ -0,0 +1,45 @@
+//
+// Run VEP to annotate VCF files
+//
+
+include { ENSEMBLVEP_VEP } from '../../../modules/nf-core/ensemblvep/vep/main'
+include { TABIX_TABIX    } from '../../../modules/nf-core/tabix/tabix/main'
+
+workflow VCF_ANNOTATE_ENSEMBLVEP {
+    take:
+    ch_vcf            // channel: [ val(meta), path(vcf), [path(custom_file1), path(custom_file2)... (optional)]]
+    ch_fasta          // channel: [ val(meta2), path(fasta) ] (optional)
+    val_genome        // value: genome to use
+    val_species       // value: species to use
+    val_cache_version // value: cache version to use
+    ch_cache          // channel: [ val(meta3), path(cache) ] (optional)
+    ch_extra_files    // channel: [ path(file1), path(file2)...
] (optional)
+
+    main:
+    ch_versions = Channel.empty()
+
+    ENSEMBLVEP_VEP(
+        ch_vcf,
+        val_genome,
+        val_species,
+        val_cache_version,
+        ch_cache,
+        ch_fasta,
+        ch_extra_files
+    )
+
+    TABIX_TABIX(ENSEMBLVEP_VEP.out.vcf)
+
+    ch_vcf_tbi = ENSEMBLVEP_VEP.out.vcf.join(TABIX_TABIX.out.tbi, failOnDuplicate: true, failOnMismatch: true)
+
+    // Gather versions of all tools used
+    ch_versions = ch_versions.mix(ENSEMBLVEP_VEP.out.versions)
+    ch_versions = ch_versions.mix(TABIX_TABIX.out.versions)
+
+    emit:
+    vcf_tbi = ch_vcf_tbi                // channel: [ val(meta), path(vcf), path(tbi) ]
+    json = ENSEMBLVEP_VEP.out.json      // channel: [ val(meta), path(json) ]
+    tab = ENSEMBLVEP_VEP.out.tab        // channel: [ val(meta), path(tab) ]
+    reports = ENSEMBLVEP_VEP.out.report // channel: [ path(html) ]
+    versions = ch_versions              // channel: [ versions.yml ]
+}
diff --git a/subworkflows/nf-core/vcf_annotate_ensemblvep/meta.yml b/subworkflows/nf-core/vcf_annotate_ensemblvep/meta.yml
new file mode 100644
index 0000000000..15d42da23f
--- /dev/null
+++ b/subworkflows/nf-core/vcf_annotate_ensemblvep/meta.yml
@@ -0,0 +1,65 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json
+name: vcf_annotate_ensemblvep
+description: Perform annotation with ensemblvep and bgzip + tabix index the resulting VCF file
+keywords:
+  - vcf
+  - annotation
+  - ensemblvep
+components:
+  - ensemblvep/vep
+  - tabix/tabix
+input:
+  - ch_vcf:
+      description: |
+        vcf file to annotate
+        Structure: [ val(meta), path(vcf), [path(custom_file1), path(custom_file2)... (optional)] ]
+  - ch_fasta:
+      description: |
+        Reference genome fasta file (optional)
+        Structure: [ val(meta2), path(fasta) ]
+  - val_genome:
+      type: string
+      description: genome to use
+  - val_species:
+      type: string
+      description: species to use
+  - val_cache_version:
+      type: integer
+      description: cache version to use
+  - ch_cache:
+      description: |
+        the root cache folder for ensemblvep (optional)
+        Structure: [ val(meta3), path(cache) ]
+  - ch_extra_files:
+      description: |
+        any extra files needed by plugins for ensemblvep (optional)
+        Structure: [ path(file1), path(file2)...
] +output: + - vcf_tbi: + description: | + Compressed vcf file + tabix index + Structure: [ val(meta), path(vcf), path(tbi) ] + - json: + description: | + json file + Structure: [ val(meta), path(json) ] + - tab: + description: | + tab file + Structure: [ val(meta), path(tab) ] + - reports: + type: file + description: html reports + pattern: "*.html" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@maxulysse" + - "@matthdsm" + - "@nvnieuwk" +maintainers: + - "@maxulysse" + - "@matthdsm" + - "@nvnieuwk" diff --git a/subworkflows/nf-core/vcf_annotate_snpeff/main.nf b/subworkflows/nf-core/vcf_annotate_snpeff/main.nf new file mode 100644 index 0000000000..3570a5b7c0 --- /dev/null +++ b/subworkflows/nf-core/vcf_annotate_snpeff/main.nf @@ -0,0 +1,28 @@ +// +// Run SNPEFF to annotate VCF files +// + +include { SNPEFF_SNPEFF } from '../../../modules/nf-core/snpeff/snpeff/main.nf' +include { TABIX_BGZIPTABIX } from '../../../modules/nf-core/tabix/bgziptabix/main.nf' + +workflow VCF_ANNOTATE_SNPEFF { + take: + ch_vcf // channel: [ val(meta), path(vcf) ] + val_snpeff_db // string: db version to use + ch_snpeff_cache // channel: [ path(cache) ] (optional) + + main: + ch_versions = Channel.empty() + + SNPEFF_SNPEFF(ch_vcf, val_snpeff_db, ch_snpeff_cache) + TABIX_BGZIPTABIX(SNPEFF_SNPEFF.out.vcf) + + // Gather versions of all tools used + ch_versions = ch_versions.mix(SNPEFF_SNPEFF.out.versions) + ch_versions = ch_versions.mix(TABIX_BGZIPTABIX.out.versions) + + emit: + vcf_tbi = TABIX_BGZIPTABIX.out.gz_tbi // channel: [ val(meta), path(vcf), path(tbi) ] + reports = SNPEFF_SNPEFF.out.report // channel: [ path(html) ] + versions = ch_versions // channel: [ path(versions.yml) ] +} diff --git a/subworkflows/nf-core/vcf_annotate_snpeff/meta.yml b/subworkflows/nf-core/vcf_annotate_snpeff/meta.yml new file mode 100644 index 0000000000..c8d5a635ee --- /dev/null +++ b/subworkflows/nf-core/vcf_annotate_snpeff/meta.yml @@ -0,0 +1,40 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json +name: vcf_annotate_snpeff +description: Perform annotation with snpEff and bgzip + tabix index the resulting VCF file +keywords: + - vcf + - annotation + - snpeff +components: + - snpeff + - snpeff/snpeff + - tabix/bgziptabix +input: + - ch_vcf: + description: | + vcf file + Structure: [ val(meta), path(vcf) ] + - val_snpeff_db: + type: string + description: db version to use + - ch_snpeff_cache: + description: | + path to root cache folder for snpEff (optional) + Structure: [ path(cache) ] +output: + - vcf_tbi: + description: | + Compressed vcf file + tabix index + Structure: [ val(meta), path(vcf), path(tbi) ] + - reports: + description: | + html reports + Structure: [ path(html) ] + - versions: + description: | + Files containing software versions + Structure: [ path(versions.yml) ] +authors: + - "@maxulysse" +maintainers: + - "@maxulysse" diff --git a/tests/config/bcfann_test_header.txt b/tests/config/bcfann_test_header.txt new file mode 100644 index 0000000000..443dd3ea4a --- /dev/null +++ b/tests/config/bcfann_test_header.txt @@ -0,0 +1 @@ +##INFO= diff --git a/tests/config/pytesttags.yml b/tests/config/pytesttags.yml new file mode 100644 index 0000000000..f33e55023c --- /dev/null +++ b/tests/config/pytesttags.yml @@ -0,0 +1,611 @@ +# default +default: + - "**" + +# default_extended + +tumor_normal_pair: + - conf/modules/** + - main.nf + - modules/** + - nextflow.config + - 
nextflow_schema.json + - subworkflows/** + - tests/csv/3.0/fastq_pair.csv + - tests/test_tumor_normal_pair.yml + - workflows/** + +save_mapped_only: + - conf/modules/** + - main.nf + - modules/** + - nextflow.config + - nextflow_schema.json + - subworkflows/** + - tests/csv/3.0/fastq_single.csv + - tests/test_save_mapped.yml + - workflows/** + +save_output_as_bam_only: + - conf/modules/** + - main.nf + - modules/** + - nextflow.config + - nextflow_schema.json + - subworkflows/** + - tests/csv/3.0/fastq_single.csv + - tests/test_save_output_as_bam_only.yml + - workflows/** + +skip_all_qc: + - conf/modules/** + - main.nf + - modules/** + - nextflow.config + - nextflow_schema.json + - subworkflows/** + - tests/csv/3.0/fastq_single.csv + - tests/test_skip_all_qc.yml + - workflows/** + +skip_markduplicates: + - conf/modules/** + - main.nf + - modules/** + - nextflow.config + - nextflow_schema.json + - subworkflows/** + - tests/csv/3.0/fastq_single.csv + - tests/test_skip_markduplicates.yml + - workflows/** + +validation_checks: + - conf/modules/** + - main.nf + - modules/** + - nextflow.config + - nextflow_schema.json + - subworkflows/** + - tests/csv/3.0/sample_with_space.csv + - tests/test_samplesheet_validation_spaces.yml + - workflows/** + +# preprocessing + +## alignment_to_fastq +alignment_to_fastq: + - conf/modules/alignment_to_fastq.config + - modules/nf-core/cat/fastq/** + - modules/nf-core/samtools/collatefastq/** + - modules/nf-core/samtools/merge/** + - modules/nf-core/samtools/view/** + - subworkflows/local/bam_convert_samtools/** + - tests/csv/3.0/bam_for_remapping.csv + - tests/test_alignment_to_fastq.yml + +## umi +umi: + - conf/modules/umi.config + - modules/nf-core/bwa/mem/** + - modules/nf-core/bwamem2/mem/** + - modules/nf-core/dragmap/align/** + - modules/nf-core/fgbio/callmolecularconsensusreads/** + - modules/nf-core/fgbio/fastqtobam/** + - modules/nf-core/fgbio/groupreadsbyumi/** + - modules/nf-core/samblaster/** + - modules/nf-core/samtools/bam2fq/** + - subworkflows/local/fastq_align_bwamem_mem2_dragmap/** + - subworkflows/local/fastq_create_umi_consensus_fgbio/** + - tests/csv/3.0/fastq_umi.csv + - tests/test_umi.yml + +## fastp +fastp: + - conf/modules/trimming.config + - modules/nf-core/fastp/** + - tests/csv/3.0/fastq_single.csv + - tests/test_fastp.yml + +## aligner + +### bwamem +bwamem: + - conf/modules/aligner.config + - modules/nf-core/bwa/mem/** + - subworkflows/local/fastq_align_bwamem_mem2_dragmap_sentieon/** + - tests/csv/3.0/fastq_single.csv + - tests/test_aligner_bwamem.yml + +### bwamem2 +bwamem2: + - conf/modules/aligner.config + - modules/nf-core/bwamem2/mem/** + - subworkflows/local/fastq_align_bwamem_mem2_dragmap_sentieon/** + - tests/csv/3.0/fastq_single.csv + - tests/test_aligner_bwamem2.yml + +### dragmap +dragmap: + - conf/modules/aligner.config + - modules/nf-core/dragmap/align/** + - subworkflows/local/fastq_align_bwamem_mem2_dragmap_sentieon/** + - tests/csv/3.0/fastq_single.csv + - tests/test_aligner_dragmap.yml + +### sentieon/bwamem +sentieon/bwamem: + - conf/modules/aligner.config + - modules/nf-core/sentieon/bwamem/** + - subworkflows/local/fastq_align_bwamem_mem2_dragmap_sentieon/** + - tests/csv/3.0/fastq_single.csv + - tests/test_sentieon_aligner_bwamem.yml + +## markduplicates +gatk4/markduplicates: + - conf/modules/markduplicates.config + - modules/nf-core/gatk4/markduplicates/** + - modules/nf-core/mosdepth/** + - modules/nf-core/samtools/convert/** + - modules/nf-core/samtools/index/** + - modules/nf-core/samtools/stats/** + - 
subworkflows/local/bam_markduplicates/** + - subworkflows/local/cram_qc_mosdepth_samtools/** + - tests/csv/3.0/mapped_single_bam.csv + - tests/csv/3.0/mapped_single_cram.csv + - tests/test_markduplicates_from_bam.yml + - tests/test_markduplicates_from_cram.yml + +## sentieon/dedup +sentieon/dedup: + - conf/modules/sentieon_dedup.config + - modules/nf-core/sentieon/bwamem/** + - modules/nf-core/mosdepth/** + - modules/nf-core/samtools/index/** + - modules/nf-core/samtools/stats/** + - subworkflows/local/bam_sentieon_dedup/** + - subworkflows/local/cram_qc_mosdepth_samtools/** + - tests/csv/3.0/mapped_single_bam.csv + - tests/csv/3.0/mapped_single_cram.csv + - tests/test_sentieon_dedup_from_bam.yml + - tests/test_sentieon_dedup_from_cram.yml + +## prepare_recalibration +prepare_recalibration: + - conf/modules/prepare_recalibration.config + - modules/nf-core/gatk4/baserecalibrator/** + - modules/nf-core/gatk4/gatherbqsrreports/** + - modules/nf-core/samtools/convert/** + - subworkflows/local/bam_baserecalibrator/** + - tests/csv/3.0/mapped_single_bam.csv + - tests/csv/3.0/mapped_single_cram.csv + - tests/test_prepare_recalibration_from_bam.yml + - tests/test_prepare_recalibration_from_cram.yml + +## recalibrate +recalibrate: + - conf/modules/recalibrate.config + - modules/nf-core/gatk4/applybqsr/** + - modules/nf-core/samtools/convert/** + - modules/nf-core/samtools/index/** + - modules/nf-core/samtools/merge/** + - subworkflows/local/bam_applybqsr/** + - subworkflows/local/cram_merge_index_samtools/** + - tests/csv/3.0/prepare_recalibration_single_bam.csv + - tests/csv/3.0/prepare_recalibration_single_cram.csv + - tests/test_recalibrate_from_bam.yml + - tests/test_recalibrate_from_cram.yml + +## intervals +intervals: + - conf/modules/prepare_intervals.config + - modules/local/build_intervals/** + - modules/local/create_intervals_bed/** + - modules/nf-core/gatk4/intervallisttobed/** + - modules/nf-core/tabix/bgziptabix/** + - subworkflows/local/prepare_intervals/** + - tests/csv/3.0/fastq_single.csv + - tests/test_intervals.yml + +## gatk4spark +gatk4spark: + - conf/modules/markduplicates.config + - conf/modules/prepare_recalibration.config + - conf/modules/recalibrate.config + - modules/nf-core/gatk4spark/applybqsr/** + - modules/nf-core/gatk4spark/baserecalibrator/** + - modules/nf-core/gatk4/estimatelibrarycomplexity/** + - modules/nf-core/gatk4spark/markduplicates/** + - subworkflows/local/bam_applybqsr_spark/** + - subworkflows/local/bam_baserecalibrator_spark/** + - subworkflows/local/bam_markduplicates_spark/** + - tests/csv/3.0/fastq_single.csv + - tests/test_gatk4spark.yml + +# variant calling + +## cnvkit +cnvkit: + - conf/modules/cnvkit.config + - modules/nf-core/cnvkit/antitarget/** + - modules/nf-core/cnvkit/batch/** + - modules/nf-core/cnvkit/reference/** + - subworkflows/local/bam_variant_calling_cnvkit/** + - subworkflows/local/bam_variant_calling_germline_all/** + - subworkflows/local/bam_variant_calling_somatic_all/** + - subworkflows/local/bam_variant_calling_tumor_only_all/** + - subworkflows/local/prepare_reference_cnvkit/** + - tests/csv/3.0/recalibrated_germline.csv + - tests/csv/3.0/recalibrated_somatic.csv + - tests/csv/3.0/recalibrated_tumoronly.csv + - tests/test_cnvkit.yml + +## controlfreec +controlfreec: + - conf/modules/controlfreec.config + - conf/modules/mpileup.config + - modules/nf-core/cat/cat/** + - modules/nf-core/controlfreec/assesssignificance/** + - modules/nf-core/controlfreec/freec/** + - modules/nf-core/controlfreec/freec2bed/** + - 
modules/nf-core/controlfreec/freec2circos/** + - modules/nf-core/controlfreec/makegraph/** + - modules/nf-core/samtools/mpileup/** + - subworkflows/local/bam_variant_calling_mpileup/** + - subworkflows/local/bam_variant_calling_somatic_all/** + - subworkflows/local/bam_variant_calling_somatic_controlfreec/** + - subworkflows/local/bam_variant_calling_tumor_only_all/** + - subworkflows/local/bam_variant_calling_tumor_only_controlfreec/** + - tests/csv/3.0/recalibrated_somatic.csv + - tests/csv/3.0/recalibrated_tumoronly.csv + - tests/test_controlfreec.yml + +## deepvariant +deepvariant: + - conf/modules/deepvariant.config + - modules/nf-core/deepvariant/** + - modules/nf-core/gatk4/mergevcfs/** + - modules/nf-core/tabix/tabix/** + - subworkflows/local/bam_variant_calling_deepvariant/** + - subworkflows/local/bam_variant_calling_germline_all/** + - tests/csv/3.0/recalibrated_germline.csv + - tests/test_deepvariant.yml + +## freebayes +freebayes: + - conf/modules/freebayes.config + - modules/nf-core/bcftools/sort/** + - modules/nf-core/freebayes/** + - modules/nf-core/gatk4/mergevcfs/** + - modules/nf-core/tabix/tabix/** + - subworkflows/local/bam_variant_calling_freebayes/** + - subworkflows/local/bam_variant_calling_germline_all/** + - subworkflows/local/bam_variant_calling_somatic_all/** + - subworkflows/local/bam_variant_calling_tumor_only_all/** + - tests/csv/3.0/fastq_pair.csv + - tests/csv/3.0/fastq_single.csv + - tests/csv/3.0/recalibrated_tumoronly.csv + - tests/test_freebayes.yml + +## haplotypecaller +haplotypecaller: + - conf/modules/haplotypecaller.config + - modules/nf-core/gatk4/cnnscorevariants/** + - modules/nf-core/gatk4/filtervarianttranches/** + - modules/nf-core/gatk4/haplotypecaller/** + - modules/nf-core/gatk4/mergevcfs/** + - modules/nf-core/samtools/index/** + - modules/nf-core/samtools/merge/** + - subworkflows/local/bam_merge_index_samtools/** + - subworkflows/local/bam_variant_calling_germline_all/** + - subworkflows/local/bam_variant_calling_haplotypecaller/** + - subworkflows/local/vcf_variant_filtering_gatk/** + - tests/csv/3.0/mapped_single_bam.csv + - tests/test_haplotypecaller.yml + +haplotypecaller_skip_filter: + - conf/modules/haplotypecaller.config + - modules/nf-core/gatk4/haplotypecaller/** + - modules/nf-core/gatk4/mergevcfs/** + - modules/nf-core/samtools/index/** + - modules/nf-core/samtools/merge/** + - subworkflows/local/bam_merge_index_samtools/** + - subworkflows/local/bam_variant_calling_germline_all/** + - subworkflows/local/bam_variant_calling_haplotypecaller/** + - tests/csv/3.0/mapped_single_bam.csv + - tests/test_haplotypecaller_skip_filter.yml + +## sentieon/dnascope +sentieon/dnascope: + - conf/modules/sentieon_dnascope.config + - modules/nf-core/sentieon/dnascope/** + - modules/nf-core/gatk4/mergevcfs/** + - modules/nf-core/samtools/index/** + - modules/nf-core/samtools/merge/** + - subworkflows/local/bam_merge_index_samtools/** + - subworkflows/local/bam_variant_calling_germline_all/** + - subworkflows/local/bam_variant_calling_sentieon_dnascope/** + - tests/csv/3.0/mapped_single_bam.csv + - tests/test_sentieon_dnascope.yml + +sentieon_dnascope_skip_filter: + - conf/modules/sentieon_dnascope.config + - modules/nf-core/sentieon/dnascope/** + - modules/nf-core/gatk4/mergevcfs/** + - modules/nf-core/samtools/index/** + - modules/nf-core/samtools/merge/** + - subworkflows/local/bam_merge_index_samtools/** + - subworkflows/local/bam_variant_calling_germline_all/** + - subworkflows/local/bam_variant_calling_sentieon_dnascope/** + - 
tests/csv/3.0/mapped_single_bam.csv
+  - tests/test_sentieon_dnascope_skip_filter.yml
+
+## sentieon/haplotyper
+sentieon/haplotyper:
+  - conf/modules/sentieon_haplotyper.config
+  - modules/nf-core/gatk4/cnnscorevariants/**
+  - modules/nf-core/gatk4/filtervarianttranches/**
+  - modules/nf-core/sentieon/haplotyper/**
+  - modules/nf-core/gatk4/mergevcfs/**
+  - modules/nf-core/samtools/index/**
+  - modules/nf-core/samtools/merge/**
+  - subworkflows/local/bam_merge_index_samtools/**
+  - subworkflows/local/bam_variant_calling_germline_all/**
+  - subworkflows/local/bam_variant_calling_sentieon_haplotyper/**
+  - subworkflows/local/vcf_variant_filtering_gatk/**
+  - tests/csv/3.0/mapped_single_bam.csv
+  - tests/test_sentieon_haplotyper.yml
+
+sentieon_haplotyper_skip_filter:
+  - conf/modules/sentieon_haplotyper.config
+  - modules/nf-core/sentieon/haplotyper/**
+  - modules/nf-core/gatk4/mergevcfs/**
+  - modules/nf-core/samtools/index/**
+  - modules/nf-core/samtools/merge/**
+  - subworkflows/local/bam_merge_index_samtools/**
+  - subworkflows/local/bam_variant_calling_germline_all/**
+  - subworkflows/local/bam_variant_calling_sentieon_haplotyper/**
+  - tests/csv/3.0/mapped_single_bam.csv
+  - tests/test_sentieon_haplotyper_skip_filter.yml
+
+## joint_germline
+joint_germline:
+  - conf/modules/haplotypecaller.config
+  - modules/nf-core/bcftools/sort/**
+  - modules/nf-core/gatk4/applyvqsr/**
+  - modules/nf-core/gatk4/cnnscorevariants/**
+  - modules/nf-core/gatk4/filtervarianttranches/**
+  - modules/nf-core/gatk4/genomicsdbimport/**
+  - modules/nf-core/gatk4/genotypegvcfs/**
+  - modules/nf-core/gatk4/haplotypecaller/**
+  - modules/nf-core/gatk4/mergevcfs/**
+  - modules/nf-core/gatk4/variantrecalibrator/**
+  - modules/nf-core/samtools/index/**
+  - modules/nf-core/samtools/merge/**
+  - subworkflows/local/bam_joint_calling_germline_gatk/**
+  - subworkflows/local/bam_merge_index_samtools/**
+  - subworkflows/local/bam_variant_calling_germline_all/**
+  - subworkflows/local/bam_variant_calling_haplotypecaller/**
+  - subworkflows/local/vcf_variant_filtering_gatk/**
+  - tests/csv/3.0/mapped_joint_bam.csv
+  - tests/test_joint_germline.yml
+
+## sentieon_dnascope_joint_germline
+sentieon_dnascope_joint_germline:
+  - conf/modules/prepare_genome.config
+  - conf/modules/sentieon_dnascope.config
+  - conf/modules/sentieon_dnascope_joint_germline.config
+  - modules/nf-core/sentieon/dnascope/**
+  - subworkflows/local/bam_variant_calling_germline_all/**
+  - subworkflows/local/bam_variant_calling_sentieon_dnascope/**
+  - tests/csv/3.0/mapped_joint_bam.csv
+  - tests/test_sentieon_dnascope_joint_germline.yml
+
+## sentieon_haplotyper_joint_germline
+sentieon_haplotyper_joint_germline:
+  - conf/modules/prepare_genome.config
+  - conf/modules/sentieon_haplotyper.config
+  - conf/modules/sentieon_haplotyper_joint_germline.config
+  - modules/nf-core/sentieon/haplotyper/**
+  - subworkflows/local/bam_variant_calling_germline_all/**
+  - subworkflows/local/bam_variant_calling_sentieon_haplotyper/**
+  - tests/csv/3.0/mapped_joint_bam.csv
+  - tests/test_sentieon_haplotyper_joint_germline.yml
+
+## manta
+manta:
+  - conf/modules/manta.config
+  - modules/nf-core/gatk4/mergevcfs/**
+  - modules/nf-core/manta/germline/**
+  - modules/nf-core/manta/somatic/**
+  - modules/nf-core/manta/tumoronly/**
+  - subworkflows/local/bam_variant_calling_germline_all/**
+  - subworkflows/local/bam_variant_calling_germline_manta/**
+  - subworkflows/local/bam_variant_calling_somatic_all/**
+  - subworkflows/local/bam_variant_calling_somatic_manta/**
+  -
subworkflows/local/bam_variant_calling_tumor_only_all/** + - subworkflows/local/bam_variant_calling_tumor_only_manta/** + - tests/csv/3.0/recalibrated_germline.csv + - tests/csv/3.0/recalibrated_somatic.csv + - tests/csv/3.0/recalibrated_tumoronly.csv + - tests/test_manta.yml + +## mpileup +mpileup: + - conf/modules/mpileup.config + - modules/nf-core/cat/cat/** + - modules/nf-core/samtools/mpileup/** + - subworkflows/local/bam_variant_calling_germline_all/** + - subworkflows/local/bam_variant_calling_mpileup/** + - subworkflows/local/bam_variant_calling_somatic_all/** + - subworkflows/local/bam_variant_calling_tumor_only_all/** + - tests/csv/3.0/recalibrated_germline.csv + - tests/csv/3.0/recalibrated_tumoronly.csv + - tests/test_mpileup.yml + +## msisensorpro +msisensorpro: + - conf/modules/msisensorpro.config + - modules/nf-core/msisensorpro/msi_somatic/** + - modules/nf-core/msisensorpro/scan/** + - subworkflows/local/bam_variant_calling_somatic_all/** + - tests/csv/3.0/recalibrated_somatic.csv + - tests/test_msisensorpro.yml + +## mutect2 +mutect2: + - conf/modules/mutect2.config + - modules/nf-core/gatk4/calculatecontamination/** + - modules/nf-core/gatk4/filtermutectcalls/** + - modules/nf-core/gatk4/gatherpileupsummaries/** + - modules/nf-core/gatk4/getpileupsummaries/** + - modules/nf-core/gatk4/learnreadorientationmodel/** + - modules/nf-core/gatk4/mergemutectstats/** + - modules/nf-core/gatk4/mergevcfs/** + - modules/nf-core/gatk4/mutect2/** + - subworkflows/local/bam_variant_calling_somatic_all/** + - subworkflows/local/bam_variant_calling_somatic_mutect2/** + - subworkflows/local/bam_variant_calling_tumor_only_all/** + - subworkflows/local/bam_variant_calling_tumor_only_mutect2/** + - tests/csv/3.0/recalibrated_tumoronly.csv + - tests/test_mutect2.yml + +## strelka +strelka: + - conf/modules/strelka.config + - modules/nf-core/gatk4/mergevcfs/** + - modules/nf-core/strelka/germline/** + - modules/nf-core/strelka/somatic/** + - subworkflows/local/bam_variant_calling_germline_all/** + - subworkflows/local/bam_variant_calling_single_strelka/** + - subworkflows/local/bam_variant_calling_somatic_all/** + - subworkflows/local/bam_variant_calling_somatic_strelka/** + - subworkflows/local/bam_variant_calling_tumor_only_all/** + - tests/csv/3.0/recalibrated_germline.csv + - tests/csv/3.0/recalibrated_somatic.csv + - tests/csv/3.0/recalibrated_tumoronly.csv + - tests/csv/3.0/recalibrated.csv + - tests/test_strelka.yml + +## strelka_bp +strelka_bp: + - conf/modules/manta.config + - conf/modules/strelka.config + - modules/nf-core/gatk4/mergevcfs/** + - modules/nf-core/manta/somatic/** + - modules/nf-core/strelka/somatic/** + - subworkflows/local/bam_variant_calling_germline_all/** + - subworkflows/local/bam_variant_calling_somatic_all/** + - subworkflows/local/bam_variant_calling_somatic_manta/** + - subworkflows/local/bam_variant_calling_somatic_strelka/** + - subworkflows/local/bam_variant_calling_tumor_only_all/** + - tests/csv/3.0/recalibrated_somatic.csv + - tests/test_strelka_bp.yml + +## tiddit +tiddit: + - conf/modules/tiddit.config + - modules/nf-core/svdb/merge/**.nf + - modules/nf-core/tabix/bgziptabix/** + - modules/nf-core/tiddit/sv/** + - subworkflows/local/bam_variant_calling_germline_all/** + - subworkflows/local/bam_variant_calling_single_tiddit/** + - subworkflows/local/bam_variant_calling_somatic_all/** + - subworkflows/local/bam_variant_calling_somatic_tiddit/** + - subworkflows/local/bam_variant_calling_tumor_only_all/** + - tests/csv/3.0/recalibrated_germline.csv + - 
tests/csv/3.0/recalibrated_somatic.csv
+  - tests/csv/3.0/recalibrated_tumoronly.csv
+  - tests/test_tiddit.yml
+
+# annotate
+
+## cache
+cache:
+  - conf/modules/prepare_cache.config
+  - modules/nf-core/ensemblvep/download/**
+  - modules/nf-core/snpeff/download/**
+  - subworkflows/local/prepare_cache/**
+  - tests/test_annotation_cache.yml
+
+## merge
+merge:
+  - conf/modules/annotate.config
+  - modules/nf-core/ensemblvep/vep/**
+  - modules/nf-core/snpeff/snpeff/**
+  - modules/nf-core/tabix/bgziptabix/**
+  - subworkflows/local/vcf_annotate_all/**
+  - subworkflows/nf-core/vcf_annotate_ensemblvep/**
+  - subworkflows/nf-core/vcf_annotate_snpeff/**
+  - tests/csv/3.0/vcf_single.csv
+  - tests/test_annotation_merge.yml
+
+## snpeff
+snpeff:
+  - conf/modules/annotate.config
+  - modules/nf-core/snpeff/snpeff/**
+  - modules/nf-core/tabix/bgziptabix/**
+  - subworkflows/nf-core/vcf_annotate_snpeff/**
+  - tests/csv/3.0/vcf_single.csv
+  - tests/test_annotation_snpeff.yml
+
+## vep
+vep:
+  - conf/modules/annotate.config
+  - modules/nf-core/ensemblvep/vep/**
+  - modules/nf-core/tabix/bgziptabix/**
+  - subworkflows/nf-core/vcf_annotate_ensemblvep/**
+  - tests/csv/3.0/vcf_single.csv
+  - tests/test_annotation_vep.yml
+
+## bcfann
+bcfann:
+  - conf/modules/annotate.config
+  - modules/nf-core/bcftools/annotate/**
+  - modules/nf-core/tabix/bgziptabix/**
+  - subworkflows/local/vcf_annotate_bcftools/**
+  - tests/csv/3.0/vcf_single.csv
+  - tests/test_annotation_bcfann.yml
+
+# postprocessing
+
+## concatenate germline vcfs
+concatenate_vcfs:
+  - conf/modules/post_variant_calling.config
+  - modules/nf-core/bcftools/concat/**
+  - modules/nf-core/bcftools/mpileup/**
+  - modules/nf-core/bcftools/sort/**
+  - modules/nf-core/deepvariant/**
+  - modules/nf-core/freebayes/**
+  - modules/nf-core/gatk4/haplotypecaller/**
+  - modules/nf-core/gatk4/mergevcfs/**
+  - modules/nf-core/manta/germline/**
+  - modules/nf-core/samtools/mpileup/**
+  - modules/nf-core/strelka/germline/**
+  - modules/nf-core/tabix/bgziptabix/**
+  - modules/nf-core/tabix/tabix/**
+  - modules/nf-core/tiddit/sv/**
+  - subworkflows/local/bam_variant_calling_deepvariant/**
+  - subworkflows/local/bam_variant_calling_freebayes/**
+  - subworkflows/local/bam_variant_calling_germline_all/**
+  - subworkflows/local/bam_variant_calling_germline_manta/**
+  - subworkflows/local/bam_variant_calling_haplotypecaller/**
+  - subworkflows/local/bam_variant_calling_mpileup/**
+  - subworkflows/local/bam_variant_calling_single_strelka/**
+  - subworkflows/local/bam_variant_calling_single_tiddit/**
+  - subworkflows/local/bam_variant_calling_somatic_all/**
+  - subworkflows/local/bam_variant_calling_tumor_only_all/**
+  - subworkflows/local/post_variantcalling/**
+  - subworkflows/local/vcf_concatenate_germline/**
+  - tests/csv/3.0/mapped_joint_bam.csv
+  - tests/test_concat_germline_vcfs.yml
+
+# sampleqc
+
+## ngscheckmate
+ngscheckmate:
+  - conf/modules/ngscheckmate.config
+  - modules/nf-core/bcftools/mpileup/**
+  - modules/nf-core/ngscheckmate/ncm/**
+  - subworkflows/local/cram_sampleqc/**
+  - subworkflows/nf-core/bam_ngscheckmate/**
+  - tests/test_ngscheckmate.yml
diff --git a/tests/csv/3.0/ascat_somatic.csv b/tests/csv/3.0/ascat_somatic.csv
new file mode 100644
index 0000000000..8eb30f5531
--- /dev/null
+++ b/tests/csv/3.0/ascat_somatic.csv
@@ -0,0 +1,3 @@
+patient,sex,status,sample,cram,crai
+test3,XX,0,sample3,HG00145.mapped.ILLUMINA.bwa.GBR.low_coverage.20120522.bam.cram,HG00145.mapped.ILLUMINA.bwa.GBR.low_coverage.20120522.bam.cram.crai
+test3,XX,1,sample4,HG00146.mapped.ILLUMINA.bwa.GBR.low_coverage.20120522.bam.cram,HG00146.mapped.ILLUMINA.bwa.GBR.low_coverage.20120522.bam.cram.crai diff --git a/tests/csv/3.0/bam_for_remapping.csv b/tests/csv/3.0/bam_for_remapping.csv new file mode 100644 index 0000000000..a37e07bb83 --- /dev/null +++ b/tests/csv/3.0/bam_for_remapping.csv @@ -0,0 +1,2 @@ +patient,sex,status,sample,lane,bam,bai +test,XX,0,test,1,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/bam/test.paired_end.sorted.bam,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/bam/test.paired_end.sorted.bam.bai diff --git a/tests/csv/3.0/fastq_pair.csv b/tests/csv/3.0/fastq_pair.csv new file mode 100644 index 0000000000..2986ebfd1b --- /dev/null +++ b/tests/csv/3.0/fastq_pair.csv @@ -0,0 +1,3 @@ +patient,sex,status,sample,lane,fastq_1,fastq_2 +test,XX,0,test,test_L1,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/fastq/test_1.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/fastq/test_2.fastq.gz +test,XX,1,test2,test_L1,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/fastq/test2_1.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/fastq/test2_2.fastq.gz diff --git a/tests/csv/3.0/fastq_single.csv b/tests/csv/3.0/fastq_single.csv new file mode 100644 index 0000000000..c89bab1bec --- /dev/null +++ b/tests/csv/3.0/fastq_single.csv @@ -0,0 +1,3 @@ +patient,sex,status,sample,lane,fastq_1,fastq_2 +test,XX,0,test,test_L1,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/fastq/test_1.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/fastq/test_2.fastq.gz +test,XX,0,test,test_L2,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/fastq/test_1.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/fastq/test_2.fastq.gz diff --git a/tests/csv/3.0/fastq_tumor_only.csv b/tests/csv/3.0/fastq_tumor_only.csv new file mode 100644 index 0000000000..ea75421038 --- /dev/null +++ b/tests/csv/3.0/fastq_tumor_only.csv @@ -0,0 +1,2 @@ +patient,sex,status,sample,lane,fastq_1,fastq_2 +test,XX,1,test2,test_L1,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/fastq/test2_1.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/fastq/test2_2.fastq.gz diff --git a/tests/csv/3.0/fastq_umi.csv b/tests/csv/3.0/fastq_umi.csv new file mode 100644 index 0000000000..9c6718c7b7 --- /dev/null +++ b/tests/csv/3.0/fastq_umi.csv @@ -0,0 +1,2 @@ +patient,sex,status,sample,lane,fastq_1,fastq_2 +test,XX,0,test,test_L1,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/fastq/test.umi_1.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/fastq/test.umi_2.fastq.gz diff --git a/tests/csv/3.0/mapped_joint_bam.csv b/tests/csv/3.0/mapped_joint_bam.csv new file mode 100644 index 0000000000..689393be00 --- /dev/null +++ b/tests/csv/3.0/mapped_joint_bam.csv @@ -0,0 +1,3 @@ +patient,sample,bam,bai 
+testN,testN,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/bam/test.paired_end.sorted.bam,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/bam/test.paired_end.sorted.bam.bai +testT,testT,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/bam/test2.paired_end.sorted.bam,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/bam/test2.paired_end.sorted.bam.bai diff --git a/tests/csv/3.0/mapped_single_bam.csv b/tests/csv/3.0/mapped_single_bam.csv new file mode 100644 index 0000000000..8cbe6f9ce2 --- /dev/null +++ b/tests/csv/3.0/mapped_single_bam.csv @@ -0,0 +1,2 @@ +patient,status,sample,bam,bai +test,0,test,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/bam/test.paired_end.sorted.bam,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/bam/test.paired_end.sorted.bam.bai diff --git a/tests/csv/3.0/mapped_single_cram.csv b/tests/csv/3.0/mapped_single_cram.csv new file mode 100644 index 0000000000..1baa471c41 --- /dev/null +++ b/tests/csv/3.0/mapped_single_cram.csv @@ -0,0 +1,2 @@ +patient,status,sample,cram,crai +test,0,test,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/cram/test.paired_end.sorted.cram,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/cram/test.paired_end.sorted.cram.crai diff --git a/tests/csv/3.0/prepare_recalibration_single_bam.csv b/tests/csv/3.0/prepare_recalibration_single_bam.csv new file mode 100644 index 0000000000..a61c3f8222 --- /dev/null +++ b/tests/csv/3.0/prepare_recalibration_single_bam.csv @@ -0,0 +1,2 @@ +patient,status,sample,bam,bai,table +test,0,test,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/bam/test.paired_end.sorted.bam,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/bam/test.paired_end.sorted.bam.bai,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/gatk/test.baserecalibrator.table diff --git a/tests/csv/3.0/prepare_recalibration_single_cram.csv b/tests/csv/3.0/prepare_recalibration_single_cram.csv new file mode 100644 index 0000000000..4adc8fa105 --- /dev/null +++ b/tests/csv/3.0/prepare_recalibration_single_cram.csv @@ -0,0 +1,2 @@ +patient,status,sample,cram,crai,table +test,0,test,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/cram/test.paired_end.sorted.cram,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/cram/test.paired_end.sorted.cram.crai,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/gatk/test.baserecalibrator.table diff --git a/tests/csv/3.0/recalibrated.csv b/tests/csv/3.0/recalibrated.csv new file mode 100644 index 0000000000..fbaba2c90d --- /dev/null +++ b/tests/csv/3.0/recalibrated.csv @@ -0,0 +1,5 @@ +patient,sex,status,sample,cram,crai 
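The `mapped_*` and `prepare_recalibration_*` sheets mirror the checkpoint CSVs the pipeline itself writes under `results/csv/`, which is what makes restarts testable: each sheet carries exactly the columns the target step consumes (`bam`/`bai` or `cram`/`crai`, plus `table` where a precomputed BQSR table is needed). A hedged sketch of re-entering the pipeline at such a checkpoint (the step value is assumed from Sarek's documented entry points rather than stated in this diff):

```bash
# Skip alignment and enter at duplicate marking, reusing the
# already-mapped CRAM and index listed in the samplesheet.
nextflow run main.nf -profile test_cache \
    --input ./tests/csv/3.0/mapped_single_cram.csv \
    --step markduplicates \
    --outdir results
```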
+test,XX,0,sample1,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/cram/test.paired_end.recalibrated.sorted.cram,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/cram/test.paired_end.recalibrated.sorted.cram.crai +test1,XX,1,sample2,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/cram/test2.paired_end.recalibrated.sorted.cram,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/cram/test2.paired_end.recalibrated.sorted.cram.crai +test3,XX,0,sample3,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/cram/test.paired_end.recalibrated.sorted.cram,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/cram/test.paired_end.recalibrated.sorted.cram.crai +test3,XX,1,sample4,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/cram/test2.paired_end.recalibrated.sorted.cram,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/cram/test2.paired_end.recalibrated.sorted.cram.crai diff --git a/tests/csv/3.0/recalibrated_germline.csv b/tests/csv/3.0/recalibrated_germline.csv new file mode 100644 index 0000000000..a7875203a6 --- /dev/null +++ b/tests/csv/3.0/recalibrated_germline.csv @@ -0,0 +1,2 @@ +patient,sex,status,sample,cram,crai +test,XX,0,sample1,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/cram/test.paired_end.recalibrated.sorted.cram,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/cram/test.paired_end.recalibrated.sorted.cram.crai diff --git a/tests/csv/3.0/recalibrated_somatic.csv b/tests/csv/3.0/recalibrated_somatic.csv new file mode 100644 index 0000000000..4a87b75e4e --- /dev/null +++ b/tests/csv/3.0/recalibrated_somatic.csv @@ -0,0 +1,3 @@ +patient,sex,status,sample,cram,crai +test3,XX,0,sample3,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/cram/test.paired_end.recalibrated.sorted.cram,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/cram/test.paired_end.recalibrated.sorted.cram.crai +test3,XX,1,sample4,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/cram/test2.paired_end.recalibrated.sorted.cram,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/cram/test2.paired_end.recalibrated.sorted.cram.crai diff --git a/tests/csv/3.0/recalibrated_somatic_joint.csv b/tests/csv/3.0/recalibrated_somatic_joint.csv new file mode 100644 index 0000000000..12eb61d860 --- /dev/null +++ b/tests/csv/3.0/recalibrated_somatic_joint.csv @@ -0,0 +1,4 @@ +patient,sex,status,sample,cram,crai +test,XX,0,sample1,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/cram/test.paired_end.recalibrated.sorted.cram,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/cram/test.paired_end.recalibrated.sorted.cram.crai 
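Alongside the pytest-workflow YAMLs, the changeset adds a first nf-test suite (`tests/main.nf.test`, below), tagged `pipeline_sarek` through the globs in `tests/tags.yml`. A sketch of invoking just that suite, assuming a stock nf-test installation (the profile choice is illustrative):

```bash
# Execute only the nf-test cases tagged 'pipeline_sarek'.
nf-test test --tag pipeline_sarek --profile docker
```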
+test,XX,1,sample2,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/cram/test2.paired_end.recalibrated.sorted.cram,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/cram/test2.paired_end.recalibrated.sorted.cram.crai +test,XX,1,sample3,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/cram/test3.paired_end.recalibrated.sorted.cram,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/cram/test3.paired_end.recalibrated.sorted.cram.crai diff --git a/tests/csv/3.0/recalibrated_tumoronly.csv b/tests/csv/3.0/recalibrated_tumoronly.csv new file mode 100644 index 0000000000..ae29aa64c5 --- /dev/null +++ b/tests/csv/3.0/recalibrated_tumoronly.csv @@ -0,0 +1,2 @@ +patient,sex,status,sample,cram,crai +test1,XX,1,sample2,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/cram/test2.paired_end.recalibrated.sorted.cram,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/cram/test2.paired_end.recalibrated.sorted.cram.crai diff --git a/tests/csv/3.0/recalibrated_tumoronly_joint.csv b/tests/csv/3.0/recalibrated_tumoronly_joint.csv new file mode 100644 index 0000000000..f3ded832ec --- /dev/null +++ b/tests/csv/3.0/recalibrated_tumoronly_joint.csv @@ -0,0 +1,3 @@ +patient,sex,status,sample,cram,crai +test,XX,1,sample2,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/cram/test2.paired_end.recalibrated.sorted.cram,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/cram/test2.paired_end.recalibrated.sorted.cram.crai +test,XX,1,sample3,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/cram/test3.paired_end.recalibrated.sorted.cram,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/cram/test3.paired_end.recalibrated.sorted.cram.crai diff --git a/tests/csv/3.0/sample_with_space.csv b/tests/csv/3.0/sample_with_space.csv new file mode 100644 index 0000000000..24e917ce44 --- /dev/null +++ b/tests/csv/3.0/sample_with_space.csv @@ -0,0 +1,3 @@ +patient,sex,status,sample,lane,fastq_1,fastq_2 +test,XX,0,test,test_L1,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/fastq/test_1.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/fastq/test_2.fastq.gz +test,XX,1,test 2,test_L1,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/fastq/test2_1.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/fastq/test2_2.fastq.gz diff --git a/tests/csv/3.0/vcf_single.csv b/tests/csv/3.0/vcf_single.csv new file mode 100644 index 0000000000..601e72f60f --- /dev/null +++ b/tests/csv/3.0/vcf_single.csv @@ -0,0 +1,2 @@ +patient,sample,vcf +test,test,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/sarscov2/illumina/vcf/test.vcf.gz diff --git a/tests/main.nf.test b/tests/main.nf.test new file mode 100644 index 0000000000..ad9209ab88 --- /dev/null +++ b/tests/main.nf.test @@ -0,0 +1,29 @@ +nextflow_pipeline { + + name "Test pipeline" + script "../main.nf" + tag "pipeline" + tag "pipeline_sarek" + + test("Run with profile 
test") { + + when { + params { + outdir = "results" + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + input = "$projectDir/tests/csv/3.0/fastq_pair.csv" + validationSchemaIgnoreParams = 'test_data_base,test_data,genomes' + use_gatk_spark = false + } + } + + then { + assertAll( + { assert workflow.success } + ) + } + } + +} diff --git a/tests/requirements.txt b/tests/requirements.txt new file mode 100644 index 0000000000..095a66001f --- /dev/null +++ b/tests/requirements.txt @@ -0,0 +1,3 @@ +pip +pytest-workflow +cryptography diff --git a/tests/tags.yml b/tests/tags.yml new file mode 100644 index 0000000000..0b52c379ca --- /dev/null +++ b/tests/tags.yml @@ -0,0 +1,5 @@ +pipeline_sarek: + - "**.nf" + - "**.config" + - "**.nf.test" + - "**.json" diff --git a/tests/test_aligner_bwamem.yml b/tests/test_aligner_bwamem.yml new file mode 100644 index 0000000000..ea51c0ccd5 --- /dev/null +++ b/tests/test_aligner_bwamem.yml @@ -0,0 +1,107 @@ +- name: Run bwamem + command: nextflow run main.nf -profile test_cache --aligner bwa-mem --save_reference --outdir results + tags: + - aligner + - bwamem + - preprocessing + files: + - path: results/csv/markduplicates.csv + md5sum: 0d6120bb99e92f6810343270711ca53e + - path: results/csv/markduplicates_no_table.csv + md5sum: 2a2d3d4842befd4def39156463859ee3 + - path: results/csv/recalibrated.csv + md5sum: 2d29d9e53894dcce96a1b5beb6ef3312 + - path: results/multiqc + - path: results/preprocessing/markduplicates/test/test.md.cram + # binary changes md5sums on reruns + - path: results/preprocessing/markduplicates/test/test.md.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recal_table/test/test.recal.table + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + # binary changes md5sums on reruns + - path: results/reference/bwa/genome.amb + md5sum: 1891c1de381b3a96d4e72f590fde20c1 + - path: results/reference/bwa/genome.ann + md5sum: 2df4aa2d7580639fa0fcdbcad5e2e969 + - path: results/reference/bwa/genome.bwt + md5sum: 815eded87e4cb6b0f1daab5c4d6e30af + - path: results/reference/bwa/genome.pac + md5sum: 8569fbdb2c98c6fb16dfa73d8eacb070 + - path: results/reference/bwa/genome.sa + md5sum: e7cff62b919448a3a3d0fe4aaf427594 + - path: results/reference/dbsnp/dbsnp_146.hg38.vcf.gz.tbi + # conda changes md5sums for test + - path: results/reference/dict/genome.dict + md5sum: 2433fe2ba31257337bf4c4bd4cb8da15 + - path: results/reference/fai/genome.fasta.fai + md5sum: 3520cd30e1b100e55f578db9c855f685 + - path: results/reference/intervals/chr22_1-40001.bed + md5sum: 87a15eb9c2ff20ccd5cd8735a28708f7 + - path: results/reference/intervals/chr22_1-40001.bed.gz + md5sum: d3341fa28986c40b24fcc10a079dbb80 + - path: results/reference/intervals/genome.bed + md5sum: a87dc7d20ebca626f65cc16ff6c97a3e + - path: results/reference/known_indels/mills_and_1000G.indels.vcf.gz.tbi + # conda changes md5sums for test + - path: results/reports/fastqc/test-test_L1 + - path: results/reports/markduplicates/test/test.md.cram.metrics + contains: ["test 17094 1534 168 1046782 12429 197 0 0.635998", "1.0 0.999991 1171"] + - path: results/reports/mosdepth/test/test.md.mosdepth.global.dist.txt + md5sum: b61e1acee11a6ddf7ce3232a5948a6a0 + - path: results/reports/mosdepth/test/test.md.mosdepth.region.dist.txt + md5sum: 1a382f98d488d2ae3df83a0d87caafc1 + - path: results/reports/mosdepth/test/test.md.mosdepth.summary.txt + 
md5sum: 839108358878ada89e1eaddf6e0541ba + - path: results/reports/mosdepth/test/test.md.regions.bed.gz + md5sum: 0aaee6da65050bedcd40b9fbf0622873 + - path: results/reports/mosdepth/test/test.md.regions.bed.gz.csi + md5sum: 544e02fcca548749a0af758d0a2df352 + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: b61e1acee11a6ddf7ce3232a5948a6a0 + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + md5sum: 1a382f98d488d2ae3df83a0d87caafc1 + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: 839108358878ada89e1eaddf6e0541ba + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz + md5sum: 0aaee6da65050bedcd40b9fbf0622873 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz.csi + md5sum: 544e02fcca548749a0af758d0a2df352 + - path: results/reports/samtools/test/test.md.cram.stats + # conda changes md5sums for test + - path: results/reports/samtools/test/test.recal.cram.stats + # conda changes md5sums for test +- name: Build only index with bwa + command: nextflow run main.nf -profile test_cache --build_only_index --input false --outdir results + tags: + - aligner + - build_only_index + - bwamem + files: + - path: results/multiqc + - path: results/reference/bwa/genome.amb + md5sum: 1891c1de381b3a96d4e72f590fde20c1 + - path: results/reference/bwa/genome.ann + md5sum: 2df4aa2d7580639fa0fcdbcad5e2e969 + - path: results/reference/bwa/genome.bwt + md5sum: 815eded87e4cb6b0f1daab5c4d6e30af + - path: results/reference/bwa/genome.pac + md5sum: 8569fbdb2c98c6fb16dfa73d8eacb070 + - path: results/reference/bwa/genome.sa + md5sum: e7cff62b919448a3a3d0fe4aaf427594 + - path: results/reference/dbsnp/dbsnp_146.hg38.vcf.gz.tbi + # conda changes md5sums for test + - path: results/reference/dict/genome.dict + md5sum: 2433fe2ba31257337bf4c4bd4cb8da15 + - path: results/reference/fai/genome.fasta.fai + md5sum: 3520cd30e1b100e55f578db9c855f685 + - path: results/reference/intervals/chr22_1-40001.bed + md5sum: 87a15eb9c2ff20ccd5cd8735a28708f7 + - path: results/reference/intervals/chr22_1-40001.bed.gz + md5sum: d3341fa28986c40b24fcc10a079dbb80 + - path: results/reference/intervals/genome.bed + md5sum: a87dc7d20ebca626f65cc16ff6c97a3e + - path: results/reference/known_indels/mills_and_1000G.indels.vcf.gz.tbi + # conda changes md5sums for test diff --git a/tests/test_aligner_bwamem2.yml b/tests/test_aligner_bwamem2.yml new file mode 100644 index 0000000000..17760a558b --- /dev/null +++ b/tests/test_aligner_bwamem2.yml @@ -0,0 +1,107 @@ +- name: Run bwamem2 + command: nextflow run main.nf -profile test_cache --aligner bwa-mem2 --save_reference --outdir results + tags: + - aligner + - bwamem2 + - preprocessing + files: + - path: results/csv/markduplicates.csv + md5sum: 0d6120bb99e92f6810343270711ca53e + - path: results/csv/markduplicates_no_table.csv + md5sum: 2a2d3d4842befd4def39156463859ee3 + - path: results/csv/recalibrated.csv + md5sum: 2d29d9e53894dcce96a1b5beb6ef3312 + - path: results/multiqc + - path: results/preprocessing/markduplicates/test/test.md.cram + # binary changes md5sums on reruns + - path: results/preprocessing/markduplicates/test/test.md.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recal_table/test/test.recal.table + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + # binary changes md5sums on reruns + - path: 
results/reference/bwamem2/genome.fasta.0123 + md5sum: d73300d44f733bcdb7c988fc3ff3e3e9 + - path: results/reference/bwamem2/genome.fasta.amb + md5sum: 1891c1de381b3a96d4e72f590fde20c1 + - path: results/reference/bwamem2/genome.fasta.ann + md5sum: 2df4aa2d7580639fa0fcdbcad5e2e969 + - path: results/reference/bwamem2/genome.fasta.bwt.2bit.64 + md5sum: cd4bdf496eab05228a50c45ee43c1ed0 + - path: results/reference/bwamem2/genome.fasta.pac + md5sum: 8569fbdb2c98c6fb16dfa73d8eacb070 + - path: results/reference/dbsnp/dbsnp_146.hg38.vcf.gz.tbi + # conda changes md5sums for test + - path: results/reference/dict/genome.dict + md5sum: 2433fe2ba31257337bf4c4bd4cb8da15 + - path: results/reference/fai/genome.fasta.fai + md5sum: 3520cd30e1b100e55f578db9c855f685 + - path: results/reference/intervals/chr22_1-40001.bed + md5sum: 87a15eb9c2ff20ccd5cd8735a28708f7 + - path: results/reference/intervals/chr22_1-40001.bed.gz + md5sum: d3341fa28986c40b24fcc10a079dbb80 + - path: results/reference/intervals/genome.bed + md5sum: a87dc7d20ebca626f65cc16ff6c97a3e + - path: results/reference/known_indels/mills_and_1000G.indels.vcf.gz.tbi + # conda changes md5sums for test + - path: results/reports/fastqc/test-test_L1 + - path: results/reports/markduplicates/test/test.md.cram.metrics + contains: ["test 17094 1534 168 1046782 12429 197 0 0.635998", "1.0 0.999991 1171"] + - path: results/reports/mosdepth/test/test.md.mosdepth.global.dist.txt + md5sum: b61e1acee11a6ddf7ce3232a5948a6a0 + - path: results/reports/mosdepth/test/test.md.mosdepth.region.dist.txt + md5sum: 1a382f98d488d2ae3df83a0d87caafc1 + - path: results/reports/mosdepth/test/test.md.mosdepth.summary.txt + md5sum: 839108358878ada89e1eaddf6e0541ba + - path: results/reports/mosdepth/test/test.md.regions.bed.gz + md5sum: 0aaee6da65050bedcd40b9fbf0622873 + - path: results/reports/mosdepth/test/test.md.regions.bed.gz.csi + md5sum: 544e02fcca548749a0af758d0a2df352 + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: b61e1acee11a6ddf7ce3232a5948a6a0 + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + md5sum: 1a382f98d488d2ae3df83a0d87caafc1 + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: 839108358878ada89e1eaddf6e0541ba + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz + md5sum: 0aaee6da65050bedcd40b9fbf0622873 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz.csi + md5sum: 544e02fcca548749a0af758d0a2df352 + - path: results/reports/samtools/test/test.md.cram.stats + # conda changes md5sums for test + - path: results/reports/samtools/test/test.recal.cram.stats + # conda changes md5sums for test +- name: Build only index with bwa-mem2 + command: nextflow run main.nf -profile test_cache --build_only_index --aligner bwa-mem2 --input false --outdir results + tags: + - aligner + - build_only_index + - bwamem2 + files: + - path: results/multiqc + - path: results/reference/bwamem2/genome.fasta.0123 + md5sum: d73300d44f733bcdb7c988fc3ff3e3e9 + - path: results/reference/bwamem2/genome.fasta.amb + md5sum: 1891c1de381b3a96d4e72f590fde20c1 + - path: results/reference/bwamem2/genome.fasta.ann + md5sum: 2df4aa2d7580639fa0fcdbcad5e2e969 + - path: results/reference/bwamem2/genome.fasta.bwt.2bit.64 + md5sum: cd4bdf496eab05228a50c45ee43c1ed0 + - path: results/reference/bwamem2/genome.fasta.pac + md5sum: 8569fbdb2c98c6fb16dfa73d8eacb070 + - path: results/reference/dbsnp/dbsnp_146.hg38.vcf.gz.tbi + # conda changes md5sums for test + - path: 
results/reference/dict/genome.dict + md5sum: 2433fe2ba31257337bf4c4bd4cb8da15 + - path: results/reference/fai/genome.fasta.fai + md5sum: 3520cd30e1b100e55f578db9c855f685 + - path: results/reference/intervals/chr22_1-40001.bed + md5sum: 87a15eb9c2ff20ccd5cd8735a28708f7 + - path: results/reference/intervals/chr22_1-40001.bed.gz + md5sum: d3341fa28986c40b24fcc10a079dbb80 + - path: results/reference/intervals/genome.bed + md5sum: a87dc7d20ebca626f65cc16ff6c97a3e + - path: results/reference/known_indels/mills_and_1000G.indels.vcf.gz.tbi + # conda changes md5sums for test diff --git a/tests/test_aligner_dragmap.yml b/tests/test_aligner_dragmap.yml new file mode 100644 index 0000000000..a4960d8e02 --- /dev/null +++ b/tests/test_aligner_dragmap.yml @@ -0,0 +1,133 @@ +- name: Run dragmap + command: nextflow run main.nf -profile test_cache --aligner dragmap --save_reference --outdir results + tags: + - aligner + - dragmap + - preprocessing + files: + - path: results/csv/markduplicates.csv + md5sum: 0d6120bb99e92f6810343270711ca53e + - path: results/csv/markduplicates_no_table.csv + md5sum: 2a2d3d4842befd4def39156463859ee3 + - path: results/csv/recalibrated.csv + md5sum: 2d29d9e53894dcce96a1b5beb6ef3312 + - path: results/multiqc + - path: results/preprocessing/markduplicates/test/test.md.cram + # binary changes md5sums on reruns + - path: results/preprocessing/markduplicates/test/test.md.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recal_table/test/test.recal.table + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + # binary changes md5sums on reruns + - path: results/reference/dbsnp/dbsnp_146.hg38.vcf.gz.tbi + # conda changes md5sums for test + - path: results/reference/dict/genome.dict + md5sum: 2433fe2ba31257337bf4c4bd4cb8da15 + - path: results/reference/dragmap/hash_table.cfg + contains: + [ + "reference_sequences = 1", + "reference_len = 368640", + "reference_len_raw = 40001", + "reference_len_not_n = 40001", + "reference_alt_seed = 204800", + ] + - path: results/reference/dragmap/hash_table.cfg.bin + # binary changes md5sums on reruns + - path: results/reference/dragmap/hash_table.cmp + md5sum: 1caab4ffc89f81ace615a2e813295cf4 + - path: results/reference/dragmap/hash_table_stats.txt + contains: ["A bases: 10934", "C bases: 8612", "G bases: 8608", "T bases: 11847"] + - path: results/reference/dragmap/ref_index.bin + md5sum: dbb5c7d26b974e0ac338024fe4535044 + - path: results/reference/dragmap/reference.bin + md5sum: be67b80ee48aa96b383fd72f1ccfefea + - path: results/reference/dragmap/repeat_mask.bin + md5sum: 294939f1f80aa7f4a70b9b537e4c0f21 + - path: results/reference/dragmap/str_table.bin + md5sum: 45f7818c4a10fdeed04db7a34b5f9ff1 + - path: results/reference/fai/genome.fasta.fai + md5sum: 3520cd30e1b100e55f578db9c855f685 + - path: results/reference/intervals/chr22_1-40001.bed + md5sum: 87a15eb9c2ff20ccd5cd8735a28708f7 + - path: results/reference/intervals/chr22_1-40001.bed.gz + md5sum: d3341fa28986c40b24fcc10a079dbb80 + - path: results/reference/intervals/genome.bed + md5sum: a87dc7d20ebca626f65cc16ff6c97a3e + - path: results/reference/known_indels/mills_and_1000G.indels.vcf.gz.tbi + # conda changes md5sums for test + - path: results/reports/fastqc/test-test_L1 + - path: results/reports/markduplicates/test/test.md.cram.metrics + contains: ["LB0 27214 1086 322 1037558 20017 100 0 0.687981"] + - path: 
results/reports/mosdepth/test/test.md.mosdepth.global.dist.txt + md5sum: b0d7d5de1e00132bb5b47e1b3d90d944 + - path: results/reports/mosdepth/test/test.md.mosdepth.region.dist.txt + md5sum: 02b88d7ec0ca8ff23ba688c35939fa05 + - path: results/reports/mosdepth/test/test.md.mosdepth.summary.txt + md5sum: 1792d98676f597f755749f4286c5102d + - path: results/reports/mosdepth/test/test.md.regions.bed.gz + md5sum: 590119568d091ce3a88f7fe4f43f24ff + - path: results/reports/mosdepth/test/test.md.regions.bed.gz.csi + md5sum: 2e7fe057ead0622e6a2a6b0ed1832315 + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: b0d7d5de1e00132bb5b47e1b3d90d944 + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + md5sum: 02b88d7ec0ca8ff23ba688c35939fa05 + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: 1792d98676f597f755749f4286c5102d + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz + md5sum: 590119568d091ce3a88f7fe4f43f24ff + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz.csi + md5sum: 2e7fe057ead0622e6a2a6b0ed1832315 + - path: results/reports/samtools/test/test.md.cram.stats + # conda changes md5sums for test + - path: results/reports/samtools/test/test.recal.cram.stats + # conda changes md5sums for test +- name: Build only index with dragmap + command: nextflow run main.nf -profile test_cache --build_only_index --aligner dragmap --input false --outdir results + tags: + - aligner + - build_only_index + - dragmap + files: + - path: results/multiqc + - path: results/reference/dragmap/hash_table.cfg + contains: + [ + "reference_sequences = 1", + "reference_len = 368640", + "reference_len_raw = 40001", + "reference_len_not_n = 40001", + "reference_alt_seed = 204800", + ] + - path: results/reference/dragmap/hash_table.cfg.bin + # binary changes md5sums on reruns + - path: results/reference/dragmap/hash_table.cmp + md5sum: 1caab4ffc89f81ace615a2e813295cf4 + - path: results/reference/dragmap/hash_table_stats.txt + contains: ["A bases: 10934", "C bases: 8612", "G bases: 8608", "T bases: 11847"] + - path: results/reference/dragmap/ref_index.bin + md5sum: dbb5c7d26b974e0ac338024fe4535044 + - path: results/reference/dragmap/reference.bin + md5sum: be67b80ee48aa96b383fd72f1ccfefea + - path: results/reference/dragmap/repeat_mask.bin + md5sum: 294939f1f80aa7f4a70b9b537e4c0f21 + - path: results/reference/dragmap/str_table.bin + md5sum: 45f7818c4a10fdeed04db7a34b5f9ff1 + - path: results/reference/dbsnp/dbsnp_146.hg38.vcf.gz.tbi + # conda changes md5sums for test + - path: results/reference/dict/genome.dict + md5sum: 2433fe2ba31257337bf4c4bd4cb8da15 + - path: results/reference/fai/genome.fasta.fai + md5sum: 3520cd30e1b100e55f578db9c855f685 + - path: results/reference/intervals/chr22_1-40001.bed + md5sum: 87a15eb9c2ff20ccd5cd8735a28708f7 + - path: results/reference/intervals/chr22_1-40001.bed.gz + md5sum: d3341fa28986c40b24fcc10a079dbb80 + - path: results/reference/intervals/genome.bed + md5sum: a87dc7d20ebca626f65cc16ff6c97a3e + - path: results/reference/known_indels/mills_and_1000G.indels.vcf.gz.tbi + # conda changes md5sums for test diff --git a/tests/test_alignment_to_fastq.yml b/tests/test_alignment_to_fastq.yml new file mode 100644 index 0000000000..6077b745d4 --- /dev/null +++ b/tests/test_alignment_to_fastq.yml @@ -0,0 +1,52 @@ +- name: Run alignment to fastq and then remap on bam files + command: nextflow run main.nf -profile test_cache,alignment_to_fastq --outdir results + tags: + - alignment_to_fastq + - 
input_bam + files: + - path: results/csv/markduplicates.csv + md5sum: 0d6120bb99e92f6810343270711ca53e + - path: results/csv/markduplicates_no_table.csv + md5sum: 2a2d3d4842befd4def39156463859ee3 + - path: results/csv/recalibrated.csv + md5sum: 2d29d9e53894dcce96a1b5beb6ef3312 + - path: results/multiqc + - path: results/preprocessing/markduplicates/test/test.md.cram + # binary changes md5sums on reruns + - path: results/preprocessing/markduplicates/test/test.md.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recal_table/test/test.recal.table + md5sum: 9c0517ffdc5d30a5c73b9f7df1ff3060 + - path: results/preprocessing/recalibrated/test/test.recal.cram + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + # binary changes md5sums on reruns + - path: results/reports/fastqc/test-1 + - path: results/reports/markduplicates/test/test.md.cram.metrics + contains: ["test 0 2820 2 2 0 828 0 0.293617 3807", "1.0 0.999986 1178 1178", "2.0 1.47674 800 800", "100.0 1.911145 0 0"] + - path: results/reports/mosdepth/test/test.md.mosdepth.global.dist.txt + md5sum: 9cb9b181119256ed17a77dcf44d58285 + - path: results/reports/mosdepth/test/test.md.mosdepth.region.dist.txt + md5sum: 75e1ce7e55af51f4985fa91654a5ea2d + - path: results/reports/mosdepth/test/test.md.mosdepth.summary.txt + md5sum: dbe376360e437c89190139ef0ae6769a + - path: results/reports/mosdepth/test/test.md.regions.bed.gz + md5sum: d9b53915d473710ff0260a0ff694fd32 + - path: results/reports/mosdepth/test/test.md.regions.bed.gz.csi + md5sum: d0713716f63ac573f4a3385733e9a537 + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: 9cb9b181119256ed17a77dcf44d58285 + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + md5sum: 75e1ce7e55af51f4985fa91654a5ea2d + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: dbe376360e437c89190139ef0ae6769a + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz + md5sum: d9b53915d473710ff0260a0ff694fd32 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz.csi + md5sum: d0713716f63ac573f4a3385733e9a537 + - path: results/reports/samtools/test/test.md.cram.stats + # conda changes md5sums for test + - path: results/reports/samtools/test/test.recal.cram.stats + # conda changes md5sums for test + - path: results/preprocessing/mapped/ + should_exist: false diff --git a/tests/test_annotation_bcfann.yml b/tests/test_annotation_bcfann.yml new file mode 100644 index 0000000000..5b9546b961 --- /dev/null +++ b/tests/test_annotation_bcfann.yml @@ -0,0 +1,10 @@ +- name: Run bcfann + command: nextflow run main.nf -profile test_cache,annotation --tools bcfann --outdir results + tags: + - annotation + - bcfann + files: + - path: results/annotation/test/test_BCF.ann.vcf.gz + # binary changes md5sums on reruns + - path: results/annotation/test/test_BCF.ann.vcf.gz.tbi + # binary changes md5sums on reruns diff --git a/tests/test_annotation_cache.yml b/tests/test_annotation_cache.yml new file mode 100644 index 0000000000..fd84fb47e4 --- /dev/null +++ b/tests/test_annotation_cache.yml @@ -0,0 +1,29 @@ +- name: Only download annotation cache + command: nextflow run main.nf -profile test_cache,annotation --tools merge --download_cache --input false --build_only_index --outdir results + tags: + - annotation + - cache + - vep + - snpeff + files: + - path: results/multiqc + - path: results/cache/snpeff_cache + - path: results/cache/vep_cache + - path: 
results/annotation + should_exist: false + +- name: Fail to locate VEP cache + command: nextflow run main.nf -profile test_cache,annotation --vep_cache s3://annotation-cache/vep_cache/ --vep_cache_version 1 --tools vep --input false --build_only_index --outdir results + tags: + - annotation + - cache + - vep + exit_code: 1 + +- name: Fail to locate snpEff cache + command: nextflow run main.nf -profile test_cache,annotation --snpeff_cache s3://annotation-cache/snpeff_cache/ --snpeff_genome na --tools snpeff --input false --build_only_index --outdir results + tags: + - annotation + - cache + - snpeff + exit_code: 1 diff --git a/tests/test_annotation_merge.yml b/tests/test_annotation_merge.yml new file mode 100644 index 0000000000..9915e502cd --- /dev/null +++ b/tests/test_annotation_merge.yml @@ -0,0 +1,58 @@ +- name: Run snpEff followed by VEP + command: nextflow run main.nf -profile test_cache,annotation --tools merge --outdir results --download_cache + tags: + - annotation + - merge + files: + - path: results/annotation/test/test_snpEff_VEP.ann.vcf.gz + # binary changes md5sums on reruns + - path: results/annotation/test/test_snpEff_VEP.ann.vcf.gz.tbi + md5sum: 4cb176febbc8c26d717a6c6e67b9c905 + - path: results/reports/EnsemblVEP/test/test_snpEff_VEP.ann.summary.html + contains: ["test_snpEff.ann.vcf.gzOutput filetest_snpEff_VEP.ann.vcf.gz"] + - path: results/multiqc + - path: results/annotation/test/test_snpEff.ann.vcf.gz + should_exist: false + - path: results/annotation/test/test_snpEff.ann.vcf.gz.tbi + should_exist: false + - path: results/annotation/test/test_VEP.ann.vcf.gz + should_exist: false + - path: results/annotation/test/test_VEP.ann.vcf.gz.tbi + should_exist: false + - path: results/reports/snpeff/test/snpEff_summary.html + should_exist: false + - path: results/reports/snpeff/test/test_snpEff.csv + should_exist: false + - path: results/reports/snpeff/test/test_snpEff.genes.txt + should_exist: false + - path: results/reports/EnsemblVEP/test/test_VEP.ann.summary.html + should_exist: false +- name: Run VEP and snpEff followed by VEP + command: nextflow run main.nf -profile test_cache,annotation --tools merge,snpeff,vep --outdir results --download_cache + tags: + - annotation + - merge + files: + - path: results/annotation/test/test_VEP.ann.vcf.gz + # binary changes md5sums on reruns + - path: results/annotation/test/test_VEP.ann.vcf.gz.tbi + md5sum: 4cb176febbc8c26d717a6c6e67b9c905 + - path: results/annotation/test/test_snpEff.ann.vcf.gz + md5sum: 01f24fdd76f73eefd695beea7b3d3d8e + - path: results/annotation/test/test_snpEff.ann.vcf.gz.tbi + md5sum: 51e418d9be9bb33f1d4123493b15b6c9 + - path: results/annotation/test/test_snpEff_VEP.ann.vcf.gz + # binary changes md5sums on reruns + - path: results/annotation/test/test_snpEff_VEP.ann.vcf.gz.tbi + md5sum: 4cb176febbc8c26d717a6c6e67b9c905 + - path: results/multiqc + - path: results/reports/EnsemblVEP/test/test_VEP.ann.summary.html + # text-based file changes md5sums on reruns + - path: results/reports/EnsemblVEP/test/test_snpEff_VEP.ann.summary.html + # text-based file changes md5sums on reruns + - path: results/reports/snpeff/test/snpEff_summary.html + # text-based file changes md5sums on reruns + - path: results/reports/snpeff/test/test_snpEff.csv + # text-based file changes md5sums on reruns + - path: results/reports/snpeff/test/test_snpEff.genes.txt + md5sum: 130536bf0237d7f3f746d32aaa32840a diff --git a/tests/test_annotation_snpeff.yml b/tests/test_annotation_snpeff.yml new file mode 100644 index 0000000000..a98b7bd34d --- 
/dev/null +++ b/tests/test_annotation_snpeff.yml @@ -0,0 +1,27 @@ +- name: Run snpEff + command: nextflow run main.nf -profile test_cache,annotation --tools snpeff --outdir results --download_cache + tags: + - annotation + - snpeff + files: + - path: results/annotation/test/test_snpEff.ann.vcf.gz + md5sum: 01f24fdd76f73eefd695beea7b3d3d8e + - path: results/annotation/test/test_snpEff.ann.vcf.gz.tbi + md5sum: 51e418d9be9bb33f1d4123493b15b6c9 + - path: results/multiqc + - path: results/reports/snpeff/test/snpEff_summary.html + contains: [" Genome total length ", " 100,286,402 ", " MT192765.1 "] + - path: results/reports/snpeff/test/test_snpEff.csv + contains: + [ + "Values , 50,100", + "Count , 1,8", + "Reference , 0", + "Het , 1", + "Hom , 8", + "Missing , 0", + "MT192765.1, Position,0,1", + "MT192765.1,Count,0,0", + ] + - path: results/reports/snpeff/test/test_snpEff.genes.txt + md5sum: 130536bf0237d7f3f746d32aaa32840a diff --git a/tests/test_annotation_vep.yml b/tests/test_annotation_vep.yml new file mode 100644 index 0000000000..34baab8b99 --- /dev/null +++ b/tests/test_annotation_vep.yml @@ -0,0 +1,27 @@ +- name: Run VEP + command: nextflow run main.nf -profile test_cache,annotation --tools vep --outdir results --download_cache + tags: + - annotation + - vep + files: + - path: results/annotation/test/test_VEP.ann.vcf.gz + # binary changes md5sums on reruns + - path: results/annotation/test/test_VEP.ann.vcf.gz.tbi + md5sum: 4cb176febbc8c26d717a6c6e67b9c905 + - path: results/multiqc + - path: results/reports/EnsemblVEP/test/test_VEP.ann.summary.html + contains: ["test.vcf.gzOutput filetest_VEP.ann.vcf.gz"] +- name: Run VEP with fasta + command: nextflow run main.nf -profile test_cache,annotation --tools vep --vep_include_fasta --outdir results --download_cache + tags: + - annotation + - vep + files: + - path: results/annotation/test/test_VEP.ann.vcf.gz + # binary changes md5sums on reruns + - path: results/annotation/test/test_VEP.ann.vcf.gz.tbi + md5sum: 4cb176febbc8c26d717a6c6e67b9c905 + - path: results/multiqc + - path: results/reports/EnsemblVEP/test/test_VEP.ann.summary.html + # text-based file changes md5sums on reruns + contains: ["test.vcf.gzOutput filetest_VEP.ann.vcf.gz"] diff --git a/tests/test_cnvkit.yml b/tests/test_cnvkit.yml new file mode 100644 index 0000000000..8e9017c70b --- /dev/null +++ b/tests/test_cnvkit.yml @@ -0,0 +1,228 @@ +- name: Run variant calling on somatic samples with cnvkit + command: nextflow run main.nf -profile test_cache,tools_somatic --tools cnvkit --outdir results + tags: + - cnvkit + - copy_number_calling + - somatic + - variant_calling + files: + - path: results/multiqc + - path: results/variant_calling/cnvkit/sample3/multi_intervals.antitarget.bed + md5sum: 3d4d20f9f23b39970865d29ef239d20b + - path: results/variant_calling/cnvkit/sample3/multi_intervals.target.bed + md5sum: 86d30493bb2e619a93f4ebc2923d29f3 + - path: results/variant_calling/cnvkit/sample3/reference.cnn + md5sum: a09ee4be5dda1cf0f68073bdb3aad8ec + - path: results/variant_calling/cnvkit/sample3/test.paired_end.recalibrated.sorted-diagram.pdf + # binary changes md5sums on reruns + - path: results/variant_calling/cnvkit/sample3/test.paired_end.recalibrated.sorted-scatter.png + # conda changes md5sums for test + - path: results/variant_calling/cnvkit/sample3/test.paired_end.recalibrated.sorted.antitargetcoverage.cnn + md5sum: fe1248aa91fad7769303bb4c031d55ca + - path: results/variant_calling/cnvkit/sample3/test.paired_end.recalibrated.sorted.bintest.cns + md5sum: 
d8c43bead209cdf4c480284c64542693 + - path: results/variant_calling/cnvkit/sample3/test.paired_end.recalibrated.sorted.call.cns + md5sum: 9968e02ef2f11ed22e2789c053f7159c + - path: results/variant_calling/cnvkit/sample3/test.paired_end.recalibrated.sorted.cnr + md5sum: 3f0fe46574d0f7137f779e7ac1c2362d + - path: results/variant_calling/cnvkit/sample3/test.paired_end.recalibrated.sorted.cns + md5sum: 0b2b81d391965488b8634a2b802b69cd + - path: results/variant_calling/cnvkit/sample3/test.paired_end.recalibrated.sorted.targetcoverage.cnn + md5sum: 79aae7e9c135fb8c65f8fbda12610faf + - path: results/variant_calling/cnvkit/sample4_vs_sample3/multi_intervals.antitarget.bed + md5sum: 3d4d20f9f23b39970865d29ef239d20b + - path: results/variant_calling/cnvkit/sample4_vs_sample3/multi_intervals.target.bed + md5sum: 86d30493bb2e619a93f4ebc2923d29f3 + - path: results/variant_calling/cnvkit/sample4_vs_sample3/reference.cnn + md5sum: 59ec306bb820684b1f6f277d67cb2d92 + - path: results/variant_calling/cnvkit/sample4_vs_sample3/test.paired_end.recalibrated.sorted.antitargetcoverage.cnn + md5sum: fe1248aa91fad7769303bb4c031d55ca + - path: results/variant_calling/cnvkit/sample4_vs_sample3/test.paired_end.recalibrated.sorted.targetcoverage.cnn + md5sum: 79aae7e9c135fb8c65f8fbda12610faf + - path: results/variant_calling/cnvkit/sample4_vs_sample3/test2.paired_end.recalibrated.sorted-diagram.pdf + # binary changes md5sums on reruns + - path: results/variant_calling/cnvkit/sample4_vs_sample3/test2.paired_end.recalibrated.sorted-scatter.png + # conda changes md5sums for test + - path: results/variant_calling/cnvkit/sample4_vs_sample3/test2.paired_end.recalibrated.sorted.antitargetcoverage.cnn + md5sum: 067115082c4af4b64d58c0dc3a3642e4 + - path: results/variant_calling/cnvkit/sample4_vs_sample3/test2.paired_end.recalibrated.sorted.bintest.cns + md5sum: 3950153843b43230c5d0fdf832740b5d + - path: results/variant_calling/cnvkit/sample4_vs_sample3/test2.paired_end.recalibrated.sorted.call.cns + md5sum: ac3d073de1db84fa19077d8eda01b616 + - path: results/variant_calling/cnvkit/sample4_vs_sample3/test2.paired_end.recalibrated.sorted.cnr + md5sum: d7d437ae406ca2f00a6362277fe334ba + - path: results/variant_calling/cnvkit/sample4_vs_sample3/test2.paired_end.recalibrated.sorted.cns + md5sum: d15bf5061d90c8edbcba04d2bd2a270c + - path: results/variant_calling/cnvkit/sample4_vs_sample3/test2.paired_end.recalibrated.sorted.targetcoverage.cnn + md5sum: b4a49faf170e436ec32dcc21ccc3ce8f + - path: results/cnvkit + should_exist: false + - path: results/reports/mosdepth/sample3/sample3.recal.mosdepth.global.dist.txt + md5sum: 69e29702ef01fd8f6c7a5468fc35a16a + - path: results/reports/mosdepth/sample3/sample3.recal.mosdepth.region.dist.txt + md5sum: 6ec49cd7d510c2eb3d9d90fdb79b783a + - path: results/reports/mosdepth/sample3/sample3.recal.mosdepth.summary.txt + md5sum: 103098d0bf76ed82d2b87d5f242b099a + - path: results/reports/mosdepth/sample3/sample3.recal.regions.bed.gz + md5sum: b5888cf7395c57d39879a5faa6159eb3 + - path: results/reports/mosdepth/sample3/sample3.recal.regions.bed.gz.csi + md5sum: 9cb0ad7039a3b703d16ca7d5b835c0ee + - path: results/reports/samtools/sample3/sample3.recal.cram.stats + # conda changes md5sums for test + - path: results/reports/mosdepth/sample4/sample4.recal.mosdepth.global.dist.txt + md5sum: f2dcd00a64947c49e8e4b93c2f4fbf27 + - path: results/reports/mosdepth/sample4/sample4.recal.mosdepth.region.dist.txt + md5sum: 39005ffaac22871ffaaf19656fe69c5b + - path: 
results/reports/mosdepth/sample4/sample4.recal.mosdepth.summary.txt + md5sum: 68d4b98f17361fddf73052ead34fa370 + - path: results/reports/mosdepth/sample4/sample4.recal.regions.bed.gz + md5sum: 2819e995eafded35f53328c4ec19ba58 + - path: results/reports/mosdepth/sample4/sample4.recal.regions.bed.gz.csi + md5sum: 393c2749068304d8545b501b9d4658e4 + - path: results/reports/samtools/sample4/sample4.recal.cram.stats + # conda changes md5sums for test +- name: Run variant calling on tumor_only sample with cnvkit + command: nextflow run main.nf -profile test_cache,tools_tumoronly --tools cnvkit --outdir results + tags: + - cnvkit + - copy_number_calling + - tumor_only + - variant_calling + files: + - path: results/multiqc + - path: results/variant_calling/cnvkit/sample2/cnvkit.reference.antitarget-tmp.bed + md5sum: 3d4d20f9f23b39970865d29ef239d20b + - path: results/variant_calling/cnvkit/sample2/cnvkit.reference.target-tmp.bed + md5sum: 657b25dbda8516624efa8cb2cf3716ca + - path: results/variant_calling/cnvkit/sample2/test2.paired_end.recalibrated.sorted-diagram.pdf + # binary changes md5sums on reruns + - path: results/variant_calling/cnvkit/sample2/test2.paired_end.recalibrated.sorted-scatter.png + # conda changes md5sums for test + - path: results/variant_calling/cnvkit/sample2/test2.paired_end.recalibrated.sorted.antitargetcoverage.cnn + md5sum: 067115082c4af4b64d58c0dc3a3642e4 + - path: results/variant_calling/cnvkit/sample2/test2.paired_end.recalibrated.sorted.bintest.cns + md5sum: 7a66b5f63acb05e6dfb0784c215851ec + - path: results/variant_calling/cnvkit/sample2/test2.paired_end.recalibrated.sorted.call.cns + md5sum: f7caeca04aba28b125ce26b511f42afb + - path: results/variant_calling/cnvkit/sample2/test2.paired_end.recalibrated.sorted.cnr + md5sum: d9bdb71ce807051369577ee7f807a40c + - path: results/variant_calling/cnvkit/sample2/test2.paired_end.recalibrated.sorted.cns + md5sum: 2b56aac606ba6183d018b30ca58afcec + - path: results/variant_calling/cnvkit/sample2/test2.paired_end.recalibrated.sorted.targetcoverage.cnn + md5sum: e6d0190c1c37ce6e41f76ca5b24ccca3 + - path: results/cnvkit + should_exist: false + - path: results/reports/mosdepth/sample2/sample2.recal.mosdepth.global.dist.txt + md5sum: f2dcd00a64947c49e8e4b93c2f4fbf27 + - path: results/reports/mosdepth/sample2/sample2.recal.mosdepth.region.dist.txt + md5sum: 39005ffaac22871ffaaf19656fe69c5b + - path: results/reports/mosdepth/sample2/sample2.recal.mosdepth.summary.txt + md5sum: 68d4b98f17361fddf73052ead34fa370 + - path: results/reports/mosdepth/sample2/sample2.recal.regions.bed.gz + md5sum: 2819e995eafded35f53328c4ec19ba58 + - path: results/reports/mosdepth/sample2/sample2.recal.regions.bed.gz.csi + md5sum: 393c2749068304d8545b501b9d4658e4 + - path: results/reports/samtools/sample2/sample2.recal.cram.stats +- name: Run variant calling on germline sample with cnvkit + command: nextflow run main.nf -profile test_cache,tools_germline --tools cnvkit --outdir results + tags: + - cnvkit + - copy_number_calling + - germline + - variant_calling + files: + - path: results/multiqc + - path: results/variant_calling/cnvkit/sample1/multi_intervals.antitarget.bed + md5sum: 3d4d20f9f23b39970865d29ef239d20b + - path: results/variant_calling/cnvkit/sample1/multi_intervals.target.bed + md5sum: 86d30493bb2e619a93f4ebc2923d29f3 + - path: results/variant_calling/cnvkit/sample1/reference.cnn + md5sum: a09ee4be5dda1cf0f68073bdb3aad8ec + - path: results/variant_calling/cnvkit/sample1/test.paired_end.recalibrated.sorted-diagram.pdf + # binary changes md5sums on 
reruns + - path: results/variant_calling/cnvkit/sample1/test.paired_end.recalibrated.sorted-scatter.png + # conda changes md5sums for test + - path: results/variant_calling/cnvkit/sample1/test.paired_end.recalibrated.sorted.antitargetcoverage.cnn + md5sum: fe1248aa91fad7769303bb4c031d55ca + - path: results/variant_calling/cnvkit/sample1/test.paired_end.recalibrated.sorted.bintest.cns + md5sum: d8c43bead209cdf4c480284c64542693 + - path: results/variant_calling/cnvkit/sample1/test.paired_end.recalibrated.sorted.call.cns + md5sum: 9968e02ef2f11ed22e2789c053f7159c + - path: results/variant_calling/cnvkit/sample1/test.paired_end.recalibrated.sorted.cnr + md5sum: 3f0fe46574d0f7137f779e7ac1c2362d + - path: results/variant_calling/cnvkit/sample1/test.paired_end.recalibrated.sorted.cns + md5sum: 0b2b81d391965488b8634a2b802b69cd + - path: results/variant_calling/cnvkit/sample1/test.paired_end.recalibrated.sorted.targetcoverage.cnn + md5sum: 79aae7e9c135fb8c65f8fbda12610faf + - path: results/cnvkit + should_exist: false + - path: results/reports/mosdepth/sample1/sample1.recal.mosdepth.global.dist.txt + md5sum: 69e29702ef01fd8f6c7a5468fc35a16a + - path: results/reports/mosdepth/sample1/sample1.recal.mosdepth.region.dist.txt + md5sum: 6ec49cd7d510c2eb3d9d90fdb79b783a + - path: results/reports/mosdepth/sample1/sample1.recal.mosdepth.summary.txt + md5sum: 103098d0bf76ed82d2b87d5f242b099a + - path: results/reports/mosdepth/sample1/sample1.recal.regions.bed.gz + md5sum: b5888cf7395c57d39879a5faa6159eb3 + - path: results/reports/mosdepth/sample1/sample1.recal.regions.bed.gz.csi + md5sum: 9cb0ad7039a3b703d16ca7d5b835c0ee + - path: results/reports/samtools/sample1/sample1.recal.cram.stats +- name: Run variant calling on somatic samples with cnvkit and skip variant calling on matched normal + command: nextflow run main.nf -profile test_cache,tools_somatic --tools cnvkit --only_paired_variant_calling --outdir results + tags: + - cnvkit + - somatic + - only_paired_variant_calling + - copy_number_calling + files: + - path: results/multiqc + - path: results/variant_calling/cnvkit/sample4_vs_sample3/multi_intervals.antitarget.bed + md5sum: 3d4d20f9f23b39970865d29ef239d20b + - path: results/variant_calling/cnvkit/sample4_vs_sample3/multi_intervals.target.bed + md5sum: 86d30493bb2e619a93f4ebc2923d29f3 + - path: results/variant_calling/cnvkit/sample4_vs_sample3/reference.cnn + md5sum: 59ec306bb820684b1f6f277d67cb2d92 + - path: results/variant_calling/cnvkit/sample4_vs_sample3/test.paired_end.recalibrated.sorted.antitargetcoverage.cnn + md5sum: fe1248aa91fad7769303bb4c031d55ca + - path: results/variant_calling/cnvkit/sample4_vs_sample3/test.paired_end.recalibrated.sorted.targetcoverage.cnn + md5sum: 79aae7e9c135fb8c65f8fbda12610faf + - path: results/variant_calling/cnvkit/sample4_vs_sample3/test2.paired_end.recalibrated.sorted-diagram.pdf + # binary changes md5sums on reruns + - path: results/variant_calling/cnvkit/sample4_vs_sample3/test2.paired_end.recalibrated.sorted-scatter.png + # conda changes md5sums for test + - path: results/variant_calling/cnvkit/sample4_vs_sample3/test2.paired_end.recalibrated.sorted.antitargetcoverage.cnn + md5sum: 067115082c4af4b64d58c0dc3a3642e4 + - path: results/variant_calling/cnvkit/sample4_vs_sample3/test2.paired_end.recalibrated.sorted.bintest.cns + md5sum: 3950153843b43230c5d0fdf832740b5d + - path: results/variant_calling/cnvkit/sample4_vs_sample3/test2.paired_end.recalibrated.sorted.call.cns + md5sum: ac3d073de1db84fa19077d8eda01b616 + - path: 
results/variant_calling/cnvkit/sample4_vs_sample3/test2.paired_end.recalibrated.sorted.cnr + md5sum: d7d437ae406ca2f00a6362277fe334ba + - path: results/variant_calling/cnvkit/sample4_vs_sample3/test2.paired_end.recalibrated.sorted.cns + md5sum: d15bf5061d90c8edbcba04d2bd2a270c + - path: results/variant_calling/cnvkit/sample4_vs_sample3/test2.paired_end.recalibrated.sorted.targetcoverage.cnn + md5sum: b4a49faf170e436ec32dcc21ccc3ce8f + - path: results/reports/mosdepth/sample3/sample3.recal.mosdepth.global.dist.txt + md5sum: 69e29702ef01fd8f6c7a5468fc35a16a + - path: results/reports/mosdepth/sample3/sample3.recal.mosdepth.region.dist.txt + md5sum: 6ec49cd7d510c2eb3d9d90fdb79b783a + - path: results/reports/mosdepth/sample3/sample3.recal.mosdepth.summary.txt + md5sum: 103098d0bf76ed82d2b87d5f242b099a + - path: results/reports/mosdepth/sample3/sample3.recal.regions.bed.gz + md5sum: b5888cf7395c57d39879a5faa6159eb3 + - path: results/reports/mosdepth/sample3/sample3.recal.regions.bed.gz.csi + md5sum: 9cb0ad7039a3b703d16ca7d5b835c0ee + - path: results/reports/samtools/sample3/sample3.recal.cram.stats + # conda changes md5sums for test + - path: results/reports/mosdepth/sample4/sample4.recal.mosdepth.global.dist.txt + md5sum: f2dcd00a64947c49e8e4b93c2f4fbf27 + - path: results/reports/mosdepth/sample4/sample4.recal.mosdepth.region.dist.txt + md5sum: 39005ffaac22871ffaaf19656fe69c5b + - path: results/reports/mosdepth/sample4/sample4.recal.mosdepth.summary.txt + md5sum: 68d4b98f17361fddf73052ead34fa370 + - path: results/reports/mosdepth/sample4/sample4.recal.regions.bed.gz + md5sum: 2819e995eafded35f53328c4ec19ba58 + - path: results/reports/mosdepth/sample4/sample4.recal.regions.bed.gz.csi + md5sum: 393c2749068304d8545b501b9d4658e4 + - path: results/reports/samtools/sample4/sample4.recal.cram.stats + # conda changes md5sums for test diff --git a/tests/test_concat_germline_vcfs.yml b/tests/test_concat_germline_vcfs.yml new file mode 100644 index 0000000000..8a32231ac8 --- /dev/null +++ b/tests/test_concat_germline_vcfs.yml @@ -0,0 +1,22 @@ +- name: Run all germline variant callers and check for existence of concatenated vcf-files + command: nextflow run main.nf -profile test_cache --input ./tests/csv/3.0/mapped_joint_bam.csv --concatenate_vcfs --tools deepvariant,freebayes,haplotypecaller,manta,mpileup,strelka,tiddit --step variant_calling --outdir results + tags: + - concatenate_vcfs + files: + - path: results/variant_calling/concat/testN/testN.germline.vcf.gz + # binary changes md5sums on reruns + contains: + [ + "SOURCE=testN.deepvariant.vcf.gz", + "AB=0.167832;ABP=277.102;AC=1;AF=0.5;AN=2;AO=48;CIGAR=1X;DP=286;DPB=286;DPRA=0;EPP=3.0103;EPPR=3.0103;GTI=0;LEN=1;MEANALT=1;MQM=60;MQMR=60;NS=1;NUMALT=1;ODDS=105.855;PAIRED=1;PAIREDR=1;PAO=0;PQA=0;PQR=0;PRO=0;QA=2017;QR=9863;RO=238;RPL=0;RPP=107.241;RPPR=519.821;RPR=48;RUN=1;SAF=24;SAP=3.0103;SAR=24;SRF=119;SRP=3.0103;SRR=119;TYPE=snp;technology.illumina=1;", + "SOURCE=testN.freebayes.vcf.gz", + "SNVHPOL=7;MQ=60;", + "SOURCE=testN.strelka.variants.vcf.gz", + "SOURCE=testN.bcftools.vcf.gz", + ] + - path: results/variant_calling/concat/testT/testT.germline.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/concat/testN/testN.germline.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/concat/testT/testT.germline.vcf.gz.tbi + # binary changes md5sums on reruns diff --git a/tests/test_controlfreec.yml b/tests/test_controlfreec.yml new file mode 100644 index 0000000000..dad942b50e --- /dev/null +++ 
b/tests/test_controlfreec.yml @@ -0,0 +1,205 @@ +- name: Run variant calling on somatic samples with controlfreec + command: nextflow run main.nf -profile test_cache,tools_somatic --tools controlfreec --outdir results + tags: + - controlfreec + - somatic + - variant_calling + - copy_number_calling + files: + - path: results/multiqc + - path: results/untar/chr_dir/chr21.fasta + md5sum: 69bd44ef67566a76d6cbb8aa4a25ae35 + - path: results/variant_calling/controlfreec/sample4_vs_sample3/config.txt + contains: + [ + "BedGraphOutput = TRUE", + "minExpectedGC = 0", + "maxThreads = 2", + "noisyData = TRUE", + "readCountThreshold = 1", + "sex = XX", + "window = 10", + ] + - path: results/variant_calling/controlfreec/sample4_vs_sample3/sample4_vs_sample3.bed + md5sum: 833920178e4f40a296d8eab029caf086 + - path: results/variant_calling/controlfreec/sample4_vs_sample3/sample4_vs_sample3.circos.txt + md5sum: 92ce5ce97b27a7214dfa9c2cb20cf854 + - path: results/variant_calling/controlfreec/sample4_vs_sample3/sample4_vs_sample3.normal.mpileup.gz_control.cpn + md5sum: 508a003da85b186d9a60d867ef7cdf15 + - path: results/variant_calling/controlfreec/sample4_vs_sample3/sample4_vs_sample3.p.value.txt + # binary changes md5sums on reruns + - path: results/variant_calling/controlfreec/sample4_vs_sample3/sample4_vs_sample3.tumor.mpileup.gz_BAF.txt + # binary changes md5sums on reruns + - path: results/variant_calling/controlfreec/sample4_vs_sample3/sample4_vs_sample3.tumor.mpileup.gz_CNVs + # binary changes md5sums on reruns + - path: results/variant_calling/controlfreec/sample4_vs_sample3/sample4_vs_sample3.tumor.mpileup.gz_info.txt + md5sum: 271271719c576d9218bdc859850e54ee + - path: results/variant_calling/controlfreec/sample4_vs_sample3/sample4_vs_sample3.tumor.mpileup.gz_ratio.BedGraph + md5sum: 5d3321af93678f16878d59e01d3a87d3 + - path: results/variant_calling/controlfreec/sample4_vs_sample3/sample4_vs_sample3.tumor.mpileup.gz_ratio.txt + # binary changes md5sums on reruns + - path: results/variant_calling/controlfreec/sample4_vs_sample3/sample4_vs_sample3.tumor.mpileup.gz_sample.cpn + md5sum: befe1706c61464635a76c7323a6bd2a2 + - path: results/variant_calling/controlfreec/sample4_vs_sample3/sample4_vs_sample3_BAF.png + # binary changes md5sums on reruns + - path: results/variant_calling/controlfreec/sample4_vs_sample3/sample4_vs_sample3_ratio.log2.png + # binary changes md5sums on reruns + - path: results/variant_calling/controlfreec/sample4_vs_sample3/sample4_vs_sample3_ratio.png + # binary changes md5sums on reruns + - path: results/variant_calling/mpileup/sample4_vs_sample3/sample4_vs_sample3.normal.mpileup.gz + should_exist: false + - path: results/variant_calling/mpileup/sample4_vs_sample3/sample4_vs_sample3.tumor.mpileup.gz + should_exist: false + - path: results/cnvkit + should_exist: false + - path: results/reports/mosdepth/sample3/sample3.recal.mosdepth.global.dist.txt + md5sum: 69e29702ef01fd8f6c7a5468fc35a16a + - path: results/reports/mosdepth/sample3/sample3.recal.mosdepth.region.dist.txt + md5sum: 6ec49cd7d510c2eb3d9d90fdb79b783a + - path: results/reports/mosdepth/sample3/sample3.recal.mosdepth.summary.txt + md5sum: 103098d0bf76ed82d2b87d5f242b099a + - path: results/reports/mosdepth/sample3/sample3.recal.regions.bed.gz + md5sum: b5888cf7395c57d39879a5faa6159eb3 + - path: results/reports/mosdepth/sample3/sample3.recal.regions.bed.gz.csi + md5sum: 9cb0ad7039a3b703d16ca7d5b835c0ee + - path: results/reports/samtools/sample3/sample3.recal.cram.stats + # conda changes md5sums for test + - path: 
results/reports/mosdepth/sample4/sample4.recal.mosdepth.global.dist.txt
+      md5sum: f2dcd00a64947c49e8e4b93c2f4fbf27
+    - path: results/reports/mosdepth/sample4/sample4.recal.mosdepth.region.dist.txt
+      md5sum: 39005ffaac22871ffaaf19656fe69c5b
+    - path: results/reports/mosdepth/sample4/sample4.recal.mosdepth.summary.txt
+      md5sum: 68d4b98f17361fddf73052ead34fa370
+    - path: results/reports/mosdepth/sample4/sample4.recal.regions.bed.gz
+      md5sum: 2819e995eafded35f53328c4ec19ba58
+    - path: results/reports/mosdepth/sample4/sample4.recal.regions.bed.gz.csi
+      md5sum: 393c2749068304d8545b501b9d4658e4
+    - path: results/reports/samtools/sample4/sample4.recal.cram.stats
+      # conda changes md5sums for test
+- name: Run variant calling on somatic samples with controlfreec without intervals
+  command: nextflow run main.nf -profile test_cache,tools_somatic --tools controlfreec --no_intervals -stub-run --outdir results
+  tags:
+    - controlfreec
+    - no_intervals
+    - somatic
+    - variant_calling
+    - copy_number_calling
+  files:
+    - path: results/multiqc
+    - path: results/no_intervals.bed
+      md5sum: f3dac01ea66b95fe477446fde2d31489
+    - path: results/no_intervals.bed.gz
+      md5sum: f3dac01ea66b95fe477446fde2d31489
+    - path: results/no_intervals.bed.gz.tbi
+      md5sum: f3dac01ea66b95fe477446fde2d31489
+    - path: results/untar/chr_dir
+    - path: results/variant_calling/controlfreec/sample4_vs_sample3/GC_profile.sample4_vs_sample3.cpn
+      md5sum: d41d8cd98f00b204e9800998ecf8427e # This is the md5sum of an empty file. Are all these files supposed to be empty?
+    - path: results/variant_calling/controlfreec/sample4_vs_sample3/config.txt
+      md5sum: d41d8cd98f00b204e9800998ecf8427e
+    - path: results/variant_calling/controlfreec/sample4_vs_sample3/sample4_vs_sample3.bed
+      md5sum: d41d8cd98f00b204e9800998ecf8427e
+    - path: results/variant_calling/controlfreec/sample4_vs_sample3/sample4_vs_sample3.circos.txt
+      md5sum: d41d8cd98f00b204e9800998ecf8427e
+    - path: results/variant_calling/controlfreec/sample4_vs_sample3/sample4_vs_sample3.p.value.txt
+      md5sum: d41d8cd98f00b204e9800998ecf8427e
+    - path: results/variant_calling/controlfreec/sample4_vs_sample3/sample4_vs_sample3_BAF.png
+      md5sum: d41d8cd98f00b204e9800998ecf8427e
+    - path: results/variant_calling/controlfreec/sample4_vs_sample3/sample4_vs_sample3_BAF.txt
+      md5sum: d41d8cd98f00b204e9800998ecf8427e
+    - path: results/variant_calling/controlfreec/sample4_vs_sample3/sample4_vs_sample3_CNVs
+      md5sum: d41d8cd98f00b204e9800998ecf8427e
+    - path: results/variant_calling/controlfreec/sample4_vs_sample3/sample4_vs_sample3_info.txt
+      md5sum: d41d8cd98f00b204e9800998ecf8427e
+    - path: results/variant_calling/controlfreec/sample4_vs_sample3/sample4_vs_sample3_ratio.BedGraph
+      md5sum: d41d8cd98f00b204e9800998ecf8427e
+    - path: results/variant_calling/controlfreec/sample4_vs_sample3/sample4_vs_sample3_ratio.log2.png
+      md5sum: d41d8cd98f00b204e9800998ecf8427e
+    - path: results/variant_calling/controlfreec/sample4_vs_sample3/sample4_vs_sample3_ratio.png
+      md5sum: d41d8cd98f00b204e9800998ecf8427e
+    - path: results/variant_calling/controlfreec/sample4_vs_sample3/sample4_vs_sample3_ratio.txt
+      md5sum: d41d8cd98f00b204e9800998ecf8427e
+    - path: results/variant_calling/controlfreec/sample4_vs_sample3/sample4_vs_sample3_sample.cpn
+      md5sum: d41d8cd98f00b204e9800998ecf8427e
+    - path: results/variant_calling/mpileup/sample4_vs_sample3/sample4_vs_sample3.normal.mpileup.gz
+      should_exist: false
+    - path: results/variant_calling/mpileup/sample4_vs_sample3/sample4_vs_sample3.tumor.mpileup.gz
+      should_exist: false
+    - path: results/controlfreec
+      should_exist: false
+    - path: results/mpileup
+      should_exist: false
+    - path: results/reports/mosdepth/sample3/sample3.recal.global.dist.txt
+      md5sum: d41d8cd98f00b204e9800998ecf8427e
+    - path: results/reports/mosdepth/sample3/sample3.recal.summary.txt
+      md5sum: d41d8cd98f00b204e9800998ecf8427e
+    - path: results/reports/mosdepth/sample3/sample3.recal.per-base.bed.gz
+      md5sum: d41d8cd98f00b204e9800998ecf8427e
+    - path: results/reports/mosdepth/sample3/sample3.recal.per-base.bed.gz.csi
+      md5sum: d41d8cd98f00b204e9800998ecf8427e
+    - path: results/reports/samtools/sample3/sample3.recal.cram.stats
+      # conda changes md5sums for test
+    - path: results/reports/mosdepth/sample4/sample4.recal.global.dist.txt
+      md5sum: d41d8cd98f00b204e9800998ecf8427e
+    - path: results/reports/mosdepth/sample4/sample4.recal.summary.txt
+      md5sum: d41d8cd98f00b204e9800998ecf8427e
+    - path: results/reports/mosdepth/sample4/sample4.recal.per-base.bed.gz
+      md5sum: d41d8cd98f00b204e9800998ecf8427e
+    - path: results/reports/mosdepth/sample4/sample4.recal.per-base.bed.gz.csi
+      md5sum: d41d8cd98f00b204e9800998ecf8427e
+    - path: results/reports/samtools/sample4/sample4.recal.cram.stats
+      # conda changes md5sums for test
+- name: Run variant calling on tumor_only sample with controlfreec
+  command: nextflow run main.nf -profile test_cache,tools_tumoronly --tools controlfreec -stub-run --outdir results
+  tags:
+    - controlfreec
+    - tumor_only
+    - variant_calling
+    - copy_number_calling
+  files:
+    - path: results/multiqc
+    - path: results/variant_calling/controlfreec/sample2/GC_profile.sample2.cpn
+      md5sum: d41d8cd98f00b204e9800998ecf8427e # This is the md5sum of an empty file. Are all these files supposed to be empty?
+    - path: results/variant_calling/controlfreec/sample2/config.txt
+      md5sum: d41d8cd98f00b204e9800998ecf8427e
+    - path: results/variant_calling/controlfreec/sample2/sample2.bed
+      md5sum: d41d8cd98f00b204e9800998ecf8427e
+    - path: results/variant_calling/controlfreec/sample2/sample2.circos.txt
+      md5sum: d41d8cd98f00b204e9800998ecf8427e
+    - path: results/variant_calling/controlfreec/sample2/sample2.p.value.txt
+      md5sum: d41d8cd98f00b204e9800998ecf8427e
+    - path: results/variant_calling/controlfreec/sample2/sample2_BAF.png
+      md5sum: d41d8cd98f00b204e9800998ecf8427e
+    - path: results/variant_calling/controlfreec/sample2/sample2_BAF.txt
+      md5sum: d41d8cd98f00b204e9800998ecf8427e
+    - path: results/variant_calling/controlfreec/sample2/sample2_CNVs
+      md5sum: d41d8cd98f00b204e9800998ecf8427e
+    - path: results/variant_calling/controlfreec/sample2/sample2_info.txt
+      md5sum: d41d8cd98f00b204e9800998ecf8427e
+    - path: results/variant_calling/controlfreec/sample2/sample2_ratio.BedGraph
+      md5sum: d41d8cd98f00b204e9800998ecf8427e
+    - path: results/variant_calling/controlfreec/sample2/sample2_ratio.log2.png
+      md5sum: d41d8cd98f00b204e9800998ecf8427e
+    - path: results/variant_calling/controlfreec/sample2/sample2_ratio.png
+      md5sum: d41d8cd98f00b204e9800998ecf8427e
+    - path: results/variant_calling/controlfreec/sample2/sample2_ratio.txt
+      md5sum: d41d8cd98f00b204e9800998ecf8427e
+    - path: results/variant_calling/controlfreec/sample2/sample2_sample.cpn
+      md5sum: d41d8cd98f00b204e9800998ecf8427e
+    - path: results/variant_calling/mpileup/sample2/sample2.tumor.mpileup.gz
+      should_exist: false
+    - path: results/controlfreec
+      should_exist: false
+    - path: results/mpileup
+      should_exist: false
+    - path: results/reports/mosdepth/sample2/sample2.recal.global.dist.txt
+      md5sum:
d41d8cd98f00b204e9800998ecf8427e + - path: results/reports/mosdepth/sample2/sample2.recal.region.dist.txt + md5sum: d41d8cd98f00b204e9800998ecf8427e + - path: results/reports/mosdepth/sample2/sample2.recal.summary.txt + md5sum: d41d8cd98f00b204e9800998ecf8427e + - path: results/reports/mosdepth/sample2/sample2.recal.regions.bed.gz + md5sum: d41d8cd98f00b204e9800998ecf8427e + - path: results/reports/mosdepth/sample2/sample2.recal.regions.bed.gz.csi + md5sum: d41d8cd98f00b204e9800998ecf8427e + - path: results/reports/samtools/sample2/sample2.recal.cram.stats diff --git a/tests/test_deepvariant.yml b/tests/test_deepvariant.yml new file mode 100644 index 0000000000..b91ad01fa2 --- /dev/null +++ b/tests/test_deepvariant.yml @@ -0,0 +1,79 @@ +- name: Run variant calling on germline sample with deepvariant + command: nextflow run main.nf -profile test_cache,tools_germline --tools deepvariant --outdir results + tags: + - deepvariant + - germline + - variant_calling + files: + - path: results/multiqc + - path: results/reports/bcftools/deepvariant/sample1/sample1.deepvariant.bcftools_stats.txt + md5sum: 1e8eefa704c0d2b77ee6e8a2cf2ef428 + - path: results/reports/vcftools/deepvariant/sample1/sample1.deepvariant.FILTER.summary + md5sum: acce7a163f4070226429f9d6bc3fbd2c + - path: results/reports/vcftools/deepvariant/sample1/sample1.deepvariant.TsTv.count + md5sum: de1632b8413f4c14c78acdc2df5c5224 + - path: results/reports/vcftools/deepvariant/sample1/sample1.deepvariant.TsTv.qual + md5sum: a9c05f0ecb0bb71123e345589bd7089c + - path: results/variant_calling/deepvariant/sample1/sample1.deepvariant.g.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/deepvariant/sample1/sample1.deepvariant.g.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/deepvariant/sample1/sample1.deepvariant.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/deepvariant/sample1/sample1.deepvariant.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/deepvariant + should_exist: false + - path: results/reports/mosdepth/sample1/sample1.recal.mosdepth.global.dist.txt + md5sum: 69e29702ef01fd8f6c7a5468fc35a16a + - path: results/reports/mosdepth/sample1/sample1.recal.mosdepth.region.dist.txt + md5sum: 6ec49cd7d510c2eb3d9d90fdb79b783a + - path: results/reports/mosdepth/sample1/sample1.recal.mosdepth.summary.txt + md5sum: 103098d0bf76ed82d2b87d5f242b099a + - path: results/reports/mosdepth/sample1/sample1.recal.regions.bed.gz + md5sum: b5888cf7395c57d39879a5faa6159eb3 + - path: results/reports/mosdepth/sample1/sample1.recal.regions.bed.gz.csi + md5sum: 9cb0ad7039a3b703d16ca7d5b835c0ee + - path: results/reports/samtools/sample1/sample1.recal.cram.stats +- name: Run variant calling on germline sample with deepvariant without intervals + command: nextflow run main.nf -profile test_cache,tools_germline --tools deepvariant --no_intervals --outdir results + tags: + - deepvariant + - germline + - no_intervals + - variant_calling + files: + - path: results/multiqc + - path: results/no_intervals.bed + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz.tbi + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/reports/bcftools/deepvariant/sample1/sample1.deepvariant.bcftools_stats.txt + md5sum: 33551d79cef60dd9a7f56c20db6f30a4 + - path: results/reports/vcftools/deepvariant/sample1/sample1.deepvariant.FILTER.summary + md5sum: 
7b17bd18c2d4bf129561c7c6a419a889 + - path: results/reports/vcftools/deepvariant/sample1/sample1.deepvariant.TsTv.count + md5sum: e570b07835a793bbab4f517cabed5a45 + - path: results/reports/vcftools/deepvariant/sample1/sample1.deepvariant.TsTv.qual + md5sum: 03f64b8092fc212bcb746b08f9e676a5 + - path: results/variant_calling/deepvariant/sample1/sample1.deepvariant.g.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/deepvariant/sample1/sample1.deepvariant.g.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/deepvariant/sample1/sample1.deepvariant.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/deepvariant/sample1/sample1.deepvariant.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/deepvariant + should_exist: false + - path: results/reports/mosdepth/sample1/sample1.recal.mosdepth.global.dist.txt + md5sum: 69e29702ef01fd8f6c7a5468fc35a16a + - path: results/reports/mosdepth/sample1/sample1.recal.mosdepth.summary.txt + md5sum: d2775eb102acc5950f7f53883dcb503d + - path: results/reports/mosdepth/sample1/sample1.recal.per-base.bed.gz + md5sum: 54431f155c9538809e8caf99d1a75ec7 + - path: results/reports/mosdepth/sample1/sample1.recal.per-base.bed.gz.csi + md5sum: c67dcd711b096eb42f43784d5eadbc0d + - path: results/reports/samtools/sample1/sample1.recal.cram.stats diff --git a/tests/test_default.yml b/tests/test_default.yml new file mode 100644 index 0000000000..5368cfd9b0 --- /dev/null +++ b/tests/test_default.yml @@ -0,0 +1,72 @@ +- name: Run default pipeline + command: nextflow run main.nf -profile test --outdir results + tags: + - default + - preprocessing + - variant_calling + files: + - path: results/csv/markduplicates.csv + md5sum: 0d6120bb99e92f6810343270711ca53e + - path: results/csv/markduplicates_no_table.csv + md5sum: 2a2d3d4842befd4def39156463859ee3 + - path: results/csv/recalibrated.csv + md5sum: 2d29d9e53894dcce96a1b5beb6ef3312 + - path: results/csv/variantcalled.csv + md5sum: 4d0effd3d8dc2b814230a189e7ca9dba + - path: results/multiqc + - path: results/preprocessing/markduplicates/test/test.md.cram + # binary changes md5sums on reruns + - path: results/preprocessing/markduplicates/test/test.md.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recal_table/test/test.recal.table + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + # binary changes md5sums on reruns + - path: results/reports/bcftools/strelka/test/test.strelka.variants.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/fastqc/test-test_L1 + - path: results/reports/markduplicates/test/test.md.cram.metrics + contains: ["test 17094 1534 168 1046782 12429 197 0 0.635998", "1.0 0.999991 1171"] + - path: results/reports/mosdepth/test/test.md.mosdepth.global.dist.txt + md5sum: b61e1acee11a6ddf7ce3232a5948a6a0 + - path: results/reports/mosdepth/test/test.md.mosdepth.region.dist.txt + md5sum: 1a382f98d488d2ae3df83a0d87caafc1 + - path: results/reports/mosdepth/test/test.md.mosdepth.summary.txt + md5sum: 839108358878ada89e1eaddf6e0541ba + - path: results/reports/mosdepth/test/test.md.regions.bed.gz + md5sum: 0aaee6da65050bedcd40b9fbf0622873 + - path: results/reports/mosdepth/test/test.md.regions.bed.gz.csi + md5sum: 544e02fcca548749a0af758d0a2df352 + - path: 
results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: b61e1acee11a6ddf7ce3232a5948a6a0 + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + md5sum: 1a382f98d488d2ae3df83a0d87caafc1 + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: 839108358878ada89e1eaddf6e0541ba + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz + md5sum: 0aaee6da65050bedcd40b9fbf0622873 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz.csi + md5sum: 544e02fcca548749a0af758d0a2df352 + - path: results/reports/samtools/test/test.md.cram.stats + # conda changes md5sums for test + - path: results/reports/samtools/test/test.recal.cram.stats + # conda changes md5sums for test + - path: results/reports/vcftools/strelka/test/test.strelka.variants.FILTER.summary + md5sum: ad417bc96d31223f61170987975d8128 + - path: results/reports/vcftools/strelka/test/test.strelka.variants.TsTv.count + md5sum: fa27f678965b7cba6a92efcd039f802a + - path: results/reports/vcftools/strelka/test/test.strelka.variants.TsTv.qual + md5sum: bc68ae4e688e9fb772b457069e604883 + - path: results/variant_calling/strelka/test/test.strelka.genome.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/test/test.strelka.genome.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/test/test.strelka.variants.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/test/test.strelka.variants.vcf.gz.tbi + - path: results/strelka + should_exist: false + - path: results/preprocessing/mapped/ + should_exist: false diff --git a/tests/test_fastp.yml b/tests/test_fastp.yml new file mode 100644 index 0000000000..7ea4dc4bd7 --- /dev/null +++ b/tests/test_fastp.yml @@ -0,0 +1,115 @@ +- name: Run trimming pipeline + command: nextflow run main.nf -profile test_cache,trimming --save_trimmed --outdir results + tags: + - fastp + - preprocessing + - trimming + files: + - path: results/csv/markduplicates.csv + md5sum: 0d6120bb99e92f6810343270711ca53e + - path: results/csv/markduplicates_no_table.csv + md5sum: 2a2d3d4842befd4def39156463859ee3 + - path: results/csv/recalibrated.csv + md5sum: 2d29d9e53894dcce96a1b5beb6ef3312 + - path: results/multiqc + - path: results/preprocessing/fastp/test/test-test_L1_1.fastp.fastq.gz + # conda changes md5sums for test + - path: results/preprocessing/fastp/test/test-test_L1_2.fastp.fastq.gz + # conda changes md5sums for test + - path: results/preprocessing/markduplicates/test/test.md.cram + # binary changes md5sums on reruns + - path: results/preprocessing/markduplicates/test/test.md.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recal_table/test/test.recal.table + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + # binary changes md5sums on reruns + - path: results/reports/fastp/test + - path: results/reports/fastqc/test-test_L1 + - path: results/reports/markduplicates/test/test.md.cram.metrics + contains: ["test 16608 1860 160 1046616 12117 256 0 0.621261"] + - path: results/reports/mosdepth/test/test.md.mosdepth.global.dist.txt + md5sum: 3098d33090a0f90f6fc16d497d2ce644 + - path: results/reports/mosdepth/test/test.md.mosdepth.region.dist.txt + md5sum: 057a02943301a5acec55d19a5a629f11 + - path: results/reports/mosdepth/test/test.md.mosdepth.summary.txt + 
md5sum: fef22f026f7b4a89ab60c715689c5591 + - path: results/reports/mosdepth/test/test.md.regions.bed.gz + md5sum: cca0f725bea04688b39f2ea8ad2e1605 + - path: results/reports/mosdepth/test/test.md.regions.bed.gz.csi + md5sum: a5ad8f917979f62eacfff1461529dbaa + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: 3098d33090a0f90f6fc16d497d2ce644 + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + md5sum: 057a02943301a5acec55d19a5a629f11 + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: fef22f026f7b4a89ab60c715689c5591 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz + md5sum: cca0f725bea04688b39f2ea8ad2e1605 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz.csi + md5sum: a5ad8f917979f62eacfff1461529dbaa + - path: results/reports/samtools/test/test.md.cram.stats + # conda changes md5sums for test + - path: results/reports/samtools/test/test.recal.cram.stats + # conda changes md5sums for test +- name: Run split fastq module + command: nextflow run main.nf -profile test_cache,split_fastq --outdir results + tags: + - fastp + - preprocessing + - split_fastq + files: + - path: results/csv/markduplicates.csv + md5sum: 0d6120bb99e92f6810343270711ca53e + - path: results/csv/markduplicates_no_table.csv + md5sum: 2a2d3d4842befd4def39156463859ee3 + - path: results/csv/recalibrated.csv + md5sum: 2d29d9e53894dcce96a1b5beb6ef3312 + - path: results/preprocessing/fastp/test/0001.test-test_L1_1.fastp.fastq.gz + # conda changes md5sums for test + - path: results/preprocessing/fastp/test/0001.test-test_L1_2.fastp.fastq.gz + # conda changes md5sums for test + - path: results/preprocessing/fastp/test/0002.test-test_L1_1.fastp.fastq.gz + # conda changes md5sums for test + - path: results/preprocessing/fastp/test/0002.test-test_L1_2.fastp.fastq.gz + # conda changes md5sums for test + - path: results/preprocessing/markduplicates/test/test.md.cram + # binary changes md5sums on reruns + - path: results/preprocessing/markduplicates/test/test.md.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recal_table/test/test.recal.table + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + # binary changes md5sums on reruns + - path: results/reports/fastp/test + - path: results/reports/fastqc/test-test_L1 + - path: results/reports/markduplicates/test/test.md.cram.metrics + contains: ["test 17482 890 170 1047682 12552 69 0 0.65881"] + - path: results/reports/mosdepth/test/test.md.mosdepth.global.dist.txt + md5sum: 3626e543b91aa564f0056747827366d3 + - path: results/reports/mosdepth/test/test.md.mosdepth.region.dist.txt + md5sum: e3442f1098899a22748d07ef436925f6 + - path: results/reports/mosdepth/test/test.md.mosdepth.summary.txt + md5sum: 2873e7f9c9aede39942731894a6077d1 + - path: results/reports/mosdepth/test/test.md.regions.bed.gz + md5sum: 948108074663677f9225fd0574658ca1 + - path: results/reports/mosdepth/test/test.md.regions.bed.gz.csi + md5sum: d5f1c9389ecf52ba839e834780a94549 + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: 3626e543b91aa564f0056747827366d3 + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + md5sum: e3442f1098899a22748d07ef436925f6 + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: 
2873e7f9c9aede39942731894a6077d1 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz + md5sum: 948108074663677f9225fd0574658ca1 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz.csi + md5sum: d5f1c9389ecf52ba839e834780a94549 + - path: results/reports/samtools/test/test.md.cram.stats + # conda changes md5sums for test + - path: results/reports/samtools/test/test.recal.cram.stats + # conda changes md5sums for test diff --git a/tests/test_freebayes.yml b/tests/test_freebayes.yml new file mode 100644 index 0000000000..bd76fa1411 --- /dev/null +++ b/tests/test_freebayes.yml @@ -0,0 +1,439 @@ +- name: Run variant calling on germline sample with freebayes + command: nextflow run main.nf -profile test_cache,targeted --tools freebayes --outdir results + tags: + - freebayes + - germline + - variant_calling + files: + - path: results/csv/markduplicates.csv + md5sum: 0d6120bb99e92f6810343270711ca53e + - path: results/csv/markduplicates_no_table.csv + md5sum: 2a2d3d4842befd4def39156463859ee3 + - path: results/csv/recalibrated.csv + md5sum: 2d29d9e53894dcce96a1b5beb6ef3312 + - path: results/csv/variantcalled.csv + md5sum: 0cc6a67fedb2ef9ce97e463d310f9f30 + - path: results/multiqc + - path: results/preprocessing/markduplicates/test/test.md.cram + # binary changes md5sums on reruns + - path: results/preprocessing/markduplicates/test/test.md.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recal_table/test/test.recal.table + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + # binary changes md5sums on reruns + - path: results/reports/bcftools/freebayes/test/test.freebayes.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/fastqc/test-test_L1 + - path: results/reports/markduplicates/test/test.md.cram.metrics + contains: ["test 17094 1534 168 1046782 12429 197 0 0.635998", "1.0 0.999991 1171"] + - path: results/reports/mosdepth/test/test.md.mosdepth.global.dist.txt + md5sum: e4ce28ba1c331dc08bc53a0189908f77 + - path: results/reports/mosdepth/test/test.md.mosdepth.region.dist.txt + md5sum: 04d9f20dc5306990eec982a3c5a7d107 + - path: results/reports/mosdepth/test/test.md.mosdepth.summary.txt + md5sum: 70b4bbe29bd5e7c4ea39b6caf3316096 + - path: results/reports/mosdepth/test/test.md.per-base.bed.gz + md5sum: 2a0c38fb19d6a1f81ca2018e59e7bfcf + - path: results/reports/mosdepth/test/test.md.per-base.bed.gz.csi + md5sum: 0714f8c677277168b9f95d3a43ea5237 + - path: results/reports/mosdepth/test/test.md.regions.bed.gz + md5sum: eb0bc92c253326a109e73af98c9a7d4b + - path: results/reports/mosdepth/test/test.md.regions.bed.gz.csi + md5sum: c6d1ac97ef4dfe43731c8368d8391cab + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: 9c1d90e0fed14b710098b7724b602aea + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + md5sum: 04d9f20dc5306990eec982a3c5a7d107 + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: 17dfb78b147488eb8fd450294de4a35e + - path: results/reports/mosdepth/test/test.recal.per-base.bed.gz + md5sum: fb6804911f9d437d0251869fe112a528 + - path: results/reports/mosdepth/test/test.recal.per-base.bed.gz.csi + md5sum: 033032cdbb3a2b74dd41dac89628112c + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz + md5sum: eb0bc92c253326a109e73af98c9a7d4b + - path: 
results/reports/mosdepth/test/test.recal.regions.bed.gz.csi + md5sum: c6d1ac97ef4dfe43731c8368d8391cab + - path: results/reports/samtools/test/test.md.cram.stats + # conda changes md5sums for test + - path: results/reports/samtools/test/test.recal.cram.stats + # conda changes md5sums for test + - path: results/reports/vcftools/freebayes/test/test.freebayes.FILTER.summary + md5sum: 75824ce08910acce7e9f6adb4d635850 + - path: results/reports/vcftools/freebayes/test/test.freebayes.TsTv.count + md5sum: 3c198f7ec7fe2f5d365218ba0ff64197 + - path: results/reports/vcftools/freebayes/test/test.freebayes.TsTv.qual + - path: results/variant_calling/freebayes/test/test.freebayes.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/freebayes/test/test.freebayes.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/freebayes + should_exist: false +- name: Run variant calling on germline sample with freebayes without intervals + command: nextflow run main.nf -profile test_cache --tools freebayes --no_intervals --outdir results + tags: + - freebayes + - germline + - no_intervals + - variant_calling + files: + - path: results/csv/markduplicates.csv + md5sum: 0d6120bb99e92f6810343270711ca53e + - path: results/csv/markduplicates_no_table.csv + md5sum: 2a2d3d4842befd4def39156463859ee3 + - path: results/csv/recalibrated.csv + md5sum: 2d29d9e53894dcce96a1b5beb6ef3312 + - path: results/csv/variantcalled.csv + md5sum: 0cc6a67fedb2ef9ce97e463d310f9f30 + - path: results/multiqc + - path: results/no_intervals.bed + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz.tbi + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/preprocessing/markduplicates/test/test.md.cram + # binary changes md5sums on reruns + - path: results/preprocessing/markduplicates/test/test.md.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recal_table/test/test.recal.table + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + # binary changes md5sums on reruns + - path: results/reports/bcftools/freebayes/test/test.freebayes.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/fastqc/test-test_L1 + - path: results/reports/markduplicates/test/test.md.cram.metrics + contains: ["test 17094 1534 168 1046782 12429 197 0 0.635998", "1.0 0.999991 1171"] + - path: results/reports/mosdepth/test/test.md.mosdepth.global.dist.txt + md5sum: b61e1acee11a6ddf7ce3232a5948a6a0 + - path: results/reports/mosdepth/test/test.md.mosdepth.region.dist.txt + md5sum: 1a382f98d488d2ae3df83a0d87caafc1 + - path: results/reports/mosdepth/test/test.md.mosdepth.summary.txt + md5sum: 839108358878ada89e1eaddf6e0541ba + - path: results/reports/mosdepth/test/test.md.regions.bed.gz + md5sum: 0aaee6da65050bedcd40b9fbf0622873 + - path: results/reports/mosdepth/test/test.md.regions.bed.gz.csi + md5sum: 544e02fcca548749a0af758d0a2df352 + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: b61e1acee11a6ddf7ce3232a5948a6a0 + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + md5sum: 1a382f98d488d2ae3df83a0d87caafc1 + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: 839108358878ada89e1eaddf6e0541ba + - path: 
results/reports/mosdepth/test/test.recal.regions.bed.gz + md5sum: 0aaee6da65050bedcd40b9fbf0622873 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz.csi + md5sum: 544e02fcca548749a0af758d0a2df352 + - path: results/reports/samtools/test/test.md.cram.stats + # conda changes md5sums for test + - path: results/reports/samtools/test/test.recal.cram.stats + # conda changes md5sums for test + - path: results/reports/vcftools/freebayes/test/test.freebayes.FILTER.summary + md5sum: 562eaa4512cd4b57e6cfca7b44957d1c + - path: results/reports/vcftools/freebayes/test/test.freebayes.TsTv.count + md5sum: 4e6935b1e1906e57be1b54c0dffe7169 + - path: results/reports/vcftools/freebayes/test/test.freebayes.TsTv.qual + - path: results/variant_calling/freebayes/test/test.freebayes.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/freebayes/test/test.freebayes.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/freebayes + should_exist: false +- name: Run variant calling on somatic sample with freebayes + command: nextflow run main.nf -profile test_cache,pair,targeted --tools freebayes --outdir results + tags: + - freebayes + - somatic + - variant_calling + files: + - path: results/csv/markduplicates.csv + md5sum: e8e587ac25253ff7ab8f1cc66d410c98 + - path: results/csv/markduplicates_no_table.csv + md5sum: 617574c9b607e5daaf4ad56d48982247 + - path: results/csv/recalibrated.csv + md5sum: 008dff17e2a0d96ef9c1cae12fcab6ab + - path: results/csv/variantcalled.csv + md5sum: b31f56256a1cfa839a2ea7f7ba6c1c45 + - path: results/multiqc + - path: results/preprocessing/markduplicates/test/test.md.cram + # binary changes md5sums on reruns + - path: results/preprocessing/markduplicates/test/test.md.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/markduplicates/test2/test2.md.cram + # binary changes md5sums on reruns + - path: results/preprocessing/markduplicates/test2/test2.md.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recal_table/test/test.recal.table + md5sum: c990e4f1b7dbd5a3a623882a54ae2bf2 + - path: results/preprocessing/recal_table/test2/test2.recal.table + md5sum: 00d6877d68d622d81e4d633c4e340e7e + - path: results/preprocessing/recalibrated/test/test.recal.cram + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test2/test2.recal.cram + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test2/test2.recal.cram.crai + # binary changes md5sums on reruns + - path: results/reports/bcftools/freebayes/test/test.freebayes.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/bcftools/freebayes/test2_vs_test/test2_vs_test.freebayes.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/fastqc/test-test_L1 + - path: results/reports/markduplicates/test/test.md.cram.metrics + contains: ["test 8547 767 84 523391 3882 0 0 0.385081", "1.0 767 767"] + - path: results/reports/markduplicates/test2/test2.md.cram.metrics + contains: ["test2 10103 880 35 523579 4837 2 0 0.408076 193306", "1.0 1 876 876", "100.0 80.515303 0 0"] + - path: results/reports/mosdepth/test/test.md.mosdepth.global.dist.txt + md5sum: 5a0679057c530e5945c9c5a3a17312dc + - path: results/reports/mosdepth/test/test.md.mosdepth.region.dist.txt + md5sum: 835fdc6fa52cc33e6fb76c0c20a8a6c3 + - path: 
results/reports/mosdepth/test/test.md.mosdepth.summary.txt + md5sum: dcc9ab2bf3248903e02d8da87e678977 + - path: results/reports/mosdepth/test/test.md.per-base.bed.gz + md5sum: 5724f1c6b6a0e63e25ec8a0f38edfda6 + - path: results/reports/mosdepth/test/test.md.per-base.bed.gz.csi + md5sum: b0ab630c3241fbd7581b7a38d944ff8b + - path: results/reports/mosdepth/test/test.md.regions.bed.gz + md5sum: 91e0d531f1bab64711ecefe52bfc8255 + - path: results/reports/mosdepth/test/test.md.regions.bed.gz.csi + md5sum: c6d1ac97ef4dfe43731c8368d8391cab + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: 0b3162def977123809598639f7698121 + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + md5sum: 835fdc6fa52cc33e6fb76c0c20a8a6c3 + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: a8455eb2947de529abfa62b303986e0f + - path: results/reports/mosdepth/test/test.recal.per-base.bed.gz + md5sum: d9fa560ff78ae106cfee9db2c90801b5 + - path: results/reports/mosdepth/test/test.recal.per-base.bed.gz.csi + md5sum: 4816eeb9af254ca40177b08cf11b98d2 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz + md5sum: 91e0d531f1bab64711ecefe52bfc8255 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz.csi + md5sum: c6d1ac97ef4dfe43731c8368d8391cab + - path: results/reports/mosdepth/test2/test2.md.mosdepth.global.dist.txt + md5sum: f25166c3a0051bb4d8c11a210278de6c + - path: results/reports/mosdepth/test2/test2.md.mosdepth.region.dist.txt + md5sum: 3211135329e4077bd9bf0ba488e14371 + - path: results/reports/mosdepth/test2/test2.md.mosdepth.summary.txt + md5sum: ce0eb6d33c6d0dc720fbc6d1811abef8 + - path: results/reports/mosdepth/test2/test2.md.per-base.bed.gz + md5sum: 55c160e8f3c8c7761524646426611f6b + - path: results/reports/mosdepth/test2/test2.md.per-base.bed.gz.csi + md5sum: 4205a09ede17cdbdaad45e3553f73105 + - path: results/reports/mosdepth/test2/test2.md.regions.bed.gz + md5sum: 1dd426a45f967a9f37dcddcaea29a582 + - path: results/reports/mosdepth/test2/test2.md.regions.bed.gz.csi + md5sum: c6d1ac97ef4dfe43731c8368d8391cab + - path: results/reports/mosdepth/test2/test2.recal.mosdepth.global.dist.txt + md5sum: a1ef7e662ce993da4668e804952014ce + - path: results/reports/mosdepth/test2/test2.recal.mosdepth.region.dist.txt + md5sum: 3211135329e4077bd9bf0ba488e14371 + - path: results/reports/mosdepth/test2/test2.recal.mosdepth.summary.txt + md5sum: 70ad653c0c98baeeaf5085f1209a7bdb + - path: results/reports/mosdepth/test2/test2.recal.per-base.bed.gz + md5sum: 250a9f15a7d3f102435fa98adccf48a3 + - path: results/reports/mosdepth/test2/test2.recal.per-base.bed.gz.csi + md5sum: 8072f447199c60f24b01eede8b557333 + - path: results/reports/mosdepth/test2/test2.recal.regions.bed.gz + md5sum: 1dd426a45f967a9f37dcddcaea29a582 + - path: results/reports/mosdepth/test2/test2.recal.regions.bed.gz.csi + md5sum: c6d1ac97ef4dfe43731c8368d8391cab + - path: results/reports/samtools/test/test.md.cram.stats + # conda changes md5sums for test + - path: results/reports/samtools/test/test.recal.cram.stats + # conda changes md5sums for test + - path: results/reports/samtools/test2/test2.md.cram.stats + # conda changes md5sums for test + - path: results/reports/samtools/test2/test2.recal.cram.stats + # conda changes md5sums for test + - path: results/reports/vcftools/freebayes/test/test.freebayes.FILTER.summary + md5sum: 43d53e36cbb1091f915b2499e545b41e + - path: results/reports/vcftools/freebayes/test/test.freebayes.TsTv.count + md5sum: 
650f3dc78c5aaaecfe8ffa3d499e812f + - path: results/reports/vcftools/freebayes/test/test.freebayes.TsTv.qual + - path: results/reports/vcftools/freebayes/test2_vs_test/test2_vs_test.freebayes.FILTER.summary + md5sum: 84039d55edf0981d6b9b81252aff6741 + - path: results/reports/vcftools/freebayes/test2_vs_test/test2_vs_test.freebayes.TsTv.count + md5sum: 6c6038d43eb7fa766909b495979d120e + - path: results/reports/vcftools/freebayes/test2_vs_test/test2_vs_test.freebayes.TsTv.qual + # the text-based file test2_vs_test.freebayes.TsTv.qual seemingly changes content on reruns! + - path: results/variant_calling/freebayes/test/test.freebayes.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/freebayes/test/test.freebayes.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/freebayes/test2_vs_test/test2_vs_test.freebayes.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/freebayes/test2_vs_test/test2_vs_test.freebayes.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/freebayes + should_exist: false +- name: Run variant calling on somatic sample with freebayes without intervals + command: nextflow run main.nf -profile test_cache,pair,targeted --tools freebayes --no_intervals --outdir results + tags: + - freebayes + - somatic + - no_intervals + - variant_calling + files: + - path: results/csv/markduplicates.csv + md5sum: e8e587ac25253ff7ab8f1cc66d410c98 + - path: results/csv/markduplicates_no_table.csv + md5sum: 617574c9b607e5daaf4ad56d48982247 + - path: results/csv/recalibrated.csv + md5sum: 008dff17e2a0d96ef9c1cae12fcab6ab + - path: results/csv/variantcalled.csv + md5sum: b31f56256a1cfa839a2ea7f7ba6c1c45 + - path: results/multiqc + - path: results/no_intervals.bed + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz.tbi + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/preprocessing/markduplicates/test/test.md.cram + # binary changes md5sums on reruns + - path: results/preprocessing/markduplicates/test/test.md.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/markduplicates/test2/test2.md.cram + # binary changes md5sums on reruns + - path: results/preprocessing/markduplicates/test2/test2.md.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recal_table/test/test.recal.table + md5sum: 4ac774bf5f1157e77426fd82f5ac0fbe + - path: results/preprocessing/recal_table/test2/test2.recal.table + md5sum: 0626cd4337eab79b38b5bc5c95e0c003 + - path: results/preprocessing/recalibrated/test/test.recal.cram + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test2/test2.recal.cram + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test2/test2.recal.cram.crai + # binary changes md5sums on reruns + - path: results/reports/bcftools/freebayes/test/test.freebayes.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/bcftools/freebayes/test2_vs_test/test2_vs_test.freebayes.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/fastqc/test-test_L1 + - path: results/reports/markduplicates/test/test.md.cram.metrics + contains: ["test 8547 767 84 523391 3882 0 0 0.385081", "1.0 767 767"] + - path: 
results/reports/markduplicates/test2/test2.md.cram.metrics + contains: ["test2 10103 880 35 523579 4837 2 0 0.408076 193306", "1.0 1 876 876", "100.0 80.515303 0 0"] + - path: results/reports/mosdepth/test/test.md.mosdepth.global.dist.txt + md5sum: 5a0679057c530e5945c9c5a3a17312dc + - path: results/reports/mosdepth/test/test.md.mosdepth.summary.txt + md5sum: 0010c2396a3173c7cf4983abe2eb6a4c + - path: results/reports/mosdepth/test/test.md.per-base.bed.gz + md5sum: 5724f1c6b6a0e63e25ec8a0f38edfda6 + - path: results/reports/mosdepth/test/test.md.per-base.bed.gz.csi + md5sum: b0ab630c3241fbd7581b7a38d944ff8b + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: 5a0679057c530e5945c9c5a3a17312dc + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: 0010c2396a3173c7cf4983abe2eb6a4c + - path: results/reports/mosdepth/test/test.recal.per-base.bed.gz + md5sum: 5724f1c6b6a0e63e25ec8a0f38edfda6 + - path: results/reports/mosdepth/test/test.recal.per-base.bed.gz.csi + md5sum: b0ab630c3241fbd7581b7a38d944ff8b + - path: results/reports/mosdepth/test2/test2.md.mosdepth.global.dist.txt + md5sum: f25166c3a0051bb4d8c11a210278de6c + - path: results/reports/mosdepth/test2/test2.md.mosdepth.summary.txt + md5sum: d5e4084de2ea2a0a7b60b2d71c804d4b + - path: results/reports/mosdepth/test2/test2.md.per-base.bed.gz + md5sum: 55c160e8f3c8c7761524646426611f6b + - path: results/reports/mosdepth/test2/test2.md.per-base.bed.gz.csi + md5sum: 4205a09ede17cdbdaad45e3553f73105 + - path: results/reports/mosdepth/test2/test2.recal.mosdepth.global.dist.txt + md5sum: f25166c3a0051bb4d8c11a210278de6c + - path: results/reports/mosdepth/test2/test2.recal.mosdepth.summary.txt + md5sum: d5e4084de2ea2a0a7b60b2d71c804d4b + - path: results/reports/mosdepth/test2/test2.recal.per-base.bed.gz + md5sum: 55c160e8f3c8c7761524646426611f6b + - path: results/reports/mosdepth/test2/test2.recal.per-base.bed.gz.csi + md5sum: 4205a09ede17cdbdaad45e3553f73105 + - path: results/reports/samtools/test/test.md.cram.stats + # conda changes md5sums for test + - path: results/reports/samtools/test/test.recal.cram.stats + # conda changes md5sums for test + - path: results/reports/samtools/test2/test2.md.cram.stats + # conda changes md5sums for test + - path: results/reports/samtools/test2/test2.recal.cram.stats + # conda changes md5sums for test + - path: results/reports/vcftools/freebayes/test/test.freebayes.FILTER.summary + md5sum: 76c5919541536c12b5c8a6094d6d78d5 + - path: results/reports/vcftools/freebayes/test/test.freebayes.TsTv.count + md5sum: 0a0464beef110bc0f3c5a35d022b528e + - path: results/reports/vcftools/freebayes/test/test.freebayes.TsTv.qual + - path: results/reports/vcftools/freebayes/test2_vs_test/test2_vs_test.freebayes.FILTER.summary + md5sum: d2d717fef7c18ef9b40bbbc5c5bbf101 + - path: results/reports/vcftools/freebayes/test2_vs_test/test2_vs_test.freebayes.TsTv.count + md5sum: e09dacc71bf72254e3aace1cc7c1e16d + - path: results/reports/vcftools/freebayes/test2_vs_test/test2_vs_test.freebayes.TsTv.qual + # the text-based file test2_vs_test.freebayes.TsTv.qual seemingly changes content on reruns! 
+ - path: results/variant_calling/freebayes/test/test.freebayes.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/freebayes/test/test.freebayes.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/freebayes/test2_vs_test/test2_vs_test.freebayes.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/freebayes/test2_vs_test/test2_vs_test.freebayes.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/freebayes + should_exist: false +- name: Run variant calling on tumor_only sample with freebayes + command: nextflow run main.nf -profile test_cache,tools_tumoronly --tools freebayes --outdir results + tags: + - freebayes + - tumor_only + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: 5c5938a7bcc814cdaf5433c1120964c5 + - path: results/multiqc + - path: results/reports/bcftools/freebayes/sample2/sample2.freebayes.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/vcftools/freebayes/sample2/sample2.freebayes.FILTER.summary + md5sum: 0df3ddeec5779344b5d463347c9c6ea8 + - path: results/reports/vcftools/freebayes/sample2/sample2.freebayes.TsTv.count + md5sum: b1d308ed5087361a584cb61e7b835e1e + - path: results/reports/vcftools/freebayes/sample2/sample2.freebayes.TsTv.qual + # content changes md5sums on reruns + - path: results/variant_calling/freebayes/sample2/sample2.freebayes.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/freebayes/sample2/sample2.freebayes.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/freebayes + should_exist: false +- name: Run variant calling on tumor_only sample with freebayes without intervals + command: nextflow run main.nf -profile test_cache,tools_tumoronly --tools freebayes --no_intervals --outdir results + tags: + - freebayes + - no_intervals + - tumor_only + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: 5c5938a7bcc814cdaf5433c1120964c5 + - path: results/multiqc + - path: results/reports/bcftools/freebayes/sample2/sample2.freebayes.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/vcftools/freebayes/sample2/sample2.freebayes.FILTER.summary + md5sum: ee513ecf779b6e201b8ef98f95f25aab + - path: results/reports/vcftools/freebayes/sample2/sample2.freebayes.TsTv.count + md5sum: 2dc153ad5af26c9f8aa82442bf65b4bf + - path: results/reports/vcftools/freebayes/sample2/sample2.freebayes.TsTv.qual + # content changes md5sums on reruns + - path: results/variant_calling/freebayes/sample2/sample2.freebayes.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/freebayes/sample2/sample2.freebayes.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/freebayes + should_exist: false diff --git a/tests/test_gatk4spark.yml b/tests/test_gatk4spark.yml new file mode 100644 index 0000000000..7b6b42a3a5 --- /dev/null +++ b/tests/test_gatk4spark.yml @@ -0,0 +1,134 @@ +- name: Run default pipeline with gatk4spark + command: nextflow run main.nf -profile test_cache,use_gatk_spark --outdir results + tags: + - gatk4spark + - preprocessing + files: + - path: results/csv/markduplicates.csv + md5sum: 0dec46c4dc83acc263efc234805e9349 + - path: results/csv/markduplicates_no_table.csv + md5sum: f274df0cba98b2641fdc2a9becb23f78 + - path: results/csv/recalibrated.csv + md5sum: f6ab55df6c87f44c2d8651a2be0b959f + - path: results/multiqc + - path: results/preprocessing/markduplicates/test2/test2.md.cram + # binary file 
changes md5sums on reruns + - path: results/preprocessing/markduplicates/test2/test2.md.cram.crai + # binary file changes md5sums on reruns + - path: results/preprocessing/recal_table/test2/test2.recal.table + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test2/test2.recal.cram + # binary file changes md5sums on reruns + - path: results/preprocessing/recalibrated/test2/test2.recal.cram.crai + # binary file changes md5sums on reruns + - path: results/reports/fastqc/test2-test_L1 + - path: results/reports/markduplicates/test2/test2.md.cram.metrics + # text-based file changes md5sums on reruns + - path: results/reports/mosdepth/test2/test2.md.mosdepth.global.dist.txt + md5sum: 85d38a74ce189b9110c57cd94bc26757 + - path: results/reports/mosdepth/test2/test2.md.mosdepth.region.dist.txt + md5sum: 286d57b7d9b3a95ef18ab2eb7f913d81 + - path: results/reports/mosdepth/test2/test2.md.mosdepth.summary.txt + md5sum: 04b69ef7f00199dcea7822a79d2c7bd7 + - path: results/reports/mosdepth/test2/test2.md.regions.bed.gz + md5sum: 4e6c360aea7f05d801b2ea5685fe154a + - path: results/reports/mosdepth/test2/test2.md.regions.bed.gz.csi + md5sum: 5bf5fc178e4faf2462427502c3666004 + - path: results/reports/mosdepth/test2/test2.recal.mosdepth.global.dist.txt + md5sum: 85d38a74ce189b9110c57cd94bc26757 + - path: results/reports/mosdepth/test2/test2.recal.mosdepth.region.dist.txt + md5sum: 286d57b7d9b3a95ef18ab2eb7f913d81 + - path: results/reports/mosdepth/test2/test2.recal.mosdepth.summary.txt + md5sum: 04b69ef7f00199dcea7822a79d2c7bd7 + - path: results/reports/mosdepth/test2/test2.recal.regions.bed.gz + md5sum: 4e6c360aea7f05d801b2ea5685fe154a + - path: results/reports/samtools/test2/test2.md.cram.stats + # conda changes md5sums for test + - path: results/reports/samtools/test2/test2.recal.cram.stats + # conda changes md5sums for test + - path: results/preprocessing/mapped/ + should_exist: false +- name: Run default pipeline with gatk4spark and skipping all QC steps + command: nextflow run main.nf -profile test_cache,use_gatk_spark --skip_tools fastqc,markduplicates_report,mosdepth,multiqc,samtools --outdir results + tags: + - gatk4spark + - preprocessing + - skip_qc + files: + - path: results/csv/markduplicates.csv + md5sum: 0dec46c4dc83acc263efc234805e9349 + - path: results/csv/markduplicates_no_table.csv + md5sum: f274df0cba98b2641fdc2a9becb23f78 + - path: results/csv/recalibrated.csv + md5sum: f6ab55df6c87f44c2d8651a2be0b959f + - path: results/preprocessing/markduplicates/test2/test2.md.cram + # binary file changes md5sums on reruns + - path: results/preprocessing/markduplicates/test2/test2.md.cram.crai + # binary file changes md5sums on reruns + - path: results/preprocessing/recal_table/test2/test2.recal.table + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test2/test2.recal.cram + # binary file changes md5sums on reruns + - path: results/preprocessing/recalibrated/test2/test2.recal.cram.crai + # binary file changes md5sums on reruns + - path: results/multiqc + should_exist: false + - path: results/reports/fastqc + should_exist: false + - path: results/reports/markduplicates + should_exist: false + - path: results/reports/mosdepth + should_exist: false + - path: results/reports/samtools + should_exist: false +- name: Run save_output_as_bam with gatk4 spark + command: nextflow run main.nf -profile test_cache,use_gatk_spark --save_output_as_bam --outdir results + tags: + - gatk4spark + - preprocessing + - save_output_as_bam + files: + - path: 
results/csv/markduplicates.csv + md5sum: 974a2375ca62c56078076e40768b6367 + - path: results/csv/markduplicates_no_table.csv + md5sum: d508740377e048822c9e2f11f048a56d + - path: results/csv/recalibrated.csv + md5sum: e5f9c71eb6746324a68a0ef989c50a28 + - path: results/multiqc + - path: results/preprocessing/markduplicates/test2/test2.md.bam + # conda changes md5sums for test + - path: results/preprocessing/markduplicates/test2/test2.md.bam.bai + # conda changes md5sums for test + - path: results/preprocessing/recal_table/test2/test2.recal.table + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test2/test2.recal.bam + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test2/test2.recal.bam.bai + # binary changes md5sums on reruns + - path: results/reports/fastqc/test2-test_L1 + - path: results/reports/markduplicates/test2/test2.md.cram.metrics + # text-based file changes md5sums on reruns + - path: results/reports/mosdepth/test2/test2.md.mosdepth.global.dist.txt + md5sum: 85d38a74ce189b9110c57cd94bc26757 + - path: results/reports/mosdepth/test2/test2.md.mosdepth.region.dist.txt + md5sum: 286d57b7d9b3a95ef18ab2eb7f913d81 + - path: results/reports/mosdepth/test2/test2.md.mosdepth.summary.txt + md5sum: 04b69ef7f00199dcea7822a79d2c7bd7 + - path: results/reports/mosdepth/test2/test2.md.regions.bed.gz + md5sum: 4e6c360aea7f05d801b2ea5685fe154a + - path: results/reports/mosdepth/test2/test2.md.regions.bed.gz.csi + md5sum: 5bf5fc178e4faf2462427502c3666004 + - path: results/reports/mosdepth/test2/test2.recal.mosdepth.global.dist.txt + md5sum: 85d38a74ce189b9110c57cd94bc26757 + - path: results/reports/mosdepth/test2/test2.recal.mosdepth.region.dist.txt + md5sum: 286d57b7d9b3a95ef18ab2eb7f913d81 + - path: results/reports/mosdepth/test2/test2.recal.mosdepth.summary.txt + md5sum: 04b69ef7f00199dcea7822a79d2c7bd7 + - path: results/reports/mosdepth/test2/test2.recal.regions.bed.gz + md5sum: 4e6c360aea7f05d801b2ea5685fe154a + - path: results/reports/mosdepth/test2/test2.recal.regions.bed.gz.csi + md5sum: 5bf5fc178e4faf2462427502c3666004 + - path: results/reports/samtools/test2/test2.md.cram.stats + # conda changes md5sums for test + - path: results/reports/samtools/test2/test2.recal.cram.stats + # conda changes md5sums for test diff --git a/tests/test_haplotypecaller.yml b/tests/test_haplotypecaller.yml new file mode 100644 index 0000000000..85c06d5e75 --- /dev/null +++ b/tests/test_haplotypecaller.yml @@ -0,0 +1,100 @@ +- name: Run variant calling on germline sample with haplotypecaller + command: nextflow run main.nf -profile test_cache,targeted --input ./tests/csv/3.0/mapped_single_bam.csv --tools haplotypecaller --step variant_calling --outdir results + tags: + - germline + - haplotypecaller + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: d7d86e82902a4f57876b2414a4f812a4 + - path: results/multiqc + - path: results/preprocessing/converted/test/test.converted.cram + # binary changes md5sums on reruns + - path: results/preprocessing/converted/test/test.converted.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false + - path: results/reports/bcftools/haplotypecaller/test/test.haplotypecaller.filtered.bcftools_stats.txt + md5sum: 8ef41291923302f7a8100e4fc34bde0e + - path: 
results/reports/vcftools/haplotypecaller/test/test.haplotypecaller.filtered.FILTER.summary + md5sum: 4e2ceea7f3ff998004691fd71192d9ee + - path: results/reports/vcftools/haplotypecaller/test/test.haplotypecaller.filtered.TsTv.count + md5sum: b77c120ee5cc0423267200c67d60c663 + - path: results/reports/vcftools/haplotypecaller/test/test.haplotypecaller.filtered.TsTv.qual + md5sum: 1e34357e5848c318f8c2c7d3b041d229 + - path: results/variant_calling/haplotypecaller/test/test.haplotypecaller.filtered.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/haplotypecaller/test/test.haplotypecaller.filtered.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/haplotypecaller/test/test.haplotypecaller.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/haplotypecaller/test/test.haplotypecaller.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/haplotypecaller + should_exist: false + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: e82e90c7d508a135b5a8a7cd6933452e + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + md5sum: 3a2030e5e8af7bc12720c3a5592bf921 + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: 615c5c5019d88045a9ff5bbe6e63d270 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz + md5sum: 9f1ea20e7461db948ba21f70c4d1b3ba + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz.csi + md5sum: 5c00a1d457c387d6e71848a6d897e309 + - path: results/reports/samtools/test/test.recal.cram.stats + +- name: Run variant calling on germline sample with haplotypecaller without intervals + command: nextflow run main.nf -profile test_cache,targeted --input ./tests/csv/3.0/mapped_single_bam.csv --tools haplotypecaller --step variant_calling --no_intervals --outdir results + tags: + - germline + - haplotypecaller + - no_intervals + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: d7d86e82902a4f57876b2414a4f812a4 + - path: results/multiqc + - path: results/no_intervals.bed + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz.tbi + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/preprocessing/converted/test/test.converted.cram + # binary changes md5sums on reruns + - path: results/preprocessing/converted/test/test.converted.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false + - path: results/reports/bcftools/haplotypecaller/test/test.haplotypecaller.filtered.bcftools_stats.txt + md5sum: 8ef41291923302f7a8100e4fc34bde0e + - path: results/reports/vcftools/haplotypecaller/test/test.haplotypecaller.filtered.FILTER.summary + md5sum: 4e2ceea7f3ff998004691fd71192d9ee + - path: results/reports/vcftools/haplotypecaller/test/test.haplotypecaller.filtered.TsTv.count + md5sum: b77c120ee5cc0423267200c67d60c663 + - path: results/reports/vcftools/haplotypecaller/test/test.haplotypecaller.filtered.TsTv.qual + md5sum: 1e34357e5848c318f8c2c7d3b041d229 + - path: results/variant_calling/haplotypecaller/test/test.haplotypecaller.filtered.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/haplotypecaller/test/test.haplotypecaller.filtered.vcf.gz.tbi + # binary changes md5sums on reruns 
+ - path: results/variant_calling/haplotypecaller/test/test.haplotypecaller.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/haplotypecaller/test/test.haplotypecaller.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/haplotypecaller + should_exist: false + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: e82e90c7d508a135b5a8a7cd6933452e + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: 4f0d231060cbde4efdd673863bd2fb59 + - path: results/reports/mosdepth/test/test.recal.per-base.bed.gz + md5sum: bc1df47d46f818fee5275975925d769a + - path: results/reports/mosdepth/test/test.recal.per-base.bed.gz.csi + md5sum: 9e649ac749ff6c6073bef5ab63e8aaa4 + - path: results/reports/samtools/test/test.recal.cram.stats diff --git a/tests/test_haplotypecaller_skip_filter.yml b/tests/test_haplotypecaller_skip_filter.yml new file mode 100644 index 0000000000..8600485d33 --- /dev/null +++ b/tests/test_haplotypecaller_skip_filter.yml @@ -0,0 +1,103 @@ +- name: Run variant calling on germline sample with haplotypecaller and skip filter + command: nextflow run main.nf -profile test_cache,targeted --input ./tests/csv/3.0/mapped_single_bam.csv --tools haplotypecaller --step variant_calling --skip_tools haplotypecaller_filter --outdir results + tags: + - germline + - haplotypecaller_skip_filter + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: f1041cfc30cedb240f224dd8e3dbf9d2 + - path: results/multiqc + - path: results/preprocessing/converted/test/test.converted.cram + # binary changes md5sums on reruns + - path: results/preprocessing/converted/test/test.converted.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false + - path: results/variant_calling/haplotypecaller/test/test.haplotypecaller.filtered.vcf.gz + should_exist: false + - path: results/variant_calling/haplotypecaller/test/test.haplotypecaller.filtered.vcf.gz.tbi + should_exist: false + - path: results/reports/bcftools/haplotypecaller/test/test.haplotypecaller.bcftools_stats.txt + md5sum: 98c4a1fdd2700f2e5aee3a5060de57e0 + - path: results/reports/vcftools/haplotypecaller/test/test.haplotypecaller.FILTER.summary + md5sum: 87a84b5f8ac3d3cbeeef7d60afcdbfe7 + - path: results/reports/vcftools/haplotypecaller/test/test.haplotypecaller.TsTv.count + md5sum: b77c120ee5cc0423267200c67d60c663 + - path: results/reports/vcftools/haplotypecaller/test/test.haplotypecaller.TsTv.qual + md5sum: 1e34357e5848c318f8c2c7d3b041d229 + - path: results/variant_calling/haplotypecaller/test/test.haplotypecaller.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/haplotypecaller/test/test.haplotypecaller.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/haplotypecaller/test/test.haplotypecaller.filtered.vcf.gz + should_exist: false + - path: results/variant_calling/haplotypecaller/test/test.haplotypecaller.filtered.vcf.gz.tbi + should_exist: false + - path: results/haplotypecaller + should_exist: false + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: e82e90c7d508a135b5a8a7cd6933452e + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + md5sum: 3a2030e5e8af7bc12720c3a5592bf921 + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: 
615c5c5019d88045a9ff5bbe6e63d270 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz + md5sum: 9f1ea20e7461db948ba21f70c4d1b3ba + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz.csi + md5sum: 5c00a1d457c387d6e71848a6d897e309 + - path: results/reports/samtools/test/test.recal.cram.stats +- name: Run variant calling on germline sample with haplotypecaller without intervals and skip filter + command: nextflow run main.nf -profile test_cache,targeted --input ./tests/csv/3.0/mapped_single_bam.csv --tools haplotypecaller --step variant_calling --skip_tools haplotypecaller_filter --no_intervals --outdir results + tags: + - germline + - haplotypecaller_skip_filter + - no_intervals + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: f1041cfc30cedb240f224dd8e3dbf9d2 + - path: results/multiqc + - path: results/no_intervals.bed + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz.tbi + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/preprocessing/converted/test/test.converted.cram + # binary changes md5sums on reruns + - path: results/preprocessing/converted/test/test.converted.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false + - path: results/reports/bcftools/haplotypecaller/test/test.haplotypecaller.bcftools_stats.txt + md5sum: 98c4a1fdd2700f2e5aee3a5060de57e0 + - path: results/reports/vcftools/haplotypecaller/test/test.haplotypecaller.FILTER.summary + md5sum: 87a84b5f8ac3d3cbeeef7d60afcdbfe7 + - path: results/reports/vcftools/haplotypecaller/test/test.haplotypecaller.TsTv.count + md5sum: b77c120ee5cc0423267200c67d60c663 + - path: results/reports/vcftools/haplotypecaller/test/test.haplotypecaller.TsTv.qual + md5sum: 1e34357e5848c318f8c2c7d3b041d229 + - path: results/variant_calling/haplotypecaller/test/test.haplotypecaller.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/haplotypecaller/test/test.haplotypecaller.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/haplotypecaller/test/test.haplotypecaller.filtered.vcf.gz + should_exist: false + - path: results/variant_calling/haplotypecaller/test/test.haplotypecaller.filtered.vcf.gz.tbi + should_exist: false + - path: results/haplotypecaller + should_exist: false + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: e82e90c7d508a135b5a8a7cd6933452e + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: 4f0d231060cbde4efdd673863bd2fb59 + - path: results/reports/mosdepth/test/test.recal.per-base.bed.gz + md5sum: bc1df47d46f818fee5275975925d769a + - path: results/reports/mosdepth/test/test.recal.per-base.bed.gz.csi + md5sum: 9e649ac749ff6c6073bef5ab63e8aaa4 + - path: results/reports/samtools/test/test.recal.cram.stats diff --git a/tests/test_intervals.yml b/tests/test_intervals.yml new file mode 100644 index 0000000000..f0cc2495c3 --- /dev/null +++ b/tests/test_intervals.yml @@ -0,0 +1,169 @@ +- name: Run default pipeline with target bed + command: nextflow run main.nf -profile test_cache,targeted --outdir results + tags: + - intervals + - preprocessing + - targeted + files: + - path: results/csv/markduplicates.csv + md5sum: 0d6120bb99e92f6810343270711ca53e + - path: 
results/csv/markduplicates_no_table.csv + md5sum: 2a2d3d4842befd4def39156463859ee3 + - path: results/csv/recalibrated.csv + md5sum: 2d29d9e53894dcce96a1b5beb6ef3312 + - path: results/multiqc + - path: results/preprocessing/markduplicates/test/test.md.cram + # binary changes md5sums on reruns + - path: results/preprocessing/markduplicates/test/test.md.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recal_table/test/test.recal.table + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + # binary changes md5sums on reruns + - path: results/reports/fastqc/test-test_L1 + - path: results/reports/markduplicates/test/test.md.cram.metrics + contains: ["test 17094 1534 168 1046782 12429 197 0 0.635998", "1.0 0.999991 1171"] + - path: results/reports/mosdepth/test/test.md.mosdepth.global.dist.txt + md5sum: e4ce28ba1c331dc08bc53a0189908f77 + - path: results/reports/mosdepth/test/test.md.mosdepth.region.dist.txt + md5sum: 04d9f20dc5306990eec982a3c5a7d107 + - path: results/reports/mosdepth/test/test.md.mosdepth.summary.txt + md5sum: 70b4bbe29bd5e7c4ea39b6caf3316096 + - path: results/reports/mosdepth/test/test.md.per-base.bed.gz + md5sum: 2a0c38fb19d6a1f81ca2018e59e7bfcf + - path: results/reports/mosdepth/test/test.md.per-base.bed.gz.csi + md5sum: 0714f8c677277168b9f95d3a43ea5237 + - path: results/reports/mosdepth/test/test.md.regions.bed.gz + md5sum: eb0bc92c253326a109e73af98c9a7d4b + - path: results/reports/mosdepth/test/test.md.regions.bed.gz.csi + md5sum: c6d1ac97ef4dfe43731c8368d8391cab + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: 9c1d90e0fed14b710098b7724b602aea + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + md5sum: 04d9f20dc5306990eec982a3c5a7d107 + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: 17dfb78b147488eb8fd450294de4a35e + - path: results/reports/mosdepth/test/test.recal.per-base.bed.gz + md5sum: fb6804911f9d437d0251869fe112a528 + - path: results/reports/mosdepth/test/test.recal.per-base.bed.gz.csi + md5sum: 033032cdbb3a2b74dd41dac89628112c + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz + md5sum: eb0bc92c253326a109e73af98c9a7d4b + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz.csi + md5sum: c6d1ac97ef4dfe43731c8368d8391cab + - path: results/reports/samtools/test/test.md.cram.stats + # conda changes md5sums for test + - path: results/reports/samtools/test/test.recal.cram.stats + # conda changes md5sums for test +- name: Run pipeline with intervals false + command: nextflow run main.nf -profile test_cache --intervals false --save_reference --outdir results + tags: + - intervals + - intervals_false + - preprocessing + files: + - path: results/csv/markduplicates.csv + md5sum: 0d6120bb99e92f6810343270711ca53e + - path: results/csv/markduplicates_no_table.csv + md5sum: 2a2d3d4842befd4def39156463859ee3 + - path: results/csv/recalibrated.csv + md5sum: 2d29d9e53894dcce96a1b5beb6ef3312 + - path: results/multiqc + - path: results/preprocessing/markduplicates/test/test.md.cram + # binary changes md5sums on reruns + - path: results/preprocessing/markduplicates/test/test.md.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recal_table/test/test.recal.table + # binary changes md5sums on reruns + - path: 
results/preprocessing/recalibrated/test/test.recal.cram + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + # binary changes md5sums on reruns + - path: results/reference/intervals/chr22_1-40001.bed + md5sum: 87a15eb9c2ff20ccd5cd8735a28708f7 + - path: results/reference/intervals/chr22_1-40001.bed.gz + md5sum: d3341fa28986c40b24fcc10a079dbb80 + - path: results/reports/fastqc/test-test_L1 + - path: results/reports/markduplicates/test/test.md.cram.metrics + contains: ["test 17094 1534 168 1046782 12429 197 0 0.635998", "1.0 0.999991 1171"] + - path: results/reports/mosdepth/test/test.md.mosdepth.global.dist.txt + md5sum: b61e1acee11a6ddf7ce3232a5948a6a0 + - path: results/reports/mosdepth/test/test.md.mosdepth.region.dist.txt + md5sum: 1a382f98d488d2ae3df83a0d87caafc1 + - path: results/reports/mosdepth/test/test.md.mosdepth.summary.txt + md5sum: 839108358878ada89e1eaddf6e0541ba + - path: results/reports/mosdepth/test/test.md.regions.bed.gz + md5sum: 0aaee6da65050bedcd40b9fbf0622873 + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: b61e1acee11a6ddf7ce3232a5948a6a0 + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + md5sum: 1a382f98d488d2ae3df83a0d87caafc1 + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: 839108358878ada89e1eaddf6e0541ba + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz + md5sum: 0aaee6da65050bedcd40b9fbf0622873 + - path: results/reports/samtools/test/test.md.cram.stats + # conda changes md5sums for test + - path: results/reports/samtools/test/test.recal.cram.stats + # conda changes md5sums for test +- name: Run default pipeline without intervals + command: nextflow run main.nf -profile test_cache,no_intervals --outdir results + tags: + - intervals + - no_intervals + - preprocessing + files: + - path: results/csv/markduplicates.csv + md5sum: 0d6120bb99e92f6810343270711ca53e + - path: results/csv/markduplicates_no_table.csv + md5sum: 2a2d3d4842befd4def39156463859ee3 + - path: results/csv/recalibrated.csv + md5sum: 2d29d9e53894dcce96a1b5beb6ef3312 + - path: results/multiqc + - path: results/no_intervals.bed + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz.tbi + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/preprocessing/markduplicates/test/test.md.cram + # binary changes md5sums on reruns + - path: results/preprocessing/markduplicates/test/test.md.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recal_table/test/test.recal.table + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + # binary changes md5sums on reruns + - path: results/reference/intervals + should_exist: false + - path: results/reports/fastqc/test-test_L1 + - path: results/reports/markduplicates/test/test.md.cram.metrics + contains: ["test 17094 1534 168 1046782 12429 197 0 0.635998", "1.0 0.999991 1171"] + - path: results/reports/mosdepth/test/test.md.mosdepth.global.dist.txt + md5sum: b61e1acee11a6ddf7ce3232a5948a6a0 + - path: results/reports/mosdepth/test/test.md.mosdepth.region.dist.txt + md5sum: 1a382f98d488d2ae3df83a0d87caafc1 + - path: results/reports/mosdepth/test/test.md.mosdepth.summary.txt + md5sum: 839108358878ada89e1eaddf6e0541ba 
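+ # NB: md5sums in this run match the `--intervals false` test above, since both process the genome without target intervals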
+ - path: results/reports/mosdepth/test/test.md.regions.bed.gz + md5sum: 0aaee6da65050bedcd40b9fbf0622873 + - path: results/reports/mosdepth/test/test.md.regions.bed.gz.csi + md5sum: 544e02fcca548749a0af758d0a2df352 + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: b61e1acee11a6ddf7ce3232a5948a6a0 + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + md5sum: 1a382f98d488d2ae3df83a0d87caafc1 + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: 839108358878ada89e1eaddf6e0541ba + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz + md5sum: 0aaee6da65050bedcd40b9fbf0622873 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz.csi + md5sum: 544e02fcca548749a0af758d0a2df352 + - path: results/reports/samtools/test/test.md.cram.stats + # conda changes md5sums for test + - path: results/reports/samtools/test/test.recal.cram.stats + # conda changes md5sums for test diff --git a/tests/test_joint_germline.yml b/tests/test_joint_germline.yml new file mode 100644 index 0000000000..7b634193c6 --- /dev/null +++ b/tests/test_joint_germline.yml @@ -0,0 +1,167 @@ +- name: Run joint germline variant calling with haplotypecaller + command: nextflow run main.nf -profile test_cache,targeted --input ./tests/csv/3.0/mapped_joint_bam.csv --tools haplotypecaller --step variant_calling --joint_germline --outdir results --known_snps_vqsr false --known_indels_vqsr false + tags: + - germline + - joint_germline + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: d2dffdbd2b4f1f26a06637592d24dab3 + - path: results/multiqc + - path: results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false + - path: results/reports/bcftools/haplotypecaller/joint_variant_calling/joint_germline.bcftools_stats.txt + # Not stable enough + - path: results/reports/vcftools/haplotypecaller/joint_variant_calling/joint_germline.FILTER.summary + # Not stable enough + - path: results/reports/vcftools/haplotypecaller/joint_variant_calling/joint_germline.TsTv.count + # Not stable enough + - path: results/reports/vcftools/haplotypecaller/joint_variant_calling/joint_germline.TsTv.qual + # Not stable enough + - path: results/variant_calling/haplotypecaller/joint_variant_calling/joint_germline.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/haplotypecaller/joint_variant_calling/joint_germline.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/haplotypecaller/testN/testN.haplotypecaller.g.vcf.gz + - path: results/variant_calling/haplotypecaller/testN/testN.haplotypecaller.g.vcf.gz.tbi + - path: results/variant_calling/haplotypecaller/testT/testT.haplotypecaller.g.vcf.gz + - path: results/variant_calling/haplotypecaller/testT/testT.haplotypecaller.g.vcf.gz.tbi + - path: results/haplotypecaller + should_exist: false + - path: results/reports/mosdepth/testN/testN.recal.mosdepth.global.dist.txt + md5sum: e82e90c7d508a135b5a8a7cd6933452e + - path: results/reports/mosdepth/testN/testN.recal.mosdepth.region.dist.txt + md5sum: 3a2030e5e8af7bc12720c3a5592bf921 + - path: results/reports/mosdepth/testN/testN.recal.mosdepth.summary.txt + md5sum: 615c5c5019d88045a9ff5bbe6e63d270 + - path: results/reports/mosdepth/testN/testN.recal.regions.bed.gz + md5sum: 9f1ea20e7461db948ba21f70c4d1b3ba + - path: results/reports/mosdepth/testN/testN.recal.regions.bed.gz.csi + 
md5sum: 5c00a1d457c387d6e71848a6d897e309 + - path: results/reports/samtools/testN/testN.recal.cram.stats + - path: results/reports/mosdepth/testT/testT.recal.mosdepth.global.dist.txt + md5sum: ba97ed85645f77da6f3adad138b3cdb4 + - path: results/reports/mosdepth/testT/testT.recal.mosdepth.region.dist.txt + md5sum: a7eb835371dd0aaf347ccca7ebe1eb3b + - path: results/reports/mosdepth/testT/testT.recal.mosdepth.summary.txt + md5sum: a937108cbf24c1430b79c861234ce22b + - path: results/reports/mosdepth/testT/testT.recal.regions.bed.gz + md5sum: d2b579a74bf8d858f82869f073056252 + - path: results/reports/mosdepth/testT/testT.recal.regions.bed.gz.csi + md5sum: 5c00a1d457c387d6e71848a6d897e309 + - path: results/reports/samtools/testT/testT.recal.cram.stats +- name: Run joint germline variant calling with haplotypecaller all intervals at once + command: nextflow run main.nf -profile test_cache,targeted --input ./tests/csv/3.0/mapped_joint_bam.csv --tools haplotypecaller --step variant_calling --joint_germline --outdir results --nucleotides_per_second 100 + tags: + - germline + - joint_germline + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: d2dffdbd2b4f1f26a06637592d24dab3 + - path: results/multiqc + - path: results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false + - path: results/reports/bcftools/haplotypecaller/joint_variant_calling/joint_germline.bcftools_stats.txt + # Not stable enough + - path: results/reports/vcftools/haplotypecaller/joint_variant_calling/joint_germline.FILTER.summary + # Not stable enough + - path: results/reports/vcftools/haplotypecaller/joint_variant_calling/joint_germline.TsTv.count + # Not stable enough + - path: results/reports/vcftools/haplotypecaller/joint_variant_calling/joint_germline.TsTv.qual + # Not stable enough + - path: results/variant_calling/haplotypecaller/joint_variant_calling/joint_germline.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/haplotypecaller/joint_variant_calling/joint_germline.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/haplotypecaller/testN/testN.haplotypecaller.g.vcf.gz + - path: results/variant_calling/haplotypecaller/testN/testN.haplotypecaller.g.vcf.gz.tbi + - path: results/variant_calling/haplotypecaller/testT/testT.haplotypecaller.g.vcf.gz + - path: results/variant_calling/haplotypecaller/testT/testT.haplotypecaller.g.vcf.gz.tbi + - path: results/haplotypecaller + should_exist: false + - path: results/reports/mosdepth/testN/testN.recal.mosdepth.global.dist.txt + md5sum: e82e90c7d508a135b5a8a7cd6933452e + - path: results/reports/mosdepth/testN/testN.recal.mosdepth.region.dist.txt + md5sum: 3a2030e5e8af7bc12720c3a5592bf921 + - path: results/reports/mosdepth/testN/testN.recal.mosdepth.summary.txt + md5sum: 615c5c5019d88045a9ff5bbe6e63d270 + - path: results/reports/mosdepth/testN/testN.recal.regions.bed.gz + md5sum: 9f1ea20e7461db948ba21f70c4d1b3ba + - path: results/reports/mosdepth/testN/testN.recal.regions.bed.gz.csi + md5sum: 5c00a1d457c387d6e71848a6d897e309 + - path: results/reports/samtools/testN/testN.recal.cram.stats + - path: results/reports/mosdepth/testT/testT.recal.mosdepth.global.dist.txt + md5sum: ba97ed85645f77da6f3adad138b3cdb4 + - path: results/reports/mosdepth/testT/testT.recal.mosdepth.region.dist.txt + md5sum: a7eb835371dd0aaf347ccca7ebe1eb3b + - path: results/reports/mosdepth/testT/testT.recal.mosdepth.summary.txt 
+ md5sum: a937108cbf24c1430b79c861234ce22b + - path: results/reports/mosdepth/testT/testT.recal.regions.bed.gz + md5sum: d2b579a74bf8d858f82869f073056252 + - path: results/reports/mosdepth/testT/testT.recal.regions.bed.gz.csi + md5sum: 5c00a1d457c387d6e71848a6d897e309 + - path: results/reports/samtools/testT/testT.recal.cram.stats +- name: Run joint germline variant calling with haplotypecaller with stub-run for VQSR + command: nextflow run main.nf -profile test_cache,tools_germline --input ./tests/csv/3.0/mapped_joint_bam.csv --tools haplotypecaller --step variant_calling --joint_germline --outdir results -stub-run + tags: + - germline + - joint_germline + - variant_calling + - vqsr + files: + - path: results/csv/variantcalled.csv + md5sum: 8513cd4aef3f54e2a72940461617c6c7 + - path: results/multiqc + - path: results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false + - path: results/reports/bcftools/haplotypecaller/joint_variant_calling/joint_germline_recalibrated.bcftools_stats.txt + # Not stable enough + - path: results/reports/vcftools/haplotypecaller/joint_variant_calling/joint_germline_recalibrated.FILTER.summary + # Not stable enough + - path: results/reports/vcftools/haplotypecaller/joint_variant_calling/joint_germline_recalibrated.TsTv.count + # Not stable enough + - path: results/reports/vcftools/haplotypecaller/joint_variant_calling/joint_germline_recalibrated.TsTv.qual + # Not stable enough + - path: results/variant_calling/haplotypecaller/joint_variant_calling/joint_germline.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/haplotypecaller/joint_variant_calling/joint_germline.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/haplotypecaller/joint_variant_calling/joint_germline_recalibrated.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/haplotypecaller/joint_variant_calling/joint_germline_recalibrated.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/haplotypecaller/testN/testN.haplotypecaller.g.vcf.gz + - path: results/variant_calling/haplotypecaller/testN/testN.haplotypecaller.g.vcf.gz.tbi + - path: results/variant_calling/haplotypecaller/testT/testT.haplotypecaller.g.vcf.gz + - path: results/variant_calling/haplotypecaller/testT/testT.haplotypecaller.g.vcf.gz.tbi + - path: results/haplotypecaller + should_exist: false + - path: results/reports/mosdepth/testN/testN.recal.global.dist.txt + md5sum: d41d8cd98f00b204e9800998ecf8427e + - path: results/reports/mosdepth/testN/testN.recal.region.dist.txt + md5sum: d41d8cd98f00b204e9800998ecf8427e + - path: results/reports/mosdepth/testN/testN.recal.summary.txt + md5sum: d41d8cd98f00b204e9800998ecf8427e + - path: results/reports/mosdepth/testN/testN.recal.regions.bed.gz + md5sum: d41d8cd98f00b204e9800998ecf8427e + - path: results/reports/mosdepth/testN/testN.recal.regions.bed.gz.csi + md5sum: d41d8cd98f00b204e9800998ecf8427e + - path: results/reports/samtools/testN/testN.recal.cram.stats + - path: results/reports/mosdepth/testT/testT.recal.global.dist.txt + md5sum: d41d8cd98f00b204e9800998ecf8427e + - path: results/reports/mosdepth/testT/testT.recal.region.dist.txt + md5sum: d41d8cd98f00b204e9800998ecf8427e + - path: results/reports/mosdepth/testT/testT.recal.summary.txt + md5sum: d41d8cd98f00b204e9800998ecf8427e + - path: results/reports/mosdepth/testT/testT.recal.regions.bed.gz + md5sum: 
d41d8cd98f00b204e9800998ecf8427e + - path: results/reports/mosdepth/testT/testT.recal.regions.bed.gz.csi + md5sum: d41d8cd98f00b204e9800998ecf8427e + - path: results/reports/samtools/testT/testT.recal.cram.stats diff --git a/tests/test_manta.yml b/tests/test_manta.yml new file mode 100644 index 0000000000..b119ca80b7 --- /dev/null +++ b/tests/test_manta.yml @@ -0,0 +1,297 @@ +- name: Run variant calling on germline sample with manta + command: nextflow run main.nf -profile test_cache,tools_germline --tools manta --outdir results + tags: + - germline + - manta + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: 981280af86f69190fdf0639030a80249 + - path: results/multiqc + - path: results/reports/bcftools/manta/sample1/sample1.manta.diploid_sv.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/vcftools/manta/sample1/sample1.manta.diploid_sv.FILTER.summary + md5sum: 1ce42d34e4ae919afb519efc99146423 + - path: results/reports/vcftools/manta/sample1/sample1.manta.diploid_sv.TsTv.count + md5sum: fa27f678965b7cba6a92efcd039f802a + - path: results/reports/vcftools/manta/sample1/sample1.manta.diploid_sv.TsTv.qual + md5sum: bc68ae4e688e9fb772b457069e604883 + - path: results/variant_calling/manta/sample1/sample1.manta.diploid_sv.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/manta/sample1/sample1.manta.diploid_sv.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/manta + should_exist: false + - path: results/reports/mosdepth/sample1/sample1.recal.mosdepth.global.dist.txt + md5sum: 69e29702ef01fd8f6c7a5468fc35a16a + - path: results/reports/mosdepth/sample1/sample1.recal.mosdepth.region.dist.txt + md5sum: 6ec49cd7d510c2eb3d9d90fdb79b783a + - path: results/reports/mosdepth/sample1/sample1.recal.mosdepth.summary.txt + md5sum: 103098d0bf76ed82d2b87d5f242b099a + - path: results/reports/mosdepth/sample1/sample1.recal.regions.bed.gz + md5sum: b5888cf7395c57d39879a5faa6159eb3 + - path: results/reports/mosdepth/sample1/sample1.recal.regions.bed.gz.csi + md5sum: 9cb0ad7039a3b703d16ca7d5b835c0ee + - path: results/reports/samtools/sample1/sample1.recal.cram.stats +- name: Run variant calling on germline sample with manta without intervals + command: nextflow run main.nf -profile test_cache,tools_germline --tools manta --no_intervals --outdir results + tags: + - germline + - manta + - no_intervals + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: 981280af86f69190fdf0639030a80249 + - path: results/multiqc + - path: results/no_intervals.bed + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz.tbi + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/reports/bcftools/manta/sample1/sample1.manta.diploid_sv.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/vcftools/manta/sample1/sample1.manta.diploid_sv.FILTER.summary + md5sum: 1ce42d34e4ae919afb519efc99146423 + - path: results/reports/vcftools/manta/sample1/sample1.manta.diploid_sv.TsTv.count + md5sum: fa27f678965b7cba6a92efcd039f802a + - path: results/reports/vcftools/manta/sample1/sample1.manta.diploid_sv.TsTv.qual + md5sum: bc68ae4e688e9fb772b457069e604883 + - path: results/variant_calling/manta/sample1/sample1.manta.diploid_sv.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/manta/sample1/sample1.manta.diploid_sv.vcf.gz.tbi + md5sum: 4cb176febbc8c26d717a6c6e67b9c905 
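+ # NB: assuming pytest-workflow is installed, just the manta checks can be run locally with e.g. `pytest --tag manta --symlink --kwdof`
+ # tool outputs must be published under variant_calling/, never as a bare directory at the results root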
+ - path: results/manta + should_exist: false + - path: results/reports/mosdepth/sample1/sample1.recal.mosdepth.global.dist.txt + md5sum: 69e29702ef01fd8f6c7a5468fc35a16a + - path: results/reports/mosdepth/sample1/sample1.recal.mosdepth.summary.txt + md5sum: d2775eb102acc5950f7f53883dcb503d + - path: results/reports/mosdepth/sample1/sample1.recal.per-base.bed.gz + md5sum: 54431f155c9538809e8caf99d1a75ec7 + - path: results/reports/mosdepth/sample1/sample1.recal.per-base.bed.gz.csi + md5sum: c67dcd711b096eb42f43784d5eadbc0d + - path: results/reports/samtools/sample1/sample1.recal.cram.stats +- name: Run variant calling on tumor_only sample with manta + command: nextflow run main.nf -profile test_cache,tools_tumoronly --tools manta --outdir results + tags: + - manta + - tumor_only + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: f1051fe647abf202e6332f9a1789c05d + - path: results/multiqc + - path: results/reports/bcftools/manta/sample2/sample2.manta.tumor_sv.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/vcftools/manta/sample2/sample2.manta.tumor_sv.FILTER.summary + md5sum: 1ce42d34e4ae919afb519efc99146423 + - path: results/reports/vcftools/manta/sample2/sample2.manta.tumor_sv.TsTv.count + md5sum: fa27f678965b7cba6a92efcd039f802a + - path: results/reports/vcftools/manta/sample2/sample2.manta.tumor_sv.TsTv.qual + md5sum: bc68ae4e688e9fb772b457069e604883 + - path: results/variant_calling/manta/sample2/sample2.manta.tumor_sv.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/manta/sample2/sample2.manta.tumor_sv.vcf.gz.tbi + md5sum: 4cb176febbc8c26d717a6c6e67b9c905 + - path: results/manta + should_exist: false + - path: results/reports/mosdepth/sample2/sample2.recal.mosdepth.global.dist.txt + md5sum: f2dcd00a64947c49e8e4b93c2f4fbf27 + - path: results/reports/mosdepth/sample2/sample2.recal.mosdepth.region.dist.txt + md5sum: 39005ffaac22871ffaaf19656fe69c5b + - path: results/reports/mosdepth/sample2/sample2.recal.mosdepth.summary.txt + md5sum: 68d4b98f17361fddf73052ead34fa370 + - path: results/reports/mosdepth/sample2/sample2.recal.regions.bed.gz + md5sum: 2819e995eafded35f53328c4ec19ba58 + - path: results/reports/mosdepth/sample2/sample2.recal.regions.bed.gz.csi + md5sum: 393c2749068304d8545b501b9d4658e4 + - path: results/reports/samtools/sample2/sample2.recal.cram.stats +- name: Run variant calling on tumor_only sample with manta without intervals + command: nextflow run main.nf -profile test_cache,tools_tumoronly --tools manta --no_intervals --outdir results + tags: + - manta + - no_intervals + - tumor_only + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: f1051fe647abf202e6332f9a1789c05d + - path: results/multiqc + - path: results/no_intervals.bed + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz.tbi + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/reports/bcftools/manta/sample2/sample2.manta.tumor_sv.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/vcftools/manta/sample2/sample2.manta.tumor_sv.FILTER.summary + md5sum: 1ce42d34e4ae919afb519efc99146423 + - path: results/reports/vcftools/manta/sample2/sample2.manta.tumor_sv.TsTv.count + md5sum: fa27f678965b7cba6a92efcd039f802a + - path: results/reports/vcftools/manta/sample2/sample2.manta.tumor_sv.TsTv.qual + md5sum: bc68ae4e688e9fb772b457069e604883 + - path: 
results/variant_calling/manta/sample2/sample2.manta.tumor_sv.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/manta/sample2/sample2.manta.tumor_sv.vcf.gz.tbi + md5sum: 4cb176febbc8c26d717a6c6e67b9c905 + - path: results/manta + should_exist: false + - path: results/reports/mosdepth/sample2/sample2.recal.mosdepth.global.dist.txt + md5sum: f2dcd00a64947c49e8e4b93c2f4fbf27 + - path: results/reports/mosdepth/sample2/sample2.recal.mosdepth.summary.txt + md5sum: 0a7300e56eda6fba7c7564f00aa000f0 + - path: results/reports/mosdepth/sample2/sample2.recal.per-base.bed.gz + md5sum: 3de4a9f4da2f2b4909ef192452a8d211 + - path: results/reports/mosdepth/sample2/sample2.recal.per-base.bed.gz.csi + md5sum: cfb07b0ba46e8468b4342edb243536f3 + - path: results/reports/samtools/sample2/sample2.recal.cram.stats +- name: Run variant calling on somatic sample with manta + command: nextflow run main.nf -profile test_cache,tools_somatic --tools manta --outdir results + tags: + - manta + - somatic + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: 3a8861808601994f89d5c55ce5c95dae + - path: results/multiqc + - path: results/reports/bcftools/manta/sample3/sample3.manta.diploid_sv.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/bcftools/manta/sample4_vs_sample3/sample4_vs_sample3.manta.diploid_sv.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/bcftools/manta/sample4_vs_sample3/sample4_vs_sample3.manta.somatic_sv.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/vcftools/manta/sample3/sample3.manta.diploid_sv.FILTER.summary + md5sum: 1ce42d34e4ae919afb519efc99146423 + - path: results/reports/vcftools/manta/sample3/sample3.manta.diploid_sv.TsTv.count + md5sum: fa27f678965b7cba6a92efcd039f802a + - path: results/reports/vcftools/manta/sample3/sample3.manta.diploid_sv.TsTv.qual + md5sum: bc68ae4e688e9fb772b457069e604883 + - path: results/reports/vcftools/manta/sample4_vs_sample3/sample4_vs_sample3.manta.diploid_sv.FILTER.summary + md5sum: 1ce42d34e4ae919afb519efc99146423 + - path: results/reports/vcftools/manta/sample4_vs_sample3/sample4_vs_sample3.manta.diploid_sv.TsTv.count + md5sum: fa27f678965b7cba6a92efcd039f802a + - path: results/reports/vcftools/manta/sample4_vs_sample3/sample4_vs_sample3.manta.diploid_sv.TsTv.qual + md5sum: bc68ae4e688e9fb772b457069e604883 + - path: results/reports/vcftools/manta/sample4_vs_sample3/sample4_vs_sample3.manta.somatic_sv.FILTER.summary + md5sum: 1ce42d34e4ae919afb519efc99146423 + - path: results/reports/vcftools/manta/sample4_vs_sample3/sample4_vs_sample3.manta.somatic_sv.TsTv.count + md5sum: 8dcfdbcaac118df1d5ad407dd2af699f + - path: results/reports/vcftools/manta/sample4_vs_sample3/sample4_vs_sample3.manta.somatic_sv.TsTv.qual + md5sum: bc68ae4e688e9fb772b457069e604883 + - path: results/variant_calling/manta/sample3/sample3.manta.diploid_sv.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/manta/sample3/sample3.manta.diploid_sv.vcf.gz.tbi + md5sum: 4cb176febbc8c26d717a6c6e67b9c905 + - path: results/variant_calling/manta/sample4_vs_sample3/sample4_vs_sample3.manta.diploid_sv.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/manta/sample4_vs_sample3/sample4_vs_sample3.manta.diploid_sv.vcf.gz.tbi + md5sum: 4cb176febbc8c26d717a6c6e67b9c905 + - path: results/variant_calling/manta/sample4_vs_sample3/sample4_vs_sample3.manta.somatic_sv.vcf.gz + # binary changes md5sums on reruns + - path: 
results/variant_calling/manta/sample4_vs_sample3/sample4_vs_sample3.manta.somatic_sv.vcf.gz.tbi + md5sum: 4cb176febbc8c26d717a6c6e67b9c905 + - path: results/manta + should_exist: false + - path: results/reports/mosdepth/sample3/sample3.recal.mosdepth.global.dist.txt + md5sum: 69e29702ef01fd8f6c7a5468fc35a16a + - path: results/reports/mosdepth/sample3/sample3.recal.mosdepth.region.dist.txt + md5sum: 6ec49cd7d510c2eb3d9d90fdb79b783a + - path: results/reports/mosdepth/sample3/sample3.recal.mosdepth.summary.txt + md5sum: 103098d0bf76ed82d2b87d5f242b099a + - path: results/reports/mosdepth/sample3/sample3.recal.regions.bed.gz + md5sum: b5888cf7395c57d39879a5faa6159eb3 + - path: results/reports/mosdepth/sample3/sample3.recal.regions.bed.gz.csi + md5sum: 9cb0ad7039a3b703d16ca7d5b835c0ee + - path: results/reports/samtools/sample3/sample3.recal.cram.stats + # conda changes md5sums for test + - path: results/reports/mosdepth/sample4/sample4.recal.mosdepth.global.dist.txt + md5sum: f2dcd00a64947c49e8e4b93c2f4fbf27 + - path: results/reports/mosdepth/sample4/sample4.recal.mosdepth.region.dist.txt + md5sum: 39005ffaac22871ffaaf19656fe69c5b + - path: results/reports/mosdepth/sample4/sample4.recal.mosdepth.summary.txt + md5sum: 68d4b98f17361fddf73052ead34fa370 + - path: results/reports/mosdepth/sample4/sample4.recal.regions.bed.gz + md5sum: 2819e995eafded35f53328c4ec19ba58 + - path: results/reports/mosdepth/sample4/sample4.recal.regions.bed.gz.csi + md5sum: 393c2749068304d8545b501b9d4658e4 + - path: results/reports/samtools/sample4/sample4.recal.cram.stats + # conda changes md5sums for test +- name: Run variant calling on somatic sample with manta without intervals + command: nextflow run main.nf -profile test_cache,tools_somatic --tools manta --no_intervals --outdir results + tags: + - manta + - no_intervals + - somatic + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: 3a8861808601994f89d5c55ce5c95dae + - path: results/multiqc + - path: results/no_intervals.bed + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz.tbi + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/reports/bcftools/manta/sample3/sample3.manta.diploid_sv.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/bcftools/manta/sample4_vs_sample3/sample4_vs_sample3.manta.diploid_sv.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/bcftools/manta/sample4_vs_sample3/sample4_vs_sample3.manta.somatic_sv.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/vcftools/manta/sample3/sample3.manta.diploid_sv.FILTER.summary + md5sum: 1ce42d34e4ae919afb519efc99146423 + - path: results/reports/vcftools/manta/sample3/sample3.manta.diploid_sv.TsTv.count + md5sum: fa27f678965b7cba6a92efcd039f802a + - path: results/reports/vcftools/manta/sample3/sample3.manta.diploid_sv.TsTv.qual + md5sum: bc68ae4e688e9fb772b457069e604883 + - path: results/reports/vcftools/manta/sample4_vs_sample3/sample4_vs_sample3.manta.diploid_sv.FILTER.summary + md5sum: 1ce42d34e4ae919afb519efc99146423 + - path: results/reports/vcftools/manta/sample4_vs_sample3/sample4_vs_sample3.manta.diploid_sv.TsTv.count + md5sum: fa27f678965b7cba6a92efcd039f802a + - path: results/reports/vcftools/manta/sample4_vs_sample3/sample4_vs_sample3.manta.diploid_sv.TsTv.qual + md5sum: bc68ae4e688e9fb772b457069e604883 + - path: 
results/reports/vcftools/manta/sample4_vs_sample3/sample4_vs_sample3.manta.somatic_sv.FILTER.summary + md5sum: 1ce42d34e4ae919afb519efc99146423 + - path: results/reports/vcftools/manta/sample4_vs_sample3/sample4_vs_sample3.manta.somatic_sv.TsTv.count + md5sum: 8dcfdbcaac118df1d5ad407dd2af699f + - path: results/reports/vcftools/manta/sample4_vs_sample3/sample4_vs_sample3.manta.somatic_sv.TsTv.qual + md5sum: bc68ae4e688e9fb772b457069e604883 + - path: results/variant_calling/manta/sample3/sample3.manta.diploid_sv.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/manta/sample3/sample3.manta.diploid_sv.vcf.gz.tbi + md5sum: 4cb176febbc8c26d717a6c6e67b9c905 + - path: results/variant_calling/manta/sample4_vs_sample3/sample4_vs_sample3.manta.diploid_sv.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/manta/sample4_vs_sample3/sample4_vs_sample3.manta.diploid_sv.vcf.gz.tbi + md5sum: 4cb176febbc8c26d717a6c6e67b9c905 + - path: results/variant_calling/manta/sample4_vs_sample3/sample4_vs_sample3.manta.somatic_sv.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/manta/sample4_vs_sample3/sample4_vs_sample3.manta.somatic_sv.vcf.gz.tbi + md5sum: 4cb176febbc8c26d717a6c6e67b9c905 + - path: results/manta + should_exist: false + - path: results/reports/mosdepth/sample3/sample3.recal.mosdepth.global.dist.txt + md5sum: 69e29702ef01fd8f6c7a5468fc35a16a + - path: results/reports/mosdepth/sample3/sample3.recal.mosdepth.summary.txt + md5sum: d2775eb102acc5950f7f53883dcb503d + - path: results/reports/mosdepth/sample3/sample3.recal.per-base.bed.gz + md5sum: 54431f155c9538809e8caf99d1a75ec7 + - path: results/reports/mosdepth/sample3/sample3.recal.per-base.bed.gz.csi + md5sum: c67dcd711b096eb42f43784d5eadbc0d + - path: results/reports/samtools/sample3/sample3.recal.cram.stats + # conda changes md5sums for test + - path: results/reports/mosdepth/sample4/sample4.recal.mosdepth.global.dist.txt + md5sum: f2dcd00a64947c49e8e4b93c2f4fbf27 + - path: results/reports/mosdepth/sample4/sample4.recal.mosdepth.summary.txt + md5sum: 0a7300e56eda6fba7c7564f00aa000f0 + - path: results/reports/mosdepth/sample4/sample4.recal.per-base.bed.gz + md5sum: 3de4a9f4da2f2b4909ef192452a8d211 + - path: results/reports/mosdepth/sample4/sample4.recal.per-base.bed.gz.csi + md5sum: cfb07b0ba46e8468b4342edb243536f3 + - path: results/reports/samtools/sample4/sample4.recal.cram.stats + # conda changes md5sums for test diff --git a/tests/test_markduplicates_from_bam.yml b/tests/test_markduplicates_from_bam.yml new file mode 100644 index 0000000000..d6191216bd --- /dev/null +++ b/tests/test_markduplicates_from_bam.yml @@ -0,0 +1,104 @@ +- name: Run markduplicates starting from BAM + command: nextflow run main.nf -profile test_cache,markduplicates_bam --outdir results + tags: + - input_bam + - gatk4/markduplicates + - preprocessing + files: + - path: results/csv/markduplicates.csv + md5sum: 8e9408ef8d4f9e6e00e531268eebd42a + - path: results/csv/markduplicates_no_table.csv + md5sum: f8b1b25fec472453a98c3f7f0e3a7953 + - path: results/csv/recalibrated.csv + md5sum: 1888a924bc70bd80165a96ad641e22d6 + - path: results/multiqc + - path: results/preprocessing/markduplicates/test/test.md.cram + # binary changes md5sums on reruns + - path: results/preprocessing/markduplicates/test/test.md.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recal_table/test/test.recal.table + md5sum: 9603b69fdc3b5090de2e0dd78bfcc4bf + - path: 
results/preprocessing/recalibrated/test/test.recal.cram + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + # binary changes md5sums on reruns + - path: results/reports/markduplicates/test/test.md.cram.metrics + contains: ["testN 0 2820 2 2 0 828 0 0.293617 3807", "1.0 0.999986 1178 1178", "100.0 1.911145 0 0"] + - path: results/reports/mosdepth/test/test.md.mosdepth.global.dist.txt + md5sum: 8e875e20e3fb9cf288d68c1d223f6fd5 + - path: results/reports/mosdepth/test/test.md.mosdepth.region.dist.txt + md5sum: 75e1ce7e55af51f4985fa91654a5ea2d + - path: results/reports/mosdepth/test/test.md.mosdepth.summary.txt + md5sum: b23cf96942b2ada3f41172a9349a1175 + - path: results/reports/mosdepth/test/test.md.regions.bed.gz + # binary changes md5sums on reruns + - path: results/reports/mosdepth/test/test.md.regions.bed.gz.csi + # binary changes md5sums on reruns + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: 8e875e20e3fb9cf288d68c1d223f6fd5 + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + md5sum: 75e1ce7e55af51f4985fa91654a5ea2d + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: b23cf96942b2ada3f41172a9349a1175 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz + # binary changes md5sums on reruns + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz.csi + # binary changes md5sums on reruns + - path: results/reports/samtools/test/test.md.cram.stats + # conda changes md5sums for test + - path: results/reports/samtools/test/test.recal.cram.stats + # conda changes md5sums for test + +- name: Run skip markduplicates bam from step markduplicates + command: nextflow run main.nf -profile test_cache,markduplicates_bam,skip_markduplicates --outdir results + tags: + - input_bam + - markduplicates + - preprocessing + - skip_markduplicates + files: + - path: results/csv/recalibrated.csv + md5sum: 1888a924bc70bd80165a96ad641e22d6 + - path: results/multiqc + - path: results/preprocessing/converted/test/test.converted.cram + # binary changes md5sums on reruns + - path: results/preprocessing/converted/test/test.converted.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recal_table/test/test.recal.table + md5sum: 35d89a3811aa31711fc9815b6b80e6ec + - path: results/preprocessing/recalibrated/test/test.recal.cram + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + # binary changes md5sums on reruns + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: bdb8f185c35dd1eec7ce2f69bce57972 + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + md5sum: 6fd2e5c5c938bf69cdb2811f9e3afef8 + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: 32ea70ef1b99def3dc900b4afd513a40 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz + md5sum: d034a60ae5c0768d67b9ba6442bd2212 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz.csi + md5sum: b3716e5cd1744610e69c29bd4ffad259 + - path: results/reports/mosdepth/test/test.sorted.mosdepth.global.dist.txt + md5sum: bdb8f185c35dd1eec7ce2f69bce57972 + - path: results/reports/mosdepth/test/test.sorted.mosdepth.region.dist.txt + md5sum: 6fd2e5c5c938bf69cdb2811f9e3afef8 + - path: results/reports/mosdepth/test/test.sorted.mosdepth.summary.txt + md5sum: 32ea70ef1b99def3dc900b4afd513a40 + - path: 
results/reports/mosdepth/test/test.sorted.regions.bed.gz + md5sum: d034a60ae5c0768d67b9ba6442bd2212 + - path: results/reports/mosdepth/test/test.sorted.regions.bed.gz.csi + md5sum: b3716e5cd1744610e69c29bd4ffad259 + - path: results/reports/samtools/test/test.recal.cram.stats + # conda changes md5sums for test + - path: results/reports/samtools/test/test.sorted.cram.stats + # conda changes md5sums for test + - path: results/csv/markduplicates.csv + should_exist: false + - path: results/csv/markduplicates_no_table.csv + should_exist: false + - path: results/preprocessing/mapped/test/test.bam + should_exist: false + - path: results/preprocessing/mapped/test/test.sorted.bam + should_exist: false diff --git a/tests/test_markduplicates_from_cram.yml b/tests/test_markduplicates_from_cram.yml new file mode 100644 index 0000000000..f36619f719 --- /dev/null +++ b/tests/test_markduplicates_from_cram.yml @@ -0,0 +1,101 @@ +- name: Run markduplicates starting from CRAM + command: nextflow run main.nf -profile test_cache,markduplicates_cram --outdir results + tags: + - input_cram + - gatk4/markduplicates + - preprocessing + files: + - path: results/csv/markduplicates.csv + md5sum: 8e9408ef8d4f9e6e00e531268eebd42a + - path: results/csv/markduplicates_no_table.csv + md5sum: f8b1b25fec472453a98c3f7f0e3a7953 + - path: results/csv/recalibrated.csv + md5sum: 1888a924bc70bd80165a96ad641e22d6 + - path: results/multiqc + - path: results/preprocessing/markduplicates/test/test.md.cram + # binary changes md5sums on reruns + - path: results/preprocessing/markduplicates/test/test.md.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recal_table/test/test.recal.table + md5sum: 9603b69fdc3b5090de2e0dd78bfcc4bf + - path: results/preprocessing/recalibrated/test/test.recal.cram + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + # binary changes md5sums on reruns + - path: results/reports/markduplicates/test/test.md.cram.metrics + contains: ["testN 0 2820 2 2 0 828 0 0.293617 3807", "1.0 0.999986 1178 1178", "100.0 1.911145 0 0"] + - path: results/reports/mosdepth/test/test.md.mosdepth.global.dist.txt + md5sum: 8e875e20e3fb9cf288d68c1d223f6fd5 + - path: results/reports/mosdepth/test/test.md.mosdepth.region.dist.txt + md5sum: 75e1ce7e55af51f4985fa91654a5ea2d + - path: results/reports/mosdepth/test/test.md.mosdepth.summary.txt + md5sum: b23cf96942b2ada3f41172a9349a1175 + - path: results/reports/mosdepth/test/test.md.regions.bed.gz + # binary changes md5sums on reruns + - path: results/reports/mosdepth/test/test.md.regions.bed.gz.csi + # binary changes md5sums on reruns + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: 8e875e20e3fb9cf288d68c1d223f6fd5 + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + md5sum: 75e1ce7e55af51f4985fa91654a5ea2d + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: b23cf96942b2ada3f41172a9349a1175 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz + # binary changes md5sums on reruns + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz.csi + # binary changes md5sums on reruns + - path: results/reports/samtools/test/test.md.cram.stats + # conda changes md5sums for test + - path: results/reports/samtools/test/test.recal.cram.stats + # conda changes md5sums for test + - path: results/preprocessing/mapped/ + should_exist: false +- name: Run skip markduplicates cram from step markduplicates + 
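+ # CRAM counterpart of the BAM test above: no BAM-to-CRAM conversion outputs are asserted here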
command: nextflow run main.nf -profile test_cache,markduplicates_cram,skip_markduplicates --outdir results + tags: + - input_cram + - markduplicates + - preprocessing + - skip_markduplicates + files: + - path: results/csv/recalibrated.csv + md5sum: 1888a924bc70bd80165a96ad641e22d6 + - path: results/multiqc + - path: results/preprocessing/recal_table/test/test.recal.table + md5sum: 35d89a3811aa31711fc9815b6b80e6ec + - path: results/preprocessing/recalibrated/test/test.recal.cram + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + # binary changes md5sums on reruns + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: bdb8f185c35dd1eec7ce2f69bce57972 + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + md5sum: 6fd2e5c5c938bf69cdb2811f9e3afef8 + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: 32ea70ef1b99def3dc900b4afd513a40 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz + md5sum: d034a60ae5c0768d67b9ba6442bd2212 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz.csi + md5sum: b3716e5cd1744610e69c29bd4ffad259 + - path: results/reports/mosdepth/test/test.sorted.mosdepth.global.dist.txt + md5sum: bdb8f185c35dd1eec7ce2f69bce57972 + - path: results/reports/mosdepth/test/test.sorted.mosdepth.region.dist.txt + md5sum: 6fd2e5c5c938bf69cdb2811f9e3afef8 + - path: results/reports/mosdepth/test/test.sorted.mosdepth.summary.txt + md5sum: 32ea70ef1b99def3dc900b4afd513a40 + - path: results/reports/mosdepth/test/test.sorted.regions.bed.gz + md5sum: d034a60ae5c0768d67b9ba6442bd2212 + - path: results/reports/mosdepth/test/test.sorted.regions.bed.gz.csi + md5sum: b3716e5cd1744610e69c29bd4ffad259 + - path: results/reports/samtools/test/test.recal.cram.stats + # conda changes md5sums for test + - path: results/reports/samtools/test/test.sorted.cram.stats + # conda changes md5sums for test + - path: results/csv/markduplicates.csv + should_exist: false + - path: results/csv/markduplicates_no_table.csv + should_exist: false + - path: results/preprocessing/mapped/test/test.sorted.cram + should_exist: false + - path: results/preprocessing/mapped/test/test.sorted.cram.crai + should_exist: false diff --git a/tests/test_mpileup.yml b/tests/test_mpileup.yml new file mode 100644 index 0000000000..f7c5d87749 --- /dev/null +++ b/tests/test_mpileup.yml @@ -0,0 +1,114 @@ +- name: Run variant calling on tumor_only sample to test mpileup + command: nextflow run main.nf -profile test_cache,tools_tumoronly --tools mpileup --outdir results + tags: + - tumor_only + - mpileup + files: + - path: results/multiqc + - path: results/variant_calling/bcftools/sample2/sample2.bcftools.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/bcftools/sample2/sample2.bcftools.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/mpileup/sample2/sample2.tumor.mpileup.gz + should_exist: false + - path: results/mpileup + should_exist: false + - path: results/reports/mosdepth/sample2/sample2.recal.mosdepth.global.dist.txt + md5sum: f2dcd00a64947c49e8e4b93c2f4fbf27 + - path: results/reports/mosdepth/sample2/sample2.recal.mosdepth.region.dist.txt + md5sum: 39005ffaac22871ffaaf19656fe69c5b + - path: results/reports/mosdepth/sample2/sample2.recal.mosdepth.summary.txt + md5sum: 68d4b98f17361fddf73052ead34fa370 + - path: results/reports/mosdepth/sample2/sample2.recal.regions.bed.gz + md5sum: 2819e995eafded35f53328c4ec19ba58 + 
- path: results/reports/mosdepth/sample2/sample2.recal.regions.bed.gz.csi + md5sum: 393c2749068304d8545b501b9d4658e4 + - path: results/reports/samtools/sample2/sample2.recal.cram.stats +- name: Run variant calling on tumor_only sample to test mpileup without intervals + command: nextflow run main.nf -profile test_cache,tools_tumoronly --tools mpileup --no_intervals --outdir results + tags: + - tumor_only + - mpileup + - no_intervals + files: + - path: results/multiqc + - path: results/no_intervals.bed + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz.tbi + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/variant_calling/bcftools/sample2/sample2.bcftools.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/bcftools/sample2/sample2.bcftools.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/mpileup/ + should_exist: false + - path: results/mpileup + should_exist: false + - path: results/reports/mosdepth/sample2/sample2.recal.mosdepth.global.dist.txt + md5sum: f2dcd00a64947c49e8e4b93c2f4fbf27 + - path: results/reports/mosdepth/sample2/sample2.recal.mosdepth.summary.txt + md5sum: 0a7300e56eda6fba7c7564f00aa000f0 + - path: results/reports/mosdepth/sample2/sample2.recal.per-base.bed.gz + md5sum: 3de4a9f4da2f2b4909ef192452a8d211 + - path: results/reports/mosdepth/sample2/sample2.recal.per-base.bed.gz.csi + md5sum: cfb07b0ba46e8468b4342edb243536f3 + - path: results/reports/samtools/sample2/sample2.recal.cram.stats +- name: Run variant calling on germline sample to test mpileup + command: nextflow run main.nf -profile test_cache,tools_germline --tools mpileup --outdir results + tags: + - germline + - mpileup + files: + - path: results/multiqc + - path: results/variant_calling/bcftools/sample1/sample1.bcftools.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/bcftools/sample1/sample1.bcftools.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/mpileup/ + should_exist: false + - path: results/mpileup + should_exist: false + - path: results/reports/mosdepth/sample1/sample1.recal.mosdepth.global.dist.txt + md5sum: 69e29702ef01fd8f6c7a5468fc35a16a + - path: results/reports/mosdepth/sample1/sample1.recal.mosdepth.region.dist.txt + md5sum: 6ec49cd7d510c2eb3d9d90fdb79b783a + - path: results/reports/mosdepth/sample1/sample1.recal.mosdepth.summary.txt + md5sum: 103098d0bf76ed82d2b87d5f242b099a + - path: results/reports/mosdepth/sample1/sample1.recal.regions.bed.gz + md5sum: b5888cf7395c57d39879a5faa6159eb3 + - path: results/reports/mosdepth/sample1/sample1.recal.regions.bed.gz.csi + md5sum: 9cb0ad7039a3b703d16ca7d5b835c0ee + - path: results/reports/samtools/sample1/sample1.recal.cram.stats +- name: Run variant calling on germline sample to test mpileup without intervals + command: nextflow run main.nf -profile test_cache,tools_germline --tools mpileup --no_intervals --outdir results + tags: + - germline + - mpileup + - no_intervals + files: + - path: results/multiqc + - path: results/no_intervals.bed + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz.tbi + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/variant_calling/bcftools/sample1/sample1.bcftools.vcf.gz + # binary changes md5sums on reruns + - path: 
results/variant_calling/bcftools/sample1/sample1.bcftools.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/mpileup/ + should_exist: false + - path: results/mpileup + should_exist: false + - path: results/reports/mosdepth/sample1/sample1.recal.mosdepth.global.dist.txt + md5sum: 69e29702ef01fd8f6c7a5468fc35a16a + - path: results/reports/mosdepth/sample1/sample1.recal.mosdepth.summary.txt + md5sum: d2775eb102acc5950f7f53883dcb503d + - path: results/reports/mosdepth/sample1/sample1.recal.per-base.bed.gz + md5sum: 54431f155c9538809e8caf99d1a75ec7 + - path: results/reports/mosdepth/sample1/sample1.recal.per-base.bed.gz.csi + md5sum: c67dcd711b096eb42f43784d5eadbc0d + - path: results/reports/samtools/sample1/sample1.recal.cram.stats diff --git a/tests/test_msisensorpro.yml b/tests/test_msisensorpro.yml new file mode 100644 index 0000000000..173dbcf5ca --- /dev/null +++ b/tests/test_msisensorpro.yml @@ -0,0 +1,71 @@ +- name: Run variant calling on somatic sample with msisensor-pro + command: nextflow run main.nf -profile test_cache,tools_somatic --tools msisensorpro --outdir results + tags: + - msisensorpro + - somatic + - variant_calling + files: + - path: results/multiqc + - path: results/variant_calling/msisensorpro/sample4_vs_sample3/sample4_vs_sample3 + - path: results/variant_calling/msisensorpro/sample4_vs_sample3/sample4_vs_sample3_dis + - path: results/variant_calling/msisensorpro/sample4_vs_sample3/sample4_vs_sample3_germline + - path: results/variant_calling/msisensorpro/sample4_vs_sample3/sample4_vs_sample3_somatic + - path: results/msisensorpro + should_exist: false + - path: results/reports/mosdepth/sample3/sample3.recal.mosdepth.global.dist.txt + md5sum: 69e29702ef01fd8f6c7a5468fc35a16a + - path: results/reports/mosdepth/sample3/sample3.recal.mosdepth.region.dist.txt + md5sum: 6ec49cd7d510c2eb3d9d90fdb79b783a + - path: results/reports/mosdepth/sample3/sample3.recal.mosdepth.summary.txt + md5sum: 103098d0bf76ed82d2b87d5f242b099a + - path: results/reports/mosdepth/sample3/sample3.recal.regions.bed.gz + md5sum: b5888cf7395c57d39879a5faa6159eb3 + - path: results/reports/mosdepth/sample3/sample3.recal.regions.bed.gz.csi + md5sum: 9cb0ad7039a3b703d16ca7d5b835c0ee + - path: results/reports/samtools/sample3/sample3.recal.cram.stats + # conda changes md5sums for test + - path: results/reports/mosdepth/sample4/sample4.recal.mosdepth.global.dist.txt + md5sum: f2dcd00a64947c49e8e4b93c2f4fbf27 + - path: results/reports/mosdepth/sample4/sample4.recal.mosdepth.region.dist.txt + md5sum: 39005ffaac22871ffaaf19656fe69c5b + - path: results/reports/mosdepth/sample4/sample4.recal.mosdepth.summary.txt + md5sum: 68d4b98f17361fddf73052ead34fa370 + - path: results/reports/mosdepth/sample4/sample4.recal.regions.bed.gz + md5sum: 2819e995eafded35f53328c4ec19ba58 + - path: results/reports/mosdepth/sample4/sample4.recal.regions.bed.gz.csi + md5sum: 393c2749068304d8545b501b9d4658e4 + - path: results/reports/samtools/sample4/sample4.recal.cram.stats + # conda changes md5sums for test +- name: Build only index with msisensorpro + command: nextflow run main.nf -profile test_cache --build_only_index --tools msisensorpro --input false --outdir results + tags: + - build_only_index + - msisensorpro + files: + - path: results/multiqc + - path: results/reference/bwa/genome.amb + md5sum: 1891c1de381b3a96d4e72f590fde20c1 + - path: results/reference/bwa/genome.ann + md5sum: 2df4aa2d7580639fa0fcdbcad5e2e969 + - path: results/reference/bwa/genome.bwt + md5sum: 815eded87e4cb6b0f1daab5c4d6e30af + 
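+ # with --build_only_index and --input false, only reference files (and multiqc) are asserted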
- path: results/reference/bwa/genome.pac + md5sum: 8569fbdb2c98c6fb16dfa73d8eacb070 + - path: results/reference/bwa/genome.sa + md5sum: e7cff62b919448a3a3d0fe4aaf427594 + - path: results/reference/dbsnp/dbsnp_146.hg38.vcf.gz.tbi + # conda changes md5sums for test + - path: results/reference/dict/genome.dict + md5sum: 2433fe2ba31257337bf4c4bd4cb8da15 + - path: results/reference/fai/genome.fasta.fai + md5sum: 3520cd30e1b100e55f578db9c855f685 + - path: results/reference/intervals/chr22_1-40001.bed + md5sum: 87a15eb9c2ff20ccd5cd8735a28708f7 + - path: results/reference/intervals/chr22_1-40001.bed.gz + md5sum: d3341fa28986c40b24fcc10a079dbb80 + - path: results/reference/intervals/genome.bed + md5sum: a87dc7d20ebca626f65cc16ff6c97a3e + - path: results/reference/known_indels/mills_and_1000G.indels.vcf.gz.tbi + # conda changes md5sums for test + - path: results/reference/msi/genome.msisensor_scan.list + md5sum: a7886e7a56a1d7e3be6b4496753fd743 diff --git a/tests/test_mutect2.yml b/tests/test_mutect2.yml new file mode 100644 index 0000000000..3ff6307485 --- /dev/null +++ b/tests/test_mutect2.yml @@ -0,0 +1,107 @@ +- name: Run variant calling on tumor only sample with mutect2 + command: nextflow run main.nf -profile test_cache,tools_tumoronly --tools mutect2 --outdir results + tags: + - mutect2 + - tumor_only + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: d57c1beba9005e9790a573bd93398b72 + - path: results/multiqc + - path: results/reports/bcftools/mutect2/sample2/sample2.mutect2.filtered.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/vcftools/mutect2/sample2/sample2.mutect2.filtered.FILTER.summary + md5sum: ef9bd9a2f41d8872ba25e5616e4c2a5e + - path: results/reports/vcftools/mutect2/sample2/sample2.mutect2.filtered.TsTv.count + md5sum: fe3ff1f0c2ead72f037552727438e00a + - path: results/reports/vcftools/mutect2/sample2/sample2.mutect2.filtered.TsTv.qual + # conda changes md5sums for test + - path: results/variant_calling/mutect2/sample2/sample2.mutect2.artifactprior.tar.gz + # binary changes md5sums on reruns + - path: results/variant_calling/mutect2/sample2/sample2.mutect2.contamination.table + md5sum: 46c708c943b453da89a3da08acfdb2a7 + - path: results/variant_calling/mutect2/sample2/sample2.mutect2.filtered.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/mutect2/sample2/sample2.mutect2.filtered.vcf.gz.filteringStats.tsv + md5sum: 9a8439d0bb5875f1e673cf592af85ffb + - path: results/variant_calling/mutect2/sample2/sample2.mutect2.filtered.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/mutect2/sample2/sample2.mutect2.pileups.table + md5sum: 9afe42339f590937166edcf4746c22ec + - path: results/variant_calling/mutect2/sample2/sample2.mutect2.segmentation.table + md5sum: f4643d9319bde4efbfbe516d6fb13052 + - path: results/variant_calling/mutect2/sample2/sample2.mutect2.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/mutect2/sample2/sample2.mutect2.vcf.gz.stats + md5sum: 3cc40a35727af6c5223fb45678f3f172 + - path: results/variant_calling/mutect2/sample2/sample2.mutect2.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/mutect2 + should_exist: false + - path: results/reports/mosdepth/sample2/sample2.recal.mosdepth.global.dist.txt + md5sum: f2dcd00a64947c49e8e4b93c2f4fbf27 + - path: results/reports/mosdepth/sample2/sample2.recal.mosdepth.region.dist.txt + md5sum: 39005ffaac22871ffaaf19656fe69c5b + - path: 
results/reports/mosdepth/sample2/sample2.recal.mosdepth.summary.txt + md5sum: 68d4b98f17361fddf73052ead34fa370 + - path: results/reports/mosdepth/sample2/sample2.recal.regions.bed.gz + md5sum: 2819e995eafded35f53328c4ec19ba58 + - path: results/reports/mosdepth/sample2/sample2.recal.regions.bed.gz.csi + md5sum: 393c2749068304d8545b501b9d4658e4 + - path: results/reports/samtools/sample2/sample2.recal.cram.stats +- name: Run variant calling on tumor only sample with mutect2 without intervals + command: nextflow run main.nf -profile test_cache,tools_tumoronly --tools mutect2 --no_intervals --outdir results + tags: + - mutect2 + - no_intervals + - tumor_only + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: d57c1beba9005e9790a573bd93398b72 + - path: results/multiqc + - path: results/no_intervals.bed + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz.tbi + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/reports/bcftools/mutect2/sample2/sample2.mutect2.filtered.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/vcftools/mutect2/sample2/sample2.mutect2.filtered.FILTER.summary + md5sum: 5a833fd50e6efb26d1df2336eb0caf5e + - path: results/reports/vcftools/mutect2/sample2/sample2.mutect2.filtered.TsTv.count + md5sum: f5295a61da80f12babae74fe4e104aad + - path: results/reports/vcftools/mutect2/sample2/sample2.mutect2.filtered.TsTv.qual + # conda changes md5sums for test + - path: results/variant_calling/mutect2/sample2/sample2.mutect2.artifactprior.tar.gz + # binary changes md5sums on reruns + - path: results/variant_calling/mutect2/sample2/sample2.mutect2.contamination.table + md5sum: 46c708c943b453da89a3da08acfdb2a7 + - path: results/variant_calling/mutect2/sample2/sample2.mutect2.filtered.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/mutect2/sample2/sample2.mutect2.filtered.vcf.gz.filteringStats.tsv + md5sum: e4eac0c602dd25aa61a6dc26a2b61844 + - path: results/variant_calling/mutect2/sample2/sample2.mutect2.filtered.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/mutect2/sample2/sample2.mutect2.pileups.table + md5sum: fe35b6bc041f2df8bd1f23420af3ddf9 + - path: results/variant_calling/mutect2/sample2/sample2.mutect2.segmentation.table + md5sum: f4643d9319bde4efbfbe516d6fb13052 + - path: results/variant_calling/mutect2/sample2/sample2.mutect2.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/mutect2/sample2/sample2.mutect2.vcf.gz.stats + md5sum: 55ed641e16089afb33cdbc478e202d3d + - path: results/variant_calling/mutect2/sample2/sample2.mutect2.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/mutect2 + should_exist: false + - path: results/reports/mosdepth/sample2/sample2.recal.mosdepth.global.dist.txt + md5sum: f2dcd00a64947c49e8e4b93c2f4fbf27 + - path: results/reports/mosdepth/sample2/sample2.recal.mosdepth.summary.txt + md5sum: 0a7300e56eda6fba7c7564f00aa000f0 + - path: results/reports/mosdepth/sample2/sample2.recal.per-base.bed.gz + md5sum: 3de4a9f4da2f2b4909ef192452a8d211 + - path: results/reports/mosdepth/sample2/sample2.recal.per-base.bed.gz.csi + md5sum: cfb07b0ba46e8468b4342edb243536f3 + - path: results/reports/samtools/sample2/sample2.recal.cram.stats diff --git a/tests/test_ngscheckmate.yml b/tests/test_ngscheckmate.yml new file mode 100644 index 0000000000..8923ddc9ec --- /dev/null +++ 
b/tests/test_ngscheckmate.yml @@ -0,0 +1,15 @@ +- name: Check ngscheckmate is working + command: nextflow run main.nf -profile test_cache,tools --tools ngscheckmate --outdir results + tags: + - ngscheckmate + - tools + files: + - path: results/multiqc + - path: results/reports/ngscheckmate/ngscheckmate_all.txt + - path: results/reports/ngscheckmate/ngscheckmate_matched.txt + - path: results/reports/ngscheckmate/ngscheckmate_output_corr_matrix.txt + - path: results/reports/ngscheckmate/ngscheckmate.pdf + - path: results/reports/ngscheckmate/vcfs/sample1.ngscheckmate.vcf.gz + - path: results/reports/ngscheckmate/vcfs/sample2.ngscheckmate.vcf.gz + - path: results/reports/ngscheckmate/vcfs/sample3.ngscheckmate.vcf.gz + - path: results/reports/ngscheckmate/vcfs/sample4.ngscheckmate.vcf.gz diff --git a/tests/test_prepare_recalibration_from_bam.yml b/tests/test_prepare_recalibration_from_bam.yml new file mode 100644 index 0000000000..3d2560058c --- /dev/null +++ b/tests/test_prepare_recalibration_from_bam.yml @@ -0,0 +1,84 @@ +- name: Run prepare_recalibration starting from bam + command: nextflow run main.nf -profile test_cache,prepare_recalibration_bam --outdir results + tags: + - input_bam + - prepare_recalibration + - preprocessing + files: + - path: results/csv/markduplicates.csv + md5sum: 90e2ab85d8af642d6548af448a9d4226 + - path: results/csv/recalibrated.csv + md5sum: 1888a924bc70bd80165a96ad641e22d6 + - path: results/multiqc + - path: results/preprocessing/converted/test/test.converted.cram + # binary changes md5sums on reruns + - path: results/preprocessing/converted/test/test.converted.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recal_table/test/test.recal.table + md5sum: 35d89a3811aa31711fc9815b6b80e6ec + - path: results/preprocessing/recalibrated/test/test.recal.cram + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + # binary changes md5sums on reruns + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: bdb8f185c35dd1eec7ce2f69bce57972 + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + md5sum: 6fd2e5c5c938bf69cdb2811f9e3afef8 + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: 32ea70ef1b99def3dc900b4afd513a40 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz + md5sum: d034a60ae5c0768d67b9ba6442bd2212 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz.csi + md5sum: b3716e5cd1744610e69c29bd4ffad259 + - path: results/reports/samtools/test/test.recal.cram.stats + # conda changes md5sums for test + - path: results/preprocessing/mapped/ + should_exist: false + - path: results/preprocessing/markduplicates/ + should_exist: false +- name: Run prepare_recalibration starting from bam and skip baserecalibration + command: nextflow run main.nf -profile test_cache,prepare_recalibration_bam,skip_bqsr --tools strelka --outdir results + tags: + - input_bam + - prepare_recalibration + - preprocessing + files: + - path: results/csv/variantcalled.csv + md5sum: 4d0effd3d8dc2b814230a189e7ca9dba + - path: results/multiqc + - path: results/preprocessing/converted/test/test.converted.cram + # binary changes md5sums on reruns + - path: results/preprocessing/converted/test/test.converted.cram.crai + # binary changes md5sums on reruns + - path: results/reports/bcftools/strelka/test/test.strelka.variants.bcftools_stats.txt + # conda changes md5sums for test + - path: 
results/reports/vcftools/strelka/test/test.strelka.variants.FILTER.summary + md5sum: 39ff2cc8eb7495a14a6b76e0ab627027 + - path: results/reports/vcftools/strelka/test/test.strelka.variants.TsTv.count + md5sum: ee7dafc8d941b8502a04a63dc3126fff + - path: results/reports/vcftools/strelka/test/test.strelka.variants.TsTv.qual + # conda changes md5sums for test + - path: results/variant_calling/strelka/test/test.strelka.genome.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/test/test.strelka.genome.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/test/test.strelka.variants.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/test/test.strelka.variants.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/csv/recalibrated.csv + should_exist: false + - path: results/preprocessing/recal_table/test/test.recal.table + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false + - path: results/reports/mosdepth + should_exist: false + - path: results/reports/samtools_stats + should_exist: false + - path: results/preprocessing/mapped/ + should_exist: false + - path: results/preprocessing/markduplicates/ + should_exist: false diff --git a/tests/test_prepare_recalibration_from_cram.yml b/tests/test_prepare_recalibration_from_cram.yml new file mode 100644 index 0000000000..d69fa91525 --- /dev/null +++ b/tests/test_prepare_recalibration_from_cram.yml @@ -0,0 +1,84 @@ +- name: Run prepare_recalibration starting from cram + command: nextflow run main.nf -profile test_cache,prepare_recalibration_cram --outdir results + tags: + - input_cram + - prepare_recalibration + - preprocessing + files: + - path: results/csv/recalibrated.csv + md5sum: 1888a924bc70bd80165a96ad641e22d6 + - path: results/multiqc + - path: results/preprocessing/recal_table/test/test.recal.table + md5sum: 35d89a3811aa31711fc9815b6b80e6ec + - path: results/preprocessing/recalibrated/test/test.recal.cram + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + # binary changes md5sums on reruns + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: bdb8f185c35dd1eec7ce2f69bce57972 + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + md5sum: 6fd2e5c5c938bf69cdb2811f9e3afef8 + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: 32ea70ef1b99def3dc900b4afd513a40 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz + md5sum: d034a60ae5c0768d67b9ba6442bd2212 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz.csi + md5sum: b3716e5cd1744610e69c29bd4ffad259 + - path: results/reports/samtools/test/test.recal.cram.stats + # conda changes md5sums for test + - path: results/preprocessing/mapped/ + should_exist: false + - path: results/preprocessing/markduplicates/ + should_exist: false +- name: Run prepare_recalibration starting from cram and skip baserecalibration + command: nextflow run main.nf -profile test_cache,prepare_recalibration_cram,skip_bqsr --tools strelka --outdir results + tags: + - input_cram + - prepare_recalibration + - preprocessing + files: + - path: results/csv/variantcalled.csv + md5sum: 4d0effd3d8dc2b814230a189e7ca9dba + - path: results/multiqc + - path: 
results/reports/bcftools/strelka/test/test.strelka.variants.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/vcftools/strelka/test/test.strelka.variants.FILTER.summary + md5sum: 39ff2cc8eb7495a14a6b76e0ab627027 + - path: results/reports/vcftools/strelka/test/test.strelka.variants.TsTv.count + md5sum: ee7dafc8d941b8502a04a63dc3126fff + - path: results/reports/vcftools/strelka/test/test.strelka.variants.TsTv.qual + # conda changes md5sums for test + - path: results/variant_calling/strelka/test/test.strelka.genome.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/test/test.strelka.genome.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/test/test.strelka.variants.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/test/test.strelka.variants.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/csv/recalibrated.csv + should_exist: false + - path: results/preprocessing/markduplicates/test/test.md.cram + should_exist: false + - path: results/preprocessing/markduplicates/test/test.md.cram.crai + should_exist: false + - path: results/preprocessing/recal_table/test/test.recal.table + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + should_exist: false + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + should_exist: false + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + should_exist: false + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz + should_exist: false + - path: results/reports/samtools/test/test.recal.cram.stats + should_exist: false + - path: results/preprocessing/mapped/ + should_exist: false + - path: results/preprocessing/markduplicates/ + should_exist: false diff --git a/tests/test_recalibrate_from_bam.yml b/tests/test_recalibrate_from_bam.yml new file mode 100644 index 0000000000..cc1a3e114a --- /dev/null +++ b/tests/test_recalibrate_from_bam.yml @@ -0,0 +1,87 @@ +- name: Run Recalibration starting from bam + command: nextflow run main.nf -profile test_cache,recalibrate_bam --outdir results + tags: + - input_bam + - recalibrate + - preprocessing + files: + - path: results/csv/recalibrated.csv + md5sum: 1888a924bc70bd80165a96ad641e22d6 + - path: results/multiqc + - path: results/preprocessing/converted/test/test.converted.cram + # binary changes md5sums on reruns + - path: results/preprocessing/converted/test/test.converted.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + # binary changes md5sums on reruns + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: bdb8f185c35dd1eec7ce2f69bce57972 + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + md5sum: 6fd2e5c5c938bf69cdb2811f9e3afef8 + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: 32ea70ef1b99def3dc900b4afd513a40 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz + md5sum: d034a60ae5c0768d67b9ba6442bd2212 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz.csi + md5sum: b3716e5cd1744610e69c29bd4ffad259 + 
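# Editorial aside, not part of the committed expectations: a single tagged case from these YAML files can be run on its own with pytest-workflow (assuming pytest and pytest-workflow are installed locally), e.g. + #   pytest --tag recalibrate --symlink --kwdof tests/ + # Here --symlink links test inputs instead of copying them, and --kwdof keeps the workflow directory of failing tests for inspection; flag spellings are pytest-workflow's and should be checked against the installed version. +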
- path: results/reports/samtools/test/test.recal.cram.stats + # conda changes md5sums for test + - path: results/preprocessing/mapped/ + should_exist: false + - path: results/preprocessing/markduplicates/ + should_exist: false +- name: Run Recalibration starting from bam and skip baserecalibration + command: nextflow run main.nf -profile test_cache,recalibrate_bam,skip_bqsr --tools strelka --outdir results + tags: + - input_bam + - recalibrate + - preprocessing + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: 4d0effd3d8dc2b814230a189e7ca9dba + - path: results/multiqc + - path: results/preprocessing/converted/test/test.converted.cram + # binary changes md5sums on reruns + - path: results/preprocessing/converted/test/test.converted.cram.crai + # binary changes md5sums on reruns + - path: results/reports/bcftools/strelka/test/test.strelka.variants.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/vcftools/strelka/test/test.strelka.variants.FILTER.summary + md5sum: 39ff2cc8eb7495a14a6b76e0ab627027 + - path: results/reports/vcftools/strelka/test/test.strelka.variants.TsTv.count + md5sum: ee7dafc8d941b8502a04a63dc3126fff + - path: results/reports/vcftools/strelka/test/test.strelka.variants.TsTv.qual + contains: ["0 0 0 -nan 3 4 0.75", "2 0 1 0 2 4 0.5", "5 1 1 1 2 3 0.666667"] + - path: results/variant_calling/strelka/test/test.strelka.genome.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/test/test.strelka.genome.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/test/test.strelka.variants.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/test/test.strelka.variants.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/csv/recalibrated.csv + should_exist: false + - path: results/preprocessing/recal_table/test/test.recal.table + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + should_exist: false + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + should_exist: false + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + should_exist: false + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz + should_exist: false + - path: results/reports/samtools/test/test.recal.cram.stats + should_exist: false + - path: results/preprocessing/mapped/ + should_exist: false + - path: results/preprocessing/markduplicates/ + should_exist: false diff --git a/tests/test_recalibrate_from_cram.yml b/tests/test_recalibrate_from_cram.yml new file mode 100644 index 0000000000..e843424f50 --- /dev/null +++ b/tests/test_recalibrate_from_cram.yml @@ -0,0 +1,83 @@ +- name: Run Recalibration starting from cram + command: nextflow run main.nf -profile test_cache,recalibrate_cram --outdir results + tags: + - input_cram + - recalibrate + - preprocessing + files: + - path: results/csv/recalibrated.csv + md5sum: 1888a924bc70bd80165a96ad641e22d6 + - path: results/multiqc + - path: results/preprocessing/recalibrated/test/test.recal.cram + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + # binary changes md5sums on reruns + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: 
bdb8f185c35dd1eec7ce2f69bce57972 + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + md5sum: 6fd2e5c5c938bf69cdb2811f9e3afef8 + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: 32ea70ef1b99def3dc900b4afd513a40 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz + md5sum: d034a60ae5c0768d67b9ba6442bd2212 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz.csi + md5sum: b3716e5cd1744610e69c29bd4ffad259 + - path: results/reports/samtools/test/test.recal.cram.stats + # conda changes md5sums for test + - path: results/preprocessing/mapped/ + should_exist: false + - path: results/preprocessing/markduplicates/ + should_exist: false +- name: Run Recalibration starting from cram and skip baserecalibration + command: nextflow run main.nf -profile test_cache,recalibrate_cram,skip_bqsr --tools strelka --outdir results + tags: + - input_cram + - recalibrate + - preprocessing + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: 4d0effd3d8dc2b814230a189e7ca9dba + - path: results/multiqc + - path: results/reports/bcftools/strelka/test/test.strelka.variants.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/vcftools/strelka/test/test.strelka.variants.FILTER.summary + md5sum: 39ff2cc8eb7495a14a6b76e0ab627027 + - path: results/reports/vcftools/strelka/test/test.strelka.variants.TsTv.count + md5sum: ee7dafc8d941b8502a04a63dc3126fff + - path: results/reports/vcftools/strelka/test/test.strelka.variants.TsTv.qual + contains: ["0 0 0 -nan 3 4 0.75", "2 0 1 0 2 4 0.5", "5 1 1 1 2 3 0.666667"] + - path: results/variant_calling/strelka/test/test.strelka.genome.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/test/test.strelka.genome.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/test/test.strelka.variants.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/test/test.strelka.variants.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/csv/recalibrated.csv + should_exist: false + - path: results/preprocessing/markduplicates/test/test.md.cram + should_exist: false + - path: results/preprocessing/markduplicates/test/test.md.cram.crai + should_exist: false + - path: results/preprocessing/recal_table/test/test.recal.table + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + should_exist: false + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + should_exist: false + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + should_exist: false + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz + should_exist: false + - path: results/reports/samtools/test/test.recal.cram.stats + should_exist: false + - path: results/preprocessing/mapped/ + should_exist: false + - path: results/preprocessing/markduplicates/ + should_exist: false diff --git a/tests/test_samplesheet_validation_spaces.yml b/tests/test_samplesheet_validation_spaces.yml new file mode 100644 index 0000000000..edd0576d3a --- /dev/null +++ b/tests/test_samplesheet_validation_spaces.yml @@ -0,0 +1,9 @@ +- name: Test that pipeline fails when sample/patient name contains a space + command: nextflow run main.nf -profile test_cache
--input ./tests/csv/3.0/sample_with_space.csv --outdir results + tags: + - sample_with_space + - validation_checks + exit_code: 1 + stderr: + contains: + - "Sample ID must be provided and cannot contain spaces" diff --git a/tests/test_save_mapped.yml b/tests/test_save_mapped.yml new file mode 100644 index 0000000000..7868da9d35 --- /dev/null +++ b/tests/test_save_mapped.yml @@ -0,0 +1,76 @@ +- name: Run save_mapped + command: nextflow run main.nf -profile test_cache --save_mapped --outdir results + tags: + - default_extended + - preprocessing + - save_mapped + - save_mapped_only + - variant_calling + files: + - path: results/csv/mapped.csv + md5sum: 3bee45ccf65e301ce09ee4eed8f26933 + - path: results/csv/markduplicates.csv + md5sum: 0d6120bb99e92f6810343270711ca53e + - path: results/csv/markduplicates_no_table.csv + md5sum: 2a2d3d4842befd4def39156463859ee3 + - path: results/csv/recalibrated.csv + md5sum: 2d29d9e53894dcce96a1b5beb6ef3312 + - path: results/multiqc + - path: results/preprocessing/mapped/test/test.sorted.cram + # binary changes md5sums on reruns + - path: results/preprocessing/mapped/test/test.sorted.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/markduplicates/test/test.md.cram + # binary changes md5sums on reruns + - path: results/preprocessing/markduplicates/test/test.md.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recal_table/test/test.recal.table + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + # binary changes md5sums on reruns + - path: results/reports/fastqc/test-test_L1 + - path: results/reports/markduplicates/test/test.md.cram.metrics + contains: ["test 17094 1534 168 1046782 12429 197 0 0.635998", "1.0 0.999991 1171"] + - path: results/reports/mosdepth/test/test.md.mosdepth.global.dist.txt + md5sum: b61e1acee11a6ddf7ce3232a5948a6a0 + - path: results/reports/mosdepth/test/test.md.mosdepth.region.dist.txt + md5sum: 1a382f98d488d2ae3df83a0d87caafc1 + - path: results/reports/mosdepth/test/test.md.mosdepth.summary.txt + md5sum: 839108358878ada89e1eaddf6e0541ba + - path: results/reports/mosdepth/test/test.md.regions.bed.gz + md5sum: 0aaee6da65050bedcd40b9fbf0622873 + - path: results/reports/mosdepth/test/test.md.regions.bed.gz.csi + md5sum: 544e02fcca548749a0af758d0a2df352 + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: b61e1acee11a6ddf7ce3232a5948a6a0 + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + md5sum: 1a382f98d488d2ae3df83a0d87caafc1 + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: 839108358878ada89e1eaddf6e0541ba + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz + md5sum: 0aaee6da65050bedcd40b9fbf0622873 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz.csi + md5sum: 544e02fcca548749a0af758d0a2df352 + - path: results/reports/samtools/test/test.md.cram.stats + # conda changes md5sums for test + - path: results/reports/samtools/test/test.recal.cram.stats + # conda changes md5sums for test + - path: results/reports/bcftools/strelka/test/test.strelka.variants.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/vcftools/strelka/test/test.strelka.variants.FILTER.summary + md5sum: ad417bc96d31223f61170987975d8128 + - path: 
results/reports/vcftools/strelka/test/test.strelka.variants.TsTv.count + md5sum: fa27f678965b7cba6a92efcd039f802a + - path: results/reports/vcftools/strelka/test/test.strelka.variants.TsTv.qual + md5sum: bc68ae4e688e9fb772b457069e604883 + - path: results/variant_calling/strelka/test/test.strelka.genome.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/test/test.strelka.genome.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/test/test.strelka.variants.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/test/test.strelka.variants.vcf.gz.tbi + - path: results/strelka + should_exist: false diff --git a/tests/test_save_output_as_bam_only.yml b/tests/test_save_output_as_bam_only.yml new file mode 100644 index 0000000000..bc577c0ae8 --- /dev/null +++ b/tests/test_save_output_as_bam_only.yml @@ -0,0 +1,70 @@ +- name: Run save_output_as_bam + command: nextflow run main.nf -profile test_cache --save_output_as_bam --outdir results + tags: + - default_extended + - preprocessing + - save_output_as_bam + - save_output_as_bam_only + - variant_calling + files: + - path: results/csv/markduplicates.csv + md5sum: 6004ab16b63012e336f6251396a983c5 + - path: results/csv/markduplicates_no_table.csv + md5sum: 8a145eca178cfd02403d60122b9d3960 + - path: results/csv/recalibrated.csv + md5sum: 2dfbcaaeaaf4937c51c5c310f1c77614 + - path: results/multiqc + - path: results/preprocessing/markduplicates/test/test.md.bam.bai + # conda changes md5sums for test + - path: results/preprocessing/markduplicates/test/test.md.bam + # conda changes md5sums for test + - path: results/preprocessing/recal_table/test/test.recal.table + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.bam + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.bam.bai + # binary changes md5sums on reruns + - path: results/reports/fastqc/test-test_L1 + - path: results/reports/markduplicates/test/test.md.cram.metrics + contains: ["test 17094 1534 168 1046782 12429 197 0 0.635998", "1.0 0.999991 1171"] + - path: results/reports/mosdepth/test/test.md.mosdepth.global.dist.txt + md5sum: b61e1acee11a6ddf7ce3232a5948a6a0 + - path: results/reports/mosdepth/test/test.md.mosdepth.region.dist.txt + md5sum: 1a382f98d488d2ae3df83a0d87caafc1 + - path: results/reports/mosdepth/test/test.md.mosdepth.summary.txt + md5sum: 839108358878ada89e1eaddf6e0541ba + - path: results/reports/mosdepth/test/test.md.regions.bed.gz + md5sum: 0aaee6da65050bedcd40b9fbf0622873 + - path: results/reports/mosdepth/test/test.md.regions.bed.gz.csi + md5sum: 544e02fcca548749a0af758d0a2df352 + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: b61e1acee11a6ddf7ce3232a5948a6a0 + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + md5sum: 1a382f98d488d2ae3df83a0d87caafc1 + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: 839108358878ada89e1eaddf6e0541ba + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz + md5sum: 0aaee6da65050bedcd40b9fbf0622873 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz.csi + md5sum: 544e02fcca548749a0af758d0a2df352 + - path: results/reports/samtools/test/test.md.cram.stats + # conda changes md5sums for test + - path: results/reports/samtools/test/test.recal.cram.stats + # conda changes md5sums for test + - path: 
results/reports/bcftools/strelka/test/test.strelka.variants.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/vcftools/strelka/test/test.strelka.variants.FILTER.summary + md5sum: ad417bc96d31223f61170987975d8128 + - path: results/reports/vcftools/strelka/test/test.strelka.variants.TsTv.count + md5sum: fa27f678965b7cba6a92efcd039f802a + - path: results/reports/vcftools/strelka/test/test.strelka.variants.TsTv.qual + md5sum: bc68ae4e688e9fb772b457069e604883 + - path: results/variant_calling/strelka/test/test.strelka.genome.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/test/test.strelka.genome.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/test/test.strelka.variants.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/test/test.strelka.variants.vcf.gz.tbi + - path: results/strelka + should_exist: false diff --git a/tests/test_sentieon_aligner_bwamem.yml b/tests/test_sentieon_aligner_bwamem.yml new file mode 100644 index 0000000000..37f883312d --- /dev/null +++ b/tests/test_sentieon_aligner_bwamem.yml @@ -0,0 +1,215 @@ +- name: Run sentieon bwamem + command: nextflow run main.nf -profile test_cache,software_license --sentieon_extension --aligner sentieon-bwamem --save_reference --outdir results + tags: + - aligner + - sentieon/bwamem + - preprocessing + files: + - path: results/csv/markduplicates.csv + md5sum: 0d6120bb99e92f6810343270711ca53e + - path: results/csv/markduplicates_no_table.csv + md5sum: 2a2d3d4842befd4def39156463859ee3 + - path: results/csv/recalibrated.csv + md5sum: 2d29d9e53894dcce96a1b5beb6ef3312 + - path: results/multiqc + - path: results/preprocessing/markduplicates/test/test.md.cram + # binary changes md5sums on reruns + - path: results/preprocessing/markduplicates/test/test.md.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recal_table/test/test.recal.table + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + # binary changes md5sums on reruns + - path: results/reference/bwa/genome.amb + md5sum: 1891c1de381b3a96d4e72f590fde20c1 + - path: results/reference/bwa/genome.ann + md5sum: 2df4aa2d7580639fa0fcdbcad5e2e969 + - path: results/reference/bwa/genome.bwt + md5sum: 815eded87e4cb6b0f1daab5c4d6e30af + - path: results/reference/bwa/genome.pac + md5sum: 8569fbdb2c98c6fb16dfa73d8eacb070 + - path: results/reference/bwa/genome.sa + md5sum: e7cff62b919448a3a3d0fe4aaf427594 + - path: results/reference/dbsnp/dbsnp_146.hg38.vcf.gz.tbi + # conda changes md5sums for test + - path: results/reference/dict/genome.dict + md5sum: 2433fe2ba31257337bf4c4bd4cb8da15 + - path: results/reference/fai/genome.fasta.fai + md5sum: 3520cd30e1b100e55f578db9c855f685 + - path: results/reference/intervals/chr22_1-40001.bed + md5sum: 87a15eb9c2ff20ccd5cd8735a28708f7 + - path: results/reference/intervals/chr22_1-40001.bed.gz + md5sum: d3341fa28986c40b24fcc10a079dbb80 + - path: results/reference/intervals/genome.bed + md5sum: a87dc7d20ebca626f65cc16ff6c97a3e + - path: results/reference/known_indels/mills_and_1000G.indels.vcf.gz.tbi + # conda changes md5sums for test + - path: results/reports/fastqc/test-test_L1 + - path: results/reports/markduplicates/test/test.md.cram.metrics + contains: ["test 17094 1534 168 1046782 12429 197 0 0.635998", "1.0 0.999991 1171"] + - path: 
results/reports/mosdepth/test/test.md.mosdepth.global.dist.txt + md5sum: b61e1acee11a6ddf7ce3232a5948a6a0 + - path: results/reports/mosdepth/test/test.md.mosdepth.region.dist.txt + md5sum: 1a382f98d488d2ae3df83a0d87caafc1 + - path: results/reports/mosdepth/test/test.md.mosdepth.summary.txt + md5sum: 9afbcd5e44b62b3947e47af850a66188 + - path: results/reports/mosdepth/test/test.md.regions.bed.gz + md5sum: 3bd6a6f7127394802d9a7c7d559072ee + - path: results/reports/mosdepth/test/test.md.regions.bed.gz.csi + md5sum: 72ae14370dfdaab906e50d0552c90119 + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: b61e1acee11a6ddf7ce3232a5948a6a0 + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + md5sum: 1a382f98d488d2ae3df83a0d87caafc1 + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: 9afbcd5e44b62b3947e47af850a66188 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz + md5sum: 3bd6a6f7127394802d9a7c7d559072ee + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz.csi + md5sum: 72ae14370dfdaab906e50d0552c90119 + - path: results/reports/samtools/test/test.md.cram.stats + # conda changes md5sums for test + - path: results/reports/samtools/test/test.recal.cram.stats + # conda changes md5sums for test +- name: Build only index with sentieon bwa + command: nextflow run main.nf -profile test_cache,software_license --sentieon_extension --aligner sentieon-bwamem --build_only_index --outdir results + tags: + - aligner + - build_only_index + - sentieon/bwamem + files: + - path: results/multiqc + - path: results/reference/bwa/genome.amb + md5sum: 1891c1de381b3a96d4e72f590fde20c1 + - path: results/reference/bwa/genome.ann + md5sum: 2df4aa2d7580639fa0fcdbcad5e2e969 + - path: results/reference/bwa/genome.bwt + md5sum: 815eded87e4cb6b0f1daab5c4d6e30af + - path: results/reference/bwa/genome.pac + md5sum: 8569fbdb2c98c6fb16dfa73d8eacb070 + - path: results/reference/bwa/genome.sa + md5sum: e7cff62b919448a3a3d0fe4aaf427594 + - path: results/reference/dbsnp/dbsnp_146.hg38.vcf.gz.tbi + # conda changes md5sums for test + - path: results/reference/dict/genome.dict + md5sum: 2433fe2ba31257337bf4c4bd4cb8da15 + - path: results/reference/fai/genome.fasta.fai + md5sum: 3520cd30e1b100e55f578db9c855f685 + - path: results/reference/intervals/chr22_1-40001.bed + md5sum: 87a15eb9c2ff20ccd5cd8735a28708f7 + - path: results/reference/intervals/chr22_1-40001.bed.gz + md5sum: d3341fa28986c40b24fcc10a079dbb80 + - path: results/reference/intervals/genome.bed + md5sum: a87dc7d20ebca626f65cc16ff6c97a3e + - path: results/reference/known_indels/mills_and_1000G.indels.vcf.gz.tbi + # conda changes md5sums for test +- name: Run sentieon bwamem save bam + command: nextflow run main.nf -profile test_cache,software_license --sentieon_extension --aligner sentieon-bwamem --save_mapped --save_output_as_bam --outdir results + tags: + - aligner + - sentieon/bwamem + - save_output_as_bam + - preprocessing + files: + - path: results/csv/markduplicates.csv + md5sum: 6004ab16b63012e336f6251396a983c5 + - path: results/csv/markduplicates_no_table.csv + md5sum: 8a145eca178cfd02403d60122b9d3960 + - path: results/csv/recalibrated.csv + md5sum: 2dfbcaaeaaf4937c51c5c310f1c77614 + - path: results/multiqc + - path: results/preprocessing/mapped/test/test.sorted.bam + - path: results/preprocessing/mapped/test/test.sorted.bam.bai + - path: results/preprocessing/markduplicates/test/test.md.bam + # binary changes md5sums on reruns + - path: 
results/preprocessing/markduplicates/test/test.md.bam.bai + # binary changes md5sums on reruns + - path: results/preprocessing/recal_table/test/test.recal.table + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.bam + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.bam.bai + # binary changes md5sums on reruns + - path: results/reports/fastqc/test-test_L1 + - path: results/reports/markduplicates/test/test.md.cram.metrics + contains: ["test 17094 1534 168 1046782 12429 197 0 0.635998", "1.0 0.999991 1171"] + - path: results/reports/mosdepth/test/test.md.mosdepth.global.dist.txt + md5sum: b61e1acee11a6ddf7ce3232a5948a6a0 + - path: results/reports/mosdepth/test/test.md.mosdepth.region.dist.txt + md5sum: 1a382f98d488d2ae3df83a0d87caafc1 + - path: results/reports/mosdepth/test/test.md.mosdepth.summary.txt + md5sum: 9afbcd5e44b62b3947e47af850a66188 + - path: results/reports/mosdepth/test/test.md.regions.bed.gz + md5sum: 3bd6a6f7127394802d9a7c7d559072ee + - path: results/reports/mosdepth/test/test.md.regions.bed.gz.csi + md5sum: 72ae14370dfdaab906e50d0552c90119 + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: b61e1acee11a6ddf7ce3232a5948a6a0 + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + md5sum: 1a382f98d488d2ae3df83a0d87caafc1 + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: 9afbcd5e44b62b3947e47af850a66188 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz + md5sum: 3bd6a6f7127394802d9a7c7d559072ee + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz.csi + md5sum: 72ae14370dfdaab906e50d0552c90119 + - path: results/reports/samtools/test/test.md.cram.stats + # conda changes md5sums for test + - path: results/reports/samtools/test/test.recal.cram.stats + # conda changes md5sums for test +- name: Run sentieon bwamem save cram + command: nextflow run main.nf -profile test_cache,software_license --sentieon_extension --aligner sentieon-bwamem --save_mapped --outdir results + tags: + - aligner + - sentieon/bwamem + - save_output_as_cram + - preprocessing + files: + - path: results/csv/markduplicates.csv + md5sum: 0d6120bb99e92f6810343270711ca53e + - path: results/csv/markduplicates_no_table.csv + md5sum: 2a2d3d4842befd4def39156463859ee3 + - path: results/csv/recalibrated.csv + md5sum: 2d29d9e53894dcce96a1b5beb6ef3312 + - path: results/multiqc + - path: results/preprocessing/mapped/test/test.sorted.cram + - path: results/preprocessing/mapped/test/test.sorted.cram.crai + - path: results/preprocessing/markduplicates/test/test.md.cram + # binary changes md5sums on reruns + - path: results/preprocessing/markduplicates/test/test.md.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recal_table/test/test.recal.table + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + # binary changes md5sums on reruns + - path: results/reports/fastqc/test-test_L1 + - path: results/reports/markduplicates/test/test.md.cram.metrics + contains: ["test 17094 1534 168 1046782 12429 197 0 0.635998", "1.0 0.999991 1171"] + - path: results/reports/mosdepth/test/test.md.mosdepth.global.dist.txt + md5sum: b61e1acee11a6ddf7ce3232a5948a6a0 + - path: results/reports/mosdepth/test/test.md.mosdepth.region.dist.txt + md5sum: 
1a382f98d488d2ae3df83a0d87caafc1 + - path: results/reports/mosdepth/test/test.md.mosdepth.summary.txt + md5sum: 9afbcd5e44b62b3947e47af850a66188 + - path: results/reports/mosdepth/test/test.md.regions.bed.gz + md5sum: 3bd6a6f7127394802d9a7c7d559072ee + - path: results/reports/mosdepth/test/test.md.regions.bed.gz.csi + md5sum: 72ae14370dfdaab906e50d0552c90119 + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: b61e1acee11a6ddf7ce3232a5948a6a0 + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + md5sum: 1a382f98d488d2ae3df83a0d87caafc1 + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: 9afbcd5e44b62b3947e47af850a66188 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz + md5sum: 3bd6a6f7127394802d9a7c7d559072ee + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz.csi + md5sum: 72ae14370dfdaab906e50d0552c90119 + - path: results/reports/samtools/test/test.md.cram.stats + # conda changes md5sums for test + - path: results/reports/samtools/test/test.recal.cram.stats + # conda changes md5sums for test diff --git a/tests/test_sentieon_dedup_from_bam.yml b/tests/test_sentieon_dedup_from_bam.yml new file mode 100644 index 0000000000..54df60e415 --- /dev/null +++ b/tests/test_sentieon_dedup_from_bam.yml @@ -0,0 +1,52 @@ +- name: Run sentieon dedup starting from BAM + command: nextflow run main.nf -profile test_cache,software_license,sentieon_dedup_bam --sentieon_extension --outdir results + tags: + - input_bam + - preprocessing + - sentieon/dedup + files: + - path: results/csv/markduplicates.csv + md5sum: b06889d5be3ec1be6f5dd278ccc8f28e + - path: results/csv/markduplicates_no_table.csv + md5sum: 49661e56662d74f3a3db269387cbd9bf + - path: results/csv/recalibrated.csv + md5sum: 1888a924bc70bd80165a96ad641e22d6 + - path: results/multiqc + - path: results/multiqc/multiqc_report.html + contains: ["Sentieon Dedup Metrics", "PERCENT_DUPLICATION", "ESTIMATED_LIBRARY_SIZE"] + - path: results/preprocessing/sentieon_dedup/test/test.dedup.cram + # binary changes md5sums on reruns + - path: results/preprocessing/sentieon_dedup/test/test.dedup.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recal_table/test/test.recal.table + md5sum: 9603b69fdc3b5090de2e0dd78bfcc4bf + - path: results/preprocessing/recalibrated/test/test.recal.cram + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + # binary changes md5sums on reruns + - path: results/reports/sentieon_dedup/test/test.dedup.cram.metrics + contains: ["testN 0 2820 2 2 0 828 0 0.293617 3807", "1.0 0.999986", "100.0 1.911145"] + - path: results/reports/mosdepth/test/test.dedup.mosdepth.global.dist.txt + md5sum: 8e875e20e3fb9cf288d68c1d223f6fd5 + - path: results/reports/mosdepth/test/test.dedup.mosdepth.region.dist.txt + md5sum: 75e1ce7e55af51f4985fa91654a5ea2d + - path: results/reports/mosdepth/test/test.dedup.mosdepth.summary.txt + md5sum: b23cf96942b2ada3f41172a9349a1175 + - path: results/reports/mosdepth/test/test.dedup.regions.bed.gz + # binary changes md5sums on reruns + - path: results/reports/mosdepth/test/test.dedup.regions.bed.gz.csi + # binary changes md5sums on reruns + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: 8e875e20e3fb9cf288d68c1d223f6fd5 + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + md5sum: 75e1ce7e55af51f4985fa91654a5ea2d + - path: 
results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: b23cf96942b2ada3f41172a9349a1175 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz + # binary changes md5sums on reruns + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz.csi + # binary changes md5sums on reruns + - path: results/reports/samtools/test/test.dedup.cram.stats + # conda changes md5sums for test + - path: results/reports/samtools/test/test.recal.cram.stats + # conda changes md5sums for test diff --git a/tests/test_sentieon_dedup_from_cram.yml b/tests/test_sentieon_dedup_from_cram.yml new file mode 100644 index 0000000000..87dfdebfa7 --- /dev/null +++ b/tests/test_sentieon_dedup_from_cram.yml @@ -0,0 +1,54 @@ +- name: Run sentieon dedup starting from CRAM + command: nextflow run main.nf -profile test_cache,software_license,sentieon_dedup_cram --sentieon_extension --outdir results + tags: + - input_cram + - preprocessing + - sentieon/dedup + files: + - path: results/csv/markduplicates.csv + md5sum: b06889d5be3ec1be6f5dd278ccc8f28e + - path: results/csv/markduplicates_no_table.csv + md5sum: 49661e56662d74f3a3db269387cbd9bf + - path: results/csv/recalibrated.csv + md5sum: 1888a924bc70bd80165a96ad641e22d6 + - path: results/multiqc + - path: results/multiqc/multiqc_report.html + contains: ["Sentieon Dedup Metrics", "PERCENT_DUPLICATION", "ESTIMATED_LIBRARY_SIZE"] + - path: results/preprocessing/sentieon_dedup/test/test.dedup.cram + # binary changes md5sums on reruns + - path: results/preprocessing/sentieon_dedup/test/test.dedup.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recal_table/test/test.recal.table + md5sum: 9603b69fdc3b5090de2e0dd78bfcc4bf + - path: results/preprocessing/recalibrated/test/test.recal.cram + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + # binary changes md5sums on reruns + - path: results/reports/sentieon_dedup/test/test.dedup.cram.metrics + contains: ["testN 0 2820 2 2 0 828 0 0.293617 3807", "1.0 0.999986", "2.0 1.476740", "3.0 1.704038"] + - path: results/reports/mosdepth/test/test.dedup.mosdepth.global.dist.txt + md5sum: 8e875e20e3fb9cf288d68c1d223f6fd5 + - path: results/reports/mosdepth/test/test.dedup.mosdepth.region.dist.txt + md5sum: 75e1ce7e55af51f4985fa91654a5ea2d + - path: results/reports/mosdepth/test/test.dedup.mosdepth.summary.txt + md5sum: b23cf96942b2ada3f41172a9349a1175 + - path: results/reports/mosdepth/test/test.dedup.regions.bed.gz + # binary changes md5sums on reruns + - path: results/reports/mosdepth/test/test.dedup.regions.bed.gz.csi + # binary changes md5sums on reruns + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: 8e875e20e3fb9cf288d68c1d223f6fd5 + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + md5sum: 75e1ce7e55af51f4985fa91654a5ea2d + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: b23cf96942b2ada3f41172a9349a1175 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz + # binary changes md5sums on reruns + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz.csi + # binary changes md5sums on reruns + - path: results/reports/samtools/test/test.dedup.cram.stats + # conda changes md5sums for test + - path: results/reports/samtools/test/test.recal.cram.stats + # conda changes md5sums for test + - path: results/preprocessing/mapped/ + should_exist: false diff --git a/tests/test_sentieon_dnascope.yml 
b/tests/test_sentieon_dnascope.yml new file mode 100644 index 0000000000..babd1bef0d --- /dev/null +++ b/tests/test_sentieon_dnascope.yml @@ -0,0 +1,190 @@ +- name: Run variant calling on germline sample with sentieon dnascope + command: nextflow run main.nf -profile test_cache,targeted,software_license --sentieon_extension --input ./tests/csv/3.0/mapped_single_bam.csv --tools sentieon_dnascope --step variant_calling --outdir results + tags: + - germline + - sentieon/dnascope + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: b2144d21a0ebfd807a8646f1751d0ddc + - path: results/multiqc + - path: results/preprocessing/converted/test/test.converted.cram + # binary changes md5sums on reruns + - path: results/preprocessing/converted/test/test.converted.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false + - path: results/reports/bcftools/sentieon_dnascope/test/test.dnascope.filtered.bcftools_stats.txt + md5sum: fb3923060b59b7dc18705cac5704caba + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.filtered.FILTER.summary + md5sum: e67b24d296810a075378e5864bcea0fa + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.filtered.TsTv.count + md5sum: b77c120ee5cc0423267200c67d60c663 + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.filtered.TsTv.qual + # changes md5sum on reruns. This is somewhat unexpected, but might be due to tiny variation in very small numbers in the qual-files. + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.filtered.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.filtered.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.unfiltered.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.unfiltered.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/dnascope + should_exist: false + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: e82e90c7d508a135b5a8a7cd6933452e + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + md5sum: 3a2030e5e8af7bc12720c3a5592bf921 + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: 615c5c5019d88045a9ff5bbe6e63d270 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz + md5sum: 9f1ea20e7461db948ba21f70c4d1b3ba + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz.csi + md5sum: 5c00a1d457c387d6e71848a6d897e309 + - path: results/reports/samtools/test/test.recal.cram.stats +- name: Run variant calling on germline sample with sentieon dnascope without intervals + command: nextflow run main.nf -profile test_cache,targeted,software_license --sentieon_extension --input ./tests/csv/3.0/mapped_single_bam.csv --tools sentieon_dnascope --step variant_calling --no_intervals --outdir results + tags: + - germline + - sentieon/dnascope + - no_intervals + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: b2144d21a0ebfd807a8646f1751d0ddc + - path: results/multiqc + - path: results/no_intervals.bed + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: 
results/no_intervals.bed.gz.tbi + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/preprocessing/converted/test/test.converted.cram + # binary changes md5sums on reruns + - path: results/preprocessing/converted/test/test.converted.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false + - path: results/reports/bcftools/sentieon_dnascope/test/test.dnascope.filtered.bcftools_stats.txt + md5sum: fb3923060b59b7dc18705cac5704caba + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.filtered.FILTER.summary + md5sum: e67b24d296810a075378e5864bcea0fa + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.filtered.TsTv.count + md5sum: b77c120ee5cc0423267200c67d60c663 + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.filtered.TsTv.qual + # changes md5sum on reruns. This is somewhat unexpected, but might be due to tiny variation in very small numbers in the qual-files. + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.filtered.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.filtered.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.unfiltered.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.unfiltered.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/sentieon_dnascope + should_exist: false + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: e82e90c7d508a135b5a8a7cd6933452e + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: 4f0d231060cbde4efdd673863bd2fb59 + - path: results/reports/mosdepth/test/test.recal.per-base.bed.gz + md5sum: bc1df47d46f818fee5275975925d769a + - path: results/reports/mosdepth/test/test.recal.per-base.bed.gz.csi + md5sum: 9e649ac749ff6c6073bef5ab63e8aaa4 + - path: results/reports/samtools/test/test.recal.cram.stats + +- name: Run variant calling on germline sample with sentieon dnascope output gvcf + command: nextflow run main.nf -profile test_cache,targeted,software_license --sentieon_extension --input ./tests/csv/3.0/mapped_single_bam.csv --tools sentieon_dnascope --step variant_calling --outdir results --sentieon_dnascope_emit_mode gvcf + tags: + - germline + - sentieon/dnascope + - variant_calling + files: + - path: results/csv/variantcalled.csv + should_exist: false + - path: results/multiqc + - path: results/preprocessing/converted/test/test.converted.cram + # binary changes md5sums on reruns + - path: results/preprocessing/converted/test/test.converted.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false + - path: results/reports/bcftools/sentieon_dnascope/test/test.dnascope.filtered.bcftools_stats.txt + should_exist: false + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.filtered.FILTER.summary + should_exist: false + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.filtered.TsTv.count + should_exist: false + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.filtered.TsTv.qual + should_exist: false + - path: 
results/variant_calling/sentieon_dnascope/test/test.dnascope.g.vcf.gz + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.g.vcf.gz.tbi + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.filtered.vcf.gz + should_exist: false + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.filtered.vcf.gz.tbi + should_exist: false + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.unfiltered.vcf.gz + should_exist: false + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.unfiltered.vcf.gz.tbi + should_exist: false + - path: results/dnascope + should_exist: false + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: e82e90c7d508a135b5a8a7cd6933452e + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + md5sum: 3a2030e5e8af7bc12720c3a5592bf921 + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: 615c5c5019d88045a9ff5bbe6e63d270 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz + md5sum: 9f1ea20e7461db948ba21f70c4d1b3ba + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz.csi + md5sum: 5c00a1d457c387d6e71848a6d897e309 + - path: results/reports/samtools/test/test.recal.cram.stats +- name: Run variant calling on germline sample with sentieon dnascope output both gvcf and vcf + command: nextflow run main.nf -profile test_cache,targeted,software_license --sentieon_extension --input ./tests/csv/3.0/mapped_single_bam.csv --tools sentieon_dnascope --step variant_calling --outdir results --sentieon_dnascope_emit_mode variant,gvcf + tags: + - germline + - sentieon/dnascope + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: b2144d21a0ebfd807a8646f1751d0ddc + - path: results/multiqc + - path: results/preprocessing/converted/test/test.converted.cram + # binary changes md5sums on reruns + - path: results/preprocessing/converted/test/test.converted.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false + - path: results/reports/bcftools/sentieon_dnascope/test/test.dnascope.filtered.bcftools_stats.txt + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.filtered.FILTER.summary + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.filtered.TsTv.count + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.filtered.TsTv.qual + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.g.vcf.gz + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.g.vcf.gz.tbi + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.filtered.vcf.gz + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.filtered.vcf.gz.tbi + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.unfiltered.vcf.gz + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.unfiltered.vcf.gz.tbi + - path: results/dnascope + should_exist: false + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: e82e90c7d508a135b5a8a7cd6933452e + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + md5sum: 3a2030e5e8af7bc12720c3a5592bf921 + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: 615c5c5019d88045a9ff5bbe6e63d270 + - path: 
results/reports/mosdepth/test/test.recal.regions.bed.gz + md5sum: 9f1ea20e7461db948ba21f70c4d1b3ba + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz.csi + md5sum: 5c00a1d457c387d6e71848a6d897e309 + - path: results/reports/samtools/test/test.recal.cram.stats diff --git a/tests/test_sentieon_dnascope_joint_germline.yml b/tests/test_sentieon_dnascope_joint_germline.yml new file mode 100644 index 0000000000..295bb45e7e --- /dev/null +++ b/tests/test_sentieon_dnascope_joint_germline.yml @@ -0,0 +1,110 @@ +- name: Run joint germline variant calling with sentieon dnascope + command: nextflow run main.nf -profile test_cache,targeted,software_license --sentieon_extension --input ./tests/csv/3.0/mapped_joint_bam.csv --tools sentieon_dnascope --step variant_calling --joint_germline --outdir results --sentieon_dnascope_emit_mode gvcf + tags: + - germline + - sentieon_dnascope_joint_germline + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: 62d70060aad96337254efe2d7a1df170 + - path: results/multiqc + - path: results/reports/bcftools/sentieon_dnascope/joint_variant_calling/joint_germline.bcftools_stats.txt + - path: results/reports/vcftools/sentieon_dnascope/joint_variant_calling/joint_germline.FILTER.summary + - path: results/reports/vcftools/sentieon_dnascope/joint_variant_calling/joint_germline.TsTv.count + - path: results/reports/vcftools/sentieon_dnascope/joint_variant_calling/joint_germline.TsTv.qual + - path: results/variant_calling/sentieon_dnascope/joint_variant_calling/joint_germline.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/sentieon_dnascope/joint_variant_calling/joint_germline.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/sentieon_dnascope/testN/testN.dnascope.g.vcf.gz + - path: results/variant_calling/sentieon_dnascope/testN/testN.dnascope.g.vcf.gz.tbi + - path: results/variant_calling/sentieon_dnascope/testT/testT.dnascope.g.vcf.gz + - path: results/variant_calling/sentieon_dnascope/testT/testT.dnascope.g.vcf.gz.tbi + - path: results/dnascope + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false + - path: results/reports/bcftools/sentieon_dnascope/joint_variant_calling/joint_germline_recalibrated.bcftools_stats.txt + should_exist: false + - path: results/reports/vcftools/sentieon_dnascope/joint_variant_calling/joint_germline_recalibrated.FILTER.summary + should_exist: false + - path: results/reports/vcftools/sentieon_dnascope/joint_variant_calling/joint_germline_recalibrated.TsTv.count + should_exist: false + - path: results/reports/vcftools/sentieon_dnascope/joint_variant_calling/joint_germline_recalibrated.TsTv.qual + should_exist: false + - path: results/variant_calling/sentieon_dnascope/joint_variant_calling/joint_germline_recalibrated.vcf.gz + should_exist: false + - path: results/variant_calling/sentieon_dnascope/joint_variant_calling/joint_germline_recalibrated.vcf.gz.tbi + should_exist: false + - path: results/reports/mosdepth/testN/testN.recal.mosdepth.global.dist.txt + md5sum: 69e29702ef01fd8f6c7a5468fc35a16a + - path: results/reports/mosdepth/testN/testN.recal.mosdepth.region.dist.txt + md5sum: 6ec49cd7d510c2eb3d9d90fdb79b783a + - path: results/reports/mosdepth/testN/testN.recal.mosdepth.summary.txt + md5sum: 103098d0bf76ed82d2b87d5f242b099a + - path: results/reports/mosdepth/testN/testN.recal.regions.bed.gz + md5sum: 
b5888cf7395c57d39879a5faa6159eb3 + - path: results/reports/mosdepth/testN/testN.recal.regions.bed.gz.csi + md5sum: 9cb0ad7039a3b703d16ca7d5b835c0ee + - path: results/reports/samtools/testN/testN.recal.cram.stats + - path: results/reports/mosdepth/testT/testT.recal.mosdepth.global.dist.txt + md5sum: 69e29702ef01fd8f6c7a5468fc35a16a + - path: results/reports/mosdepth/testT/testT.recal.mosdepth.region.dist.txt + md5sum: 6ec49cd7d510c2eb3d9d90fdb79b783a + - path: results/reports/mosdepth/testT/testT.recal.mosdepth.summary.txt + md5sum: 103098d0bf76ed82d2b87d5f242b099a + - path: results/reports/mosdepth/testT/testT.recal.regions.bed.gz + md5sum: b5888cf7395c57d39879a5faa6159eb3 + - path: results/reports/mosdepth/testT/testT.recal.regions.bed.gz.csi + md5sum: 9cb0ad7039a3b703d16ca7d5b835c0ee + - path: results/reports/samtools/testT/testT.recal.cram.stats +- name: Run joint germline variant calling with sentieon dnascope all intervals at once + command: nextflow run main.nf -profile test_cache,targeted,software_license --sentieon_extension --input ./tests/csv/3.0/mapped_joint_bam.csv --tools sentieon_dnascope --step variant_calling --joint_germline --outdir results --sentieon_dnascope_emit_mode gvcf --nucleotides_per_second 100 + tags: + - germline + - sentieon_dnascope_joint_germline + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: 62d70060aad96337254efe2d7a1df170 + - path: results/multiqc + - path: results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false + - path: results/reports/bcftools/sentieon_dnascope/joint_variant_calling/joint_germline.bcftools_stats.txt + - path: results/reports/vcftools/sentieon_dnascope/joint_variant_calling/joint_germline.FILTER.summary + - path: results/reports/vcftools/sentieon_dnascope/joint_variant_calling/joint_germline.TsTv.count + - path: results/reports/vcftools/sentieon_dnascope/joint_variant_calling/joint_germline.TsTv.qual + - path: results/variant_calling/sentieon_dnascope/joint_variant_calling/joint_germline.vcf.gz + - path: results/variant_calling/sentieon_dnascope/joint_variant_calling/joint_germline.vcf.gz.tbi + - path: results/variant_calling/sentieon_dnascope/testN/testN.dnascope.g.vcf.gz + - path: results/variant_calling/sentieon_dnascope/testN/testN.dnascope.g.vcf.gz.tbi + - path: results/variant_calling/sentieon_dnascope/testT/testT.dnascope.g.vcf.gz + - path: results/variant_calling/sentieon_dnascope/testT/testT.dnascope.g.vcf.gz.tbi + - path: results/dnascope + should_exist: false + - path: results/reports/mosdepth/testN/testN.recal.mosdepth.global.dist.txt + md5sum: 69e29702ef01fd8f6c7a5468fc35a16a + - path: results/reports/mosdepth/testN/testN.recal.mosdepth.region.dist.txt + md5sum: 6ec49cd7d510c2eb3d9d90fdb79b783a + - path: results/reports/mosdepth/testN/testN.recal.mosdepth.summary.txt + md5sum: 103098d0bf76ed82d2b87d5f242b099a + - path: results/reports/mosdepth/testN/testN.recal.regions.bed.gz + md5sum: b5888cf7395c57d39879a5faa6159eb3 + - path: results/reports/mosdepth/testN/testN.recal.regions.bed.gz.csi + md5sum: 9cb0ad7039a3b703d16ca7d5b835c0ee + - path: results/reports/samtools/testN/testN.recal.cram.stats + - path: results/reports/mosdepth/testT/testT.recal.mosdepth.global.dist.txt + md5sum: 69e29702ef01fd8f6c7a5468fc35a16a + - path: results/reports/mosdepth/testT/testT.recal.mosdepth.region.dist.txt + md5sum: 6ec49cd7d510c2eb3d9d90fdb79b783a + - path: 
results/reports/mosdepth/testT/testT.recal.mosdepth.summary.txt + md5sum: 103098d0bf76ed82d2b87d5f242b099a + - path: results/reports/mosdepth/testT/testT.recal.regions.bed.gz + md5sum: b5888cf7395c57d39879a5faa6159eb3 + - path: results/reports/mosdepth/testT/testT.recal.regions.bed.gz.csi + md5sum: 9cb0ad7039a3b703d16ca7d5b835c0ee + - path: results/reports/samtools/testT/testT.recal.cram.stats diff --git a/tests/test_sentieon_dnascope_skip_filter.yml b/tests/test_sentieon_dnascope_skip_filter.yml new file mode 100644 index 0000000000..bd2a49b551 --- /dev/null +++ b/tests/test_sentieon_dnascope_skip_filter.yml @@ -0,0 +1,93 @@ +- name: Run variant calling on germline sample with sentieon dnascope and skip filter + command: nextflow run main.nf -profile test_cache,targeted,software_license --sentieon_extension --input ./tests/csv/3.0/mapped_single_bam.csv --tools sentieon_dnascope --step variant_calling --skip_tools dnascope_filter --outdir results + tags: + - germline + - sentieon_dnascope_skip_filter + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: 10254414c0679ba1fb25e41b9ff548cc + - path: results/multiqc + - path: results/preprocessing/converted/test/test.converted.cram + # binary changes md5sums on reruns + - path: results/preprocessing/converted/test/test.converted.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false + - path: results/reports/bcftools/sentieon_dnascope/test/test.dnascope.unfiltered.bcftools_stats.txt + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.unfiltered.FILTER.summary + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.unfiltered.TsTv.count + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.unfiltered.TsTv.qual + # changes md5sum on reruns. This is somewhat unexpected, but might be due to tiny variation in very small numbers in the qual-files.
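+ # For files like this TsTv.qual, whose md5sum is not stable across reruns, pytest-workflow also offers a `contains:` check that only asserts on expected substrings; the strelka tests later in this changeset use it. A sketch (the quoted string is a placeholder, not a real expected value):
+ # - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.unfiltered.TsTv.qual
+ #   contains: ["<an expected line from the TsTv.qual output>"]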
+ - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.filtered.vcf.gz + should_exist: false + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.filtered.vcf.gz.tbi + should_exist: false + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.unfiltered.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.unfiltered.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/sentieon_dnascope + should_exist: false + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: e82e90c7d508a135b5a8a7cd6933452e + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + md5sum: 3a2030e5e8af7bc12720c3a5592bf921 + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: 615c5c5019d88045a9ff5bbe6e63d270 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz + md5sum: 9f1ea20e7461db948ba21f70c4d1b3ba + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz.csi + md5sum: 5c00a1d457c387d6e71848a6d897e309 + - path: results/reports/samtools/test/test.recal.cram.stats +- name: Run variant calling on germline sample with sentieon dnascope without intervals and skip filter + command: nextflow run main.nf -profile test_cache,targeted,software_license --sentieon_extension --input ./tests/csv/3.0/mapped_single_bam.csv --tools sentieon_dnascope --step variant_calling --skip_tools dnascope_filter --no_intervals --outdir results + tags: + - germline + - sentieon_dnascope_skip_filter + - no_intervals + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: 10254414c0679ba1fb25e41b9ff548cc + - path: results/multiqc + - path: results/no_intervals.bed + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz.tbi + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/preprocessing/converted/test/test.converted.cram + # binary changes md5sums on reruns + - path: results/preprocessing/converted/test/test.converted.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false + - path: results/reports/bcftools/sentieon_dnascope/test/test.dnascope.unfiltered.bcftools_stats.txt + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.unfiltered.FILTER.summary + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.unfiltered.TsTv.count + - path: results/reports/vcftools/sentieon_dnascope/test/test.dnascope.unfiltered.TsTv.qual + # changes md5sum on reruns. This is somewhat unexpected, but might be due to tiny variation in very small numbers in the qual-files.
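+ # To iterate on just the tests in this file locally, the tags above can be passed to pytest-workflow (a sketch, assuming pytest-workflow is installed and the command is run from the pipeline root):
+ #   pytest --tag sentieon_dnascope_skip_filter --symlink --kwdof
+ # --symlink avoids copying the test data and --kwdof keeps the workflow working directory on failure for debugging.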
+ - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.filtered.vcf.gz + should_exist: false + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.filtered.vcf.gz.tbi + should_exist: false + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.unfiltered.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/sentieon_dnascope/test/test.dnascope.unfiltered.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/sentieon_dnascope + should_exist: false + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: e82e90c7d508a135b5a8a7cd6933452e + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: 4f0d231060cbde4efdd673863bd2fb59 + - path: results/reports/mosdepth/test/test.recal.per-base.bed.gz + md5sum: bc1df47d46f818fee5275975925d769a + - path: results/reports/mosdepth/test/test.recal.per-base.bed.gz.csi + md5sum: 9e649ac749ff6c6073bef5ab63e8aaa4 + - path: results/reports/samtools/test/test.recal.cram.stats diff --git a/tests/test_sentieon_haplotyper.yml b/tests/test_sentieon_haplotyper.yml new file mode 100644 index 0000000000..6567804259 --- /dev/null +++ b/tests/test_sentieon_haplotyper.yml @@ -0,0 +1,244 @@ +- name: Run variant calling on germline sample with sentieon haplotyper + command: nextflow run main.nf -profile test_cache,software_license,targeted --sentieon_extension --input ./tests/csv/3.0/mapped_single_bam.csv --tools sentieon_haplotyper --step variant_calling --outdir results + tags: + - germline + - sentieon/haplotyper + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: 4d3dd4f6dcb34a91a949641f2b1ac202 + - path: results/multiqc + - path: results/preprocessing/converted/test/test.converted.cram + # binary changes md5sums on reruns + - path: results/preprocessing/converted/test/test.converted.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false + - path: results/reports/bcftools/sentieon_haplotyper/test/test.haplotyper.filtered.bcftools_stats.txt + md5sum: 66be03d4e6535175514f54a1a031d49f + - path: results/reports/vcftools/sentieon_haplotyper/test/test.haplotyper.filtered.FILTER.summary + md5sum: d501a93356f3c91c743f51104e24514a + - path: results/reports/vcftools/sentieon_haplotyper/test/test.haplotyper.filtered.TsTv.count + md5sum: 89562fef808b5c3db629682d36fd86fc + - path: results/reports/vcftools/sentieon_haplotyper/test/test.haplotyper.filtered.TsTv.qual + # changes md5sum on reruns. This is somewhat unexpected, but might be due to tiny variation in very small numbers in the qual-files.
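+ # Reminder of the convention used throughout these checks: `should_exist: false` turns a path entry into a negative assertion, i.e. the test fails if the file *is* produced. A sketch with a hypothetical path:
+ # - path: results/some/unexpected/output.vcf.gz
+ #   should_exist: false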
+ - path: results/variant_calling/sentieon_haplotyper/test/test.haplotyper.filtered.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/sentieon_haplotyper/test/test.haplotyper.filtered.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/sentieon_haplotyper/test/test.haplotyper.unfiltered.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/sentieon_haplotyper/test/test.haplotyper.unfiltered.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/haplotyper + should_exist: false + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: e82e90c7d508a135b5a8a7cd6933452e + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + md5sum: 3a2030e5e8af7bc12720c3a5592bf921 + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: 615c5c5019d88045a9ff5bbe6e63d270 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz + md5sum: 9f1ea20e7461db948ba21f70c4d1b3ba + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz.csi + md5sum: 5c00a1d457c387d6e71848a6d897e309 + - path: results/reports/samtools/test/test.recal.cram.stats +- name: Run variant calling on germline sample with sentieon haplotyper without intervals + command: nextflow run main.nf -profile test_cache,software_license,targeted --sentieon_extension --input ./tests/csv/3.0/mapped_single_bam.csv --tools sentieon_haplotyper --step variant_calling --no_intervals --outdir results + tags: + - germline + - sentieon/haplotyper + - no_intervals + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: 4d3dd4f6dcb34a91a949641f2b1ac202 + - path: results/multiqc + - path: results/no_intervals.bed + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz.tbi + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/preprocessing/converted/test/test.converted.cram + # binary changes md5sums on reruns + - path: results/preprocessing/converted/test/test.converted.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false + - path: results/reports/bcftools/sentieon_haplotyper/test/test.haplotyper.filtered.bcftools_stats.txt + md5sum: 66be03d4e6535175514f54a1a031d49f + - path: results/reports/vcftools/sentieon_haplotyper/test/test.haplotyper.filtered.FILTER.summary + md5sum: d501a93356f3c91c743f51104e24514a + - path: results/reports/vcftools/sentieon_haplotyper/test/test.haplotyper.filtered.TsTv.count + md5sum: 89562fef808b5c3db629682d36fd86fc + - path: results/reports/vcftools/sentieon_haplotyper/test/test.haplotyper.filtered.TsTv.qual + # changes md5sum on reruns. This is somewhat unexpected, but might be due to tiny variation in very small numbers in the qual-files.
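+ # The fixed md5sums above can be re-derived by hand when refreshing these checksums (a sketch; paths are relative to the --outdir given in the command):
+ #   md5sum results/no_intervals.bed
+ #   # expected: f3dac01ea66b95fe477446fde2d31489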
+ - path: results/variant_calling/sentieon_haplotyper/test/test.haplotyper.filtered.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/sentieon_haplotyper/test/test.haplotyper.filtered.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/sentieon_haplotyper/test/test.haplotyper.unfiltered.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/sentieon_haplotyper/test/test.haplotyper.unfiltered.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/sentieon_haplotyper + should_exist: false + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: e82e90c7d508a135b5a8a7cd6933452e + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: 615c5c5019d88045a9ff5bbe6e63d270 + - path: results/reports/mosdepth/test/test.recal.per-base.bed.gz + md5sum: bc1df47d46f818fee5275975925d769a + - path: results/reports/mosdepth/test/test.recal.per-base.bed.gz.csi + md5sum: 9e649ac749ff6c6073bef5ab63e8aaa4 + - path: results/reports/samtools/test/test.recal.cram.stats +- name: Run variant calling on germline sample with sentieon haplotyper output gvcf + command: nextflow run main.nf -profile test_cache,software_license,targeted --sentieon_extension --input ./tests/csv/3.0/mapped_single_bam.csv --tools sentieon_haplotyper --step variant_calling --outdir results --sentieon_haplotyper_emit_mode gvcf + tags: + - germline + - sentieon/haplotyper + - variant_calling + files: + - path: results/csv/variantcalled.csv + should_exist: false + - path: results/multiqc + - path: results/preprocessing/converted/test/test.converted.cram + # binary changes md5sums on reruns + - path: results/preprocessing/converted/test/test.converted.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false + - path: results/reports/bcftools/sentieon_haplotyper/test/test.haplotyper.filtered.bcftools_stats.txt + should_exist: false + - path: results/reports/vcftools/sentieon_haplotyper/test/test.haplotyper.filtered.FILTER.summary + should_exist: false + - path: results/reports/vcftools/sentieon_haplotyper/test/test.haplotyper.filtered.TsTv.count + should_exist: false + - path: results/reports/vcftools/sentieon_haplotyper/test/test.haplotyper.filtered.TsTv.qual + should_exist: false + - path: results/variant_calling/sentieon_haplotyper/test/test.haplotyper.g.vcf.gz + - path: results/variant_calling/sentieon_haplotyper/test/test.haplotyper.g.vcf.gz.tbi + - path: results/variant_calling/sentieon_haplotyper/test/test.haplotyper.filtered.vcf.gz + should_exist: false + - path: results/variant_calling/sentieon_haplotyper/test/test.haplotyper.filtered.vcf.gz.tbi + should_exist: false + - path: results/variant_calling/sentieon_haplotyper/test/test.haplotyper.unfiltered.vcf.gz + should_exist: false + - path: results/variant_calling/sentieon_haplotyper/test/test.haplotyper.unfiltered.vcf.gz.tbi + should_exist: false + - path: results/haplotyper + should_exist: false + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: e82e90c7d508a135b5a8a7cd6933452e + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + md5sum: 3a2030e5e8af7bc12720c3a5592bf921 + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: 615c5c5019d88045a9ff5bbe6e63d270 + - path:
results/reports/mosdepth/test/test.recal.regions.bed.gz + md5sum: 9f1ea20e7461db948ba21f70c4d1b3ba + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz.csi + md5sum: 5c00a1d457c387d6e71848a6d897e309 + - path: results/reports/samtools/test/test.recal.cram.stats +- name: Run variant calling on germline sample with sentieon haplotyper output both gvcf and vcf + command: nextflow run main.nf -profile test_cache,software_license,targeted --sentieon_extension --input ./tests/csv/3.0/mapped_single_bam.csv --tools sentieon_haplotyper --step variant_calling --outdir results --sentieon_haplotyper_emit_mode variant,gvcf + tags: + - germline + - sentieon/haplotyper + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: 4d3dd4f6dcb34a91a949641f2b1ac202 + - path: results/multiqc + - path: results/preprocessing/converted/test/test.converted.cram + # binary changes md5sums on reruns + - path: results/preprocessing/converted/test/test.converted.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false + - path: results/reports/bcftools/sentieon_haplotyper/test/test.haplotyper.filtered.bcftools_stats.txt + - path: results/reports/vcftools/sentieon_haplotyper/test/test.haplotyper.filtered.FILTER.summary + - path: results/reports/vcftools/sentieon_haplotyper/test/test.haplotyper.filtered.TsTv.count + - path: results/reports/vcftools/sentieon_haplotyper/test/test.haplotyper.filtered.TsTv.qual + - path: results/variant_calling/sentieon_haplotyper/test/test.haplotyper.g.vcf.gz + - path: results/variant_calling/sentieon_haplotyper/test/test.haplotyper.g.vcf.gz.tbi + - path: results/variant_calling/sentieon_haplotyper/test/test.haplotyper.filtered.vcf.gz + - path: results/variant_calling/sentieon_haplotyper/test/test.haplotyper.filtered.vcf.gz.tbi + - path: results/variant_calling/sentieon_haplotyper/test/test.haplotyper.unfiltered.vcf.gz + - path: results/variant_calling/sentieon_haplotyper/test/test.haplotyper.unfiltered.vcf.gz.tbi + - path: results/haplotyper + should_exist: false + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: e82e90c7d508a135b5a8a7cd6933452e + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + md5sum: 3a2030e5e8af7bc12720c3a5592bf921 + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: 615c5c5019d88045a9ff5bbe6e63d270 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz + md5sum: 9f1ea20e7461db948ba21f70c4d1b3ba + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz.csi + md5sum: 5c00a1d457c387d6e71848a6d897e309 + - path: results/reports/samtools/test/test.recal.cram.stats +- name: Run variant calling on germline sample with sentieon haplotyper and gatk haplotypecaller + command: nextflow run main.nf -profile test_cache,software_license,targeted --sentieon_extension --input ./tests/csv/3.0/mapped_single_bam.csv --tools haplotypecaller,sentieon_haplotyper --step variant_calling --outdir results + tags: + - germline + - sentieon/haplotyper + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: caa9932235cf993fca208943d2e58041 + - path: results/multiqc + - path: results/preprocessing/converted/test/test.converted.cram + # binary changes md5sums on reruns + - path: results/preprocessing/converted/test/test.converted.cram.crai + # binary changes md5sums
on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false + - path: results/reports/bcftools/sentieon_haplotyper/test/test.haplotyper.filtered.bcftools_stats.txt + md5sum: 66be03d4e6535175514f54a1a031d49f + - path: results/reports/vcftools/sentieon_haplotyper/test/test.haplotyper.filtered.FILTER.summary + md5sum: d501a93356f3c91c743f51104e24514a + - path: results/reports/vcftools/sentieon_haplotyper/test/test.haplotyper.filtered.TsTv.count + md5sum: 89562fef808b5c3db629682d36fd86fc + - path: results/reports/vcftools/sentieon_haplotyper/test/test.haplotyper.filtered.TsTv.qual + # changes md5sum on reruns. This is somewhat unexpected, but might be due to tiny variation in very small numbers in the qual-files. + - path: results/variant_calling/sentieon_haplotyper/test/test.haplotyper.filtered.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/sentieon_haplotyper/test/test.haplotyper.filtered.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/sentieon_haplotyper/test/test.haplotyper.unfiltered.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/sentieon_haplotyper/test/test.haplotyper.unfiltered.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/haplotypecaller/test/test.haplotypecaller.filtered.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/haplotypecaller/test/test.haplotypecaller.filtered.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/haplotypecaller/test/test.haplotypecaller.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/haplotypecaller/test/test.haplotypecaller.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/haplotyper + should_exist: false + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: e82e90c7d508a135b5a8a7cd6933452e + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + md5sum: 3a2030e5e8af7bc12720c3a5592bf921 + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: 615c5c5019d88045a9ff5bbe6e63d270 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz + md5sum: 9f1ea20e7461db948ba21f70c4d1b3ba + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz.csi + md5sum: 5c00a1d457c387d6e71848a6d897e309 + - path: results/reports/samtools/test/test.recal.cram.stats diff --git a/tests/test_sentieon_haplotyper_joint_germline.yml b/tests/test_sentieon_haplotyper_joint_germline.yml new file mode 100644 index 0000000000..3474764886 --- /dev/null +++ b/tests/test_sentieon_haplotyper_joint_germline.yml @@ -0,0 +1,151 @@ +- name: Run joint germline variant calling with sentieon haplotyper + command: nextflow run main.nf -profile test_cache,software_license,targeted --sentieon_extension --input ./tests/csv/3.0/mapped_joint_bam.csv --tools sentieon_haplotyper --step variant_calling --joint_germline --outdir results --sentieon_haplotyper_emit_mode gvcf + tags: + - germline + - sentieon_haplotyper_joint_germline + - variant_calling + - sentieon/haplotyper + files: + - path: results/csv/variantcalled.csv + md5sum: 6ec10f6455c2b5290c7f6fc687c529ca + - path: results/multiqc + - path: results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false +
- path: results/reports/bcftools/sentieon_haplotyper/joint_variant_calling/joint_germline.bcftools_stats.txt + - path: results/reports/vcftools/sentieon_haplotyper/joint_variant_calling/joint_germline.FILTER.summary + - path: results/reports/vcftools/sentieon_haplotyper/joint_variant_calling/joint_germline.TsTv.count + - path: results/reports/vcftools/sentieon_haplotyper/joint_variant_calling/joint_germline.TsTv.qual + - path: results/variant_calling/sentieon_haplotyper/joint_variant_calling/joint_germline.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/sentieon_haplotyper/joint_variant_calling/joint_germline.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/sentieon_haplotyper/testN/testN.haplotyper.g.vcf.gz + - path: results/variant_calling/sentieon_haplotyper/testN/testN.haplotyper.g.vcf.gz.tbi + - path: results/variant_calling/sentieon_haplotyper/testT/testT.haplotyper.g.vcf.gz + - path: results/variant_calling/sentieon_haplotyper/testT/testT.haplotyper.g.vcf.gz.tbi + - path: results/haplotyper + should_exist: false + - path: results/reports/mosdepth/testN/testN.recal.mosdepth.global.dist.txt + md5sum: e82e90c7d508a135b5a8a7cd6933452e + - path: results/reports/mosdepth/testN/testN.recal.mosdepth.region.dist.txt + md5sum: 3a2030e5e8af7bc12720c3a5592bf921 + - path: results/reports/mosdepth/testN/testN.recal.mosdepth.summary.txt + md5sum: 615c5c5019d88045a9ff5bbe6e63d270 + - path: results/reports/mosdepth/testN/testN.recal.regions.bed.gz + md5sum: 9f1ea20e7461db948ba21f70c4d1b3ba + - path: results/reports/mosdepth/testN/testN.recal.regions.bed.gz.csi + md5sum: 5c00a1d457c387d6e71848a6d897e309 + - path: results/reports/samtools/testN/testN.recal.cram.stats + - path: results/reports/mosdepth/testT/testT.recal.mosdepth.global.dist.txt + md5sum: ba97ed85645f77da6f3adad138b3cdb4 + - path: results/reports/mosdepth/testT/testT.recal.mosdepth.region.dist.txt + md5sum: a7eb835371dd0aaf347ccca7ebe1eb3b + - path: results/reports/mosdepth/testT/testT.recal.mosdepth.summary.txt + md5sum: a937108cbf24c1430b79c861234ce22b + - path: results/reports/mosdepth/testT/testT.recal.regions.bed.gz + md5sum: d2b579a74bf8d858f82869f073056252 + - path: results/reports/mosdepth/testT/testT.recal.regions.bed.gz.csi + md5sum: 5c00a1d457c387d6e71848a6d897e309 + - path: results/reports/samtools/testT/testT.recal.cram.stats +- name: Run joint germline variant calling with sentieon haplotyper all intervals at once + command: nextflow run main.nf -profile test_cache,software_license,targeted --sentieon_extension --input ./tests/csv/3.0/mapped_joint_bam.csv --tools sentieon_haplotyper --step variant_calling --joint_germline --outdir results --sentieon_haplotyper_emit_mode gvcf --nucleotides_per_second 100 + tags: + - germline + - sentieon_haplotyper_joint_germline + - variant_calling + - sentieon/haplotyper + files: + - path: results/csv/variantcalled.csv + md5sum: 6ec10f6455c2b5290c7f6fc687c529ca + - path: results/multiqc + - path: results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false + - path: results/reports/bcftools/sentieon_haplotyper/joint_variant_calling/joint_germline.bcftools_stats.txt + - path: results/reports/vcftools/sentieon_haplotyper/joint_variant_calling/joint_germline.FILTER.summary + - path: results/reports/vcftools/sentieon_haplotyper/joint_variant_calling/joint_germline.TsTv.count + - path: 
results/reports/vcftools/sentieon_haplotyper/joint_variant_calling/joint_germline.TsTv.qual + - path: results/variant_calling/sentieon_haplotyper/joint_variant_calling/joint_germline.vcf.gz + - path: results/variant_calling/sentieon_haplotyper/joint_variant_calling/joint_germline.vcf.gz.tbi + - path: results/variant_calling/sentieon_haplotyper/testN/testN.haplotyper.g.vcf.gz + - path: results/variant_calling/sentieon_haplotyper/testN/testN.haplotyper.g.vcf.gz.tbi + - path: results/variant_calling/sentieon_haplotyper/testT/testT.haplotyper.g.vcf.gz + - path: results/variant_calling/sentieon_haplotyper/testT/testT.haplotyper.g.vcf.gz.tbi + - path: results/haplotyper + should_exist: false + - path: results/reports/mosdepth/testN/testN.recal.mosdepth.global.dist.txt + md5sum: e82e90c7d508a135b5a8a7cd6933452e + - path: results/reports/mosdepth/testN/testN.recal.mosdepth.region.dist.txt + md5sum: 3a2030e5e8af7bc12720c3a5592bf921 + - path: results/reports/mosdepth/testN/testN.recal.mosdepth.summary.txt + md5sum: 615c5c5019d88045a9ff5bbe6e63d270 + - path: results/reports/mosdepth/testN/testN.recal.regions.bed.gz + md5sum: 9f1ea20e7461db948ba21f70c4d1b3ba + - path: results/reports/mosdepth/testN/testN.recal.regions.bed.gz.csi + md5sum: 5c00a1d457c387d6e71848a6d897e309 + - path: results/reports/samtools/testN/testN.recal.cram.stats + - path: results/reports/mosdepth/testT/testT.recal.mosdepth.global.dist.txt + md5sum: ba97ed85645f77da6f3adad138b3cdb4 + - path: results/reports/mosdepth/testT/testT.recal.mosdepth.region.dist.txt + md5sum: a7eb835371dd0aaf347ccca7ebe1eb3b + - path: results/reports/mosdepth/testT/testT.recal.mosdepth.summary.txt + md5sum: a937108cbf24c1430b79c861234ce22b + - path: results/reports/mosdepth/testT/testT.recal.regions.bed.gz + md5sum: d2b579a74bf8d858f82869f073056252 + - path: results/reports/mosdepth/testT/testT.recal.regions.bed.gz.csi + md5sum: 5c00a1d457c387d6e71848a6d897e309 + - path: results/reports/samtools/testT/testT.recal.cram.stats +- name: Run joint germline variant calling with sentieon haplotyper with stub for VQSR + command: nextflow run main.nf -profile test_cache,software_license,tools_germline --sentieon_extension --input ./tests/csv/3.0/mapped_joint_bam.csv --tools sentieon_haplotyper --step variant_calling --joint_germline --outdir results --sentieon_haplotyper_emit_mode gvcf -stub-run + tags: + - germline + - sentieon_haplotyper_joint_germline + - variant_calling + - vqsr + files: + - path: results/csv/variantcalled.csv + md5sum: 44415aaccc30c837943aea406c2f8d9d + - path: results/multiqc + - path: results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false + - path: results/reports/bcftools/sentieon_haplotyper/joint_variant_calling/joint_germline_recalibrated.bcftools_stats.txt + - path: results/reports/vcftools/sentieon_haplotyper/joint_variant_calling/joint_germline_recalibrated.FILTER.summary + - path: results/reports/vcftools/sentieon_haplotyper/joint_variant_calling/joint_germline_recalibrated.TsTv.count + - path: results/reports/vcftools/sentieon_haplotyper/joint_variant_calling/joint_germline_recalibrated.TsTv.qual + - path: results/variant_calling/sentieon_haplotyper/joint_variant_calling/joint_germline.vcf.gz + - path: results/variant_calling/sentieon_haplotyper/joint_variant_calling/joint_germline.vcf.gz.tbi + - path: results/variant_calling/sentieon_haplotyper/joint_variant_calling/joint_germline_recalibrated.vcf.gz + - path: 
results/variant_calling/sentieon_haplotyper/joint_variant_calling/joint_germline_recalibrated.vcf.gz.tbi + - path: results/variant_calling/sentieon_haplotyper/testN/testN.haplotyper.g.vcf.gz + - path: results/variant_calling/sentieon_haplotyper/testN/testN.haplotyper.g.vcf.gz.tbi + - path: results/variant_calling/sentieon_haplotyper/testT/testT.haplotyper.g.vcf.gz + - path: results/variant_calling/sentieon_haplotyper/testT/testT.haplotyper.g.vcf.gz.tbi + - path: results/haplotyper + should_exist: false + - path: results/reports/mosdepth/testN/testN.recal.global.dist.txt + md5sum: d41d8cd98f00b204e9800998ecf8427e + - path: results/reports/mosdepth/testN/testN.recal.region.dist.txt + md5sum: d41d8cd98f00b204e9800998ecf8427e + - path: results/reports/mosdepth/testN/testN.recal.summary.txt + md5sum: d41d8cd98f00b204e9800998ecf8427e + - path: results/reports/mosdepth/testN/testN.recal.regions.bed.gz + md5sum: d41d8cd98f00b204e9800998ecf8427e + - path: results/reports/mosdepth/testN/testN.recal.regions.bed.gz.csi + md5sum: d41d8cd98f00b204e9800998ecf8427e + - path: results/reports/samtools/testN/testN.recal.cram.stats + - path: results/reports/mosdepth/testT/testT.recal.global.dist.txt + md5sum: d41d8cd98f00b204e9800998ecf8427e + - path: results/reports/mosdepth/testT/testT.recal.region.dist.txt + md5sum: d41d8cd98f00b204e9800998ecf8427e + - path: results/reports/mosdepth/testT/testT.recal.summary.txt + md5sum: d41d8cd98f00b204e9800998ecf8427e + - path: results/reports/mosdepth/testT/testT.recal.regions.bed.gz + md5sum: d41d8cd98f00b204e9800998ecf8427e + - path: results/reports/mosdepth/testT/testT.recal.regions.bed.gz.csi + md5sum: d41d8cd98f00b204e9800998ecf8427e + - path: results/reports/samtools/testT/testT.recal.cram.stats diff --git a/tests/test_sentieon_haplotyper_skip_filter.yml b/tests/test_sentieon_haplotyper_skip_filter.yml new file mode 100644 index 0000000000..f333f41558 --- /dev/null +++ b/tests/test_sentieon_haplotyper_skip_filter.yml @@ -0,0 +1,99 @@ +- name: Run variant calling on germline sample with sentieon haplotyper and skip filter + command: nextflow run main.nf -profile test_cache,software_license,targeted --sentieon_extension --input ./tests/csv/3.0/mapped_single_bam.csv --tools sentieon_haplotyper --step variant_calling --skip_tools haplotyper_filter --outdir results + tags: + - germline + - sentieon_haplotyper_skip_filter + - variant_calling + - sentieon/haplotyper + files: + - path: results/csv/variantcalled.csv + md5sum: e08d6aa77d914bc2c933e70696b74cdd + - path: results/multiqc + - path: results/preprocessing/converted/test/test.converted.cram + # binary changes md5sums on reruns + - path: results/preprocessing/converted/test/test.converted.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false + - path: results/reports/bcftools/sentieon_haplotyper/test/test.haplotyper.unfiltered.bcftools_stats.txt + md5sum: d2660f9f6074b4bf18756c42ee656b8f + - path: results/reports/vcftools/sentieon_haplotyper/test/test.haplotyper.unfiltered.FILTER.summary + md5sum: 01b3d10464a3ac86f90ee82cdda23f68 + - path: results/reports/vcftools/sentieon_haplotyper/test/test.haplotyper.unfiltered.TsTv.count + md5sum: 89562fef808b5c3db629682d36fd86fc + - path: results/reports/vcftools/sentieon_haplotyper/test/test.haplotyper.unfiltered.TsTv.qual + # changes md5sum on reruns. 
This is somewhat unexpected, but might be due to tiny variation in very small numbers in the qual-files. + - path: results/variant_calling/sentieon_haplotyper/test/test.haplotyper.filtered.vcf.gz + should_exist: false + - path: results/variant_calling/sentieon_haplotyper/test/test.haplotyper.filtered.vcf.gz.tbi + should_exist: false + - path: results/variant_calling/sentieon_haplotyper/test/test.haplotyper.unfiltered.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/sentieon_haplotyper/test/test.haplotyper.unfiltered.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/sentieon_haplotyper + should_exist: false + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: e82e90c7d508a135b5a8a7cd6933452e + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: 4f0d231060cbde4efdd673863bd2fb59 + - path: results/reports/mosdepth/test/test.recal.per-base.bed.gz + md5sum: bc1df47d46f818fee5275975925d769a + - path: results/reports/mosdepth/test/test.recal.per-base.bed.gz.csi + md5sum: 9e649ac749ff6c6073bef5ab63e8aaa4 + - path: results/reports/samtools/test/test.recal.cram.stats +- name: Run variant calling on germline sample with sentieon haplotyper without intervals and skip filter + command: nextflow run main.nf -profile test_cache,software_license,targeted --sentieon_extension --input ./tests/csv/3.0/mapped_single_bam.csv --tools sentieon_haplotyper --step variant_calling --skip_tools haplotyper_filter --no_intervals --outdir results + tags: + - germline + - sentieon_haplotyper_skip_filter + - no_intervals + - variant_calling + - sentieon/haplotyper + files: + - path: results/csv/variantcalled.csv + md5sum: e08d6aa77d914bc2c933e70696b74cdd + - path: results/multiqc + - path: results/no_intervals.bed + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz.tbi + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/preprocessing/converted/test/test.converted.cram + # binary changes md5sums on reruns + - path: results/preprocessing/converted/test/test.converted.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram + should_exist: false + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + should_exist: false + - path: results/reports/bcftools/sentieon_haplotyper/test/test.haplotyper.unfiltered.bcftools_stats.txt + md5sum: d2660f9f6074b4bf18756c42ee656b8f + - path: results/reports/vcftools/sentieon_haplotyper/test/test.haplotyper.unfiltered.FILTER.summary + md5sum: 01b3d10464a3ac86f90ee82cdda23f68 + - path: results/reports/vcftools/sentieon_haplotyper/test/test.haplotyper.unfiltered.TsTv.count + md5sum: 89562fef808b5c3db629682d36fd86fc + - path: results/reports/vcftools/sentieon_haplotyper/test/test.haplotyper.unfiltered.TsTv.qual + # changes md5sum on reruns. This is somewhat unexpected, but might be due to tiny variation in very small numbers in the qual-files.
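+ # Note: `--skip_tools haplotyper_filter` in the command above is what makes the *.filtered.vcf.gz outputs absent, hence the paired `should_exist: false` checks that follow.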
+ - path: results/variant_calling/sentieon_haplotyper/test/test.haplotyper.filtered.vcf.gz + should_exist: false + - path: results/variant_calling/sentieon_haplotyper/test/test.haplotyper.filtered.vcf.gz.tbi + should_exist: false + - path: results/variant_calling/sentieon_haplotyper/test/test.haplotyper.unfiltered.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/sentieon_haplotyper/test/test.haplotyper.unfiltered.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/sentieon_haplotyper + should_exist: false + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: e82e90c7d508a135b5a8a7cd6933452e + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: 4f0d231060cbde4efdd673863bd2fb59 + - path: results/reports/mosdepth/test/test.recal.per-base.bed.gz + md5sum: bc1df47d46f818fee5275975925d769a + - path: results/reports/mosdepth/test/test.recal.per-base.bed.gz.csi + md5sum: 9e649ac749ff6c6073bef5ab63e8aaa4 + - path: results/reports/samtools/test/test.recal.cram.stats diff --git a/tests/test_skip_all_qc.yml b/tests/test_skip_all_qc.yml new file mode 100644 index 0000000000..9e2ef2ceb1 --- /dev/null +++ b/tests/test_skip_all_qc.yml @@ -0,0 +1,52 @@ +- name: Run default pipeline with skipping all QC steps + command: nextflow run main.nf -profile test_cache --skip_tools fastqc,markduplicates_report,mosdepth,multiqc,samtools --outdir results + tags: + - default_extended + - preprocessing + - skip_all_qc + - skip_qc + - variant_calling + files: + - path: results/csv/markduplicates.csv + md5sum: 0d6120bb99e92f6810343270711ca53e + - path: results/csv/markduplicates_no_table.csv + md5sum: 2a2d3d4842befd4def39156463859ee3 + - path: results/csv/recalibrated.csv + md5sum: 2d29d9e53894dcce96a1b5beb6ef3312 + - path: results/preprocessing/markduplicates/test/test.md.cram + # binary changes md5sums on reruns + - path: results/preprocessing/markduplicates/test/test.md.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recal_table/test/test.recal.table + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + # binary changes md5sums on reruns + - path: results/multiqc + should_exist: false + - path: results/reports/fastqc + should_exist: false + - path: results/reports/markduplicates + should_exist: false + - path: results/reports/mosdepth + should_exist: false + - path: results/reports/samtools + should_exist: false + - path: results/reports/bcftools/strelka/test/test.strelka.variants.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/vcftools/strelka/test/test.strelka.variants.FILTER.summary + md5sum: ad417bc96d31223f61170987975d8128 + - path: results/reports/vcftools/strelka/test/test.strelka.variants.TsTv.count + md5sum: fa27f678965b7cba6a92efcd039f802a + - path: results/reports/vcftools/strelka/test/test.strelka.variants.TsTv.qual + md5sum: bc68ae4e688e9fb772b457069e604883 + - path: results/variant_calling/strelka/test/test.strelka.genome.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/test/test.strelka.genome.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/test/test.strelka.variants.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/test/test.strelka.variants.vcf.gz.tbi + - path: 
results/strelka + should_exist: false diff --git a/tests/test_skip_markduplicates.yml b/tests/test_skip_markduplicates.yml new file mode 100644 index 0000000000..d683b6952d --- /dev/null +++ b/tests/test_skip_markduplicates.yml @@ -0,0 +1,163 @@ +- name: Run default pipeline with skipping MarkDuplicates + command: nextflow run main.nf -profile test_cache,skip_markduplicates --outdir results + tags: + - default_extended + - preprocessing + - skip_markduplicates + files: + - path: results/csv/mapped.csv + md5sum: 3bee45ccf65e301ce09ee4eed8f26933 + - path: results/csv/recalibrated.csv + md5sum: 2d29d9e53894dcce96a1b5beb6ef3312 + - path: results/multiqc + - path: results/preprocessing/mapped/test/test.sorted.cram + # binary changes md5sums on reruns + - path: results/preprocessing/mapped/test/test.sorted.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recal_table/test/test.recal.table + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + # binary changes md5sums on reruns + - path: results/reports/fastqc/test-test_L1 + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: 3ab32cc98996e0f12b8088b99dd1e2d1 + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + md5sum: 1653d9aa161a78d8574269083f7d92f1 + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: b8874be5d830a2d03d42bccad7c996d3 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz + md5sum: 67ac075b077723fb1cce4b026dcdf0af + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz.csi + md5sum: 3230b6808c22d4907d18910f2dc2daf2 + - path: results/reports/mosdepth/test/test.sorted.mosdepth.global.dist.txt + md5sum: 3ab32cc98996e0f12b8088b99dd1e2d1 + - path: results/reports/mosdepth/test/test.sorted.mosdepth.region.dist.txt + md5sum: 1653d9aa161a78d8574269083f7d92f1 + - path: results/reports/mosdepth/test/test.sorted.mosdepth.summary.txt + md5sum: b8874be5d830a2d03d42bccad7c996d3 + - path: results/reports/mosdepth/test/test.sorted.regions.bed.gz + md5sum: 67ac075b077723fb1cce4b026dcdf0af + - path: results/reports/mosdepth/test/test.sorted.regions.bed.gz.csi + md5sum: 3230b6808c22d4907d18910f2dc2daf2 + - path: results/reports/samtools/test/test.recal.cram.stats + # conda changes md5sums for test + - path: results/reports/samtools/test/test.sorted.cram.stats + # conda changes md5sums for test + - path: results/csv/markduplicates.csv + should_exist: false + - path: results/csv/markduplicates_no_table.csv + should_exist: false + - path: results/preprocessing/mapped/test/test.bam + should_exist: false + - path: results/preprocessing/mapped/test/test.sorted.bam + should_exist: false +- name: Run default pipeline with skipping MarkDuplicates with save_mapped + command: nextflow run main.nf -profile test_cache,skip_markduplicates --save_mapped --outdir results + tags: + - default_extended + - preprocessing + - save_mapped + - skip_markduplicates + files: + - path: results/csv/mapped.csv + md5sum: 3bee45ccf65e301ce09ee4eed8f26933 + - path: results/csv/recalibrated.csv + md5sum: 2d29d9e53894dcce96a1b5beb6ef3312 + - path: results/multiqc + - path: results/preprocessing/mapped/test/test.sorted.cram + # binary changes md5sums on reruns + - path: results/preprocessing/mapped/test/test.sorted.cram.crai + # binary changes md5sums on reruns + - path:
results/preprocessing/recal_table/test/test.recal.table + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + # binary changes md5sums on reruns + - path: results/reports/fastqc/test-test_L1 + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: 3ab32cc98996e0f12b8088b99dd1e2d1 + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + md5sum: 1653d9aa161a78d8574269083f7d92f1 + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: b8874be5d830a2d03d42bccad7c996d3 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz + md5sum: 67ac075b077723fb1cce4b026dcdf0af + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz.csi + md5sum: 3230b6808c22d4907d18910f2dc2daf2 + - path: results/reports/mosdepth/test/test.sorted.mosdepth.global.dist.txt + md5sum: 3ab32cc98996e0f12b8088b99dd1e2d1 + - path: results/reports/mosdepth/test/test.sorted.mosdepth.region.dist.txt + md5sum: 1653d9aa161a78d8574269083f7d92f1 + - path: results/reports/mosdepth/test/test.sorted.mosdepth.summary.txt + md5sum: b8874be5d830a2d03d42bccad7c996d3 + - path: results/reports/mosdepth/test/test.sorted.regions.bed.gz + md5sum: 67ac075b077723fb1cce4b026dcdf0af + - path: results/reports/mosdepth/test/test.sorted.regions.bed.gz.csi + md5sum: 3230b6808c22d4907d18910f2dc2daf2 + - path: results/reports/samtools/test/test.recal.cram.stats + # conda changes md5sums for test + - path: results/reports/samtools/test/test.sorted.cram.stats + # conda changes md5sums for test + - path: results/csv/markduplicates.csv + should_exist: false + - path: results/csv/markduplicates_no_table.csv + should_exist: false + - path: results/preprocessing/mapped/test/test.bam + should_exist: false +- name: Run default pipeline with skipping MarkDuplicates with save_mapped & save_output_as_bam + command: nextflow run main.nf -profile test_cache,skip_markduplicates --save_mapped --save_output_as_bam --outdir results + tags: + - default_extended + - preprocessing + - save_output_as_bam + - skip_markduplicates + files: + - path: results/csv/mapped.csv + md5sum: 7f21bf40d3fbc248ee2ea3fdf0f7cdb2 + - path: results/csv/recalibrated.csv + md5sum: 2dfbcaaeaaf4937c51c5c310f1c77614 + - path: results/multiqc + - path: results/preprocessing/mapped/test/test.sorted.bam + # binary changes md5sums on reruns + - path: results/preprocessing/mapped/test/test.sorted.bam.bai + # binary changes md5sums on reruns + - path: results/preprocessing/recal_table/test/test.recal.table + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.bam + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.bam.bai + # binary changes md5sums on reruns + - path: results/reports/fastqc/test-test_L1 + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: 3ab32cc98996e0f12b8088b99dd1e2d1 + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + md5sum: 1653d9aa161a78d8574269083f7d92f1 + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: b8874be5d830a2d03d42bccad7c996d3 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz + md5sum: 67ac075b077723fb1cce4b026dcdf0af + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz.csi + md5sum: 3230b6808c22d4907d18910f2dc2daf2 + -
path: results/reports/mosdepth/test/test.sorted.mosdepth.global.dist.txt + md5sum: 3ab32cc98996e0f12b8088b99dd1e2d1 + - path: results/reports/mosdepth/test/test.sorted.mosdepth.region.dist.txt + md5sum: 1653d9aa161a78d8574269083f7d92f1 + - path: results/reports/mosdepth/test/test.sorted.mosdepth.summary.txt + md5sum: b8874be5d830a2d03d42bccad7c996d3 + - path: results/reports/mosdepth/test/test.sorted.regions.bed.gz + md5sum: 67ac075b077723fb1cce4b026dcdf0af + - path: results/reports/mosdepth/test/test.sorted.regions.bed.gz.csi + md5sum: 3230b6808c22d4907d18910f2dc2daf2 + - path: results/reports/samtools/test/test.recal.cram.stats + # conda changes md5sums for test + - path: results/reports/samtools/test/test.sorted.cram.stats + # conda changes md5sums for test + - path: results/csv/markduplicates.csv + should_exist: false + - path: results/csv/markduplicates_no_table.csv + should_exist: false + - path: results/preprocessing/mapped/test/test.bam + should_exist: false diff --git a/tests/test_strelka.yml b/tests/test_strelka.yml new file mode 100644 index 0000000000..766d9e18c7 --- /dev/null +++ b/tests/test_strelka.yml @@ -0,0 +1,445 @@ +- name: Skip variant calling on matched normal + command: nextflow run main.nf -profile test_cache,variantcalling_channels --tools strelka --only_paired_variant_calling --outdir results + tags: + - somatic + - strelka + - variantcalling_channel + files: + - path: results/multiqc + - path: results/reports/bcftools/strelka/sample1/sample1.strelka.variants.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/bcftools/strelka/sample2/sample2.strelka.variants.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/bcftools/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_indels.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/bcftools/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_snvs.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/vcftools/strelka/sample1/sample1.strelka.variants.FILTER.summary + md5sum: 2048a5de0201a6052c988a0189979a5f + - path: results/reports/vcftools/strelka/sample1/sample1.strelka.variants.TsTv.count + md5sum: c5b7a8eda2526d899098439ae4c06a49 + - path: results/reports/vcftools/strelka/sample1/sample1.strelka.variants.TsTv.qual + # conda changes md5sums for test + - path: results/reports/vcftools/strelka/sample2/sample2.strelka.variants.FILTER.summary + md5sum: fa3112841a4575d104916027c8851b30 + - path: results/reports/vcftools/strelka/sample2/sample2.strelka.variants.TsTv.count + md5sum: d7f54d09d38af01a574a4930af21cfc9 + - path: results/reports/vcftools/strelka/sample2/sample2.strelka.variants.TsTv.qual + contains: + [ + "19 453 47848 0.00946748 11 50 0.22", + "56 456 47875 0.0095248 8 25 0.32", + "72 458 47880 0.00956558 6 20 0.3", + "314 463 47899 0.00966617 1 1 1", + ] + - path: results/reports/vcftools/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_indels.FILTER.summary + md5sum: 3441628cd6550ed459ca1c3db989ceea + - path: results/reports/vcftools/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_indels.TsTv.count + md5sum: 8dcfdbcaac118df1d5ad407dd2af699f + - path: results/reports/vcftools/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_indels.TsTv.qual + # conda changes md5sums for test + - path: results/reports/vcftools/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_snvs.FILTER.summary + md5sum: 4fc17fa5625b4d1dcc5d791b1eb22d85 + - path: 
results/reports/vcftools/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_snvs.TsTv.count + md5sum: fc7af1f534890c4ad3025588b3af62ae + - path: results/reports/vcftools/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_snvs.TsTv.qual + # conda changes md5sums for test + - path: results/variant_calling/strelka/sample1/sample1.strelka.genome.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample1/sample1.strelka.genome.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample1/sample1.strelka.variants.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample1/sample1.strelka.variants.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample2/sample2.strelka.genome.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample2/sample2.strelka.genome.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample2/sample2.strelka.variants.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample2/sample2.strelka.variants.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_indels.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_indels.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_snvs.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_snvs.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample3/sample3.strelka.variants.vcf.gz + should_exist: false + - path: results/variant_calling/strelka/sample3/sample3.strelka.variants.vcf.gz.tbi + should_exist: false + - path: results/variant_calling/strelka/sample3/sample3.strelka.genome.vcf.gz + should_exist: false + - path: results/variant_calling/strelka/sample3/sample3.strelka.genome.vcf.gz.tbi + should_exist: false + - path: results/reports/mosdepth/sample1/sample1.recal.mosdepth.global.dist.txt + md5sum: 69e29702ef01fd8f6c7a5468fc35a16a + - path: results/reports/mosdepth/sample1/sample1.recal.mosdepth.region.dist.txt + md5sum: 6ec49cd7d510c2eb3d9d90fdb79b783a + - path: results/reports/mosdepth/sample1/sample1.recal.mosdepth.summary.txt + md5sum: 103098d0bf76ed82d2b87d5f242b099a + - path: results/reports/mosdepth/sample1/sample1.recal.regions.bed.gz + md5sum: b5888cf7395c57d39879a5faa6159eb3 + - path: results/reports/mosdepth/sample1/sample1.recal.regions.bed.gz.csi + md5sum: 9cb0ad7039a3b703d16ca7d5b835c0ee + - path: results/reports/samtools/sample1/sample1.recal.cram.stats + - path: results/reports/mosdepth/sample2/sample2.recal.mosdepth.global.dist.txt + md5sum: f2dcd00a64947c49e8e4b93c2f4fbf27 + - path: results/reports/mosdepth/sample2/sample2.recal.mosdepth.region.dist.txt + md5sum: 39005ffaac22871ffaaf19656fe69c5b + - path: results/reports/mosdepth/sample2/sample2.recal.mosdepth.summary.txt + md5sum: 68d4b98f17361fddf73052ead34fa370 + - path: results/reports/mosdepth/sample2/sample2.recal.regions.bed.gz + md5sum: 2819e995eafded35f53328c4ec19ba58 + - path: results/reports/mosdepth/sample2/sample2.recal.regions.bed.gz.csi + md5sum: 393c2749068304d8545b501b9d4658e4 + - path:
results/reports/samtools/sample2/sample2.recal.cram.stats + - path: results/reports/mosdepth/sample3/sample3.recal.mosdepth.global.dist.txt + md5sum: 69e29702ef01fd8f6c7a5468fc35a16a + - path: results/reports/mosdepth/sample3/sample3.recal.mosdepth.region.dist.txt + md5sum: 6ec49cd7d510c2eb3d9d90fdb79b783a + - path: results/reports/mosdepth/sample3/sample3.recal.mosdepth.summary.txt + md5sum: 103098d0bf76ed82d2b87d5f242b099a + - path: results/reports/mosdepth/sample3/sample3.recal.regions.bed.gz + md5sum: b5888cf7395c57d39879a5faa6159eb3 + - path: results/reports/mosdepth/sample3/sample3.recal.regions.bed.gz.csi + md5sum: 9cb0ad7039a3b703d16ca7d5b835c0ee + - path: results/reports/samtools/sample3/sample3.recal.cram.stats + # conda changes md5sums for test + - path: results/reports/mosdepth/sample4/sample4.recal.mosdepth.global.dist.txt + md5sum: f2dcd00a64947c49e8e4b93c2f4fbf27 + - path: results/reports/mosdepth/sample4/sample4.recal.mosdepth.region.dist.txt + md5sum: 39005ffaac22871ffaaf19656fe69c5b + - path: results/reports/mosdepth/sample4/sample4.recal.mosdepth.summary.txt + md5sum: 68d4b98f17361fddf73052ead34fa370 + - path: results/reports/mosdepth/sample4/sample4.recal.regions.bed.gz + md5sum: 2819e995eafded35f53328c4ec19ba58 + - path: results/reports/mosdepth/sample4/sample4.recal.regions.bed.gz.csi + md5sum: 393c2749068304d8545b501b9d4658e4 + - path: results/reports/samtools/sample4/sample4.recal.cram.stats + # conda changes md5sums for test +- name: Run variant calling on germline sample with strelka + command: nextflow run main.nf -profile test_cache,tools_germline --tools strelka --outdir results + tags: + - germline + - strelka + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: cd8a47dfc3e44c395e9f693770ccc6c9 + - path: results/multiqc + - path: results/reports/bcftools/strelka/sample1/sample1.strelka.variants.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/vcftools/strelka/sample1/sample1.strelka.variants.FILTER.summary + md5sum: 2048a5de0201a6052c988a0189979a5f + - path: results/reports/vcftools/strelka/sample1/sample1.strelka.variants.TsTv.count + md5sum: c5b7a8eda2526d899098439ae4c06a49 + - path: results/reports/vcftools/strelka/sample1/sample1.strelka.variants.TsTv.qual + # conda changes md5sums for test + - path: results/variant_calling/strelka/sample1/sample1.strelka.genome.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample1/sample1.strelka.genome.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample1/sample1.strelka.variants.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample1/sample1.strelka.variants.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/strelka + should_exist: false + - path: results/reports/mosdepth/sample1/sample1.recal.mosdepth.global.dist.txt + md5sum: 69e29702ef01fd8f6c7a5468fc35a16a + - path: results/reports/mosdepth/sample1/sample1.recal.mosdepth.region.dist.txt + md5sum: 6ec49cd7d510c2eb3d9d90fdb79b783a + - path: results/reports/mosdepth/sample1/sample1.recal.mosdepth.summary.txt + md5sum: 103098d0bf76ed82d2b87d5f242b099a + - path: results/reports/mosdepth/sample1/sample1.recal.regions.bed.gz + md5sum: b5888cf7395c57d39879a5faa6159eb3 + - path: results/reports/mosdepth/sample1/sample1.recal.regions.bed.gz.csi + md5sum: 9cb0ad7039a3b703d16ca7d5b835c0ee + - path: results/reports/samtools/sample1/sample1.recal.cram.stats +- name: Run variant calling on 
germline sample with strelka without intervals + command: nextflow run main.nf -profile test_cache,tools_germline --tools strelka --no_intervals --outdir results + tags: + - germline + - strelka + - no_intervals + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: cd8a47dfc3e44c395e9f693770ccc6c9 + - path: results/multiqc + - path: results/no_intervals.bed + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz.tbi + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/reports/bcftools/strelka/sample1/sample1.strelka.variants.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/vcftools/strelka/sample1/sample1.strelka.variants.FILTER.summary + md5sum: 2b7be6ff481fddc655210b836587810d + - path: results/reports/vcftools/strelka/sample1/sample1.strelka.variants.TsTv.count + md5sum: 1481854d2a765f5641856ecf95ca4097 + - path: results/reports/vcftools/strelka/sample1/sample1.strelka.variants.TsTv.qual + # conda changes md5sums for test + - path: results/variant_calling/strelka/sample1/sample1.strelka.genome.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample1/sample1.strelka.genome.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample1/sample1.strelka.variants.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample1/sample1.strelka.variants.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/strelka + should_exist: false + - path: results/reports/mosdepth/sample1/sample1.recal.mosdepth.global.dist.txt + md5sum: 69e29702ef01fd8f6c7a5468fc35a16a + - path: results/reports/mosdepth/sample1/sample1.recal.mosdepth.summary.txt + md5sum: d2775eb102acc5950f7f53883dcb503d + - path: results/reports/mosdepth/sample1/sample1.recal.per-base.bed.gz + md5sum: 54431f155c9538809e8caf99d1a75ec7 + - path: results/reports/mosdepth/sample1/sample1.recal.per-base.bed.gz.csi + md5sum: c67dcd711b096eb42f43784d5eadbc0d + - path: results/reports/samtools/sample1/sample1.recal.cram.stats +- name: Run variant calling on tumor only sample with strelka + command: nextflow run main.nf -profile test_cache,tools_tumoronly --tools strelka --outdir results + tags: + - strelka + - tumor_only + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: 8d2a5e0ad12781c99e773b828e478d35 + - path: results/multiqc + - path: results/reports/bcftools/strelka/sample2/sample2.strelka.variants.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/vcftools/strelka/sample2/sample2.strelka.variants.FILTER.summary + md5sum: fa3112841a4575d104916027c8851b30 + - path: results/reports/vcftools/strelka/sample2/sample2.strelka.variants.TsTv.count + md5sum: d7f54d09d38af01a574a4930af21cfc9 + - path: results/reports/vcftools/strelka/sample2/sample2.strelka.variants.TsTv.qual + contains: ["19 453 47848 0.00946748 11 50 0.22", "72 458 47880 0.00956558 6 20 0.3", "314 463 47899 0.00966617 1 1 1"] + - path: results/variant_calling/strelka/sample2/sample2.strelka.genome.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample2/sample2.strelka.genome.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample2/sample2.strelka.variants.vcf.gz + # binary changes md5sums on reruns + - path: 
results/variant_calling/strelka/sample2/sample2.strelka.variants.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/strelka + should_exist: false + - path: results/reports/mosdepth/sample2/sample2.recal.mosdepth.global.dist.txt + md5sum: f2dcd00a64947c49e8e4b93c2f4fbf27 + - path: results/reports/mosdepth/sample2/sample2.recal.mosdepth.region.dist.txt + md5sum: 39005ffaac22871ffaaf19656fe69c5b + - path: results/reports/mosdepth/sample2/sample2.recal.mosdepth.summary.txt + md5sum: 68d4b98f17361fddf73052ead34fa370 + - path: results/reports/mosdepth/sample2/sample2.recal.regions.bed.gz + md5sum: 2819e995eafded35f53328c4ec19ba58 + - path: results/reports/mosdepth/sample2/sample2.recal.regions.bed.gz.csi + md5sum: 393c2749068304d8545b501b9d4658e4 + - path: results/reports/samtools/sample2/sample2.recal.cram.stats +- name: Run variant calling on tumor only sample with strelka without intervals + command: nextflow run main.nf -profile test_cache,tools_tumoronly --tools strelka --no_intervals --outdir results + tags: + - no_intervals + - strelka + - tumor_only + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: 8d2a5e0ad12781c99e773b828e478d35 + - path: results/multiqc + - path: results/no_intervals.bed + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz.tbi + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/reports/bcftools/strelka/sample2/sample2.strelka.variants.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/vcftools/strelka/sample2/sample2.strelka.variants.FILTER.summary + md5sum: d1dcce19d82ced016724ace746e95d01 + - path: results/reports/vcftools/strelka/sample2/sample2.strelka.variants.TsTv.count + md5sum: 9de35bbe9ebe45166b6bd195717f733a + - path: results/reports/vcftools/strelka/sample2/sample2.strelka.variants.TsTv.qual + # conda changes md5sums for test + - path: results/variant_calling/strelka/sample2/sample2.strelka.genome.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample2/sample2.strelka.genome.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample2/sample2.strelka.variants.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample2/sample2.strelka.variants.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/strelka + should_exist: false + - path: results/reports/mosdepth/sample2/sample2.recal.mosdepth.global.dist.txt + md5sum: f2dcd00a64947c49e8e4b93c2f4fbf27 + - path: results/reports/mosdepth/sample2/sample2.recal.mosdepth.summary.txt + md5sum: 0a7300e56eda6fba7c7564f00aa000f0 + - path: results/reports/mosdepth/sample2/sample2.recal.per-base.bed.gz + md5sum: 3de4a9f4da2f2b4909ef192452a8d211 + - path: results/reports/mosdepth/sample2/sample2.recal.per-base.bed.gz.csi + md5sum: cfb07b0ba46e8468b4342edb243536f3 + - path: results/reports/samtools/sample2/sample2.recal.cram.stats +- name: Run variant calling on somatic sample with strelka + command: nextflow run main.nf -profile test_cache,tools_somatic --tools strelka --outdir results + tags: + - somatic + - strelka + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: 31ccee9472fed8bd15798724c62aee15 + - path: results/multiqc + - path: results/reports/bcftools/strelka/sample3/sample3.strelka.variants.bcftools_stats.txt + # conda changes md5sums for test + - path: 
results/reports/bcftools/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_indels.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/bcftools/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_snvs.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/vcftools/strelka/sample3/sample3.strelka.variants.FILTER.summary + md5sum: 2048a5de0201a6052c988a0189979a5f + - path: results/reports/vcftools/strelka/sample3/sample3.strelka.variants.TsTv.count + md5sum: c5b7a8eda2526d899098439ae4c06a49 + - path: results/reports/vcftools/strelka/sample3/sample3.strelka.variants.TsTv.qual + # conda changes md5sums for test + - path: results/reports/vcftools/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_indels.FILTER.summary + md5sum: 3441628cd6550ed459ca1c3db989ceea + - path: results/reports/vcftools/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_indels.TsTv.count + md5sum: 8dcfdbcaac118df1d5ad407dd2af699f + - path: results/reports/vcftools/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_indels.TsTv.qual + # conda changes md5sums for test + - path: results/reports/vcftools/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_snvs.FILTER.summary + md5sum: 4fc17fa5625b4d1dcc5d791b1eb22d85 + - path: results/reports/vcftools/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_snvs.TsTv.count + md5sum: fc7af1f534890c4ad3025588b3af62ae + - path: results/reports/vcftools/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_snvs.TsTv.qual + # conda changes md5sums for test + - path: results/variant_calling/strelka/sample3/sample3.strelka.genome.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample3/sample3.strelka.genome.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample3/sample3.strelka.variants.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample3/sample3.strelka.variants.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_indels.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_indels.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_snvs.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_snvs.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/strelka + should_exist: false + - path: results/reports/mosdepth/sample3/sample3.recal.mosdepth.global.dist.txt + md5sum: 69e29702ef01fd8f6c7a5468fc35a16a + - path: results/reports/mosdepth/sample3/sample3.recal.mosdepth.region.dist.txt + md5sum: 6ec49cd7d510c2eb3d9d90fdb79b783a + - path: results/reports/mosdepth/sample3/sample3.recal.mosdepth.summary.txt + md5sum: 103098d0bf76ed82d2b87d5f242b099a + - path: results/reports/mosdepth/sample3/sample3.recal.regions.bed.gz + md5sum: b5888cf7395c57d39879a5faa6159eb3 + - path: results/reports/mosdepth/sample3/sample3.recal.regions.bed.gz.csi + md5sum: 9cb0ad7039a3b703d16ca7d5b835c0ee + - path: results/reports/samtools/sample3/sample3.recal.cram.stats + # conda changes md5sums for test + - path: results/reports/mosdepth/sample4/sample4.recal.mosdepth.global.dist.txt + md5sum: 
f2dcd00a64947c49e8e4b93c2f4fbf27 + - path: results/reports/mosdepth/sample4/sample4.recal.mosdepth.region.dist.txt + md5sum: 39005ffaac22871ffaaf19656fe69c5b + - path: results/reports/mosdepth/sample4/sample4.recal.mosdepth.summary.txt + md5sum: 68d4b98f17361fddf73052ead34fa370 + - path: results/reports/mosdepth/sample4/sample4.recal.regions.bed.gz + md5sum: 2819e995eafded35f53328c4ec19ba58 + - path: results/reports/mosdepth/sample4/sample4.recal.regions.bed.gz.csi + md5sum: 393c2749068304d8545b501b9d4658e4 + - path: results/reports/samtools/sample4/sample4.recal.cram.stats + # conda changes md5sums for test +- name: Run variant calling on somatic sample with strelka without intervals + command: nextflow run main.nf -profile test_cache,tools_somatic --tools strelka --no_intervals --outdir results + tags: + - no_intervals + - somatic + - strelka + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: 31ccee9472fed8bd15798724c62aee15 + - path: results/multiqc + - path: results/no_intervals.bed + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz.tbi + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/reports/bcftools/strelka/sample3/sample3.strelka.variants.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/bcftools/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_indels.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/bcftools/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_snvs.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/vcftools/strelka/sample3/sample3.strelka.variants.FILTER.summary + md5sum: 2b7be6ff481fddc655210b836587810d + - path: results/reports/vcftools/strelka/sample3/sample3.strelka.variants.TsTv.count + md5sum: 1481854d2a765f5641856ecf95ca4097 + - path: results/reports/vcftools/strelka/sample3/sample3.strelka.variants.TsTv.qual + # conda changes md5sums for test + - path: results/reports/vcftools/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_indels.FILTER.summary + md5sum: 3441628cd6550ed459ca1c3db989ceea + - path: results/reports/vcftools/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_indels.TsTv.count + md5sum: 8dcfdbcaac118df1d5ad407dd2af699f + - path: results/reports/vcftools/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_indels.TsTv.qual + # conda changes md5sums for test + - path: results/reports/vcftools/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_snvs.FILTER.summary + md5sum: 7a81b11aa29fec73d5bc872b7b58f8aa + - path: results/reports/vcftools/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_snvs.TsTv.count + md5sum: a922c51ca3b2ea7cdcfa09e9c8c55d52 + - path: results/reports/vcftools/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_snvs.TsTv.qual + # conda changes md5sums for test + - path: results/variant_calling/strelka/sample3/sample3.strelka.genome.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample3/sample3.strelka.genome.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample3/sample3.strelka.variants.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample3/sample3.strelka.variants.vcf.gz.tbi + # binary changes md5sums on reruns + - path: 
results/variant_calling/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_indels.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_indels.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_snvs.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_snvs.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/strelka + should_exist: false + - path: results/reports/mosdepth/sample3/sample3.recal.mosdepth.global.dist.txt + md5sum: 69e29702ef01fd8f6c7a5468fc35a16a + - path: results/reports/mosdepth/sample3/sample3.recal.mosdepth.summary.txt + md5sum: d2775eb102acc5950f7f53883dcb503d + - path: results/reports/mosdepth/sample3/sample3.recal.per-base.bed.gz + md5sum: 54431f155c9538809e8caf99d1a75ec7 + - path: results/reports/mosdepth/sample3/sample3.recal.per-base.bed.gz.csi + md5sum: c67dcd711b096eb42f43784d5eadbc0d + - path: results/reports/samtools/sample3/sample3.recal.cram.stats + # conda changes md5sums for test + - path: results/reports/mosdepth/sample4/sample4.recal.mosdepth.global.dist.txt + md5sum: f2dcd00a64947c49e8e4b93c2f4fbf27 + - path: results/reports/mosdepth/sample4/sample4.recal.mosdepth.summary.txt + md5sum: 0a7300e56eda6fba7c7564f00aa000f0 + - path: results/reports/mosdepth/sample4/sample4.recal.per-base.bed.gz + md5sum: 3de4a9f4da2f2b4909ef192452a8d211 + - path: results/reports/mosdepth/sample4/sample4.recal.per-base.bed.gz.csi + md5sum: cfb07b0ba46e8468b4342edb243536f3 + - path: results/reports/samtools/sample4/sample4.recal.cram.stats + # conda changes md5sums for test diff --git a/tests/test_strelka_bp.yml b/tests/test_strelka_bp.yml new file mode 100644 index 0000000000..b813f648d8 --- /dev/null +++ b/tests/test_strelka_bp.yml @@ -0,0 +1,231 @@ +- name: Run variant calling on somatic sample with Strelka BP + command: nextflow run main.nf -profile test_cache,tools_somatic --tools strelka,manta --outdir results + tags: + - somatic + - strelka_bp + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: eff248896ca462b76c79749403e44f48 + - path: results/multiqc + - path: results/reports/bcftools/manta/sample3/sample3.manta.diploid_sv.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/bcftools/manta/sample4_vs_sample3/sample4_vs_sample3.manta.diploid_sv.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/bcftools/manta/sample4_vs_sample3/sample4_vs_sample3.manta.somatic_sv.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/bcftools/strelka/sample3/sample3.strelka.variants.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/bcftools/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_indels.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/bcftools/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_snvs.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/vcftools/manta/sample3/sample3.manta.diploid_sv.FILTER.summary + md5sum: 1ce42d34e4ae919afb519efc99146423 + - path: results/reports/vcftools/manta/sample3/sample3.manta.diploid_sv.TsTv.count + md5sum: fa27f678965b7cba6a92efcd039f802a + - path: results/reports/vcftools/manta/sample3/sample3.manta.diploid_sv.TsTv.qual + # conda 
changes md5sums for test + - path: results/reports/vcftools/manta/sample4_vs_sample3/sample4_vs_sample3.manta.diploid_sv.FILTER.summary + md5sum: 1ce42d34e4ae919afb519efc99146423 + - path: results/reports/vcftools/manta/sample4_vs_sample3/sample4_vs_sample3.manta.diploid_sv.TsTv.count + md5sum: fa27f678965b7cba6a92efcd039f802a + - path: results/reports/vcftools/manta/sample4_vs_sample3/sample4_vs_sample3.manta.diploid_sv.TsTv.qual + # conda changes md5sums for test + - path: results/reports/vcftools/manta/sample4_vs_sample3/sample4_vs_sample3.manta.somatic_sv.FILTER.summary + md5sum: 1ce42d34e4ae919afb519efc99146423 + - path: results/reports/vcftools/manta/sample4_vs_sample3/sample4_vs_sample3.manta.somatic_sv.TsTv.count + md5sum: 8dcfdbcaac118df1d5ad407dd2af699f + - path: results/reports/vcftools/manta/sample4_vs_sample3/sample4_vs_sample3.manta.somatic_sv.TsTv.qual + # conda changes md5sums for test + - path: results/reports/vcftools/strelka/sample3/sample3.strelka.variants.FILTER.summary + md5sum: 2048a5de0201a6052c988a0189979a5f + - path: results/reports/vcftools/strelka/sample3/sample3.strelka.variants.TsTv.count + md5sum: c5b7a8eda2526d899098439ae4c06a49 + - path: results/reports/vcftools/strelka/sample3/sample3.strelka.variants.TsTv.qual + # conda changes md5sums for test + - path: results/reports/vcftools/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_indels.FILTER.summary + md5sum: 3441628cd6550ed459ca1c3db989ceea + - path: results/reports/vcftools/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_indels.TsTv.count + md5sum: 8dcfdbcaac118df1d5ad407dd2af699f + - path: results/reports/vcftools/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_indels.TsTv.qual + # conda changes md5sums for test + - path: results/reports/vcftools/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_snvs.FILTER.summary + md5sum: 4fc17fa5625b4d1dcc5d791b1eb22d85 + - path: results/reports/vcftools/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_snvs.TsTv.count + md5sum: fc7af1f534890c4ad3025588b3af62ae + - path: results/reports/vcftools/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_snvs.TsTv.qual + # conda changes md5sums for test + - path: results/variant_calling/manta/sample3/sample3.manta.diploid_sv.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/manta/sample3/sample3.manta.diploid_sv.vcf.gz.tbi + md5sum: 4cb176febbc8c26d717a6c6e67b9c905 + - path: results/variant_calling/manta/sample4_vs_sample3/sample4_vs_sample3.manta.diploid_sv.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/manta/sample4_vs_sample3/sample4_vs_sample3.manta.diploid_sv.vcf.gz.tbi + md5sum: 4cb176febbc8c26d717a6c6e67b9c905 + - path: results/variant_calling/manta/sample4_vs_sample3/sample4_vs_sample3.manta.somatic_sv.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/manta/sample4_vs_sample3/sample4_vs_sample3.manta.somatic_sv.vcf.gz.tbi + md5sum: 4cb176febbc8c26d717a6c6e67b9c905 + - path: results/variant_calling/strelka/sample3/sample3.strelka.genome.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample3/sample3.strelka.genome.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample3/sample3.strelka.variants.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample3/sample3.strelka.variants.vcf.gz.tbi + # binary changes md5sums on reruns + - path: 
results/variant_calling/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_indels.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_indels.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_snvs.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_snvs.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/manta + should_exist: false + - path: results/strelka + should_exist: false + - path: results/reports/mosdepth/sample3/sample3.recal.mosdepth.global.dist.txt + md5sum: 69e29702ef01fd8f6c7a5468fc35a16a + - path: results/reports/mosdepth/sample3/sample3.recal.mosdepth.region.dist.txt + md5sum: 6ec49cd7d510c2eb3d9d90fdb79b783a + - path: results/reports/mosdepth/sample3/sample3.recal.mosdepth.summary.txt + md5sum: 103098d0bf76ed82d2b87d5f242b099a + - path: results/reports/mosdepth/sample3/sample3.recal.regions.bed.gz + md5sum: b5888cf7395c57d39879a5faa6159eb3 + - path: results/reports/mosdepth/sample3/sample3.recal.regions.bed.gz.csi + md5sum: 9cb0ad7039a3b703d16ca7d5b835c0ee + - path: results/reports/samtools/sample3/sample3.recal.cram.stats + # conda changes md5sums for test + - path: results/reports/mosdepth/sample4/sample4.recal.mosdepth.global.dist.txt + md5sum: f2dcd00a64947c49e8e4b93c2f4fbf27 + - path: results/reports/mosdepth/sample4/sample4.recal.mosdepth.region.dist.txt + md5sum: 39005ffaac22871ffaaf19656fe69c5b + - path: results/reports/mosdepth/sample4/sample4.recal.mosdepth.summary.txt + md5sum: 68d4b98f17361fddf73052ead34fa370 + - path: results/reports/mosdepth/sample4/sample4.recal.regions.bed.gz + md5sum: 2819e995eafded35f53328c4ec19ba58 + - path: results/reports/mosdepth/sample4/sample4.recal.regions.bed.gz.csi + md5sum: 393c2749068304d8545b501b9d4658e4 + - path: results/reports/samtools/sample4/sample4.recal.cram.stats + # conda changes md5sums for test +- name: Run variant calling on somatic sample with Strelka BP without intervals + command: nextflow run main.nf -profile test_cache,tools_somatic --tools strelka,manta --no_intervals --outdir results + tags: + - no_intervals + - somatic + - strelka_bp + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: eff248896ca462b76c79749403e44f48 + - path: results/multiqc + - path: results/no_intervals.bed + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz.tbi + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/reports/bcftools/manta/sample3/sample3.manta.diploid_sv.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/bcftools/manta/sample4_vs_sample3/sample4_vs_sample3.manta.diploid_sv.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/bcftools/manta/sample4_vs_sample3/sample4_vs_sample3.manta.somatic_sv.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/bcftools/strelka/sample3/sample3.strelka.variants.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/bcftools/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_indels.bcftools_stats.txt + # conda changes md5sums for test + - path: 
results/reports/bcftools/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_snvs.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/vcftools/manta/sample3/sample3.manta.diploid_sv.FILTER.summary + md5sum: 1ce42d34e4ae919afb519efc99146423 + - path: results/reports/vcftools/manta/sample3/sample3.manta.diploid_sv.TsTv.count + md5sum: fa27f678965b7cba6a92efcd039f802a + - path: results/reports/vcftools/manta/sample3/sample3.manta.diploid_sv.TsTv.qual + # conda changes md5sums for test + - path: results/reports/vcftools/manta/sample4_vs_sample3/sample4_vs_sample3.manta.diploid_sv.FILTER.summary + md5sum: 1ce42d34e4ae919afb519efc99146423 + - path: results/reports/vcftools/manta/sample4_vs_sample3/sample4_vs_sample3.manta.diploid_sv.TsTv.count + md5sum: fa27f678965b7cba6a92efcd039f802a + - path: results/reports/vcftools/manta/sample4_vs_sample3/sample4_vs_sample3.manta.diploid_sv.TsTv.qual + # conda changes md5sums for test + - path: results/reports/vcftools/manta/sample4_vs_sample3/sample4_vs_sample3.manta.somatic_sv.FILTER.summary + md5sum: 1ce42d34e4ae919afb519efc99146423 + - path: results/reports/vcftools/manta/sample4_vs_sample3/sample4_vs_sample3.manta.somatic_sv.TsTv.count + md5sum: 8dcfdbcaac118df1d5ad407dd2af699f + - path: results/reports/vcftools/manta/sample4_vs_sample3/sample4_vs_sample3.manta.somatic_sv.TsTv.qual + # conda changes md5sums for test + - path: results/reports/vcftools/strelka/sample3/sample3.strelka.variants.FILTER.summary + md5sum: 2b7be6ff481fddc655210b836587810d + - path: results/reports/vcftools/strelka/sample3/sample3.strelka.variants.TsTv.count + md5sum: 1481854d2a765f5641856ecf95ca4097 + - path: results/reports/vcftools/strelka/sample3/sample3.strelka.variants.TsTv.qual + # conda changes md5sums for test + - path: results/reports/vcftools/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_indels.FILTER.summary + md5sum: 3441628cd6550ed459ca1c3db989ceea + - path: results/reports/vcftools/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_indels.TsTv.count + md5sum: 8dcfdbcaac118df1d5ad407dd2af699f + - path: results/reports/vcftools/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_indels.TsTv.qual + # conda changes md5sums for test + - path: results/reports/vcftools/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_snvs.FILTER.summary + md5sum: 7a81b11aa29fec73d5bc872b7b58f8aa + - path: results/reports/vcftools/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_snvs.TsTv.count + md5sum: a922c51ca3b2ea7cdcfa09e9c8c55d52 + - path: results/reports/vcftools/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_snvs.TsTv.qual + # conda changes md5sums for test + - path: results/variant_calling/manta/sample3/sample3.manta.diploid_sv.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/manta/sample3/sample3.manta.diploid_sv.vcf.gz.tbi + md5sum: 4cb176febbc8c26d717a6c6e67b9c905 + - path: results/variant_calling/manta/sample4_vs_sample3/sample4_vs_sample3.manta.diploid_sv.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/manta/sample4_vs_sample3/sample4_vs_sample3.manta.diploid_sv.vcf.gz.tbi + md5sum: 4cb176febbc8c26d717a6c6e67b9c905 + - path: results/variant_calling/manta/sample4_vs_sample3/sample4_vs_sample3.manta.somatic_sv.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/manta/sample4_vs_sample3/sample4_vs_sample3.manta.somatic_sv.vcf.gz.tbi + md5sum: 4cb176febbc8c26d717a6c6e67b9c905 + 
- path: results/variant_calling/strelka/sample3/sample3.strelka.genome.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample3/sample3.strelka.genome.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample3/sample3.strelka.variants.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample3/sample3.strelka.variants.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_indels.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_indels.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_snvs.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/strelka/sample4_vs_sample3/sample4_vs_sample3.strelka.somatic_snvs.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/manta + should_exist: false + - path: results/strelka + should_exist: false + - path: results/reports/mosdepth/sample3/sample3.recal.mosdepth.global.dist.txt + md5sum: 69e29702ef01fd8f6c7a5468fc35a16a + - path: results/reports/mosdepth/sample3/sample3.recal.mosdepth.summary.txt + md5sum: d2775eb102acc5950f7f53883dcb503d + - path: results/reports/mosdepth/sample3/sample3.recal.per-base.bed.gz + md5sum: 54431f155c9538809e8caf99d1a75ec7 + - path: results/reports/mosdepth/sample3/sample3.recal.per-base.bed.gz.csi + md5sum: c67dcd711b096eb42f43784d5eadbc0d + - path: results/reports/samtools/sample3/sample3.recal.cram.stats + # conda changes md5sums for test + - path: results/reports/mosdepth/sample4/sample4.recal.mosdepth.global.dist.txt + md5sum: f2dcd00a64947c49e8e4b93c2f4fbf27 + - path: results/reports/mosdepth/sample4/sample4.recal.mosdepth.summary.txt + md5sum: 0a7300e56eda6fba7c7564f00aa000f0 + - path: results/reports/mosdepth/sample4/sample4.recal.per-base.bed.gz + md5sum: 3de4a9f4da2f2b4909ef192452a8d211 + - path: results/reports/mosdepth/sample4/sample4.recal.per-base.bed.gz.csi + md5sum: cfb07b0ba46e8468b4342edb243536f3 + - path: results/reports/samtools/sample4/sample4.recal.cram.stats + # conda changes md5sums for test diff --git a/tests/test_tiddit.yml b/tests/test_tiddit.yml new file mode 100644 index 0000000000..abac99f94b --- /dev/null +++ b/tests/test_tiddit.yml @@ -0,0 +1,144 @@ +- name: Run variant calling on somatic sample with tiddit + command: nextflow run main.nf -profile test_cache,tools_somatic --tools tiddit --outdir results + tags: + - tiddit + - somatic + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: 76b499e35c128d67b0606ea561bf70e0 + - path: results/multiqc + - path: results/reports/bcftools/tiddit/sample3/sample3.tiddit.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/bcftools/tiddit/sample4_vs_sample3/sample4_vs_sample3.tiddit_sv_merge.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/vcftools/tiddit/sample3/sample3.tiddit.FILTER.summary + md5sum: 1ce42d34e4ae919afb519efc99146423 + - path: results/reports/vcftools/tiddit/sample3/sample3.tiddit.TsTv.count + md5sum: fa27f678965b7cba6a92efcd039f802a + - path: results/reports/vcftools/tiddit/sample3/sample3.tiddit.TsTv.qual + md5sum: bc68ae4e688e9fb772b457069e604883 + - path: 
results/reports/vcftools/tiddit/sample4_vs_sample3/sample4_vs_sample3.tiddit_sv_merge.FILTER.summary + md5sum: 1ce42d34e4ae919afb519efc99146423 + - path: results/reports/vcftools/tiddit/sample4_vs_sample3/sample4_vs_sample3.tiddit_sv_merge.TsTv.count + md5sum: 8dcfdbcaac118df1d5ad407dd2af699f + - path: results/reports/vcftools/tiddit/sample4_vs_sample3/sample4_vs_sample3.tiddit_sv_merge.TsTv.qual + md5sum: bc68ae4e688e9fb772b457069e604883 + - path: results/variant_calling/tiddit/sample3/sample3.tiddit.ploidies.tab + md5sum: fcfd2ecf6e7eef532e072757354c8a90 + - path: results/variant_calling/tiddit/sample3/sample3.tiddit.vcf.gz + # conda changes md5sums for test + - path: results/variant_calling/tiddit/sample3/sample3.tiddit.vcf.gz.tbi + # conda changes md5sums for test + - path: results/variant_calling/tiddit/sample4_vs_sample3/sample4_vs_sample3.tiddit.normal.vcf.gz + # conda changes md5sums for test + - path: results/variant_calling/tiddit/sample4_vs_sample3/sample4_vs_sample3.tiddit.normal.vcf.gz.tbi + # conda changes md5sums for test + - path: results/variant_calling/tiddit/sample4_vs_sample3/sample4_vs_sample3.tiddit.ploidies.tab + # conda changes md5sums for test + - path: results/variant_calling/tiddit/sample4_vs_sample3/sample4_vs_sample3.tiddit.tumor.vcf.gz + # conda changes md5sums for test + - path: results/variant_calling/tiddit/sample4_vs_sample3/sample4_vs_sample3.tiddit.tumor.vcf.gz.tbi + # conda changes md5sums for test + - path: results/variant_calling/tiddit/sample4_vs_sample3/sample4_vs_sample3.tiddit_sv_merge.vcf.gz + # conda changes md5sums for test + - path: results/tiddit + should_exist: false + - path: results/reports/mosdepth/sample3/sample3.recal.mosdepth.global.dist.txt + md5sum: 69e29702ef01fd8f6c7a5468fc35a16a + - path: results/reports/mosdepth/sample3/sample3.recal.mosdepth.region.dist.txt + md5sum: 6ec49cd7d510c2eb3d9d90fdb79b783a + - path: results/reports/mosdepth/sample3/sample3.recal.mosdepth.summary.txt + md5sum: 103098d0bf76ed82d2b87d5f242b099a + - path: results/reports/mosdepth/sample3/sample3.recal.regions.bed.gz + md5sum: b5888cf7395c57d39879a5faa6159eb3 + - path: results/reports/mosdepth/sample3/sample3.recal.regions.bed.gz.csi + md5sum: 9cb0ad7039a3b703d16ca7d5b835c0ee + - path: results/reports/samtools/sample3/sample3.recal.cram.stats + # conda changes md5sums for test + - path: results/reports/mosdepth/sample4/sample4.recal.mosdepth.global.dist.txt + md5sum: f2dcd00a64947c49e8e4b93c2f4fbf27 + - path: results/reports/mosdepth/sample4/sample4.recal.mosdepth.region.dist.txt + md5sum: 39005ffaac22871ffaaf19656fe69c5b + - path: results/reports/mosdepth/sample4/sample4.recal.mosdepth.summary.txt + md5sum: 68d4b98f17361fddf73052ead34fa370 + - path: results/reports/mosdepth/sample4/sample4.recal.regions.bed.gz + md5sum: 2819e995eafded35f53328c4ec19ba58 + - path: results/reports/mosdepth/sample4/sample4.recal.regions.bed.gz.csi + md5sum: 393c2749068304d8545b501b9d4658e4 + - path: results/reports/samtools/sample4/sample4.recal.cram.stats + # conda changes md5sums for test +- name: Run variant calling on germline sample with tiddit + command: nextflow run main.nf -profile test_cache,tools_germline --tools tiddit --outdir results + tags: + - tiddit + - germline + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: cd458ebee25e1fb1258d2f390e54c736 + - path: results/multiqc + - path: results/reports/bcftools/tiddit/sample1/sample1.tiddit.bcftools_stats.txt + # conda changes md5sums for test + - path: 
results/reports/vcftools/tiddit/sample1/sample1.tiddit.FILTER.summary + md5sum: 1ce42d34e4ae919afb519efc99146423 + - path: results/reports/vcftools/tiddit/sample1/sample1.tiddit.TsTv.count + md5sum: fa27f678965b7cba6a92efcd039f802a + - path: results/reports/vcftools/tiddit/sample1/sample1.tiddit.TsTv.qual + md5sum: bc68ae4e688e9fb772b457069e604883 + - path: results/variant_calling/tiddit/sample1/sample1.tiddit.ploidies.tab + md5sum: fcfd2ecf6e7eef532e072757354c8a90 + - path: results/variant_calling/tiddit/sample1/sample1.tiddit.vcf.gz + # conda changes md5sums for test + - path: results/variant_calling/tiddit/sample1/sample1.tiddit.vcf.gz.tbi + # conda changes md5sums for test + - path: results/tiddit + should_exist: false + - path: results/reports/mosdepth/sample1/sample1.recal.mosdepth.global.dist.txt + md5sum: 69e29702ef01fd8f6c7a5468fc35a16a + - path: results/reports/mosdepth/sample1/sample1.recal.mosdepth.region.dist.txt + md5sum: 6ec49cd7d510c2eb3d9d90fdb79b783a + - path: results/reports/mosdepth/sample1/sample1.recal.mosdepth.summary.txt + md5sum: 103098d0bf76ed82d2b87d5f242b099a + - path: results/reports/mosdepth/sample1/sample1.recal.regions.bed.gz + md5sum: b5888cf7395c57d39879a5faa6159eb3 + - path: results/reports/mosdepth/sample1/sample1.recal.regions.bed.gz.csi + md5sum: 9cb0ad7039a3b703d16ca7d5b835c0ee + - path: results/reports/samtools/sample1/sample1.recal.cram.stats +- name: Run variant calling on tumor_only sample with tiddit + command: nextflow run main.nf -profile test_cache,tools_tumoronly --tools tiddit --outdir results + tags: + - tiddit + - tumor_only + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: 15076bb78912fe51006e83934c376fc2 + - path: results/multiqc + - path: results/reports/bcftools/tiddit/sample2/sample2.tiddit.bcftools_stats.txt + # conda changes md5sums for test + - path: results/reports/vcftools/tiddit/sample2/sample2.tiddit.FILTER.summary + md5sum: 1ce42d34e4ae919afb519efc99146423 + - path: results/reports/vcftools/tiddit/sample2/sample2.tiddit.TsTv.count + md5sum: fa27f678965b7cba6a92efcd039f802a + - path: results/reports/vcftools/tiddit/sample2/sample2.tiddit.TsTv.qual + md5sum: bc68ae4e688e9fb772b457069e604883 + - path: results/variant_calling/tiddit/sample2/sample2.tiddit.ploidies.tab + md5sum: 5c5d70295c3957990502cb692e07a965 + - path: results/variant_calling/tiddit/sample2/sample2.tiddit.vcf.gz + # conda changes md5sums for test + - path: results/variant_calling/tiddit/sample2/sample2.tiddit.vcf.gz.tbi + # conda changes md5sums for test + - path: results/tiddit + should_exist: false + - path: results/reports/mosdepth/sample2/sample2.recal.mosdepth.global.dist.txt + md5sum: f2dcd00a64947c49e8e4b93c2f4fbf27 + - path: results/reports/mosdepth/sample2/sample2.recal.mosdepth.region.dist.txt + md5sum: 39005ffaac22871ffaaf19656fe69c5b + - path: results/reports/mosdepth/sample2/sample2.recal.mosdepth.summary.txt + md5sum: 68d4b98f17361fddf73052ead34fa370 + - path: results/reports/mosdepth/sample2/sample2.recal.regions.bed.gz + md5sum: 2819e995eafded35f53328c4ec19ba58 + - path: results/reports/mosdepth/sample2/sample2.recal.regions.bed.gz.csi + md5sum: 393c2749068304d8545b501b9d4658e4 + - path: results/reports/samtools/sample2/sample2.recal.cram.stats diff --git a/tests/test_tools_manually.yml b/tests/test_tools_manually.yml new file mode 100644 index 0000000000..8725ec6a64 --- /dev/null +++ b/tests/test_tools_manually.yml @@ -0,0 +1,357 @@ +- name: Run variant calling on somatic samples with ascat + command: nextflow run 
main.nf -profile test_cache,tools_somatic_ascat --outdir results + tags: + - ascat_manual + - manual + - somatic + - variant_calling + files: + - path: results/variant_calling/ascat/sample4_vs_sample3/sample4_vs_sample3.tumour_normalBAF.txt + md5sum: b73a38fd183143b1e8aed9f261a9c5f0 + - path: results/variant_calling/ascat/sample4_vs_sample3/sample4_vs_sample3.tumour_tumourLogR.txt + md5sum: 29f29092c19274aa3d5fd4f9e3828cbb + - path: results/variant_calling/ascat/sample4_vs_sample3/sample4_vs_sample3.after_correction_gc_rt.sample4_vs_sample3.tumour.tumour.png + md5sum: df246ef9c2c4dc868901afe17366e116 + - path: results/variant_calling/ascat/sample4_vs_sample3/sample4_vs_sample3.before_correction.sample4_vs_sample3.tumour.tumour.png + md5sum: 7629826e2e02ab99bedbab75b0c022a5 + - path: results/variant_calling/ascat/sample4_vs_sample3/sample4_vs_sample3.cnvs.txt + md5sum: 68b329da9893e34099c7d8ad5cb9c940 + - path: results/variant_calling/ascat/sample4_vs_sample3/sample4_vs_sample3.tumour_tumourBAF.txt + md5sum: 5235f69624ab91f395ebf30b90c02e9e + - path: results/variant_calling/ascat/sample4_vs_sample3/sample4_vs_sample3.tumour.ASPCF.png + md5sum: 883075c53513dea8bbcb85ad564cb641 + - path: results/variant_calling/ascat/sample4_vs_sample3/sample4_vs_sample3.after_correction_gc_rt.sample4_vs_sample3.tumour.germline.png + md5sum: 1a47cc241548fa89f914c2f5bfee6bee + - path: results/variant_calling/ascat/sample4_vs_sample3/sample4_vs_sample3.metrics.txt + md5sum: f7e486e5eed6166dedf9306235f537ec + - path: results/variant_calling/ascat/sample4_vs_sample3/sample4_vs_sample3.before_correction.sample4_vs_sample3.tumour.germline.png + md5sum: 1a47cc241548fa89f914c2f5bfee6bee + - path: results/variant_calling/ascat/sample4_vs_sample3/sample4_vs_sample3.purityploidy.txt + md5sum: f1484c2b120834d3db8774ad02a038b9 + - path: results/variant_calling/ascat/sample4_vs_sample3/sample4_vs_sample3.tumour.sunrise.png + md5sum: a2bf4b04176983a87b85843c789eaab8 + - path: results/variant_calling/ascat/sample4_vs_sample3/sample4_vs_sample3.segments.txt + md5sum: 68b329da9893e34099c7d8ad5cb9c940 + - path: results/variant_calling/ascat/sample4_vs_sample3/sample4_vs_sample3.tumour_normalLogR.txt + md5sum: 05418a7d814db11808172a4f57d040a1 + - path: results/reports/mosdepth/sample3/sample3.recal.mosdepth.global.dist.txt + md5sum: 69e29702ef01fd8f6c7a5468fc35a16a + - path: results/reports/mosdepth/sample3/sample3.recal.mosdepth.region.dist.txt + md5sum: 6ec49cd7d510c2eb3d9d90fdb79b783a + - path: results/reports/mosdepth/sample3/sample3.recal.mosdepth.summary.txt + md5sum: 103098d0bf76ed82d2b87d5f242b099a + - path: results/reports/mosdepth/sample3/sample3.recal.regions.bed.gz + md5sum: b5888cf7395c57d39879a5faa6159eb3 + - path: results/reports/mosdepth/sample3/sample3.recal.regions.bed.gz.csi + md5sum: 9cb0ad7039a3b703d16ca7d5b835c0ee + - path: results/reports/samtools/sample3/sample3.recal.cram.stats + # conda changes md5sums for test + - path: results/reports/mosdepth/sample4/sample4.recal.mosdepth.global.dist.txt + md5sum: f2dcd00a64947c49e8e4b93c2f4fbf27 + - path: results/reports/mosdepth/sample4/sample4.recal.mosdepth.region.dist.txt + md5sum: 39005ffaac22871ffaaf19656fe69c5b + - path: results/reports/mosdepth/sample4/sample4.recal.mosdepth.summary.txt + md5sum: 68d4b98f17361fddf73052ead34fa370 + - path: results/reports/mosdepth/sample4/sample4.recal.regions.bed.gz + md5sum: 2819e995eafded35f53328c4ec19ba58 + - path: results/reports/mosdepth/sample4/sample4.recal.regions.bed.gz.csi + md5sum: 
393c2749068304d8545b501b9d4658e4 + - path: results/reports/samtools/sample4/sample4.recal.cram.stats + # conda changes md5sums for test +- name: Run variant calling on somatic sample with mutect2 without intervals + command: nextflow run main.nf -profile test_cache,tools_somatic --tools mutect2 --no_intervals --outdir results + tags: + - mutect2_manual + - manual + - no_intervals + - somatic + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: d3c9f0559d48696c54f3c463b1606586 + - path: results/multiqc + - path: results/no_intervals.bed + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/no_intervals.bed.gz.tbi + md5sum: f3dac01ea66b95fe477446fde2d31489 + - path: results/reports/bcftools/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.filtered.bcftools_stats.txt + md5sum: 9876607145d11c6b8492264936d7a82c + - path: results/reports/vcftools/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.filtered.FILTER.summary + md5sum: b25d4d2a64f9590d0ffb119fd3adb06e + - path: results/reports/vcftools/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.filtered.TsTv.count + md5sum: 3739f24da2d2019cc4bc2821e30658eb + - path: results/reports/vcftools/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.filtered.TsTv.qual + md5sum: 05c0cbb017d9232bc728d48f9d0c7afd + - path: results/variant_calling/mutect2/sample3/sample3.mutect2.pileups.table + md5sum: 8e0ca6f66e112bd2f7ec1d31a2d62469 + - path: results/variant_calling/mutect2/sample4/sample4.mutect2.pileups.table + md5sum: fe35b6bc041f2df8bd1f23420af3ddf9 + - path: results/variant_calling/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.artifactprior.tar.gz + # binary changes md5sums on reruns + - path: results/variant_calling/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.contamination.table + md5sum: 46c708c943b453da89a3da08acfdb2a7 + - path: results/variant_calling/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.filtered.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.filtered.vcf.gz.filteringStats.tsv + md5sum: 9ae27fbd04af1a2ea574e2ff1c3a683b + - path: results/variant_calling/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.filtered.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.segmentation.table + md5sum: f4643d9319bde4efbfbe516d6fb13052 + - path: results/variant_calling/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.vcf.gz.stats + md5sum: 17d2091015d04cbd4a26b7a67dc659e6 + - path: results/variant_calling/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/reports/mosdepth/sample3/sample3.recal.mosdepth.global.dist.txt + md5sum: 69e29702ef01fd8f6c7a5468fc35a16a + - path: results/reports/mosdepth/sample3/sample3.recal.mosdepth.summary.txt + md5sum: d2775eb102acc5950f7f53883dcb503d + - path: results/reports/mosdepth/sample3/sample3.recal.per-base.bed.gz + md5sum: 54431f155c9538809e8caf99d1a75ec7 + - path: results/reports/mosdepth/sample3/sample3.recal.per-base.bed.gz.csi + md5sum: c67dcd711b096eb42f43784d5eadbc0d + - path: results/reports/samtools/sample3/sample3.recal.cram.stats + # conda changes md5sums for test + - path: 
results/reports/mosdepth/sample4/sample4.recal.mosdepth.global.dist.txt + md5sum: f2dcd00a64947c49e8e4b93c2f4fbf27 + - path: results/reports/mosdepth/sample4/sample4.recal.mosdepth.summary.txt + md5sum: 0a7300e56eda6fba7c7564f00aa000f0 + - path: results/reports/mosdepth/sample4/sample4.recal.per-base.bed.gz + md5sum: 3de4a9f4da2f2b4909ef192452a8d211 + - path: results/reports/mosdepth/sample4/sample4.recal.per-base.bed.gz.csi + md5sum: cfb07b0ba46e8468b4342edb243536f3 + - path: results/reports/samtools/sample4/sample4.recal.cram.stats + # conda changes md5sums for test +- name: Run variant calling on somatic sample with mutect2 + command: nextflow run main.nf -profile test_cache,tools_somatic --tools mutect2 --outdir results + tags: + - mutect2_manual + - manual + - somatic + - variant_calling + files: + - path: results/csv/variantcalled.csv + md5sum: d3c9f0559d48696c54f3c463b1606586 + - path: results/multiqc + - path: results/reports/bcftools/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.filtered.bcftools_stats.txt + md5sum: 9876607145d11c6b8492264936d7a82c + - path: results/reports/vcftools/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.filtered.FILTER.summary + md5sum: b25d4d2a64f9590d0ffb119fd3adb06e + - path: results/reports/vcftools/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.filtered.TsTv.count + md5sum: 3739f24da2d2019cc4bc2821e30658eb + - path: results/reports/vcftools/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.filtered.TsTv.qual + md5sum: 05c0cbb017d9232bc728d48f9d0c7afd + - path: results/variant_calling/mutect2/sample3/sample3.mutect2.pileups.table + md5sum: 16077fdb885a8afe64c7669477471354 + - path: results/variant_calling/mutect2/sample4/sample4.mutect2.pileups.table + md5sum: 9afe42339f590937166edcf4746c22ec + - path: results/variant_calling/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.artifactprior.tar.gz + # binary changes md5sums on reruns + - path: results/variant_calling/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.contamination.table + md5sum: 46c708c943b453da89a3da08acfdb2a7 + - path: results/variant_calling/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.filtered.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.filtered.vcf.gz.filteringStats.tsv + md5sum: 9ae27fbd04af1a2ea574e2ff1c3a683b + - path: results/variant_calling/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.filtered.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.segmentation.table + md5sum: f4643d9319bde4efbfbe516d6fb13052 + - path: results/variant_calling/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.vcf.gz.stats + md5sum: c09dff3f145d77d4848992e244811c08 + - path: results/variant_calling/mutect2/sample4_vs_sample3/sample4_vs_sample3.mutect2.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/reports/mosdepth/sample3/sample3.recal.mosdepth.global.dist.txt + md5sum: 69e29702ef01fd8f6c7a5468fc35a16a + - path: results/reports/mosdepth/sample3/sample3.recal.mosdepth.region.dist.txt + md5sum: 6ec49cd7d510c2eb3d9d90fdb79b783a + - path: results/reports/mosdepth/sample3/sample3.recal.mosdepth.summary.txt + md5sum: 103098d0bf76ed82d2b87d5f242b099a + - path: results/reports/mosdepth/sample3/sample3.recal.regions.bed.gz + md5sum: 
b5888cf7395c57d39879a5faa6159eb3 + - path: results/reports/mosdepth/sample3/sample3.recal.regions.bed.gz.csi + md5sum: 9cb0ad7039a3b703d16ca7d5b835c0ee + - path: results/reports/samtools/sample3/sample3.recal.cram.stats + # conda changes md5sums for test + - path: results/reports/mosdepth/sample4/sample4.recal.mosdepth.global.dist.txt + md5sum: f2dcd00a64947c49e8e4b93c2f4fbf27 + - path: results/reports/mosdepth/sample4/sample4.recal.mosdepth.region.dist.txt + md5sum: 39005ffaac22871ffaaf19656fe69c5b + - path: results/reports/mosdepth/sample4/sample4.recal.mosdepth.summary.txt + md5sum: 68d4b98f17361fddf73052ead34fa370 + - path: results/reports/mosdepth/sample4/sample4.recal.regions.bed.gz + md5sum: 2819e995eafded35f53328c4ec19ba58 + - path: results/reports/mosdepth/sample4/sample4.recal.regions.bed.gz.csi + md5sum: 393c2749068304d8545b501b9d4658e4 + - path: results/reports/samtools/sample4/sample4.recal.cram.stats + # conda changes md5sums for test +- name: Run joint calling on tumor only samples with mutect2 + command: nextflow run main.nf -profile test_cache,tools_tumoronly --input tests/csv/3.0/recalibrated_tumoronly_joint.csv --tools mutect2 --joint_mutect2 --outdir results + tags: + - mutect2_manual + - manual + - tumor_only + - variant_calling + - multi_sample + - joint_tumoronly + files: + - path: results/csv/variantcalled.csv + md5sum: f87290ce1c6ea523e08354ed6c258b0b + - path: results/multiqc + - path: results/reports/bcftools/mutect2/test/test.mutect2.filtered.bcftools_stats.txt + md5sum: a0cdc26fb7d8c446dd0283fed71a24d5 + - path: results/reports/vcftools/mutect2/test/test.mutect2.filtered.FILTER.summary + md5sum: e1e42b6f65cbdba116cff72a56e40c4b + - path: results/reports/vcftools/mutect2/test/test.mutect2.filtered.TsTv.count + md5sum: c00e1639a41deb107099487676a6cf37 + - path: results/reports/vcftools/mutect2/test/test.mutect2.filtered.TsTv.qual + md5sum: a21016aa99e5cbf32eeae1b405ca6d8d + - path: results/variant_calling/mutect2/test/sample2.mutect2.contamination.table + md5sum: 46c708c943b453da89a3da08acfdb2a7 + - path: results/variant_calling/mutect2/sample2/sample2.mutect2.pileups.table + md5sum: 9afe42339f590937166edcf4746c22ec + - path: results/variant_calling/mutect2/test/sample2.mutect2.segmentation.table + md5sum: f4643d9319bde4efbfbe516d6fb13052 + - path: results/variant_calling/mutect2/test/sample3.mutect2.contamination.table + md5sum: 11440fe64b5b953d7efb9cf47e330364 + - path: results/variant_calling/mutect2/sample3/sample3.mutect2.pileups.table + md5sum: fd0c1f7819717b7f94e52f6611f4b2e0 + - path: results/variant_calling/mutect2/test/sample3.mutect2.segmentation.table + md5sum: 38f83e2f98b206640644dd93d5e96f4e + - path: results/variant_calling/mutect2/test/test.mutect2.artifactprior.tar.gz + # binary changes md5sums on reruns + - path: results/variant_calling/mutect2/test/test.mutect2.filtered.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/mutect2/test/test.mutect2.filtered.vcf.gz.filteringStats.tsv + md5sum: f237666ae325fde0c06b8bc62d2846fc + - path: results/variant_calling/mutect2/test/test.mutect2.filtered.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/mutect2/test/test.mutect2.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/mutect2/test/test.mutect2.vcf.gz.stats + md5sum: 22e58aef3b14b335fa487d40b590ffeb + - path: results/variant_calling/mutect2/test/test.mutect2.vcf.gz.tbi + # binary changes md5sums on reruns + - path: 
results/reports/mosdepth/sample3/sample3.recal.mosdepth.global.dist.txt + md5sum: f2dcd00a64947c49e8e4b93c2f4fbf27 + - path: results/reports/mosdepth/sample3/sample3.recal.mosdepth.region.dist.txt + md5sum: 39005ffaac22871ffaaf19656fe69c5b + - path: results/reports/mosdepth/sample3/sample3.recal.mosdepth.summary.txt + md5sum: 68d4b98f17361fddf73052ead34fa370 + - path: results/reports/mosdepth/sample3/sample3.recal.regions.bed.gz + md5sum: 2819e995eafded35f53328c4ec19ba58 + - path: results/reports/mosdepth/sample3/sample3.recal.regions.bed.gz.csi + md5sum: 393c2749068304d8545b501b9d4658e4 + - path: results/reports/samtools/sample3/sample3.recal.cram.stats + # conda changes md5sums for test +- name: Run joint calling on somatic samples with mutect2 + command: nextflow run main.nf -profile test_cache,tools_somatic --input tests/csv/3.0/recalibrated_somatic_joint.csv --tools mutect2 --joint_mutect2 --outdir results + tags: + - mutect2_manual + - somatic + - variant_calling + - multi_sample + - joint_somatic + files: + - path: results/csv/variantcalled.csv + md5sum: f87290ce1c6ea523e08354ed6c258b0b + - path: results/multiqc + - path: results/reports/bcftools/mutect2/test/test.mutect2.filtered.bcftools_stats.txt + md5sum: d75da410d57960944f54d02b2b5cdcac + - path: results/reports/vcftools/mutect2/test/test.mutect2.filtered.FILTER.summary + md5sum: e0eb3e34fc15f3b452bfc43f032cc8be + - path: results/reports/vcftools/mutect2/test/test.mutect2.filtered.TsTv.count + md5sum: aa51bde6080c015c6aa6c8254977dd11 + - path: results/reports/vcftools/mutect2/test/test.mutect2.filtered.TsTv.qual + md5sum: 262f843f68d072c457ca28b56da3ede8 + - path: results/variant_calling/mutect2/sample1/sample1.mutect2.pileups.table + md5sum: 16077fdb885a8afe64c7669477471354 + - path: results/variant_calling/mutect2/sample2/sample2.mutect2.pileups.table + md5sum: 9afe42339f590937166edcf4746c22ec + - path: results/variant_calling/mutect2/sample3/sample3.mutect2.pileups.table + md5sum: fd0c1f7819717b7f94e52f6611f4b2e0 + - path: results/variant_calling/mutect2/test/sample2_vs_sample1.mutect2.contamination.table + md5sum: 46c708c943b453da89a3da08acfdb2a7 + - path: results/variant_calling/mutect2/test/sample3_vs_sample1.mutect2.contamination.table + md5sum: 11440fe64b5b953d7efb9cf47e330364 + - path: results/variant_calling/mutect2/test/sample2_vs_sample1.mutect2.segmentation.table + md5sum: f4643d9319bde4efbfbe516d6fb13052 + - path: results/variant_calling/mutect2/test/sample3_vs_sample1.mutect2.segmentation.table + md5sum: 38f83e2f98b206640644dd93d5e96f4e + - path: results/variant_calling/mutect2/test/test.mutect2.artifactprior.tar.gz + # binary changes md5sums on reruns + - path: results/variant_calling/mutect2/test/test.mutect2.filtered.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/mutect2/test/test.mutect2.filtered.vcf.gz.filteringStats.tsv + md5sum: dee72b4c5c9bbda01d44fd3e00f1b404 + - path: results/variant_calling/mutect2/test/test.mutect2.filtered.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/variant_calling/mutect2/test/test.mutect2.vcf.gz + # binary changes md5sums on reruns + - path: results/variant_calling/mutect2/test/test.mutect2.vcf.gz.stats + md5sum: 094cb75b0bda28e92b6718ff33d136e2 + - path: results/variant_calling/mutect2/test/test.mutect2.vcf.gz.tbi + # binary changes md5sums on reruns + - path: results/reports/mosdepth/sample1/sample1.recal.mosdepth.global.dist.txt + md5sum: 69e29702ef01fd8f6c7a5468fc35a16a + - path: 
results/reports/mosdepth/sample1/sample1.recal.mosdepth.region.dist.txt + md5sum: 6ec49cd7d510c2eb3d9d90fdb79b783a + - path: results/reports/mosdepth/sample1/sample1.recal.mosdepth.summary.txt + md5sum: 103098d0bf76ed82d2b87d5f242b099a + - path: results/reports/mosdepth/sample1/sample1.recal.regions.bed.gz + md5sum: b5888cf7395c57d39879a5faa6159eb3 + - path: results/reports/mosdepth/sample1/sample1.recal.regions.bed.gz.csi + md5sum: 9cb0ad7039a3b703d16ca7d5b835c0ee + - path: results/reports/samtools/sample1/sample1.recal.cram.stats + # conda changes md5sums for test + - path: results/reports/mosdepth/sample2/sample2.recal.mosdepth.global.dist.txt + md5sum: f2dcd00a64947c49e8e4b93c2f4fbf27 + - path: results/reports/mosdepth/sample2/sample2.recal.mosdepth.region.dist.txt + md5sum: 39005ffaac22871ffaaf19656fe69c5b + - path: results/reports/mosdepth/sample2/sample2.recal.mosdepth.summary.txt + md5sum: 68d4b98f17361fddf73052ead34fa370 + - path: results/reports/mosdepth/sample2/sample2.recal.regions.bed.gz + md5sum: 2819e995eafded35f53328c4ec19ba58 + - path: results/reports/mosdepth/sample2/sample2.recal.regions.bed.gz.csi + md5sum: 393c2749068304d8545b501b9d4658e4 + - path: results/reports/samtools/sample2/sample2.recal.cram.stats + # conda changes md5sums for test + - path: results/reports/mosdepth/sample3/sample3.recal.mosdepth.global.dist.txt + md5sum: f2dcd00a64947c49e8e4b93c2f4fbf27 + - path: results/reports/mosdepth/sample3/sample3.recal.mosdepth.region.dist.txt + md5sum: 39005ffaac22871ffaaf19656fe69c5b + - path: results/reports/mosdepth/sample3/sample3.recal.mosdepth.summary.txt + md5sum: 68d4b98f17361fddf73052ead34fa370 + - path: results/reports/mosdepth/sample3/sample3.recal.regions.bed.gz + md5sum: 2819e995eafded35f53328c4ec19ba58 + - path: results/reports/mosdepth/sample3/sample3.recal.regions.bed.gz.csi + md5sum: 393c2749068304d8545b501b9d4658e4 + - path: results/reports/samtools/sample3/sample3.recal.cram.stats + # conda changes md5sums for test +- name: Run full pipeline on tumoronly with most tools + command: nextflow run . -profile test --input tests/csv/3.0/fastq_tumor_only.csv --tools cnvkit,freebayes,merge,mpileup,mutect2,snpeff,strelka,tiddit,vep --outdir results + tags: + - full_pipeline_manual + - manual + - tumor_only + - variant_calling +- name: Run full pipeline on somatic with most tools + command: nextflow run . -profile test --input tests/csv/3.0/fastq_pair.csv --tools cnvkit,deepvariant,freebayes,merge,mpileup,msisensorpro,snpeff,strelka,tiddit,vep --outdir results + tags: + - full_pipeline_manual + - manual + - somatic + - variant_calling +- name: Run full pipeline on germline with most tools + command: nextflow run . 
-profile test --input tests/csv/3.0/fastq_single.csv --tools cnvkit,deepvariant,freebayes,merge,mpileup,snpeff,strelka,tiddit,vep --outdir results + tags: + - full_pipeline_manual + - manual + - germline + - variant_calling diff --git a/tests/test_tumor_normal_pair.yml b/tests/test_tumor_normal_pair.yml new file mode 100644 index 0000000000..b616fc2b83 --- /dev/null +++ b/tests/test_tumor_normal_pair.yml @@ -0,0 +1,90 @@ +- name: Run default pipeline for tumor normal pair + command: nextflow run main.nf -profile test_cache,pair --outdir results + tags: + - default_extended + - preprocessing + - tumor_normal_pair + files: + - path: results/csv/markduplicates.csv + md5sum: e8e587ac25253ff7ab8f1cc66d410c98 + - path: results/csv/markduplicates_no_table.csv + md5sum: 617574c9b607e5daaf4ad56d48982247 + - path: results/csv/recalibrated.csv + md5sum: 008dff17e2a0d96ef9c1cae12fcab6ab + - path: results/multiqc + - path: results/preprocessing/markduplicates/test/test.md.cram + # binary changes md5sums on reruns + - path: results/preprocessing/markduplicates/test/test.md.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recal_table/test/test.recal.table + md5sum: 4ac774bf5f1157e77426fd82f5ac0fbe + - path: results/preprocessing/recalibrated/test/test.recal.cram + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/markduplicates/test2/test2.md.cram + # binary changes md5sums on reruns + - path: results/preprocessing/markduplicates/test2/test2.md.cram.crai + # binary changes md5sums on reruns + - path: results/preprocessing/recal_table/test2/test2.recal.table + md5sum: 0626cd4337eab79b38b5bc5c95e0c003 + - path: results/preprocessing/recalibrated/test2/test2.recal.cram + # binary changes md5sums on reruns + - path: results/preprocessing/recalibrated/test2/test2.recal.cram.crai + # binary changes md5sums on reruns + - path: results/reports/fastqc/test-test_L1 + - path: results/reports/fastqc/test2-test_L1 + - path: results/reports/markduplicates/test/test.md.cram.metrics + contains: ["test 8547 767 84 523391 3882 0 0 0.385081", "1.0 767 767"] + - path: results/reports/markduplicates/test2/test2.md.cram.metrics + contains: ["test2 10103 880 35 523579 4837 2 0 0.408076 193306", "1.0 1 876 876", "100.0 80.515303 0 0"] + - path: results/reports/mosdepth/test/test.md.mosdepth.global.dist.txt + md5sum: 76fa71922a3f748e507c2364c531dfcb + - path: results/reports/mosdepth/test/test.md.mosdepth.region.dist.txt + md5sum: abc5df85e302b79985627888870882da + - path: results/reports/mosdepth/test/test.md.mosdepth.summary.txt + md5sum: d536456436eb275159b8c6af83213d80 + - path: results/reports/mosdepth/test/test.md.regions.bed.gz + md5sum: 38fe39894abe62e38f8ac214cba64f2b + - path: results/reports/mosdepth/test/test.md.regions.bed.gz.csi + md5sum: b1c2a861f64e20a94108a6de3b76c582 + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: 76fa71922a3f748e507c2364c531dfcb + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + md5sum: abc5df85e302b79985627888870882da + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: d536456436eb275159b8c6af83213d80 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz + md5sum: 38fe39894abe62e38f8ac214cba64f2b + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz.csi + md5sum: b1c2a861f64e20a94108a6de3b76c582 + - path: 
results/reports/mosdepth/test2/test2.md.mosdepth.global.dist.txt + md5sum: 2020cf6dfc7ddca020c921dd9f0549b7 + - path: results/reports/mosdepth/test2/test2.md.mosdepth.region.dist.txt + md5sum: 38ff8b38c33b9231f047fea8ea830aae + - path: results/reports/mosdepth/test2/test2.md.mosdepth.summary.txt + md5sum: 8b991358768cade225470a07cd34f573 + - path: results/reports/mosdepth/test2/test2.md.regions.bed.gz + md5sum: 5d67bc6ea9f077abb4fdac3b087c6387 + - path: results/reports/mosdepth/test2/test2.md.regions.bed.gz.csi + md5sum: d5f1c9389ecf52ba839e834780a94549 + - path: results/reports/mosdepth/test2/test2.recal.mosdepth.global.dist.txt + md5sum: 2020cf6dfc7ddca020c921dd9f0549b7 + - path: results/reports/mosdepth/test2/test2.recal.mosdepth.region.dist.txt + md5sum: 38ff8b38c33b9231f047fea8ea830aae + - path: results/reports/mosdepth/test2/test2.recal.mosdepth.summary.txt + md5sum: 8b991358768cade225470a07cd34f573 + - path: results/reports/mosdepth/test2/test2.recal.regions.bed.gz + md5sum: 5d67bc6ea9f077abb4fdac3b087c6387 + - path: results/reports/mosdepth/test2/test2.recal.regions.bed.gz.csi + md5sum: d5f1c9389ecf52ba839e834780a94549 + - path: results/reports/samtools/test/test.md.cram.stats + # conda changes md5sums for test + - path: results/reports/samtools/test/test.recal.cram.stats + # conda changes md5sums for test + - path: results/reports/samtools/test2/test2.md.cram.stats + # conda changes md5sums for test + - path: results/reports/samtools/test2/test2.recal.cram.stats + # conda changes md5sums for test + - path: results/preprocessing/mapped/ + should_exist: false diff --git a/tests/test_umi.yml b/tests/test_umi.yml new file mode 100644 index 0000000000..0034d131d9 --- /dev/null +++ b/tests/test_umi.yml @@ -0,0 +1,88 @@ +- name: Run UMI test + command: nextflow run main.nf -profile test_cache,umi --outdir results + tags: + - preprocessing + - umi + files: + - path: results/preprocessing/umi/test/test-test_L1_umi-consensus.bam + # binary changes md5sums on reruns. + - path: results/reports/umi/test-test_L1_umi_histogram.txt + md5sum: 85292e9acb83edf17110dce17be27f44 + - path: results/csv/markduplicates.csv + md5sum: 0d6120bb99e92f6810343270711ca53e + - path: results/csv/markduplicates_no_table.csv + md5sum: 2a2d3d4842befd4def39156463859ee3 + - path: results/csv/recalibrated.csv + md5sum: 2d29d9e53894dcce96a1b5beb6ef3312 + - path: results/preprocessing/markduplicates/test/test.md.cram + # binary changes md5sums on reruns. + - path: results/preprocessing/markduplicates/test/test.md.cram.crai + # binary changes md5sums on reruns. + - path: results/preprocessing/recal_table/test/test.recal.table + md5sum: 18ea609fa06356aba3bcabda12f2cf7d + - path: results/preprocessing/recalibrated/test/test.recal.cram + # binary changes md5sums on reruns. + - path: results/preprocessing/recalibrated/test/test.recal.cram.crai + # binary changes md5sums on reruns. 
+ - path: results/reports/markduplicates/test/test.md.cram.metrics + contains: ["test 0 2804 3 4 0 77 0 0.027461 50115", "1.0 1 2651 2651", "12.0 8.9868 0 0"] + - path: results/reports/mosdepth/test/test.md.mosdepth.global.dist.txt + md5sum: 09d22913aa50a0207f97a3f85b182c6e + - path: results/reports/mosdepth/test/test.md.mosdepth.region.dist.txt + md5sum: 9359ba1c4e09aa47cc95c9134f526675 + - path: results/reports/mosdepth/test/test.md.mosdepth.summary.txt + md5sum: 9bbea5e4d213a51f501c2aadff8d4526 + - path: results/reports/mosdepth/test/test.md.regions.bed.gz + md5sum: ef4426cd1d433464004bb39d483abad9 + - path: results/reports/mosdepth/test/test.md.regions.bed.gz.csi + md5sum: d0713716f63ac573f4a3385733e9a537 + - path: results/reports/mosdepth/test/test.recal.mosdepth.global.dist.txt + md5sum: 09d22913aa50a0207f97a3f85b182c6e + - path: results/reports/mosdepth/test/test.recal.mosdepth.region.dist.txt + md5sum: 9359ba1c4e09aa47cc95c9134f526675 + - path: results/reports/mosdepth/test/test.recal.mosdepth.summary.txt + md5sum: 9bbea5e4d213a51f501c2aadff8d4526 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz + md5sum: ef4426cd1d433464004bb39d483abad9 + - path: results/reports/mosdepth/test/test.recal.regions.bed.gz.csi + md5sum: d0713716f63ac573f4a3385733e9a537 + - path: results/reports/samtools/test/test.md.cram.stats + # text-based file changes md5sums on reruns + - path: results/reports/samtools/test/test.recal.cram.stats + # text-based file changes md5sums on reruns + +- name: Run Sentieon-FGBio UMI combination test + command: nextflow run main.nf -profile test_cache,software_license,umi --sentieon_extension --outdir results --aligner "sentieon-bwamem" + tags: + - preprocessing + - umi + exit_code: 1 + stdout: + contains: + - "Sentieon BWA is currently not compatible with FGBio UMI handeling. Please choose a different aligner." 
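The Sentieon-FGBio entry above is a negative test: it asserts only an exit code of 1 and a fragment of the captured stdout, so the corresponding guard has to abort the run during parameter validation, before any process is submitted. A minimal sketch of such a guard, with hypothetical placement (the actual check lives in the pipeline's validation code, which is not part of this diff):

```groovy
// Hypothetical early-validation guard mirroring the negative test above.
// Nextflow's error() prints the message and aborts the run with exit
// code 1, which is what the `exit_code: 1` assertion detects.
if (params.umi_read_structure && params.aligner == "sentieon-bwamem") {
    error("Sentieon BWA is currently not compatible with FGBio UMI handeling. Please choose a different aligner.")
}
```

Note that the `contains` assertion matches the pipeline's message verbatim, so the spelling of the error text (including "handeling") must stay in sync between the guard and the test.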
+# - name: Run UMI TSO test +# command: nextflow run main.nf -profile test_cache,umi_tso --outdir results +# tags: +# - umi_tso +# - umi +# files: +# - path: results/fastqtobam/1234N-HT1080_umi_converted.bam +# - path: results/bam2fastq/1234N-HT1080_interleaved.fq.gz +# - path: results/callumiconsensus/1234N-HT1080_umi-consensus.bam +# - path: results/cat/1234N-HT1080_1.merged.fastq.gz +# - path: results/cat/1234N-HT1080_2.merged.fastq.gz +# - path: results/groupreadsbyumi/1234N-HT1080_umi-grouped.bam +# - path: results/groupreadsbyumi/1234N-HT1080_umi_histogram.txt +# - path: results/samblaster/1234N-HT1080_unsorted_tagged.bam +# - path: results/samtools/1234N-HT1080.map_map.bam +# - path: results/samtools/1234N-HT1080.map_unmap.bam +# - path: results/samtools/1234N-HT1080.mapped_1.fq.gz +# - path: results/samtools/1234N-HT1080.mapped_2.fq.gz +# - path: results/samtools/1234N-HT1080.mapped_other.fq.gz +# - path: results/samtools/1234N-HT1080.mapped_singleton.fq.gz +# - path: results/samtools/1234N-HT1080.unmap_map.bam +# - path: results/samtools/1234N-HT1080.unmap_unmap.bam +# - path: results/samtools/1234N-HT1080.unmapped_1.fq.gz +# - path: results/samtools/1234N-HT1080.unmapped_2.fq.gz +# - path: results/samtools/1234N-HT1080.unmapped_other.fq.gz +# - path: results/samtools/1234N-HT1080.unmapped_singleton.fq.gz diff --git a/tower.yml b/tower.yml index 787aedfe92..acfdf00c2a 100644 --- a/tower.yml +++ b/tower.yml @@ -1,5 +1,59 @@ reports: multiqc_report.html: display: "MultiQC HTML report" - samplesheet.csv: - display: "Auto-created samplesheet with collated metadata and FASTQ paths" + "**/umi/*_umi_histogram.txt": + display: "All UMI histograms" + "**/reports/fastp/*/*_fastp.html": + display: "FASTP report" + "**/reports/mosdepth/*/*.mosdepth.summary.txt": + display: "All samples summary of mean depths per chromosome and within specified regions per chromosome" + "**/csv/*.csv": + display: "All CSV files to restart nf-core/sarek at a different step" + "**/variantcalling/ascat/*/*.tumour.ASPCF.png": + display: "ASCAT: All allele-specific copy number segmentation images" + "**/variantcalling/ascat/*/*.before_correction_Tumour.*.png": + display: "ASCAT: All samples logR and BAF values" + "**/variantcalling/ascat/*/*.after_correction_GC_Tumour.*.png": + display: "ASCAT: All samples GC and RT corrected logR and BAF values" + "**/variantcalling/ascat/*/*.tumour.sunrise.png": + display: "ASCAT: Range of ploidy and tumor percentage values" + "**/variantcalling/ascat/*/*.metrics.txt": + display: "ASCAT: Multiple metrics information" + "**/variantcalling/ascat/*/*.cnvs.txt": + display: "ASCAT: CNVS information" + "**/variantcalling/ascat/*/*.purityploidy.txt": + display: "ASCAT: Purity and ploidy information" + "**/variantcalling/ascat/*/*.segments.txt": + display: "ASCAT: copy number segments information" + "**/variantcalling/ascat/*/*_tumourBAF.txt": + display: "ASCAT: beta allele frequencies" + "**/variantcalling/ascat/*/*.tumour_*LogR.txt": + display: "ASCAT: total copy number on a logarithmic scale" + "**/variantcalling/cnvkit/*/*-diagram.pdf": + display: "CNVKIT: Copy numbers or segments on chromosomes" + "**/variantcalling/cnvkit/**-scatter.png": + display: "CNVKIT: Bin-level log2 coverages and segmentation calls" + "**/variantcalling/controlfreec/*/config.txt": + display: "Control-FREEC: Configuration file used to run Control-FREEC" + "**/variantcalling/controlfreec/*/*_BAF.png": + display: "Control-FREEC: BAF plot" + "**/variantcalling/controlfreec/*/*_ratio.log2.png": + display: 
"Control-FREEC: log2 ratio plot" + "**/variantcalling/controlfreec/*/*_ratio.png": + display: "Control-FREEC: ratio plot" + "**/variantcalling/controlfreec/*/*.circos.txt": + display: "Control-FREEC: translated output to the Circos format" + "**/variantcalling/controlfreec/*/*.p.value.txt": + display: "Control-FREEC: CNV file containing p_values for each call" + "**/variantcalling/controlfreec/*/*_BAF.txt": + display: "Control-FREEC: file with beta allele frequencies for each possibly heterozygous SNP position" + "**/variantcalling/controlfreec/*/*_info.txt": + display: "Control-FREEC: parsable file with information about FREEC run" + "**/reports/bcftools/*.bcftools_stats.txt": + display: "All samples raw statistics" + "**/reports/SnpEff/*/*/*_snpEff.html": + display: "Statistics and plots for the SnpEff run" + "**/reports/SnpEff/*/*/*_snpEff.genes.txt": + display: "TXT (tab separated) summary counts for variants affecting each transcript and gene" + "**/reports/EnsemblVEP/*/*/*_VEP.summary.html": + display: "Summary of the VEP run" diff --git a/workflows/sarek.nf b/workflows/sarek.nf index 4eea44c4fb..7a90bd6757 100644 --- a/workflows/sarek.nf +++ b/workflows/sarek.nf @@ -4,7 +4,7 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { paramsSummaryLog; paramsSummaryMap } from 'plugin/nf-validation' +include { paramsSummaryLog; paramsSummaryMap; fromSamplesheet } from 'plugin/nf-validation' def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs) def citation = '\n' + WorkflowMain.citation(workflow) + '\n' @@ -13,18 +13,68 @@ def summary_params = paramsSummaryMap(workflow) // Print parameter summary log to screen log.info logo + paramsSummaryLog(workflow) + citation +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + VALIDATE INPUTS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +// Check input path parameters to see if they exist +def checkPathParamList = [ + params.ascat_alleles, + params.ascat_loci, + params.ascat_loci_gc, + params.ascat_loci_rt, + params.bwa, + params.bwamem2, + params.bcftools_annotations, + params.bcftools_annotations_tbi, + params.bcftools_header_lines, + params.cf_chrom_len, + params.chr_dir, + params.cnvkit_reference, + params.dbnsfp, + params.dbnsfp_tbi, + params.dbsnp, + params.dbsnp_tbi, + params.dict, + params.dragmap, + params.fasta, + params.fasta_fai, + params.germline_resource, + params.germline_resource_tbi, + params.input, + params.intervals, + params.known_indels, + params.known_indels_tbi, + params.known_snps, + params.known_snps_tbi, + params.mappability, + params.multiqc_config, + params.ngscheckmate_bed, + params.pon, + params.pon_tbi, + params.sentieon_dnascope_model, + params.spliceai_indel, + params.spliceai_indel_tbi, + params.spliceai_snv, + params.spliceai_snv_tbi +] + +// only check if we are using the tools +if (params.tools && (params.tools.split(',').contains('snpeff') || params.tools.split(',').contains('merge'))) checkPathParamList.add(params.snpeff_cache) +if (params.tools && (params.tools.split(',').contains('vep') || params.tools.split(',').contains('merge'))) checkPathParamList.add(params.vep_cache) + +// Validate input parameters WorkflowSarek.initialise(params, log) /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - CONFIG FILES + Check mandatory parameters 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -ch_multiqc_config = Channel.fromPath("$projectDir/assets/multiqc_config.yml", checkIfExists: true) -ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath( params.multiqc_config, checkIfExists: true ) : Channel.empty() -ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath( params.multiqc_logo, checkIfExists: true ) : Channel.empty() -ch_multiqc_custom_methods_description = params.multiqc_methods_description ? file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) +for (param in checkPathParamList) if (param) file(param, checkIfExists: true) /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -32,23 +82,150 @@ ch_multiqc_custom_methods_description = params.multiqc_methods_description ? fil ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -// -// SUBWORKFLOW: Consisting of a mix of local and nf-core/modules -// -include { INPUT_CHECK } from '../subworkflows/local/input_check' +// Initialize file channels based on params, defined in the params.genomes[params.genome] scope +ascat_alleles = params.ascat_alleles ? Channel.fromPath(params.ascat_alleles).collect() : Channel.empty() +ascat_loci = params.ascat_loci ? Channel.fromPath(params.ascat_loci).collect() : Channel.empty() +ascat_loci_gc = params.ascat_loci_gc ? Channel.fromPath(params.ascat_loci_gc).collect() : Channel.value([]) +ascat_loci_rt = params.ascat_loci_rt ? Channel.fromPath(params.ascat_loci_rt).collect() : Channel.value([]) +bcftools_annotations = params.bcftools_annotations ? Channel.fromPath(params.bcftools_annotations).collect() : Channel.empty() +bcftools_header_lines = params.bcftools_header_lines ? Channel.fromPath(params.bcftools_header_lines).collect() : Channel.empty() +cf_chrom_len = params.cf_chrom_len ? Channel.fromPath(params.cf_chrom_len).collect() : [] +chr_dir = params.chr_dir ? Channel.fromPath(params.chr_dir).collect() : Channel.value([]) +dbsnp = params.dbsnp ? Channel.fromPath(params.dbsnp).collect() : Channel.value([]) +fasta = params.fasta ? Channel.fromPath(params.fasta).first() : Channel.empty() +fasta_fai = params.fasta_fai ? Channel.fromPath(params.fasta_fai).collect() : Channel.empty() +germline_resource = params.germline_resource ? Channel.fromPath(params.germline_resource).collect() : Channel.value([]) // Mutect2 does not require a germline resource, so set to optional input +known_indels = params.known_indels ? Channel.fromPath(params.known_indels).collect() : Channel.value([]) +known_snps = params.known_snps ? Channel.fromPath(params.known_snps).collect() : Channel.value([]) +mappability = params.mappability ? Channel.fromPath(params.mappability).collect() : Channel.value([]) +pon = params.pon ? Channel.fromPath(params.pon).collect() : Channel.value([]) // PON is optional for Mutect2 (but highly recommended) +sentieon_dnascope_model = params.sentieon_dnascope_model ? Channel.fromPath(params.sentieon_dnascope_model).collect() : Channel.value([]) + +// Initialize value channels based on params, defined in the params.genomes[params.genome] scope +ascat_genome = params.ascat_genome ?: Channel.empty() +dbsnp_vqsr = params.dbsnp_vqsr ? Channel.value(params.dbsnp_vqsr) : Channel.empty() +known_indels_vqsr = params.known_indels_vqsr ? 
Channel.value(params.known_indels_vqsr) : Channel.empty() +known_snps_vqsr = params.known_snps_vqsr ? Channel.value(params.known_snps_vqsr) : Channel.empty() +ngscheckmate_bed = params.ngscheckmate_bed ? Channel.value(params.ngscheckmate_bed) : Channel.empty() +snpeff_db = params.snpeff_db ?: Channel.empty() +vep_cache_version = params.vep_cache_version ?: Channel.empty() +vep_genome = params.vep_genome ?: Channel.empty() +vep_species = params.vep_species ?: Channel.empty() + + +vep_extra_files = [] + +if (params.dbnsfp && params.dbnsfp_tbi) { + vep_extra_files.add(file(params.dbnsfp, checkIfExists: true)) + vep_extra_files.add(file(params.dbnsfp_tbi, checkIfExists: true)) +} + +if (params.spliceai_snv && params.spliceai_snv_tbi && params.spliceai_indel && params.spliceai_indel_tbi) { + vep_extra_files.add(file(params.spliceai_indel, checkIfExists: true)) + vep_extra_files.add(file(params.spliceai_indel_tbi, checkIfExists: true)) + vep_extra_files.add(file(params.spliceai_snv, checkIfExists: true)) + vep_extra_files.add(file(params.spliceai_snv_tbi, checkIfExists: true)) +} /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - IMPORT NF-CORE MODULES/SUBWORKFLOWS + IMPORT LOCAL/NF-CORE MODULES/SUBWORKFLOWS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -// -// MODULE: Installed directly from nf-core/modules -// -include { FASTQC } from '../modules/nf-core/fastqc/main' -include { MULTIQC } from '../modules/nf-core/multiqc/main' -include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' +// Create samplesheets to restart from different steps +include { SAMPLESHEET_TO_CHANNEL } from '../subworkflows/local/samplesheet_to_channel/main' +include { CHANNEL_ALIGN_CREATE_CSV } from '../subworkflows/local/channel_align_create_csv/main' +include { CHANNEL_MARKDUPLICATES_CREATE_CSV } from '../subworkflows/local/channel_markduplicates_create_csv/main' +include { CHANNEL_BASERECALIBRATOR_CREATE_CSV } from '../subworkflows/local/channel_baserecalibrator_create_csv/main' +include { CHANNEL_APPLYBQSR_CREATE_CSV } from '../subworkflows/local/channel_applybqsr_create_csv/main' +include { CHANNEL_VARIANT_CALLING_CREATE_CSV } from '../subworkflows/local/channel_variant_calling_create_csv/main' + +// Download cache for SnpEff/VEP if needed +include { DOWNLOAD_CACHE_SNPEFF_VEP } from '../subworkflows/local/download_cache_snpeff_vep/main' + +// Initialize annotation cache +include { INITIALIZE_ANNOTATION_CACHE } from '../subworkflows/local/initialize_annotation_cache/main' + +// Build indices if needed +include { PREPARE_GENOME } from '../subworkflows/local/prepare_genome/main' + +// Build intervals if needed +include { PREPARE_INTERVALS } from '../subworkflows/local/prepare_intervals/main' + +// Build CNVkit reference if needed +include { PREPARE_REFERENCE_CNVKIT } from '../subworkflows/local/prepare_reference_cnvkit/main' + +// Convert BAM files to FASTQ files +include { BAM_CONVERT_SAMTOOLS as CONVERT_FASTQ_INPUT } from '../subworkflows/local/bam_convert_samtools/main' +include { BAM_CONVERT_SAMTOOLS as CONVERT_FASTQ_UMI } from '../subworkflows/local/bam_convert_samtools/main' + +// Run FASTQC +include { FASTQC } from '../modules/nf-core/fastqc/main' + +// TRIM/SPLIT FASTQ Files +include { FASTP } from '../modules/nf-core/fastp/main' + +// Create umi consensus bams from fastq +include { FASTQ_CREATE_UMI_CONSENSUS_FGBIO } from '../subworkflows/local/fastq_create_umi_consensus_fgbio/main' 
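These imports rely on DSL2 module aliasing: BAM_CONVERT_SAMTOOLS is included twice under different names (CONVERT_FASTQ_INPUT and CONVERT_FASTQ_UMI) so the same subworkflow can be invoked once to convert BAM input to FASTQ and once more after UMI consensus calling, with each alias carrying its own output channels and per-process configuration. A self-contained sketch of the mechanism, with a dummy process standing in for the real subworkflow:

```groovy
// ---- modules/convert.nf ---- (dummy stand-in, not the real BAM_CONVERT_SAMTOOLS)
process CONVERT {
    input:
    val x

    output:
    val y

    exec:
    y = "converted: ${x}"
}

// ---- main.nf ----
// One module, two aliases: each alias is an independent invocation point.
include { CONVERT as CONVERT_INPUT } from './modules/convert'
include { CONVERT as CONVERT_UMI   } from './modules/convert'

workflow {
    CONVERT_INPUT(Channel.of('bam-from-samplesheet'))
    CONVERT_UMI(Channel.of('umi-consensus-bam'))
    CONVERT_INPUT.out.view { "input: ${it}" }
    CONVERT_UMI.out.view { "umi: ${it}" }
}
```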
+ +// Map input reads to reference genome +include { FASTQ_ALIGN_BWAMEM_MEM2_DRAGMAP_SENTIEON } from '../subworkflows/local/fastq_align_bwamem_mem2_dragmap_sentieon/main' + +// Merge and index BAM files (optional) +include { BAM_MERGE_INDEX_SAMTOOLS } from '../subworkflows/local/bam_merge_index_samtools/main' + +// Convert BAM files +include { SAMTOOLS_CONVERT as BAM_TO_CRAM } from '../modules/nf-core/samtools/convert/main' +include { SAMTOOLS_CONVERT as BAM_TO_CRAM_MAPPING } from '../modules/nf-core/samtools/convert/main' + +// Convert CRAM files (optional) +include { SAMTOOLS_CONVERT as CRAM_TO_BAM } from '../modules/nf-core/samtools/convert/main' +include { SAMTOOLS_CONVERT as CRAM_TO_BAM_RECAL } from '../modules/nf-core/samtools/convert/main' + +// Mark Duplicates (+QC) +include { BAM_MARKDUPLICATES } from '../subworkflows/local/bam_markduplicates/main' +include { BAM_MARKDUPLICATES_SPARK } from '../subworkflows/local/bam_markduplicates_spark/main' +include { BAM_SENTIEON_DEDUP } from '../subworkflows/local/bam_sentieon_dedup/main' + +// QC on CRAM +include { CRAM_QC_MOSDEPTH_SAMTOOLS as CRAM_QC_NO_MD } from '../subworkflows/local/cram_qc_mosdepth_samtools/main' +include { CRAM_QC_MOSDEPTH_SAMTOOLS as CRAM_QC_RECAL } from '../subworkflows/local/cram_qc_mosdepth_samtools/main' + +// Create recalibration tables +include { BAM_BASERECALIBRATOR } from '../subworkflows/local/bam_baserecalibrator/main' +include { BAM_BASERECALIBRATOR_SPARK } from '../subworkflows/local/bam_baserecalibrator_spark/main' + +// Create recalibrated cram files to use for variant calling (+QC) +include { BAM_APPLYBQSR } from '../subworkflows/local/bam_applybqsr/main' +include { BAM_APPLYBQSR_SPARK } from '../subworkflows/local/bam_applybqsr_spark/main' + +// Variant calling on a single normal sample +include { BAM_VARIANT_CALLING_GERMLINE_ALL } from '../subworkflows/local/bam_variant_calling_germline_all/main' + +// Variant calling on a single tumor sample +include { BAM_VARIANT_CALLING_TUMOR_ONLY_ALL } from '../subworkflows/local/bam_variant_calling_tumor_only_all/main' + +// Variant calling on tumor/normal pair +include { BAM_VARIANT_CALLING_SOMATIC_ALL } from '../subworkflows/local/bam_variant_calling_somatic_all/main' + +// POST VARIANTCALLING: e.g. 
merging +include { POST_VARIANTCALLING } from '../subworkflows/local/post_variantcalling/main' + +// QC on VCF files +include { VCF_QC_BCFTOOLS_VCFTOOLS } from '../subworkflows/local/vcf_qc_bcftools_vcftools/main' + +// Sample QC on CRAM files +include { CRAM_SAMPLEQC } from '../subworkflows/local/cram_sampleqc/main' + +// Annotation +include { VCF_ANNOTATE_ALL } from '../subworkflows/local/vcf_annotate_all/main' + +// REPORTING VERSIONS OF SOFTWARE USED +include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' + +// MULTIQC +include { MULTIQC } from '../modules/nf-core/multiqc/main' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -56,58 +233,886 @@ include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoft ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -// Info required for completion email and summary -def multiqc_report = [] - workflow SAREK { - ch_versions = Channel.empty() - - // - // SUBWORKFLOW: Read in samplesheet, validate and stage input files - // - INPUT_CHECK ( - file(params.input) - ) - ch_versions = ch_versions.mix(INPUT_CHECK.out.versions) - // TODO: OPTIONAL, you can use nf-validation plugin to create an input channel from the samplesheet with Channel.fromSamplesheet("input") - // See the documentation https://nextflow-io.github.io/nf-validation/samplesheets/fromSamplesheet/ - // ! There is currently no tooling to help you write a sample sheet schema - - // - // MODULE: Run FastQC - // - FASTQC ( - INPUT_CHECK.out.reads - ) - ch_versions = ch_versions.mix(FASTQC.out.versions.first()) - - CUSTOM_DUMPSOFTWAREVERSIONS ( - ch_versions.unique().collectFile(name: 'collated_versions.yml') - ) - - // - // MODULE: MultiQC - // - workflow_summary = WorkflowSarek.paramsSummaryMultiqc(workflow, summary_params) - ch_workflow_summary = Channel.value(workflow_summary) - - methods_description = WorkflowSarek.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description, params) - ch_methods_description = Channel.value(methods_description) - - ch_multiqc_files = Channel.empty() - ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) - ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml')) - ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) - ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) - - MULTIQC ( - ch_multiqc_files.collect(), - ch_multiqc_config.toList(), - ch_multiqc_custom_config.toList(), - ch_multiqc_logo.toList() - ) - multiqc_report = MULTIQC.out.report.toList() + // Parse samplesheet + // Set input, can either be from --input or from automatic retrieval in WorkflowSarek.groovy + ch_from_samplesheet = params.build_only_index ? Channel.empty() : params.input ? Channel.fromSamplesheet("input") : Channel.fromSamplesheet("input_restart") + SAMPLESHEET_TO_CHANNEL(ch_from_samplesheet) + + input_sample = SAMPLESHEET_TO_CHANNEL.out.input_sample + + // MULTIQC + ch_multiqc_config = Channel.fromPath("$projectDir/assets/multiqc_config.yml", checkIfExists: true) + ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath( params.multiqc_config, checkIfExists: true ) : Channel.empty() + ch_multiqc_logo = params.multiqc_logo ? 
Channel.fromPath( params.multiqc_logo, checkIfExists: true ) : Channel.empty() + ch_multiqc_custom_methods_description = params.multiqc_methods_description ? file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) + + // To gather all QC reports for MultiQC + reports = Channel.empty() + // To gather used softwares versions for MultiQC + versions = Channel.empty() + + // Download cache + if (params.download_cache) { + // Assuming that even if the cache is provided, if the user specifies download_cache, sarek will download the cache + ensemblvep_info = Channel.of([ [ id:"${params.vep_cache_version}_${params.vep_genome}" ], params.vep_genome, params.vep_species, params.vep_cache_version ]) + snpeff_info = Channel.of([ [ id:"${params.snpeff_genome}.${params.snpeff_db}" ], params.snpeff_genome, params.snpeff_db ]) + DOWNLOAD_CACHE_SNPEFF_VEP(ensemblvep_info, snpeff_info) + snpeff_cache = DOWNLOAD_CACHE_SNPEFF_VEP.out.snpeff_cache + vep_cache = DOWNLOAD_CACHE_SNPEFF_VEP.out.ensemblvep_cache.map{ meta, cache -> [ cache ] } + + versions = versions.mix(DOWNLOAD_CACHE_SNPEFF_VEP.out.versions) + } else { + // Looks for cache information either locally or on the cloud + INITIALIZE_ANNOTATION_CACHE( + (params.snpeff_cache && params.tools && (params.tools.split(',').contains("snpeff") || params.tools.split(',').contains('merge'))), + params.snpeff_cache, + params.snpeff_genome, + params.snpeff_db, + (params.vep_cache && params.tools && (params.tools.split(',').contains("vep") || params.tools.split(',').contains('merge'))), + params.vep_cache, + params.vep_species, + params.vep_cache_version, + params.vep_genome, + "Please refer to https://nf-co.re/sarek/docs/usage/#how-to-customise-snpeff-and-vep-annotation for more information.") + + snpeff_cache = INITIALIZE_ANNOTATION_CACHE.out.snpeff_cache + vep_cache = INITIALIZE_ANNOTATION_CACHE.out.ensemblvep_cache + } + + // Build indices if needed + PREPARE_GENOME( + ascat_alleles, + ascat_loci, + ascat_loci_gc, + ascat_loci_rt, + bcftools_annotations, + chr_dir, + dbsnp, + fasta, + fasta_fai, + germline_resource, + known_indels, + known_snps, + pon) + + // Gather built indices or get them from the params + // Built from the fasta file: + dict = params.dict ? Channel.fromPath(params.dict).map{ it -> [ [id:'dict'], it ] }.collect() + : PREPARE_GENOME.out.dict + fasta_fai = params.fasta_fai ? Channel.fromPath(params.fasta_fai).collect() + : PREPARE_GENOME.out.fasta_fai + bwa = params.bwa ? Channel.fromPath(params.bwa).collect() + : PREPARE_GENOME.out.bwa + bwamem2 = params.bwamem2 ? Channel.fromPath(params.bwamem2).collect() + : PREPARE_GENOME.out.bwamem2 + dragmap = params.dragmap ? Channel.fromPath(params.dragmap).collect() + : PREPARE_GENOME.out.hashtable + + // Gather index for mapping given the chosen aligner + index_alignement = (params.aligner == "bwa-mem" || params.aligner == "sentieon-bwamem") ? bwa : + params.aligner == "bwa-mem2" ? bwamem2 : + dragmap + + // TODO: add a param for msisensorpro_scan + msisensorpro_scan = PREPARE_GENOME.out.msisensorpro_scan + + // For ASCAT, extracted from zip or tar.gz files: + allele_files = PREPARE_GENOME.out.allele_files + chr_files = PREPARE_GENOME.out.chr_files + gc_file = PREPARE_GENOME.out.gc_file + loci_files = PREPARE_GENOME.out.loci_files + rt_file = PREPARE_GENOME.out.rt_file + + // Tabix indexed vcf files: + bcftools_annotations_tbi = params.bcftools_annotations ? params.bcftools_annotations_tbi ? 
Channel.fromPath(params.bcftools_annotations_tbi).collect() : PREPARE_GENOME.out.bcftools_annotations_tbi : Channel.empty() + dbsnp_tbi = params.dbsnp ? params.dbsnp_tbi ? Channel.fromPath(params.dbsnp_tbi).collect() : PREPARE_GENOME.out.dbsnp_tbi : Channel.value([]) + germline_resource_tbi = params.germline_resource ? params.germline_resource_tbi ? Channel.fromPath(params.germline_resource_tbi).collect() : PREPARE_GENOME.out.germline_resource_tbi : [] // do not change to Channel.value([]), the check for its existence then fails for GetPileupSummaries + known_indels_tbi = params.known_indels ? params.known_indels_tbi ? Channel.fromPath(params.known_indels_tbi).collect() : PREPARE_GENOME.out.known_indels_tbi : Channel.value([]) + known_snps_tbi = params.known_snps ? params.known_snps_tbi ? Channel.fromPath(params.known_snps_tbi).collect() : PREPARE_GENOME.out.known_snps_tbi : Channel.value([]) + pon_tbi = params.pon ? params.pon_tbi ? Channel.fromPath(params.pon_tbi).collect() : PREPARE_GENOME.out.pon_tbi : Channel.value([]) + + // known_sites is made by grouping both the dbsnp and the known snps/indels resources + // Either or both can be optional + known_sites_indels = dbsnp.concat(known_indels).collect() + known_sites_indels_tbi = dbsnp_tbi.concat(known_indels_tbi).collect() + + known_sites_snps = dbsnp.concat(known_snps).collect() + known_sites_snps_tbi = dbsnp_tbi.concat(known_snps_tbi).collect() + + // Build intervals if needed + PREPARE_INTERVALS(fasta_fai, params.intervals, params.no_intervals) + + // Intervals to speed up preprocessing/variant calling by spread/gather + // [interval.bed] all intervals in one file + intervals_bed_combined = params.no_intervals ? Channel.value([]) : PREPARE_INTERVALS.out.intervals_bed_combined + intervals_bed_gz_tbi_combined = params.no_intervals ? Channel.value([]) : PREPARE_INTERVALS.out.intervals_bed_gz_tbi_combined + + // For QC during preprocessing, we don't need any intervals (MOSDEPTH doesn't take them for WGS)
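The tabix-index channels above all follow the same three-way fallback: a user-supplied index wins, otherwise the index freshly built by PREPARE_GENOME is used, otherwise an empty placeholder keeps the optional input from blocking downstream processes. A runnable toy version of that nested ternary, with plain values standing in for the params and the PREPARE_GENOME output (all names here are illustrative only):

```groovy
// fallback.nf -- toy version of the index-fallback ternary used above
workflow {
    def user_vcf  = '/refs/dbsnp.vcf.gz'  // stand-in for params.dbsnp
    def user_tbi  = null                  // stand-in for params.dbsnp_tbi (not provided here)
    def built_tbi = Channel.of('/work/genome/dbsnp.vcf.gz.tbi') // stand-in for PREPARE_GENOME.out.dbsnp_tbi

    // user index > freshly built index > empty placeholder
    def dbsnp_tbi = user_vcf
        ? ( user_tbi ? Channel.of(user_tbi) : built_tbi )
        : Channel.value([])

    dbsnp_tbi.view() // with these stand-ins, prints the built index path
}
```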
+ intervals_for_preprocessing = params.wes ? + intervals_bed_combined.map{it -> [ [ id:it.baseName ], it ]}.collect() : + Channel.value([ [ id:'null' ], [] ]) + + intervals = PREPARE_INTERVALS.out.intervals_bed // [ interval, num_intervals ] multiple interval.bed files, divided by useful intervals for scatter/gather + intervals_bed_gz_tbi = PREPARE_INTERVALS.out.intervals_bed_gz_tbi // [ interval_bed, tbi, num_intervals ] multiple interval.bed.gz/.tbi files, divided by useful intervals for scatter/gather + + intervals_and_num_intervals = intervals.map{ interval, num_intervals -> + if ( num_intervals < 1 ) [ [], num_intervals ] + else [ interval, num_intervals ] + } + + intervals_bed_gz_tbi_and_num_intervals = intervals_bed_gz_tbi.map{ intervals, num_intervals -> + if ( num_intervals < 1 ) [ [], [], num_intervals ] + else [ intervals[0], intervals[1], num_intervals ] + } + + if (params.tools && params.tools.split(',').contains('cnvkit')) { + if (params.cnvkit_reference) { + cnvkit_reference = Channel.fromPath(params.cnvkit_reference).collect() + } else { + PREPARE_REFERENCE_CNVKIT(fasta, intervals_bed_combined) + cnvkit_reference = PREPARE_REFERENCE_CNVKIT.out.cnvkit_reference + + versions = versions.mix(PREPARE_REFERENCE_CNVKIT.out.versions) + } + } else { + cnvkit_reference = Channel.value([]) + } + + // Gather used softwares versions + versions = versions.mix(PREPARE_GENOME.out.versions) + versions = versions.mix(PREPARE_INTERVALS.out.versions) + + // PREPROCESSING + + if (params.step == 'mapping') { + + // Figure out if input is bam or fastq + input_sample_type = input_sample.branch{ + bam: it[0].data_type == "bam" + fastq: it[0].data_type == "fastq" + } + + // Convert any bam input to fastq + // fasta is not needed when converting bam to fastq -> [ id:"fasta" ], [] + // No need for fasta.fai -> [] + interleave_input = false // Currently don't allow interleaved input + CONVERT_FASTQ_INPUT( + input_sample_type.bam, + [ [ id:"fasta" ], [] ], // fasta + [ [ id:'null' ], [] ], // fasta_fai + interleave_input) + + // Gather fastq (input or converted) + // Theoretically this could work on mixed input (fastq for one sample and bam for another) + // But not sure how to handle that with the samplesheet + // Or if we really want users to be able to do that + input_fastq = input_sample_type.fastq.mix(CONVERT_FASTQ_INPUT.out.reads) + + // STEP 0: QC & TRIM + // `--skip_tools fastqc` to skip fastqc + // Trim only with `--trim_fastq` + // Additional options to be set up + + // QC + if (!(params.skip_tools && params.skip_tools.split(',').contains('fastqc'))) { + FASTQC(input_fastq) + + reports = reports.mix(FASTQC.out.zip.collect{ meta, logs -> logs }) + versions = versions.mix(FASTQC.out.versions.first()) + } + + // UMI consensus calling + if (params.umi_read_structure) { + FASTQ_CREATE_UMI_CONSENSUS_FGBIO( + input_fastq, + fasta, + fasta_fai, + index_alignement, + params.group_by_umi_strategy) + + bam_converted_from_fastq = FASTQ_CREATE_UMI_CONSENSUS_FGBIO.out.consensusbam.map{ meta, bam -> [ meta, bam, [] ] } + + // Convert back to fastq for further preprocessing + // fasta is not needed when converting bam to fastq -> [ id:"fasta" ], [] + // No need for fasta.fai -> [] + interleave_input = false // Currently don't allow interleaved input + CONVERT_FASTQ_UMI( + bam_converted_from_fastq, + [ [ id:"fasta" ], [] ], // fasta + [ [ id:'null' ], [] ], // fasta_fai + interleave_input) + + reads_for_fastp = CONVERT_FASTQ_UMI.out.reads + + // Gather used softwares versions + versions = versions.mix(CONVERT_FASTQ_UMI.out.versions) + versions = 
versions.mix(FASTQ_CREATE_UMI_CONSENSUS_FGBIO.out.versions) + } else { + reads_for_fastp = input_fastq + } + + // Trimming and/or splitting + if (params.trim_fastq || params.split_fastq > 0) { + + save_trimmed_fail = false + save_merged = false + FASTP( + reads_for_fastp, + [], // we are not using any adapter fastas at the moment + save_trimmed_fail, + save_merged + ) + + reports = reports.mix(FASTP.out.json.collect{ meta, json -> json }) + reports = reports.mix(FASTP.out.html.collect{ meta, html -> html }) + + if (params.split_fastq) { + reads_for_alignment = FASTP.out.reads.map{ meta, reads -> + read_files = reads.sort(false) { a,b -> a.getName().tokenize('.')[0] <=> b.getName().tokenize('.')[0] }.collate(2) + [ meta + [ n_fastq: read_files.size() ], read_files ] + }.transpose() + } else reads_for_alignment = FASTP.out.reads + + versions = versions.mix(FASTP.out.versions) + + } else { + reads_for_alignment = reads_for_fastp + } + + // STEP 1: MAPPING READS TO REFERENCE GENOME + // First, we must calculate number of lanes for each sample (meta.n_fastq) + // This is needed to group reads from the same sample together using groupKey to avoid stalling the workflow + // when reads from different samples are mixed together + reads_for_alignment.map { meta, reads -> + [ meta.subMap('patient', 'sample', 'sex', 'status'), reads ] + } + .groupTuple() + .map { meta, reads -> + meta + [ n_fastq: reads.size() ] // We can drop the FASTQ files now that we know how many there are + } + .set { reads_grouping_key } + + // reads will be sorted + sort_bam = true + FASTQ_ALIGN_BWAMEM_MEM2_DRAGMAP_SENTIEON(reads_for_alignment, index_alignement, sort_bam, fasta, fasta_fai) + + // Grouping the bams from the same samples not to stall the workflow + // Use groupKey to make sure that the correct group can advance as soon as it is complete + // and not stall the workflow until all reads from all channels are mapped + bam_mapped = FASTQ_ALIGN_BWAMEM_MEM2_DRAGMAP_SENTIEON.out.bam + .combine(reads_grouping_key) // Creates a tuple of [ meta, bam, reads_grouping_key ] + .filter { meta1, bam, meta2 -> meta1.sample == meta2.sample } + // Add n_fastq and other variables to meta + .map { meta1, bam, meta2 -> + [ meta1 + meta2, bam ] + } + // Manipulate meta map to remove old fields and add new ones + .map { meta, bam -> + [ meta - meta.subMap('id', 'read_group', 'data_type', 'num_lanes', 'read_group', 'size') + [ data_type: 'bam', id: meta.sample ], bam ] + } + // Create groupKey from meta map + .map { meta, bam -> + [ groupKey( meta, meta.n_fastq), bam ] + } + // Group + .groupTuple() + + bai_mapped = FASTQ_ALIGN_BWAMEM_MEM2_DRAGMAP_SENTIEON.out.bai + .combine(reads_grouping_key) // Creates a tuple of [ meta, bai, reads_grouping_key ] + .filter { meta1, bai, meta2 -> meta1.sample == meta2.sample } + // Add n_fastq and other variables to meta + .map { meta1, bai, meta2 -> + [ meta1 + meta2, bai ] + } + // Manipulate meta map to remove old fields and add new ones + .map { meta, bai -> + [ meta - meta.subMap('id', 'read_group', 'data_type', 'num_lanes', 'read_group', 'size') + [ data_type: 'bai', id: meta.sample ], bai ] + } + // Create groupKey from meta map + .map { meta, bai -> + [ groupKey( meta, meta.n_fastq), bai ] + } + // Group + .groupTuple() + + + // gatk4 markduplicates can handle multiple bams as input, so no need to merge/index here + // Except if and only if save_mapped or (skipping markduplicates and sentieon-dedup) + if ( + params.save_mapped || + ( + (params.skip_tools && 
params.skip_tools.split(',').contains('markduplicates')) && + !(params.tools && params.tools.split(',').contains('sentieon_dedup')) + ) + ) { + // bams are merged (when multiple lanes from the same sample), indexed and then converted to cram + BAM_MERGE_INDEX_SAMTOOLS(bam_mapped) + + BAM_TO_CRAM_MAPPING(BAM_MERGE_INDEX_SAMTOOLS.out.bam_bai, fasta, fasta_fai) + // Create CSV to restart from this step + params.save_output_as_bam ? CHANNEL_ALIGN_CREATE_CSV(BAM_MERGE_INDEX_SAMTOOLS.out.bam_bai) : CHANNEL_ALIGN_CREATE_CSV(BAM_TO_CRAM_MAPPING.out.alignment_index) + + // Gather used softwares versions + versions = versions.mix(BAM_MERGE_INDEX_SAMTOOLS.out.versions) + versions = versions.mix(BAM_TO_CRAM_MAPPING.out.versions) + } + + // Gather used softwares versions + versions = versions.mix(CONVERT_FASTQ_INPUT.out.versions) + versions = versions.mix(FASTQ_ALIGN_BWAMEM_MEM2_DRAGMAP_SENTIEON.out.versions) + } + + if (params.step in ['mapping', 'markduplicates']) { + + // ch_cram_no_markduplicates_restart = Channel.empty() + cram_markduplicates_no_spark = Channel.empty() + cram_sentieon_dedup = Channel.empty() + cram_markduplicates_spark = Channel.empty() + + // STEP 2: markduplicates (+QC) + convert to CRAM + + // ch_bam_for_markduplicates will contain bam mapped with FASTQ_ALIGN_BWAMEM_MEM2_DRAGMAP_SENTIEON when step is mapping + // Or bams that are specified in the samplesheet.csv when step is prepare_recalibration + cram_for_markduplicates = params.step == 'mapping' ? bam_mapped : input_sample.map{ meta, input, index -> [ meta, input ] } + // if no MD is done, then run QC on mapped & converted CRAM files + // or the input BAM (+converted) or CRAM files + cram_skip_markduplicates = Channel.empty() + + // Should it be possible to restart from converted crams? + // For now, conversion from bam to cram is only done when skipping markduplicates + + if ( + params.skip_tools && + params.skip_tools.split(',').contains('markduplicates') && + !(params.tools && params.tools.split(',').contains('sentieon_dedup')) + ) { + if (params.step == 'mapping') { + cram_skip_markduplicates = BAM_TO_CRAM_MAPPING.out.alignment_index + } else { + input_markduplicates_convert = input_sample.branch{ + bam: it[0].data_type == "bam" + cram: it[0].data_type == "cram" + } + + // Convert any input BAMs to CRAM + BAM_TO_CRAM(input_markduplicates_convert.bam, fasta, fasta_fai) + versions = versions.mix(BAM_TO_CRAM.out.versions) + + cram_skip_markduplicates = Channel.empty().mix(input_markduplicates_convert.cram, BAM_TO_CRAM.out.alignment_index) + } + + CRAM_QC_NO_MD(cram_skip_markduplicates, fasta, intervals_for_preprocessing) + + // Gather QC reports + reports = reports.mix(CRAM_QC_NO_MD.out.reports.collect{ meta, report -> report }) + + // Gather used softwares versions + versions = versions.mix(CRAM_QC_NO_MD.out.versions) + } else if (params.use_gatk_spark && params.use_gatk_spark.contains('markduplicates')) { + BAM_MARKDUPLICATES_SPARK( + cram_for_markduplicates, + dict.map{ meta, dict -> [ dict ] }, + fasta, + fasta_fai, + intervals_for_preprocessing) + cram_markduplicates_spark = BAM_MARKDUPLICATES_SPARK.out.cram + + // Gather QC reports + reports = reports.mix(BAM_MARKDUPLICATES_SPARK.out.reports.collect{ meta, report -> report }) + + // Gather used softwares versions + versions = versions.mix(BAM_MARKDUPLICATES_SPARK.out.versions) + } else if (params.tools && params.tools.split(',').contains('sentieon_dedup')) { + crai_for_markduplicates = params.step == 'mapping' ? 
bai_mapped : input_sample.map{ meta, input, index -> [ meta, index ] } + BAM_SENTIEON_DEDUP( + cram_for_markduplicates, + crai_for_markduplicates, + fasta, + fasta_fai, + intervals_for_preprocessing) + + cram_sentieon_dedup = BAM_SENTIEON_DEDUP.out.cram + + // Gather QC reports + reports = reports.mix(BAM_SENTIEON_DEDUP.out.reports.collect{ meta, report -> report }) + + // Gather used softwares versions + versions = versions.mix(BAM_SENTIEON_DEDUP.out.versions) + } else { + BAM_MARKDUPLICATES( + cram_for_markduplicates, + fasta, + fasta_fai, + intervals_for_preprocessing) + + cram_markduplicates_no_spark = BAM_MARKDUPLICATES.out.cram + + // Gather QC reports + reports = reports.mix(BAM_MARKDUPLICATES.out.reports.collect{ meta, report -> report }) + + // Gather used softwares versions + versions = versions.mix(BAM_MARKDUPLICATES.out.versions) + } + + // ch_md_cram_for_restart contains either: + // - crams from markduplicates + // - crams from sentieon_dedup + // - crams from markduplicates_spark + // - crams from input step markduplicates --> from the converted ones only? + ch_md_cram_for_restart = Channel.empty().mix(cram_markduplicates_no_spark, cram_markduplicates_spark, cram_sentieon_dedup) + // Make sure correct data types are carried through + .map{ meta, cram, crai -> [ meta + [data_type: "cram"], cram, crai ] } + + // If params.save_output_as_bam, then convert CRAM files to BAM + CRAM_TO_BAM(ch_md_cram_for_restart, fasta, fasta_fai) + versions = versions.mix(CRAM_TO_BAM.out.versions) + + // CSV should be written for the file actually out, either CRAM or BAM + // Create CSV to restart from this step + csv_subfolder = (params.tools && params.tools.split(',').contains('sentieon_dedup')) ? 'sentieon_dedup' : 'markduplicates' + + params.save_output_as_bam ? 
CHANNEL_MARKDUPLICATES_CREATE_CSV(CRAM_TO_BAM.out.alignment_index, csv_subfolder, params.outdir, params.save_output_as_bam) : CHANNEL_MARKDUPLICATES_CREATE_CSV(ch_md_cram_for_restart, csv_subfolder, params.outdir, params.save_output_as_bam) + } + + if (params.step in ['mapping', 'markduplicates', 'prepare_recalibration']) { + + // Run if starting from step "prepare_recalibration" + if (params.step == 'prepare_recalibration') { + + // Support if starting from BAM or CRAM files + input_prepare_recal_convert = input_sample.branch{ + bam: it[0].data_type == "bam" + cram: it[0].data_type == "cram" + } + + // BAM files first must be converted to CRAM files since from this step on we base everything on CRAM format + BAM_TO_CRAM(input_prepare_recal_convert.bam, fasta, fasta_fai) + versions = versions.mix(BAM_TO_CRAM.out.versions) + + ch_cram_from_bam = BAM_TO_CRAM.out.alignment_index + // Make sure correct data types are carried through + .map{ meta, cram, crai -> [ meta + [data_type: "cram"], cram, crai ] } + + ch_cram_for_bam_baserecalibrator = Channel.empty().mix(ch_cram_from_bam, input_prepare_recal_convert.cram) + ch_md_cram_for_restart = ch_cram_from_bam + + } else { + + // ch_cram_for_bam_baserecalibrator contains either: + // - crams from markduplicates + // - crams from markduplicates_spark + // - crams converted from bam mapped when skipping markduplicates + // - input cram files, when start from step markduplicates + ch_cram_for_bam_baserecalibrator = Channel.empty().mix(ch_md_cram_for_restart, cram_skip_markduplicates ) + // Make sure correct data types are carried through + .map{ meta, cram, crai -> [ meta + [data_type: "cram"], cram, crai ] } + + } + + // STEP 3: Create recalibration tables + if (!(params.skip_tools && params.skip_tools.split(',').contains('baserecalibrator'))) { + + ch_table_bqsr_no_spark = Channel.empty() + ch_table_bqsr_spark = Channel.empty() + + if (params.use_gatk_spark && params.use_gatk_spark.contains('baserecalibrator')) { + BAM_BASERECALIBRATOR_SPARK( + ch_cram_for_bam_baserecalibrator, + dict, + fasta, + fasta_fai, + intervals_and_num_intervals, + known_sites_indels, + known_sites_indels_tbi) + + ch_table_bqsr_spark = BAM_BASERECALIBRATOR_SPARK.out.table_bqsr + + // Gather used softwares versions + versions = versions.mix(BAM_BASERECALIBRATOR_SPARK.out.versions) + } else { + + BAM_BASERECALIBRATOR( + ch_cram_for_bam_baserecalibrator, + dict, + fasta, + fasta_fai, + intervals_and_num_intervals, + known_sites_indels, + known_sites_indels_tbi) + + ch_table_bqsr_no_spark = BAM_BASERECALIBRATOR.out.table_bqsr + + // Gather used softwares versions + versions = versions.mix(BAM_BASERECALIBRATOR.out.versions) + } + + // ch_table_bqsr contains either: + // - bqsr table from baserecalibrator + // - bqsr table from baserecalibrator_spark + ch_table_bqsr = Channel.empty().mix( + ch_table_bqsr_no_spark, + ch_table_bqsr_spark) + + reports = reports.mix(ch_table_bqsr.collect{ meta, table -> table }) + + cram_applybqsr = ch_cram_for_bam_baserecalibrator.join(ch_table_bqsr, failOnDuplicate: true, failOnMismatch: true) + + // Create CSV to restart from this step + CHANNEL_BASERECALIBRATOR_CREATE_CSV(ch_md_cram_for_restart.join(ch_table_bqsr, failOnDuplicate: true), params.tools, params.skip_tools, params.save_output_as_bam, params.outdir) + } + } + + // STEP 4: RECALIBRATING + if (params.step in ['mapping', 'markduplicates', 'prepare_recalibration', 'recalibrate']) { + + // Run if starting from step "prepare_recalibration" + if (params.step == 'recalibrate') { + + // 
Support if starting from BAM or CRAM files + input_recal_convert = input_sample.branch{ + bam: it[0].data_type == "bam" + cram: it[0].data_type == "cram" + } + + // If BAM file, split up table and mapped file to convert BAM to CRAM + input_only_table = input_recal_convert.bam.map{ meta, bam, bai, table -> [ meta, table ] } + input_only_bam = input_recal_convert.bam.map{ meta, bam, bai, table -> [ meta, bam, bai ] } + + // BAM files first must be converted to CRAM files since from this step on we base everything on CRAM format + BAM_TO_CRAM(input_only_bam, fasta, fasta_fai) + versions = versions.mix(BAM_TO_CRAM.out.versions) + + cram_applybqsr = Channel.empty().mix( + BAM_TO_CRAM.out.alignment_index.join(input_only_table, failOnDuplicate: true, failOnMismatch: true), + input_recal_convert.cram) + // Join together converted cram with input tables + .map{ meta, cram, crai, table -> [ meta + [data_type: "cram"], cram, crai, table ]} + } + + if (!(params.skip_tools && params.skip_tools.split(',').contains('baserecalibrator'))) { + cram_variant_calling_no_spark = Channel.empty() + cram_variant_calling_spark = Channel.empty() + + if (params.use_gatk_spark && params.use_gatk_spark.contains('baserecalibrator')) { + + BAM_APPLYBQSR_SPARK( + cram_applybqsr, + dict, + fasta, + fasta_fai, + intervals_and_num_intervals) + + cram_variant_calling_spark = BAM_APPLYBQSR_SPARK.out.cram + + // Gather used softwares versions + versions = versions.mix(BAM_APPLYBQSR_SPARK.out.versions) + + } else { + + BAM_APPLYBQSR( + cram_applybqsr, + dict, + fasta, + fasta_fai, + intervals_and_num_intervals) + + cram_variant_calling_no_spark = BAM_APPLYBQSR.out.cram + + // Gather used softwares versions + versions = versions.mix(BAM_APPLYBQSR.out.versions) + } + + cram_variant_calling = Channel.empty().mix( + cram_variant_calling_no_spark, + cram_variant_calling_spark) + + // If params.save_output_as_bam, then convert CRAM files to BAM + CRAM_TO_BAM_RECAL(cram_variant_calling, fasta, fasta_fai) + versions = versions.mix(CRAM_TO_BAM_RECAL.out.versions) + + // CSV should be written for the file actually written out, either CRAM or BAM + csv_recalibration = Channel.empty() + csv_recalibration = params.save_output_as_bam ? 
CRAM_TO_BAM_RECAL.out.alignment_index : cram_variant_calling + + // Create CSV to restart from this step + CHANNEL_APPLYBQSR_CREATE_CSV(csv_recalibration) + + } else if (params.step == 'recalibrate') { + // cram_variant_calling contains either: + // - input bams converted to crams, if started from step recal + skip BQSR + // - input crams if started from step recal + skip BQSR + cram_variant_calling = Channel.empty().mix( + BAM_TO_CRAM.out.alignment_index, + input_recal_convert.cram.map{ meta, cram, crai, table -> [ meta, cram, crai ] }) + } else { + // cram_variant_calling contains either: + // - crams from markduplicates = ch_cram_for_bam_baserecalibrator if skip BQSR but not started from step recalibration + cram_variant_calling = Channel.empty().mix(ch_cram_for_bam_baserecalibrator) + } + } + + if (params.step == 'variant_calling') { + + input_variant_calling_convert = input_sample.branch{ + bam: it[0].data_type == "bam" + cram: it[0].data_type == "cram" + } + + // BAM files first must be converted to CRAM files since from this step on we base everything on CRAM format + BAM_TO_CRAM(input_variant_calling_convert.bam, fasta, fasta_fai) + versions = versions.mix(BAM_TO_CRAM.out.versions) + + cram_variant_calling = Channel.empty().mix(BAM_TO_CRAM.out.alignment_index, input_variant_calling_convert.cram) + + } + + if (params.step == 'annotate') cram_variant_calling = Channel.empty() + + // RUN CRAM QC on the recalibrated CRAM files or when starting from step variant calling. NGSCheckmate should be run also on non-recalibrated CRAM files + CRAM_SAMPLEQC(cram_variant_calling, + ngscheckmate_bed, + fasta, + params.skip_tools && params.skip_tools.split(',').contains('baserecalibrator'), + intervals_for_preprocessing) + + if (params.tools) { + + // + // Logic to separate germline samples, tumor samples with no matched normal, and combine tumor-normal pairs + // + cram_variant_calling_status = cram_variant_calling.branch{ + normal: it[0].status == 0 + tumor: it[0].status == 1 + } + + // All Germline samples + cram_variant_calling_normal_to_cross = cram_variant_calling_status.normal.map{ meta, cram, crai -> [ meta.patient, meta, cram, crai ] } + + // All tumor samples + cram_variant_calling_pair_to_cross = cram_variant_calling_status.tumor.map{ meta, cram, crai -> [ meta.patient, meta, cram, crai ] } + + // Tumor only samples + // 1. Group together all tumor samples by patient ID [ patient1, [ meta1, meta2 ], [ cram1, crai1, cram2, crai2 ] ] + + // Downside: this only works by waiting for all tumor samples to finish preprocessing, since no group size is provided + cram_variant_calling_tumor_grouped = cram_variant_calling_pair_to_cross.groupTuple() + + // 2. Join with normal samples, in each channel there is one key per patient now. Patients without matched normal end up with: [ patient1, [ meta1, meta2 ], [ cram1, crai1, cram2, crai2 ], null ] + cram_variant_calling_tumor_joined = cram_variant_calling_tumor_grouped.join(cram_variant_calling_normal_to_cross, failOnDuplicate: true, remainder: true) + + // 3. Filter out entries with last entry null + cram_variant_calling_tumor_filtered = cram_variant_calling_tumor_joined.filter{ it -> !(it.last()) } + + // 4. 
Transpose [ patient1, [ meta1, meta2 ], [ cram1, crai1, cram2, crai2 ] ] back to [ patient1, meta1, [ cram1, crai1 ], null ] [ patient1, meta2, [ cram2, crai2 ], null ] + // and remove patient ID field & null value for further processing [ meta1, [ cram1, crai1 ] ] [ meta2, [ cram2, crai2 ] ] + cram_variant_calling_tumor_only = cram_variant_calling_tumor_filtered.transpose().map{ it -> [it[1], it[2], it[3]] } + + if (params.only_paired_variant_calling) { + // Normal only samples + + // 1. Join with tumor samples, in each channel there is one key per patient now. Patients without matched tumor end up with: [ patient1, [ meta1 ], [ cram1, crai1 ], null ] as there is only one matched normal possible + cram_variant_calling_normal_joined = cram_variant_calling_normal_to_cross.join(cram_variant_calling_tumor_grouped, failOnDuplicate: true, remainder: true) + + // 2. Filter out entries with last entry null + cram_variant_calling_normal_filtered = cram_variant_calling_normal_joined.filter{ it -> !(it.last()) } + + // 3. Remove patient ID field & null value for further processing [ meta1, [ cram1, crai1 ] ] [ meta2, [ cram2, crai2 ] ] (no transposing needed since only one normal per patient ID) + cram_variant_calling_status_normal = cram_variant_calling_normal_filtered.map{ it -> [it[1], it[2], it[3]] } + + } else { + cram_variant_calling_status_normal = cram_variant_calling_status.normal + } + + // Tumor - normal pairs + // Use cross to combine normal with all tumor samples, i.e. multi tumor samples from recurrences + cram_variant_calling_pair = cram_variant_calling_normal_to_cross.cross(cram_variant_calling_pair_to_cross) + .map { normal, tumor -> + def meta = [:] + + meta.id = "${tumor[1].sample}_vs_${normal[1].sample}".toString() + meta.normal_id = normal[1].sample + meta.patient = normal[0] + meta.sex = normal[1].sex + meta.tumor_id = tumor[1].sample + + [ meta, normal[2], normal[3], tumor[2], tumor[3] ] + } + + // GERMLINE VARIANT CALLING + BAM_VARIANT_CALLING_GERMLINE_ALL( + params.tools, + params.skip_tools, + cram_variant_calling_status_normal, + [ [ id:'bwa' ], [] ], // bwa_index for tiddit; not used here + dbsnp, + dbsnp_tbi, + dbsnp_vqsr, + dict, + fasta, + fasta_fai, + intervals_and_num_intervals, + intervals_bed_combined, // [] if no_intervals, else interval_bed_combined.bed, + intervals_bed_gz_tbi_combined, // [] if no_intervals, else interval_bed_combined_gz, interval_bed_combined_gz_tbi + PREPARE_INTERVALS.out.intervals_bed_combined, // no_intervals.bed if no intervals, else interval_bed_combined.bed; Channel operations possible + intervals_bed_gz_tbi_and_num_intervals, + known_indels_vqsr, + known_sites_indels, + known_sites_indels_tbi, + known_sites_snps, + known_sites_snps_tbi, + known_snps_vqsr, + params.joint_germline, + params.skip_tools && params.skip_tools.split(',').contains('haplotypecaller_filter'), // true if filtering should be skipped + params.sentieon_haplotyper_emit_mode, + params.sentieon_dnascope_emit_mode, + params.sentieon_dnascope_pcr_indel_model, + sentieon_dnascope_model) + + // TUMOR ONLY VARIANT CALLING + BAM_VARIANT_CALLING_TUMOR_ONLY_ALL( + params.tools, + cram_variant_calling_tumor_only, + [ [ id:'bwa' ], [] ], // bwa_index for tiddit; not used here + cf_chrom_len, + chr_files, + cnvkit_reference, + dbsnp, + dbsnp_tbi, + dict, + fasta, + fasta_fai, + germline_resource, + germline_resource_tbi, + intervals_and_num_intervals, + intervals_bed_gz_tbi_and_num_intervals, + intervals_bed_combined, + intervals_bed_gz_tbi_combined, // [] if no_intervals, else 
interval_bed_combined_gz, interval_bed_combined_gz_tbi + mappability, + pon, + pon_tbi, + params.joint_mutect2, + params.wes + ) + + // PAIR VARIANT CALLING + BAM_VARIANT_CALLING_SOMATIC_ALL( + params.tools, + cram_variant_calling_pair, + [ [ id:'bwa' ], [] ], // bwa_index for tiddit; not used here + cf_chrom_len, + chr_files, + dbsnp, + dbsnp_tbi, + dict, + fasta, + fasta_fai, + germline_resource, + germline_resource_tbi, + intervals_and_num_intervals, + intervals_bed_gz_tbi_and_num_intervals, + intervals_bed_combined, + intervals_bed_gz_tbi_combined, // [] if no_intervals, else interval_bed_combined_gz, interval_bed_combined_gz_tbi + mappability, + msisensorpro_scan, + pon, + pon_tbi, + allele_files, + loci_files, + gc_file, + rt_file, + params.joint_mutect2, + params.wes + ) + + // POST VARIANTCALLING + POST_VARIANTCALLING(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_all, + params.concatenate_vcfs) + + // Gather vcf files for annotation and QC + vcf_to_annotate = Channel.empty() + vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_deepvariant) + vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_freebayes) + vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_haplotypecaller) + vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_manta) + vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_sentieon_dnascope) + vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_sentieon_haplotyper) + vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_strelka) + vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_tiddit) + vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_mpileup) + vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_TUMOR_ONLY_ALL.out.vcf_all) + vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_SOMATIC_ALL.out.vcf_all) + + // QC + VCF_QC_BCFTOOLS_VCFTOOLS(vcf_to_annotate, intervals_bed_combined) + + reports = reports.mix(VCF_QC_BCFTOOLS_VCFTOOLS.out.bcftools_stats.collect{ meta, stats -> stats }) + reports = reports.mix(VCF_QC_BCFTOOLS_VCFTOOLS.out.vcftools_tstv_counts.collect{ meta, counts -> counts }) + reports = reports.mix(VCF_QC_BCFTOOLS_VCFTOOLS.out.vcftools_tstv_qual.collect{ meta, qual -> qual }) + reports = reports.mix(VCF_QC_BCFTOOLS_VCFTOOLS.out.vcftools_filter_summary.collect{ meta, summary -> summary }) + + CHANNEL_VARIANT_CALLING_CREATE_CSV(vcf_to_annotate) + + // Gather used variant calling softwares versions + versions = versions.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.versions) + versions = versions.mix(BAM_VARIANT_CALLING_SOMATIC_ALL.out.versions) + versions = versions.mix(BAM_VARIANT_CALLING_TUMOR_ONLY_ALL.out.versions) + versions = versions.mix(POST_VARIANTCALLING.out.versions) + versions = versions.mix(VCF_QC_BCFTOOLS_VCFTOOLS.out.versions) + + // ANNOTATE + if (params.step == 'annotate') vcf_to_annotate = input_sample + + if (params.tools.split(',').contains('merge') || params.tools.split(',').contains('snpeff') || params.tools.split(',').contains('vep')|| params.tools.split(',').contains('bcfann')) { + + vep_fasta = (params.vep_include_fasta) ? fasta.map{ fasta -> [ [ id:fasta.baseName ], fasta ] } : [[id: 'null'], []] + + VCF_ANNOTATE_ALL( + vcf_to_annotate.map{meta, vcf -> [ meta + [ file_name: vcf.baseName ], vcf ] }, + vep_fasta, + params.tools, + params.snpeff_genome ? 
"${params.snpeff_genome}.${params.snpeff_db}" : "${params.genome}.${params.snpeff_db}", + snpeff_cache, + vep_genome, + vep_species, + vep_cache_version, + vep_cache, + vep_extra_files, + bcftools_annotations, + bcftools_annotations_tbi, + bcftools_header_lines) + + // Gather used softwares versions + versions = versions.mix(VCF_ANNOTATE_ALL.out.versions) + reports = reports.mix(VCF_ANNOTATE_ALL.out.reports) + } + } + + version_yaml = Channel.empty() + if (!(params.skip_tools && params.skip_tools.split(',').contains('versions'))) { + CUSTOM_DUMPSOFTWAREVERSIONS(versions.unique().collectFile(name: 'collated_versions.yml')) + version_yaml = CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect() + } + + if (!(params.skip_tools && params.skip_tools.split(',').contains('multiqc'))) { + workflow_summary = WorkflowSarek.paramsSummaryMultiqc(workflow, summary_params) + ch_workflow_summary = Channel.value(workflow_summary) + + methods_description = WorkflowSarek.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description, params) + ch_methods_description = Channel.value(methods_description) + + multiqc_files = Channel.empty() + multiqc_files = multiqc_files.mix(version_yaml) + multiqc_files = multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) + multiqc_files = multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml')) + multiqc_files = multiqc_files.mix(reports.collect().ifEmpty([])) + + MULTIQC(multiqc_files.collect(), ch_multiqc_config.collect().ifEmpty([]), ch_multiqc_custom_config.collect().ifEmpty([]), ch_multiqc_logo.collect().ifEmpty([])) + + multiqc_report = MULTIQC.out.report.toList() + versions = versions.mix(MULTIQC.out.versions) + } } /* @@ -122,11 +1127,10 @@ workflow.onComplete { } NfcoreTemplate.dump_parameters(workflow, params) NfcoreTemplate.summary(workflow, params, log) - if (params.hook_url) { - NfcoreTemplate.IM_notification(workflow, params, summary_params, projectDir, log) - } + if (params.hook_url) NfcoreTemplate.IM_notification(workflow, params, summary_params, projectDir, log) } + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ THE END