diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index b290e09..97c8c97 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -1,20 +1,20 @@ { "name": "nfcore", - "image": "nfcore/gitpod:latest", - "remoteUser": "gitpod", - "runArgs": ["--privileged"], + "image": "nfcore/devcontainer:latest", - // Configure tool-specific properties. - "customizations": { - // Configure properties specific to VS Code. - "vscode": { - // Set *default* container specific settings.json values on container create. - "settings": { - "python.defaultInterpreterPath": "/opt/conda/bin/python" - }, + "remoteUser": "root", + "privileged": true, - // Add the IDs of extensions you want installed when the container is created. - "extensions": ["ms-python.python", "ms-python.vscode-pylance", "nf-core.nf-core-extensionpack"] - } + "remoteEnv": { + // Workspace path on the host for mounting with docker-outside-of-docker + "LOCAL_WORKSPACE_FOLDER": "${localWorkspaceFolder}" + }, + + "onCreateCommand": "./.devcontainer/setup.sh", + + "hostRequirements": { + "cpus": 4, + "memory": "16gb", + "storage": "32gb" } } diff --git a/.devcontainer/setup.sh b/.devcontainer/setup.sh new file mode 100755 index 0000000..2ca6343 --- /dev/null +++ b/.devcontainer/setup.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +# Customise the terminal command prompt +echo "export PROMPT_DIRTRIM=2" >> $HOME/.bashrc +echo "export PS1='\[\e[3;36m\]\w ->\[\e[0m\\] '" >> $HOME/.bashrc +export PROMPT_DIRTRIM=2 +export PS1='\[\e[3;36m\]\w ->\[\e[0m\\] ' + +# Update Nextflow +nextflow self-update + +# Update welcome message +echo "Welcome to the nf-core/deepmutscan devcontainer!" > /usr/local/etc/vscode-dev-containers/first-run-notice.txt diff --git a/.editorconfig b/.editorconfig deleted file mode 100644 index 6d9b74c..0000000 --- a/.editorconfig +++ /dev/null @@ -1,37 +0,0 @@ -root = true - -[*] -charset = utf-8 -end_of_line = lf -insert_final_newline = true -trim_trailing_whitespace = true -indent_size = 4 -indent_style = space - -[*.{md,yml,yaml,html,css,scss,js}] -indent_size = 2 - -# These files are edited and tested upstream in nf-core/modules -[/modules/nf-core/**] -charset = unset -end_of_line = unset -insert_final_newline = unset -trim_trailing_whitespace = unset -indent_style = unset -[/subworkflows/nf-core/**] -charset = unset -end_of_line = unset -insert_final_newline = unset -trim_trailing_whitespace = unset -indent_style = unset - -[/assets/email*] -indent_size = unset - -# ignore python and markdown -[*.{py,md}] -indent_style = unset - -# ignore ro-crate metadata files -[**/ro-crate-metadata.json] -insert_final_newline = unset diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 3a5f6ae..dc13efb 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -1,23 +1,23 @@ -# `nf-core/dmscore`: Contributing Guidelines +# `nf-core/deepmutscan`: Contributing Guidelines Hi there! -Many thanks for taking an interest in improving nf-core/dmscore. +Many thanks for taking an interest in improving nf-core/deepmutscan. -We try to manage the required tasks for nf-core/dmscore using GitHub issues, you probably came to this page when creating one. +We try to manage the required tasks for nf-core/deepmutscan using GitHub issues, you probably came to this page when creating one. Please use the pre-filled template to save time. However, don't be put off by this template - other more general issues and suggestions are welcome! 
Contributions to the code are even more welcome ;) > [!NOTE] -> If you need help using or modifying nf-core/dmscore then the best place to ask is on the nf-core Slack [#dmscore](https://nfcore.slack.com/channels/dmscore) channel ([join our Slack here](https://nf-co.re/join/slack)). +> If you need help using or modifying nf-core/deepmutscan then the best place to ask is on the nf-core Slack [#deepmutscan](https://nfcore.slack.com/channels/deepmutscan) channel ([join our Slack here](https://nf-co.re/join/slack)). ## Contribution workflow -If you'd like to write some code for nf-core/dmscore, the standard workflow is as follows: +If you'd like to write some code for nf-core/deepmutscan, the standard workflow is as follows: -1. Check that there isn't already an issue about your idea in the [nf-core/dmscore issues](https://github.com/nf-core/dmscore/issues) to avoid duplicating work. If there isn't one already, please create one so that others know you're working on this -2. [Fork](https://help.github.com/en/github/getting-started-with-github/fork-a-repo) the [nf-core/dmscore repository](https://github.com/nf-core/dmscore) to your GitHub account +1. Check that there isn't already an issue about your idea in the [nf-core/deepmutscan issues](https://github.com/nf-core/deepmutscan/issues) to avoid duplicating work. If there isn't one already, please create one so that others know you're working on this +2. [Fork](https://help.github.com/en/github/getting-started-with-github/fork-a-repo) the [nf-core/deepmutscan repository](https://github.com/nf-core/deepmutscan) to your GitHub account 3. Make the necessary changes / additions within your forked repository following [Pipeline conventions](#pipeline-contribution-conventions) 4. Use `nf-core pipelines schema build` and add any new parameters to the pipeline JSON schema (requires [nf-core tools](https://github.com/nf-core/tools) >= 1.10). 5. Submit a Pull Request against the `dev` branch and wait for the code to be reviewed and merged @@ -61,11 +61,11 @@ These tests are run both with the latest available version of `Nextflow` and als ## Getting help -For further information/help, please consult the [nf-core/dmscore documentation](https://nf-co.re/dmscore/usage) and don't hesitate to get in touch on the nf-core Slack [#dmscore](https://nfcore.slack.com/channels/dmscore) channel ([join our Slack here](https://nf-co.re/join/slack)). +For further information/help, please consult the [nf-core/deepmutscan documentation](https://nf-co.re/deepmutscan/usage) and don't hesitate to get in touch on the nf-core Slack [#deepmutscan](https://nfcore.slack.com/channels/deepmutscan) channel ([join our Slack here](https://nf-co.re/join/slack)). ## Pipeline contribution conventions -To make the `nf-core/dmscore` code and processing logic more understandable for new contributors and to ensure quality, we semi-standardise the way the code and other contributions are written. +To make the `nf-core/deepmutscan` code and processing logic more understandable for new contributors and to ensure quality, we semi-standardise the way the code and other contributions are written. ### Adding a new step @@ -78,7 +78,7 @@ If you wish to contribute a new step, please use the following coding standards: 5. Add any new parameters to `nextflow_schema.json` with help text (via the `nf-core pipelines schema build` tool). 6. Add sanity checks and validation for all relevant parameters. 7. Perform local tests to validate that the new code works as expected. -8. 
If applicable, add a new test command in `.github/workflow/ci.yml`. +8. If applicable, add a new test in the `tests` directory. 9. Update MultiQC config `assets/multiqc_config.yml` so relevant suffixes, file name clean up and module plots are in the appropriate order. If applicable, add a [MultiQC](https://https://multiqc.info/) module. 10. Add a description of the output files and if relevant any appropriate images from the MultiQC report to `docs/output.md`. @@ -115,7 +115,7 @@ This repo includes a devcontainer configuration which will create a GitHub Codes To get started: -- Open the repo in [Codespaces](https://github.com/nf-core/dmscore/codespaces) +- Open the repo in [Codespaces](https://github.com/nf-core/deepmutscan/codespaces) - Tools installed - nf-core - Nextflow diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index 58a5523..78f6876 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -8,7 +8,7 @@ body: Before you post this issue, please check the documentation: - [nf-core website: troubleshooting](https://nf-co.re/usage/troubleshooting) - - [nf-core/dmscore pipeline documentation](https://nf-co.re/dmscore/usage) + - [nf-core/deepmutscan pipeline documentation](https://nf-co.re/deepmutscan/usage) - type: textarea id: description attributes: @@ -46,4 +46,4 @@ body: * Executor _(eg. slurm, local, awsbatch)_ * Container engine: _(e.g. Docker, Singularity, Conda, Podman, Shifter, Charliecloud, or Apptainer)_ * OS _(eg. CentOS Linux, macOS, Linux Mint)_ - * Version of nf-core/dmscore _(eg. 1.1, 1.5, 1.8.2)_ + * Version of nf-core/deepmutscan _(eg. 1.1, 1.5, 1.8.2)_ diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index 08f0c49..572e0cf 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -2,6 +2,6 @@ contact_links: - name: Join nf-core url: https://nf-co.re/join about: Please join the nf-core community here - - name: "Slack #dmscore channel" - url: https://nfcore.slack.com/channels/dmscore - about: Discussion about the nf-core/dmscore pipeline + - name: "Slack #deepmutscan channel" + url: https://nfcore.slack.com/channels/deepmutscan + about: Discussion about the nf-core/deepmutscan pipeline diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml index a08dd5f..0731019 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.yml +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -1,5 +1,5 @@ name: Feature request -description: Suggest an idea for the nf-core/dmscore pipeline +description: Suggest an idea for the nf-core/deepmutscan pipeline labels: enhancement body: - type: textarea diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 0d6f133..ee9e558 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,22 +1,22 @@ ## PR checklist - [ ] This comment contains a description of changes (with reason). - [ ] If you've fixed a bug or added code that should be tested, add tests! -- [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](https://github.com/nf-core/dmscore/tree/master/.github/CONTRIBUTING.md) -- [ ] If necessary, also make a PR on the nf-core/dmscore _branch_ on the [nf-core/test-datasets](https://github.com/nf-core/test-datasets) repository. 
+- [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](https://github.com/nf-core/deepmutscan/tree/master/.github/CONTRIBUTING.md) +- [ ] If necessary, also make a PR on the nf-core/deepmutscan _branch_ on the [nf-core/test-datasets](https://github.com/nf-core/test-datasets) repository. - [ ] Make sure your code lints (`nf-core pipelines lint`). - [ ] Ensure the test suite passes (`nextflow run . -profile test,docker --outdir `). - [ ] Check for unexpected warnings in debug mode (`nextflow run . -profile debug,test,docker --outdir `). diff --git a/.github/actions/get-shards/action.yml b/.github/actions/get-shards/action.yml new file mode 100644 index 0000000..3408527 --- /dev/null +++ b/.github/actions/get-shards/action.yml @@ -0,0 +1,69 @@ +name: "Get number of shards" +description: "Get the number of nf-test shards for the current CI job" +inputs: + max_shards: + description: "Maximum number of shards allowed" + required: true + paths: + description: "Component paths to test" + required: false + tags: + description: "Tags to pass as argument for nf-test --tag parameter" + required: false +outputs: + shard: + description: "Array of shard numbers" + value: ${{ steps.shards.outputs.shard }} + total_shards: + description: "Total number of shards" + value: ${{ steps.shards.outputs.total_shards }} +runs: + using: "composite" + steps: + - name: Install nf-test + uses: nf-core/setup-nf-test@v1 + with: + version: ${{ env.NFT_VER }} + - name: Get number of shards + id: shards + shell: bash + run: | + # Run nf-test with dynamic parameter + nftest_output=$(nf-test test \ + --profile +docker \ + $(if [ -n "${{ inputs.tags }}" ]; then echo "--tag ${{ inputs.tags }}"; fi) \ + --dry-run \ + --ci \ + --changed-since HEAD^) || { + echo "nf-test command failed with exit code $?" + echo "Full output: $nftest_output" + exit 1 + } + echo "nf-test dry-run output: $nftest_output" + + # Default values for shard and total_shards + shard="[]" + total_shards=0 + + # Check if there are related tests + if echo "$nftest_output" | grep -q 'No tests to execute'; then + echo "No related tests found." + else + # Extract the number of related tests + number_of_shards=$(echo "$nftest_output" | sed -n 's|.*Executed \([0-9]*\) tests.*|\1|p') + if [[ -n "$number_of_shards" && "$number_of_shards" -gt 0 ]]; then + shards_to_run=$(( $number_of_shards < ${{ inputs.max_shards }} ? $number_of_shards : ${{ inputs.max_shards }} )) + shard=$(seq 1 "$shards_to_run" | jq -R . | jq -c -s .) + total_shards="$shards_to_run" + else + echo "Unexpected output format. Falling back to default values." 
+ fi + fi + + # Write to GitHub Actions outputs + echo "shard=$shard" >> $GITHUB_OUTPUT + echo "total_shards=$total_shards" >> $GITHUB_OUTPUT + + # Debugging output + echo "Final shard array: $shard" + echo "Total number of shards: $total_shards" diff --git a/.github/actions/nf-test/action.yml b/.github/actions/nf-test/action.yml new file mode 100644 index 0000000..3b9724c --- /dev/null +++ b/.github/actions/nf-test/action.yml @@ -0,0 +1,111 @@ +name: "nf-test Action" +description: "Runs nf-test with common setup steps" +inputs: + profile: + description: "Profile to use" + required: true + shard: + description: "Shard number for this CI job" + required: true + total_shards: + description: "Total number of test shards(NOT the total number of matrix jobs)" + required: true + paths: + description: "Test paths" + required: true + tags: + description: "Tags to pass as argument for nf-test --tag parameter" + required: false +runs: + using: "composite" + steps: + - name: Setup Nextflow + uses: nf-core/setup-nextflow@v2 + with: + version: "${{ env.NXF_VERSION }}" + + - name: Set up Python + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6 + with: + python-version: "3.14" + + - name: Install nf-test + uses: nf-core/setup-nf-test@v1 + with: + version: "${{ env.NFT_VER }}" + install-pdiff: true + + - name: Setup apptainer + if: contains(inputs.profile, 'singularity') + uses: eWaterCycle/setup-apptainer@main + + - name: Set up Singularity + if: contains(inputs.profile, 'singularity') + shell: bash + run: | + mkdir -p $NXF_SINGULARITY_CACHEDIR + mkdir -p $NXF_SINGULARITY_LIBRARYDIR + + - name: Conda setup + if: contains(inputs.profile, 'conda') + uses: conda-incubator/setup-miniconda@505e6394dae86d6a5c7fbb6e3fb8938e3e863830 # v3 + with: + auto-update-conda: true + conda-solver: libmamba + channels: conda-forge + channel-priority: strict + conda-remove-defaults: true + + - name: Run nf-test + shell: bash + env: + NFT_WORKDIR: ${{ env.NFT_WORKDIR }} + run: | + nf-test test \ + --profile=+${{ inputs.profile }} \ + $(if [ -n "${{ inputs.tags }}" ]; then echo "--tag ${{ inputs.tags }}"; fi) \ + --ci \ + --changed-since HEAD^ \ + --verbose \ + --tap=test.tap \ + --shard ${{ inputs.shard }}/${{ inputs.total_shards }} + + # Save the absolute path of the test.tap file to the output + echo "tap_file_path=$(realpath test.tap)" >> $GITHUB_OUTPUT + + - name: Generate test summary + if: always() + shell: bash + run: | + # Add header if it doesn't exist (using a token file to track this) + if [ ! 
-f ".summary_header" ]; then + echo "# 🚀 nf-test results" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "| Status | Test Name | Profile | Shard |" >> $GITHUB_STEP_SUMMARY + echo "|:------:|-----------|---------|-------|" >> $GITHUB_STEP_SUMMARY + touch .summary_header + fi + + if [ -f test.tap ]; then + while IFS= read -r line; do + if [[ $line =~ ^ok ]]; then + test_name="${line#ok }" + # Remove the test number from the beginning + test_name="${test_name#* }" + echo "| ✅ | ${test_name} | ${{ inputs.profile }} | ${{ inputs.shard }}/${{ inputs.total_shards }} |" >> $GITHUB_STEP_SUMMARY + elif [[ $line =~ ^not\ ok ]]; then + test_name="${line#not ok }" + # Remove the test number from the beginning + test_name="${test_name#* }" + echo "| ❌ | ${test_name} | ${{ inputs.profile }} | ${{ inputs.shard }}/${{ inputs.total_shards }} |" >> $GITHUB_STEP_SUMMARY + fi + done < test.tap + else + echo "| ⚠️ | No test results found | ${{ inputs.profile }} | ${{ inputs.shard }}/${{ inputs.total_shards }} |" >> $GITHUB_STEP_SUMMARY + fi + + - name: Clean up + if: always() + shell: bash + run: | + sudo rm -rf /home/ubuntu/tests/ diff --git a/.github/workflows/awsfulltest.yml b/.github/workflows/awsfulltest.yml index 50287a8..e7c08d6 100644 --- a/.github/workflows/awsfulltest.yml +++ b/.github/workflows/awsfulltest.yml @@ -4,44 +4,23 @@ name: nf-core AWS full size tests # It runs the -profile 'test_full' on AWS batch on: - pull_request: - branches: - - main - - master workflow_dispatch: pull_request_review: types: [submitted] + release: + types: [published] jobs: run-platform: name: Run AWS full tests - # run only if the PR is approved by at least 2 reviewers and against the master branch or manually triggered - if: github.repository == 'nf-core/dmscore' && github.event.review.state == 'approved' && github.event.pull_request.base.ref == 'master' || github.event_name == 'workflow_dispatch' + # run only if the PR is approved by at least 2 reviewers and against the master/main branch or manually triggered + if: github.repository == 'nf-core/deepmutscan' && github.event.review.state == 'approved' && (github.event.pull_request.base.ref == 'master' || github.event.pull_request.base.ref == 'main') || github.event_name == 'workflow_dispatch' || github.event_name == 'release' runs-on: ubuntu-latest steps: - - name: Get PR reviews - uses: octokit/request-action@v2.x - if: github.event_name != 'workflow_dispatch' - id: check_approvals - continue-on-error: true - with: - route: GET /repos/${{ github.repository }}/pulls/${{ github.event.pull_request.number }}/reviews?per_page=100 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - - name: Check for approvals - if: ${{ failure() && github.event_name != 'workflow_dispatch' }} - run: | - echo "No review approvals found. At least 2 approvals are required to run this action automatically." 
- exit 1 - - - name: Check for enough approvals (>=2) - id: test_variables - if: github.event_name != 'workflow_dispatch' + - name: Set revision variable + id: revision run: | - JSON_RESPONSE='${{ steps.check_approvals.outputs.data }}' - CURRENT_APPROVALS_COUNT=$(echo $JSON_RESPONSE | jq -c '[.[] | select(.state | contains("APPROVED")) ] | length') - test $CURRENT_APPROVALS_COUNT -ge 2 || exit 1 # At least 2 approvals are required + echo "revision=${{ (github.event_name == 'workflow_dispatch' || github.event_name == 'release') && github.sha || 'dev' }}" >> "$GITHUB_OUTPUT" - name: Launch workflow via Seqera Platform uses: seqeralabs/action-tower-launch@v2 @@ -49,21 +28,21 @@ jobs: # Add full size test data (but still relatively small datasets for few samples) # on the `test_full.config` test runs with only one set of parameters with: - workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} + workspace_id: ${{ vars.TOWER_WORKSPACE_ID }} access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} - compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} - revision: ${{ github.sha }} - workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/dmscore/work-${{ github.sha }} + compute_env: ${{ vars.TOWER_COMPUTE_ENV }} + revision: ${{ steps.revision.outputs.revision }} + workdir: s3://${{ vars.AWS_S3_BUCKET }}/work/deepmutscan/work-${{ steps.revision.outputs.revision }} parameters: | { "hook_url": "${{ secrets.MEGATESTS_ALERTS_SLACK_HOOK_URL }}", - "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/dmscore/results-${{ github.sha }}" + "outdir": "s3://${{ vars.AWS_S3_BUCKET }}/deepmutscan/results-${{ steps.revision.outputs.revision }}" } profiles: test_full - - uses: actions/upload-artifact@v4 + - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 with: name: Seqera Platform debug log file path: | - seqera_platform_action_*.log - seqera_platform_action_*.json + tower_action_*.log + tower_action_*.json diff --git a/.github/workflows/awstest.yml b/.github/workflows/awstest.yml index 4a4b5db..e82b4c0 100644 --- a/.github/workflows/awstest.yml +++ b/.github/workflows/awstest.yml @@ -7,27 +7,27 @@ on: jobs: run-platform: name: Run AWS tests - if: github.repository == 'nf-core/dmscore' + if: github.repository == 'nf-core/deepmutscan' runs-on: ubuntu-latest steps: # Launch workflow using Seqera Platform CLI tool action - name: Launch workflow via Seqera Platform uses: seqeralabs/action-tower-launch@v2 with: - workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} + workspace_id: ${{ vars.TOWER_WORKSPACE_ID }} access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} - compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} + compute_env: ${{ vars.TOWER_COMPUTE_ENV }} revision: ${{ github.sha }} - workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/dmscore/work-${{ github.sha }} + workdir: s3://${{ vars.AWS_S3_BUCKET }}/work/deepmutscan/work-${{ github.sha }} parameters: | { - "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/dmscore/results-test-${{ github.sha }}" + "outdir": "s3://${{ vars.AWS_S3_BUCKET }}/deepmutscan/results-test-${{ github.sha }}" } profiles: test - - uses: actions/upload-artifact@v4 + - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 with: name: Seqera Platform debug log file path: | - seqera_platform_action_*.log - seqera_platform_action_*.json + tower_action_*.log + tower_action_*.json diff --git a/.github/workflows/branch.yml b/.github/workflows/branch.yml index 92e3db0..d13cf80 100644 --- a/.github/workflows/branch.yml +++ b/.github/workflows/branch.yml @@ -13,9 +13,9 @@ jobs: steps: # PRs to the nf-core 
repo main/master branch are only ok if coming from the nf-core repo `dev` or any `patch` branches - name: Check PRs - if: github.repository == 'nf-core/dmscore' + if: github.repository == 'nf-core/deepmutscan' run: | - { [[ ${{github.event.pull_request.head.repo.full_name }} == nf-core/dmscore ]] && [[ $GITHUB_HEAD_REF == "dev" ]]; } || [[ $GITHUB_HEAD_REF == "patch" ]] + { [[ ${{github.event.pull_request.head.repo.full_name }} == nf-core/deepmutscan ]] && [[ $GITHUB_HEAD_REF == "dev" ]]; } || [[ $GITHUB_HEAD_REF == "patch" ]] # If the above check failed, post a comment on the PR explaining the failure # NOTE - this doesn't currently work if the PR is coming from a fork, due to limitations in GitHub actions secrets diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml deleted file mode 100644 index 92396ce..0000000 --- a/.github/workflows/ci.yml +++ /dev/null @@ -1,87 +0,0 @@ -name: nf-core CI -# This workflow runs the pipeline with the minimal test dataset to check that it completes without any syntax errors -on: - push: - branches: - - dev - pull_request: - release: - types: [published] - workflow_dispatch: - -env: - NXF_ANSI_LOG: false - NXF_SINGULARITY_CACHEDIR: ${{ github.workspace }}/.singularity - NXF_SINGULARITY_LIBRARYDIR: ${{ github.workspace }}/.singularity - -concurrency: - group: "${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}" - cancel-in-progress: true - -jobs: - test: - name: "Run pipeline with test data (${{ matrix.NXF_VER }} | ${{ matrix.test_name }} | ${{ matrix.profile }})" - # Only run on push if this is the nf-core dev branch (merged PRs) - if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/dmscore') }}" - runs-on: ubuntu-latest - strategy: - matrix: - NXF_VER: - - "24.04.2" - - "latest-everything" - profile: - - "conda" - - "docker" - - "singularity" - test_name: - - "test" - isMaster: - - ${{ github.base_ref == 'master' }} - # Exclude conda and singularity on dev - exclude: - - isMaster: false - profile: "conda" - - isMaster: false - profile: "singularity" - steps: - - name: Check out pipeline code - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 - with: - fetch-depth: 0 - - - name: Set up Nextflow - uses: nf-core/setup-nextflow@v2 - with: - version: "${{ matrix.NXF_VER }}" - - - name: Set up Apptainer - if: matrix.profile == 'singularity' - uses: eWaterCycle/setup-apptainer@main - - - name: Set up Singularity - if: matrix.profile == 'singularity' - run: | - mkdir -p $NXF_SINGULARITY_CACHEDIR - mkdir -p $NXF_SINGULARITY_LIBRARYDIR - - - name: Set up Miniconda - if: matrix.profile == 'conda' - uses: conda-incubator/setup-miniconda@a4260408e20b96e80095f42ff7f1a15b27dd94ca # v3 - with: - miniconda-version: "latest" - auto-update-conda: true - conda-solver: libmamba - channels: conda-forge,bioconda - - - name: Set up Conda - if: matrix.profile == 'conda' - run: | - echo $(realpath $CONDA)/condabin >> $GITHUB_PATH - echo $(realpath python) >> $GITHUB_PATH - - - name: Clean up Disk space - uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1 - - - name: "Run pipeline with test data ${{ matrix.NXF_VER }} | ${{ matrix.test_name }} | ${{ matrix.profile }}" - run: | - nextflow run ${GITHUB_WORKSPACE} -profile ${{ matrix.test_name }},${{ matrix.profile }} --outdir ./results diff --git a/.github/workflows/clean-up.yml b/.github/workflows/clean-up.yml index 0b6b1f2..6adb0ff 100644 --- a/.github/workflows/clean-up.yml +++ 
b/.github/workflows/clean-up.yml @@ -10,7 +10,7 @@ jobs: issues: write pull-requests: write steps: - - uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9 + - uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10 with: stale-issue-message: "This issue has been tagged as awaiting-changes or awaiting-feedback by an nf-core contributor. Remove stale label or add a comment otherwise this issue will be closed in 20 days." stale-pr-message: "This PR has been tagged as awaiting-changes or awaiting-feedback by an nf-core contributor. Remove stale label or add a comment if it is still useful." diff --git a/.github/workflows/download_pipeline.yml b/.github/workflows/download_pipeline.yml index ab06316..6d94bcb 100644 --- a/.github/workflows/download_pipeline.yml +++ b/.github/workflows/download_pipeline.yml @@ -12,14 +12,6 @@ on: required: true default: "dev" pull_request: - types: - - opened - - edited - - synchronize - branches: - - main - - master - pull_request_target: branches: - main - master @@ -52,9 +44,9 @@ jobs: - name: Disk space cleanup uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1 - - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5 + - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6 with: - python-version: "3.12" + python-version: "3.14" architecture: "x64" - name: Setup Apptainer @@ -65,7 +57,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install git+https://github.com/nf-core/tools.git@dev + pip install git+https://github.com/nf-core/tools.git - name: Make a cache directory for the container images run: | @@ -120,6 +112,7 @@ jobs: echo "IMAGE_COUNT_AFTER=$image_count" >> "$GITHUB_OUTPUT" - name: Compare container image counts + id: count_comparison run: | if [ "${{ steps.count_initial.outputs.IMAGE_COUNT_INITIAL }}" -ne "${{ steps.count_afterwards.outputs.IMAGE_COUNT_AFTER }}" ]; then initial_count=${{ steps.count_initial.outputs.IMAGE_COUNT_INITIAL }} @@ -132,3 +125,10 @@ jobs: else echo "The pipeline can be downloaded successfully!" 
fi + + - name: Upload Nextflow logfile for debugging purposes + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 + with: + name: nextflow_logfile.txt + path: .nextflow.log* + include-hidden-files: true diff --git a/.github/workflows/fix-linting.yml b/.github/workflows/fix_linting.yml similarity index 75% rename from .github/workflows/fix-linting.yml rename to .github/workflows/fix_linting.yml index addd34e..65b1dc7 100644 --- a/.github/workflows/fix-linting.yml +++ b/.github/workflows/fix_linting.yml @@ -9,17 +9,17 @@ jobs: if: > contains(github.event.comment.html_url, '/pull/') && contains(github.event.comment.body, '@nf-core-bot fix linting') && - github.repository == 'nf-core/dmscore' + github.repository == 'nf-core/deepmutscan' runs-on: ubuntu-latest steps: # Use the @nf-core-bot token to check out so we can push later - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 + - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5 with: token: ${{ secrets.nf_core_bot_auth_token }} # indication that the linting is being fixed - name: React on comment - uses: peter-evans/create-or-update-comment@71345be0265236311c031f5c7866368bd1eff043 # v4 + uses: peter-evans/create-or-update-comment@e8674b075228eee787fea43ef493e45ece1004c9 # v5 with: comment-id: ${{ github.event.comment.id }} reactions: eyes @@ -32,9 +32,9 @@ jobs: GITHUB_TOKEN: ${{ secrets.nf_core_bot_auth_token }} # Install and run pre-commit - - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5 + - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6 with: - python-version: "3.12" + python-version: "3.14" - name: Install pre-commit run: pip install pre-commit @@ -47,7 +47,7 @@ jobs: # indication that the linting has finished - name: react if linting finished succesfully if: steps.pre-commit.outcome == 'success' - uses: peter-evans/create-or-update-comment@71345be0265236311c031f5c7866368bd1eff043 # v4 + uses: peter-evans/create-or-update-comment@e8674b075228eee787fea43ef493e45ece1004c9 # v5 with: comment-id: ${{ github.event.comment.id }} reactions: "+1" @@ -67,23 +67,23 @@ jobs: - name: react if linting errors were fixed id: react-if-fixed if: steps.commit-and-push.outcome == 'success' - uses: peter-evans/create-or-update-comment@71345be0265236311c031f5c7866368bd1eff043 # v4 + uses: peter-evans/create-or-update-comment@e8674b075228eee787fea43ef493e45ece1004c9 # v5 with: comment-id: ${{ github.event.comment.id }} reactions: hooray - name: react if linting errors were not fixed if: steps.commit-and-push.outcome == 'failure' - uses: peter-evans/create-or-update-comment@71345be0265236311c031f5c7866368bd1eff043 # v4 + uses: peter-evans/create-or-update-comment@e8674b075228eee787fea43ef493e45ece1004c9 # v5 with: comment-id: ${{ github.event.comment.id }} reactions: confused - name: react if linting errors were not fixed if: steps.commit-and-push.outcome == 'failure' - uses: peter-evans/create-or-update-comment@71345be0265236311c031f5c7866368bd1eff043 # v4 + uses: peter-evans/create-or-update-comment@e8674b075228eee787fea43ef493e45ece1004c9 # v5 with: issue-number: ${{ github.event.issue.number }} body: | @${{ github.actor }} I tried to fix the linting errors, but it didn't work. Please fix them manually. - See [CI log](https://github.com/nf-core/dmscore/actions/runs/${{ github.run_id }}) for more details. + See [CI log](https://github.com/nf-core/deepmutscan/actions/runs/${{ github.run_id }}) for more details. 
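
The fix-linting workflow above hands all formatting off to pre-commit. As a rough local equivalent (an assumption, not part of this patch), contributors can run the same hooks before pushing:

```bash
# Approximate local counterpart of the "@nf-core-bot fix linting" job:
# install pre-commit, then apply every configured hook to the whole repository.
pip install pre-commit
pre-commit run --all-files
```
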
diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index dbd52d5..30e6602 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -3,9 +3,6 @@ name: nf-core linting # It runs the `nf-core pipelines lint` and markdown lint tests to ensure # that the code meets the nf-core guidelines. on: - push: - branches: - - dev pull_request: release: types: [published] @@ -14,12 +11,12 @@ jobs: pre-commit: runs-on: ubuntu-latest steps: - - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 + - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5 - - name: Set up Python 3.12 - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5 + - name: Set up Python 3.14 + uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6 with: - python-version: "3.12" + python-version: "3.14" - name: Install pre-commit run: pip install pre-commit @@ -31,18 +28,18 @@ jobs: runs-on: ubuntu-latest steps: - name: Check out pipeline code - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5 - name: Install Nextflow uses: nf-core/setup-nextflow@v2 - - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5 + - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6 with: - python-version: "3.12" + python-version: "3.14" architecture: "x64" - name: read .nf-core.yml - uses: pietrobolcato/action-read-yaml@1.1.0 + uses: pietrobolcato/action-read-yaml@9f13718d61111b69f30ab4ac683e67a56d254e1d # 1.1.0 id: read_yml with: config: ${{ github.workspace }}/.nf-core.yml @@ -74,7 +71,7 @@ jobs: - name: Upload linting log file artifact if: ${{ always() }} - uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 with: name: linting-logs path: | diff --git a/.github/workflows/linting_comment.yml b/.github/workflows/linting_comment.yml index 0bed96d..e6e9bc2 100644 --- a/.github/workflows/linting_comment.yml +++ b/.github/workflows/linting_comment.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Download lint results - uses: dawidd6/action-download-artifact@80620a5d27ce0ae443b965134db88467fc607b43 # v7 + uses: dawidd6/action-download-artifact@ac66b43f0e6a346234dd65d4d0c8fbb31cb316e5 # v11 with: workflow: linting.yml workflow_conclusion: completed @@ -21,7 +21,7 @@ jobs: run: echo "pr_number=$(cat linting-logs/PR_number.txt)" >> $GITHUB_OUTPUT - name: Post PR comment - uses: marocchino/sticky-pull-request-comment@331f8f5b4215f0445d3c07b4967662a32a2d3e31 # v2 + uses: marocchino/sticky-pull-request-comment@773744901bac0e8cbb5a0dc842800d45e9b2b405 # v2 with: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} number: ${{ steps.pr_number.outputs.pr_number }} diff --git a/.github/workflows/nf-test.yml b/.github/workflows/nf-test.yml new file mode 100644 index 0000000..e20bf6d --- /dev/null +++ b/.github/workflows/nf-test.yml @@ -0,0 +1,144 @@ +name: Run nf-test +on: + pull_request: + paths-ignore: + - "docs/**" + - "**/meta.yml" + - "**/*.md" + - "**/*.png" + - "**/*.svg" + release: + types: [published] + workflow_dispatch: + +# Cancel if a newer run is started +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + NFT_VER: "0.9.3" + NFT_WORKDIR: "~" + NXF_ANSI_LOG: false + 
NXF_SINGULARITY_CACHEDIR: ${{ github.workspace }}/.singularity + NXF_SINGULARITY_LIBRARYDIR: ${{ github.workspace }}/.singularity + +jobs: + nf-test-changes: + name: nf-test-changes + runs-on: # use self-hosted runners + - runs-on=${{ github.run_id }}-nf-test-changes + - runner=4cpu-linux-x64 + outputs: + shard: ${{ steps.set-shards.outputs.shard }} + total_shards: ${{ steps.set-shards.outputs.total_shards }} + steps: + - name: Clean Workspace # Purge the workspace in case it's running on a self-hosted runner + run: | + ls -la ./ + rm -rf ./* || true + rm -rf ./.??* || true + ls -la ./ + - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5 + with: + fetch-depth: 0 + + - name: get number of shards + id: set-shards + uses: ./.github/actions/get-shards + env: + NFT_VER: ${{ env.NFT_VER }} + with: + max_shards: 7 + + - name: debug + run: | + echo ${{ steps.set-shards.outputs.shard }} + echo ${{ steps.set-shards.outputs.total_shards }} + + nf-test: + name: "${{ matrix.profile }} | ${{ matrix.NXF_VER }} | ${{ matrix.shard }}/${{ needs.nf-test-changes.outputs.total_shards }}" + needs: [nf-test-changes] + if: ${{ needs.nf-test-changes.outputs.total_shards != '0' }} + runs-on: # use self-hosted runners + - runs-on=${{ github.run_id }}-nf-test + - runner=4cpu-linux-x64 + strategy: + fail-fast: false + matrix: + shard: ${{ fromJson(needs.nf-test-changes.outputs.shard) }} + profile: [conda, docker, singularity] + isMain: + - ${{ github.base_ref == 'master' || github.base_ref == 'main' }} + # Exclude conda and singularity on dev + exclude: + - isMain: false + profile: "conda" + - isMain: false + profile: "singularity" + NXF_VER: + - "25.04.0" + - "latest-everything" + env: + NXF_ANSI_LOG: false + TOTAL_SHARDS: ${{ needs.nf-test-changes.outputs.total_shards }} + + steps: + - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5 + with: + fetch-depth: 0 + + - name: Run nf-test + id: run_nf_test + uses: ./.github/actions/nf-test + continue-on-error: ${{ matrix.NXF_VER == 'latest-everything' }} + env: + NFT_WORKDIR: ${{ env.NFT_WORKDIR }} + NXF_VERSION: ${{ matrix.NXF_VER }} + with: + profile: ${{ matrix.profile }} + shard: ${{ matrix.shard }} + total_shards: ${{ env.TOTAL_SHARDS }} + + - name: Report test status + if: ${{ always() }} + run: | + if [[ "${{ steps.run_nf_test.outcome }}" == "failure" ]]; then + echo "::error::Test with ${{ matrix.NXF_VER }} failed" + # Add to workflow summary + echo "## ❌ Test failed: ${{ matrix.profile }} | ${{ matrix.NXF_VER }} | Shard ${{ matrix.shard }}/${{ env.TOTAL_SHARDS }}" >> $GITHUB_STEP_SUMMARY + if [[ "${{ matrix.NXF_VER }}" == "latest-everything" ]]; then + echo "::warning::Test with latest-everything failed but will not cause workflow failure. Please check if the error is expected or if it needs fixing." 
+ fi + if [[ "${{ matrix.NXF_VER }}" != "latest-everything" ]]; then + exit 1 + fi + fi + + confirm-pass: + needs: [nf-test] + if: always() + runs-on: # use self-hosted runners + - runs-on=${{ github.run_id }}-confirm-pass + - runner=2cpu-linux-x64 + steps: + - name: One or more tests failed (excluding latest-everything) + if: ${{ contains(needs.*.result, 'failure') }} + run: exit 1 + + - name: One or more tests cancelled + if: ${{ contains(needs.*.result, 'cancelled') }} + run: exit 1 + + - name: All tests ok + if: ${{ contains(needs.*.result, 'success') }} + run: exit 0 + + - name: debug-print + if: always() + run: | + echo "::group::DEBUG: `needs` Contents" + echo "DEBUG: toJSON(needs) = ${{ toJSON(needs) }}" + echo "DEBUG: toJSON(needs.*.result) = ${{ toJSON(needs.*.result) }}" + echo "::endgroup::" diff --git a/.github/workflows/release-announcements.yml b/.github/workflows/release-announcements.yml index 450b1d5..e64cebd 100644 --- a/.github/workflows/release-announcements.yml +++ b/.github/workflows/release-announcements.yml @@ -14,6 +14,11 @@ jobs: run: | echo "topics=$(curl -s https://nf-co.re/pipelines.json | jq -r '.remote_workflows[] | select(.full_name == "${{ github.repository }}") | .topics[]' | awk '{print "#"$0}' | tr '\n' ' ')" | sed 's/-//g' >> $GITHUB_OUTPUT + - name: get description + id: get_topics + run: | + echo "description=$(curl -s https://nf-co.re/pipelines.json | jq -r '.remote_workflows[] | select(.full_name == "${{ github.repository }}") | .description' >> $GITHUB_OUTPUT + - uses: rzr/fediverse-action@master with: access-token: ${{ secrets.MASTODON_ACCESS_TOKEN }} @@ -23,47 +28,16 @@ jobs: message: | Pipeline release! ${{ github.repository }} v${{ github.event.release.tag_name }} - ${{ github.event.release.name }}! + ${{ steps.get_topics.outputs.description }} + Please see the changelog: ${{ github.event.release.html_url }} ${{ steps.get_topics.outputs.topics }} #nfcore #openscience #nextflow #bioinformatics - send-tweet: - runs-on: ubuntu-latest - - steps: - - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5 - with: - python-version: "3.10" - - name: Install dependencies - run: pip install tweepy==4.14.0 - - name: Send tweet - shell: python - run: | - import os - import tweepy - - client = tweepy.Client( - access_token=os.getenv("TWITTER_ACCESS_TOKEN"), - access_token_secret=os.getenv("TWITTER_ACCESS_TOKEN_SECRET"), - consumer_key=os.getenv("TWITTER_CONSUMER_KEY"), - consumer_secret=os.getenv("TWITTER_CONSUMER_SECRET"), - ) - tweet = os.getenv("TWEET") - client.create_tweet(text=tweet) - env: - TWEET: | - Pipeline release! ${{ github.repository }} v${{ github.event.release.tag_name }} - ${{ github.event.release.name }}! - - Please see the changelog: ${{ github.event.release.html_url }} - TWITTER_CONSUMER_KEY: ${{ secrets.TWITTER_CONSUMER_KEY }} - TWITTER_CONSUMER_SECRET: ${{ secrets.TWITTER_CONSUMER_SECRET }} - TWITTER_ACCESS_TOKEN: ${{ secrets.TWITTER_ACCESS_TOKEN }} - TWITTER_ACCESS_TOKEN_SECRET: ${{ secrets.TWITTER_ACCESS_TOKEN_SECRET }} - bsky-post: runs-on: ubuntu-latest steps: - - uses: zentered/bluesky-post-action@80dbe0a7697de18c15ad22f4619919ceb5ccf597 # v0.1.0 + - uses: zentered/bluesky-post-action@6461056ea355ea43b977e149f7bf76aaa572e5e8 # v0.3.0 with: post: | Pipeline release! ${{ github.repository }} v${{ github.event.release.tag_name }} - ${{ github.event.release.name }}! 
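
The "get description" step added above queries the nf-core pipelines index for this repository's description before the announcement posts are assembled. A sketch of the same lookup run locally, with "nf-core/deepmutscan" standing in for the `${{ github.repository }}` expression:

```bash
# Rough local check of the description lookup used by the release-announcement job
# (assumes curl and jq are installed; the repository name is hard-coded here).
curl -s https://nf-co.re/pipelines.json \
  | jq -r '.remote_workflows[] | select(.full_name == "nf-core/deepmutscan") | .description'
```
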
diff --git a/.github/workflows/template_version_comment.yml b/.github/workflows/template-version-comment.yml similarity index 91% rename from .github/workflows/template_version_comment.yml rename to .github/workflows/template-version-comment.yml index 537529b..c5988af 100644 --- a/.github/workflows/template_version_comment.yml +++ b/.github/workflows/template-version-comment.yml @@ -9,12 +9,12 @@ jobs: runs-on: ubuntu-latest steps: - name: Check out pipeline code - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4 + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5 with: ref: ${{ github.event.pull_request.head.sha }} - name: Read template version from .nf-core.yml - uses: nichmor/minimal-read-yaml@v0.0.2 + uses: nichmor/minimal-read-yaml@1f7205277e25e156e1f63815781db80a6d490b8f # v0.0.2 id: read_yml with: config: ${{ github.workspace }}/.nf-core.yml diff --git a/.gitpod.yml b/.gitpod.yml deleted file mode 100644 index 83599f6..0000000 --- a/.gitpod.yml +++ /dev/null @@ -1,10 +0,0 @@ -image: nfcore/gitpod:latest -tasks: - - name: Update Nextflow and setup pre-commit - command: | - pre-commit install --install-hooks - nextflow self-update - -vscode: - extensions: - - nf-core.nf-core-extensionpack # https://github.com/nf-core/vscode-extensionpack diff --git a/.nf-core.yml b/.nf-core.yml index 59e948b..3251f8f 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -1,12 +1,8 @@ -repository_type: pipeline - -nf_core_version: 3.1.2 - lint: {} - +nf_core_version: 3.4.1 +repository_type: pipeline template: - org: nf-core - name: dmscore + author: Benjamin Wehnert & Max Stammnitz description: "Until now, most Deep Mutational Scanning (DMS) experiments relied\ \ on variant-specific barcoded libraries for sequencing. This method enabled DMS\ \ on large proteins and led to many great publications. Recently, efforts have\ @@ -17,8 +13,9 @@ template: \ files and generating a count table of variants. Along the way, it provides multiple\ \ QC metrics, enabling users to quickly evaluate the success of their experimental\ \ setup." - author: Benjamin Wehnert & Max Stammnitz - version: 1.0.0dev - force: true - outdir: . + force: false is_nfcore: true + name: deepmutscan + org: nf-core + outdir: . 
+ version: 1.0.0 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9e9f0e1..d06777a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,10 +4,24 @@ repos: hooks: - id: prettier additional_dependencies: - - prettier@3.2.5 - - - repo: https://github.com/editorconfig-checker/editorconfig-checker.python - rev: "3.0.3" + - prettier@3.6.2 + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v6.0.0 hooks: - - id: editorconfig-checker - alias: ec + - id: trailing-whitespace + args: [--markdown-linebreak-ext=md] + exclude: | + (?x)^( + .*ro-crate-metadata.json$| + modules/nf-core/.*| + subworkflows/nf-core/.*| + .*\.snap$ + )$ + - id: end-of-file-fixer + exclude: | + (?x)^( + .*ro-crate-metadata.json$| + modules/nf-core/.*| + subworkflows/nf-core/.*| + .*\.snap$ + )$ diff --git a/.prettierignore b/.prettierignore index edd29f0..2255e3e 100644 --- a/.prettierignore +++ b/.prettierignore @@ -10,4 +10,5 @@ testing/ testing* *.pyc bin/ +.nf-test/ ro-crate-metadata.json diff --git a/.prettierrc.yml b/.prettierrc.yml index c81f9a7..07dbd8b 100644 --- a/.prettierrc.yml +++ b/.prettierrc.yml @@ -1 +1,6 @@ printWidth: 120 +tabWidth: 4 +overrides: + - files: "*.{md,yml,yaml,html,css,scss,js,cff}" + options: + tabWidth: 2 diff --git a/CHANGELOG.md b/CHANGELOG.md index 40ea82f..0039a1b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,11 +1,11 @@ -# nf-core/dmscore: Changelog +# nf-core/deepmutscan: Changelog The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## v1.0.0dev - [date] +## v1.0.0 - [date] -Initial release of nf-core/dmscore, created with the [nf-core](https://nf-co.re/) template. +Initial release of nf-core/deepmutscan, created with the [nf-core](https://nf-co.re/) template. ### `Added` diff --git a/CITATIONS.md b/CITATIONS.md index a7e69f7..3ea1015 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -1,4 +1,4 @@ -# nf-core/dmscore: Citations +# nf-core/deepmutscan: Citations ## [nf-core](https://pubmed.ncbi.nlm.nih.gov/32055031/) diff --git a/LICENSE b/LICENSE index 48133aa..8119d7b 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) The nf-core/dmscore team +Copyright (c) The nf-core/deepmutscan team Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index b6ae647..d317632 100644 --- a/README.md +++ b/README.md @@ -1,106 +1,128 @@

[logo markup lost — this part of the hunk swaps the nf-core/dmscore logo images and alt text for the nf-core/deepmutscan equivalents]

-[![GitHub Actions CI Status](https://github.com/nf-core/dmscore/actions/workflows/ci.yml/badge.svg)](https://github.com/nf-core/dmscore/actions/workflows/ci.yml) -[![GitHub Actions Linting Status](https://github.com/nf-core/dmscore/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/dmscore/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/dmscore/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX) +[![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://github.com/codespaces/new/nf-core/deepmutscan) +[![GitHub Actions CI Status](https://github.com/nf-core/deepmutscan/actions/workflows/nf-test.yml/badge.svg)](https://github.com/nf-core/deepmutscan/actions/workflows/nf-test.yml) +[![GitHub Actions Linting Status](https://github.com/nf-core/deepmutscan/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/deepmutscan/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/deepmutscan/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX) [![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com) -[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A524.04.2-23aa62.svg)](https://www.nextflow.io/) +[![Nextflow](https://img.shields.io/badge/version-%E2%89%A525.04.0-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/) +[![nf-core template version](https://img.shields.io/badge/nf--core_template-3.4.1-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.4.1) [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) [![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/) [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/) -[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/dmscore) +[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/deepmutscan) -[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23dmscore-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/dmscore)[![Follow on Twitter](http://img.shields.io/badge/twitter-%40nf__core-1DA1F2?labelColor=000000&logo=twitter)](https://twitter.com/nf_core)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core) +[![Get help on 
Slack](http://img.shields.io/badge/slack-nf--core%20%23deepmutscan-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/deepmutscan)[![Follow on Bluesky](https://img.shields.io/badge/bluesky-%40nf__core-1185fe?labelColor=000000&logo=bluesky)](https://bsky.app/profile/nf-co.re)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core) ## Introduction -**nf-core/dmscore** is a bioinformatics pipeline that ... +**nf-core/deepmutscan** is a workflow designed for the analysis of deep mutational scanning (DMS) data. DMS enables researchers to experimentally measure the fitness effects of thousands of genes or gene variants simultaneously, helping to classify disease causing mutants in human and animal populations, to learn the fundamental rules of virus evolution, protein architecture, splicing, small-molecule interactions and many other phenotypes. - +While DNA synthesis and sequencing technologies have advanced substantially, long open reading frame (ORF) targets still present major challenges for DMS studies. Shotgun DNA sequencing can be used to greatly speed up the inference of long ORF mutant fitness landscapes, theoretically at no expense in accuracy. We have designed the `nf-core/deepmutscan` pipeline to unlock the power of shotgun sequencing based DMS studies on long ORFs, to simplify and standardise the complex bioinformatics steps involved in data processing of such experiments – from read alignment to QC reporting and fitness landscape inferences. - -1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/))2. Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) +

+ +

-## Usage +The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community! -> [!NOTE] -> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data. +On release, automated continuous integration tests run the pipeline on a full-sized dataset on the AWS cloud infrastructure. This ensures that the pipeline runs on AWS, has sensible resource allocation defaults set to run on real-world datasets, and permits the persistent storage of results to benchmark between pipeline releases and other analysis sources. The results obtained from the full-sized test can be viewed on the [nf-core website](https://nf-co.re/deepmutscan/results). - +1. Alignment of reads to the reference open reading frame (ORF) (`BWA-mem`) +2. Filtering of wildtype and erroneous reads (`samtools view`) +3. Read merging for base error reduction (`vsearch merge`, `BWA-mem`) +4. Mutation counting (`GATK AnalyzeSaturationMutagenesis`) +5. DMS library quality control +6. Data summarisation across samples +7. Single nucleotide variant error correction _(in development)_ +8. Fitness estimation _(in development)_ -Now, you can run the pipeline using: +## Usage - +> [!NOTE] +> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data. -```bash -nextflow run nf-core/dmscore \ - -profile \ - --input samplesheet.csv \ - --outdir +First, prepare a samplesheet with your input/output data in which each row represents a pair of fastq files (paired end). This should look as follows: + +```csv title="samplesheet.csv" +sample,type,replicate,file1,file2 +ORF1,input,1,/reads/forward1.fastq.gz,/reads/reverse1.fastq.gz +ORF1,input,2,/reads/forward2.fastq.gz,/reads/reverse2.fastq.gz +ORF1,output,1,/reads/forward3.fastq.gz,/reads/reverse3.fastq.gz +ORF1,output,2,/reads/forward4.fastq.gz,/reads/reverse4.fastq.gz ``` -> [!WARNING] -> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files). +Secondly, specify the gene or gene region of interest using a reference FASTA file via `--fasta`. Provide the exact codon coordinates using `--reading_frame`. 
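
To complement the samplesheet example above and the run command shown below, here is a purely illustrative way to create the reference file passed via `--fasta`; the file name matches the example command, but the sequence is a made-up placeholder rather than a real ORF:

```bash
# Illustrative only: write a single-entry reference FASTA for --fasta.
# Replace the header and sequence with the actual ORF under study; the span given
# to --reading_frame must lie within this sequence.
cat > ref.fa << 'EOF'
>ORF1
ATGGCTAGCAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAATTAGATGGTTAA
EOF
```
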
-For more details and further functionality, please refer to the [usage documentation](https://nf-co.re/dmscore/usage) and the [parameter documentation](https://nf-co.re/dmscore/parameters). +Now, you can run the pipeline using: + +```bash title="example pipeline run" +nextflow run nf-core/deepmutscan \ + -profile \ + --input ./samplesheet.csv \ + --fasta ./ref.fa \ + --reading_frame 1-300 \ + --outdir ./results +``` ## Pipeline output -To see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/dmscore/results) tab on the nf-core website pipeline page. +To see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/deepmutscan/results) tab on the nf-core website pipeline page. + For more details about the output files and reports, please refer to the -[output documentation](https://nf-co.re/dmscore/output). +[output documentation](https://nf-co.re/deepmutscan/output). -## Credits +## Contributing + +We welcome contributions from the community! -nf-core/dmscore was originally written by Benjamin Wehnert & Max Stammnitz. +For technical challenges and feedback on the pipeline, please use our [Github repository](https://github.com/nf-core/deepmutscan). Please open an [issue](https://github.com/nf-core/deepmutscan/issues/new) or [pull request](https://github.com/nf-core/deepmutscan/compare) to: -We thank the following people for their extensive assistance in the development of this pipeline: +- Report bugs or solve data incompatibilities when running `nf-core/deepmutscan` +- Suggest the implementation of new modules for custom DMS workflows +- Help improve this documentation - +If you are interested in getting involved as a developer, please consider joining our interactive [`#deepmutscan` Slack channel](https://nfcore.slack.com/channels/deepmutscan) (via [this invite](https://nf-co.re/join/slack)). + +## Credits -## Contributions and Support +nf-core/deepmutscan was originally written by [Benjamin Wehnert](https://github.com/BenjaminWehnert1008) and [Max Stammnitz](https://github.com/MaximilianStammnitz) at the [Centre for Genomic Regulation, Barcelona](https://www.crg.eu/), with the generous support of an EMBO Long-term Postdoctoral Fellowship and a Marie Skłodowska-Curie grant by the European Union. -If you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md). +If you use `nf-core/deepmutscan` in your analyses, please cite: -For further information or help, don't hesitate to get in touch on the [Slack `#dmscore` channel](https://nfcore.slack.com/channels/dmscore) (you can join with [this invite](https://nf-co.re/join/slack)). +> 📄 Wehnert et al., _bioRxiv_ preprint (coming soon) -## Citations +Please also cite the `nf-core` framework: - - +> 📄 Ewels et al., _Nature Biotechnology_, 2020 +> [https://doi.org/10.1038/s41587-020-0439-x](https://doi.org/10.1038/s41587-020-0439-x) - +For further information or help, don't hesitate to get in touch on the [Slack `#deepmutscan` channel](https://nfcore.slack.com/channels/deepmutscan) (you can join with [this invite](https://nf-co.re/join/slack)). -An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file. +## Scientific contact -You can cite the `nf-core` publication as follows: +For scientific discussions around the use of this pipeline (e.g. 
on experimental design or sequencing data requirements), please feel free to get in touch with us directly: -> **The nf-core framework for community-curated bioinformatics pipelines.** -> -> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen. -> -> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x). +- Benjamin Wehnert — wehnertbenjamin@gmail.com +- Maximilian Stammnitz — maximilian.stammnitz@crg.eu diff --git a/assets/adaptivecard.json b/assets/adaptivecard.json index 62ad62a..90786c9 100644 --- a/assets/adaptivecard.json +++ b/assets/adaptivecard.json @@ -17,7 +17,7 @@ "size": "Large", "weight": "Bolder", "color": "<% if (success) { %>Good<% } else { %>Attention<%} %>", - "text": "nf-core/dmscore v${version} - ${runName}", + "text": "nf-core/deepmutscan v${version} - ${runName}", "wrap": true }, { diff --git a/assets/email_template.html b/assets/email_template.html index 8baab9e..c051109 100644 --- a/assets/email_template.html +++ b/assets/email_template.html @@ -4,21 +4,21 @@ - - nf-core/dmscore Pipeline Report + + nf-core/deepmutscan Pipeline Report
-

nf-core/dmscore ${version}

+

nf-core/deepmutscan ${version}

Run Name: $runName

<% if (!success){ out << """
-

nf-core/dmscore execution completed unsuccessfully!

+

nf-core/deepmutscan execution completed unsuccessfully!

The exit status of the task that caused the workflow execution to fail was: $exitStatus.

The full error message was:

${errorReport}
@@ -27,7 +27,7 @@

nf-core/dmscore execution completed un } else { out << """
- nf-core/dmscore execution completed successfully! + nf-core/deepmutscan execution completed successfully!
""" } @@ -44,8 +44,8 @@

Pipeline Configuration:

-

nf-core/dmscore

-

https://github.com/nf-core/dmscore

+

nf-core/deepmutscan

+

https://github.com/nf-core/deepmutscan

diff --git a/assets/email_template.txt b/assets/email_template.txt index 4247bc4..907c089 100644 --- a/assets/email_template.txt +++ b/assets/email_template.txt @@ -4,15 +4,15 @@ |\\ | |__ __ / ` / \\ |__) |__ } { | \\| | \\__, \\__/ | \\ |___ \\`-._,-`-, `._,._,' - nf-core/dmscore ${version} + nf-core/deepmutscan ${version} ---------------------------------------------------- Run Name: $runName <% if (success){ - out << "## nf-core/dmscore execution completed successfully! ##" + out << "## nf-core/deepmutscan execution completed successfully! ##" } else { out << """#################################################### -## nf-core/dmscore execution completed unsuccessfully! ## +## nf-core/deepmutscan execution completed unsuccessfully! ## #################################################### The exit status of the task that caused the workflow execution to fail was: $exitStatus. The full error message was: @@ -35,5 +35,5 @@ Pipeline Configuration: <% out << summary.collect{ k,v -> " - $k: $v" }.join("\n") %> -- -nf-core/dmscore -https://github.com/nf-core/dmscore +nf-core/deepmutscan +https://github.com/nf-core/deepmutscan diff --git a/assets/methods_description_template.yml b/assets/methods_description_template.yml index e4e7ab3..0076d67 100644 --- a/assets/methods_description_template.yml +++ b/assets/methods_description_template.yml @@ -1,13 +1,13 @@ -id: "nf-core-dmscore-methods-description" +id: "nf-core-deepmutscan-methods-description" description: "Suggested text and references to use when describing pipeline usage within the methods section of a publication." -section_name: "nf-core/dmscore Methods Description" -section_href: "https://github.com/nf-core/dmscore" +section_name: "nf-core/deepmutscan Methods Description" +section_href: "https://github.com/nf-core/deepmutscan" plot_type: "html" ## TODO nf-core: Update the HTML below to your preferred methods description, e.g. add publication citation for this pipeline ## You inject any metadata in the Nextflow '${workflow}' object data: |

Methods

-

Data was processed using nf-core/dmscore v${workflow.manifest.version} ${doi_text} of the nf-core collection of workflows (Ewels et al., 2020), utilising reproducible software environments from the Bioconda (Grüning et al., 2018) and Biocontainers (da Veiga Leprevost et al., 2017) projects.

+

Data was processed using nf-core/deepmutscan v${workflow.manifest.version} ${doi_text} of the nf-core collection of workflows (Ewels et al., 2020), utilising reproducible software environments from the Bioconda (Grüning et al., 2018) and Biocontainers (da Veiga Leprevost et al., 2017) projects.

The pipeline was executed with Nextflow v${workflow.nextflow.version} (Di Tommaso et al., 2017) with the following command:

${workflow.commandLine}

${tool_citations}

diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index 2f6ceb9..5845dd0 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -1,13 +1,13 @@ report_comment: > - This report has been generated by the nf-core/dmscore + This report has been generated by the nf-core/deepmutscan analysis pipeline. For information about how to interpret these results, please see the - documentation. + documentation. report_section_order: - "nf-core-dmscore-methods-description": + "nf-core-deepmutscan-methods-description": order: -1000 software_versions: order: -1001 - "nf-core-dmscore-summary": + "nf-core-deepmutscan-summary": order: -1002 export_plots: true diff --git a/assets/nf-core-deepmutscan_logo_light.png b/assets/nf-core-deepmutscan_logo_light.png new file mode 100644 index 0000000..d6c8e55 Binary files /dev/null and b/assets/nf-core-deepmutscan_logo_light.png differ diff --git a/assets/nf-core-dmscore_logo_light.png b/assets/nf-core-dmscore_logo_light.png deleted file mode 100644 index 3faa4ca..0000000 Binary files a/assets/nf-core-dmscore_logo_light.png and /dev/null differ diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index 5f653ab..5b5d503 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,3 +1,5 @@ -sample,fastq_1,fastq_2 -SAMPLE_PAIRED_END,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz -SAMPLE_SINGLE_END,/path/to/fastq/files/AEG588A4_S4_L003_R1_001.fastq.gz, +sample,type,replicate,file1,file2 +ORF1,input,1,/reads/forward1.fastq.gz,/reads/reverse1.fastq.gz +ORF1,input,2,/reads/forward2.fastq.gz,/reads/reverse2.fastq.gz +ORF1,output,1,/reads/forward3.fastq.gz,/reads/reverse3.fastq.gz +ORF1,output,2,/reads/forward4.fastq.gz,/reads/reverse4.fastq.gz diff --git a/assets/schema_input.json b/assets/schema_input.json index 2d9d8f6..134ee17 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -1,7 +1,7 @@ { "$schema": "https://json-schema.org/draft/2020-12/schema", - "$id": "https://raw.githubusercontent.com/nf-core/dmscore/master/assets/schema_input.json", - "title": "nf-core/dmscore pipeline - params.input schema", + "$id": "https://raw.githubusercontent.com/nf-core/deepmutscan/master/assets/schema_input.json", + "title": "nf-core/deepmutscan pipeline - params.input schema", "description": "Schema for the file provided with params.input", "type": "array", "items": { @@ -9,25 +9,46 @@ "properties": { "sample": { "type": "string", - "pattern": "^\\S+$", - "errorMessage": "Sample name must be provided and cannot contain spaces", + "pattern": "^[^\\s/]+$", + "errorMessage": "Sample name must be provided, cannot contain spaces, and must not include special characters", "meta": ["id"] }, - "fastq_1": { + "file1": { "type": "string", "format": "file-path", "exists": true, - "pattern": "^\\S+\\.f(ast)?q\\.gz$", - "errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" + "pattern": "^\\S+\\.(bam|fastq|fq|fastq\\.gz|fq\\.gz)$", + "allOf": [ + { + "if": { + "pattern": "^\\S+\\.bam$" + }, + "then": { + "pattern": "^\\S+_(pe|se)\\.bam$", + "errorMessage": "If file1 ends with .bam, it must contain '_pe.bam' or '_se.bam', defining paired-end or single-end" + } + } + ], + "errorMessage": "File 1 must be provided, cannot contain spaces, and must have an allowed extension (.bam, .fastq, .fq, .fastq.gz, .fq.gz)" }, - "fastq_2": { - "type": "string", + "file2": { + "type": ["string", "null"], "format": 
"file-path", "exists": true, - "pattern": "^\\S+\\.f(ast)?q\\.gz$", - "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" + "pattern": "^\\S+\\.(fastq|fq|fastq\\.gz|fq\\.gz)$", + "errorMessage": "File 2 must have an allowed extension (.fastq, .fq, .fastq.gz, .fq.gz) or be null for single-end reads" + }, + "type": { + "type": "string", + "enum": ["input", "output", "quality"], + "errorMessage": "Type must be one of: input, output, or quality" + }, + "replicate": { + "type": "integer", + "minimum": 1, + "errorMessage": "Replicate must be a positive integer" } }, - "required": ["sample", "fastq_1"] + "required": ["sample", "file1", "type", "replicate"] } } diff --git a/assets/sendmail_template.txt b/assets/sendmail_template.txt index cdd2e19..4c84df3 100644 --- a/assets/sendmail_template.txt +++ b/assets/sendmail_template.txt @@ -9,12 +9,12 @@ Content-Type: text/html; charset=utf-8 $email_html --nfcoremimeboundary -Content-Type: image/png;name="nf-core-dmscore_logo.png" +Content-Type: image/png;name="nf-core-deepmutscan_logo.png" Content-Transfer-Encoding: base64 Content-ID: -Content-Disposition: inline; filename="nf-core-dmscore_logo_light.png" +Content-Disposition: inline; filename="nf-core-deepmutscan_logo_light.png" -<% out << new File("$projectDir/assets/nf-core-dmscore_logo_light.png"). +<% out << new File("$projectDir/assets/nf-core-deepmutscan_logo_light.png"). bytes. encodeBase64(). toString(). diff --git a/assets/slackreport.json b/assets/slackreport.json index e5aa3f8..0c5567f 100644 --- a/assets/slackreport.json +++ b/assets/slackreport.json @@ -3,7 +3,7 @@ { "fallback": "Plain-text summary of the attachment.", "color": "<% if (success) { %>good<% } else { %>danger<%} %>", - "author_name": "nf-core/dmscore ${version} - ${runName}", + "author_name": "nf-core/deepmutscan ${version} - ${runName}", "author_icon": "https://www.nextflow.io/docs/latest/_static/favicon.ico", "text": "<% if (success) { %>Pipeline completed successfully!<% } else { %>Pipeline completed with errors<% } %>", "fields": [ diff --git a/conf/base.config b/conf/base.config index 9aa66a0..8f313fe 100644 --- a/conf/base.config +++ b/conf/base.config @@ -1,6 +1,6 @@ /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - nf-core/dmscore Nextflow base config file + nf-core/deepmutscan Nextflow base config file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ A 'blank slate' config file, appropriate for general use on most high performance compute environments. Assumes that all software is installed and available on @@ -15,10 +15,16 @@ process { memory = { 6.GB * task.attempt } time = { 4.h * task.attempt } - errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' } + errorStrategy = { task.exitStatus in ((130..145) + 104 + 175) ? 'retry' : 'finish' } maxRetries = 1 maxErrors = '-1' + // Disable 'noclobber' and pre-clean versions.yml so modules can overwrite it + beforeScript = ''' + set +o noclobber + rm -f versions.yml || true + ''' + // Process-specific resource requirements // NOTE - Please try and reuse the labels below as much as possible. // These labels are used and recognised by default in DSL2 files hosted on nf-core/modules. @@ -59,4 +65,8 @@ process { errorStrategy = 'retry' maxRetries = 2 } + withLabel: process_gpu { + ext.use_gpu = { workflow.profile.contains('gpu') } + accelerator = { workflow.profile.contains('gpu') ? 
1 : null } + } } diff --git a/conf/modules.config b/conf/modules.config index d203d2b..b0211d7 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -20,6 +20,7 @@ process { withName: FASTQC { ext.args = '--quiet' + containerOptions = '' } withName: 'MULTIQC' { @@ -31,4 +32,151 @@ process { ] } + withName: 'BWA_INDEX' { + publishDir = [ + path: "${params.outdir}/intermediate_files/bam_files", + mode: 'copy', + saveAs: { filename -> filename == 'versions.yml' ? null : filename } + ] + } + + withName: 'BWA_MEM' { + publishDir = [ + path: "${params.outdir}/intermediate_files/bam_files/bwa/mem", + mode: 'copy', + saveAs: { filename -> filename == 'versions.yml' ? null : filename } + ] + } + + withName: 'BAMFILTER_DMS' { + publishDir = [ + path: "${params.outdir}/intermediate_files/bam_files/filtered", + mode: 'copy', + saveAs: { filename -> filename == 'versions.yml' ? null : filename } + ] + } + + withName: 'PREMERGE' { + publishDir = [ + path: "${params.outdir}/intermediate_files/bam_files/premerged", + mode: 'copy', + saveAs: { filename -> filename == 'versions.yml' ? null : filename } + ] + } + + withName: 'GATK_SATURATIONMUTAGENESIS' { + publishDir = [ + path: "${params.outdir}/intermediate_files/gatk", + mode: 'copy', + overwrite: false, + // put everything except versions.yml under a folder named by the sample id + saveAs: { filename -> + if (filename == 'versions.yml') return null + "${meta.id}/${filename}" + } + ] + } + + withName: 'DMSANALYSIS_POSSIBLE_MUTATIONS' { + publishDir = [ + path: "${params.outdir}/intermediate_files", + mode: 'copy', + saveAs: { filename -> filename == 'versions.yml' ? null : filename } + ] + } + + withName: 'DMSANALYSIS_AASEQ' { + publishDir = [ + path: "${params.outdir}/intermediate_files", + mode: 'copy', + saveAs: { filename -> filename == 'versions.yml' ? null : filename } + ] + } + + withName: 'DMSANALYSIS_PROCESS_GATK' { + publishDir = [ + path: "${params.outdir}/intermediate_files/processed_gatk_files", + mode: 'copy', + saveAs: { filename -> + if (filename == 'versions.yml') return null + "${meta.id}/${filename}" + } + ] + } + + withName: /.*VISUALIZATION_.*/ { + publishDir = [ + path: { "${params.outdir}/library_QC" }, // e.g. results/library_QC + mode: 'copy', + overwrite: false, + saveAs: { fn -> + if (fn == 'versions.yml') return null + "${meta.id}/${fn}" // put every output under the sample's subfolder + } + ] + } + + withName: 'GATK_GATKTOFITNESS' { + publishDir = [ + path: "${params.outdir}/fitness/DiMSum_results/single_rep_counts", + mode: 'copy', + saveAs: { filename -> filename == 'versions.yml' ? null : filename } + ] + } + + withName: 'MERGE_COUNTS' { + publishDir = [ + path: "${params.outdir}/fitness", + mode: 'copy', + saveAs: { filename -> filename == 'versions.yml' ? null : filename } + ] + } + + withName: 'EXPDESIGN_FITNESS' { + publishDir = [ + path: "${params.outdir}/fitness", + mode: 'copy', + saveAs: { filename -> filename == 'versions.yml' ? null : filename } + ] + } + + withName: 'FIND_SYNONYMOUS_MUTATION' { + publishDir = [ + path: "${params.outdir}/fitness", + mode: 'copy', + saveAs: { filename -> filename == 'versions.yml' ? null : filename } + ] + } + + withName: 'FITNESS_CALCULATION' { + publishDir = [ + path: "${params.outdir}/fitness/default_results", + mode: 'copy', + saveAs: { filename -> filename == 'versions.yml' ? null : filename } + ] + } + + withName: 'FITNESS_QC' { + publishDir = [ + path: "${params.outdir}/fitness/default_results", + mode: 'copy', + saveAs: { filename -> filename == 'versions.yml' ? 
null : filename } + ] + } + + withName: 'FITNESS_HEATMAP' { + publishDir = [ + path: "${params.outdir}/fitness/default_results", + mode: 'copy', + saveAs: { filename -> filename == 'versions.yml' ? null : filename } + ] + } + + withName: 'RUN_DIMSUM' { + publishDir = [ + path: "${params.outdir}/fitness/DiMSum_results", + mode: 'copy', + saveAs: { filename -> filename == 'versions.yml' ? null : filename } + ] + } } diff --git a/conf/test.config b/conf/test.config index 8c566af..7c27314 100644 --- a/conf/test.config +++ b/conf/test.config @@ -5,7 +5,7 @@ Defines input files and everything required to run a fast and simple pipeline test. Use as follows: - nextflow run nf-core/dmscore -profile test, --outdir + nextflow run nf-core/deepmutscan -profile test, --outdir ---------------------------------------------------------------------------------------- */ @@ -13,7 +13,7 @@ process { resourceLimits = [ cpus: 4, - memory: '15.GB', + memory: '8.GB', time: '1.h' ] } @@ -25,6 +25,6 @@ params { // Input data // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = params.pipelines_testdata_base_path + 'viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv'// Genome references + input = params.pipelines_testdata_base_path + 'samplesheet_qc_only.csv'// Genome references genome = 'R64-1-1' } diff --git a/conf/test_full.config b/conf/test_full.config index b57a75c..edb899d 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -5,7 +5,7 @@ Defines input files and everything required to run a full size pipeline test. Use as follows: - nextflow run nf-core/dmscore -profile test_full, --outdir + nextflow run nf-core/deepmutscan -profile test_full, --outdir ---------------------------------------------------------------------------------------- */ @@ -17,7 +17,7 @@ params { // Input data for full size test // TODO nf-core: Specify the paths to your full test data ( on nf-core/test-datasets or directly in repositories, e.g. SRA) // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = params.pipelines_testdata_base_path + 'viralrecon/samplesheet/samplesheet_full_illumina_amplicon.csv' + input = params.pipelines_testdata_base_path + 'samplesheet_qc_only.csv' // Genome references genome = 'R64-1-1' diff --git a/docs/README.md b/docs/README.md index 917d984..657c1b3 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,6 +1,6 @@ -# nf-core/dmscore: Documentation +# nf-core/deepmutscan: Documentation -The nf-core/dmscore documentation is split into the following pages: +The nf-core/deepmutscan documentation is split into the following pages: - [Usage](usage.md) - An overview of how the pipeline works, how to run it and a description of all of the different command-line flags. 
diff --git a/docs/images/nf-core-deepmutscan_logo_dark.png b/docs/images/nf-core-deepmutscan_logo_dark.png new file mode 100644 index 0000000..398e0b2 Binary files /dev/null and b/docs/images/nf-core-deepmutscan_logo_dark.png differ diff --git a/docs/images/nf-core-deepmutscan_logo_light.png b/docs/images/nf-core-deepmutscan_logo_light.png new file mode 100644 index 0000000..f8bbe08 Binary files /dev/null and b/docs/images/nf-core-deepmutscan_logo_light.png differ diff --git a/docs/images/nf-core-dmscore_logo_dark.png b/docs/images/nf-core-dmscore_logo_dark.png deleted file mode 100644 index 7cbb4e1..0000000 Binary files a/docs/images/nf-core-dmscore_logo_dark.png and /dev/null differ diff --git a/docs/images/nf-core-dmscore_logo_light.png b/docs/images/nf-core-dmscore_logo_light.png deleted file mode 100644 index 569ba7e..0000000 Binary files a/docs/images/nf-core-dmscore_logo_light.png and /dev/null differ diff --git a/docs/output.md b/docs/output.md index ff49a08..4f57067 100644 --- a/docs/output.md +++ b/docs/output.md @@ -1,22 +1,25 @@ -# nf-core/dmscore: Output +# nf-core/deepmutscan: Output ## Introduction -This document describes the output produced by the pipeline. Most of the plots are taken from the MultiQC report, which summarises results at the end of the pipeline. +The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory: -The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory. - - - -## Pipeline overview - -The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: - -- [FastQC](#fastqc) - Raw read QC- [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline -- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution +```tree +results/ +├── fastqc/ # Individual HTML reports for specified fastq files, raw sequencing QC +├── fitness/ # Merged variant count tables, fitness and error estimates, replicate correlations and heatmaps +├── intermediate_files/ # Raw alignments, raw and pre-filtered variant count tables, QC reports +├── library_QC/ # Sample-specific PDF visualizations: position-wise sequencing coverage, count heatmaps, etc. +├── multiqc/ # Shared HTML reports for all fastq files, raw sequencing QC +├── pipelineinfo/ # Nextflow helper files for timeline and summary report generation +├── timeline.html # Nextflow timeline for all tasks +└── report.html # Nextflow summary report incl. detailed CPU and memory usage per for all tasks +``` ### FastQC +[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). +
Output files @@ -26,7 +29,9 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
-[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/).### MultiQC +### MultiQC + +[MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory. Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see .### Pipeline information
Output files @@ -38,19 +43,66 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
-[MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory. +### Intermediate files + +This directory is created during the first series of steps of the pipeline, featuring raw read alignments, filtering and variant counting. + +
+Output files + +- `intermediate_files/` + - `aa_seq.txt`: + - `bam_files/bwa`: + - `bam_files/filtered`: + - `bam_files/premerged`: + - `gatk`: + - `possible_mutations.csv`: + - `processed_gatk_files`: + +
+ +### Library QC -Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see .### Pipeline information +This directory is created during the second series of steps of the pipeline, featuring various QC visualisations for each sample.
Output files +- `library_QC/` + - `counts_heatmap.pdf`: + - `counts_per_cov_heatmap.pdf`: + - `logdiff_plot.pdf`: + - `logdiff_varying_bases.pdf`: + - `rolling_counts_per_cov.pdf`: + - `rolling_counts.pdf`: + - `rolling_coverage.pdf`: + - `SeqDepth.pdf` (optional): + +
+ +### Fitness + +This directory is created during the final series of steps of the pipeline, featuring fitness and fitness error estimates (when DMS input/output sample groups are specified). + +
+Output files + +- `fitness/` + - `counts_merged.tsv`: summarised gene variant counts across all input and output samples. + - `default_results/fitness_estimation_count_correlation.pdf`: + - `default_results/fitness_estimation_fitness_correlation.pdf`: + - `default_results/fitness_heatmap.pdf`: + - `default_results/fitness_estimation.tsv`: + - `DiMSum_results/dimsum_results` (optional): + +
+ +### Pipeline Info + - `pipeline_info/` - Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`. - Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.yml`. The `pipeline_report*` files will only be present if the `--email` / `--email_on_fail` parameter's are used when running the pipeline. - Reformatted samplesheet files used as input to the pipeline: `samplesheet.valid.csv`. - Parameters used by the pipeline run: `params.json`. - - [Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage. diff --git a/docs/pipeline.png b/docs/pipeline.png new file mode 100644 index 0000000..04ad742 Binary files /dev/null and b/docs/pipeline.png differ diff --git a/docs/pipeline_steps.md b/docs/pipeline_steps.md new file mode 100644 index 0000000..9573527 --- /dev/null +++ b/docs/pipeline_steps.md @@ -0,0 +1,111 @@ +# nf-core/deepmutscan: Detailed Pipeline Steps + +This page provides in-depth descriptions of the data processing modules implemented in **nf-core/deepmutscan**. It is mainly intended for advanced users and developers who want to understand the rationale behind design choices, explore implementation details, and consider potential future extensions. + +--- + +## Overview + +The pipeline processes deep mutational scanning (DMS) sequencing data in several stages: + +1. Alignment of reads to the reference open reading frame (ORF) +2. Filtering of wildtype and erroneous reads +3. Read merging for base error reduction +4. Mutation counting +5. DMS library quality control +6. Data summarisation across samples +7. Single nucleotide variant error correction _(in development)_ +8. Fitness estimation _(in development)_ + +![pipeline](/docs/pipeline.png) + +Each step is explained below. Links are provided to the primary tools and libraries used, where applicable. + +--- + +## 1. Alignment + +All paired-end raw reads are first aligned to the provided reference ORF using [**bwa-mem**](http://bio-bwa.sourceforge.net/). This is a highly efficient mapping algorithm for reads ≥100 bp, with its multi-threading support automatically handled by nf-core. + +In future versions of nf-core/deepmutscan, we consider the use of [**bwa-mem2**](https://github.com/bwa-mem2/bwa-mem2), which provides similar alignment rates with a moderate speed increase ([Vasimuddin et al., _IPDPS_ 2019](https://ieeexplore.ieee.org/document/8820962)). With the increasing diversity of sequencing platforms for DMS, new throughput, read length, and error profiles may require further alignment options to be implemented. + +--- + +## 2. Filtering + +For long ORF site-saturation mutagenesis libraries, most aligned shotgun sequencing reads contain exact matches against the reference. It is not possible to infer which of these stem from mutant versus wildtype DNA molecules prior to fragmentation, hence they are filtered out. Similarly, erroneous reads with unexpected indels are also removed. + +To this end, we use [**samtools view**](https://www.htslib.org/doc/samtools.html). + +--- + +## 3. Read Merging + +Even the highest-accuracy next-generation sequencing platforms do not have perfect base accuracy. 
To minimise the effect of base errors (which would otherwise be counted as "false mutations"), nf-core/deepmutscan uses the overlap of each aligned read pair. With base errors on the forward and reverse read being independent, the pipeline applies the [**vsearch fastq_mergepairs**](https://github.com/torognes/vsearch) function to convert each read pair into a single consensus molecule with adjusted base error scores. + +> [!TIP] +> Optimal merging performance is usually obtained if the average DNA fragment size matches the read size. For example, libraries sequenced with 150 bp paired-end reads should ideally also be sheared/tagmented to a mean size of 150 bp. + +Future versions may offer additional options depending on sequencing type and error profiles. + +--- + +## 4. Variant Counting + +Aligned, non-wildtype consensus reads are screened for exact, base-level mismatches. nf-core/deepmutscan currently uses the popular [**GATK AnalyzeSaturationMutagenesis**](https://gatk.broadinstitute.org/hc/en-us/articles/360037594771-AnalyzeSaturationMutagenesis-BETA) function to count occurrences of all single, double, triple, and higher-order nucleotide changes between each read and the reference ORF. + +We are currently working on the nf-core/deepmutscan implementation of a much lighter, alternative Python implementation for mutation counting. In this script, users will be allowed to specify a minimum base quality cutoff for mutations to be included in the final count table (default: Q30) – an option not available in GATK. + +--- + +## 5. DMS Library Quality Control + +By integrating the reference ORF coordinates and the chosen DMS library type (default: NNK/NNs degenerate codon-based nicking), nf-core/deepmutscan calculates a number of mutation count summary statistics. + +Custom visualisations allow for inspection of (1) mutation efficiency along the ORF, (2) position-specific recovery of amino acid diversity, and (3) overall sequencing coverage evenness and saturation. + +--- + +## 6. Data Summarisation for Fitness Estimation + +Steps 1-5 are iteratively run across all samples defined in the `.csv` spreadsheet. Once read alignment, merging, mutation counting, and library QC have been completed for the full list of samples, users can opt to proceed with fitness estimation. To this end, the pipeline generates all the necessary input files by merging mutation counts across samples. + +--- + +## 7. Single Nucleotide Variant Error Correction _(in development)_ + +This module will implement strategies to distinguish true single nucleotide variants from sequencing artefacts. There are two options to perform this: + +- Empirical error rate modelling based on wildtype sequencing +- Empirical error rate modelling based on false double mutants _(in development)_ + +--- + +## 8. Fitness Estimation _(in development)_ + +The final step of the pipeline will perform fitness estimation based on mutation counts. By default, we calculate fitness scores as the logarithm of variants' output to input ratio, normalised to that of the provided wildtype sequence. Future expansions may include: + +- Integration of other popular fitness inference tools, including [DiMSum](https://github.com/lehner-lab/DiMSum), [Enrich2](https://github.com/FowlerLab/Enrich2), [rosace](https://github.com/pimentellab/rosace/) and [mutscan](https://github.com/fmicompbio/mutscan) +- Standardised output formats for downstream analyses and comparison + +> [!IMPORTANT] +> We note that exact wildtype sequence reads are filtered out in stage 2. 
Including synonymous wildtype codons in the original mutagenesis design is therefore essential when it comes to calibrating the fitness calculations. + +--- + +## Notes for Developers + +- Custom scripts used in filtering and mutation counting are available in the `bin/` directory of the repository. +- Modules are implemented in Nextflow DSL2 and follow the nf-core community guidelines. +- Contributions, optimisations, and additional analysis modules are welcome - please open a pull request or GitHub issue to discuss ideas. + +_This document is meant as a living reference. As the pipeline evolves, the descriptions of steps 7 and 8 will be expanded with concrete implementation details._ + +--- + +## Contact + +For detailled scientific or technical questions, feedback and experimental discussions, feel free to contact us directly: + +- Benjamin Wehnert — wehnertbenjamin@gmail.com +- Maximilian Stammnitz — maximilian.stammnitz@crg.eu diff --git a/docs/usage.md b/docs/usage.md index e7ae0cb..9a7e86b 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -1,74 +1,38 @@ -# nf-core/dmscore: Usage +# nf-core/deepmutscan: Usage -## :warning: Please read this documentation on the nf-core website: [https://nf-co.re/dmscore/usage](https://nf-co.re/dmscore/usage) +## :warning: Please read this documentation on the nf-core website: [https://nf-co.re/deepmutscan/usage](https://nf-co.re/deepmutscan/usage) > _Documentation of pipeline parameters is generated automatically from the pipeline schema and can no longer be found in markdown files._ ## Introduction - +`nf-core/deepmutscan` is a workflow designed for the analysis of deep mutational scanning (DMS) data. DMS enables researchers to experimentally measure the fitness effects of thousands of genes or gene variants simultaneously, helping to classify disease causing mutants in human and animal populations, to learn the fundamental rules of virus evolution, protein architecture, splicing, small-molecule interactions and many other phenotypes. -## Samplesheet input - -You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below. - -```bash ---input '[path to samplesheet file]' -``` - -### Multiple runs of the same sample - -The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 3 lanes: - -```csv title="samplesheet.csv" -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz -CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz -``` - -### Full samplesheet - -The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 3 columns to match those defined in the table below. - -A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 6 samples, where `TREATMENT_REP3` has been sequenced twice. 
- -```csv title="samplesheet.csv" -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP2,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz -CONTROL_REP3,AEG588A3_S3_L002_R1_001.fastq.gz,AEG588A3_S3_L002_R2_001.fastq.gz -TREATMENT_REP1,AEG588A4_S4_L003_R1_001.fastq.gz, -TREATMENT_REP2,AEG588A5_S5_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz, -``` - -| Column | Description | -| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). | -| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | -| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | - -An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. +This page provides in-depth descriptions of the data processing modules implemented in `nf-core/deepmutscan`. It is intended for both advanced and developers who want to understand the rationale behind certain design choices, explore implementation details, and consider potential future extensions. ## Running the pipeline -The typical command for running the pipeline is as follows: +The typical command for running the pipeline (on an example protein-coding gene with 100 amino acids) is as follows: ```bash -nextflow run nf-core/dmscore --input ./samplesheet.csv --outdir ./results --genome GRCh37 -profile docker +nextflow run nf-core/deepmutscan \ + -profile \ + --input ./samplesheet.csv \ + --fasta ./ref.fa \ + --reading_frame 1-300 \ + --outdir ./results ``` -This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. +The `-profile ` specification is mandatory and should reflect either your own institutional profile or any pipeline profile specified in the [profile section](##-profile). + +This will launch the pipeline by performing sequencing read alignments, various raw data QC analyses, optional fitness and fitness error estimations. Note that the pipeline will create the following files in your working directory: ```bash -work # Directory containing the nextflow working files - # Finished results in specified location (defined with --outdir) -.nextflow_log # Log file from Nextflow -# Other nextflow hidden files, eg. history of pipeline runs and old logs. +work # Directory containing the nextflow working files +results # Finished results in specified location (defined with --outdir) +.nextflow_log # Log file from Nextflow ``` If you wish to repeatedly use the same parameters for multiple runs, rather than specifying each flag in the command, you can specify these in a params file. 
@@ -81,7 +45,7 @@ Pipeline settings can be provided in a `yaml` or `json` file via `-params-file < The above pipeline run specified with a params file in yaml format: ```bash -nextflow run nf-core/dmscore -profile docker -params-file params.yaml +nextflow run nf-core/deepmutscan -profile docker -params-file params.yaml ``` with: @@ -89,25 +53,132 @@ with: ```yaml title="params.yaml" input: './samplesheet.csv' outdir: './results/' -genome: 'GRCh37' +gene reference: 'ref.fa' <...> ``` You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch). -### Updating the pipeline +## Inputs + +Users need to first prepare a samplesheet with your input/output data in which each row represents a pair of fastq files (paired end). This should look as follows: + +```csv title="samplesheet.csv" +sample,type,replicate,file1,file2 +ORF1,input,1,/reads/forward1.fastq.gz,/reads/reverse1.fastq.gz +ORF1,input,2,/reads/forward2.fastq.gz,/reads/reverse2.fastq.gz +ORF1,output,1,/reads/forward3.fastq.gz,/reads/reverse3.fastq.gz +ORF1,output,2,/reads/forward4.fastq.gz,/reads/reverse4.fastq.gz +``` + +Secondly, users need to specify the gene or gene region of interest using a reference FASTA file via `--fasta`. Provide the exact codon coordinates using `--reading_frame`. + +## Optional parameters + +Several optional parameters are available for `nf-core/deepmutscan`, some of which are currently _(in development)_. + +| Parameter | Default | Description | +| -------------------- | --------------- | ------------------------------------------------------------- | +| `--run_seqdepth` | `false` | Estimate sequencing saturation by rarefaction | +| `--fitness` | `false` | Default fitness inference module | +| `--dimsum` | `false` | Optional fitness inference module _(AMD/x86_64 systems only)_ | +| `--mutagenesis` | `nnk` | Deep mutational scanning strategy used _(in development)_ | +| `--error-estimation` | `wt_sequencing` | Error model used to correct 1nt counts _(in development)_ | +| `--read-align` | `bwa-mem` | Customised read aligner _(in development)_ | + +## Pipeline output + +After execution, the pipeline creates the following directory structure: + +``` +results/ +├── fastqc/ # Individual HTML reports for specified fastq files, raw sequencing QC +├── fitness/ # Merged variant count tables, fitness and error estimates, replicate correlations and heatmaps +├── intermediate_files/ # Raw alignments, raw and pre-filtered variant count tables, QC reports +├── library_QC/ # Sample-specific PDF visualizations: position-wise sequencing coverage, count heatmaps, etc. +├── multiqc/ # Shared HTML reports for all fastq files, raw sequencing QC +├── pipelineinfo/ # Nextflow helper files for timeline and summary report generation +├── timeline.html # Nextflow timeline for all tasks +└── report.html # Nextflow summary report incl. detailed CPU and memory usage per for all tasks +``` + +## Detailed steps + +### 1. Alignment + +All paired-end raw reads are first aligned to the provided reference ORF using [**bwa-mem**](http://bio-bwa.sourceforge.net/). This is a highly efficient mapping algorithm for reads ≥100 bp, with its multi-threading support automatically handled by nf-core. + +In future versions of `nf-core/deepmutscan`, we consider the use of [**bwa-mem2**](https://github.com/bwa-mem2/bwa-mem2), which provides similar alignment rates with a moderate speed increase ([Vasimuddin et al., _IPDPS_ 2019](https://ieeexplore.ieee.org/document/8820962)). 
With the increasing diversity of sequencing platforms for DMS, new throughput, read length, and error profiles may require further alignment options to be implemented. + +### 2. Filtering + +For long ORF site-saturation mutagenesis libraries, most aligned shotgun sequencing reads contain exact matches against the reference. It is not possible to infer which of these stem from mutant versus wildtype DNA molecules prior to fragmentation, hence they are filtered out. Similarly, erroneous reads with unexpected indels are also removed. + +To this end, we use [**samtools view**](https://www.htslib.org/doc/samtools.html). + +### 3. Read Merging + +Even the highest-accuracy next-generation sequencing platforms do not have perfect base accuracy. To minimise the effect of base errors (which would otherwise be counted as "false mutations"), `nf-core/deepmutscan` uses the overlap of each aligned read pair. With base errors on the forward and reverse read being independent, the pipeline applies the [**vsearch fastq_mergepairs**](https://github.com/torognes/vsearch) function to convert each read pair into a single consensus molecule with adjusted base error scores. + +> [!TIP] +> Optimal merging performance is usually obtained if the average DNA fragment size matches the read size. For example, libraries sequenced with 150 bp paired-end reads should ideally also be sheared/tagmented to a mean size of 150 bp. + +Future versions may offer additional options depending on sequencing type and error profiles. + +### 4. Variant Counting + +Aligned, non-wildtype consensus reads are screened for exact, base-level mismatches. `nf-core/deepmutscan` currently uses the popular [**GATK AnalyzeSaturationMutagenesis**](https://gatk.broadinstitute.org/hc/en-us/articles/360037594771-AnalyzeSaturationMutagenesis-BETA) function to count occurrences of all single, double, triple, and higher-order nucleotide changes between each read and the reference ORF. + +We are currently working on the `nf-core/deepmutscan` implementation of a much lighter, alternative Python implementation for mutation counting. In this script, users will be allowed to specify a minimum base quality cutoff for mutations to be included in the final count table (default: Q30) – an option not available in GATK. + +### 5. DMS Library Quality Control + +By integrating the reference ORF coordinates and the chosen DMS library type (default: NNK/NNs degenerate codon-based nicking), `nf-core/deepmutscan` calculates a number of mutation count summary statistics. + +Custom visualisations allow for inspection of (1) mutation efficiency along the ORF, (2) position-specific recovery of amino acid diversity, and (3) overall sequencing coverage evenness and saturation. + +### 6. Data Summarisation for Fitness Estimation + +Steps 1-5 are iteratively run across all samples defined in the `.csv` spreadsheet. Once read alignment, merging, mutation counting, and library QC have been completed for the full list of samples, users can opt to proceed with fitness estimation. To this end, the pipeline generates all the necessary input files by merging mutation counts across samples. + +### 7. Single Nucleotide Variant Error Correction _(in development)_ + +This module will implement strategies to distinguish true single nucleotide variants from sequencing artefacts. There are two options to perform this: + +- Empirical error rate modelling based on wildtype sequencing +- Empirical error rate modelling based on false double mutants _(in development)_ + +### 8. 
Fitness Estimation _(in development)_ + +The final step of the pipeline will perform fitness estimation based on mutation counts. By default, we calculate fitness scores as the logarithm of variants' output to input ratio, normalised to that of the provided wildtype sequence. Future expansions may include: + +- Integration of other popular fitness inference tools, including [DiMSum](https://github.com/lehner-lab/DiMSum), [Enrich2](https://github.com/FowlerLab/Enrich2), [rosace](https://github.com/pimentellab/rosace/) and [mutscan](https://github.com/fmicompbio/mutscan) +- Standardised output formats for downstream analyses and comparison + +> [!IMPORTANT] +> We note that exact wildtype sequence reads are filtered out in stage 2. Including synonymous wildtype codons in the original mutagenesis design is therefore essential when it comes to calibrating the fitness calculations. + +## Notes for Developers + +- Custom scripts used in filtering and mutation counting are available in the `bin/` directory of the repository. +- Modules are implemented in Nextflow DSL2 and follow the nf-core community guidelines. +- Contributions, optimisations, and additional analysis modules are welcome - please open a pull request or GitHub issue to discuss ideas. + +_This document is meant as a living reference. As the pipeline evolves, the descriptions of steps 7 and 8 will be expanded with concrete implementation details._ + +## Updating the pipeline When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline: ```bash -nextflow pull nf-core/dmscore +nextflow pull nf-core/deepmutscan ``` -### Reproducibility +## Reproducibility It is a good idea to specify the pipeline version when running the pipeline on your data. This ensures that a specific version of the pipeline code and software are used when you run your pipeline. If you keep using the same tag, you'll be running the same version of the pipeline, even if there have been changes to the code since. -First, go to the [nf-core/dmscore releases page](https://github.com/nf-core/dmscore/releases) and find the latest pipeline version - numeric only (eg. `1.3.1`). Then specify this when running the pipeline with `-r` (one hyphen) - eg. `-r 1.3.1`. Of course, you can switch to another version by changing the number after the `-r` flag. +First, go to the [nf-core/deepmutscan releases page](https://github.com/nf-core/deepmutscan/releases) and find the latest pipeline version - numeric only (eg. `1.0.0`). Then specify this when running the pipeline with `-r` (one hyphen) - eg. `-r 1.0.0`. Of course, you can switch to another version by changing the number after the `-r` flag. This version number will be logged in reports when you run the pipeline, so that you'll know what you used when you look back in the future. For example, at the bottom of the MultiQC reports. 
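
As a hedged example tying the reproducibility and params-file options together (the release number `1.0.0` and the file name `params.yaml` are the illustrative values used above, not fixed requirements):

```bash
# Run a pinned pipeline release with parameters supplied from a file
nextflow run nf-core/deepmutscan \
    -r 1.0.0 \
    -profile docker \
    -params-file params.yaml
```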
@@ -149,7 +220,7 @@ If `-profile` is not specified, the pipeline will run locally and expect all sof - `shifter` - A generic configuration profile to be used with [Shifter](https://nersc.gitlab.io/development/shifter/how-to-use/) - `charliecloud` - - A generic configuration profile to be used with [Charliecloud](https://hpc.github.io/charliecloud/) + - A generic configuration profile to be used with [Charliecloud](https://charliecloud.io/) - `apptainer` - A generic configuration profile to be used with [Apptainer](https://apptainer.org/) - `wave` diff --git a/log.txt b/log.txt new file mode 100644 index 0000000..dceb2c1 --- /dev/null +++ b/log.txt @@ -0,0 +1,613 @@ +May-07 23:01:49.322 [main] DEBUG nextflow.cli.Launcher - $> nextflow run . -profile test,docker --fasta /Users/benjaminwehnert/GID1A_SUNi_ref_small.fasta --reading_frame 352-1383 --min_counts 2 --mutagenesis_type max_diff_to_wt --outdir ./results +May-07 23:01:49.500 [main] DEBUG nextflow.cli.CmdRun - N E X T F L O W ~ version 24.04.4 +May-07 23:01:49.524 [main] DEBUG nextflow.plugin.PluginsFacade - Setting up plugin manager > mode=prod; embedded=false; plugins-dir=/Users/benjaminwehnert/.nextflow/plugins; core-plugins: nf-amazon@2.5.3,nf-azure@1.6.1,nf-cloudcache@0.4.1,nf-codecommit@0.2.1,nf-console@1.1.3,nf-ga4gh@1.3.0,nf-google@1.13.2-patch1,nf-tower@1.9.1,nf-wave@1.4.2-patch1 +May-07 23:01:49.533 [main] INFO o.pf4j.DefaultPluginStatusProvider - Enabled plugins: [] +May-07 23:01:49.534 [main] INFO o.pf4j.DefaultPluginStatusProvider - Disabled plugins: [] +May-07 23:01:49.536 [main] INFO org.pf4j.DefaultPluginManager - PF4J version 3.12.0 in 'deployment' mode +May-07 23:01:49.549 [main] INFO org.pf4j.AbstractPluginManager - No plugins +May-07 23:01:50.385 [main] WARN nextflow.config.Manifest - Invalid config manifest attribute `contributors` +May-07 23:01:50.406 [main] DEBUG nextflow.config.ConfigBuilder - Found config local: /Users/benjaminwehnert/dmscore/nextflow.config +May-07 23:01:50.408 [main] DEBUG nextflow.config.ConfigBuilder - Parsing config file: /Users/benjaminwehnert/dmscore/nextflow.config +May-07 23:01:50.426 [main] DEBUG n.secret.LocalSecretsProvider - Secrets store: /Users/benjaminwehnert/.nextflow/secrets/store.json +May-07 23:01:50.428 [main] DEBUG nextflow.secret.SecretsLoader - Discovered secrets providers: [nextflow.secret.LocalSecretsProvider@169268a7] - activable => nextflow.secret.LocalSecretsProvider@169268a7 +May-07 23:01:50.438 [main] DEBUG nextflow.config.ConfigBuilder - Applying config profile: `test,docker` +May-07 23:01:51.909 [main] DEBUG nextflow.config.ConfigBuilder - Available config profiles: [bih, cfc_dev, uzl_omics, ifb_core, embl_hd, denbi_qbic, alice, mjolnir_globe, uppmax, giga, incliva, ilifu, ki_luria, uge, icr_alma, rosalind_uge, lugh, mccleary, unibe_ibu, vai, czbiohub_aws, jax, roslin, ccga_med, tes, scw, unc_longleaf, tigem, tubingen_apg, google, apollo, ipop_up, vsc_calcua, pdc_kth, googlels, ceci_nic5, humantechnopole, stjude, daisybio, eddie, medair, biowulf, apptainer, bi, bigpurple, adcra, cedars, pawsey_setonix, vsc_kul_uhasselt, pawsey_nimbus, ucl_myriad, utd_ganymede, charliecloud, seattlechildrens, icr_davros, ceres, arm, munin, rosalind, hasta, cfc, uzh, shu_bmrc, ebi_codon_slurm, ebc, ccga_dx, crick, ku_sund_danhead, marvin, shifter, biohpc_gen, mana, mamba, york_viking, unc_lccc, wehi, awsbatch, wustl_htcf, arcc, ceci_dragon2, imperial, maestro, software_license, cannon, genotoul, nci_gadi, abims, janelia, nu_genomics, googlebatch, oist, sahmri, kaust, 
alliance_canada, mpcdf, leicester, vsc_ugent, create, sage, cambridge, jex, podman, ebi_codon, cheaha, xanadu, nyu_hpc, test, marjorie, computerome, ucd_sonic, seg_globe, mssm, sanger, dkfz, bluebear, pasteur, einstein, ethz_euler, m3c, test_full, imb, ucl_cscluster, tuos_stanage, azurebatch, hki, seadragon, crukmi, csiro_petrichor, qmul_apocrita, wave, docker, engaging, gis, hypatia, psmn, eva, unity, cropdiversityhpc, nygc, fgcz, conda, crg, singularity, mpcdf_viper, pe2, self_hosted_runner, tufts, uw_hyak_pedslabs, binac2, debug, genouest, cbe, unsw_katana, gitpod, phoenix, seawulf, uod_hpc, fub_curta, uct_hpc, aws_tower, binac, fsu_draco] +May-07 23:01:51.958 [main] DEBUG nextflow.cli.CmdRun - Applied DSL=2 by global default +May-07 23:01:51.972 [main] DEBUG nextflow.cli.CmdRun - Launching `./main.nf` [modest_coulomb] DSL2 - revision: 84101fc51c +May-07 23:01:51.974 [main] DEBUG nextflow.plugin.PluginsFacade - Plugins declared=[nf-schema@2.3.0] +May-07 23:01:51.974 [main] DEBUG nextflow.plugin.PluginsFacade - Plugins default=[] +May-07 23:01:51.974 [main] DEBUG nextflow.plugin.PluginsFacade - Plugins resolved requirement=[nf-schema@2.3.0] +May-07 23:01:51.975 [main] DEBUG nextflow.plugin.PluginUpdater - Installing plugin nf-schema version: 2.3.0 +May-07 23:01:51.983 [main] INFO org.pf4j.AbstractPluginManager - Plugin 'nf-schema@2.3.0' resolved +May-07 23:01:51.983 [main] INFO org.pf4j.AbstractPluginManager - Start plugin 'nf-schema@2.3.0' +May-07 23:01:51.990 [main] DEBUG nextflow.plugin.BasePlugin - Plugin started nf-schema@2.3.0 +May-07 23:01:52.045 [main] DEBUG nextflow.Session - Session UUID: 2397b75e-3882-46d7-ba2c-8549f8a2b4a6 +May-07 23:01:52.045 [main] DEBUG nextflow.Session - Run name: modest_coulomb +May-07 23:01:52.046 [main] DEBUG nextflow.Session - Executor pool size: 8 +May-07 23:01:52.053 [main] DEBUG nextflow.file.FilePorter - File porter settings maxRetries=3; maxTransfers=50; pollTimeout=null +May-07 23:01:52.057 [main] DEBUG nextflow.util.ThreadPoolBuilder - Creating thread pool 'FileTransfer' minSize=10; maxSize=24; workQueue=LinkedBlockingQueue[10000]; allowCoreThreadTimeout=false +May-07 23:01:52.077 [main] DEBUG nextflow.cli.CmdRun - + Version: 24.04.4 build 5917 + Created: 01-08-2024 07:05 UTC (09:05 CEST) + System: Mac OS X 15.0 + Runtime: Groovy 4.0.21 on OpenJDK 64-Bit Server VM 17.0.13+0 + Encoding: UTF-8 (UTF-8) + Process: 31358@MacBook-Air-von-Benjamin.local [127.0.0.1] + CPUs: 8 - Mem: 8 GB (94.6 MB) - Swap: 8 GB (737.5 MB) +May-07 23:01:52.088 [main] DEBUG nextflow.Session - Work-dir: /Users/benjaminwehnert/dmscore/work [Mac OS X] +May-07 23:01:52.088 [main] DEBUG nextflow.Session - Script base path does not exist or is not a directory: /Users/benjaminwehnert/dmscore/bin +May-07 23:01:52.104 [main] DEBUG nextflow.executor.ExecutorFactory - Extension executors providers=[] +May-07 23:01:52.123 [main] DEBUG nextflow.Session - Observer factory: DefaultObserverFactory +May-07 23:01:52.143 [main] DEBUG nextflow.Session - Observer factory: ValidationObserverFactory +May-07 23:01:52.174 [main] WARN nextflow.config.Manifest - Invalid config manifest attribute `contributors` +May-07 23:01:52.195 [main] DEBUG nextflow.cache.CacheFactory - Using Nextflow cache factory: nextflow.cache.DefaultCacheFactory +May-07 23:01:52.205 [main] DEBUG nextflow.util.CustomThreadPool - Creating default thread pool > poolSize: 9; maxThreads: 1000 +May-07 23:01:52.264 [main] DEBUG nextflow.Session - Session start +May-07 23:01:52.266 [main] DEBUG nextflow.trace.TraceFileObserver - 
Workflow started -- trace file: /Users/benjaminwehnert/dmscore/results/pipeline_info/execution_trace_2025-05-07_23-01-50.txt +May-07 23:01:52.411 [main] DEBUG nextflow.script.ScriptRunner - > Launching execution +May-07 23:01:53.516 [main] DEBUG nextflow.script.IncludeDef - Loading included plugin extensions with names: [paramsSummaryMap:paramsSummaryMap]; plugin Id: nf-schema +May-07 23:01:53.917 [main] DEBUG nextflow.script.IncludeDef - Loading included plugin extensions with names: [paramsSummaryLog:paramsSummaryLog]; plugin Id: nf-schema +May-07 23:01:53.918 [main] DEBUG nextflow.script.IncludeDef - Loading included plugin extensions with names: [validateParameters:validateParameters]; plugin Id: nf-schema +May-07 23:01:53.920 [main] DEBUG nextflow.script.IncludeDef - Loading included plugin extensions with names: [paramsSummaryMap:paramsSummaryMap]; plugin Id: nf-schema +May-07 23:01:53.921 [main] DEBUG nextflow.script.IncludeDef - Loading included plugin extensions with names: [samplesheetToList:samplesheetToList]; plugin Id: nf-schema +May-07 23:01:54.131 [main] WARN nextflow.script.ScriptBinding - Access to undefined parameter `custom_codon_library` -- Initialise it to a default value eg. `params.custom_codon_library = some_value` +May-07 23:01:54.356 [main] DEBUG nextflow.script.IncludeDef - Loading included plugin extensions with names: [paramsSummaryLog:paramsSummaryLog]; plugin Id: nf-schema +May-07 23:01:54.357 [main] DEBUG nextflow.script.IncludeDef - Loading included plugin extensions with names: [validateParameters:validateParameters]; plugin Id: nf-schema +May-07 23:01:54.359 [main] DEBUG nextflow.script.IncludeDef - Loading included plugin extensions with names: [paramsSummaryMap:paramsSummaryMap]; plugin Id: nf-schema +May-07 23:01:54.360 [main] DEBUG nextflow.script.IncludeDef - Loading included plugin extensions with names: [samplesheetToList:samplesheetToList]; plugin Id: nf-schema +May-07 23:01:54.574 [main] INFO nextflow.Nextflow - +------------------------------------------------------ + ,--./,-. + ___ __ __ __ ___ /,-._.--~' + |\ | |__ __ / ` / \ |__) |__ } { + | \| | \__, \__/ | \ |___ \`-._,-`-, + `._,._,' + nf-core/dmscore 1.0.0dev +------------------------------------------------------ +Input/output options + input : https://raw.githubusercontent.com/BenjaminWehnert1008/test-datasets/dmsqc/dmsqc/samplesheet_qc_only.csv + outdir : ./results + min_counts : 2 + mutagenesis_type : max_diff_to_wt + +Reference genome options + genome : R64-1-1 + fasta : /Users/benjaminwehnert/GID1A_SUNi_ref_small.fasta + +Institutional config options + config_profile_name : Test profile + config_profile_description: Minimal test dataset to check pipeline function + +Generic options + trace_report_suffix : 2025-05-07_23-01-50 + +Core Nextflow options + runName : modest_coulomb + containerEngine : docker + launchDir : /Users/benjaminwehnert/dmscore + workDir : /Users/benjaminwehnert/dmscore/work + projectDir : /Users/benjaminwehnert/dmscore + userName : benjaminwehnert + profile : test,docker + configFiles : /Users/benjaminwehnert/dmscore/nextflow.config + +!! Only displaying parameters that differ from the pipeline defaults !! 
+------------------------------------------------------ +* The nf-core framework + https://doi.org/10.1038/s41587-020-0439-x + +* Software dependencies + https://github.com/nf-core/dmscore/blob/master/CITATIONS.md + +May-07 23:01:54.576 [main] DEBUG n.validation.ValidationExtension - Starting parameters validation +May-07 23:01:54.920 [main] DEBUG nextflow.validation.SchemaEvaluator - Started validating /BenjaminWehnert1008/test-datasets/dmsqc/dmsqc/samplesheet_qc_only.csv +May-07 23:01:55.945 [main] DEBUG nextflow.validation.SchemaEvaluator - Validation of file 'https://raw.githubusercontent.com/BenjaminWehnert1008/test-datasets/dmsqc/dmsqc/samplesheet_qc_only.csv' passed! +May-07 23:01:56.095 [main] DEBUG n.v.FormatDirectoryPathEvaluator - Cloud blob storage paths are not supported by 'FormatDirectoryPathEvaluator': 's3://ngi-igenomes/igenomes/' +May-07 23:01:56.099 [main] DEBUG n.validation.ValidationExtension - Finishing parameters validation +May-07 23:01:56.178 [main] DEBUG nextflow.script.ProcessConfig - Config settings `withLabel:process_medium` matches labels `process_medium` for process with name NFCORE_DMSCORE:DMSCORE:FASTQC +May-07 23:01:56.182 [main] DEBUG nextflow.script.ProcessConfig - Config settings `withName:FASTQC` matches process NFCORE_DMSCORE:DMSCORE:FASTQC +May-07 23:01:56.196 [main] DEBUG nextflow.executor.ExecutorFactory - << taskConfig executor: null +May-07 23:01:56.196 [main] DEBUG nextflow.executor.ExecutorFactory - >> processorType: 'local' +May-07 23:01:56.202 [main] DEBUG nextflow.executor.Executor - [warm up] executor > local +May-07 23:01:56.206 [main] DEBUG n.processor.LocalPollingMonitor - Creating local task monitor for executor 'local' > cpus=8; memory=8 GB; capacity=8; pollInterval=100ms; dumpInterval=5m +May-07 23:01:56.209 [main] DEBUG n.processor.TaskPollingMonitor - >>> barrier register (monitor: local) +May-07 23:01:56.359 [main] DEBUG nextflow.script.ProcessConfig - Config settings `withLabel:process_single` matches labels `process_single` for process with name NFCORE_DMSCORE:DMSCORE:MULTIQC +May-07 23:01:56.360 [main] DEBUG nextflow.script.ProcessConfig - Config settings `withName:MULTIQC` matches process NFCORE_DMSCORE:DMSCORE:MULTIQC +May-07 23:01:56.362 [main] DEBUG nextflow.executor.ExecutorFactory - << taskConfig executor: null +May-07 23:01:56.363 [main] DEBUG nextflow.executor.ExecutorFactory - >> processorType: 'local' +May-07 23:01:56.372 [main] DEBUG nextflow.script.ProcessConfig - Config settings `withLabel:process_single` matches labels `process_single` for process with name NFCORE_DMSCORE:DMSCORE:BWA_INDEX +May-07 23:01:56.373 [main] DEBUG nextflow.script.ProcessConfig - Config settings `withName:BWA_INDEX` matches process NFCORE_DMSCORE:DMSCORE:BWA_INDEX +May-07 23:01:56.375 [main] DEBUG nextflow.executor.ExecutorFactory - << taskConfig executor: null +May-07 23:01:56.375 [main] DEBUG nextflow.executor.ExecutorFactory - >> processorType: 'local' +May-07 23:01:56.391 [main] DEBUG nextflow.script.ProcessConfig - Config settings `withLabel:process_high` matches labels `process_high` for process with name NFCORE_DMSCORE:DMSCORE:BWA_MEM +May-07 23:01:56.391 [main] DEBUG nextflow.script.ProcessConfig - Config settings `withName:BWA_MEM` matches process NFCORE_DMSCORE:DMSCORE:BWA_MEM +May-07 23:01:56.394 [main] DEBUG nextflow.executor.ExecutorFactory - << taskConfig executor: null +May-07 23:01:56.395 [main] DEBUG nextflow.executor.ExecutorFactory - >> processorType: 'local' +May-07 23:01:56.404 [main] DEBUG nextflow.script.ProcessConfig 
- Config settings `withLabel:process_single` matches labels `process_single` for process with name NFCORE_DMSCORE:DMSCORE:BAMFILTER_DMS +May-07 23:01:56.406 [main] DEBUG nextflow.script.ProcessConfig - Config settings `withName:BAMFILTER_DMS` matches process NFCORE_DMSCORE:DMSCORE:BAMFILTER_DMS +May-07 23:01:56.409 [main] DEBUG nextflow.executor.ExecutorFactory - << taskConfig executor: null +May-07 23:01:56.409 [main] DEBUG nextflow.executor.ExecutorFactory - >> processorType: 'local' +May-07 23:01:56.418 [main] DEBUG nextflow.script.ProcessConfig - Config settings `withLabel:process_medium` matches labels `process_medium` for process with name NFCORE_DMSCORE:DMSCORE:PREMERGE +May-07 23:01:56.418 [main] DEBUG nextflow.script.ProcessConfig - Config settings `withName:PREMERGE` matches process NFCORE_DMSCORE:DMSCORE:PREMERGE +May-07 23:01:56.420 [main] DEBUG nextflow.executor.ExecutorFactory - << taskConfig executor: null +May-07 23:01:56.420 [main] DEBUG nextflow.executor.ExecutorFactory - >> processorType: 'local' +May-07 23:01:56.429 [main] DEBUG nextflow.script.ProcessConfig - Config settings `withLabel:process_high` matches labels `process_high` for process with name NFCORE_DMSCORE:DMSCORE:GATK_SATURATIONMUTAGENESIS +May-07 23:01:56.430 [main] DEBUG nextflow.script.ProcessConfig - Config settings `withName:GATK_SATURATIONMUTAGENESIS` matches process NFCORE_DMSCORE:DMSCORE:GATK_SATURATIONMUTAGENESIS +May-07 23:01:56.433 [main] DEBUG nextflow.executor.ExecutorFactory - << taskConfig executor: null +May-07 23:01:56.433 [main] DEBUG nextflow.executor.ExecutorFactory - >> processorType: 'local' +May-07 23:01:56.443 [main] DEBUG nextflow.script.ProcessConfig - Config settings `withLabel:process_single` matches labels `process_single` for process with name NFCORE_DMSCORE:DMSCORE:DMSANALYSIS_AASEQ +May-07 23:01:56.443 [main] DEBUG nextflow.script.ProcessConfig - Config settings `withName:DMSANALYSIS_AASEQ` matches process NFCORE_DMSCORE:DMSCORE:DMSANALYSIS_AASEQ +May-07 23:01:56.446 [main] DEBUG nextflow.executor.ExecutorFactory - << taskConfig executor: null +May-07 23:01:56.446 [main] DEBUG nextflow.executor.ExecutorFactory - >> processorType: 'local' +May-07 23:01:56.458 [main] DEBUG nextflow.script.ProcessConfig - Config settings `withLabel:process_single` matches labels `process_single` for process with name NFCORE_DMSCORE:DMSCORE:DMSANALYSIS_POSSIBLE_MUTATIONS +May-07 23:01:56.459 [main] DEBUG nextflow.script.ProcessConfig - Config settings `withName:DMSANALYSIS_POSSIBLE_MUTATIONS` matches process NFCORE_DMSCORE:DMSCORE:DMSANALYSIS_POSSIBLE_MUTATIONS +May-07 23:01:56.461 [main] DEBUG nextflow.executor.ExecutorFactory - << taskConfig executor: null +May-07 23:01:56.461 [main] DEBUG nextflow.executor.ExecutorFactory - >> processorType: 'local' +May-07 23:01:56.480 [main] DEBUG nextflow.script.ProcessConfig - Config settings `withLabel:process_single` matches labels `process_single` for process with name NFCORE_DMSCORE:DMSCORE:DMSANALYSIS_PROCESS_GATK +May-07 23:01:56.481 [main] DEBUG nextflow.script.ProcessConfig - Config settings `withName:DMSANALYSIS_PROCESS_GATK` matches process NFCORE_DMSCORE:DMSCORE:DMSANALYSIS_PROCESS_GATK +May-07 23:01:56.484 [main] DEBUG nextflow.executor.ExecutorFactory - << taskConfig executor: null +May-07 23:01:56.484 [main] DEBUG nextflow.executor.ExecutorFactory - >> processorType: 'local' +May-07 23:01:56.499 [main] DEBUG nextflow.Session - Config process names validation disabled as requested +May-07 23:01:56.500 [main] DEBUG nextflow.Session - Igniting 
dataflow network (43) +May-07 23:01:56.508 [main] DEBUG nextflow.processor.TaskProcessor - Starting process > NFCORE_DMSCORE:DMSCORE:FASTQC +May-07 23:01:56.508 [main] DEBUG nextflow.processor.TaskProcessor - Starting process > NFCORE_DMSCORE:DMSCORE:MULTIQC +May-07 23:01:56.509 [main] DEBUG nextflow.processor.TaskProcessor - Starting process > NFCORE_DMSCORE:DMSCORE:BWA_INDEX +May-07 23:01:56.509 [main] DEBUG nextflow.processor.TaskProcessor - Starting process > NFCORE_DMSCORE:DMSCORE:BWA_MEM +May-07 23:01:56.511 [main] DEBUG nextflow.processor.TaskProcessor - Starting process > NFCORE_DMSCORE:DMSCORE:BAMFILTER_DMS +May-07 23:01:56.512 [main] DEBUG nextflow.processor.TaskProcessor - Starting process > NFCORE_DMSCORE:DMSCORE:PREMERGE +May-07 23:01:56.512 [main] DEBUG nextflow.processor.TaskProcessor - Starting process > NFCORE_DMSCORE:DMSCORE:GATK_SATURATIONMUTAGENESIS +May-07 23:01:56.512 [main] DEBUG nextflow.processor.TaskProcessor - Starting process > NFCORE_DMSCORE:DMSCORE:DMSANALYSIS_AASEQ +May-07 23:01:56.512 [main] DEBUG nextflow.processor.TaskProcessor - Starting process > NFCORE_DMSCORE:DMSCORE:DMSANALYSIS_POSSIBLE_MUTATIONS +May-07 23:01:56.512 [main] DEBUG nextflow.processor.TaskProcessor - Starting process > NFCORE_DMSCORE:DMSCORE:DMSANALYSIS_PROCESS_GATK +May-07 23:01:56.514 [main] DEBUG nextflow.script.ScriptRunner - Parsed script files: + Script_e5f965e09aa3641b: /Users/benjaminwehnert/dmscore/./workflows/../modules/local/bamprocessing/premerge.nf + Script_113f045ba19c2c46: /Users/benjaminwehnert/dmscore/./workflows/../modules/nf-core/fastqc/main.nf + Script_4f612cfd8c52e8cd: /Users/benjaminwehnert/dmscore/./subworkflows/local/utils_nfcore_dmscore_pipeline/../../nf-core/utils_nextflow_pipeline/main.nf + Script_215a636da7ab24a2: /Users/benjaminwehnert/dmscore/./workflows/../subworkflows/local/utils_nfcore_dmscore_pipeline/../../nf-core/utils_nfschema_plugin/main.nf + Script_68606fccdecc54d9: /Users/benjaminwehnert/dmscore/./subworkflows/local/utils_nfcore_dmscore_pipeline/main.nf + Script_0f3077e98f6a93f6: /Users/benjaminwehnert/dmscore/./workflows/../modules/nf-core/bwa/mem/main.nf + Script_5c4e8d4051efa81e: /Users/benjaminwehnert/dmscore/./subworkflows/local/utils_nfcore_dmscore_pipeline/../../nf-core/utils_nfcore_pipeline/main.nf + Script_3bf8120d0b5bcde1: /Users/benjaminwehnert/dmscore/./workflows/../modules/local/dmsanalysis/possiblemutations.nf + Script_d65a0ba0a319d7db: /Users/benjaminwehnert/dmscore/./workflows/../modules/local/dmsanalysis/aaseq.nf + Script_4ff7366a79e0ef06: /Users/benjaminwehnert/dmscore/./workflows/../modules/local/bamprocessing/bamfilteringdms.nf + Script_c568aae5b239c1c5: /Users/benjaminwehnert/dmscore/main.nf + Script_09ccfa79b2802f41: /Users/benjaminwehnert/dmscore/./workflows/dmscore.nf + Script_4955288afd8ca61e: /Users/benjaminwehnert/dmscore/./workflows/../modules/nf-core/multiqc/main.nf + Script_a1878766b1a6b241: /Users/benjaminwehnert/dmscore/./workflows/../modules/local/dmsanalysis/processgatk.nf + Script_37dcd664b2773148: /Users/benjaminwehnert/dmscore/./workflows/../modules/local/gatk/saturationmutagenesis.nf + Script_25aa31c18d513c61: /Users/benjaminwehnert/dmscore/./workflows/../modules/nf-core/bwa/index/main.nf +May-07 23:01:56.514 [main] DEBUG nextflow.script.ScriptRunner - > Awaiting termination +May-07 23:01:56.514 [main] DEBUG nextflow.Session - Session await +May-07 23:01:56.605 [Actor Thread 1] DEBUG nextflow.sort.BigSort - Sort completed -- entries: 1; slices: 1; internal sort time: 0.001 s; external sort time: 0.011 s; total 
time: 0.012 s +May-07 23:01:56.605 [Actor Thread 2] DEBUG nextflow.sort.BigSort - Sort completed -- entries: 1; slices: 1; internal sort time: 0.001 s; external sort time: 0.011 s; total time: 0.012 s +May-07 23:01:56.611 [Actor Thread 2] DEBUG nextflow.file.FileCollector - Saved collect-files list to: /Users/benjaminwehnert/dmscore/work/collect-file/4640128d9cfd4a45636425a4f43db374 +May-07 23:01:56.611 [Actor Thread 1] DEBUG nextflow.file.FileCollector - Saved collect-files list to: /Users/benjaminwehnert/dmscore/work/collect-file/04a58cc29ef1c76e5af4e2fb20a13ad7 +May-07 23:01:56.619 [Actor Thread 1] DEBUG nextflow.file.FileCollector - Deleting file collector temp dir: /var/folders/r0/ldrzd4wn1s3516hy0vsn8xzm0000gn/T/nxf-1651345332501275985 +May-07 23:01:56.619 [Actor Thread 2] DEBUG nextflow.file.FileCollector - Deleting file collector temp dir: /var/folders/r0/ldrzd4wn1s3516hy0vsn8xzm0000gn/T/nxf-12321695686053957190 +May-07 23:01:56.623 [Actor Thread 3] DEBUG nextflow.util.HashBuilder - [WARN] Unknown hashing type: class Script_09ccfa79b2802f41$_runScript_closure1$_closure26 +May-07 23:01:56.624 [Actor Thread 14] DEBUG nextflow.util.HashBuilder - Unable to get file attributes file: /NULL -- Cause: java.nio.file.NoSuchFileException: /NULL +May-07 23:01:56.697 [Task submitter] DEBUG n.executor.local.LocalTaskHandler - Launch cmd line: /bin/bash -ue .command.run +May-07 23:01:56.698 [Task submitter] INFO nextflow.Session - [d4/e141d7] Submitted process > NFCORE_DMSCORE:DMSCORE:DMSANALYSIS_AASEQ (amino_acid_sequence) +May-07 23:01:56.730 [FileTransfer-2] DEBUG nextflow.file.FilePorter - Copying foreign file https://raw.githubusercontent.com/BenjaminWehnert1008/test-datasets/dmsqc/dmsqc/pMS190_GID1A_SUNi_S2_1_R2_50k.fastq to work dir: /Users/benjaminwehnert/dmscore/work/stage-2397b75e-3882-46d7-ba2c-8549f8a2b4a6/6a/8815724cee5e2e4e17200e98e8c0ad/pMS190_GID1A_SUNi_S2_1_R2_50k.fastq +May-07 23:01:56.730 [FileTransfer-1] DEBUG nextflow.file.FilePorter - Copying foreign file https://raw.githubusercontent.com/BenjaminWehnert1008/test-datasets/dmsqc/dmsqc/pMS190_GID1A_SUNi_S2_1_R1_50k.fastq to work dir: /Users/benjaminwehnert/dmscore/work/stage-2397b75e-3882-46d7-ba2c-8549f8a2b4a6/40/dfe72a8fa5ce1b038fdf1aa1fb4749/pMS190_GID1A_SUNi_S2_1_R1_50k.fastq +May-07 23:01:58.744 [Actor Thread 15] INFO nextflow.file.FilePorter - Staging foreign file: https://raw.githubusercontent.com/BenjaminWehnert1008/test-datasets/dmsqc/dmsqc/pMS190_GID1A_SUNi_S2_1_R1_50k.fastq +May-07 23:02:00.581 [Task monitor] DEBUG n.processor.TaskPollingMonitor - Task completed > TaskHandler[id: 3; name: NFCORE_DMSCORE:DMSCORE:DMSANALYSIS_AASEQ (amino_acid_sequence); status: COMPLETED; exit: 0; error: -; workDir: /Users/benjaminwehnert/dmscore/work/d4/e141d76a7c3b4130080bcdf8a831a3] +May-07 23:02:00.585 [Task monitor] DEBUG nextflow.util.ThreadPoolBuilder - Creating thread pool 'TaskFinalizer' minSize=10; maxSize=24; workQueue=LinkedBlockingQueue[10000]; allowCoreThreadTimeout=false +May-07 23:02:00.612 [Task submitter] DEBUG n.executor.local.LocalTaskHandler - Launch cmd line: /bin/bash -ue .command.run +May-07 23:02:00.613 [Task submitter] INFO nextflow.Session - [49/349fd9] Submitted process > NFCORE_DMSCORE:DMSCORE:BWA_INDEX (GID1A_SUNi_ref_small.fasta) +May-07 23:02:00.663 [TaskFinalizer-1] DEBUG nextflow.util.ThreadPoolBuilder - Creating thread pool 'PublishDir' minSize=10; maxSize=24; workQueue=LinkedBlockingQueue[10000]; allowCoreThreadTimeout=false +May-07 23:02:00.756 [Actor Thread 15] INFO nextflow.file.FilePorter - 
Staging foreign file: https://raw.githubusercontent.com/BenjaminWehnert1008/test-datasets/dmsqc/dmsqc/pMS190_GID1A_SUNi_S2_1_R2_50k.fastq +May-07 23:02:00.757 [Actor Thread 4] WARN nextflow.processor.TaskContext - Cannot serialize context map. Cause: java.lang.IllegalArgumentException: Unable to create serializer "com.esotericsoftware.kryo.serializers.FieldSerializer" for class: java.lang.ref.ReferenceQueue -- Resume will not work on this process +May-07 23:02:00.763 [Actor Thread 4] DEBUG nextflow.processor.TaskContext - Failed to serialize delegate map items: [ + 'meta':[Script_09ccfa79b2802f41$_runScript_closure1$_closure26] = + 'pos_range':[java.lang.String] = 352-1383 + '$':[java.lang.Boolean] = true + 'wt_seq':[nextflow.processor.TaskPath] = GID1A_SUNi_ref_small.fasta + 'script':[nextflow.processor.TaskPath] = aa_seq.R + 'task':[nextflow.processor.TaskConfig] = [container:community.wave.seqera.io/library/bioconductor-biostrings_r-base_r-biocmanager_r-dplyr_pruned:0fd2e39a5bf2ecaa, withName:PREMERGE:[publishDir:[path:./results/intermediate_files/bam_files, mode:copy, saveAs:ScriptE8646A4B8FFA7429020F836DC9CB8146$_run_closure1$_closure9$_closure21@26dc7ffc]], memory:Script126E8BAA3BC9692B58F2A1C965A1D472$_run_closure1$_closure6$_closure15@13f9d575, withName:FASTQC:[ext:[args:--quiet], containerOptions:], withLabel:error_retry:[errorStrategy:retry, maxRetries:2], when:nextflow.script.TaskClosure@264d7648, withLabel:process_high:[cpus:Script126E8BAA3BC9692B58F2A1C965A1D472$_run_closure1$_closure9$_closure23@1b36f3f3, memory:Script126E8BAA3BC9692B58F2A1C965A1D472$_run_closure1$_closure9$_closure24@5e504cf6, time:Script126E8BAA3BC9692B58F2A1C965A1D472$_run_closure1$_closure9$_closure25@35f6834], withLabel:error_ignore:[errorStrategy:ignore], resourceLimits:[cpus:4, memory:8.GB, time:1.h], withName:DMSANALYSIS_AASEQ:[publishDir:[path:./results/intermediate_files, mode:copy, saveAs:ScriptE8646A4B8FFA7429020F836DC9CB8146$_run_closure1$_closure13$_closure25@4eeda121]], withLabel:process_high_memory:[memory:Script126E8BAA3BC9692B58F2A1C965A1D472$_run_closure1$_closure11$_closure27@62e86a64], withName:BAMFILTER_DMS:[publishDir:[path:./results/intermediate_files/bam_files, mode:copy, saveAs:ScriptE8646A4B8FFA7429020F836DC9CB8146$_run_closure1$_closure8$_closure20@3e03bd33]], publishDir:[[path:./results/intermediate_files, mode:copy, saveAs:ScriptE8646A4B8FFA7429020F836DC9CB8146$_run_closure1$_closure13$_closure25@4eeda121]], withName:BWA_MEM:[publishDir:[path:./results/intermediate_files/bam_files/bwa/mem, mode:copy, saveAs:ScriptE8646A4B8FFA7429020F836DC9CB8146$_run_closure1$_closure7$_closure19@46e56c0f]], executor:local, stub:nextflow.script.TaskClosure@6522395b, conda:null/environment.yml, withName:GATK_SATURATIONMUTAGENESIS:[publishDir:[path:./results/intermediate_files/gatk, mode:copy, saveAs:ScriptE8646A4B8FFA7429020F836DC9CB8146$_run_closure1$_closure10$_closure22@4438d4d1]], cacheable:true, withLabel:process_low:[cpus:Script126E8BAA3BC9692B58F2A1C965A1D472$_run_closure1$_closure7$_closure17@67f11340, memory:Script126E8BAA3BC9692B58F2A1C965A1D472$_run_closure1$_closure7$_closure18@a998ea5, time:Script126E8BAA3BC9692B58F2A1C965A1D472$_run_closure1$_closure7$_closure19@7e85964c], withLabel:process_medium:[cpus:Script126E8BAA3BC9692B58F2A1C965A1D472$_run_closure1$_closure8$_closure20@7c995b11, memory:Script126E8BAA3BC9692B58F2A1C965A1D472$_run_closure1$_closure8$_closure21@131d3cd1, time:Script126E8BAA3BC9692B58F2A1C965A1D472$_run_closure1$_closure8$_closure22@55b774b1], 
tag:amino_acid_sequence, withName:MULTIQC:[ext:[args:ScriptE8646A4B8FFA7429020F836DC9CB8146$_run_closure1$_closure5$_closure15@7d2bfbd], publishDir:[path:ScriptE8646A4B8FFA7429020F836DC9CB8146$_run_closure1$_closure5$_closure16@31a52d85, mode:copy, saveAs:ScriptE8646A4B8FFA7429020F836DC9CB8146$_run_closure1$_closure5$_closure17@4ba464d4]], workDir:/Users/benjaminwehnert/dmscore/work/d4/e141d76a7c3b4130080bcdf8a831a3, exitStatus:0, ext:[:], withLabel:process_single:[cpus:Script126E8BAA3BC9692B58F2A1C965A1D472$_run_closure1$_closure6$_closure14@255893ed, memory:Script126E8BAA3BC9692B58F2A1C965A1D472$_run_closure1$_closure6$_closure15@13f9d575, time:Script126E8BAA3BC9692B58F2A1C965A1D472$_run_closure1$_closure6$_closure16@37e5efac], withName:BWA_INDEX:[publishDir:[path:./results/intermediate_files/bam_files, mode:copy, saveAs:ScriptE8646A4B8FFA7429020F836DC9CB8146$_run_closure1$_closure6$_closure18@2f3435d0]], process:NFCORE_DMSCORE:DMSCORE:DMSANALYSIS_AASEQ, debug:false, cpus:Script126E8BAA3BC9692B58F2A1C965A1D472$_run_closure1$_closure6$_closure14@255893ed, index:1, label:[process_single], withLabel:process_long:[time:Script126E8BAA3BC9692B58F2A1C965A1D472$_run_closure1$_closure10$_closure26@475e6626], maxRetries:1, maxErrors:-1, shell:bash + +set -e # Exit if a tool returns a non-zero status/exit code +set -u # Treat unset variables and parameters as an error +set -o pipefail # Returns the status of the last command to exit with a non-zero status or zero if all successfully execute +set -C # No clobber - prevent output redirection from overwriting files. +, withName:DMSANALYSIS_POSSIBLE_MUTATIONS:[publishDir:[path:./results/intermediate_files, mode:copy, saveAs:ScriptE8646A4B8FFA7429020F836DC9CB8146$_run_closure1$_closure11$_closure23@267852db]], name:NFCORE_DMSCORE:DMSCORE:DMSANALYSIS_AASEQ (amino_acid_sequence), containerOptions:-u $(id -u):$(id -g), errorStrategy:Script126E8BAA3BC9692B58F2A1C965A1D472$_run_closure1$_closure5@3e785137, time:Script126E8BAA3BC9692B58F2A1C965A1D472$_run_closure1$_closure6$_closure16@37e5efac, withName:DMSANALYSIS_PROCESS_GATK:[publishDir:[path:./results/intermediate_files/processed_gatk_files, mode:copy, saveAs:ScriptE8646A4B8FFA7429020F836DC9CB8146$_run_closure1$_closure14$_closure26@30ec799d]], hash:d4e141d76a7c3b4130080bcdf8a831a3] +] +com.esotericsoftware.kryo.KryoException: java.lang.IllegalArgumentException: Unable to create serializer "com.esotericsoftware.kryo.serializers.FieldSerializer" for class: java.lang.ref.ReferenceQueue +Serialization trace: +queue (org.codehaus.groovy.util.ReferenceManager$CallBackedManager) +manager (org.codehaus.groovy.util.ReferenceManager$1) +manager (org.codehaus.groovy.util.ReferenceBundle) +bundle (org.codehaus.groovy.reflection.CachedClass$4) +cachedSuperClass (org.codehaus.groovy.reflection.stdclasses.ObjectCachedClass) +cachedClass (org.codehaus.groovy.reflection.CachedMethod) +allMethods (groovy.lang.MetaClassImpl) +delegate (groovy.runtime.metaclass.NextflowDelegatingMetaClass) +metaClass (groovyx.gpars.dataflow.DataflowVariable) +first (groovyx.gpars.dataflow.stream.DataflowStream) +asyncHead (groovyx.gpars.dataflow.stream.DataflowStreamReadAdapter) +source (nextflow.extension.MapOp) +owner (nextflow.extension.MapOp$_apply_closure1) +code (groovyx.gpars.dataflow.operator.DataflowOperatorActor) +actor (groovyx.gpars.dataflow.operator.DataflowOperator) +allOperators (nextflow.Session) +session (nextflow.validation.ValidationExtension) +target (nextflow.script.FunctionDef) +definitions (nextflow.script.ScriptMeta) 
+meta (nextflow.script.ScriptBinding) +binding (Script_09ccfa79b2802f41) +delegate (Script_09ccfa79b2802f41$_runScript_closure1) +delegate (Script_09ccfa79b2802f41$_runScript_closure1$_closure26) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:82) + at com.esotericsoftware.kryo.serializers.FieldSerializer.write(FieldSerializer.java:495) + at com.esotericsoftware.kryo.Kryo.writeObject(Kryo.java:523) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:61) + at com.esotericsoftware.kryo.serializers.FieldSerializer.write(FieldSerializer.java:495) + at com.esotericsoftware.kryo.Kryo.writeObject(Kryo.java:523) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:61) + at com.esotericsoftware.kryo.serializers.FieldSerializer.write(FieldSerializer.java:495) + at com.esotericsoftware.kryo.Kryo.writeObject(Kryo.java:523) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:61) + at com.esotericsoftware.kryo.serializers.FieldSerializer.write(FieldSerializer.java:495) + at com.esotericsoftware.kryo.Kryo.writeObject(Kryo.java:523) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:61) + at com.esotericsoftware.kryo.serializers.FieldSerializer.write(FieldSerializer.java:495) + at com.esotericsoftware.kryo.Kryo.writeObject(Kryo.java:523) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:61) + at com.esotericsoftware.kryo.serializers.FieldSerializer.write(FieldSerializer.java:495) + at com.esotericsoftware.kryo.Kryo.writeClassAndObject(Kryo.java:599) + at com.esotericsoftware.kryo.serializers.CollectionSerializer.write(CollectionSerializer.java:82) + at com.esotericsoftware.kryo.serializers.CollectionSerializer.write(CollectionSerializer.java:22) + at com.esotericsoftware.kryo.Kryo.writeObject(Kryo.java:523) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:61) + at com.esotericsoftware.kryo.serializers.FieldSerializer.write(FieldSerializer.java:495) + at com.esotericsoftware.kryo.Kryo.writeObject(Kryo.java:523) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:61) + at com.esotericsoftware.kryo.serializers.FieldSerializer.write(FieldSerializer.java:495) + at com.esotericsoftware.kryo.Kryo.writeObject(Kryo.java:523) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:61) + at com.esotericsoftware.kryo.serializers.FieldSerializer.write(FieldSerializer.java:495) + at com.esotericsoftware.kryo.Kryo.writeObject(Kryo.java:523) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:61) + at com.esotericsoftware.kryo.serializers.FieldSerializer.write(FieldSerializer.java:495) + at com.esotericsoftware.kryo.Kryo.writeObject(Kryo.java:523) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:61) + at com.esotericsoftware.kryo.serializers.FieldSerializer.write(FieldSerializer.java:495) + at com.esotericsoftware.kryo.Kryo.writeObject(Kryo.java:523) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:61) + at com.esotericsoftware.kryo.serializers.FieldSerializer.write(FieldSerializer.java:495) + at com.esotericsoftware.kryo.Kryo.writeObject(Kryo.java:523) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:61) + at com.esotericsoftware.kryo.serializers.FieldSerializer.write(FieldSerializer.java:495) + at com.esotericsoftware.kryo.Kryo.writeObject(Kryo.java:523) + at 
com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:61) + at com.esotericsoftware.kryo.serializers.FieldSerializer.write(FieldSerializer.java:495) + at com.esotericsoftware.kryo.Kryo.writeObject(Kryo.java:523) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:61) + at com.esotericsoftware.kryo.serializers.FieldSerializer.write(FieldSerializer.java:495) + at com.esotericsoftware.kryo.Kryo.writeClassAndObject(Kryo.java:599) + at com.esotericsoftware.kryo.serializers.CollectionSerializer.write(CollectionSerializer.java:82) + at com.esotericsoftware.kryo.serializers.CollectionSerializer.write(CollectionSerializer.java:22) + at com.esotericsoftware.kryo.Kryo.writeObject(Kryo.java:523) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:61) + at com.esotericsoftware.kryo.serializers.FieldSerializer.write(FieldSerializer.java:495) + at com.esotericsoftware.kryo.Kryo.writeObject(Kryo.java:523) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:61) + at com.esotericsoftware.kryo.serializers.FieldSerializer.write(FieldSerializer.java:495) + at com.esotericsoftware.kryo.Kryo.writeObject(Kryo.java:523) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:61) + at com.esotericsoftware.kryo.serializers.FieldSerializer.write(FieldSerializer.java:495) + at com.esotericsoftware.kryo.Kryo.writeClassAndObject(Kryo.java:599) + at com.esotericsoftware.kryo.serializers.MapSerializer.write(MapSerializer.java:95) + at com.esotericsoftware.kryo.serializers.MapSerializer.write(MapSerializer.java:21) + at com.esotericsoftware.kryo.Kryo.writeObject(Kryo.java:523) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:61) + at com.esotericsoftware.kryo.serializers.FieldSerializer.write(FieldSerializer.java:495) + at com.esotericsoftware.kryo.Kryo.writeObject(Kryo.java:523) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:61) + at com.esotericsoftware.kryo.serializers.FieldSerializer.write(FieldSerializer.java:495) + at com.esotericsoftware.kryo.Kryo.writeObject(Kryo.java:523) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:61) + at com.esotericsoftware.kryo.serializers.FieldSerializer.write(FieldSerializer.java:495) + at com.esotericsoftware.kryo.Kryo.writeObject(Kryo.java:523) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:61) + at com.esotericsoftware.kryo.serializers.FieldSerializer.write(FieldSerializer.java:495) + at com.esotericsoftware.kryo.Kryo.writeObject(Kryo.java:523) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:61) + at com.esotericsoftware.kryo.serializers.FieldSerializer.write(FieldSerializer.java:495) + at com.esotericsoftware.kryo.Kryo.writeClassAndObject(Kryo.java:599) + at com.esotericsoftware.kryo.serializers.MapSerializer.write(MapSerializer.java:95) + at com.esotericsoftware.kryo.serializers.MapSerializer.write(MapSerializer.java:21) + at com.esotericsoftware.kryo.Kryo.writeClassAndObject(Kryo.java:599) + at org.codehaus.groovy.vmplugin.v8.IndyInterface.fromCache(IndyInterface.java:321) + at nextflow.util.KryoHelper.serialize(SerializationHelper.groovy:166) + at org.codehaus.groovy.vmplugin.v8.IndyInterface.fromCache(IndyInterface.java:321) + at nextflow.processor.TaskContext.serialize(TaskContext.groovy:198) + at nextflow.cache.CacheDB.writeTaskEntry0(CacheDB.groovy:148) + at 
java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method) + at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77) + at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) + at java.base/java.lang.reflect.Method.invoke(Method.java:569) + at org.codehaus.groovy.reflection.CachedMethod.invoke(CachedMethod.java:343) + at groovy.lang.MetaMethod.doMethodInvoke(MetaMethod.java:328) + at groovy.lang.MetaClassImpl.doInvokeMethod(MetaClassImpl.java:1333) + at groovy.lang.MetaClassImpl.invokeMethod(MetaClassImpl.java:1088) + at groovy.lang.MetaClassImpl.invokeMethod(MetaClassImpl.java:1007) + at org.codehaus.groovy.runtime.InvokerHelper.invokePogoMethod(InvokerHelper.java:645) + at org.codehaus.groovy.runtime.InvokerHelper.invokeMethod(InvokerHelper.java:628) + at org.codehaus.groovy.runtime.InvokerHelper.invokeMethodSafe(InvokerHelper.java:82) + at nextflow.cache.CacheDB$_putTaskAsync_closure1.doCall(CacheDB.groovy:157) + at nextflow.cache.CacheDB$_putTaskAsync_closure1.call(CacheDB.groovy) + at groovyx.gpars.agent.AgentBase.onMessage(AgentBase.java:102) + at groovyx.gpars.agent.Agent.handleMessage(Agent.java:84) + at groovyx.gpars.agent.AgentCore$1.handleMessage(AgentCore.java:48) + at groovyx.gpars.util.AsyncMessagingCore.run(AsyncMessagingCore.java:132) + at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136) + at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635) + at java.base/java.lang.Thread.run(Thread.java:840) +Caused by: java.lang.IllegalArgumentException: Unable to create serializer "com.esotericsoftware.kryo.serializers.FieldSerializer" for class: java.lang.ref.ReferenceQueue + at com.esotericsoftware.kryo.factories.ReflectionSerializerFactory.makeSerializer(ReflectionSerializerFactory.java:48) + at com.esotericsoftware.kryo.factories.ReflectionSerializerFactory.makeSerializer(ReflectionSerializerFactory.java:26) + at com.esotericsoftware.kryo.Kryo.newDefaultSerializer(Kryo.java:351) + at com.esotericsoftware.kryo.Kryo.getDefaultSerializer(Kryo.java:344) + at com.esotericsoftware.kryo.util.DefaultClassResolver.registerImplicit(DefaultClassResolver.java:56) + at com.esotericsoftware.kryo.Kryo.getRegistration(Kryo.java:461) + at com.esotericsoftware.kryo.util.DefaultClassResolver.writeClass(DefaultClassResolver.java:79) + at com.esotericsoftware.kryo.Kryo.writeClass(Kryo.java:488) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:57) + ... 106 common frames omitted +Caused by: java.lang.reflect.InvocationTargetException: null + at jdk.internal.reflect.GeneratedConstructorAccessor39.newInstance(Unknown Source) + at java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45) + at java.base/java.lang.reflect.Constructor.newInstanceWithCaller(Constructor.java:500) + at java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:481) + at com.esotericsoftware.kryo.factories.ReflectionSerializerFactory.makeSerializer(ReflectionSerializerFactory.java:35) + ... 
114 common frames omitted +Caused by: java.lang.reflect.InaccessibleObjectException: Unable to make field private final java.lang.ref.ReferenceQueue$Lock java.lang.ref.ReferenceQueue.lock accessible: module java.base does not "opens java.lang.ref" to unnamed module @7b02881e + at java.base/java.lang.reflect.AccessibleObject.checkCanSetAccessible(AccessibleObject.java:354) + at java.base/java.lang.reflect.AccessibleObject.checkCanSetAccessible(AccessibleObject.java:297) + at java.base/java.lang.reflect.Field.checkCanSetAccessible(Field.java:178) + at java.base/java.lang.reflect.Field.setAccessible(Field.java:172) + at com.esotericsoftware.kryo.serializers.FieldSerializer.buildValidFields(FieldSerializer.java:282) + at com.esotericsoftware.kryo.serializers.FieldSerializer.rebuildCachedFields(FieldSerializer.java:217) + at com.esotericsoftware.kryo.serializers.FieldSerializer.rebuildCachedFields(FieldSerializer.java:156) + at com.esotericsoftware.kryo.serializers.FieldSerializer.(FieldSerializer.java:133) + ... 119 common frames omitted +May-07 23:02:02.378 [Task monitor] DEBUG n.processor.TaskPollingMonitor - Task completed > TaskHandler[id: 1; name: NFCORE_DMSCORE:DMSCORE:BWA_INDEX (GID1A_SUNi_ref_small.fasta); status: COMPLETED; exit: 0; error: -; workDir: /Users/benjaminwehnert/dmscore/work/49/349fd93ff7dd98e92e426a30457fb9] +May-07 23:02:02.387 [Task submitter] DEBUG n.executor.local.LocalTaskHandler - Launch cmd line: /bin/bash -ue .command.run +May-07 23:02:02.388 [Task submitter] INFO nextflow.Session - [36/b138e8] Submitted process > NFCORE_DMSCORE:DMSCORE:DMSANALYSIS_POSSIBLE_MUTATIONS (table /w all possible variants) +May-07 23:02:02.422 [Actor Thread 8] WARN nextflow.processor.TaskContext - Cannot serialize context map. Cause: java.lang.IllegalArgumentException: Unable to create serializer "com.esotericsoftware.kryo.serializers.FieldSerializer" for class: java.lang.ref.ReferenceQueue -- Resume will not work on this process +May-07 23:02:13.067 [Task monitor] DEBUG n.processor.TaskPollingMonitor - Task completed > TaskHandler[id: 2; name: NFCORE_DMSCORE:DMSCORE:DMSANALYSIS_POSSIBLE_MUTATIONS (table /w all possible variants); status: COMPLETED; exit: 0; error: -; workDir: /Users/benjaminwehnert/dmscore/work/36/b138e8e27f7423eea98b17aaf74451] +May-07 23:02:13.085 [Task submitter] DEBUG n.executor.local.LocalTaskHandler - Launch cmd line: /bin/bash -ue .command.run +May-07 23:02:13.085 [Task submitter] INFO nextflow.Session - [67/c2a06b] Submitted process > NFCORE_DMSCORE:DMSCORE:BWA_MEM (gid1a_1_quality_1_pe) +May-07 23:02:13.133 [Actor Thread 6] WARN nextflow.processor.TaskContext - Cannot serialize context map. 
Cause: java.lang.IllegalArgumentException: Unable to create serializer "com.esotericsoftware.kryo.serializers.FieldSerializer" for class: java.lang.ref.ReferenceQueue -- Resume will not work on this process +May-07 23:02:13.136 [Actor Thread 6] DEBUG nextflow.processor.TaskContext - Failed to serialize delegate map items: [ + 'meta':[Script_09ccfa79b2802f41$_runScript_closure1$_closure26] = + 'pos_range':[java.lang.String] = 352-1383 + 'mutagenesis_type':[java.lang.String] = max_diff_to_wt + '$':[java.lang.Boolean] = true + 'wt_seq':[nextflow.processor.TaskPath] = GID1A_SUNi_ref_small.fasta + 'custom_codon_library':[nextflow.processor.TaskPath] = NULL + 'script':[nextflow.processor.TaskPath] = possible_mutations.R + 'task':[nextflow.processor.TaskConfig] = [container:community.wave.seqera.io/library/bioconductor-biostrings_r-base_r-biocmanager_r-dplyr_pruned:0fd2e39a5bf2ecaa, withName:PREMERGE:[publishDir:[path:./results/intermediate_files/bam_files, mode:copy, saveAs:ScriptE8646A4B8FFA7429020F836DC9CB8146$_run_closure1$_closure9$_closure21@26dc7ffc]], memory:Script126E8BAA3BC9692B58F2A1C965A1D472$_run_closure1$_closure6$_closure15@13f9d575, withName:FASTQC:[ext:[args:--quiet], containerOptions:], withLabel:error_retry:[errorStrategy:retry, maxRetries:2], when:nextflow.script.TaskClosure@5f4899a5, withLabel:process_high:[cpus:Script126E8BAA3BC9692B58F2A1C965A1D472$_run_closure1$_closure9$_closure23@1b36f3f3, memory:Script126E8BAA3BC9692B58F2A1C965A1D472$_run_closure1$_closure9$_closure24@5e504cf6, time:Script126E8BAA3BC9692B58F2A1C965A1D472$_run_closure1$_closure9$_closure25@35f6834], withLabel:error_ignore:[errorStrategy:ignore], resourceLimits:[cpus:4, memory:8.GB, time:1.h], withName:DMSANALYSIS_AASEQ:[publishDir:[path:./results/intermediate_files, mode:copy, saveAs:ScriptE8646A4B8FFA7429020F836DC9CB8146$_run_closure1$_closure13$_closure25@4eeda121]], withLabel:process_high_memory:[memory:Script126E8BAA3BC9692B58F2A1C965A1D472$_run_closure1$_closure11$_closure27@62e86a64], withName:BAMFILTER_DMS:[publishDir:[path:./results/intermediate_files/bam_files, mode:copy, saveAs:ScriptE8646A4B8FFA7429020F836DC9CB8146$_run_closure1$_closure8$_closure20@3e03bd33]], publishDir:[[path:./results/intermediate_files, mode:copy, saveAs:ScriptE8646A4B8FFA7429020F836DC9CB8146$_run_closure1$_closure11$_closure23@267852db]], withName:BWA_MEM:[publishDir:[path:./results/intermediate_files/bam_files/bwa/mem, mode:copy, saveAs:ScriptE8646A4B8FFA7429020F836DC9CB8146$_run_closure1$_closure7$_closure19@46e56c0f]], executor:local, stub:nextflow.script.TaskClosure@7069093e, conda:null/environment.yml, withName:GATK_SATURATIONMUTAGENESIS:[publishDir:[path:./results/intermediate_files/gatk, mode:copy, saveAs:ScriptE8646A4B8FFA7429020F836DC9CB8146$_run_closure1$_closure10$_closure22@4438d4d1]], cacheable:true, withLabel:process_low:[cpus:Script126E8BAA3BC9692B58F2A1C965A1D472$_run_closure1$_closure7$_closure17@67f11340, memory:Script126E8BAA3BC9692B58F2A1C965A1D472$_run_closure1$_closure7$_closure18@a998ea5, time:Script126E8BAA3BC9692B58F2A1C965A1D472$_run_closure1$_closure7$_closure19@7e85964c], withLabel:process_medium:[cpus:Script126E8BAA3BC9692B58F2A1C965A1D472$_run_closure1$_closure8$_closure20@7c995b11, memory:Script126E8BAA3BC9692B58F2A1C965A1D472$_run_closure1$_closure8$_closure21@131d3cd1, time:Script126E8BAA3BC9692B58F2A1C965A1D472$_run_closure1$_closure8$_closure22@55b774b1], tag:table /w all possible variants, 
withName:MULTIQC:[ext:[args:ScriptE8646A4B8FFA7429020F836DC9CB8146$_run_closure1$_closure5$_closure15@7d2bfbd], publishDir:[path:ScriptE8646A4B8FFA7429020F836DC9CB8146$_run_closure1$_closure5$_closure16@31a52d85, mode:copy, saveAs:ScriptE8646A4B8FFA7429020F836DC9CB8146$_run_closure1$_closure5$_closure17@4ba464d4]], workDir:/Users/benjaminwehnert/dmscore/work/36/b138e8e27f7423eea98b17aaf74451, exitStatus:0, ext:[:], withLabel:process_single:[cpus:Script126E8BAA3BC9692B58F2A1C965A1D472$_run_closure1$_closure6$_closure14@255893ed, memory:Script126E8BAA3BC9692B58F2A1C965A1D472$_run_closure1$_closure6$_closure15@13f9d575, time:Script126E8BAA3BC9692B58F2A1C965A1D472$_run_closure1$_closure6$_closure16@37e5efac], withName:BWA_INDEX:[publishDir:[path:./results/intermediate_files/bam_files, mode:copy, saveAs:ScriptE8646A4B8FFA7429020F836DC9CB8146$_run_closure1$_closure6$_closure18@2f3435d0]], process:NFCORE_DMSCORE:DMSCORE:DMSANALYSIS_POSSIBLE_MUTATIONS, debug:false, cpus:Script126E8BAA3BC9692B58F2A1C965A1D472$_run_closure1$_closure6$_closure14@255893ed, index:1, label:[process_single], withLabel:process_long:[time:Script126E8BAA3BC9692B58F2A1C965A1D472$_run_closure1$_closure10$_closure26@475e6626], maxRetries:1, maxErrors:-1, shell:bash + +set -e # Exit if a tool returns a non-zero status/exit code +set -u # Treat unset variables and parameters as an error +set -o pipefail # Returns the status of the last command to exit with a non-zero status or zero if all successfully execute +set -C # No clobber - prevent output redirection from overwriting files. +, withName:DMSANALYSIS_POSSIBLE_MUTATIONS:[publishDir:[path:./results/intermediate_files, mode:copy, saveAs:ScriptE8646A4B8FFA7429020F836DC9CB8146$_run_closure1$_closure11$_closure23@267852db]], name:NFCORE_DMSCORE:DMSCORE:DMSANALYSIS_POSSIBLE_MUTATIONS (table /w all possible variants), containerOptions:-u $(id -u):$(id -g), errorStrategy:Script126E8BAA3BC9692B58F2A1C965A1D472$_run_closure1$_closure5@3e785137, time:Script126E8BAA3BC9692B58F2A1C965A1D472$_run_closure1$_closure6$_closure16@37e5efac, withName:DMSANALYSIS_PROCESS_GATK:[publishDir:[path:./results/intermediate_files/processed_gatk_files, mode:copy, saveAs:ScriptE8646A4B8FFA7429020F836DC9CB8146$_run_closure1$_closure14$_closure26@30ec799d]], hash:36b138e8e27f7423eea98b17aaf74451] +] +com.esotericsoftware.kryo.KryoException: java.lang.IllegalArgumentException: Unable to create serializer "com.esotericsoftware.kryo.serializers.FieldSerializer" for class: java.lang.ref.ReferenceQueue +Serialization trace: +queue (org.codehaus.groovy.util.ReferenceManager$CallBackedManager) +manager (org.codehaus.groovy.util.ReferenceManager$1) +manager (org.codehaus.groovy.util.ReferenceBundle) +bundle (org.codehaus.groovy.reflection.CachedClass$4) +cachedSuperClass (org.codehaus.groovy.reflection.stdclasses.ObjectCachedClass) +cachedClass (org.codehaus.groovy.reflection.CachedMethod) +allMethods (groovy.lang.MetaClassImpl) +delegate (groovy.runtime.metaclass.NextflowDelegatingMetaClass) +metaClass (groovyx.gpars.dataflow.DataflowVariable) +first (groovyx.gpars.dataflow.stream.DataflowStream) +asyncHead (groovyx.gpars.dataflow.stream.DataflowStreamReadAdapter) +source (nextflow.extension.MapOp) +owner (nextflow.extension.MapOp$_apply_closure1) +code (groovyx.gpars.dataflow.operator.DataflowOperatorActor) +actor (groovyx.gpars.dataflow.operator.DataflowOperator) +allOperators (nextflow.Session) +session (nextflow.validation.ValidationExtension) +target (nextflow.script.FunctionDef) +definitions 
(nextflow.script.ScriptMeta) +meta (nextflow.script.ScriptBinding) +binding (Script_09ccfa79b2802f41) +delegate (Script_09ccfa79b2802f41$_runScript_closure1) +delegate (Script_09ccfa79b2802f41$_runScript_closure1$_closure26) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:82) + at com.esotericsoftware.kryo.serializers.FieldSerializer.write(FieldSerializer.java:495) + at com.esotericsoftware.kryo.Kryo.writeObject(Kryo.java:523) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:61) + at com.esotericsoftware.kryo.serializers.FieldSerializer.write(FieldSerializer.java:495) + at com.esotericsoftware.kryo.Kryo.writeObject(Kryo.java:523) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:61) + at com.esotericsoftware.kryo.serializers.FieldSerializer.write(FieldSerializer.java:495) + at com.esotericsoftware.kryo.Kryo.writeObject(Kryo.java:523) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:61) + at com.esotericsoftware.kryo.serializers.FieldSerializer.write(FieldSerializer.java:495) + at com.esotericsoftware.kryo.Kryo.writeObject(Kryo.java:523) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:61) + at com.esotericsoftware.kryo.serializers.FieldSerializer.write(FieldSerializer.java:495) + at com.esotericsoftware.kryo.Kryo.writeObject(Kryo.java:523) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:61) + at com.esotericsoftware.kryo.serializers.FieldSerializer.write(FieldSerializer.java:495) + at com.esotericsoftware.kryo.Kryo.writeClassAndObject(Kryo.java:599) + at com.esotericsoftware.kryo.serializers.CollectionSerializer.write(CollectionSerializer.java:82) + at com.esotericsoftware.kryo.serializers.CollectionSerializer.write(CollectionSerializer.java:22) + at com.esotericsoftware.kryo.Kryo.writeObject(Kryo.java:523) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:61) + at com.esotericsoftware.kryo.serializers.FieldSerializer.write(FieldSerializer.java:495) + at com.esotericsoftware.kryo.Kryo.writeObject(Kryo.java:523) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:61) + at com.esotericsoftware.kryo.serializers.FieldSerializer.write(FieldSerializer.java:495) + at com.esotericsoftware.kryo.Kryo.writeObject(Kryo.java:523) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:61) + at com.esotericsoftware.kryo.serializers.FieldSerializer.write(FieldSerializer.java:495) + at com.esotericsoftware.kryo.Kryo.writeObject(Kryo.java:523) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:61) + at com.esotericsoftware.kryo.serializers.FieldSerializer.write(FieldSerializer.java:495) + at com.esotericsoftware.kryo.Kryo.writeObject(Kryo.java:523) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:61) + at com.esotericsoftware.kryo.serializers.FieldSerializer.write(FieldSerializer.java:495) + at com.esotericsoftware.kryo.Kryo.writeObject(Kryo.java:523) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:61) + at com.esotericsoftware.kryo.serializers.FieldSerializer.write(FieldSerializer.java:495) + at com.esotericsoftware.kryo.Kryo.writeObject(Kryo.java:523) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:61) + at com.esotericsoftware.kryo.serializers.FieldSerializer.write(FieldSerializer.java:495) + at 
com.esotericsoftware.kryo.Kryo.writeObject(Kryo.java:523) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:61) + at com.esotericsoftware.kryo.serializers.FieldSerializer.write(FieldSerializer.java:495) + at com.esotericsoftware.kryo.Kryo.writeObject(Kryo.java:523) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:61) + at com.esotericsoftware.kryo.serializers.FieldSerializer.write(FieldSerializer.java:495) + at com.esotericsoftware.kryo.Kryo.writeClassAndObject(Kryo.java:599) + at com.esotericsoftware.kryo.serializers.CollectionSerializer.write(CollectionSerializer.java:82) + at com.esotericsoftware.kryo.serializers.CollectionSerializer.write(CollectionSerializer.java:22) + at com.esotericsoftware.kryo.Kryo.writeObject(Kryo.java:523) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:61) + at com.esotericsoftware.kryo.serializers.FieldSerializer.write(FieldSerializer.java:495) + at com.esotericsoftware.kryo.Kryo.writeObject(Kryo.java:523) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:61) + at com.esotericsoftware.kryo.serializers.FieldSerializer.write(FieldSerializer.java:495) + at com.esotericsoftware.kryo.Kryo.writeObject(Kryo.java:523) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:61) + at com.esotericsoftware.kryo.serializers.FieldSerializer.write(FieldSerializer.java:495) + at com.esotericsoftware.kryo.Kryo.writeClassAndObject(Kryo.java:599) + at com.esotericsoftware.kryo.serializers.MapSerializer.write(MapSerializer.java:95) + at com.esotericsoftware.kryo.serializers.MapSerializer.write(MapSerializer.java:21) + at com.esotericsoftware.kryo.Kryo.writeObject(Kryo.java:523) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:61) + at com.esotericsoftware.kryo.serializers.FieldSerializer.write(FieldSerializer.java:495) + at com.esotericsoftware.kryo.Kryo.writeObject(Kryo.java:523) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:61) + at com.esotericsoftware.kryo.serializers.FieldSerializer.write(FieldSerializer.java:495) + at com.esotericsoftware.kryo.Kryo.writeObject(Kryo.java:523) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:61) + at com.esotericsoftware.kryo.serializers.FieldSerializer.write(FieldSerializer.java:495) + at com.esotericsoftware.kryo.Kryo.writeObject(Kryo.java:523) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:61) + at com.esotericsoftware.kryo.serializers.FieldSerializer.write(FieldSerializer.java:495) + at com.esotericsoftware.kryo.Kryo.writeObject(Kryo.java:523) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:61) + at com.esotericsoftware.kryo.serializers.FieldSerializer.write(FieldSerializer.java:495) + at com.esotericsoftware.kryo.Kryo.writeClassAndObject(Kryo.java:599) + at com.esotericsoftware.kryo.serializers.MapSerializer.write(MapSerializer.java:95) + at com.esotericsoftware.kryo.serializers.MapSerializer.write(MapSerializer.java:21) + at com.esotericsoftware.kryo.Kryo.writeClassAndObject(Kryo.java:599) + at org.codehaus.groovy.vmplugin.v8.IndyInterface.fromCache(IndyInterface.java:321) + at nextflow.util.KryoHelper.serialize(SerializationHelper.groovy:166) + at org.codehaus.groovy.vmplugin.v8.IndyInterface.fromCache(IndyInterface.java:321) + at nextflow.processor.TaskContext.serialize(TaskContext.groovy:198) + at 
nextflow.cache.CacheDB.writeTaskEntry0(CacheDB.groovy:148) + at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method) + at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77) + at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) + at java.base/java.lang.reflect.Method.invoke(Method.java:569) + at org.codehaus.groovy.reflection.CachedMethod.invoke(CachedMethod.java:343) + at groovy.lang.MetaMethod.doMethodInvoke(MetaMethod.java:328) + at groovy.lang.MetaClassImpl.doInvokeMethod(MetaClassImpl.java:1333) + at groovy.lang.MetaClassImpl.invokeMethod(MetaClassImpl.java:1088) + at groovy.lang.MetaClassImpl.invokeMethod(MetaClassImpl.java:1007) + at org.codehaus.groovy.runtime.InvokerHelper.invokePogoMethod(InvokerHelper.java:645) + at org.codehaus.groovy.runtime.InvokerHelper.invokeMethod(InvokerHelper.java:628) + at org.codehaus.groovy.runtime.InvokerHelper.invokeMethodSafe(InvokerHelper.java:82) + at nextflow.cache.CacheDB$_putTaskAsync_closure1.doCall(CacheDB.groovy:157) + at nextflow.cache.CacheDB$_putTaskAsync_closure1.call(CacheDB.groovy) + at groovyx.gpars.agent.AgentBase.onMessage(AgentBase.java:102) + at groovyx.gpars.agent.Agent.handleMessage(Agent.java:84) + at groovyx.gpars.agent.AgentCore$1.handleMessage(AgentCore.java:48) + at groovyx.gpars.util.AsyncMessagingCore.run(AsyncMessagingCore.java:132) + at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136) + at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635) + at java.base/java.lang.Thread.run(Thread.java:840) +Caused by: java.lang.IllegalArgumentException: Unable to create serializer "com.esotericsoftware.kryo.serializers.FieldSerializer" for class: java.lang.ref.ReferenceQueue + at com.esotericsoftware.kryo.factories.ReflectionSerializerFactory.makeSerializer(ReflectionSerializerFactory.java:48) + at com.esotericsoftware.kryo.factories.ReflectionSerializerFactory.makeSerializer(ReflectionSerializerFactory.java:26) + at com.esotericsoftware.kryo.Kryo.newDefaultSerializer(Kryo.java:351) + at com.esotericsoftware.kryo.Kryo.getDefaultSerializer(Kryo.java:344) + at com.esotericsoftware.kryo.util.DefaultClassResolver.registerImplicit(DefaultClassResolver.java:56) + at com.esotericsoftware.kryo.Kryo.getRegistration(Kryo.java:461) + at com.esotericsoftware.kryo.util.DefaultClassResolver.writeClass(DefaultClassResolver.java:79) + at com.esotericsoftware.kryo.Kryo.writeClass(Kryo.java:488) + at com.esotericsoftware.kryo.serializers.ObjectField.write(ObjectField.java:57) + ... 106 common frames omitted +Caused by: java.lang.reflect.InvocationTargetException: null + at jdk.internal.reflect.GeneratedConstructorAccessor39.newInstance(Unknown Source) + at java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45) + at java.base/java.lang.reflect.Constructor.newInstanceWithCaller(Constructor.java:500) + at java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:481) + at com.esotericsoftware.kryo.factories.ReflectionSerializerFactory.makeSerializer(ReflectionSerializerFactory.java:35) + ... 
114 common frames omitted +Caused by: java.lang.reflect.InaccessibleObjectException: Unable to make field private final java.lang.ref.ReferenceQueue$Lock java.lang.ref.ReferenceQueue.lock accessible: module java.base does not "opens java.lang.ref" to unnamed module @7b02881e + at java.base/java.lang.reflect.AccessibleObject.checkCanSetAccessible(AccessibleObject.java:354) + at java.base/java.lang.reflect.AccessibleObject.checkCanSetAccessible(AccessibleObject.java:297) + at java.base/java.lang.reflect.Field.checkCanSetAccessible(Field.java:178) + at java.base/java.lang.reflect.Field.setAccessible(Field.java:172) + at com.esotericsoftware.kryo.serializers.FieldSerializer.buildValidFields(FieldSerializer.java:282) + at com.esotericsoftware.kryo.serializers.FieldSerializer.rebuildCachedFields(FieldSerializer.java:217) + at com.esotericsoftware.kryo.serializers.FieldSerializer.rebuildCachedFields(FieldSerializer.java:156) + at com.esotericsoftware.kryo.serializers.FieldSerializer.(FieldSerializer.java:133) + ... 119 common frames omitted +May-07 23:02:17.570 [Task monitor] DEBUG n.processor.TaskPollingMonitor - Task completed > TaskHandler[id: 5; name: NFCORE_DMSCORE:DMSCORE:BWA_MEM (gid1a_1_quality_1_pe); status: COMPLETED; exit: 0; error: -; workDir: /Users/benjaminwehnert/dmscore/work/67/c2a06b1c7b4781da339f3e7aacd574] +May-07 23:02:17.585 [Task submitter] DEBUG n.executor.local.LocalTaskHandler - Launch cmd line: /bin/bash -ue .command.run +May-07 23:02:17.586 [Task submitter] INFO nextflow.Session - [48/c69654] Submitted process > NFCORE_DMSCORE:DMSCORE:FASTQC (gid1a_1_quality_1_pe) +May-07 23:02:17.748 [TaskFinalizer-4] DEBUG nextflow.processor.TaskProcessor - Process NFCORE_DMSCORE:DMSCORE:BWA_MEM > Skipping output binding because one or more optional files are missing: fileoutparam<1:1> +May-07 23:02:17.749 [TaskFinalizer-4] DEBUG nextflow.processor.TaskProcessor - Process NFCORE_DMSCORE:DMSCORE:BWA_MEM > Skipping output binding because one or more optional files are missing: fileoutparam<2:1> +May-07 23:02:17.749 [TaskFinalizer-4] DEBUG nextflow.processor.TaskProcessor - Process NFCORE_DMSCORE:DMSCORE:BWA_MEM > Skipping output binding because one or more optional files are missing: fileoutparam<3:1> +May-07 23:02:17.828 [Actor Thread 14] WARN nextflow.processor.TaskContext - Cannot serialize context map. 
Cause: java.lang.IllegalArgumentException: Unable to create serializer "com.esotericsoftware.kryo.serializers.FieldSerializer" for class: java.lang.ref.ReferenceQueue -- Resume will not work on this process +May-07 23:02:28.533 [Task monitor] DEBUG n.processor.TaskPollingMonitor - Task completed > TaskHandler[id: 4; name: NFCORE_DMSCORE:DMSCORE:FASTQC (gid1a_1_quality_1_pe); status: COMPLETED; exit: 0; error: -; workDir: /Users/benjaminwehnert/dmscore/work/48/c69654e9b29a194ea81b7c28f63819] +May-07 23:02:28.582 [Task submitter] DEBUG n.executor.local.LocalTaskHandler - Launch cmd line: /bin/bash -ue .command.run +May-07 23:02:28.583 [Task submitter] INFO nextflow.Session - [4e/7ee9bd] Submitted process > NFCORE_DMSCORE:DMSCORE:BAMFILTER_DMS (gid1a_1_quality_1_pe) +May-07 23:02:28.752 [Actor Thread 12] DEBUG nextflow.sort.BigSort - Sort completed -- entries: 2; slices: 1; internal sort time: 0.013 s; external sort time: 0.008 s; total time: 0.021 s +May-07 23:02:28.776 [Actor Thread 12] DEBUG nextflow.file.FileCollector - Saved collect-files list to: /Users/benjaminwehnert/dmscore/work/collect-file/c1149f4a215bbe0651400d7f32cdafdc +May-07 23:02:28.784 [Actor Thread 12] DEBUG nextflow.file.FileCollector - Deleting file collector temp dir: /var/folders/r0/ldrzd4wn1s3516hy0vsn8xzm0000gn/T/nxf-15345989576056151242 +May-07 23:02:32.866 [Task monitor] DEBUG n.processor.TaskPollingMonitor - Task completed > TaskHandler[id: 6; name: NFCORE_DMSCORE:DMSCORE:BAMFILTER_DMS (gid1a_1_quality_1_pe); status: COMPLETED; exit: 0; error: -; workDir: /Users/benjaminwehnert/dmscore/work/4e/7ee9bd66291d03ed6f3c2bbdfefe0d] +May-07 23:02:32.872 [Task submitter] DEBUG n.executor.local.LocalTaskHandler - Launch cmd line: /bin/bash -ue .command.run +May-07 23:02:32.873 [Task submitter] INFO nextflow.Session - [55/5dd043] Submitted process > NFCORE_DMSCORE:DMSCORE:MULTIQC +May-07 23:02:43.262 [Task monitor] DEBUG n.processor.TaskPollingMonitor - Task completed > TaskHandler[id: 7; name: NFCORE_DMSCORE:DMSCORE:MULTIQC; status: COMPLETED; exit: 0; error: -; workDir: /Users/benjaminwehnert/dmscore/work/55/5dd0430083da5c332e7950544ac6e0] +May-07 23:02:43.292 [Task submitter] DEBUG n.executor.local.LocalTaskHandler - Launch cmd line: /bin/bash -ue .command.run +May-07 23:02:43.294 [Task submitter] INFO nextflow.Session - [aa/6e83ae] Submitted process > NFCORE_DMSCORE:DMSCORE:PREMERGE (gid1a_1_quality_1_pe) +May-07 23:02:43.298 [TaskFinalizer-7] DEBUG nextflow.processor.TaskProcessor - Process NFCORE_DMSCORE:DMSCORE:MULTIQC > Skipping output binding because one or more optional files are missing: fileoutparam<2> +May-07 23:02:46.384 [Task monitor] DEBUG n.processor.TaskPollingMonitor - Task completed > TaskHandler[id: 8; name: NFCORE_DMSCORE:DMSCORE:PREMERGE (gid1a_1_quality_1_pe); status: COMPLETED; exit: 0; error: -; workDir: /Users/benjaminwehnert/dmscore/work/aa/6e83aebc1b031afabb4d8df09e97f3] +May-07 23:02:46.488 [Task submitter] DEBUG n.executor.local.LocalTaskHandler - Launch cmd line: /bin/bash -ue .command.run +May-07 23:02:46.489 [Task submitter] INFO nextflow.Session - [f2/f878d3] Submitted process > NFCORE_DMSCORE:DMSCORE:GATK_SATURATIONMUTAGENESIS (gid1a_1_quality_1_pe) +May-07 23:03:05.753 [Task monitor] DEBUG n.processor.TaskPollingMonitor - Task completed > TaskHandler[id: 9; name: NFCORE_DMSCORE:DMSCORE:GATK_SATURATIONMUTAGENESIS (gid1a_1_quality_1_pe); status: COMPLETED; exit: 0; error: -; workDir: /Users/benjaminwehnert/dmscore/work/f2/f878d301d101b20d245c27be19bbf3] +May-07 23:03:05.837 [Actor Thread 2] 
DEBUG nextflow.processor.TaskProcessor - Handling unexpected condition for + task: name=NFCORE_DMSCORE:DMSCORE:DMSANALYSIS_PROCESS_GATK (1); work-dir=null + error [nextflow.exception.ProcessUnrecoverableException]: Path value cannot be null +May-07 23:03:05.858 [Actor Thread 2] ERROR nextflow.processor.TaskProcessor - Error executing process > 'NFCORE_DMSCORE:DMSCORE:DMSANALYSIS_PROCESS_GATK (1)' + +Caused by: + Path value cannot be null + + + +Tip: you can replicate the issue by changing to the process work dir and entering the command `bash .command.run` +May-07 23:03:05.860 [Actor Thread 2] INFO nextflow.Session - Execution cancelled -- Finishing pending tasks before exit +May-07 23:03:05.865 [Actor Thread 2] ERROR nextflow.Nextflow - Pipeline failed. Please refer to troubleshooting docs: https://nf-co.re/docs/usage/troubleshooting +May-07 23:03:05.866 [main] DEBUG nextflow.Session - Session await > all processes finished +May-07 23:03:05.867 [Task monitor] DEBUG n.processor.TaskPollingMonitor - <<< barrier arrives (monitor: local) - terminating tasks monitor poll loop +May-07 23:03:05.867 [main] DEBUG nextflow.Session - Session await > all barriers passed +May-07 23:03:05.868 [Actor Thread 12] DEBUG nextflow.processor.TaskProcessor - Handling unexpected condition for + task: name=NFCORE_DMSCORE:DMSCORE:DMSANALYSIS_PROCESS_GATK; work-dir=null + error [java.lang.InterruptedException]: java.lang.InterruptedException +May-07 23:03:05.872 [main] DEBUG nextflow.util.ThreadPoolManager - Thread pool 'TaskFinalizer' shutdown completed (hard=false) +May-07 23:03:05.872 [main] DEBUG nextflow.util.ThreadPoolManager - Thread pool 'PublishDir' shutdown completed (hard=false) +May-07 23:03:05.880 [main] INFO nextflow.Nextflow - -[nf-core/dmscore] Pipeline completed with errors- +May-07 23:03:05.897 [main] DEBUG n.trace.WorkflowStatsObserver - Workflow completed > WorkflowStats[succeededCount=9; failedCount=0; ignoredCount=0; cachedCount=0; pendingCount=0; submittedCount=0; runningCount=0; retriesCount=0; abortedCount=0; succeedDuration=2m 24s; failedDuration=0ms; cachedDuration=0ms;loadCpus=0; loadMemory=0; peakRunning=2; peakCpus=8; peakMemory=16 GB; ] +May-07 23:03:05.898 [main] DEBUG nextflow.trace.TraceFileObserver - Workflow completed -- saving trace file +May-07 23:03:05.900 [main] DEBUG nextflow.trace.ReportObserver - Workflow completed -- rendering execution report +May-07 23:03:07.314 [main] DEBUG nextflow.trace.TimelineObserver - Workflow completed -- rendering execution timeline +May-07 23:03:07.511 [main] DEBUG nextflow.cache.CacheDB - Closing CacheDB done +May-07 23:03:07.545 [main] INFO org.pf4j.AbstractPluginManager - Stop plugin 'nf-schema@2.3.0' +May-07 23:03:07.545 [main] DEBUG nextflow.plugin.BasePlugin - Plugin stopped nf-schema +May-07 23:03:07.546 [main] DEBUG nextflow.util.ThreadPoolManager - Thread pool 'FileTransfer' shutdown completed (hard=false) +May-07 23:03:07.547 [main] DEBUG nextflow.script.ScriptRunner - > Execution complete -- Goodbye diff --git a/main.nf b/main.nf index 8c9943a..2df1740 100644 --- a/main.nf +++ b/main.nf @@ -1,11 +1,11 @@ #!/usr/bin/env nextflow /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - nf-core/dmscore + nf-core/deepmutscan ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Github : https://github.com/nf-core/dmscore - Website: https://nf-co.re/dmscore - Slack : https://nfcore.slack.com/channels/dmscore + Github : https://github.com/nf-core/deepmutscan + 
Website: https://nf-co.re/deepmutscan + Slack : https://nfcore.slack.com/channels/deepmutscan ---------------------------------------------------------------------------------------- */ @@ -15,10 +15,10 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { DMSCORE } from './workflows/dmscore' -include { PIPELINE_INITIALISATION } from './subworkflows/local/utils_nfcore_dmscore_pipeline' -include { PIPELINE_COMPLETION } from './subworkflows/local/utils_nfcore_dmscore_pipeline' -include { getGenomeAttribute } from './subworkflows/local/utils_nfcore_dmscore_pipeline' +include { DEEPMUTSCAN } from './workflows/deepmutscan' +include { PIPELINE_INITIALISATION } from './subworkflows/local/utils_nfcore_deepmutscan_pipeline' +include { PIPELINE_COMPLETION } from './subworkflows/local/utils_nfcore_deepmutscan_pipeline' +include { getGenomeAttribute } from './subworkflows/local/utils_nfcore_deepmutscan_pipeline' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -26,9 +26,6 @@ include { getGenomeAttribute } from './subworkflows/local/utils_nfcore_dmsc ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -// TODO nf-core: Remove this line if you don't need a FASTA file -// This is an example of how to use getGenomeAttribute() to fetch parameters -// from igenomes.config using `--genome` params.fasta = getGenomeAttribute('fasta') /* @@ -40,7 +37,7 @@ params.fasta = getGenomeAttribute('fasta') // // WORKFLOW: Run main analysis pipeline depending on type of input // -workflow NFCORE_DMSCORE { +workflow NFCORE_DEEPMUTSCAN { take: samplesheet // channel: samplesheet read in from --input @@ -50,12 +47,13 @@ workflow NFCORE_DMSCORE { // // WORKFLOW: Run pipeline // - DMSCORE ( + DEEPMUTSCAN ( samplesheet ) emit: - multiqc_report = DMSCORE.out.multiqc_report // channel: /path/to/multiqc_report.html + multiqc_report = DEEPMUTSCAN.out.multiqc_report // channel: /path/to/multiqc_report.html } + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ RUN MAIN WORKFLOW @@ -74,13 +72,16 @@ workflow { params.monochrome_logs, args, params.outdir, - params.input + params.input, + params.help, + params.help_full, + params.show_hidden ) // // WORKFLOW: Run main workflow // - NFCORE_DMSCORE ( + NFCORE_DEEPMUTSCAN ( PIPELINE_INITIALISATION.out.samplesheet ) // @@ -93,7 +94,7 @@ workflow { params.outdir, params.monochrome_logs, params.hook_url, - NFCORE_DMSCORE.out.multiqc_report + NFCORE_DEEPMUTSCAN.out.multiqc_report ) } diff --git a/modules.json b/modules.json index 910602b..966776f 100644 --- a/modules.json +++ b/modules.json @@ -1,18 +1,28 @@ { - "name": "nf-core/dmscore", - "homePage": "https://github.com/nf-core/dmscore", + "name": "nf-core/deepmutscan", + "homePage": "https://github.com/nf-core/deepmutscan", "repos": { "https://github.com/nf-core/modules.git": { "modules": { "nf-core": { + "bwa/index": { + "branch": "master", + "git_sha": "2d20463181b1c38981a02e90d3084b5f9fa8d540", + "installed_by": ["modules"] + }, + "bwa/mem": { + "branch": "master", + "git_sha": "a29f18660f5e3748d44d6f716241e70c942c065d", + "installed_by": ["modules"] + }, "fastqc": { "branch": "master", - "git_sha": "dc94b6ee04a05ddb9f7ae050712ff30a13149164", + "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46", "installed_by": ["modules"] }, "multiqc": { "branch": "master", - "git_sha": "cf17ca47590cc578dfb47db1c2a44ef86f89976d", + "git_sha": 
"e10b76ca0c66213581bec2833e30d31f239dec0b", "installed_by": ["modules"] } } @@ -21,17 +31,17 @@ "nf-core": { "utils_nextflow_pipeline": { "branch": "master", - "git_sha": "c2b22d85f30a706a3073387f30380704fcae013b", + "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", "installed_by": ["subworkflows"] }, "utils_nfcore_pipeline": { "branch": "master", - "git_sha": "51ae5406a030d4da1e49e4dab49756844fdd6c7a", + "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", "installed_by": ["subworkflows"] }, "utils_nfschema_plugin": { "branch": "master", - "git_sha": "2fd2cd6d0e7b273747f32e465fdc6bcc3ae0814e", + "git_sha": "4b406a74dc0449c0401ed87d5bfff4252fd277fd", "installed_by": ["subworkflows"] } } diff --git a/modules/local/bamprocessing/bamfilteringdms.nf b/modules/local/bamprocessing/bamfilteringdms.nf new file mode 100644 index 0000000..6cef905 --- /dev/null +++ b/modules/local/bamprocessing/bamfilteringdms.nf @@ -0,0 +1,49 @@ + +process BAMFILTER_DMS { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.21--h96c455f_1': + 'biocontainers/samtools:1.21--h96c455f_1' }" + + input: + tuple val(meta), path(bam) + + output: + tuple val(meta), path("*.bam"), emit: bam + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + samtools view -h -F 4 -F 256 -q 30 $bam | \ + samtools view -h | \ + awk '{if(\$6 !~ /I/ && \$6 !~ /D/ && \$6 !~ /N/) print \$0}' | \ + samtools view -h | \ + awk '{for(i=1;i<=NF;i++) if(\$i ~ /^NM:i:/ && \$i != "NM:i:0") {print \$0; next}} \$1 ~ /^@/' | \ + samtools view -bS > ${meta.id}_filtered.bam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(samtools --version |& sed '1!d ; s/samtools //') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.bam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bamfilteringdms: \$(samtools --version |& sed '1!d ; s/samtools //') + END_VERSIONS + """ +} diff --git a/modules/local/bamprocessing/premerge.nf b/modules/local/bamprocessing/premerge.nf new file mode 100644 index 0000000..4a66f85 --- /dev/null +++ b/modules/local/bamprocessing/premerge.nf @@ -0,0 +1,51 @@ + +process PREMERGE { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "community.wave.seqera.io/library/bwa_samtools_vsearch:28e8640725d3d8e9" + + input: + tuple val(meta), path(bam) + path wt_seq + + output: + tuple val(meta), path("*.bam"), emit: bam + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + # Convert BAM to paired FASTQ files + samtools fastq -1 forward_reads.fastq -2 reverse_reads.fastq -0 /dev/null -s /dev/null -n $bam + + # Merge paired reads + vsearch --fastq_mergepairs forward_reads.fastq --reverse reverse_reads.fastq --fastqout merged_reads.fastq --fastq_minovlen 10 --fastq_allowmergestagger + + # Re-align merged reads + bwa index $wt_seq + bwa mem $wt_seq merged_reads.fastq | samtools view -Sb - > ${prefix}_merged.bam + + # Save version information + cat <<-END_VERSIONS > versions.yml + "${task.process}": + premerge: \$(samtools --version |& sed '1!d ; s/samtools //') 
+ END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch merged_reads.fastq + touch merged_reads.bam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + premerge: dummy_version + END_VERSIONS + """ +} diff --git a/modules/local/dmsanalysis/aaseq.nf b/modules/local/dmsanalysis/aaseq.nf new file mode 100644 index 0000000..dde104f --- /dev/null +++ b/modules/local/dmsanalysis/aaseq.nf @@ -0,0 +1,46 @@ +process DMSANALYSIS_AASEQ { + tag "amino_acid_sequence" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' + ? 'community.wave.seqera.io/library/bioconductor-biostrings_r-base_r-biocmanager_r-dplyr_pruned:ce2ba7ad7f6e7f2c' + : 'community.wave.seqera.io/library/bioconductor-biostrings_r-base_r-biocmanager_r-dplyr_pruned:0fd2e39a5bf2ecaa' }" + + input: + tuple val(meta), path(wt_seq) + val pos_range + path script // aa_seq.R + + output: + tuple val(meta), path("aa_seq.txt"), emit: aa_seq + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + start_stop_codon="$pos_range" + + R_version=\$(R --version | head -n 1 | sed 's/^R version //') + + Rscript -e "source('$script'); aa_seq('$wt_seq', '\$start_stop_codon', 'aa_seq.txt')" + + # Extract R base and packages versions + R_VERSION=\$(R --version | head -n 1 | sed -E 's/^R version ([0-9.]+).*/\\1/') + BIOSTRINGS_VERSION=\$(Rscript -e "packageVersion('Biostrings')" | grep -Eo '[0-9]+(\\.[0-9]+)+') + cat <<-END_VERSIONS > versions.yml + DMSANALYSIS_AASEQ: + r-base: \$R_VERSION + biostrings: \$BIOSTRINGS_VERSION + END_VERSIONS + """ + + stub: + """ + touch aa_seq.txt + echo "DMSANALYSIS_AASEQ:" > versions.yml + echo " stub-version: 0.0.0" >> versions.yml + """ +} diff --git a/modules/local/dmsanalysis/bin/SeqDepth_simulation.R b/modules/local/dmsanalysis/bin/SeqDepth_simulation.R new file mode 100644 index 0000000..ec832d3 --- /dev/null +++ b/modules/local/dmsanalysis/bin/SeqDepth_simulation.R @@ -0,0 +1,129 @@ +# input: prefiltered (by codon library) gatk path, possible mutations path, output_folder, reduction_fraction (steps in % to reduce counts), threshold to count variant as present in dataset +# output: pdf showing the plot +# limitation: takes quite a lot time: depends on number of counts in prefiltered gatk (4 min on M1 MacBook for 340,000 counts in total) -> reduction_fraction only has a low impact -> need to find more efficient random-sampling algorithm + +library(dplyr) +library(ggplot2) + +SeqDepth_simulation_plot <- function(prefiltered_gatk_path, possible_mutations_path, output_file_path, reduction_fraction = 0.01, threshold = 3) { + + # Read data from the specified CSV file + data <- read.csv(prefiltered_gatk_path) + data <- data %>% mutate(counts = as.numeric(counts)) # Ensure counts are numeric + original_counts <- data$counts # Store the original counts for weight calculations + possible_mutations <- read.csv(possible_mutations_path) + + # Initialize variables + total_counts <- sum(data$counts) + reduction_per_step <- floor(total_counts * reduction_fraction) # Round down to the nearest integer + results <- data.frame(remaining_counts = numeric(), remaining_variants = numeric()) + + # Track the initial state + remaining_variants <- sum(data$counts >= threshold) + remaining_counts <- total_counts + results <- rbind(results, data.frame(remaining_counts = remaining_counts, remaining_variants = remaining_variants)) + + # Get a list of indices for non-zero variants + 
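+  # (Note on the sampling loop below: reads are removed one draw at a time, with draw probabilities proportional to each variant's original count, so abundant variants lose reads faster; variants that reach zero are dropped from the sampling pool.)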
non_zero_indices <- which(data$counts > 0) + + # Calculate initial weights based on the original counts for non-zero variants + weights <- as.numeric(original_counts[non_zero_indices]) + weights <- weights / sum(weights) # Normalize weights + + # Loop until all counts are zero + while (remaining_counts > 0) { + # If no more non-zero variants are available, break the loop + if (length(non_zero_indices) == 0) { + break + } + + # Randomly reduce counts by the specified amount using weighted sampling + for (i in 1:reduction_per_step) { + # Randomly choose an index from the non-zero variants based on weights + selected_idx <- sample(length(non_zero_indices), 1, prob = weights) + index <- non_zero_indices[selected_idx] + + # Reduce the count by 1 + data$counts[index] <- data$counts[index] - 1 + + # If the count reaches zero, remove the index from the list of non-zero variants + if (data$counts[index] == 0) { + non_zero_indices <- non_zero_indices[-selected_idx] + weights <- weights[-selected_idx] # Remove the corresponding weight + } + } + + # Update remaining counts and remaining variants after the reduction + remaining_counts <- sum(data$counts) + remaining_variants <- sum(data$counts >= threshold & data$counts > 0) + + # Store the results for this step + results <- rbind(results, data.frame(remaining_counts = remaining_counts, remaining_variants = remaining_variants)) + + # Adjust the reduction_per_step if the total remaining counts are less than the reduction amount + if (remaining_counts < reduction_per_step) { + reduction_per_step <- remaining_counts + } + + # Recalculate weights using the original counts, but only for remaining non-zero variants + if (length(non_zero_indices) > 0) { + weights <- as.numeric(original_counts[non_zero_indices]) + weights <- weights / sum(weights) # Normalize weights + } + } + + # Transform results for plotting + baseline_count <- max(results$remaining_counts) + plot_data <- results %>% + mutate( + remaining_counts_fold = round(remaining_counts / baseline_count, 2), # X-axis in fold-change from max, rounded to 2 decimals + remaining_variants_percent = (remaining_variants / nrow(possible_mutations)) * 100 # Y-axis in percent + ) + + # Set plot limits + x_max <- 1 # X-axis limit exactly at 1 + y_max <- 100 # Y-axis limit at 100% + + # Create the plot + plot <- ggplot(plot_data, aes(x = remaining_counts_fold, y = remaining_variants_percent)) + + geom_line(color = "black", size = 0.4) + # Solid line for the data + geom_hline(yintercept = 100, linetype = "dotted", color = "black") + # Horizontal line at 100% + + # Main plot settings with fine grid lines + scale_y_continuous( + labels = scales::percent_format(scale = 1), + limits = c(0, y_max), + breaks = seq(0, y_max, by = 5) # Y-axis grid lines every 5% + ) + + scale_x_continuous( + limits = c(0, x_max), + breaks = seq(0, x_max, length.out = 20), # X-axis with 20 grid lines ending at 1 + labels = scales::number_format(accuracy = 0.01) # Round labels to 2 decimal places + ) + + labs( + x = "Fold-Change of Sequencing Depth", + y = "Variants (% of Maximum)" + ) + + theme_minimal() + + theme( + panel.border = element_rect(color = "black", fill = NA), # Add a box-like border + panel.grid.major = element_line(size = 0.2, linetype = "solid", color = "grey80"), # Fine grid lines + panel.grid.minor = element_blank(), # No minor grid lines for clarity + axis.text.x = element_text(angle = 45, hjust = 1) # Rotate X-axis labels at 45 degrees + ) + + # Save the plot as a PDF in the specified output folder + 
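+  # (plot_data above maps the x-axis to the retained fraction of total sequencing depth and the y-axis to the percentage of possible library variants still present at >= `threshold` counts.)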
ggsave(output_file_path, plot = plot, device = "pdf", width = 8, height = 6) +} + +#Example usage with input and output paths +# results_weighted <- coverage_simulation_plot( +# prefiltered_gatk_path = "/Users/benjaminwehnert/CRG/DMS_QC/testing_data/gatk_filtered_by_codon_library.csv", +# possible_mutations_path = "/Users/benjaminwehnert/CRG/DMS_QC/testing_data/possible_mutations.csv", +# output_folder = "/Users/benjaminwehnert/CRG/DMS_QC/testing_data/testing_outputs", +# reduction_fraction = 0.01, +# threshold = 3 +# ) + + +#SeqDepth diff --git a/modules/local/dmsanalysis/bin/aa_seq.R b/modules/local/dmsanalysis/bin/aa_seq.R new file mode 100644 index 0000000..9a39fb5 --- /dev/null +++ b/modules/local/dmsanalysis/bin/aa_seq.R @@ -0,0 +1,37 @@ +# input: wildtype-seq, start&stopp pos. +# output: amino acid sequence within the start-stop frame (.txt) + +# Load necessary libraries +suppressMessages(library(Biostrings)) + +# Define the function +aa_seq <- function(wt_seq_input, pos_range, output_file) { + # Parse the start and stop positions from the input format "start-stop" + positions <- unlist(strsplit(pos_range, "-")) + start_pos <- as.numeric(positions[1]) + stop_pos <- as.numeric(positions[2]) + + # Check if the input is a file or a string + if (file.exists(wt_seq_input)) { + # If it's a file, read the sequence from the fasta file + seq_data <- readDNAStringSet(filepath = wt_seq_input) + wt_seq <- seq_data[[1]] # Extract the sequence + } else { + # Otherwise, treat the input as a sequence string + wt_seq <- DNAString(wt_seq_input) + } + + # Extract the sequence between start and stop codons + coding_seq <- subseq(wt_seq, start = start_pos, end = stop_pos) + + # Translate the coding sequence into an amino acid sequence + aa_seq <- translate(coding_seq) + + # Write the amino acid sequence to a .txt file + write(as.character(aa_seq), file = output_file) +} + +# Example usage: +# translate_to_protein("/path/to/sequence.fasta", "23-1225", "/path/to/output_protein.txt") + +#aa_seq("/Users/benjaminwehnert/CRG/DMS_QC/testing_data/MORtn5_reference.fa", "23-1225", "/Users/benjaminwehnert/CRG/DMS_QC/testing_data/testing_outputs/aa_seq.txt") diff --git a/modules/local/dmsanalysis/bin/complete_prefiltered_gatk.R b/modules/local/dmsanalysis/bin/complete_prefiltered_gatk.R new file mode 100644 index 0000000..88cc6b5 --- /dev/null +++ b/modules/local/dmsanalysis/bin/complete_prefiltered_gatk.R @@ -0,0 +1,78 @@ +# input: NNK_codon_library_filtered_gatk.csv-path, prefiltered_gatk.csv-path (containing only NNK mutations), output_folder-path +# output: completed gatk_file with all possible variants (even if not measured in sequencing) -> NA in counts and counts_per_cov to 0.0000001 to deal with log-scale in following calculations + +library(dplyr) +library(Biostrings) # Required for codon-to-amino-acid translation + +# Function to calculate Hamming distance (varying_bases) +hamming_distance <- function(wt_codon, variant_codon) { + sum(strsplit(wt_codon, "")[[1]] != strsplit(variant_codon, "")[[1]]) +} + +# Function to get amino acid from codon +get_amino_acid <- function(codon) { + codon_table <- GENETIC_CODE + aa <- codon_table[[toupper(codon)]] + if (is.null(aa)) { + return(NA) # Handle cases where codon is not valid + } + return(aa) +} + +# Function to calculate mutation type (aa_mut) and pos_mut +mutation_details <- function(wt_codon, variant_codon, codon_number) { + wt_aa <- get_amino_acid(wt_codon) + variant_aa <- get_amino_acid(variant_codon) + + # If amino acids are different, it's a missense 
mutation; otherwise, synonymous + if (wt_aa != variant_aa) { + mutation_type <- "M" # Missense mutation + } else { + mutation_type <- "S" # Synonymous mutation + } + + # aa_mut: Type of mutation and amino acid changes (e.g., M:D>S) + aa_mut <- paste0(mutation_type, ":", wt_aa, ">", variant_aa) + + # pos_mut: Wild-type AA, codon position, mutated AA (e.g., D2Q) + pos_mut <- paste0(wt_aa, codon_number, variant_aa) + + return(list(aa_mut = aa_mut, pos_mut = pos_mut)) +} + +complete_prefiltered_gatk <- function(possible_nnk_path, prefiltered_gatk_path, output_file_path) { + + # Load the possible NNK mutations CSV + possible_nnk <- read.csv(possible_nnk_path) + + # Load the prefiltered GATK CSV + prefiltered_gatk <- read.csv(prefiltered_gatk_path) + + # Create codon_mut column in possible_NNK_mutations in the format 'Codon_Number:wt_codon>Variant' + possible_nnk <- possible_nnk %>% + mutate(codon_mut = paste0(Codon_Number, ":", wt_codon, ">", Variant)) + + # Merge both dataframes based on the codon_mut column (full join to include all) + merged_data <- full_join(prefiltered_gatk, possible_nnk, by = "codon_mut") + + # Fill missing values in counts_per_cov and counts with 0.0000001 + merged_data <- merged_data %>% + mutate(counts_per_cov = ifelse(is.na(counts_per_cov), 0.0000001, counts_per_cov), + counts = ifelse(is.na(counts), 0.000001, counts)) + + # Calculate Hamming distance (varying_bases) and mutation details (aa_mut, pos_mut) + merged_data <- merged_data %>% + rowwise() %>% + mutate(varying_bases = hamming_distance(wt_codon, Variant), + mutation_info = list(mutation_details(wt_codon, Variant, Codon_Number))) %>% + mutate(aa_mut = mutation_info$aa_mut, # Extract aa_mut + pos_mut = mutation_info$pos_mut) %>% # Extract pos_mut + ungroup() %>% + select(-mutation_info) # Remove the temporary list column + + # Save the merged data to a new CSV file + write.csv(merged_data, file = output_file_path, row.names = FALSE) +} + +# Example call +#complete_prefiltered_gatk("/Users/benjaminwehnert/CRG/DMS_QC/testing_data/possible_NNK_mutations.csv", "/Users/benjaminwehnert/CRG/DMS_QC/testing_data/gatk_filtered_by_codon_library.csv", "/Users/benjaminwehnert/CRG/DMS_QC/testing_data/testing_outputs") diff --git a/modules/local/dmsanalysis/bin/counts_heatmap.R b/modules/local/dmsanalysis/bin/counts_heatmap.R new file mode 100644 index 0000000..0814daa --- /dev/null +++ b/modules/local/dmsanalysis/bin/counts_heatmap.R @@ -0,0 +1,159 @@ +# Input: prepared GATK data path, output_path, threshold (same as used for prepare_gatk_data_for_counts_per_cov_heatmap function) +# Output: counts_per_cov_heatmap.pdf + +library(dplyr) +library(ggplot2) + +counts_heatmap <- function(input_csv_path, threshold = 3, output_pdf_path, img_format = "pdf") { + + # Inner function to add padding to the last row, adding 21 amino acids per position + pad_heatmap_data_long <- function(heatmap_data_long, min_non_na_value, num_positions_per_row = 75) { + all_amino_acids <- c("G", "A", "V", "L", "M", "I", "F", + "Y", "W", "K", "R", "H", "D", "E", + "S", "T", "C", "N", "Q", "P", "*") + + max_position <- max(heatmap_data_long$position) + num_missing_positions <- num_positions_per_row - (max_position %% num_positions_per_row) + + if (num_missing_positions < num_positions_per_row) { + new_positions <- (max_position + 1):(max_position + num_missing_positions) + + # Add all 21 amino acid variants for each new position + padding_data <- expand.grid( + mut_aa = all_amino_acids, # All possible amino acids + position = new_positions # New positions 
to be padded + ) + + # Set placeholder values for the added positions to the exact smallest non-NA value + padding_data$total_counts <- min_non_na_value # Set to the smallest non-NA value + padding_data$wt_aa <- "Y" # Set wild-type amino acid to 'Y' + padding_data$wt_aa_pos <- paste0("Y", padding_data$position) # Create wt_aa_pos with correct positions + padding_data$row_group <- max(heatmap_data_long$row_group) # Set row group to the current last group + + # Add the new padding rows to heatmap_data_long + heatmap_data_long <- dplyr::bind_rows(heatmap_data_long, padding_data) + } + + return(heatmap_data_long) + } + + # Load the CSV data + heatmap_data <- read.csv(input_csv_path) + + # Check if the necessary column exists in the data + if (!"total_counts" %in% colnames(heatmap_data)) { + stop("The column 'total_counts' is not found in the data.") + } + + # Create heatmap_data_long by selecting necessary columns + heatmap_data_long <- heatmap_data %>% + select(mut_aa, position, total_counts, wt_aa) # Use 'total_counts' + + # Find the smallest non-NA value in total_counts + min_non_na_value <- min(heatmap_data_long$total_counts, na.rm = TRUE) + + # Group positions by rows (75 positions per row) and calculate row_group + heatmap_data_long <- heatmap_data_long %>% + mutate(row_group = ((position - 1) %/% 75) + 1) # Grouping positions into rows + + # Apply padding to add missing positions at the end of the last row, using the calculated min value + heatmap_data_long <- pad_heatmap_data_long(heatmap_data_long, min_non_na_value) + + # Convert positions to numeric, sort them, and create wt_aa_pos for the plot + heatmap_data_long <- heatmap_data_long %>% + mutate(position = as.numeric(position)) %>% # Ensure position is numeric + arrange(position) %>% # Sort by position + mutate(wt_aa_pos = factor(paste0(wt_aa, position), levels = unique(paste0(wt_aa, position)))) # Create sorted factor levels for wt_aa_pos + + # Add a column to identify synonymous mutations (where mut_aa == wt_aa) + heatmap_data_long <- heatmap_data_long %>% + mutate(synonymous = mut_aa == wt_aa) + + # Define the correct order of the amino acids + amino_acid_order <- rev(c("G", "A", "V", "L", "M", "I", "F", + "Y", "W", "K", "R", "H", "D", "E", + "S", "T", "C", "N", "Q", "P", "*")) + + heatmap_data_long <- heatmap_data_long %>% + mutate(mut_aa = factor(mut_aa, levels = amino_acid_order)) + + # Process heatmap_data_long and build syn_positions in the same step + syn_positions <- heatmap_data_long %>% + mutate(mut_aa = factor(mut_aa, levels = amino_acid_order), + # Compute the x coordinate, which always runs from 1 to 75 within each row group + x = as.numeric(factor(wt_aa_pos, levels = unique(wt_aa_pos))) - ((row_group - 1) * 75), + y = as.numeric(factor(mut_aa, levels = amino_acid_order))) %>% + filter(synonymous == TRUE) + + # Calculate the number of row groups and adjust plot height dynamically + num_row_groups <- max(heatmap_data_long$row_group) + plot_height <- num_row_groups * 4 + + # Set the limits for the color scale, ignoring NA (negative values are now NA) + min_count <- min(heatmap_data_long$total_counts, na.rm = TRUE) + max_count <- max(heatmap_data_long$total_counts, na.rm = TRUE) + max_position <- max(heatmap_data$position) + + # Create the heatmap plot with explicit handling for positions > max_position + heatmap_plot <- ggplot(heatmap_data_long, aes(x = wt_aa_pos, y = mut_aa, fill = total_counts)) + + scale_fill_gradientn(colours = c(alpha("blue", 0), "blue"), na.value = "grey35", trans = "log", # Apply log 
transformation to the scale + limits = c(min_count, max_count), + breaks = scales::trans_breaks("log10", function(x) 10^x), # Logarithmic scale breaks + labels = scales::trans_format("log10", scales::math_format(10^.x))) + + scale_x_discrete(labels = function(x) { + numeric_pos <- as.numeric(gsub("[^0-9]", "", x)) + ifelse(numeric_pos > max_position, " ", x) + }) + + geom_tile() + + + # Add diagonal lines for synonymous mutations using geom_segment + geom_segment(data = syn_positions[syn_positions$position <= max_position, ], + aes(x = x - 0.485, xend = x + 0.485, + y = y - 0.485, yend = y + 0.485, color = synonymous), + size = 0.2) + + + # Manuelle Farbskala für die diagonalen Linien + scale_color_manual(values = c("TRUE" = "grey10"), labels = c("TRUE" = "")) + + + theme_minimal() + + labs(title = "Heatmap of Counts per Variant", x = "Wild-type Amino Acid", y = "Mutant Amino Acid", fill = "Counts", color = "Synonymous Mutation") + + theme(plot.title = element_text(size = 16, face = "bold"), + axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 12), + axis.text.y = element_text(size = 12), # Larger y-axis labels + axis.title.x = element_text(size = 16), + axis.title.y = element_text(size = 16), + legend.title = element_text(size = 14), # Larger legend title + legend.text = element_text(size = 12), # Larger legend text + panel.spacing = unit(0.1, "lines"), # Adjust panel spacing + strip.text = element_blank(), # Remove row group labels (facet numbers) + strip.background = element_blank(), + panel.grid.major = element_blank(), # Remove major grid lines + panel.grid.minor = element_blank()) + # Remove minor grid lines + facet_wrap(~ row_group, scales = "free_x", ncol = 1) + # Group by 75 positions per row + theme(panel.spacing = unit(0.2, "lines")) + + heatmap_plot <- heatmap_plot + + geom_point(data = heatmap_data_long, aes(size = ""), colour = "black", alpha = 0) # Invisible points for legend + heatmap_plot <- heatmap_plot + + guides(size = guide_legend(paste("Dropout (Counts <", threshold, ")"), override.aes = list(shape = 15, size = 8, colour = "grey35", alpha = 1))) # Define Legend for Dropouts + + # Save the heatmap plot + if (img_format == "pdf") { + ggsave(output_pdf_path, plot = heatmap_plot, width = 16, height = plot_height, dpi = 150, device = cairo_pdf) + } else { + ggsave(output_pdf_path, plot = heatmap_plot, width = 16, height = plot_height, dpi = 150) + } + + if (file.exists(output_pdf_path)) { + print("Heatmap image successfully created!") + } else { + print("Error: Heatmap image was not created.") + } +} + +# Test the function +# counts_heatmap( +# "/Users/benjaminwehnert/CRG/DMS_QC/testing_data/prepared_gatk_data.csv", +# "/Users/benjaminwehnert/CRG/DMS_QC/testing_data/testing_outputs/counts_heatmap.pdf", +# threshold = 3 +# ) diff --git a/modules/local/dmsanalysis/bin/counts_per_cov_heatmap.R b/modules/local/dmsanalysis/bin/counts_per_cov_heatmap.R new file mode 100644 index 0000000..ea7b36d --- /dev/null +++ b/modules/local/dmsanalysis/bin/counts_per_cov_heatmap.R @@ -0,0 +1,159 @@ +# Input: prefiltered GATK data path, output_path, threshold (same as used for prepare_gatk_data_for_counts_per_cov_heatmap function) +# Output: counts_per_cov_heatmap.pdf + +library(dplyr) +library(ggplot2) + +counts_per_cov_heatmap <- function(input_csv_path, threshold = 3, output_pdf_path, img_format = "pdf") { + + # Inner function to add padding to the last row, adding 21 amino acids per position + pad_heatmap_data_long <- function(heatmap_data_long, min_non_na_value, 
num_positions_per_row = 75) { + all_amino_acids <- c("G", "A", "V", "L", "M", "I", "F", + "Y", "W", "K", "R", "H", "D", "E", + "S", "T", "C", "N", "Q", "P", "*") + + max_position <- max(heatmap_data_long$position) + num_missing_positions <- num_positions_per_row - (max_position %% num_positions_per_row) + + if (num_missing_positions < num_positions_per_row) { + new_positions <- (max_position + 1):(max_position + num_missing_positions) + + # Add all 21 amino acid variants for each new position + padding_data <- expand.grid( + mut_aa = all_amino_acids, # All possible amino acids + position = new_positions # New positions to be padded + ) + + # Set placeholder values for the added positions to the exact smallest non-NA value + padding_data$total_counts_per_cov <- min_non_na_value # Set to the smallest non-NA value + padding_data$wt_aa <- "Y" # Set wild-type amino acid to 'Y' + padding_data$wt_aa_pos <- paste0("Y", padding_data$position) # Create wt_aa_pos with correct positions + padding_data$row_group <- max(heatmap_data_long$row_group) # Set row group to the current last group + + # Add the new padding rows to heatmap_data_long + heatmap_data_long <- dplyr::bind_rows(heatmap_data_long, padding_data) + } + + return(heatmap_data_long) + } + + # Load the CSV data + heatmap_data <- read.csv(input_csv_path) + + # Check if the necessary column exists in the data + if (!"total_counts_per_cov" %in% colnames(heatmap_data)) { + stop("The column 'total_counts_per_cov' is not found in the data.") + } + + # Create heatmap_data_long by selecting necessary columns + heatmap_data_long <- heatmap_data %>% + select(mut_aa, position, total_counts_per_cov, wt_aa) # Use 'total_counts_per_cov' + + # Find the smallest non-NA value in total_counts_per_cov + min_non_na_value <- min(heatmap_data_long$total_counts_per_cov, na.rm = TRUE) + + # Group positions by rows (75 positions per row) and calculate row_group + heatmap_data_long <- heatmap_data_long %>% + mutate(row_group = ((position - 1) %/% 75) + 1) # Grouping positions into rows + + # Apply padding to add missing positions at the end of the last row, using the calculated min value + heatmap_data_long <- pad_heatmap_data_long(heatmap_data_long, min_non_na_value) + + # Convert positions to numeric, sort them, and create wt_aa_pos for the plot + heatmap_data_long <- heatmap_data_long %>% + mutate(position = as.numeric(position)) %>% # Ensure position is numeric + arrange(position) %>% # Sort by position + mutate(wt_aa_pos = factor(paste0(wt_aa, position), levels = unique(paste0(wt_aa, position)))) # Create sorted factor levels for wt_aa_pos + + # Add a column to identify synonymous mutations (where mut_aa == wt_aa) + heatmap_data_long <- heatmap_data_long %>% + mutate(synonymous = mut_aa == wt_aa) + + # Define the correct order of the amino acids + amino_acid_order <- rev(c("G", "A", "V", "L", "M", "I", "F", + "Y", "W", "K", "R", "H", "D", "E", + "S", "T", "C", "N", "Q", "P", "*")) + + heatmap_data_long <- heatmap_data_long %>% + mutate(mut_aa = factor(mut_aa, levels = amino_acid_order)) + + # Process heatmap_data_long and build syn_positions in the same step + syn_positions <- heatmap_data_long %>% + mutate(mut_aa = factor(mut_aa, levels = amino_acid_order), + # Compute the x coordinate, which always runs from 1 to 75 within each row group + x = as.numeric(factor(wt_aa_pos, levels = unique(wt_aa_pos))) - ((row_group - 1) * 75), + y = as.numeric(factor(mut_aa, levels = amino_acid_order))) %>% + filter(synonymous == TRUE) + + # Calculate the number of row groups 
and adjust plot height dynamically + num_row_groups <- max(heatmap_data_long$row_group) + plot_height <- num_row_groups * 4 + + # Set the limits for the color scale, ignoring NA (negative values are now NA) + min_count <- min(heatmap_data_long$total_counts_per_cov, na.rm = TRUE) + max_count <- max(heatmap_data_long$total_counts_per_cov, na.rm = TRUE) + max_position <- max(heatmap_data$position) + + # Create the heatmap plot with explicit handling for positions > max_position + heatmap_plot <- ggplot(heatmap_data_long, aes(x = wt_aa_pos, y = mut_aa, fill = total_counts_per_cov)) + + scale_fill_gradientn(colours = c(alpha("blue", 0), "blue"), na.value = "grey35", trans = "log", # Apply log transformation to the scale + limits = c(min_count, max_count), + breaks = scales::trans_breaks("log10", function(x) 10^x), # Logarithmic scale breaks + labels = scales::trans_format("log10", scales::math_format(10^.x))) + + scale_x_discrete(labels = function(x) { + numeric_pos <- as.numeric(gsub("[^0-9]", "", x)) + ifelse(numeric_pos > max_position, " ", x) + }) + + geom_tile() + + + # Add diagonal lines for synonymous mutations using geom_segment + geom_segment(data = syn_positions[syn_positions$position <= max_position, ], + aes(x = x - 0.485, xend = x + 0.485, + y = y - 0.485, yend = y + 0.485, color = synonymous), + size = 0.2) + + + # Manuelle Farbskala für die diagonalen Linien + scale_color_manual(values = c("TRUE" = "grey10"), labels = c("TRUE" = "")) + + + theme_minimal() + + labs(title = "Heatmap of Counts per Coverage for Mutations", x = "Wild-type Amino Acid", y = "Mutant Amino Acid", fill = "Counts per \n Coverage", color = "Synonymous Mutation") + + theme(plot.title = element_text(size = 16, face = "bold"), + axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 12), + axis.text.y = element_text(size = 12), # Larger y-axis labels + axis.title.x = element_text(size = 16), + axis.title.y = element_text(size = 16), + legend.title = element_text(size = 14), # Larger legend title + legend.text = element_text(size = 12), # Larger legend text + panel.spacing = unit(0.1, "lines"), # Adjust panel spacing + strip.text = element_blank(), # Remove row group labels (facet numbers) + strip.background = element_blank(), + panel.grid.major = element_blank(), # Remove major grid lines + panel.grid.minor = element_blank()) + # Remove minor grid lines + facet_wrap(~ row_group, scales = "free_x", ncol = 1) + # Group by 75 positions per row + theme(panel.spacing = unit(0.2, "lines")) + + heatmap_plot <- heatmap_plot + + geom_point(data = heatmap_data_long, aes(size = ""), colour = "black", alpha = 0) # Invisible points for legend + heatmap_plot <- heatmap_plot + + guides(size = guide_legend(paste("Dropout (Counts <", threshold, ")"), override.aes = list(shape = 15, size = 8, colour = "grey35", alpha = 1))) # Define Legend for Dropouts + + # Save the heatmap plot + if (img_format == "pdf") { + ggsave(output_pdf_path, plot = heatmap_plot, width = 16, height = plot_height, dpi = 150, device = cairo_pdf) + } else { + ggsave(output_pdf_path, plot = heatmap_plot, width = 16, height = plot_height, dpi = 150) + } + + if (file.exists(output_pdf_path)) { + print("Heatmap image successfully created!") + } else { + print("Error: Heatmap image was not created.") + } +} + +# Test the function +# counts_per_cov_heatmap( +# "/Users/benjaminwehnert/CRG/DMS_QC/testing_data/prepared_gatk_data.csv", +# "/Users/benjaminwehnert/CRG/DMS_QC/testing_data/testing_outputs/heatmap.pdf", +# threshold = 3 +# ) diff --git 
a/modules/local/dmsanalysis/bin/detect_codons.R b/modules/local/dmsanalysis/bin/detect_codons.R new file mode 100644 index 0000000..1d1d36f --- /dev/null +++ b/modules/local/dmsanalysis/bin/detect_codons.R @@ -0,0 +1,63 @@ +# input: wildtype nucleotide sequence as string or .fa +# output: first nucleotide position of start codon and last nucleotide position of stopp codon (e.g. "7-2145") + +# Load necessary package +if (!requireNamespace("BiocManager", quietly = TRUE)) { + install.packages("BiocManager") +} +if (!requireNamespace("Biostrings", quietly = TRUE)) { + BiocManager::install("Biostrings") +} +suppressMessages(library(Biostrings)) + +# Function to find the nearest stop codon in the reading frame of the first start codon +# Accepts either a sequence string or a path to a reference FASTA file +find_start_stop <- function(input_string, start_codon = "ATG", stop_codons = c("TAA", "TAG", "TGA")) { + # Determine if the input is a file path or a raw sequence + if (file.exists(input_string)) { + # If the input is a valid file path, read the DNA sequence from the FASTA file + dna <- readDNAStringSet(input_string) + sequence <- as.character(dna[[1]]) # Extract the sequence as a character string + } else { + # If the input is not a file path, treat it as a raw sequence + sequence <- input_string + } + + # Check if the sequence is provided or successfully read + if (is.null(sequence)) { + stop("No valid sequence or FASTA file provided.") + } + + # Convert the sequence to uppercase to ensure matching regardless of case + sequence <- toupper(sequence) + + # Function to find codons in the sequence + find_codons <- function(sequence, codon) { + codon_positions <- gregexpr(codon, sequence)[[1]] + codon_positions[codon_positions > 0] # Return only positive matches + } + + # Search for the first start codon + start_positions <- find_codons(sequence, start_codon) + + if (length(start_positions) == 0 || start_positions[1] == -1) { + return("No start codon found") # Return message if no start codon is found + } + first_start <- start_positions[1] # Consider only the first start codon + + # Search for all possible stop codons + stop_positions_list <- lapply(stop_codons, find_codons, sequence = sequence) + all_stop_positions <- sort(unlist(stop_positions_list)) + + # Filter only stop codons that are in the same reading frame and come after the start codon + in_frame_stops <- all_stop_positions[(all_stop_positions - first_start) %% 3 == 0 & all_stop_positions > first_start] + if (length(in_frame_stops) > 0) { + # Choose the nearest stop codon + closest_stop <- min(in_frame_stops) + return(cat(paste(first_start, closest_stop + 2, sep = "-"))) # Output format: start-stop + } else { + return("No suitable stop codon found") # If no suitable stop codon is found + } +} + +#find_start_stop("/Users/benjaminwehnert/CRG/DMS_QC/testing_data/MORtn5_reference.fa") diff --git a/modules/local/dmsanalysis/bin/dimsum_experimentalDesign.R b/modules/local/dmsanalysis/bin/dimsum_experimentalDesign.R new file mode 100644 index 0000000..89f0ce1 --- /dev/null +++ b/modules/local/dmsanalysis/bin/dimsum_experimentalDesign.R @@ -0,0 +1,68 @@ +# Make a DiMSum experimental design from a deepmutscan samplesheet. 
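+# (Illustrative mapping, with hypothetical FASTQ names: when the sheet holds a single biological sample, a row "GID1A,input,1,in_R1.fastq.gz,in_R2.fastq.gz" becomes sample_name "input1" with selection_id 0, an output row of replicate 2 becomes "output2" with selection_id 1, and pair1/pair2 carry the FASTQ basenames.)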
+# - samplesheet_csv: path to CSV with columns sample,type,replicate,file1,file2 +# - out_path: where to write the TSV (default "experimentalDesign.tsv") +# Returns: the experimental design as a data.frame +make_dimsum_experimental_design <- function(samplesheet_csv, out_path = "experimentalDesign.tsv") { + # ---- read & normalize ---- + ss <- read.csv(samplesheet_csv, stringsAsFactors = FALSE, check.names = FALSE) + names(ss) <- tolower(names(ss)) + + # tolerate missing file2 column (single-end) + if (!"file2" %in% names(ss)) ss$file2 <- "" + + required <- c("sample", "type", "replicate", "file1", "file2") + missing <- setdiff(required, names(ss)) + if (length(missing) > 0) stop("Samplesheet missing columns: ", paste(missing, collapse = ", ")) + + # coerce types + ss$replicate <- as.integer(ss$replicate) + + # ---- derive sample_name strategy ---- + # If only one biological sample present (e.g. one protein), use "input1", "output2", ... + # If multiple biological samples present, prefix with 'sample' to avoid collisions: + # "GID1A_input1", "GID1B_output2", ... + multi_base <- length(unique(ss$sample)) > 1 + if (multi_base) { + sample_name <- paste(ss$sample, ss$type, ss$replicate, sep = "") + } else { + sample_name <- paste0(ss$type, ss$replicate) + } + + # ---- build DiMSum columns ---- + experiment_replicate <- ss$replicate + selection_id <- ifelse(ss$type == "input", 0L, + ifelse(ss$type == "output", 1L, NA_integer_)) + # assume one selection batch + selection_replicate <- ifelse(ss$type == "output", 1L, NA_integer_) + # assume one technical batch + technical_replicate <- rep(1L, nrow(ss)) + + pair1 <- basename(ss$file1) + # keep empty string for single-end / missing file2 + pair2 <- ifelse(is.na(ss$file2) | ss$file2 == "", "", basename(ss$file2)) + + ed <- data.frame( + sample_name = sample_name, + experiment_replicate = experiment_replicate, + selection_id = selection_id, + selection_replicate = selection_replicate, + technical_replicate = technical_replicate, + pair1 = pair1, + pair2 = pair2, + stringsAsFactors = FALSE + ) + + # ---- order rows: by sample (if multiple), type (input, output, quality), then replicate ---- + type_rank <- match(ss$type, c("input", "output", "quality")) + ord <- if (multi_base) { + order(ss$sample, type_rank, ss$replicate, na.last = TRUE) + } else { + order(type_rank, ss$replicate, na.last = TRUE) + } + ed <- ed[ord, , drop = FALSE] + rownames(ed) <- NULL + + # ---- write & return ---- + write.table(ed, file = out_path, sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE, na = "") + return(ed) +} diff --git a/modules/local/dmsanalysis/bin/filter_gatk_by_codon_library.R b/modules/local/dmsanalysis/bin/filter_gatk_by_codon_library.R new file mode 100644 index 0000000..b05d391 --- /dev/null +++ b/modules/local/dmsanalysis/bin/filter_gatk_by_codon_library.R @@ -0,0 +1,122 @@ +# Input: pre_processed_raw_gatk_path, mutation library .csv path (former "possible_NNK_mutations.csv"), output_path +# Output: gatk table filtered for only single-codon mutations that are part of the library + +# Load necessary library +library("dplyr") +library("stringr") + +# filter_gatk_by_codon_library <- function(gatk_file_path, codon_library, output_file_path) { +# # Load the GATK table from the provided file path +# gatk_table <- read.csv(gatk_file_path) +# +# # Predefined codon libraries +# nnk_codons <- c('AAG', 'AAT', 'ATG', 'ATT', 'AGG', 'AGT', 'ACG', 'ACT', +# 'TAG', 'TAT', 'TTG', 'TTT', 'TGG', 'TGT', 'TCG', 'TCT', +# 'GAG', 'GAT', 'GTG', 'GTT', 'GGG', 'GGT', 
'GCG', 'GCT', +# 'CAG', 'CAT', 'CTG', 'CTT', 'CGG', 'CGT', 'CCG', 'CCT') +# +# # Check if the codon_library is a predefined string or a custom vector +# if (is.character(codon_library) && length(codon_library) == 1) { +# if (codon_library == "NNK") { +# codon_set <- nnk_codons +# } else { +# stop("Invalid predefined codon library specified.") +# } +# } else if (is.vector(codon_library)) { +# codon_set <- codon_library +# } else { +# stop("Invalid codon library format. Must be a predefined string or a custom vector.") +# } +# +# # Filter for single-codon mutations that are part of the codon library and make sure to handle some mistaken formatting from gatk (there are cases where more than 3 Bases are mutated, but column varying_codons == 1) +# filtered_gatk <- gatk_table %>% +# filter(varying_codons == 1 & sub(".*>", "", codon_mut) %in% codon_set) %>% +# rowwise() %>% +# filter({ +# # Split base_mut into individual mutations +# mutations <- unlist(strsplit(base_mut, ",\\s*")) # Splits by comma and removes extra spaces +# # Extract numeric positions from each mutation string +# positions <- as.numeric(str_extract(mutations, "^[0-9]+")) +# +# # Calculate the distance between the first and last position +# distance <- max(positions, na.rm = TRUE) - min(positions, na.rm = TRUE) +# +# # Keep rows where the distance is <= 2 +# distance <= 2 +# }) %>% +# ungroup() +# +# +# # Write the filtered GATK table to the output file path +# write.csv(filtered_gatk, file = output_file_path, row.names = FALSE) +# } +# +# # Example usage with predefined library (NNK): +# # filtered_gatk <- filter_gatk_by_codon_library(raw_gatk, codon_library = "NNK") +# +# # Example usage with custom library: +# # custom_library <- c('AAG', 'TGT', 'GCT') +# # filtered_gatk <- filter_gatk_by_codon_library(raw_gatk, custom_library = custom_library) +# +# #filter_gatk_by_codon_library("/Users/benjaminwehnert/CRG/DMS_QC/testing_data/raw_gatk.csv", codon_library = "NNK", output_file_path = "/Users/benjaminwehnert/CRG/DMS_QC/testing_data/gatk_filtered_by_codon_library.csv") + + + + + +library("dplyr") +library("stringr") + +filter_gatk_by_codon_library <- function(gatk_file_path, codon_library_path, output_file_path) { + # Load the GATK table from the provided file path + gatk_table <- read.csv(gatk_file_path) + + # Load the codon library from the provided .csv file + codon_library <- read.csv(codon_library_path) + + # Ensure the codon library has the expected columns + if (!all(c("Codon_Number", "wt_codon", "Variant") %in% colnames(codon_library))) { + stop("Codon library must contain columns 'Codon_Number', 'wt_codon', and 'Variant'.") + } + + # Filter the GATK table + filtered_gatk <- gatk_table %>% + filter(varying_codons == 1) %>% # Keep rows with single-codon mutations + rowwise() %>% + filter({ + # Extract the position and mutated codon + codon_position <- as.numeric(sub(":.*", "", codon_mut)) # Extract position before ':' + mutated_codon <- sub(".*>", "", codon_mut) # Extract codon after '>' + + # Check if the position and codon are valid + is_in_library <- any( + codon_library$Codon_Number == codon_position & + (codon_library$Variant == mutated_codon | # Check Variant column + codon_library$wt_codon == mutated_codon) # Check wt_codon column + ) + is_in_library + }) %>% + ungroup() %>% + # Apply additional filtering based on mutation distances + rowwise() %>% + filter({ + # Split base_mut into individual mutations + mutations <- unlist(strsplit(base_mut, ",\\s*")) # Splits by comma and removes extra spaces + # Extract 
numeric positions from each mutation string + positions <- as.numeric(str_extract(mutations, "^[0-9]+")) + + # Calculate the distance between the first and last position + distance <- max(positions, na.rm = TRUE) - min(positions, na.rm = TRUE) + + # Keep rows where the distance is <= 2 + distance <= 2 + }) %>% + ungroup() + + # Write the filtered GATK table to the output file path + write.csv(filtered_gatk, file = output_file_path, row.names = FALSE) +} + +# example +#filter_gatk_by_codon_library("/Users/benjaminwehnert/CRG/DMS_QC/testing_data/raw_gatk.csv", codon_library = "/Users/benjaminwehnert/CRG/DMS_QC/testing_data/possible_NNK_mutations.csv", output_file_path = "/Users/benjaminwehnert/CRG/DMS_QC/testing_data/gatk_filtered_by_codon_library.csv") ### this one's is correct for this data set +#filter_gatk_by_codon_library("/Users/benjaminwehnert/CRG/DMS_QC/testing_data/raw_gatk.csv", codon_library = "/Users/benjaminwehnert/CRG/DMS_QC/testing_data/testing_outputs/possible_NNK_mutations_taylors_nnk_and_nns.csv", output_file_path = "/Users/benjaminwehnert/CRG/DMS_QC/testing_data/testing_outputs/gatk_filtered_by_complete_codon_library.csv") diff --git a/modules/local/dmsanalysis/bin/find_syn_mutation.R b/modules/local/dmsanalysis/bin/find_syn_mutation.R new file mode 100644 index 0000000..0e593d0 --- /dev/null +++ b/modules/local/dmsanalysis/bin/find_syn_mutation.R @@ -0,0 +1,87 @@ +suppressMessages(library(Biostrings)) + +# Pick a synonymous "WT substitute" for DiMSum normalization using a fixed coding window. +# Inputs: +# wt_fasta : path to FASTA (single WT sequence) +# counts_merged_tsv : path to merged counts (columns: nt_seq, input1..N, output1..M) +# pos_range : "start-end" (1-based, inclusive), e.g. "352-1383" +# Returns: +# character scalar: chosen nt sequence +# +# Preference: +# 1) AA-identical (fully synonymous) AND exactly 2 nt mismatches vs WT, both within ONE codon. +# 2) If none, AA-identical AND exactly 1 nt mismatch vs WT. +# 3) If more than one, pick highest mean of input counts. +# 4) If still none, stop with an error. 
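+# A minimal usage sketch (file names are hypothetical; the coding window reuses the example above):
+# wt_substitute <- pick_synonymous_wt_from_range("reference.fa", "variant_counts_merged.tsv", "352-1383")
+# writeLines(wt_substitute, "synonymous_wt_substitute.txt")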
+ +pick_synonymous_wt_from_range <- function(wt_fasta, counts_merged_tsv, pos_range) { + ## ---- parse range ---- + pr <- strsplit(as.character(pos_range), "-", fixed = TRUE)[[1]] + if (length(pr) != 2L) stop("pos_range must be 'start-end', got: ", pos_range) + start_pos <- as.integer(pr[1]); end_pos <- as.integer(pr[2]) + if (is.na(start_pos) || is.na(end_pos) || start_pos < 1L || end_pos < start_pos) + stop("Invalid pos_range: ", pos_range) + + ## ---- WT window ---- + wt_set <- Biostrings::readDNAStringSet(wt_fasta) + if (length(wt_set) != 1L) stop("WT FASTA must contain exactly one sequence.") + wt_subseq <- Biostrings::subseq(wt_set[[1]], start = start_pos, end = end_pos) + wt_seq_chr <- as.character(wt_subseq) + wt_len <- nchar(wt_seq_chr) + if ((wt_len %% 3) != 0) stop("Provided window length is not divisible by 3: ", wt_len) + wt_aa <- Biostrings::translate(wt_subseq, if.fuzzy.codon = "X") + wt_chars <- strsplit(wt_seq_chr, "", fixed = TRUE)[[1]] + + ## ---- counts ---- + df <- utils::read.delim(counts_merged_tsv, sep = "\t", header = TRUE, + stringsAsFactors = FALSE, check.names = FALSE) + if (!"nt_seq" %in% names(df)) stop("counts_merged_tsv must have a 'nt_seq' column.") + + df$nt_seq <- toupper(df$nt_seq) + keep_len <- nchar(df$nt_seq) == wt_len + if (!any(keep_len)) stop("No sequences match WT window length (", wt_len, ").") + if (!all(keep_len)) df <- df[keep_len, , drop = FALSE] + + # input columns & mean (works with 1+ replicates) + input_cols <- grep("^input", names(df), value = TRUE) + if (length(input_cols) == 0L) stop("No input columns found (expect names starting with 'input').") + input_mat <- as.data.frame(lapply(df[, input_cols, drop = FALSE], function(x) as.numeric(as.character(x)))) + input_mean <- if (length(input_cols) == 1L) input_mat[[1]] else rowMeans(as.matrix(input_mat), na.rm = TRUE) + + ## ---- synonymous filter ---- + var_set <- Biostrings::DNAStringSet(df$nt_seq) + var_aa <- Biostrings::translate(var_set, if.fuzzy.codon = "X") + syn_idx <- which(as.character(var_aa) == as.character(wt_aa)) + if (length(syn_idx) == 0L) stop("No fully-synonymous variants found relative to WT translation.") + + # helpers + mismatch_positions <- function(seq_nt_chars) which(seq_nt_chars != wt_chars) # 1-based positions + codon_index <- function(pos_vec) floor((pos_vec - 1L) / 3L) # 0-based codon bin + + # preference 1: exactly 2 mismatches, both within the same codon + cand_two_one <- Filter(function(i) { + vchars <- strsplit(df$nt_seq[i], "", fixed = TRUE)[[1]] + pos <- mismatch_positions(vchars) + length(pos) == 2L && length(unique(codon_index(pos))) == 1L + }, syn_idx) + + choose_best <- function(idx_vec) idx_vec[ which.max(input_mean[idx_vec]) ] + + if (length(cand_two_one) > 0L) { + best_i <- choose_best(cand_two_one) + return(as.character(df$nt_seq[best_i])) + } + + # preference 2 (fallback): exactly 1 mismatch (still synonymous) + cand_one <- Filter(function(i) { + vchars <- strsplit(df$nt_seq[i], "", fixed = TRUE)[[1]] + length(mismatch_positions(vchars)) == 1L + }, syn_idx) + + if (length(cand_one) > 0L) { + best_i <- choose_best(cand_one) + return(as.character(df$nt_seq[best_i])) + } + + stop("No suitable synonymous variant found: neither 2-in-1-codon nor 1-nt synonymous candidates present.") +} diff --git a/modules/local/dmsanalysis/bin/fitness_QC.R b/modules/local/dmsanalysis/bin/fitness_QC.R new file mode 100644 index 0000000..4118ccd --- /dev/null +++ b/modules/local/dmsanalysis/bin/fitness_QC.R @@ -0,0 +1,99 @@ +## fitness QC plots for nf-core/deepmutscan +## 
28.10.2025 +## maximilian.stammnitz@crg.eu + +## lower panels: scatter + x=y (log-log version for counts) +panel_xy_abline_counts <- function(x, y, ...) { + op <- par("xpd"); on.exit(par(xpd = op), add = TRUE) + par(xpd = FALSE) + points(x, y, pch = 16, cex = 0.1, ...) + abline(a = 0, b = 1, lty = 2, col = "grey50") +} + +## upper panels: Pearson r (log-log-transformed) +panel_cor_counts <- function(x, y, digits = 2, prefix = "r = ", cex.text = 1.4, ...) { + r <- suppressWarnings(cor(log(x), log(y), use = "pairwise.complete.obs", method = "pearson")) + lab <- if (is.finite(r)) bquote(italic(r) == .(round(r, digits))) else bquote(italic(r) == NA) + + ## Save/restore full graphics state we touch + op <- par(c("usr", "xpd", "xlog", "ylog")) + on.exit(par(op), add = TRUE) + + ## Draw in normalized 0..1 panel coords with logs OFF so text is visible + par(xlog = FALSE, ylog = FALSE, xpd = FALSE, usr = c(0, 1, 0, 1)) + text(0.5, 0.5, labels = lab, cex = cex.text, font = 1, col = "black") +} + +## lower panels: scatter + x=y (linear version for fitness) +panel_xy_abline_fitness <- function(x, y, ...) { + op <- par("xpd"); on.exit(par(xpd = op), add = TRUE) + par(xpd = FALSE) + points(x, y, pch = 16, cex = 0.5, ...) + abline(a = 0, b = 1, lty = 2, col = "grey50") +} + +## upper panels: Pearson r (linear) +panel_cor_fitness <- function(x, y, digits = 2, prefix = "r = ", cex.text = 1.4, ...) { + r <- suppressWarnings(cor(x, y, use = "pairwise.complete.obs", method = "pearson")) + lab <- if (is.finite(r)) bquote(italic(r) == .(round(r, digits))) else bquote(italic(r) == NA) + + ## Save/restore full graphics state we touch + op <- par(c("usr", "xpd", "xlog", "ylog")) + on.exit(par(op), add = TRUE) + + ## Draw in normalized 0..1 panel coords with logs OFF so text is visible + par(xlog = FALSE, ylog = FALSE, xpd = FALSE, usr = c(0, 1, 0, 1)) + text(0.5, 0.5, labels = lab, cex = cex.text, font = 1, col = "black") +} + +#' Plot input/output count correlations and fitness replicate correlations +#' +#' @param fitness_table_path Path to the input table (fitness_estimation.tsv) +#' @param out_counts_corr_pdf Path to write the counts correlation PDF +#' @param out_fitness_corr_pdf Path to write the fitness correlation PDF +#' +#' @return Invisibly returns TRUE; writes the two PDFs. +run_fitness_plots <- function(fitness_table_path, + out_counts_corr_pdf, + out_fitness_corr_pdf) { + + merged.counts.fitness <- read.table(fitness_table_path, sep = "\t", header = TRUE, check.names = FALSE) + + ## identify the right samples + inputs <- grep("input", colnames(merged.counts.fitness)) + outputs <- grep("output", colnames(merged.counts.fitness)) + + ## 5. Plot input vs. output counts ## + ##################################### + pdf(out_counts_corr_pdf, height = 9, width = 14) + pairs(merged.counts.fitness[, c(inputs, outputs)] + 1, ## use a pseudo-count + lower.panel = panel_xy_abline_counts, + upper.panel = panel_cor_counts, + cex.text = 2, + log = "xy") + dev.off() + + ## 6. 
Plot fitness correlations ## + ################################## + fitness.repl <- grep("rescaled_fitness", colnames(merged.counts.fitness)) + + if (length(fitness.repl) > 1) { + pdf(out_fitness_corr_pdf, height = 9, width = 14) + pairs(merged.counts.fitness[, fitness.repl], + lower.panel = panel_xy_abline_fitness, + upper.panel = panel_cor_fitness, + cex.text = 2, + xlim = c(-3, 1), + ylim = c(-3, 1)) + dev.off() + } else { + ## If only one (or zero) rescaled_fitness columns exist, still create an empty placeholder + ## so Nextflow finds the declared output. + pdf(out_fitness_corr_pdf, height = 9, width = 14) + plot.new() + title("No replicate fitness columns found (need ≥2 'rescaled_fitness...')\nCreated placeholder PDF.") + dev.off() + } + + invisible(TRUE) +} diff --git a/modules/local/dmsanalysis/bin/fitness_calculation.R b/modules/local/dmsanalysis/bin/fitness_calculation.R new file mode 100644 index 0000000..5100cef --- /dev/null +++ b/modules/local/dmsanalysis/bin/fitness_calculation.R @@ -0,0 +1,301 @@ +## default fitness estimation for nf-core/deepmutscan +## 27.10.2025 +## maximilian.stammnitz@crg.eu + +## 0. Libraries ## +################## + +suppressPackageStartupMessages({ + library(Biostrings) +}) + +## --- Helper functions --- + +# calculate nt hamming distances from the specified WT +compute_nt_hamming <- function(merged.counts, wt.seq) { + merged.counts <- cbind("nt_ham" = rep(NA, nrow(merged.counts)), merged.counts) + for (i in 1:nrow(merged.counts)){ + tmp.wt <- strsplit(as.character(wt.seq), "")[[1]] + tmp.mut <- strsplit(as.character(merged.counts$nt_seq[i]), "")[[1]] + if(length(which(tmp.mut != tmp.wt)) == 0){ + merged.counts$nt_ham[i] <- 0 + rm(tmp.mut, tmp.wt) + next + }else{ + merged.counts$nt_ham[i] <- length(which(tmp.mut != tmp.wt)) + rm(tmp.mut, tmp.wt) + next + } + } + merged.counts +} + +# translate sequences and add aa_seq +add_aa_seq <- function(merged.counts) { + merged.counts <- cbind("aa_seq" = as.character(translate(DNAStringSet(merged.counts$nt_seq))), merged.counts) + merged.counts +} + +# calculate AA hamming distances from the WT +compute_aa_hamming <- function(merged.counts, wt.seq.aa) { + merged.counts <- cbind("aa_ham" = rep(NA, nrow(merged.counts)), merged.counts) + for (i in 1:nrow(merged.counts)){ + tmp.wt <- strsplit(as.character(wt.seq.aa), "")[[1]] + tmp.mut <- strsplit(as.character(merged.counts$aa_seq[i]), "")[[1]] + if(length(which(tmp.mut != tmp.wt)) == 0){ + merged.counts$aa_ham[i] <- 0 + rm(tmp.mut, tmp.wt) + next + }else{ + merged.counts$aa_ham[i] <- length(which(tmp.mut != tmp.wt)) + rm(tmp.mut, tmp.wt) + next + } + } + merged.counts +} + +# name the mutations +name_mutations <- function(merged.counts, wt.seq.aa) { + merged.counts <- cbind("wt aa" = rep(NA, nrow(merged.counts)), + "pos" = rep(NA, nrow(merged.counts)), + "mut aa" = rep(NA, nrow(merged.counts)), merged.counts) + for (i in 1:nrow(merged.counts)){ + if(merged.counts$aa_ham[i] == 0){ + next + }else{ + tmp.wt <- strsplit(as.character(wt.seq.aa), "")[[1]] + tmp.mut <- strsplit(as.character(merged.counts$aa_seq[i]), "")[[1]] + merged.counts$pos[i] <- which(tmp.mut != tmp.wt) + merged.counts$`wt aa`[i] <- tmp.wt[merged.counts$pos[i]] + merged.counts$`mut aa`[i] <- tmp.mut[merged.counts$pos[i]] + rm(tmp.mut, tmp.wt) + } + } + merged.counts +} + +# find stops, WT and WT; aggregate counts of variants which are identical on the aa (but not nt) level +aggregate_by_aa <- function(merged.counts) { + ## find stops, WT and WT + merged.counts <- cbind(merged.counts, + "wt" = 
rep(NA, nrow(merged.counts)), + "stop" = rep(NA, nrow(merged.counts))) + merged.counts$wt[which(merged.counts$nt_ham == 0)] <- TRUE + merged.counts$stop[which(merged.counts$`mut aa` == "*")] <- TRUE + + ## aggregate counts of variants which are identical on the aa (but not nt) level + ## exception: wildtype ones + ## thereby shrinking the matrix + uniq.aa.vars <- unique(merged.counts$aa_seq) + uniq.aa.vars <- uniq.aa.vars[-which(uniq.aa.vars == merged.counts$aa_seq[which(merged.counts$wt == TRUE)])] + for(i in 1:length(uniq.aa.vars)){ + tmp.aa_seq <- uniq.aa.vars[i] + hits <- which(as.character(merged.counts$aa_seq) == tmp.aa_seq) + if(length(hits) == 1){ + rm(tmp.aa_seq, hits) + next + }else{ + for(j in grep("input|output", colnames(merged.counts))){ + merged.counts[hits[1],j] <- sum(merged.counts[hits,j], na.rm = TRUE) + } + merged.counts[hits[1], "nt_seq"] <- paste(merged.counts[hits, "nt_seq"], collapse = ", ") + merged.counts <- merged.counts[-hits[-1],] + rm(tmp.aa_seq, hits) + next + } + } + merged.counts +} + +# 3. Raw fitness calculations ## +calc_raw_fitness <- function(merged.counts, exp.design) { + ## how many fitness replicates are there + reps <- length(unique(exp.design$experiment_replicate)) + for (i in 1:reps){ + merged.counts <- cbind(merged.counts, rep(NA, nrow(merged.counts))) + colnames(merged.counts)[ncol(merged.counts)] <- paste0("raw_fitness_rep", i) + } + + ## calculate raw fitness of all variants vs. WT variant + for (i in 1:reps){ + + ### collect counts + tmp.input.counts <- merged.counts[,paste0("input", i)] + tmp.output.counts <- merged.counts[,paste0("output", i)] + + ### add pseudo-count to zero-outputs (if the corresponding input count is non-zero) + tmp.output.counts[which(tmp.output.counts == 0 & tmp.input.counts != 0)] <- 1 + + ### take logs + tmp.wt.log.ratio <- log(tmp.output.counts[which(merged.counts$wt == TRUE)] / + tmp.input.counts[which(merged.counts$wt == TRUE)]) + tmp.fitness <- log(tmp.output.counts / + tmp.input.counts) - tmp.wt.log.ratio + + ### uncertain values to NA + tmp.fitness[which(is.na(tmp.fitness) == TRUE)] <- NA + tmp.fitness[which(tmp.fitness == "Inf")] <- NA + + ### add to table + merged.counts[,c(ncol(merged.counts) - reps + i)] <- tmp.fitness + + ### clean up + rm(tmp.fitness, tmp.wt.log.ratio, tmp.output.counts, tmp.input.counts) + } + + list(merged.counts = merged.counts, reps = reps) +} + +# 4. 
Fitness and error refinements ## +rescale_and_summarize <- function(merged.counts, reps) { + ## center the raw fitness distributions on 0 (median of wildtype synonymous) and -1 (median of stops) + for (i in 1:reps){ + + merged.counts <- cbind(merged.counts, rep(NA, nrow(merged.counts))) + colnames(merged.counts)[ncol(merged.counts)] <- paste0("rescaled_fitness_rep", i) + + ### fetch the key counts + tmp.wt.fitness <- merged.counts[which(merged.counts$aa_ham == 0),ncol(merged.counts) - reps] + tmp.stop.fitness <- merged.counts[which(merged.counts$stop == TRUE),ncol(merged.counts) - reps] + + ### rescale + tmp.wt.fitness.med <- median(tmp.wt.fitness, na.rm = TRUE) + tmp.stop.fitness.med <- median(tmp.stop.fitness, na.rm = TRUE) + if(tmp.stop.fitness.med >= tmp.wt.fitness.med){ + + tmp.wt.fitness.mean <- mean(tmp.wt.fitness, na.rm = TRUE) + tmp.stop.fitness.mean <- mean(tmp.stop.fitness, na.rm = TRUE) + lm.rescale <- lm(c(0, -1) ~ c(tmp.wt.fitness.mean, tmp.stop.fitness.mean)) + merged.counts[,ncol(merged.counts)] <- merged.counts[,ncol(merged.counts) - reps] * lm.rescale$coefficients[[2]] + lm.rescale$coefficients[[1]] + rm(tmp.wt.fitness, tmp.stop.fitness, + tmp.wt.fitness.mean, tmp.stop.fitness.mean, + tmp.wt.fitness.med, tmp.stop.fitness.med, lm.rescale) + next + + }else{ + + lm.rescale <- lm(c(0, -1) ~ c(tmp.wt.fitness.med, tmp.stop.fitness.med)) + merged.counts[,ncol(merged.counts)] <- merged.counts[,ncol(merged.counts) - reps] * lm.rescale$coefficients[[2]] + lm.rescale$coefficients[[1]] + rm(tmp.wt.fitness, tmp.stop.fitness, + tmp.wt.fitness.med, tmp.stop.fitness.med, lm.rescale) + next + + } + } + + ## calculate fitness mean and standard deviation across replicates + merged.counts <- cbind(merged.counts, + "mean fitness" = rep(NA, nrow(merged.counts)), + "fitness sd" = rep(NA, nrow(merged.counts))) + + if(reps == 1){ + + merged.counts$`mean fitness` <- merged.counts[,ncol(merged.counts) - 2] + + }else if(reps > 1){ + + merged.counts$`mean fitness` <- apply(merged.counts[,c(ncol(merged.counts) - 2*reps + 1, ncol(merged.counts) - reps)], + 1, + mean, + na.rm = TRUE) + merged.counts$`fitness sd` <- apply(merged.counts[,c(ncol(merged.counts) - 2*reps + 1, ncol(merged.counts) - reps)], + 1, + sd, + na.rm = TRUE) + + } + + merged.counts +} + +## --- Main function --- + +#' Run default fitness estimation with configurable I/O paths +#' +#' @param counts_path Path to counts_merged.tsv +#' @param design_path Path to experimentalDesign.tsv +#' @param wt_seq_path Path to synonymous_wt.txt (single line DNA sequence) +#' @param output_path Path to write fitness_estimation.tsv +#' +#' @return Invisibly returns the final data.frame; writes the output to output_path. +run_fitness_estimation <- function(counts_path, + design_path, + wt_seq_path, + output_path) { + ## 1. Import key files ## + ######################### + + merged.counts <- read.table(counts_path, sep = "\t", header = TRUE, check.names = FALSE) + exp.design <- read.table(design_path, sep = "\t", header = TRUE, check.names = FALSE) + wt.seq <- DNAString(as.character(read.table(wt_seq_path))) + wt.seq.aa <- translate(wt.seq) + + ## 2. 
Pre-processing the count table ## + ####################################### + + ## calculate nt hamming distances from the specified WT + merged.counts <- compute_nt_hamming(merged.counts, wt.seq) + + ## translate sequences + merged.counts <- add_aa_seq(merged.counts) + + ## calculate AA hamming distances from the WT + merged.counts <- compute_aa_hamming(merged.counts, wt.seq.aa) + + ## name the mutations + merged.counts <- name_mutations(merged.counts, wt.seq.aa) + + ## find stops, WT and WT; aggregate AA-identical variants (except WT) + merged.counts <- aggregate_by_aa(merged.counts) + + ## 3. Raw fitness calculations ## + ################################# + fitness_res <- calc_raw_fitness(merged.counts, exp.design) + merged.counts <- fitness_res$merged.counts + reps <- fitness_res$reps + + ## 4. Fitness and error refinements ## + ###################################### + merged.counts <- rescale_and_summarize(merged.counts, reps) + + ## clean up + rm(reps) + + ## export + write.table(merged.counts, output_path, + col.names = TRUE, row.names = FALSE, quote = FALSE, sep = "\t", na = "") + + invisible(merged.counts) +} + + +## 5. Version ## +################ + +# sessionInfo() +# R version 4.5.1 (2025-06-13) +# Platform: aarch64-apple-darwin20 +# Running under: macOS Sonoma 14.6.1 +# +# Matrix products: default +# BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib +# LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1 +# +# locale: +# [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8 +# +# time zone: Europe/Madrid +# tzcode source: internal +# +# attached base packages: +# [1] stats4 stats graphics grDevices utils datasets methods base +# +# other attached packages: +# [1] Biostrings_2.76.0 GenomeInfoDb_1.44.2 XVector_0.48.0 IRanges_2.42.0 S4Vectors_0.46.0 +# [6] BiocGenerics_0.54.0 generics_0.1.4 +# +# loaded via a namespace (and not attached): +# [1] httr_1.4.7 compiler_4.5.1 R6_2.6.1 tools_4.5.1 +# [5] GenomeInfoDbData_1.2.14 rstudioapi_0.17.1 crayon_1.5.3 UCSC.utils_1.4.0 +# [9] jsonlite_2.0.0 diff --git a/modules/local/dmsanalysis/bin/fitness_heatmap.R b/modules/local/dmsanalysis/bin/fitness_heatmap.R new file mode 100644 index 0000000..04c3cee --- /dev/null +++ b/modules/local/dmsanalysis/bin/fitness_heatmap.R @@ -0,0 +1,272 @@ +suppressPackageStartupMessages({ + library(methods) + library(dplyr) + library(ggplot2) + library(grid) # for unit() +}) + +# ---------- helper functions ---------- +find_col <- function(df, candidates) { + norm <- function(x) gsub("[^a-z0-9]+", "_", tolower(x)) + nms <- colnames(df); nn <- norm(nms) + for (cand in candidates) { + hit <- which(nn == norm(cand)) + if (length(hit) == 1) return(nms[hit]) + } + stop(sprintf("Could not find any of columns: %s", paste(candidates, collapse = ", "))) +} + +get_rescaled_cols <- function(df) { + nms <- colnames(df) + hits <- grep("^rescaled[_ ]?fitness", nms, ignore.case = TRUE, value = TRUE) + if (!length(hits)) stop("No 'rescaled_fitness' columns found.") + idx <- suppressWarnings(as.integer(gsub(".*?([0-9]+)$", "\\1", hits))) + hits[order(is.na(idx), idx, hits)] +} + +# Find "mean fitness" column (if present) +find_mean_col <- function(df) { + nms <- colnames(df) + key <- tolower(gsub("[^a-z0-9]+", "_", nms)) + hit <- which(key == "mean_fitness") + if (length(hit) == 1) nms[hit] else NULL +} + +# NEW: read WT amino acid sequence from .txt file (single line) 
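+# The file is expected to hold the translated WT protein as plain text, e.g. a
+# single line such as "MSKGEELFTG...*" (illustrative sequence only, not from this
+# pipeline). The longest non-empty line is taken, whitespace is stripped, letters
+# are upper-cased, and any character outside the 20 amino acids plus '*' is dropped.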
+read_wt_seq_aa_txt <- function(path) { + if (is.null(path)) stop("wt_seq_aa_txt_path must be provided.") + x <- readLines(path, warn = FALSE) + x <- x[nzchar(x)] + if (!length(x)) stop("WT AA TXT is empty.") + aa <- toupper(gsub("\\s+", "", x[which.max(nchar(x))])) + # Keep only valid amino acid letters (including stop '*') + aa <- gsub("[^ACDEFGHIKLMNPQRSTVWY*]", "", aa) + if (!nchar(aa)) stop("WT AA TXT contains no valid AA letters.") + aa +} + +# Build full AA×position grid for positions 1..wt_len (from WT sequence), +# join fitness values; any missing combos stay NA (-> grey). +# Positions > wt_len are considered padded and will be white. +build_heatmap_long <- function(df, + wt_aa_col, + pos_col, + mut_aa_col, + fitness_col, + positions_per_row = 75, + wt_seq_aa, + fill_missing_as_zero = FALSE) { + + # authoritative WT length from provided sequence + letters <- strsplit(wt_seq_aa, "", fixed = TRUE)[[1]] + wt_len <- length(letters) + + # normalize data + df0 <- df %>% + transmute( + position = suppressWarnings(as.numeric(.data[[pos_col]])), + wt_aa_in = .data[[wt_aa_col]], + mut_aa = .data[[mut_aa_col]], + fitness = suppressWarnings(as.numeric(.data[[fitness_col]])) + ) %>% + filter(is.finite(position)) + + # drop any rows that claim positions beyond WT length + if (nrow(df0) && any(df0$position > wt_len, na.rm = TRUE)) { + dropped <- sum(df0$position > wt_len, na.rm = TRUE) + warning(sprintf("Dropping %d row(s) with position > WT length (%d).", dropped, wt_len)) + df0 <- df0 %>% filter(position <= wt_len) + } + + # pad to next multiple of 75 (by rows) + rem <- wt_len %% positions_per_row + pad_need <- if (rem == 0) 0 else positions_per_row - rem + max_paded <- wt_len + pad_need + + # full grid: positions 1..max_paded (so the tail exists), AA set of 21 + all_positions <- seq_len(max_paded) + all_amino_acids <- c("A","C","D","E","F","G","H","I","K","L", + "M","N","P","Q","R","S","T","V","W","Y","*") + + grid_df <- expand.grid(position = all_positions, + mut_aa = all_amino_acids, + KEEP.OUT.ATTRS = FALSE, stringsAsFactors = FALSE) %>% + mutate(is_padded = position > wt_len) + + # join fitness only for real positions (<= wt_len) + fit_df <- df0 %>% select(position, mut_aa, fitness) + d <- grid_df %>% + left_join(fit_df, by = c("position","mut_aa")) + + if (fill_missing_as_zero) { + d$fitness[is.na(d$fitness) & d$position <= wt_len] <- 0 + } + + # authoritative WT AA per real position; tail gets placeholder 'Y' + wt_map <- tibble(position = seq_len(wt_len), wt_aa = letters) + d <- d %>% + left_join(wt_map, by = "position") %>% + mutate(wt_aa = ifelse(is.na(wt_aa) & position > wt_len, "Y", wt_aa)) + + # layout fields + d <- d %>% + mutate( + row_group = ((position - 1) %/% positions_per_row) + 1, + wt_aa_pos = paste0(wt_aa, position), + wt_aa_pos = factor(wt_aa_pos, levels = unique(wt_aa_pos)), + synonymous = mut_aa == wt_aa + ) + + # IMPORTANT: use WT length as the true end of the protein + d$max_pos <- wt_len + d +} + +syn_segments <- function(d, positions_per_row = 75) { + amino_order <- rev(c("G", "A", "V", "L", "M", "I", "F", + "Y", "W", "K", "R", "H", "D", "E", + "S", "T", "C", "N", "Q", "P", "*")) + d %>% + mutate( + mut_aa = factor(mut_aa, levels = amino_order), + x = as.numeric(factor(wt_aa_pos, levels = levels(wt_aa_pos))) - + ((row_group - 1) * positions_per_row), + y = as.numeric(factor(mut_aa, levels = amino_order)) + ) %>% + filter(synonymous, position <= max_pos) +} + +# Draw one solid white rectangle per row group covering the padded tail region +white_tail_rects <- 
function(d, positions_per_row = 75) { + wt_len <- unique(d$max_pos)[1] + if (!is.finite(wt_len)) return(dplyr::tibble()[0,]) + + # if perfectly divisible by 75, there is no tail to cover + if (wt_len %% positions_per_row == 0) return(dplyr::tibble()[0,]) + + # which facet (row group) contains the last real position? + last_group <- ((wt_len - 1) %/% positions_per_row) + 1 + last_local_idx <- ((wt_len - 1) %% positions_per_row) + 1 # 1..75 within the facet + + tibble::tibble( + row_group = last_group, + xmin = last_local_idx + 0.5 - 0.025, # tiny epsilon to avoid hairlines + xmax = positions_per_row + 0.5 + 0.025, + ymin = 0.5 - 0.025, + ymax = 21.5 + 0.025 + ) +} + +plot_heatmap <- function(d, title_text, positions_per_row = 75) { + amino_order <- rev(c("G", "A", "V", "L", "M", "I", "F", + "Y", "W", "K", "R", "H", "D", "E", + "S", "T", "C", "N", "Q", "P", "*")) + d <- d %>% mutate(mut_aa = factor(mut_aa, levels = amino_order)) + + min_f <- suppressWarnings(min(d$fitness, na.rm = TRUE)); if (!is.finite(min_f)) min_f <- 0 + max_f <- suppressWarnings(max(d$fitness, na.rm = TRUE)); if (!is.finite(max_f)) max_f <- 0 + max_orig_pos <- unique(d$max_pos)[1] + + syn <- syn_segments(d, positions_per_row) + rect <- white_tail_rects(d, positions_per_row) + + ggplot(d, aes(x = wt_aa_pos, y = mut_aa, fill = fitness)) + + scale_fill_gradientn( + colours = c("#D73027", "#F0F0F0", "#4575B4"), + values = if ((abs(min_f) + max_f) > 0) c(0, abs(min_f)/(abs(min_f)+max_f), 1) else c(0, 0.5, 1), + na.value = "grey35", + limits = c(min_f, max_f) + ) + + scale_x_discrete( + labels = function(x) { + num <- suppressWarnings(as.numeric(gsub("[^0-9]", "", x))) + ifelse(num > max_orig_pos, " ", x) + }, + expand = expansion(mult = c(0, 0)) # no extra margin area + ) + + geom_tile() + + # Solid white block covering the tail (no pattern / no seams) + { if (nrow(rect)) geom_rect(data = rect, inherit.aes = FALSE, + aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax), + fill = "white", color = NA) } + + geom_segment( + data = syn, + aes(x = x - 0.485, xend = x + 0.485, y = y - 0.485, yend = y + 0.485), + linewidth = 0.2, inherit.aes = FALSE, color = "grey10" + ) + + theme_minimal() + + labs(title = title_text, x = "Wild-type amino acid", y = "Mutant amino acid", fill = "Fitness") + + theme( + plot.title = element_text(size = 16, face = "bold"), + axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 10), + axis.text.y = element_text(size = 10), + axis.title.x = element_text(size = 14), + axis.title.y = element_text(size = 14), + legend.title = element_text(size = 12), + legend.text = element_text(size = 10), + panel.grid.major = element_blank(), + panel.grid.minor = element_blank(), + strip.text = element_blank(), + strip.background = element_blank(), + panel.spacing = grid::unit(0.2, "lines") + ) + + facet_wrap(~ row_group, scales = "free_x", ncol = 1) +} + +# ---------- main callable ---------- +# fitness_table_path : path to fitness_estimation.tsv +# wt_seq_aa_txt_path : path to TXT file containing WT AA sequence (one line) +# output_pdf_path : output PDF (default "fitness_heatmap.pdf") +# positions_per_row : default 75 +run_fitness_rescaled_heatmaps <- function(fitness_table_path, + wt_seq_aa_txt_path, + output_pdf_path = "fitness_heatmap.pdf", + positions_per_row = 75) { + + df <- read.table( + fitness_table_path, sep = "\t", header = TRUE, + check.names = FALSE, quote = "", comment.char = "" + ) + + wt_aa_col <- find_col(df, c("wt aa", "wt_aa", "wt")) + pos_col <- find_col(df, c("pos", "position")) 
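+  # find_col() matches headers case-insensitively and treats runs of
+  # non-alphanumeric characters as "_", so e.g. "mut aa" and "mut_aa" resolve
+  # to the same column (it requires exactly one unambiguous match per name).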
+ mut_aa_col <- find_col(df, c("mut aa", "mut_aa", "aa")) + rescaled_cols <- get_rescaled_cols(df) + + wt_seq_aa <- read_wt_seq_aa_txt(wt_seq_aa_txt_path) + + plots <- list() + + ## 1) Mean first – use existing "mean fitness" column if available + mean_col <- find_mean_col(df) + if (is.null(mean_col)) { + df$`rescaled_fitness_mean` <- if (length(rescaled_cols) == 1) df[[rescaled_cols[1]]] else rowMeans(df[, rescaled_cols], na.rm = TRUE) + mean_col <- "rescaled_fitness_mean" + } + long_df_mean <- build_heatmap_long(df, wt_aa_col, pos_col, mut_aa_col, mean_col, + positions_per_row, wt_seq_aa) + plots[[length(plots) + 1]] <- list( + title = sprintf("Fitness — mean of %d replicate(s)", length(rescaled_cols)), + data = long_df_mean + ) + + ## 2) Then individual replicates + for (i in seq_along(rescaled_cols)) { + col <- rescaled_cols[i] + long_df <- build_heatmap_long(df, wt_aa_col, pos_col, mut_aa_col, col, + positions_per_row, wt_seq_aa) + plots[[length(plots) + 1]] <- list( + title = sprintf("Fitness — rep%d", i), + data = long_df + ) + } + + # Device height: (#row groups × 4) + page_heights <- vapply(plots, function(p) max(p$data$row_group, na.rm = TRUE), numeric(1)) + device_height <- max(4, as.numeric(page_heights) * 4, na.rm = TRUE) + + grDevices::pdf(output_pdf_path, width = 16, height = device_height) + on.exit(try(grDevices::dev.off(), silent = TRUE), add = TRUE) + for (p in plots) print(plot_heatmap(p$data, p$title, positions_per_row)) + invisible(TRUE) +} diff --git a/modules/local/dmsanalysis/bin/gatk_to_fitness.R b/modules/local/dmsanalysis/bin/gatk_to_fitness.R new file mode 100644 index 0000000..ff150b3 --- /dev/null +++ b/modules/local/dmsanalysis/bin/gatk_to_fitness.R @@ -0,0 +1,170 @@ +# input: wt_seq_path, input-gatk-counts filtered by codon library, output-gatk-counts filtered by codon library, start-stop-codon +# output: merged counts file for input and output data that can be fed into DiMSum +# sums up all synonymous mutations, assigning them to the wildtype sequence in the first line + + +# suppressMessages(library(seqinr)) +# suppressMessages(library(Biostrings)) + +# generate_dimsum_input <- function(wt_seq_path, gatk_input, gatk_output, pos_range) { +# # Parse the position range +# positions <- unlist(strsplit(pos_range, "-")) +# start_pos <- as.numeric(positions[1]) +# stop_pos <- as.numeric(positions[2]) +# +# # Load the wild-type sequence +# seq_data <- Biostrings::readDNAStringSet(filepath = wt_seq_path) +# wt_seq <- seq_data[[1]] # Extract the sequence +# wt_seq <- subseq(wt_seq, start = start_pos, end = stop_pos) +# +# # Convert wt_seq to a character string +# wt_seq <- as.character(wt_seq) +# +# # Split the wild-type sequence into codons (groups of 3 bases) +# wt_codons <- substring(wt_seq, seq(1, nchar(wt_seq), 3), seq(3, nchar(wt_seq), 3)) +# +# # Helper function to process GATK CSVs into count data +# process_gatk_file <- function(gatk_csv) { +# # Load the input GATK CSV file +# gatk_data <- read.csv(gatk_csv, stringsAsFactors = FALSE) +# +# # Calculate the sum of all synonymous mutation counts +# synonymous_counts <- sum(gatk_data$counts[grep("^S:", gatk_data$aa_mut)]) +# +# # Initialize a data frame with the wild-type sequence and its synonymous counts +# results <- data.frame( +# nt_seq = wt_seq, +# count = synonymous_counts, +# stringsAsFactors = FALSE +# ) +# +# # Iterate over each row in the input data +# for (i in 1:nrow(gatk_data)) { +# # Extract the mutation info +# codon_mut <- gatk_data$codon_mut[i] +# counts <- gatk_data$counts[i] +# +# # Create 
a mutable copy of the wild-type codons +# mutated_codons <- wt_codons +# +# # Apply the mutation +# mutations <- strsplit(codon_mut, ", ")[[1]] +# for (mutation in mutations) { +# codon_position <- as.numeric(sub(":.*", "", mutation)) +# new_codon <- sub(".*>", "", mutation) +# # Replace the codon at the specified position +# mutated_codons[codon_position] <- new_codon +# } +# +# # Convert the mutated codons back to a sequence string +# mutated_seq_string <- paste(mutated_codons, collapse = "") +# +# # Add the result to the data frame +# results <- rbind(results, data.frame(nt_seq = mutated_seq_string, count = counts)) +# } +# +# return(results) +# } +# +# # Process the GATK input and output files +# cat("Processing GATK input file...\n") +# input_data <- process_gatk_file(gatk_input) +# colnames(input_data)[2] <- "input1" # Rename count column +# +# cat("Processing GATK output file...\n") +# output_data <- process_gatk_file(gatk_output) +# colnames(output_data)[2] <- "output1" # Rename count column +# +# # Merge the input and output data +# merged_data <- merge(input_data, output_data, by = "nt_seq", all = TRUE) +# merged_data[is.na(merged_data)] <- 0 # Replace NA with 0 for missing counts +# +# # Write the merged data to a file +# output_file <- "merged_counts_for_dimsum.txt" +# write.table(merged_data, file = output_file, sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE) +# cat("Merged counts file created:", output_file, "\n") +# } +# + +# +# generate_dimsum_input( +# "/Users/benjaminwehnert/CRG/DMS_QC/testing_data/MORtn5_reference.fa", +# "/Users/benjaminwehnert/CRG/DMS_QC/testing_data/gatk_filtered_by_codon_library.csv", +# "/Users/benjaminwehnert/CRG/DMS_QC/testing_data/gatk_filtered_by_codon_library_dummy.csv", +# "23-1225" +# ) + + +suppressMessages(library(Biostrings)) + +generate_fitness_input <- function(wt_seq_path, gatk_file, pos_range, output_file_path) { + # Parse the position range + positions <- unlist(strsplit(pos_range, "-")) + start_pos <- as.numeric(positions[1]) + stop_pos <- as.numeric(positions[2]) + + # Load the wild-type sequence + seq_data <- Biostrings::readDNAStringSet(filepath = wt_seq_path) + wt_seq <- seq_data[[1]] # Extract the sequence + wt_seq <- subseq(wt_seq, start = start_pos, end = stop_pos) + + # Convert wt_seq to a character string + wt_seq <- as.character(wt_seq) + + # Split the wild-type sequence into codons (groups of 3 bases) + wt_codons <- substring(wt_seq, seq(1, nchar(wt_seq), 3), seq(3, nchar(wt_seq), 3)) + + # Helper function to process GATK CSVs into count data + process_gatk_file <- function(gatk_csv) { + # Load the input GATK CSV file + gatk_data <- read.csv(gatk_csv, stringsAsFactors = FALSE) + + # Initialize a data frame for results + results <- data.frame( + nt_seq = character(), + count = numeric(), + stringsAsFactors = FALSE + ) + + # Iterate over each row in the input data + for (i in 1:nrow(gatk_data)) { + # Extract the mutation info + codon_mut <- gatk_data$codon_mut[i] + counts <- gatk_data$counts[i] + + # Create a mutable copy of the wild-type codons + mutated_codons <- wt_codons + + # Apply the mutation + mutations <- strsplit(codon_mut, ", ")[[1]] + for (mutation in mutations) { + codon_position <- as.numeric(sub(":.*", "", mutation)) + new_codon <- sub(".*>", "", mutation) + # Replace the codon at the specified position + mutated_codons[codon_position] <- new_codon + } + + # Convert the mutated codons back to a sequence string + mutated_seq_string <- paste(mutated_codons, collapse = "") + + # Add the result to 
the data frame + results <- rbind(results, data.frame(nt_seq = mutated_seq_string, count = counts)) + } + + return(results) + } + + # Process the GATK file + cat("Processing GATK file...\n") + processed_data <- process_gatk_file(gatk_file) + + # Write the processed data to a file without column names + write.table(processed_data, file = output_file_path, sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE) +} + +# generate_fitness_input( +# "/Users/benjaminwehnert/CRG/DMS_QC/testing_data/MORtn5_reference.fa", +# "/Users/benjaminwehnert/CRG/DMS_QC/testing_data/gatk_filtered_by_codon_library.csv", +# "23-1225", +# "/Users/benjaminwehnert/CRG/DMS_QC/testing_data/testing_outputs/dimsum_input.tsv" +# ) diff --git a/modules/local/dmsanalysis/bin/global_position_biases_counts_and_counts_per_cov.R b/modules/local/dmsanalysis/bin/global_position_biases_counts_and_counts_per_cov.R new file mode 100644 index 0000000..51c5283 --- /dev/null +++ b/modules/local/dmsanalysis/bin/global_position_biases_counts_and_counts_per_cov.R @@ -0,0 +1,149 @@ +# input: prefiltered gatk path, aa seq path, window_size (sliding window), output_path_folder +# output: two lineplots showing counts & counts per coverage divided in types of variants (single-/double-/triple-base exchange) + +library(zoo) # sliding window +library(dplyr) +library(ggplot2) +library(scales) + +position_biases <- function(prefiltered_gatk_path, aa_seq_path, window_size = 10) { + + # Load and process the data + prefiltered_gatk <- read.table(prefiltered_gatk_path, sep = ",", fill = NA, header = TRUE) + prefiltered_gatk$pos <- as.numeric(sub("(\\D)(\\d+)(\\D)", "\\2", prefiltered_gatk$pos_mut)) + unique_pos <- unique(as.numeric(prefiltered_gatk$pos)) + aa_seq <- readLines(aa_seq_path, warn = FALSE) + aa_seq_length <- nchar(aa_seq) + aa_positions <- seq(nchar(aa_seq)) + + means_counts_single <- rep(NA, nchar(aa_seq)) + means_counts_double <- rep(NA, nchar(aa_seq)) + means_counts_triple <- rep(NA, nchar(aa_seq)) + means_counts_per_cov_single <- rep(NA, nchar(aa_seq)) + means_counts_per_cov_double <- rep(NA, nchar(aa_seq)) + means_counts_per_cov_triple <- rep(NA, nchar(aa_seq)) + + + # Loop through each position in the amino acid sequence + for (i in 1:(nchar(aa_seq))) { + + # Filter the data for the current position in aa_positions + window_data <- prefiltered_gatk %>% filter(prefiltered_gatk$pos %in% aa_positions[i]) + + # Calculate mean for Single mutations (where varying_bases == 1) + window_data_single <- window_data %>% filter(varying_bases == 1) + means_counts_single[i] <- mean(window_data_single$counts, na.rm = FALSE) + means_counts_per_cov_single[i] <- mean(window_data_single$counts_per_cov, na.rm = FALSE) + + # Calculate mean for Double mutations (where varying_bases == 2) + window_data_double <- window_data %>% filter(varying_bases == 2) + means_counts_double[i] <- mean(window_data_double$counts, na.rm = FALSE) + means_counts_per_cov_double[i] <- mean(window_data_double$counts_per_cov, na.rm = FALSE) + + # Calculate mean for Triple mutations (where varying_bases == 3) + window_data_triple <- window_data %>% filter(varying_bases == 3) + means_counts_triple[i] <- mean(window_data_triple$counts, na.rm = FALSE) + means_counts_per_cov_triple[i] <- mean(window_data_triple$counts_per_cov, na.rm = FALSE) + } + + pos_bias_df <- data.frame(pos = seq(nchar(aa_seq))) + + + pos_bias_df$rolling_mean_counts_single <- rollapply(means_counts_single, width = window_size, + FUN = function(x) (mean(x, na.rm = TRUE) + 0.00001), fill = "extend") + 
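+  # Note: each rolling mean below uses a window of `window_size` amino-acid positions
+  # ("extend" pads the ends), and the small 0.00001 offset keeps positions with a
+  # mean of zero representable on the log10-scaled y-axis used for the plots.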
pos_bias_df$rolling_mean_counts_double <- rollapply(means_counts_double, width = window_size, + FUN = function(x) (mean(x, na.rm = TRUE) + 0.00001), fill = "extend") + pos_bias_df$rolling_mean_counts_triple <- rollapply(means_counts_triple, width = window_size, + FUN = function(x) (mean(x, na.rm = TRUE) + 0.00001), fill = "extend") + + pos_bias_df$rolling_mean_counts_per_cov_single <- rollapply(means_counts_per_cov_single, width = window_size, + FUN = function(x) (mean(x, na.rm = TRUE) + 0.00001), fill = "extend") + pos_bias_df$rolling_mean_counts_per_cov_double <- rollapply(means_counts_per_cov_double, width = window_size, + FUN = function(x) (mean(x, na.rm = TRUE) + 0.00001), fill = "extend") + pos_bias_df$rolling_mean_counts_per_cov_triple <- rollapply(means_counts_per_cov_triple, width = window_size, + FUN = function(x) (mean(x, na.rm = TRUE) + 0.00001), fill = "extend") + + + # Replace NAs with 0 for rolling means and SE + pos_bias_df$rolling_mean_counts_single[is.na(pos_bias_df$rolling_mean_counts_single)] <- 0.00001 + pos_bias_df$rolling_mean_counts_double[is.na(pos_bias_df$rolling_mean_counts_double)] <- 0.00001 + pos_bias_df$rolling_mean_counts_triple[is.na(pos_bias_df$rolling_mean_counts_triple)] <- 0.00001 + pos_bias_df$rolling_mean_counts_per_cov_single[is.na(pos_bias_df$rolling_mean_counts_per_cov_single)] <- 0.000001 + pos_bias_df$rolling_mean_counts_per_cov_double[is.na(pos_bias_df$rolling_mean_counts_per_cov_double)] <- 0.000001 + pos_bias_df$rolling_mean_counts_per_cov_triple[is.na(pos_bias_df$rolling_mean_counts_per_cov_triple)] <- 0.000001 + + + + plots_theme <- list( + + # Add the minimal theme to the list + theme_minimal(), + + # Customize legend title and appearance + theme(legend.title = element_text(size = 10, face = "bold"), + legend.position = "right"), + + # Customize guides for the legend elements + guides( + color = guide_legend(title = "Mutation Type", order = 1), # Title for line color + fill = guide_legend(title = "Standard Deviation", order = 2), # Title for ribbon fill + linetype = guide_legend(title = "Required Coverage", order = 3) # Title for line type + ) + ) + + + # Placeholder name for the linetype + linetype_placeholder <- "Required coverage" + + rolling_counts_plot <- ggplot(pos_bias_df, aes(x = pos)) + + + # Add line for rolling mean coverage + geom_line(aes(y = rolling_mean_counts_single, color = "One Varying Base")) + + geom_line(aes(y = rolling_mean_counts_double, color = "Two Varying Bases")) + + geom_line(aes(y = rolling_mean_counts_triple, color = "Three Varying Bases")) + + + # Individual axis labels for this plot + xlab("Amino Acid Position") + + ylab("Counts") + + + # Manually set color and fill labels + scale_color_manual(name = "Mutation Type", + values = c("One Varying Base" = "chocolate", "Two Varying Bases" = "darkolivegreen3", "Three Varying Bases" = "deepskyblue1"), + limits = c("One Varying Base", "Two Varying Bases", "Three Varying Bases")) + # Color for line + + # Apply the saved theme and design elements at the end + plots_theme + + + scale_y_continuous(trans = 'log10', labels = scales::comma) + + + + + rolling_counts_per_cov_plot <- ggplot(pos_bias_df, aes(x = pos)) + + + # Add line for rolling mean coverage + geom_line(aes(y = rolling_mean_counts_per_cov_single, color = "One Varying Base")) + + geom_line(aes(y = rolling_mean_counts_per_cov_double, color = "Two Varying Bases")) + + geom_line(aes(y = rolling_mean_counts_per_cov_triple, color = "Three Varying Bases")) + + + # Individual axis labels for this plot + xlab("Amino 
Acid Position") + + ylab("Counts") + + + # Manually set color and fill labels + scale_color_manual(name = "Mutation Type", + values = c("One Varying Base" = "chocolate", "Two Varying Bases" = "darkolivegreen3", "Three Varying Bases" = "deepskyblue1"), + limits = c("One Varying Base", "Two Varying Bases", "Three Varying Bases")) + # Color and legend order for lines + + # Apply the saved theme and design elements at the end + plots_theme + + + scale_y_continuous(trans = 'log10', labels = scales::comma) + + + ggsave(filename = "rolling_counts.pdf", plot = rolling_counts_plot, device = "pdf", width = 10, height = 6) + ggsave(filename = "rolling_counts_per_cov.pdf", plot = rolling_counts_per_cov_plot, device = "pdf", width = 10, height = 6) +} + +# Example call to the function +#position_biases("/Users/benjaminwehnert/CRG/DMS_QC/testing_data/gatk_filtered_by_codon_library.csv", "/Users/benjaminwehnert/CRG/DMS_QC/testing_data/aa_seq.txt", window_size = 18, output_path_folder = "/Users/benjaminwehnert/CRG/DMS_QC/testing_data/testing_outputs") diff --git a/modules/local/dmsanalysis/bin/global_position_biases_cov.R b/modules/local/dmsanalysis/bin/global_position_biases_cov.R new file mode 100644 index 0000000..2f2235c --- /dev/null +++ b/modules/local/dmsanalysis/bin/global_position_biases_cov.R @@ -0,0 +1,82 @@ +# input: prefiltered gatk path, aa seq path, window_size (sliding window), output_path_folder, aimed counts per aa variant +# output: lineplot showing coverage over positions and dotted line: Assuming all potential 21 variants are equally distributed, this is the coverage one need to get the aimed counts per variant. + +library(zoo) # sliding window +library(dplyr) +library(ggplot2) +library(scales) +position_biases <- function(prefiltered_gatk_path, aa_seq_path, window_size = 10, output_file_path, targeted_counts_per_aa_variant = 100) { + + # Load and process the data + prefiltered_gatk <- read.table(prefiltered_gatk_path, sep = ",", fill = NA, header = TRUE) + prefiltered_gatk$pos <- as.numeric(sub("(\\D)(\\d+)(\\D)", "\\2", prefiltered_gatk$pos_mut)) + unique_pos <- unique(as.numeric(prefiltered_gatk$pos)) + aa_seq <- readLines(aa_seq_path, warn = FALSE) + aa_seq_length <- nchar(aa_seq) + aa_positions <- seq(nchar(aa_seq)) + + means_cov <- rep(NA, nchar(aa_seq)) + + # Calculate means for cov (should be the same over all variants in one position) + for (i in 1:(nchar(aa_seq))) { + window_data <- prefiltered_gatk %>% filter(prefiltered_gatk$pos %in% aa_positions[i]) + means_cov[i] <- mean(window_data$cov, na.rm = FALSE) + } + + pos_bias_df <- data.frame(pos = seq(nchar(aa_seq))) + + # Log-transform the rolling means to avoid issues with zeros (log(y + 0.001)) + pos_bias_df$rolling_mean_cov <- rollapply(means_cov, width = window_size, + FUN = function(x) (mean(x, na.rm = TRUE)), fill = "extend") + + # Generate the plot + plots_theme <- list( + + # Customize legend appearance (leave titles blank) + guides( + color = guide_legend(title = NULL, order = 1), # Title for line color + fill = guide_legend(title = NULL, order = 2), # Title for ribbon fill + linetype = guide_legend(title = NULL, order = 3) # Title for line type + ), + + # Apply minimal theme + theme_minimal() + ) + + + linetype_placeholder <- "Required coverage" + + rolling_cov_plot <- ggplot(pos_bias_df, aes(x = pos, y = rolling_mean_cov)) + + + # Add line for rolling mean coverage + geom_line(aes(color = "Coverage")) + + + # Add a horizontal line with a mapped linetype so it appears in the legend + geom_hline(aes(yintercept = ((21 
* nchar(aa_seq) * targeted_counts_per_aa_variant)), + linetype = linetype_placeholder), + color = "black", linewidth = 0.3) + + + # Individual axis labels for this plot + xlab("Amino Acid Position") + + ylab("Coverage") + + + # Manually set color and fill labels + scale_color_manual(values = c("Coverage" = "black")) + # Color for line + + # Set the linetype with a label that uses the targeted threshold dynamically + scale_linetype_manual(values = c("Required coverage" = "dotted"), + labels = paste("Required coverage \n for", as.character(targeted_counts_per_aa_variant), "counts per variant \n if equally present")) + + + # Apply the saved theme and design elements at the end + plots_theme + + + # Set y-axis to log scale and apply comma formatting + scale_y_continuous(trans = 'log10', labels = scales::comma) + + + # Return the plot + ggsave(filename = output_file_path, plot = rolling_cov_plot, device = "pdf", width = 10, height = 6) +} + +# Example call to the function +#position_biases("/Users/benjaminwehnert/CRG/DMS_QC/testing_data/gatk_filtered_by_codon_library.csv", "/Users/benjaminwehnert/CRG/DMS_QC/testing_data/aa_seq.txt", window_size = 18, output_path_folder = "/Users/benjaminwehnert/CRG/DMS_QC/testing_data/testing_outputs/rolling_coverage.pdf", targeted_counts_per_aa_variant = 15) diff --git a/modules/local/dmsanalysis/bin/install_packages.R b/modules/local/dmsanalysis/bin/install_packages.R new file mode 100644 index 0000000..7846c92 --- /dev/null +++ b/modules/local/dmsanalysis/bin/install_packages.R @@ -0,0 +1,81 @@ +# List of required packages +required_packages <- c("Biostrings", "dplyr", "ggplot2", "scales", "zoo", "reshape2", "tidyr", "tidyverse") + +# Function to install and load missing packages +install_if_missing <- function(pkg) { + # Check if package is already installed + if (!requireNamespace(pkg, quietly = TRUE)) { + message(paste("Installing package:", pkg)) + + # Ensure BiocManager is installed for Bioconductor packages + if (!requireNamespace("BiocManager", quietly = TRUE)) { + install.packages("BiocManager", repos = "https://cloud.r-project.org") # Set a CRAN mirror + } + + # Install Bioconductor packages via BiocManager + if (pkg %in% c("Biostrings", "GenomicRanges", "IRanges")) { # Add more Bioconductor packages if needed + BiocManager::install(pkg) + } else { + # Install CRAN packages + install.packages(pkg, repos = "https://cloud.r-project.org") + } + } else { + message(paste("Package", pkg, "is already installed")) + } +} + +# Install and load all required packages +invisible(lapply(required_packages, install_if_missing)) + +message("All required packages are installed and loaded.") + + + + + + + + + + + + + + + + + + + +##Find current versions of dependencies: +# List of your packages +packages <- c("dplyr", "ggplot2", "scales", "zoo", "reshape2", "tidyr", "tidyverse", "Biostrings") + +# Function to get the current version of each package +get_package_version <- function(pkg) { + if (requireNamespace(pkg, quietly = TRUE)) { + return(packageVersion(pkg)) + } else { + return(NA) + } +} + +# Check and print versions +package_versions <- sapply(packages, get_package_version) +print(package_versions) + + + +# Check the version of libcurl +system("curl --version") + +# Check the version of OpenSSL +system("openssl version") + +# Check the version of libxml2 +system("xml2-config --version") + +# Check the version of zlib +system("zlib-flate -version") + +system("ldconfig -p | grep zlib") diff --git a/modules/local/dmsanalysis/bin/logdiff.R 
b/modules/local/dmsanalysis/bin/logdiff.R new file mode 100644 index 0000000..246cffe --- /dev/null +++ b/modules/local/dmsanalysis/bin/logdiff.R @@ -0,0 +1,146 @@ + +# Maybe develop additional ideas to characterise variants below 10 percentile -> Do we find patterns that help the user to understand why certain variants are hard to count? + +# input: completed_prefiltered_gatk path, output folder path +# output: two logdiff-plots (counts_per_cov): 1st lineplot 2nd dotplot showing type of variant (varying bases - 1/2/3) + +library(dplyr) +library(ggplot2) +library(scales) + + logdiff_plot <- function(prefiltered_gatk_path) { + + # Load the data + prefiltered_gatk <- read.table(prefiltered_gatk_path, sep = ",", fill = NA, header = TRUE) + + # Sort by counts_per_cov while keeping corresponding varying_bases + sorted_counts <- prefiltered_gatk %>% + arrange(counts_per_cov) # Sort by counts_per_cov + + # Create a new column for the sorted index (1, 2, 3, ...) + sorted_counts$ids <- 1:nrow(sorted_counts) + + # Calculate Q1 (10% quantile) and Q3 (90% quantile) + Q1 <- quantile(sorted_counts$counts_per_cov, 0.10, na.rm = TRUE) + Q3 <- quantile(sorted_counts$counts_per_cov, 0.90, na.rm = TRUE) + + # Calculate the LogDiff + LogDiff <- log10(Q3) - log10(Q1) + + # Create the first plot with a line + line_plot <- ggplot(sorted_counts, aes(x = ids, y = counts_per_cov)) + + geom_line(color = "black") + + + # Set axis labels + xlab("Variants") + + ylab("Counts per Coverage") + + + # Apply logarithmic scale to the y-axis + scale_y_continuous(trans = 'log10') + + + # Add horizontal dotted lines at Q1 and Q3 + geom_hline(yintercept = Q1, linetype = "dotted", color = "black") + + geom_hline(yintercept = Q3, linetype = "dotted", color = "black") + + + # Add the LogDiff value to the top left corner + annotate("text", x = 0, y = max(sorted_counts$counts_per_cov), label = paste("LogDiff =", round(LogDiff, 2)), hjust = 0, vjust = 1, size = 5, color = "black") + + + # Apply the minimal theme + theme_minimal() + + # Save the line plot as a PDF + ggsave(filename = "logdiff_plot.pdf", plot = line_plot, device = "pdf", width = 10, height = 6) + + # Create the second plot with colored dots based on varying_bases + colored_plot <- ggplot(sorted_counts, aes(x = ids, y = counts_per_cov, color = as.factor(varying_bases))) + + + # Add horizontal dotted lines at Q1 and Q3 + geom_hline(yintercept = Q1, linetype = "dotted", color = "black") + + geom_hline(yintercept = Q3, linetype = "dotted", color = "black") + + + geom_point(size = 0.9) + # Use points instead of lines + + # Set axis labels + xlab("Variants") + + ylab("Counts per Coverage") + + + # Apply logarithmic scale to the y-axis + scale_y_continuous(trans = 'log10') + + + # Add the LogDiff value to the top left corner + annotate("text", x = 1, y = max(sorted_counts$counts_per_cov), label = paste("LogDiff =", round(LogDiff, 2)), hjust = 0, vjust = 1, size = 5, color = "black") + + + # Add color scale for varying_bases + scale_color_manual(values = c("1" = "chocolate", "2" = "darkolivegreen3", "3" = "deepskyblue1"), name = "Varying \n Bases") + + + # Apply the minimal theme + theme_minimal() + + # Save the colored plot as a PDF + ggsave(filename = "logdiff_varying_bases.pdf", plot = colored_plot, device = "pdf", width = 10, height = 6) + } + + # Example call of the function +# logdiff_plot("/Users/benjaminwehnert/CRG/DMS_QC/testing_data/testing_outputs/completed_prefiltered_gatk.csv", "/Users/benjaminwehnert/CRG/DMS_QC/testing_data/testing_outputs") + + + + + + # 
logdiff_plot_codon_mut <- function(prefiltered_gatk_path, output_folder_path) { + # + # # Load the data + # prefiltered_gatk <- read.table(prefiltered_gatk_path, sep = ",", fill = NA, header = TRUE) + # + # # Extract the codon number from the codon_mut column (e.g., '377' from '377:TCC>CTG') + # prefiltered_gatk <- prefiltered_gatk %>% + # mutate(position = as.numeric(gsub(":.*", "", codon_mut))) # Extract the number before the colon + # + # # Sort by counts_per_cov while keeping corresponding position + # sorted_counts <- prefiltered_gatk %>% + # arrange(counts_per_cov) # Sort by counts_per_cov + # + # # Create a new column for the sorted index (1, 2, 3, ...) + # sorted_counts$ids <- 1:nrow(sorted_counts) + # + # # Calculate Q1 (10% quantile) and Q3 (90% quantile) + # Q1 <- quantile(sorted_counts$counts_per_cov, 0.10, na.rm = TRUE) + # Q3 <- quantile(sorted_counts$counts_per_cov, 0.90, na.rm = TRUE) + # + # # Calculate the LogDiff + # LogDiff <- log10(Q3) - log10(Q1) + # + # # Create the second plot with a larger gray area and preserved horizontal lines and LogDiff + # codon_mut_plot <- ggplot(sorted_counts, aes(x = ids, y = counts_per_cov, color = position)) + + # + # # Add horizontal dotted lines in the background + # geom_hline(yintercept = Q1, linetype = "dotted", color = "black") + + # geom_hline(yintercept = Q3, linetype = "dotted", color = "black") + + # + # # Add points with smaller size + # geom_point(size = 0.9) + + # + # # Set axis labels + # xlab("Sorted Index") + + # ylab("Counts per Coverage") + + # + # # Apply logarithmic scale to the y-axis + # scale_y_continuous(trans = 'log10', labels = scales::comma) + + # + # # Add the LogDiff value to the top left corner + # annotate("text", x = 1, y = max(sorted_counts$counts_per_cov), label = paste("LogDiff =", round(LogDiff, 2)), hjust = 0, vjust = 1, size = 5, color = "black") + + # + # # Add a color gradient with a larger gray area + # scale_color_gradient2(low = "blue", mid = "lightgray", high = "green", midpoint = median(sorted_counts$position, na.rm = TRUE), name = "Position") + + # + # # Apply the minimal theme + # theme_minimal() + # + # # Save the second plot as a PDF + # ggsave(filename = paste(output_folder_path, "/logdiff_position.pdf", sep = ""), + # plot = codon_mut_plot, device = "pdf", width = 10, height = 6) + # } + # + # # Example call + # logdiff_plot_codon_mut("/Users/benjaminwehnert/CRG/DMS_QC/testing_data/testing_outputs/completed_prefiltered_gatk.csv", "/Users/benjaminwehnert/CRG/DMS_QC/testing_data/testing_outputs") + # diff --git a/modules/local/dmsanalysis/bin/low_count_variants.R b/modules/local/dmsanalysis/bin/low_count_variants.R new file mode 100644 index 0000000..03cf8a7 --- /dev/null +++ b/modules/local/dmsanalysis/bin/low_count_variants.R @@ -0,0 +1,127 @@ +#!!!UNFINISHED!!! --> Decide if those graphics make sense for users. 
If so, further develop them and put everything in a function :) + + +library(tidyverse) + +completed_prefiltered_gatk <- read.csv("/Users/benjaminwehnert/CRG/DMS_QC/testing_data/completed_prefiltered_gatk.csv") +View(completed_prefiltered_gatk) + +# Lade die notwendigen Pakete +library(dplyr) +library(tidyr) + +# Funktion zur Klassifizierung der Basenwechsel +classify_base_changes <- function(wt_codon, variant_codon) { + # Definition der Purine und Pyrimidine + purines <- c("A", "G") + pyrimidines <- c("C", "T") + + # Initialisiere die Zähler + pur_pur <- 0 + pur_pyr <- 0 + pyr_pur <- 0 + pyr_pyr <- 0 + + # Vergleiche jede Base zwischen wt_codon und variant_codon + for (i in 1:3) { + wt_base <- substr(wt_codon, i, i) + var_base <- substr(variant_codon, i, i) + + # Klassifiziere den Basenwechsel + if (wt_base %in% purines && var_base %in% purines) { + pur_pur <- pur_pur + 1 + } else if (wt_base %in% purines && var_base %in% pyrimidines) { + pur_pyr <- pur_pyr + 1 + } else if (wt_base %in% pyrimidines && var_base %in% purines) { + pyr_pur <- pyr_pur + 1 + } else if (wt_base %in% pyrimidines && var_base %in% pyrimidines) { + pyr_pyr <- pyr_pyr + 1 + } + } + + # Erstelle eine Ausgabe als DataFrame + return(data.frame(pur_pur, pur_pyr, pyr_pur, pyr_pyr)) +} + +# Beispielanwendung auf den Datensatz +completed_prefiltered_gatk <- completed_prefiltered_gatk %>% + rowwise() %>% + mutate(base_change_classification = list(classify_base_changes(wt_codon, Variant))) %>% + unnest(base_change_classification) + + + + + + + + +# Load necessary packages +library(dplyr) +library(ggplot2) + +# 1. Data preparation +# Create a binary variable for low vs. high counts_per_cov +threshold <- quantile(completed_prefiltered_gatk$counts_per_cov, 0.10, na.rm = TRUE) +completed_prefiltered_gatk <- completed_prefiltered_gatk %>% + mutate(group = ifelse(counts_per_cov < threshold, "Bottom 10%", "Top 90%")) + +# 2. Calculate the mean frequency for each category in both groups +frequency_data <- completed_prefiltered_gatk %>% + group_by(group) %>% + summarise( + pur_pur_mean = mean(pur_pur, na.rm = TRUE), + pur_pyr_mean = mean(pur_pyr, na.rm = TRUE), + pyr_pur_mean = mean(pyr_pur, na.rm = TRUE), + pyr_pyr_mean = mean(pyr_pyr, na.rm = TRUE) + ) %>% + pivot_longer(cols = c(pur_pur_mean, pur_pyr_mean, pyr_pur_mean, pyr_pyr_mean), + names_to = "mutation_type", + values_to = "mean_frequency") %>% + # Ensure that all mutation types are present + complete(mutation_type, group, fill = list(mean_frequency = 0)) + +# 3. Visualize the distribution of frequencies for the top 90% and bottom 10% +ggplot(frequency_data, aes(x = mutation_type, y = mean_frequency, fill = group)) + + geom_bar(stat = "identity", position = "dodge") + + labs(x = "Mutation Type", y = "Mean Frequency", title = "Distribution of Mutation Types for Top 90% vs. Bottom 10%") + + scale_fill_manual(values = c("Top 90%" = "steelblue", "Bottom 10%" = "tomato")) + + theme_minimal() + + scale_x_discrete(labels = c("pur_pur" = "Purine > Purine", "pur_pyr" = "Purine > Pyrimidine", + "pyr_pur" = "Pyrimidine > Purine", "pyr_pyr" = "Pyrimidine > Pyrimidine")) + + + + + + + + + + +threshold <- quantile(completed_prefiltered_gatk$counts_per_cov, 0.10, na.rm = TRUE) +completed_prefiltered_gatk <- completed_prefiltered_gatk %>% + mutate(group = ifelse(counts_per_cov < threshold, "Bottom 10%", "Top 90%")) + +# 2. 
Create a flag for variants with at least one pur_pyr or pur_pur +completed_prefiltered_gatk <- completed_prefiltered_gatk %>% + mutate(has_pur_mutation = ifelse(pyr_pur > 0 | pur_pyr > 0, 1, 0)) + +# 3. Calculate the proportion of variants with at least one pur_pyr or pur_pur for each group +proportion_data <- completed_prefiltered_gatk %>% + group_by(group) %>% + summarise( + total_variants = n(), + variants_with_pur_mutation = sum(has_pur_mutation), + proportion_with_pur_mutation = variants_with_pur_mutation / total_variants + ) + +# 4. Print the results +print(proportion_data) + +# 5. Optional: Visualize the proportion for each group +ggplot(proportion_data, aes(x = group, y = proportion_with_pur_mutation, fill = group)) + + geom_bar(stat = "identity") + + labs(x = "Group", y = "Proportion of Variants", title = "Proportion of Variants with at Least One Pur_pyr or Pur_pur Mutation") + + scale_fill_manual(values = c("Top 90%" = "steelblue", "Bottom 10%" = "tomato")) + + theme_minimal() diff --git a/modules/local/dmsanalysis/bin/merge_counts.R b/modules/local/dmsanalysis/bin/merge_counts.R new file mode 100644 index 0000000..2956351 --- /dev/null +++ b/modules/local/dmsanalysis/bin/merge_counts.R @@ -0,0 +1,96 @@ +# merge_counts.R +# Merges DiMSum-ready count tables into a single matrix with columns: +# nt_seq, input1..inputN, output1..outputM + +# ---- Core function ---- +merge_dimsum_counts <- function(input_paths, output_paths, out_path = "counts.tsv") { + # input_paths, output_paths: character vectors of file paths + # out_path: output TSV path + + # Helper to read a 2-col TSV without header -> data.frame(nt_seq, count) + read_counts <- function(fp) { + df <- utils::read.table( + fp, header = FALSE, sep = "\t", quote = "", + col.names = c("nt_seq", "count"), + colClasses = c("character", "numeric"), + comment.char = "", check.names = FALSE + ) + df + } + + # Read all inputs / outputs + input_list <- lapply(input_paths, read_counts) + output_list <- lapply(output_paths, read_counts) + + # Collect universe of sequences + all_seqs <- unique(c( + unlist(lapply(input_list, function(x) x$nt_seq)), + unlist(lapply(output_list, function(x) x$nt_seq)) + )) + + # Pre-allocate output frame + n_in <- length(input_list) + n_out <- length(output_list) + col_names <- c( + "nt_seq", + if (n_in > 0) paste0("input", seq_len(n_in)) else character(0), + if (n_out > 0) paste0("output", seq_len(n_out)) else character(0) + ) + # initialize numeric with 0 counts + out <- data.frame( + nt_seq = all_seqs, + matrix(0, nrow = length(all_seqs), ncol = n_in + n_out), + stringsAsFactors = FALSE, check.names = FALSE + ) + names(out) <- col_names + + # Fill inputs + if (n_in > 0) { + for (i in seq_len(n_in)) { + df <- input_list[[i]] + idx <- match(df$nt_seq, out$nt_seq) + out[idx, paste0("input", i)] <- df$count + } + } + + # Fill outputs + if (n_out > 0) { + for (j in seq_len(n_out)) { + df <- output_list[[j]] + idx <- match(df$nt_seq, out$nt_seq) + out[idx, paste0("output", j)] <- df$count + } + } + + # Write TSV with header, no row names, no quotes + utils::write.table(out, file = out_path, sep = "\t", row.names = FALSE, col.names = TRUE, quote = FALSE) + invisible(out) +} + +# ---- Lightweight CLI wrapper ---- +# Usage: +# Rscript merge_counts.R --inputs ... --outputs ... 
--out counts.tsv +if (sys.nframe() == 0) { + args <- commandArgs(trailingOnly = TRUE) + + # simple flag parser that supports space-separated lists + get_vals <- function(flag) { + if (!(flag %in% args)) return(character(0)) + start <- which(args == flag) + stop <- which(args %in% c("--inputs","--outputs","--out")) + stop <- min(c(stop[stop > start], length(args) + 1)) - 1 + if (stop <= start) return(character(0)) + args[(start + 1):stop] + } + + input_paths <- get_vals("--inputs") + output_paths <- get_vals("--outputs") + out_path <- get_vals("--out") + out_path <- if (length(out_path)) out_path[1] else "counts.tsv" + + if (!length(input_paths) && !length(output_paths)) { + stop("No inputs/outputs provided. Use --inputs and/or --outputs .") + } + + merge_dimsum_counts(input_paths, output_paths, out_path) +} diff --git a/modules/local/dmsanalysis/bin/possible_mutations.R b/modules/local/dmsanalysis/bin/possible_mutations.R new file mode 100644 index 0000000..97ce5fa --- /dev/null +++ b/modules/local/dmsanalysis/bin/possible_mutations.R @@ -0,0 +1,100 @@ +# input: wildtype-seq (string or fasta file), start&stopp pos., output_folder_path, mutagenesis_type (choose from nnk, nns, max_diff_to_wt, custom), if you choose custom, add: custom_codon_library -> comma-separated .txt (-> "AAA, AAC, AAG, AAT, ...") +# output: .csv with all possible programmed codons for each position + +suppressMessages(library(Biostrings)) + +# Define the function +generate_possible_variants <- function(wt_seq_input, start_stop_pos, mutagenesis_type = "nnk", + custom_codon_library = NULL, output_file) { + # Parse the start and stop positions from the input format "start-stop" + positions <- unlist(strsplit(start_stop_pos, "-")) + start_pos <- as.numeric(positions[1]) + stop_pos <- as.numeric(positions[2]) + + # Check if the input is a file or a string + if (file.exists(wt_seq_input)) { + # If it's a file, read the sequence from the fasta file + seq_data <- readDNAStringSet(filepath = wt_seq_input) + wt_seq <- seq_data[[1]] # Extract the sequence + } else { + # Otherwise, treat the input as a sequence string + wt_seq <- DNAString(wt_seq_input) + } + + # Extract the sequence between start and stop codons + coding_seq <- subseq(wt_seq, start = start_pos, end = stop_pos) + coding_seq <- as.character(coding_seq) + + # List of predefined NNK & NNS codons + nnk_codons <- c('AAG', 'AAT', 'ATG', 'ATT', 'AGG', 'AGT', 'ACG', 'ACT', + 'TAG', 'TAT', 'TTG', 'TTT', 'TGG', 'TGT', 'TCG', 'TCT', + 'GAG', 'GAT', 'GTG', 'GTT', 'GGG', 'GGT', 'GCG', 'GCT', + 'CAG', 'CAT', 'CTG', 'CTT', 'CGG', 'CGT', 'CCG', 'CCT') + + nns_codons <- c('AAG', 'AAC', 'ATG', 'ATC', 'AGG', 'AGC', 'ACG', 'ACC', + 'TAG', 'TAC', 'TTG', 'TTC', 'TGG', 'TGC', 'TCG', 'TCC', + 'GAG', 'GAC', 'GTG', 'GTC', 'GGG', 'GGC', 'GCG', 'GCC', + 'CAG', 'CAC', 'CTG', 'CTC', 'CGG', 'CGC', 'CCG', 'CCC') + + # Function to split a DNA sequence into codons (triplets) + split_into_codons <- function(seq) { + return(strsplit(seq, "(?<=.{3})", perl = TRUE)[[1]]) + } + + # Read custom codons if mode is 'custom' + if (mutagenesis_type == "custom") { + if (is.null(custom_codon_library) || !file.exists(custom_codon_library)) { + stop("Custom codons file must be provided and valid when using 'custom' mutagenesis_type") + } + # Read and parse the custom codons from the file + custom_codons <- unlist(strsplit(readLines(custom_codon_library), ",")) + custom_codons <- trimws(custom_codons) # Remove any whitespace + } + + # Split wild-type sequence into codons + wt_codons <- 
split_into_codons(coding_seq)
+
+  # Initialize DataFrame to store mutated variants
+  result <- data.frame(Codon_Number = integer(), wt_codon = character(), Variant = character(), stringsAsFactors = FALSE)
+
+  # Determine the codon list based on the mutagenesis_type
+  get_codon_list <- function(wt_codon) {
+    if (mutagenesis_type == "nnk") {
+      return(nnk_codons)
+    } else if (mutagenesis_type == "nns") {
+      return(nns_codons)
+    } else if (mutagenesis_type == "max_diff_to_wt") {
+      if (substr(wt_codon, 3, 3) == "T") {
+        return(nns_codons)
+      } else {
+        return(nnk_codons)
+      }
+    } else if (mutagenesis_type == "custom") {
+      return(custom_codons)
+    } else {
+      stop("Invalid mutagenesis_type. Choose from 'nnk', 'nns', 'max_diff_to_wt', or 'custom'.")
+    }
+  }
+
+  # Iterate over each codon in the wild-type sequence
+  for (i in seq_along(wt_codons)) {
+    wt_codon <- wt_codons[i]
+    codon_list <- get_codon_list(wt_codon)
+
+    # Filter codons that are different from the wild-type codon
+    possible_variants <- codon_list[codon_list != wt_codon]
+
+    # Add all variants for this position to the result (the wild-type codon itself is excluded above)
+    for (variant in possible_variants) {
+      result <- rbind(result, data.frame(Codon_Number = i, wt_codon = wt_codon, Variant = variant, stringsAsFactors = FALSE))
+    }
+  }
+
+  # Save the variants into a CSV file
+  write.csv(result, output_file, row.names = FALSE)
+}
+
+# Example usage
+# Optionally generate a custom codons file: "AAA, AAC, AAG, AAT, ..."
+# generate_possible_variants("/Users/benjaminwehnert/CRG/DMS_QC/testing_data/MORtn5_reference.fa", "23-1225", mutagenesis_type = "nnk", output_file = "/Users/benjaminwehnert/CRG/DMS_QC/testing_data/possible_NNK_mutations.csv") ### this one's correct for the dataset
+# generate_possible_variants("/Users/benjaminwehnert/CRG/DMS_QC/testing_data/MORtn5_reference.fa", "23-1225", mutagenesis_type = "max_diff_to_wt", output_file = "/Users/benjaminwehnert/CRG/DMS_QC/testing_data/testing_outputs/possible_NNK_mutations_taylors_nnk_and_nns.csv")
diff --git a/modules/local/dmsanalysis/bin/prepare_gatk_data_for_count_heatmaps.R b/modules/local/dmsanalysis/bin/prepare_gatk_data_for_count_heatmaps.R
new file mode 100644
index 0000000..694a1fa
--- /dev/null
+++ b/modules/local/dmsanalysis/bin/prepare_gatk_data_for_count_heatmaps.R
@@ -0,0 +1,82 @@
+# input: prefiltered GATK path (filtered for codon library), aa-seq file path, output path, threshold (for minimum counts to recognize variant)
+# output: csv file serving as basis for counts_per_cov_heatmap function
+
+suppressMessages(library(dplyr))
+suppressMessages(library(ggplot2))
+suppressMessages(library(tidyr))
+suppressMessages(library(reshape2))
+suppressMessages(library(scales))
+
+prepare_gatk_data_for_counts_heatmaps <- function(gatk_file_path, aa_seq_file_path, output_csv_path, threshold = 3) {
+  # Load the raw GATK data
+  raw_gatk <- read.table(gatk_file_path, sep = ",", header = TRUE)
+
+  # Read the wild-type amino acid sequence from the text file
+  wt_seq <- readLines(aa_seq_file_path)
+  wt_seq <- unlist(strsplit(wt_seq, "")) # Split the sequence into individual amino acids
+
+  # Summarize counts-per-cov for each unique aa mutation in pos_mut
+  aggregated_data <- raw_gatk %>%
+    group_by(pos_mut) %>%
+    summarize(total_counts_per_cov = sum(counts_per_cov, na.rm = TRUE),
+              total_counts = sum(counts, na.rm = TRUE)) # Also sum the counts
+
+  # Extract the wild-type position and mutations from 'pos_mut'
+  aggregated_data <- aggregated_data %>%
+    mutate(
+      wt_aa = sub("(\\D)(\\d+)(\\D)", "\\1", pos_mut), # Wild-type amino acid (e.g., S)
+      
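The regular expression used in this mutate() decomposes a pos_mut label such as "S3R" into its three parts; a quick illustration with a hypothetical value.

sub("(\\D)(\\d+)(\\D)", "\\1", "S3R")  # wild-type amino acid -> "S"
sub("(\\D)(\\d+)(\\D)", "\\2", "S3R")  # position             -> "3" (converted with as.numeric() in the code)
sub("(\\D)(\\d+)(\\D)", "\\3", "S3R")  # mutant amino acid    -> "R"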
position = as.numeric(sub("(\\D)(\\d+)(\\D)", "\\2", pos_mut)), # Position (e.g., 3) + mut_aa = sub("(\\D)(\\d+)(\\D)", "\\3", pos_mut) # Mutant amino acid (e.g., R) + ) + + # Replace 'X' with '*', indicating the stop codon + aggregated_data <- aggregated_data %>% + mutate(mut_aa = ifelse(mut_aa == "X", "*", mut_aa)) + + # Replace 'X' with '*' in the wild-type amino acid sequence as well + wt_seq <- ifelse(wt_seq == "X", "*", wt_seq) + + # Define all 20 standard amino acids and the stop codon "*" + all_amino_acids <- c("A", "C", "D", "E", "F", "G", "H", "I", "K", "L", + "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y", "*") + + # Create a list of all positions in the wild-type sequence + all_positions <- 1:length(wt_seq) + + # Create a complete grid of all possible combinations of positions and amino acids + complete_data <- expand.grid(mut_aa = all_amino_acids, position = all_positions) + + # Merge the summarized data with the complete grid (filling missing entries with 0) + heatmap_data <- complete_data %>% + left_join(aggregated_data, by = c("mut_aa", "position")) %>% + mutate(total_counts_per_cov = ifelse(is.na(total_counts_per_cov), 0, total_counts_per_cov), + wt_aa = wt_seq[position]) # Assign the wild-type amino acid + + # Set variants with counts < threshold to NA + heatmap_data <- heatmap_data %>% + mutate( + total_counts_per_cov = ifelse(total_counts < threshold, NA, total_counts_per_cov), + total_counts = ifelse(total_counts < threshold, NA, total_counts) + ) + + # Fill pos_mut column + heatmap_data <- heatmap_data %>% + mutate( + pos_mut = ifelse(is.na(pos_mut), + paste0(wt_aa, position, mut_aa), + pos_mut) + ) + + # Save the aggregated data to a CSV file + write.csv(heatmap_data, file = output_csv_path, row.names = FALSE) + print(paste("Aggregated data saved to:", output_csv_path)) +} + + +# Aufruf der Datenaufbereitungsfunktion +# prepare_gatk_data_for_counts_heatmaps( +# "/Users/benjaminwehnert/CRG/DMS_QC/testing_data/gatk_filtered_by_codon_library.csv", +# "/Users/benjaminwehnert/CRG/DMS_QC/testing_data/aa_seq.txt", +# "/Users/benjaminwehnert/CRG/DMS_QC/testing_data/testing_outputs/prepared_gatk_data.csv", +# threshold = 3 +# ) diff --git a/modules/local/dmsanalysis/bin/prepare_gatk_data_for_fitness_heatmap.R b/modules/local/dmsanalysis/bin/prepare_gatk_data_for_fitness_heatmap.R new file mode 100644 index 0000000..ef25ce9 --- /dev/null +++ b/modules/local/dmsanalysis/bin/prepare_gatk_data_for_fitness_heatmap.R @@ -0,0 +1,134 @@ +suppressMessages(library(dplyr)) + +prepare_gatk_data_for_fitness_heatmap <- function(csv_file_path, aa_seq_path, dimsum_fitness_path, output_csv_path) { + # Load the CSV file + gatk_data <- read.csv(csv_file_path) + + # Read the wild-type amino acid sequence from the text file + aa_seq <- readLines(aa_seq_path) + aa_seq <- unlist(strsplit(aa_seq, "")) # Split the sequence into individual amino acids + + # Add the mutated sequence column + gatk_data <- gatk_data %>% + mutate( + mutated_sequence = sapply(1:nrow(gatk_data), function(i) { + # Create a copy of the wild-type sequence + mutated_seq <- aa_seq + # Extract mutation information + position <- as.numeric(position[i]) + mut_aa <- mut_aa[i] + # Apply the mutation + if (!is.na(mut_aa) && position > 0 && position <= length(mutated_seq)) { + mutated_seq[position] <- mut_aa + } + # Return the mutated sequence as a string + paste(mutated_seq, collapse = "") + }) + ) + + # # Save the updated data to a new CSV file + # write.csv(gatk_data, file = output_csv_path, row.names = FALSE) + # 
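The mutated_sequence column built above applies a single amino-acid substitution to the wild-type sequence; a toy example of that logic, with a hypothetical sequence and mutation.

aa_seq   <- unlist(strsplit("MSTK", ""))  # wild-type as a character vector
position <- 3
mut_aa   <- "R"

mutated_seq <- aa_seq
mutated_seq[position] <- mut_aa
paste(mutated_seq, collapse = "")
#> "MSRK"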
print(paste("Updated data with mutated sequences saved to:", output_csv_path)) + + + load(dimsum_fitness_path) + dimsum_fitness <- rbind(all_variants, synonymous) + dimsum_fitness <- dimsum_fitness[-which(dimsum_fitness$WT == T)[-1],] # remove duplicate + rm(doubles,singles,all_variants,synonymous,wildtype) + + # Perform a left join to retain all rows from gatk_data + merged_data <- gatk_data %>% + left_join(dimsum_fitness, by = c("mutated_sequence" = "aa_seq")) + + # Handle wild-type rows specifically + merged_data <- merged_data %>% + mutate( + # If WT from dimsum_fitness exists, use it; otherwise keep as is + WT = ifelse(is.na(WT) & mutated_sequence == paste(aa_seq, collapse = ""), TRUE, WT) + ) + + # Save the merged data to a CSV file + write.csv(merged_data, file = output_csv_path, row.names = FALSE) + print(paste("Merged data saved to:", output_csv_path)) +} + + + + +# add_mutated_sequence("/Users/benjaminwehnert/CRG/DMS_QC/testing_data/testing_outputs/prepared_gatk_data.csv", +# "/Users/benjaminwehnert/CRG/DMS_QC/testing_data/aa_seq.txt", +# +# "/Users/benjaminwehnert/CRG/DMS_QC/testing_data/testing_outputs/prepared_gatk_for_fitness.csv") + + + + + + +suppressMessages(library(dplyr)) + +add_mutated_sequence <- function(csv_file_path, aa_seq_path, dimsum_fitness_path, output_csv_path) { + + # Load the CSV file + gatk_data <- read.csv(csv_file_path) + + # Read the wild-type amino acid sequence from the text file + aa_seq <- readLines(aa_seq_path) + aa_seq <- unlist(strsplit(aa_seq, "")) # Split the sequence into individual amino acids + + # Add the mutated sequence column + gatk_data <- gatk_data %>% + mutate( + mutated_sequence = sapply(1:nrow(gatk_data), function(i) { + # Create a copy of the wild-type sequence + mutated_seq <- aa_seq + # Extract mutation information + position <- as.numeric(gatk_data$position[i]) + mut_aa <- gatk_data$mut_aa[i] + # Apply the mutation + if (!is.na(mut_aa) && position > 0 && position <= length(mutated_seq)) { + mutated_seq[position] <- mut_aa + } + # Return the mutated sequence as a string + paste(mutated_seq, collapse = "") + }) + ) + + # Load dimsum_fitness data + load(dimsum_fitness_path) + dimsum_fitness <- rbind(all_variants, synonymous) + + # Remove unnecessary columns + dimsum_fitness <- dimsum_fitness %>% + select(-nt_seq, -Nham_nt, -Nmut_codons, -indel, -STOP, -STOP_readthrough) + + # Ensure one-to-one mapping for the merge + dimsum_fitness <- dimsum_fitness %>% + distinct(aa_seq, .keep_all = TRUE) # Keep only one row per unique `aa_seq` + + # Perform a left join to merge gatk_data with dimsum_fitness + merged_data <- gatk_data %>% + left_join(dimsum_fitness, by = c("mutated_sequence" = "aa_seq")) + + # Save the merged data to a CSV file + write.csv(merged_data, file = output_csv_path, row.names = FALSE) + print(paste("Merged data saved to:", output_csv_path)) +} + + + + + + + +# test data +prepare_gatk_data_for_fitness_heatmap("/Users/benjaminwehnert/CRG/DMS_QC/howard/bin4/intermediate_files/processed_gatk_files/variantCounts_for_heatmaps.csv", + "/Users/benjaminwehnert/CRG/DMS_QC/howard/bin4/intermediate_files/aa_seq.txt", + "/Users/benjaminwehnert/CRG/DMS_QC/howard/dimsum/howards_data/howards_data_fitness_replicates.RData", + "/Users/benjaminwehnert/CRG/DMS_QC/howard/prepared_gatk_for_fitness.csv") + +#test on gid1a data +prepare_gatk_data_for_fitness_heatmap("/Users/benjaminwehnert/Downloads/variantCounts_for_heatmaps.csv", + "/Users/benjaminwehnert/Downloads/aa_seq.txt", + 
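The duplicate wild-type removal in prepare_gatk_data_for_fitness_heatmap() relies on a compact indexing idiom; a small sketch of how it behaves on a toy vector.

WT <- c(FALSE, TRUE, FALSE, TRUE, TRUE)
which(WT == TRUE)       # 2 4 5
which(WT == TRUE)[-1]   # 4 5: every wild-type row except the first
# df[-which(df$WT == TRUE)[-1], ] therefore keeps the first WT row and drops the
# remaining duplicates. Caveat: if there is only a single WT row, the index
# vector is integer(0) and df[integer(0), ] returns zero rows, so the idiom
# assumes the duplicate is actually present.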
"/Users/benjaminwehnert/Downloads/2025-01-20_GID1A_DiMSum_1_fitness_replicates.RData", + "/Users/benjaminwehnert/Downloads/dimsum_fitness_gid1a/prepared_gatk_for_fitness.csv") diff --git a/modules/local/dmsanalysis/bin/process_raw_gatk.R b/modules/local/dmsanalysis/bin/process_raw_gatk.R new file mode 100644 index 0000000..d185ad7 --- /dev/null +++ b/modules/local/dmsanalysis/bin/process_raw_gatk.R @@ -0,0 +1,44 @@ +# input: gatk variantcounts tsv file, output_path +# output: csv with column names. Creates additional counts_per_cov column. Fills pos_mut column for synonymous mutations. Sorted out variants that have mutations, but do not show up in the specifying columns -> was affecting roughly 30 low-count variants out of over 15000 in Taylor's data + +library("dplyr") +process_raw_gatk <- function(gatk_file_path, output_csv_path) { + + # Set the column names + colnames <- c("counts", "cov", "mean_length_variant_reads", "varying_bases", + "base_mut", "varying_codons", "codon_mut", "aa_mut", "pos_mut") + + # Read the GATK file into a data frame + gatk_raw <- read.table(gatk_file_path, sep = "\t", header = FALSE, fill = TRUE, col.names = colnames) + + # Filter out rows where 'aa_mut' is empty or NA + gatk_raw <- gatk_raw[!(gatk_raw$aa_mut == "" | is.na(gatk_raw$aa_mut)), ] + + # Handle synonymous mutations: where aa_mut starts with "S" and pos_mut is either NA or "" + gatk_raw <- gatk_raw %>% + rowwise() %>% + mutate( + pos_mut = ifelse( + (is.na(pos_mut) | pos_mut == "") & grepl("^S:", aa_mut), + # Construct the new 'pos_mut' entry for synonymous mutations + paste0( + sub("S:([A-Z])>[A-Z]", "\\1", aa_mut), # Get the original amino acid from 'aa_mut' + sub("^(\\d+):.*", "\\1", codon_mut), # Get the position from 'codon_mut' + sub("S:[A-Z]>([A-Z])", "\\1", aa_mut) # Get the mutated amino acid from 'aa_mut' + ), + pos_mut # Keep the existing 'pos_mut' if it's not NA or "" + ) + ) %>% + ungroup() %>% + mutate(counts_per_cov = counts / cov) + + # Write the cleaned data frame to a CSV file + write.csv(gatk_raw, file = output_csv_path, row.names = FALSE) +} + + + + +# Example usage (can be used for testing): +# process_raw_gatk("/path/to/gatk_file.txt", "/path/to/output_file.csv") +#process_raw_gatk("/Users/benjaminwehnert/CRG/DMS_QC/testing_data/output_premerged_vsearch.variantCounts", "/Users/benjaminwehnert/CRG/DMS_QC/testing_data/raw_gatk.csv") diff --git a/modules/local/dmsanalysis/possiblemutations.nf b/modules/local/dmsanalysis/possiblemutations.nf new file mode 100644 index 0000000..c2a3361 --- /dev/null +++ b/modules/local/dmsanalysis/possiblemutations.nf @@ -0,0 +1,51 @@ +process DMSANALYSIS_POSSIBLE_MUTATIONS { + tag "table /w all possible variants" + label 'process_single' + + conda "${moduleDir}/environment.yml" + + container "${ workflow.containerEngine == 'singularity' + ? 
'community.wave.seqera.io/library/bioconductor-biostrings_r-base_r-biocmanager_r-dplyr_pruned:ce2ba7ad7f6e7f2c' + : 'community.wave.seqera.io/library/bioconductor-biostrings_r-base_r-biocmanager_r-dplyr_pruned:0fd2e39a5bf2ecaa' }" + + input: + tuple val(meta), path(wt_seq) + val pos_range + val mutagenesis_type + path custom_codon_library + path script // possible_mutations.R + + output: + tuple val(meta), path("possible_mutations.csv"), emit: possible_mutations + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + start_stop_codon="$pos_range" + + if [[ "$custom_codon_library" == "/NULL" ]]; then + Rscript -e "source('$script'); generate_possible_variants('$wt_seq', '\$start_stop_codon', '$mutagenesis_type', NULL, 'possible_mutations.csv')" + else + Rscript -e "source('$script'); generate_possible_variants('$wt_seq', '\$start_stop_codon', '$mutagenesis_type', '$custom_codon_library', 'possible_mutations.csv')" + fi + + # Extract R base and packages versions + R_VERSION=\$(R --version | head -n 1 | sed -E 's/^R version ([0-9.]+).*/\\1/') + BIOSTRINGS_VERSION=\$(Rscript -e "packageVersion('Biostrings')" | grep -Eo '[0-9]+(\\.[0-9]+)+') + cat <<-END_VERSIONS > versions.yml + DMSANALYSIS_POSSIBLE_MUTATIONS: + r-base: \$R_VERSION + biostrings: \$BIOSTRINGS_VERSION + END_VERSIONS + """ + + stub: + """ + touch possible_mutations.csv + echo "DMSANALYSIS_POSSIBLE_MUTATIONS:" > versions.yml + echo " stub-version: 0.0.0" >> versions.yml + """ +} diff --git a/modules/local/dmsanalysis/processgatk.nf b/modules/local/dmsanalysis/processgatk.nf new file mode 100644 index 0000000..15bf3cc --- /dev/null +++ b/modules/local/dmsanalysis/processgatk.nf @@ -0,0 +1,55 @@ +process DMSANALYSIS_PROCESS_GATK { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' + ? 
'community.wave.seqera.io/library/bioconductor-biostrings_r-base_r-biocmanager_r-dplyr_pruned:ce2ba7ad7f6e7f2c' + : 'community.wave.seqera.io/library/bioconductor-biostrings_r-base_r-biocmanager_r-dplyr_pruned:0fd2e39a5bf2ecaa' }" + + publishDir "${params.outdir}/intermediate_files", mode: 'copy' + + input: + tuple val(meta), path(variantCounts) + path possible_mutations + path aa_seq + val min_counts + path process_raw_gatk_script + path filter_by_library_script + path complete_gatk_script + path prepare_counts_heatmap_script + + output: + tuple val(meta), + path("annotated_variantCounts.csv"), + path("variantCounts_filtered_by_library.csv"), + path("library_completed_variantCounts.csv"), + path("variantCounts_for_heatmaps.csv"), + emit: processed_variantCounts + + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + Rscript -e "source('$process_raw_gatk_script'); process_raw_gatk('$variantCounts', 'annotated_variantCounts.csv')" + Rscript -e "source('$filter_by_library_script'); filter_gatk_by_codon_library('annotated_variantCounts.csv', '$possible_mutations', 'variantCounts_filtered_by_library.csv')" + Rscript -e "source('$complete_gatk_script'); complete_prefiltered_gatk('$possible_mutations', 'variantCounts_filtered_by_library.csv', 'library_completed_variantCounts.csv')" + Rscript -e "source('$prepare_counts_heatmap_script'); prepare_gatk_data_for_counts_heatmaps('variantCounts_filtered_by_library.csv', '$aa_seq', 'variantCounts_for_heatmaps.csv', $min_counts)" + + R_VERSION=\$(R --version | head -n 1 | sed -E 's/^R version ([0-9.]+).*/\\1/') + cat <<-END_VERSIONS > versions.yml + DMSANALYSIS_PROCESSGATK: + r-base: \$R_VERSION + END_VERSIONS + """ + + stub: + """ + touch annotated_variantCounts.csv variantCounts_filtered_by_library.csv library_completed_variantCounts.csv variantCounts_for_heatmaps.csv + echo "DMSANALYSIS_PROCESSGATK:" > versions.yml + echo " stub-version: 0.0.0" >> versions.yml + """ +} diff --git a/modules/local/fitness/find_synonymous_mutation.nf b/modules/local/fitness/find_synonymous_mutation.nf new file mode 100644 index 0000000..a70bc9b --- /dev/null +++ b/modules/local/fitness/find_synonymous_mutation.nf @@ -0,0 +1,39 @@ +process FIND_SYNONYMOUS_MUTATION { + tag { sample.sample } + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' + ? 
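The four Rscript calls in DMSANALYSIS_PROCESS_GATK can be reproduced outside Nextflow for debugging; a hedged sketch follows. The script file names for the filtering and completion steps are assumptions (they are passed into the module as paths), the input variantCounts name is hypothetical, and 3 stands in for the module's min_counts value.

source("process_raw_gatk.R")
source("filter_gatk_by_codon_library.R")   # assumed file name
source("complete_prefiltered_gatk.R")      # assumed file name
source("prepare_gatk_data_for_count_heatmaps.R")

process_raw_gatk("gatk_output.variantCounts", "annotated_variantCounts.csv")
filter_gatk_by_codon_library("annotated_variantCounts.csv", "possible_mutations.csv",
                             "variantCounts_filtered_by_library.csv")
complete_prefiltered_gatk("possible_mutations.csv", "variantCounts_filtered_by_library.csv",
                          "library_completed_variantCounts.csv")
prepare_gatk_data_for_counts_heatmaps("variantCounts_filtered_by_library.csv", "aa_seq.txt",
                                      "variantCounts_for_heatmaps.csv", threshold = 3)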
'community.wave.seqera.io/library/bioconductor-biostrings_r-base_r-biocmanager_r-dplyr_pruned:ce2ba7ad7f6e7f2c' + : 'community.wave.seqera.io/library/bioconductor-biostrings_r-base_r-biocmanager_r-dplyr_pruned:0fd2e39a5bf2ecaa' }" + + input: + tuple val(sample), path(counts_merged) // from MERGE_COUNTS.out.merged_counts + path wt_fasta // broadcast singleton + val pos_range // "start-end", broadcast singleton + path script // find_syn_mutation.R, broadcast singleton + + output: + tuple val(sample), path("synonymous_wt.txt"), emit: synonymous_wt + path "versions.yml", emit: versions + + script: + """ + set -euo pipefail + start_stop_codon="$pos_range" + + Rscript -e "source('$script'); \ + seq <- pick_synonymous_wt_from_range( \ + wt_fasta='$wt_fasta', \ + counts_merged_tsv='$counts_merged', \ + pos_range='\$start_stop_codon' \ + ); \ + write(seq, file='synonymous_wt.txt')" + + R_VERSION=\$(R --version | head -n 1 | sed -E 's/^R version ([0-9.]+).*/\\1/') + cat <<-END_VERSIONS > versions.yml + FIND_SYNONYMOUS_MUTATION: + r-base: \$R_VERSION + END_VERSIONS + """ +} diff --git a/modules/local/fitness/fitness_experimental_design.nf b/modules/local/fitness/fitness_experimental_design.nf new file mode 100644 index 0000000..83ba0b3 --- /dev/null +++ b/modules/local/fitness/fitness_experimental_design.nf @@ -0,0 +1,30 @@ +process EXPDESIGN_FITNESS { + tag "experimentalDesign" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' + ? 'community.wave.seqera.io/library/bioconductor-biostrings_r-base_r-biocmanager_r-dplyr_pruned:ce2ba7ad7f6e7f2c' + : 'community.wave.seqera.io/library/bioconductor-biostrings_r-base_r-biocmanager_r-dplyr_pruned:0fd2e39a5bf2ecaa' }" + + input: + path samplesheet_csv + path script // R file that defines: make_dimsum_experimental_design(input_csv, out_path) + + output: + path "experimentalDesign.tsv", emit: experimental_design + path "versions.yml", emit: versions + + script: + """ + set -euo pipefail + + Rscript -e "source('$script'); make_dimsum_experimental_design('$samplesheet_csv', 'experimentalDesign.tsv')" + + R_VERSION=\$(R --version | head -n 1 | sed -E 's/^R version ([0-9.]+).*/\\1/') + cat <<-END_VERSIONS > versions.yml + DIMSUM_EXPDESIGN: + r-base: \$R_VERSION + END_VERSIONS + """ +} diff --git a/modules/local/fitness/fitness_standard.nf b/modules/local/fitness/fitness_standard.nf new file mode 100644 index 0000000..ace8b4f --- /dev/null +++ b/modules/local/fitness/fitness_standard.nf @@ -0,0 +1,133 @@ +process FITNESS_CALCULATION { + tag { sample.sample } + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' + ? 
'community.wave.seqera.io/library/bioconductor-biostrings_r-base_r-biocmanager_r-dplyr_pruned:ce2ba7ad7f6e7f2c' + : 'community.wave.seqera.io/library/bioconductor-biostrings_r-base_r-biocmanager_r-dplyr_pruned:0fd2e39a5bf2ecaa' }" + + input: + tuple val(sample), path(counts_merged) + path(exp_design) + path(syn_wt_txt) + path script // fitness_calculation.R + + output: + tuple val(sample), path("fitness_estimation.tsv"), emit: fitness_estimation + path "versions.yml", emit: versions + + script: + """ + set -euo pipefail + + R_version=\$(R --version | head -n 1 | sed 's/^R version //') + + Rscript -e "source('$script'); run_fitness_estimation('$counts_merged', '$exp_design', '$syn_wt_txt', 'fitness_estimation.tsv')" + + cat > versions.yml < versions.yml + echo " stub-version: 0.0.0" >> versions.yml + """ +} + + + +process FITNESS_QC { + tag { sample.sample } + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' + ? 'community.wave.seqera.io/library/bioconductor-biostrings_r-base_r-biocmanager_r-dplyr_pruned:ce2ba7ad7f6e7f2c' + : 'community.wave.seqera.io/library/bioconductor-biostrings_r-base_r-biocmanager_r-dplyr_pruned:0fd2e39a5bf2ecaa' }" + + input: + tuple val(sample), path(fitness_estimation_tsv) // from FITNESS_CALCULATION + path script // fitness_plots.R + + output: + tuple val(sample), path("fitness_estimation_count_correlation.pdf"), emit: counts_corr_pdf + tuple val(sample), path("fitness_estimation_fitness_correlation.pdf"), emit: fitness_corr_pdf + path "versions.yml", emit: versions + + script: + """ + set -euo pipefail + + R_version=\$(R --version | head -n 1 | sed 's/^R version //') + + Rscript -e "source('$script'); run_fitness_plots( + '$fitness_estimation_tsv', + 'fitness_estimation_count_correlation.pdf', + 'fitness_estimation_fitness_correlation.pdf')" + + cat > versions.yml < versions.yml <<'EOF' + FITNESS_PLOTS: + stub-version: "0.0.0" + EOF + """ +} + + + +process FITNESS_HEATMAP { + tag { sample.sample } + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' + ? 'community.wave.seqera.io/library/bioconductor-biostrings_r-base_r-biocmanager_r-dplyr_pruned:ce2ba7ad7f6e7f2c' + : 'community.wave.seqera.io/library/bioconductor-biostrings_r-base_r-biocmanager_r-dplyr_pruned:0fd2e39a5bf2ecaa' }" + + input: + tuple val(sample), path(fitness_estimation_tsv) // from FITNESS_CALCULATION + tuple val(sample), path(wt_seq) // WT sequence + path script // fitness_plots.R + + output: + tuple val(sample), path("fitness_heatmap.pdf"), emit: fitness_heatmap + path "versions.yml", emit: versions + + script: + """ + set -euo pipefail + + R_version=\$(R --version | head -n 1 | sed 's/^R version //') + + Rscript -e "source('$script'); run_fitness_rescaled_heatmaps( + '$fitness_estimation_tsv', '$wt_seq')" + + cat > versions.yml < versions.yml <<'EOF' + FITNESS_PLOTS: + stub-version: "0.0.0" + EOF + """ +} diff --git a/modules/local/fitness/merge_counts.nf b/modules/local/fitness/merge_counts.nf new file mode 100644 index 0000000..d462748 --- /dev/null +++ b/modules/local/fitness/merge_counts.nf @@ -0,0 +1,35 @@ +process MERGE_COUNTS { + tag "${sample.sample}" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' + ? 
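The FITNESS_CALCULATION, FITNESS_QC and FITNESS_HEATMAP processes above each wrap a single R entry point; a hedged sketch of the same chain run interactively, with the script names taken from the inline comments and the wild-type sequence file name assumed.

source("fitness_calculation.R")  # defines run_fitness_estimation()
source("fitness_plots.R")        # defines run_fitness_plots() and run_fitness_rescaled_heatmaps()

run_fitness_estimation("counts_merged.tsv", "experimentalDesign.tsv",
                       "synonymous_wt.txt", "fitness_estimation.tsv")
run_fitness_plots("fitness_estimation.tsv",
                  "fitness_estimation_count_correlation.pdf",
                  "fitness_estimation_fitness_correlation.pdf")
run_fitness_rescaled_heatmaps("fitness_estimation.tsv", "wt_seq.fasta")  # WT file name assumed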
'community.wave.seqera.io/library/bioconductor-biostrings_r-base_r-biocmanager_r-dplyr_pruned:ce2ba7ad7f6e7f2c' + : 'community.wave.seqera.io/library/bioconductor-biostrings_r-base_r-biocmanager_r-dplyr_pruned:0fd2e39a5bf2ecaa' }" + + input: + tuple val(sample), val(metas), path(input_counts), path(output_counts) + path merge_script + + output: + tuple val(sample), path("counts_merged.tsv"), emit: merged_counts + path "versions.yml", emit: versions + + script: + def in_list = input_counts .collect { it.getName() }.join(' ') + def out_list = output_counts.collect { it.getName() }.join(' ') + """ + set -euo pipefail + + Rscript "${merge_script}" \\ + --inputs ${in_list} \\ + --outputs ${out_list} \\ + --out counts_merged.tsv + + R_VERSION=\$(R --version | head -n 1 | sed -E 's/^R version ([0-9.]+).*/\\1/') + cat <<-END_VERSIONS > versions.yml + GATK_GATKTODIMSUM: + r-base: \$R_VERSION + END_VERSIONS + """ +} diff --git a/modules/local/fitness/run_dimsum.nf b/modules/local/fitness/run_dimsum.nf new file mode 100644 index 0000000..9af63b2 --- /dev/null +++ b/modules/local/fitness/run_dimsum.nf @@ -0,0 +1,43 @@ +process RUN_DIMSUM { + tag { sample.sample } + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' + ? 'oras://community.wave.seqera.io/library/r-dimsum:1.4--4357734d345c8ccc' + : 'docker.io/bwehnert1008/dms_qc_dimsum_environment@sha256:08f3bd8441df7b4a7e05aadeca178862153cf723e64097a48a2744b2698b15dd' }" + + input: + tuple val(sample), path(counts_merged) + path(wt_txt) + path(exp_design) + + output: + path "dimsum_results**", emit: results_dir + path "versions.yml", emit: versions + + script: + """ + set -euo pipefail + + # DiMSum expects the sequence string, not a file path + WT=\$(tr -d ' \r\\n\\t' < "$wt_txt") + + DiMSum \ + --experimentDesignPath "$exp_design" \ + --wildtypeSequence "\$WT" \ + --countPath "$counts_merged" \ + --startStage 4 \ + --stopStage 5 \ + --fitnessErrorModel F \ + --retainIntermediateFiles T \ + --projectName "dimsum_results" \ + --fastqFileDir . \ + + R_VERSION=\$(R --version | head -n 1 | sed -E 's/^R version ([0-9.]+).*/\\1/') + cat <<-END_VERSIONS > versions.yml + DIMSUM_RUN: + r-base: \$R_VERSION + END_VERSIONS + """ +} diff --git a/modules/local/gatk/gatktofitness.nf b/modules/local/gatk/gatktofitness.nf new file mode 100644 index 0000000..74d775a --- /dev/null +++ b/modules/local/gatk/gatktofitness.nf @@ -0,0 +1,42 @@ +process GATK_GATKTOFITNESS { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' + ? 
'community.wave.seqera.io/library/bioconductor-biostrings_r-base_r-biocmanager_r-dplyr_pruned:ce2ba7ad7f6e7f2c' + : 'community.wave.seqera.io/library/bioconductor-biostrings_r-base_r-biocmanager_r-dplyr_pruned:0fd2e39a5bf2ecaa' }" + + input: + tuple val(meta), path(variantCounts_filtered_by_library) + path wt_seq + val pos_range + path script // R script + + output: + tuple val(meta), path("${meta.id}_fitness_input.tsv"), emit: fitness_input + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + start_stop_codon="$pos_range" + + Rscript -e "source('$script'); generate_fitness_input('$wt_seq', '$variantCounts_filtered_by_library', '\$start_stop_codon', '${meta.id}_fitness_input.tsv')" + + R_VERSION=\$(R --version | head -n 1 | sed -E 's/^R version ([0-9.]+).*/\\1/') + cat <<-END_VERSIONS > versions.yml + GATK_GATKTOFITNESS: + r-base: \$R_VERSION + END_VERSIONS + """ + + stub: + """ + touch ${meta.id}_fitness_input.tsv + echo "GATK_GATKTOFITNESS:" > versions.yml + echo " stub-version: 0.0.0" >> versions.yml + """ +} diff --git a/modules/local/gatk/saturationmutagenesis.nf b/modules/local/gatk/saturationmutagenesis.nf new file mode 100644 index 0000000..2616347 --- /dev/null +++ b/modules/local/gatk/saturationmutagenesis.nf @@ -0,0 +1,56 @@ +process GATK_SATURATIONMUTAGENESIS { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "community.wave.seqera.io/library/gatk4_samtools_java-1.7.0-openjdk-conda-aarch64:7c1f89018b5d5103" + + input: + tuple val(meta), path(premerged_reads) + path wt_seq + val pos_range + val min_counts + + output: + tuple val(meta), path("gatk_output.variantCounts"), emit: variantCounts // to access the output + tuple val(meta), path("gatk_output.*"), emit: gatk_output + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + # Index reference + samtools faidx $wt_seq + gatk CreateSequenceDictionary -R $wt_seq + + # Read start and stop codon from input + start_stop_codon="$pos_range" + + # Run GATK AnalyzeSaturationMutagenesis + gatk AnalyzeSaturationMutagenesis \ + -I $premerged_reads \ + -R $wt_seq \ + --orf \$start_stop_codon \ + --paired-mode false \ + --min-q 30 \ + --min-variant-obs $min_counts \ + -O gatk_output + + # Save versions + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(samtools --version |& sed '1!d ; s/samtools //') + gatk: \$(gatk --version |& sed 's/^.*GATK/\1/') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch gatk_output.variantCounts + touch versions.yml + """ +} diff --git a/modules/local/visualization/visualization.nf b/modules/local/visualization/visualization.nf new file mode 100644 index 0000000..96d52af --- /dev/null +++ b/modules/local/visualization/visualization.nf @@ -0,0 +1,246 @@ +process VISUALIZATION_COUNTS_PER_COV { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' + ? 
'community.wave.seqera.io/library/bioconductor-biostrings_r-base_r-biocmanager_r-dplyr_pruned:ce2ba7ad7f6e7f2c' + : 'community.wave.seqera.io/library/bioconductor-biostrings_r-base_r-biocmanager_r-dplyr_pruned:0fd2e39a5bf2ecaa' }" + + input: + tuple val(meta), path(variantCounts_for_heatmaps) + val min_counts + path script // counts_per_cov_heatmap.R + + output: + tuple val(meta), path("counts_per_cov_heatmap.pdf"), emit: counts_per_cov_heatmap + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + Rscript -e "source('$script'); counts_per_cov_heatmap('$variantCounts_for_heatmaps', $min_counts, 'counts_per_cov_heatmap.pdf')" + + R_VERSION=\$(R --version | head -n 1 | sed -E 's/^R version ([0-9.]+).*/\\1/') + cat <<-END_VERSIONS > versions.yml + VISUALIZATION_COUNTS_PER_COV: + r-base: \$R_VERSION + END_VERSIONS + """ + + stub: + """ + touch counts_per_cov_heatmap.pdf + echo "VISUALIZATION_COUNTS_PER_COV:" > versions.yml + echo " stub-version: 0.0.0" >> versions.yml + """ +} + +process VISUALIZATION_COUNTS_HEATMAP { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' + ? 'community.wave.seqera.io/library/bioconductor-biostrings_r-base_r-biocmanager_r-dplyr_pruned:ce2ba7ad7f6e7f2c' + : 'community.wave.seqera.io/library/bioconductor-biostrings_r-base_r-biocmanager_r-dplyr_pruned:0fd2e39a5bf2ecaa' }" + + input: + tuple val(meta), path(variantCounts_for_heatmaps) + val min_counts + path script // counts_heatmap.R + + output: + tuple val(meta), path("counts_heatmap.pdf"), emit: counts_heatmap + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + Rscript -e "source('$script'); counts_heatmap('$variantCounts_for_heatmaps', $min_counts, 'counts_heatmap.pdf')" + + R_VERSION=\$(R --version | head -n 1 | sed -E 's/^R version ([0-9.]+).*/\\1/') + cat <<-END_VERSIONS > versions.yml + VISUALIZATION_COUNTS_PER_COV: + r-base: \$R_VERSION + END_VERSIONS + """ + + stub: + """ + touch counts_heatmap.pdf + echo "VISUALIZATION_COUNTS_HEATMAP:" > versions.yml + echo " stub-version: 0.0.0" >> versions.yml + """ +} + +process VISUALIZATION_GLOBAL_POS_BIASES_COUNTS { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' + ? 
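The heatmap R scripts sourced by these visualization modules are not part of this diff; as a rough orientation only, a minimal sketch of the kind of position-by-amino-acid tile plot they produce, assuming nothing beyond the columns written by prepare_gatk_data_for_counts_heatmaps().

suppressMessages(library(ggplot2))

heatmap_data <- read.csv("variantCounts_for_heatmaps.csv")

ggplot(heatmap_data, aes(x = position, y = mut_aa, fill = total_counts_per_cov)) +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "steelblue", na.value = "grey80") +
  labs(x = "Position", y = "Amino acid", fill = "counts / coverage") +
  theme_minimal()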
'community.wave.seqera.io/library/bioconductor-biostrings_r-base_r-biocmanager_r-dplyr_pruned:ce2ba7ad7f6e7f2c' + : 'community.wave.seqera.io/library/bioconductor-biostrings_r-base_r-biocmanager_r-dplyr_pruned:0fd2e39a5bf2ecaa' }" + + input: + tuple val(meta), path(variantCounts_filtered_by_library) + path aa_seq + val sliding_window_size + path script // global_position_biases_counts_and_counts_per_cov.R + + output: + tuple val(meta), path("rolling_counts.pdf"), emit: rolling_counts + tuple val(meta), path("rolling_counts_per_cov.pdf"), emit: rolling_counts_per_cov + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + Rscript -e "source('$script'); position_biases('$variantCounts_filtered_by_library', '$aa_seq', $sliding_window_size)" + + R_VERSION=\$(R --version | head -n 1 | sed -E 's/^R version ([0-9.]+).*/\\1/') + cat <<-END_VERSIONS > versions.yml + VISUALIZATION_COUNTS_PER_COV: + r-base: \$R_VERSION + END_VERSIONS + """ + + stub: + """ + touch rolling_counts.pdf + touch rolling_counts_per_cov.pdf + echo "VISUALIZATION_COUNTS_HEATMAP:" > versions.yml + echo " stub-version: 0.0.0" >> versions.yml + """ +} + +process VISUALIZATION_GLOBAL_POS_BIASES_COV { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' + ? 'community.wave.seqera.io/library/bioconductor-biostrings_r-base_r-biocmanager_r-dplyr_pruned:ce2ba7ad7f6e7f2c' + : 'community.wave.seqera.io/library/bioconductor-biostrings_r-base_r-biocmanager_r-dplyr_pruned:0fd2e39a5bf2ecaa' }" + + input: + tuple val(meta), path(variantCounts_filtered_by_library) + path aa_seq + val sliding_window_size + val aimed_cov + path script // R script + + output: + tuple val(meta), path("rolling_coverage.pdf"), emit: rolling_coverage + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + Rscript -e "source('$script'); position_biases('$variantCounts_filtered_by_library', '$aa_seq', $sliding_window_size, 'rolling_coverage.pdf', $aimed_cov)" + + R_VERSION=\$(R --version | head -n 1 | sed -E 's/^R version ([0-9.]+).*/\\1/') + cat <<-END_VERSIONS > versions.yml + VISUALIZATION_COUNTS_PER_COV: + r-base: \$R_VERSION + END_VERSIONS + """ + + stub: + """ + touch rolling_coverage.pdf + echo "VISUALIZATION_COUNTS_HEATMAP:" > versions.yml + echo " stub-version: 0.0.0" >> versions.yml + """ +} + +process VISUALIZATION_LOGDIFF { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' + ? 
'community.wave.seqera.io/library/bioconductor-biostrings_r-base_r-biocmanager_r-dplyr_pruned:ce2ba7ad7f6e7f2c' + : 'community.wave.seqera.io/library/bioconductor-biostrings_r-base_r-biocmanager_r-dplyr_pruned:0fd2e39a5bf2ecaa' }" + + input: + tuple val(meta), path(library_completed_variantCounts) + path script // R script + + output: + tuple val(meta), path("logdiff_plot.pdf"), emit: logdiff_plot + tuple val(meta), path("logdiff_varying_bases.pdf"), emit: logdiff_varying_bases + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + Rscript -e "source('$script'); logdiff_plot('$library_completed_variantCounts')" + + R_VERSION=\$(R --version | head -n 1 | sed -E 's/^R version ([0-9.]+).*/\\1/') + cat <<-END_VERSIONS > versions.yml + VISUALIZATION_COUNTS_PER_COV: + r-base: \$R_VERSION + END_VERSIONS + """ + + stub: + """ + touch logdiff_plot.pdf + touch logdiff_varying_bases.pdf + echo "VISUALIZATION_COUNTS_HEATMAP:" > versions.yml + echo " stub-version: 0.0.0" >> versions.yml + """ +} + +process VISUALIZATION_SEQDEPTH { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' + ? 'community.wave.seqera.io/library/bioconductor-biostrings_r-base_r-biocmanager_r-dplyr_pruned:ce2ba7ad7f6e7f2c' + : 'community.wave.seqera.io/library/bioconductor-biostrings_r-base_r-biocmanager_r-dplyr_pruned:0fd2e39a5bf2ecaa' }" + + input: + tuple val(meta), path(variantCounts_filtered_by_library) + path possible_mutations + val min_counts + path script // R script + + output: + tuple val(meta), path("SeqDepth.pdf"), emit: SeqDepth + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + Rscript -e "source('$script'); SeqDepth_simulation_plot('$variantCounts_filtered_by_library', '$possible_mutations', 'SeqDepth.pdf', 0.01, $min_counts)" + + R_VERSION=\$(R --version | head -n 1 | sed -E 's/^R version ([0-9.]+).*/\\1/') + cat <<-END_VERSIONS > versions.yml + VISUALIZATION_COUNTS_PER_COV: + r-base: \$R_VERSION + END_VERSIONS + """ + + stub: + """ + touch SeqDepth.pdf + echo "VISUALIZATION_COUNTS_HEATMAP:" > versions.yml + echo " stub-version: 0.0.0" >> versions.yml + """ +} diff --git a/modules/nf-core/bwa/index/environment.yml b/modules/nf-core/bwa/index/environment.yml new file mode 100644 index 0000000..d8789a2 --- /dev/null +++ b/modules/nf-core/bwa/index/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::bwa=0.7.18 diff --git a/modules/nf-core/bwa/index/main.nf b/modules/nf-core/bwa/index/main.nf new file mode 100644 index 0000000..29d9957 --- /dev/null +++ b/modules/nf-core/bwa/index/main.nf @@ -0,0 +1,53 @@ +process BWA_INDEX { + tag "$fasta" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/bwa:0.7.18--he4a0461_0' : + 'biocontainers/bwa:0.7.18--he4a0461_0' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path("bwa") , emit: index + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${fasta.baseName}" + def args = task.ext.args ?: '' + """ + mkdir bwa + bwa \\ + index \\ + $args \\ + -p bwa/${prefix} \\ + $fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bwa: \$(echo \$(bwa 2>&1) | sed 's/^.*Version: //; s/Contact:.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${fasta.baseName}" + """ + mkdir bwa + + touch bwa/${prefix}.amb + touch bwa/${prefix}.ann + touch bwa/${prefix}.bwt + touch bwa/${prefix}.pac + touch bwa/${prefix}.sa + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bwa: \$(echo \$(bwa 2>&1) | sed 's/^.*Version: //; s/Contact:.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/bwa/index/meta.yml b/modules/nf-core/bwa/index/meta.yml new file mode 100644 index 0000000..ce5cb8f --- /dev/null +++ b/modules/nf-core/bwa/index/meta.yml @@ -0,0 +1,52 @@ +name: bwa_index +description: Create BWA index for reference genome +keywords: + - index + - fasta + - genome + - reference +tools: + - bwa: + description: | + BWA is a software package for mapping DNA sequences against + a large reference genome, such as the human genome. + homepage: http://bio-bwa.sourceforge.net/ + documentation: https://bio-bwa.sourceforge.net/bwa.shtml + arxiv: arXiv:1303.3997 + licence: ["GPL-3.0-or-later"] + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: Input genome fasta file +output: + - index: + - meta: + type: map + description: | + Groovy Map containing reference information. + e.g. [ id:'test', single_end:false ] + pattern: "*.{amb,ann,bwt,pac,sa}" + - bwa: + type: map + description: | + Groovy Map containing reference information. + e.g. 
[ id:'test', single_end:false ] + pattern: "*.{amb,ann,bwt,pac,sa}" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@maxulysse" +maintainers: + - "@drpatelh" + - "@maxulysse" + - "@gallvp" diff --git a/modules/nf-core/bwa/index/tests/main.nf.test b/modules/nf-core/bwa/index/tests/main.nf.test new file mode 100644 index 0000000..af33e73 --- /dev/null +++ b/modules/nf-core/bwa/index/tests/main.nf.test @@ -0,0 +1,33 @@ +nextflow_process { + + name "Test Process BWA_INDEX" + tag "modules_nfcore" + tag "modules" + tag "bwa" + tag "bwa/index" + script "../main.nf" + process "BWA_INDEX" + + test("BWA index") { + + when { + process { + """ + input[0] = [ + [id: 'test'], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/bwa/index/tests/main.nf.test.snap b/modules/nf-core/bwa/index/tests/main.nf.test.snap new file mode 100644 index 0000000..7c8f046 --- /dev/null +++ b/modules/nf-core/bwa/index/tests/main.nf.test.snap @@ -0,0 +1,47 @@ +{ + "BWA index": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + [ + "genome.amb:md5,3a68b8b2287e07dd3f5f95f4344ba76e", + "genome.ann:md5,c32e11f6c859f166c7525a9c1d583567", + "genome.bwt:md5,0469c30a1e239dd08f68afe66fde99da", + "genome.pac:md5,983e3d2cd6f36e2546e6d25a0da78d66", + "genome.sa:md5,ab3952cabf026b48cd3eb5bccbb636d1" + ] + ] + ], + "1": [ + "versions.yml:md5,a64462ac7dfb21f4ade9b02e7f65c5bb" + ], + "index": [ + [ + { + "id": "test" + }, + [ + "genome.amb:md5,3a68b8b2287e07dd3f5f95f4344ba76e", + "genome.ann:md5,c32e11f6c859f166c7525a9c1d583567", + "genome.bwt:md5,0469c30a1e239dd08f68afe66fde99da", + "genome.pac:md5,983e3d2cd6f36e2546e6d25a0da78d66", + "genome.sa:md5,ab3952cabf026b48cd3eb5bccbb636d1" + ] + ] + ], + "versions": [ + "versions.yml:md5,a64462ac7dfb21f4ade9b02e7f65c5bb" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-05-16T11:40:09.925307" + } +} \ No newline at end of file diff --git a/modules/nf-core/bwa/index/tests/tags.yml b/modules/nf-core/bwa/index/tests/tags.yml new file mode 100644 index 0000000..28bb483 --- /dev/null +++ b/modules/nf-core/bwa/index/tests/tags.yml @@ -0,0 +1,2 @@ +bwa/index: + - modules/nf-core/bwa/index/** diff --git a/modules/nf-core/bwa/mem/environment.yml b/modules/nf-core/bwa/mem/environment.yml new file mode 100644 index 0000000..ed5448a --- /dev/null +++ b/modules/nf-core/bwa/mem/environment.yml @@ -0,0 +1,13 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda + +dependencies: + # renovate: datasource=conda depName=bioconda/bwa + - bioconda::bwa=0.7.18 + # renovate: datasource=conda depName=bioconda/htslib + - bioconda::htslib=1.21 + # renovate: datasource=conda depName=bioconda/samtools + - bioconda::samtools=1.21 diff --git a/modules/nf-core/bwa/mem/main.nf b/modules/nf-core/bwa/mem/main.nf new file mode 100644 index 0000000..3c54417 --- /dev/null +++ b/modules/nf-core/bwa/mem/main.nf @@ -0,0 +1,74 @@ +process BWA_MEM { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/bf/bf7890f8d4e38a7586581cb7fa13401b7af1582f21d94eef969df4cea852b6da/data' : + 'community.wave.seqera.io/library/bwa_htslib_samtools:56c9f8d5201889a4' }" + + input: + tuple val(meta) , path(reads) + tuple val(meta2), path(index) + tuple val(meta3), path(fasta) + val sort_bam + + output: + tuple val(meta), path("*.bam") , emit: bam, optional: true + tuple val(meta), path("*.cram") , emit: cram, optional: true + tuple val(meta), path("*.csi") , emit: csi, optional: true + tuple val(meta), path("*.crai") , emit: crai, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def samtools_command = sort_bam ? 'sort' : 'view' + def extension = args2.contains("--output-fmt sam") ? "sam" : + args2.contains("--output-fmt cram") ? "cram": + sort_bam && args2.contains("-O cram")? "cram": + !sort_bam && args2.contains("-C") ? "cram": + "bam" + def reference = fasta && extension=="cram" ? "--reference ${fasta}" : "" + if (!fasta && extension=="cram") error "Fasta reference is required for CRAM output" + """ + INDEX=`find -L ./ -name "*.amb" | sed 's/\\.amb\$//'` + + bwa mem \\ + $args \\ + -t $task.cpus \\ + \$INDEX \\ + $reads \\ + | samtools $samtools_command $args2 ${reference} --threads $task.cpus -o ${prefix}.${extension} - + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bwa: \$(echo \$(bwa 2>&1) | sed 's/^.*Version: //; s/Contact:.*\$//') + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def extension = args2.contains("--output-fmt sam") ? "sam" : + args2.contains("--output-fmt cram") ? "cram": + sort_bam && args2.contains("-O cram")? "cram": + !sort_bam && args2.contains("-C") ? "cram": + "bam" + """ + touch ${prefix}.${extension} + touch ${prefix}.csi + touch ${prefix}.crai + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bwa: \$(echo \$(bwa 2>&1) | sed 's/^.*Version: //; s/Contact:.*\$//') + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/bwa/mem/meta.yml b/modules/nf-core/bwa/mem/meta.yml new file mode 100644 index 0000000..b6f696c --- /dev/null +++ b/modules/nf-core/bwa/mem/meta.yml @@ -0,0 +1,111 @@ +name: bwa_mem +description: Performs fastq alignment to a fasta reference using BWA +keywords: + - mem + - bwa + - alignment + - map + - fastq + - bam + - sam +tools: + - bwa: + description: | + BWA is a software package for mapping DNA sequences against + a large reference genome, such as the human genome. + homepage: http://bio-bwa.sourceforge.net/ + documentation: https://bio-bwa.sourceforge.net/bwa.shtml + arxiv: arXiv:1303.3997 + licence: ["GPL-3.0-or-later"] + identifier: "biotools:bwa" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + ontologies: + - edam: "http://edamontology.org/data_2044" # Sequence + - edam: "http://edamontology.org/format_1930" # FASTQ + - - meta2: + type: map + description: | + Groovy Map containing reference information. + e.g. 
[ id:'test', single_end:false ] + - index: + type: file + description: BWA genome index files + pattern: "Directory containing BWA index *.{amb,ann,bwt,pac,sa}" + ontologies: + - edam: "http://edamontology.org/data_3210" # Genome index + - - meta3: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: Reference genome in FASTA format + pattern: "*.{fasta,fa}" + ontologies: + - edam: "http://edamontology.org/data_2044" # Sequence + - edam: "http://edamontology.org/format_1929" # FASTA + - - sort_bam: + type: boolean + description: use samtools sort (true) or samtools view (false) + pattern: "true or false" +output: + - bam: + - meta: + type: file + description: Output BAM file containing read alignments + - "*.bam": + type: file + description: Output BAM file containing read alignments + pattern: "*.{bam}" + ontologies: + - edam: "http://edamontology.org/format_2572" # BAM + - cram: + - meta: + type: file + description: Output CRAM file containing read alignments + - "*.cram": + type: file + description: Output CRAM file containing read alignments + pattern: "*.{cram}" + ontologies: + - edam: "http://edamontology.org/format_3462" # CRAM + - csi: + - meta: + type: file + description: Optional index file for BAM file + - "*.csi": + type: file + description: Optional index file for BAM file + pattern: "*.{csi}" + - crai: + - meta: + type: file + description: Optional index file for CRAM file + - "*.crai": + type: file + description: Optional index file for CRAM file + pattern: "*.{crai}" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@jeremy1805" + - "@matthdsm" +maintainers: + - "@drpatelh" + - "@jeremy1805" + - "@matthdsm" diff --git a/modules/nf-core/bwa/mem/tests/main.nf.test b/modules/nf-core/bwa/mem/tests/main.nf.test new file mode 100644 index 0000000..5de2c2f --- /dev/null +++ b/modules/nf-core/bwa/mem/tests/main.nf.test @@ -0,0 +1,260 @@ +nextflow_process { + + name "Test Process BWA_MEM" + tag "modules_nfcore" + tag "modules" + tag "bwa" + tag "bwa/mem" + tag "bwa/index" + script "../main.nf" + process "BWA_MEM" + + setup { + run("BWA_INDEX") { + script "../../index/main.nf" + process { + """ + input[0] = [ + [id: 'test'], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ] + """ + } + } + } + + test("Single-End") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + [ + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + ] + input[1] = BWA_INDEX.out.index + input[2] = [[id: 'test'],file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true)] + input[3] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.cram, + process.out.csi, + process.out.crai, + process.out.versions, + bam(process.out.bam[0][1]).getHeaderMD5(), + bam(process.out.bam[0][1]).getReadsMD5() + ).match() + } + ) + } + + } + + test("Single-End Sort") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + [ + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + ] + input[1] = BWA_INDEX.out.index + input[2] = [[id: 
'test'],file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true)] + input[3] = true + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.cram, + process.out.csi, + process.out.crai, + process.out.versions, + bam(process.out.bam[0][1]).getHeaderMD5(), + bam(process.out.bam[0][1]).getReadsMD5() + ).match() + } + ) + } + + } + + test("Paired-End") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) + ] + ] + input[1] = BWA_INDEX.out.index + input[2] = [[id: 'test'],file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true)] + input[3] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.cram, + process.out.csi, + process.out.crai, + process.out.versions, + bam(process.out.bam[0][1]).getHeaderMD5(), + bam(process.out.bam[0][1]).getReadsMD5() + ).match() + } + ) + } + + } + + test("Paired-End Sort") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) + ] + ] + input[1] = BWA_INDEX.out.index + input[2] = [[id: 'test'],file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true)] + input[3] = true + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.cram, + process.out.csi, + process.out.crai, + process.out.versions, + bam(process.out.bam[0][1]).getHeaderMD5(), + bam(process.out.bam[0][1]).getReadsMD5() + ).match() + } + ) + } + + } + + test("Paired-End - no fasta") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) + ] + ] + input[1] = BWA_INDEX.out.index + input[2] = [[:],[]] + input[3] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.cram, + process.out.csi, + process.out.crai, + process.out.versions, + bam(process.out.bam[0][1]).getHeaderMD5(), + bam(process.out.bam[0][1]).getReadsMD5() + ).match() + } + ) + } + + } + + test("Single-end - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + [ + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + ] + input[1] = BWA_INDEX.out.index + input[2] = [[id: 'test'],file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true)] + input[3] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("Paired-end - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ + 
file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) + ] + ] + input[1] = BWA_INDEX.out.index + input[2] = [[id: 'test'],file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true)] + input[3] = false + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git a/modules/nf-core/bwa/mem/tests/main.nf.test.snap b/modules/nf-core/bwa/mem/tests/main.nf.test.snap new file mode 100644 index 0000000..3aaefdd --- /dev/null +++ b/modules/nf-core/bwa/mem/tests/main.nf.test.snap @@ -0,0 +1,271 @@ +{ + "Single-End": { + "content": [ + [ + + ], + [ + + ], + [ + + ], + [ + "versions.yml:md5,c60680eba0f00e791c0d5a0a6e9d665f" + ], + "53df0e7b72f1f85fb28af5fec435246", + "798439cbd7fd81cbcc5078022dc5479d" + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.5" + }, + "timestamp": "2025-03-27T08:36:00.831642964" + }, + "Single-End Sort": { + "content": [ + [ + + ], + [ + + ], + [ + + ], + [ + "versions.yml:md5,c60680eba0f00e791c0d5a0a6e9d665f" + ], + "5eca502b75fefc26e8000908bf0bb3a3", + "94fcf617f5b994584c4e8d4044e16b4f" + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.5" + }, + "timestamp": "2025-03-27T08:36:16.025706238" + }, + "Paired-End": { + "content": [ + [ + + ], + [ + + ], + [ + + ], + [ + "versions.yml:md5,c60680eba0f00e791c0d5a0a6e9d665f" + ], + "fec2aafbba4637767bc4e202c71aee58", + "57aeef88ed701a8ebc8e2f0a381b2a6" + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.5" + }, + "timestamp": "2025-03-27T08:36:27.309924644" + }, + "Paired-End Sort": { + "content": [ + [ + + ], + [ + + ], + [ + + ], + [ + "versions.yml:md5,c60680eba0f00e791c0d5a0a6e9d665f" + ], + "d5ad8844218280969c1f9349bd62d057", + "af8628d9df18b2d3d4f6fd47ef2bb872" + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.5" + }, + "timestamp": "2025-03-27T08:36:45.448624985" + }, + "Single-end - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + + ], + "2": [ + [ + { + "id": "test", + "single_end": true + }, + "test.csi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + [ + { + "id": "test", + "single_end": true + }, + "test.crai:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "4": [ + "versions.yml:md5,c60680eba0f00e791c0d5a0a6e9d665f" + ], + "bam": [ + [ + { + "id": "test", + "single_end": true + }, + "test.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "crai": [ + [ + { + "id": "test", + "single_end": true + }, + "test.crai:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "cram": [ + + ], + "csi": [ + [ + { + "id": "test", + "single_end": true + }, + "test.csi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,c60680eba0f00e791c0d5a0a6e9d665f" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.5" + }, + "timestamp": "2025-03-27T08:37:16.211123969" + }, + "Paired-End - no fasta": { + "content": [ + [ + + ], + [ + + ], + [ + + ], + [ + "versions.yml:md5,c60680eba0f00e791c0d5a0a6e9d665f" + ], + "fec2aafbba4637767bc4e202c71aee58", + "57aeef88ed701a8ebc8e2f0a381b2a6" + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.5" + }, + "timestamp": "2025-03-27T08:36:56.592159657" + }, + "Paired-end - stub": { + "content": [ + { + "0": [ + [ + { + 
"id": "test", + "single_end": false + }, + "test.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + + ], + "2": [ + [ + { + "id": "test", + "single_end": false + }, + "test.csi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + [ + { + "id": "test", + "single_end": false + }, + "test.crai:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "4": [ + "versions.yml:md5,c60680eba0f00e791c0d5a0a6e9d665f" + ], + "bam": [ + [ + { + "id": "test", + "single_end": false + }, + "test.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "crai": [ + [ + { + "id": "test", + "single_end": false + }, + "test.crai:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "cram": [ + + ], + "csi": [ + [ + { + "id": "test", + "single_end": false + }, + "test.csi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,c60680eba0f00e791c0d5a0a6e9d665f" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.5" + }, + "timestamp": "2025-03-27T08:37:32.177177506" + } +} \ No newline at end of file diff --git a/modules/nf-core/fastqc/environment.yml b/modules/nf-core/fastqc/environment.yml index 691d4c7..f9f54ee 100644 --- a/modules/nf-core/fastqc/environment.yml +++ b/modules/nf-core/fastqc/environment.yml @@ -1,3 +1,5 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json channels: - conda-forge - bioconda diff --git a/modules/nf-core/fastqc/main.nf b/modules/nf-core/fastqc/main.nf index 752c3a1..23e1663 100644 --- a/modules/nf-core/fastqc/main.nf +++ b/modules/nf-core/fastqc/main.nf @@ -1,5 +1,5 @@ process FASTQC { - tag "$meta.id" + tag "${meta.id}" label 'process_medium' conda "${moduleDir}/environment.yml" @@ -19,30 +19,30 @@ process FASTQC { task.ext.when == null || task.ext.when script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" // Make list of old name and new name pairs to use for renaming in the bash while loop def old_new_pairs = reads instanceof Path || reads.size() == 1 ? [[ reads, "${prefix}.${reads.extension}" ]] : reads.withIndex().collect { entry, index -> [ entry, "${prefix}_${index + 1}.${entry.extension}" ] } - def rename_to = old_new_pairs*.join(' ').join(' ') + def rename_to = old_new_pairs*.join(' ').join(' ') def renamed_files = old_new_pairs.collect{ _old_name, new_name -> new_name }.join(' ') // The total amount of allocated RAM by FastQC is equal to the number of threads defined (--threads) time the amount of RAM defined (--memory) // https://github.com/s-andrews/FastQC/blob/1faeea0412093224d7f6a07f777fad60a5650795/fastqc#L211-L222 // Dividing the task.memory by task.cpu allows to stick to requested amount of RAM in the label - def memory_in_mb = MemoryUnit.of("${task.memory}").toUnit('MB') / task.cpus + def memory_in_mb = task.memory ? task.memory.toUnit('MB') / task.cpus : null // FastQC memory value allowed range (100 - 10000) def fastqc_memory = memory_in_mb > 10000 ? 10000 : (memory_in_mb < 100 ? 
100 : memory_in_mb) """ - printf "%s %s\\n" $rename_to | while read old_name new_name; do + printf "%s %s\\n" ${rename_to} | while read old_name new_name; do [ -f "\${new_name}" ] || ln -s \$old_name \$new_name done fastqc \\ - $args \\ - --threads $task.cpus \\ - --memory $fastqc_memory \\ - $renamed_files + ${args} \\ + --threads ${task.cpus} \\ + --memory ${fastqc_memory} \\ + ${renamed_files} cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/nf-core/fastqc/meta.yml b/modules/nf-core/fastqc/meta.yml index 2b2e62b..c8d9d02 100644 --- a/modules/nf-core/fastqc/meta.yml +++ b/modules/nf-core/fastqc/meta.yml @@ -29,9 +29,10 @@ input: description: | List of input FastQ files of size 1 and 2 for single-end and paired-end data, respectively. + ontologies: [] output: - - html: - - meta: + html: + - - meta: type: map description: | Groovy Map containing sample information @@ -40,8 +41,9 @@ output: type: file description: FastQC report pattern: "*_{fastqc.html}" - - zip: - - meta: + ontologies: [] + zip: + - - meta: type: map description: | Groovy Map containing sample information @@ -50,11 +52,14 @@ output: type: file description: FastQC report archive pattern: "*_{fastqc.zip}" - - versions: - - versions.yml: - type: file - description: File containing software versions - pattern: "versions.yml" + ontologies: [] + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML authors: - "@drpatelh" - "@grst" diff --git a/modules/nf-core/fastqc/tests/tags.yml b/modules/nf-core/fastqc/tests/tags.yml deleted file mode 100644 index 7834294..0000000 --- a/modules/nf-core/fastqc/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -fastqc: - - modules/nf-core/fastqc/** diff --git a/modules/nf-core/multiqc/environment.yml b/modules/nf-core/multiqc/environment.yml index 6f5b867..dd513cb 100644 --- a/modules/nf-core/multiqc/environment.yml +++ b/modules/nf-core/multiqc/environment.yml @@ -1,5 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json channels: - conda-forge - bioconda dependencies: - - bioconda::multiqc=1.25.1 + - bioconda::multiqc=1.31 diff --git a/modules/nf-core/multiqc/main.nf b/modules/nf-core/multiqc/main.nf index cc0643e..5288f5c 100644 --- a/modules/nf-core/multiqc/main.nf +++ b/modules/nf-core/multiqc/main.nf @@ -3,8 +3,8 @@ process MULTIQC { conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/multiqc:1.25.1--pyhdfd78af_0' : - 'biocontainers/multiqc:1.25.1--pyhdfd78af_0' }" + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/ef/eff0eafe78d5f3b65a6639265a16b89fdca88d06d18894f90fcdb50142004329/data' : + 'community.wave.seqera.io/library/multiqc:1.31--1efbafd542a23882' }" input: path multiqc_files, stageAs: "?/*" diff --git a/modules/nf-core/multiqc/meta.yml b/modules/nf-core/multiqc/meta.yml index b16c187..ce30eb7 100644 --- a/modules/nf-core/multiqc/meta.yml +++ b/modules/nf-core/multiqc/meta.yml @@ -15,57 +15,71 @@ tools: licence: ["GPL-3.0-or-later"] identifier: biotools:multiqc input: - - - multiqc_files: - type: file - description: | - List of reports / files recognised by MultiQC, for example the html and zip output of FastQC - - - multiqc_config: - type: file - description: Optional config yml for MultiQC - pattern: "*.{yml,yaml}" - - - extra_multiqc_config: - type: file - description: Second optional config yml for MultiQC. Will override common sections - in multiqc_config. - pattern: "*.{yml,yaml}" - - - multiqc_logo: + - multiqc_files: + type: file + description: | + List of reports / files recognised by MultiQC, for example the html and zip output of FastQC + ontologies: [] + - multiqc_config: + type: file + description: Optional config yml for MultiQC + pattern: "*.{yml,yaml}" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML + - extra_multiqc_config: + type: file + description: Second optional config yml for MultiQC. Will override common sections + in multiqc_config. + pattern: "*.{yml,yaml}" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML + - multiqc_logo: + type: file + description: Optional logo file for MultiQC + pattern: "*.{png}" + ontologies: [] + - replace_names: + type: file + description: | + Optional two-column sample renaming file. First column a set of + patterns, second column a set of corresponding replacements. Passed via + MultiQC's `--replace-names` option. + pattern: "*.{tsv}" + ontologies: + - edam: http://edamontology.org/format_3475 # TSV + - sample_names: + type: file + description: | + Optional TSV file with headers, passed to the MultiQC --sample_names + argument. + pattern: "*.{tsv}" + ontologies: + - edam: http://edamontology.org/format_3475 # TSV +output: + report: + - "*multiqc_report.html": type: file - description: Optional logo file for MultiQC - pattern: "*.{png}" - - - replace_names: + description: MultiQC report file + pattern: "multiqc_report.html" + ontologies: [] + data: + - "*_data": + type: directory + description: MultiQC data dir + pattern: "multiqc_data" + plots: + - "*_plots": type: file - description: | - Optional two-column sample renaming file. First column a set of - patterns, second column a set of corresponding replacements. Passed via - MultiQC's `--replace-names` option. - pattern: "*.{tsv}" - - - sample_names: + description: Plots created by MultiQC + pattern: "*_data" + ontologies: [] + versions: + - versions.yml: type: file - description: | - Optional TSV file with headers, passed to the MultiQC --sample_names - argument. 
- pattern: "*.{tsv}" -output: - - report: - - "*multiqc_report.html": - type: file - description: MultiQC report file - pattern: "multiqc_report.html" - - data: - - "*_data": - type: directory - description: MultiQC data dir - pattern: "multiqc_data" - - plots: - - "*_plots": - type: file - description: Plots created by MultiQC - pattern: "*_data" - - versions: - - versions.yml: - type: file - description: File containing software versions - pattern: "versions.yml" + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML authors: - "@abhi18av" - "@bunop" diff --git a/modules/nf-core/multiqc/tests/main.nf.test.snap b/modules/nf-core/multiqc/tests/main.nf.test.snap index 2fcbb5f..17881d1 100644 --- a/modules/nf-core/multiqc/tests/main.nf.test.snap +++ b/modules/nf-core/multiqc/tests/main.nf.test.snap @@ -2,14 +2,14 @@ "multiqc_versions_single": { "content": [ [ - "versions.yml:md5,41f391dcedce7f93ca188f3a3ffa0916" + "versions.yml:md5,8968b114a3e20756d8af2b80713bcc4f" ] ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" + "nf-test": "0.9.2", + "nextflow": "25.04.6" }, - "timestamp": "2024-10-02T17:51:46.317523" + "timestamp": "2025-09-08T20:57:36.139055243" }, "multiqc_stub": { "content": [ @@ -17,25 +17,25 @@ "multiqc_report.html", "multiqc_data", "multiqc_plots", - "versions.yml:md5,41f391dcedce7f93ca188f3a3ffa0916" + "versions.yml:md5,8968b114a3e20756d8af2b80713bcc4f" ] ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" + "nf-test": "0.9.2", + "nextflow": "25.04.6" }, - "timestamp": "2024-10-02T17:52:20.680978" + "timestamp": "2025-09-08T20:59:15.142230631" }, "multiqc_versions_config": { "content": [ [ - "versions.yml:md5,41f391dcedce7f93ca188f3a3ffa0916" + "versions.yml:md5,8968b114a3e20756d8af2b80713bcc4f" ] ], "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" + "nf-test": "0.9.2", + "nextflow": "25.04.6" }, - "timestamp": "2024-10-02T17:52:09.185842" + "timestamp": "2025-09-08T20:58:29.629087066" } } \ No newline at end of file diff --git a/modules/nf-core/multiqc/tests/tags.yml b/modules/nf-core/multiqc/tests/tags.yml deleted file mode 100644 index bea6c0d..0000000 --- a/modules/nf-core/multiqc/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -multiqc: - - modules/nf-core/multiqc/** diff --git a/nextflow.config b/nextflow.config index 3fd3e96..337f725 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,6 +1,6 @@ /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - nf-core/dmscore Nextflow config file + nf-core/deepmutscan Nextflow config file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Default config options for all compute environments ---------------------------------------------------------------------------------------- @@ -25,6 +25,14 @@ params { max_multiqc_email_size = '25.MB' multiqc_methods_description = null + min_counts = 3 + mutagenesis_type = 'nnk' + fitness = false + dimsum = false + run_seqdepth = false + reading_frame = null + custom_codon_library = '/NULL' + // Boilerplate options outdir = null publish_dir_mode = 'copy' @@ -32,13 +40,15 @@ params { email_on_fail = null plaintext_email = false monochrome_logs = false - hook_url = null + hook_url = System.getenv('HOOK_URL') help = false help_full = false show_hidden = false version = false - pipelines_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/' - trace_report_suffix = new 
java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss')// Config options + pipelines_testdata_base_path = 'https://raw.githubusercontent.com/BenjaminWehnert1008/test-datasets/dmsqc/dmsqc/' + trace_report_suffix = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') + + // Config options config_profile_name = null config_profile_description = null @@ -89,9 +99,20 @@ profiles { shifter.enabled = false charliecloud.enabled = false apptainer.enabled = false - docker.runOptions = '-u $(id -u):$(id -g)' + process.containerOptions = '-u $(id -u):$(id -g)' + } + arm64 { + process.arch = 'arm64' + // TODO https://github.com/nf-core/modules/issues/6694 + // For now if you're using arm64 you have to use wave for the sake of the maintainers + // wave profile + apptainer.ociAutoPull = true + singularity.ociAutoPull = true + wave.enabled = true + wave.freeze = true + wave.strategy = 'conda,container' } - arm { + emulate_amd64 { docker.runOptions = '-u $(id -u):$(id -g) --platform=linux/amd64' } singularity { @@ -148,28 +169,69 @@ profiles { wave.freeze = true wave.strategy = 'conda,container' } - gitpod { - executor.name = 'local' - executor.cpus = 4 - executor.memory = 8.GB + gpu { + docker.runOptions = '-u $(id -u):$(id -g) --gpus all' + apptainer.runOptions = '--nv' + singularity.runOptions = '--nv' + } + local { process { - resourceLimits = [ - memory: 8.GB, - cpus : 4, - time : 1.h - ] + withLabel:process_single { + cpus = 4 + memory = 8.GB + time = '6h' + } + withLabel:process_low { + cpus = 4 + memory = 8.GB + time = '6h' + } + withLabel:process_medium { + cpus = 4 + memory = 8.GB + time = '6h' + } + withLabel:process_high { + cpus = 4 + memory = 8.GB + time = '6h' + } + withLabel:process_long { + cpus = 4 + memory = 8.GB + time = '6h' + } + withLabel:process_high_memory { + cpus = 4 + memory = 8.GB + time = '6h' + } + } + } + lowcpu { + process { + // default for all processes when this profile is active + cpus = 1 + + // override resource labels defined (e.g., in base.config) + withLabel: process_single { cpus = 1 } + withLabel: process_low { cpus = 1 } + withLabel: process_medium { cpus = 1 } + withLabel: process_high { cpus = 1 } + withLabel: process_long { cpus = 1 } + withLabel: process_high_memory { cpus = 1 } + + // anything not labeled gets also 1 CPU + withName: /.*/ { cpus = 1 } } } test { includeConfig 'conf/test.config' } test_full { includeConfig 'conf/test_full.config' } } -// Load nf-core custom profiles from different Institutions -includeConfig !System.getenv('NXF_OFFLINE') && params.custom_config_base ? "${params.custom_config_base}/nfcore_custom.config" : "/dev/null" - -// Load nf-core/dmscore custom profiles from different institutions. -// TODO nf-core: Optionally, you can add a pipeline-specific nf-core config at https://github.com/nf-core/configs -// includeConfig !System.getenv('NXF_OFFLINE') && params.custom_config_base ? "${params.custom_config_base}/pipeline/dmscore.config" : "/dev/null" +// If params.custom_config_base is set AND either the NXF_OFFLINE environment variable is not set or params.custom_config_base is a local path, the nfcore_custom.config file from the specified base path is included. +// Load nf-core/deepmutscan custom profiles from different institutions. +includeConfig params.custom_config_base && (!System.getenv('NXF_OFFLINE') || !params.custom_config_base.startsWith('http')) ? 
"${params.custom_config_base}/nfcore_custom.config" : "/dev/null" // Set default registry for Apptainer, Docker, Podman, Charliecloud and Singularity independent of -profile // Will not be used unless Apptainer / Docker / Podman / Charliecloud / Singularity are enabled @@ -195,14 +257,14 @@ env { } // Set bash options -process.shell = """\ -bash - -set -e # Exit if a tool returns a non-zero status/exit code -set -u # Treat unset variables and parameters as an error -set -o pipefail # Returns the status of the last command to exit with a non-zero status or zero if all successfully execute -set -C # No clobber - prevent output redirection from overwriting files. -""" +process.shell = [ + "bash", + "-C", // No clobber - prevent output redirection from overwriting files. + "-e", // Exit if a tool returns a non-zero status/exit code + "-u", // Treat unset variables and parameters as an error + "-o", // Returns the status of the last command to exit.. + "pipefail" // ..with a non-zero status or zero if all successfully execute +] // Disable process selector warnings by default. Use debug profile to enable warnings. nextflow.enable.configProcessNamesValidation = false @@ -225,7 +287,7 @@ dag { } manifest { - name = 'nf-core/dmscore' + name = 'nf-core/deepmutscan' author = """Benjamin Wehnert & Max Stammnitz""" // The author field is deprecated from Nextflow version 24.10.0, use contributors instead contributors = [ // TODO nf-core: Update the field with the details of the contributors to your pipeline. New with Nextflow version 24.10.0 @@ -238,50 +300,23 @@ manifest { orcid: '' ], ] - homePage = 'https://github.com/nf-core/dmscore' + homePage = 'https://github.com/nf-core/deepmutscan' description = """Until now, most Deep Mutational Scanning (DMS) experiments relied on variant-specific barcoded libraries for sequencing. This method enabled DMS on large proteins and led to many great publications. Recently, efforts have increased to make use of the classic and more simple random fragmentation-based short-read sequencing (“shotgun-sequencing”). This saves time and money and due to its simpler experimental design is less prone to mistakes. dmscore handles the essential computational steps, processing the raw FASTQ files and generating a count table of variants. 
Along the way, it provides multiple QC metrics, enabling users to quickly evaluate the success of their experimental setup.""" mainScript = 'main.nf' defaultBranch = 'master' - nextflowVersion = '!>=24.04.2' - version = '1.0.0dev' + nextflowVersion = '!>=25.04.0' + version = '1.0.0' doi = '' } // Nextflow plugins plugins { - id 'nf-schema@2.3.0' // Validation of pipeline parameters and creation of an input channel from a sample sheet + id 'nf-schema@2.5.1' // Validation of pipeline parameters and creation of an input channel from a sample sheet } validation { defaultIgnoreParams = ["genomes"] monochromeLogs = params.monochrome_logs - help { - enabled = true - command = "nextflow run nf-core/dmscore -profile --input samplesheet.csv --outdir " - fullParameter = "help_full" - showHiddenParameter = "show_hidden" - beforeText = """ --\033[2m----------------------------------------------------\033[0m- - \033[0;32m,--.\033[0;30m/\033[0;32m,-.\033[0m -\033[0;34m ___ __ __ __ ___ \033[0;32m/,-._.--~\'\033[0m -\033[0;34m |\\ | |__ __ / ` / \\ |__) |__ \033[0;33m} {\033[0m -\033[0;34m | \\| | \\__, \\__/ | \\ |___ \033[0;32m\\`-._,-`-,\033[0m - \033[0;32m`._,._,\'\033[0m -\033[0;35m nf-core/dmscore ${manifest.version}\033[0m --\033[2m----------------------------------------------------\033[0m- -""" - afterText = """${manifest.doi ? "\n* The pipeline\n" : ""}${manifest.doi.tokenize(",").collect { " https://doi.org/${it.trim().replace('https://doi.org/','')}"}.join("\n")}${manifest.doi ? "\n" : ""} -* The nf-core framework - https://doi.org/10.1038/s41587-020-0439-x - -* Software dependencies - https://github.com/nf-core/dmscore/blob/master/CITATIONS.md -""" - } - summary { - beforeText = validation.help.beforeText - afterText = validation.help.afterText - } } // Load modules.config for DSL2 module specific options diff --git a/nextflow_schema.json b/nextflow_schema.json index e4e5523..16837e1 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1,7 +1,7 @@ { "$schema": "https://json-schema.org/draft/2020-12/schema", - "$id": "https://raw.githubusercontent.com/nf-core/dmscore/master/nextflow_schema.json", - "title": "nf-core/dmscore pipeline parameters", + "$id": "https://raw.githubusercontent.com/nf-core/deepmutscan/master/nextflow_schema.json", + "title": "nf-core/deepmutscan pipeline parameters", "description": "Until now, most Deep Mutational Scanning (DMS) experiments relied on variant-specific barcoded libraries for sequencing. This method enabled DMS on large proteins and led to many great publications. Recently, efforts have increased to make use of the classic and more simple random fragmentation-based short-read sequencing (“shotgun-sequencing”). This saves time and money and due to its simpler experimental design is less prone to mistakes. dmscore handles the essential computational steps, processing the raw FASTQ files and generating a count table of variants. 
Along the way, it provides multiple QC metrics, enabling users to quickly evaluate the success of their experimental setup.", "type": "object", "$defs": { @@ -10,7 +10,7 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir"], + "required": ["input", "outdir", "fasta", "reading_frame"], "properties": { "input": { "type": "string", @@ -20,7 +20,7 @@ "mimetype": "text/csv", "pattern": "^\\S+\\.csv$", "description": "Path to comma-separated file containing information about the samples in the experiment.", - "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. See [usage docs](https://nf-co.re/dmscore/usage#samplesheet-input).", + "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. See [usage docs](https://nf-co.re/deepmutscan/usage#samplesheet-input).", "fa_icon": "fas fa-file-csv" }, "outdir": { @@ -40,6 +40,42 @@ "type": "string", "description": "MultiQC report title. Printed as page header, used for filename if not otherwise specified.", "fa_icon": "fas fa-file-signature" + }, + "reading_frame": { + "type": "string", + "description": "Start and stop codon positions in the format 'start-stop', e.g., '352-1383'.", + "pattern": "^\\d+-\\d+$" + }, + "min_counts": { + "type": "integer", + "description": "Minimum number of variant observations required.", + "minimum": 1, + "default": 3 + }, + "mutagenesis_type": { + "type": "string", + "description": "Type of mutagenic primers. Choose from nnk, nns, max_diff_to_wt, custom. When using 'custom', also provide the parameter 'custom_codon_library'", + "default": "nnk" + }, + "custom_codon_library": { + "type": "string", + "format": "file-path", + "description": "Path to a comma-separated .txt file listing custom codons (e.g., 'AAA,AAC,AAG,...'). Required when mutagenesis_type is 'custom'." + }, + "dimsum": { + "type": "boolean", + "default": false, + "description": "Enable DiMSum execution." + }, + "fitness": { + "type": "boolean", + "default": false, + "description": "Enable basic fitness calculation and preceded data preparation (group and merge counts per sample into counts_merged.tsv, create a specific experimental design file and find a high-count 2nt synonymous mutation). All data prepared serves also as input for the dimsum fitness calculator." + }, + "run_seqdepth": { + "type": "boolean", + "default": false, + "description": "Whether to run the SeqDepth simulation module. This is computationally intensive and optional." } } }, @@ -216,7 +252,7 @@ "type": "string", "fa_icon": "far fa-check-circle", "description": "Base URL or local path to location of pipeline test dataset files", - "default": "https://raw.githubusercontent.com/nf-core/test-datasets/", + "default": "https://raw.githubusercontent.com/BenjaminWehnert1008/test-datasets/dmsqc/dmsqc/", "hidden": true }, "trace_report_suffix": { @@ -224,6 +260,18 @@ "fa_icon": "far calendar", "description": "Suffix to add to the trace report filename. 
Default is the date and time in the format yyyy-MM-dd_HH-mm-ss.", "hidden": true + }, + "help": { + "type": ["boolean", "string"], + "description": "Display the help message." + }, + "help_full": { + "type": "boolean", + "description": "Display the full detailed help message." + }, + "show_hidden": { + "type": "boolean", + "description": "Display hidden parameters in the help message (only works when --help or --help_full are provided)." } } } diff --git a/nf-test.config b/nf-test.config new file mode 100644 index 0000000..3a1fff5 --- /dev/null +++ b/nf-test.config @@ -0,0 +1,24 @@ +config { + // location for all nf-test tests + testsDir "." + + // nf-test directory including temporary files for each test + workDir System.getenv("NFT_WORKDIR") ?: ".nf-test" + + // location of an optional nextflow.config file specific for executing tests + configFile "tests/nextflow.config" + + // ignore tests coming from the nf-core/modules repo + ignore 'modules/nf-core/**/tests/*', 'subworkflows/nf-core/**/tests/*' + + // run all test with defined profile(s) from the main nextflow.config + profile "test" + + // list of filenames or patterns that should be trigger a full test run + triggers 'nextflow.config', 'nf-test.config', 'conf/test.config', 'tests/nextflow.config', 'tests/.nftignore' + + // load the necessary plugins + plugins { + load "nft-utils@0.0.3" + } +} diff --git a/ro-crate-metadata.json b/ro-crate-metadata.json index 766d934..ea70212 100644 --- a/ro-crate-metadata.json +++ b/ro-crate-metadata.json @@ -21,9 +21,9 @@ { "@id": "./", "@type": "Dataset", - "creativeWorkStatus": "InProgress", - "datePublished": "2025-01-22T16:52:07+00:00", - "description": "
\n \n \n \"nf-core/dmscore\"\n \n
\n\n[![GitHub Actions CI Status](https://github.com/nf-core/dmscore/actions/workflows/ci.yml/badge.svg)](https://github.com/nf-core/dmscore/actions/workflows/ci.yml)\n[![GitHub Actions Linting Status](https://github.com/nf-core/dmscore/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/dmscore/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/dmscore/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX)\n[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)\n\n[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A524.04.2-23aa62.svg)](https://www.nextflow.io/)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/dmscore)\n\n[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23dmscore-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/dmscore)[![Follow on Twitter](http://img.shields.io/badge/twitter-%40nf__core-1DA1F2?labelColor=000000&logo=twitter)](https://twitter.com/nf_core)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core)\n\n## Introduction\n\n**nf-core/dmscore** is a bioinformatics pipeline that ...\n\n\n\n\n1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/))2. Present QC for raw reads ([`MultiQC`](http://multiqc.info/))\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\n\n\nNow, you can run the pipeline using:\n\n\n\n```bash\nnextflow run nf-core/dmscore \\\n -profile \\\n --input samplesheet.csv \\\n --outdir \n```\n\n> [!WARNING]\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. 
Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files).\n\nFor more details and further functionality, please refer to the [usage documentation](https://nf-co.re/dmscore/usage) and the [parameter documentation](https://nf-co.re/dmscore/parameters).\n\n## Pipeline output\n\nTo see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/dmscore/results) tab on the nf-core website pipeline page.\nFor more details about the output files and reports, please refer to the\n[output documentation](https://nf-co.re/dmscore/output).\n\n## Credits\n\nnf-core/dmscore was originally written by Benjamin Wehnert & Max Stammnitz.\n\nWe thank the following people for their extensive assistance in the development of this pipeline:\n\n\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\nFor further information or help, don't hesitate to get in touch on the [Slack `#dmscore` channel](https://nfcore.slack.com/channels/dmscore) (you can join with [this invite](https://nf-co.re/join/slack)).\n\n## Citations\n\n\n\n\n\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nYou can cite the `nf-core` publication as follows:\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", + "creativeWorkStatus": "Stable", + "datePublished": "2025-11-10T09:04:57+00:00", + "description": "
\n \n \n \"nf-core/deepmutscan\"\n \n
\n\n[![Open in GitHub Codespaces](https://github.com/codespaces/badge.svg)](https://github.com/codespaces/new/nf-core/deepmutscan)\n[![GitHub Actions CI Status](https://github.com/nf-core/deepmutscan/actions/workflows/nf-test.yml/badge.svg)](https://github.com/nf-core/deepmutscan/actions/workflows/nf-test.yml)\n[![GitHub Actions Linting Status](https://github.com/nf-core/deepmutscan/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/deepmutscan/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/deepmutscan/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX)\n[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)\n\n[![Nextflow](https://img.shields.io/badge/version-%E2%89%A525.04.0-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/)\n[![nf-core template version](https://img.shields.io/badge/nf--core_template-3.4.1-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.4.1)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/deepmutscan)\n\n[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23deepmutscan-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/deepmutscan)[![Follow on Bluesky](https://img.shields.io/badge/bluesky-%40nf__core-1185fe?labelColor=000000&logo=bluesky)](https://bsky.app/profile/nf-co.re)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core)\n\n## Introduction\n\n**nf-core/deepmutscan** is a workflow designed for the analysis of deep mutational scanning (DMS) data. DMS enables researchers to experimentally measure the fitness effects of thousands of genes or gene variants simultaneously, helping to classify disease causing mutants in human and animal populations, to learn the fundamental rules of virus evolution, protein architecture, splicing, small-molecule interactions and many other phenotypes.\n\nWhile DNA synthesis and sequencing technologies have advanced substantially, long open reading frame (ORF) targets still present major challenges for DMS studies. Shotgun DNA sequencing can be used to greatly speed up the inference of long ORF mutant fitness landscapes, theoretically at no expense in accuracy. 
We have designed the `nf-core/deepmutscan` pipeline to unlock the power of shotgun sequencing based DMS studies on long ORFs, to simplify and standardise the complex bioinformatics steps involved in data processing of such experiments \u2013 from read alignment to QC reporting and fitness landscape inferences.\n\n
\n\nThe pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community!\n\nOn release, automated continuous integration tests run the pipeline on a full-sized dataset on the AWS cloud infrastructure. This ensures that the pipeline runs on AWS, has sensible resource allocation defaults set to run on real-world datasets, and permits the persistent storage of results to benchmark between pipeline releases and other analysis sources. The results obtained from the full-sized test can be viewed on the [nf-core website](https://nf-co.re/deepmutscan/results).\n\n## Major features\n\n- End-to-end analyses of various DMS data\n- Modular, three-stage workflow: alignment \u2192 QC \u2192 error-aware fitness estimation\n- Integration with popular statistical fitness estimation tools like [DiMSum](https://github.com/lehner-lab/DiMSum), [Enrich2](https://github.com/FowlerLab/Enrich2), [rosace](https://github.com/pimentellab/rosace/) and [mutscan](https://github.com/fmicompbio/mutscan)\n- Support of multiple mutagenesis strategies, e.g. by nicking with degenerate NNK and NNS codons\n- Containerisation via Docker, Singularity and Apptainer\n- Scalability across HPC and Cloud systems\n- Monitoring of CPU, memory, and CO\u2082 usage\n\nFor more details on the pipeline and on potential future expansions, please consider reading our [usage description](https://nf-co.re/deepmutscan/usage).\n\n## Step-by-step pipeline summary\n\nThe pipeline processes deep mutational scanning (DMS) sequencing data in several stages:\n\n1. Alignment of reads to the reference open reading frame (ORF) (`BWA-mem`)\n2. Filtering of wildtype and erroneous reads (`samtools view`)\n3. Read merging for base error reduction (`vsearch merge`, `BWA-mem`)\n4. Mutation counting (`GATK AnalyzeSaturationMutagenesis`)\n5. DMS library quality control\n6. Data summarisation across samples\n7. Single nucleotide variant error correction _(in development)_\n8. Fitness estimation _(in development)_\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\nFirst, prepare a samplesheet with your input/output data in which each row represents a pair of fastq files (paired end). This should look as follows:\n\n```csv title=\"samplesheet.csv\"\nsample,type,replicate,file1,file2\nORF1,input,1,/reads/forward1.fastq.gz,/reads/reverse1.fastq.gz\nORF1,input,2,/reads/forward2.fastq.gz,/reads/reverse2.fastq.gz\nORF1,output,1,/reads/forward3.fastq.gz,/reads/reverse3.fastq.gz\nORF1,output,2,/reads/forward4.fastq.gz,/reads/reverse4.fastq.gz\n```\n\nSecondly, specify the gene or gene region of interest using a reference FASTA file via `--fasta`. 
Provide the exact codon coordinates using `--reading_frame`.\n\nNow, you can run the pipeline using:\n\n```bash title=\"example pipeline run\"\nnextflow run nf-core/deepmutscan \\\n -profile \\\n --input ./samplesheet.csv \\\n --fasta ./ref.fa \\\n --reading_frame 1-300 \\\n --outdir ./results\n```\n\n## Pipeline output\n\nTo see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/deepmutscan/results) tab on the nf-core website pipeline page.\n\nFor more details about the output files and reports, please refer to the\n[output documentation](https://nf-co.re/deepmutscan/output).\n\n## Contributing\n\nWe welcome contributions from the community!\n\nFor technical challenges and feedback on the pipeline, please use our [Github repository](https://github.com/nf-core/deepmutscan). Please open an [issue](https://github.com/nf-core/deepmutscan/issues/new) or [pull request](https://github.com/nf-core/deepmutscan/compare) to:\n\n- Report bugs or solve data incompatibilities when running `nf-core/deepmutscan`\n- Suggest the implementation of new modules for custom DMS workflows\n- Help improve this documentation\n\nIf you are interested in getting involved as a developer, please consider joining our interactive [`#deepmutscan` Slack channel](https://nfcore.slack.com/channels/deepmutscan) (via [this invite](https://nf-co.re/join/slack)).\n\n## Credits\n\nnf-core/deepmutscan was originally written by [Benjamin Wehnert](https://github.com/BenjaminWehnert1008) and [Max Stammnitz](https://github.com/MaximilianStammnitz) at the [Centre for Genomic Regulation, Barcelona](https://www.crg.eu/), with the generous support of an EMBO Long-term Postdoctoral Fellowship and a Marie Sk\u0142odowska-Curie grant by the European Union.\n\nIf you use `nf-core/deepmutscan` in your analyses, please cite:\n\n> \ud83d\udcc4 Wehnert et al., _bioRxiv_ preprint (coming soon)\n\nPlease also cite the `nf-core` framework:\n\n> \ud83d\udcc4 Ewels et al., _Nature Biotechnology_, 2020\n> [https://doi.org/10.1038/s41587-020-0439-x](https://doi.org/10.1038/s41587-020-0439-x)\n\nFor further information or help, don't hesitate to get in touch on the [Slack `#deepmutscan` channel](https://nfcore.slack.com/channels/deepmutscan) (you can join with [this invite](https://nf-co.re/join/slack)).\n\n## Scientific contact\n\nFor scientific discussions around the use of this pipeline (e.g. 
on experimental design or sequencing data requirements), please feel free to get in touch with us directly:\n\n- Benjamin Wehnert \u2014 wehnertbenjamin@gmail.com\n- Maximilian Stammnitz \u2014 maximilian.stammnitz@crg.eu\n", "hasPart": [ { "@id": "main.nf" @@ -43,6 +43,9 @@ { "@id": "modules/" }, + { + "@id": "modules/local/" + }, { "@id": "modules/nf-core/" }, @@ -92,17 +95,17 @@ "@id": ".prettierignore" } ], - "isBasedOn": "https://github.com/nf-core/dmscore", + "isBasedOn": "https://github.com/nf-core/deepmutscan", "license": "MIT", "mainEntity": { "@id": "main.nf" }, "mentions": [ { - "@id": "#f9815654-37bd-4781-a133-ab36324210f5" + "@id": "#f4a3c981-8917-4281-9629-144b2738afbd" } ], - "name": "nf-core/dmscore" + "name": "nf-core/deepmutscan" }, { "@id": "ro-crate-metadata.json", @@ -127,7 +130,7 @@ "ComputationalWorkflow" ], "dateCreated": "", - "dateModified": "2025-01-22T17:52:07Z", + "dateModified": "2025-11-10T10:04:57Z", "dct:conformsTo": "https://bioschemas.org/profiles/ComputationalWorkflow/1.0-RELEASE/", "keywords": [ "nf-core", @@ -137,7 +140,7 @@ "MIT" ], "name": [ - "nf-core/dmscore" + "nf-core/deepmutscan" ], "programmingLanguage": { "@id": "https://w3id.org/workflowhub/workflow-ro-crate#nextflow" @@ -146,11 +149,11 @@ "@id": "https://nf-co.re/" }, "url": [ - "https://github.com/nf-core/dmscore", - "https://nf-co.re/dmscore/dev/" + "https://github.com/nf-core/deepmutscan", + "https://nf-co.re/deepmutscan/1.0.0/" ], "version": [ - "1.0.0dev" + "1.0.0" ] }, { @@ -166,23 +169,23 @@ "version": "!>=24.04.2" }, { - "@id": "#f9815654-37bd-4781-a133-ab36324210f5", + "@id": "#f4a3c981-8917-4281-9629-144b2738afbd", "@type": "TestSuite", "instance": [ { - "@id": "#34dab5c1-0a1b-41b0-80d1-74e2f2d38434" + "@id": "#b0a1d187-61ce-46f7-8e2a-53759e1eafd5" } ], "mainEntity": { "@id": "main.nf" }, - "name": "Test suite for nf-core/dmscore" + "name": "Test suite for nf-core/deepmutscan" }, { - "@id": "#34dab5c1-0a1b-41b0-80d1-74e2f2d38434", + "@id": "#b0a1d187-61ce-46f7-8e2a-53759e1eafd5", "@type": "TestInstance", - "name": "GitHub Actions workflow for testing nf-core/dmscore", - "resource": "repos/nf-core/dmscore/actions/workflows/ci.yml", + "name": "GitHub Actions workflow for testing nf-core/deepmutscan", + "resource": "repos/nf-core/deepmutscan/actions/workflows/ci.yml", "runsOn": { "@id": "https://w3id.org/ro/terms/test#GithubService" }, @@ -221,6 +224,11 @@ "@type": "Dataset", "description": "Modules used by the pipeline" }, + { + "@id": "modules/local/", + "@type": "Dataset", + "description": "Pipeline-specific modules" + }, { "@id": "modules/nf-core/", "@type": "Dataset", diff --git a/subworkflows/local/utils_nfcore_dmscore_pipeline/main.nf b/subworkflows/local/utils_nfcore_deepmutscan_pipeline/main.nf similarity index 76% rename from subworkflows/local/utils_nfcore_dmscore_pipeline/main.nf rename to subworkflows/local/utils_nfcore_deepmutscan_pipeline/main.nf index 14eba3c..2368e89 100644 --- a/subworkflows/local/utils_nfcore_dmscore_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_deepmutscan_pipeline/main.nf @@ -1,5 +1,5 @@ // -// Subworkflow with functionality specific to the nf-core/dmscore pipeline +// Subworkflow with functionality specific to the nf-core/deepmutscan pipeline // /* @@ -11,6 +11,7 @@ include { UTILS_NFSCHEMA_PLUGIN } from '../../nf-core/utils_nfschema_plugin' include { paramsSummaryMap } from 'plugin/nf-schema' include { samplesheetToList } from 'plugin/nf-schema' +include { paramsHelp } from 'plugin/nf-schema' include { completionEmail } from 
'../../nf-core/utils_nfcore_pipeline' include { completionSummary } from '../../nf-core/utils_nfcore_pipeline' include { imNotification } from '../../nf-core/utils_nfcore_pipeline' @@ -32,6 +33,9 @@ workflow PIPELINE_INITIALISATION { nextflow_cli_args // array: List of positional nextflow CLI args outdir // string: The output directory where the results will be saved input // string: Path to input samplesheet + help // boolean: Display help message and exit + help_full // boolean: Show the full help message + show_hidden // boolean: Show hidden parameters in the help message main: @@ -50,10 +54,35 @@ workflow PIPELINE_INITIALISATION { // // Validate parameters and generate parameter summary to stdout // + before_text = """ +-\033[2m----------------------------------------------------\033[0m- + \033[0;32m,--.\033[0;30m/\033[0;32m,-.\033[0m +\033[0;34m ___ __ __ __ ___ \033[0;32m/,-._.--~\'\033[0m +\033[0;34m |\\ | |__ __ / ` / \\ |__) |__ \033[0;33m} {\033[0m +\033[0;34m | \\| | \\__, \\__/ | \\ |___ \033[0;32m\\`-._,-`-,\033[0m + \033[0;32m`._,._,\'\033[0m +\033[0;35m nf-core/deepmutscan ${workflow.manifest.version}\033[0m +-\033[2m----------------------------------------------------\033[0m- +""" + after_text = """${workflow.manifest.doi ? "\n* The pipeline\n" : ""}${workflow.manifest.doi.tokenize(",").collect { " https://doi.org/${it.trim().replace('https://doi.org/','')}"}.join("\n")}${workflow.manifest.doi ? "\n" : ""} +* The nf-core framework + https://doi.org/10.1038/s41587-020-0439-x + +* Software dependencies + https://github.com/nf-core/deepmutscan/blob/master/CITATIONS.md +""" + command = "nextflow run ${workflow.manifest.name} -profile --input samplesheet.csv --outdir " + UTILS_NFSCHEMA_PLUGIN ( workflow, validate_params, - null + null, + help, + help_full, + show_hidden, + before_text, + after_text, + command ) // @@ -73,25 +102,39 @@ workflow PIPELINE_INITIALISATION { // Channel - .fromList(samplesheetToList(params.input, "${projectDir}/assets/schema_input.json")) - .map { - meta, fastq_1, fastq_2 -> - if (!fastq_2) { - return [ meta.id, meta + [ single_end:true ], [ fastq_1 ] ] - } else { - return [ meta.id, meta + [ single_end:false ], [ fastq_1, fastq_2 ] ] - } - } - .groupTuple() - .map { samplesheet -> - validateInputSamplesheet(samplesheet) + .fromPath(params.input) + .splitCsv(header: true) + .filter { row -> + // Skip rows where file1 or file2 are BAM files + !(row.file1.endsWith('.bam') || (row.file2 && row.file2.endsWith('.bam'))) + } + .map { row -> + // Determine suffix based on the presence of file2 + def suffix = row.file2 ? 
"_pe" : "_se" + + // Construct metadata object with updated ID + def meta = [ + id : "${row.sample}_${row.type}_${row.replicate}${suffix}", // Base ID with suffix + sample : row.sample, + type : row.type, + replicate : row.replicate as int + ] + + // Generate file paths based on the presence of file1 and file2 + def reads = [] + if (row.file1) { + reads << row.file1 // Add file1 path } - .map { - meta, fastqs -> - return [ meta, fastqs.flatten() ] + if (row.file2) { + reads << row.file2 // Add file2 path } - .set { ch_samplesheet } + // Return metadata and file paths as a tuple + return [meta, reads] + } + .set { ch_samplesheet } + +// Emit the samplesheet channel and an empty version channel for use in the workflow emit: samplesheet = ch_samplesheet versions = ch_versions @@ -261,4 +304,3 @@ def methodsDescriptionText(mqc_methods_yaml) { return description_html.toString() } - diff --git a/subworkflows/nf-core/utils_nextflow_pipeline/tests/tags.yml b/subworkflows/nf-core/utils_nextflow_pipeline/tests/tags.yml deleted file mode 100644 index f847611..0000000 --- a/subworkflows/nf-core/utils_nextflow_pipeline/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -subworkflows/utils_nextflow_pipeline: - - subworkflows/nf-core/utils_nextflow_pipeline/** diff --git a/subworkflows/nf-core/utils_nfcore_pipeline/tests/tags.yml b/subworkflows/nf-core/utils_nfcore_pipeline/tests/tags.yml deleted file mode 100644 index ac8523c..0000000 --- a/subworkflows/nf-core/utils_nfcore_pipeline/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -subworkflows/utils_nfcore_pipeline: - - subworkflows/nf-core/utils_nfcore_pipeline/** diff --git a/subworkflows/nf-core/utils_nfschema_plugin/main.nf b/subworkflows/nf-core/utils_nfschema_plugin/main.nf index 4994303..ee4738c 100644 --- a/subworkflows/nf-core/utils_nfschema_plugin/main.nf +++ b/subworkflows/nf-core/utils_nfschema_plugin/main.nf @@ -4,6 +4,7 @@ include { paramsSummaryLog } from 'plugin/nf-schema' include { validateParameters } from 'plugin/nf-schema' +include { paramsHelp } from 'plugin/nf-schema' workflow UTILS_NFSCHEMA_PLUGIN { @@ -15,29 +16,56 @@ workflow UTILS_NFSCHEMA_PLUGIN { // when this input is empty it will automatically use the configured schema or // "${projectDir}/nextflow_schema.json" as default. This input should not be empty // for meta pipelines + help // boolean: show help message + help_full // boolean: show full help message + show_hidden // boolean: show hidden parameters in help message + before_text // string: text to show before the help message and parameters summary + after_text // string: text to show after the help message and parameters summary + command // string: an example command of the pipeline main: + if(help || help_full) { + help_options = [ + beforeText: before_text, + afterText: after_text, + command: command, + showHidden: show_hidden, + fullHelp: help_full, + ] + if(parameters_schema) { + help_options << [parametersSchema: parameters_schema] + } + log.info paramsHelp( + help_options, + params.help instanceof String ? params.help : "", + ) + exit 0 + } + // // Print parameter summary to stdout. 
This will display the parameters // that differ from the default given in the JSON schema // + + summary_options = [:] if(parameters_schema) { - log.info paramsSummaryLog(input_workflow, parameters_schema:parameters_schema) - } else { - log.info paramsSummaryLog(input_workflow) + summary_options << [parametersSchema: parameters_schema] } + log.info before_text + log.info paramsSummaryLog(summary_options, input_workflow) + log.info after_text // // Validate the parameters using nextflow_schema.json or the schema // given via the validation.parametersSchema configuration option // if(validate_params) { + validateOptions = [:] if(parameters_schema) { - validateParameters(parameters_schema:parameters_schema) - } else { - validateParameters() + validateOptions << [parametersSchema: parameters_schema] } + validateParameters(validateOptions) } emit: diff --git a/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test b/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test index 8fb3016..c977917 100644 --- a/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test +++ b/subworkflows/nf-core/utils_nfschema_plugin/tests/main.nf.test @@ -25,6 +25,12 @@ nextflow_workflow { input[0] = workflow input[1] = validate_params input[2] = "" + input[3] = false + input[4] = false + input[5] = false + input[6] = "" + input[7] = "" + input[8] = "" """ } } @@ -51,6 +57,12 @@ nextflow_workflow { input[0] = workflow input[1] = validate_params input[2] = "" + input[3] = false + input[4] = false + input[5] = false + input[6] = "" + input[7] = "" + input[8] = "" """ } } @@ -77,6 +89,12 @@ nextflow_workflow { input[0] = workflow input[1] = validate_params input[2] = "${projectDir}/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow_schema.json" + input[3] = false + input[4] = false + input[5] = false + input[6] = "" + input[7] = "" + input[8] = "" """ } } @@ -103,6 +121,12 @@ nextflow_workflow { input[0] = workflow input[1] = validate_params input[2] = "${projectDir}/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow_schema.json" + input[3] = false + input[4] = false + input[5] = false + input[6] = "" + input[7] = "" + input[8] = "" """ } } @@ -114,4 +138,36 @@ nextflow_workflow { ) } } + + test("Should create a help message") { + + when { + + params { + test_data = '' + outdir = null + } + + workflow { + """ + validate_params = true + input[0] = workflow + input[1] = validate_params + input[2] = "${projectDir}/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow_schema.json" + input[3] = true + input[4] = false + input[5] = false + input[6] = "Before" + input[7] = "After" + input[8] = "nextflow run test/test" + """ + } + } + + then { + assertAll( + { assert workflow.success } + ) + } + } } diff --git a/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow.config b/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow.config index 0907ac5..8d8c737 100644 --- a/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow.config +++ b/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow.config @@ -1,8 +1,8 @@ plugins { - id "nf-schema@2.1.0" + id "nf-schema@2.5.1" } validation { parametersSchema = "${projectDir}/subworkflows/nf-core/utils_nfschema_plugin/tests/nextflow_schema.json" monochromeLogs = true -} \ No newline at end of file +} diff --git a/testdata/README.md b/testdata/README.md new file mode 100644 index 0000000..219ecf1 --- /dev/null +++ b/testdata/README.md @@ -0,0 +1 @@ +4 x 50k sequencing reads (small pilot test data set) diff --git 
a/testdata/input1_50k_1.fastq.gz b/testdata/input1_50k_1.fastq.gz new file mode 100644 index 0000000..62fed3d Binary files /dev/null and b/testdata/input1_50k_1.fastq.gz differ diff --git a/testdata/input1_50k_2.fastq.gz b/testdata/input1_50k_2.fastq.gz new file mode 100644 index 0000000..bd888d9 Binary files /dev/null and b/testdata/input1_50k_2.fastq.gz differ diff --git a/testdata/input2_50k_1.fastq.gz b/testdata/input2_50k_1.fastq.gz new file mode 100644 index 0000000..a23bb31 Binary files /dev/null and b/testdata/input2_50k_1.fastq.gz differ diff --git a/testdata/input2_50k_2.fastq.gz b/testdata/input2_50k_2.fastq.gz new file mode 100644 index 0000000..e4166c5 Binary files /dev/null and b/testdata/input2_50k_2.fastq.gz differ diff --git a/testdata/output1_50k_1.fastq.gz b/testdata/output1_50k_1.fastq.gz new file mode 100644 index 0000000..bffa42e Binary files /dev/null and b/testdata/output1_50k_1.fastq.gz differ diff --git a/testdata/output1_50k_2.fastq.gz b/testdata/output1_50k_2.fastq.gz new file mode 100644 index 0000000..60a7cc4 Binary files /dev/null and b/testdata/output1_50k_2.fastq.gz differ diff --git a/testdata/output2_50k_1.fastq.gz b/testdata/output2_50k_1.fastq.gz new file mode 100644 index 0000000..48aec2a Binary files /dev/null and b/testdata/output2_50k_1.fastq.gz differ diff --git a/testdata/output2_50k_2.fastq.gz b/testdata/output2_50k_2.fastq.gz new file mode 100644 index 0000000..ac23137 Binary files /dev/null and b/testdata/output2_50k_2.fastq.gz differ diff --git a/tests/.nftignore b/tests/.nftignore new file mode 100644 index 0000000..e128a12 --- /dev/null +++ b/tests/.nftignore @@ -0,0 +1,12 @@ +.DS_Store +multiqc/multiqc_data/fastqc_top_overrepresented_sequences_table.txt +multiqc/multiqc_data/multiqc.parquet +multiqc/multiqc_data/multiqc.log +multiqc/multiqc_data/multiqc_data.json +multiqc/multiqc_data/multiqc_sources.txt +multiqc/multiqc_data/multiqc_software_versions.txt +multiqc/multiqc_data/llms-full.txt +multiqc/multiqc_plots/{svg,pdf,png}/*.{svg,pdf,png} +multiqc/multiqc_report.html +fastqc/*_fastqc.{html,zip} +pipeline_info/*.{html,json,txt,yml} diff --git a/tests/default.nf.test b/tests/default.nf.test new file mode 100644 index 0000000..efb3834 --- /dev/null +++ b/tests/default.nf.test @@ -0,0 +1,33 @@ +nextflow_pipeline { + + name "Test pipeline" + script "../main.nf" + tag "pipeline" + + test("-profile test") { + + when { + params { + outdir = "$outputDir" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable name + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + // stable_path: All files in ${params.outdir}/ with stable content + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + assertAll( + { assert workflow.success}, + { assert snapshot( + // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions + removeNextflowVersion("$outputDir/pipeline_info/nf_core_deepmutscan_software_mqc_versions.yml"), + // All stable path name, with a relative path + stable_name, + // All files with stable contents + stable_path + ).match() } + ) + } + } +} diff --git a/tests/nextflow.config b/tests/nextflow.config new file mode 100644 index 0000000..662ccd0 --- /dev/null +++ b/tests/nextflow.config @@ -0,0 +1,14 @@ +/* +======================================================================================== + Nextflow config file for 
running nf-test tests +======================================================================================== +*/ + +// TODO nf-core: Specify any additional parameters here +// Or any resources requirements +params { + modules_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/' + pipelines_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/refs/heads/deepmutscan' +} + +aws.client.anonymous = true // fixes S3 access issues on self-hosted runners diff --git a/workflows/deepmutscan.nf b/workflows/deepmutscan.nf new file mode 100644 index 0000000..009190d --- /dev/null +++ b/workflows/deepmutscan.nf @@ -0,0 +1,528 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + IMPORT MODULES / SUBWORKFLOWS / FUNCTIONS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ +include { FASTQC } from '../modules/nf-core/fastqc/main' +include { MULTIQC } from '../modules/nf-core/multiqc/main' +include { BWA_INDEX } from '../modules/nf-core/bwa/index/main' +include { BWA_MEM } from '../modules/nf-core/bwa/mem/main' +include { BAMFILTER_DMS } from '../modules/local/bamprocessing/bamfilteringdms' +include { PREMERGE } from '../modules/local/bamprocessing/premerge' +include { GATK_SATURATIONMUTAGENESIS } from '../modules/local/gatk/saturationmutagenesis' +include { DMSANALYSIS_AASEQ } from '../modules/local/dmsanalysis/aaseq' +include { DMSANALYSIS_POSSIBLE_MUTATIONS } from '../modules/local/dmsanalysis/possiblemutations' +include { DMSANALYSIS_PROCESS_GATK } from '../modules/local/dmsanalysis/processgatk' +include { VISUALIZATION_COUNTS_PER_COV } from '../modules/local/visualization/visualization' +include { VISUALIZATION_COUNTS_HEATMAP } from '../modules/local/visualization/visualization' +include { VISUALIZATION_GLOBAL_POS_BIASES_COUNTS } from '../modules/local/visualization/visualization' +include { VISUALIZATION_GLOBAL_POS_BIASES_COV } from '../modules/local/visualization/visualization' +include { VISUALIZATION_LOGDIFF } from '../modules/local/visualization/visualization' +include { VISUALIZATION_SEQDEPTH } from '../modules/local/visualization/visualization' +include { GATK_GATKTOFITNESS } from '../modules/local/gatk/gatktofitness' +include { MERGE_COUNTS } from '../modules/local/fitness/merge_counts' +include { EXPDESIGN_FITNESS } from '../modules/local/fitness/fitness_experimental_design' +include { FIND_SYNONYMOUS_MUTATION } from '../modules/local/fitness/find_synonymous_mutation' +include { FITNESS_CALCULATION } from '../modules/local/fitness/fitness_standard' +include { FITNESS_QC } from '../modules/local/fitness/fitness_standard' +include { FITNESS_HEATMAP } from '../modules/local/fitness/fitness_standard' +include { RUN_DIMSUM } from '../modules/local/fitness/run_dimsum' +include { paramsSummaryMap } from 'plugin/nf-schema' +include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline' +include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline' +include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_deepmutscan_pipeline' + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + RUN MAIN WORKFLOW +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +// Params defaults +params.min_counts = 3 // minimum counts for variant to be recognized. 
All variants below this threshold are ignored
+// If params.mutagenesis_type is set to 'custom', params.custom_codon_library has to be the path to a .txt file containing the custom library
+params.sliding_window_size = 10 // sliding window size used to smooth graphs in plots (e.g. the GLOBAL_POS_BIASES_COUNTS function)
+params.aimed_cov = 100 // aimed coverage (assuming an even spread) used to visualize the threshold in plots
+params.run_seqdepth = false // create the seqdepth simulation plot; computationally heavy, so disabled by default
+params.fitness = false // run basic fitness calculation from selection input & output samples
+params.dimsum = false // run DiMSum for fitness/functionality scores from selection input & output samples
+
+
+// Define fasta file as channel (e.g. for BWA index)
+Channel
+    .fromPath(params.fasta, checkIfExists: true)
+    .map { fasta -> tuple( [id: 'ref'], fasta ) }
+    .set { ch_fasta }
+
+// Define reading_frame as channel (e.g. for the GATK step)
+Channel
+    .value(params.reading_frame)
+    .set { reading_frame_ch }
+
+// Define min_counts as channel (e.g. for the GATK step) -> default if not set: 3
+Channel
+    .value(params.min_counts)
+    .set { min_counts_ch }
+
+// Define custom library as channel
+Channel
+    .value(params.custom_codon_library)
+    .set { custom_codon_library_ch }
+
+// Define mutagenesis type as channel
+Channel
+    .value(params.mutagenesis_type)
+    .set { mutagenesis_type_ch }
+
+// Define sliding_window_size as channel (e.g. for the GLOBAL_POS_BIASES_COUNTS function) -> default if not set: 10
+Channel
+    .value(params.sliding_window_size)
+    .set { sliding_window_size_ch }
+
+// Define aimed_cov as channel (e.g. for the GLOBAL_POS_BIASES_COV function) -> default if not set: 100
+Channel
+    .value(params.aimed_cov)
+    .set { aimed_cov_ch }
+
+// Define whether the seqdepth plot should be created
+Channel
+    .value(params.run_seqdepth)
+    .set { run_seqdepth_ch }
+
+// The deepmutscan samplesheet file provided via --input
+Channel
+    .fromPath(params.input, checkIfExists: true)
+    .set { ch_samplesheet_csv }
+
+
+// Helper to anchor script paths to the repo root
+def R = { rel -> Channel.fromPath("${workflow.projectDir}/${rel}", checkIfExists: true) }
+
+// Define R scripts as channels (repo-anchored, works from any launch dir)
+R("modules/local/dmsanalysis/bin/SeqDepth_simulation.R").set { seqdepth_simulation_script_ch }
+R("modules/local/dmsanalysis/bin/aa_seq.R").set { aa_seq_script_ch }
+R("modules/local/dmsanalysis/bin/complete_prefiltered_gatk.R").set { complete_gatk_script_ch }
+R("modules/local/dmsanalysis/bin/counts_heatmap.R").set { counts_heatmap_script_ch }
+R("modules/local/dmsanalysis/bin/counts_per_cov_heatmap.R").set { counts_per_cov_heatmap_script_ch }
+R("modules/local/dmsanalysis/bin/detect_codons.R").set { detect_codons_script_ch }
+R("modules/local/dmsanalysis/bin/filter_gatk_by_codon_library.R").set { filter_by_library_script_ch }
+R("modules/local/dmsanalysis/bin/gatk_to_fitness.R").set { gatk_to_fitness_script_ch }
+R("modules/local/dmsanalysis/bin/global_position_biases_counts_and_counts_per_cov.R").set { global_bias_counts_cov_script_ch }
+R("modules/local/dmsanalysis/bin/global_position_biases_cov.R").set { global_bias_cov_script_ch }
+R("modules/local/dmsanalysis/bin/install_packages.R").set { install_packages_script_ch }
+R("modules/local/dmsanalysis/bin/logdiff.R").set { logdiff_script_ch }
+R("modules/local/dmsanalysis/bin/low_count_variants.R").set { low_count_variants_script_ch }
+R("modules/local/dmsanalysis/bin/possible_mutations.R").set { 
possible_mutations_script_ch } +R("modules/local/dmsanalysis/bin/prepare_gatk_data_for_count_heatmaps.R").set { prepare_counts_heatmap_script_ch } +R("modules/local/dmsanalysis/bin/prepare_gatk_data_for_fitness_heatmap.R").set { prepare_fitness_heatmap_script_ch } +R("modules/local/dmsanalysis/bin/process_raw_gatk.R").set { process_raw_gatk_script_ch } +R("modules/local/dmsanalysis/bin/merge_counts.R").set { merge_counts_script_ch } +R("modules/local/dmsanalysis/bin/dimsum_experimentalDesign.R").set { exp_design_ch } +R("modules/local/dmsanalysis/bin/fitness_calculation.R").set { fitness_calculation_script_ch } +R("modules/local/dmsanalysis/bin/fitness_QC.R").set { fitness_QC_script_ch } +R("modules/local/dmsanalysis/bin/fitness_heatmap.R").set { fitness_heatmap_script_ch } +R("modules/local/dmsanalysis/bin/find_syn_mutation.R").set { syn_mut_ch } + + + +workflow DEEPMUTSCAN { + + take: + ch_samplesheet // channel: samplesheet read in from --input + main: + + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + + // + // MODULE: Run FastQC + // + FASTQC ( + ch_samplesheet + ) + ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}) + ch_versions = ch_versions.mix(FASTQC.out.versions.first()) + + // + // Collate and save software versions + // + softwareVersionsToYAML(ch_versions) + .collectFile( + storeDir: "${params.outdir}/pipeline_info", + name: 'nf_core_' + 'deepmutscan_software_' + 'mqc_' + 'versions.yml', + sort: true, + newLine: true + ).set { ch_collated_versions } + + + // + // MODULE: MultiQC + // + ch_multiqc_config = Channel.fromPath( + "$projectDir/assets/multiqc_config.yml", checkIfExists: true) + ch_multiqc_custom_config = params.multiqc_config ? + Channel.fromPath(params.multiqc_config, checkIfExists: true) : + Channel.empty() + ch_multiqc_logo = params.multiqc_logo ? + Channel.fromPath(params.multiqc_logo, checkIfExists: true) : + Channel.empty() + + summary_params = paramsSummaryMap( + workflow, parameters_schema: "nextflow_schema.json") + ch_workflow_summary = Channel.value(paramsSummaryMultiqc(summary_params)) + ch_multiqc_files = ch_multiqc_files.mix( + ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) + ch_multiqc_custom_methods_description = params.multiqc_methods_description ? 
+ file(params.multiqc_methods_description, checkIfExists: true) : + file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) + ch_methods_description = Channel.value( + methodsDescriptionText(ch_multiqc_custom_methods_description)) + + ch_multiqc_files = ch_multiqc_files.mix(ch_collated_versions) + ch_multiqc_files = ch_multiqc_files.mix( + ch_methods_description.collectFile( + name: 'methods_description_mqc.yaml', + sort: true + ) + ) + + MULTIQC ( + ch_multiqc_files.collect(), + ch_multiqc_config.toList(), + ch_multiqc_custom_config.toList(), + ch_multiqc_logo.toList(), + [], + [] + ) + + // + // MODULE: BWA Index + // + BWA_INDEX ( + ch_fasta + ) + + // Broadcast index to all samples + ch_bwa_index = BWA_INDEX.out.index + + // Broadcast the index to all samples + ch_bwa_index_broadcast = ch_samplesheet + .combine(ch_bwa_index) + .map { [it[2], it[3]] } + + // Broadcast the fasta to all samples + ch_fasta_broadcast = ch_fasta + .combine(ch_samplesheet) + .map { [it[0], it[1]] } + + // Broadcast the sort flag to all samples + ch_sort_bam = ch_samplesheet.map { false } + + // Run BWA_MEM with all four inputs aligned + BWA_MEM( + ch_samplesheet, + ch_bwa_index_broadcast, + ch_fasta_broadcast, + ch_sort_bam + ) + + BAMFILTER_DMS ( + BWA_MEM.out.bam + ) + + // Broadcast the FASTA path to every BAM emitted by BAMFILTER_DMS + ch_fasta_path_broadcast = ch_fasta + .combine(BAMFILTER_DMS.out.bam) // flattened item: [meta3, fasta, meta, bam] + .map { it[1] } // keep only the fasta path (N emissions) + + PREMERGE( + BAMFILTER_DMS.out.bam, // tuple(val(meta), path(bam)) + ch_fasta_path_broadcast // path(fasta) + ) + + // FASTA path for GATK: broadcast to N + ch_fasta_for_gatk = ch_fasta.combine(PREMERGE.out.bam).map { it[1] } // path -- N + // Reading frame for GATK: broadcast to N (it's a val string) + ch_rf_for_gatk = reading_frame_ch.combine(PREMERGE.out.bam).map { it[0] } // val -- N + // min_counts for GATK: broadcast to N (also a val) + ch_min_for_gatk = min_counts_ch.combine(PREMERGE.out.bam).map { it[0] } // val -- N + + GATK_SATURATIONMUTAGENESIS( + PREMERGE.out.bam, // merged reads - tuple(val(meta), path(bam)) + ch_fasta_for_gatk, // path(fasta) + ch_rf_for_gatk, // val(reading_frame string) + ch_min_for_gatk // val(min_counts) + ) + + DMSANALYSIS_AASEQ ( + ch_fasta, + reading_frame_ch, + aa_seq_script_ch // path to aa_seq.R (defined at the top) + ) + ch_versions = ch_versions.mix(DMSANALYSIS_AASEQ.out.versions) + + DMSANALYSIS_POSSIBLE_MUTATIONS( + ch_fasta, + reading_frame_ch, // pos_range (as val) + mutagenesis_type_ch, // mutagenesis_type (as val) + custom_codon_library_ch, // custom_codon_library (as path) + possible_mutations_script_ch // path to R script + ) + ch_versions = ch_versions.mix(DMSANALYSIS_POSSIBLE_MUTATIONS.out.versions) + + // Anchor (N items; one per sample) + def ch_vc = GATK_SATURATIONMUTAGENESIS.out.variantCounts // tuple(val(meta), path) + + // Fan-out helpers (broadcast singleton → N) + def fanout = { ch_singleton -> ch_singleton.combine(ch_vc).map { it[0] } } + + // Build per-sample inputs + ch_possible_mut_for_proc = fanout( DMSANALYSIS_POSSIBLE_MUTATIONS.out.possible_mutations.map { it[1] } ) + ch_aa_seq_for_proc = fanout( DMSANALYSIS_AASEQ.out.aa_seq.map { it[1] } ) + ch_min_counts_for_proc = fanout( min_counts_ch ) + ch_proc_raw_script = fanout( process_raw_gatk_script_ch ) + ch_filter_lib_script = fanout( filter_by_library_script_ch ) + ch_complete_script = fanout( complete_gatk_script_ch ) + ch_prepare_heatmap_script= fanout( 
prepare_counts_heatmap_script_ch ) + + // Call with all inputs aligned (each has N items now) + DMSANALYSIS_PROCESS_GATK( + ch_vc, // tuple(val(meta), path(variantCounts)) -- N + ch_possible_mut_for_proc, // path(possible_mutations) -- N + ch_aa_seq_for_proc, // path(aa_seq) -- N + ch_min_counts_for_proc, // val(min_counts) -- N + ch_proc_raw_script, // path(R script) -- N + ch_filter_lib_script, // path(R script) -- N + ch_complete_script, // path(R script) -- N + ch_prepare_heatmap_script // path(R script) -- N + ) + + annotated_variantCounts_ch = DMSANALYSIS_PROCESS_GATK.out.processed_variantCounts.map { meta, a, b, c, d -> tuple(meta, a) } + variantCounts_filtered_by_library_ch = DMSANALYSIS_PROCESS_GATK.out.processed_variantCounts.map { meta, a, b, c, d -> tuple(meta, b) } + library_completed_variantCounts_ch = DMSANALYSIS_PROCESS_GATK.out.processed_variantCounts.map { meta, a, b, c, d -> tuple(meta, c) } + variantCounts_for_heatmaps_ch = DMSANALYSIS_PROCESS_GATK.out.processed_variantCounts.map { meta, a, b, c, d -> tuple(meta, d) } + + // Broadcast `singleton` so it emits once per item in `anchorN` + def fanoutTo = { anchorN, singleton -> singleton.combine(anchorN).map { it[0] } } + + // --- For VISUALIZATION_COUNTS_PER_COV (anchor: variantCounts_for_heatmaps_ch) + min_counts_for_cov_ch = fanoutTo(variantCounts_for_heatmaps_ch, min_counts_ch) + counts_per_cov_heatmap_scriptN = fanoutTo(variantCounts_for_heatmaps_ch, counts_per_cov_heatmap_script_ch) + + // --- For VISUALIZATION_COUNTS_HEATMAP (anchor: variantCounts_for_heatmaps_ch) + min_counts_for_heatmap_ch = fanoutTo(variantCounts_for_heatmaps_ch, min_counts_ch) + counts_heatmap_scriptN = fanoutTo(variantCounts_for_heatmaps_ch, counts_heatmap_script_ch) + + // --- For VISUALIZATION_GLOBAL_POS_BIASES_* (anchor: variantCounts_filtered_by_library_ch) + aa_seq_for_bias_ch = fanoutTo(variantCounts_filtered_by_library_ch, DMSANALYSIS_AASEQ.out.aa_seq.map { it[1] }) + sliding_window_size_N = fanoutTo(variantCounts_filtered_by_library_ch, sliding_window_size_ch) + aimed_cov_N = fanoutTo(variantCounts_filtered_by_library_ch, aimed_cov_ch) + global_bias_counts_cov_scriptN = fanoutTo(variantCounts_filtered_by_library_ch, global_bias_counts_cov_script_ch) + global_bias_cov_scriptN = fanoutTo(variantCounts_filtered_by_library_ch, global_bias_cov_script_ch) + + // --- For VISUALIZATION_LOGDIFF (anchor: library_completed_variantCounts_ch) +logdiff_scriptN = fanoutTo(library_completed_variantCounts_ch, logdiff_script_ch) + + // --- For VISUALIZATION_SEQDEPTH (anchor: variantCounts_filtered_by_library_ch) + possible_mutations_N = fanoutTo(variantCounts_filtered_by_library_ch, DMSANALYSIS_POSSIBLE_MUTATIONS.out.possible_mutations.map { it[1] }) + min_counts_for_seqdepth_ch = fanoutTo(variantCounts_filtered_by_library_ch, min_counts_ch) + seqdepth_simulation_scriptN = fanoutTo(variantCounts_filtered_by_library_ch, seqdepth_simulation_script_ch) + + VISUALIZATION_COUNTS_PER_COV( + variantCounts_for_heatmaps_ch, + min_counts_for_cov_ch, + counts_per_cov_heatmap_scriptN + ) + + VISUALIZATION_COUNTS_HEATMAP( + variantCounts_for_heatmaps_ch, + min_counts_for_heatmap_ch, + counts_heatmap_scriptN + ) + + VISUALIZATION_GLOBAL_POS_BIASES_COUNTS( + variantCounts_filtered_by_library_ch, + aa_seq_for_bias_ch, + sliding_window_size_N, + global_bias_counts_cov_scriptN + ) + + VISUALIZATION_GLOBAL_POS_BIASES_COV( + variantCounts_filtered_by_library_ch, + aa_seq_for_bias_ch, + sliding_window_size_N, + aimed_cov_N, + global_bias_cov_scriptN + ) + + 
VISUALIZATION_LOGDIFF( + library_completed_variantCounts_ch, + logdiff_scriptN + ) + + if (params.run_seqdepth) { + VISUALIZATION_SEQDEPTH( + variantCounts_filtered_by_library_ch, + possible_mutations_N, + min_counts_for_seqdepth_ch, + seqdepth_simulation_scriptN + ) + } + + // Broadcast singletons to N (one per sample), anchored on variantCounts_filtered_by_library_ch + ch_fasta_for_fitness = ch_fasta.combine(variantCounts_filtered_by_library_ch).map { it[1] } // path(fasta) -- N + ch_rf_for_fitness = reading_frame_ch.combine(variantCounts_filtered_by_library_ch).map { it[0] } // val(range) -- N + ch_script_for_fitness = gatk_to_fitness_script_ch.combine(variantCounts_filtered_by_library_ch).map { it[0] } // path(script) -- N + + // Call with aligned inputs + GATK_GATKTOFITNESS( + variantCounts_filtered_by_library_ch, // tuple(val(meta), path) + ch_fasta_for_fitness, // path(fasta) + ch_rf_for_fitness, // val(reading_frame) + ch_script_for_fitness // path(R script) + ) + + + + // ----- DiMSum: group per biological sample (from samplesheet) and merge counts to use for DiMSum input ----- + + GATK_GATKTOFITNESS.out.fitness_input + .map { meta, tsv -> + def s = meta.sample as String + def id = meta.id as String + def base = s ? (s.replaceFirst(/_(input|output|quality)\d+$/, '')) + : (id?.tokenize('_')?.first()) + tuple(base as String, tuple(meta, tsv)) + } + .groupTuple() + .map { base, pairs -> + def metas = pairs.collect { it[0] } + def inputs = pairs.findAll { it[0].type == 'input' }.sort { it[0].replicate }.collect { it[1] } + def outputs = pairs.findAll { it[0].type == 'output' }.sort { it[0].replicate }.collect { it[1] } + tuple([sample: base], metas, inputs, outputs) + } + .filter { smeta, metas, ins, outs -> ins && outs } + .set { ch_fitness_bundled } + + // Broadcast the singleton script path to match each bundle + def ch_merge_script_for_each = merge_counts_script_ch + .combine(ch_fitness_bundled) + .map { it[0] } // keep the script path, one per bundle + + // Launch the merge of counts in DiMSum input format + if (params.fitness) { + MERGE_COUNTS( + ch_fitness_bundled, // tuple val(sample), val(metas), path(input_counts), path(output_counts) + ch_merge_script_for_each // path merge_script (broadcast) + ) + } + + // Create experimental design file to use for DiMSum + if (params.fitness) { + EXPDESIGN_FITNESS( + ch_samplesheet_csv, // path to CSV + exp_design_ch // path to R script + ) + } + + +// --- Synonymous WT selection (runs only when --fitness) --- +// Strip meta once: keep only the fasta path +ch_fasta.map { it[1] }.set { ch_fasta_path } // path(/…/GID1A.fasta) + +if (params.fitness) { + // MERGE_COUNTS.out.merged_counts shape: tuple( val([sample:'GID1A']), path("counts_merged.tsv") ) + FIND_SYNONYMOUS_MUTATION( + MERGE_COUNTS.out.merged_counts, // tuple(val(sample), path counts_merged.tsv) + ch_fasta_path.combine(MERGE_COUNTS.out.merged_counts).map { it[0] }, // path wt_fasta (broadcast to N) + reading_frame_ch.combine(MERGE_COUNTS.out.merged_counts).map { it[0] }, // val pos_range (broadcast) + syn_mut_ch.combine(MERGE_COUNTS.out.merged_counts).map { it[0] } // path R script (broadcast) + ) +} + +// Warning on compatibilities of DiMSum if --dimsum true. +if (params.dimsum) { + log.warn(""" + '--dimsum true' only works together with '--fitness true' + and is currently (30 Oct 2025) NOT supported on ARM processors. + Use AMD/x86_64 systems for DiMSum execution. 
+ """) +} + +// --- fitness calculation (only when --fitness true) --- +if (params.fitness) { + // Shapes: + // MERGE_COUNTS.out.merged_counts -> tuple(val([sample:'GID1A']), path('counts_merged.tsv')) + // FIND_SYNONYMOUS_MUTATION.out.synonymous_wt -> tuple(val([sample:'GID1A']), path('synonymous_wt.txt')) + // EXPDESIGN_FITNESS.out.experimental_design -> path('experimentalDesign.tsv') (singleton) + + // 1) Key counts and WT by biological sample name to align them robustly + def ch_counts_keyed_d = MERGE_COUNTS.out.merged_counts + .map { smp, counts -> tuple(smp.sample as String, smp, counts) } + + def ch_wt_keyed_d = FIND_SYNONYMOUS_MUTATION.out.synonymous_wt + .map { smp, wt -> tuple(smp.sample as String, wt) } + + // 2) Join by key -> (val(sample), path(counts), path(wt)) + def ch_counts_wt_d = ch_counts_keyed_d.join(ch_wt_keyed_d) + .map { key, smp, counts, wt -> tuple(smp, counts, wt) } + + // 3) Broadcast experimental design (singleton) to each sample triple + def ch_exp_for_each_d = EXPDESIGN_FITNESS.out.experimental_design + .combine(ch_counts_wt_d) + .map { it[0] } + + // 4) Final aligned channels for FITNESS_CALCULATION and RUN_DIMSUM + def ch_run_counts_d = ch_counts_wt_d.map { smp, counts, wt -> tuple(smp, counts) } // matches: tuple val(sample), path(counts_merged) + def ch_run_wt_d = ch_counts_wt_d.map { smp, counts, wt -> wt } // matches: path(wt_txt) + def ch_run_exp_d = ch_exp_for_each_d // matches: path(exp_design) + + FITNESS_CALCULATION( + ch_run_counts_d, // tuple val(sample), path(counts_merged) + ch_run_exp_d, // path experimentalDesign.tsv + ch_run_wt_d, // path syn_wt_txt + fitness_calculation_script_ch + ) + + FITNESS_QC( + FITNESS_CALCULATION.out.fitness_estimation, // tuple val(sample), path(fitness_estimation.tsv) + fitness_QC_script_ch + ) + + FITNESS_HEATMAP( + FITNESS_CALCULATION.out.fitness_estimation, // tuple val(sample), path(fitness_estimation.tsv) + DMSANALYSIS_AASEQ.out.aa_seq, // WT amino acid sequence + fitness_heatmap_script_ch + ) + +// --- DiMSum execution (only when --fitness true & --dimsum true) --- + if (params.dimsum) { + RUN_DIMSUM( + ch_run_counts_d, // tuple val(sample), path(counts_merged) + ch_run_wt_d, // path syn_wt_txt + ch_run_exp_d // path experimentalDesign.tsv + ) + } +} + + + + + emit: + multiqc_report = MULTIQC.out.report.toList() // channel: /path/to/multiqc_report.html + versions = ch_versions // channel: [ path(versions.yml) ] + bwa_index = BWA_INDEX.out.index + aligned_bam = BWA_MEM.out.bam + filtered_bam = BAMFILTER_DMS.out.bam + premerged_bam = PREMERGE.out.bam + +} + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + THE END +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ diff --git a/workflows/dmscore.nf b/workflows/dmscore.nf deleted file mode 100644 index 041e503..0000000 --- a/workflows/dmscore.nf +++ /dev/null @@ -1,97 +0,0 @@ -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - IMPORT MODULES / SUBWORKFLOWS / FUNCTIONS -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ -include { FASTQC } from '../modules/nf-core/fastqc/main' -include { MULTIQC } from '../modules/nf-core/multiqc/main' -include { paramsSummaryMap } from 'plugin/nf-schema' -include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline' -include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline' -include { 
methodsDescriptionText } from '../subworkflows/local/utils_nfcore_dmscore_pipeline' - -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - RUN MAIN WORKFLOW -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ - -workflow DMSCORE { - - take: - ch_samplesheet // channel: samplesheet read in from --input - main: - - ch_versions = Channel.empty() - ch_multiqc_files = Channel.empty() - // - // MODULE: Run FastQC - // - FASTQC ( - ch_samplesheet - ) - ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}) - ch_versions = ch_versions.mix(FASTQC.out.versions.first()) - - // - // Collate and save software versions - // - softwareVersionsToYAML(ch_versions) - .collectFile( - storeDir: "${params.outdir}/pipeline_info", - name: 'nf_core_' + 'dmscore_software_' + 'mqc_' + 'versions.yml', - sort: true, - newLine: true - ).set { ch_collated_versions } - - - // - // MODULE: MultiQC - // - ch_multiqc_config = Channel.fromPath( - "$projectDir/assets/multiqc_config.yml", checkIfExists: true) - ch_multiqc_custom_config = params.multiqc_config ? - Channel.fromPath(params.multiqc_config, checkIfExists: true) : - Channel.empty() - ch_multiqc_logo = params.multiqc_logo ? - Channel.fromPath(params.multiqc_logo, checkIfExists: true) : - Channel.empty() - - summary_params = paramsSummaryMap( - workflow, parameters_schema: "nextflow_schema.json") - ch_workflow_summary = Channel.value(paramsSummaryMultiqc(summary_params)) - ch_multiqc_files = ch_multiqc_files.mix( - ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) - ch_multiqc_custom_methods_description = params.multiqc_methods_description ? - file(params.multiqc_methods_description, checkIfExists: true) : - file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) - ch_methods_description = Channel.value( - methodsDescriptionText(ch_multiqc_custom_methods_description)) - - ch_multiqc_files = ch_multiqc_files.mix(ch_collated_versions) - ch_multiqc_files = ch_multiqc_files.mix( - ch_methods_description.collectFile( - name: 'methods_description_mqc.yaml', - sort: true - ) - ) - - MULTIQC ( - ch_multiqc_files.collect(), - ch_multiqc_config.toList(), - ch_multiqc_custom_config.toList(), - ch_multiqc_logo.toList(), - [], - [] - ) - - emit:multiqc_report = MULTIQC.out.report.toList() // channel: /path/to/multiqc_report.html - versions = ch_versions // channel: [ path(versions.yml) ] - -} - -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - THE END -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/