diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index ea27a584..4ecfbfe3 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -2,6 +2,7 @@ "name": "nfcore", "image": "nfcore/gitpod:latest", "remoteUser": "gitpod", + "runArgs": ["--privileged"], // Configure tool-specific properties. "customizations": { diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 3893df65..212cbdbe 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -9,7 +9,9 @@ Please use the pre-filled template to save time. However, don't be put off by this template - other more general issues and suggestions are welcome! Contributions to the code are even more welcome ;) -> If you need help using or modifying nf-core/crisprseq then the best place to ask is on the nf-core Slack [#crisprseq](https://nfcore.slack.com/channels/crisprseq) channel ([join our Slack here](https://nf-co.re/join/slack)). +:::info +If you need help using or modifying nf-core/crisprseq then the best place to ask is on the nf-core Slack [#crisprseq](https://nfcore.slack.com/channels/crisprseq) channel ([join our Slack here](https://nf-co.re/join/slack)). +::: ## Contribution workflow @@ -116,4 +118,3 @@ To get started: Devcontainer specs: - [DevContainer config](.devcontainer/devcontainer.json) -- [Dockerfile](.devcontainer/Dockerfile) diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index a6a2106b..5be250d7 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -42,7 +42,7 @@ body: attributes: label: System information description: | - * Nextflow version _(eg. 22.10.1)_ + * Nextflow version _(eg. 23.04.0)_ * Hardware _(eg. HPC, Desktop, Cloud)_ * Executor _(eg. slurm, local, awsbatch)_ * Container engine: _(e.g. 
Docker, Singularity, Conda, Podman, Shifter, Charliecloud, or Apptainer)_ diff --git a/.github/workflows/awsfulltest.yml b/.github/workflows/awsfulltest.yml index fe3c04f1..df8badba 100644 --- a/.github/workflows/awsfulltest.yml +++ b/.github/workflows/awsfulltest.yml @@ -14,18 +14,24 @@ jobs: runs-on: ubuntu-latest steps: - name: Launch workflow via tower - uses: nf-core/tower-action@v3 + uses: seqeralabs/action-tower-launch@v2 + with: workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} - workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/crisprseq/work-${{ github.sha }}/targeted_test + revision: ${{ github.sha }} + workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/crisprseq/work-${{ github.sha }} parameters: | { + "hook_url": "${{ secrets.MEGATESTS_ALERTS_SLACK_HOOK_URL }}", "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/crisprseq/results-${{ github.sha }}/targeted_test" } profiles: test_full + - uses: actions/upload-artifact@v3 with: name: Tower debug log file - path: tower_action_*.log + path: | + tower_action_*.log + tower_action_*.json diff --git a/.github/workflows/awsfulltest_screening.yml b/.github/workflows/awsfulltest_screening.yml index df435c92..8c91e9cf 100644 --- a/.github/workflows/awsfulltest_screening.yml +++ b/.github/workflows/awsfulltest_screening.yml @@ -14,18 +14,22 @@ jobs: runs-on: ubuntu-latest steps: - name: Launch workflow via tower - uses: nf-core/tower-action@v3 + uses: seqeralabs/action-tower-launch@v2 with: workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} - workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/crisprseq/work-${{ github.sha }}/screening_test + revision: ${{ github.sha }} + workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/crisprseq/work-${{ github.sha }} parameters: | { + "hook_url": "${{ secrets.MEGATESTS_ALERTS_SLACK_HOOK_URL }}", "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/crisprseq/results-${{ github.sha }}/screening_test" } profiles: test_screening_full - uses: actions/upload-artifact@v3 with: name: Tower debug log file - path: tower_action_*.log + path: | + tower_action_*.log + tower_action_*.json diff --git a/.github/workflows/awstest.yml b/.github/workflows/awstest.yml index 5df76ad4..0d650d8e 100644 --- a/.github/workflows/awstest.yml +++ b/.github/workflows/awstest.yml @@ -12,18 +12,22 @@ jobs: steps: # Launch workflow using Tower CLI tool action - name: Launch workflow via tower - uses: seqeralabs/action-tower-launch@v1 + uses: seqeralabs/action-tower-launch@v2 with: workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} + revision: ${{ github.sha }} workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/crisprseq/work-${{ github.sha }} parameters: | { "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/crisprseq/results-test-${{ github.sha }}" } profiles: test + - uses: actions/upload-artifact@v3 with: name: Tower debug log file - path: tower_action_*.log + path: | + tower_action_*.log + tower_action_*.json diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 888cb4bc..b8bdd214 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -78,7 +78,7 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: "3.8" + python-version: "3.11" architecture: "x64" - name: Install dependencies diff --git 
a/.github/workflows/release-announcments.yml b/.github/workflows/release-announcments.yml new file mode 100644 index 00000000..6ad33927 --- /dev/null +++ b/.github/workflows/release-announcments.yml @@ -0,0 +1,68 @@ +name: release-announcements +# Automatic release toot and tweet anouncements +on: + release: + types: [published] + workflow_dispatch: + +jobs: + toot: + runs-on: ubuntu-latest + steps: + - uses: rzr/fediverse-action@master + with: + access-token: ${{ secrets.MASTODON_ACCESS_TOKEN }} + host: "mstdn.science" # custom host if not "mastodon.social" (default) + # GitHub event payload + # https://docs.github.com/en/developers/webhooks-and-events/webhooks/webhook-events-and-payloads#release + message: | + Pipeline release! ${{ github.repository }} v${{ github.event.release.tag_name }} - ${{ github.event.release.name }}! + + Please see the changelog: ${{ github.event.release.html_url }} + + send-tweet: + runs-on: ubuntu-latest + + steps: + - uses: actions/setup-python@v4 + with: + python-version: "3.10" + - name: Install dependencies + run: pip install tweepy==4.14.0 + - name: Send tweet + shell: python + run: | + import os + import tweepy + + client = tweepy.Client( + access_token=os.getenv("TWITTER_ACCESS_TOKEN"), + access_token_secret=os.getenv("TWITTER_ACCESS_TOKEN_SECRET"), + consumer_key=os.getenv("TWITTER_CONSUMER_KEY"), + consumer_secret=os.getenv("TWITTER_CONSUMER_SECRET"), + ) + tweet = os.getenv("TWEET") + client.create_tweet(text=tweet) + env: + TWEET: | + Pipeline release! ${{ github.repository }} v${{ github.event.release.tag_name }} - ${{ github.event.release.name }}! + + Please see the changelog: ${{ github.event.release.html_url }} + TWITTER_CONSUMER_KEY: ${{ secrets.TWITTER_CONSUMER_KEY }} + TWITTER_CONSUMER_SECRET: ${{ secrets.TWITTER_CONSUMER_SECRET }} + TWITTER_ACCESS_TOKEN: ${{ secrets.TWITTER_ACCESS_TOKEN }} + TWITTER_ACCESS_TOKEN_SECRET: ${{ secrets.TWITTER_ACCESS_TOKEN_SECRET }} + + bsky-post: + runs-on: ubuntu-latest + steps: + - uses: zentered/bluesky-post-action@v0.0.2 + with: + post: | + Pipeline release! ${{ github.repository }} v${{ github.event.release.tag_name }} - ${{ github.event.release.name }}! + + Please see the changelog: ${{ github.event.release.html_url }} + env: + BSKY_IDENTIFIER: ${{ secrets.BSKY_IDENTIFIER }} + BSKY_PASSWORD: ${{ secrets.BSKY_PASSWORD }} + # diff --git a/.gitpod.yml b/.gitpod.yml index 85d95ecc..25488dcc 100644 --- a/.gitpod.yml +++ b/.gitpod.yml @@ -1,4 +1,9 @@ image: nfcore/gitpod:latest +tasks: + - name: Update Nextflow and setup pre-commit + command: | + pre-commit install --install-hooks + nextflow self-update vscode: extensions: # based on nf-core.nf-core-extensionpack diff --git a/.nf-core.yml b/.nf-core.yml index 065203c3..085dbd0a 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -1,4 +1,6 @@ repository_type: pipeline lint: - # We skip these linting as we have splitted tests between targeted and screening - files_exist: conf/test.config + files_exist: + - conf/test.config # We skip these linting as we have splitted tests between targeted and screening + files_unchanged: + - lib/NfcoreTemplate.groovy # Introduced a change ahead of the nf-core/tools release diff --git a/CHANGELOG.md b/CHANGELOG.md index 1854e1fa..0b8f63bc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,11 +3,33 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+## [v2.1.0 - Jamon Salas](https://github.com/nf-core/crisprseq/releases/tag/2.1.0) - [14.11.2023] + +### Added + +- Template update v2.9 ([#52](https://github.com/nf-core/crisprseq/pull/52)) +- Use `Channel.fromSamplesheet()` from `nf-validation` to validate input sample sheets and create an input channel ([#58](https://github.com/nf-core/crisprseq/pull/58)) +- BAGEL2 as a module which detects gene essentiality ([#60](https://github.com/nf-core/crisprseq/pull/60)) +- Add custom plots to MultiQC report (cutadapt module, read processing, edition, edition QC) ([#64](https://github.com/nf-core/crisprseq/pull/64)) +- Template update v2.10 ([#79](https://github.com/nf-core/crisprseq/pull/79)) + +### Fixed + +- Change to `process_high` for the mageck mle module ([#60](https://github.com/nf-core/crisprseq/pull/60)) +- Fix paired-end samplesheet file for screening ([#60](https://github.com/nf-core/crisprseq/pull/60)) +- Summary processes don't modify the input file anymore, allowing resuming these processes ([#66](https://github.com/nf-core/crisprseq/pull/66)) +- Do not stash unexistent files, use empty lists instead. Fixes AWS tests ([#67](https://github.com/nf-core/crisprseq/pull/67)) +- Rename process `merging_summary` to `preprocessing_summary` to improve clarity ([#69](https://github.com/nf-core/crisprseq/pull/69)) +- Fix modules `BWA_INDEX` and `BOWTIE2_BUILD` after module update, new versions accept a meta map ([#76](https://github.com/nf-core/crisprseq/pull/76)) +- Update targeted metromap ([#78](https://github.com/nf-core/crisprseq/pull/78)) + +### Deprecated + ## [v2.0.0 - Paprika Lovelace](https://github.com/nf-core/crisprseq/releases/tag/2.0.0) - [05.07.2023] ### Added -- Crisprseq screening analysis : mageck mle, mageck rra, mageck count and crisprcleanr-normalize ([#22]https://github.com/nf-core/crisprseq/pull/22)) +- Crisprseq screening analysis : mageck mle, mageck rra, mageck count and crisprcleanr-normalize ([#22](https://github.com/nf-core/crisprseq/pull/22)) - Add new parameter `--analysis` to select analysis type (screening/targeted) ([#27](https://github.com/nf-core/crisprseq/pull/27)) - Tests to run screening analysis ([#926]https://github.com/nf-core/test-datasets/pull/926) - Metro map for targeted analysis ([#35](https://github.com/nf-core/crisprseq/pull/35)) diff --git a/CITATIONS.md b/CITATIONS.md index bf087515..6e78897d 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -16,6 +16,8 @@ - [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) + > Andrews, S. (2010). FastQC: A Quality Control Tool for High Throughput Sequence Data [Online]. + - [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/) > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924. @@ -61,11 +63,18 @@ > Li, W. et al. Quality control, modeling, and visualization of CRISPR screens with MAGeCK-VISPR. Genome Biology 16, 281, doi:10.1186/s13059-015-0843-6 (2015). +- [BAGEL2](https://pubmed.ncbi.nlm.nih.gov/33407829/) + + > Kim E, Hart T. Improved analysis of CRISPR fitness screens and reduced off-target effects with the BAGEL2 gene essentiality classifier. Genome Med. 2021 Jan 6;13(1):2. doi: 10.1186/s13073-020-00809-3. PMID: 33407829; PMCID: PMC7789424. 
+ - [BioContainers](https://pubmed.ncbi.nlm.nih.gov/28379341/) > da Veiga Leprevost F, Grüning B, Aflitos SA, Röst HL, Uszkoreit J, Barsnes H, Vaudel M, Moreno P, Gatto L, Weber J, Bai M, Jimenez RC, Sachsenberg T, Pfeuffer J, Alvarez RV, Griss J, Nesvizhskii AI, Perez-Riverol Y. BioContainers: an open-source and community-driven framework for software standardization. Bioinformatics. 2017 Aug 15;33(16):2580-2582. doi: 10.1093/bioinformatics/btx192. PubMed PMID: 28379341; PubMed Central PMCID: PMC5870671. - [Docker](https://dl.acm.org/doi/10.5555/2600239.2600241) + > Merkel, D. (2014). Docker: lightweight linux containers for consistent development and deployment. Linux Journal, 2014(239), 2. doi: 10.5555/2600239.2600241. + - [Singularity](https://pubmed.ncbi.nlm.nih.gov/28494014/) + > Kurtzer GM, Sochat V, Bauer MW. Singularity: Scientific containers for mobility of compute. PLoS One. 2017 May 11;12(5):e0177459. doi: 10.1371/journal.pone.0177459. eCollection 2017. PubMed PMID: 28494014; PubMed Central PMCID: PMC5426675. diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index f4fd052f..c089ec78 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -1,18 +1,20 @@ -# Code of Conduct at nf-core (v1.0) +# Code of Conduct at nf-core (v1.4) ## Our Pledge -In the interest of fostering an open, collaborative, and welcoming environment, we as contributors and maintainers of nf-core, pledge to making participation in our projects and community a harassment-free experience for everyone, regardless of: +In the interest of fostering an open, collaborative, and welcoming environment, we as contributors and maintainers of nf-core pledge to making participation in our projects and community a harassment-free experience for everyone, regardless of: - Age +- Ability - Body size +- Caste - Familial status - Gender identity and expression - Geographical location - Level of experience - Nationality and national origins - Native language -- Physical and neurological ability +- Neurodiversity - Race or ethnicity - Religion - Sexual identity and orientation @@ -22,80 +24,133 @@ Please note that the list above is alphabetised and is therefore not ranked in a ## Preamble -> Note: This Code of Conduct (CoC) has been drafted by the nf-core Safety Officer and been edited after input from members of the nf-core team and others. "We", in this document, refers to the Safety Officer and members of the nf-core core team, both of whom are deemed to be members of the nf-core community and are therefore required to abide by this Code of Conduct. This document will amended periodically to keep it up-to-date, and in case of any dispute, the most current version will apply. +:::note +This Code of Conduct (CoC) has been drafted by Renuka Kudva, Cris Tuñí, and Michael Heuer, with input from the nf-core Core Team and Susanna Marquez from the nf-core community. "We", in this document, refers to the Safety Officers and members of the nf-core Core Team, both of whom are deemed to be members of the nf-core community and are therefore required to abide by this Code of Conduct. This document will be amended periodically to keep it up-to-date. In case of any dispute, the most current version will apply. +::: -An up-to-date list of members of the nf-core core team can be found [here](https://nf-co.re/about). Our current safety officer is Renuka Kudva. +An up-to-date list of members of the nf-core core team can be found [here](https://nf-co.re/about). + +Our Safety Officers are Saba Nafees, Cris Tuñí, and Michael Heuer. 
nf-core is a young and growing community that welcomes contributions from anyone with a shared vision for [Open Science Policies](https://www.fosteropenscience.eu/taxonomy/term/8). Open science policies encompass inclusive behaviours and we strive to build and maintain a safe and inclusive environment for all individuals. -We have therefore adopted this code of conduct (CoC), which we require all members of our community and attendees in nf-core events to adhere to in all our workspaces at all times. Workspaces include but are not limited to Slack, meetings on Zoom, Jitsi, YouTube live etc. +We have therefore adopted this CoC, which we require all members of our community and attendees of nf-core events to adhere to in all our workspaces at all times. Workspaces include, but are not limited to, Slack, meetings on Zoom, gather.town, YouTube live etc. -Our CoC will be strictly enforced and the nf-core team reserve the right to exclude participants who do not comply with our guidelines from our workspaces and future nf-core activities. +Our CoC will be strictly enforced and the nf-core team reserves the right to exclude participants who do not comply with our guidelines from our workspaces and future nf-core activities. -We ask all members of our community to help maintain a supportive and productive workspace and to avoid behaviours that can make individuals feel unsafe or unwelcome. Please help us maintain and uphold this CoC. +We ask all members of our community to help maintain supportive and productive workspaces and to avoid behaviours that can make individuals feel unsafe or unwelcome. Please help us maintain and uphold this CoC. -Questions, concerns or ideas on what we can include? Contact safety [at] nf-co [dot] re +Questions, concerns, or ideas on what we can include? Contact members of the Safety Team on Slack or email safety [at] nf-co [dot] re. ## Our Responsibilities -The safety officer is responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behaviour. +Members of the Safety Team (the Safety Officers) are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behaviour. -The safety officer in consultation with the nf-core core team have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. +The Safety Team, in consultation with the nf-core core team, have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this CoC, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. -Members of the core team or the safety officer who violate the CoC will be required to recuse themselves pending investigation. They will not have access to any reports of the violations and be subject to the same actions as others in violation of the CoC. +Members of the core team or the Safety Team who violate the CoC will be required to recuse themselves pending investigation. 
They will not have access to any reports of the violations and will be subject to the same actions as others in violation of the CoC. -## When are where does this Code of Conduct apply? +## When and where does this Code of Conduct apply? -Participation in the nf-core community is contingent on following these guidelines in all our workspaces and events. This includes but is not limited to the following listed alphabetically and therefore in no order of preference: +Participation in the nf-core community is contingent on following these guidelines in all our workspaces and events, such as hackathons, workshops, bytesize, and collaborative workspaces on gather.town. These guidelines include, but are not limited to, the following (listed alphabetically and therefore in no order of preference): - Communicating with an official project email address. - Communicating with community members within the nf-core Slack channel. - Participating in hackathons organised by nf-core (both online and in-person events). -- Participating in collaborative work on GitHub, Google Suite, community calls, mentorship meetings, email correspondence. -- Participating in workshops, training, and seminar series organised by nf-core (both online and in-person events). This applies to events hosted on web-based platforms such as Zoom, Jitsi, YouTube live etc. +- Participating in collaborative work on GitHub, Google Suite, community calls, mentorship meetings, email correspondence, and on the nf-core gather.town workspace. +- Participating in workshops, training, and seminar series organised by nf-core (both online and in-person events). This applies to events hosted on web-based platforms such as Zoom, gather.town, Jitsi, YouTube live etc. - Representing nf-core on social media. This includes both official and personal accounts. ## nf-core cares 😊 -nf-core's CoC and expectations of respectful behaviours for all participants (including organisers and the nf-core team) include but are not limited to the following (listed in alphabetical order): +nf-core's CoC and expectations of respectful behaviours for all participants (including organisers and the nf-core team) include, but are not limited to, the following (listed in alphabetical order): - Ask for consent before sharing another community member’s personal information (including photographs) on social media. - Be respectful of differing viewpoints and experiences. We are all here to learn from one another and a difference in opinion can present a good learning opportunity. -- Celebrate your accomplishments at events! (Get creative with your use of emojis 🎉 🥳 💯 🙌 !) +- Celebrate your accomplishments! (Get creative with your use of emojis 🎉 🥳 💯 🙌 !) - Demonstrate empathy towards other community members. (We don’t all have the same amount of time to dedicate to nf-core. If tasks are pending, don’t hesitate to gently remind members of your team. If you are leading a task, ask for help if you feel overwhelmed.) - Engage with and enquire after others. (This is especially important given the geographically remote nature of the nf-core community, so let’s do this the best we can) - Focus on what is best for the team and the community. (When in doubt, ask) -- Graciously accept constructive criticism, yet be unafraid to question, deliberate, and learn. +- Accept feedback, yet be unafraid to question, deliberate, and learn. - Introduce yourself to members of the community. 
(We’ve all been outsiders and we know that talking to strangers can be hard for some, but remember we’re interested in getting to know you and your visions for open science!) -- Show appreciation and **provide clear feedback**. (This is especially important because we don’t see each other in person and it can be harder to interpret subtleties. Also remember that not everyone understands a certain language to the same extent as you do, so **be clear in your communications to be kind.**) +- Show appreciation and **provide clear feedback**. (This is especially important because we don’t see each other in person and it can be harder to interpret subtleties. Also remember that not everyone understands a certain language to the same extent as you do, so **be clear in your communication to be kind.**) - Take breaks when you feel like you need them. -- Using welcoming and inclusive language. (Participants are encouraged to display their chosen pronouns on Zoom or in communication on Slack.) +- Use welcoming and inclusive language. (Participants are encouraged to display their chosen pronouns on Zoom or in communication on Slack) ## nf-core frowns on 😕 -The following behaviours from any participants within the nf-core community (including the organisers) will be considered unacceptable under this code of conduct. Engaging or advocating for any of the following could result in expulsion from nf-core workspaces. +The following behaviours from any participants within the nf-core community (including the organisers) will be considered unacceptable under this CoC. Engaging or advocating for any of the following could result in expulsion from nf-core workspaces: - Deliberate intimidation, stalking or following and sustained disruption of communication among participants of the community. This includes hijacking shared screens through actions such as using the annotate tool in conferencing software such as Zoom. - “Doxing” i.e. posting (or threatening to post) another person’s personal identifying information online. - Spamming or trolling of individuals on social media. -- Use of sexual or discriminatory imagery, comments, or jokes and unwelcome sexual attention. -- Verbal and text comments that reinforce social structures of domination related to gender, gender identity and expression, sexual orientation, ability, physical appearance, body size, race, age, religion or work experience. +- Use of sexual or discriminatory imagery, comments, jokes, or unwelcome sexual attention. +- Verbal and text comments that reinforce social structures of domination related to gender, gender identity and expression, sexual orientation, ability, physical appearance, body size, race, age, religion, or work experience. ### Online Trolling -The majority of nf-core interactions and events are held online. Unfortunately, holding events online comes with the added issue of online trolling. This is unacceptable, reports of such behaviour will be taken very seriously, and perpetrators will be excluded from activities immediately. +The majority of nf-core interactions and events are held online. Unfortunately, holding events online comes with the risk of online trolling. This is unacceptable — reports of such behaviour will be taken very seriously and perpetrators will be excluded from activities immediately. -All community members are required to ask members of the group they are working within for explicit consent prior to taking screenshots of individuals during video calls. 
+All community members are **required** to ask members of the group they are working with for explicit consent prior to taking screenshots of individuals during video calls. -## Procedures for Reporting CoC violations +## Procedures for reporting CoC violations If someone makes you feel uncomfortable through their behaviours or actions, report it as soon as possible. -You can reach out to members of the [nf-core core team](https://nf-co.re/about) and they will forward your concerns to the safety officer(s). +You can reach out to members of the Safety Team (Saba Nafees, Cris Tuñí, and Michael Heuer) on Slack. Alternatively, contact a member of the nf-core core team [nf-core core team](https://nf-co.re/about), and they will forward your concerns to the Safety Team. + +Issues directly concerning members of the Core Team or the Safety Team will be dealt with by other members of the core team and the safety manager — possible conflicts of interest will be taken into account. nf-core is also in discussions about having an ombudsperson and details will be shared in due course. + +All reports will be handled with the utmost discretion and confidentiality. + +You can also report any CoC violations to safety [at] nf-co [dot] re. In your email report, please do your best to include: + +- Your contact information. +- Identifying information (e.g. names, nicknames, pseudonyms) of the participant who has violated the Code of Conduct. +- The behaviour that was in violation and the circumstances surrounding the incident. +- The approximate time of the behaviour (if different than the time the report was made). +- Other people involved in the incident, if applicable. +- If you believe the incident is ongoing. +- If there is a publicly available record (e.g. mailing list record, a screenshot). +- Any additional information. + +After you file a report, one or more members of our Safety Team will contact you to follow up on your report. + +## Who will read and handle reports + +All reports will be read and handled by the members of the Safety Team at nf-core. + +If members of the Safety Team are deemed to have a conflict of interest with a report, they will be required to recuse themselves as per our Code of Conduct and will not have access to any follow-ups. + +To keep this first report confidential from any of the Safety Team members, please submit your first report by direct messaging on Slack/direct email to any of the nf-core members you are comfortable disclosing the information to, and be explicit about which member(s) you do not consent to sharing the information with. + +## Reviewing reports + +After receiving the report, members of the Safety Team will review the incident report to determine whether immediate action is required, for example, whether there is immediate threat to participants’ safety. + +The Safety Team, in consultation with members of the nf-core core team, will assess the information to determine whether the report constitutes a Code of Conduct violation, for them to decide on a course of action. + +In the case of insufficient information, one or more members of the Safety Team may contact the reporter, the reportee, or any other attendees to obtain more information. -Issues directly concerning members of the core team will be dealt with by other members of the core team and the safety manager, and possible conflicts of interest will be taken into account. nf-core is also in discussions about having an ombudsperson, and details will be shared in due course. 
+Once additional information is gathered, the Safety Team will collectively review and decide on the best course of action to take, if any. The Safety Team reserves the right to not act on a report. -All reports will be handled with utmost discretion and confidentially. +## Confidentiality + +All reports, and any additional information included, are only shared with the team of safety officers (and possibly members of the core team, in case the safety officer is in violation of the CoC). We will respect confidentiality requests for the purpose of protecting victims of abuse. + +We will not name harassment victims, beyond discussions between the safety officer and members of the nf-core team, without the explicit consent of the individuals involved. + +## Enforcement + +Actions taken by the nf-core’s Safety Team may include, but are not limited to: + +- Asking anyone to stop a behaviour. +- Asking anyone to leave the event and online spaces either temporarily, for the remainder of the event, or permanently. +- Removing access to the gather.town and Slack, either temporarily or permanently. +- Communicating to all participants to reinforce our expectations for conduct and remind what is unacceptable behaviour; this may be public for practical reasons. +- Communicating to all participants that an incident has taken place and how we will act or have acted — this may be for the purpose of letting event participants know we are aware of and dealing with the incident. +- Banning anyone from participating in nf-core-managed spaces, future events, and activities, either temporarily or permanently. +- No action. ## Attribution and Acknowledgements @@ -106,6 +161,22 @@ All reports will be handled with utmost discretion and confidentially. ## Changelog -### v1.0 - March 12th, 2021 +### v1.4 - February 8th, 2022 + +- Included a new member of the Safety Team. Corrected a typographical error in the text. + +### v1.3 - December 10th, 2021 + +- Added a statement that the CoC applies to nf-core gather.town workspaces. Corrected typographical errors in the text. + +### v1.2 - November 12th, 2021 + +- Removed information specific to reporting CoC violations at the Hackathon in October 2021. + +### v1.1 - October 14th, 2021 + +- Updated with names of new Safety Officers and specific information for the hackathon in October 2021. + +### v1.0 - March 15th, 2021 - Complete rewrite from original [Contributor Covenant](http://contributor-covenant.org/) CoC. 
diff --git a/LICENSE b/LICENSE index 41427d4d..4de6f708 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) mirpedrol +Copyright (c) Júlia Mir Pedrol, Laurence Kuhlburger Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 046675c7..b9f2281e 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,9 @@ # ![nf-core/crisprseq](docs/images/nf-core-crisprseq_logo_light.png#gh-light-mode-only) ![nf-core/crisprseq](docs/images/nf-core-crisprseq_logo_dark.png#gh-dark-mode-only) -[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/crisprseq/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.7598497-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.7598497) +[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/crisprseq/results) +[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.7598496-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.7598496) +[![GitHub Actions CI Status](https://github.com/nf-core/crisprseq/workflows/nf-core%20CI/badge.svg)](https://github.com/nf-core/crisprseq/actions?query=workflow%3A%22nf-core+CI%22) +[![GitHub Actions Linting Status](https://github.com/nf-core/crisprseq/workflows/nf-core%20linting/badge.svg)](https://github.com/nf-core/crisprseq/actions?query=workflow%3A%22nf-core+linting%22)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/crisprseq/results) [![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A523.04.0-23aa62.svg)](https://www.nextflow.io/) [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) @@ -63,10 +66,11 @@ For crispr screening: ## Usage -> **Note** -> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how -> to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) -> with `-profile test_targeted` or `-profile test_screening` before running the workflow on actual data. +:::note +If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how +to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) +with `-profile test` before running the workflow on actual data. +::: First, prepare a samplesheet with your input data that looks as follows: @@ -94,29 +98,39 @@ Now, you can run the pipeline using: nextflow run nf-core/crisprseq --input samplesheet.csv --analysis --outdir -profile ``` -> **Warning:** -> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those -> provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; -> see [docs](https://nf-co.re/usage/configuration#custom-configuration-files). +:::warning +Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. 
Custom config files including those +provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; +see [docs](https://nf-co.re/usage/configuration#custom-configuration-files). +::: -For more details, please refer to the [usage documentation](https://nf-co.re/crisprseq/usage) and the [parameter documentation](https://nf-co.re/crisprseq/parameters). +For more details and further functionality, please refer to the [usage documentation](https://nf-co.re/crisprseq/usage) and the [parameter documentation](https://nf-co.re/crisprseq/parameters). ## Pipeline output -To see the the results of a test run with a full size dataset refer to the [results](https://nf-co.re/crisprseq/results) tab on the nf-core website pipeline page. +To see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/crisprseq/results) tab on the nf-core website pipeline page. For more details about the output files and reports, please refer to the [output documentation](https://nf-co.re/crisprseq/output). ## Credits nf-core/crisprseq targeted is based on [CRISPR-A](https://doi.org/10.1101/2022.09.02.506351) [[Sanvicente-García, et.al. (2023)](https://doi.org/10.1371/journal.pcbi.1011137)], originally written by Marta Sanvicente García at [Translational Synthetic Biology](https://synbio.upf.edu/) from [Universitat Pompeu Fabra](https://www.upf.edu/home). -It was re-written in Nextflow DSL2 and is primarily maintained by Júlia Mir Pedrol (@mirpedrol) at [Quantitative Biology Center (QBiC)](https://www.qbic.uni-tuebingen.de/) from [Universität Tübingen](https://uni-tuebingen.de/en/). +It was re-written in Nextflow DSL2 and is primarily maintained by Júlia Mir Pedrol ([@mirpedrol](https://github.com/mirpedrol)) at [Quantitative Biology Center (QBiC)](https://www.qbic.uni-tuebingen.de/) from [Universität Tübingen](https://uni-tuebingen.de/en/). -nf-core/crisprseq screening was written and is primarly maintained by Laurence Kuhlburger (@LaurenceKuhl) at [Quantitative Biology Center (QBiC)](https://www.qbic.uni-tuebingen.de/) from [Universität Tübingen](https://uni-tuebingen.de/en/). +nf-core/crisprseq screening was written and is primarly maintained by Laurence Kuhlburger ([@LaurenceKuhl](https://github.com/LaurenceKuhl)) at [Quantitative Biology Center (QBiC)](https://www.qbic.uni-tuebingen.de/) from [Universität Tübingen](https://uni-tuebingen.de/en/). - +- [@LaurenceKuhl](https://github.com/LaurenceKuhl) +- [@mirpedrol](https://github.com/mirpedrol) + +We thank the following people for their extensive assistance in the development of this pipeline: + +- [@ggabernet](https://github.com/ggabernet) +- [@jianhong](https://github.com/jianhong) +- [@mashehu](https://github.com/mashehu) +- [@msanvicente](https://github.com/msanvicente) +- [@SusiJo](https://github.com/SusiJo) ## Contributions and Support @@ -126,7 +140,7 @@ For further information or help, don't hesitate to get in touch on the [Slack `# ## Citations -If you use nf-core/crisprseq for your analysis, please cite it using the following doi: [10.5281/zenodo.7598497](https://doi.org/10.5281/zenodo.7598497) +If you use nf-core/crisprseq for your analysis, please cite it using the following doi: [10.5281/zenodo.7598496](https://doi.org/10.5281/zenodo.7598496) An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file. 
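The samplesheet described above is validated against `assets/schema_input.json` (updated in this diff to work with nf-validation's `Channel.fromSamplesheet()`). As a rough illustration of what a minimal targeted-analysis sheet looks like and how its fields map onto the schema's patterns, the Python sketch below writes one and pre-checks it with the same regular expressions; the file names and sequences are hypothetical placeholders, and the real validation (including the `exists: true` file checks) is performed by nf-validation inside the pipeline, not by a script like this. A screening run would instead fill the `condition` column defined in the same schema.

```python
#!/usr/bin/env python
"""Illustrative sketch only: build a minimal nf-core/crisprseq samplesheet and
pre-check its fields against the patterns declared in assets/schema_input.json."""

import csv
import re

# Patterns copied from assets/schema_input.json (see the schema diff further below)
SAMPLE_RE = re.compile(r"^\S+$")
FASTQ_RE = re.compile(r"^\S+\.f(ast)?q\.gz$")
DNA_RE = re.compile(r"^[ACTGNactgn]+$")

rows = [
    {
        "sample": "SAMPLE1",
        "fastq_1": "sample1_R1.fastq.gz",  # hypothetical file name
        "fastq_2": "sample1_R2.fastq.gz",  # optional; leave empty for single-end data
        "reference": "ACGTTGCAGTACGTA",    # targeted analysis: reference amplicon (made up)
        "protospacer": "TGCAGTACG",        # targeted analysis: protospacer (made up)
        "template": "",                    # optional HDR template
    }
]

with open("samplesheet.csv", "w", newline="") as handle:
    writer = csv.DictWriter(handle, fieldnames=list(rows[0].keys()))
    writer.writeheader()
    for row in rows:
        assert SAMPLE_RE.match(row["sample"]), "sample must not contain spaces"
        assert FASTQ_RE.match(row["fastq_1"]), "fastq_1 must end in .fq.gz or .fastq.gz"
        for key in ("reference", "protospacer", "template"):
            if row[key]:
                assert DNA_RE.match(row[key]), f"{key} must be a plain DNA sequence"
        writer.writerow(row)

# The sheet is then passed to the pipeline, e.g. (profile chosen only as an example):
#   nextflow run nf-core/crisprseq --input samplesheet.csv --analysis targeted --outdir results -profile docker
```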
diff --git a/assets/email_template.html b/assets/email_template.html index 4a90e11b..ec162f42 100644 --- a/assets/email_template.html +++ b/assets/email_template.html @@ -4,7 +4,7 @@ - + nf-core/crisprseq Pipeline Report diff --git a/assets/methods_description_template.yml b/assets/methods_description_template.yml index c10071b3..e2693af3 100644 --- a/assets/methods_description_template.yml +++ b/assets/methods_description_template.yml @@ -3,17 +3,21 @@ description: "Suggested text and references to use when describing pipeline usag section_name: "nf-core/crisprseq Methods Description" section_href: "https://github.com/nf-core/crisprseq" plot_type: "html" -## TODO nf-core: Update the HTML below to your prefered methods description, e.g. add publication citation for this pipeline +## TODO nf-core: Update the HTML below to your preferred methods description, e.g. add publication citation for this pipeline ## You inject any metadata in the Nextflow '${workflow}' object data: |

Methods

-  Data was processed using nf-core/crisprseq v${workflow.manifest.version} ${doi_text} of the nf-core collection of workflows (Ewels et al., 2020).
+  Data was processed using nf-core/crisprseq v${workflow.manifest.version} ${doi_text} of the nf-core collection of workflows (Ewels et al., 2020), utilising reproducible software environments from the Bioconda (Grüning et al., 2018) and Biocontainers (da Veiga Leprevost et al., 2017) projects.
   The pipeline was executed with Nextflow v${workflow.nextflow.version} (Di Tommaso et al., 2017) with the following command:
   ${workflow.commandLine}
+  ${tool_citations}
   References
-    • Di Tommaso, P., Chatzou, M., Floden, E. W., Barja, P. P., Palumbo, E., & Notredame, C. (2017). Nextflow enables reproducible computational workflows. Nature Biotechnology, 35(4), 316-319. https://doi.org/10.1038/nbt.3820
-    • Ewels, P. A., Peltzer, A., Fillinger, S., Patel, H., Alneberg, J., Wilm, A., Garcia, M. U., Di Tommaso, P., & Nahnsen, S. (2020). The nf-core framework for community-curated bioinformatics pipelines. Nature Biotechnology, 38(3), 276-278. https://doi.org/10.1038/s41587-020-0439-x
+    • Di Tommaso, P., Chatzou, M., Floden, E. W., Barja, P. P., Palumbo, E., & Notredame, C. (2017). Nextflow enables reproducible computational workflows. Nature Biotechnology, 35(4), 316-319. doi: 10.1038/nbt.3820
+    • Ewels, P. A., Peltzer, A., Fillinger, S., Patel, H., Alneberg, J., Wilm, A., Garcia, M. U., Di Tommaso, P., & Nahnsen, S. (2020). The nf-core framework for community-curated bioinformatics pipelines. Nature Biotechnology, 38(3), 276-278. doi: 10.1038/s41587-020-0439-x
+    • Grüning, B., Dale, R., Sjödin, A., Chapman, B. A., Rowe, J., Tomkins-Tinch, C. H., Valieris, R., Köster, J., & Bioconda Team. (2018). Bioconda: sustainable and comprehensive software distribution for the life sciences. Nature Methods, 15(7), 475–476. doi: 10.1038/s41592-018-0046-7
+    • da Veiga Leprevost, F., Grüning, B. A., Alves Aflitos, S., Röst, H. L., Uszkoreit, J., Barsnes, H., Vaudel, M., Moreno, P., Gatto, L., Weber, J., Bai, M., Jimenez, R. C., Sachsenberg, T., Pfeuffer, J., Vera Alvarez, R., Griss, J., Nesvizhskii, A. I., & Perez-Riverol, Y. (2017). BioContainers: an open-source and community-driven framework for software standardization. Bioinformatics (Oxford, England), 33(16), 2580–2582. doi: 10.1093/bioinformatics/btx192
+    • ${tool_bibliography}
Notes:
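The `data` block above is not static text: placeholders such as `${workflow.manifest.version}`, `${workflow.commandLine}`, `${tool_citations}` and `${tool_bibliography}` are filled in by the pipeline's template code when the MultiQC methods section is assembled (that code is not part of this diff). Purely to illustrate the substitution step, here is a small Python sketch with made-up values — the pipeline itself does not use Python for this:

```python
# Minimal sketch of how the methods-description placeholders resolve; all values are hypothetical.
from string import Template

methods_text = Template(
    "Data was processed using nf-core/crisprseq v${version} ${doi_text} of the nf-core "
    "collection of workflows (Ewels et al., 2020). The pipeline was executed with "
    "Nextflow v${nextflow_version} with the following command: ${command_line} ${tool_citations}"
)

print(
    methods_text.safe_substitute(
        version="2.1.0",                               # stands in for workflow.manifest.version
        doi_text="(doi: 10.5281/zenodo.7598496)",      # stands in for ${doi_text}
        nextflow_version="23.04.0",                    # stands in for workflow.nextflow.version
        command_line="nextflow run nf-core/crisprseq --input samplesheet.csv --analysis targeted --outdir results",
        tool_citations="Tools used: FastQC, MultiQC.", # stands in for ${tool_citations}
    )
)
```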
diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index 4a692180..a754e6ab 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -1,7 +1,7 @@ report_comment: > - This report has been generated by the nf-core/crisprseq + This report has been generated by the nf-core/crisprseq analysis pipeline. For information about how to interpret these results, please see the - documentation. + documentation. report_section_order: "nf-core-crisprseq-methods-description": order: -1000 @@ -11,3 +11,170 @@ report_section_order: order: -1002 export_plots: true + +# Modules to run +run_modules: + - custom_content + - fastqc + - cutadapt + +# Custom tables and plots +custom_data: + # Summary of edition plot + edition_plot: + plot_type: "bargraph" + file_format: "csv" + section_name: "RESULTS: Type of edition" + description: | + Number of reads classified according to edition types. + Reads are classified between WT (without an edit), template based modifications and indels. + Indels are divided between deletions, insertions and delins (deletion + insertion). + Deletions and insertions can be out of frame or in frame + categories: + Wt: + name: "WT reads" + color: "#C0C0C0" + Template-based: + name: "Template-based modifications" + color: "#E9AF82" + Delins: + name: "DelIns" + color: "#D5B1F5" + Ins_inframe: + name: "Insertions in frame" + color: "#B5D5C1" + Ins_outframe: + name: "Insertions out of frame" + color: "#80D09F" + Dels_inframe: + name: "Deletions in frame" + color: "#BFD0EB" + Dels_outframe: + name: "Deletions out of frame" + color: "#699AE9" + pconfig: + id: "edition_bargraph" + title: "Reads by edition type" + ylab: "Read count" + # Indels QC plot + indel_qc_plot: + plot_type: "bargraph" + file_format: "csv" + section_name: "RESULTS: QC of indels" + description: | + Number of reads classified between WT and containing indels. + Both types are classified between passing the filtering steps or not. 
+ Indel reads passing the filtering steps are divided into: + reads with a modification above the error rate and located in the common edit window; + above the error rate but not in the edit region; + below the error rate and within the edit region; + below the error rate and not in the edit region; + categories: + Wt NOT passing filter: + name: "WT reads not passing filtering steps" + color: "#302F2F" + Wt passing filter: + name: "WT reads passing filtering steps" + color: "#C0C0C0" + Indels NOT passing filter: + name: "Reads with indels not passing filtering steps" + color: "#696868" + Above error & in pick: + name: "Reads with indels above the error rate and within the edition region" + color: "#80D09F" + NOT above error & in pick: + name: "Reads with indels below the error rate and within the edition region" + color: "#FFB570" + Above error & NOT in pick: + name: "Reads with indels above the error rate and outside the edition region" + color: "#F89D47" + NOT above error & NOT in pick: + name: "Reads with indels below the error rate and outside the edition region" + color: "#EB5555" + pconfig: + id: "indelqc_bargraph" + title: "Read indels quality control" + ylab: "Read count" + # Read processing summary table + read_processing: + id: "read_processing" + section_name: "Read processing summary" + plot_type: "table" + description: | + Table showing the number of input raw reads per sample and the percentage of reads after each processing step: + reads merged with Pear; + reads passing quality filters; + number of resulting clusters after UMI clustering of reads; + number of aligned reads + pconfig: + id: "read_processing" + namespace: "Read processing summary" + table_title: "Read processing summary" + headers: + Raw reads: + title: "Raw reads" + description: "Number of input raw reads" + format: "{:,.0f}" + scale: "PuBu" + Merged reads: + title: "Merged reads" + description: "Percentage of merged reads" + suffix: "%" + max: 100 + min: 0 + scale: "RdYlGn" + Quality filtered reads: + title: "Quality reads" + description: "Percentage of reads passing the quality filter" + suffix: "%" + max: 100 + min: 0 + scale: "RdYlGn" + Clustered reads: + title: "Clusters" + description: "Number of UMI clusters after read clustering" + format: "{:,.0f}" + scale: "PuBu" + Aligned reads: + title: "Aligned reads" + description: "Percentage of aligned reads or UMI clusters" + suffix: "%" + max: 100 + min: 0 + scale: "RdYlGn" + +sp: + edition_plot: + fn: "*_edits.csv" + indel_qc_plot: + fn: "*_QC-indels.csv" + read_processing: + fn: "*_reads-summary.csv" + cutadapt: + fn: "*.cutadapt.log" + +# Define the order of sections +module_order: + - fastqc + - cutadapt + - custom_content + +# Set the order of custom code plots and tables +custom_content: + order: + - read_processing + - edition_plot + - indel_qc_plot + +# Order of table columns +table_columns_placement: + Read processing summary: + Raw reads: 100 + Merged reads: 200 + Quality filtered reads: 300 + Clustered reads: 400 + Aligned reads: 500 + +# Add a comment to cutadapt section showing filtered reads +section_comments: + cutadapt_filtered_reads: "Cutadapt is selecting reads passing quality filters."
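These `custom_data` sections are driven entirely by small CSV files that the pipeline writes and that MultiQC picks up via the file-name patterns in the `sp` block (`*_edits.csv`, `*_QC-indels.csv`, `*_reads-summary.csv`). As a rough sketch of the kind of file the `edition_plot` bargraph consumes — the sample name, counts and exact row/column layout below are assumptions for illustration, the real files come from the pipeline's summary processes — such a CSV could be produced like this:

```python
# Illustrative only: write a toy "<sample>_edits.csv" in a shape MultiQC custom content
# can read for the "RESULTS: Type of edition" bargraph (category keys taken from the config above).
import csv

categories = [
    "Wt", "Template-based", "Delins",
    "Ins_inframe", "Ins_outframe", "Dels_inframe", "Dels_outframe",
]
counts = {
    "Wt": 1200, "Template-based": 150, "Delins": 30,
    "Ins_inframe": 40, "Ins_outframe": 90, "Dels_inframe": 60, "Dels_outframe": 230,
}

with open("SAMPLE1_edits.csv", "w", newline="") as handle:
    writer = csv.writer(handle)
    writer.writerow(["Sample"] + categories)                        # header: sample id + category keys
    writer.writerow(["SAMPLE1"] + [counts[c] for c in categories])  # one row of made-up read counts

# MultiQC matches "SAMPLE1_edits.csv" against sp -> edition_plot -> fn: "*_edits.csv"
# and renders the counts with the colours and labels configured above.
```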
diff --git a/assets/nf-core-crisprseq_logo_light.png b/assets/nf-core-crisprseq_logo_light.png index 622298e6..9f408dfb 100644 Binary files a/assets/nf-core-crisprseq_logo_light.png and b/assets/nf-core-crisprseq_logo_light.png differ diff --git a/assets/schema_input.json b/assets/schema_input.json index 7327787f..8b780504 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -10,25 +10,42 @@ "sample": { "type": "string", "pattern": "^\\S+$", - "errorMessage": "Sample name must be provided and cannot contain spaces" + "errorMessage": "Sample name must be provided and cannot contain spaces", + "meta": ["id"] }, "fastq_1": { "type": "string", + "format": "file-path", + "exists": true, "pattern": "^\\S+\\.f(ast)?q\\.gz$", "errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" }, "fastq_2": { + "type": "string", + "format": "file-path", + "exists": true, "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'", - "anyOf": [ - { - "type": "string", - "pattern": "^\\S+\\.f(ast)?q\\.gz$" - }, - { - "type": "string", - "maxLength": 0 - } - ] + "pattern": "^\\S+\\.f(ast)?q\\.gz$" + }, + "condition": { + "type": "string", + "pattern": "^\\S+$", + "meta": ["condition"] + }, + "reference": { + "type": "string", + "pattern": "^[ACTGNactgn]+$", + "errorMessage": "Reference sequence must be a valid DNA sequence" + }, + "protospacer": { + "type": "string", + "pattern": "^[ACTGNactgn]+$", + "errorMessage": "Protospacer sequence must be a valid DNA sequence" + }, + "template": { + "type": "string", + "pattern": "^[ACTGNactgn]+$", + "errorMessage": "Template sequence must be a valid DNA sequence" } }, "required": ["sample", "fastq_1"] diff --git a/assets/slackreport.json b/assets/slackreport.json index 043d02f2..a3cdd5e4 100644 --- a/assets/slackreport.json +++ b/assets/slackreport.json @@ -3,7 +3,7 @@ { "fallback": "Plain-text summary of the attachment.", "color": "<% if (success) { %>good<% } else { %>danger<%} %>", - "author_name": "sanger-tol/readmapping v${version} - ${runName}", + "author_name": "nf-core/crisprseq v${version} - ${runName}", "author_icon": "https://www.nextflow.io/docs/latest/_static/favicon.ico", "text": "<% if (success) { %>Pipeline completed successfully!<% } else { %>Pipeline completed with errors<% } %>", "fields": [ diff --git a/bin/BAGEL.py b/bin/BAGEL.py new file mode 100755 index 00000000..205fbb44 --- /dev/null +++ b/bin/BAGEL.py @@ -0,0 +1,1160 @@ +#!/usr/bin/env python + +# --------------------------------- +# BAGEL: Bayesian Analysis of Gene EssentaLity +# (c) Traver Hart , Eiru Kim 2017. + +# Acknowledgements: John McGonigle +# modified 10/2019 +# Free to modify and redistribute with attribution +# --------------------------------- + +# ------------------------------------ +# constants + +""" +MIT License + +Copyright (c) 2020 Hart Lab + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" + + +import sys +import time + +import click +import numpy as np +import pandas as pd +import scipy.stats as stats +from sklearn.linear_model import LinearRegression + +VERSION = 2.0 + +BUILD = 115 + + +""" +Update history + +Build 115 +1. Add single pass without resampling + +Build 114 +1. Add option for normalizing rep counts + +Build 113 +1. Fixed bugs + +Build 112 +1. Add sgRNA filtering options +2. Implemented 'Click' library. Thanks to John McGonigle + + +Build 111 +1. Add an option to equalize # of sgRNA per gene + +Build 110 +1. Enable multi control for fold-change calculation +2. Now user can input column names +3. Fix Threshold function + +""" + + +class OptionRequiredIf(click.Option): + def full_process_value(self, ctx, value): + value = super(OptionRequiredIf, self).full_process_value(ctx, value) + + if value is None and ctx.params["filter_multi_target"] is True: + msg = ( + "Error! Multi-target-filtering selected and not align-info provided.\n" + "Please indicate align-info file." + ) + raise click.MissingParameter(ctx=ctx, param=self, message=msg) + return value + + +CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"]) + + +def round_to_hundredth(x): + return np.around(x * 100) / 100.0 + + +def func_linear(x, a, b): + return (a * x) + b + + +class Training: + def __init__(self, X, n=None, cvnum=10): + if n == None: + self._n = len(X) + self._cvnum = cvnum + self._bid = int(self._n / cvnum) + self._bucket = np.arange(len(X)) + self._X = X + self._step = 0 + + def cross_validation(self): + if self._bid < 1: # bid check + print("The number of genes is too small! n<" + str(self._cvnum)) + sys.exit(1) + drawing = list() + mask = np.array([True] * self._n) + for j in range(self._bid): + # drawing.append(delete(self._bucket, np.random.randrange(len(self._bucket)))) + select = np.random.randint(len(self._bucket)) + drawing.append(self._bucket[select]) + mask[self._bucket[select]] = False + self._bucket = np.delete(self._bucket, select) + if self._step < self._n % self._cvnum: # for distribute remain.. 
+ select = np.random.randint(len(self._bucket)) + drawing.append(self._bucket[select]) + mask[self._bucket[select]] = False + self._bucket = np.delete(self._bucket, select) + self._step += 1 + X_resample = self._X[mask] + return X_resample, self._X[~mask] + + def get_cv_step(self): + return self._step + + def bootstrap_resample(self): + mask = np.array([False] * self._n) + resample_i = np.floor(np.random.rand(self._n) * len(self._X)).astype(int) + + mask[resample_i] = True + X_resample = self._X[mask] + return X_resample, self._X[~mask] + + def get_data(self, method=0): + if method == 0: + train, test = self.bootstrap_resample() + elif method == 1: + train, test = self.cross_validation() + else: + print("Method passed as a value that was neither 0 nor 1.") + sys.exit(1) + return train, test + + +def fibo_weighted_sum(listofscore): + value = p1 = p2 = 0.0 + c = 1.0 # current value + for v in listofscore: + value += v / c + p2 = p1 # go one step + p1 = c + c = p1 + p2 + return value + + +# Note the \b in the doc string below is prevent click from wrapping the lines on the terminal. +@click.group(context_settings=CONTEXT_SETTINGS) +def bagel(): + """ + -------------------------------------------------------------------- + BAGEL.py + -------------------------------------------------------------------- + A tool from the Bayesian Analysis of Gene EssentiaLity (BAGEL) suite. + + \b + Calculate fold changes from read count data: + + \b + BAGEL.py fc -i [read count file] -o [output label] -c [control column] + + Calculate Bayes Factors from foldchange data: + + \b + BAGEL.py bf -i [fold change] -o [output file] -e [essentials genes] -n [nonessentials genes] -c [columns] + + + Calculate precision-recall from Bayes Factors: + + \b + BAGEL.py pr -i [Bayes Factor file] -o [output file] -e [essentials genes] -n [nonessentials genes] + + + + To print the current build and version use: + + \b + BAGEL.py version + """ + + +@click.command(name="version") +def report_bagel_version(): + """ + Report the current build and version no. + """ + print( + "Bayesian Analysis of Gene EssentiaLity (BAGEL) suite:\n" + "Version: {VERSION}\n" + "Build: {BUILD}".format(VERSION=VERSION, BUILD=BUILD) + ) + + +@click.command(name="fc") +@click.option("-i", "--read-count-file", required=True, type=click.Path(exists=True)) +@click.option("-o", "--output-label", required=True) +@click.option("-c", "--control-columns", required=True) +@click.option("-m", "--min-reads", type=int, default=0) +@click.option("-Np", "--pseudo-count", type=int, default=5) +def calculate_fold_change(read_count_file, output_label, control_columns, min_reads, pseudo_count): + """ + \b + Calculate fold changes from read count data outputting a fold change column: + + \b + BAGEL.py fc -i [read count file] -o [output label] -c [control column] + + \b + Required options: + -i --read-count-file Tab-delimited file of reagents and fold changes. See documentation for format. + -o --output-label Label for all output files + -c --control-columns A comma-delimited list of columns of control (T0 or plasmid) columns. + Input can be either number or name. 
+ \b + Other options: + --min-reads=N Discard gRNA with T0 counts < N (default 0) + -Np, --pseudo-count=N Add a pseudocount of N to every readcount (default 5) + -h, --help Show this help text + + + \b + Example: + BAGEL.py fc -i readcount_file.txt -o experiment_name -c 1 + + This command calculates fold change, and writes [output label].foldchange and [output label].normalized_reads + + """ + + # ---------------------------------------------------------------- # + # Import raw read data, normalize, filter for T0 min read counts # + # Output: [output label].foldchange # + # ---------------------------------------------------------------- # + reads = pd.read_csv(read_count_file, sep="\t", index_col=0) + + reads[reads.columns.values[0]].fillna("NO_GENE_NAME", inplace=True) + reads.fillna(0, inplace=True) + control_columns = control_columns.split(",") + # + # check if controls are given as numeric or name + # + + try: + try: + ctrl_columns = list(map(int, control_columns)) + ctrl_labels = reads.columns.values[ctrl_columns] + except ValueError: + ctrl_labels = control_columns + + ctrl_sum = reads[ctrl_labels].sum(axis=1) + reads.drop(ctrl_labels, axis=1, inplace=True) + ctrl_label_new = ";".join(ctrl_labels) + reads[ctrl_label_new] = ctrl_sum + except: + print(reads[ctrl_labels].sum(axis=1)) + print("Invalid input controls") + sys.exit(1) + + numClones, numColumns = reads.shape + print("Controls: " + ", ".join(ctrl_labels)) + + # + # Add pseudo count + # + + reads.iloc[:, list(range(1, numColumns))] += pseudo_count + + # + # normalize each sample to a fixed total readcount + # + sumReads = reads.iloc[:, list(range(1, numColumns))].sum(0) + normed = pd.DataFrame(index=reads.index.values) + normed["GENE"] = reads.iloc[:, 0] # first column is gene name + normed = ( + reads.iloc[:, list(range(1, numColumns))] / np.tile(sumReads, [numClones, 1]) * 10000000 + ) # normalize to 10M reads + + # + # filter for minimum readcount + # + f = np.where(reads[ctrl_label_new] >= min_reads)[0] + normed = normed.iloc[f, :] + + # + # calculate fold change + # + foldchange = pd.DataFrame(index=normed.index.values) + foldchange.index.name = "REAGENT_ID" + foldchange["GENE"] = reads.iloc[f, 0] # dataframe 'normed' has no GENE column + for i in range(numColumns - 1): + foldchange[normed.columns.values[i]] = np.log2( + (normed.loc[:, normed.columns.values[i]]) / normed[ctrl_label_new] + ) + # + # we have calculated a foldchange for the control column. Drop it. 
+ # + foldchange.drop(ctrl_label_new, axis=1, inplace=True) + + # + # write normed readcount file + # write foldchange file + # + + foldchange_filename = output_label + ".foldchange" + foldchange.to_csv(foldchange_filename, sep="\t", float_format="%4.3f") + + normedreads_filename = output_label + ".normed_readcount" + normed.to_csv(normedreads_filename, sep="\t", float_format="%3.2f") + + +@click.command(name="bf") +@click.option("-i", "--fold-change", required=True, type=click.Path(exists=True)) +@click.option("-o", "--output-file", required=True) +@click.option("-e", "--essential-genes", required=True, type=click.Path(exists=True)) +@click.option("-n", "--non-essential-genes", required=True, type=click.Path(exists=True)) +@click.option("-c", "--columns-to-test", required=True) +@click.option("-w", "--network-file", metavar="[network File]", default=None, type=click.Path(exists=True)) +@click.option("-m", "--filter-multi-target", is_flag=True) +@click.option("-m0", "--loci-without-mismatch", type=int, default=10) +@click.option("-m1", "--loci-with-mismatch", type=int, default=10) +@click.option( + "--align-info", metavar="--align-info [File]", default=None, type=click.Path(exists=True), cls=OptionRequiredIf +) +@click.option("-b", "--use-bootstrapping", is_flag=True) +@click.option("-NS", "--no-resampling", is_flag=True) +@click.option("-s", "--use-small-sample", is_flag=True) +@click.option("-N", "--no-of-cross-validations", type=int, default=10) +@click.option("-NB", "--bootstrap-iterations", type=int, default=1000) +@click.option("-r", "--sgrna-bayes-factors", is_flag=True) +@click.option("-f", "--equalise-sgrna-no", type=int) +@click.option("-s", "--seed", default=int(time.time() * 100000 % 100000), type=int) +@click.option("-t", "--run-test-mode", is_flag=True) +@click.option("-p", "--equalise-rep-no", type=int) +def calculate_bayes_factors( + fold_change, + output_file, + essential_genes, + non_essential_genes, + columns_to_test, + network_file, + align_info, + use_bootstrapping, + no_resampling, + use_small_sample, + filter_multi_target, + loci_without_mismatch, + loci_with_mismatch, + bootstrap_iterations, + no_of_cross_validations, + sgrna_bayes_factors, + equalise_sgrna_no, + seed, + run_test_mode, + equalise_rep_no, +): + """ + \b + Calculate Bayes Factors from an input fold change file: + + \b + BAGEL.py bf -i [fold change] -o [output file] -e [essentials genes] -n [nonessentials genes] -c [columns] + + + \b + Calculates a log2 Bayes Factor for each gene. Positive BFs indicate confidence that the gene is essential. + Output written to the [output file] contains: gene name, mean Bayes Factor across all iterations, std deviation of + BFs, and number of iterations in which the gene was part of the test set (and a BF was calculated). + + + \b + Required options: + -i --fold-change [fold change file] Tab-delimited file of reagents and fold changes + (see documentation for format). + -o, --output-file [output file] Output filename + -e, --essential-genes [reference essentials] File with list of training set of essential genes + -n, --non-essential-genes [reference nonessentials] File with list of training set of nonessential genes + -c [columns to test] comma-delimited list of columns in input file to + include in analysis + + \b + Network options: + -w [network file] Enable Network boosting. Tab-delimited file of edges.
[GeneA (\\t) GeneB] + + \b + Multi-target guides filtering options: + -m, --filter-multi-target Enable filtering multi-targeting guide RNAs + --align-info [file] Input precalculated align-info file + -m0, --loci-without-mismatch Filtering guide RNAs without mismatch targeting more than [N] loci, default = 10 + -m1, --loci-with-mismatch Filtering guide RNAs with 1-bp mismatch targeting more than [N] loci, default = 10 + + \b + Other options: + -b, --bootstrapping Use bootstrapping instead of cross-validation (Slow) + -NS, --no-resampling Run BAGEL without resampling + -s, --small-sample Low-fat BAGEL, only resampled training set (Bootstrapping, iteration = 100) + -r --sgrna-bayes-factors Calculate sgRNA-wise Bayes Factor + -f --equalise-sgrna-no Equalize the number of sgRNAs per gene to particular value [Number] + -p --equalise-rep-no Equalize the number of replicates to particular value [Number] + -N --no-of-cross-validations Number of sections for cross validation (default 10) + -NB --bootstrap-iterations Number of bootstrap iterations (default 1000) + -s, --seed=N Define random seed + -h, --help Show this help text + + \b + Example: + + \b + BAGEL.py bf -i fc_file.txt -o results.bf -e ess_training_set.txt -n noness_training_set.txt -c 1,2,3 + + """ + np.random.seed(seed) # set random seed + if network_file: + network_boost = True + else: + network_boost = False + + if sgrna_bayes_factors: + rna_level = True + else: + rna_level = False + + if network_file and sgrna_bayes_factors: + network_boost = False + + if equalise_sgrna_no: + flat_sgrna = True + else: + flat_sgrna = False + + if equalise_rep_no: + flat_rep = True + else: + flat_rep = False + + if use_small_sample: + train_method = 0 + bootstrap_iterations = 100 + + elif use_bootstrapping: + train_method = 0 + + else: + train_method = 1 + + genes = {} + fc = {} + gene2rna = {} + rna2gene = {} + + multi_targeting_sgrnas = dict() + multi_targeting_sgrnas_info = dict() + + if filter_multi_target: + try: + aligninfo = pd.read_csv(align_info, header=None, index_col=0, sep="\t").fillna("") + for seqid in aligninfo.index: + perfectmatch = 0 + mismatch_1bp = 0 + perfectmatch_gene = 0 + mismatch_1bp_gene = 0 + if aligninfo[1][seqid] != "": + perfectmatch = len(aligninfo[1][seqid].split(",")) + if aligninfo[2][seqid] != "": + perfectmatch_gene = len(aligninfo[2][seqid].split(",")) + if aligninfo[3][seqid] != "": + mismatch_1bp = len(aligninfo[3][seqid].split(",")) + if aligninfo[4][seqid] != "": + mismatch_1bp_gene = len(aligninfo[4][seqid].split(",")) + if perfectmatch > loci_without_mismatch or mismatch_1bp > loci_with_mismatch: + multi_targeting_sgrnas[seqid] = True + elif perfectmatch > 1 or mismatch_1bp > 0: + multi_targeting_sgrnas_info[seqid] = ( + perfectmatch, + mismatch_1bp, + perfectmatch_gene, + mismatch_1bp_gene, + ) + + except: + print("Please check align-info file") + sys.exit(1) + + print("Total %d multi-targeting gRNAs are discarded" % len(multi_targeting_sgrnas)) + + # + # LOAD FOLDCHANGES + # + rnatagset = set() + with open(fold_change) as fin: + fieldname = fin.readline().rstrip().split("\t") + # + # DEFINE CONTROLS + # + columns = columns_to_test.split(",") + try: + try: + column_list = list(map(int, columns)) + column_labels = [fieldname[x + 1] for x in column_list] + except ValueError: + column_labels = columns + column_list = [ + x for x in range(len(fieldname) - 1) if fieldname[x + 1] in column_labels + ] # +1 because user-facing data columns start after the GENE column + print("Using column: " + ", ".join(column_labels)) + # print "Using
column: " + ", ".join(map(str,column_list)) + + except: + print("Invalid columns") + sys.exit(1) + + for line in fin: + fields = line.rstrip().split("\t") + rnatag = fields[0] + if filter_multi_target is True: # multitargeting sgrna filtering + if rnatag in multi_targeting_sgrnas: + continue # skip multitargeting sgrna. + if rnatag in rnatagset: + print("Error! sgRNA tag duplicates") + sys.exit(1) + rnatagset.add(rnatag) + gsym = fields[1] + + genes[gsym] = 1 + if gsym not in gene2rna: + gene2rna[gsym] = [] + gene2rna[gsym].append(rnatag) + rna2gene[rnatag] = gsym + fc[rnatag] = {} + for i in column_list: + fc[rnatag][i] = float(fields[i + 1]) # per user docs, GENE is column 0, first data column is col 1. + + genes_array = np.array(list(genes.keys())) + gene_idx = np.arange(len(genes)) + print("Number of unique genes: " + str(len(genes))) + + # + # DEFINE REFERENCE SETS + # + coreEss = [] + + with open(essential_genes) as fin: + skip_header = fin.readline() + for line in fin: + coreEss.append(line.rstrip().split("\t")[0]) + coreEss = np.array(coreEss) + print("Number of reference essentials: " + str(len(coreEss))) + + nonEss = [] + with open(non_essential_genes) as fin: + skip_header = fin.readline() + for line in fin: + nonEss.append(line.rstrip().split("\t")[0]) + + nonEss = np.array(nonEss) + print("Number of reference nonessentials: " + str(len(nonEss))) + + # + # LOAD NETWORK + # + + if network_boost is True: + network = {} + edgecount = 0 + with open(network_file) as fin: + for line in fin: + linearray = line.rstrip().split("\t") # GeneA \t GeneB format + if linearray[0] in genes_array and linearray[1] in genes_array: + for i in [0, 1]: + if linearray[i] not in network: + network[linearray[i]] = {} + network[linearray[i]][linearray[-1 * (i - 1)]] = 1 # save edge information + edgecount += 1 + + print("Number of network edges: " + str(edgecount)) + + # + # INITIALIZE BFS + # + + # Define foldchange dynamic threshold. logarithm decay. + # Parameters are defined by regression (achilles data) 2**-7 was used in previous version. 
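+ # Sanity check with an illustrative count: for a screen with roughly 800 reference nonessentials present, the exponent -1.1535 * np.log(800 + 13.324) + 0.7728 is about -6.96, so the threshold comes out close to the previous fixed value of 2**-7.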
+ + FC_THRESH = 2 ** (-1.1535 * np.log(len(np.intersect1d(genes_array, nonEss)) + 13.324) + 0.7728) + bf = {} + boostedbf = {} + for g in genes_array: + for rnatag in gene2rna[g]: + bf[rnatag] = [] + + boostedbf[g] = [] # boosted bf at gene level + + # + # TRAINING + # + if use_small_sample: + # declare training class + # training_data = Training(setdiff1d(gene_idx,np.where(in1d(genes_array,coreEss))),cvnum=NUMCV) + # declare training class (only for Gold-standard gene set) + training_data = Training( + np.where(np.in1d(genes_array, np.union1d(coreEss, nonEss)))[0], cvnum=no_of_cross_validations + ) + # all non-goldstandards + all_non_gs = np.where(np.logical_not(np.in1d(genes_array, np.union1d(coreEss, nonEss))))[0] + else: + training_data = Training(gene_idx, cvnum=no_of_cross_validations) # declare training class + + if train_method == 0: + LOOPCOUNT = bootstrap_iterations + elif train_method == 1: + LOOPCOUNT = no_of_cross_validations # 10-folds + + if run_test_mode == True: + fp = open(output_file + ".traininfo", "w") + fp.write("#1: Loopcount\n#2: Training set\n#3: Testset\n") + # No resampling option + if no_resampling == True: + print("# Caution: Resampling is disabled") + LOOPCOUNT = 1 + + print("Iter TrainEss TrainNon TestSet") + sys.stdout.flush() + for loop in range(LOOPCOUNT): + currentbf = {} + printstr = "" + printstr += str(loop) + + # + # bootstrap resample (10-folds cross-validation) from gene list to get the training set + # test set for this iteration is everything not selected in bootstrap resampled (10-folds cross-validation) + # training set + # define essential and nonessential training sets: arrays of indexes + # + if no_resampling == True: + # no resampling + gene_train_idx = gene_idx + gene_test_idx = gene_idx + else: + # CV or bootstrapping + gene_train_idx, gene_test_idx = training_data.get_data(train_method) + if use_small_sample: + # test set is union of rest of training set (gold-standard) and the other genes (all of non-gold-standard) + gene_test_idx = np.union1d(gene_test_idx, all_non_gs) + + if run_test_mode: + fp.write( + "%d\n%s\n%s\n" % (loop, ",".join(genes_array[gene_train_idx]), ",".join(genes_array[gene_test_idx])) + ) + + train_ess = np.where(np.in1d(genes_array[gene_train_idx], coreEss))[0] + train_non = np.where(np.in1d(genes_array[gene_train_idx], nonEss))[0] + printstr += " " + str(len(train_ess)) + printstr += " " + str(len(train_non)) + printstr += " " + str(len(gene_test_idx)) + print(printstr) + sys.stdout.flush() + # + # define ess_train: vector of observed fold changes of essential genes in training set + # + ess_train_fc_list_of_lists = [ + fc[rnatag] for g in genes_array[gene_train_idx[train_ess]] for rnatag in gene2rna[g] + ] + ess_train_fc_flat_list = [obs for sublist in ess_train_fc_list_of_lists for obs in list(sublist.values())] + # + # define non_train vector of observed fold changes of nonessential genes in training set + # + non_train_fc_list_of_lists = [ + fc[rnatag] for g in genes_array[gene_train_idx[train_non]] for rnatag in gene2rna[g] + ] + non_train_fc_flat_list = [obs for sublist in non_train_fc_list_of_lists for obs in list(sublist.values())] + # + # calculate empirical fold change distributions for both + # + kess = stats.gaussian_kde(ess_train_fc_flat_list) + knon = stats.gaussian_kde(non_train_fc_flat_list) + # + # define empirical upper and lower bounds within which to calculate BF = f(fold change) + # + x = np.arange(-10, 2, 0.01) + nonfitx = knon.evaluate(x) + # define lower bound empirical fold change threshold: 
minimum FC np.where knon is above threshold + f = np.where(nonfitx > FC_THRESH) + xmin = round_to_hundredth(min(x[f])) + # define upper bound empirical fold change threshold: minimum value of log2(ess/non) + subx = np.arange(xmin, max(x[f]), 0.01) + logratio_sample = np.log2(kess.evaluate(subx) / knon.evaluate(subx)) + f = np.where(logratio_sample == logratio_sample.min()) + xmax = round_to_hundredth(subx[f]) + # + # round foldchanges to nearest 0.01 + # precalculate logratios and build lookup table (for speed) + # + logratio_lookup = {} + for i in np.arange(xmin, xmax + 0.01, 0.01): + logratio_lookup[np.around(i * 100)] = np.log2(kess.evaluate(i) / knon.evaluate(i)) + # + # calculate BFs from lookup table for withheld test set + # + + # liner interpolation + testx = list() + testy = list() + + for g in genes_array[gene_train_idx]: + for rnatag in gene2rna[g]: + for foldchange in list(fc[rnatag].values()): + if foldchange >= xmin and foldchange <= xmax: + testx.append(np.around(foldchange * 100) / 100) + testy.append(logratio_lookup[np.around(foldchange * 100)][0]) + try: + slope, intercept, r_value, p_value, std_err = stats.linregress(np.array(testx), np.array(testy)) + except: + print("Regression failed. Check quality of the screen") + sys.exit(1) + # + # BF calculation + # + + for g in genes_array[gene_test_idx]: + for rnatag in gene2rna[g]: + bayes_factor = [] + for rep in column_list: + bayes_factor.append(slope * fc[rnatag][rep] + intercept) + bf[rnatag].append(bayes_factor) + + if run_test_mode == True: + fp.close() + + num_obs = dict() + if rna_level is False: + bf_mean = dict() + bf_std = dict() + bf_norm = dict() # sgRNA number complement + if rna_level or filter_multi_target: + bf_mean_rna_rep = dict() + bf_std_rna_rep = dict() + # bf_norm_rna_rep = dict() + + for g in gene2rna: + num_obs[g] = len(bf[gene2rna[g][0]]) + if rna_level or filter_multi_target: + for rnatag in gene2rna[g]: + bf_mean_rna_rep[rnatag] = dict() + bf_std_rna_rep[rnatag] = dict() + t = list(zip(*bf[rnatag])) + for rep in range(len(column_list)): + bf_mean_rna_rep[rnatag][column_list[rep]] = np.mean(t[rep]) + bf_std_rna_rep[rnatag][column_list[rep]] = np.std(t[rep]) + + if rna_level == False: + sumofbf_list = list() + for i in range(num_obs[g]): + sumofbf = 0.0 + for rnatag in gene2rna[g]: + sumofbf += sum(bf[rnatag][i]) + sumofbf_list.append(sumofbf) # append each iter + bf_mean[g] = np.mean(sumofbf_list) + bf_std[g] = np.std(sumofbf_list) + + # + # BUILD MULTIPLE REGRESSION MODEL FOR MULTI TARGETING GUIDE RNAs + # + if filter_multi_target: + count = 0 + trainset = dict() + bf_multi_corrected_gene = dict() + bf_multi_corrected_rna = dict() + for gene in gene2rna: + # multi_targeting_sgrnas_info[seqid] = (perfectmatch, mismatch_1bp, perfectmatch_gene, mismatch_1bp_gene) + multitarget = list() + onlytarget = list() + for seqid in gene2rna[gene]: + if seqid not in aligninfo.index: + continue + if seqid in multi_targeting_sgrnas_info: + multitarget.append(seqid) + else: + onlytarget.append(seqid) + + if len(onlytarget) > 0: # comparsion between sgRNAs targeting one locus and multiple loci + if len(multitarget) > 0: + bf_only = np.mean([sum(list(bf_mean_rna_rep[seqid].values())) for seqid in onlytarget]) + for seqid in onlytarget: + trainset[seqid] = [1, 0, 0] + + for seqid in multitarget: + if ( + multi_targeting_sgrnas_info[seqid][2] > 1 or multi_targeting_sgrnas_info[seqid][3] > 0 + ): # train model using multi-targeting only targeting one protein coding gene + continue + + count += 1 + increment = 
sum(list(bf_mean_rna_rep[seqid].values())) - bf_only + + trainset[seqid] = [ + multi_targeting_sgrnas_info[seqid][0], + multi_targeting_sgrnas_info[seqid][1], + increment, + ] + + if count < 10: + print("Not enough train set for calculating multi-targeting effect.\n") + print("It may cause due to unmatched gRNA names between the foldchange file and the align info file.\n") + print("Filtering is not finished\n") + filter_multi_target = False + + else: + trainset = pd.DataFrame().from_dict(trainset).T + X = trainset[[0, 1]] + y = trainset[2] + + regressor = LinearRegression() + regressor.fit(X, y) + coeff_df = pd.DataFrame(regressor.coef_, X.columns, columns=["Coefficient"]) + for i in [0, 1]: + if coeff_df["Coefficient"][i] < 0: + print("Regression coefficient is below than zero. Substituted to zero\n") + coeff_df["Coefficient"][i] = 0.0 + print( + "Multiple effects from perfect matched loci = %.3f and 1bp mis-matched loci = %.3f" + % (coeff_df["Coefficient"][0], coeff_df["Coefficient"][1]) + ) + + if rna_level == False: + for g in gene2rna: + penalty = 0.0 + for seqid in gene2rna[g]: + if seqid in multi_targeting_sgrnas_info: + penalty += ( + float(multi_targeting_sgrnas_info[seqid][0] - 1) * coeff_df["Coefficient"][0] + + float(multi_targeting_sgrnas_info[seqid][1]) * coeff_df["Coefficient"][1] + ) + bf_multi_corrected_gene[g] = bf_mean[g] - penalty + else: + for g in gene2rna: + for seqid in gene2rna[g]: + if seqid in multi_targeting_sgrnas_info: + penalty = ( + float(multi_targeting_sgrnas_info[seqid][0] - 1) * coeff_df["Coefficient"][0] + + float(multi_targeting_sgrnas_info[seqid][1]) * coeff_df["Coefficient"][1] + ) + else: + penalty = 0.0 + bf_multi_corrected_rna[seqid] = sum(list(bf_mean_rna_rep[seqid].values())) - penalty + + # + # NORMALIZE sgRNA COUNT + # + if rna_level is False and flat_sgrna == True: + if filter_multi_target == True: + targetbf = bf_multi_corrected_gene + else: + targetbf = bf_mean + + for g in gene2rna: + multiple_factor = equalise_sgrna_no / float(len(gene2rna[g])) + bf_norm[g] = targetbf[g] * multiple_factor + + """ + if bf_std[rnatag] == 0.0: + bf_norm[rnatag] = float('inf') + else: + bf_norm[g] = ( bf[rnatag] - bf_mean[rnatag] ) / bf_std[rnatag] + """ + training_data = Training(gene_idx) # set training class reset + + # + # calculate network scores + # + + if network_boost == True and rna_level == False: # Network boost is only working for gene level + if run_test_mode == True: # TEST MODE + fp = open(output_file + ".netscore", "w") + print("\nNetwork score calculation start\n") + + networkscores = {} + for g in genes_array[gene_idx]: + if g in network: + templist = list() + for neighbor in network[g]: + if neighbor in bf_mean: + templist.append(bf_mean[neighbor]) + + templist.sort(reverse=True) + + networkscores[g] = fibo_weighted_sum(templist) + # + # start training + # + + for loop in range(LOOPCOUNT): + currentnbf = {} + printstr = "" + printstr += str(loop) + + # + # draw train, test sets + # + gene_train_idx, gene_test_idx = training_data.get_data(train_method) + # + # define essential and nonessential training sets: arrays of indexes + # + train_ess = np.where(np.in1d(genes_array[gene_train_idx], coreEss))[0] + train_non = np.where(np.in1d(genes_array[gene_train_idx], nonEss))[0] + printstr += " " + str(len(train_ess)) + printstr += " " + str(len(train_non)) + printstr += " " + str(len(gene_test_idx)) + + sys.stdout.flush() + # + # calculate Network BF for test set + # + ess_ns_list = [networkscores[x] for x in genes_array[gene_train_idx[train_ess]] 
if x in networkscores] + non_ns_list = [networkscores[x] for x in genes_array[gene_train_idx[train_non]] if x in networkscores] + + kess = stats.gaussian_kde(ess_ns_list) + knon = stats.gaussian_kde(non_ns_list) + # + # set x boundary for liner regression + # + testx = list() + testy = list() + xmin = float(np.inf) + xmax = float(-np.inf) + + for networkscore in np.arange(max(ess_ns_list), min(ess_ns_list), -0.01): + density_ess = kess.evaluate(networkscore)[0] + density_non = knon.evaluate(networkscore)[0] + if density_ess == 0.0 or density_non == 0.0: + continue + + if np.log2(density_ess / density_non) > -5 and networkscore < np.array(ess_ns_list).mean(): # reverse + xmin = min(xmin, networkscore) + + for networkscore in np.arange(min(non_ns_list), max(non_ns_list), 0.01): + density_ess = kess.evaluate(networkscore)[0] + density_non = knon.evaluate(networkscore)[0] + if density_ess == 0.0 or density_non == 0.0: + continue + if np.log2(density_ess / density_non) < 5 and networkscore > np.array(non_ns_list).mean(): # reverse + xmax = max(xmax, networkscore) + # + # liner regression + # + testx = list() + testy = list() + for g in genes_array[gene_train_idx]: + if g in networkscores: + if networkscores[g] >= xmin and networkscores[g] <= xmax: + testx.append(np.around(networkscores[g] * 100) / 100) + testy.append(np.log2(kess.evaluate(networkscores[g])[0] / knon.evaluate(networkscores[g])[0])) + + slope, intercept, r_value, p_value, std_err = stats.linregress(np.array(testx), np.array(testy)) + + for g in genes_array[gene_test_idx]: + if g in networkscores: + if run_test_mode == True: + fp.write("%s\t%f\t%f\n" % (g, networkscores[g], slope * networkscores[g] + intercept)) + nbf = slope * networkscores[g] + intercept + else: + nbf = 0.0 + + boostedbf[g].append(bf_mean[g] + nbf) + if flat_sgrna == True: + boostedbf[g].append(bf_norm[g] + nbf) + + if run_test_mode == True: + fp.close() + + # + # print out results + # + + # Equalizing factor (Replicates) + if flat_rep == True: + eqf = equalise_rep_no / float(len(column_labels)) + else: + eqf = 1 + + # print out + with open(output_file, "w") as fout: + if rna_level == True: + fout.write("RNA\tGENE") + for i in range(len(column_list)): + fout.write("\t{0:s}".format(column_labels[i])) + if train_method == 0: + fout.write("\t{0:s}".format(column_labels[i] + "_STD")) + fout.write("\tBF") + if train_method == 0: + fout.write("\tNumObs") + fout.write("\n") + + for rnatag in sorted(bf.keys()): + # RNA tag + fout.write("{0:s}\t".format(rnatag)) + # Gene + gene = rna2gene[rnatag] + fout.write("{0:s}\t".format(gene)) + + # BF of replicates + for rep in column_list: + fout.write("{0:4.3f}\t".format(bf_mean_rna_rep[rnatag][rep])) + if train_method == 0: + fout.write("{0:4.3f}\t".format(bf_std_rna_rep[rnatag][rep])) + + # Sum BF of replicates + if filter_multi_target == True: + fout.write( + "{0:4.3f}".format(float(bf_multi_corrected_rna[rnatag]) * eqf) + ) # eqf = equalizing factor for the number of replicates + else: + fout.write("{0:4.3f}".format(float(sum(list(bf_mean_rna_rep[rnatag].values()))) * eqf)) + + # Num obs + if train_method == 0: + fout.write("\t{0:d}".format(num_obs[gene])) + fout.write("\n") + else: + fout.write("GENE") + if network_boost == True: + fout.write("\tBoostedBF") + if train_method == 0: + fout.write("\tSTD_BoostedBF") + fout.write("\tBF") + if train_method == 0: + fout.write("\tSTD\tNumObs") + if flat_sgrna == True: + fout.write("\tNormBF") + fout.write("\n") + + for g in sorted(genes.keys()): + # Gene + 
fout.write("{0:s}".format(g)) + if network_boost == True: + boostedbf_mean = np.mean(boostedbf[g]) + boostedbf_std = np.std(boostedbf[g]) + fout.write("\t{0:4.3f}".format(float(boostedbf_mean) * eqf)) + if train_method == 0: + fout.write("\t{0:4.3f}".format(float(boostedbf_std) * eqf)) + + # BF + if filter_multi_target == True: + fout.write( + "\t{0:4.3f}".format(float(bf_multi_corrected_gene[g]) * eqf) + ) # eqf = equalizing factor for the number of replicates + else: + fout.write("\t{0:4.3f}".format(float(bf_mean[g]) * eqf)) + # STD, Count + if train_method == 0: + fout.write("\t{0:4.3f}\t{1:d}".format(float(bf_std[g]), num_obs[g])) + # Normalized BF + if flat_sgrna == True: + fout.write("\t{0:4.3f}".format(float(bf_norm[g]))) + + fout.write("\n") + + +@click.command(name="pr") +@click.option("-i", "--bayes-factors", required=True, type=click.Path(exists=True)) +@click.option("-o", "--output-file", required=True) +@click.option("-e", "--essential-genes", required=True, type=click.Path(exists=True)) +@click.option("-n", "--non-essential-genes", required=True, type=click.Path(exists=True)) +@click.option("-k", "--use-column", default=None) +def calculate_precision_recall(bayes_factors, output_file, essential_genes, non_essential_genes, use_column): + """ + Calculate precision-recall from an input Bayes Factors file: + + \b + BAGEL.py pr -i [Bayes Factor file] -o [output file] -e [essentials genes] -n [nonessentials genes] + + \b + Required options: + -i, --bayes-factors [Bayes factors] BAGEL output file. + -o, --output-file [output file] Output filename + -e, --essential-genes [reference essentials] File with list of training set of essential genes + -n, --non-essential-genes [reference nonessentials] File with list of training set of nonessential genes + + \b + Other options: + -k [column name] Use other column (default \BF\) + + \b + Example: + BAGEL.py pr -i input.bf -o output.PR -e ref_essentials.txt -n ref_nonessentials.txt + + """ + # + # test for availability of all files + # + essentials = pd.read_csv(essential_genes, index_col=0, sep="\t") + nonessentials = pd.read_csv(non_essential_genes, index_col=0, sep="\t") + bf = pd.read_csv(bayes_factors, index_col=0, sep="\t") + + if use_column is not None: + bf_column = use_column + if bf_column not in bf.dtypes.index: + print("Error! 
the column name is not in the file") + sys.exit(1) + else: + bf_column = "BF" + + bf.sort_values(by=bf_column, ascending=False, inplace=True) + + cumulative_tp = 0.0 + cumulative_fp = 0.0 + precision = 1.0 + recall = 0.0 + # note float formats + + ess = essentials.index.values + non = nonessentials.index.values + totNumEssentials = len([x for x in bf.index.values if x in ess]) + + with open(output_file, "w") as fout: + fout.write("Gene\t") + fout.write(bf_column) + fout.write("\tRecall\tPrecision\tFDR\n") + + for g in bf.index.values: + if g in ess: + cumulative_tp += 1 + elif g in non: + cumulative_fp += 1 + recall = cumulative_tp / totNumEssentials + if (cumulative_tp > 0) | (cumulative_fp > 0): + precision = cumulative_tp / (cumulative_tp + cumulative_fp) + fout.write( + "{0:s}\t{1:4.3f}\t{2:4.3f}\t{3:4.3f}\t{4:4.3f}\n".format( + g, bf.loc[g, bf_column], recall, precision, 1.0 - precision + ) + ) + + +if __name__ == "__main__": + bagel.add_command(calculate_fold_change) + bagel.add_command(calculate_bayes_factors) + bagel.add_command(calculate_precision_recall) + bagel.add_command(report_bagel_version) + bagel() diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py deleted file mode 100755 index 4a758fe0..00000000 --- a/bin/check_samplesheet.py +++ /dev/null @@ -1,259 +0,0 @@ -#!/usr/bin/env python - - -"""Provide a command line tool to validate and transform tabular samplesheets.""" - - -import argparse -import csv -import logging -import sys -from collections import Counter -from pathlib import Path - -logger = logging.getLogger() - - -class RowChecker: - """ - Define a service that can validate and transform each given row. - - Attributes: - modified (list): A list of dicts, where each dict corresponds to a previously - validated and transformed row. The order of rows is maintained. - - """ - - VALID_FORMATS = ( - ".fq.gz", - ".fastq.gz", - ) - - def __init__( - self, - sample_col="sample", - first_col="fastq_1", - second_col="fastq_2", - single_col="single_end", - **kwargs, - ): - """ - Initialize the row checker with the expected column names. - - Args: - sample_col (str): The name of the column that contains the sample name - (default "sample"). - first_col (str): The name of the column that contains the first (or only) - FASTQ file path (default "fastq_1"). - second_col (str): The name of the column that contains the second (if any) - FASTQ file path (default "fastq_2"). - single_col (str): The name of the new column that will be inserted and - records whether the sample contains single- or paired-end sequencing - reads (default "single_end"). - - """ - super().__init__(**kwargs) - self._sample_col = sample_col - self._first_col = first_col - self._second_col = second_col - self._single_col = single_col - self._seen = set() - self.modified = [] - - def validate_and_transform(self, row): - """ - Perform all validations on the given row and insert the read pairing status. - - Args: - row (dict): A mapping from column headers (keys) to elements of that row - (values). - - """ - self._validate_sample(row) - self._validate_first(row) - self._validate_second(row) - self._validate_pair(row) - self._seen.add((row[self._sample_col], row[self._first_col])) - self.modified.append(row) - - def _validate_sample(self, row): - """Assert that the sample name exists and convert spaces to underscores.""" - if len(row[self._sample_col]) <= 0: - raise AssertionError("Sample input is required.") - # Sanitize samples slightly. 
- row[self._sample_col] = row[self._sample_col].replace(" ", "_") - - def _validate_first(self, row): - """Assert that the first FASTQ entry is non-empty and has the right format.""" - if len(row[self._first_col]) <= 0: - raise AssertionError("At least the first FASTQ file is required.") - self._validate_fastq_format(row[self._first_col]) - - def _validate_second(self, row): - """Assert that the second FASTQ entry has the right format if it exists.""" - if len(row[self._second_col]) > 0: - self._validate_fastq_format(row[self._second_col]) - - def _validate_pair(self, row): - """Assert that read pairs have the same file extension. Report pair status.""" - if row[self._first_col] and row[self._second_col]: - row[self._single_col] = False - first_col_suffix = Path(row[self._first_col]).suffixes[-2:] - second_col_suffix = Path(row[self._second_col]).suffixes[-2:] - if first_col_suffix != second_col_suffix: - raise AssertionError("FASTQ pairs must have the same file extensions.") - else: - row[self._single_col] = True - - def _validate_fastq_format(self, filename): - """Assert that a given filename has one of the expected FASTQ extensions.""" - if not any(filename.endswith(extension) for extension in self.VALID_FORMATS): - raise AssertionError( - f"The FASTQ file has an unrecognized extension: {filename}\n" - f"It should be one of: {', '.join(self.VALID_FORMATS)}" - ) - - def validate_unique_samples(self): - """ - Assert that the combination of sample name and FASTQ filename is unique. - - In addition to the validation, also rename all samples to have a suffix of _T{n}, where n is the - number of times the same sample exist, but with different FASTQ files, e.g., multiple runs per experiment. - - """ - if len(self._seen) != len(self.modified): - raise AssertionError("The pair of sample name and FASTQ must be unique.") - seen = Counter() - for row in self.modified: - sample = row[self._sample_col] - seen[sample] += 1 - row[self._sample_col] = f"{sample}_T{seen[sample]}" - - -def read_head(handle, num_lines=10): - """Read the specified number of lines from the current position in the file.""" - lines = [] - for idx, line in enumerate(handle): - if idx == num_lines: - break - lines.append(line) - return "".join(lines) - - -def sniff_format(handle): - """ - Detect the tabular format. - - Args: - handle (text file): A handle to a `text file`_ object. The read position is - expected to be at the beginning (index 0). - - Returns: - csv.Dialect: The detected tabular format. - - .. _text file: - https://docs.python.org/3/glossary.html#term-text-file - - """ - peek = read_head(handle) - handle.seek(0) - sniffer = csv.Sniffer() - dialect = sniffer.sniff(peek) - return dialect - - -def check_samplesheet(file_in, file_out): - """ - Check that the tabular samplesheet has the structure expected by nf-core pipelines. - - Validate the general shape of the table, expected columns, and each row. Also add - an additional column which records whether one or two FASTQ reads were found. - - Args: - file_in (pathlib.Path): The given tabular samplesheet. The format can be either - CSV, TSV, or any other format automatically recognized by ``csv.Sniffer``. - file_out (pathlib.Path): Where the validated and transformed samplesheet should - be created; always in CSV format. 
- - Example: - This function checks that the samplesheet follows the following structure, - see also the `viral recon samplesheet`_:: - - sample,fastq_1,fastq_2 - SAMPLE_PE,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz - SAMPLE_PE,SAMPLE_PE_RUN2_1.fastq.gz,SAMPLE_PE_RUN2_2.fastq.gz - SAMPLE_SE,SAMPLE_SE_RUN1_1.fastq.gz, - - .. _viral recon samplesheet: - https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv - - """ - required_columns = {"sample", "fastq_1", "fastq_2"} - # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. - with file_in.open(newline="") as in_handle: - reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle)) - # Validate the existence of the expected header columns. - if not required_columns.issubset(reader.fieldnames): - req_cols = ", ".join(required_columns) - logger.critical(f"The sample sheet **must** contain these column headers: {req_cols}.") - sys.exit(1) - # Validate each row. - checker = RowChecker() - for i, row in enumerate(reader): - try: - checker.validate_and_transform(row) - except AssertionError as error: - logger.critical(f"{str(error)} On line {i + 2}.") - sys.exit(1) - checker.validate_unique_samples() - header = list(reader.fieldnames) - header.insert(1, "single_end") - # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. - with file_out.open(mode="w", newline="") as out_handle: - writer = csv.DictWriter(out_handle, header, delimiter=",") - writer.writeheader() - for row in checker.modified: - writer.writerow(row) - - -def parse_args(argv=None): - """Define and immediately parse command line arguments.""" - parser = argparse.ArgumentParser( - description="Validate and transform a tabular samplesheet.", - epilog="Example: python check_samplesheet.py samplesheet.csv samplesheet.valid.csv", - ) - parser.add_argument( - "file_in", - metavar="FILE_IN", - type=Path, - help="Tabular input samplesheet in CSV or TSV format.", - ) - parser.add_argument( - "file_out", - metavar="FILE_OUT", - type=Path, - help="Transformed output samplesheet in CSV format.", - ) - parser.add_argument( - "-l", - "--log-level", - help="The desired log level (default WARNING).", - choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"), - default="WARNING", - ) - return parser.parse_args(argv) - - -def main(argv=None): - """Coordinate argument parsing and program execution.""" - args = parse_args(argv) - logging.basicConfig(level=args.log_level, format="[%(levelname)s] %(message)s") - if not args.file_in.is_file(): - logger.error(f"The given input file {args.file_in} was not found!") - sys.exit(2) - args.file_out.parent.mkdir(parents=True, exist_ok=True) - check_samplesheet(args.file_in, args.file_out) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/bin/check_samplesheet_screening.py b/bin/check_samplesheet_screening.py deleted file mode 100755 index c069109d..00000000 --- a/bin/check_samplesheet_screening.py +++ /dev/null @@ -1,253 +0,0 @@ -#!/usr/bin/env python - - -"""Provide a command line tool to validate and transform tabular samplesheets.""" - - -import argparse -import csv -import logging -import sys -from collections import Counter -from pathlib import Path - -logger = logging.getLogger() - - -class RowChecker: - """ - Define a service that can validate and transform each given row. - - Attributes: - modified (list): A list of dicts, where each dict corresponds to a previously - validated and transformed row. 
The order of rows is maintained. - - """ - - VALID_FORMATS = ( - ".fq.gz", - ".fastq.gz", - ) - - def __init__( - self, - sample_col="sample", - first_col="fastq_1", - second_col="fastq_2", - condition="control", - **kwargs, - ): - """ - Initialize the row checker with the expected column names. - - Args: - sample_col (str): The name of the column that contains the sample name - (default "sample"). - first_col (str): The name of the column that contains the first (or only) - FASTQ file path (default "fastq_1"). - second_col (str): The name of the column that contains the second (if any) - FASTQ file path (default "fastq_2"). - condition (str): The name of the condition of the sample. - - """ - super().__init__(**kwargs) - self._sample_col = sample_col - self._first_col = first_col - self._second_col = second_col - self._condition = condition - self._seen = set() - self.modified = [] - - def validate_and_transform(self, row): - """ - Perform all validations on the given row and insert the read pairing status. - - Args: - row (dict): A mapping from column headers (keys) to elements of that row - (values). - - """ - self._validate_sample(row) - self._validate_first(row) - self._validate_second(row) - self._validate_pair(row) - self._seen.add((row[self._sample_col], row[self._first_col])) - self.modified.append(row) - - def _validate_sample(self, row): - """Assert that the sample name exists and convert spaces to underscores.""" - if len(row[self._sample_col]) <= 0: - raise AssertionError("Sample input is required.") - # Sanitize samples slightly. - row[self._sample_col] = row[self._sample_col].replace(" ", "_") - - def _validate_first(self, row): - """Assert that the first FASTQ entry is non-empty and has the right format.""" - if len(row[self._first_col]) <= 0: - raise AssertionError("At least the first FASTQ file is required.") - self._validate_fastq_format(row[self._first_col]) - - def _validate_second(self, row): - """Assert that the second FASTQ entry has the right format if it exists.""" - if len(row[self._second_col]) > 0: - self._validate_fastq_format(row[self._second_col]) - - def _validate_pair(self, row): - """Assert that read pairs have the same file extension. Report pair status.""" - if row[self._first_col] and row[self._second_col]: - first_col_suffix = Path(row[self._first_col]).suffixes[-2:] - second_col_suffix = Path(row[self._second_col]).suffixes[-2:] - if first_col_suffix != second_col_suffix: - raise AssertionError("FASTQ pairs must have the same file extensions.") - - def _validate_fastq_format(self, filename): - """Assert that a given filename has one of the expected FASTQ extensions.""" - if not any(filename.endswith(extension) for extension in self.VALID_FORMATS): - raise AssertionError( - f"The FASTQ file has an unrecognized extension: {filename}\n" - f"It should be one of: {', '.join(self.VALID_FORMATS)}" - ) - - def validate_unique_samples(self): - """ - Assert that the combination of sample name and FASTQ filename is unique. - - In addition to the validation, also rename all samples to have a suffix of _T{n}, where n is the - number of times the same sample exist, but with different FASTQ files, e.g., multiple runs per experiment. 
- - """ - if len(self._seen) != len(self.modified): - raise AssertionError("The pair of sample name and FASTQ must be unique.") - seen = Counter() - for row in self.modified: - sample = row[self._sample_col] - seen[sample] += 1 - row[self._sample_col] = f"{sample}_T{seen[sample]}" - - -def read_head(handle, num_lines=10): - """Read the specified number of lines from the current position in the file.""" - lines = [] - for idx, line in enumerate(handle): - if idx == num_lines: - break - lines.append(line) - return "".join(lines) - - -def sniff_format(handle): - """ - Detect the tabular format. - - Args: - handle (text file): A handle to a `text file`_ object. The read position is - expected to be at the beginning (index 0). - - Returns: - csv.Dialect: The detected tabular format. - - .. _text file: - https://docs.python.org/3/glossary.html#term-text-file - - """ - peek = read_head(handle) - handle.seek(0) - sniffer = csv.Sniffer() - dialect = sniffer.sniff(peek) - return dialect - - -def check_samplesheet(file_in, file_out): - """ - Check that the tabular samplesheet has the structure expected by nf-core pipelines. - - Validate the general shape of the table, expected columns, and each row. Also add - an additional column which records whether one or two FASTQ reads were found. - - Args: - file_in (pathlib.Path): The given tabular samplesheet. The format can be either - CSV, TSV, or any other format automatically recognized by ``csv.Sniffer``. - file_out (pathlib.Path): Where the validated and transformed samplesheet should - be created; always in CSV format. - - Example: - This function checks that the samplesheet follows the following structure, - see also the `viral recon samplesheet`_:: - - sample,fastq_1,fastq_2,condition - SAMPLE_PE,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz,control - SAMPLE_SE,SAMPLE_SE_RUN1_1.fastq.gz,,tratment - - .. _viral recon samplesheet: - https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv - - """ - required_columns = {"sample", "fastq_1", "fastq_2"} - # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. - with file_in.open(newline="") as in_handle: - reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle)) - # Validate the existence of the expected header columns. - if not required_columns.issubset(reader.fieldnames): - req_cols = ", ".join(required_columns) - logger.critical(f"The sample sheet **must** contain these column headers: {req_cols}.") - sys.exit(1) - # Validate each row. - checker = RowChecker() - for i, row in enumerate(reader): - try: - checker.validate_and_transform(row) - except AssertionError as error: - logger.critical(f"{str(error)} On line {i + 2}.") - sys.exit(1) - checker.validate_unique_samples() - header = list(reader.fieldnames) - header.insert(1, "single_end") - # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. 
- with file_out.open(mode="w", newline="") as out_handle: - writer = csv.DictWriter(out_handle, header, delimiter=",") - writer.writeheader() - for row in checker.modified: - writer.writerow(row) - - -def parse_args(argv=None): - """Define and immediately parse command line arguments.""" - parser = argparse.ArgumentParser( - description="Validate and transform a tabular samplesheet.", - epilog="Example: python check_samplesheet.py samplesheet.csv samplesheet.valid.csv", - ) - parser.add_argument( - "file_in", - metavar="FILE_IN", - type=Path, - help="Tabular input samplesheet in CSV or TSV format.", - ) - parser.add_argument( - "file_out", - metavar="FILE_OUT", - type=Path, - help="Transformed output samplesheet in CSV format.", - ) - parser.add_argument( - "-l", - "--log-level", - help="The desired log level (default WARNING).", - choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"), - default="WARNING", - ) - return parser.parse_args(argv) - - -def main(argv=None): - """Coordinate argument parsing and program execution.""" - args = parse_args(argv) - logging.basicConfig(level=args.log_level, format="[%(levelname)s] %(message)s") - if not args.file_in.is_file(): - logger.error(f"The given input file {args.file_in} was not found!") - sys.exit(2) - args.file_out.parent.mkdir(parents=True, exist_ok=True) - check_samplesheet(args.file_in, args.file_out) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/bin/cigar_parser.R b/bin/cigar_parser.R index 7c2179da..2018d686 100755 --- a/bin/cigar_parser.R +++ b/bin/cigar_parser.R @@ -946,7 +946,8 @@ if (dim(alignment_info)[1] != 0){ dels <- separated_indels %>% filter(Modification == "del") dels_count <- dim(dels)[1] if ( t_type == "dels-out" || t_type == "dels-in"){ - dels_count <- dels_count - t_reads + t_in_dels <- dels %>% filter(Ids %in% t_ids[[1]]) + dels_count <- dels_count - dim(t_in_dels)[1] } ins <- separated_indels %>% filter(Modification == "ins") ins_count <- dim(ins)[1] @@ -1008,8 +1009,19 @@ if (dim(alignment_info)[1] != 0){ reads_classes <- c("Raw reads", "Merged reads", "Quality filtered reads", "Clustered reads", "Aligned reads") reads_counts <- c(as.character(reads), as.character(merged_reads), as.character(trimmed_reads), as.character(clustered_reads), as.character(aligned_reads)) - write(reads_counts, stdout()) reads_summary <- data.frame(classes = unlist(reads_classes), counts = unlist(reads_counts)) + # Save table for multiQC + formated_counts <- c( + strsplit(as.character(reads), ' ')[[1]][2], + strsplit(strsplit(as.character(merged_reads), '(', fixed = TRUE)[[1]][2], '%')[[1]][1], + strsplit(strsplit(as.character(trimmed_reads), '(', fixed = TRUE)[[1]][2], '%')[[1]][1], + as.character(clustered_reads), + strsplit(strsplit(as.character(aligned_reads), '(', fixed = TRUE)[[1]][2], '%')[[1]][1] + ) + reads_summary_mqc <- data.frame(sample = unlist(formated_counts), row.names = unlist(reads_classes)) + colnames(reads_summary_mqc)[1] = results_path # Rename the column to add the sample ID + reads_summary_mqc <- t(reads_summary_mqc) # t() will add classes as columns and counts as values, 1 row per sample + write.csv(reads_summary_mqc,file=paste0(results_path, "_reads-summary.csv")) ########### Pre-variant calling counts # Indel filters @@ -1026,8 +1038,15 @@ if (dim(alignment_info)[1] != 0){ "Above error & in pick", "NOT above error & in pick", "NOT above error & NOT in pick", "Above error & NOT in pick") prevc_counts <- c(aligned_reads, wt_reads+incorrect_wt, dim(separated_indels)[1]+trunc_reads, wt_reads, 
incorrect_wt, dim(separated_indels)[1], trunc_reads, ep, nep, nenp, enp) - write(prevc_counts, stdout()) prevc_summary <- data.frame(classes = unlist(prevc_classes), counts = unlist(prevc_counts)) + # Write prevc_summary data to a csv for multiQC plots + prevc_classes_mqc <- c("Wt passing filter", "Wt NOT passing filter", "Indels NOT passing filter", + "Above error & in pick", "NOT above error & in pick", "NOT above error & NOT in pick", "Above error & NOT in pick") + prevc_counts_mqc <- c(wt_reads, incorrect_wt, trunc_reads, ep, nep, nenp, enp) + indel_filters <- data.frame(sample = unlist(prevc_counts_mqc), row.names = unlist(prevc_classes_mqc)) + colnames(indel_filters)[1] = results_path # Rename the column to add the sample ID + indel_filters <- t(indel_filters) # t() will add classes as columns and counts as values, 1 row per sample + write.csv(indel_filters,file=paste0(results_path, "_QC-indels.csv")) } else { ###########################In case we don't have indels but maybe there are template based editions @@ -1064,8 +1083,19 @@ if (dim(alignment_info)[1] != 0){ reads_classes <- c("Raw reads", "Merged reads", "Quality filtered reads", "Clustered reads", "Aligned reads") reads_counts <- c(as.character(reads), as.character(merged_reads), as.character(trimmed_reads), as.character(clustered_reads), as.character(aligned_reads)) - write(reads_counts, stdout()) reads_summary <- data.frame(classes = unlist(reads_classes), counts = unlist(reads_counts)) + # Save table for multiQC + formated_counts <- c( + strsplit(as.character(reads), ' ')[[1]][2], + strsplit(strsplit(as.character(merged_reads), '(', fixed = TRUE)[[1]][2], '%')[[1]][1], + strsplit(strsplit(as.character(trimmed_reads), '(', fixed = TRUE)[[1]][2], '%')[[1]][1], + as.character(clustered_reads), + strsplit(strsplit(as.character(aligned_reads), '(', fixed = TRUE)[[1]][2], '%')[[1]][1] + ) + reads_summary_mqc <- data.frame(sample = unlist(formated_counts), row.names = unlist(reads_classes)) + colnames(reads_summary_mqc)[1] = results_path # Rename the column to add the sample ID + reads_summary_mqc <- t(reads_summary_mqc) # t() will add classes as columns and counts as values, 1 row per sample + write.csv(reads_summary_mqc,file=paste0(results_path, "_reads-summary.csv")) ########### Pre-variant calling counts # Indel filters @@ -1076,19 +1106,26 @@ if (dim(alignment_info)[1] != 0){ prevc_classes <- c("Aligned reads", "Wt", "Indels", "Wt passing filter", "Wt NOT passing filter", "Indels passing filter", "Indels NOT passing filter", "Above error & in pick", "NOT above error & in pick", "NOT above error & NOT in pick", "Above error & NOT in pick") - prevc_counts <- c(aligned_reads, wt_reads+incorrect_wt, 0+trunc_reads, wt_reads, incorrect_wt, 0, trunc_reads, + prevc_counts <- c(aligned_reads, wt_reads+incorrect_wt, dim(separated_indels)[1]+trunc_reads, wt_reads, incorrect_wt, dim(separated_indels)[1], trunc_reads, ep, nep, nenp, enp) - write(prevc_counts, stdout()) prevc_summary <- data.frame(classes = unlist(prevc_classes), counts = unlist(prevc_counts)) exportJson <- toJSON(cut_site) write(exportJson, paste(sample_id,"_cutSite.json",sep = "")) + # Write prevc_summary data to a csv for multiQC plots + prevc_classe_mqc <- c("Wt passing filter", "Wt NOT passing filter", "Indels NOT passing filter", + "Above error & in pick", "NOT above error & in pick", "NOT above error & NOT in pick", "Above error & NOT in pick") + prevc_counts_mqc <- c(wt_reads, incorrect_wt, trunc_reads, ep, nep, nenp, enp) + indel_filters <- data.frame(sample = 
unlist(prevc_counts_mqc), row.names = unlist(prevc_classe_mqc)) + colnames(indel_filters)[1] = results_path # Rename the column to add the sample ID + indel_filters <- t(indel_filters) # t() will add classes as columns and counts as values, 1 row per sample + write.csv(indel_filters,file=paste0(results_path, "_QC-indels.csv")) } ########### ##### Summary: edition ########### - edit_classes <- c("Wt", "Template-based", "Indels", "Insertions", "Deletions", "Delins", "Dels inframe", "Dels outfarme", "Ins inframe", "Ins outfarme") + edit_classes <- c("Wt", "Template-based", "Indels", "Insertions", "Deletions", "Delins", "Dels_inframe", "Dels_outframe", "Ins_inframe", "Ins_outframe") ##### Update wt if template-based is a substitution if ( t_type == "subs"){ @@ -1098,21 +1135,11 @@ if (dim(alignment_info)[1] != 0){ edit_counts <- c(wt_reads, t_reads, indels_count, ins_count, dels_count, delin_count, in_frame_dels, out_frame_dels, in_frame_ins, out_frame_ins ) edit_summary <- data.frame(classes = unlist(edit_classes), counts = unlist(edit_counts)) - total_reads <- wt_reads+t_reads+indels_count - wt_perc <- (wt_reads/total_reads)*100 - temp_perc <- ((t_reads)/total_reads)*100 - indels_perc <- (indels_count/total_reads)*100 - ins_perc <- (ins_count/indels_count)*100 - dels_perc <- (dels_count/indels_count)*100 - delins_perc <- (delin_count/indels_count)*100 - in_frame_ins_perc <- (in_frame_ins/ins_count)*100 - out_frame_ins_perc <- (out_frame_ins/ins_count)*100 - in_frame_dels_perc <- (in_frame_dels/dels_count)*100 - out_frame_dels_perc <- (out_frame_dels/dels_count)*100 - - edit_classes_perc <- c("Wt", "Template-based", "Indels", "Delins", "Insertions", "Ins inframe", "Ins outfarme", "Deletions", "Dels inframe", "Dels outfarme") - edit_counts_perc <- c(round(wt_perc,2), round(temp_perc,2), round(indels_perc,2), round(delins_perc,2), round(ins_perc,2), round(in_frame_ins_perc,2), round(out_frame_ins_perc,2), round(dels_perc,2), round(in_frame_dels_perc,2), round(out_frame_dels_perc,2 )) - edit_summary_perc <- data.frame(classes = unlist(edit_classes_perc), counts = unlist(edit_counts_perc)) + edit_classes_perc <- c("Wt", "Template-based", "Delins", "Ins_inframe", "Ins_outframe", "Dels_inframe", "Dels_outframe") + edit_counts_perc <- c(wt_reads, t_reads, delin_count, in_frame_ins, out_frame_ins, in_frame_dels, out_frame_dels) + edit_summary_perc <- data.frame(sample = unlist(edit_counts_perc), row.names = unlist(edit_classes_perc)) + colnames(edit_summary_perc)[1] = results_path # Rename the column to add the sample ID + edit_summary_perc <- t(edit_summary_perc) # t() will add classes as columns and counts as values, 1 row per sample ### Save edits count write.csv(edit_summary_perc,file=paste0(results_path, "_edits.csv")) @@ -1201,8 +1228,10 @@ if (dim(alignment_info)[1] != 0){ separated_indels = data.frame(matrix(nrow = 1, ncol = length(columns))) colnames(separated_indels) = columns write.csv(separated_indels,file=paste0(results_path, "_Badlyindels.csv")) - edit_classes_perc <- c("Wt", "Template-based", "Indels", "Delins", "Insertions", "Ins inframe", "Ins outfarme", "Deletions", "Dels inframe", "Dels outfarme") - edit_counts_perc <- c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0) - edit_summary_perc <- data.frame(classes = unlist(edit_classes_perc), counts = unlist(edit_counts_perc)) + edit_classes_perc <- c("Wt", "Template-based", "Delins", "Ins_inframe", "Ins_outframe", "Dels_inframe", "Dels_outframe") + edit_counts_perc <- c(0, 0, 0, 0, 0, 0, 0) + edit_summary_perc <- data.frame(sample = 
unlist(edit_counts_perc), row.names = unlist(edit_classes_perc)) + colnames(edit_summary_perc)[1] = results_path # Rename the column to add the sample ID + edit_summary_perc <- t(edit_summary_perc) # t() will add classes as columns and counts as values, 1 row per sample write.csv(edit_summary_perc,file=paste0(results_path, "_edits.csv")) } diff --git a/bin/plotter.R b/bin/plotter.R new file mode 100755 index 00000000..8d0fd8ea --- /dev/null +++ b/bin/plotter.R @@ -0,0 +1,666 @@ +#!/usr/bin/env Rscript + +############################ +#### Plot editing results +#### author: Marta Sanvicente +#### modified by: Júlia Mir @mirpedrol +#### Released under the MIT license. See git repository (https://github.com/nf-core/crisprseq) for full license text. +#### Original code https://bitbucket.org/synbiolab/crispr-a_nextflow/src/master/bin/get_plots.R +############################ + +args = commandArgs(trailingOnly=TRUE) + +library(ggplot2) +library(plyr) +library(dplyr) +library(seqinr) +library(ShortRead) +library(ggpubr) +library(ggmsa) +library(seqmagick) +library(stringr) +library(tidyr) +library(ggseqlogo) +library(plotly) +library(cowplot) +library(optparse) + +#################################### +### Load command line arguments #### +#################################### + +option_list = list( + make_option(c("-i", "--indels_info"), type="character", default=NULL, + help="CSV with information of INDEL editions", metavar="character"), + make_option(c("-r", "--reference"), type="character", default=NULL, + help="Reference fasta file", metavar="character"), + make_option(c("-g", "--gRNA_sequence"), type="character", default=NULL, + help="gRNA sequence", metavar="character"), + make_option(c("-n", "--sample_name"), type="character", default=NULL, + help="Sample ID", metavar="character"), + make_option(c("-c", "--cut_site"), type="numeric", default=NULL, + help="Cut position", metavar="numeric"), + make_option(c("-s", "--substitutions_info"), type="character", default=NULL, + help="CSV with information of substitution editions", metavar="character") +); + +opt_parser = OptionParser(option_list=option_list); +opt = parse_args(opt_parser); + +sample_name <- opt$sample_name +indels_info <- opt$indels_info +reference <- opt$reference +gR <- opt$gRNA_sequence +substitutions_info <- opt$substitutions_info +rel_cut_site <- as.numeric(opt$cut_site) + +data <- read.csv(indels_info) +ref_seq <- readFasta(reference) +subs_plup <- read.csv(substitutions_info, row.names = 1) + + +###################### +##### CRISPR-GA-1 like plot https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4184265/ +###################### + +plotIndels_gg <- function(indels_data, cut){ + # Split insertions from deletions + l <- length(indels_data$Modification) + indels_data$wt_reads[1] + data$t_reads[1] + dels <- indels_data %>% filter(Modification == "del") + ins <- indels_data %>% filter(Modification == "ins") + + #Metadata deletion + if (length(dels$X)>0){ + metadels <- as.data.frame(table(dels$Start,dels$Length)) + metadels <- metadels[which(metadels$Freq>0),] + colnames(metadels) <- c("Start","length","Freq") + metadels$End <- 0 + metadels$changes <- sum((as.integer(as.character(metadels$Freq)) * as.integer(as.character(metadels$length)))) #Now the changes column corrspond to a constant that will be used in the creation of accumulative. 
This variable stores the total number of changes per position + + for (i in 1:length(metadels$Start)){ + metadels$End[i] = as.integer(as.character(metadels$Start[i])) + as.integer(as.character(metadels$length[i])) - 1 + } + }else{ + metadels <- list() + } + + exportJson <- jsonlite::toJSON(metadels) + write(exportJson, paste(sample_name,"_metadels.json",sep="")) + + #Metadata insertion + if(length(ins$X)>0){ + metains <- as.data.frame(table(ins$Start,ins$Length)) + metains <- metains[which(metains$Freq>0),] + colnames(metains) <- c("Start","length","Freq") + metains$End <- 0 + metains$changes <- sum((as.integer(as.character(metains$Freq)) * as.integer(as.character(metains$length)))) #Now the changes column corrspond to a constant that will be used in the creation of accumulative. This variable stores the total number of changes per position + + for (i in 1:length(metains$Start)){ + metains$End[i] = as.integer(as.character(metains$Start[i])) + as.integer(as.character(metains$length[i])) - 1 + } + }else{ + metains <- list() + } + exportJson <- jsonlite::toJSON(metains) + write(exportJson, paste(sample_name,"_metains.json",sep="")) + + + # Count accumulated insertions and deletions by position + accum.dels=c() + accum.dels=unlist(apply(dels,1,function(X) return( seq(as.integer(X[3]), as.integer(X[3]) + as.integer(X[4])) ))) + accum.ins=c() + accum.ins=unlist(apply(ins,1,function(X) return( seq(as.integer(X[3]), as.integer(X[3]) + as.integer(X[4])) ))) + + # Plot + a_dels <- data.frame(location = c(accum.dels)) + a_ins <- data.frame(location = c(accum.ins)) + + + ############################################### + # ACUMMULATIVE LOCATIONS # + ############################################### + if ( dim(dels)[1] != 0 ){ + acc_dels <- as.data.frame(table(a_dels)) + acc_dels <- acc_dels[acc_dels$Freq>0,] + acc_dels$percent <- (acc_dels$Freq/sum(acc_dels$Freq))*100 + del_acumm <- plot_ly(y = ~acc_dels$percent, x= ~as.numeric(as.character(acc_dels$location)),type = "bar",hovertemplate = paste('Percentage: %{y}','Counts: ',acc_dels$Freq)) %>% + layout(xaxis = list(title="Accumulative Deletion Location",rangeslider = list()), yaxis = list(fixedrange = FALSE,title = "Reads Percentage(%)"), barmode = 'stack')%>% + layout(shapes = list(list( + type = "line", + y0 = 0, + y1 = 100, + yref = "paper", + x0 = cut_site, + x1 = cut_site, + line = list(color = "red", dash="dot") + )))%>% + add_text(showlegend = FALSE, x=cut_site, y=max(acc_dels$percent),text = "Cut site", hovertemplate = paste("Cut site ", cut_site), + textfont = list(color= c("red"))) + }else { + acc_dels <- data.frame() + } + + + if ( dim(ins)[1] != 0 ){ + acc_ins <- as.data.frame(table(a_ins)) + acc_ins <- acc_ins[acc_ins$Freq>0,] + acc_ins$percent <- (acc_ins$Freq/sum(acc_ins$Freq))*100 + + ins_acumm <- plot_ly(y = ~acc_ins$percent, x= ~as.numeric(as.character(acc_ins$location)),type = "bar",hovertemplate = paste('Percentage: %{y}','Counts: ',acc_ins$Freq)) %>% + layout(xaxis = list(title="Accumulative Insertion Location",rangeslider = list()), yaxis = list(fixedrange = FALSE,title = "Reads Percentage(%)"), barmode = 'stack')%>% + layout(shapes = list(list( + type = "line", + y0 = 0, + y1 = 100, + yref = "paper", + x0 = cut_site, + x1 = cut_site, + line = list(color = "red", dash="dot") + )))%>% + add_text(showlegend = FALSE, x=cut_site, y=max(acc_ins$percent),text = "Cut site", hovertemplate = paste("Cut Site ", cut_site), + textfont = list(color= c("red"))) + }else { + acc_ins <- data.frame() + } + + if( (dim(ins)[1] != 0) && (dim(dels)[1] != 
0) ){ + accummulative <- subplot(del_acumm, ins_acumm, shareY = T,shareX = F,titleX = T,titleY = T) + } else if( dim(ins)[1] != 0 ){ + accummulative <- subplot(ins_acumm, shareY = T,shareX = F,titleX = T,titleY = T) + } else { + accummulative <- subplot(del_acumm, shareY = T,shareX = F,titleX = T,titleY = T) + } + + #Elements for dynamic table + sequence_length <- max(indels_data$Start+indels_data$Length) + exportJson <- jsonlite::toJSON(sequence_length) + write(exportJson, paste(sample_name,"_length.json",sep="")) + + exportJson <- jsonlite::toJSON(l) + write(exportJson, paste(sample_name,"_Total_reads.json",sep="")) + + + #INSERTIONS + ############################################### + # LOCATION # + ############################################### + + pattern = ins + + #defensive programming against file with no insertions + if(length(pattern$Modification)>0){ + + LOC <- as.data.frame(table(pattern$Start,pattern$Length,pattern$ins_nt)) + LOC <- LOC[which(LOC$Freq>0),] + LOC$percent <- (LOC$Freq/l)*100 + + + #AXIS -> PERCENTAGE, BAR -> COUNT + Ins_location <- plot_ly() %>% + layout(xaxis = list(title="Insertion Location",rangeslider = list()), yaxis = list(fixedrange = FALSE,title = "Reads Percentage(%)"), barmode = 'stack') + + Ins_location <- Ins_location %>% add_trace(x = ~as.numeric(as.character(LOC$Var1)), + y= ~LOC$percent, color = ~LOC$Var3, type = 'bar', + marker = list(~LOC$Freq), legendgroup = ~LOC$Var3, + hovertemplate = paste('Percentage: %{y}','Counts: ',LOC$Freq)) + + + Ins_location <- Ins_location %>% layout(shapes = list(list( + type = "line", + y0 = 0, + y1 = 100, + yref = "paper", + x0 = cut_site, + x1 = cut_site, + line = list(color = "red", dash="dot") + ))) %>% add_text(showlegend = FALSE, x=cut_site,y=max(LOC$percent),text = "Cut site", hovertemplate = paste("Cut site ", cut_site), + textfont = list(color= c("red"))) + + + ############################################### + # SIZES # + ############################################### + + #AXIS -> PERCENTAGE, BAR -> COUNT + Ins_sizes <- plot_ly() %>% + layout(xaxis = list(title="Insertion Sizes",rangeslider = list()), yaxis = list(title = "Reads Percentage(%)"), barmode = 'stack') + Ins_sizes <- Ins_sizes %>% add_trace(x = ~as.numeric(as.character(LOC$Var2)), + y= ~LOC$percent, color = ~LOC$Var3, type = 'bar', + marker = list(~LOC$Freq), showlegend = F, legendgroup = ~LOC$Var3, + hovertemplate = paste(' Percentage: %{y}','Count: ',LOC$Freq)) + + ############################################### + # MERGE PLOTS # + ############################################### + + insertions <- subplot(Ins_location,Ins_sizes, titleY = F, titleX = T) %>% layout(title = "Insertions",yaxis = list(title = "Reads Percentage (%)")) + }else{ + insertions<-empty_plot("No Insertions were detected.") + } + + #DELETIONS + #pattern filter (A,C,T,G) -> NHEJ + pattern <- dels + + #defensive programming against file with no deletions + if(length(pattern$Modification)){ + + for(i in 1:length(pattern$patterns)){ + if(length(strsplit(pattern$patterns[i],split = "")[[1]])==1){ + pattern$patterns[i] <- "NHEJ" + } + } + + ############################################### + # LOCATION # + ############################################### + + + LOC <- as.data.frame(table(pattern$Start,pattern$patterns,pattern$Length)) + LOC <- LOC[which(LOC$Freq>0),] + LOC$percent <- (LOC$Freq/l)*100 + + + #AXIS -> PERCENTAGE, BAR -> COUNT + Del_location <- plot_ly() %>% + layout(xaxis = list(title="Deletion Location",rangeslider = list()), yaxis = list(fixedrange = FALSE,title =
"Reads Percentage(%)"), barmode = 'stack') + Del_location <- Del_location %>% add_trace(x = ~as.numeric(as.character(LOC$Var1)), + y= ~LOC$percent, color = ~LOC$Var2, type = 'bar', + marker = list(~LOC$Freq), legendgroup = ~LOC$Var2, + hovertemplate = paste('Percentage: %{y}','Counts: ',LOC$Freq)) + + Del_location <- Del_location %>% layout(shapes = list(list( + type = "line", + y0 = 0, + y1 = 100, + yref = "paper", + x0 = cut_site, + x1 = cut_site, + line = list(color = "red", dash="dot") + )))%>% + add_text(showlegend = FALSE, x=cut_site,y=max(LOC$percent),text = "Cut site", hovertemplate = paste("Cut site ", cut_site), + textfont = list(color= c("red"))) + + + ############################################### + # SIZES # + ############################################### + + #AXIS -> PERCENTAGE, BAR -> COUNT + Del_sizes <- plot_ly() %>% + layout(xaxis = list(title="Deletion Sizes",rangeslider = list()), yaxis = list(title = "Reads Percentage(%)"), barmode = 'stack') + Del_sizes <- Del_sizes %>% add_trace(x = ~as.numeric(as.character(LOC$Var3)), + y= ~LOC$percent, color = ~LOC$Var2, type = 'bar', + marker = list(~LOC$Freq), showlegend=F, legendgroup = ~LOC$Var2, + hovertemplate = paste('Size: ',LOC$Var3,' Percentage: %{y}')) + + + ############################################### + # MERGE PLOTS # + ############################################### + + deletions <- subplot(Del_location,Del_sizes, titleY = F, titleX = T) %>% layout(title = "Deletions",yaxis = list(title = "Reads Percentage (%)")) + }else{ + deletions<-empty_plot("No deletions were detected.") + } + plots <- list(deletions,insertions,accummulative) + return(plots) +} + +####### +#### Reverse complement +####### +strComp=function(X){ + return(c2s(rev(comp(s2c(X))))) +} + +###### +### Cut site +###### +get_cutSite <- function(gRNA_seq, reference, rel_cut){ + ### gRNA seq to upper case + gRNA_seq <- toupper(gRNA_seq) + ### Check orientation of gRNA in reference sequence and cut site + rvComp_seq <- strComp(as.character(sread(reference)[[1]])) + align <- pairwiseAlignment(toupper(gRNA_seq), toupper(as.character(sread(reference)[[1]])), type="global-local") + alignRevComp <- pairwiseAlignment(toupper(gRNA_seq), toupper(rvComp_seq), type="global-local") + + if (score(align) > score(alignRevComp)){ + if (rel_cut > 0){ + cut_site <- start(subject(align))+rel_cut-1 + } else { + cut_site <- start(subject(align)) + ( nchar(gRNA_seq) + rel_cut ) -1 + } + } else { + if (rel_cut > 0){ + cut_site <- nchar(as.character(sread(reference)[[1]])) - end(subject(alignRevComp)) + (nchar(gRNA_seq)-rel_cut) + } else { + cut_site <- nchar(as.character(sread(reference)[[1]])) - end(subject(alignRevComp)) - rel_cut + } + } + return(cut_site) +} + + +####### +#### Empty Plots +####### +empty_plot <- function(title = NULL){ + p <- plotly_empty(type = "scatter", mode = "markers") %>% + config( + displayModeBar = FALSE + ) %>% + layout( + title = list( + text = title, + yref = "paper", + y = 0.5 + ) + ) + return(p) +} + +###################### +##### Alleles percentages plot +###################### + +indel_count_seq <- function(indel_data){ + # Get table with percentages of indels among all genotypes + indel_count <- indel_data %>% group_by(Start, Length, Modification) %>% dplyr::summarise(freq=n()) + #indel_count <- plyr::count(indel_data, vars = c("Start", "Length", "Modification")) + indel_count <- indel_count[order(indel_count$freq, decreasing = TRUE),] + indel_count$Perc <- (indel_count$freq/sum(indel_count$freq)) * 100 + 
return(as.data.frame(indel_count)) +} + +get_sequences <- function(indels, ref_seq){ + edited_sequences <- c() + seqs <- lapply(c(1:dim(indels)[1]), function(num) { + ref_seq_test <- as.character(sread(ref_seq)) + if (indels$Modification[[num]] == "del"){ + substr(ref_seq_test, indels$Start[[num]], indels$Start[[num]]+indels$Length[[num]]-1) <- paste0(rep("-", indels$Length[[num]]), collapse = "") + edited_sequences <- c(edited_sequences, ref_seq_test) + } else { + seq <- paste(substring(ref_seq_test, 1, indels$Start[[num]]), paste0(rep("N", indels$Length[[num]]), collapse = ""), substring(ref_seq_test, indels$Start[[num]]+1, nchar(ref_seq_test)), sep = "") + edited_sequences <- c(edited_sequences, seq) + } + }) +} + +###################### +##### Substitutions at +/-25 of the gRNA plot +###################### +subs_plot <- function(subsperc, gRNA_seq, cut_site){ + ### Get the gRNA region + pre_cut_site <- cut_site - (nchar(gRNA_seq)-3) ## nchar(gRNA_seq)-3 instead of a fixed 17, so gRNA lengths other than 20 are supported + post_cut_site <- cut_site + 3 + 1 + ### gRNA nucleotides to use in the axis + ref_nt <- stringr::str_split(gRNA_seq, "")[[1]] + ### Get the plot + plot <- ggplot(data=subsperc %>% filter(pos > pre_cut_site - 25) %>% filter(pos < post_cut_site + 25), aes(x=ordered(pos), y=percentage, fill=nucleotide, alpha=ifelse(percentage<95,1,0))) + + geom_bar(stat="identity") + theme_classic() + scale_fill_manual("Nucleotides", values = c("A" = "#109648", "C" = "#86b7ed", "G" = "#f7b32b", "T" = "#d62839", "-" = "#f2f2f2")) + + scale_x_discrete(breaks = (pre_cut_site+1):(post_cut_site-1), labels = ref_nt) + xlab(NULL) + scale_alpha(guide = 'none') + ylab('nt (%)') + + geom_text(aes(label = ifelse(percentage<50 & percentage>5,100-percentage,"")), + hjust = 0, vjust = 1.5, angle = 90, nudge_x = -.5, + size = 2.5) + + theme(text = element_text(size = 11), legend.position = "bottom") + return(plot) +} + +############ +### Substitutions logo plot +############ +subs_logo_plot <- function(subsperc, gRNA_seq, cut_site){ + ### Positions related to the cut site + pre_cut_site <- cut_site - (nchar(gRNA_seq)-3) + post_cut_site <- cut_site + 6 + 1 + # + ### From percentages table to matrix + fper_filtered <- subsperc %>% filter(pos > pre_cut_site) %>% filter(pos < post_cut_site) %>% filter(nucleotide != "-") + nuc_pos <- pivot_wider(fper_filtered, names_from = pos, values_from = percentage) + nuc_pos[is.na(nuc_pos)] <- 0 + nuc_pos[nuc_pos == 100] <- 0 + nuc_pos_matrix <- data.matrix(nuc_pos[,-1]) + nuc_pos_matrix <- nuc_pos_matrix+0.00001 ### This is done to avoid problems in case of a matrix full of 0s + rownames(nuc_pos_matrix) <- nuc_pos$nucleotide + # + ### gRNA nucleotides to use in the axis + ref_nt <- c(str_split(gRNA_seq, "")[[1]], "N", "G", "G") + logo <- ggseqlogo(nuc_pos_matrix, method='custom', seq_type='dna') + ylab('nt (%)') + logo$scales$scales[[1]] <- scale_x_continuous(breaks= 1:length(ref_nt),labels=ref_nt) + logo_plot <- logo + annotate('rect', xmin = length(ref_nt)-2.5, xmax = length(ref_nt)+0.5, ymin = -5, ymax = 105, alpha = .1, col='black', fill='yellow') + return(logo_plot) +} + +################## +### Top variants plots +################## + +### Logo plot of top indels +get_logo_top_vars <- function(selected_reference, change_len, pattern_indel, modification, str_pos, cut_site){ + # This function generates the logo of the alleles that appear most frequently in the data.
+ all_pre_nt <- list(A = "", C = "", T = "", G = "") + for (j in c("A", "C", "G", "T")){ + per_nt <- lapply(1:length(selected_reference), { + function(i){ + if(i < 11 || i > 10+l || modification == "ins"){ + if(selected_reference[i] == j){ + return(1-0.00001) + } else { + return(0.00001) + } + } else { + return(0.00001) + } + } + }) + all_pre_nt[[j]] <- per_nt + } + ## Data frame with the probability of each position + df<-NULL + df <- rbind(df, unlist(all_pre_nt$A)) + df <- rbind(df, unlist(all_pre_nt$C)) + df <- rbind(df, unlist(all_pre_nt$G)) + df <- rbind(df, unlist(all_pre_nt$T)) + pos_matrix <- data.matrix(df) + rownames(pos_matrix) <- c("A", "C", "G", "T") + if(modification == "ins"){ + cut_correction <- change_len + } else { + cut_correction <- 0 + } + + logo <- ggseqlogo(pos_matrix, method='custom', seq_type='dna') + ylab('') + + theme(plot.title = element_text(size = 12 , face = "bold" , hjust = 0.5), + axis.text.x = element_blank(), + axis.text.y = element_blank()) + + ggtitle(paste0(l, "nt ", mod, "; Start: ", str_pos)) + + annotate('segment', x = cut_site-str_pos+11+0.5+cut_correction , xend=cut_site-str_pos+11+0.5+cut_correction, y=-0.08, yend=1.08, linewidth=1, color="red") + + if (is.na(pattern_indel) && modification != "ins"){ + return(logo) + } else if (modification == "ins"){ + logo_plot <- logo + annotate('rect', xmin = 11-0.5, xmax = 10+change_len+0.5, ymin = -0.05, ymax = 1.05, alpha = .1, col='black', fill='yellow') + return(logo_plot) + } else if (pattern_indel == "NHEJ"){ + return(logo) + } else { + mh_len <- nchar(pattern_indel) + logo_plot <- logo + annotate('rect', xmin = 11+change_len-0.5, xmax = 10+change_len+mh_len+0.5, ymin = -0.05, ymax = 1.05, alpha = .1, col='black', fill='yellow') + return(logo_plot) + } +} + + + +################## +### Get all plots +################## + +#load(subs_info) +#empty data troubleshooter, notice that 3 is choose to avoid counting small empty df product of read.csv function +a<-as.character(data) +sampleName<-gsub('.{11}$', '', a) +checkFaulty<-grep('Faulty', sampleName) +a<-as.character(indels_info) +a<-gsub('.{11}$', '', a) +checkEmpty<-grep('Empt', a) +checkFaulty<-grep('Badl', a) +if (dim(data)[2]>3 && length(checkFaulty) == 0 && length(checkEmpty) == 0){ ### Substitutions percentages plot + cut_site <- get_cutSite(gR, ref_seq, rel_cut_site) + if (dim(subs_plup)[1] == 0){ + system(paste0("touch ", sample_name, "_subs-perc_plot.png")) + system(paste0("touch ", sample_name, "_subs-perc_plot_LOGO.png")) + } else { + sp <- subs_plot(subs_plup, gR, cut_site) + ggsave(paste0(sample_name, "_subs-perc_plot.png"), width = 12, height = 1.5) + ### Substitutions logo plot + sp_logo <- subs_logo_plot(subs_plup, gR, cut_site) + ggsave(paste0(sample_name, "_subs-perc_plot_LOGO.png"), width = 12, height = 1.5) + } + + ### Counts plot + #GET THE HTMLS + figure_counts <- plotIndels_gg(indels_data = data, cut = cut_site) + htmlwidgets::saveWidget(as_widget(figure_counts[[1]]), paste0(sample_name,"_Deletions.html")) + htmlwidgets::saveWidget(as_widget(figure_counts[[2]]), paste0(sample_name,"_Insertions.html")) + htmlwidgets::saveWidget(as_widget(figure_counts[[3]]), paste0(sample_name,"_accumulative.html")) + + ### Alleles plot (deletions) + indel_count <- indel_count_seq(data) + dels_count <- indel_count %>% filter(Modification == "del") ## Just deletions + if (dim(dels_count)[1] != 0){ + dels_count$Start <- as.numeric(as.character(dels_count$Start)) + dels_count$Length <- as.numeric(as.character(dels_count$Length)) + seqs <- 
get_sequences(dels_count, ref_seq) + + # Write fasta with the sequences and use it to get the alleles plot + if (length(seqs) < 10){ + rep_seqs <- seqs[1:length(seqs)] + } else { + rep_seqs <- seqs[1:10] + } + write.fasta(sequences = rep_seqs, names = paste0(round(dels_count$Perc, 2), "_", dels_count$Modification, dels_count$Length, "_", seq(1, dim(dels_count)[1]))[1:10], file.out = paste0(sample_name, "_top10.fa")) + nt_sequences <- paste0(sample_name, "_top10.fa") + figure_alleles <- ggmsa(nt_sequences, cut_site-40, cut_site+40, color = "Chemistry_NT", seq_name = TRUE) + ggsave(paste0(sample_name, "_delAlleles_plot.png")) + } else { + system(paste0("touch ", sample_name, "_delAlleles_plot.png")) + } + + ######### Low variability or top variants plot + wt <- data$wt_reads[1] ### 7299 + templata_based <- data$t_reads[1] ### 0 + total_char <- wt + templata_based + dim(data)[1] + + delCols_indels <- data %>% group_by(Modification, Start, Length, ins_nt, patterns) %>% dplyr::summarize(freq = n()) + unique_variants <- rbind(as.data.frame(delCols_indels), c("wt", 0, 0, NA, NA, wt), c("template-based", 0, 0, NA, NA, templata_based)) + uniq_indels_sorted <- unique_variants[order(as.numeric(unique_variants$freq), decreasing = TRUE),] + write.csv(uniq_indels_sorted,file=paste0(sample_name, "_unique-variants.csv")) + + # Check if there are enogh indels to have 5 top + if(dim(uniq_indels_sorted)[1] < 5 ){ + num_top <- dim(delCols_indels)[1] + } else { num_top <- 5 } + + # Get to variants + top_5 <- uniq_indels_sorted[1:num_top,] + + top5_names <- lapply(c(1:dim(top_5)[1]), + function(i){ + if( top_5[i,]$Modification == "del" ) { + return(paste0(top_5[i,]$Start, "_", top_5[i,]$Modification, top_5[i,]$Length, "_", top_5[i,]$pattern)) + } else if ( top_5[i,]$Modification == "ins" ) { + return(paste0(top_5[i,]$Start, "_", top_5[i,]$Modification, top_5[i,]$ins_nt)) + } else { + return(top_5[i,]$Modification) + } + } + ) + + wt_pos <- which(unlist(top5_names) == "wt") + if (length(wt_pos) == 0){ + cols_list = c("#bebebe", rep("#9394f7", num_top + 1)) + } else if (wt_pos == num_top){ + cols_list = c("#bebebe", "#9394f7", rep("#9394f7", num_top - 1), "#1cf453") + } else if (wt_pos == 1) { + cols_list = c("#bebebe", "#9394f7", "#1cf453", rep("#9394f7", num_top - 1)) + } else { + cols_list = c("#bebebe", rep("#9394f7", wt_pos), "#1cf453", rep("#9394f7", num_top-wt_pos)) + } + + reads_classes <- c("Other alleles", "Top alleles", unlist(top5_names)) + reads_counts <- c(total_char - sum(as.numeric(top_5$freq)), sum(as.numeric(top_5$freq)), as.numeric(top_5$freq)) + reads_summary <- data.frame(classes = unlist(reads_classes), counts = unlist(reads_counts)) + reads_summary$parents = c("", "", rep("Top alleles", num_top)) + fig <- plot_ly(reads_summary, + labels = ~ classes, + parents = ~ parents, + values = ~ counts, + type = 'sunburst', + branchvalues = 'total', + textinfo = "label+percent entry", + marker = list(colors = cols_list, color = "black"), + textfont = list(color = '#000000', size = 20) + ) + + htmlwidgets::saveWidget(as_widget(fig), paste0(sample_name,"_top.html")) + + ### Logo plot + all_each_logo <- list() + list_num <- 1 + sel_top <- top_5 %>% filter(Modification %in% c("del", "ins")) + if (dim(sel_top)[1] > 0){ + for (i in c(1:dim(sel_top)[1])){ + if (sel_top[i,]$Modification == "ins"){ + selfil_1 <- data %>% filter(Modification == sel_top[i,]$Modification) %>% filter(Start == sel_top[i,]$Start) %>% filter(Length == sel_top[i,]$Length) %>% filter(ins_nt == sel_top[i,]$ins_nt) + } else { + selfil_1 
<- data %>% filter(Modification == sel_top[i,]$Modification) %>% filter(Start == sel_top[i,]$Start) %>% filter(Length == sel_top[i,]$Length) + } + s <- selfil_1[1,]$Start + l <- selfil_1[1,]$Length + if( s > 12 && (nchar(as.character(sread(ref_seq)[[1]])) > (s+l+9) )){ + ref_splited <- c(str_split(toupper(as.character(sread(ref_seq)[[1]])), "")[[1]]) + sel_ref <- ref_splited[(s-10):(s+l+9)] + mod <- selfil_1[1,]$Modification + if (mod == "ins"){ + sel_ref <- c(sel_ref[1:10], c(str_split(toupper(selfil_1[1,]$ins_nt), "")[[1]]), sel_ref[11:length(sel_ref)]) + } + p <- selfil_1[1,]$patterns + each_logo <- get_logo_top_vars(sel_ref, l, p, mod, s, cut_site) + + all_each_logo[[list_num]] <- each_logo + list_num <- list_num + 1 + } + } + } + + if (length(all_each_logo) > 0){ + plot_grid(plotlist = all_each_logo, ncol = 1) + ggsave(paste0(sample_name, "_top-alleles_LOGO.png")) + } else { + system(paste0("touch ", sample_name, "_top-alleles_LOGO.png")) + } + +} else { + fig<-empty_plot("No alignments were produced. + Please check your files and references") + htmlwidgets::saveWidget(as_widget(fig), paste0(sample_name,"_Deletions.html")) + htmlwidgets::saveWidget(as_widget(fig), paste0(sample_name,"_Insertions.html")) + htmlwidgets::saveWidget(as_widget(fig), paste0(sample_name,"_accumulative.html")) + system(paste0("touch ", sample_name, "_delAlleles_plot.png")) + + #Elements for dynamic table IF THERE ARE NO ALIGNMENTS + empty_list = list() + exportJson <- jsonlite::toJSON(empty_list) + write(exportJson, paste(sample_name,"_length.json",sep="")) + write(exportJson, paste(sample_name,"_Total_reads.json",sep="")) + write(exportJson, paste(sample_name,"_metadels.json",sep="")) + write(exportJson, paste(sample_name,"_metains.json",sep="")) + + system(paste0("touch ", sample_name, "_top.html")) + system(paste0("touch ", sample_name, "_top-alleles_LOGO.png")) + system(paste0("touch ", sample_name, "_counts_plot.png")) + system(paste0("touch ", sample_name, "_subs-perc_plot_LOGO.png")) + system(paste0("touch ", sample_name, "_subs-perc_plot.png")) +} diff --git a/conf/modules.config b/conf/modules.config index 84504a31..afae335d 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -18,50 +18,59 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] - withName: SAMPLESHEET_CHECK { + withName: ORIENT_REFERENCE { + ext.prefix = { params.reference_fasta ? "${reference.baseName}" : "${meta.id}_reference" } publishDir = [ - path: { "${params.outdir}/pipeline_info" }, + path: { "${params.outdir}/preprocessing/sequences" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + enabled: false + ] + } + + withName: CAT_FASTQ { + publishDir = [ + path: { "${params.outdir}/preprocessing/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - withName: SEQ_TO_FILE_REF { + withName: PEAR { publishDir = [ - path: { "${params.outdir}/preprocessing/sequences" }, + path: { "${params.outdir}/preprocessing/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] } - withName: SEQ_TO_FILE_TEMPL { + withName: BAGEL2_BF { publishDir = [ - path: { "${params.outdir}/preprocessing/sequences" }, + path: { "${params.outdir}/bagel2/bayes_factor/" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - withName: ORIENT_REFERENCE { - ext.prefix = { params.reference_fasta ? "${reference.baseName}" : "${meta.id}_reference" } + withName: BAGEL2_PR { publishDir = [ - path: { "${params.outdir}/preprocessing/sequences" }, + path: { "${params.outdir}/bagel2/precision_recall/" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - withName: CAT_FASTQ { + withName: BAGEL2_FC { publishDir = [ - path: { "${params.outdir}/preprocessing/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, + path: { "${params.outdir}/bagel2/fold_change/" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } - withName: PEAR { + withName: BAGEL2_GRAPH { publishDir = [ - path: { "${params.outdir}/preprocessing/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, + path: { "${params.outdir}/bagel2/graphs/" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] @@ -87,6 +96,7 @@ process { withName: MAGECK_COUNT { publishDir = [ path: { "${params.outdir}/mageck/count/" }, + mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] ext.prefix = 'count_table' @@ -96,6 +106,7 @@ process { withName: MAGECK_MLE { publishDir = [ path: { "${params.outdir}/mageck/mle/${meta.id}/" }, + mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] ext.args = @@ -107,6 +118,7 @@ process { ext.args2 = "-t" publishDir = [ path: { "${params.outdir}/mageck/rra/" }, + mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] } @@ -152,11 +164,12 @@ process { ext.prefix = { "${fasta.baseName}_top" } } - withName: MERGING_SUMMARY { + withName: PREPROCESSING_SUMMARY { publishDir = [ path: { "${params.outdir}/summary/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + enabled: false ] } @@ -183,11 +196,21 @@ process { withName: RACON_1 { ext.args = '-t 4 -m 8 -x -6 -g -8 -w 500 --no-trimming' ext.prefix = { "${reads.baseName}_cycle1" } + publishDir = [ + path: { "${params.outdir}/racon_umi" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] } withName: RACON_2 { ext.args = '-t 4 -m 8 -x -6 -g -8 -w 500 --no-trimming' ext.prefix = { "${reads.baseName}_cycle2" } + publishDir = [ + path: { "${params.outdir}/racon_umi" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] } withName: MEDAKA { @@ -255,6 +278,15 @@ process { ext.args = '--cut_site=-3' } + withName: CRISPRSEQ_PLOTTER { + ext.args = '--cut_site=-3' + publishDir = [ + path: { "${params.outdir}/plots" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + withName: CUSTOM_DUMPSOFTWAREVERSIONS { publishDir = [ path: { "${params.outdir}/pipeline_info" }, @@ -263,4 +295,13 @@ process { ] } + withName: 'MULTIQC' { + ext.args = params.multiqc_title ? "--title \"$params.multiqc_title\"" : '' + publishDir = [ + path: { "${params.outdir}/multiqc" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + } diff --git a/conf/test_screening.config b/conf/test_screening.config index 83c42d85..7661aacf 100644 --- a/conf/test_screening.config +++ b/conf/test_screening.config @@ -25,4 +25,5 @@ params { crisprcleanr = "Brunello_Library" mle_design_matrix = "https://raw.githubusercontent.com/nf-core/test-datasets/crisprseq/testdata/design_matrix.txt" library = "https://raw.githubusercontent.com/nf-core/test-datasets/crisprseq/testdata/brunello_target_sequence.txt" + rra_contrasts = "https://raw.githubusercontent.com/nf-core/test-datasets/crisprseq/testdata/rra_contrasts.txt" } diff --git a/docs/images/crisprseq_targeted_metro_map.png b/docs/images/crisprseq_targeted_metro_map.png index 8c57a2c0..b83bac57 100644 Binary files a/docs/images/crisprseq_targeted_metro_map.png and b/docs/images/crisprseq_targeted_metro_map.png differ diff --git a/docs/images/crisprseq_targeted_metro_map_dark.png b/docs/images/crisprseq_targeted_metro_map_dark.png index 296f9d43..f64193df 100644 Binary files a/docs/images/crisprseq_targeted_metro_map_dark.png and b/docs/images/crisprseq_targeted_metro_map_dark.png differ diff --git a/docs/images/hCas9-AAVS1-a_Deletions.png b/docs/images/hCas9-AAVS1-a_Deletions.png new file mode 100644 index 00000000..b2132a82 Binary files /dev/null and b/docs/images/hCas9-AAVS1-a_Deletions.png differ diff --git a/docs/images/hCas9-AAVS1-a_Insertions.png b/docs/images/hCas9-AAVS1-a_Insertions.png new file mode 100644 index 00000000..d71a4463 Binary files /dev/null and b/docs/images/hCas9-AAVS1-a_Insertions.png differ diff --git a/docs/images/hCas9-AAVS1-a_accumulative.png b/docs/images/hCas9-AAVS1-a_accumulative.png new file mode 100644 index 00000000..a064d72e Binary files /dev/null and b/docs/images/hCas9-AAVS1-a_accumulative.png differ diff --git a/docs/images/hCas9-AAVS1-a_delAlleles_plot.png b/docs/images/hCas9-AAVS1-a_delAlleles_plot.png new file mode 100644 index 00000000..87e1ac12 Binary files /dev/null and b/docs/images/hCas9-AAVS1-a_delAlleles_plot.png differ diff --git a/docs/images/hCas9-AAVS1-a_subs-perc_plot.png b/docs/images/hCas9-AAVS1-a_subs-perc_plot.png new file mode 100644 index 00000000..0cd88e1b Binary files /dev/null and b/docs/images/hCas9-AAVS1-a_subs-perc_plot.png differ diff --git a/docs/images/hCas9-AAVS1-a_subs-perc_plot_LOGO.png b/docs/images/hCas9-AAVS1-a_subs-perc_plot_LOGO.png new file mode 100644 index 00000000..05dad749 Binary files /dev/null and b/docs/images/hCas9-AAVS1-a_subs-perc_plot_LOGO.png differ diff --git a/docs/images/hCas9-AAVS1-a_top-alleles_LOGO.png b/docs/images/hCas9-AAVS1-a_top-alleles_LOGO.png new file mode 100644 index 00000000..c15dfd01 Binary files /dev/null and b/docs/images/hCas9-AAVS1-a_top-alleles_LOGO.png differ diff --git a/docs/images/hCas9-AAVS1-a_top.png b/docs/images/hCas9-AAVS1-a_top.png new file mode 100644 index 00000000..3adfe3cf Binary files /dev/null and b/docs/images/hCas9-AAVS1-a_top.png differ diff --git a/docs/images/mqc_qc_of_indels.png b/docs/images/mqc_qc_of_indels.png new file mode 100644 index 00000000..fe342fd6 Binary files /dev/null and b/docs/images/mqc_qc_of_indels.png differ 
diff --git a/docs/images/mqc_read_processing_summary.png b/docs/images/mqc_read_processing_summary.png new file mode 100644 index 00000000..831f2ea0 Binary files /dev/null and b/docs/images/mqc_read_processing_summary.png differ diff --git a/docs/images/mqc_type_of_edition.png b/docs/images/mqc_type_of_edition.png new file mode 100644 index 00000000..ac26f48d Binary files /dev/null and b/docs/images/mqc_type_of_edition.png differ diff --git a/docs/output.md b/docs/output.md index 4d0c8048..f6e1cc39 100644 --- a/docs/output.md +++ b/docs/output.md @@ -3,7 +3,6 @@ ## Introduction This document describes the output produced by the pipeline. - Please refer to the respective Output documentation: - [Output targeted](output/targeted.md) diff --git a/docs/output/screening.md b/docs/output/screening.md index bb45fdcc..869a7d33 100644 --- a/docs/output/screening.md +++ b/docs/output/screening.md @@ -23,6 +23,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - [Gene essentiality](#gene-essentiality-computation) - [MAGeCK rra](#mageck-rra) - modified robust ranking aggregation (RRA) algorithm - [MAGeCK mle](#mageck-mle) - maximum-likelihood estimation (MLE) for robust identification of CRISPR-screen hits + - [BAGEL2](#BAGEL2) - Bayes Factor to identify essential genes - [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution @@ -92,11 +93,32 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - `*_count_sgrna_summary.txt`: sgRNA ranking results, tab separated file containing means, p-values - `*.report.Rmd`: markdown report recapping essential genes - `*_count_table.log`: log of the run + - `*_scatterview.png`: scatter view of the targeted genes and their logFC + - `*_rank.png`: rank view of the targeted genes [MAGeCK](https://sourceforge.net/p/mageck/wiki/Home/) is a computational tool to identify important genes from CRISPR-Cas9 screens. +### BAGEL2 + +
+Output files + +- `bagel2/fold_change` + - `*.foldchange`: fold change between the treatment and reference samples of the provided contrast +- `bagel2/bayes_factor` + - `*.bf`: Bayes factor per gene +- `bagel2/precision_recall` + - `*.pr`: precision-recall values per gene +- `bagel2/graphs` + - `barplot*.png`: bar plot of the Bayes factor distribution + - `PR*.png`: precision-recall plot (Recall vs FDR) + +
+ +[BAGEL2](https://github.com/hart-lab/bagel) is a computational tool to identify essential genes from CRISPR-Cas9 screening experiments. + ## MultiQC
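The BAGEL2 tables listed above are plain-text files and straightforward to post-process. As a minimal, illustrative sketch (not part of the pipeline), the R snippet below ranks genes by Bayes factor from a `bagel2/bayes_factor` output; the file name is hypothetical and the `GENE`/`BF` column names are an assumption that may differ between BAGEL2 versions, so adjust them to match your file header.

```r
# Illustrative only: the file path and the GENE/BF column names are assumptions,
# not guaranteed by the pipeline or the BAGEL2 documentation.
library(dplyr)

bf <- read.delim("results/bagel2/bayes_factor/treatment1_vs_control1.bf")  # hypothetical path

top_hits <- bf %>%
  arrange(desc(BF)) %>%  # larger Bayes factors indicate stronger evidence of essentiality
  head(20)

print(top_hits)
```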
diff --git a/docs/output/targeted.md b/docs/output/targeted.md index b04ca3d8..63f6080e 100644 --- a/docs/output/targeted.md +++ b/docs/output/targeted.md @@ -33,6 +33,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - [bowtie2](#bowtie2) - Mapping reads to reference - [Edits calling](#edits-calling) - [CIGAR](#cigar) - Parse CIGAR to call edits + - [Output plots](#output-plots) - Results visualisation - [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution @@ -44,9 +45,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d Output files - `preprocessing/sequences/` - - `*_reference.fasta`: Sequence used as a reference. - `*_template.fasta`: Provided template sequence. - - `*_correctOrient.fasta`: Reference sequence in the correct orientation. - `_NewReference.fasta`: New reference generated from adding the changes made by the template to the original reference. - `*_template-align.bam`: Alignment of the new reference (with template changes) to the original reference. @@ -174,7 +173,7 @@ The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They m [Minimap2](https://github.com/lh3/minimap2) is a sequence alignment program that aligns DNA sequences against a reference database. -### racon +### racon_umi
Output files @@ -250,18 +249,43 @@ This section contains the final output of the pipeline. It contains information - `cigar/` - `*_cutSite.json`: Contains the protospacer cut site position in the reference. - - `*_edition.html`: Interactive pie chart with the percentage of edition types. Reads are classified between WT (without an edit) and indels. Indels are divided between deletions, insertions and delins (deletion + insertion). Deletions and insertions can be out of frame or in frame. + - `*_edition.html`: Interactive pie chart with the percentage of edition types. Reads are classified between WT (without an edit) and indels. Indels are divided between deletions, insertions and delins (deletion + insertion). Deletions and insertions can be out of frame or in frame. A similar plot can be visualised in the MultiQC report. ![Test sample hCas9-AAVS1-a edition plot](../images/hCas9-AAVS1-a_edition.png) - - `*_edits.csv`: Table containing the data visualized in the pie chart. + - `*_edits.csv`: Table containing the number of reads classified to each edition type. Contains the data visualized in the pie chart. - `*_indels.csv`: Table containing information of all reads. Edit type, edit start and length, if the edition happens above the error rate, if it's located into the common edit window, the frequency, the percentage, the pattern, surrounding nucleotides in case of insertions, the protospacer cut site, the sample id, number of aligned reads and number of reads with and without a template modification. - - `*_QC-indels.html`: Interactive pie chart with information about aligned reads. Reads are classified between WT and containing indels. Both types are classified between passing the filtering steps or not. Indel reads passing the filtering steps are divided in reads with a modification above the error rate and located in the common edit window, above the error rate but not in the edit region, vice versa, or any of those conditions. + - `*_QC-indels.html`: Interactive pie chart with information about aligned reads. Reads are classified between WT and containing indels. Both types are classified between passing the filtering steps or not. Indel reads passing the filtering steps are divided in reads with a modification above the error rate and located in the common edit window, above the error rate but not in the edit region, vice versa, or any of those conditions. A similar plot can be visualised in the MultiQC report. ![Test sample hCas9-AAVS1-a QC indels plot](../images/hCas9-AAVS1-a_QC-indels.png) - - `*_reads.html`: Interactive pie chart with percentage of the number of raw reads, reads merged with Pear, reads passing quality filters and UMI clustered reads. + - `*_reads.html`: Interactive pie chart with percentage of the number of raw reads, reads merged with Pear, reads passing quality filters and UMI clustered reads. A table with this information can be visualised in the MultiQC report. ![Test sample hCas9-AAVS1-a reads plot](../images/hCas9-AAVS1-a_reads.png) - `*_subs-perc.csv`: Table containing the percentage of each nucleotide found for each reference position.
+### Output plots + +
+Output files + +- `plots/` + - `*_accumulative.html`: Interactive barplot showing the accumulative deletions and insertions. x-axis represents the reference position. y-axis represents the percentage of reads containing a deletion or insertion in that position. + ![Test sample hCas9-AAVS1-a accumulative edition plot](../images/hCas9-AAVS1-a_accumulative.png) + - `*_delAlleles_plot.png`: Image showing the most common deletions found. x-axis represents the position. y-axis indicates the percentage in which the plotted deletion is observed (in respect of all deletions), followed by the length of the deletion. Dashes `-` indicate a deleted base. + ![Test sample hCas9-AAVS1-a deletion alleles plot](../images/hCas9-AAVS1-a_delAlleles_plot.png) + - `*_Deletions.html`: Interactive barplot showing the percentage of reads showing a deletion for each position and the deletion sizes. The left panel represents the percentage of reads having a deletion for each position (similar to `*_accumulative.html`). The right panel shows the number of deletions found relative to their size. The deleted sequences found are shown coloured in the stacked barplot. + ![Test sample hCas9-AAVS1-a deletions plot](../images/hCas9-AAVS1-a_Deletions.png) + - `*_Insertions.html`: Interactive barplot showing the percentage of reads showing an insertion for each position as well as the insertion sizes. The left panel represents the percentage of reads having an insertion for each position (similar to `*_accumulative.html`). The right panel shows the number of insertions found relative to their size. The inserted sequences found are shown coloured in the stacked barplot. + ![Test sample hCas9-AAVS1-a insertions plot](../images/hCas9-AAVS1-a_Insertions.png) + - `*_subs-perc_plot_LOGO.png`: LOGO showing the most represented nucleotide and its percentage (y-axis) for protospacer positions. PAM sequence is highlighted in yellow. + ![Test sample hCas9-AAVS1-a substitutions LOGO](../images/hCas9-AAVS1-a_subs-perc_plot_LOGO.png) + - `*_subs-perc_plot.png`: Barplot showing the most represented nucleotide and its percentage (y-axis and bar tags) for +/-25 positions surrounding the cut site. The protospacer sequence is highlighted by writing the sequence base in the y axis. Bases whose percentage is higher than 90% are not colored. + ![Test sample hCas9-AAVS1-a substitutions percentage plot](../images/hCas9-AAVS1-a_subs-perc_plot.png) + - `*_top-alleles_LOGO.png`: LOGO showing the 4 most represented editions. Cut site is highlighted with a vertical red line. The type of edition and start position are shown as a title to each LOGO. Deleted bases are not drawn. Inserted bases are highlighted in yellow. + ![Test sample hCas9-AAVS1-a top alleles LOGO](../images/hCas9-AAVS1-a_top-alleles_LOGO.png) + - `*_top.html`: Interactive pie chart showing the percentage of the top 4 editions found. The percentage of WT is also shown. Editions are named after the position, the type of edition and length and the sequence. + ![Test sample hCas9-AAVS1-a top alleles plot](../images/hCas9-AAVS1-a_top.png) + +
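The `*_accumulative.html`, `*_Deletions.html` and `*_Insertions.html` plots listed above are built by expanding every indel record into the run of reference positions it spans and tabulating those positions. The short R sketch below illustrates that idea on made-up `Start`/`Length` values; it is a simplified stand-in for the pipeline's plotting script, not the code it actually runs.

```r
# Toy illustration of the per-position accumulation behind the accumulative plots.
# The Start/Length values are invented for the example.
dels <- data.frame(Start = c(10, 12, 12), Length = c(3, 2, 5))

# Expand each deletion into the reference positions it covers
positions <- unlist(mapply(function(s, l) seq(s, s + l - 1), dels$Start, dels$Length))

# Tabulate coverage per position and express it as a share of all deleted positions
acc <- as.data.frame(table(position = positions))
acc$percent <- 100 * acc$Freq / sum(acc$Freq)
print(acc)
```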
+ ## MultiQC
@@ -278,6 +302,17 @@ This section contains the final output of the pipeline. It contains information Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see . +`multiqc_report.html` contains statistics for FastQC and Cutadapt modules. It also contains a table with statistics about read processing (equivalent to `/cigar/*_reads.html` plots), and plots summarising the found editions (equivalent to `/cigar/*_edition.html` plots) and indel quality filters (equivalent to `/cigar/*_QC-indels.html` plots). + +
+Custom sections example + +![Read processing table](../images/mqc_read_processing_summary.png) +![Type of edition plot](../images/mqc_type_of_edition.png) +![QC of indels plot](../images/mqc_qc_of_indels.png) + +
+ ## Pipeline information
diff --git a/docs/usage.md b/docs/usage.md index 683cd823..e95bbe73 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -40,8 +40,11 @@ If you wish to repeatedly use the same parameters for multiple runs, rather than Pipeline settings can be provided in a `yaml` or `json` file via `-params-file `. -> ⚠️ Do not use `-c ` to specify parameters as this will result in errors. Custom config files specified with `-c` must only be used for [tuning process resource specifications](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources), other infrastructural tweaks (such as output directories), or module arguments (args). -> The above pipeline run specified with a params file in yaml format: +:::warning +Do not use `-c ` to specify parameters as this will result in errors. Custom config files specified with `-c` must only be used for [tuning process resource specifications](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources), other infrastructural tweaks (such as output directories), or module arguments (args). +::: + +The above pipeline run specified with a params file in yaml format: ```bash nextflow run nf-core/crisprseq -profile docker -params-file params.yaml @@ -53,7 +56,6 @@ with `params.yaml` containing: input: './samplesheet.csv' outdir: './results/' genome: 'GRCh37' -input: 'data' <...> ``` @@ -77,11 +79,13 @@ This version number will be logged in reports when you run the pipeline, so that To further assist in reproducbility, you can use share and re-use [parameter files](#running-the-pipeline) to repeat pipeline runs with the same settings without having to write out a command with every single parameter. -> 💡 If you wish to share such profile (such as upload as supplementary material for academic publications), make sure to NOT include cluster specific paths to files, nor institutional specific profiles. +:::tip +If you wish to share such profile (such as upload as supplementary material for academic publications), make sure to NOT include cluster specific paths to files, nor institutional specific profiles. +::: ## Core Nextflow arguments -:::info +:::note These options are part of Nextflow and use a _single_ hyphen (pipeline parameters use a double-hyphen). ::: @@ -91,7 +95,7 @@ Use this parameter to choose a configuration profile. Profiles can give configur Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Shifter, Charliecloud, Apptainer, Conda) - see below. -:::note +:::info We highly recommend the use of Docker or Singularity containers for full pipeline reproducibility, however when this is not possible, Conda is also supported. ::: diff --git a/docs/usage/screening.md b/docs/usage/screening.md index 1901fd0b..63fba93e 100644 --- a/docs/usage/screening.md +++ b/docs/usage/screening.md @@ -41,11 +41,25 @@ SRR8983580,SRR8983580.small.fastq.gz,,treatment An [example samplesheet](https://github.com/nf-core/test-datasets/blob/crisprseq/testdata/samplesheet_test.csv) has been provided with the pipeline. -The pipeline currently supports 2 algorithms to detect gene essentiality, MAGeCK rra and MAGeCK mle. MAGeCK MLE (Maximum Likelihood Estimation) and MAGeCK RRA (Robust Ranking Aggregation) are two different methods provided by the MAGeCK software package to analyze CRISPR-Cas9 screens. +### library + +If you are running the pipeline with fastq files and wish to obtain a count table, the library parameter is needed. 
The library table has three mandatory columns: id, target transcript (or gRNA sequence) and gene symbol. +An [example](https://github.com/nf-core/test-datasets/blob/crisprseq/testdata/brunello_target_sequence.txt) has been provided with the pipeline. Many libraries can be found on [Addgene](https://www.addgene.org/). + +After the alignment step, the pipeline currently supports 3 algorithms to detect gene essentiality: MAGeCK rra, MAGeCK mle and BAGEL2. MAGeCK MLE (Maximum Likelihood Estimation) and MAGeCK RRA (Robust Ranking Aggregation) are two different methods provided by the MAGeCK software package to analyze CRISPR-Cas9 screens. BAGEL2 identifies gene essentiality through Bayesian Analysis. ### MAGeCK rra -MAGeCK RRA performs robust ranking aggregation to identify genes that are consistently ranked highly across multiple replicate screens. To run MAGeCK rra, `--rra_contrasts` should be used with a `csv` separated file stating the two conditions to be compared. +MAGeCK RRA performs robust ranking aggregation to identify genes that are consistently ranked highly across multiple replicate screens. To run MAGeCK rra, provide `--rra_contrasts` with a `csv` file containing two columns, treatment and reference, separated by a semicolon (;). You can also combine several samples/conditions within a column by separating them with commas. An example is shown below: + +| treatment | reference | +| ----------------------- | ------------------- | +| treatment1 | control1 | +| treatment1,treatment2 | control1,control2 | + +A full example can be found [here](https://raw.githubusercontent.com/nf-core/test-datasets/crisprseq/testdata/full_test/samplesheet_full.csv). ### MAGeCK mle @@ -57,7 +71,10 @@ If there are several designs to be run, you can input a folder containing all th CRISPRcleanR is used for gene count normalization and the removal of biases for genomic segments for which copy numbers are amplified. Currently, the pipeline only supports annotation libraries already present in the R package and which can be found [here](https://github.com/francescojm/CRISPRcleanR/blob/master/Reference_Manual.pdf). To use CRISPRcleanR normalization, use `--crisprcleanr library`, `library` being the exact name as the library in the CRISPRcleanR documentation (e.g: "AVANA_Library"). -This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. +### BAGEL2 + +BAGEL2 (Bayesian Analysis of Gene Essentiality) is a computational tool developed by the Hart Lab. It is designed for analyzing large-scale genetic screens, particularly CRISPR-Cas9 screens, to identify genes that are essential for the survival or growth of cells under different conditions. BAGEL2 integrates information about the location of guide RNAs within a gene and leverages this information to improve the accuracy of gene essentiality predictions. +BAGEL2 uses the same contrasts file provided via `--rra_contrasts`. Note that the pipeline will create the following files in your working directory: diff --git a/lib/NfcoreSchema.groovy b/lib/NfcoreSchema.groovy deleted file mode 100755 index 9b34804d..00000000 --- a/lib/NfcoreSchema.groovy +++ /dev/null @@ -1,530 +0,0 @@ -// -// This file holds several functions used to perform JSON parameter validation, help and summary rendering for the nf-core pipeline template.
-// - -import nextflow.Nextflow -import org.everit.json.schema.Schema -import org.everit.json.schema.loader.SchemaLoader -import org.everit.json.schema.ValidationException -import org.json.JSONObject -import org.json.JSONTokener -import org.json.JSONArray -import groovy.json.JsonSlurper -import groovy.json.JsonBuilder - -class NfcoreSchema { - - // - // Resolve Schema path relative to main workflow directory - // - public static String getSchemaPath(workflow, schema_filename='nextflow_schema.json') { - return "${workflow.projectDir}/${schema_filename}" - } - - // - // Function to loop over all parameters defined in schema and check - // whether the given parameters adhere to the specifications - // - /* groovylint-disable-next-line UnusedPrivateMethodParameter */ - public static void validateParameters(workflow, params, log, schema_filename='nextflow_schema.json') { - def has_error = false - //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - // Check for nextflow core params and unexpected params - def json = new File(getSchemaPath(workflow, schema_filename=schema_filename)).text - def Map schemaParams = (Map) new JsonSlurper().parseText(json).get('definitions') - def nf_params = [ - // Options for base `nextflow` command - 'bg', - 'c', - 'C', - 'config', - 'd', - 'D', - 'dockerize', - 'h', - 'log', - 'q', - 'quiet', - 'syslog', - 'v', - - // Options for `nextflow run` command - 'ansi', - 'ansi-log', - 'bg', - 'bucket-dir', - 'c', - 'cache', - 'config', - 'dsl2', - 'dump-channels', - 'dump-hashes', - 'E', - 'entry', - 'latest', - 'lib', - 'main-script', - 'N', - 'name', - 'offline', - 'params-file', - 'pi', - 'plugins', - 'poll-interval', - 'pool-size', - 'profile', - 'ps', - 'qs', - 'queue-size', - 'r', - 'resume', - 'revision', - 'stdin', - 'stub', - 'stub-run', - 'test', - 'w', - 'with-apptainer', - 'with-charliecloud', - 'with-conda', - 'with-dag', - 'with-docker', - 'with-mpi', - 'with-notification', - 'with-podman', - 'with-report', - 'with-singularity', - 'with-timeline', - 'with-tower', - 'with-trace', - 'with-weblog', - 'without-docker', - 'without-podman', - 'work-dir' - ] - def unexpectedParams = [] - - // Collect expected parameters from the schema - def expectedParams = [] - def enums = [:] - for (group in schemaParams) { - for (p in group.value['properties']) { - expectedParams.push(p.key) - if (group.value['properties'][p.key].containsKey('enum')) { - enums[p.key] = group.value['properties'][p.key]['enum'] - } - } - } - - for (specifiedParam in params.keySet()) { - // nextflow params - if (nf_params.contains(specifiedParam)) { - log.error "ERROR: You used a core Nextflow option with two hyphens: '--${specifiedParam}'. 
Please resubmit with '-${specifiedParam}'" - has_error = true - } - // unexpected params - def params_ignore = params.schema_ignore_params.split(',') + 'schema_ignore_params' - def expectedParamsLowerCase = expectedParams.collect{ it.replace("-", "").toLowerCase() } - def specifiedParamLowerCase = specifiedParam.replace("-", "").toLowerCase() - def isCamelCaseBug = (specifiedParam.contains("-") && !expectedParams.contains(specifiedParam) && expectedParamsLowerCase.contains(specifiedParamLowerCase)) - if (!expectedParams.contains(specifiedParam) && !params_ignore.contains(specifiedParam) && !isCamelCaseBug) { - // Temporarily remove camelCase/camel-case params #1035 - def unexpectedParamsLowerCase = unexpectedParams.collect{ it.replace("-", "").toLowerCase()} - if (!unexpectedParamsLowerCase.contains(specifiedParamLowerCase)){ - unexpectedParams.push(specifiedParam) - } - } - } - - //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - // Validate parameters against the schema - InputStream input_stream = new File(getSchemaPath(workflow, schema_filename=schema_filename)).newInputStream() - JSONObject raw_schema = new JSONObject(new JSONTokener(input_stream)) - - // Remove anything that's in params.schema_ignore_params - raw_schema = removeIgnoredParams(raw_schema, params) - - Schema schema = SchemaLoader.load(raw_schema) - - // Clean the parameters - def cleanedParams = cleanParameters(params) - - // Convert to JSONObject - def jsonParams = new JsonBuilder(cleanedParams) - JSONObject params_json = new JSONObject(jsonParams.toString()) - - // Validate - try { - schema.validate(params_json) - } catch (ValidationException e) { - println '' - log.error 'ERROR: Validation of pipeline parameters failed!' - JSONObject exceptionJSON = e.toJSON() - printExceptions(exceptionJSON, params_json, log, enums) - println '' - has_error = true - } - - // Check for unexpected parameters - if (unexpectedParams.size() > 0) { - Map colors = NfcoreTemplate.logColours(params.monochrome_logs) - println '' - def warn_msg = 'Found unexpected parameters:' - for (unexpectedParam in unexpectedParams) { - warn_msg = warn_msg + "\n* --${unexpectedParam}: ${params[unexpectedParam].toString()}" - } - log.warn warn_msg - log.info "- ${colors.dim}Ignore this warning: params.schema_ignore_params = \"${unexpectedParams.join(',')}\" ${colors.reset}" - println '' - } - - if (has_error) { - Nextflow.error('Exiting!') - } - } - - // - // Beautify parameters for --help - // - public static String paramsHelp(workflow, params, command, schema_filename='nextflow_schema.json') { - Map colors = NfcoreTemplate.logColours(params.monochrome_logs) - Integer num_hidden = 0 - String output = '' - output += 'Typical pipeline command:\n\n' - output += " ${colors.cyan}${command}${colors.reset}\n\n" - Map params_map = paramsLoad(getSchemaPath(workflow, schema_filename=schema_filename)) - Integer max_chars = paramsMaxChars(params_map) + 1 - Integer desc_indent = max_chars + 14 - Integer dec_linewidth = 160 - desc_indent - for (group in params_map.keySet()) { - Integer num_params = 0 - String group_output = colors.underlined + colors.bold + group + colors.reset + '\n' - def group_params = params_map.get(group) // This gets the parameters of that particular group - for (param in group_params.keySet()) { - if (group_params.get(param).hidden && !params.show_hidden_params) { - num_hidden += 1 - continue; - } - def type = '[' + group_params.get(param).type + ']' - def description = group_params.get(param).description - def 
defaultValue = group_params.get(param).default != null ? " [default: " + group_params.get(param).default.toString() + "]" : '' - def description_default = description + colors.dim + defaultValue + colors.reset - // Wrap long description texts - // Loosely based on https://dzone.com/articles/groovy-plain-text-word-wrap - if (description_default.length() > dec_linewidth){ - List olines = [] - String oline = "" // " " * indent - description_default.split(" ").each() { wrd -> - if ((oline.size() + wrd.size()) <= dec_linewidth) { - oline += wrd + " " - } else { - olines += oline - oline = wrd + " " - } - } - olines += oline - description_default = olines.join("\n" + " " * desc_indent) - } - group_output += " --" + param.padRight(max_chars) + colors.dim + type.padRight(10) + colors.reset + description_default + '\n' - num_params += 1 - } - group_output += '\n' - if (num_params > 0){ - output += group_output - } - } - if (num_hidden > 0){ - output += colors.dim + "!! Hiding $num_hidden params, use --show_hidden_params to show them !!\n" + colors.reset - } - output += NfcoreTemplate.dashedLine(params.monochrome_logs) - return output - } - - // - // Groovy Map summarising parameters/workflow options used by the pipeline - // - public static LinkedHashMap paramsSummaryMap(workflow, params, schema_filename='nextflow_schema.json') { - // Get a selection of core Nextflow workflow options - def Map workflow_summary = [:] - if (workflow.revision) { - workflow_summary['revision'] = workflow.revision - } - workflow_summary['runName'] = workflow.runName - if (workflow.containerEngine) { - workflow_summary['containerEngine'] = workflow.containerEngine - } - if (workflow.container) { - workflow_summary['container'] = workflow.container - } - workflow_summary['launchDir'] = workflow.launchDir - workflow_summary['workDir'] = workflow.workDir - workflow_summary['projectDir'] = workflow.projectDir - workflow_summary['userName'] = workflow.userName - workflow_summary['profile'] = workflow.profile - workflow_summary['configFiles'] = workflow.configFiles.join(', ') - - // Get pipeline parameters defined in JSON Schema - def Map params_summary = [:] - def params_map = paramsLoad(getSchemaPath(workflow, schema_filename=schema_filename)) - for (group in params_map.keySet()) { - def sub_params = new LinkedHashMap() - def group_params = params_map.get(group) // This gets the parameters of that particular group - for (param in group_params.keySet()) { - if (params.containsKey(param)) { - def params_value = params.get(param) - def schema_value = group_params.get(param).default - def param_type = group_params.get(param).type - if (schema_value != null) { - if (param_type == 'string') { - if (schema_value.contains('$projectDir') || schema_value.contains('${projectDir}')) { - def sub_string = schema_value.replace('\$projectDir', '') - sub_string = sub_string.replace('\${projectDir}', '') - if (params_value.contains(sub_string)) { - schema_value = params_value - } - } - if (schema_value.contains('$params.outdir') || schema_value.contains('${params.outdir}')) { - def sub_string = schema_value.replace('\$params.outdir', '') - sub_string = sub_string.replace('\${params.outdir}', '') - if ("${params.outdir}${sub_string}" == params_value) { - schema_value = params_value - } - } - } - } - - // We have a default in the schema, and this isn't it - if (schema_value != null && params_value != schema_value) { - sub_params.put(param, params_value) - } - // No default in the schema, and this isn't empty - else if (schema_value == null && 
params_value != "" && params_value != null && params_value != false) { - sub_params.put(param, params_value) - } - } - } - params_summary.put(group, sub_params) - } - return [ 'Core Nextflow options' : workflow_summary ] << params_summary - } - - // - // Beautify parameters for summary and return as string - // - public static String paramsSummaryLog(workflow, params) { - Map colors = NfcoreTemplate.logColours(params.monochrome_logs) - String output = '' - def params_map = paramsSummaryMap(workflow, params) - def max_chars = paramsMaxChars(params_map) - for (group in params_map.keySet()) { - def group_params = params_map.get(group) // This gets the parameters of that particular group - if (group_params) { - output += colors.bold + group + colors.reset + '\n' - for (param in group_params.keySet()) { - output += " " + colors.blue + param.padRight(max_chars) + ": " + colors.green + group_params.get(param) + colors.reset + '\n' - } - output += '\n' - } - } - output += "!! Only displaying parameters that differ from the pipeline defaults !!\n" - output += NfcoreTemplate.dashedLine(params.monochrome_logs) - return output - } - - // - // Loop over nested exceptions and print the causingException - // - private static void printExceptions(ex_json, params_json, log, enums, limit=5) { - def causingExceptions = ex_json['causingExceptions'] - if (causingExceptions.length() == 0) { - def m = ex_json['message'] =~ /required key \[([^\]]+)\] not found/ - // Missing required param - if (m.matches()) { - log.error "* Missing required parameter: --${m[0][1]}" - } - // Other base-level error - else if (ex_json['pointerToViolation'] == '#') { - log.error "* ${ex_json['message']}" - } - // Error with specific param - else { - def param = ex_json['pointerToViolation'] - ~/^#\// - def param_val = params_json[param].toString() - if (enums.containsKey(param)) { - def error_msg = "* --${param}: '${param_val}' is not a valid choice (Available choices" - if (enums[param].size() > limit) { - log.error "${error_msg} (${limit} of ${enums[param].size()}): ${enums[param][0..limit-1].join(', ')}, ... 
)" - } else { - log.error "${error_msg}: ${enums[param].join(', ')})" - } - } else { - log.error "* --${param}: ${ex_json['message']} (${param_val})" - } - } - } - for (ex in causingExceptions) { - printExceptions(ex, params_json, log, enums) - } - } - - // - // Remove an element from a JSONArray - // - private static JSONArray removeElement(json_array, element) { - def list = [] - int len = json_array.length() - for (int i=0;i - if(raw_schema.keySet().contains('definitions')){ - raw_schema.definitions.each { definition -> - for (key in definition.keySet()){ - if (definition[key].get("properties").keySet().contains(ignore_param)){ - // Remove the param to ignore - definition[key].get("properties").remove(ignore_param) - // If the param was required, change this - if (definition[key].has("required")) { - def cleaned_required = removeElement(definition[key].required, ignore_param) - definition[key].put("required", cleaned_required) - } - } - } - } - } - if(raw_schema.keySet().contains('properties') && raw_schema.get('properties').keySet().contains(ignore_param)) { - raw_schema.get("properties").remove(ignore_param) - } - if(raw_schema.keySet().contains('required') && raw_schema.required.contains(ignore_param)) { - def cleaned_required = removeElement(raw_schema.required, ignore_param) - raw_schema.put("required", cleaned_required) - } - } - return raw_schema - } - - // - // Clean and check parameters relative to Nextflow native classes - // - private static Map cleanParameters(params) { - def new_params = params.getClass().newInstance(params) - for (p in params) { - // remove anything evaluating to false - if (!p['value']) { - new_params.remove(p.key) - } - // Cast MemoryUnit to String - if (p['value'].getClass() == nextflow.util.MemoryUnit) { - new_params.replace(p.key, p['value'].toString()) - } - // Cast Duration to String - if (p['value'].getClass() == nextflow.util.Duration) { - new_params.replace(p.key, p['value'].toString().replaceFirst(/d(?!\S)/, "day")) - } - // Cast LinkedHashMap to String - if (p['value'].getClass() == LinkedHashMap) { - new_params.replace(p.key, p['value'].toString()) - } - } - return new_params - } - - // - // This function tries to read a JSON params file - // - private static LinkedHashMap paramsLoad(String json_schema) { - def params_map = new LinkedHashMap() - try { - params_map = paramsRead(json_schema) - } catch (Exception e) { - println "Could not read parameters settings from JSON. $e" - params_map = new LinkedHashMap() - } - return params_map - } - - // - // Method to actually read in JSON file using Groovy. - // Group (as Key), values are all parameters - // - Parameter1 as Key, Description as Value - // - Parameter2 as Key, Description as Value - // .... 
- // Group - // - - private static LinkedHashMap paramsRead(String json_schema) throws Exception { - def json = new File(json_schema).text - def Map schema_definitions = (Map) new JsonSlurper().parseText(json).get('definitions') - def Map schema_properties = (Map) new JsonSlurper().parseText(json).get('properties') - /* Tree looks like this in nf-core schema - * definitions <- this is what the first get('definitions') gets us - group 1 - title - description - properties - parameter 1 - type - description - parameter 2 - type - description - group 2 - title - description - properties - parameter 1 - type - description - * properties <- parameters can also be ungrouped, outside of definitions - parameter 1 - type - description - */ - - // Grouped params - def params_map = new LinkedHashMap() - schema_definitions.each { key, val -> - def Map group = schema_definitions."$key".properties // Gets the property object of the group - def title = schema_definitions."$key".title - def sub_params = new LinkedHashMap() - group.each { innerkey, value -> - sub_params.put(innerkey, value) - } - params_map.put(title, sub_params) - } - - // Ungrouped params - def ungrouped_params = new LinkedHashMap() - schema_properties.each { innerkey, value -> - ungrouped_params.put(innerkey, value) - } - params_map.put("Other parameters", ungrouped_params) - - return params_map - } - - // - // Get maximum number of characters across all parameter names - // - private static Integer paramsMaxChars(params_map) { - Integer max_chars = 0 - for (group in params_map.keySet()) { - def group_params = params_map.get(group) // This gets the parameters of that particular group - for (param in group_params.keySet()) { - if (param.size() > max_chars) { - max_chars = param.size() - } - } - } - return max_chars - } -} diff --git a/lib/NfcoreTemplate.groovy b/lib/NfcoreTemplate.groovy index 25a0a74a..e248e4c3 100755 --- a/lib/NfcoreTemplate.groovy +++ b/lib/NfcoreTemplate.groovy @@ -3,6 +3,8 @@ // import org.yaml.snakeyaml.Yaml +import groovy.json.JsonOutput +import nextflow.extension.FilesEx class NfcoreTemplate { @@ -128,7 +130,7 @@ class NfcoreTemplate { def email_html = html_template.toString() // Render the sendmail template - def max_multiqc_email_size = params.max_multiqc_email_size as nextflow.util.MemoryUnit + def max_multiqc_email_size = (params.containsKey('max_multiqc_email_size') ? 
params.max_multiqc_email_size : 0) as nextflow.util.MemoryUnit def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, projectDir: "$projectDir", mqcFile: mqc_report, mqcMaxSize: max_multiqc_email_size.toBytes() ] def sf = new File("$projectDir/assets/sendmail_template.txt") def sendmail_template = engine.createTemplate(sf).make(smail_fields) @@ -140,12 +142,14 @@ class NfcoreTemplate { try { if (params.plaintext_email) { throw GroovyException('Send plaintext e-mail, not HTML') } // Try to send HTML e-mail using sendmail + def sendmail_tf = new File(workflow.launchDir.toString(), ".sendmail_tmp.html") + sendmail_tf.withWriter { w -> w << sendmail_html } [ 'sendmail', '-t' ].execute() << sendmail_html log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} Sent summary e-mail to $email_address (sendmail)-" } catch (all) { // Catch failures and try with plaintext def mail_cmd = [ 'mail', '-s', subject, '--content-type=text/html', email_address ] - if ( mqc_report.size() <= max_multiqc_email_size.toBytes() ) { + if ( mqc_report != null && mqc_report.size() <= max_multiqc_email_size.toBytes() ) { mail_cmd += [ '-A', mqc_report ] } mail_cmd.execute() << email_html @@ -154,14 +158,16 @@ class NfcoreTemplate { } // Write summary e-mail HTML to a file - def output_d = new File("${params.outdir}/pipeline_info/") - if (!output_d.exists()) { - output_d.mkdirs() - } - def output_hf = new File(output_d, "pipeline_report.html") + def output_hf = new File(workflow.launchDir.toString(), ".pipeline_report.html") output_hf.withWriter { w -> w << email_html } - def output_tf = new File(output_d, "pipeline_report.txt") + FilesEx.copyTo(output_hf.toPath(), "${params.outdir}/pipeline_info/pipeline_report.html"); + output_hf.delete() + + // Write summary e-mail TXT to a file + def output_tf = new File(workflow.launchDir.toString(), ".pipeline_report.txt") output_tf.withWriter { w -> w << email_txt } + FilesEx.copyTo(output_tf.toPath(), "${params.outdir}/pipeline_info/pipeline_report.txt"); + output_tf.delete() } // @@ -222,6 +228,20 @@ class NfcoreTemplate { } } + // + // Dump pipeline parameters in a json file + // + public static void dump_parameters(workflow, params) { + def timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') + def filename = "params_${timestamp}.json" + def temp_pf = new File(workflow.launchDir.toString(), ".${filename}") + def jsonStr = JsonOutput.toJson(params) + temp_pf.text = JsonOutput.prettyPrint(jsonStr) + + FilesEx.copyTo(temp_pf.toPath(), "${params.outdir}/pipeline_info/params_${timestamp}.json") + temp_pf.delete() + } + // // Print pipeline summary on completion // diff --git a/lib/WorkflowCrisprseq.groovy b/lib/WorkflowCrisprseq.groovy index b5404e94..05ea4360 100755 --- a/lib/WorkflowCrisprseq.groovy +++ b/lib/WorkflowCrisprseq.groovy @@ -11,9 +11,37 @@ class WorkflowCrisprseq { // Check and validate parameters // public static void initialise(params, log) { + genomeExistsError(params, log) } + // + // Function to validate channels from input samplesheet + // + public static ArrayList validateInput(input) { + def (metas, fastqs) = input[1..2] + + // Check that multiple runs of the same sample are of the same datatype i.e. single-end / paired-end + def endedness_ok = metas.collect{ it.single_end }.unique().size == 1 + if (!endedness_ok) { + Nextflow.error("Please check input samplesheet -> Multiple runs of a sample must be of the same datatype i.e. 
single-end or paired-end: ${metas[0].id}") + } + + // Check that multiple runs of the same sample contain a reference or not + def reference_ok = metas.collect{ it.self_reference }.unique().size == 1 + if (!reference_ok) { + Nextflow.error("Please check input samplesheet -> Multiple runs of a sample must all contain a reference or not: ${metas[0].id}") + } + + // Check that multiple runs of the same sample contain a template or not + def template_ok = metas.collect{ it.template }.unique().size == 1 + if (!template_ok) { + Nextflow.error("Please check input samplesheet -> Multiple runs of a sample must all contain a template or not: ${metas[0].id}") + } + + return [ metas[0], fastqs ] + } + // // Get workflow summary for MultiQC // @@ -41,15 +69,57 @@ class WorkflowCrisprseq { return yaml_file_text } - public static String methodsDescriptionText(run_workflow, mqc_methods_yaml) { + // + // Generate methods description for MultiQC + // + + public static String toolCitationText(params) { + + // TODO nf-core: Optionally add in-text citation tools to this list. + // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "Tool (Foo et al. 2023)" : "", + // Uncomment function in methodsDescriptionText to render in MultiQC report + def citation_text = [ + "Tools used in the workflow included:", + "FastQC (Andrews 2010),", + "MultiQC (Ewels et al. 2016)", + "." + ].join(' ').trim() + + return citation_text + } + + public static String toolBibliographyText(params) { + + // TODO Optionally add bibliographic entries to this list. + // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "
<li>Author (2023) Pub name, Journal, DOI</li>" : "", + // Uncomment function in methodsDescriptionText to render in MultiQC report + def reference_text = [ + "
<li>Andrews S, (2010) FastQC, URL: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/).</li>", + "
<li>Ewels, P., Magnusson, M., Lundin, S., & Käller, M. (2016). MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics, 32(19), 3047–3048. doi: 10.1093/bioinformatics/btw354</li>" + ].join(' ').trim() + + return reference_text + } + + public static String methodsDescriptionText(run_workflow, mqc_methods_yaml, params) { // Convert to a named map so it can be used with the familiar NXF ${workflow} variable syntax in the MultiQC YML file def meta = [:] meta.workflow = run_workflow.toMap() meta["manifest_map"] = run_workflow.manifest.toMap() + // Pipeline DOI meta["doi_text"] = meta.manifest_map.doi ? "(doi: ${meta.manifest_map.doi})" : "" meta["nodoi_text"] = meta.manifest_map.doi ? "": "
  • If available, make sure to update the text to include the Zenodo DOI of version of the pipeline used.
  • " + // Tool references + meta["tool_citations"] = "" + meta["tool_bibliography"] = "" + + // TODO Only uncomment below if logic in toolCitationText/toolBibliographyText has been filled! + //meta["tool_citations"] = toolCitationText(params).replaceAll(", \\.", ".").replaceAll("\\. \\.", ".").replaceAll(", \\.", ".") + //meta["tool_bibliography"] = toolBibliographyText(params) + + def methods_text = mqc_methods_yaml.text def engine = new SimpleTemplateEngine() diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index 05c17b7f..0e40058b 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -12,47 +12,18 @@ class WorkflowMain { public static String citation(workflow) { return "If you use ${workflow.manifest.name} for your analysis please cite:\n\n" + "* The pipeline\n" + - " https://doi.org/10.5281/zenodo.7598497\n\n" + + " https://doi.org/10.5281/zenodo.7598496\n\n" + "* The nf-core framework\n" + " https://doi.org/10.1038/s41587-020-0439-x\n\n" + "* Software dependencies\n" + " https://github.com/${workflow.manifest.name}/blob/master/CITATIONS.md" } - // - // Generate help string - // - public static String help(workflow, params) { - def command = "nextflow run ${workflow.manifest.name} --input samplesheet.csv --genome GRCh37 -profile docker" - def help_string = '' - help_string += NfcoreTemplate.logo(workflow, params.monochrome_logs) - help_string += NfcoreSchema.paramsHelp(workflow, params, command) - help_string += '\n' + citation(workflow) + '\n' - help_string += NfcoreTemplate.dashedLine(params.monochrome_logs) - return help_string - } - - // - // Generate parameter summary log string - // - public static String paramsSummaryLog(workflow, params) { - def summary_log = '' - summary_log += NfcoreTemplate.logo(workflow, params.monochrome_logs) - summary_log += NfcoreSchema.paramsSummaryLog(workflow, params) - summary_log += '\n' + citation(workflow) + '\n' - summary_log += NfcoreTemplate.dashedLine(params.monochrome_logs) - return summary_log - } // // Validate parameters and print summary to screen // public static void initialise(workflow, params, log) { - // Print help to screen if required - if (params.help) { - log.info help(workflow, params) - System.exit(0) - } // Print workflow version and exit on --version if (params.version) { @@ -61,14 +32,6 @@ class WorkflowMain { System.exit(0) } - // Print parameter summary log to screen - log.info paramsSummaryLog(workflow, params) - - // Validate workflow parameters via the JSON schema - if (params.validate_params) { - NfcoreSchema.validateParameters(workflow, params, log) - } - // Check that a -profile or Nextflow config has been provided to run the pipeline NfcoreTemplate.checkConfigProvided(workflow, log) @@ -79,11 +42,6 @@ class WorkflowMain { // Check AWS batch settings NfcoreTemplate.awsBatch(workflow, params) - - // Check input has been provided - if (!params.input) { - Nextflow.error("Please provide an input samplesheet to the pipeline e.g. '--input samplesheet.csv'") - } } // // Get attribute from genome config file e.g. 
fasta diff --git a/main.nf b/main.nf index 9c6d0721..16c212bb 100644 --- a/main.nf +++ b/main.nf @@ -25,6 +25,22 @@ params.reference_fasta = params.reference_fasta ?: WorkflowMain.getGenomeAttribu ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ +include { validateParameters; paramsHelp } from 'plugin/nf-validation' + +// Print help message if needed +if (params.help) { + def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs) + def citation = '\n' + WorkflowMain.citation(workflow) + '\n' + def String command = "nextflow run ${workflow.manifest.name} --input samplesheet.csv --genome GRCh37 -profile docker" + log.info logo + paramsHelp(command) + citation + NfcoreTemplate.dashedLine(params.monochrome_logs) + System.exit(0) +} + +// Validate input parameters +if (params.validate_params) { + validateParameters() +} + WorkflowMain.initialise(workflow, params, log) /* diff --git a/modules.json b/modules.json index 22939a8d..e396008c 100644 --- a/modules.json +++ b/modules.json @@ -102,11 +102,6 @@ "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, - "samtools/faidx": { - "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", - "installed_by": ["modules"] - }, "samtools/index": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", diff --git a/modules/local/alignment_summary.nf b/modules/local/alignment_summary.nf index 2805cd92..1947812c 100644 --- a/modules/local/alignment_summary.nf +++ b/modules/local/alignment_summary.nf @@ -11,7 +11,8 @@ process ALIGNMENT_SUMMARY { tuple val(meta), path(reads), path(summary) output: - tuple val(meta), path(summary), emit: summary + tuple val(meta), path("*_alignment_summary.csv"), emit: summary + path "versions.yml", emit: versions when: task.ext.when == null || task.ext.when diff --git a/modules/local/bagel2/bf.nf b/modules/local/bagel2/bf.nf new file mode 100644 index 00000000..3ff86603 --- /dev/null +++ b/modules/local/bagel2/bf.nf @@ -0,0 +1,38 @@ +process BAGEL2_BF { + tag "$meta.treatment" + label 'process_single' + + + conda "python=3.11.4 pandas=2.0.3 numpy=1.25.1 scikit-learn=1.3.0 click=8.1.6" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/mulled-v2-1ec3f69e7819b1ab3e6f57d16594eb40ed7d6792:f94a27287a1921ce0dacd411d48acff738d3ca90-0': + 'biocontainers/mulled-v2-1ec3f69e7819b1ab3e6f57d16594eb40ed7d6792:f94a27287a1921ce0dacd411d48acff738d3ca90-0' }" + + + input: + tuple val(meta), path(foldchange) + path(reference_essentials) + path(reference_nonessentials) + + output: + tuple val(meta), path("*.bf"), emit: bf + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.treatment}" + + """ + BAGEL.py bf -i $foldchange -o '${meta.treatment}_vs_${meta.reference}.bf' $args -e $reference_essentials -n $reference_nonessentials -c ${meta.treatment} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + BAGEL2: \$( BAGEL.py version | grep -o 'Version: [0-9.]*' | awk '{print \$2}' | grep -v '^\$') + END_VERSIONS + """ + +} diff --git a/modules/local/bagel2/fc.nf b/modules/local/bagel2/fc.nf new file mode 100644 index 00000000..b65afaea --- /dev/null +++ b/modules/local/bagel2/fc.nf @@ -0,0 +1,34 @@ +process BAGEL2_FC { + tag "${meta.treatment}_${meta.reference}" + label 'process_single' + + conda "python=3.11.4 pandas=2.0.3 numpy=1.25.1 scikit-learn=1.3.0 click=8.1.6" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-1ec3f69e7819b1ab3e6f57d16594eb40ed7d6792:f94a27287a1921ce0dacd411d48acff738d3ca90-0': + 'biocontainers/mulled-v2-1ec3f69e7819b1ab3e6f57d16594eb40ed7d6792:f94a27287a1921ce0dacd411d48acff738d3ca90-0' }" + + input: + tuple val(meta), path(count_table) + + output: + tuple val(meta), path("*.foldchange") , emit: foldchange + tuple val(meta), path("*.normed_readcount") , emit: normed_counts + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + BAGEL.py fc -i $count_table -o ${meta.treatment}_vs_${meta.reference} -c $meta.reference $args + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + BAGEL2: \$( BAGEL.py version | grep -o 'Version: [0-9.]*' | awk '{print \$2}' | grep -v '^\$') + END_VERSIONS + """ + +} diff --git a/modules/local/bagel2/graph.nf b/modules/local/bagel2/graph.nf new file mode 100644 index 00000000..32c96046 --- /dev/null +++ b/modules/local/bagel2/graph.nf @@ -0,0 +1,73 @@ +process BAGEL2_GRAPH { + tag "${meta.treatment}_${meta.reference}" + label 'process_single' + + conda "python=3.11.4 pandas=2.0.3 numpy=1.25.1 scikit-learn=1.3.0 click=8.1.6 matplotlib=3.7.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-54e0353146eca1531516863e8235bf7385d76663:c9ff1a9eec871c54cbea815eae778da702623978-0': + 'biocontainers/mulled-v2-54e0353146eca1531516863e8235bf7385d76663:c9ff1a9eec871c54cbea815eae778da702623978-0' }" + + input: + tuple val(meta), path(pr) + + output: + path("*.png") , emit: pictures + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + #!/usr/bin/env python3 + + #### author: Laurence Kuhlburger + #### Released under the MIT license. 
See git repository (https://github.com/nf-core/crisprseq) for full license text. + #### + #### Orient a reference sequence according to reads orientation. + + import pandas as pd + import matplotlib.pyplot as plt + + pr_data = pd.read_table('${pr}', index_col=0) + + plt.plot(pr_data.Recall, pr_data.Precision, linewidth=2) + plt.xlim(0, 1.01) + plt.ylim(0, 1.02) + plt.xlabel('Recall') + plt.ylabel('Precision (1-FDR)') + plt.title('Precision-Recall Plot') + + # Save the plot to a PNG file + file_name = 'PR_plot_{}_vs_{}.png'.format('${meta.treatment}', '${meta.reference}') + + plt.savefig(file_name) + + # Show the plot (optional) + plt.show() + + pr_data.hist('BF', bins=50, range=(-100,100)) + plt.xlabel('Bayes Factor') + plt.ylabel('Number of Genes') + + file_name = 'barplot_{}_vs_{}.png'.format('${meta.treatment}', '${meta.reference}') + + plt.savefig(file_name) + plt.show() + + # Output version information + version = pd. __version__ + matplotlib_version = plt.matplotlib.__version__ + # alas, no `pyyaml` pre-installed in the cellranger container + with open("versions.yml", "w") as f: + f.write('"${task.process}":\\n') + f.write(f' pandas: "{version}"\\n') + f.write(f' matplotlib.pyplot: "{matplotlib_version}"\\n') + + """ + + +} diff --git a/modules/local/bagel2/pr.nf b/modules/local/bagel2/pr.nf new file mode 100644 index 00000000..e27366d6 --- /dev/null +++ b/modules/local/bagel2/pr.nf @@ -0,0 +1,35 @@ +process BAGEL2_PR { + tag "$meta.treatment" + label 'process_single' + + conda "python=3.11.4 pandas=2.0.3 numpy=1.25.1 scikit-learn=1.3.0 click=8.1.6" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-1ec3f69e7819b1ab3e6f57d16594eb40ed7d6792:f94a27287a1921ce0dacd411d48acff738d3ca90-0': + 'biocontainers/mulled-v2-1ec3f69e7819b1ab3e6f57d16594eb40ed7d6792:f94a27287a1921ce0dacd411d48acff738d3ca90-0' }" + + + input: + tuple val(meta), path(bf), path(reference_essentials), path(reference_nonessentials) + + output: + tuple val(meta), path("*.pr") , emit: pr + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + BAGEL.py pr -i $bf -o '${meta.treatment}_vs_${meta.reference}.pr' -e $reference_essentials -n $reference_nonessentials $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + BAGEL2: \$( BAGEL.py version | grep -o 'Version: [0-9.]*' | awk '{print \$2}' | grep -v '^\$') + END_VERSIONS + """ + +} diff --git a/modules/local/cigar_parser.nf b/modules/local/cigar_parser.nf index 8c87314a..45dfd7d5 100644 --- a/modules/local/cigar_parser.nf +++ b/modules/local/cigar_parser.nf @@ -11,9 +11,11 @@ process CIGAR_PARSER { tuple val(meta), path(reads), path(index), path(reference), val(protospacer), path(template), path(template_bam), path(reference_template), path(summary) output: - tuple val(meta), path("*indels.csv"), path("*_subs-perc.csv"), emit: indels + tuple val(meta), path("*[!QC-]indels.csv"), path("*_subs-perc.csv"), emit: indels tuple val(meta), path("*.html"), path("*edits.csv") , emit: edition tuple val(meta), path("*cutSite.json") , emit: cutsite + tuple val(meta), path("*_QC-indels.csv") , emit: qcindels + tuple val(meta), path("*_reads-summary.csv") , emit: processing path "versions.yml" , emit: versions when: @@ -39,7 +41,13 @@ process CIGAR_PARSER { cat <<-END_VERSIONS > 
versions.yml "${task.process}": - R: \$(R --version) + seqinr: \$(Rscript -e "cat(paste(packageVersion('seqinr'), collapse='.'))") + Rsamtools: \$(Rscript -e "cat(paste(packageVersion('Rsamtools'), collapse='.'))") + dplyr: \$(Rscript -e "cat(paste(packageVersion('dplyr'), collapse='.'))") + ShortRead: \$(Rscript -e "cat(paste(packageVersion('ShortRead'), collapse='.'))") + jsonlite: \$(Rscript -e "cat(paste(packageVersion('jsonlite'), collapse='.'))") + stringr: \$(Rscript -e "cat(paste(packageVersion('stringr'), collapse='.'))") + plotly: \$(Rscript -e "cat(paste(packageVersion('plotly'), collapse='.'))") END_VERSIONS """ } diff --git a/modules/local/clustering_summary.nf b/modules/local/clustering_summary.nf index a134bdaf..a48fa270 100644 --- a/modules/local/clustering_summary.nf +++ b/modules/local/clustering_summary.nf @@ -11,7 +11,9 @@ process CLUSTERING_SUMMARY { tuple val(meta), path(reads), path(summary) output: - tuple val(meta), path(summary), emit: summary + tuple val(meta), path("*_clustering_summary.csv"), emit: summary + path "versions.yml", emit: versions + when: task.ext.when == null || task.ext.when diff --git a/modules/local/crisprseq_plotter.nf b/modules/local/crisprseq_plotter.nf new file mode 100644 index 00000000..c3b0ed62 --- /dev/null +++ b/modules/local/crisprseq_plotter.nf @@ -0,0 +1,53 @@ +process CRISPRSEQ_PLOTTER { + tag "$meta.id" + label 'process_medium' + + conda 'r-ggplot2=3.4.3 bioconductor-shortread=1.58.0 r-ggpubr=0.6.0 r-ggmsa=1.0.2 r-seqmagick=0.1.6 r-tidyr=1.3.0 r-ggseqlogo=0.1 r-cowplot=1.1.1 r-seqinr=4.2_30 r-optparse=1.7.3 r-dplyr=1.1.2 r-plyr=1.8.8 r-stringr=1.5.0 r-plotly=4.10.2' + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-6de07928379e6eface08a0019c4a1d6b5192e805:0d77388f37ddd923a087f7792e30e83ab54c918c-0' : + 'biocontainers/mulled-v2-6de07928379e6eface08a0019c4a1d6b5192e805:0d77388f37ddd923a087f7792e30e83ab54c918c-0' }" + + input: + tuple val(meta), path(indels), path(substitutions), path(reference), val(protospacer) + + output: + tuple val(meta), path("*_Deletions.html"), path("*_delAlleles_plot.png") , emit: deletions + tuple val(meta), path("*_Insertions.html") , emit: insertions + tuple val(meta), path("*_accumulative.html") , emit: accumulative + tuple val(meta), path("*_subs-perc_plot.png"), path("*_subs-perc_plot_LOGO.png"), emit: substitutions + tuple val(meta), path("*_top-alleles_LOGO.png"), path("*_top.html") , emit: topalleles + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + plotter.R \\ + $args \\ + --indels_info=$indels \\ + --reference=$reference \\ + --gRNA_sequence=$protospacer \\ + --sample_name=$prefix \\ + --substitutions_info=$substitutions + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + ggplot2: \$(Rscript -e "cat(paste(packageVersion('ggplot2'), collapse='.'))") + ShortRead: \$(Rscript -e "cat(paste(packageVersion('ShortRead'), collapse='.'))") + plyr: \$(Rscript -e "cat(paste(packageVersion('plyr'), collapse='.'))") + dplyr: \$(Rscript -e "cat(paste(packageVersion('dplyr'), collapse='.'))") + seqinr: \$(Rscript -e "cat(paste(packageVersion('seqinr'), collapse='.'))") + ggpubr: \$(Rscript -e "cat(paste(packageVersion('ggpubr'), collapse='.'))") + ggmsa: \$(Rscript -e "cat(paste(packageVersion('ggmsa'), collapse='.'))") + seqmagick: \$(Rscript -e 
"cat(paste(packageVersion('seqmagick'), collapse='.'))") + stringr: \$(Rscript -e "cat(paste(packageVersion('stringr'), collapse='.'))") + tidyr: \$(Rscript -e "cat(paste(packageVersion('tidyr'), collapse='.'))") + ggseqlogo: \$(Rscript -e "cat(paste(packageVersion('ggseqlogo'), collapse='.'))") + plotly: \$(Rscript -e "cat(paste(packageVersion('plotly'), collapse='.'))") + cowplot: \$(Rscript -e "cat(paste(packageVersion('cowplot'), collapse='.'))") + END_VERSIONS + """ +} diff --git a/modules/local/mageck/graphrra.nf b/modules/local/mageck/graphrra.nf new file mode 100644 index 00000000..69f48ae7 --- /dev/null +++ b/modules/local/mageck/graphrra.nf @@ -0,0 +1,65 @@ +process MAGECK_GRAPHRRA { + tag "$meta.treatment" + label 'process_single' + + conda "bioconda::bioconductor-mageckflute=2.2.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mageckflute:2.2.0--r42hdfd78af_0': + 'biocontainers/bioconductor-mageckflute:2.2.0--r42hdfd78af_0' }" + + input: + tuple val(meta), path(gene_summary) + + output: + tuple val(meta), path("*.png"), emit: graphs + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + #!/usr/bin/env Rscript + #### author: Laurence Kuhlburger + #### Released under the MIT license. See git repository (https://github.com/nf-core/crisprseq) for full license text. + #### + #### Orient a reference sequence according to reads orientation. + + library(MAGeCKFlute) + library(ggplot2) + options(ggrepel.max.overlaps = Inf) + + gdata = ReadRRA("$gene_summary") + gdata <- transform(gdata, LogFDR = -log10(FDR)) + png(filename = paste0("$meta.treatment","_vs_","$meta.reference","_scatterview.png"), width = 6, height = 4, units = "in", res = 300) + p1 = ScatterView(gdata, x = "Score", y = "LogFDR", label = "id", + model = "volcano", top = 5) + print(p1) + dev.off() + + gdata <- transform(gdata, Rank = rank(Score)) + png(filename = paste0("$meta.treatment","_vs_","$meta.reference","_rank.png"), width = 6, height = 4, units = "in", res = 300) + p1 = ScatterView(gdata, x = "Rank", y = "Score", label = "id", + top = 5, auto_cut_y = TRUE, ylab = "Log2FC", + groups = c("top", "bottom")) + print(p1) + dev.off() + + version_file_path <- "versions.yml" + version_flute <- paste(unlist(packageVersion("MAGeCKFlute")), collapse = ".") + version_ggplot <- paste(unlist(packageVersion("ggplot2")), collapse = ".") + + f <- file(version_file_path, "w") + writeLines('"${task.process}":', f) + writeLines(" MAGeCKFlute: ", f, sep = "") + writeLines(version_flute, f) + writeLines(" ggplot2: ", f, sep = "") + writeLines(version_ggplot, f) + close(f) + """ + + +} diff --git a/modules/local/merging_summary.nf b/modules/local/preprocessing_summary.nf similarity index 70% rename from modules/local/merging_summary.nf rename to modules/local/preprocessing_summary.nf index dcd31e04..f1aafa7c 100644 --- a/modules/local/merging_summary.nf +++ b/modules/local/preprocessing_summary.nf @@ -1,4 +1,4 @@ -process MERGING_SUMMARY { +process PREPROCESSING_SUMMARY { tag "$meta.id" label 'process_single' @@ -12,11 +12,13 @@ process MERGING_SUMMARY { output: - tuple val(meta), path("*_summary.csv"), emit: summary + tuple val(meta), path("*_preprocessing_summary.csv"), emit: summary + path "versions.yml", emit: versions + when: task.ext.when == null || task.ext.when script: - template 
'merging_summary.py' + template 'preprocessing_summary.py' } diff --git a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf deleted file mode 100644 index ec0e11b2..00000000 --- a/modules/local/samplesheet_check.nf +++ /dev/null @@ -1,31 +0,0 @@ -process SAMPLESHEET_CHECK { - tag "$samplesheet" - label 'process_single' - - conda "conda-forge::python=3.8.3" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/python:3.8.3' : - 'biocontainers/python:3.8.3' }" - - input: - path samplesheet - - output: - path '*.csv' , emit: csv - path "versions.yml", emit: versions - - when: - task.ext.when == null || task.ext.when - - script: // This script is bundled with the pipeline, in nf-core/crisprseq/bin/ - """ - check_samplesheet.py \\ - $samplesheet \\ - samplesheet.valid.csv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python --version | sed 's/Python //g') - END_VERSIONS - """ -} diff --git a/modules/local/samplesheet_check_screening.nf b/modules/local/samplesheet_check_screening.nf deleted file mode 100644 index 69b887f6..00000000 --- a/modules/local/samplesheet_check_screening.nf +++ /dev/null @@ -1,28 +0,0 @@ -process SAMPLESHEET_CHECK_SCREENING { - tag "$samplesheet" - label 'process_single' - - conda "conda-forge::python=3.8.3" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/python:3.8.3' : - 'biocontainers/python:3.8.3' }" - - input: - path samplesheet - - output: - path '*.csv' , emit: csv - path "versions.yml", emit: versions - - script: // This script is bundled with the pipeline, in nf-core/test/bin/ - """ - check_samplesheet_screening.py \\ - $samplesheet \\ - samplesheet.valid.csv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - python: \$(python --version | sed 's/Python //g') - END_VERSIONS - """ -} diff --git a/modules/local/seq_to_file.nf b/modules/local/seq_to_file.nf deleted file mode 100644 index 6dfc0022..00000000 --- a/modules/local/seq_to_file.nf +++ /dev/null @@ -1,33 +0,0 @@ -process SEQ_TO_FILE { - tag "$meta.id" - label 'process_single' - - conda "conda-forge::p7zip==16.02" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/p7zip:16.02' : - 'biocontainers/p7zip:16.02' }" - - input: - tuple val(meta), val(sequence) - val(type) - - output: - tuple val(meta), path('*.fasta') , emit: file - path "versions.yml" , emit: versions - - when: - sequence != null - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - """ - echo ">${meta.id}" > ${meta.id}_${type}.fasta - echo $sequence >> ${meta.id}_${type}.fasta - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - 7za: \$(echo \$(7za --help) | sed 's/.*p7zip Version //; s/(.*//') - END_VERSIONS - """ -} diff --git a/modules/local/template_reference.nf b/modules/local/template_reference.nf index 12c24553..28fcfd45 100644 --- a/modules/local/template_reference.nf +++ b/modules/local/template_reference.nf @@ -29,7 +29,7 @@ process TEMPLATE_REFERENCE { cat <<-END_VERSIONS > versions.yml "${task.process}": - R: \$(R --version) + Rscript: \$(Rscript --version) END_VERSIONS """ } diff --git a/modules/nf-core/mageck/count/mageck-count.diff b/modules/nf-core/mageck/count/mageck-count.diff index 50ee78b1..53659a20 100644 --- a/modules/nf-core/mageck/count/mageck-count.diff +++ b/modules/nf-core/mageck/count/mageck-count.diff @@ -1,26 +1,54 @@ Changes in module 'nf-core/mageck/count' --- modules/nf-core/mageck/count/main.nf +++ modules/nf-core/mageck/count/main.nf -@@ -12,8 +12,10 @@ +@@ -1,6 +1,6 @@ + process MAGECK_COUNT { + tag "$meta.id" +- label 'process_medium' ++ label 'process_high' + + conda "bioconda::mageck=0.5.9" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? +@@ -12,8 +12,12 @@ path(library) output: - tuple val(meta), path("*count*.txt"), emit: count + tuple val(meta), path("*count.txt"), emit: count tuple val(meta), path("*.count_normalized.txt"), emit: norm ++ tuple val(meta), path("*.countsummary.txt"), emit: summary ++ tuple val(meta), path("*.count_normalized.txt"), emit: normalized + tuple val(meta), path("*.log"), emit: logs + path "versions.yml" , emit: versions when: -@@ -22,7 +24,7 @@ +@@ -22,9 +26,15 @@ script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - def input_file = ("$inputfile".endsWith(".fastq.gz")) ? "--fastq ${inputfile}" : "-k ${inputfile}" -+ def input_file = ("$inputfile".endsWith(".fastq.gz") || "$inputfile".endsWith(".fq.gz")) ? "--fastq ${inputfile}" : "-k ${inputfile}" ++ // def input_file = ("$inputfile".endsWith(".fastq.gz") || "$inputfile".endsWith(".fq.gz")) ? "--fastq ${inputfile}" : "-k ${inputfile}" def sample_label = ("$inputfile".endsWith(".fastq.gz") || "$inputfile".endsWith(".fq.gz")) ? 
"--sample-label ${meta.id}" : '' - +- ++ ++ if (meta.single_end && ("$inputfile".endsWith(".fastq.gz") || "$inputfile".endsWith(".fq.gz"))) { ++ input = "--fastq ${inputfile}" ++ } else { ++ input = "--fastq ${inputfile[0]} --fastq-2 ${inputfile[1]}" ++ } ++ """ + mageck \\ + count \\ +@@ -32,7 +42,7 @@ + -l $library \\ + -n $prefix \\ + $sample_label \\ +- $input_file \\ ++ $input + + + cat <<-END_VERSIONS > versions.yml ************************************************************ diff --git a/modules/nf-core/mageck/count/main.nf b/modules/nf-core/mageck/count/main.nf index 30fbd8fe..f47ad55e 100644 --- a/modules/nf-core/mageck/count/main.nf +++ b/modules/nf-core/mageck/count/main.nf @@ -1,6 +1,6 @@ process MAGECK_COUNT { tag "$meta.id" - label 'process_medium' + label 'process_high' conda "bioconda::mageck=0.5.9" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? @@ -12,11 +12,12 @@ process MAGECK_COUNT { path(library) output: - tuple val(meta), path("*count.txt"), emit: count + tuple val(meta), path("*count.txt"), emit: count tuple val(meta), path("*.count_normalized.txt"), emit: norm - tuple val(meta), path("*.log"), emit: logs - - path "versions.yml" , emit: versions + tuple val(meta), path("*.countsummary.txt"), emit: summary + tuple val(meta), path("*.count_normalized.txt"), emit: normalized + tuple val(meta), path("*.log"), emit: logs + path "versions.yml", emit: versions when: task.ext.when == null || task.ext.when @@ -24,9 +25,15 @@ process MAGECK_COUNT { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - def input_file = ("$inputfile".endsWith(".fastq.gz") || "$inputfile".endsWith(".fq.gz")) ? "--fastq ${inputfile}" : "-k ${inputfile}" + // def input_file = ("$inputfile".endsWith(".fastq.gz") || "$inputfile".endsWith(".fq.gz")) ? "--fastq ${inputfile}" : "-k ${inputfile}" def sample_label = ("$inputfile".endsWith(".fastq.gz") || "$inputfile".endsWith(".fq.gz")) ? "--sample-label ${meta.id}" : '' - + + if (meta.single_end && ("$inputfile".endsWith(".fastq.gz") || "$inputfile".endsWith(".fq.gz"))) { + input = "--fastq ${inputfile}" + } else { + input = "--fastq ${inputfile[0]} --fastq-2 ${inputfile[1]}" + } + """ mageck \\ count \\ @@ -34,7 +41,7 @@ process MAGECK_COUNT { -l $library \\ -n $prefix \\ $sample_label \\ - $input_file \\ + $input cat <<-END_VERSIONS > versions.yml diff --git a/modules/nf-core/mageck/mle/mageck-mle.diff b/modules/nf-core/mageck/mle/mageck-mle.diff index b635b553..cba65759 100644 --- a/modules/nf-core/mageck/mle/mageck-mle.diff +++ b/modules/nf-core/mageck/mle/mageck-mle.diff @@ -1,6 +1,14 @@ Changes in module 'nf-core/mageck/mle' --- modules/nf-core/mageck/mle/main.nf +++ modules/nf-core/mageck/mle/main.nf +@@ -1,6 +1,6 @@ + process MAGECK_MLE { + tag "$meta.id" +- label 'process_medium' ++ label 'process_high' + + conda "bioconda::mageck=0.5.9" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? @@ -8,8 +8,7 @@ 'biocontainers/mageck:0.5.9--py37h6bb024c_0' }" diff --git a/modules/nf-core/mageck/mle/main.nf b/modules/nf-core/mageck/mle/main.nf index 626c410d..9c744afe 100644 --- a/modules/nf-core/mageck/mle/main.nf +++ b/modules/nf-core/mageck/mle/main.nf @@ -1,6 +1,6 @@ process MAGECK_MLE { tag "$meta.id" - label 'process_medium' + label 'process_high' conda "bioconda::mageck=0.5.9" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
diff --git a/modules/nf-core/mageck/test/mageck-test.diff b/modules/nf-core/mageck/test/mageck-test.diff index ee8558c3..9da13d3b 100644 --- a/modules/nf-core/mageck/test/mageck-test.diff +++ b/modules/nf-core/mageck/test/mageck-test.diff @@ -8,17 +8,16 @@ Changes in module 'nf-core/mageck/test' label 'process_medium' conda "bioconda::mageck=0.5.9" -@@ -14,6 +14,9 @@ +@@ -14,6 +14,8 @@ tuple val(meta), path("*.gene_summary.txt") , emit: gene_summary tuple val(meta), path("*.sgrna_summary.txt") , emit: sgrna_summary tuple val(meta), path("*.R") , emit: r_script -+ tuple val(meta), path("*.Rmd") , emit: r_report + tuple val(meta), path("*.Rnw") , emit: r_summary + tuple val(meta), path("*.log") , emit: logs path "versions.yml" , emit: versions when: -@@ -21,14 +24,18 @@ +@@ -21,14 +23,18 @@ script: def args = task.ext.args ?: '' @@ -38,10 +37,5 @@ Changes in module 'nf-core/mageck/test' cat <<-END_VERSIONS > versions.yml -@@ -36,4 +43,4 @@ - mageck: \$(mageck -v) - END_VERSIONS - """ --} -+} + ************************************************************ diff --git a/modules/nf-core/mageck/test/main.nf b/modules/nf-core/mageck/test/main.nf index 420edcc1..78d5f58c 100644 --- a/modules/nf-core/mageck/test/main.nf +++ b/modules/nf-core/mageck/test/main.nf @@ -14,7 +14,6 @@ process MAGECK_TEST { tuple val(meta), path("*.gene_summary.txt") , emit: gene_summary tuple val(meta), path("*.sgrna_summary.txt") , emit: sgrna_summary tuple val(meta), path("*.R") , emit: r_script - tuple val(meta), path("*.Rmd") , emit: r_report tuple val(meta), path("*.Rnw") , emit: r_summary tuple val(meta), path("*.log") , emit: logs path "versions.yml" , emit: versions diff --git a/modules/nf-core/samtools/faidx/main.nf b/modules/nf-core/samtools/faidx/main.nf deleted file mode 100644 index 4dd0e5b0..00000000 --- a/modules/nf-core/samtools/faidx/main.nf +++ /dev/null @@ -1,44 +0,0 @@ -process SAMTOOLS_FAIDX { - tag "$fasta" - label 'process_single' - - conda "bioconda::samtools=1.17" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : - 'biocontainers/samtools:1.17--h00cdaf9_0' }" - - input: - tuple val(meta), path(fasta) - - output: - tuple val(meta), path ("*.fai"), emit: fai - tuple val(meta), path ("*.gzi"), emit: gzi, optional: true - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - """ - samtools \\ - faidx \\ - $args \\ - $fasta - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') - END_VERSIONS - """ - - stub: - """ - touch ${fasta}.fai - cat <<-END_VERSIONS > versions.yml - - "${task.process}": - samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') - END_VERSIONS - """ -} diff --git a/modules/nf-core/samtools/faidx/meta.yml b/modules/nf-core/samtools/faidx/meta.yml deleted file mode 100644 index fe2fe9a1..00000000 --- a/modules/nf-core/samtools/faidx/meta.yml +++ /dev/null @@ -1,47 +0,0 @@ -name: samtools_faidx -description: Index FASTA file -keywords: - - index - - fasta -tools: - - samtools: - description: | - SAMtools is a set of utilities for interacting with and post-processing - short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. - These files are generated as output by short read aligners like BWA. 
- homepage: http://www.htslib.org/ - documentation: http://www.htslib.org/doc/samtools.html - doi: 10.1093/bioinformatics/btp352 - licence: ["MIT"] -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - fasta: - type: file - description: FASTA file - pattern: "*.{fa,fasta}" -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - fai: - type: file - description: FASTA index file - pattern: "*.{fai}" - - gzi: - type: file - description: Optional gzip index file for compressed inputs - pattern: "*.gzi" - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" -authors: - - "@drpatelh" - - "@ewels" - - "@phue" diff --git a/nextflow.config b/nextflow.config index 05bda0dd..5f898729 100644 --- a/nextflow.config +++ b/nextflow.config @@ -21,6 +21,8 @@ params { count_table = null min_reads = 30 min_targeted_genes = 3 + bagel_reference_essentials = 'https://raw.githubusercontent.com/hart-lab/bagel/master/CEGv2.txt' + bagel_reference_nonessentials = 'https://raw.githubusercontent.com/hart-lab/bagel/master/NEGv1.txt' // Pipeline steps overrepresented = false @@ -50,7 +52,6 @@ params { // Boilerplate options outdir = null - tracedir = "${params.outdir}/pipeline_info" publish_dir_mode = 'copy' email = null email_on_fail = null @@ -59,18 +60,14 @@ params { hook_url = null help = false version = false - validate_params = true - show_hidden_params = false - schema_ignore_params = 'genomes' - // Config options + config_profile_name = null + config_profile_description = null custom_config_version = 'master' custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" - config_profile_description = null config_profile_contact = null config_profile_url = null - config_profile_name = null // Max resource options @@ -79,6 +76,13 @@ params { max_cpus = 16 max_time = '240.h' + // Schema validation default options + validationFailUnrecognisedParams = false + validationLenientMode = false + validationSchemaIgnoreParams = 'genomes,igenomes_base' + validationShowHiddenParams = false + validate_params = true + } // Load base.config by default for all pipelines @@ -98,13 +102,11 @@ try { // } catch (Exception e) { // System.err.println("WARNING: Could not load nf-core/config/crisprseq profiles: ${params.custom_config_base}/pipeline/crisprseq.config") // } - - profiles { debug { dumpHashes = true process.beforeScript = 'echo $HOSTNAME' - cleanup = false + cleanup = false } conda { conda.enabled = true @@ -177,6 +179,7 @@ profiles { } apptainer { apptainer.enabled = true + apptainer.autoMounts = true conda.enabled = false docker.enabled = false singularity.enabled = false @@ -186,8 +189,8 @@ profiles { } gitpod { executor.name = 'local' - executor.cpus = 16 - executor.memory = 60.GB + executor.cpus = 4 + executor.memory = 8.GB } test { includeConfig 'conf/test_targeted.config' } test_targeted { includeConfig 'conf/test_targeted.config' } @@ -197,6 +200,18 @@ profiles { test_screening { includeConfig 'conf/test_screening.config' } } +// Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile +// Will not be used unless Apptainer / Docker / Podman / Singularity are enabled +// Set to your registry if you have a mirror of containers +apptainer.registry = 'quay.io' +docker.registry = 'quay.io' +podman.registry = 'quay.io' +singularity.registry = 
'quay.io' + +// Nextflow plugins +plugins { + id 'nf-validation' // Validation of pipeline parameters and creation of an input channel from a sample sheet +} // Load igenomes.config if required if (!params.igenomes_ignore) { @@ -204,8 +219,6 @@ if (!params.igenomes_ignore) { } else { params.genomes = [:] } - - // Export these variables to prevent local Python/R libraries from conflicting with those in the container // The JULIA depot path has been adjusted to a fixed path `/usr/local/share/julia` that needs to be used for packages in the container. // See https://apeltzer.github.io/post/03-julia-lang-nextflow/ for details on that. Once we have a common agreement on where to keep Julia packages, this is adjustable. @@ -230,30 +243,30 @@ singularity.registry = 'quay.io' def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') timeline { enabled = true - file = "${params.tracedir}/execution_timeline_${trace_timestamp}.html" + file = "${params.outdir}/pipeline_info/execution_timeline_${trace_timestamp}.html" } report { enabled = true - file = "${params.tracedir}/execution_report_${trace_timestamp}.html" + file = "${params.outdir}/pipeline_info/execution_report_${trace_timestamp}.html" } trace { enabled = true - file = "${params.tracedir}/execution_trace_${trace_timestamp}.txt" + file = "${params.outdir}/pipeline_info/execution_trace_${trace_timestamp}.txt" } dag { enabled = true - file = "${params.tracedir}/pipeline_dag_${trace_timestamp}.html" + file = "${params.outdir}/pipeline_info/pipeline_dag_${trace_timestamp}.html" } manifest { name = 'nf-core/crisprseq' - author = """mirpedrol""" + author = """Júlia Mir Pedrol, Laurence Kuhlburger""" homePage = 'https://github.com/nf-core/crisprseq' - description = """Pipeline for the analysis of crispr data""" + description = """Pipeline for the analysis of CRISPR data""" mainScript = 'main.nf' nextflowVersion = '!>=23.04.0' - version = '2.0.0' - doi = 'https://doi.org/10.5281/zenodo.7598497' + version = '2.1.0' + doi = 'https://doi.org/10.5281/zenodo.7598496' } // Load modules.config for DSL2 module specific options diff --git a/nextflow_schema.json b/nextflow_schema.json index f599d796..286554b9 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -2,7 +2,7 @@ "$schema": "http://json-schema.org/draft-07/schema", "$id": "https://raw.githubusercontent.com/nf-core/crisprseq/master/nextflow_schema.json", "title": "nf-core/crisprseq pipeline parameters", - "description": "Pipeline for the analysis of crispr data", + "description": "Pipeline for the analysis of CRISPR data", "type": "object", "definitions": { "input_output_options": { @@ -15,9 +15,10 @@ "input": { "type": "string", "format": "file-path", + "exists": true, + "schema": "assets/schema_input.json", "mimetype": "text/csv", "pattern": "^\\S+\\.csv$", - "schema": "assets/schema_input.json", "description": "Path to comma-separated file containing information about the samples in the experiment.", "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. 
See [usage docs](https://nf-co.re/crisprseq/usage#samplesheet-input).", "fa_icon": "fas fa-file-csv" @@ -151,20 +152,32 @@ "properties": { "mle_design_matrix": { "type": "string", + "format": "file-path", + "exists": true, "description": "Design matrix used for MAGeCK MLE to call essential genes under multiple conditions while considering sgRNA knockout efficiency" }, "rra_contrasts": { "type": "string", + "format": "file-path", + "exists": true, "description": "Comma-separated file with the conditions to be compared. The first one will be the reference (control)" }, "count_table": { "type": "string", + "format": "file-path", + "pattern": "^\\S+\\.(tsv|txt)$", + "mimetype": "text/tsv", + "exists": true, "description": "Please provide your count table if the mageck test should be skipped." }, "library": { "type": "string", + "format": "file-path", + "pattern": "^\\S+\\.(tsv|txt)$", + "mimetype": "text/tsv", + "exists": true, "fa_icon": "far fa-address-book", - "description": "sgRNA sequences and targeting genes" + "description": "sgRNA and targetting genes, tab separated" }, "crisprcleanr": { "type": "string", @@ -173,12 +186,22 @@ "min_reads": { "type": "number", "description": "a filter threshold value for sgRNAs, based on their average counts in the control sample", - "default": 30 + "default": 30.0 }, "min_targeted_genes": { "type": "number", - "description": "Minimal number of different genes targeted by sgRNAs in a biased segment in order for the corresponding counts to be corrected", - "default": 3 + "description": "Minimal number of different genes targeted by sgRNAs in a biased segment in order for the corresponding counts to be corrected for CRISPRcleanR", + "default": 3.0 + }, + "bagel_reference_essentials": { + "type": "string", + "description": "Core essential gene set for BAGEL2", + "default": "https://raw.githubusercontent.com/hart-lab/bagel/master/CEGv2.txt" + }, + "bagel_reference_nonessentials": { + "type": "string", + "description": "Non essential gene set for BAGEL2", + "default": "https://raw.githubusercontent.com/hart-lab/bagel/master/NEGv1.txt" } } }, @@ -196,18 +219,12 @@ }, "reference_fasta": { "type": "string", + "format": "file-path", + "exists": true, + "mimetype": "text/plain", "pattern": "^\\S+\\.fn?a(sta)?(\\.gz)?$", "description": "Path to the reference FASTA file. Will override reference sequences provided by an input sample sheet.", - "fa_icon": "far fa-file-alt", - "format": "file-path" - }, - "igenomes_base": { - "type": "string", - "format": "directory-path", - "description": "Directory / URL base for iGenomes references.", - "default": "s3://ngi-igenomes/igenomes", - "fa_icon": "fas fa-cloud-download-alt", - "hidden": true + "fa_icon": "far fa-file-alt" }, "igenomes_ignore": { "type": "boolean", @@ -295,7 +312,7 @@ "description": "Maximum amount of time that can be requested for any single job.", "default": "240.h", "fa_icon": "far fa-clock", - "pattern": "^(\\d+\\.?\\s*(s|m|h|day)\\s*)+$", + "pattern": "^(\\d+\\.?\\s*(s|m|h|d|day)\\s*)+$", "hidden": true, "help_text": "Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'`" } @@ -366,28 +383,27 @@ }, "multiqc_config": { "type": "string", + "format": "file-path", + "exists": true, "description": "Custom config file to supply to MultiQC.", "fa_icon": "fas fa-cog", "hidden": true }, "multiqc_logo": { "type": "string", + "format": "file-path", + "exists": true, "description": "Custom logo file to supply to MultiQC. 
File name must also be set in the MultiQC config file", "fa_icon": "fas fa-image", "hidden": true }, "multiqc_methods_description": { "type": "string", + "format": "file-path", + "exists": true, "description": "Custom MultiQC yaml file containing HTML including a methods description.", "fa_icon": "fas fa-cog" }, - "tracedir": { - "type": "string", - "description": "Directory to keep pipeline Nextflow logs and reports.", - "default": "${params.outdir}/pipeline_info", - "fa_icon": "fas fa-cogs", - "hidden": true - }, "validate_params": { "type": "boolean", "description": "Boolean whether to validate parameters against the schema at runtime", @@ -395,19 +411,33 @@ "fa_icon": "fas fa-check-square", "hidden": true }, - "show_hidden_params": { + "validationShowHiddenParams": { "type": "boolean", "fa_icon": "far fa-eye-slash", "description": "Show all params when using `--help`", "hidden": true, "help_text": "By default, parameters set as _hidden_ in the schema are not shown on the command line when a user runs with `--help`. Specifying this option will tell the pipeline to show all parameters." }, - "schema_ignore_params": { + "validationSchemaIgnoreParams": { "type": "string", "default": "genomes", "description": "Ignore JSON schema validation of the following params", "fa_icon": "fas fa-ban", "hidden": true + }, + "validationFailUnrecognisedParams": { + "type": "boolean", + "fa_icon": "far fa-check-circle", + "description": "Validation of parameters fails when an unrecognised parameter is found.", + "hidden": true, + "help_text": "By default, when an unrecognised parameter is found, it returns a warinig." + }, + "validationLenientMode": { + "type": "boolean", + "fa_icon": "far fa-check-circle", + "description": "Validation of parameters in lenient more.", + "hidden": true, + "help_text": "Allows string values that are parseable as numbers or booleans. For further information see [JSONSchema docs](https://github.com/everit-org/json-schema#lenient-mode)." 
} } } @@ -422,21 +452,12 @@ { "$ref": "#/definitions/umi_parameters" }, - { - "$ref": "#/definitions/targeted_pipeline_steps" - }, - { - "$ref": "#/definitions/umi_parameters" - }, { "$ref": "#/definitions/targeted_parameters" }, { "$ref": "#/definitions/vsearch_parameters" }, - { - "$ref": "#/definitions/vsearch_parameters" - }, { "$ref": "#/definitions/screening_parameters" }, diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf deleted file mode 100644 index c9aafbde..00000000 --- a/subworkflows/local/input_check.nf +++ /dev/null @@ -1,130 +0,0 @@ -// -// Check input samplesheet and get read channels -// - -include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check' - -workflow INPUT_CHECK { - take: - samplesheet // file: /path/to/samplesheet.csv - - main: - SAMPLESHEET_CHECK ( samplesheet ) - .csv - .splitCsv ( header:true, sep:',' ) - .multiMap { - reads: create_fastq_channel(it) - reference: create_reference_channel(it) - template: create_template_channel(it) - protospacer: create_protospacer_channel(it) - } - .set { inputs } - - emit: - reads = inputs.reads // channel: [ val(meta), [ reads ] ] - reference = inputs.reference // channel: [ val(meta), reference ] - template = inputs.template // channel: [ val(meta), template ] - protospacer = inputs.protospacer // channel: [ val(meta), protospacer ] - versions = SAMPLESHEET_CHECK.out.versions // channel: [ versions.yml ] - -} - -// Function to generate meta map -def create_meta(LinkedHashMap row) { - // Create meta map with sample ID - def meta = [:] - meta.id = row.sample - - // Add single end boolean - if (row.fastq_2 && file(row.fastq_2).exists()) { - meta.single_end = false - } else { - meta.single_end = true - } - - // Add self reference boolean - if (row.reference.length() <= 0) { - meta.self_reference = true - } else { - meta.self_reference = false - } - - // Add template boolean - if (!row.template) { - meta.template = false - } else if (!file(row.template).exists()) { - meta.template = true - } else { - meta.template = true - } - - return meta -} - -// Function to get list of [ meta, [ fastq_1, fastq_2 ] ] -def create_fastq_channel(LinkedHashMap row) { - // create meta map - def meta = create_meta(row) - - // add path(s) of the fastq file(s) to the meta map - def fastq_meta = [] - if (!file(row.fastq_1).exists()) { - error("ERROR: Please check input samplesheet -> Read 1 FastQ file does not exist!\n${row.fastq_1}") - } - if (!row.fastq_2 || !file(row.fastq_2).exists()) { - fastq_meta = [ meta, [ file(row.fastq_1) ] ] - } else { - fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ] - } - return fastq_meta -} - -// Function to get a list of [ meta, reference ] -def create_reference_channel(LinkedHashMap row) { - // create meta map - def meta = create_meta(row) - - // add reference sequence to meta - def reference_meta = [] - if (row.reference.length() <= 0) { - reference_meta = [ meta, null ] - } else { - reference_meta = [ meta, row.reference ] - } - - return reference_meta -} - -// Function to get a list of [ meta, template ] -def create_template_channel(LinkedHashMap row) { - // create meta map - def meta = create_meta(row) - - // add template sequence/path to meta - def template_meta = [] - if (!row.template) { - template_meta = [ meta, null ] - } else if (!file(row.template).exists()) { - template_meta = [ meta, row.template ] - } else { - template_meta = [ meta, file(row.template) ] - } - - return template_meta -} - -// Function to get a list of [ meta, protospacer ] 
-def create_protospacer_channel(LinkedHashMap row) { - // create meta map - def meta = create_meta(row) - - // add protospacer sequence to meta - def protospacer_meta = [] - if (row.protospacer.length() <= 0 && !params.protospacer) { - error("ERROR: Please check input samplesheet -> Protospacer sequence is not provided!\n") - } else { - protospacer_meta = [ meta, row.protospacer ] - } - - return protospacer_meta -} diff --git a/subworkflows/local/input_check_screening.nf b/subworkflows/local/input_check_screening.nf deleted file mode 100644 index 8b3ba471..00000000 --- a/subworkflows/local/input_check_screening.nf +++ /dev/null @@ -1,44 +0,0 @@ -// -// Check input samplesheet and get read channels -// - -include { SAMPLESHEET_CHECK_SCREENING } from '../../modules/local/samplesheet_check_screening' - -workflow INPUT_CHECK_SCREENING { - take: - samplesheet // file: /path/to/samplesheet.csv - - main: - SAMPLESHEET_CHECK_SCREENING ( samplesheet ) - .csv - .splitCsv ( header:true, sep:',' ) - .map { create_fastq_channel(it) } - .set { reads } - - emit: - reads // channel: [ val(meta), [ reads ] ] - versions = SAMPLESHEET_CHECK_SCREENING.out.versions // channel: [ versions.yml ] -} - -// Function to get list of [ meta, [ fastq_1, fastq_2 ] ] -def create_fastq_channel(LinkedHashMap row) { - // create meta map - def meta = [:] - meta.id = row.sample - meta.condition = row.condition - - // add path(s) of the fastq file(s) to the meta map - def fastq_meta = [] - if (!file(row.fastq_1).exists()) { - error("ERROR: Please check input samplesheet -> Read 1 FastQ file does not exist!\n${row.fastq_1}") - } - if (!row.fastq_2) { - fastq_meta = [ meta, [ file(row.fastq_1) ] ] - } else { - if (!file(row.fastq_2).exists()) { - error("ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}") - } - fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ] - } - return fastq_meta -} diff --git a/templates/alignment_summary.py b/templates/alignment_summary.py index 5adedf93..035e7b90 100644 --- a/templates/alignment_summary.py +++ b/templates/alignment_summary.py @@ -1,5 +1,11 @@ #!/usr/bin/env python +############################ +#### Summary of alignment +#### author: Júlia Mir @mirpedrol +#### Released under the MIT license. See git repository (https://github.com/nf-core/crisprseq) for full license text. +############################ + import sys import pysam @@ -12,7 +18,8 @@ summary_lines = summary.readlines() add_line = True -with open("$summary", "w") as output_file: +outname = "$summary".replace("_clustering_summary.csv", "_alignment_summary.csv") +with open(outname, "w") as output_file: for line in summary_lines: if "aligned-reads" not in line: output_file.write(line) @@ -21,3 +28,7 @@ add_line = False if add_line: output_file.write(f"aligned-reads, {mapped_reads_count} ({round(mapped_reads_percentage, 1)}%)\\n") + +with open("versions.yml", "w") as f: + f.write('"${task.process}":\\n') + f.write(f' pysam: "{pysam.__version__}"\\n') diff --git a/templates/clustering_summary.py b/templates/clustering_summary.py index 208b089b..b1ad29ed 100644 --- a/templates/clustering_summary.py +++ b/templates/clustering_summary.py @@ -1,8 +1,15 @@ #!/usr/bin/env python +############################ +#### Summary of clustering +#### author: Júlia Mir @mirpedrol +#### Released under the MIT license. See git repository (https://github.com/nf-core/crisprseq) for full license text. 
+############################ + import gzip import sys +import Bio from Bio import SeqIO with gzip.open("$reads", "rt") as handle: @@ -12,7 +19,8 @@ summary_lines = summary.readlines() add_line = True -with open("$summary", "w") as output_file: +outname = "$summary".replace("_preprocessing_summary.csv", "_clustering_summary.csv") +with open(outname, "w") as output_file: for line in summary_lines: if "clustered-reads" not in line: output_file.write(line) @@ -21,3 +29,7 @@ add_line = False if add_line: output_file.write(f"clustered-reads, {clusters_count}\\n") + +with open("versions.yml", "w") as f: + f.write('"${task.process}":\\n') + f.write(f' biopython: "{Bio.__version__}"\\n') diff --git a/templates/merging_summary.py b/templates/preprocessing_summary.py similarity index 76% rename from templates/merging_summary.py rename to templates/preprocessing_summary.py index 804a5be3..13c698ce 100755 --- a/templates/merging_summary.py +++ b/templates/preprocessing_summary.py @@ -1,13 +1,20 @@ #!/usr/bin/env python +############################ +#### Summary of reads preprocessing +#### author: Júlia Mir @mirpedrol +#### Released under the MIT license. See git repository (https://github.com/nf-core/crisprseq) for full license text. +############################ + import gzip +import Bio from Bio import SeqIO with gzip.open("${raw_reads[0]}", "rt") as handle: raw_reads_count = len(list(SeqIO.parse(handle, "fastq"))) -if "$assembled_reads" == "null_a": +if "$assembled_reads" == "": assembled_reads_count = 0 else: with gzip.open("$assembled_reads", "rt") as handle: @@ -16,7 +23,7 @@ with gzip.open("$trimmed_reads", "rt") as handle: trimmed_reads_count = len(list(SeqIO.parse(handle, "fastq"))) # Filtered reads -if "$trimmed_adapters" == "null_t": +if "$trimmed_adapters" == "": adapters_count = 0 adapters_percentage = "(0.0%)" else: @@ -34,14 +41,14 @@ else: prefix = "$meta.id" -with open(f"{prefix}_summary.csv", "w") as output_file: +with open(f"{prefix}_preprocessing_summary.csv", "w") as output_file: output_file.write("class, count\\n") output_file.write(f"raw-reads, {raw_reads_count} (100.0%)\\n") output_file.write( f"merged-reads, {assembled_reads_count} ({round(assembled_reads_count * 100 / raw_reads_count,1)}%)\\n" ) output_file.write(f"reads-with-adapters, {adapters_count} {adapters_percentage}\\n") - if "$assembled_reads" == "null_a": + if "$assembled_reads" == "": output_file.write( f"quality-filtered-reads, {trimmed_reads_count} ({round(trimmed_reads_count * 100 / raw_reads_count,1)}%)\\n" ) @@ -49,3 +56,8 @@ output_file.write( f"quality-filtered-reads, {trimmed_reads_count} ({round(trimmed_reads_count * 100 / assembled_reads_count,1)}%)\\n" ) + + +with open("versions.yml", "w") as f: + f.write('"${task.process}":\\n') + f.write(f' biopython: "{Bio.__version__}"\\n') diff --git a/workflows/crisprseq_screening.nf b/workflows/crisprseq_screening.nf index ed8f28aa..e37619ce 100644 --- a/workflows/crisprseq_screening.nf +++ b/workflows/crisprseq_screening.nf @@ -1,27 +1,29 @@ /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - VALIDATE INPUTS + PRINT PARAMS SUMMARY ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -def summary_params = NfcoreSchema.paramsSummaryMap(workflow, params) +include { paramsSummaryLog; paramsSummaryMap; fromSamplesheet } from 'plugin/nf-validation' -// Validate input parameters -WorkflowCrisprseq.initialise(params, log) +def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs) 
+def citation = '\n' + WorkflowMain.citation(workflow) + '\n' +def summary_params = paramsSummaryMap(workflow) + +// Print parameter summary log to screen +log.info logo + paramsSummaryLog(workflow) + citation -// Check input path parameters to see if they exist -def checkPathParamList = [ params.multiqc_config, params.reference_fasta, params.library] -for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } +WorkflowCrisprseq.initialise(params, log) -// Check mandatory parameters -if (!params.count_table) { ch_input = file(params.input) } else { error('Input samplesheet not specified!') } +// Set screening parameters and channels if (params.library) { ch_library = file(params.library) } -if (params.crisprcleanr) { ch_crisprcleanr= Channel.value(params.crisprcleanr) } +if (params.crisprcleanr) { ch_crisprcleanr = Channel.value(params.crisprcleanr) } if(params.mle_design_matrix) { - Channel.fromPath(params.mle_design_matrix,checkIfExists: true) + Channel.fromPath(params.mle_design_matrix) .set { ch_design } } + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONFIG FILES @@ -29,9 +31,9 @@ if(params.mle_design_matrix) { */ ch_multiqc_config = Channel.fromPath("$projectDir/assets/multiqc_config.yml", checkIfExists: true) -ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath( params.multiqc_config, checkIfExists: true ) : Channel.empty() -ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath( params.multiqc_logo, checkIfExists: true ) : Channel.empty() -ch_multiqc_custom_methods_description = params.multiqc_methods_description ? file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) +ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath( params.multiqc_config ) : Channel.empty() +ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath( params.multiqc_logo ) : Channel.empty() +ch_multiqc_custom_methods_description = params.multiqc_methods_description ? file(params.multiqc_methods_description) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -42,7 +44,6 @@ ch_multiqc_custom_methods_description = params.multiqc_methods_description ? 
fil // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // -include { INPUT_CHECK_SCREENING } from '../subworkflows/local/input_check_screening' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -58,8 +59,14 @@ include { MULTIQC } from '../modules/nf-core/multiqc/main' include { MAGECK_COUNT } from '../modules/nf-core/mageck/count/main' include { MAGECK_MLE } from '../modules/nf-core/mageck/mle/main' include { MAGECK_TEST } from '../modules/nf-core/mageck/test/main' +include { MAGECK_GRAPHRRA } from '../modules/local/mageck/graphrra' include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' include { CRISPRCLEANR_NORMALIZE } from '../modules/nf-core/crisprcleanr/normalize/main' +include { BAGEL2_FC } from '../modules/local/bagel2/fc' +include { BAGEL2_BF } from '../modules/local/bagel2/bf' +include { BAGEL2_PR } from '../modules/local/bagel2/pr' +include { BAGEL2_GRAPH } from '../modules/local/bagel2/graph' + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ RUN MAIN WORKFLOW @@ -75,33 +82,42 @@ workflow CRISPRSEQ_SCREENING { if(!params.count_table){ // - // SUBWORKFLOW: Read in samplesheet, validate and stage input files + // Create input channel from input file provided through params.input // - INPUT_CHECK_SCREENING ( - ch_input - ) - ch_versions = ch_versions.mix(INPUT_CHECK_SCREENING.out.versions) + Channel.fromSamplesheet("input") + .map{ meta, fastq_1, fastq_2, x, y, z -> + // x (reference), y (protospacer), and z (template) are part of the targeted workflows and we don't need them + return [ meta + [ single_end:fastq_2?false:true ], fastq_2?[ fastq_1, fastq_2 ]:[ fastq_1 ] ] + } + .set { ch_input } // // MODULE: Run FastQC // FASTQC ( - INPUT_CHECK_SCREENING.out.reads + ch_input ) + + ch_versions = ch_versions.mix(FASTQC.out.versions.first()) - INPUT_CHECK_SCREENING.out.reads - .map { meta, fastq -> - [meta.condition, fastq] + + ch_input + .map { meta, fastq -> + [meta.condition, fastq, meta.single_end] } .reduce { a, b -> - ["${a[0]},${b[0]}", a[1] + b[1]] + if(a[2] != b[2] ) { + error "Your samplesheet contains a mix of single-end and paired-end data. This is not supported." 
+ } + return ["${a[0]},${b[0]}", a[1] + b[1], b[2]] } - .map { condition, fastqs -> - [[id: condition], fastqs] + .map { condition, fastqs, single_end -> + [[id: condition, single_end: single_end], fastqs] } .set { joined } + // // MODULE: Run mageck count // @@ -112,7 +128,6 @@ workflow CRISPRSEQ_SCREENING { ch_versions = ch_versions.mix(MAGECK_COUNT.out.versions.first()) - MAGECK_COUNT.out.count.map { it -> it[1] }.set { ch_counts } @@ -131,6 +146,9 @@ workflow CRISPRSEQ_SCREENING { params.min_targeted_genes ) + ch_versions = ch_versions.mix(CRISPRCLEANR_NORMALIZE.out.versions) + + CRISPRCLEANR_NORMALIZE.out.norm_count_file.map { it -> it[1] }.set { ch_counts } @@ -138,13 +156,61 @@ workflow CRISPRSEQ_SCREENING { if(params.rra_contrasts) { Channel.fromPath(params.rra_contrasts) - .splitCsv(header:true, sep:',' ) + .splitCsv(header:true, sep:';' ) .set { ch_contrasts } counts = ch_contrasts.combine(ch_counts) MAGECK_TEST ( counts ) + + ch_versions = ch_versions.mix(MAGECK_TEST.out.versions) + + MAGECK_GRAPHRRA ( + MAGECK_TEST.out.gene_summary + ) + ch_versions = ch_versions.mix(MAGECK_GRAPHRRA.out.versions) + } + + if(params.rra_contrasts) { + Channel.fromPath(params.rra_contrasts) + .splitCsv(header:true, sep:';' ) + .set { ch_bagel } + counts = ch_bagel.combine(ch_counts) + + //Define non essential and essential genes channels for bagel2 + ch_bagel_reference_essentials= Channel.value(params.bagel_reference_essentials) + ch_bagel_reference_nonessentials= Channel.value(params.bagel_reference_nonessentials) + + BAGEL2_FC ( + counts + ) + ch_versions = ch_versions.mix(BAGEL2_FC.out.versions) + + BAGEL2_BF ( + BAGEL2_FC.out.foldchange, + ch_bagel_reference_essentials, + ch_bagel_reference_nonessentials + ) + + ch_versions = ch_versions.mix(BAGEL2_BF.out.versions) + + + ch_bagel_pr = BAGEL2_BF.out.bf.combine(ch_bagel_reference_essentials) + .combine(ch_bagel_reference_nonessentials) + + BAGEL2_PR ( + ch_bagel_pr + ) + + ch_versions = ch_versions.mix(BAGEL2_PR.out.versions) + + BAGEL2_GRAPH ( + BAGEL2_PR.out.pr + ) + + ch_versions = ch_versions.mix(BAGEL2_GRAPH.out.versions) + } if(params.mle_design_matrix) { @@ -156,10 +222,14 @@ workflow CRISPRSEQ_SCREENING { MAGECK_MLE ( ch_designed_mle ) + + ch_versions = ch_versions.mix(MAGECK_MLE.out.versions) + + } CUSTOM_DUMPSOFTWAREVERSIONS ( - ch_versions.unique().collectFile(name: 'collated_versions.yml') + ch_versions.unique{ it.text }.collectFile(name: 'collated_versions.yml') ) // @@ -168,7 +238,7 @@ workflow CRISPRSEQ_SCREENING { workflow_summary = WorkflowCrisprseq.paramsSummaryMultiqc(workflow, summary_params) ch_workflow_summary = Channel.value(workflow_summary) - methods_description = WorkflowCrisprseq.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description) + methods_description = WorkflowCrisprseq.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description, params) ch_methods_description = Channel.value(methods_description) ch_multiqc_files = Channel.empty() @@ -201,6 +271,7 @@ workflow.onComplete { if (params.email || params.email_on_fail) { NfcoreTemplate.email(workflow, params, summary_params, projectDir, log, multiqc_report) } + NfcoreTemplate.dump_parameters(workflow, params) NfcoreTemplate.summary(workflow, params, log) if (params.hook_url) { NfcoreTemplate.adaptivecard(workflow, params, summary_params, projectDir, log) diff --git a/workflows/crisprseq_targeted.nf b/workflows/crisprseq_targeted.nf index df5fb1e1..abf67bb2 100644 --- a/workflows/crisprseq_targeted.nf +++ b/workflows/crisprseq_targeted.nf @@ 
-1,21 +1,19 @@ /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - VALIDATE INPUTS + PRINT PARAMS SUMMARY ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -def summary_params = NfcoreSchema.paramsSummaryMap(workflow, params) +include { paramsSummaryLog; paramsSummaryMap; fromSamplesheet } from 'plugin/nf-validation' -// Validate input parameters -WorkflowCrisprseq.initialise(params, log) - -// Check input path parameters to see if they exist -def checkPathParamList = [ params.input, params.multiqc_config, params.reference_fasta ] -for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } +def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs) +def citation = '\n' + WorkflowMain.citation(workflow) + '\n' +def summary_params = paramsSummaryMap(workflow) -// Check mandatory parameters -if (params.input) { ch_input = file(params.input) } else { error('Input samplesheet not specified!') } +// Print parameter summary log to screen +log.info logo + paramsSummaryLog(workflow) + citation +WorkflowCrisprseq.initialise(params, log) /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONFIG FILES @@ -23,9 +21,9 @@ if (params.input) { ch_input = file(params.input) } else { error('Input samplesh */ ch_multiqc_config = Channel.fromPath("$projectDir/assets/multiqc_config.yml", checkIfExists: true) -ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath( params.multiqc_config, checkIfExists: true ) : Channel.empty() -ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath( params.multiqc_logo, checkIfExists: true ) : Channel.empty() -ch_multiqc_custom_methods_description = params.multiqc_methods_description ? file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) +ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath( params.multiqc_config ) : Channel.empty() +ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath( params.multiqc_logo ) : Channel.empty() +ch_multiqc_custom_methods_description = params.multiqc_methods_description ? file(params.multiqc_methods_description) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -36,21 +34,19 @@ ch_multiqc_custom_methods_description = params.multiqc_methods_description ? 
fil // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // -include { INPUT_CHECK } from '../subworkflows/local/input_check' // // MODULE // -include { FIND_ADAPTERS } from '../modules/local/find_adapters' -include { EXTRACT_UMIS } from '../modules/local/extract_umis' -include { SEQ_TO_FILE as SEQ_TO_FILE_REF } from '../modules/local/seq_to_file' -include { SEQ_TO_FILE as SEQ_TO_FILE_TEMPL } from '../modules/local/seq_to_file' -include { ORIENT_REFERENCE } from '../modules/local/orient_reference' -include { CIGAR_PARSER } from '../modules/local/cigar_parser' -include { MERGING_SUMMARY } from '../modules/local/merging_summary' -include { CLUSTERING_SUMMARY } from '../modules/local/clustering_summary' -include { ALIGNMENT_SUMMARY } from '../modules/local/alignment_summary' -include { TEMPLATE_REFERENCE } from '../modules/local/template_reference' +include { FIND_ADAPTERS } from '../modules/local/find_adapters' +include { EXTRACT_UMIS } from '../modules/local/extract_umis' +include { ORIENT_REFERENCE } from '../modules/local/orient_reference' +include { CIGAR_PARSER } from '../modules/local/cigar_parser' +include { PREPROCESSING_SUMMARY } from '../modules/local/preprocessing_summary' +include { CLUSTERING_SUMMARY } from '../modules/local/clustering_summary' +include { ALIGNMENT_SUMMARY } from '../modules/local/alignment_summary' +include { TEMPLATE_REFERENCE } from '../modules/local/template_reference' +include { CRISPRSEQ_PLOTTER } from '../modules/local/crisprseq_plotter' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -61,30 +57,29 @@ include { TEMPLATE_REFERENCE } from '../modules/loc // // MODULE: Installed directly from nf-core/modules // -include { FASTQC } from '../modules/nf-core/fastqc/main' -include { MULTIQC } from '../modules/nf-core/multiqc/main' -include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' -include { PEAR } from '../modules/nf-core/pear/main' -include { CAT_FASTQ } from '../modules/nf-core/cat/fastq/main' -include { SEQTK_SEQ as SEQTK_SEQ_MASK } from '../modules/nf-core/seqtk/seq/main' -include { SEQTK_SEQ as SEQTK_SEQ_FATOFQ } from '../modules/nf-core/seqtk/seq/main' -include { VSEARCH_CLUSTER } from '../modules/nf-core/vsearch/cluster/main' -include { VSEARCH_SORT } from '../modules/nf-core/vsearch/sort/main' -include { RACON as RACON_1 } from '../modules/nf-core/racon/main' -include { RACON as RACON_2 } from '../modules/nf-core/racon/main' -include { BOWTIE2_ALIGN } from '../modules/nf-core/bowtie2/align/main' -include { BOWTIE2_BUILD } from '../modules/nf-core/bowtie2/build/main' -include { BWA_MEM } from '../modules/nf-core/bwa/mem/main' -include { BWA_INDEX } from '../modules/nf-core/bwa/index/main' -include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_ORIGINAL } from '../modules/nf-core/minimap2/align/main' -include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_UMI_1 } from '../modules/nf-core/minimap2/align/main' -include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_UMI_2 } from '../modules/nf-core/minimap2/align/main' -include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_TEMPLATE } from '../modules/nf-core/minimap2/align/main' -include { SAMTOOLS_FAIDX } from '../modules/nf-core/samtools/faidx/main' -include { MINIMAP2_INDEX } from '../modules/nf-core/minimap2/index/main' -include { MEDAKA } from '../modules/nf-core/medaka/main' -include { CUTADAPT } from '../modules/nf-core/cutadapt/main' -include { SAMTOOLS_INDEX } from '../modules/nf-core/samtools/index/main' +include { FASTQC } from 
'../modules/nf-core/fastqc/main' +include { MULTIQC } from '../modules/nf-core/multiqc/main' +include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' +include { PEAR } from '../modules/nf-core/pear/main' +include { CAT_FASTQ } from '../modules/nf-core/cat/fastq/main' +include { SEQTK_SEQ as SEQTK_SEQ_MASK } from '../modules/nf-core/seqtk/seq/main' +include { SEQTK_SEQ as SEQTK_SEQ_FATOFQ } from '../modules/nf-core/seqtk/seq/main' +include { VSEARCH_CLUSTER } from '../modules/nf-core/vsearch/cluster/main' +include { VSEARCH_SORT } from '../modules/nf-core/vsearch/sort/main' +include { RACON as RACON_1 } from '../modules/nf-core/racon/main' +include { RACON as RACON_2 } from '../modules/nf-core/racon/main' +include { BOWTIE2_ALIGN } from '../modules/nf-core/bowtie2/align/main' +include { BOWTIE2_BUILD } from '../modules/nf-core/bowtie2/build/main' +include { BWA_MEM } from '../modules/nf-core/bwa/mem/main' +include { BWA_INDEX } from '../modules/nf-core/bwa/index/main' +include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_ORIGINAL } from '../modules/nf-core/minimap2/align/main' +include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_UMI_1 } from '../modules/nf-core/minimap2/align/main' +include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_UMI_2 } from '../modules/nf-core/minimap2/align/main' +include { MINIMAP2_ALIGN as MINIMAP2_ALIGN_TEMPLATE } from '../modules/nf-core/minimap2/align/main' +include { MINIMAP2_INDEX } from '../modules/nf-core/minimap2/index/main' +include { MEDAKA } from '../modules/nf-core/medaka/main' +include { CUTADAPT } from '../modules/nf-core/cutadapt/main' +include { SAMTOOLS_INDEX } from '../modules/nf-core/samtools/index/main' /* @@ -140,89 +135,105 @@ workflow CRISPRSEQ_TARGETED { ch_versions = Channel.empty() // - // SUBWORKFLOW: Read in samplesheet, validate and stage input files + // Create input channel from input file provided through params.input // - INPUT_CHECK ( - ch_input - ) + Channel.fromSamplesheet("input") + .multiMap { meta, fastq_1, fastq_2, reference, protospacer, template -> + // meta.condition is part of the screening workflow and we need to remove it + reads: [ meta.id, meta - meta.subMap('condition') + [ single_end:fastq_2?false:true, self_reference:reference?false:true, template:template?true:false ], fastq_2?[ fastq_1, fastq_2 ]:[ fastq_1 ] ] + reference: [meta - meta.subMap('condition') + [ single_end:fastq_2?false:true, self_reference:reference?false:true, template:template?true:false ], reference] + protospacer: [meta - meta.subMap('condition') + [ single_end:fastq_2?false:true, self_reference:reference?false:true, template:template?true:false ], protospacer] + template: [meta - meta.subMap('condition') + [ single_end:fastq_2?false:true, self_reference:reference?false:true, template:template?true:false ], template] + } + .set { ch_input } + + ch_input .reads + .groupTuple() .map { - meta, fastq -> - [ meta - meta.subMap('id') + [id: meta.id.split('_')[0..-2].join('_')], fastq ] + WorkflowCrisprseq.validateInput(it) } - .groupTuple(by: [0]) // Separate samples by the ones containing all reads in one file or the ones with many files to be concatenated .branch { - meta, fastq -> - single : fastq.size() == 1 - return [ meta, fastq.flatten() ] - multiple: fastq.size() > 1 - return [ meta, fastq.flatten() ] + meta, fastqs -> + single : fastqs.size() == 1 + return [ meta, fastqs.flatten() ] + multiple: fastqs.size() > 1 + return [ meta, fastqs.flatten() ] } .set { ch_fastq } - ch_versions = ch_versions.mix(INPUT_CHECK.out.versions) + 
// - // MODULE: Add reference sequences to file + // Add reference sequences to file // - SEQ_TO_FILE_REF ( - INPUT_CHECK.out.reference - .map { - meta, fastq -> - [ meta - meta.subMap('id') + [id: meta.id.split('_')[0..-2].join('_')], fastq ] - }, - "reference" + ch_input.reference + .tap{ meta_reference } + .filter{ meta, sequence -> sequence instanceof String } + .collectFile() { meta, reference -> + [ "${meta.id}_reference.fasta", ">${meta.id}\n${reference}\n" ] // Write each reference sequence to a file + } + .map{ new_file -> + [new_file.baseName.split("_reference")[0], new_file] // create a channel with the meta.id and the new file + } + .join(meta_reference + .map{ meta, reference -> + [meta.id, meta] // Join the channel by meta.id with the meta map + } ) - ch_versions = ch_versions.mix(SEQ_TO_FILE_REF.out.versions) + .map{ metaid, new_file, meta -> + [meta, new_file] // Obtain the final channel with meta map and the new file + } + .set{ ch_seq_reference } + // - // MODULE: Add template sequences to file + // Add template sequences to file // - SEQ_TO_FILE_TEMPL ( - INPUT_CHECK.out.template - .map { - meta, fastq -> - [ meta - meta.subMap('id') + [id: meta.id.split('_')[0..-2].join('_')], fastq ] - }, - "template" + ch_input.template + .tap{ meta_template } + .filter{ meta, sequence -> sequence instanceof String } + .collectFile() { meta, template -> + [ "${meta.id}_template.fasta", ">${meta.id}\n${template}\n" ] // Write each template sequence to a file + } + .map{ new_file -> + [new_file.baseName.split("_template")[0], new_file] // create a channel with the meta.id and the new file + } + .join(meta_template + .map{ meta, template -> + [meta.id, meta] // Join the channel by meta.id with the meta map + } ) - ch_versions = ch_versions.mix(SEQ_TO_FILE_TEMPL.out.versions) + .map{ metaid, new_file, meta -> + [meta, new_file] // Obtain the final channel with meta map and the new file + } + .set{ ch_seq_template } + // Join channels with reference and protospacer // to channel: [ meta, reference, protospacer] if (!params.reference_fasta && !params.protospacer) { - SEQ_TO_FILE_REF.out.file - .join(INPUT_CHECK.out.protospacer - .map { - meta, fastq -> - [ meta - meta.subMap('id') + [id: meta.id.split('_')[0..-2].join('_')], fastq ] - }, - by: 0) + ch_seq_reference + .join(ch_input.protospacer) .set{ reference_protospacer } } else if (!params.reference_fasta) { // If a protospacer was provided through the --protospacer param instead of the samplesheet ch_protospacer = Channel.of(params.protospacer) - SEQ_TO_FILE_REF.out.file + ch_seq_reference .combine(ch_protospacer) .set{ reference_protospacer } } else if (!params.protospacer) { // If a reference was provided through a fasta file or igenomes instead of the samplesheet ch_reference = Channel.fromPath(params.reference_fasta) - INPUT_CHECK.out.protospacer + ch_input.protospacer .combine(ch_reference) - .map{ meta, protospacer, reference -> - [ meta - meta.subMap('id') + [id: meta.id.split('_')[0..-2].join('_')], reference, protospacer ] - } .set{ reference_protospacer } } else { ch_reference = Channel.fromPath(params.reference_fasta) ch_protospacer = Channel.of(params.protospacer) - INPUT_CHECK.out.reads + ch_input.reads .combine(ch_reference) .combine(ch_protospacer) - .map{ meta, reads, reference, protospacer -> - [meta - meta.subMap('id') + [id: meta.id.split('_')[0..-2].join('_')], reference, protospacer] - } .set{ reference_protospacer } } @@ -294,6 +305,7 @@ workflow CRISPRSEQ_TARGETED { return [ meta, reads[0], adapter_seqs[0] ] } 
.set { ch_adapter_seqs } + ch_versions = ch_versions.mix(FIND_ADAPTERS.out.versions.first()) // @@ -331,31 +343,34 @@ workflow CRISPRSEQ_TARGETED { .join(PEAR.out.assembled, remainder: true) .join(SEQTK_SEQ_MASK.out.fastx) .join(CUTADAPT.out.log) - .set { ch_merging_summary_data } + .map { meta, reads, assembled, masked, trimmed -> + if (assembled == null) { + assembled = [] + } + return [ meta, reads, assembled, masked, trimmed ] + } + .set { ch_preprocessing_summary_data } } else { ch_cat_fastq.paired .mix(ch_cat_fastq.single) .join(PEAR.out.assembled, remainder: true) .join(SEQTK_SEQ_MASK.out.fastx) - .combine(Channel.value("null")) - .map { meta, reads, assembled, masked, trimmed -> + .map { meta, reads, assembled, masked -> if (assembled == null) { - assembled = file('null_a') - } - if (trimmed == "null") { - trimmed = file('null_t') + assembled = [] } - return [ meta, reads, assembled, masked, trimmed ] + return [ meta, reads, assembled, masked, [] ] } - .set { ch_merging_summary_data } + .set { ch_preprocessing_summary_data } } // // MODULE: Summary of merged reads // - MERGING_SUMMARY { - ch_merging_summary_data + PREPROCESSING_SUMMARY { + ch_preprocessing_summary_data } + ch_versions = ch_versions.mix(PREPROCESSING_SUMMARY.out.versions) if (params.umi_clustering) { @@ -365,6 +380,7 @@ workflow CRISPRSEQ_TARGETED { EXTRACT_UMIS ( SEQTK_SEQ_MASK.out.fastx ) + ch_versions = ch_versions.mix(EXTRACT_UMIS.out.versions.first()) // @@ -373,6 +389,7 @@ workflow CRISPRSEQ_TARGETED { VSEARCH_CLUSTER ( EXTRACT_UMIS.out.fasta ) + ch_versions = ch_versions.mix(VSEARCH_CLUSTER.out.versions.first()) // Obtain a file with UBS (UBI bin size) and UMI ID VSEARCH_CLUSTER.out.clusters @@ -423,6 +440,7 @@ workflow CRISPRSEQ_TARGETED { ch_umi_bysize.cluster, Channel.value("--sortbysize") ) + ch_versions = ch_versions.mix(VSEARCH_SORT.out.versions.first()) // Get the correspondent fasta sequencences from top cluster sequences // Replaces the sequence name adding the "centroid_" prefix to avoid having two sequences with the same name in following steps @@ -436,11 +454,11 @@ workflow CRISPRSEQ_TARGETED { [ "${name}.fasta", fasta ] // >centroid_... -> sample_top.fasta } .map{ new_file -> - [new_file.baseName, new_file] // Substring is removing "_top" added by VSEARCH_SORT // [sample, sample_top.fasta] + [new_file.baseName, new_file] // [sample, sample_top.fasta] } .join(meta_channel_2 .map { meta, original_file -> - ["${original_file.baseName}", meta] // Substring is removing "_top" added by VSEARCH_SORT // [sample, [id:sample_id, ...]] + ["${original_file.baseName}", meta] // [sample, [id:sample_id, ...]] }) // [sample, sample_top.fasta, [id:sample_id, ...]] .map{ file_name, new_file, meta -> [meta + [cluster_id: file_name[0..-5]], new_file] // Add cluster ID to meta map // [[id:sample_id, ..., cluster_id:sample], sample_top.fasta] @@ -459,7 +477,7 @@ workflow CRISPRSEQ_TARGETED { [ "${name}_sequences.fasta", fasta ] // >... 
-> sample_sequences.fasta } .map{ new_file -> - [new_file.baseName[0..-11], new_file] // Substring is removing "_sequences" added by collectFile // [sample, sample_sequences.fasta] + [new_file.baseName[0..-11], new_file] // [sample, sample_sequences.fasta] } .join(meta_channel_3 .map { meta, original_file -> @@ -485,6 +503,7 @@ workflow CRISPRSEQ_TARGETED { false, false ) + ch_versions = ch_versions.mix(MINIMAP2_ALIGN_UMI_1.out.versions.first()) // Only continue with clusters that have aligned sequences @@ -500,6 +519,7 @@ workflow CRISPRSEQ_TARGETED { .join(ch_top_clusters_sequence) .join(ch_minimap_1) ) + ch_versions = ch_versions.mix(RACON_1.out.versions.first()) // // MODULE: Mapping with minimap2 - cycle 2 @@ -512,6 +532,7 @@ workflow CRISPRSEQ_TARGETED { false, false ) + ch_versions = ch_versions.mix(MINIMAP2_ALIGN_UMI_2.out.versions.first()) // Only continue with clusters that have aligned sequences MINIMAP2_ALIGN_UMI_2.out.paf @@ -526,6 +547,7 @@ workflow CRISPRSEQ_TARGETED { .join(RACON_1.out.improved_assembly) .join(ch_minimap_2) ) + ch_versions = ch_versions.mix(RACON_2.out.versions.first()) // @@ -535,6 +557,7 @@ workflow CRISPRSEQ_TARGETED { ch_clusters_sequence .join(RACON_2.out.improved_assembly) ) + ch_versions = ch_versions.mix(MEDAKA.out.versions.first()) // Collect all consensus UMI sequences into one single file per sample MEDAKA.out.assembly @@ -566,6 +589,7 @@ workflow CRISPRSEQ_TARGETED { SEQTK_SEQ_FATOFQ ( ch_umi_consensus ) + ch_versions = ch_versions.mix(SEQTK_SEQ_FATOFQ.out.versions.first()) } ch_preprocess_reads = params.umi_clustering ? SEQTK_SEQ_FATOFQ.out.fastx : SEQTK_SEQ_MASK.out.fastx @@ -575,9 +599,12 @@ workflow CRISPRSEQ_TARGETED { // CLUSTERING_SUMMARY ( ch_preprocess_reads - .join(MERGING_SUMMARY.out.summary) + .join(PREPROCESSING_SUMMARY.out.summary) ) + ch_versions = ch_versions.mix(CLUSTERING_SUMMARY.out.versions) + + // // MODULE: Mapping with Minimap2 @@ -599,7 +626,7 @@ workflow CRISPRSEQ_TARGETED { // if (params.aligner == "bwa") { BWA_INDEX ( - ORIENT_REFERENCE.out.reference.map { it[1] } + ORIENT_REFERENCE.out.reference ) ch_versions = ch_versions.mix(BWA_INDEX.out.versions) BWA_MEM ( @@ -616,7 +643,7 @@ workflow CRISPRSEQ_TARGETED { // if (params.aligner == "bowtie2") { BOWTIE2_BUILD ( - ORIENT_REFERENCE.out.reference.map { it[1] } + ORIENT_REFERENCE.out.reference ) ch_versions = ch_versions.mix(BOWTIE2_BUILD.out.versions) BOWTIE2_ALIGN ( @@ -637,6 +664,7 @@ workflow CRISPRSEQ_TARGETED { ch_mapped_bam .join(CLUSTERING_SUMMARY.out.summary) ) + ch_versions = ch_versions.mix(ALIGNMENT_SUMMARY.out.versions) // @@ -652,8 +680,9 @@ workflow CRISPRSEQ_TARGETED { // TEMPLATE_REFERENCE ( ORIENT_REFERENCE.out.reference - .join(SEQ_TO_FILE_TEMPL.out.file) + .join(ch_seq_template) ) + ch_versions = ch_versions.mix(TEMPLATE_REFERENCE.out.versions.first()) // @@ -683,25 +712,20 @@ workflow CRISPRSEQ_TARGETED { ch_mapped_bam .join(SAMTOOLS_INDEX.out.bai) .join(ORIENT_REFERENCE.out.reference) - .join(INPUT_CHECK.out.protospacer - .map { - meta, fastq -> - [ meta - meta.subMap('id') + [id: meta.id.split('_')[0..-2].join('_')], fastq ] - } - ) - .join(SEQ_TO_FILE_TEMPL.out.file, remainder: true) + .join(ch_input.protospacer) + .join(ch_seq_template, remainder: true) .join(ch_template_bam, remainder: true) .join(TEMPLATE_REFERENCE.out.fasta, remainder: true) .join(ALIGNMENT_SUMMARY.out.summary) .map { meta, reads, index, reference, protospacer, template, template_bam, reference_template, summary -> if (template == null) { - template = file('null_t') + template 
= [] } if (template_bam == null) { - template_bam = file('null_b') + template_bam = [] } if (reference_template == null) { - reference_template = file('null_r') + reference_template = [] } return [meta, reads, index, reference, protospacer, template, template_bam, reference_template, summary] } @@ -714,6 +738,18 @@ CIGAR_PARSER ( ch_to_parse_cigar ) + ch_versions = ch_versions.mix(CIGAR_PARSER.out.versions.first()) + + + // + // MODULE: Plot indels parsed from the alignments (CRISPRSEQ_PLOTTER) + // + CRISPRSEQ_PLOTTER ( CIGAR_PARSER.out.indels .join(ORIENT_REFERENCE.out.reference) .join(ch_input.protospacer) ) + ch_versions = ch_versions.mix(CRISPRSEQ_PLOTTER.out.versions.first()) // @@ -729,14 +765,20 @@ workflow_summary = WorkflowCrisprseq.paramsSummaryMultiqc(workflow, summary_params) ch_workflow_summary = Channel.value(workflow_summary) - methods_description = WorkflowCrisprseq.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description) + methods_description = WorkflowCrisprseq.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description, params) ch_methods_description = Channel.value(methods_description) ch_multiqc_files = Channel.empty() ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml')) + ch_multiqc_files = ch_multiqc_files.mix(CIGAR_PARSER.out.processing.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(CIGAR_PARSER.out.edition.collect{it[2]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(CIGAR_PARSER.out.qcindels.collect{it[1]}.ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) + if (params.overrepresented) { + ch_multiqc_files = ch_multiqc_files.mix(CUTADAPT.out.log.collect{it[1]}.ifEmpty([])) + } MULTIQC ( ch_multiqc_files.collect(), @@ -757,6 +799,7 @@ workflow.onComplete { if (params.email || params.email_on_fail) { NfcoreTemplate.email(workflow, params, summary_params, projectDir, log, multiqc_report) } + NfcoreTemplate.dump_parameters(workflow, params) NfcoreTemplate.summary(workflow, params, log) if (params.hook_url) { NfcoreTemplate.IM_notification(workflow, params, summary_params, projectDir, log)