Skip to content

Commit

Permalink
Merge pull request #84 from sanger-tol/dev
Browse files Browse the repository at this point in the history
Release 1.2
  • Loading branch information
muffato authored Dec 19, 2023
2 parents c5d5612 + db5c8b9 commit 91efc18
Show file tree
Hide file tree
Showing 58 changed files with 690 additions and 445 deletions.
16 changes: 8 additions & 8 deletions .github/workflows/fix-linting.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,21 @@ jobs:
# Only run if comment is on a PR with the main repo, and if it contains the magic keywords
if: >
contains(github.event.comment.html_url, '/pull/') &&
contains(github.event.comment.body, '@nf-core-bot fix linting') &&
contains(github.event.comment.body, '@sanger-tolsoft fix linting') &&
github.repository == 'sanger-tol/readmapping'
runs-on: ubuntu-latest
steps:
# Use the @nf-core-bot token to check out so we can push later
# Use the @sanger-tolsoft token to check out so we can push later
- uses: actions/checkout@v3
with:
token: ${{ secrets.nf_core_bot_auth_token }}
token: ${{ secrets.sangertolsoft_access_token }}

# Action runs on the issue comment, so we don't get the PR by default
# Use the gh cli to check out the PR
- name: Checkout Pull Request
run: gh pr checkout ${{ github.event.issue.number }}
env:
GITHUB_TOKEN: ${{ secrets.nf_core_bot_auth_token }}
GITHUB_TOKEN: ${{ secrets.sangertolsoft_access_token }}

- uses: actions/setup-node@v3

Expand All @@ -34,9 +34,9 @@ jobs:
id: prettier_status
run: |
if prettier --check ${GITHUB_WORKSPACE}; then
echo "name=result::pass" >> $GITHUB_OUTPUT
echo "result=pass" >> $GITHUB_OUTPUT
else
echo "name=result::fail" >> $GITHUB_OUTPUT
echo "result=fail" >> $GITHUB_OUTPUT
fi
- name: Run 'prettier --write'
Expand All @@ -46,8 +46,8 @@ jobs:
- name: Commit & push changes
if: steps.prettier_status.outputs.result == 'fail'
run: |
git config user.email "core@nf-co.re"
git config user.name "nf-core-bot"
git config user.email "105875386+sanger-tolsoft@users.noreply.github.com"
git config user.name "sanger-tolsoft"
git config push.default upstream
git add .
git status
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,22 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Launch workflow via tower
uses: nf-core/tower-action@v2
uses: seqeralabs/action-tower-launch@v2
with:
workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }}
access_token: ${{ secrets.TOWER_ACCESS_TOKEN }}
compute_env: ${{ secrets.TOWER_COMPUTE_ENV }}
pipeline: ${{ github.repository }}
revision: ${{ github.sha }}
workdir: ${{ secrets.TOWER_WORKDIR_PARENT }}/work/${{ github.repository }}/work-${{ github.sha }}
parameters: |
{
"outdir": "${{ secrets.TOWER_WORKDIR_PARENT }}/results/${{ github.repository }}/results-${{ github.sha }}",
}
profiles: test,sanger,singularity
profiles: test,sanger,singularity,cleanup

- uses: actions/upload-artifact@v3
with:
name: Tower debug log file
path: |
tower_action_*.log
tower_action_*.json
Original file line number Diff line number Diff line change
Expand Up @@ -22,16 +22,22 @@ jobs:
if: github.event_name == 'workflow_dispatch'

- name: Launch workflow via tower
uses: nf-core/tower-action@v2
uses: seqeralabs/action-tower-launch@v2
with:
workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }}
access_token: ${{ secrets.TOWER_ACCESS_TOKEN }}
compute_env: ${{ secrets.TOWER_COMPUTE_ENV }}
pipeline: ${{ github.repository }}
revision: ${{ env.REVISION }}
workdir: ${{ secrets.TOWER_WORKDIR_PARENT }}/work/${{ github.repository }}/work-${{ env.REVISION }}
parameters: |
{
"outdir": "${{ secrets.TOWER_WORKDIR_PARENT }}/results/${{ github.repository }}/results-${{ env.REVISION }}",
}
profiles: test_full,sanger,singularity
profiles: test_full,sanger,singularity,cleanup

- uses: actions/upload-artifact@v3
with:
name: Tower debug log file
path: |
tower_action_*.log
tower_action_*.json
38 changes: 38 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,44 @@
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [[1.2.0](https://github.com/sanger-tol/readmapping/releases/tag/1.2.0)] – Norwegian Ridgeback - [2023-12-19]

### Enhancements & fixes

- Restored recording read-groups (`@RG`) in the BAM/CRAM files.
- Updated the CI procedure to use "sanger-tol" rather than "nf-core" names.
- [crumble](https://github.com/jkbonfield/crumble) now used to compress the
PacBio HiFi alignments.
- Execution statistics now under `pipeline_info/readmapping/` (to be consistent
with the other sanger-tol pipelines).
- All resource requirements (memory, time, CPUs) now fit the actual usage. This
is achieved by automatically adjusting to the size of the input whenever
possible.
- Added the `--use_work_dir_as_temp` parameter to make SAMTOOLS_COLLATE use its
work directory for temporary files instead of `$TMPDIR`. It can be used to avoid
leaving unwanted temporary files on an HPC.

### Parameters

| Old parameter | New parameter |
| ------------- | ------------------------ |
| | `--use_work_dir_as_temp` |

> **NB:** Parameter has been **updated** if both old and new parameter information is present. <br> **NB:** Parameter has been **added** if just the new parameter information is present. <br> **NB:** Parameter has been **removed** if new parameter information isn't present.
### Software dependencies

Note, since the pipeline is using Nextflow DSL2, each process will be run with its own [Biocontainer](https://biocontainers.pro/#/registry). This means that on occasion it is entirely possible for the pipeline to be using different versions of the same tool. However, the overall software dependency changes compared to the last release have been listed below for reference.

| Dependency | Old version | New version |
| ---------- | --------------- | ------------- |
| `blast` | 2.12.0 | 2.13.0 |
| `crumble` | | 0.9.1 |
| `samtools` | 1.14 and 1.16.1 | 1.14 and 1.17 |
| `multiqc` | 1.13 | 1.14 |

> **NB:** Dependency has been **updated** if both old and new version information is present. <br> **NB:** Dependency has been **added** if just the new version information is present. <br> **NB:** Dependency has been **removed** if new version information isn't present.
## [[1.1.0](https://github.com/sanger-tol/readmapping/releases/tag/1.1.0)] – Hebridean Black - [2023-03-16]

### Enhancements & fixes
Expand Down
157 changes: 114 additions & 43 deletions conf/base.config
Original file line number Diff line number Diff line change
Expand Up @@ -2,64 +2,135 @@
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
sanger-tol/readmapping Nextflow base config file
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
A 'blank slate' config file, appropriate for general use on most high performance
compute environments. Assumes that all software is installed and available on
the PATH. Runs in `local` mode - all jobs will be run on the logged in environment.
----------------------------------------------------------------------------------------
*/

process {
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Increasing the number of CPUs often gives diminishing returns, so we increase it
following a logarithmic curve. Example:
- 0 < value <= 1: start + step
- 1 < value <= 2: start + 2*step
- 2 < value <= 4: start + 3*step
- 4 < value <= 8: start + 4*step
In order to support re-runs, the step increase may be multiplied by the attempt
number prior to calling this function.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/

cpus = { check_max( 1 * task.attempt, 'cpus' ) }
memory = { check_max( 6.GB * task.attempt, 'memory' ) }
time = { check_max( 4.h * task.attempt, 'time' ) }
// Logarithm of `value` in base `base`, floored at zero: any `value` of 1 or
// less (where a true logarithm would be zero, negative or undefined) yields 0.
def positive_log(value, base) {
    // Guard clause: inputs at or below 1 never reach the logarithm
    if (value <= 1) {
        return 0
    }
    // Change of base: log_base(value) = ln(value) / ln(base)
    return Math.log(value) / Math.log(base)
}

// Number of CPUs to request for an input of size `value`: `start`, plus one
// `step` for any `value` up to 1, plus one more `step` each time `value` grows
// by a further factor of `base` (the logarithm is rounded up).
// The total is passed through check_max(..., 'cpus'), a helper that is defined
// outside this section of the file.
def log_increase_cpus(start, step, value, base) {
    def step_count = 1 + Math.ceil(positive_log(value, base))
    return check_max(start + step * step_count, 'cpus')
}

errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' }
maxRetries = 1

process {

errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' }
maxRetries = 5
maxErrors = '-1'

// Process-specific resource requirements
// NOTE - Please try and re-use the labels below as much as possible.
// These labels are used and recognised by default in DSL2 files hosted on nf-core/modules.
// If possible, it would be nice to keep the same label naming convention when
// adding in your local modules too.
// See https://www.nextflow.io/docs/latest/config.html#config-process-selectors
withLabel:process_single {
cpus = { check_max( 1 , 'cpus' ) }
memory = { check_max( 6.GB * task.attempt, 'memory' ) }
time = { check_max( 4.h * task.attempt, 'time' ) }
// In this configuration file, we give few resources by default and
// explicitly bump them up for some processes.
// All rules should still increase resources every attempt to allow the
// pipeline to self-heal from MEMLIMIT/RUNLIMIT.

// Default
cpus = 1
memory = { check_max( 50.MB * task.attempt, 'memory' ) }
time = { check_max( 30.min * task.attempt, 'time' ) }

withName: 'SAMTOOLS_(CONVERT|FILTER)' {
time = { check_max( 1.hour * task.attempt, 'time' ) }
}

withName: 'SAMTOOLS_(FASTA)' {
time = { check_max( 2.hour * task.attempt, 'time' ) }
}

withName: 'SAMTOOLS_(STATS)' {
// Actually less than 1 hour for PacBio HiFi data, but confirmed 3 hours for Hi-C
time = { check_max( 4.hour * task.attempt, 'time' ) }
}
withLabel:process_low {
cpus = { check_max( 2 * task.attempt, 'cpus' ) }
memory = { check_max( 12.GB * task.attempt, 'memory' ) }
time = { check_max( 4.h * task.attempt, 'time' ) }

withName: 'SAMTOOLS_(COLLATE|FASTQ|FIXMATE|FLAGSTAT|MARKDUP|MERGE|SORT|VIEW)' {
time = { check_max( 8.hour * task.attempt, 'time' ) }
}
withLabel:process_medium {
cpus = { check_max( 6 * task.attempt, 'cpus' ) }
memory = { check_max( 36.GB * task.attempt, 'memory' ) }
time = { check_max( 8.h * task.attempt, 'time' ) }

withName: 'SAMTOOLS_(FLAGSTAT|IDXSTATS)' {
memory = { check_max( 250.MB * task.attempt, 'memory' ) }
}
withLabel:process_high {
cpus = { check_max( 12 * task.attempt, 'cpus' ) }
memory = { check_max( 72.GB * task.attempt, 'memory' ) }
time = { check_max( 16.h * task.attempt, 'time' ) }

withName: '.*:ALIGN_(HIFI|HIC|ILLUMINA):.*:SAMTOOLS_(STATS|VIEW)' {
memory = { check_max( 1.GB * task.attempt, 'memory' ) }
}
withLabel:process_long {
time = { check_max( 20.h * task.attempt, 'time' ) }
withName: '.*:ALIGN_(CLR|ONT):.*:SAMTOOLS_(STATS|VIEW)' {
memory = { check_max( 2.GB * task.attempt, 'memory' ) }
}
withLabel:process_high_memory {
memory = { check_max( 200.GB * task.attempt, 'memory' ) }

withName: '.*:FILTER_PACBIO:SAMTOOLS_COLLATE' {
cpus = { log_increase_cpus(4, 2*task.attempt, 1, 2) }
memory = { check_max( 1.GB * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'memory' ) }
}
withLabel:error_ignore {
errorStrategy = 'ignore'

withName: 'SAMTOOLS_SORMADUP' {
cpus = { log_increase_cpus(2, 6*task.attempt, 1, 2) }
memory = { check_max( 10.GB + 0.6.GB * Math.ceil( meta.read_count / 100000000 ) * task.attempt, 'memory' ) }
time = { check_max( 2.h * Math.ceil( meta.read_count / 100000000 ) * task.attempt / log_increase_cpus(2, 6*task.attempt, 1, 2), 'time' ) }
}
withLabel:error_retry {
errorStrategy = 'retry'
maxRetries = 2

withName: SAMTOOLS_SORT {
cpus = { log_increase_cpus(4, 2*task.attempt, 1, 2) }
// Memory increases by 768M for each thread (rounded up to 800 MB in the formula below)
memory = { check_max( 1.GB + 800.MB * log_increase_cpus(4, 2*task.attempt, 1, 2), 'memory' ) }
time = { check_max( 8.hour * Math.ceil( meta.read_count / 1000000000 ) * task.attempt, 'time' ) }
}
withName:BWAMEM2_INDEX {
memory = { check_max( 1.GB * Math.ceil( 28 * fasta.size() / 1000000000 ) * task.attempt, 'memory' ) }

withName: BLAST_BLASTN {
time = { check_max( 2.hour * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) }
memory = { check_max( 100.MB + 20.MB * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'memory' ) }
// The tool never seems to use more than 1 core even when given multiple. Sticking to 1 (the default)
}

withName: BWAMEM2_INDEX {
memory = { check_max( 24.GB * Math.ceil( meta.genome_size / 1000000000 ) * task.attempt, 'memory' ) }
time = { check_max( 30.min * Math.ceil( meta.genome_size / 1000000000 ) * task.attempt, 'time' ) }
// Not multithreaded
}

withName: BWAMEM2_MEM {
// Corresponds to 12 threads as the minimum, 24 threads if 3 billion reads
cpus = { log_increase_cpus(6, 6*task.attempt, meta.read_count/1000000000, 2) }
// Runtime for 1 billion reads on 12 threads is a function of the logarithm of the genome size
// Runtime is considered proportional to the number of reads and inversely to number of threads
time = { check_max( 3.h * task.attempt * Math.ceil(positive_log(meta2.genome_size/100000, 10)) * Math.ceil(meta.read_count/1000000000) * 12 / log_increase_cpus(6, 6*task.attempt, meta.read_count/1000000000, 2), 'time' ) }
// Base RAM usage is about 6 times the genome size. Each thread takes an additional 800 MB RAM
// Memory usage of SAMTOOLS_VIEW is negligible.
memory = { check_max( 6.GB * Math.ceil(meta2.genome_size / 1000000000) + 800.MB * log_increase_cpus(6, 6*task.attempt, meta.read_count/1000000000, 2), 'memory' ) }
}

withName: MINIMAP2_ALIGN {
cpus = { log_increase_cpus(4, 2*task.attempt, meta.read_count/1000000, 2) }
memory = { check_max( (6.GB * Math.ceil( reference.size() / 1000000000 ) + 4.GB * Math.ceil( meta.read_count / 1000000 )) * task.attempt, 'memory' ) }
time = { check_max( 3.h * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) }
}

withName: CRUMBLE {
// No correlation between memory usage and the number of reads or the genome size.
// Most genomes seem happy with 1 GB, then some with 2 GB, then some with 5 GB.
// The formula below tries to mimic that growth and relies on job retries being allowed.
memory = { check_max( task.attempt * (task.attempt + 1) * 512.MB, 'memory' ) }
// Slightly better correlation between runtime and the number of reads.
time = { check_max( 1.5.h + 1.h * Math.ceil( meta.read_count / 1000000 ) * task.attempt, 'time' ) }
}

withName:CUSTOM_DUMPSOFTWAREVERSIONS {
cache = false
}
Expand Down
Loading

0 comments on commit 91efc18

Please sign in to comment.