Skip to content
This repository has been archived by the owner on Jan 27, 2020. It is now read-only.

Commit

Permalink
Merge pull request #682 from alneberg/awsbatch
Browse files Browse the repository at this point in the history
Awsbatch cpu and memory config
  • Loading branch information
maxulysse authored Dec 17, 2018
2 parents ac53e63 + 43a380a commit e8469c6
Show file tree
Hide file tree
Showing 8 changed files with 79 additions and 14 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- [#671](https://github.com/SciLifeLab/Sarek/pull/671) - publishDir modes are now params
- [#677](https://github.com/SciLifeLab/Sarek/pull/677) - Update docs
- [#679](https://github.com/SciLifeLab/Sarek/pull/679) - Update old awsbatch configuration
- [#682](https://github.com/SciLifeLab/Sarek/pull/682) - Specifications for memory and cpus for awsbatch
- [#693](https://github.com/SciLifeLab/Sarek/pull/693) - Qualimap bamQC is now run after mapping and after recalibration for better QC
- [#700](https://github.com/SciLifeLab/Sarek/pull/700) - Update GATK to `4.0.9.0`
- [#702](https://github.com/SciLifeLab/Sarek/pull/702) - update FastQC to `0.11.8`
Expand Down
33 changes: 33 additions & 0 deletions conf/aws-batch.config
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,21 @@
// Parameter values set by the AWS Batch configuration.
params {
// Reference location on S3: the GRCh37 or GRCh38 igenomes path when params.genome
// matches one of them, otherwise the small reference set.
genome_base = params.genome == 'GRCh37' ? "s3://ngi-igenomes/igenomes/Homo_sapiens/GATK/GRCh37" : params.genome == 'GRCh38' ? "s3://ngi-igenomes/igenomes/Homo_sapiens/GATK/GRCh38" : "s3://sarek-references/small"
publishDirMode = 'copy'
singleCPUMem = 7.GB // Same value as in the uppmax slurm config, so process settings copied from it work here.
// Directory used for the report, timeline, DAG and trace files configured in this file.
localReportDir = 'Reports'
}

// Run tasks with the AWS Batch executor.
executor {
name = 'awsbatch'
// Path of the aws command line tool — presumably its location on the Batch compute instances; TODO confirm
awscli = '/home/ec2-user/miniconda/bin/aws'
}

/*
 * Write the report, timeline, DAG and trace files under params.localReportDir:
 * rolling files are currently not supported on s3.
 */
report.file = "${params.localReportDir}/Sarek_report.html"
timeline.file = "${params.localReportDir}/Sarek_timeline.html"
dag.file = "${params.localReportDir}/Sarek_DAG.svg"
trace.file = "${params.localReportDir}/Sarek_trace.txt"

process {
queue = params.awsqueue

Expand All @@ -26,4 +34,29 @@ process {
cpus = 2
memory = 8.GB

// RunBcftoolsStats: one CPU, twice params.singleCPUMem, submitted to params.awsqueue_tiny.
withName:RunBcftoolsStats {
cpus = 1
memory = {params.singleCPUMem * 2} // Request double the memory so that two of these tasks are not placed on the same instance
// Use the tiny queue for this one, so storage doesn't run out
queue = params.awsqueue_tiny
}
// RunVcftools: one CPU, twice params.singleCPUMem, submitted to params.awsqueue_tiny.
withName:RunVcftools {
cpus = 1
memory = {params.singleCPUMem * 2} // Request double the memory so that two of these tasks are not placed on the same instance
// Use the tiny queue for this one, so storage doesn't run out
queue = params.awsqueue_tiny
}
// RunHaplotypecaller: one CPU, twice params.singleCPUMem, submitted to params.awsqueue_tiny.
withName:RunHaplotypecaller {
cpus = 1
// Fixed request: the expression has no task.attempt factor, so memory does not grow on retries
memory = {params.singleCPUMem * 2} // Request double the memory so that two of these tasks are not placed on the same instance
// Use the tiny queue for this one, so storage doesn't run out
queue = params.awsqueue_tiny
}
// RunGenotypeGVCFs: one CPU, twice params.singleCPUMem, submitted to params.awsqueue_tiny.
withName:RunGenotypeGVCFs {
cpus = 1
memory = {params.singleCPUMem * 2} // Request double the memory so that two of these tasks are not placed on the same instance
// Use the tiny queue for this one, so storage doesn't run out
queue = params.awsqueue_tiny
}
}
4 changes: 3 additions & 1 deletion conf/base.config
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ params {
test = false // Not testing by default
verbose = false // Enable for more verbose information
awsqueue = false // Queue has to be provided when using awsbatch executor
awsqueue_tiny = params.awsqueue // A separate queue with smaller instance types
localReportDir = false // Used by AWS since reporting is not fully supported on s3 buckets
}

process {
Expand Down Expand Up @@ -67,6 +69,6 @@ dag { // Turning on dag by default

trace { // Turning on trace tracking by default
enabled = true
fields = 'process,task_id,hash,name,attempt,status,exit,realtime,%cpu,vmem,rss,submit,start,complete,duration,realtime,rchar,wchar'
fields = 'process,task_id,hash,name,attempt,status,exit,realtime,cpus,memory,%cpu,vmem,rss,submit,start,complete,duration,realtime,rchar,wchar'
file = "${params.outDir}/Reports/Sarek_trace.txt"
}
40 changes: 27 additions & 13 deletions conf/resources.config
Original file line number Diff line number Diff line change
Expand Up @@ -21,25 +21,27 @@ process {

withName:MapReads {
memory = { check_max( 60.GB * task.attempt, 'memory' ) }
cpus = { check_max( 10, 'cpus' ) }
cpus = { check_max( 16, 'cpus' ) }
}
withName:CreateRecalibrationTable {
cpus = { check_max( 12, 'cpus' ) }
memory = {params.singleCPUMem * 8 * task.attempt}
cpus = { check_max( 1, 'cpus' ) }
memory = { check_max( 60.GB * task.attempt, 'memory') }
}
withName:MarkDuplicates {
// Actually the -Xmx value should be kept lower
// Actually the -Xmx value should be kept lower,
// and is set through the markdup_java_options
cpus = { check_max( 8, 'cpus' ) }
memory = { check_max( 8.GB * task.attempt, 'memory' ) }
}
withName:MergeBams {
cpus = { check_max( 4, 'cpus') }
memory = {params.singleCPUMem * task.attempt}
time = { check_max( 5.h * task.attempt, 'time' ) }
}
withName:RecalibrateBam {
cpus = { check_max( 12, 'cpus' ) }
memory = { check_max( 7.GB * 8 * task.attempt, 'memory' ) }
time = { check_max( 5.h * task.attempt, 'time' ) }
cpus = { check_max( 2, 'cpus' ) }
memory = { check_max( 7.GB * 2 * task.attempt, 'memory' ) }
time = { check_max( 10.h * task.attempt, 'time' ) }
}
withName:RunAlleleCount {
cpus = { check_max( 1, 'cpus' ) }
Expand All @@ -49,6 +51,14 @@ process {
cpus = { check_max( 1, 'cpus' ) }
memory = { check_max( 14.GB * task.attempt, 'memory' ) }
}
withName:RunBamQCmapped {
cpus = { check_max( 6, 'cpus' ) }
memory = { check_max( 70.GB, 'memory' ) }
}
withName:RunBamQCrecalibrated {
cpus = { check_max( 6, 'cpus' ) }
memory = { check_max( 70.GB, 'memory' ) }
}
withName:RunBcftoolsStats {
cpus = { check_max( 1, 'cpus' ) }
}
Expand All @@ -65,13 +75,13 @@ process {
memory = { check_max( 8.GB * task.attempt, 'memory' ) }
}
withName:RunHaplotypecaller {
cpus = { check_max( 20, 'cpus' ) }
cpus = { check_max( 1, 'cpus' ) }
// Memory grows linearly with each retry (scaled by task.attempt)
memory = { check_max( 7.GB * 2 * task.attempt, 'memory' ) }
time = { check_max( 5.h * task.attempt, 'time' ) }
}
withName:RunGenotypeGVCFs {
cpus = { check_max( 20, 'cpus' ) }
cpus = { check_max( 1, 'cpus' ) }
memory = { check_max( 7.GB * task.attempt, 'memory' ) }
}
withName:RunMultiQC {
Expand All @@ -86,20 +96,24 @@ process {
cpus = { check_max( 2, 'cpus' ) }
time = { check_max( 5.h * task.attempt, 'time' ) }
}
withName:RunSingleManta {
cpus = { check_max( 20, 'cpus' ) }
memory = { check_max( 16.GB, 'memory') }
}
withName:RunSingleStrelka {
cpus = { check_max( 20, 'cpus' ) }
memory = { check_max( 16.GB, 'memory') }
time = { check_max( 5.h * task.attempt, 'time' ) }
}
withName:RunSnpeff {
cpus = { check_max( 1, 'cpus' ) }
errorStrategy = { task.exitStatus == 143 ? 'retry' : 'ignore' }
}
withName:RunStrelka {
cpus = { check_max( 1, 'cpus' ) }
time = { check_max( 5.h * task.attempt, 'time' ) }
}
withName:RunVEP {
cpus = { check_max( 1, 'cpus' ) }
cpus = { check_max( 16, 'cpus' ) }
memory = {check_max (32.GB * task.attempt, 'memory' ) }
errorStrategy = { task.exitStatus == 143 ? 'retry' : 'ignore' }
}
}
}
3 changes: 3 additions & 0 deletions conf/uppmax-slurm.config
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ params {
singleCPUMem = 7.GB // for processes that are using more memory but a single CPU only. Use the 'core' queue for these
}

// Extended set of fields, e.g. native_id, cpu and memory:
trace.fields = 'process,task_id,hash,name,native_id,attempt,status,exit,realtime,cpus,memory,%cpu,vmem,rss,submit,start,complete,duration,realtime,rchar,wchar'

process {
clusterOptions = {"-A $params.project"}
cpus = 16
Expand Down
8 changes: 8 additions & 0 deletions docs/PARAMETERS.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,14 @@ So you can write `--tools mutect2,ascat` or `--tools MuTect2,ASCAT` without worr

Only required if you use the awsbatch profile. This parameter specifies the AWS Batch queue to which jobs are submitted.

### --awsqueue_tiny `BatchQueueName`

Only used if you use the awsbatch profile. This parameter specifies a separate queue for certain small jobs that might still require a significant amount of disk storage. It defaults to the value of `--awsqueue`.

### --localReportDir `Directory`

Only used if you use the awsbatch profile. This parameter specifies a local output directory for the Nextflow reports, such as `Sarek_timeline.html`, because writing them to S3 is currently not fully supported.

### --verbose

Display more information about files being processed.
Expand Down
3 changes: 3 additions & 0 deletions lib/SarekUtils.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ class SarekUtils {
'annotateTools',
'annotateVCF',
'awsqueue',
'awsqueue_tiny',
'build',
'call-name',
'callName',
Expand All @@ -52,6 +53,8 @@ class SarekUtils {
'genome',
'genomes',
'help',
'localReportDir',
'local-report-dir',
'markdup_java_options',
'max_cpus',
'max_memory',
Expand Down
1 change: 1 addition & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ profiles {
includeConfig 'conf/igenomes.config'
includeConfig 'conf/aws-batch.config'
includeConfig 'conf/docker.config'
includeConfig 'conf/resources.config'
includeConfig 'conf/containers.config'
}
// Small testing with Singularity profile
Expand Down

0 comments on commit e8469c6

Please sign in to comment.