From 7b1833d35db32efebe29d2575750c5387b914c83 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 1 May 2019 17:33:15 +0200 Subject: [PATCH 01/28] update README --- README.md | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 4171c4be8e..3e52b0e833 100644 --- a/README.md +++ b/README.md @@ -3,11 +3,13 @@ **An open-source analysis pipeline to detect germline or somatic variants from whole genome or targeted sequencing**. +> :warning: This pipeline is a work in progress being ported to nf-core from [SciLifeLab/Sarek](https://github.com/SciLifeLab/Sarek) + [![Nextflow version][nextflow-badge]](https://www.nextflow.io) [![Travis build status][travis-badge]](https://travis-ci.org/nf-core/sarek) [![Install with bioconda][bioconda-badge]](http://bioconda.github.io/) -[![Docker Container available][docker-badge]](https://hub.docker.com/r/nf-core/sarek) +[![Docker Container available][docker-badge]](https://hub.docker.com/r/nfcore/sarek) [![Join us on Slack][slack-badge]](https://nfcore.slack.com/messages/CGFUX04HZ/) @@ -136,8 +138,12 @@ For further information or help, don't hesitate to get in touch on [Slack](https [National Genomics Infrastructure logo][ngi-link] [National Bioinformatics Infrastructure Sweden logo][nbis-link] -[bioconda-badge]: https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=popout&logo= -[docker-badge]: https://img.shields.io/docker/automated/nf-core/sarek.svg?style=popout&logo=docker -[nextflow-badge]: https://img.shields.io/badge/nextflow-%E2%89%A519.04.0-brightgreen.svg?style=popout&logo= -[travis-badge]: https://img.shields.io/travis/nf-core/sarek.svg?style=popout&logo=travis -[slack-badge]: https://img.shields.io/badge/slack-nfcore/sarek-blue.svg?style=popout&logo=slack +[bioconda-badge]: https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?logo= +[btb-link]: https://ki.se/forskning/barntumorbanken-0 +[docker-badge]: 
https://img.shields.io/docker/automated/nfcore/sarek.svg?logo=docker +[nbis-link]: https://nbis.se +[nextflow-badge]: https://img.shields.io/badge/nextflow-%E2%89%A519.04.0-brightgreen.svg?logo= +[ngi-link]: https://ngisweden.scilifelab.se/ +[scilifelab-link]: https://scilifelab.se +[slack-badge]: https://img.shields.io/badge/slack-nfcore/sarek-blue.svg?logo=slack +[travis-badge]: https://img.shields.io/travis/nf-core/sarek.svg?logo=travis From 961856cc3b5fc0b2826a39fc2306b4526b9c2db5 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 1 May 2019 17:55:13 +0200 Subject: [PATCH 02/28] update .travis.yml --- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index fa72533100..e736da2a64 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,10 +11,10 @@ before_install: # PRs to master are only ok if coming from dev branch - '[ $TRAVIS_PULL_REQUEST = "false" ] || [ $TRAVIS_BRANCH != "master" ] || ([ $TRAVIS_PULL_REQUEST_SLUG = $TRAVIS_REPO_SLUG ] && [ $TRAVIS_PULL_REQUEST_BRANCH = "dev" ])' # Pull the docker image first so the test doesn't wait for this - - docker pull maxulysse/sarek:dev + - docker pull nfcore/sarek:dev # Fake the tag locally so that the pipeline runs properly # Looks weird when this is :dev to :dev, but makes sense when testing code for a release (:dev to :1.0.1) - - docker tag maxulysse/sarek:dev maxulysse/sarek:dev + - docker tag nfcore/sarek:dev nfcore/sarek:dev install: # Install Nextflow From cefd329e686e001216b9ce5cd72ea2401108c7c4 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 1 May 2019 17:55:24 +0200 Subject: [PATCH 03/28] add Jenkinsfile --- Jenkinsfile | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 Jenkinsfile diff --git a/Jenkinsfile b/Jenkinsfile new file mode 100644 index 0000000000..b54e5c7deb --- /dev/null +++ b/Jenkinsfile @@ -0,0 +1,35 @@ +pipeline { + agent any + + environment { + JENKINS_API = credentials('api') + } + + stages { 
+ stage('Setup environment') { + steps { + sh "docker pull nfcore/sarek:dev" + } + } + stage('Build') { + steps { + sh "git clone --single-branch --branch sarek https://github.com/nf-core/test-datasets.git test-data" + sh "nextflow run build.nf -profile docker --genome smallGRCh37 --refdir test-data/reference --outdir References" + } + } + stage('Test') { + steps { + sh "nextflow run main.nf -profile docker --help" + } + } + } + + post { + failure { + script { + def response = sh(script: "curl -u ${JENKINS_API_USR}:${JENKINS_API_PSW} ${BUILD_URL}/consoleText", returnStdout: true).trim().replace('\n', '
') + def comment = pullRequest.comment("## :rotating_light: Build log output:
${response}
") + } + } + } +} From 1d3c120f24a6917ab914fd0254aa2fdd25e75772 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Wed, 1 May 2019 17:58:00 +0200 Subject: [PATCH 04/28] update conda environment file --- environment.yml | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 4e6811ec55..dcdbe97d7d 100644 --- a/environment.yml +++ b/environment.yml @@ -6,6 +6,25 @@ channels: - bioconda - defaults dependencies: - # TODO nf-core: Add required software dependencies here + - r-rcolorbrewer=1.1 + - r-base=3.5.1 + - bcftools=1.9 + - bioconductor-rtracklayer=1.42.1 + - bwa=0.7.17 + - cancerit-allelecount=2.1.2 + - control-freec=11.4 + - ensembl-vep=96.0 - fastqc=0.11.8 + - freebayes=1.2.0 + - gatk4=4.1.1.0 + - genesplicer=1.0 + - htslib=1.9 + - igvtools=2.3.93 + - manta=1.5.0 - multiqc=1.7 + - qualimap=2.2.2b + - samtools=1.9 + - snpeff=4.3.1t + - strelka=2.9.10 + - vcfanno=0.3.1 + - vcftools=0.1.16 From 95d55117eb19712f1e173b10f977686ab7df4fbf Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 2 May 2019 09:49:07 +0200 Subject: [PATCH 05/28] update software_version collect --- bin/scrape_software_versions.py | 36 +++++++++++++++++++++++++++++++-- main.nf | 26 ++++++++++++++++-------- 2 files changed, 52 insertions(+), 10 deletions(-) diff --git a/bin/scrape_software_versions.py b/bin/scrape_software_versions.py index 0cb269ad8c..599043c4ae 100755 --- a/bin/scrape_software_versions.py +++ b/bin/scrape_software_versions.py @@ -5,16 +5,48 @@ # TODO nf-core: Add additional regexes for new tools in process get_software_versions regexes = { - 'nf-core/sarek': ['v_pipeline.txt', r"(\S+)"], - 'Nextflow': ['v_nextflow.txt', r"(\S+)"], + 'AlleleCount': ['v_allelecount.txt', r"(\S+)"], + 'ASCAT': ['v_ascat.txt', r"(\d\.\d+)"], + 'bcftools': ['v_bcftools.txt', r"bcftools (\S+)"], + 'BWA': ['v_bwa.txt', r"Version: (\S+)"], 'FastQC': ['v_fastqc.txt', r"FastQC v(\S+)"], + 'FreeBayes': ['v_freebayes.txt', r"version: 
v(\d\.\d\.\d+)"], + 'GATK': ['v_gatk.txt', r"Version:(\S+)"], + 'htslib': ['v_samtools.txt', r"htslib (\S+)"], + 'Manta': ['v_manta.txt', r"([0-9.]+)"], 'MultiQC': ['v_multiqc.txt', r"multiqc, version (\S+)"], + 'Nextflow': ['v_nextflow.txt', r"(\S+)"], + 'nf-core/sarek': ['v_pipeline.txt', r"(\S+)"], + 'Picard': ['v_picard.txt', r"Picard version:(\d\.\d\.\d+)"], + 'Qualimap': ['v_qualimap.txt', r"QualiMap v.(\S+)"], + 'R': ['v_r.txt', r"R version (\S+)"], + 'samtools': ['v_samtools.txt', r"samtools (\S+)"], + 'SnpEff': ['v_snpeff.txt', r"version SnpEff (\S+)"], + 'Strelka': ['v_strelka.txt', r"([0-9.]+)"], + 'vcftools': ['v_vcftools.txt', r"([0-9.]+)"], + 'VEP': ['v_vep.txt', r"ensembl-vep : (\S+)"], } results = OrderedDict() results['nf-core/sarek'] = 'N/A' results['Nextflow'] = 'N/A' +results['AlleleCount'] = 'N/A' +results['ASCAT'] = 'N/A' +results['bcftools'] = 'N/A' +results['BWA'] = 'N/A' results['FastQC'] = 'N/A' +results['FreeBayes'] = 'N/A' +results['GATK'] = 'N/A' +results['htslib'] = 'N/A' +results['Manta'] = 'N/A' results['MultiQC'] = 'N/A' +results['Picard'] = 'N/A' +results['Qualimap'] = 'N/A' +results['R'] = 'N/A' +results['samtools'] = 'N/A' +results['SnpEff'] = 'N/A' +results['Strelka'] = 'N/A' +results['vcftools'] = 'N/A' +results['VEP'] = 'N/A' # Search each file using its regex for k, v in regexes.items(): diff --git a/main.nf b/main.nf index dc781271f2..3b8687f20c 100644 --- a/main.nf +++ b/main.nf @@ -20,16 +20,15 @@ def helpMessage() { The typical command for running the pipeline is as follows: - nextflow run nf-core/sarek --reads '*_R{1,2}.fastq.gz' -profile docker + nextflow run nf-core/sarek --sample sample.tsv -profile docker Mandatory arguments: - --reads Path to input data (must be surrounded with quotes) + --sample Path to TSV input file -profile Configuration profile to use. Can use multiple (comma separated) Available: conda, docker, singularity, awsbatch, test and more. 
Options: --genome Name of iGenomes reference - --singleEnd Specifies that the input is single end reads References If not specified in the configuration file or you wish to overwrite any of the references. --fasta Path to Fasta reference @@ -176,12 +175,23 @@ process get_software_versions { file "software_versions.csv" script: - // TODO nf-core: Get all tools to print their version number here """ - echo $workflow.manifest.version > v_pipeline.txt - echo $workflow.nextflow.version > v_nextflow.txt - fastqc --version > v_fastqc.txt - multiqc --version > v_multiqc.txt + bcftools version > v_bcftools.txt 2>&1 || true + bwa &> v_bwa.txt 2>&1 || true + configManta.py --version > v_manta.txt 2>&1 || true + configureStrelkaGermlineWorkflow.py --version > v_strelka.txt 2>&1 || true + echo "${workflow.manifest.version}" &> v_pipeline.txt 2>&1 || true + echo "${workflow.nextflow.version}" &> v_nextflow.txt 2>&1 || true + echo "SNPEFF version"\$(snpEff -h 2>&1) > v_snpeff.txt + fastqc --version > v_fastqc.txt 2>&1 || true + freebayes --version > v_freebayes.txt 2>&1 || true + gatk ApplyBQSR --help 2>&1 | grep Version: > v_gatk.txt 2>&1 || true + multiqc --version &> v_multiqc.txt 2>&1 || true + qualimap --version &> v_qualimap.txt 2>&1 || true + samtools --version &> v_samtools.txt 2>&1 || true + vcftools --version &> v_vcftools.txt 2>&1 || true + vep --help &> v_vep.txt 2>&1 || true + scrape_software_versions.py &> software_versions_mqc.yaml """ } From e21ce880e2c1a8cc9e814638b59a8411ae28dfb2 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 2 May 2019 10:09:37 +0200 Subject: [PATCH 06/28] trying to fix travis CI --- .travis.yml | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index e736da2a64..84b9cecb5d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,18 +2,27 @@ sudo: required language: python jdk: openjdk8 services: docker + +addons: + apt: + update: true + python: '3.6' cache: pip + matrix: fast_finish: 
true +env: + - NXF_VER=19.04.0 + - NXF_VER='' + before_install: # PRs to master are only ok if coming from dev branch - '[ $TRAVIS_PULL_REQUEST = "false" ] || [ $TRAVIS_BRANCH != "master" ] || ([ $TRAVIS_PULL_REQUEST_SLUG = $TRAVIS_REPO_SLUG ] && [ $TRAVIS_PULL_REQUEST_BRANCH = "dev" ])' # Pull the docker image first so the test doesn't wait for this - docker pull nfcore/sarek:dev # Fake the tag locally so that the pipeline runs properly - # Looks weird when this is :dev to :dev, but makes sense when testing code for a release (:dev to :1.0.1) - docker tag nfcore/sarek:dev nfcore/sarek:dev install: @@ -24,12 +33,13 @@ install: # Install nf-core/tools - pip install --upgrade pip - pip install nf-core + # Install Conda + - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh + - bash Miniconda3-latest-Linux-x86_64.sh -b -f -p $HOME/miniconda + - export PATH="$HOME/miniconda/bin:$PATH" # Reset - mkdir ${TRAVIS_BUILD_DIR}/tests && cd ${TRAVIS_BUILD_DIR}/tests -env: - - NXF_VER=19.04.0 - jobs: include: - stage: lint From 00c7704478fa0b4e492ea675179be15c1a5cf7f8 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 2 May 2019 10:14:47 +0200 Subject: [PATCH 07/28] only one env --- .travis.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 84b9cecb5d..1c6997caab 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,7 +15,6 @@ matrix: env: - NXF_VER=19.04.0 - - NXF_VER='' before_install: # PRs to master are only ok if coming from dev branch From 3be2f63cbfef5e64cd4fe077bb1c8d51d557d5bf Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 2 May 2019 10:20:11 +0200 Subject: [PATCH 08/28] remove pip upgrade --- .travis.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 1c6997caab..03be6585db 100644 --- a/.travis.yml +++ b/.travis.yml @@ -30,7 +30,6 @@ install: - wget -qO- get.nextflow.io | bash - sudo ln -s /tmp/nextflow/nextflow /usr/local/bin/nextflow # Install nf-core/tools - - pip install 
--upgrade pip - pip install nf-core # Install Conda - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh From 3cd9748c63e68519853c195aa750668e25e46e7b Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 2 May 2019 10:28:26 +0200 Subject: [PATCH 09/28] remove lint --- .travis.yml | 9 --------- 1 file changed, 9 deletions(-) diff --git a/.travis.yml b/.travis.yml index 03be6585db..03cf98bb46 100644 --- a/.travis.yml +++ b/.travis.yml @@ -29,21 +29,12 @@ install: - mkdir /tmp/nextflow && cd /tmp/nextflow - wget -qO- get.nextflow.io | bash - sudo ln -s /tmp/nextflow/nextflow /usr/local/bin/nextflow - # Install nf-core/tools - - pip install nf-core - # Install Conda - - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh - - bash Miniconda3-latest-Linux-x86_64.sh -b -f -p $HOME/miniconda - - export PATH="$HOME/miniconda/bin:$PATH" # Reset - mkdir ${TRAVIS_BUILD_DIR}/tests && cd ${TRAVIS_BUILD_DIR}/tests jobs: include: - - stage: lint - script: nf-core lint ${TRAVIS_BUILD_DIR} - stage: built - script: skip script: git clone --single-branch --branch sarek https://github.com/nf-core/test-datasets.git test-data script: nextflow run ${TRAVIS_BUILD_DIR}/build.nf -profile docker --genome smallGRCh37 --refdir test-data/reference --outdir References - stage: test From 861577cde5d285f8c27fc608ec88e0a5895937b0 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 2 May 2019 10:40:42 +0200 Subject: [PATCH 10/28] remove lib/SarekUtils.groovy --- lib/SarekUtils.groovy | 41 ------------------------------------ main.nf | 49 ++++++++++++++++++++++++++++++++++++------- 2 files changed, 42 insertions(+), 48 deletions(-) delete mode 100644 lib/SarekUtils.groovy diff --git a/lib/SarekUtils.groovy b/lib/SarekUtils.groovy deleted file mode 100644 index b1a90bfcaa..0000000000 --- a/lib/SarekUtils.groovy +++ /dev/null @@ -1,41 +0,0 @@ -import static nextflow.Nextflow.file -import nextflow.Channel - -class SarekUtils { - - // Check parameter existence 
- static def checkParameterExistence(it, list) { - if (!list.contains(it)) { - println("Unknown parameter: ${it}") - return false - } - return true - } - - // Compare each parameter with a list of parameters - static def checkParameterList(list, realList) { - return list.every{ checkParameterExistence(it, realList) } - } - - // Loop through all the references files to check their existence - static def checkReferenceMap(referenceMap) { - referenceMap.every { - referenceFile, fileToCheck -> - SarekUtils.checkRefExistence(referenceFile, fileToCheck) - } - } - - // Loop through all the references files to check their existence - static def checkRefExistence(referenceFile, fileToCheck) { - if (fileToCheck instanceof List) return fileToCheck.every{ SarekUtils.checkRefExistence(referenceFile, it) } - def f = file(fileToCheck) - // this is an expanded wildcard: we can assume all files exist - if (f instanceof List && f.size() > 0) return true - else if (!f.exists()) { - println "Missing references: ${referenceFile} ${fileToCheck}" - return false - } - return true - } - -} diff --git a/main.nf b/main.nf index 3b8687f20c..c39a1353ea 100644 --- a/main.nf +++ b/main.nf @@ -63,17 +63,17 @@ if (params.genomes && params.genome && !params.genomes.containsKey(params.genome stepList = defineStepList() step = params.step ? params.step.toLowerCase() : '' if (step == 'preprocessing' || step == '') step = 'mapping' -if (!SarekUtils.checkParameterExistence(step, stepList)) exit 1, 'Unknown step, see --help for more information' +if (!checkParameterExistence(step, stepList)) exit 1, 'Unknown step, see --help for more information' if (step.contains(',')) exit 1, 'You can choose only one step, see --help for more information' if (step == 'mapping' && !checkExactlyOne([params.test, params.sample, params.sampleDir])) exit 1, 'Please define which samples to work on by providing exactly one of the --test, --sample or --sampleDir options' tools = params.tools ? 
params.tools.split(',').collect{it.trim().toLowerCase()} : [] toolList = defineToolList() -if (!SarekUtils.checkParameterList(tools,toolList)) exit 1, 'Unknown tool(s), see --help for more information' +if (!checkParameterList(tools,toolList)) exit 1, 'Unknown tool(s), see --help for more information' referenceMap = defineReferenceMap(step, tools) -if (!SarekUtils.checkReferenceMap(referenceMap)) exit 1, 'Missing Reference file(s), see --help for more information' +if (!checkReferenceMap(referenceMap)) exit 1, 'Missing Reference file(s), see --help for more information' // Has the run name been specified by the user? // this has the bonus effect of catching both -name and --name @@ -363,15 +363,50 @@ def checkHostname(){ ======================================================================================== */ +def checkExactlyOne(list) { + def n = 0 + list.each{n += it ? 1 : 0} + return n == 1 +} + +// Check parameter existence +def checkParameterExistence(it, list) { + if (!list.contains(it)) { + println("Unknown parameter: ${it}") + return false + } + return true +} + +// Compare each parameter with a list of parameters +def checkParameterList(list, realList) { + return list.every{ checkParameterExistence(it, realList) } +} + def checkParamReturnFile(item) { params."${item}" = params.genomes[params.genome]."${item}" return file(params."${item}") } -def checkExactlyOne(list) { - def n = 0 - list.each{n += it ? 
1 : 0} - return n == 1 +// Loop through all the references files to check their existence +def checkRefExistence(referenceFile, fileToCheck) { + if (fileToCheck instanceof List) return fileToCheck.every{ checkRefExistence(referenceFile, it) } + def f = file(fileToCheck) + // this is an expanded wildcard: we can assume all files exist + if (f instanceof List && f.size() > 0) return true + else if (!f.exists()) { + println "Missing references: ${referenceFile} ${fileToCheck}" + return false + } + return true +} + +// Loop through all the references files to check their existence +def checkReferenceMap(referenceMap) { + referenceMap.every { + referenceFile, fileToCheck -> + checkRefExistence(referenceFile, fileToCheck) + } } def defineReferenceMap(step, tools) { From 6cc6efe8e829fefebdf7adfb54c2c96133a19929 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 2 May 2019 10:46:41 +0200 Subject: [PATCH 11/28] add References to .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 5b54e3e6c2..6d5ff0b631 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ .nextflow* work/ data/ +References/ results/ .DS_Store tests/test_data From e2a44ddd7498747cc2655d657844cfc38d0fbd77 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 2 May 2019 10:48:14 +0200 Subject: [PATCH 12/28] lower case --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 6d5ff0b631..96675d0154 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,7 @@ .nextflow* work/ data/ -References/ +references/ results/ .DS_Store tests/test_data From 1f0cce6b5dd9fe4173f26e7a14043fe018821a3d Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 2 May 2019 11:51:25 +0200 Subject: [PATCH 13/28] remove picard --- bin/scrape_software_versions.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/bin/scrape_software_versions.py b/bin/scrape_software_versions.py index 599043c4ae..5667cd24c8 100755 --- 
a/bin/scrape_software_versions.py +++ b/bin/scrape_software_versions.py @@ -17,7 +17,6 @@ 'MultiQC': ['v_multiqc.txt', r"multiqc, version (\S+)"], 'Nextflow': ['v_nextflow.txt', r"(\S+)"], 'nf-core/sarek': ['v_pipeline.txt', r"(\S+)"], - 'Picard': ['v_picard.txt', r"Picard version:(\d\.\d\.\d+)"], 'Qualimap': ['v_qualimap.txt', r"QualiMap v.(\S+)"], 'R': ['v_r.txt', r"R version (\S+)"], 'samtools': ['v_samtools.txt', r"samtools (\S+)"], @@ -39,7 +38,6 @@ results['htslib'] = 'N/A' results['Manta'] = 'N/A' results['MultiQC'] = 'N/A' -results['Picard'] = 'N/A' results['Qualimap'] = 'N/A' results['R'] = 'N/A' results['samtools'] = 'N/A' From 8627fc0861e572630897e6c946262f93d98b170f Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 2 May 2019 11:52:22 +0200 Subject: [PATCH 14/28] improve software versions gathering --- main.nf | 181 ++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 164 insertions(+), 17 deletions(-) diff --git a/main.nf b/main.nf index c39a1353ea..2060a2dec6 100644 --- a/main.nf +++ b/main.nf @@ -3,6 +3,8 @@ ======================================================================================== nf-core/sarek ======================================================================================== +New Germline (+ Somatic) Analysis Workflow. Started March 2016. +---------------------------------------------------------------------------------------- nf-core/sarek Analysis Pipeline. @Homepage https://sarek.scilifelab.se/ @@ -60,12 +62,17 @@ if (params.genomes && params.genome && !params.genomes.containsKey(params.genome exit 1, "The provided genome '${params.genome}' is not available in the iGenomes file. Currently the available genomes are ${params.genomes.keySet().join(", ")}" } +params.step = 'mapping' +params.test = false +params.sampleDir = false +params.tools = false + stepList = defineStepList() step = params.step ? 
params.step.toLowerCase() : '' if (step == 'preprocessing' || step == '') step = 'mapping' if (!checkParameterExistence(step, stepList)) exit 1, 'Unknown step, see --help for more information' if (step.contains(',')) exit 1, 'You can choose only one step, see --help for more information' -if (step == 'mapping' && !checkExactlyOne([params.test, params.sample, params.sampleDir])) +if (step == 'mapping' && ([params.test, params.sample, params.sampleDir].size == 1)) exit 1, 'Please define which samples to work on by providing exactly one of the --test, --sample or --sampleDir options' tools = params.tools ? params.tools.split(',').collect{it.trim().toLowerCase()} : [] @@ -112,6 +119,33 @@ ch_output_docs = Channel.fromPath("$baseDir/docs/output.md") if (params.test || step != 'mapping') tsvPath = tsvPaths[step] } + // Set up the inputFiles and bamFiles channels. One of them will remain empty + inputFiles = Channel.empty() + bamFiles = Channel.empty() + if (tsvPath) { + tsvFile = file(tsvPath) + switch (step) { + case 'mapping': inputFiles = extractSample(tsvFile); break + case 'recalibrate': bamFiles = extractRecal(tsvFile); break + default: exit 1, "Unknown step ${step}" + } + } else if (params.sampleDir) { + if (step != 'mapping') exit 1, '--sampleDir does not support steps other than "mapping"' + inputFiles = extractFastqFromDir(params.sampleDir) + (inputFiles, fastqTmp) = inputFiles.into(2) + fastqTmp.toList().subscribe onNext: { + if (it.size() == 0) { + exit 1, "No FASTQ files found in --sampleDir directory '${params.sampleDir}'" + } + } + tsvFile = params.sampleDir // used in the reports + } else exit 1, 'No sample were defined, see --help' + + if (step == 'recalibrate') (patientGenders, bamFiles) = extractGenders(bamFiles) + else (patientGenders, inputFiles) = extractGenders(inputFiles) + + + // Header log info log.info nfcoreHeader() def summary = [:] @@ -164,20 +198,17 @@ ${summary.collect { k,v -> "
$k
${v ?: ' - if (filename.indexOf(".csv") > 0) filename - else null - } + publishDir path:"${params.outdir}/pipeline_info", mode: params.publishDirMode output: file 'software_versions_mqc.yaml' into software_versions_yaml - file "software_versions.csv" script: """ + alleleCounter --version &> v_allelecount.txt || true bcftools version > v_bcftools.txt 2>&1 || true bwa &> v_bwa.txt 2>&1 || true + cat ${baseDir}/scripts/ascat.R | grep "ASCAT version" &> v_ascat.txt || true configManta.py --version > v_manta.txt 2>&1 || true configureStrelkaGermlineWorkflow.py --version > v_strelka.txt 2>&1 || true echo "${workflow.manifest.version}" &> v_pipeline.txt 2>&1 || true @@ -188,6 +219,7 @@ process get_software_versions { gatk ApplyBQSR --help 2>&1 | grep Version: > v_gatk.txt 2>&1 || true multiqc --version &> v_multiqc.txt 2>&1 || true qualimap --version &> v_qualimap.txt 2>&1 || true + R --version &> v_r.txt || true samtools --version &> v_samtools.txt 2>&1 || true vcftools --version &> v_vcftools.txt 2>&1 || true vep --help &> v_vep.txt 2>&1 || true @@ -196,6 +228,9 @@ process get_software_versions { """ } + + + /* * Completion e-mail notification */ @@ -324,12 +359,12 @@ def nfcoreHeader(){ ${c_blue} |\\ | |__ __ / ` / \\ |__) |__ ${c_yellow}} {${c_reset} ${c_blue} | \\| | \\__, \\__/ | \\ |___ ${c_green}\\`-._,-`-,${c_reset} ${c_green}`._,._,\'${c_reset} - ____ _____ _ - .' _ `. / ____| | | - / |\\`-_ \\ | (___ ___ _ __ __ | | __ - | | \\ `-| \\___ \\/__ \\| ´__/ _\\| |/ / - \\ | \\ / ____) | __ | | | __| < - `|____\\' |_____/\\____|_| \\__/|_|\\_\\ + ${c_black} ____ ${c_blue} _____ _ ${c_reset} + ${c_black} .' ${c_green}_${c_black} `. 
${c_blue} / ____| | | ${c_reset} + ${c_black} / ${c_green}|\\${c_white}`-_${c_black} \\ ${c_blue} | (___ ___ _ __ __ | | __ ${c_reset} + ${c_black} | ${c_green}| \\ ${c_white}`-${c_black}| ${c_blue} \\___ \\/__ \\| ´__/ _\\| |/ / ${c_reset} + ${c_black} \\ ${c_green}| \\ ${c_black}/ ${c_blue} ____) | __ | | | __| < ${c_reset} + ${c_black} `${c_green}|${c_black}____${c_green}\\${c_black}' ${c_blue} |_____/\\____|_| \\__/|_|\\_\\ ${c_reset} ${c_purple} nf-core/sarek v${workflow.manifest.version}${c_reset} ${c_dim}----------------------------------------------------${c_reset} @@ -363,10 +398,10 @@ def checkHostname(){ ======================================================================================== */ -def checkExactlyOne(list) { - def n = 0 - list.each{n += it ? 1 : 0} - return n == 1 +// Check if a row has the expected number of item +def checkNumberOfItem(row, number) { + if (row.size() != number) exit 1, "Malformed row in TSV file: ${row}, see --help for more information" + return true } // Check parameter existence @@ -383,6 +418,7 @@ def checkParameterList(list, realList) { return list.every{ checkParameterExistence(it, realList) } } +// Check if params.item exists and return params.genomes[params.genome].item otherwise def checkParamReturnFile(item) { params."${item}" = params.genomes[params.genome]."${item}" return file(params."${item}") @@ -409,6 +445,7 @@ def checkReferenceMap(referenceMap) { } } +// Define map of reference depending of tools and step def defineReferenceMap(step, tools) { def referenceMap = [ @@ -439,6 +476,7 @@ def defineReferenceMap(step, tools) { return referenceMap } +// Define list of available step def defineStepList() { return [ 'mapping', @@ -448,6 +486,7 @@ def defineStepList() { ] } +// Define list of available tools def defineToolList() { return [ 'ascat', @@ -458,3 +497,111 @@ def defineToolList() { 'strelka' ] } + + // Create a channel of germline FASTQs from a directory pattern: "my_samples/*/" + // All FASTQ files in 
subdirectories are collected and emitted; + // they must have _R1_ and _R2_ in their names. +def extractFastqFromDir(pattern) { + def fastq = Channel.create() + // a temporary channel does all the work + Channel + .fromPath(pattern, type: 'dir') + .ifEmpty { error "No directories found matching pattern '${pattern}'" } + .subscribe onNext: { sampleDir -> + // the last name of the sampleDir is assumed to be a unique sample id + sampleId = sampleDir.getFileName().toString() + + for (path1 in file("${sampleDir}/**_R1_*.fastq.gz")) { + assert path1.getName().contains('_R1_') + path2 = file(path1.toString().replace('_R1_', '_R2_')) + if (!path2.exists()) error "Path '${path2}' not found" + (flowcell, lane) = flowcellLaneFromFastq(path1) + patient = sampleId + gender = 'ZZ' // unused + status = 0 // normal (not tumor) + rgId = "${flowcell}.${sampleId}.${lane}" + result = [patient, gender, status, sampleId, rgId, path1, path2] + fastq.bind(result) + } + }, onComplete: { fastq.close() } + fastq +} + +// Extract gender from Channel as it's only used for CNVs +def extractGenders(channel) { + def genders = [:] + channel = channel.map{ it -> + def idPatient = it[0] + def gender = it[1] + genders[idPatient] = gender + [idPatient] + it[2..-1] + } + [genders, channel] +} + +// Channeling the TSV file containing FASTQ or BAM +// Format is: "subject gender status sample lane fastq1 fastq2" +// or: "subject gender status sample lane bam" +def extractSample(tsvFile) { + Channel.from(tsvFile) + .splitCsv(sep: '\t') + .map { row -> + def idPatient = row[0] + def gender = row[1] + def status = returnStatus(row[2].toInteger()) + def idSample = row[3] + def idRun = row[4] + def file1 = returnFile(row[5]) + def file2 = file("null") + if (hasExtension(file1,"fastq.gz") || hasExtension(file1,"fq.gz")) { + checkNumberOfItem(row, 7) + file2 = returnFile(row[6]) + if (!hasExtension(file2,"fastq.gz") && !hasExtension(file2,"fq.gz")) exit 1, "File: ${file2} has the wrong extension. 
See --help for more information" + } + else if (hasExtension(file1,"bam")) checkNumberOfItem(row, 6) + else "No recognisable extention for input file: ${file1}" + + [idPatient, gender, status, idSample, idRun, file1, file2] + } +} + +// Channeling the TSV file containing Recalibration Tables. +// Format is: "subject gender status sample bam bai recalTables" +def extractRecal(tsvFile) { + Channel.from(tsvFile) + .splitCsv(sep: '\t') + .map { row -> + checkNumberOfItem(row, 7) + def idPatient = row[0] + def gender = row[1] + def status = returnStatus(row[2].toInteger()) + def idSample = row[3] + def bamFile = returnFile(row[4]) + def baiFile = returnFile(row[5]) + def recalTable = returnFile(row[6]) + + if (!hasExtension(bamFile,"bam")) exit 1, "File: ${bamFile} has the wrong extension. See --help for more information" + if (!hasExtension(baiFile,"bai")) exit 1, "File: ${baiFile} has the wrong extension. See --help for more information" + if (!hasExtension(recalTable,"recal.table")) exit 1, "File: ${recalTable} has the wrong extension. 
See --help for more information" + + [ idPatient, gender, status, idSample, bamFile, baiFile, recalTable ] + } +} + +// Check file extension +def hasExtension(it, extension) { + it.toString().toLowerCase().endsWith(extension.toLowerCase()) +} + +// Return file if it exists +def returnFile(it) { + if (!file(it).exists()) exit 1, "Missing file in TSV file: ${it}, see --help for more information" + return file(it) +} + +// Return status [0,1] +// 0 == Normal, 1 == Tumor +def returnStatus(it) { + if (!(it in [0, 1])) exit 1, "Status is not recognized in TSV file: ${it}, see --help for more information" + return it +} From 9bc65d1815265cb75565218196e84d6117021e4b Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 2 May 2019 11:52:52 +0200 Subject: [PATCH 15/28] fix docker owner --- nextflow.config | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 6e7ff7b57d..a43d967601 100644 --- a/nextflow.config +++ b/nextflow.config @@ -58,7 +58,13 @@ profiles { awsbatch { includeConfig 'conf/awsbatch.config' } conda { process.conda = "$baseDir/environment.yml" } debug { process.beforeScript = 'echo $HOSTNAME' } - docker { docker.enabled = true } + docker { + docker { + enabled = true + fixOwnership = true + runOptions = "-u \$(id -u):\$(id -g)" + } + } singularity { singularity.enabled = true } test { includeConfig 'conf/test.config' } } From 806cab59c8d7a1e19c7b8e61f3b221d9f83576d2 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 2 May 2019 13:18:42 +0200 Subject: [PATCH 16/28] update tests --- .travis.yml | 6 +++--- Jenkinsfile | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index 03cf98bb46..7941be71fb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -35,7 +35,7 @@ install: jobs: include: - stage: built - script: git clone --single-branch --branch sarek https://github.com/nf-core/test-datasets.git test-data - script: nextflow run ${TRAVIS_BUILD_DIR}/build.nf -profile 
docker --genome smallGRCh37 --refdir test-data/reference --outdir References + script: git clone --single-branch --branch sarek https://github.com/nf-core/test-datasets.git data + script: nextflow run ${TRAVIS_BUILD_DIR}/build.nf -profile docker --genome smallGRCh37 --refdir data/reference --outdir references --publishDirMode link - stage: test - script: nextflow run ${TRAVIS_BUILD_DIR}/main.nf -profile docker --help + script: nextflow run ${TRAVIS_BUILD_DIR}/main.nf -profile docker --genome smallGRCh37 --igenomes_base references --sample data/testdata/tsv/tiny-multiple.tsv --publishDirMode link diff --git a/Jenkinsfile b/Jenkinsfile index b54e5c7deb..ed45e1923b 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -13,13 +13,13 @@ pipeline { } stage('Build') { steps { - sh "git clone --single-branch --branch sarek https://github.com/nf-core/test-datasets.git test-data" - sh "nextflow run build.nf -profile docker --genome smallGRCh37 --refdir test-data/reference --outdir References" + sh "git clone --single-branch --branch sarek https://github.com/nf-core/test-datasets.git data" + sh "nextflow run build.nf -profile docker --genome smallGRCh37 --refdir data/reference --outdir references --publishDirMode link" } } stage('Test') { steps { - sh "nextflow run main.nf -profile docker --help" + sh "nextflow run main.nf -profile docker --genome smallGRCh37 --igenomes_base references --sample data/testdata/tsv/tiny-multiple.tsv --publishDirMode link" } } } From a58eaa255f59b1321b8eb03ae299350103904054 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 2 May 2019 13:19:04 +0200 Subject: [PATCH 17/28] sort params --- conf/base.config | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/conf/base.config b/conf/base.config index bb54f1e010..14c8f70aef 100644 --- a/conf/base.config +++ b/conf/base.config @@ -17,8 +17,8 @@ process { time = { check_max( 2.h * task.attempt, 'time' ) } errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 
'retry' : 'finish' } - maxRetries = 1 maxErrors = '-1' + maxRetries = 1 // Process-specific resource requirements // TODO nf-core: Customise requirements for specific processes. @@ -27,8 +27,9 @@ process { params { // Defaults only, expecting to be overwritten - max_memory = 128.GB + igenomes_base = 's3://ngi-igenomes/igenomes/' + markdup_java_options = '"-Xms4000m -Xmx7g"' //Established values for markDuplicate memory consumption, see issue PR #689 for details max_cpus = 16 + max_memory = 128.GB max_time = 240.h - igenomes_base = 's3://ngi-igenomes/igenomes/' } From b0fb4b93531fbef8cbe7207939a76f7b50ff9a34 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 2 May 2019 13:19:40 +0200 Subject: [PATCH 18/28] add preprocessing --- main.nf | 359 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 353 insertions(+), 6 deletions(-) diff --git a/main.nf b/main.nf index 2060a2dec6..454d301de0 100644 --- a/main.nf +++ b/main.nf @@ -62,9 +62,12 @@ if (params.genomes && params.genome && !params.genomes.containsKey(params.genome exit 1, "The provided genome '${params.genome}' is not available in the iGenomes file. 
Currently the available genomes are ${params.genomes.keySet().join(", ")}" } +params.noReports = false +params.sampleDir = false +params.sequencing_center = null params.step = 'mapping' +params.targetBED = null params.test = false -params.sampleDir = false params.tools = false stepList = defineStepList() @@ -113,8 +116,8 @@ ch_output_docs = Channel.fromPath("$baseDir/docs/output.md") if (!params.sample && !params.sampleDir) { tsvPaths = [ 'mapping': "${workflow.projectDir}/Sarek-data/testdata/tsv/tiny.tsv", - 'recalibrate': "${params.outDir}/Preprocessing/DuplicateMarked/duplicateMarked.tsv", - 'variantcalling': "${params.outDir}/Preprocessing/Recalibrated/recalibrated.tsv" + 'recalibrate': "${params.outdir}/Preprocessing/DuplicateMarked/duplicateMarked.tsv", + 'variantcalling': "${params.outdir}/Preprocessing/Recalibrated/recalibrated.tsv" ] if (params.test || step != 'mapping') tsvPath = tsvPaths[step] } @@ -144,8 +147,6 @@ ch_output_docs = Channel.fromPath("$baseDir/docs/output.md") if (step == 'recalibrate') (patientGenders, bamFiles) = extractGenders(bamFiles) else (patientGenders, inputFiles) = extractGenders(inputFiles) - - // Header log info log.info nfcoreHeader() def summary = [:] @@ -198,7 +199,7 @@ ${summary.collect { k,v -> "
$k
${v ?: ' ${idRun}.bam + """ + else if (hasExtension(inputFile1,"bam")) + // -K is an hidden option, used to fix the number of reads processed by bwa mem + // Chunk size can affect bwa results, if not specified, the number of threads can change + // which can give not deterministic result. + // cf https://github.com/CCDG/Pipeline-Standardization/blob/master/PipelineStandard.md + // and https://github.com/gatk-workflows/gatk4-data-processing/blob/8ffa26ff4580df4ac3a5aa9e272a4ff6bab44ba2/processing-for-variant-discovery-gatk4.b37.wgs.inputs.json#L29 + """ + gatk --java-options -Xmx${task.memory.toGiga()}g \ + SamToFastq \ + --INPUT=${inputFile1} \ + --FASTQ=/dev/stdout \ + --INTERLEAVE=true \ + --NON_PF=true \ + | \ + bwa mem -K 100000000 -p -R \"${readGroup}\" ${extra} -t ${task.cpus} -M ${genomeFile} \ + /dev/stdin - 2> >(tee ${inputFile1}.bwa.stderr.log >&2) \ + | \ + samtools sort --threads ${task.cpus} -m 2G - > ${idRun}.bam + """ +} + +mappedBam = mappedBam.dump(tag:'Mapped BAM') + +process RunBamQCmapped { + tag {idPatient + "-" + idSample} + + publishDir "${params.outdir}/Reports/${idSample}/bamQC", mode: params.publishDirMode + + input: + set idPatient, status, idSample, idRun, file(bam) from mappedBamForQC + file(targetBED) from Channel.value(params.targetBED ? file(params.targetBED) : "null") + + output: + file("${bam.baseName}") into bamQCmappedReport + + when: !params.noReports + + script: + use_bed = params.targetBED ? 
"-gff ${targetBED}" : '' + """ + qualimap --java-mem-size=${task.memory.toGiga()}G \ + bamqc \ + -bam ${bam} \ + --paint-chromosome-limits \ + --genome-gc-distr HUMAN \ + $use_bed \ + -nt ${task.cpus} \ + -skip-duplicated \ + --skip-dup-mode 0 \ + -outdir ${bam.baseName} \ + -outformat HTML + """ +} + +bamQCmappedReport.dump(tag:'BamQC BAM') + +// Sort bam whether they are standalone or should be merged + +singleBam = Channel.create() +groupedBam = Channel.create() +mappedBam.groupTuple(by:[0,1,2]) + .choice(singleBam, groupedBam) {it[3].size() > 1 ? 1 : 0} +singleBam = singleBam.map { + idPatient, status, idSample, idRun, bam -> + [idPatient, status, idSample, bam] +} +process MergeBams { + tag {idPatient + "-" + idSample} + + input: + set idPatient, status, idSample, idRun, file(bam) from groupedBam + + output: + set idPatient, status, idSample, file("${idSample}.bam") into mergedBam + + when: step == 'mapping' + + script: + """ + samtools merge --threads ${task.cpus} ${idSample}.bam ${bam} + """ +} + +singleBam = singleBam.dump(tag:'Single BAM') +mergedBam = mergedBam.dump(tag:'Merged BAM') +mergedBam = mergedBam.mix(singleBam) +mergedBam = mergedBam.dump(tag:'BAM for MD') + +process MarkDuplicates { + tag {idPatient + "-" + idSample} + + publishDir params.outdir, mode: params.publishDirMode, + saveAs: { + if (it == "${idSample}.bam.metrics") "Reports/${idSample}/MarkDuplicates/${it}" + else "Preprocessing/${idSample}/DuplicateMarked/${it}" + } + + input: + set idPatient, status, idSample, file("${idSample}.bam") from mergedBam + + output: + set idPatient, file("${idSample}_${status}.md.bam"), file("${idSample}_${status}.md.bai") into duplicateMarkedBams + set idPatient, status, idSample, val("${idSample}_${status}.md.bam"), val("${idSample}_${status}.md.bai") into markDuplicatesTSV + file ("${idSample}.bam.metrics") into markDuplicatesReport + + when: step == 'mapping' + + script: + markdup_java_options = task.memory.toGiga() > 8 ? 
params.markdup_java_options : "\"-Xms" + (task.memory.toGiga() / 2 ).trunc() + "g -Xmx" + (task.memory.toGiga() - 1) + "g\"" + """ + gatk --java-options ${markdup_java_options} \ + MarkDuplicates \ + --MAX_RECORDS_IN_RAM 50000 \ + --INPUT ${idSample}.bam \ + --METRICS_FILE ${idSample}.bam.metrics \ + --TMP_DIR . \ + --ASSUME_SORT_ORDER coordinate \ + --CREATE_INDEX true \ + --OUTPUT ${idSample}_${status}.md.bam + """ +} + +// Creating a TSV file to restart from this step +markDuplicatesTSV.map { idPatient, status, idSample, bam, bai -> + gender = patientGenders[idPatient] + "${idPatient}\t${gender}\t${status}\t${idSample}\t${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${bam}\t${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${bai}\n" +}.collectFile( + name: 'duplicateMarked.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/DuplicateMarked" +) + +duplicateMarkedBams = duplicateMarkedBams.map { + idPatient, bam, bai -> + tag = bam.baseName.tokenize('.')[0] + status = tag[-1..-1].toInteger() + idSample = tag.take(tag.length()-2) + [idPatient, status, idSample, bam, bai] +} + +duplicateMarkedBams = duplicateMarkedBams.dump(tag:'MD BAM') + +(mdBam, mdBamToJoin) = duplicateMarkedBams.into(2) + +process CreateRecalibrationTable { + tag {idPatient + "-" + idSample} + + publishDir "${params.outdir}/Preprocessing/${idSample}/DuplicateMarked", mode: params.publishDirMode, overwrite: false + + input: + set idPatient, status, idSample, file(bam), file(bai) from mdBam // realignedBam + set file(genomeFile), file(genomeIndex), file(genomeDict), file(dbsnp), file(dbsnpIndex), file(knownIndels), file(knownIndelsIndex), file(intervals) from Channel.value([ + referenceMap.genomeFile, + referenceMap.genomeIndex, + referenceMap.genomeDict, + referenceMap.dbsnp, + referenceMap.dbsnpIndex, + referenceMap.knownIndels, + referenceMap.knownIndelsIndex, + referenceMap.intervals, + ]) + + output: + set idPatient, status, idSample, 
file("${idSample}.recal.table") into recalibrationTable + set idPatient, status, idSample, val("${idSample}_${status}.md.bam"), val("${idSample}_${status}.md.bai"), val("${idSample}.recal.table") into recalibrationTableTSV + + when: step == 'mapping' + + script: + known = knownIndels.collect{ "--known-sites ${it}" }.join(' ') + """ + gatk --java-options -Xmx${task.memory.toGiga()}g \ + BaseRecalibrator \ + --input ${bam} \ + --output ${idSample}.recal.table \ + --tmp-dir /tmp \ + -R ${genomeFile} \ + -L ${intervals} \ + --known-sites ${dbsnp} \ + ${known} \ + --verbosity INFO + """ +} + +// Create a TSV file to restart from this step +recalibrationTableTSV.map { idPatient, status, idSample, bam, bai, recalTable -> + gender = patientGenders[idPatient] + "${idPatient}\t${gender}\t${status}\t${idSample}\t${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${bam}\t${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${bai}\t${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${recalTable}\n" +}.collectFile( + name: 'duplicateMarked.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/DuplicateMarked" +) + +recalibrationTable = mdBamToJoin.join(recalibrationTable, by:[0,1,2]) + +if (step == 'recalibrate') recalibrationTable = bamFiles + +recalibrationTable = recalibrationTable.dump(tag:'recal.table') + +process RecalibrateBam { + tag {idPatient + "-" + idSample} + + publishDir "${params.outdir}/Preprocessing/${idSample}/Recalibrated", mode: params.publishDirMode + + input: + set idPatient, status, idSample, file(bam), file(bai), file(recalibrationReport) from recalibrationTable + set file(genomeFile), file(genomeIndex), file(genomeDict), file(intervals) from Channel.value([ + referenceMap.genomeFile, + referenceMap.genomeIndex, + referenceMap.genomeDict, + referenceMap.intervals, + ]) + + output: + set idPatient, status, idSample, file("${idSample}.recal.bam"), file("${idSample}.recal.bai") into recalibratedBam, recalibratedBamForStats + set 
idPatient, status, idSample, val("${idSample}.recal.bam"), val("${idSample}.recal.bai") into recalibratedBamTSV + + script: + """ + gatk --java-options -Xmx${task.memory.toGiga()}g \ + ApplyBQSR \ + -R ${genomeFile} \ + --input ${bam} \ + --output ${idSample}.recal.bam \ + -L ${intervals} \ + --create-output-bam-index true \ + --bqsr-recal-file ${recalibrationReport} + """ +} +// Creating a TSV file to restart from this step +recalibratedBamTSV.map { idPatient, status, idSample, bam, bai -> + gender = patientGenders[idPatient] + "${idPatient}\t${gender}\t${status}\t${idSample}\t${params.outdir}/Preprocessing/${idSample}/Recalibrated/${bam}\t${params.outdir}/Preprocessing/${idSample}/Recalibrated/${bai}\n" +}.collectFile( + name: 'recalibrated.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/Recalibrated" +) + +recalibratedBam.dump(tag:'recal.bam') + +// Remove recalTable from Channels to match inputs for Process to avoid: +// WARN: Input tuple does not match input set cardinality declared by process... 
+(bamForBamQC, bamForSamToolsStats) = recalibratedBamForStats.map{ it[0..4] }.into(2) + +process RunSamtoolsStats { + tag {idPatient + "-" + idSample} + + publishDir "${params.outdir}/Reports/${idSample}/SamToolsStats", mode: params.publishDirMode + + input: + set idPatient, status, idSample, file(bam), file(bai) from bamForSamToolsStats + + output: + file ("${bam}.samtools.stats.out") into samtoolsStatsReport + + when: !params.noReports + + script: + """ + samtools stats ${bam} > ${bam}.samtools.stats.out + """ +} + +samtoolsStatsReport.dump(tag:'SAMTools') + +process RunBamQCrecalibrated { + tag {idPatient + "-" + idSample} + + publishDir "${params.outdir}/Reports/${idSample}/bamQC", mode: params.publishDirMode + + input: + set idPatient, status, idSample, file(bam), file(bai) from bamForBamQC + + output: + file("${bam.baseName}") into bamQCrecalibratedReport + + when: !params.noReports + + script: + """ + qualimap --java-mem-size=${task.memory.toGiga()}G \ + bamqc \ + -bam ${bam} \ + --paint-chromosome-limits \ + --genome-gc-distr HUMAN \ + -nt ${task.cpus} \ + -skip-duplicated \ + --skip-dup-mode 0 \ + -outdir ${bam.baseName} \ + -outformat HTML + """ +} +bamQCrecalibratedReport.dump(tag:'BamQC') /* * Completion e-mail notification From ba09b0acd7ea2e77e056f5f0999091f936fe37e3 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 2 May 2019 13:21:47 +0200 Subject: [PATCH 19/28] improve TSV localisation --- main.nf | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/main.nf b/main.nf index 454d301de0..83217d7ce3 100644 --- a/main.nf +++ b/main.nf @@ -116,8 +116,8 @@ ch_output_docs = Channel.fromPath("$baseDir/docs/output.md") if (!params.sample && !params.sampleDir) { tsvPaths = [ 'mapping': "${workflow.projectDir}/Sarek-data/testdata/tsv/tiny.tsv", - 'recalibrate': "${params.outdir}/Preprocessing/DuplicateMarked/duplicateMarked.tsv", - 'variantcalling': "${params.outdir}/Preprocessing/Recalibrated/recalibrated.tsv" + 'recalibrate': 
"${params.outdir}/Preprocessing/TSV/duplicateMarked.tsv", + 'variantcalling': "${params.outdir}/Preprocessing/TSV/recalibrated.tsv" ] if (params.test || step != 'mapping') tsvPath = tsvPaths[step] } @@ -414,7 +414,7 @@ markDuplicatesTSV.map { idPatient, status, idSample, bam, bai -> gender = patientGenders[idPatient] "${idPatient}\t${gender}\t${status}\t${idSample}\t${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${bam}\t${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${bai}\n" }.collectFile( - name: 'duplicateMarked.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/DuplicateMarked" + name: 'duplicateMarked.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV" ) duplicateMarkedBams = duplicateMarkedBams.map { @@ -474,7 +474,7 @@ recalibrationTableTSV.map { idPatient, status, idSample, bam, bai, recalTable -> gender = patientGenders[idPatient] "${idPatient}\t${gender}\t${status}\t${idSample}\t${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${bam}\t${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${bai}\t${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${recalTable}\n" }.collectFile( - name: 'duplicateMarked.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/DuplicateMarked" + name: 'duplicateMarked.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV" ) recalibrationTable = mdBamToJoin.join(recalibrationTable, by:[0,1,2]) @@ -518,7 +518,7 @@ recalibratedBamTSV.map { idPatient, status, idSample, bam, bai -> gender = patientGenders[idPatient] "${idPatient}\t${gender}\t${status}\t${idSample}\t${params.outdir}/Preprocessing/${idSample}/Recalibrated/${bam}\t${params.outdir}/Preprocessing/${idSample}/Recalibrated/${bai}\n" }.collectFile( - name: 'recalibrated.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/Recalibrated" + name: 'recalibrated.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV" ) recalibratedBam.dump(tag:'recal.bam') From 
3d8208abca5959b57c8f082658605d58dd1629b5 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 2 May 2019 14:26:17 +0200 Subject: [PATCH 20/28] improve multiple TSV --- main.nf | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/main.nf b/main.nf index 83217d7ce3..4ac7662e73 100644 --- a/main.nf +++ b/main.nf @@ -389,7 +389,6 @@ process MarkDuplicates { output: set idPatient, file("${idSample}_${status}.md.bam"), file("${idSample}_${status}.md.bai") into duplicateMarkedBams - set idPatient, status, idSample, val("${idSample}_${status}.md.bam"), val("${idSample}_${status}.md.bai") into markDuplicatesTSV file ("${idSample}.bam.metrics") into markDuplicatesReport when: step == 'mapping' @@ -409,14 +408,6 @@ process MarkDuplicates { """ } -// Creating a TSV file to restart from this step -markDuplicatesTSV.map { idPatient, status, idSample, bam, bai -> - gender = patientGenders[idPatient] - "${idPatient}\t${gender}\t${status}\t${idSample}\t${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${bam}\t${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${bai}\n" -}.collectFile( - name: 'duplicateMarked.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV" -) - duplicateMarkedBams = duplicateMarkedBams.map { idPatient, bam, bai -> tag = bam.baseName.tokenize('.')[0] @@ -469,7 +460,8 @@ process CreateRecalibrationTable { """ } -// Create a TSV file to restart from this step +(recalibrationTableTSV, recalibrationTableSampleTSV) = recalibrationTableTSV.into(2) +// Create TSV files to restart from this step recalibrationTableTSV.map { idPatient, status, idSample, bam, bai, recalTable -> gender = patientGenders[idPatient] "${idPatient}\t${gender}\t${status}\t${idSample}\t${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${bam}\t${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${bai}\t${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${recalTable}\n" @@ -477,6 +469,13 @@ 
recalibrationTableTSV.map { idPatient, status, idSample, bam, bai, recalTable -> name: 'duplicateMarked.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV" ) +recalibrationTableSampleTSV + .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { + idPatient, status, idSample, bam, bai, recalTable -> + gender = patientGenders[idPatient] + ["duplicateMarked_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${bam}\t${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${bai}\t${params.outdir}/Preprocessing/${idSample}/DuplicateMarked/${recalTable}\n"] +} + recalibrationTable = mdBamToJoin.join(recalibrationTable, by:[0,1,2]) if (step == 'recalibrate') recalibrationTable = bamFiles @@ -513,6 +512,9 @@ process RecalibrateBam { --bqsr-recal-file ${recalibrationReport} """ } + + +(recalibratedBamTSV, recalibratedBamSampleTSV) = recalibratedBamTSV.into(2) // Creating a TSV file to restart from this step recalibratedBamTSV.map { idPatient, status, idSample, bam, bai -> gender = patientGenders[idPatient] @@ -521,6 +523,13 @@ recalibratedBamTSV.map { idPatient, status, idSample, bam, bai -> name: 'recalibrated.tsv', sort: true, storeDir: "${params.outdir}/Preprocessing/TSV" ) +recalibratedBamSampleTSV + .collectFile(storeDir: "${params.outdir}/Preprocessing/TSV") { + idPatient, status, idSample, bam, bai -> + gender = patientGenders[idPatient] + ["recalibrated_${idSample}.tsv", "${idPatient}\t${gender}\t${status}\t${idSample}\t${params.outdir}/Preprocessing/${idSample}/Recalibrated/${bam}\t${params.outdir}/Preprocessing/${idSample}/Recalibrated/${bai}\n"] +} + recalibratedBam.dump(tag:'recal.bam') // Remove recalTable from Channels to match inputs for Process to avoid: From 0b777d458fce6bab060bb37f68bf3a972c15d443 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 2 May 2019 14:43:51 +0200 Subject: [PATCH 21/28] add parrallelized BaseRecalibrator --- main.nf | 110 
+++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 98 insertions(+), 12 deletions(-) diff --git a/main.nf b/main.nf index 4ac7662e73..8631369549 100644 --- a/main.nf +++ b/main.nf @@ -63,6 +63,7 @@ if (params.genomes && params.genome && !params.genomes.containsKey(params.genome } params.noReports = false +params.nucleotidesPerSecond = 1000.0 params.sampleDir = false params.sequencing_center = null params.step = 'mapping' @@ -420,14 +421,76 @@ duplicateMarkedBams = duplicateMarkedBams.dump(tag:'MD BAM') (mdBam, mdBamToJoin) = duplicateMarkedBams.into(2) +process CreateIntervalBeds { + tag {intervals.fileName} + + input: + file(intervals) from Channel.value(referenceMap.intervals) + + output: + file '*.bed' into bedIntervals mode flatten + + script: + // If the interval file is BED format, the fifth column is interpreted to + // contain runtime estimates, which is then used to combine short-running jobs + if (hasExtension(intervals,"bed")) + """ + awk -vFS="\t" '{ + t = \$5 # runtime estimate + if (t == "") { + # no runtime estimate in this row, assume default value + t = (\$3 - \$2) / ${params.nucleotidesPerSecond} + } + if (name == "" || (chunk > 600 && (chunk + t) > longest * 1.05)) { + # start a new chunk + name = sprintf("%s_%d-%d.bed", \$1, \$2+1, \$3) + chunk = 0 + longest = 0 + } + if (t > longest) + longest = t + chunk += t + print \$0 > name + }' ${intervals} + """ + else + """ + awk -vFS="[:-]" '{ + name = sprintf("%s_%d-%d", \$1, \$2, \$3); + printf("%s\\t%d\\t%d\\n", \$1, \$2-1, \$3) > name ".bed" + }' ${intervals} + """ +} + +bedIntervals = bedIntervals + .map { intervalFile -> + def duration = 0.0 + for (line in intervalFile.readLines()) { + final fields = line.split('\t') + if (fields.size() >= 5) duration += fields[4].toFloat() + else { + start = fields[1].toInteger() + end = fields[2].toInteger() + duration += (end - start) / params.nucleotidesPerSecond + } + } + [duration, intervalFile] + }.toSortedList({ a, b -> b[0] <=> a[0] 
}) + .flatten().collate(2) + .map{duration, intervalFile -> intervalFile} + +bedIntervals = bedIntervals.dump(tag:'bedintervals') + +bamForBaseRecalibrator = mdBam.combine(bedIntervals) + process CreateRecalibrationTable { - tag {idPatient + "-" + idSample} + tag {idPatient + "-" + idSample + "-" + intervalBed} publishDir "${params.outdir}/Preprocessing/${idSample}/DuplicateMarked", mode: params.publishDirMode, overwrite: false input: - set idPatient, status, idSample, file(bam), file(bai) from mdBam // realignedBam - set file(genomeFile), file(genomeIndex), file(genomeDict), file(dbsnp), file(dbsnpIndex), file(knownIndels), file(knownIndelsIndex), file(intervals) from Channel.value([ + set idPatient, status, idSample, file(bam), file(bai), file(intervalBed) from bamForBaseRecalibrator + set file(genomeFile), file(genomeIndex), file(genomeDict), file(dbsnp), file(dbsnpIndex), file(knownIndels), file(knownIndelsIndex) from Channel.value([ referenceMap.genomeFile, referenceMap.genomeIndex, referenceMap.genomeDict, @@ -435,32 +498,56 @@ process CreateRecalibrationTable { referenceMap.dbsnpIndex, referenceMap.knownIndels, referenceMap.knownIndelsIndex, - referenceMap.intervals, ]) output: - set idPatient, status, idSample, file("${idSample}.recal.table") into recalibrationTable - set idPatient, status, idSample, val("${idSample}_${status}.md.bam"), val("${idSample}_${status}.md.bai"), val("${idSample}.recal.table") into recalibrationTableTSV + set idPatient, status, idSample, file("${intervalBed.baseName}_${idSample}.recal.table") into recalIntervals when: step == 'mapping' script: known = knownIndels.collect{ "--known-sites ${it}" }.join(' ') + // --use-original-qualities ??? 
""" gatk --java-options -Xmx${task.memory.toGiga()}g \ BaseRecalibrator \ - --input ${bam} \ - --output ${idSample}.recal.table \ + -I ${bam} \ + -O ${intervalBed.baseName}_${idSample}.recal.table \ --tmp-dir /tmp \ -R ${genomeFile} \ - -L ${intervals} \ + -L ${intervalBed} \ --known-sites ${dbsnp} \ ${known} \ --verbosity INFO """ } -(recalibrationTableTSV, recalibrationTableSampleTSV) = recalibrationTableTSV.into(2) +recalIntervals = recalIntervals.groupTuple(by:[0,1,2]) + +process GatherBQSRReports { + tag {idPatient + "-" + idSample} + + publishDir "${params.outdir}/Preprocessing/${idSample}/DuplicateMarked", mode: params.publishDirMode, overwrite: false + + input: + set idPatient, status, idSample, file(recalTable) from recalIntervals + + output: + set idPatient, status, idSample, file("${idSample}.recal.table") into recalibrationTable + set idPatient, status, idSample, val("${idSample}_${status}.md.bam"), val("${idSample}_${status}.md.bai"), val("${idSample}.recal.table") into (recalibrationTableTSV, recalibrationTableSampleTSV) + + when: step == 'mapping' + + script: + recal = recalTable.collect{ "-I ${it}" }.join(' ') + """ + gatk --java-options -Xmx${task.memory.toGiga()}g \ + GatherBQSRReports \ + ${recal} \ + -O ${idSample}.recal.table \ + """ +} + // Create TSV files to restart from this step recalibrationTableTSV.map { idPatient, status, idSample, bam, bai, recalTable -> gender = patientGenders[idPatient] @@ -498,7 +585,7 @@ process RecalibrateBam { output: set idPatient, status, idSample, file("${idSample}.recal.bam"), file("${idSample}.recal.bai") into recalibratedBam, recalibratedBamForStats - set idPatient, status, idSample, val("${idSample}.recal.bam"), val("${idSample}.recal.bai") into recalibratedBamTSV + set idPatient, status, idSample, val("${idSample}.recal.bam"), val("${idSample}.recal.bai") into (recalibratedBamTSV, recalibratedBamSampleTSV) script: """ @@ -514,7 +601,6 @@ process RecalibrateBam { } -(recalibratedBamTSV, 
recalibratedBamSampleTSV) = recalibratedBamTSV.into(2) // Creating a TSV file to restart from this step recalibratedBamTSV.map { idPatient, status, idSample, bam, bai -> gender = patientGenders[idPatient] From 76bbfaf75c5e017013418f1d67a80b83887180ce Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 2 May 2019 14:49:35 +0200 Subject: [PATCH 22/28] smaller test on travisCI --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 7941be71fb..3c1bfc69a3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -38,4 +38,4 @@ jobs: script: git clone --single-branch --branch sarek https://github.com/nf-core/test-datasets.git data script: nextflow run ${TRAVIS_BUILD_DIR}/build.nf -profile docker --genome smallGRCh37 --refdir data/reference --outdir references --publishDirMode link - stage: test - script: nextflow run ${TRAVIS_BUILD_DIR}/main.nf -profile docker --genome smallGRCh37 --igenomes_base references --sample data/testdata/tsv/tiny-multiple.tsv --publishDirMode link + script: nextflow run ${TRAVIS_BUILD_DIR}/main.nf -profile docker --genome smallGRCh37 --igenomes_base references --sampleDir data/testdata/tiny/normal --publishDirMode link From aa8d70fdff6b015fefaa4900bbf148e304561bcb Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 2 May 2019 14:54:58 +0200 Subject: [PATCH 23/28] try to fix path to data --- .travis.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 3c1bfc69a3..cded9b6e5e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -35,7 +35,7 @@ install: jobs: include: - stage: built - script: git clone --single-branch --branch sarek https://github.com/nf-core/test-datasets.git data - script: nextflow run ${TRAVIS_BUILD_DIR}/build.nf -profile docker --genome smallGRCh37 --refdir data/reference --outdir references --publishDirMode link + script: git clone --single-branch --branch sarek https://github.com/nf-core/test-datasets.git ${TRAVIS_BUILD_DIR}/data + 
script: nextflow run ${TRAVIS_BUILD_DIR}/build.nf -profile docker --genome smallGRCh37 --refdir ${TRAVIS_BUILD_DIR}/data/reference --outdir ${TRAVIS_BUILD_DIR}/references --publishDirMode link - stage: test - script: nextflow run ${TRAVIS_BUILD_DIR}/main.nf -profile docker --genome smallGRCh37 --igenomes_base references --sampleDir data/testdata/tiny/normal --publishDirMode link + script: nextflow run ${TRAVIS_BUILD_DIR}/main.nf -profile docker --genome smallGRCh37 --igenomes_base ${TRAVIS_BUILD_DIR}/references --sampleDir ${TRAVIS_BUILD_DIR}/data/testdata/tiny/normal --publishDirMode link From af594adcdf38e82ab0dcd1b61481dab8ed1ae288 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 2 May 2019 15:05:31 +0200 Subject: [PATCH 24/28] include building reference when testing... --- .travis.yml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/.travis.yml b/.travis.yml index cded9b6e5e..61560bab33 100644 --- a/.travis.yml +++ b/.travis.yml @@ -34,8 +34,7 @@ install: jobs: include: - - stage: built - script: git clone --single-branch --branch sarek https://github.com/nf-core/test-datasets.git ${TRAVIS_BUILD_DIR}/data - script: nextflow run ${TRAVIS_BUILD_DIR}/build.nf -profile docker --genome smallGRCh37 --refdir ${TRAVIS_BUILD_DIR}/data/reference --outdir ${TRAVIS_BUILD_DIR}/references --publishDirMode link - - stage: test - script: nextflow run ${TRAVIS_BUILD_DIR}/main.nf -profile docker --genome smallGRCh37 --igenomes_base ${TRAVIS_BUILD_DIR}/references --sampleDir ${TRAVIS_BUILD_DIR}/data/testdata/tiny/normal --publishDirMode link + - stage: germline + script: git clone --single-branch --branch sarek https://github.com/nf-core/test-datasets.git data + script: nextflow run ${TRAVIS_BUILD_DIR}/build.nf -profile docker --genome smallGRCh37 --refdir data/reference --outdir references --publishDirMode link + script: nextflow run ${TRAVIS_BUILD_DIR}/main.nf -profile docker --genome smallGRCh37 --igenomes_base references --sampleDir 
data/testdata/tiny/normal --publishDirMode link From f58d2555d6f1aeefbe3676fd02f8ed6366b1b175 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 2 May 2019 15:11:53 +0200 Subject: [PATCH 25/28] reorganize tests --- .travis.yml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index 61560bab33..20c7432682 100644 --- a/.travis.yml +++ b/.travis.yml @@ -32,9 +32,7 @@ install: # Reset - mkdir ${TRAVIS_BUILD_DIR}/tests && cd ${TRAVIS_BUILD_DIR}/tests -jobs: - include: - - stage: germline - script: git clone --single-branch --branch sarek https://github.com/nf-core/test-datasets.git data - script: nextflow run ${TRAVIS_BUILD_DIR}/build.nf -profile docker --genome smallGRCh37 --refdir data/reference --outdir references --publishDirMode link - script: nextflow run ${TRAVIS_BUILD_DIR}/main.nf -profile docker --genome smallGRCh37 --igenomes_base references --sampleDir data/testdata/tiny/normal --publishDirMode link +script: + - git clone --single-branch --branch sarek https://github.com/nf-core/test-datasets.git data + - nextflow run ${TRAVIS_BUILD_DIR}/build.nf -profile docker --genome smallGRCh37 --refdir data/reference --outdir references --publishDirMode link + - nextflow run ${TRAVIS_BUILD_DIR}/main.nf -profile docker --genome smallGRCh37 --igenomes_base references --sampleDir data/testdata/tiny/normal --publishDirMode link From 06782292ef92e3d987cd36c621ec15020df33473 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 2 May 2019 15:20:26 +0200 Subject: [PATCH 26/28] reduce memory and cpus --- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 20c7432682..01d73056b2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -34,5 +34,5 @@ install: script: - git clone --single-branch --branch sarek https://github.com/nf-core/test-datasets.git data - - nextflow run ${TRAVIS_BUILD_DIR}/build.nf -profile docker --genome smallGRCh37 --refdir data/reference --outdir 
references --publishDirMode link - - nextflow run ${TRAVIS_BUILD_DIR}/main.nf -profile docker --genome smallGRCh37 --igenomes_base references --sampleDir data/testdata/tiny/normal --publishDirMode link + - nextflow run ${TRAVIS_BUILD_DIR}/build.nf -profile docker --genome smallGRCh37 --refdir data/reference --outdir references --publishDirMode link --max_memory 7.GB --max_cpus 2 + - nextflow run ${TRAVIS_BUILD_DIR}/main.nf -profile docker --genome smallGRCh37 --igenomes_base references --sampleDir data/testdata/tiny/normal --publishDirMode link --max_memory 7.GB --max_cpus 2 From 23a8cb072b101fe1e7d6066da3d815f0062c5b46 Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 2 May 2019 15:30:57 +0200 Subject: [PATCH 27/28] add tests --- Jenkinsfile | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index ed45e1923b..7ba61ecfee 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -17,7 +17,12 @@ pipeline { sh "nextflow run build.nf -profile docker --genome smallGRCh37 --refdir data/reference --outdir references --publishDirMode link" } } - stage('Test') { + stage('SampleDir') { + steps { + sh "nextflow run main.nf -profile docker --genome smallGRCh37 --igenomes_base references --sampleDir data/testdata/tiny/normal --publishDirMode link" + } + } + stage('Multiple') { steps { sh "nextflow run main.nf -profile docker --genome smallGRCh37 --igenomes_base references --sample data/testdata/tsv/tiny-multiple.tsv --publishDirMode link" } From 5be316378b22285a11641b299997deebe669146c Mon Sep 17 00:00:00 2001 From: MaxUlysse Date: Thu, 2 May 2019 15:31:16 +0200 Subject: [PATCH 28/28] add flowcellLaneFromFastq function --- main.nf | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/main.nf b/main.nf index 8631369549..a554c41a65 100644 --- a/main.nf +++ b/main.nf @@ -65,6 +65,7 @@ if (params.genomes && params.genome && !params.genomes.containsKey(params.genome params.noReports = false params.nucleotidesPerSecond = 
1000.0 params.sampleDir = false +params.sample = false params.sequencing_center = null params.step = 'mapping' params.targetBED = null @@ -1030,6 +1031,34 @@ def extractRecal(tsvFile) { } } +// Parse first line of a FASTQ file, return the flowcell id and lane number. +def flowcellLaneFromFastq(path) { + // expected format: + // xx:yy:FLOWCELLID:LANE:... (seven fields) + // or + // FLOWCELLID:LANE:xx:... (five fields) + InputStream fileStream = new FileInputStream(path.toFile()) + InputStream gzipStream = new java.util.zip.GZIPInputStream(fileStream) + Reader decoder = new InputStreamReader(gzipStream, 'ASCII') + BufferedReader buffered = new BufferedReader(decoder) + def line = buffered.readLine() + assert line.startsWith('@') + line = line.substring(1) + def fields = line.split(' ')[0].split(':') + String fcid + int lane + if (fields.size() == 7) { + // CASAVA 1.8+ format + fcid = fields[2] + lane = fields[3].toInteger() + } + else if (fields.size() == 5) { + fcid = fields[0] + lane = fields[1].toInteger() + } + [fcid, lane] +} + // Check file extension def hasExtension(it, extension) { it.toString().toLowerCase().endsWith(extension.toLowerCase())