Docs #11

Merged: 22 commits, Jun 3, 2019
34 changes: 34 additions & 0 deletions .circleci/config.yml
@@ -48,6 +48,22 @@ jobs:
echo "$DOCKERHUB_PASS" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin
docker push nfcore/sareksnpeff:dev.${GENOME}

snpeffcanfam3_1:
docker:
- image: circleci/buildpack-deps:stretch
environment:
GENOME: CanFam3.1
SNPEFF_CACHE_VERSION: 86
steps:
- checkout
- setup_remote_docker
- run:
command: docker build -t nfcore/sareksnpeff:dev.${GENOME} containers/snpeff/. --build-arg GENOME=${GENOME} --build-arg SNPEFF_CACHE_VERSION=${SNPEFF_CACHE_VERSION}
- run:
command: |
echo "$DOCKERHUB_PASS" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin
docker push nfcore/sareksnpeff:dev.${GENOME}

vepgrch37:
docker:
- image: circleci/buildpack-deps:stretch
@@ -96,13 +112,31 @@ jobs:
- run:
command: echo "$DOCKERHUB_PASS" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin ; docker push nfcore/sarekvep:dev.${GENOME}

vepcanfam3_1:
docker:
- image: circleci/buildpack-deps:stretch
environment:
GENOME: CanFam3.1
SPECIES: canis_familiaris
VEP_VERSION: 95
steps:
- checkout
- setup_remote_docker
- run:
command: docker build -t nfcore/sarekvep:dev.${GENOME} containers/vep/. --build-arg GENOME=${GENOME} --build-arg SPECIES=${SPECIES} --build-arg VEP_VERSION=${VEP_VERSION}
no_output_timeout: 1h
- run:
command: echo "$DOCKERHUB_PASS" | docker login -u "$DOCKERHUB_USERNAME" --password-stdin ; docker push nfcore/sarekvep:dev.${GENOME}

workflows:
version: 2
build:
jobs:
- snpeffcanfam3_1
- snpeffgrch37
- snpeffgrch38
- snpeffgrcm38
- vepcanfam3_1
- vepgrch37
- vepgrch38
- vepgrcm38
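The new CircleCI jobs above parameterize a single Dockerfile with `--build-arg` values from the job's `environment` block, so each genome gets its own image tag. A minimal shell rehearsal of that tagging scheme (the `docker build` line is only echoed here, not executed):

```shell
# Mirror the snpeffcanfam3_1 job's environment block.
GENOME=CanFam3.1
SNPEFF_CACHE_VERSION=86

# The image tag is derived from the GENOME variable, as in the config.
TAG="nfcore/sareksnpeff:dev.${GENOME}"
echo "$TAG"

# Echo (rather than run) the build command the CI job would execute.
echo "docker build -t ${TAG} containers/snpeff/. --build-arg GENOME=${GENOME} --build-arg SNPEFF_CACHE_VERSION=${SNPEFF_CACHE_VERSION}"
```

Because the tag is computed from `${GENOME}`, adding another genome is just another job with a different `environment` block, which is exactly what this diff does for CanFam3.1.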
6 changes: 3 additions & 3 deletions .travis.yml
@@ -27,7 +27,7 @@ before_install:
# PRs to master are only ok if coming from dev branch
- '[ $TRAVIS_PULL_REQUEST = "false" ] || [ $TRAVIS_BRANCH != "master" ] || ([ $TRAVIS_PULL_REQUEST_SLUG = $TRAVIS_REPO_SLUG ] && [ $TRAVIS_PULL_REQUEST_BRANCH = "dev" ])'
# Pull the docker image first so the test doesn't wait for this
- "travis_retry ./bin/download_docker.sh --test $TEST"
- "travis_retry ./scripts/download_docker.sh --test $TEST"

install:
# Install Nextflow
@@ -39,8 +39,8 @@ install:

# Build references if needed
before_script:
- "${TRAVIS_BUILD_DIR}/bin/build_reference.sh --test $TEST --verbose"
- "${TRAVIS_BUILD_DIR}/scripts/build_reference.sh --test $TEST --verbose"

# Actual tests
script:
- "${TRAVIS_BUILD_DIR}/bin/run_tests.sh --test $TEST --verbose"
- "${TRAVIS_BUILD_DIR}/scripts/run_tests.sh --test $TEST --verbose"
14 changes: 7 additions & 7 deletions Jenkinsfile
@@ -8,41 +8,41 @@ pipeline {
stages {
stage('Docker setup') {
steps {
sh "./bin/download_docker.sh"
sh "./scripts/download_docker.sh"
}
}
stage('Build references') {
steps {
sh "rm -rf references/"
sh "./bin/build_reference.sh"
sh "./scripts/build_reference.sh"
}
}
stage('Germline') {
steps {
sh "rm -rf data/"
sh "git clone --single-branch --branch sarek https://github.com/nf-core/test-datasets.git data"
sh "./bin/run_tests.sh --test GERMLINE"
sh "./scripts/run_tests.sh --test GERMLINE"
sh "rm -rf data/"
}
}
stage('Somatic') {
steps {
sh "./bin/run_tests.sh --test SOMATIC"
sh "./scripts/run_tests.sh --test SOMATIC"
}
}
stage('Targeted') {
steps {
sh "./bin/run_tests.sh --test TARGETED"
sh "./scripts/run_tests.sh --test TARGETED"
}
}
stage('Annotation') {
steps {
sh "./bin/run_tests.sh --test ANNOTATEALL"
sh "./scripts/run_tests.sh --test ANNOTATEALL"
}
}
stage('Multiple') {
steps {
sh "./bin/run_tests.sh --test MULTIPLE"
sh "./scripts/run_tests.sh --test MULTIPLE"
}
}
}
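The Jenkins stages above run the same test script once per test profile, only varying the `--test` flag. The equivalent matrix as a plain shell loop (commands are echoed here rather than executed, since the real script needs the pipeline's data and containers):

```shell
# Iterate over the test profiles the Jenkinsfile stages cover,
# using the new scripts/ location from this diff.
for TEST in GERMLINE SOMATIC TARGETED ANNOTATEALL MULTIPLE; do
  echo "./scripts/run_tests.sh --test $TEST"
done
```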
14 changes: 5 additions & 9 deletions README.md
@@ -1,7 +1,7 @@
# [![Sarek](docs/images/Sarek_logo.png "Sarek")](https://sarek.scilifelab.se/)
[![nf-core](docs/images/nf-core_logo.png "Sarek")](https://nf-co.re/)

**An open-source analysis pipeline to detect germline or somatic variants from whole genome or targeted sequencing**.
**An open-source analysis pipeline to detect germline or somatic variants from whole genome or targeted sequencing**

> :warning: This pipeline is a work in progress being ported to nf-core from [SciLifeLab/Sarek](https://github.com/SciLifeLab/Sarek/)

@@ -21,15 +21,15 @@
<img align="right" title="CAW" src="/docs/images/CAW_logo.png">

Previously known as the Cancer Analysis Workflow (CAW),
Sarek is a workflow designed to run analyses on whole genome or targeted sequencing data from regular samples or tumour / normal pairs and could including additional relapses.
Sarek is a workflow designed to run analyses on whole genome or targeted sequencing data from regular samples or tumour / normal pairs and could include additional relapses.

It's built using [Nextflow](https://www.nextflow.io),
a domain specific language for workflow building,
across multiple compute infrastructures in a very portable manner.
Software dependencies are handled using [Conda](https://conda.io/), [Docker](https://www.docker.com) or [Singularity](https://www.sylabs.io/singularity/) - environment/container technologies that provide excellent reproducibility and ease of use.
This makes installation trivial and results highly reproducible.

It is listed on the [Elixir - Tools and Data Services Registry](https://bio.tools/Sarek), [Dockstore](https://dockstore.org/workflows/github.com/SciLifeLab/Sarek/) and [omicX - Bioinformatics tools](https://omictools.com/sarek-tool).
It's listed on the [Elixir - Tools and Data Services Registry](https://bio.tools/Sarek), [Dockstore](https://dockstore.org/workflows/github.com/SciLifeLab/Sarek/) and [omicX - Bioinformatics tools](https://omictools.com/sarek-tool).

## Documentation
The nf-core/sarek pipeline comes with documentation about the pipeline, found in the `docs/` directory:
@@ -41,17 +41,13 @@ The nf-core/sarek pipeline comes with documentation about the pipeline, found in
* [Reference genomes](https://nf-co.re/usage/reference_genomes)
* [Extra documentation on reference](docs/reference.md)
3. [Running the pipeline](docs/usage.md)
* [Examples](docs/use_cases.md)
* [Input files documentation](docs/input.md)
* [Extra documentation on variant calling](docs/variantcalling.md)
* [Documentation about containers](docs/containers.md)
* [Extra documentation for targeted sequencing](docs/targetseq.md)

* [Intervals documentation](docs/INTERVALS.md)
* [Command line parameters](docs/PARAMETERS.md)
* [Examples](docs/USE_CASES.md)
* [Processes documentation](docs/PROCESS.md)
4. [Output and how to interpret the results](docs/output.md)
* [Complementary information about ASCAT](docs/ASCAT.md)
* [Complementary information about ASCAT](docs/ascat.md)
* [Extra documentation on annotation](docs/annotation.md)
5. [Troubleshooting](https://nf-co.re/usage/troubleshooting)

93 changes: 93 additions & 0 deletions bin/convertAlleleCounts.r
@@ -0,0 +1,93 @@
#!/usr/bin/env Rscript
# Description:
# R-script for converting output from AlleleCount to BAF and LogR values.
#
# Input:
# AlleleCounter output file for tumor and normal samples
# The first line should contain a header describing the data
# The following columns and headers should be present:
# CHR POS Count_A Count_C Count_G Count_T Good_depth
#
# Output:
# BAF and LogR tables (tab delimited text files)
################################################################################

##First read in the arguments listed at the command line
args = commandArgs(trailingOnly=TRUE)

## args is now a list of character vectors
## First check to see if arguments are passed.
if(length(args)<5){
stop("No input files supplied\n\nUsage:\nRscript convertAlleleCounts.r tumorid tumorac normalid normalac gender\nWhere:\ntumorid - id of tumor sample\ntumorac - output from AlleleCount for the tumor\nnormalid - id of normal sample\nnormalac - output from AlleleCount for the normal\ngender - XX or XY\n\n")
} else{
tumorid = args[1]
tumorac = args[2]
normalid = args[3]
normalac = args[4]
gender = args[5]
}

tumorcounts = read.table(tumorac, header=F, sep="\t")
normalcounts = read.table(normalac, header=F, sep="\t")

SNPpos = matrix(nrow = dim(normalcounts)[1],ncol = 2)

rownames(SNPpos) = paste("snp",1:dim(SNPpos)[1],sep="")

#Change rownames to "chr_pos" instead, such as 1_44552
#This does not exactly work:
#rownames(SNPpos) = apply(cbind(tumorcounts[,1], tumorcounts[,2]), 1, paste, collapse="_")
#This is for compatibility with gc correction file

colnames(SNPpos) = c("Chr","Position")
SNPpos[,1] = as.vector(normalcounts[,1])
SNPpos[,2] = normalcounts[,2]

#Calculate BAF
Tumor_BAF = matrix(nrow = dim(normalcounts)[1],ncol = 1)
rownames(Tumor_BAF) = rownames(SNPpos)
colnames(Tumor_BAF) = c(tumorid)
acgt = tumorcounts[,c(3:6)]
acgts = t(apply(acgt,1,sort))
Tumor_BAF[,1] = acgts[,4]/(acgts[,3]+acgts[,4])
Tumor_BAF[,1] = ifelse(runif(length(Tumor_BAF[,1]))<0.5,Tumor_BAF[,1],1-Tumor_BAF[,1])
Tumor_BAF[is.nan(Tumor_BAF)]=NA

Germline_BAF = matrix(nrow = dim(normalcounts)[1],ncol = 1)
rownames(Germline_BAF) = rownames(SNPpos)
colnames(Germline_BAF) = c(normalid)
acgt = normalcounts[,c(3:6)]
acgts = t(apply(acgt,1,sort))
Germline_BAF[,1] = acgts[,4]/(acgts[,3]+acgts[,4])
Germline_BAF[,1] = ifelse(runif(length(Germline_BAF[,1]))<0.5,Germline_BAF[,1],1-Germline_BAF[,1])
Germline_BAF[is.nan(Germline_BAF)]=NA


Tumor_LogR = matrix(nrow = dim(normalcounts)[1],ncol = 1)
Germline_LogR = matrix(nrow = dim(normalcounts)[1],ncol = 1)
rownames(Tumor_LogR) = rownames(SNPpos)
colnames(Tumor_LogR) = c(tumorid)
rownames(Germline_LogR) = rownames(SNPpos)
colnames(Germline_LogR) = c(normalid)
Tumor_LogR[,1] = log(tumorcounts[,7]/normalcounts[,7],2)
Germline_LogR[,1] = 0
Tumor_LogR[is.infinite(Tumor_LogR)]=NA
if(gender=="XY") {
Tumor_LogR[SNPpos[,1]=="X",1] = Tumor_LogR[SNPpos[,1]=="X",1]-1
Germline_LogR[SNPpos[,1]=="X",1] = Germline_LogR[SNPpos[,1]=="X",1]-1
}
Tumor_LogR[,1] = Tumor_LogR[,1] - median(Tumor_LogR[,1],na.rm=T)
# set regions with 0 reads in tumor and normal to a LogR of 0.
Tumor_LogR[is.na(Tumor_LogR[,1]),1] = 0

# limit the number of digits:
Tumor_LogR = round(Tumor_LogR,4)
Tumor_BAF = round(Tumor_BAF,4)
Germline_LogR = round(Germline_LogR,4)
Germline_BAF = round(Germline_BAF,4)

# write output to files
write.table(cbind(SNPpos,Tumor_LogR),paste(tumorid,".LogR",sep=""),sep="\t",row.names=T,col.names=NA,quote=F)
write.table(cbind(SNPpos,Tumor_BAF),paste(tumorid,".BAF",sep=""),sep="\t",row.names=T,col.names=NA,quote=F)
write.table(cbind(SNPpos,Germline_LogR),paste(normalid,".LogR",sep=""),sep="\t",row.names=T,col.names=NA,quote=F)
write.table(cbind(SNPpos,Germline_BAF),paste(normalid,".BAF",sep=""),sep="\t",row.names=T,col.names=NA,quote=F)
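The core arithmetic of this R script is compact: BAF is the larger of the two most frequent allele counts divided by their sum (randomly mirrored around 0.5), and tumor LogR is the median-centred log2 ratio of tumor to normal depth. A hedged Python re-implementation of just that arithmetic (function names and the simplified median are my own, not part of the pipeline):

```python
import math
import random

def baf(counts):
    """B-allele fraction from per-allele counts (Count_A..Count_T),
    mirroring the R script: top count over the sum of the top two,
    randomly flipped around 0.5; NaN where there is no coverage."""
    top2 = sorted(counts)[-2:]          # two largest allele counts
    denom = top2[0] + top2[1]
    if denom == 0:
        return float("nan")
    b = top2[1] / denom
    # the R script mirrors BAF around 0.5 at random for symmetry
    return b if random.random() < 0.5 else 1 - b

def tumor_logr(tumor_depths, normal_depths):
    """Median-centred log2(tumor/normal) depth ratio, as in the R script;
    positions with zero depth become 0 after centring."""
    raw = [math.log2(t / n) if t > 0 and n > 0 else None
           for t, n in zip(tumor_depths, normal_depths)]
    finite = sorted(v for v in raw if v is not None)
    mid = finite[len(finite) // 2]      # simple median (odd-length lists)
    return [round(v - mid, 4) if v is not None else 0.0 for v in raw]
```

For example, `baf([10, 0, 10, 0])` is 0.5 regardless of the random flip, and depths `[2, 4, 8]` against a flat normal of 2 centre to `[-1.0, 0.0, 1.0]`.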
58 changes: 58 additions & 0 deletions bin/run_ascat.r
@@ -0,0 +1,58 @@
#!/usr/bin/env Rscript
args = commandArgs(trailingOnly=TRUE)
if(length(args)<7){
stop("No input files supplied\n\nUsage:\nRscript run_ascat.r tumor_baf tumor_logr normal_baf normal_logr tumor_sample_name baseDir gcfile\n\n")
} else{
tumorbaf = args[1]
tumorlogr = args[2]
normalbaf = args[3]
normallogr = args[4]
tumorname = args[5]
baseDir = args[6]
gcfile = args[7]
}

source(paste(baseDir,"/scripts/ascat.R", sep=""))

if(!require(RColorBrewer)){
source("http://bioconductor.org/biocLite.R")
biocLite("RColorBrewer", suppressUpdates=TRUE, lib=paste(baseDir, "/scripts", sep=""))
library(RColorBrewer)
}
options(bitmapType='cairo')

#Load the data
ascat.bc <- ascat.loadData(Tumor_LogR_file=tumorlogr, Tumor_BAF_file=tumorbaf, Germline_LogR_file=normallogr, Germline_BAF_file=normalbaf)

#GC wave correction
ascat.bc = ascat.GCcorrect(ascat.bc, gcfile)

#Plot the raw data
ascat.plotRawData(ascat.bc)

#Segment the data
ascat.bc <- ascat.aspcf(ascat.bc)

#Plot the segmented data
ascat.plotSegmentedData(ascat.bc)

#Run ASCAT to fit every tumor to a model, inferring ploidy, normal cell contamination, and discrete copy numbers
ascat.output <- ascat.runAscat(ascat.bc, gamma=1)

#Write out segmented regions (including regions with one copy of each allele)
#write.table(ascat.output$segments, file=paste(tumorname, ".segments.txt", sep=""), sep="\t", quote=F, row.names=F)

#Write out CNVs in bed format
cnvs=ascat.output$segments[2:6]
write.table(cnvs, file=paste(tumorname,".cnvs.txt",sep=""), sep="\t", quote=F, row.names=F, col.names=T)

#Write out purity and ploidy info
summary <- tryCatch({
matrix(c(ascat.output$aberrantcellfraction, ascat.output$ploidy), ncol=2, byrow=TRUE)}, error = function(err) {
# error handler picks up where error was generated
print(paste("Could not find optimal solution: ",err))
return(matrix(c(0,0),nrow=1,ncol=2,byrow = TRUE))
}
)
colnames(summary) <- c("AberrantCellFraction","Ploidy")
write.table(summary, file=paste(tumorname,".purityploidy.txt",sep=""), sep="\t", quote=F, row.names=F, col.names=T)
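The `tryCatch` at the end of this script guards against ASCAT failing to find an optimal solution: on error it logs the problem and falls back to a zero-filled purity/ploidy row so downstream steps still get a well-formed table. The same guard pattern sketched in Python (the function name and dict keys here are illustrative, not the pipeline's API):

```python
def purity_ploidy_summary(get_output):
    """Return (aberrant_cell_fraction, ploidy) from an ASCAT-like result,
    or (0.0, 0.0) with a logged message when no optimal solution exists,
    mirroring the R tryCatch fallback."""
    try:
        out = get_output()
        return (out["aberrantcellfraction"], out["ploidy"])
    except (KeyError, RuntimeError) as err:
        # error handler picks up where the error was generated
        print(f"Could not find optimal solution: {err}")
        return (0.0, 0.0)
```

The zero-row fallback keeps the output file's schema stable, which matters when many samples are summarized together.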
44 changes: 44 additions & 0 deletions docs/abstracts/2016-09-KICR.md
@@ -0,0 +1,44 @@
# The XVth KICancer Retreat 2016

## Cancer Analysis Workflow Of Tumor/Normal Pairs At The National Genomics Infrastructure Of SciLifeLab

Maxime Garcia,
Pelin Akan,
Teresita Díaz de Ståhl,
Jesper Eisfeldt,
Szilveszter Juhos,
Malin Larsson,
Björn Nystedt,
Pall Olason,
Monica Nistér,
Max Käller

BarnTumörBanken, Department of Oncology Pathology, Karolinska Institutet, Science for Life Laboratory

One of the most prominent uses of NGS is whole genome sequencing (WGS). The
National Genomics Infrastructure (NGI) at Science for Life Laboratory today
provides WGS and germline variant analysis. However, building a robust and
reliable bioinformatics workflow to find somatic mutations is challenging:
tumor samples are heterogeneous and likely contain structural variants and
multiple sub-clones besides the normal tissue.

We present a workflow designed to analyze WGS tumor/normal data in a
high-throughput environment. The framework is based on the Nextflow
domain-specific language on top of Java/Groovy. Using Nextflow we can target
both the Slurm load-balancing environment and local execution, implement
data-flow forks and joins, call external software, and more. Individual
sub-steps of a complex flow can be connected and restarted after failure from
the last execution point.

The preprocessing workflow is based on the BWA aligner and GATK best-practice
steps. To achieve a consensus variant call, different variant callers can be
added; MuTect2, Strelka and VarDict are currently supported, with more to be
added. Structural variants will be estimated by Manta, while ploidy and
sample heterogeneity are measured by ASCAT. The expected output of the
workflow is a VCF file presenting filtered, prioritized and annotated
polymorphisms.

As the Nextflow environment is flexible, we can add other tools or remove
obsolete ones as development progresses. The goal is to build a workflow for
cancer genome analysis that can be deployed to both research and clinical
environments; it is going to be included as a standard workflow at NGI during
the fall of 2016.