Merge pull request #16 from kircherlab/development

Intergrate v1.6 scripts
kircherlab · Apr 14, 2020 · 8bcdf81 · 8bcdf81
2 parents f75f8f6 + 0a31de3
commit 8bcdf81
Show file tree

Hide file tree

Showing 40 changed files with 22,965 additions and 34,324 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,8 +1,8 @@
 # data folders
-data/*
+data/prescored/*
+data/annotations/*
 input/*
 output/*
-config/*
 
 # compiled files
 *.pyc
@@ -12,12 +12,14 @@ config/*
 
 # snakemake
 .snakemake*
+envs/*
 
 # data files
 *.gz
 *.vcf
 *.tsv
 *.zip
+*.tbi
 
 # temp files
 .*
diff --git a/CADD.sh b/CADD.sh
@@ -1,14 +1,18 @@
 #!/bin/bash
 
-usage="$(basename "$0") [-o <outfile>] [-g <genomebuild>] [-v <caddversion>] [-a] <infile>  -- CADD version 1.5
+usage="$(basename "$0") [-o <outfile>] [-g <genomebuild>] [-v <caddversion>] [-a] <infile>  -- CADD version 1.6
 
 where:
     -h  show this help text
     -o  out tsv.gz file (generated from input file name if not set)
     -g  genome build (supported are GRCh37 and GRCh38 [default: GRCh38])
-    -v  CADD version (either v1.4 or v1.5 [default: v1.5])
+    -v  CADD version (only v1.6 possible with this set of scripts [default: v1.6])
     -a  include annotation in output
-        input vcf of vcf.gz file (required)"
+        input vcf of vcf.gz file (required)
+    -q  print basic information about snakemake run
+    -p  print full information about the snakemake run
+    -c  number of cores that snakemake is allowed to use [default: 1]
+    "
 
 unset OPTARG
 unset OPTIND
@@ -17,8 +21,10 @@ export LC_ALL=C
 GENOMEBUILD="GRCh38"
 ANNOTATION=false
 OUTFILE=""
-VERSION="v1.5"
-while getopts ':ho:g:v:a' option; do
+VERSION="v1.6"
+VERBOSE="-q"
+CORES="1"
+while getopts ':ho:g:v:c:aqp' option; do
   case "$option" in
     h) echo "$usage"
        exit
@@ -29,8 +35,14 @@ while getopts ':ho:g:v:a' option; do
        ;;
     v) VERSION=$OPTARG
        ;;
+    v) CORES=$OPTARG
+       ;;
     a) ANNOTATION=true
        ;;
+    q) VERBOSE=""
+       ;;
+    p) VERBOSE="-p"
+       ;;
    \?) printf "illegal option: -%s\n" "$OPTARG" >&2
        echo "$usage" >&2
        exit 1
@@ -41,7 +53,7 @@ shift $((OPTIND-1))
 
 INFILE=$1
 
-echo "CADD-v1.5 (c) University of Washington, Hudson-Alpha Institute for Biotechnology and Berlin Institute of Health 2013-2019. All rights reserved."
+echo "CADD-v1.6 (c) University of Washington, Hudson-Alpha Institute for Biotechnology and Berlin Institute of Health 2013-2020. All rights reserved."
 
 set -ueo pipefail
 
@@ -72,37 +84,18 @@ then
     exit 1
 fi
 
-if [ "$VERSION" != "v1.4" ] && [ "$VERSION" != "v1.5" ]
+if [ "$VERSION" != "v1.6" ]
 then
-    echo "Unknown/Unsupported CADD version $VERSION. This script currently only supports v1.4 and v1.5."
+    echo "Unknown/Unsupported CADD version $VERSION. This set of script currently only supports v1.6."
+    echo "If you want to score another version of CADD, please download the accordingly tagged version of the scripts"
     exit 1
 fi
 
-if [ "$VERSION" == "v1.5" ] && [ "$GENOMEBUILD" == "GRCh37" ]
-then
-    echo "Please note that CADD scores for GRCh37 version v1.5 are the same as in v1.4."
-    VERSION="v1.4"
-fi
-
 if [ "$ANNOTATION" = 'true' ]
 then
-    ANNO_FOLDER="incl_anno"
+    CONFIG=$CADD/config/config_${GENOMEBUILD}_${VERSION}.yml
 else
-    ANNO_FOLDER="no_anno"
-fi
-
-# Pipeline configuration
-PRESCORED_FOLDER=$CADD/data/prescored/${GENOMEBUILD}_${VERSION}/$ANNO_FOLDER
-REFERENCE_CONFIG=$CADD/config/references_${GENOMEBUILD}_${VERSION}.cfg
-IMPUTE_CONFIG=$CADD/config/impute_${GENOMEBUILD}_${VERSION}.cfg
-MODEL=$CADD/data/models/$GENOMEBUILD/CADD${VERSION}-$GENOMEBUILD.mod
-CONVERSION_TABLE=$CADD/data/models/$GENOMEBUILD/conversionTable_CADD${VERSION}-$GENOMEBUILD.txt
-
-# determine VEP database version
-DBVERSION=92
-if [ "$GENOMEBUILD" == "GRCh38" ] && [ "$VERSION" == "v1.5" ]
-then
-    DBVERSION=95
+    CONFIG=$CADD/config/config_${GENOMEBUILD}_${VERSION}_noanno.yml
 fi
 
 # Setup temporary folder that is removed reliably on exit and is outside of
@@ -111,101 +104,19 @@ TMP_FOLDER=$(mktemp -d)
 trap "rm -rf $TMP_FOLDER" ERR EXIT
 
 # Temp files
-TMP_PRE=$TMP_FOLDER/$NAME.pre.tsv.gz
-TMP_VCF=$TMP_FOLDER/$NAME.vcf
-TMP_ANNO=$TMP_FOLDER/$NAME.anno.tsv.gz
-TMP_IMP=$TMP_FOLDER/$NAME.csv.gz
-TMP_NOV=$TMP_FOLDER/$NAME.nov.tsv.gz
-
-mkdir -p $TMP_FOLDER
-
-### Pipeline
+TMP_INFILE=$TMP_FOLDER/$NAME.$FILEFORMAT
+TMP_OUTFILE=$TMP_FOLDER/$NAME.tsv.gz
 
-# Loading the environment
-if [ "$VERSION" == "v1.4" ]
-then
-    source activate cadd-env
-else
-    source activate cadd-env-v1.5
-fi
-
-# File preparation
-if [ "$FILEFORMAT" == "vcf" ]
-then
-    cat $INFILE \
-    | python $CADD/src/scripts/VCF2vepVCF.py \
-    | sort -k1,1 -k2,2n -k3,3 -k4,4 \
-    | uniq > $TMP_VCF 
-else
-    zcat $INFILE \
-    | python $CADD/src/scripts/VCF2vepVCF.py \
-    | sort -k1,1 -k2,2n -k3,3 -k4,4 \
-    | uniq > $TMP_VCF
-fi
-
-# Prescoring
-echo '## Prescored variant file' | gzip -c > $TMP_PRE;
-if [ -d $PRESCORED_FOLDER ]
-then
-    for PRESCORED in $(ls $PRESCORED_FOLDER/*.tsv.gz)
-    do
-        cat $TMP_VCF \
-        | python $CADD/src/scripts/extract_scored.py --header \
-            -p $PRESCORED --found_out=$TMP_PRE.tmp \
-        > $TMP_VCF.tmp;
-        gzip -c $TMP_PRE.tmp >> $TMP_PRE
-        mv $TMP_VCF.tmp $TMP_VCF;
-    done;
-    rm $TMP_PRE.tmp
-fi
+cp $INFILE $TMP_INFILE
 
-# Variant annotation
-cat $TMP_VCF \
-| vep --quiet --cache --buffer 1000 --no_stats --offline --vcf \
-    --dir $CADD/data/annotations/${GENOMEBUILD}_${VERSION}/vep \
-    --species homo_sapiens --db_version=$DBVERSION \
-    --assembly $GENOMEBUILD --regulatory --sift b \
-    --polyphen b --per_gene --ccds --domains --numbers --canonical \
-    --total_length --force_overwrite --format vcf --output_file STDOUT \
-    --warning_file STDERR \
-| python $CADD/src/scripts/annotateVEPvcf.py -c $REFERENCE_CONFIG \
-| gzip -c > $TMP_ANNO
-rm $TMP_VCF
-
-# Imputation
-zcat $TMP_ANNO \
-| python $CADD/src/scripts/trackTransformation.py -b \
-            -c $IMPUTE_CONFIG -o $TMP_IMP --noheader;
-
-# Score prediction
-python $CADD/src/scripts/predictSKmodel.py \
-    -i $TMP_IMP -m $MODEL -a $TMP_ANNO \
-| python $CADD/src/scripts/max_line_hierarchy.py --all \
-| python $CADD/src/scripts/appendPHREDscore.py -t $CONVERSION_TABLE \
-| gzip -c > $TMP_NOV;
-rm $TMP_ANNO
-rm $TMP_IMP
-
-if [ "$ANNOTATION" = 'false' ]
-then
-    if [ "$GENOMEBUILD" == "GRCh38" ]
-    then
-        COLUMNS="1-4,124,125"
-    else
-        COLUMNS="1-4,106,107"
-    fi
-    zcat $TMP_NOV | cut -f $COLUMNS | uniq | gzip -c > $TMP_NOV.tmp
-    mv $TMP_NOV.tmp $TMP_NOV
-fi
+echo "Running snakemake pipeline:"
+echo snakemake $TMP_OUTFILE --use-conda --conda-prefix $CADD/envs --cores $CORES
+echo --configfile $CONFIG --snakefile $CADD/Snakefile $VERBOSE
+snakemake $TMP_OUTFILE --use-conda --conda-prefix $CADD/envs --cores $CORES \
+    --configfile $CONFIG --snakefile $CADD/Snakefile $VERBOSE
 
-# Join pre and novel scored variants
-{
-    echo "##CADD $GENOMEBUILD-$VERSION (c) University of Washington, Hudson-Alpha Institute for Biotechnology and Berlin Institute of Health 2013-2019. All rights reserved.";
-    head -n 1 < <(zcat $TMP_NOV);
-    zcat $TMP_PRE $TMP_NOV | grep -v "^#" | sort -k1,1 -k2,2n -k3,3 -k4,4 || true;
-} | bgzip -c > $OUTFILE;
-rm $TMP_NOV
-rm $TMP_PRE
+mv $TMP_OUTFILE $OUTFILE
+rm $TMP_INFILE # is in temp folder, should not be necessary
 
 OUTFILE=$(echo $OUTFILE |  sed 's/^\.\///')
 echo -e "\nCADD scored variants written to file: $OUTFILE"

diff --git a/LICENSE b/LICENSE
@@ -1,5 +1,5 @@
 Copyright (c) University of Washington, Hudson-Alpha Institute for
-Biotechnology and Berlin Institute of Health 2013-2019. All rights reserved.
+Biotechnology and Berlin Institute of Health 2013-2020. All rights reserved.
 
 Permission is hereby granted, to all non-commercial users and licensees of CADD
 (Combined Annotation Dependent Framework, licensed by the University of

diff --git a/README.md b/README.md
@@ -24,7 +24,7 @@ Please check our [website for updates and further information](http://cadd.gs.wa
 
 ## Offline Installation
 
-This section describes how users can setup CADD version 1.5 on their own system. Please note that this requires between 100 GB - 1 TB of disc space and at least 12 GB of RAM.
+This section describes how users can setup CADD version 1.6 on their own system. Please note that this requires between 100 GB - 1 TB of disc space and at least 12 GB of RAM.
 
 ### Prerequisite
 
@@ -35,10 +35,12 @@ wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh
 bash Miniconda2-latest-Linux-x86_64.sh -p $HOME/miniconda2 -b
 export PATH=$HOME/miniconda2/bin:$PATH
 ```
+- snakemake (installed via conda)
+```bash
+conda install -c conda-forge -c bioconda snakemake
+```
 
-*Note: You can also install CADD without conda by installing the dependencies otherwise. You can find the list of tools (VEP) and (python) libraries in the [`src/environment.yml`](https://github.com/kircherlab/CADD-scripts/blob/master/src/environment.yml) file (for CADD GRCh38-v1.5 use [`src/environment_v1.5.yml`](https://github.com/kircherlab/CADD-scripts/blob/master/src/environment_v1.5.yml)). In this case, you will also have to disable the line `source activate cadd-env` in `CADD.sh`*
-
-*Note2: If you are using an existing conda installation, please make sure it is [a version >=4.4.0](https://github.com/conda/conda/issues/3200).* 
+*Note2: If you are using an existing conda installation, please make sure it is [a version >=4.4.0](https://github.com/conda/conda/issues/3200). Make also sure to use snakemake >= 4.0 as some command line parameters are not available in earlier versions. *
 
 ### Setup
 
@@ -58,48 +60,50 @@ This is the easier way of installing CADD, just run:
 ./install.sh
 ```
 
-You first state which parts you want to install (the environment as well as at least one genome build including annotation tracks are neccessary for a quick start) and the script should manage loading and unpacking the neccessary files.
+You first state which parts you want to install (the environments as well as at least one genome build including annotation tracks are neccessary for a quick start) and the script should manage loading and unpacking the neccessary files.
 
 #### Manual installation
 
-Running CADD depends on three big building blocks (plus the repository containing this README which we assume you already downloaded):
+Running CADD depends on four big building blocks (plus the repository containing this README which we assume you already downloaded):
 
+ - snakemake
  - dependencies
  - genome annotations
  - prescored variants
 
 **Installing dependencies**
 
-As stated already in the Prerequisite you can install CADD dependencies without conda, although we heavily recommend doing so. This is because managing the various parts becomes very handy by relying it. To setup the neccessary environment, we only need to run the command:
+As of this version, dependencies have to be installed via conda and snakemake. This is because we are using two different enviroments for python2 and python3.
 
 ```bash
-conda env create -f src/environment.yml
+snakemake test/input.vcf --use-conda --create-envs-only --conda-prefix envs \
+        --configfile config/config_GRCh38_v1.6.yml --snakefile Snakefile
 ```
 
-After the installing process (which will take a few minutes), the CADD conda environment will be loaded (via `source activate cadd-env` automatically in the `CADD.sh` script) and CADD can run without further settings.
+Please note that we installing both conda environments in the CADD subdirectory `envs` via `--conda-prefix envs`. If you do not want this behavior (we do this in order to not install the environments in all active directories you run CADD from), adjust or remove this parameter.
 
 **Installing annotations**
 
 Both version of CADD (for the different genome builds) rely on a big number of genomic annotations. Depending on which genome build you require you can get them from our website (be careful where you put them as these are really big files and have identical filenames) via:
 
 ```bash
 # for GRCh37 / hg19
-wget -c http://krishna.gs.washington.edu/download/CADD/v1.4/annotationsGRCh37.tar.gz
+wget -c http://krishna.gs.washington.edu/download/CADD/v1.6/annotationsGRCh37_v1.6.tar.gz
 # for GRCh38 / hg38
-wget -c http://krishna.gs.washington.edu/download/CADD/v1.5/annotationsGRCh38.tar.gz
+wget -c http://krishna.gs.washington.edu/download/CADD/v1.6/annotationsGRCh38_v1.6.tar.gz
 ```
 
 As those files are about 100 and 200 GB in size, downloads can take long (depending on your internet connection). We recommend to setup the process in the background and using a tool (like `wget -c` mentioned above) that allows you to continue an interrupted download.
 
-To make sure you downloaded the files correctly, we recommend downloading md5 hash files from our website (e.g. `wget http://krishna.gs.washington.edu/download/CADD/v1.4/GRCh37/MD5SUMs`) and checking for completness (via `md5sum -c`).
+To make sure you downloaded the files correctly, we recommend downloading md5 hash files from our website (e.g. `wget wget -c http://krishna.gs.washington.edu/download/CADD/v1.6/MD5SUMs`) and checking for completness (via `md5sum -c`).
 
 The annotation files are finally put in the folder `data/annotations` and unpacked:
 
 ```bash
 cd data/annotations
-tar -zxvf annotationsGRCh37.tar.gz
+tar -zxvf annotationsGRCh37_v1.6.tar.gz
 mv GRCh37 GRCh37_v1.4
-tar -zxvf annotationsGRCh38.tar.gz
+tar -zxvf annotationsGRCh38_v1.6.tar.gz
 cd $OLDPWD
 ```
 
@@ -109,35 +113,25 @@ At this point you are ready to go, but if you want a faster version of CADD, you
 
 ### Running CADD
 
-You run CADD via the script `CADD.sh` which technically only requieres an either vcf or vcf.gz input file as last argument. You can further specify the genome build via `-g`, CADD version via `-v`, request a fully annotated output (`-a` flag) and specify a seperate output file via `-o` (else inputfile name `.tsv.gz` is used). I.e:
+You run CADD via the script `CADD.sh` which technically only requieres an either vcf or vcf.gz input file as last argument. You can further specify the genome build via `-g`, CADD version via `-v` (deprecated, the new version of the scripts only support v1.6), request a fully annotated output (`-a` flag) and specify a seperate output file via `-o` (else inputfile name `.tsv.gz` is used). I.e:
 
 ```bash
 ./CADD.sh test/input.vcf
 
-./CADD.sh -a -g GRCh37 -v v1.4 -o output_inclAnno_GRCh37.tsv.gz test/input.vcf
+./CADD.sh -a -g GRCh37 -o output_inclAnno_GRCh37.tsv.gz test/input.vcf
 ```
 
 You can test whether your CADD is set up properly by comparing to the example files in the `test` directory.
 
 ### Update
 
-Between versions 1.4 and 1.5, we adjusted the CADD repository slightly. If you used CADD before and obviously do not want to download all v1.4 files again, please proceed as follows:
-
-1. update the repository (just overwriting is fine, however this dublicates some moved files so `git pull` is prefered)
-2. rename the annotation (and prescored) folder from `data/annotations/$GENOMEBUILD` to `data/annotations/${GENOMEBUILD}_${VERSION}`
-
-```
-mv data/annotations/GRCh37 data/annotations/GRCh37_v1.4
-mv data/annotations/GRCh38 data/annotations/GRCh38_v1.4
+Version 1.6 includes some changes in comparison to v1.5. Next to the obvious switch of the pipeline into a Snakemake workflow which became necessary due to the ongoin issues with `conda activate`, the new models for v1.6 are extended by more specialized annotations for splicing variants, as well as a few minor changes in some other annotations (most prominent: fixed gerp for GRCh38) and changes in consequence categories which make this scripts incompatible with CADD v1.4 and v1.5. If you are still using those version, please use [version 1.5 of this repository](https://github.com/kircherlab/CADD-scripts/archive/CADD1.5.zip).
 
-# if you have prescored files
-mv data/prescored/GRCh37 data/prescored/GRCh37_v1.4
-mv data/prescored/GRCh38 data/prescored/GRCh38_v1.4
 ```
 
 ## Copyright
 Copyright (c) University of Washington, Hudson-Alpha Institute for
-Biotechnology and Berlin Institute of Health 2013-2019. All rights reserved.
+Biotechnology and Berlin Institute of Health 2013-2020. All rights reserved.
 
 Permission is hereby granted, to all non-commercial users and licensees of CADD
 (Combined Annotation Dependent Framework, licensed by the University of