Skip to content

Commit

Permalink
Merge pull request #16 from kircherlab/development
Browse files Browse the repository at this point in the history
Intergrate v1.6 scripts
  • Loading branch information
Philipp Rentzsch authored Apr 14, 2020
2 parents f75f8f6 + 0a31de3 commit 8bcdf81
Show file tree
Hide file tree
Showing 40 changed files with 22,965 additions and 34,324 deletions.
6 changes: 4 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# data folders
data/*
data/prescored/*
data/annotations/*
input/*
output/*
config/*

# compiled files
*.pyc
Expand All @@ -12,12 +12,14 @@ config/*

# snakemake
.snakemake*
envs/*

# data files
*.gz
*.vcf
*.tsv
*.zip
*.tbi

# temp files
.*
155 changes: 33 additions & 122 deletions CADD.sh
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
#!/bin/bash

usage="$(basename "$0") [-o <outfile>] [-g <genomebuild>] [-v <caddversion>] [-a] <infile> -- CADD version 1.5
usage="$(basename "$0") [-o <outfile>] [-g <genomebuild>] [-v <caddversion>] [-a] <infile> -- CADD version 1.6
where:
-h show this help text
-o out tsv.gz file (generated from input file name if not set)
-g genome build (supported are GRCh37 and GRCh38 [default: GRCh38])
-v CADD version (either v1.4 or v1.5 [default: v1.5])
-v CADD version (only v1.6 possible with this set of scripts [default: v1.6])
-a include annotation in output
input vcf of vcf.gz file (required)"
input vcf of vcf.gz file (required)
-q print basic information about snakemake run
-p print full information about the snakemake run
-c number of cores that snakemake is allowed to use [default: 1]
"

unset OPTARG
unset OPTIND
Expand All @@ -17,8 +21,10 @@ export LC_ALL=C
GENOMEBUILD="GRCh38"
ANNOTATION=false
OUTFILE=""
VERSION="v1.5"
while getopts ':ho:g:v:a' option; do
VERSION="v1.6"
VERBOSE="-q"
CORES="1"
while getopts ':ho:g:v:c:aqp' option; do
case "$option" in
h) echo "$usage"
exit
Expand All @@ -29,8 +35,14 @@ while getopts ':ho:g:v:a' option; do
;;
v) VERSION=$OPTARG
;;
v) CORES=$OPTARG
;;
a) ANNOTATION=true
;;
q) VERBOSE=""
;;
p) VERBOSE="-p"
;;
\?) printf "illegal option: -%s\n" "$OPTARG" >&2
echo "$usage" >&2
exit 1
Expand All @@ -41,7 +53,7 @@ shift $((OPTIND-1))

INFILE=$1

echo "CADD-v1.5 (c) University of Washington, Hudson-Alpha Institute for Biotechnology and Berlin Institute of Health 2013-2019. All rights reserved."
echo "CADD-v1.6 (c) University of Washington, Hudson-Alpha Institute for Biotechnology and Berlin Institute of Health 2013-2020. All rights reserved."

set -ueo pipefail

Expand Down Expand Up @@ -72,37 +84,18 @@ then
exit 1
fi

if [ "$VERSION" != "v1.4" ] && [ "$VERSION" != "v1.5" ]
if [ "$VERSION" != "v1.6" ]
then
echo "Unknown/Unsupported CADD version $VERSION. This script currently only supports v1.4 and v1.5."
echo "Unknown/Unsupported CADD version $VERSION. This set of script currently only supports v1.6."
echo "If you want to score another version of CADD, please download the accordingly tagged version of the scripts"
exit 1
fi

if [ "$VERSION" == "v1.5" ] && [ "$GENOMEBUILD" == "GRCh37" ]
then
echo "Please note that CADD scores for GRCh37 version v1.5 are the same as in v1.4."
VERSION="v1.4"
fi

if [ "$ANNOTATION" = 'true' ]
then
ANNO_FOLDER="incl_anno"
CONFIG=$CADD/config/config_${GENOMEBUILD}_${VERSION}.yml
else
ANNO_FOLDER="no_anno"
fi

# Pipeline configuration
PRESCORED_FOLDER=$CADD/data/prescored/${GENOMEBUILD}_${VERSION}/$ANNO_FOLDER
REFERENCE_CONFIG=$CADD/config/references_${GENOMEBUILD}_${VERSION}.cfg
IMPUTE_CONFIG=$CADD/config/impute_${GENOMEBUILD}_${VERSION}.cfg
MODEL=$CADD/data/models/$GENOMEBUILD/CADD${VERSION}-$GENOMEBUILD.mod
CONVERSION_TABLE=$CADD/data/models/$GENOMEBUILD/conversionTable_CADD${VERSION}-$GENOMEBUILD.txt

# determine VEP database version
DBVERSION=92
if [ "$GENOMEBUILD" == "GRCh38" ] && [ "$VERSION" == "v1.5" ]
then
DBVERSION=95
CONFIG=$CADD/config/config_${GENOMEBUILD}_${VERSION}_noanno.yml
fi

# Setup temporary folder that is removed reliably on exit and is outside of
Expand All @@ -111,101 +104,19 @@ TMP_FOLDER=$(mktemp -d)
trap "rm -rf $TMP_FOLDER" ERR EXIT

# Temp files
TMP_PRE=$TMP_FOLDER/$NAME.pre.tsv.gz
TMP_VCF=$TMP_FOLDER/$NAME.vcf
TMP_ANNO=$TMP_FOLDER/$NAME.anno.tsv.gz
TMP_IMP=$TMP_FOLDER/$NAME.csv.gz
TMP_NOV=$TMP_FOLDER/$NAME.nov.tsv.gz

mkdir -p $TMP_FOLDER

### Pipeline
TMP_INFILE=$TMP_FOLDER/$NAME.$FILEFORMAT
TMP_OUTFILE=$TMP_FOLDER/$NAME.tsv.gz

# Loading the environment
if [ "$VERSION" == "v1.4" ]
then
source activate cadd-env
else
source activate cadd-env-v1.5
fi

# File preparation
if [ "$FILEFORMAT" == "vcf" ]
then
cat $INFILE \
| python $CADD/src/scripts/VCF2vepVCF.py \
| sort -k1,1 -k2,2n -k3,3 -k4,4 \
| uniq > $TMP_VCF
else
zcat $INFILE \
| python $CADD/src/scripts/VCF2vepVCF.py \
| sort -k1,1 -k2,2n -k3,3 -k4,4 \
| uniq > $TMP_VCF
fi

# Prescoring
echo '## Prescored variant file' | gzip -c > $TMP_PRE;
if [ -d $PRESCORED_FOLDER ]
then
for PRESCORED in $(ls $PRESCORED_FOLDER/*.tsv.gz)
do
cat $TMP_VCF \
| python $CADD/src/scripts/extract_scored.py --header \
-p $PRESCORED --found_out=$TMP_PRE.tmp \
> $TMP_VCF.tmp;
gzip -c $TMP_PRE.tmp >> $TMP_PRE
mv $TMP_VCF.tmp $TMP_VCF;
done;
rm $TMP_PRE.tmp
fi
cp $INFILE $TMP_INFILE

# Variant annotation
cat $TMP_VCF \
| vep --quiet --cache --buffer 1000 --no_stats --offline --vcf \
--dir $CADD/data/annotations/${GENOMEBUILD}_${VERSION}/vep \
--species homo_sapiens --db_version=$DBVERSION \
--assembly $GENOMEBUILD --regulatory --sift b \
--polyphen b --per_gene --ccds --domains --numbers --canonical \
--total_length --force_overwrite --format vcf --output_file STDOUT \
--warning_file STDERR \
| python $CADD/src/scripts/annotateVEPvcf.py -c $REFERENCE_CONFIG \
| gzip -c > $TMP_ANNO
rm $TMP_VCF

# Imputation
zcat $TMP_ANNO \
| python $CADD/src/scripts/trackTransformation.py -b \
-c $IMPUTE_CONFIG -o $TMP_IMP --noheader;

# Score prediction
python $CADD/src/scripts/predictSKmodel.py \
-i $TMP_IMP -m $MODEL -a $TMP_ANNO \
| python $CADD/src/scripts/max_line_hierarchy.py --all \
| python $CADD/src/scripts/appendPHREDscore.py -t $CONVERSION_TABLE \
| gzip -c > $TMP_NOV;
rm $TMP_ANNO
rm $TMP_IMP

if [ "$ANNOTATION" = 'false' ]
then
if [ "$GENOMEBUILD" == "GRCh38" ]
then
COLUMNS="1-4,124,125"
else
COLUMNS="1-4,106,107"
fi
zcat $TMP_NOV | cut -f $COLUMNS | uniq | gzip -c > $TMP_NOV.tmp
mv $TMP_NOV.tmp $TMP_NOV
fi
echo "Running snakemake pipeline:"
echo snakemake $TMP_OUTFILE --use-conda --conda-prefix $CADD/envs --cores $CORES
echo --configfile $CONFIG --snakefile $CADD/Snakefile $VERBOSE
snakemake $TMP_OUTFILE --use-conda --conda-prefix $CADD/envs --cores $CORES \
--configfile $CONFIG --snakefile $CADD/Snakefile $VERBOSE

# Join pre and novel scored variants
{
echo "##CADD $GENOMEBUILD-$VERSION (c) University of Washington, Hudson-Alpha Institute for Biotechnology and Berlin Institute of Health 2013-2019. All rights reserved.";
head -n 1 < <(zcat $TMP_NOV);
zcat $TMP_PRE $TMP_NOV | grep -v "^#" | sort -k1,1 -k2,2n -k3,3 -k4,4 || true;
} | bgzip -c > $OUTFILE;
rm $TMP_NOV
rm $TMP_PRE
mv $TMP_OUTFILE $OUTFILE
rm $TMP_INFILE # is in temp folder, should not be necessary

OUTFILE=$(echo $OUTFILE | sed 's/^\.\///')
echo -e "\nCADD scored variants written to file: $OUTFILE"
Expand Down
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
Copyright (c) University of Washington, Hudson-Alpha Institute for
Biotechnology and Berlin Institute of Health 2013-2019. All rights reserved.
Biotechnology and Berlin Institute of Health 2013-2020. All rights reserved.

Permission is hereby granted, to all non-commercial users and licensees of CADD
(Combined Annotation Dependent Framework, licensed by the University of
Expand Down
50 changes: 22 additions & 28 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ Please check our [website for updates and further information](http://cadd.gs.wa

## Offline Installation

This section describes how users can setup CADD version 1.5 on their own system. Please note that this requires between 100 GB - 1 TB of disc space and at least 12 GB of RAM.
This section describes how users can setup CADD version 1.6 on their own system. Please note that this requires between 100 GB - 1 TB of disc space and at least 12 GB of RAM.

### Prerequisite

Expand All @@ -35,10 +35,12 @@ wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh
bash Miniconda2-latest-Linux-x86_64.sh -p $HOME/miniconda2 -b
export PATH=$HOME/miniconda2/bin:$PATH
```
- snakemake (installed via conda)
```bash
conda install -c conda-forge -c bioconda snakemake
```

*Note: You can also install CADD without conda by installing the dependencies otherwise. You can find the list of tools (VEP) and (python) libraries in the [`src/environment.yml`](https://github.com/kircherlab/CADD-scripts/blob/master/src/environment.yml) file (for CADD GRCh38-v1.5 use [`src/environment_v1.5.yml`](https://github.com/kircherlab/CADD-scripts/blob/master/src/environment_v1.5.yml)). In this case, you will also have to disable the line `source activate cadd-env` in `CADD.sh`*

*Note2: If you are using an existing conda installation, please make sure it is [a version >=4.4.0](https://github.com/conda/conda/issues/3200).*
*Note2: If you are using an existing conda installation, please make sure it is [a version >=4.4.0](https://github.com/conda/conda/issues/3200). Make also sure to use snakemake >= 4.0 as some command line parameters are not available in earlier versions. *

### Setup

Expand All @@ -58,48 +60,50 @@ This is the easier way of installing CADD, just run:
./install.sh
```

You first state which parts you want to install (the environment as well as at least one genome build including annotation tracks are neccessary for a quick start) and the script should manage loading and unpacking the neccessary files.
You first state which parts you want to install (the environments as well as at least one genome build including annotation tracks are neccessary for a quick start) and the script should manage loading and unpacking the neccessary files.

#### Manual installation

Running CADD depends on three big building blocks (plus the repository containing this README which we assume you already downloaded):
Running CADD depends on four big building blocks (plus the repository containing this README which we assume you already downloaded):

- snakemake
- dependencies
- genome annotations
- prescored variants

**Installing dependencies**

As stated already in the Prerequisite you can install CADD dependencies without conda, although we heavily recommend doing so. This is because managing the various parts becomes very handy by relying it. To setup the neccessary environment, we only need to run the command:
As of this version, dependencies have to be installed via conda and snakemake. This is because we are using two different enviroments for python2 and python3.

```bash
conda env create -f src/environment.yml
snakemake test/input.vcf --use-conda --create-envs-only --conda-prefix envs \
--configfile config/config_GRCh38_v1.6.yml --snakefile Snakefile
```

After the installing process (which will take a few minutes), the CADD conda environment will be loaded (via `source activate cadd-env` automatically in the `CADD.sh` script) and CADD can run without further settings.
Please note that we installing both conda environments in the CADD subdirectory `envs` via `--conda-prefix envs`. If you do not want this behavior (we do this in order to not install the environments in all active directories you run CADD from), adjust or remove this parameter.

**Installing annotations**

Both version of CADD (for the different genome builds) rely on a big number of genomic annotations. Depending on which genome build you require you can get them from our website (be careful where you put them as these are really big files and have identical filenames) via:

```bash
# for GRCh37 / hg19
wget -c http://krishna.gs.washington.edu/download/CADD/v1.4/annotationsGRCh37.tar.gz
wget -c http://krishna.gs.washington.edu/download/CADD/v1.6/annotationsGRCh37_v1.6.tar.gz
# for GRCh38 / hg38
wget -c http://krishna.gs.washington.edu/download/CADD/v1.5/annotationsGRCh38.tar.gz
wget -c http://krishna.gs.washington.edu/download/CADD/v1.6/annotationsGRCh38_v1.6.tar.gz
```

As those files are about 100 and 200 GB in size, downloads can take long (depending on your internet connection). We recommend to setup the process in the background and using a tool (like `wget -c` mentioned above) that allows you to continue an interrupted download.

To make sure you downloaded the files correctly, we recommend downloading md5 hash files from our website (e.g. `wget http://krishna.gs.washington.edu/download/CADD/v1.4/GRCh37/MD5SUMs`) and checking for completness (via `md5sum -c`).
To make sure you downloaded the files correctly, we recommend downloading md5 hash files from our website (e.g. `wget wget -c http://krishna.gs.washington.edu/download/CADD/v1.6/MD5SUMs`) and checking for completness (via `md5sum -c`).

The annotation files are finally put in the folder `data/annotations` and unpacked:

```bash
cd data/annotations
tar -zxvf annotationsGRCh37.tar.gz
tar -zxvf annotationsGRCh37_v1.6.tar.gz
mv GRCh37 GRCh37_v1.4
tar -zxvf annotationsGRCh38.tar.gz
tar -zxvf annotationsGRCh38_v1.6.tar.gz
cd $OLDPWD
```

Expand All @@ -109,35 +113,25 @@ At this point you are ready to go, but if you want a faster version of CADD, you

### Running CADD

You run CADD via the script `CADD.sh` which technically only requieres an either vcf or vcf.gz input file as last argument. You can further specify the genome build via `-g`, CADD version via `-v`, request a fully annotated output (`-a` flag) and specify a seperate output file via `-o` (else inputfile name `.tsv.gz` is used). I.e:
You run CADD via the script `CADD.sh` which technically only requieres an either vcf or vcf.gz input file as last argument. You can further specify the genome build via `-g`, CADD version via `-v` (deprecated, the new version of the scripts only support v1.6), request a fully annotated output (`-a` flag) and specify a seperate output file via `-o` (else inputfile name `.tsv.gz` is used). I.e:

```bash
./CADD.sh test/input.vcf

./CADD.sh -a -g GRCh37 -v v1.4 -o output_inclAnno_GRCh37.tsv.gz test/input.vcf
./CADD.sh -a -g GRCh37 -o output_inclAnno_GRCh37.tsv.gz test/input.vcf
```

You can test whether your CADD is set up properly by comparing to the example files in the `test` directory.

### Update

Between versions 1.4 and 1.5, we adjusted the CADD repository slightly. If you used CADD before and obviously do not want to download all v1.4 files again, please proceed as follows:

1. update the repository (just overwriting is fine, however this dublicates some moved files so `git pull` is prefered)
2. rename the annotation (and prescored) folder from `data/annotations/$GENOMEBUILD` to `data/annotations/${GENOMEBUILD}_${VERSION}`

```
mv data/annotations/GRCh37 data/annotations/GRCh37_v1.4
mv data/annotations/GRCh38 data/annotations/GRCh38_v1.4
Version 1.6 includes some changes in comparison to v1.5. Next to the obvious switch of the pipeline into a Snakemake workflow which became necessary due to the ongoin issues with `conda activate`, the new models for v1.6 are extended by more specialized annotations for splicing variants, as well as a few minor changes in some other annotations (most prominent: fixed gerp for GRCh38) and changes in consequence categories which make this scripts incompatible with CADD v1.4 and v1.5. If you are still using those version, please use [version 1.5 of this repository](https://github.com/kircherlab/CADD-scripts/archive/CADD1.5.zip).

# if you have prescored files
mv data/prescored/GRCh37 data/prescored/GRCh37_v1.4
mv data/prescored/GRCh38 data/prescored/GRCh38_v1.4
```
## Copyright
Copyright (c) University of Washington, Hudson-Alpha Institute for
Biotechnology and Berlin Institute of Health 2013-2019. All rights reserved.
Biotechnology and Berlin Institute of Health 2013-2020. All rights reserved.
Permission is hereby granted, to all non-commercial users and licensees of CADD
(Combined Annotation Dependent Framework, licensed by the University of
Expand Down
Loading

0 comments on commit 8bcdf81

Please sign in to comment.