-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdatabase.sh
100 lines (71 loc) · 4.29 KB
/
database.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#!/bin/bash
set -euo pipefail
# This script is used to download all the application dependencies used in the paper to the selected Path, make sure you have already create the path
CURRENT_PATH=$(pwd)
TARGET_PATH="$HOME/Downloads/Database"
# Ensure the directory exists
mkdir -p "$TARGET_PATH"
# Change to the target directory
cd "$TARGET_PATH"
echo "Starting Download All Neccessary Database, ensure stable connection"
echo "01. Downloading Reference Sequence GRCh38.p14 from UCSC"
## Reference Sequence GRCh38.p13
wget http://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/p14/hg38.p14.fa.gz -O hg38.p14.fa.gz
gzip -d hg38.p14.fa.gz
echo "Run indexing with bwa-mem2, ensure your Memory + Swap > 76GB for smoother experiences"
bwa-mem2 index hg38.p14.fa
echo "Create index using samtools"
samtools faidx hg38.p14.fa
echo "02. Download and process dbNSFP from https://sites.google.com/site/jpopgen/dbNSFP?pli=1"
## Paper used dbNSFP4.4a, This link download dbNSFP4.5a, see full release update on https://sites.google.com/site/jpopgen/dbNSFP/changelog?authuser=0
dbNSFP_version=4.5a
echo "02a. Downloading dbNSFP${dbNSFP_version}"
wget https://usf.box.com/shared/static/2hzcx5s6p1xui7oen16xqzndfrkt8l9l -O dbNSFP${dbNSFP_version}.zip
unzip dbNSFP${dbNSFP_version}.zip
echo "02c. Processing the file into a single file version"
### Create Header
zcat dbNSFP${dbNSFP_version}_variant.chr1.gz | head -n 1 | bgzip > header.gz
### NOTE: bgzip parameter --threads X represents number of threads
cat dbNSFP${dbNSFP_version}_variant.chr{1..22}.gz dbNSFP${dbNSFP_version}_variant.chrX.gz dbNSFP${dbNSFP_version}_variant.chrY.gz dbNSFP${dbNSFP_version}_variant.chrM.gz | zgrep -v '#chr' | bgzip --threads nproc > dbNSFPv${dbNSFP_version}_custom.gz
### Add header back into file
cat header.gz dbNSFPv${dbNSFP_version}_custom.gz > dbNSFPv${dbNSFP_version}_custombuild.gz
zcat dbNSFPv${dbNSFP_version}_custombuild.gz | awk '!/^#/{print $0}' | cut -f 1 | sort | uniq | wc -l
### Create tabix index
tabix -s 1 -b 2 -e 2 dbNSFPv${dbNSFP_version}_custombuild.gz
echo "03. Download and Processing dbSNP from NCBI FTP"
echo "03a. Downloading dbSNP b156"
wget https://ftp.ncbi.nih.gov/snp/latest_release/VCF/GCF_000001405.40.gz -O GCF_000001405.40.gz
wget https://ftp.ncbi.nih.gov/snp/archive/b156/VCF/GCF_000001405.40.gz.tbi -O GCF_000001405.40.gz.tbi
wget https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_assembly_report.txt -O GCF_000001405.40_GRCh38.p14_assembly_report.txt
echo "03b. Processing the file into SnpSift ready file"
bcftools annotate \
--rename-chrs ${CURRENT_PATH}/convert/GCF_000001405.40_GRCh38.p14_assembly_report_revised_snpsift.chrnames \
--threads nproc -Oz \
-o Homo_sapiens_assembly38.dbsnp156_snpsift.vcf.gz \
GCF_000001405.40.gz
echo "03c. Check and filter out forbidden charset annotation"
## Reference : https://pcingola.github.io/SnpEff/ss_faq/
SnpSift vcfCheck Homo_sapiens_assembly38.dbsnp156_snpsift.vcf.gz 2>&1 | grep "INFO field" | cut -f 2 -d "'" | sort | uniq -c
zcat Homo_sapiens_assembly38.dbsnp156_snpsift.vcf.gz \
| sed 's/\&base_change=/\&base_change%3D/g' \
| sed 's/A=;/A%3D;/' \
| sed 's/C=;/C%3D;/' \
| sed 's/G=;/G%3D;/' \
| sed 's/T=;/T%3D;/' \
| sed 's/=,;/%3D,/' \
> Homo_sapiens_assembly38.dbsnp156_snpsift.fixed.vcf
# Let's also compress and index the new file so we can use it as a database
bgzip Homo_sapiens_assembly38.dbsnp156_snpsift.fixed.vcf
tabix Homo_sapiens_assembly38.dbsnp156_snpsift.fixed.vcf.gz
SnpSift vcfCheck Homo_sapiens_assembly38.dbsnp156_snpsift.fixed.vcf.gz 2>&1 | grep "INFO field" | cut -f 2 -d "'" | sort | uniq -c
echo "04. Download and Processing ClinVar from NCBI FTP"
echo "04a. Downloading ClinVar data"
# Paper used 20230514, while the most recent one is 20240215
# File version of 20230514 :
## https://ftp.ncbi.nih.gov/pub/clinvar/vcf_GRCh38/archive_2.0/2023/clinvar_20230514.vcf.gz
## https://ftp.ncbi.nih.gov/pub/clinvar/vcf_GRCh38/archive_2.0/2023/clinvar_20230514_papu.vcf.gz.tbi
clinvar_version=20240215
wget https://ftp.ncbi.nih.gov/pub/clinvar/vcf_GRCh38/clinvar_20240215.vcf.gz -O clinvar_${clinvar_version}.vcf.gz
tabix clinvar_${clinvar_version}.vcf.gz
echo "04b. SnpSift check"
SnpSift vcfCheck clinvar_${clinvar_version}.vcf.gz 2>&1 | grep "INFO field" | cut -f 2 -d "'" | sort | uniq -c