-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #68 from tgen/develop
Tempe and Sonoran Updates
- Loading branch information
Showing
62 changed files
with
6,522 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
# Bastien Analysis Pipeline (ROS Cfam 1.0) | ||
|
||
### This directory contains automated scripts for building the files needed for the Bastien pipeline | ||
|
||
* shared_resource_creation_scripts/create_genome_reference.sh bastien/bastien_resources.ini | ||
* shared_resource_creation_scripts/create_bwa_genome_index.sh bastien/bastien_resources.ini | ||
* shared_resource_creation_scripts/create_gene_model.sh bastien/bastien_resources.ini | ||
* shared_resource_creation_scripts/create_star_genome_index.sh bastien/bastien_resources.ini bastien/star_index_lengths.csv | ||
* shared_resource_creation_scripts/create_salmon_index.sh bastien/bastien_resources.ini | ||
* shared_resource_creation_scripts/create_snpEff_db.sh bastien/bastien_resources.ini | ||
* shared_resource_creation_scripts/create_vep_database.sh bastien/bastien_resources.ini | ||
* coyote/create_exome_capture_resources.sh bastien/bastien_resources.ini coyote/capture_kits.csv | ||
* coyote/create_star-fusion_resource.sh bastien/bastien_resources.ini | ||
* coyote/create_samtools_stats_non_N_region_file.sh bastien/bastien_resources.ini |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,148 @@ | ||
# Variables used to define Bastien Resource Locations | ||
# Used by Build scripts to create needed files in expected locations | ||
|
||
########################################### | ||
## UPDATE THESE BASED ON YOUR LOCAL ENVIRONMENT | ||
########################################### | ||
|
||
# Set your environment variable (MUST BE ONE OF "TGen" or "LOCAL") | ||
ENVIRONMENT=TGen | ||
#ENVIRONMENT=LOCAL | ||
|
||
# Define input and output directories | ||
PARENT_DIR=/home/tgenref/canis_familiaris/ros_cfam1.0 | ||
TOPLEVEL_DIR=/home/tgenref/canis_familiaris/ros_cfam1.0/ros_cfam1.0_tgen | ||
PATH_TO_REPO=/home/tgenjetstream/git_repositories/bt_jetstream_resources | ||
CREATOR=bturner | ||
|
||
## WARNING!!! | ||
## PLEASE UPDATE THESE VARIABLES TO SUPPORT LOCAL BUILD | ||
|
||
# Set the number of local compute cores to leverage threading options for some steps | ||
LOCAL_COMPUTE_CORES=20 | ||
# snpEff.jar (https://sourceforge.net/projects/snpeff/files/snpEff_v4_3t_core.zip/download) | ||
SNPEFF=/packages/snpEff/snpEff_v4_3t_core/snpEff/snpEff.jar | ||
# NCBI eUTILs PATH (https://www.ncbi.nlm.nih.gov/books/NBK179288/) | ||
EUTILS_PATH=/home/jkeats/downloads/edirect/ | ||
# JSON.awk (https://github.com/step-/JSON.awk/archive/1.3.tar.gz) | ||
JSON_AWK=/home/jkeats/downloads/JSON.awk-1.3/JSON.awk | ||
# UCSC gtfToGenePred binary (http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/gtfToGenePred) | ||
GTFTOGENEPRED_BINARY=/home/tgenref/binaries/gtfToGenePred/gtfToGenePred | ||
# faToTwoBit (rsync -aP rsync://hgdownload.soe.ucsc.edu/genome/admin/exe/linux.x86_64/faToTwoBit .) | ||
FATOTWOBIT=/home/tgenref/binaries/faToTwoBit/faToTwoBit | ||
|
||
|
||
########################################### | ||
## Common Variables | ||
########################################### | ||
|
||
WORKFLOW_NAME=bastien | ||
GENOME_SUBVERSION_NAME=ros_cfam1.0_tgen | ||
SPECIES="Canis familiaris" | ||
|
||
|
||
########################################### | ||
## Required Modules (Or available in your $PATH) | ||
########################################### | ||
|
||
BOWTIE2_MODULE="Bowtie2/2.3.5.1-GCC-8.2.0-2.31.1" | ||
BOWTIE2_VERSION="2.3.5.1" | ||
|
||
BWA_VERSION="0.7.17" | ||
BWA_MEM2_VERSION="2.2.1" | ||
|
||
GATK_MODULE="GATK/4.1.4.0-GCCcore-8.2.0-Java-1.8-devel" | ||
|
||
SAMTOOLS_VERSION="SAMtools/1.10-GCC-8.2.0-2.31.1" | ||
|
||
STAR_VERSION="2.7.5a" | ||
|
||
SALMON_VERSION="1.2.1-gompi-2019a" | ||
SALMON_TYPE="puff" | ||
|
||
SNPEFF_VERSION="v4_3t" | ||
SNPEFF_DB_NAME=ros_cfam1.0.105 | ||
|
||
CELLRANGER_VERSION="3.1.0" | ||
|
||
DEEPVARIANT_VERSION="v0.10.0" | ||
|
||
|
||
########################################### | ||
## Reference Genome Index Variables | ||
########################################### | ||
|
||
GENOME_SOURCE=ensembl | ||
GENOME_FASTA_DOWNLOAD_LINK=https://ftp.ensembl.org/pub/release-105/fasta/canis_lupus_familiaris/dna/Canis_lupus_familiaris.ROS_Cfam_1.0.dna.toplevel.fa.gz | ||
GENOME_FASTA_MD5_DOWNLOAD_LINK=https://ftp.ensembl.org/pub/release-105/fasta/canis_lupus_familiaris/dna/CHECKSUMS | ||
REFERENCE_DNA_GENOME_NAME=Canis_familiaris.ROS_Cfam_1.0.dna.toplevel.fa | ||
REFERENCE_RNA_GENOME_NAME=Canis_familiaris.ROS_Cfam_1.0.dna.toplevel.fa | ||
GENOME_ASSEMBLY_NAME=ros_cfam1.0 | ||
GENOME_SUBVERSION_NAME=ros_cfam1.0_tgen | ||
|
||
|
||
########################################### | ||
## Gene Model Variables | ||
########################################### | ||
|
||
GENE_MODEL_SOURCE=ensembl | ||
GENE_MODEL_NAME=ensembl_v105 | ||
GENE_MODEL_DOWNLOAD_LINK=ftp://ftp.ensembl.org/pub/release-105/gtf/canis_lupus_familiaris/Canis_lupus_familiaris.ROS_Cfam_1.0.105.gtf.gz | ||
GENE_MODEL_MD5_DOWNLOAD_LINK=ftp://ftp.ensembl.org/pub/release-105/gtf/canis_lupus_familiaris/CHECKSUMS | ||
GENE_MODEL_FILENAME=Canis_lupus_familiaris.ROS_Cfam_1.0.105.gtf | ||
|
||
|
||
########################################### | ||
## VEP Database Variables | ||
########################################### | ||
|
||
ENSEMBL_VERSION="105" | ||
ENSEMBL_DATABASE="canis_lupus_familiaris_core_105_10" | ||
ENSEMBL_PORT="3306" | ||
VEP_CACHE_DOWNLOAD_LINK=ftp://ftp.ensembl.org/pub/release-105/variation/vep/canis_lupus_familiaris_vep_105_ROS_Cfam_1.0.tar.gz | ||
VEP_CACHE_MD5_DOWNLOAD_LINK=ftp://ftp.ensembl.org/pub/release-105/variation/vep/CHECKSUMS | ||
|
||
|
||
########################################### | ||
## GATK CNV Variables | ||
########################################### | ||
GENOME_BUILD="Canis_familiaris.ROS_Cfam_1.0" | ||
LAST_PRIMARY_CONTIG="Y" | ||
|
||
|
||
########################################### | ||
## DogSD Canine SNP Database | ||
########################################### | ||
|
||
SNP_FILE_DOWNLOAD_LINK=ftp://download.big.ac.cn/idog/dogsd/vcf/Filtred_Published.vcf.bz2 | ||
SNP_FILE_MD5SUM_DOWNLOAD_LINK=ftp://download.big.ac.cn/idog/dogsd/vcf/md5.txt | ||
|
||
|
||
########################################### | ||
## OMIA Online Mendelian Inheritance in Animals, HTTP resource only | ||
########################################### | ||
|
||
XML_FILE_DOWNLOAD_LINK=https://omia.org/dumps/omia.xml.gz | ||
|
||
|
||
########################################### | ||
## EVA (European Variation Archive), formerly dbSNP data | ||
########################################### | ||
|
||
SNP_VCF_FILE_DOWNLOAD_LINK=ftp://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_1/by_species/Dog_9615/CanFam3.1/GCA_000002285.2_current_ids.vcf.gz | ||
SNP_VCF_MD5SUM_DOWNLOAD_LINK=ftp://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_1/by_species/Dog_9615/CanFam3.1/md5checksums.txt | ||
|
||
|
||
########################################### | ||
## GATK known sites | ||
########################################### | ||
|
||
BROAD_KNOWN_SITES=https://data.broadinstitute.org/vgb/dog/dog/canFam3/variation/ | ||
|
||
|
||
########################################### | ||
## Chain file resources | ||
########################################### | ||
CANFAM3_TO_ROS_CHAIN_DOWNLOAD_LINK=https://hgdownload.soe.ucsc.edu/goldenPath/canFam3/liftOver/canFam3ToGCF_014441545.1.over.chain.gz | ||
ROS_CHAIN_ALIASES_DOWNLOAD_LINK=https://hgdownload.soe.ucsc.edu/hubs/GCF/014/441/545/GCF_014441545.1/GCF_014441545.1.chromAlias.txt | ||
FINAL_CHAIN_NAME=canFam3.1ToRos_Cfam1.0.over.chain.gz |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
SC2,Agilent_SureSelect_Canine_Exon_V2,agilent_canine_exonV2_targets_sorted.bed,agilent_canine_exonV2_targets_sorted.bed | ||
XT2,Agilent_SureSelect_XT2_Vidium_v1.0,Vidium_v1.0_exome_Covered_nochr_noheader.bed,Vidium_v1.0_exome_Covered_nochr_noheader.bed |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
#!/usr/bin/env bash

# Usage: create_canfam3toRosCfam_liftover_chain.sh <Config.ini>
#
# Downloads the UCSC canFam3 -> GCF_014441545.1 liftover chain together with
# the matching chromAlias table, rewrites the contig names in the chain using
# that table, strips the "chr"/"Un_" prefixes from the canFam3 names, and
# writes the gzip-compressed result as ${FINAL_CHAIN_NAME} inside
# <dirname of PARENT_DIR>/liftover_files.
#
# Variables read from the sourced ini file:
#   PARENT_DIR, WORKFLOW_NAME, CREATOR, GENOME_SUBVERSION_NAME,
#   GENOME_ASSEMBLY_NAME, CANFAM3_TO_ROS_CHAIN_DOWNLOAD_LINK,
#   ROS_CHAIN_ALIASES_DOWNLOAD_LINK, FINAL_CHAIN_NAME
#
# NOTE: the script is intentionally kept as flat top-level statements; the
# `fc -ln -1` calls log the immediately preceding command line to the README.

### Setting as an interactive BASH session and forcing history to capture commands to a log/README file
HISTFILE=~/.bash_history
set -o history
set -euo pipefail

# Escape a literal string so it can be used as the pattern of a sed s/// command
escape_sed_pattern() {
  printf '%s' "$1" | sed -e 's/[]\/$*.^[]/\\&/g'
}

# Escape a literal string so it can be used as the replacement of a sed s/// command
escape_sed_replacement() {
  printf '%s' "$1" | sed -e 's/[\/&]/\\&/g'
}

# Check that resources.ini was provided on the command line.
# "${1:-}" is required because `set -u` would otherwise abort with an
# "unbound variable" error before the message below could be printed.
if [[ -n "${1:-}" ]]; then
  echo "Required ini file detected"
else
  echo "Input INI file not provided, exiting due to missing requirement"
  exit 1
fi

# Read required variables from configuration file
# shellcheck source=/dev/null
. "${1}"

####################################
## Navigate Directory Structure
###################################

# Create the liftover directory if it is not available, then move into it
LIFTOVER_DIR="$(dirname "${PARENT_DIR}")/liftover_files"
if [[ -e "${LIFTOVER_DIR}" ]]; then
  echo "Liftover directory: ${LIFTOVER_DIR} exists, moving into it"
else
  echo "Liftover directory NOT found, creating and moving into it"
  mkdir -p "${LIFTOVER_DIR}"
fi
cd "${LIFTOVER_DIR}"

####################################
## Generate liftover chain file(s)
####################################

# Local file names are derived from the download links
CANFAM3_TO_ROS_CHAIN="$(basename "${CANFAM3_TO_ROS_CHAIN_DOWNLOAD_LINK}")"
ROS_CHAIN_ALIASES="$(basename "${ROS_CHAIN_ALIASES_DOWNLOAD_LINK}")"
WORKING_CHAIN="canFam3To${GENOME_ASSEMBLY_NAME}.chain"
# Name of the final chain before compression (FINAL_CHAIN_NAME without .gz)
FINAL_CHAIN_UNCOMPRESSED="${FINAL_CHAIN_NAME%.gz}"

# Initialize README
{
  echo
  echo "For details on file creation see the associated github repository:"
  echo "https://github.com/tgen/jetstream_resources/${WORKFLOW_NAME}"
  echo "Created and downloaded by ${CREATOR}"
  date
  echo
  echo "${GENOME_SUBVERSION_NAME} liftover chain file creation details:"
  echo
} >> README

# Download chain file
# -O forces the expected file name so that a rerun overwrites the previous
# download instead of saving "<name>.1" and silently reusing the stale copy.
# NOTE(review): --no-check-certificate disables TLS verification; kept from
# the original script, confirm it is still required.
echo "Downloading ${CANFAM3_TO_ROS_CHAIN_DOWNLOAD_LINK}" >> README
wget --no-check-certificate -O "${CANFAM3_TO_ROS_CHAIN}" "${CANFAM3_TO_ROS_CHAIN_DOWNLOAD_LINK}"
fc -ln -1 >> README
echo >> README

# Download aliases so that we can fix the chain file
echo "Downloading ${ROS_CHAIN_ALIASES_DOWNLOAD_LINK}" >> README
wget --no-check-certificate -O "${ROS_CHAIN_ALIASES}" "${ROS_CHAIN_ALIASES_DOWNLOAD_LINK}"
fc -ln -1 >> README
echo >> README

# The chain file is labelled for the assembly, not genome reference. Changing that and uncompressing to make parsing easier
gunzip -c "${CANFAM3_TO_ROS_CHAIN}" > "${WORKING_CHAIN}"

# We know that they have some text issues for lines 40 and 41, so we take the third column for those lines,
# printing two tab-separated columns only; tail drops the first (header) line
awk -v OFS='\t' '{ if (NR==40 || NR==41) print $1, $3 ; else print $1, $2 }' "${ROS_CHAIN_ALIASES}" \
  | tail -n +2 > "${ROS_CHAIN_ALIASES}_fixed.txt"
mv "${ROS_CHAIN_ALIASES}_fixed.txt" "${ROS_CHAIN_ALIASES}"

# Update the chain file based on the aliases.
# One sed program with a substitution per alias row is generated and applied in
# a single pass, instead of rewriting the whole chain file once per row.
# Names are escaped so that characters such as "." are matched literally.
ALIAS_SED_SCRIPT="$(mktemp)"
trap 'rm -f -- "${ALIAS_SED_SCRIPT}"' EXIT
while IFS=$'\t' read -r contig_name contig_alias; do
  # Rows without both a name and an alias cannot be turned into a substitution
  [[ -n "${contig_name}" && -n "${contig_alias}" ]] || continue
  printf 's/%s/%s/g\n' \
    "$(escape_sed_pattern "${contig_name}")" \
    "$(escape_sed_replacement "${contig_alias}")"
done < "${ROS_CHAIN_ALIASES}" > "${ALIAS_SED_SCRIPT}"
sed -i -f "${ALIAS_SED_SCRIPT}" "${WORKING_CHAIN}"

# Fixing the canFam3 source name to match our expectations - no chr and chrUn_* contigs have .1 at the end
sed 's/chr//g' "${WORKING_CHAIN}" \
  | sed 's/Un_//g' \
  | awk '{ if ($3 ~ /(^JH|^A)/) $3 = $3 ".1" }1' > "${FINAL_CHAIN_UNCOMPRESSED}"

gzip "${FINAL_CHAIN_UNCOMPRESSED}"

# Remove temp files
rm "${WORKING_CHAIN}"

echo
echo "Process Complete"
echo
Oops, something went wrong.