Skip to content

Commit

Permalink
Merge pull request #68 from tgen/develop
Browse files Browse the repository at this point in the history
Tempe and Sonoran Updates
  • Loading branch information
bryce-turner authored Sep 30, 2022
2 parents 2494deb + 3fca0cd commit 7c7a990
Show file tree
Hide file tree
Showing 62 changed files with 6,522 additions and 7 deletions.
14 changes: 14 additions & 0 deletions bastien/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Bastien Analysis Pipeline (ROS Cfam 1.0)

### This directory contains automated scripts for building the files needed for the Bastien pipeline

* shared_resource_creation_scripts/create_genome_reference.sh bastien/bastien_resources.ini
* shared_resource_creation_scripts/create_bwa_genome_index.sh bastien/bastien_resources.ini
* shared_resource_creation_scripts/create_gene_model.sh bastien/bastien_resources.ini
* shared_resource_creation_scripts/create_star_genome_index.sh bastien/bastien_resources.ini bastien/star_index_lengths.csv
* shared_resource_creation_scripts/create_salmon_index.sh bastien/bastien_resources.ini
* shared_resource_creation_scripts/create_snpEff_db.sh bastien/bastien_resources.ini
* shared_resource_creation_scripts/create_vep_database.sh bastien/bastien_resources.ini
* coyote/create_exome_capture_resources.sh bastien/bastien_resources.ini coyote/capture_kits.csv
* coyote/create_star-fusion_resource.sh bastien/bastien_resources.ini
* coyote/create_samtools_stats_non_N_region_file.sh bastien/bastien_resources.ini
148 changes: 148 additions & 0 deletions bastien/bastien_resources.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
# Variables used to define Bastien Resource Locations
# Used by Build scripts to create needed files in expected locations

###########################################
## UPDATE THESE BASED ON YOUR LOCAL ENVIRONMENT
###########################################

# Set your environment variable (MUST BE ONE OF "TGen" or "LOCAL")
ENVIRONMENT=TGen
#ENVIRONMENT=LOCAL

# Define input and output directories
PARENT_DIR=/home/tgenref/canis_familiaris/ros_cfam1.0
TOPLEVEL_DIR=/home/tgenref/canis_familiaris/ros_cfam1.0/ros_cfam1.0_tgen
PATH_TO_REPO=/home/tgenjetstream/git_repositories/bt_jetstream_resources
CREATOR=bturner

## WARNING!!!
## PLEASE UPDATE THESE VARIABLES TO SUPPORT LOCAL BUILD

# Set the number of local compute cores to leverage threading options for some steps
LOCAL_COMPUTE_CORES=20
# snpEff.jar (https://sourceforge.net/projects/snpeff/files/snpEff_v4_3t_core.zip/download)
SNPEFF=/packages/snpEff/snpEff_v4_3t_core/snpEff/snpEff.jar
# NCBI eUTILs PATH (https://www.ncbi.nlm.nih.gov/books/NBK179288/)
EUTILS_PATH=/home/jkeats/downloads/edirect/
# JSON.awk (https://github.com/step-/JSON.awk/archive/1.3.tar.gz)
JSON_AWK=/home/jkeats/downloads/JSON.awk-1.3/JSON.awk
# UCSC gtfToGenePred binary (http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/gtfToGenePred)
GTFTOGENEPRED_BINARY=/home/tgenref/binaries/gtfToGenePred/gtfToGenePred
# faToTwoBit (rsync -aP rsync://hgdownload.soe.ucsc.edu/genome/admin/exe/linux.x86_64/faToTwoBit .)
FATOTWOBIT=/home/tgenref/binaries/faToTwoBit/faToTwoBit


###########################################
## Common Variables
###########################################

WORKFLOW_NAME=bastien
GENOME_SUBVERSION_NAME=ros_cfam1.0_tgen
SPECIES="Canis familiaris"


###########################################
## Required Modules (Or available in your $PATH)
###########################################

BOWTIE2_MODULE="Bowtie2/2.3.5.1-GCC-8.2.0-2.31.1"
BOWTIE2_VERSION="2.3.5.1"

BWA_VERSION="0.7.17"
BWA_MEM2_VERSION="2.2.1"

GATK_MODULE="GATK/4.1.4.0-GCCcore-8.2.0-Java-1.8-devel"

SAMTOOLS_VERSION="SAMtools/1.10-GCC-8.2.0-2.31.1"

STAR_VERSION="2.7.5a"

SALMON_VERSION="1.2.1-gompi-2019a"
SALMON_TYPE="puff"

SNPEFF_VERSION="v4_3t"
SNPEFF_DB_NAME=ros_cfam1.0.105

CELLRANGER_VERSION="3.1.0"

DEEPVARIANT_VERSION="v0.10.0"


###########################################
## Reference Genome Index Variables
###########################################

GENOME_SOURCE=ensembl
GENOME_FASTA_DOWNLOAD_LINK=https://ftp.ensembl.org/pub/release-105/fasta/canis_lupus_familiaris/dna/Canis_lupus_familiaris.ROS_Cfam_1.0.dna.toplevel.fa.gz
GENOME_FASTA_MD5_DOWNLOAD_LINK=https://ftp.ensembl.org/pub/release-105/fasta/canis_lupus_familiaris/dna/CHECKSUMS
REFERENCE_DNA_GENOME_NAME=Canis_familiaris.ROS_Cfam_1.0.dna.toplevel.fa
REFERENCE_RNA_GENOME_NAME=Canis_familiaris.ROS_Cfam_1.0.dna.toplevel.fa
GENOME_ASSEMBLY_NAME=ros_cfam1.0
GENOME_SUBVERSION_NAME=ros_cfam1.0_tgen


###########################################
## Gene Model Variables
###########################################

GENE_MODEL_SOURCE=ensembl
GENE_MODEL_NAME=ensembl_v105
GENE_MODEL_DOWNLOAD_LINK=ftp://ftp.ensembl.org/pub/release-105/gtf/canis_lupus_familiaris/Canis_lupus_familiaris.ROS_Cfam_1.0.105.gtf.gz
GENE_MODEL_MD5_DOWNLOAD_LINK=ftp://ftp.ensembl.org/pub/release-105/gtf/canis_lupus_familiaris/CHECKSUMS
GENE_MODEL_FILENAME=Canis_lupus_familiaris.ROS_Cfam_1.0.105.gtf


###########################################
## VEP Database Variables
###########################################

ENSEMBL_VERSION="105"
ENSEMBL_DATABASE="canis_lupus_familiaris_core_105_10"
ENSEMBL_PORT="3306"
VEP_CACHE_DOWNLOAD_LINK=ftp://ftp.ensembl.org/pub/release-105/variation/vep/canis_lupus_familiaris_vep_105_ROS_Cfam_1.0.tar.gz
VEP_CACHE_MD5_DOWNLOAD_LINK=ftp://ftp.ensembl.org/pub/release-105/variation/vep/CHECKSUMS


###########################################
## GATK CNV Variables
###########################################
GENOME_BUILD="Canis_familiaris.ROS_Cfam_1.0"
LAST_PRIMARY_CONTIG="Y"


###########################################
## DogSD Canine SNP Database
###########################################

SNP_FILE_DOWNLOAD_LINK=ftp://download.big.ac.cn/idog/dogsd/vcf/Filtred_Published.vcf.bz2
SNP_FILE_MD5SUM_DOWNLOAD_LINK=ftp://download.big.ac.cn/idog/dogsd/vcf/md5.txt


###########################################
## OMIA Online Mendelian Inheritance in Animals, HTTP resource only
###########################################

XML_FILE_DOWNLOAD_LINK=https://omia.org/dumps/omia.xml.gz


###########################################
## EVA (European Variation Archive), formerly dbSNP data
###########################################

SNP_VCF_FILE_DOWNLOAD_LINK=ftp://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_1/by_species/Dog_9615/CanFam3.1/GCA_000002285.2_current_ids.vcf.gz
SNP_VCF_MD5SUM_DOWNLOAD_LINK=ftp://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_1/by_species/Dog_9615/CanFam3.1/md5checksums.txt


###########################################
## GATK known sites
###########################################

BROAD_KNOWN_SITES=https://data.broadinstitute.org/vgb/dog/dog/canFam3/variation/


###########################################
## Chain file resources
###########################################
CANFAM3_TO_ROS_CHAIN_DOWNLOAD_LINK=https://hgdownload.soe.ucsc.edu/goldenPath/canFam3/liftOver/canFam3ToGCF_014441545.1.over.chain.gz
ROS_CHAIN_ALIASES_DOWNLOAD_LINK=https://hgdownload.soe.ucsc.edu/hubs/GCF/014/441/545/GCF_014441545.1/GCF_014441545.1.chromAlias.txt
FINAL_CHAIN_NAME=canFam3.1ToRos_Cfam1.0.over.chain.gz
2 changes: 2 additions & 0 deletions bastien/capture_kits.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
SC2,Agilent_SureSelect_Canine_Exon_V2,agilent_canine_exonV2_targets_sorted.bed,agilent_canine_exonV2_targets_sorted.bed
XT2,Agilent_SureSelect_XT2_Vidium_v1.0,Vidium_v1.0_exome_Covered_nochr_noheader.bed,Vidium_v1.0_exome_Covered_nochr_noheader.bed
92 changes: 92 additions & 0 deletions bastien/create_canfam3toRosCfam_liftover_chain.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
#!/usr/bin/env bash

# Usage: create_canfam3toRosCfam_liftover_chain.sh <Config.ini>
#
# <Config.ini> is a shell-syntax file of VAR=value assignments; it is sourced
# below, so everything in it is executed by this shell.

### Setting as an interactive BASH session and forcing history to capture commands to a log/README file
HISTFILE=~/.bash_history
set -o history
# -u: unset variables are errors, -e: stop on the first failing command,
# pipefail: a pipeline fails when any of its stages fails (not only the last).
set -ue
set -o pipefail

# Check that a resources.ini was provided on the command line.
# "${1:-}" is required here: with `set -u` active a bare "$1" aborts the script
# with "unbound variable" before the error message below could be printed.
if [ -n "${1:-}" ]
then
    echo "Required ini file detected"
else
    echo "Input INI file not provided, exiting due to missing requirement" >&2
    exit 1
fi

# Read required variables from configuration file
# (quoted so a path containing spaces is passed as a single argument)
. "${1}"

####################################
## Navigate Directory Structure
###################################

# The liftover files live in a "liftover_files" directory that sits next to
# PARENT_DIR (i.e. inside PARENT_DIR's parent directory). Create it when it
# does not exist yet, then make it the working directory for everything that
# follows. All expansions are quoted so paths with spaces are handled.
LIFTOVER_DIR="$(dirname "${PARENT_DIR}")/liftover_files"
if [ -e "${LIFTOVER_DIR}" ]
then
    echo "Liftover directory: ${LIFTOVER_DIR} exists, moving into it"
else
    echo "Liftover directory NOT found, creating and moving into it"
    mkdir -p "${LIFTOVER_DIR}"
fi

# Every later step writes into the current directory, so stop if we cannot enter it.
cd "${LIFTOVER_DIR}" || exit 1

####################################
## Generate liftover chain file(s)
####################################

# Initialize README
# Every write below appends (>>), so running the script again in the same
# directory adds a new section to README instead of replacing the old one.
# The header records where the build scripts live, who ran them (CREATOR from
# the ini file) and when (output of `date`).
touch README
echo >> README
echo "For details on file creation see the associated github repository:" >> README
echo "https://github.com/tgen/jetstream_resources/${WORKFLOW_NAME}" >> README
echo "Created and downloaded by ${CREATOR}" >> README
date >> README
echo >> README
echo "${GENOME_SUBVERSION_NAME} liftover chain file creation details:" >> README
echo >> README

# Download chain file
# The URL is logged to README first; once wget returns, `fc -ln -1` appends the
# previous history entry (the wget command line as written in this script) to
# README, which is why history was switched on at the top of the script.
# NOTE(review): keep each wget line directly above its `fc` line - presumably
# any line inserted between them (even a comment) would become the entry that
# fc logs instead; verify before reordering.
# NOTE(review): --no-check-certificate turns off TLS certificate verification
# for the download - confirm this is still required for these hosts.
echo "Downloading ${CANFAM3_TO_ROS_CHAIN_DOWNLOAD_LINK}" >> README
wget --no-check-certificate ${CANFAM3_TO_ROS_CHAIN_DOWNLOAD_LINK}
fc -ln -1 >> README
echo >> README

# Download aliases so that we can fix the chain file
# Same log / download / record-the-command sequence as for the chain file above.
echo "Downloading ${ROS_CHAIN_ALIASES_DOWNLOAD_LINK}" >> README
wget --no-check-certificate ${ROS_CHAIN_ALIASES_DOWNLOAD_LINK}
fc -ln -1 >> README
echo >> README

# Local file names of the two downloads (last path component of each URL)
CANFAM3_TO_ROS_CHAIN=$(basename "${CANFAM3_TO_ROS_CHAIN_DOWNLOAD_LINK}")
ROS_CHAIN_ALIASES=$(basename "${ROS_CHAIN_ALIASES_DOWNLOAD_LINK}")

# The chain file is labelled for the assembly, not genome reference. Changing that and uncompressing to make parsing easier
WORKING_CHAIN="canFam3To${GENOME_ASSEMBLY_NAME}.chain"
gunzip -c "${CANFAM3_TO_ROS_CHAIN}" > "${WORKING_CHAIN}"

# We know that they have some text issues for lines 40 and 41, so we swap them here, also printing the first two columns only
# (for those two input lines column 3 is used instead of column 2; `tail -n+2`
# then drops the first line of the alias file)
awk -v OFS='\t' '{ if(NR==40 || NR==41) print $1, $3 ; else print $1, $2 }' "${ROS_CHAIN_ALIASES}" | tail -n+2 > "${ROS_CHAIN_ALIASES}_fixed.txt"
mv "${ROS_CHAIN_ALIASES}_fixed.txt" "${ROS_CHAIN_ALIASES}"

# Update the chain file based on the aliases
# One "s/name/alias/g" command is generated per alias row and all of them are
# applied in a single sed pass, instead of rewriting the whole chain file once
# per row. The commands keep the row order, so the result is the same as
# running them one after another.
ALIAS_SED_SCRIPT=$(mktemp ./chain_aliases.XXXXXX)
ESCAPED_DOT='\.'
while read -r name alias _
do
    # A row without an alias has nothing to rename (and an empty name would
    # produce the invalid sed command "s///g"), so skip it.
    if [ -z "${name}" ] || [ -z "${alias}" ]
    then
        continue
    fi
    # Sequence names contain dots (e.g. a ".1" version suffix); escape them so
    # sed matches a literal dot rather than "any character".
    printf 's/%s/%s/g\n' "${name//./${ESCAPED_DOT}}" "${alias}"
done < "${ROS_CHAIN_ALIASES}" > "${ALIAS_SED_SCRIPT}"

# Nothing to apply when the alias table produced no commands
if [ -s "${ALIAS_SED_SCRIPT}" ]
then
    sed -i -f "${ALIAS_SED_SCRIPT}" "${WORKING_CHAIN}"
fi
rm -f "${ALIAS_SED_SCRIPT}"

# Fixing the canFam3 source name to match our expectations - no chr and chrUn_* contigs have .1 at the end
# "chr" and "Un_" are removed everywhere; afterwards ".1" is appended to the
# third field of every line where that field starts with "JH" or "A".
# ${FINAL_CHAIN_NAME%.gz} is the final name without its ".gz" suffix; gzip adds
# the suffix back when it compresses the file.
FINAL_CHAIN_UNCOMPRESSED="${FINAL_CHAIN_NAME%.gz}"
sed -e 's/chr//g' -e 's/Un_//g' "${WORKING_CHAIN}" | awk '{ if ($3 ~ /(^JH|^A)/) $3 = $3 ".1" }1' > "${FINAL_CHAIN_UNCOMPRESSED}"

gzip "${FINAL_CHAIN_UNCOMPRESSED}"

# Remove temp files
rm "${WORKING_CHAIN}"

echo
echo "Process Complete"
echo
Loading

0 comments on commit 7c7a990

Please sign in to comment.