Skip to content

Commit

Permalink
Merge pull request #57 from tgen/develop
Browse files Browse the repository at this point in the history
Supports releases v1.2.x
  • Loading branch information
PedalheadPHX authored Aug 17, 2021
2 parents 1a36880 + 2e367d9 commit 6ffe268
Show file tree
Hide file tree
Showing 64 changed files with 4,609 additions and 1,163 deletions.
16 changes: 14 additions & 2 deletions cactusMouse/README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,15 @@
# Mouse Pipeline
# Mouse Pipeline supports mm10 mouse genome with ensembl 98 gene models


### This directory contains automated scripts for building the files needed for the cactusMouse pipeline

# ../shared_resource_creation_scripts/create_genome_reference.sh cactusMouse_resources.ini
# ../shared_resource_creation_scripts/create_bwa_genome_index_cactusMouse.sh cactusMouse_resources.ini
# ../shared_resource_creation_scripts/create_gene_model.sh cactusMouse_resources.ini
# ../shared_resource_creation_scripts/create_star_genome_index_catcusMouse.sh cactusMouse_resources.ini /home/kdrenner/jetstream_resources/cactusMouse/star_index_lengths.csv
##### Had to update the create_star_genome_index.sh file after receiving an error that not enough RAM was allocated.
#### Used --limitGenomeGenerateRAM 34000000000 and the code was able to run

# ../shared_resource_creation_scripts/create_salmon_index.sh cactusMouse_resources.ini
# ../shared_resource_creation_scripts/create_snpEff_db.sh cactusMouse_resources.ini

supports mm10 mouse genome with ensembl xx gene models
22 changes: 11 additions & 11 deletions cactusMouse/cactusMouse_resources.ini
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ WORKFLOW_NAME=cactusMouse
SPECIES="Mus musculus"
PATH_TO_REPO=/home/tgenjetstream/git_repositories/jetstream_resources
CREATOR=kdrenner
PARENT_DIR=/home/tgenref/mus_musculus/canfam3.1
TOPLEVEL_DIR=/home/tgenref/mus_musculus/canfam3.1/canfam3.1_tgen
PARENT_DIR=/scratch/kdrenner/mus_musculus/grchm38_mm10
TOPLEVEL_DIR=/scratch/kdrenner/mus_musculus/grchm38_mm10/grchm38_mm10_tgen

###########################################
## Required Modules or Binaries (Or available in your $PATH)
Expand All @@ -31,10 +31,10 @@ GTFTOGENEPRED_BINARY=/home/tgenref/binaries/gtfToGenePred/gtfToGenePred
###########################################

GENOME_SOURCE=ensembl
GENOME_FASTA_DOWNLOAD_LINK=ftp://ftp.ensembl.org/pub/release-98/fasta/canis_familiaris/dna/Canis_familiaris.CanFam3.1.dna.toplevel.fa.gz
GENOME_FASTA_MD5_DOWNLOAD_LINK=ftp://ftp.ensembl.org/pub/release-98/fasta/canis_familiaris/dna/CHECKSUMS
GENOME_ASSEMBLY_NAME=canfam3.1
GENOME_SUBVERSION_NAME=canfam3.1_tgen
GENOME_FASTA_DOWNLOAD_LINK=ftp://ftp.ensembl.org/pub/release-98/fasta/mus_musculus/dna/Mus_musculus.GRCm38.dna.toplevel.fa.gz
GENOME_FASTA_MD5_DOWNLOAD_LINK=ftp://ftp.ensembl.org/pub/release-98/fasta/mus_musculus/dna/CHECKSUMS
GENOME_ASSEMBLY_NAME=GRCm38
GENOME_SUBVERSION_NAME=GRCm38_tgen

###########################################
## BWA Index Variables
Expand All @@ -46,10 +46,10 @@ GENOME_SUBVERSION_NAME=canfam3.1_tgen
## Gene Model Variables
###########################################

GENEMODEL_SOURCE=ensembl
GENE_MODEL_SOURCE=ensembl
GENE_MODEL_NAME=ensembl_v98
GENE_MODEL_DOWNLOAD_LINK=ftp://ftp.ensembl.org/pub/release-98/gtf/canis_familiaris/Canis_familiaris.CanFam3.1.98.gtf.gz
GENE_MODEL_MD5_DOWNLOAD_LINK=ftp://ftp.ensembl.org/pub/release-98/gtf/canis_familiaris/CHECKSUMS
GENE_MODEL_DOWNLOAD_LINK=ftp://ftp.ensembl.org/pub/release-98/gtf/mus_musculus/Mus_musculus.GRCm38.98.gtf.gz
GENE_MODEL_MD5_DOWNLOAD_LINK=ftp://ftp.ensembl.org/pub/release-98/gtf/mus_musculus/CHECKSUMS

###########################################
## STAR Index Variables, some created by previous steps
Expand All @@ -68,14 +68,14 @@ SALMON_VERSION="0.14.1"
###########################################

SNPEFF_VERSION="v4_3t"
SNPEFF_DB_NAME=canfam3.98
SNPEFF_DB_NAME=GRCm38.98

###########################################
## VEP Database Variables
###########################################

ENSEMBL_VERSION="98"
VEP_CACHE_DOWNLOAD_LINK=ftp://ftp.ensembl.org/pub/release-98/variation/vep/canis_familiaris_vep_98_CanFam3.1.tar.gz
VEP_CACHE_DOWNLOAD_LINK=ftp://ftp.ensembl.org/pub/release-98/variation/vep/mus_musculus_vep_98_GRCm38.tar.gz
VEP_CACHE_MD5_DOWNLOAD_LINK=ftp://ftp.ensembl.org/pub/release-98/variation/vep/CHECKSUMS

###########################################
Expand Down
23 changes: 10 additions & 13 deletions coyote/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,13 @@

### This directory contains automated scripts for building the files needed for the Coyote pipeline

* ../shared_resource_creation_script/create_genome_reference.sh coyote_resources.ini
* ../shared_resource_creation_scripts/create_bwa_genome_index.sh coyote_resources.ini
* ../shared_resource_creation_scripts/create_gene_model.sh coyote_resources.ini
* ../shared_resource_creation_scripts/create_star_genome_index.sh coyote_resources.ini star_index_lengths.csv
* ../shared_resource_creation_scripts/create_salmon_index.sh coyote_resources.ini
* ../shared_resource_creation_scripts/create_snpEff_db.sh coyote_resources.ini
* THIS MIGHT HAVE FAILED, NO *.bin FILE CREATED
* ../shared_resource_creation_scripts/create_vep_database.sh coyote_resources.ini
* create_exome_capture_resources.sh
* create_star-fusion_resource.sh
* create_samtools_stats_non_N_region_file.sh
*
* next
* shared_resource_creation_scripts/create_genome_reference.sh coyote/coyote_resources.ini
* shared_resource_creation_scripts/create_bwa_genome_index.sh coyote/coyote_resources.ini
* shared_resource_creation_scripts/create_gene_model.sh coyote/coyote_resources.ini
* shared_resource_creation_scripts/create_star_genome_index.sh coyote/coyote_resources.ini coyote/star_index_lengths.csv
* shared_resource_creation_scripts/create_salmon_index.sh coyote/coyote_resources.ini
* shared_resource_creation_scripts/create_snpEff_db.sh coyote/coyote_resources.ini
* shared_resource_creation_scripts/create_vep_database.sh coyote/coyote_resources.ini
* coyote/create_exome_capture_resources.sh coyote/coyote_resources.ini coyote/capture_kits.csv
* coyote/create_star-fusion_resource.sh coyote/coyote_resources.ini
* coyote/create_samtools_stats_non_N_region_file.sh coyote/coyote_resources.ini
2 changes: 2 additions & 0 deletions coyote/capture_kits.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
SC2,Agilent_SureSelect_Canine_Exon_V2,agilent_canine_exonV2_targets_sorted.bed,agilent_canine_exonV2_targets_sorted.bed
XT2,Agilent_SureSelect_XT2_Vidium_v1.0,Vidium_v1.0_exome_Covered_nochr_noheader.bed,Vidium_v1.0_exome_Covered_nochr_noheader.bed
106 changes: 71 additions & 35 deletions coyote/coyote_resources.ini
Original file line number Diff line number Diff line change
@@ -1,30 +1,72 @@
# Variables used to define Phoenix Resource Locations
# Used by Build scripts to create needed files in expected locations

###########################################
## UPDATE THESE BASED ON YOUR LOCAL ENVIRONMENT
###########################################

# Set your environment variable (MUST BE ONE OF "TGen" or "LOCAL")
ENVIRONMENT=TGen
#ENVIRONMENT=LOCAL

# Define input and output directories
PARENT_DIR=/home/tgenref/canis_familiaris/canfam3.1
TOPLEVEL_DIR=/home/tgenref/canis_familiaris/canfam3.1/canfam3.1_tgen
PATH_TO_REPO=/home/tgenjetstream/git_repositories/jetstream_resources
CREATOR=jkeats

## WARNING!!!
## PLEASE UPDATE THESE VARIABLES TO SUPPORT LOCAL BUILD

# Set the number of local compute cores to leverage threading options for some steps
LOCAL_COMPUTE_CORES=20
# snpEff.jar (https://sourceforge.net/projects/snpeff/files/snpEff_v4_3t_core.zip/download)
SNPEFF=/packages/snpEff/snpEff_v4_3t_core/snpEff/snpEff.jar
# NCBI eUTILs PATH (https://www.ncbi.nlm.nih.gov/books/NBK179288/)
EUTILS_PATH=/home/jkeats/downloads/edirect/
# JSON.awk (https://github.com/step-/JSON.awk/archive/1.3.tar.gz)
JSON_AWK=/home/jkeats/downloads/JSON.awk-1.3/JSON.awk
# UCSC gtfToGenePred binary (http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/gtfToGenePred)
GTFTOGENEPRED_BINARY=/home/tgenref/binaries/gtfToGenePred/gtfToGenePred
# faToTwoBit (rsync -aP rsync://hgdownload.soe.ucsc.edu/genome/admin/exe/linux.x86_64/faToTwoBit .)
FATOTWOBIT=/home/tgenref/binaries/faToTwoBit/faToTwoBit


###########################################
## Common Variables
###########################################

WORKFLOW_NAME=coyote
GENOME_SUBVERSION_NAME=canfam3.1_tgen
SPECIES="Canis familiaris"
PATH_TO_REPO=/home/tgenjetstream/git_repositories/jetstream_resources
CREATOR=jkeats
PARENT_DIR=/home/tgenref/canis_familiaris/canfam3.1
TOPLEVEL_DIR=/home/tgenref/canis_familiaris/canfam3.1/canfam3.1_tgen


###########################################
## Required Modules or Binaries (Or available in your $PATH)
## Required Modules (Or available in your $PATH)
###########################################

BOWTIE2_MODULE="Bowtie2/2.3.5.1-GCC-8.2.0-2.31.1"
BOWTIE2_VERSION="2.3.5.1"
BWA_MODULE="BWA/0.7.17-GCC-8.2.0-2.31.1"

BWA_VERSION="0.7.17"
BWA_MEM2_VERSION="2.2.1"

GATK_MODULE="GATK/4.1.4.0-GCCcore-8.2.0-Java-1.8-devel"

SAMTOOLS_VERSION="SAMtools/1.10-GCC-8.2.0-2.31.1"

# This is a hard link to the binary, putting it in your $PATH will fail with existing code
GTFTOGENEPRED_BINARY=/home/tgenref/binaries/gtfToGenePred/gtfToGenePred
STAR_VERSION="2.7.5a"

SALMON_VERSION="1.2.1-gompi-2019a"
SALMON_TYPE="puff"

SNPEFF_VERSION="v4_3t"
SNPEFF_DB_NAME=canfam3.1.98

CELLRANGER_VERSION="3.1.0"

DEEPVARIANT_VERSION="v0.10.0"


###########################################
## Reference Genome Index Variables
Expand All @@ -33,72 +75,66 @@ GTFTOGENEPRED_BINARY=/home/tgenref/binaries/gtfToGenePred/gtfToGenePred
GENOME_SOURCE=ensembl
GENOME_FASTA_DOWNLOAD_LINK=ftp://ftp.ensembl.org/pub/release-98/fasta/canis_familiaris/dna/Canis_familiaris.CanFam3.1.dna.toplevel.fa.gz
GENOME_FASTA_MD5_DOWNLOAD_LINK=ftp://ftp.ensembl.org/pub/release-98/fasta/canis_familiaris/dna/CHECKSUMS
REFERENCE_DNA_GENOME_NAME=Canis_familiaris.CanFam3.1.dna.toplevel.fa
REFERENCE_RNA_GENOME_NAME=Canis_familiaris.CanFam3.1.dna.toplevel.fa
GENOME_ASSEMBLY_NAME=canfam3.1
GENOME_SUBVERSION_NAME=canfam3.1_tgen

###########################################
## BWA Index Variables
###########################################

# See BWA version above

###########################################
## Gene Model Variables
###########################################

GENEMODEL_SOURCE=ensembl
GENE_MODEL_SOURCE=ensembl
GENE_MODEL_NAME=ensembl_v98
GENE_MODEL_DOWNLOAD_LINK=ftp://ftp.ensembl.org/pub/release-98/gtf/canis_familiaris/Canis_familiaris.CanFam3.1.98.gtf.gz
GENE_MODEL_MD5_DOWNLOAD_LINK=ftp://ftp.ensembl.org/pub/release-98/gtf/canis_familiaris/CHECKSUMS
GENE_MODEL_FILENAME=Canis_familiaris.CanFam3.1.98.gtf

###########################################
## STAR Index Variables, some created by previous steps
###########################################

STAR_VERSION="2.7.3a"

###########################################
## Salmon Index Variables, some created by previous steps
###########################################

SALMON_VERSION="0.14.1"

###########################################
## snpEff Custom Database Variables, some created by previous steps
###########################################

SNPEFF_VERSION="v4_3t"
SNPEFF_DB_NAME=canfam3.98

###########################################
## VEP Database Variables
###########################################

ENSEMBL_VERSION="98"
ENSEMBL_DATABASE="canis_familiaris_core_98_31"
ENSEMBL_PORT="3306"
VEP_CACHE_DOWNLOAD_LINK=ftp://ftp.ensembl.org/pub/release-98/variation/vep/canis_familiaris_vep_98_CanFam3.1.tar.gz
VEP_CACHE_MD5_DOWNLOAD_LINK=ftp://ftp.ensembl.org/pub/release-98/variation/vep/CHECKSUMS


###########################################
## Cell Ranger Variables
## GATK CNV Variables
###########################################
GENOME_BUILD="Canis_familiaris.CanFam3.1"
LAST_PRIMARY_CONTIG="X"

CELLRANGER_VERSION="3.1.0"

###########################################
## DogSD Canine SNP Database
###########################################

SNP_FILE_DOWNLOAD_LINK=ftp://download.big.ac.cn/dogsd/snp/
SNP_FILE_DOWNLOAD_LINK=ftp://download.big.ac.cn/idog/dogsd/vcf/Filtred_Published.vcf.bz2
SNP_FILE_MD5SUM_DOWNLOAD_LINK=ftp://download.big.ac.cn/idog/dogsd/vcf/md5.txt


###########################################
## OMIA Online Mendelian Inheritance in Animals, HTTP resource only
###########################################

XML_FILE_DOWNLOAD_LINK=https://omia.org/dumps/omia.xml.gz


###########################################
## EVA (European Variation Archive), formerly dbSNP data
###########################################

SNP_VCF_FILE_DOWNLOAD_LINK=ftp://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_1/by_species/Dog_9615/CanFam3.1/GCA_000002285.2_current_ids.vcf.gz
SNP_VCF_MD5SUM_DOWNLOAD_LINK=ftp://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_1/by_species/Dog_9615/CanFam3.1/md5checksums.txt


###########################################
## GATK known sites
###########################################

BROAD_KNOWN_SITES=https://data.broadinstitute.org/vgb/dog/dog/canFam3/variation/
82 changes: 82 additions & 0 deletions coyote/create_samtools_stats_non_N_region_file.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
#!/usr/bin/env bash
#
# Submit the job that builds the samtools stats "non N" target-regions file
# for the coyote reference, and record how it was made in a README.
#
# Usage: create_samtools_stats_non_N_region_file.sh <resources.ini>
#
# The configuration file is sourced. Between it and the calling environment
# the following variables must be set:
#   TOPLEVEL_DIR                 existing build directory to work in
#   PATH_TO_REPO                 checkout containing utility_scripts/
#   CREATOR                      name written to the README
#   REFERENCE_DNA_GENOME_FASTA   fasta handed to the awk utility script
#
# Exit status: 1 when the top level directory is missing, 2 when the genome
# fasta completion flag is missing or samtools_stats already exists, and
# non-zero when the argument or a required variable is missing.

### Setting as an interactive BASH session and forcing history to capture commands to a log/README file
# History must be on so that `fc -ln -1` further down can copy the sbatch
# command line into the README.
HISTFILE=~/.bash_history
set -o history
set -ue

# Read required variables from configuration file
. "${1:?Usage: ${0##*/} <resources.ini>}"

# Validate everything that is used later BEFORE any directory is created.
# Otherwise an unset variable aborts the script (set -u) after samtools_stats
# exists, and every re-run then stops at the "directory exists" guard.
: "${TOPLEVEL_DIR:?TOPLEVEL_DIR must be set in the configuration file}"
: "${PATH_TO_REPO:?PATH_TO_REPO must be set in the configuration file}"
: "${CREATOR:?CREATOR must be set in the configuration file}"
: "${REFERENCE_DNA_GENOME_FASTA:?REFERENCE_DNA_GENOME_FASTA must be set in the configuration file or environment}"

####################################
## Navigate Directory Structure
###################################

# The top level directory must already exist; it is never created here
if [ -e "${TOPLEVEL_DIR}" ]; then
  echo "Top level directory: ${TOPLEVEL_DIR} exists, moving into it"
  cd "${TOPLEVEL_DIR}"
else
  echo "Top level directory NOT found, IT IS REQUIRED, EXITING"
  exit 1
fi

# Check that the reference genome was created successfully
if [ -e GENOME_FASTA_GENERATION_COMPLETE ]; then
  echo "Genome fasta exists, moving forward"
else
  echo "Genome fasta generation complete flag NOT found"
  echo "Try again later as this is required"
  exit 2
fi

####################################
## Generate samtools stats non N region file
####################################

if [ -e tool_resources ]; then
  echo "tool_resources directory exists, moving into it"
  cd tool_resources
else
  echo "tool_resources directory NOT found, creating and moving into it now"
  mkdir tool_resources
  cd tool_resources
fi

# Refuse to run twice so an existing region file is never overwritten
if [ -e "samtools_stats" ]; then
  echo "The samtools_stats directory exists, exiting to prevent overwriting existing non_N_region_file."
  exit 2
else
  echo "The samtools_stats directory was NOT found, creating and moving into it now"
  mkdir samtools_stats
  cd samtools_stats
fi

# Initialize a samtools_stats index README
# Every line is appended. The original used ">" on two of these lines, which
# truncated the README and discarded the header, creator and date above them.
touch README
{
  echo
  echo "For details on file creation see the associated github repository:"
  echo "https://github.com/tgen/jetstream_resources/coyote"
  echo "Created and downloaded by ${CREATOR}"
  date
  echo
  echo "samtools stats --target-regions non_N_region_file creation details:"
  echo
  echo "By default samtools stats consideres all bases, including N's, in the reference genome when calculating stats."
  echo "The 1 based regions file generated by make_samtools_stats_non_N_region_file_from_fasta.awk excludes all N bases"
  echo "and is passed to the --target-regions option of samtools stats to ensure coverage and mapping statistics are "
  echo "calculated based on non N genome space."
  echo
  echo "Create samtools stats non N region file of primary contigs with chrX removed as follows:"
} >> README

# Submit the awk utility script. Keep this command on ONE line, directly
# above the fc call: fc -ln -1 logs the previous history entry to the README.
sbatch --export ALL "${PATH_TO_REPO}/utility_scripts/make_samtools_stats_non_N_region_file_from_fasta.awk" "${REFERENCE_DNA_GENOME_FASTA}" canfam3.1_coyote_samstats_no_N_1based_primary_contigs_no_chrX.txt 1CONTIG_SEP2CONTIG_SEP3CONTIG_SEP4CONTIG_SEP5CONTIG_SEP6CONTIG_SEP7CONTIG_SEP8CONTIG_SEP9CONTIG_SEP10CONTIG_SEP11CONTIG_SEP12CONTIG_SEP13CONTIG_SEP14CONTIG_SEP15CONTIG_SEP16CONTIG_SEP17CONTIG_SEP18CONTIG_SEP19CONTIG_SEP20CONTIG_SEP21CONTIG_SEP22CONTIG_SEP23CONTIG_SEP24CONTIG_SEP25CONTIG_SEP26CONTIG_SEP27CONTIG_SEP28CONTIG_SEP29CONTIG_SEP30CONTIG_SEP31CONTIG_SEP32CONTIG_SEP33CONTIG_SEP34CONTIG_SEP35CONTIG_SEP36CONTIG_SEP37CONTIG_SEP38
fc -ln -1 >> README
echo >> README
# Keep a copy of the exact awk script that was used alongside the output
cat "${PATH_TO_REPO}/utility_scripts/make_samtools_stats_non_N_region_file_from_fasta.awk" >> README
echo >> README
echo >> README
Loading

0 comments on commit 6ffe268

Please sign in to comment.