Skip to content

Commit

Permalink
Add arbitrary NIO support to Funcotator Data sources (#5425)
Browse files Browse the repository at this point in the history
Updated data source inputs to accept NIO paths for backing files.

Now you can specify a URL in the backing file areas of the configuration
files for Funcotator data sources and the backing files will be read by
the FuncotationDataSourceFactories.

This effectively enables use of data sources in the cloud or a mix of
local- and cloud-based data sources through a config file change.

This update will enable gnomAD annotations (once the data sources are
    updated to point at the gnomAD files on Google Cloud).

Added cloud-based data sources for testing.

Minor refactoring of LocatableXsvFuncotationFactory: each instance now
supports only one backing file instead of multiple files.

The new cloud data source set contains local data sources and a pointer to
the gnomAD Google Cloud bucket.

Fixes #5348
  • Loading branch information
jonn-smith authored Nov 29, 2018
1 parent 7226ad9 commit e258888
Show file tree
Hide file tree
Showing 114 changed files with 1,168 additions and 444 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
###############################################################################

#Setup variables for the script:
# Resolve the real (symlink-free) path of this script.
#  - The path is passed to Python as an argument rather than spliced into the
#    program text, so quotes or backslashes in the path cannot break it.
#  - print() is called as a function, so the one-liner runs under both
#    Python 2 and Python 3.
UNALIASED_SCRIPT_NAME=""
for _py in python python3 ; do
    if command -v "${_py}" >/dev/null 2>&1 ; then
        UNALIASED_SCRIPT_NAME=$( "${_py}" -c 'import os, sys; print(os.path.realpath(sys.argv[1]))' "${BASH_SOURCE[0]}" ) && break
    fi
done
unset _py
# No usable interpreter: fall back to a single readlink hop, then to the raw path.
if [[ -z "${UNALIASED_SCRIPT_NAME}" ]] ; then
    UNALIASED_SCRIPT_NAME=$( readlink "${BASH_SOURCE[0]}" || echo "${BASH_SOURCE[0]}" )
fi
# Absolute directory that contains the resolved script.
SCRIPTDIR="$( cd "$( dirname "${UNALIASED_SCRIPT_NAME}" )" && pwd )"
# Name the script was invoked as, with any leading directories removed.
SCRIPTNAME="${0##*/}"
MINARGS=2
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
###############################################################################

#Setup variables for the script:
# Resolve the real (symlink-free) path of this script.
#  - The path is passed to Python as an argument rather than spliced into the
#    program text, so quotes or backslashes in the path cannot break it.
#  - print() is called as a function, so the one-liner runs under both
#    Python 2 and Python 3.
UNALIASED_SCRIPT_NAME=""
for _py in python python3 ; do
    if command -v "${_py}" >/dev/null 2>&1 ; then
        UNALIASED_SCRIPT_NAME=$( "${_py}" -c 'import os, sys; print(os.path.realpath(sys.argv[1]))' "${BASH_SOURCE[0]}" ) && break
    fi
done
unset _py
# No usable interpreter: fall back to a single readlink hop, then to the raw path.
if [[ -z "${UNALIASED_SCRIPT_NAME}" ]] ; then
    UNALIASED_SCRIPT_NAME=$( readlink "${BASH_SOURCE[0]}" || echo "${BASH_SOURCE[0]}" )
fi
# Absolute directory that contains the resolved script.
SCRIPTDIR="$( cd "$( dirname "${UNALIASED_SCRIPT_NAME}" )" && pwd )"
# Name the script was invoked as, with any leading directories removed.
SCRIPTNAME="${0##*/}"
MINARGS=2
Expand Down
258 changes: 138 additions & 120 deletions scripts/funcotator/testing/testFuncotator.sh
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ doClean=false
REF_VER=hg19
OUT_FORMAT=VCF
useAOUDataSources=false
useCloudDataSources=false

MANUAL_MODE=false

Expand All @@ -61,7 +62,7 @@ HG38=/Users/jonn/Development/references/Homo_sapiens_assembly38.fasta

# Print the one-line synopsis of the script followed by a short description.
# Globals:  SCRIPTNAME (read) - name shown in the synopsis.
# Outputs:  exactly two lines on stdout; the synopsis is printed once and
#           includes the -cloud option.
function simpleUsage()
{
    echo -e "Usage: $SCRIPTNAME [-c] [-cloud] [-u] [-t] [-19|-38] [-MAF|-VCF] [-AOU]"
    echo -e "Build and run Funcotator."
}

Expand All @@ -71,29 +72,30 @@ function usage()
simpleUsage
echo -e "Can clean, run tests, and run large file tests."
echo -e ""
echo -e "MUST be run from the GATK development directory."
echo -e "MUST be run from the GATK development directory."
echo -e ""
echo -e "Will by default (with no options) build GATK/Funcotator."
echo -e "Will by default (with no options) build GATK/Funcotator."
echo -e "For large file tests, defaults to hg19 tests with VCF output."
echo -e ""
echo -e "The following options are available:"
echo -e " -c clean GATK/Funcotator"
echo -e " -u run all tests in the Funcotator Package."
echo -e " (org.broadinstitute.hellbender.tools.funcotator)"
echo -e " (org.broadinstitute.hellbender.tools.funcotator)"
echo -e " -t run Funcotator on a large data file"
echo -e " (internally configured)"
echo -e " -19 run with hg19 data sources/reference/input file"
echo -e " (default)"
echo -e " (default)"
echo -e " -38 run with hg38 data sources/reference/input file"
echo -e " -MAF create MAF output"
echo -e " -VCF create VCF output (default)"
echo -e " -AOU use the All of Us/Clinical Pipeline data sources"
echo -e " -M REF_VER REFERENCE INPUT DATA_SOURCES run in MANUAL mode, providing all necessary input"
echo -e " REF_VER - a string for the reference version"
echo -e " REFERENCE - reference FASTA file"
echo -e " INPUT - input VCF file"
echo -e " DATA_SOURCES - path to FUNCOTATOR data sources folder"
echo -e ""
echo -e " -MAF create MAF output"
echo -e " -VCF create VCF output (default)"
echo -e " -cloud use cloud data sources"
echo -e " -AOU use the All of Us/Clinical Pipeline data sources"
echo -e " -M REF_VER REFERENCE INPUT DATA_SOURCES run in MANUAL mode, providing all necessary input"
echo -e " REF_VER - a string for the reference version"
echo -e " REFERENCE - reference FASTA file"
echo -e " INPUT - input VCF file"
echo -e " DATA_SOURCES - path to FUNCOTATOR data sources folder"
echo -e ""
echo -e "Return values:"
echo -e " 0 NORMAL"
echo -e " 1 TOO MANY ARGUMENTS"
Expand Down Expand Up @@ -130,11 +132,11 @@ function at_exit()
}

# Abort the script unless the given path is an existing regular file.
# Arguments: $1 - path to check.
# Outputs:   passes a "File does not exist" message to error().
# Returns:   0 when the file exists; otherwise exits the shell with status 3.
function assertFileExists() {
    if [[ ! -f "$1" ]] ; then
        error "Error: File does not exist: $1"
        # Exit unconditionally: the status of error() must not decide
        # whether the script stops.
        exit 3
    fi
    return 0
}

# Abort the script unless the given path is an existing directory.
# Arguments: $1 - path to check.
# Outputs:   passes a "Directory does not exist" message to error().
# Returns:   0 when the directory exists; otherwise exits the shell with status 4.
function assertDirectoryExists() {
    if [[ ! -d "$1" ]] ; then
        error "Error: Directory does not exist: $1"
        # Exit unconditionally: the status of error() must not decide
        # whether the script stops.
        exit 4
    fi
    return 0
}

################################################################################
Expand All @@ -146,7 +148,8 @@ trap at_exit EXIT
# Verify the inputs needed for a Funcotator run.
# Globals:   INPUT, REF, DATA_SOURCES_PATH (read)
# Outputs:   a two-line warning through error() when DATA_SOURCES_PATH is not
#            a local directory.
# Returns:   0; INPUT and REF are checked with assertFileExists.
function assertInputFilesExist() {
    # Quoted so that paths containing spaces reach the check as one argument.
    assertFileExists "${INPUT}"
    assertFileExists "${REF}"

    # The data sources may live in the cloud, so a missing local directory
    # is only worth a warning, never a hard failure.
    if [[ ! -d "${DATA_SOURCES_PATH}" ]] ; then
        error "Warning: Data sources may not exist ${DATA_SOURCES_PATH}"
        error "Ignore this if data sources directory is in the cloud."
    fi
    return 0
}

################################################################################
Expand All @@ -155,49 +158,52 @@ function assertInputFilesExist() {
while [ $# -gt 0 ] ; do

case "$1" in
-c)
doClean=true
;;
-u)
doUnitTests=true
;;
-19)
REF_VER=hg19
;;
-38)
REF_VER=hg38
;;
-c)
doClean=true
;;
-u)
doUnitTests=true
;;
-19)
REF_VER=hg19
;;
-38)
REF_VER=hg38
;;
-VCF)
OUT_FORMAT=VCF
;;
OUT_FORMAT=VCF
;;
-MAF)
OUT_FORMAT=MAF
;;
-AOU)
useAOUDataSources=true
;;
-t)
doRunLargeTests=true
;;
-M)
shift
REF_VER=$1
shift
REF=$1
shift
INPUT=$1
shift
DATA_SOURCES_PATH=$1
MANUAL_MODE=true
# Validate our args:
if [[ ${#REF} -eq 0 ]] || [[ ${#INPUT} -eq 0 ]] || [[ ${#DATA_SOURCES_PATH} -eq 0 ]] ; then
error "Error: For manual mode you must specify a reference version, reference fasta, input file, and data sources directory." && exit 5
fi
;;
--help)
usage
exit 0
;;
OUT_FORMAT=MAF
;;
-AOU)
useAOUDataSources=true
;;
-cloud)
useCloudDataSources=true
;;
-t)
doRunLargeTests=true
;;
-M)
shift
REF_VER=$1
shift
REF=$1
shift
INPUT=$1
shift
DATA_SOURCES_PATH=$1
MANUAL_MODE=true
# Validate our args:
if [[ ${#REF} -eq 0 ]] || [[ ${#INPUT} -eq 0 ]] || [[ ${#DATA_SOURCES_PATH} -eq 0 ]] ; then
error "Error: For manual mode you must specify a reference version, reference fasta, input file, and data sources directory." && exit 5
fi
;;
--help)
usage
exit 0
;;
*)
;;
esac
Expand All @@ -210,35 +216,41 @@ done

# Build GATK/Funcotator exactly once, cleaning first when doClean is true.
# r holds the exit status of the build.  It starts non-zero so the later
# stages, which are all guarded by `$r -eq 0`, only run after a successful build.
r=1
if ${doClean} ; then
    "${GATKDIR}/gradlew" clean compileJava compileTestJava installDist
    r=$?
else
    "${GATKDIR}/gradlew" compileJava compileTestJava installDist
    r=$?
fi

# Run the Funcotator-related unit tests once, and only after a successful build.
# Each test filter is quoted so the shell passes the trailing `*` through to
# Gradle instead of expanding it against files in the current directory.
# r is updated with the exit status of the test run.
if [[ $r -eq 0 ]] && ${doUnitTests} ; then
    echo "################################################################################"
    echo "## Running Unit Tests... "
    "${GATKDIR}/gradlew" test \
        --tests 'org.broadinstitute.hellbender.tools.funcotator*' \
        --tests 'org.broadinstitute.hellbender.utils.codecs.xsvLocatableTable*' \
        --tests 'org.broadinstitute.hellbender.utils.codecs.gencode*' \
        --tests 'org.broadinstitute.hellbender.tools.copynumber.utils.annotatedinterval.SimpleAnnotatedIntervalWriterUnitTest*' \
        --tests 'org.broadinstitute.hellbender.tools.copynumber.utils.annotatedinterval.AnnotatedIntervalCollectionUnitTest*' \
        --stacktrace
    r=$?
fi

################################################################################

if [[ $r -eq 0 ]] && $MANUAL_MODE ; then

echo "################################################################################"
echo "## Running MANUAL Test... "
echo
echo "########################################"
echo "## Using Reference: ${REF_VER} ##"
echo "########################################"
echo "################################################################################"
echo "## Running MANUAL Test... "
echo
echo "########################################"
echo "## Using Reference: ${REF_VER} ##"
echo "########################################"

OUT_FORMAT_LOWER=$( echo "${OUT_FORMAT}" | tr 'A-Z' 'a-z' )
OUT_FILE_NAME=FUNCOTATOR_OUT.${OUT_FORMAT_LOWER}

assertInputFilesExist
assertInputFilesExist

${GATKDIR}/gatk Funcotator \
-V ${INPUT} \
Expand All @@ -250,56 +262,62 @@ if [[ $r -eq 0 ]] && $MANUAL_MODE ; then
--output-file-format ${OUT_FORMAT} -- --java-options '-DGATK_STACKTRACE_ON_USER_EXCEPTION=true'

r=$?
exit $r
exit $r
fi

# Run Funcotator once on a large, internally configured input file, and only
# when every previous stage succeeded.  r is updated with Funcotator's exit
# status.
if [[ $r -eq 0 ]] && ${doRunLargeTests} ; then

    echo "################################################################################"
    echo "## Running Large Tests... "
    echo
    echo
    echo "########################################"
    echo "## Using Reference: ${REF_VER} ##"
    echo "########################################"

    # Pick the input VCF and the reference FASTA for the requested reference
    # version.  The commented-out assignments are alternative inputs.
    if [[ "${REF_VER}" == "hg19" ]] ; then
        INPUT=/Users/jonn/Development/NON_PUBLIC/0816201804HC0_R01C01.vcf
        #INPUT=/Users/jonn/Development/gatk/src/test/resources/large/funcotator/regressionTestVariantSet1.vcf
        #INPUT=/Users/jonn/Development/gatk/src/test/resources/large/funcotator/regressionTestVariantSet2.vcf
        #INPUT=/Users/jonn/Development/gatk/src/test/resources/large/funcotator/regressionTestHg19Large.vcf
        #INPUT=/Users/jonn/Development/gatk/hg38_trio_liftoverb37.vcf
        #INPUT=/Users/jonn/Development/gatk/tmp.vcf
        #INPUT=/Users/jonn/Development/data_to_run/problem_samples/splice_site_should_not_be_splice_site/error_case.vcf

        #HG19=/Users/jonn/Development/references/ucsc.hg19.fasta
        #HG19=/Users/jonn/Development/references/GRCh37.p13.genome.fasta
        REF=$HG19
    else
        INPUT=/Users/jonn/Development/FUNCOTATOR_LARGE_TEST_INPUTS/hg38_trio.vcf
        #INPUT=/Users/jonn/Development/gatk/src/test/resources/large/funcotator/regressionTestVariantSetHG38.vcf
        #INPUT=/Users/jonn/Development/tmp/cohort24_23_seg.subset.vcf
        #INPUT=/Users/jonn/Development/gatk/tmp.38.vcf
        REF=$HG38
    fi

    # Use the AOU data sources if we need them:
    if ${useAOUDataSources} ; then
        echo "Using AOU data sources."
        DATA_SOURCES_PATH=/Users/jonn/Development/funcotator_dataSources.vAoU3
    fi

    # Use cloud data sources if we need them (checked last, so -cloud wins
    # when both -AOU and -cloud are given):
    if ${useCloudDataSources} ; then
        echo "Using cloud data sources."
        DATA_SOURCES_PATH=/Users/jonn/Development/gatk/src/test/resources/large/funcotator/funcotator_dataSources_cloud_gnomad/
        #DATA_SOURCES_PATH=gs://hellbender/test/resources/large/funcotatorDataSourceCollection/funcotator_dataSources_cloud/
    fi

    # Output file extension is the lower-cased output format (vcf / maf).
    OUT_FORMAT_LOWER=$( echo "${OUT_FORMAT}" | tr 'A-Z' 'a-z' )
    OUT_FILE_NAME=FUNCOTATOR_OUT.${OUT_FORMAT_LOWER}

    assertInputFilesExist

    time "${GATKDIR}/gatk" Funcotator \
        -V "${INPUT}" \
        -O "${OUT_FILE_NAME}" \
        -R "${REF}" \
        --verbosity DEBUG \
        --data-sources-path "${DATA_SOURCES_PATH}" \
        --ref-version "${REF_VER}" \
        --output-file-format "${OUT_FORMAT}" -- --java-options '-DGATK_STACKTRACE_ON_USER_EXCEPTION=true'

    r=$?
fi

exit $r
Expand Down
Loading

0 comments on commit e258888

Please sign in to comment.