Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Making MAF become the output of Funcotator in M2 WDL and multiple transcript fix. #4941

Merged
merged 15 commits into from
Jul 16, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions scripts/m2_cromwell_tests/run_m2_wdl.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,15 @@ cd "$script_path"
WORKING_DIR=/home/travis/build/broadinstitute

set -e

echo "Creating tar.gz for Funcotator datasources =========="
pushd .
FUNCOTATOR_TEST_DS_DIR=${WORKING_DIR}/gatk/src/test/resources/large/funcotator/
cd ${FUNCOTATOR_TEST_DS_DIR}
# The archive path given to tar below must match the value of Mutect2_Multi.funco_data_sources_tar_gz in test_m2_wdl_multi.json
tar zcvf ${WORKING_DIR}/small_ds_pik3ca.tar.gz small_ds_pik3ca/*
popd

echo "Building docker image for M2 WDL tests (skipping unit tests)..."

#cd $WORKING_DIR/gatk/scripts/docker/
Expand Down
4 changes: 3 additions & 1 deletion scripts/m2_cromwell_tests/test_m2_wdl_multi.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@
"Mutect2_Multi.ref_fai": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/human_g1k_v37.20.21.fasta.fai",
"Mutect2_Multi.ref_dict": "/home/travis/build/broadinstitute/gatk/src/test/resources/large/human_g1k_v37.20.21.dict",
"Mutect2_Multi.pair_list": "/home/travis/build/broadinstitute/gatk/scripts/m2_cromwell_tests/pair_list",
"Mutect2_Multi.funco_data_sources_tar_gz": "/home/travis/build/broadinstitute/small_ds_pik3ca.tar.gz",
"Mutect2_Multi.funco_reference_version": "hg19",
"Mutect2_Multi.scatter_count": 2,
"Mutect2_Multi.run_orientation_bias_filter": true,
"Mutect2_Multi.run_oncotator": true,
"Mutect2_Multi.run_funcotator": false,
"Mutect2_Multi.run_funcotator": true,
"Mutect2_Multi.preemptible_attempts": 2,
"Mutect2_Multi.artifact_modes": ["G/T", "C/T"],
"Mutect2_Multi.compress_vcfs": false,
Expand Down
258 changes: 141 additions & 117 deletions scripts/mutect2_wdl/mutect2.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,14 @@
## filter_oncotator_maf: Whether the MAF generated by oncotator should have the filtered variants removed. Default: true
## realignment_index_bundle: resource for FilterAlignmentArtifacts, which runs if and only if it is specified. Generated by BwaMemIndexImageCreator.
##
## Funcotator parameters (see Funcotator help for more details).
## funco_reference_version: "hg19" for hg19 or b37. "hg38" for hg38. Default: "hg19"
## funco_transcript_selection_list: Transcripts (one GENCODE ID per line) to give priority during selection process.
## funco_transcript_selection_mode: How to select transcripts in Funcotator. ALL, CANONICAL, or BEST_EFFECT
## funco_data_sources_tar_gz: Funcotator datasources tar gz file. Bucket location is recommended when running on the cloud.
## funco_annotation_defaults: Default values for annotations, when values are unspecified. Specified as <ANNOTATION>:<VALUE>. For example: "Center:Broad"
## funco_annotation_overrides: Values for annotations that replace any existing value, even when a value is already specified. Specified as <ANNOTATION>:<VALUE>. For example: "Center:Broad"
##
## Outputs :
## - One VCF file and its index with primary filtering applied; secondary filtering and functional annotation if requested; a bamout.bam
## file of reassembled reads if requested
Expand Down Expand Up @@ -102,12 +110,12 @@ workflow Mutect2 {
# funcotator inputs
Boolean? run_funcotator
Boolean run_funcotator_or_default = select_first([run_funcotator, false])
String? reference_version
String? data_sources_tar_gz
String? transcript_selection_mode
Array[String]? transcript_selection_list
Array[String]? annotation_defaults
Array[String]? annotation_overrides
String? funco_reference_version
File? funco_data_sources_tar_gz
String? funco_transcript_selection_mode
File? funco_transcript_selection_list
Array[String]? funco_annotation_defaults
Array[String]? funco_annotation_overrides

File? gatk_override

Expand All @@ -121,6 +129,7 @@ workflow Mutect2 {
Boolean? filter_funcotations
Boolean filter_funcotations_or_default = select_first([filter_funcotations, true])
String? oncotator_extra_args
String? funcotator_extra_args

Int? preemptible_attempts

Expand All @@ -135,6 +144,7 @@ workflow Mutect2 {

# If no tar is provided, the task downloads one from the Broad Institute's FTP server
Int onco_tar_size = if defined(onco_ds_tar_gz) then ceil(size(onco_ds_tar_gz, "GB") * 3) else 100
Int funco_tar_size = if defined(funco_data_sources_tar_gz) then ceil(size(funco_data_sources_tar_gz, "GB") * 3) else 100
Int gatk_override_size = if defined(gatk_override) then ceil(size(gatk_override, "GB")) else 0

# This is added to every task as padding, should increase if systematically you need more disk for every call
Expand Down Expand Up @@ -348,24 +358,28 @@ workflow Mutect2 {
# Optionally annotate the most-filtered VCF available with Funcotator.
# The input is the first defined of: alignment-artifact filtered, orientation-bias
# filtered, or the primary filtered VCF.
# disk_space_gb is sized from the Funcotator data sources tarball (funco_tar_size),
# not the Oncotator tarball (onco_tar_size), since Funcotator's data sources are what
# this task extracts onto its disk.
if (run_funcotator_or_default) {
File funcotate_vcf_input = select_first([FilterAlignmentArtifacts.filtered_vcf, FilterByOrientationBias.filtered_vcf, Filter.filtered_vcf])
File funcotate_vcf_input_index = select_first([FilterAlignmentArtifacts.filtered_vcf_index, FilterByOrientationBias.filtered_vcf_index, Filter.filtered_vcf_index])
call Funcotate {
call FuncotateMaf {
input:
m2_vcf = funcotate_vcf_input,
m2_vcf_index = funcotate_vcf_input_index,
input_vcf = funcotate_vcf_input,
input_vcf_idx = funcotate_vcf_input_index,
ref_fasta = ref_fasta,
ref_fai = ref_fai,
ref_fasta_index = ref_fai,
ref_dict = ref_dict,
reference_version = select_first([reference_version, "NO_REFERENCE_VERSION_GIVEN"]),
output_name = funcotated_name,
compress = compress,
data_sources_tar_gz = data_sources_tar_gz,
transcript_selection_mode = transcript_selection_mode,
transcript_selection_list = transcript_selection_list,
annotation_defaults = annotation_defaults,
annotation_overrides = annotation_overrides,
reference_version = select_first([funco_reference_version, "hg19"]),
data_sources_tar_gz = funco_data_sources_tar_gz,
case_id = M2.tumor_sample[0],
control_id = M2.normal_sample[0],
transcript_selection_mode = funco_transcript_selection_mode,
transcript_selection_list = funco_transcript_selection_list,
annotation_defaults = funco_annotation_defaults,
annotation_overrides = funco_annotation_overrides,
gatk_docker = gatk_docker,
gatk_override = gatk_override,
filter_funcotations = filter_funcotations_or_default
filter_funcotations = filter_funcotations_or_default,
sequencing_center = sequencing_center,
sequence_source = sequence_source,
disk_space_gb = ceil(size(funcotate_vcf_input, "GB") * large_input_to_output_multiplier) + funco_tar_size + disk_pad,
extra_args = funcotator_extra_args
}
}

Expand All @@ -375,10 +389,8 @@ workflow Mutect2 {
File filtered_vcf = select_first([FilterAlignmentArtifacts.filtered_vcf, FilterByOrientationBias.filtered_vcf, Filter.filtered_vcf])
File filtered_vcf_index = select_first([FilterAlignmentArtifacts.filtered_vcf_index, FilterByOrientationBias.filtered_vcf_index, Filter.filtered_vcf_index])
File? contamination_table = CalculateContamination.contamination_table

File? oncotated_m2_maf = oncotate_m2.oncotated_m2_maf
File? funcotated_vcf = Funcotate.funcotated_vcf
File? funcotated_vcf_index = Funcotate.funcotated_vcf_index
File? funcotated_maf = FuncotateMaf.funcotated_output
File? preadapter_detail_metrics = CollectSequencingArtifactMetrics.pre_adapter_metrics
File? bamout = MergeBamOuts.merged_bam_out
File? bamout_index = MergeBamOuts.merged_bam_out_index
Expand Down Expand Up @@ -975,98 +987,110 @@ task SumFloats {
}
}

# Runs GATK Funcotator on a Mutect2 VCF and emits an annotated VCF plus its index.
#
# Data sources: if data_sources_tar_gz does not exist on disk, the command downloads
# the hard-coded funcotator_dataSources.v1.0.20180105 bundle from the Broad FTP server;
# otherwise the supplied tarball is extracted into ./datasources_dir with its top-level
# directory stripped.
#
# Outputs:
#   funcotated_vcf        - output_name + ".vcf.gz" when compress is true, else ".vcf"
#   funcotated_vcf_index  - the VCF name + ".tbi" when compress is true, else ".idx"
#   NOTE(review): the command never asks for an index explicitly; this presumably relies
#   on GATK writing one next to the VCF by default - confirm.
task Funcotate {
# inputs
# ref_fai, ref_dict and m2_vcf_index are not referenced in the command;
# presumably declared so they are localized next to ref_fasta / m2_vcf - confirm.
File ref_fasta
File ref_fai
File ref_dict
File m2_vcf
File m2_vcf_index
# Passed to Funcotator as --ref-version.
String reference_version
String output_name
Boolean compress
# Output names derived from output_name and the compress flag.
String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf"
String output_vcf_index = output_vcf + if compress then ".tbi" else ".idx"

File? data_sources_tar_gz
String? transcript_selection_mode
# Each element is emitted as its own --transcript-list argument.
Array[String]? transcript_selection_list
# Each element is emitted as its own --annotation-default / --annotation-override argument.
Array[String]? annotation_defaults
Array[String]? annotation_overrides
# When true, --remove-filtered-variants is added to the command line.
Boolean filter_funcotations

# ==============
# Process input args:
# Each *_arg string supplies the leading flag for the first element of its array;
# the sep= in the command supplies the flag for every following element.
String transcript_selection_arg = if defined(transcript_selection_list) then " --transcript-list " else ""
String annotation_def_arg = if defined(annotation_defaults) then " --annotation-default " else ""
String annotation_over_arg = if defined(annotation_overrides) then " --annotation-override " else ""
String filter_funcotations_args = if (filter_funcotations) then " --remove-filtered-variants " else ""
# ==============

# runtime

String gatk_docker
# Optional replacement GATK jar; the command falls back to /root/gatk.jar when unset.
File? gatk_override
# Memory in GB (converted to MB below).
Int? mem
Int? preemptible_attempts
Int? disk_space_gb
Int? cpu

Boolean use_ssd = false

# You may have to change the following two parameter values depending on the task requirements
Int default_ram_mb = 3000
# WARNING: In the workflow, you should calculate the disk space as an input to this task (disk_space_gb). Please see [TODO: Link from Jose] for examples.
Int default_disk_space_gb = 100

# Mem is in units of GB but our command and memory runtime values are in MB
Int machine_mem = if defined(mem) then mem *1000 else default_ram_mb
# The Java heap (-Xmx) is set 1000 MB below the machine memory.
Int command_mem = machine_mem - 1000

command <<<
set -e
export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}

DATA_SOURCES_TAR_GZ=${data_sources_tar_gz}
if [[ ! -e $DATA_SOURCES_TAR_GZ ]] ; then
# We have to download the data sources:
echo "Data sources gzip does not exist: $DATA_SOURCES_TAR_GZ"
echo "Downloading default data sources..."
wget ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/funcotator/funcotator_dataSources.v1.0.20180105.tar.gz
tar -zxf funcotator_dataSources.v1.0.20180105.tar.gz
DATA_SOURCES_FOLDER=funcotator_dataSources.v1.0.20180105
else
# Extract the tar.gz:
mkdir datasources_dir
tar zxvf ${data_sources_tar_gz} -C datasources_dir --strip-components 1
DATA_SOURCES_FOLDER="$PWD/datasources_dir"
fi

gatk --java-options "-Xmx${command_mem}m" Funcotator \
--data-sources-path $DATA_SOURCES_FOLDER \
--ref-version ${reference_version} \
-R ${ref_fasta} \
-V ${m2_vcf} \
-O ${output_vcf} \
${"--transcript-selection-mode " + transcript_selection_mode} \
${transcript_selection_arg}${default="" sep=" --transcript-list " transcript_selection_list} \
${annotation_def_arg}${default="" sep=" --annotation-default " annotation_defaults} \
${annotation_over_arg}${default="" sep=" --annotation-override " annotation_overrides} \
${filter_funcotations_args}
>>>

# Disk falls back to default_disk_space_gb, preemptible to 3 and cpu to 1 when unset.
runtime {
docker: gatk_docker
bootDiskSizeGb: 12
memory: machine_mem + " MB"
disks: "local-disk " + select_first([disk_space_gb, default_disk_space_gb]) + if use_ssd then " SSD" else " HDD"
preemptible: select_first([preemptible_attempts, 3])
cpu: select_first([cpu, 1])
}

output {
File funcotated_vcf = "${output_vcf}"
File funcotated_vcf_index = "${output_vcf_index}"
}
}

# Runs GATK Funcotator on a VCF and writes the annotations in the format given by
# output_format (MAF by default).
#
# Data sources: if data_sources_tar_gz does not exist on disk, the command downloads
# the bundle named by default_datasources_version from the Broad FTP server; otherwise
# the supplied tarball is extracted into ./datasources_dir with its top-level directory
# stripped.
#
# Sample annotations: case_id and control_id are passed as the default values of the
# tumor_barcode and normal_barcode annotations; sequencing_center and sequence_source
# as the defaults of Center and source. Any of the optional ones that is not supplied
# is written as "Unknown".
#
# Output:
#   funcotated_output - basename(input_vcf, ".vcf") + ".maf.annotated"
task FuncotateMaf {
    # inputs
    # ref_fasta_index, ref_dict and input_vcf_idx are not referenced in the command;
    # presumably declared so they are localized next to ref_fasta / input_vcf - confirm.
    File ref_fasta
    File ref_fasta_index
    File ref_dict
    File input_vcf
    File input_vcf_idx
    # Passed to Funcotator as --ref-version.
    String reference_version
    # Passed to Funcotator as --output-file-format.
    String output_format = "MAF"
    String? sequencing_center
    String? sequence_source
    String case_id
    String? control_id

    File? data_sources_tar_gz
    String? transcript_selection_mode
    # File handed to --transcript-list as a single argument.
    File? transcript_selection_list
    # Each element is emitted as its own --annotation-default / --annotation-override argument.
    Array[String]? annotation_defaults
    Array[String]? annotation_overrides
    # When true, --remove-filtered-variants is added to the command line.
    Boolean filter_funcotations
    # Optional intervals, passed as -L.
    File? interval_list

    # Appended verbatim to the end of the Funcotator command line.
    String? extra_args

    # ==============
    # Process input args:
    # Each *_arg string supplies the leading flag for the first element of its array;
    # the sep= in the command supplies the flag for every following element.
    String annotation_def_arg = if defined(annotation_defaults) then " --annotation-default " else ""
    String annotation_over_arg = if defined(annotation_overrides) then " --annotation-override " else ""
    String filter_funcotations_args = if (filter_funcotations) then " --remove-filtered-variants " else ""
    # Only a trailing ".vcf" is stripped from the input name.
    String final_output_filename = basename(input_vcf, ".vcf") + ".maf.annotated"
    # ==============

    # runtime

    String gatk_docker
    # Optional replacement GATK jar; the command falls back to /root/gatk.jar when unset.
    File? gatk_override
    # Memory in GB (converted to MB below).
    Int? mem
    Int? preemptible_attempts
    Int? disk_space_gb
    Int? cpu

    Boolean use_ssd = false

    # This should be updated when a new version of the data sources is released
    String default_datasources_version = "funcotator_dataSources.v1.4.20180615"

    # You may have to change the following two parameter values depending on the task requirements
    Int default_ram_mb = 3000
    # WARNING: In the workflow, you should calculate the disk space as an input to this task (disk_space_gb).
    Int default_disk_space_gb = 100

    # Mem is in units of GB but our command and memory runtime values are in MB
    Int machine_mem = if defined(mem) then mem * 1000 else default_ram_mb
    # The Java heap (-Xmx) is set 1000 MB below the machine memory.
    Int command_mem = machine_mem - 1000

    command <<<
        set -e
        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}

        DATA_SOURCES_TAR_GZ="${data_sources_tar_gz}"
        if [[ ! -e "$DATA_SOURCES_TAR_GZ" ]] ; then
            # We have to download the data sources:
            echo "Data sources gzip does not exist: $DATA_SOURCES_TAR_GZ"
            echo "Downloading default data sources..."
            wget ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/funcotator/${default_datasources_version}.tar.gz
            tar -zxf ${default_datasources_version}.tar.gz
            DATA_SOURCES_FOLDER=${default_datasources_version}
        else
            # Extract the tar.gz:
            mkdir datasources_dir
            tar zxvf "$DATA_SOURCES_TAR_GZ" -C datasources_dir --strip-components 1
            DATA_SOURCES_FOLDER="$PWD/datasources_dir"
        fi

        gatk --java-options "-Xmx${command_mem}m" Funcotator \
            --data-sources-path "$DATA_SOURCES_FOLDER" \
            --ref-version ${reference_version} \
            --output-file-format ${output_format} \
            -R ${ref_fasta} \
            -V ${input_vcf} \
            -O ${final_output_filename} \
            ${"-L " + interval_list} \
            ${"--transcript-selection-mode " + transcript_selection_mode} \
            ${"--transcript-list " + transcript_selection_list} \
            --annotation-default normal_barcode:${default="Unknown" control_id} \
            --annotation-default tumor_barcode:${case_id} \
            --annotation-default Center:${default="Unknown" sequencing_center} \
            --annotation-default source:${default="Unknown" sequence_source} \
            ${annotation_def_arg}${default="" sep=" --annotation-default " annotation_defaults} \
            ${annotation_over_arg}${default="" sep=" --annotation-override " annotation_overrides} \
            ${filter_funcotations_args} \
            ${extra_args}
    >>>

    # Disk falls back to default_disk_space_gb, preemptible to 3 and cpu to 1 when unset.
    runtime {
        docker: gatk_docker
        bootDiskSizeGb: 20
        memory: machine_mem + " MB"
        disks: "local-disk " + select_first([disk_space_gb, default_disk_space_gb]) + if use_ssd then " SSD" else " HDD"
        preemptible: select_first([preemptible_attempts, 3])
        cpu: select_first([cpu, 1])
    }

    output {
        File funcotated_output = "${final_output_filename}"
    }
}
27 changes: 14 additions & 13 deletions scripts/mutect2_wdl/mutect2_multi_sample.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -51,19 +51,19 @@ workflow Mutect2_Multi {

# funcotator inputs
Boolean? run_funcotator
String? reference_version
String? data_sources_tar_gz
String? transcript_selection_mode
Array[String]? transcript_selection_list
Array[String]? annotation_defaults
Array[String]? annotation_overrides
String? funco_reference_version
File? funco_data_sources_tar_gz
String? funco_transcript_selection_mode
File? funco_transcript_selection_list
Array[String]? funco_annotation_defaults
Array[String]? funco_annotation_overrides

File? gatk_override

# runtime
String gatk_docker
String? oncotator_docker
Int? preemptible_attempts
File? gatk_override

scatter( row in pairs ) {
# If the condition is true, variables inside the 'if' block retain their values outside the block.
Expand Down Expand Up @@ -101,12 +101,13 @@ workflow Mutect2_Multi {
sequence_source = sequence_source,
default_config_file = default_config_file,
run_funcotator = run_funcotator,
reference_version = reference_version,
data_sources_tar_gz = data_sources_tar_gz,
transcript_selection_mode = transcript_selection_mode,
transcript_selection_list = transcript_selection_list,
annotation_defaults = annotation_defaults,
annotation_overrides = annotation_overrides,
funco_reference_version = funco_reference_version,
funco_data_sources_tar_gz = funco_data_sources_tar_gz,
funco_transcript_selection_mode = funco_transcript_selection_mode,
funco_transcript_selection_list = funco_transcript_selection_list,
funco_annotation_defaults = funco_annotation_defaults,
funco_annotation_overrides = funco_annotation_overrides,

make_bamout = make_bamout,
compress_vcfs = compress_vcfs,
gatk_override = gatk_override,
Expand Down
Loading