diff --git a/scripts/mutect2_wdl/unsupported/calculate_sensitivity.py b/scripts/mutect2_wdl/unsupported/calculate_sensitivity.py deleted file mode 100644 index 66954cdeac5..00000000000 --- a/scripts/mutect2_wdl/unsupported/calculate_sensitivity.py +++ /dev/null @@ -1,149 +0,0 @@ -""" -The input to this script is a tab-separated table with header columns: -STATUS BAM_DEPTH AF_EXP TYPE - -STATUS takes on values TP (true positive), FP (false positive), FN (false negative), FFN (filtered false negative) -as emitted by the GATK tool Concordance (the input table comes from running VariantsToTable on an output vcf of Concordance). -BAM_DEPTH is the depth of the simulated tumor (i.e. the HapMap mixture) at each site -AF_EXP is the expected allele fraction based on the component samples' genotypes and the mixing fractions -TYPE is either SNP or INDEL - -This script divides the data by SNP/INDEL and by bins of depth and expected allele fraction, and computes the sensitivity -(true positives / (true positives + false negatives + filtered false negatives)) for each bin, along with error bars -on the sensitivity and the raw counts of true and called variants. It produces tables containing this information and plots of -the sensitivity with error bars. - - - -""" - -from __future__ import division -import numpy as np -import pandas as pd -import argparse -from itertools import chain -from statsmodels.stats.proportion import proportion_confint -import matplotlib as mpl -mpl.use('Agg') -import matplotlib.pyplot as plt - -SNP_AF_BINS = [0.05, 0.1, 0.2, 0.4, 0.8, 1.0] -INDEL_AF_BINS = [0.1, 0.2, 1] -AF_JITTER = 0.05 -EPSILON = 0.0001 # regularizer for 0/0 - -EXPECTED_AF_COLUMN = 'AF_EXP' -DEPTH_COLUMN = 'BAM_DEPTH' -STATUS_COLUMN = 'STATUS' # takes values TP, FP, FN, or FFN - -def flatten(x): - return list(chain.from_iterable(x)) - -def generate_sensitivity_table(df, snp_or_indel, depth_bins, depth_bin_width): - - def assign_bin(x, bins, jitter): - """ Assigns 'x' into a bin +/- jitter. 
""" - for _bin in bins: - lower, upper = _bin - jitter, _bin + jitter - if (x > lower and x <= upper): - return _bin - return np.NaN - - def assign_depth_bin(depth): - return assign_bin(depth, depth_bins, depth_bin_width) - - def assign_snp_af_bin(af): - return assign_bin(af, SNP_AF_BINS, AF_JITTER) - - def assign_indel_af_bin(af): - if (af > 0 and af <= 0.1): - return 0.1 - if (af > 0.1 and af <= 0.2): - return 0.2 - if (af > 0.2 and af <= 1): - return 1 - else: - return np.NaN - - # add the 'bin' columns for each row - df.loc[:, 'af_bin'] = df[EXPECTED_AF_COLUMN].apply(assign_snp_af_bin if snp_or_indel == "SNP" else assign_indel_af_bin) - df.loc[:, 'depth_bin'] = df[DEPTH_COLUMN].apply(assign_depth_bin) - df.loc[:, 'is_tp'] = df['STATUS'] == 'TP' - df = df[~np.isnan(df['af_bin'])] - - # make a matrix of binned AF vs binned depth where entries are the number of true positives - called = pd.pivot_table(df, index = 'depth_bin', columns = 'af_bin', - values = ['is_tp'], aggfunc = sum, fill_value = 0) - - # make a matrix of binned AF vs binned depth where entries are the number of true positives and false negatives - total = pd.pivot_table(df, index = 'depth_bin', columns = 'af_bin', - values = [STATUS_COLUMN], aggfunc = len, fill_value = 0) - - sensitivity = (called.values + EPSILON) / (total.values + EPSILON) - - # wrap in pandas dataframe - af_bins = np.sort(df['af_bin'].unique()) - depth_bins = np.sort(df['depth_bin'].unique()) - sensitivity_table = pd.DataFrame(sensitivity, columns = [str(x) for x in af_bins], index = total.index) - - error_bars = [proportion_confint(x, max(y,1), alpha = 0.05, method = 'wilson') for x,y in zip(called.values.flat, total.values.flat)] - - #reshape confidence intervals - num_rows, num_cols = called.shape - new_shape = (num_rows, num_cols * 2) - error_bars = np.array(error_bars).reshape(new_shape) - - #load into dataframe - error_bars = pd.DataFrame(error_bars, index = total.index, - columns = flatten([(str(x) + "_LCI", str(x) + "_UCI") for x in af_bins])) - - sensitivity_with_confidence = sensitivity_table.merge(error_bars, left_index = True, right_index = True) - - # merge the sensitivity with the error bars, the counts total true variants, and the counts of called true variants into a single data frame - total = pd.DataFrame(total.values, columns = [str(x) + "_total_count" for x in af_bins], index = total.index) - called = pd.DataFrame(called.values, columns = [str(x) + "_called_count" for x in af_bins], index = total.index) - sensitivity_with_confidence_and_truth_count = sensitivity_with_confidence.merge(total, left_index = True, right_index = True) - sensitivity_with_confidence_and_counts = sensitivity_with_confidence_and_truth_count.merge(called, left_index = True, right_index = True) - - sensitivity_with_confidence_and_counts.to_csv( snp_or_indel + "_sensitivity.tsv", sep = '\t') - - return sensitivity_with_confidence_and_counts, af_bins - -def draw_sensitivity_graph(df, af_bins, snp_or_indel): - x = df.index # depth - - for _bin in af_bins: - y = df['{0}'.format(_bin)] - low, high = y - df['{0}_LCI'.format(_bin)], df['{0}_UCI'.format(_bin)] - y - plt.errorbar(x,y, yerr = [low, high], label = "{0}".format(_bin)) - - axes = plt.gca() - axes.set_ylim([0.0, 1.0]) - - plt.legend(loc = 4) - plt.title( snp_or_indel + " Sensitivity") - plt.xlabel('Depth') - plt.ylabel('Sensitivity') - plt.savefig( snp_or_indel + "_sensitivity.png") - plt.close() - -def run(): - parser = argparse.ArgumentParser() - parser.add_argument("--input_file") - 
-def run(): - parser = argparse.ArgumentParser() - parser.add_argument("--input_file") - parser.add_argument("--depth_bins", nargs='+', type=int) - parser.add_argument("--depth_bin_width", type=int) - args = parser.parse_args() - - df = pd.read_table(args.input_file, sep = '\t', dtype = {'CHROM': str, 'TYPE': str, 'STATUS': str}) - - snps_df = df.loc[df['TYPE'] == 'SNP'] - indels_df = df.loc[df['TYPE'] == 'INDEL'] - - snp_df, snp_af_bins = generate_sensitivity_table(snps_df, "SNP", args.depth_bins, args.depth_bin_width) - indel_df, indel_af_bins = generate_sensitivity_table(indels_df, "Indel", args.depth_bins, args.depth_bin_width) - - draw_sensitivity_graph(snp_df, snp_af_bins, "SNP") - draw_sensitivity_graph(indel_df, indel_af_bins, "Indel") - -if __name__ == '__main__': - run() \ No newline at end of file diff --git a/scripts/mutect2_wdl/unsupported/hapmap_sensitivity.wdl b/scripts/mutect2_wdl/unsupported/hapmap_sensitivity.wdl deleted file mode 100755 index 399ff01acef..00000000000 --- a/scripts/mutect2_wdl/unsupported/hapmap_sensitivity.wdl +++ /dev/null @@ -1,402 +0,0 @@ -### This is the concordance part of the CRSP sensitivity validation - -#Conceptual Overview -# To measure sensitivity, we sequence a pool of 5, 10, or 20 normal samples. The pool contains a variety of allele fractions, -# like a real tumor sample. These normal samples were also sequenced individually as part of HapMap, so we have "truth" vcfs -# for them. We calculate sensitivity by comparing the calls to the truth data. For a variety of reasons we don't call -# against a matched normal. - - -#Workflow Steps -# 1. Restrict a huge HapMap wgs VCF to given lists of samples and intervals. -# 2. Annotate this VCF using a bam sequenced from a pool derived from the given samples -# 3. Run Mutect in tumor-only mode on this pooled bam. -# 4. Compare Mutect calls to the truth data and output a table of true positives and false negatives along with -# annotations from the truth VCF prepared in steps 1 and 2. - -# Here we implement these steps, except for the subsampling in step 1, for several replicate bams of a *single* plex, -# e.g. 4 different 10-plex bams. Note that each replicate has its own truth vcf because although the truth variants are identical, -# the bam-derived annotations differ slightly. - -import "mutect2.wdl" as MutectSingleSample - -workflow HapmapSensitivity { - File? intervals - File ref_fasta - File ref_fai - File ref_dict - Int scatter_count - File bam_list - Array[Array[String]] replicates = read_tsv(bam_list) - File? pon - File? pon_index - Boolean? run_orientation_bias_filter - Array[String]? artifact_modes - String? m2_extra_args - String? m2_extra_filtering_args - String prefix #a prefix string like "5plex" - File python_script - Int max_depth - Array[Int] depth_bins - Int depth_bin_width - File preprocessed_hapmap - File preprocessed_hapmap_idx - - File? 
gatk_override - String gatk_docker - - call RestrictIntervals { - input: gatk_override = gatk_override, gatk_docker = gatk_docker, vcf = preprocessed_hapmap, vcf_idx = preprocessed_hapmap_idx, intervals = intervals - } - - scatter (row in replicates) { - File bam = row[0] - File index = row[1] - - call MixingFractions { - input: gatk_override = gatk_override, gatk_docker = gatk_docker, vcf = RestrictIntervals.output_vcf, vcf_idx = RestrictIntervals.output_vcf_idx, bam = bam, bam_idx = index - } - - call ExpectedAlleleFraction { - input: gatk_override = gatk_override, gatk_docker = gatk_docker, vcf = RestrictIntervals.output_vcf, vcf_idx = RestrictIntervals.output_vcf_idx, mixing_fractions = MixingFractions.mixing - } - - call BamDepth { - input: gatk_override = gatk_override, gatk_docker = gatk_docker, vcf = ExpectedAlleleFraction.output_vcf, vcf_idx = ExpectedAlleleFraction.output_vcf_idx, - bam = bam, bam_idx = index, max_depth = max_depth - } - - call MutectSingleSample.Mutect2 { - input: - gatk_override = gatk_override, - gatk_docker = gatk_docker, - intervals = intervals, - ref_fasta = ref_fasta, - ref_fai = ref_fai, - ref_dict = ref_dict, - scatter_count = scatter_count, - tumor_bam = bam, - tumor_bai = index, - pon = pon, - pon_index = pon_index, - run_orientation_bias_filter = run_orientation_bias_filter, - artifact_modes = artifact_modes, - m2_extra_args = m2_extra_args, - m2_extra_filtering_args = m2_extra_filtering_args - } - - call Concordance { - input: - gatk_override = gatk_override, intervals = intervals, - gatk_docker = gatk_docker, - truth = BamDepth.output_vcf, - truth_idx = BamDepth.output_vcf_idx, - eval = Mutect2.filtered_vcf, - eval_idx = Mutect2.filtered_vcf_index - } - - call ConvertToTable { - input: gatk_override = gatk_override, gatk_docker = gatk_docker, input_vcf = Concordance.tpfn, input_vcf_idx = Concordance.tpfn_idx - } - } #done with scatter over replicates - - call CombineTables as SensitivityTables { - input: input_tables = ConvertToTable.table, prefix = prefix - } - - call AnalyzeSensitivity { - input: input_table = SensitivityTables.table, python_script = python_script, prefix = prefix, depth_bins = depth_bins, depth_bin_width = depth_bin_width - } - - call CombineTables as SummaryTables { - input: input_tables = Concordance.summary, prefix = prefix - } - - call Jaccard as JaccardSNP { - input: gatk_override = gatk_override, gatk_docker = gatk_docker, calls = Mutect2.filtered_vcf, calls_idx = Mutect2.filtered_vcf_index, prefix = prefix, type = "SNP" - } - - call Jaccard as JaccardINDEL { - input: gatk_override = gatk_override, gatk_docker = gatk_docker, calls = Mutect2.filtered_vcf, calls_idx = Mutect2.filtered_vcf_index, prefix = prefix, type = "INDEL" - } - - output { - File snp_table = AnalyzeSensitivity.snp_table - File snp_plot = AnalyzeSensitivity.snp_plot - File indel_table = AnalyzeSensitivity.indel_table - File indel_plot = AnalyzeSensitivity.indel_plot - File summary = SummaryTables.table - File raw_table = SensitivityTables.table - File snp_jaccard_table = JaccardSNP.table - File indel_jaccard_table = JaccardINDEL.table - Array[File] tpfn = Concordance.tpfn - Array[File] tpfn_idx = Concordance.tpfn_idx - Array[File] ftnfn = Concordance.ftnfn - Array[File] ftnfn_idx = Concordance.ftnfn_idx - Array[File] filter_analysis = Concordance.filter_analysis - } -} #end of workflow - -#### Tasks for making truth -task RestrictIntervals { - File? gatk_override - String gatk_docker - File vcf - File vcf_idx - File? 
intervals - - command { - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - - # only restricting intervals here - # subsampling and restricting to biallelic sites are done in preprocess_hapmap.wdl - gatk --java-options "-Xmx4g" SelectVariants \ - -V ${vcf} \ - -O restricted.vcf \ - ${"-L " + intervals} - } - - runtime { - docker: "${gatk_docker}" - preemptible: 2 - } - - output { - File output_vcf = "restricted.vcf" - File output_vcf_idx = "restricted.vcf.idx" - } -} - -task BamDepth { - File? gatk_override - String gatk_docker - File vcf - File vcf_idx - File bam - File bam_idx - Int max_depth #ignore sites with depth greater than this because they are alignment artifacts - - command { - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - gatk --java-options "-Xmx4g" AnnotateVcfWithBamDepth -V ${vcf} -I ${bam} -O "bam_depth.vcf" - gatk --java-options "-Xmx4g" SelectVariants -V bam_depth.vcf --select "BAM_DEPTH < ${max_depth}" -O truth.vcf - } - - runtime { - docker: "${gatk_docker}" - preemptible: 2 - } - - output { - File output_vcf = "truth.vcf" - File output_vcf_idx = "truth.vcf.idx" - } -} - -task MixingFractions { - File? gatk_override - String gatk_docker - File vcf - File vcf_idx - File bam - File bam_idx - - command { - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - gatk --java-options "-Xmx4g" CalculateMixingFractions -V ${vcf} -I ${bam} -O "mixing.table" - } - - runtime { - docker: "${gatk_docker}" - preemptible: 2 - } - - output { File mixing = "mixing.table" } -} - -task ExpectedAlleleFraction { - File? gatk_override - String gatk_docker - File vcf - File vcf_idx - File mixing_fractions - - command { - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - gatk --java-options "-Xmx4g" AnnotateVcfWithExpectedAlleleFraction -V ${vcf} -O af_exp.vcf --mixing-fractions ${mixing_fractions} - } - - runtime { - docker: "${gatk_docker}" - preemptible: 2 - } - - output { - File output_vcf = "af_exp.vcf" - File output_vcf_idx = "af_exp.vcf.idx" - } -} - -### Tasks for analysing sensitivity - -task ConvertToTable { - File? gatk_override - String gatk_docker - File input_vcf - File input_vcf_idx - - command { - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - gatk --java-options "-Xmx4g" VariantsToTable -V ${input_vcf} -F STATUS -F BAM_DEPTH -F AF_EXP -F TYPE -O "result.table" - } - - runtime { - docker: "${gatk_docker}" - preemptible: 2 - } - - output { File table = "result.table" } -} - -task CombineTables { - Array[File] input_tables - String prefix - - command { - for file in ${sep=' ' input_tables}; do - head -n 1 $file > header - tail -n +2 $file >> body - done - - cat header body > "${prefix}_combined.txt" - } - - runtime { memory: "5 GB" } - - output { File table = "${prefix}_combined.txt" } -} - -task AnalyzeSensitivity { - File input_table - File python_script - String prefix - Array[Int] depth_bins - Int depth_bin_width - - command { - . 
/broad/software/scripts/useuse - use Python-2.7 - python ${python_script} --input_file ${input_table} \ - --depth_bins ${sep = ' ' depth_bins} \ - --depth_bin_width ${depth_bin_width} - mv "SNP_sensitivity.tsv" "${prefix}_SNP_sensitivity.tsv" - mv "SNP_sensitivity.png" "${prefix}_SNP_sensitivity.png" - mv "Indel_sensitivity.tsv" "${prefix}_Indel_sensitivity.tsv" - mv "Indel_sensitivity.png" "${prefix}_Indel_sensitivity.png" - } - - runtime { - continueOnReturnCode: [0,1] - preemptible: 2 - } - - output { - File snp_table = "${prefix}_SNP_sensitivity.tsv" - File snp_plot = "${prefix}_SNP_sensitivity.png" - File indel_table = "${prefix}_Indel_sensitivity.tsv" - File indel_plot = "${prefix}_Indel_sensitivity.png" - } -} - -#Make Jaccard index table for SNVs or indels from an array of called vcfs -task Jaccard { - File? gatk_override - String gatk_docker - Array[File] calls - Array[File] calls_idx - String prefix - String type #SNP or INDEL - - command { - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - - result="${prefix}_${type}_jaccard.txt" - touch $result - - count=0 - for vcf in ${sep = ' ' calls}; do - ((count++)) - gatk --java-options "-Xmx4g" SelectVariants -V $vcf --select-type-to-include ${type} -O ${type}_only_$count.vcf - done - - for file1 in ${type}_only*.vcf; do - column=0 - for file2 in ${type}_only*.vcf; do - ((column++)) - if [ $column != 1 ]; then - printf "\t" >> $result - fi - - if [ $file1 == $file2 ]; then - printf 1.0000 >> $result - else - gatk --java-options "-Xmx4g" SelectVariants -V $file1 --concordance $file2 -O overlap.vcf - overlap=`grep -v '#' overlap.vcf | wc -l` - - num1=`grep -v '#' $file1 | wc -l` - num2=`grep -v '#' $file2 | wc -l` - just1=$((num1 - overlap)) - just2=$((num2 - overlap)) - - total=$((overlap + just1 + just2)) - jaccard=`echo "$overlap / $total" | bc -l` - - printf "%0.6f" $jaccard >> $result - fi - done - printf "\n" >> $result - done - - } - - runtime { - docker: "${gatk_docker}" - preemptible: 2 - } - - output { File table = "${prefix}_${type}_jaccard.txt" } -} -
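The bash loop above computes, for each pair of call sets A and B, the Jaccard index overlap / (overlap + just1 + just2), i.e. |A intersect B| / |A union B|. The same arithmetic in a few lines of Python (an editorial sketch; the variant-key strings are invented stand-ins for the SelectVariants --concordance step):

    def jaccard(calls_a, calls_b):
        # Jaccard index of two call sets: |A & B| / |A | B|
        overlap = len(calls_a & calls_b)
        total = len(calls_a) + len(calls_b) - overlap
        return overlap / total if total else 1.0

    a = {"chr1:100:A>T", "chr1:200:C>G", "chr2:50:G>A"}
    b = {"chr1:100:A>T", "chr2:50:G>A", "chr3:10:T>C"}
    print("%0.6f" % jaccard(a, b))   # 0.500000: 2 shared calls out of 4 distinct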
-task Concordance { - File? gatk_override - String gatk_docker - File? intervals - File truth - File truth_idx - File eval - File eval_idx - - command { - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - gatk --java-options "-Xmx4g" Concordance ${"-L " + intervals} \ - -truth ${truth} -eval ${eval} \ - -tpfn "tpfn.vcf" \ - -ftnfn "ftnfn.vcf" \ - --filter-analysis "filter-analysis.txt" \ - -summary summary.tsv - } - - runtime { - docker: "${gatk_docker}" - preemptible: 2 - } - - output { - File tpfn = "tpfn.vcf" - File tpfn_idx = "tpfn.vcf.idx" - File ftnfn = "ftnfn.vcf" - File ftnfn_idx = "ftnfn.vcf.idx" - File filter_analysis = "filter-analysis.txt" - File summary = "summary.tsv" - } -} diff --git a/scripts/mutect2_wdl/unsupported/hapmap_sensitivity_all_plexes.wdl b/scripts/mutect2_wdl/unsupported/hapmap_sensitivity_all_plexes.wdl deleted file mode 100755 index ff91540d7a9..00000000000 --- a/scripts/mutect2_wdl/unsupported/hapmap_sensitivity_all_plexes.wdl +++ /dev/null @@ -1,182 +0,0 @@ -### This is the concordance part of the CRSP sensitivity validation - -#Conceptual Overview -# To measure sensitivity, we sequence a pool of 5, 10, or 20 normal samples. The pool contains a variety of allele fractions, -# like a real tumor sample. These normal samples were also sequenced individually as part of HapMap, so we have "truth" vcfs -# for them. We calculate sensitivity by comparing the calls to the truth data. For a variety of reasons we don't call -# against a matched normal. - - -#Workflow Steps -# 1. Restrict a huge HapMap wgs VCF to given lists of samples and intervals. -# 2. Annotate this VCF using a bam sequenced from a pool derived from the given samples -# 3. Run Mutect in tumor-only mode on this pooled bam. -# 4. Compare Mutect calls to the truth data and output a table of true positives and false negatives along with -# annotations from the truth VCF prepared in steps 1 and 2. - -# Here we implement these steps, except for the subsampling in step 1, for several replicate bams each of 5-plex, 10-plex, and 20-plex mixtures, -# e.g. 4 different 5-plex bams, 3 10-plex bams, and 7 20-plex bams. - -import "hapmap_sensitivity.wdl" as single_plex - -workflow HapmapSensitivityAllPlexes { - File? intervals - File ref_fasta - File ref_fai - File ref_dict - - Int max_depth - Array[Int] depth_bins - Int depth_bin_width - Int scatter_count - - File five_plex_bam_list - File ten_plex_bam_list - File twenty_plex_bam_list - - File? pon - File? pon_index - Boolean run_orientation_bias_filter - Array[String]? artifact_modes - File five_plex_preprocessed - File five_plex_preprocessed_idx - File ten_plex_preprocessed - File ten_plex_preprocessed_idx - File twenty_plex_preprocessed - File twenty_plex_preprocessed_idx - - String? m2_extra_args - String? m2_extra_filtering_args - File python_script - - File? gatk_override - - String gatk_docker - - call single_plex.HapmapSensitivity as FivePlex { - input: - gatk_override = gatk_override, - gatk_docker = gatk_docker, - intervals = intervals, - ref_fasta = ref_fasta, - ref_fai = ref_fai, - ref_dict = ref_dict, - max_depth = max_depth, - depth_bins = depth_bins, - depth_bin_width = depth_bin_width, - scatter_count = scatter_count, - bam_list = five_plex_bam_list, - pon = pon, - pon_index = pon_index, - run_orientation_bias_filter = run_orientation_bias_filter, - artifact_modes = artifact_modes, - preprocessed_hapmap = five_plex_preprocessed, - preprocessed_hapmap_idx = five_plex_preprocessed_idx, - m2_extra_args = m2_extra_args, - m2_extra_filtering_args = m2_extra_filtering_args, - prefix = "5plex", - python_script = python_script - } - - call single_plex.HapmapSensitivity as TenPlex { - input: - gatk_override = gatk_override, - gatk_docker = gatk_docker, - intervals = intervals, - ref_fasta = ref_fasta, - ref_fai = ref_fai, - ref_dict = ref_dict, - max_depth = max_depth, - depth_bins = depth_bins, - depth_bin_width = depth_bin_width, - scatter_count = scatter_count, - bam_list = ten_plex_bam_list, - pon = pon, - pon_index = pon_index, - run_orientation_bias_filter = run_orientation_bias_filter, - artifact_modes = artifact_modes, - preprocessed_hapmap = ten_plex_preprocessed, - preprocessed_hapmap_idx = ten_plex_preprocessed_idx, - m2_extra_args = m2_extra_args, - m2_extra_filtering_args = m2_extra_filtering_args, - prefix = "10plex", - python_script = python_script - } - - call single_plex.HapmapSensitivity as TwentyPlex { - input: - gatk_override = gatk_override, - gatk_docker = gatk_docker, - intervals = intervals, - ref_fasta = ref_fasta, - ref_fai = ref_fai, - ref_dict = ref_dict, - max_depth = max_depth, - depth_bins = depth_bins, - depth_bin_width = depth_bin_width, - scatter_count = scatter_count, - bam_list = twenty_plex_bam_list, - pon = pon, - pon_index = pon_index, - run_orientation_bias_filter = run_orientation_bias_filter, - artifact_modes = artifact_modes, - preprocessed_hapmap = twenty_plex_preprocessed, - preprocessed_hapmap_idx = 
twenty_plex_preprocessed_idx, - m2_extra_args = m2_extra_args, - m2_extra_filtering_args = m2_extra_filtering_args, - prefix = "20plex", - python_script = python_script - } - - Array[File] all_plex_sensitivity_tables = [FivePlex.raw_table, TenPlex.raw_table, TwentyPlex.raw_table] - - call single_plex.CombineTables as AllPlexTable { input: input_tables = all_plex_sensitivity_tables, prefix = "all_plex" } - - call single_plex.AnalyzeSensitivity as AllPlex { - input: input_table = AllPlexTable.table, python_script = python_script, prefix = "all_plex", depth_bins = depth_bins, depth_bin_width = depth_bin_width - } - - output { - File snp_table_5_plex = FivePlex.snp_table - File snp_plot_5_plex = FivePlex.snp_plot - File indel_table_5_plex = FivePlex.indel_table - File indel_plot_5_plex = FivePlex.indel_plot - File summary_5_plex = FivePlex.summary - File indel_jaccard_5_plex = FivePlex.indel_jaccard_table - File snp_jaccard_5_plex = FivePlex.snp_jaccard_table - Array[File] tpfn_5_plex = FivePlex.tpfn - Array[File] tpfn_idx_5_plex = FivePlex.tpfn_idx - Array[File] ftnfn_5_plex = FivePlex.ftnfn - Array[File] ftnfn_idx_5_plex = FivePlex.ftnfn_idx - Array[File] filter_analysis_5_plex = FivePlex.filter_analysis - File snp_table_10_plex = TenPlex.snp_table - File snp_plot_10_plex = TenPlex.snp_plot - File indel_table_10_plex = TenPlex.indel_table - File indel_plot_10_plex = TenPlex.indel_plot - File summary_10_plex = TenPlex.summary - File indel_jaccard_10_plex = TenPlex.indel_jaccard_table - File snp_jaccard_10_plex = TenPlex.snp_jaccard_table - Array[File] tpfn_10_plex = TenPlex.tpfn - Array[File] tpfn_idx_10_plex = TenPlex.tpfn_idx - Array[File] ftnfn_10_plex = TenPlex.ftnfn - Array[File] ftnfn_idx_10_plex = TenPlex.ftnfn_idx - Array[File] filter_analysis_10_plex = TenPlex.filter_analysis - File snp_table_20_plex = TwentyPlex.snp_table - File snp_plot_20_plex = TwentyPlex.snp_plot - File indel_table_20_plex = TwentyPlex.indel_table - File indel_plot_20_plex = TwentyPlex.indel_plot - File summary_20_plex = TwentyPlex.summary - File indel_jaccard_20_plex = TwentyPlex.indel_jaccard_table - File snp_jaccard_20_plex = TwentyPlex.snp_jaccard_table - Array[File] tpfn_20_plex = TwentyPlex.tpfn - Array[File] tpfn_idx_20_plex = TwentyPlex.tpfn_idx - Array[File] ftnfn_20_plex = TwentyPlex.ftnfn - Array[File] ftnfn_idx_20_plex = TwentyPlex.ftnfn_idx - Array[File] filter_analysis_20_plex = TwentyPlex.filter_analysis - - File snp_table_all_plex = AllPlex.snp_table - File snp_plot_all_plex = AllPlex.snp_plot - File indel_table_all_plex = AllPlex.indel_table - File indel_plot_all_plex = AllPlex.indel_plot - } -} diff --git a/scripts/mutect2_wdl/unsupported/m2_basic_validation.wdl b/scripts/mutect2_wdl/unsupported/m2_basic_validation.wdl deleted file mode 100644 index 1c91bea9bae..00000000000 --- a/scripts/mutect2_wdl/unsupported/m2_basic_validation.wdl +++ /dev/null @@ -1,344 +0,0 @@ -import "mutect2.wdl" as m2 - -# -# This workflow runs M2 on a set of bam quadruplets. For each quadruplet, the first pair of bams is the discovery -# tumor-normal pair that you wish to validate. The second pair is the validation tumor-normal pair, used only for -# validation. Typically, the validation pair would be separate replicates from a different sequencing process -# (e.g. matched WGS vs discovery exome bams). The validation actually checks the variants in the VCF of the discovery -# pair against the pileup information in the validation bams. -# -# All parameters *_file_list or *_file_index_list are files of filenames. Each filename is on a separate line. The -# order must be maintained for each of these file lists. In other words, each line in the eight input files must -# correspond to the same discovery and validation pairs. -#
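For illustration, each of these lists is just a text file with one path per line, and pairing is purely positional; a minimal sketch of what read_lines plus parallel indexing amounts to (file names are hypothetical):

    # tumor_bams.list and tumor_bais.list each hold one path per line, in matching order
    tumor_bams = open("tumor_bams.list").read().split()
    tumor_bais = open("tumor_bais.list").read().split()

    for bam, bai in zip(tumor_bams, tumor_bais):
        print(bam, bai)   # line i of every list must describe the same replicate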
-# This workflow is not recommended for use with RNA as validation, due to biases in RNA pileups. -# -# Attempts are made to reconcile the pileup validation with the M2 haplotype creation process. -# -# The output is a tar file of validation reports (tsv) for the discovery pairs. -# -workflow m2_validation { - #### M2 parameters - File? intervals - File ref_fasta - File ref_fai - File ref_dict - File tumor_bam - File tumor_bai - String tumor_sample_name - File? normal_bam - File? normal_bai - String? normal_sample_name - File? pon - File? pon_index - Int scatter_count - File? gnomad - File? gnomad_index - File? variants_for_contamination - File? variants_for_contamination_index - Boolean? run_orientation_bias_filter - Int? preemptible_attempts - Array[String]? artifact_modes - String? m2_extra_args - String? m2_extra_filtering_args - - File? gatk_override - - String gatk_docker - ##### - - ### parameter-fu - ### Discovery bams - File tumor_bam_file_list - Array[File] tumor_bam_files = read_lines(tumor_bam_file_list) - - File tumor_bam_file_index_list - Array[File] tumor_bam_indices = read_lines(tumor_bam_file_index_list) - - File normal_bam_file_list - Array[File] normal_bam_files = read_lines(normal_bam_file_list) - - File normal_bam_file_index_list - Array[File] normal_bam_indices = read_lines(normal_bam_file_index_list) - - ### validation bams - File validation_tumor_bam_file_list - Array[File] validation_tumor_bam_files = read_lines(validation_tumor_bam_file_list) - - File validation_tumor_bam_file_index_list - Array[File] validation_tumor_bam_indices = read_lines(validation_tumor_bam_file_index_list) - - File validation_normal_bam_file_list - Array[File] validation_normal_bam_files = read_lines(validation_normal_bam_file_list) - - File validation_normal_bam_file_index_list - Array[File] validation_normal_bam_indices = read_lines(validation_normal_bam_file_index_list) - ##### - - ### Validation parameters - # A name to identify this group of samples. This can be arbitrary, but should not contain special characters or whitespace. - String group_id - - # Only use reads with a minimum base quality for the base at the variant. - Int? 
base_quality_cutoff - - - scatter (i in range(length(tumor_bam_files))) { - call m2.Mutect2 as m2_tn { - input: - gatk_override = gatk_override, - gatk_docker = gatk_docker, - intervals = intervals, - ref_fasta = ref_fasta, - ref_fai = ref_fai, - ref_dict = ref_dict, - tumor_bam = tumor_bam_files[i], - tumor_bai = tumor_bam_indices[i], - normal_bam = normal_bam_files[i], - normal_bai = normal_bam_indices[i], - scatter_count = scatter_count, - pon = pon, - pon_index = pon_index, - gnomad = gnomad, - gnomad_index = gnomad_index, - run_orientation_bias_filter = run_orientation_bias_filter, - preemptible_attempts = preemptible_attempts, - artifact_modes = artifact_modes, - variants_for_contamination = variants_for_contamination, - variants_for_contamination_index = variants_for_contamination_index, - m2_extra_args = m2_extra_args, - m2_extra_filtering_args = m2_extra_filtering_args, - make_bamout = true - } - - call m2.Mutect2 as m2_validation_bamout { - input: - gatk_override = gatk_override, - gatk_docker = gatk_docker, - intervals = intervals, - ref_fasta = ref_fasta, - ref_fai = ref_fai, - ref_dict = ref_dict, - tumor_bam = validation_tumor_bam_files[i], - tumor_bai = validation_tumor_bam_indices[i], - normal_bam = validation_normal_bam_files[i], - normal_bai = validation_normal_bam_indices[i], - scatter_count = scatter_count, - pon = pon, - pon_index = pon_index, - gnomad = gnomad, - gnomad_index = gnomad_index, - run_orientation_bias_filter = run_orientation_bias_filter, - preemptible_attempts = preemptible_attempts, - artifact_modes = artifact_modes, - variants_for_contamination = variants_for_contamination, - variants_for_contamination_index = variants_for_contamination_index, - m2_extra_args = m2_extra_args, - m2_extra_filtering_args = m2_extra_filtering_args, - make_bamout = true - } - - # Delete the reads from the normal and HC sample from the bamout. 
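The rewrite_bam_by_sample task called below does this with PrintReads (SampleReadFilter) followed by a header rewrite; as a rough pysam rendering of the same idea (the pysam approach and the single-sample assumption are editorial, not what the workflow actually runs):

    import pysam

    def keep_sample_reads(in_bam, out_bam, sample):
        # Drop @RG header lines for other samples, then keep only reads in the surviving read groups
        src = pysam.AlignmentFile(in_bam, "rb")
        header = src.header.to_dict()
        header["RG"] = [rg for rg in header.get("RG", []) if rg.get("SM") == sample]
        keep = {rg["ID"] for rg in header["RG"]}
        with pysam.AlignmentFile(out_bam, "wb", header=header) as out:
            for read in src:
                if read.has_tag("RG") and read.get_tag("RG") in keep:
                    out.write(read)
        pysam.index(out_bam)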
- call rewrite_bam_by_sample as m2_rewrite_bam_by_sample { - input: - gatk_override = gatk_override, - gatk_docker = gatk_docker, - bam = m2_validation_bamout.bamout, - new_sample_name = m2_validation_bamout.tumor_bam_sample_name, - output_bam_basename = m2_validation_bamout.tumor_bam_sample_name - } - - call basic_validator as m2_basic_validator { - input: - gatk_override = gatk_override, - gatk_docker = gatk_docker, - call_intervals = m2_tn.filtered_vcf, - ref_fasta = ref_fasta, - ref_fai = ref_fai, - ref_dict = ref_dict, - discovery_tumor_sample_name = m2_tn.tumor_bam_sample_name, - discovery_normal_sample_name = m2_tn.normal_bam_sample_name, - validation_tumor_bam = m2_rewrite_bam_by_sample.sample_bam, - validation_tumor_bai = m2_rewrite_bam_by_sample.sample_bai, - validation_normal_bam = validation_normal_bam_files[i], - validation_normal_bai = validation_normal_bam_indices[i], - vcf_calls = m2_tn.filtered_vcf, - vcf_calls_idx = m2_tn.filtered_vcf_index, - entity_id = "m2_" + m2_tn.tumor_bam_sample_name, - base_quality_cutoff = base_quality_cutoff - } - - call m2.CollectSequencingArtifactMetrics as validation_normal_CollectSequencingArtifactMetrics { - input: - gatk_docker = gatk_docker, - ref_fasta = ref_fasta, - ref_fai = ref_fai, - preemptible_attempts = preemptible_attempts, - tumor_bam = validation_normal_bam_files[i], - tumor_bai = validation_normal_bam_indices[i] - } - } - - call tar_results as tar_results_m2 { - input: - to_be_tarred = m2_basic_validator.validation_output, - group_id = group_id - } - - output { - File m2_tar_file = tar_results_m2.tar_file - Array[File] validation_normal_pre_adapter_metrics = validation_normal_CollectSequencingArtifactMetrics.pre_adapter_metrics - } -} - -# Validation bams should *not* be RNA. -task basic_validator { - File? gatk_override - String gatk_docker - # Same calls as what is in the VCF - File call_intervals - - File ref_fasta - File ref_fai - File ref_dict - - String discovery_tumor_sample_name - String discovery_normal_sample_name - - # For validating M2 these should generally be the bamout, not the bam used for input. - File validation_tumor_bam - File validation_tumor_bai - File validation_normal_bam - File validation_normal_bai - - File vcf_calls - File vcf_calls_idx - - # Unique name for the entity. Only used for naming output files. - String entity_id - - Int? base_quality_cutoff - - # Runtime parameters - Int? mem - Int? preemptible_attempts - Int? disk_space_gb - - Int final_mem = select_first([mem, 7]) - - command <<< - set -e - GATK_JAR=${default="/root/gatk.jar" gatk_override} - - echo "Getting sample names...." - java -Xmx${final_mem-1}g -jar $GATK_JAR GetSampleName -I ${validation_normal_bam} -O validation_normal_name.txt - java -Xmx${final_mem-1}g -jar $GATK_JAR GetSampleName -I ${validation_tumor_bam} -O validation_tumor_name.txt - echo ${discovery_tumor_sample_name} - echo ${discovery_normal_sample_name} - echo "Sample names (for validation files):" - echo `cat validation_tumor_name.txt` - echo `cat validation_normal_name.txt` - echo "Running BasicValidatorWalker (incl. filtering info).... 
" - java -Xmx${final_mem-1}g -jar $GATK_JAR ValidateBasicSomaticShortMutations \ - -discv ${discovery_tumor_sample_name} \ - -V ${vcf_calls} \ - -I ${validation_tumor_bam} \ - -I ${validation_normal_bam} \ - -valcase `cat validation_tumor_name.txt` \ - -valcontrol `cat validation_normal_name.txt` \ - -O output_validation_${entity_id}.tsv \ - -R ${ref_fasta} \ - -bqcutoff ${default=20 base_quality_cutoff} - - >>> - - runtime { - docker: "${gatk_docker}" - memory: "${final_mem}" + " GB" - disks: "local-disk " + select_first([disk_space_gb, 100]) + " HDD" - preemptible: select_first([preemptible_attempts, 2]) - } - - output { - File validation_output = "output_validation_${entity_id}.tsv" - } -} - -task tar_results { - Array[File] to_be_tarred - String group_id - - # Needs to be an image with python installed, but defaults to a bash shell. - String basic_python_docker="broadinstitute/oncotator:1.9.3.0" - Int preemptible_attempts=2 - - command <<< - set -e - python <>> - - runtime { - docker: "${basic_python_docker}" - preemptible: "${preemptible_attempts}" - memory: "3 GB" - disks: "local-disk " + 50 + " HDD" - } - - output { - File tar_file = "${group_id}_results.tar.gz" - } -} - -task rewrite_bam_by_sample { - File? gatk_override - String gatk_docker - - # Also, removes samples not in the list from the header - String new_sample_name - File bam - String output_bam_basename - - # Runtime parameters - Int? mem - Int? preemptible_attempts - Int? disk_space_gb - Int final_mem = select_first([mem, 3]) - - command <<< - set -e - GATK_JAR=${default="/root/gatk.jar" gatk_override} - - java -Xmx${final_mem-1}g -jar $GATK_JAR PrintReads -I ${bam} -O ${output_bam_basename}.tmp.bam -RF SampleReadFilter -sample ${sep=" -sample " new_sample_name} - - samtools view -H ${output_bam_basename}.tmp.bam > tmpheader.txt - - egrep -v "^\@RG" tmpheader.txt > new_header.txt - egrep "^\@RG" tmpheader.txt | egrep "${new_sample_name}" >> new_header.txt - - java -Xmx${final_mem-1}g -jar $GATK_JAR ReplaceSamHeader --HEADER new_header.txt -I ${output_bam_basename}.tmp.bam -O ${output_bam_basename}.bam - samtools index ${output_bam_basename}.bam ${output_bam_basename}.bai - >>> - - runtime { - docker: "${gatk_docker}" - memory: select_first([mem, 3]) + " GB" - disks: "local-disk " + select_first([disk_space_gb, ceil(size(bam, "GB")) + 50]) + " HDD" - preemptible: select_first([preemptible_attempts, 2]) - } - - output { - File sample_bam = "${output_bam_basename}.bam" - File sample_bai = "${output_bam_basename}.bai" - } -} diff --git a/scripts/mutect2_wdl/unsupported/mutect2-replicate-validation.wdl b/scripts/mutect2_wdl/unsupported/mutect2-replicate-validation.wdl deleted file mode 100755 index b5d7a4e5f3a..00000000000 --- a/scripts/mutect2_wdl/unsupported/mutect2-replicate-validation.wdl +++ /dev/null @@ -1,133 +0,0 @@ -# call pairs of replicates as a tumor-normal pair, count the number of variants (i.e. false positives) -# and report false positive rates - -import "mutect2.wdl" as m2 - -workflow Mutect2ReplicateValidation { - File? intervals - File ref_fasta - File ref_fai - File ref_dict - Int scatter_count - # replicate_pair_list file is a tsv file with the following four columns in this order. - # tumor_bam, tumor_bai, normal_bam, normal_bai - File replicate_pair_list - Array[Array[String]] pairs = read_tsv(replicate_pair_list) - File? pon - File? pon_index - File? gnomad - File? gnomad_index - Boolean? run_orientation_bias_filter - Array[String]? artifact_modes - String? m2_extra_args - String? 
m2_extra_filtering_args - - File? gatk_override - - String gatk_docker - Int? preemptible_attempts - - scatter(pair in pairs) { - call m2.Mutect2 { - input: - intervals = intervals, - ref_fasta = ref_fasta, - ref_fai = ref_fai, - ref_dict = ref_dict, - tumor_bam = pair[0], - tumor_bai = pair[1], - normal_bam = pair[2], - normal_bai = pair[3], - pon = pon, - pon_index = pon_index, - scatter_count = scatter_count, - gnomad = gnomad, - gnomad_index = gnomad_index, - run_orientation_bias_filter = run_orientation_bias_filter, - preemptible_attempts = preemptible_attempts, - artifact_modes = artifact_modes, - m2_extra_args = m2_extra_args, - m2_extra_filtering_args = m2_extra_filtering_args, - gatk_override = gatk_override, - gatk_docker = gatk_docker - } - - call CountFalsePositives { - input: - intervals = intervals, - ref_fasta = ref_fasta, - ref_fai = ref_fai, - ref_dict = ref_dict, - filtered_vcf = Mutect2.filtered_vcf, - filtered_vcf_index = Mutect2.filtered_vcf_index, - gatk_override = gatk_override, - gatk_docker = gatk_docker - } - } - - call GatherTables { input: tables = CountFalsePositives.false_positive_counts } - - output { - File summary = GatherTables.summary - Array[File] false_positives_vcfs = Mutect2.filtered_vcf - Array[File] unfiltered_vcfs = Mutect2.unfiltered_vcf - } -} - -task GatherTables { - # we assume that each table consists of two lines: one header line and one record - Array[File] tables - - command { - # extract the header from one of the files - head -n 1 ${tables[0]} > summary.txt - - # then append the record from each table - for table in ${sep=" " tables}; do - tail -n +2 $table >> summary.txt - done - } - - runtime { - docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" - memory: "1 GB" - disks: "local-disk " + 100 + " HDD" - } - - output { - File summary = "summary.txt" - } -} - -task CountFalsePositives { - File? intervals - File ref_fasta - File ref_fai - File ref_dict - File filtered_vcf - File filtered_vcf_index - - File? gatk_override - - String gatk_docker - - command { - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - - gatk --java-options "-Xmx4g" CountFalsePositives \ - -V ${filtered_vcf} \ - -R ${ref_fasta} \ - ${"-L " + intervals} \ - -O false-positives.txt - } - - runtime { - docker: gatk_docker - memory: "5 GB" - disks: "local-disk " + 500 + " HDD" - } - - output { - File false_positive_counts = "false-positives.txt" - } -} diff --git a/scripts/mutect2_wdl/unsupported/mutect2_compare_tumors.wdl b/scripts/mutect2_wdl/unsupported/mutect2_compare_tumors.wdl deleted file mode 100644 index 45441db64f3..00000000000 --- a/scripts/mutect2_wdl/unsupported/mutect2_compare_tumors.wdl +++ /dev/null @@ -1,131 +0,0 @@ -# Given trios consisting of: a normal sample, a relatively reliable tumor sample, and a questionable tumor sample, -# run Mutect in tumor-normal mode on the "good" sample, then run it on the "bad" sample, and compare results. -# The idea is to see how well Mutect performs under difficult conditions. For example, the "good" sample might -# be from a solid tumor while the "bad" sample might be low-allele-fraction cfDNA. - -# Except for the different format of the input bams table, the inputs for this script are identical to those of -# mutect2_multi_sample.wdl -import "mutect2.wdl" as m2 - -workflow Mutect2Trio { - File? intervals - File ref_fasta - File ref_fai - File ref_dict - Int scatter_count - # trio_list file is a tsv file with the following six columns in this order. 
- # normal_bam, normal_bai, good_tumor_bam, good_tumor_bai, bad_tumor_bam, bad_tumor_bai - File trio_list - Array[Array[String]] trios = read_tsv(trio_list) - File? pon - File? pon_index - File? gnomad - File? gnomad_index - Boolean? run_orientation_bias_filter - Array[String]? artifact_modes - - File? gatk_override - - # runtime - String gatk_docker - Int? preemptible_attempts - - scatter(trio in trios) { - call m2.Mutect2 as GoodTumor { - input: - intervals = intervals, - ref_fasta = ref_fasta, - ref_fai = ref_fai, - ref_dict = ref_dict, - tumor_bam = trio[2], - tumor_bai = trio[3], - normal_bam = trio[0], - normal_bai = trio[1], - pon = pon, - pon_index = pon_index, - scatter_count = scatter_count, - gnomad = gnomad, - gnomad_index = gnomad_index, - run_orientation_bias_filter = run_orientation_bias_filter, - artifact_modes = artifact_modes, - gatk_override = gatk_override, - gatk_docker = gatk_docker, - preemptible_attempts = preemptible_attempts - } - - call m2.Mutect2 as BadTumor { - input: - intervals = intervals, - ref_fasta = ref_fasta, - ref_fai = ref_fai, - ref_dict = ref_dict, - tumor_bam = trio[4], - tumor_bai = trio[5], - normal_bam = trio[0], - normal_bai = trio[1], - pon = pon, - pon_index = pon_index, - scatter_count = scatter_count, - gnomad = gnomad, - gnomad_index = gnomad_index, - run_orientation_bias_filter = run_orientation_bias_filter, - artifact_modes = artifact_modes, - gatk_override = gatk_override, - gatk_docker = gatk_docker, - preemptible_attempts = preemptible_attempts - } - - call Concordance { - input: - intervals = intervals, - truth_vcf = GoodTumor.filtered_vcf, #note, no orientation bias since it's optional output - truth_vcf_idx = GoodTumor.filtered_vcf_index, - eval_vcf = BadTumor.filtered_vcf, - eval_vcf_idx = BadTumor.filtered_vcf_index, - gatk_override = gatk_override - } - } - - output { - Array[File] tpfn = Concordance.tpfn - Array[File] tpfn_idx = Concordance.tpfn_idx - Array[File] tpfp = Concordance.tpfp - Array[File] tpfp_idx = Concordance.tpfp_idx - Array[File] summary = Concordance.summary - } -} - -task Concordance { - File? intervals - File truth_vcf - File truth_vcf_idx - File eval_vcf - File eval_vcf_idx - - File? gatk_override - - String gatk_docker - - command { - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - - gatk --java-options "-Xmx2g" Concordance \ - ${"-L " + intervals} \ - -truth ${truth_vcf} -eval ${eval_vcf} -tpfn "true_positives_and_false_negatives.vcf" \ - -tpfp "true_positives_and_false_positives.vcf" \ - -summary summary.tsv - } - - runtime { - docker: gatk_docker - memory: "5 GB" - } - - output { - File tpfn = "true_positives_and_false_negatives.vcf" - File tpfn_idx = "true_positives_and_false_negatives.vcf.idx" - File tpfp = "true_positives_and_false_positives.vcf" - File tpfp_idx = "true_positives_and_false_positives.vcf.idx" - File summary = "summary.tsv" - } -} \ No newline at end of file diff --git a/scripts/mutect2_wdl/unsupported/mutect2_multi_sample_concordance.wdl b/scripts/mutect2_wdl/unsupported/mutect2_multi_sample_concordance.wdl deleted file mode 100644 index 45292b0c122..00000000000 --- a/scripts/mutect2_wdl/unsupported/mutect2_multi_sample_concordance.wdl +++ /dev/null @@ -1,129 +0,0 @@ -#Inputs are same as mutect2_multi_sample.wdl with the addition of a truth_list input, which is a tsv of the form -# truth1.vcf truth1.vcf.idx -# truth2.vcf truth2.vcf.idx -# . . . 
- # The rows of this file correspond to the rows of the pair_list input - -import "mutect2_multi_sample.wdl" as m2_multi - -workflow Mutect2_Multi_Concordance { - # inputs - File? intervals - File ref_fasta - File ref_fai - File ref_dict - Int scatter_count - File pair_list - File? pon - File? pon_index - File? gnomad - File? gnomad_index - File? variants_for_contamination - File? variants_for_contamination_index - Boolean? run_orientation_bias_filter - Array[String]? artifact_modes - String? m2_extra_args - String? m2_extra_filtering_args - File truth_list - Array[Array[String]] truth = read_tsv(truth_list) - - File? gatk_override - - # runtime - String gatk_docker - Int? preemptible_attempts - - call m2_multi.Mutect2_Multi { - input: - ref_fasta = ref_fasta, - ref_fai = ref_fai, - ref_dict = ref_dict, - scatter_count = scatter_count, - pair_list = pair_list, - intervals = intervals, - pon = pon, - pon_index = pon_index, - gnomad = gnomad, - gnomad_index = gnomad_index, - variants_for_contamination = variants_for_contamination, - variants_for_contamination_index = variants_for_contamination_index, - run_orientation_bias_filter = run_orientation_bias_filter, - preemptible_attempts = preemptible_attempts, - artifact_modes = artifact_modes, - m2_extra_args = m2_extra_args, - m2_extra_filtering_args = m2_extra_filtering_args, - gatk_override = gatk_override, - gatk_docker = gatk_docker - } - - scatter (n in range(length(truth))) { - call Concordance { - input: - intervals = intervals, - truth_vcf = truth[n][0], - truth_vcf_idx = truth[n][1], - eval_vcf = Mutect2_Multi.filtered_vcf[n], - eval_vcf_idx = Mutect2_Multi.filtered_vcf_idx[n], - preemptible_attempts = preemptible_attempts, - gatk_override = gatk_override, - gatk_docker = gatk_docker - } - } - - output { - Array[File] tpfn = Concordance.tpfn - Array[File] tpfn_idx = Concordance.tpfn_idx - Array[File] tpfp = Concordance.tpfp - Array[File] tpfp_idx = Concordance.tpfp_idx - Array[File] ftnfn = Concordance.ftnfn - Array[File] ftnfn_idx = Concordance.ftnfn_idx - Array[File] filter_analysis = Concordance.filter_analysis - Array[File] summary = Concordance.summary - } -} - -task Concordance { - # inputs - File? intervals - File truth_vcf - File truth_vcf_idx - File eval_vcf - File eval_vcf_idx - - File? gatk_override - - # runtime - String gatk_docker - Int? 
preemptible_attempts - - command { - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - - gatk --java-options "-Xmx2g" Concordance \ - ${"-L " + intervals} \ - -truth ${truth_vcf} -eval ${eval_vcf} \ - -tpfn "tpfn.vcf" \ - -tpfp "tpfp.vcf" \ - -ftnfn "ftnfn.vcf" \ - --filter-analysis "filter-analysis.txt" \ - -summary summary.tsv - } - - runtime { - memory: "5 GB" - docker: "${gatk_docker}" - disks: "local-disk " + 100 + " HDD" - preemptible: select_first([preemptible_attempts, 2]) - } - - output { - File tpfn = "tpfn.vcf" - File tpfn_idx = "tpfn.vcf.idx" - File tpfp = "tpfp.vcf" - File tpfp_idx = "tpfp.vcf.idx" - File ftnfn = "ftnfn.vcf" - File ftnfn_idx = "ftnfn.vcf.idx" - File filter_analysis = "filter-analysis.txt" - File summary = "summary.tsv" - } -} \ No newline at end of file diff --git a/scripts/mutect2_wdl/unsupported/mutect2_opt.wdl b/scripts/mutect2_wdl/unsupported/mutect2_opt.wdl deleted file mode 100755 index db233b01450..00000000000 --- a/scripts/mutect2_wdl/unsupported/mutect2_opt.wdl +++ /dev/null @@ -1,996 +0,0 @@ -## Copyright Broad Institute, 2017 -## -## This WDL workflow runs GATK4 Mutect 2 on a single tumor-normal pair or on a single tumor sample, -## and performs additional filtering and functional annotation tasks. -## -## Main requirements/expectations : -## - One analysis-ready BAM file (and its index) for each sample -## -## Description of inputs: -## -## ** Runtime ** -## gatk_docker, oncotator_docker: docker images to use for GATK 4 Mutect2 and for Oncotator -## preemptible_attempts: how many preemptions to tolerate before switching to a non-preemptible machine (on Google) -## gatk_override: (optional) local file or Google bucket path to a GATK 4 java jar file to be used instead of the GATK 4 jar -## in the docker image. This must be supplied when running in an environment that does not support docker -## (e.g. SGE cluster on a Broad on-prem VM) -## -## ** Workflow options ** -## intervals: genomic intervals (will be used for scatter) -## scatter_count: number of parallel jobs to generate when scattering over intervals -## artifact_modes: types of artifacts to consider in the orientation bias filter (optional) -## m2_extra_args, m2_extra_filtering_args: additional arguments for Mutect2 calling and filtering (optional) -## split_intervals_extra_args: additional arguments for splitting intervals before scattering (optional) -## run_orientation_bias_filter: if true, run the orientation bias filter post-processing step (optional, false by default) -## run_oncotator: if true, annotate the M2 VCFs using oncotator (to produce a TCGA MAF). Important: This requires a -## docker image and should not be run in environments where docker is unavailable (e.g. SGE cluster on -## a Broad on-prem VM). Access to docker hub is also required, since the task downloads a public docker image. 
-## (optional, false by default) -## -## ** Primary inputs ** -## ref_fasta, ref_fai, ref_dict: reference genome, index, and dictionary -## tumor_bam, tumor_bam_index: BAM and index for the tumor sample -## normal_bam, normal_bam_index: BAM and index for the normal sample -## -## ** Primary resources ** (optional but strongly recommended) -## pon, pon_index: optional panel of normals in VCF format containing probable technical artifacts (false positives) -## gnomad, gnomad_index: optional database of known germline variants (see http://gnomad.broadinstitute.org/downloads) -## variants_for_contamination, variants_for_contamination_index: VCF of common variants with allele frequencies for calculating contamination -## -## ** Secondary resources ** (for optional tasks) -## onco_ds_tar_gz, default_config_file: Oncotator datasources and config file -## sequencing_center, sequence_source: metadata for Oncotator -## -## Outputs : -## - One VCF file and its index with primary filtering applied; secondary filtering and functional annotation if requested; a bamout.bam -## file of reassembled reads if requested -## -## Cromwell version support -## - Successfully tested on v29 -## -## LICENSING : -## This script is released under the WDL source code license (BSD-3) (see LICENSE in -## https://github.com/broadinstitute/wdl). Note however that the programs it calls may -## be subject to different licenses. Users are responsible for checking that they are -## authorized to run all programs before running this script. Please see the docker -## pages at https://hub.docker.com/r/broadinstitute/* for detailed licensing information -## pertaining to the included programs. -workflow Mutect2 { - # Mutect2 inputs - File? intervals - File ref_fasta - File ref_fai - File ref_dict - File tumor_bam - File tumor_bai - File? normal_bam - File? normal_bai - File? pon - File? pon_index - Int scatter_count - File? gnomad - File? gnomad_index - File? variants_for_contamination - File? variants_for_contamination_index - Boolean? run_orientation_bias_filter - Boolean run_ob_filter = select_first([run_orientation_bias_filter, false]) - Array[String]? artifact_modes - File? tumor_sequencing_artifact_metrics - String? m2_extra_args - String? m2_extra_filtering_args - String? split_intervals_extra_args - Boolean? make_bamout - Boolean make_bamout_or_default = select_first([make_bamout, false]) - Boolean? compress_vcfs - Boolean compress = select_first([compress_vcfs, false]) - - # oncotator inputs - Boolean? run_oncotator - Boolean run_oncotator_or_default = select_first([run_oncotator, false]) - File? onco_ds_tar_gz - String? onco_ds_local_db_dir - String? sequencing_center - String? sequence_source - File? default_config_file - - # funcotator inputs - Boolean? run_funcotator - Boolean run_funcotator_or_default = select_first([run_funcotator, false]) - String? reference_version - String? data_sources_tar_gz - String? transcript_selection_mode - Array[String]? transcript_selection_list - Array[String]? annotation_defaults - Array[String]? annotation_overrides - - File? gatk_override - - # runtime - String gatk_docker - String basic_bash_docker = "ubuntu:16.04" - String? oncotator_docker - String oncotator_docker_or_default = select_first([oncotator_docker, "broadinstitute/oncotator:1.9.6.1"]) - Int? preemptible_attempts - - # Do not populate unless you know what you are doing... - File? auth - - # Use as a last resort to increase the disk given to every task in case of ill-behaving data - Int? 
emergency_extra_disk - - # Disk sizes used for dynamic sizing - Int ref_size = ceil(size(ref_fasta, "GB") + size(ref_dict, "GB") + size(ref_fai, "GB")) - Int tumor_bam_size = ceil(size(tumor_bam, "GB") + size(tumor_bai, "GB")) - Int gnomad_vcf_size = if defined(gnomad) then ceil(size(gnomad, "GB") + size(gnomad_index, "GB")) else 0 - Int normal_bam_size = if defined(normal_bam) then ceil(size(normal_bam, "GB") + size(normal_bai, "GB")) else 0 - - # If no tar is provided, the task downloads one from the Broad's FTP server - Int onco_tar_size = if defined(onco_ds_tar_gz) then ceil(size(onco_ds_tar_gz, "GB") * 3) else 100 - Int gatk_override_size = if defined(gatk_override) then ceil(size(gatk_override, "GB")) else 0 - - # This is added to every task as padding; increase it if you systematically need more disk for every call - Int disk_pad = 10 + gatk_override_size + select_first([emergency_extra_disk,0]) - - # These are multipliers applied to input sizes to make sure we have enough disk to accommodate possible output sizes - # Large is for Bams/WGS vcfs - # Small is for metrics/other vcfs - Float large_input_to_output_multiplier = 2.25 - Float small_input_to_output_multiplier = 2.0 - - # logic about output file names -- these are the names *without* .vcf extensions - String output_basename = basename(tumor_bam, ".bam") - String unfiltered_name = output_basename + "-unfiltered" - String filtered_name = output_basename + "-filtered" - String funcotated_name = output_basename + "-funcotated" - - String output_vcf_name = basename(tumor_bam, ".bam") + ".vcf" - - # Size M2 differently based on if we are using NIO or not - Int m2_output_size = tumor_bam_size / scatter_count - Int m2_nio_disk_size = ((tumor_bam_size + normal_bam_size) / scatter_count) + ref_size + (gnomad_vcf_size / scatter_count) + m2_output_size + disk_pad - Int m2_no_nio_disk_size = tumor_bam_size + normal_bam_size + ref_size + gnomad_vcf_size + m2_output_size + disk_pad - Int m2_per_scatter_size = if defined(auth) then m2_nio_disk_size else m2_no_nio_disk_size - - call SplitIntervals { - input: - intervals = intervals, - ref_fasta = ref_fasta, - ref_fai = ref_fai, - ref_dict = ref_dict, - scatter_count = scatter_count, - split_intervals_extra_args = split_intervals_extra_args, - gatk_override = gatk_override, - gatk_docker = gatk_docker, - preemptible_attempts = preemptible_attempts, - disk_space = ref_size + ceil(size(intervals, "GB") * small_input_to_output_multiplier) + disk_pad - } - - scatter (subintervals in SplitIntervals.interval_files ) { - call M2 { - input: - intervals = subintervals, - ref_fasta = ref_fasta, - ref_fai = ref_fai, - ref_dict = ref_dict, - tumor_bam = tumor_bam, - tumor_bai = tumor_bai, - normal_bam = normal_bam, - normal_bai = normal_bai, - pon = pon, - pon_index = pon_index, - gnomad = gnomad, - gnomad_index = gnomad_index, - preemptible_attempts = preemptible_attempts, - m2_extra_args = m2_extra_args, - make_bamout = make_bamout_or_default, - compress = compress, - gatk_override = gatk_override, - gatk_docker = gatk_docker, - disk_space = m2_per_scatter_size, - auth = auth - } - - Float sub_vcf_size = size(M2.unfiltered_vcf, "GB") - Float sub_bamout_size = size(M2.output_bamOut, "GB") - } - - call SumFloats as SumSubVcfs { - input: - sizes = sub_vcf_size, - preemptible_attempts = preemptible_attempts - } - - call MergeVCFs { - input: - input_vcfs = M2.unfiltered_vcf, - input_vcf_indices = M2.unfiltered_vcf_index, - output_name = unfiltered_name, - 
compress = compress, - gatk_override = gatk_override, - gatk_docker = gatk_docker, - preemptible_attempts = preemptible_attempts, - disk_space = ceil(SumSubVcfs.total_size * large_input_to_output_multiplier) + disk_pad - } - - if (make_bamout_or_default) { - call SumFloats as SumSubBamouts { - input: - sizes = sub_bamout_size, - preemptible_attempts = preemptible_attempts - } - - call MergeBamOuts { - input: - ref_fasta = ref_fasta, - ref_fai = ref_fai, - ref_dict = ref_dict, - bam_outs = M2.output_bamOut, - output_vcf_name = basename(MergeVCFs.merged_vcf, ".vcf"), - gatk_override = gatk_override, - gatk_docker = gatk_docker, - disk_space = ceil(SumSubBamouts.total_size * large_input_to_output_multiplier) + disk_pad - } - } - - if (run_ob_filter && !defined(tumor_sequencing_artifact_metrics)) { - call CollectSequencingArtifactMetrics { - input: - gatk_docker = gatk_docker, - ref_fasta = ref_fasta, - ref_fai = ref_fai, - preemptible_attempts = preemptible_attempts, - tumor_bam = tumor_bam, - tumor_bai = tumor_bai, - gatk_override = gatk_override, - disk_space = tumor_bam_size + ref_size + disk_pad - } - } - - if (defined(variants_for_contamination)) { - call CalculateContamination { - input: - gatk_override = gatk_override, - intervals = intervals, - ref_fasta = ref_fasta, - ref_fai = ref_fai, - ref_dict = ref_dict, - preemptible_attempts = preemptible_attempts, - gatk_docker = gatk_docker, - tumor_bam = tumor_bam, - tumor_bai = tumor_bai, - normal_bam = normal_bam, - normal_bai = normal_bai, - variants_for_contamination = variants_for_contamination, - variants_for_contamination_index = variants_for_contamination_index, - disk_space = tumor_bam_size + normal_bam_size + ceil(size(variants_for_contamination, "GB") * small_input_to_output_multiplier) + disk_pad, - auth = auth - } - } - - call Filter { - input: - gatk_override = gatk_override, - gatk_docker = gatk_docker, - intervals = intervals, - unfiltered_vcf = MergeVCFs.merged_vcf, - unfiltered_vcf_index = MergeVCFs.merged_vcf_index, - output_name = filtered_name, - compress = compress, - preemptible_attempts = preemptible_attempts, - contamination_table = CalculateContamination.contamination_table, - m2_extra_filtering_args = m2_extra_filtering_args, - disk_space = ceil(size(MergeVCFs.merged_vcf, "GB") * small_input_to_output_multiplier) + disk_pad, - auth = auth - } - - if (run_ob_filter) { - # Get the metrics either from the workflow input or CollectSequencingArtifactMetrics if no workflow input is provided - File input_artifact_metrics = select_first([tumor_sequencing_artifact_metrics, CollectSequencingArtifactMetrics.pre_adapter_metrics]) - - call FilterByOrientationBias { - input: - gatk_override = gatk_override, - input_vcf = Filter.filtered_vcf, - input_vcf_index = Filter.filtered_vcf_index, - output_name = filtered_name, - compress = compress, - gatk_docker = gatk_docker, - preemptible_attempts = preemptible_attempts, - pre_adapter_metrics = input_artifact_metrics, - artifact_modes = artifact_modes, - disk_space = ceil(size(Filter.filtered_vcf, "GB") * small_input_to_output_multiplier) + ceil(size(input_artifact_metrics, "GB")) + disk_pad, - auth = auth - } - } - - if (run_oncotator_or_default) { - File oncotate_vcf_input = select_first([FilterByOrientationBias.filtered_vcf, Filter.filtered_vcf]) - call oncotate_m2 { - input: - m2_vcf = oncotate_vcf_input, - onco_ds_tar_gz = onco_ds_tar_gz, - onco_ds_local_db_dir = onco_ds_local_db_dir, - sequencing_center = sequencing_center, - sequence_source = sequence_source, - 
-    if (run_oncotator_or_default) {
-        File oncotate_vcf_input = select_first([FilterByOrientationBias.filtered_vcf, Filter.filtered_vcf])
-        call oncotate_m2 {
-            input:
-                m2_vcf = oncotate_vcf_input,
-                onco_ds_tar_gz = onco_ds_tar_gz,
-                onco_ds_local_db_dir = onco_ds_local_db_dir,
-                sequencing_center = sequencing_center,
-                sequence_source = sequence_source,
-                default_config_file = default_config_file,
-                case_id = M2.tumor_sample[0],
-                control_id = M2.normal_sample[0],
-                oncotator_docker = oncotator_docker_or_default,
-                preemptible_attempts = preemptible_attempts,
-                disk_space = ceil(size(oncotate_vcf_input, "GB") * large_input_to_output_multiplier) + onco_tar_size + disk_pad
-        }
-    }
-
-    if (run_funcotator_or_default) {
-        File funcotate_vcf_input = select_first([FilterByOrientationBias.filtered_vcf, Filter.filtered_vcf])
-        File funcotate_vcf_input_index = select_first([FilterByOrientationBias.filtered_vcf_index, Filter.filtered_vcf_index])
-        call Funcotate {
-            input:
-                m2_vcf = funcotate_vcf_input,
-                m2_vcf_index = funcotate_vcf_input_index,
-                ref_fasta = ref_fasta,
-                ref_fai = ref_fai,
-                ref_dict = ref_dict,
-                reference_version = select_first([reference_version, "NO_REFERENCE_VERSION_GIVEN"]),
-                output_name = funcotated_name,
-                compress = compress,
-                data_sources_tar_gz = data_sources_tar_gz,
-                transcript_selection_mode = transcript_selection_mode,
-                transcript_selection_list = transcript_selection_list,
-                annotation_defaults = annotation_defaults,
-                annotation_overrides = annotation_overrides,
-                gatk_docker = gatk_docker,
-                gatk_override = gatk_override
-        }
-    }
-
-    output {
-        File unfiltered_vcf = MergeVCFs.merged_vcf
-        File unfiltered_vcf_index = MergeVCFs.merged_vcf_index
-        File filtered_vcf = select_first([FilterByOrientationBias.filtered_vcf, Filter.filtered_vcf])
-        File filtered_vcf_index = select_first([FilterByOrientationBias.filtered_vcf_index, Filter.filtered_vcf_index])
-        File? contamination_table = CalculateContamination.contamination_table
-
-        File? oncotated_m2_maf = oncotate_m2.oncotated_m2_maf
-        File? funcotated_vcf = Funcotate.funcotated_vcf
-        File? funcotated_vcf_index = Funcotate.funcotated_vcf_index
-        File? preadapter_detail_metrics = CollectSequencingArtifactMetrics.pre_adapter_metrics
-        File? bamout = MergeBamOuts.merged_bam_out
-        File? bamout_index = MergeBamOuts.merged_bam_out_index
-    }
-}
-
-task SplitIntervals {
-    # inputs
-    File? intervals
-    File ref_fasta
-    File ref_fai
-    File ref_dict
-    Int scatter_count
-    String? split_intervals_extra_args
-
-    File? gatk_override
-
-    # runtime
-    String gatk_docker
-    Int? mem
-    Int? preemptible_attempts
-    Int? disk_space
-    Int? cpu
-    Boolean use_ssd = false
-
-    # Mem is in units of GB but our command and memory runtime values are in MB
-    Int machine_mem = if defined(mem) then mem * 1000 else 3500
-    Int command_mem = machine_mem - 500
-
-    command {
-        set -e
-        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}
-
-        mkdir interval-files
-        gatk --java-options "-Xmx${command_mem}m" SplitIntervals \
-            -R ${ref_fasta} \
-            ${"-L " + intervals} \
-            -scatter ${scatter_count} \
-            -O interval-files \
-            ${split_intervals_extra_args}
-        cp interval-files/*.intervals .
-    }
-
-    runtime {
-        docker: gatk_docker
-        memory: machine_mem + " MB"
-        disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD"
-        preemptible: select_first([preemptible_attempts, 10])
-        cpu: select_first([cpu, 1])
-    }
-
-    output {
-        Array[File] interval_files = glob("*.intervals")
-    }
-}
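
# Sketch of what the task above produces (the shard file names depend on the GATK
# version's SplitIntervals naming scheme; these names are an assumption, not guaranteed):
# with scatter_count = 4, the glob("*.intervals") should pick up something like
#   0000-scattered.intervals
#   0001-scattered.intervals
#   0002-scattered.intervals
#   0003-scattered.intervals
# which the workflow then scatters over, one M2 shard per file.
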
-task M2 {
-    # inputs
-    File? intervals
-    File ref_fasta
-    File ref_fai
-    File ref_dict
-    File tumor_bam
-    File tumor_bai
-    File? normal_bam
-    File? normal_bai
-    File? pon
-    File? pon_index
-    File? gnomad
-    File? gnomad_index
-    String? m2_extra_args
-    Boolean? make_bamout
-    Boolean compress
-
-    String output_vcf = "output" + if compress then ".vcf.gz" else ".vcf"
-    String output_vcf_index = output_vcf + if compress then ".tbi" else ".idx"
-
-    File? gatk_override
-
-    # runtime
-    String gatk_docker
-    Int? mem
-    Int? preemptible_attempts
-    Int? disk_space
-    Int? cpu
-    Boolean use_ssd = false
-
-    # Mem is in units of GB but our command and memory runtime values are in MB
-    Int machine_mem = if defined(mem) then mem * 1000 else 3500
-    Int command_mem = machine_mem - 500
-
-    # Do not populate this unless you know what you are doing...
-    File? auth
-
-    command <<<
-        set -e
-
-        if [[ "${auth}" == *.json ]]; then
-            gsutil cp ${auth} /root/.config/gcloud/application_default_credentials.json
-            export GOOGLE_APPLICATION_CREDENTIALS=/root/.config/gcloud/application_default_credentials.json
-        fi
-
-        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}
-
-        # We need to create these files regardless, even if they stay empty
-        touch bamout.bam
-        echo "" > normal_name.txt
-
-        gatk --java-options "-Xmx${command_mem}m" GetSampleName -R ${ref_fasta} -I ${tumor_bam} -O tumor_name.txt -encode
-        tumor_command_line="-I ${tumor_bam} -tumor `cat tumor_name.txt`"
-
-        normal_command_line=""  # stays empty unless a normal BAM is provided
-        if [[ -f "${normal_bam}" ]]; then
-            gatk --java-options "-Xmx${command_mem}m" GetSampleName -R ${ref_fasta} -I ${normal_bam} -O normal_name.txt -encode
-            normal_command_line="-I ${normal_bam} -normal `cat normal_name.txt`"
-        fi
-
-        gatk --java-options "-Xmx${command_mem}m" Mutect2 \
-            -R ${ref_fasta} \
-            $tumor_command_line \
-            $normal_command_line \
-            ${"--germline-resource " + gnomad} \
-            ${"-pon " + pon} \
-            ${"-L " + intervals} \
-            -O "${output_vcf}" \
-            ${true='--bam-output bamout.bam' false='' make_bamout} \
-            ${m2_extra_args}
-    >>>
-
-    runtime {
-        docker: gatk_docker
-        memory: machine_mem + " MB"
-        disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD"
-        preemptible: select_first([preemptible_attempts, 10])
-        cpu: select_first([cpu, 1])
-    }
-
-    output {
-        File unfiltered_vcf = "${output_vcf}"
-        File unfiltered_vcf_index = "${output_vcf_index}"
-        File output_bamOut = "bamout.bam"
-        String tumor_sample = read_string("tumor_name.txt")
-        String normal_sample = read_string("normal_name.txt")
-    }
-}
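
# Illustrative use of m2_extra_args for the task above (hypothetical flag values; verify
# that your GATK build's Mutect2 accepts these arguments before relying on them):
# "Mutect2.m2_extra_args": "--downsampling-stride 20 --max-reads-per-alignment-start 6"
# The string is appended verbatim to the Mutect2 command line, so any valid Mutect2
# arguments can be threaded through without editing the task.
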
-task MergeVCFs {
-    # inputs
-    Array[File] input_vcfs
-    Array[File] input_vcf_indices
-    String output_name
-    Boolean compress
-    String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf"
-    String output_vcf_index = output_vcf + if compress then ".tbi" else ".idx"
-
-    File? gatk_override
-
-    # runtime
-    String gatk_docker
-    Int? mem
-    Int? preemptible_attempts
-    Int? disk_space
-    Int? cpu
-    Boolean use_ssd = false
-
-    # Mem is in units of GB but our command and memory runtime values are in MB
-    Int machine_mem = if defined(mem) then mem * 1000 else 3500
-    Int command_mem = machine_mem - 500
-
-    # using MergeVcfs instead of GatherVcfs so we can create indices
-    # WARNING 2015-10-28 15:01:48 GatherVcfs Index creation not currently supported when gathering block compressed VCFs.
-    command {
-        set -e
-        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}
-        gatk --java-options "-Xmx${command_mem}m" MergeVcfs -I ${sep=' -I ' input_vcfs} -O ${output_vcf}
-    }
-
-    runtime {
-        docker: gatk_docker
-        memory: machine_mem + " MB"
-        disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD"
-        preemptible: select_first([preemptible_attempts, 10])
-        cpu: select_first([cpu, 1])
-    }
-
-    output {
-        File merged_vcf = "${output_vcf}"
-        File merged_vcf_index = "${output_vcf_index}"
-    }
-}
-
-task MergeBamOuts {
-    # inputs
-    File ref_fasta
-    File ref_fai
-    File ref_dict
-    Array[File]+ bam_outs
-    String output_vcf_name
-
-    File? gatk_override
-
-    # runtime
-    String gatk_docker
-    Int? mem
-    Int? preemptible_attempts
-    Int? disk_space
-    Int? cpu
-    Boolean use_ssd = false
-
-    # Mem is in units of GB but our command and memory runtime values are in MB
-    Int machine_mem = if defined(mem) then mem * 1000 else 7000
-    Int command_mem = machine_mem - 500
-
-    command <<<
-        # This command block assumes that there is at least one file in bam_outs.
-        # Do not call this task if len(bam_outs) == 0
-        set -e
-        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}
-        gatk --java-options "-Xmx${command_mem}m" GatherBamFiles \
-            -I ${sep=" -I " bam_outs} -O ${output_vcf_name}.out.bam -R ${ref_fasta}
-        samtools index ${output_vcf_name}.out.bam ${output_vcf_name}.out.bam.bai
-    >>>
-
-    runtime {
-        docker: gatk_docker
-        memory: machine_mem + " MB"
-        disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD"
-        preemptible: select_first([preemptible_attempts, 10])
-        cpu: select_first([cpu, 1])
-    }
-
-    output {
-        File merged_bam_out = "${output_vcf_name}.out.bam"
-        File merged_bam_out_index = "${output_vcf_name}.out.bam.bai"
-    }
-}
-
-task CollectSequencingArtifactMetrics {
-    # inputs
-    File ref_fasta
-    File ref_fai
-    File tumor_bam
-    File tumor_bai
-
-    File? gatk_override
-
-    # runtime
-    String gatk_docker
-    Int? mem
-    Int? preemptible_attempts
-    Int? disk_space
-    Int? cpu
-    Boolean use_ssd = false
-
-    # Mem is in units of GB but our command and memory runtime values are in MB
-    Int machine_mem = if defined(mem) then mem * 1000 else 7000
-    Int command_mem = machine_mem - 500
-
-    command {
-        set -e
-        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}
-        gatk --java-options "-Xmx${command_mem}m" CollectSequencingArtifactMetrics \
-            -I ${tumor_bam} -O "gatk" -R ${ref_fasta} -VALIDATION_STRINGENCY LENIENT
-    }
-
-    runtime {
-        docker: gatk_docker
-        memory: machine_mem + " MB"
-        disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD"
-        preemptible: select_first([preemptible_attempts, 10])
-        cpu: select_first([cpu, 1])
-    }
-
-    output {
-        File pre_adapter_metrics = "gatk.pre_adapter_detail_metrics"
-    }
-}
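
# Sketch of the files the task above leaves in its working directory: the "-O gatk"
# argument is a basename, and CollectSequencingArtifactMetrics typically writes a family
# of Picard-style metrics files around it (exact names may vary by version; only the
# first is consumed downstream):
#   gatk.pre_adapter_detail_metrics   <- used by FilterByOrientationBias
#   gatk.pre_adapter_summary_metrics
#   gatk.bait_bias_detail_metrics
#   gatk.bait_bias_summary_metrics
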
-task CalculateContamination {
-    # inputs
-    File? intervals
-    File ref_fasta
-    File ref_fai
-    File ref_dict
-    File tumor_bam
-    File tumor_bai
-    File? normal_bam
-    File? normal_bai
-    File? variants_for_contamination
-    File? variants_for_contamination_index
-
-    File? gatk_override
-
-    # runtime
-    Int? preemptible_attempts
-    String gatk_docker
-    Int? disk_space
-    Int? mem
-
-    # Mem is in units of GB but our command and memory runtime values are in MB
-    Int machine_mem = if defined(mem) then mem * 1000 else 7000
-    Int command_mem = machine_mem - 500
-
-    # Do not populate this unless you know what you are doing...
-    File? auth
-
-    command {
-        set -e
-
-        if [[ "${auth}" == *.json ]]; then
-            gsutil cp ${auth} /root/.config/gcloud/application_default_credentials.json
-            export GOOGLE_APPLICATION_CREDENTIALS=/root/.config/gcloud/application_default_credentials.json
-        fi
-
-        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}
-
-        NORMAL_CMD=""  # stays empty unless a matched normal is provided
-        if [[ -f "${normal_bam}" ]]; then
-            gatk --java-options "-Xmx${command_mem}m" GetPileupSummaries -I ${normal_bam} ${"-L " + intervals} -V ${variants_for_contamination} -O normal_pileups.table
-            NORMAL_CMD="-matched normal_pileups.table"
-        fi
-
-        gatk --java-options "-Xmx${command_mem}m" GetPileupSummaries -R ${ref_fasta} -I ${tumor_bam} ${"-L " + intervals} -V ${variants_for_contamination} -O pileups.table
-        gatk --java-options "-Xmx${command_mem}m" CalculateContamination -I pileups.table -O contamination.table $NORMAL_CMD
-    }
-
-    runtime {
-        docker: gatk_docker
-        memory: machine_mem + " MB"
-        disks: "local-disk " + select_first([disk_space, 100]) + " HDD"
-        preemptible: select_first([preemptible_attempts, 10])
-    }
-
-    output {
-        File pileups = "pileups.table"
-        File contamination_table = "contamination.table"
-    }
-}
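
# Sketch of a contamination.table as produced above and consumed downstream by
# FilterMutectCalls via --contamination-table (hypothetical numbers, and the exact
# column layout varies by GATK version -- inspect a real output before parsing it):
#   sample         contamination   error
#   TUMOR_SAMPLE   0.011           0.0008
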
-task Filter {
-    # inputs
-    File? intervals
-    File unfiltered_vcf
-    File unfiltered_vcf_index
-    String output_name
-    Boolean compress
-    String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf"
-    String output_vcf_index = output_vcf + if compress then ".tbi" else ".idx"
-    File? contamination_table
-    String? m2_extra_filtering_args
-
-    File? gatk_override
-
-    # runtime
-    String gatk_docker
-    Int? mem
-    Int? preemptible_attempts
-    Int? disk_space
-    Int? cpu
-    Boolean use_ssd = false
-
-    # Mem is in units of GB but our command and memory runtime values are in MB
-    Int machine_mem = if defined(mem) then mem * 1000 else 7000
-    Int command_mem = machine_mem - 500
-
-    # Do not populate this unless you know what you are doing...
-    File? auth
-
-    command {
-        set -e
-
-        if [[ "${auth}" == *.json ]]; then
-            gsutil cp ${auth} /root/.config/gcloud/application_default_credentials.json
-            export GOOGLE_APPLICATION_CREDENTIALS=/root/.config/gcloud/application_default_credentials.json
-        fi
-
-        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}
-
-        gatk --java-options "-Xmx${command_mem}m" FilterMutectCalls -V ${unfiltered_vcf} \
-            -O ${output_vcf} \
-            ${"--contamination-table " + contamination_table} \
-            ${m2_extra_filtering_args}
-    }
-
-    runtime {
-        docker: gatk_docker
-        memory: machine_mem + " MB"
-        disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD"
-        preemptible: select_first([preemptible_attempts, 10])
-        cpu: select_first([cpu, 1])
-    }
-
-    output {
-        File filtered_vcf = "${output_vcf}"
-        File filtered_vcf_index = "${output_vcf_index}"
-    }
-}
-
-task FilterByOrientationBias {
-    # inputs
-    File? gatk_override
-    File input_vcf
-    File input_vcf_index
-    String output_name
-    Boolean compress
-    String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf"
-    String output_vcf_index = output_vcf + if compress then ".tbi" else ".idx"
-    File pre_adapter_metrics
-    Array[String]? artifact_modes
-
-    # runtime
-    Int? preemptible_attempts
-    String gatk_docker
-    Int? disk_space
-    Int? mem
-    Int? cpu
-    Boolean use_ssd = false
-
-    # Mem is in units of GB but our command and memory runtime values are in MB
-    Int machine_mem = if defined(mem) then mem * 1000 else 7000
-    Int command_mem = machine_mem - 500
-
-    # Do not populate this unless you know what you are doing...
-    File? auth
-
-    command {
-        set -e
-
-        if [[ "${auth}" == *.json ]]; then
-            gsutil cp ${auth} /root/.config/gcloud/application_default_credentials.json
-            export GOOGLE_APPLICATION_CREDENTIALS=/root/.config/gcloud/application_default_credentials.json
-        fi
-
-        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}
-
-        gatk --java-options "-Xmx${command_mem}m" FilterByOrientationBias \
-            -V ${input_vcf} \
-            -AM ${sep=" -AM " artifact_modes} \
-            -P ${pre_adapter_metrics} \
-            -O ${output_vcf}
-    }
-
-    runtime {
-        docker: gatk_docker
-        memory: machine_mem + " MB"
-        disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD"
-        preemptible: select_first([preemptible_attempts, 10])
-        cpu: select_first([cpu, 1])
-    }
-
-    output {
-        File filtered_vcf = "${output_vcf}"
-        File filtered_vcf_index = "${output_vcf_index}"
-    }
-}
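
# Illustrative expansion of the -AM argument in the task above: with
# artifact_modes = ["G/T", "C/T"], the WDL fragment
#   -AM ${sep=" -AM " artifact_modes}
# renders on the command line as
#   -AM G/T -AM C/T
# i.e. the separator re-inserts the flag between elements, yielding one -AM per mode.
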
-task oncotate_m2 {
-    # inputs
-    File m2_vcf
-    File? onco_ds_tar_gz
-    String? onco_ds_local_db_dir
-    String? oncotator_exe
-    String? sequencing_center
-    String? sequence_source
-    File? default_config_file
-    String case_id
-    String? control_id
-
-    # runtime
-    String oncotator_docker
-    Int? mem
-    Int? preemptible_attempts
-    Int? disk_space
-    Int? cpu
-    Boolean use_ssd = false
-
-    # Mem is in units of GB but our command and memory runtime values are in MB
-    Int machine_mem = if defined(mem) then mem * 1000 else 3500
-    Int command_mem = machine_mem - 500
-
-    command <<<
-        # fail if *any* command below (not just the last) doesn't return 0, in particular if wget fails
-        set -e
-
-        # local db dir is a directory and has been specified
-        if [[ -d "${onco_ds_local_db_dir}" ]]; then
-            echo "Using local db-dir: ${onco_ds_local_db_dir}"
-            echo "THIS ONLY WORKS WITHOUT DOCKER!"
-            ln -s ${onco_ds_local_db_dir} onco_dbdir
-        elif [[ "${onco_ds_tar_gz}" == *.tar.gz ]]; then
-            echo "Using given tar file: ${onco_ds_tar_gz}"
-            mkdir onco_dbdir
-            tar zxvf ${onco_ds_tar_gz} -C onco_dbdir --strip-components 1
-        else
-            echo "Downloading and installing oncotator datasources from Broad FTP site..."
-            # Download and untar the db-dir
-            wget ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/oncotator/oncotator_v1_ds_April052016.tar.gz
-            tar zxvf oncotator_v1_ds_April052016.tar.gz
-            ln -s oncotator_v1_ds_April052016 onco_dbdir
-        fi
-
-        ${default="/root/oncotator_venv/bin/oncotator" oncotator_exe} --db-dir onco_dbdir/ -c $HOME/tx_exact_uniprot_matches.AKT1_CRLF2_FGFR1.txt \
-            -v ${m2_vcf} ${case_id}.maf.annotated hg19 -i VCF -o TCGAMAF --skip-no-alt --infer-onps --collapse-number-annotations --log_name oncotator.log \
-            -a Center:${default="Unknown" sequencing_center} \
-            -a source:${default="Unknown" sequence_source} \
-            -a normal_barcode:${control_id} \
-            -a tumor_barcode:${case_id} \
-            ${"--default_config " + default_config_file}
-    >>>
-
-    runtime {
-        docker: oncotator_docker
-        memory: machine_mem + " MB"
-        bootDiskSizeGb: 12
-        disks: "local-disk " + select_first([disk_space, 100]) + if use_ssd then " SSD" else " HDD"
-        preemptible: select_first([preemptible_attempts, 10])
-        cpu: select_first([cpu, 1])
-    }
-
-    output {
-        File oncotated_m2_maf = "${case_id}.maf.annotated"
-    }
-}
-
-# Calculates sum of a list of floats
-task SumFloats {
-    Array[Float] sizes
-
-    # Runtime parameters
-    Int? preemptible_attempts
-
-    command <<<
-        python -c "print ${sep="+" sizes}"
-    >>>
-
-    output {
-        Float total_size = read_float(stdout())
-    }
-
-    runtime {
-        docker: "python:2.7"
-        disks: "local-disk " + 10 + " HDD"
-        preemptible: select_first([preemptible_attempts, 10])
-    }
-}
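
# Illustrative expansion for the task above: with sizes = [1.5, 2.25, 0.75], the command
# renders as
#   python -c "print 1.5+2.25+0.75"
# and read_float(stdout()) yields 4.5. Note that this relies on the Python 2 print
# statement, hence the python:2.7 docker image pinned in the runtime block.
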
-task Funcotate {
-    # inputs
-    File ref_fasta
-    File ref_fai
-    File ref_dict
-    File m2_vcf
-    File m2_vcf_index
-    String reference_version
-    String output_name
-    Boolean compress
-    String output_vcf = output_name + if compress then ".vcf.gz" else ".vcf"
-    String output_vcf_index = output_vcf + if compress then ".tbi" else ".idx"
-
-    File? data_sources_tar_gz
-    String? transcript_selection_mode
-    Array[String]? transcript_selection_list
-    Array[String]? annotation_defaults
-    Array[String]? annotation_overrides
-
-    # ==============
-    # Process input args:
-    String transcript_selection_arg = if defined(transcript_selection_list) then " --transcript-list " else ""
-    String annotation_def_arg = if defined(annotation_defaults) then " --annotation-default " else ""
-    String annotation_over_arg = if defined(annotation_overrides) then " --annotation-override " else ""
-    # ==============
-
-    # runtime
-    String gatk_docker
-    File? gatk_override
-    Int? mem
-    Int? preemptible_attempts
-    Int? disk_space_gb
-    Int? cpu
-
-    Boolean use_ssd = false
-
-    # You may have to change the following two parameter values depending on the task requirements
-    Int default_ram_mb = 3000
-    # WARNING: In the workflow, you should calculate the disk space as an input to this task (disk_space_gb). Please see [TODO: Link from Jose] for examples.
-    Int default_disk_space_gb = 100
-
-    # Mem is in units of GB but our command and memory runtime values are in MB
-    Int machine_mem = if defined(mem) then mem * 1000 else default_ram_mb
-    Int command_mem = machine_mem - 1000
-
-    command <<<
-        set -e
-        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}
-
-        DATA_SOURCES_TAR_GZ=${data_sources_tar_gz}
-        if [[ ! -e $DATA_SOURCES_TAR_GZ ]] ; then
-            # We have to download the data sources:
-            echo "Data sources gzip does not exist: $DATA_SOURCES_TAR_GZ"
-            echo "Downloading default data sources..."
-            wget ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/funcotator/funcotator_dataSources.v1.0.20180105.tar.gz
-            tar -zxf funcotator_dataSources.v1.0.20180105.tar.gz
-            DATA_SOURCES_FOLDER=funcotator_dataSources.v1.0.20180105
-        else
-            # Extract the tar.gz:
-            mkdir datasources_dir
-            tar zxvf ${data_sources_tar_gz} -C datasources_dir --strip-components 1
-            DATA_SOURCES_FOLDER="$PWD/datasources_dir"
-        fi
-
-        gatk --java-options "-Xmx${command_mem}m" Funcotator \
-            --data-sources-path $DATA_SOURCES_FOLDER \
-            --ref-version ${reference_version} \
-            -R ${ref_fasta} \
-            -V ${m2_vcf} \
-            -O ${output_vcf} \
-            ${"--transcript-selection-mode " + transcript_selection_mode} \
-            ${transcript_selection_arg}${default="" sep=" --transcript-list " transcript_selection_list} \
-            ${annotation_def_arg}${default="" sep=" --annotation-default " annotation_defaults} \
-            ${annotation_over_arg}${default="" sep=" --annotation-override " annotation_overrides}
-    >>>
-
-    runtime {
-        docker: gatk_docker
-        memory: machine_mem + " MB"
-        disks: "local-disk " + select_first([disk_space_gb, default_disk_space_gb]) + if use_ssd then " SSD" else " HDD"
-        preemptible: select_first([preemptible_attempts, 3])
-        cpu: select_first([cpu, 1])
-    }
-
-    output {
-        File funcotated_vcf = "${output_vcf}"
-        File funcotated_vcf_index = "${output_vcf_index}"
-    }
-}
-
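# Illustrative expansion of the Funcotator annotation arguments above: with
# annotation_defaults = ["tumor_barcode:TUMOR", "normal_barcode:NORMAL"] (hypothetical values),
#   ${annotation_def_arg}${default="" sep=" --annotation-default " annotation_defaults}
# renders as
#   --annotation-default tumor_barcode:TUMOR --annotation-default normal_barcode:NORMAL
# The leading flag comes from annotation_def_arg (empty when the array is undefined), and
# the separator re-inserts the flag between array elements.
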
diff --git a/scripts/mutect2_wdl/unsupported/preprocess_hapmap.wdl b/scripts/mutect2_wdl/unsupported/preprocess_hapmap.wdl
deleted file mode 100644
index 929ce5f952e..00000000000
--- a/scripts/mutect2_wdl/unsupported/preprocess_hapmap.wdl
+++ /dev/null
@@ -1,139 +0,0 @@
-### This is the truth data preprocessing part of the CRSP sensitivity validation
-
-# Conceptual Overview
-# To measure sensitivity, we sequence a pool of 5, 10, or 20 normal samples. The pool contains a variety of allele fractions,
-# like a real tumor sample. These normal samples were also sequenced individually as part of HapMap, so we have "truth" vcfs
-# for them. We calculate sensitivity by comparing the calls to the truth data. For a variety of reasons we don't call
-# against a matched normal.
-
-# Workflow Steps
-# 1. Restrict a huge HapMap wgs VCF to given lists of samples and intervals.
-# 2. Annotate this VCF using a bam sequenced from a pool derived from the given samples.
-# 3. Run Mutect2 in tumor-only mode on this pooled bam.
-# 4. Compare Mutect2 calls to the truth data and output a table of true positives and false negatives along with
-#    annotations from the truth VCF prepared in steps 1 and 2.
-
-# Here we implement the subsampling part of step 1, along with some other preprocessing such as removing pairs of
-# neighboring indels.
-
-workflow PreprocessHapmap {
-    # inputs
-    File hapmap
-    File hapmap_idx
-    File five_plex_samples
-    File ten_plex_samples
-    File twenty_plex_samples
-    Int min_indel_spacing
-    File gnomad  # common variants, e.g. gnomad variants with AF > 0.001
-    File gnomad_idx
-
-    File? gatk_override
-
-    # runtime
-    String gatk_docker
-
-    call Subsample as SubsampleFive {
-        input: gatk_override = gatk_override, gatk_docker = gatk_docker, hapmap = hapmap, hapmap_idx = hapmap_idx,
-            samples = five_plex_samples, gnomad = gnomad, gnomad_idx = gnomad_idx
-    }
-
-    call Subsample as SubsampleTen {
-        input: gatk_override = gatk_override, gatk_docker = gatk_docker, hapmap = hapmap, hapmap_idx = hapmap_idx,
-            samples = ten_plex_samples, gnomad = gnomad, gnomad_idx = gnomad_idx
-    }
-
-    call Subsample as SubsampleTwenty {
-        input: gatk_override = gatk_override, gatk_docker = gatk_docker, hapmap = hapmap, hapmap_idx = hapmap_idx,
-            samples = twenty_plex_samples, gnomad = gnomad, gnomad_idx = gnomad_idx
-    }
-
-    call RemoveNearbyIndels as RemoveFive {
-        input: gatk_override = gatk_override, gatk_docker = gatk_docker, input_vcf = SubsampleFive.output_vcf, input_vcf_idx = SubsampleFive.output_vcf_idx,
-            min_indel_spacing = min_indel_spacing, name = "five_plex"
-    }
-
-    call RemoveNearbyIndels as RemoveTen {
-        input: gatk_override = gatk_override, gatk_docker = gatk_docker, input_vcf = SubsampleTen.output_vcf, input_vcf_idx = SubsampleTen.output_vcf_idx,
-            min_indel_spacing = min_indel_spacing, name = "ten_plex"
-    }
-
-    call RemoveNearbyIndels as RemoveTwenty {
-        input: gatk_override = gatk_override, gatk_docker = gatk_docker, input_vcf = SubsampleTwenty.output_vcf, input_vcf_idx = SubsampleTwenty.output_vcf_idx,
-            min_indel_spacing = min_indel_spacing, name = "twenty_plex"
-    }
-
-    output {
-        File five_plex_preprocessed = RemoveFive.output_vcf
-        File five_plex_preprocessed_idx = RemoveFive.output_vcf_idx
-        File ten_plex_preprocessed = RemoveTen.output_vcf
-        File ten_plex_preprocessed_idx = RemoveTen.output_vcf_idx
-        File twenty_plex_preprocessed = RemoveTwenty.output_vcf
-        File twenty_plex_preprocessed_idx = RemoveTwenty.output_vcf_idx
-    }
-}
-
-task Subsample {
-    # inputs
-    File hapmap
-    File hapmap_idx
-    File samples
-    File gnomad  # common variants, to reduce false positives, e.g. mapping artifacts in the original HapMap vcf
-    File gnomad_idx
-
-    File? gatk_override
-
-    # runtime
-    String gatk_docker
-
-    command {
-        set -e
-        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}
-
-        # subsampling and restriction to biallelics
-        gatk --java-options "-Xmx4g" SelectVariants -V ${hapmap} -O sub.vcf \
-            -restrict-alleles-to BIALLELIC \
-            --sample-name ${samples} \
-            -L ${gnomad} \
-            -max-indel-size 10 \
-            --exclude-non-variants
-
-        # remove NEGATIVE_TRAIN_SITE variants and re-index
-        grep -v NEGATIVE_TRAIN_SITE sub.vcf > subsampled.vcf
-        gatk --java-options "-Xmx4g" IndexFeatureFile -F subsampled.vcf
-    }
-
-    runtime {
-        docker: "${gatk_docker}"
-        preemptible: 2
-    }
-
-    output {
-        File output_vcf = "subsampled.vcf"
-        File output_vcf_idx = "subsampled.vcf.idx"
-    }
-}
-
-task RemoveNearbyIndels {
-    # inputs
-    File input_vcf
-    File input_vcf_idx
-    Int min_indel_spacing
-    String name
-
-    File? gatk_override
-
-    # runtime
-    String gatk_docker
-
-    command {
-        set -e
-        export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override}
-        gatk --java-options "-Xmx4g" RemoveNearbyIndels -V ${input_vcf} -O ${name}.vcf -min-indel-spacing ${min_indel_spacing}
-    }
-
-    runtime {
-        docker: "${gatk_docker}"
-        preemptible: 2
-    }
-
-    output {
-        File output_vcf = "${name}.vcf"
-        File output_vcf_idx = "${name}.vcf.idx"
-    }
-}
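
# Illustrative inputs JSON for the PreprocessHapmap workflow above (all paths and values
# are hypothetical placeholders; the sample-list files are plain text consumed by
# SelectVariants --sample-name):
# {
#   "PreprocessHapmap.hapmap": "gs://my-bucket/hapmap_all.vcf",
#   "PreprocessHapmap.hapmap_idx": "gs://my-bucket/hapmap_all.vcf.idx",
#   "PreprocessHapmap.five_plex_samples": "gs://my-bucket/five_plex_samples.txt",
#   "PreprocessHapmap.ten_plex_samples": "gs://my-bucket/ten_plex_samples.txt",
#   "PreprocessHapmap.twenty_plex_samples": "gs://my-bucket/twenty_plex_samples.txt",
#   "PreprocessHapmap.min_indel_spacing": 10,
#   "PreprocessHapmap.gnomad": "gs://my-bucket/gnomad_common.vcf",
#   "PreprocessHapmap.gnomad_idx": "gs://my-bucket/gnomad_common.vcf.idx",
#   "PreprocessHapmap.gatk_docker": "broadinstitute/gatk:latest"
# }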