diff --git a/.dockstore.yml b/.dockstore.yml index de11d9eed1a..897befc41f0 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -95,6 +95,8 @@ workflows: branches: - master - ah_var_store + - rsa_vqsr_lite_poc + - VS-693_VQSR_lite - name: GvsPopulateAltAllele subclass: WDL primaryDescriptorPath: /scripts/variantstore/wdl/GvsPopulateAltAllele.wdl diff --git a/.github/workflows/gatk-tests.yml b/.github/workflows/gatk-tests.yml index 6ad01a31f75..a908b98cd15 100644 --- a/.github/workflows/gatk-tests.yml +++ b/.github/workflows/gatk-tests.yml @@ -291,7 +291,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - wdlTest: [ 'RUN_CNV_GERMLINE_COHORT_WDL', 'RUN_CNV_GERMLINE_CASE_WDL', 'RUN_CNV_SOMATIC_WDL', 'RUN_M2_WDL', 'RUN_CNN_WDL' ] + wdlTest: [ 'RUN_CNV_GERMLINE_COHORT_WDL', 'RUN_CNV_GERMLINE_CASE_WDL', 'RUN_CNV_SOMATIC_WDL', 'RUN_M2_WDL', 'RUN_CNN_WDL', 'RUN_VCF_SITE_LEVEL_FILTERING_WDL' ] continue-on-error: true name: WDL test ${{ matrix.wdlTest }} on cromwell steps: @@ -349,3 +349,9 @@ jobs: run: | echo "Running CNN WDL"; bash scripts/cnn_variant_cromwell_tests/run_cnn_variant_wdl.sh; + + - name: "VCF_SITE_LEVEL_FILTERING_WDL_TEST" + if: ${{ matrix.wdlTest == 'RUN_VCF_SITE_LEVEL_FILTERING_WDL' }} + run: | + echo "Running VCF Site Level Filtering WDL"; + bash scripts/vcf_site_level_filtering_cromwell_tests/run_vcf_site_level_filtering_wdl.sh; \ No newline at end of file diff --git a/build.gradle b/build.gradle index 85aab88634c..c7698c5a290 100644 --- a/build.gradle +++ b/build.gradle @@ -293,6 +293,7 @@ dependencies { implementation 'org.apache.commons:commons-lang3:3.5' implementation 'org.apache.commons:commons-math3:3.5' + implementation 'org.hipparchus:hipparchus-stat:2.0' implementation 'org.apache.commons:commons-collections4:4.1' implementation 'org.apache.commons:commons-vfs2:2.0' implementation 'org.apache.commons:commons-configuration2:2.4' diff --git a/scripts/gatkcondaenv.yml.template b/scripts/gatkcondaenv.yml.template index 9077fed6296..dbe29ed5a28 
100644 --- a/scripts/gatkcondaenv.yml.template +++ b/scripts/gatkcondaenv.yml.template @@ -38,10 +38,11 @@ dependencies: # if you wish to update, note that versions of conda-forge::keras after 2.2.5 # undesirably set the environment variable KERAS_BACKEND = theano by default - defaults::intel-openmp=2019.4 -- conda-forge::scikit-learn=0.22.2 +- conda-forge::scikit-learn=0.23.1 - conda-forge::matplotlib=3.2.1 - conda-forge::pandas=1.0.3 - conda-forge::typing_extensions=4.1.1 # see https://github.com/broadinstitute/gatk/issues/7800 and linked PRs +- conda-forge::dill=0.3.4 # used for pickling lambdas in TrainVariantAnnotationsModel # core R dependencies; these should only be used for plotting and do not take precedence over core python dependencies! - r-base=3.6.2 diff --git a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl index 7d45b360814..61d6a32e9f7 100644 --- a/scripts/variantstore/wdl/GvsCreateFilterSet.wdl +++ b/scripts/variantstore/wdl/GvsCreateFilterSet.wdl @@ -2,6 +2,7 @@ version 1.0 import "GvsWarpTasks.wdl" as Tasks import "GvsUtils.wdl" as Utils +import "../../vcf_site_level_filtering_wdl/JointVcfFiltering.wdl" as VQSRLite workflow GvsCreateFilterSet { input { @@ -17,6 +18,7 @@ workflow GvsCreateFilterSet { File interval_list = "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.noCentromeres.noTelomeres.interval_list" File? gatk_override + Boolean use_classic_VQSR = true Int? INDEL_VQSR_max_gaussians_override = 4 Int? INDEL_VQSR_maximum_training_variants Int? 
INDEL_VQSR_mem_gb_override @@ -54,9 +56,13 @@ workflow GvsCreateFilterSet { String fq_sample_table = "~{project_id}.~{dataset_name}.sample_info" String fq_alt_allele_table = "~{project_id}.~{dataset_name}.alt_allele" String fq_info_destination_table = "~{project_id}.~{dataset_name}.filter_set_info" + String fq_info_destination_table_vqsr_lite = "~{project_id}.~{dataset_name}.vqsr_lite_filter_set_info" String fq_tranches_destination_table = "~{project_id}.~{dataset_name}.filter_set_tranches" String fq_filter_sites_destination_table = "~{project_id}.~{dataset_name}.filter_set_sites" + String fq_info_destination_table_schema = "filter_set_name:string,type:string,location:integer,ref:string,alt:string,vqslod:float,culprit:string,training_label:string,yng_status:string" + String fq_info_destination_table_vqsr_lite_schema = "filter_set_name:string,type:string,location:integer,ref:string,alt:string,vqslod:float,culprit:string,training_label:string,yng_status:string,calibration_sensitivity:float" + call Utils.GetBQTableLastModifiedDatetime as SamplesTableDatetimeCheck { input: query_project = project_id, @@ -118,63 +124,98 @@ workflow GvsCreateFilterSet { preemptible_tries = 3, } - call Tasks.IndelsVariantRecalibrator { - input: - sites_only_variant_filtered_vcf = MergeVCFs.output_vcf, - sites_only_variant_filtered_vcf_index = MergeVCFs.output_vcf_index, - recalibration_filename = filter_set_name + ".indels.recal", - tranches_filename = filter_set_name + ".indels.tranches", - recalibration_tranche_values = ["100.0", "99.95", "99.9", "99.5", "99.0", "97.0", "96.0", "95.0", "94.0", "93.5", "93.0", "92.0", "91.0", "90.0"], - recalibration_annotation_values = indel_recalibration_annotation_values, - mills_resource_vcf = mills_resource_vcf, - mills_resource_vcf_index = mills_resource_vcf_index, - axiomPoly_resource_vcf = axiomPoly_resource_vcf, - axiomPoly_resource_vcf_index = axiomPoly_resource_vcf_index, - dbsnp_resource_vcf = dbsnp_vcf, - dbsnp_resource_vcf_index = 
dbsnp_vcf_index, - use_allele_specific_annotations = true, - disk_size = "1000", - machine_mem_gb = INDEL_VQSR_mem_gb_override, - max_gaussians = INDEL_VQSR_max_gaussians_override, - maximum_training_variants = INDEL_VQSR_maximum_training_variants, + # From this point, the paths diverge depending on whether they're using classic VQSR or VQSR-Lite + # The first branch here is VQSR-Lite, and the second is classic VQSR + if (!use_classic_VQSR) { + call VQSRLite.JointVcfFiltering as JointVcfFiltering { + input: + vcf = ExtractFilterTask.output_vcf, + vcf_index = ExtractFilterTask.output_vcf_index, + sites_only_vcf = MergeVCFs.output_vcf, + sites_only_vcf_index = MergeVCFs.output_vcf_index, + basename = filter_set_name, + gatk_docker = "us.gcr.io/broad-gatk/gatk:4.3.0.0", + extract_interval_list = interval_list, + score_interval_list = interval_list, + snp_annotations = "-A AS_QD -A AS_MQRankSum -A AS_ReadPosRankSum -A AS_FS -A AS_MQ -A AS_SOR", + indel_annotations = "-A AS_FS -A AS_ReadPosRankSum -A AS_MQRankSum -A AS_QD -A AS_SOR", + use_allele_specific_annotations = true, + } + + call Utils.MergeVCFs as MergeINDELScoredVCFs { + input: + input_vcfs = JointVcfFiltering.indels_variant_scored_vcf, + gather_type = "CONVENTIONAL", + output_vcf_name = "${filter_set_name}.indel.vrecalibration.gz", + preemptible_tries = 3, + } + + call Utils.MergeVCFs as MergeSNPScoredVCFs { + input: + input_vcfs = JointVcfFiltering.snps_variant_scored_vcf, + gather_type = "CONVENTIONAL", + output_vcf_name = "${filter_set_name}.snp.vrecalibration.gz", + preemptible_tries = 3, + } + + call PopulateFilterSetInfo { + input: + gatk_override = gatk_override, + filter_set_name = filter_set_name, + snp_recal_file = MergeSNPScoredVCFs.output_vcf, + snp_recal_file_index = MergeSNPScoredVCFs.output_vcf_index, + indel_recal_file = MergeINDELScoredVCFs.output_vcf, + indel_recal_file_index = MergeINDELScoredVCFs.output_vcf_index, + fq_info_destination_table = fq_info_destination_table_vqsr_lite, + 
filter_schema = fq_info_destination_table_vqsr_lite_schema, + query_project = project_id, + useClassic = false + } + + call PopulateFilterSetSites { + input: + gatk_override = gatk_override, + filter_set_name = filter_set_name, + sites_only_variant_filtered_vcf = MergeVCFs.output_vcf, + sites_only_variant_filtered_vcf_index = MergeVCFs.output_vcf_index, + fq_filter_sites_destination_table = fq_filter_sites_destination_table, + query_project = project_id + } } - if (GetNumSamplesLoaded.num_samples > snps_variant_recalibration_threshold) { - call Tasks.SNPsVariantRecalibratorCreateModel { + if (use_classic_VQSR) { + + call Tasks.IndelsVariantRecalibrator { input: sites_only_variant_filtered_vcf = MergeVCFs.output_vcf, sites_only_variant_filtered_vcf_index = MergeVCFs.output_vcf_index, - recalibration_filename = filter_set_name + ".snps.recal", - tranches_filename = filter_set_name + ".snps.tranches", - recalibration_tranche_values = snp_recalibration_tranche_values, - recalibration_annotation_values = snp_recalibration_annotation_values, - model_report_filename = filter_set_name + ".snps.model.report", - hapmap_resource_vcf = hapmap_resource_vcf, - hapmap_resource_vcf_index = hapmap_resource_vcf_index, - omni_resource_vcf = omni_resource_vcf, - omni_resource_vcf_index = omni_resource_vcf_index, - one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, - one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, + recalibration_filename = filter_set_name + ".indels.recal", + tranches_filename = filter_set_name + ".indels.tranches", + recalibration_tranche_values = ["100.0", "99.95", "99.9", "99.5", "99.0", "97.0", "96.0", "95.0", "94.0", "93.5", "93.0", "92.0", "91.0", "90.0"], + recalibration_annotation_values = indel_recalibration_annotation_values, + mills_resource_vcf = mills_resource_vcf, + mills_resource_vcf_index = mills_resource_vcf_index, + axiomPoly_resource_vcf = axiomPoly_resource_vcf, + axiomPoly_resource_vcf_index = 
axiomPoly_resource_vcf_index, dbsnp_resource_vcf = dbsnp_vcf, dbsnp_resource_vcf_index = dbsnp_vcf_index, use_allele_specific_annotations = true, disk_size = "1000", - machine_mem_gb = SNP_VQSR_mem_gb_override, - max_gaussians = SNP_VQSR_max_gaussians_override, - sample_every_nth_variant = SNP_VQSR_sample_every_nth_variant, - maximum_training_variants = SNP_VQSR_maximum_training_variants + machine_mem_gb = INDEL_VQSR_mem_gb_override, + max_gaussians = INDEL_VQSR_max_gaussians_override, + maximum_training_variants = INDEL_VQSR_maximum_training_variants, } - scatter (idx in range(length(ExtractFilterTask.output_vcf))) { - call Tasks.SNPsVariantRecalibrator as SNPsVariantRecalibratorScattered { + if (GetNumSamplesLoaded.num_samples > snps_variant_recalibration_threshold) { + call Tasks.SNPsVariantRecalibratorCreateModel { input: - sites_only_variant_filtered_vcf = ExtractFilterTask.output_vcf[idx], - sites_only_variant_filtered_vcf_index = ExtractFilterTask.output_vcf_index[idx], - recalibration_filename = filter_set_name + ".snps." + idx + ".recal", - tranches_filename = filter_set_name + ".snps." 
+ idx + ".tranches", + sites_only_variant_filtered_vcf = MergeVCFs.output_vcf, + sites_only_variant_filtered_vcf_index = MergeVCFs.output_vcf_index, + recalibration_filename = filter_set_name + ".snps.recal", + tranches_filename = filter_set_name + ".snps.tranches", recalibration_tranche_values = snp_recalibration_tranche_values, recalibration_annotation_values = snp_recalibration_annotation_values, - model_report = SNPsVariantRecalibratorCreateModel.model_report, + model_report_filename = filter_set_name + ".snps.model.report", hapmap_resource_vcf = hapmap_resource_vcf, hapmap_resource_vcf_index = hapmap_resource_vcf_index, omni_resource_vcf = omni_resource_vcf, @@ -185,84 +226,114 @@ workflow GvsCreateFilterSet { dbsnp_resource_vcf_index = dbsnp_vcf_index, use_allele_specific_annotations = true, disk_size = "1000", - machine_mem_gb = SNP_VQSR_mem_gb_override + machine_mem_gb = SNP_VQSR_mem_gb_override, + max_gaussians = SNP_VQSR_max_gaussians_override, + sample_every_nth_variant = SNP_VQSR_sample_every_nth_variant, + maximum_training_variants = SNP_VQSR_maximum_training_variants + } + + scatter (idx in range(length(ExtractFilterTask.output_vcf))) { + call Tasks.SNPsVariantRecalibrator as SNPsVariantRecalibratorScattered { + input: + sites_only_variant_filtered_vcf = ExtractFilterTask.output_vcf[idx], + sites_only_variant_filtered_vcf_index = ExtractFilterTask.output_vcf_index[idx], + recalibration_filename = filter_set_name + ".snps." + idx + ".recal", + tranches_filename = filter_set_name + ".snps." 
+ idx + ".tranches", + recalibration_tranche_values = snp_recalibration_tranche_values, + recalibration_annotation_values = snp_recalibration_annotation_values, + model_report = SNPsVariantRecalibratorCreateModel.model_report, + hapmap_resource_vcf = hapmap_resource_vcf, + hapmap_resource_vcf_index = hapmap_resource_vcf_index, + omni_resource_vcf = omni_resource_vcf, + omni_resource_vcf_index = omni_resource_vcf_index, + one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, + one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, + dbsnp_resource_vcf = dbsnp_vcf, + dbsnp_resource_vcf_index = dbsnp_vcf_index, + use_allele_specific_annotations = true, + disk_size = "1000", + machine_mem_gb = SNP_VQSR_mem_gb_override + } + } + + call Tasks.GatherTranches as SNPGatherTranches { + input: + tranches = SNPsVariantRecalibratorScattered.tranches, + output_filename = filter_set_name + ".snps.gathered.tranches", + output_tranche_values = snp_recalibration_tranche_values, + mode = "SNP", + disk_size = "200", + gatk_override = gatk_override + } + + call Utils.MergeVCFs as MergeRecalibrationFiles { + input: + input_vcfs = SNPsVariantRecalibratorScattered.recalibration, + gather_type = "CONVENTIONAL", + output_vcf_name = "${filter_set_name}.vrecalibration.gz", + preemptible_tries = 3, } } - call Tasks.GatherTranches as SNPGatherTranches { - input: - tranches = SNPsVariantRecalibratorScattered.tranches, - output_filename = filter_set_name + ".snps.gathered.tranches", - output_tranche_values = snp_recalibration_tranche_values, - mode = "SNP", - disk_size = "200", - gatk_override = gatk_override + if (GetNumSamplesLoaded.num_samples <= snps_variant_recalibration_threshold) { + call Tasks.SNPsVariantRecalibrator as SNPsVariantRecalibratorClassic { + input: + sites_only_variant_filtered_vcf = MergeVCFs.output_vcf, + sites_only_variant_filtered_vcf_index = MergeVCFs.output_vcf_index, + recalibration_filename = filter_set_name + 
".snps.recal", + tranches_filename = filter_set_name + ".snps.tranches", + recalibration_tranche_values = snp_recalibration_tranche_values, + recalibration_annotation_values = snp_recalibration_annotation_values, + hapmap_resource_vcf = hapmap_resource_vcf, + hapmap_resource_vcf_index = hapmap_resource_vcf_index, + omni_resource_vcf = omni_resource_vcf, + omni_resource_vcf_index = omni_resource_vcf_index, + one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, + one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, + dbsnp_resource_vcf = dbsnp_vcf, + dbsnp_resource_vcf_index = dbsnp_vcf_index, + use_allele_specific_annotations = true, + disk_size = "1000", + machine_mem_gb = SNP_VQSR_mem_gb_override, + max_gaussians = SNP_VQSR_max_gaussians_override, + } } - call Utils.MergeVCFs as MergeRecalibrationFiles { + call PopulateFilterSetInfo as PopulateFilterSetInfoCLassic { input: - input_vcfs = SNPsVariantRecalibratorScattered.recalibration, - gather_type = "CONVENTIONAL", - output_vcf_name = "${filter_set_name}.vrecalibration.gz", - preemptible_tries = 3, + gatk_override = gatk_override, + filter_set_name = filter_set_name, + snp_recal_file = select_first([MergeRecalibrationFiles.output_vcf, SNPsVariantRecalibratorClassic.recalibration]), + snp_recal_file_index = select_first([MergeRecalibrationFiles.output_vcf_index, SNPsVariantRecalibratorClassic.recalibration_index]), + indel_recal_file = IndelsVariantRecalibrator.recalibration, + indel_recal_file_index = IndelsVariantRecalibrator.recalibration_index, + fq_info_destination_table = fq_info_destination_table, + filter_schema = fq_info_destination_table_schema, + query_project = project_id, + useClassic = true } - } - if (GetNumSamplesLoaded.num_samples <= snps_variant_recalibration_threshold) { - call Tasks.SNPsVariantRecalibrator as SNPsVariantRecalibratorClassic { + call PopulateFilterSetSites as PopulateFilterSetSitesClassic { input: + gatk_override = gatk_override, + 
filter_set_name = filter_set_name, sites_only_variant_filtered_vcf = MergeVCFs.output_vcf, sites_only_variant_filtered_vcf_index = MergeVCFs.output_vcf_index, - recalibration_filename = filter_set_name + ".snps.recal", - tranches_filename = filter_set_name + ".snps.tranches", - recalibration_tranche_values = snp_recalibration_tranche_values, - recalibration_annotation_values = snp_recalibration_annotation_values, - hapmap_resource_vcf = hapmap_resource_vcf, - hapmap_resource_vcf_index = hapmap_resource_vcf_index, - omni_resource_vcf = omni_resource_vcf, - omni_resource_vcf_index = omni_resource_vcf_index, - one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, - one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, - dbsnp_resource_vcf = dbsnp_vcf, - dbsnp_resource_vcf_index = dbsnp_vcf_index, - use_allele_specific_annotations = true, - disk_size = "1000", - machine_mem_gb = SNP_VQSR_mem_gb_override, - max_gaussians = SNP_VQSR_max_gaussians_override, + fq_filter_sites_destination_table = fq_filter_sites_destination_table, + query_project = project_id } - } - - call PopulateFilterSetInfo { - input: - gatk_override = gatk_override, - filter_set_name = filter_set_name, - snp_recal_file = select_first([MergeRecalibrationFiles.output_vcf, SNPsVariantRecalibratorClassic.recalibration]), - snp_recal_file_index = select_first([MergeRecalibrationFiles.output_vcf_index, SNPsVariantRecalibratorClassic.recalibration_index]), - indel_recal_file = IndelsVariantRecalibrator.recalibration, - indel_recal_file_index = IndelsVariantRecalibrator.recalibration_index, - fq_info_destination_table = fq_info_destination_table, - query_project = project_id - } - call PopulateFilterSetSites { - input: - gatk_override = gatk_override, - filter_set_name = filter_set_name, - sites_only_variant_filtered_vcf = MergeVCFs.output_vcf, - sites_only_variant_filtered_vcf_index = MergeVCFs.output_vcf_index, - fq_filter_sites_destination_table = 
fq_filter_sites_destination_table, - query_project = project_id + call PopulateFilterSetTranches as PopulateFilterSetTranchesClassic { + input: + gatk_override = gatk_override, + filter_set_name = filter_set_name, + snp_recal_tranches = select_first([SNPGatherTranches.tranches_file, SNPsVariantRecalibratorClassic.tranches]), + indel_recal_tranches = IndelsVariantRecalibrator.tranches, + fq_tranches_destination_table = fq_tranches_destination_table, + query_project = project_id + } } - call PopulateFilterSetTranches { - input: - gatk_override = gatk_override, - filter_set_name = filter_set_name, - snp_recal_tranches = select_first([SNPGatherTranches.tranches_file, SNPsVariantRecalibratorClassic.tranches]), - indel_recal_tranches = IndelsVariantRecalibrator.tranches, - fq_tranches_destination_table = fq_tranches_destination_table, - query_project = project_id - } output { File output_vcf = MergeVCFs.output_vcf @@ -349,7 +420,9 @@ task ExtractFilterTask { task PopulateFilterSetInfo { input { String filter_set_name + String filter_schema String fq_info_destination_table + Boolean useClassic = true File snp_recal_file File snp_recal_file_index @@ -378,6 +451,7 @@ task PopulateFilterSetInfo { --ref-version 38 \ --filter-set-name ~{filter_set_name} \ -mode SNP \ + --classic ~{useClassic} \ -V ~{snp_recal_file} \ -O ~{filter_set_name}.snps.recal.tsv @@ -387,6 +461,7 @@ task PopulateFilterSetInfo { --ref-version 38 \ --filter-set-name ~{filter_set_name} \ -mode INDEL \ + --classic ~{useClassic} \ -V ~{indel_recal_file} \ -O ~{filter_set_name}.indels.recal.tsv @@ -401,7 +476,7 @@ task PopulateFilterSetInfo { bq load --project_id=~{query_project} --skip_leading_rows 0 -F "tab" \ --range_partitioning=location,0,26000000000000,6500000000 \ --clustering_fields=location \ - --schema "filter_set_name:string,type:string,location:integer,ref:string,alt:string,vqslod:float,culprit:string,training_label:string,yng_status:string" \ + --schema "~{filter_schema}" \ ${bq_table} \ 
~{filter_set_name}.filter_set_load.tsv > status_load_filter_set_info >>> @@ -473,7 +548,6 @@ task PopulateFilterSetSites { output { String status_load_filter_set_sites = read_string("status_load_filter_set_sites") - } } diff --git a/scripts/vcf_site_level_filtering_cromwell_tests/README.md b/scripts/vcf_site_level_filtering_cromwell_tests/README.md new file mode 100644 index 00000000000..6f9950fa36d --- /dev/null +++ b/scripts/vcf_site_level_filtering_cromwell_tests/README.md @@ -0,0 +1,9 @@ +# Filtering Automated Tests for WDL + +**This directory is for GATK devs only** + +This directory contains scripts for running the VCF Site Level Filtering WDL tests in the automated GitHub Actions build environment. + +Please note that this only tests whether the WDL will complete successfully. + +Test data is a "plumbing test" using a small portion of a 10 sample callset. \ No newline at end of file diff --git a/scripts/vcf_site_level_filtering_cromwell_tests/run_vcf_site_level_filtering_wdl.sh b/scripts/vcf_site_level_filtering_cromwell_tests/run_vcf_site_level_filtering_wdl.sh new file mode 100644 index 00000000000..1c19d18c3b6 --- /dev/null +++ b/scripts/vcf_site_level_filtering_cromwell_tests/run_vcf_site_level_filtering_wdl.sh @@ -0,0 +1,38 @@ +#!/bin/bash -l +set -e +#cd into the directory of the script in order to use relative paths +script_path=$( cd "$(dirname "${BASH_SOURCE}")" ; pwd -P ) +cd "$script_path" + +WORKING_DIR=/home/runner/work/gatk + +set -e +echo "Building docker image for VCF Site Level Filtering WDL tests (skipping unit tests)..." + +#assume Dockerfile is in root +echo "Building docker without running unit tests... =========" +cd $WORKING_DIR/gatk + +# IMPORTANT: This code is duplicated in the cnv and M2 WDL tests. +if [ ! 
-z "$CI_PULL_REQUEST" ]; then + HASH_TO_USE=FETCH_HEAD + sudo bash build_docker.sh -e ${HASH_TO_USE} -s -u -d $PWD/temp_staging/ -t ${CI_PULL_REQUEST}; + echo "using fetch head:"$HASH_TO_USE +else + HASH_TO_USE=${CI_COMMIT} + sudo bash build_docker.sh -e ${HASH_TO_USE} -s -u -d $PWD/temp_staging/; + echo "using travis commit:"$HASH_TO_USE +fi +echo "Docker build done ==========" + +cd $WORKING_DIR/gatk/scripts/ +sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" vcf_site_level_filtering_cromwell_tests/vcf_site_level_filtering_travis.json >$WORKING_DIR/vcf_site_level_filtering_travis.json +echo "JSON FILES (modified) =======" +cat $WORKING_DIR/vcf_site_level_filtering_travis.json +echo "==================" + + +echo "Running Filtering WDL through cromwell" +ln -fs $WORKING_DIR/gatk/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl +cd $WORKING_DIR/gatk/scripts/vcf_site_level_filtering_wdl/ +java -jar $CROMWELL_JAR run JointVcfFiltering.wdl -i $WORKING_DIR/vcf_site_level_filtering_travis.json diff --git a/scripts/vcf_site_level_filtering_cromwell_tests/vcf_site_level_filtering_travis.json b/scripts/vcf_site_level_filtering_cromwell_tests/vcf_site_level_filtering_travis.json new file mode 100644 index 00000000000..8165e199d22 --- /dev/null +++ b/scripts/vcf_site_level_filtering_cromwell_tests/vcf_site_level_filtering_travis.json @@ -0,0 +1,14 @@ +{ + "JointVcfFiltering.gatk_docker": "__GATK_DOCKER__", + "JointVcfFiltering.vcf": ["/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.22.avg.vcf.gz", + "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.23.avg.vcf.gz"], + "JointVcfFiltering.vcf_index": ["/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.22.avg.vcf.gz.tbi", + "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.23.avg.vcf.gz.tbi"], + "JointVcfFiltering.sites_only_vcf": 
"/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.sites_only.vcf.gz", + "JointVcfFiltering.sites_only_vcf_index": "/home/runner/work/gatk/gatk/src/test/resources/large/filteringJointVcf/test_10_samples.sites_only.vcf.gz.tbi", + "JointVcfFiltering.basename": "test_10_samples", + "JointVcfFiltering.snp_annotations": "-A ReadPosRankSum -A FS -A SOR -A QD -A AVERAGE_TREE_SCORE -A AVERAGE_ASSEMBLED_HAPS -A AVERAGE_FILTERED_HAPS", + "JointVcfFiltering.indel_annotations": "-A MQRankSum -A ReadPosRankSum -A FS -A SOR -A QD -A AVERAGE_TREE_SCORE", + "JointVcfFiltering.model_backend": "PYTHON_IFOREST", + "JointVcfFiltering.use_allele_specific_annotations": false +} diff --git a/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl b/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl new file mode 100644 index 00000000000..63d69efa560 --- /dev/null +++ b/scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl @@ -0,0 +1,294 @@ +version 1.0 + +# This is a workflow for filtering a joint callset VCF using INFO level annotations (so filtering is at the site level). +# Note that the input VCFs here may be sharded by genomic position which may be helpful for large cohorts. The script +# will output the same number of shards that are input. +# This portion of the filtering pipeline will assign a SCORE INFO field annotation to each site, but does not yet apply +# the filtering threshold to the final VCF. + +workflow JointVcfFiltering { + input { + Array[File] vcf + Array[File] vcf_index + File sites_only_vcf + File sites_only_vcf_index + String basename + + String? model_backend + File? training_python_script + File? scoring_python_script + File? hyperparameters_json + + String gatk_docker + File? extract_interval_list + File? score_interval_list + + String snp_annotations + String indel_annotations + File? 
gatk_override + + Boolean use_allele_specific_annotations + + String snp_resource_args = "--resource:hapmap,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz --resource:omni,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz --resource:1000G,training=true,calibration=false gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz" + String indel_resource_args = "--resource:mills,training=true,calibration=true gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz" + } + + parameter_meta { + vcf: "An array of input VCFs that are one callset sharded by genomic region." + sites_only_vcf: "The full VCF callset without any genotype or sample level information." + basename: "Desired output file basename." + } + + call ExtractVariantAnnotations as ExtractVariantAnnotationsSNPs { + input: + input_vcf = sites_only_vcf, + input_vcf_index = sites_only_vcf_index, + mode = "SNP", + annotations = snp_annotations, + resource_args = snp_resource_args, + basename = basename, + interval_list = extract_interval_list, + use_allele_specific_annotations = use_allele_specific_annotations, + gatk_override = gatk_override, + gatk_docker = gatk_docker + } + + call ExtractVariantAnnotations as ExtractVariantAnnotationsINDELs { + input: + input_vcf = sites_only_vcf, + input_vcf_index = sites_only_vcf_index, + mode = "INDEL", + annotations = indel_annotations, + resource_args = indel_resource_args, + basename = basename, + interval_list = extract_interval_list, + use_allele_specific_annotations = use_allele_specific_annotations, + gatk_override = gatk_override, + gatk_docker = gatk_docker + } + + call TrainVariantAnnotationModel as TrainVariantAnnotationModelSNPs { + input: + annots = ExtractVariantAnnotationsSNPs.annots, + basename = basename, + mode = "snp", + model_backend = model_backend, + python_script = 
training_python_script, + hyperparameters_json = hyperparameters_json, + gatk_override = gatk_override, + gatk_docker = gatk_docker + } + + call TrainVariantAnnotationModel as TrainVariantAnnotationModelINDELs { + input: + annots = ExtractVariantAnnotationsINDELs.annots, + basename = basename, + mode = "indel", + model_backend = model_backend, + python_script = training_python_script, + hyperparameters_json = hyperparameters_json, + gatk_override = gatk_override, + gatk_docker = gatk_docker + } + + scatter(idx in range(length(vcf))) { + call ScoreVariantAnnotations as ScoreVariantAnnotationsSNPs { + input: + vcf = vcf[idx], + vcf_index = vcf_index[idx], + basename = basename, + mode = "SNP", + model_backend = model_backend, + python_script = scoring_python_script, + annotations = snp_annotations, + extracted_training_vcf = ExtractVariantAnnotationsSNPs.extracted_training_vcf, + extracted_training_vcf_index = ExtractVariantAnnotationsSNPs.extracted_training_vcf_index, + interval_list = score_interval_list, + model_files = TrainVariantAnnotationModelSNPs.outputs, + resource_args = snp_resource_args, + use_allele_specific_annotations = use_allele_specific_annotations, + gatk_override = gatk_override, + gatk_docker = gatk_docker + } + + call ScoreVariantAnnotations as ScoreVariantAnnotationsINDELs { + input: + vcf = vcf[idx], + vcf_index = vcf_index[idx], + basename = basename, + mode = "INDEL", + model_backend = model_backend, + python_script = scoring_python_script, + annotations = indel_annotations, + extracted_training_vcf = ExtractVariantAnnotationsINDELs.extracted_training_vcf, + extracted_training_vcf_index = ExtractVariantAnnotationsINDELs.extracted_training_vcf_index, + interval_list = score_interval_list, + model_files = TrainVariantAnnotationModelINDELs.outputs, + resource_args = indel_resource_args, + use_allele_specific_annotations = use_allele_specific_annotations, + gatk_override = gatk_override, + gatk_docker = gatk_docker + } + + } + + output { + 
Array[File] indels_variant_scored_vcf = ScoreVariantAnnotationsINDELs.output_vcf + Array[File] indels_variant_scored_vcf_index = ScoreVariantAnnotationsINDELs.output_vcf_index + Array[File] snps_variant_scored_vcf = ScoreVariantAnnotationsSNPs.output_vcf + Array[File] snps_variant_scored_vcf_index = ScoreVariantAnnotationsSNPs.output_vcf_index + } + +} + +task ExtractVariantAnnotations { + input { + String gatk_docker + File? gatk_override + File input_vcf + File input_vcf_index + String basename + String mode + String annotations + String resource_args + File? interval_list + Boolean use_allele_specific_annotations + + Int memory_mb = 14000 + Int command_mem = memory_mb - 1000 + } + Int disk_size = ceil(size(input_vcf, "GB") + 50) + command { + set -e + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override} + + gatk --java-options "-Xmx~{command_mem}m" \ + ExtractVariantAnnotations \ + -V ~{input_vcf} \ + -O ~{basename}.~{mode} \ + ~{annotations} \ + ~{if use_allele_specific_annotations then "--use-allele-specific-annotations" else ""} \ + ~{"-L " + interval_list} \ + --mode ~{mode} \ + ~{resource_args} + } + output { + File annots = "~{basename}.~{mode}.annot.hdf5" + File extracted_training_vcf = "~{basename}.~{mode}.vcf.gz" + File extracted_training_vcf_index = "~{basename}.~{mode}.vcf.gz.tbi" + Array[File] outputs = glob("~{basename}.~{mode}.*") + } + runtime { + docker: gatk_docker + disks: "local-disk " + disk_size + " LOCAL" + memory: memory_mb + " MiB" + } +} + +task TrainVariantAnnotationModel { + input { + String gatk_docker + File? gatk_override + File annots + String basename + String mode + String? model_backend + File? python_script + File? 
hyperparameters_json + + Int memory_mb = 14000 + Int command_mem = memory_mb - 1000 + } + Int disk_size = ceil(size(annots, "GB") + 100) + command <<< + set -e + + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override} + + mode=$(echo "~{mode}" | awk '{print toupper($0)}') + + gatk --java-options "-Xmx~{command_mem}m" \ + TrainVariantAnnotationsModel \ + --annotations-hdf5 ~{annots} \ + -O ~{basename} \ + ~{"--model-backend " + model_backend} \ + ~{"--python-script " + python_script} \ + ~{"--hyperparameters-json " + hyperparameters_json} \ + --mode $mode + + >>> + output { + Array[File] outputs = glob("~{basename}.~{mode}.*") + } + runtime { + docker: gatk_docker + disks: "local-disk " + disk_size + " LOCAL" + memory: memory_mb + " MiB" + } +} + +task ScoreVariantAnnotations { + input { + String gatk_docker + File? gatk_override + File vcf + File vcf_index + String basename + String mode + String? model_backend + File? python_script + String annotations + String resource_args + File extracted_training_vcf + File extracted_training_vcf_index + File? interval_list + Array[File] model_files + Boolean use_allele_specific_annotations + + Int memory_mb = 16000 + Int command_mem = memory_mb - 1000 + } + Int disk_size = ceil(size(vcf, "GB") *2 + 50) + + command { + zgrep -v '#' ~{vcf} > empty.txt + set -e + + if [ -s empty.txt ]; then + ln -s ~{sep=" . && ln -s " model_files} . 
+ + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override} + + gatk --java-options "-Xmx~{command_mem}m" \ + ScoreVariantAnnotations \ + ~{"-L " + interval_list} \ + -V ~{vcf} \ + -O ~{basename}.~{mode} \ + ~{"--model-backend " + model_backend} \ + ~{"--python-script " + python_script} \ + --model-prefix ~{basename} \ + ~{annotations} \ + ~{if use_allele_specific_annotations then "--use-allele-specific-annotations" else ""} \ + -mode ~{mode} \ + --resource:extracted,extracted=true ~{extracted_training_vcf} \ + ~{resource_args} + else + echo "Input VCF was empty so we'll return the same VCF that was input." + echo "Scores and annot hdf5 files will not be produced since the input was empty." + ln -s ~{vcf} ~{basename}.~{mode}.vcf.gz + ln -s ~{vcf_index} ~{basename}.~{mode}.vcf.gz.tbi + fi + } + output { + File? scores = "~{basename}.~{mode}.scores.hdf5" + File? annots = "~{basename}.~{mode}.annot.hdf5" + File output_vcf = "~{basename}.~{mode}.vcf.gz" + File output_vcf_index = "~{basename}.~{mode}.vcf.gz.tbi" + } + runtime { + docker: gatk_docker + disks: "local-disk " + disk_size + " LOCAL" + memory: memory_mb + " MiB" + } +} + diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CollectReadCounts.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CollectReadCounts.java index 878d2706cbc..d248d9c8a2a 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CollectReadCounts.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CollectReadCounts.java @@ -68,7 +68,7 @@ * to TSV format. Using HDF5 files with {@link CreateReadCountPanelOfNormals} * can decrease runtime, by reducing time spent on IO, so this is the default output format. * The HDF5 format contains information in the paths defined in {@link HDF5SimpleCountCollection}. HDF5 files may be viewed using - * hdfview or loaded in python using + * hdfview or loaded in Python using * PyTables or h5py. 
* The TSV format has a SAM-style header containing a read group sample name, a sequence dictionary, a row specifying the column headers contained in * {@link SimpleCountCollection.SimpleCountTableColumn}, and the corresponding entry rows. diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CreateReadCountPanelOfNormals.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CreateReadCountPanelOfNormals.java index 9c7ef423fb2..63afaab70bd 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CreateReadCountPanelOfNormals.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/CreateReadCountPanelOfNormals.java @@ -85,7 +85,7 @@ * Panel-of-normals file. * This is an HDF5 file containing the panel data in the paths defined in {@link HDF5SVDReadCountPanelOfNormals}. * HDF5 files may be viewed using hdfview - * or loaded in python using PyTables or h5py. + * or loaded in Python using PyTables or h5py. * * * diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/utils/HDF5Utils.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/utils/HDF5Utils.java index 8590e3476f2..870ce37b7dc 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/utils/HDF5Utils.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/utils/HDF5Utils.java @@ -135,7 +135,7 @@ public static double[][] readChunkedDoubleMatrix(final HDF5File file, * Given a large matrix, chunks the matrix into equally sized subsets of rows * (plus a subset containing the remainder, if necessary) and writes these submatrices to indexed sub-paths * to avoid a hard limit in Java HDF5 on the number of elements in a matrix given by - * {@code MAX_NUM_VALUES_PER_HDF5_MATRIX}. The number of chunks is determined by {@code maxChunkSize}, + * {@code MAX_NUMBER_OF_VALUES_PER_HDF5_MATRIX}. 
The number of chunks is determined by {@code maxChunkSize}, * which should be set appropriately for the desired number of columns. * * @param maxChunkSize The maximum number of values in each chunk. Decreasing this number will reduce diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/filtering/CreateFilteringFiles.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/filtering/CreateFilteringFiles.java index 44f46e8a903..8faf58d109b 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/filtering/CreateFilteringFiles.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/filtering/CreateFilteringFiles.java @@ -38,6 +38,8 @@ public final class CreateFilteringFiles extends VariantWalker { private List HEADER = Arrays.asList("filter_set_name","mode","location","ref","alt","vqslod","culprit","training_label","yng"); + private List HEADER_VQSR_LITE = + Arrays.asList("filter_set_name","mode","location","ref","alt","vqslod","culprit","training_label","yng", "calibration_sensitivity"); @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, @@ -64,6 +66,12 @@ public final class CreateFilteringFiles extends VariantWalker { optional = false) private String mode; + @Argument( + fullName = "classic", + doc = "Whether or not this is using classic VQSR or the newer VQSR-Lite", + optional = true) + private Boolean usingOldVQSR = null; + @Override public boolean requiresIntervals() { return false; @@ -76,7 +84,17 @@ public void onTraversalStart() { } catch (IOException ioe) { throw new GATKException("Unable to initialize writer", ioe); } - writer.setHeaderLine(HEADER); + + if (usingOldVQSR == null) { // default to using the old, or "classic" VQSR if the user specifies nothing + usingOldVQSR = Boolean.TRUE; + } + + if (usingOldVQSR) { + writer.setHeaderLine(HEADER); + } else { + writer.setHeaderLine(HEADER_VQSR_LITE); + } + // Set reference version -- TODO remove this in the future, also, can we get ref version from the 
header? ChromosomeEnum.setRefVersion(refVersion); @@ -99,17 +117,35 @@ public void apply(final VariantContext variant, final ReadsContext readsContext, // TODO: check with Laura -- should NEGATIVES also be NAYs? String yng = variant.hasAttribute("POSITIVE_TRAIN_SITE")?"Y":"G"; - List row = Arrays.asList( - filterSetName, - mode, - location.toString(), - ref, - alt, - vqslod, - culprit, - trainingLabel, - yng - ); + List row; + if (usingOldVQSR) { + row = Arrays.asList( + filterSetName, + mode, + location.toString(), + ref, + alt, + vqslod, + culprit, + trainingLabel, + yng + ); + } else { + // New VQSR-Lite has CALIBRATION_SENSITIVITY present, so add that column too. + String calibration_sensitivity = variant.getAttributeAsString("CALIBRATION_SENSITIVITY",""); + row = Arrays.asList( + filterSetName, + mode, + location.toString(), + ref, + alt, + vqslod, + culprit, + trainingLabel, + yng, + calibration_sensitivity + ); + } writer.getNewLineBuilder().setRow(row).write(); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ExtractVariantAnnotations.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ExtractVariantAnnotations.java new file mode 100644 index 00000000000..dc98d99072e --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ExtractVariantAnnotations.java @@ -0,0 +1,369 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable; + +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.VariantContext; +import org.apache.commons.lang3.tuple.Triple; +import org.apache.commons.math3.random.RandomGenerator; +import org.apache.commons.math3.random.RandomGeneratorFactory; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.BetaFeature; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; +import 
org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.engine.FeatureContext; +import org.broadinstitute.hellbender.engine.ReadsContext; +import org.broadinstitute.hellbender.engine.ReferenceContext; +import org.broadinstitute.hellbender.exceptions.GATKException; +import org.broadinstitute.hellbender.tools.copynumber.utils.HDF5Utils; +import org.broadinstitute.hellbender.tools.walkers.vqsr.VariantRecalibrator; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.LabeledVariantAnnotationsData; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.VariantType; +import picard.cmdline.programgroups.VariantFilteringProgramGroup; + +import java.io.File; +import java.util.List; +import java.util.Random; +import java.util.Set; +import java.util.TreeSet; +import java.util.stream.Collectors; + +/** + * Extracts site-level variant annotations, labels, and other metadata from a VCF file to HDF5 files. + * + *

+ * This tool is intended to be used as the first step in a variant-filtering workflow that supersedes the + * {@link VariantRecalibrator} workflow. This tool extracts site-level annotations, labels, and other relevant metadata + * from variant sites (or alleles, in allele-specific mode) that are or are not present in specified labeled + * resource VCFs (e.g., training or calibration VCFs). Input sites that are present in the resources are considered + * labeled; each site can have multiple labels if it is present in multiple resources. Other input sites that are + * not present in any resources are considered unlabeled and can be randomly sampled using reservoir sampling; + * extraction of these is optional. The outputs of the tool are HDF5 files containing the extracted data for + * labeled and (optional) unlabeled variant sets, as well as a sites-only indexed VCF containing the labeled variants. + *

+ * + *

+ * The extracted sets can be provided as input to the {@link TrainVariantAnnotationsModel} tool + * to produce an annotation-based model for scoring variant calls. This model can in turn be provided + * along with a VCF file to the {@link ScoreVariantAnnotations} tool, which assigns a score to each call + * (with a lower score indicating that a call is more likely to be an artifact and should perhaps be filtered). + * Each score can also be converted to a corresponding sensitivity with respect to a calibration set, if the latter is available. + *

+ * + *

+ * Note that annotations and metadata are collected in memory during traversal until they are written to HDF5 files + * upon completion of the traversal. Memory requirements thus roughly scale linearly with both the number of sites + * extracted and the number of annotations. + *

+ * + *

+ * Note that HDF5 files may be viewed using hdfview + * or loaded in Python using PyTables or h5py. + *

+ * + *

Inputs

+ * + *
    + *
  • + * Input VCF file. Site-level annotations will be extracted from the contained variants (or alleles, + * if the {@value USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME} argument is specified). + *
  • + *
  • + * Annotations to extract. + *
  • + *
  • + * Variant types (i.e., SNP and/or INDEL) to extract. Logic for determining variant type was retained from + * {@link VariantRecalibrator}; see {@link VariantType}. Extracting SNPs and INDELs separately in two runs of + * this tool can be useful if one wishes to extract different sets of annotations for each variant type, + * for example. + *
  • + *
  • + * (Optional) Resource VCF file(s). Each resource should be tagged with a label, which will be assigned to + * extracted sites that are present in the resource. In typical use, the {@value LabeledVariantAnnotationsData#TRAINING_LABEL} + * and {@value LabeledVariantAnnotationsData#CALIBRATION_LABEL} labels should be used to tag at least one resource + * apiece. The resulting sets of sites will be used for model training and conversion of scores to + * calibration-set sensitivity, respectively; the trustworthiness of the respective resources should be + * taken into account accordingly. The {@value LabeledVariantAnnotationsData#SNP_LABEL} label is + * reserved by the tool, as it is used to label sites determined to be SNPs, and thus it cannot be used to tag + * provided resources. + *
  • + *
  • + * (Optional) Maximum number of unlabeled variants (or alleles) to randomly sample with reservoir sampling. + * If nonzero, annotations will also be extracted from unlabeled sites (i.e., those that are not present + * in the labeled resources). + *
  • + *
  • + * Output prefix. + * This is used as the basename for output files. + *
  • + *
+ * + *

Outputs

+ * + *
    + *
  • + * (Optional) Labeled-annotations HDF5 file (.annot.hdf5). Annotation data and metadata for those sites that + * are present in labeled resources are stored in the following HDF5 directory structure: + * + *

    + * |--- alleles
    + * | |--- alt
    + * | |--- ref
    + * |--- annotations
    + * | |--- chunk_0
    + * | |--- ...
    + * | |--- chunk_{num_chunks - 1}
    + * | |--- names
    + * | |--- num_chunks
    + * | |--- num_columns
    + * | |--- num_rows
    + * |--- intervals
    + * | |--- indexed_contig_names
    + * | |--- transposed_index_start_end
    + * |--- labels
    + * | |--- snp
    + * | |--- ... (e.g., training, calibration, etc.)
    + * | |--- ...
    + *

    + * + *

    + * Here, each chunk is a double matrix, with dimensions given by (number of sites in the chunk) x (number of annotations). + * See the methods {@link HDF5Utils#writeChunkedDoubleMatrix} and {@link HDF5Utils#writeIntervals} for additional details. + * If {@value USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME} is specified, each record corresponds to an individual allele; + * otherwise, each record corresponds to a variant site, which may contain multiple alleles. + * Storage of alleles can be omitted using the {@value OMIT_ALLELES_IN_HDF5_LONG_NAME} argument, which will reduce + * the size of the file. This file will only be produced if resources are provided and the number of extracted + * labeled sites is nonzero. + *

    + * + *
  • + *
  • + * Labeled sites-only VCF file and index. The VCF will not be gzipped if the {@value DO_NOT_GZIP_VCF_OUTPUT_LONG_NAME} + * argument is set to true. The VCF can be provided as a resource in subsequent runs of + * {@link ScoreVariantAnnotations} and used to indicate labeled sites that were extracted. + * This can be useful if the {@value StandardArgumentDefinitions#INTERVALS_LONG_NAME} argument was used to + * subset sites in training or calibration resources for extraction; this may occur when setting up + * training/validation/test splits, for example. Note that records for the random sample of unlabeled sites are + * currently not included in the VCF. + *
  • + *
  • + * (Optional) Unlabeled-annotations HDF5 file. This will have the same directory structure as in the + * labeled-annotations HDF5 file. However, note that records are currently written in the order they + * appear in the downsampling reservoir after random sampling, and hence, are not in genomic order. + * This file will only be produced if a nonzero value of the {@value MAXIMUM_NUMBER_OF_UNLABELED_VARIANTS_LONG_NAME} + * argument is provided. + *
  • + *
+ * + *

Usage examples

+ * + *

+ * Extract annotations from training/calibration SNP/INDEL sites, producing the outputs + * 1) {@code extract.annot.hdf5}, 2) {@code extract.vcf.gz}, and 3) {@code extract.vcf.gz.tbi}. + * The HDF5 file can then be provided to {@link TrainVariantAnnotationsModel} + * to train a model using a positive-only approach. Note that the {@value MODE_LONG_NAME} arguments are made + * explicit here, although both SNP and INDEL modes are selected by default. + * + *

+ *     gatk ExtractVariantAnnotations \
+ *          -V input.vcf \
+ *          -A annotation_1 \
+ *          ...
+ *          -A annotation_N \
+ *          --mode SNP \
+ *          --resource:snp-training,training=true snp-training.vcf \
+ *          --resource:snp-calibration,calibration=true snp-calibration.vcf \
+ *          --mode INDEL \
+ *          --resource:indel-training,training=true indel-training.vcf \
+ *          --resource:indel-calibration,calibration=true indel-calibration.vcf \
+ *          -O extract
+ * 
+ *

+ * + *

+ * Extract annotations from both training/calibration SNP/INDEL sites and a random sample of + * 1000000 unlabeled (i.e., non-training/calibration) sites, producing the outputs + * 1) {@code extract.annot.hdf5}, 2) {@code extract.unlabeled.annot.hdf5}, 3) {@code extract.vcf.gz}, + * and 4) {@code extract.vcf.gz.tbi}. The HDF5 files can then be provided to {@link TrainVariantAnnotationsModel} + * to train a model using a positive-negative approach (similar to that used in {@link VariantRecalibrator}). + * Note that the {@value MODE_LONG_NAME} arguments are made explicit here, although both SNP and INDEL modes are + * selected by default. + * + *

+ *     gatk ExtractVariantAnnotations \
+ *          -V input.vcf \
+ *          -A annotation_1 \
+ *          ...
+ *          -A annotation_N \
+ *          --mode SNP \
+ *          --resource:snp-training,training=true snp-training.vcf \
+ *          --resource:snp-calibration,calibration=true snp-calibration.vcf \
+ *          --mode INDEL \
+ *          --resource:indel-training,training=true indel-training.vcf \
+ *          --resource:indel-calibration,calibration=true indel-calibration.vcf \
+ *          --maximum-number-of-unlabeled-variants 1000000 \
+ *          -O extract
+ * 
+ *

+ * + *

+ * In the (atypical) event that resource VCFs are unavailable, one can still extract annotations from a random sample of + * unlabeled sites, producing the outputs 1) {@code extract.unlabeled.annot.hdf5}, + * 2) {@code extract.vcf.gz} (which will contain no records), and 3) {@code extract.vcf.gz.tbi}. + * This random sample cannot be used by {@link TrainVariantAnnotationsModel}, but may still be useful for + * exploratory analyses. Note that the {@value MODE_LONG_NAME} arguments are made explicit here, although both + * SNP and INDEL modes are selected by default. + * + *

+ *     gatk ExtractVariantAnnotations \
+ *          -V input.vcf \
+ *          -A annotation_1 \
+ *          ...
+ *          -A annotation_N \
+ *          --mode SNP \
+ *          --mode INDEL \
+ *          --maximum-number-of-unlabeled-variants 1000000 \
+ *          -O extract
+ * 
+ *

+ * + * DEVELOPER NOTE: See documentation in {@link LabeledVariantAnnotationsWalker}. + * + * @author Samuel Lee <slee@broadinstitute.org> + */ +@CommandLineProgramProperties( + summary = "Extracts site-level variant annotations, labels, and other metadata from a VCF file to HDF5 files.", + oneLineSummary = "Extracts site-level variant annotations, labels, and other metadata from a VCF file to HDF5 files", + programGroup = VariantFilteringProgramGroup.class +) +@DocumentedFeature +@BetaFeature +public final class ExtractVariantAnnotations extends LabeledVariantAnnotationsWalker { + + public static final String MAXIMUM_NUMBER_OF_UNLABELED_VARIANTS_LONG_NAME = "maximum-number-of-unlabeled-variants"; + public static final String RESERVOIR_SAMPLING_RANDOM_SEED_LONG_NAME = "reservoir-sampling-random-seed"; + + public static final String UNLABELED_TAG = ".unlabeled"; + + @Argument( + fullName = MAXIMUM_NUMBER_OF_UNLABELED_VARIANTS_LONG_NAME, + doc = "Maximum number of unlabeled variants to extract. " + + "If greater than zero, reservoir sampling will be used to randomly sample this number " + + "of sites from input sites that are not present in the specified resources. 
" + + "Choice of this number should be guided by considerations for training the negative model in " + + "TrainVariantAnnotationsModel; users may wish to choose a number that is comparable to the " + + "expected size of the labeled training set or that is compatible with available memory resources.", + minValue = 0) + private int maximumNumberOfUnlabeledVariants = 0; + + @Argument( + fullName = RESERVOIR_SAMPLING_RANDOM_SEED_LONG_NAME, + doc = "Random seed to use for reservoir sampling of unlabeled variants.") + private int reservoirSamplingRandomSeed = 0; + + private RandomGenerator rng; + private LabeledVariantAnnotationsData unlabeledDataReservoir; // will not be sorted in genomic order + private int unlabeledIndex = 0; + + @Override + public void afterOnTraversalStart() { + if (!resourceLabels.contains(LabeledVariantAnnotationsData.TRAINING_LABEL)) { + logger.warn("No training set found! If you are using the downstream TrainVariantAnnotationsModel and ScoreVariantAnnotations tools, " + + "provide sets of known polymorphic loci marked with the training=true feature input tag. " + + "For example, --resource:hapmap,training=true hapmap.vcf"); + } + if (!resourceLabels.contains(LabeledVariantAnnotationsData.CALIBRATION_LABEL)) { + logger.warn("No calibration set found! If you are using the downstream TrainVariantAnnotationsModel and ScoreVariantAnnotations tools " + + "and wish to convert scores to sensitivity with respect to a calibration set of variants, " + + "provide sets of known polymorphic loci marked with the calibration=true feature input tag. " + + "For example, --resource:hapmap,calibration=true hapmap.vcf"); + } + + rng = RandomGeneratorFactory.createRandomGenerator(new Random(reservoirSamplingRandomSeed)); + unlabeledDataReservoir = maximumNumberOfUnlabeledVariants == 0 + ? 
null + : new LabeledVariantAnnotationsData(annotationNames, resourceLabels, useASAnnotations, maximumNumberOfUnlabeledVariants); // we pass resourceLabels here so that both labeled and unlabeled + } // HDF5 files will have the same directory structure + + @Override + protected void nthPassApply(final VariantContext variant, + final ReadsContext readsContext, + final ReferenceContext referenceContext, + final FeatureContext featureContext, + final int n) { + if (n == 0) { + final List, VariantType, TreeSet>> metadata = extractVariantMetadata( + variant, featureContext, unlabeledDataReservoir != null); + final boolean isVariantExtracted = !metadata.isEmpty(); + if (isVariantExtracted) { + final boolean isUnlabeled = metadata.stream().map(Triple::getRight).allMatch(Set::isEmpty); + if (!isUnlabeled) { + addExtractedVariantToData(data, variant, metadata); + writeExtractedVariantToVCF(variant, metadata); + } else { + // Algorithm R for reservoir sampling: https://en.wikipedia.org/wiki/Reservoir_sampling#Simple_algorithm + if (unlabeledIndex < maximumNumberOfUnlabeledVariants) { + addExtractedVariantToData(unlabeledDataReservoir, variant, metadata); + } else { + final int j = rng.nextInt(unlabeledIndex); + if (j < maximumNumberOfUnlabeledVariants) { + setExtractedVariantInData(unlabeledDataReservoir, variant, metadata, j); + } + } + unlabeledIndex++; + } + } + } + } + + @Override + protected void afterNthPass(final int n) { + if (n == 0) { + writeAnnotationsToHDF5(); + data.clear(); + if (unlabeledDataReservoir != null) { + writeUnlabeledAnnotationsToHDF5(); + // TODO write extracted unlabeled variants to VCF, which can be used to mark extraction in scoring step + unlabeledDataReservoir.clear(); + } + if (vcfWriter != null) { + vcfWriter.close(); + } + } + } + + @Override + public Object onTraversalSuccess() { + + logger.info(String.format("%s complete.", getClass().getSimpleName())); + + return null; + } + + private static void setExtractedVariantInData(final 
LabeledVariantAnnotationsData data, + final VariantContext variant, + final List, VariantType, TreeSet>> metadata, + final int index) { + data.set(index, variant, + metadata.stream().map(Triple::getLeft).collect(Collectors.toList()), + metadata.stream().map(Triple::getMiddle).collect(Collectors.toList()), + metadata.stream().map(Triple::getRight).collect(Collectors.toList())); + } + + private void writeUnlabeledAnnotationsToHDF5() { + final File outputUnlabeledAnnotationsFile = new File(outputPrefix + UNLABELED_TAG + ANNOTATIONS_HDF5_SUFFIX); + if (unlabeledDataReservoir.size() == 0) { + throw new GATKException(String.format("No unlabeled variants were present in the input VCF. " + + "Consider setting the %s argument to 0.", MAXIMUM_NUMBER_OF_UNLABELED_VARIANTS_LONG_NAME)); + } + for (final VariantType variantType : variantTypesToExtract) { + logger.info(String.format("Extracted unlabeled annotations for %d variants of type %s.", + unlabeledDataReservoir.getVariantTypeFlat().stream().mapToInt(t -> t == variantType ? 
1 : 0).sum(), variantType)); + } + logger.info(String.format("Extracted unlabeled annotations for %s total variants.", unlabeledDataReservoir.size())); + + logger.info("Writing unlabeled annotations..."); + // TODO coordinate sort + unlabeledDataReservoir.writeHDF5(outputUnlabeledAnnotationsFile, omitAllelesInHDF5); + logger.info(String.format("Unlabeled annotations and metadata written to %s.", outputUnlabeledAnnotationsFile.getAbsolutePath())); + } +} \ No newline at end of file diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/LabeledVariantAnnotationsWalker.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/LabeledVariantAnnotationsWalker.java new file mode 100644 index 00000000000..e1ebf3ce608 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/LabeledVariantAnnotationsWalker.java @@ -0,0 +1,409 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable; + +import com.google.common.collect.Sets; +import htsjdk.samtools.SAMSequenceDictionary; +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import htsjdk.variant.vcf.VCFConstants; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLine; +import htsjdk.variant.vcf.VCFHeaderLineType; +import htsjdk.variant.vcf.VCFInfoHeaderLine; +import org.apache.commons.collections4.ListUtils; +import org.apache.commons.lang3.tuple.Triple; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.engine.FeatureContext; +import org.broadinstitute.hellbender.engine.FeatureInput; +import org.broadinstitute.hellbender.engine.MultiplePassVariantWalker; +import org.broadinstitute.hellbender.exceptions.GATKException; 
+import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.tools.copynumber.arguments.CopyNumberArgumentValidationUtils; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.LabeledVariantAnnotationsData; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.VariantType; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.VariantAnnotationsScorer; +import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; +import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines; +import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils; +import org.broadinstitute.hellbender.utils.variant.VcfUtils; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.EnumSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; +import java.util.stream.Collectors; + +/** + * Base walker for both {@link ExtractVariantAnnotations} and {@link ScoreVariantAnnotations}, + * which enforces identical variant-extraction behavior in both tools via {@link #extractVariantMetadata}. + * + * This base implementation covers functionality for {@link ExtractVariantAnnotations}. 
Namely, it is a single-pass + * walker, performing the operations: + * + * - nthPassApply(n = 0) + * - if variant/alleles pass filters and variant-type/resource-match checks, then: + * - add variant/alleles to a {@link LabeledVariantAnnotationsData} collection + * - write variant/alleles with labels appended to a sites-only VCF file + * - afterNthPass(n = 0) + * - write the resulting {@link LabeledVariantAnnotationsData} collection to an HDF5 file + * + * This results in the following output: + * + * - an HDF5 file, with the directory structure documented in {@link LabeledVariantAnnotationsData#writeHDF5}; + * note that the matrix of annotations contains a single row per datum (i.e., per allele, in allele-specific mode, + * and per variant otherwise) + * - a sites-only VCF file, containing a single line per extracted variant, with labels appended + * + * In contrast, the {@link ScoreVariantAnnotations} implementation overrides methods to yield a two-pass walker, + * performing the operations: + * + * - nthPassApply(n = 0) + * - if variant/alleles pass filters and variant-type checks, then: + * - add variant/alleles to a {@link LabeledVariantAnnotationsData} collection + * - afterNthPass(n = 0) + * - write the resulting {@link LabeledVariantAnnotationsData} collection to an HDF5 file + * - pass this annotations HDF5 file to a {@link VariantAnnotationsScorer}, which generates and writes scores to an HDF5 file + * - read the scores back in and load them into an iterator + * - nthPassApply(n = 1) + * - if variant/alleles pass filters and variant-type checks (which are identical to the first pass), then: + * - draw the corresponding score (or scores, in allele-specific mode) from the iterator + * - write the variant (with all alleles, not just those extracted) with the score + * (or best score, in allele-specific mode) appended to a VCF file + * - else: + * - write an unprocessed copy of the variant to a VCF file + * + * This results in the following output: + * + * - 
an HDF5 file, as above + * - a VCF file, containing the input variants, with labels, scores, and filters appended/applied for those passing variant-type checks + */ +public abstract class LabeledVariantAnnotationsWalker extends MultiplePassVariantWalker { + + public static final String MODE_LONG_NAME = "mode"; + public static final String USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME = "use-allele-specific-annotations"; + public static final String IGNORE_FILTER_LONG_NAME = "ignore-filter"; + public static final String IGNORE_ALL_FILTERS_LONG_NAME = "ignore-all-filters"; + public static final String DO_NOT_TRUST_ALL_POLYMORPHIC_LONG_NAME = "do-not-trust-all-polymorphic"; + public static final String RESOURCE_MATCHING_STRATEGY_LONG_NAME = "resource-matching-strategy"; + public static final String OMIT_ALLELES_IN_HDF5_LONG_NAME = "omit-alleles-in-hdf5"; + public static final String DO_NOT_GZIP_VCF_OUTPUT_LONG_NAME = "do-not-gzip-vcf-output"; + + public static final String ANNOTATIONS_HDF5_SUFFIX = ".annot.hdf5"; + + public static final String RESOURCE_LABEL_INFO_HEADER_LINE_FORMAT_STRING = "This site was labeled as %s according to resources"; + + enum ResourceMatchingStrategy { + START_POSITION, START_POSITION_AND_GIVEN_REPRESENTATION, START_POSITION_AND_MINIMAL_REPRESENTATION + } + + @Argument( + fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, + shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, + doc = "Prefix for output filenames.") + String outputPrefix; + + @Argument( + fullName = StandardArgumentDefinitions.RESOURCE_LONG_NAME, + doc = "Resource VCFs used to label extracted variants.", + optional = true) + private List> resources = new ArrayList<>(10); + + @Argument( + fullName = StandardArgumentDefinitions.ANNOTATION_LONG_NAME, + shortName = StandardArgumentDefinitions.ANNOTATION_SHORT_NAME, + doc = "Names of the annotations to extract. 
Note that a requested annotation may in fact not be present " + + "at any extraction site; NaN missing values will be generated for such annotations.", + minElements = 1) + List annotationNames = new ArrayList<>(); + + @Argument( + fullName = MODE_LONG_NAME, + doc = "Variant types to extract.", + minElements = 1) + private List variantTypesToExtractList = new ArrayList<>(Arrays.asList(VariantType.SNP, VariantType.INDEL)); + + @Argument( + fullName = USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME, + doc = "If true, use the allele-specific versions of the specified annotations.", + optional = true) + boolean useASAnnotations = false; + + @Argument( + fullName = IGNORE_FILTER_LONG_NAME, + doc = "Ignore the specified filter(s) in the input VCF.", + optional = true) + private List ignoreInputFilters = new ArrayList<>(); + + @Argument( + fullName = IGNORE_ALL_FILTERS_LONG_NAME, + doc = "If true, ignore all filters in the input VCF.", + optional = true) + private boolean ignoreAllFilters = false; + + // TODO this is a perhaps vestigial argument inherited from VQSR; its impact and necessity could be reevaluated + @Argument( + fullName = DO_NOT_TRUST_ALL_POLYMORPHIC_LONG_NAME, + doc = "If true, do not trust that unfiltered records in the resources contain only polymorphic sites. " + + "This may increase runtime if the resources are not sites-only VCFs.", + optional = true) + private boolean doNotTrustAllPolymorphic = false; + + + @Argument( + fullName = RESOURCE_MATCHING_STRATEGY_LONG_NAME, + doc = "The strategy to use for determining whether an input variant is present in a resource " + + "in non-allele-specific mode (--" + USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME + " false). " + + "START_POSITION: Start positions of input and resource variants must match. " + + "START_POSITION_AND_GIVEN_REPRESENTATION: The intersection of the sets of input and resource alleles " + + "(in their given representations) must also be non-empty. 
" + + "START_POSITION_AND_MINIMAL_REPRESENTATION: The intersection of the sets of input and resource alleles " + + "(after converting alleles to their minimal representations) must also be non-empty. " + + "This argument has no effect in allele-specific mode (--" + USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME + " true), " + + "in which the minimal representations of the input and resource alleles must match.", + optional = true) + private ResourceMatchingStrategy resourceMatchingStrategy = ResourceMatchingStrategy.START_POSITION; + @Argument( + fullName = OMIT_ALLELES_IN_HDF5_LONG_NAME, + doc = "If true, omit alleles in output HDF5 files in order to decrease file sizes.", + optional = true + ) + boolean omitAllelesInHDF5 = false; + + @Argument( + fullName = DO_NOT_GZIP_VCF_OUTPUT_LONG_NAME, + doc = "If true, VCF output will not be compressed.", + optional = true + ) + boolean doNotGZIPVCFOutput = false; + + private final Set ignoreInputFilterSet = new TreeSet<>(); + Set variantTypesToExtract; + TreeSet resourceLabels = new TreeSet<>(); + + File outputAnnotationsFile; + VariantContextWriter vcfWriter; + + LabeledVariantAnnotationsData data; + + @Override + public void onTraversalStart() { + + ignoreInputFilterSet.addAll(ignoreInputFilters); + + variantTypesToExtract = EnumSet.copyOf(variantTypesToExtractList); + + outputAnnotationsFile = new File(outputPrefix + ANNOTATIONS_HDF5_SUFFIX); + final String vcfSuffix = doNotGZIPVCFOutput ? 
".vcf" : ".vcf.gz"; + final File outputVCFFile = new File(outputPrefix + vcfSuffix); + + // TODO this validation method should perhaps be moved outside of the CNV code + CopyNumberArgumentValidationUtils.validateOutputFiles(outputAnnotationsFile, outputVCFFile); + + for (final FeatureInput resource : resources) { + final TreeSet trackResourceLabels = resource.getTagAttributes().entrySet().stream() + .filter(e -> e.getValue().equals("true")) + .map(Map.Entry::getKey) + .sorted() + .collect(Collectors.toCollection(TreeSet::new)); + resourceLabels.addAll(trackResourceLabels); + logger.info( String.format("Found %s track: labels = %s", resource.getName(), trackResourceLabels)); + } + resourceLabels.forEach(String::intern); // TODO evaluate if this affects memory usage and remove if not needed + + if (resourceLabels.contains(LabeledVariantAnnotationsData.SNP_LABEL)) { + throw new UserException.BadInput(String.format("The resource label \"%s\" is reserved for labeling variant types.", + LabeledVariantAnnotationsData.SNP_LABEL)); + } + + data = new LabeledVariantAnnotationsData(annotationNames, resourceLabels, useASAnnotations); + + vcfWriter = createVCFWriter(outputVCFFile); + vcfWriter.writeHeader(constructVCFHeader(data.getSortedLabels())); + + afterOnTraversalStart(); // perform additional validation, set modes in child tools, etc. 
+ } + + public void afterOnTraversalStart() { + // override + } + + @Override + protected int numberOfPasses() { + return 1; + } + + @Override + public Object onTraversalSuccess() { + return null; + } + + static void addExtractedVariantToData(final LabeledVariantAnnotationsData data, + final VariantContext variant, + final List, VariantType, TreeSet>> metadata) { + data.add(variant, + metadata.stream().map(Triple::getLeft).collect(Collectors.toList()), + metadata.stream().map(Triple::getMiddle).collect(Collectors.toList()), + metadata.stream().map(Triple::getRight).collect(Collectors.toList())); + } + + /** + * Flattens the per-triple metadata (all alt alleles, and the union of all labels) and delegates to the + * three-argument overload, which writes a single record for the variant. + */ + void writeExtractedVariantToVCF(final VariantContext variant, + final List, VariantType, TreeSet>> metadata) { + /* collect the union of labels into a TreeSet so that it is sorted, as the three-argument overload assumes */ + writeExtractedVariantToVCF(variant, + metadata.stream().map(Triple::getLeft).flatMap(List::stream).collect(Collectors.toList()), + metadata.stream().map(Triple::getRight).flatMap(Set::stream).collect(Collectors.toCollection(TreeSet::new))); + } + + void writeAnnotationsToHDF5() { + if (data.size() == 0) { + logger.warn("Found no input variants for extraction. This may be because the specified " + + "genomic region contains no input variants of the requested type(s) or, if extracting " + + "training labels, because none of the input variants were contained in the resource VCFs " + + "or no resource VCFs were provided. The annotations HDF5 file will not be generated."); + return; + } + for (final VariantType variantType : variantTypesToExtract) { + logger.info(String.format("Extracted annotations for %d variants of type %s.", + data.getVariantTypeFlat().stream().mapToInt(t -> t == variantType ? 1 : 0).sum(), variantType)); + } + for (final String label : data.getSortedLabels()) { + logger.info(String.format("Extracted annotations for %d variants labeled as %s.", + data.isLabelFlat(label).stream().mapToInt(b -> b ? 
1 : 0).sum(), label)); + } + logger.info(String.format("Extracted annotations for %s total variants.", data.size())); + + logger.info("Writing annotations..."); + data.writeHDF5(outputAnnotationsFile, omitAllelesInHDF5); + logger.info(String.format("Annotations and metadata written to %s.", outputAnnotationsFile.getAbsolutePath())); + } + + /** + * Writes a sites-only VCF containing the extracted variants and corresponding labels. + */ + void writeExtractedVariantToVCF(final VariantContext vc, + final List altAlleles, + final Set labels) { + final List alleles = ListUtils.union(Collections.singletonList(vc.getReference()), altAlleles); + final VariantContextBuilder builder = new VariantContextBuilder( + vc.getSource(), vc.getContig(), vc.getStart(), vc.getEnd(), alleles); + labels.forEach(l -> builder.attribute(l, true)); // labels should already be sorted as a TreeSet + vcfWriter.add(builder.make()); + } + + // modified from VQSR code + // TODO we're just writing a standard sites-only VCF here, maybe there's a nicer way to do this? + VCFHeader constructVCFHeader(final List sortedLabels) { + Set hInfo = sortedLabels.stream() + .map(l -> new VCFInfoHeaderLine(l, 1, VCFHeaderLineType.Flag, String.format(RESOURCE_LABEL_INFO_HEADER_LINE_FORMAT_STRING, l))) + .collect(Collectors.toCollection(TreeSet::new)); + hInfo.add(GATKVCFHeaderLines.getFilterLine(VCFConstants.PASSES_FILTERS_v4)); + final SAMSequenceDictionary sequenceDictionary = getBestAvailableSequenceDictionary(); + if (sequenceDictionary != null) { + hInfo = VcfUtils.updateHeaderContigLines(hInfo, referenceArguments.getReferencePath(), sequenceDictionary, true); + } + hInfo.addAll(getDefaultToolVCFHeaderLines()); + return new VCFHeader(hInfo); + } + + /** + * Performs variant-filter and variant-type checks to determine variants/alleles suitable for extraction, and returns + * a corresponding list of metadata. 
This method should not be overridden, as it is intended to enforce identical + * variant-extraction behavior in all child tools. Logic here and below for filtering and determining variant type + * was retained from VQSR, but has been heavily refactored. + */ + final List, VariantType, TreeSet>> extractVariantMetadata(final VariantContext vc, + final FeatureContext featureContext, + final boolean isExtractUnlabeled) { + // if variant is filtered, do not consume here + if (vc == null || !(ignoreAllFilters || vc.isNotFiltered() || ignoreInputFilterSet.containsAll(vc.getFilters()))) { + return Collections.emptyList(); + } + if (!useASAnnotations) { + // in non-allele-specific mode, get a singleton list of the triple + // (list of alt alleles passing variant-type and resource-match checks, variant type, set of labels) + final VariantType variantType = VariantType.getVariantType(vc); + if (variantTypesToExtract.contains(variantType)) { + final TreeSet matchingResourceLabels = findMatchingResourceLabels(vc, null, featureContext); + if (isExtractUnlabeled || !matchingResourceLabels.isEmpty()) { + return Collections.singletonList(Triple.of(vc.getAlternateAlleles(), variantType, matchingResourceLabels)); + } + } + } else { + // in allele-specific mode, get a list containing the triples + // (singleton list of alt allele, variant type, set of labels) + // corresponding to alt alleles that pass variant-type and resource-match checks + return vc.getAlternateAlleles().stream() + .filter(a -> !GATKVCFConstants.isSpanningDeletion(a)) + .filter(a -> variantTypesToExtract.contains(VariantType.getAlleleSpecificVariantType(vc, a))) + .map(a -> Triple.of(Collections.singletonList(a), VariantType.getAlleleSpecificVariantType(vc, a), + findMatchingResourceLabels(vc, a, featureContext))) + .filter(t -> isExtractUnlabeled || !t.getRight().isEmpty()) + .collect(Collectors.toList()); + } + // if variant-type and resource-match checks failed, return an empty list + return 
Collections.emptyList(); + } + + private TreeSet findMatchingResourceLabels(final VariantContext vc, + final Allele altAllele, + final FeatureContext featureContext) { + final TreeSet matchingResourceLabels = new TreeSet<>(); + for (final FeatureInput resource : resources) { + final List resourceVCs = featureContext.getValues(resource, featureContext.getInterval().getStart()); + for (final VariantContext resourceVC : resourceVCs) { + if (useASAnnotations && !doAllelesMatch(vc.getReference(), altAllele, resourceVC)) { + continue; + } + if (isMatchingVariant(vc, resourceVC, !doNotTrustAllPolymorphic, resourceMatchingStrategy)) { + resource.getTagAttributes().entrySet().stream() + .filter(e -> e.getValue().equals("true")) + .map(Map.Entry::getKey) + .forEach(matchingResourceLabels::add); + } + } + } + return matchingResourceLabels; + } + + private static boolean isMatchingVariant(final VariantContext vc, + final VariantContext resourceVC, + final boolean trustAllPolymorphic, + final ResourceMatchingStrategy resourceMatchingStrategy) { + if (resourceVC != null && resourceVC.isNotFiltered() && resourceVC.isVariant() && VariantType.checkVariantType(vc, resourceVC) && + (trustAllPolymorphic || !resourceVC.hasGenotypes() || resourceVC.isPolymorphicInSamples())) { // this is the check originally performed by VQSR + switch (resourceMatchingStrategy) { + case START_POSITION: + return true; + case START_POSITION_AND_GIVEN_REPRESENTATION: + // we further require that at least one alt allele is present in the resource alt alleles, but don't reconcile representations + return !Sets.intersection(Sets.newHashSet(vc.getAlternateAlleles()), Sets.newHashSet(resourceVC.getAlternateAlleles())).isEmpty(); + case START_POSITION_AND_MINIMAL_REPRESENTATION: + // we further require that at least one alt allele is present in the resource alt alleles, and do reconcile representations + return vc.getAlternateAlleles().stream() + .anyMatch(altAllele -> 
GATKVariantContextUtils.isAlleleInList(vc.getReference(), altAllele, resourceVC.getReference(), resourceVC.getAlternateAlleles())); + default: + throw new GATKException.ShouldNeverReachHereException("Unknown ResourceMatchingStrategy."); + } + } + return false; + } + + private static boolean doAllelesMatch(final Allele refAllele, + final Allele altAllele, + final VariantContext resourceVC) { + if (altAllele == null) { + return true; + } + try { + return GATKVariantContextUtils.isAlleleInList(refAllele, altAllele, resourceVC.getReference(), resourceVC.getAlternateAlleles()); + } catch (final IllegalStateException e) { + throw new IllegalStateException("Reference allele mismatch at position " + resourceVC.getContig() + ':' + resourceVC.getStart() + " : ", e); + } + } +} \ No newline at end of file diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ScoreVariantAnnotations.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ScoreVariantAnnotations.java new file mode 100644 index 00000000000..fbbbed81faf --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ScoreVariantAnnotations.java @@ -0,0 +1,627 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable; + +import com.google.common.primitives.Doubles; +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; +import htsjdk.variant.vcf.VCFFilterHeaderLine; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLine; +import htsjdk.variant.vcf.VCFHeaderLineType; +import htsjdk.variant.vcf.VCFInfoHeaderLine; +import org.apache.commons.lang3.tuple.Triple; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.BetaFeature; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import 
org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.engine.FeatureContext; +import org.broadinstitute.hellbender.engine.ReadsContext; +import org.broadinstitute.hellbender.engine.ReferenceContext; +import org.broadinstitute.hellbender.exceptions.GATKException; +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.tools.copynumber.arguments.CopyNumberArgumentValidationUtils; +import org.broadinstitute.hellbender.tools.walkers.vqsr.VariantRecalibrator; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.LabeledVariantAnnotationsData; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.VariantType; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.BGMMVariantAnnotationsScorer; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.PythonSklearnVariantAnnotationsScorer; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.VariantAnnotationsModel; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.VariantAnnotationsModelBackend; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.VariantAnnotationsScorer; +import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.utils.io.IOUtils; +import org.broadinstitute.hellbender.utils.io.Resource; +import org.broadinstitute.hellbender.utils.python.PythonScriptExecutor; +import picard.cmdline.programgroups.VariantFilteringProgramGroup; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Set; +import java.util.TreeSet; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** + * Scores 
variant calls in a VCF file based on site-level annotations using a previously trained model. + * + *

+ * This tool is intended to be used as the last step in a variant-filtering workflow that supersedes the + * {@link VariantRecalibrator} workflow. Using a previously trained model produced by {@link TrainVariantAnnotationsModel}, + * this tool assigns a score to each call (with a lower score indicating that a call is more likely to be an artifact). + * Each score can also be converted to a corresponding sensitivity with respect to a calibration set, if the latter is available. + * Each VCF record can also be annotated with additional resource labels and/or hard filtered based on its + * calibration-set sensitivity, if desired. + *

+ * + *

+ * Note that annotations and metadata are collected in memory during traversal until they are written to HDF5 files + * upon completion of the traversal. Memory and disk requirements thus roughly scale linearly with both the number + * of sites scored and the number of annotations. For large callsets, this tool may be run in parallel over separate + * genomic shards using the {@value StandardArgumentDefinitions#INTERVALS_LONG_NAME} argument as usual. + *

+ * + *

+ * Scores and annotations are also output to HDF5 files, which may be viewed using + * hdfview or loaded in Python using + * PyTables or h5py. + *

+ * + *

Inputs

+ * + *
    + *
  • + * Input VCF file. Site-level annotations will be extracted from the contained variants (or alleles, + * if the {@value USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME} argument is specified). + *
  • + *
  • + * Annotations to use for scoring. These should be identical to those used in the {@link ExtractVariantAnnotations} + * step to create the training set. + *
  • + *
  • + * Variant types (i.e., SNP and/or INDEL) to score. Logic for determining variant type was retained from + * {@link VariantRecalibrator}; see {@link VariantType}. To use different models for SNPs and INDELs + * (e.g., if it is desired to use different sets of annotations for each variant type), one can first run + * this tool to score SNPs and then again on the resulting output to score INDELs. + *
  • + *
  • + * Model prefix. This should denote the path of model files produced by {@link TrainVariantAnnotationsModel}. + *
  • + *
  • + * (Optional) Model backend. This should be identical to that specified in {@link TrainVariantAnnotationsModel}. + * The default Python IsolationForest implementation requires either the GATK Python environment + * or that certain Python packages (argparse, h5py, numpy, sklearn, and dill) are otherwise available. + * A custom backend can also be specified in conjunction with the {@value PYTHON_SCRIPT_LONG_NAME} argument. + *
  • + *
  • + * (Optional) Resource VCF file(s). See the corresponding documentation in {@link ExtractVariantAnnotations}. + * In typical usage, the same resource VCFs and tags provided to that tool should also be provided here. + * In addition, the sites-only VCF that is produced by that tool can also be provided here and used to + * mark those labeled sites that were extracted, which can be useful if these are a subset of the resource sites. + *
  • + *
  • + * (Optional) Calibration-set sensitivity thresholds for SNPs and INDELs. If the corresponding SNP or INDEL + * calibration-set scores are available in the provided model files, sites that have a calibration-set + * sensitivity falling above the corresponding threshold (i.e., a score falling below the corresponding + * score threshold) will have a filter applied. + *
  • + *
  • + * Output prefix. + * This is used as the basename for output files. + *
  • + *
+ * + *

Outputs

+ * + *
    + *
  • + * Scored VCF file and index. The VCF will not be gzipped if the {@value DO_NOT_GZIP_VCF_OUTPUT_LONG_NAME} + * argument is set to true. The INFO field in each VCF record will be annotated with: + * + *

    + * 1) a score (with a key as given by the {@value SCORE_KEY_LONG_NAME} argument, + * which has a default value of {@value DEFAULT_SCORE_KEY}), + *

    + *

    + * 2) if resources are provided, flags corresponding to the labels (e.g., + * {@value LabeledVariantAnnotationsData#TRAINING_LABEL}, {@value LabeledVariantAnnotationsData#CALIBRATION_LABEL}, etc.) + * of resources containing the record, + *

    + *

    + * 3) if the {@value SNP_KEY_LONG_NAME} argument (which has a default value of {@value DEFAULT_SNP_KEY}) + * is non-null, a flag corresponding to whether a site is treated as a SNP, + *

    + *

    + * 4) if {@value SNP_CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME} and/or + * {@value INDEL_CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME} are provided, a filter (with name given by + * the {@value LOW_SCORE_FILTER_NAME_LONG_NAME} argument, which has a default value of + * {@value DEFAULT_LOW_SCORE_FILTER_NAME}) will be applied if a record has a calibration-set sensitivity + * falling above the appropriate threshold (i.e., if it has a score falling below the corresponding + * score threshold). + *

    + *

    + * If {@value USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME} is true, the score, SNP flag, calibration sensitivity, + * and filter appropriate for the highest scoring allele are used; however, the resource labels for all alleles + * are applied. + *

    + * + *
  • + *
  • + * (Optional) Annotations HDF5 file (.annot.hdf5). Annotation data and metadata for all scored sites + * (labeled and unlabeled) are stored in the HDF5 directory structure given in the documentation for the + * {@link ExtractVariantAnnotations} tool. This file will only be produced if the number of scored sites + * is nonzero. + *

    + * + *
  • + *
  • + * (Optional) Scores HDF5 file (.scores.hdf5). Scores for all scored sites are stored in the + * HDF5 path {@value VariantAnnotationsScorer#SCORES_PATH}. Scores are given in the same order as records + * in both the VCF and the annotations HDF5 file. This file will only be produced if the number of scored sites + * is nonzero. + *

    + *
  • + *
+ * + *

Usage examples

+ * + *

+ * Score sites using a model (produced by {@link TrainVariantAnnotationsModel} using the default + * {@link VariantAnnotationsModelBackend#PYTHON_IFOREST} model backend and contained in the directory + * {@code model_dir}), producing the outputs 1) {@code output.vcf.gz}, 2) {@code output.vcf.gz.tbi}, + * 3) {@code output.annot.hdf5}, and 4) {@code output.scores.hdf5}. Note that {@code extract.vcf.gz} is + * produced by {@link ExtractVariantAnnotations}. Records will be filtered according to the values provided to the + * {@value SNP_CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME} and {@value INDEL_CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME} + * arguments; the values below are only meant to be illustrative and should be set as appropriate for a given analysis. + * Note that the {@value MODE_LONG_NAME} arguments are made explicit here, although both SNP and INDEL modes are + * selected by default. + * + *

+ *     gatk ScoreVariantAnnotations \
+ *          -V input.vcf \
+ *          -A annotation_1 \
+ *          ...
+ *          -A annotation_N \
+ *          --model-prefix model_dir \
+ *          --mode SNP \
+ *          --resource:snp-training,training=true snp-training.vcf \
+ *          --resource:snp-calibration,calibration=true snp-calibration.vcf \
+ *          --mode INDEL \
+ *          --resource:indel-training,training=true indel-training.vcf \
+ *          --resource:indel-calibration,calibration=true indel-calibration.vcf \
+ *          --resource:extracted,extracted=true extract.vcf.gz \
+ *          --snp-calibration-sensitivity-threshold 0.99 \
+ *          --indel-calibration-sensitivity-threshold 0.99 \
+ *          -O output
+ * 
+ * + *

+ * One may chain together two runs of this tool to score SNPs and INDELs using different models + * (note that SNP and INDEL models have "snp" and "indel" tags in their respective filenames, so these + * models can still be contained in the same {@code model_dir} directory). + * This may have implications for mixed SNP/INDEL sites, especially if filters are applied; see also the + * {@value IGNORE_ALL_FILTERS_LONG_NAME} and {@value IGNORE_FILTER_LONG_NAME} arguments. + * + *

+ *     gatk ScoreVariantAnnotations \
+ *          -V input.vcf \
+ *          -A snp_annotation_1 \
+ *          ...
+ *          -A snp_annotation_N \
+ *          --model-prefix model_dir \
+ *          --mode SNP \
+ *          --resource:snp-training,training=true snp-training.vcf \
+ *          --resource:snp-calibration,calibration=true snp-calibration.vcf \
+ *          --resource:extracted,extracted=true snp-extract.vcf.gz \
+ *          --snp-calibration-sensitivity-threshold 0.99 \
+ *          -O intermediate-output
+ *
+ *     gatk ScoreVariantAnnotations \
+ *          -V intermediate-output.vcf \
+ *          -A indel_annotation_1 \
+ *          ...
+ *          -A indel_annotation_M \
+ *          --model-prefix model_dir \
+ *          --mode INDEL \
+ *          --resource:indel-training,training=true indel-training.vcf \
+ *          --resource:indel-calibration,calibration=true indel-calibration.vcf \
+ *          --resource:extracted,extracted=true indel-extract.vcf.gz \
+ *          --indel-calibration-sensitivity-threshold 0.99 \
+ *          -O output
+ * 
+ * + *

Custom modeling/scoring backends (ADVANCED)

+ * + *

+ * The primary scoring functionality performed by this tool is accomplished by a "scoring backend" + * whose fundamental contract is to take an input annotation matrix and to output corresponding scores, + * with both input and output given as HDF5 files. Rather than using one of the available, implemented backends, + * advanced users may provide their own backend via the {@value PYTHON_SCRIPT_LONG_NAME} argument. + * See documentation in the modeling and scoring interfaces ({@link VariantAnnotationsModel} and + * {@link VariantAnnotationsScorer}, respectively), as well as the default Python IsolationForest implementation at + * {@link PythonSklearnVariantAnnotationsScorer} and + * src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/isolation-forest.py. + *

+ * + * DEVELOPER NOTE: See documentation in {@link LabeledVariantAnnotationsWalker}. + * + * @author Samuel Lee <slee@broadinstitute.org> + */ +@CommandLineProgramProperties( + summary = "Scores variant calls in a VCF file based on site-level annotations using a previously trained model.", + oneLineSummary = "Scores variant calls in a VCF file based on site-level annotations using a previously trained model", + programGroup = VariantFilteringProgramGroup.class +) +@DocumentedFeature +@BetaFeature +public class ScoreVariantAnnotations extends LabeledVariantAnnotationsWalker { + + public static final String MODEL_PREFIX_LONG_NAME = "model-prefix"; + public static final String MODEL_BACKEND_LONG_NAME = TrainVariantAnnotationsModel.MODEL_BACKEND_LONG_NAME; + public static final String PYTHON_SCRIPT_LONG_NAME = "python-script"; + public static final String SNP_CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME = "snp-calibration-sensitivity-threshold"; + public static final String INDEL_CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME = "indel-calibration-sensitivity-threshold"; + + public static final String SNP_KEY_LONG_NAME = "snp-key"; + public static final String SCORE_KEY_LONG_NAME = "score-key"; + public static final String CALIBRATION_SENSITIVITY_KEY_LONG_NAME = "calibration-sensitivity-key"; + public static final String LOW_SCORE_FILTER_NAME_LONG_NAME = "low-score-filter-name"; + public static final String DOUBLE_FORMAT_LONG_NAME = "double-format"; + + public static final String DEFAULT_SNP_KEY = LabeledVariantAnnotationsData.SNP_LABEL; + public static final String DEFAULT_SCORE_KEY = "SCORE"; + public static final String DEFAULT_CALIBRATION_SENSITIVITY_KEY = "CALIBRATION_SENSITIVITY"; + public static final String DEFAULT_LOW_SCORE_FILTER_NAME = "LOW_SCORE"; + public static final String DEFAULT_DOUBLE_FORMAT = "%.4f"; + + public static final String SCORES_HDF5_SUFFIX = ".scores.hdf5"; + + @Argument( + fullName = MODEL_PREFIX_LONG_NAME, + doc = "Prefix for model files. 
This should be identical to the output prefix specified in TrainVariantAnnotationsModel." ) + private String modelPrefix; + + @Argument( + fullName = MODEL_BACKEND_LONG_NAME, + doc = "Backend to use for scoring. " + + "JAVA_BGMM will use a pure Java implementation (ported from Python scikit-learn) of the Bayesian Gaussian Mixture Model. " + + "PYTHON_IFOREST will use the Python scikit-learn implementation of the IsolationForest method and " + + "will require that the corresponding Python dependencies are present in the environment. " + + "PYTHON_SCRIPT will use the script specified by the " + PYTHON_SCRIPT_LONG_NAME + " argument. " + + "See the tool documentation for more details." ) + private VariantAnnotationsModelBackend modelBackend = VariantAnnotationsModelBackend.PYTHON_IFOREST; + + @Argument( + fullName = PYTHON_SCRIPT_LONG_NAME, + doc = "Python script used for specifying a custom scoring backend. If provided, " + MODEL_BACKEND_LONG_NAME + " must also be set to PYTHON_SCRIPT.", + optional = true) + private File pythonScriptFile; + + @Argument( + fullName = SNP_CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME, + doc = "If specified, SNPs with scores corresponding to a calibration sensitivity that is greater than or equal to this threshold will be hard filtered.", + optional = true, + minValue = 0., + maxValue = 1.) + private Double snpCalibrationSensitivityThreshold; + + @Argument( + fullName = INDEL_CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME, + doc = "If specified, indels with scores corresponding to a calibration sensitivity that is greater than or equal to this threshold will be hard filtered.", + optional = true, + minValue = 0., + maxValue = 1.) + private Double indelCalibrationSensitivityThreshold; + + @Argument( + fullName = SNP_KEY_LONG_NAME, + doc = "Annotation flag to use for labeling sites as SNPs in output. 
" + + "Set this to \"null\" to omit these labels.") + private String snpKey = DEFAULT_SNP_KEY; + + @Argument( + fullName = SCORE_KEY_LONG_NAME, + doc = "Annotation key to use for score values in output.") + private String scoreKey = DEFAULT_SCORE_KEY; + + @Argument( + fullName = CALIBRATION_SENSITIVITY_KEY_LONG_NAME, + doc = "Annotation key to use for calibration-sensitivity values in output.") + private String calibrationSensitivityKey = DEFAULT_CALIBRATION_SENSITIVITY_KEY; + + @Argument( + fullName = LOW_SCORE_FILTER_NAME_LONG_NAME, + doc = "Name to use for low-score filter in output.") + private String lowScoreFilterName = DEFAULT_LOW_SCORE_FILTER_NAME; + + @Argument( + fullName = DOUBLE_FORMAT_LONG_NAME, + doc = "Format string to use for formatting score and calibration-sensitivity values in output.") + private String doubleFormat = DEFAULT_DOUBLE_FORMAT; + + private File outputScoresFile; + private Iterator scoresIterator; + private Iterator isSNPIterator; + + private VariantAnnotationsScorer snpScorer; + private VariantAnnotationsScorer indelScorer; + + private Function snpCalibrationSensitivityConverter; + private Function indelCalibrationSensitivityConverter; + + @Override + protected int numberOfPasses() { + return 2; + } + + @Override + public void afterOnTraversalStart() { + + Utils.nonNull(scoreKey); + Utils.nonNull(calibrationSensitivityKey); + Utils.nonNull(lowScoreFilterName); + Utils.nonNull(doubleFormat); + + switch (modelBackend) { + case JAVA_BGMM: + Utils.validateArg(pythonScriptFile == null, + "Python script should not be provided when using JAVA_BGMM backend."); + logger.info("Running in JAVA_BGMM mode..."); + snpScorer = deserializeScorerFromSerFiles(VariantType.SNP); + indelScorer = deserializeScorerFromSerFiles(VariantType.INDEL); + break; + case PYTHON_IFOREST: + Utils.validateArg(pythonScriptFile == null, + "Python script should not be provided when using PYTHON_IFOREST backend."); + + pythonScriptFile = IOUtils.writeTempResource(new 
Resource(TrainVariantAnnotationsModel.ISOLATION_FOREST_PYTHON_SCRIPT, TrainVariantAnnotationsModel.class)); + PythonScriptExecutor.checkPythonEnvironmentForPackage("argparse"); + PythonScriptExecutor.checkPythonEnvironmentForPackage("h5py"); + PythonScriptExecutor.checkPythonEnvironmentForPackage("numpy"); + PythonScriptExecutor.checkPythonEnvironmentForPackage("sklearn"); + PythonScriptExecutor.checkPythonEnvironmentForPackage("dill"); + logger.info("Running in PYTHON_IFOREST mode..."); + snpScorer = deserializeScorerFromPklFiles(VariantType.SNP); + indelScorer = deserializeScorerFromPklFiles(VariantType.INDEL); + break; + case PYTHON_SCRIPT: + IOUtils.canReadFile(pythonScriptFile); + logger.info("Running in PYTHON_SCRIPT mode..."); + snpScorer = deserializeScorerFromPklFiles(VariantType.SNP); + indelScorer = deserializeScorerFromPklFiles(VariantType.INDEL); + break; + default: + throw new GATKException.ShouldNeverReachHereException("Unknown model-backend mode."); + } + + /* neither a SNP nor an INDEL scorer could be deserialized from the model prefix */ + if (snpScorer == null && indelScorer == null) { + throw new UserException.BadInput(String.format("At least one serialized scorer must be present " + + "in the model files with the prefix %s.", modelPrefix)); + } + /* every variant type selected for extraction needs its own scorer; the message names both the argument and the model prefix that was searched */ + if (variantTypesToExtract.contains(VariantType.SNP) && snpScorer == null) { + throw new UserException.BadInput(String.format("SNPs were indicated for extraction via the %s argument, " + + "but no serialized SNP scorer was available in the model files with the prefix %s.", MODE_LONG_NAME, modelPrefix)); + } + if (variantTypesToExtract.contains(VariantType.INDEL) && indelScorer == null) { + throw new UserException.BadInput(String.format("INDELs were indicated for extraction via the %s argument, " + + "but no serialized INDEL scorer was available in the model files with the prefix %s.", MODE_LONG_NAME, modelPrefix)); + } + + snpCalibrationSensitivityConverter = readCalibrationScoresAndCreateConverter(VariantType.SNP); + indelCalibrationSensitivityConverter = 
readCalibrationScoresAndCreateConverter(VariantType.INDEL); + + if (snpCalibrationSensitivityConverter == null && snpCalibrationSensitivityThreshold != null) { + throw new UserException.BadInput(String.format("The %s argument was specified, " + + "but no SNP calibration scores were provided in the model files with the prefix %s.", + SNP_CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME, modelPrefix)); + } + if (indelCalibrationSensitivityConverter == null && indelCalibrationSensitivityThreshold != null) { + throw new UserException.BadInput(String.format("The %s argument was specified, " + + "but no INDEL calibration scores were provided in the model files with the prefix %s.", + INDEL_CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME, modelPrefix)); + } + + outputScoresFile = new File(outputPrefix + SCORES_HDF5_SUFFIX); + + // TODO this validation method should perhaps be moved outside of the CNV code + CopyNumberArgumentValidationUtils.validateOutputFiles(outputScoresFile); + } + + @Override + protected void nthPassApply(final VariantContext variant, + final ReadsContext readsContext, + final ReferenceContext referenceContext, + final FeatureContext featureContext, + final int n) { + final List, VariantType, TreeSet>> metadata = extractVariantMetadata(variant, featureContext, true); + final boolean isVariantExtracted = !metadata.isEmpty(); + if (n == 0 && isVariantExtracted) { + addExtractedVariantToData(data, variant, metadata); + } + if (n == 1) { + if (isVariantExtracted) { + writeExtractedVariantToVCF(variant, metadata); + } else { + vcfWriter.add(variant); + } + } + } + + @Override + protected void afterNthPass(final int n) { + if (n == 0) { + // TODO if BGMM, preprocess annotations and write to HDF5 with BGMMVariantAnnotationsScorer.preprocessAnnotationsWithBGMMAndWriteHDF5 + writeAnnotationsToHDF5(); + if (data.size() > 0) { + data.clear(); + readAnnotationsAndWriteScoresToHDF5(); + scoresIterator = 
Arrays.stream(VariantAnnotationsScorer.readScores(outputScoresFile)).iterator(); + isSNPIterator = LabeledVariantAnnotationsData.readLabel(outputAnnotationsFile, LabeledVariantAnnotationsData.SNP_LABEL).iterator(); + } else { + scoresIterator = Collections.emptyIterator(); + isSNPIterator = Collections.emptyIterator(); + } + } + if (n == 1) { + if (scoresIterator.hasNext()) { + throw new IllegalStateException("Traversals of scores and variants " + + "(or alleles, in allele-specific mode) were not correctly synchronized."); + } + if (vcfWriter != null) { + vcfWriter.close(); + } + } + } + + private VariantAnnotationsScorer deserializeScorerFromPklFiles(final VariantType variantType) { + final String variantTypeTag = '.' + variantType.toString().toLowerCase(); + final File scorerPklFile = new File( + modelPrefix + variantTypeTag + PythonSklearnVariantAnnotationsScorer.PYTHON_SCORER_PKL_SUFFIX); + final File negativeScorerPklFile = new File( + modelPrefix + variantTypeTag + TrainVariantAnnotationsModel.NEGATIVE_TAG + PythonSklearnVariantAnnotationsScorer.PYTHON_SCORER_PKL_SUFFIX); + return scorerPklFile.canRead() + ? negativeScorerPklFile.canRead() + ? VariantAnnotationsScorer.combinePositiveAndNegativeScorer( + new PythonSklearnVariantAnnotationsScorer(pythonScriptFile, scorerPklFile), + new PythonSklearnVariantAnnotationsScorer(pythonScriptFile, negativeScorerPklFile)) + : new PythonSklearnVariantAnnotationsScorer(pythonScriptFile, scorerPklFile) + : null; + } + + private VariantAnnotationsScorer deserializeScorerFromSerFiles(final VariantType variantType) { + final String variantTypeTag = '.' 
+ variantType.toString().toLowerCase(); + final File scorerSerFile = new File( + modelPrefix + variantTypeTag + BGMMVariantAnnotationsScorer.BGMM_SCORER_SER_SUFFIX); + final File negativeScorerSerFile = new File( + modelPrefix + variantTypeTag + TrainVariantAnnotationsModel.NEGATIVE_TAG + BGMMVariantAnnotationsScorer.BGMM_SCORER_SER_SUFFIX); + return scorerSerFile.canRead() + ? negativeScorerSerFile.canRead() + ? VariantAnnotationsScorer.combinePositiveAndNegativeScorer( + BGMMVariantAnnotationsScorer.deserialize(scorerSerFile), + BGMMVariantAnnotationsScorer.deserialize(negativeScorerSerFile)) + : BGMMVariantAnnotationsScorer.deserialize(scorerSerFile) + : null; + } + + private Function readCalibrationScoresAndCreateConverter(final VariantType variantType) { + final String variantTypeTag = '.' + variantType.toString().toLowerCase(); + final File calibrationScores = new File( + modelPrefix + variantTypeTag + TrainVariantAnnotationsModel.CALIBRATION_SCORES_HDF5_SUFFIX); + return calibrationScores.canRead() + ? 
VariantAnnotationsScorer.createScoreToCalibrationSensitivityConverter(VariantAnnotationsScorer.readScores(calibrationScores)) + : null; + } + + private void readAnnotationsAndWriteScoresToHDF5() { + final List annotationNames = LabeledVariantAnnotationsData.readAnnotationNames(outputAnnotationsFile); + final List isSNP = LabeledVariantAnnotationsData.readLabel(outputAnnotationsFile, LabeledVariantAnnotationsData.SNP_LABEL); + final double[][] allAnnotations = LabeledVariantAnnotationsData.readAnnotations(outputAnnotationsFile); + final int numAll = allAnnotations.length; + final List allScores = new ArrayList<>(Collections.nCopies(numAll, Double.NaN)); + if (variantTypesToExtract.contains(VariantType.SNP)) { + logger.info("Scoring SNP variants..."); + scoreVariantTypeAndSetElementsOfAllScores(annotationNames, allAnnotations, isSNP, snpScorer, allScores); + } + if (variantTypesToExtract.contains(VariantType.INDEL)) { + logger.info("Scoring INDEL variants..."); + final List isIndel = isSNP.stream().map(x -> !x).collect(Collectors.toList()); + scoreVariantTypeAndSetElementsOfAllScores(annotationNames, allAnnotations, isIndel, indelScorer, allScores); + } + VariantAnnotationsScorer.writeScores(outputScoresFile, Doubles.toArray(allScores)); + logger.info(String.format("Scores written to %s.", outputScoresFile.getAbsolutePath())); + } + + private static void scoreVariantTypeAndSetElementsOfAllScores(final List annotationNames, + final double[][] allAnnotations, + final List isVariantType, + final VariantAnnotationsScorer variantTypeScorer, + final List allScores) { + final File variantTypeAnnotationsFile = LabeledVariantAnnotationsData.subsetAnnotationsToTemporaryFile(annotationNames, allAnnotations, isVariantType); + final File variantTypeScoresFile = IOUtils.createTempFile("temp", ".scores.hdf5"); + variantTypeScorer.score(variantTypeAnnotationsFile, variantTypeScoresFile); // TODO we do not fail until here in the case of mismatched annotation names; we could fail 
earlier + final double[] variantTypeScores = VariantAnnotationsScorer.readScores(variantTypeScoresFile); + final Iterator variantTypeScoresIterator = Arrays.stream(variantTypeScores).iterator(); + IntStream.range(0, allScores.size()).filter(isVariantType::get).forEach(i -> allScores.set(i, variantTypeScoresIterator.next())); + } + + @Override + void writeExtractedVariantToVCF(final VariantContext vc, + final List altAlleles, + final Set labels) { + final VariantContextBuilder builder = new VariantContextBuilder(vc); + labels.forEach(l -> builder.attribute(l, true)); // labels should already be sorted as a TreeSet + + final List scores = useASAnnotations + ? altAlleles.stream().map(a -> scoresIterator.next()).collect(Collectors.toList()) + : Collections.singletonList(scoresIterator.next()); + final double score = Collections.max(scores); + final int scoreIndex = scores.indexOf(score); + builder.attribute(scoreKey, formatDouble(score)); + + final List isSNP = useASAnnotations + ? altAlleles.stream().map(a -> isSNPIterator.next()).collect(Collectors.toList()) + : Collections.singletonList(isSNPIterator.next()); + final boolean isSNPMax = isSNP.get(scoreIndex); + + if (snpKey != null) { + builder.attribute(snpKey, isSNPMax); + } + + final Function calibrationSensitivityConverter = isSNPMax ? snpCalibrationSensitivityConverter : indelCalibrationSensitivityConverter; + if (calibrationSensitivityConverter != null) { + final double calibrationSensitivity = calibrationSensitivityConverter.apply(score); + builder.attribute(calibrationSensitivityKey, formatDouble(calibrationSensitivity)); + final Double calibrationSensitivityThreshold = isSNPMax ? snpCalibrationSensitivityThreshold : indelCalibrationSensitivityThreshold; + if (calibrationSensitivityThreshold != null && calibrationSensitivity >= calibrationSensitivityThreshold) { + builder.filter(lowScoreFilterName); // TODO does this sufficiently cover the desired behavior when dealing with previously filtered sites, etc.? 
+ } + } + + vcfWriter.add(builder.make()); + } + + private String formatDouble(final double x) { + return String.format(doubleFormat, x); + } + + /** + * Copies the header from the input VCF and adds info lines for the score, calibration-sensitivity, and label keys, + * as well as the filter line. + */ + @Override + VCFHeader constructVCFHeader(final List sortedLabels) { + final VCFHeader inputHeader = getHeaderForVariants(); + final Set sortedInputHeaderMetaData = inputHeader.getMetaDataInSortedOrder(); + + final Set hInfo = new HashSet<>(sortedInputHeaderMetaData); + hInfo.add(new VCFInfoHeaderLine(scoreKey, 1, VCFHeaderLineType.Float, + "Score according to the model applied by ScoreVariantAnnotations")); + hInfo.add(new VCFInfoHeaderLine(calibrationSensitivityKey, 1, VCFHeaderLineType.Float, + String.format("Calibration sensitivity corresponding to the value of %s", scoreKey))); + hInfo.add(new VCFFilterHeaderLine(lowScoreFilterName, "Low score (corresponding to high calibration sensitivity)")); + + if (snpKey != null) { + hInfo.add(new VCFInfoHeaderLine(snpKey, 1, VCFHeaderLineType.Flag, "This site was considered a SNP during filtering")); + } + hInfo.addAll(sortedLabels.stream() + .map(l -> new VCFInfoHeaderLine(l, 1, VCFHeaderLineType.Flag, String.format(RESOURCE_LABEL_INFO_HEADER_LINE_FORMAT_STRING, l))) + .collect(Collectors.toList())); + hInfo.addAll(getDefaultToolVCFHeaderLines()); + + return new VCFHeader(hInfo, inputHeader.getGenotypeSamples()); + } + + @Override + public Object onTraversalSuccess() { + + logger.info(String.format("%s complete.", getClass().getSimpleName())); + + return null; + } +} \ No newline at end of file diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/TrainVariantAnnotationsModel.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/TrainVariantAnnotationsModel.java new file mode 100644 index 00000000000..3aa67197f8b --- /dev/null +++ 
b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/TrainVariantAnnotationsModel.java @@ -0,0 +1,703 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable; + +import com.google.common.collect.Streams; +import com.google.common.primitives.Doubles; +import org.apache.commons.math3.stat.descriptive.moment.Variance; +import org.apache.commons.math3.stat.descriptive.rank.Percentile; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.BetaFeature; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.CommandLineProgram; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.exceptions.GATKException; +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.tools.walkers.vqsr.VariantRecalibrator; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.LabeledVariantAnnotationsData; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.VariantType; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.BGMMVariantAnnotationsModel; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.BGMMVariantAnnotationsScorer; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.PythonSklearnVariantAnnotationsModel; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.PythonSklearnVariantAnnotationsScorer; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.VariantAnnotationsModel; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.VariantAnnotationsModelBackend; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.VariantAnnotationsScorer; +import org.broadinstitute.hellbender.utils.Utils; 
+import org.broadinstitute.hellbender.utils.io.IOUtils; +import org.broadinstitute.hellbender.utils.io.Resource; +import org.broadinstitute.hellbender.utils.python.PythonScriptExecutor; +import picard.cmdline.programgroups.VariantFilteringProgramGroup; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +/** + * Trains a model for scoring variant calls based on site-level annotations. + * + *

+ * This tool is intended to be used as the second step in a variant-filtering workflow that supersedes the + * {@link VariantRecalibrator} workflow. Given training (and optionally, calibration) sets of site-level annotations + * produced by {@link ExtractVariantAnnotations}, this tool can be used to train a model for scoring variant + * calls. For each variant type (i.e., SNP or INDEL) specified using the {@value MODE_LONG_NAME} argument, the tool + * outputs files that are either: 1) serialized scorers, each of which persists to disk a function for computing + * scores given subsequent annotations, or 2) HDF5 files containing a set of scores, each corresponding to training, + * calibration, and unlabeled sets, as appropriate. + *

+ * + *

+ * The model files produced by this tool can in turn be provided along with a VCF file to the {@link ScoreVariantAnnotations} + * tool, which assigns a score to each call (with a lower score indicating that a call is more likely to be an artifact + * and should perhaps be filtered). Each score can also be converted to a corresponding sensitivity with respect to a + * calibration set, if the latter is available. + *

+ * + *

Modeling approaches

+ * + *

+ * This tool can perform modeling using either a positive-only approach or a positive-negative approach. + * In a positive-only approach, the annotation-space distribution of training sites is used to learn a + * function for converting annotations for subsequent sites into a score; typically, higher scores correspond to + * regions of annotation space that are more densely populated by training sites. In contrast, a positive-negative + * approach attempts to additionally use unlabeled sites to better identify regions of annotation space that correspond + * to low scores against the original, positive-only model (with the assumption being that unlabeled sites are + * more likely to populate such regions than are training sites). A second, negative model can then be trained, + * and the resulting scores (which are presumably higher in regions of annotation space that are less densely + * populated by the original training sites) can be subtracted from the original scores to produce a final score. + * (Note that this positive-negative approach could be considered as a single iteration of a more general + * approach typically referred to as positive-unlabeled learning.) + *

+ * + *

+ * A positive-only approach is likely to perform well in cases where a sufficient number of reliable training sites + * is available. In contrast, if 1) only a small number of reliable training sites is available, and/or + * 2) the reliability of the training sites is questionable (e.g., the sites may be contaminated by + * a non-negligible number of sequencing artifacts), then a positive-negative approach may be beneficial. + * However, note that the positive-negative approach introduces an additional hyperparameter---the threshold + * that determines the selection of sites for training the negative model, controlled by the + * {@value CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME} argument---which may require tuning. + * Further note that although {@link VariantRecalibrator} (which this tool supplants) has typically been used to + * implement a positive-negative approach, a positive-only approach likely suffices in many use cases. + *

+ * + *

+ * If a positive-only approach has been specified, then if training sites of the variant type are available: + * + *

    + *
  • 1) A positive model is trained using these training sites and is serialized to file,
  • + *
  • 2) Scores for these training sites are generated using the positive model and output to a file,
  • + *
  • 3) If calibration sites of the variant type are available, scores for these calibration sites are + * generated using the positive model and output to a file.
  • + *
+ * + * Additionally, if a positive-negative approach has been specified (i.e., the {@value UNLABELED_ANNOTATIONS_HDF5_LONG_NAME} + * and {@value CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME} arguments have been provided), + * and if both unlabeled and calibration sites of the variant type are available, then: + * + *
    + *
  • 4) The calibration scores generated from the positive model are used to convert the + * calibration-sensitivity threshold into a score threshold,
  • + *
  • 5) Training sites with scores below the score threshold are selected for training a negative model,
  • + *
  • 6) Scores for unlabeled sites are generated using the positive model and output to a file,
  • + *
  • 7) Unlabeled sites with scores below the score threshold are selected for training a negative model,
  • + *
  • 8) A negative model is trained using these selected training and unlabeled sites and is serialized to file,
  • + *
  • 9) Scores for calibration sites are generated using the positive-negative model and overwritten in the existing file.
  • + *
+ * + * Note that the positive-negative approach thus yields 1) scores for training and unlabeled sites generated from + * the positive model and 2) scores for calibration sites generated from the positive-negative model. This is opposed + * to generating scores from all sites from the positive-negative model, since these can simply be obtained from + * a downstream run of {@link ScoreVariantAnnotations}. + *

+ * + *

Modeling backends

+ * + *

+ * This tool allows the use of different backends for modeling and scoring. See also below + * for instructions for using a custom, user-provided implementation. + *

+ * + *

Python isolation-forest backend

+ * + *

+ * + * This backend uses scikit-learn modules to train models and scoring functions using the + * isolation-forest method for anomaly detection. + * Median imputation of missing annotation values is performed before applying the method. + *

+ * + *

+ * This backend can be selected by specifying {@code PYTHON_IFOREST} to the {@value MODEL_BACKEND_LONG_NAME} argument + * and is also currently the default backend. It is implemented by the script at + * src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/isolation-forest.py, which + * requires that the argparse, h5py, numpy, sklearn, and dill packages be present in the Python environment; users + * may wish to simply use the provided GATK conda environment to ensure that the correct versions of all packages are available. + * See the IsolationForest documentation here + * as appropriate for the version of scikit-learn used in your Python environment. The hyperparameters documented + * there can be specified using the {@value HYPERPARAMETERS_JSON_LONG_NAME} argument; see + * src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/isolation-forest-hyperparameters.json + * for an example and the default values. + *

+ * + *

+ * Note that HDF5 files may be viewed using hdfview + * or loaded in Python using PyTables or h5py. + *

+ * + *

Calibration sets

+ * + *

+ * The choice of calibration set will determine the conversion between model scores and calibration-set sensitivities. + * Ideally, the calibration set should be composed of an unbiased sample from the full distribution of true sites + * in annotation space; the score-sensitivity conversion can roughly be thought of as a mapping from sensitivities in + * [0, 1] to a contour of this annotation-space distribution. In practice, any biases in the calibration set (e.g., + * if it consists of high quality, previously filtered calls, which may be biased towards the high density regions + * of the full distribution) will be reflected in the conversion and should be taken into consideration when + * interpreting calibration-set sensitivities. + *

+ * + *

Inputs

+ * + *
    + *
  • + * Labeled-annotations HDF5 file (.annot.hdf5). Annotation data and metadata for labeled sites are stored in the + * HDF5 directory structure given in the documentation for the {@link ExtractVariantAnnotations} tool. In typical + * usage, both the {@value LabeledVariantAnnotationsData#TRAINING_LABEL} and + * {@value LabeledVariantAnnotationsData#CALIBRATION_LABEL} labels would be available for non-empty sets of + * sites of the requested variant type. + *
  • + *
+ * (Optional) Unlabeled-annotations HDF5 file (.unlabeled.annot.hdf5). Annotation data and metadata for + * unlabeled sites are stored in the HDF5 directory structure given in the documentation for the + * {@link ExtractVariantAnnotations} tool. If provided, a positive-negative modeling approach (similar to + * that used in {@link VariantRecalibrator}) will be used. + *
  • + *
  • + * Variant types (i.e., SNP and/or INDEL) for which to train models. Logic for determining variant type was retained from + * {@link VariantRecalibrator}; see {@link VariantType}. A separate model will be trained for each variant type + * and separate sets of outputs with corresponding tags in the filenames (i.e., "snp" or "indel") will be produced. + * Alternatively, the tool can be run twice, once for each variant type; this may be useful if one wishes to use + * different argument values or modeling approaches. + *
  • + *
  • + * (Optional) Model backend. The Python isolation-forest backend is currently the default backend. + * A custom backend can also be specified in conjunction with the {@value PYTHON_SCRIPT_LONG_NAME} argument. + *
  • + *
  • + * (Optional) Model hyperparameters JSON file. This file can be used to specify backend-specific + * hyperparameters in JSON format, which is to be consumed by the modeling script. This is required if a + * custom backend is used. + *
  • + *
+ * (Optional) Calibration-set sensitivity threshold. The same threshold will be used for both SNP and INDEL + * variant types. If different thresholds are desired, the tool can be run twice, once for each variant type. + *
  • + *
  • + * Output prefix. + * This is used as the basename for output files. + *
  • + *
+ * + *

Outputs

+ * + *

+ * The following outputs are produced for each variant type specified by the {@value MODE_LONG_NAME} argument + * and are delineated by type-specific tags in the filename of each output, which take the form of + * {@code {output-prefix}.{variant-type}.{file-suffix}}. For example, scores for the SNP calibration set + * will be output to the {@code {output-prefix}.snp.calibrationScores.hdf5} file. + *

+ * + *
    + *
  • + * Training-set positive-model scores HDF5 file (.trainingScores.hdf5). + *
  • + *
  • + * Positive-model serialized scorer file. (.scorer.pkl for the default {@code PYTHON_IFOREST} model backend). + *
  • + *
  • + * (Optional) Unlabeled-set positive-model scores HDF5 file (.unlabeledScores.hdf5). This is only output + * if a positive-negative modeling approach is used. + *
  • + *
  • + * (Optional) Calibration-set scores HDF5 file (.calibrationScores.hdf5). This is only output if a calibration + * set is provided. If a positive-only modeling approach is used, scores will be generated from the positive model; + * if a positive-negative modeling approach is used, scores will be generated from the positive-negative model. + *
  • + *
  • + * (Optional) Negative-model serialized scorer file. (.negative.scorer.pkl for the default {@code PYTHON_IFOREST} model backend). + * This is only output if a positive-negative modeling approach is used. + *
  • + *
+ * + *

Usage examples

+ * + *

+ * Train SNP and INDEL models using the default Python IsolationForest model backend with a positive-only approach, + * given an input labeled-annotations HDF5 file generated by {@link ExtractVariantAnnotations} that contains + * labels for both training and calibration sets, producing the outputs 1) train.snp.scorer.pkl, + * 2) train.snp.trainingScores.hdf5, and 3) train.snp.calibrationScores.hdf5, as well as analogous files + * for the INDEL model. Note that the {@value MODE_LONG_NAME} arguments are made explicit here, although both + * SNP and INDEL modes are selected by default. + * + *

+ *     gatk TrainVariantAnnotationsModel \
+ *          --annotations-hdf5 extract.annot.hdf5 \
+ *          --mode SNP \
+ *          --mode INDEL \
+ *          -O train
+ * 
+ *

+ * + *

+ * Train SNP and INDEL models using the default Python IsolationForest model backend with a positive-negative approach + * (using a calibration-sensitivity threshold of 0.95 to select sites for training the negative model), + * given an input labeled-annotations HDF5 file that contains labels for both training and calibration sets + * and an input unlabeled-annotations HDF5 file (with both HDF5 files generated by {@link ExtractVariantAnnotations}), + * producing the outputs 1) train.snp.scorer.pkl, 2) train.snp.negative.scorer.pkl, 3) train.snp.trainingScores.hdf5, + * 4) train.snp.calibrationScores.hdf5, and 5) train.snp.unlabeledScores.hdf5, as well as analogous files + * for the INDEL model. Note that the {@value MODE_LONG_NAME} arguments are made explicit here, although both + * SNP and INDEL modes are selected by default. + * + *

+ *     gatk TrainVariantAnnotationsModel \
+ *          --annotations-hdf5 extract.annot.hdf5 \
+ *          --unlabeled-annotations-hdf5 extract.unlabeled.annot.hdf5 \
+ *          --mode SNP \
+ *          --mode INDEL \
+ *          --calibration-sensitivity-threshold 0.95 \
+ *          -O train
+ * 
+ *

+ * + *

Custom modeling/scoring backends (ADVANCED)

+ * + *

+ * The primary modeling functionality performed by this tool is accomplished by a "modeling backend" + * whose fundamental contract is to take an input HDF5 file containing an annotation matrix for sites of a + * single variant type (i.e., SNP or INDEL) and to output a serialized scorer for that variant type. + * Rather than using one of the available, implemented backends, advanced users may provide their own backend + * via the {@value PYTHON_SCRIPT_LONG_NAME} argument. See documentation in the modeling and scoring interfaces + * ({@link VariantAnnotationsModel} and {@link VariantAnnotationsScorer}, respectively), as well as the default + * Python IsolationForest implementation at {@link PythonSklearnVariantAnnotationsModel} and + * src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/isolation-forest.py. + *

+ * + *

+ * Extremely advanced users could potentially substitute their own implementation for the entire + * {@link TrainVariantAnnotationsModel} tool, while still making use of the up/downstream + * {@link ExtractVariantAnnotations} and {@link ScoreVariantAnnotations} tools. To do so, one would additionally + * have to implement functionality for subsetting training/calibration sets by variant type, + * calling modeling backends as appropriate, and scoring calibration sets. + *

+ * + * @author Samuel Lee <slee@broadinstitute.org> + */ +@CommandLineProgramProperties( + summary = "Trains a model for scoring variant calls based on site-level annotations.", + oneLineSummary = "Trains a model for scoring variant calls based on site-level annotations", + programGroup = VariantFilteringProgramGroup.class +) +@DocumentedFeature +@BetaFeature +public final class TrainVariantAnnotationsModel extends CommandLineProgram { + + public static final String MODE_LONG_NAME = "mode"; + public static final String ANNOTATIONS_HDF5_LONG_NAME = "annotations-hdf5"; + public static final String UNLABELED_ANNOTATIONS_HDF5_LONG_NAME = "unlabeled-annotations-hdf5"; + public static final String MODEL_BACKEND_LONG_NAME = "model-backend"; + public static final String PYTHON_SCRIPT_LONG_NAME = "python-script"; + public static final String HYPERPARAMETERS_JSON_LONG_NAME = "hyperparameters-json"; + public static final String CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME = "calibration-sensitivity-threshold"; + + public static final String ISOLATION_FOREST_PYTHON_SCRIPT = "isolation-forest.py"; + public static final String ISOLATION_FOREST_HYPERPARAMETERS_JSON = "isolation-forest-hyperparameters.json"; + + enum AvailableLabelsMode { + POSITIVE_ONLY, POSITIVE_UNLABELED + } + + public static final String TRAINING_SCORES_HDF5_SUFFIX = ".trainingScores.hdf5"; + public static final String CALIBRATION_SCORES_HDF5_SUFFIX = ".calibrationScores.hdf5"; + public static final String UNLABELED_SCORES_HDF5_SUFFIX = ".unlabeledScores.hdf5"; + public static final String NEGATIVE_TAG = ".negative"; + + @Argument( + fullName = ANNOTATIONS_HDF5_LONG_NAME, + doc = "HDF5 file containing annotations extracted with ExtractVariantAnnotations.") + private File inputAnnotationsFile; + + @Argument( + fullName = UNLABELED_ANNOTATIONS_HDF5_LONG_NAME, + doc = "HDF5 file containing annotations extracted with ExtractVariantAnnotations. 
" + + "If specified with " + CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME + ", " + + "a positive-unlabeled modeling approach will be used; otherwise, a positive-only modeling " + + "approach will be used.", + optional = true) + private File inputUnlabeledAnnotationsFile; + + @Argument( + fullName = MODEL_BACKEND_LONG_NAME, + doc = "Backend to use for training models. " + + "JAVA_BGMM will use a pure Java implementation (ported from Python scikit-learn) of the Bayesian Gaussian Mixture Model. " + + "PYTHON_IFOREST will use the Python scikit-learn implementation of the IsolationForest method and " + + "will require that the corresponding Python dependencies are present in the environment. " + + "PYTHON_SCRIPT will use the script specified by the " + PYTHON_SCRIPT_LONG_NAME + " argument. " + + "See the tool documentation for more details.") + private VariantAnnotationsModelBackend modelBackend = VariantAnnotationsModelBackend.PYTHON_IFOREST; + + @Argument( + fullName = PYTHON_SCRIPT_LONG_NAME, + doc = "Python script used for specifying a custom scoring backend. If provided, " + MODEL_BACKEND_LONG_NAME + " must also be set to PYTHON_SCRIPT.", + optional = true) + private File pythonScriptFile; + + @Argument( + fullName = HYPERPARAMETERS_JSON_LONG_NAME, + doc = "JSON file containing hyperparameters. Optional if the PYTHON_IFOREST backend is used " + + "(if not specified, a default set of hyperparameters will be used); otherwise required.", + optional = true) + private File hyperparametersJSONFile; + + @Argument( + fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, + shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, + doc = "Output prefix.") + private String outputPrefix; + + @Argument( + fullName = CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME, + doc = "Calibration-sensitivity threshold that determines which sites will be used for training the negative model " + + "in the positive-unlabeled modeling approach. 
" + + "Increasing this will decrease the corresponding positive-model score threshold; sites with scores below this score " + + "threshold will be used for training the negative model. Thus, this parameter should typically be chosen to " + + "be close to 1, so that sites that score highly according to the positive model will not be used to train the negative model. " + + "The " + UNLABELED_ANNOTATIONS_HDF5_LONG_NAME + " argument must be specified in conjunction with this argument. " + + "If separate thresholds for SNP and INDEL models are desired, run the tool separately for each mode with its respective threshold.", + optional = true, + minValue = 0., + maxValue = 1.) + private Double calibrationSensitivityThreshold; + + @Argument( + fullName = MODE_LONG_NAME, + doc = "Variant types for which to train models. Duplicate values will be ignored.", + minElements = 1) + public List variantTypes = new ArrayList<>(Arrays.asList(VariantType.SNP, VariantType.INDEL)); + + private AvailableLabelsMode availableLabelsMode; + + @Override + protected Object doWork() { + + validateArgumentsAndSetModes(); + + logger.info("Starting training..."); + + for (final VariantType variantType : VariantType.values()) { // enforces order in which models are trained + if (variantTypes.contains(variantType)) { + doModelingWorkForVariantType(variantType); + } + } + + logger.info(String.format("%s complete.", getClass().getSimpleName())); + + return null; + } + + private void validateArgumentsAndSetModes() { + IOUtils.canReadFile(inputAnnotationsFile); + + Utils.validateArg((inputUnlabeledAnnotationsFile == null) == (calibrationSensitivityThreshold == null), + "Unlabeled annotations and calibration-sensitivity threshold must both be unspecified (for positive-only model training) " + + "or specified (for positive-negative model training)."); + + availableLabelsMode = inputUnlabeledAnnotationsFile != null && calibrationSensitivityThreshold != null + ? 
AvailableLabelsMode.POSITIVE_UNLABELED + : AvailableLabelsMode.POSITIVE_ONLY; + + if (inputUnlabeledAnnotationsFile != null) { + IOUtils.canReadFile(inputUnlabeledAnnotationsFile); + final List annotationNames = LabeledVariantAnnotationsData.readAnnotationNames(inputAnnotationsFile); + final List unlabeledAnnotationNames = LabeledVariantAnnotationsData.readAnnotationNames(inputUnlabeledAnnotationsFile); + Utils.validateArg(annotationNames.equals(unlabeledAnnotationNames), "Annotation names must be identical for positive and unlabeled annotations."); + } + + switch (modelBackend) { + case JAVA_BGMM: + Utils.validateArg(pythonScriptFile == null, + "Python script should not be provided when using JAVA_BGMM backend."); + IOUtils.canReadFile(hyperparametersJSONFile); + logger.info("Running in JAVA_BGMM mode..."); + break; + case PYTHON_IFOREST: + Utils.validateArg(pythonScriptFile == null, + "Python script should not be provided when using PYTHON_IFOREST backend."); + + pythonScriptFile = IOUtils.writeTempResource(new Resource(ISOLATION_FOREST_PYTHON_SCRIPT, TrainVariantAnnotationsModel.class)); + if (hyperparametersJSONFile == null) { + hyperparametersJSONFile = IOUtils.writeTempResource(new Resource(ISOLATION_FOREST_HYPERPARAMETERS_JSON, TrainVariantAnnotationsModel.class)); + } + IOUtils.canReadFile(hyperparametersJSONFile); + PythonScriptExecutor.checkPythonEnvironmentForPackage("argparse"); + PythonScriptExecutor.checkPythonEnvironmentForPackage("h5py"); + PythonScriptExecutor.checkPythonEnvironmentForPackage("numpy"); + PythonScriptExecutor.checkPythonEnvironmentForPackage("sklearn"); + PythonScriptExecutor.checkPythonEnvironmentForPackage("dill"); + logger.info("Running in PYTHON_IFOREST mode..."); + break; + case PYTHON_SCRIPT: + IOUtils.canReadFile(pythonScriptFile); + IOUtils.canReadFile(hyperparametersJSONFile); + logger.info("Running in PYTHON_SCRIPT mode..."); + break; + default: + throw new GATKException.ShouldNeverReachHereException("Unknown model-backend 
mode."); + } + } + + /** + * This method does all modeling and scoring work for a given {@code variantType}. See the tool-level documentation + * for the steps expected to be performed. + */ + private void doModelingWorkForVariantType(final VariantType variantType) { + // positive model + final List annotationNames = LabeledVariantAnnotationsData.readAnnotationNames(inputAnnotationsFile); + final double[][] annotations = LabeledVariantAnnotationsData.readAnnotations(inputAnnotationsFile); + + final List isTraining = LabeledVariantAnnotationsData.readLabel(inputAnnotationsFile, LabeledVariantAnnotationsData.TRAINING_LABEL); + final List isCalibration = LabeledVariantAnnotationsData.readLabel(inputAnnotationsFile, LabeledVariantAnnotationsData.CALIBRATION_LABEL); + final List isSNP = LabeledVariantAnnotationsData.readLabel(inputAnnotationsFile, LabeledVariantAnnotationsData.SNP_LABEL); + final List isVariantType = variantType == VariantType.SNP ? isSNP : isSNP.stream().map(x -> !x).collect(Collectors.toList()); + + final List isTrainingAndVariantType = Streams.zip(isTraining.stream(), isVariantType.stream(), (a, b) -> a && b).collect(Collectors.toList()); + final int numTrainingAndVariantType = numPassingFilter(isTrainingAndVariantType); + + final String variantTypeString = variantType.toString(); + final String outputPrefixTag = '.' 
+ variantType.toString().toLowerCase(); + + if (numTrainingAndVariantType > 0) { + logger.info(String.format("Training %s model with %d training sites x %d annotations %s...", + variantTypeString, numTrainingAndVariantType, annotationNames.size(), annotationNames)); + final File labeledTrainingAndVariantTypeAnnotationsFile = LabeledVariantAnnotationsData.subsetAnnotationsToTemporaryFile(annotationNames, annotations, isTrainingAndVariantType); + trainAndSerializeModel(labeledTrainingAndVariantTypeAnnotationsFile, outputPrefixTag); + logger.info(String.format("%s model trained and serialized with output prefix \"%s\".", variantTypeString, outputPrefix + outputPrefixTag)); + + if (modelBackend == VariantAnnotationsModelBackend.JAVA_BGMM) { + BGMMVariantAnnotationsScorer.preprocessAnnotationsWithBGMMAndWriteHDF5( + annotationNames, outputPrefix + outputPrefixTag, labeledTrainingAndVariantTypeAnnotationsFile, logger); + } + + logger.info(String.format("Scoring %d %s training sites...", numTrainingAndVariantType, variantTypeString)); + final File labeledTrainingAndVariantTypeScoresFile = score(labeledTrainingAndVariantTypeAnnotationsFile, outputPrefixTag, TRAINING_SCORES_HDF5_SUFFIX); + logger.info(String.format("%s training scores written to %s.", variantTypeString, labeledTrainingAndVariantTypeScoresFile.getAbsolutePath())); + + final List isLabeledCalibrationAndVariantType = Streams.zip(isCalibration.stream(), isVariantType.stream(), (a, b) -> a && b).collect(Collectors.toList()); + final int numLabeledCalibrationAndVariantType = numPassingFilter(isLabeledCalibrationAndVariantType); + if (numLabeledCalibrationAndVariantType > 0) { + logger.info(String.format("Scoring %d %s calibration sites...", numLabeledCalibrationAndVariantType, variantTypeString)); + final File labeledCalibrationAndVariantTypeAnnotationsFile = LabeledVariantAnnotationsData.subsetAnnotationsToTemporaryFile(annotationNames, annotations, isLabeledCalibrationAndVariantType); + final File 
labeledCalibrationAndVariantTypeScoresFile = score(labeledCalibrationAndVariantTypeAnnotationsFile, outputPrefixTag, CALIBRATION_SCORES_HDF5_SUFFIX); + logger.info(String.format("%s calibration scores written to %s.", variantTypeString, labeledCalibrationAndVariantTypeScoresFile.getAbsolutePath())); + } else { + logger.warn(String.format("No %s calibration sites were available.", variantTypeString)); + } + + // negative model + if (availableLabelsMode == AvailableLabelsMode.POSITIVE_UNLABELED) { + if (numLabeledCalibrationAndVariantType == 0) { + throw new UserException.BadInput(String.format("Attempted to train %s negative model, " + + "but no suitable calibration sites were found in the provided annotations.", variantTypeString)); + } + final double[][] unlabeledAnnotations = LabeledVariantAnnotationsData.readAnnotations(inputUnlabeledAnnotationsFile); + final List unlabeledIsSNP = LabeledVariantAnnotationsData.readLabel(inputUnlabeledAnnotationsFile, "snp"); + final List isUnlabeledVariantType = variantType == VariantType.SNP ? unlabeledIsSNP : unlabeledIsSNP.stream().map(x -> !x).collect(Collectors.toList()); + + final int numUnlabeledVariantType = numPassingFilter(isUnlabeledVariantType); + + if (numUnlabeledVariantType > 0) { + final File labeledCalibrationAndVariantTypeScoresFile = new File(outputPrefix + outputPrefixTag + CALIBRATION_SCORES_HDF5_SUFFIX); + final double[] labeledCalibrationAndVariantTypeScores = VariantAnnotationsScorer.readScores(labeledCalibrationAndVariantTypeScoresFile); + final double scoreThreshold = calibrationSensitivityThreshold == 1. // Percentile requires quantile > 0, so we treat this as a special case + ? Doubles.min(labeledCalibrationAndVariantTypeScores) + : new Percentile(100. * (1. 
- calibrationSensitivityThreshold)).evaluate(labeledCalibrationAndVariantTypeScores); + logger.info(String.format("Using %s score threshold of %.4f corresponding to specified calibration-sensitivity threshold of %.4f ...", + variantTypeString, scoreThreshold, calibrationSensitivityThreshold)); + + final double[] labeledTrainingAndVariantTypeScores = VariantAnnotationsScorer.readScores(labeledTrainingAndVariantTypeScoresFile); + final List isNegativeTrainingFromLabeledTrainingAndVariantType = Arrays.stream(labeledTrainingAndVariantTypeScores).boxed().map(s -> s < scoreThreshold).collect(Collectors.toList()); + final int numNegativeTrainingFromLabeledTrainingAndVariantType = numPassingFilter(isNegativeTrainingFromLabeledTrainingAndVariantType); + logger.info(String.format("Selected %d labeled %s sites below score threshold of %.4f for negative-model training...", + numNegativeTrainingFromLabeledTrainingAndVariantType, variantTypeString, scoreThreshold)); + + logger.info(String.format("Scoring %d unlabeled %s sites...", numUnlabeledVariantType, variantTypeString)); + final File unlabeledVariantTypeAnnotationsFile = LabeledVariantAnnotationsData.subsetAnnotationsToTemporaryFile(annotationNames, unlabeledAnnotations, isUnlabeledVariantType); + final File unlabeledVariantTypeScoresFile = score(unlabeledVariantTypeAnnotationsFile, outputPrefixTag, UNLABELED_SCORES_HDF5_SUFFIX); + final double[] unlabeledVariantTypeScores = VariantAnnotationsScorer.readScores(unlabeledVariantTypeScoresFile); + final List isNegativeTrainingFromUnlabeledVariantType = Arrays.stream(unlabeledVariantTypeScores).boxed().map(s -> s < scoreThreshold).collect(Collectors.toList()); // length matches unlabeledAnnotationsFile + final int numNegativeTrainingFromUnlabeledVariantType = numPassingFilter(isNegativeTrainingFromUnlabeledVariantType); + logger.info(String.format("Selected %d unlabeled %s sites below score threshold of %.4f for negative-model training...", + 
numNegativeTrainingFromUnlabeledVariantType, variantTypeString, scoreThreshold)); + + final double[][] negativeTrainingAndVariantTypeAnnotations = concatenateLabeledAndUnlabeledNegativeTrainingData( + annotationNames, annotations, unlabeledAnnotations, isNegativeTrainingFromLabeledTrainingAndVariantType, isNegativeTrainingFromUnlabeledVariantType); + final int numNegativeTrainingAndVariantType = negativeTrainingAndVariantTypeAnnotations.length; + final List isNegativeTrainingAndVariantType = Collections.nCopies(numNegativeTrainingAndVariantType, true); + + logger.info(String.format("Training %s negative model with %d negative-training sites x %d annotations %s...", + variantTypeString, numNegativeTrainingAndVariantType, annotationNames.size(), annotationNames)); + final File negativeTrainingAnnotationsFile = LabeledVariantAnnotationsData.subsetAnnotationsToTemporaryFile( + annotationNames, negativeTrainingAndVariantTypeAnnotations, isNegativeTrainingAndVariantType); + trainAndSerializeModel(negativeTrainingAnnotationsFile, outputPrefixTag + NEGATIVE_TAG); + logger.info(String.format("%s negative model trained and serialized with output prefix \"%s\".", variantTypeString, outputPrefix + outputPrefixTag + NEGATIVE_TAG)); + + logger.info(String.format("Re-scoring %d %s calibration sites...", numLabeledCalibrationAndVariantType, variantTypeString)); + final File labeledCalibrationAnnotationsFile = LabeledVariantAnnotationsData.subsetAnnotationsToTemporaryFile(annotationNames, annotations, isLabeledCalibrationAndVariantType); + final File labeledCalibrationScoresFile = positiveNegativeScore(labeledCalibrationAnnotationsFile, outputPrefixTag, CALIBRATION_SCORES_HDF5_SUFFIX); + logger.info(String.format("Calibration scores written to %s.", labeledCalibrationScoresFile.getAbsolutePath())); + } else { + throw new UserException.BadInput(String.format("Attempted to train %s negative model, " + + "but no suitable unlabeled sites were found in the provided annotations.", 
variantTypeString)); + } + } + } else { + throw new UserException.BadInput(String.format("Attempted to train %s model, " + + "but no suitable training sites were found in the provided annotations.", variantTypeString)); + } + } + + private static int numPassingFilter(final List isPassing) { + return (int) isPassing.stream().filter(x -> x).count(); + } + + private void trainAndSerializeModel(final File trainingAnnotationsFile, + final String outputPrefixTag) { + readAndValidateTrainingAnnotations(trainingAnnotationsFile, outputPrefixTag); + final VariantAnnotationsModel model; + switch (modelBackend) { + case JAVA_BGMM: + model = new BGMMVariantAnnotationsModel(hyperparametersJSONFile); + break; + case PYTHON_IFOREST: + model = new PythonSklearnVariantAnnotationsModel(pythonScriptFile, hyperparametersJSONFile); + break; + case PYTHON_SCRIPT: + model = new PythonSklearnVariantAnnotationsModel(pythonScriptFile, hyperparametersJSONFile); + break; + default: + throw new GATKException.ShouldNeverReachHereException("Unknown model mode."); + } + model.trainAndSerialize(trainingAnnotationsFile, outputPrefix + outputPrefixTag); + } + + /** + * When training models on data that has been subset to a given variant type, + * we FAIL if any annotation is completely missing and WARN if any annotation has zero variance. 
+ */ + private void readAndValidateTrainingAnnotations(final File trainingAnnotationsFile, + final String outputPrefixTag) { + final List annotationNames = LabeledVariantAnnotationsData.readAnnotationNames(trainingAnnotationsFile); + final double[][] annotations = LabeledVariantAnnotationsData.readAnnotations(trainingAnnotationsFile); + + // these checks are redundant, but we err on the side of robustness + final int numAnnotationNames = annotationNames.size(); + final int numData = annotations.length; + Utils.validateArg(numAnnotationNames > 0, "Number of annotation names must be positive."); + Utils.validateArg(numData > 0, "Number of data points must be positive."); + final int numFeatures = annotations[0].length; + Utils.validateArg(numAnnotationNames == numFeatures, + "Number of annotation names must match the number of features in the annotation data."); + + final List completelyMissingAnnotationNames = new ArrayList<>(numFeatures); + IntStream.range(0, numFeatures).forEach( + i -> { + if (new Variance().evaluate(IntStream.range(0, numData).mapToDouble(n -> annotations[n][i]).toArray()) == 0.) { + logger.warn(String.format("All values of the annotation %s are identical in the training data for the %s model.", + annotationNames.get(i), outputPrefix + outputPrefixTag)); + } + if (IntStream.range(0, numData).boxed().map(n -> annotations[n][i]).allMatch(x -> Double.isNaN(x))) { + completelyMissingAnnotationNames.add(annotationNames.get(i)); + } + } + ); + + if (!completelyMissingAnnotationNames.isEmpty()) { + throw new UserException.BadInput( + String.format("All values of the following annotations are missing in the training data for the %s model: %s. " + + "Consider repeating the extraction step with this annotation dropped. 
" + + "If this is a negative model and the amount of negative training data is small, " + + "perhaps also consider lowering the value of the %s argument so that more " + + "training data is considered, which may ultimately admit data with non-missing values for the annotation " + + "(although note that this will also have implications for the resulting model fit); " + + "alternatively, consider excluding the %s and %s arguments and running positive-only modeling.", + outputPrefix + outputPrefixTag, completelyMissingAnnotationNames, + CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME, UNLABELED_ANNOTATIONS_HDF5_LONG_NAME, CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME)); + } + } + + private File score(final File annotationsFile, + final String outputPrefixTag, + final String outputSuffix) { + final VariantAnnotationsScorer scorer; + switch (modelBackend) { + case JAVA_BGMM: + scorer = BGMMVariantAnnotationsScorer.deserialize(new File(outputPrefix + outputPrefixTag + BGMMVariantAnnotationsScorer.BGMM_SCORER_SER_SUFFIX)); + break; + case PYTHON_IFOREST: + case PYTHON_SCRIPT: + scorer = new PythonSklearnVariantAnnotationsScorer(pythonScriptFile, new File(outputPrefix + outputPrefixTag + PythonSklearnVariantAnnotationsScorer.PYTHON_SCORER_PKL_SUFFIX)); + break; + + default: + throw new GATKException.ShouldNeverReachHereException("Unknown model mode."); + } + final File outputScoresFile = new File(outputPrefix + outputPrefixTag + outputSuffix); + scorer.score(annotationsFile, outputScoresFile); + return outputScoresFile; + } + + private File positiveNegativeScore(final File annotationsFile, + final String outputPrefixTag, + final String outputSuffix) { + final VariantAnnotationsScorer scorer; + switch (modelBackend) { + case JAVA_BGMM: + scorer = VariantAnnotationsScorer.combinePositiveAndNegativeScorer( + BGMMVariantAnnotationsScorer.deserialize(new File(outputPrefix + outputPrefixTag + BGMMVariantAnnotationsScorer.BGMM_SCORER_SER_SUFFIX)), + 
BGMMVariantAnnotationsScorer.deserialize(new File(outputPrefix + outputPrefixTag + NEGATIVE_TAG + BGMMVariantAnnotationsScorer.BGMM_SCORER_SER_SUFFIX))); + break; + case PYTHON_IFOREST: + case PYTHON_SCRIPT: + scorer = VariantAnnotationsScorer.combinePositiveAndNegativeScorer( + new PythonSklearnVariantAnnotationsScorer(pythonScriptFile, new File(outputPrefix + outputPrefixTag + PythonSklearnVariantAnnotationsScorer.PYTHON_SCORER_PKL_SUFFIX)), + new PythonSklearnVariantAnnotationsScorer(pythonScriptFile, new File(outputPrefix + outputPrefixTag + NEGATIVE_TAG + PythonSklearnVariantAnnotationsScorer.PYTHON_SCORER_PKL_SUFFIX))); + break; + default: + throw new GATKException.ShouldNeverReachHereException("Unknown model mode."); + } + final File outputScoresFile = new File(outputPrefix + outputPrefixTag + outputSuffix); + scorer.score(annotationsFile, outputScoresFile); + return outputScoresFile; + } + + private static double[][] concatenateLabeledAndUnlabeledNegativeTrainingData(final List annotationNames, + final double[][] annotations, + final double[][] unlabeledAnnotations, + final List isNegativeTrainingFromLabeledTrainingAndVariantType, + final List isNegativeTrainingFromUnlabeledVariantType) { + final File negativeTrainingFromLabeledTrainingAndVariantTypeAnnotationsFile = + LabeledVariantAnnotationsData.subsetAnnotationsToTemporaryFile(annotationNames, annotations, isNegativeTrainingFromLabeledTrainingAndVariantType); + final double[][] negativeTrainingFromLabeledTrainingAndVariantTypeAnnotations = LabeledVariantAnnotationsData.readAnnotations(negativeTrainingFromLabeledTrainingAndVariantTypeAnnotationsFile); + + final File negativeTrainingFromUnlabeledVariantTypeAnnotationsFile = + LabeledVariantAnnotationsData.subsetAnnotationsToTemporaryFile(annotationNames, unlabeledAnnotations, isNegativeTrainingFromUnlabeledVariantType); + final double[][] negativeTrainingFromUnlabeledVariantTypeAnnotations = 
LabeledVariantAnnotationsData.readAnnotations(negativeTrainingFromUnlabeledVariantTypeAnnotationsFile); + + return Streams.concat( + Arrays.stream(negativeTrainingFromLabeledTrainingAndVariantTypeAnnotations), + Arrays.stream(negativeTrainingFromUnlabeledVariantTypeAnnotations)).toArray(double[][]::new); + } +} \ No newline at end of file diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/data/LabeledVariantAnnotationsData.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/data/LabeledVariantAnnotationsData.java new file mode 100644 index 00000000000..75d8046f09a --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/data/LabeledVariantAnnotationsData.java @@ -0,0 +1,284 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data; + +import com.google.common.collect.ImmutableList; +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.VariantContext; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.broadinstitute.hdf5.HDF5File; +import org.broadinstitute.hdf5.HDF5LibException; +import org.broadinstitute.hellbender.exceptions.GATKException; +import org.broadinstitute.hellbender.tools.copynumber.utils.HDF5Utils; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.VariantAnnotationsModel; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.VariantAnnotationsScorer; +import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.utils.io.IOUtils; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.TreeSet; +import java.util.stream.Collectors; +import java.util.stream.IntStream; +import java.util.stream.Stream; + +/** + * Represents a collection of {@link 
LabeledVariantAnnotationsDatum} as a list of lists of datums. + * The outer list is always per-variant. In allele-specific mode, each datum in the inner lists + * corresponds to a single allele; otherwise, each inner list trivially contains a single datum corresponding + * to the variant. + */ +public final class LabeledVariantAnnotationsData { + private static final Logger logger = LogManager.getLogger(LabeledVariantAnnotationsData.class); + + // chunk size in temporary annotation files + // TODO this could be exposed + private static final int CHUNK_DIVISOR = 16; + private static final int MAXIMUM_CHUNK_SIZE = HDF5Utils.MAX_NUMBER_OF_VALUES_PER_HDF5_MATRIX / CHUNK_DIVISOR; + + private static final int INITIAL_SIZE = 10_000_000; + + public static final String TRAINING_LABEL = "training"; + public static final String CALIBRATION_LABEL = "calibration"; + public static final String SNP_LABEL = "snp"; + + public static final String INTERVALS_PATH = "/intervals"; + public static final String ALLELES_REF_PATH = "/alleles/ref"; + public static final String ALLELES_ALT_PATH = "/alleles/alt"; + public static final String ANNOTATIONS_NAMES_PATH = "/annotations/names"; + public static final String ANNOTATIONS_PATH = "/annotations"; + public static final String LABELS_PATH = "/labels"; + public static final String LABELS_SNP_PATH = LABELS_PATH + "/snp"; + + private final List sortedAnnotationNames; + final List sortedLabels; + + private final List> data; + private final boolean useASAnnotations; + + public LabeledVariantAnnotationsData(final Collection annotationNames, + final Collection labels, + final boolean useASAnnotations, + final int initialSize) { + data = new ArrayList<>(initialSize); + sortedAnnotationNames = ImmutableList.copyOf(annotationNames.stream().distinct().sorted().collect(Collectors.toList())); + Utils.validateArg(sortedAnnotationNames.size() > 0, "Number of annotation names must be positive."); + if (sortedAnnotationNames.size() != annotationNames.size()) 
{ + logger.warn(String.format("Ignoring duplicate annotations: %s.", Utils.getDuplicatedItems(annotationNames))); + } + sortedLabels = ImmutableList.copyOf(labels.stream().distinct().sorted().collect(Collectors.toList())); + if (sortedLabels.size() != labels.size()) { + logger.warn(String.format("Ignoring duplicate labels: %s.", Utils.getDuplicatedItems(labels))); + } + this.useASAnnotations = useASAnnotations; + } + + public LabeledVariantAnnotationsData(final Collection annotationNames, + final Collection labels, + final boolean useASAnnotations) { + this(annotationNames, labels, useASAnnotations, INITIAL_SIZE); + } + + public List getSortedAnnotationNames() { + return sortedAnnotationNames; + } + + public List getSortedLabels() { + return sortedLabels; + } + + public int size() { + return data.size(); + } + + public void clear() { + data.clear(); + } + + /** + * Adds an element to the underlying {@link #data} collection. + */ + public void add(final VariantContext vc, + final List> altAllelesPerDatum, + final List variantTypePerDatum, + final List> labelsPerDatum) { + if (!useASAnnotations) { + data.add(Collections.singletonList(new LabeledVariantAnnotationsDatum( + vc, altAllelesPerDatum.get(0), variantTypePerDatum.get(0), labelsPerDatum.get(0), sortedAnnotationNames, useASAnnotations))); + } else { + data.add(IntStream.range(0, altAllelesPerDatum.size()).boxed() + .map(i -> new LabeledVariantAnnotationsDatum( + vc, altAllelesPerDatum.get(i), variantTypePerDatum.get(i), labelsPerDatum.get(i), sortedAnnotationNames, useASAnnotations)) + .collect(Collectors.toList())); + } + } + + /** + * Sets the element at a specified index in the underlying {@link #data} collection. 
+ */ + public void set(final int index, + final VariantContext vc, + final List> altAllelesPerDatum, + final List variantTypePerDatum, + final List> labelsPerDatum) { + if (!useASAnnotations) { + data.set(index, Collections.singletonList(new LabeledVariantAnnotationsDatum( + vc, altAllelesPerDatum.get(0), variantTypePerDatum.get(0), labelsPerDatum.get(0), sortedAnnotationNames, useASAnnotations))); + } else { + data.set(index, IntStream.range(0, altAllelesPerDatum.size()).boxed() + .map(i -> new LabeledVariantAnnotationsDatum( + vc, altAllelesPerDatum.get(i), variantTypePerDatum.get(i), labelsPerDatum.get(i), sortedAnnotationNames, useASAnnotations)) + .collect(Collectors.toList())); + } + } + + /** + * @return list of {@link VariantType} indicators, with length given by the number of corresponding sites + */ + public List getVariantTypeFlat() { + return streamFlattenedData().map(datum -> datum.variantType).collect(Collectors.toList()); + } + + /** + * @return list of boolean label indicators, with length given by the number of sites; + * an element in the list will be true if the corresponding site is assigned to the specified label + */ + public List isLabelFlat(final String label) { + return streamFlattenedData().map(datum -> datum.labels.contains(label)).collect(Collectors.toList()); + } + + private Stream streamFlattenedData() { + return data.stream().flatMap(List::stream); + } + + /** + * Writes a representation of the collection to an HDF5 file with the following directory structure: + * + *

+ * |--- alleles
+ * | |--- alt
+ * | |--- ref
+ * |--- annotations
+ * | |--- chunk_0
+ * | |--- ...
+ * | |--- chunk_{num_chunks - 1}
+ * | |--- names
+ * | |--- num_chunks
+ * | |--- num_columns
+ * | |--- num_rows
+ * |--- intervals
+ * | |--- indexed_contig_names
+ * | |--- transposed_index_start_end
+ * |--- labels
+ * | |--- snp
+ * | |--- ... (e.g., training, calibration, etc.)
+ * | |--- ...
+ *

+ * + * Here, each chunk is a double matrix, with dimensions given by (number of sites in the chunk) x (number of annotations). + * See the methods {@link HDF5Utils#writeChunkedDoubleMatrix} and {@link HDF5Utils#writeIntervals} for additional details. + * + * @param omitAllelesInHDF5 string arrays containing ref/alt alleles can be large, so we allow the option of omitting them + */ + public void writeHDF5(final File outputFile, + final boolean omitAllelesInHDF5) { + + try (final HDF5File outputHDF5File = new HDF5File(outputFile, HDF5File.OpenMode.CREATE)) { + IOUtils.canReadFile(outputHDF5File.getFile()); + HDF5Utils.writeIntervals(outputHDF5File, INTERVALS_PATH, + streamFlattenedData().map(datum -> datum.interval).collect(Collectors.toList())); + if (!omitAllelesInHDF5) { + outputHDF5File.makeStringArray(ALLELES_REF_PATH, + streamFlattenedData().map(datum -> datum.refAllele.getDisplayString()).toArray(String[]::new)); + if (!useASAnnotations) { + outputHDF5File.makeStringArray(ALLELES_ALT_PATH, + streamFlattenedData() + .map(datum -> datum.altAlleles.stream().map(Allele::getDisplayString).collect(Collectors.joining(","))) + .toArray(String[]::new)); + } else { + outputHDF5File.makeStringArray(ALLELES_ALT_PATH, + streamFlattenedData().map(datum -> datum.altAlleles.get(0).getDisplayString()).toArray(String[]::new)); + } + } + outputHDF5File.makeStringArray(ANNOTATIONS_NAMES_PATH, sortedAnnotationNames.toArray(new String[0])); + HDF5Utils.writeChunkedDoubleMatrix(outputHDF5File, ANNOTATIONS_PATH, + streamFlattenedData().map(datum -> datum.annotations).toArray(double[][]::new), MAXIMUM_CHUNK_SIZE); + outputHDF5File.makeDoubleArray(LABELS_SNP_PATH, + streamFlattenedData().mapToDouble(datum -> datum.variantType == VariantType.SNP ? 1 : 0).toArray()); + for (final String label : sortedLabels) { + outputHDF5File.makeDoubleArray(String.format("%s/%s", LABELS_PATH, label), + streamFlattenedData().mapToDouble(datum -> datum.labels.contains(label) ? 
1 : 0).toArray()); + } + } catch (final HDF5LibException exception) { + throw new GATKException(String.format("Exception encountered during writing of annotations and metadata (%s). Output file at %s may be in a bad state.", + exception, outputFile.getAbsolutePath())); + } + } + + /** + * @return list of annotation names, with length given by the number of annotations, read from the specified file + */ + public static List readAnnotationNames(final File annotationsFile) { + try (final HDF5File annotationsHDF5File = new HDF5File(annotationsFile, HDF5File.OpenMode.READ_ONLY)) { + IOUtils.canReadFile(annotationsHDF5File.getFile()); + return Arrays.asList(annotationsHDF5File.readStringArray(ANNOTATIONS_NAMES_PATH)); + } catch (final HDF5LibException exception) { + throw new GATKException(String.format("Exception encountered during reading of annotation names from %s: %s", + annotationsFile.getAbsolutePath(), exception)); + } + } + + /** + * @return matrix with dimensions (number of sites) x (number of annotations), read from the specified file + */ + public static double[][] readAnnotations(final File annotationsFile) { + try (final HDF5File annotationsHDF5File = new HDF5File(annotationsFile, HDF5File.OpenMode.READ_ONLY)) { + IOUtils.canReadFile(annotationsHDF5File.getFile()); + return HDF5Utils.readChunkedDoubleMatrix(annotationsHDF5File, ANNOTATIONS_PATH); + } catch (final HDF5LibException exception) { + throw new GATKException(String.format("Exception encountered during reading of annotations from %s: %s", + annotationsFile.getAbsolutePath(), exception)); + } + } + + /** + * @return list of boolean label indicators, with length given by the number of corresponding sites, read from the specified file; + * an element in the list will be true if the corresponding site is assigned to the specified label + */ + public static List readLabel(final File annotationsFile, + final String label) { + try (final HDF5File annotationsHDF5File = new HDF5File(annotationsFile, 
HDF5File.OpenMode.READ_ONLY)) { + IOUtils.canReadFile(annotationsHDF5File.getFile()); + return Arrays.stream(annotationsHDF5File.readDoubleArray(String.format("/labels/%s", label))).boxed().map(d -> d == 1).collect(Collectors.toList()); + } catch (final HDF5LibException exception) { + throw new GATKException(String.format("Exception encountered during reading of label %s from %s: %s", + label, annotationsFile.getAbsolutePath(), exception)); + } + } + + /** + * Subsets annotation data according to a boolean filter and writes a limited representation to a temporary HDF5 file. + * Intended for passing annotations via the file interfaces of {@link VariantAnnotationsModel} and {@link VariantAnnotationsScorer}. + */ + public static File subsetAnnotationsToTemporaryFile(final List annotationNames, + final double[][] allAnnotations, + final List isSubset) { + Utils.validateArg(annotationNames.size() > 0, "Number of annotation names must be positive."); + Utils.validateArg(allAnnotations.length > 0, "Number of annotation data points must be positive."); + Utils.validateArg(annotationNames.size() == allAnnotations[0].length, + "Number of annotation names must match number of features in annotation data."); + final double[][] subsetData = IntStream.range(0, isSubset.size()).boxed().filter(isSubset::get).map(i -> allAnnotations[i]).toArray(double[][]::new); + final File subsetAnnotationsFile = IOUtils.createTempFile("subset.annot", ".hdf5"); + try (final HDF5File subsetAnnotationsHDF5File = new HDF5File(subsetAnnotationsFile, HDF5File.OpenMode.CREATE)) { + subsetAnnotationsHDF5File.makeStringArray(ANNOTATIONS_NAMES_PATH, annotationNames.toArray(new String[0])); + HDF5Utils.writeChunkedDoubleMatrix(subsetAnnotationsHDF5File, ANNOTATIONS_PATH, subsetData, MAXIMUM_CHUNK_SIZE); + } catch (final HDF5LibException exception) { + throw new GATKException(String.format("Exception encountered during writing of annotations (%s). 
Output file at %s may be in a bad state.", + exception, subsetAnnotationsFile.getAbsolutePath())); + } + return subsetAnnotationsFile; + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/data/LabeledVariantAnnotationsDatum.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/data/LabeledVariantAnnotationsDatum.java new file mode 100644 index 00000000000..884529f5c56 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/data/LabeledVariantAnnotationsDatum.java @@ -0,0 +1,104 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSet; +import htsjdk.samtools.util.Locatable; +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.VariantContext; +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.LabeledVariantAnnotationsWalker; +import org.broadinstitute.hellbender.utils.SimpleInterval; +import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; + +import java.util.List; +import java.util.TreeSet; + +/** + * Represents metadata and annotations extracted from either a variant or a single alt allele (if in allele-specific mode). + * Intended to be package-private and accessed only by {@link LabeledVariantAnnotationsData}. + */ +final class LabeledVariantAnnotationsDatum implements Locatable { + final SimpleInterval interval; + final Allele refAllele; + final ImmutableList altAlleles; // in allele-specific mode, this contains a single alt allele; otherwise, it contains all alt alleles that passed variant-type checks + final VariantType variantType; + final ImmutableSet labels; // sorted TreeSet + final double[] annotations; // TODO use ImmutableDoubleArray? 
+ + LabeledVariantAnnotationsDatum(final VariantContext vc, + final List altAlleles, + final VariantType variantType, + final TreeSet labels, + final List sortedAnnotationNames, + final boolean useASAnnotations) { + Utils.validate(!useASAnnotations || altAlleles.size() == 1, + "Datum should only be associated with one alt allele in allele-specific mode."); + this.interval = new SimpleInterval(vc); + this.refAllele = vc.getReference(); + this.altAlleles = ImmutableList.copyOf(altAlleles); + this.variantType = variantType; + this.labels = ImmutableSet.copyOf(labels); + this.annotations = sortedAnnotationNames.stream() + .mapToDouble(a -> decodeAnnotation(vc, altAlleles, a, useASAnnotations)) + .toArray(); + } + + @Override + public String getContig() { + return interval.getContig(); + } + + @Override + public int getStart() { + return interval.getStart(); + } + + @Override + public int getEnd() { + return interval.getEnd(); + } + + // code mostly retained from VQSR; some exception catching added + private static double decodeAnnotation(final VariantContext vc, + final List altAlleles, + final String annotationName, + final boolean useASAnnotations) { + double value; + try { + // if we're in allele-specific mode and an allele-specific annotation has been requested, parse the appropriate value from the list + // TODO: can we trigger allele-specific parsing based on annotation prefix or some other logic? + if (useASAnnotations && annotationName.startsWith(GATKVCFConstants.ALLELE_SPECIFIC_PREFIX)) { + final List valueList = vc.getAttributeAsList(annotationName); + final Allele altAllele = altAlleles.get(0); + // FIXME: we need to look at the ref allele here too (SL: this comment was retained from VQSR code, I'm not sure what it means...) 
+ if (vc.hasAllele(altAllele)) { + final int altIndex = vc.getAlleleIndex(altAllele) - 1; //- 1 is to convert the index from all alleles (including reference) to just alternate alleles + try { + value = Double.parseDouble((String) valueList.get(altIndex)); + } catch (final IndexOutOfBoundsException e) { + throw new UserException(String.format("Could not extract annotation %s from variant context: %s. " + + "Encountered exception: %s", annotationName, vc, e)); + } + } else { + //if somehow our alleles got mixed up + throw new IllegalStateException("Allele " + altAllele + " is not contained in the input VariantContext."); + } + } else { + try { + value = vc.getAttributeAsDouble(annotationName, Double.NaN); + } catch (final ClassCastException e) { + throw new UserException(String.format("Could not extract annotation %s from variant context: %s. " + + "Ensure that %s is specified, if desired. Encountered exception: %s", + annotationName, vc, LabeledVariantAnnotationsWalker.USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME, e)); + } + } + if (Double.isInfinite(value)) { + value = Double.NaN; + } + } catch (final NumberFormatException e) { + value = Double.NaN; + } + return value; + } +} \ No newline at end of file diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/data/VariantType.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/data/VariantType.java new file mode 100644 index 00000000000..0c9560d76fc --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/data/VariantType.java @@ -0,0 +1,58 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data; + +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.VariantContext; + +/** + * This code and logic for determining variant types was mostly retained from VQSR. 
+ * Note that there may be some inconsistencies and room for improvement in these definitions; + * see comments in https://github.com/broadinstitute/gatk/pull/7954. + */ +public enum VariantType { + SNP, + INDEL; + + /** + * Returns true if both {@code vc} and {@code resourceVC} are the same variant type, + * following our definitions. + */ + public static boolean checkVariantType(final VariantContext vc, + final VariantContext resourceVC) { + switch (resourceVC.getType()) { + case SNP: + case MNP: + return getVariantType(vc) == SNP; + case INDEL: + case MIXED: + case SYMBOLIC: + return getVariantType(vc) == INDEL; + default: + return false; + } + } + + public static VariantType getVariantType(final VariantContext vc) { + if (vc.isSNP() || vc.isMNP()) { + return SNP; + } else if (vc.isStructuralIndel() || vc.isIndel() || vc.isMixed() || vc.isSymbolic()) { + return INDEL; + } else { + throw new IllegalStateException("Encountered unknown variant type: " + vc.getType()); + } + } + + /** + * Note that spanning deletions are expected to be filtered out upstream of this method + * to preserve VQSR behavior; we do not explicitly check this. + * See VariantDataManager#checkVariationClass(VariantContext, Allele, VariantRecalibratorArgumentCollection.Mode), + * from which this method originated. 
+ */ + public static VariantType getAlleleSpecificVariantType(final VariantContext vc, + final Allele allele) { + if (vc.getReference().length() == allele.length()) { + // note that spanning deletions would be considered SNPs by this logic + return SNP; + } + return INDEL; + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/BGMMVariantAnnotationsModel.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/BGMMVariantAnnotationsModel.java new file mode 100644 index 00000000000..14fedaa0a98 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/BGMMVariantAnnotationsModel.java @@ -0,0 +1,31 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling; + +import org.apache.commons.lang.NotImplementedException; + +import java.io.File; +import java.io.Serializable; + +// TODO this is just a stub, will be fleshed out in a separate PR +public final class BGMMVariantAnnotationsModel implements VariantAnnotationsModel { + + public BGMMVariantAnnotationsModel(final File hyperparametersJSONFile) { + throw new NotImplementedException("BGMM module will be implemented in separate PR."); + } + + @Override + public void trainAndSerialize(final File trainingAnnotationsFile, + final String outputPrefix) { + throw new NotImplementedException("BGMM module will be implemented in separate PR."); + } + + static final class Preprocesser implements Serializable { + private static final long serialVersionUID = 1L; + + Preprocesser() { + } + + double[][] transform(final double[][] data) { + throw new NotImplementedException("BGMM module will be implemented in separate PR."); + } + } +} \ No newline at end of file diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/BGMMVariantAnnotationsScorer.java 
b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/BGMMVariantAnnotationsScorer.java new file mode 100644 index 00000000000..5a51dcf8dfb --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/BGMMVariantAnnotationsScorer.java @@ -0,0 +1,67 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling; + +import org.apache.commons.lang.NotImplementedException; +import org.apache.logging.log4j.Logger; +import org.broadinstitute.hdf5.HDF5File; +import org.broadinstitute.hdf5.HDF5LibException; +import org.broadinstitute.hellbender.exceptions.GATKException; +import org.broadinstitute.hellbender.tools.copynumber.utils.HDF5Utils; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.LabeledVariantAnnotationsData; +import org.broadinstitute.hellbender.utils.clustering.BayesianGaussianMixtureModeller; +import org.broadinstitute.hellbender.utils.io.IOUtils; + +import java.io.File; +import java.io.Serializable; +import java.util.List; + +// TODO this is just a stub, will be fleshed out in a separate PR +public final class BGMMVariantAnnotationsScorer implements VariantAnnotationsScorer, Serializable { + + private static final long serialVersionUID = 1L; + + public static final String BGMM_SCORER_SER_SUFFIX = ".bgmmScorer.ser"; + + public BGMMVariantAnnotationsScorer(final List annotationNames, + final BGMMVariantAnnotationsModel.Preprocesser preprocesser, + final BayesianGaussianMixtureModeller bgmm) { + throw new NotImplementedException("BGMM module will be implemented in separate PR."); + } + + @Override + public void score(final File inputAnnotationsFile, + final File outputScoresFile) { + throw new NotImplementedException("BGMM module will be implemented in separate PR."); + } + + public double[][] preprocess(final double[][] annotations) { + throw new NotImplementedException("BGMM module will be implemented in separate PR."); + } + + public void serialize(final 
File scorerFile) { + throw new NotImplementedException("BGMM module will be implemented in separate PR."); + } + + public static BGMMVariantAnnotationsScorer deserialize(final File scorerFile) { + throw new NotImplementedException("BGMM module will be implemented in separate PR."); + } + + // TODO clean this up, copy more fields + public static void preprocessAnnotationsWithBGMMAndWriteHDF5(final List annotationNames, + final String outputPrefix, + final File labeledTrainingAndVariantTypeAnnotationsFile, + final Logger logger) { + final double[][] rawAnnotations = LabeledVariantAnnotationsData.readAnnotations(labeledTrainingAndVariantTypeAnnotationsFile); + final BGMMVariantAnnotationsScorer scorer = BGMMVariantAnnotationsScorer.deserialize(new File(outputPrefix + BGMM_SCORER_SER_SUFFIX)); + final double[][] preprocessedAnnotations = scorer.preprocess(rawAnnotations); + final File outputPreprocessedAnnotationsFile = new File(outputPrefix + ".annot.pre.hdf5"); + try (final HDF5File hdf5File = new HDF5File(outputPreprocessedAnnotationsFile, HDF5File.OpenMode.CREATE)) { + IOUtils.canReadFile(hdf5File.getFile()); + hdf5File.makeStringArray("/data/annotation_names", annotationNames.toArray(new String[0])); + HDF5Utils.writeChunkedDoubleMatrix(hdf5File, "/data/annotations", preprocessedAnnotations, HDF5Utils.MAX_NUMBER_OF_VALUES_PER_HDF5_MATRIX / 16); + } catch (final HDF5LibException exception) { + throw new GATKException(String.format("Exception encountered during writing of preprocessed annotations (%s). 
Output file at %s may be in a bad state.", + exception, outputPreprocessedAnnotationsFile.getAbsolutePath())); + } + logger.info(String.format("Preprocessed annotations written to %s.", outputPreprocessedAnnotationsFile.getAbsolutePath())); + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/PythonSklearnVariantAnnotationsModel.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/PythonSklearnVariantAnnotationsModel.java new file mode 100644 index 00000000000..bbe082186a3 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/PythonSklearnVariantAnnotationsModel.java @@ -0,0 +1,69 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling; + +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.utils.python.PythonScriptExecutor; +import org.broadinstitute.hellbender.utils.runtime.ProcessOutput; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +/** + * Given an HDF5 file containing annotations for a training set (in the format specified by + * {@link VariantAnnotationsModel#trainAndSerialize}), a Python script containing modeling code, + * and a JSON file containing hyperparameters, the {@link #trainAndSerialize} method can be used to train a model. + * + * The modeling script should take the arguments: {@code annotations_file}, {@code hyperparameters_json_file}, + * and {@code output_prefix}. The script is expected to generate the file {outputPrefix}.scorer.pkl. This file should + * contain a pickled Python lambda function to be used for generating scores from annotations in a subsequent test set. 
+ * The lambda should have the signature: + * + * lambda test_annotation_names_i, test_X_ni + * + * Here, test_annotation_names_i is a numpy array of strings containing the annotation names, and + * test_X_ni is a numpy matrix of float-valued annotations, with dimensions (number of data points) x (number of annotations). + * The lambda should check the test annotation names against the training annotation names and + * then return a numpy array of float-valued scores with length given by the number of data points. + * + * See src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/isolation-forest.py for an example implementation. + */ +public final class PythonSklearnVariantAnnotationsModel implements VariantAnnotationsModel { + + private final File pythonScriptFile; + private final File hyperparametersJSONFile; + + public PythonSklearnVariantAnnotationsModel(final File pythonScriptFile, + final File hyperparametersJSONFile) { + this.pythonScriptFile = pythonScriptFile; + this.hyperparametersJSONFile = hyperparametersJSONFile; + } + + @Override + public void trainAndSerialize(final File trainingAnnotationsFile, + final String outputPrefix) { + final PythonScriptExecutor executor = new PythonScriptExecutor(true); + final ProcessOutput pythonProcessOutput = executor.executeScriptAndGetOutput( + pythonScriptFile.getAbsolutePath(), + null, + composePythonArguments(trainingAnnotationsFile, hyperparametersJSONFile, outputPrefix)); + + if (pythonProcessOutput.getExitValue() != 0) { + throw executor.getScriptException(executor.getExceptionMessageFromScriptError(pythonProcessOutput)); + } + } + + private static List composePythonArguments(final File annotationsFile, + final File hyperparametersJSONFile, + final String outputPrefix) { + try { + return new ArrayList<>(Arrays.asList( + "--annotations_file=" + annotationsFile.getCanonicalPath(), + "--hyperparameters_json_file=" + hyperparametersJSONFile.getCanonicalPath(), + "--output_prefix=" + 
outputPrefix)); + } catch (final IOException e) { + throw new UserException.BadInput(String.format("Encountered exception resolving canonical file paths: %s", e)); + } + } +} \ No newline at end of file diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/PythonSklearnVariantAnnotationsScorer.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/PythonSklearnVariantAnnotationsScorer.java new file mode 100644 index 00000000000..51e4e9a4e9b --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/PythonSklearnVariantAnnotationsScorer.java @@ -0,0 +1,69 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling; + +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.utils.python.PythonScriptExecutor; +import org.broadinstitute.hellbender.utils.runtime.ProcessOutput; + +import java.io.File; +import java.io.IOException; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +/** + * Given an HDF5 file containing annotations for a test set (in the format specified by + * {@link VariantAnnotationsScorer#score}), a Python script containing scoring code, + * and a file containing a pickled Python lambda function for scoring, + * the {@link #score} method can be used to generate scores. + * + * The scoring script should take the arguments: {@code annotations_file}, {@code scorer_pkl_file}, + * and {@code output_scores_file}. The script is expected to load both the annotations and the pickled scoring function, + * which are then used to generate the file {outputPrefix}.scores.hdf5. This HDF5 file should contain + * a double array of the scores in {@value SCORES_PATH}, in the same order as the corresponding data points + * in the provided annotations. 
+ * + * See src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/isolation-forest.py for an example implementation. + */ +public final class PythonSklearnVariantAnnotationsScorer implements VariantAnnotationsScorer, Serializable { + + private static final long serialVersionUID = 1L; + + public static final String PYTHON_SCORER_PKL_SUFFIX = ".scorer.pkl"; + + private final File pythonScriptFile; + private final File scorerPklFile; + + public PythonSklearnVariantAnnotationsScorer(final File pythonScriptFile, + final File scorerPklFile) { + this.pythonScriptFile = pythonScriptFile; + this.scorerPklFile = scorerPklFile; + } + + @Override + public void score(final File inputAnnotationsFile, + final File outputScoresFile) { + final PythonScriptExecutor executor = new PythonScriptExecutor(true); + final ProcessOutput pythonProcessOutput = executor.executeScriptAndGetOutput( + pythonScriptFile.getAbsolutePath(), + null, + composePythonArguments(inputAnnotationsFile, scorerPklFile, outputScoresFile)); + + if (pythonProcessOutput.getExitValue() != 0) { + throw executor.getScriptException(executor.getExceptionMessageFromScriptError(pythonProcessOutput)); + } + } + + private static List composePythonArguments(final File annotationsFile, + final File scorerPklFile, + final File outputScoresFile) { + try { + return new ArrayList<>(Arrays.asList( + "--annotations_file=" + annotationsFile.getCanonicalPath(), + "--scorer_pkl_file=" + scorerPklFile.getCanonicalPath(), + "--output_scores_file=" + outputScoresFile.getCanonicalPath())); + } catch (final IOException e) { + throw new UserException.BadInput(String.format("Encountered exception resolving canonical file paths: %s", e)); + } + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/VariantAnnotationsModel.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/VariantAnnotationsModel.java new file mode 100644 index 
00000000000..ee2e899d0a8 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/VariantAnnotationsModel.java @@ -0,0 +1,46 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling; + +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.LabeledVariantAnnotationsData; + +import java.io.File; + +/** + * File interface for passing annotations to a modeling backend and indicating a path prefix for resulting output. + */ +public interface VariantAnnotationsModel { + + /** + * @param trainingAnnotationsFile Training annotations in HDF5 format, containing at least the directory structure + * + *

+ * |--- annotations
+ * | |--- chunk_0
+ * | |--- ...
+ * | |--- chunk_{num_chunks - 1}
+ * | |--- names
+ * | |--- num_chunks
+ * | |--- num_columns
+ * | |--- num_rows
+ *

+ * + * Here, each chunk is a double matrix, with dimensions given by + * (number of sites in the chunk) x (number of annotations). + * See {@link LabeledVariantAnnotationsData#writeHDF5}. + * + * Modeling backends are responsible for consuming annotations in this format + * and outputting a {@link VariantAnnotationsScorer} for each variant type + * with the appropriate output names. This responsibility includes the + * implementation of functionality that allows validation of annotation names + * in downstream {@link VariantAnnotationsScorer} instances. + * + * In current use, we assume that a single model will be trained, so either + * 1) training annotations have already been subset to a single variant type (SNP or INDEL), or + * 2) we assume the model does not care about the variant type. + * TODO we could also pass additional labels to be used in training, + * but all backends would have to likewise respect directory structure + * + * @param outputPrefix Path prefix for all output files + */ + void trainAndSerialize(final File trainingAnnotationsFile, + final String outputPrefix); +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/VariantAnnotationsModelBackend.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/VariantAnnotationsModelBackend.java new file mode 100644 index 00000000000..a4fa8460440 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/VariantAnnotationsModelBackend.java @@ -0,0 +1,16 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling; + +public enum VariantAnnotationsModelBackend { + // TODO will be added in a separate PR + JAVA_BGMM, + + /** + * Use the script at org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/isolation-forest.py + */ + PYTHON_IFOREST, + + /** + * Use a user-provided script. 
+ */ + PYTHON_SCRIPT +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/VariantAnnotationsScorer.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/VariantAnnotationsScorer.java new file mode 100644 index 00000000000..c0550273c57 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/modeling/VariantAnnotationsScorer.java @@ -0,0 +1,111 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling; + +import org.broadinstitute.hdf5.HDF5File; +import org.broadinstitute.hdf5.HDF5LibException; +import org.broadinstitute.hellbender.exceptions.GATKException; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.LabeledVariantAnnotationsData; +import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.utils.io.IOUtils; +import org.hipparchus.stat.fitting.EmpiricalDistribution; + +import java.io.File; +import java.util.Arrays; +import java.util.function.Function; +import java.util.stream.IntStream; + +/** + * File interface for passing annotations to a scoring backend and returning scores. + */ +public interface VariantAnnotationsScorer { + + String SCORES_PATH = "/data/scores"; // our HDF5 library does not allow writing to a bare/root path (e.g., /scores) + + /** + * @param inputAnnotationsFile Annotations to be scored in HDF5 format, containing at least the directory structure + * + *

+ * |--- annotations
+ * | |--- chunk_0
+ * | |--- ...
+ * | |--- chunk_{num_chunks - 1}
+ * | |--- names
+ * | |--- num_chunks
+ * | |--- num_columns
+ * | |--- num_rows
+ *

+ * + * Here, each chunk is a double matrix, with dimensions given by + * (number of sites in the chunk) x (number of annotations). + * See {@link LabeledVariantAnnotationsData#writeHDF5}. + * + * Scoring backends are responsible for consuming annotations in this format and + * outputting a double array of scores to file. This responsibility includes + * validation of annotation names. + * + * @param outputScoresFile Output file in HDF5 format, containing scores at {@link VariantAnnotationsScorer#SCORES_PATH}. + */ + void score(final File inputAnnotationsFile, + final File outputScoresFile); + + /** + * Given scores for a calibration set, returns a function for converting a subsequent score to a + * sensitivity with respect to that calibration set. This function is simply given by 1 - ECDF, + * where ECDF is the empirical cumulative distribution function of the calibration scores; + * see here. + * For example, a score that is very low relative to the calibration scores would yield a + * high calibration sensitivity; that is, using this score as the minimum allowable threshold for filtering + * would result in a high sensitivity with respect to the calibration set. + * + * @param calibrationScores must all be finite + */ + static Function createScoreToCalibrationSensitivityConverter(final double[] calibrationScores) { + Utils.validateArg(Arrays.stream(calibrationScores).allMatch(Double::isFinite), + "Calibration scores must all be finite."); + final EmpiricalDistribution empiricalDistribution = new EmpiricalDistribution(); + empiricalDistribution.load(calibrationScores); + return score -> 1. - empiricalDistribution.cumulativeProbability(score); + } + + /** + * Reads a double array of scores from {@value SCORES_PATH} in an HDF5 file. 
+ */ + static double[] readScores(final File inputFile) { + try (final HDF5File inputHDF5File = new HDF5File(inputFile, HDF5File.OpenMode.READ_ONLY)) { + IOUtils.canReadFile(inputHDF5File.getFile()); + return inputHDF5File.readDoubleArray(SCORES_PATH); + } catch (final HDF5LibException exception) { + throw new GATKException(String.format("Exception encountered during reading of scores from %s: %s", + inputFile.getAbsolutePath(), exception)); + } + } + + /** + * Writes a double array of scores to {@value SCORES_PATH} in an HDF5 file. + */ + static void writeScores(final File outputFile, + final double[] scores) { + try (final HDF5File outputHDF5File = new HDF5File(outputFile, HDF5File.OpenMode.CREATE)) { + outputHDF5File.makeDoubleArray(SCORES_PATH, scores); + } catch (final HDF5LibException exception) { + throw new GATKException(String.format("Exception encountered during writing of scores (%s). Output file at %s may be in a bad state.", + exception, outputFile.getAbsolutePath())); + } + } + + /** + * Yields a VQSR-style positive-negative scorer that returns the difference of the positive score and the negative score. 
+ */ + static VariantAnnotationsScorer combinePositiveAndNegativeScorer(final VariantAnnotationsScorer positiveScorer, + final VariantAnnotationsScorer negativeScorer) { + return (inputAnnotationsFile, outputScoresFile) -> { + final File tempPositiveScoresFile = IOUtils.createTempFile("positive", "scores.hdf5"); + final File tempNegativeScoresFile = IOUtils.createTempFile("negative", "scores.hdf5"); + positiveScorer.score(inputAnnotationsFile, tempPositiveScoresFile); + final double[] positiveScores = VariantAnnotationsScorer.readScores(tempPositiveScoresFile); + negativeScorer.score(inputAnnotationsFile, tempNegativeScoresFile); + final double[] negativeScores = VariantAnnotationsScorer.readScores(tempNegativeScoresFile); + final double[] scores = IntStream.range(0, positiveScores.length).mapToDouble(i -> positiveScores[i] - negativeScores[i]).toArray(); + VariantAnnotationsScorer.writeScores(outputScoresFile, scores); + }; + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/utils/MathUtils.java b/src/main/java/org/broadinstitute/hellbender/utils/MathUtils.java index 32b0123b2d7..4ca56eae151 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/MathUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/MathUtils.java @@ -31,6 +31,7 @@ public final class MathUtils { public static final double LOG10_ONE_HALF = Math.log10(0.5); public static final double LOG10_ONE_THIRD = -Math.log10(3.0); public static final double LOG_ONE_THIRD = -Math.log(3.0); + public static final double LOG_2 = Math.log(2.0); public static final double INV_LOG_2 = 1.0 / Math.log(2.0); private static final double LOG_10 = Math.log(10); private static final double INV_LOG_10 = 1.0 / LOG_10; diff --git a/src/main/java/org/broadinstitute/hellbender/utils/NaturalLogUtils.java b/src/main/java/org/broadinstitute/hellbender/utils/NaturalLogUtils.java index 6de748f01f7..55f7b9d8909 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/NaturalLogUtils.java +++ 
b/src/main/java/org/broadinstitute/hellbender/utils/NaturalLogUtils.java @@ -81,7 +81,7 @@ public static double logSumExp(final double... logValues) { } } if ( Double.isNaN(sum) || sum == Double.POSITIVE_INFINITY ) { - throw new IllegalArgumentException("log10 p: Values must be non-infinite and non-NAN"); + throw new IllegalArgumentException("logValues must be non-infinite and non-NAN"); } return maxValue + (sum != 1.0 ? Math.log(sum) : 0.0); } diff --git a/src/main/java/org/broadinstitute/hellbender/utils/clustering/BayesianGaussianMixtureModeller.java b/src/main/java/org/broadinstitute/hellbender/utils/clustering/BayesianGaussianMixtureModeller.java new file mode 100644 index 00000000000..fc759db3e9d --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/utils/clustering/BayesianGaussianMixtureModeller.java @@ -0,0 +1,35 @@ +package org.broadinstitute.hellbender.utils.clustering; + +import org.apache.commons.lang.NotImplementedException; +import org.apache.commons.math3.linear.RealMatrix; +import org.apache.commons.math3.linear.RealVector; + +import java.io.Serializable; + +public final class BayesianGaussianMixtureModeller implements Serializable { + private static final long serialVersionUID = 1L; + + public enum InitMethod { + K_MEANS_PLUS_PLUS, RANDOM, TEST + } + + private BayesianGaussianMixtureModeller(final int nComponents, + final double tol, + final double regCovar, + final int maxIter, + final int nInit, + final InitMethod initMethod, + final double weightConcentrationPrior, + final double meanPrecisionPrior, + final RealVector meanPrior, + final Double degreesOfFreedomPrior, + final RealMatrix covariancePrior, + final int seed, + final boolean warmStart, + final int verboseInterval, + final double relativeSymmetryThreshold, + final double absolutePositivityThreshold, + final double epsilon) { + throw new NotImplementedException("BGMM module implemented in separate PR."); + } +} \ No newline at end of file diff --git 
a/src/main/python/org/broadinstitute/hellbender/gcnvkernel/models/model_denoising_calling.py b/src/main/python/org/broadinstitute/hellbender/gcnvkernel/models/model_denoising_calling.py index f5d525ace74..cefebd1cf2b 100644 --- a/src/main/python/org/broadinstitute/hellbender/gcnvkernel/models/model_denoising_calling.py +++ b/src/main/python/org/broadinstitute/hellbender/gcnvkernel/models/model_denoising_calling.py @@ -786,8 +786,7 @@ def __init__(self, # the expected number of erroneously mapped reads mean_mapping_error_correction_s = eps_mapping * read_depth_s * shared_workspace.average_ploidy_s - denoised_copy_ratio_st = ((shared_workspace.n_st - mean_mapping_error_correction_s.dimshuffle(0, 'x')) - / ((1.0 - eps_mapping) * read_depth_s.dimshuffle(0, 'x') * bias_st)) + denoised_copy_ratio_st = shared_workspace.n_st / (read_depth_s.dimshuffle(0, 'x') * bias_st) Deterministic(name='denoised_copy_ratio_st', var=denoised_copy_ratio_st) diff --git a/src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/isolation-forest-hyperparameters.json b/src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/isolation-forest-hyperparameters.json new file mode 100644 index 00000000000..172b8aa42eb --- /dev/null +++ b/src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/isolation-forest-hyperparameters.json @@ -0,0 +1,3 @@ +{ + "random_state": 0 +} \ No newline at end of file diff --git a/src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/isolation-forest.py b/src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/isolation-forest.py new file mode 100644 index 00000000000..554817162b2 --- /dev/null +++ b/src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/isolation-forest.py @@ -0,0 +1,138 @@ +import argparse +import h5py +import sklearn.ensemble +import sklearn.impute +import numpy as np +import dill +import json + + +def 
read_annotations(h5file): + with h5py.File(h5file, 'r') as f: + annotation_names_i = f['/annotations/names'][()].astype(str) + + # read chunked annotations + num_chunks = int(f['/annotations/num_chunks'][()]) + num_columns = int(f['/annotations/num_columns'][()]) + num_rows = int(f['/annotations/num_rows'][()]) + X_ni = np.zeros((num_rows, num_columns)) + n = 0 + for chunk_index in range(num_chunks): + chunk_ni = f[f'/annotations/chunk_{chunk_index}'][()] + num_rows_in_chunk = len(chunk_ni) + X_ni[n:n + num_rows_in_chunk, :] = chunk_ni + n += num_rows_in_chunk + assert n == num_rows + return annotation_names_i, X_ni + + +def train(annotations_file, + hyperparameters_json_file, + output_prefix): + print('Reading annotations...') + annotation_names_i, X_ni = read_annotations(annotations_file) + print(f'Annotations: {annotation_names_i}.') + + print('Reading hyperparameters...') + with open(hyperparameters_json_file) as json_file: + hyperparameters_kwargs = json.load(json_file) + print('Hyperparameters:', hyperparameters_kwargs) + + print('Imputing annotations...') + imputer = sklearn.impute.SimpleImputer(strategy='median') + imputed_X_ni = imputer.fit_transform(X_ni) + + # SimpleImputer will drop any features that are completely missing, resulting in different shapes for + # imputed_X_ni and X_ni and misalignment of features when training and scoring downstream if not checked. + # We externally check for and fail in the presence of any entirely missing features, but we do a redundant check here. + assert imputed_X_ni.shape == X_ni.shape, \ + f'Shape of imputed annotations differs from shape of raw annotations; at least one feature is completely missing ' \ + f'and hence dropped during imputation.' 
+ + print(f'Training IsolationForest with {imputed_X_ni.shape[0]} training sites x {imputed_X_ni.shape[1]} annotations...') + clf = sklearn.ensemble.IsolationForest(**hyperparameters_kwargs) + clf.fit(imputed_X_ni) + print('Training complete.') + + def score_samples(test_annotation_names_i, + test_X_ni): + assert np.array_equal(test_annotation_names_i, annotation_names_i), \ + f'Input annotation names ({test_annotation_names_i}) must be identical to those used to train the scorer ({annotation_names_i}).' + return clf.score_samples(imputer.transform(test_X_ni)) # TODO sklearn's implementation is single-threaded, but this could perhaps be parallelized + + scorer_lambda = lambda test_annotation_names_i, test_X_ni: score_samples(test_annotation_names_i, test_X_ni) + + print(f'Pickling scorer...') + output_scorer_pkl_file = f'{output_prefix}.scorer.pkl' + with open(output_scorer_pkl_file, 'wb') as f: + dill.dump(scorer_lambda, f) # the dill package can be used to pickle lambda functions + print(f'Scorer pickled to {output_scorer_pkl_file}.') + + +def score(annotations_file, + scorer_pkl_file, + output_scores_file): + annotation_names_i, X_ni = read_annotations(annotations_file) + + with open(scorer_pkl_file, 'rb') as f: + scorer_lambda = dill.load(f) + score_n = scorer_lambda(annotation_names_i, X_ni) + + with h5py.File(output_scores_file, 'w') as f: + scores_dset = f.create_dataset('data/scores', (len(score_n),), dtype='d') + scores_dset[:] = score_n + + +def main(): + parser = argparse.ArgumentParser() + + parser.add_argument('--annotations_file', + type=str, + required=True, + help='') + + parser.add_argument('--hyperparameters_json_file', + type=str, + required=False, + help='') + + parser.add_argument('--output_prefix', + type=str, + required=False, + help='') + + parser.add_argument('--scorer_pkl_file', + type=str, + required=False, + help='') + + parser.add_argument('--output_scores_file', + type=str, + required=False, + help='') + + args = parser.parse_args() + 
+ annotations_file = args.annotations_file + + # this script can handle both training and scoring; we check the passed arguments to determine which is appropriate + if args.hyperparameters_json_file is not None and args.output_prefix is not None and \ + args.scorer_pkl_file is None and args.output_scores_file is None: + hyperparameters_json_file = args.hyperparameters_json_file + output_prefix = args.output_prefix + train(annotations_file, + hyperparameters_json_file, + output_prefix) + elif args.hyperparameters_json_file is None and args.output_prefix is None and \ + args.scorer_pkl_file is not None and args.output_scores_file is not None: + scorer_pkl_file = args.scorer_pkl_file + output_scores_file = args.output_scores_file + score(annotations_file, + scorer_pkl_file, + output_scores_file) + else: + raise + + +if __name__ == '__main__': + main() diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ExtractVariantAnnotationsIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ExtractVariantAnnotationsIntegrationTest.java new file mode 100644 index 00000000000..dd3f1202b7f --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ExtractVariantAnnotationsIntegrationTest.java @@ -0,0 +1,253 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable; + +import com.google.common.collect.Lists; +import org.apache.commons.lang3.tuple.Pair; +import org.broadinstitute.hellbender.CommandLineProgramTest; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.testutils.ArgumentsBuilder; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.LabeledVariantAnnotationsData; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.VariantType; +import org.testng.Assert; +import 
org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.function.Function; +import java.util.function.Supplier; +import java.util.stream.Collectors; + +/** + * Note that the expected outputs for the exact-match tests below are used as inputs for + * {@link TrainVariantAnnotationsModelIntegrationTest}. Similarly, the expected outputs for + * {@link TrainVariantAnnotationsModelIntegrationTest} are used as inputs for {@link ScoreVariantAnnotationsIntegrationTest}. + * Thus, developers should keep the expected outputs for all of these integration tests in sync when updating any of them. + * This can easily be done by setting the UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS flags for all tools to be true and then running + * the tests in order. + */ +public final class ExtractVariantAnnotationsIntegrationTest extends CommandLineProgramTest { + + // If true, update the expected outputs in tests that assert an exact match vs. prior output, + // instead of actually running the tests. Can be used with "./gradlew test -Dtest.single=ExtractVariantAnnotationsIntegrationTest" + // to update all of the exact-match tests at once. After you do this, you should look at the + // diffs in the new expected outputs in git to confirm that they are consistent with expectations. + public static final boolean UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS = false; + + /** + * Make sure that someone didn't leave the UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS toggle turned on. 
+ */ + @Test + public void assertThatExpectedOutputUpdateToggleIsDisabled() { + Assert.assertFalse(UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS, "The toggle to update expected outputs should not be left enabled."); + } + + private static final List NON_ALLELE_SPECIFIC_ANNOTATIONS = Arrays.asList( + "DP", "FS", "MQ", "MQRankSum", "QD", "ReadPosRankSum", "SOR"); + + private static final List ALLELE_SPECIFIC_ANNOTATIONS = Arrays.asList( + "AS_FS", "AS_MQ", "AS_MQRankSum", "AS_QD", "AS_ReadPosRankSum", "AS_SOR"); + + private static final File PACKAGE_TEST_FILES_DIR = new File(largeFileTestDir, + "org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/"); + private static final File TEST_FILES_DIR = new File(largeFileTestDir, + "org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract"); + private static final File EXPECTED_TEST_FILES_DIR = new File(TEST_FILES_DIR, "expected"); + + // The input VCF should cover a genomic region given by the union of regions in the below training and calibration resources + // and should also contain a few multiallelics that overlap those resources. + private static final File INPUT_VCF = new File(PACKAGE_TEST_FILES_DIR, "input/small_callset_low_threshold.sites-only.chr1.1-10M.vcf"); + + // We use snippets of the Omni sites for SNP training (chr1:1-5000000) and calibration (chr1:5000000-10000000); we don't sweat the 1bp overlap. + private static final File SNP_TRAINING_VCF = new File(PACKAGE_TEST_FILES_DIR, "resources/1000G_omni2.5.hg38.chr1.1-5M.vcf.gz"); + private static final File SNP_CALIBRATION_VCF = new File(PACKAGE_TEST_FILES_DIR, "resources/1000G_omni2.5.hg38.chr1.5M-10M.vcf.gz"); + + // We use snippets of the Mills sites for indel training (chr1:1-5000000) and calibration (chr1:5000000-10000000); we don't sweat the 1bp overlap. 
+ private static final File INDEL_TRAINING_VCF = new File(PACKAGE_TEST_FILES_DIR, "resources/Mills_and_1000G_gold_standard.indels.hg38.chr1.1-5M.vcf.gz"); + private static final File INDEL_CALIBRATION_VCF = new File(PACKAGE_TEST_FILES_DIR, "resources/Mills_and_1000G_gold_standard.indels.hg38.chr1.5M-10M.vcf.gz"); + + private static final int MAXIMUM_NUMBER_OF_UNLABELED_VARIANTS = 100; + + // Supplier and functions for creating and adding various arguments to an ArgumentsBuilder. + private static final Supplier BASE_ARGUMENTS_BUILDER_SUPPLIER = () -> { + final ArgumentsBuilder argsBuilder = new ArgumentsBuilder(); + argsBuilder.addVCF(INPUT_VCF); + argsBuilder.addFlag(LabeledVariantAnnotationsWalker.DO_NOT_GZIP_VCF_OUTPUT_LONG_NAME); // we do not gzip VCF outputs so that we can use diff to compare to the expected result + argsBuilder.add(StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, false); + return argsBuilder; + }; + static final Function ADD_NON_ALLELE_SPECIFIC_ANNOTATIONS = argsBuilder -> { + NON_ALLELE_SPECIFIC_ANNOTATIONS.forEach(a -> argsBuilder.add(StandardArgumentDefinitions.ANNOTATION_LONG_NAME, a)); + return argsBuilder; + }; + static final Function ADD_ALLELE_SPECIFIC_ANNOTATIONS = argsBuilder -> { + argsBuilder.addFlag(LabeledVariantAnnotationsWalker.USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME); + ALLELE_SPECIFIC_ANNOTATIONS.forEach(a -> argsBuilder.add(StandardArgumentDefinitions.ANNOTATION_LONG_NAME, a)); + return argsBuilder; + }; + static final Function ADD_SNP_MODE_AND_RESOURCES = argsBuilder -> { + argsBuilder.add(LabeledVariantAnnotationsWalker.MODE_LONG_NAME, VariantType.SNP) + .add(StandardArgumentDefinitions.RESOURCE_LONG_NAME + String.format(":omni-training,%s=true", LabeledVariantAnnotationsData.TRAINING_LABEL), SNP_TRAINING_VCF) + .add(StandardArgumentDefinitions.RESOURCE_LONG_NAME + String.format(":omni-calibration,%s=true", LabeledVariantAnnotationsData.CALIBRATION_LABEL), SNP_CALIBRATION_VCF); + return argsBuilder; + }; + static 
final Function ADD_INDEL_MODE_AND_RESOURCES = argsBuilder -> { + argsBuilder.add(LabeledVariantAnnotationsWalker.MODE_LONG_NAME, VariantType.INDEL) + .add(StandardArgumentDefinitions.RESOURCE_LONG_NAME + String.format(":mills-training,%s=true", LabeledVariantAnnotationsData.TRAINING_LABEL), INDEL_TRAINING_VCF) + .add(StandardArgumentDefinitions.RESOURCE_LONG_NAME + String.format(":mills-calibration,%s=true", LabeledVariantAnnotationsData.CALIBRATION_LABEL), INDEL_CALIBRATION_VCF); + return argsBuilder; + }; + private static final Function ADD_MAXIMUM_NUMBER_OF_UNLABELED_VARIANTS = argsBuilder -> { + argsBuilder.add(ExtractVariantAnnotations.MAXIMUM_NUMBER_OF_UNLABELED_VARIANTS_LONG_NAME, MAXIMUM_NUMBER_OF_UNLABELED_VARIANTS); + return argsBuilder; + }; + + /** + * Exact-match tests for configurations given by the Cartesian product of the following options: + * 1) non-allele-specific ("nonAS") vs. allele-specific ("AS") + * 2) SNP-only ("snp") vs. INDEL-only ("indel") vs. SNP+INDEL ("snpIndel") + * 3) positive ("pos") vs. 
positive-unlabeled ("posUn") + */ + @DataProvider(name = "dataValidInputs") + public Object[][] dataValidInputs() { + final List>>> testConfigurations = Lists.cartesianProduct( + Collections.singletonList( + Pair.of("extract", Function.identity())), + Arrays.asList( + Pair.of("nonAS", ADD_NON_ALLELE_SPECIFIC_ANNOTATIONS), + Pair.of("AS", ADD_ALLELE_SPECIFIC_ANNOTATIONS)), + Arrays.asList( + Pair.of("snp", ADD_SNP_MODE_AND_RESOURCES), + Pair.of("indel", ADD_INDEL_MODE_AND_RESOURCES), + Pair.of("snpIndel", ADD_SNP_MODE_AND_RESOURCES.andThen(ADD_INDEL_MODE_AND_RESOURCES))), + Arrays.asList( + Pair.of("pos", Function.identity()), + Pair.of("posUn", ADD_MAXIMUM_NUMBER_OF_UNLABELED_VARIANTS))); + + return testConfigurations.stream() + .map(tagAndAddFunctionPairs -> new Object[]{ + tagAndAddFunctionPairs.stream().map(Pair::getLeft).collect(Collectors.joining(".")), // e.g., "extract.nonAS.snp.pos" + tagAndAddFunctionPairs.stream().map(Pair::getRight) // creates the corresponding ArgumentsBuilder + .reduce(Function.identity(), Function::andThen) // by stringing together functions that add the + .apply(BASE_ARGUMENTS_BUILDER_SUPPLIER.get())}) // appropriate arguments + .toArray(Object[][]::new); + } + + /** + * Checks expected outputs given a tag (e.g., "extract.nonAS.snp.pos") and arguments corresponding to the + * Cartesian products generated in {@link #dataValidInputs}. + * + * We perform exact-match tests of any annotation HDF5 files produced using h5diff, which is insensitive to timestamps within the file. + * We also perform exact-match tests of VCF files using diff. VCF indices may not be diff equivalent, so + * we just check for their existence. + */ + @Test(dataProvider = "dataValidInputs", groups = {"python"}) // python environment is required to use h5diff for exact-match comparisons + public void testValidInputs(final String tag, + final ArgumentsBuilder argsBuilder) { + final File outputDir = UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS ? 
EXPECTED_TEST_FILES_DIR : createTempDir("extract"); + final String outputPrefix = String.format("%s/%s", outputDir, tag); + argsBuilder.addOutput(outputPrefix); + runCommandLine(argsBuilder); + + if (!UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS) { + assertOutputs(tag, outputPrefix); + } + } + + private static void assertOutputs(final String tag, + final String outputPrefix) { + // vcf.idx files are not reproducible + SystemCommandUtilsTest.runSystemCommand(String.format("h5diff %s/%s %s", + EXPECTED_TEST_FILES_DIR, + tag + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX, + outputPrefix + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX)); + SystemCommandUtilsTest.runSystemCommand(String.format("diff %s/%s.vcf %s.vcf", + EXPECTED_TEST_FILES_DIR, tag, outputPrefix)); + if (tag.contains("posUn")) { + SystemCommandUtilsTest.runSystemCommand(String.format("h5diff %s/%s %s", + EXPECTED_TEST_FILES_DIR, + tag + ExtractVariantAnnotations.UNLABELED_TAG + ExtractVariantAnnotations.ANNOTATIONS_HDF5_SUFFIX, + outputPrefix + ExtractVariantAnnotations.UNLABELED_TAG + ExtractVariantAnnotations.ANNOTATIONS_HDF5_SUFFIX)); + } else { + Assert.assertFalse(new File(outputPrefix + ExtractVariantAnnotations.UNLABELED_TAG + ExtractVariantAnnotations.ANNOTATIONS_HDF5_SUFFIX).exists()); + } + } + + /** + * If no resources are provided and we do not extract unlabeled sites, then only a zero-record VCF and the corresponding index are created. + * This is because we cannot create HDF5 files with empty arrays/matrices. 
+ */ + @Test + public void testNoResources() { + final File outputDir = createTempDir("extract"); + final String outputPrefix = String.format("%s/test", outputDir); + final ArgumentsBuilder argsBuilder = ADD_ALLELE_SPECIFIC_ANNOTATIONS.apply(BASE_ARGUMENTS_BUILDER_SUPPLIER.get()); + argsBuilder.add(LabeledVariantAnnotationsWalker.MODE_LONG_NAME, VariantType.SNP) + .addOutput(outputPrefix); + runCommandLine(argsBuilder); + Assert.assertFalse(new File(outputPrefix + ExtractVariantAnnotations.ANNOTATIONS_HDF5_SUFFIX).exists()); + Assert.assertFalse(new File(outputPrefix + ExtractVariantAnnotations.UNLABELED_TAG + ExtractVariantAnnotations.ANNOTATIONS_HDF5_SUFFIX).exists()); + Assert.assertTrue(new File(outputPrefix + ".vcf").exists()); + Assert.assertTrue(new File(outputPrefix + ".vcf.idx").exists()); + } + + /** + * If no resources are provided but we do extract unlabeled sites, then all output files except the labeled-annotations HDF5 file are created. + * This is because we cannot create HDF5 files with empty arrays/matrices. 
+ */ + @Test + public void testNoResourcesAndExtractUnlabeled() { + final File outputDir = createTempDir("extract"); + final String outputPrefix = String.format("%s/test", outputDir); + final ArgumentsBuilder argsBuilder = ADD_ALLELE_SPECIFIC_ANNOTATIONS.apply(BASE_ARGUMENTS_BUILDER_SUPPLIER.get()); + argsBuilder.add(LabeledVariantAnnotationsWalker.MODE_LONG_NAME, VariantType.SNP) + .add(ExtractVariantAnnotations.MAXIMUM_NUMBER_OF_UNLABELED_VARIANTS_LONG_NAME, 1) + .addOutput(outputPrefix); + runCommandLine(argsBuilder); + Assert.assertFalse(new File(outputPrefix + ExtractVariantAnnotations.ANNOTATIONS_HDF5_SUFFIX).exists()); + Assert.assertTrue(new File(outputPrefix + ExtractVariantAnnotations.UNLABELED_TAG + ExtractVariantAnnotations.ANNOTATIONS_HDF5_SUFFIX).exists()); + Assert.assertTrue(new File(outputPrefix + ".vcf").exists()); + Assert.assertTrue(new File(outputPrefix + ".vcf.idx").exists()); + } + + /** + * If no variants are present in the input in the specified region, then only a zero-record VCF and the corresponding index are created. + * This is because we cannot create HDF5 files with empty arrays/matrices. 
+ */ + @Test + public void testNoVariantsInInput() { + final File outputDir = createTempDir("extract"); + final String outputPrefix = String.format("%s/test", outputDir); + final ArgumentsBuilder argsBuilder = ADD_ALLELE_SPECIFIC_ANNOTATIONS + .andThen(ADD_SNP_MODE_AND_RESOURCES) // resources are provided so that this test is distinct from testNoResources + .apply(BASE_ARGUMENTS_BUILDER_SUPPLIER.get()); + argsBuilder.add(StandardArgumentDefinitions.INTERVALS_LONG_NAME, "chr2") // the test input VCF does not have variants here + .addOutput(outputPrefix); + runCommandLine(argsBuilder); + // neither the labeled-annotations nor the unlabeled-annotations HDF5 file should be created, only the zero-record VCF and its index + Assert.assertFalse(new File(outputPrefix + ExtractVariantAnnotations.ANNOTATIONS_HDF5_SUFFIX).exists()); + Assert.assertFalse(new File(outputPrefix + ExtractVariantAnnotations.UNLABELED_TAG + ExtractVariantAnnotations.ANNOTATIONS_HDF5_SUFFIX).exists()); + Assert.assertTrue(new File(outputPrefix + ".vcf").exists()); + Assert.assertTrue(new File(outputPrefix + ".vcf.idx").exists()); + } + + @Test(expectedExceptions = UserException.class) + public void testForgotToSpecifyUseAlleleSpecificAnnotationsFlag() { + final File outputDir = createTempDir("extract"); + final String outputPrefix = String.format("%s/test", outputDir); + final ArgumentsBuilder argsBuilder = ADD_SNP_MODE_AND_RESOURCES.apply(BASE_ARGUMENTS_BUILDER_SUPPLIER.get()); + ALLELE_SPECIFIC_ANNOTATIONS.forEach(a -> argsBuilder.add(StandardArgumentDefinitions.ANNOTATION_LONG_NAME, a)); + argsBuilder.addOutput(outputPrefix); + runCommandLine(argsBuilder); + } + + @Test(expectedExceptions = UserException.class) + public void testReservedSNPResourceLabel() { + final File outputDir = createTempDir("extract"); + final String outputPrefix = String.format("%s/test", outputDir); + final ArgumentsBuilder argsBuilder = ADD_NON_ALLELE_SPECIFIC_ANNOTATIONS.apply(BASE_ARGUMENTS_BUILDER_SUPPLIER.get()); + argsBuilder.add(LabeledVariantAnnotationsWalker.MODE_LONG_NAME, VariantType.SNP) + .add(StandardArgumentDefinitions.RESOURCE_LONG_NAME + String.format(":snp,%s=true", LabeledVariantAnnotationsData.SNP_LABEL), SNP_TRAINING_VCF) + .addOutput(outputPrefix); + runCommandLine(argsBuilder); + } +} \ No newline at end of file diff --git 
a/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ScoreVariantAnnotationsIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ScoreVariantAnnotationsIntegrationTest.java new file mode 100644 index 00000000000..289821d0e54 --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/ScoreVariantAnnotationsIntegrationTest.java @@ -0,0 +1,260 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable; + +import com.google.common.collect.Lists; +import org.apache.commons.lang3.tuple.Pair; +import org.broadinstitute.hellbender.CommandLineProgramTest; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.testutils.ArgumentsBuilder; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.LabeledVariantAnnotationsData; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.VariantType; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.VariantAnnotationsModelBackend; +import org.broadinstitute.hellbender.utils.io.IOUtils; +import org.broadinstitute.hellbender.utils.io.Resource; +import org.broadinstitute.hellbender.utils.python.PythonScriptExecutorException; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.Arrays; +import java.util.List; +import java.util.function.BiFunction; +import java.util.function.Function; +import java.util.function.Supplier; +import java.util.stream.Collectors; + +/** + * See documentation for {@link ExtractVariantAnnotationsIntegrationTest} for information about how inputs and + * expected outputs used there are related to those used here and in {@link TrainVariantAnnotationsModelIntegrationTest}. 
+ */ +public final class ScoreVariantAnnotationsIntegrationTest extends CommandLineProgramTest { + + // If true, update the expected outputs in tests that assert an exact match vs. prior output, + // instead of actually running the tests. Can be used with "./gradlew test -Dtest.single=ScoreVariantAnnotationsIntegrationTest" + // to update all of the exact-match tests at once. After you do this, you should look at the + // diffs in the new expected outputs in git to confirm that they are consistent with expectations. + public static final boolean UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS = false; + + /** + * Make sure that someone didn't leave the UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS toggle turned on. + */ + @Test + public void assertThatExpectedOutputUpdateToggleIsDisabled() { + Assert.assertFalse(UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS, "The toggle to update expected outputs should not be left enabled."); + } + + private static final double CALIBRATION_SENSITIVITY_THRESHOLD = 0.9; + + private static final File PACKAGE_TEST_FILES_DIR = new File(largeFileTestDir, + "org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/"); + private static final File TEST_FILES_DIR = new File(largeFileTestDir, + "org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score"); + private static final File INPUT_FROM_TRAIN_EXPECTED_TEST_FILES_DIR = new File(largeFileTestDir, + "org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected"); + private static final File EXPECTED_TEST_FILES_DIR = new File(TEST_FILES_DIR, "expected"); + + private static final File ISOLATION_FOREST_PYTHON_SCRIPT = IOUtils.writeTempResource( + new Resource("isolation-forest.py", TrainVariantAnnotationsModel.class)); + + private static final File INPUT_VCF = new File(PACKAGE_TEST_FILES_DIR, "input/small_callset_low_threshold.sites-only.chr1.1-10M.vcf"); + + // Supplier and functions for creating and adding various arguments to an ArgumentsBuilder. 
+ private static final Supplier BASE_ARGUMENTS_BUILDER_SUPPLIER = () -> { + final ArgumentsBuilder argsBuilder = new ArgumentsBuilder(); + argsBuilder.addVCF(INPUT_VCF); + argsBuilder.addFlag(LabeledVariantAnnotationsWalker.DO_NOT_GZIP_VCF_OUTPUT_LONG_NAME); + argsBuilder.add(StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, false); + return argsBuilder; + }; + private static final BiFunction ADD_MODEL_PREFIX = (argsBuilder, modelPrefix) -> { + argsBuilder.add(ScoreVariantAnnotations.MODEL_PREFIX_LONG_NAME, modelPrefix); + return argsBuilder; + }; + private static final BiFunction ADD_CALIBRATION_SENSITIVITY_THRESHOLD = (argsBuilder, calibrationSensitivityThreshold) -> { + argsBuilder.add(ScoreVariantAnnotations.SNP_CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME, calibrationSensitivityThreshold); + argsBuilder.add(ScoreVariantAnnotations.INDEL_CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME, calibrationSensitivityThreshold); + return argsBuilder; + }; + private static final BiFunction ADD_MODEL_BACKEND = (argsBuilder, modelBackendMode) -> { + argsBuilder.add(TrainVariantAnnotationsModel.MODEL_BACKEND_LONG_NAME, modelBackendMode); + return argsBuilder; + }; + private static final Function ADD_ISOLATION_FOREST_PYTHON_SCRIPT = argsBuilder -> { + argsBuilder.add(ScoreVariantAnnotations.PYTHON_SCRIPT_LONG_NAME, ISOLATION_FOREST_PYTHON_SCRIPT); + return argsBuilder; + }; + + /** + * Exact-match tests for (non-exhaustive) configurations given by the Cartesian product of the following options: + * 1) non-allele-specific ("nonAS") vs. 
allele-specific ("AS") + * 2) model backend + * 2a) Java Bayesian Gaussian Mixture Model (BGMM) backend TODO the BGMM has been reduced to a stub for this initial PR; subsequent PRs will cover the backend code and reconnect the stub + * 2b) default PYTHON_IFOREST ("IF.score") + * 2c) specified PYTHON_SCRIPT ("IF.score"); we will simply use the same script as the default PYTHON_IFOREST backend, so this is just a test of the command-line interface + * We should expect 2b-c to give functionally identical results. + * 3) SNP-only ("snp") vs. SNP+INDEL ("snpIndel") (for both of these options, we use trained models that contain both SNP and INDEL scorers as input) + * TODO the BGMM has been reduced to a stub for this initial PR; subsequent PRs will cover the backend code and reconnect the stub + */ + @DataProvider(name = "dataValidInputs") + public Object[][] dataValidInputs() { + final List>>> testConfigurations = Lists.cartesianProduct( + Arrays.asList( + Pair.of("extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg", Function.identity()), + Pair.of("extract.AS.snpIndel.posUn.train.snpIndel.posNeg", Function.identity())), + Arrays.asList( + Pair.of("IF.score", ab -> ADD_MODEL_BACKEND.apply(ab, VariantAnnotationsModelBackend.PYTHON_IFOREST)), // this and the following case give the same results, so they are given the same IF.score tag + Pair.of("IF.score", ADD_ISOLATION_FOREST_PYTHON_SCRIPT + .andThen(ab -> ADD_MODEL_BACKEND.apply(ab, VariantAnnotationsModelBackend.PYTHON_SCRIPT)))), + Arrays.asList( + Pair.of("snp", ExtractVariantAnnotationsIntegrationTest.ADD_SNP_MODE_AND_RESOURCES), + Pair.of("snpIndel", ExtractVariantAnnotationsIntegrationTest.ADD_SNP_MODE_AND_RESOURCES + .andThen(ExtractVariantAnnotationsIntegrationTest.ADD_INDEL_MODE_AND_RESOURCES)))); + + return testConfigurations.stream() + .map(tagAndAddFunctionPairs -> new Object[]{ + tagAndAddFunctionPairs.stream().map(Pair::getLeft).collect(Collectors.joining(".")), // e.g., 
extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp + tagAndAddFunctionPairs.stream().map(Pair::getRight) // creates the corresponding ArgumentsBuilder + .reduce(Function.identity(), Function::andThen) // by stringing together functions that add the + .apply(BASE_ARGUMENTS_BUILDER_SUPPLIER.get())}) // appropriate arguments + .toArray(Object[][]::new); + } + + /** + * Checks expected outputs given a tag (e.g., "extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp") and arguments corresponding to the + * Cartesian products generated in {@link #dataValidInputs}. + * + * We perform exact-match tests of any HDF5 files produced using h5diff, which is insensitive to timestamps within the file. + * We also perform exact-match tests of VCF files using diff. VCF indices may not be diff equivalent, so + * we just check for their existence. + */ + @Test(dataProvider = "dataValidInputs", groups = {"python"}) // python environment is required to run tool and to use h5diff for exact-match comparisons + public void testValidInputs(final String tag, + final ArgumentsBuilder argsBuilder) { + final File outputDir = UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS ? 
EXPECTED_TEST_FILES_DIR : createTempDir("score"); + final String outputPrefix = String.format("%s/%s", outputDir, tag); + argsBuilder.addOutput(outputPrefix); + + // add arguments for model prefix based on the + // train tag (the portion of the tag preceding ".score", e.g., extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF), + // which gives the basename for the model files + final String trainTag = tag.split(".score")[0]; + if (tag.contains("nonAS")) { + ExtractVariantAnnotationsIntegrationTest.ADD_NON_ALLELE_SPECIFIC_ANNOTATIONS.apply(argsBuilder); + } else { + ExtractVariantAnnotationsIntegrationTest.ADD_ALLELE_SPECIFIC_ANNOTATIONS.apply(argsBuilder); + } + final String modelPrefix = new File(INPUT_FROM_TRAIN_EXPECTED_TEST_FILES_DIR, trainTag).toString(); + final Function addModelPrefix = ab -> + ADD_MODEL_PREFIX.apply(ab, modelPrefix); + final Function addCalibrationSensitivityThreshold = ab -> + ADD_CALIBRATION_SENSITIVITY_THRESHOLD.apply(ab, CALIBRATION_SENSITIVITY_THRESHOLD); + addModelPrefix.andThen(addCalibrationSensitivityThreshold).apply(argsBuilder); + + // TODO test use of sites-only VCF (output by extract tool) to label extracted sites + + runCommandLine(argsBuilder); + + if (!UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS) { + assertExpectedOutputs(tag, outputPrefix); + } + } + + private static void assertExpectedOutputs(final String tag, + final String outputPrefix) { + // vcf.idx files are not reproducible + SystemCommandUtilsTest.runSystemCommand(String.format("diff %s/%s.vcf %s.vcf", + EXPECTED_TEST_FILES_DIR, tag, outputPrefix)); + SystemCommandUtilsTest.runSystemCommand(String.format("h5diff %s/%s.annot.hdf5 %s.annot.hdf5", + EXPECTED_TEST_FILES_DIR, tag, outputPrefix)); + SystemCommandUtilsTest.runSystemCommand(String.format("h5diff %s/%s.scores.hdf5 %s.scores.hdf5", + EXPECTED_TEST_FILES_DIR, tag, outputPrefix)); + } + + /** + * In contrast to {@link ExtractVariantAnnotationsIntegrationTest#testNoResources}, the non-presence of + * resources here does 
not really affect the output. + */ + @Test(groups = {"python"}) // python environment is required to run tool + public void testNoResources() { + final File outputDir = createTempDir("score"); + final String outputPrefix = String.format("%s/test", outputDir); + final ArgumentsBuilder argsBuilder = BASE_ARGUMENTS_BUILDER_SUPPLIER.get(); + argsBuilder.add(LabeledVariantAnnotationsWalker.MODE_LONG_NAME, VariantType.SNP) + .addOutput(outputPrefix); + final String modelPrefix = new File(INPUT_FROM_TRAIN_EXPECTED_TEST_FILES_DIR, + "extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF").toString(); + final Function addModelPrefix = ab -> + ADD_MODEL_PREFIX.apply(ab, modelPrefix); + addModelPrefix + .andThen(ExtractVariantAnnotationsIntegrationTest.ADD_NON_ALLELE_SPECIFIC_ANNOTATIONS) + .apply(argsBuilder); + runCommandLine(argsBuilder); + Assert.assertTrue(new File(outputPrefix + ScoreVariantAnnotations.ANNOTATIONS_HDF5_SUFFIX).exists()); + Assert.assertTrue(new File(outputPrefix + ScoreVariantAnnotations.SCORES_HDF5_SUFFIX).exists()); + Assert.assertTrue(new File(outputPrefix + ".vcf").exists()); + Assert.assertTrue(new File(outputPrefix + ".vcf.idx").exists()); + } + + /** + * If no variants are present in the input in the specified region, we do not create the scores or annotations HDF5 files. + * This is because we cannot create HDF5 files with empty arrays/matrices. 
+ */ + @Test(groups = {"python"}) // python environment is required to run tool + public void testNoVariantsInInput() { + final File outputDir = createTempDir("score"); + final String outputPrefix = String.format("%s/test", outputDir); + final ArgumentsBuilder argsBuilder = BASE_ARGUMENTS_BUILDER_SUPPLIER.get(); + argsBuilder.add(LabeledVariantAnnotationsWalker.MODE_LONG_NAME, VariantType.SNP) + .add(StandardArgumentDefinitions.INTERVALS_LONG_NAME, "chr2") // the test input VCF does not have variants here + .addOutput(outputPrefix); + final String modelPrefix = new File(INPUT_FROM_TRAIN_EXPECTED_TEST_FILES_DIR, + "extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF").toString(); + final Function addModelPrefix = ab -> + ADD_MODEL_PREFIX.apply(ab, modelPrefix); + addModelPrefix + .andThen(ExtractVariantAnnotationsIntegrationTest.ADD_NON_ALLELE_SPECIFIC_ANNOTATIONS) + .andThen(ExtractVariantAnnotationsIntegrationTest.ADD_SNP_MODE_AND_RESOURCES) + .apply(argsBuilder); + runCommandLine(argsBuilder); + Assert.assertFalse(new File(outputPrefix + ScoreVariantAnnotations.ANNOTATIONS_HDF5_SUFFIX).exists()); + Assert.assertFalse(new File(outputPrefix + ScoreVariantAnnotations.SCORES_HDF5_SUFFIX).exists()); + Assert.assertTrue(new File(outputPrefix + ".vcf").exists()); + Assert.assertTrue(new File(outputPrefix + ".vcf.idx").exists()); + } + + @Test(expectedExceptions = PythonScriptExecutorException.class, groups = {"python"}) // python environment is required to run tool + public void testAnnotationsDoNotMatchThoseUsedToTrainModel() { + final File outputDir = createTempDir("score"); + final String outputPrefix = String.format("%s/test", outputDir); + final ArgumentsBuilder argsBuilder = BASE_ARGUMENTS_BUILDER_SUPPLIER.get(); + argsBuilder.add(LabeledVariantAnnotationsWalker.MODE_LONG_NAME, VariantType.SNP) + .addOutput(outputPrefix); + final String modelPrefix = new File(INPUT_FROM_TRAIN_EXPECTED_TEST_FILES_DIR, + "extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF").toString(); 
+ final Function addModelPrefix = ab -> + ADD_MODEL_PREFIX.apply(ab, modelPrefix); + addModelPrefix + .andThen(ExtractVariantAnnotationsIntegrationTest.ADD_NON_ALLELE_SPECIFIC_ANNOTATIONS) // model was trained with non-AS annotations + .andThen(ExtractVariantAnnotationsIntegrationTest.ADD_ALLELE_SPECIFIC_ANNOTATIONS) // but we additionally specify AS annotations + .andThen(ExtractVariantAnnotationsIntegrationTest.ADD_SNP_MODE_AND_RESOURCES) + .apply(argsBuilder); + runCommandLine(argsBuilder); + } + + @Test(expectedExceptions = UserException.class) + public void testReservedSNPResourceLabel() { + final File outputDir = createTempDir("extract"); + final String outputPrefix = String.format("%s/test", outputDir); + final ArgumentsBuilder argsBuilder = ExtractVariantAnnotationsIntegrationTest.ADD_NON_ALLELE_SPECIFIC_ANNOTATIONS.apply(BASE_ARGUMENTS_BUILDER_SUPPLIER.get()); + argsBuilder.add(LabeledVariantAnnotationsWalker.MODE_LONG_NAME, VariantType.SNP) + .add(StandardArgumentDefinitions.RESOURCE_LONG_NAME + String.format(":snp,%s=true", LabeledVariantAnnotationsData.SNP_LABEL), INPUT_VCF) // we just use the input VCF as a dummy resource + .addOutput(outputPrefix); + final String modelPrefix = new File(INPUT_FROM_TRAIN_EXPECTED_TEST_FILES_DIR, + "extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF").toString(); + final Function addModelPrefix = ab -> + ADD_MODEL_PREFIX.apply(ab, modelPrefix); + addModelPrefix.apply(argsBuilder); + runCommandLine(argsBuilder); + } +} \ No newline at end of file diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/SystemCommandUtilsTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/SystemCommandUtilsTest.java new file mode 100644 index 00000000000..705f292116a --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/SystemCommandUtilsTest.java @@ -0,0 +1,62 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable; + +import 
org.broadinstitute.hellbender.GATKBaseTest; +import org.broadinstitute.hellbender.exceptions.GATKException; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.io.BufferedReader; +import java.io.File; +import java.io.IOException; +import java.io.InputStreamReader; + +public final class SystemCommandUtilsTest extends GATKBaseTest { + + private static final File TEST_FILES_DIR = new File(largeFileTestDir, + "org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract"); + private static final File EXPECTED_TEST_FILES_DIR = new File(TEST_FILES_DIR, "expected"); + + static void runSystemCommand(final String command) { + logger.debug(String.format("Testing command: %s", command)); + try { + final ProcessBuilder processBuilder = new ProcessBuilder("sh", "-c", command).redirectErrorStream(true); + final Process process = processBuilder.start(); + + final BufferedReader stdInReader = new BufferedReader(new InputStreamReader(process.getInputStream())); + String stdInLine; + while ((stdInLine = stdInReader.readLine()) != null) { + Assert.fail(String.format("The command \"%s\" resulted in: %s", command, stdInLine)); + } + stdInReader.close(); + + } catch (final IOException e) { + throw new GATKException.ShouldNeverReachHereException(e.getMessage()); + } + } + + @Test(groups = {"python"}) // python environment is required to use h5diff + public void testRunSystemCommand() { + runSystemCommand(String.format("h5diff %s/extract.AS.indel.pos.annot.hdf5 %s/extract.AS.indel.pos.annot.hdf5", + EXPECTED_TEST_FILES_DIR, EXPECTED_TEST_FILES_DIR)); + runSystemCommand(String.format("diff %s/extract.AS.indel.pos.vcf %s/extract.AS.indel.pos.vcf", + EXPECTED_TEST_FILES_DIR, EXPECTED_TEST_FILES_DIR)); + } + + @Test(expectedExceptions = AssertionError.class, groups = {"python"}) // python environment is required to use h5diff + public void testRunSystemCommandH5diffException() { + runSystemCommand(String.format("h5diff %s/extract.AS.indel.pos.annot.hdf5 
%s/extract.AS.snp.pos.annot.hdf5", + EXPECTED_TEST_FILES_DIR, EXPECTED_TEST_FILES_DIR)); + } + + @Test(expectedExceptions = AssertionError.class) + public void testRunSystemCommandDiffException() { + runSystemCommand(String.format("diff %s/extract.AS.indel.pos.vcf %s/extract.AS.snp.pos.vcf", + EXPECTED_TEST_FILES_DIR, EXPECTED_TEST_FILES_DIR)); + } + + @Test(expectedExceptions = AssertionError.class) + public void testRunSystemCommandDiffNoSuchFileException() { + runSystemCommand(String.format("diff %s/blahblah %s/extract.AS.snp.pos.vcf", + EXPECTED_TEST_FILES_DIR, EXPECTED_TEST_FILES_DIR)); + } +} diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/TrainVariantAnnotationsModelIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/TrainVariantAnnotationsModelIntegrationTest.java new file mode 100644 index 00000000000..9082fe7a0ad --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/TrainVariantAnnotationsModelIntegrationTest.java @@ -0,0 +1,428 @@ +package org.broadinstitute.hellbender.tools.walkers.vqsr.scalable; + +import com.google.common.collect.Lists; +import org.apache.commons.lang3.tuple.Pair; +import org.broadinstitute.hdf5.HDF5File; +import org.broadinstitute.hellbender.CommandLineProgramTest; +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.testutils.ArgumentsBuilder; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.LabeledVariantAnnotationsData; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.data.VariantType; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.BGMMVariantAnnotationsScorer; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.PythonSklearnVariantAnnotationsScorer; +import org.broadinstitute.hellbender.tools.walkers.vqsr.scalable.modeling.VariantAnnotationsModelBackend; +import 
org.broadinstitute.hellbender.utils.io.IOUtils; +import org.broadinstitute.hellbender.utils.io.Resource; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.function.BiFunction; +import java.util.function.Function; +import java.util.function.Supplier; +import java.util.stream.Collectors; + +/** + * See documentation for {@link ExtractVariantAnnotationsIntegrationTest} for information about how inputs and + * expected outputs used there are related to those used here and in {@link ScoreVariantAnnotationsIntegrationTest}. + */ +public final class TrainVariantAnnotationsModelIntegrationTest extends CommandLineProgramTest { + + // If true, update the expected outputs in tests that assert an exact match vs. prior output, + // instead of actually running the tests. Can be used with "./gradlew test -Dtest.single=TrainVariantAnnotationsModelIntegrationTest" + // to update all of the exact-match tests at once. After you do this, you should look at the + // diffs in the new expected outputs in git to confirm that they are consistent with expectations. + public static final boolean UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS = false; + + /** + * Make sure that someone didn't leave the UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS toggle turned on.
+ */ + @Test + public void assertThatExpectedOutputUpdateToggleIsDisabled() { + Assert.assertFalse(UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS, "The toggle to update expected outputs should not be left enabled."); + } + + private static final double CALIBRATION_SENSITIVITY_THRESHOLD = 0.9; + + private static final File TEST_FILES_DIR = new File(largeFileTestDir, + "org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train"); + private static final File INPUT_FROM_EXTRACT_EXPECTED_TEST_FILES_DIR = new File(largeFileTestDir, + "org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected"); + private static final File EXPECTED_TEST_FILES_DIR = new File(TEST_FILES_DIR, "expected"); + + private static final File ISOLATION_FOREST_PYTHON_SCRIPT = IOUtils.writeTempResource( + new Resource("isolation-forest.py", TrainVariantAnnotationsModel.class)); + private static final File ISOLATION_FOREST_HYPERPARAMETERS_JSON = new File(TEST_FILES_DIR, + "isolation-forest-hyperparameters-different-seed.json"); + + // Supplier and functions for creating and adding various arguments to an ArgumentsBuilder. 
+ private static final Supplier BASE_ARGUMENTS_BUILDER_SUPPLIER = ArgumentsBuilder::new; + private static final BiFunction ADD_ANNOTATIONS_HDF5 = (argsBuilder, annotationsHDF5) -> { + argsBuilder.add(TrainVariantAnnotationsModel.ANNOTATIONS_HDF5_LONG_NAME, annotationsHDF5); + return argsBuilder; + }; + private static final BiFunction ADD_UNLABELED_ANNOTATIONS_HDF5 = (argsBuilder, unlabeledAnnotationsHDF5) -> { + argsBuilder.add(TrainVariantAnnotationsModel.UNLABELED_ANNOTATIONS_HDF5_LONG_NAME, unlabeledAnnotationsHDF5); + return argsBuilder; + }; + private static final BiFunction ADD_CALIBRATION_SENSITIVITY_THRESHOLD = (argsBuilder, calibrationSensitivityThreshold) -> { + argsBuilder.add(TrainVariantAnnotationsModel.CALIBRATION_SENSITIVITY_THRESHOLD_LONG_NAME, calibrationSensitivityThreshold); + return argsBuilder; + }; + private static final Function ADD_SNP_MODE = argsBuilder -> { + argsBuilder.add(LabeledVariantAnnotationsWalker.MODE_LONG_NAME, VariantType.SNP); + return argsBuilder; + }; + private static final Function ADD_INDEL_MODE = argsBuilder -> { + argsBuilder.add(LabeledVariantAnnotationsWalker.MODE_LONG_NAME, VariantType.INDEL); + return argsBuilder; + }; + private static final BiFunction ADD_MODEL_BACKEND = (argsBuilder, modelBackendMode) -> { + argsBuilder.add(TrainVariantAnnotationsModel.MODEL_BACKEND_LONG_NAME, modelBackendMode); + return argsBuilder; + }; + private static final Function ADD_ISOLATION_FOREST_PYTHON_SCRIPT = argsBuilder -> { + argsBuilder.add(TrainVariantAnnotationsModel.PYTHON_SCRIPT_LONG_NAME, ISOLATION_FOREST_PYTHON_SCRIPT); + return argsBuilder; + }; + private static final Function ADD_ISOLATION_FOREST_HYPERPARAMETERS_JSON = argsBuilder -> { + argsBuilder.add(TrainVariantAnnotationsModel.HYPERPARAMETERS_JSON_LONG_NAME, ISOLATION_FOREST_HYPERPARAMETERS_JSON); + return argsBuilder; + }; + + /** + * Exact-match tests for (non-exhaustive) configurations given by the Cartesian product of the following options: + * 1) 
non-allele-specific ("nonAS") vs. allele-specific ("AS") + * 2) SNP-only ("snp") vs. SNP+INDEL ("snpIndel") (for both of these options, we use extracted annotations that contain both SNP and INDEL variants as input) + * 3) positive training with {extract-tag}.annot.hdf5 ("posOnly") vs. positive-negative training with {extract-tag}.annot.hdf5 and {extract-tag}.unlabeled.annot.hdf5 ("posNeg") + * 4) model backend + * 4a) Java Bayesian Gaussian Mixture Model (BGMM) backend TODO the BGMM has been reduced to a stub for this initial PR; subsequent PRs will cover the backend code and reconnect the stub + * 4b) default PYTHON_IFOREST with default hyperparameters ("IF") + * 4c) default PYTHON_IFOREST with non-default seed hyperparameter ("IFDifferentSeed") + * 4d) specified PYTHON_SCRIPT with non-default seed hyperparameter ("IFDifferentSeed"); we will simply use the same script as the default PYTHON_IFOREST backend, so this is just a test of the command-line interface + * We should expect 4c-d to give functionally identical results. 
+ */ + @DataProvider(name = "dataValidInputs") + public Object[][] dataValidInputs() { + final List>>> testConfigurations = Lists.cartesianProduct( + Arrays.asList( + Pair.of("extract.nonAS.snpIndel.posUn.train", Function.identity()), + Pair.of("extract.AS.snpIndel.posUn.train", Function.identity())), + Arrays.asList( + Pair.of("snp", ADD_SNP_MODE), + Pair.of("snpIndel", ADD_SNP_MODE.andThen(ADD_INDEL_MODE))), + Arrays.asList( // we will consume the tag and add appropriate arguments for positive and positive-negative training below + Pair.of("posOnly", Function.identity()), + Pair.of("posNeg", Function.identity())), + Arrays.asList( + Pair.of("IF", ab -> ADD_MODEL_BACKEND.apply(ab, VariantAnnotationsModelBackend.PYTHON_IFOREST)), + Pair.of("IFDifferentSeed", ADD_ISOLATION_FOREST_HYPERPARAMETERS_JSON + .andThen(ab -> ADD_MODEL_BACKEND.apply(ab, VariantAnnotationsModelBackend.PYTHON_IFOREST))), // this and the following case give the same results, so they are given the same IFDifferentSeed tag + Pair.of("IFDifferentSeed", ADD_ISOLATION_FOREST_PYTHON_SCRIPT + .andThen(ADD_ISOLATION_FOREST_HYPERPARAMETERS_JSON) + .andThen(ab -> ADD_MODEL_BACKEND.apply(ab, VariantAnnotationsModelBackend.PYTHON_SCRIPT))))); + + return testConfigurations.stream() + .map(tagAndAddFunctionPairs -> new Object[]{ + tagAndAddFunctionPairs.stream().map(Pair::getLeft).collect(Collectors.joining(".")), // e.g., extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF + tagAndAddFunctionPairs.stream().map(Pair::getRight) // creates the corresponding ArgumentsBuilder + .reduce(Function.identity(), Function::andThen) // by stringing together functions that add the + .apply(BASE_ARGUMENTS_BUILDER_SUPPLIER.get())}) // appropriate arguments + .toArray(Object[][]::new); + } + + /** + * Checks expected outputs given a tag (e.g., "extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF") and arguments corresponding to the + * Cartesian products generated in {@link #dataValidInputs}. 
+ * + * We perform exact-match tests of any HDF5 files produced using h5diff, which is insensitive to timestamps within the file. + * Binary serialized scorers may not be diff equivalent, so we just check for their existence. + */ + @Test(dataProvider = "dataValidInputs", groups = {"python"}) // python environment is required to run tool and to use h5diff for exact-match comparisons + public void testValidInputs(final String tag, + final ArgumentsBuilder argsBuilder) { + final File outputDir = UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS ? EXPECTED_TEST_FILES_DIR : createTempDir("train"); + final String outputPrefix = String.format("%s/%s", outputDir, tag); + argsBuilder.addOutput(outputPrefix); + + // add arguments for positive/unlabeled annotations based on the + // extract tag (the portion of the tag preceding ".train", e.g., extract.nonAS.snpIndel.posUn), + // which gives the basename for the annotation files + final String extractTag = tag.split(".train")[0]; + final File positiveAnnotationsHDF5 = new File(INPUT_FROM_EXTRACT_EXPECTED_TEST_FILES_DIR, + extractTag + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX); + final Function addPositiveAnnotations = ab -> + ADD_ANNOTATIONS_HDF5.apply(ab, positiveAnnotationsHDF5); + if (tag.contains("posNeg")) { + final File unlabeledAnnotationsHDF5 = new File(INPUT_FROM_EXTRACT_EXPECTED_TEST_FILES_DIR, + extractTag + ExtractVariantAnnotations.UNLABELED_TAG + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX); + final Function addUnlabeledAnnotations = ab -> + ADD_UNLABELED_ANNOTATIONS_HDF5.apply(ab, unlabeledAnnotationsHDF5); + final Function addCalibrationSensitivityThreshold = ab -> + ADD_CALIBRATION_SENSITIVITY_THRESHOLD.apply(ab, CALIBRATION_SENSITIVITY_THRESHOLD); + addPositiveAnnotations.andThen(addUnlabeledAnnotations).andThen(addCalibrationSensitivityThreshold).apply(argsBuilder); + } else { + addPositiveAnnotations.apply(argsBuilder); + } + + runCommandLine(argsBuilder); + + if 
(!UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS) { + assertExpectedOutputs(tag, outputPrefix); + } + } + + private static void assertExpectedOutputs(final String tag, + final String outputPrefix) { + if (tag.contains("train.snp.")) { + assertExpectedOutputsForVariantType(tag, outputPrefix, "snp"); + assertOutputsForVariantTypeDoNotExist(outputPrefix, "indel"); + } else if (tag.contains("train.snpIndel.")) { + assertExpectedOutputsForVariantType(tag, outputPrefix, "snp"); + assertExpectedOutputsForVariantType(tag, outputPrefix, "indel"); + } else { + Assert.fail("Unknown variant-type tag."); + } + } + + private static void assertExpectedOutputsForVariantType(final String tag, + final String outputPrefix, + final String variantType) { + final String tagAndVariantType = String.format("%s.%s", tag, variantType); + final String outputPrefixAndVariantType = String.format("%s.%s", outputPrefix, variantType); + + SystemCommandUtilsTest.runSystemCommand(String.format("h5diff %s/%s %s", + EXPECTED_TEST_FILES_DIR, + tagAndVariantType + TrainVariantAnnotationsModel.TRAINING_SCORES_HDF5_SUFFIX, + outputPrefixAndVariantType + TrainVariantAnnotationsModel.TRAINING_SCORES_HDF5_SUFFIX)); + SystemCommandUtilsTest.runSystemCommand(String.format("h5diff %s/%s %s", + EXPECTED_TEST_FILES_DIR, + tagAndVariantType + TrainVariantAnnotationsModel.CALIBRATION_SCORES_HDF5_SUFFIX, + outputPrefixAndVariantType + TrainVariantAnnotationsModel.CALIBRATION_SCORES_HDF5_SUFFIX)); + + assertScorerExpectedOutputs(tagAndVariantType, outputPrefixAndVariantType, false); + + if (tag.contains("posNeg")) { + SystemCommandUtilsTest.runSystemCommand(String.format("h5diff %s/%s %s", + EXPECTED_TEST_FILES_DIR, + tagAndVariantType + TrainVariantAnnotationsModel.UNLABELED_SCORES_HDF5_SUFFIX, + outputPrefixAndVariantType + TrainVariantAnnotationsModel.UNLABELED_SCORES_HDF5_SUFFIX)); + assertScorerExpectedOutputs(tagAndVariantType, outputPrefixAndVariantType, true); + } else { + Assert.assertFalse(new 
File(outputPrefixAndVariantType + TrainVariantAnnotationsModel.UNLABELED_SCORES_HDF5_SUFFIX).exists()); + Assert.assertFalse(new File(outputPrefixAndVariantType + TrainVariantAnnotationsModel.NEGATIVE_TAG + BGMMVariantAnnotationsScorer.BGMM_SCORER_SER_SUFFIX).exists()); + Assert.assertFalse(new File(outputPrefixAndVariantType + TrainVariantAnnotationsModel.NEGATIVE_TAG + PythonSklearnVariantAnnotationsScorer.PYTHON_SCORER_PKL_SUFFIX).exists()); + } + } + + private static void assertOutputsForVariantTypeDoNotExist(final String outputPrefix, + final String variantType) { + final String outputPrefixAndVariantType = String.format("%s.%s", outputPrefix, variantType); + Assert.assertFalse(new File(outputPrefixAndVariantType + TrainVariantAnnotationsModel.TRAINING_SCORES_HDF5_SUFFIX).exists()); + Assert.assertFalse(new File(outputPrefixAndVariantType + TrainVariantAnnotationsModel.CALIBRATION_SCORES_HDF5_SUFFIX).exists()); + Assert.assertFalse(new File(outputPrefixAndVariantType + TrainVariantAnnotationsModel.UNLABELED_SCORES_HDF5_SUFFIX).exists()); + Assert.assertFalse(new File(outputPrefixAndVariantType + BGMMVariantAnnotationsScorer.BGMM_SCORER_SER_SUFFIX).exists()); + Assert.assertFalse(new File(outputPrefixAndVariantType + PythonSklearnVariantAnnotationsScorer.PYTHON_SCORER_PKL_SUFFIX).exists()); + Assert.assertFalse(new File(outputPrefixAndVariantType + TrainVariantAnnotationsModel.NEGATIVE_TAG + BGMMVariantAnnotationsScorer.BGMM_SCORER_SER_SUFFIX).exists()); + Assert.assertFalse(new File(outputPrefixAndVariantType + TrainVariantAnnotationsModel.NEGATIVE_TAG + PythonSklearnVariantAnnotationsScorer.PYTHON_SCORER_PKL_SUFFIX).exists()); + } + + /** + * Binary serialized scorers may not be diff equivalent, so we just check for their existence. + * We assume that checking elsewhere for equivalence of the scores that the scorers generate provides sufficient + * coverage. 
+ */ + private static void assertScorerExpectedOutputs(final String tagAndVariantType, + final String outputPrefixAndVariantType, + final boolean isNegative) { + final String positiveOrNegativeTag = isNegative ? ".negative" : ""; + final String scorerTag = outputPrefixAndVariantType + positiveOrNegativeTag; + if (tagAndVariantType.contains("BGMM")) { + Assert.assertTrue(new File(scorerTag + BGMMVariantAnnotationsScorer.BGMM_SCORER_SER_SUFFIX).exists()); + Assert.assertFalse(new File(scorerTag + PythonSklearnVariantAnnotationsScorer.PYTHON_SCORER_PKL_SUFFIX).exists()); + } else if (tagAndVariantType.contains("IF")) { + Assert.assertTrue(new File(scorerTag + PythonSklearnVariantAnnotationsScorer.PYTHON_SCORER_PKL_SUFFIX).exists()); + Assert.assertFalse(new File(scorerTag + BGMMVariantAnnotationsScorer.BGMM_SCORER_SER_SUFFIX).exists()); + } else { + Assert.fail("Unknown model-backend tag."); + } + } + + @Test(groups = {"python"}) // python environment is required to run tool and to use h5diff for exact-match comparisons + public void testSNPOnlyModelsFromSNPOnlyAndSNPPlusIndelAnnotationsAreIdentical() { + final File outputDir = createTempDir("train"); + + final String outputPrefixSNPOnly = String.format("%s/test-snp", outputDir); + final ArgumentsBuilder argsBuilderSNPOnly = BASE_ARGUMENTS_BUILDER_SUPPLIER.get(); + argsBuilderSNPOnly.addOutput(outputPrefixSNPOnly); + final File positiveAnnotationsHDF5SNPOnly = new File(INPUT_FROM_EXTRACT_EXPECTED_TEST_FILES_DIR, + "extract.nonAS.snp.pos" + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX); + final Function addPositiveAnnotationsSNPOnly = ab -> + ADD_ANNOTATIONS_HDF5.apply(ab, positiveAnnotationsHDF5SNPOnly); + addPositiveAnnotationsSNPOnly + .andThen(ADD_SNP_MODE) + .apply(argsBuilderSNPOnly); + runCommandLine(argsBuilderSNPOnly); + + final String outputPrefixSNPPlusIndel = String.format("%s/test-snpIndel", outputDir); + final ArgumentsBuilder argsBuilderSNPPlusIndel = BASE_ARGUMENTS_BUILDER_SUPPLIER.get(); + 
argsBuilderSNPPlusIndel.addOutput(outputPrefixSNPPlusIndel); + final File positiveAnnotationsHDF5SNPPlusIndel = new File(INPUT_FROM_EXTRACT_EXPECTED_TEST_FILES_DIR, + "extract.nonAS.snpIndel.pos" + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX); + final Function addPositiveAnnotationsSNPPlusIndel = ab -> + ADD_ANNOTATIONS_HDF5.apply(ab, positiveAnnotationsHDF5SNPPlusIndel); + addPositiveAnnotationsSNPPlusIndel + .andThen(ADD_SNP_MODE) + .apply(argsBuilderSNPPlusIndel); + runCommandLine(argsBuilderSNPPlusIndel); + + SystemCommandUtilsTest.runSystemCommand(String.format("h5diff %s %s", + outputPrefixSNPOnly + ".snp" + TrainVariantAnnotationsModel.TRAINING_SCORES_HDF5_SUFFIX, + outputPrefixSNPPlusIndel + ".snp" + TrainVariantAnnotationsModel.TRAINING_SCORES_HDF5_SUFFIX)); + SystemCommandUtilsTest.runSystemCommand(String.format("h5diff %s %s", + outputPrefixSNPOnly + ".snp" + TrainVariantAnnotationsModel.CALIBRATION_SCORES_HDF5_SUFFIX, + outputPrefixSNPPlusIndel + ".snp" + TrainVariantAnnotationsModel.CALIBRATION_SCORES_HDF5_SUFFIX)); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testUnlabeledAnnotationsSpecifiedWithoutCalibrationSensitivityThreshold() { + final File outputDir = createTempDir("train"); + final String outputPrefix = String.format("%s/test", outputDir); + final ArgumentsBuilder argsBuilder = BASE_ARGUMENTS_BUILDER_SUPPLIER.get(); + argsBuilder.addOutput(outputPrefix); + final String extractTag = "extract.nonAS.snpIndel.posUn"; + final File positiveAnnotationsHDF5 = new File(INPUT_FROM_EXTRACT_EXPECTED_TEST_FILES_DIR, + extractTag + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX); + final Function addPositiveAnnotations = ab -> + ADD_ANNOTATIONS_HDF5.apply(ab, positiveAnnotationsHDF5); + final File unlabeledAnnotationsHDF5 = new File(INPUT_FROM_EXTRACT_EXPECTED_TEST_FILES_DIR, + extractTag + ExtractVariantAnnotations.UNLABELED_TAG + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX); + final 
Function addUnlabeledAnnotations = ab -> + ADD_UNLABELED_ANNOTATIONS_HDF5.apply(ab, unlabeledAnnotationsHDF5); + addPositiveAnnotations + .andThen(addUnlabeledAnnotations) + .apply(argsBuilder); + runCommandLine(argsBuilder); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testCalibrationSensitivityThresholdSpecifiedWithoutUnlabeledAnnotations() { + final File outputDir = createTempDir("train"); + final String outputPrefix = String.format("%s/test", outputDir); + final ArgumentsBuilder argsBuilder = BASE_ARGUMENTS_BUILDER_SUPPLIER.get(); + argsBuilder.addOutput(outputPrefix); + final String extractTag = "extract.nonAS.snpIndel.posUn"; + final File positiveAnnotationsHDF5 = new File(INPUT_FROM_EXTRACT_EXPECTED_TEST_FILES_DIR, + extractTag + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX); + final Function addPositiveAnnotations = ab -> + ADD_ANNOTATIONS_HDF5.apply(ab, positiveAnnotationsHDF5); + final Function addCalibrationSensitivityThreshold = ab -> + ADD_CALIBRATION_SENSITIVITY_THRESHOLD.apply(ab, CALIBRATION_SENSITIVITY_THRESHOLD); + addPositiveAnnotations + .andThen(addCalibrationSensitivityThreshold) + .apply(argsBuilder); + runCommandLine(argsBuilder); + } + + @Test(expectedExceptions = IllegalArgumentException.class) // python environment is required to run tool + public void testPositiveAndUnlabeledAnnotationNamesAreNotIdentical() { + final File outputDir = createTempDir("train"); + final String outputPrefix = String.format("%s/test", outputDir); + final ArgumentsBuilder argsBuilder = BASE_ARGUMENTS_BUILDER_SUPPLIER.get(); + argsBuilder.addOutput(outputPrefix); + final File positiveAnnotationsHDF5 = new File(INPUT_FROM_EXTRACT_EXPECTED_TEST_FILES_DIR, + "extract.nonAS.snpIndel.posUn" + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX); // non-allele-specific + final Function addPositiveAnnotations = ab -> + ADD_ANNOTATIONS_HDF5.apply(ab, positiveAnnotationsHDF5); + final File unlabeledAnnotationsHDF5 = new 
File(INPUT_FROM_EXTRACT_EXPECTED_TEST_FILES_DIR, + "extract.AS.snpIndel.posUn" + ExtractVariantAnnotations.UNLABELED_TAG + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX); // allele-specific + final Function addUnlabeledAnnotations = ab -> + ADD_UNLABELED_ANNOTATIONS_HDF5.apply(ab, unlabeledAnnotationsHDF5); + final Function addCalibrationSensitivityThreshold = ab -> + ADD_CALIBRATION_SENSITIVITY_THRESHOLD.apply(ab, CALIBRATION_SENSITIVITY_THRESHOLD); + addPositiveAnnotations + .andThen(addUnlabeledAnnotations) + .andThen(addCalibrationSensitivityThreshold) + .apply(argsBuilder); + runCommandLine(argsBuilder); + } + + @Test(expectedExceptions = UserException.BadInput.class, groups = {"python"}) // python environment is required to run tool + public void testPositiveAnnotationsOfSpecifiedVariantTypesNotPresent() { + final File outputDir = createTempDir("train"); + final String outputPrefix = String.format("%s/test", outputDir); + final ArgumentsBuilder argsBuilder = BASE_ARGUMENTS_BUILDER_SUPPLIER.get(); + argsBuilder.addOutput(outputPrefix); + final File positiveAnnotationsHDF5 = new File(INPUT_FROM_EXTRACT_EXPECTED_TEST_FILES_DIR, + "extract.nonAS.snp.posUn" + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX); // contains only SNPs, but SNP+INDEL is specified + final Function addPositiveAnnotations = ab -> + ADD_ANNOTATIONS_HDF5.apply(ab, positiveAnnotationsHDF5); + ADD_SNP_MODE + .andThen(ADD_INDEL_MODE) + .andThen(addPositiveAnnotations) + .apply(argsBuilder); + runCommandLine(argsBuilder); + } + + @Test(expectedExceptions = UserException.BadInput.class, groups = {"python"}) // python environment is required to run tool + public void testUnlabeledAnnotationsOfSpecifiedVariantTypesNotPresent() { + final File outputDir = createTempDir("train"); + final String outputPrefix = String.format("%s/test", outputDir); + final ArgumentsBuilder argsBuilder = BASE_ARGUMENTS_BUILDER_SUPPLIER.get(); + argsBuilder.addOutput(outputPrefix); + final File 
positiveAnnotationsHDF5 = new File(INPUT_FROM_EXTRACT_EXPECTED_TEST_FILES_DIR, + "extract.nonAS.snpIndel.posUn" + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX); + final Function addPositiveAnnotations = ab -> + ADD_ANNOTATIONS_HDF5.apply(ab, positiveAnnotationsHDF5); + final File unlabeledAnnotationsHDF5 = new File(INPUT_FROM_EXTRACT_EXPECTED_TEST_FILES_DIR, + "extract.nonAS.snp.posUn" + ExtractVariantAnnotations.UNLABELED_TAG + LabeledVariantAnnotationsWalker.ANNOTATIONS_HDF5_SUFFIX); // contains only SNPs, but SNP+INDEL is specified + final Function addUnlabeledAnnotations = ab -> + ADD_UNLABELED_ANNOTATIONS_HDF5.apply(ab, unlabeledAnnotationsHDF5); + final Function addCalibrationSensitivityThreshold = ab -> + ADD_CALIBRATION_SENSITIVITY_THRESHOLD.apply(ab, CALIBRATION_SENSITIVITY_THRESHOLD); + ADD_SNP_MODE.andThen(ADD_INDEL_MODE) + .andThen(addPositiveAnnotations) + .andThen(addUnlabeledAnnotations) + .andThen(addCalibrationSensitivityThreshold) + .apply(argsBuilder); + runCommandLine(argsBuilder); + } + + @Test(expectedExceptions = UserException.BadInput.class, groups = {"python"}) // python environment is required to run tool + public void testPositiveAnnotationForOneVariantTypeIsCompletelyMissing() { // TODO add analogous test that warning is emitted when annotation has zero variance? 
+ final File outputDir = createTempDir("train"); + final String outputPrefix = String.format("%s/test", outputDir); + final ArgumentsBuilder argsBuilder = BASE_ARGUMENTS_BUILDER_SUPPLIER.get(); + argsBuilder.addOutput(outputPrefix); + + // we will dummy up an annotations file that contains 2 annotations (ANNOT_1 and ANNOT_2) + // for 4 variants (2 SNPs and 2 INDELs); the INDELs will all have missing (i.e., NaN) ANNOT_1 values + final List annotationNames = Arrays.asList("ANNOT_1", "ANNOT_2"); + final double[][] annotations = new double[][]{ + new double[]{1, 2}, // SNP + new double[]{3, 4}, // SNP + new double[]{Double.NaN, 2}, // INDEL + new double[]{Double.NaN, 4}}; // INDEL + final List isSubset = Collections.nCopies(4, true); + + final File positiveAnnotationsHDF5 = LabeledVariantAnnotationsData.subsetAnnotationsToTemporaryFile( + annotationNames, annotations, isSubset); + + try (final HDF5File positiveAnnotationsHDF5File = new HDF5File(positiveAnnotationsHDF5, HDF5File.OpenMode.READ_WRITE)) { + positiveAnnotationsHDF5File.makeDoubleArray("/labels/snp", new double[]{1, 1, 0, 0}); + positiveAnnotationsHDF5File.makeDoubleArray("/labels/training", new double[]{1, 1, 1, 1}); + positiveAnnotationsHDF5File.makeDoubleArray("/labels/calibration", new double[]{1, 1, 1, 1}); + } + final Function addPositiveAnnotations = ab -> + ADD_ANNOTATIONS_HDF5.apply(ab, positiveAnnotationsHDF5); + + ADD_SNP_MODE.andThen(ADD_INDEL_MODE) + .andThen(addPositiveAnnotations) + .apply(argsBuilder); + runCommandLine(argsBuilder); + } +} \ No newline at end of file diff --git a/src/test/java/org/broadinstitute/hellbender/utils/python/PythonEnvironmentIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/utils/python/PythonEnvironmentIntegrationTest.java index fe89b721a1a..b055d8461cd 100644 --- a/src/test/java/org/broadinstitute/hellbender/utils/python/PythonEnvironmentIntegrationTest.java +++ 
b/src/test/java/org/broadinstitute/hellbender/utils/python/PythonEnvironmentIntegrationTest.java @@ -25,7 +25,7 @@ public Object[][] getDataPackagePresent() { { "pymc3", "3.1" }, { "keras", "2.2.4" }, { "h5py", "2.10.0" }, - { "sklearn", "0.22.2.post1" }, + { "sklearn", "0.23.1" }, { "matplotlib", "3.2.1" }, { "pandas", "1.0.3" }, { "argparse", null }, diff --git a/src/test/resources/large/filteringJointVcf/test_10_samples.22.avg.vcf.gz b/src/test/resources/large/filteringJointVcf/test_10_samples.22.avg.vcf.gz new file mode 100644 index 00000000000..31cba1e00f8 --- /dev/null +++ b/src/test/resources/large/filteringJointVcf/test_10_samples.22.avg.vcf.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dcf1dbda2255fbe1372d09d364835452d610822070b6b9b56b1733388aa3cd19 +size 140900871 diff --git a/src/test/resources/large/filteringJointVcf/test_10_samples.22.avg.vcf.gz.tbi b/src/test/resources/large/filteringJointVcf/test_10_samples.22.avg.vcf.gz.tbi new file mode 100644 index 00000000000..5fd47681849 --- /dev/null +++ b/src/test/resources/large/filteringJointVcf/test_10_samples.22.avg.vcf.gz.tbi @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af32939cd4f63a0a9251a50cc5658738285d4cee4833bcf1cda6b92d90c4b99b +size 100153 diff --git a/src/test/resources/large/filteringJointVcf/test_10_samples.23.avg.vcf.gz b/src/test/resources/large/filteringJointVcf/test_10_samples.23.avg.vcf.gz new file mode 100644 index 00000000000..55dde2493e4 --- /dev/null +++ b/src/test/resources/large/filteringJointVcf/test_10_samples.23.avg.vcf.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4144805bd8fabc74f3eea39a910dbd5c24017b844c44640efda49e3b0febe693 +size 112076612 diff --git a/src/test/resources/large/filteringJointVcf/test_10_samples.23.avg.vcf.gz.tbi b/src/test/resources/large/filteringJointVcf/test_10_samples.23.avg.vcf.gz.tbi new file mode 100644 index 00000000000..114d43936c5 --- /dev/null +++ 
b/src/test/resources/large/filteringJointVcf/test_10_samples.23.avg.vcf.gz.tbi @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3484a38abb76952b02863099c383eae26d50f44514c5045992f63cc3294ebe8 +size 114295 diff --git a/src/test/resources/large/filteringJointVcf/test_10_samples.sites_only.vcf.gz b/src/test/resources/large/filteringJointVcf/test_10_samples.sites_only.vcf.gz new file mode 100644 index 00000000000..f75a07bd09c --- /dev/null +++ b/src/test/resources/large/filteringJointVcf/test_10_samples.sites_only.vcf.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00212a6387eba259a2d060eef08f50f3de512a155ed4e746d38530310a582e14 +size 134260565 diff --git a/src/test/resources/large/filteringJointVcf/test_10_samples.sites_only.vcf.gz.tbi b/src/test/resources/large/filteringJointVcf/test_10_samples.sites_only.vcf.gz.tbi new file mode 100644 index 00000000000..475b5ba83a0 --- /dev/null +++ b/src/test/resources/large/filteringJointVcf/test_10_samples.sites_only.vcf.gz.tbi @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c99412c88d072d494e545f56acdf621f6c960cbb8f2d734532cf9d5d11e83104 +size 133485 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.pos.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.pos.annot.hdf5 new file mode 100644 index 00000000000..6f17056f47d --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.pos.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ba3af854cf35cffa95393038075dc3dd8907d0987896ecc15854fb928756359 +size 30408 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.pos.vcf 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.pos.vcf new file mode 100644 index 00000000000..67a8e58fe29 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.pos.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1d20489c2ff9b0ccba12a24c84d5d9fd61d62d8ffbb416593559120461b8140 +size 171038 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.pos.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.pos.vcf.idx new file mode 100644 index 00000000000..36818418317 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.pos.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73701b5eeb43593c3d3cbdf2f8c4383e6ec6dd04c2b47086a263d234c463f2a5 +size 114263 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.posUn.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.posUn.annot.hdf5 new file mode 100644 index 00000000000..c9ee58d80f2 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.posUn.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:511858197b92a96bac14d64e884a02497c958da24ada5fcebf5bd49664d78b59 +size 30512 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.posUn.unlabeled.annot.hdf5 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.posUn.unlabeled.annot.hdf5 new file mode 100644 index 00000000000..409f5b378bb --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.posUn.unlabeled.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35158eb4516e8db64479ad493c5cbe225f0a04afbae6c7145bdc3fe02a2d4162 +size 38160 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.posUn.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.posUn.vcf new file mode 100644 index 00000000000..c33bd749e80 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.posUn.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0232e0dbb277e6a5e736d01e0b17c1d0e7f0254c89f200e693393ab0962293e6 +size 171044 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.posUn.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.posUn.vcf.idx new file mode 100644 index 00000000000..737a04bb05b --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.indel.posUn.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6f669764aef593e112d21f7c6414cf2a54ced0fae9be140440e7e3e19055eac +size 114265 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.pos.annot.hdf5 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.pos.annot.hdf5 new file mode 100644 index 00000000000..687decbfa2d --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.pos.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e836468e49819f47679eb4368f8cd52626df2e5e879dfce4314586b4a708198 +size 146832 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.pos.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.pos.vcf new file mode 100644 index 00000000000..fef16673a21 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.pos.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3060969881dbc006d167f09817924d38b6345e25976ac53880f624d94aea68e9 +size 193277 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.pos.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.pos.vcf.idx new file mode 100644 index 00000000000..96624e70f39 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.pos.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3e477367d5b1f891ed1ef171acd096aa6ceabd2feda0c6699f68260442a1750 +size 114298 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.posUn.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.posUn.annot.hdf5 new file 
mode 100644 index 00000000000..803746a075d --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.posUn.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3250c8e597f71a6a2f83047a50b96d518ba350d74c3322b1d8a740256d1c4635 +size 147088 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.posUn.unlabeled.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.posUn.unlabeled.annot.hdf5 new file mode 100644 index 00000000000..a6d81581282 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.posUn.unlabeled.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4415f0d40620998e808113f6bcef97ba5ab9ff8cd8148a1f330d8c21b0c08a36 +size 32304 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.posUn.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.posUn.vcf new file mode 100644 index 00000000000..3dbb5880865 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.posUn.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cf5ee7adacf635c73d7493b99cc8df19a31acbbec991fbe5173e7cd6b405491 +size 193281 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.posUn.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.posUn.vcf.idx new file mode 100644 index 00000000000..a092ca99b1c --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snp.posUn.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92e591c5ec161a5ec44d76b780882194ccd6a22075ef9905079493d87a8be12a +size 114300 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.pos.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.pos.annot.hdf5 new file mode 100644 index 00000000000..e4b20d5259d --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.pos.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f1a99968e68b93349dcd1b6a9671d97a060e33bbcb384fcf79d4505e0a038a5 +size 174096 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.pos.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.pos.vcf new file mode 100644 index 00000000000..1b2a380111c --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.pos.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ea0ee6bca7622ae8670c2f8ee2930a3223ba5d3edd89871c8e4b5cf3cf96f9f +size 196269 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.pos.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.pos.vcf.idx new file mode 100644 index 00000000000..c5cab723e7d --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.pos.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bff89ecb61fef3a81c0a7fd58b397d9b7a4a62d1cd282beb50bb3c6f5b2564c +size 114496 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.posUn.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.posUn.annot.hdf5 new file mode 100644 index 00000000000..d974c3905c0 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.posUn.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f90c42658e62b3717f607ed44ffed1d570b506a8d07636070bee2b4a5dea2aba +size 174480 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.posUn.unlabeled.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.posUn.unlabeled.annot.hdf5 new file mode 100644 index 00000000000..bf70828b1e1 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.posUn.unlabeled.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:095a6624af5e354ad07a9695db1bdc06785e7088b30bf4696a09c64829ee2e1d +size 32080 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.posUn.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.posUn.vcf new file mode 100644 index 00000000000..201b4860fa1 --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.posUn.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fa4160640da9143000d5f3b2497ca20c02c0944ca53cfa03b1a63d935b2cf2e +size 196279 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.posUn.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.posUn.vcf.idx new file mode 100644 index 00000000000..5985428efb3 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.AS.snpIndel.posUn.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5119e3b8e791052ee7c71d354fe2fedf432e5b42a4fd07ccd37561e68b871d1 +size 114498 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.pos.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.pos.annot.hdf5 new file mode 100644 index 00000000000..acc759367e6 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.pos.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:091efef8c8ac199e66b04dc610fab56b63d37c0943986f63e37e5c72b7fe2f37 +size 31088 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.pos.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.pos.vcf new file mode 100644 index 00000000000..c33bd749e80 --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.pos.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0232e0dbb277e6a5e736d01e0b17c1d0e7f0254c89f200e693393ab0962293e6 +size 171044 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.pos.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.pos.vcf.idx new file mode 100644 index 00000000000..a93f309d0fa --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.pos.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40eab65c4b0806c0f14f0178fafdda682d113bb044d94d14803f2b0b212d0d7e +size 114266 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.posUn.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.posUn.annot.hdf5 new file mode 100644 index 00000000000..acc759367e6 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.posUn.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:091efef8c8ac199e66b04dc610fab56b63d37c0943986f63e37e5c72b7fe2f37 +size 31088 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.posUn.unlabeled.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.posUn.unlabeled.annot.hdf5 new file mode 100644 index 00000000000..32572d5a954 --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.posUn.unlabeled.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57c1cb118d8a46cf9f27fcf0dbc64e3de43b3350ae5a738571837f4145b2e8fc +size 32880 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.posUn.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.posUn.vcf new file mode 100644 index 00000000000..c33bd749e80 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.posUn.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0232e0dbb277e6a5e736d01e0b17c1d0e7f0254c89f200e693393ab0962293e6 +size 171044 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.posUn.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.posUn.vcf.idx new file mode 100644 index 00000000000..f09bcbfe1ab --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.indel.posUn.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f360ae4d112ea29bea0e19a83cb13fe552785a9f1ee3473fb2b7f6cb05f50d6 +size 114268 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.pos.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.pos.annot.hdf5 new file mode 100644 index 00000000000..6f4d614aa06 --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.pos.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3e58024a7fa06ff7483f12c7b50908ecbf96ad703b8194cddfe99e7c60be5c9 +size 153048 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.pos.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.pos.vcf new file mode 100644 index 00000000000..665f9422ec8 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.pos.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fada922eff03dbd88bbe2e8d593a5cd194e657babb98db49b6c14adae0c2f9de +size 193313 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.pos.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.pos.vcf.idx new file mode 100644 index 00000000000..435819c3ea5 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.pos.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69c40e76562c2e3d8af9818b0cdeee958f6b5851d4022be55cd9ab71acf9c8b9 +size 114301 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.posUn.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.posUn.annot.hdf5 new file mode 100644 index 00000000000..6f4d614aa06 --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.posUn.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3e58024a7fa06ff7483f12c7b50908ecbf96ad703b8194cddfe99e7c60be5c9 +size 153048 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.posUn.unlabeled.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.posUn.unlabeled.annot.hdf5 new file mode 100644 index 00000000000..d63b214b697 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.posUn.unlabeled.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4498cdde82d8b9a03dda5c5057227b0978f1da70b09b26f174b10970840597c +size 32880 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.posUn.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.posUn.vcf new file mode 100644 index 00000000000..665f9422ec8 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.posUn.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fada922eff03dbd88bbe2e8d593a5cd194e657babb98db49b6c14adae0c2f9de +size 193313 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.posUn.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.posUn.vcf.idx new file mode 100644 index 00000000000..bfc8b5c6560 --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snp.posUn.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e1ababc168beb7e1acd58011d0870d886f3c0b50e8d1e62a6c9611a3a7dfe2e +size 114303 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.pos.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.pos.annot.hdf5 new file mode 100644 index 00000000000..730b64eae2f --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.pos.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f05dd6973390ce9a2069fa8593107853d38dedf88a8db87a8500dbb01329fc9 +size 180992 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.pos.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.pos.vcf new file mode 100644 index 00000000000..abec25cff9d --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.pos.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84b539e52f7e0f24e840845ce3d090e4f55e7444d50c2a44177bb62041fb172a +size 196311 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.pos.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.pos.vcf.idx new file mode 100644 index 00000000000..0af9bd98f39 --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.pos.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d528a399f3d230ee8275e71e0878c1b2d132d010c57e78f94576004b87297425 +size 114502 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.posUn.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.posUn.annot.hdf5 new file mode 100644 index 00000000000..75691a1e949 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.posUn.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b00a4acfd4c6cc01e3539498bde4bab2279cffc88e8de6217e26e9db64179132 +size 180992 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.posUn.unlabeled.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.posUn.unlabeled.annot.hdf5 new file mode 100644 index 00000000000..3fcef28ea81 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.posUn.unlabeled.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ced17111b7819b00fdf48da669e717538ebca89913ae2ee09833dc4f5ef6890 +size 32880 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.posUn.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.posUn.vcf new file mode 100644 index 00000000000..abec25cff9d --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.posUn.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84b539e52f7e0f24e840845ce3d090e4f55e7444d50c2a44177bb62041fb172a +size 196311 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.posUn.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.posUn.vcf.idx new file mode 100644 index 00000000000..65a56af440f --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/extract/expected/extract.nonAS.snpIndel.posUn.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d7cc7e967cad1911f637d20172745d65adc0a0f804c578e85378028cc80a044 +size 114504 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/input/small_callset_low_threshold.sites-only.chr1.1-10M.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/input/small_callset_low_threshold.sites-only.chr1.1-10M.vcf new file mode 100644 index 00000000000..5bb2ef3ab94 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/input/small_callset_low_threshold.sites-only.chr1.1-10M.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ea6cbe230a5a18f3447cfd5d29ce2787fd4a625128ab147ce0a1b207e577d50 +size 2013818 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/input/small_callset_low_threshold.sites-only.chr1.1-10M.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/input/small_callset_low_threshold.sites-only.chr1.1-10M.vcf.idx new file mode 100644 index 00000000000..6926fb95f58 --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/input/small_callset_low_threshold.sites-only.chr1.1-10M.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f40a26b8528447a9d1b1154643cbd682a154e91a67e6dda58cf11a620a1af3dc +size 5387 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/1000G_omni2.5.hg38.chr1.1-5M.vcf.gz b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/1000G_omni2.5.hg38.chr1.1-5M.vcf.gz new file mode 100644 index 00000000000..4157ac3128e --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/1000G_omni2.5.hg38.chr1.1-5M.vcf.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5804bcbfb060e10c3aa841a4a92acfbafbf1b24c88c87fceaa0d9089eee699e +size 127853 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/1000G_omni2.5.hg38.chr1.1-5M.vcf.gz.tbi b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/1000G_omni2.5.hg38.chr1.1-5M.vcf.gz.tbi new file mode 100644 index 00000000000..bc59b8a6e25 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/1000G_omni2.5.hg38.chr1.1-5M.vcf.gz.tbi @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d09b831af6a1b8585c26da1b29d131f8983f121703c5131a6596a1e81e0408f +size 2141 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/1000G_omni2.5.hg38.chr1.5M-10M.vcf.gz b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/1000G_omni2.5.hg38.chr1.5M-10M.vcf.gz new file mode 100644 index 00000000000..5a556e7a0d7 --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/1000G_omni2.5.hg38.chr1.5M-10M.vcf.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c6559d5c1567042ddb0fb05d7a5b7d9a07c56c61d8d21adfa85c15bf44e24fa +size 132259 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/1000G_omni2.5.hg38.chr1.5M-10M.vcf.gz.tbi b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/1000G_omni2.5.hg38.chr1.5M-10M.vcf.gz.tbi new file mode 100644 index 00000000000..a7a45835346 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/1000G_omni2.5.hg38.chr1.5M-10M.vcf.gz.tbi @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbf9f97389369ac5e5a41420e58aaa3fa0a5f5edc21a6bd04b7e18c5bc21c914 +size 2542 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/Mills_and_1000G_gold_standard.indels.hg38.chr1.1-5M.vcf.gz b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/Mills_and_1000G_gold_standard.indels.hg38.chr1.1-5M.vcf.gz new file mode 100644 index 00000000000..187e5f24e86 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/Mills_and_1000G_gold_standard.indels.hg38.chr1.1-5M.vcf.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b610a0aeccbec80b69572abcb89e1d3c5e96bc7df22b38b8dccf0b3c6b0ed1b5 +size 45717 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/Mills_and_1000G_gold_standard.indels.hg38.chr1.1-5M.vcf.gz.tbi b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/Mills_and_1000G_gold_standard.indels.hg38.chr1.1-5M.vcf.gz.tbi new file mode 100644 index 00000000000..582b14d068e --- 
/dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/Mills_and_1000G_gold_standard.indels.hg38.chr1.1-5M.vcf.gz.tbi @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91926cca5a1c36a336f54ba918d0fe0581a6f6e89421a971c78c39aa9e5dd3e6 +size 2040 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/Mills_and_1000G_gold_standard.indels.hg38.chr1.5M-10M.vcf.gz b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/Mills_and_1000G_gold_standard.indels.hg38.chr1.5M-10M.vcf.gz new file mode 100644 index 00000000000..38011d42e49 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/Mills_and_1000G_gold_standard.indels.hg38.chr1.5M-10M.vcf.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5063e401c67443ce0c12c1534b3b1284fe690c826c8987d0430e516193d062ce +size 49655 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/Mills_and_1000G_gold_standard.indels.hg38.chr1.5M-10M.vcf.gz.tbi b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/Mills_and_1000G_gold_standard.indels.hg38.chr1.5M-10M.vcf.gz.tbi new file mode 100644 index 00000000000..27bd4edcff5 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/resources/Mills_and_1000G_gold_standard.indels.hg38.chr1.5M-10M.vcf.gz.tbi @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a738a699beff718443d36022bf9fb35686498f63d7f8e5c40f79ef26e3d5908 +size 2465 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.annot.hdf5 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.annot.hdf5 new file mode 100644 index 00000000000..947ccf6cdf6 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ecfde9d89634bdeab65242b0b9fc64d2c3607cec0b4533ac2e6b8f71c8fde1e +size 736656 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.scores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.scores.hdf5 new file mode 100644 index 00000000000..c52cc89bfe4 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.scores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0746acbff61c8ef95225e010964b69f48465f35185a1f6af576bb53ab726314d +size 35136 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf new file mode 100644 index 00000000000..d08cc10df21 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:fc110dfc24a356c280ffd772af354a4cee06eb46e8e2f321638d1faa882b17bb +size 2227437 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf.idx new file mode 100644 index 00000000000..01ca0f42445 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67ee6520f4abc8f9f659e3704e2cf45318a46a963297eec9c5534cb352584718 +size 119222 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.annot.hdf5 new file mode 100644 index 00000000000..bfca590ccc5 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39872e8a5516b5480015190f646a94dc22eeb4b58b337c46bbab83d67fd5b789 +size 822288 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.scores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.scores.hdf5 new file mode 100644 index 
00000000000..9c9224beed2 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.scores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a5b0cd60e2dc9387ceedce1e42bebaac932af2364228944f00be0626bad167e +size 38440 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf new file mode 100644 index 00000000000..8af03459e62 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7bb3acb4c30df8259c64e5c47ba7ffaf416fa6f7fa271e8cbd216a555ee62a22 +size 2243539 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf.idx new file mode 100644 index 00000000000..485769c9598 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29bdcaa085904edb984ca9ab2724efc9142e63256291cead4e2407080cf87196 +size 119227 diff --git 
a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.annot.hdf5 new file mode 100644 index 00000000000..fbf0990ee70 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b25607c74d197a7116421014925ad4dcc10c326e561b193b1e2eb71152598369 +size 766368 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.scores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.scores.hdf5 new file mode 100644 index 00000000000..ee4850c9acb --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.scores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afb44cc0b2f1c821d4b79f4c0145edc5fc662d06ce13239fd2077e1d1e045783 +size 34960 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf new file mode 100644 index 00000000000..e46bbcf2a15 --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:993e2d40dea8558c001a7321a4bbe4804877b2de36c3a266416310446c915ccb +size 2226076 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf.idx new file mode 100644 index 00000000000..9be1548020d --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snp.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a56007a28f971a86349be709cb2b5ce3821ef5f3ae19ff0f9dcd2841021a510 +size 119225 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.annot.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.annot.hdf5 new file mode 100644 index 00000000000..1378a5e61da --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.annot.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38fb5c443979d9468de740c26c1e3b2d8f27938c1ffb43ebf48ae1bef94196b3 +size 829672 diff --git 
a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.scores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.scores.hdf5 new file mode 100644 index 00000000000..58244d511a7 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.scores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bb53ebfca7a737737a1d01ff541d414c3cef07d507b3e360d360079239d723a +size 37720 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf new file mode 100644 index 00000000000..4af1921ce48 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05f85d264a457cdd81896bde03f51b2369343da5ade21b1c8df183a2b7e8f974 +size 2242450 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf.idx b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf.idx new file mode 100644 index 00000000000..34133ce42ad --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/score/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.score.snpIndel.vcf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cba3840238dc7d6c7d85eeda892da51abdccf1e80c60b9030fa781da42d16b9f +size 119230 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.calibrationScores.hdf5 new file mode 100644 index 00000000000..449a7e34730 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2e3672445358ec0259fc96c72a2c32781e21b584d869c49a6356bb1869a577d +size 4960 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.negative.scorer.pkl new file mode 100644 index 00000000000..c262ca0bc24 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.negative.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b439960dcb18984ea43a0cc6f918eeb54ea796de72730264039081cb8b32ae4 +size 356566 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.scorer.pkl 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.scorer.pkl new file mode 100644 index 00000000000..114008b6174 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a174a894dee69236e7767bff18a752981e14b9d891efe05843137bcf4b67cffd +size 506091 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.trainingScores.hdf5 new file mode 100644 index 00000000000..794bef0ac90 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63e6805ffd01e5b0420f44cd35a1e41faa96c4a4252c6487ef000dd98290d99e +size 5984 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.unlabeledScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.unlabeledScores.hdf5 new file mode 100644 index 00000000000..2f8fda2a056 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IF.snp.unlabeledScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:11c14051f3c1d2ab1481a0d77f2f127d5cb5fa5f0978879f0225a83c86e70456 +size 3192 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5 new file mode 100644 index 00000000000..b803f7adaa2 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3d1b3de26c39fa919cede168b2b5e5ce560a1c10fae5db82da4108467ac96ce +size 4960 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.negative.scorer.pkl new file mode 100644 index 00000000000..79698645b6e --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.negative.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf5955e6395e1e3ca4843a395bd87b0664cbe3d931f27a1a9f68f54f93a6825b +size 371024 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.scorer.pkl 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.scorer.pkl new file mode 100644 index 00000000000..00f799db2fe --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:066e3f2e993cf5485d3c650cb10bc99a8b082f7aac4ee427e2eda925af758b14 +size 514533 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.trainingScores.hdf5 new file mode 100644 index 00000000000..3a9c7c2ef55 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24c3ed12e070196e9178f2da8876d58f2d0211d3bc424291bb9754f10fecaa78 +size 5984 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5 new file mode 100644 index 00000000000..c522bdbdc76 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5 @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:63ac52c99cb736b74a4ddadd3dfa529a7a5f0524f6a4733181a5f83c98daca79 +size 3192 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IF.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IF.snp.calibrationScores.hdf5 new file mode 100644 index 00000000000..058663f5a25 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IF.snp.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:843f2fcfb16e5ff1cc69f135129881e560e19adcf4ae20d43a41e4f86ae0cea9 +size 4960 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IF.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IF.snp.scorer.pkl new file mode 100644 index 00000000000..b49deaadde6 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IF.snp.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2b2d3350bc640f19bc81f527ea45f78ff0c6069fb7acf45524ee26e69f4668c +size 506091 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IF.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IF.snp.trainingScores.hdf5 new file mode 100644 index 
00000000000..083847575cc --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IF.snp.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3479fc04ae522a9e5ea4558b484a723df7f40b07401196e36fdfb1412733b99 +size 5984 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.calibrationScores.hdf5 new file mode 100644 index 00000000000..1e4e7af4c24 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:219b339ad1fe955f6fa20280b4a0a993d6c4dcfa5e7be025cf62bf9937a57e7d +size 4960 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.scorer.pkl new file mode 100644 index 00000000000..00f799db2fe --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:066e3f2e993cf5485d3c650cb10bc99a8b082f7aac4ee427e2eda925af758b14 +size 514533 diff --git 
a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.trainingScores.hdf5 new file mode 100644 index 00000000000..02ddb4fb358 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b2c47a9586f1a4c4c37a62c902d91683392bf37e575f925266815ddc9bcc6e8 +size 5984 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.calibrationScores.hdf5 new file mode 100644 index 00000000000..11e985264af --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e52232e30ee87672170c3c6d38d1d422ddafcd3219081a912b7227d69f445e7 +size 2664 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.negative.scorer.pkl new file mode 100644 index 00000000000..12118c548f9 --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.negative.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28d5205329aba6be796f5a640f71598bcd9f10b31f871e613aa63d7dccfcc8c2 +size 130367 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.scorer.pkl new file mode 100644 index 00000000000..5b629aeed4e --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6872a50edc608ffc632d5de1849384e05f916d0df64d391c68fc92abe0918d2e +size 235813 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.trainingScores.hdf5 new file mode 100644 index 00000000000..7a2b7786b01 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8669a0d4ca10acfc5d969021d06bbafa2efc14af057ca968d365a2fe9dedac45 +size 2880 diff --git 
a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.unlabeledScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.unlabeledScores.hdf5 new file mode 100644 index 00000000000..69d2a8d5d22 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.unlabeledScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:511e968d47dacb11b03439678f2a6f3bf54cde4c0e158d9ab99bfcb9478b5e10 +size 2472 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.calibrationScores.hdf5 new file mode 100644 index 00000000000..e703b28a4cb --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78b502beb500979c2d8e24f5f60629489da00984c0935929643a3dbade703086 +size 4960 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.negative.scorer.pkl new file mode 100644 index 00000000000..cc692c3e418 --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.negative.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a71085c8ff3f9266434b07166abae0977028cf6c3609656e557fa924ab37d22 +size 356566 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.scorer.pkl new file mode 100644 index 00000000000..6c73d0e2d5b --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e0c0d98b5a10f80b43e857781e3eb2d066d5b46bbf8aace671c1e848cfe7a7a +size 506091 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.trainingScores.hdf5 new file mode 100644 index 00000000000..129cbdc80d2 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fe6ddc2bad139260fc01d783ebb08613449898568dc01542f836395d7666c64 +size 5984 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.unlabeledScores.hdf5 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.unlabeledScores.hdf5 new file mode 100644 index 00000000000..ccf01a0c470 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.unlabeledScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14e3e8e2a0eab4992bf72bca0600109fada5f93848d244d1b1c097de2aeaa2ab +size 3192 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.calibrationScores.hdf5 new file mode 100644 index 00000000000..ead5e9f41bd --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e37fcda4dfd551f30a889a98862e9f6ed42a305c14d1f552a07567aa9b18441 +size 2664 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.negative.scorer.pkl new file mode 100644 index 00000000000..06ba7f87603 --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.negative.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28c4176bea3fec56b992ecb0259d191ebe7238615a8a803eaf1c2c9a61adccf5 +size 133823 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.scorer.pkl new file mode 100644 index 00000000000..86600de5f2c --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1de142d06f85e0303ff046377c3eb51742b8556cc3531d1880d7d37a571a1ae7 +size 240055 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.trainingScores.hdf5 new file mode 100644 index 00000000000..8dc0a46dec7 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c47e4959c7436c4aea44694ea2c401ab96624843b5f682691c65ac9435d3656 +size 2880 diff --git 
a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.unlabeledScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.unlabeledScores.hdf5 new file mode 100644 index 00000000000..8182344fc6d --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.unlabeledScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e738c016efdfd9adacb2df9021f655bd0107a6a73cc8c788066b94368eca1c83 +size 2472 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5 new file mode 100644 index 00000000000..8b8393f2fa8 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35da3cdd364d94815492009ef56b545a1221fd8ec71003d5c1a52174b6156613 +size 4960 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.negative.scorer.pkl new file mode 
100644 index 00000000000..79698645b6e --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.negative.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf5955e6395e1e3ca4843a395bd87b0664cbe3d931f27a1a9f68f54f93a6825b +size 371024 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.scorer.pkl new file mode 100644 index 00000000000..00f799db2fe --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:066e3f2e993cf5485d3c650cb10bc99a8b082f7aac4ee427e2eda925af758b14 +size 514533 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.trainingScores.hdf5 new file mode 100644 index 00000000000..eb8ac9e0552 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:367eb408b122fd23b49ee9c464e28f188851406b16be96a004b58999a26c87d8 +size 5984 diff --git 
a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5 new file mode 100644 index 00000000000..0ef22ca8754 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08457ce521c97cd9fe6a4a8be79d4b7f74c155382d8d43d3db0963ef3e1b15c0 +size 3192 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.calibrationScores.hdf5 new file mode 100644 index 00000000000..8a9731e5846 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25ccfe56dcd1890b70464a5a9c61e550f1f06a340d8d40c8c3637b573a6af6ae +size 2664 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.scorer.pkl new file mode 100644 index 00000000000..28519a5a7f5 --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4e1bba0d4e80002b36ae9c6b593207bd2e400a4a88f0f2d69bdcfd7665f18fe +size 235812 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.trainingScores.hdf5 new file mode 100644 index 00000000000..57490375485 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f85b85807a022ff37529099727d0475e9e4fe6c796d6ef69ec25037e4c4ffff +size 2880 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.calibrationScores.hdf5 new file mode 100644 index 00000000000..f678f3ac45a --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca24136189690867a758ff1102ddd1168d49379bd5cf31c5e809f3049088ca4e +size 4960 diff --git 
a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.scorer.pkl new file mode 100644 index 00000000000..f30afeafbb7 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0aaf6d8e05d906b80771f1165c75792451b86a52f7759774ee55175bea288c6d +size 506090 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.trainingScores.hdf5 new file mode 100644 index 00000000000..b3b1cff2b66 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d93a6ff01ea21375c0b1998197c6bc2d25ef876d050abee095497a939883c0f2 +size 5984 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.calibrationScores.hdf5 new file mode 100644 index 00000000000..a51ead9a242 --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:829946ab39fde054c01fd38c14907141eee1c2fb4af5ba7dc93a1dc432ea3483 +size 2664 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.scorer.pkl new file mode 100644 index 00000000000..86600de5f2c --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1de142d06f85e0303ff046377c3eb51742b8556cc3531d1880d7d37a571a1ae7 +size 240055 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.trainingScores.hdf5 new file mode 100644 index 00000000000..a74970fd2fc --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ada773b0af0a3031c8f8a7cf51144929db6cff082d834aeffe17a796e48da7a +size 2880 diff --git 
a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.calibrationScores.hdf5 new file mode 100644 index 00000000000..3c889afe1a3 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b8a31718b25adc789a14d952403706a497e799f677fce94c0e5abcb8f3ca75a +size 4960 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.scorer.pkl new file mode 100644 index 00000000000..00f799db2fe --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:066e3f2e993cf5485d3c650cb10bc99a8b082f7aac4ee427e2eda925af758b14 +size 514533 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.trainingScores.hdf5 new file mode 100644 index 
00000000000..a7a57c5e6e0 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.AS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b72e1038e89fc1e16e17a32194e59bc90cb2f479e60ed31ad891fc4821f6fd6 +size 5984 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.calibrationScores.hdf5 new file mode 100644 index 00000000000..1af4242bbec --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7687aa701fe0fdb52d86a27ccc12a6cc8bb2b57906ea2335146396bfae47ea1b +size 4960 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.negative.scorer.pkl new file mode 100644 index 00000000000..f99de98d4fb --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.negative.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7d481a06e9c5d27b8a322712818731b508ecb09309cacff1c5f24df1077d975 +size 368366 diff --git 
a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.scorer.pkl new file mode 100644 index 00000000000..21c88876f37 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f4f417b26d9478fb36ac5372f1634140a4114f09d207a434f8d582a5936e9b9 +size 556675 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.trainingScores.hdf5 new file mode 100644 index 00000000000..17e3ab4ef97 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:048b4690d9f1fdce1e3dbabf995e8e306965ea8e00c21a92129d67e0e3b8fb5c +size 5992 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.unlabeledScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.unlabeledScores.hdf5 new file mode 100644 index 00000000000..323cf93db29 --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IF.snp.unlabeledScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e4033c901c80e97d4668d38e40b6677ec68d5d6f960d498c90d6311c971d6ae +size 3168 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5 new file mode 100644 index 00000000000..9638837ac77 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff6eb7db94e1e78d2aa4e2bad08eb92e0df1ae88ef29e27f7f81362ac56e4faa +size 4960 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.negative.scorer.pkl new file mode 100644 index 00000000000..02016de49a7 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.negative.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:373207a645b1caa64fe15fd4fec77556c87c11f0f304b8e4920094a59cae89eb +size 359135 diff --git 
a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.scorer.pkl new file mode 100644 index 00000000000..ac1fe518303 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0510d85a2680139fbfb2d3c8f2d5fed8977af834bf82d09d9090a06fa8d454f4 +size 525312 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.trainingScores.hdf5 new file mode 100644 index 00000000000..6972268a95c --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:443ed54e4403ba517c2370e18f95200f9b9dc4648c914424344626bacab6c4f2 +size 5992 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5 new file mode 100644 index 00000000000..90420cf2917 --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3cd7216a8a13adb6640b1e91d939600bc664202964b0cc74d76d8070c3422b75 +size 3168 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF.snp.calibrationScores.hdf5 new file mode 100644 index 00000000000..b8c18492487 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF.snp.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f0b956ad8c517d885254ba6f392713e098a92aeeccc699c5a17f53c1504f119 +size 4960 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF.snp.scorer.pkl new file mode 100644 index 00000000000..a9a9086bb5e --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF.snp.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cf2f338bb130d899eca8ceab17339f6f166958642101de301fb8458b11131a1 +size 556675 diff --git 
a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF.snp.trainingScores.hdf5 new file mode 100644 index 00000000000..486c29310ee --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IF.snp.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:537cb3879bde1388c6eca2035682a0c7b771ce790b523649e65f167a23eed255 +size 5992 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.calibrationScores.hdf5 new file mode 100644 index 00000000000..8fe0c3fa585 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25b24f7a4b85ad16f63043285fb31cd580e3498629bab0ef257a98d7ec318471 +size 4960 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.scorer.pkl new file mode 100644 index 00000000000..ac1fe518303 --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0510d85a2680139fbfb2d3c8f2d5fed8977af834bf82d09d9090a06fa8d454f4 +size 525312 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.trainingScores.hdf5 new file mode 100644 index 00000000000..075038efb0f --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snp.posOnly.IFDifferentSeed.snp.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:742c4374a393e2f86ad836001576d77210dc62cdf11cf8748abffe4399c02d0f +size 5992 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.calibrationScores.hdf5 new file mode 100644 index 00000000000..4ae3f77a6dd --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:298e36701a70226f3720cd40cd6ca8f37404a807cbd193c23d764141f5594f36 +size 2664 diff --git 
a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.negative.scorer.pkl new file mode 100644 index 00000000000..8c7e18e918e --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.negative.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff46e688731825bb03a6c9504ed7847c998c129a1e13c507a14f7adb56d733ad +size 108247 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.scorer.pkl new file mode 100644 index 00000000000..2aba7f5c93a --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe5ca4414318a540488afbdea99293e0f67b02725839c13faa5b3ff39b959e7e +size 259163 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.trainingScores.hdf5 new file mode 100644 index 00000000000..004a5bdb157 --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7243a9e90836489a979905a386811a688bd07968c115063351b77bf91c72efc3 +size 2880 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.unlabeledScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.unlabeledScores.hdf5 new file mode 100644 index 00000000000..9cc88998aef --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.indel.unlabeledScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00393e27ad2c98fd4c36a003d7dcbc175d31d8346ace63e52b458df76a8d7457 +size 2496 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.calibrationScores.hdf5 new file mode 100644 index 00000000000..a52edeb5b40 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b3040e1926e7b59c623e2b930e16556d26dd91c4d586120f7db156f3f2f14fe +size 4960 diff --git 
a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.negative.scorer.pkl new file mode 100644 index 00000000000..df2c423b7d9 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.negative.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4cbb64dbf96f8cbc10908aed4c8ec2e3fcf01af7a3512c90e48aa743af3bf28 +size 368366 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.scorer.pkl new file mode 100644 index 00000000000..9192d59204c --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86e04d419ae8a211acbead86c467cd8c3578c3312f31fdc5600eaefd040ffc32 +size 556675 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.trainingScores.hdf5 new file mode 100644 index 00000000000..483bfe123ec --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8007c65df2a62f1b12cb2bd9d0818ea106edf2fc18610318e1e09def0f1bd77a +size 5992 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.unlabeledScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.unlabeledScores.hdf5 new file mode 100644 index 00000000000..b7ab0c0c576 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IF.snp.unlabeledScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6860775947699fe2df688d0c17321de501ef18c12c87dbb430221ba1c27e56b +size 3168 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.calibrationScores.hdf5 new file mode 100644 index 00000000000..624a515e7f8 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:223ee064aed2de5a2e0c7a773b08d730666fa28b49181984dc8f16d07233b2ef +size 2664 diff --git 
a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.negative.scorer.pkl new file mode 100644 index 00000000000..54aa186cf73 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.negative.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0b377bad272e94bf11bca14ed7a7dd3c67296f347509d272b9538b695579199 +size 132823 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.scorer.pkl new file mode 100644 index 00000000000..1142e1f4599 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:426cec56e16eec10f47c824797c035772d1ebf3cf4f73972e8a541deca622cd3 +size 248813 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.trainingScores.hdf5 new file mode 100644 
index 00000000000..c91fcf083b5 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f80beea7f769595a56aa8af6d90335626fa5c93fe5f61761e998f59e774f3104 +size 2880 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.unlabeledScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.unlabeledScores.hdf5 new file mode 100644 index 00000000000..64fbf36622e --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.indel.unlabeledScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ba9bf669a916ee10750cf8cf78968af0372a5d272640be9fe3fba97fc4e6059 +size 2496 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5 new file mode 100644 index 00000000000..6a831bf1e51 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:dce6eb6a9175112940d97573e3e99818a041f4c8311bd0d790b11bad7153cc90 +size 4960 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.negative.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.negative.scorer.pkl new file mode 100644 index 00000000000..02016de49a7 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.negative.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:373207a645b1caa64fe15fd4fec77556c87c11f0f304b8e4920094a59cae89eb +size 359135 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.scorer.pkl new file mode 100644 index 00000000000..ac1fe518303 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0510d85a2680139fbfb2d3c8f2d5fed8977af834bf82d09d9090a06fa8d454f4 +size 525312 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.trainingScores.hdf5 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.trainingScores.hdf5 new file mode 100644 index 00000000000..45b09bcdafc --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9885e1ace48f972b88d808fe9dcf31aae828b0f327909c3922305d48659c9516 +size 5992 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5 new file mode 100644 index 00000000000..3baaddc00b8 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posNeg.IFDifferentSeed.snp.unlabeledScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60dfec8ee105371c8b69b4c6790f6920c1f94adfad1bca58fa353a016224c3c4 +size 3168 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.calibrationScores.hdf5 new file mode 100644 index 00000000000..9937d988ad3 --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c9b6a3bf6461653330c84b6ed491fbaf065e87c00b707409009a48e55dc3546 +size 2664 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.scorer.pkl new file mode 100644 index 00000000000..7ab5b7071f4 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb95c34fdfa9bb4126d27355e39708de5d273694ce2edf639a02c324608bfdd2 +size 259163 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.trainingScores.hdf5 new file mode 100644 index 00000000000..61cd73704ad --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.indel.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b1c27c9bb2587fed510eb347fe53332828c2ee0f1cc6f22f8e259d275c5f877 +size 2880 diff --git 
a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.calibrationScores.hdf5 new file mode 100644 index 00000000000..91c0e50cbd1 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfb11237cffa3309f53869996111de5be91efacc8b210d43101535d90d6195a4 +size 4960 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.scorer.pkl new file mode 100644 index 00000000000..786b299758d --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9113bec7467a7b597e6af65108fd19f2260510fc0174fa5bf071ca1b837e0b28 +size 556675 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.trainingScores.hdf5 new file mode 100644 index 00000000000..7f5f3e9342d --- /dev/null +++ 
b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IF.snp.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21a219096d12b0122e0e7dc7962d26911289d2e17377d606bb504cb3bf87daa6 +size 5992 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.calibrationScores.hdf5 new file mode 100644 index 00000000000..8a91db1f5b2 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e3fc94a375bde9284a219c1c2eb2b204731af0c4fce15426f7903f6fed43ecb +size 2664 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.scorer.pkl new file mode 100644 index 00000000000..1142e1f4599 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:426cec56e16eec10f47c824797c035772d1ebf3cf4f73972e8a541deca622cd3 +size 248813 diff --git 
a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.trainingScores.hdf5 new file mode 100644 index 00000000000..e0fdf55eb20 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.indel.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ae00383d5302dfb129e6e7c0c61cb115e74fade18b0b62bba6284f876fa8277 +size 2880 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.calibrationScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.calibrationScores.hdf5 new file mode 100644 index 00000000000..1250aeddd2a --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.calibrationScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a2fdb209c8814f63837d30bca8883ed79063d66fe22516f4dc6cb54542d743e +size 4960 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.scorer.pkl b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.scorer.pkl new file 
mode 100644 index 00000000000..ac1fe518303 --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.scorer.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0510d85a2680139fbfb2d3c8f2d5fed8977af834bf82d09d9090a06fa8d454f4 +size 525312 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.trainingScores.hdf5 b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.trainingScores.hdf5 new file mode 100644 index 00000000000..4f95113bc2e --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/expected/extract.nonAS.snpIndel.posUn.train.snpIndel.posOnly.IFDifferentSeed.snp.trainingScores.hdf5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c063a077702f5979525286ff8bda09d9f6133e251babf27d0ccb9e384578fc0 +size 5992 diff --git a/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/isolation-forest-hyperparameters-different-seed.json b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/isolation-forest-hyperparameters-different-seed.json new file mode 100644 index 00000000000..6fbb7d105da --- /dev/null +++ b/src/test/resources/large/org/broadinstitute/hellbender/tools/walkers/vqsr/scalable/train/isolation-forest-hyperparameters-different-seed.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddca401b3f0fdceedc96946c8ced9870984f1ae34ce5e5626cc4b08152639532 +size 23