diff --git a/scripts/variantstore/wdl/ngs_filter_extract.inputs.json b/scripts/variantstore/wdl/ngs_filter_extract.inputs.json
new file mode 100644
index 00000000000..2d4530ab2c8
--- /dev/null
+++ b/scripts/variantstore/wdl/ngs_filter_extract.inputs.json
@@ -0,0 +1,43 @@
+{
+  "NgsFilterExtract.reference": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta",
+  "NgsFilterExtract.reference_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai",
+  "NgsFilterExtract.reference_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict",
+
+  "NgsFilterExtract.snp_recalibration_annotation_values": ["AS_QD", "AS_MQRankSum", "AS_ReadPosRankSum", "AS_FS", "AS_MQ", "AS_SOR"],
+  "NgsFilterExtract.snp_recalibration_tranche_values": ["100.0", "99.95", "99.9", "99.8", "99.6", "99.5", "99.4", "99.3", "99.0", "98.0", "97.0", "90.0"],
+
+  "NgsFilterExtract.indel_recalibration_annotation_values": ["AS_FS", "AS_ReadPosRankSum", "AS_MQRankSum", "AS_QD", "AS_SOR"],
+  "NgsFilterExtract.indel_recalibration_tranche_values": ["100.0", "99.95", "99.9", "99.5", "99.0", "97.0", "96.0", "95.0", "94.0", "93.5", "93.0", "92.0", "91.0", "90.0"],
+
+  "NgsFilterExtract.axiomPoly_resource_vcf": "gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz",
+  "NgsFilterExtract.axiomPoly_resource_vcf_index": "gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz.tbi",
+  "NgsFilterExtract.dbsnp_vcf": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf",
+  "NgsFilterExtract.dbsnp_vcf_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx",
+  "NgsFilterExtract.hapmap_resource_vcf": "gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz",
+  "NgsFilterExtract.hapmap_resource_vcf_index": "gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz.tbi",
+  "NgsFilterExtract.mills_resource_vcf": "gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz",
+  "NgsFilterExtract.mills_resource_vcf_index": "gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi",
+  "NgsFilterExtract.omni_resource_vcf": "gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz",
+  "NgsFilterExtract.omni_resource_vcf_index": "gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz.tbi",
+  "NgsFilterExtract.one_thousand_genomes_resource_vcf": "gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz",
+  "NgsFilterExtract.one_thousand_genomes_resource_vcf_index": "gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz.tbi",
+
+  "NgsFilterExtract.small_disk": 100,
+  "NgsFilterExtract.medium_disk": 200,
+  "NgsFilterExtract.large_disk": 1000,
+  "NgsFilterExtract.huge_disk": 2000,
+
+  "NgsFilterExtract.gatk_override": "gs://broad-dsp-spec-ops/kcibul/gatk-package-4.1.8.1-145-gefaac86-SNAPSHOT-local.jar",
+
+  "NgsFilterExtract.fq_sample_table": "spec-ops-aou.kc_acmg.metadata",
+  "NgsFilterExtract.fq_alt_allele_table": "spec-ops-aou.kc_acmg.alt_allele",
+  "NgsFilterExtract.query_project": "spec-ops-aou",
+
+  "NgsFilterExtract.filter_set_name": "wdl_testing-1234",
+  "NgsFilterExtract.output_file_base_name": "kc_acmg.sites_only",
+
+  "NgsFilterExtract.fq_filter_set_info_table": "spec-ops-aou.kc_acmg.filter_set_info",
+  "NgsFilterExtract.fq_filter_set_tranches_table": "spec-ops-aou.kc_acmg.filter_set_tranches"
+
+
+}
diff --git a/scripts/variantstore/wdl/ngs_filter_extract.wdl b/scripts/variantstore/wdl/ngs_filter_extract.wdl
new file mode 100644
index 00000000000..d80a984a12b
--- /dev/null
+++ b/scripts/variantstore/wdl/ngs_filter_extract.wdl
@@ -0,0 +1,497 @@
+version 1.0
+
+workflow NgsFilterExtract {
+    input {
+        Int max_chrom_id = 24
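+        # Presumably chromosomes 1-22 plus X (23) and Y (24); the scatter
+        # below turns its 0-based index into this 1-based chrom_id.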
"spec-ops-aou.kc_acmg.filter_set_info", + "NgsFilterExtract.fq_filter_set_tranches_table": "spec-ops-aou.kc_acmg.filter_set_tranches" + + +} diff --git a/scripts/variantstore/wdl/ngs_filter_extract.wdl b/scripts/variantstore/wdl/ngs_filter_extract.wdl new file mode 100644 index 00000000000..d80a984a12b --- /dev/null +++ b/scripts/variantstore/wdl/ngs_filter_extract.wdl @@ -0,0 +1,497 @@ +version 1.0 + +workflow NgsFilterExtract { + input { + Int max_chrom_id = 24 + + # bug in cromwell, can't support large integers... + # https://github.com/broadinstitute/cromwell/issues/2685 + String chrom_offset = "1000000000000" + + File reference + File reference_index + File reference_dict + + String fq_sample_table + String fq_alt_allele_table + String query_project + + String filter_set_name + String fq_filter_set_info_table + String fq_filter_set_tranches_table + + String output_file_base_name + File? gatk_override + + File dbsnp_vcf + File dbsnp_vcf_index + + Array[String] snp_recalibration_tranche_values + Array[String] snp_recalibration_annotation_values + Array[String] indel_recalibration_tranche_values + Array[String] indel_recalibration_annotation_values + + File hapmap_resource_vcf + File hapmap_resource_vcf_index + File omni_resource_vcf + File omni_resource_vcf_index + File one_thousand_genomes_resource_vcf + File one_thousand_genomes_resource_vcf_index + File mills_resource_vcf + File mills_resource_vcf_index + File axiomPoly_resource_vcf + File axiomPoly_resource_vcf_index + File dbsnp_resource_vcf = dbsnp_vcf + File dbsnp_resource_vcf_index = dbsnp_vcf_index + + # Runtime attributes + Int? small_disk_override + Int small_disk = select_first([small_disk_override, "100"]) + Int? medium_disk_override + Int medium_disk = select_first([medium_disk_override, "200"]) + Int? large_disk_override + Int large_disk = select_first([large_disk_override, "300"]) + Int? huge_disk_override + Int huge_disk = select_first([huge_disk_override, "400"]) + + String? 
+
+        File reference
+        File reference_index
+        File reference_dict
+
+        String fq_sample_table
+        String fq_alt_allele_table
+        String query_project
+
+        String filter_set_name
+        String fq_filter_set_info_table
+        String fq_filter_set_tranches_table
+
+        String output_file_base_name
+        File? gatk_override
+
+        File dbsnp_vcf
+        File dbsnp_vcf_index
+
+        Array[String] snp_recalibration_tranche_values
+        Array[String] snp_recalibration_annotation_values
+        Array[String] indel_recalibration_tranche_values
+        Array[String] indel_recalibration_annotation_values
+
+        File hapmap_resource_vcf
+        File hapmap_resource_vcf_index
+        File omni_resource_vcf
+        File omni_resource_vcf_index
+        File one_thousand_genomes_resource_vcf
+        File one_thousand_genomes_resource_vcf_index
+        File mills_resource_vcf
+        File mills_resource_vcf_index
+        File axiomPoly_resource_vcf
+        File axiomPoly_resource_vcf_index
+        File dbsnp_resource_vcf = dbsnp_vcf
+        File dbsnp_resource_vcf_index = dbsnp_vcf_index
+
+        # Runtime attributes
+        Int? small_disk_override
+        Int small_disk = select_first([small_disk_override, 100])
+        Int? medium_disk_override
+        Int medium_disk = select_first([medium_disk_override, 200])
+        Int? large_disk_override
+        Int large_disk = select_first([large_disk_override, 300])
+        Int? huge_disk_override
+        Int huge_disk = select_first([huge_disk_override, 400])
+
+        Int? preemptible_tries_override
+        Int preemptible_tries = select_first([preemptible_tries_override, 3])
+    }
+
+    scatter(i in range(max_chrom_id)) {
+        call ExtractFilterTask {
+            input:
+                gatk_override = gatk_override,
+                reference = reference,
+                reference_index = reference_index,
+                reference_dict = reference_dict,
+                fq_sample_table = fq_sample_table,
+                chrom_offset = chrom_offset,
+                chrom_id = i + 1,
+                fq_alt_allele_table = fq_alt_allele_table,
+                read_project_id = query_project,
+                output_file = "${output_file_base_name}_${i}.vcf.gz"
+        }
+    }
+
+    call MergeVCFs {
+        input:
+            input_vcfs = ExtractFilterTask.output_vcf,
+            input_vcfs_indexes = ExtractFilterTask.output_vcf_index,
+            output_vcf_name = "${output_file_base_name}.vcf.gz",
+            preemptible_tries = preemptible_tries
+    }
+
+    call IndelsVariantRecalibrator {
+        input:
+            sites_only_variant_filtered_vcf = MergeVCFs.output_vcf,
+            sites_only_variant_filtered_vcf_index = MergeVCFs.output_vcf_index,
+            recalibration_filename = filter_set_name + ".indels.recal",
+            tranches_filename = filter_set_name + ".indels.tranches",
+            recalibration_tranche_values = indel_recalibration_tranche_values,
+            recalibration_annotation_values = indel_recalibration_annotation_values,
+            mills_resource_vcf = mills_resource_vcf,
+            mills_resource_vcf_index = mills_resource_vcf_index,
+            axiomPoly_resource_vcf = axiomPoly_resource_vcf,
+            axiomPoly_resource_vcf_index = axiomPoly_resource_vcf_index,
+            dbsnp_resource_vcf = dbsnp_resource_vcf,
+            dbsnp_resource_vcf_index = dbsnp_resource_vcf_index,
+            use_allele_specific_annotations = true,
+            disk_size = large_disk
+    }
+
+    call SNPsVariantRecalibrator {
+        input:
+            sites_only_variant_filtered_vcf = MergeVCFs.output_vcf,
+            sites_only_variant_filtered_vcf_index = MergeVCFs.output_vcf_index,
+            recalibration_filename = filter_set_name + ".snps.recal",
+            tranches_filename = filter_set_name + ".snps.tranches",
+            recalibration_tranche_values = snp_recalibration_tranche_values,
+            recalibration_annotation_values = snp_recalibration_annotation_values,
+            hapmap_resource_vcf = hapmap_resource_vcf,
+            hapmap_resource_vcf_index = hapmap_resource_vcf_index,
+            omni_resource_vcf = omni_resource_vcf,
+            omni_resource_vcf_index = omni_resource_vcf_index,
+            one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf,
+            one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index,
+            dbsnp_resource_vcf = dbsnp_resource_vcf,
+            dbsnp_resource_vcf_index = dbsnp_resource_vcf_index,
+            use_allele_specific_annotations = true,
+            disk_size = large_disk
+    }
+
+    call UploadFilterSetToBQ {
+        input:
+            gatk_override = gatk_override,
+            filter_set_name = filter_set_name,
+            snp_recal_file = SNPsVariantRecalibrator.recalibration,
+            snp_recal_file_index = SNPsVariantRecalibrator.recalibration_index,
+            snp_recal_tranches = SNPsVariantRecalibrator.tranches,
+
+            indel_recal_file = IndelsVariantRecalibrator.recalibration,
+            indel_recal_file_index = IndelsVariantRecalibrator.recalibration_index,
+            indel_recal_tranches = IndelsVariantRecalibrator.tranches,
+
+            fq_info_destination_table = fq_filter_set_info_table,
+            fq_tranches_destination_table = fq_filter_set_tranches_table
+    }
+
+    output {
+        File output_vcf = MergeVCFs.output_vcf
+        File output_vcf_idx = MergeVCFs.output_vcf_index
+    }
+}
+
+################################################################################
+task ExtractFilterTask {
+    # TODO: mark this task volatile so it is never call cached?
+    #meta {
+    #    volatile: true
+    #}
+
+    input {
+        # ------------------------------------------------
+        # Input args:
+        File reference
+        File reference_index
+        File reference_dict
+
+        String fq_sample_table
+
+        # Passed as a String because of the Cromwell large-integer bug
+        # (see the workflow-level comment).
+        # https://github.com/broadinstitute/cromwell/issues/2685
+        String chrom_offset
+        Int chrom_id
+
+        String fq_alt_allele_table
+        String read_project_id
+        String output_file
+
+        # Runtime Options:
+        File? gatk_override
+
+        Int? local_sort_max_records_in_ram = 1000000
+    }
+
+    # ------------------------------------------------
+    # Run our command:
+    command <<<
+        set -e
+        export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override}
+
+        df -h
+        min_location=$(echo "~{chrom_id} * ~{chrom_offset}" | bc)
+        max_location=$(echo "( ~{chrom_id} + 1 ) * ~{chrom_offset}" | bc)
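+        # Worked example (assuming the packed-location scheme described in
+        # the workflow): for chrom_id=1 this gives min_location=1000000000000
+        # and max_location=2000000000000, i.e. every location on chromosome 1.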
+
+        gatk --java-options "-Xmx4g" \
+            ExtractFeatures \
+            --ref-version 38 \
+            -R "~{reference}" \
+            -O "~{output_file}" \
+            --local-sort-max-records-in-ram ~{local_sort_max_records_in_ram} \
+            --sample-table ~{fq_sample_table} \
+            --alt-allele-table ~{fq_alt_allele_table} \
+            --min-location ${min_location} --max-location ${max_location} \
+            --project-id ~{read_project_id}
+    >>>
+
+    # ------------------------------------------------
+    # Runtime settings:
+    runtime {
+        docker: "us.gcr.io/broad-dsde-methods/broad-gatk-snapshots:varstore_d8a72b825eab2d979c8877448c0ca948fd9b34c7_change_to_hwe"
+        memory: "7 GB"
+        disks: "local-disk 10 HDD"
+        bootDiskSizeGb: 15
+        preemptible: 3
+        cpu: 2
+    }
+
+    # ------------------------------------------------
+    # Outputs:
+    output {
+        File output_vcf = "~{output_file}"
+        File output_vcf_index = "~{output_file}.tbi"
+    }
+}
+
+task UploadFilterSetToBQ {
+    # TODO: mark this task volatile so it is never call cached?
+    #meta {
+    #    volatile: true
+    #}
+
+    input {
+        # ------------------------------------------------
+        # Input args:
+        String filter_set_name
+
+        File snp_recal_file
+        File snp_recal_file_index
+        File snp_recal_tranches
+
+        File indel_recal_file
+        File indel_recal_file_index
+        File indel_recal_tranches
+
+        String fq_info_destination_table
+        String fq_tranches_destination_table
+
+        # Runtime Options:
+        File? gatk_override
+    }
+
+    # ------------------------------------------------
+    # Run our command:
+    command <<<
+        set -e
+        set -o pipefail
+
+        export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override}
+
+        gatk --java-options "-Xmx4g" \
+            CreateFilteringFiles \
+            --ref-version 38 \
+            --filter-set-name ~{filter_set_name} \
+            -mode SNP \
+            -V ~{snp_recal_file} \
+            -O ~{filter_set_name}.snps.recal.tsv
+
+        gatk --java-options "-Xmx4g" \
+            CreateFilteringFiles \
+            --ref-version 38 \
+            --filter-set-name ~{filter_set_name} \
+            -mode INDEL \
+            -V ~{indel_recal_file} \
+            -O ~{filter_set_name}.indels.recal.tsv
+
+        # Merge the SNP and INDEL recalibration rows into a single load file,
+        # dropping the header row and any "#" comment lines.
+        echo "Merging SNP + INDELs"
+        cat ~{filter_set_name}.snps.recal.tsv ~{filter_set_name}.indels.recal.tsv | grep -v filter_set_name | grep -v "#" > filter_set_load.tsv
+
+        # bq load wants a ":" instead of a "." after the project name
+        bq_info_table=$(echo ~{fq_info_destination_table} | sed s/\\./:/)
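+        # e.g. "spec-ops-aou.kc_acmg.filter_set_info" -> "spec-ops-aou:kc_acmg.filter_set_info"
+        # (sed without the /g flag replaces only the first "." it finds)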
+
+        echo "Loading Filter Set into BQ"
+        bq load --skip_leading_rows 0 -F "tab" \
+            ${bq_info_table} \
+            filter_set_load.tsv \
+            "filter_set_name:string,type:string,location:integer,ref:string,alt:string,vqslod:float,culprit:string,training_label:string,yng_status:string"
+
+        # Prepend the filter set name as the first CSV column of every tranche
+        # row, again dropping the header and comment lines.
+        echo "Merging Tranches"
+        cat ~{snp_recal_tranches} ~{indel_recal_tranches} | grep -v targetTruthSensitivity | grep -v "#" | awk -v CALLSET=~{filter_set_name} '{ print CALLSET "," $0 }' > tranches_load.csv
+
+        # bq load wants a ":" instead of a "." after the project name
+        bq_tranches_table=$(echo ~{fq_tranches_destination_table} | sed s/\\./:/)
+
+        echo "Loading Tranches into BQ"
+        bq load --skip_leading_rows 0 -F "," \
+            ${bq_tranches_table} \
+            tranches_load.csv \
+            "filter_set_name:string,target_truth_sensitivity:float,num_known:integer,num_novel:integer,known_ti_tv:float,novel_ti_tv:float,min_vqslod:float,filter_name:string,model:string,accessible_truth_sites:integer,calls_at_truth_sites:integer,truth_sensitivity:float"
+    >>>
+
+    # ------------------------------------------------
+    # Runtime settings:
+    runtime {
+        docker: "us.gcr.io/broad-dsde-methods/broad-gatk-snapshots:varstore_d8a72b825eab2d979c8877448c0ca948fd9b34c7_change_to_hwe"
+        memory: "7 GB"
+        disks: "local-disk 200 HDD"
+        bootDiskSizeGb: 15
+        preemptible: 3
+        cpu: 2
+    }
+
+    # ------------------------------------------------
+    # Outputs:
+    output {
+        File filter_set_load_tsv = "filter_set_load.tsv"
+        File filter_tranche_load_tsv = "tranches_load.csv"
+    }
+}
+
+task MergeVCFs {
+    # TODO: mark this task volatile so it is never call cached?
+    #meta {
+    #    volatile: true
+    #}
+
+    input {
+        Array[File] input_vcfs
+        Array[File] input_vcfs_indexes
+        String output_vcf_name
+        Int preemptible_tries
+    }
+
+    # 2.5x the input size leaves room for the merged output plus its index.
+    Int disk_size = ceil(size(input_vcfs, "GiB") * 2.5) + 10
+
+    # Using MergeVcfs instead of GatherVcfs so we can create indices.
+    # See https://github.com/broadinstitute/picard/issues/789 for the relevant GatherVcfs ticket.
+    command {
+        java -Xms2000m -jar /usr/gitc/picard.jar \
+            MergeVcfs \
+            INPUT=~{sep=' INPUT=' input_vcfs} \
+            OUTPUT=~{output_vcf_name}
+    }
+    runtime {
+        docker: "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.4.1-1540490856"
+        preemptible: preemptible_tries
+        memory: "3 GiB"
+        disks: "local-disk ~{disk_size} HDD"
+    }
+    output {
+        File output_vcf = "~{output_vcf_name}"
+        File output_vcf_index = "~{output_vcf_name}.tbi"
+    }
+}
+
+task IndelsVariantRecalibrator {
+    input {
+        String recalibration_filename
+        String tranches_filename
+
+        Array[String] recalibration_tranche_values
+        Array[String] recalibration_annotation_values
+
+        File sites_only_variant_filtered_vcf
+        File sites_only_variant_filtered_vcf_index
+
+        File mills_resource_vcf
+        File axiomPoly_resource_vcf
+        File dbsnp_resource_vcf
+        File mills_resource_vcf_index
+        File axiomPoly_resource_vcf_index
+        File dbsnp_resource_vcf_index
+        Boolean use_allele_specific_annotations
+        Int max_gaussians = 4
+
+        Int disk_size
+        String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.1.9.0"
+    }
+
+    command <<<
+        set -euo pipefail
+
+        gatk --java-options -Xms24g \
+            VariantRecalibrator \
+            -V ~{sites_only_variant_filtered_vcf} \
+            -O ~{recalibration_filename} \
+            --tranches-file ~{tranches_filename} \
+            --trust-all-polymorphic \
+            -tranche ~{sep=' -tranche ' recalibration_tranche_values} \
+            -an ~{sep=' -an ' recalibration_annotation_values} \
+            ~{true='--use-allele-specific-annotations' false='' use_allele_specific_annotations} \
+            -mode INDEL \
+            --max-gaussians ~{max_gaussians} \
+            -resource:mills,known=false,training=true,truth=true,prior=12 ~{mills_resource_vcf} \
+            -resource:axiomPoly,known=false,training=true,truth=false,prior=10 ~{axiomPoly_resource_vcf} \
+            -resource:dbsnp,known=true,training=false,truth=false,prior=2 ~{dbsnp_resource_vcf}
+    >>>
+
+    runtime {
+        memory: "26 GiB"
+        cpu: "2"
+        disks: "local-disk " + disk_size + " HDD"
+        preemptible: 1
+        docker: gatk_docker
+    }
+
+    output {
+        File recalibration = "~{recalibration_filename}"
+        File recalibration_index = "~{recalibration_filename}.idx"
+        File tranches = "~{tranches_filename}"
+    }
+}
+
+task SNPsVariantRecalibrator {
+    input {
+        String recalibration_filename
+        String tranches_filename
+        File? model_report
+
+        Array[String] recalibration_tranche_values
+        Array[String] recalibration_annotation_values
+
+        File sites_only_variant_filtered_vcf
+        File sites_only_variant_filtered_vcf_index
+
+        File hapmap_resource_vcf
+        File omni_resource_vcf
+        File one_thousand_genomes_resource_vcf
+        File dbsnp_resource_vcf
+        File hapmap_resource_vcf_index
+        File omni_resource_vcf_index
+        File one_thousand_genomes_resource_vcf_index
+        File dbsnp_resource_vcf_index
+        Boolean use_allele_specific_annotations
+        Int max_gaussians = 6
+
+        Int disk_size
+        String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.1.9.0"
+        Int? machine_mem_gb
+    }
+
+    Int auto_mem = ceil(2 * size([sites_only_variant_filtered_vcf,
+                                  hapmap_resource_vcf,
+                                  omni_resource_vcf,
+                                  one_thousand_genomes_resource_vcf,
+                                  dbsnp_resource_vcf],
+                                 "GiB"))
+    Int machine_mem = select_first([machine_mem_gb, if auto_mem < 7 then 7 else auto_mem])
+    Int java_mem = machine_mem - 1
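+    # Sizing heuristic as written above: roughly twice the total input size,
+    # floored at 7 GiB, with 1 GiB of machine memory held back from the Java
+    # heap for non-heap overhead.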
+
+    # If a pre-built model is supplied, apply it and emit per-shard tranches
+    # for later gathering instead of training a new model.
+    String model_report_arg = if defined(model_report) then "--input-model $MODEL_REPORT --output-tranches-for-scatter" else ""
+
+    command <<<
+        set -euo pipefail
+
+        MODEL_REPORT=~{model_report}
+
+        gatk --java-options -Xms~{java_mem}g \
+            VariantRecalibrator \
+            -V ~{sites_only_variant_filtered_vcf} \
+            -O ~{recalibration_filename} \
+            --tranches-file ~{tranches_filename} \
+            --trust-all-polymorphic \
+            -tranche ~{sep=' -tranche ' recalibration_tranche_values} \
+            -an ~{sep=' -an ' recalibration_annotation_values} \
+            ~{true='--use-allele-specific-annotations' false='' use_allele_specific_annotations} \
+            -mode SNP \
+            ~{model_report_arg} \
+            --max-gaussians ~{max_gaussians} \
+            -resource:hapmap,known=false,training=true,truth=true,prior=15 ~{hapmap_resource_vcf} \
+            -resource:omni,known=false,training=true,truth=true,prior=12 ~{omni_resource_vcf} \
+            -resource:1000G,known=false,training=true,truth=false,prior=10 ~{one_thousand_genomes_resource_vcf} \
+            -resource:dbsnp,known=true,training=false,truth=false,prior=7 ~{dbsnp_resource_vcf}
+    >>>
+
+    runtime {
+        memory: "~{machine_mem} GiB"
+        cpu: 2
+        disks: "local-disk " + disk_size + " HDD"
+        preemptible: 1
+        docker: gatk_docker
+    }
+
+    output {
+        File recalibration = "~{recalibration_filename}"
+        File recalibration_index = "~{recalibration_filename}.idx"
+        File tranches = "~{tranches_filename}"
+    }
+}