Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

VS-430 Remove YNG status from vds #8861

Merged
merged 22 commits into from
Jun 6, 2024
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .dockstore.yml
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ workflows:
branches:
- master
- ah_var_store
- gg_VS-430_RemoveYNGStatus_From_VDS
tags:
- /.*/
- name: GvsExtractCallset
Expand Down Expand Up @@ -298,6 +299,7 @@ workflows:
branches:
- master
- ah_var_store
- gg_VS-430_RemoveYNGStatus_From_VDS
tags:
- /.*/
- name: GvsQuickstartIntegration
Expand Down
1 change: 0 additions & 1 deletion scripts/variantstore/wdl/GvsCreateVDS.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ version 1.0
# This WDL will create a VDS in Hail running in a Dataproc cluster.
import "GvsUtils.wdl" as Utils


workflow GvsCreateVDS {
input {
String avro_path
Expand Down
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsUtils.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ task GetToolVersions {
# GVS generally uses the smallest `alpine` version of the Google Cloud SDK as it suffices for most tasks, but
# there are a handful of tasks that require the larger GNU libc-based `slim`.
String cloud_sdk_slim_docker = "gcr.io/google.com/cloudsdktool/cloud-sdk:435.0.0-slim"
String variants_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/variants:2024-04-22-alpine-32804b134a75"
String variants_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/variants:2024-06-04-alpine-1dd549733af9"
String variants_nirvana_docker = "us.gcr.io/broad-dsde-methods/variantstore:nirvana_2022_10_19"
String gatk_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/gatk:2024_05_23-gatkbase-28d855745d35"
String real_time_genomics_docker = "docker.io/realtimegenomics/rtg-tools:latest"
Expand Down
24 changes: 16 additions & 8 deletions scripts/variantstore/wdl/extract/import_gvs.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,7 +336,11 @@ def convert_array_with_id_keys_to_dense_array(arr, ids, drop=[]):
vd = vd.annotate_rows(filters=hl.coalesce(site[vd.locus].filters, hl.empty_set(hl.tstr)))

# vqsr ref/alt come in normalized individually, so need to renormalize to the dataset ref allele
# For Method 1 - *THIS* is the version ('as_vqsr') that ultimately will be kept - does NOT contain 'yng_status'
vd = vd.annotate_rows(as_vqsr = hl.dict(vqsr.index(vd.locus, all_matches=True)
.map(lambda record: (record.alt + vd.alleles[0][hl.len(record.ref):], record.drop('ref', 'alt', 'yng_status')))))
# Method 2: Add a temporary 'as_vqsr_dup' that still carries 'yng_status'; the GT filtering below is calculated from it, and it is ultimately dropped
vd = vd.annotate_rows(as_vqsr_dup = hl.dict(vqsr.index(vd.locus, all_matches=True)
.map(lambda record: (record.alt + vd.alleles[0][hl.len(record.ref):], record.drop('ref', 'alt')))))

if use_classic_vqsr:
Expand All @@ -358,31 +362,35 @@ def convert_array_with_id_keys_to_dense_array(arr, ids, drop=[]):
is_snp = vd.alleles[1:].map(lambda alt: hl.is_snp(vd.alleles[0], alt))
vd = vd.annotate_rows(
allele_NO=vd.alleles[1:].map(
lambda allele: hl.coalesce(vd.as_vqsr.get(allele).yng_status == 'N', False)),
lambda allele: hl.coalesce(vd.as_vqsr_dup.get(allele).yng_status == 'N', False)),
allele_YES=vd.alleles[1:].map(
lambda allele: hl.coalesce(vd.as_vqsr.get(allele).yng_status == 'Y', True)),
lambda allele: hl.coalesce(vd.as_vqsr_dup.get(allele).yng_status == 'Y', True)),
gbggrant marked this conversation as resolved.
Show resolved Hide resolved
allele_is_snp=is_snp,
allele_OK=hl._zip_func(is_snp, vd.alleles[1:],
f=lambda is_snp, alt:
hl.coalesce(vd.as_vqsr.get(alt).vqslod >=
hl.coalesce(vd.as_vqsr_dup.get(alt).vqslod >=
hl.if_else(is_snp, vd.snp_vqslod_threshold, vd.indel_vqslod_threshold),
True))
True)),
as_vqsr_dup=vd.alleles[1:].map(
lambda allele: vd.as_vqsr_dup.get(allele).drop('yng_status'))
)
else:
vd = vd.annotate_globals(truth_sensitivity_snp_threshold=truth_sensitivity_snp_threshold,
truth_sensitivity_indel_threshold=truth_sensitivity_indel_threshold)
is_snp = vd.alleles[1:].map(lambda alt: hl.is_snp(vd.alleles[0], alt))
vd = vd.annotate_rows(
allele_NO=vd.alleles[1:].map(
lambda allele: hl.coalesce(vd.as_vqsr.get(allele).yng_status == 'N', False)),
lambda allele: hl.coalesce(vd.as_vqsr_dup.get(allele).yng_status == 'N', False)),
allele_YES=vd.alleles[1:].map(
lambda allele: hl.coalesce(vd.as_vqsr.get(allele).yng_status == 'Y', True)),
lambda allele: hl.coalesce(vd.as_vqsr_dup.get(allele).yng_status == 'Y', True)),
allele_is_snp=is_snp,
allele_OK=hl._zip_func(is_snp, vd.alleles[1:],
f=lambda is_snp, alt:
hl.coalesce(vd.as_vqsr.get(alt).calibration_sensitivity <=
hl.coalesce(vd.as_vqsr_dup.get(alt).calibration_sensitivity <=
hl.if_else(is_snp, vd.truth_sensitivity_snp_threshold, vd.truth_sensitivity_indel_threshold),
True))
True)),
as_vqsr_dup=vd.alleles[1:].map(
lambda allele: vd.as_vqsr_dup.get(allele).drop('yng_status'))
)

lgt = vd.LGT
Expand Down
Loading