From 49c99718fe646ff1a302a9c677206a7a3fac9bbf Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Thu, 16 Sep 2021 14:34:23 -0400 Subject: [PATCH 01/26] clean up --- scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl index 2adccbb9da1..cc86a966f93 100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl +++ b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl @@ -94,8 +94,8 @@ workflow GvsSitesOnlyVCF { service_account_json_path = service_account_json_path, load_jsons_done = BigQueryLoadJson.done } - scatter(i in range(length(contig_array)) ) { + scatter(i in range(length(contig_array)) ) { call BigQueryExportVat { input: contig = contig_array[i], From 9ff496bf932c62fc307e7ab82f15a021fc581672 Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Wed, 29 Sep 2021 11:10:51 -0400 Subject: [PATCH 02/26] is this step dropping rows --- scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl index cc86a966f93..f6881d99438 100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl +++ b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl @@ -250,7 +250,6 @@ task ExtractAnAcAfFromVCF { ### for validation of the pipeline wc -l ~{custom_annotations_file_name} | awk '{print $1}' > count.txt - # Should this be where we do the filtering of the AC/AN/AF values rather than in the python? ### compress the vcf and index it bcftools view -I deduplicated.vcf -Oz -o ~{normalized_vcf_compressed} @@ -291,6 +290,8 @@ task SitesOnlyVcf { # Adding `--add-output-vcf-command-line false` so that the VCF header doesn't have a timestamp # in it so that downstream steps can call cache + wc -l ~{input_vcf} + gatk --java-options "-Xmx12288m" \ SelectVariants \ -V ~{input_vcf} \ @@ -299,7 +300,10 @@ task SitesOnlyVcf { --sites-only-vcf-output \ -O ~{output_filename} - >>> + wc -l ~{output_filename} + + + >>> # ------------------------------------------------ # Runtime settings: runtime { From d845e0176958c334945d5ca320e90861d014321d Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Wed, 29 Sep 2021 11:28:30 -0400 Subject: [PATCH 03/26] add filtering on the FT tag level --- scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl index f6881d99438..c4c1c5e0997 100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl +++ b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl @@ -231,9 +231,11 @@ task ExtractAnAcAfFromVCF { #] + ## normalize and filter out the filtered sites and FT filtered out variants bcftools norm -m-any ~{local_input_vcf} | \ - bcftools norm --check-ref w -f Homo_sapiens_assembly38.fasta > ~{normalized_vcf} - + bcftools norm --check-ref w -f Homo_sapiens_assembly38.fasta | \ + bcftools view -f 'PASS' | \ + bcftools filter -i "FORMAT/FT='PASS'" --set-GTs . > ~{normalized_vcf} ## make a file of just the first 4 columns of the tsv (maybe I could just bcftools query it?) 
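## (each row below is CHROM / POS / REF / ALT, so duplicate rows in this file mean the same allele was emitted more than once after splitting multi-allelics)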
bcftools query ~{normalized_vcf} -f '%CHROM\t%POS\t%REF\t%ALT\n' > check_duplicates.tsv From e8646e468103d8b1fb4fbe3c6c9b47b8ea19fe64 Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Wed, 29 Sep 2021 15:13:46 -0400 Subject: [PATCH 04/26] fixup on the file length --- scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl index c4c1c5e0997..b86a76aff3d 100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl +++ b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl @@ -251,7 +251,7 @@ task ExtractAnAcAfFromVCF { | grep -v "*" >> ~{custom_annotations_file_name} ### for validation of the pipeline - wc -l ~{custom_annotations_file_name} | awk '{print $1}' > count.txt + wc -l ~{custom_annotations_file_name} | awk '{print $1 -7}' > count.txt ### compress the vcf and index it bcftools view -I deduplicated.vcf -Oz -o ~{normalized_vcf_compressed} @@ -292,8 +292,6 @@ task SitesOnlyVcf { # Adding `--add-output-vcf-command-line false` so that the VCF header doesn't have a timestamp # in it so that downstream steps can call cache - wc -l ~{input_vcf} - gatk --java-options "-Xmx12288m" \ SelectVariants \ -V ~{input_vcf} \ @@ -302,9 +300,6 @@ task SitesOnlyVcf { --sites-only-vcf-output \ -O ~{output_filename} - wc -l ~{output_filename} - - >>> # ------------------------------------------------ # Runtime settings: From 46b81e14fc0a3c1dbccf032664b258cb0baf66a1 Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Wed, 29 Sep 2021 15:24:03 -0400 Subject: [PATCH 05/26] remove duplicates by 4 cols, not only 2 --- scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl index b86a76aff3d..51f524f2792 100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl +++ b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl @@ -238,13 +238,15 @@ task ExtractAnAcAfFromVCF { bcftools filter -i "FORMAT/FT='PASS'" --set-GTs . > ~{normalized_vcf} ## make a file of just the first 4 columns of the tsv (maybe I could just bcftools query it?) 
- bcftools query ~{normalized_vcf} -f '%CHROM\t%POS\t%REF\t%ALT\n' > check_duplicates.tsv + bcftools query ~{normalized_vcf} -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\n' > check_duplicates.tsv ## check it for duplicates and put them in a new file - sort check_duplicates.tsv | uniq -d | cut -f1,2 > duplicates.tsv - ## remove those rows (this will be ALL rows with this position--so good rows too, potentially we want to grab f1,4 to do this with) + sort check_duplicates.tsv | uniq -d | cut -f1,2,3,4,5 > duplicates.tsv + ## remove those rows (that match up to the first 5 cols) grep -v -wFf duplicates.tsv ~{normalized_vcf} | grep -v "AC=0;" > deduplicated.vcf wc -l duplicates.tsv + echo "the following variants will be dropped due to a bug" + cat duplicates.tsv bcftools plugin fill-tags -- deduplicated.vcf -S ~{subpopulation_sample_list} -t AC,AF,AN,AC_het,AC_hom | bcftools query -f \ '%CHROM\t%POS\t%REF\t%ALT\t%AC\t%AN\t%AF\t%AC_Hom\t%AC_Het\t%AC_afr\t%AN_afr\t%AF_afr\t%AC_Hom_afr\t%AC_Het_afr\t%AC_amr\t%AN_amr\t%AF_amr\t%AC_Hom_amr\t%AC_Het_amr\t%AC_eas\t%AN_eas\t%AF_eas\t%AC_Hom_eas\t%AC_Het_eas\t%AC_eur\t%AN_eur\t%AF_eur\t%AC_Hom_eur\t%AC_Het_eur\t%AC_mid\t%AN_mid\t%AF_mid\t%AC_Hom_mid\t%AC_Het_mid\t%AC_oth\t%AN_oth\t%AF_oth\t%AC_Hom_oth\t%AC_Het_oth\t%AC_sas\t%AN_sas\t%AF_sas\t%AC_Hom_sas\t%AC_Het_sas\n' \ From 59b6cb98da656a373599d6a65dc543c69ad2a793 Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Wed, 29 Sep 2021 22:45:42 -0400 Subject: [PATCH 06/26] cleanup --- scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl index 51f524f2792..c00d020bfc7 100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl +++ b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl @@ -231,13 +231,13 @@ task ExtractAnAcAfFromVCF { #] - ## normalize and filter out the filtered sites and FT filtered out variants + ## normalize and filter out the filtered sites and FT filtered out variants ## do we need: bcftools norm -m-any ~{local_input_vcf} | \ bcftools norm --check-ref w -f Homo_sapiens_assembly38.fasta | \ - bcftools view -f 'PASS' | \ + bcftools view -f 'PASS,.' | \ bcftools filter -i "FORMAT/FT='PASS'" --set-GTs . > ~{normalized_vcf} - ## make a file of just the first 4 columns of the tsv (maybe I could just bcftools query it?) + ## make a file of just the first 5 columns of the tsv bcftools query ~{normalized_vcf} -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\n' > check_duplicates.tsv ## check it for duplicates and put them in a new file sort check_duplicates.tsv | uniq -d | cut -f1,2,3,4,5 > duplicates.tsv @@ -245,7 +245,7 @@ task ExtractAnAcAfFromVCF { grep -v -wFf duplicates.tsv ~{normalized_vcf} | grep -v "AC=0;" > deduplicated.vcf wc -l duplicates.tsv - echo "the following variants will be dropped due to a bug" + echo "the following duplicate variants will be dropped due to a bug" cat duplicates.tsv bcftools plugin fill-tags -- deduplicated.vcf -S ~{subpopulation_sample_list} -t AC,AF,AN,AC_het,AC_hom | bcftools query -f \ From 1e358aaf5d3396882a9a73cb9b0d340328e04ec4 Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Thu, 30 Sep 2021 00:24:55 -0400 Subject: [PATCH 07/26] does this cleanly remove sites only? 
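This removes the separate GATK SelectVariants-based SitesOnlyVcf task; the
sites-only VCF is now produced at the end of ExtractAnAcAfFromVCF instead.
A minimal sketch of the bcftools replacement, assuming the deduplicated.vcf
written by the earlier steps of that task:

    # drop all sample columns without recalculating INFO tags, then index
    bcftools view --no-update --drop-genotypes deduplicated.vcf -Oz -o normalized.vcf.gz
    bcftools index --tbi normalized.vcf.gz

--drop-genotypes keeps only the first eight VCF columns, which is all the
downstream Nirvana annotation step needs.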
--- scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl | 60 ++------------------ 1 file changed, 6 insertions(+), 54 deletions(-) diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl index c00d020bfc7..25bdccb0b5a 100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl +++ b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl @@ -34,6 +34,8 @@ workflow GvsSitesOnlyVCF { ## Scatter across the shards from the GVS jointVCF scatter(i in range(length(MakeSubpopulationFiles.input_vcfs)) ) { + ## Create a sites-only VCF from the original GVS jointVCF + ## Calculate AC/AN/AF for subpopulations and extract them for custom annotations call ExtractAnAcAfFromVCF { input: input_vcf = MakeSubpopulationFiles.input_vcfs[i], @@ -44,20 +46,11 @@ workflow GvsSitesOnlyVCF { ref = reference } - ## Create a sites-only VCF from the original GVS jointVCF - call SitesOnlyVcf { - input: - input_vcf = ExtractAnAcAfFromVCF.output_vcf, - input_vcf_index = ExtractAnAcAfFromVCF.output_vcf_index, - service_account_json_path = service_account_json_path, - output_filename = "${output_sites_only_file_name}_${i}.sites_only.vcf.gz" - } - ## Use Nirvana to annotate the sites-only VCF and include the AC/AN/AF calculations as custom annotations call AnnotateVCF { input: - input_vcf = SitesOnlyVcf.output_vcf, - input_vcf_index = SitesOnlyVcf.output_vcf_idx, + input_vcf = ExtractAnAcAfFromVCF.output_vcf, + input_vcf_index = ExtractAnAcAfFromVCF.output_vcf_index, output_annotated_file_name = "${output_annotated_file_name}_${i}", nirvana_data_tar = nirvana_data_directory, custom_annotations_file = ExtractAnAcAfFromVCF.annotations_file, @@ -256,7 +249,7 @@ task ExtractAnAcAfFromVCF { wc -l ~{custom_annotations_file_name} | awk '{print $1 -7}' > count.txt ### compress the vcf and index it - bcftools view -I deduplicated.vcf -Oz -o ~{normalized_vcf_compressed} + bcftools view --no-update --drop-genotypes deduplicated.vcf -Oz -o ~{normalized_vcf_compressed} bcftools index --tbi ~{normalized_vcf_compressed} >>> @@ -264,7 +257,7 @@ task ExtractAnAcAfFromVCF { # Runtime settings: runtime { docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_20210916" - memory: "20 GB" + memory: "32 GB" preemptible: 1 cpu: "2" disks: "local-disk 500 SSD" @@ -279,47 +272,6 @@ task ExtractAnAcAfFromVCF { } } -task SitesOnlyVcf { - input { - File input_vcf - File input_vcf_index - String? 
service_account_json_path - String output_filename - } - String output_vcf_idx = basename(output_filename) + ".tbi" - - command <<< - set -e - - # Adding `--add-output-vcf-command-line false` so that the VCF header doesn't have a timestamp - # in it so that downstream steps can call cache - - gatk --java-options "-Xmx12288m" \ - SelectVariants \ - -V ~{input_vcf} \ - --add-output-vcf-command-line false \ - --exclude-filtered \ - --sites-only-vcf-output \ - -O ~{output_filename} - - >>> - # ------------------------------------------------ - # Runtime settings: - runtime { - docker: "broadinstitute/gatk:4.2.0.0" - memory: "15 GB" - preemptible: 3 - cpu: "2" - disks: "local-disk 250 HDD" - } - # ------------------------------------------------ - # Outputs: - output { - File output_vcf="~{output_filename}" - File output_vcf_idx="~{output_vcf_idx}" - } -} - task AnnotateVCF { input { File input_vcf From d168d2733013a7fb24a160e5746ec86ba05b7c42 Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Thu, 30 Sep 2021 00:53:46 -0400 Subject: [PATCH 08/26] add to dockstore for testing --- .dockstore.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.dockstore.yml b/.dockstore.yml index a7c31ff7225..5c91300c877 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -128,6 +128,7 @@ workflows: branches: - master - ah_var_store + - rc-remove-sites-only-step - name: GvsValidateVat subclass: WDL primaryDescriptorPath: /scripts/variantstore/variant_annotations_table/GvsValidateVAT.wdl From e9f413c6f259c0825054a243ad0f98b757883df7 Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Thu, 30 Sep 2021 00:55:10 -0400 Subject: [PATCH 09/26] lil cheaper --- scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl index 25bdccb0b5a..5307485d783 100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl +++ b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl @@ -258,7 +258,7 @@ task ExtractAnAcAfFromVCF { runtime { docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_20210916" memory: "32 GB" - preemptible: 1 + preemptible: 3 cpu: "2" disks: "local-disk 500 SSD" } From f93b896a12cb3c7497a78f4cdd2807a7be715836 Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Thu, 30 Sep 2021 14:54:08 -0400 Subject: [PATCH 10/26] testing the normalized vcfs --- scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl index 5307485d783..f87f45eeb2d 100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl +++ b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl @@ -269,6 +269,7 @@ task ExtractAnAcAfFromVCF { Int count_variants = read_int("count.txt") File output_vcf = "~{normalized_vcf_compressed}" File output_vcf_index = "~{normalized_vcf_indexed}" + File testing_normalized = "~{normalized_vcf}" } } From 55fa30590f64fe58981d7bd7706447d9fb96d5b5 Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Thu, 30 Sep 2021 15:35:59 -0400 Subject: [PATCH 11/26] make annotations machine bigger --- scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl index f87f45eeb2d..9a2a390ce62 
100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl +++ b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl @@ -269,7 +269,6 @@ task ExtractAnAcAfFromVCF { Int count_variants = read_int("count.txt") File output_vcf = "~{normalized_vcf_compressed}" File output_vcf_index = "~{normalized_vcf_indexed}" - File testing_normalized = "~{normalized_vcf}" } } @@ -331,10 +330,10 @@ task AnnotateVCF { # Runtime settings: runtime { docker: "annotation/nirvana:3.14" - memory: "20 GB" + memory: "32 GB" cpu: "2" preemptible: 5 - disks: "local-disk 250 SSD" + disks: "local-disk 500 SSD" } # ------------------------------------------------ # Outputs: From b2d0ffee476ef0f28143aca59c3087f8acfa5c67 Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Fri, 1 Oct 2021 16:08:51 -0400 Subject: [PATCH 12/26] add duplicate count info --- scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl index 9a2a390ce62..f70721975c2 100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl +++ b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl @@ -83,6 +83,7 @@ workflow GvsSitesOnlyVCF { project_id = project_id, dataset_name = dataset_name, counts_variants = ExtractAnAcAfFromVCF.count_variants, + counts_variant_duplicates = ExtractAnAcAfFromVCF.count_duplicates, table_suffix = table_suffix, service_account_json_path = service_account_json_path, load_jsons_done = BigQueryLoadJson.done @@ -237,7 +238,7 @@ task ExtractAnAcAfFromVCF { ## remove those rows (that match up to the first 5 cols) grep -v -wFf duplicates.tsv ~{normalized_vcf} | grep -v "AC=0;" > deduplicated.vcf - wc -l duplicates.tsv + wc -l duplicates.tsv | awk '{print $1}' > duplicates_count.txt echo "the following duplicate variants will be dropped due to a bug" cat duplicates.tsv @@ -267,6 +268,7 @@ task ExtractAnAcAfFromVCF { output { File annotations_file = "~{custom_annotations_file_name}" Int count_variants = read_int("count.txt") + Int count_duplicates = read_int("duplicates_count.txt") File output_vcf = "~{normalized_vcf_compressed}" File output_vcf_index = "~{normalized_vcf_indexed}" } @@ -623,6 +625,7 @@ task BigQuerySmokeTest { String project_id String dataset_name Array[Int] counts_variants + Array[Int] counts_variant_duplicates String table_suffix String? service_account_json_path Boolean load_jsons_done @@ -650,6 +653,9 @@ task BigQuerySmokeTest { # sum all the initial input variants across the shards INITIAL_VARIANT_COUNT=$(python -c "print(sum([~{sep=', ' counts_variants}]))") + DUPLICATE_VARIANT_COUNT=$(python -c "print(sum([~{sep=', ' counts_variant_duplicates}]))") + + echo "Duplicate variant count: $DUPLICATE_VARIANT_COUNT." # Count number of variants in the VAT bq query --nouse_legacy_sql --project_id=~{project_id} --format=csv 'SELECT COUNT (DISTINCT vid) AS count FROM `~{dataset_name}.~{vat_table}`' > bq_variant_count.csv From d84ad828da7ae4f93f63eca5d04bb0e438b4699d Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Mon, 4 Oct 2021 12:54:55 -0400 Subject: [PATCH 13/26] dont rename things---use the names that already exist! 
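Per-shard outputs are now named after the shard's own input VCF (via WDL's
basename with a suffix argument) rather than a positional index, so outputs
map straight back to their source files. Roughly, with an illustrative shard
name:

    # WDL: String input_vcf_name = basename(input_vcfs[i], ".vcf.gz")
    input_vcf_name=$(basename shard_0007.vcf.gz .vcf.gz)   # -> shard_0007
    echo "${input_vcf_name}_annotated"                     # -> shard_0007_annotated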
--- scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl index f70721975c2..1e4f1350cc2 100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl +++ b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl @@ -3,8 +3,6 @@ workflow GvsSitesOnlyVCF { input { File inputFileofFileNames File inputFileofIndexFileNames - String output_sites_only_file_name - String output_annotated_file_name String project_id String dataset_name File nirvana_data_directory @@ -36,6 +34,7 @@ workflow GvsSitesOnlyVCF { scatter(i in range(length(MakeSubpopulationFiles.input_vcfs)) ) { ## Create a sites-only VCF from the original GVS jointVCF ## Calculate AC/AN/AF for subpopulations and extract them for custom annotations + String input_vcf_name = basename(MakeSubpopulationFiles.input_vcfs[i], ".vcf.gz") call ExtractAnAcAfFromVCF { input: input_vcf = MakeSubpopulationFiles.input_vcfs[i], @@ -51,7 +50,7 @@ workflow GvsSitesOnlyVCF { input: input_vcf = ExtractAnAcAfFromVCF.output_vcf, input_vcf_index = ExtractAnAcAfFromVCF.output_vcf_index, - output_annotated_file_name = "${output_annotated_file_name}_${i}", + output_annotated_file_name = "${input_vcf_name}_annotated", nirvana_data_tar = nirvana_data_directory, custom_annotations_file = ExtractAnAcAfFromVCF.annotations_file, } @@ -59,7 +58,7 @@ workflow GvsSitesOnlyVCF { call PrepAnnotationJson { input: annotation_json = AnnotateVCF.annotation_json, - output_file_suffix = "${i}.json.gz", + output_file_suffix = "${input_vcf_name}.json.gz", output_path = output_path, service_account_json_path = service_account_json_path } From 3d8b151a9ccc2bb8237556d0a5493da6feab536c Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Mon, 4 Oct 2021 14:16:33 -0400 Subject: [PATCH 14/26] drop all the excessive AS_QUALapprox info since it just adds weight --- scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl index 1e4f1350cc2..9922684310b 100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl +++ b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl @@ -248,8 +248,9 @@ task ExtractAnAcAfFromVCF { ### for validation of the pipeline wc -l ~{custom_annotations_file_name} | awk '{print $1 -7}' > count.txt - ### compress the vcf and index it - bcftools view --no-update --drop-genotypes deduplicated.vcf -Oz -o ~{normalized_vcf_compressed} + ### compress the vcf and index it, make it sites-only and drop the AS_QUALapprox field---might be nice to be able to drop all of the INFO fields, since I dont need AC/AN/AF in this form + bcftools view --no-update --drop-genotypes deduplicated.vcf -Oz -o sites_only.vcf.gz + bcftools annotate -x INFO/AS_QUALapprox sites_only.vcf.gz > ~{normalized_vcf_compressed} bcftools index --tbi ~{normalized_vcf_compressed} >>> From 615aa28a90dc5b39ff9983477546cb5d735cc716 Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Mon, 4 Oct 2021 17:40:34 -0400 Subject: [PATCH 15/26] swap gzip --- scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl index 9922684310b..00a067a9b86 100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl +++ 
b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl @@ -250,7 +250,7 @@ task ExtractAnAcAfFromVCF { ### compress the vcf and index it, make it sites-only and drop the AS_QUALapprox field---might be nice to be able to drop all of the INFO fields, since I dont need AC/AN/AF in this form bcftools view --no-update --drop-genotypes deduplicated.vcf -Oz -o sites_only.vcf.gz - bcftools annotate -x INFO/AS_QUALapprox sites_only.vcf.gz > ~{normalized_vcf_compressed} + bcftools annotate -x INFO/AS_QUALapprox sites_only.vcf.gz -Oz -o ~{normalized_vcf_compressed} bcftools index --tbi ~{normalized_vcf_compressed} >>> From 7a12d558530611230c8b93295fc9d0dc694f2887 Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Wed, 6 Oct 2021 16:30:51 -0400 Subject: [PATCH 16/26] check with Lee about VAT - markings --- .../create_variant_annotation_table.py | 24 +++++++++++++------ 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/scripts/variantstore/wdl/extract/create_variant_annotation_table.py b/scripts/variantstore/wdl/extract/create_variant_annotation_table.py index 0097fa18b3d..2b291d31df0 100644 --- a/scripts/variantstore/wdl/extract/create_variant_annotation_table.py +++ b/scripts/variantstore/wdl/extract/create_variant_annotation_table.py @@ -11,13 +11,15 @@ vat_nirvana_positions_dictionary = { "position": "position", # required + "ref_allele": "refAllele", # required + "alt_allele": "altAlleles", # required } vat_nirvana_variants_dictionary = { "vid": "vid", # required "contig": "chromosome", # required - "ref_allele": "refAllele", # required - "alt_allele": "altAllele", # required + # "ref_allele": "refAllele", # required # do we want to grab both to compare? + # "alt_allele": "altAllele", # required "variant_type": "variantType", # required "genomic_location": "hgvsg", # required "dbsnp_rsid": "dbsnp", # nullable @@ -196,9 +198,11 @@ def get_subpopulation_calculations(subpop_annotations): row["gvs_max_sc"] = max_sc return row -def make_annotated_json_row(row_position, variant_line, transcript_line): +def make_annotated_json_row(row_position, row_ref, row_alt, variant_line, transcript_line): row = {} row["position"] = row_position # this is a required field + row["ref_allele"] = row_ref # this is a required field + row["alt_allele"] = row_alt # this is a required field for vat_variants_fieldname in vat_nirvana_variants_dictionary.keys(): # like "contig" nirvana_variants_fieldname = vat_nirvana_variants_dictionary.get(vat_variants_fieldname) @@ -287,17 +291,23 @@ def make_positions_json(annotated_json, output_json): for p in positions: position=p['position'] + refAllele=p['refAllele'] # ref_allele + altAlleles=p['altAlleles'] # this is an array that we need to split into each alt_allele variants=p['variants'] + if '*' in altAlleles: + altAlleles.remove('*') + ## there should be the same number of altAlleles as variants, right? 
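+            ## Nirvana emits one entry in 'variants' per alt allele at this position, so after dropping the spanning-deletion '*' allele the two arrays are expected to pair up by index (guarded by the assert below)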
+ assert len(altAlleles)==len(variants) # row for each variant - transcript # so let's start with each variant - for variant in variants: + for index, variant in enumerate(variants): # check if it's a row we care about if check_filtering(variant) is False: continue # remember that we want one for each variant-transcript and variant-null for variants without transcripts if variant.get("transcripts") == None: # then we make a special row - row = make_annotated_json_row(position, variant, None) + row = make_annotated_json_row(position, refAllele, altAlleles[index], variant, None) json_str = json.dumps(row) + "\n" json_bytes = json_str.encode('utf-8') output_file.write(json_bytes) @@ -308,13 +318,13 @@ def make_positions_json(annotated_json, output_json): if "Ensembl" in sources: for transcript in transcript_lines: if transcript.get('source') == "Ensembl": - row = make_annotated_json_row(position, variant, transcript) + row = make_annotated_json_row(position, refAllele, altAlleles[index], variant, transcript) json_str = json.dumps(row) + "\n" json_bytes = json_str.encode('utf-8') output_file.write(json_bytes) else: # if there are transcripts, but they are not Ensembl, we now only want one row in the VAT, not one row per transcript - row = make_annotated_json_row(position, variant, None) + row = make_annotated_json_row(position, refAllele, altAlleles[index], variant, None) json_str = json.dumps(row) + "\n" json_bytes = json_str.encode('utf-8') output_file.write(json_bytes) From a815a869e3833242b9375182bc55633057e20a7f Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Wed, 6 Oct 2021 16:31:34 -0400 Subject: [PATCH 17/26] template update --- scripts/variantstore/wdl/GvsSitesOnlyVCF.example.inputs.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.example.inputs.json b/scripts/variantstore/wdl/GvsSitesOnlyVCF.example.inputs.json index 2061690b382..7e3f9f130b9 100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.example.inputs.json +++ b/scripts/variantstore/wdl/GvsSitesOnlyVCF.example.inputs.json @@ -12,5 +12,6 @@ "GvsSitesOnlyVCF.project_id": "spec-ops-aou", "GvsSitesOnlyVCF.dataset_name": "anvil_100_for_testing", "GvsSitesOnlyVCF.AnAcAf_annotations_template": "gs://broad-dsp-spec-ops/scratch/rcremer/Nirvana/custom_annotations_template.tsv", - "GvsSitesOnlyVCF.ancestry_file": "gs://broad-dsp-spec-ops/scratch/rcremer/Nirvana/subpop.tsv" + "GvsSitesOnlyVCF.ancestry_file": "gs://broad-dsp-spec-ops/scratch/rcremer/Nirvana/subpop.tsv", + "GvsSitesOnlyVCF.reference": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta" } From 55d5636d2ff19f0a68359adfc67230d93551ab4e Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Wed, 6 Oct 2021 16:38:08 -0400 Subject: [PATCH 18/26] add in line counts for metrics --- scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl | 31 +++++++++++++------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl index 00a067a9b86..806b9479807 100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl +++ b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl @@ -224,33 +224,44 @@ task ExtractAnAcAfFromVCF { #] - ## normalize and filter out the filtered sites and FT filtered out variants ## do we need: - bcftools norm -m-any ~{local_input_vcf} | \ - bcftools norm --check-ref w -f Homo_sapiens_assembly38.fasta | \ - bcftools view -f 
'PASS,.' | \ - bcftools filter -i "FORMAT/FT='PASS'" --set-GTs . > ~{normalized_vcf} + wc -l ~{local_input_vcf} + + ## filter out sites with too many alt alleles + bcftools view -e 'N_ALT>500' --no-update ~{local_input_vcf} > test1.vcf + wc -l test1.vcf + + ## filter out the non-passing sites + bcftools view -f 'PASS,.' --no-update test1.vcf | \ + ## normalize, left align and split multi allelic sites to new lines, remove duplicate lines + bcftools norm -m- --check-ref w -f Homo_sapiens_assembly38.fasta | \ + ## filter out spanning deletions and variants with an AC of 0 + bcftools view -e 'ALT[0]="*"' -e 'AC=0' --no-update | \ + ## ensure that we respect the FT tag + bcftools filter -i "FORMAT/FT='PASS,.'" --set-GTs . > ~{normalized_vcf} + wc -l ~{normalized_vcf} ## make a file of just the first 5 columns of the tsv bcftools query ~{normalized_vcf} -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\n' > check_duplicates.tsv ## check it for duplicates and put them in a new file sort check_duplicates.tsv | uniq -d | cut -f1,2,3,4,5 > duplicates.tsv ## remove those rows (that match up to the first 5 cols) - grep -v -wFf duplicates.tsv ~{normalized_vcf} | grep -v "AC=0;" > deduplicated.vcf + grep -v -wFf duplicates.tsv ~{normalized_vcf} > deduplicated.vcf wc -l duplicates.tsv | awk '{print $1}' > duplicates_count.txt echo "the following duplicate variants will be dropped due to a bug" cat duplicates.tsv + ## calculate annotations for all subpopulations bcftools plugin fill-tags -- deduplicated.vcf -S ~{subpopulation_sample_list} -t AC,AF,AN,AC_het,AC_hom | bcftools query -f \ '%CHROM\t%POS\t%REF\t%ALT\t%AC\t%AN\t%AF\t%AC_Hom\t%AC_Het\t%AC_afr\t%AN_afr\t%AF_afr\t%AC_Hom_afr\t%AC_Het_afr\t%AC_amr\t%AN_amr\t%AF_amr\t%AC_Hom_amr\t%AC_Het_amr\t%AC_eas\t%AN_eas\t%AF_eas\t%AC_Hom_eas\t%AC_Het_eas\t%AC_eur\t%AN_eur\t%AF_eur\t%AC_Hom_eur\t%AC_Het_eur\t%AC_mid\t%AN_mid\t%AF_mid\t%AC_Hom_mid\t%AC_Het_mid\t%AC_oth\t%AN_oth\t%AF_oth\t%AC_Hom_oth\t%AC_Het_oth\t%AC_sas\t%AN_sas\t%AF_sas\t%AC_Hom_sas\t%AC_Het_sas\n' \ - | grep -v "*" >> ~{custom_annotations_file_name} + >> ~{custom_annotations_file_name} ### for validation of the pipeline wc -l ~{custom_annotations_file_name} | awk '{print $1 -7}' > count.txt - ### compress the vcf and index it, make it sites-only and drop the AS_QUALapprox field---might be nice to be able to drop all of the INFO fields, since I dont need AC/AN/AF in this form - bcftools view --no-update --drop-genotypes deduplicated.vcf -Oz -o sites_only.vcf.gz - bcftools annotate -x INFO/AS_QUALapprox sites_only.vcf.gz -Oz -o ~{normalized_vcf_compressed} + ### compress the vcf and index it, make it sites-only + bcftools view --no-update --drop-genotypes deduplicated.vcf -Oz -o ~{normalized_vcf_compressed} + ##bcftools annotate -x INFO Is this overkill? 
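## (stripping the entire INFO block would shrink the shard further; the annotation step only needs CHROM/POS/REF/ALT, since AC/AN/AF travel in the custom annotations TSV)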
bcftools index --tbi ~{normalized_vcf_compressed} >>> From 7a14fb2e87cbaff7615e03ddd9e92448f765ef04 Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Wed, 6 Oct 2021 21:57:03 -0400 Subject: [PATCH 19/26] get the count of high alt allele sites --- scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl index 806b9479807..0ff1b1deac1 100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl +++ b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl @@ -223,22 +223,19 @@ task ExtractAnAcAfFromVCF { # "sas" #] - - wc -l ~{local_input_vcf} + ## get the number of +500 alt alleles for Lee + bcftools view -i 'N_ALT>500' --no-update --no-header ~{local_input_vcf} | wc -l ## filter out sites with too many alt alleles - bcftools view -e 'N_ALT>500' --no-update ~{local_input_vcf} > test1.vcf - wc -l test1.vcf - + bcftools view -e 'N_ALT>500' --no-update ~{local_input_vcf} | \ ## filter out the non-passing sites - bcftools view -f 'PASS,.' --no-update test1.vcf | \ + bcftools view -f 'PASS,.' --no-update | \ ## normalize, left align and split multi allelic sites to new lines, remove duplicate lines bcftools norm -m- --check-ref w -f Homo_sapiens_assembly38.fasta | \ ## filter out spanning deletions and variants with an AC of 0 - bcftools view -e 'ALT[0]="*"' -e 'AC=0' --no-update | \ + bcftools view -e 'ALT[0]="*" || AC=0' --no-update | \ ## ensure that we respect the FT tag bcftools filter -i "FORMAT/FT='PASS,.'" --set-GTs . > ~{normalized_vcf} - wc -l ~{normalized_vcf} ## make a file of just the first 5 columns of the tsv bcftools query ~{normalized_vcf} -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\n' > check_duplicates.tsv @@ -254,7 +251,7 @@ task ExtractAnAcAfFromVCF { ## calculate annotations for all subpopulations bcftools plugin fill-tags -- deduplicated.vcf -S ~{subpopulation_sample_list} -t AC,AF,AN,AC_het,AC_hom | bcftools query -f \ '%CHROM\t%POS\t%REF\t%ALT\t%AC\t%AN\t%AF\t%AC_Hom\t%AC_Het\t%AC_afr\t%AN_afr\t%AF_afr\t%AC_Hom_afr\t%AC_Het_afr\t%AC_amr\t%AN_amr\t%AF_amr\t%AC_Hom_amr\t%AC_Het_amr\t%AC_eas\t%AN_eas\t%AF_eas\t%AC_Hom_eas\t%AC_Het_eas\t%AC_eur\t%AN_eur\t%AF_eur\t%AC_Hom_eur\t%AC_Het_eur\t%AC_mid\t%AN_mid\t%AF_mid\t%AC_Hom_mid\t%AC_Het_mid\t%AC_oth\t%AN_oth\t%AF_oth\t%AC_Hom_oth\t%AC_Het_oth\t%AC_sas\t%AN_sas\t%AF_sas\t%AC_Hom_sas\t%AC_Het_sas\n' \ - >> ~{custom_annotations_file_name} + | grep -v "*" >> ~{custom_annotations_file_name} ### for validation of the pipeline wc -l ~{custom_annotations_file_name} | awk '{print $1 -7}' > count.txt From 5d8f1ed8357a41ca41af5434f68bdb28749b90ff Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Thu, 7 Oct 2021 11:17:49 -0400 Subject: [PATCH 20/26] AC_hemi for all --- .../custom_annotations_template.tsv | 8 ++++---- scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl | 6 +++--- .../wdl/extract/create_variant_annotation_table.py | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/scripts/variantstore/variant_annotations_table/custom_annotations_template.tsv b/scripts/variantstore/variant_annotations_table/custom_annotations_template.tsv index 7a5ba70c823..a578ea9b65f 100644 --- a/scripts/variantstore/variant_annotations_table/custom_annotations_template.tsv +++ b/scripts/variantstore/variant_annotations_table/custom_annotations_template.tsv @@ -1,7 +1,7 @@ #title=gvsAnnotations #assembly=GRCh38 #matchVariantsBy=allele 
-#CHROM POS REF ALT AC AN AF AC_Hom AC_Het AC_afr AN_afr AF_afr AC_Hom_afr AC_Het_afr AC_amr AN_amr AF_amr AC_Hom_amr AC_Het_amr AC_eas AN_eas AF_eas AC_Hom_eas AC_Het_eas AC_eur AN_eur AF_eur AC_Hom_eur AC_Het_eur AC_mid AN_mid AF_mid AC_Hom_mid AC_Het_mid AC_oth AN_oth AF_oth AC_Hom_oth AC_Het_oth AC_sas AN_sas AF_sas AC_Hom_sas AC_Het_sas -#categories . . . AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount -#descriptions . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . -#type . . . number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number +#CHROM POS REF ALT AC AN AF AC_Hom AC_Het AC_Hemi AC_afr AN_afr AF_afr AC_Hom_afr AC_Het_afr AC_Hemi_afr AC_amr AN_amr AF_amr AC_Hom_amr AC_Het_amr AC_Hemi_amr AC_eas AN_eas AF_eas AC_Hom_eas AC_Het_eas AC_Hemi_eas AC_eur AN_eur AF_eur AC_Hom_eur AC_Het_eur AC_Hemi_eur AC_mid AN_mid AF_mid AC_Hom_mid AC_Het_mid AC_Hemi_mid AC_oth AN_oth AF_oth AC_Hom_oth AC_Het_oth AC_Hemi_oth AC_sas AN_sas AF_sas AC_Hom_sas AC_Het_sas AC_Hemi_sas +#categories . . . AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount AlleleCount AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount AlleleCount AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount AlleleCount AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount AlleleCount AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount AlleleCount AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount AlleleCount AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount AlleleCount AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount AlleleCount +#descriptions . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . +#type . . . 
number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl index 0ff1b1deac1..10a260b9a8a 100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl +++ b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl @@ -249,9 +249,9 @@ task ExtractAnAcAfFromVCF { cat duplicates.tsv ## calculate annotations for all subpopulations - bcftools plugin fill-tags -- deduplicated.vcf -S ~{subpopulation_sample_list} -t AC,AF,AN,AC_het,AC_hom | bcftools query -f \ - '%CHROM\t%POS\t%REF\t%ALT\t%AC\t%AN\t%AF\t%AC_Hom\t%AC_Het\t%AC_afr\t%AN_afr\t%AF_afr\t%AC_Hom_afr\t%AC_Het_afr\t%AC_amr\t%AN_amr\t%AF_amr\t%AC_Hom_amr\t%AC_Het_amr\t%AC_eas\t%AN_eas\t%AF_eas\t%AC_Hom_eas\t%AC_Het_eas\t%AC_eur\t%AN_eur\t%AF_eur\t%AC_Hom_eur\t%AC_Het_eur\t%AC_mid\t%AN_mid\t%AF_mid\t%AC_Hom_mid\t%AC_Het_mid\t%AC_oth\t%AN_oth\t%AF_oth\t%AC_Hom_oth\t%AC_Het_oth\t%AC_sas\t%AN_sas\t%AF_sas\t%AC_Hom_sas\t%AC_Het_sas\n' \ - | grep -v "*" >> ~{custom_annotations_file_name} + bcftools plugin fill-tags -- deduplicated.vcf -S ~{subpopulation_sample_list} -t AC,AF,AN,AC_het,AC_hom,AC_Hemi | bcftools query -f \ + '%CHROM\t%POS\t%REF\t%ALT\t%AC\t%AN\t%AF\t%AC_Hom\t%AC_Het\t%AC_Hemi\t%AC_afr\t%AN_afr\t%AF_afr\t%AC_Hom_afr\t%AC_Het_afr\t%AC_Hemi_afr\t%AC_amr\t%AN_amr\t%AF_amr\t%AC_Hom_amr\t%AC_Het_amr\t%AC_Hemi_amr\t%AC_eas\t%AN_eas\t%AF_eas\t%AC_Hom_eas\t%AC_Het_eas\t%AC_Hemi_eas\t%AC_eur\t%AN_eur\t%AF_eur\t%AC_Hom_eur\t%AC_Het_eur\t%AC_Hemi_eur\t%AC_mid\t%AN_mid\t%AF_mid\t%AC_Hom_mid\t%AC_Het_mid\t%AC_Hemi_mid\t%AC_oth\t%AN_oth\t%AF_oth\t%AC_Hom_oth\t%AC_Het_oth\t%AC_Hemi_oth\t%AC_sas\t%AN_sas\t%AF_sas\t%AC_Hom_sas\t%AC_Het_sas\t%AC_Hemi_sas\n' \ + >> ~{custom_annotations_file_name} ### for validation of the pipeline wc -l ~{custom_annotations_file_name} | awk '{print $1 -7}' > count.txt diff --git a/scripts/variantstore/wdl/extract/create_variant_annotation_table.py b/scripts/variantstore/wdl/extract/create_variant_annotation_table.py index 2b291d31df0..7f0f443462f 100644 --- a/scripts/variantstore/wdl/extract/create_variant_annotation_table.py +++ b/scripts/variantstore/wdl/extract/create_variant_annotation_table.py @@ -178,7 +178,7 @@ def get_subpopulation_calculations(subpop_annotations): subpop_an_val = subpop_annotations.get("_".join(["AN", gvs_subpop])) subpop_af_val = subpop_annotations.get("_".join(["AF", gvs_subpop])) # note the assumption is made that AC_Hom must be even because by it's nature it means there are two, but there could be an error - subpop_sc_val = int(subpop_annotations.get("_".join(["AC_Hom", gvs_subpop])) / 2 ) + subpop_annotations.get("_".join(["AC_Het", gvs_subpop])) + subpop_sc_val = int(subpop_annotations.get("_".join(["AC_Hom", gvs_subpop])) / 2 ) + subpop_annotations.get("_".join(["AC_Het", gvs_subpop])) + subpop_annotations.get("_".join(["AC_Hemi", gvs_subpop])) # here we set the subpopulation ac/an/af values row["_".join(["gvs", gvs_subpop, "ac"])] = subpop_ac_val row["_".join(["gvs", gvs_subpop, "an"])] = subpop_an_val From f57707adafe682b7a7699bc4a65238339f3314c5 Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Thu, 7 Oct 2021 22:20:27 -0400 Subject: [PATCH 21/26] add new docker image --- 
scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl index 10a260b9a8a..f8f6849c3bc 100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl +++ b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl @@ -151,7 +151,7 @@ task MakeSubpopulationFiles { # ------------------------------------------------ # Runtime settings: runtime { - docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_20210916" + docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_20211007" memory: "1 GB" preemptible: 3 cpu: "1" @@ -265,7 +265,7 @@ task ExtractAnAcAfFromVCF { # ------------------------------------------------ # Runtime settings: runtime { - docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_20210916" + docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_20211007" memory: "32 GB" preemptible: 3 cpu: "2" @@ -395,7 +395,7 @@ task PrepAnnotationJson { # ------------------------------------------------ # Runtime settings: runtime { - docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_20210916" + docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_20211007" memory: "8 GB" preemptible: 5 cpu: "1" From c853b63896cf71c9b72dddd0536e21624d75d0f8 Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Thu, 7 Oct 2021 23:18:59 -0400 Subject: [PATCH 22/26] get sample counts --- scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl index f8f6849c3bc..e7de7ed0bb9 100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl +++ b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl @@ -211,7 +211,11 @@ task ExtractAnAcAfFromVCF { gsutil cp ~{ref} Homo_sapiens_assembly38.fasta # TODO Compare the ancestry list with the sample list and throw an error (but dont fail the job) if there are samples that are in one, but not the other. Two different errors. - # awk '{print $2}' ~{subpopulation_sample_list} | tail -n +2 | sort -u > collected_subpopulations.txt + #awk '{print $2}' ~{subpopulation_sample_list} | tail -n +2 | sort -u > collected_subpopulations.txt + #bcftools query --list-samples ~{local_input_vcf} | sort -u > collected_samples.txt + + #wc -l collected_subpopulations.txt + #wc -l collected_samples.txt # expected_subpopulations = [ # "afr", From 7f0864b2e717728fbf66941307fc6aeebf99c47e Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Tue, 12 Oct 2021 22:18:26 -0400 Subject: [PATCH 23/26] clean up files for memory preservation --- scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl index e7de7ed0bb9..3502fa90be0 100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl +++ b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl @@ -240,13 +240,19 @@ task ExtractAnAcAfFromVCF { bcftools view -e 'ALT[0]="*" || AC=0' --no-update | \ ## ensure that we respect the FT tag bcftools filter -i "FORMAT/FT='PASS,.'" --set-GTs . 
> ~{normalized_vcf} + du -h ~{normalized_vcf} + + ## clean up + rm ~{local_input_vcf} ## make a file of just the first 5 columns of the tsv bcftools query ~{normalized_vcf} -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\n' > check_duplicates.tsv ## check it for duplicates and put them in a new file sort check_duplicates.tsv | uniq -d | cut -f1,2,3,4,5 > duplicates.tsv + rm check_duplicates.tsv ## clean up ## remove those rows (that match up to the first 5 cols) grep -v -wFf duplicates.tsv ~{normalized_vcf} > deduplicated.vcf + rm ~{normalized_vcf} ## clean up wc -l duplicates.tsv | awk '{print $1}' > duplicates_count.txt echo "the following duplicate variants will be dropped due to a bug" From 27ca1644f3715e88d1564c8e05b6d872d8bd7623 Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Fri, 15 Oct 2021 15:35:46 -0400 Subject: [PATCH 24/26] re-order errors --- .../wdl/extract/create_variant_annotation_table.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/variantstore/wdl/extract/create_variant_annotation_table.py b/scripts/variantstore/wdl/extract/create_variant_annotation_table.py index 7f0f443462f..e305441a954 100644 --- a/scripts/variantstore/wdl/extract/create_variant_annotation_table.py +++ b/scripts/variantstore/wdl/extract/create_variant_annotation_table.py @@ -120,6 +120,10 @@ def check_filtering(variant): if variant.get("gvsAnnotations") == None: # <-- enum since we need this to be in tandem with the custom annotations header / template print("WARNING: There has been an error in creating custom annotations for AC/AF/AN", variant.get("vid")) return False + # skip any row (with a warning) if the AC value is 0 + elif variant["gvsAnnotations"].get("AC") == 0: + print("WARNING: Its AC is 0 so we are dropping this variant", variant.get("vid")) + return False # skip any row (with a warning) if AC, AN or AF is missing elif variant["gvsAnnotations"].get("AC") == None: print("WARNING: There has been an error-- there is no AC value---should AN be 0 for this variant?", variant.get("vid")) @@ -130,10 +134,6 @@ def check_filtering(variant): elif variant["gvsAnnotations"].get("AF") == None: print("WARNING: There has been an error-- there is an AC value---but no AF value", variant.get("vid")) return False - # skip any row (with a warning) if the AC value is 0 - elif variant["gvsAnnotations"].get("AC") == 0: - print("WARNING: Its AC is 0 so we are dropping this variant", variant.get("vid")) - return False else: return True From 2602a5655770757d22740149256ec38518e61a78 Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Wed, 20 Oct 2021 11:46:14 -0400 Subject: [PATCH 25/26] pass dropped variants --- scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl | 35 ++++++++++++-------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl index 3502fa90be0..244a3ce3f5a 100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl +++ b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl @@ -82,7 +82,7 @@ workflow GvsSitesOnlyVCF { project_id = project_id, dataset_name = dataset_name, counts_variants = ExtractAnAcAfFromVCF.count_variants, - counts_variant_duplicates = ExtractAnAcAfFromVCF.count_duplicates, + track_dropped_variants = ExtractAnAcAfFromVCF.track_dropped, table_suffix = table_suffix, service_account_json_path = service_account_json_path, load_jsons_done = BigQueryLoadJson.done @@ -227,11 +227,14 @@ task 
ExtractAnAcAfFromVCF { # "sas" #] - ## get the number of +500 alt alleles for Lee - bcftools view -i 'N_ALT>500' --no-update --no-header ~{local_input_vcf} | wc -l + ## track the dropped variants with +500 alt alleles or N's in the reference + bcftools view -i 'N_ALT>500 || REF~"N"' ~{local_input_vcf} | bcftools query -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\n' > track_dropped.tsv + + ## Since Nirvana cant handle N as a base, drop them for now and keep track of the lines dropped + ## bcftools view -e 'REF~"N"' ~{local_input_vcf} ## filter out sites with too many alt alleles - bcftools view -e 'N_ALT>500' --no-update ~{local_input_vcf} | \ + bcftools view -e 'N_ALT>500 || REF~"N"' --no-update ~{local_input_vcf} | \ ## filter out the non-passing sites bcftools view -f 'PASS,.' --no-update | \ ## normalize, left align and split multi allelic sites to new lines, remove duplicate lines @@ -240,6 +243,7 @@ task ExtractAnAcAfFromVCF { bcftools view -e 'ALT[0]="*" || AC=0' --no-update | \ ## ensure that we respect the FT tag bcftools filter -i "FORMAT/FT='PASS,.'" --set-GTs . > ~{normalized_vcf} + du -h ~{normalized_vcf} ## clean up @@ -254,21 +258,23 @@ task ExtractAnAcAfFromVCF { grep -v -wFf duplicates.tsv ~{normalized_vcf} > deduplicated.vcf rm ~{normalized_vcf} ## clean up - wc -l duplicates.tsv | awk '{print $1}' > duplicates_count.txt echo "the following duplicate variants will be dropped due to a bug" - cat duplicates.tsv + cat duplicates.tsv >> track_dropped.tsv + cat track_dropped.tsv + + rm duplicates.tsv ## clean up ## calculate annotations for all subpopulations bcftools plugin fill-tags -- deduplicated.vcf -S ~{subpopulation_sample_list} -t AC,AF,AN,AC_het,AC_hom,AC_Hemi | bcftools query -f \ '%CHROM\t%POS\t%REF\t%ALT\t%AC\t%AN\t%AF\t%AC_Hom\t%AC_Het\t%AC_Hemi\t%AC_afr\t%AN_afr\t%AF_afr\t%AC_Hom_afr\t%AC_Het_afr\t%AC_Hemi_afr\t%AC_amr\t%AN_amr\t%AF_amr\t%AC_Hom_amr\t%AC_Het_amr\t%AC_Hemi_amr\t%AC_eas\t%AN_eas\t%AF_eas\t%AC_Hom_eas\t%AC_Het_eas\t%AC_Hemi_eas\t%AC_eur\t%AN_eur\t%AF_eur\t%AC_Hom_eur\t%AC_Het_eur\t%AC_Hemi_eur\t%AC_mid\t%AN_mid\t%AF_mid\t%AC_Hom_mid\t%AC_Het_mid\t%AC_Hemi_mid\t%AC_oth\t%AN_oth\t%AF_oth\t%AC_Hom_oth\t%AC_Het_oth\t%AC_Hemi_oth\t%AC_sas\t%AN_sas\t%AF_sas\t%AC_Hom_sas\t%AC_Het_sas\t%AC_Hemi_sas\n' \ >> ~{custom_annotations_file_name} - ### for validation of the pipeline + ## for validation of the pipeline wc -l ~{custom_annotations_file_name} | awk '{print $1 -7}' > count.txt - ### compress the vcf and index it, make it sites-only + ## compress the vcf and index it, make it sites-only bcftools view --no-update --drop-genotypes deduplicated.vcf -Oz -o ~{normalized_vcf_compressed} - ##bcftools annotate -x INFO Is this overkill? + ## bcftools annotate -x INFO Is this overkill? bcftools index --tbi ~{normalized_vcf_compressed} >>> @@ -286,7 +292,7 @@ task ExtractAnAcAfFromVCF { output { File annotations_file = "~{custom_annotations_file_name}" Int count_variants = read_int("count.txt") - Int count_duplicates = read_int("duplicates_count.txt") + File track_dropped = "track_dropped.tsv" File output_vcf = "~{normalized_vcf_compressed}" File output_vcf_index = "~{normalized_vcf_indexed}" } @@ -643,7 +649,7 @@ task BigQuerySmokeTest { String project_id String dataset_name Array[Int] counts_variants - Array[Int] counts_variant_duplicates + Array[File] track_dropped_variants String table_suffix String? 
service_account_json_path Boolean load_jsons_done @@ -671,9 +677,12 @@ task BigQuerySmokeTest { # sum all the initial input variants across the shards INITIAL_VARIANT_COUNT=$(python -c "print(sum([~{sep=', ' counts_variants}]))") - DUPLICATE_VARIANT_COUNT=$(python -c "print(sum([~{sep=', ' counts_variant_duplicates}]))") + ## here a list of files: ~{sep=', ' track_dropped_variants} + ## I need to get the lines from each file + awk 'FNR==1{print ""}1' ~{sep=' ' track_dropped_variants} > dropped_variants.txt - echo "Duplicate variant count: $DUPLICATE_VARIANT_COUNT." + echo "Dropped variants:" + cat dropped_variants.txt # Count number of variants in the VAT bq query --nouse_legacy_sql --project_id=~{project_id} --format=csv 'SELECT COUNT (DISTINCT vid) AS count FROM `~{dataset_name}.~{vat_table}`' > bq_variant_count.csv From 5284deeaf08998d166b937e65b1f85dfb4d75e4b Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Tue, 26 Oct 2021 23:09:21 -0400 Subject: [PATCH 26/26] comments cleanup --- scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl | 32 ++++++++------------ 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl index 244a3ce3f5a..c4921f86684 100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl +++ b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl @@ -193,7 +193,8 @@ task ExtractAnAcAfFromVCF { String normalized_vcf_compressed = "normalized.vcf.gz" String normalized_vcf_indexed = "normalized.vcf.gz.tbi" - # separate multi-allelic sites into their own lines, remove deletions and extract the an/ac/af & sc + # separate multi-allelic sites into their own lines, remove deletions and filtered sites and make a sites only vcf + # while extracting the an/ac/af & sc by subpopulation into a tsv command <<< set -e @@ -210,12 +211,12 @@ task ExtractAnAcAfFromVCF { gsutil cp ~{input_vcf_index} ~{local_input_vcf_index} gsutil cp ~{ref} Homo_sapiens_assembly38.fasta - # TODO Compare the ancestry list with the sample list and throw an error (but dont fail the job) if there are samples that are in one, but not the other. Two different errors. - #awk '{print $2}' ~{subpopulation_sample_list} | tail -n +2 | sort -u > collected_subpopulations.txt - #bcftools query --list-samples ~{local_input_vcf} | sort -u > collected_samples.txt - - #wc -l collected_subpopulations.txt - #wc -l collected_samples.txt + ## compare the ancestry sample list with the vcf sample list + # TODO throw an error (but dont fail the job) if there are samples that are in one, but not the other. Throw two different errors. 
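# (one error for ancestry-file samples missing from the VCF, another for VCF samples missing from the ancestry file)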
+ # Currently commented out to save on io with the AoU beta VAT creation + # awk '{print $1}' ~{subpopulation_sample_list} | tail -n +2 | sort -u > collected_subpopulation_samples.txt + # bcftools query --list-samples ~{local_input_vcf} | sort -u > collected_samples.txt + # diff collected_subpopulation_samples.txt collected_samples.txt # expected_subpopulations = [ # "afr", @@ -227,12 +228,9 @@ task ExtractAnAcAfFromVCF { # "sas" #] - ## track the dropped variants with +500 alt alleles or N's in the reference + ## track the dropped variants with +500 alt alleles or N's in the reference (Since Nirvana cant handle N as a base, drop them for now) bcftools view -i 'N_ALT>500 || REF~"N"' ~{local_input_vcf} | bcftools query -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\n' > track_dropped.tsv - ## Since Nirvana cant handle N as a base, drop them for now and keep track of the lines dropped - ## bcftools view -e 'REF~"N"' ~{local_input_vcf} - ## filter out sites with too many alt alleles bcftools view -e 'N_ALT>500 || REF~"N"' --no-update ~{local_input_vcf} | \ ## filter out the non-passing sites @@ -244,9 +242,7 @@ task ExtractAnAcAfFromVCF { ## ensure that we respect the FT tag bcftools filter -i "FORMAT/FT='PASS,.'" --set-GTs . > ~{normalized_vcf} - du -h ~{normalized_vcf} - - ## clean up + ## clean up unneeded file rm ~{local_input_vcf} ## make a file of just the first 5 columns of the tsv @@ -258,11 +254,9 @@ task ExtractAnAcAfFromVCF { grep -v -wFf duplicates.tsv ~{normalized_vcf} > deduplicated.vcf rm ~{normalized_vcf} ## clean up - echo "the following duplicate variants will be dropped due to a bug" + ## add duplicates to the file tracking dropped variants cat duplicates.tsv >> track_dropped.tsv - cat track_dropped.tsv - - rm duplicates.tsv ## clean up + rm duplicates.tsv ## clean up unneeded file ## calculate annotations for all subpopulations bcftools plugin fill-tags -- deduplicated.vcf -S ~{subpopulation_sample_list} -t AC,AF,AN,AC_het,AC_hom,AC_Hemi | bcftools query -f \ @@ -274,7 +268,7 @@ task ExtractAnAcAfFromVCF { ## compress the vcf and index it, make it sites-only bcftools view --no-update --drop-genotypes deduplicated.vcf -Oz -o ~{normalized_vcf_compressed} - ## bcftools annotate -x INFO Is this overkill? + ## if we can spare the IO and want to pass a smaller file we can also drop the info field w bcftools annotate -x INFO bcftools index --tbi ~{normalized_vcf_compressed} >>>
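
A note on the awk one-liner added to BigQuerySmokeTest in PATCH 25:
'FNR==1{print ""}' fires on the first record of each input file (FNR resets
per file) and the trailing 1 prints every record, so the per-shard
dropped-variant files are concatenated with a blank-line separator before
each one. A quick check with hypothetical shard files:

    printf 'chr1\t100\trs1\tA\tT\n' > shard0_dropped.tsv
    printf 'chr2\t300\trs2\tG\tC\n' > shard1_dropped.tsv
    awk 'FNR==1{print ""}1' shard0_dropped.tsv shard1_dropped.tsv > dropped_variants.txt
    # dropped_variants.txt: blank line, shard0 rows, blank line, shard1 rows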