From 49c99718fe646ff1a302a9c677206a7a3fac9bbf Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Thu, 16 Sep 2021 14:34:23 -0400 Subject: [PATCH 01/26] clean up --- scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl index 2adccbb9da1..cc86a966f93 100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl +++ b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl @@ -94,8 +94,8 @@ workflow GvsSitesOnlyVCF { service_account_json_path = service_account_json_path, load_jsons_done = BigQueryLoadJson.done } - scatter(i in range(length(contig_array)) ) { + scatter(i in range(length(contig_array)) ) { call BigQueryExportVat { input: contig = contig_array[i], From 9ff496bf932c62fc307e7ab82f15a021fc581672 Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Wed, 29 Sep 2021 11:10:51 -0400 Subject: [PATCH 02/26] is this step dropping rows --- scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl index cc86a966f93..f6881d99438 100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl +++ b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl @@ -250,7 +250,6 @@ task ExtractAnAcAfFromVCF { ### for validation of the pipeline wc -l ~{custom_annotations_file_name} | awk '{print $1}' > count.txt - # Should this be where we do the filtering of the AC/AN/AF values rather than in the python? ### compress the vcf and index it bcftools view -I deduplicated.vcf -Oz -o ~{normalized_vcf_compressed} @@ -291,6 +290,8 @@ task SitesOnlyVcf { # Adding `--add-output-vcf-command-line false` so that the VCF header doesn't have a timestamp # in it so that downstream steps can call cache + wc -l ~{input_vcf} + gatk --java-options "-Xmx12288m" \ SelectVariants \ -V ~{input_vcf} \ @@ -299,7 +300,10 @@ task SitesOnlyVcf { --sites-only-vcf-output \ -O ~{output_filename} - >>> + wc -l ~{output_filename} + + + >>> # ------------------------------------------------ # Runtime settings: runtime { From d845e0176958c334945d5ca320e90861d014321d Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Wed, 29 Sep 2021 11:28:30 -0400 Subject: [PATCH 03/26] add filtering on the FT tag level --- scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl index f6881d99438..c4c1c5e0997 100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl +++ b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl @@ -231,9 +231,11 @@ task ExtractAnAcAfFromVCF { #] + ## normalize and filter out the filtered sites and FT filtered out variants bcftools norm -m-any ~{local_input_vcf} | \ - bcftools norm --check-ref w -f Homo_sapiens_assembly38.fasta > ~{normalized_vcf} - + bcftools norm --check-ref w -f Homo_sapiens_assembly38.fasta | \ + bcftools view -f 'PASS' | \ + bcftools filter -i "FORMAT/FT='PASS'" --set-GTs . > ~{normalized_vcf} ## make a file of just the first 4 columns of the tsv (maybe I could just bcftools query it?) 
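## (each row below is CHROM / POS / REF / ALT, so duplicate rows in this file mean the same allele was emitted more than once after splitting multi-allelics)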
bcftools query ~{normalized_vcf} -f '%CHROM\t%POS\t%REF\t%ALT\n' > check_duplicates.tsv From e8646e468103d8b1fb4fbe3c6c9b47b8ea19fe64 Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Wed, 29 Sep 2021 15:13:46 -0400 Subject: [PATCH 04/26] fixup on the file length --- scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl index c4c1c5e0997..b86a76aff3d 100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl +++ b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl @@ -251,7 +251,7 @@ task ExtractAnAcAfFromVCF { | grep -v "*" >> ~{custom_annotations_file_name} ### for validation of the pipeline - wc -l ~{custom_annotations_file_name} | awk '{print $1}' > count.txt + wc -l ~{custom_annotations_file_name} | awk '{print $1 -7}' > count.txt ### compress the vcf and index it bcftools view -I deduplicated.vcf -Oz -o ~{normalized_vcf_compressed} @@ -292,8 +292,6 @@ task SitesOnlyVcf { # Adding `--add-output-vcf-command-line false` so that the VCF header doesn't have a timestamp # in it so that downstream steps can call cache - wc -l ~{input_vcf} - gatk --java-options "-Xmx12288m" \ SelectVariants \ -V ~{input_vcf} \ @@ -302,9 +300,6 @@ task SitesOnlyVcf { --sites-only-vcf-output \ -O ~{output_filename} - wc -l ~{output_filename} - - >>> # ------------------------------------------------ # Runtime settings: From 46b81e14fc0a3c1dbccf032664b258cb0baf66a1 Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Wed, 29 Sep 2021 15:24:03 -0400 Subject: [PATCH 05/26] remove duplicates by 4 cols, not only 2 --- scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl index b86a76aff3d..51f524f2792 100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl +++ b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl @@ -238,13 +238,15 @@ task ExtractAnAcAfFromVCF { bcftools filter -i "FORMAT/FT='PASS'" --set-GTs . > ~{normalized_vcf} ## make a file of just the first 4 columns of the tsv (maybe I could just bcftools query it?) 
- bcftools query ~{normalized_vcf} -f '%CHROM\t%POS\t%REF\t%ALT\n' > check_duplicates.tsv + bcftools query ~{normalized_vcf} -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\n' > check_duplicates.tsv ## check it for duplicates and put them in a new file - sort check_duplicates.tsv | uniq -d | cut -f1,2 > duplicates.tsv - ## remove those rows (this will be ALL rows with this position--so good rows too, potentially we want to grab f1,4 to do this with) + sort check_duplicates.tsv | uniq -d | cut -f1,2,3,4,5 > duplicates.tsv + ## remove those rows (that match up to the first 5 cols) grep -v -wFf duplicates.tsv ~{normalized_vcf} | grep -v "AC=0;" > deduplicated.vcf wc -l duplicates.tsv + echo "the following variants will be dropped due to a bug" + cat duplicates.tsv bcftools plugin fill-tags -- deduplicated.vcf -S ~{subpopulation_sample_list} -t AC,AF,AN,AC_het,AC_hom | bcftools query -f \ '%CHROM\t%POS\t%REF\t%ALT\t%AC\t%AN\t%AF\t%AC_Hom\t%AC_Het\t%AC_afr\t%AN_afr\t%AF_afr\t%AC_Hom_afr\t%AC_Het_afr\t%AC_amr\t%AN_amr\t%AF_amr\t%AC_Hom_amr\t%AC_Het_amr\t%AC_eas\t%AN_eas\t%AF_eas\t%AC_Hom_eas\t%AC_Het_eas\t%AC_eur\t%AN_eur\t%AF_eur\t%AC_Hom_eur\t%AC_Het_eur\t%AC_mid\t%AN_mid\t%AF_mid\t%AC_Hom_mid\t%AC_Het_mid\t%AC_oth\t%AN_oth\t%AF_oth\t%AC_Hom_oth\t%AC_Het_oth\t%AC_sas\t%AN_sas\t%AF_sas\t%AC_Hom_sas\t%AC_Het_sas\n' \ From 59b6cb98da656a373599d6a65dc543c69ad2a793 Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Wed, 29 Sep 2021 22:45:42 -0400 Subject: [PATCH 06/26] cleanup --- scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl index 51f524f2792..c00d020bfc7 100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl +++ b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl @@ -231,13 +231,13 @@ task ExtractAnAcAfFromVCF { #] - ## normalize and filter out the filtered sites and FT filtered out variants + ## normalize and filter out the filtered sites and FT filtered out variants ## do we need: bcftools norm -m-any ~{local_input_vcf} | \ bcftools norm --check-ref w -f Homo_sapiens_assembly38.fasta | \ - bcftools view -f 'PASS' | \ + bcftools view -f 'PASS,.' | \ bcftools filter -i "FORMAT/FT='PASS'" --set-GTs . > ~{normalized_vcf} - ## make a file of just the first 4 columns of the tsv (maybe I could just bcftools query it?) + ## make a file of just the first 5 columns of the tsv bcftools query ~{normalized_vcf} -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\n' > check_duplicates.tsv ## check it for duplicates and put them in a new file sort check_duplicates.tsv | uniq -d | cut -f1,2,3,4,5 > duplicates.tsv @@ -245,7 +245,7 @@ task ExtractAnAcAfFromVCF { grep -v -wFf duplicates.tsv ~{normalized_vcf} | grep -v "AC=0;" > deduplicated.vcf wc -l duplicates.tsv - echo "the following variants will be dropped due to a bug" + echo "the following duplicate variants will be dropped due to a bug" cat duplicates.tsv bcftools plugin fill-tags -- deduplicated.vcf -S ~{subpopulation_sample_list} -t AC,AF,AN,AC_het,AC_hom | bcftools query -f \ From 1e358aaf5d3396882a9a73cb9b0d340328e04ec4 Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Thu, 30 Sep 2021 00:24:55 -0400 Subject: [PATCH 07/26] does this cleanly remove sites only? 
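This removes the separate GATK SelectVariants-based SitesOnlyVcf task; the
sites-only VCF is now produced at the end of ExtractAnAcAfFromVCF instead.
A minimal sketch of the bcftools replacement, assuming the deduplicated.vcf
written by the earlier steps of that task:

    # drop all sample columns without recalculating INFO tags, then index
    bcftools view --no-update --drop-genotypes deduplicated.vcf -Oz -o normalized.vcf.gz
    bcftools index --tbi normalized.vcf.gz

--drop-genotypes keeps only the first eight VCF columns, which is all the
downstream Nirvana annotation step needs.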
--- scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl | 60 ++------------------ 1 file changed, 6 insertions(+), 54 deletions(-) diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl index c00d020bfc7..25bdccb0b5a 100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl +++ b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl @@ -34,6 +34,8 @@ workflow GvsSitesOnlyVCF { ## Scatter across the shards from the GVS jointVCF scatter(i in range(length(MakeSubpopulationFiles.input_vcfs)) ) { + ## Create a sites-only VCF from the original GVS jointVCF + ## Calculate AC/AN/AF for subpopulations and extract them for custom annotations call ExtractAnAcAfFromVCF { input: input_vcf = MakeSubpopulationFiles.input_vcfs[i], @@ -44,20 +46,11 @@ workflow GvsSitesOnlyVCF { ref = reference } - ## Create a sites-only VCF from the original GVS jointVCF - call SitesOnlyVcf { - input: - input_vcf = ExtractAnAcAfFromVCF.output_vcf, - input_vcf_index = ExtractAnAcAfFromVCF.output_vcf_index, - service_account_json_path = service_account_json_path, - output_filename = "${output_sites_only_file_name}_${i}.sites_only.vcf.gz" - } - ## Use Nirvana to annotate the sites-only VCF and include the AC/AN/AF calculations as custom annotations call AnnotateVCF { input: - input_vcf = SitesOnlyVcf.output_vcf, - input_vcf_index = SitesOnlyVcf.output_vcf_idx, + input_vcf = ExtractAnAcAfFromVCF.output_vcf, + input_vcf_index = ExtractAnAcAfFromVCF.output_vcf_index, output_annotated_file_name = "${output_annotated_file_name}_${i}", nirvana_data_tar = nirvana_data_directory, custom_annotations_file = ExtractAnAcAfFromVCF.annotations_file, @@ -256,7 +249,7 @@ task ExtractAnAcAfFromVCF { wc -l ~{custom_annotations_file_name} | awk '{print $1 -7}' > count.txt ### compress the vcf and index it - bcftools view -I deduplicated.vcf -Oz -o ~{normalized_vcf_compressed} + bcftools view --no-update --drop-genotypes deduplicated.vcf -Oz -o ~{normalized_vcf_compressed} bcftools index --tbi ~{normalized_vcf_compressed} >>> @@ -264,7 +257,7 @@ task ExtractAnAcAfFromVCF { # Runtime settings: runtime { docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_20210916" - memory: "20 GB" + memory: "32 GB" preemptible: 1 cpu: "2" disks: "local-disk 500 SSD" @@ -279,47 +272,6 @@ task ExtractAnAcAfFromVCF { } } -task SitesOnlyVcf { - input { - File input_vcf - File input_vcf_index - String? 
service_account_json_path - String output_filename - } - String output_vcf_idx = basename(output_filename) + ".tbi" - - command <<< - set -e - - # Adding `--add-output-vcf-command-line false` so that the VCF header doesn't have a timestamp - # in it so that downstream steps can call cache - - gatk --java-options "-Xmx12288m" \ - SelectVariants \ - -V ~{input_vcf} \ - --add-output-vcf-command-line false \ - --exclude-filtered \ - --sites-only-vcf-output \ - -O ~{output_filename} - - >>> - # ------------------------------------------------ - # Runtime settings: - runtime { - docker: "broadinstitute/gatk:4.2.0.0" - memory: "15 GB" - preemptible: 3 - cpu: "2" - disks: "local-disk 250 HDD" - } - # ------------------------------------------------ - # Outputs: - output { - File output_vcf="~{output_filename}" - File output_vcf_idx="~{output_vcf_idx}" - } -} - task AnnotateVCF { input { File input_vcf From d168d2733013a7fb24a160e5746ec86ba05b7c42 Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Thu, 30 Sep 2021 00:53:46 -0400 Subject: [PATCH 08/26] add to dockstore for testing --- .dockstore.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.dockstore.yml b/.dockstore.yml index a7c31ff7225..5c91300c877 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -128,6 +128,7 @@ workflows: branches: - master - ah_var_store + - rc-remove-sites-only-step - name: GvsValidateVat subclass: WDL primaryDescriptorPath: /scripts/variantstore/variant_annotations_table/GvsValidateVAT.wdl From e9f413c6f259c0825054a243ad0f98b757883df7 Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Thu, 30 Sep 2021 00:55:10 -0400 Subject: [PATCH 09/26] lil cheaper --- scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl index 25bdccb0b5a..5307485d783 100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl +++ b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl @@ -258,7 +258,7 @@ task ExtractAnAcAfFromVCF { runtime { docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_20210916" memory: "32 GB" - preemptible: 1 + preemptible: 3 cpu: "2" disks: "local-disk 500 SSD" } From f93b896a12cb3c7497a78f4cdd2807a7be715836 Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Thu, 30 Sep 2021 14:54:08 -0400 Subject: [PATCH 10/26] testing the normalized vcfs --- scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl index 5307485d783..f87f45eeb2d 100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl +++ b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl @@ -269,6 +269,7 @@ task ExtractAnAcAfFromVCF { Int count_variants = read_int("count.txt") File output_vcf = "~{normalized_vcf_compressed}" File output_vcf_index = "~{normalized_vcf_indexed}" + File testing_normalized = "~{normalized_vcf}" } } From 55fa30590f64fe58981d7bd7706447d9fb96d5b5 Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Thu, 30 Sep 2021 15:35:59 -0400 Subject: [PATCH 11/26] make annotations machine bigger --- scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl index f87f45eeb2d..9a2a390ce62 
100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl +++ b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl @@ -269,7 +269,6 @@ task ExtractAnAcAfFromVCF { Int count_variants = read_int("count.txt") File output_vcf = "~{normalized_vcf_compressed}" File output_vcf_index = "~{normalized_vcf_indexed}" - File testing_normalized = "~{normalized_vcf}" } } @@ -331,10 +330,10 @@ task AnnotateVCF { # Runtime settings: runtime { docker: "annotation/nirvana:3.14" - memory: "20 GB" + memory: "32 GB" cpu: "2" preemptible: 5 - disks: "local-disk 250 SSD" + disks: "local-disk 500 SSD" } # ------------------------------------------------ # Outputs: From b2d0ffee476ef0f28143aca59c3087f8acfa5c67 Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Fri, 1 Oct 2021 16:08:51 -0400 Subject: [PATCH 12/26] add duplicate count info --- scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl index 9a2a390ce62..f70721975c2 100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl +++ b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl @@ -83,6 +83,7 @@ workflow GvsSitesOnlyVCF { project_id = project_id, dataset_name = dataset_name, counts_variants = ExtractAnAcAfFromVCF.count_variants, + counts_variant_duplicates = ExtractAnAcAfFromVCF.count_duplicates, table_suffix = table_suffix, service_account_json_path = service_account_json_path, load_jsons_done = BigQueryLoadJson.done @@ -237,7 +238,7 @@ task ExtractAnAcAfFromVCF { ## remove those rows (that match up to the first 5 cols) grep -v -wFf duplicates.tsv ~{normalized_vcf} | grep -v "AC=0;" > deduplicated.vcf - wc -l duplicates.tsv + wc -l duplicates.tsv | awk '{print $1}' > duplicates_count.txt echo "the following duplicate variants will be dropped due to a bug" cat duplicates.tsv @@ -267,6 +268,7 @@ task ExtractAnAcAfFromVCF { output { File annotations_file = "~{custom_annotations_file_name}" Int count_variants = read_int("count.txt") + Int count_duplicates = read_int("duplicates_count.txt") File output_vcf = "~{normalized_vcf_compressed}" File output_vcf_index = "~{normalized_vcf_indexed}" } @@ -623,6 +625,7 @@ task BigQuerySmokeTest { String project_id String dataset_name Array[Int] counts_variants + Array[Int] counts_variant_duplicates String table_suffix String? service_account_json_path Boolean load_jsons_done @@ -650,6 +653,9 @@ task BigQuerySmokeTest { # sum all the initial input variants across the shards INITIAL_VARIANT_COUNT=$(python -c "print(sum([~{sep=', ' counts_variants}]))") + DUPLICATE_VARIANT_COUNT=$(python -c "print(sum([~{sep=', ' counts_variant_duplicates}]))") + + echo "Duplicate variant count: $DUPLICATE_VARIANT_COUNT." # Count number of variants in the VAT bq query --nouse_legacy_sql --project_id=~{project_id} --format=csv 'SELECT COUNT (DISTINCT vid) AS count FROM `~{dataset_name}.~{vat_table}`' > bq_variant_count.csv From d84ad828da7ae4f93f63eca5d04bb0e438b4699d Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Mon, 4 Oct 2021 12:54:55 -0400 Subject: [PATCH 13/26] dont rename things---use the names that already exist! 
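Per-shard outputs are now named after the shard's own input VCF (via WDL's
basename with a suffix argument) rather than a positional index, so outputs
map straight back to their source files. Roughly, with an illustrative shard
name:

    # WDL: String input_vcf_name = basename(input_vcfs[i], ".vcf.gz")
    input_vcf_name=$(basename shard_0007.vcf.gz .vcf.gz)   # -> shard_0007
    echo "${input_vcf_name}_annotated"                     # -> shard_0007_annotated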
--- scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl index f70721975c2..1e4f1350cc2 100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl +++ b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl @@ -3,8 +3,6 @@ workflow GvsSitesOnlyVCF { input { File inputFileofFileNames File inputFileofIndexFileNames - String output_sites_only_file_name - String output_annotated_file_name String project_id String dataset_name File nirvana_data_directory @@ -36,6 +34,7 @@ workflow GvsSitesOnlyVCF { scatter(i in range(length(MakeSubpopulationFiles.input_vcfs)) ) { ## Create a sites-only VCF from the original GVS jointVCF ## Calculate AC/AN/AF for subpopulations and extract them for custom annotations + String input_vcf_name = basename(MakeSubpopulationFiles.input_vcfs[i], ".vcf.gz") call ExtractAnAcAfFromVCF { input: input_vcf = MakeSubpopulationFiles.input_vcfs[i], @@ -51,7 +50,7 @@ workflow GvsSitesOnlyVCF { input: input_vcf = ExtractAnAcAfFromVCF.output_vcf, input_vcf_index = ExtractAnAcAfFromVCF.output_vcf_index, - output_annotated_file_name = "${output_annotated_file_name}_${i}", + output_annotated_file_name = "${input_vcf_name}_annotated", nirvana_data_tar = nirvana_data_directory, custom_annotations_file = ExtractAnAcAfFromVCF.annotations_file, } @@ -59,7 +58,7 @@ workflow GvsSitesOnlyVCF { call PrepAnnotationJson { input: annotation_json = AnnotateVCF.annotation_json, - output_file_suffix = "${i}.json.gz", + output_file_suffix = "${input_vcf_name}.json.gz", output_path = output_path, service_account_json_path = service_account_json_path } From 3d8b151a9ccc2bb8237556d0a5493da6feab536c Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Mon, 4 Oct 2021 14:16:33 -0400 Subject: [PATCH 14/26] drop all the excessive AS_QUALapprox info since it just adds weight --- scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl index 1e4f1350cc2..9922684310b 100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl +++ b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl @@ -248,8 +248,9 @@ task ExtractAnAcAfFromVCF { ### for validation of the pipeline wc -l ~{custom_annotations_file_name} | awk '{print $1 -7}' > count.txt - ### compress the vcf and index it - bcftools view --no-update --drop-genotypes deduplicated.vcf -Oz -o ~{normalized_vcf_compressed} + ### compress the vcf and index it, make it sites-only and drop the AS_QUALapprox field---might be nice to be able to drop all of the INFO fields, since I dont need AC/AN/AF in this form + bcftools view --no-update --drop-genotypes deduplicated.vcf -Oz -o sites_only.vcf.gz + bcftools annotate -x INFO/AS_QUALapprox sites_only.vcf.gz > ~{normalized_vcf_compressed} bcftools index --tbi ~{normalized_vcf_compressed} >>> From 615aa28a90dc5b39ff9983477546cb5d735cc716 Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Mon, 4 Oct 2021 17:40:34 -0400 Subject: [PATCH 15/26] swap gzip --- scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl index 9922684310b..00a067a9b86 100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl +++ 
b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl @@ -250,7 +250,7 @@ task ExtractAnAcAfFromVCF { ### compress the vcf and index it, make it sites-only and drop the AS_QUALapprox field---might be nice to be able to drop all of the INFO fields, since I dont need AC/AN/AF in this form bcftools view --no-update --drop-genotypes deduplicated.vcf -Oz -o sites_only.vcf.gz - bcftools annotate -x INFO/AS_QUALapprox sites_only.vcf.gz > ~{normalized_vcf_compressed} + bcftools annotate -x INFO/AS_QUALapprox sites_only.vcf.gz -Oz -o ~{normalized_vcf_compressed} bcftools index --tbi ~{normalized_vcf_compressed} >>> From 7a12d558530611230c8b93295fc9d0dc694f2887 Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Wed, 6 Oct 2021 16:30:51 -0400 Subject: [PATCH 16/26] check with Lee about VAT - markings --- .../create_variant_annotation_table.py | 24 +++++++++++++------ 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/scripts/variantstore/wdl/extract/create_variant_annotation_table.py b/scripts/variantstore/wdl/extract/create_variant_annotation_table.py index 0097fa18b3d..2b291d31df0 100644 --- a/scripts/variantstore/wdl/extract/create_variant_annotation_table.py +++ b/scripts/variantstore/wdl/extract/create_variant_annotation_table.py @@ -11,13 +11,15 @@ vat_nirvana_positions_dictionary = { "position": "position", # required + "ref_allele": "refAllele", # required + "alt_allele": "altAlleles", # required } vat_nirvana_variants_dictionary = { "vid": "vid", # required "contig": "chromosome", # required - "ref_allele": "refAllele", # required - "alt_allele": "altAllele", # required + # "ref_allele": "refAllele", # required # do we want to grab both to compare? + # "alt_allele": "altAllele", # required "variant_type": "variantType", # required "genomic_location": "hgvsg", # required "dbsnp_rsid": "dbsnp", # nullable @@ -196,9 +198,11 @@ def get_subpopulation_calculations(subpop_annotations): row["gvs_max_sc"] = max_sc return row -def make_annotated_json_row(row_position, variant_line, transcript_line): +def make_annotated_json_row(row_position, row_ref, row_alt, variant_line, transcript_line): row = {} row["position"] = row_position # this is a required field + row["ref_allele"] = row_ref # this is a required field + row["alt_allele"] = row_alt # this is a required field for vat_variants_fieldname in vat_nirvana_variants_dictionary.keys(): # like "contig" nirvana_variants_fieldname = vat_nirvana_variants_dictionary.get(vat_variants_fieldname) @@ -287,17 +291,23 @@ def make_positions_json(annotated_json, output_json): for p in positions: position=p['position'] + refAllele=p['refAllele'] # ref_allele + altAlleles=p['altAlleles'] # this is an array that we need to split into each alt_allele variants=p['variants'] + if '*' in altAlleles: + altAlleles.remove('*') + ## there should be the same number of altAlleles as variants, right? 
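+            ## Nirvana emits one entry in 'variants' per alt allele at this position, so after dropping the spanning-deletion '*' allele the two arrays are expected to pair up by index (guarded by the assert below)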
+ assert len(altAlleles)==len(variants) # row for each variant - transcript # so let's start with each variant - for variant in variants: + for index, variant in enumerate(variants): # check if it's a row we care about if check_filtering(variant) is False: continue # remember that we want one for each variant-transcript and variant-null for variants without transcripts if variant.get("transcripts") == None: # then we make a special row - row = make_annotated_json_row(position, variant, None) + row = make_annotated_json_row(position, refAllele, altAlleles[index], variant, None) json_str = json.dumps(row) + "\n" json_bytes = json_str.encode('utf-8') output_file.write(json_bytes) @@ -308,13 +318,13 @@ def make_positions_json(annotated_json, output_json): if "Ensembl" in sources: for transcript in transcript_lines: if transcript.get('source') == "Ensembl": - row = make_annotated_json_row(position, variant, transcript) + row = make_annotated_json_row(position, refAllele, altAlleles[index], variant, transcript) json_str = json.dumps(row) + "\n" json_bytes = json_str.encode('utf-8') output_file.write(json_bytes) else: # if there are transcripts, but they are not Ensembl, we now only want one row in the VAT, not one row per transcript - row = make_annotated_json_row(position, variant, None) + row = make_annotated_json_row(position, refAllele, altAlleles[index], variant, None) json_str = json.dumps(row) + "\n" json_bytes = json_str.encode('utf-8') output_file.write(json_bytes) From a815a869e3833242b9375182bc55633057e20a7f Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Wed, 6 Oct 2021 16:31:34 -0400 Subject: [PATCH 17/26] template update --- scripts/variantstore/wdl/GvsSitesOnlyVCF.example.inputs.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.example.inputs.json b/scripts/variantstore/wdl/GvsSitesOnlyVCF.example.inputs.json index 2061690b382..7e3f9f130b9 100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.example.inputs.json +++ b/scripts/variantstore/wdl/GvsSitesOnlyVCF.example.inputs.json @@ -12,5 +12,6 @@ "GvsSitesOnlyVCF.project_id": "spec-ops-aou", "GvsSitesOnlyVCF.dataset_name": "anvil_100_for_testing", "GvsSitesOnlyVCF.AnAcAf_annotations_template": "gs://broad-dsp-spec-ops/scratch/rcremer/Nirvana/custom_annotations_template.tsv", - "GvsSitesOnlyVCF.ancestry_file": "gs://broad-dsp-spec-ops/scratch/rcremer/Nirvana/subpop.tsv" + "GvsSitesOnlyVCF.ancestry_file": "gs://broad-dsp-spec-ops/scratch/rcremer/Nirvana/subpop.tsv", + "GvsSitesOnlyVCF.reference": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta" } From 55d5636d2ff19f0a68359adfc67230d93551ab4e Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Wed, 6 Oct 2021 16:38:08 -0400 Subject: [PATCH 18/26] add in line counts for metrics --- scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl | 31 +++++++++++++------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl index 00a067a9b86..806b9479807 100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl +++ b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl @@ -224,33 +224,44 @@ task ExtractAnAcAfFromVCF { #] - ## normalize and filter out the filtered sites and FT filtered out variants ## do we need: - bcftools norm -m-any ~{local_input_vcf} | \ - bcftools norm --check-ref w -f Homo_sapiens_assembly38.fasta | \ - bcftools view -f 
'PASS,.' | \ - bcftools filter -i "FORMAT/FT='PASS'" --set-GTs . > ~{normalized_vcf} + wc -l ~{local_input_vcf} + + ## filter out sites with too many alt alleles + bcftools view -e 'N_ALT>500' --no-update ~{local_input_vcf} > test1.vcf + wc -l test1.vcf + + ## filter out the non-passing sites + bcftools view -f 'PASS,.' --no-update test1.vcf | \ + ## normalize, left align and split multi allelic sites to new lines, remove duplicate lines + bcftools norm -m- --check-ref w -f Homo_sapiens_assembly38.fasta | \ + ## filter out spanning deletions and variants with an AC of 0 + bcftools view -e 'ALT[0]="*"' -e 'AC=0' --no-update | \ + ## ensure that we respect the FT tag + bcftools filter -i "FORMAT/FT='PASS,.'" --set-GTs . > ~{normalized_vcf} + wc -l ~{normalized_vcf} ## make a file of just the first 5 columns of the tsv bcftools query ~{normalized_vcf} -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\n' > check_duplicates.tsv ## check it for duplicates and put them in a new file sort check_duplicates.tsv | uniq -d | cut -f1,2,3,4,5 > duplicates.tsv ## remove those rows (that match up to the first 5 cols) - grep -v -wFf duplicates.tsv ~{normalized_vcf} | grep -v "AC=0;" > deduplicated.vcf + grep -v -wFf duplicates.tsv ~{normalized_vcf} > deduplicated.vcf wc -l duplicates.tsv | awk '{print $1}' > duplicates_count.txt echo "the following duplicate variants will be dropped due to a bug" cat duplicates.tsv + ## calculate annotations for all subpopulations bcftools plugin fill-tags -- deduplicated.vcf -S ~{subpopulation_sample_list} -t AC,AF,AN,AC_het,AC_hom | bcftools query -f \ '%CHROM\t%POS\t%REF\t%ALT\t%AC\t%AN\t%AF\t%AC_Hom\t%AC_Het\t%AC_afr\t%AN_afr\t%AF_afr\t%AC_Hom_afr\t%AC_Het_afr\t%AC_amr\t%AN_amr\t%AF_amr\t%AC_Hom_amr\t%AC_Het_amr\t%AC_eas\t%AN_eas\t%AF_eas\t%AC_Hom_eas\t%AC_Het_eas\t%AC_eur\t%AN_eur\t%AF_eur\t%AC_Hom_eur\t%AC_Het_eur\t%AC_mid\t%AN_mid\t%AF_mid\t%AC_Hom_mid\t%AC_Het_mid\t%AC_oth\t%AN_oth\t%AF_oth\t%AC_Hom_oth\t%AC_Het_oth\t%AC_sas\t%AN_sas\t%AF_sas\t%AC_Hom_sas\t%AC_Het_sas\n' \ - | grep -v "*" >> ~{custom_annotations_file_name} + >> ~{custom_annotations_file_name} ### for validation of the pipeline wc -l ~{custom_annotations_file_name} | awk '{print $1 -7}' > count.txt - ### compress the vcf and index it, make it sites-only and drop the AS_QUALapprox field---might be nice to be able to drop all of the INFO fields, since I dont need AC/AN/AF in this form - bcftools view --no-update --drop-genotypes deduplicated.vcf -Oz -o sites_only.vcf.gz - bcftools annotate -x INFO/AS_QUALapprox sites_only.vcf.gz -Oz -o ~{normalized_vcf_compressed} + ### compress the vcf and index it, make it sites-only + bcftools view --no-update --drop-genotypes deduplicated.vcf -Oz -o ~{normalized_vcf_compressed} + ##bcftools annotate -x INFO Is this overkill? 
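## (stripping the entire INFO block would shrink the shard further; the annotation step only needs CHROM/POS/REF/ALT, since AC/AN/AF travel in the custom annotations TSV)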
bcftools index --tbi ~{normalized_vcf_compressed} >>> From 7a14fb2e87cbaff7615e03ddd9e92448f765ef04 Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Wed, 6 Oct 2021 21:57:03 -0400 Subject: [PATCH 19/26] get the count of high alt allele sites --- scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl index 806b9479807..0ff1b1deac1 100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl +++ b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl @@ -223,22 +223,19 @@ task ExtractAnAcAfFromVCF { # "sas" #] - - wc -l ~{local_input_vcf} + ## get the number of +500 alt alleles for Lee + bcftools view -i 'N_ALT>500' --no-update --no-header ~{local_input_vcf} | wc -l ## filter out sites with too many alt alleles - bcftools view -e 'N_ALT>500' --no-update ~{local_input_vcf} > test1.vcf - wc -l test1.vcf - + bcftools view -e 'N_ALT>500' --no-update ~{local_input_vcf} | \ ## filter out the non-passing sites - bcftools view -f 'PASS,.' --no-update test1.vcf | \ + bcftools view -f 'PASS,.' --no-update | \ ## normalize, left align and split multi allelic sites to new lines, remove duplicate lines bcftools norm -m- --check-ref w -f Homo_sapiens_assembly38.fasta | \ ## filter out spanning deletions and variants with an AC of 0 - bcftools view -e 'ALT[0]="*"' -e 'AC=0' --no-update | \ + bcftools view -e 'ALT[0]="*" || AC=0' --no-update | \ ## ensure that we respect the FT tag bcftools filter -i "FORMAT/FT='PASS,.'" --set-GTs . > ~{normalized_vcf} - wc -l ~{normalized_vcf} ## make a file of just the first 5 columns of the tsv bcftools query ~{normalized_vcf} -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\n' > check_duplicates.tsv @@ -254,7 +251,7 @@ task ExtractAnAcAfFromVCF { ## calculate annotations for all subpopulations bcftools plugin fill-tags -- deduplicated.vcf -S ~{subpopulation_sample_list} -t AC,AF,AN,AC_het,AC_hom | bcftools query -f \ '%CHROM\t%POS\t%REF\t%ALT\t%AC\t%AN\t%AF\t%AC_Hom\t%AC_Het\t%AC_afr\t%AN_afr\t%AF_afr\t%AC_Hom_afr\t%AC_Het_afr\t%AC_amr\t%AN_amr\t%AF_amr\t%AC_Hom_amr\t%AC_Het_amr\t%AC_eas\t%AN_eas\t%AF_eas\t%AC_Hom_eas\t%AC_Het_eas\t%AC_eur\t%AN_eur\t%AF_eur\t%AC_Hom_eur\t%AC_Het_eur\t%AC_mid\t%AN_mid\t%AF_mid\t%AC_Hom_mid\t%AC_Het_mid\t%AC_oth\t%AN_oth\t%AF_oth\t%AC_Hom_oth\t%AC_Het_oth\t%AC_sas\t%AN_sas\t%AF_sas\t%AC_Hom_sas\t%AC_Het_sas\n' \ - >> ~{custom_annotations_file_name} + | grep -v "*" >> ~{custom_annotations_file_name} ### for validation of the pipeline wc -l ~{custom_annotations_file_name} | awk '{print $1 -7}' > count.txt From 5d8f1ed8357a41ca41af5434f68bdb28749b90ff Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Thu, 7 Oct 2021 11:17:49 -0400 Subject: [PATCH 20/26] AC_hemi for all --- .../custom_annotations_template.tsv | 8 ++++---- scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl | 6 +++--- .../wdl/extract/create_variant_annotation_table.py | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/scripts/variantstore/variant_annotations_table/custom_annotations_template.tsv b/scripts/variantstore/variant_annotations_table/custom_annotations_template.tsv index 7a5ba70c823..a578ea9b65f 100644 --- a/scripts/variantstore/variant_annotations_table/custom_annotations_template.tsv +++ b/scripts/variantstore/variant_annotations_table/custom_annotations_template.tsv @@ -1,7 +1,7 @@ #title=gvsAnnotations #assembly=GRCh38 #matchVariantsBy=allele 
-#CHROM POS REF ALT AC AN AF AC_Hom AC_Het AC_afr AN_afr AF_afr AC_Hom_afr AC_Het_afr AC_amr AN_amr AF_amr AC_Hom_amr AC_Het_amr AC_eas AN_eas AF_eas AC_Hom_eas AC_Het_eas AC_eur AN_eur AF_eur AC_Hom_eur AC_Het_eur AC_mid AN_mid AF_mid AC_Hom_mid AC_Het_mid AC_oth AN_oth AF_oth AC_Hom_oth AC_Het_oth AC_sas AN_sas AF_sas AC_Hom_sas AC_Het_sas -#categories . . . AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount -#descriptions . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . -#type . . . number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number +#CHROM POS REF ALT AC AN AF AC_Hom AC_Het AC_Hemi AC_afr AN_afr AF_afr AC_Hom_afr AC_Het_afr AC_Hemi_afr AC_amr AN_amr AF_amr AC_Hom_amr AC_Het_amr AC_Hemi_amr AC_eas AN_eas AF_eas AC_Hom_eas AC_Het_eas AC_Hemi_eas AC_eur AN_eur AF_eur AC_Hom_eur AC_Het_eur AC_Hemi_eur AC_mid AN_mid AF_mid AC_Hom_mid AC_Het_mid AC_Hemi_mid AC_oth AN_oth AF_oth AC_Hom_oth AC_Het_oth AC_Hemi_oth AC_sas AN_sas AF_sas AC_Hom_sas AC_Het_sas AC_Hemi_sas +#categories . . . AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount AlleleCount AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount AlleleCount AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount AlleleCount AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount AlleleCount AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount AlleleCount AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount AlleleCount AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount AlleleCount AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount AlleleCount +#descriptions . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . +#type . . . 
number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl index 0ff1b1deac1..10a260b9a8a 100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl +++ b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl @@ -249,9 +249,9 @@ task ExtractAnAcAfFromVCF { cat duplicates.tsv ## calculate annotations for all subpopulations - bcftools plugin fill-tags -- deduplicated.vcf -S ~{subpopulation_sample_list} -t AC,AF,AN,AC_het,AC_hom | bcftools query -f \ - '%CHROM\t%POS\t%REF\t%ALT\t%AC\t%AN\t%AF\t%AC_Hom\t%AC_Het\t%AC_afr\t%AN_afr\t%AF_afr\t%AC_Hom_afr\t%AC_Het_afr\t%AC_amr\t%AN_amr\t%AF_amr\t%AC_Hom_amr\t%AC_Het_amr\t%AC_eas\t%AN_eas\t%AF_eas\t%AC_Hom_eas\t%AC_Het_eas\t%AC_eur\t%AN_eur\t%AF_eur\t%AC_Hom_eur\t%AC_Het_eur\t%AC_mid\t%AN_mid\t%AF_mid\t%AC_Hom_mid\t%AC_Het_mid\t%AC_oth\t%AN_oth\t%AF_oth\t%AC_Hom_oth\t%AC_Het_oth\t%AC_sas\t%AN_sas\t%AF_sas\t%AC_Hom_sas\t%AC_Het_sas\n' \ - | grep -v "*" >> ~{custom_annotations_file_name} + bcftools plugin fill-tags -- deduplicated.vcf -S ~{subpopulation_sample_list} -t AC,AF,AN,AC_het,AC_hom,AC_Hemi | bcftools query -f \ + '%CHROM\t%POS\t%REF\t%ALT\t%AC\t%AN\t%AF\t%AC_Hom\t%AC_Het\t%AC_Hemi\t%AC_afr\t%AN_afr\t%AF_afr\t%AC_Hom_afr\t%AC_Het_afr\t%AC_Hemi_afr\t%AC_amr\t%AN_amr\t%AF_amr\t%AC_Hom_amr\t%AC_Het_amr\t%AC_Hemi_amr\t%AC_eas\t%AN_eas\t%AF_eas\t%AC_Hom_eas\t%AC_Het_eas\t%AC_Hemi_eas\t%AC_eur\t%AN_eur\t%AF_eur\t%AC_Hom_eur\t%AC_Het_eur\t%AC_Hemi_eur\t%AC_mid\t%AN_mid\t%AF_mid\t%AC_Hom_mid\t%AC_Het_mid\t%AC_Hemi_mid\t%AC_oth\t%AN_oth\t%AF_oth\t%AC_Hom_oth\t%AC_Het_oth\t%AC_Hemi_oth\t%AC_sas\t%AN_sas\t%AF_sas\t%AC_Hom_sas\t%AC_Het_sas\t%AC_Hemi_sas\n' \ + >> ~{custom_annotations_file_name} ### for validation of the pipeline wc -l ~{custom_annotations_file_name} | awk '{print $1 -7}' > count.txt diff --git a/scripts/variantstore/wdl/extract/create_variant_annotation_table.py b/scripts/variantstore/wdl/extract/create_variant_annotation_table.py index 2b291d31df0..7f0f443462f 100644 --- a/scripts/variantstore/wdl/extract/create_variant_annotation_table.py +++ b/scripts/variantstore/wdl/extract/create_variant_annotation_table.py @@ -178,7 +178,7 @@ def get_subpopulation_calculations(subpop_annotations): subpop_an_val = subpop_annotations.get("_".join(["AN", gvs_subpop])) subpop_af_val = subpop_annotations.get("_".join(["AF", gvs_subpop])) # note the assumption is made that AC_Hom must be even because by it's nature it means there are two, but there could be an error - subpop_sc_val = int(subpop_annotations.get("_".join(["AC_Hom", gvs_subpop])) / 2 ) + subpop_annotations.get("_".join(["AC_Het", gvs_subpop])) + subpop_sc_val = int(subpop_annotations.get("_".join(["AC_Hom", gvs_subpop])) / 2 ) + subpop_annotations.get("_".join(["AC_Het", gvs_subpop])) + subpop_annotations.get("_".join(["AC_Hemi", gvs_subpop])) # here we set the subpopulation ac/an/af values row["_".join(["gvs", gvs_subpop, "ac"])] = subpop_ac_val row["_".join(["gvs", gvs_subpop, "an"])] = subpop_an_val From f57707adafe682b7a7699bc4a65238339f3314c5 Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Thu, 7 Oct 2021 22:20:27 -0400 Subject: [PATCH 21/26] add new docker image --- 
scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl index 10a260b9a8a..f8f6849c3bc 100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl +++ b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl @@ -151,7 +151,7 @@ task MakeSubpopulationFiles { # ------------------------------------------------ # Runtime settings: runtime { - docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_20210916" + docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_20211007" memory: "1 GB" preemptible: 3 cpu: "1" @@ -265,7 +265,7 @@ task ExtractAnAcAfFromVCF { # ------------------------------------------------ # Runtime settings: runtime { - docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_20210916" + docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_20211007" memory: "32 GB" preemptible: 3 cpu: "2" @@ -395,7 +395,7 @@ task PrepAnnotationJson { # ------------------------------------------------ # Runtime settings: runtime { - docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_20210916" + docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_20211007" memory: "8 GB" preemptible: 5 cpu: "1" From c853b63896cf71c9b72dddd0536e21624d75d0f8 Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Thu, 7 Oct 2021 23:18:59 -0400 Subject: [PATCH 22/26] get sample counts --- scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl index f8f6849c3bc..e7de7ed0bb9 100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl +++ b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl @@ -211,7 +211,11 @@ task ExtractAnAcAfFromVCF { gsutil cp ~{ref} Homo_sapiens_assembly38.fasta # TODO Compare the ancestry list with the sample list and throw an error (but dont fail the job) if there are samples that are in one, but not the other. Two different errors. - # awk '{print $2}' ~{subpopulation_sample_list} | tail -n +2 | sort -u > collected_subpopulations.txt + #awk '{print $2}' ~{subpopulation_sample_list} | tail -n +2 | sort -u > collected_subpopulations.txt + #bcftools query --list-samples ~{local_input_vcf} | sort -u > collected_samples.txt + + #wc -l collected_subpopulations.txt + #wc -l collected_samples.txt # expected_subpopulations = [ # "afr", From 7f0864b2e717728fbf66941307fc6aeebf99c47e Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Tue, 12 Oct 2021 22:18:26 -0400 Subject: [PATCH 23/26] clean up files for memory preservation --- scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl index e7de7ed0bb9..3502fa90be0 100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl +++ b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl @@ -240,13 +240,19 @@ task ExtractAnAcAfFromVCF { bcftools view -e 'ALT[0]="*" || AC=0' --no-update | \ ## ensure that we respect the FT tag bcftools filter -i "FORMAT/FT='PASS,.'" --set-GTs . 
> ~{normalized_vcf} + du -h ~{normalized_vcf} + + ## clean up + rm ~{local_input_vcf} ## make a file of just the first 5 columns of the tsv bcftools query ~{normalized_vcf} -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\n' > check_duplicates.tsv ## check it for duplicates and put them in a new file sort check_duplicates.tsv | uniq -d | cut -f1,2,3,4,5 > duplicates.tsv + rm check_duplicates.tsv ## clean up ## remove those rows (that match up to the first 5 cols) grep -v -wFf duplicates.tsv ~{normalized_vcf} > deduplicated.vcf + rm ~{normalized_vcf} ## clean up wc -l duplicates.tsv | awk '{print $1}' > duplicates_count.txt echo "the following duplicate variants will be dropped due to a bug" From 27ca1644f3715e88d1564c8e05b6d872d8bd7623 Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Fri, 15 Oct 2021 15:35:46 -0400 Subject: [PATCH 24/26] re-order errors --- .../wdl/extract/create_variant_annotation_table.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/variantstore/wdl/extract/create_variant_annotation_table.py b/scripts/variantstore/wdl/extract/create_variant_annotation_table.py index 7f0f443462f..e305441a954 100644 --- a/scripts/variantstore/wdl/extract/create_variant_annotation_table.py +++ b/scripts/variantstore/wdl/extract/create_variant_annotation_table.py @@ -120,6 +120,10 @@ def check_filtering(variant): if variant.get("gvsAnnotations") == None: # <-- enum since we need this to be in tandem with the custom annotations header / template print("WARNING: There has been an error in creating custom annotations for AC/AF/AN", variant.get("vid")) return False + # skip any row (with a warning) if the AC value is 0 + elif variant["gvsAnnotations"].get("AC") == 0: + print("WARNING: Its AC is 0 so we are dropping this variant", variant.get("vid")) + return False # skip any row (with a warning) if AC, AN or AF is missing elif variant["gvsAnnotations"].get("AC") == None: print("WARNING: There has been an error-- there is no AC value---should AN be 0 for this variant?", variant.get("vid")) @@ -130,10 +134,6 @@ def check_filtering(variant): elif variant["gvsAnnotations"].get("AF") == None: print("WARNING: There has been an error-- there is an AC value---but no AF value", variant.get("vid")) return False - # skip any row (with a warning) if the AC value is 0 - elif variant["gvsAnnotations"].get("AC") == 0: - print("WARNING: Its AC is 0 so we are dropping this variant", variant.get("vid")) - return False else: return True From 2602a5655770757d22740149256ec38518e61a78 Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Wed, 20 Oct 2021 11:46:14 -0400 Subject: [PATCH 25/26] pass dropped variants --- scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl | 35 ++++++++++++-------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl index 3502fa90be0..244a3ce3f5a 100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl +++ b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl @@ -82,7 +82,7 @@ workflow GvsSitesOnlyVCF { project_id = project_id, dataset_name = dataset_name, counts_variants = ExtractAnAcAfFromVCF.count_variants, - counts_variant_duplicates = ExtractAnAcAfFromVCF.count_duplicates, + track_dropped_variants = ExtractAnAcAfFromVCF.track_dropped, table_suffix = table_suffix, service_account_json_path = service_account_json_path, load_jsons_done = BigQueryLoadJson.done @@ -227,11 +227,14 @@ task 
ExtractAnAcAfFromVCF { # "sas" #] - ## get the number of +500 alt alleles for Lee - bcftools view -i 'N_ALT>500' --no-update --no-header ~{local_input_vcf} | wc -l + ## track the dropped variants with +500 alt alleles or N's in the reference + bcftools view -i 'N_ALT>500 || REF~"N"' ~{local_input_vcf} | bcftools query -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\n' > track_dropped.tsv + + ## Since Nirvana cant handle N as a base, drop them for now and keep track of the lines dropped + ## bcftools view -e 'REF~"N"' ~{local_input_vcf} ## filter out sites with too many alt alleles - bcftools view -e 'N_ALT>500' --no-update ~{local_input_vcf} | \ + bcftools view -e 'N_ALT>500 || REF~"N"' --no-update ~{local_input_vcf} | \ ## filter out the non-passing sites bcftools view -f 'PASS,.' --no-update | \ ## normalize, left align and split multi allelic sites to new lines, remove duplicate lines @@ -240,6 +243,7 @@ task ExtractAnAcAfFromVCF { bcftools view -e 'ALT[0]="*" || AC=0' --no-update | \ ## ensure that we respect the FT tag bcftools filter -i "FORMAT/FT='PASS,.'" --set-GTs . > ~{normalized_vcf} + du -h ~{normalized_vcf} ## clean up @@ -254,21 +258,23 @@ task ExtractAnAcAfFromVCF { grep -v -wFf duplicates.tsv ~{normalized_vcf} > deduplicated.vcf rm ~{normalized_vcf} ## clean up - wc -l duplicates.tsv | awk '{print $1}' > duplicates_count.txt echo "the following duplicate variants will be dropped due to a bug" - cat duplicates.tsv + cat duplicates.tsv >> track_dropped.tsv + cat track_dropped.tsv + + rm duplicates.tsv ## clean up ## calculate annotations for all subpopulations bcftools plugin fill-tags -- deduplicated.vcf -S ~{subpopulation_sample_list} -t AC,AF,AN,AC_het,AC_hom,AC_Hemi | bcftools query -f \ '%CHROM\t%POS\t%REF\t%ALT\t%AC\t%AN\t%AF\t%AC_Hom\t%AC_Het\t%AC_Hemi\t%AC_afr\t%AN_afr\t%AF_afr\t%AC_Hom_afr\t%AC_Het_afr\t%AC_Hemi_afr\t%AC_amr\t%AN_amr\t%AF_amr\t%AC_Hom_amr\t%AC_Het_amr\t%AC_Hemi_amr\t%AC_eas\t%AN_eas\t%AF_eas\t%AC_Hom_eas\t%AC_Het_eas\t%AC_Hemi_eas\t%AC_eur\t%AN_eur\t%AF_eur\t%AC_Hom_eur\t%AC_Het_eur\t%AC_Hemi_eur\t%AC_mid\t%AN_mid\t%AF_mid\t%AC_Hom_mid\t%AC_Het_mid\t%AC_Hemi_mid\t%AC_oth\t%AN_oth\t%AF_oth\t%AC_Hom_oth\t%AC_Het_oth\t%AC_Hemi_oth\t%AC_sas\t%AN_sas\t%AF_sas\t%AC_Hom_sas\t%AC_Het_sas\t%AC_Hemi_sas\n' \ >> ~{custom_annotations_file_name} - ### for validation of the pipeline + ## for validation of the pipeline wc -l ~{custom_annotations_file_name} | awk '{print $1 -7}' > count.txt - ### compress the vcf and index it, make it sites-only + ## compress the vcf and index it, make it sites-only bcftools view --no-update --drop-genotypes deduplicated.vcf -Oz -o ~{normalized_vcf_compressed} - ##bcftools annotate -x INFO Is this overkill? + ## bcftools annotate -x INFO Is this overkill? bcftools index --tbi ~{normalized_vcf_compressed} >>> @@ -286,7 +292,7 @@ task ExtractAnAcAfFromVCF { output { File annotations_file = "~{custom_annotations_file_name}" Int count_variants = read_int("count.txt") - Int count_duplicates = read_int("duplicates_count.txt") + File track_dropped = "track_dropped.tsv" File output_vcf = "~{normalized_vcf_compressed}" File output_vcf_index = "~{normalized_vcf_indexed}" } @@ -643,7 +649,7 @@ task BigQuerySmokeTest { String project_id String dataset_name Array[Int] counts_variants - Array[Int] counts_variant_duplicates + Array[File] track_dropped_variants String table_suffix String? 
service_account_json_path Boolean load_jsons_done @@ -671,9 +677,12 @@ task BigQuerySmokeTest { # sum all the initial input variants across the shards INITIAL_VARIANT_COUNT=$(python -c "print(sum([~{sep=', ' counts_variants}]))") - DUPLICATE_VARIANT_COUNT=$(python -c "print(sum([~{sep=', ' counts_variant_duplicates}]))") + ## here a list of files: ~{sep=', ' track_dropped_variants} + ## I need to get the lines from each file + awk 'FNR==1{print ""}1' ~{sep=' ' track_dropped_variants} > dropped_variants.txt - echo "Duplicate variant count: $DUPLICATE_VARIANT_COUNT." + echo "Dropped variants:" + cat dropped_variants.txt # Count number of variants in the VAT bq query --nouse_legacy_sql --project_id=~{project_id} --format=csv 'SELECT COUNT (DISTINCT vid) AS count FROM `~{dataset_name}.~{vat_table}`' > bq_variant_count.csv From 5284deeaf08998d166b937e65b1f85dfb4d75e4b Mon Sep 17 00:00:00 2001 From: rori <6863459+RoriCremer@users.noreply.github.com> Date: Tue, 26 Oct 2021 23:09:21 -0400 Subject: [PATCH 26/26] comments cleanup --- scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl | 32 ++++++++------------ 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl index 244a3ce3f5a..c4921f86684 100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl +++ b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl @@ -193,7 +193,8 @@ task ExtractAnAcAfFromVCF { String normalized_vcf_compressed = "normalized.vcf.gz" String normalized_vcf_indexed = "normalized.vcf.gz.tbi" - # separate multi-allelic sites into their own lines, remove deletions and extract the an/ac/af & sc + # separate multi-allelic sites into their own lines, remove deletions and filtered sites and make a sites only vcf + # while extracting the an/ac/af & sc by subpopulation into a tsv command <<< set -e @@ -210,12 +211,12 @@ task ExtractAnAcAfFromVCF { gsutil cp ~{input_vcf_index} ~{local_input_vcf_index} gsutil cp ~{ref} Homo_sapiens_assembly38.fasta - # TODO Compare the ancestry list with the sample list and throw an error (but dont fail the job) if there are samples that are in one, but not the other. Two different errors. - #awk '{print $2}' ~{subpopulation_sample_list} | tail -n +2 | sort -u > collected_subpopulations.txt - #bcftools query --list-samples ~{local_input_vcf} | sort -u > collected_samples.txt - - #wc -l collected_subpopulations.txt - #wc -l collected_samples.txt + ## compare the ancestry sample list with the vcf sample list + # TODO throw an error (but dont fail the job) if there are samples that are in one, but not the other. Throw two different errors. 
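# (one error for ancestry-file samples missing from the VCF, another for VCF samples missing from the ancestry file)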
+ # Currently commented out to save on io with the AoU beta VAT creation + # awk '{print $1}' ~{subpopulation_sample_list} | tail -n +2 | sort -u > collected_subpopulation_samples.txt + # bcftools query --list-samples ~{local_input_vcf} | sort -u > collected_samples.txt + # diff collected_subpopulation_samples.txt collected_samples.txt # expected_subpopulations = [ # "afr", @@ -227,12 +228,9 @@ task ExtractAnAcAfFromVCF { # "sas" #] - ## track the dropped variants with +500 alt alleles or N's in the reference + ## track the dropped variants with +500 alt alleles or N's in the reference (Since Nirvana cant handle N as a base, drop them for now) bcftools view -i 'N_ALT>500 || REF~"N"' ~{local_input_vcf} | bcftools query -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\n' > track_dropped.tsv - ## Since Nirvana cant handle N as a base, drop them for now and keep track of the lines dropped - ## bcftools view -e 'REF~"N"' ~{local_input_vcf} - ## filter out sites with too many alt alleles bcftools view -e 'N_ALT>500 || REF~"N"' --no-update ~{local_input_vcf} | \ ## filter out the non-passing sites @@ -244,9 +242,7 @@ task ExtractAnAcAfFromVCF { ## ensure that we respect the FT tag bcftools filter -i "FORMAT/FT='PASS,.'" --set-GTs . > ~{normalized_vcf} - du -h ~{normalized_vcf} - - ## clean up + ## clean up unneeded file rm ~{local_input_vcf} ## make a file of just the first 5 columns of the tsv @@ -258,11 +254,9 @@ task ExtractAnAcAfFromVCF { grep -v -wFf duplicates.tsv ~{normalized_vcf} > deduplicated.vcf rm ~{normalized_vcf} ## clean up - echo "the following duplicate variants will be dropped due to a bug" + ## add duplicates to the file tracking dropped variants cat duplicates.tsv >> track_dropped.tsv - cat track_dropped.tsv - - rm duplicates.tsv ## clean up + rm duplicates.tsv ## clean up unneeded file ## calculate annotations for all subpopulations bcftools plugin fill-tags -- deduplicated.vcf -S ~{subpopulation_sample_list} -t AC,AF,AN,AC_het,AC_hom,AC_Hemi | bcftools query -f \ @@ -274,7 +268,7 @@ task ExtractAnAcAfFromVCF { ## compress the vcf and index it, make it sites-only bcftools view --no-update --drop-genotypes deduplicated.vcf -Oz -o ~{normalized_vcf_compressed} - ## bcftools annotate -x INFO Is this overkill? + ## if we can spare the IO and want to pass a smaller file we can also drop the info field w bcftools annotate -x INFO bcftools index --tbi ~{normalized_vcf_compressed} >>>
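
A note on the awk one-liner added to BigQuerySmokeTest in PATCH 25:
'FNR==1{print ""}' fires on the first record of each input file (FNR resets
per file) and the trailing 1 prints every record, so the per-shard
dropped-variant files are concatenated with a blank-line separator before
each one. A quick check with hypothetical shard files:

    printf 'chr1\t100\trs1\tA\tT\n' > shard0_dropped.tsv
    printf 'chr2\t300\trs2\tG\tC\n' > shard1_dropped.tsv
    awk 'FNR==1{print ""}1' shard0_dropped.tsv shard1_dropped.tsv > dropped_variants.txt
    # dropped_variants.txt: blank line, shard0 rows, blank line, shard1 rows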