pr review part 2

broadinstitute · Jul 14, 2021 · 4656c46 · 4656c46
1 parent 56eaa65
commit 4656c46
Show file tree

Hide file tree

Showing 5 changed files with 243 additions and 20 deletions.
diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.example.inputs.json b/scripts/variantstore/wdl/GvsSitesOnlyVCF.example.inputs.json
@@ -4,12 +4,11 @@
   "GvsSitesOnlyVCF.output_sites_only_file_name": "hello_did_I_sites_only",
   "GvsSitesOnlyVCF.output_annotated_file_name": "hello_did_I_annotate",
   "GvsSitesOnlyVCF.nirvana_data_directory": "gs://broad-dsp-spec-ops/scratch/rcremer/Nirvana/NirvanaData.tar.gz",
-  "GvsSitesOnlyVCF.nirvana_schema_json_file": "gs://broad-dsp-spec-ops/scratch/rcremer/Nirvana/schemas/vat_schema.json",
-  "GvsSitesOnlyVCF.vat_vt_schema_json_file": "gs://broad-dsp-spec-ops/scratch/rcremer/Nirvana/schemas/vt_schema.json",
-  "GvsSitesOnlyVCF.vat_genes_schema_json_file": "gs://broad-dsp-spec-ops/scratch/rcremer/Nirvana/schemas/genes_schema.json",
-  "GvsSitesOnlyVCF.output_path": "gs://broad-dsp-spec-ops/scratch/rcremer/Nirvana/output/jul6/",
-  "GvsSitesOnlyVCF.table_id": "jul6",
+  "GvsSitesOnlyVCF.vat_schema_json_file": "gs://broad-dsp-spec-ops/scratch/rcremer/Nirvana/schemas/vat_schema.json",
+  "GvsSitesOnlyVCF.variant_transcript_schema_json_file": "gs://broad-dsp-spec-ops/scratch/rcremer/Nirvana/schemas/vt_schema.json",
+  "GvsSitesOnlyVCF.genes_schema_json_file": "gs://broad-dsp-spec-ops/scratch/rcremer/Nirvana/schemas/genes_schema.json",
+  "GvsSitesOnlyVCF.output_path": "gs://broad-dsp-spec-ops/scratch/rcremer/Nirvana/output/jul13/",
+  "GvsSitesOnlyVCF.table_suffix": "jul13",
   "GvsSitesOnlyVCF.project_id": "spec-ops-aou",
-  "GvsSitesOnlyVCF.dataset_name": "anvil_100_for_testing",
-  "GvsSitesOnlyVCF.service_account_json": "gs://fc-secure-e91afe60-ca52-48fa-b4a9-c76fed5a0449/keys/aou_wgs_vumc_prod.json"
+  "GvsSitesOnlyVCF.dataset_name": "anvil_100_for_testing"
 }
diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl
@@ -8,9 +8,9 @@ workflow GvsSitesOnlyVCF {
         String project_id
         String dataset_name
         File nirvana_data_directory
-        File nirvana_schema_json_file
-        File vat_vt_schema_json_file
-        File vat_genes_schema_json_file
+        File vat_schema_json_file
+        File variant_transcript_schema_json_file
+        File genes_schema_json_file
         String output_path
         String table_suffix
 
@@ -38,16 +38,16 @@ workflow GvsSitesOnlyVCF {
        call PrepAnnotationJson {
          input:
            annotation_json = AnnotateShardedVCF.annotation_json,
-           output_name = "${i}.json.gz",
+           output_file_suffix = "${i}.json.gz",
            output_path = output_path,
            service_account_json = service_account_json,
        }
     }
      call BigQueryLoadJson {
          input:
-             nirvana_schema = nirvana_schema_json_file,
-             vt_schema = vat_vt_schema_json_file,
-             genes_schema = vat_genes_schema_json_file,
+             nirvana_schema = vat_schema_json_file,
+             vt_schema = variant_transcript_schema_json_file,
+             genes_schema = genes_schema_json_file,
              project_id = project_id,
              dataset_name = dataset_name,
              output_path = output_path,
@@ -185,16 +185,15 @@ task AnnotateShardedVCF {
 task PrepAnnotationJson {
     input {
         File annotation_json
-        String output_name
+        String output_file_suffix
         String output_path
         File? service_account_json
     }
 
-    String output_vt_json = "vat_vt_bq_load" + output_name
-    String output_genes_json = "vat_genes_bq_load" + output_name
+    String output_vt_json = "vat_vt_bq_load" + output_file_suffix
+    String output_genes_json = "vat_genes_bq_load" + output_file_suffix
     String output_vt_gcp_path = output_path + 'vt/'
     String output_genes_gcp_path = output_path + 'genes/'
-    String output_ant_gcp_path = output_path + 'annotations/'
 
     String has_service_account_file = if (defined(service_account_json)) then 'true' else 'false'
 

diff --git a/scripts/variantstore/wdl/extract/create_variant_annotation_table.py b/scripts/variantstore/wdl/extract/create_variant_annotation_table.py
@@ -186,7 +186,7 @@ def make_positions_json(annotated_json, output_json):
         output_file.write(json_bytes)
       else:
         transcript_lines = variant.get("transcripts")
-        # Collect all the transcript sources and check for if they contain Ensembl <-- this might be a good place for optimization
+        # Collect all the transcript sources and check whether they contain Ensembl
         sources = [transcript.get('source') for transcript in transcript_lines]
         if "Ensembl" in sources:
           for transcript in transcript_lines:
@@ -202,6 +202,7 @@ def make_positions_json(annotated_json, output_json):
           json_bytes = json_str.encode('utf-8')
           output_file.write(json_bytes)
   output_file.close()
+  json_data.close()
 
 def make_genes_json(annotated_json, output_genes_json):
   output_genes_file=gzip.open(output_genes_json, 'w')
@@ -235,11 +236,11 @@ def make_genes_json(annotated_json, output_genes_json):
       json_bytes = json_str.encode('utf-8')
       output_genes_file.write(json_bytes)
   output_genes_file.close()
+  json_data.close()
 
 def make_annotation_jsons(annotated_json, output_json, output_genes_json):
   make_positions_json(annotated_json, output_json)
   # we've already read the whole file once so we have to open it again
-  # TODO: cleanup closing of file handles
   make_genes_json(annotated_json, output_genes_json)
 
 if __name__ == '__main__':

diff --git a/...ntstore/wdl/schemas/vat_genes_schema.json → ...ariantstore/wdl/schemas/genes_schema.json b/...ntstore/wdl/schemas/vat_genes_schema.json → ...ariantstore/wdl/schemas/genes_schema.json
diff --git a/scripts/variantstore/wdl/schemas/variant_transcript_schema.json b/scripts/variantstore/wdl/schemas/variant_transcript_schema.json
@@ -0,0 +1,224 @@
+[
+  {
+    "description": "Must be positive. Exact position for a SNP and the position before the alteration in an indel",
+    "name": "position",
+    "type": "Integer",
+    "mode": "Required"
+  },
+ {
+   "description": "Variant ID. Unique string for identifying a variant (as produced by NIRVANA based on a spec from Broad Institute)",
+   "name": "vid",
+   "type": "String",
+   "mode": "Required"
+ },
+ {
+   "description": "Contig names match the hg38 reference",
+   "name": "contig",
+   "type": "String",
+   "mode": "Required"
+ },
+ {
+   "description": "base(s). This should always be one base for SNPs and insertions.  More than one base for deletions",
+   "name": "ref_allele",
+   "type": "String",
+   "mode": "Required"
+ },
+ {
+   "description": "base(s).  This should always be one base for SNPs and deletions.  More than one base for insertions",
+   "name": "alt_allele",
+   "type": "String",
+   "mode": "Required"
+ },
+ {
+    "description": "DNA change type (HGVS)",
+    "name": "variant_type",
+    "type": "String",
+    "mode": "Required"
+  },
+  {
+    "description": "HGVS g. nomenclature Variant location",
+    "name": "genomic_location",
+    "type": "String",
+    "mode": "Required"
+  },
+  {
+    "description": "rsID",
+    "name": "dbsnp_rsid",
+    "type": "String",
+    "mode": "Repeated"
+  },
+  {
+    "description": "Transcript ID. Null indicates that this variant does not overlap any transcripts",
+    "name": "transcript",
+    "type": "String",
+    "mode": "Nullable"
+  },
+  {
+   "description": "Gene symbol.  A variant can have more than one associated gene symbol, since about 3% of genes do overlap",
+   "name": "gene_symbol",
+   "type": "String",
+   "mode": "Nullable"
+ },
+ {
+   "description": "",
+   "name": "transcript_source",
+   "type": "String",
+   "mode": "Nullable"
+ },
+ {
+   "description": "HGVS p. nomenclature; Amino acid change",
+   "name": "aa_change",
+   "type": "String",
+   "mode": "Nullable"
+ },
+ {
+   "description": "Amino acid change type. TODO: check with Lee about why this JSON thinks it's Repeated, not Nullable",
+   "name": "consequence",
+   "type": "String",
+   "mode": "Repeated"
+ },
+  {
+    "description": "HGVS c. nomenclature; DNA change in transcript space",
+    "name": "dna_change_in_transcript",
+    "type": "String",
+    "mode": "Nullable"
+  },
+  {
+    "description": "Exon number",
+    "name": "exon_number",
+    "type": "String",
+    "mode": "Nullable"
+  },
+  {
+    "description": "Intron number",
+    "name": "intron_number",
+    "type": "String",
+    "mode": "Nullable"
+  },
+  {
+    "description": "Gene ID for the transcript",
+    "name": "gene_id",
+    "type": "String",
+    "mode": "Nullable"
+  },
+  {
+    "description": "Primary Transcript ID",
+    "name": "is_canonical_transcript",
+    "type": "Boolean",
+    "mode": "Nullable"
+  },
+  {
+    "description": "AC TODO -- this needs to be added back and swapped to required -- Lee said this was a string?",
+    "name": "gvs_all_ac",
+    "type": "Integer",
+    "mode": "Nullable"
+  },
+  {
+    "description": "AN TODO -- this needs to be added back and swapped to required",
+    "name": "gvs_all_an",
+    "type": "Integer",
+    "mode": "Nullable"
+  },
+  {
+    "description": "AF TODO -- this needs to be added back and swapped to required  -- Lee said this was a Float?",
+    "name": "gvs_all_af",
+    "type": "INTEGER",
+    "mode": "Nullable"
+  },
+  {
+    "description": "REVEL",
+    "name": "revel",
+    "type": "FLOAT",
+    "mode": "Nullable"
+  },
+  {
+    "description": "Splice AI",
+    "name": "splice_ai_acceptor_gain_score",
+    "type": "Float",
+    "mode": "Nullable"
+  },
+  {
+    "description": "Splice AI",
+    "name": "splice_ai_acceptor_gain_distance",
+    "type": "Integer",
+    "mode": "Nullable"
+  },
+  {
+    "description": "Splice AI",
+    "name": "splice_ai_acceptor_loss_score",
+    "type": "Float",
+    "mode": "Nullable"
+  },
+  {
+    "description": "Splice AI",
+    "name": "splice_ai_acceptor_loss_distance",
+    "type": "Integer",
+    "mode": "Nullable"
+  },
+  {
+    "description": "Splice AI",
+    "name": "splice_ai_donor_gain_score",
+    "type": "Float",
+    "mode": "Nullable"
+  },
+  {
+    "description": "Splice AI",
+    "name": "splice_ai_donor_gain_distance",
+    "type": "Integer",
+    "mode": "Nullable"
+  },
+  {
+    "description": "Splice AI",
+    "name": "splice_ai_donor_loss_score",
+    "type": "Float",
+    "mode": "Nullable"
+  },
+  {
+    "description": "Splice AI",
+    "name": "splice_ai_donor_loss_distance",
+    "type": "Integer",
+    "mode": "Nullable"
+  },
+  {
+    "description": "ClinVar Id for Validation",
+    "name": "clinvar_id",
+    "type": "String",
+    "mode": "Repeated"
+  },
+  {
+    "description": "ClinVar Classification",
+    "name": "clinvar_classification",
+    "type": "String",
+    "mode": "Repeated"
+  },
+  {
+    "description": "ClinVar Classification Date",
+    "name": "clinvar_last_updated",
+    "type": "Date",
+    "mode": "Nullable"
+  },
+ {
+   "description": "ClinVar Disease Name",
+   "name": "clinvar_phenotype",
+   "type": "String",
+   "mode": "Repeated"
+ },
+  {
+    "description": "gnomAD: 'Total' frequency",
+    "name": "gnomad_all_af",
+    "type": "Float",
+    "mode": "Nullable"
+  },
+  {
+    "description": "gnomAD: 'Total' allele count",
+    "name": "gnomad_all_ac",
+    "type": "Integer",
+    "mode": "Nullable"
+  },
+  {
+    "description": "gnomAD: 'Total' allele number",
+    "name": "gnomad_all_an",
+    "type": "Integer",
+    "mode": "Nullable"
+  }
+]