Skip to content

Commit

Permalink
pr review part 2
Browse files Browse the repository at this point in the history
  • Loading branch information
RoriCremer committed Jul 14, 2021
1 parent 56eaa65 commit 4656c46
Show file tree
Hide file tree
Showing 5 changed files with 243 additions and 20 deletions.
13 changes: 6 additions & 7 deletions scripts/variantstore/wdl/GvsSitesOnlyVCF.example.inputs.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,11 @@
"GvsSitesOnlyVCF.output_sites_only_file_name": "hello_did_I_sites_only",
"GvsSitesOnlyVCF.output_annotated_file_name": "hello_did_I_annotate",
"GvsSitesOnlyVCF.nirvana_data_directory": "gs://broad-dsp-spec-ops/scratch/rcremer/Nirvana/NirvanaData.tar.gz",
"GvsSitesOnlyVCF.nirvana_schema_json_file": "gs://broad-dsp-spec-ops/scratch/rcremer/Nirvana/schemas/vat_schema.json",
"GvsSitesOnlyVCF.vat_vt_schema_json_file": "gs://broad-dsp-spec-ops/scratch/rcremer/Nirvana/schemas/vt_schema.json",
"GvsSitesOnlyVCF.vat_genes_schema_json_file": "gs://broad-dsp-spec-ops/scratch/rcremer/Nirvana/schemas/genes_schema.json",
"GvsSitesOnlyVCF.output_path": "gs://broad-dsp-spec-ops/scratch/rcremer/Nirvana/output/jul6/",
"GvsSitesOnlyVCF.table_id": "jul6",
"GvsSitesOnlyVCF.vat_schema_json_file": "gs://broad-dsp-spec-ops/scratch/rcremer/Nirvana/schemas/vat_schema.json",
"GvsSitesOnlyVCF.variant_transcript_schema_json_file": "gs://broad-dsp-spec-ops/scratch/rcremer/Nirvana/schemas/vt_schema.json",
"GvsSitesOnlyVCF.genes_schema_json_file": "gs://broad-dsp-spec-ops/scratch/rcremer/Nirvana/schemas/genes_schema.json",
"GvsSitesOnlyVCF.output_path": "gs://broad-dsp-spec-ops/scratch/rcremer/Nirvana/output/jul13/",
"GvsSitesOnlyVCF.table_suffix": "jul13",
"GvsSitesOnlyVCF.project_id": "spec-ops-aou",
"GvsSitesOnlyVCF.dataset_name": "anvil_100_for_testing",
"GvsSitesOnlyVCF.service_account_json": "gs://fc-secure-e91afe60-ca52-48fa-b4a9-c76fed5a0449/keys/aou_wgs_vumc_prod.json"
"GvsSitesOnlyVCF.dataset_name": "anvil_100_for_testing"
}
21 changes: 10 additions & 11 deletions scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ workflow GvsSitesOnlyVCF {
String project_id
String dataset_name
File nirvana_data_directory
File nirvana_schema_json_file
File vat_vt_schema_json_file
File vat_genes_schema_json_file
File vat_schema_json_file
File variant_transcript_schema_json_file
File genes_schema_json_file
String output_path
String table_suffix

Expand Down Expand Up @@ -38,16 +38,16 @@ workflow GvsSitesOnlyVCF {
call PrepAnnotationJson {
input:
annotation_json = AnnotateShardedVCF.annotation_json,
output_name = "${i}.json.gz",
output_file_suffix = "${i}.json.gz",
output_path = output_path,
service_account_json = service_account_json,
}
}
call BigQueryLoadJson {
input:
nirvana_schema = nirvana_schema_json_file,
vt_schema = vat_vt_schema_json_file,
genes_schema = vat_genes_schema_json_file,
nirvana_schema = vat_schema_json_file,
vt_schema = variant_transcript_schema_json_file,
genes_schema = genes_schema_json_file,
project_id = project_id,
dataset_name = dataset_name,
output_path = output_path,
Expand Down Expand Up @@ -185,16 +185,15 @@ task AnnotateShardedVCF {
task PrepAnnotationJson {
input {
File annotation_json
String output_name
String output_file_suffix
String output_path
File? service_account_json
}

String output_vt_json = "vat_vt_bq_load" + output_name
String output_genes_json = "vat_genes_bq_load" + output_name
String output_vt_json = "vat_vt_bq_load" + output_file_suffix
String output_genes_json = "vat_genes_bq_load" + output_file_suffix
String output_vt_gcp_path = output_path + 'vt/'
String output_genes_gcp_path = output_path + 'genes/'
String output_ant_gcp_path = output_path + 'annotations/'

String has_service_account_file = if (defined(service_account_json)) then 'true' else 'false'

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ def make_positions_json(annotated_json, output_json):
output_file.write(json_bytes)
else:
transcript_lines = variant.get("transcripts")
# Collect all the transcript sources and check for if they contain Ensembl <-- this might be a good place for optimization
# Collect all the transcript sources and check whether they include Ensembl
sources = [transcript.get('source') for transcript in transcript_lines]
if "Ensembl" in sources:
for transcript in transcript_lines:
Expand All @@ -202,6 +202,7 @@ def make_positions_json(annotated_json, output_json):
json_bytes = json_str.encode('utf-8')
output_file.write(json_bytes)
output_file.close()
json_data.close()

def make_genes_json(annotated_json, output_genes_json):
output_genes_file=gzip.open(output_genes_json, 'w')
Expand Down Expand Up @@ -235,11 +236,11 @@ def make_genes_json(annotated_json, output_genes_json):
json_bytes = json_str.encode('utf-8')
output_genes_file.write(json_bytes)
output_genes_file.close()
json_data.close()

def make_annotation_jsons(annotated_json, output_json, output_genes_json):
make_positions_json(annotated_json, output_json)
# we've already read the whole file once so we have to open it again
# TODO: cleanup closing of file handles
make_genes_json(annotated_json, output_genes_json)

if __name__ == '__main__':
Expand Down
224 changes: 224 additions & 0 deletions scripts/variantstore/wdl/schemas/variant_transcript_schema.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
[
{
"description": "Must be positive. Exact position for a SNP and the position before the alteration in an indel",
"name": "position",
"type": "Integer",
"mode": "Required"
},
{
"description": "Variant ID. Unique string for identifying a variant (as produced by NIRVANA based on a spec from Broad Institute)",
"name": "vid",
"type": "String",
"mode": "Required"
},
{
"description": "Contig names match the hg38 reference",
"name": "contig",
"type": "String",
"mode": "Required"
},
{
"description": "base(s). This should always be one base for SNPs and insertions. More than one base for deletions",
"name": "ref_allele",
"type": "String",
"mode": "Required"
},
{
"description": "base(s). This should always be one base for SNPs and deletions. More than one base for insertions",
"name": "alt_allele",
"type": "String",
"mode": "Required"
},
{
"description": "DNA change type (HGVS)",
"name": "variant_type",
"type": "String",
"mode": "Required"
},
{
"description": "HGVS g. nomenclature Variant location",
"name": "genomic_location",
"type": "String",
"mode": "Required"
},
{
"description": "rsID",
"name": "dbsnp_rsid",
"type": "String",
"mode": "Repeated"
},
{
"description": "Transcript ID. Null indicates that this variant does not overlap any transcripts",
"name": "transcript",
"type": "String",
"mode": "Nullable"
},
{
"description": "Gene symbol. A variant can have more than one associated gene symbol, since about 3% of genes do overlap",
"name": "gene_symbol",
"type": "String",
"mode": "Nullable"
},
{
"description": "Source of the transcript annotation (e.g. Ensembl)",
"name": "transcript_source",
"type": "String",
"mode": "Nullable"
},
{
"description": "HGVS p. nomenclature; Amino acid change",
"name": "aa_change",
"type": "String",
"mode": "Nullable"
},
{
"description": "Amino acid change type. TODO: check with Lee about why this JSON thinks it's Repeated rather than Nullable",
"name": "consequence",
"type": "String",
"mode": "Repeated"
},
{
"description": "HGVS c. nomenclature; DNA change in transcript space",
"name": "dna_change_in_transcript",
"type": "String",
"mode": "Nullable"
},
{
"description": "Exon number",
"name": "exon_number",
"type": "String",
"mode": "Nullable"
},
{
"description": "Intron number",
"name": "intron_number",
"type": "String",
"mode": "Nullable"
},
{
"description": "Gene ID for the transcript",
"name": "gene_id",
"type": "String",
"mode": "Nullable"
},
{
"description": "Whether this transcript is the canonical (primary) transcript",
"name": "is_canonical_transcript",
"type": "Boolean",
"mode": "Nullable"
},
{
"description": "AC TODO -- this needs to be added back and swapped to required -- Lee said this was a string?",
"name": "gvs_all_ac",
"type": "Integer",
"mode": "Nullable"
},
{
"description": "AN TODO -- this needs to be added back and swapped to required",
"name": "gvs_all_an",
"type": "Integer",
"mode": "Nullable"
},
{
"description": "AF TODO -- this needs to be added back and swapped to required -- Lee said this was a Float?",
"name": "gvs_all_af",
"type": "INTEGER",
"mode": "Nullable"
},
{
"description": "REVEL",
"name": "revel",
"type": "FLOAT",
"mode": "Nullable"
},
{
"description": "SpliceAI acceptor gain score",
"name": "splice_ai_acceptor_gain_score",
"type": "Float",
"mode": "Nullable"
},
{
"description": "SpliceAI acceptor gain distance",
"name": "splice_ai_acceptor_gain_distance",
"type": "Integer",
"mode": "Nullable"
},
{
"description": "SpliceAI acceptor loss score",
"name": "splice_ai_acceptor_loss_score",
"type": "Float",
"mode": "Nullable"
},
{
"description": "SpliceAI acceptor loss distance",
"name": "splice_ai_acceptor_loss_distance",
"type": "Integer",
"mode": "Nullable"
},
{
"description": "SpliceAI donor gain score",
"name": "splice_ai_donor_gain_score",
"type": "Float",
"mode": "Nullable"
},
{
"description": "SpliceAI donor gain distance",
"name": "splice_ai_donor_gain_distance",
"type": "Integer",
"mode": "Nullable"
},
{
"description": "SpliceAI donor loss score",
"name": "splice_ai_donor_loss_score",
"type": "Float",
"mode": "Nullable"
},
{
"description": "SpliceAI donor loss distance",
"name": "splice_ai_donor_loss_distance",
"type": "Integer",
"mode": "Nullable"
},
{
"description": "ClinVar Id for Validation",
"name": "clinvar_id",
"type": "String",
"mode": "Repeated"
},
{
"description": "ClinVar Classification",
"name": "clinvar_classification",
"type": "String",
"mode": "Repeated"
},
{
"description": "ClinVar Classification Date",
"name": "clinvar_last_updated",
"type": "Date",
"mode": "Nullable"
},
{
"description": "ClinVar Disease Name",
"name": "clinvar_phenotype",
"type": "String",
"mode": "Repeated"
},
{
"description": "gnomAD: 'Total' frequency",
"name": "gnomad_all_af",
"type": "Float",
"mode": "Nullable"
},
{
"description": "gnomAD: 'Total' allele count",
"name": "gnomad_all_ac",
"type": "Integer",
"mode": "Nullable"
},
{
"description": "gnomAD: 'Total' allele number",
"name": "gnomad_all_an",
"type": "Integer",
"mode": "Nullable"
}
]

0 comments on commit 4656c46

Please sign in to comment.