diff --git a/.dockstore.yml b/.dockstore.yml index a7c31ff7225..5c91300c877 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -128,6 +128,7 @@ workflows: branches: - master - ah_var_store + - rc-remove-sites-only-step - name: GvsValidateVat subclass: WDL primaryDescriptorPath: /scripts/variantstore/variant_annotations_table/GvsValidateVAT.wdl diff --git a/scripts/variantstore/variant_annotations_table/custom_annotations_template.tsv b/scripts/variantstore/variant_annotations_table/custom_annotations_template.tsv index 7a5ba70c823..a578ea9b65f 100644 --- a/scripts/variantstore/variant_annotations_table/custom_annotations_template.tsv +++ b/scripts/variantstore/variant_annotations_table/custom_annotations_template.tsv @@ -1,7 +1,7 @@ #title=gvsAnnotations #assembly=GRCh38 #matchVariantsBy=allele -#CHROM POS REF ALT AC AN AF AC_Hom AC_Het AC_afr AN_afr AF_afr AC_Hom_afr AC_Het_afr AC_amr AN_amr AF_amr AC_Hom_amr AC_Het_amr AC_eas AN_eas AF_eas AC_Hom_eas AC_Het_eas AC_eur AN_eur AF_eur AC_Hom_eur AC_Het_eur AC_mid AN_mid AF_mid AC_Hom_mid AC_Het_mid AC_oth AN_oth AF_oth AC_Hom_oth AC_Het_oth AC_sas AN_sas AF_sas AC_Hom_sas AC_Het_sas -#categories . . . AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount -#descriptions . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . -#type . . . number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number +#CHROM POS REF ALT AC AN AF AC_Hom AC_Het AC_Hemi AC_afr AN_afr AF_afr AC_Hom_afr AC_Het_afr AC_Hemi_afr AC_amr AN_amr AF_amr AC_Hom_amr AC_Het_amr AC_Hemi_amr AC_eas AN_eas AF_eas AC_Hom_eas AC_Het_eas AC_Hemi_eas AC_eur AN_eur AF_eur AC_Hom_eur AC_Het_eur AC_Hemi_eur AC_mid AN_mid AF_mid AC_Hom_mid AC_Het_mid AC_Hemi_mid AC_oth AN_oth AF_oth AC_Hom_oth AC_Het_oth AC_Hemi_oth AC_sas AN_sas AF_sas AC_Hom_sas AC_Het_sas AC_Hemi_sas +#categories . . . AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount AlleleCount AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount AlleleCount AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount AlleleCount AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount AlleleCount AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount AlleleCount AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount AlleleCount AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount AlleleCount AlleleCount AlleleNumber AlleleFrequency AlleleCount AlleleCount AlleleCount +#descriptions . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . +#type . . . number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number number diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.example.inputs.json b/scripts/variantstore/wdl/GvsSitesOnlyVCF.example.inputs.json index 2061690b382..7e3f9f130b9 100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.example.inputs.json +++ b/scripts/variantstore/wdl/GvsSitesOnlyVCF.example.inputs.json @@ -12,5 +12,6 @@ "GvsSitesOnlyVCF.project_id": "spec-ops-aou", "GvsSitesOnlyVCF.dataset_name": "anvil_100_for_testing", "GvsSitesOnlyVCF.AnAcAf_annotations_template": "gs://broad-dsp-spec-ops/scratch/rcremer/Nirvana/custom_annotations_template.tsv", - "GvsSitesOnlyVCF.ancestry_file": "gs://broad-dsp-spec-ops/scratch/rcremer/Nirvana/subpop.tsv" + "GvsSitesOnlyVCF.ancestry_file": "gs://broad-dsp-spec-ops/scratch/rcremer/Nirvana/subpop.tsv", + "GvsSitesOnlyVCF.reference": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta" } diff --git a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl index 2adccbb9da1..c4921f86684 100644 --- a/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl +++ b/scripts/variantstore/wdl/GvsSitesOnlyVCF.wdl @@ -3,8 +3,6 @@ workflow GvsSitesOnlyVCF { input { File inputFileofFileNames File inputFileofIndexFileNames - String output_sites_only_file_name - String output_annotated_file_name String project_id String dataset_name File nirvana_data_directory @@ -34,6 +32,9 @@ workflow GvsSitesOnlyVCF { ## Scatter across the shards from the GVS jointVCF scatter(i in range(length(MakeSubpopulationFiles.input_vcfs)) ) { + ## Create a sites-only VCF from the original GVS jointVCF + ## Calculate AC/AN/AF for subpopulations and extract them for custom annotations + String input_vcf_name = basename(MakeSubpopulationFiles.input_vcfs[i], ".vcf.gz") call ExtractAnAcAfFromVCF { input: input_vcf = MakeSubpopulationFiles.input_vcfs[i], @@ -44,21 +45,12 @@ workflow GvsSitesOnlyVCF { ref = reference } - ## Create a sites-only VCF from the original GVS jointVCF - call SitesOnlyVcf { - input: - input_vcf = ExtractAnAcAfFromVCF.output_vcf, - input_vcf_index = ExtractAnAcAfFromVCF.output_vcf_index, - service_account_json_path = service_account_json_path, - output_filename = "${output_sites_only_file_name}_${i}.sites_only.vcf.gz" - } - ## Use Nirvana to annotate the sites-only VCF and include the AC/AN/AF calculations as custom annotations call AnnotateVCF { input: - input_vcf = SitesOnlyVcf.output_vcf, - input_vcf_index = SitesOnlyVcf.output_vcf_idx, - output_annotated_file_name = "${output_annotated_file_name}_${i}", + input_vcf = ExtractAnAcAfFromVCF.output_vcf, + input_vcf_index = ExtractAnAcAfFromVCF.output_vcf_index, + output_annotated_file_name = "${input_vcf_name}_annotated", nirvana_data_tar = nirvana_data_directory, custom_annotations_file = ExtractAnAcAfFromVCF.annotations_file, } @@ -66,7 +58,7 @@ workflow GvsSitesOnlyVCF { call PrepAnnotationJson { input: annotation_json = AnnotateVCF.annotation_json, - output_file_suffix = "${i}.json.gz", + output_file_suffix = "${input_vcf_name}.json.gz", output_path = output_path, service_account_json_path = service_account_json_path } @@ -90,12 +82,13 @@ workflow GvsSitesOnlyVCF { project_id = project_id, dataset_name = dataset_name, counts_variants = ExtractAnAcAfFromVCF.count_variants, + track_dropped_variants = ExtractAnAcAfFromVCF.track_dropped, table_suffix = table_suffix, service_account_json_path = service_account_json_path, load_jsons_done = BigQueryLoadJson.done } - scatter(i in range(length(contig_array)) ) { + scatter(i in range(length(contig_array)) ) { call BigQueryExportVat { input: contig = contig_array[i], @@ -158,7 +151,7 @@ task MakeSubpopulationFiles { # ------------------------------------------------ # Runtime settings: runtime { - docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_20210916" + docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_20211007" memory: "1 GB" preemptible: 3 cpu: "1" @@ -200,7 +193,8 @@ task ExtractAnAcAfFromVCF { String normalized_vcf_compressed = "normalized.vcf.gz" String normalized_vcf_indexed = "normalized.vcf.gz.tbi" - # separate multi-allelic sites into their own lines, remove deletions and extract the an/ac/af & sc + # separate multi-allelic sites into their own lines, remove deletions and filtered sites and make a sites only vcf + # while extracting the an/ac/af & sc by subpopulation into a tsv command <<< set -e @@ -217,8 +211,12 @@ task ExtractAnAcAfFromVCF { gsutil cp ~{input_vcf_index} ~{local_input_vcf_index} gsutil cp ~{ref} Homo_sapiens_assembly38.fasta - # TODO Compare the ancestry list with the sample list and throw an error (but dont fail the job) if there are samples that are in one, but not the other. Two different errors. - # awk '{print $2}' ~{subpopulation_sample_list} | tail -n +2 | sort -u > collected_subpopulations.txt + ## compare the ancestry sample list with the vcf sample list + # TODO throw an error (but dont fail the job) if there are samples that are in one, but not the other. Throw two different errors. + # Currently commented out to save on io with the AoU beta VAT creation + # awk '{print $1}' ~{subpopulation_sample_list} | tail -n +2 | sort -u > collected_subpopulation_samples.txt + # bcftools query --list-samples ~{local_input_vcf} | sort -u > collected_samples.txt + # diff collected_subpopulation_samples.txt collected_samples.txt # expected_subpopulations = [ # "afr", @@ -230,39 +228,56 @@ task ExtractAnAcAfFromVCF { # "sas" #] - - bcftools norm -m-any ~{local_input_vcf} | \ - bcftools norm --check-ref w -f Homo_sapiens_assembly38.fasta > ~{normalized_vcf} - - - ## make a file of just the first 4 columns of the tsv (maybe I could just bcftools query it?) - bcftools query ~{normalized_vcf} -f '%CHROM\t%POS\t%REF\t%ALT\n' > check_duplicates.tsv + ## track the dropped variants with +500 alt alleles or N's in the reference (Since Nirvana cant handle N as a base, drop them for now) + bcftools view -i 'N_ALT>500 || REF~"N"' ~{local_input_vcf} | bcftools query -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\n' > track_dropped.tsv + + ## filter out sites with too many alt alleles + bcftools view -e 'N_ALT>500 || REF~"N"' --no-update ~{local_input_vcf} | \ + ## filter out the non-passing sites + bcftools view -f 'PASS,.' --no-update | \ + ## normalize, left align and split multi allelic sites to new lines, remove duplicate lines + bcftools norm -m- --check-ref w -f Homo_sapiens_assembly38.fasta | \ + ## filter out spanning deletions and variants with an AC of 0 + bcftools view -e 'ALT[0]="*" || AC=0' --no-update | \ + ## ensure that we respect the FT tag + bcftools filter -i "FORMAT/FT='PASS,.'" --set-GTs . > ~{normalized_vcf} + + ## clean up unneeded file + rm ~{local_input_vcf} + + ## make a file of just the first 5 columns of the tsv + bcftools query ~{normalized_vcf} -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\n' > check_duplicates.tsv ## check it for duplicates and put them in a new file - sort check_duplicates.tsv | uniq -d | cut -f1,2 > duplicates.tsv - ## remove those rows (this will be ALL rows with this position--so good rows too, potentially we want to grab f1,4 to do this with) - grep -v -wFf duplicates.tsv ~{normalized_vcf} | grep -v "AC=0;" > deduplicated.vcf - - wc -l duplicates.tsv - - bcftools plugin fill-tags -- deduplicated.vcf -S ~{subpopulation_sample_list} -t AC,AF,AN,AC_het,AC_hom | bcftools query -f \ - '%CHROM\t%POS\t%REF\t%ALT\t%AC\t%AN\t%AF\t%AC_Hom\t%AC_Het\t%AC_afr\t%AN_afr\t%AF_afr\t%AC_Hom_afr\t%AC_Het_afr\t%AC_amr\t%AN_amr\t%AF_amr\t%AC_Hom_amr\t%AC_Het_amr\t%AC_eas\t%AN_eas\t%AF_eas\t%AC_Hom_eas\t%AC_Het_eas\t%AC_eur\t%AN_eur\t%AF_eur\t%AC_Hom_eur\t%AC_Het_eur\t%AC_mid\t%AN_mid\t%AF_mid\t%AC_Hom_mid\t%AC_Het_mid\t%AC_oth\t%AN_oth\t%AF_oth\t%AC_Hom_oth\t%AC_Het_oth\t%AC_sas\t%AN_sas\t%AF_sas\t%AC_Hom_sas\t%AC_Het_sas\n' \ - | grep -v "*" >> ~{custom_annotations_file_name} - - ### for validation of the pipeline - wc -l ~{custom_annotations_file_name} | awk '{print $1}' > count.txt - # Should this be where we do the filtering of the AC/AN/AF values rather than in the python? - - ### compress the vcf and index it - bcftools view -I deduplicated.vcf -Oz -o ~{normalized_vcf_compressed} + sort check_duplicates.tsv | uniq -d | cut -f1,2,3,4,5 > duplicates.tsv + rm check_duplicates.tsv ## clean up + ## remove those rows (that match up to the first 5 cols) + grep -v -wFf duplicates.tsv ~{normalized_vcf} > deduplicated.vcf + rm ~{normalized_vcf} ## clean up + + ## add duplicates to the file tracking dropped variants + cat duplicates.tsv >> track_dropped.tsv + rm duplicates.tsv ## clean up unneeded file + + ## calculate annotations for all subpopulations + bcftools plugin fill-tags -- deduplicated.vcf -S ~{subpopulation_sample_list} -t AC,AF,AN,AC_het,AC_hom,AC_Hemi | bcftools query -f \ + '%CHROM\t%POS\t%REF\t%ALT\t%AC\t%AN\t%AF\t%AC_Hom\t%AC_Het\t%AC_Hemi\t%AC_afr\t%AN_afr\t%AF_afr\t%AC_Hom_afr\t%AC_Het_afr\t%AC_Hemi_afr\t%AC_amr\t%AN_amr\t%AF_amr\t%AC_Hom_amr\t%AC_Het_amr\t%AC_Hemi_amr\t%AC_eas\t%AN_eas\t%AF_eas\t%AC_Hom_eas\t%AC_Het_eas\t%AC_Hemi_eas\t%AC_eur\t%AN_eur\t%AF_eur\t%AC_Hom_eur\t%AC_Het_eur\t%AC_Hemi_eur\t%AC_mid\t%AN_mid\t%AF_mid\t%AC_Hom_mid\t%AC_Het_mid\t%AC_Hemi_mid\t%AC_oth\t%AN_oth\t%AF_oth\t%AC_Hom_oth\t%AC_Het_oth\t%AC_Hemi_oth\t%AC_sas\t%AN_sas\t%AF_sas\t%AC_Hom_sas\t%AC_Het_sas\t%AC_Hemi_sas\n' \ + >> ~{custom_annotations_file_name} + + ## for validation of the pipeline + wc -l ~{custom_annotations_file_name} | awk '{print $1 -7}' > count.txt + + ## compress the vcf and index it, make it sites-only + bcftools view --no-update --drop-genotypes deduplicated.vcf -Oz -o ~{normalized_vcf_compressed} + ## if we can spare the IO and want to pass a smaller file we can also drop the info field w bcftools annotate -x INFO bcftools index --tbi ~{normalized_vcf_compressed} >>> # ------------------------------------------------ # Runtime settings: runtime { - docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_20210916" - memory: "20 GB" - preemptible: 1 + docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_20211007" + memory: "32 GB" + preemptible: 3 cpu: "2" disks: "local-disk 500 SSD" } @@ -271,52 +286,12 @@ task ExtractAnAcAfFromVCF { output { File annotations_file = "~{custom_annotations_file_name}" Int count_variants = read_int("count.txt") + File track_dropped = "track_dropped.tsv" File output_vcf = "~{normalized_vcf_compressed}" File output_vcf_index = "~{normalized_vcf_indexed}" } } -task SitesOnlyVcf { - input { - File input_vcf - File input_vcf_index - String? service_account_json_path - String output_filename - } - String output_vcf_idx = basename(output_filename) + ".tbi" - - command <<< - set -e - - # Adding `--add-output-vcf-command-line false` so that the VCF header doesn't have a timestamp - # in it so that downstream steps can call cache - - gatk --java-options "-Xmx12288m" \ - SelectVariants \ - -V ~{input_vcf} \ - --add-output-vcf-command-line false \ - --exclude-filtered \ - --sites-only-vcf-output \ - -O ~{output_filename} - - >>> - # ------------------------------------------------ - # Runtime settings: - runtime { - docker: "broadinstitute/gatk:4.2.0.0" - memory: "15 GB" - preemptible: 3 - cpu: "2" - disks: "local-disk 250 HDD" - } - # ------------------------------------------------ - # Outputs: - output { - File output_vcf="~{output_filename}" - File output_vcf_idx="~{output_vcf_idx}" - } -} - task AnnotateVCF { input { File input_vcf @@ -375,10 +350,10 @@ task AnnotateVCF { # Runtime settings: runtime { docker: "annotation/nirvana:3.14" - memory: "20 GB" + memory: "32 GB" cpu: "2" preemptible: 5 - disks: "local-disk 250 SSD" + disks: "local-disk 500 SSD" } # ------------------------------------------------ # Outputs: @@ -430,7 +405,7 @@ task PrepAnnotationJson { # ------------------------------------------------ # Runtime settings: runtime { - docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_20210916" + docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_20211007" memory: "8 GB" preemptible: 5 cpu: "1" @@ -668,6 +643,7 @@ task BigQuerySmokeTest { String project_id String dataset_name Array[Int] counts_variants + Array[File] track_dropped_variants String table_suffix String? service_account_json_path Boolean load_jsons_done @@ -695,6 +671,12 @@ task BigQuerySmokeTest { # sum all the initial input variants across the shards INITIAL_VARIANT_COUNT=$(python -c "print(sum([~{sep=', ' counts_variants}]))") + ## here a list of files: ~{sep=', ' track_dropped_variants} + ## I need to get the lines from each file + awk 'FNR==1{print ""}1' ~{sep=' ' track_dropped_variants} > dropped_variants.txt + + echo "Dropped variants:" + cat dropped_variants.txt # Count number of variants in the VAT bq query --nouse_legacy_sql --project_id=~{project_id} --format=csv 'SELECT COUNT (DISTINCT vid) AS count FROM `~{dataset_name}.~{vat_table}`' > bq_variant_count.csv diff --git a/scripts/variantstore/wdl/extract/create_variant_annotation_table.py b/scripts/variantstore/wdl/extract/create_variant_annotation_table.py index 0097fa18b3d..e305441a954 100644 --- a/scripts/variantstore/wdl/extract/create_variant_annotation_table.py +++ b/scripts/variantstore/wdl/extract/create_variant_annotation_table.py @@ -11,13 +11,15 @@ vat_nirvana_positions_dictionary = { "position": "position", # required + "ref_allele": "refAllele", # required + "alt_allele": "altAlleles", # required } vat_nirvana_variants_dictionary = { "vid": "vid", # required "contig": "chromosome", # required - "ref_allele": "refAllele", # required - "alt_allele": "altAllele", # required + # "ref_allele": "refAllele", # required # do we want to grab both to compare? + # "alt_allele": "altAllele", # required "variant_type": "variantType", # required "genomic_location": "hgvsg", # required "dbsnp_rsid": "dbsnp", # nullable @@ -118,6 +120,10 @@ def check_filtering(variant): if variant.get("gvsAnnotations") == None: # <-- enum since we need this to be in tandem with the custom annotations header / template print("WARNING: There has been an error in creating custom annotations for AC/AF/AN", variant.get("vid")) return False + # skip any row (with a warning) if the AC value is 0 + elif variant["gvsAnnotations"].get("AC") == 0: + print("WARNING: Its AC is 0 so we are dropping this variant", variant.get("vid")) + return False # skip any row (with a warning) if AC, AN or AF is missing elif variant["gvsAnnotations"].get("AC") == None: print("WARNING: There has been an error-- there is no AC value---should AN be 0 for this variant?", variant.get("vid")) @@ -128,10 +134,6 @@ def check_filtering(variant): elif variant["gvsAnnotations"].get("AF") == None: print("WARNING: There has been an error-- there is an AC value---but no AF value", variant.get("vid")) return False - # skip any row (with a warning) if the AC value is 0 - elif variant["gvsAnnotations"].get("AC") == 0: - print("WARNING: Its AC is 0 so we are dropping this variant", variant.get("vid")) - return False else: return True @@ -176,7 +178,7 @@ def get_subpopulation_calculations(subpop_annotations): subpop_an_val = subpop_annotations.get("_".join(["AN", gvs_subpop])) subpop_af_val = subpop_annotations.get("_".join(["AF", gvs_subpop])) # note the assumption is made that AC_Hom must be even because by it's nature it means there are two, but there could be an error - subpop_sc_val = int(subpop_annotations.get("_".join(["AC_Hom", gvs_subpop])) / 2 ) + subpop_annotations.get("_".join(["AC_Het", gvs_subpop])) + subpop_sc_val = int(subpop_annotations.get("_".join(["AC_Hom", gvs_subpop])) / 2 ) + subpop_annotations.get("_".join(["AC_Het", gvs_subpop])) + subpop_annotations.get("_".join(["AC_Hemi", gvs_subpop])) # here we set the subpopulation ac/an/af values row["_".join(["gvs", gvs_subpop, "ac"])] = subpop_ac_val row["_".join(["gvs", gvs_subpop, "an"])] = subpop_an_val @@ -196,9 +198,11 @@ def get_subpopulation_calculations(subpop_annotations): row["gvs_max_sc"] = max_sc return row -def make_annotated_json_row(row_position, variant_line, transcript_line): +def make_annotated_json_row(row_position, row_ref, row_alt, variant_line, transcript_line): row = {} row["position"] = row_position # this is a required field + row["ref_allele"] = row_ref # this is a required field + row["alt_allele"] = row_alt # this is a required field for vat_variants_fieldname in vat_nirvana_variants_dictionary.keys(): # like "contig" nirvana_variants_fieldname = vat_nirvana_variants_dictionary.get(vat_variants_fieldname) @@ -287,17 +291,23 @@ def make_positions_json(annotated_json, output_json): for p in positions: position=p['position'] + refAllele=p['refAllele'] # ref_allele + altAlleles=p['altAlleles'] # this is an array that we need to split into each alt_allele variants=p['variants'] + if '*' in altAlleles: + altAlleles.remove('*') + ## there should be the same number of altAlleles as variants, right? + assert len(altAlleles)==len(variants) # row for each variant - transcript # so let's start with each variant - for variant in variants: + for index, variant in enumerate(variants): # check if it's a row we care about if check_filtering(variant) is False: continue # remember that we want one for each variant-transcript and variant-null for variants without transcripts if variant.get("transcripts") == None: # then we make a special row - row = make_annotated_json_row(position, variant, None) + row = make_annotated_json_row(position, refAllele, altAlleles[index], variant, None) json_str = json.dumps(row) + "\n" json_bytes = json_str.encode('utf-8') output_file.write(json_bytes) @@ -308,13 +318,13 @@ def make_positions_json(annotated_json, output_json): if "Ensembl" in sources: for transcript in transcript_lines: if transcript.get('source') == "Ensembl": - row = make_annotated_json_row(position, variant, transcript) + row = make_annotated_json_row(position, refAllele, altAlleles[index], variant, transcript) json_str = json.dumps(row) + "\n" json_bytes = json_str.encode('utf-8') output_file.write(json_bytes) else: # if there are transcripts, but they are not Ensembl, we now only want one row in the VAT, not one row per transcript - row = make_annotated_json_row(position, variant, None) + row = make_annotated_json_row(position, refAllele, altAlleles[index], variant, None) json_str = json.dumps(row) + "\n" json_bytes = json_str.encode('utf-8') output_file.write(json_bytes)