forked from BD2KGenomics/brca-website-deprecated
-
Notifications
You must be signed in to change notification settings - Fork 0
/
addColumns2Vcf.py
76 lines (55 loc) · 1.9 KB
/
addColumns2Vcf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
"""
this script add three more INFO fields to a VCF file
these three INFO fields are:
HGVS_G: HGVS genomic coordinate
HGVS_C: HGVS cDNA coordinate(s)
HGVS_P: HGVS protein coordinate(s)
Note: this script can only handle SNPs, cann't do indels
"""
import hgvs.dataproviders.uta
import hgvs.parser
import hgvs.variantmapper
import random
def main():
old_vcf = open("data/brca.clinvar.10line.addHGVS.vcf", "r")
new_vcf = open("data/brca.clinvar.mockdata.vcf", "w")
new_info = ["FREQ", "BIC_P", "BIC_N", "MUTTYPE", "IARC", "DBSource"]
for line in old_vcf:
if "#" in line:
new_vcf.write(line)
else:
line = add_info(line)
new_vcf.write(line)
def add_info(line):
items = line.strip().split("\t")
current_info = items[-1]
current_info += ";FREQ=" + str(random.random())
current_info += ";BIC_P=?;BIC_N=?"
current_info += ";MUTTYPE=" + random.choice(["missense", "nonsense", "silent"])
current_info += ";IARC=" + random.choice(["group 1", "group 2a", "group 2b", "group 3", "group 4"])
current_info += ";DBSource=" + random.choice(["Clinvar", "LOVD", "BIC", "ExAC", "1000Genomes"])
items[-1] = current_info
new_line = "\t".join(items) + "\n"
return new_line
def add_HGVS(line):
items = line.strip().split("\t")
chrom = items[0]
pos = items[1]
ref = items[3]
alt = items[4]
hgvs_g = "NC_0000" + str(chrom) + ".10:g." + str(pos) + ref + ">" + alt
print hgvs_g
hp = hgvs.parser.Parser()
hgvs_g = hp.parse_hgvs_variant(hgvs_g)
hdp = hgvs.dataproviders.uta.connect()
evm = hgvs.variantmapper.EasyVariantMapper(hdp,
primary_assembly='GRCh37', alt_aln_method='splign')
transcripts = evm.relevant_transcripts(hgvs_g)
for transcript in transcripts:
hgvs_c = evm.g_to_c(hgvs_g, transcript)
hgvs_p = evm.c_to_p(hgvs_c)
items[-1] += ";HGVS_G=" + str(hgvs_g) + ";HGVS_C=" + str(hgvs_c) + ";HGVS_P=s" + str(hgvs_p)
new_line = "\t".join(items) + "\n"
return new_line
if __name__ == "__main__":
main()