-
Notifications
You must be signed in to change notification settings - Fork 4
/
annotate_vcf_cosmic.py
376 lines (327 loc) · 13.2 KB
/
annotate_vcf_cosmic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
'''
Created on Sep 9, 2015
@author: covingto
'''
import os
import os.path
import sys
#TODO: refactor to use hgsc_vcf library
import hgsc_vcf
import re
import logging
import htsjdk.samtools.reference.IndexedFastaSequenceFile as IndexedFastaSequenceFile
import org.bcm.hgsc.utils.SynchronousIndexedFastaReader as SFastaReader
import java.io.File as File
import java.lang.String as String
from java.lang import Class
from java.sql import DriverManager, SQLException
JDBC_DRIVER = "org.sqlite.JDBC"
def getConnection(jdbc_url, driverName):
"""
Given the name of a JDBC driver class and the url to be used
to connect to a database, attempt to obtain a connection to
the database.
"""
try:
Class.forName(driverName).newInstance()
except Exception, msg:
print msg
sys.exit(-1)
try:
dbConn = DriverManager.getConnection(jdbc_url)
except SQLException, msg:
print msg
sys.exit(-1)
return dbConn
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
re_csqformat = re.compile(r'.*?Format: (.*)')
re_capstring = re.compile(r'[A-Z]+')
##
# extract csq format information from a vcf info description
def get_csq_format(csq_descriptor):
_re_csq = re.search(re_csqformat, csq_descriptor)
if _re_csq is None:
raise ValueError("Format search failed")
return _re_csq.group(1).split('|')
##
# codon types
SIMPLE = 0B0001
CODESHIFTING = 0B0010
INS = 0B0110
DEL = 0B1010
##
# returns the offset for the codon range (if one exists) for this csq codon string
#
# given a string, this locates the offset of the capital letter and the next offset
# for the end of the string, thereby defining the codon
# The string gTt/gCt would return -1 and +1 indicating that the codon begins one
# base before the allele and ends one base after it
# The following offsets may be observed for coding snv's
# Aaa/Taa: 0, +2
# aAa/aTa: -1, +1
# aaA/aaT: -2, 0
#
# The following is used for deletions
# ggAGGg/ggg: -2, +3
#
# The following is used for insertions
# a|cg/aCGCcg: 0, +2
# in this case the bar represents the location of the insertion (between the a and c)
# the coordinates in this case match the VCF format with the insertion involving the first base of the insertion
# note that the insertion will only ever involve a range of 3 bases (inclusively)
#
# complex substitutions (sometimes annotated as del/ins) are handled be returning the
# offsets relative to the start of the allele as in the following:
# cAGGTGCTg/cGGGTTCTg: -1, +7
# 101234567 : key
def get_cqs_codon_region(codon_s):
_c1, _c2 = codon_s.split('/')
# codon deletion and insertion events
if _c1 == '-':
return -2, 4, 0, SIMPLE + INS
elif _c2 == '-':
return 0, 0, len(_c1), SIMPLE + DEL
_c1_splitter = re.findall(re_capstring, _c1)
# first test _c1 since that is where most of the action is
if len(_c1_splitter) > 1:
raise ValueError(
"Interspersed capitalization for codon1 in %s" %
codon_s)
elif len(_c1_splitter) < 1: # move to _c2, implies INS
_c2_splitter = re.findall(re_capstring, _c2)
if len(_c2_splitter) != 1:
raise ValueError(
"Codon1 and Codon2 have no capital letters for splitting or interspersed capitalization in %s" %
codon_s)
_left, _right = _c2.split(_c2_splitter[0])
return abs(len(_left) - 1) * -1, len(_right) + 1, 0, INS
else: # implies either DEL or SIMPLE
_left, _right = _c1.split(_c1_splitter[0])
if len(_c1) == len(_c2):
_type = SIMPLE
else:
_type = DEL
return len(_left) * -1, len(_right), len(_c1_splitter[0]), _type
##
# returns the genomic range of a codon
#
# given an allele start position and the codon alterations for that allele, this
# function returns the genomic range of bases that comprise the codon(s) altered
# by the allele
#
# note that the positions listed below are in VCF coordinates
# while the codon resolver is in 1 base (the location of the alternation)
# that means that deleted alleles start at the base that was removed
# insertions happen at the preceeding base
# 100 A T , Aaa/Taa: 100, 102
# 101 A T , aAa/aTa: 100, 102
# 102 A T , aaA/aaT: 100, 102
# 99 gGTA g , GTA/-: 100, 102
# 99 g gGTA , -/GTA: 97, 102 # codon boundary case; return codons before and after
# 100 g gAAA , gta/gAAAta: 100, 102
# 100 gTAG g , gTAGgg/ggg: 100, 105
def get_codon_genomic_range(position_i, codon_s, pos_strand_b=True):
_left, _right, _bases_altered, _codon_type_i = get_cqs_codon_region(
codon_s)
if not pos_strand_b:
if not (_codon_type_i & CODESHIFTING == CODESHIFTING) or not (
_codon_type_i & SIMPLE == SIMPLE):
_left, _right = _right * -1, _left * -1
if (_codon_type_i & INS == INS):
# compensate for inclusivity and zero basing
_left += 2
_right += 2
_right += _bases_altered - 1
return position_i + _left, position_i + _right
##
# Conatiner for VCF files
#
# allows rapid searching and manipulation of VCF files in memory
class VCFContainer:
def __init__(self, reader, buffer):
self.reader = reader
self.buffer = buffer
self.records = {}
self.__load()
self.lastindex = 0
self.lastpos = 0
def __load(self):
logger.info("Reading VCF file into memory")
for record in self.reader:
if record['CHROM'] not in self.records:
self.records[record['CHROM']] = []
self.records[record['CHROM']].append(record)
##
# return a list of records within the indicated buffer limit
def intersect(self, record):
if not record['CHROM'] in self.records:
return []
_record_set = self.records[record['CHROM']]
_pos = record['POS'] - self.buffer
_end = record['POS'] + len(record['REF']) + self.buffer
if _pos > self.lastpos:
self.lastindex = 0
self.lastpos = _pos
_result = []
for _i in range(self.lastindex, len(_record_set)):
_test_record = _record_set[_i]
if ((_test_record['POS'] + len(_test_record['REF']))
>= _pos) and (_test_record['POS'] <= _end):
_result.append(_test_record)
elif _test_record['POS'] > _end:
break # exceeded the end, stop
self.lastpos = _test_record['POS']
self.lastindex = _i
return _result
MATCH_PRIORITY = {'SITE': 0, 'CODON': 1, 'BUFFER': 2, 'NONE':3}
##
# match up cosmic records and format for printing
#
# matches are records that are within the buffer of the indicated site for the primary record
# csq are the csq info fields (may be multiple) corresponding to this record and it's alts
# the goal is to convert each of the matches to an appropriate string. The function will return
# the most severt csq match for each of the matches in the order SITE,
# CODON, BUFFER
def generate_cosmic_info(matches, csq_l, record):
if len(matches) == 0:
return ['NONE']
_result = []
for _m in sorted(matches, key = lambda _m: MATCH_PRIORITY[get_match_type(_m, csq_l, record)]):
_result.append('|'.join([get_match_type(_m,
csq_l,
record),
_m['INFO']['AA'][0],
_m['INFO']['CDS'][0],
str(_m['INFO']['CNT'][0])]))
return _result
##
# return the match type for this match and csq list
#
# match is a single record form a cosmic vcf and csq is a list of csq fields from vep
# the default of no other match is found is to report
def get_match_type(match, csq_l, record):
_sites = [get_csq_cdna_site(_csq) for _csq in csq_l]
if match['INFO']['CDS'][0] in _sites:
return 'SITE'
_codon_ranges = [
get_codon_genomic_range(
record['POS'],
_csq['Codons'],
_csq['STRAND'] == 1) for _csq in csq_l if _csq['Codons'] != '']
for _start, _end in _codon_ranges:
if match['POS'] < _end and (match['POS'] + len(match['REF']) > _start):
return 'CODON'
return 'BUFFER'
def get_csq_cdna_site(csq):
try:
return csq['HGVSc'].split(':')[1]
except:
return ''
def add_info_to_reader(reader, header):
reader.header.add_header(header)
def add_command_to_reader(reader, command):
reader.header.add_header(command)
def get_val_status(rsid, con):
resultSet = con.executeQuery("select rsid, valstatus from dbsnpvalstat where rsid = \"%s\"" % rsid)
while resultSet.next():
yield resultSet.getString("valstatus")
def generate_valstatus_info(_existing_ids, _valdata):
dbCon = getConnection("jdbc:sqlite:%s" % _valdata, JDBC_DRIVER)
con = dbCon.createStatement()
try:
_vals = [v for _id in _existing_ids for v in get_val_status(_id, con)]
_vals_set = set(_vals)
if len(_vals_set) > 0:
return "|".join(_vals_set)
else:
return '.'
finally:
con.close()
dbCon.close()
def main(args):
vcf_reader = hgsc_vcf.Reader(args.INPUTVCF)
vcf_container_cosmic = VCFContainer(
hgsc_vcf.Reader(
args.COSMICVCF),
args.buffer)
# read in the dbsnp data
# connect to the reference file
ifasta = IndexedFastaSequenceFile(File(args.reference))
add_command_to_reader(vcf_reader, '##COMMAND=<ID=annotate_vcf_cosmic.py,Params="%s">' % " ".join(sys.argv))
# add the COSMIC header info
add_info_to_reader(vcf_reader, '##INFO=<ID=COSMIC,Number=.,Type=String,Description="' +
'COSMIC info, can be one of NONE, BUFFER, CODON, SITE. ' +
'All but NONE are accompanied by AA|CDS|CNT BUFFER indicates the COSMIC site is within %(buffer)sbp of the position. example: ' +
'SITE|p.P228fs*227|c.682_683insT|3 or NONE. VCF file used was %(cosmicvcf)s.">\n' % {'buffer': str(args.buffer), 'cosmicvcf': args.COSMICVCF})
# add the context
add_info_to_reader(vcf_reader, "##INFO=<ID=CONTEXT,Number=1,Type=String,Description=\"Base context around variant. [POS - 5, POS + len(REF) + 4]\">\n")
# add the validatio status info
add_info_to_reader(vcf_reader, "##INFO=<ID=DBVS,Number=1,Type=String,Description=\"dbSNP validation status, | separated\">\n")
# get the format for the vep annotations
_vep_format = get_csq_format([h for h in vcf_reader.header.get_headers('INFO', 'CSQ')][0].fields['Description'])
vcf_writer = hgsc_vcf.Writer(args.OUTPUTVCF, vcf_reader.header)
vcf_writer.write_header()
for record in vcf_reader:
try:
## check that the position is annotated with CSQ, if not then this is a write through
if 'CSQ' in record['INFO']:
# matches are intersecting hits in the VCF
_matches = vcf_container_cosmic.intersect(record)
_csq_l = [dict(zip(_vep_format, _csq.split('|')))
for _csq in record['INFO'].get('CSQ')]
_info = generate_cosmic_info(_matches, _csq_l, record)
record['INFO']['COSMIC'] = _info
# extract the dbsnp validation rsids
_existing_ids = [_id for _csq in _csq_l for _id in _csq['Existing_variation'].split('&')]
record['INFO']['DBVS'] = [generate_valstatus_info(_existing_ids, args.DBSNPVAL)]
record['INFO']['CONTEXT'] = [str(
String(
ifasta.getSubsequenceAt(record['CHROM'], record['POS'] - 5, record['POS'] + len(record['REF']) + 4).getBases()
))]
except:
logger.exception("Error in record modification")
vcf_writer.write_record(record)
##
# returns the amino acid residue number from ...
def get_csq_hgvsp_aa_number(csq_hgvsp_s):
pass
if __name__ == '__main__':
# set up the logger
ch = logging.StreamHandler(stream=sys.stdout)
ch.setLevel(logging.DEBUG)
formatter = logging.Formatter(
'%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
logger.addHandler(ch)
import argparse
parser = argparse.ArgumentParser()
parser.add_argument(
'--reference',
type = str,
help = 'Reference file for context annotation'
)
parser.add_argument(
'--buffer',
default=10,
type=int,
help='Buffer size for matching COSMIC records, defaults to 10bp')
parser.add_argument(
'COSMICVCF',
type=argparse.FileType('r'),
help='COSMIC vcf for annotation')
parser.add_argument(
'INPUTVCF',
type=argparse.FileType('r'),
help='Input vcf to annotate')
parser.add_argument('DBSNPVAL',
type = str,
help = "dbSNP validation data in the form rsid [tab] valstatus")
parser.add_argument(
'OUTPUTVCF',
type=argparse.FileType('w'),
nargs='?',
default=sys.stdout,
help='Output file, defaults to stdout')
args = parser.parse_args()
main(args)