-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathvcf_parser_for_svhound.py
342 lines (291 loc) · 11.8 KB
/
vcf_parser_for_svhound.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
#!/usr/bin/env python2
# Script parse vcf files to annotate the number of disitinct
# SV alleles for a defined window size (bp). It takes the data
# stream from the standard input.
# Getting user specified arguments:
# USAGE: cat sv_file.vcf | python2 sv_vcf_parser.py <window size>
# EXAMPLES: zcat 1000genome_hg19.vcf.gz | python2 sv_vcf_parser.py 50000
# zcat 1000genome_hg38.vcf.gz | python2 sv_vcf_parser.py 50000
# important imports
import sys
from datetime import datetime
import string
import random
# error output
def error_message(text):
print "[ERROR] %s" % (text)
# incrementor class
class incrementor(object):
def __init__(self, starting_value=None):
if starting_value is None:
self.value = 0
else:
self.value = starting_value
def increment(self):
self.value += 1
def current_value(self):
return self.value
# vcf line class
class vcf_line(object):
# python 101 info:
# python array is [start:not_including]
# [:until] from start
# [from:] until end
# init
def __init__(self, input_line):
VCF_MANDATORY_FIELDS = 9
tab_sep_fields = input_line.split("\t")
# vcf files have 9 fields + the indiviuals data
if len(tab_sep_fields) > VCF_MANDATORY_FIELDS:
#extract mandatory fileds
[self.CHROM, POS, self.ID, REF, ALT, QUAL, self.FILTER, INFO, FORMAT] = tab_sep_fields[:VCF_MANDATORY_FIELDS]
# position is integer
self.POS = int(POS)
# pID is a chromosom-position unique id
self.pID = "%s:%s" % (self.CHROM, self.POS)
# quality is float or '.' when not reported
# we change '.' for -1
if (QUAL != "."):
self.QUAL = float(QUAL)
else:
self.QUAL = -1.0
# allele (genotype), unique ones and number of appearance
# for unique allels
self.GENOTYPE_UNIQ = {}
self.GENOTYPE_COUNT = {}
self.GENOTYPE_1000 = []
# gives an unique numeric.id to the allele
g = incrementor()
# FORMAT index of GT
split_format = FORMAT.split(":")
use_format = False
# Multiple fields
if len(split_format) > 1:
use_format = True
gt_format_index = split_format.index("GT") # GT == genotype
#ty_format_index = split_format.index("TY") # TY == type
#co_format_index = split_format.index("CO") # CO == coordinates
# indiviuals in the vcf file
for gt in tab_sep_fields[VCF_MANDATORY_FIELDS:]:
# use FORMAT duhh!!!
if use_format:
sv_gt = gt.split(":")[gt_format_index]
if sv_gt == "./.":
# use ./. as 0/0
gt = "0/0"
# use ./. with type and coordinate for uniqueness
# gt = "|".join([
# gt.split(":")[ty_format_index],
# gt.split(":")[co_format_index]
# ])
else:
gt = sv_gt
if gt not in self.GENOTYPE_UNIQ:
self.GENOTYPE_UNIQ[gt] = g.current_value()
self.GENOTYPE_COUNT[gt] = 1
self.GENOTYPE_1000.append(g.current_value())
g.increment()
else:
self.GENOTYPE_1000.append(self.GENOTYPE_UNIQ[gt])
self.GENOTYPE_COUNT[gt] += 1
self.ERROR = False
else:
# not in the vcf format, used to create empty objects
# and get errors
self.ERROR = True
self.CHROM = ""
self.POS = 0
self.pID = ""
self.ID = ""
self.FILTER = ""
self.QUAL = 0.0
self.GENOTYPE_UNIQ = {}
self.GENOTYPE_COUNT = {}
self.GENOTYPE_1000 = []
self.MERGE_SV = None
# functions used to edit the values of each field of the object
def not_error(self):
self.ERROR = False
def add_chromosome(self, chromosome):
self.CHROM = chromosome
def add_position(self, position):
self.POS = position
def add_pID(self, pID):
self.pID = pID
def add_id(self, id_):
self.ID = id_
def add_quality(self, qual):
self.QUAL = qual
def add_genotype_uniq(self, geno_uniq):
self.GENOTYPE_UNIQ = geno_uniq
def add_genotype_count(self, geno_count):
self.GENOTYPE_COUNT = geno_count
def add_genotype_list(self, geno_list):
self.GENOTYPE_1000 = geno_list
def add_genotype_merge(self, merge_list):
self.MERGE_SV = merge_list
# def add_filter(self, filter):
# self.FILTER = filter
# functions to print sv details
def sv_print(self):
print "%s\t%s" % (self.pID, "\t".join([str(g) for g in self.GENOTYPE_1000]))
# merge the allele of two SV per individual
def merge_sv_alleles(sv_list):
def mean(l):
return(sum(l)/float(len(l)))
# SV class properties
# self.CHROM is STRING
# self.POS is INT
# self.ID is STRING
# self.FILTER is STRING
# self.QUAL is FLOAT
# self.GENOTYPE_UNIQ is DICT
# self.GENOTYPE_COUNT is DICT
# self.GENOTYPE_1000 is LIST
# self.MERGE_SV is LIST
# self.ERROR is BOOL
FIRST_ELEM = 0
# create new sv object
sv_merge = vcf_line("")
# we expect/assume the same chromosome
# we take all redundant from the first
# object in the list
sv_merge.add_chromosome(sv_list[FIRST_ELEM].CHROM)
sv_merge.add_position(sv_list[FIRST_ELEM].CHROM)
sv_merge.add_pID(sv_list[FIRST_ELEM].pID)
sv_merge.add_id(",".join([sv.ID for sv in sv_list]))
sv_merge.add_quality(mean([sv.QUAL for sv in sv_list]))
l = len(sv_list[FIRST_ELEM].GENOTYPE_1000)
sv_merge.add_genotype_merge([ ",".join(str(g.GENOTYPE_1000[i]) for g in sv_list) for i in range(l) ])
# TODO: make it cleaner
d = {}
dc = {}
svl = []
g = -1
print(sv_merge.MERGE_SV)
sys.exit()
for gt in sv_merge.MERGE_SV:
if gt not in d:
g += 1
d[gt] = g
dc[gt] = 1
svl.append(g)
else:
dc[gt] += 1
svl.append(g)
# add important: alleles
sv_merge.add_genotype_list(svl)
sv_merge.add_genotype_uniq(d)
sv_merge.add_genotype_count(dc)
return sv_merge
usage = """
USAGE: cat sv_file.vcf | python2 sv_vcf_parser.py <window size>
EXAMPLE: zcat 1000genome_sample_hg19.vcf.gz | python2 sv_vcf_parser.py 50000
"""
# main
if __name__ == '__main__':
# #########################
# used once
print_header = True
# did the user specified a window size
if len(sys.argv) != 2:
print usage
sys.exit("Program halt. Wrong number of parameters")
[script_name, analysis_window_size] = sys.argv
analysis_window_size = int(analysis_window_size)
# Open stats file ##########
def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
return ''.join(random.choice(chars) for _ in range(size))
dateTimeObj = datetime.now()
stats_file = open("%s-%04dkb-%s-%s.txt" % ("svhound-stats", analysis_window_size/1000, id_generator(8), dateTimeObj.strftime("%Y%m%d_%H%M%S")), "w")
stats_file.write("ID\tnumber of SVs\n")
# temporary working variables
rm_duplicates = {}
sv_within_window = []
current_chromosome = ""
current_position = -1
last_print = {}
# get the input from stdin
for line in sys.stdin:
# skip comments
if not line.startswith("#"):
# use vcf class
sv = vcf_line(line.rstrip("\n"))
# check that the vcf line has the proper format
if not sv.ERROR:
# single usage to pront the table header
if print_header:
print "\t" + "\t".join(["i%04d"%(i+1) for i in range(len(sv.GENOTYPE_1000))])
print_header = False
# check chromosome and position
# uninitialized
if current_chromosome == "":
current_chromosome = sv.CHROM
# different chromosome
if current_chromosome != sv.CHROM:
# print current result
if len(sv_within_window) > 1:
sv_merge = merge_sv_alleles(sv_within_window)
if sv_merge.pID not in last_print:
sv_merge.sv_print()
last_print[sv_merge.pID] = 1
else:
[sv_merge] = sv_within_window
if sv_merge not in last_print:
sv_merge.sv_print()
last_print[sv_merge.pID] = 1
# init new chromosome
sv_within_window = []
current_chromosome = sv.CHROM
current_position = sv.POS
sv_within_window.append(sv)
# same chromosome
else:
# make a dictionary to remove duplicated lines
if sv.pID not in rm_duplicates:
rm_duplicates[sv.pID] = 1
# check the position of the SV
# if -1 (un-initialized) then the
# curren position is the the sv position
if current_position == -1:
current_position = sv.POS
sv_within_window.append(sv)
# in next sv if closer than the given
# window size, we merge both
else:
if sv.POS - current_position <= analysis_window_size:
sv_within_window.append(sv)
# next sv is father appart from window size
# print results and init
else:
if len(sv_within_window) > 1:
sv_merge = merge_sv_alleles(sv_within_window)
if sv_merge.pID not in last_print:
sv_merge.sv_print()
stats_file.write("%s\t%s\n" % (sv_merge.pID, len(sv_within_window)))
last_print[sv_merge.pID] = 1
else:
[sv_merge] = sv_within_window
if sv_merge.pID not in last_print:
sv_merge.sv_print()
stats_file.write("%s\t%s\n" % (sv_merge.pID, len(sv_within_window)))
last_print[sv_merge.pID] = 1
# init
sv_within_window = []
current_position = sv.POS
sv_within_window.append(sv)
# on final print for the last line
if len(sv_within_window) > 1:
sv_merge = merge_sv_alleles(sv_within_window)
if sv_merge.pID not in last_print:
sv_merge.sv_print()
stats_file.write("%s\t%s\n" % (sv_merge.pID, len(sv_within_window)))
last_print[sv_merge.pID] = 1
else:
if len(sv_within_window) == 1:
[sv_merge] = sv_within_window
if sv_merge.pID not in last_print:
sv_merge.sv_print()
stats_file.write("%s\t%s\n" % (sv_merge.pID, len(sv_within_window)))
last_print[sv_merge.pID] = 1
stats_file.close()