-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse_ExpansionHunter.py
135 lines (107 loc) · 4 KB
/
parse_ExpansionHunter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#!/usr/bin/env python3
################################################
#
# Michele Berselli
# berselli.michele@gmail.com
#
################################################
################################################
#
# Libraries
#
################################################
import sys, argparse, os
import glob
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np
from granite.lib import vcf_parser
################################################
# Functions
################################################
def get_counts(inputfile, repeat):
"""
"""
# Open input vcf
vcf = vcf_parser.Vcf(inputfile)
ID_genotype = vcf.header.IDs_genotypes[0]
# Parse and get repeat counts
for vnt_obj in vcf.parse_variants():
if vnt_obj.get_tag_value('REPID') == repeat:
# Reference allele
REF = vnt_obj.get_tag_value('REF')
# Alternate alleles
ALT = vnt_obj.ALT.replace('<STR', '').replace('>', '').split(',')
# ref + alt as integer
REF_ALT = list(map(int, [REF] + ALT))
# Get genotype
GT_0, GT_1 = map(int, vnt_obj.get_genotype_value(ID_genotype, 'GT').replace('|', '/').split('/'))
# Map genotype to alleles
# and return repeat counts
return int(REF), [REF_ALT[GT_0], REF_ALT[GT_1]]
def plot(unique, counts, filename, repeat, REF):
"""
"""
# Create Histogram
_, ax = plt.subplots(1, 1)
# Format data for plot
unique_ = list(map(str, unique))
colors = ["#8856a7" if i == REF else "#1c9099" for i in unique]
ax.bar(unique_, counts, color=colors)
# Axis and title
ax.set_title(f'{repeat} Expansion')
ax.set_ylabel('Observed Alleles in Samples')
ax.set_xlabel('Number of Repeats (per Allele)')
ax.set_xticks(unique_)
# Legend
pop_a = mpatches.Patch(color='#1c9099', label='Alternate Allele')
pop_b = mpatches.Patch(color='#8856a7', label='Reference Allele')
plt.legend(handles=[pop_a, pop_b])
# Save
plt.savefig(filename, format='png')
# plt.show()
def main(args):
# Data Points
x = []
repeat = args['repeat']
# Initialize output file
fo = open(args['outputname']+'.tsv', 'w')
fo.write(f'#Repeat: {repeat}\n')
# Loop files in inputdir
inputdir = args['inputdir']
for i, fn in enumerate(glob.glob(f'{inputdir}/*.vcf')):
# Get repeat counts for file
REF, counts = get_counts(fn, repeat)
# Add counts and write to file
x += counts
fn_ = fn.split('/')[-1]
if i == 0:
fo.write(f'#Reference: {REF}\n')
fo.write(f'#File\tAllele_1\tAllele_2\n')
fo.write(f'{fn_}\t{counts[0]}\t{counts[1]}\n')
# Close output file
fo.close()
# Data to numpy array
x = np.array(x)
# Get counts
unique, counts = np.unique(x, return_counts=True)
total = sum(counts)
# Plot
if args['histogram']:
plot(unique, counts, args['outputname']+'_histogram.png', repeat, REF)
# Counts
with open(args['outputname']+'.counts.tsv', 'w') as fc:
fc.write(f'#Repeat\tCounts\tPercentage\n')
for i, repeat in enumerate(unique):
fc.write(f'{repeat}\t{counts[i]}\t{counts[i]/total*100}\n')
################################################
# Main
################################################
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Loop through ExpansionHunter output VCF files, report and plot repeat counts for the target STR.')
parser.add_argument('-i','--inputdir', help='path to directory containing VCF files', required=True)
parser.add_argument('-r','--repeat', help='LocusId as defined in STR catalog', required=True)
parser.add_argument('-o','--outputname', help='basename for output (w/o extension)', required=True)
parser.add_argument('-p','--histogram', help='plot histogram', action='store_true', required=False)
args = vars(parser.parse_args())
main(args)