-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathgenerate_expected_base_freq.amplicon_seq.py
executable file
·69 lines (57 loc) · 2.57 KB
/
generate_expected_base_freq.amplicon_seq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#!/usr/bin/env python3
"""
> generate_expected_base_freq.amplicon_seq.py <
Given a flat file containing all possible amplicons sequences (one per line),
script generates an expected per-base frequency table (which should correspond
to the frequencies detected by the next gen sequencer). Assumes distinct
amplicons would be present in equal proportions.
Input accepts all IUPAC bases, e.g. B, D, H, V, N.
"""
import argparse
parser = argparse.ArgumentParser(description="""
Given a flat file containing all possible amplicons sequences (one per line),
script generates an expected per-base frequency table (which should correspond
to the frequencies detected by the next gen sequencer). Assumes distinct
amplicons would be present in equal proportions.""")
parser.add_argument('amplicon_seqs', metavar='seq_file',
type=argparse.FileType('r'),
help='sequence of all amplicons (one per line).')
parser.add_argument('-t', '--transpose', action='store_true',
help='transpose output (prints horizontally, instead '
'of vertically.')
args = parser.parse_args()
IUPAC_BASES = {'-': '', '.': '',
'A': 'A', 'C': 'C', 'G': 'G', 'T': 'T',
'R': 'AG', 'Y': 'CT', 'S': 'CG', 'W': 'AT', 'K': 'GT', 'M': 'AC',
'B': 'CGT', 'D': 'AGT', 'H': 'ACT', 'V': 'ACG',
'N': 'ACGT'}
def calc_base_proportions(bases):
# convert every position as a 12-mer and concatenate them, e.g.
# 'A' --> 'AAAAAAAAAAAA'
# 'N' --> 'ACGTACGTACGT'
b = [IUPAC_BASES[x] * int(12/len(IUPAC_BASES[x])) if x not in '-.' else ''
for x in bases]
b = ''.join(b)
return [f'{b.count(x) / len(b):.2f}' for x in 'ACGT']
amplicon_seqs = []
with args.amplicon_seqs as f:
for line in f:
amplicon_seqs.append(line.strip())
# sort amplicons, from shortest to longest
amplicon_seqs = sorted(amplicon_seqs, key=len)
# generate and print output
if not args.transpose:
print ('\t' * len(amplicon_seqs), 'A\tC\tG\tT')
for n in range(len(amplicon_seqs[-1])):
pos_n_bases = [x[n] if n < len(x) else '-' for x in amplicon_seqs]
print (*pos_n_bases, *calc_base_proportions(pos_n_bases), sep='\t')
else:
for a in amplicon_seqs:
print ('', *a, sep='\t')
for b in 'ACGT':
print (b, end='')
for n in range(len(amplicon_seqs[-1])):
pos_n_bases = [x[n] if n < len(x) else '-' for x in amplicon_seqs]
print ('\t' + calc_base_proportions(pos_n_bases)['ACGT'.index(b)],
end='')
print ('')