-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathbam_cstag.py
79 lines (72 loc) · 2.24 KB
/
bam_cstag.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import pysam
import re
import argparse
parser = argparse.ArgumentParser(description='simple arguments')
parser.add_argument(
'--fasta',
'-f',
action="store",
dest="fasta",
help='The input fasta reference file.',
)
parser.add_argument(
'--bam',
'-b',
action="store",
dest="bam",
help='The input bam file',
)
parser.add_argument(
'--out',
'-o',
action="store",
dest="out",
help='The output sam file',
)
args = parser.parse_args()
# bam = "HG00741.maternal.CHM13Y_EBV.bam"
# out = "HG00741.maternal.CHM13Y_EBV.cslong.sam"
# fasta = "/taproom/data/xiaoyu/genomes/chm13chrY_EBV.fasta"
bam = pysam.AlignmentFile(args.bam, "rb")
outfile = pysam.AlignmentFile(args.out, "w", template=bam)
fa = pysam.FastaFile(args.fasta)
for read in bam.fetch():
short_cs = read.get_tag('cs')
blocks = read.get_blocks()
chr = read.reference_name
start = read.reference_start
end = read.reference_end
seq = fa.fetch(reference=chr, start=start, end=end)
cs_array = re.split(r'([\:\-\+\*])', short_cs)
cs_array.pop(0)
symbol_list = [cs_array[i] for i in range(0, len(cs_array), 2)]
value_list = [cs_array[i] for i in range(1, len(cs_array), 2)]
curr_start = start
cs_list = [{'symbol': key, 'value': value} for key, value in zip(symbol_list, value_list)]
# print(short_cs)
for i in cs_list:
if i['symbol'] == ':':
ref_length = int(i['value'])
item_seq = fa.fetch(reference=chr, start=curr_start, end=curr_start + ref_length)
i['symbol'] = '='
i['value'] = item_seq
elif i['symbol'] == '-':
ref_length = len(i['value'])
# test_seq = fa.fetch(reference=chr, start=curr_start, end=curr_start + ref_length)
# print(test_seq)
elif i['symbol'] == '+':
ref_length = 0
else:
ref_length = len(i['value']) / 2
# test_seq = fa.fetch(reference=chr, start=curr_start, end=curr_start + ref_length)
# print(test_seq)
curr_start += ref_length
long_cs = ''
for i in cs_list:
long_cs += i['symbol']
long_cs += i['value']
read.set_tag('cs', long_cs)
outfile.write(read)
outfile.close()
bam.close()
fa.close()