-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcyprocessreads.pyx
205 lines (170 loc) · 6.01 KB
/
cyprocessreads.pyx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
## cython: profile=True
## cython: linetrace=True
## cython: binding=True
## distutils: define_macros=CYTHON_TRACE_NOGIL=1
# cython: wraparound=False
# cython: boundscheck=False
from __future__ import print_function
from pysam.libcalignmentfile cimport AlignmentFile
from pysam.libcalignedsegment cimport AlignedSegment
import util as ut
cimport cyutil as cut
cimport numpy as np
import sys
import numpy as np
from pysam.libchtslib cimport *
from pysam.libcsamfile cimport *
from cylocobs cimport LocObs
from cyregcov cimport RegCov
from cyrowmaker cimport CyCovariateRowMaker
from collections import defaultdict
import gc
cdef inline int get_base_idx(char obsbase):
    # Map an observed base character to its index in A,C,G,T order.
    # Any other character (e.g. an ambiguity code) maps to -1.
    if obsbase == 'A':
        return 0
    elif obsbase == 'C':
        return 1
    elif obsbase == 'G':
        return 2
    elif obsbase == 'T':
        return 3
    else:
        return -1
'''
These two functions are adapted from pysam's libcalignedsegment.pyx
(they are not exported in pysam's libcalignedsegment.pxd, unfortunately).
Check pysam's license before redistribution.
'''
def add_observations(
        AlignedSegment read,
        int mapq,
        int min_bq,
        int context_len,
        CyCovariateRowMaker rm,
        bytes bam_fn,
        bytes ref,
        bytes consensus,
        covariate_matrix,
        batch_locobs,
        major_alleles,
        int min_refpos,
        int max_refpos):
    """
    Tally base observations from a single aligned read.

    For every aligned (query, reference) position pair whose reference
    coordinate lies in [min_refpos, max_refpos] and whose base quality is
    at least min_bq, build a covariate row via ``rm.get_covariate_row``,
    register it in ``covariate_matrix``, and record the observed base in
    the per-position ``LocObs`` stored in
    ``batch_locobs[refpos][reverse][readnum-1]``.

    Reverse-strand reads are handled in read orientation: the sequence
    context, observed base, and consensus base are all complemented, and
    the distance-to-end is measured from the read's far end.

    Mutates ``covariate_matrix`` and ``batch_locobs`` in place; returns None.
    """
    cdef:
        bytes context
        int readlen, readnum, qpos, q, dend, refpos, cov_idx, base_idx
        bint reverse
        RegCov cov
        LocObs loc
    #cdef unsigned char[:] qualities = read.query_qualities
    # Raw pointer into the BAM record's quality array (avoids a Python copy).
    cdef uint8_t *qualities = pysam_bam_get_qual(read._delegate)
    cdef bytes bseq = read.seq
    cdef char *cseq = bseq
    cdef char obsbase, consbase
    cdef char *cconsensus = consensus
    # NOTE(review): read.alen is the *aligned reference* length, yet it is
    # compared against query positions (qpos) below — verify this should
    # not be read.query_length for reads with indels/clipping.
    readlen = read.alen # aligned length
    reverse = read.is_reverse
    readnum = read.is_read2 + 1
    # Matched (query_pos, ref_pos) pairs only; insertions/deletions skipped.
    cdef np.ndarray[np.uint32_t,ndim=2] al_pairs_np = np.array(read.get_aligned_pairs(matches_only = True), dtype = np.uint32)
    cdef uint32_t [:,:] al_pairs = al_pairs_np
    cdef uint32_t i, j
    for i in range(al_pairs_np.shape[0]):
        qpos = al_pairs[i,0]
        refpos = al_pairs[i,1]
        # Only count positions inside this batch's window.
        if refpos < min_refpos or refpos > max_refpos:
            continue
        q = qualities[qpos]
        if q < min_bq:
            continue
        if reverse:
            # Need context_len bases *after* qpos (read orientation is flipped).
            if qpos >= readlen-context_len:
                continue
            #context = cut.comp(bseq[qpos+1:qpos+1+context_len])
            # Reverse-complemented upstream context in read orientation.
            context = cut.comp(bseq[qpos+context_len:qpos:-1])
            #print('rev context:', context, 'previously', cut.comp(bseq[qpos+1:qpos+1+context_len]))
            if 'N' in context:
                continue
            obsbase = cut.comp1(cseq[qpos])
            consbase = cut.comp1(cconsensus[refpos])
            # Distance from the 3' end of the alignment in read orientation.
            dend = readlen - qpos
        else:
            # Need context_len bases *before* qpos.
            if qpos < context_len:
                continue
            context = bseq[qpos-context_len:qpos]
            if 'N' in context:
                continue
            obsbase = cseq[qpos]
            consbase = cconsensus[refpos]
            dend = qpos
        if obsbase == 'N' or consbase == 'N':
            continue
        row = rm.get_covariate_row(q, mapq, context, dend, refpos, bam_fn,
                reverse)
        # Skip positions whose major allele is undetermined.
        major = major_alleles[refpos][0]
        if major == 'N':
            continue
        if reverse:
            # NOTE(review): the complemented value is never used afterwards —
            # dead store, or a missing use downstream? Confirm intent.
            major = cut.comp(major)
        # base_idx is -1 for non-ACGT bases other than 'N' (e.g. IUPAC codes);
        # presumably downstream set_default/add_obs tolerate that — verify.
        base_idx = get_base_idx(obsbase)
        cov_idx = covariate_matrix.set_default(row, base_idx)
        loc = batch_locobs[refpos][reverse][readnum-1]
        loc.add_obs(cov_idx, base_idx)
def add_bam_observations(
        AlignmentFile bam,
        bytes ref,
        int reflen,
        int min_bq,
        int min_mq,
        int context_len,
        CyCovariateRowMaker rm,
        bytes bam_fn,
        bytes consensus,
        covariate_matrix,
        h5lo_bam,
        h5file,
        major_alleles,
        int update_interval = 1000,
        read_batch_size = 500,
        max_num_reads = -1):
    """
    Walk one BAM file along reference ``ref`` and accumulate per-position
    observations into the HDF5 group ``h5lo_bam``.

    The reference is processed in ``read_batch_size``-bp windows: all reads
    overlapping a window are fetched, observations are clamped to the
    window via add_observations' [min_refpos, max_refpos] arguments (so a
    read spanning two windows contributes each position exactly once), and
    the window's LocObs counts are flushed to HDF5 before moving on.

    A progress line is printed to stderr every ``update_interval`` reads.
    With the default ``max_num_reads = -1`` the early-exit check never
    fires and all reads are processed.
    """
    cdef:
        AlignedSegment read
        int mapq, i
    # locobs are built read by read, not position by position.
    # Reads are added to h5lo in read_batch_size-bp windows, using fetch.
    # NOTE(review): an earlier comment mentioned a dict of already-processed
    # reads to skip duplicates; no such dict exists here. Deduplication is
    # presumably achieved by the per-window refpos clamp instead — but reads
    # spanning a window boundary are still *counted* twice in the progress
    # counter i (and against max_num_reads). Confirm this is acceptable.
    i = 0
    for start_bp in range(0, reflen, read_batch_size):
        end_bp = start_bp + read_batch_size
        # Per-window observations, keyed by refpos:
        # batch_locobs[refpos][is_reverse][readnum-1] -> LocObs
        batch_locobs = defaultdict(lambda: ((LocObs(), LocObs()), (LocObs(), LocObs())))
        for read in bam.fetch(contig = ref, start = start_bp, end = end_bp):
            if i % update_interval == 0:
                print('{}, {}: processed {} of {} reads'.format(
                    bam_fn, ref, i, bam.mapped), file = sys.stderr)
            i += 1
            if i == max_num_reads:
                break
            mapq = read.mapping_quality
            if mapq < min_mq:
                continue
            add_observations(read, mapq, min_bq, context_len, rm,
                    bam_fn, ref, consensus, covariate_matrix,
                    batch_locobs, major_alleles, start_bp, end_bp-1)
        add_batch_locobs(batch_locobs, h5lo_bam)
        del batch_locobs
        # try to reduce memory usage?
        h5file.flush()
        gc.collect()
        if i == max_num_reads:
            break
def add_batch_locobs(batch_locobs, h5lo_bam):
    """
    Write one window's worth of LocObs counts into the HDF5 group h5lo_bam.

    For each reference position ``loc_idx`` a subgroup named str(loc_idx)
    is created with four uint32 datasets: 'f1'/'f2' (forward strand,
    read 1/2) and 'r1'/'r2' (reverse strand, read 1/2), taken from
    ``batch_locobs[loc_idx][is_reverse][readnum-1].counts()``.

    Raises ValueError if a position's group already exists in h5lo_bam.
    """
    # .items() instead of the Python-2-only .iteritems(), which fails
    # under Python 3 / Cython language_level 3.
    for loc_idx, loc_obs in batch_locobs.items():
        key = str(loc_idx)
        # Guard against double-writing a position. (The original tested
        # membership in the *newly created, empty* subgroup, so it could
        # never fire; h5py would then raise on create_group anyway.)
        if key in h5lo_bam.keys():
            raise ValueError('already in hdf5 file!')
        h5lo_loc = h5lo_bam.create_group(key)
        datasets = (('f1', loc_obs[0][0]), ('f2', loc_obs[0][1]),
                    ('r1', loc_obs[1][0]), ('r2', loc_obs[1][1]))
        for name, obs in datasets:
            dat = obs.counts().astype(np.uint32)
            h5lo_loc.create_dataset(name, dtype = np.uint32, data = dat)