-
Notifications
You must be signed in to change notification settings - Fork 1
/
cluster.h
233 lines (185 loc) · 7.76 KB
/
cluster.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
#ifndef SURVEYOR_CLUSTER_H
#define SURVEYOR_CLUSTER_H
#include <iostream>
#include <atomic>
#include "sam_utils.h"
#include "config.h"
#include "htslib/sam.h"
typedef uint8_t sv_type_t;
static struct sv_types_t {
sv_type_t DEL, INV, DUP, TRA, INS, NOV;
size_t n_types = 6;
sv_types_t() : DEL(1), INV(2), DUP(3), TRA(4), INS(5), NOV(6) {}
} SV_TYPES;
std::string svt_to_str(sv_type_t svt) {
if (svt == SV_TYPES.DEL) return "DEL";
else if (svt == SV_TYPES.INV) return "INV";
else if (svt == SV_TYPES.DUP) return "DUP";
else if (svt == SV_TYPES.TRA) return "TRA";
else if (svt == SV_TYPES.INS) return "INS";
else if (svt == SV_TYPES.NOV) return "NOV";
else return "";
}
sv_type_t str_to_svt(std::string str) {
if (str == "DEL") return SV_TYPES.DEL;
else if (str == "INV") return SV_TYPES.INV;
else if (str == "DUP") return SV_TYPES.DUP;
else if (str == "TRA") return SV_TYPES.TRA;
else if (str == "INS") return SV_TYPES.INS;
else if (str == "NOV") return SV_TYPES.NOV;
else return 0;
};
sv_type_t disct_to_svt(disc_type_t dt) {
if (dt == DISC_TYPES.LI) return SV_TYPES.DEL;
else if (dt == DISC_TYPES.OW) return SV_TYPES.DUP;
else if (dt == DISC_TYPES.SI) return SV_TYPES.INS;
else return SV_TYPES.NOV;
}
struct anchor_t {
static constexpr const char* pattern = "%c:%d:%d:%d:%d";
char dir;
int contig_id, start, end;
int sc_reads;
anchor_t() {}
anchor_t(char dir, int contig_id, int start, int end, int sc_reads) : dir(dir), contig_id(contig_id), start(start),
end(end), sc_reads(sc_reads) {}
anchor_t(char* s) {
sscanf(s, pattern, &dir, &contig_id, &start, &end, &sc_reads);
}
int pos() { return dir == 'L' ? start : end; }
static bool check_clipped(anchor_t& clipped, anchor_t& other) {
if (clipped.dir == 'L') {
return clipped.pos() <= other.pos()+10;
} else {
return clipped.pos() >= other.pos()-10;
}
}
static bool can_merge(anchor_t& a1, anchor_t& a2, config_t& config) {
if (a1.contig_id != a2.contig_id || a1.dir != a2.dir) return false;
if (a1.sc_reads >= 2 && !check_clipped(a1, a2)) return false;
if (a2.sc_reads >= 2 && !check_clipped(a2, a1)) return false;
return std::max(a1.end, a2.end)-std::min(a1.start, a2.start) <= config.max_is;
}
static anchor_t merge(anchor_t& a1, anchor_t& a2) {
return anchor_t(a1.dir, a1.contig_id, std::min(a1.start, a2.start), std::max(a1.end, a2.end), a1.sc_reads+a2.sc_reads);
}
static int distance(anchor_t& a1, anchor_t& a2) {
int overlap = std::min(a1.end, a2.end) - std::max(a1.start, a2.start);
if (overlap > 0) {
int l2 = std::min(a1.size(), a2.size());
return -100*overlap/l2;
} else {
return std::min(std::abs(a1.start-a2.end), std::abs(a2.start-a1.end));
}
}
std::string to_str() {
char buffer[100];
sprintf(buffer, pattern, dir, contig_id, start, end, sc_reads);
return std::string(buffer);
}
int size() {
return end-start+1;
}
};
bool operator < (const anchor_t& a1, const anchor_t& a2) {
if (a1.start != a2.start) return a1.start < a2.start;
return a1.end < a2.end;
}
struct cluster_t {
static constexpr const char* scan_pattern = "%[^-]-%[^-]-%d";
static constexpr const char* print_pattern = "%s-%s-%d";
int id; // not necessarily set, use if needed
anchor_t a1, a2;
disc_type_t dt;
int disc_pairs;
bool dead = false;
cluster_t(const anchor_t &a1, const anchor_t &a2, disc_type_t dt, int disc_pairs) : a1(a1), a2(a2), dt(dt),
disc_pairs(disc_pairs) {}
cluster_t(std::string& s, disc_type_t dt) : dt(dt) {
char anchor1[100], anchor2[100];
sscanf(s.c_str(), scan_pattern, anchor1, anchor2, &disc_pairs);
a1 = anchor_t(anchor1);
a2 = anchor_t(anchor2);
}
static bool can_merge(cluster_t* c1, cluster_t* c2, config_t& config) {
return anchor_t::can_merge(c1->a1, c2->a1, config) && anchor_t::can_merge(c1->a2, c2->a2, config);
}
static int distance(cluster_t* c1, cluster_t* c2) {
if (c1->a1.contig_id != c2->a1.contig_id || c1->a1.dir != c2->a1.dir ||
c1->a2.contig_id != c2->a2.contig_id || c1->a2.dir != c2->a2.dir) {
return INT32_MAX;
}
return anchor_t::distance(c1->a1, c2->a1) + anchor_t::distance(c1->a2, c2->a2);
}
static cluster_t* merge(cluster_t* c1, cluster_t* c2) {
return new cluster_t(anchor_t::merge(c1->a1, c2->a1), anchor_t::merge(c1->a2, c2->a2), c1->dt,
c1->disc_pairs+c2->disc_pairs);
}
std::string to_str() {
char buffer[1000];
sprintf(buffer, print_pattern, a1.to_str().c_str(), a2.to_str().c_str(), disc_pairs);
return std::string(buffer);
}
};
struct cc_distance_t {
int distance;
cluster_t* c1,* c2;
cc_distance_t(int distance, cluster_t *c1, cluster_t *c2) : distance(distance), c1(c1), c2(c2) {}
};
bool operator < (const cc_distance_t& ccd1, const cc_distance_t& ccd2) { // reverse op for priority queue
return ccd1.distance > ccd2.distance;
}
struct breakpoint_t {
static constexpr const char* pattern = "%c:%d:%d:%d:%d:%d:%d";
char dir;
int contig_id, start, end;
int sc_reads;
int spanning_pairs = 0, spanning_reads = 0;
breakpoint_t() {}
breakpoint_t(anchor_t anchor) : dir(anchor.dir), contig_id(anchor.contig_id),
start(anchor.start), end(anchor.end), sc_reads(anchor.sc_reads) {}
breakpoint_t(char* s) {
sscanf(s, pattern, &dir, &contig_id, &start, &end, &sc_reads, &spanning_pairs, &spanning_reads);
}
int pos() { return dir == 'L' ? start : end; }
int anchor_len() { return end - start; }
std::string to_str() {
char buffer[1000];
sprintf(buffer, pattern, dir, contig_id, start, end, sc_reads, spanning_pairs, spanning_reads);
return std::string(buffer);
}
};
std::atomic<int> pred_id(0);
struct prediction_t {
static constexpr const char* scan_pattern = "%d,%[^,],%[^,],%[^,],%d,%d,%d,%lf,%lf";
static constexpr const char* print_pattern = "%d,%s,%s,%s,%d,%d,%d,%lf,%lf";
int id;
breakpoint_t bp1, bp2;
sv_type_t sv_type;
int disc_pairs;
int size = INT32_MAX, conf_ival = 0;
double pval = -1.0, shift_pval = 1.0;
prediction_t(cluster_t* c, disc_type_t dt) : id(pred_id++), bp1(c->a1), bp2(c->a2), sv_type(disct_to_svt(dt)),
disc_pairs(c->disc_pairs) {}
prediction_t(std::string& line) {
char breakpoint1[1000], breakpoint2[1000];
char svt[10];
sscanf(line.c_str(), scan_pattern, &id, breakpoint1, breakpoint2, svt, &disc_pairs, &size, &conf_ival, &pval, &shift_pval);
if (id >= pred_id) pred_id = id+1;
bp1 = breakpoint_t(breakpoint1);
bp2 = breakpoint_t(breakpoint2);
sv_type = str_to_svt(svt);
}
// len is simply the difference between the breakpoints
int len() { return bp1.contig_id == bp2.contig_id ? bp2.pos()-bp1.pos() : INT32_MAX; }
// size is: len if soft-clipped, the estimated size if not
int get_size() { return bp1.sc_reads > 0 && sv_type != SV_TYPES.INS ? len() :
(size == INT32_MAX ? 0 : size); }
std::string to_str() {
char buffer[10000];
sprintf(buffer, print_pattern, id, bp1.to_str().c_str(), bp2.to_str().c_str(), svt_to_str(sv_type).c_str(),
disc_pairs, size, conf_ival, pval, shift_pval);
return std::string(buffer);
}
};
#endif //SURVEYOR_CLUSTER_H