-
Notifications
You must be signed in to change notification settings - Fork 2
/
BamMultiReader.cpp
449 lines (377 loc) · 17.8 KB
/
BamMultiReader.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
// ***************************************************************************
// BamMultiReader.cpp (c) 2010 Erik Garrison, Derek Barnett
// Marth Lab, Department of Biology, Boston College
// All rights reserved.
// ---------------------------------------------------------------------------
// Last modified: 18 September 2010 (DB)
// ---------------------------------------------------------------------------
// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad
// Institute.
// ---------------------------------------------------------------------------
// Functionality for simultaneously reading multiple BAM files.
//
// This functionality allows applications to work on very large sets of files
// without requiring intermediate merge, sort, and index steps for each file
// subset. It also improves the performance of our merge system as it
// precludes the need to sort merged files.
// ***************************************************************************
#include <algorithm>
#include <fstream>
#include <iostream>
#include <iterator>
#include <sstream>
#include <string>
#include <vector>
#include "BGZF.h"
#include "BamMultiReader.h"
using namespace BamTools;
using namespace std;
// -----------------------------------------------------
// BamMultiReader implementation
// -----------------------------------------------------
// constructor
BamMultiReader::BamMultiReader(void)
: CurrentRefID(0)
, CurrentLeft(0)
{ }
// destructor
BamMultiReader::~BamMultiReader(void) {
Close();
}
// close the BAM files
void BamMultiReader::Close(void) {
// close all BAM readers and clean up pointers
vector<pair<BamReader*, BamAlignment*> >::iterator readerIter = readers.begin();
vector<pair<BamReader*, BamAlignment*> >::iterator readerEnd = readers.end();
for ( ; readerIter != readerEnd; ++readerIter) {
BamReader* reader = (*readerIter).first;
BamAlignment* alignment = (*readerIter).second;
// close the reader
if ( reader) reader->Close();
// delete reader pointer
delete reader;
reader = 0;
// delete alignment pointer
delete alignment;
alignment = 0;
}
// clear out the container
readers.clear();
}
// saves index data to BAM index files (".bai"/".bti") where necessary, returns success/fail
bool BamMultiReader::CreateIndexes(bool useStandardIndex) {
bool result = true;
for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) {
BamReader* reader = it->first;
result &= reader->CreateIndex(useStandardIndex);
}
return result;
}
// sets the index caching mode on the readers
void BamMultiReader::SetIndexCacheMode(const BamIndex::BamIndexCacheMode mode) {
for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) {
BamReader* reader = it->first;
reader->SetIndexCacheMode(mode);
}
}
// for debugging
void BamMultiReader::DumpAlignmentIndex(void) {
for (AlignmentIndex::const_iterator it = alignments.begin(); it != alignments.end(); ++it) {
cerr << it->first.first << ":" << it->first.second << " " << it->second.first->GetFilename() << endl;
}
}
// makes a virtual, unified header for all the bam files in the multireader
const string BamMultiReader::GetHeaderText(void) const {
string mergedHeader = "";
map<string, bool> readGroups;
// foreach extraction entry (each BAM file)
for (vector<pair<BamReader*, BamAlignment*> >::const_iterator rs = readers.begin(); rs != readers.end(); ++rs) {
BamReader* reader = rs->first;
string headerText = reader->GetHeaderText();
if ( headerText.empty() ) continue;
map<string, bool> currentFileReadGroups;
stringstream header(headerText);
vector<string> lines;
string item;
while (getline(header, item))
lines.push_back(item);
for (vector<string>::const_iterator it = lines.begin(); it != lines.end(); ++it) {
// get next line from header, skip if empty
string headerLine = *it;
if ( headerLine.empty() ) { continue; }
// if first file, save HD & SQ entries
if ( rs == readers.begin() ) {
if ( headerLine.find("@HD") == 0 || headerLine.find("@SQ") == 0) {
mergedHeader.append(headerLine.c_str());
mergedHeader.append(1, '\n');
}
}
// (for all files) append RG entries if they are unique
if ( headerLine.find("@RG") == 0 ) {
stringstream headerLineSs(headerLine);
string part, readGroupPart, readGroup;
while(std::getline(headerLineSs, part, '\t')) {
stringstream partSs(part);
string subtag;
std::getline(partSs, subtag, ':');
if (subtag == "ID") {
std::getline(partSs, readGroup, ':');
break;
}
}
if (readGroups.find(readGroup) == readGroups.end()) { // prevents duplicate @RG entries
mergedHeader.append(headerLine.c_str() );
mergedHeader.append(1, '\n');
readGroups[readGroup] = true;
currentFileReadGroups[readGroup] = true;
} else {
// warn iff we are reading one file and discover duplicated @RG tags in the header
// otherwise, we emit no warning, as we might be merging multiple BAM files with identical @RG tags
if (currentFileReadGroups.find(readGroup) != currentFileReadGroups.end()) {
cerr << "WARNING: duplicate @RG tag " << readGroup
<< " entry in header of " << reader->GetFilename() << endl;
}
}
}
}
}
// return merged header text
return mergedHeader;
}
// get next alignment among all files
bool BamMultiReader::GetNextAlignment(BamAlignment& nextAlignment) {
// bail out if we are at EOF in all files, means no more alignments to process
if (!HasOpenReaders())
return false;
// when all alignments have stepped into a new target sequence, update our
// current reference sequence id
UpdateReferenceID();
// our lowest alignment and reader will be at the front of our alignment index
BamAlignment* alignment = alignments.begin()->second.second;
BamReader* reader = alignments.begin()->second.first;
// now that we have the lowest alignment in the set, save it by copy to our argument
nextAlignment = BamAlignment(*alignment);
// remove this alignment index entry from our alignment index
alignments.erase(alignments.begin());
// and add another entry if we can get another alignment from the reader
if (reader->GetNextAlignment(*alignment)) {
alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position),
make_pair(reader, alignment)));
} else { // do nothing
//cerr << "reached end of file " << lowestReader->GetFilename() << endl;
}
return true;
}
// get next alignment among all files without parsing character data from alignments
bool BamMultiReader::GetNextAlignmentCore(BamAlignment& nextAlignment) {
// bail out if we are at EOF in all files, means no more alignments to process
if (!HasOpenReaders())
return false;
// when all alignments have stepped into a new target sequence, update our
// current reference sequence id
UpdateReferenceID();
// our lowest alignment and reader will be at the front of our alignment index
BamAlignment* alignment = alignments.begin()->second.second;
BamReader* reader = alignments.begin()->second.first;
// now that we have the lowest alignment in the set, save it by copy to our argument
nextAlignment = BamAlignment(*alignment);
//memcpy(&nextAlignment, alignment, sizeof(BamAlignment));
// remove this alignment index entry from our alignment index
alignments.erase(alignments.begin());
// and add another entry if we can get another alignment from the reader
if (reader->GetNextAlignmentCore(*alignment)) {
alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position),
make_pair(reader, alignment)));
} else { // do nothing
//cerr << "reached end of file " << lowestReader->GetFilename() << endl;
}
return true;
}
// ---------------------------------------------------------------------------------------
//
// NB: The following GetReferenceX() functions assume that we have identical
// references for all BAM files. We enforce this by invoking the above
// validation function (ValidateReaders) to verify that our reference data
// is the same across all files on Open, so we will not encounter a situation
// in which there is a mismatch and we are still live.
//
// ---------------------------------------------------------------------------------------
// returns the number of reference sequences
const int BamMultiReader::GetReferenceCount(void) const {
return readers.front().first->GetReferenceCount();
}
// returns vector of reference objects
const BamTools::RefVector BamMultiReader::GetReferenceData(void) const {
return readers.front().first->GetReferenceData();
}
// returns refID from reference name
const int BamMultiReader::GetReferenceID(const string& refName) const {
return readers.front().first->GetReferenceID(refName);
}
// ---------------------------------------------------------------------------------------
// checks if any readers still have alignments
bool BamMultiReader::HasOpenReaders() {
return alignments.size() > 0;
}
// returns whether underlying BAM readers ALL have an index loaded
// this is useful to indicate whether Jump() or SetRegion() are possible
bool BamMultiReader::IsIndexLoaded(void) const {
bool ok = true;
vector<pair<BamReader*, BamAlignment*> >::const_iterator readerIter = readers.begin();
vector<pair<BamReader*, BamAlignment*> >::const_iterator readerEnd = readers.end();
for ( ; readerIter != readerEnd; ++readerIter ) {
const BamReader* reader = (*readerIter).first;
if ( reader ) ok &= reader->IsIndexLoaded();
}
return ok;
}
// jumps to specified region(refID, leftBound) in BAM files, returns success/fail
bool BamMultiReader::Jump(int refID, int position) {
//if ( References.at(refID).RefHasAlignments && (position <= References.at(refID).RefLength) ) {
CurrentRefID = refID;
CurrentLeft = position;
bool result = true;
for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) {
BamReader* reader = it->first;
result &= reader->Jump(refID, position);
if (!result) {
cerr << "ERROR: could not jump " << reader->GetFilename() << " to " << refID << ":" << position << endl;
exit(1);
}
}
if (result) UpdateAlignments();
return result;
}
// opens BAM files
bool BamMultiReader::Open(const vector<string>& filenames, bool openIndexes, bool coreMode, bool preferStandardIndex) {
// for filename in filenames
fileNames = filenames; // save filenames in our multireader
for (vector<string>::const_iterator it = filenames.begin(); it != filenames.end(); ++it) {
const string filename = *it;
BamReader* reader = new BamReader;
bool openedOK = true;
openedOK = reader->Open(filename, "", openIndexes, preferStandardIndex);
// if file opened ok, check that it can be read
if ( openedOK ) {
bool fileOK = true;
BamAlignment* alignment = new BamAlignment;
fileOK &= ( coreMode ? reader->GetNextAlignmentCore(*alignment) : reader->GetNextAlignment(*alignment) );
if (fileOK) {
readers.push_back(make_pair(reader, alignment)); // store pointers to our readers for cleanup
alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position),
make_pair(reader, alignment)));
} else {
cerr << "WARNING: could not read first alignment in " << filename << ", ignoring file" << endl;
// if only file available & could not be read, return failure
if ( filenames.size() == 1 ) return false;
}
}
// TODO; any further error handling when openedOK is false ??
else
return false;
}
// files opened ok, at least one alignment could be read,
// now need to check that all files use same reference data
ValidateReaders();
return true;
}
void BamMultiReader::PrintFilenames(void) {
for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) {
BamReader* reader = it->first;
cout << reader->GetFilename() << endl;
}
}
// returns BAM file pointers to beginning of alignment data
bool BamMultiReader::Rewind(void) {
bool result = true;
for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) {
BamReader* reader = it->first;
result &= reader->Rewind();
}
return result;
}
bool BamMultiReader::SetRegion(const int& leftRefID, const int& leftPosition, const int& rightRefID, const int& rightPosition) {
BamRegion region(leftRefID, leftPosition, rightRefID, rightPosition);
return SetRegion(region);
}
bool BamMultiReader::SetRegion(const BamRegion& region) {
Region = region;
// NB: While it may make sense to track readers in which we can
// successfully SetRegion, In practice a failure of SetRegion means "no
// alignments here." It makes sense to simply accept the failure,
// UpdateAlignments(), and continue.
for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) {
if (!it->first->SetRegion(region)) {
cerr << "ERROR: could not jump " << it->first->GetFilename() << " to "
<< region.LeftRefID << ":" << region.LeftPosition
<< ".." << region.RightRefID << ":" << region.RightPosition << endl;
}
}
UpdateAlignments();
return true;
}
void BamMultiReader::UpdateAlignments(void) {
// Update Alignments
alignments.clear();
for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) {
BamReader* br = it->first;
BamAlignment* ba = it->second;
if (br->GetNextAlignment(*ba)) {
alignments.insert(make_pair(make_pair(ba->RefID, ba->Position),
make_pair(br, ba)));
} else {
// assume BamReader end of region / EOF
}
}
}
// updates the reference id stored in the BamMultiReader
// to reflect the current state of the readers
void BamMultiReader::UpdateReferenceID(void) {
// the alignments are sorted by position, so the first alignment will always have the lowest reference ID
if (alignments.begin()->second.second->RefID != CurrentRefID) {
// get the next reference id
// while there aren't any readers at the next ref id
// increment the ref id
int nextRefID = CurrentRefID;
while (alignments.begin()->second.second->RefID != nextRefID) {
++nextRefID;
}
//cerr << "updating reference id from " << CurrentRefID << " to " << nextRefID << endl;
CurrentRefID = nextRefID;
}
}
// ValidateReaders checks that all the readers point to BAM files representing
// alignments against the same set of reference sequences, and that the
// sequences are identically ordered. If these checks fail the operation of
// the multireader is undefined, so we force program exit.
void BamMultiReader::ValidateReaders(void) const {
int firstRefCount = readers.front().first->GetReferenceCount();
BamTools::RefVector firstRefData = readers.front().first->GetReferenceData();
for (vector<pair<BamReader*, BamAlignment*> >::const_iterator it = readers.begin(); it != readers.end(); ++it) {
BamReader* reader = it->first;
BamTools::RefVector currentRefData = reader->GetReferenceData();
BamTools::RefVector::const_iterator f = firstRefData.begin();
BamTools::RefVector::const_iterator c = currentRefData.begin();
if (reader->GetReferenceCount() != firstRefCount || firstRefData.size() != currentRefData.size()) {
cerr << "ERROR: mismatched number of references in " << reader->GetFilename()
<< " expected " << firstRefCount
<< " reference sequences but only found " << reader->GetReferenceCount() << endl;
exit(1);
}
// this will be ok; we just checked above that we have identically-sized sets of references
// here we simply check if they are all, in fact, equal in content
while (f != firstRefData.end()) {
if (f->RefName != c->RefName || f->RefLength != c->RefLength) {
cerr << "ERROR: mismatched references found in " << reader->GetFilename()
<< " expected: " << endl;
for (BamTools::RefVector::const_iterator a = firstRefData.begin(); a != firstRefData.end(); ++a)
cerr << a->RefName << " " << a->RefLength << endl;
cerr << "but found: " << endl;
for (BamTools::RefVector::const_iterator a = currentRefData.begin(); a != currentRefData.end(); ++a)
cerr << a->RefName << " " << a->RefLength << endl;
exit(1);
}
++f; ++c;
}
}
}