forked from HIITMetagenomics/dsm-framework
-
Notifications
You must be signed in to change notification settings - Fork 0
/
TextCollectionBuilder.cpp
152 lines (132 loc) · 4.31 KB
/
TextCollectionBuilder.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#include "rlcsa_builder.h"
#include "TextCollectionBuilder.h"
#include "FMIndex.h"
//#include "rlcsa_wrapper.h"
#include <vector>
using std::vector;
#include <string>
using std::string;
// Private state of TextCollectionBuilder (pimpl idiom): keeps the
// CSA::RLCSABuilder pointer out of the public header.
struct TCBuilderRep
{
    // --- Configuration fixed by the constructor ---

    // Kind of index that InitTextCollection() is asked to produce.
    TextCollection::IndexType type;
    // Sample rate in effect (the constructor replaces 0 with the default).
    unsigned samplerate;

    // --- Builder state ---

    // Underlying RLCSA builder. Deleted by the destructor, or earlier by
    // InitTextCollection(), which then resets the pointer to 0.
    CSA::RLCSABuilder * sa;
    // Cleared by InitTextCollection(); InsertText() refuses to run afterwards.
    bool insertAllowed;

    // --- Running statistics updated by InsertText() ---

    // Sum of the inserted text lengths, counting one terminator per text.
    ulong n;
    // Total number of texts in the collection
    unsigned numberOfTexts;
    // Length of the longest text (terminator included)
    ulong maxTextLength;
    // Sum over all texts of (text length / samplerate), integer division.
    ulong numberOfSamples;

    // Names supplied through the two-argument InsertText() overload,
    // in insertion order.
    std::vector<std::string> name;
};
/**
 * Init text collection builder.
 *
 * Allocates the private state and the underlying CSA::RLCSABuilder.
 *
 * @param samplerate            Sample rate to use; 0 selects
 *                              TEXTCOLLECTION_DEFAULT_SAMPLERATE.
 * @param estimatedInputLength  Estimated total input length. Values below
 *                              TEXTCOLLECTION_DEFAULT_INPUT_LENGTH are raised
 *                              to that constant; one tenth of the result is
 *                              handed to the RLCSA builder as its buffer size.
 * @param type                  Index type that InitTextCollection() will build.
 */
TextCollectionBuilder::TextCollectionBuilder(unsigned samplerate, ulong estimatedInputLength, TextCollection::IndexType type)
    : p_(new struct TCBuilderRep())
{
    p_->type = type;
    p_->sa = 0;

    p_->samplerate = samplerate;
    if (samplerate == 0)
        p_->samplerate = TEXTCOLLECTION_DEFAULT_SAMPLERATE;

    // Initialise every counter explicitly. maxTextLength in particular is
    // compared against in InsertText(), so it must not depend on the
    // value-initialisation performed by the "()" in the new-expression above.
    p_->n = 0;
    p_->numberOfTexts = 0;
    p_->maxTextLength = 0;
    p_->numberOfSamples = 0;
    p_->insertAllowed = true;

    // The RLCSA builder receives a non-zero sample rate only when an RLCSA
    // index was requested; for any other type it is built without samples.
    // NOTE(review): an earlier comment here quoted block sizes of 8 bytes
    // (FM-index) and 32 bytes (RLCSA), but the code passes
    // CSA::RLCSA_BLOCK_SIZE.second for every index type -- confirm intent.
    CSA::usint rlcsa_block_size = CSA::RLCSA_BLOCK_SIZE.second;
    CSA::usint rlcsa_sample_rate = 0;
    if (type == TextCollection::TYPE_RLCSA)
        rlcsa_sample_rate = p_->samplerate;

    // Enforce a lower bound on the buffer size (n/10 bytes).
    // NOTE(review): the original comment states "at least 15MB"; that figure
    // depends on TEXTCOLLECTION_DEFAULT_INPUT_LENGTH -- verify.
    if (estimatedInputLength < TEXTCOLLECTION_DEFAULT_INPUT_LENGTH)
        estimatedInputLength = TEXTCOLLECTION_DEFAULT_INPUT_LENGTH;

    p_->sa = new CSA::RLCSABuilder(rlcsa_block_size, rlcsa_sample_rate, estimatedInputLength/10);
    assert(p_->sa->isOk());
}
/**
 * Release the RLCSA builder (if still present) and the private state.
 */
TextCollectionBuilder::~TextCollectionBuilder()
{
    TCBuilderRep * const rep = p_;
    // InitTextCollection() may already have deleted the builder and reset
    // the pointer to 0; deleting a null pointer has no effect.
    delete rep->sa;
    delete rep;
}
/**
 * Append one zero-terminated text to the collection under construction.
 *
 * Terminates the process with exit(1) when called after
 * InitTextCollection(), or when the text is empty.
 *
 * @param text  Zero-terminated byte string to insert.
 */
void TextCollectionBuilder::InsertText(uchar const * text)
{
    // Guard: the builder is frozen once InitTextCollection() has run.
    if (!p_->insertAllowed)
    {
        std::cerr << "TextCollectionBuilder::InsertText() error: new text can not be inserted after InitTextCollection() call!" << std::endl;
        exit(1);
    }

    // Length of the text including its terminating zero byte.
    TextCollection::TextPosition lengthWithEnd = std::strlen(reinterpret_cast<char const *>(text)) + 1;

    // Store length of the longest text seen so far.
    if (p_->maxTextLength < lengthWithEnd)
        p_->maxTextLength = lengthWithEnd;

    // Guard: a length of 1 means only the terminator is present.
    if (!(lengthWithEnd > 1))
    {
        // FIXME indexing empty texts
        std::cerr << "TextCollectionBuilder::InsertText() error: can not index empty texts!" << std::endl;
        exit(1);
    }

    // Update the running statistics, then hand the text (without its
    // terminator) to the RLCSA builder.
    p_->n += lengthWithEnd;
    p_->numberOfTexts ++;
    p_->numberOfSamples += (lengthWithEnd - 1) / p_->samplerate;

    char * const raw = const_cast<char *>(reinterpret_cast<char const *>(text));
    p_->sa->insertSequence(raw, lengthWithEnd - 1, 0);
    assert(p_->sa->isOk());
}
/**
 * Append one named, zero-terminated text to the collection.
 *
 * The name is recorded first and the text is then inserted through the
 * single-argument overload. Names are stored only by this overload, so the
 * name list stays aligned with the texts only if every text is inserted
 * through it.
 *
 * @param text  Zero-terminated byte string to insert.
 * @param name  Label stored alongside the text.
 */
void TextCollectionBuilder::InsertText(uchar const * text, string const &name)
{
    std::vector<std::string> & labels = p_->name;
    labels.push_back(name);
    this->InsertText(text);
}
/**
 * Finish building and create the text collection index.
 *
 * After this call no further texts can be inserted. The call consumes the
 * underlying RLCSA builder, so it may be made only once per
 * TextCollectionBuilder; a repeated call prints an error and terminates the
 * process with exit(1).
 *
 * For TYPE_FMINDEX the BWT is fetched from the RLCSA builder (or a
 * one-symbol BWT is synthesised when no text was inserted) and passed to the
 * FMIndex constructor. TYPE_RLCSA is currently unsupported and aborts; any
 * other type terminates the process with exit(2).
 *
 * @param storePlainText  Forwarded unchanged to the FMIndex constructor.
 * @param color           Forwarded unchanged to the FMIndex constructor.
 * @param rotationLength  Forwarded unchanged to the FMIndex constructor.
 * @return Pointer to the newly allocated index.
 *         NOTE(review): presumably owned by the caller -- confirm.
 */
TextCollection * TextCollectionBuilder::InitTextCollection(bool storePlainText, bool color, unsigned rotationLength)
{
    p_->insertAllowed = false; // Disable future insertions

    // The RLCSA builder is released below once its BWT has been extracted.
    // A null pointer therefore means this method has already run; fail
    // loudly instead of dereferencing it.
    if (p_->sa == 0)
    {
        std::cerr << "TextCollectionBuilder::InitTextCollection() error: the text collection has already been initialized!" << std::endl;
        exit(1);
    }

    TextCollection *result = 0;
    switch(p_->type)
    {
    case(TextCollection::TYPE_FMINDEX):
        {
            uchar * bwt = 0;
            CSA::usint length = 0;
            if (p_->numberOfTexts == 0)
            {
                // Nothing was inserted: represent the collection as one
                // empty text whose BWT is a single zero byte.
                p_->numberOfTexts ++; // Add one empty text
                bwt = new uchar[2];
                bwt[0] = '\0';
                bwt[1] = '\0';
                length = 1;
                p_->maxTextLength = 1;
            }
            else
            {
                std::cerr << "calling rlcsa::getbwt().." << std::endl;
                bwt = (uchar *)p_->sa->getBWT(length);
                std::cerr << "rlcsa::getbwt() successful!" << std::endl;
                assert(length == p_->n);
            }

            // The builder is no longer needed on either path. Release it
            // before constructing the index, and mark it as consumed so a
            // repeated call is caught by the guard above.
            delete p_->sa;
            p_->sa = 0;

            // NOTE(review): bwt is not freed here; presumably FMIndex takes
            // ownership of the buffer -- confirm against FMIndex.
            std::cerr << "calling fmindex constructor.." << std::endl;
            result = new FMIndex(bwt, (ulong)length, p_->samplerate, p_->numberOfTexts, p_->maxTextLength,
                                 p_->numberOfSamples, p_->name, storePlainText, color, rotationLength);
            std::cerr << "fmindex constructor successful.." << std::endl;
            break;
        }
    case(TextCollection::TYPE_RLCSA):
        //result = new RLCSAWrapper(p_->sa->getRLCSA());
        //delete p_->sa;
        //p_->sa = 0;
        std::cerr << "TextCollectionBuilder::InitTextCollection(): currently unsupported!" << std::endl;
        std::abort();
        break;
    default:
        std::cerr << "TextCollectionBuilder::InitTextCollection(): invalid index type!" << std::endl;
        exit(2);
        break;
    }
    return result;
}