-
Notifications
You must be signed in to change notification settings - Fork 0
/
feature_index.h
130 lines (111 loc) · 3.69 KB
/
feature_index.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
//
// CRF++PIVAJ -- A CRF toolkit derived from CRF++ for
// the PIVAJ project (see http://plair.univ-rouen.fr).
//
//
// Copyright(C) 2014 Julien Lerouge <julien@lerouge.me>
// Copyright(C) 2005-2007 Taku Kudo <taku@chasen.org>
//
#ifndef CRFPP_FEATURE_INDEX_H_
#define CRFPP_FEATURE_INDEX_H_
#include <vector>
#include <map>
#include <iostream>
#include "common.h"
#include "scoped_ptr.h"
#include "feature_cache.h"
#include "path.h"
#include "node.h"
#include "freelist.h"
#include "mmap.h"
#include "darts.h"
namespace CRFPP {
class TaggerImpl;
// Allocator bundles the allocation helpers declared below: a FeatureCache,
// a FreeList<char> and arrays of FreeList<Path> / FreeList<Node>.
// Only declarations live in this header; the member functions are defined
// elsewhere, so the notes below describe the interface, not the behaviour.
class Allocator {
 public:
  // Construction and destruction. The single-argument form is explicit so
  // that a size_t is never silently converted into an Allocator.
  Allocator();
  explicit Allocator(size_t thread_num);
  virtual ~Allocator();

  // Read-only accessors.
  size_t thread_num() const;
  FeatureCache *feature_cache() const;

  // Allocation entry points. newNode/newPath take a thread_id --
  // presumably an index into the per-thread free lists below
  // (NOTE(review): confirm the valid range against the definition).
  Node *newNode(size_t thread_id);
  Path *newPath(size_t thread_id);
  // Returns a char* for the given C string; presumably a copy taken from
  // char_freelist_ -- confirm in the implementation file.
  char *strdup(const char *str);

  // Reset operations; exact scope is defined in the implementation file.
  void clear_freelist(size_t thread_id);
  void clear();

 private:
  // Shared set-up helper (presumably called by both constructors).
  void init();

  // Data members stay in their original declaration order, because that
  // order determines construction and destruction order.
  size_t thread_num_;
  scoped_ptr<FeatureCache> feature_cache_;
  scoped_ptr<FreeList<char> > char_freelist_;
  scoped_array<FreeList<Path> > path_freelist_;
  scoped_array<FreeList<Node> > node_freelist_;
};
// FeatureIndex is the abstract base of the encoder and decoder feature
// indexes. It stores the label strings (y_), the unigram/bigram template
// strings, and non-owning pointers to the weight arrays, and it declares
// the feature-building and cost-calculation entry points whose definitions
// live outside this header. Subclasses supply getID().
class FeatureIndex {
 public:
  // Version constant, taken from MODEL_VERSION (declared outside this
  // header).
  static const unsigned int version = MODEL_VERSION;

  // Returns maxid_.
  size_t size() const { return maxid_; }
  // Returns xsize_.
  size_t xsize() const { return xsize_; }
  // Number of entries in y_.
  size_t ysize() const { return y_.size(); }
  // C string of the i-th entry of y_. No bounds check is performed, so
  // i must be < ysize(). The pointer refers to storage owned by y_.
  const char* y(size_t i) const { return y_[i].c_str(); }

  // Stores the pointer only; the array itself is not copied here.
  void set_alpha(const double *alpha) { alpha_ = alpha; }
  // Pure getter. Const-qualified (like alpha() below) so it can be called
  // through a const FeatureIndex reference or pointer.
  const float *alpha_float() const { return alpha_float_; }
  const double *alpha() const { return alpha_; }

  void set_cost_factor(double cost_factor) { cost_factor_ = cost_factor; }
  double cost_factor() const { return cost_factor_; }

  // Cost computation for a node / a path; defined in the implementation
  // file.
  void calcCost(Node *node) const;
  void calcCost(Path *path) const;

  // Feature construction for a tagger; defined in the implementation file.
  // buildFeatures reports success through its bool result.
  bool buildFeatures(TaggerImpl *tagger) const;
  void rebuildFeatures(TaggerImpl *tagger) const;

  // Returns what_.str(). Left non-const on purpose: whatlog is declared
  // elsewhere and its str() may itself be non-const.
  const char* what() { return what_.str(); }

  // Zero/false-initialises every scalar member; cost_factor_ starts at 1.0.
  // The initialiser list follows the member declaration order below.
  explicit FeatureIndex(): maxid_(0), alpha_(0), alpha_float_(0),
                           cost_factor_(1.0), xsize_(0),
                           check_max_xsize_(false), max_xsize_(0) {}
  virtual ~FeatureIndex() {}

  const char *getTemplate() const;

 protected:
  // Maps a feature string to an integer id; implemented by subclasses.
  virtual int getID(const char *str) const = 0;

  // Template-expansion helpers; defined in the implementation file.
  const char *getIndex(const char *&p,
                       size_t pos,
                       const TaggerImpl &tagger) const;
  bool applyRule(string_buffer *os,
                 const char *pattern,
                 size_t pos, const TaggerImpl &tagger) const;

  // NOTE: keep the declaration order of the data members in sync with the
  // constructor's initialiser list above.
  mutable unsigned int maxid_;      // mutable: writable from const members
  const double *alpha_;             // set via set_alpha(); not copied
  const float *alpha_float_;
  double cost_factor_;
  unsigned int xsize_;
  bool check_max_xsize_;
  mutable unsigned int max_xsize_;  // mutable: writable from const members
  std::vector<std::string> unigram_templs_;
  std::vector<std::string> bigram_templs_;
  std::vector<std::string> y_;
  std::string templs_;
  whatlog what_;
};
// FeatureIndex variant with open/save/convert/shrink entry points and a
// std::map-based dictionary (dic_). All member functions are defined in
// the implementation file; the bool results presumably signal
// success/failure (confirm against the definitions).
class EncoderFeatureIndex : public FeatureIndex {
 public:
  // File-based entry points.
  bool open(const char *template_filename, const char *model_filename);
  bool convert(const char *text_filename, const char *binary_filename);
  bool save(const char *filename, bool emit_textmodelfile);

  // Takes a frequency value and an Allocator; presumably prunes entries
  // below `freq` -- NOTE(review): confirm in the implementation.
  void shrink(size_t freq, Allocator *allocator);

 private:
  // Helpers for the individual input files.
  bool openTemplate(const char *filename);
  bool openTagSet(const char *filename);

  // Implements FeatureIndex::getID for this class.
  int getID(const char *str) const;

  // Keyed by std::string, valued by an (int, unsigned int) pair. Declared
  // mutable so that const members such as getID() may update it.
  mutable std::map<std::string, std::pair<int, unsigned int> > dic_;
};
// FeatureIndex variant that can be opened from a model file or from an
// in-memory buffer, and that holds an Mmap<char> plus a Darts::DoubleArray.
// Member functions are defined in the implementation file.
class DecoderFeatureIndex : public FeatureIndex {
 public:
  // Opens a model by file name; the bool result presumably signals success.
  bool open(const char *model_filename);
  // Opens a model from `size` bytes at `buf`. Whether the buffer must
  // outlive this object is not visible here -- check the definition.
  bool openFromArray(const char *buf, size_t size);

 private:
  // Implements FeatureIndex::getID for this class.
  int getID(const char *str) const;

  // Declaration order kept as in the original (it fixes construction and
  // destruction order).
  Mmap<char> mmap_;
  Darts::DoubleArray da_;
};
}
#endif