16
16
17
17
namespace rime {
18
18
19
+ // internal data structure used during the sentence making process.
20
+ // the output line of the algorithm is transformed to an<Sentence>.
21
+ struct Line {
22
+ // be sure the pointer to predecessor Line object is stable. it works since
23
+ // pointer to values stored in std::map and std::unordered_map are stable.
24
+ const Line* predecessor;
25
+ // as long as the word graph lives, pointers to entries are valid.
26
+ const DictEntry* entry;
27
+ size_t end_pos;
28
+ double weight;
29
+
30
+ static const Line kEmpty ;
31
+
32
+ bool empty () const {
33
+ return !predecessor && !entry;
34
+ }
35
+
36
+ string last_word () const {
37
+ return entry ? entry->text : string ();
38
+ }
39
+
40
+ struct Components {
41
+ vector<const Line*> lines;
42
+
43
+ Components (const Line* line) {
44
+ for (const Line* cursor = line;
45
+ !cursor->empty ();
46
+ cursor = cursor->predecessor ) {
47
+ lines.push_back (cursor);
48
+ }
49
+ }
50
+
51
+ decltype (lines.crbegin()) begin() const { return lines.crbegin (); }
52
+ decltype (lines.crend()) end() const { return lines.crend (); }
53
+ };
54
+
55
+ Components components () const { return Components (this ); }
56
+
57
+ string context () const {
58
+ // look back 2 words
59
+ return empty () ? string () :
60
+ !predecessor || predecessor->empty () ? last_word () :
61
+ predecessor->last_word () + last_word ();
62
+ }
63
+
64
+ vector<size_t > word_lengths () const {
65
+ vector<size_t > lengths;
66
+ size_t last_end_pos = 0 ;
67
+ for (const auto * c : components ()) {
68
+ lengths.push_back (c->end_pos - last_end_pos);
69
+ last_end_pos = c->end_pos ;
70
+ }
71
+ return lengths;
72
+ }
73
+ };
74
+
75
+ const Line Line::kEmpty {nullptr , nullptr , 0 , 0.0 };
76
+
19
77
inline static Grammar* create_grammar (Config* config) {
20
78
if (auto * grammar = Grammar::Require (" grammar" )) {
21
79
return grammar->Create (config);
@@ -30,102 +88,103 @@ Poet::Poet(const Language* language, Config* config, Compare compare)
30
88
31
89
// Out-of-line destructor definition; the body is intentionally empty.
Poet::~Poet () {}
32
90
33
- bool Poet::LeftAssociateCompare (const Sentence& one, const Sentence& other) {
34
- return one.weight () < other.weight () || ( // left associate if even
35
- one.weight () == other.weight () && (
36
- one.size () > other.size () || ( // less components is more favorable
37
- one.size () == other.size () &&
38
- std::lexicographical_compare (one.syllable_lengths ().begin (),
39
- one.syllable_lengths ().end (),
40
- other.syllable_lengths ().begin (),
41
- other.syllable_lengths ().end ()))));
91
+ bool Poet::CompareWeight (const Line& one, const Line& other) {
92
+ return one.weight < other.weight ;
93
+ }
94
+
95
+ // returns true if one is less than other.
96
+ bool Poet::LeftAssociateCompare (const Line& one, const Line& other) {
97
+ if (one.weight < other.weight ) return true ;
98
+ if (one.weight == other.weight ) {
99
+ auto one_word_lens = one.word_lengths ();
100
+ auto other_word_lens = other.word_lengths ();
101
+ // less words is more favorable
102
+ if (one_word_lens.size () > other_word_lens.size ()) return true ;
103
+ if (one_word_lens.size () == other_word_lens.size ()) {
104
+ return std::lexicographical_compare (
105
+ one_word_lens.begin (), one_word_lens.end (),
106
+ other_word_lens.begin (), other_word_lens.end ());
107
+ }
108
+ }
109
+ return false ;
42
110
}
43
111
44
- // keep the best sentence candidate per last phrase
45
- using SentenceCandidates = hash_map<string, of<Sentence> >;
112
+ // keep the best line candidate per last phrase
113
+ using LineCandidates = hash_map<string, Line >;
46
114
47
115
template <int N>
48
- static vector<of<Sentence> > find_top_candidates (
49
- const SentenceCandidates & candidates, Poet::Compare compare) {
50
- vector<of<Sentence> > top;
116
+ static vector<const Line* > find_top_candidates (
117
+ const LineCandidates & candidates, Poet::Compare compare) {
118
+ vector<const Line* > top;
51
119
top.reserve (N + 1 );
52
120
for (const auto & candidate : candidates) {
53
121
auto pos = std::upper_bound (
54
- top.begin (), top.end (), candidate.second ,
55
- [&](const an<Sentence>& a, const an<Sentence>& b) {
56
- return !compare (*a, *b); // desc
57
- });
122
+ top.begin (), top.end (), &candidate.second ,
123
+ [&](const Line* a, const Line* b) { return compare (*b, *a); }); // desc
58
124
if (pos - top.begin () >= N) continue ;
59
- top.insert (pos, candidate.second );
125
+ top.insert (pos, & candidate.second );
60
126
if (top.size () > N) top.pop_back ();
61
127
}
62
128
return top;
63
129
}
64
130
65
- static an<Sentence> find_best_sentence (const SentenceCandidates& candidates,
66
- Poet::Compare compare) {
67
- an<Sentence> best = nullptr ;
68
- for (const auto & candidate : candidates) {
69
- if (!best || compare (*best, *candidate.second )) {
70
- best = candidate.second ;
71
- }
72
- }
73
- return best;
74
- }
75
-
76
- using UpdateSetenceCandidate = function<void (const an<Sentence>& candidate)>;
131
+ using UpdateLineCandidate = function<void (const Line& candidate)>;
77
132
78
133
struct BeamSearch {
79
- using State = SentenceCandidates ;
134
+ using State = LineCandidates ;
80
135
81
- static constexpr int kMaxSentenceCandidates = 7 ;
136
+ static constexpr int kMaxLineCandidates = 7 ;
82
137
83
- static void Initiate (State& initial_state, const Language* language ) {
84
- initial_state.emplace (" " , New<Sentence>(language) );
138
+ static void Initiate (State& initial_state) {
139
+ initial_state.emplace (" " , Line:: kEmpty );
85
140
}
86
141
87
142
static void ForEachCandidate (const State& state,
88
143
Poet::Compare compare,
89
- UpdateSetenceCandidate update) {
144
+ UpdateLineCandidate update) {
90
145
auto top_candidates =
91
- find_top_candidates<kMaxSentenceCandidates >(state, compare);
92
- for (const auto & candidate : top_candidates) {
93
- update (candidate);
146
+ find_top_candidates<kMaxLineCandidates >(state, compare);
147
+ for (const auto * candidate : top_candidates) {
148
+ update (* candidate);
94
149
}
95
150
}
96
151
97
- static an<Sentence>& BestSentenceToUpdate (State& state,
98
- const an<Sentence>& new_sentence) {
99
- const auto & key = new_sentence->components ().back ().text ;
152
+ static Line& BestLineToUpdate (State& state, const Line& new_line) {
153
+ const auto & key = new_line.last_word ();
100
154
return state[key];
101
155
}
102
156
103
- static an<Sentence> BestSentence (const State& final_state,
104
- Poet::Compare compare) {
105
- return find_best_sentence (final_state, compare);
157
+ static const Line& BestLineInState (const State& final_state,
158
+ Poet::Compare compare) {
159
+ const Line* best = nullptr ;
160
+ for (const auto & candidate : final_state) {
161
+ if (!best || compare (*best, candidate.second )) {
162
+ best = &candidate.second ;
163
+ }
164
+ }
165
+ return best ? *best : Line::kEmpty ;
106
166
}
107
167
};
108
168
109
169
struct DynamicProgramming {
110
- using State = an<Sentence> ;
170
+ using State = Line ;
111
171
112
- static void Initiate (State& initial_state, const Language* language ) {
113
- initial_state = New<Sentence>(language) ;
172
+ static void Initiate (State& initial_state) {
173
+ initial_state = Line:: kEmpty ;
114
174
}
115
175
116
176
static void ForEachCandidate (const State& state,
117
177
Poet::Compare compare,
118
- UpdateSetenceCandidate update) {
178
+ UpdateLineCandidate update) {
119
179
update (state);
120
180
}
121
181
122
- static an<Sentence>& BestSentenceToUpdate (State& state,
123
- const an<Sentence>& new_sentence) {
182
+ static Line& BestLineToUpdate (State& state, const Line& new_line) {
124
183
return state;
125
184
}
126
185
127
- static an<Sentence> BestSentence (const State& final_state,
128
- Poet::Compare compare) {
186
+ static const Line& BestLineInState (const State& final_state,
187
+ Poet::Compare compare) {
129
188
return final_state;
130
189
}
131
190
};
@@ -134,47 +193,58 @@ template <class Strategy>
134
193
an<Sentence> Poet::MakeSentenceWithStrategy(const WordGraph& graph,
                                            size_t total_length,
                                            const string& preceding_text) {
  // Walks the word graph in order of start position, extending the lines
  // stored for each reachable position with every dictionary entry on every
  // outgoing edge, and keeping the best line(s) per end position as decided by
  // Strategy and compare_. The best line ending at |total_length| is then
  // converted to a Sentence. Returns nullptr when no line reaches
  // |total_length|.
  // NOTE(review): log message literals are reproduced exactly as found,
  // including their leading space; confirm that spacing is intended.
  map<int, typename Strategy::State> states;
  Strategy::Initiate(states[0]);
  for (const auto& sv : graph) {
    size_t start_pos = sv.first;
    // Skip start positions that no line has reached. A single lookup serves
    // both the existence check and the access to the state.
    const auto source = states.find(start_pos);
    if (source == states.end())
      continue;
    DLOG(INFO) << " start pos: " << start_pos;
    const auto& source_state = source->second;
    const auto update =
        [this, &states, &sv, start_pos, total_length, &preceding_text]
        (const Line& candidate) {
          // The context depends only on the candidate, so it is computed once
          // here instead of once per dictionary entry. Both operands of the
          // conditional are lvalues, so no string is copied.
          const string candidate_context = candidate.context();
          const string& context =
              candidate.empty() ? preceding_text : candidate_context;
          for (const auto& ev : sv.second) {
            size_t end_pos = ev.first;
            if (start_pos == 0 && end_pos == total_length)
              continue;  // exclude single word from the result
            DLOG(INFO) << " end pos: " << end_pos;
            bool is_rear = end_pos == total_length;
            auto& target_state = states[end_pos];
            // extend candidates with dict entries on a valid edge.
            const DictEntryList& entries = ev.second;
            for (const auto& entry : entries) {
              double weight = candidate.weight +
                              Grammar::Evaluate(context,
                                                entry->text,
                                                entry->weight,
                                                is_rear,
                                                grammar_.get());
              Line new_line{&candidate, entry.get(), end_pos, weight};
              Line& best = Strategy::BestLineToUpdate(target_state, new_line);
              if (best.empty() || compare_(best, new_line)) {
                DLOG(INFO) << " updated line ending at " << end_pos
                           << " with text: ..." << new_line.last_word()
                           << " weight: " << new_line.weight;
                best = new_line;
              }
            }
          }
        };
    Strategy::ForEachCandidate(source_state, compare_, update);
  }
  auto found = states.find(total_length);
  if (found == states.end() || found->second.empty())
    return nullptr;
  const Line& best = Strategy::BestLineInState(found->second, compare_);
  // Replay the chain of lines, first word first, into a new Sentence.
  auto sentence = New<Sentence>(language_);
  for (const auto* c : best.components()) {
    if (!c->entry) continue;
    sentence->Extend(*c->entry, c->end_pos, c->weight);
  }
  return sentence;
}
179
249
180
250
an<Sentence> Poet::MakeSentence (const WordGraph& graph,
0 commit comments