Skip to content

Commit ad3638a

Browse files
nameoverflow authored and lotem committed
feat: spelling correction (#228)
* correction * sym delete search * reverts * edit distance * You know nothing Jon Snow * syllabify with correction * shit OOP * load corrector from dictionary * perform syllabifier with correction in script translator * comfort compiler * add test * more test cases * fix edge case * add test case for multiple edge * allow correction edges exist * BFS approximate NN search * fix a wild pointer * limit the correction candidates' showing up * change the order of SpellingType * only take normal spelling corrections * fix syllabifier * refactors on corrections * address style issues * chore(test/corrector_test): DISABLE_ non-passing test * chore(dict/corrector): move to src/rime/dict/
1 parent edf6a0b commit ad3638a

16 files changed

+863
-96
lines changed

src/rime/algo/syllabifier.cc

+49-8
Original file line number | Diff line number | Diff line change
@@ -7,10 +7,13 @@
77
//
88
#include <queue>
99
#include <boost/range/adaptor/reversed.hpp>
10-
#include <rime/dict/prism.h>
1110
#include <rime/algo/syllabifier.h>
11+
#include <rime/dict/corrector.h>
12+
#include <rime/dict/prism.h>
13+
#include "syllabifier.h"
1214

1315
namespace rime {
16+
using namespace corrector;
1417

1518
using Vertex = pair<size_t, SpellingType>;
1619
using VertexQueue = std::priority_queue<Vertex,
@@ -35,16 +38,36 @@ int Syllabifier::BuildSyllableGraph(const string &input,
3538
// record a visit to the vertex
3639
if (graph->vertices.find(current_pos) == graph->vertices.end())
3740
graph->vertices.insert(vertex); // preferred spelling type comes first
38-
else
41+
else {
42+
// graph->vertices[current_pos] = std::min(vertex.second, graph->vertices[current_pos]);
3943
continue; // discard worse spelling types
44+
}
4045

4146
if (current_pos > farthest)
4247
farthest = current_pos;
4348
DLOG(INFO) << "current_pos: " << current_pos;
4449

4550
// see where we can go by advancing a syllable
4651
vector<Prism::Match> matches;
47-
prism.CommonPrefixSearch(input.substr(current_pos), &matches);
52+
set<SyllableId> match_set;
53+
auto current_input = input.substr(current_pos);
54+
prism.CommonPrefixSearch(current_input, &matches);
55+
for (auto &m : matches) {
56+
match_set.insert(m.value);
57+
}
58+
if (enable_correction_) {
59+
Corrections corrections;
60+
corrector_->ToleranceSearch(prism, current_input, &corrections, 5);
61+
for (const auto &m : corrections) {
62+
for (auto accessor = prism.QuerySpelling(m.first); !accessor.exhausted(); accessor.Next()) {
63+
if (accessor.properties().type == kNormalSpelling) {
64+
matches.push_back({ m.first, m.second.length });
65+
break;
66+
}
67+
}
68+
}
69+
}
70+
4871
if (!matches.empty()) {
4972
auto& end_vertices(graph->edges[current_pos]);
5073
for (const auto& m : matches) {
@@ -56,15 +79,15 @@ int Syllabifier::BuildSyllableGraph(const string &input,
5679
++end_pos;
5780
DLOG(INFO) << "end_pos: " << end_pos;
5881
bool matches_input = (current_pos == 0 && end_pos == input.length());
59-
SpellingMap spellings;
82+
SpellingMap& spellings(end_vertices[end_pos]);
6083
SpellingType end_vertex_type = kInvalidSpelling;
6184
// when spelling algebra is enabled,
6285
// a spelling evaluates to a set of syllables;
6386
// otherwise, it resembles exactly the syllable itself.
6487
SpellingAccessor accessor(prism.QuerySpelling(m.value));
6588
while (!accessor.exhausted()) {
6689
SyllableId syllable_id = accessor.syllable_id();
67-
SpellingProperties props = accessor.properties();
90+
EdgeProperties props(accessor.properties());
6891
if (strict_spelling_ &&
6992
matches_input &&
7093
props.type != kNormalSpelling) {
@@ -74,20 +97,29 @@ int Syllabifier::BuildSyllableGraph(const string &input,
7497
props.end_pos = end_pos;
7598
// add a syllable with properties to the edge's
7699
// spelling-to-syllable map
77-
spellings.insert({syllable_id, props});
100+
if (match_set.find(m.value) == match_set.end()) {
101+
props.is_correction = true;
102+
props.credibility = 0.01;
103+
}
104+
auto it = spellings.find(syllable_id);
105+
if (it == spellings.end()) {
106+
spellings.insert({syllable_id, props});
107+
} else {
108+
it->second.type = std::min(it->second.type, props.type);
109+
}
78110
// let end_vertex_type be the best (smaller) type of spelling
79111
// that ends at the vertex
80-
if (end_vertex_type > props.type) {
112+
if (end_vertex_type > props.type && !props.is_correction) {
81113
end_vertex_type = props.type;
82114
}
83115
}
84116
accessor.Next();
85117
}
86118
if (spellings.empty()) {
87119
DLOG(INFO) << "not spelt.";
120+
end_vertices.erase(end_pos);
88121
continue;
89122
}
90-
end_vertices[end_pos].swap(spellings);
91123
// find the best common type in a path up to the end vertex
92124
// eg. pinyin "shurfa" has vertex type kNormalSpelling at position 3,
93125
// kAbbreviation at position 4 and kAbbreviation at position 6
@@ -121,6 +153,10 @@ int Syllabifier::BuildSyllableGraph(const string &input,
121153
// when there is a path of more favored type
122154
SpellingType edge_type = kInvalidSpelling;
123155
for (auto k = j->second.begin(); k != j->second.end(); ) {
156+
if (k->second.is_correction) {
157+
++k;
158+
continue; // Don't care correction edges
159+
}
124160
if (k->second.type > last_type) {
125161
j->second.erase(k++);
126162
}
@@ -245,4 +281,9 @@ void Syllabifier::Transpose(SyllableGraph* graph) {
245281
}
246282
}
247283

284+
// Turns on spelling correction for this syllabifier: sets enable_correction_
// and stores `corrector` (moved) in corrector_.
// NOTE(review): the flag is set unconditionally, and BuildSyllableGraph calls
// corrector_->ToleranceSearch(...) whenever the flag is set. No null check on
// `corrector` is visible here -- confirm callers never pass an empty pointer.
void Syllabifier::EnableCorrection(an<Corrector> corrector) {
285+
enable_correction_ = true;
286+
corrector_ = std::move(corrector);
287+
}
288+
248289
} // namespace rime

src/rime/algo/syllabifier.h

+12-2
Original file line number | Diff line number | Diff line change
@@ -15,15 +15,22 @@
1515
namespace rime {
1616

1717
class Prism;
18+
class Corrector;
1819

1920
using SyllableId = int32_t;
2021

21-
using SpellingMap = map<SyllableId, SpellingProperties>;
22+
// Properties stored per syllable on an edge of the syllable graph (the mapped
// type of SpellingMap): everything in SpellingProperties plus a flag that
// marks spellings produced by correction.
struct EdgeProperties : SpellingProperties {
23+
// Not declared `explicit`, so a SpellingProperties value converts implicitly.
// Copies the base part and leaves is_correction at its default (false).
EdgeProperties(SpellingProperties sup): SpellingProperties(sup) {};
24+
EdgeProperties() = default;
25+
// Set to true by Syllabifier::BuildSyllableGraph for a match whose value was
// not among the results of the prism's CommonPrefixSearch.
bool is_correction = false;
26+
};
27+
28+
using SpellingMap = map<SyllableId, EdgeProperties>;
2229
using VertexMap = map<size_t, SpellingType>;
2330
using EndVertexMap = map<size_t, SpellingMap>;
2431
using EdgeMap = map<size_t, EndVertexMap>;
2532

26-
using SpellingPropertiesList = vector<const SpellingProperties*>;
33+
using SpellingPropertiesList = vector<const EdgeProperties*>;
2734
using SpellingIndex = map<SyllableId, SpellingPropertiesList>;
2835
using SpellingIndices = map<size_t, SpellingIndex>;
2936

@@ -49,6 +56,7 @@ class Syllabifier {
4956
RIME_API int BuildSyllableGraph(const string &input,
5057
Prism &prism,
5158
SyllableGraph *graph);
59+
RIME_API void EnableCorrection(an<Corrector> corrector);
5260

5361
protected:
5462
void CheckOverlappedSpellings(SyllableGraph *graph,
@@ -58,6 +66,8 @@ class Syllabifier {
5866
string delimiters_;
5967
bool enable_completion_ = false;
6068
bool strict_spelling_ = false;
69+
an<Corrector> corrector_ = nullptr;
70+
bool enable_correction_ = false;
6171
};
6272

6373
} // namespace rime

src/rime/common.h

+2
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,7 @@
1818
#include <unordered_set>
1919
#include <utility>
2020
#include <vector>
21+
#include <boost/optional.hpp>
2122
#define BOOST_BIND_NO_PLACEHOLDERS
2223
#ifdef BOOST_SIGNALS2
2324
#include <boost/signals2/connection.hpp>
@@ -47,6 +48,7 @@ using std::pair;
4748
using std::set;
4849
using std::string;
4950
using std::vector;
51+
using boost::optional;
5052

5153
template <class Key, class T>
5254
using hash_map = std::unordered_map<Key, T>;

0 commit comments

Comments
 (0)