13
13
#include < rime/dict/vocabulary.h>
14
14
#include < rime/gear/charset_filter.h>
15
15
#include < boost/locale/encoding.hpp>
16
+ #include < boost/algorithm/string.hpp>
17
+
16
18
17
19
namespace rime {
18
20
@@ -44,11 +46,60 @@ bool contains_extended_cjk(const string& text)
44
46
return false ;
45
47
}
46
48
49
// Reports whether the code point `ch` lies in one of the Unicode blocks that
// this filter accepts as part of an emoji string.
//
// The table is deliberately broader than "pictographs only": it also admits
// Basic Latin, punctuation and symbol blocks, because emoji strings are
// frequently built from such characters (digits in keycap sequences, the
// zero-width joiner U+200D, arrows, dingbats, ...).
//
// Returns true when `ch` falls inside any listed range, false otherwise.
static bool is_emoji(uint32_t ch)
{
  struct CodePointRange {
    uint32_t first;  // inclusive lower bound
    uint32_t last;   // inclusive upper bound
  };

  static const CodePointRange kEmojiRanges[] = {
    {0x0000, 0x00FF},    // C0/C1 Controls, Basic Latin, Latin-1 Supplement
    {0x02B0, 0x02FF},    // Spacing Modifier Letters
    {0x0900, 0x097F},    // Devanagari
    {0x2000, 0x203C},    // General Punctuation (includes ZWJ U+200D)
    {0x20A0, 0x20CF},    // Currency Symbols
    {0x20D0, 0x20FF},    // Combining Diacritical Marks for Symbols
                         // (U+20E3 COMBINING ENCLOSING KEYCAP)
    {0x2100, 0x214F},    // Letterlike Symbols
    {0x2150, 0x218F},    // Number Forms
    {0x2190, 0x21FF},    // Arrows
    {0x2200, 0x22FF},    // Mathematical Operators
    {0x2300, 0x23FF},    // Miscellaneous Technical
    {0x2460, 0x24FF},    // Enclosed Alphanumerics
    {0x25A0, 0x25FF},    // Geometric Shapes
    {0x2600, 0x26FF},    // Miscellaneous Symbols
    {0x2700, 0x27BF},    // Dingbats
    {0x2900, 0x297F},    // Supplemental Arrows-B
    {0x2B00, 0x2BFF},    // Miscellaneous Symbols and Arrows
    {0x3000, 0x303F},    // CJK Symbols and Punctuation
    {0x3200, 0x32FF},    // Enclosed CJK Letters and Months
    {0xFE00, 0xFE0F},    // Variation Selectors (U+FE0F selects emoji
                         // presentation, e.g. U+2764 U+FE0F)
    {0x1F000, 0x1F02F},  // Mahjong Tiles
    {0x1F0A0, 0x1F0FF},  // Playing Cards
    {0x1F100, 0x1F1FF},  // Enclosed Alphanumeric Supplement
    {0x1F200, 0x1F2FF},  // Enclosed Ideographic Supplement
    {0x1F300, 0x1F5FF},  // Miscellaneous Symbols and Pictographs
    {0x1F600, 0x1F64F},  // Emoticons
    {0x1F680, 0x1F6FF},  // Transport and Map Symbols
    {0x1F900, 0x1F9FF},  // Supplemental Symbols and Pictographs
    {0x1FA70, 0x1FAFF},  // Symbols and Pictographs Extended-A
  };

  for (const CodePointRange& range : kEmojiRanges) {
    if (ch >= range.first && ch <= range.last)
      return true;
  }
  return false;
}
83
+
84
+ static bool is_all_emoji (const string& text)
85
+ {
86
+ const char *p = text.c_str ();
87
+ uint32_t ch;
88
+
89
+ while ((ch = utf8::unchecked::next (p)) != 0 ) {
90
+ if (!is_emoji (ch)) {
91
+ return false ;
92
+ }
93
+ }
94
+
95
+ return true ;
96
+ }
97
+
47
98
// CharsetFilterTranslation
48
99
49
100
// Constructs a filtering view over `translation`.
//
// `charset_with_parameter` is the charset specification handed to
// CharsetFilter::FilterText() for each candidate; it is copied into the
// member `charset_with_parameter_`. The parameter intentionally has no
// trailing underscore so that it cannot be confused with that member.
//
// The constructor finishes by calling LocateNextCandidate().
CharsetFilterTranslation::CharsetFilterTranslation(
    an<Translation> translation, const string& charset_with_parameter)
    : translation_(translation),
      charset_with_parameter_(charset_with_parameter) {
  LocateNextCandidate();
}
54
105
@@ -69,7 +120,7 @@ an<Candidate> CharsetFilterTranslation::Peek() {
69
120
bool CharsetFilterTranslation::LocateNextCandidate () {
70
121
while (!translation_->exhausted ()) {
71
122
auto cand = translation_->Peek ();
72
- if (cand && CharsetFilter::FilterText (cand->text (), charset_ ))
123
+ if (cand && CharsetFilter::FilterText (cand->text (), charset_with_parameter_ ))
73
124
return true ;
74
125
translation_->Next ();
75
126
}
@@ -79,9 +130,20 @@ bool CharsetFilterTranslation::LocateNextCandidate() {
79
130
80
131
// CharsetFilter
81
132
82
- bool CharsetFilter::FilterText (const string& text, const string& charset) {
83
- if (charset.empty ()) return !contains_extended_cjk (text);
133
+ bool CharsetFilter::FilterText (const string& text, const string& charset_with_parameter) {
134
+ if (charset_with_parameter.empty ()) return !contains_extended_cjk (text);
135
+ vector<string> charset_arguments_vector;
136
+ boost::split (charset_arguments_vector, charset_with_parameter, boost::is_any_of (" +" ));
137
+ bool is_emoji_enabled = false ;
138
+ if (std::find (charset_arguments_vector.begin (), charset_arguments_vector.end (), " emoji" ) != charset_arguments_vector.end ()) {
139
+ is_emoji_enabled = true ;
140
+ }
141
+ if (is_emoji_enabled && is_all_emoji (text)) {
142
+ return true ;
143
+ }
144
+
84
145
try {
146
+ const auto & charset = charset_arguments_vector[0 ];
85
147
boost::locale::conv::from_utf (text, charset, boost::locale::conv::method_type::stop);
86
148
}
87
149
catch (boost::locale::conv::conversion_error const & /* ex*/ ) {
0 commit comments