1313#include < rime/dict/vocabulary.h>
1414#include < rime/gear/charset_filter.h>
1515#include < boost/locale/encoding.hpp>
16+ #include < boost/algorithm/string.hpp>
17+
1618
1719namespace rime {
1820
@@ -44,11 +46,60 @@ bool contains_extended_cjk(const string& text)
4446 return false ;
4547}
4648
// Reports whether the code point `ch` falls inside one of the Unicode ranges
// that are let through when the "emoji" option is enabled.
//
// Despite the name, the accepted set is deliberately broad: besides the
// pictographic blocks it covers Basic Latin, Latin-1, punctuation, arrows,
// symbols and similar blocks, so that mixed symbol/emoji text passes.
// Variation selectors and the combining enclosing keycap are accepted as
// well, because they appear inside emoji sequences (e.g. U+2764 U+FE0F, or
// the keycap sequence digit + U+FE0F + U+20E3) and would otherwise make an
// otherwise valid emoji string fail a per-code-point check.
static bool is_emoji(uint32_t ch) {
  struct CodePointRange {
    uint32_t first;  // inclusive lower bound
    uint32_t last;   // inclusive upper bound
  };
  static const CodePointRange kAcceptedRanges[] = {
      {0x0000, 0x007F},    // C0 Controls and Basic Latin
      {0x0080, 0x00FF},    // C1 Controls and Latin-1 Supplement
      {0x02B0, 0x02FF},    // Spacing Modifier Letters
      {0x0900, 0x097F},    // Devanagari
      {0x2000, 0x203C},    // General Punctuation (up to U+203C)
      {0x20A0, 0x20CF},    // Currency Symbols
      {0x20E3, 0x20E3},    // Combining Enclosing Keycap
      {0x2100, 0x214F},    // Letterlike Symbols
      {0x2150, 0x218F},    // Number Forms
      {0x2190, 0x21FF},    // Arrows
      {0x2200, 0x22FF},    // Mathematical Operators
      {0x2300, 0x23FF},    // Miscellaneous Technical
      {0x2460, 0x24FF},    // Enclosed Alphanumerics
      {0x25A0, 0x25FF},    // Geometric Shapes
      {0x2600, 0x26FF},    // Miscellaneous Symbols
      {0x2700, 0x27BF},    // Dingbats
      {0x2900, 0x297F},    // Supplemental Arrows-B
      {0x2B00, 0x2BFF},    // Miscellaneous Symbols and Arrows
      {0x3000, 0x303F},    // CJK Symbols and Punctuation
      {0x3200, 0x32FF},    // Enclosed CJK Letters and Months
      {0xFE00, 0xFE0F},    // Variation Selectors
      {0x1F000, 0x1F02F},  // Mahjong Tiles
      {0x1F0A0, 0x1F0FF},  // Playing Cards
      {0x1F100, 0x1F1FF},  // Enclosed Alphanumeric Supplement
      {0x1F200, 0x1F2FF},  // Enclosed Ideographic Supplement
      {0x1F300, 0x1F5FF},  // Miscellaneous Symbols and Pictographs
      {0x1F600, 0x1F64F},  // Emoticons
      {0x1F680, 0x1F6FF},  // Transport and Map Symbols
      {0x1F900, 0x1F9FF},  // Supplemental Symbols and Pictographs
  };

  for (const CodePointRange& range : kAcceptedRanges) {
    if (ch >= range.first && ch <= range.last)
      return true;
  }
  return false;
}
83+
84+ static bool is_all_emoji (const string& text)
85+ {
86+ const char *p = text.c_str ();
87+ uint32_t ch;
88+
89+ while ((ch = utf8::unchecked::next (p)) != 0 ) {
90+ if (!is_emoji (ch)) {
91+ return false ;
92+ }
93+ }
94+
95+ return true ;
96+ }
97+
4798// CharsetFilterTranslation
4899
49100CharsetFilterTranslation::CharsetFilterTranslation (
50- an<Translation> translation, const string& charset )
51- : translation_(translation), charset_(charset ) {
101+ an<Translation> translation, const string& charset_with_parameter_ )
102+ : translation_(translation), charset_with_parameter_(charset_with_parameter_ ) {
52103 LocateNextCandidate ();
53104}
54105
@@ -69,7 +120,7 @@ an<Candidate> CharsetFilterTranslation::Peek() {
69120bool CharsetFilterTranslation::LocateNextCandidate () {
70121 while (!translation_->exhausted ()) {
71122 auto cand = translation_->Peek ();
72- if (cand && CharsetFilter::FilterText (cand->text (), charset_ ))
123+ if (cand && CharsetFilter::FilterText (cand->text (), charset_with_parameter_ ))
73124 return true ;
74125 translation_->Next ();
75126 }
@@ -79,9 +130,20 @@ bool CharsetFilterTranslation::LocateNextCandidate() {
79130
80131// CharsetFilter
81132
82- bool CharsetFilter::FilterText (const string& text, const string& charset) {
83- if (charset.empty ()) return !contains_extended_cjk (text);
133+ bool CharsetFilter::FilterText (const string& text, const string& charset_with_parameter) {
134+ if (charset_with_parameter.empty ()) return !contains_extended_cjk (text);
135+ vector<string> charset_arguments_vector;
136+ boost::split (charset_arguments_vector, charset_with_parameter, boost::is_any_of (" +" ));
137+ bool is_emoji_enabled = false ;
138+ if (std::find (charset_arguments_vector.begin (), charset_arguments_vector.end (), " emoji" ) != charset_arguments_vector.end ()) {
139+ is_emoji_enabled = true ;
140+ }
141+ if (is_emoji_enabled && is_all_emoji (text)) {
142+ return true ;
143+ }
144+
84145 try {
146+ const auto & charset = charset_arguments_vector[0 ];
85147 boost::locale::conv::from_utf (text, charset, boost::locale::conv::method_type::stop);
86148 }
87149 catch (boost::locale::conv::conversion_error const & /* ex*/ ) {
0 commit comments