Skip to content

Commit 943c95b

Browse files
alswllotem
authored and committed
feat(charset_filter): support charset options with emoji (#293)
1 parent 6662a28 commit 943c95b

File tree

3 files changed

+115
-7
lines changed

3 files changed

+115
-7
lines changed

src/rime/gear/charset_filter.cc

+67-5
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
#include <rime/dict/vocabulary.h>
1414
#include <rime/gear/charset_filter.h>
1515
#include <boost/locale/encoding.hpp>
16+
#include <boost/algorithm/string.hpp>
17+
1618

1719
namespace rime {
1820

@@ -44,11 +46,60 @@ bool contains_extended_cjk(const string& text)
4446
return false;
4547
}
4648

49+
// Returns true when code point `ch` falls inside one of the Unicode blocks
// that the "+emoji" charset option lets through regardless of the target
// charset.
//
// Note that the predicate is deliberately broader than "pictographic emoji":
// it also covers Basic Latin, Latin-1, several punctuation/symbol blocks and
// Devanagari, as listed in the table below.
static bool is_emoji(uint32_t ch)
{
  // Inclusive [first, last] code point ranges, one entry per Unicode block.
  static const struct {
    uint32_t first;
    uint32_t last;
  } kEmojiRanges[] = {
    { 0x0000,  0x007F  },  // C0 Controls and Basic Latin
    { 0x0080,  0x00FF  },  // C1 Controls and Latin-1 Supplement
    { 0x02B0,  0x02FF  },  // Spacing Modifier Letters
    { 0x0900,  0x097F  },  // Devanagari
    { 0x2000,  0x203C  },  // General Punctuation (up to U+203C only)
    { 0x20A0,  0x20CF  },  // Currency Symbols
    { 0x2100,  0x214F  },  // Letterlike Symbols
    { 0x2150,  0x218F  },  // Number Forms
    { 0x2190,  0x21FF  },  // Arrows
    { 0x2200,  0x22FF  },  // Mathematical Operators
    { 0x2300,  0x23FF  },  // Miscellaneous Technical
    { 0x2460,  0x24FF  },  // Enclosed Alphanumerics
    { 0x25A0,  0x25FF  },  // Geometric Shapes
    { 0x2600,  0x26FF  },  // Miscellaneous Symbols
    { 0x2700,  0x27BF  },  // Dingbats
    { 0x2900,  0x297F  },  // Supplemental Arrows-B
    { 0x2B00,  0x2BFF  },  // Miscellaneous Symbols and Arrows
    { 0x3000,  0x303F  },  // CJK Symbols and Punctuation
    { 0x3200,  0x32FF  },  // Enclosed CJK Letters and Months
    { 0x1F100, 0x1F1FF },  // Enclosed Alphanumeric Supplement
    { 0x1F200, 0x1F2FF },  // Enclosed Ideographic Supplement
    { 0x1F000, 0x1F02F },  // Mahjong Tiles
    { 0x1F0A0, 0x1F0FF },  // Playing Cards
    { 0x1F300, 0x1F5FF },  // Miscellaneous Symbols and Pictographs
    { 0x1F600, 0x1F64F },  // Emoticons
    { 0x1F680, 0x1F6FF },  // Transport and Map Symbols
    { 0x1F900, 0x1F9FF },  // Supplemental Symbols and Pictographs
  };

  for (const auto& range : kEmojiRanges) {
    if (ch >= range.first && ch <= range.last)
      return true;
  }
  return false;
}
83+
84+
static bool is_all_emoji(const string& text)
85+
{
86+
const char *p = text.c_str();
87+
uint32_t ch;
88+
89+
while ((ch = utf8::unchecked::next(p)) != 0) {
90+
if (!is_emoji(ch)) {
91+
return false;
92+
}
93+
}
94+
95+
return true;
96+
}
97+
4798
// CharsetFilterTranslation
4899

49100
// Wraps `translation` so that only candidates accepted by
// CharsetFilter::FilterText() are exposed.
//
// `charset_with_parameter` is the charset specification handed to
// FilterText(): a charset name optionally followed by "+"-separated options
// (e.g. "gbk+emoji").
//
// The parameter intentionally has no trailing underscore so it cannot be
// confused with the member `charset_with_parameter_` it initialises.
CharsetFilterTranslation::CharsetFilterTranslation(
    an<Translation> translation, const string& charset_with_parameter)
    : translation_(translation),
      charset_with_parameter_(charset_with_parameter) {
  // Skip ahead to the first candidate that passes the filter.
  LocateNextCandidate();
}
54105

@@ -69,7 +120,7 @@ an<Candidate> CharsetFilterTranslation::Peek() {
69120
bool CharsetFilterTranslation::LocateNextCandidate() {
70121
while (!translation_->exhausted()) {
71122
auto cand = translation_->Peek();
72-
if (cand && CharsetFilter::FilterText(cand->text(), charset_))
123+
if (cand && CharsetFilter::FilterText(cand->text(), charset_with_parameter_))
73124
return true;
74125
translation_->Next();
75126
}
@@ -79,9 +130,20 @@ bool CharsetFilterTranslation::LocateNextCandidate() {
79130

80131
// CharsetFilter
81132

82-
bool CharsetFilter::FilterText(const string& text, const string& charset) {
83-
if (charset.empty()) return !contains_extended_cjk(text);
133+
bool CharsetFilter::FilterText(const string& text, const string& charset_with_parameter) {
134+
if (charset_with_parameter.empty()) return !contains_extended_cjk(text);
135+
vector<string> charset_arguments_vector;
136+
boost::split(charset_arguments_vector, charset_with_parameter, boost::is_any_of("+"));
137+
bool is_emoji_enabled = false;
138+
if (std::find(charset_arguments_vector.begin(), charset_arguments_vector.end(), "emoji") != charset_arguments_vector.end()) {
139+
is_emoji_enabled = true;
140+
}
141+
if (is_emoji_enabled && is_all_emoji(text)) {
142+
return true;
143+
}
144+
84145
try {
146+
const auto& charset = charset_arguments_vector[0];
85147
boost::locale::conv::from_utf(text, charset, boost::locale::conv::method_type::stop);
86148
}
87149
catch(boost::locale::conv::conversion_error const& /*ex*/) {

src/rime/gear/charset_filter.h

+3-2
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#ifndef RIME_CHARSET_FILTER_H_
88
#define RIME_CHARSET_FILTER_H_
99

10+
#include <rime_api.h>
1011
#include <rime/filter.h>
1112
#include <rime/translation.h>
1213
#include <rime/gear/filter_commons.h>
@@ -23,7 +24,7 @@ class CharsetFilterTranslation : public Translation {
2324
bool LocateNextCandidate();
2425

2526
an<Translation> translation_;
26-
string charset_;
27+
string charset_with_parameter_;
2728
};
2829

2930
struct DictEntry;
@@ -40,7 +41,7 @@ class CharsetFilter : public Filter, TagMatching {
4041
}
4142

4243
// return true to accept, false to reject the tested item
43-
static bool FilterText(const string& text, const string& charset = "");
44+
RIME_API static bool FilterText(const string& text, const string& charset_with_argument = "");
4445
static bool FilterDictEntry(an<DictEntry> entry);
4546
};
4647

test/charset_filter_test.cc

+45
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
//
2+
// Copyright RIME Developers
3+
// Distributed under the BSD License
4+
//
5+
// 2012-01-17 GONG Chen <chen.sst@gmail.com>
6+
//
7+
#include <gtest/gtest.h>
8+
#include <rime/common.h>
9+
#include <rime/gear/charset_filter.h>
10+
#include <rime/translation.h>
11+
12+
using namespace rime;
13+
14+
15+
// Exercises CharsetFilter::FilterText() against an unrecognised charset
// name, "utf8", "gbk", "gb2312", and the "+emoji" option.
//
// NOTE(review): several literals below are empty strings, and the "gb2312"
// group expects both TRUE and FALSE for the same ("", "gb2312") input, which
// cannot both hold. Presumably non-ASCII characters were lost from these
// literals -- confirm against the repository copy of this test.
TEST(RimeCharsetFilterTest, FilterText) {
16+
// Charset name that is not a real encoding: both inputs are expected to pass.
EXPECT_TRUE(CharsetFilter::FilterText("", "unkown"));
17+
EXPECT_TRUE(CharsetFilter::FilterText("👋", "unkown"));
18+
19+
// "utf8": every input is expected to pass.
EXPECT_TRUE(CharsetFilter::FilterText("Hello", "utf8"));
20+
EXPECT_TRUE(CharsetFilter::FilterText("", "utf8"));
21+
EXPECT_TRUE(CharsetFilter::FilterText("", "utf8"));
22+
EXPECT_TRUE(CharsetFilter::FilterText("𤘺", "utf8"));
23+
EXPECT_TRUE(CharsetFilter::FilterText("👋", "utf8"));
24+
EXPECT_TRUE(CharsetFilter::FilterText("荣👋", "utf8"));
25+
26+
// "gbk" without options: text containing "𤘺" or an emoji is expected to be rejected.
EXPECT_TRUE(CharsetFilter::FilterText("Hello", "gbk"));
27+
EXPECT_TRUE(CharsetFilter::FilterText("", "gbk"));
28+
EXPECT_TRUE(CharsetFilter::FilterText("", "gbk"));
29+
EXPECT_FALSE(CharsetFilter::FilterText("𤘺", "gbk"));
30+
EXPECT_FALSE(CharsetFilter::FilterText("👋", "gbk"));
31+
EXPECT_FALSE(CharsetFilter::FilterText("荣👋", "gbk"));
32+
33+
// "gb2312" without options.
EXPECT_TRUE(CharsetFilter::FilterText("Hello", "gb2312"));
34+
// NOTE(review): the next two expectations contradict each other as written (see header note).
EXPECT_TRUE(CharsetFilter::FilterText("", "gb2312"));
35+
EXPECT_FALSE(CharsetFilter::FilterText("", "gb2312"));
36+
EXPECT_FALSE(CharsetFilter::FilterText("𤘺", "gb2312"));
37+
EXPECT_FALSE(CharsetFilter::FilterText("👋", "gb2312"));
38+
39+
// "+emoji": an emoji-only string is expected to pass, while a string mixing
// "荣" with an emoji is still expected to be rejected.
EXPECT_TRUE(CharsetFilter::FilterText("👋", "gbk+emoji"));
40+
EXPECT_FALSE(CharsetFilter::FilterText("荣👋", "gbk+emoji"));
41+
42+
EXPECT_TRUE(CharsetFilter::FilterText("👋", "gb2312+emoji"));
43+
EXPECT_FALSE(CharsetFilter::FilterText("荣👋", "gb2312+emoji"));
44+
}
45+

0 commit comments

Comments
 (0)