From f28bf7cf158fcf35f19cdf5c3143e0905c5a3c05 Mon Sep 17 00:00:00 2001 From: Thomas Proisl Date: Tue, 2 Jul 2019 13:40:14 +0200 Subject: [PATCH] Unicode flags are single tokens --- someweta/tagger.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/someweta/tagger.py b/someweta/tagger.py index ab6e33c..75acae2 100644 --- a/someweta/tagger.py +++ b/someweta/tagger.py @@ -66,7 +66,7 @@ def __init__(self, beam_size=5, iterations=10, lexicon=None, mapping=None, brown r"|".join([re.escape(_) for _ in emoticon_list]) + r"$", re.VERBOSE) # Unicode emoticons and other symbols - self.unicode_flags = re.compile(r"\p{Regional_Indicator}{2}") + self.unicode_flags = re.compile(r"^\p{Regional_Indicator}{2}$") # self.emoji = re.compile(r"^[\u2600-\u27BF\uFE0E\uFE0F\U0001F300-\U0001f64f\U0001F680-\U0001F6FF\U0001F900-\U0001F9FF]$") self.emoji = re.compile(r"[\p{Extended_Pictographic}\p{Emoji_Presentation}\uFE0F]")