Skip to content

Commit

Permalink
adds emoji and smiley support, some refactoring, adds prioritized method calls support
Browse files Browse the repository at this point in the history
  • Loading branch information
Said committed Jan 27, 2016
1 parent 490f864 commit c0e78d4
Show file tree
Hide file tree
Showing 6 changed files with 72 additions and 22 deletions.
24 changes: 20 additions & 4 deletions preprocessor/constants.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,30 @@
# -*- coding: utf-8 -*-
"""
preprocessor.constants
~~~~~~~~~~~~
This module includes the constant variables used in Preprocessor
"""
import re

# Name prefix of the cleaning methods; handed to Util.get_worker_methods so
# that every ``preprocess_*`` method can be discovered by name.
PREPROCESS_METHODS_PREFIX = 'preprocess_'
# Name prefix of the parsing methods (``parse_*``), used the same way.
PARSE_METHODS_PREFIX = 'parse_'
# Method suffixes (i.e. names without the prefix) that
# Util.get_worker_methods moves to the front of the discovered list.
PRIORITISED_METHODS = ['urls']

class Patterns:
    """Pre-compiled regular expressions for the entities found in tweets.

    Every attribute is a compiled pattern object, so callers can invoke
    ``Patterns.X.sub(...)`` directly or pass it to ``re.finditer``.
    """

    # The ASCII part is written as raw literals and the typographic quotes
    # come from a ``u''`` literal.  The concatenation yields exactly the
    # pattern text of a single ``ur''`` literal, but is also valid syntax on
    # Python 3 (where the ``ur`` prefix no longer exists).
    URL_PATTERN = re.compile(
        r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)'
        r'(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+'
        r'(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|'
        r'[^\s`!()\[\]{};:\'".,<>?\xab\xbb'
        u'\u201c\u201d\u2018\u2019'
        r']))'
    )
    # NOTE: ``\w*`` also accepts a bare ``#`` / ``@`` with nothing after it.
    HASHTAG_PATTERN = re.compile(r'#\w*')
    MENTION_PATTERN = re.compile(r'@\w*')
    # Only matches at the very start of the string.
    RESERVED_WORDS_PATTERN = re.compile(r'^(RT|FAV)')

    try:
        # UCS-4: code points above U+FFFF are single characters, so plain
        # character ranges work.
        EMOJIS_PATTERN = re.compile(u'([\U00002600-\U000027BF])|([\U0001f300-\U0001f64F])|([\U0001f680-\U0001f6FF])')
    except re.error:
        # UCS-2: fall back to matching the same blocks as surrogate pairs
        # (presumably the ranges above are rejected on narrow builds).
        EMOJIS_PATTERN = re.compile(u'([\u2600-\u27BF])|([\uD83C][\uDF00-\uDFFF])|([\uD83D][\uDC00-\uDE4F])|'
                                    u'([\uD83D][\uDE80-\uDEFF])')

    # Eyes (: ; =), optional nose (-), then one or more mouths ( ) ( D P S ).
    SMILEYS_PATTERN = re.compile(r"(?::|;|=)(?:-)?(?:\)|\(|D|P|S){1,}")

class Functions:
CLEAN=1
Expand Down
20 changes: 14 additions & 6 deletions preprocessor/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,13 @@
"""

import re
from .constants import Patterns
from .utils import Util
from .constants import *

class ParseResult:
urls = None
emojis = None
smileys = None
hashtags = None
mentions = None
reserved_words = None
Expand All @@ -37,11 +39,11 @@ def __init__(self):
def parse(self, tweet_string):
parse_result_obj = ParseResult()

parser_methods = self.u.get_worker_methods(self, 'parse_')
parser_methods = self.u.get_worker_methods(self, PARSE_METHODS_PREFIX)

for a_cleaner_method in parser_methods:
method_to_call = getattr(self, a_cleaner_method)
attr = a_cleaner_method.split('_')[1]
for a_parser_method in parser_methods:
method_to_call = getattr(self, a_parser_method)
attr = a_parser_method.split('_')[1]

items = method_to_call(tweet_string)
setattr(parse_result_obj, attr, items)
Expand All @@ -53,7 +55,7 @@ def parser(self, pattern, string):
items = []

for match_object in re.finditer(pattern, string):
parse_item = ParseItem(match_object.start(), match_object.end(), match_object.group())
parse_item = ParseItem(match_object.start(), match_object.end(), match_object.group().encode('utf-8'))
items.append(parse_item)

if len(items):
Expand All @@ -71,3 +73,9 @@ def parse_mentions(self, tweet_string):
def parse_reserved_words(self, tweet_string):
    """Run the shared ``parser`` over ``tweet_string`` with the reserved-words pattern."""
    reserved_regex = Patterns.RESERVED_WORDS_PATTERN
    found = self.parser(reserved_regex, tweet_string)
    return found

def parse_emojis(self, tweet_string):
    """Run the shared ``parser`` over ``tweet_string`` with the emoji pattern.

    The emoji pattern is a unicode pattern, so byte input is decoded as
    UTF-8 before matching.  Text that is already unicode is passed through
    untouched: decoding it again fails (``AttributeError`` on Python 3, and
    on Python 2 a ``UnicodeEncodeError`` from the implicit ASCII encode as
    soon as the text contains a non-ASCII character such as an emoji).

    :param tweet_string: the tweet, as UTF-8 bytes or as unicode text.
    :return: whatever ``self.parser`` returns for ``Patterns.EMOJIS_PATTERN``.
    """
    if isinstance(tweet_string, bytes):
        tweet_string = tweet_string.decode('utf-8')
    return self.parser(Patterns.EMOJIS_PATTERN, tweet_string)

def parse_smileys(self, tweet_string):
    """Run the shared ``parser`` over ``tweet_string`` with the smiley pattern."""
    smiley_regex = Patterns.SMILEYS_PATTERN
    found = self.parser(smiley_regex, tweet_string)
    return found
19 changes: 13 additions & 6 deletions preprocessor/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
"""

import re
from .constants import Patterns, Functions
from .constants import *
from .utils import Util

class Preprocess:
Expand All @@ -21,7 +21,7 @@ def __init__(self):

def clean(self, tweet_string, repl):

cleaner_methods = self.u.get_worker_methods(self, 'preprocess_')
cleaner_methods = self.u.get_worker_methods(self, PREPROCESS_METHODS_PREFIX)

for a_cleaner_method in cleaner_methods:
token = self.get_token_string_from_method_name(a_cleaner_method)
Expand All @@ -36,16 +36,23 @@ def clean(self, tweet_string, repl):
return tweet_string

def preprocess_urls(self, tweet_string, repl):
    """Return ``tweet_string`` with every URL match replaced by ``repl``.

    The substitution is delegated to the pre-compiled
    ``Patterns.URL_PATTERN`` object's own ``sub`` method.

    :param tweet_string: text to clean.
    :param repl: replacement passed straight to ``sub``.
    """
    return Patterns.URL_PATTERN.sub(repl, tweet_string)

def preprocess_hashtags(self, tweet_string, repl):
    """Return ``tweet_string`` with every hashtag match replaced by ``repl``.

    The substitution is delegated to the pre-compiled
    ``Patterns.HASHTAG_PATTERN`` object's own ``sub`` method.

    :param tweet_string: text to clean.
    :param repl: replacement passed straight to ``sub``.
    """
    return Patterns.HASHTAG_PATTERN.sub(repl, tweet_string)

def preprocess_mentions(self, tweet_string, repl):
    """Return ``tweet_string`` with every mention match replaced by ``repl``.

    The substitution is delegated to the pre-compiled
    ``Patterns.MENTION_PATTERN`` object's own ``sub`` method.

    :param tweet_string: text to clean.
    :param repl: replacement passed straight to ``sub``.
    """
    return Patterns.MENTION_PATTERN.sub(repl, tweet_string)

def preprocess_reserved_words(self, tweet_string, repl):
    """Return ``tweet_string`` with a leading reserved word replaced by ``repl``.

    The substitution is delegated to the pre-compiled
    ``Patterns.RESERVED_WORDS_PATTERN`` object's own ``sub`` method.

    :param tweet_string: text to clean.
    :param repl: replacement passed straight to ``sub``.
    """
    return Patterns.RESERVED_WORDS_PATTERN.sub(repl, tweet_string)

def preprocess_emojis(self, tweet_string, repl):
    """Return ``tweet_string`` with every emoji match replaced by ``repl``.

    The emoji pattern is a unicode pattern, so byte input is decoded as
    UTF-8 before substitution.  Text that is already unicode is used as-is:
    decoding it again fails (``AttributeError`` on Python 3, and on Python 2
    a ``UnicodeEncodeError`` from the implicit ASCII encode as soon as the
    text contains a non-ASCII character such as an emoji).

    :param tweet_string: the tweet, as UTF-8 bytes or as unicode text.
    :param repl: replacement passed straight to ``sub``.
    :return: the substituted text, as unicode.
    """
    if isinstance(tweet_string, bytes):
        tweet_string = tweet_string.decode('utf-8')
    return Patterns.EMOJIS_PATTERN.sub(repl, tweet_string)

def preprocess_smileys(self, tweet_string, repl):
    """Swap each smiley matched in ``tweet_string`` for ``repl``."""
    smiley_regex = Patterns.SMILEYS_PATTERN
    cleaned = smiley_regex.sub(repl, tweet_string)
    return cleaned

def remove_unneccessary_characters(self, tweet_string):
    """Collapse every run of whitespace into one space and trim both ends."""
    # split() with no argument breaks on any whitespace run and drops
    # leading/trailing blanks, so re-joining normalises the spacing.
    words = tweet_string.split()
    single_spaced = ' '.join(words)
    return single_spaced
Expand Down
9 changes: 9 additions & 0 deletions preprocessor/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
This module includes utility methods which are used in Preprocessor
"""

from .constants import PRIORITISED_METHODS

class Util:

def __init__(self):
Expand All @@ -12,4 +14,11 @@ def __init__(self):
def get_worker_methods(self, object, prefix, prioritised=None):
    """List the attribute names of ``object`` that start with ``prefix``.

    Names come back in ``dir()`` order, except that prioritised ones are
    moved to the front, in the order given.

    :param object: instance whose attributes are inspected (the parameter
        name shadows the builtin but is kept for caller compatibility).
    :param prefix: name prefix that marks a worker method, e.g. ``'parse_'``.
    :param prioritised: optional iterable of method suffixes (names without
        the prefix) to run first; defaults to ``PRIORITISED_METHODS``.
    :return: a new list of matching attribute names.
    """
    if prioritised is None:
        prioritised = PRIORITISED_METHODS

    # A real list (not a lazy ``filter`` object) so the result can be
    # reordered and returned on both Python 2 and Python 3.
    relevant_methods = [name for name in dir(object) if name.startswith(prefix)]
    available = set(relevant_methods)

    # Promote only the prioritised methods the object actually has; a missing
    # one is skipped instead of raising ValueError.
    promoted = []
    for suffix in prioritised:
        candidate = prefix + suffix
        if candidate in available and candidate not in promoted:
            promoted.append(candidate)

    promoted_names = set(promoted)
    remaining = [name for name in relevant_methods if name not in promoted_names]
    return promoted + remaining
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

setup(
name='tweet-preprocessor',
version='0.2.0',
version='0.3.0',
description='Elegant tweet preprocessing',
long_description=long_description,
author='Said Özcan',
Expand Down
20 changes: 15 additions & 5 deletions tests/test_api.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
# -*- coding: utf-8 -*-

import io
import unittest

Expand All @@ -6,17 +8,17 @@
class PreprocessorTest(unittest.TestCase):

def test_clean(self):
    # The mention, hashtag, emoji and URL are all expected to be stripped;
    # the emoji sat between "awesome" and the full stop, hence "awesome .".
    tweet = "Hello there! @pyistanbul #packathon was awesome 😀. http://packathon.org"
    cleaned_tweet = p.clean(tweet)
    self.assertEqual(cleaned_tweet, 'Hello there! was awesome .')

def test_tokenize(self):
    # Each recognised entity (hashtag, smiley, emoji, mention, URL) is
    # expected to be swapped for its ``$NAME$`` placeholder token.
    tweet = 'Packathon was a really #nice :) challenging 👌. @packathonorg http://packathon.org'
    tokenized_tweet = p.tokenize(tweet)
    self.assertEqual(tokenized_tweet, 'Packathon was a really $HASHTAG$ $SMILEY$ challenging $EMOJI$. $MENTION$ $URL$')

def test_parse(self):
tweet = 'A tweet with #hashtag @mention and http://github.com/s.'
tweet = 'A tweet with #hashtag :) @mention 😀 and http://github.com/s.'
parsed_tweet = p.parse(tweet)

self.assertIsNotNone(parsed_tweet.urls)
Expand All @@ -29,6 +31,14 @@ def test_parse(self):
self.assertEqual(1, len(parsed_tweet.mentions))

self.assertIsNone(parsed_tweet.reserved_words)

self.assertIsNotNone(parsed_tweet.emojis)
self.assertEqual(1, len(parsed_tweet.emojis))
self.assertEqual("😀", parsed_tweet.emojis[0].match)

self.assertIsNotNone(parsed_tweet.smileys)
self.assertEqual(1, len(parsed_tweet.smileys))
self.assertEqual(":)", parsed_tweet.smileys[0].match)

if __name__ == '__main__':
unittest.main()

0 comments on commit c0e78d4

Please sign in to comment.