forked from ydinkov/donders2017
-
Notifications
You must be signed in to change notification settings - Fork 0
/
wordFilter.py
69 lines (59 loc) · 2.46 KB
/
wordFilter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
"""Word filter that checks whether each word of the user input appears among
the first n most commonly used words of a given word count list or in the
story text.
"""
import pandas as pd
import re
# Default word count list (space separated, the word is in the first column).
textfile = "Resources/it_50k.txt"
# Default story narrative; every word in it is accepted by the filter.
storytext = "Resources/casino murder Italian.txt"


class WordFilter:
    """Check user input against a list of allowed words.

    The allowed words are the first ``n_words`` entries of a word count
    list plus every word that occurs in the story narrative.

    Parameters
    ----------
    n_words : int
        Include first n_words from word list in the filter
    wordcount : str
        Textfile with word count (space separated, word in first column)
    story : str
        Textfile with the story narrative whose words are added to the
        filter. Defaults to the module-level ``storytext``.

    Attributes
    ----------
    wordlist : pandas.Series
        De-duplicated allowed words (frequency list first, then story).
    student_words : list of str or None
        Whitespace-split tokens of the last input given to ``filter_text``;
        None until ``filter_text`` has been called.
    """

    # Matches every character that is NOT an ASCII letter or digit, one of
    # the listed accented letters, or a space. Compiled once and shared by
    # the story reader and the user-input check so both clean text alike.
    _disallowed = re.compile(
        u'[^a-zA-Z0-9áéíóúÁÉÍÓÚàèìòùÀÈÌÒÙâêîôÂÊÎÔãõÃÕçÇ ]')

    def __init__(self, n_words=10000, wordcount=textfile, story=storytext):
        # ``squeeze=True`` was removed from ``read_csv`` in pandas 2.0, so
        # the single column is selected explicitly instead.
        # ``dtype=str`` with ``keep_default_na=False`` keeps words such as
        # "null" or "nan" as text rather than turning them into NaN.
        frequency = pd.read_csv(wordcount, sep=" ", header=None,
                                encoding="ISO-8859-1", nrows=n_words,
                                usecols=[0], dtype=str,
                                keep_default_na=False)
        wordlist = frequency.iloc[:, 0]

        # Read in the text narrative. Words are collected in a plain list
        # and concatenated once, instead of growing the Series line by line.
        story_words = []
        with open(story, 'r', encoding="ISO-8859-1") as narrative:
            for line in narrative:
                # Cleaning removes newlines and tabs too, so only spaces
                # are left as separators; blank lines contribute nothing.
                story_words.extend(self._disallowed.sub('', line).split())

        if story_words:
            wordlist = pd.concat(
                [wordlist, pd.Series(story_words, dtype=object)],
                axis=0, ignore_index=True)
        self.wordlist = wordlist.drop_duplicates()
        self.student_words = None

    def filter_text(self, user_input):
        """Return the first word of the input that is not in the word list.

        Each whitespace-separated token is stripped of disallowed
        characters and compared case-insensitively, as a whole word,
        against ``self.wordlist``. Tokens that are empty after stripping
        are skipped. The raw tokens are stored in ``self.student_words``.

        Parameters
        ----------
        user_input : str
            String of words to check for presence in word list

        Returns
        -------
        None or str
            None if all user input was present in the word list, otherwise
            the first (cleaned) word that is not present in the list.
        """
        self.student_words = user_input.split()
        # Built from the current ``wordlist`` on every call so that changes
        # made to the attribute by callers are honoured. A set gives exact
        # whole-word lookups; a substring search would accept any fragment
        # of a listed word.
        known = {str(entry).lower() for entry in self.wordlist.dropna()}
        for token in self.student_words:
            word = self._disallowed.sub('', token)
            if word and word.lower() not in known:
                return word
        return None
if __name__ == "__main__":
    # Manual smoke run: build the filter from the default resource files,
    # print the first unknown word of a sample sentence (or None), then
    # print the complete word list.
    word_filter = WordFilter()
    print(word_filter.filter_text("Che la appeltaart"))
    print(word_filter.wordlist)