-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathreference.py
93 lines (85 loc) · 2.88 KB
/
reference.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import re
from nltk.corpus import cmudict
# CMU Pronouncing Dictionary lookup table, built once when the module is
# imported. getWordSyllabel indexes it with the lower-cased word and reads the
# pronunciations stored under that key.
# NOTE(review): presumably requires the NLTK "cmudict" corpus to be installed
# locally before this module can be imported -- confirm in deployment docs.
d = cmudict.dict()
def clean_str(self, string):
    """
    Normalise raw text into lower-cased paragraphs, one paragraph per line.

    Adapted from the string cleaning (for all datasets except SST) in
    https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Consecutive non-empty lines are merged into one paragraph; an empty line
    closes the paragraph. Lines made up only of digits/whitespace are ignored
    and lines with a single word are dropped.

    :param self: not used by this function
    :param string: raw text, possibly spanning several lines
    :return: cleaned, lower-cased text with paragraphs separated by newlines
    """
    # (pattern, replacement) pairs; they are applied strictly in this order.
    rewrites = (
        # anything outside the allowed character set becomes a space
        (r"[^A-Za-z0-9(),!.?\'\`\r\n]", " "),
        # detach common English contractions from the preceding word
        (r"\'s", " \'s"),
        (r"\'ve", " \'ve"),
        (r"n\'t", " n\'t"),
        (r"\'re", " \'re"),
        (r"\'d", " \'d"),
        (r"\'ll", " \'ll"),
        # punctuation that is blanked out
        (r",", " "),
        (r"!", " "),
        (r"\(", " "),
        (r"\)", " "),
        (r"\?", " "),
        # any whitespace other than CR/LF becomes a plain space
        (r"[^\S\r\n]", " "),
        # stand-alone numbers are removed
        (r'\b[0-9]+\b', ''),
    )
    text = string
    for pattern, replacement in rewrites:
        text = re.sub(pattern, replacement, text)

    paragraphs = []
    pending = []  # kept lines of the paragraph currently being assembled
    for line in text.splitlines():
        if re.match(r"^(\d|\s)+$", line):
            # only digits/whitespace: skipped without ending the paragraph
            continue
        if line == '':
            # an empty line flushes whatever paragraph has been collected
            if pending:
                paragraphs.append(pending)
                pending = []
        elif len(re.findall(r'\w+', line)) > 1:
            pending.append(line)
    if pending:
        paragraphs.append(pending)

    # Every kept line is followed by a space, every paragraph by a newline.
    assembled = ''.join(
        ''.join(kept + ' ' for kept in lines) + "\n" for lines in paragraphs
    )
    return re.sub(' +', ' ', assembled).strip().lower()
def getTotalWords(doc):
    """
    Collect every whitespace-separated token of the document, in order.

    :param doc: text to tokenise; may contain several lines
    :return: list of tokens, empty when the document has no tokens
    """
    return [token for row in doc.splitlines() for token in row.split()]
def getWordSyllabel(word):
    """
    Returns syllables count of the given word

    The word is lower-cased and looked up in the module-level dictionary
    ``d``. The count is the number of phonemes in the first listed
    pronunciation whose last character is a digit (in CMUdict that digit is
    the stress marker carried by vowel sounds). Words that are not in the
    dictionary are handed to the heuristic ``syllables`` function instead.

    :param word: word to count syllables for
    :return: syllable count as an int
    """
    # Only the lookup can raise KeyError, so only the lookup is guarded.
    try:
        pronunciations = d[word.lower()]
    except KeyError:
        # if word not found in cmudict
        return syllables(word)
    # Only the first pronunciation is used; alternatives are ignored.
    return sum(1 for phoneme in pronunciations[0] if phoneme[-1].isdigit())
def syllables(word):
    """
    Returns syllables count of the given word
    Custom implementation for the words which are not in nltk dictionary

    Heuristic: each run of consecutive vowels (a, e, i, o, u, y) counts as one
    syllable; a trailing 'e' removes one, a trailing 'le' adds one back, and
    any non-empty word has at least one syllable.

    :param word: word to count syllables for
    :return: estimated syllable count; 0 for an empty string
    """
    vowels = 'aeiouy'
    word = word.lower()
    if not word:
        # Nothing to count; indexing word[0] here used to raise IndexError.
        return 0

    count = 0
    previous_is_vowel = False  # the first letter has no predecessor
    for letter in word:
        is_vowel = letter in vowels
        # Count only the first vowel of each vowel run.
        if is_vowel and not previous_is_vowel:
            count += 1
        previous_is_vowel = is_vowel

    if word.endswith('e'):
        count -= 1
    if word.endswith('le'):
        count += 1
    # A non-empty word is never reported as having zero syllables.
    return max(count, 1)
def smogScore(self):
    """
    SMOG grading = 3 + √polysyllable count.

    :param self: object exposing the polysyllable count as the attribute
        ``__polySyllableCount``
    :return: the SMOG grade as a float
    """
    # The module does not import math at file level, so the name is brought
    # into scope here; without this the call below raises NameError.
    import math

    # NOTE(review): this function is defined at module level, so Python does
    # not name-mangle ``__polySyllableCount`` here. If the attribute is set
    # inside a class body it will be stored under a mangled name -- confirm
    # against the caller.
    return 3 + math.sqrt(self.__polySyllableCount)
# print('readier', textstat.syllable_count('readier'))
# print('karate', textstat.syllable_count('karate'))
# print('insouciance', textstat.syllable_count('insouciance'))
# print('Siberia', textstat.syllable_count('Siberia'))