-
Notifications
You must be signed in to change notification settings - Fork 2
/
attutil.py
99 lines (85 loc) · 3.9 KB
/
attutil.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
from collections import defaultdict
import re
import math
class FindNgrams:
    """Discover frequent n-grams in tokenized text.

    Two strategies are provided:
      * count_ngram      -- brute-force counting of all n-grams up to length n.
      * find_ngrams_pmi  -- merge adjacent word pairs whose pointwise mutual
                            information (PMI) reaches ``min_pmi`` into segments,
                            then keep frequent segments of length <= n.
    Input "texts" are iterables of token lists (one list per sentence).
    """

    def __init__(self, min_count=0, min_pmi=0, language='en'):
        # Keep only words / pairs / n-grams seen strictly more than min_count times.
        self.min_count = min_count
        # Minimum PMI for two adjacent words to be merged into one segment.
        self.min_pmi = min_pmi
        self.words = defaultdict(int)   # unigram counts
        self.ngrams, self.pairs = defaultdict(int), defaultdict(int)
        self.total = 0.                 # number of adjacent word pairs observed
        self.language = language        # kept for callers; not used internally here

    def text_filter(self, sentence):
        """Split a token list into runs of "word" tokens, lower-casing each.

        A token made entirely of characters outside the allowed ranges
        (Arabic, CJK, ASCII digits/letters) acts as a separator and is
        dropped. Returns a list of non-empty lower-cased token lists.
        """
        cleaned_text = []
        start = 0
        for i, token in enumerate(sentence):
            # Separator token: everything before it (if any) becomes one run.
            if re.match(u'[^\u0600-\u06FF\u0750-\u077F\u4e00-\u9fa50-9a-zA-Z]+', token):
                if i > start:
                    cleaned_text.append([t.lower() for t in sentence[start:i]])
                start = i + 1
        if start < len(sentence):
            cleaned_text.append([t.lower() for t in sentence[start:]])
        return cleaned_text

    def count_ngram(self, texts, n):
        """Count every n-gram of length 1..n in ``texts``.

        Stores the result in ``self.ngrams`` as {tuple_of_tokens: count},
        keeping only entries with count > ``self.min_count``.
        """
        self.ngrams = defaultdict(int)
        for sentence in texts:
            for sub_sentence in self.text_filter(sentence):
                for i in range(n):
                    n_len = i + 1
                    for j in range(len(sub_sentence) - i):
                        ngram = tuple(sub_sentence[j: j + n_len])
                        self.ngrams[ngram] += 1
        self.ngrams = {g: c for g, c in self.ngrams.items() if c > self.min_count}

    def find_ngrams_pmi(self, texts, n, freq_threshold):
        """Segment ``texts`` by PMI and collect frequent segments as n-grams.

        Adjacent word pairs with PMI >= ``self.min_pmi`` are merged into one
        segment; segments of length <= ``n`` with count > ``self.min_count``
        are kept, then re-counted via ``renew_ngram_by_freq`` with
        ``freq_threshold``.
        """
        # First pass: unigram counts, adjacent-pair counts, total pair count.
        for sentence in texts:
            for sub_sentence in self.text_filter(sentence):
                self.words[sub_sentence[0]] += 1
                for i in range(len(sub_sentence) - 1):
                    self.words[sub_sentence[i + 1]] += 1
                    self.pairs[(sub_sentence[i], sub_sentence[i + 1])] += 1
                    self.total += 1
        self.words = {w: c for w, c in self.words.items() if c > self.min_count}
        self.pairs = {p: c for p, c in self.pairs.items() if c > self.min_count}

        # PMI(a, b) ~ log(total * count(a,b) / (count(a) * count(b))).
        # Pairs at or above min_pmi are "strong" and get merged into one segment.
        self.strong_segments = set()
        for pair, pair_count in self.pairs.items():
            if pair[0] in self.words and pair[1] in self.words:
                mi = math.log(self.total * pair_count / (self.words[pair[0]] * self.words[pair[1]]))
                if mi >= self.min_pmi:
                    self.strong_segments.add(pair)

        # Second pass: greedily grow segments along strong pairs.
        self.ngrams = defaultdict(int)
        for sentence in texts:
            for sub_sentence in self.text_filter(sentence):
                segment = [sub_sentence[0]]
                for i in range(len(sub_sentence) - 1):
                    if (sub_sentence[i], sub_sentence[i + 1]) in self.strong_segments:
                        segment.append(sub_sentence[i + 1])
                    else:
                        self.ngrams[tuple(segment)] += 1
                        segment = [sub_sentence[i + 1]]
                # BUG FIX: flush the trailing segment; the original version
                # silently dropped the last segment of every sub-sentence.
                self.ngrams[tuple(segment)] += 1
        self.ngrams = {g: c for g, c in self.ngrams.items() if c > self.min_count and len(g) <= n}
        self.renew_ngram_by_freq(texts, freq_threshold, n)

    def renew_ngram_by_freq(self, all_sentences, min_feq, ngram_len=10):
        """Re-count the n-grams currently in ``self.ngrams`` over the corpus.

        Keeps only n-grams whose recount is > ``min_feq``.

        BUG FIX: the original indexed the raw ``sentence`` instead of the
        filtered, lower-cased run ``sen``, so it could count n-grams that
        span separator tokens and never matched n-grams containing tokens
        that were upper-case in the raw input.
        """
        new_ngram2count = {}
        for sentence in all_sentences:
            for sen in self.text_filter(sentence):
                for i in range(len(sen)):
                    for length in range(1, ngram_len + 1):
                        if i + length > len(sen):
                            break
                        candidate = tuple(sen[i: i + length])
                        if candidate in self.ngrams:
                            new_ngram2count[candidate] = new_ngram2count.get(candidate, 0) + 1
        self.ngrams = {gram: c for gram, c in new_ngram2count.items() if c > min_feq}