-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathfighting_words_py3.py
66 lines (63 loc) · 2.9 KB
/
fighting_words_py3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer as CV
import string
# Characters removed by basic_sanitize: exactly the ASCII punctuation listed
# in string.punctuation. Kept as a set for any code that inspects it.
exclude = set(string.punctuation)
# Translation table that deletes those same characters in a single pass,
# instead of testing every character against the set in Python.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def basic_sanitize(in_string):
    '''Returns a very roughly sanitized version of the input string.

    Steps, in order:
    - every character in string.punctuation is deleted (not replaced by a
      space, so "a,b" becomes "ab");
    - the text is lower-cased;
    - runs of whitespace are collapsed to one space and leading/trailing
      whitespace is dropped.

    Arguments:
    - in_string; the str to clean.
    Returns:
    - The cleaned str (possibly empty).'''
    without_punct = in_string.translate(_PUNCT_TABLE)
    # split() with no argument splits on any whitespace run and discards
    # empty pieces, so the join both collapses and trims whitespace.
    return ' '.join(without_punct.lower().split())
def bayes_compare_language(l1, l2, ngram = 1, prior=.01, cv = None):
    '''
    Scores every vocabulary item by how strongly it distinguishes l1 from l2.

    For each term the function computes the difference between the two
    samples' log-odds of the term, with the prior added to the counts as
    pseudo-counts, and divides it by the square root of an approximate
    variance. Positive scores mean the term is relatively more frequent in
    l1, negative scores in l2.
    NOTE(review): this looks like the "Fightin' Words" log-odds method with a
    Dirichlet prior -- confirm against the intended reference.

    Arguments:
    - l1, l2; a list of strings from each language sample
    - ngram; an int describing up to what n gram you want to consider (1 is unigrams,
      2 is bigrams + unigrams, etc). Ignored if a custom CountVectorizer is passed.
    - prior; either a number describing a uniform prior, or a vector describing a prior
      over vocabulary items. If you're using a predefined vocabulary, make sure to specify that
      when you make your CountVectorizer object. A vector must have one entry per
      vocabulary item, ordered by the vectorizer's column index.
    - cv; a sklearn.feature_extraction.text.CountVectorizer object, if desired.
      When omitted, one is built with min_df=10, max_df=.5 and max_features=15000.
    Returns:
    - A list of length |Vocab| where each entry is a (n-gram, zscore) tuple,
      sorted by ascending zscore.
    Raises:
    - ValueError; if a vector prior is given without a custom vectorizer, or if
      the prior's length does not match the vocabulary size.'''
    # A scalar (Python or NumPy number, any numeric type) has zero dimensions;
    # anything with dimensions is treated as a per-term prior vector.
    uniform_prior = np.ndim(prior) == 0
    if cv is None and not uniform_prior:
        # Without a fixed vocabulary the caller cannot know which column each
        # prior entry refers to, so this combination is rejected up front.
        raise ValueError(
            "If using a non-uniform prior: "
            "Please also pass a count vectorizer with the vocabulary parameter set.")
    l1 = [basic_sanitize(l) for l in l1]
    l2 = [basic_sanitize(l) for l in l2]
    if cv is None:
        cv = CV(decode_error = 'ignore', min_df = 10, max_df = .5, ngram_range=(1,ngram),
                binary = False,
                max_features = 15000)
    # Keep the document-term matrix in whatever form the vectorizer returns
    # (normally sparse); only per-sample column sums are needed below.
    counts_mat = cv.fit_transform(l1+l2)
    # Now sum over languages...
    vocab_size = len(cv.vocabulary_)
    print("Vocab size is {}".format(vocab_size))
    if uniform_prior:
        priors = np.full(vocab_size, float(prior))
    else:
        priors = np.asarray(prior, dtype=np.float64).ravel()
        if priors.shape[0] != vocab_size:
            raise ValueError(
                "prior has {} entries but the vocabulary has {} items".format(
                    priors.shape[0], vocab_size))
    # Rows of counts_mat follow the order of l1+l2, so the first len(l1) rows
    # belong to the first sample. float64 keeps large totals exact.
    n_first = len(l1)
    count_matrix = np.empty([2, vocab_size], dtype=np.float64)
    # np.asarray(...).ravel() flattens the 1 x V result a sparse sum returns.
    count_matrix[0, :] = np.asarray(counts_mat[:n_first].sum(axis = 0)).ravel()
    count_matrix[1, :] = np.asarray(counts_mat[n_first:].sum(axis = 0)).ravel()
    a0 = np.sum(priors)
    n1 = np.sum(count_matrix[0, :])
    n2 = np.sum(count_matrix[1, :])
    print("Comparing language...")
    # Smoothed counts (count + prior) for every term in each sample.
    smoothed1 = count_matrix[0, :] + priors
    smoothed2 = count_matrix[1, :] + priors
    #compute delta: difference of the two smoothed log-odds
    delta = (np.log(smoothed1/(n1 + a0 - smoothed1))
             - np.log(smoothed2/(n2 + a0 - smoothed2)))
    #compute variance on delta
    var = 1./smoothed1 + 1./smoothed2
    #final scores
    z_scores = delta/np.sqrt(var)
    # vocabulary_ maps term -> column index; invert it to label the scores.
    index_to_term = {v:k for k,v in cv.vocabulary_.items()}
    return [(index_to_term[i], z_scores[i]) for i in np.argsort(z_scores)]