-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcompression.py
82 lines (68 loc) · 2.45 KB
/
compression.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/env python
import glob
import optparse
import os
import sys
import math
from collections import defaultdict
from collections import Counter
# ---------------------------------------------------------------------------
# Shared model state (read by the training loop, the scoring loop and
# calculate_space()).
# ---------------------------------------------------------------------------

# k is the maximum context length (in characters); adjustable
k = 2

# language name --> predictions map built for that language
lang_map = {}

# predictions map currently being queried: context --> symbol --> value.
# Unknown contexts/symbols read as 0.0 because of the nested defaultdicts.
lm = defaultdict(lambda: defaultdict(float))

# ---------------------------------------------------------------------------
# Command line: -d/--data is the directory globbed for training files,
# -t/--test is the file whose lines get classified.
# ---------------------------------------------------------------------------
optparser = optparse.OptionParser()
optparser.add_option(
    "-d", "--data",
    dest="data",
    default="data/train",
)
optparser.add_option(
    "-t", "--test",
    dest="test",
    default="data/test",
)
# parse_args() returns (options, positional_args); only the options are used.
opts = optparser.parse_args()[0]
def calculate_space(context, x, index, model=None, max_context=None):
    """Return the model's probability estimate for symbol ``x`` after ``context``.

    The lookup backs off from longer to shorter contexts:

    * if the model has a non-zero value for ``x`` under ``context``, that
      value is returned;
    * otherwise, for the empty context, the uniform fallback
      ``1.0 / model['alpha']`` is returned;
    * otherwise the leftmost context character is dropped and the result for
      the shorter context is scaled by this context's ``'esc'`` value (the
      scaling is skipped when that value is missing or ``0.0``).

    Args:
        context: characters preceding ``x``; only the last ``max_context``
            characters are used.
        x: the symbol being predicted.
        index: unused; kept so existing callers keep working.
        model: mapping ``context -> {symbol: value, 'esc': value}`` plus the
            key ``'alpha'`` holding the alphabet size.  Defaults to the
            module-level ``lm`` *as bound at call time*.
        max_context: longest context to consult.  Defaults to the
            module-level ``k``.

    Returns:
        A float; strictly positive as long as ``model['alpha']`` is positive.
    """
    # Resolve the defaults at call time (not as default argument values) so
    # that rebinding the globals ``lm`` / ``k`` is honoured.
    if model is None:
        model = lm
    if max_context is None:
        max_context = k

    # Truncate once, up front.  ``len(context) - max_context`` (rather than
    # ``-max_context``) also behaves for max_context == 0, where a negative
    # zero slice would keep the whole string.
    if len(context) > max_context:
        context = context[len(context) - max_context:]

    # ``.get`` never triggers a defaultdict's factory, so probing an unseen
    # context or symbol does not insert empty entries into the model.
    dist = model.get(context)
    if dist is None:
        dist = {}

    prob = dist.get(x, 0.0)
    if prob:
        return prob
    if context == '':
        return 1.0 / model['alpha']

    # Back off: recursion depth is bounded by max_context + 1.
    shorter = calculate_space(context[1:], x, index, model, max_context)
    esc = dist.get('esc', 0.0)
    if esc == 0.0:
        return shorter
    return esc * shorter
# Create one language model per training file.  Each model maps
#   context (0..k characters) --> {symbol: probability, 'esc': escape weight}
# and additionally stores the alphabet size under the key 'alpha'.
for path in glob.glob(opts.data + "/*"):
    lang_name = os.path.basename(path)

    # Read the file once and close it promptly.  The text is lower-cased
    # here so that training sees the same symbols as the alphabet count
    # below and as the lower-cased test lines.
    with open(path) as train_file:
        text = train_file.read().lower()

    # Alphabet size: number of distinct characters in the lower-cased file.
    alpha = len(set(text))

    # A single table per language, accumulated over every line of the file
    # (re-creating it per line would discard all but the last line's counts).
    prediction = defaultdict(lambda: defaultdict(float))
    for line in text.split('\n'):
        line = line.strip().replace(' ', '')
        # Count each symbol under every context length j = 0..k.
        for j in range(k + 1):
            for idx in range(len(line) - j):
                context = line[idx:idx + j]
                prediction[context][line[idx + j]] += 1

    # Turn counts into probabilities.  The number of distinct symbols seen
    # in a context is added to the denominator and is also the numerator of
    # that context's escape weight.
    for context in prediction:
        dist = prediction[context]
        esc_num = len(dist)
        count = sum(dist.values()) + esc_num  # float: the counts are floats
        for letter in dist:
            dist[letter] = dist[letter] / count
        dist['esc'] = esc_num / count

    prediction['alpha'] = alpha
    lang_map[lang_name] = prediction
# Classify every line of the test file: score it under each language model
# and print the name of the language with the lowest average number of bits
# per character (one output line per input line).
with open(opts.test) as test_file:
    for idx, line in enumerate(test_file):
        # Progress indicator on stderr every 50 lines.
        if idx % 50 == 0:
            sys.stderr.write("%s\n" % idx)

        line = line.lower().strip().replace(' ', '')
        n = len(line)

        # Infinity instead of a fixed threshold, so some language is always
        # chosen when there is anything to score.
        lowest_ent = float('inf')
        lang_guess = ''

        # A blank line has nothing to score (and would divide by zero); it
        # keeps the empty guess so the output stays aligned with the input.
        if n:
            for lang in lang_map:
                # calculate_space() reads the module-level ``lm``.
                lm = lang_map[lang]
                total_bits = 0.0
                # Score every character, including the last one.
                for i in range(n):
                    x_i = line[i]
                    # Only the previous k characters can matter; passing just
                    # those avoids building an O(n) prefix per character.
                    context_i = line[max(0, i - k):i]
                    prob = calculate_space(context_i, x_i, idx)
                    total_bits -= math.log(prob, 2)
                current_ent = total_bits / n
                if current_ent < lowest_ent:
                    lowest_ent = current_ent
                    lang_guess = lang

        # Parenthesised single-argument form prints identically on
        # Python 2 and Python 3.
        print(lang_guess)