-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathembedding.py
68 lines (58 loc) · 1.9 KB
/
embedding.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# set of bilingual word embeddings
import numpy as np
from bisect import bisect_left
from collections import Counter
from os.path import exists
from scipy.sparse import dok_matrix
from scipy.sparse.linalg import svds
def get_index(a, x):
    """Return the position of ``x`` in the sorted sequence ``a``, or -1.

    ``a`` must already be sorted in ascending order, because the lookup is
    a binary search. When ``x`` occurs more than once, the leftmost
    position is returned.
    """
    pos = bisect_left(a, x)
    # bisect_left yields an insertion point in [0, len(a)]; it is a hit
    # only when that slot exists and actually holds x.
    found = pos < len(a) and a[pos] == x
    return pos if found else -1
def get_vocabulary(in_wd, n_vocab=None):
    """Return the ``n_vocab`` most frequent words of a corpus, sorted.

    Args:
        in_wd: iterable of sentences, each an iterable of hashable words.
        n_vocab: maximum vocabulary size; ``None`` keeps every word.

    Returns:
        A list of the selected words in ascending sorted order (the order
        is by word, not by frequency), suitable for binary search.
    """
    counts = Counter(token for sentence in in_wd for token in sentence)
    top_words = [token for token, _ in counts.most_common(n_vocab)]
    top_words.sort()
    return top_words
# super simple: bilingual LSA over sentence-aligned word counts
def bi_lsa(src_wd, tgt_wd, n_vocab=10000, n_dims=100,
           cache_path='fien_bi_lsa.npy'):
    """Build bilingual LSA embeddings from sentence-aligned corpora.

    A (words x sentences) count matrix is filled with the source vocabulary
    in the first rows and the target vocabulary in the rows after it; column
    ``i`` holds the counts of sentence pair ``i``. The left singular vectors
    of a truncated SVD of that matrix are the embeddings.

    Args:
        src_wd: sequence of source sentences, each an iterable of words.
            Must support ``len()`` and integer indexing.
        tgt_wd: sequence of target sentences, indexed in parallel with
            ``src_wd``; it needs at least ``len(src_wd)`` entries when the
            matrix has to be computed.
        n_vocab: maximum vocabulary size per language (``None`` = no limit).
        n_dims: number of singular vectors to keep. ``svds`` requires it to
            be smaller than both matrix dimensions.
        cache_path: ``.npy`` file used to cache the embedding between runs;
            pass ``None`` to disable caching. The path should end in
            ``.npy`` because ``np.save`` appends that suffix otherwise.

    Returns:
        ``(src_v, tgt_v, embedding)``: the two sorted vocabularies and an
        array of shape ``(len(src_v) + len(tgt_v), n_dims)``. The row of
        source word ``src_v[i]`` is ``i``; the row of target word
        ``tgt_v[j]`` is ``len(src_v) + j``.
    """
    src_v = get_vocabulary(src_wd, n_vocab=n_vocab)
    tgt_v = get_vocabulary(tgt_wd, n_vocab=n_vocab)
    n_src = len(src_v)
    n_words = n_src + len(tgt_v)

    if cache_path is not None and exists(cache_path):
        embedding = np.load(cache_path)
        # Only trust a cache whose shape fits the current vocabularies;
        # otherwise its rows cannot line up with src_v / tgt_v, so fall
        # through and recompute (overwriting the stale file).
        if embedding.shape == (n_words, n_dims):
            return src_v, tgt_v, embedding

    n_sents = len(src_wd)
    lsa_store = dok_matrix((n_words, n_sents))
    for sent_ix in range(n_sents):
        for word in src_wd[sent_ix]:
            wd_ix = get_index(src_v, word)
            if wd_ix != -1:
                lsa_store[wd_ix, sent_ix] += 1
        for word in tgt_wd[sent_ix]:
            wd_ix = get_index(tgt_v, word)
            if wd_ix != -1:
                # target rows are stacked below the source rows
                lsa_store[n_src + wd_ix, sent_ix] += 1

    # DOK is convenient for incremental updates but slow for the repeated
    # matrix-vector products inside svds, so convert to CSR first.
    embedding, _, _ = svds(lsa_store.tocsr(), k=n_dims,
                           return_singular_vectors='u')
    if cache_path is not None:
        np.save(cache_path, embedding)

    # keep two vocabularies separate
    # because of the potential for overlap
    return src_v, tgt_v, embedding
# requires word alignment
def bi_skip(src_wd, tgt_wd):
    """Placeholder: not implemented yet; always returns an empty list."""
    # TODO: implement; both arguments are currently ignored.
    return []
# enforces sentence-level agreement
def bi_cvm(src_wd, tgt_wd):
    """Placeholder: not implemented yet; always returns an empty list."""
    # TODO: implement; both arguments are currently ignored.
    return []
# requires translational lexicon
def bi_cca(src_wd, tgt_wd):
    """Placeholder: not implemented yet; always returns an empty list."""
    # TODO: implement; both arguments are currently ignored.
    return []
# requires comparable documents
def bi_vcd(src_wd, tgt_wd):
    """Placeholder: not implemented yet; always returns an empty list."""
    # TODO: implement; both arguments are currently ignored.
    return []