-
Notifications
You must be signed in to change notification settings - Fork 0
/
LanguageModel.py
58 lines (50 loc) · 2.14 KB
/
LanguageModel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
class LanguageModel:
    """
    Unigram language model that ranks corpus documents by query likelihood.

    TF-IDF weights are used as a stand-in for the per-document term
    probabilities p(term | doc); a query is scored against a document by
    summing the smoothed log-weights of its tokens.

    # TODO: only unigrams are supported for now; try higher-order n-grams.
    """

    def __init__(self, tokens, corpus):
        """
        Store the inputs; no model is built until ``calculate_TF_IDF`` runs.

        :param tokens: stored as ``self.tokens``; not read by any method
            of this class.
        :param corpus: iterable of document strings handed to
            ``TfidfVectorizer`` (preferably already preprocessed).
        """
        self.tokens = tokens
        self.corpus = corpus

    def calculate_TF_IDF(self):
        """
        Fit a TF-IDF vectorizer on the corpus and cache the result.

        After the call, ``self.vectorizerX`` holds the fitted
        ``TfidfVectorizer`` and ``self.doc_vector`` holds the sparse
        document-term matrix, laid out as:

                          | word1     | word2     | ...
          tf-idf for doc1 | tf-idf_11 | tf-idf_12 | ...
          tf-idf for doc2 | tf-idf_21 | tf-idf_22 | ...
          ...             | ...       | ...       | ...
        """
        self.vectorizerX = TfidfVectorizer()
        # fit_transform is equivalent to fit() followed by transform() on the
        # same data, without processing the corpus twice.
        self.doc_vector = self.vectorizerX.fit_transform(self.corpus)

    def query_likelihood(self, query_tokens, top_k=9):
        """
        Rank the corpus documents by the log-likelihood of the query.

        For every document d_i the score is
        ``sum(log(tfidf(q, d_i) + eps) for q in query_tokens) / vocab_size``.
        Query tokens that are absent from the fitted vocabulary contribute
        ``log(eps)`` to every document instead of raising ``KeyError``.

        :param query_tokens: iterable of already tokenized query terms.
        :param top_k: maximum number of document indices to return. The
            default of 9 matches the size of the previously hard-coded slice.
        :return: numpy array of document indices, best match first.
        :raises ValueError: if ``top_k`` is negative.
        """
        if top_k < 0:
            raise ValueError("top_k must be non-negative")

        # Build the TF-IDF index on first use if the caller has not done so.
        if getattr(self, "doc_vector", None) is None:
            self.calculate_TF_IDF()

        eps = 0.000001  # To deal with zero-probabilities of terms
        n_docs, n_terms = self.doc_vector.shape
        # vocabulary_ maps each term to its column index; unlike
        # get_feature_names() it is available in every scikit-learn release.
        vocabulary = self.vectorizerX.vocabulary_

        scores = np.zeros(n_docs)  # Accumulates log p(q|d_i) per document
        for q in query_tokens:
            column = vocabulary.get(q)
            if column is None:
                # Unknown term: its weight is zero in every document.
                scores += np.log(eps)
            else:
                # Densify only the single column that is actually needed.
                weights = self.doc_vector[:, column].toarray().ravel()
                scores += np.log(weights + eps)

        # NOTE: the normaliser is the vocabulary size, which is the same for
        # every document; dividing by the real document length instead may
        # be more appropriate.
        scores /= n_terms

        related_docs_indices = np.argsort(scores)[::-1][:top_k]
        print("Related docs indices with LM", related_docs_indices)
        return related_docs_indices