movies_nlu.py
"""
Simple NLU for the movie domain
===============================
Author: Shuo Zhang, Krisztian Balog
"""

import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize
from simulation.nlp.movies import REPLACE_BY_SPACE_RE, BAD_SYMBOLS_RE, UTTERANCE_PATTERN
from simulation.nlp.nlu import NLU

PRE_FILE = "data/metadata_prep.csv"
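# Assumption (inferred from the column accesses below, cf. user/ml-20m/data_pre):
# metadata_prep.csv is expected to provide at least a 'title' column and a
# 'genres' column, with genres given as a comma-separated string.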


class MoviesNLU(NLU):
    """Movies NLU"""

    def __init__(self):
        super().__init__()
        self.metadata, self.titles_all, self.tfidf_fit_nlu, \
            self.tfidf_matrix_nlu = self.naive_index()

    def naive_index(self):
        """Loads the MovieLens dataset as a local index.

        Returns: MovieLens indexes
        """
        metadata = pd.read_csv(PRE_FILE)  # cf. user/ml-20m/data_pre for data preparation
        titles = metadata['title'].tolist()
        docs = [self.text_prepare(title) for title in titles if isinstance(title, str)]
        titles_all = [title for title in titles if isinstance(title, str)]
        tfidf_vectorizer = TfidfVectorizer()
        tfidf_fit = tfidf_vectorizer.fit(docs)
        tfidf_matrix = tfidf_fit.transform(docs)
        return metadata, titles_all, tfidf_fit, tfidf_matrix

    @staticmethod
    def text_prepare(doc):
        """Normalizes and tokenizes the doc.

        Args:
            doc: text

        Returns: preprocessed text
        """
        doc = doc.lower()
        doc = REPLACE_BY_SPACE_RE.sub(' ', doc)
        doc = BAD_SYMBOLS_RE.sub('', doc)
        doc = " ".join(word_tokenize(doc))
        return doc

    @staticmethod
    def parse(text):
        """Removes special tags to avoid problems such as parenthesis matching in regex.

        Args:
            text: text

        Returns: parsed text
        """
        for tag in ["\n", ";)"]:
            while tag in text:
                text = text.replace(tag, "")
        return text

    @staticmethod
    def find_pattern(utterance):
        """Finds the pattern by checking the prefix, i.e., checking the terms one by one after splitting.

        Args:
            utterance: agent utterance

        Returns: pattern
        """
        prefix_list = UTTERANCE_PATTERN
        for i in range(len(utterance.split())):
            # Extend the prefix by the next term and keep only the patterns sharing that prefix.
            current_prefix_list = [j for j in prefix_list if utterance.split()[i] == j.split()[i]]
            prefix_list = current_prefix_list
            if len(prefix_list) == 1:
                return UTTERANCE_PATTERN.get(prefix_list[0])
        print("AGENT: ", utterance)
        raise SyntaxError("Pattern not found!")

    def link_entity(self, sf, id=None):
        """Links entity for the given surface form.

        Args:
            sf: surface form
            id: if set, return the index of the best-matching title instead of the title itself

        Returns: linked movie title (or its index)
        """
        # Cosine similarity between the TF-IDF vector of the surface form and all indexed titles.
        a = list(cosine_similarity(self.tfidf_fit_nlu.transform([self.text_prepare(sf)]),
                                   self.tfidf_matrix_nlu.tocsr())[0])
        b = sorted(range(len(a)), key=lambda i: a[i], reverse=True)[0]
        return self.titles_all[b] if not id else b  # return title or id

    def link_entities(self, text):
        """Links entities in the given text.

        Args:
            text: text

        Returns: list of (text, surface form, linked title, genres) tuples
        """
        sfs = self.extract_sf(self.parse(text))
        return [(text, sf, self.link_entity(sf), self.movie_genre(sf)) for sf in sfs]

    def extract_sf(self, text):
        """Locates and extracts surface forms for movie titles based on the recorded
        utterance patterns (cf. UTTERANCE_PATTERN).

        Args:
            text: text

        Returns: surface forms
        """
        pattern = self.find_pattern(utterance=text)
        p = re.compile(pattern).findall(text)
        return p if isinstance(p[0], str) else list(p[0])

    def movie_genre(self, title):
        """Finds the movie genres based on the movie title.

        Args:
            title: movie title

        Returns: list of movie genres
        """
        try:
            res = str(self.metadata[self.metadata['title'] == title]
                      ['genres'].values[0]).split(", ")
        except Exception:
            res = []
        return res
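

if __name__ == "__main__":
    # Illustrative usage sketch, assuming data/metadata_prep.csv is available and
    # that the example utterance below matches one of the prefixes recorded in
    # UTTERANCE_PATTERN; both the file path and the utterance are assumptions
    # for demonstration only.
    nlu = MoviesNLU()
    example_utterance = "Have you seen Forrest Gump"  # hypothetical agent utterance
    try:
        for _, sf, title, genres in nlu.link_entities(example_utterance):
            print("surface form: %s -> title: %s, genres: %s" % (sf, title, genres))
    except SyntaxError:
        # find_pattern() raises SyntaxError when no recorded pattern matches the utterance.
        print("No utterance pattern matched.")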