-
Notifications
You must be signed in to change notification settings - Fork 0
/
retrieval_algorithms.py
271 lines (242 loc) · 10.4 KB
/
retrieval_algorithms.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
"""
@author Daniel Salazar Mora
Algorithms to retrieve information by cleaning a query and then, using the terms weight algorithm,
provide to the user relevant information. This information is related to restaurants that may be
a good fit for the user.
"""
import requests
import os
from threading import Thread
# Custom Search API: JSON key holding the list of search results.
JSON_RESULTS_SEARCH_API = "items"
# Place API constants: JSON keys used to read fields out of the
# Places Text Search response objects.
JSON_RESULTS_PLACE_API = "results"
JSON_PLACE_NAME_PLACE_API = "name"
JSON_PLACE_PRICE_STATUS_PLACE_API = "price_level"
JSON_PLACE_RATING_PLACE_API = "rating"
JSON_PLACE_WEBSITE_PLACE_API = "website"
"""
Will make the GET requests with the Google API
"""
def get_request(url, params):
try:
r = requests.get(url = url, params = params)
return r.json()
except Exception as e:
print(e)
"""
Will return the Google API Key thats its set in the environment.
"""
def get_key():
return "key="+str(os.getenv('API_KEY'))
"""
Will return the Google API Key thats its set in the environment.
"""
def get_search_key():
return "key="+str(os.getenv('API_KEY_SEARCH'))
"""
Returns the Google Search Engine identifier. Its delimited to only search
in spanish and inside Costa Rica.
"""
def get_cx():
return "cx="+str(os.getenv('SEARCHENGINEID'))
"""
Returns the query with the requested format.
"""
def get_query(query):
return "q="+query
"""
Returns the query with the requested format.
"""
def get_query_place_api(query):
return "query="+query
"""
Returns the url to make the get request and get the results.
"""
def get_url_search_api(query, start):
return f"https://www.googleapis.com/customsearch/v1?{get_search_key()}&{get_cx()}&{get_query(query)}&num=10&start={start}"
"""
Returns the url to make the get request and get the results.
"""
def get_url_place_api(query, start):
return f"https://maps.googleapis.com/maps/api/place/textsearch/json?{get_query_place_api(query)}&{get_key()}"
"""
Removes the stopwords such as innecesary or repeated words in the query. Will return the phrase
given without the stopwords in order to create the query afterwards.
"""
def remove_stopwords(phrase, stopword):
"""
File obtained from https://github.com/xiamx/node-nltk-stopwords/blob/master/data/stopwords/spanish
and modified so it can adapt to this program.
"""
new_text = []
phrase_splitted = phrase.split()
for i in range(len(phrase_splitted)):
if phrase_splitted[i] != stopword:
new_text.append(phrase_splitted[i])
return " ".join(new_text)
"""
Cleans the query by removing stopwords and returns the new query with only relevant keywords.
"""
def clean_query(food, place, extras):
try:
stopwords_file = open("stopwords.txt", "r")
for stopword in stopwords_file:
stopword = stopword.replace("\n", "") # sanitize
food = remove_stopwords(food, stopword)
place = remove_stopwords(place, stopword)
extras = remove_stopwords(extras, stopword)
query = food + " " + extras + " en Costa Rica " + place
stopwords_file.close()
return True, query
except:
return False, "Couldn't open the file"
"""
Check that the results wont be repeated and that they are considered relevant. Returns if is valid.
"""
def hostname_allowed(link):
# These sites are removed as they provide tops or information not directly relevant.
denied_sites_keywords = ["instagram","moovitapp","five","mochil","ihop","economi","miami",
"new","yelp","tips","ucr","deli","wiki","viaje","travel","facebook","twitter","free",
"top","expedia","tiktok","find","search","foursquare", "baix", "trip", "pdf", "sale"]
for keyword in denied_sites_keywords:
if keyword in link:
return False
return True
"""
Will get the results by making the get request to the api. Will do it api_calls_amount of times.
If no results are received, then will return None as an error. Else, will return the array of data.
The param user is the reference of the Telegram user.
"""
def get_results(query, logger, user):
start = 1
items = []
api_calls_amount = int(os.getenv('API_CALLS_AMOUNT'))
for _ in range(api_calls_amount): # Amount of API calls
try:
url = get_url_place_api(query, start)
data = get_request(url, None)
start += 10 # moves to next page of results
items += data[JSON_RESULTS_PLACE_API]
except Exception as e:
# May happen if there are not many results or qouta exceeded
logger.warning("Error %s in get api call for user %s", str(e), user.first_name)
return items if len(items) > 0 else None
"""
Will receive the ranking weights and its an array of 5 tuples (weight, result object).
Asks if the new weight should be inside the top and if thats the case, will add it to the ranking
and move to the end all the rest of results. The last result will disappear as it will no longer
be top 5.
"""
def update_top(ranking_weights, new_weight, new_result):
tmp_weight = 0
tmp_result = None
changed = False
for i,tuple in enumerate(ranking_weights):
weight_in_top = tuple[0]
result_in_top = tuple[1]
# Move the values if it was updated
if changed:
ranking_weights[i] = (tmp_weight, tmp_result)
tmp_weight = weight_in_top
tmp_result = result_in_top
# Check if needs to update
elif weight_in_top <= new_weight:
tmp_weight = weight_in_top
tmp_result = result_in_top
ranking_weights[i] = (new_weight, new_result)
changed = True # Top updated. Following values will reaccommodate.
"""
Initialize a top 5 with empty tuples (weight, result object) and the complete ranking {}.
Will iterate per every result received and make a GET request to the specific link related to it.
Then, will calculate the terms weight algorithm with the body response of the request and save the results
in a dictionary per result. So, every result has a dictionary (named words) that goes {keyword: weight}.
Every result will be saved in the ranking dictionary with the URL as key and the value is a tuple of the total
weight and the word dictionary mentioned earlier.
At the end of the process of every result, the program will check if the result should be in the top 5 relevant
results and if thats the case will add it to top_more_weights. Thats a list of tuples (weight, result object) where
the first value is the top 1 relevant result.
"""
def get_ranking(top_more_weights, results, query):
pages_already_seen = [] # avoids repetition
ranking = {} # { result url : ( total_weight, { word: weight }) }
for result in results:
words = {}
weight_sum = 0
try:
if (result[JSON_PLACE_NAME_PLACE_API] in pages_already_seen):
continue # ignore as it is repeated or not relevant result
query_per_restaurant = get_url_search_api(result[JSON_PLACE_NAME_PLACE_API]+"en Costa Rica",10)
#print("Primer llamado con consulta: ", query_per_restaurant)
response = requests.get(query_per_restaurant,
timeout=3).json() # GET request of
#print("Segundo print a", response["items"][0]["link"])
data_of_first_page = requests.get(response["items"][0]["link"],
timeout=3) # GET request
data = str(data_of_first_page.text).lower() # body response
for word in query.split(" "):
if word == "" or word == "in" or word == " ": continue # ignore stopwords
counter = data.count(word.lower()) # calculate weight of every word in the body
if counter > 0:
words[word] = counter
weight_sum += counter # updating total weight
pages_already_seen.append(result[JSON_PLACE_NAME_PLACE_API])
except Exception as e:
print(e)
continue # There was en error with the result. Will ignore it.
if len(words) > 0: # If there were results
ranking[result[JSON_PLACE_NAME_PLACE_API]] = (weight_sum, words, result)
# Check if result is in the top 5 of relevant results
update_top(top_more_weights, weight_sum, result)
return top_more_weights
"""
Will split list of results to allow parallelism.
"""
def split_list(a_list):
half = len(a_list)//2
return a_list[:half], a_list[half:]
"""
Will check if results were received from the API GET general request and if thats the case
starts with filtering hostnames and calculating the weights. Returns the ranking as a list og
tuples (weight, result object) or None if an error occurred.
"""
def get_relevant_results(items, query, logger, user):
if len(items) > 0:
try:
logger.info("Starting to get ranking for user %s", user.first_name)
items1, items2 = split_list(items)
ranking1 = [(0, None),
(0, None),
(0, None),
(0, None),
(0, None) ] # as will be top 5. (Weight Count, Result)
ranking2 = ranking1[:] # Copy
#return get_ranking(ranking1, items, query)
# Init threads
t1 = Thread(target=get_ranking, args=(ranking1, items1, query,))
t2 = Thread(target=get_ranking, args=(ranking2, items2, query,))
t1.start()
t2.start()
t1.join()
t2.join()
# Compare both new rankings
for i in range(len(ranking2)):
# This way, ranking2 will be the total ranking
update_top(ranking2, ranking1[i][0], ranking1[i][1])
return ranking2
except Exception as e:
logger.error("Error %s in get relevant results async for user %s", str(e), user.first_name)
return None # No results or invalid
# Only created for testing purposes before telegram connection
def main():
    """Console driver: prompt for the three query parts and run the pipeline.

    Builds a stand-in logger and user object so the functions that expect
    the Telegram context can be exercised from the terminal.
    """
    # Local imports: only this test driver needs them.
    import logging
    from types import SimpleNamespace
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
    user = SimpleNamespace(first_name="console")  # stand-in for the Telegram user
    food = input("¿Qué te gustaria comer? ")
    place = input("En que lugar te gustaría que esté ubicado el restaurante? ")
    extras = input("¿Detalles extras que desees del lugar? ")
    status, query_or_error = clean_query(food.lower(), place.lower(), extras.lower())
    if status:
        # BUG FIX: get_results and get_relevant_results require the
        # logger and user arguments; calling them with only the query
        # raised TypeError before any request was made.
        items = get_results(query_or_error, logger, user)
        relevant_results = get_relevant_results(items, query_or_error, logger, user)
        print(relevant_results)
    else:
        print(query_or_error)
if __name__ == "__main__":
main()