# analyzer.py
from bs4 import BeautifulSoup
from datetime import datetime
from fuzzywuzzy import fuzz
from collections import defaultdict
import itertools
import re
import requests
import sqlite3
import time
import wget  # can probably be removed
import os  # still needs to be used
import PyPDF2
import random
import matplotlib.pyplot as plt
#See variables at the bottom for further information.
##################################################################################################
##################################################################################################
########################################### HERE BE DATABASES ####################################
##################################################################################################
##################################################################################################
def save_to_db(result_title, title, excerpt, article_type, no_pages, document_link, fetched_on):
"""Saves the data to the database"""
data = sqlite3.connect("scraper_save_file.db")
c = data.cursor()
    c.execute('''
    CREATE TABLE IF NOT EXISTS saved_data
    (result_title TEXT, title TEXT, excerpt TEXT, article_type TEXT, no_pages INT, document_link TEXT, fetched_on TEXT, document_text TEXT, pdf_url TEXT, enhanced_on TEXT, abstracts TEXT, article_rank INT, number_of_mentions INT, NORMALIZED_number_of_mentions REAL, date_of_analysis TEXT, used_search_terms TEXT, estimated_publication_date INT)''')
    c.execute("""
    INSERT INTO saved_data
    (result_title, title, excerpt, article_type, no_pages, document_link, fetched_on) VALUES
    (?,?,?,?,?,?,?)""", (result_title, title, excerpt, article_type, no_pages, document_link, fetched_on))
    data.commit()
    data.close()
def analysed_save_to_db(result_title, title, excerpt, article_type, no_pages, document_link, fetched_on, document_text, pdf_url, enhanced_on, abstracts, article_rank, number_of_mentions, NORMALIZED_number_of_mentions, date_of_analysis, used_search_terms, estimated_publication_date):
"""Saves the data to the database"""
data = sqlite3.connect("analysed.db")
c = data.cursor()
    c.execute('''
    CREATE TABLE IF NOT EXISTS saved_data
    (result_title TEXT, title TEXT, excerpt TEXT, article_type TEXT, no_pages INT, document_link TEXT, fetched_on TEXT, document_text TEXT, pdf_url TEXT, enhanced_on TEXT, abstracts TEXT, article_rank INT, number_of_mentions INT, NORMALIZED_number_of_mentions REAL, date_of_analysis TEXT, used_search_terms TEXT, estimated_publication_date INT)''')
    c.execute("""
    INSERT INTO saved_data
    (result_title, title, excerpt, article_type, no_pages, document_link, fetched_on, document_text, pdf_url, enhanced_on, abstracts, article_rank, number_of_mentions, NORMALIZED_number_of_mentions, date_of_analysis, used_search_terms, estimated_publication_date) VALUES
    (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)""", (result_title, title, excerpt, article_type, no_pages, document_link, fetched_on, document_text, pdf_url, enhanced_on, abstracts, article_rank, number_of_mentions, NORMALIZED_number_of_mentions, date_of_analysis, used_search_terms, estimated_publication_date))
    data.commit()
    data.close()
##################################################################################################
##################################################################################################
########################### HERE BE THE PART THAT PROCESSES THE DATABASES ############################
##################################################################################################
##################################################################################################
def search_term_finder(keywords):
    """Turns the list of keywords into a single string to be saved to the database"""
    return " ".join(keywords)
def keyword_graph_maker(key, years):
    """Creates a line graph for the occurrence of the keyword"""
    # SELECT MIN(estimated_publication_date) FROM saved_data;
    # SELECT MAX(estimated_publication_date) FROM saved_data;
    #TO BE RUN AT THE END OF THE PROGRAM, TO TAKE STOCK
    #fetch keyword
    #fetch min year
    #fetch max year
    #loop from min to max (steps of 1 year)
    #select the entries that have that year and select their hits
    #Make a graph with the keyword as title
    #Save and done
    #Do the same but dividing hits by page numbers (do not use the normalized values, those are for totals)
    pass
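#A minimal sketch of the plan above, under a hypothetical name; it is not part
#of the original program. Assumptions: it reads the analysed.db produced by
#analysed_save_to_db, sums number_of_mentions per estimated_publication_date
#(the database stores no per-keyword counts, so `key` is only used for the
#title and the filename), and saves one graph of raw hits and one of hits
#divided by page count, as the last comment in the stub suggests.
def keyword_graph_maker_sketch(key):
    data = sqlite3.connect("analysed.db")
    c = data.cursor()
    #fetch min and max year
    min_year, max_year = c.execute(
        "SELECT MIN(estimated_publication_date), MAX(estimated_publication_date) "
        "FROM saved_data WHERE estimated_publication_date IS NOT NULL").fetchone()
    if min_year is None:
        data.close()
        return
    #loop from min to max (steps of 1 year) and collect the hits per year
    years = list(range(min_year, max_year + 1))
    hits_per_year = []
    hits_per_page = []
    for year in years:
        mentions, pages = c.execute(
            "SELECT SUM(number_of_mentions), SUM(no_pages) FROM saved_data "
            "WHERE estimated_publication_date = ?", (year,)).fetchone()
        mentions = mentions or 0
        hits_per_year.append(mentions)
        hits_per_page.append(mentions / pages if pages else 0)
    data.close()
    #make a graph with the keyword as title, save and done
    for values, label in ((hits_per_year, "hits"), (hits_per_page, "hits per page")):
        plt.figure()
        plt.plot(years, values)
        plt.title(key)
        plt.xlabel("estimated publication year")
        plt.ylabel(label)
        plt.savefig("%s_%s.png" % (key, label.replace(" ", "_")))
        plt.close()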
def publication_date_determiner(string):
    """Scans the document title for the last valid date and assumes it is the publication date"""
    try:
        estimated_publication_date = int(re.findall(r'19\d\d|18\d\d', string)[-1])
        #Dates after 1960 are treated as invalid
        if estimated_publication_date > 1960:
            estimated_publication_date = None
    except IndexError:
        #No 18xx/19xx year found in the title
        estimated_publication_date = None
    return estimated_publication_date
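#For illustration (hypothetical title, not from the corpus):
#publication_date_determiner("Annual trade report, 1897 edition") returns 1897,
#the last 18xx/19xx year in the string; a title without such a year, or with a
#year above the 1960 cutoff, returns None.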
def analyser(text, RANGE, keywords, accuracy):
    """Reads a text and returns relevant parts"""
    #THERE IS A GOOD CHANCE THIS NO LONGER WORKS ONCE THERE ARE NO NULL VALUES LEFT, RECHECK IT THEN!
#################################
#Load text to variable
document_text = text
#I need these variables
REMOVED = 0
results = ""
final_results = ""
hits = 0
index_lijst = []
first_catch_of_today = []
split_text = document_text.split()
    #search the list for matches
    for index, word in enumerate(split_text):
        if keywords in word:
            hits += 1
            #record the index of every match (list.index would only return the first occurrence)
            index_lijst.append(index)
    #Create list of index ranges
    for index in index_lijst:
        first_catch_of_today.append(split_text[max(0, index - RANGE): index + RANGE])
    #remove duplicates
    for this, that in itertools.combinations(first_catch_of_today, 2):
        this_match = " ".join(this)
        that_match = " ".join(that)
        rat = fuzz.ratio(this_match, that_match)
        if rat > accuracy:
            #THIS TRY/EXCEPT IS WHAT MAKES THE PROGRAM WORK,
            #BUT THE BUG SLOWS THE PROGRAM DOWN ENORMOUSLY.
            try:
                #first_catch_of_today.remove(that)
                first_catch_of_today.remove(this)
                print("Too similar, removing...")
                REMOVED += 1
            except ValueError:
                print("program tries to remove a non-existing element, skipping...")
    #Turn these index ranges into sentences again.
    for x in first_catch_of_today:
        #Upper-cases the matches because that reads more pleasantly
        results = list(x)
        for index, word in enumerate(results):
            if keywords in word:
                results[index] = word.upper()
        #Create some whitespace between portions
        results.append("\n\n")
        #join them together in a return variable
        results = " ".join(results)
        final_results += results
        final_results += "\n\n"
    #hits = len(index_lijst)
    return final_results, hits
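#A minimal usage sketch (hypothetical text and keyword, not from the corpus):
#   excerpt, hits = analyser("the migrant ship arrived and more migrants followed", 3, "migr", 90)
#hits counts every word that contains "migr"; excerpt holds the words around
#each match with the match itself upper-cased, near-duplicates filtered out.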
def iterator(keywords, search_range, accuracy):
    """Iterates over the rows within the database and performs operations on them"""
    hitdict = {}
    dictlist = []
    data = sqlite3.connect("scraper_save_file.db")
    c = data.cursor()
    total_hits = 0
    row_count = 1
    rows = c.execute("SELECT result_title, title, excerpt, article_type, no_pages, document_link, fetched_on, document_text, pdf_url, enhanced_on, abstracts, article_rank, number_of_mentions, NORMALIZED_number_of_mentions, date_of_analysis, used_search_terms, estimated_publication_date FROM saved_data")
for row in rows:
print("Summarizing row: %d" % row_count)
row_count += 1
try:
result_title = str(row[0])
title = str(row[1])
excerpt = str(row[2])
article_type = str(row[3])
no_pages = row[4]
document_link = row[5]
fetched_on = row[6]
document_text = row[7]
pdf_url = row[8]
enhanced_on = row[9]
abstracts = row[10]
article_rank = row[11]
number_of_mentions = row[12]
NORMALIZED_number_of_mentions = row[13]
date_of_analysis = row[14]
used_search_terms = search_term_finder(keywords)
estimated_publication_date = publication_date_determiner(result_title)
            #If it is empty it must be set to zero, otherwise the program crashes
            if number_of_mentions is None:
                number_of_mentions = 0
#print(result_title)
#
for key in keywords:
abstract_text, number_of_results = analyser(document_text, search_range, key, accuracy)
number_of_mentions += number_of_results
            if abstracts is None:
abstracts = ""
abstracts += abstract_text
abstracts += "\n\n"
#Calculate NORMALIZED_number_of_mentions
NORMALIZED_number_of_mentions = number_of_mentions / no_pages
            #Recalculate the rank here; doing it here in the code carries some performance penalty, but this is much clearer
            #Calculate the year here (separate function) #ADD COLUMN
            #Make the graphs here? Or in the function?
            #And on a database of a few hundred documents (with a few hours of analysis time anyway) this makes no difference whatsoever.
analysed_save_to_db(result_title, title, excerpt, article_type, no_pages, document_link, fetched_on, document_text, pdf_url, enhanced_on, abstracts, article_rank, number_of_mentions, NORMALIZED_number_of_mentions, date_of_analysis, used_search_terms, estimated_publication_date)
        except Exception as error:
            print("Row analysis failed for some reason (%s), skipping..." % error)
#The keywords that you wish to search for. Consider using partial words to include spelling / scanning mistakes (e.g. 'migr' instead of 'migrants')
keywords = []
#How many words you wish to include on each side of a match
search_range = 80
#To avoid duplicate paragraphs the program filters out paragraphs that are deemed too similar to those already included.
#Input the percentage (e.g. '90' for 90%) of overlap that paragraphs are allowed to have before they are removed by the program.
#A score of 90 is recommended.
accuracy = 90
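#For a real run the keywords list above must be filled in, for example
#(hypothetical value, following the 'migr' suggestion in the comment above):
#   keywords = ["migr"]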
iterator(keywords, search_range, accuracy)