-
Notifications
You must be signed in to change notification settings - Fork 0
/
disambiguation_gr.py
78 lines (52 loc) · 2.02 KB
/
disambiguation_gr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import pickle
import urllib
# The submodule must be imported explicitly: a bare `import urllib` does not
# guarantee that `urllib.request` is loaded.
import urllib.request
from datetime import datetime

from bs4 import BeautifulSoup
# Crawl settings. For other languages change START_URL, WIKI_HOST and the
# "next page" link text (NEXT_PAGE_TEXT) together.
WIKI_HOST = 'https://el.wikipedia.org'
START_URL = 'https://el.wikipedia.org/wiki/%CE%9A%CE%B1%CF%84%CE%B7%CE%B3%CE%BF%CF%81%CE%AF%CE%B1:%CE%91%CF%80%CE%BF%CF%83%CE%B1%CF%86%CE%AE%CE%BD%CE%B9%CF%83%CE%B7'
NEXT_PAGE_TEXT = 'επόμενη σελίδα'


def absolute_url(href):
    """Return the absolute URL for a site-relative *href* taken from a page link."""
    return WIKI_HOST + href


def output_filename(name):
    """Return the output file name for the user-supplied base *name*."""
    return name + '.txt'


def fetch_soup(url):
    """Download *url* and return the page parsed with the lxml parser.

    The HTTP response is closed as soon as the document has been parsed.
    """
    with urllib.request.urlopen(url) as response:
        return BeautifulSoup(response, "lxml")


def extract_titles(soup):
    """Return the text of every <li> inside the page's "mw-category-group" divs."""
    return [
        li.text
        for div in soup.find_all("div", attrs={'class': "mw-category-group"})
        for li in div.find_all("li")
    ]


def next_page_url(soup):
    """Return the absolute URL of the "next page" link, or None if there is none."""
    # `string=` is the current name of the argument formerly called `text=`.
    links = soup.find_all('a', string=NEXT_PAGE_TEXT)
    if not links:
        return None
    return absolute_url(links[0]['href'])


def crawl(start_url):
    """Follow the "next page" links from *start_url* and collect the articles.

    Returns a tuple ``(articles, pages, last_url)``: the collected list-item
    texts, the number of pages visited, and the URL of the last page fetched.
    """
    articles = []
    pages = 0
    # Tracked from the very first page so it is defined even when the
    # category has a single page (no "next page" link at all).
    last_url = start_url
    url = start_url
    while url is not None:
        soup = fetch_soup(url)
        last_url = url
        pages += 1
        print("Page", pages)
        # get the disambiguation articles from the current page
        articles.extend(extract_titles(soup))
        # stop when there are no further pages of articles
        url = next_page_url(soup)
    return articles, pages, last_url


def save_articles(articles, filename):
    """Pickle *articles* into *filename*, closing the file afterwards."""
    with open(filename, 'wb') as pickle_file:
        pickle.dump(articles, pickle_file)


def main():
    """Crawl the category, print a summary and save the list to a user-named file."""
    start_time = datetime.now()
    articles, pages, last_url = crawl(START_URL)

    # print info and save to file
    print()
    print("Process terminated after", datetime.now() - start_time)
    print("Finished gathering", len(articles), "articles, from", pages, "pages")
    print("Last url visited was:", last_url)
    print()
    print("Please enter a filename for the list")
    filename = output_filename(input("> "))
    save_articles(articles, filename)
    print("Articles saved as:", filename)
    print()
    print("If you want to load the articles as a list you could use the pickle library or the joblib library")
    print("i.e:")
    print(" > with open('"+filename+"', 'rb') as pickle_file:")
    print(" > articles = pickle.load(pickle_file)")
    print(" or")
    print(" > articles = joblib.load('" + filename + "')")


if __name__ == "__main__":
    main()