script.py
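"""Scrape company websites from a list of Wikipedia pages.

Reads Wikipedia article URLs from wikipedia_links.csv, pulls the official website
out of each article's infobox, and writes the page/website pairs to answer.csv.
"""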
import csv
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup


def getCompanyURL(url):
    """Return the company website linked from a Wikipedia page's infobox, or None."""
    try:
        html = urlopen(url)
    except HTTPError:
        return None
    bsObj = BeautifulSoup(html.read(), "html.parser")
    # The official website is usually wrapped in <span class="url">; some older
    # infoboxes use <td class="url"> instead, so fall back to that.
    cells = bsObj.find_all("span", {"class": "url"}) or bsObj.find_all("td", {"class": "url"})
    if not cells:
        return None
    companyURL = None
    # As in the original logic, keep the href from the last matching cell.
    for anchor in cells[-1].find_all("a"):
        if "href" in anchor.attrs:
            companyURL = anchor.attrs["href"]
    return companyURL


# Read the list of Wikipedia pages to look up.
with open("wikipedia_links.csv", "r") as input_file:
    rdr = csv.reader(input_file)
    wiki_link = []
    for row in rdr:
        wiki_link += row

# Map each Wikipedia page to the company website found on it.
company_link = {}
for link in wiki_link:
    company_link[link] = getCompanyURL(link)

# Write the results as a two-column CSV.
with open("answer.csv", "w", newline="") as f:
    wr = csv.writer(f)
    wr.writerow(("wikipedia_page", "website"))
    wr.writerows([(page, site) for page, site in company_link.items()])
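

# A minimal usage sketch, not part of the original script: the quickest way to
# sanity-check getCompanyURL is against a single article. The URL below is only an
# illustrative assumption; uncomment to try it.
#
# print(getCompanyURL("https://en.wikipedia.org/wiki/IBM"))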