#!/usr/bin/env python3
"""
Getting Started with Python:
Web Scraping
Fetch and print web page contents
We're looking at the U.S. Nuclear Regulatory Commission's
list of power reactor units
"""
import csv
import requests
from bs4 import BeautifulSoup
url = 'http://www.nrc.gov/reactors/operating/list-power-reactor-units.html'
r = requests.get(url)
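# optional sanity check: fail loudly on HTTP errors instead of parsing an
# error page (raise_for_status() raises requests.HTTPError for 4xx/5xx)
r.raise_for_status()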
html_soup = BeautifulSoup(r.text, 'html.parser')
print(html_soup.prettify())
# find all links on the page and print the first 20
links = html_soup.find_all('a')
for a in links[0:20]:
    print(a)

# print just the href attribute for the first 10 links
for a in links[0:10]:
    print(a.get('href'))
# locate and print a particular link
t_link = html_soup.find(title="NRC Twitter Feed")
print(t_link)
if t_link is not None:  # find() returns None when nothing matches
    print(t_link.get('href'))
# find and print the first table on the page
table = html_soup.find('table')
print(table.prettify())
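# note: find('table') returns only the first <table> on the page; if the
# page had several, you could narrow the match by attribute, e.g. with a
# hypothetical class name: html_soup.find('table', class_='reactors')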
# find and print all the table's rows
table = html_soup.find('table')
for row in table.find_all('tr'):
    print(row)
# print the table's column values, skipping the header row with [1:]
table = html_soup.find('table')
for row in table.find_all('tr')[1:]:
    col = row.find_all('td')
    print(col[0].find('a').get('href'))   # link
    print(col[0].find('a').contents[0])   # name
    print(col[1].string)
    print(col[2].string)
    # the owner field is sometimes blank, so .string can be None
    print(col[3].string or '')
    print(col[4].string)
# output the columns as a CSV
# (in Python 3 the csv module wants a text-mode file opened with newline='')
with open('reactors.csv', 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',')
    headers = ('LINK', 'PLANT NAME', 'REACTOR TYPE',
               'LOCATION', 'OWNER', 'NRC REGION')
    csvwriter.writerow(headers)
    table = html_soup.find('table')
    for row in table.find_all('tr')[1:]:
        col = row.find_all('td')
        link = col[0].find('a').get('href')           # link
        reactor_name = col[0].find('a').contents[0]   # name
        reactor_type = col[1].string
        location = col[2].string
        owner = col[3].string or ''   # sometimes this field is blank
        region = col[4].string
        parsed_row = (link, reactor_name, reactor_type,
                      location, owner, region)
        csvwriter.writerow(parsed_row)
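
# a quick sanity check (a minimal sketch, not part of the original script):
# read the CSV back with csv.DictReader, which keys each row by the header
# names written above, and show the first few records
with open('reactors.csv', newline='') as f:
    for i, rec in enumerate(csv.DictReader(f)):
        if i >= 3:
            break
        print(rec['PLANT NAME'], '|', rec['NRC REGION'])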