-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape_mars.py
166 lines (106 loc) · 4.53 KB
/
scrape_mars.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
# Import Dependecies
from bs4 import BeautifulSoup
from splinter import Browser
import pandas as pd
import requests
# Initialize browser
def init_browser():
    """Launch and return a visible (non-headless) Chrome splinter Browser.

    Expects the `chromedriver` executable to be resolvable from the
    working directory / PATH.
    """
    driver_config = {'executable_path': 'chromedriver'}
    return Browser('chrome', headless=False, **driver_config)
# Create Mission to Mars global dictionary that can be imported into Mongo
# NOTE: every scrape_* function below mutates this shared module-level dict
# in place and returns it, accumulating results across calls.
mars_info = {}
# NASA MARS NEWS
def scrape_mars_news():
    """Scrape the latest NASA Mars news headline and teaser paragraph.

    Stores the results in the shared ``mars_info`` dict under the keys
    ``'news_title'`` and ``'news_paragraph'`` and returns that dict.
    """
    browser = init_browser()
    try:
        url = 'https://mars.nasa.gov/news/'
        browser.visit(url)
        soup = BeautifulSoup(browser.html, 'html.parser')
        # The first content_title / article_teaser_body divs belong to the
        # newest article on the page.
        news_title = soup.find('div', class_='content_title').find('a').text.strip()
        news_p = soup.find('div', class_='article_teaser_body').text.strip()
        # Dictionary entry from MARS NEWS
        mars_info['news_title'] = news_title
        mars_info['news_paragraph'] = news_p
    finally:
        # Fix: the original never closed the browser, leaking a Chrome
        # session per call. Always release it, even on scrape errors.
        browser.quit()
    return mars_info
# FEATURED IMAGE
def scrape_mars_image():
    """Scrape the JPL featured Mars image and store its absolute URL.

    Stores the URL in ``mars_info['featured_image_url']`` and returns
    the shared ``mars_info`` dict.
    """
    browser = init_browser()
    try:
        image_url_featured = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
        browser.visit(image_url_featured)
        soup = BeautifulSoup(browser.html, 'html.parser')
        # The image route is embedded in the article's inline style as
        # background-image: url('...'); strip the wrapper and the quotes
        # ([1:-1]) to leave the bare path.
        relative_url = soup.find('article')['style'].replace(
            'background-image: url(', '').replace(');', '')[1:-1]
        # Concatenate website url with scraped route to get a full link.
        main_url = 'https://www.jpl.nasa.gov'
        featured_image_url = main_url + relative_url
        # Dictionary entry from FEATURED IMAGE
        # (Fix: removed the original's dead bare-expression line that
        # evaluated featured_image_url to no effect.)
        mars_info['featured_image_url'] = featured_image_url
    finally:
        # Fix: original leaked the browser session; always release it.
        browser.quit()
    return mars_info
# Mars Weather
def scrape_mars_weather():
    """Scrape the most recent Mars weather report tweet.

    BUG FIX: the original condition ``if 'Sol' and 'pressure' in tweet``
    only tested for 'pressure' (the literal 'Sol' is always truthy);
    both substrings are now required. Also fixes a NameError when no
    tweet matched (``weather_tweet`` was referenced unbound) and closes
    the leaked browser session.

    Stores the matching tweet text in ``mars_info['weather_tweet']``
    (only when one is found) and returns the shared ``mars_info`` dict.
    """
    browser = init_browser()
    try:
        weather_url = 'https://twitter.com/marswxreport?lang=en'
        browser.visit(weather_url)
        soup = BeautifulSoup(browser.html, 'html.parser')
        latest_tweets = soup.find_all('div', class_='js-tweet-text-container')
        weather_tweet = None
        for tweet in latest_tweets:
            text = tweet.find('p').text
            # Require both markers so retweets/unrelated posts are skipped.
            if 'Sol' in text and 'pressure' in text:
                print(text)
                weather_tweet = text
                break
        # Dictionary entry from WEATHER TWEET — only when a report matched,
        # so we never store None or an arbitrary non-weather tweet.
        if weather_tweet is not None:
            mars_info['weather_tweet'] = weather_tweet
    finally:
        browser.quit()
    return mars_info
# Mars Facts
def scrape_mars_facts():
    """Scrape the Mars facts table and store it as an HTML string.

    Reads every table on the facts page, keeps the first one, labels its
    two columns, indexes it by Description, and stores the rendered HTML
    in ``mars_info['mars_facts']``. Returns the shared ``mars_info`` dict.
    """
    facts_url = 'http://space-facts.com/mars/'
    # read_html returns a list of DataFrames, one per <table> on the page;
    # the facts table is the first.
    facts_table = pd.read_html(facts_url)[0]
    facts_table.columns = ['Description', 'Value']
    facts_table.set_index('Description', inplace=True)
    # Dictionary entry from MARS FACTS
    mars_info['mars_facts'] = facts_table.to_html()
    return mars_info
# MARS HEMISPHERES
def scrape_mars_hemispheres():
    """Scrape title and full-resolution image URL for each Mars hemisphere.

    Visits the USGS Astrogeology search results, follows each result item
    to its detail page, and collects ``{'title', 'img_url'}`` dicts into
    ``mars_info['hiu']``. Returns the shared ``mars_info`` dict.
    """
    browser = init_browser()
    try:
        hemispheres_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
        browser.visit(hemispheres_url)
        soup = BeautifulSoup(browser.html, 'html.parser')
        items = soup.find_all('div', class_='item')
        # Base url used to absolutize the relative routes scraped below.
        hemispheres_main_url = 'https://astrogeology.usgs.gov'
        # Collected list of {"title", "img_url"} dicts.
        hiu = []
        for item in items:
            # Hemisphere title shown on the results card.
            title = item.find('h3').text
            # Relative link to the hemisphere's detail page.
            partial_img_url = item.find('a', class_='itemLink product-item')['href']
            # Visit the detail page and parse it for the full-size image.
            browser.visit(hemispheres_main_url + partial_img_url)
            detail_soup = BeautifulSoup(browser.html, 'html.parser')
            img_url = hemispheres_main_url + detail_soup.find('img', class_='wide-image')['src']
            hiu.append({"title": title, "img_url": img_url})
        mars_info['hiu'] = hiu
    finally:
        # Fix: the original leaked the browser session even though this
        # function performs many page visits; always release it.
        browser.quit()
    # Return mars_data dictionary
    return mars_info