# -*- coding: utf-8 -*-
# <nbformat>3.0</nbformat>

# <codecell>

from urllib.request import urlopen  # Python 3 location of urlopen (urllib.urlopen no longer exists)
from datetime import datetime

import pytz
from pandas import read_json, DataFrame
# <codecell>
def Milli_to_DateTime(millisec):  # convert millisecond Unix timestamps into readable UTC datetime strings
    utc = pytz.utc  # target time zone (UTC)
    fmt = '%A, %B %d, %Y %H:%M:%S %Z'  # output format
    all_converted = [None] * len(millisec)  # preallocate the output list
    for x in range(len(millisec)):  # convert every millisecond timestamp in the input
        utc_dt = utc.localize(datetime.utcfromtimestamp(millisec[x] / 1000))  # utcfromtimestamp expects seconds
        all_converted[x] = utc_dt.strftime(fmt)
    return all_converted
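# <codecell>
# A quick sanity check (not part of the original notebook): 0 milliseconds is the Unix
# epoch, so the converter should return "Thursday, January 01, 1970 00:00:00 UTC".
Milli_to_DateTime([0])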
# <codecell>
# The DataCleaning function scrapes earthquake data from the USGS GeoJSON feeds
# (http://earthquake.usgs.gov/earthquakes/feed/v1.0/geojson.php) and returns it as a cleaned
# DataFrame in your local workspace. The input argument selects which feed to scrape;
# valid inputs are "past hour", "past day", "past 7days", "past 30days".
def DataCleaning(whichdata):
    # Map each supported keyword to its USGS GeoJSON summary feed.
    urls = {
        "past hour":   'http://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/all_hour.geojson',
        "past day":    'http://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/all_day.geojson',
        "past 7days":  'http://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/all_week.geojson',
        "past 30days": 'http://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/all_month.geojson',
    }
    if whichdata not in urls:  # verify that the input names a supported feed
        print('Invalid input. Valid inputs are "past hour", "past day", "past 7days", "past 30days".')
        return None

    # Download the selected feed and parse the GeoJSON into a pandas Series.
    data = read_json(urlopen(urls[whichdata]), typ='series')

    numbers_of_newdata = data['metadata']['count']  # number of events reported by the feed
    # Preallocate one list per output column.
    Src = [None] * numbers_of_newdata
    Eqid = [None] * numbers_of_newdata
    Time = [None] * numbers_of_newdata
    Lat = [None] * numbers_of_newdata
    Lon = [None] * numbers_of_newdata
    Depth = [None] * numbers_of_newdata
    Nst = [None] * numbers_of_newdata
    Region = [None] * numbers_of_newdata
    Magnitude = [None] * numbers_of_newdata
    for x in range(numbers_of_newdata):  # copy the fields we need out of each GeoJSON feature
        properties = data['features'][x]['properties']
        coordinates = data['features'][x]['geometry']['coordinates']
        Src[x] = properties['net']
        Eqid[x] = properties['code']
        Time[x] = properties['time']
        Lat[x] = coordinates[1]
        Lon[x] = coordinates[0]
        Depth[x] = coordinates[2]
        Nst[x] = properties['nst']
        Region[x] = properties['place']
        Magnitude[x] = properties['mag']
    converted_time = Milli_to_DateTime(Time)
    rawdata = {'Src': Src, 'Eqid': Eqid, 'Datetime': converted_time, 'Lat': Lat, 'Lon': Lon,
               'Magnitude': Magnitude, 'Depth': Depth, 'NST': Nst, 'Region': Region}
    # Assemble the columns into a DataFrame.
    converted_rawdata = DataFrame(rawdata, columns=['Src', 'Eqid', 'Datetime', 'Lat', 'Lon',
                                                    'Magnitude', 'Depth', 'NST', 'Region'])
    # Drop every row that contains at least one NA value.
    without_NA_data = converted_rawdata.dropna(axis=0, how='any')
    if without_NA_data.shape[0] == 0:  # warn if the cleaned DataFrame ends up empty
        print("No data available: every row contained at least one NA, so the DataFrame is empty.")
    return without_NA_data
# <codecell>
DataCleaning("Dayum") #testing for invalid input
# <codecell>
past_hour=DataCleaning("past hour") #this will scrap past hour data and save it to your local worksapce
past_hour
# <codecell>
past_day=DataCleaning("past day") #this will scrap past day data and save it to your local worksapce
past_day
# <codecell>
past_7days=DataCleaning("past 7days") #this will scrap past 7days data and save it to your local worksapce
past_7days[0:30] #printing first 30 data
# <codecell>
past_30days=DataCleaning("past 30days") #this will scrap past 30days data and save it to your local worksapce
past_30days[0:30] #printing first 30
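# <codecell>
# Optional persistence sketch (an assumption, not part of the original workflow): the
# cleaned DataFrames above only live in the session's namespace, so one way to keep a
# copy on disk is pandas' to_csv; the filename below is just an illustrative example.
past_30days.to_csv("past_30days_earthquakes.csv", index=False)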