-
Notifications
You must be signed in to change notification settings - Fork 1
/
crawl.py
150 lines (127 loc) · 4.83 KB
/
crawl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import json
import os
from datetime import datetime, timedelta
from io import StringIO
from pprint import pprint

import pandas as pd
import pytz
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from influxdb import InfluxDBClient

load_dotenv()
class TCEQ:
    """Scraper for the TCEQ daily-summary monitoring page.

    Downloads the daily summary page for one monitoring site, extracts the
    last HTML table on the page, and reshapes it into point dictionaries
    suitable for ``InfluxDBClient.write_points``.
    """

    # Setting local TZ because all TCEQ monitors are in Texas.
    local_tz = pytz.timezone('America/Chicago')
    url = "https://www.tceq.texas.gov/cgi-bin/compliance/monops/daily_summary.pl"
    site = 56  # Denton Airport South
    # Seconds to wait for the TCEQ server before a request is abandoned;
    # without a timeout a stalled connection would hang the crawl forever.
    timeout = 30

    def remove_attrs(self, soup):
        """Strip all attributes from every tag in *soup*, in place.

        Returns the same ``soup`` object so calls can be chained.
        """
        for tag in soup.find_all(True):
            # Use an empty dict rather than None so the tag stays usable for
            # later attribute lookups and serialisation.
            tag.attrs = {}
        return soup

    def daterange(self, start_date, end_date):
        """Yield ``start_date`` plus 0, 1, 2, ... days, excluding ``end_date``.

        The number of values equals the whole days in
        ``end_date - start_date``; nothing is yielded if that is <= 0.
        """
        for offset in range((end_date - start_date).days):
            yield start_date + timedelta(days=offset)

    def get_options(self, format=None):
        """Return the site choices offered by the page's ``select_site`` menu.

        Only option values starting with ``"site"`` are kept.

        Args:
            format: ``None`` (default) returns the raw option strings.
                ``'json'`` returns one dict per option with the keys
                ``desc``, ``aqs`` and ``cas`` taken from fields 1, 2 and 3
                of the pipe-separated option value (missing fields stay "").
                The name shadows the builtin but is kept for callers that
                pass it by keyword.

        Raises:
            requests.RequestException: on network failure or HTTP error status.
            ValueError: if the page contains no ``select_site`` menu.
        """
        response = requests.get(self.url, timeout=self.timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        select = soup.find('select', {"name": "select_site"})
        if select is None:
            raise ValueError(
                "No 'select_site' menu found at " + self.url
            )

        opts = []
        for option in select.find_all('option'):
            # Options without a value attribute are simply not site entries.
            value = option.get('value', '')
            if value.startswith('site'):
                opts.append(value)

        if format == 'json':
            parsed = []
            for raw in opts:
                entry = {"desc": "", "aqs": "", "cas": ""}
                # zip() stops early when the value has fewer than four
                # fields, leaving the remaining keys as empty strings.
                entry.update(zip(("desc", "aqs", "cas"), raw.split('|')[1:4]))
                parsed.append(entry)
            return parsed

        # Else, return strings.
        return opts

    def get_html(self, timestamp=None):
        """Fetch the daily summary page for the day containing *timestamp*.

        Args:
            timestamp: POSIX timestamp; defaults to the current time. The
                calendar day is determined in ``local_tz``.

        Returns:
            The response body as text.

        Raises:
            requests.RequestException: on network failure or HTTP error status.
        """
        if timestamp is None:
            date = datetime.now(tz=self.local_tz)
        else:
            date = datetime.fromtimestamp(timestamp, tz=self.local_tz)

        params = {
            'select_date': "user",
            'user_month': date.month - 1,  # TCEQ has a weird offset.
            'user_day': date.day,
            'user_year': date.year,
            'select_site': "|||" + str(self.site),
            'time_format': "24hr"
        }
        response = requests.get(self.url, params=params, timeout=self.timeout)
        response.raise_for_status()
        return response.text

    def get_table(self, timestamp=None):
        """Return the last HTML table of the daily page as a DataFrame.

        The first table row is used as the column header.
        """
        soup = BeautifulSoup(self.get_html(timestamp=timestamp), 'html.parser')
        # Clean table attrs.
        soup = self.remove_attrs(soup)
        # Newer pandas releases deprecate passing literal markup to
        # read_html; a file-like wrapper works on old and new versions.
        tables = pd.read_html(StringIO(soup.prettify()), header=0)
        return tables[-1]

    def get_json(self, timestamp=None):
        """Return the daily table as a JSON string of row records.

        The "Parameter Measured" column is renamed to "measurement"; the
        last three rows are dropped and only the first 25 columns are kept
        (presumably the parameter name plus 24 hourly readings - confirm
        against the live page).
        """
        table = self.get_table(timestamp=timestamp)
        table = table.rename(columns={"Parameter Measured": "measurement"})
        return table.iloc[:-3, 0:25].to_json(orient="records")

    # @todo add support for site id
    def jsonbody_for_influx(self, data, timestamp):
        """Convert ``get_json`` output into a list of InfluxDB point dicts.

        Args:
            data: JSON string holding a list of row objects, each with a
                "measurement" entry plus other keys.
            timestamp: POSIX timestamp identifying the day; the calendar
                date is taken in ``local_tz`` so it matches the day that
                ``get_html`` requests for the same timestamp.

        Returns:
            One ``{"measurement", "time", "fields": {"value"}}`` dict for
            every entry whose key parses as an integer and whose value
            parses as a float. Rows without a usable measurement name are
            skipped.
        """
        date = datetime.fromtimestamp(timestamp, tz=self.local_tz)
        points = []
        for row in json.loads(data):
            name = row.get("measurement")
            if not isinstance(name, str) or not name.strip():
                # A point without a measurement name cannot be written and
                # would make the whole batch fail, so drop the row.
                continue
            name = name.strip()

            for key, raw in row.items():
                # Only numeric keys are treated as hour columns; empty
                # cells arrive as JSON null and are rejected by isfloat.
                if not (self.isInteger(key) and self.isfloat(raw)):
                    continue
                hour = int(key[0:2])  # Get hour
                # NOTE(review): the hour is labelled "Z" (UTC) unchanged,
                # although the table is presumably in Texas time - confirm
                # before relying on absolute times.
                stamp = datetime(
                    date.year, date.month, date.day, hour, 0
                ).isoformat() + "Z"
                points.append({
                    "measurement": name,
                    "time": stamp,
                    "fields": {"value": float(raw)},
                })
        return points

    def isfloat(self, value):
        """Return True if ``float(value)`` succeeds, else False.

        ``None`` and other non-numeric types yield False instead of raising.
        """
        try:
            float(value)
        except (TypeError, ValueError):
            return False
        return True

    def isInteger(self, value):
        """Return True if ``int(value)`` succeeds, else False.

        ``None`` and other non-numeric types yield False instead of raising.
        """
        try:
            int(value)
        except (TypeError, ValueError):
            return False
        return True

# Backfill script: walk every calendar day since 2009-01-01 and store the
# TCEQ daily readings in InfluxDB. Connection settings come from the
# environment (loaded from .env above).
tceq = TCEQ()
client = InfluxDBClient(
    host=os.getenv("INFLUXDB_HOST"),
    port=8086,
    username=os.getenv("INFLUXDB_USER"),
    password=os.getenv("INFLUXDB_PASSWORD"),
)
client.switch_database(os.getenv("INFLUXDB_DATABASE"))

# Days are tracked as naive datetimes in Texas wall-clock time, so the range
# ends at the last complete Texas day regardless of the host's timezone.
start_date = datetime(2009, 1, 1)
end_date = datetime.now(tz=tceq.local_tz).replace(tzinfo=None)

# Loop through dates, store data.
count = 0
for single_date in tceq.daterange(start_date, end_date):
    # Use noon Texas time for the epoch value: calling .timestamp() on the
    # naive midnight would be interpreted in the host's timezone and could
    # land on the previous Texas day (e.g. on a UTC server). Noon is never
    # affected by daylight-saving transitions.
    ts = tceq.local_tz.localize(single_date.replace(hour=12)).timestamp()
    data = tceq.get_json(timestamp=ts)
    influx = tceq.jsonbody_for_influx(data=data, timestamp=ts)
    pprint(influx)
    # Days without any usable readings produce no points; skip the write.
    if influx:
        client.write_points(influx)
    pprint("Processing: " + single_date.isoformat())
    count += 1