Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add in LCRA waterquality data #111

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,23 @@
from setuptools.command.test import test as TestCommand



class PyTest(TestCommand):
    """``setup.py test`` command that runs the test suite with py.test.

    Extra arguments for py.test can be supplied on the command line with
    ``--pytest-args`` (short form ``-a``).
    """

    user_options = [('pytest-args=', 'a', "Arguments to pass to py.test")]

    def initialize_options(self):
        """Set option defaults; no extra py.test arguments by default."""
        TestCommand.initialize_options(self)
        self.pytest_args = []

    def finalize_options(self):
        """Finalize the base options, then reset ``test_args``/``test_suite``."""
        TestCommand.finalize_options(self)
        self.test_args = []
        self.test_suite = True

    def run_tests(self):
        """Run py.test once and exit the process with its return code."""
        # import here, cause outside the eggs aren't loaded
        import shlex

        import pytest

        # A value given via --pytest-args arrives as one string and has to be
        # split into an argument list; the default is already a list.
        args = self.pytest_args
        if isinstance(args, str):
            args = shlex.split(args)

        # Invoke py.test exactly once, with the user-supplied arguments.
        errno = pytest.main(args)
        sys.exit(errno)


Expand Down
199 changes: 199 additions & 0 deletions test/files/lcra/waterquality/12147_params.html

Large diffs are not rendered by default.

135 changes: 135 additions & 0 deletions test/files/lcra/waterquality/12147_results.html

Large diffs are not rendered by default.

87 changes: 87 additions & 0 deletions test/files/lcra/waterquality/stations.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<!-- saved from url=(0042)http://waterquality.lcra.org/sitelist.aspx -->
<html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<title>waterquality.lcra.org</title>
<link href="./waterquality.lcra.org_files/www09_consolidated2.css" rel="stylesheet" type="text/css">
<link rel="shortcut icon" href="http://waterquality.lcra.org/Images/faviconLCRA.ico" type="image/x-icon">
<script src="./waterquality.lcra.org_files/www09_top_menu.js" type="text/javascript"></script>
<style type="text/css"></style></head>


<body style="height: 100%; background-color:#033968; width:100%">

<div align="left">
<script type="text/javascript" language="javascript">showTopNavBar();</script><div id="MenuBar" class="NoPrint"><div class="NavButtons" id="logo"><a target="_new" href="http://www.lcra.org/"><img src="./waterquality.lcra.org_files/lcralogo.png" border="0"></a></div><div id="NavDivider"><img src="./waterquality.lcra.org_files/button_divider.png" border="0"></div><div class="NavButtons" id="NavAbout" onmouseover="navOn(&quot;NavAbout&quot;)" onmouseout="navOff(&quot;NavAbout&quot;)" style="background-image: url(http://waterquality.lcra.org/images/button_gradient.png);"><a href="http://www.lcra.org/about" target="_new"><img src="./waterquality.lcra.org_files/about.png" border="0"></a></div><div id="NavDivider"><img src="./waterquality.lcra.org_files/button_divider.png" border="0"></div><div class="NavButtons" id="NavEnergy" onmouseover="navOn(&quot;NavEnergy&quot;)" onmouseout="navOff(&quot;NavEnergy&quot;)" style="background-image: url(http://waterquality.lcra.org/images/button_gradient.png);"><a href="http://www.lcra.org/energy" target="_new"><img src="./waterquality.lcra.org_files/energy.png" border="0"></a></div><div id="NavDivider"><img src="./waterquality.lcra.org_files/button_divider.png" border="0"></div><div class="NavButtons" id="NavWater" onmouseover="navOn(&quot;NavWater&quot;)" onmouseout="navOff(&quot;NavWater&quot;)" style="background-image: url(http://waterquality.lcra.org/images/button_gradient.png);"><a href="http://www.lcra.org/water" target="_new"><img src="./waterquality.lcra.org_files/water.png" border="0"></a></div><div id="NavDivider"><img src="./waterquality.lcra.org_files/button_divider.png" border="0"></div><div class="NavButtons" id="NavParks" onmouseover="navOn(&quot;NavParks&quot;)" onmouseout="navOff(&quot;NavParks&quot;)"><a href="http://www.lcra.org/parks" target="_new"><img src="./waterquality.lcra.org_files/parks.png" border="0"></a></div><div id="NavDivider"><img src="./waterquality.lcra.org_files/button_divider.png" 
border="0"></div><div class="NavButtons" id="NavJobs" onmouseover="navOn(&quot;NavJobs&quot;)" onmouseout="navOff(&quot;NavJobs&quot;)" style="background-image: url(http://waterquality.lcra.org/images/button_gradient.png);"><a href="http://www.lcra.org/about/employment" target="_new"><img src="./waterquality.lcra.org_files/jobs.png" border="0"></a></div><div id="NavDivider"><img src="./waterquality.lcra.org_files/button_divider.png" border="0"></div><div class="NavButtons" id="NavContactUs" onmouseover="navOn(&quot;NavContactUs&quot;)" onmouseout="navOff(&quot;NavContactUs&quot;)"><a href="http://www.lcra.org/about/overview" target="_new"><img src="./waterquality.lcra.org_files/contact_us.png" border="0"></a></div><div id="NavDivider"><img src="./waterquality.lcra.org_files/button_divider.png" border="0"></div><div id="SearchBoxWrapper"><form name="gs" action="http://www.lcra.org/search" method="GET" target="_new"><table height="39px" border="0" align="right"><tbody><tr><td valign="middle"><img src="./waterquality.lcra.org_files/search.png" border="0"></td><td><input type="hidden" name="site" value="dotorg"><input type="hidden" name="client" value="dotorg_new"><input type="hidden" name="proxystylesheet" value="dotorg_new"><input type="hidden" name="output" value="xml_no_dtd"><input type="text" name="q" size="15" class="SearchInputBox"></td><td><input src="./waterquality.lcra.org_files/search_submit.png" name="search" type="image" alt="Click to submit your search">&nbsp;</td></tr></tbody></table></form></div></div>
</div>

<form name="form1" method="post" action="./waterquality.lcra.org_files/waterquality.lcra.org.html" id="form1" style="margin: 0px; padding: 0px;">
<div>
<input type="hidden" name="__VIEWSTATE" id="__VIEWSTATE" value="/wEPDwUJNjg0NzA5NjQ2ZBgBBQlHcmlkVmlldzEPPCsADAEIAgFkdrUlLeK5FmsdI/Lbf1BTyvSFsC4S2WBBt6DwnxoC3UQ=">
</div>

<div>

<input type="hidden" name="__VIEWSTATEGENERATOR" id="__VIEWSTATEGENERATOR" value="F8CC5443">
</div>

<div style="padding:6px;">
<table style="width: 100%; background-color:Black;" cellpadding="4">
<tbody><tr>
<td width="50%">
<span class="HomepageColumnHeadings">Select stream segment for multiple sites or choose an individual site.</span><br><br>
</td>
</tr>
</tbody></table>

<div>
<table cellspacing="0" cellpadding="3" border="0" id="GridView1" style="font-size:Smaller;width:100%;border-collapse:collapse;">
<tbody><tr class="header" style="color:White;font-size:Small;font-weight:bold;">
<th class="header" scope="col">Site ID</th><th class="header" align="left" scope="col">Description</th>
</tr><tr style="color:White;background-color:Gainsboro;">
<td colspan="2" style="background-color:#033968;font-size:8pt;font-weight:bold;width:100%;"><a class="WhiteLinks" href="http://waterquality.lcra.org/parameter_segments.aspx?qrySegment=1301" target="_blank">Segment 1301 San Bernard River Tidal</a></td>
</tr><tr style="color:White;background-color:Gainsboro;">
<td align="center"><a href="http://waterquality.lcra.org/parameter.aspx?qrySite=20460" style="display:inline-block;color:Black;width:150px;">20460</a></td><td><a href="http://waterquality.lcra.org/parameter.aspx?qrySite=20460" style="color:Black;">SAN BERNARD RIVER TIDAL AT SH 35 SOUTHWEST OF WEST COLUMBIA</a></td>
</tr><tr style="color:White;background-color:#EBEBEB;">
<td align="center"><a href="http://waterquality.lcra.org/parameter.aspx?qrySite=12146" style="display:inline-block;color:Black;width:150px;">12146</a></td><td><a href="http://waterquality.lcra.org/parameter.aspx?qrySite=12146" style="color:Black;">SAN BERNARD RIVER TIDAL EAST BANK IMMEDIATELY UPSTREAM OF FM 2611</a></td>
</tr><tr style="color:White;background-color:Gainsboro;">
<td colspan="2" style="background-color:#033968;font-size:8pt;font-weight:bold;width:100%;"><a class="WhiteLinks" href="http://waterquality.lcra.org/parameter_segments.aspx?qrySegment=1302" target="_blank">Segment 1302 San Bernard River Above Tidal</a></td>
</tr><tr style="color:White;background-color:Gainsboro;">
<td align="center"><a href="http://waterquality.lcra.org/parameter.aspx?qrySite=20723" style="display:inline-block;color:Black;width:150px;">20723</a></td><td><a href="http://waterquality.lcra.org/parameter.aspx?qrySite=20723" style="color:Black;">MOUND CREEK AT BRAZORIA CR 450/JACKSON SETTLEMENT ROAD 1.22 KILOMETERS UPSTREAM OF FM 1301 IN WEST OF WEST COLUMBIA</a></td>
</tr><tr style="color:White;background-color:#EBEBEB;">
<td align="center"><a href="http://waterquality.lcra.org/parameter.aspx?qrySite=20722" style="display:inline-block;color:Black;width:150px;">20722</a></td><td><a href="http://waterquality.lcra.org/parameter.aspx?qrySite=20722" style="color:Black;">PEACH CREEK AT WHARTON CR 117/CHUDALLA ROAD/ARCHER ROAD 89 METERS SOUTH OF THE INTERSECTION OF WHARTON CR 117/CHUDALLA ROAD/ARCHER ROAD AND WHARTON CR 121/ WHARTON CR 119/DONALDSON ROAD IN EAST OF WHARTON</a></td>
</tr><tr style="color:White;background-color:#EBEBEB;">
<td align="center"><a href="http://waterquality.lcra.org/parameter.aspx?qrySite=12517" style="display:inline-block;color:Black;width:150px;">12517</a></td><td><a href="http://waterquality.lcra.org/parameter.aspx?qrySite=12517" style="color:Black;">TRES PALACIOS CREEK AT FM 456</a></td>
</tr><tr style="color:White;background-color:Gainsboro;">
<td colspan="2" style="background-color:#033968;font-size:8pt;font-weight:bold;width:100%;"><a class="WhiteLinks" href="http://waterquality.lcra.org/parameter_segments.aspx?qrySegment=2002" target="_blank">Segment 2002 Mission River Above Tidal</a></td>
</tr><tr style="color:White;background-color:Gainsboro;">
<td align="center"><a href="http://waterquality.lcra.org/parameter.aspx?qrySite=20062" style="display:inline-block;color:Black;width:150px;">20062</a></td><td><a href="http://waterquality.lcra.org/parameter.aspx?qrySite=20062" style="color:Black;">SARCO CREEK AT FM 2441 1.30 KM UPSTREAM OF THE ELKINS BRANCH CONFLUENCE</a></td>
</tr>
</tbody></table>
</div>

</div>

<div id="DoesNothingButCenter" align="center">
<div id="OuterWrapper">
<div id="Footer">
<a class="WhiteLinks" href="http://www.lcra.org/about/doing_business/index.html" target="_new">Purchasing</a> | <a class="WhiteLinks" href="http://www.lcra.org/about/overview/openrecords.html" target="_new">Open Records</a> | <a class="WhiteLinks" href="http://www.lcra.org/asklcra/" target="_new">Ask LCRA</a> | <a class="WhiteLinks" href="http://www.lcra.org/sitemap.html" target="_new">Sitemap</a>
<br>
© 1996-<script type="text/javascript"> var d = new Date(); document.write(d.getFullYear());</script>2015 Lower Colorado River Authority. All rights reserved.
</div> <!-- end "Footer" div -->
</div> <!-- end "OuterWrapper" div -->
</div> <!-- end "DoesNothingButCenter" div -->





</form>
<script src="./waterquality.lcra.org_files/urchin.js" type="text/javascript">
</script>
<script type="text/javascript">
_uacct = "UA-1180003-2";
urchinTracker();
</script>


</body></html>
45 changes: 45 additions & 0 deletions test/lcra_waterquality_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@


from ulmo.lcra.waterquality import get_stations, get_station_data
import test_util
import os

def test_get_stations():
    """Parse the recorded site-list page and check the station mapping."""
    mocked = {
        ('http://waterquality.lcra.org/sitelist.aspx', ('GET',)):
            'lcra/waterquality/stations.html',
    }

    with test_util.mocked_urls(mocked):
        station_map = get_stations()

    # the recorded page lists six individual sites; the segment heading rows
    # are not stations
    assert len(station_map) == 6
    description = station_map['20460']
    assert "SH 35 SOUTHWEST" in description

def test_get_station_data():
    """Fetch station 12147 from recorded pages and check the parsed rows.

    ``ULMO_TESTING`` is set for the duration of the test and is always
    restored afterwards, even when an assertion fails, so it cannot leak
    into other tests.
    """
    service_info_url = 'http://waterquality.lcra.org/parameter.aspx?qrySite=12147'
    service_info_file = 'lcra/waterquality/12147_params.html'

    service_data_url = 'http://waterquality.lcra.org/events.aspx'
    service_data_file = 'lcra/waterquality/12147_results.html'

    url_files = {
        (service_info_url, ('GET',)): service_info_file,
        (service_data_url, ('POST',)): service_data_file,
    }

    # remember any pre-existing value so it can be put back afterwards
    previous = os.environ.get("ULMO_TESTING")
    os.environ["ULMO_TESTING"] = "1"
    try:
        with test_util.mocked_urls(url_files):
            results = get_station_data(12147)

        assert len(results) == 12
        for data in results:
            assert data['Site'] == u'12147'
    finally:
        if previous is None:
            os.environ.pop("ULMO_TESTING", None)
        else:
            os.environ["ULMO_TESTING"] = previous



1 change: 1 addition & 0 deletions ulmo/lcra/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from . import waterquality
1 change: 1 addition & 0 deletions ulmo/lcra/waterquality/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .core import get_stations, get_station_data
215 changes: 215 additions & 0 deletions ulmo/lcra/waterquality/core.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
"""
ulmo.lcra.waterquality.core
~~~~~~~~~~~~~~~~~~~~~
This module provides access to data provided by the `Lower Colorado
River Authority`_ `Water Quality`_ web site.
.. _Lower Colorado River Authority: http://www.lcra.org
.. _Water Quality: http://waterquality.lcra.org/
"""
from bs4 import BeautifulSoup
import logging

from ulmo import util



import pickle
import dateutil
import os

# import datetime
import os.path as op

# Base directory (inside whatever ``util.get_ulmo_dir()`` returns) under which
# get_station_data() keeps one sub-directory per station code.
LCRA_WATERQUALITY_DIR = op.join(util.get_ulmo_dir(), 'lcra/waterquality')


# Module-level logger, named after this module.
log = logging.getLogger(__name__)

from bs4 import BeautifulSoup
import requests


import pandas as pd



# try:
# import cStringIO as StringIO
# except ImportError:
# import StringIO


def get_stations():
    """Fetches a list of station codes and descriptions.

    Returns
    -------
    stations_dict : dict
        a python dict with station codes mapped to station descriptions;
        both are plain text strings.

    Raises
    ------
    requests.HTTPError
        if the site list page could not be retrieved.
    ValueError
        if the page does not contain the expected station table.
    """
    stations_url = 'http://waterquality.lcra.org/sitelist.aspx'

    response = requests.get(stations_url)
    # fail with a clear HTTP error rather than trying to parse an error page
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    gridview = soup.find(id="GridView1")
    if gridview is None:
        raise ValueError(
            "Could not find the station table (GridView1) in %s" % stations_url)

    stations = {}
    for row in gridview.findAll('tr'):
        cells = row.findAll('td')
        # only rows with exactly two <td> cells (site id, description) are
        # stations; anything else (headings etc.) is skipped
        if len(cells) != 2:
            continue
        # get_text() yields plain strings, so the returned dict does not keep
        # the whole parse tree alive the way NavigableString objects would
        code, description = (cell.get_text() for cell in cells)
        stations[code] = description

    return stations


def get_station_data(station_code, date=None, as_dataframe=False):
    """Fetches data for a station, optionally limited to after a given date.

    Parameters
    ----------
    station_code : str or int
        The station code to fetch data for. A list of stations can be
        retrieved with ``get_stations()``
    date : ``None`` or date (see :ref:`dates-and-times`)
        If given, only records whose ``Date`` is later than this date are
        returned. May be a date string, a ``datetime.date`` or a
        ``datetime.datetime``. If ``None`` (default), or if a date string
        cannot be parsed (a warning is logged), all data is returned.
    as_dataframe : bool
        This determines what format values are returned as. If ``False``
        (default), a list of dicts is returned, one per row of the results
        table, keyed by column heading. If ``True`` a pandas.DataFrame
        containing the equivalent information is returned.

    Returns
    -------
    data : list of dict, or pandas.DataFrame
        The station's records. Note that when ``date`` is applied the
        ``Date`` values are parsed datetimes rather than the original text.

    Raises
    ------
    TypeError
        if ``station_code`` is neither a string nor an int.
    requests.HTTPError
        if either page could not be retrieved.
    ValueError
        if the response does not contain the expected results table.
    """
    # Imported here so the block does not depend on additional module-level
    # imports; ``import dateutil`` alone does not guarantee that the
    # ``parser`` submodule is loaded.
    import datetime
    from dateutil import parser as date_parser

    if isinstance(station_code, int):
        station_code = str(station_code)
    elif not isinstance(station_code, (str, type(u''))):
        # type(u'') also admits unicode strings on Python 2
        message = "Unsure of the station_code parameter type. Try string or int"
        log.error(message)
        raise TypeError(message)

    waterquality_url = "http://waterquality.lcra.org/parameter.aspx?qrySite=%s" % station_code
    waterquality_url2 = 'http://waterquality.lcra.org/events.aspx'

    dir_path = op.join(LCRA_WATERQUALITY_DIR, station_code)
    resp_path = op.join(dir_path, "resp.html")
    pickle_path = op.join(dir_path, "data.pickle")
    util.mkdir_if_doesnt_exist(dir_path)

    # when ULMO_TESTING is set the on-disk cache is neither read nor written
    testing = bool(os.environ.get('ULMO_TESTING', None))

    initial_request = requests.get(waterquality_url)
    initial_request.raise_for_status()
    initialsoup = BeautifulSoup(initial_request.content, 'html.parser')

    # values of every tag with id="multiple" on the parameter page; they are
    # posted back to request the results
    stationvals = [
        statag.get('value')
        for statag in initialsoup.findAll(id="multiple")
        if statag.get('value')
    ]

    result = _make_next_request(waterquality_url2,
                                initial_request,
                                {'multiple': stationvals,
                                 'site': station_code})
    result.raise_for_status()

    results = None
    if op.exists(resp_path) and \
            util.misc._request_file_size_matches(result, resp_path) \
            and not testing:
        # means nothing has changed: reuse the cached pickle
        log.info("%s was not processed because it is the same size" % station_code)
        try:
            with open(pickle_path, 'rb') as f:
                # NOTE(security): pickle is only acceptable here because the
                # file is a local cache written by this function; never point
                # this at data from an untrusted source.
                results = pickle.load(f)
        except IOError:
            log.info("Couldn't find the pickle that should be there for %s"
                     % station_code)

    if results is None:
        # parse before touching the cache so a page that cannot be parsed
        # is never stored as the reference response
        results = _parse_results_table(result.content)
        if not testing:
            with open(resp_path, 'wb') as wf:
                wf.write(result.content)
            with open(pickle_path, 'wb') as mf:
                pickle.dump(results, mf)

    # Cached and freshly parsed results go through the same date filtering
    # and output formatting below.
    datelim = None
    if date:
        if isinstance(date, datetime.datetime):
            datelim = date
        elif isinstance(date, datetime.date):
            # plain dates are compared as midnight of that day
            datelim = datetime.datetime.combine(date, datetime.time())
        else:
            try:
                datelim = date_parser.parse(date)
            except (ValueError, OverflowError):
                log.warning("Could not parse the provided date %s" % date)

    if datelim is not None and results:
        df = _create_dataframe(results)
        cut_df = df[df['Date'] > datelim]
        if as_dataframe:
            return cut_df
        return cut_df.to_dict('records')

    if as_dataframe:
        return _create_dataframe(results)
    return results


def _parse_results_table(content):
    """Parse the ``GridView1`` table of a results page into a list of dicts."""
    soup = BeautifulSoup(content, 'html.parser')
    gridview = soup.find(id="GridView1")
    if gridview is None:
        raise ValueError(
            "Could not find the results table (GridView1) in the response")

    headers = [head.text for head in gridview.findAll('th')]

    rows = []
    for row in gridview.findAll('tr'):
        # the site uses \xa0 for blank cells; _parse_val maps those to None
        vals = [_parse_val(aux.text) for aux in row.findAll('td')]
        if not vals:
            # rows without any <td> cell carry no data
            continue
        rows.append(dict(zip(headers, vals)))
    return rows

def _create_dataframe(results):
    """Build a DataFrame from parsed result rows, parsing ``Date`` values.

    Parameters
    ----------
    results : list of dict
        one dict per record, keyed by column heading.

    Returns
    -------
    df : pandas.DataFrame
        one row per record with the default integer index. If a ``Date``
        column is present its values are parsed into datetimes; an empty
        ``results`` gives an empty DataFrame.
    """
    # imported explicitly: ``import dateutil`` alone does not guarantee that
    # the ``parser`` submodule is loaded
    from dateutil import parser as date_parser

    df = pd.DataFrame.from_records(results)

    # an empty result set produces a frame without columns
    if 'Date' in df.columns:
        df['Date'] = df['Date'].apply(date_parser.parse)

    # NOTE: ``Date`` stays a regular column. The previous
    # ``df.set_index(['Date'])`` call discarded its result and so never
    # changed the frame; get_station_data() filters on ``df['Date']``.
    return df

def _extract_headers_for_next_request(request):
    """Collect the ``<input>`` fields of a page so they can be posted back.

    Despite the name, the result is used as form data, not HTTP headers.

    Parameters
    ----------
    request : object with a ``content`` attribute
        the previous response, whose ``content`` holds the HTML to scan.

    Returns
    -------
    payload : dict
        maps the name of each ``<input>`` tag to its ``value`` attribute
        (``None`` when the tag has no value).
    """
    payload = {}
    for tag in BeautifulSoup(request.content, 'html.parser').findAll('input'):
        name = tag.get('name')
        if not name:
            # a control without a name cannot be part of a form submission;
            # skipping it also avoids a KeyError
            continue

        value = tag.get('value')
        if value == 'tabular':
            # NOTE(review): presumably an output-format option that must not
            # be posted back -- confirm against the site's form
            continue

        # some tags don't have a value and are used w/ JS to toggle a set of
        # checkboxes
        payload[name] = value
    return payload


def _make_next_request(url, previous_request, data):
    """POST to ``url`` using the form fields and cookies of a prior response.

    The fields scraped from ``previous_request`` form the base payload;
    entries in ``data`` are added on top and win on conflicting names.
    Returns the response of the POST.
    """
    form_fields = _extract_headers_for_next_request(previous_request)
    form_fields.update(data)
    session_cookies = previous_request.cookies
    return requests.post(url, data=form_fields, cookies=session_cookies)


def _parse_val(val):
    """Return ``None`` for a blank cell, otherwise ``val`` unchanged.

    A blank cell is one whose text is exactly the non-breaking space
    character that ``&nbsp;`` translates to.
    """
    blank = u'\xa0'
    return None if val == blank else val