-
Notifications
You must be signed in to change notification settings - Fork 1
/
multiple_parking_permits.py
59 lines (44 loc) · 2.35 KB
/
multiple_parking_permits.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import pandas as pd
import os
from sodapy import Socrata
# Fetch from data.somervillema.gov if more recent data are available
# https://data.somervillema.gov/resource/97j2-xc5y.json
# See https://github.com/xmunoz/sodapy/blob/master/examples/basic_queries.ipynb
# Setup ----
socrata_domain = 'data.somervillema.gov'
socrata_dataset_identifier = '97j2-xc5y'
socrata_token = os.environ.get("SODAPY_APPTOKEN") # No token required here
inputname = 'City_of_Somerville_Parking_Permits.csv'
outputname = 'results.csv'
# Fetch metadata ----
# Manually specify which column to inspect for length, here looking at 'issued'
client = Socrata(socrata_domain, socrata_token)
metadata = client.get_metadata(socrata_dataset_identifier)
meta_issued = [x for x in metadata['columns'] if x['name'] == 'issued'][0]
nrow_available = meta_issued['cachedContents']['not_null']
# Does the required input file exist at all? If not, fetch it and write out csv
if os.path.isfile(inputname) == 'False':
print('Getting data from ' + socrata_domain)
results = client.get(socrata_dataset_identifier)
df = pd.DataFrame.from_dict(results)
df.to_csv(inputname, header=True)
# If it does exist, check to make sure it has the same number of rows as currently available.
# If not, fetch fresh data from scratch
if os.path.isfile(inputname):
df = pd.read_csv(inputname)
nrow_current = df.shape[0]
if int(nrow_current) != int(nrow_available):
print('Fresher data was found, updating from ' + socrata_domain)
results = client.get(socrata_dataset_identifier)
df = pd.DataFrame.from_dict(results)
df.to_csv(inputname, header=True)
# Clean up fields -- seems there is trailing white space in half the Residential fields
df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
# Make a list of residential units which have multiple parking permits
address_count = df[(df.type_name == 'Residential')].groupby(['st_addr', 'unit_num']).size().sort_values(ascending=False).head(100)
address_count.to_csv(outputname, header=True)
# Same, but now split by year
df.issued = pd.to_datetime(df.issued) # Super slow right now...
df['year_issued'] = df.issued.dt.year
address_count = df[(df.type_name == 'Residential')].groupby(['year_issued', 'st_addr', 'unit_num']).size().sort_values(ascending=False).head(100)
address_count.to_csv('results_by_year.csv', header=True)