Jan open data #626

Merged · 14 commits · Jan 28, 2023
8 changes: 6 additions & 2 deletions _shared_utils/shared_utils/gtfs_utils_v2.py
@@ -233,8 +233,8 @@ def fill_in_metrolink_trips_df_with_shape_id(
 
 
 def get_transit_organizations_gtfs_dataset_keys(
-    keep_cols: list[str], custom_filtering: dict = None
-):
+    keep_cols: list[str], custom_filtering: dict = None, get_df: bool = False
+) -> Union[pd.DataFrame, siuba.sql.verbs.LazyTbl]:
     """
     From Airtable GTFS datasets, get the datasets (and gtfs_dataset_key)
     for usable feeds.
@@ -249,6 +249,9 @@ def get_transit_organizations_gtfs_dataset_keys(
         >> rename(gtfs_dataset_key="key")
     )
 
+    if get_df:
+        dim_gtfs_datasets = dim_gtfs_datasets >> collect()
+
     return dim_gtfs_datasets
 
 
@@ -293,6 +296,7 @@ def schedule_daily_feed_to_organization(
     dim_gtfs_datasets = get_transit_organizations_gtfs_dataset_keys(
         keep_cols=["key", "name", "type", "regional_feed_type"],
         custom_filtering={"type": ["schedule"]},
+        get_df=False,
     )
 
     # Merge on gtfs_dataset_key to get organization name
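A usage sketch of the new get_df flag (illustrative, not part of the diff; argument values borrowed from this PR):

from shared_utils import gtfs_utils_v2

# Default (get_df=False): returns a siuba LazyTbl, so callers can keep
# filtering before anything is pulled from the warehouse.
lazy_tbl = gtfs_utils_v2.get_transit_organizations_gtfs_dataset_keys(
    keep_cols=["key", "name", "type", "regional_feed_type"],
    custom_filtering={"type": ["schedule"]},
    get_df=False,
)

# get_df=True runs >> collect() and returns a pd.DataFrame, which is what
# the new add_agency_identifiers() in portfolio_utils relies on.
df = gtfs_utils_v2.get_transit_organizations_gtfs_dataset_keys(
    keep_cols=None,
    get_df=True,
)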
73 changes: 67 additions & 6 deletions _shared_utils/shared_utils/portfolio_utils.py
@@ -10,11 +10,15 @@
 need to import different pandas to add type hint for styler object
 
 """
+import base64
+
+import dask.dataframe as dd
+import dask_geopandas as dg
+import geopandas as gpd  # used by the GeoDataFrame branch in add_route_name below
 import pandas as pd
 import pandas.io.formats.style  # for type hint: https://github.com/pandas-dev/pandas/issues/24884
 from calitp.tables import tbls
 from IPython.display import HTML
-from shared_utils import rt_utils
+from shared_utils import gtfs_utils_v2, rt_utils
 from siuba import *
@@ -36,6 +40,54 @@ def clean_organization_name(df: pd.DataFrame) -> pd.DataFrame:
     return df
 
 
+def decode_base64_url(row):
+    """
+    Provide decoded version of URL as ASCII.
+    WeHo gets an incorrect padding, but urlsafe_b64decode works.
+    Just in case, return uri truncated.
+    """
+    try:
+        decoded = base64.urlsafe_b64decode(row.base64_url).decode("ascii")
+    except base64.binascii.Error:
+        decoded = row.uri.split("?")[0]
+
+    return decoded
+
+
+def add_agency_identifiers(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Find the current base64_url for the organization name and
+    decode it as ASCII (Chad Baker request for CKAN data).
+    The encoded version might not be as usable for users.
+    """
+    dim_gtfs_datasets = gtfs_utils_v2.get_transit_organizations_gtfs_dataset_keys(
+        keep_cols=None, get_df=True
+    )
+
+    current_feeds = (
+        dim_gtfs_datasets[
+            (dim_gtfs_datasets.data_quality_pipeline == True)
+            & (dim_gtfs_datasets._is_current == True)
+        ]
+        .drop_duplicates(subset="name")
+        .reset_index(drop=True)
+    )
+
+    current_feeds2 = current_feeds.assign(
+        feed_url=current_feeds.apply(lambda x: decode_base64_url(x), axis=1)
+    )
+
+    df2 = pd.merge(
+        df,
+        current_feeds2[["name", "base64_url", "feed_url"]],
+        on="name",
+        how="inner",
+        validate="m:1",
+    )
+
+    return df2
+
+
 # https://github.com/cal-itp/data-analyses/blob/main/bus_service_increase/E5_make_stripplot_data.py
 def add_caltrans_district() -> pd.DataFrame:
     """
@@ -87,16 +139,25 @@ def add_route_name(df: pd.DataFrame) -> pd.DataFrame:
     """
     route_cols = ["route_id", "route_short_name", "route_long_name", "route_desc"]
 
-    if route_cols not in list(df.columns):
+    if not set(route_cols).issubset(df.columns):
         raise ValueError(f"Input a df that contains {route_cols}")
 
-    df = df.assign(route_name_used=df.apply(lambda x: rt_utils.which_desc(x), axis=1))
+    # Check GeoDataFrame first, since it is a subclass of pd.DataFrame
+    if isinstance(df, gpd.GeoDataFrame):
+        ddf = dg.from_geopandas(df, npartitions=2)
+    elif isinstance(df, pd.DataFrame):
+        ddf = dd.from_pandas(df, npartitions=2)
 
-    # If route names show up with leading comma
-    df = df.assign(
-        route_name_used=route_names.route_name_used.str.lstrip(",").str.strip()
+    ddf = ddf.assign(
+        route_name_used=ddf.apply(
+            lambda x: rt_utils.which_desc(x), axis=1, meta=("route_name_used", "str")
+        )
     )
 
+    df = ddf.compute()
+
+    # If route names show up with leading comma
+    df = df.assign(route_name_used=df.route_name_used.str.lstrip(",").str.strip())
+
     return df
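A quick check of the decoding helper (illustrative sketch with toy values, not from the PR). urlsafe_b64decode round-trips a well-formed encoded URL; malformed padding raises binascii.Error, and the fallback returns the uri column with its query string stripped:

import base64
import pandas as pd

row = pd.Series(
    {
        # hypothetical feed URL, encoded the way the warehouse stores it
        "base64_url": base64.urlsafe_b64encode(b"https://example.com/gtfs.zip").decode("ascii"),
        "uri": "https://example.com/gtfs.zip?api_key=secret",
    }
)
decode_base64_url(row)  # -> "https://example.com/gtfs.zip"

bad_row = pd.Series({"base64_url": "bad-padding", "uri": "https://example.com/gtfs.zip?api_key=secret"})
decode_base64_url(bad_row)  # falls back -> "https://example.com/gtfs.zip"

One note on the add_route_name rewrite above: meta=("route_name_used", "str") only declares the output column's name and dtype so dask can build its task graph; which_desc still runs row by row at compute() time.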
45 changes: 39 additions & 6 deletions _shared_utils/shared_utils/shared_data.py
@@ -4,11 +4,17 @@
 import geopandas as gpd
 import pandas as pd
 
+GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/"
+
 
 # Function to set the county centroids and zoom levels
 # used in folium and ipyleaflet maps
 def make_county_centroids():
-    URL = "https://opendata.arcgis.com/datasets/8713ced9b78a4abb97dc130a691a8695_0.geojson"
+    """
+    Find a county's centroids from county polygons.
+    """
+    URL = (
+        "https://opendata.arcgis.com/datasets/"
+        "8713ced9b78a4abb97dc130a691a8695_0.geojson"
+    )
 
     gdf = gpd.read_file(URL).to_crs(geography_utils.CA_StatePlane)
     gdf.columns = gdf.columns.str.lower()
@@ -46,17 +52,44 @@ def make_county_centroids():
     print("County centroids dataset created")
 
     # Save as parquet, because lat/lon held in list, not point geometry anymore
-    gdf2.to_parquet(
-        "gs://calitp-analytics-data/data-analyses/ca_county_centroids.parquet"
-    )
+    gdf2.to_parquet(f"{GCS_FILE_PATH}ca_county_centroids.parquet")
 
     print("County centroids exported to GCS")
 
 
+def make_clean_state_highway_network():
+    """
+    Create State Highway Network dataset.
+    """
+    HIGHWAY_URL = (
+        "https://opendata.arcgis.com/datasets/"
+        "77f2d7ba94e040a78bfbe36feb6279da_0.geojson"
+    )
+    gdf = gpd.read_file(HIGHWAY_URL)
+
+    keep_cols = ["Route", "County", "District", "RouteType", "Direction", "geometry"]
+
+    gdf = gdf[keep_cols]
+    print(f"# rows before dissolve: {len(gdf)}")
+
+    # See if we can dissolve further - use all cols except geometry
+    # Should we dissolve further and use even longer lines?
+    dissolve_cols = [c for c in list(gdf.columns) if c != "geometry"]
+
+    gdf2 = gdf.dissolve(by=dissolve_cols).reset_index()
+    print(f"# rows after dissolve: {len(gdf2)}")
+
+    # Export to GCS
+    utils.geoparquet_gcs_export(gdf2, GCS_FILE_PATH, "state_highway_network")
+
+
 # Run functions to create these datasets...store in GCS
 if __name__ == "__main__":
     # Don't use from shared_utils import geography_utils
     # Those have other dependencies...like map_utils imports from geography_utils
     import geography_utils
+    import utils
 
     make_county_centroids()
+
+    make_clean_state_highway_network()
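A minimal sketch of the dissolve step (toy geometries, not Caltrans data): rows sharing every value in dissolve_cols merge into one (multi)line geometry, which is why the printed row count drops:

import geopandas as gpd
from shapely.geometry import LineString

gdf = gpd.GeoDataFrame(
    {
        "Route": [1, 1, 2],
        "County": ["LA", "LA", "LA"],
        "geometry": [
            LineString([(0, 0), (1, 1)]),
            LineString([(1, 1), (2, 2)]),  # same Route/County as the first row
            LineString([(5, 5), (6, 6)]),
        ],
    },
    crs="EPSG:4326",
)

gdf2 = gdf.dissolve(by=["Route", "County"]).reset_index()
assert len(gdf2) == 2  # three segments collapse into two routes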
23 changes: 14 additions & 9 deletions _shared_utils/shared_utils/shared_data_catalog.yml
@@ -6,29 +6,34 @@ sources:
     description: CA county polygons
     args:
       urlpath: https://opendata.arcgis.com/datasets/8713ced9b78a4abb97dc130a691a8695_0.geojson
+  caltrans_districts:
+    driver: geojson
+    description: Caltrans district polygons
+    args:
+      urlpath: https://caltrans-gis.dot.ca.gov/arcgis/rest/services/CHboundary/District_Tiger_Lines/FeatureServer/0/query?outFields=*&where=1%3D1&f=geojson
   ca_county_centroids:
-    driver: geoparquet
+    driver: parquet
     description: CA county centroids
     args:
       # source: shared_utils/shared_data.py
       urlpath: gs://calitp-analytics-data/data-analyses/ca_county_centroids.parquet
+  state_highway_network:
+    driver: geoparquet
+    description: Cleaned State Highway Network
+    args:
+      # source: shared_utils/shared_data.py
+      urlpath: gs://calitp-analytics-data/data-analyses/state_highway_network.parquet
   ca_transit_routes:
     driver: geoparquet
     description: CA transit routes with line geometry at the operator-level (open data)
     args:
-      # source: traffic_ops/make_routes_stops_shapefiles.py.py
+      # source: traffic_ops/create_routes_data.py
       urlpath: gs://calitp-analytics-data/data-analyses/traffic_ops/ca_transit_routes.parquet
   ca_transit_routes_feed:
     driver: geoparquet
     description: CA transit routes with line geometry at the feed-level (not on open data)
     args:
       # source: traffic_ops/make_routes_stops_shapefiles.py.py
       urlpath: gs://calitp-analytics-data/data-analyses/traffic_ops/ca_transit_routes_feed.parquet
   ca_transit_stops:
     driver: geoparquet
     description: CA transit stops with point geometry (open data)
     args:
-      # source: traffic_ops/make_routes_stops_shapefiles.py.py
+      # source: traffic_ops/create_stops_data.py
       urlpath: gs://calitp-analytics-data/data-analyses/traffic_ops/ca_transit_stops.parquet
   hqta_stops:
     driver: geoparquet
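For context (a sketch, not part of the diff): this file is an intake catalog, so once the driver plugins are installed (intake-geopandas for the geojson/geoparquet entries, intake-parquet now that the centroids driver is plain parquet), the new sources load by name. The local path is an assumption:

import intake

catalog = intake.open_catalog("_shared_utils/shared_utils/shared_data_catalog.yml")

districts = catalog.caltrans_districts.read()   # GeoDataFrame via geojson driver
centroids = catalog.ca_county_centroids.read()  # pandas DataFrame via parquet driver
shn = catalog.state_highway_network.read()      # GeoDataFrame via geoparquet driver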
4 changes: 0 additions & 4 deletions bus_service_increase/bus_service_utils/__init__.py
@@ -5,19 +5,15 @@
     create_parallel_corridors,
     gtfs_build,
     #publish_single_report,
-    report_utils,
     utils,
 )
 
 __version__ = "0.1.1"
 
 __all__ = [
     "better_bus_utils",
     "calenviroscreen_lehd_utils",
     "chart_utils",
     "create_parallel_corridors",
     "gtfs_build",
     #"publish_single_report",
-    "report_utils",
     "utils",
 ]