diff --git a/benchmarks/nyc_taxi_preciptation.py b/benchmarks/nyc_taxi_preciptation.py new file mode 100644 index 0000000000..f5e71f74dc --- /dev/null +++ b/benchmarks/nyc_taxi_preciptation.py @@ -0,0 +1,97 @@ +""" +NYC Taxi Monthly Trips with Precipitation + +Similar to: +https://github.com/toddwschneider/nyc-taxi-data/blob/c65ad8332a44f49770644b11576c0529b40bbc76/citibike_comparison/analysis/analysis_queries.sql#L1 +""" + +import time + +import pandas as pd + +import bodo + + +@bodo.jit(cache=True) +def get_monthly_travels_weather(): + start = time.time() + central_park_weather_observations = pd.read_csv( + "s3://bodo-example-data/nyc-taxi/central_park_weather.csv", parse_dates=["DATE"] + ) + central_park_weather_observations = central_park_weather_observations.rename( + columns={"DATE": "date", "PRCP": "precipitation"}, copy=False + ) + fhvhv_tripdata = pd.read_parquet("s3://bodo-example-data/nyc-taxi/fhvhv_tripdata/") + end = time.time() + print("Reading Time: ", (end - start)) + + start = time.time() + + central_park_weather_observations["date"] = central_park_weather_observations[ + "date" + ].dt.date + fhvhv_tripdata["date"] = fhvhv_tripdata["pickup_datetime"].dt.date + fhvhv_tripdata["month"] = fhvhv_tripdata["pickup_datetime"].dt.month + fhvhv_tripdata["hour"] = fhvhv_tripdata["pickup_datetime"].dt.hour + fhvhv_tripdata["weekday"] = fhvhv_tripdata["pickup_datetime"].dt.dayofweek.isin( + [1, 2, 3, 4, 5] + ) + + monthly_trips_weather = fhvhv_tripdata.merge( + central_park_weather_observations, on="date", how="inner" + ) + monthly_trips_weather["date_with_precipitation"] = ( + monthly_trips_weather["precipitation"] > 0.1 + ) + + def get_time_bucket(t): + bucket = "other" + if t in (8, 9, 10): + bucket = "morning" + elif t in (11, 12, 13, 14, 15): + bucket = "midday" + elif t in (16, 17, 18): + bucket = "afternoon" + elif t in (19, 20, 21): + bucket = "evening" + return bucket + + monthly_trips_weather["time_bucket"] = monthly_trips_weather.hour.map( + get_time_bucket + ) + monthly_trips_weather = monthly_trips_weather.groupby( + [ + "PULocationID", + "DOLocationID", + "month", + "weekday", + "date_with_precipitation", + "time_bucket", + ], + as_index=False, + ).agg({"hvfhs_license_num": "count", "trip_miles": "mean"}) + monthly_trips_weather = monthly_trips_weather.sort_values( + by=[ + "PULocationID", + "DOLocationID", + "month", + "weekday", + "date_with_precipitation", + "time_bucket", + ] + ) + monthly_trips_weather = monthly_trips_weather.rename( + columns={ + "hvfhs_license_num": "trips", + "trip_miles": "avg_distance", + }, + copy=False, + ) + end = time.time() + print("Monthly Taxi Travel Times Computation Time: ", end - start) + print(monthly_trips_weather.head()) + return monthly_trips_weather + + +if __name__ == "__main__": + get_monthly_travels_weather() diff --git a/examples/05-Business-Usecases-at-Scale/4-Transportation-and-Logistics.ipynb b/examples/05-Business-Usecases-at-Scale/4-Transportation-and-Logistics.ipynb index 2debf3710a..f6813e3bf9 100644 --- a/examples/05-Business-Usecases-at-Scale/4-Transportation-and-Logistics.ipynb +++ b/examples/05-Business-Usecases-at-Scale/4-Transportation-and-Logistics.ipynb @@ -193,7 +193,10 @@ " start = time.time()\n", " central_park_weather_observations = pd.read_csv(\n", " \"s3://bodo-example-data/nyc-taxi/central_park_weather.csv\", \n", - " parse_dates=[\"date\"]\n", + " parse_dates=[\"DATE\"]\n", + " )\n", + " central_park_weather_observations = central_park_weather_observations.rename(\n", + " columns={\"DATE\": \"date\", \"PRCP\": \"precipitation\"}, copy=False\n", " )\n", " central_park_weather_observations[\"date\"] = central_park_weather_observations[\n", " \"date\"\n", @@ -634,4 +637,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} diff --git a/examples/05-Business-Usecases-at-Scale/8-Terminal-examples/3-Transportation-and-Logistics/3-monthly_taxi_travel_times.py b/examples/05-Business-Usecases-at-Scale/8-Terminal-examples/3-Transportation-and-Logistics/3-monthly_taxi_travel_times.py index 87bfd0fdbd..5cabcca61a 100644 --- a/examples/05-Business-Usecases-at-Scale/8-Terminal-examples/3-Transportation-and-Logistics/3-monthly_taxi_travel_times.py +++ b/examples/05-Business-Usecases-at-Scale/8-Terminal-examples/3-Transportation-and-Logistics/3-monthly_taxi_travel_times.py @@ -23,12 +23,14 @@ def get_monthly_travels_weather(): start = time.time() central_park_weather_observations = pd.read_csv( - "s3://bodo-example-data/nyc-taxi/central_park_weather.csv", parse_dates=["date"] + "s3://bodo-example-data/nyc-taxi/central_park_weather.csv", parse_dates=["DATE"] + ) + central_park_weather_observations = central_park_weather_observations.rename( + columns={"DATE": "date", "PRCP": "precipitation"}, copy=False ) central_park_weather_observations["date"] = central_park_weather_observations[ "date" ].dt.date - green_taxi = pd.read_csv( "s3://bodo-example-data/nyc-taxi/green_tripdata_2019.csv", usecols=[0, 1, 5, 6, 8],