Skip to content

Commit

Permalink
[BSE-4271] Add NYC Taxi benchmark code (#6)
Browse files Browse the repository at this point in the history
  • Loading branch information
ehsantn authored Dec 5, 2024
1 parent eacbd8a commit ca8c70c
Show file tree
Hide file tree
Showing 3 changed files with 106 additions and 4 deletions.
97 changes: 97 additions & 0 deletions benchmarks/nyc_taxi_preciptation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
"""
NYC Taxi Monthly Trips with Precipitation
Similar to:
https://github.com/toddwschneider/nyc-taxi-data/blob/c65ad8332a44f49770644b11576c0529b40bbc76/citibike_comparison/analysis/analysis_queries.sql#L1
"""

import time

import pandas as pd

import bodo


@bodo.jit(cache=True)
def get_monthly_travels_weather():
    """Aggregate high-volume FHV trips by route, month, weekday and weather.

    Reads the Central Park weather CSV and the ``fhvhv_tripdata`` Parquet
    dataset from the ``bodo-example-data`` S3 bucket, joins each trip to the
    weather observation for its pickup date, and groups the result by pickup
    location, drop-off location, month, weekday flag, precipitation flag and
    time-of-day bucket.

    Elapsed wall-clock time for the read phase and for the compute phase is
    printed, followed by the first rows of the result.

    Returns:
        DataFrame with one row per group, sorted by the grouping keys, with
        the columns ``PULocationID``, ``DOLocationID``, ``month``,
        ``weekday``, ``date_with_precipitation``, ``time_bucket``, ``trips``
        (count of ``hvfhs_license_num``) and ``avg_distance`` (mean of
        ``trip_miles``).
    """
    # ---- Read phase -------------------------------------------------------
    start = time.time()
    central_park_weather_observations = pd.read_csv(
        "s3://bodo-example-data/nyc-taxi/central_park_weather.csv",
        parse_dates=["DATE"],
    )
    # Normalize the raw upper-case CSV headers to the names used below.
    central_park_weather_observations = central_park_weather_observations.rename(
        columns={"DATE": "date", "PRCP": "precipitation"}, copy=False
    )
    fhvhv_tripdata = pd.read_parquet(
        "s3://bodo-example-data/nyc-taxi/fhvhv_tripdata/"
    )
    end = time.time()
    print("Reading Time: ", (end - start))

    # ---- Compute phase ----------------------------------------------------
    start = time.time()

    # Reduce both sides to a calendar date so they can be joined on it.
    central_park_weather_observations["date"] = central_park_weather_observations[
        "date"
    ].dt.date
    fhvhv_tripdata["date"] = fhvhv_tripdata["pickup_datetime"].dt.date
    fhvhv_tripdata["month"] = fhvhv_tripdata["pickup_datetime"].dt.month
    fhvhv_tripdata["hour"] = fhvhv_tripdata["pickup_datetime"].dt.hour
    # pandas numbers days Monday=0 ... Sunday=6, so Monday-Friday is 0-4.
    # (The previous list [1, 2, 3, 4, 5] selected Tuesday-Saturday.)
    fhvhv_tripdata["weekday"] = fhvhv_tripdata["pickup_datetime"].dt.dayofweek.isin(
        [0, 1, 2, 3, 4]
    )

    # Inner join: trips whose pickup date has no weather row are dropped.
    monthly_trips_weather = fhvhv_tripdata.merge(
        central_park_weather_observations, on="date", how="inner"
    )
    monthly_trips_weather["date_with_precipitation"] = (
        monthly_trips_weather["precipitation"] > 0.1
    )

    # Map an hour-of-day value to a coarse label; hours outside the listed
    # ranges fall into "other".
    def get_time_bucket(t):
        bucket = "other"
        if t in (8, 9, 10):
            bucket = "morning"
        elif t in (11, 12, 13, 14, 15):
            bucket = "midday"
        elif t in (16, 17, 18):
            bucket = "afternoon"
        elif t in (19, 20, 21):
            bucket = "evening"
        return bucket

    monthly_trips_weather["time_bucket"] = monthly_trips_weather.hour.map(
        get_time_bucket
    )

    # Count trips and average the trip distance per group.
    monthly_trips_weather = monthly_trips_weather.groupby(
        [
            "PULocationID",
            "DOLocationID",
            "month",
            "weekday",
            "date_with_precipitation",
            "time_bucket",
        ],
        as_index=False,
    ).agg({"hvfhs_license_num": "count", "trip_miles": "mean"})
    monthly_trips_weather = monthly_trips_weather.sort_values(
        by=[
            "PULocationID",
            "DOLocationID",
            "month",
            "weekday",
            "date_with_precipitation",
            "time_bucket",
        ]
    )
    # Give the aggregated columns names that describe what they now hold.
    monthly_trips_weather = monthly_trips_weather.rename(
        columns={
            "hvfhs_license_num": "trips",
            "trip_miles": "avg_distance",
        },
        copy=False,
    )
    end = time.time()
    print("Monthly Taxi Travel Times Computation Time: ", end - start)
    print(monthly_trips_weather.head())
    return monthly_trips_weather


# Script entry point: run the benchmark once when this file is executed
# directly; importing the module does not trigger it.
if __name__ == "__main__":
    get_monthly_travels_weather()
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,10 @@
" start = time.time()\n",
" central_park_weather_observations = pd.read_csv(\n",
" \"s3://bodo-example-data/nyc-taxi/central_park_weather.csv\", \n",
" parse_dates=[\"date\"]\n",
" parse_dates=[\"DATE\"]\n",
" )\n",
" central_park_weather_observations = central_park_weather_observations.rename(\n",
" columns={\"DATE\": \"date\", \"PRCP\": \"precipitation\"}, copy=False\n",
" )\n",
" central_park_weather_observations[\"date\"] = central_park_weather_observations[\n",
" \"date\"\n",
Expand Down Expand Up @@ -634,4 +637,4 @@
},
"nbformat": 4,
"nbformat_minor": 5
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,14 @@
def get_monthly_travels_weather():
start = time.time()
central_park_weather_observations = pd.read_csv(
"s3://bodo-example-data/nyc-taxi/central_park_weather.csv", parse_dates=["date"]
"s3://bodo-example-data/nyc-taxi/central_park_weather.csv", parse_dates=["DATE"]
)
central_park_weather_observations = central_park_weather_observations.rename(
columns={"DATE": "date", "PRCP": "precipitation"}, copy=False
)
central_park_weather_observations["date"] = central_park_weather_observations[
"date"
].dt.date

green_taxi = pd.read_csv(
"s3://bodo-example-data/nyc-taxi/green_tripdata_2019.csv",
usecols=[0, 1, 5, 6, 8],
Expand Down

0 comments on commit ca8c70c

Please sign in to comment.