Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion config.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
'processed_test': 'data/processed/feature_engineered_test.csv',
'precipitation': 'data/external/precipitation.csv',
'gmaps_train': 'data/processed/gmapsdata/gmaps_train_data.csv',
'gmaps_test': 'data/processed/gmapsdata/gmaps_test_data.csv'
'gmaps_test': 'data/processed/gmapsdata/gmaps_test_data.csv',
'historical_weather': 'data/processed/historical_weather.csv'
}

# Output paths
Expand Down
3 changes: 3 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,6 @@ ipykernel>=6.0.0

# Utilities
requests>=2.25.0

# Weather API client
meteostat
80 changes: 80 additions & 0 deletions src/features/weather_api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
from datetime import datetime
from meteostat import Point, Hourly
import pandas as pd
import sys
import os

# --- Path setup: make the repository root importable ---
# Root-level modules such as 'config.py' live two directories above this
# file (features -> src -> root), so that directory is put on sys.path.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
PROJECT_ROOT = os.path.abspath(
    os.path.join(SCRIPT_DIR, os.pardir, os.pardir)
)
sys.path.append(PROJECT_ROOT)
# --- End of path setup ---

def get_weather_for_trip(latitude: float, longitude: float, timestamp: datetime) -> dict:
    """
    Fetch historical hourly weather for one location and time using Meteostat.

    Meteostat's hourly observations are stamped on whole hours, so the
    timestamp is truncated to the start of its hour before querying. A trip
    at 17:23 is therefore matched to the 17:00 observation instead of
    producing an empty result.

    Args:
        latitude: Latitude of the location.
        longitude: Longitude of the location.
        timestamp: A datetime object for the time of the trip (assumed UTC).

    Returns:
        A dictionary with the keys 'temp', 'humidity', 'wind_speed',
        'visibility' and 'weather_condition_code'. Readings that are missing
        (NaN or absent) are reported as 0.0. An empty dict is returned when
        no observation is found or when any error occurs.
    """
    # Output feature name -> Meteostat column name.
    column_for_feature = {
        'temp': 'temp',
        'humidity': 'rhum',  # 'rhum' is relative humidity
        'wind_speed': 'wspd',
        'visibility': 'visi',
        'weather_condition_code': 'coco',  # Weather condition code
    }

    try:
        # Query exactly one hourly slot: the hour that contains the trip.
        # Using the raw timestamp for both bounds only matches when the
        # trip happens to start exactly on the hour.
        hour_start = timestamp.replace(minute=0, second=0, microsecond=0)

        location = Point(latitude, longitude)
        data = Hourly(location, hour_start, hour_start).fetch()

        if data.empty:
            print(f"No weather data found for {timestamp} at ({latitude}, {longitude})")
            return {}

        # Extract the first (and only) row of data
        weather_row = data.iloc[0]

        weather_features = {}
        for feature, column in column_for_feature.items():
            value = weather_row.get(column)
            # Missing readings (NaN, or a column Meteostat did not return)
            # are replaced with 0.0 so callers always get numeric values.
            weather_features[feature] = 0.0 if pd.isna(value) else value

        return weather_features

    except Exception as e:
        # Best-effort lookup: any failure (network, bad input, library
        # error) is reported and turned into an empty result.
        print(f"Error fetching weather data for {timestamp}: {e}")
        return {}

# --- Manual smoke test ---
# This block only runs when you execute the file directly
# (e.g., python src/features/weather_api.py)
if __name__ == "__main__":
    # Sample lookup for a date from the dataset (Jan 1, 2016, 5:00 PM).
    test_time = datetime(2016, 1, 1, 17, 0, 0)
    test_lat, test_lon = 40.767937, -73.982155

    print(f"Testing Meteostat for {test_time} at ({test_lat}, {test_lon})...")
    weather = get_weather_for_trip(test_lat, test_lon, test_time)

    # An empty dict is the function's failure signal.
    if not weather:
        print("Test failed.")
    else:
        print("Success! Received data:")
        print(weather)
88 changes: 88 additions & 0 deletions src/get_historical_weather.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import sys
import os
from datetime import datetime
from meteostat import Point, Hourly
import pandas as pd

# --- Path setup: make the repository root importable ---
# This script sits one directory below the root, so the parent of its own
# folder is put on sys.path.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
PROJECT_ROOT = os.path.abspath(os.path.join(SCRIPT_DIR, os.pardir))
sys.path.append(PROJECT_ROOT)
# --- End of path setup ---

import config # This will now work

# --- Configuration ---
# One reference point is used for the whole city (Central Park).
NYC_LATITUDE = 40.785091
NYC_LONGITUDE = -73.968285

# Date range to download: Jan 1, 2016 through the last second of Jun 30, 2016.
START_DATE = datetime(year=2016, month=1, day=1)
END_DATE = datetime(year=2016, month=6, day=30, hour=23, minute=59, second=59)

# The destination CSV path is defined in config.py.
OUTPUT_FILE = config.DATA_PATHS['historical_weather']
# Directory part of that path (e.g., "data/processed"), used for os.makedirs.
PROCESSED_DIR = os.path.split(OUTPUT_FILE)[0]

def fetch_and_save_weather_bulk():
    """
    Download hourly weather for the configured date range and write it to CSV.

    The data for START_DATE..END_DATE at the configured NYC coordinates is
    requested from Meteostat in a single call. Only the columns of interest
    that are actually present are kept and renamed, missing values are
    filled with 0, and the result is saved to OUTPUT_FILE with the index
    named 'datetime_hourly'. Any error is printed rather than raised.
    """
    print(f"Starting to fetch all weather data from {START_DATE} to {END_DATE}...")

    # Meteostat column name -> column name used in the output file.
    wanted_columns = {
        'temp': 'temp',
        'rhum': 'humidity',
        'wspd': 'wind_speed',
        'visi': 'visibility',
        'coco': 'weather_condition_code',
    }

    try:
        # One bulk request covers the whole date range.
        station_point = Point(NYC_LATITUDE, NYC_LONGITUDE)
        hourly = Hourly(station_point, START_DATE, END_DATE).fetch()

        if hourly.empty:
            print("No data fetched.")
            return

        # Keep only the wanted columns that the response actually contains,
        # so a column absent from the response does not raise a KeyError.
        present_columns = {
            source: target
            for source, target in wanted_columns.items()
            if source in hourly.columns
        }

        # Only a missing visibility column is reported to the user.
        if 'visi' not in hourly.columns:
            print("Note: 'visibility' (visi) data was not found. Skipping this feature.")

        # Select, rename, and fill missing values with 0.
        cleaned = (
            hourly[list(present_columns)]
            .rename(columns=present_columns)
            .fillna(0)
        )
        cleaned.index.name = 'datetime_hourly'

        # Make sure the output directory exists before writing.
        os.makedirs(PROCESSED_DIR, exist_ok=True)
        cleaned.to_csv(OUTPUT_FILE)

        print(f"Success! Saved {len(cleaned)} hourly records to {OUTPUT_FILE}")

    except Exception as e:
        print(f"An error occurred: {e}")

if __name__ == "__main__":
    # Run the bulk download only when this file is executed as a script,
    # not when it is imported.
    fetch_and_save_weather_bulk()