harshitaphadtare · harshitaphadtare · Oct 24, 2025 · Oct 21, 2025
diff --git a/config.py b/config.py
@@ -13,7 +13,8 @@
     'processed_test': 'data/processed/feature_engineered_test.csv',
     'precipitation': 'data/external/precipitation.csv',
     'gmaps_train': 'data/processed/gmapsdata/gmaps_train_data.csv',
-    'gmaps_test': 'data/processed/gmapsdata/gmaps_test_data.csv'
+    'gmaps_test': 'data/processed/gmapsdata/gmaps_test_data.csv',
+    'historical_weather': 'data/processed/historical_weather.csv'
 }
 
 # Output paths

diff --git a/requirements.txt b/requirements.txt
@@ -32,3 +32,6 @@ ipykernel>=6.0.0
 
 # Utilities
 requests>=2.25.0
+
+# Weather data API client
+meteostat
diff --git a/src/features/weather_api.py b/src/features/weather_api.py
@@ -0,0 +1,80 @@
+from datetime import datetime
+from meteostat import Point, Hourly
+import pandas as pd
+import sys
+import os
+
+# --- Put the project root on sys.path so root-level modules (config.py) resolve ---
+# NOTE(review): this file itself never imports config — confirm the hack is still needed.
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+# Two parent hops: features -> src -> root
+PROJECT_ROOT = os.path.abspath(os.path.join(SCRIPT_DIR, os.pardir, os.pardir))
+sys.path.append(PROJECT_ROOT)
+
+def get_weather_for_trip(latitude: float, longitude: float, timestamp: datetime) -> dict:
+    """
+    Fetch the hourly Meteostat observation covering a trip's time and place.
+
+    Args:
+        latitude: Latitude of the location.
+        longitude: Longitude of the location.
+        timestamp: A datetime object for the time of the trip (assumed UTC).
+            Any minute/second part is floored to the start of its hour.
+
+    Returns:
+        A dict with keys 'temp', 'humidity', 'wind_speed', 'visibility' and
+        'weather_condition_code' (missing/NaN readings become 0.0), or an
+        empty dict when no observation exists or the lookup fails.
+    """
+    try:
+        # Hourly observations are stamped on whole hours, so an exact
+        # start == end window only matches after flooring to the hour.
+        hour_start = timestamp.replace(minute=0, second=0, microsecond=0)
+
+        # Create a Meteostat Point and fetch that single hour
+        location = Point(latitude, longitude)
+        data = Hourly(location, hour_start, hour_start).fetch()
+
+        if data.empty:
+            print(f"No weather data found for {timestamp} at ({latitude}, {longitude})")
+            return {}
+
+        # Extract the first (and only) row of data
+        weather_row = data.iloc[0]
+
+        # Output feature name -> Meteostat column name
+        column_for_feature = {
+            'temp': 'temp',
+            'humidity': 'rhum',  # relative humidity
+            'wind_speed': 'wspd',
+            'visibility': 'visi',
+            'weather_condition_code': 'coco',
+        }
+        # .get() yields None for absent columns; None and NaN both become 0.0
+        weather_features = {}
+        for feature, column in column_for_feature.items():
+            reading = weather_row.get(column)
+            weather_features[feature] = 0.0 if pd.isna(reading) else reading
+        return weather_features
+
+    except Exception as e:
+        print(f"Error fetching weather data for {timestamp}: {e}")
+        return {}
+
+# --- Manual smoke test ---
+# This block only runs when you execute the file directly
+# (e.g., python src/features/weather_api.py)
+if __name__ == "__main__":
+    # Smoke test with a date from the dataset (Jan 1, 2016, 5:00 PM)
+    sample_lat, sample_lon = 40.767937, -73.982155
+    sample_time = datetime(2016, 1, 1, 17)
+
+    print(f"Testing Meteostat for {sample_time} at ({sample_lat}, {sample_lon})...")
+    result = get_weather_for_trip(sample_lat, sample_lon, sample_time)
+
+    # An empty dict is the function's failure signal.
+    if not result:
+        print("Test failed.")
+    else:
+        print("Success! Received data:")
+        print(result)
diff --git a/src/get_historical_weather.py b/src/get_historical_weather.py
@@ -0,0 +1,88 @@
+import sys
+import os
+from datetime import datetime
+from meteostat import Point, Hourly
+import pandas as pd
+
+# --- Make the project root importable; it must run before `import config` ---
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+# One parent hop: src -> root
+PROJECT_ROOT = os.path.abspath(os.path.join(SCRIPT_DIR, os.pardir))
+sys.path.append(PROJECT_ROOT)
+
+import config  # This will now work
+
+# --- Configuration ---
+# One reference point stands in for the whole city (Central Park).
+NYC_LATITUDE, NYC_LONGITUDE = 40.785091, -73.968285
+START_DATE = datetime(2016, 1, 1)  # Jan 1, 2016
+END_DATE = datetime(2016, 6, 30, 23, 59, 59)  # Jun 30, 2016 (last second)
+
+# --- Output location (path is defined in config.py) ---
+OUTPUT_FILE = config.DATA_PATHS['historical_weather']
+# Directory part (e.g., "data/processed"), needed later for os.makedirs
+PROCESSED_DIR = os.path.split(OUTPUT_FILE)[0]
+
+def fetch_and_save_weather_bulk():
+    """
+    Fetches all historical weather data for the entire date range
+    in a single request and saves it to a CSV.
+
+    Reads hourly Meteostat data for the point (NYC_LATITUDE, NYC_LONGITUDE)
+    between START_DATE and END_DATE, keeps and renames the columns listed
+    in the rename map below, fills missing values with 0 and writes the
+    result to OUTPUT_FILE, indexed by 'datetime_hourly'.
+
+    Returns:
+        None. Progress and errors are printed; exceptions are not re-raised.
+    """
+    print(f"Starting to fetch all weather data from {START_DATE} to {END_DATE}...")
+
+    try:
+        location = Point(NYC_LATITUDE, NYC_LONGITUDE)
+
+        # Get all data in one bulk request
+        data = Hourly(location, START_DATE, END_DATE).fetch()
+
+        if data.empty:
+            print("No data fetched.")
+            return
+
+        # Meteostat column -> feature name used in the output CSV
+        desired_rename_map = {
+            'temp': 'temp',
+            'rhum': 'humidity',
+            'wspd': 'wind_speed',
+            'visi': 'visibility',
+            'coco': 'weather_condition_code',
+        }
+
+        # Keep only the columns the fetched frame actually contains, and
+        # report every one that is absent (not just 'visi') so a silently
+        # narrower CSV is visible in the run log.
+        available_cols_map = {}
+        for source_col, feature_name in desired_rename_map.items():
+            if source_col in data.columns:
+                available_cols_map[source_col] = feature_name
+            else:
+                print(f"Note: '{feature_name}' ({source_col}) data was not found. Skipping this feature.")
+
+        # Select with an explicit list (not a dict view), then rename
+        data_cleaned = data[list(available_cols_map)].rename(columns=available_cols_map)
+
+        # Handle missing values (fill with 0)
+        data_cleaned = data_cleaned.fillna(0)
+        data_cleaned.index.name = 'datetime_hourly'
+
+        # Make sure the destination directory exists, then write the CSV
+        if PROCESSED_DIR:  # dirname is '' for a bare filename
+            os.makedirs(PROCESSED_DIR, exist_ok=True)
+        data_cleaned.to_csv(OUTPUT_FILE)
+
+        print(f"Success! Saved {len(data_cleaned)} hourly records to {OUTPUT_FILE}")
+
+    except Exception as e:
+        print(f"An error occurred: {e}")
+
+if __name__ == "__main__":  # run the bulk fetch only when executed as a script
+    fetch_and_save_weather_bulk()