From 21a1eab6668eed7037410a4a626bcda09390b621 Mon Sep 17 00:00:00 2001
From: mosc5 <moritz.schiel@rl-institut.de>
Date: Mon, 19 Feb 2024 10:49:30 +0100
Subject: [PATCH] update driving profile creation example

---
 examples/driving_profiles_from_mid.py | 263 +++++++++++---------------
 1 file changed, 110 insertions(+), 153 deletions(-)

diff --git a/examples/driving_profiles_from_mid.py b/examples/driving_profiles_from_mid.py
index 756de72..f3bc01f 100644
--- a/examples/driving_profiles_from_mid.py
+++ b/examples/driving_profiles_from_mid.py
@@ -1,21 +1,15 @@
-import json
-import pathlib
-
 import pandas as pd
 import numpy as np
-
+import pathlib
 pd.options.mode.chained_assignment = None  # default='warn'
 
-
-def create_driving_profiles(number_of_dp, region_names, car_type_names):
-    data_path = pathlib.Path("mid_data")
+def create_driving_profiles(number_of_dp, path):
+    data_path = pathlib.Path("..", "mid_data")
+    # way_cols = [0, 1, 2, 3, 4, 17, 19, 21, 26, 28, 30, 35, 43, 48, 55, 58, 60, 67, 68, 87, 94, 95, 113, 122, 163]
     way_cols = [0, 1, 2, 3, 4, 21, 28, 30, 35, 48, 53, 54, 58, 60, 67, 68, 94, 95, 163]
-    way = pd.read_csv(
-        pathlib.Path(data_path, "MiD2017_Wege.csv"),
-        sep=";",
-        decimal=",",
-        usecols=way_cols,
-    )
+    way = pd.read_csv(pathlib.Path(data_path, "MiD2017_Wege.csv"), sep=';', decimal=',',
+                      usecols=way_cols)
+    way.head()
 
     # read MiD-Wege, required fields (see code plan for mid)
     # 0: HP_ID (Haushalts-Personen-ID)
@@ -38,7 +32,7 @@ def create_driving_profiles(number_of_dp, region_names, car_type_names):
     # 58: wegkm_imp (Wegelaenge in km, 0,01 - 900)
     # 60: tempo (Geschwindigkeit 0,5 - 900, 9994 unplausibel, 9995 Wert nicht zu berechnen, 70701 bei rBW nicht zu bestimmen, 70703 Weg ohne Detailerfassung)
     # 67: W_VM_G (Verkehrsmittel - Pkw 0: nicht genannt, 1: genannt)
-    # 68: W_VM_H (Verkehrsmittel - Carsharing 0: nicht genannt, 1: genannt)
+    # 68: W_VM_H (Verkehrsmittel - Carsharing 0: nicht genannt, 1: genannt) TODO add?
     # 87: hvm_imp (Hauptverkehrsmittel - 1: zu Fuss, 2: Fahrrad, 3: MIV Mitfahrer, 4: MIV Fahrer, 5 OePV)
     # 94: W_AUTO_HH (Auto aus dem Haushalt - 1: ja, 2: nein, 9: keine Angabe, weitere Codes fuer Begruendung des Fehlens)
     # 95: W_WAUTO (A_ID des FAhrzeugs - 1: 1. Fahrzeug, 2: 2. Fahrzeug, 3: 3. Fahrzeug, 4: anderes Fahrzeug, 9: keine Angabe, weitere Codes fuer Begruendung des Fehlens)
@@ -52,60 +46,44 @@ def create_driving_profiles(number_of_dp, region_names, car_type_names):
     uc_zwdp_street = [701, 705, 711, 604, 715, 717, 999, 2020, 7704, 7705]
 
     uc_zwdf_retail = [503, 603, 702, 703, 704, 705, 706, 713, 714]
-    uc_zwdf_street = [
-        605,
-        701,
-        707,
-        710,
-        711,
-        712,
-        716,
-        717,
-        719,
-        720,
-        999,
-        2202,
-        7704,
-        7705,
-    ]
-
-    way["charging_use_case"].loc[
-        (way["W_ZWDP"].isin(uc_zwdp_street) & (way["zweck"] == 5))
-        | (way["W_ZWDF"].isin(uc_zwdf_street) & (way["zweck"] == 7))
-    ] = "street"
-    way["charging_use_case"].loc[
-        (way["W_ZWDP"].isin(uc_zwdp_retail) & (way["zweck"] == 5))
-        | (way["W_ZWDF"].isin(uc_zwdf_retail) & (way["zweck"] == 7))
-    ] = "retail"
-
-    household_cols = [0, 1, 87, 97]
-    households = pd.read_csv(
-        pathlib.Path(data_path, "MiD2017_Haushalte.csv"),
-        sep=";",
-        decimal=",",
-        usecols=household_cols,
-    )
+    uc_zwdf_street = [605, 701, 707, 710, 711, 712, 716, 717, 719, 720, 999, 2202, 7704, 7705]
+
+    way.loc[(way["W_ZWDP"].isin(uc_zwdp_street) & (way["zweck"] == 5)) |  # index the frame directly: chained .loc assignment is a no-op under copy-on-write
+            (way["W_ZWDF"].isin(uc_zwdf_street) & (way["zweck"] == 7)), "charging_use_case"] = "street"
+    way.loc[(way["zweck"].isin([4, 6])) |  # assigned after "street" on purpose: rows matching both end up "retail"
+            ((way["zweck"] == 2) & (way["wegkm_imp"] <= 350)) |
+            (way["W_ZWDP"].isin(uc_zwdp_retail) & (way["zweck"] == 5)) |
+            (way["W_ZWDF"].isin(uc_zwdf_retail) & (way["zweck"] == 7)), "charging_use_case"] = "retail"
+
+    # car_cols = []
+    #cars = pd.read_csv(pathlib.Path(data_path, "MiD2017_Autos.csv"), sep=';', decimal=',',
+                       # usecols=car_cols
+    #                   )
+
+    #cars.head()
+
+    household_cols = [0,1,87,97]
+    households = pd.read_csv(pathlib.Path(data_path, "MiD2017_Haushalte.csv"), sep=';', decimal=',',
+                       usecols=household_cols
+                       )
 
     households.head()
 
     person_cols = [0, 1, 3, 22]
-    persons = pd.read_csv(
-        pathlib.Path(data_path, "MiD2017_Personen.csv"),
-        sep=";",
-        decimal=",",
-        usecols=person_cols,
-    )
+    persons = pd.read_csv(pathlib.Path(data_path, "MiD2017_Personen.csv"), sep=';', decimal=',',
+                       usecols=person_cols
+                       )
     persons.head()
 
     # group by H_ID and count number of unique ST_WOTAG values
-    counts = persons.groupby("H_ID")["ST_WOTAG"].nunique()
+    counts = persons.groupby('H_ID')['ST_WOTAG'].nunique()
 
     if (counts == 1).all():
         print("All ST_WOTAG values are the same for each H_ID")
     else:
         print("ST_WOTAG values are not always the same for the same H_ID")
 
-    persons = persons.drop_duplicates(subset="HP_ID", keep="first")
+    persons = persons.drop_duplicates(subset='HP_ID', keep='first')
     persons.head()
 
     households = households.join(persons.set_index("H_ID"), on="H_ID")
@@ -131,17 +109,16 @@ def create_driving_profiles(number_of_dp, region_names, car_type_names):
 
     # fahrten mit fahrzeit = 0 raus filtern
 
-    way_filtered = way.loc[
-        (way["wegkm_imp"] >= way_distance_min)
-        & (way["wegkm_imp"] <= way_distance_max)
-        & ((way["W_VM_G"] == 1) | (way["W_VM_H"] == 1))
-        & (way["tempo"] >= speed_min)
-        & (way["tempo"] <= speed_max)
-        & (way["W_SZ"].str.contains("^\d{1,2}:\d{2}:\d{2}$", regex=True))
-        & (way["W_AZ"].str.contains("^\d{1,2}:\d{2}:\d{2}$", regex=True))
-        & (way["zweck"] >= 1)
-        & (way["zweck"] <= 10)
-    ].copy()
+    way_filtered = way.loc[(way["wegkm_imp"] >= way_distance_min) &
+                           (way["wegkm_imp"] <= way_distance_max) &
+                           ((way["W_VM_G"] == 1) | (way["W_VM_H"] == 1)) &
+                           (way["tempo"] >= speed_min) &
+                           (way["tempo"] <= speed_max) &
+                           (way["W_SZ"].str.contains(r'^\d{1,2}:\d{2}:\d{2}$', regex=True)) &  # raw string: "\d" is an invalid escape in a normal str
+                           (way["W_AZ"].str.contains(r'^\d{1,2}:\d{2}:\d{2}$', regex=True)) &
+                           (way["zweck"] >= 1) &
+                           (way["zweck"] <= 10)
+                           ].copy()
     print("lenght_filterd", len(way_filtered))
     print("lenght_households", len(households))
 
@@ -149,20 +126,9 @@ def create_driving_profiles(number_of_dp, region_names, car_type_names):
 
     # private/ridesharing/anderer zweck zusammenfassen
 
-    mid_zweck = range(1, 11)
-
-    simbev_zweck = [
-        "work",
-        "business",
-        "school",
-        "shopping",
-        "private",
-        "private",
-        "leisure",
-        "home",
-        "home",
-        "private",
-    ]
+    mid_zweck = range(1, 11)  # todo: check if wegzweck rückweg auf home okay!?
+
+    simbev_zweck = ["work", "business", "school", "shopping", "private", "private", "leisure", "home", "home", "private"]
 
     zweck_dict = dict(zip(mid_zweck, simbev_zweck))
 
@@ -176,6 +142,7 @@ def create_driving_profiles(number_of_dp, region_names, car_type_names):
         "rural": [74, 77],
     }
 
+    # TODO change to 12 classes? method: join vehicle data to households, pkw seg_kba #27
     car_type_names = ["mini", "medium", "luxury"]
     kba_seg = {
         "mini": [1, 2],
@@ -214,34 +181,34 @@ def create_driving_profiles(number_of_dp, region_names, car_type_names):
             # print(region, car_type)
             # 1. find all households that mach region and car-type by region
             household_dict[region][car_type] = households.loc[
-                households["RegioStaR7"].isin(regiostar7[region])
-                & households["pkw_seg_kba"].isin(kba_seg[car_type])
-            ].copy()
-            stats_dict[region][car_type]["number_of_households"] = len(
-                household_dict[region][car_type]["H_ID"].unique()
-            )
+                households["RegioStaR7"].isin(regiostar7[region]) & households["pkw_seg_kba"].isin(
+                    kba_seg[car_type])].copy()
+            stats_dict[region][car_type]["number_of_households"] = len(household_dict[region][car_type]["H_ID"].unique())
 
             # 2. find all ways that are done by household
             way_dict[region][car_type] = way_filtered.loc[
-                way_filtered["H_ID"].isin(household_dict[region][car_type]["H_ID"])
-            ].copy()
-            stats_dict[region][car_type]["number_of_ways"] = len(
-                way_dict[region][car_type]
-            )
+                way_filtered["H_ID"].isin(household_dict[region][car_type]["H_ID"])].copy()
+            stats_dict[region][car_type]["number_of_ways"] = len(way_dict[region][car_type])
             # print()
 
     def create_timestep_from_time(time_str):
         time_str_list = time_str.split(":")
-        return int(time_str_list[0]) * 60 + int(time_str_list[1])
+        return int(time_str_list[0])*60+int(time_str_list[1])
 
+    # get ways data for each household, change it to fit own specifications (use cases)
     for region in region_names:
         for car_type in car_type_names:
-            way_dict[region][car_type]["departure_time"] = way_dict[region][car_type][
-                "W_SZ"
-            ].apply(create_timestep_from_time)
-            way_dict[region][car_type]["arrival_time"] = way_dict[region][car_type][
-                "W_AZ"
-            ].apply(create_timestep_from_time)
+            way_dict[region][car_type]["departure_time"] = way_dict[region][car_type]["W_SZ"].apply(create_timestep_from_time)
+            way_dict[region][car_type]["arrival_time"] = way_dict[region][car_type]["W_AZ"].apply(create_timestep_from_time)
+
+        # fill day with activities
+        #activity = pd.DataFrame([[id_driving_profile, (weekday-1), row["zweck"], row["W_SZ"], row["W_AZ"], row["wegkm_imp"]]],columns=dp_columns)
+        #driving_profile_dict[region][car_type] = pd.concat([driving_profile_dict[region][car_type], activity],ignore_index=True)
+        #previous_row = row
+
+    # construct weekly driving profiles using the weight column of the daily profiles
+    # include event-free days
+    # check whether the value of 40% of days without an event holds in MiD
 
     # choose H_ID out of Householdgroup by using weight column
 
@@ -259,58 +226,58 @@ def select_by_weight(df, column, weight):
 
         # return household_id for further usage
         return selected_df[column]
-
-    def select_randomly(df, column):
+    
+    # deprecated
+    def select_randomly(df,column,weight):
         selected_df = df.sample()
         return selected_df[column].iloc[0]
 
+
     # check choosen household for driving-events on given weekday
     def check_for_way(household_df, weekday, way_df):
         # households_df: all households with PkW in specific region and with specific car type
         household_df_weekday = household_df.loc[household_df["ST_WOTAG"] == weekday]
         # select households that have a maching "Stichtag"
-        H_ID = select_randomly(household_df_weekday, "H_ID")
+        H_ID = select_by_weight(household_df_weekday, "H_ID", "H_GEW")
+        # check for unfiltered ways here?
+        # print(H_ID)
 
         # check for persons and persons with way
+        # unique_persons = household_df["HP_ID"].loc[(household_df["H_ID"] == H_ID)].unique()
         unique_persons_with_way = way_df["HP_ID"].loc[(way_df["H_ID"] == H_ID)].unique()
+        # print(H_ID)
+        # print(unique_persons)
+        # print(unique_persons_with_way)
+        # print()
 
         if unique_persons_with_way.size == 0:
             # If there is no persons with way in household write empty DataFrame
             specific_day = pd.DataFrame()
         else:
             # else check for person with way by weight
-            HP_ID = select_randomly(
-                household_df.loc[household_df["HP_ID"].isin(unique_persons_with_way)],
-                "HP_ID",
-            )
+            HP_ID = select_by_weight(household_df.loc[household_df["HP_ID"].isin(unique_persons_with_way)], "HP_ID",
+                                     "P_GEW")
             # check if there is any connection to way dataframe
             specific_day = way_df.loc[(way_df["HP_ID"] == HP_ID)]
 
         return specific_day, (not specific_day.empty)
 
+
     # start with preperations for generating driving profiles
-    dp_columns = [
-        "id",
-        "day",
-        "location",
-        "departure_time",
-        "arrival_time",
-        "distance",
-        "charging_use_case",
-    ]
-    profile_columns = [
-        "id",
-        "ST_WOTAG",
-        "zweck",
-        "departure_time",
-        "arrival_time",
-        "wegkm_imp",
-        "charging_use_case",
-    ]
+    dp_columns = ["id", "day", "location", "departure_time", "arrival_time", "distance", "charging_use_case"]
+    profile_columns = ["id", "ST_WOTAG", "zweck", "departure_time", "arrival_time", "wegkm_imp", "charging_use_case"]
 
     new_column_dict = {key: value for key, value in zip(profile_columns, dp_columns)}
 
     days = 7
+    region_names = ["urban", "suburban", "rural"]
+    #region_names = ["urban"]
+    #region_names = ["suburban"]
+    #region_names = ["rural"]
+    car_type_names = ["mini", "medium", "luxury"]
+    #car_type_names = ["mini"]
+    #car_type_names = ["medium"]
+    #car_type_names = ["luxury"]
 
     # counters for identifying empty days
     day_counter = number_of_dp * len(region_names) * len(car_type_names) * days
@@ -324,12 +291,10 @@ def check_for_way(household_df, weekday, way_df):
             for _ in range(number_of_dp):
                 # start generating driving-profile for given household
                 for weekday in range(days):
+
                     # check if day is empty
-                    day_specific, ways_found = check_for_way(
-                        household_dict[region][car_type],
-                        weekday,
-                        way_dict[region][car_type],
-                    )
+                    day_specific, ways_found = check_for_way(household_dict[region][car_type], weekday,
+                                                             way_dict[region][car_type])
                     if not ways_found:
                         # skip day with no activities
                         no_way_counter += 1
@@ -342,9 +307,7 @@ def check_for_way(household_df, weekday, way_df):
                         activity = day_specific.copy()
                         activity["id"] = id_driving_profile
                         activity = activity[profile_columns]
-                        driving_profiles = pd.concat(
-                            [driving_profiles, activity], ignore_index=True
-                        )
+                        driving_profiles = pd.concat([driving_profiles, activity], ignore_index=True)
 
                 id_driving_profile += 1
             driving_profiles = driving_profiles.rename(columns=new_column_dict)
@@ -355,40 +318,34 @@ def check_for_way(household_df, weekday, way_df):
     stats_dict["share_of_empty_days"] = share_of_empty_days
     print("share of empty days", share_of_empty_days)
 
-    save_dir = pathlib.Path("driving_profiles")
+    import json
+    save_dir = pathlib.Path(path)
+    save_dir.parent.mkdir(parents=True, exist_ok=True)  # nested targets such as "results/profiles" need their parent created first
     save_dir.mkdir(exist_ok=True)
 
     for region in region_names:
         for car_type in car_type_names:
             # change datatypes
             driving_profile = driving_profile_dict[region][car_type]
-            driving_profile["id"] = driving_profile["id"].astype("int32")
-            driving_profile["day"] = driving_profile["day"].astype("int8")
-            driving_profile["location"] = driving_profile["location"].astype("category")
-            driving_profile["departure_time"] = driving_profile[
-                "departure_time"
-            ].astype("int16")
-            driving_profile["arrival_time"] = driving_profile["arrival_time"].astype(
-                "int16"
-            )
-            driving_profile["distance"] = driving_profile["distance"].astype("float32")
+            driving_profile["id"] = driving_profile["id"].astype('int32')
+            driving_profile["day"] = driving_profile["day"].astype('int8')
+            driving_profile["location"] = driving_profile["location"].astype('category')
+            driving_profile["departure_time"] = driving_profile["departure_time"].astype('int16')
+            driving_profile["arrival_time"] = driving_profile["arrival_time"].astype('int16')
+            driving_profile["distance"] = driving_profile["distance"].astype('float32')
 
             # save to parquet
-            driving_profile.to_parquet(
-                pathlib.Path(save_dir, f"driving_profiles_{region}_{car_type}.gzip"),
-                compression="gzip",
-            )
+            driving_profile.to_parquet(pathlib.Path(save_dir, f'driving_profiles_{region}_{car_type}.gzip'),
+                                       compression='gzip')  # built from save_dir so `path` may be a str or a Path
 
             # save to csv
-            # driving_profile.to_csv(pathlib.Path(save_dir, f'driving_profiles_{region}_{car_type}.csv'))
+            # driving_profile.to_csv(path + f'/driving_profiles_{region}_{car_type}.csv')
 
     # save statistics
-    with open(f"stats.json", "w") as outfile:
+    with open(pathlib.Path(save_dir, "stats.json"), "w") as outfile:  # same directory as the profiles; works for str or Path `path`
         json.dump(stats_dict, outfile, indent=4)
 
-
 if __name__ == "__main__":
-    region_names = ["urban", "suburban", "rural"]
-    car_type_names = ["mini", "medium", "luxury"]
-    number_of_dp = 1000
-    create_driving_profiles(number_of_dp, region_names, car_type_names)
+    number_of_dp = 400000
+    path = 'results/profiles'
+    create_driving_profiles(number_of_dp, path)
\ No newline at end of file