-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtransportation_safety_environmental_protection.py
254 lines (198 loc) · 9.57 KB
/
transportation_safety_environmental_protection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
# -*- coding: utf-8 -*-
"""transportation_safety_environmental_protection.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/101BzN2mMt2YxKFfd3jyV-5em1Pw_szAJ
"""
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from google.colab import drive
drive.mount('/content/drive')
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
file_path = "/content/drive/MyDrive/TransportationData/39yearrailroadaccidentincidentoverview.xlsx"
df = pd.read_excel(file_path)
df = pd.read_excel('/content/drive/MyDrive/TransportationData/39yearrailroadaccidentincidentoverview.xlsx')
# Transpose the dataframe
df_transposed = df.set_index('Category').T
# Reset index and rename the columns
df_transposed.reset_index(inplace=True)
df_transposed.columns = df_transposed.columns.str.strip()
df_transposed.rename(columns={'index': 'Year'}, inplace=True)
# Extract the year and convert it to integer
df_transposed['Year'] = df_transposed['Year'].str.extract('(\d+)').astype(int)
# Define the features and the target variable
X = df_transposed[['Year', '--- Train accident deaths', '--- Train accident injuries',
'--- Human factor caused', '--- Track caused',
'--- Motive power/equipment caused', '--- Signal caused, all track types',
'--- Miscellaneous caused', '--- Collisions',
'--- Derailments', '--- Other types, e.g., obstructions']].copy()
X['Hazmat_Derailment_Ratio'] = df_transposed['--- Hazmat cars damaged/derailed'] / df_transposed['TOTAL ACCIDENTS/INCIDENTS 1/']
y1 = df_transposed['--- HAZMAT RELEASES']
y2 = df_transposed['--- Hazmat cars damaged/derailed']
# Split the data into training and testing sets for each target variable
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y1, test_size=0.2, random_state=42)
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y2, test_size=0.2, random_state=42)
# Add function to calculate and display evaluation metrics
def display_metrics(y_true, y_pred):
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)
print("Mean squared error (MSE):", mse)
print("Root mean squared error (RMSE):", rmse)
print("Mean absolute error (MAE):", mae)
print("R-squared (R²) score:", r2)
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
models = {'Linear Regression': LinearRegression(),
'Random Forest': RandomForestRegressor(random_state=42),
'Gradient Boosting': GradientBoostingRegressor(random_state=42)}
for name, model in models.items():
# Train the model for hazmat releases
model.fit(X_train1, y_train1)
y_pred1 = model.predict(X_test1)
print(f"{name} (Hazmat Releases) evaluation:")
display_metrics(y_test1, y_pred1)
# Train the model for hazmat cars damaged/derailed
model.fit(X_train2, y_train2)
y_pred2 = model.predict(X_test2)
print(f"{name} (Hazmat Cars Damaged/Derailed) evaluation:")
display_metrics(y_test2, y_pred2)
print("\n")
"""Based on the provided evaluation metrics, here's a summary of the models' performances:
Hazmat Releases:
Linear Regression:
R² score: 0.787
MSE: 21.66
RMSE: 4.65
MAE: 3.86
Random Forest:
R² score: 0.537
MSE: 46.99
RMSE: 6.85
MAE: 5.46
Gradient Boosting:
R² score: 0.318
MSE: 69.24
RMSE: 8.32
MAE: 6.94
Hazmat Cars Damaged/Derailed:
Linear Regression:
R² score: -0.181
MSE: 3566.04
RMSE: 59.72
MAE: 51.69
Random Forest:
R² score: 0.698
MSE: 912.38
RMSE: 30.21
MAE: 25.27
Gradient Boosting:
R² score: 0.546
MSE: 1371.88
RMSE: 37.04
MAE: 28.34
For predicting Hazmat Releases, the Linear Regression model has the best performance, with the highest R² score (0.7866), and the lowest RMSE (4.6537) , MSE (21.66), and MAE (3.8580).
For predicting Hazmat Cars Damaged/Derailed, the Random Forest model has the best performance, with the highest R² score (0.6979), and the lowest RMSE (30.2056), MSE (912.38), and MAE (25.2663).
"""
# Function to prepare the features for a specific year
def prepare_features(df):
row = df.iloc[-1] # Use the last row (most recent year) in the dataset
features = row[['Year', '--- Train accident deaths', '--- Train accident injuries',
'--- Human factor caused', '--- Track caused',
'--- Motive power/equipment caused', '--- Signal caused, all track types',
'--- Miscellaneous caused', '--- Collisions',
'--- Derailments', '--- Other types, e.g., obstructions']].copy()
features['Hazmat_Derailment_Ratio'] = row['--- Hazmat cars damaged/derailed'] / row['TOTAL ACCIDENTS/INCIDENTS 1/']
return features
# Function to make predictions for a specific year
def predict_for_year(year, df):
most_recent_features = prepare_features(df)
most_recent_features['Year'] = year # Set the year for the prediction
features = most_recent_features.to_frame().T
hazmat_releases_pred = best_model_hazmat_releases.predict(features)
hazmat_cars_damaged_derailed_pred = best_model_hazmat_cars_damaged_derailed.predict(features)
return hazmat_releases_pred[0], hazmat_cars_damaged_derailed_pred[0]
# Example usage: predict for the year 2023
year = 2024
hazmat_releases_prediction, hazmat_cars_damaged_derailed_prediction = predict_for_year(year, df_transposed)
print(f"Predicted number of hazmat releases in {year}: {hazmat_releases_prediction}")
print(f"Predicted number of hazmat cars damaged/derailed in {year}: {hazmat_cars_damaged_derailed_prediction}")
from sklearn.model_selection import cross_val_score
models = {'Linear Regression': LinearRegression(),
'Random Forest': RandomForestRegressor(random_state=42),
'Gradient Boosting': GradientBoostingRegressor(random_state=42)}
for name, model in models.items():
# Evaluate the model for hazmat releases using cross-validation
scores1 = cross_val_score(model, X, y1, cv=5, scoring='neg_mean_squared_error')
rmse1 = np.sqrt(-scores1)
print(f"{name} (Hazmat Releases) evaluation:")
print("Cross-validation scores:", rmse1)
print("Mean RMSE:", rmse1.mean())
# Evaluate the model for hazmat cars damaged/derailed using cross-validation
scores2 = cross_val_score(model, X, y2, cv=5, scoring='neg_mean_squared_error')
rmse2 = np.sqrt(-scores2)
print(f"{name} (Hazmat Cars Damaged/Derailed) evaluation:")
print("Cross-validation scores:", rmse2)
print("Mean RMSE:", rmse2.mean())
print("\n")
"""Based on the cross-validation results, here is the summary of the performance of each model for the two target variables:
Hazmat Releases:
Linear Regression:
Mean RMSE: 11.08
Random Forest:
Mean RMSE: 8.87
Gradient Boosting:
Mean RMSE: 8.31
Hazmat Cars Damaged/Derailed:
Linear Regression:
Mean RMSE: 66.22
Random Forest:
Mean RMSE: 149.24
Gradient Boosting:
Mean RMSE: 126.15
For the Hazmat Releases target variable, the Gradient Boosting model performs the best, with the lowest mean RMSE from cross-validation.
For the Hazmat Cars Damaged/Derailed target variable, the Linear Regression model performs the best, with the lowest mean RMSE from cross-validation.
Based on the cross-validation results, I would choose the Gradient Boosting model for predicting Hazmat Releases and the Linear Regression model for predicting Hazmat Cars Damaged/Derailed.
***Note that cross-validation can be computationally expensive, especially for larger datasets.
"""
import matplotlib.pyplot as plt
import seaborn as sns
# Train the Random Forest model on the entire dataset
rf = RandomForestRegressor(random_state=42)
rf.fit(X, y1)
# Create a bar chart of the feature importances
feature_importances = pd.Series(rf.feature_importances_, index=X.columns)
feature_importances = feature_importances.drop(['Hazmat_Derailment_Ratio', 'Year', '--- Train accident deaths', '--- Train accident injuries'])
feature_importances = feature_importances.sort_values()
plt.figure(figsize=(8, 6))
sns.barplot(x=feature_importances, y=feature_importances.index)
plt.title("Feature Importances")
plt.xlabel("Importance")
plt.show()
#Create a time series plot of '--- Hazmat cars damaged/derailed'
plt.figure(figsize=(10, 6))
plt.plot(df_transposed['Year'], df_transposed['--- Hazmat cars damaged/derailed'])
plt.title("Hazmat Cars Damaged/Derailed Over Time")
plt.xlabel("Year")
plt.ylabel("Hazmat Cars Damaged/Derailed")
plt.show()
# Create a time series plot of '--- Cars carrying hazmat'
plt.figure(figsize=(10, 6))
plt.plot(df_transposed['Year'], df_transposed['--- Cars carrying hazmat'])
plt.title("Cars Carrying Hazmat Over Time")
plt.xlabel("Year")
plt.ylabel("Cars Carrying Hazmat")
plt.show()
# Create a line plot of '--- Hazmat cars damaged/derailed' and '--- Cars carrying hazmat'
plt.figure(figsize=(10, 6))
plt.plot(df_transposed['Year'], df_transposed['--- Hazmat cars damaged/derailed'], label='Hazmat Cars Damaged/Derailed')
plt.plot(df_transposed['Year'], df_transposed['--- Cars carrying hazmat'], label='Cars Carrying Hazmat')
plt.title("Hazmat Cars Damaged/Derailed vs. Cars Carrying Hazmat Over Time")
plt.xlabel("Year")
plt.ylabel("Number of Cars")
plt.legend()
plt.show()