forked from pbc504/Outdoor_Assessment_AQI
-
Notifications
You must be signed in to change notification settings - Fork 0
/
evaluate_bocs_model.py
156 lines (129 loc) · 11 KB
/
evaluate_bocs_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
'''
Program to evaluate models with preprocessed files from 2 months and trained models from a third month.
Start program in command line with for example:
%run evaluate_bocs_model.py "../preprocessed_aviva_march_2019.csv" "../preprocessed_aviva_may_2019.csv" "../preprocessed_bocs_aviva_raw_2019-03_2019-06/preprocessed*2019-03*" "../preprocessed_bocs_aviva_raw_2019-03_2019-06/preprocessed*2019-05*" "../bocs_aviva_trained_models_april_2019.csv" "../bocs_aviva_evaluated_models_april_2019.csv"
Problem in data file of 11/05/2019 and 12/05/2019. Not using those days
'''
import numpy as np
import pandas as pd
import argparse
import os
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import itertools
# Function to evaluate how good models are at predicting
def evaluate_model(test_dataframe,combo_num, r_sq_variable_name):
truth = results_df.loc[combo_num,'Truth']
y = ref_df[truth]
predictor_1 = results_df.loc[combo_num, 'Predictor_1']
predictor_2 = results_df.loc[combo_num, 'Predictor_2']
predictor_3 = results_df.loc[combo_num, 'Predictor_3']
predictor_4 = results_df.loc[combo_num, 'Predictor_4']
predictor_5 = results_df.loc[combo_num, 'Predictor_5']
predictor_6 = results_df.loc[combo_num, 'Predictor_6']
predictor_7 = results_df.loc[combo_num, 'Predictor_7']
if predictor_2 == '0':
x = test_dataframe[predictor_1]*results_df.loc[combo_num, 'Slope_1'] + results_df.loc[combo_num, 'Intercept']
elif predictor_3 == '0':
x = test_dataframe[predictor_1]*results_df.loc[combo_num, 'Slope_1'] + test_dataframe[predictor_2]*results_df.loc[combo_num, 'Slope_2'] + results_df.loc[combo_num, 'Intercept']
elif predictor_4 == '0':
x = test_dataframe[predictor_1]*results_df.loc[combo_num, 'Slope_1'] + test_dataframe[predictor_2]*results_df.loc[combo_num, 'Slope_2'] + test_dataframe[predictor_3]*results_df.loc[combo_num, 'Slope_3'] + results_df.loc[combo_num, 'Intercept']
elif predictor_5 == '0':
x = test_dataframe[predictor_1]*results_df.loc[combo_num, 'Slope_1'] + test_dataframe[predictor_2]*results_df.loc[combo_num, 'Slope_2'] + test_dataframe[predictor_3]*results_df.loc[combo_num, 'Slope_3'] + test_dataframe[predictor_4]*results_df.loc[combo_num, 'Slope_4'] + results_df.loc[combo_num, 'Intercept']
elif predictor_6 == '0':
x = test_dataframe[predictor_1]*results_df.loc[combo_num, 'Slope_1'] + test_dataframe[predictor_2]*results_df.loc[combo_num, 'Slope_2'] + test_dataframe[predictor_3]*results_df.loc[combo_num, 'Slope_3'] + test_dataframe[predictor_4]*results_df.loc[combo_num, 'Slope_4'] + test_dataframe[predictor_5]*results_df.loc[combo_num, 'Slope_5'] + results_df.loc[combo_num, 'Intercept']
elif predictor_7 == '0':
x = test_dataframe[predictor_1]*results_df.loc[combo_num, 'Slope_1'] + test_dataframe[predictor_2]*results_df.loc[combo_num, 'Slope_2'] + test_dataframe[predictor_3]*results_df.loc[combo_num, 'Slope_3'] + test_dataframe[predictor_4]*results_df.loc[combo_num, 'Slope_4'] + test_dataframe[predictor_5]*results_df.loc[combo_num, 'Slope_5'] + test_dataframe[predictor_6]*results_df.loc[combo_num, 'Slope_6'] + results_df.loc[combo_num, 'Intercept']
else:
x = test_dataframe[predictor_1]*results_df.loc[combo_num, 'Slope_1'] + test_dataframe[predictor_2]*results_df.loc[combo_num, 'Slope_2'] + test_dataframe[predictor_3]*results_df.loc[combo_num, 'Slope_3'] + test_dataframe[predictor_4]*results_df.loc[combo_num, 'Slope_4'] + test_dataframe[predictor_5]*results_df.loc[combo_num, 'Slope_5'] + test_dataframe[predictor_6]*results_df.loc[combo_num, 'Slope_6'] + test_dataframe[predictor_7]*results_df.loc[combo_num, 'Slope_7'] + results_df.loc[combo_num, 'Intercept']
test_dataframe[truth + '_predicted'] = x
x = test_dataframe[[truth + '_predicted']]
model_r_sq = LinearRegression().fit(x,y).score(x,y)
results_df.loc[combo_num, r_sq_variable_name] = model_r_sq
# Match reference to bocs_data dates
def match_dates(reference_dataframe, array1_dataframe, array2_dataframe):
first_date = max(reference_dataframe.index[0], array1_dataframe.index[0], array2_dataframe.index[0])
last_date = min(reference_dataframe.index[-1], array1_dataframe.index[-1], array2_dataframe.index[-1])
reference_dataframe = reference_dataframe[first_date:last_date]
array1_dataframe = array1_dataframe[first_date:last_date]
array2_dataframe = array2_dataframe[first_date:last_date]
if len(reference_dataframe) != len(array1_dataframe) or len(reference_dataframe) != len(array2_dataframe):
min_length = min(len(reference_dataframe), len(array1_dataframe), len(array2_dataframe))
if len(reference_dataframe) == min_length:
diff_1 = list(set(array1_dataframe.index) - set(reference_dataframe.index))
array1_dataframe = array1_dataframe.drop(diff_1)
diff_2 = list(set(array2_dataframe.index) - set(reference_dataframe.index))
array2_dataframe = array2_dataframe.drop(diff_2)
elif len(array1_dataframe) == min_length:
diff_1 = list(set(reference_dataframe.index) - set(array1_dataframe.index))
reference_dataframe = reference_dataframe.drop(diff_1)
diff_2 = list(set(array2_dataframe.index) - set(array1_dataframe.index))
array2_dataframe = array2_dataframe.drop(diff_2)
elif len(array2_dataframe) == min_length:
diff_1 = list(set(reference_dataframe.index) - set(array2_dataframe.index))
reference_dataframe = reference_dataframe.drop(diff_1)
diff_2 = list(set(array1_dataframe.index) - set(array2_dataframe.index))
array1_dataframe = array1_dataframe.drop(diff_2)
return reference_dataframe, array1_dataframe, array2_dataframe
#========================================================================================================================
# Arguments to parse
parser = argparse.ArgumentParser(description = 'Program to evaluate models with preprocessed files from 2 months and trained models from a third month. Start program in command line with for example: %run evaluate_bocs_model.py "../preprocessed_aviva_march_2019.csv" "../preprocessed_aviva_may_2019.csv" "../preprocessed_bocs_aviva_raw_2019-03_2019-06/preprocessed*2019-03*" "../preprocessed_bocs_aviva_raw_2019-03_2019-06/preprocessed*2019-05*" "../bocs_aviva_trained_models_april_2019.csv" "../bocs_aviva_evaluated_models_april_2019.csv"')
parser.add_argument("first_reference_filepath", help='Input first month reference filepath to evaluate models. Example: "../preprocessed_aviva_march_2019.csv".')
parser.add_argument("second_reference_filepath", help='Input second month reference filepath to evaluate models. Example: "../preprocessed_aviva_may_2019.csv".')
parser.add_argument("arrays_filepath", nargs='+', help='Input arrays filepath to evaluate. Example: "../preprocessed_bocs_aviva_raw_2019-03_2019-06/preprocessed*2019-03*" "../preprocessed_bocs_aviva_raw_2019-03_2019-06/preprocessed*2019-05*".')
parser.add_argument("trained_results_filepath", help='Input filepath of the file with the trained models. Example: "../bocs_aviva_trained_models_april_2019.csv"')
parser.add_argument("evaluated_results_filepath", help='Input filename for the evaluated results file. Example: "../bocs_aviva_evaluated_models_april_2019.csv".')
args = parser.parse_args()
# Separate the files for each array
all_arrays_file = args.arrays_filepath
array_1_files = [s for s in all_arrays_file if "SENSOR_ARRAY_1_" in s]
array_2_files = [s for s in all_arrays_file if "SENSOR_ARRAY_2_" in s]
# Reads reference data for first month
first_ref_df = pd.read_csv(args.first_reference_filepath, header=0, index_col=0,
dtype={'TimeBeginning': 'object', 'NO_Scaled': np.float64, 'NO2_Scaled': np.float64, 'NOx_Scaled': np.float64, 'O3_Scaled': np.float64, 'WD_Scaled': np.float64, 'TEMP_Scaled': np.float64, 'HUM_Scaled': np.float64, 'WINDMS_Scaled': np.float64})
first_ref_df.index = pd.to_datetime(first_ref_df.index)
# Reads reference data for second month
second_ref_df = pd.read_csv(args.second_reference_filepath, header=0, index_col=0,
dtype={'TimeBeginning': 'object', 'NO_Scaled': np.float64, 'NO2_Scaled': np.float64, 'NOx_Scaled': np.float64, 'O3_Scaled': np.float64, 'WD_Scaled': np.float64, 'TEMP_Scaled': np.float64, 'HUM_Scaled': np.float64, 'WINDMS_Scaled': np.float64})
second_ref_df.index = pd.to_datetime(second_ref_df.index)
# Reads selected columns of each preprocessed file in first array, resamples them to 5 minutes and appends them into a dataframe containing all the data.
# Same thing for both sensor arrays
df1 = pd.DataFrame()
for file in array_1_files:
df1_1 = pd.read_csv(file, header=0, index_col=0,
usecols=['timestamp', 'VOC', 'NO', 'CO', 'Ox', 'NO2',
# 'co2_1_active', 'co2_1_reference', 'co2_2_active', 'co2_2_reference', 'co2_3_active', 'co2_3_reference',
'humidity_in_percentage', 'temperature_in_celsius'],
dtype={'timestamp': np.int64, 'VOC': np.float64, 'NO': np.float64, 'CO': np.float64, 'Ox': np.float64, 'NO2': np.float64,
# 'co2_1': np.float64, 'co2_2': np.float64, 'co2_3': np.float64, 'co2_4': np.float64, 'co2_5': np.float64, 'co2_6': np.float64,
'humidity_in_percentage': np.float64, 'temperature_in_celsius': np.float64})
df1_1.index = pd.to_datetime(df1_1.index, unit='s')
df1_1r = df1_1.resample("5Min").mean()
df1 = df1.append(df1_1r, sort=False)
df2 = pd.DataFrame()
for file in array_2_files:
df2_1 = pd.read_csv(file, header=0, index_col=0,
usecols=['timestamp', 'VOC', 'NO', 'CO', 'Ox', 'NO2',
# 'co2_1_active', 'co2_1_reference', 'co2_2_active', 'co2_2_reference', 'co2_3_active', 'co2_3_reference',
'humidity_in_percentage', 'temperature_in_celsius'],
dtype={'timestamp': np.int64, 'VOC': np.float64, 'NO': np.float64, 'CO': np.float64, 'Ox': np.float64, 'NO2': np.float64,
# 'co2_1': np.float64, 'co2_2': np.float64, 'co2_3': np.float64, 'co2_4': np.float64, 'co2_5': np.float64, 'co2_6': np.float64,
'humidity_in_percentage': np.float64, 'temperature_in_celsius': np.float64})
df2_1.index = pd.to_datetime(df2_1.index, unit='s')
df2_1r = df2_1.resample("5Min").mean()
df2 = df2.append(df2_1r, sort=False)
# Append first and second month reference dataframes
ref_df = first_ref_df.append(second_ref_df, sort=False)
# Match start and finish of datalog
# Remove 11th and 12th of may from reference data as was a problem on those files of raw data (10*288=2880) 14th day- 3456
ref_df = match_dates(ref_df, df1, df2)[0]
df1 = match_dates(ref_df, df1, df2)[1]
df2 = match_dates(ref_df, df1, df2)[2]
#================================================================================================================================================
# Values of the results of different models made with training month data
results_df = pd.read_csv(args.trained_results_filepath, index_col=0)
# Evaluates the models on the data of march and may, for both sensor arrays.
for number in range(0,len(results_df)):
evaluate_model(df1, number, 'df1_evaluated_r_sq')
evaluate_model(df2, number, 'df2_evaluated_r_sq')
results_df.to_csv(args.evaluated_results_filepath)