forked from pbc504/Outdoor_Assessment_AQI
-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess_bocs_data.py
203 lines (178 loc) · 11.6 KB
/
preprocess_bocs_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
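'''
Program to preprocess the reference data file and the sensor-array data files.
Start the program from the IPython command line (%run is an IPython command) with:
%run preprocess_bocs_data.py "../aviva_april_2019.csv" "../bocs_aviva_raw_2019-03_2019-06/*2019-04*"
Note: there is a problem in the data files of 11/05/2019 and 12/05/2019, so those days are not used.
'''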
import numpy as np
import pandas as pd
import os
import argparse
# Function to convert sensor signal to concentration in ppb
def signal_to_ppb(dataframe, compound, sensor, properties_df):
    '''Add a zero-corrected, sensitivity-scaled column for one sensor.

    The new column is named ``<compound>_<sensor>`` and is written into
    ``dataframe`` in place; nothing is returned. The file's own comment
    describes the result as a concentration in ppb.

    Parameters:
        dataframe: DataFrame holding the columns
            ``<compound>_<sensor>_working`` and ``<compound>_<sensor>_aux``.
        compound: compound prefix as a string, e.g. 'no'.
        sensor: sensor number as a string, e.g. '1'.
        properties_df: table indexed by ``<compound>_<sensor>`` with the
            columns 'we_zero', 'ae_zero' and 'sensitivity'.
    '''
    sensor_key = "_".join((compound, sensor))

    def calibration(field):
        # Scalar lookup of one calibration value for this sensor.
        return properties_df.loc[sensor_key, field]

    # Subtract each electrode's zero offset from its raw signal.
    working_net = dataframe[sensor_key + "_working"] - calibration('we_zero')
    auxiliary_net = dataframe[sensor_key + "_aux"] - calibration('ae_zero')

    # Difference of the two corrected signals, scaled by the sensitivity.
    dataframe[sensor_key] = (working_net - auxiliary_net)/calibration('sensitivity')
# Function to convert temperature signal to degrees
def temperature_in_degrees(dataframe, Vout):
    '''Convert a temperature signal to kelvin and degrees Celsius.

    Writes the columns 'temperature_in_kelvin' and 'temperature_in_celsius'
    into ``dataframe`` in place; nothing is returned.

    Parameters:
        dataframe: DataFrame that receives the two new columns.
        Vout: temperature signal (scalar, array or Series) to convert.
            NOTE(review): the constants 5000 and 10000 suggest a signal in
            millivolts across a 10 kOhm divider - confirm against the
            hardware documentation.
    '''
    # Intermediate value called NTC in the original code (presumably the
    # thermistor resistance - confirm).
    ntc_value = (Vout*10000) / (5000 - Vout)

    # The logarithm is needed four times; compute it once.
    ln_ntc = np.log(ntc_value)

    # 1/T as a polynomial in ln(NTC) with a constant, a linear and a cubic term.
    reciprocal_kelvin = 8.5494e-4 + 2.5731e-4*ln_ntc + 1.6537e-7*ln_ntc*ln_ntc*ln_ntc
    kelvin = 1 / reciprocal_kelvin

    dataframe['temperature_in_kelvin'] = kelvin
    dataframe['temperature_in_celsius'] = kelvin - 273.15
# Function to convert co2 signal to concentration in ppm
def co2_concentration(properties_df, sensor, dataframe):
    '''Convert one CO2 sensor's active/reference signals to a concentration.

    The result is written into ``dataframe[sensor]`` in place; nothing is
    returned. The file's own comment describes the result as being in ppm.

    Steps:
        1. normalised ratio = active / (reference * zero), where
           zero = active_zero / reference_zero;
        2. the ratio is multiplied by a temperature compensation factor
           relative to 'zero_temperature';
        3. absorbance = 1 - compensated ratio;
        4. the span is multiplied by a temperature compensation factor
           relative to 'span_temperature';
        5. concentration = (-ln(1 - absorbance/span) / exponent)**(1/powerterm),
           taken as an absolute value and scaled by
           temperature / span_temperature.

    Parameters:
        properties_df: calibration table indexed by sensor name. The row
            ``sensor`` must provide 'active_zero', 'reference_zero',
            'zero_temperature', 'positive_zero_Tempcomp',
            'negative_zero_Tempcomp', 'span', 'span_temperature',
            'positive_span_Tempcomp', 'negative_span_Tempcomp', 'exponent'
            and 'powerterm'.
        sensor: sensor name as a string, e.g. 'co2_1'.
        dataframe: DataFrame with the columns ``<sensor>_active``,
            ``<sensor>_reference`` and 'temperature_in_kelvin'.
    '''
    temperature = dataframe['temperature_in_kelvin']

    def compensation(reference_key, positive_key, negative_key):
        # Row-wise factor 1 + coefficient*(T - T_ref). The coefficient is
        # chosen per row: the "positive" one above the reference temperature,
        # the "negative" one below it. At T == T_ref the factor is exactly 1
        # whichever coefficient is picked. Selecting per row (rather than
        # requiring every row to be on the same side) keeps the compensation
        # active for data whose temperature crosses the reference.
        delta = temperature - properties_df.loc[sensor, reference_key]
        coefficient = np.where(delta > 0,
                               properties_df.loc[sensor, positive_key],
                               properties_df.loc[sensor, negative_key])
        return 1 + delta*coefficient

    zero_co2 = properties_df.loc[sensor, 'active_zero'] / properties_df.loc[sensor, 'reference_zero']

    # Calculate the absorbance from the temperature compensated normalized ratio
    temp_comp_ratio = dataframe[sensor + '_active'] / (dataframe[sensor + '_reference']*zero_co2)
    temp_comp_ratio = temp_comp_ratio*compensation(
        'zero_temperature', 'positive_zero_Tempcomp', 'negative_zero_Tempcomp')
    absorbance = 1 - temp_comp_ratio

    # Calculate Span correction
    temp_comp_span = properties_df.loc[sensor, 'span']*compensation(
        'span_temperature', 'positive_span_Tempcomp', 'negative_span_Tempcomp')

    # concentration = (-ln(1 - absorbance/span) / exponent)^(1/powerterm).
    # The quantity inside the logarithm is absorbance/span, not its
    # reciprocal: span/absorbance exceeds 1 for any absorbance below the
    # span, which makes the logarithm undefined.
    value = absorbance / temp_comp_span
    value = -np.log(1 - value)
    value = value / properties_df.loc[sensor, 'exponent']
    value = value**(1 / properties_df.loc[sensor, 'powerterm'])
    value = abs(value)

    # Final scaling by the ratio of measured to span-calibration temperature.
    dataframe[sensor] = value*(temperature / properties_df.loc[sensor, 'span_temperature'])
# Function to align 3 sensors data to median value, then take median value for every timestamp.
def find_median(dataframe, finalname, a, b, c):
    '''Align three sensor columns on their first sample and take the row median.

    Writes the result into ``dataframe[finalname]`` in place; nothing is
    returned. See ``_median_of_aligned`` for the exact procedure.

    Parameters:
        dataframe: DataFrame containing the columns ``a``, ``b`` and ``c``.
        finalname: name of the column that receives the result.
        a, b, c: names of the sensor columns to combine.
    '''
    _median_of_aligned(dataframe, finalname, (a, b, c))


# Function to align voc sensors data to median value, then take median value for every timestamp.
def find_voc_median(dataframe, finalname, a, b, c, d, e, f, g, h):
    '''Eight-column variant of ``find_median`` (used for the VOC sensors).

    Writes the result into ``dataframe[finalname]`` in place; nothing is
    returned.

    Parameters:
        dataframe: DataFrame containing the eight named columns.
        finalname: name of the column that receives the result.
        a..h: names of the sensor columns to combine.
    '''
    _median_of_aligned(dataframe, finalname, (a, b, c, d, e, f, g, h))


def _median_of_aligned(dataframe, finalname, sensors):
    '''Shared implementation of find_median / find_voc_median.

    1. Take the first value of every listed column and compute the median of
       those values (a column listed twice is counted twice here).
    2. Shift every column by a constant so that its first value equals that
       median.
    3. Store the row-wise median of the shifted columns in
       ``dataframe[finalname]``.

    The shifted columns are keyed by name, so a column listed more than once
    contributes only once to the row-wise median in step 3.

    If ``dataframe`` has no rows there is no first sample to align on; the
    result column is then created empty instead of raising IndexError.
    A KeyError is still raised for a column name that does not exist.
    '''
    columns = [dataframe[name] for name in sensors]

    if len(dataframe.index) == 0:
        dataframe[finalname] = np.nan
        return

    first_values = [column.iloc[0] for column in columns]
    med_value = np.median(first_values)

    aligned = pd.DataFrame()
    for name, column, first in zip(sensors, columns, first_values):
        # Adding a zero offset leaves the values unchanged, so no special
        # case is needed for the column that already sits on the median.
        aligned['med_' + name] = column + (med_value - first)

    dataframe[finalname] = np.median(aligned.to_numpy(), axis=1)
#=======================================================================================================================
#Arguments to parse
# Command line: one reference CSV followed by one or more sensor-array files.
parser = argparse.ArgumentParser(description='Filepath to preprocess')
parser.add_argument("reference_filepath", help='Input reference filepath to preprocess. Example: "../aviva_april_2019.csv".')
parser.add_argument("arrays_filepath", nargs='+', help='Input sensor arrays filepath to preprocess. Example: "../bocs_aviva_raw_2019-03_2019-06/*2019-04*".')
args = parser.parse_args()

# Separates sensor array 1 files from sensor array 2 files: the first half of
# the list is treated as array 1, the rest as array 2 (with an odd number of
# files, array 2 receives the extra one).
# NOTE(review): this presumes the expanded file list holds all array-1 files
# first, followed by the same number of array-2 files - confirm against the
# naming of the raw files.
all_files = args.arrays_filepath
array_1_count = len(all_files)//2
array_1_files = all_files[:array_1_count]
array_2_files = all_files[array_1_count:]

# Read selected columns of reference data, change columns names and create new file with the processed data.
# Columns are selected and renamed by name: pandas ignores the order of
# `usecols` and returns columns in file order, so a positional rename would
# mislabel data whenever the file order differs from the list below.
reference_columns = {
    '1045100_NO_29_Scaled': 'NO_Scaled',
    '1045100_NO2_31_Scaled': 'NO2_Scaled',
    '1045100_NOx_30_Scaled': 'NOx_Scaled',
    '1045100_O3_1_Scaled': 'O3_Scaled',
    '1045100_WD_34_Scaled': 'WD_Scaled',
    '1045100_TEMP_41_Scaled': 'TEMP_Scaled',
    '1045100_HUM_46_Scaled': 'HUM_Scaled',
    '1045100_WINDMS_33_Scaled': 'WINDMS_Scaled',
}
reference_dtypes = {name: np.float64 for name in reference_columns}
reference_dtypes['TimeBeginning'] = 'object'

ref_df = pd.read_csv(
    args.reference_filepath,
    header=0,
    index_col='TimeBeginning',
    usecols=['TimeBeginning'] + list(reference_columns),
    dtype=reference_dtypes,
)
ref_df = ref_df.rename(columns=reference_columns)

# The output is written one directory above the current working directory.
filename = os.path.basename(args.reference_filepath)
ref_df.to_csv('../preprocessed_'+filename)
# Process sensor array data (array 1 and array 2 share the same pipeline).
# Select columns, change their names, convert sensor signal to ppb, temperature to degrees and relative humidity to percentage. Then write new file with the converted values added.

# Raw column names read from every sensor-array file ...
_ARRAY_RAW_COLUMNS = ['voc_1', 'voc_2', 'voc_3', 'voc_4', 'voc_5', 'voc_6', 'voc_7', 'voc_8',
                      'no_1', 'no_2', 'no_3', 'no_4', 'no_5', 'no_6',
                      'co_1', 'co_2', 'co_3', 'co_4', 'co_5', 'co_6',
                      'ox_1', 'ox_2', 'ox_3', 'ox_4', 'ox_5', 'ox_6',
                      'no2_1', 'no2_2', 'no2_3', 'no2_4', 'no2_5', 'no2_6',
                      'co2_1', 'co2_2', 'co2_3', 'co2_4', 'co2_5', 'co2_6',
                      'relative_humidity', 'temperature']
# ... and the descriptive name each one receives (paired by position with the list above).
_ARRAY_NAMED_COLUMNS = ['voc_1', 'voc_2', 'voc_3', 'voc_4', 'voc_5', 'voc_6', 'voc_7', 'voc_8',
                        'no_1_working', 'no_1_aux', 'no_2_working', 'no_2_aux', 'no_3_working', 'no_3_aux',
                        'co_1_working', 'co_1_aux', 'co_2_working', 'co_2_aux', 'co_3_working', 'co_3_aux',
                        'ox_1_working', 'ox_1_aux', 'ox_2_working', 'ox_2_aux', 'ox_3_working', 'ox_3_aux',
                        'no2_1_working', 'no2_1_aux', 'no2_2_working', 'no2_2_aux', 'no2_3_working', 'no2_3_aux',
                        'co2_1_active', 'co2_1_reference', 'co2_2_active', 'co2_2_reference', 'co2_3_active', 'co2_3_reference',
                        'relative_humidity', 'temperature']


def _preprocess_array_file(file, properties_df):
    '''Preprocess one raw sensor-array file and write the result to disk.

    For an input ``<parent>/<folder>/<name>`` the output is written to
    ``<parent>/preprocessed_<folder>/preprocessed_<name>``; the output
    directory is created if it does not exist.

    Parameters:
        file: path of the raw sensor-array CSV file.
        properties_df: calibration table of the array the file belongs to,
            as expected by ``signal_to_ppb``.

    Returns:
        The processed DataFrame that was written.
    '''
    # Columns are selected and renamed by name: pandas ignores the order of
    # `usecols` and returns columns in file order, so a positional rename
    # would mislabel data if the file order ever differed from the lists.
    df = pd.read_csv(file, header=0, index_col='timestamp',
                     usecols=['timestamp'] + _ARRAY_RAW_COLUMNS, dtype=np.int64)
    df = df.rename(columns=dict(zip(_ARRAY_RAW_COLUMNS, _ARRAY_NAMED_COLUMNS)))

    # Every raw column is scaled by the same factor.
    # NOTE(review): 0.1875 presumably converts ADC counts to millivolts - confirm.
    df = df*0.1875

    for compound in ('no', 'co', 'ox', 'no2'):
        for sensor in ('1', '2', '3'):
            signal_to_ppb(df, compound, sensor, properties_df)

    df['humidity_in_percentage'] = 0.0375*df['relative_humidity'] - 37.7
    temperature_in_degrees(df, df['temperature'])

    # CO2 conversion is currently disabled: the co2_*_active / co2_*_reference
    # columns are written out unconverted. To enable it, call
    # co2_concentration(<co2 properties>, 'co2_N', df) for N = 1..3 followed by
    # find_median(df, 'CO2', 'co2_1', 'co2_2', 'co2_3') at this point.

    # NOTE(review): sensor 2 is passed twice and sensor 3 is not used for the
    # combined value. This is preserved as found - confirm that excluding
    # sensor 3 is intentional.
    for finalname, prefix in (('NO', 'no'), ('CO', 'co'), ('Ox', 'ox'), ('NO2', 'no2')):
        find_median(df, finalname, prefix + '_1', prefix + '_2', prefix + '_2')
    find_voc_median(df, 'VOC', 'voc_1', 'voc_2', 'voc_3', 'voc_4', 'voc_5', 'voc_6', 'voc_7', 'voc_8')

    # os.path.join keeps the output relative when the input has a single
    # relative directory level; plain "+" concatenation would yield a
    # root-anchored "/preprocessed_<folder>/..." path in that case.
    directory = os.path.dirname(file)
    output_directory = os.path.join(os.path.dirname(directory),
                                    "preprocessed_" + os.path.basename(directory))
    os.makedirs(output_directory, exist_ok=True)
    df.to_csv(os.path.join(output_directory, "preprocessed_" + os.path.basename(file)))
    return df


# Process sensor_array_1 data
properties_df1 = pd.read_csv("../sensor_properties/sensor_array_1_electronic_properties.csv", index_col=0)
# Loaded although CO2 conversion is disabled, so a missing properties file is still reported.
co2_properties_1 = pd.read_csv("../sensor_properties/sensor_array_1_co2_properties.csv", index_col=0)
for file in array_1_files:
    # df1 keeps the last processed frame available at module level (e.g. after %run).
    df1 = _preprocess_array_file(file, properties_df1)

# Process sensor_array_2 data
properties_df2 = pd.read_csv("../sensor_properties/sensor_array_2_electronic_properties.csv", index_col=0)
co2_properties_2 = pd.read_csv("../sensor_properties/sensor_array_2_co2_properties.csv", index_col=0)
for file in array_2_files:
    df2 = _preprocess_array_file(file, properties_df2)