-
Notifications
You must be signed in to change notification settings - Fork 1
/
data_preprocessing.py
111 lines (94 loc) · 3.39 KB
/
data_preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
"""
file: data_preprocessing.py
description: this file preprocesses the data so it can be fed to a nn
author: Raman Zatsarenko rzv090701@gmail.com
"""
import csv
import numpy
def _scaled_int(row, index, scale, skip=0):
    """
    parses row[index] as an int, ignoring its first `skip` characters,
    and divides the result by `scale`
    :param row: list of str fields of one csv row
    :param index: column to read
    :param scale: divisor applied to the parsed int
    :param skip: number of leading characters to drop before parsing
    :return: the scaled value, or 0 if the field is missing or is not an int
    """
    try:
        return int(row[index][skip:]) / scale
    except (ValueError, IndexError):
        # keep the best-effort behaviour: unusable fields become 0
        return 0


def read_csv(filename):
    """
    reads vehicle features and prices from a csv file
    the first row is treated as the header and is only printed;
    for every following non-empty row:
        col 1 (year of manufacturing) is divided by 1000
        col 2 (price, first character dropped before parsing) is divided by 1000
        col 3 (mileage) is divided by 10000
    a field that is missing or cannot be parsed as an int becomes 0
    :param filename: filename, str
    :return: (vehicles, prices): vehicles is a list of [year, mileage] lists,
             prices is the list of the matching scaled prices
    """
    vehicles = []
    prices = []
    # newline='' lets the csv module do its own newline handling
    with open(filename, newline='') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        header = next(csv_reader, None)
        if header is not None:
            print(f'Column names are {", ".join(header)}')
        for row in csv_reader:
            if not row:
                # csv.reader yields an empty list for a blank line
                continue
            year_of_man = _scaled_int(row, 1, 1000)
            price = _scaled_int(row, 2, 1000, skip=1)
            mileage = _scaled_int(row, 3, 10000)
            vehicles.append([year_of_man, mileage])
            prices.append(price)
    return vehicles, prices
def normalize_data(data, train_fraction=0.66):
    """
    partitions the data into train and test arrays and standardizes both
    with the per-column mean and standard deviation of the train part
    :param data: python list of feature rows, convertible to float64
    :param train_fraction: share of the rows (taken from the start of the
                           list) used for training, the rest is used for testing
    :return: (train_data, test_data) as normalized numpy float64 arrays
    """
    split = int(len(data) * train_fraction)
    train_data = numpy.array(data[:split], dtype='float64')
    test_data = numpy.array(data[split:], dtype='float64')
    # use values computed on train data to normalize train and test data
    mean = train_data.mean(axis=0)
    train_data -= mean
    std = train_data.std(axis=0)
    # a column that is constant in the train part has std == 0; dividing by it
    # would produce nan/inf, so such a column is only centered, not scaled
    std = numpy.where(std == 0, 1.0, std)
    train_data /= std
    test_data -= mean
    test_data /= std
    return train_data, test_data
def partition_targets(target_prices, train_fraction=0.66):
    """
    partitions the prices data into train_targets and test_targets
    and prints the shape of both arrays
    :param target_prices: a list of target prices
    :param train_fraction: share of the prices (taken from the start of the
                           list) that goes to train_targets, the rest goes
                           to test_targets
    :return: train_targets, test_targets as numpy float64 arrays
    """
    # compute the split index once so both slices always agree
    split = int(len(target_prices) * train_fraction)
    train_targets = numpy.array(target_prices[:split], dtype='float64')
    test_targets = numpy.array(target_prices[split:], dtype='float64')
    print(train_targets.shape)
    print(test_targets.shape)
    return train_targets, test_targets
def main():
    """
    runs the preprocessing steps on 'vehicle_info.csv':
    reads the csv, prints the parsed features and their count,
    normalizes the features, prints the first train row and the test shape,
    then partitions the target prices
    """
    features, prices = read_csv('vehicle_info.csv')
    print(features)
    print(len(features))

    train_set, test_set = normalize_data(features)
    print(train_set[0])
    print(test_set.shape)

    partition_targets(prices)


# run the pipeline only when the file is executed as a script
if __name__ == '__main__':
    main()