-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocess.py
102 lines (85 loc) · 2.83 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
"""
Preprocess dataset from import.io to prepare for graphing
"""
import csv
import itertools
# Battery sizes (kWh) that the guess_* helpers know how to recognise.
BATTERY_SIZES = [75, 100, 60, 70, 85, 90]

# Every designation made of an optional 'P' prefix, a battery size and an
# optional 'D' or '+' suffix (e.g. 'P75D', '75+', '75'), mapped to a
# (is_performance, battery_size, is_awd) tuple.  Only the 'D' suffix sets the
# AWD flag; '+' is recognised as part of the name but sets nothing.
#
# Insertion order matters because the guess_* helpers return on the first
# name they find: within one size the longer names ('P75D') are inserted
# before the shorter names they contain ('P75', '75D', '75').
# itertools.product varies its right-most argument fastest, which yields
# exactly the order of three nested loops (size, then prefix, then suffix).
MODEL_NAMES = {
    perf + str(battery_size) + drive: (perf == 'P', battery_size, drive == 'D')
    for battery_size, perf, drive in itertools.product(
        BATTERY_SIZES, ('P', ''), ('D', '+', '')
    )
}
def guess_battery(row):
    """Try to guess the battery kwh size.

    Inspects the ``text_link/_title`` and ``text_description`` cells of *row*
    and returns the first hit, trying the most specific evidence first:

    1. a model designation from MODEL_NAMES that carries a trim marker
       (e.g. ``P90D``, ``75D``), matched case-insensitively in either cell;
    2. a bare size in the title, or a size directly followed by ``kwh`` or
       `` kwh`` (any case) in the description;
    3. a bare size anywhere in the description.

    Bare sizes (``'75'``, ``'90'``, ...) are also keys of MODEL_NAMES but are
    skipped in step 1.  If they were not, any stray number such as the ``75``
    in ``75,000`` would win over an explicit model name belonging to a later
    size, and steps 2 and 3 could never be reached.

    Args:
        row: mapping with ``text_link/_title`` and ``text_description`` keys.

    Returns:
        The matching entry of BATTERY_SIZES, or None when nothing matches.

    Raises:
        KeyError: if either column is missing from *row*.
    """
    # A cell can be None (csv.DictReader pads short rows with None); treat
    # that as empty text rather than crashing on .upper()/.lower().
    title = row['text_link/_title'] or ''
    description = row['text_description'] or ''

    # Case conversions do not depend on the loop variable, so do them once.
    title_upper = title.upper()
    description_upper = description.upper()
    description_lower = description.lower()

    # Step 1: explicit model designations only.
    for model, spec in MODEL_NAMES.items():
        if model.isdigit():
            continue  # bare size: handled by the fallbacks below
        if model in title_upper or model in description_upper:
            return spec[1]

    # Step 2: bare size in the title, or "<size>kwh" / "<size> kwh" in the
    # description.
    for bsize in BATTERY_SIZES:
        label = str(bsize)
        if label in title:
            return bsize
        if label + 'kwh' in description_lower:
            return bsize
        if label + ' kwh' in description_lower:
            return bsize

    # Step 3: last resort, a bare size anywhere in the description.
    for bsize in BATTERY_SIZES:
        if str(bsize) in description:
            return bsize

    return None
def guess_performance(row):
    """Try to guess whether the car is a performance (P) model.

    Looks for a model designation from MODEL_NAMES that carries a trim marker
    (e.g. ``P90D``, ``75D``) in the upper-cased ``text_link/_title`` or
    ``text_description`` cell and returns that designation's performance
    flag.  When no such designation is present, the car counts as a
    performance model if the title contains the word ``performance`` (any
    case).

    Bare sizes (``'75'``, ``'85'``, ...) are also keys of MODEL_NAMES but are
    skipped: they carry no trim information, and matching them would return
    False for a title such as "Model S 85 Performance" before the keyword
    check is ever reached.

    Args:
        row: mapping with ``text_link/_title`` and ``text_description`` keys.

    Returns:
        True or False.

    Raises:
        KeyError: if either column is missing from *row*.
    """
    # A cell can be None (csv.DictReader pads short rows with None); treat
    # that as empty text rather than crashing on .upper()/.lower().
    title = row['text_link/_title'] or ''
    description = row['text_description'] or ''

    # Case conversions do not depend on the loop variable, so do them once.
    title_upper = title.upper()
    description_upper = description.upper()

    for model, spec in MODEL_NAMES.items():
        if model.isdigit():
            continue  # bare size says nothing about the trim level
        if model in title_upper or model in description_upper:
            return spec[0]

    # No explicit designation found: fall back to the keyword in the title.
    return 'performance' in title.lower()
def guess_awd(row):
    """Try to guess whether the car is an AWD (D) model.

    Looks for a model designation from MODEL_NAMES that carries a trim marker
    (e.g. ``P90D``, ``75D``, ``P85+``) in the upper-cased
    ``text_link/_title`` or ``text_description`` cell and returns that
    designation's AWD flag, which is True only for names ending in ``D``.

    Bare sizes (``'75'``, ``'90'``, ...) are also keys of MODEL_NAMES but are
    skipped: they carry no drive information, and matching them would let a
    stray number (say the ``75`` in ``75,000``) return False before an
    explicit ``90D`` belonging to a later size is examined.

    Args:
        row: mapping with ``text_link/_title`` and ``text_description`` keys.

    Returns:
        True or False; False when no designation is found.

    Raises:
        KeyError: if either column is missing from *row*.
    """
    # A cell can be None (csv.DictReader pads short rows with None); treat
    # that as empty text rather than crashing on .upper().
    title_upper = (row['text_link/_title'] or '').upper()
    description_upper = (row['text_description'] or '').upper()

    for model, spec in MODEL_NAMES.items():
        if model.isdigit():
            continue  # bare size says nothing about the drive train
        if model in title_upper or model in description_upper:
            return spec[2]

    return False
# Output column -> where its value comes from.  A string names the input
# column whose cell is copied as-is; a callable is handed the whole input row
# and returns the value (see process_row).  The pair order below is the
# column order of the output.
NAME_MAP = dict((
    ('year', 'text_value_1_numbers'),
    ('miles', 'col_value_numbers'),
    ('price', 'listunstyledh4_value_prices'),
    ('battery', guess_battery),
    ('performance', guess_performance),
    ('awd', guess_awd),
))

# The output column names, in NAME_MAP order.
OUTPUT_KEYS = NAME_MAP.keys()
def process_row(row):
    """Build one output record from one input record.

    Every NAME_MAP entry contributes one field: a string source is looked up
    as a key of *row* and the cell is copied unchanged, anything else is
    called with *row* and its return value is stored.
    """
    def resolve(source):
        # Strings are input column names; everything else is a guess_* style
        # callable taking the whole row.
        if isinstance(source, str):
            return row[source]
        return source(row)

    return {field: resolve(source) for field, source in NAME_MAP.items()}
def process_file(input_path='dataset.csv',
                 output_path='dataset.processed.csv'):
    """Convert the raw CSV export into the processed CSV.

    Reads every row of *input_path* with csv.DictReader, converts each one
    with process_row() and writes the results, preceded by a header row, to
    *output_path*.  The output columns are the keys of NAME_MAP, in order.

    Args:
        input_path: CSV file to read.  Defaults to ``dataset.csv``.
        output_path: CSV file to create or overwrite.  Defaults to
            ``dataset.processed.csv``.
    """
    # The csv module expects files opened with newline='' so that it can do
    # its own line-ending handling; without it, embedded newlines in quoted
    # cells are mangled on input and Windows output gets a blank line after
    # every row.
    with open(input_path, newline='') as csvfile:
        output = [process_row(row) for row in csv.DictReader(csvfile)]

    # The output file is opened only after every row has been processed, so
    # a failure part-way through the input leaves an existing output intact.
    with open(output_path, 'w', newline='') as outputfile:
        writer = csv.DictWriter(outputfile, fieldnames=list(NAME_MAP))
        writer.writeheader()
        writer.writerows(output)
# Run the conversion only when this file is executed as a script, so that
# importing the module has no file-system side effects.
if __name__ == '__main__':
    process_file()