-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfeature-extraction.py
59 lines (44 loc) · 1.9 KB
/
feature-extraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
# Path of the CSV file read by read_big_data_train_data() and read_train_data().
# The alternative file name 'train-arrays.csv' is kept for reference
# (presumably the full data set, with the "-min" file being a reduced sample
# -- TODO confirm).
TRAIN_DATA = '_feature-extraction-min/train-arrays-min.csv' # 'train-arrays.csv'
# Module-level accumulator: feature_extraction() adds one dict of statistics
# per processed row. Nothing in this file ever clears it, so it keeps growing
# across calls.
FEATURES = []
def reformat_array():
    """
    Build the parser applied to the raw 'array' column.

    The returned callable takes a string such as '[123 324 567]', drops its
    first and last character (the surrounding brackets), splits the rest on
    whitespace and casts every piece to int.

    :return: callable mapping a string to a list of integers
    """
    def parse(text):
        inner = text[1:-1]
        return list(map(int, inner.split()))

    return parse
def read_big_data_train_data():
    """
    Stream TRAIN_DATA in chunks of 10 rows and store each chunk in DATABASE.

    For every chunk the raw rows are appended to the 'train_data' table, then
    the statistics returned by feature_extraction() for that chunk are
    appended to the 'train_features_data' table.

    Malformed CSV lines are skipped with a warning instead of aborting the
    import.

    :return: None
    """
    csv_options = dict(
        iterator=True,
        encoding='latin1',
        names=["index", "length", "array"],
        chunksize=10,
    )
    try:
        # pandas >= 1.3: 'warn' reproduces the old error_bad_lines=False
        # behaviour (skip the line, emit a warning).
        chunks = pd.read_csv(TRAIN_DATA, on_bad_lines='warn', **csv_options)
    except TypeError:
        # pandas < 1.3 does not know on_bad_lines; error_bad_lines was
        # removed in pandas 2.0, so it is only used as a fallback here.
        chunks = pd.read_csv(TRAIN_DATA, error_bad_lines=False, **csv_options)

    for df in chunks:
        # The raw chunk is stored first: feature_extraction() rewrites the
        # 'array' column of the frame it receives.
        df.to_sql('train_data', DATABASE, if_exists='append')
        features = feature_extraction(df)
        features.to_sql('train_features_data', DATABASE, if_exists='append')
def read_train_data():
    """
    Load the whole TRAIN_DATA file into memory at once.

    The file is read without a header row; its columns are labelled
    "ID", "length" and "array". Malformed CSV lines are skipped with a
    warning instead of aborting the read.

    :return: DataFrame with the columns "ID", "length" and "array"
    """
    csv_options = dict(encoding='latin1', names=["ID", "length", "array"])
    try:
        # pandas >= 1.3: 'warn' reproduces the old error_bad_lines=False
        # behaviour (skip the line, emit a warning).
        return pd.read_csv(TRAIN_DATA, on_bad_lines='warn', **csv_options)
    except TypeError:
        # pandas < 1.3 does not know on_bad_lines; error_bad_lines was
        # removed in pandas 2.0, so it is only used as a fallback here.
        return pd.read_csv(TRAIN_DATA, error_bad_lines=False, **csv_options)
def feature_extraction(df_train):
    """
    Compute summary statistics for every row of df_train.

    The 'array' column is parsed in place with reformat_array(), so after the
    call it holds lists of integers instead of strings.

    The returned frame contains only the rows of this call. Returning the
    whole module-level FEATURES list (as was done before) made callers that
    process the data chunk by chunk store the features of all earlier chunks
    again with every new chunk. FEATURES is still extended with the new rows
    so that it keeps accumulating everything processed so far.

    :param df_train: DataFrame with an 'array' column of strings such as
        '[123 324 567]' and an identifier column named 'index' (or 'ID', the
        label used by read_train_data())
    :return: DataFrame with one row per row of df_train and the columns
        index, length, max, min, dist_min_max, average, mean, q1, q2, q3,
        std_deviation, variance
    """
    df_train['array'] = df_train['array'].apply(reformat_array())

    # read_train_data() labels the identifier column "ID" while the chunked
    # reader labels it "index"; accept both so either frame can be passed in.
    columns = df_train.columns
    id_column = 'ID' if 'index' not in columns and 'ID' in columns else 'index'

    rows = []
    for row_id, values in zip(df_train[id_column], df_train['array']):
        array = np.array(values)
        smallest = array.min()
        largest = array.max()
        rows.append({
            'index': row_id,
            'length': len(array),
            'max': largest,
            'min': smallest,
            'dist_min_max': largest - smallest,
            # No weights are passed, so this equals the arithmetic mean; the
            # key is kept so the stored table layout does not change.
            'average': np.average(array, axis=0),
            'mean': np.mean(array, axis=0),
            'q1': np.percentile(array, 25),
            'q2': np.percentile(array, 50),  # median
            'q3': np.percentile(array, 75),
            'std_deviation': np.std(array, axis=0),
            'variance': np.var(array, axis=0),
        })

    # Keep the module-level accumulator up to date for code that reads it.
    FEATURES.extend(rows)
    return pd.DataFrame(rows)
# SQLAlchemy engine for the SQLite database file database2.db.
# read_big_data_train_data() looks DATABASE up as a module global, so it has
# to be bound before the call below.
DATABASE = create_engine('sqlite:///database2.db')
# The import runs as soon as this module is executed or imported; there is no
# __main__ guard.
read_big_data_train_data()
print('The end')