forked from l1ghtsource/ozon-ecup-matching
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerate_features_train.py
31 lines (22 loc) · 1.19 KB
/
generate_features_train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import pandas as pd
from pathlib import Path
from data_preprocessing import main_preprocessing
from feature_generation import feature_generation
# Folder holding the training parquet files, relative to the working directory.
DATA_FOLDER_PATH_TRAIN = Path('./data/train')
# Input tables loaded by main().
ATTRIBUTES_PATH = DATA_FOLDER_PATH_TRAIN / 'attributes.parquet'
RESNET_PATH = DATA_FOLDER_PATH_TRAIN / 'resnet.parquet'
# NOTE(review): 'text_and_bertt' has a doubled 't' -- presumably the file is
# 'text_and_bert.parquet'; confirm against the data folder before changing.
TEXT_AND_BERT_PATH = DATA_FOLDER_PATH_TRAIN / 'text_and_bertt.parquet'
TRAIN_PATH = DATA_FOLDER_PATH_TRAIN / 'train.parquet'
def main():
    """Build the training feature table and save it as parquet.

    Steps:
      1. Read the attributes, ResNet, text/BERT and train parquet files.
      2. Join the first three tables into one frame keyed by ``variantid``.
      3. Replace missing ``description`` values with the ``'no desc'``
         placeholder.
      4. Run ``main_preprocessing`` in ``'train'`` mode, then
         ``feature_generation`` against the train table.
      5. Write the result to ``train_feature_df.parquet`` in the training
         data folder.

    Raises:
        pandas.errors.MergeError: If ``variantid`` is duplicated in any of
            the joined tables (the join is validated as one-to-one).
    """
    attributes = pd.read_parquet(ATTRIBUTES_PATH, engine='pyarrow')
    resnet = pd.read_parquet(RESNET_PATH, engine='pyarrow')
    text_and_bert = pd.read_parquet(TEXT_AND_BERT_PATH, engine='pyarrow')
    train = pd.read_parquet(TRAIN_PATH, engine='pyarrow')

    # Join on the key rather than pasting columns side by side: a column-wise
    # concat aligns on the row index, so files stored in a different row order
    # (or with a different row count) would silently pair attributes,
    # embeddings and text that belong to different products. For files that
    # are already row-aligned the result is the same frame as before.
    data = attributes.merge(
        resnet, on='variantid', how='inner', validate='one_to_one',
    )
    data = data.merge(
        text_and_bert, on='variantid', how='inner', validate='one_to_one',
    )

    data['description'] = data['description'].fillna('no desc')
    data = main_preprocessing(data, mode='train')

    train_features_df = feature_generation(data, train)
    train_features_df.to_parquet(
        DATA_FOLDER_PATH_TRAIN / 'train_feature_df.parquet',
        engine='pyarrow',
    )
    # TODO: add the call that trains the BERT models here.
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()