forked from horsepurve/DeepRTplus
-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_split.py
32 lines (29 loc) · 1.1 KB
/
data_split.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
'''
Data splitting is not performed while running the ResNet or LSTM training.
Instead, it is done once, up front, before anything else.
'''
import sys
import os
import pandas as pd
import random
# Positional command-line arguments, in order:
#   1: path of the full data file
#   2: training share, 3: testing share (integers, used as a ratio)
#   4: integer seed for the shuffle
total_data = sys.argv[1]
train_ratio, test_ratio, random_seed = (
    int(sys.argv[2]),
    int(sys.argv[3]),
    int(sys.argv[4]),
)
def _write_subset(path, records):
    # Write the fixed header line followed by one record per line.
    with open(path, 'w') as f:
        f.write('sequence\tRT\n')
        # One line per record: an empty subset yields a header-only file
        # rather than a header followed by a stray blank line.
        f.writelines(record + '\n' for record in records)


def split_data(data_path, train_ratio, test_ratio, seed):
    '''
    Shuffle the records of a data file and write a training and a testing subset.

    The records are the values of the first column of ``pd.read_csv(data_path)``;
    the first line of the file is consumed by ``read_csv`` as the header.
    NOTE(review): with the default comma separator a tab-separated
    "sequence<TAB>RT" line presumably stays in that single column, which is why
    records are written back verbatim -- confirm against the real input files.

    Parameters:
        data_path: path of the full data file.
        train_ratio: relative share of records written to the training file.
        test_ratio: relative share of records written to the testing file.
        seed: value passed to ``random.seed`` before shuffling; it is also
            embedded in the output file names.

    Output files (each starts with the header line "sequence<TAB>RT"):
        <stem>_train_<seed>.txt and <stem>_test_<seed>.txt, where <stem> is
        ``data_path`` with its extension removed.

    Raises:
        ValueError: if a ratio is negative or the ratios do not sum to a
            positive number.
    '''
    total_ratio = train_ratio + test_ratio
    if train_ratio < 0 or test_ratio < 0 or total_ratio <= 0:
        raise ValueError(
            'train_ratio and test_ratio must be non-negative and sum to a '
            'positive number, got {} and {}'.format(train_ratio, test_ratio)
        )

    data = pd.read_csv(data_path).iloc[:, 0].values.tolist()

    # Seeding the module-level generator makes the shuffle reproducible.
    random.seed(seed)
    random.shuffle(data)

    # The first `cut` shuffled records go to training, the rest to testing.
    cut = int(len(data) * train_ratio / total_ratio)

    # splitext strips whatever extension is present instead of assuming that
    # it is exactly four characters long (e.g. ".txt").
    stem = os.path.splitext(data_path)[0]
    _write_subset(stem + '_train_' + str(seed) + '.txt', data[:cut])
    _write_subset(stem + '_test_' + str(seed) + '.txt', data[cut:])
# Script driver: announce the seed in use, perform the split, report completion.
print('Now split the data into training and testing data sets with the seed:{}\n'.format(random_seed))
split_data(total_data, train_ratio, test_ratio, random_seed)
print('Done.')