-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerate_hf_dataset.py
46 lines (31 loc) · 1.66 KB
/
generate_hf_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import csv
def txts_to_csv(txt_file_paths, csv_file_path):
    """Merge paired-line text files into one CSV file.

    Every input file is read as consecutive two-line records: the first
    line of each pair is the utterance and the second is its logical form.
    The output starts with the header ``id, utterance, logical_form``; the
    ``id`` column is a single counter that starts at 0 and keeps increasing
    across all input files.

    Args:
        txt_file_paths: Iterable of paths to the text files, processed in
            the order given.
        csv_file_path: Path of the CSV file to write. It is opened in
            ``"w"`` mode, so an existing file is overwritten.

    Raises:
        ValueError: If an input file has an odd number of lines, i.e. its
            last utterance has no logical form. The header and the rows of
            any earlier files have already been written at that point.
    """
    row_id = 0
    # newline="" is what the csv module requires for files it writes to;
    # the explicit encoding keeps the output independent of the locale.
    with open(csv_file_path, "w", newline="", encoding="utf-8") as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(["id", "utterance", "logical_form"])
        for txt_file_path in txt_file_paths:
            with open(txt_file_path, "r", encoding="utf-8") as txt_file:
                # Drop the line terminator so it does not end up inside the
                # CSV fields (the last line of a file may not have one).
                lines = [line.rstrip("\n") for line in txt_file]
            if len(lines) % 2:
                raise ValueError(
                    f"{txt_file_path}: expected an even number of lines "
                    f"(utterance/logical-form pairs), got {len(lines)}"
                )
            # Even-indexed lines are utterances, odd-indexed lines are the
            # logical forms that follow them.
            for utterance, logical_form in zip(lines[::2], lines[1::2]):
                csv_writer.writerow([row_id, utterance, logical_form])
                row_id += 1
# Input text files for each split, one per source directory.
train_text_file_paths = [
    "gen/train.txt",
    "gen_logical/train.txt",
    "para/train.txt",
    "para_logical/train.txt",
]
test_text_file_paths = [
    "gen/test.txt",
    "gen_logical/test.txt",
    "para/test.txt",
    "para_logical/test.txt",
]
validation_text_file_paths = [
    "gen/val.txt",
    "gen_logical/val.txt",
    "para/val.txt",
    "para_logical/val.txt",
]

# Destination CSV file for each split.
train_csv_file_path = "train.csv"
test_csv_file_path = "test.csv"
validation_csv_file_path = "validation.csv"

# Generation of the three CSV files. The calls are disabled here; uncomment
# them to rebuild the CSVs from the text files listed above.
# txts_to_csv(train_text_file_paths, train_csv_file_path)
# txts_to_csv(test_text_file_paths, test_csv_file_path)
# txts_to_csv(validation_text_file_paths, validation_csv_file_path)
from datasets import load_dataset
# Split name -> CSV file passed to the loader for that split.
_split_csv_files = {
    "train": "train.csv",
    "validation": "validation.csv",
    "test": "test.csv",
}

# Load the CSV files with the "csv" loader; the files must already exist in
# the working directory, since the generation calls are not run by default.
dataset = load_dataset("csv", data_files=_split_csv_files)

# Push the loaded dataset to the Hugging Face Hub as "robot_commands",
# requesting a private repository.
dataset.push_to_hub("robot_commands", private=True)