train_val_jsonl.py
import re
import json
import os

# RetrievalDataset / XLMRobertaTokenizer are only needed for the commented-out
# BEiT-3 index-building step at the bottom of this script.
from datasets import RetrievalDataset
from transformers import XLMRobertaTokenizer

file_path = "../data/val/val_people_labels.txt"
file_json_save_path = "../data/val/val_people.json"
split = 'val'
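
# Assumed input layout (illustrative; only parts[0] and parts[2] are used below):
# each line of the label file is "$"-separated, e.g.
#   000001.jpg$<unused middle field>$A man in a red shirt, a dog beside him.
# and the captions inside the third field are separated by ',' or '.'.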
if os.path.exists(file_path):
    image_all = []
    with open(file_path) as f:
        sent_num = 0        # global sentence-id counter
        num = 0             # image-id counter
        max_captions = 5    # skip entries with more than this many captions
        n = 0               # number of skipped lines
        for line in f:
            line = line.strip()
            parts = line.split("$")
            image_name = parts[0]
            text = parts[2]
            # split the caption field into individual captions on ',' and '.'
            vert = re.split('[,.]', text)
            vert.pop()  # drop the empty trailing element left after the final '.' or ','
            if len(vert) > max_captions:
                print(vert)  # log descriptions skipped for having more than five captions
                n += 1
                continue
            sentids = list(range(sent_num, sent_num + len(vert)))
            sentences = []
            for i in range(len(vert)):
                # lower-case, split on spaces, and drop empty tokens
                tokens = [t for t in vert[i].lower().split(' ') if t != '']
                sentences.append({
                    'tokens': tokens,
                    'raw': vert[i],
                    'imgid': num,
                    'sentid': sentids[i]
                })
            image = {
                'sentids': sentids,
                'imgid': num,
                'sentences': sentences,
                'split': split,
                'filename': image_name
            }
            num += 1
            # advance the global counter by the number of captions so sentids stay unique across images
            sent_num += len(vert)
            image_all.append(image)
    data = {
        'images': image_all,
        'dataset': 'flickr30k'
    }
    if os.path.exists(file_json_save_path):
        print("json file already exists.")
        os.remove(file_json_save_path)
        print("deleted the existing file.")
    with open(file_json_save_path, 'w') as f:
        json.dump(data, f)
    print("saved json file successfully!")

# Optional follow-up (BEiT-3): build the flickr30k-style dataset index files
# from the JSON generated above by uncommenting the lines below.
# tokenizer = XLMRobertaTokenizer("/home/has/lisu/beit3/model_pth/beit3.spm")
# RetrievalDataset.make_flickr30k_dataset_index(
#     data_path="../data/val",
#     tokenizer=tokenizer,
#     karpathy_path="../data/val",
# )