forked from sgrvinod/a-PyTorch-Tutorial-to-Image-Captioning
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpostProcSentences.py
78 lines (70 loc) · 2.34 KB
/
postProcSentences.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
from datasets import CaptionDataset
from tqdm import tqdm
import os
import spacy
from utils import get_word_synonyms
import re
# Pairs of object category names that are searched for together in a caption.
# Each name is used as a key into the synonym dictionary.
common_pairs = [
    ('person', 'car'),
    ('person', 'chair'),
    ('dining table', 'chair'),
    ('cup', 'bottle'),
    ('person', 'handbag'),
    ('backpack', 'handbag'),
]


def build_synonym_dict(synonyms):
    """Index synonym groups by their first entry.

    Each group is stored whole, so the key itself is also one of the
    candidate words of its group.
    """
    return {group[0]: group for group in synonyms}


def caption_to_string(caption):
    """Join caption tokens into one space-separated string.

    The first and last tokens are dropped (presumably start/end markers --
    TODO confirm against CaptionDataset) and every '<unk>' token is removed.
    """
    return ' '.join(token for token in caption[1:-1] if token != '<unk>')


def first_mention(candidates, text):
    """Return the first candidate occurring in ``text`` as a whole word.

    Candidates are matched literally (regex metacharacters are escaped).
    Returns ``None`` when no candidate occurs.
    """
    for candidate in candidates:
        pattern = r"\b{}\b".format(re.escape(candidate))
        if re.search(pattern, text) is not None:
            return candidate
    return None


def find_pair_nouns(caption_string, pairs, synonym_dict):
    """Find the first pair whose two members are both mentioned in the caption.

    For every ``(first, second)`` in ``pairs`` the synonym groups of both names
    are searched in ``caption_string``. The match state is evaluated per pair:
    a hit for one member of a pair is never combined with a hit that belongs
    to a different pair.

    Returns:
        ``[noun_a, noun_b]`` -- the matched synonym of each member -- for the
        first pair that is fully present, or ``None`` if there is none.

    Raises:
        KeyError: if a pair name is missing from ``synonym_dict``.
    """
    for first_name, second_name in pairs:
        first_noun = first_mention(synonym_dict[first_name], caption_string)
        if first_noun is None:
            continue
        second_noun = first_mention(synonym_dict[second_name], caption_string)
        if second_noun is not None:
            return [first_noun, second_noun]
    return None


def mask_modifiers(doc, nouns):
    """Build a keep-mask (1 = keep, 0 = drop) over the tokens of ``doc``.

    Only noun chunks whose root text is one of ``nouns`` are considered. Within
    such a chunk a token is dropped when its head's text is one of ``nouns``,
    its part of speech is neither 'DET' nor 'PROPN', and its text differs from
    the chunk root's text.

    NOTE(review): chunk roots are compared as single token texts, so a
    multi-word synonym (e.g. 'dining table') presumably never matches here --
    confirm whether that is intended.
    """
    word_mask = [1] * len(doc)
    for chunk in doc.noun_chunks:
        root_text = chunk.root.text
        print('Test - {}'.format(root_text))
        if root_text not in nouns:
            continue
        for word in chunk:
            if (word.head.text in nouns
                    and word.pos_ not in ('DET', 'PROPN')
                    and word.text != root_text):
                word_mask[word.i] = 0
    return word_mask


def main(max_modified=1):
    """Scan training captions and print the ones that get modified.

    For every caption that mentions both members of one of ``common_pairs``,
    the caption is parsed with spaCy and the tokens selected by
    ``mask_modifiers`` are removed. Each modified caption is reported by
    printing the matched nouns, the original string and the shortened string.

    Args:
        max_modified: stop after this many modified captions have been
            printed; ``None`` processes the whole dataset.

    NOTE(review): the indentation of the original script was lost. The default
    of 1 reproduces an ``exit()`` placed directly after the three prints, i.e.
    stopping at the first modified caption -- confirm this was the intent.
    """
    dataset = CaptionDataset('dataset/output',
                             'coco_5_cap_per_img_5_min_word_freq',
                             'TRAIN',
                             None,
                             minimal=True)
    nlp = spacy.load("en_core_web_sm")
    synonym_dict = build_synonym_dict(get_word_synonyms())

    modified_sentences = 0
    for data in tqdm(dataset):
        caption_string = caption_to_string(data[1])

        nouns = find_pair_nouns(caption_string, common_pairs, synonym_dict)
        if nouns is None:
            continue

        doc = nlp(caption_string)
        word_mask = mask_modifiers(doc, nouns)
        if sum(word_mask) == len(doc):
            # Nothing was masked, so the caption is unchanged.
            continue

        final_sentence = ' '.join(
            word.text for word, keep in zip(doc, word_mask) if keep == 1)
        modified_sentences += 1
        print(nouns)
        print(caption_string)
        print(final_sentence)

        if max_modified is not None and modified_sentences >= max_modified:
            break


if __name__ == "__main__":
    main()