-
Notifications
You must be signed in to change notification settings - Fork 0
/
sailor_text_postprocessors.py
141 lines (120 loc) · 4.74 KB
/
sailor_text_postprocessors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import re
from typing import Callable, Optional, Union
from opencompass.registry import TEXT_POSTPROCESSORS
@TEXT_POSTPROCESSORS.register_module('general_pred')
def general_pred_postprocess(text: str) -> str:
    """Reduce a raw model prediction to one cleaned-up answer line.

    Processing order:
      1. Keep only the text before the first newline.
      2. Drop leading whitespace.
      3. If the line contains one of the known answer / language /
         summary prompts (compared case-insensitively), keep only what
         follows the last occurrence of that prompt. Prompts are tried
         in the order listed below and only the first one found is
         applied.
      4. Remove punctuation and whitespace that trail the last word
         character, collapse whitespace runs to a single space, and
         remove whitespace in front of commas and periods.

    Args:
        text: Raw prediction text.

    Returns:
        The cleaned first line of ``text`` with surrounding whitespace
        stripped.
    """
    # Cut off everything from the first newline onwards
    text = text.partition('\n')[0]
    # Remove blank spaces before words
    text = re.sub(r'^\s*', '', text)

    # An ordered tuple (not a set) so the prompt that wins is the same on
    # every run, independent of string-hash iteration order.
    prompts = (
        "คำตอบ: ",  # Answer:
        "Trả lời: ",  # Answer:
        "Jawaban: ",  # Answer:
        "ไทย: ",  # Thai:
        "Bahasa Indonesia: ",  # Indonesian:
        "Tiếng Việt: ",  # Vietnamese:
        "Bahasa Inggris: ",  # English:
        "อังกฤษ: ",  # English:
        "Tiếng Anh: ",  # English:
        "สรุป: ",  # Summary:
        "Ringkasan: ",  # Summary:
    )
    for prompt in prompts:
        # Case-insensitive split: comparing a lower-cased prompt with the
        # untouched text would miss "Jawaban: " and friends.
        pieces = re.split(re.escape(prompt), text, flags=re.IGNORECASE)
        if len(pieces) > 1:
            text = pieces[-1]
            break

    # Remove punctuation after the text
    no_punctuation = re.sub(r'(\w)[^\w\s]*\s*$', r'\1', text)
    # Remove duplicated blank spaces
    cleaned_text = re.sub(r'\s+', ' ', no_punctuation)
    # Remove blank spaces before comma
    cleaned_text = re.sub(r'\s*,', ',', cleaned_text)
    # Remove blank spaces before period
    cleaned_text = re.sub(r'\s*\.', '.', cleaned_text)
    return cleaned_text.strip()
@TEXT_POSTPROCESSORS.register_module('general_ans')
def general_ans_postprocess(text: str) -> str:
    """Tidy an answer string's whitespace and trailing punctuation.

    The substitutions below are applied one after another, in order, and
    the result is finally stripped of surrounding whitespace.

    Args:
        text: Answer text to clean.

    Returns:
        The cleaned string.
    """
    substitutions = (
        (r'^\s*', ''),  # leading blank spaces
        (r'(\w)[^\w\s]*\s*$', r'\1'),  # punctuation after the text
        (r'\s+', ' '),  # duplicated blank spaces
        (r'\s*,', ','),  # blank spaces before a comma
        (r'\s*\.', '.'),  # blank spaces before a period
    )
    result = text
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)
    return result.strip()
def first_option_parse(text: str, options: str, cushion: bool = True) -> str:
    """Find the first valid option mentioned in ``text``.

    A fixed list of regular expressions (mostly Chinese and English
    "the answer is X" phrasings) is tried in order. For the first
    pattern that matches, the whole matched substring is scanned and the
    first character of ``options`` (in ``options`` order) found in it is
    returned. If the matched substring contains no option character, the
    next pattern is tried.

    Args:
        text: Text to search.
        options: String whose individual characters are the valid option
            labels, e.g. ``'ABCD'``.
        cushion: When true, two looser fallback patterns are appended to
            the pattern list.

    Returns:
        The selected option character, or ``''`` when nothing matches or
        ``options`` is empty.
    """
    if not options:
        # An empty character class ("[]") is not a valid regex.
        return ''

    # Escape so that characters such as ']', '^', '-' or '\\' in `options`
    # are taken literally inside the character classes below.
    opts = re.escape(options)

    # Raw f-strings: the pattern text is the same as before, but "\s",
    # "\." and "\S" are no longer invalid string escape sequences.
    # NOTE(review): the second and third patterns are identical, and several
    # character classes repeat ',' and ':'; possibly full-width punctuation
    # was lost to normalization somewhere -- confirm against upstream.
    # yapf: disable
    # flake8: noqa: W605
    patterns = [
        rf'答案是?\s?([{opts}])',
        rf'答案是?\s?:([{opts}])',
        rf'答案是?\s?:([{opts}])',
        rf'答案应该?是\s?([{opts}])',
        rf'答案应该?选\s?([{opts}])',
        rf'答案为\s?([{opts}])',
        rf'答案选\s?([{opts}])',
        rf'选择?\s?([{opts}])',
        rf'只有选?项?\s?([{opts}])\s?是?对',
        rf'只有选?项?\s?([{opts}])\s?是?错',
        rf'只有选?项?\s?([{opts}])\s?不?正确',
        rf'只有选?项?\s?([{opts}])\s?错误',
        rf'说法不?对选?项?的?是\s?([{opts}])',
        rf'说法不?正确选?项?的?是\s?([{opts}])',
        rf'说法错误选?项?的?是\s?([{opts}])',
        rf'([{opts}])\s?是正确的',
        rf'([{opts}])\s?是正确答案',
        rf'选项\s?([{opts}])\s?正确',
        rf'所以答\s?([{opts}])',
        rf'1.\s?([{opts}])[.。$]?',
        rf'[\s,::,]([{opts}])[。,,\.]?',
        rf'[\s,,::][故即]([{opts}])[。\.]?',
        rf'[\s,,::]因此([{opts}])[。\.]?',
        rf'[是为。]\s?([{opts}])[。\.]?',
        rf'因此\s?([{opts}])[。\.]?',
        rf'显然\s?([{opts}])[。\.]?',
        r'1.\s?(.*?)',
        rf'1.\s?([{opts}])[.。$]?$',
        rf'所以\s?([{opts}][.。$]?$)',
        rf'所有\s?([{opts}][.。$]?$)',
        rf'[\s,::,]([{opts}])[。,,\.]?$',
        rf'[\s,,::][故即]([{opts}])[。\.]?$',
        rf'[\s,,::]因此([{opts}])[。\.]?$',
        rf'[是为。]\s?([{opts}])[。\.]?$',
        rf'因此\s?([{opts}])[。\.]?$',
        rf'显然\s?([{opts}])[。\.]?$',
        r'答案是\s?(\S+)(?:。|$)',
        r'答案应该是\s?(\S+)(?:。|$)',
        r'答案为\s?(\S+)(?:。|$)',
        rf'[Tt]he answer is ([{opts}])',
        rf'[Tt]he answer is option ([{opts}])',
        rf'[Tt]he correct answer is ([{opts}])',
        rf'[Tt]he correct answer is option ([{opts}])',
        rf'[Tt]he answer to the question is ([{opts}])',
        rf'^选项\s?([{opts}])',
        rf'^([{opts}])\s?选?项',
        rf'(\s|^)[{opts}][\s。,,::\.$]',
        rf'(\s|^)[{opts}](\s|$)',
        r'1.\s?(.*?)$',
        rf'1.\s?([{opts}])[.。$]?$',
    ]
    cushion_patterns = [
        rf'([{opts}]):',
        rf'[{opts}]',
    ]
    # flake8: noqa
    # yapf: enable

    if cushion:
        patterns.extend(cushion_patterns)

    for pattern in patterns:
        match = re.search(pattern, text)
        if match is None:
            continue
        # The whole match is inspected, not just the capture group, and
        # options are tried in the order they appear in `options`.
        matched = match.group(0)
        for option in options:
            if option in matched:
                return option
    return ''