-
Notifications
You must be signed in to change notification settings - Fork 2
/
commonutils.py
171 lines (151 loc) · 5.45 KB
/
commonutils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
import os
from pathlib import Path
from typing import List, Dict, Set, Tuple
from markkk.logger import logger
# Lookup table from the capital letters "A".."M" to the integers 0..12
# (the position of each letter in that run).
ans_map: Dict[str, int] = {
    letter: position for position, letter in enumerate("ABCDEFGHIJKLM")
}
# Full question-type names. The annotation is a variable-length tuple of
# strings: ``Tuple[str]`` would describe a tuple holding exactly one string.
supported_types: Tuple[str, ...] = (
    "Descriptive",
    "Explanatory",
    "Predictive",
    "Reverse Inference",
    "Counterfactual",
    "Introspection",
)
# Each question type can be written as a one-letter or a one-digit shorthand.
# Rows are (letter, digit, full name).
_QTYPE_ALIASES = (
    ("d", "1", "Descriptive"),
    ("e", "2", "Explanatory"),
    ("p", "3", "Predictive"),
    ("r", "4", "Reverse Inference"),
    ("c", "5", "Counterfactual"),
    ("i", "6", "Introspection"),
)

# Shorthand -> full question-type name; letter shorthands come first,
# followed by the digit shorthands.
abbr_to_qtype_map: Dict[str, str] = {
    **{letter: full_name for letter, _, full_name in _QTYPE_ALIASES},
    **{digit: full_name for _, digit, full_name in _QTYPE_ALIASES},
}
def bump_version(filepath: Path) -> Path:
    """Return a path in the same directory that is not an existing file.

    If ``filepath`` is not an existing file it is returned unchanged.
    Otherwise ``_v2``, ``_v3``, ... is inserted between the original file
    name and its extension until a candidate is found that is not an
    existing file. Every rejected candidate is reported through ``logger``.
    """
    directory, filename = os.path.split(filepath)
    stem, extension = os.path.splitext(filename)

    candidate = filepath
    version = 2  # the first alternative name carries the "_v2" suffix
    while candidate.is_file():
        logger.info(f"{candidate} already exist")
        candidate = Path(directory) / f"{stem}_v{version}{extension}"
        version += 1
    return candidate
def _safe_ratio(numerator: float, denominator: float) -> float:
    """Return ``numerator / denominator``, or 0 when the denominator is 0."""
    return numerator / denominator if denominator != 0 else 0


def get_stat(qa_label_lst: List[Dict]) -> Dict:
    """Summarise a list of labelled video sections.

    Args:
        qa_label_lst: One dict per video section. The keys read here are
            ``"v_ignore"``, ``"re_trim_ts"`` and ``"critical_ts"`` (each
            counted when truthy) and ``"qa_list"``, a list of question
            dicts. From each question dict the keys ``"q_ignore"``,
            ``"q_type"``, ``"q_body"`` and ``"option_lst"`` are read.
            A missing or ``None`` ``"qa_list"``, ``"q_body"`` or
            ``"option_lst"`` is treated as empty.

    Returns:
        A dict mapping human-readable labels to the computed figures:
        section counts, question and option totals/averages, the
        distribution of options per question, and per-type question counts.

    Note:
        Questions flagged with a truthy ``"q_ignore"`` are skipped entirely.
        Questions that belong to a section flagged ``"v_ignore"`` are still
        counted, while that section is excluded from the "labelled" total
        used for the per-video average.
    """
    num_video_sections = len(qa_label_lst)
    num_video_ignore = 0
    num_video_require_retrim = 0
    num_video_has_critical_point = 0

    total_num_qns = 0
    total_num_ops = 0
    total_num_of_chars_in_qn_body = 0
    total_num_of_words_in_qn_body = 0

    # Buckets "0".."13" are pre-seeded so they always show up in the report,
    # even with a zero count. Keys are strings; larger counts are added on
    # demand.
    num_ops_per_qn_map: Dict[str, int] = {str(n): 0 for n in range(14)}
    q_type_count_map: Dict = {}
    q_body_count_map: Dict = {}

    for video_section in qa_label_lst:
        if video_section.get("v_ignore"):
            num_video_ignore += 1
        if video_section.get("re_trim_ts"):
            num_video_require_retrim += 1
        if video_section.get("critical_ts"):
            num_video_has_critical_point += 1

        # ``or []`` keeps a missing/None question list from raising.
        for qa_section in video_section.get("qa_list") or []:
            if qa_section.get("q_ignore"):
                continue

            total_num_qns += 1

            q_type = qa_section.get("q_type")
            q_type_count_map[q_type] = q_type_count_map.get(q_type, 0) + 1

            # A missing/None body counts as an empty question text.
            q_body = qa_section.get("q_body") or ""
            total_num_of_chars_in_qn_body += len(q_body)
            total_num_of_words_in_qn_body += len(q_body.split())
            q_body_count_map[q_body] = q_body_count_map.get(q_body, 0) + 1

            # A missing/None option list counts as zero options.
            num_ops = len(qa_section.get("option_lst") or [])
            ops_key = str(num_ops)
            num_ops_per_qn_map[ops_key] = num_ops_per_qn_map.get(ops_key, 0) + 1
            total_num_ops += num_ops

    # derived values
    num_video_labelled = num_video_sections - num_video_ignore
    average_num_qns_per_video = round(
        _safe_ratio(total_num_qns, num_video_labelled), 1
    )
    average_num_ops_per_qn = round(_safe_ratio(total_num_ops, total_num_qns), 1)
    # Character and word averages are truncated to whole numbers.
    average_num_chars_per_qn = int(
        _safe_ratio(total_num_of_chars_in_qn_body, total_num_qns)
    )
    average_num_words_per_qn = int(
        _safe_ratio(total_num_of_words_in_qn_body, total_num_qns)
    )

    stats: Dict = {
        "Number of video sections found ": num_video_sections,
        "Number of video sections ignored ": num_video_ignore,
        "Number of videos Labelled ": num_video_labelled,
        "Number of videos to be re-trimmed ": num_video_require_retrim,
        "Number of videos has critical point ": num_video_has_critical_point,
        "Total number of questions ": total_num_qns,
        "Average Num of questions per video ": average_num_qns_per_video,
        "Average Num of options per question ": average_num_ops_per_qn,
        "Number of options per question dist ": num_ops_per_qn_map,
        "Average Num of characters per question ": average_num_chars_per_qn,
        "Average Num of words per question ": average_num_words_per_qn,
        "Number of question types ": len(q_type_count_map),
        "Number of unique questions ": len(q_body_count_map),
        "Question Count per question types": q_type_count_map,
    }
    return stats