-
Notifications
You must be signed in to change notification settings - Fork 0
/
dataset_stats.py
67 lines (46 loc) · 1.9 KB
/
dataset_stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import json
from collections import Counter

from datasets import load_dataset
# Areas listed in the per-area table at the end of the report, in print order.
REPORT_AREAS = (
    'Reflections', 'Questions', 'Suggestions', 'Validation',
    'Self-disclosure', 'Empathy', 'Professionalism', 'Structure',
)


def collect_stats(records):
    """Tally feedback statistics over an iterable of annotation records.

    Each record must provide ``'conv_index'`` and ``'text'``.  The part of
    ``'text'`` that follows the first ``"Response:"`` marker is parsed as
    JSON and must contain ``'perfect'`` and ``'goodareas'``; records whose
    ``'perfect'`` value is falsy must also contain ``'badareas'``,
    ``'alternative'`` and ``'feedback'``.

    Returns a dict with the keys ``nr_sessions``, ``nr_utterances``,
    ``count_perfect``, ``count_imperfect``, ``good_areas``, ``bad_areas``,
    ``avg_alternative_length`` and ``avg_goal_length``.

    Raises ``IndexError`` when a text has no ``"Response:"`` marker and
    ``json.JSONDecodeError`` when the payload is not valid JSON.
    """
    count_perfect = 0
    count_imperfect = 0
    bad_areas = Counter()
    good_areas = Counter()
    alternative_words = 0
    feedback_words = 0
    nr_utterances = 0
    # Start below any valid index so an empty input reports zero sessions.
    # presumably conv_index is a zero-based session id; verify against the
    # dataset card.
    max_conv_index = -1

    for ann in records:
        nr_utterances += 1
        max_conv_index = max(max_conv_index, ann['conv_index'])
        feedback = json.loads(ann['text'].split("Response:")[1])

        # NOTE(review): indentation was not visible in the reviewed copy.
        # The word counts are placed in the imperfect branch because the
        # averages are divided by the imperfect count; good areas are
        # tallied for every record. Confirm against the original file.
        if feedback['perfect']:
            count_perfect += 1
        else:
            count_imperfect += 1
            for area in feedback['badareas']:
                bad_areas[area] += 1
            # Lengths are measured in whitespace-separated words.
            alternative_words += len(feedback['alternative'].split())
            feedback_words += len(feedback['feedback'].split())

        for area in feedback['goodareas']:
            good_areas[area] += 1

    # Without any imperfect record there is nothing to average over;
    # report 0.0 instead of dividing by zero.
    if count_imperfect:
        avg_alternative_length = alternative_words / count_imperfect
        avg_goal_length = feedback_words / count_imperfect
    else:
        avg_alternative_length = 0.0
        avg_goal_length = 0.0

    return {
        'nr_sessions': max_conv_index + 1,
        'nr_utterances': nr_utterances,
        'count_perfect': count_perfect,
        'count_imperfect': count_imperfect,
        # Plain dicts keep the printed representation a bare ``{...}``.
        'good_areas': dict(good_areas),
        'bad_areas': dict(bad_areas),
        'avg_alternative_length': avg_alternative_length,
        'avg_goal_length': avg_goal_length,
    }


def format_report(stats):
    """Return the report lines for a dict produced by ``collect_stats``."""
    good_areas = stats['good_areas']
    bad_areas = stats['bad_areas']
    lines = [
        f"Nr of sessions: {stats['nr_sessions']}",
        f"Nr of utterances: {stats['nr_utterances']}",
        f"Nr of perfect: {stats['count_perfect']}",
        f"Nr of imperfect: {stats['count_imperfect']}",
        f"Good areas: {good_areas}",
        f"Bad areas: {bad_areas}",
        f"Avg alternative length: {stats['avg_alternative_length']}",
        f"Avg goal length: {stats['avg_goal_length']}",
    ]
    # An area that never occurred is shown as 0 rather than raising KeyError.
    lines.extend(
        f"{area} {bad_areas.get(area, 0)} {good_areas.get(area, 0)}"
        for area in REPORT_AREAS
    )
    return lines


if __name__ == '__main__':
    dataset = load_dataset("avylor/feedback_qesconv")
    for line in format_report(collect_stats(dataset['train'])):
        print(line)