-
Notifications
You must be signed in to change notification settings - Fork 0
/
qword_stat.py
75 lines (59 loc) · 2.61 KB
/
qword_stat.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import argparse
from collections import defaultdict
from pprint import pprint
from pathlib import Path
import dill
import stanza
from tqdm import tqdm
from quinductor.loaders import *
def pos_morph_tag(word):
    """Return the ``UPOS/feats`` tag of a parsed word.

    The ``|``-separated morphological features are sorted so that the same
    feature set always yields the same tag. Words without features are
    tagged with their bare UPOS.
    """
    if word.feats:
        return word.upos + '/' + '|'.join(sorted(word.feats.split('|')))
    return word.upos


def find_root(words):
    """Return the first word whose ``deprel`` is ``'root'``, or None if there is none."""
    return next((w for w in words if w.deprel == 'root'), None)


if __name__ == '__main__':
    # Collects two frequency tables from question/answer pairs and stores
    # them with dill in the --save folder:
    #   qwstats.dill: question word (lower-cased) -> tag of the answer's root -> count
    #   atmpl.dill:   space-joined tag sequence of the answer -> count
    # Only the first sentence of each parsed question/answer is inspected.
    parser = argparse.ArgumentParser()
    parser.add_argument('-s', '--save', required=True, type=str, help="A folder to save to")
    parser.add_argument('-l', '--lang', type=str, help='A language for stats generation (en, sv are currently supported)')
    # --data is split unconditionally below, so omitting it used to fail with
    # an AttributeError on None; let argparse report the problem instead.
    parser.add_argument('-d', '--data', required=True, type=str,
                        help='Comma-separated list of files to generate stats from')
    parser.add_argument('-ft', '--format', type=str, help='Data format (tt for Textinator or squad for Squad)')
    parser.add_argument('-rtl', '--right-to-left', action='store_true')
    args = parser.parse_args()

    # Drop empty entries produced by stray or trailing commas.
    questions_fnames = [x.strip() for x in args.data.split(',') if x.strip()]

    # arabic, finnish - include mwt
    # russian - exclude mwt
    if args.lang in ['fi', 'ar']:
        stanza_processors = 'tokenize,lemma,mwt,pos,depparse'
    else:
        stanza_processors = 'tokenize,lemma,pos,depparse'
    stanza_lang = stanza.Pipeline(lang=args.lang, processors=stanza_processors)

    if args.format == 'tt':
        data_loader = TextinatorLoader
    elif args.format == 'squad':
        data_loader = SquadLoader
    elif args.format == 'tydiqa':
        data_loader = TyDiQaLoader
    else:
        # generic case
        data_loader = JsonLinesLoader

    stats = defaultdict(lambda: defaultdict(int))
    answer_tmpl = defaultdict(int)

    for q, a, _ in tqdm(data_loader.from_files(questions_fnames, args.lang)):
        # The JSON-lines loader yields the answer itself; the other loaders
        # yield a mapping with the answer under 'text'.
        answer_text = a if data_loader is JsonLinesLoader else a['text']
        if not answer_text:
            # Checked before parsing the question so that no pipeline run is
            # spent on an item that is discarded anyway.
            continue

        q_parsed = stanza_lang(q)
        a_parsed = stanza_lang(answer_text)
        if not q_parsed.sentences or not a_parsed.sentences:
            # Nothing to index into sentences[0]; skip instead of crashing.
            continue

        q_words = q_parsed.sentences[0].words
        a_words = a_parsed.sentences[0].words

        # With --right-to-left the last word of the first sentence is taken
        # as the question word, otherwise the first one.
        qw = q_words[-1].text if args.right_to_left else q_words[0].text

        aw = find_root(a_words)
        if aw is None:
            # No word labelled 'root': there is no answer head to count.
            continue

        stats[qw.lower()][pos_morph_tag(aw)] += 1
        answer_tmpl[" ".join(pos_morph_tag(x) for x in a_words)] += 1

    folder = Path(args.save)
    folder.mkdir(parents=True, exist_ok=True)
    # Context managers make sure both files are flushed and closed.
    with open(folder.joinpath('qwstats.dill'), 'wb') as stats_file:
        dill.dump(stats, stats_file)
    with open(folder.joinpath('atmpl.dill'), 'wb') as tmpl_file:
        dill.dump(answer_tmpl, tmpl_file)