eva_stsall_demo2.py
#%% Essential packages
import numpy as np
from nltk.corpus import stopwords
import string
from senteval.sts import (STS12Eval, STS13Eval, STS14Eval, STS15Eval,
                          STS16Eval, STSBenchmarkEval, STSBun_Eval)
from collections import Counter, defaultdict
from scipy.stats import spearmanr, pearsonr
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['SynWMD_MODEL'] = 'princeton-nlp/unsup-simcse-bert-base-uncased'
# alternative encoders: "sentence-transformers/bert-base-nli-mean-tokens", 'bert-base-uncased'
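# SynWMD_MODEL is set before Lib.SynWMD is imported below; judging from how it is
# assigned here, it selects which Hugging Face encoder the library loads for word
# embeddings (an assumption based on this script, not on the library's documentation).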
from Lib.SynWMD import SynWMD
from Lib import whiten as pcr
from Lib import lib_dependency as lib_dep
import networkx as nx
import stanza
import warnings
warnings.filterwarnings("ignore")
stop_words = stopwords.words('english')
punct = [i for i in string.punctuation]
stop_words = stop_words + punct
# Parser
nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma,depparse',
                      tokenize_pretokenized=True,
                      tokenize_no_ssplit=True,
                      verbose=True)
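# The pipeline is fed all reference/candidate sentences in a single call further below:
# the sentences are whitespace-tokenized strings joined with '\n\n', which is why
# tokenize_pretokenized=True and tokenize_no_ssplit=True are set here.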
#%% functions
def build_graph(parsing_data, word2id, hop_size=3):
    num_pair = 0
    G = nx.Graph()
    edge_count = defaultdict(float)
    total_count = {id: 0 for id in range(len(word2id))}
    for sent in parsing_data:
        tree = lib_dep.sentdic2undicgraph(sent)
        for word_idx, word in enumerate(sent):
            d_word_idx = word2id[word['text']]
            neighb_dict = nx.single_source_shortest_path_length(tree, word_idx, cutoff=hop_size)
            # accumulate co-occurrence weight, discounted by hop distance
            for neighb_idx, hop in neighb_dict.items():
                if hop == 0:
                    continue  # skip the word itself
                h_word_idx = word2id[sent[neighb_idx]['text']]
                edge_count[(d_word_idx, h_word_idx)] += 1 / hop
                total_count[d_word_idx] += 1 / hop
                num_pair += 1
    # normalize
    # edge_count = {x: c / total_count[x[0]] for x, c in edge_count.items()}
    weight_edge_list = [x + tuple([c]) for x, c in edge_count.items()]
    G.add_weighted_edges_from(weight_edge_list)
    # print('num_pair:', num_pair)
    return G, num_pair
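# build_graph turns the dependency parses into an undirected word co-occurrence graph:
# words within `hop_size` hops of each other in a sentence's dependency tree are
# connected, and closer pairs contribute larger edge weights. Sketch of how the graph
# feeds the DWF word weights (this mirrors the main loop below; `parsing_data`,
# `word2id` and `vocab` are assumed to be already built):
#   G, _ = build_graph(parsing_data, word2id, hop_size=3)
#   pr = nx.pagerank(G, alpha=0.2)                    # syntax-aware word importance
#   word_weight = {vocab[k]: 1 / pr[k] for k in pr}   # central/frequent words get low weight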
def data_all_set(data_name, data_loader):
    refs = []
    cands = []
    gs_scores = []
    vocab_count = Counter()
    if data_name == 'STSB':
        evaluation = data_loader('./data/downstream/STS/STSBenchmark')
        data_test = evaluation.data['sts-test']
        rf, cd, gs = data_test
        for sent in rf + cd:
            vocab_count.update(sent)
        rf = [' '.join(x) for x in rf]
        cd = [' '.join(x) for x in cd]
        refs.extend(rf)
        cands.extend(cd)
        gs_scores.extend(gs)
    else:
        tpath = './data'
        fpath = data_name + '-en-test'
        evaluation = data_loader(tpath + '/downstream/STS/' + fpath)
        for dataset in evaluation.datasets:
            rf, cd, gs = evaluation.data[dataset]
            for sent in rf + cd:
                vocab_count.update(sent)
            rf = [' '.join(x) for x in rf]
            cd = [' '.join(x) for x in cd]
            refs.extend(rf)
            cands.extend(cd)
            gs_scores.extend(gs)
    return refs, cands, gs_scores, vocab_count
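# data_all_set concatenates all subsets of one STS task into flat lists of reference
# sentences, candidate sentences and gold similarity scores, plus a vocabulary counter
# over the tokenized sentences. Sketch of a single-task call (assumes the SentEval STS
# data has been downloaded to ./data/downstream/STS):
#   refs, cands, gs_scores, vocab_count = data_all_set('STS12', STS12Eval)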
# %% all-setting STS evaluation
# pre-trained kernel and bias for whitening
# path_whiten = './data/whiten/bert-base-stsall-rmsw-first_last.pkl'
kernel, bias = [], [] #pcr.load_whiten(path_whiten)
# parameters. The following is the setting for SynWMD_dwf+dwd using SimCSE-BERT
param = {'batch_size': 64,
         'l2_dist': False,      # distance metric: False -> cosine distance, True -> L2 distance
         'tree': 's',           # 'n' -> without DWD (i.e. SynWMD_dwf), 's' -> with DWD (i.e. SynWMD_dwf+dwd)
         'a': 0.2,              # float, parameter a in DWD, controlling how much contextual and structural information DWD considers
         'hop_num': 3,          # int, subtree size in DWD
         'layer': 'last',       # embedding layer: 'first_last', 'last', 'last2' or int
         'whiten_flag': False,  # whitening pre-processing
         'pre_whiten': False,   # use pre-trained kernel and bias; only used when 'whiten_flag' is True
         'pre_kernel': kernel,  # pre-trained kernel for whitening, otherwise set to []
         'pre_bias': bias}      # pre-trained bias for whitening, otherwise set to []
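# To run with whitening instead (a sketch, assuming a kernel/bias pickle produced by
# Lib.whiten exists and path_whiten above has been uncommented):
#   kernel, bias = pcr.load_whiten(path_whiten)
#   param.update({'whiten_flag': True, 'pre_whiten': True,
#                 'pre_kernel': kernel, 'pre_bias': bias})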
# evaluation
task_list = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'STSB']
task_func_list = [STS12Eval, STS13Eval, STS14Eval, STS15Eval, STS16Eval, STSBun_Eval]
result_all = []
for task, task_func in zip(task_list, task_func_list):
    refs, cands, gs_scores, vocab_count = data_all_set(task, task_func)
    vocab = [w[0] for w in vocab_count.most_common()]
    word2id = {w: id for id, w in enumerate(vocab)}
    ## IDF weighting
    # from sklearn.feature_extraction.text import TfidfVectorizer
    # tf = TfidfVectorizer(use_idf=True)
    # tf.fit_transform(refs + cands)
    # word_weight = dict(zip(tf.get_feature_names(), tf.idf_))
    ## No weighting
    # word_weight = {k: 1 for k, v in word_weight.items()}
    ## DWF weighting
    parsing_batch = refs + cands
    parsing_batch = [s if not (s.isspace() or len(s) == 0) else 'good' for s in parsing_batch]
    parsing_data = nlp('\n\n'.join(parsing_batch))
    parsing_data = lib_dep.stanza2dic(parsing_data)
    G, num_pair = build_graph(parsing_data, word2id, hop_size=3)
    word_weight = {}
    pr = nx.pagerank(G, alpha=0.2)
    for k in range(len(vocab)):
        if k in pr:
            word_weight[vocab[k]] = 1 / pr[k]
    ##
    sys_scores = SynWMD(refs, cands, word_weight, **param)
    all_pearson = pearsonr(sys_scores, gs_scores)[0]
    all_spearman = spearmanr(sys_scores, gs_scores)[0]
    print('%s (all data): Pearson = %.4f, Spearman = %.4f' % (task, all_pearson, all_spearman))
    result_all.append(all_spearman)

print('\nFinal (mean over tasks): Spearman = %.4f\n' % np.average(result_all))
result_all = [str(round(x * 100, 2)) for x in result_all]
print(' '.join(result_all))