result_evaluate.py
import os
import json
import time
import codecs
import glob

import tensorflow as tf
from nltk import sent_tokenize

FLAGS = tf.app.flags.FLAGS


class Evaluate(object):
    def __init__(self):
        # distinct sentences / n-grams seen so far (dicts used as sets) and the
        # matching total counts, for distinct-n style diversity ratios
        self.all_bigram = dict()
        self.all_trigram = dict()
        self.all_unigram = dict()
        self.all_sentence = dict()
        self.bigram_num = 0
        self.trigram_num = 0
        self.unigram_num = 0
        self.sen_num = 0
    def diversity_evaluate(self, data_path):
        filelist = glob.glob(data_path)  # get the list of data files
        assert filelist, ('Error: Empty filelist at %s' % data_path)  # check filelist isn't empty
        filelist = sorted(filelist)
        for f in filelist:
            with codecs.open(f, 'r', 'utf-8') as reader:
                for string_ in reader:
                    if not string_.strip():
                        continue
                    # each line is a JSON object whose "example" field holds one generated review
                    dict_example = json.loads(string_)
                    review = dict_example["example"]
                    review_sen = sent_tokenize(review)
                    for sen in review_sen:
                        self.all_sentence[sen] = 1
                        self.sen_num += 1
                        sen_words = sen.strip().split()
                        unigram = sen_words
                        # consecutive token pairs/triples concatenated into a single key;
                        # range(len - 1) / range(len - 2) covers every full bigram/trigram
                        # (the original ranges stopped one n-gram short)
                        bigram = [sen_words[i] + sen_words[i + 1]
                                  for i in range(len(sen_words) - 1)]
                        trigram = [sen_words[i] + sen_words[i + 1] + sen_words[i + 2]
                                   for i in range(len(sen_words) - 2)]
                        for word in bigram:
                            self.all_bigram[word] = 1
                            self.bigram_num += 1
                        for word in trigram:
                            self.all_trigram[word] = 1
                            self.trigram_num += 1
                        for word in unigram:
                            self.all_unigram[word] = 1
                            self.unigram_num += 1
        # report total count, unique count, and unique rate for sentences and n-grams,
        # guarding against a zero denominator when nothing was read
        for name, total, unique in [("sentence", self.sen_num, self.all_sentence),
                                    ("unigram", self.unigram_num, self.all_unigram),
                                    ("bigram", self.bigram_num, self.all_bigram),
                                    ("trigram", self.trigram_num, self.all_trigram)]:
            rate = len(unique) / (1.0 * total) if total else 0
            tf.logging.info(name + " number: " + str(total)
                            + " unique " + name + " number: " + str(len(unique))
                            + " unique " + name + " rate: " + str(rate))
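Below is a minimal usage sketch, not part of the original file: it writes a tiny JSON-lines sample in the format diversity_evaluate expects (one object per line with an "example" field) and runs the evaluator on it. The driver file name, the sample path, and the sample texts are assumptions for illustration only.

# usage_sketch.py -- hypothetical driver; assumes result_evaluate.py is importable
# and that NLTK's punkt tokenizer data is installed (needed by sent_tokenize).
import json
import codecs

import tensorflow as tf

from result_evaluate import Evaluate

if __name__ == "__main__":
    tf.logging.set_verbosity(tf.logging.INFO)  # make the tf.logging.info output visible
    # Hypothetical sample input: one JSON object per line with an "example" field,
    # mirroring the decoder output this evaluator is meant to score.
    with codecs.open("sample_decode.json", "w", "utf-8") as w:
        w.write(json.dumps({"example": "The food was great. The service was great."}) + "\n")
        w.write(json.dumps({"example": "Service was slow but the staff were friendly."}) + "\n")
    evaluator = Evaluate()
    evaluator.diversity_evaluate("sample_decode.json")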