-
Notifications
You must be signed in to change notification settings - Fork 2
/
SemHash.py
170 lines (141 loc) · 5.45 KB
/
SemHash.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
# -*- coding: utf-8 -*-
"""
@author: yingwenjie
"""
import os
import sys
import string
import numpy as np
import scipy
import scipy.io
from utils.utils import *
from utils.evaluation import *
from utils.Keyword import *
from utils.init_B import *
from models.BCSH import *
from models.BCSH_batch import *
from models.ITSH import *
from models.BCSH_paper import *
def SemHash_BCSH():
    """Learn binary hash codes for a document collection with BCSH2.

    Reads the file list from sys.argv (via getFilelist), segments the
    documents, builds a tf-idf matrix, learns hash codes, and writes the
    document list, vocabulary, per-document hash codes and pairwise
    similarities under ./argfile/.
    """
    (allfile, path) = getFilelist(sys.argv)
    #max_feat = int(sys.argv[2])
    #bits = sys.argv[3]
    #iters = sys.argv[4]
    #lambd = sys.argv[5]
    print(path)
    print(allfile)
    # Persist the document list; 'with' guarantees the handle is closed
    # even if a write fails (the original left files unclosed).
    with open('./argfile/doc.utf8', 'w') as f:
        for ff in allfile:
            f.write(ff + '\n')
    path = fenci(allfile, path)
    (word, weight) = Tfidf(path, allfile)
    # Persist the vocabulary, one term per line.
    with open('./argfile/word.utf8', 'w') as f:
        for term in word:
            f.write(term + "\n")
    #scipy.io.savemat('X.mat', {'X': weight})
    #B = BCSH1(weight)
    B = BCSH2(weight)
    print(B.shape)
    # Similarity is computed on the raw codes BEFORE clipping negatives
    # to 0 for printing — keep this ordering.
    Sim = np.dot(B.transpose(), B)
    B[B < 0] = 0
    with open('./argfile/hashcode.utf8', 'w') as f:
        # One line per document (column of B): its bit string, then name.
        for i in range(B.shape[1]):
            for j in range(B.shape[0]):
                f.write(str(B[j, i]))
            f.write('\t' + allfile[i] + "\n")
        # Pairwise similarities.
        # NOTE(review): range(Sim.shape[0] - i) pairs document i with
        # documents 0..n-i-1, which repeats pairs and skips others; an
        # upper-triangle walk would be range(i, Sim.shape[0]). Behavior
        # kept as-is — confirm the intended pairing with the author.
        for i in range(Sim.shape[0]):
            for j in range(Sim.shape[0] - i):
                f.write(allfile[i] + '\t' + allfile[j] + '\t' + str(Sim[i, j]) + '\n')
def SemHash_BCSH_batch():
    """Learn hash codes in mini-batches with BCSH_batch.

    Command line (sys.argv):
        1: training-set file
        2: top-K keywords per document
        3: size of the full keyword vocabulary
        4: hash bits learned per round
        5: number of learning rounds
        6: samples per batch
        7: iterations per round
        8: loss weight lambda

    Writes the vocabulary and learned hash codes under ./data/argfile/.
    """
    file_name = sys.argv[1]          # training-set file
    topK = int(sys.argv[2])          # top-K keywords
    max_features = int(sys.argv[3])  # keyword vocabulary size
    bits = int(sys.argv[4])          # hash bits learned per round
    rand_time = int(sys.argv[5])     # number of learning rounds
    batch = int(sys.argv[6])         # samples per batch
    iters = int(sys.argv[7])         # iterations per round
    lambd = float(sys.argv[8])       # loss weight
    # Word/character segmentation: jieba segmentation + tf-idf keywords.
    seg_file = fenci(file_name, topK)
    #seg_file = fenzi(file_name)
    # tf-idf representation.
    (word, tfidf) = Tfidf(seg_file, max_features)
    # 'with' guarantees the handle is closed (original never closed on error).
    with open('./data/argfile/word.utf8', 'w') as f:
        for term in word:
            f.write(term + "\n")
    B = BCSH_batch(tfidf, bits, rand_time, batch, iters, lambd, word)
    print(B.shape)
    # Map {-1, +1} codes to {0, 1} integer bits for printing.
    B[B < 0] = 0
    B = B.astype(int)
    with open('./data/argfile/hashcode.utf8', 'w') as f:
        # One line per document (column of B): bit string, tab, doc index.
        # "".join avoids the original quadratic '+=' string build.
        for i in range(B.shape[1]):
            bit_str = "".join(str(B[j, i]) for j in range(B.shape[0]))
            f.write(bit_str + '\t' + str(i) + '\n')
def SemHash_ITSH():
    """Stable pipeline: segment, tf-idf encode, then learn ITSH hash codes.

    Command line (sys.argv):
        1: training-set file
        2: top-K keywords per document
        3: maximum feature-set size
        4: hash-code length in bits
        5: iterations per learning pass
        6: task name (parameters/data saved under ./data/<task name>)
    """
    train_file = sys.argv[1]      # training-set file
    top_k = int(sys.argv[2])      # top-K keywords per document
    max_feats = int(sys.argv[3])  # maximum feature-set size
    n_bits = int(sys.argv[4])     # hash-code length
    n_iters = int(sys.argv[5])    # iterations per learning pass
    task = sys.argv[6]            # save path for parameters and data
    task_dir = './data/' + task
    ## Paper experiment (disabled):
    #arg = scipy.io.loadmat('./data/20ng.mat')
    #tfidf = arg["X"]
    #label = arg["Y"]
    #label = np.squeeze(label, axis=0)
    #print(label.shape)
    #print(label)
    #B = ITSH(tfidf, bits, iters, task_dir)
    #eval_retrieval(B.transpose(), label.transpose(), top_n=100)
    #exit()
    #tfidf = load_sparse(file_name)  # load sparse data directly as input
    # Word/character segmentation (jieba + tf-idf keywords); returns the
    # path of the segmented file.
    seg_file = fenci(train_file, top_k, False, task_dir)
    #seg_file = fenzi(file_name, './data/segfile_tmp')
    # tf-idf representation.
    tfidf = Tfidf(seg_file, max_feats, task_dir)
    B = ITSH(tfidf, n_bits, n_iters, task_dir)
    #eval_retrieval(B.transpose(), label.transpose(), top_n=100)
def SemHash_BCSH_paper():
    """Paper experiment: learn BCSH hash codes on the CLUE tnews splits.

    Command line (sys.argv):
        1: training-set file
        2: top-K keywords per document
        3: maximum feature-set size
        4: hash-code length in bits
        5: iterations per learning pass
        6: task name (parameters/data saved under ./data/<task name>)
    """
    train_file = sys.argv[1]      # training-set file
    top_k = int(sys.argv[2])      # top-K keywords per document
    max_feats = int(sys.argv[3])  # maximum feature-set size
    n_bits = int(sys.argv[4])     # hash-code length
    n_iters = int(sys.argv[5])    # iterations per learning pass
    task = sys.argv[6]            # save path for parameters and data
    task_dir = './data/' + task
    ## Hash representation
    # Word/character segmentation (jieba + tf-idf keywords).
    seg_file = fenci(train_file, top_k, False, task_dir)
    #seg_file = fenzi(file_name)
    # tf-idf representation (unused below; init_B supplies X instead).
    tfidf = Tfidf(seg_file, max_feats, task_dir)
    # Alternative dataset (disabled): CLUE inews splits.
    #label, B, X = init_B("./data/paper_data/clue/inews/train.txt","inews")
    #label_test, B_test, X_test = init_B("./data/paper_data/clue/inews/test.txt","inews")
    #label_dev, B_dev, X_dev = init_B("./data/paper_data/clue/inews/dev.txt","inews")
    label, B, X = init_B("./data/paper_data/clue/tnews/toutiao_category_train.txt","tnews")
    label_dev, B_dev, X_dev = init_B("./data/paper_data/clue/tnews/toutiao_category_dev.txt","tnews")
    label_test, B_test, X_test = init_B("./data/paper_data/clue/tnews/toutiao_category_test.txt","tnews")
    # Alternative dataset (disabled): THUCNews splits.
    #label, B, X = init_B("./data/paper_data/clue/thucnews/train.txt","thucnews")
    #label_test, B_test, X_test = init_B("./data/paper_data/clue/thucnews/test.txt","thucnews")
    #label_dev, B_dev, X_dev = init_B("./data/paper_data/clue/thucnews/dev.txt","thucnews")
    B = BCSH_paper(B, X, label, B_test, X_test, label_test, B_dev, X_dev, label_dev, n_bits, n_iters)
    #eval_retrieval(B.transpose(), label.transpose(), top_n=100)
if __name__ == "__main__":
    # Entry point: run the ITSH pipeline (author's note: "stable version").
    SemHash_ITSH()