VSM1.py
import pandas as pd
import numpy as np
import math
from sklearn.metrics.pairwise import cosine_similarity
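# This script builds a vector space model over the homework corpus:
# log-scaled term frequency, log10 inverse document frequency, and
# cosine similarity between each query vector and each document vector.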
"""
cal all doc vocabulary
"""
file_doc = open('C:/Users/Cheng/Desktop/jupyter/Homework1/doc_list.txt') #
file_doc = file_doc.read().replace('\n',' ').split(' ')
doc_all=[]
for root in file_doc:
doc = open('C:/Users/Cheng/Desktop/jupyter/Homework1/Document/' + root)
# for i in range(3):
# doc.readline()
doc.readline() #use three line will faster than use for()
doc.readline()
doc.readline()
doc = doc.read().replace(' -1\n',' ').split(' ') #preprocessing
del doc[-1]
doc_all.extend(doc)
#print(len(doc_all)) #just check how many voc before delete repeat voc
doc_all = list(set(doc_all)) #delete repeat voc
#print(len(doc_all)) #check how many voc after delete
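# Note on the parsing above: each document file is assumed to start with
# three header lines, followed by lines of numeric term IDs that each end
# with " -1" (hence the " -1\n" -> " " replacement). The numeric IDs are
# also what lets the terms be stored in the float arrays built below.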
"""
cal all doc vocabulary
"""
file_query = open('C:/Users/Cheng/Desktop/jupyter/Homework1/query_list.txt')
file_query = file_query.read().replace('\n',' ').split(' ') #preprocessing
query_all=[]
for root in file_query:
query = open('C:/Users/Cheng/Desktop/jupyter/Homework1/Query/' + root)
query = query.read().replace(' -1\n',' ').split(' ')
del query[-1]
query_all.extend(query)
#print(len(query_all)) #check
query_all = list(set(query_all))
#print(len(query_all)) #check
"""
add both doc and query vocabulary
"""
doc_all.extend(query_all)
#print(len(doc_all)) #check
doc_all = list(set(doc_all))
#print(len(doc_all)) #check
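# The merged vocabulary guarantees every query term has a row in the
# term matrices built below. A cheap sanity check (added here as an
# illustration, not part of the original script):
assert set(query_all) <= set(doc_all)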
"""
cal the word appear how many times in each document
"""
doc_array = np.zeros((len(doc_all),2265 + 1))
for i in range(len(doc_all)):
doc_array[i,0] = doc_all[i]
for i in range(len(file_doc)): #有幾個doc檔案
file0 = open('C:/Users/Cheng/Desktop/jupyter/Homework1/Document/' + file_doc[i])
file0.readline()
file0.readline()
file0.readline()
file0 = file0.read().replace(' -1\n',' ').split(' ')
del file0[-1]
for j in range(len(doc_all)): #計算voc在各個doc出現次數
if(file0.count(doc_all[j])) >0:
doc_array[j,i+1] = 1 + math.log(file0.count(doc_all[j]),2)
else:
doc_array[j,i+1] = 0
# df_doc = pd.DataFrame(doc_array)
#
# df_doc #轉成pandas比較好在jupyter上觀看
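# The weight above is sublinear TF: w(t, d) = 1 + log2(tf(t, d)) for tf > 0,
# else 0, so a term appearing 8 times scores 4.0 rather than 8.0. A minimal
# sketch of the same weighting on a toy token list (hypothetical data, shown
# only to illustrate the formula):
# tokens = ['1', '2', '2', '2']
# {t: 1 + math.log(tokens.count(t), 2) for t in set(tokens)}
# -> {'1': 1.0, '2': 2.585}  (i.e. 1 + log2(3))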
"""
cal the word appear how many times in each query
"""
query_array = np.zeros((len(doc_all),16 + 1))
for i in range(len(doc_all)):
query_array[i,0] = doc_all[i]
# file = open('D:/downloads/Homework1/Query/20001.query') #先測試一個檔案
# file = file.read().replace(' -1\n',' ').split(' ')
# for i in range(len(query_all)): #計算文本出現次數
# if(file.count(query_all[i])) >0:
# b[i,1] = 1 + math.log(file.count(query_all[i]),2)
# else:
# b[i,1] = 0
# df = pd.DataFrame(b)
# df
for i in range(len(file_query)): #有幾個query檔案
file1 = open('C:/Users/Cheng/Desktop/jupyter/Homework1/Query/' + file_query[i])
file1 = file1.read().replace(' -1\n',' ').split(' ')
del file1[-1]
for j in range(len(doc_all)): #計算voc在各個query出現次數
if(file1.count(doc_all[j])) >0:
query_array[j,i+1] = 1 + math.log(file1.count(doc_all[j]),2)
else:
query_array[j,i+1] = 0
# df_query = pd.DataFrame(query_array)
#
# df_query
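# Unlike the document files, the query files carry no header lines, which
# is why nothing is skipped with readline() before reading them.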
"""
-----------------------finish the Term Fequency TF-------------------------------------
"""
doc_IDF = np.zeros((len(doc_all), 2))  # col 0: term ID, col 1: document frequency, then IDF
for i in range(len(doc_all)):
    doc_IDF[i, 0] = doc_all[i]
for i in range(len(file_doc)):  # iterate over every document file
    file0 = open('C:/Users/Cheng/Desktop/jupyter/Homework1/Document/' + file_doc[i])
    file0.readline()  # skip the three header lines
    file0.readline()
    file0.readline()
    file0 = file0.read().replace(' -1\n', ' ').split(' ')
    del file0[-1]
    for j in range(len(doc_all)):  # count how many documents contain each term
        if file0.count(doc_all[j]) > 0:
            doc_IDF[j, 1] += 1
for i in range(len(doc_all)):
    if doc_IDF[i, 1] > 0:
        doc_IDF[i, 1] = math.log(len(file_doc) / doc_IDF[i, 1], 10)
    else:
        doc_IDF[i, 1] = 0
# df_IDF = pd.DataFrame(doc_IDF)
# print(df_IDF)  # check in Jupyter
"""
------------------------------ IDF finished --------------------------------
"""
"""
all_vocabulary TF-IDF
"""
for j in range(2265):
for i in range(len(doc_all)):
doc_array[i,j+1] = doc_array[i,j+1]*doc_IDF[i,1]
#doc_array #check
"""
query TF-IDF
"""
for j in range(16):
for i in range(len(doc_all)):
query_array[i,j+1] = query_array[i,j+1]*doc_IDF[i,1]
#query_array #check
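# Both nested loops are just a row-wise scaling and could be written as a
# single NumPy broadcast (an equivalent sketch, not what the script does):
# doc_array[:, 1:] *= doc_IDF[:, 1:2]
# query_array[:, 1:] *= doc_IDF[:, 1:2]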
"""
cal cos()
"""
doc_name = open('C:/Users/Cheng/Desktop/jupyter/Homework1/doc_list.txt')
doc_name = doc_name.read().replace('\n',' ').split(' ')
f = open('M10715090.txt','w') #write in txt file
f.write('Query,RetrievedDocuments\n')
for k in range(16): #from query 1 to 16
result1 = np.zeros((2265,2),dtype=object)
for i in range(2265): #把cos()出來的值與doc名稱一一對應放入
result1[i,1] = doc_name[i]
result1[i,0] = cosine_similarity([query_array[:,k+1]],[doc_array[:,i+1]])
p_result = pd.DataFrame(result1) #use pandas to sort
p_result = p_result.sort_values(ascending=False, by=[0]).values
# for j in range(2264): #用buble sort 比較容易理解
# for i in range(2264): #每回合進行比較的範圍
# if result1[i,0] < result1[i+1,0]: #是否需交換,若需要 則一併把doc名稱也一起交換
# tmp = result1[i,0]
# tmp2 = result1[i,1]
# result1[i,0] = result1[i+1,0]
# result1[i,1] = result1[i+1,1]
# result1[i+1,0] = tmp
# result1[i+1,1] = tmp2
f.write(file_query[k]) #we have 16 querys file
f.write(',')
for i in range(2265): #put every query result to txt
f.write(p_result[i,1])
f.write(' ')
f.write('\n')
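# Note: cosine_similarity also accepts full matrices, so the per-pair calls
# above could be collapsed into one call (an equivalent sketch):
# scores = cosine_similarity(query_array[:, 1:].T, doc_array[:, 1:].T)
# scores[k, i] is then the similarity between query k and document i.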