13
13
from glob import glob
14
14
from scipy .sparse import csc_matrix , csr_matrix , dok_array
15
15
16
- relevance_threshold = 0.5
17
- profile_threshold = 2.5
18
-
19
16
@jit (nopython = True )
20
- def count_tokens (X_indices , X_indptr , word_profile_data , word_profile_indices , word_profile_indptr , feature_map ):
17
+ def count_tokens (X_indices , X_indptr , word_profile_data , word_profile_indices , word_profile_indptr , feature_map , relevance_threshold , profile_threshold ):
21
18
document_vector = np .zeros (word_profile_indptr .shape [0 ]- 1 )
22
19
target_word_profile = np .zeros (word_profile_indptr .shape [0 ]- 1 )
23
20
target_word_refined_profile = np .zeros (word_profile_indptr .shape [0 ]- 1 )
@@ -82,7 +79,7 @@ def count_tokens(X_indices, X_indptr, word_profile_data, word_profile_indices, w
82
79
return global_token_count
83
80
84
81
@jit (nopython = True )
85
- def embed_X (X_indices , X_indptr , word_profile_data , word_profile_indices , word_profile_indptr , feature_map , token_count ):
82
+ def embed_X (X_indices , X_indptr , word_profile_data , word_profile_indices , word_profile_indptr , feature_map , token_count , relevance_threshold , profile_threshold ):
86
83
document_vector = np .zeros (word_profile_indptr .shape [0 ]- 1 )
87
84
target_word_profile = np .zeros (word_profile_indptr .shape [0 ]- 1 )
88
85
target_word_refined_profile = np .zeros (word_profile_indptr .shape [0 ]- 1 )
@@ -146,7 +143,8 @@ def embed_X(X_indices, X_indptr, word_profile_data, word_profile_indices, word_p
146
143
global_token_count += 1
147
144
X_embedded_indptr [row + 1 ] = global_token_count
148
145
149
- print (row , X_indptr .shape [0 ]- 1 , document_token_count )
146
+ if row % 100 == 0 :
147
+ print (row , X_indptr .shape [0 ]- 1 , document_token_count )
150
148
151
149
#print(row, X_indptr[row], X_indptr[row+1], X_indptr[row+1] - X_indptr[row], document_token_count)
152
150
return (X_embedded_data , X_embedded_indices , X_embedded_indptr )
@@ -166,6 +164,9 @@ def embed_X(X_indices, X_indptr, word_profile_data, word_profile_indices, word_p
166
164
parser .add_argument ("--features" , default = 5000 , type = int )
167
165
parser .add_argument ("--reuters-num-words" , default = 10000 , type = int )
168
166
parser .add_argument ("--reuters-index-from" , default = 2 , type = int )
167
+ parser .add_argument ("--relevance_threshold" , default = 0.25 , type = float )
168
+ parser .add_argument ("--profile_threshold" , default = 0.5 , type = float )
169
+
169
170
args = parser .parse_args ()
170
171
171
172
@@ -233,13 +234,13 @@ def embed_X(X_indices, X_indptr, word_profile_data, word_profile_indices, word_p
233
234
feature_map [i ] = 0
234
235
235
236
# Counts number of tokens in the augmented dataset to allocate memory for sparse data structure
236
- token_count = count_tokens (X_train .indices , X_train .indptr , word_profile .data , word_profile .indices , word_profile .indptr , feature_map )
237
- (X_train_embedded_data , X_train_embedded_indices , X_train_embedded_indptr ) = embed_X (X_train .indices , X_train .indptr , word_profile .data , word_profile .indices , word_profile .indptr , feature_map , token_count )
237
+ token_count = count_tokens (X_train .indices , X_train .indptr , word_profile .data , word_profile .indices , word_profile .indptr , feature_map , args . relevance_threshold , args . profile_threshold )
238
+ (X_train_embedded_data , X_train_embedded_indices , X_train_embedded_indptr ) = embed_X (X_train .indices , X_train .indptr , word_profile .data , word_profile .indices , word_profile .indptr , feature_map , token_count , args . relevance_threshold , args . profile_threshold )
238
239
X_train_embedded = csr_matrix ((X_train_embedded_data , X_train_embedded_indices , X_train_embedded_indptr ))
239
240
240
241
# Counts number of tokens in the augmented dataset to allocate memory for sparse data structure
241
- token_count = count_tokens (X_test .indices , X_test .indptr , word_profile .data , word_profile .indices , word_profile .indptr , feature_map )
242
- (X_test_embedded_data , X_test_embedded_indices , X_test_embedded_indptr ) = embed_X (X_test .indices , X_test .indptr , word_profile .data , word_profile .indices , word_profile .indptr , feature_map , token_count )
242
+ token_count = count_tokens (X_test .indices , X_test .indptr , word_profile .data , word_profile .indices , word_profile .indptr , feature_map , args . relevance_threshold , args . profile_threshold )
243
+ (X_test_embedded_data , X_test_embedded_indices , X_test_embedded_indptr ) = embed_X (X_test .indices , X_test .indptr , word_profile .data , word_profile .indices , word_profile .indptr , feature_map , token_count , args . relevance_threshold , args . profile_threshold )
243
244
X_test_embedded = csr_matrix ((X_test_embedded_data , X_test_embedded_indices , X_test_embedded_indptr ))
244
245
245
246
0 commit comments