-
Notifications
You must be signed in to change notification settings - Fork 32
/
run.sh
executable file
·22 lines (16 loc) · 1.36 KB
/
run.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
#!/bin/sh
# Pipeline driver: constructs a heterogeneous text network from the training
# text and labels, learns word representations with pte, then infers
# embeddings for the texts in <infer_file>.
#
# All tool and data paths are relative, so run this from the directory that
# contains text2hin/, pte/, text2vec/ and data/.

# Stop at the first failing step (or unset variable) so that later stages
# never run on missing or stale outputs of an earlier, failed stage.
set -eu

text_file=data/20ng/text_train.txt   # the text file for training
label_file=data/20ng/label_train.txt # the label file for training
infer_file=data/20ng/text_all.txt    # the text file to infer
output_path=workspace/               # output prefix; keep the trailing slash
window=5    # the window size for the construction of the word-word network
min_count=0 # discard words that appear fewer than <min_count> times

# Every stage writes under ${output_path}; make sure it exists up front.
mkdir -p "${output_path}"

# heterogeneous text network construction
./text2hin/data2w \
  -text "${text_file}" \
  -output-ww "${output_path}ww.net" \
  -output-words "${output_path}words.node" \
  -window "${window}" \
  -min-count "${min_count}"

./text2hin/data2dl \
  -text "${text_file}" \
  -label "${label_file}" \
  -output-lw "${output_path}lw.net" \
  -output-labels "${output_path}labels.node" \
  -output-dw "${output_path}dw.net" \
  -output-docs "${output_path}docs.node" \
  -min-count "${min_count}"

# Merge the three edge lists (ww, dw, lw) into one network file, and the
# three node lists (words, docs, labels) into one node file.
cat "${output_path}ww.net" "${output_path}dw.net" "${output_path}lw.net" \
  > "${output_path}text.hin"
cat "${output_path}words.node" "${output_path}docs.node" "${output_path}labels.node" \
  > "${output_path}text.node"

# learn predictive word representations
./pte/pte \
  -nodes "${output_path}text.node" \
  -words "${output_path}words.node" \
  -hin "${output_path}text.hin" \
  -output "${output_path}word.emb" \
  -binary 1 -size 100 -negative 5 -samples 300 -threads 20

# infer the embeddings of the texts provided in the <infer_file>
./text2vec/infer \
  -infer "${infer_file}" \
  -vector "${output_path}word.emb" \
  -output "${output_path}text.emb" \
  -debug 2 -binary 0