#!/bin/bash
# export the locale so child processes (perl, gawk, sort, python) handle UTF-8 consistently
export LC_ALL=en_US.UTF-8
export LANG=en_US.UTF-8
export LANGUAGE=en_US.UTF-8
# TODO
# if the language is not known, take "mix" with forms replaced by CPOS+rand(),
# lemmas deleted and POS replaced by CPOS
if [ $# -lt 3 ]; then
    echo "usage: $0 test.conllu language-code outfile"
    exit 1
fi
TEST=$1
LANGUE=$2
OUTFILE=$3
HOSTNAME=$(hostname)
NOW=$(date '+%Y.%m.%d %H:%M')
echo -e "\nProcessing language: $LANGUE, start: $NOW"
echo -e "\nProcessing language: $LANGUE, start: $NOW" 1>&2
if [ "$HOSTNAME" == "tira-ubuntu" ]; then
export LD_LIBRARY_PATH=/home/Orange-Deskin/conll2017/cnn-v1-gpu/pycnn
BASEPATH=/home/Orange-Deskin/conll2017/Orange-Deskin
DATAPATH=$BASEPATH/data
MODELSPATH=$BASEPATH/models
elif [ "$HOSTNAME" == "yd-jeuh6401" ]; then
export LD_LIBRARY_PATH=/home/jeuh6401/SemanticData/bist-parser/cnn-v1-gpu/pycnn
BASEPATH=/home/jeuh6401/conll2017/Orange-Deskin
DATAPATH=$BASEPATH/data
MODELSPATH=$BASEPATH/models
else
export LD_LIBRARY_PATH="/home/langnat/conll2017/bistparser/cnn-v1-gpu/pycnn"
BASEPATH=/mnt/RAID0SHDD2X1TB/Orange-Deskin
DATAPATH=/home/langnat/conll2017/data
fi
# the temp directory for the run
TMPDIR=$(mktemp -d)
#OUTPATH=$BASEPATH/output
PYSCRIPTROOT=$BASEPATH/py
BISTROOT=$BASEPATH/bistparser/barchybrid
EMBEDDINGSPATH=$DATAPATH/$LANGUE
MODELPATH=$MODELSPATH/$LANGUE
if [ "$HOSTNAME" == "yd-jeuh6401" ]; then
EMBEDDINGSPATH=/data/SemanticData/conll2017/embeddings
fi
# clean CoNLL-U text: remove comment lines and multiword-token/empty-node lines (IDs like "1-2" or "1.1")
function cleanconllu() {
    grep -v "^#" | grep -P -v '^\d+[\.-]'
}
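# A minimal sketch of the filter on hypothetical input (run manually to check):
#   printf '# sent_id = 1\n1-2\tdu\t_\n1\tde\tde\n1.1\t_\t_\n' | cleanconllu
# keeps only the plain token line "1  de  de"; the comment line, the
# multiword-token range "1-2" and the empty node "1.1" are all dropped.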
# extract surface forms (column 2) from a CoNLL-U text
function formslist() {
    INFILE=$1
    cut -f2 $INFILE | sort -u
}
# extract lemmas (column 3) from a CoNLL-U text
function lemmalist() {
    INFILE=$1
    cut -f3 $INFILE | sort -u
}
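# CoNLL-U columns are ID, FORM, LEMMA, UPOS (CPOS), XPOS (POS), FEATS, HEAD,
# DEPREL, DEPS and MISC, hence "cut -f2" for forms and "cut -f3" for lemmas:
#   formslist $CLEANTEST > $TMPDIR/forms.txt   # one unique form per line
#   lemmalist $CLEANTEST > $TMPDIR/lemmas.txt  # one unique lemma per line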
# generate a word list (lowercased, UTF-8 aware) from the given files
function wordlist() {
    cat $@ | perl -CSD -ne 'print lc'
}
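# Usage sketch: merge forms and lemmas into one lowercased vocabulary,
#   wordlist $FORMLIST $LEMLIST > $WORDLIST
# perl -CSD treats stdin/stdout as UTF-8, so lc also lowercases
# non-ASCII letters (e.g. "École" -> "école").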
# deprojectivise output predicted by the BistParser
function deprojectivise() {
    INFILE=$1
    $PYSCRIPTROOT/projectivise.py -d $INFILE
}
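# Assumption: the models were trained on pseudo-projectivised trees, where
# lifted arcs carry composite labels containing "=" (e.g. "obj=mark"); the -d
# flag of projectivise.py is taken to reverse that encoding after prediction,
# which is why the caller below greps column 8 for "=".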
# prediction function
function predict() {
    # incoming file (to be parsed)
    INFILE=$1
    # model to use
    MODEL=$2
    # BistParser parameters to use
    PARAMS=$3
    # word list file: words known during training
    WORDS=$4
    # words read in the document to parse
    NEWWORDS=$5
    # word embeddings to use (optional)
    VECTORS=
    if [ "$6" != "" ]; then
        VECTORS="--extrn $6 --extrnFilter $WORDS --extrnFilterNew $NEWWORDS"
    fi
    # prediction
    # this path needs to be adapted for each individual system
    #pushd $BISTROOT > /dev/null
    # log the exact parser command before running it
    echo python $BISTROOT/src/parse_1by1.py --cnn-mem 4000 --predict \
        --outfile $TMPDIR/result1.conllu \
        --model $MODEL \
        --params $PARAMS \
        $VECTORS \
        --test $INFILE
    #python src/parser.py --cnn-mem 4000 --predict
    python $BISTROOT/src/parse_1by1.py --cnn-mem 4000 --predict \
        --outfile $TMPDIR/result1.conllu \
        --model $MODEL \
        --params $PARAMS \
        $VECTORS \
        --test $INFILE
    #popd > /dev/null
    # check whether we need to deprojectivise: pseudo-projective arcs
    # carry a "=" in the DEPREL column (column 8)
    COUNTPSEUDOPROJ=$(cut -f8 $TMPDIR/result1.conllu | grep "=" | wc -l)
    if [ $COUNTPSEUDOPROJ -ne 0 ]; then
        deprojectivise $TMPDIR/result1.conllu > $TMPDIR/result-deproj.conllu
    else
        cp $TMPDIR/result1.conllu $TMPDIR/result-deproj.conllu
    fi
    # reinsert the multiword-token ([n-m]) and empty-node ([n.1]) lines removed during cleaning
    #reinsert $TMPDIR/result-deproj.conllu $OUTFILE
    python $PYSCRIPTROOT/reinsert.py $TEST $TMPDIR/result-deproj.conllu > $TMPDIR/result-deproj-reinsert.conllu
}
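# Usage sketch (mirrors the actual call further below; file names hypothetical):
#   predict $CLEANTEST $MODELPATH/xx.model_030 $MODELPATH/params.pickle \
#       $MODELSPATH/$LANGUE/allwords.txt $WORDLIST $EMBEDDINGSPATH/xx.vectors.bin
# The final result is left in $TMPDIR/result-deproj-reinsert.conllu.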
# clean the CoNLL-U input of comments and multiword-token/empty-node lines
echo "Cleaning ..."
CLEANTEST=$TMPDIR/$LANGUE.clean.test.conll
cat $TEST | cleanconllu > $CLEANTEST
if [ ! -d $MODELSPATH/$LANGUE ]; then
    # check whether we know the language without its variant suffix (such as _partut)
    LGPREFIX=$(echo $LANGUE | cut -d_ -f1)
    #echo "prefix $LGPREFIX"
    if [ -d $MODELSPATH/$LGPREFIX ]; then
        LANGUE=$LGPREFIX
        echo "unknown language variant, using $LANGUE"
        EMBEDDINGSPATH=$DATAPATH/$LANGUE
        if [ "$HOSTNAME" == "yd-jeuh6401" ]; then
            EMBEDDINGSPATH=$DATAPATH
        fi
        MODELPATH=$MODELSPATH/$LANGUE
    else
        #LANGUE=mix2_random
        LANGUE=mix2B
        echo "unknown language, using $LANGUE"
        CLEANTEST2=$TMPDIR/$LANGUE.clean.test.empty.conll
        # delete lemmas, replace forms by CPOS (plus a random number for NOUN, VERB and ADJ) and replace POS by CPOS
        #cat $CLEANTEST | gawk -F '\t' 'OFS="\t" {if (NF > 6) {if ($4 == "NOUN" || $4 == "VERB" || $4 == "ADJ") print $1, sprintf("%s%d", $4, rand()*50), "_", $4,$4,$6,$7,$8,$9,$10; else print $1, $4, "_", $4,$4,$6,$7,$8,$9,$10;} else print ""}' > $CLEANTEST2
        # delete lemmas, replace forms by CPOS (except for NOUN, VERB and ADJ) and replace POS by CPOS
        cat $CLEANTEST | gawk -F '\t' 'OFS="\t" {if (NF > 6) {if ($4 == "NOUN" || $4 == "VERB" || $4 == "ADJ") print $1, $2, "_", $4,$4,$6,$7,$8,$9,$10; else print $1, $4, "_", $4,$4,$6,$7,$8,$9,$10;} else print ""}' > $CLEANTEST2
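        # Hypothetical example of the mapping above (columns: ID FORM LEMMA CPOS POS ...):
        #   1  la      le      DET   D   ...  ->  1  DET     _  DET   DET   ...
        #   2  maison  maison  NOUN  NC  ...  ->  2  maison  _  NOUN  NOUN  ...
        # i.e. lemmas and fine-grained POS are dropped everywhere; only open-class
        # words (NOUN, VERB, ADJ) keep their surface form.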
        CLEANTEST=$CLEANTEST2
        EMBEDDINGSPATH=$DATAPATH/$LANGUE
        MODELPATH=$MODELSPATH/$LANGUE
    fi
else
    # check whether the input text needs to be "normalised" for this model
    if [ -f $MODELSPATH/$LANGUE/NOWORDS ]; then
        echo "replacing forms by CPOS for $LANGUE"
        CLEANTEST2=$TMPDIR/$LANGUE.clean.test.empty.conll
        # same form/lemma/POS mapping as in the unknown-language case above
        cat $CLEANTEST | gawk -F '\t' 'OFS="\t" {if (NF > 6) {if ($4 == "NOUN" || $4 == "VERB" || $4 == "ADJ") print $1, $2, "_", $4,$4,$6,$7,$8,$9,$10; else print $1, $4, "_", $4,$4,$6,$7,$8,$9,$10;} else print ""}' > $CLEANTEST2
        CLEANTEST=$CLEANTEST2
    fi
fi
# extract surface forms from input
echo "Getting Form List ..."
FORMLIST=$TMPDIR/$LANGUE.forms.txt
formslist $CLEANTEST > $FORMLIST
# extract lemmas from input
echo "Getting Lemma List ..."
LEMLIST=$TMPDIR/$LANGUE.lemmas.txt
lemmalist $CLEANTEST > $LEMLIST
# create word list
echo "Generating Word List ..."
WORDLIST=$TMPDIR/$LANGUE.words.txt
ALLWORDS=$MODELSPATH/$LANGUE/allwords.txt
wordlist $FORMLIST $LEMLIST > $WORDLIST
# TODO: make this work both with and without 300-dim vectors
#EXVECTORS=$EMBEDDINGSPATH/*500-dim.10-win.cbow.bin
# find correct vectors file
VECTORFILE=$($PYSCRIPTROOT/readparamspickle.py $MODELPATH/params.pickle external_embedding | gawk -F / '{print $NF}')
if [ "$VECTORFILE" != "None" ]; then
EXVECTORS=$EMBEDDINGSPATH/$VECTORFILE
fi
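# Sketch of the lookup above: readparamspickle.py (a repo helper) prints the
# external_embedding path stored in params.pickle at training time, e.g.
# "/training/host/path/xx.vectors.bin" (hypothetical); the gawk keeps only the
# basename, so the file is re-resolved under this machine's $EMBEDDINGSPATH.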
BARCHYBRID=$(ls -1 $MODELPATH/*.model_??? | tail -1)
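# The glob *.model_??? matches epoch-numbered checkpoints (hypothetical names
# xx.model_001 ... xx.model_030); "ls -1 | tail -1" picks the lexicographically
# last one, i.e. the checkpoint from the latest training epoch.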
# predict
echo "Predicting ... language: $LANGUE, start: $NOW"
#echo "Predicting ... language: $LANGUE, start: $NOW" 1>&2
#predict $CLEANTEST $MODELPATH/*.model_??? $MODELPATH/params.pickle $ALLWORDS $WORDLIST $EXVECTORS
predict $CLEANTEST $BARCHYBRID $MODELPATH/params.pickle $ALLWORDS $WORDLIST $EXVECTORS
# copy the result to the output file
#cp $TMPDIR/result-deproj-reinsert.conllu $OUTPATH/$LANGUE.output.conllu
cp $TMPDIR/result-deproj-reinsert.conllu $OUTFILE
# evaluation for testing
#$PYSCRIPTROOT/evaluation_script/conll17_ud_eval.py --weights $PYSCRIPTROOT/evaluation_script/weights.clas $TEST $TMPDIR/result-deproj-reinsert.conllu
# clean up
rm -rf $TMPDIR