-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathproject-ner.sh
68 lines (61 loc) · 2.1 KB
/
project-ner.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/bin/bash
#
# Label-projection pipeline driver. In order, the script:
#   1. extracts the source-language text for the "wikiann" task,
#   2. translates it with the Helsinki-NLP opus-mt model for src->tgt,
#   3. concatenates source and translation into one bitext file,
#   4. word-aligns the bitext,
#   5. projects the labels through the alignment.
# Steps 1-4 are skipped when their output file already exists.
#
# Usage: project-ner.sh [split] [tgt] [encoder] [align_layer] [align_system]
#   split         dataset split to process                 (default: train)
#   tgt           target language code                     (default: ar)
#   encoder       value passed as --model_name_or_path     (default: bert-base-multilingual-cased)
#   align_layer   value passed as --align_layer            (default: 8)
#   align_system  aligner tag embedded in output file names (default: mbert_l8)

# Stop at the first failing command, unset variable, or failed pipeline stage,
# so a later step never runs on missing or partial input.
set -euo pipefail

src="en"
split=${1:-"train"}
tgt=${2:-"ar"}
encoder=${3:-"bert-base-multilingual-cased"}
align_layer=${4:-8}
align_system=${5:-"mbert_l8"}
# Translation-system tag embedded in intermediate and final file names.
mt_system="helsinki_opus"
# Passed to the aligner as --max_len.
max_len=500
# Directory holding the intermediate output of every step.
DIR="intermediary/ner-wiki"
# Directory where the final projection output is written.
FINAL_DIR="projection/ner-wiki"
mkdir -p -- "$DIR" "$FINAL_DIR"
# Step 1: extract the source-language text for the split (skipped if cached).
src_text="$DIR/$src.$split.text"
if [[ -f "$src_text" ]]; then
  echo "$src_text exists."
else
  # Write to a scratch file and rename only on success. A direct redirect
  # creates the target even when the command fails, and the -f check above
  # would then accept that empty/partial file as a finished result next run.
  if python scripts/extract-text.py \
    --task wikiann \
    --path /bigdata/dataset/ner-wiki \
    --lang "$src" \
    --split "$split" \
    >"${src_text}.partial"; then
    mv -- "${src_text}.partial" "$src_text"
  else
    rm -f -- "${src_text}.partial"
    echo "extract-text.py failed; $src_text was not created." >&2
    exit 1
  fi
fi
# Step 2: translate the extracted source text into the target language
# (skipped if cached).
mt_text="$DIR/$src.to_$tgt.$mt_system.$split.text"
if [[ -f "$mt_text" ]]; then
  echo "$mt_text exists."
else
  # Write to a scratch file and rename only on success. A direct redirect
  # creates the target even when the command fails, and the -f check above
  # would then accept that empty/partial file as a finished result next run.
  if python scripts/translate.py \
    --infile "$DIR/$src.$split.text" \
    --model_name "Helsinki-NLP/opus-mt-$src-$tgt" \
    --src "$src" \
    --tgt "$tgt" \
    >"${mt_text}.partial"; then
    mv -- "${mt_text}.partial" "$mt_text"
  else
    rm -f -- "${mt_text}.partial"
    echo "translate.py failed; $mt_text was not created." >&2
    exit 1
  fi
fi
# Step 3: combine the source text and its translation into one bitext file
# (skipped if cached).
bitext="$DIR/$src.and_$tgt.$mt_system.$split.text"
if [[ -f "$bitext" ]]; then
  echo "$bitext exists."
else
  # Write to a scratch file and rename only on success. A direct redirect
  # creates the target even when the command fails, and the -f check above
  # would then accept that empty/partial file as a finished result next run.
  if python scripts/bitext-concat.py \
    --src_fp "$DIR/$src.$split.text" \
    --tgt_fp "$DIR/$src.to_$tgt.$mt_system.$split.text" \
    >"${bitext}.partial"; then
    mv -- "${bitext}.partial" "$bitext"
  else
    rm -f -- "${bitext}.partial"
    echo "bitext-concat.py failed; $bitext was not created." >&2
    exit 1
  fi
fi
# Step 4: word-align the bitext (skipped if an alignment file is cached).
align_out="$DIR/$src.and_$tgt.$mt_system.$align_system.$split.align"
if [[ -f "$align_out" ]]; then
  echo "$align_out exists."
else
  # If the aligner fails, drop whatever it wrote and abort; otherwise the -f
  # check above would reuse a partial alignment on the next run.
  # NOTE(review): assumes the aligner's only artifact is --output_file; confirm.
  if ! python scripts/awesome-align.py \
    --data_file "$DIR/$src.and_$tgt.$mt_system.$split.text" \
    --align_layer "$align_layer" \
    --model_name_or_path "$encoder" \
    --max_len "$max_len" \
    --output_file "$align_out"; then
    rm -f -- "$align_out"
    echo "awesome-align.py failed; $align_out was removed." >&2
    exit 1
  fi
fi
# Step 5: project the labels through the alignment. This step always runs;
# as the last command, its exit status is the script's exit status.
# Every expansion is quoted so a value containing spaces or glob characters
# stays a single argument.
project_args=(
  --task wikiann
  --path /bigdata/dataset/ner-wiki
  --lang "$src"
  --split "$split"
  --bitext "$DIR/$src.and_$tgt.$mt_system.$split.text"
  --alignment "$DIR/$src.and_$tgt.$mt_system.$align_system.$split.align"
  --output_path "$FINAL_DIR"
  --name "$tgt.from_$src.$mt_system.$align_system"
)
python scripts/project-label.py "${project_args[@]}"