-
Notifications
You must be signed in to change notification settings - Fork 7
/
pipe.sh
163 lines (123 loc) · 3.8 KB
/
pipe.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
############
# Crawling #
############
# Run the crawler stages in order. Each stage is a script under crawler/:
#   crawl.py        - crawl the NEJM websites
#   url_stat.py     - report the article type distribution
#   filter.py       - filter out unwanted texts
#   article_stat.py - report the length difference before and after filtering
for crawl_stage in crawl url_stat filter article_stat; do
  python3 "crawler/${crawl_stage}.py"
done
#################
# Preprocessing #
#################
# Preprocessing of the NEJM articles, in four steps. Every script invoked
# here lives under preprocess/.
preprocess_dir=preprocess
sentence_dir="${preprocess_dir}/detect_sentences"
manual_align_dir="${preprocess_dir}/manual_align"

# Step 1: lowercase the English sentences and normalize punctuation.
# NOTE(review): the earlier note for this step ended with "also remove:" and
# nothing after it, so what else gets removed is not recorded here.
bash "${preprocess_dir}/normalize.sh"

# Step 2: split paragraphs into sentences (eserix.sh, then punkt.py), then
# run sent_stat.py (presumably sentence statistics - confirm against the script).
bash "${sentence_dir}/eserix.sh"
python3 "${sentence_dir}/punkt.py"
python3 "${sentence_dir}/sent_stat.py"

# Step 3: tokenize the sentences, then produce the lowercased and truecased
# variants.
bash "${preprocess_dir}/tokenize.sh"
bash "${preprocess_dir}/lowercase.sh"
bash "${preprocess_dir}/truecase.sh"

# Step 4: manual alignment (copy.sh first, then align.sh).
bash "${manual_align_dir}/copy.sh"
bash "${manual_align_dir}/align.sh"
##################
# WMT18 baseline #
##################
# Baseline model:
#   data  - WMT18 news translation shared task
#   model - default transformer
# Trains zh -> en on the WMT18 BPE data.
# Caution carried over from the author: do not run this step here; it is
# meant to be run on GPU nodes only.
# NOTE(review): the command is neither commented out nor guarded, so executing
# this file top to bottom will still launch it.
wmt18_train_script=translation/wmt18/train.sh
bash "${wmt18_train_script}"
##############
# Evaluation #
##############
#---- Abstracts -----#
# Evaluation on the WMT19 Biomedical translation shared task data. The steps
# under evaluation/wmt19_biomed/ run in this order:
#   modifications.sh - create a manually aligned gold standard, used to
#                      evaluate bleualign with the WMT18 baseline model
#   translate.sh     - do the necessary preprocessing (segmentation,
#                      tokenization, BPE), generate sentence-to-sentence
#                      translations, and mark each sentence with doc#,# markers
#   align.sh         - align with Bleualign, Gale-Church, and Moore's IBM 1 model
#   evaluate.sh      - evaluate precision and recall for the different algorithms
for wmt19_step in modifications translate align evaluate; do
  bash "evaluation/wmt19_biomed/${wmt19_step}.sh"
done
#------ NEJM articles -------#
# Run every aligner's scripts under evaluation/nejm/align/ in order:
#   moore     - input, align (Moore's algorithm)
#   hunalign  - input, align
#   bleualign - input, translate, align
nejm_align_dir=evaluation/nejm/align
for aligner_step in \
  moore/input moore/align \
  hunalign/input hunalign/align \
  bleualign/input bleualign/translate bleualign/align; do
  bash "${nejm_align_dir}/${aligner_step}.sh"
done

# Precision and recall of the different algorithms.
bash evaluation/nejm/evaluate.sh

# Visual comparison of the precision-recall curves across methods.
python3 evaluation/nejm/vis_pr_curve.py
#####################
# Machine Alignment #
#####################
# Alignment with Moore's algorithm: run input.sh, then align.sh, both under
# alignment/moore/.
for moore_step in input align; do
  bash "alignment/moore/${moore_step}.sh"
done
#########################
# Crowdsource Alignment #
#########################
# Prepare the articles for the crowdsourced alignment step.
crowdsource_prep_script=crowdsource/prep_articles.py
python3 "${crowdsource_prep_script}"
############
# Clean up #
############
# Remove duplicate sentences with bifixer: concat.sh first, then clean.sh.
# Caution carried over from the author: don't run this here; it needs a
# docker container.
# NOTE(review): the commands are neither commented out nor guarded, so
# executing this file top to bottom will still launch them.
for clean_step in concat clean; do
  bash "clean/${clean_step}.sh"
done
##############
# Split data #
##############
# Split the data into training, dev, and test sets.
# The splitter is a Python script (.py), so it is launched with python3 like
# every other .py step in this pipeline; handing it to bash would make bash
# try to parse Python source as shell commands.
python3 split_data/split_train_test.py
###############
# Translation #
###############
# Take a subset of the data first.
python3 subset/subset.py

# The same sequence is run twice under translation/nejm/: once for the
# default scripts (empty suffix) and once for the LSTM model ("_rnn" suffix).
# Per pass:
#   finetune*     / test_finetune*  - fine-tune on the NEJM dataset, then test
#   train_denovo* / test_denovo*    - train on NEJM from scratch, then test
#   plot_bleu*                      - plot the BLEU scores
nejm_translation_dir=translation/nejm
for model_suffix in "" _rnn; do
  bash "${nejm_translation_dir}/finetune${model_suffix}.sh"
  bash "${nejm_translation_dir}/test_finetune${model_suffix}.sh"
  bash "${nejm_translation_dir}/train_denovo${model_suffix}.sh"
  bash "${nejm_translation_dir}/test_denovo${model_suffix}.sh"
  python3 "${nejm_translation_dir}/plot_bleu${model_suffix}.py"
done
#################
# Visualization #
#################
# Build table 1.
# NOTE(review): the directory in this path is spelled "visulization"; confirm
# it matches the on-disk name before correcting the spelling.
table1_script=visulization/tables/tab1/tab1.py
python3 "${table1_script}"