-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathrun.sh
executable file
·268 lines (206 loc) · 10.6 KB
/
run.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
#!/bin/bash
home=$PWD
# This file is a runbook rather than a fire-and-forget script: several steps
# take quite long, so copy & paste the individual steps as needed.
###########################
# 0: Setting up directories
###########################
# PubMed: move the previous data directory aside under today's date (DDMMYYYY)
# and create the directory tree used by the PubMed steps.
echo '0: Creating directories, backing up old data'
today=$(date +'%d%m%Y')
mv data "data.${today}"
mkdir data data/{ids/,oger/,biobert/,harmonised/,merged,merged/brat/,public/,public/txt}
# For PMC: same backup step, followed by the PMC-specific directory tree.
echo '0: Creating directories, backing up old data'
today=$(date +'%d%m%Y')
mv data "data.${today}"
mkdir data data/{ids,oger_pmc/,biobert_pmc/,harmonised_pmc/,harmonised_json,pubannotation_pmc/,merged_pmc,merged_pmc/brat/,public/,public/txt}
################
# 1: Getting IDs
################
echo '1: Downloading PMIDs'
python -c 'import covid; covid.get_pmids()'
# Keep only the PMIDs that are new since the previous run: with the new-line
# and unchanged-line formats emptied, diff prints just the lines that occur
# only in the first file (this run's list).
# CHANGEME must hold the date suffix of the previous run's backup directory
# (data.<suffix>). The :? guard makes the command fail loudly when it is unset
# instead of diffing against "data./ids/all_pmids.txt" and silently leaving an
# empty pmids.txt behind.
diff --new-line-format="" --unchanged-line-format="" \
  data/ids/all_pmids.txt \
  "data.${CHANGEME:?set CHANGEME to the date suffix of the previous data.<date> backup}/ids/all_pmids.txt" \
  > data/ids/pmids.txt
# For PMC, use pmcods_to_txt() from covid.py:
# place the pmcids.txt file from the last run's data/ids into the current
# data/ids and rename it to old_pmcids.txt. Place the new .ods into the new
# data/ids, too (and adapt the file name below accordingly).
python -c 'import covid; covid.pmcods_to_txt(inpath="data/ids/PMID-PMCID_02092020.ods")'
#################
# 2: RUNNING OGER
#################
# Expansions are quoted so that a checkout path containing whitespace does not
# break the cd or the config/output arguments.
cd "$home/oger"
# During this step, OGER will produce errors for some IDs. Note these manually
# in bad_pmids.txt so that they are not downloaded again.
# PubMed: one vocabulary after the other in the foreground, each run timed.
for value in CHEBI CL GO_BP GO_CC GO_MF MOP NCBITaxon PR SO UBERON
do
  echo '2: Running OGER for' "$value"
  time oger run -s config/common.ini "config/$value.ini" -o "../data/oger/$value"
  echo ''
done
# PMC: every vocabulary is started in its own detached screen session (named
# after the vocabulary), so this loop returns before the OGER runs finish.
for value in CHEBI CL GO_BP GO_CC GO_MF MOP NCBITaxon PR SO UBERON
do
  echo '2: Running OGER for' "$value"
  screen -S "$value" -dm oger run -s config/common_pmc.ini "config/$value.ini" -o "../data/oger_pmc/$value"
  echo ''
done
# 2: data housekeeping
#######################################
# Keeps the most recently modified .conll of every vocabulary as
# <dir>/<vocabulary>.conll and then removes the per-vocabulary directory.
# A directory is only removed after its .conll was copied successfully, so a
# failed or empty OGER run does not lose its output.
# Arguments: $1 - OGER output directory (e.g. ../data/oger)
# Outputs:   a message on stderr for every vocabulary that was left untouched
#######################################
keep_latest_conll() {
  local outdir=$1 value collection
  for value in CHEBI CL GO_BP GO_CC GO_MF MOP NCBITaxon PR SO UBERON
  do
    # newest file first; only the first line of the listing is used
    collection=$(ls -t "$outdir/$value"/*.conll | head -n1)
    if [ -n "$collection" ] && cp "$collection" "$outdir/$value.conll"
    then
      rm -r "$outdir/$value"
    else
      echo "2: no .conll copied for $value, keeping $outdir/$value" >&2
    fi
  done
}
# The BioC JSON files are necessary for the later merge. They have to be
# copied before the housekeeping removes the CHEBI directories.
# Each cp fails if there is more than one matching file in the CHEBI directory.
cp ../data/oger/CHEBI/*.bioc_j collection.bioc_json
keep_latest_conll ../data/oger
cp ../data/oger_pmc/CHEBI/*.bioc_j collection_pmc.bioc_json
keep_latest_conll ../data/oger_pmc
####################
# 3: RUNNING BIOBERT
####################
# If you take note of the number of predictions written in the preprocessing
# step, you can get an idea of progress in the actual processing step, which
# tends to take quite long.
# Expansions are quoted so that paths or host names containing whitespace are
# passed as a single argument.
cd "$home/biobert"
echo '3.1: Preprocessing for BB'
# PubMed
time python3 biobert_predict.py \
  --do_preprocess=true \
  --input_text=../data/oger/CHEBI.conll \
  --tf_record=../data/biobert.tf_record \
  --vocab_file=common/vocab.txt
# PMC
time python3 biobert_predict.py \
  --do_preprocess=true \
  --input_text=../data/oger_pmc/CHEBI.conll \
  --tf_record=../data/biobert_pmc.tf_record \
  --vocab_file=common/vocab.txt
# refer to the readme.md for more information
# to restrict CPU usage, change variables in tf_threads.py
# NOTE(review): "1 2 3 ..." looks like a placeholder for the real host names;
# replace it before running. Every host executes its own run_bb_<host>.sh,
# which is fed to a remote bash via stdin.
cd "$home"
for SERVER in 1 2 3 ...
do
  echo '3.2: Launching BB screens'
  ssh "$SERVER" 'bash -s' < "run_bb_$SERVER.sh"
done
# PMC
cd "$home"
for SERVER in 1 2 3 ...
do
  ssh "$SERVER" 'bash -s' < "run_bb_pmc_$SERVER.sh"
done
# 3: data house keeping
#######################################
# Flattens the BioBERT output of one collection: for every vocabulary and for
# both model kinds (spans, ids) the label file is moved from
# <dir>/<vocabulary>-<kind>/<labels> to <dir>/<vocabulary>-<kind>.labels and
# the model directory is removed afterwards.
# The directory is only removed when the move succeeded, so a missing label
# file does not cost the rest of that directory.
# Arguments: $1 - prediction directory (biobert or biobert_pmc)
#            $2 - name of the label file inside each model directory
#######################################
flatten_biobert_labels() {
  local dir=$1 labels=$2 v s
  for v in CHEBI CL GO_BP GO_CC GO_MF MOP NCBITaxon PR SO UBERON
  do
    for s in spans ids
    do
      mv "$dir/$v-$s/$labels" "$dir/$v-$s.labels" && rm -r "$dir/$v-$s"
    done
  done
}
# PubMed: back up the raw predictions first, then flatten them.
cd "$home/data"
echo '3: Moving BB files'
cp -r biobert biobert.bkp
flatten_biobert_labels biobert biobert.labels
# PMC: same procedure. The original removed "biobert_pmc$v-$s" (missing "/"),
# which never matched the model directories and left them in place.
cd "$home/data"
echo '3: Moving BB files'
cp -r biobert_pmc biobert_pmc.bkp
flatten_biobert_labels biobert_pmc biobert_pmc.labels
################
# 4: HARMONISING
################
cd "$home"
# The original loop ("for v k in ${(kv)vocabularies}") is zsh-only syntax and
# a parse error under the bash named in the shebang. The vocabulary -> mode
# table below works in both shells.
#######################################
# Prints the harmonisation mode (the -m argument of harmonise.py) that is
# configured for a vocabulary.
# Arguments: $1 - vocabulary name
# Outputs:   the mode on stdout; a message on stderr for unknown vocabularies
# Returns:   0 on success, 1 for an unknown vocabulary
#######################################
harmonise_mode() {
  case "$1" in
    CHEBI|CL|GO_BP|GO_CC|GO_MF|MOP|SO|UBERON) echo spans-first ;;
    NCBITaxon) echo ids-first ;;
    PR) echo spans-only ;;
    *) echo "4: unknown vocabulary: $1" >&2; return 1 ;;
  esac
}
#######################################
# Runs harmonise.py once per vocabulary for one collection.
# Arguments: $1 - suffix of the data directories/files:
#                 '' for PubMed, '_pmc' for PMC
#######################################
harmonise_collection() {
  local suffix=$1 v k
  for v in CHEBI CL GO_BP GO_CC GO_MF MOP NCBITaxon PR SO UBERON
  do
    k=$(harmonise_mode "$v") || continue
    echo '4: Harmonising' "$v"
    python harmonise.py \
      -t "data/harmonised${suffix}/${v}.conll" \
      -o "data/oger${suffix}/${v}.conll" \
      -b "data/biobert${suffix}.tokens" \
      -i "data/biobert${suffix}/${v}-ids.labels" \
      -s "data/biobert${suffix}/${v}-spans.labels" \
      -m "$k"
  done
}
# PubMed
harmonise_collection ''
# PMC
harmonise_collection _pmc
#################################
# 5: MERGING and COVID-ANNOTATION
#################################
echo '5: Merging'
# Quoted so that a checkout path containing whitespace does not break the cd;
# everything below uses paths relative to the oger directory.
cd "$home/oger"
# PubMed: the harmonised CHEBI file is copied in as collection.conll for the
# merge runs. Each run's output is renamed right after the run; presumably the
# runs would otherwise overwrite each other's collection.json.
cp ../data/harmonised/CHEBI.conll collection.conll
oger run -s oger-settings-all.ini
mv ../data/merged/collection.json ../data/merged/collection.bioc.json
oger run -s oger-settings-pubannotation.ini
mv ../data/merged/collection.json ../data/merged/collection.pubannotation.json
mv ../data/merged/collection.tgz ../data/merged/collection.pubannotation.tgz
# The EuroPMC export is collected from merged-eupmc, which is removed once its
# three files have been moved over.
oger run -s oger-settings-eupmc.ini
mv merged-eupmc/collection.conll ../data/merged/collection.europmc.conll
mv merged-eupmc/collection.json ../data/merged/collection.europmc.json
mv merged-eupmc/collection.zip ../data/merged/collection.europmc.zip
rm -r merged-eupmc
# PMC
cp ../data/harmonised_pmc/CHEBI.conll collection_pmc.conll
oger run -s oger-pmc-settings.ini
mv ../data/merged_pmc/collection_pmc.json ../data/merged_pmc/collection_pmc.bioc.json
oger run -s oger-pmc-settings-pubannotation.ini
mv ../data/merged_pmc/collection_pmc.json ../data/merged_pmc/collection_pmc.pubannotation.json
# clean up
# NOTE(review): collection.bioc_json (the PubMed counterpart created in step 2)
# is not removed here - confirm whether that is intentional.
rm collection.conll collection_pmc.conll collection_pmc.bioc_json
#################
# 6: DISTRIBUTION
#################
echo '6: Splitting, .tgz-ing and moving to DL directories'
# Quoted so that a checkout path containing whitespace does not break the cd;
# the data/ paths below are relative to the pipeline root.
cd "$home"
# 6.0 backing up
# 6.1 PUBANNOTATION / PUBMED
# Possibly, update PA collection with data/ids/pmids.txt or pmcids.txt first
# Upload this to PubAnnotation
cp data/merged/collection.pubannotation.json data/collection.pubannotation.json
# 6.1 PUBANNOTATION / PMC
python -c 'import covid; covid.conll_collection_to_jsons(inpath="data/merged_pmc/collection_pmc.conll",outpath="data/pubannotation_pmc",sourcedb="PMC")'
tar -czvf data/pubannotation_pmc.tgz data/pubannotation_pmc/
# 6.2 BRAT / PUBMED
# Creating Brat files and adding new files to directory.
# The served LitCovid directory is copied to a dated (DDMMYYYY) backup before
# the new files are added to it.
python -c 'import covid; covid.bioc_to_brat()'
cp -r /mnt/shared/apaches/transfer/brat/brat_ontogene/data/LitCovid "/mnt/shared/apaches/transfer/brat/brat_ontogene/data/LitCovid.$(date +'%d%m%Y')"
cp data/merged/brat/* /mnt/shared/apaches/transfer/brat/brat_ontogene/data/LitCovid
# 6.2 BRAT / PMC
python -c 'import covid; covid.bioc_to_brat(inpath="data/merged_pmc/collection_pmc.bioc.json", outpath="data/merged_pmc/brat")'
cp -r /mnt/shared/apaches/transfer/brat/brat_ontogene/data/LitCovidPMC "/mnt/shared/apaches/transfer/brat/brat_ontogene/data/LitCovidPMC.$(date +'%d%m%Y')"
cp data/merged_pmc/brat/* /mnt/shared/apaches/transfer/brat/brat_ontogene/data/LitCovidPMC
# 6.3 File downloads : BioC / PubMed
# Root of the public download area and today's date stamp (DDMMYYYY) used in
# the published file names.
web_root=/mnt/storage/clfiles/projects/clresources/pub.cl.uzh.ch/public/https/projects/COVID19
stamp=$(date +'%d%m%Y')
# The dated BioC file is added to the litcovid19.bioc directory, which is then
# packed as a whole.
cp data/merged/collection.bioc.json data/public/litcovid19.bioc.json
cp data/public/litcovid19.bioc.json "${web_root}/LitCovid/litcovid19.bioc/litcovid19.${stamp}.bioc.json"
tar -czvf "${web_root}/LitCovid/litcovid19.bioc.json.tgz" "${web_root}/LitCovid/litcovid19.bioc"
# 6.3 File downloads : BioC / PMC
# LitCovid-PMC is a directory (files are copied into it below), so the backup
# needs -r: a plain cp skips directories and no backup would be made.
cp -r "${web_root}/LitCovid-PMC" "${web_root}/LitCovid-PMC.${stamp}"
cp data/merged_pmc/collection_pmc.bioc.json data/public/covid19lit-pmc.bioc.json
cp data/public/covid19lit-pmc.bioc.json "${web_root}/LitCovid-PMC/covid19lit-pmc.bioc.json/covid19lit-pmc.${stamp}.bioc.json"
tar -czvf "${web_root}/LitCovid-PMC/covid19lit-pmc.bioc.json.tgz" "${web_root}/LitCovid-PMC/covid19lit-pmc.bioc.json"
# 6.4 File downloads : TSV / PubMed
# Public download directories of the two collections.
pubmed_web=/mnt/storage/clfiles/projects/clresources/pub.cl.uzh.ch/public/https/projects/COVID19/LitCovid
pmc_web=/mnt/storage/clfiles/projects/clresources/pub.cl.uzh.ch/public/https/projects/COVID19/LitCovid-PMC
# This run's TSV is appended to the published TSV, which is then re-packed.
cp data/merged/collection.tsv data/public/litcovid19.tsv
cat data/public/litcovid19.tsv >> "${pubmed_web}/litcovid19.tsv"
tar -czvf "${pubmed_web}/litcovid19.tsv.tgz" "${pubmed_web}/litcovid19.tsv"
# 6.4 File downloads: TSV / PMC
cp data/merged_pmc/collection_pmc.tsv data/public/covid19lit-pmc.tsv
cat data/public/covid19lit-pmc.tsv >> "${pmc_web}/covid19lit-pmc.tsv"
tar -czvf "${pmc_web}/covid19lit-pmc.tsv.tgz" "${pmc_web}/covid19lit-pmc.tsv"
# 6.5 File downloads: TXT / PubMed
# The freshly exported text files are added to the published directory, which
# is then re-packed.
python -c 'import covid; covid.conll_collection_to_txts()'
cp data/public/txt/* "${pubmed_web}/litcovid19.txt"
tar -czvf "${pubmed_web}/litcovid19.txt.tgz" "${pubmed_web}/litcovid19.txt"
# 6.5 File downloads: TXT / PMC
# NOTE(review): the PMC export writes to data/public/txt, the directory the
# PubMed files were just copied from. If the PubMed export also wrote there
# (its default output path is not visible here), the cp below publishes both
# sets into the PMC directory - confirm, or empty data/public/txt in between.
python -c 'import covid; covid.conll_collection_to_txts(inpath="data/merged_pmc/collection_pmc.conll",outpath="data/public/txt")'
cp data/public/txt/* "${pmc_web}/covid19lit-pmc.txt"
tar -czvf "${pmc_web}/covid19lit-pmc.txt.tgz" "${pmc_web}/covid19lit-pmc.txt"
# Verify for EuroPMC
python -c 'import covid; covid.get_naked_conll()'