-
Notifications
You must be signed in to change notification settings - Fork 35
/
convert_msmarco_doc.sh
executable file
·37 lines (29 loc) · 1.09 KB
/
convert_msmarco_doc.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
#!/bin/bash
# The main script to convert the MSMARCO document collection.
#
# Reads the raw MSMARCO files found under "$src" and writes the converted
# queries, documents, and QRELs into per-part sub-directories of
# "$inputDataDir" (docs, train, dev, test2019, test2020).
#
# Required variables (validated below):
#   ANSWER_FILE_JSONL    name of the converted document file (".gz" is appended)
#   QUESTION_FILE_JSONL  name of the converted query file
#   QREL_FILE            name of the QREL file placed into each part directory
#   inputDataDir         root of the output directory tree
#   src                  directory prefix of the raw MSMARCO input files
#
# NOTE(review): these variables are presumably defined by common_conv.sh
# and/or the calling environment -- confirm against that file.

# Enable exit-on-error here rather than on the shebang line, so that it is
# still in effect when the script is started as "bash convert_msmarco_doc.sh".
set -e

source ./data_convert/common_conv.sh

checkVarNonEmpty "ANSWER_FILE_JSONL"
checkVarNonEmpty "QUESTION_FILE_JSONL"
checkVarNonEmpty "inputDataDir"
checkVarNonEmpty "QREL_FILE"
# Every input path below is prefixed with $src, so refuse to run without it.
checkVarNonEmpty "src"

# Extra converter options, kept as an array so they expand safely when quoted.
BERT_TOK_OPT=(--bert_tokenize)

# Create one output directory per collection part.
for part in docs train dev test2019 test2020; do
  mkdir -p "$inputDataDir/$part"
done

# Test queries (these inputs are plain, uncompressed TSV files).
for year in 2019 2020; do
  python -u ./data_convert/msmarco/convert_queries.py \
    "${BERT_TOK_OPT[@]}" \
    --input "$src/msmarco-test${year}-queries.tsv" \
    --output "$inputDataDir/test${year}/$QUESTION_FILE_JSONL"
done

# The document collection itself.
python -u ./data_convert/msmarco/convert_docs.py \
  "${BERT_TOK_OPT[@]}" \
  --input "$src/msmarco-docs.tsv.gz" \
  --output "$inputDataDir/docs/${ANSWER_FILE_JSONL}.gz"

# Train/dev: decompress the QRELs into place and convert the queries.
for part in train dev; do
  zcat "$src/msmarco-doc${part}-qrels.tsv.gz" > "$inputDataDir/$part/$QREL_FILE"
  # Invoke through "python -u" like the other converter calls, so this does
  # not depend on the executable bit or shebang of convert_queries.py.
  python -u ./data_convert/msmarco/convert_queries.py \
    "${BERT_TOK_OPT[@]}" \
    --input "$src/msmarco-doc${part}-queries.tsv.gz" \
    --output "$inputDataDir/$part/$QUESTION_FILE_JSONL"
done

# Only the 2019 test QRELs are installed here; nothing is copied for test2020.
cp "$src/2019qrels-docs.txt" "$inputDataDir/test2019/$QREL_FILE"