-
Notifications
You must be signed in to change notification settings - Fork 19
/
preprocessing.sh
executable file
·79 lines (66 loc) · 2.08 KB
/
preprocessing.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#!/bin/bash
# Preprocessing script for AMR data
# For preprocessing unaligned AMR annotations, use: ./preprocessing.sh <file>
# For preprocessing AMR annotations aligned with JAMR (or another aligner that generates similar output), use: ./preprocessing.sh -a <file>
# For preprocessing English sentences (parsing only), use: ./preprocessing.sh -s <file>
# Locations of the external tools used by this script.
#   JAMR      - root directory of the JAMR installation. When empty, JAMR's
#               configuration is not loaded here (the aligner step further
#               down is guarded on the same variable).
#   TOKENIZER - tokenizer executable used for plain-sentence (-s) input.
#   CORENLP   - directory whose contents are put on the Java classpath when
#               CoreNLP is launched.
# TOKENIZER and CORENLP are relative paths, so they are resolved against the
# directory the script is started from.
JAMR="/disk/ocean/public/tools/jamr2016"
TOKENIZER="cdec-master/corpus/tokenize-anything.sh"
CORENLP="stanford-corenlp-full-2015-12-09/"

# Load JAMR's environment settings. The path is quoted so that an install
# location containing spaces or glob characters is not split or expanded.
if [[ -n "$JAMR" ]]; then
  if [[ -r "$JAMR/scripts/config.sh" ]]; then
    # shellcheck source=/dev/null
    source "$JAMR/scripts/config.sh"
  else
    # Not fatal at this point: JAMR is only needed for unaligned AMR input.
    echo "Warning: cannot read JAMR config '$JAMR/scripts/config.sh'" >&2
  fi
fi
# Parse the leading command-line flags.
#   -a | --aligned : sets ALIGNED=1 (input already contains alignments)
#   -s | --sents   : sets SENTS=1   (input is plain sentences)
# The loop stops while one argument is still left, so the final positional
# argument is never interpreted as a flag; it is the input file.
ALIGNED="0"
SENTS="0"
while [[ $# -gt 1 ]]; do
  case "$1" in
    -a|--aligned)
      ALIGNED="1"
      ;;
    -s|--sents)
      SENTS="1"
      ;;
    *)
      # Unrecognized arguments are skipped; warn so that a mistyped flag
      # does not go unnoticed.
      echo "Warning: ignoring unrecognized argument '$1'" >&2
      ;;
  esac
  shift # move on to the next argument
done
# Exactly one positional argument (the input file) must be present at this
# point. Otherwise print the usage line on stderr and exit with a non-zero
# status so that calling scripts can detect the misuse.
if [[ "$#" -ne 1 ]]; then
  echo "Usage: preprocessing.sh [-a|--aligned] [-s|--sents] AMR_annotation_file" >&2
  exit 1
fi
# Produce "$1.sentences" (and, for AMR input, "$1.graphs" and
# "$1.alignments") from the input file given as $1.
#   SENTS=1            : tokenize plain sentences with $TOKENIZER.
#   SENTS=0, ALIGNED=0 : run the JAMR aligner first, then extract from its output.
#   SENTS=0, ALIGNED=1 : extract directly from the already aligned input.

# Fail early with one clear message instead of a cascade of tool errors.
if [[ ! -r "$1" ]]; then
  echo "Cannot read input file: $1" >&2
  exit 1
fi

# Directory containing the input file; used later as the CoreNLP output directory.
workdir=$(dirname "$1")

if [[ "$SENTS" -eq 1 ]]; then
  # Plain sentences: tokenize and split "cannot" into "can not" on lines
  # that start with "# ::".
  "${TOKENIZER}" < "$1" | sed -E 's/(^# ::.*)cannot/\1can not/g' > "$1.sentences"
else
  echo "Extracting AMR graphs.."
  # Everything that is not a '#' metadata line belongs to a graph.
  grep -v '^#' "$1" > "$1.graphs"

  if [[ "$ALIGNED" -eq 0 ]]; then
    # Unaligned input needs JAMR; without it there is no "$1.tmp" to read
    # below, so stop here rather than continuing with missing files.
    if [[ -z "$JAMR" ]]; then
      echo "JAMR path not specified" >&2
      exit 1
    fi

    echo "Running JAMR aligner.."
    # shellcheck source=/dev/null
    source "$JAMR/scripts/config.sh"
    # The aligner is fed a copy in which "cannot" is split into "can not"
    # on "# ::" metadata lines.
    sed -E 's/(^# ::.*)cannot/\1can not/g' "$1" > "$1.jamr"
    "$JAMR/scripts/ALIGN.sh" < "$1.jamr" > "$1.tmp"
    rm "$1.jamr"

    echo "Extracting tokenized sentences and alignments.."
    # Keep only alignment lines written by the aligner, strip the line
    # prefix and cut everything from the first ':' onwards.
    grep '# ::alignments ' "$1.tmp" \
      | grep '::annotator Aligner' \
      | sed 's/^# ::alignments //' \
      | cut -d":" -f1 > "$1.alignments"
    grep '# ::tok ' "$1.tmp" | sed 's/^# ::tok //' > "$1.sentences"
    rm "$1.tmp"
  else
    echo "Extracting tokenized sentences and alignments.."
    # Same extraction as above, applied directly to the aligned input
    # (no filter on the annotator name here).
    grep '# ::alignments ' "$1" \
      | sed 's/^# ::alignments //' \
      | cut -d":" -f1 > "$1.alignments"
    grep '# ::tok ' "$1" | sed 's/^# ::tok //' > "$1.sentences"
  fi
fi
# Run Stanford CoreNLP on the tokenized sentences in "$1.sentences".
# Pipeline settings come from "corenlp.properties" (looked up relative to the
# current directory); text output is written into "$workdir".
# If java exits with a non-zero status, report it on stderr and exit with the
# same status, so "Done!" is only printed after a successful run.
echo "Running CoreNLP.."
java -mx6g -cp "$CORENLP/*" edu.stanford.nlp.pipeline.StanfordCoreNLP \
  -props "corenlp.properties" -file "$1.sentences" \
  --outputFormat text -replaceExtension --outputDirectory "$workdir" || {
  status=$?
  echo "CoreNLP failed with exit status $status" >&2
  exit "$status"
}
echo "Done!"