This repository has been archived by the owner on Feb 22, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 137
/
prepare.sh
executable file
·97 lines (71 loc) · 2.77 KB
/
prepare.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/bin/bash
#+------------------------------------------------------------------------------------------------------------------------------+
#| Idio Wiki2Vec | |
#+------------------------------------------------------------------------------------------------------------------------------+
# Creates Wiki2Vec corpora out of a wikipedia dump
# $1 Locale (en_US)
# $2 Target Folder( Output Folder)
# $3 Stemmer
# Version of the wiki2vec assembly jar produced by `sbt assembly`.
readonly WIKI2VEC_VERSION="1.0"

# Print a short help text describing how to invoke this script.
usage()
{
  printf '%s\n' "prepare.sh"
  printf '%s\n' "usage: ./prepare.sh en_US /data/word2vec/ [StemmerLanguage]"
  printf '%s\n' "Creates a wikipedia corpus which can be fed into word2vec creation tools"
}
# Drop any options consumed by getopts (OPTIND defaults to 1, so this
# is a no-op when no option parsing has happened).
shift $((OPTIND - 1))

# Require at least the locale ($1) and the target directory ($2).
# BUG FIX: the original wrote `[ $# < 2 ]`, where `<` is an input
# redirection from a file literally named "2" — not a numeric
# comparison — so the guard never fired. `-lt` is the correct operator.
if [ "$#" -lt 2 ]
then
    usage
    exit 1   # non-zero: wrong invocation is an error, not success
fi
# --- Derived paths and settings ----------------------------------------
BASE_DIR=$(pwd)
TARGET_DIR="$2"
# Locale -> bare language code: "en_US" -> "en". Parameter expansion
# replaces the original `echo | sed` subshell (same result: strip
# everything from the first '_').
LANGUAGE="${1%%_*}"
WDIR="$BASE_DIR/working"
SPARK_PATH="$WDIR/spark-1.2.0-bin-hadoop2.4"
JAR_PATH="$BASE_DIR/target/scala-2.10/wiki2vec-assembly-${WIKI2VEC_VERSION}.jar"
READABLEWIKI="$TARGET_DIR/${LANGUAGE}wiki-latest.lines"
SPLIT_OUTPUT_CORPUS="$WDIR/${LANGUAGE}wiki"
OUTPUTCORPUS="$TARGET_DIR/${LANGUAGE}wiki.corpus"

# Stemmer language defaults to the wiki language unless overridden ($3).
STEMMERNAME="${3:-$LANGUAGE}"

echo "Language: $LANGUAGE"
echo "Working directory: $WDIR"
echo "Language stemmer: $STEMMERNAME"
# --- System provisioning (Debian/Ubuntu, requires root) ----------------
apt-get update

# Oracle Java 7 via the webupd8team PPA. `-y` skips the interactive
# "press ENTER to continue" prompt so the script can run unattended.
add-apt-repository -y ppa:webupd8team/java

# SBT from the Bintray Debian repository.
echo "deb http://dl.bintray.com/sbt/debian /" | tee -a /etc/apt/sources.list.d/sbt.list
apt-get update
# `-y` answers the install confirmation automatically (unattended run).
apt-get install -y unzip oracle-java7-installer sbt

# Working directories; quoted so paths containing spaces survive.
mkdir -p "$WDIR"
mkdir -p "$SPLIT_OUTPUT_CORPUS"
# Abort rather than download into the wrong directory if cd fails.
cd "$WDIR" || exit 1
# --- Fetch inputs and build --------------------------------------------
echo "Downloading Wikipedia Dump"
# `-f` makes curl exit non-zero on HTTP errors (404/5xx) instead of
# silently saving an HTML error page as the "dump"; abort on failure.
curl -f -L -O "http://dumps.wikimedia.org/${LANGUAGE}wiki/latest/${LANGUAGE}wiki-latest-pages-articles-multistream.xml.bz2" || exit 1
WIKIPEDIA_PATH="$WDIR/${LANGUAGE}wiki-latest-pages-articles-multistream.xml.bz2"

echo "Downloading Apache Spark"
# NOTE: without pipefail only tar's status is checked here; tar will
# still fail on a truncated/invalid archive, which is what we catch.
curl -f "http://d3kbcqa49mib13.cloudfront.net/spark-1.2.0-bin-hadoop2.4.tgz" | tar xvz || exit 1

# Build the wiki2vec assembly jar referenced by $JAR_PATH.
echo "Compiling wiki2vec..."
cd "$BASE_DIR" || exit 1
sbt assembly
# --- Corpus generation -------------------------------------------------
# Convert the compressed XML dump into a line-per-article "readable" wiki.
echo "Creating Readable Wiki.."
java -Xmx10G -Xms10G -cp "$JAR_PATH" org.idio.wikipedia.dumps.CreateReadableWiki "$WIKIPEDIA_PATH" "$READABLEWIKI"

# Run the Spark job that emits the word2vec training corpus as part-* files.
echo "Creating Word2vec Corpus"
"$SPARK_PATH/bin/spark-submit" --driver-memory 15g --num-executors 4 --class org.idio.wikipedia.word2vec.Word2VecCorpus "$JAR_PATH" "$READABLEWIKI" "$BASE_DIR/fakePathToRedirect/file.nt" "$SPLIT_OUTPUT_CORPUS" "$STEMMERNAME"

# Join the Spark part-files into a single corpus file.
echo "Joining corpus.."
cd "$SPLIT_OUTPUT_CORPUS" || exit 1
# BUG FIX: use '>' (truncate) instead of '>>' (append) so re-running the
# script does not concatenate a duplicate corpus onto a leftover file.
cat part* > "$OUTPUTCORPUS"

echo "fixing up punctutation in final corpus"
cd "$BASE_DIR" || exit 1
python resources/fix_corpus.py "$OUTPUTCORPUS" "${OUTPUTCORPUS}.fixed"

echo " ^___^ corpus : ${OUTPUTCORPUS}.fixed"