#!/bin/bash
set -e
#HOME_DIR=$1
export HOME_DIR=/home/demo
export PROJECT_DIR=$HOME_DIR/hdp-datascience-demo
export HDP_VER=`hdp-select status hadoop-client | sed 's/hadoop-client - \(.*\)/\1/'`
export M2_HOME=/usr/share/maven/latest
export M2=$M2_HOME/bin
export PATH=$PATH:$M2
echo 'export M2_HOME=/usr/share/maven/latest' >> ~/.bashrc
echo 'export M2=$M2_HOME/bin' >> ~/.bashrc
echo 'export PATH=$PATH:$M2' >> ~/.bashrc
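# Optional sanity check (an illustrative addition, not in the original script):
# mvn is needed much later for the ISpark build, so fail fast if Maven is not
# actually at $M2_HOME; the "|| echo" guard keeps set -e from aborting here.
mvn -version || echo "WARNING: mvn not found on PATH; check M2_HOME"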
#HDP 2.2+ specific vars that need to be set
if [ -e /usr/hdp/$HDP_VER/hadoop/bin/hdfs ]
then
export HADOOP_HOME=/usr/hdp/$HDP_VER/hadoop
export HADOOP_VERSION=2.6.0.$HDP_VER
export HDP_VERSION=`hdp-select status hadoop-client | sed 's/hadoop-client - \([0-9]\.[0-9]\).*/\1/'`
export JAVA_HOME=/usr/lib/jvm/java-1.7.0-openjdk.x86_64
#export HADOOP_HOME=/usr/hdp/current/hadoop-client
#export PYTHONPATH=/usr/lib/python2.6/site-packages
#export JAVA_HOME=/usr/lib/jvm/java-1.7.0-openjdk.x86_64
#export YARN_CONF_DIR=/etc/hadoop/conf
#export HADOOP_CONF_DIR=/etc/hadoop/conf
#source /home/demo/pyenv/bin/activate
#export SPARK_HOME=/home/demo/spark-1.2
#export HIVE_HOME=/usr/hdp/current/hive-client
else
export HADOOP_HOME=/usr/lib/hadoop
export JDK_VER=`ls /usr/jdk64/`
export JAVA_HOME=/usr/jdk64/$JDK_VER
export HDP_VERSION=2.1
fi
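# Illustrative addition: log what the branch above detected, so a failed
# hdp-select lookup is visible before the long builds below begin.
echo "Detected HDP_VER=$HDP_VER, HDP_VERSION=$HDP_VERSION, JAVA_HOME=$JAVA_HOME"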
export PYTHONPATH=/usr/lib/python2.6/site-packages
export YARN_CONF_DIR=/etc/hadoop/conf
export HADOOP_CONF_DIR=/etc/hadoop/conf
echo "export HADOOP_HOME=$HADOOP_HOME" >> $HOME_DIR/.bashrc
echo "export HADOOP_VERSION=$HADOOP_VERSION" >> $HOME_DIR/.bashrc
echo "export PYTHONPATH=$PYTHONPATH" >> $HOME_DIR/.bashrc
echo "export JAVA_HOME=$JAVA_HOME" >> $HOME_DIR/.bashrc
echo "export YARN_CONF_DIR=/etc/hadoop/conf" >> ~/.bashrc
echo "export HADOOP_CONF_DIR=/etc/hadoop/conf" >> ~/.bashrc
echo "cd $HOME_DIR" >> $HOME_DIR/.bashrc
#install SQLite
echo "Installing SQLite..."
cd $HOME_DIR
wget http://www.sqlite.org/2014/sqlite-autoconf-3080600.tar.gz
tar xvfz sqlite-autoconf-3080600.tar.gz
cd sqlite-autoconf-3080600
./configure --prefix=$HOME_DIR/.sqlite3
make && make install
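# Optional check (illustrative addition): confirm the private SQLite built and
# installed cleanly before Python is compiled against it below.
$HOME_DIR/.sqlite3/bin/sqlite3 --version || echo "WARNING: sqlite3 build check failed"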
#install python
echo "Installing Python..."
cd $HOME_DIR
wget https://www.python.org/ftp/python/2.7.8/Python-2.7.8.tar.xz
xz -d Python-2.7.8.tar.xz && tar -xvf Python-2.7.8.tar
cd Python-2.7.8
#use double quotes so $HOME_DIR expands inside the build flags
./configure --prefix=$HOME_DIR/.python LDFLAGS="-L$HOME_DIR/sqlite-autoconf-3080600/.libs" CPPFLAGS="-I$HOME_DIR/sqlite-autoconf-3080600/"
make && make altinstall
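# Optional check (illustrative addition): verify the new interpreter picked up
# the SQLite headers/libs passed to ./configure; IPython needs sqlite3.
$HOME_DIR/.python/bin/python2.7 -c "import sqlite3; print(sqlite3.sqlite_version)" \
    || echo "WARNING: python2.7 was built without sqlite3 support"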
# Install Python’s package management: easy_install and pip
echo "Installing python pip..."
cd $HOME_DIR
wget https://bitbucket.org/pypa/setuptools/raw/bootstrap/ez_setup.py
.python/bin/python2.7 ez_setup.py
.python/bin/easy_install-2.7 pip
# Install and activate VirtualEnv
echo "Installing python virtualenv..."
.python/bin/pip2.7 install virtualenv
.python/bin/virtualenv pyenv
chmod +x ./pyenv/bin/activate
source ./pyenv/bin/activate
echo "source $HOME_DIR/pyenv/bin/activate" >> $HOME_DIR/.bashrc
#Install data-science related Python packages - this can take ~10 min
echo "Installing python packages: numpy scipy pandas scikit-learn..."
pip install numpy scipy pandas scikit-learn rpy2
#pip install numpy==1.9.0
#pip install scipy==0.14.0
#pip install pandas==0.14.1
#pip install scikit-learn==0.15.1
#pip install rpy2
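# Optional smoke test (illustrative addition): a quick import catches a
# partially failed build of the compiled packages before anything depends on
# them; rpy2 is skipped because it also needs a working R installation.
python -c "import numpy, scipy, pandas, sklearn" \
    || echo "WARNING: a data-science package failed to import"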
#Install matplotlib (for graphics in Python)
cd $HOME_DIR
wget http://sourceforge.net/projects/freetype/files/freetype2/2.5.0/freetype-2.5.0.tar.gz
tar xvfz freetype-2.5.0.tar.gz
cd $HOME_DIR/freetype-2.5.0
sudo ./configure --prefix=/usr --disable-static && make
sudo make install
sudo install -v -m755 -d /usr/share/doc/freetype-2.5.0
sudo cp -v -R docs/* /usr/share/doc/freetype-2.5.0
cd $HOME_DIR
source ./pyenv/bin/activate
# Install ipython
easy_install ipython==2.3.0
sudo easy_install -U distribute
pip install matplotlib
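# Optional smoke test (illustrative addition): render a throwaway plot with the
# non-GUI Agg backend to confirm matplotlib can use the freetype just built.
python -c "import matplotlib; matplotlib.use('Agg'); import matplotlib.pyplot as plt; plt.plot([1, 2, 3]); plt.savefig('/tmp/mpl_check.png')" \
    || echo "WARNING: matplotlib smoke test failed"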
#Install Pydoop, a package that enables Hadoop access from Python.
echo "Installing pydoop..."
if [ -e /usr/hdp/$HDP_VER/hadoop/bin/hdfs ]
then
git clone https://github.com/crs4/pydoop.git
cd pydoop
#update _hadoop2_jars method in ./pydoop/hadoop_utils.py
mv $HOME_DIR/pydoop/pydoop/hadoop_utils.py $HOME_DIR/pydoop/pydoop/hadoop_utils.py.bak
cp -f $PROJECT_DIR/setup/hadoop_utils_22.py $HOME_DIR/pydoop/pydoop/hadoop_utils.py
else
wget http://sourceforge.net/projects/pydoop/files/Pydoop-0.12/pydoop-0.12.0.tar.gz
tar xzvf pydoop-0.12.0.tar.gz
cd pydoop-0.12.0
#https://github.com/ZEMUSHKA/pydoop/commit/414a2e52390a873e4766633891190ffede937d90
#vi pydoop/hadoop_utils.py
mv $HOME_DIR/pydoop-0.12.0/pydoop/hadoop_utils.py $HOME_DIR/pydoop-0.12.0/pydoop/hadoop_utils.py.bak
cp -f $PROJECT_DIR/setup/hadoop_utils.py $HOME_DIR/pydoop-0.12.0/pydoop
#https://github.com/ZEMUSHKA/pydoop/commit/e3d3378ae9921561f6c600c79364c2ad42ec206d
#vi setup.py
mv $HOME_DIR/pydoop-0.12.0/setup.py $HOME_DIR/pydoop-0.12.0/setup.py.bak
cp -f $PROJECT_DIR/setup/setup.py $HOME_DIR/pydoop-0.12.0
fi
# build PyDoop
python setup.py build
python setup.py install --skip-build
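# Optional check (illustrative addition; assumes HDFS is already running):
# pydoop wraps libhdfs, so an import plus a root listing exercises both the
# build and the cluster config. The subshell leaves the pydoop source dir so
# the local package tree does not shadow the installed one.
(cd $HOME_DIR && python -c "import pydoop.hdfs as hdfs; print(hdfs.ls('/'))") \
    || echo "WARNING: pydoop HDFS check failed"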
#Setup default IPython notebook profile
pip install tornado pyzmq ipython pygments matplotlib jinja2 jsonschema
ipython profile create default
echo "c.IPKernelApp.pylab = 'inline'" >> $HOME_DIR/.ipython/profile_default/ipython_notebook_config.py
echo "c.NotebookApp.ip = '*' " >> $HOME_DIR/.ipython/profile_default/ipython_notebook_config.py
echo "c.NotebookApp.open_browser = False" >> $HOME_DIR/.ipython/profile_default/ipython_notebook_config.py
echo "c.NotebookApp.port = 9999" >> $HOME_DIR/.ipython/profile_default/ipython_notebook_config.py
echo "c.NotebookApp.tornado_settings = {'headers': {'Content-Security-Policy': '' } }" >> $HOME_DIR/.ipython/profile_default/ipython_notebook_config.py
echo "c.NotebookApp.webapp_settings = {'headers': {'X-Frame-Options': 'ALLOW-FROM all'}}" >> $HOME_DIR/.ipython/profile_default/ipython_notebook_config.py
echo "Downloading Spark..."
cd
if [ -e /usr/hdp/$HDP_VER/hadoop/bin/hdfs ]
then
#follow instructions from 2.2 Spark TP
#wget http://public-repo-1.hortonworks.com/HDP-LABS/Projects/spark/1.2.0/spark-1.2.0.2.2.0.0-82-bin-2.6.0.2.2.0.0-2041.tgz
#tar xvfz spark-1.2.0.2.2.0.0-82-bin-2.6.0.2.2.0.0-2041.tgz
#export SPARK_HOME=$HOME_DIR/spark-1.2.0.2.2.0.0-82-bin-2.6.0.2.2.0.0-2041
#echo "export SPARK_HOME=$HOME_DIR/spark-1.2.0.2.2.0.0-82-bin-2.6.0.2.2.0.0-2041" >> ~/.bashrc
#git clone https://github.com/hortonworks/spark.git
#git clone https://github.com/hortonworks/spark-native-yarn.git
#download pre-built Spark 1.2 patched with SPARK-4923
wget https://dl.dropboxusercontent.com/u/114020/spark-1.2.zip
unzip spark-1.2.zip
export SPARK_HOME=$HOME_DIR/spark-1.2
echo "export SPARK_HOME=$HOME_DIR/spark-1.2" >> ~/.bashrc
echo "spark.driver.extraJavaOptions -Dhdp.version=$HDP_VER" >> $SPARK_HOME/conf/spark-defaults.conf
echo "spark.yarn.am.extraJavaOptions -Dhdp.version=$HDP_VER" >> $SPARK_HOME/conf/spark-defaults.conf
#set pyspark env vars
export PYSPARK_SUBMIT_ARGS="--master yarn-client"
export PYSPARK_PYTHON="python2.7"
echo 'export PYSPARK_SUBMIT_ARGS="--master yarn-client"' >> ~/.bashrc
echo 'export PYSPARK_PYTHON="python2.7"' >> ~/.bashrc
else
wget http://public-repo-1.hortonworks.com/HDP-LABS/Projects/spark/1.1.0/spark-1.1.0.2.1.5.0-702-bin-2.4.0.2.1.5.0-695.tgz
tar xvfz spark-1.1.0.2.1.5.0-702-bin-2.4.0.2.1.5.0-695.tgz
export SPARK_HOME=$HOME_DIR/spark-1.1.0.2.1.5.0-702-bin-2.4.0.2.1.5.0-695
echo "export SPARK_HOME=$HOME_DIR/spark-1.1.0.2.1.5.0-702-bin-2.4.0.2.1.5.0-695" >> ~/.bashrc
fi
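# Optional check (illustrative addition): confirm the unpacked distribution is
# usable before ISpark is built against it; guarded so set -e does not abort.
$SPARK_HOME/bin/spark-submit --version || echo "WARNING: spark-submit check failed"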
echo "Installing ISpark..."
cd $HOME_DIR
git clone https://github.com/tribbloid/ISpark
cd ISpark/
#Change spark version in ISpark/pom.xml from 1.1.1 to 1.2.1
cp pom.xml pom.xml.bak
sed -i "s/1.1.1/1.2.1/g" pom.xml
mvn package
source ../pyenv/bin/activate
ipython profile create spark
export ISPARK_CORE_ASSEMBLY=`find ~ -iname 'ispark-core-assembly*.jar'`
export SPARK_CONFIG_FILE=$HOME_DIR/.ipython/profile_spark/ipython_config.py
echo "import os" >> $SPARK_CONFIG_FILE
echo "SPARK_HOME = os.environ['SPARK_HOME']" >> $SPARK_CONFIG_FILE
echo "MASTER = 'yarn-client'" >> $SPARK_CONFIG_FILE
echo "ISPARK_CORE_ASSEMBLY = '$ISPARK_CORE_ASSEMBLY'" >> $SPARK_CONFIG_FILE
echo 'c.KernelManager.kernel_cmd = [SPARK_HOME+"/bin/spark-submit", "--master", MASTER, "--class", "org.tribbloid.ispark.Main", "--executor-memory", "2G", ISPARK_CORE_ASSEMBLY, "--profile", "{connection_file}", "--parent"]' >> $SPARK_CONFIG_FILE
echo "c.NotebookApp.ip = '*' " >> $SPARK_CONFIG_FILE
echo "c.NotebookApp.open_browser = False" >> $SPARK_CONFIG_FILE
echo "c.NotebookApp.port = 9998" >> $SPARK_CONFIG_FILE
echo "c.NotebookApp.tornado_settings = {'headers': {'Content-Security-Policy': '' } }" >> $SPARK_CONFIG_FILE
echo "c.NotebookApp.webapp_settings = {'headers': {'X-Frame-Options': 'ALLOW-FROM all'}}" >> $SPARK_CONFIG_FILE
#configure ~/.ipython/profile_default/startup/00-pyspark-setup.py
export PYSPARK_CONFIG_FILE=$HOME_DIR/.ipython/profile_default/startup/00-pyspark-setup.py
echo "import os" > $PYSPARK_CONFIG_FILE
echo "import sys" >> $PYSPARK_CONFIG_FILE
echo "import glob" >> $PYSPARK_CONFIG_FILE
echo "if not spark_home: raise ValueError('SPARK_HOME environment variable is not set')" >> $PYSPARK_CONFIG_FILE
echo "sys.path.insert(0, os.path.join(spark_home, 'python')) " >> $PYSPARK_CONFIG_FILE
echo "for lib in glob.glob(os.path.join(spark_home, 'python/lib/py4j-*-src.zip')):" >> $PYSPARK_CONFIG_FILE
echo " sys.path.insert(0, lib)" >> $PYSPARK_CONFIG_FILE
echo "execfile(os.path.join(spark_home, 'python/pyspark/shell.py'))" >> $PYSPARK_CONFIG_FILE
#Install scalding for R demo
cd
git clone https://github.com/twitter/scalding.git
cd scalding/
git checkout master
sed -i "s/my.host.here/sandbox/g" /home/demo/scalding/scripts/scald.rb
#replace /home/demo/scalding/project/Build.scala
mv /home/demo/scalding/project/Build.scala /home/demo/scalding/project/Build.scala.orig
cp $PROJECT_DIR/setup/Build.scala /home/demo/scalding/project
./sbt update
set +e
./sbt assembly
set -e
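# Optional check (illustrative addition; the jar location is an assumption
# based on typical sbt-assembly layouts): confirm an assembly jar was produced
# even though "set +e" allowed the assembly step itself to fail.
find $HOME_DIR/scalding -iname '*assembly*.jar' | head -n 1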
export HADOOP_CMD=/usr/bin/hadoop
#export HADOOP_STREAMING=/usr/lib/hadoop/contrib/streaming/hadoop-streaming.jar
echo "export HADOOP_CMD=/usr/bin/hadoop" >> ~/.bashrc
#echo "export HADOOP_STREAMING=/usr/lib/hadoop/contrib/streaming/hadoop-streaming.jar" >> ~/.bashrc
#R CMD javareconf -e
cd $HOME_DIR
rm -f *.tgz *.gz *.zip *.tar
echo 'IPython notebook, pydoop, ISpark, R/Scalding setup complete'