-
Notifications
You must be signed in to change notification settings - Fork 7
/
Dockerfile
233 lines (168 loc) · 6.26 KB
/
Dockerfile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
# Base image: Debian 8 (jessie).
FROM debian:jessie

# Keep apt/debconf from prompting during the build.
# Declared with ENV, so the value also stays in the environment of running containers.
ENV DEBIAN_FRONTEND=noninteractive

# Add contrib repository.
# Only lines that start with "deb" (deb / deb-src entries) are amended, so comment
# and blank lines in sources.list are left untouched.
RUN sed -i '/^deb/ s/$/ contrib/' /etc/apt/sources.list

# Install required software and tools.
# The package index is removed in the same layer so it does not inflate the image;
# the later Java step runs its own `apt-get update`.
RUN apt-get update && \
    apt-get -y upgrade && \
    apt-get install -yq --no-install-recommends --fix-missing \
        bzip2 \
        locales \
        tar \
        unzip \
        vim.tiny \
        wget && \
    rm -rf /var/lib/apt/lists/*
# Setup en_US locales to handle non-ASCII characters correctly
ENV LC_ALL=en_US.UTF-8 \
    LANGUAGE=en_US.UTF-8 \
    LANG=en_US.UTF-8

# Generate the en_US.UTF-8 locale selected above.
# The former `ENV dpkg-reconfigure locales` line only defined an environment
# variable named "dpkg-reconfigure" (it never ran any command), so it was dropped;
# writing /etc/locale.gen and running locale-gen is what builds the locale.
RUN echo "en_US.UTF-8 UTF-8" > /etc/locale.gen && \
    locale-gen
# Add jessie-backports repository to install JDK 1.8
# `apt-get` is used instead of `apt`, whose command-line interface is meant for
# interactive use. The package index is removed in the same layer to keep it small.
RUN echo "===> add jessie-backports repository ..." && \
    echo "deb http://ftp.debian.org/debian jessie-backports main" | tee /etc/apt/sources.list.d/openjdk-8-jdk.list && \
    apt-get update && \
    echo "===> install Java" && \
    apt-get install -t jessie-backports -yq --no-install-recommends --fix-missing \
        ca-certificates-java \
        openjdk-8-jdk \
        openjdk-8-jre-headless && \
    rm -rf /var/lib/apt/lists/*

# define default command
# NOTE: later CMD instructions in this file override this one; only the last CMD takes effect.
CMD ["java"]
# Install Tini (release selectable at build time through TINI_VERSION)
ARG TINI_VERSION=v0.13.0

# Fetch the release binary straight to its final path ...
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /usr/bin/tini

# ... and make it executable.
RUN chmod +x /usr/bin/tini
# Create vcap user with UID=1000 and in the 'users' group
ENV SHELL=/bin/bash \
    NB_USER=vcap \
    NB_UID=1000 \
    CONDA_DIR=/opt/anaconda2

# HOME refers to NB_USER, so it must be set in a separate ENV instruction:
# variables are substituted with the values from before the instruction.
ENV HOME=/home/$NB_USER

# Add the account (home directory created, no per-user group) and
# prepare the directory that will hold the conda installation.
RUN useradd \
        --create-home \
        --shell /bin/bash \
        --home-dir "$HOME" \
        --no-user-group \
        --uid "$NB_UID" \
        "$NB_USER" && \
    mkdir -p "$CONDA_DIR"
# Download and Install Miniconda
ARG CONDA_VERSION=2-4.2.12

# The installer is executed as root, so TLS certificate verification is left
# enabled (the former --no-check-certificate flag was removed).
# NOTE(review): this relies on a CA bundle being present; presumably it is pulled
# in by the ca-certificates-java package installed earlier in this file — confirm.
RUN wget -q https://repo.continuum.io/miniconda/Miniconda${CONDA_VERSION}-Linux-x86_64.sh -P $CONDA_DIR && \
    bash $CONDA_DIR/Miniconda${CONDA_VERSION}-Linux-x86_64.sh -f -b -p $CONDA_DIR && \
    rm -rf $CONDA_DIR/Miniconda${CONDA_VERSION}*x86_64.sh

# Add conda binaries to path
ENV PATH=$CONDA_DIR/bin:$PATH
# Setup vcap home directory:
# create the work, .jupyter and .local folders, then point curl at the system CA bundle.
RUN mkdir "$HOME/work" "$HOME/.jupyter" "$HOME/.local" && \
    echo "cacert=/etc/ssl/certs/ca-certificates.crt" > "$HOME/.curlrc"
# Configure container startup
EXPOSE 8888

# WORKDIR creates the directory when it does not exist, so no separate
# `RUN mkdir -p` layer is needed for it.
WORKDIR $HOME/jupyter

# Startup script and notebook server configuration
COPY assets/start-notebook.sh /usr/local/bin/
COPY assets/jupyter_notebook_config.py $HOME/.jupyter/

# Run the command under tini
ENTRYPOINT ["tini", "--"]
CMD ["start-notebook.sh"]

# Copy all files before switching users
# NOTE(review): no USER instruction is visible later in this file — confirm
# whether a user switch is still intended.
COPY assets/tapmenu/ $HOME/tapmenu
# Install Python 2 packages and kernel spec
# `conda clean` is given --yes so it never waits for confirmation during the
# build, matching the `conda clean -y --all` call used later in this file.
# NOTE(review): 'notebook=<4.2.3' uses '=<' rather than the usual '<=' comparison;
# it is left unchanged because altering it could change the version conda
# resolves — confirm the intended constraint.
RUN conda install --yes \
        'curl' \
        'lxml' \
        'notebook=<4.2.3' && \
    conda clean --yes --all
# Install and enable the tapmenu notebook extension copied into $HOME/tapmenu
RUN jupyter-nbextension install $HOME/tapmenu && \
    jupyter-nbextension enable tapmenu/main

# This logo gets displayed within our default notebooks.
# The trailing slash marks the destination as a directory, so the file is always
# placed inside images/ rather than possibly being written as a file named "images".
COPY assets/TAP-logo.png $CONDA_DIR/lib/python2.7/site-packages/notebook/static/base/images/
# Final apt cleanup: purge the python3.4 packages, drop packages that are no
# longer needed, clear the apt caches and package lists, and empty the conda caches.
RUN apt-get -y purge 'python3.4*' && \
    apt-get -y -q autoremove && \
    apt-get -y -q autoclean && \
    rm -rf /var/lib/apt/lists/* && \
    conda clean --yes --all

# Directory for notebook configuration
RUN mkdir -p "$HOME/.jupyter/nbconfig"
######### End of Jupyter Base ##########
# Install Spark dependencies
ENV APACHE_SPARK_VERSION=1.6.0

# Download the Hadoop 2.6 build of Spark into /usr/local, unpack it, delete the
# tarball and expose the unpacked tree under the stable path /usr/local/spark.
RUN spark_dist="spark-${APACHE_SPARK_VERSION}-bin-hadoop2.6" && \
    wget -q -P /usr/local "http://archive.apache.org/dist/spark/spark-${APACHE_SPARK_VERSION}/${spark_dist}.tgz" && \
    tar -xzf "/usr/local/${spark_dist}.tgz" -C /usr/local && \
    rm -rf "/usr/local/${spark_dist}.tgz" && \
    ln -s "/usr/local/${spark_dist}" /usr/local/spark
# Spark pointers
# SPARK_HOME and HADOOP_CONF_DIR are referenced by other variables, so each is
# defined in an instruction that precedes its use.
ENV SPARK_HOME=/usr/local/spark
ENV PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.9-src.zip:/opt/cloudera/parcels/CDH/lib/spark/python/lib \
    PYSPARK_PYTHON=python2.7 \
    PATH=$SPARK_HOME/bin:$PATH \
    SPARK_CONF_DIR=/etc/spark/conf \
    HADOOP_CONF_DIR=/etc/hadoop/conf
ENV YARN_CONF_DIR=$HADOOP_CONF_DIR

# Make sure both configuration directories exist
RUN mkdir -p "$SPARK_CONF_DIR" "$HADOOP_CONF_DIR"

# Cloudera config is expecting a classpath.txt, also fix some permissions:
# list everything under $SPARK_HOME/lib into classpath.txt, then create the
# application history directory and hand /user/spark over to the notebook user.
RUN ls "$SPARK_HOME"/lib/* > "$SPARK_CONF_DIR/classpath.txt" && \
    mkdir -p /user/spark/applicationHistory && \
    chown -R "$NB_USER":users /user/spark
# Fix the entry point: ship the startup script, make it executable and use it
# as the default command (this CMD replaces the ones declared earlier in the file).
COPY ./jupyter-startup.sh /usr/local/bin/jupyter-startup.sh
RUN chmod +x /usr/local/bin/jupyter-startup.sh
CMD ["/usr/local/bin/jupyter-startup.sh"]
# $HOME/.jupyter/nbconfig is already created by an earlier step of this file,
# so the duplicate `mkdir -p` layer that used to follow here was removed.
# Install Python 2 packages and kernel spec
# `conda clean` is given --yes so it never waits for confirmation during the
# build, matching the `conda clean -y --all` call used earlier in this file.
RUN conda install --yes \
        'pip>=9.0.1' \
        'freetype' \
        'matplotlib>=1.5*' \
        'nomkl' \
        'pandas>=0.18*' \
        'pymongo' \
        'pyzmq' \
        'scikit-learn>=0.17*' \
        'scipy>=0.17*' \
        'futures' && \
    conda clean --yes --all
# Ship the default notebooks into the Jupyter working directory
COPY jupyter-default-notebooks/notebooks $HOME/jupyter

# Install Python 2 kernelspec into conda environment, then
# create a symbolic link for pip2.7 between now and upgrade to Python3
RUN "$CONDA_DIR/bin/python" -m ipykernel.kernelspec --prefix="$CONDA_DIR" && \
    ln -s "$CONDA_DIR/bin/pip" "$CONDA_DIR/bin/pip2.7"
# Set required paths for spark-tk/daal-tk packages
# LD_LIBRARY_PATH is not defined by any earlier instruction in this file, so the
# previous value `/usr/local/daal-2016.2.181:$LD_LIBRARY_PATH` expanded with a
# trailing ':'. The dynamic loader treats such an empty entry as the current
# working directory, so the variable is now set to the DAAL directory only.
ENV SPARKTK_HOME=/usr/local/sparktk-core \
    DAALTK_HOME=/usr/local/daaltk-core \
    LD_LIBRARY_PATH=/usr/local/daal-2016.2.181

# Location and file name of the spark-tk/daal-tk installer (overridable at build time)
ARG TKLIBS_INSTALLER_URL="https://github.com/trustedanalytics/daal-tk/releases/download/v0.7.4/daal-install"
ARG TKLIBS_INSTALLER="daal-install"
# Install spark-tk/daal-tk packages
# The installer is downloaded and executed as root, so TLS certificate
# verification is left enabled (the former --no-check-certificate flag was removed).
# NOTE(review): this relies on a CA bundle being present; presumably it is pulled
# in by the ca-certificates-java package installed earlier in this file — confirm.
# NOTE(review): the `sync` between chmod and execution presumably works around a
# "text file busy" failure on some storage drivers — confirm before removing.
RUN cd /usr/local && \
    wget -q "$TKLIBS_INSTALLER_URL" && \
    chmod +x "$TKLIBS_INSTALLER" && \
    sync && \
    "./$TKLIBS_INSTALLER" && \
    ln -s /usr/local/sparktk-core-* "$SPARKTK_HOME" && \
    ln -s /usr/local/daaltk-core-* "$DAALTK_HOME" && \
    rm -rf "/usr/local/$TKLIBS_INSTALLER" /usr/local/*.tar.gz
# Install spark-tk package mainly to fix the graphframes install:
# mark the bundled install.sh executable, flush to disk, then run it from
# inside the spark-tk directory.
RUN chmod +x "$SPARKTK_HOME/install.sh" && \
    sync && \
    cd "$SPARKTK_HOME" && \
    ./install.sh
# Copy misc modules for TAP into the python2.7 site-packages directory
COPY misc-modules/* $CONDA_DIR/lib/python2.7/site-packages/

# Enable the Jupyter server extension for sparktk
RUN jupyter serverextension enable sparktk_ext
# Install remaining tk packages
# --no-cache-dir keeps pip's download cache out of the image layer.
# NOTE(review): trustedanalytics is not pinned to a version, unlike the other
# two packages — consider pinning it for reproducible builds.
RUN pip install --no-cache-dir \
        trustedanalytics \
        tabulate==0.7.5 \
        snakebite==2.11.0
# Final cleanup: empty /tmp and drop the Dockerfile and README that ship with
# the pandas-cookbook example notebooks.
RUN rm -rf \
        /tmp/* \
        "$HOME/jupyter/examples/pandas-cookbook/Dockerfile" \
        "$HOME/jupyter/examples/pandas-cookbook/README.md"