This repository has been archived by the owner on Nov 16, 2019. It is now read-only.

Adding Docker support for CaffeOnSpark #208

Merged: 17 commits, Jan 21, 2017
19 changes: 19 additions & 0 deletions docker/README.md
@@ -0,0 +1,19 @@
# CaffeOnSpark Standalone Docker

A Dockerfile for the CPU build is provided in the `standalone/cpu` folder. The image can be built by running:
```
docker build -t caffeonspark:cpu standalone/cpu
```
After the image is built, use `docker images` to verify that it is available.
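To check just this image, a quick look (output details will vary):
```
docker images caffeonspark
```
The listing should show the `caffeonspark` repository with a `cpu` tag.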

## Launching CaffeOnSpark container
Hadoop and Spark are essential requirements for CaffeOnSpark. To ensure that both processes run correctly, we have included the `standalone/cpu/config/bootstrap.sh` script, which must be run every time the container is started.

To launch a container running CaffeOnSpark, use:
```
docker run -it caffeonspark:cpu /etc/bootstrap.sh -bash
```

Now you have a working environment with CaffeOnSpark.
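If you would rather keep the container running in the background, a variant along these lines should also work (`-d` makes `bootstrap.sh` keep the services alive instead of opening a shell; 50070 and 8088 are the HDFS and YARN web UI ports the image exposes):
```
docker run -d -p 50070:50070 -p 8088:8088 caffeonspark:cpu /etc/bootstrap.sh -d
```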

To verify the installation, follow the [GetStarted_yarn](https://github.com/yahoo/CaffeOnSpark/wiki/GetStarted_yarn) guide from `Step 7` onward.
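As a quick sanity check before the full guide, both of the following should succeed inside the container once `bootstrap.sh` has started the services:
```
hadoop fs -ls /
spark-submit --version
```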
155 changes: 155 additions & 0 deletions docker/standalone/cpu/Dockerfile
@@ -0,0 +1,155 @@
FROM ubuntu:14.04
MAINTAINER arun.das@my.utsa.edu

RUN apt-get update && apt-get install -y software-properties-common
RUN add-apt-repository ppa:openjdk-r/ppa
Contributor Author:

This line is repeated at line 37 as well. Sorry about that; I didn't notice it while uploading. It causes no errors, but it is redundant.

RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
vim \
cmake \
curl \
git \
wget \
libatlas-base-dev \
libboost-all-dev \
libgflags-dev \
libgoogle-glog-dev \
libhdf5-serial-dev \
libleveldb-dev \
liblmdb-dev \
libopencv-dev \
libprotobuf-dev \
libsnappy-dev \
protobuf-compiler \
python-dev \
python-numpy \
python-pip \
python-scipy \
maven \
zip \
unzip \
libopenblas-dev \
openssh-server \
openssh-client
RUN apt-get install -y software-properties-common
RUN add-apt-repository ppa:openjdk-r/ppa
RUN apt-get update && apt-get install -y openjdk-8-jdk
RUN rm -rf /var/lib/apt/lists/*


# Passwordless SSH
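# The openssh-server package may have generated host keys already; the -y
# calls below print the existing public keys rather than prompting to
# overwrite them. The fresh id_rsa keypair is authorized for root so that
# start-dfs.sh and start-yarn.sh can ssh to localhost without a password.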
RUN ssh-keygen -y -q -N "" -t dsa -f /etc/ssh/ssh_host_dsa_key
RUN ssh-keygen -y -q -N "" -t rsa -f /etc/ssh/ssh_host_rsa_key
RUN ssh-keygen -q -N "" -t rsa -f /root/.ssh/id_rsa
RUN cp /root/.ssh/id_rsa.pub ~/.ssh/authorized_keys


# Apache Hadoop and Spark section
RUN wget http://archive.apache.org/dist/hadoop/common/hadoop-2.6.4/hadoop-2.6.4.tar.gz
RUN wget http://archive.apache.org/dist/spark/spark-1.6.0/spark-1.6.0-bin-hadoop2.6.tgz

RUN tar -xzf hadoop-2.6.4.tar.gz
RUN tar -xzf spark-1.6.0-bin-hadoop2.6.tgz

RUN cp -r hadoop-2.6.4 /usr/local/hadoop
RUN cp -r spark-1.6.0-bin-hadoop2.6 /usr/local/spark

RUN rm hadoop-2.6.4.tar.gz spark-1.6.0-bin-hadoop2.6.tgz
RUN rm -rf hadoop-2.6.4/ spark-1.6.0-bin-hadoop2.6/

RUN mkdir -p /usr/local/hadoop/hadoop_data/hdfs/namenode
RUN mkdir -p /usr/local/hadoop/hadoop_data/hdfs/datanode

# Environment variables
ENV JAVA_HOME /usr/lib/jvm/java-1.8.0-openjdk-amd64
ENV HADOOP_HOME=/usr/local/hadoop
ENV SPARK_HOME=/usr/local/spark
ENV PATH $PATH:$JAVA_HOME/bin
ENV PATH $PATH:$HADOOP_HOME/bin
ENV PATH $PATH:$HADOOP_HOME/sbin
ENV PATH $PATH:$SPARK_HOME/bin
ENV PATH $PATH:$SPARK_HOME/sbin
ENV HADOOP_MAPRED_HOME /usr/local/hadoop
ENV HADOOP_COMMON_HOME /usr/local/hadoop
ENV HADOOP_HDFS_HOME /usr/local/hadoop
ENV HADOOP_CONF_DIR /usr/local/hadoop/etc/hadoop
ENV YARN_HOME /usr/local/hadoop
ENV HADOOP_COMMON_LIB_NATIVE_DIR /usr/local/hadoop/lib/native
ENV HADOOP_OPTS "-Djava.library.path=$HADOOP_HOME/lib"

# Clone CaffeOnSpark
ENV CAFFE_ON_SPARK=/opt/CaffeOnSpark
WORKDIR $CAFFE_ON_SPARK
RUN git clone https://github.com/yahoo/CaffeOnSpark.git . --recursive

# Parts of the Hadoop setup are adapted from "https://hub.docker.com/r/sequenceiq/hadoop-docker/~/dockerfile/"
RUN mkdir $HADOOP_HOME/input
RUN cp $HADOOP_HOME/etc/hadoop/*.xml $HADOOP_HOME/input

# Copy the Hadoop configuration .xml files shipped with CaffeOnSpark.
RUN cp ${CAFFE_ON_SPARK}/scripts/*.xml ${HADOOP_HOME}/etc/hadoop

# Format the namenode and finish the Hadoop and Spark setup.
RUN $HADOOP_HOME/bin/hdfs namenode -format
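# Note: formatting at build time bakes the initial HDFS metadata into the
# image; data written at runtime stays in the container layer unless a
# volume is mounted over the hadoop_data directory.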

RUN ls /root/.ssh/
ADD config/ssh_config /root/.ssh/config
RUN chmod 600 /root/.ssh/config
RUN chown root:root /root/.ssh/config

ADD config/bootstrap.sh /etc/bootstrap.sh
RUN chown root:root /etc/bootstrap.sh
RUN chmod 700 /etc/bootstrap.sh

ENV BOOTSTRAP /etc/bootstrap.sh

RUN sed -i '/^export JAVA_HOME/ s:.*:export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-amd64\nexport HADOOP_HOME=/usr/local/hadoop\n:' $HADOOP_HOME/etc/hadoop/hadoop-env.sh
RUN sed -i '/^export HADOOP_CONF_DIR/ s:.*:export HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop/:' $HADOOP_HOME/etc/hadoop/hadoop-env.sh

# Work around a docker.io build error
RUN ls -la /usr/local/hadoop/etc/hadoop/*-env.sh
RUN chmod +x /usr/local/hadoop/etc/hadoop/*-env.sh
RUN ls -la /usr/local/hadoop/etc/hadoop/*-env.sh

# fix the 254 error code
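# (a minimal container usually lacks a working PAM stack, which makes sshd
# logins fail; port 2122 matches the Port setting in config/ssh_config)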
RUN sed -i "/^[^#]*UsePAM/ s/.*/#&/" /etc/ssh/sshd_config
RUN echo "UsePAM no" >> /etc/ssh/sshd_config
RUN echo "Port 2122" >> /etc/ssh/sshd_config

RUN service ssh start && $HADOOP_HOME/etc/hadoop/hadoop-env.sh && $HADOOP_HOME/sbin/start-dfs.sh && $HADOOP_HOME/bin/hdfs dfs -mkdir -p /user/root
RUN service ssh start && $HADOOP_HOME/etc/hadoop/hadoop-env.sh && $HADOOP_HOME/sbin/start-dfs.sh && $HADOOP_HOME/bin/hdfs dfs -put $HADOOP_HOME/etc/hadoop/ input
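# Note: the daemons started in the two RUN steps above live only for the
# duration of their build layer (just long enough to stage files into HDFS);
# bootstrap.sh starts them again at container runtime.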

CMD ["/etc/bootstrap.sh", "-bash"]

# HDFS ports
EXPOSE 50010 50020 50070 50075 50090 8020 9000
# MapReduce ports
EXPOSE 10020 19888
# YARN ports
EXPOSE 8030 8031 8032 8033 8040 8042 8088
# Other ports
EXPOSE 49707 2122


# Continue with CaffeOnSpark build.
# ENV CAFFE_ON_SPARK=/opt/CaffeOnSpark
WORKDIR $CAFFE_ON_SPARK
# RUN git clone https://github.com/yahoo/CaffeOnSpark.git . --recursive
RUN cp caffe-public/Makefile.config.example caffe-public/Makefile.config
RUN echo "INCLUDE_DIRS += ${JAVA_HOME}/include" >> caffe-public/Makefile.config
RUN sed -i "s/# CPU_ONLY := 1/CPU_ONLY := 1/g" caffe-public/Makefile.config
RUN sed -i "s|CUDA_DIR := /usr/local/cuda|# CUDA_DIR := /usr/local/cuda|g" caffe-public/Makefile.config
RUN sed -i "s|CUDA_ARCH :=|# CUDA_ARCH :=|g" caffe-public/Makefile.config
RUN sed -i "s|BLAS := atlas|BLAS := open|g" caffe-public/Makefile.config
RUN sed -i "s|TEST_GPUID := 0|# TEST_GPUID := 0|g" caffe-public/Makefile.config

RUN make build

ENV LD_LIBRARY_PATH $LD_LIBRARY_PATH:$CAFFE_ON_SPARK/caffe-public/distribute/lib:$CAFFE_ON_SPARK/caffe-distri/distribute/lib

WORKDIR /root
27 changes: 27 additions & 0 deletions docker/standalone/cpu/config/bootstrap.sh
@@ -0,0 +1,27 @@
#!/bin/bash

: ${HADOOP_PREFIX:=/usr/local/hadoop}

$HADOOP_PREFIX/etc/hadoop/hadoop-env.sh

rm -f /tmp/*.pid

# Install extra libraries, if any (resource URLs are added comma-separated to the ACP environment variable)
cd $HADOOP_PREFIX/share/hadoop/common ; for cp in ${ACP//,/ }; do echo == $cp; curl -LO $cp ; done; cd -
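# Example (hypothetical URL):
#   docker run -e ACP=http://example.com/extra-lib.jar caffeonspark:cpu /etc/bootstrap.sh -bash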

# Add necessary paths to environment variables (FIXME: these are already set in the Dockerfile, but they do not carry over here, so they are set explicitly)
export PATH=$PATH:$SPARK_HOME/bin
export PATH=$PATH:$HADOOP_HOME/bin
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CAFFE_ON_SPARK/caffe-public/distribute/lib:$CAFFE_ON_SPARK/caffe-distri/distribute/lib

service ssh start
$HADOOP_PREFIX/sbin/start-dfs.sh
$HADOOP_PREFIX/sbin/start-yarn.sh

if [[ $1 == "-d" ]]; then
while true; do sleep 1000; done
fi

if [[ $1 == "-bash" ]]; then
/bin/bash
fi
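
# Usage reference:
#   /etc/bootstrap.sh -bash   start HDFS and YARN, then open an interactive shell
#   /etc/bootstrap.sh -d      start HDFS and YARN and keep the container alive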
5 changes: 5 additions & 0 deletions docker/standalone/cpu/config/ssh_config
@@ -0,0 +1,5 @@
Host *
UserKnownHostsFile /dev/null
StrictHostKeyChecking no
LogLevel quiet
Port 2122