# Distributed Docker Hadoop

This repository demonstrates how to spin up a distributed Hadoop system.

## Prerequisites

Ensure you have Python Anaconda (the Python 3.6 flavor) installed: https://www.anaconda.com/download/.
Further ensure you have a recent version of Docker installed.
The Docker version this example was developed against is:

    $ docker --version
    Docker version 17.05.0-ce, build 89658be

## Setup

We will use Docker Compose to spin up the various Docker containers constituting
our Hadoop system.
To this end let us create a clean Anaconda Python virtual environment and install
a current version of Docker Compose in it:

    $ conda create --name distributed_docker_hadoop python=3.6 --yes
    $ source activate distributed_docker_hadoop
    $ pip install -r requirements.txt
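
If for some reason the repository's `requirements.txt` is not at hand, installing
Docker Compose into the environment directly should work just as well (note this
grabs the latest release rather than whatever version the file pins):

    $ pip install docker-compose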

Make certain `docker-compose` points to this newly installed version in the virtual
environment:

    $ which docker-compose
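
On a typical Anaconda setup this should resolve to a path inside the environment,
along the lines of the following (illustrative only; the exact prefix depends on
where Anaconda is installed):

    /home/<user>/anaconda3/envs/distributed_docker_hadoop/bin/docker-compose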

If this does not point to the `docker-compose` binary in your virtual environment,
reload the virtual environment and check again:

    $ source deactivate
    $ source activate distributed_docker_hadoop
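
As a final sanity check, confirm that the freshly installed binary runs:

    $ docker-compose --version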

## Start cluster

To start up the cluster:

    $ docker-compose up --force-recreate
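
If you prefer to keep the terminal free, you can instead start the services in
detached mode and tail the logs of individual services as needed (the service
name `name-node` below is an assumption; use the names from this repository's
`docker-compose.yml`):

    $ docker-compose up --force-recreate -d
    $ docker-compose logs -f name-node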

Once all Docker services are up you can visit a couple of web UIs in your browser
to study the overall status of your cluster (or probe them from the command line,
as shown below):

* [The name node](http://localhost:50070)
* [The resource manager](http://localhost:8088)
* [The MapReduce job history server](http://localhost:19888)
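
The same endpoints also lend themselves to a scripted readiness check (assuming
the port mappings listed above; the command below should print `200` once the
name node is serving):

    $ curl -s -o /dev/null -w "%{http_code}\n" http://localhost:50070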

## Scaling out

Hadoop is well known for its ability to scale out, i.e. to run easily across numerous hosts.
Since we are using Docker Compose to spin up our virtual hosts in this toy example, we can
play around with scaling out by using Docker Compose's ability to scale up individual services.

### Data nodes

Bring up the Hadoop cluster as described above.
Browse the current list of data nodes by visiting the web interface of the name node:

`http://localhost:50070/dfshealth.html#tab-datanode`

You should see a single data node listed.
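
Alternatively, the same information is available from HDFS's admin report (a sketch;
it assumes the name node service is called `name-node` and has the Hadoop binaries
on its `PATH`):

    $ docker-compose exec name-node hdfs dfsadmin -report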

In a separate terminal window activate the Python virtual environment and scale
up the data node service as follows:

    $ source activate distributed_docker_hadoop
    $ docker-compose up --scale data-node=2
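
To double-check from the command line that two containers now back the `data-node`
service:

    $ docker-compose ps data-node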

Back in the name node web interface you should now see two data nodes listed.

## Notes

### Hostnames

Hostnames are not allowed to contain underscores (`_`), so make certain to spell
out longer hostnames with dashes (`-`) instead, e.g. `data-node` rather than `data_node`.
In this example we ensure this by using dashes in the names of our Docker services.