This repository was archived by the owner on Jan 9, 2020. It is now read-only.

Implements a server for refreshing HDFS tokens, a part of secure HDFS support. #453

Open · wants to merge 43 commits into base: branch-2.2-kubernetes
Commits (43)
7abb5ee
Add skeleton
kimoonkim Aug 19, 2017
88c0c03
Renew part
kimoonkim Aug 21, 2017
ca0b583
Compile succeeds
kimoonkim Aug 21, 2017
025e2ba
Login to kerberos
kimoonkim Aug 21, 2017
0a7a15d
Clean up constants
kimoonkim Aug 22, 2017
b3534f1
Refresh server works
kimoonkim Aug 22, 2017
cbe2777
Deployment config file
kimoonkim Aug 22, 2017
4f36793
Fix Dockerfile to match names
kimoonkim Aug 22, 2017
874b8e9
Add as independent project with own pom.xml
kimoonkim Sep 11, 2017
5dc49ca
Add working Dockerfile and deployment yaml file
kimoonkim Sep 12, 2017
388063a
Fix a bug by including hadoop conf dir in the classpath
kimoonkim Sep 13, 2017
1426523
Add token-refresh-server as extra build-only module
kimoonkim Sep 20, 2017
50c3a66
Use akka scheduler for renew tasks
kimoonkim Oct 3, 2017
c2ccaa9
Relogin to Kerberos periodically
kimoonkim Oct 3, 2017
a2aec2b
Renew at 90% mark of deadline
kimoonkim Oct 3, 2017
ec70b47
Get renew time from data item key
kimoonkim Oct 3, 2017
0b049fd
Fix compile error
kimoonkim Oct 4, 2017
d42c568
Obtain new tokens
kimoonkim Oct 4, 2017
5d96879
Fix bugs
kimoonkim Oct 4, 2017
c0e28d4
Write back tokens to K8s secret
kimoonkim Oct 4, 2017
57c847e
Handle recently added secrets
kimoonkim Oct 5, 2017
ce1bb7f
Use k8s client editable to update secret data
kimoonkim Oct 5, 2017
5162339
Add a comment
kimoonkim Oct 5, 2017
196cd8a
Keep only secret metadata in memory
kimoonkim Oct 5, 2017
56ef8e6
Fix a regex match bug
kimoonkim Oct 5, 2017
93b2acf
Tested
kimoonkim Oct 5, 2017
ba2e79a
Updated parent version
kimoonkim Oct 5, 2017
1d74579
Address review comments
kimoonkim Oct 6, 2017
95e68d3
Add TODO for token status rest endpoint
kimoonkim Oct 6, 2017
a006233
Address review comments
kimoonkim Oct 6, 2017
9dc8345
Address review comments
kimoonkim Oct 10, 2017
f4d5ee9
Support configuration
kimoonkim Oct 10, 2017
1462d2c
Fix a typo
kimoonkim Oct 11, 2017
193d0f9
Add some unit tests
kimoonkim Oct 12, 2017
eb10c4a
Add more tests
kimoonkim Oct 12, 2017
078ac2a
Clean up
kimoonkim Oct 12, 2017
aeb269a
Add more tests
kimoonkim Oct 12, 2017
87dedbc
Minor clean-up
kimoonkim Oct 13, 2017
4d1cb74
Add unit tests for renew tasks
kimoonkim Oct 13, 2017
8997f04
Verify test results more
kimoonkim Oct 13, 2017
2ed55af
Rename the new profile to kubernetes-hdfs-extra
kimoonkim Oct 18, 2017
01be03e
Fix style issues
kimoonkim Oct 18, 2017
0baaf0b
Fix Hadoop 2.7 dependency issue
kimoonkim Oct 19, 2017
3 changes: 3 additions & 0 deletions assembly/pom.xml
@@ -150,6 +150,9 @@
</profile>
<profile>
<id>kubernetes</id>
<!-- Not including extra Kubernetes artifacts such as token-refresh-server
because some of them are auxiliary services that should not be part
of the Spark distribution -->
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
11 changes: 11 additions & 0 deletions pom.xml
@@ -2619,6 +2619,17 @@
</modules>
</profile>

<profile>
<!-- Extra Kubernetes modules for HDFS support. They should be built
regularly, but may not need to be part of the Spark distribution. -->
<id>kubernetes-hdfs-extra</id>
<modules>
<!-- Auxiliary service for refreshing Hadoop tokens. This should not be
part of the Spark distribution -->
<module>resource-managers/kubernetes/token-refresh-server</module>
</modules>
</profile>

<profile>
<id>kubernetes-integration-tests</id>
<modules>
22 changes: 19 additions & 3 deletions resource-managers/kubernetes/README.md
@@ -28,18 +28,34 @@

dev/make-distribution.sh --tgz -Phadoop-2.7 -Pkubernetes

# Kubernetes Code Modules
# Kubernetes Modules

Below is a list of the submodules for this cluster manager and what they do.

* `core`: Implementation of the Kubernetes cluster manager support.
* `token-refresh-server`: Extra Kubernetes service that refreshes Hadoop
tokens for long-running Spark jobs that access secure data sources like
Kerberized HDFS.
* `integration-tests`: Integration tests for the project.
* `docker-minimal-bundle`: Base Dockerfiles for the driver and the executors. The Dockerfiles are used for integration
tests as well as being provided in packaged distributions of Spark.
* `integration-tests-spark-jobs`: Spark jobs that are only used in integration tests.
* `integration-tests-spark-jobs-helpers`: Dependencies for the spark jobs used in integration tests. These dependencies
are separated out to facilitate testing the shipping of jars to drivers running on Kubernetes clusters.

# Building Extra Submodules for Spark with Kubernetes

There are non-core extra submodules such as `token-refresh-server`. To build
those, enable the matching `kubernetes-*-extra` profile, such as
`kubernetes-hdfs-extra`, when invoking Maven. For example, to build the
token-refresh-server submodule:

build/mvn package -Pkubernetes-hdfs-extra \
-pl resource-managers/kubernetes/token-refresh-server -am

Some of these submodules are helper Kubernetes services. They need not be part
of the Spark distribution. The distribution build script will not include
artifacts from these submodules.

# Running the Kubernetes Integration Tests

Note that the integration test framework is currently being heavily revised and is subject to change.
@@ -64,7 +80,7 @@
build/mvn integration-test \
-pl resource-managers/kubernetes/integration-tests -am
```

# Running against an arbitrary cluster
## Running against an arbitrary cluster

In order to run against any cluster, use the following:
```sh
@@ -74,7 +90,7 @@
build/mvn integration-test \
-DextraScalaTestArgs="-Dspark.kubernetes.test.master=k8s://https://<master> -Dspark.docker.test.driverImage=<driver-image> -Dspark.docker.test.executorImage=<executor-image>"
```

# Preserve the Minikube VM
## Preserve the Minikube VM

The integration tests make use of [Minikube](https://github.com/kubernetes/minikube), which fires up a virtual machine
and sets up a single-node Kubernetes cluster within it. By default, the VM is destroyed after the tests are finished.
68 changes: 68 additions & 0 deletions resource-managers/kubernetes/token-refresh-server/README.md
@@ -0,0 +1,68 @@
---
layout: global
title: Hadoop Token Refresh Server on Kubernetes
---

Spark on Kubernetes may use Kerberized Hadoop data sources such as secure HDFS or Kafka. If a job
runs for days or weeks, the Hadoop delegation tokens it relies on, which expire every 24 hours,
must be extended or replaced. The Hadoop Token Refresh Server is a Kubernetes microservice that
renews the lifetime of existing tokens and puts replacement tokens in place.

# Building the Refresh Server

To build the refresh server jar, simply run Maven. For example:

mvn clean package

The target directory will contain a tarball that includes the project jar file as well as
third-party dependency jars. The tarball name ends with `-assembly.tar.gz`. For example:

target/token-refresh-server-kubernetes_2.11-2.2.0-k8s-0.3.0-SNAPSHOT-assembly.tar.gz

# Running the Refresh Server

To run the server, follow the steps below.

1. Build and push the docker image:

docker build -t hadoop-token-refresh-server:latest \
-f src/main/docker/Dockerfile .
docker tag hadoop-token-refresh-server:latest <YOUR-REPO>:<YOUR-TAG>
docker push <YOUR-REPO>:<YOUR-TAG>

2. Edit the main application config file, `src/main/conf/application.conf`,
and create a `configmap` from it:

kubectl create configmap hadoop-token-refresh-server-application-conf \
--from-file=src/main/conf/application.conf

3. Create another k8s `configmap` containing Hadoop config files. These should enable Kerberos
and secure Hadoop, and include the addresses of the Hadoop servers that issue delegation tokens,
such as the HDFS namenode:

kubectl create configmap hadoop-token-refresh-server-hadoop-config \
--from-file=/usr/local/hadoop/conf/core-site.xml
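
For reference, below is a minimal sketch of a `core-site.xml` for such a
configmap. The namenode address is an illustrative placeholder; use your
cluster's values:

    <configuration>
      <property>
        <name>fs.defaultFS</name>
        <value>hdfs://namenode.example.com:8020</value>
      </property>
      <property>
        <name>hadoop.security.authentication</name>
        <value>kerberos</value>
      </property>
    </configuration>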

4. Create yet another k8s `configmap` containing Kerberos config files. These should include
the Kerberos server (KDC) address and the correct realm name for Kerberos principals:

kubectl create configmap hadoop-token-refresh-server-kerberos-config \
--from-file=/etc/krb5.conf
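
A typical `krb5.conf` names the default realm and the KDC address. The realm
and hostnames below are placeholders:

    [libdefaults]
      default_realm = EXAMPLE.COM

    [realms]
      EXAMPLE.COM = {
        kdc = kdc.example.com
        admin_server = kdc.example.com
      }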

5. Create a k8s `secret` containing the Kerberos keytab file. The keytab file should include
the key for the system-user Kerberos principal that the refresh server uses to
extend Hadoop delegation tokens. See
`hadoop-token-refresh-server.kerberosPrincipal` in the application.conf.

kubectl create secret generic hadoop-token-refresh-server-kerberos-keytab \
--from-file=/mnt/secrets/krb5.keytab

6. Optionally, create a k8s `service account` and `clusterrolebinding` for
the service pod to use. The service account should have `edit` capability for
the job `secret`s that contain the Hadoop delegation tokens, as sketched below.
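
A minimal sketch, assuming the pod runs in the `default` namespace and a
hypothetical account name `token-refresh-server`:

    kubectl create serviceaccount token-refresh-server
    kubectl create clusterrolebinding token-refresh-server-edit \
      --clusterrole=edit \
      --serviceaccount=default:token-refresh-server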

7. Finally, edit the config file for the k8s `deployment` and launch the service pod
using the deployment. The config file should specify the right docker image tag
and the correct k8s `service account` name.

kubectl create -f src/main/conf/kubernetes-hadoop-token-refresh-server.yaml
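
Once the deployment is created, you can verify that the service pod is up and
inspect its logs. The deployment name below is an assumption; use the name
from your yaml file:

    kubectl get pods
    kubectl logs deployment/hadoop-token-refresh-server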
100 changes: 100 additions & 0 deletions resource-managers/kubernetes/token-refresh-server/pom.xml
@@ -0,0 +1,100 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
~ Licensed to the Apache Software Foundation (ASF) under one or more
~ contributor license agreements. See the NOTICE file distributed with
~ this work for additional information regarding copyright ownership.
~ The ASF licenses this file to You under the Apache License, Version 2.0
~ (the "License"); you may not use this file except in compliance with
~ the License. You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing, software
~ distributed under the License is distributed on an "AS IS" BASIS,
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~ See the License for the specific language governing permissions and
~ limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.apache.spark</groupId>
<artifactId>spark-parent_2.11</artifactId>
<version>2.2.0-k8s-0.5.0-SNAPSHOT</version>
<relativePath>../../../pom.xml</relativePath>
</parent>

<artifactId>token-refresh-server-kubernetes_2.11</artifactId>
<packaging>jar</packaging>
<name>Hadoop Token Refresh Server on Kubernetes</name>
<properties>
<akka.actor.version>2.5.4</akka.actor.version>
<commons-logging.version>1.2</commons-logging.version>
<kubernetes.client.version>2.2.13</kubernetes.client.version>
</properties>
<dependencies>
<dependency>
<groupId>com.typesafe.akka</groupId>
<artifactId>akka-actor_${scala.binary.version}</artifactId>
<version>${akka.actor.version}</version>
</dependency>
<dependency>
<groupId>io.fabric8</groupId>
<artifactId>kubernetes-client</artifactId>
<version>${kubernetes.client.version}</version>
</dependency>
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
</dependency>
<dependency>
<!-- Add codehaus jackson back, which the root pom.xml excluded. It is
needed by Hadoop 2.7's Token.decodeIdentifier method. -->
<groupId>org.codehaus.jackson</groupId>
<artifactId>jackson-mapper-asl</artifactId>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
<version>${commons-logging.version}</version>
</dependency>
<dependency>
<groupId>com.typesafe.akka</groupId>
<artifactId>akka-testkit_${scala.binary.version}</artifactId>
<version>${akka.actor.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-core</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<descriptors>
<descriptor>src/main/assembly/assembly.xml</descriptor>
</descriptors>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
33 changes: 33 additions & 0 deletions resource-managers/kubernetes/token-refresh-server/src/main/assembly/assembly.xml
@@ -0,0 +1,33 @@
<!--
~ Licensed to the Apache Software Foundation (ASF) under one or more
~ contributor license agreements. See the NOTICE file distributed with
~ this work for additional information regarding copyright ownership.
~ The ASF licenses this file to You under the Apache License, Version 2.0
~ (the "License"); you may not use this file except in compliance with
~ the License. You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing, software
~ distributed under the License is distributed on an "AS IS" BASIS,
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~ See the License for the specific language governing permissions and
~ limitations under the License.
-->
<assembly>
<id>assembly</id>
<formats>
<format>tar.gz</format>
</formats>
<includeBaseDirectory>false</includeBaseDirectory>
<dependencySets>
<dependencySet>
<unpack>false</unpack>
<scope>compile</scope>
</dependencySet>
<dependencySet>
<unpack>false</unpack>
<scope>provided</scope>
</dependencySet>
</dependencySets>
</assembly>
39 changes: 39 additions & 0 deletions resource-managers/kubernetes/token-refresh-server/src/main/conf/application.conf
@@ -0,0 +1,39 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Main application config file for the Hadoop token refresh server. Override the values below
# as needed.
hadoop-token-refresh-server {

# Kerberos principal that the refresh server should use as its login user. This principal should
# match the keytab file used for the refresh server.
# For a token to be renewed for the next 24 hours by the refresh server, the token should
# designate this refresh server principal as the renewer. To allow a brand new token to be
# obtained by the refresh server, the HDFS namenode configuration should specify this refresh
# server principal as the special proxy for the job users. See
# https://hadoop.apache.org/docs/r2.7.2/hadoop-project-dist/hadoop-common/Superusers.html#Configurations
# for details.
kerberosPrincipal = "MY-REFRESH-SERVER-KERBEROS-PRINCIPAL"
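
  # For reference, the namenode-side proxy-user entries (set in the namenode's
  # core-site.xml, not in this file) might look like the following, assuming a
  # hypothetical principal short name "refresh-server":
  #
  #   <property>
  #     <name>hadoop.proxyuser.refresh-server.hosts</name>
  #     <value>*</value>
  #   </property>
  #   <property>
  #     <name>hadoop.proxyuser.refresh-server.groups</name>
  #     <value>*</value>
  #   </property>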

# Set this to true if the refresh server should scan secrets across all namespaces. Set it to
# false and specify namespaceToScan if the refresh server should scan secrets only from
# a specific namespace.
scanAllNamespaces = true

# Effective only if scanAllNamespaces is false. A specific namespace that the refresh server
# should scan secrets from.
namespaceToScan = "default"
}
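
# A sketch of overriding these defaults: to restrict scanning to a single
# namespace, set the two scan settings together. The principal and namespace
# below are illustrative placeholders:
#
#   hadoop-token-refresh-server {
#     kerberosPrincipal = "refresh-server/admin@EXAMPLE.COM"
#     scanAllNamespaces = false
#     namespaceToScan = "spark-jobs"
#   }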