[#2226] dev(docker): Add a Hive with Kerberos mode Docker image (#3488)
### What changes were proposed in this pull request?

Add all the files required to build a Docker image for Hive with Kerberos enabled.

### Why are the changes needed?

We need to confirm that everything works correctly against a Kerberos-enabled
Hive cluster.

Fix: #2226 

### Does this PR introduce _any_ user-facing change?

N/A

### How was this patch tested?

Tested locally.
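
As a rough illustration (not part of the PR description), a local smoke test could look like the following once the image has been built with `dev/docker/build-docker.sh` (see the build example after the build script diff below); the container name, tag, and wait time are arbitrary:

```shell
# Start the Kerberos-enabled Hive container (image tag is an example).
docker run --rm -d --name kerberos-hive datastrato/gravitino-ci-kerberos-hive:0.1.0

# Give the KDC, HDFS, MySQL and HiveServer2 time to start, then run the
# health check this PR adds at /tmp/check-status.sh inside the image.
sleep 90
docker exec kerberos-hive bash /tmp/check-status.sh
```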

Co-authored-by: qqqttt123 <148952220+qqqttt123@users.noreply.github.com>
Co-authored-by: Heng Qin <qqtt@123.com>
Co-authored-by: yuqi <yuqi@datastrato.com>
4 people authored May 22, 2024
1 parent f7d1934 commit b0986bc
Showing 18 changed files with 823 additions and 1 deletion.
4 changes: 4 additions & 0 deletions .github/workflows/docker-image.yml
@@ -15,6 +15,7 @@ on:
- 'gravitino-ci-doris'
- 'trino'
- 'hive'
- 'gravitino-ci-kerberos-hive'
tag:
description: 'Docker tag to apply to this image'
required: true
@@ -37,6 +38,9 @@ jobs:
if [ "${{ github.event.inputs.image }}" == "gravitino-ci-hive" ]; then
echo "image_type=hive" >> $GITHUB_ENV
echo "image_name=datastrato/gravitino-ci-hive" >> $GITHUB_ENV
elif [ "${{ github.event.inputs.image }}" == "gravitino-ci-kerberos-hive" ]; then
echo "image_type=kerberos-hive" >> $GITHUB_ENV
echo "image_name=datastrato/gravitino-ci-kerberos-hive" >> $GITHUB_ENV
elif [ "${{ github.event.inputs.image }}" == "gravitino-ci-trino" ]; then
echo "image_type=trino" >> $GITHUB_ENV
echo "image_name=datastrato/gravitino-ci-trino" >> $GITHUB_ENV
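With the new option in place, the image-publishing workflow can presumably be dispatched for this image via the GitHub CLI; the tag value below is an example, and any additional inputs the workflow defines are omitted:

```shell
# Dispatch the docker-image workflow for the new Kerberos-enabled Hive image.
gh workflow run docker-image.yml \
  -f image=gravitino-ci-kerberos-hive \
  -f tag=0.1.0
```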
2 changes: 2 additions & 0 deletions build.gradle.kts
@@ -464,6 +464,8 @@ tasks.rat {
// Ignore files we track but do not need headers
"**/.github/**/*",
"dev/docker/**/*.xml",
"dev/docker/**/*.conf",
"dev/docker/kerberos-hive/kadm5.acl",
"**/*.log",
"**/licenses/*.txt",
"**/licenses/*.md",
5 changes: 4 additions & 1 deletion dev/docker/build-docker.sh
@@ -13,7 +13,7 @@ usage() {
cat << EOF
Usage:
./build-docker.sh --platform [all|linux/amd64|linux/arm64] --type [gravitino|hive|trino|doris] --image {image_name} --tag {tag_name} --latest
./build-docker.sh --platform [all|linux/amd64|linux/arm64] --type [gravitino|hive|trino|doris|kerberos-hive] --image {image_name} --tag {tag_name} --latest
Notice: You shouldn't use 'all' for the platform if you don't use the Github action to publish the Docker image.
EOF
@@ -74,6 +74,9 @@ fi
if [[ "${component_type}" == "hive" ]]; then
. ${script_dir}/hive/hive-dependency.sh
build_args="--build-arg HADOOP_PACKAGE_NAME=${HADOOP_PACKAGE_NAME} --build-arg HIVE_PACKAGE_NAME=${HIVE_PACKAGE_NAME} --build-arg JDBC_DIVER_PACKAGE_NAME=${JDBC_DIVER_PACKAGE_NAME}"
elif [[ "${component_type}" == "kerberos-hive" ]]; then
. ${script_dir}/kerberos-hive/hive-dependency.sh
build_args="--build-arg HADOOP_PACKAGE_NAME=${HADOOP_PACKAGE_NAME} --build-arg HIVE_PACKAGE_NAME=${HIVE_PACKAGE_NAME} --build-arg JDBC_DIVER_PACKAGE_NAME=${JDBC_DIVER_PACKAGE_NAME}"
elif [ "${component_type}" == "trino" ]; then
. ${script_dir}/trino/trino-dependency.sh
elif [ "${component_type}" == "gravitino" ]; then
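For a local build, the new type is used the same way as the existing `hive` type; a sketch of the invocation (platform, image name, and tag are illustrative):

```shell
# Build the Kerberos-enabled Hive image for a single platform.
./build-docker.sh --platform linux/amd64 --type kerberos-hive \
  --image datastrato/gravitino-ci-kerberos-hive --tag 0.1.0
```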
172 changes: 172 additions & 0 deletions dev/docker/kerberos-hive/Dockerfile
@@ -0,0 +1,172 @@
#
# Copyright 2024 Datastrato Pvt Ltd.
# This software is licensed under the Apache License version 2.
#

FROM ubuntu:16.04
LABEL maintainer="support@datastrato.com"

ARG HADOOP_PACKAGE_NAME
ARG HIVE_PACKAGE_NAME
ARG JDBC_DIVER_PACKAGE_NAME

WORKDIR /

################################################################################
# update and install basic tools
ENV DEBIAN_FRONTEND noninteractive

RUN apt-get update && apt-get upgrade -y && apt-get install --fix-missing -yq \
git \
libkrb5-dev \
libmysqlclient-dev \
libssl-dev \
libsasl2-dev \
libsasl2-modules-gssapi-mit \
libsqlite3-dev \
libtidy-0.99-0 \
libxml2-dev \
libxslt-dev \
libffi-dev \
libldap2-dev \
python-dev \
python-setuptools \
libgmp3-dev \
libz-dev \
curl \
software-properties-common \
vim \
openssh-server \
wget \
sudo \
openjdk-8-jdk \
krb5-kdc \
krb5-admin-server \
krb5-user \
krb5-config \
jsvc

#################################################################################
## setup ssh
RUN mkdir /root/.ssh
RUN cat /dev/zero | ssh-keygen -q -N "" > /dev/null && cat /root/.ssh/id_rsa.pub > /root/.ssh/authorized_keys

COPY packages /tmp/packages
COPY kdc.conf /etc/krb5kdc/kdc.conf
COPY kadm5.acl /etc/krb5kdc/kadm5.acl
COPY krb5.conf /etc/krb5.conf

################################################################################
# set environment variables
ENV JAVA_HOME=/usr/local/jdk
ENV HIVE_HOME=/usr/local/hive
ENV JSVC_HOME=/user/bin
ENV HADOOP_SECURE_DN_USER=hdfs
ENV HADOOP_HOME=/usr/local/hadoop
ENV HADOOP_HEAPSIZE=128
ENV HADOOP_INSTALL=${HADOOP_HOME}
ENV HADOOP_MAPRED_HOME=${HADOOP_INSTALL}
ENV HADOOP_COMMON_HOME=${HADOOP_INSTALL}
ENV HADOOP_HDFS_HOME=${HADOOP_INSTALL}
ENV HADOOP_CONF_DIR=${HADOOP_HOME}/etc/hadoop
ENV YARN_HOME=${HADOOP_INSTALL}
ENV KRB5CCNAME=krb5cc_cli_0

ENV PATH=${JAVA_HOME}/bin:${HADOOP_HOME}/bin:${HADOOP_INSTALL}/sbin:${HIVE_HOME}/bin:${PATH}
ENV CLASSPATH=${HADOOP_HOME}/lib/*:${HIVE_HOME}/lib/*:.
ENV LD_LIBRARY_PATH=${HADOOP_HOME}/lib/native

################################################################################
# add the above env for all users
RUN ARCH=$(uname -m) && \
if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then \
ln -s /usr/lib/jvm/java-8-openjdk-arm64 ${JAVA_HOME}; \
else \
ln -s /usr/lib/jvm/java-8-openjdk-amd64 ${JAVA_HOME}; \
fi

RUN echo "JAVA_HOME=${JAVA_HOME}" >> /etc/environment
RUN echo "HADOOP_HEAPSIZE=${HADOOP_HEAPSIZE}" >> /etc/environment
RUN echo "HADOOP_HOME=${HADOOP_HOME}" >> /etc/environment
RUN echo "HADOOP_INSTALL=${HADOOP_INSTALL}" >> /etc/environment
RUN echo "HADOOP_MAPRED_HOME=${HADOOP_MAPRED_HOME}" >> /etc/environment
RUN echo "HADOOP_COMMON_HOME=${HADOOP_COMMON_HOME}" >> /etc/environment
RUN echo "HADOOP_HDFS_HOME=${HADOOP_HDFS_HOME}" >> /etc/environment
RUN echo "HADOOP_CONF_DIR=${HADOOP_CONF_DIR}" >> /etc/environment
RUN echo "HADOOP_CLASSPATH=${JAVA_HOME}/lib/tools.jar" >> /etc/environment
RUN echo "YARN_HOME=${YARN_HOME}" >> /etc/environment
RUN echo "HIVE_HOME=${HIVE_HOME}" >> /etc/environment
RUN echo "PATH=${PATH}" >> /etc/environment
RUN echo "CLASSPATH=${CLASSPATH}" >> /etc/environment
RUN echo "LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" >> /etc/environment

################################################################################
# install hadoop
RUN mkdir ${HADOOP_HOME}
RUN tar -xz -C ${HADOOP_HOME} --strip-components 1 -f /tmp/packages/${HADOOP_PACKAGE_NAME}

# replace configuration templates
RUN rm -f ${HADOOP_CONF_DIR}/core-site.xml
RUN rm -f ${HADOOP_CONF_DIR}/hadoop-env.sh
RUN rm -f ${HADOOP_CONF_DIR}/yarn-env.sh
RUN rm -f ${HADOOP_CONF_DIR}/hdfs-site.xml
RUN rm -f ${HADOOP_CONF_DIR}/mapred-site.xml

ADD core-site.xml ${HADOOP_CONF_DIR}/core-site.xml
ADD hadoop-env.sh ${HADOOP_CONF_DIR}/hadoop-env.sh
ADD yarn-env.sh ${HADOOP_CONF_DIR}/yarn-env.sh
ADD hdfs-site.xml ${HADOOP_CONF_DIR}/hdfs-site.xml
ADD mapred-site.xml ${HADOOP_CONF_DIR}/mapred-site.xml
ADD yarn-site.xml ${HADOOP_CONF_DIR}/yarn-site.xml
ADD check-status.sh /tmp/check-status.sh


################################################################################
# install hive
RUN mkdir ${HIVE_HOME}
RUN tar -xz -C ${HIVE_HOME} --strip-components 1 -f /tmp/packages/${HIVE_PACKAGE_NAME}
ADD hive-site.xml ${HIVE_HOME}/conf/hive-site.xml

################################################################################
# install MySQL
ENV MYSQL_PWD=ds123
RUN echo "mysql-server mysql-server/root_password password ${MYSQL_PWD}" | debconf-set-selections
RUN echo "mysql-server mysql-server/root_password_again password ${MYSQL_PWD}" | debconf-set-selections
RUN apt-get install -y mysql-server

RUN chown -R mysql:mysql /var/lib/mysql
RUN usermod -d /var/lib/mysql/ mysql
RUN sed -i "s/.*bind-address.*/bind-address = 0.0.0.0/" /etc/mysql/mysql.conf.d/mysqld.cnf

################################################################################
# add mysql jdbc driver
RUN tar -xz -C ${HIVE_HOME}/lib --strip-components 1 -f /tmp/packages/${JDBC_DIVER_PACKAGE_NAME}

################################################################################
# add users and groups
RUN groupadd hdfs && groupadd hadoop && groupadd hive && groupadd mapred

RUN useradd -g hadoop datastrato && echo "datastrato:ds123" | chpasswd && adduser datastrato sudo
RUN usermod -s /bin/bash datastrato

RUN usermod -a -G hdfs datastrato
RUN usermod -a -G hadoop datastrato
RUN usermod -a -G hive datastrato
RUN usermod -a -G mapred datastrato

RUN mkdir /home/datastrato
RUN chown -R datastrato:hadoop /home/datastrato

################################################################################
# removed install packages
RUN rm -rf /tmp/packages

################################################################################
# expose port
EXPOSE 3306 9000 9083 10000 10002 50070 50075 50010 88

################################################################################
# create startup script and set ENTRYPOINT
WORKDIR /
ADD start.sh /usr/local/sbin
ENTRYPOINT ["/bin/bash", "/usr/local/sbin/start.sh"]
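
The image ships a full MIT KDC (krb5-kdc, krb5-admin-server) together with the client tools, so Kerberos can be exercised interactively inside the container. A hedged illustration — the keytab path and principal below are assumptions; only the HADOOPKRB realm and the KRB5CCNAME cache name come from this PR:

```shell
# Inside a running container: obtain a ticket and touch HDFS.
# Keytab location and principal are hypothetical examples.
kinit -kt /etc/security/keytabs/hdfs.keytab hdfs/$(hostname)@HADOOPKRB
klist               # the ticket lands in the cache named by KRB5CCNAME (krb5cc_cli_0)
hdfs dfs -ls /      # fails with a GSS/Kerberos error if no valid ticket exists
```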
24 changes: 24 additions & 0 deletions dev/docker/kerberos-hive/check-status.sh
@@ -0,0 +1,24 @@
#!/bin/bash
#
# Copyright 2024 Datastrato Pvt Ltd.
# This software is licensed under the Apache License version 2.
#
set -ex

hdfs_ready=$(hdfs dfsadmin -report | grep "Live datanodes" | awk '{print $3}')
if [[ ${hdfs_ready} == "(1):" ]]; then
echo "HDFS is ready"
else
echo "HDFS is not ready"
exit 1
fi

hive_ready=$(hive -e "select 1;" 2>&1)
if [[ ${hive_ready} == *"FAILED"* ]]; then
echo "Hive is not ready"
exit 1
else
echo "Hive is ready"
fi

exit 0
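
The HDFS check relies on the summary line that `hdfs dfsadmin -report` prints; with a single healthy datanode that line reads `Live datanodes (1):`, so the third whitespace-separated field is `(1):`. The parsing can be seen in isolation:

```shell
# Field 3 of the report's summary line is what the script compares against.
echo "Live datanodes (1):" | awk '{print $3}'   # prints: (1):
```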
48 changes: 48 additions & 0 deletions dev/docker/kerberos-hive/core-site.xml
@@ -0,0 +1,48 @@
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://0.0.0.0:9000</value>
</property>

<property>
<name>name</name>
<value>Development Cluster</value>
</property>

<property>
<name>hadoop.http.staticuser.user</name>
<value>hadoopuser</value>
</property>

<property>
<name>hadoop.proxyuser.hive.hosts</name>
<value>*</value>
</property>

<property>
<name>hadoop.proxyuser.hive.groups</name>
<value>*</value>
</property>

<property>
<name>hadoop.proxyuser.root.groups</name>
<value>*</value>
</property>

<property>
<name>hadoop.proxyuser.root.hosts</name>
<value>*</value>
</property>

<property>
<name>hadoop.security.auth_to_local</name>
<value>
RULE:[2:$1@$0](.*@HADOOPKRB)s/.*/hadoop/
DEFAULT
</value>
</property>
<property>
<name>hadoop.security.authentication</name>
<value>kerberos</value>
</property>
</configuration>
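
The `hadoop.security.auth_to_local` rule maps any two-component principal in the HADOOPKRB realm (for example `hive/host@HADOOPKRB`) to the local user `hadoop`. Hadoop includes a small utility class for evaluating such rules; a hedged example of checking the mapping inside the container:

```shell
# Evaluate the auth_to_local rules from core-site.xml against a sample principal.
hadoop org.apache.hadoop.security.HadoopKerberosName hive/example.host@HADOOPKRB
# should print something like: Name: hive/example.host@HADOOPKRB to hadoop
```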
95 changes: 95 additions & 0 deletions dev/docker/kerberos-hive/hadoop-env.sh
@@ -0,0 +1,95 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Set Hadoop-specific environment variables here.

# The only required environment variable is JAVA_HOME. All others are
# optional. When running a distributed configuration it is best to
# set JAVA_HOME in this file, so that it is correctly defined on
# remote nodes.

# The java implementation to use.
export JAVA_HOME=${JAVA_HOME}

# The jsvc implementation to use. Jsvc is required to run secure datanodes
# that bind to privileged ports to provide authentication of data transfer
# protocol. Jsvc is not required if SASL is configured for authentication of
# data transfer protocol using non-privileged ports.
#export JSVC_HOME=${JSVC_HOME}

export HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop

# Extra Java CLASSPATH elements. Automatically insert capacity-scheduler.
for f in ${HADOOP_HOME}/contrib/capacity-scheduler/*.jar; do
if [ "${HADOOP_CLASSPATH}" ]; then
export HADOOP_CLASSPATH=${HADOOP_CLASSPATH}:$f
else
export HADOOP_CLASSPATH=$f
fi
done

# The maximum amount of heap to use, in MB. Default is 1000.
export HADOOP_HEAPSIZE=128

# Extra Java runtime options. Empty by default.
export HADOOP_OPTS="${HADOOP_OPTS} -Djava.net.preferIPv4Stack=true -XX:MaxPermSize=128m"

# Command specific options appended to HADOOP_OPTS when specified
export HADOOP_NAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} ${HADOOP_NAMENODE_OPTS}"
export HADOOP_DATANODE_OPTS="-Dhadoop.security.logger=ERROR,RFAS ${HADOOP_DATANODE_OPTS}"

export HADOOP_SECONDARYNAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} ${HADOOP_SECONDARYNAMENODE_OPTS}"

export HADOOP_NFS3_OPTS="${HADOOP_NFS3_OPTS}"
export HADOOP_PORTMAP_OPTS="${HADOOP_PORTMAP_OPTS}"

# The following applies to multiple commands (fs, dfs, fsck, distcp etc)
export HADOOP_CLIENT_OPTS="${HADOOP_CLIENT_OPTS}"

# On secure datanodes, user to run the datanode as after dropping privileges.
# This **MUST** be uncommented to enable secure HDFS if using privileged ports
# to provide authentication of data transfer protocol. This **MUST NOT** be
# defined if SASL is configured for authentication of data transfer protocol
# using non-privileged ports.
export HADOOP_SECURE_DN_USER=${HADOOP_SECURE_DN_USER}

# Where log files are stored. ${HADOOP_HOME}/logs by default.

# Where log files are stored in the secure data environment.
export HADOOP_SECURE_DN_LOG_DIR=${HADOOP_LOG_DIR}/${HADOOP_HDFS_USER}

###
# HDFS Mover specific parameters
###
# Specify the JVM options to be used when starting the HDFS Mover.
# These options will be appended to the options specified as HADOOP_OPTS
# and therefore may override any similar flags set in HADOOP_OPTS
#
# export HADOOP_MOVER_OPTS=""

###
# Advanced Users Only!
###

# The directory where pid files are stored. /tmp by default.
# NOTE: this should be set to a directory that can only be written to by
# the user that will run the hadoop daemons. Otherwise there is the
# potential for a symlink attack.
export HADOOP_PID_DIR=${HADOOP_PID_DIR}
export HADOOP_SECURE_DN_PID_DIR=${HADOOP_PID_DIR}

# A string representing this instance of hadoop. ${USER} by default.
export HADOOP_IDENT_STRING=${USER}
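
Because the Dockerfile exports `HADOOP_SECURE_DN_USER=hdfs`, the classic Hadoop 2.x secure-DataNode pattern applies: the daemon is started as root, jsvc binds the DataNode ports, and the process then drops to that user. A sketch of such a start sequence, assuming privileged ports are configured in `hdfs-site.xml` (not shown here) — not necessarily verbatim what this image's `start.sh` does:

```shell
# As root: jsvc (from JSVC_HOME) binds the DataNode ports and then drops
# privileges to HADOOP_SECURE_DN_USER (hdfs in this image).
${HADOOP_HOME}/sbin/hadoop-daemon.sh --config ${HADOOP_CONF_DIR} start datanode
```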
[Diffs for the remaining changed files are not shown.]
