Commit 0bdaf42

Optimize the HMS Docker image and enable S3 support

1 parent be14ad0 · commit 0bdaf42

File tree: 6 files changed, +89 −21 lines

metastore/src/java/org/apache/hadoop/hive/metastore/HiveProtoEventsCleanerTask.java

Lines changed: 5 additions & 4 deletions

@@ -26,12 +26,12 @@
 import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
 import org.apache.hadoop.security.UserGroupInformation;
-import org.apache.hadoop.yarn.util.SystemClock;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import java.io.IOException;
 import java.security.PrivilegedExceptionAction;
+import java.time.Instant;
 import java.time.LocalDate;
 import java.time.LocalDateTime;
 import java.time.ZoneOffset;
@@ -48,7 +48,6 @@ public class HiveProtoEventsCleanerTask implements MetastoreTaskThread {
   private Configuration conf;
   private long ttl;
   private static String expiredDatePtn = null;
-  private static final SystemClock clock = SystemClock.getInstance();

   @Override
   public void setConf(Configuration conf) {
@@ -95,9 +94,11 @@ public void run() {
    * Compute the expired date partition, using the underlying clock in UTC time.
    */
   private static void computeExpiredDatePtn(long ttl) {
+    LocalDate expiredDate = LocalDate.ofInstant(
+        Instant.now().minusMillis(ttl),
+        ZoneOffset.UTC
+    );
     // Use UTC date to ensure reader date is same on all timezones.
-    LocalDate expiredDate
-        = LocalDateTime.ofEpochSecond((clock.getTime() - ttl) / 1000, 0, ZoneOffset.UTC).toLocalDate();
     expiredDatePtn = "date=" + DateTimeFormatter.ISO_LOCAL_DATE.format(expiredDate);
   }

standalone-metastore/packaging/src/docker/Dockerfile

Lines changed: 38 additions & 16 deletions

@@ -42,22 +42,46 @@ RUN echo ${BUILD_ENV}
 ARG HADOOP_VERSION
 ARG HIVE_VERSION

-RUN tar -xzvf /opt/hadoop-$HADOOP_VERSION.tar.gz -C /opt/ && \
-    rm -rf /opt/hadoop-$HADOOP_VERSION/share/doc/* && \
+RUN apt-get update && \
+    apt-get install -y wget
+
+RUN tar -xzv \
+    --exclude="hadoop-$HADOOP_VERSION/include" \
+    --exclude="hadoop-$HADOOP_VERSION/lib/native" \
+    --exclude="hadoop-$HADOOP_VERSION/share/doc" \
+    --exclude="hadoop-$HADOOP_VERSION/share/hadoop/client" \
+    --exclude="hadoop-$HADOOP_VERSION/share/hadoop/tools" \
+    --exclude="hadoop-$HADOOP_VERSION/share/hadoop/yarn/*" \
+    --exclude="*/jdiff" \
+    --exclude="*/sources" \
+    --exclude="*tests.jar" \
+    --exclude="*/webapps" \
+    -f /opt/hadoop-$HADOOP_VERSION.tar.gz \
+    -C /opt/ && \
+    \
+    find /opt/hadoop-$HADOOP_VERSION/share/hadoop/common/lib \
+      \( -name "jetty-*.jar" -o -name "zookeeper-*.jar" -o -name "netty-*.jar" \) \
+      -delete && \
+    # Extract hadoop jars only
+    tar -xzv \
+    -f /opt/hadoop-$HADOOP_VERSION.tar.gz \
+    -C /opt/ \
+    --wildcards "hadoop-$HADOOP_VERSION/share/hadoop/tools/lib/hadoop-*.jar" && \
+    # INSTALL HIVE
     tar -xzvf /opt/hive-standalone-metastore-$HIVE_VERSION-bin.tar.gz -C /opt/

 FROM eclipse-temurin:21.0.3_9-jre-ubi9-minimal AS run

+ARG UID=1000
 ARG HADOOP_VERSION
 ARG HIVE_VERSION
-COPY --from=env /opt/hadoop-$HADOOP_VERSION /opt/hadoop
-COPY --from=env /opt/apache-hive-metastore-$HIVE_VERSION-bin /opt/hive

 # Install dependencies
 RUN set -ex; \
     microdnf update -y; \
     microdnf -y install procps; \
-    rm -rf /var/lib/apt/lists/*
+    microdnf clean all; \
+    useradd --no-create-home -s /sbin/nologin -c "" --uid $UID hive

 # Set necessary environment variables.
 ENV HADOOP_HOME=/opt/hadoop \
@@ -66,20 +90,18 @@ ENV HADOOP_HOME=/opt/hadoop \

 ENV PATH=$HIVE_HOME/bin:$HADOOP_HOME/bin:$PATH

-COPY entrypoint.sh /
-COPY conf $HIVE_HOME/conf
-RUN chmod +x /entrypoint.sh
+COPY --from=env --chown=hive /opt/hadoop-$HADOOP_VERSION $HADOOP_HOME
+COPY --from=env --chown=hive /opt/apache-hive-metastore-$HIVE_VERSION-bin $HIVE_HOME

+COPY --chown=hive entrypoint.sh /
+COPY --chown=hive conf $HIVE_HOME/conf

-ARG UID=1000
-RUN useradd --no-create-home -s /sbin/nologin -c "" --uid $UID hive && \
-    chown hive /opt/hive && \
-    chown hive /opt/hadoop && \
-    chown hive /opt/hive/conf && \
-    mkdir -p /opt/hive/data/warehouse && \
-    chown hive /opt/hive/data/warehouse
+RUN chmod +x /entrypoint.sh && \
+    mkdir -p $HIVE_HOME/data/warehouse && \
+    chown hive $HIVE_HOME/data/warehouse

 USER hive
-WORKDIR /opt/hive
+WORKDIR $HIVE_HOME
 EXPOSE 9001 9083
+
 ENTRYPOINT ["sh", "-c", "/entrypoint.sh"]
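The two-pass extraction above is what shrinks the image: the first `tar` drops headers, docs, native libs, webapps, and the client/tools/yarn trees (then prunes jetty, zookeeper, and netty jars from common/lib), and the second pulls only the `hadoop-*.jar` files from `tools/lib` back in so `hadoop-aws` can be activated at runtime. A minimal build sketch; the `HADOOP_VERSION`/`HIVE_VERSION` build args come from the Dockerfile, while the version values and image tag are illustrative assumptions:

```shell
# Build the slimmed-down metastore image from the docker packaging directory.
# Version values and the tag are placeholders, not taken from this commit.
docker build \
  --build-arg HADOOP_VERSION=3.3.6 \
  --build-arg HIVE_VERSION=4.0.0 \
  -t hive-metastore:optimized .

# Inspect the resulting image size.
docker images hive-metastore
```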

standalone-metastore/packaging/src/docker/README.md

Lines changed: 7 additions & 0 deletions

@@ -131,6 +131,13 @@ export POSTGRES_LOCAL_PATH=`mvn help:evaluate -Dexpression=settings.localReposit
 If you don't have maven installed or have problems resolving the postgres driver, you can always download this jar yourself and
 change `POSTGRES_LOCAL_PATH` to the path of the downloaded jar.

+- Metastore with S3 support
+
+  Download aws-java-sdk-bundle-xxx.jar and place it under the jars directory:
+  wget https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.770/aws-java-sdk-bundle-1.12.770.jar -P jars/
+
+  Add the `fs.s3a.access.key` and `fs.s3a.secret.key` properties in `metastore-site.xml` under the conf directory.
+
 Then,
 ```shell
 docker compose up -d
```
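Taken together with the compose and entrypoint changes below, the S3 bring-up is a three-step workflow. A sketch using only the paths, version, and commands from the README additions above:

```shell
# 1. Stage the AWS SDK bundle where docker-compose.yml mounts it (./jars).
mkdir -p jars
wget https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.770/aws-java-sdk-bundle-1.12.770.jar -P jars/

# 2. Uncomment fs.s3a.access.key / fs.s3a.secret.key in conf/metastore-site.xml
#    and fill in your credentials.

# 3. Start the stack; the entrypoint copies the staged jars into place.
docker compose up -d
```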

standalone-metastore/packaging/src/docker/conf/metastore-site.xml

Lines changed: 11 additions & 0 deletions

@@ -32,4 +32,15 @@
     <name>metastore.catalog.servlet.auth</name>
     <value>none</value>
   </property>
+
+  <!--
+  <property>
+    <name>fs.s3a.access.key</name>
+    <value>YOUR_AWS_ACCESS_KEY</value>
+  </property>
+  <property>
+    <name>fs.s3a.secret.key</name>
+    <value>YOUR_AWS_SECRET_KEY</value>
+  </property>
+  -->
 </configuration>

standalone-metastore/packaging/src/docker/docker-compose.yml

Lines changed: 2 additions & 0 deletions

@@ -54,6 +54,8 @@ services:
       - type: bind
         source: ${POSTGRES_LOCAL_PATH}
         target: /opt/hive/lib/postgres.jar
+      # Mount local jars to a temporary staging area (Read-Only)
+      - ./jars:/tmp/ext-jars:ro
     networks:
       - hive
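The new bind mount stages any local jars read-only under /tmp/ext-jars, where the entrypoint (next file) picks them up at container start. One way to confirm the mount; the `metastore` service name is an assumption, since the service definition lies outside this hunk:

```shell
# List the staged jars inside the running container.
# "metastore" is an assumed compose service name; substitute your own.
docker compose exec metastore ls /tmp/ext-jars
```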

standalone-metastore/packaging/src/docker/entrypoint.sh

Lines changed: 26 additions & 1 deletion

@@ -19,16 +19,37 @@

 set -x

+# =========================================================================
+# DYNAMIC JAR LOADER (AWS/S3 Support)
+# =========================================================================
+STAGING_DIR="/tmp/ext-jars"
+
+TOOLS_LIB="${HADOOP_HOME}/share/hadoop/tools/lib"
+COMMON_LIB="${HADOOP_HOME}/share/hadoop/common/lib"
+
+# Checks if /tmp/ext-jars is mounted (via Docker volume).
+if [ -d "$STAGING_DIR" ]; then
+  # Check for aws-java-sdk-bundle (Wildcard handles versions)
+  if ls "$STAGING_DIR"/aws-java-sdk-bundle-*.jar 1> /dev/null 2>&1; then
+    echo "--> Installing AWS SDK Bundle..."
+    cp "$STAGING_DIR"/aws-java-sdk-bundle-*.jar "$COMMON_LIB/"
+    echo "--> activating hadoop-aws from tools..."
+    cp "$TOOLS_LIB"/hadoop-aws-*.jar "$COMMON_LIB/"
+  fi
+fi
+
 : "${DB_DRIVER:=derby}"

 SKIP_SCHEMA_INIT="${IS_RESUME:-false}"
 [[ $VERBOSE = "true" ]] && VERBOSE_MODE="--verbose" || VERBOSE_MODE=""

 function initialize_hive {
   COMMAND="-initOrUpgradeSchema"
+  # Check Hive version. If < 4.0.0, use older initSchema command
   if [ "$(echo "$HIVE_VER" | cut -d '.' -f1)" -lt "4" ]; then
     COMMAND="-${SCHEMA_COMMAND:-initSchema}"
   fi
+
   "$HIVE_HOME/bin/schematool" -dbType "$DB_DRIVER" "$COMMAND" "$VERBOSE_MODE"
   if [ $? -eq 0 ]; then
     echo "Initialized Hive Metastore Server schema successfully.."
@@ -39,17 +60,21 @@ function initialize_hive {
 }

 export HIVE_CONF_DIR=$HIVE_HOME/conf
+
 if [ -d "${HIVE_CUSTOM_CONF_DIR:-}" ]; then
   find "${HIVE_CUSTOM_CONF_DIR}" -type f -exec \
     ln -sfn {} "${HIVE_CONF_DIR}"/ \;
   export HADOOP_CONF_DIR=$HIVE_CONF_DIR
 fi

 export HADOOP_CLIENT_OPTS="$HADOOP_CLIENT_OPTS -Xmx1G $SERVICE_OPTS"
+
 if [[ "${SKIP_SCHEMA_INIT}" == "false" ]]; then
   # handles schema initialization
   initialize_hive
 fi

 export METASTORE_PORT=${METASTORE_PORT:-9083}
-exec "$HIVE_HOME/bin/start-metastore"
+
+echo "Starting Hive Metastore on port $METASTORE_PORT..."
+exec "$HIVE_HOME/bin/start-metastore"
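Because the loader copies jars into $HADOOP_HOME/share/hadoop/common/lib before the metastore starts, a quick runtime check is to look for both jars there. A sketch assuming a compose service named `metastore` (the name is not shown in this commit):

```shell
# Verify that S3 support was activated at startup.
docker compose exec metastore sh -c \
  'ls "$HADOOP_HOME/share/hadoop/common/lib" | grep -E "hadoop-aws|aws-java-sdk-bundle"'
```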
