
[#4514] improvement(iceberg): shrink catalog-lakehouse-iceberg libs size from 184MB to 56MB, IcebergRESTServer size from 162MB to 70MB #4548

Merged
1 change: 1 addition & 0 deletions LICENSE.bin
@@ -286,6 +286,7 @@
Apache Hadoop Auth
Apache Hadoop Client Aggregator
Apache Hadoop Common
Apache Hadoop HDFS
Apache Hadoop HDFS Client
Apache Hadoop MapReduce Common
Apache Hadoop MapReduce Core
37 changes: 18 additions & 19 deletions catalogs/catalog-lakehouse-iceberg/build.gradle.kts
@@ -31,22 +31,25 @@ val icebergVersion: String = libs.versions.iceberg.get()
val scalaCollectionCompatVersion: String = libs.versions.scala.collection.compat.get()

dependencies {
implementation(project(":api"))
implementation(project(":api")) {
exclude("*")
}
implementation(project(":catalogs:catalog-common"))
implementation(project(":common"))
implementation(project(":core"))
implementation(project(":common")) {
exclude("*")
}
implementation(project(":core")) {
exclude("*")
}
implementation(project(":iceberg:iceberg-common"))
implementation(project(":server-common"))
implementation(libs.bundles.iceberg)
implementation(libs.bundles.jersey)
implementation(libs.bundles.jetty)

implementation(libs.bundles.log4j)
implementation(libs.cglib)
implementation(libs.commons.collections4)
implementation(libs.commons.io)
implementation(libs.commons.lang3)
implementation(libs.guava)
implementation(libs.sqlite.jdbc)

annotationProcessor(libs.lombok)

@@ -60,7 +63,9 @@ dependencies {

testImplementation("org.scala-lang.modules:scala-collection-compat_$scalaVersion:$scalaCollectionCompatVersion")
testImplementation("org.apache.iceberg:iceberg-spark-runtime-${sparkMajorVersion}_$scalaVersion:$icebergVersion")
testImplementation("org.apache.spark:spark-hive_$scalaVersion:$sparkVersion")
testImplementation("org.apache.spark:spark-hive_$scalaVersion:$sparkVersion") {
exclude("org.apache.hive")
}
testImplementation("org.apache.spark:spark-sql_$scalaVersion:$sparkVersion") {
exclude("org.apache.avro")
exclude("org.apache.hadoop")
@@ -69,22 +74,12 @@
exclude("org.rocksdb")
}

testImplementation(libs.hadoop2.common) {
exclude("com.github.spotbugs")
}
testImplementation(libs.jersey.test.framework.core) {
exclude(group = "org.junit.jupiter")
}
testImplementation(libs.jersey.test.framework.provider.jetty) {
exclude(group = "org.junit.jupiter")
}
testImplementation(libs.junit.jupiter.api)
testImplementation(libs.junit.jupiter.params)
testImplementation(libs.mockito.core)
// The TestMultipleJDBCLoad test depends on testcontainers.mysql and testcontainers.postgresql
testImplementation(libs.mysql.driver)
testImplementation(libs.postgresql.driver)

testImplementation(libs.slf4j.api)
testImplementation(libs.testcontainers)
testImplementation(libs.testcontainers.mysql)
@@ -105,7 +100,11 @@ tasks {

val copyCatalogLibs by registering(Copy::class) {
dependsOn("jar", "runtimeJars")
from("build/libs")
from("build/libs") {
exclude("guava-*.jar")
exclude("log4j-*.jar")
exclude("slf4j-*.jar")
}
into("$rootDir/distribution/package/catalogs/lakehouse-iceberg/libs")
}
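
Two techniques drive the reduction in this file: exclude("*") on internal project dependencies, so sibling modules contribute only their own jar instead of their full transitive graph, and copy-time excludes so jars the server already provides (guava, log4j, slf4j) are not duplicated into the catalog libs directory. A minimal verification sketch, assuming the standard java plugin is applied (so a runtimeClasspath configuration exists; the task name is illustrative):

tasks.register("reportRuntimeClasspath") {
  doLast {
    // Resolve the runtime classpath and print per-jar sizes, largest first,
    // to make the 184MB -> 56MB effect of the excludes visible.
    val jars = configurations.runtimeClasspath.get().files.sortedByDescending { it.length() }
    jars.forEach { println("%8.2f MB  %s".format(it.length() / (1024.0 * 1024.0), it.name)) }
    println("total: %.2f MB in %d jars".format(jars.sumOf { it.length() } / (1024.0 * 1024.0), jars.size))
  }
}

Running ./gradlew :catalogs:catalog-lakehouse-iceberg:reportRuntimeClasspath before and after adding an exclude shows exactly which transitive jars were dropped.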

@@ -26,7 +26,7 @@
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.lang.math.RandomUtils;
import java.util.Random;
import org.apache.gravitino.catalog.lakehouse.iceberg.IcebergColumn;
import org.apache.gravitino.rel.Column;
import org.apache.gravitino.rel.expressions.Expression;
@@ -54,6 +54,8 @@ public class TestBaseConvert {
protected static final Map<String, Type> GRAVITINO_TYPE = new HashMap<>();
protected static final Map<String, org.apache.iceberg.types.Type> ICEBERG_TYPE = new HashMap<>();

private static Random random = new Random(System.currentTimeMillis());

static {
GRAVITINO_TYPE.put("BOOLEAN", org.apache.gravitino.rel.types.Types.BooleanType.get());
// Types not supported by iceberg
@@ -113,24 +115,24 @@ protected static SortOrder[] createSortOrder(String... colNames) {
results.add(
SortOrders.of(
field(colName),
RandomUtils.nextBoolean() ? SortDirection.DESCENDING : SortDirection.ASCENDING,
RandomUtils.nextBoolean() ? NullOrdering.NULLS_FIRST : NullOrdering.NULLS_LAST));
random.nextBoolean() ? SortDirection.DESCENDING : SortDirection.ASCENDING,
random.nextBoolean() ? NullOrdering.NULLS_FIRST : NullOrdering.NULLS_LAST));
}
return results.toArray(new SortOrder[0]);
}

protected static SortOrder createSortOrder(String name, String colName) {
return SortOrders.of(
FunctionExpression.of(name, field(colName)),
RandomUtils.nextBoolean() ? SortDirection.DESCENDING : SortDirection.ASCENDING,
RandomUtils.nextBoolean() ? NullOrdering.NULLS_FIRST : NullOrdering.NULLS_LAST);
random.nextBoolean() ? SortDirection.DESCENDING : SortDirection.ASCENDING,
random.nextBoolean() ? NullOrdering.NULLS_FIRST : NullOrdering.NULLS_LAST);
}

protected static SortOrder createSortOrder(String name, int width, String colName) {
return SortOrders.of(
FunctionExpression.of(name, Literals.integerLiteral(width), field(colName)),
RandomUtils.nextBoolean() ? SortDirection.DESCENDING : SortDirection.ASCENDING,
RandomUtils.nextBoolean() ? NullOrdering.NULLS_FIRST : NullOrdering.NULLS_LAST);
random.nextBoolean() ? SortDirection.DESCENDING : SortDirection.ASCENDING,
random.nextBoolean() ? NullOrdering.NULLS_FIRST : NullOrdering.NULLS_LAST);
}

protected static Types.NestedField createNestedField(
@@ -143,7 +145,7 @@ protected static Types.NestedField[] createNestedField(String... colNames) {
for (int i = 0; i < colNames.length; i++) {
results.add(
Types.NestedField.of(
i + 1, RandomUtils.nextBoolean(), colNames[i], getRandomIcebergType(), TEST_COMMENT));
i + 1, random.nextBoolean(), colNames[i], getRandomIcebergType(), TEST_COMMENT));
}
return results.toArray(new Types.NestedField[0]);
}
@@ -211,15 +213,15 @@ protected static String getGravitinoSortOrderExpressionString(Expression sortOrd
private static Type getRandomGravitinoType() {
Collection<Type> values = GRAVITINO_TYPE.values();
return values.stream()
.skip(RandomUtils.nextInt(values.size()))
.skip(random.nextInt(values.size()))
.findFirst()
.orElseThrow(() -> new RuntimeException("No type found"));
}

private static org.apache.iceberg.types.Type getRandomIcebergType() {
Collection<org.apache.iceberg.types.Type> values = ICEBERG_TYPE.values();
return values.stream()
.skip(RandomUtils.nextInt(values.size()))
.skip(random.nextInt(values.size()))
.findFirst()
.orElseThrow(() -> new RuntimeException("No type found"));
}
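
The swap above removes the commons-lang dependency with a mechanical replacement. A minimal standalone sketch (class name illustrative) of how java.util.Random covers the RandomUtils calls this test used:

import java.util.Random;

public class RandomSwapSketch {
  public static void main(String[] args) {
    // Same seeding as the test above.
    Random random = new Random(System.currentTimeMillis());
    boolean flip = random.nextBoolean(); // was RandomUtils.nextBoolean()
    int pick = random.nextInt(10);       // was RandomUtils.nextInt(10): 0..9 in both APIs
    System.out.println(flip + " " + pick);
  }
}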
@@ -17,7 +17,7 @@
* under the License.
*/

package org.apache.gravitino.server.web;
package org.apache.gravitino;

import java.util.Map;
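
This hunk relocates OverwriteDefaultConfig from org.apache.gravitino.server.web to org.apache.gravitino, which lets iceberg-common compile without depending on server-common. A minimal sketch of the relocated interface, assuming the single accessor that IcebergConfig overrides later in this diff (the real interface may declare more members):

package org.apache.gravitino;

import java.util.Map;

public interface OverwriteDefaultConfig {
  // The one method this PR relies on; see IcebergConfig.getOverwriteDefaultConfig below.
  Map<String, String> getOverwriteDefaultConfig();
}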

@@ -24,6 +24,11 @@ public final class ConfigConstants {

private ConfigConstants() {}

/** HTTP Server port, reused by Gravitino server and Iceberg REST server */
public static final String WEBSERVER_HTTP_PORT = "httpPort";
/** HTTPS Server port, reused by Gravitino server and Iceberg REST server */
public static final String WEBSERVER_HTTPS_PORT = "httpsPort";

/** The value of messages used to indicate that the configuration is not set. */
public static final String NOT_BLANK_ERROR_MSG = "The value can't be blank";

1 change: 1 addition & 0 deletions gradle/libs.versions.toml
@@ -138,6 +138,7 @@ hive2-common = { group = "org.apache.hive", name = "hive-common", version.ref = "hive2"}
hive2-jdbc = { group = "org.apache.hive", name = "hive-jdbc", version.ref = "hive2"}
hadoop2-auth = { group = "org.apache.hadoop", name = "hadoop-auth", version.ref = "hadoop2" }
hadoop2-hdfs = { group = "org.apache.hadoop", name = "hadoop-hdfs", version.ref = "hadoop2" }
hadoop2-hdfs-client = { group = "org.apache.hadoop", name = "hadoop-hdfs-client", version.ref = "hadoop2" }
Contributor: Did you update the license.bin also?

@FANNG1 (Contributor, Sep 13, 2024): license.bin already contains Apache Hadoop HDFS Client but was missing Apache Hadoop HDFS; updated.

hadoop2-common = { group = "org.apache.hadoop", name = "hadoop-common", version.ref = "hadoop2"}
hadoop2-mapreduce-client-core = { group = "org.apache.hadoop", name = "hadoop-mapreduce-client-core", version.ref = "hadoop2"}
hadoop3-hdfs = { group = "org.apache.hadoop", name = "hadoop-hdfs", version.ref = "hadoop3" }
35 changes: 29 additions & 6 deletions iceberg/iceberg-common/build.gradle.kts
@@ -26,12 +26,19 @@ plugins {

dependencies {
implementation(project(":catalogs:catalog-common"))
implementation(project(":core"))
implementation(project(":common"))
implementation(project(":server-common"))
implementation(project(":core")) {
exclude("*")
}
implementation(project(":common")) {
exclude("*")
}
implementation(libs.bundles.iceberg)
implementation(libs.bundles.log4j)
implementation(libs.bundles.kerby) {
exclude("org.jline")
}
implementation(libs.caffeine)
implementation(libs.cglib)
implementation(libs.commons.lang3)
implementation(libs.guava)
implementation(libs.iceberg.aliyun)
@@ -42,17 +49,22 @@ dependencies {
exclude("com.github.spotbugs")
exclude("com.sun.jersey")
exclude("javax.servlet")
exclude("org.apache.curator")
exclude("org.apache.zookeeper")
exclude("org.mortbay.jetty")
}
// use hdfs-default.xml
implementation(libs.hadoop2.hdfs) {
Contributor: Do we still need this, is hdfs-client enough?

Contributor: I failed to exclude the hadoop-hdfs jar because we depend on its hdfs-default.xml to initialize the related environment.

exclude("*")
}
implementation(libs.hadoop2.hdfs.client) {
exclude("com.sun.jersey")
exclude("javax.servlet")
exclude("org.fusesource.leveldbjni")
exclude("org.mortbay.jetty")
}
implementation(libs.hadoop2.mapreduce.client.core) {
exclude("com.sun.jersey")
exclude("javax.servlet")
exclude("org.mortbay.jetty")
exclude("*")
}
implementation(libs.hive2.metastore) {
exclude("co.cask.tephra")
@@ -61,18 +73,28 @@
exclude("com.sun.jersey")
exclude("com.tdunning", "json")
exclude("com.zaxxer", "HikariCP")
exclude("com.github.joshelser")
exclude("io.dropwizard.metrics")
exclude("javax.servlet")
exclude("javax.transaction", "transaction-api")
exclude("jline")
exclude("org.apache.ant")
exclude("org.apache.avro", "avro")
exclude("org.apache.curator")
exclude("org.apache.derby")
exclude("org.apache.hbase")
exclude("org.apache.hive", "hive-service-rpc")
exclude("org.apache.hadoop")
exclude("org.apache.hadoop", "hadoop-yarn-api")
exclude("org.apache.hadoop", "hadoop-yarn-server-applicationhistoryservice")
exclude("org.apache.hadoop", "hadoop-yarn-server-common")
exclude("org.apache.hadoop", "hadoop-yarn-server-resourcemanager")
exclude("org.apache.hadoop", "hadoop-yarn-server-web-proxy")
exclude("org.apache.logging.log4j")
exclude("org.apache.parquet", "parquet-hadoop-bundle")
exclude("org.apache.orc")
exclude("org.apache.zookeeper")
exclude("org.datanucleus")
exclude("org.eclipse.jetty.aggregate", "jetty-all")
exclude("org.eclipse.jetty.orbit", "javax.servlet")
exclude("org.mortbay.jetty")
@@ -83,6 +105,7 @@
annotationProcessor(libs.lombok)
compileOnly(libs.lombok)

testImplementation(project(":server-common"))
Contributor: Why do we require "server-common" here?

Contributor: Because IcebergConfig overwrites the default HTTP port, we added a test in JettyServerConfig to check that it works, like:

  public void testIcebergHttpPort() {
    Map<String, String> properties = ImmutableMap.of();
    IcebergConfig icebergConfig = new IcebergConfig(properties);
    JettyServerConfig jettyServerConfig = JettyServerConfig.fromConfig(icebergConfig);
    Assertions.assertEquals(
        JettyServerConfig.DEFAULT_ICEBERG_REST_SERVICE_HTTP_PORT, jettyServerConfig.getHttpPort());
    Assertions.assertEquals(
        JettyServerConfig.DEFAULT_ICEBERG_REST_SERVICE_HTTPS_PORT,
        jettyServerConfig.getHttpsPort());
  }

testImplementation(libs.junit.jupiter.api)
testImplementation(libs.junit.jupiter.params)
testImplementation(libs.sqlite.jdbc)
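
The hdfs-default.xml requirement noted in the thread above is easy to regress when tightening excludes further. A guard sketched in the same Kotlin DSL (task name illustrative; assumes the java plugin's runtimeClasspath configuration):

tasks.register("verifyHdfsDefaultXml") {
  doLast {
    // Scan every resolved runtime jar for the resource Hadoop needs at startup.
    val found = configurations.runtimeClasspath.get().files
      .filter { it.name.endsWith(".jar") }
      .any { jar -> java.util.zip.ZipFile(jar).use { it.getEntry("hdfs-default.xml") != null } }
    check(found) { "hdfs-default.xml is missing from the runtime classpath" }
  }
}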
@@ -24,20 +24,22 @@
import java.util.Optional;
import org.apache.commons.lang3.StringUtils;
import org.apache.gravitino.Config;
import org.apache.gravitino.OverwriteDefaultConfig;
import org.apache.gravitino.catalog.lakehouse.iceberg.IcebergConstants;
import org.apache.gravitino.catalog.lakehouse.iceberg.IcebergPropertiesUtils;
import org.apache.gravitino.config.ConfigBuilder;
import org.apache.gravitino.config.ConfigConstants;
import org.apache.gravitino.config.ConfigEntry;
import org.apache.gravitino.server.web.JettyServerConfig;
import org.apache.gravitino.server.web.OverwriteDefaultConfig;
import org.apache.gravitino.storage.OSSProperties;
import org.apache.gravitino.storage.S3Properties;

public class IcebergConfig extends Config implements OverwriteDefaultConfig {

public static final String ICEBERG_CONFIG_PREFIX = "gravitino.iceberg-rest.";

public static final int DEFAULT_ICEBERG_REST_SERVICE_HTTP_PORT = 9001;
public static final int DEFAULT_ICEBERG_REST_SERVICE_HTTPS_PORT = 9433;

public static final ConfigEntry<String> CATALOG_BACKEND =
new ConfigBuilder(IcebergConstants.CATALOG_BACKEND)
.doc("Catalog backend of Gravitino Iceberg catalog")
@@ -248,9 +250,9 @@ public Map<String, String> getIcebergCatalogProperties() {
@Override
public Map<String, String> getOverwriteDefaultConfig() {
return ImmutableMap.of(
JettyServerConfig.WEBSERVER_HTTP_PORT.getKey(),
String.valueOf(JettyServerConfig.DEFAULT_ICEBERG_REST_SERVICE_HTTP_PORT),
JettyServerConfig.WEBSERVER_HTTPS_PORT.getKey(),
String.valueOf(JettyServerConfig.DEFAULT_ICEBERG_REST_SERVICE_HTTPS_PORT));
ConfigConstants.WEBSERVER_HTTP_PORT,
String.valueOf(DEFAULT_ICEBERG_REST_SERVICE_HTTP_PORT),
ConfigConstants.WEBSERVER_HTTPS_PORT,
String.valueOf(DEFAULT_ICEBERG_REST_SERVICE_HTTPS_PORT));
}
}
@@ -47,10 +47,9 @@ public void testIcebergHttpPort() {
IcebergConfig icebergConfig = new IcebergConfig(properties);
JettyServerConfig jettyServerConfig = JettyServerConfig.fromConfig(icebergConfig);
Assertions.assertEquals(
JettyServerConfig.DEFAULT_ICEBERG_REST_SERVICE_HTTP_PORT, jettyServerConfig.getHttpPort());
IcebergConfig.DEFAULT_ICEBERG_REST_SERVICE_HTTP_PORT, jettyServerConfig.getHttpPort());
Assertions.assertEquals(
JettyServerConfig.DEFAULT_ICEBERG_REST_SERVICE_HTTPS_PORT,
jettyServerConfig.getHttpsPort());
IcebergConfig.DEFAULT_ICEBERG_REST_SERVICE_HTTPS_PORT, jettyServerConfig.getHttpsPort());

properties =
ImmutableMap.of(
14 changes: 11 additions & 3 deletions iceberg/iceberg-rest-server/build.gradle.kts
@@ -34,14 +34,22 @@ dependencies {
implementation(project(":api"))
implementation(project(":catalogs:catalog-common"))
implementation(project(":clients:client-java"))
implementation(project(":core"))
implementation(project(":common"))
implementation(project(":core")) {
exclude("*")
}
implementation(project(":common")) {
exclude("*")
}
implementation(project(":iceberg:iceberg-common"))
implementation(project(":server-common"))
implementation(project(":server-common")) {
exclude("*")
}
implementation(libs.bundles.iceberg)
implementation(libs.bundles.jetty)
implementation(libs.bundles.jersey)
implementation(libs.bundles.log4j)
implementation(libs.bundles.metrics)
implementation(libs.bundles.prometheus)
implementation(libs.caffeine)
implementation(libs.commons.lang3)
implementation(libs.guava)
@@ -20,7 +20,6 @@

import com.google.common.collect.Maps;
import java.util.Map;
import java.util.UUID;
import org.apache.gravitino.catalog.lakehouse.iceberg.IcebergConstants;
import org.apache.gravitino.iceberg.common.IcebergConfig;
import org.apache.gravitino.iceberg.common.ops.IcebergTableOps;
@@ -33,9 +32,6 @@
import org.junit.jupiter.params.provider.ValueSource;

public class TestConfigBasedIcebergTableOpsProvider {
private static final String STORE_PATH =
"/tmp/gravitino_test_iceberg_jdbc_backend_" + UUID.randomUUID().toString().replace("-", "");

@Test
public void testValidIcebergTableOps() {
String hiveCatalogName = "hive_backend";
@@ -51,13 +47,11 @@ public void testValidIcebergTableOps() {
// jdbc backend catalog
config.put("catalog.jdbc_backend.catalog-backend-name", jdbcCatalogName);
config.put("catalog.jdbc_backend.catalog-backend", "jdbc");
config.put(
"catalog.jdbc_backend.uri",
String.format("jdbc:h2:%s;DB_CLOSE_DELAY=-1;MODE=MYSQL", STORE_PATH));
config.put("catalog.jdbc_backend.uri", "jdbc:sqlite::memory:");
config.put("catalog.jdbc_backend.warehouse", "/tmp/usr/jdbc/warehouse");
config.put("catalog.jdbc_backend.jdbc.password", "gravitino");
config.put("catalog.jdbc_backend.jdbc.user", "gravitino");
config.put("catalog.jdbc_backend.jdbc-driver", "org.h2.Driver");
config.put("catalog.jdbc_backend.jdbc-driver", "org.sqlite.JDBC");
Contributor: Why do we use Sqlite instead of H2?

Contributor: No special reason; this module already has a sqlite dependency.
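
For context, jdbc:sqlite::memory: keeps the whole backend in memory, which is why the UUID-based STORE_PATH above could be deleted. A minimal standalone sketch (class name illustrative; assumes sqlite-jdbc on the classpath):

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.sql.Statement;

public class SqliteInMemorySketch {
  public static void main(String[] args) throws SQLException {
    // sqlite-jdbc registers org.sqlite.JDBC via the JDBC service loader, so no
    // Class.forName is needed; the database vanishes when the connection
    // closes -- no file path, no cleanup, unlike the old file-backed H2 store.
    try (Connection conn = DriverManager.getConnection("jdbc:sqlite::memory:");
        Statement st = conn.createStatement()) {
      st.execute("CREATE TABLE t (id INTEGER)");
      st.execute("INSERT INTO t VALUES (1)");
    }
  }
}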

config.put("catalog.jdbc_backend.jdbc-initialize", "true");
// default catalog
config.put("catalog-backend-name", defaultCatalogName);
@@ -87,7 +81,7 @@ public void testValidIcebergTableOps() {
Assertions.assertEquals("jdbc", jdbcIcebergConfig.get(IcebergConfig.CATALOG_BACKEND));
Assertions.assertEquals(
"/tmp/usr/jdbc/warehouse", jdbcIcebergConfig.get(IcebergConfig.CATALOG_WAREHOUSE));
Assertions.assertEquals("org.h2.Driver", jdbcIcebergConfig.get(IcebergConfig.JDBC_DRIVER));
Assertions.assertEquals("org.sqlite.JDBC", jdbcIcebergConfig.get(IcebergConfig.JDBC_DRIVER));
Assertions.assertEquals(true, jdbcIcebergConfig.get(IcebergConfig.JDBC_INIT_TABLES));

Assertions.assertEquals(