[BEAM-7209][BEAM-9351][BEAM-9428] Upgrade Hive to version 3.1.3 (apache#17749)

* [BEAM-9351] Upgrade Hive to version 3.1.2

* This eliminated the pentaho dependency

* Fix auth issue in test

* Add change log

* Move internal test-only files to test

* Clean up original workaround: Hive 3.1.3 itself upgraded to log4j 2.17.1
Abacn authored and prodriguezdefino committed Jun 21, 2022
1 parent 9ecdd47 commit 33f2fb6
Showing 11 changed files with 69 additions and 61 deletions.
1 change: 1 addition & 0 deletions CHANGES.md
@@ -59,6 +59,7 @@
## I/Os

* Support for X source added (Java/Python) ([BEAM-X](https://issues.apache.org/jira/browse/BEAM-X)).
* Upgraded to Hive 3.1.3 for HCatalogIO. Users can still provide their own version of Hive. (Java) ([Issue-19554](https://github.com/apache/beam/issues/19554)).

## New Features / Improvements

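The changelog note says users can still provide their own Hive version. As a rough illustration, a downstream Gradle build could pin different Hive artifacts itself; the group/artifact coordinates below are real Maven IDs, but the versions and the exact wiring are illustrative assumptions, not part of this commit:

// Hedged sketch of a downstream build overriding the Hive version used with HCatalogIO.
dependencies {
    implementation "org.apache.beam:beam-sdks-java-io-hcatalog:2.40.0" // assumed Beam release
    // Bring your own Hive instead of the 3.1.3 default this module compiles against:
    implementation "org.apache.hive.hcatalog:hive-hcatalog-core:2.3.9" // user-chosen version
    implementation "org.apache.hive:hive-exec:2.3.9"
}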
@@ -53,12 +53,6 @@ class Repositories {
url "https://packages.confluent.io/maven/"
content { includeGroup "io.confluent" }
}

// For pentaho dependencies.
maven {
url "https://public.nexus.pentaho.org/repository/omni"
content { includeGroup "org.pentaho" }
}
}

// Apply a plugin which provides the 'updateOfflineRepository' task that creates an offline
@@ -76,7 +70,6 @@ class Repositories {
maven { url "https://plugins.gradle.org/m2/" }
maven { url "https://repo.spring.io/plugins-release" }
maven { url "https://packages.confluent.io/maven/" }
maven { url "https://public.nexus.pentaho.org/repository/omni" }
maven { url project.offlineRepositoryRoot }
}
includeSources = false
24 changes: 10 additions & 14 deletions sdks/java/extensions/sql/hcatalog/build.gradle
@@ -25,22 +25,17 @@ applyJavaNature(
],
)

def hive_version = "2.1.0"
def hive_version = "3.1.3"
def netty_version = "4.1.51.Final"

configurations.all {
resolutionStrategy {
// Pin log4j as workaround for CVE-2021-44228
// HIVE-25804 should address this upstream, but only in 4.0
// TODO(BEAM-9351): Upgrade Hive and remove this pin
def log4j_version = "2.17.1"
force "org.apache.logging.log4j:log4j-api:${log4j_version}"
force "org.apache.logging.log4j:log4j-core:${log4j_version}"
force "org.apache.logging.log4j:log4j-slf4j-impl:${log4j_version}"
force "org.apache.logging.log4j:log4j-1.2-api:${log4j_version}"
force "org.apache.logging.log4j:log4j-web:${log4j_version}"
}
}
/*
* We need to rely on manually specifying these evaluationDependsOn to ensure that
* the following projects are evaluated before we evaluate this project. This is because
* we are attempting to reference the "sourceSets.test.output" directly.
* TODO: use testFixtures feature which is introduced in Gradle 5.6 instead of
* the test outputs directly.
*/
evaluationDependsOn(":sdks:java:io:hcatalog")

dependencies {
implementation project(":sdks:java:extensions:sql")
@@ -49,6 +44,7 @@ dependencies {
implementation "com.alibaba:fastjson:1.2.69"
implementation library.java.vendored_guava_26_0_jre

testImplementation project(":sdks:java:io:hcatalog").sourceSets.test.output
// Needed for HCatalogTableProvider tests,
// they use HCat* types
testImplementation "io.netty:netty-all:$netty_version"
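The TODO in the comment block above refers to Gradle's testFixtures feature (the java-test-fixtures plugin, introduced in Gradle 5.6). A hedged sketch of what that migration could eventually look like; the module paths come from this diff, but the wiring itself is an assumption, not part of this commit:

// In sdks/java/io/hcatalog/build.gradle (hypothetical):
plugins { id 'java-test-fixtures' }
// ...with shared helpers such as EmbeddedMetastoreService moved to src/testFixtures/java.

// Consumers would then swap the raw test-output reference
//   testImplementation project(":sdks:java:io:hcatalog").sourceSets.test.output
// for a declared fixtures dependency, which also removes the evaluationDependsOn workaround:
dependencies {
    testImplementation(testFixtures(project(":sdks:java:io:hcatalog")))
}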
@@ -235,7 +235,7 @@ public void testJoinMultipleExtraProvidersWithImplicitHiveDB() throws Exception
pipeline.run();
}

private void reCreateTestTable() throws Exception {
private void reCreateTestTable() {
service.executeQuery("drop table " + TEST_TABLE);
service.executeQuery("create table " + TEST_TABLE + "(f_str string, f_int int)");
}
25 changes: 2 additions & 23 deletions sdks/java/io/hcatalog/build.gradle
@@ -37,34 +37,13 @@ def hadoopVersions = [

hadoopVersions.each {kv -> configurations.create("hadoopVersion$kv.key")}

def hive_version = "2.1.0"

test {
// TODO: Get tests to run. Known issues:
// * calcite-avatica bundles w/o repackaging Jackson (CALCITE-1110)
// * hive-exec bundles w/o repackaging Guava (HIVE-13690)
ignoreFailures true
}

configurations.all {
resolutionStrategy {
// Pin log4j as workaround for CVE-2021-44228
// HIVE-25804 should address this upstream, but only in 4.0
// TODO(BEAM-9351): Upgrade Hive and remove this pin
def log4j_version = "2.17.1"
force "org.apache.logging.log4j:log4j-api:${log4j_version}"
force "org.apache.logging.log4j:log4j-core:${log4j_version}"
force "org.apache.logging.log4j:log4j-slf4j-impl:${log4j_version}"
force "org.apache.logging.log4j:log4j-1.2-api:${log4j_version}"
force "org.apache.logging.log4j:log4j-web:${log4j_version}"
}
}
def hive_version = "3.1.3"

/*
* We need to rely on manually specifying these evaluationDependsOn to ensure that
* the following projects are evaluated before we evaluate this project. This is because
* we are attempting to reference the "sourceSets.test.output" directly.
* TODO: Swap to generating test artifacts which we can then rely on instead of
* TODO: use testFixtures feature which is introduced in Gradle 5.6 instead of
* the test outputs directly.
*/
evaluationDependsOn(":sdks:java:io:common")
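With the CVE-2021-44228 force-pin deleted, the log4j version on the classpath is now whatever Hive 3.1.3 pulls in transitively (2.17.1, per the commit message). A generic, hedged way to verify what Gradle actually resolves; this task is illustrative and not part of the commit:

// Hypothetical helper task: print every log4j artifact resolved for the test runtime.
task printLog4jVersions {
    doLast {
        configurations.testRuntimeClasspath.resolvedConfiguration.resolvedArtifacts
            .findAll { it.moduleVersion.id.group == "org.apache.logging.log4j" }
            .each { println it.moduleVersion.id } // e.g. org.apache.logging.log4j:log4j-core:2.17.1
    }
}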
@@ -62,7 +62,6 @@
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList;
import org.apache.hadoop.hive.metastore.api.NoSuchObjectException;
import org.apache.hadoop.hive.ql.CommandNeedRetryException;
import org.apache.hive.hcatalog.data.DefaultHCatRecord;
import org.apache.hive.hcatalog.data.HCatRecord;
import org.apache.hive.hcatalog.data.transfer.ReaderContext;
@@ -369,7 +368,7 @@ private void reCreateTestTable() {
service.executeQuery("create table " + TEST_TABLE + "(mycol1 string, mycol2 int)");
}

private void reCreateTestTableForUnboundedReads() throws CommandNeedRetryException {
private void reCreateTestTableForUnboundedReads() {
service.executeQuery("drop table " + TEST_TABLE);
service.executeQuery(
"create table "
@@ -26,8 +26,9 @@
import org.apache.beam.sdk.annotations.Internal;
import org.apache.commons.io.FileUtils;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.CommandNeedRetryException;
import org.apache.hadoop.hive.ql.Driver;
import org.apache.hadoop.hive.ql.DriverFactory;
import org.apache.hadoop.hive.ql.IDriver;
import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse;
import org.apache.hadoop.hive.ql.session.SessionState;

/**
@@ -40,7 +41,7 @@
*/
@Internal
public final class EmbeddedMetastoreService implements AutoCloseable {
private final Driver driver;
private final IDriver driver;
private final HiveConf hiveConf;
private final SessionState sessionState;

@@ -64,21 +65,19 @@ public EmbeddedMetastoreService(String baseDirPath) throws IOException {
hiveConf.setBoolVar(HiveConf.ConfVars.HIVEOPTIMIZEMETADATAQUERIES, true);
hiveConf.setVar(
HiveConf.ConfVars.HIVE_AUTHORIZATION_MANAGER,
"org.apache.hadoop.hive.ql.security.authorization.plugin.sqlstd."
+ "SQLStdHiveAuthorizerFactory");
"org.apache.hadoop.hive.ql.security.authorization.DefaultHiveAuthorizationProvider");
hiveConf.set("test.tmp.dir", hiveDirPath);

System.setProperty("derby.stream.error.file", "/dev/null");
driver = new Driver(hiveConf);
driver = DriverFactory.newDriver(hiveConf);
sessionState = SessionState.start(new SessionState(hiveConf));
}

/** Executes the passed query on the embedded metastore service. */
public void executeQuery(String query) {
try {
driver.run(query);
} catch (CommandNeedRetryException e) {
throw new RuntimeException(e);
CommandProcessorResponse response = driver.run(query);
if (response.failed()) {
throw new RuntimeException(response.getException());
}
}

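In Hive 3, the concrete Driver is obtained through DriverFactory as an IDriver, and the checked CommandNeedRetryException is gone, so failures are detected by inspecting the returned CommandProcessorResponse instead. A hedged usage sketch of the updated helper; the path and query are illustrative:

// Hypothetical test usage (inside a test method declared throws Exception).
// EmbeddedMetastoreService implements AutoCloseable, so try-with-resources
// shuts the embedded metastore down automatically.
try (EmbeddedMetastoreService service = new EmbeddedMetastoreService("/tmp/beam-hcatalog-test")) {
  service.executeQuery("create table test_tbl (f_str string, f_int int)");
  service.executeQuery("drop table test_tbl");
} // a failed command now surfaces as a RuntimeException via CommandProcessorResponse.failed()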
@@ -25,6 +25,12 @@
<description>Internal marker for test. Used for masking env-dependent values</description>
</property>

<!-- Properties for test folders -->
<property>
<name>mapreduce.jobtracker.staging.root.dir</name>
<value>${test.tmp.dir}/cli/mapred/staging</value>
</property>

<!-- Hive Configuration can either be stored in this file or in the hadoop configuration files -->
<!-- that are implied by Hadoop setup variables. -->
<!-- Aside from Hadoop setup variables - this file is provided as a convenience so that Hive -->
@@ -63,9 +69,14 @@
<value>true</value>
</property>

<property>
<name>hive.metastore.schema.verification</name>
<value>false</value>
</property>

<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:derby:;databaseName=${test.tmp.dir}/junit_metastore_db;create=true</value>
<value>jdbc:derby:memory:${test.tmp.dir}/junit_metastore_db;create=true</value>
</property>

<property>
@@ -136,7 +147,7 @@

<property>
<name>hive.exec.pre.hooks</name>
<value>org.apache.hadoop.hive.ql.hooks.PreExecutePrinter, org.apache.hadoop.hive.ql.hooks.EnforceReadOnlyTables</value>
<value>org.apache.hadoop.hive.ql.hooks.PreExecutePrinter, org.apache.hadoop.hive.ql.hooks.EnforceReadOnlyTables, org.apache.hadoop.hive.ql.hooks.MaterializedViewRegistryPropertiesHook</value>
<description>Pre Execute Hook for Tests</description>
</property>

@@ -260,6 +271,12 @@
<value>hive_admin_user</value>
</property>

<property>
<name>hive.security.authorization.manager</name>
<value>org.apache.hadoop.hive.ql.security.authorization.plugin.sqlstd.SQLStdHiveAuthorizerFactoryForTest</value>
<description>The Hive client authorization manager class name.</description>
</property>

<property>
<name>hive.llap.io.cache.orc.size</name>
<value>8388608</value>
@@ -291,11 +308,36 @@
<value>true</value>
</property>


<property>
<name>hive.llap.io.allocator.direct</name>
<value>false</value>
</property>

<property>
<name>hive.stats.column.autogather</name>
<value>true</value>
</property>

<property>
<name>hive.materializedview.rewriting</name>
<value>true</value>
</property>

<property>
<name>hive.stats.fetch.bitvector</name>
<value>true</value>
</property>


<property>
<name>yarn.nodemanager.disk-health-checker.max-disk-utilization-per-disk-percentage</name>
<value>99</value>
</property>

<property>
<name>hive.query.results.cache.enabled</name>
<value>false</value>
</property>


</configuration>
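HiveConf loads this hive-site.xml from the test classpath, and EmbeddedMetastoreService layers its programmatic settings (such as test.tmp.dir) on top. A small, hedged check that the new in-memory Derby metastore URL is in effect; the snippet is illustrative, not part of the commit:

// Hypothetical verification of the configuration above.
HiveConf conf = new HiveConf();
conf.set("test.tmp.dir", "/tmp/beam-hive"); // resolves the ${test.tmp.dir} placeholder
// METASTORECONNECTURLKEY maps to javax.jdo.option.ConnectionURL:
System.out.println(conf.getVar(HiveConf.ConfVars.METASTORECONNECTURLKEY));
// expected to start with "jdbc:derby:memory:" after this change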
1 change: 0 additions & 1 deletion vendor/calcite-1_28_0/build.gradle
@@ -57,7 +57,6 @@ List<String> packagesToRelocate = [
"org.apiguardian.api",
"org.codehaus",
"org.objectweb",
"org.pentaho",
"org.yaml",
]

