IMPALA-10030: Remove unnecessary jar dependencies
Remove the dependency on hadoop-hdfs. This jar contains the core
server-side implementation of HDFS and therefore pulls in a number of
unnecessary transitive dependencies. Impala only requires the jar for a
few configuration key names. Most of these keys have been moved to the
appropriate HDFS client jars, and the remaining ones are deprecated
altogether. Removing the jar required a few code changes to reference
the affected configuration keys from their new locations.
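
For reference, a sketch of the client-side declaration that remains in
fe/pom.xml (the exact entry appears in the diff below); hadoop-hdfs-client
is the jar that ships org.apache.hadoop.hdfs.client.HdfsClientConfigKeys:

  <dependency>
    <!-- Client-side HDFS APIs and configuration keys (HdfsClientConfigKeys). -->
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-hdfs-client</artifactId>
    <version>${hadoop.version}</version>
  </dependency>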

Remove all transitive Kafka dependencies from the Apache Ranger
dependency. Previously, Impala only excluded the Kafka jar with binary
version kafka_2.11; however, Ranger appears to have recently upgraded
its dependency to kafka_2.12. Now all Kafka dependencies are excluded,
regardless of artifact name.
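
A sketch of the wildcard exclusion used for this; it sits on the Ranger
dependency in fe/pom.xml (see the corresponding hunk below for the exact
entry):

  <exclusion>
    <!-- Matches every org.apache.kafka artifact, whatever its Scala binary suffix. -->
    <groupId>org.apache.kafka</groupId>
    <artifactId>*</artifactId>
  </exclusion>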

Remove all transitive dependencies from the Apache Ozone dependency.
Impala depends on the shaded Ozone client jar, which already bundles
all required transitive dependencies, yet Ozone still pulls in some
transitive dependencies even though they are not needed.
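
A sketch of the blanket exclusion on the shaded Ozone client (the actual
entry, including an explanatory comment, is in the fe/pom.xml hunk below):

  <dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-ozone-filesystem-hadoop3</artifactId>
    <version>${ozone.version}</version>
    <exclusions>
      <!-- The shaded jar already bundles what it needs; drop all transitive dependencies. -->
      <exclusion>
        <groupId>*</groupId>
        <artifactId>*</artifactId>
      </exclusion>
    </exclusions>
  </dependency>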

Make some other minor cleanups and improvements in fe/pom.xml.

This saves about 70 MB of space in the Docker images.

Testing:
* Ran exhaustive tests
* Ran on-prem cluster E2E tests

Change-Id: Iadbb6142466f73f067dd7cf9d401ff81145c74cc
Reviewed-on: http://gerrit.cloudera.org:8080/16311
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
sahilTakiar authored and Impala Public Jenkins committed Sep 1, 2020
1 parent 1cdae46 commit f85dbff
Showing 5 changed files with 51 additions and 102 deletions.
98 changes: 22 additions & 76 deletions fe/pom.xml
@@ -35,14 +35,6 @@ under the License.
<name>Apache Impala Query Engine Frontend</name>

<dependencies>
<!-- Force json-smart dependency.
See https://issues.apache.org/jira/browse/HADOOP-14903 -->
<dependency>
<groupId>net.minidev</groupId>
<artifactId>json-smart</artifactId>
<version>2.3</version>
</dependency>

<dependency>
<groupId>org.apache.impala</groupId>
<artifactId>query-event-hook-api</artifactId>
@@ -54,46 +46,18 @@ under the License.
<artifactId>impala-data-source-api</artifactId>
<version>${impala.extdatasrc.api.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>${hadoop.version}</version>
<exclusions>
<exclusion>
<groupId>org.eclipse.jetty</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<!-- IMPALA-9108: Avoid pulling in leveldbjni, which is unneeded. -->
<groupId>org.fusesource.leveldbjni</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<!-- IMPALA-9468: Avoid pulling in netty for security reasons -->
<groupId>io.netty</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>com.sun.jersey</groupId>
<artifactId>jersey-server</artifactId>
</exclusion>
</exclusions>
</dependency>

<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs-client</artifactId>
<version>${hadoop.version}</version>
</dependency>

<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${hadoop.version}</version>
<exclusions>
<!-- https://issues.apache.org/jira/browse/HADOOP-14903 -->
<exclusion>
<groupId>net.minidev</groupId>
<artifactId>json-smart</artifactId>
</exclusion>
<exclusion>
<groupId>org.eclipse.jetty</groupId>
<artifactId>*</artifactId>
@@ -113,17 +77,11 @@ under the License.
</exclusion>
</exclusions>
</dependency>

<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-auth</artifactId>
<version>${hadoop.version}</version>
<exclusions>
<!-- https://issues.apache.org/jira/browse/HADOOP-14903 -->
<exclusion>
<groupId>net.minidev</groupId>
<artifactId>json-smart</artifactId>
</exclusion>
</exclusions>
</dependency>

<dependency>
@@ -162,13 +120,6 @@ under the License.
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-azure-datalake</artifactId>
<version>${hadoop.version}</version>
<exclusions>
<!-- https://issues.apache.org/jira/browse/HADOOP-14903 -->
<exclusion>
<groupId>net.minidev</groupId>
<artifactId>json-smart</artifactId>
</exclusion>
</exclusions>
</dependency>

<dependency>
@@ -218,7 +169,7 @@ under the License.
<exclusions>
<exclusion>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka_2.11</artifactId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.shiro</groupId>
@@ -230,12 +181,14 @@ under the License.
</exclusion>
</exclusions>
</dependency>
<!-- this is needed by ranger-plugins-audit -->

<!-- This is needed by ranger-plugins-audit -->
<dependency>
<groupId>javax.mail</groupId>
<artifactId>mail</artifactId>
<version>1.4</version>
</dependency>

<dependency>
<groupId>javax.ws.rs</groupId>
<artifactId>javax.ws.rs-api</artifactId>
@@ -290,26 +243,12 @@ under the License.
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
<version>${hbase.version}</version>
<exclusions>
<!-- https://issues.apache.org/jira/browse/HADOOP-14903 -->
<exclusion>
<groupId>net.minidev</groupId>
<artifactId>json-smart</artifactId>
</exclusion>
</exclusions>
</dependency>

<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-common</artifactId>
<version>${hbase.version}</version>
<exclusions>
<!-- https://issues.apache.org/jira/browse/HADOOP-14903 -->
<exclusion>
<groupId>net.minidev</groupId>
<artifactId>json-smart</artifactId>
</exclusion>
</exclusions>
</dependency>

<dependency>
@@ -382,6 +321,7 @@ under the License.
<artifactId>slf4j-api</artifactId>
<version>${slf4j.version}</version>
</dependency>

<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
@@ -401,9 +341,9 @@ under the License.
</dependency>

<dependency>
<groupId>com.google.errorprone</groupId>
<artifactId>error_prone_annotations</artifactId>
<version>2.3.1</version>
<groupId>com.google.errorprone</groupId>
<artifactId>error_prone_annotations</artifactId>
<version>2.3.1</version>
</dependency>

<dependency>
@@ -424,6 +364,7 @@ under the License.
<artifactId>json-simple</artifactId>
<version>1.1.1</version>
</dependency>

<dependency>
<groupId>org.glassfish</groupId>
<artifactId>javax.json</artifactId>
@@ -979,11 +920,6 @@ under the License.
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-1.2-api</artifactId>
</exclusion>
<!-- https://issues.apache.org/jira/browse/HADOOP-14903 -->
<exclusion>
<groupId>net.minidev</groupId>
<artifactId>json-smart</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.hive</groupId>
<artifactId>hive-serde</artifactId>
@@ -1035,6 +971,16 @@ under the License.
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-ozone-filesystem-hadoop3</artifactId>
<version>${ozone.version}</version>
<!-- Remove all transitive dependencies from the Apache Ozone dependency.
hadoop-ozone-filesystem-hadoop3 is a shaded-jar, which already includes
all required transitive dependencies. For some reason, Ozone still pulls
in some transitive dependencies even though they are not needed. -->
<exclusions>
<exclusion>
<groupId>*</groupId>
<artifactId>*</artifactId>
</exclusion>
</exclusions>
</dependency>
</dependencies>
</profile>
21 changes: 11 additions & 10 deletions fe/src/main/java/org/apache/impala/service/JniFrontend.java
@@ -30,7 +30,7 @@
import org.apache.hadoop.fs.azurebfs.AzureBlobFileSystem;
import org.apache.hadoop.fs.azurebfs.SecureAzureBlobFileSystem;
import org.apache.hadoop.fs.s3a.S3AFileSystem;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.client.HdfsClientConfigKeys;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.security.Groups;
import org.apache.hadoop.security.JniBasedUnixGroupsMappingWithFallback;
@@ -753,8 +753,8 @@ private String checkLogFilePermission() {
*/
@VisibleForTesting
protected static String checkShortCircuitRead(Configuration conf) {
if (!conf.getBoolean(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_KEY,
DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_DEFAULT)) {
if (!conf.getBoolean(HdfsClientConfigKeys.Read.ShortCircuit.KEY,
HdfsClientConfigKeys.Read.ShortCircuit.DEFAULT)) {
LOG.info("Short-circuit reads are not enabled.");
return "";
}
@@ -765,11 +765,12 @@ protected static String checkShortCircuitRead(Configuration conf) {
StringBuilder errorCause = new StringBuilder();

// dfs.domain.socket.path must be set properly
String domainSocketPath = conf.getTrimmed(DFSConfigKeys.DFS_DOMAIN_SOCKET_PATH_KEY,
DFSConfigKeys.DFS_DOMAIN_SOCKET_PATH_DEFAULT);
String domainSocketPath =
conf.getTrimmed(HdfsClientConfigKeys.DFS_DOMAIN_SOCKET_PATH_KEY,
HdfsClientConfigKeys.DFS_DOMAIN_SOCKET_PATH_DEFAULT);
if (domainSocketPath.isEmpty()) {
errorCause.append(prefix);
errorCause.append(DFSConfigKeys.DFS_DOMAIN_SOCKET_PATH_KEY);
errorCause.append(HdfsClientConfigKeys.DFS_DOMAIN_SOCKET_PATH_KEY);
errorCause.append(" is not configured.\n");
} else {
// The socket path parent directory must be readable and executable.
@@ -781,16 +782,16 @@ protected static String checkShortCircuitRead(Configuration conf) {
} else if (socketDir == null || !socketDir.canRead() || !socketDir.canExecute()) {
errorCause.append(prefix);
errorCause.append("Impala cannot read or execute the parent directory of ");
errorCause.append(DFSConfigKeys.DFS_DOMAIN_SOCKET_PATH_KEY);
errorCause.append(HdfsClientConfigKeys.DFS_DOMAIN_SOCKET_PATH_KEY);
errorCause.append("\n");
}
}

// dfs.client.use.legacy.blockreader.local must be set to false
if (conf.getBoolean(DFSConfigKeys.DFS_CLIENT_USE_LEGACY_BLOCKREADERLOCAL,
DFSConfigKeys.DFS_CLIENT_USE_LEGACY_BLOCKREADERLOCAL_DEFAULT)) {
if (conf.getBoolean(HdfsClientConfigKeys.DFS_CLIENT_USE_LEGACY_BLOCKREADERLOCAL,
HdfsClientConfigKeys.DFS_CLIENT_USE_LEGACY_BLOCKREADERLOCAL_DEFAULT)) {
errorCause.append(prefix);
errorCause.append(DFSConfigKeys.DFS_CLIENT_USE_LEGACY_BLOCKREADERLOCAL);
errorCause.append(HdfsClientConfigKeys.DFS_CLIENT_USE_LEGACY_BLOCKREADERLOCAL);
errorCause.append(" should not be enabled.\n");
}

8 changes: 4 additions & 4 deletions fe/src/main/java/org/apache/impala/util/FsPermissionChecker.java
@@ -39,8 +39,7 @@
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.hdfs.protocol.AclException;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_SUPERUSERGROUP_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_PERMISSIONS_SUPERUSERGROUP_KEY;
import static org.apache.hadoop.hdfs.client.HdfsClientConfigKeys.DeprecatedKeys.DFS_PERMISSIONS_SUPERUSERGROUP_KEY;

import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
@@ -72,8 +71,9 @@ public class FsPermissionChecker {
private FsPermissionChecker() throws IOException {
UserGroupInformation ugi = UserGroupInformation.getCurrentUser();
groups_.addAll(Arrays.asList(ugi.getGroupNames()));
supergroup_ = CONF.get(DFS_PERMISSIONS_SUPERUSERGROUP_KEY,
DFS_PERMISSIONS_SUPERUSERGROUP_DEFAULT);
// The default value is taken from the String DFS_PERMISSIONS_SUPERUSERGROUP_DEFAULT
// in DFSConfigKeys.java from the hadoop-hdfs jar.
supergroup_ = CONF.get(DFS_PERMISSIONS_SUPERUSERGROUP_KEY, "supergroup");
user_ = ugi.getShortUserName();
}

7 changes: 4 additions & 3 deletions fe/src/main/java/org/apache/impala/util/HdfsCachingUtil.java
@@ -22,7 +22,6 @@

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.protocol.CacheDirectiveEntry;
import org.apache.hadoop.hdfs.protocol.CacheDirectiveInfo;
@@ -265,9 +264,11 @@ public static void waitForDirective(long directiveId)

// The refresh interval is how often HDFS will update cache directive stats. We use
// this value to determine how frequently we should poll for changes.
// The key dfs.namenode.path.based.cache.refresh.interval.ms is copied from the string
// DFS_NAMENODE_PATH_BASED_CACHE_REFRESH_INTERVAL_MS in DFSConfigKeys.java from the
// hadoop-hdfs jar.
long hdfsRefreshIntervalMs = getDfs().getConf().getLong(
DFSConfigKeys.DFS_NAMENODE_PATH_BASED_CACHE_REFRESH_INTERVAL_MS,
DFSConfigKeys.DFS_NAMENODE_PATH_BASED_CACHE_REFRESH_INTERVAL_MS_DEFAULT);
"dfs.namenode.path.based.cache.refresh.interval.ms", 30000L);
Preconditions.checkState(hdfsRefreshIntervalMs > 0);

// Loop until either MAX_UNCHANGED_CACHING_REFRESH_INTERVALS have passed with no
19 changes: 10 additions & 9 deletions fe/src/test/java/org/apache/impala/service/JniFrontendTest.java
@@ -27,7 +27,7 @@

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.CommonConfigurationKeys;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.client.HdfsClientConfigKeys;
import org.apache.hadoop.security.JniBasedUnixGroupsMappingWithFallback;
import org.apache.hadoop.security.JniBasedUnixGroupsNetgroupMappingWithFallback;
import org.apache.hadoop.security.ShellBasedUnixGroupsMapping;
@@ -96,13 +96,14 @@ public void testCheckShortCircuitConfigs() {
socketDir.getParentFile().setExecutable(false);

Configuration conf = mock(Configuration.class);
when(conf.getBoolean(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_KEY,
DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_DEFAULT)).thenReturn(true);
when(conf.getTrimmed(DFSConfigKeys.DFS_DOMAIN_SOCKET_PATH_KEY,
DFSConfigKeys.DFS_DOMAIN_SOCKET_PATH_DEFAULT))
when(conf.getBoolean(HdfsClientConfigKeys.Read.ShortCircuit.KEY,
HdfsClientConfigKeys.Read.ShortCircuit.DEFAULT)).thenReturn(true);
when(conf.getTrimmed(HdfsClientConfigKeys.DFS_DOMAIN_SOCKET_PATH_KEY,
HdfsClientConfigKeys.DFS_DOMAIN_SOCKET_PATH_DEFAULT))
.thenReturn(socketDir.getAbsolutePath());
when(conf.getBoolean(DFSConfigKeys.DFS_CLIENT_USE_LEGACY_BLOCKREADERLOCAL,
DFSConfigKeys.DFS_CLIENT_USE_LEGACY_BLOCKREADERLOCAL_DEFAULT)).thenReturn(false);
when(conf.getBoolean(HdfsClientConfigKeys.DFS_CLIENT_USE_LEGACY_BLOCKREADERLOCAL,
HdfsClientConfigKeys.DFS_CLIENT_USE_LEGACY_BLOCKREADERLOCAL_DEFAULT))
.thenReturn(false);
BackendConfig.INSTANCE = mock(BackendConfig.class);

when(BackendConfig.INSTANCE.isDedicatedCoordinator()).thenReturn(true);
@@ -113,7 +114,7 @@ public void testCheckShortCircuitConfigs() {
actualErrorMessage = JniFrontend.checkShortCircuitRead(conf);
assertEquals("Invalid short-circuit reads configuration:\n"
+ " - Impala cannot read or execute the parent directory of "
+ DFSConfigKeys.DFS_DOMAIN_SOCKET_PATH_KEY + "\n",
+ HdfsClientConfigKeys.DFS_DOMAIN_SOCKET_PATH_KEY + "\n",
actualErrorMessage);

if (socketDir != null) {
Expand All @@ -122,4 +123,4 @@ public void testCheckShortCircuitConfigs() {
socketDir.getParentFile().delete();
}
}
}
}
