Allow presto to connect to kerberized Hive clusters (v3) #4867

Closed
wants to merge 20 commits
5 changes: 3 additions & 2 deletions .travis.yml
@@ -10,7 +10,8 @@ env:
- TEST_MODULES=!presto-tests,!presto-kafka,!presto-redis,!presto-cassandra,!presto-raptor,!presto-postgresql,!presto-mysql
- TEST_MODULES=presto-tests
- TEST_MODULES=presto-raptor,presto-redis,presto-cassandra,presto-kafka,presto-postgresql,presto-mysql
- RUN_PRODUCT_TESTS=true
- RUN_PRODUCT_TESTS="singlenode -x quarantine,big_query,storage_formats,mysql_connector,postgresql_connector,profile_specific_tests"
- RUN_PRODUCT_TESTS="singlenode-kerberos-hdfs-impersonation -g storage_formats,cli,hdfs_impersonation"
- INTEGRATION_TESTS=true

sudo: required
@@ -38,7 +39,7 @@ script:
fi
- |
if [ -v RUN_PRODUCT_TESTS ]; then
presto-product-tests/bin/run_on_docker.sh singlenode -x quarantine,big_query
presto-product-tests/bin/run_on_docker.sh ${RUN_PRODUCT_TESTS}
fi
- |
if [ -v INTEGRATION_TESTS ]; then
7 changes: 4 additions & 3 deletions pom.xml
@@ -49,6 +49,7 @@
<dep.packaging.version>${dep.airlift.version}</dep.packaging.version>
<dep.slice.version>0.19</dep.slice.version>
<dep.aws-sdk.version>1.9.40</dep.aws-sdk.version>
<dep.tempto.version>1.8</dep.tempto.version>

<!--
Versions newer than 6.9 appear to have an issue where the @BeforeClass method in
@@ -723,19 +724,19 @@
<dependency>
<groupId>com.teradata.tempto</groupId>
<artifactId>tempto-core</artifactId>
<version>1.7</version>
<version>${dep.tempto.version}</version>
</dependency>

<dependency>
<groupId>com.teradata.tempto</groupId>
<artifactId>tempto-runner</artifactId>
<version>1.7</version>
<version>${dep.tempto.version}</version>
</dependency>

<dependency>
<groupId>com.facebook.presto.hive</groupId>
<artifactId>hive-apache-jdbc</artifactId>
<version>0.13.1-1</version>
<version>0.13.1-3</version>
</dependency>

<dependency>
@@ -62,6 +62,9 @@ public class ClientOptions
@Option(name = "--krb5-principal", title = "krb5 principal", description = "Kerberos principal to be used")
public String krb5Principal;

@Option(name = "--krb5-disable-remote-service-hostname-canonicalization", title = "krb5 disable remote service hostname canonicalization", description = "Disable service hostname canonicalization using the DNS reverse lookup")
public boolean krb5DisableRemoteServiceHostnameCanonicalization;

@Option(name = "--keystore-path", title = "keystore path", description = "Keystore path")
public String keystorePath;

@@ -143,7 +146,7 @@ public KerberosConfig toKerberosConfig()
if (krb5CredentialCachePath != null) {
config.setCredentialCache(new File(krb5CredentialCachePath));
}

config.setUseCanonicalHostname(!krb5DisableRemoteServiceHostnameCanonicalization);
return config;
}

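For context, the new ``--krb5-disable-remote-service-hostname-canonicalization`` flag simply flips the ``useCanonicalHostname`` setting consumed by the client's Kerberos code. The sketch below illustrates what canonicalization by DNS reverse lookup generally means; the helper class and method are hypothetical, not the PR's actual implementation:

```java
import java.net.InetAddress;
import java.net.UnknownHostException;

public final class ServiceHostnameExample
{
    private ServiceHostnameExample() {}

    // Hypothetical helper: with canonicalization enabled, the host used to build the
    // Kerberos service principal is resolved via DNS and its reverse lookup; with the
    // new flag set (canonicalization disabled), the host is used exactly as given.
    public static String remoteServiceHost(String host, boolean useCanonicalHostname)
            throws UnknownHostException
    {
        if (!useCanonicalHostname) {
            return host;
        }
        return InetAddress.getByName(host).getCanonicalHostName();
    }
}
```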
88 changes: 88 additions & 0 deletions presto-docs/src/main/sphinx/connector/hive.rst
@@ -67,6 +67,24 @@ The configuration files must exist on all Presto nodes. If you are
referencing existing Hadoop config files, make sure to copy them to
any Presto nodes that are not running Hadoop.

Accessing Hadoop clusters protected with Kerberos authentication
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Review comment (Contributor): @electrum can you review this?

Kerberos authentication is currently supported for both HDFS and the Hive metastore.

However, there are still a few limitations:

* Kerberos authentication is supported only for the ``hive-hadoop2`` and ``hive-cdh5`` connectors.
* Kerberos authentication by ticket cache is not yet supported.

Please refer to the `Configuration Properties`_ section for configuration details.
Review comment (Contributor): is the underscore supposed to be here?

Review reply (Member, Author): Yes, it is. I tested that.

.. note::

    If your ``krb5.conf`` location is different from ``/etc/krb5.conf``, you must set it
    explicitly using the ``java.security.krb5.conf`` JVM property in the ``jvm.config`` file.
    Example: ``-Djava.security.krb5.conf=/example/path/krb5.conf``.

Review comment (Contributor): This is not ideal because we lose the ability to validate options that are set in this manner. I guess it's fine for now, but we should figure out a way to make the option required by the Hive connector and the one required by Presto coexist.

Configuration Properties
------------------------

@@ -107,6 +125,76 @@ Property Name Description
``hive.max-partitions-per-writers`` Maximum number of partitions per writer. 100

``hive.s3.sse.enabled`` Enable S3 server-side encryption. ``false``

``hive.metastore.authentication.type`` Hive metastore authentication type. ``NONE``
Possible values are ``NONE`` or ``KERBEROS``.

``hive.metastore.service.principal`` Hive metastore service principal.
The ``_HOST`` placeholder is allowed here and it is
substituted with the actual metastore host. Use the
``_HOST`` placeholder for configurations with more than
one Hive metastore server.
Example: ``hive/hive-server-host@EXAMPLE.COM`` or
``hive/_HOST@EXAMPLE.COM``.

``hive.metastore.client.principal`` Hive metastore client principal.
The ``_HOST`` placeholder is allowed here and it is
substituted with the actual Presto server host. Use the
``_HOST`` placeholder for per-server principal
configurations.
Example: ``presto/presto-server-node@EXAMPLE.COM`` or
``presto/_HOST@EXAMPLE.COM``.

.. warning::

The principal specified by
``hive.metastore.client.principal``
must have sufficient privileges to remove files
and directories within the ``hive/warehouse``
directory. If the principal does not, only the
metadata will be removed, and the data will
continue to consume disk space.

This occurs because the Hive metastore is
responsible for deleting the internal table data.
When the metastore is configured to use Kerberos
authentication, all of the HDFS operations performed
by the metastore are impersonated. Errors
deleting data are silently ignored.

``hive.metastore.client.keytab`` Hive metastore client keytab location. Must be accessible
to the user running Presto and must contain the
credentials for the ``hive.metastore.client.principal``.

``hive.hdfs.authentication.type`` HDFS authentication type. ``NONE``
Possible values are ``NONE`` or ``KERBEROS``.

``hive.hdfs.impersonation.enabled`` Enable HDFS calls impersonation. ``false``

When set to the default of ``false``, Presto accesses
HDFS as the operating system user that the Presto
process runs as, or as the Kerberos principal specified
in ``hive.hdfs.presto.principal``.

When set to ``true``, Presto accesses HDFS as the Presto
user or Kerberos principal specified by ``--user`` or
``--krb5-principal`` passed to the CLI, or as the user
in the JDBC credentials.

Review comment (Contributor; see the sketch after the property table for an illustration of these modes):
Not your fault, but the whole situation here is super confusing because there are 4 different kinds of "user-like things":

  1. Kerberos principals
  2. Operating system users
  3. The strings that HDFS calls users in the absence of Kerberos.
  4. Presto users (--user)

SIMPLE, false: HDFS access as the OS user of the presto process or the value of -DHADOOP_USER_NAME if specified.
SIMPLE, true: HDFS access as --user.
KERBEROS, false: HDFS access as hive.hdfs.presto.principal
KERBEROS, true: HDFS access as --user.

We should try to get something simple and reasonably accurate in here, and I'll add a link to the security/hive.rst when I put up the PR for that.

@BrendaNoonan, feel free to weigh in here too.

``hive.hdfs.presto.principal`` HDFS client principal. The ``_HOST`` placeholder
is allowed here and it is substituted with the actual
Presto server host. Use the ``_HOST`` placeholder for
per-server principal configurations.
When impersonation is enabled, make sure that the
provided user is configured as a superuser and is
allowed to impersonate other users.
Example:
``presto-hdfs-superuser/presto-server-node@EXAMPLE.COM`` or
``presto-hdfs-superuser/_HOST@EXAMPLE.COM``.

``hive.hdfs.presto.keytab`` HDFS client keytab location. Must be accessible
to the user running Presto and must contain the
credentials for the ``hive.hdfs.presto.principal``.
================================================== ============================================================ ==========
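The review comment above reduces the impersonation behavior to a small decision matrix. The sketch below restates that matrix in code; the class, enum, and parameter names are illustrative only and do not correspond to Presto's actual implementation:

```java
// Hypothetical illustration of the identity-selection matrix described in the
// review comment above.
public final class HdfsIdentityExample
{
    public enum HdfsAuthenticationType { NONE, KERBEROS }

    private HdfsIdentityExample() {}

    public static String effectiveHdfsUser(
            HdfsAuthenticationType authenticationType,
            boolean impersonationEnabled,
            String sessionUser,          // the Presto user (--user / JDBC credentials)
            String prestoHdfsPrincipal,  // hive.hdfs.presto.principal
            String osUser)               // OS user running the Presto process
    {
        if (impersonationEnabled) {
            // SIMPLE+true and KERBEROS+true: access HDFS as the Presto session user
            return sessionUser;
        }
        if (authenticationType == HdfsAuthenticationType.KERBEROS) {
            // KERBEROS+false: access HDFS as the configured Presto HDFS principal
            return prestoHdfsPrincipal;
        }
        // NONE (simple) + false: access HDFS as the OS user of the Presto process
        // (or the value of -DHADOOP_USER_NAME if specified)
        return osUser;
    }
}
```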

Querying Hive Tables
@@ -242,7 +242,7 @@ private CompletableFuture<?> loadSplits()
}
}
else {
boolean splittable = isSplittable(files.getInputFormat(), hdfsEnvironment.getFileSystem(file.getPath()), file.getPath());
boolean splittable = isSplittable(files.getInputFormat(), hdfsEnvironment.getFileSystem(session.getUser(), file.getPath()), file.getPath());

CompletableFuture<?> future = hiveSplitSource.addToQueue(createHiveSplits(
files.getPartitionName(),
@@ -278,14 +278,15 @@ private void loadPartition(HivePartitionMetadata partition)
Path path = new Path(getPartitionLocation(table, partition.getPartition()));
Configuration configuration = hdfsEnvironment.getConfiguration(path);
InputFormat<?, ?> inputFormat = getInputFormat(configuration, schema, false);
FileSystem fs = hdfsEnvironment.getFileSystem(session.getUser(), path);

if (inputFormat instanceof SymlinkTextInputFormat) {
if (bucketHandle.isPresent()) {
throw new PrestoException(StandardErrorCode.NOT_SUPPORTED, "Bucketed table in SymlinkTextInputFormat is not yet supported");
}

// TODO: This should use an iterator like the HiveFileIterator
for (Path targetPath : getTargetPathsFromSymlink(configuration, path)) {
for (Path targetPath : getTargetPathsFromSymlink(fs, path)) {
// The input should be in TextInputFormat.
TextInputFormat targetInputFormat = new TextInputFormat();
// get the configuration for the target path -- it may be a different hdfs instance
@@ -298,7 +299,7 @@

for (InputSplit inputSplit : targetSplits) {
FileSplit split = (FileSplit) inputSplit;
FileSystem targetFilesystem = split.getPath().getFileSystem(targetConfiguration);
FileSystem targetFilesystem = hdfsEnvironment.getFileSystem(session.getUser(), split.getPath());
FileStatus file = targetFilesystem.getFileStatus(split.getPath());
hiveSplitSource.addToQueue(createHiveSplits(
partitionName,
@@ -321,7 +322,6 @@
}

// If only one bucket could match: load that one file
FileSystem fs = hdfsEnvironment.getFileSystem(path);
HiveFileIterator iterator = new HiveFileIterator(path, fs, directoryLister, namenodeStats, partitionName, inputFormat, schema, partitionKeys, effectivePredicate);
if (bucket.isPresent()) {
List<LocatedFileStatus> locatedFileStatuses = listAndSortBucketFiles(iterator, bucket.get().getBucketCount());
@@ -352,7 +352,7 @@ private void loadPartition(HivePartitionMetadata partition)

for (int bucketIndex = 0; bucketIndex < bucketCount; bucketIndex++) {
LocatedFileStatus file = list.get(bucketIndex);
boolean splittable = isSplittable(iterator.getInputFormat(), hdfsEnvironment.getFileSystem(file.getPath()), file.getPath());
boolean splittable = isSplittable(iterator.getInputFormat(), hdfsEnvironment.getFileSystem(session.getUser(), file.getPath()), file.getPath());

hiveSplitSource.addToQueue(createHiveSplits(
iterator.getPartitionName(),
@@ -396,10 +396,9 @@ private static List<LocatedFileStatus> listAndSortBucketFiles(HiveFileIterator h
return list;
}

private static List<Path> getTargetPathsFromSymlink(Configuration conf, Path symlinkDir)
private static List<Path> getTargetPathsFromSymlink(FileSystem fileSystem, Path symlinkDir)
{
try {
FileSystem fileSystem = symlinkDir.getFileSystem(conf);
FileStatus[] symlinks = fileSystem.listStatus(symlinkDir, HIDDEN_FILES_PATH_FILTER);
List<Path> targets = new ArrayList<>();

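Throughout this split loader, file systems are now obtained via ``hdfsEnvironment.getFileSystem(session.getUser(), path)`` rather than with the process-wide identity. Below is a rough sketch of how such a user-scoped lookup can be built on Hadoop's proxy-user API; the real ``HdfsEnvironment`` in this PR also handles Kerberos login, per-path configuration, and caching, so treat this only as an illustration:

```java
import java.io.IOException;
import java.security.PrivilegedExceptionAction;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.security.UserGroupInformation;

public final class UserScopedFileSystemExample
{
    private UserScopedFileSystemExample() {}

    // Illustrative only: obtains a FileSystem while impersonating the given user
    // through Hadoop's proxy-user mechanism.
    public static FileSystem getFileSystem(String user, Path path, Configuration configuration)
            throws IOException, InterruptedException
    {
        UserGroupInformation proxyUser =
                UserGroupInformation.createProxyUser(user, UserGroupInformation.getLoginUser());
        return proxyUser.doAs(
                (PrivilegedExceptionAction<FileSystem>) () -> path.getFileSystem(configuration));
    }
}
```

Note that proxy-user access only works if the authenticated login user is configured as a superuser that is allowed to impersonate others, which is what the ``hive.hdfs.presto.principal`` documentation above warns about.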
@@ -16,6 +16,7 @@
import com.facebook.presto.spi.ConnectorSession;
import com.facebook.presto.spi.predicate.TupleDomain;
import com.facebook.presto.spi.type.TypeManager;
import com.google.inject.Inject;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
@@ -28,10 +29,19 @@
import java.util.Properties;

import static com.facebook.presto.hive.HiveUtil.isDeserializerClass;
import static java.util.Objects.requireNonNull;

public class ColumnarBinaryHiveRecordCursorProvider
implements HiveRecordCursorProvider
{
private final HdfsEnvironment hdfsEnvironment;

@Inject
public ColumnarBinaryHiveRecordCursorProvider(HdfsEnvironment hdfsEnvironment)
{
this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null");
}

@Override
public Optional<HiveRecordCursor> createHiveRecordCursor(
String clientId,
@@ -51,7 +61,8 @@ public Optional<HiveRecordCursor> createHiveRecordCursor(
return Optional.empty();
}

RecordReader<?, ?> recordReader = HiveUtil.createRecordReader(configuration, path, start, length, schema, columns);
RecordReader<?, ?> recordReader = hdfsEnvironment.doAs(session.getUser(),
() -> HiveUtil.createRecordReader(configuration, path, start, length, schema, columns));

return Optional.<HiveRecordCursor>of(new ColumnarBinaryHiveRecordCursor<>(
bytesRecordReader(recordReader),
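The record cursor providers now receive an injected ``HdfsEnvironment`` and wrap ``HiveUtil.createRecordReader`` in ``hdfsEnvironment.doAs(session.getUser(), ...)`` so the underlying Hadoop calls run as the session user. A stripped-down sketch of such a ``doAs`` helper follows; it reuses the proxy-user pattern shown earlier, and the exact signature in the PR may differ (for example, it may accept an action type that can throw checked exceptions):

```java
import java.io.IOException;
import java.security.PrivilegedAction;

import org.apache.hadoop.security.UserGroupInformation;

public final class DoAsExample
{
    private DoAsExample() {}

    // Illustrative only: runs the given action while impersonating the requested user.
    public static <R> R doAs(String user, PrivilegedAction<R> action)
            throws IOException
    {
        UserGroupInformation proxyUser =
                UserGroupInformation.createProxyUser(user, UserGroupInformation.getLoginUser());
        return proxyUser.doAs(action);
    }
}
```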
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import com.facebook.presto.spi.ConnectorSession;
import com.facebook.presto.spi.predicate.TupleDomain;
import com.facebook.presto.spi.type.TypeManager;
import com.google.inject.Inject;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
Expand All @@ -28,10 +29,19 @@
import java.util.Properties;

import static com.facebook.presto.hive.HiveUtil.isDeserializerClass;
import static java.util.Objects.requireNonNull;

public class ColumnarTextHiveRecordCursorProvider
implements HiveRecordCursorProvider
{
private final HdfsEnvironment hdfsEnvironment;

@Inject
public ColumnarTextHiveRecordCursorProvider(HdfsEnvironment hdfsEnvironment)
{
this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null");
}

@Override
public Optional<HiveRecordCursor> createHiveRecordCursor(
String clientId,
Expand All @@ -51,7 +61,8 @@ public Optional<HiveRecordCursor> createHiveRecordCursor(
return Optional.empty();
}

RecordReader<?, ?> recordReader = HiveUtil.createRecordReader(configuration, path, start, length, schema, columns);
RecordReader<?, ?> recordReader = hdfsEnvironment.doAs(session.getUser(),
() -> HiveUtil.createRecordReader(configuration, path, start, length, schema, columns));

return Optional.<HiveRecordCursor>of(new ColumnarTextHiveRecordCursor<>(
columnarTextRecordReader(recordReader),
Expand Down
35 changes: 35 additions & 0 deletions presto-hive/src/main/java/com/facebook/presto/hive/ForHdfs.java
@@ -0,0 +1,35 @@
/*
* Copyright 2016, Teradata Corp. All rights reserved.
*/

/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.facebook.presto.hive;

import javax.inject.Qualifier;

import java.lang.annotation.Retention;
import java.lang.annotation.Target;

import static java.lang.annotation.ElementType.FIELD;
import static java.lang.annotation.ElementType.METHOD;
import static java.lang.annotation.ElementType.PARAMETER;
import static java.lang.annotation.RetentionPolicy.RUNTIME;

@Retention(RUNTIME)
@Target({FIELD, PARAMETER, METHOD})
@Qualifier
public @interface ForHdfs
{
}
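``@ForHdfs`` is a standard ``javax.inject`` qualifier, so it lets the connector's Guice modules distinguish HDFS-specific bindings from other bindings of the same type. The module and types below are hypothetical and exist only to show how such a qualifier is typically wired:

```java
package com.facebook.presto.hive;

import com.google.inject.AbstractModule;

import javax.inject.Inject;

public class ExampleHdfsModule
        extends AbstractModule
{
    // Hypothetical types used only for this illustration.
    public interface Authenticator {}
    public static class ExampleKerberosAuthenticator implements Authenticator {}

    @Override
    protected void configure()
    {
        // Bind the HDFS-specific implementation under the @ForHdfs qualifier.
        bind(Authenticator.class).annotatedWith(ForHdfs.class).to(ExampleKerberosAuthenticator.class);
    }

    public static class ExampleConsumer
    {
        private final Authenticator authenticator;

        @Inject
        public ExampleConsumer(@ForHdfs Authenticator authenticator)
        {
            this.authenticator = authenticator;
        }
    }
}
```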
@@ -16,6 +16,7 @@
import com.facebook.presto.spi.ConnectorSession;
import com.facebook.presto.spi.predicate.TupleDomain;
import com.facebook.presto.spi.type.TypeManager;
import com.google.inject.Inject;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Writable;
@@ -26,9 +27,19 @@
import java.util.Optional;
import java.util.Properties;

import static java.util.Objects.requireNonNull;

public class GenericHiveRecordCursorProvider
implements HiveRecordCursorProvider
{
private final HdfsEnvironment hdfsEnvironment;

@Inject
public GenericHiveRecordCursorProvider(HdfsEnvironment hdfsEnvironment)
{
this.hdfsEnvironment = requireNonNull(hdfsEnvironment, "hdfsEnvironment is null");
}

@Override
public Optional<HiveRecordCursor> createHiveRecordCursor(
String clientId,
@@ -44,7 +55,8 @@ public Optional<HiveRecordCursor> createHiveRecordCursor(
DateTimeZone hiveStorageTimeZone,
TypeManager typeManager)
{
RecordReader<?, ?> recordReader = HiveUtil.createRecordReader(configuration, path, start, length, schema, columns);
RecordReader<?, ?> recordReader = hdfsEnvironment.doAs(session.getUser(),
() -> HiveUtil.createRecordReader(configuration, path, start, length, schema, columns));

return Optional.<HiveRecordCursor>of(new GenericHiveRecordCursor<>(
genericRecordReader(recordReader),