
Commit 78f8c50

mccheah authored and rdblue committed
Allow custom hadoop properties to be loaded in the Spark data source (#7)
Properties that start with iceberg.hadoop are copied into the Hadoop Configuration used in the Spark source. These may be set in table properties or in the read and write options passed to the Spark operation; read and write options take precedence over table properties. Support for these custom Hadoop properties should be added to the other Iceberg integrations in subsequent patches.
1 parent 5684ac1 commit 78f8c50
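
For illustration, a minimal usage sketch of how a caller might pass such a property through the Spark read and write options. This snippet is not part of the commit; the property names, endpoint value, and table paths are made up, and the source is assumed to be registered under the short name "iceberg" as in this module:

    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.SaveMode;
    import org.apache.spark.sql.SparkSession;

    public class CustomHadoopConfExample {
      public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().getOrCreate();

        // Keys starting with "iceberg.hadoop" are copied into the Hadoop
        // Configuration used by the source before the table is loaded.
        Dataset<Row> df = spark.read()
            .format("iceberg")
            .option("iceberg.hadoop.fs.s3a.endpoint", "http://localhost:9000")
            .load("hdfs://nn:8020/warehouse/db/table");

        // The same prefix works for write options; per the commit message,
        // these override any iceberg.hadoop keys stored in table properties.
        df.write()
            .format("iceberg")
            .mode(SaveMode.Append)
            .option("iceberg.hadoop.dfs.replication", "2")
            .save("hdfs://nn:8020/warehouse/db/table");
      }
    }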

File tree

2 files changed: +33, −10 lines

spark/src/main/java/com/netflix/iceberg/spark/source/IcebergSource.java
Lines changed: 31 additions & 9 deletions

@@ -36,6 +36,7 @@
 import org.apache.spark.sql.types.StructType;
 import java.util.List;
 import java.util.Locale;
+import java.util.Map;
 import java.util.Optional;
 
 import static com.netflix.iceberg.TableProperties.DEFAULT_FILE_FORMAT;
@@ -53,16 +54,18 @@ public String shortName() {
 
   @Override
   public DataSourceReader createReader(DataSourceOptions options) {
-    Table table = findTable(options);
-    return new Reader(table, lazyConf());
+    Configuration conf = new Configuration(lazyBaseConf());
+    Table table = getTableAndResolveHadoopConfiguration(options, conf);
+
+    return new Reader(table, conf);
   }
 
   @Override
   public Optional<DataSourceWriter> createWriter(String jobId, StructType dfStruct, SaveMode mode,
                                                  DataSourceOptions options) {
     Preconditions.checkArgument(mode == SaveMode.Append, "Save mode %s is not supported", mode);
-
-    Table table = findTable(options);
+    Configuration conf = new Configuration(lazyBaseConf());
+    Table table = getTableAndResolveHadoopConfiguration(options, conf);
 
     Schema dfSchema = SparkSchemaUtil.convert(table.schema(), dfStruct);
     List<String> errors = CheckCompatibility.writeCompatibilityErrors(table.schema(), dfSchema);
@@ -86,30 +89,49 @@ public Optional<DataSourceWriter> createWriter(String jobId, StructType dfStruct
           .toUpperCase(Locale.ENGLISH));
     }
 
-    return Optional.of(new Writer(table, lazyConf(), format));
+    return Optional.of(new Writer(table, conf, format));
   }
 
-  protected Table findTable(DataSourceOptions options) {
+  protected Table findTable(DataSourceOptions options, Configuration conf) {
     Optional<String> location = options.get("path");
     Preconditions.checkArgument(location.isPresent(),
         "Cannot open table without a location: path is not set");
 
-    HadoopTables tables = new HadoopTables(lazyConf());
+    HadoopTables tables = new HadoopTables(conf);
 
     return tables.load(location.get());
   }
 
-  protected SparkSession lazySparkSession() {
+  private SparkSession lazySparkSession() {
     if (lazySpark == null) {
       this.lazySpark = SparkSession.builder().getOrCreate();
     }
     return lazySpark;
   }
 
-  protected Configuration lazyConf() {
+  private Configuration lazyBaseConf() {
     if (lazyConf == null) {
       this.lazyConf = lazySparkSession().sparkContext().hadoopConfiguration();
     }
     return lazyConf;
   }
+
+  private Table getTableAndResolveHadoopConfiguration(
+      DataSourceOptions options, Configuration conf) {
+    // Overwrite configurations from the Spark Context with configurations from the options.
+    mergeIcebergHadoopConfs(conf, options.asMap());
+    Table table = findTable(options, conf);
+    // Set confs from table properties
+    mergeIcebergHadoopConfs(conf, table.properties());
+    // Re-overwrite values set in options and table properties but were not in the environment.
+    mergeIcebergHadoopConfs(conf, options.asMap());
+    return table;
+  }
+
+  private static void mergeIcebergHadoopConfs(
+      Configuration baseConf, Map<String, String> options) {
+    options.keySet().stream()
+        .filter(key -> key.startsWith("iceberg.hadoop"))
+        .forEach(key -> baseConf.set(key.replaceFirst("iceberg.hadoop", ""), options.get(key)));
+  }
 }
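
A note on the merge order in getTableAndResolveHadoopConfiguration above: the options are merged before the table is loaded (so custom properties can influence table resolution itself) and again afterwards, so a key present in both the options and the table properties ends up with the option's value. A minimal sketch of that precedence, with the iceberg.hadoop prefix handling elided and made-up keys and values:

    // Sketch only: demonstrates options > table properties > base environment.
    import org.apache.hadoop.conf.Configuration;
    import java.util.Collections;
    import java.util.Map;

    public class PrecedenceSketch {
      public static void main(String[] args) {
        Configuration conf = new Configuration(false);
        conf.set("some.key", "from-environment");  // value from the base Hadoop conf

        Map<String, String> options = Collections.singletonMap("some.key", "from-options");
        Map<String, String> tableProps = Collections.singletonMap("some.key", "from-table-properties");

        options.forEach(conf::set);     // 1. merge options, then load the table
        tableProps.forEach(conf::set);  // 2. merge the loaded table's properties
        options.forEach(conf::set);     // 3. re-merge options to restore their precedence

        System.out.println(conf.get("some.key"));  // prints "from-options"
      }
    }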

spark/src/test/java/com/netflix/iceberg/spark/source/TestIcebergSource.java
Lines changed: 2 additions & 1 deletion

@@ -17,6 +17,7 @@
 package com.netflix.iceberg.spark.source;
 
 import com.netflix.iceberg.Table;
+import org.apache.hadoop.conf.Configuration;
 import org.apache.spark.sql.sources.v2.DataSourceOptions;
 
 public class TestIcebergSource extends IcebergSource {
@@ -26,7 +27,7 @@ public String shortName() {
   }
 
   @Override
-  protected Table findTable(DataSourceOptions options) {
+  protected Table findTable(DataSourceOptions options, Configuration conf) {
     return TestTables.load(options.get("iceberg.table.name").get());
   }
 }
