
Commit 2e8b743

Wenye Zhang committed

remove hiveberg dependency in iceberg-spark2 module

1 parent 76bda7b · commit 2e8b743

File tree

6 files changed (+162 −32 lines)


build.gradle

Lines changed: 4 additions & 1 deletion

@@ -498,12 +498,16 @@ project(':iceberg-hive-metastore') {
 project(':iceberg-hiveberg') {
   dependencies {
     compile project(':iceberg-hive-metastore')
+    compile project(':iceberg-spark2')
 
     compileOnly "org.apache.avro:avro"
     compileOnly("org.apache.hadoop:hadoop-client") {
       exclude group: 'org.apache.avro', module: 'avro'
       exclude group: 'org.slf4j', module: 'slf4j-log4j12'
     }
+    compileOnly("org.apache.spark:spark-hive_2.11") {
+      exclude group: 'org.apache.avro', module: 'avro'
+    }
 
     compileOnly("org.apache.hive:hive-metastore") {
       exclude group: 'org.apache.avro', module: 'avro'
@@ -872,7 +876,6 @@ if (jdkVersion == '8') {
     compile project(':iceberg-arrow')
     compile project(':iceberg-hive-metastore')
     compile project(':iceberg-spark')
-    compile project(':iceberg-hiveberg')
 
     compileOnly "org.apache.avro:avro"
     compileOnly("org.apache.spark:spark-hive_2.11") {
org.apache.iceberg.hiveberg.spark2.IcebergSource (new file)

Lines changed: 54 additions & 0 deletions

@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.hiveberg.spark2;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.iceberg.Table;
+import org.apache.iceberg.encryption.EncryptionManager;
+import org.apache.iceberg.io.FileIO;
+import org.apache.iceberg.spark.SparkSchemaUtil;
+import org.apache.iceberg.spark.SparkUtil;
+import org.apache.spark.broadcast.Broadcast;
+import org.apache.spark.sql.sources.v2.DataSourceOptions;
+import org.apache.spark.sql.sources.v2.reader.DataSourceReader;
+import org.apache.spark.sql.types.StructType;
+
+
+public class IcebergSource extends org.apache.iceberg.spark.source.IcebergSource {
+
+  @Override
+  public DataSourceReader createReader(StructType readSchema, DataSourceOptions options) {
+    Configuration conf = new Configuration(lazyBaseConf());
+    Table table = getTableAndResolveHadoopConfiguration(options, conf);
+    String caseSensitive = lazySparkSession().conf().get("spark.sql.caseSensitive");
+
+    Broadcast<FileIO> io = lazySparkContext().broadcast(SparkUtil.serializableFileIO(table));
+    Broadcast<EncryptionManager> encryptionManager = lazySparkContext().broadcast(table.encryption());
+
+    Reader reader = new Reader(table, io, encryptionManager, Boolean.parseBoolean(caseSensitive), options);
+    if (readSchema != null) {
+      // convert() will fail if readSchema contains fields not in table.schema()
+      SparkSchemaUtil.convert(table.schema(), readSchema);
+      reader.pruneColumns(readSchema);
+    }
+
+    return reader;
+  }
+}
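
Since the new class is an ordinary DataSourceV2 provider, Spark can select it by its fully qualified class name. A minimal read sketch, assuming the class is on the classpath; the app name and the table identifier "db.sample" are hypothetical:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class HivebergReadExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("hiveberg-read")   // hypothetical app name
        .enableHiveSupport()        // assumes a Hive metastore is configured
        .getOrCreate();

    // Selecting the source by class name routes the read through the
    // createReader(...) override above, which resolves the table and
    // builds the hiveberg Reader.
    Dataset<Row> df = spark.read()
        .format("org.apache.iceberg.hiveberg.spark2.IcebergSource")
        .load("db.sample");         // hypothetical table identifier

    df.show();
  }
}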
org.apache.iceberg.hiveberg.spark2.Reader (new file)

Lines changed: 89 additions & 0 deletions

@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.hiveberg.spark2;
+
+import java.util.OptionalLong;
+import org.apache.iceberg.CombinedScanTask;
+import org.apache.iceberg.FileScanTask;
+import org.apache.iceberg.SnapshotSummary;
+import org.apache.iceberg.Table;
+import org.apache.iceberg.encryption.EncryptionManager;
+import org.apache.iceberg.expressions.Expressions;
+import org.apache.iceberg.hiveberg.LegacyHiveTable;
+import org.apache.iceberg.io.FileIO;
+import org.apache.iceberg.spark.SparkSchemaUtil;
+import org.apache.iceberg.spark.source.Stats;
+import org.apache.iceberg.util.PropertyUtil;
+import org.apache.spark.broadcast.Broadcast;
+import org.apache.spark.sql.sources.v2.DataSourceOptions;
+import org.apache.spark.sql.sources.v2.reader.Statistics;
+
+
+class Reader extends org.apache.iceberg.spark.source.Reader {
+  Reader(Table table, Broadcast<FileIO> io, Broadcast<EncryptionManager> encryptionManager,
+      boolean caseSensitive, DataSourceOptions options) {
+    super(table, io, encryptionManager, caseSensitive, options);
+  }
+
+  @Override
+  public Statistics estimateStatistics() {
+    Table table = super.getTable();
+    if (table instanceof LegacyHiveTable) {
+      // We currently don't have reliable stats for Hive tables
+      return EMPTY_STATS;
+    }
+
+    // its a fresh table, no data
+    if (table.currentSnapshot() == null) {
+      return new Stats(0L, 0L);
+    }
+
+    // estimate stats using snapshot summary only for partitioned tables (metadata tables are unpartitioned)
+    if (!table.spec().isUnpartitioned() && filterExpression() == Expressions.alwaysTrue()) {
+      long totalRecords = PropertyUtil.propertyAsLong(table.currentSnapshot().summary(),
+          SnapshotSummary.TOTAL_RECORDS_PROP, Long.MAX_VALUE);
+      return new Stats(SparkSchemaUtil.estimateSize(lazyType(), totalRecords), totalRecords);
+    }
+
+    long sizeInBytes = 0L;
+    long numRows = 0L;
+
+    for (CombinedScanTask task : tasks()) {
+      for (FileScanTask file : task.files()) {
+        sizeInBytes += file.length();
+        numRows += file.file().recordCount();
+      }
+    }
+
+    return new Stats(sizeInBytes, numRows);
+  }
+
+  private static final Statistics EMPTY_STATS = new Statistics() {
+    @Override
+    public OptionalLong sizeInBytes() {
+      return OptionalLong.empty();
+    }
+
+    @Override
+    public OptionalLong numRows() {
+      return OptionalLong.empty();
+    }
+  };
+}
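
Returning empty OptionalLong values from EMPTY_STATS tells Spark that no estimate is available, so the optimizer falls back to its defaults; new Stats(0L, 0L) would instead claim the table is known to hold zero rows. A standalone sketch of the same idiom (the class name is illustrative, not part of this commit):

import java.util.OptionalLong;
import org.apache.spark.sql.sources.v2.reader.Statistics;

// Illustrative only: mirrors the EMPTY_STATS anonymous class above.
// An empty OptionalLong means "unknown", which is very different from
// a present value of zero.
final class UnknownStats implements Statistics {
  @Override
  public OptionalLong sizeInBytes() {
    return OptionalLong.empty();  // size unknown; optimizer uses defaults
  }

  @Override
  public OptionalLong numRows() {
    return OptionalLong.empty();  // row count unknown
  }
}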

spark2/src/main/java/org/apache/iceberg/spark/source/IcebergSource.java

Lines changed: 4 additions & 5 deletions

@@ -139,29 +139,28 @@ protected Table findTable(DataSourceOptions options, Configuration conf) {
     }
   }
 
-  private SparkSession lazySparkSession() {
+  protected SparkSession lazySparkSession() {
     if (lazySpark == null) {
      this.lazySpark = SparkSession.builder().getOrCreate();
     }
     return lazySpark;
   }
 
-  private JavaSparkContext lazySparkContext() {
+  protected JavaSparkContext lazySparkContext() {
     if (lazySparkContext == null) {
       this.lazySparkContext = new JavaSparkContext(lazySparkSession().sparkContext());
     }
     return lazySparkContext;
   }
 
-  private Configuration lazyBaseConf() {
+  protected Configuration lazyBaseConf() {
     if (lazyConf == null) {
       this.lazyConf = lazySparkSession().sessionState().newHadoopConf();
     }
     return lazyConf;
   }
 
-  private Table getTableAndResolveHadoopConfiguration(
-      DataSourceOptions options, Configuration conf) {
+  protected Table getTableAndResolveHadoopConfiguration(DataSourceOptions options, Configuration conf) {
     // Overwrite configurations from the Spark Context with configurations from the options.
     mergeIcebergHadoopConfs(conf, options.asMap());
     Table table = findTable(options, conf);
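
Widening these members from private to protected is what lets the org.apache.iceberg.hiveberg.spark2.IcebergSource above extend this class from another package. A minimal sketch of the pattern; the com.example package and ConfCopyingSource class are hypothetical:

package com.example;  // hypothetical package, outside org.apache.iceberg.spark.source

import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.spark.source.IcebergSource;

// In Java, protected members are visible to subclasses even across
// packages, so this compiles once lazyBaseConf() is protected.
public class ConfCopyingSource extends IcebergSource {
  protected Configuration baseConfCopy() {
    // Copy instead of mutating the shared, lazily built Hadoop conf.
    return new Configuration(lazyBaseConf());
  }
}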

spark2/src/main/java/org/apache/iceberg/spark/source/Reader.java

Lines changed: 9 additions & 24 deletions

@@ -24,7 +24,6 @@
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
-import java.util.OptionalLong;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -44,7 +43,6 @@
 import org.apache.iceberg.expressions.Expressions;
 import org.apache.iceberg.hadoop.HadoopFileIO;
 import org.apache.iceberg.hadoop.Util;
-import org.apache.iceberg.hiveberg.LegacyHiveTable;
 import org.apache.iceberg.io.CloseableIterable;
 import org.apache.iceberg.io.FileIO;
 import org.apache.iceberg.orc.OrcRowFilterUtils;
@@ -80,7 +78,7 @@
 
 import static org.apache.iceberg.TableProperties.DEFAULT_NAME_MAPPING;
 
-class Reader implements DataSourceReader, SupportsScanColumnarBatch, SupportsPushDownFilters,
+public class Reader implements DataSourceReader, SupportsScanColumnarBatch, SupportsPushDownFilters,
     SupportsPushDownRequiredColumns, SupportsReportStatistics {
   private static final Logger LOG = LoggerFactory.getLogger(Reader.class);
 
@@ -112,7 +110,7 @@ class Reader implements DataSourceReader, SupportsScanColumnarBatch, SupportsPus
   private List<CombinedScanTask> tasks = null; // lazy cache of tasks
   private Boolean readUsingBatch = null;
 
-  Reader(Table table, Broadcast<FileIO> io, Broadcast<EncryptionManager> encryptionManager,
+  protected Reader(Table table, Broadcast<FileIO> io, Broadcast<EncryptionManager> encryptionManager,
       boolean caseSensitive, DataSourceOptions options) {
     this.table = table;
     this.snapshotId = options.get(SparkReadOptions.SNAPSHOT_ID).map(Long::parseLong).orElse(null);
@@ -202,14 +200,14 @@ private Schema lazySchema() {
     return schema;
   }
 
-  private Expression filterExpression() {
+  protected Expression filterExpression() {
     if (filterExpressions != null) {
       return filterExpressions.stream().reduce(Expressions.alwaysTrue(), Expressions::and);
     }
     return Expressions.alwaysTrue();
   }
 
-  private StructType lazyType() {
+  protected StructType lazyType() {
     if (type == null) {
       Preconditions.checkArgument(readTimestampWithoutZone || !hasTimestampWithoutZone(lazySchema()),
           "Spark does not support timestamp without time zone fields");
@@ -310,11 +308,6 @@ public void pruneColumns(StructType newRequestedSchema) {
 
   @Override
   public Statistics estimateStatistics() {
-    if (table instanceof LegacyHiveTable) {
-      // We currently don't have reliable stats for Hive tables
-      return EMPTY_STATS;
-    }
-
     // its a fresh table, no data
     if (table.currentSnapshot() == null) {
       return new Stats(0L, 0L);
@@ -340,18 +333,6 @@ public Statistics estimateStatistics() {
     return new Stats(sizeInBytes, numRows);
   }
 
-  private static final Statistics EMPTY_STATS = new Statistics() {
-    @Override
-    public OptionalLong sizeInBytes() {
-      return OptionalLong.empty();
-    }
-
-    @Override
-    public OptionalLong numRows() {
-      return OptionalLong.empty();
-    }
-  };
-
   @Override
   public boolean enableBatchRead() {
     if (readUsingBatch == null) {
@@ -402,7 +383,7 @@ private static void mergeIcebergHadoopConfs(
         .forEach(key -> baseConf.set(key.replaceFirst("hadoop.", ""), options.get(key)));
   }
 
-  private List<CombinedScanTask> tasks() {
+  protected List<CombinedScanTask> tasks() {
    if (tasks == null) {
      TableScan scan = table
          .newScan()
@@ -588,4 +569,8 @@ private static class BatchReader extends BatchDataReader implements InputPartiti
       super(task, expectedSchema, nameMapping, io, encryptionManager, caseSensitive, size);
     }
   }
+
+  public Table getTable() {
+    return table;
+  }
 }
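
The newly protected filterExpression() folds whatever filters Spark pushed down into a single predicate, reducing with Expressions::and from the identity alwaysTrue(). A self-contained illustration of that fold; the filters themselves are made up:

import java.util.Arrays;
import java.util.List;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.Expressions;

public class FilterFoldExample {
  public static void main(String[] args) {
    // Hypothetical pushed-down filters; in the Reader these come from
    // Spark's SupportsPushDownFilters callback.
    List<Expression> filters = Arrays.asList(
        Expressions.greaterThan("id", 100),
        Expressions.equal("category", "books"));

    // The same reduction the Reader performs: AND everything together,
    // starting from alwaysTrue() so an empty list yields a no-op filter.
    Expression combined = filters.stream()
        .reduce(Expressions.alwaysTrue(), Expressions::and);

    System.out.println(combined);
  }
}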

spark2/src/main/java/org/apache/iceberg/spark/source/Stats.java

Lines changed: 2 additions & 2 deletions

@@ -22,11 +22,11 @@
 import java.util.OptionalLong;
 import org.apache.spark.sql.sources.v2.reader.Statistics;
 
-class Stats implements Statistics {
+public class Stats implements Statistics {
   private final OptionalLong sizeInBytes;
   private final OptionalLong numRows;
 
-  Stats(long sizeInBytes, long numRows) {
+  public Stats(long sizeInBytes, long numRows) {
     this.sizeInBytes = OptionalLong.of(sizeInBytes);
     this.numRows = OptionalLong.of(numRows);
   }
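
With Stats and its constructor now public, code outside org.apache.iceberg.spark.source (such as the hiveberg Reader above) can report concrete estimates. A small sketch; the byte and row counts are illustrative:

import org.apache.iceberg.spark.source.Stats;
import org.apache.spark.sql.sources.v2.reader.Statistics;

public class StatsExample {
  public static void main(String[] args) {
    // Valid from any package now that the class and constructor are public.
    Statistics stats = new Stats(64L * 1024 * 1024, 1_000_000L);
    System.out.println("bytes: " + stats.sizeInBytes().getAsLong());
    System.out.println("rows:  " + stats.numRows().getAsLong());
  }
}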
