Commit c7f97ae

shardulm94 authored and rzhang10 committed
Spark: Allow reading timestamp without time zone
Cherry-picked PR 48: Spark: Allow reading timestamp without time zone.

Fix style and refactor the read-timestamp-without-zone option to a constant after rebase.
1 parent 7e57487 commit c7f97ae
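
As context for the change, a reader opts in per job through the new read option; a minimal usage sketch (the session setup and table location are hypothetical, while the option key and format name come from the tests below):

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class ReadWithoutZoneSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().master("local[2]").getOrCreate();
    // Opt in to mapping Iceberg's timestamp-without-zone to Spark's TimestampType;
    // without this option the read fails with
    // "Spark does not support timestamp without time zone fields".
    Dataset<Row> df = spark.read().format("iceberg")
        .option("read-timestamp-without-zone", "true")
        .load("/tmp/warehouse/db/events"); // hypothetical table location
    df.show();
  }
}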

11 files changed: +344 −15 lines changed

spark/src/main/java/org/apache/iceberg/spark/PruneColumnsWithoutReordering.java

Lines changed: 0 additions & 5 deletions

@@ -202,11 +202,6 @@ public Type primitive(Type.PrimitiveType primitive) {
             "Cannot project decimal with incompatible precision: %s < %s",
             requestedDecimal.precision(), decimal.precision());
         break;
-      case TIMESTAMP:
-        Types.TimestampType timestamp = (Types.TimestampType) primitive;
-        Preconditions.checkArgument(timestamp.shouldAdjustToUTC(),
-            "Cannot project timestamp (without time zone) as timestamptz (with time zone)");
-        break;
       default:
     }

spark/src/main/java/org/apache/iceberg/spark/SparkReadOptions.java

Lines changed: 2 additions & 0 deletions

@@ -47,4 +47,6 @@ private SparkReadOptions() {
 
   // Overrides the table's read.parquet.vectorization.batch-size
   public static final String VECTORIZATION_BATCH_SIZE = "batch-size";
+
+  public static final String READ_TIMESTAMP_WITHOUT_ZONE = "read-timestamp-without-zone";
 }

spark/src/main/java/org/apache/iceberg/spark/TypeToSparkType.java

Lines changed: 1 addition & 6 deletions

@@ -104,12 +104,7 @@ public DataType primitive(Type.PrimitiveType primitive) {
         throw new UnsupportedOperationException(
             "Spark does not support time fields");
       case TIMESTAMP:
-        Types.TimestampType timestamp = (Types.TimestampType) primitive;
-        if (timestamp.shouldAdjustToUTC()) {
-          return TimestampType$.MODULE$;
-        }
-        throw new UnsupportedOperationException(
-            "Spark does not support timestamp without time zone fields");
+        return TimestampType$.MODULE$;
       case STRING:
         return StringType$.MODULE$;
       case UUID:
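
After this change, SparkSchemaUtil.convert maps both Iceberg timestamp flavors to Spark's TimestampType instead of rejecting the without-zone variant. A minimal sketch of the new behavior (hypothetical schema; assumes the Iceberg Spark module on the classpath):

import org.apache.iceberg.Schema;
import org.apache.iceberg.spark.SparkSchemaUtil;
import org.apache.iceberg.types.Types;
import org.apache.spark.sql.types.StructType;

public class ConvertSketch {
  public static void main(String[] args) {
    Schema schema = new Schema(
        Types.NestedField.optional(1, "ts_tz", Types.TimestampType.withZone()),
        Types.NestedField.optional(2, "ts", Types.TimestampType.withoutZone()));
    // Before this commit, converting "ts" threw
    // "Spark does not support timestamp without time zone fields";
    // now both fields convert to Spark's TimestampType.
    StructType sparkType = SparkSchemaUtil.convert(schema);
    System.out.println(sparkType.treeString());
  }
}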

spark/src/main/java/org/apache/iceberg/spark/data/SparkOrcReader.java

Lines changed: 1 addition & 0 deletions

@@ -104,6 +104,7 @@ public OrcValueReader<?> primitive(Type.PrimitiveType iPrimitive, TypeDescriptio
         return OrcValueReaders.floats();
       case DOUBLE:
         return OrcValueReaders.doubles();
+      case TIMESTAMP:
       case TIMESTAMP_INSTANT:
         return SparkOrcValueReaders.timestampTzs();
       case DECIMAL:

spark/src/main/java/org/apache/iceberg/spark/data/vectorized/VectorizedSparkOrcReaders.java

Lines changed: 1 addition & 0 deletions

@@ -127,6 +127,7 @@ public Converter primitive(Type.PrimitiveType iPrimitive, TypeDescription primit
       case DOUBLE:
         primitiveValueReader = OrcValueReaders.doubles();
         break;
+      case TIMESTAMP:
       case TIMESTAMP_INSTANT:
         primitiveValueReader = SparkOrcValueReaders.timestampTzs();
         break;
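
In ORC, TIMESTAMP carries wall-clock (local) semantics while TIMESTAMP_INSTANT is UTC-adjusted; both readers above now fall through to the same SparkOrcValueReaders.timestampTzs() decoder, so without-zone values are decoded as if they were UTC instants. That matches the all-UTC assumption spelled out in the Reader.java comment below.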

spark/src/test/java/org/apache/iceberg/spark/data/GenericsHelpers.java

Lines changed: 9 additions & 2 deletions

@@ -24,6 +24,7 @@
 import java.sql.Timestamp;
 import java.time.Instant;
 import java.time.LocalDate;
+import java.time.LocalDateTime;
 import java.time.OffsetDateTime;
 import java.time.ZoneOffset;
 import java.time.temporal.ChronoUnit;
@@ -122,13 +123,19 @@ private static void assertEqualsSafe(Type type, Object expected, Object actual)
         Assert.assertEquals("ISO-8601 date should be equal", expected.toString(), actual.toString());
         break;
       case TIMESTAMP:
-        Assert.assertTrue("Should expect an OffsetDateTime", expected instanceof OffsetDateTime);
         Assert.assertTrue("Should be a Timestamp", actual instanceof Timestamp);
         Timestamp ts = (Timestamp) actual;
         // milliseconds from nanos has already been added by getTime
         OffsetDateTime actualTs = EPOCH.plusNanos(
             (ts.getTime() * 1_000_000) + (ts.getNanos() % 1_000_000));
-        Assert.assertEquals("Timestamp should be equal", expected, actualTs);
+        Types.TimestampType timestampType = (Types.TimestampType) type;
+        if (timestampType.shouldAdjustToUTC()) {
+          Assert.assertTrue("Should expect an OffsetDateTime", expected instanceof OffsetDateTime);
+          Assert.assertEquals("Timestamp should be equal", expected, actualTs);
+        } else {
+          Assert.assertTrue("Should expect an LocalDateTime", expected instanceof LocalDateTime);
+          Assert.assertEquals("Timestamp should be equal", expected, actualTs.toLocalDateTime());
+        }
         break;
       case STRING:
         Assert.assertTrue("Should be a String", actual instanceof String);
spark/src/test/java/org/apache/iceberg/spark/source/TestTimestampWithoutZone.java

Lines changed: 227 additions & 0 deletions

@@ -0,0 +1,227 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.spark.source;
+
+import java.io.File;
+import java.io.IOException;
+import java.time.LocalDateTime;
+import java.util.List;
+import java.util.Locale;
+import java.util.UUID;
+import java.util.stream.Collectors;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.iceberg.DataFile;
+import org.apache.iceberg.DataFiles;
+import org.apache.iceberg.FileFormat;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.Table;
+import org.apache.iceberg.data.GenericAppenderFactory;
+import org.apache.iceberg.data.GenericRecord;
+import org.apache.iceberg.data.Record;
+import org.apache.iceberg.hadoop.HadoopTables;
+import org.apache.iceberg.io.FileAppender;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.iceberg.spark.data.GenericsHelpers;
+import org.apache.iceberg.types.Types;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SparkSession;
+import org.junit.AfterClass;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.ExpectedException;
+import org.junit.rules.TemporaryFolder;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import static org.apache.iceberg.Files.localOutput;
+
+@RunWith(Parameterized.class)
+public abstract class TestTimestampWithoutZone {
+  private static final Configuration CONF = new Configuration();
+  private static final HadoopTables TABLES = new HadoopTables(CONF);
+
+  private static final Schema SCHEMA = new Schema(
+      Types.NestedField.required(1, "id", Types.LongType.get()),
+      Types.NestedField.optional(2, "ts", Types.TimestampType.withoutZone()),
+      Types.NestedField.optional(3, "data", Types.StringType.get())
+  );
+
+  private static SparkSession spark = null;
+
+  @BeforeClass
+  public static void startSpark() {
+    TestTimestampWithoutZone.spark = SparkSession.builder().master("local[2]").getOrCreate();
+  }
+
+  @AfterClass
+  public static void stopSpark() {
+    SparkSession currentSpark = TestTimestampWithoutZone.spark;
+    TestTimestampWithoutZone.spark = null;
+    currentSpark.stop();
+  }
+
+  @Rule
+  public TemporaryFolder temp = new TemporaryFolder();
+
+  private final String format;
+  private final boolean vectorized;
+
+  @Parameterized.Parameters(name = "format = {0}, vectorized = {1}")
+  public static Object[][] parameters() {
+    return new Object[][] {
+        { "parquet", false },
+        { "parquet", true },
+        { "avro", false },
+        { "orc", false },
+        { "orc", true }
+    };
+  }
+
+  public TestTimestampWithoutZone(String format, boolean vectorized) {
+    this.format = format;
+    this.vectorized = vectorized;
+  }
+
+  private File parent = null;
+  private File unpartitioned = null;
+  private List<Record> records = null;
+
+  @Before
+  public void writeUnpartitionedTable() throws IOException {
+    this.parent = temp.newFolder("TestTimestampWithoutZone");
+    this.unpartitioned = new File(parent, "unpartitioned");
+    File dataFolder = new File(unpartitioned, "data");
+    Assert.assertTrue("Mkdir should succeed", dataFolder.mkdirs());
+
+    Table table = TABLES.create(SCHEMA, PartitionSpec.unpartitioned(), unpartitioned.toString());
+    Schema tableSchema = table.schema(); // use the table schema because ids are reassigned
+
+    FileFormat fileFormat = FileFormat.valueOf(format.toUpperCase(Locale.ENGLISH));
+
+    File testFile = new File(dataFolder, fileFormat.addExtension(UUID.randomUUID().toString()));
+
+    // create records using the table's schema
+    this.records = testRecords(tableSchema);
+
+    try (FileAppender<Record> writer = new GenericAppenderFactory(tableSchema).newAppender(
+        localOutput(testFile), fileFormat)) {
+      writer.addAll(records);
+    }
+
+    DataFile file = DataFiles.builder(PartitionSpec.unpartitioned())
+        .withRecordCount(records.size())
+        .withFileSizeInBytes(testFile.length())
+        .withPath(testFile.toString())
+        .build();
+
+    table.newAppend().appendFile(file).commit();
+  }
+
+  @Test
+  public void testUnpartitionedTimestampWithoutZone() {
+    assertEqualsSafe(SCHEMA.asStruct(), records, read(unpartitioned.toString(), vectorized));
+  }
+
+  @Test
+  public void testUnpartitionedTimestampWithoutZoneProjection() {
+    Schema projection = SCHEMA.select("id", "ts");
+    assertEqualsSafe(projection.asStruct(),
+        records.stream().map(r -> projectFlat(projection, r)).collect(Collectors.toList()),
+        read(unpartitioned.toString(), vectorized, "id", "ts"));
+  }
+
+  @Rule
+  public ExpectedException exception = ExpectedException.none();
+
+  @Test
+  public void testUnpartitionedTimestampWithoutZoneError() {
+    exception.expect(IllegalArgumentException.class);
+    exception.expectMessage("Spark does not support timestamp without time zone fields");
+
+    spark.read().format("iceberg")
+        .option("vectorization-enabled", String.valueOf(vectorized))
+        .option("read-timestamp-without-zone", "false")
+        .load(unpartitioned.toString())
+        .collectAsList();
+  }
+
+  private static Record projectFlat(Schema projection, Record record) {
+    Record result = GenericRecord.create(projection);
+    List<Types.NestedField> fields = projection.asStruct().fields();
+    for (int i = 0; i < fields.size(); i += 1) {
+      Types.NestedField field = fields.get(i);
+      result.set(i, record.getField(field.name()));
+    }
+    return result;
+  }
+
+  public static void assertEqualsSafe(Types.StructType struct,
+      List<Record> expected, List<Row> actual) {
+    Assert.assertEquals("Number of results should match expected", expected.size(), actual.size());
+    for (int i = 0; i < expected.size(); i += 1) {
+      GenericsHelpers.assertEqualsSafe(struct, expected.get(i), actual.get(i));
+    }
+  }
+
+  private List<Record> testRecords(Schema schema) {
+    return Lists.newArrayList(
+        record(schema, 0L, parseToLocal("2017-12-22T09:20:44.294658"), "junction"),
+        record(schema, 1L, parseToLocal("2017-12-22T07:15:34.582910"), "alligator"),
+        record(schema, 2L, parseToLocal("2017-12-22T06:02:09.243857"), "forrest"),
+        record(schema, 3L, parseToLocal("2017-12-22T03:10:11.134509"), "clapping"),
+        record(schema, 4L, parseToLocal("2017-12-22T00:34:00.184671"), "brush"),
+        record(schema, 5L, parseToLocal("2017-12-21T22:20:08.935889"), "trap"),
+        record(schema, 6L, parseToLocal("2017-12-21T21:55:30.589712"), "element"),
+        record(schema, 7L, parseToLocal("2017-12-21T17:31:14.532797"), "limited"),
+        record(schema, 8L, parseToLocal("2017-12-21T15:21:51.237521"), "global"),
+        record(schema, 9L, parseToLocal("2017-12-21T15:02:15.230570"), "goldfish")
+    );
+  }
+
+  private static List<Row> read(String table, boolean vectorized) {
+    return read(table, vectorized, "*");
+  }
+
+  private static List<Row> read(String table, boolean vectorized, String select0, String... selectN) {
+    Dataset<Row> dataset = spark.read().format("iceberg")
+        .option("vectorization-enabled", String.valueOf(vectorized))
+        .option("read-timestamp-without-zone", "true")
+        .load(table)
+        .select(select0, selectN);
+    return dataset.collectAsList();
+  }
+
+  private static LocalDateTime parseToLocal(String timestamp) {
+    return LocalDateTime.parse(timestamp);
+  }
+
+  private static Record record(Schema schema, Object... values) {
+    Record rec = GenericRecord.create(schema);
+    for (int i = 0; i < values.length; i += 1) {
+      rec.set(i, values[i]);
+    }
+    return rec;
+  }
+}

spark2/src/main/java/org/apache/iceberg/spark/source/Reader.java

Lines changed: 25 additions & 1 deletion

@@ -54,6 +54,9 @@
 import org.apache.iceberg.spark.SparkFilters;
 import org.apache.iceberg.spark.SparkReadOptions;
 import org.apache.iceberg.spark.SparkSchemaUtil;
+import org.apache.iceberg.types.Type;
+import org.apache.iceberg.types.TypeUtil;
+import org.apache.iceberg.types.Types;
 import org.apache.iceberg.util.PropertyUtil;
 import org.apache.iceberg.util.TableScanUtil;
 import org.apache.spark.broadcast.Broadcast;
@@ -101,6 +104,7 @@ class Reader implements DataSourceReader, SupportsScanColumnarBatch, SupportsPus
   private final boolean localityPreferred;
   private final boolean batchReadsEnabled;
   private final int batchSize;
+  private final boolean readTimestampWithoutZone;
 
   // lazy variables
   private Schema schema = null;
@@ -174,6 +178,16 @@ class Reader implements DataSourceReader, SupportsScanColumnarBatch, SupportsPus
     this.batchSize = options.get(SparkReadOptions.VECTORIZATION_BATCH_SIZE).map(Integer::parseInt).orElseGet(() ->
         PropertyUtil.propertyAsInt(table.properties(),
             TableProperties.PARQUET_BATCH_SIZE, TableProperties.PARQUET_BATCH_SIZE_DEFAULT));
+    // Allow reading timestamp without time zone as timestamp with time zone. Generally, this is not safe as timestamp
+    // without time zone is supposed to represent wall clock time semantics, i.e. no matter the reader/writer timezone
+    // 3PM should always be read as 3PM, but timestamp with time zone represents instant semantics, i.e the timestamp
+    // is adjusted so that the corresponding time in the reader timezone is displayed. However, at LinkedIn, all readers
+    // and writers are in the UTC timezone as our production machines are set to UTC. So, timestamp with/without time
+    // zone is the same.
+    // When set to false (default), we throw an exception at runtime
+    // "Spark does not support timestamp without time zone fields" if reading timestamp without time zone fields
+    this.readTimestampWithoutZone = options.get(SparkReadOptions.READ_TIMESTAMP_WITHOUT_ZONE)
+        .map(Boolean::parseBoolean).orElse(false);
   }
 
   private Schema lazySchema() {
@@ -197,6 +211,8 @@ private Expression filterExpression() {
 
   private StructType lazyType() {
     if (type == null) {
+      Preconditions.checkArgument(readTimestampWithoutZone || !hasTimestampWithoutZone(lazySchema()),
+          "Spark does not support timestamp without time zone fields");
       this.type = SparkSchemaUtil.convert(lazySchema());
     }
     return type;
@@ -365,12 +381,20 @@ public boolean enableBatchRead() {
 
       boolean hasNoDeleteFiles = tasks().stream().noneMatch(TableScanUtil::hasDeletes);
 
+      boolean hasTimestampWithoutZone = hasTimestampWithoutZone(lazySchema());
+
      this.readUsingBatch = batchReadsEnabled && hasNoDeleteFiles && ((allOrcFileScanTasks && hasNoRowFilters) ||
-          (allParquetFileScanTasks && atLeastOneColumn && onlyPrimitives));
+          (allParquetFileScanTasks && atLeastOneColumn && onlyPrimitives && !hasTimestampWithoutZone));
     }
     return readUsingBatch;
   }
 
+  private static boolean hasTimestampWithoutZone(Schema schema) {
+    return TypeUtil.find(schema, t ->
+        t.typeId().equals(Type.TypeID.TIMESTAMP) && !((Types.TimestampType) t).shouldAdjustToUTC()
+    ) != null;
+  }
+
   private static void mergeIcebergHadoopConfs(
       Configuration baseConf, Map<String, String> options) {
     options.keySet().stream()
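
The constructor comment above captures the core distinction: timestamp without zone stores a wall-clock reading that every reader should see verbatim, while timestamp with zone stores an instant that is rendered in each reader's zone. A JDK-only sketch of the two semantics (example values are hypothetical):

import java.time.Instant;
import java.time.LocalDateTime;
import java.time.ZoneId;

public class SemanticsSketch {
  public static void main(String[] args) {
    // Wall-clock semantics: "3PM" stays "3PM" for every reader.
    LocalDateTime wallClock = LocalDateTime.parse("2017-12-22T15:00:00");

    // Instant semantics: the same stored instant renders differently per reader zone.
    Instant instant = Instant.parse("2017-12-22T15:00:00Z");
    System.out.println(instant.atZone(ZoneId.of("UTC")).toLocalDateTime());                 // 2017-12-22T15:00
    System.out.println(instant.atZone(ZoneId.of("America/Los_Angeles")).toLocalDateTime()); // 2017-12-22T07:00

    // In an all-UTC deployment the two coincide, which is why the option is safe there.
    System.out.println(wallClock.equals(instant.atZone(ZoneId.of("UTC")).toLocalDateTime())); // true
  }
}
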
spark2/src/test/java/org/apache/iceberg/spark/source/TestTimestampWithoutZone24.java

Lines changed: 26 additions & 0 deletions

@@ -0,0 +1,26 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.spark.source;
+
+public class TestTimestampWithoutZone24 extends TestTimestampWithoutZone {
+  public TestTimestampWithoutZone24(String format, boolean vectorized) {
+    super(format, vectorized);
+  }
+}
