From 892d66dc234e495fca6dfff3e23bb810bec8cd53 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 14 May 2024 09:03:22 -0600 Subject: [PATCH 1/5] add new DataGenerator class --- .../org/apache/comet/CometCastSuite.scala | 41 ++++++++---------- .../org/apache/comet/DataGenerator.scala | 42 +++++++++++++++++++ 2 files changed, 60 insertions(+), 23 deletions(-) create mode 100644 spark/src/test/scala/org/apache/comet/DataGenerator.scala diff --git a/spark/src/test/scala/org/apache/comet/CometCastSuite.scala b/spark/src/test/scala/org/apache/comet/CometCastSuite.scala index 1881c561c..bcc171a87 100644 --- a/spark/src/test/scala/org/apache/comet/CometCastSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometCastSuite.scala @@ -35,6 +35,9 @@ import org.apache.comet.expressions.{CometCast, Compatible} class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper { import testImplicits._ + /** Create a data generator using a fixed seed so that tests are reproducible */ + private val gen = new DataGenerator(new Random(42)) + private val dataSize = 1000 // we should eventually add more whitespace chars here as documented in @@ -478,7 +481,7 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper { test("cast StringType to BooleanType") { val testValues = (Seq("TRUE", "True", "true", "FALSE", "False", "false", "1", "0", "", null) ++ - generateStrings("truefalseTRUEFALSEyesno10" + whitespaceChars, 8)).toDF("a") + gen.generateStrings(dataSize, "truefalseTRUEFALSEyesno10" + whitespaceChars, 8)).toDF("a") castTest(testValues, DataTypes.BooleanType) } @@ -519,53 +522,53 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper { // test with hand-picked values castTest(castStringToIntegralInputs.toDF("a"), DataTypes.ByteType) // fuzz test - castTest(generateStrings(numericPattern, 4).toDF("a"), DataTypes.ByteType) + castTest(gen.generateStrings(dataSize, numericPattern, 4).toDF("a"), DataTypes.ByteType) } test("cast StringType to ShortType") { // test with hand-picked values castTest(castStringToIntegralInputs.toDF("a"), DataTypes.ShortType) // fuzz test - castTest(generateStrings(numericPattern, 5).toDF("a"), DataTypes.ShortType) + castTest(gen.generateStrings(dataSize, numericPattern, 5).toDF("a"), DataTypes.ShortType) } test("cast StringType to IntegerType") { // test with hand-picked values castTest(castStringToIntegralInputs.toDF("a"), DataTypes.IntegerType) // fuzz test - castTest(generateStrings(numericPattern, 8).toDF("a"), DataTypes.IntegerType) + castTest(gen.generateStrings(dataSize, numericPattern, 8).toDF("a"), DataTypes.IntegerType) } test("cast StringType to LongType") { // test with hand-picked values castTest(castStringToIntegralInputs.toDF("a"), DataTypes.LongType) // fuzz test - castTest(generateStrings(numericPattern, 8).toDF("a"), DataTypes.LongType) + castTest(gen.generateStrings(dataSize, numericPattern, 8).toDF("a"), DataTypes.LongType) } ignore("cast StringType to FloatType") { // https://github.com/apache/datafusion-comet/issues/326 - castTest(generateStrings(numericPattern, 8).toDF("a"), DataTypes.FloatType) + castTest(gen.generateStrings(dataSize, numericPattern, 8).toDF("a"), DataTypes.FloatType) } ignore("cast StringType to DoubleType") { // https://github.com/apache/datafusion-comet/issues/326 - castTest(generateStrings(numericPattern, 8).toDF("a"), DataTypes.DoubleType) + castTest(gen.generateStrings(dataSize, numericPattern, 8).toDF("a"), DataTypes.DoubleType) } ignore("cast StringType to DecimalType(10,2)") { // https://github.com/apache/datafusion-comet/issues/325 - val values = generateStrings(numericPattern, 8).toDF("a") + val values = gen.generateStrings(dataSize, numericPattern, 8).toDF("a") castTest(values, DataTypes.createDecimalType(10, 2)) } test("cast StringType to BinaryType") { - castTest(generateStrings(numericPattern, 8).toDF("a"), DataTypes.BinaryType) + castTest(gen.generateStrings(dataSize, numericPattern, 8).toDF("a"), DataTypes.BinaryType) } ignore("cast StringType to DateType") { // https://github.com/apache/datafusion-comet/issues/327 - castTest(generateStrings(datePattern, 8).toDF("a"), DataTypes.DateType) + castTest(gen.generateStrings(dataSize, datePattern, 8).toDF("a"), DataTypes.DateType) } test("cast StringType to TimestampType disabled by default") { @@ -581,7 +584,10 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper { ignore("cast StringType to TimestampType") { // https://github.com/apache/datafusion-comet/issues/328 withSQLConf((CometConf.COMET_CAST_ALLOW_INCOMPATIBLE.key, "true")) { - val values = Seq("2020-01-01T12:34:56.123456", "T2") ++ generateStrings(timestampPattern, 8) + val values = Seq("2020-01-01T12:34:56.123456", "T2") ++ gen.generateStrings( + dataSize, + timestampPattern, + 8) castTest(values.toDF("a"), DataTypes.TimestampType) } } @@ -630,7 +636,7 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper { } test("cast BinaryType to StringType - valid UTF-8 inputs") { - castTest(generateStrings(numericPattern, 8).toDF("a"), DataTypes.StringType) + castTest(gen.generateStrings(dataSize, numericPattern, 8).toDF("a"), DataTypes.StringType) } // CAST from DateType @@ -864,17 +870,6 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper { .drop("str") } - private def generateString(r: Random, chars: String, maxLen: Int): String = { - val len = r.nextInt(maxLen) - Range(0, len).map(_ => chars.charAt(r.nextInt(chars.length))).mkString - } - - // TODO return DataFrame for consistency with other generators and include null values - private def generateStrings(chars: String, maxLen: Int): Seq[String] = { - val r = new Random(0) - Range(0, dataSize).map(_ => generateString(r, chars, maxLen)) - } - private def generateBinary(): DataFrame = { val r = new Random(0) val bytes = new Array[Byte](8) diff --git a/spark/src/test/scala/org/apache/comet/DataGenerator.scala b/spark/src/test/scala/org/apache/comet/DataGenerator.scala new file mode 100644 index 000000000..8e361e803 --- /dev/null +++ b/spark/src/test/scala/org/apache/comet/DataGenerator.scala @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.comet + +import scala.util.Random + +class DataGenerator(r: Random) { + + /** Generate a random string using the specified characters */ + def generateString(chars: String, maxLen: Int): String = { + val len = r.nextInt(maxLen) + Range(0, len).map(_ => chars.charAt(r.nextInt(chars.length))).mkString + } + + /** Generate random strings */ + def generateStrings(n: Int, maxLen: Int): Seq[String] = { + Range(0, n).map(_ => r.nextString(maxLen)) + } + + /** Generate random strings using the specified characters */ + def generateStrings(n: Int, chars: String, maxLen: Int): Seq[String] = { + Range(0, n).map(_ => generateString(chars, maxLen)) + } + +} From 23082da3506f7ada8bb11882622e2cde72316f15 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 14 May 2024 09:17:49 -0600 Subject: [PATCH 2/5] move more data gen methods --- .../org/apache/comet/CometCastSuite.scala | 70 ++----------------- .../org/apache/comet/DataGenerator.scala | 49 +++++++++++++ 2 files changed, 55 insertions(+), 64 deletions(-) diff --git a/spark/src/test/scala/org/apache/comet/CometCastSuite.scala b/spark/src/test/scala/org/apache/comet/CometCastSuite.scala index bcc171a87..51fa0626b 100644 --- a/spark/src/test/scala/org/apache/comet/CometCastSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometCastSuite.scala @@ -745,35 +745,11 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper { } private def generateFloats(): DataFrame = { - val r = new Random(0) - val values = Seq( - Float.MaxValue, - Float.MinPositiveValue, - Float.MinValue, - Float.NaN, - Float.PositiveInfinity, - Float.NegativeInfinity, - 1.0f, - -1.0f, - Short.MinValue.toFloat, - Short.MaxValue.toFloat, - 0.0f) ++ - Range(0, dataSize).map(_ => r.nextFloat()) - withNulls(values).toDF("a") + withNulls(gen.generateFloats(dataSize)).toDF("a") } private def generateDoubles(): DataFrame = { - val r = new Random(0) - val values = Seq( - Double.MaxValue, - Double.MinPositiveValue, - Double.MinValue, - Double.NaN, - Double.PositiveInfinity, - Double.NegativeInfinity, - 0.0d) ++ - Range(0, dataSize).map(_ => r.nextDouble()) - withNulls(values).toDF("a") + withNulls(gen.generateDoubles(dataSize)).toDF("a") } private def generateBools(): DataFrame = { @@ -781,31 +757,19 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper { } private def generateBytes(): DataFrame = { - val r = new Random(0) - val values = Seq(Byte.MinValue, Byte.MaxValue) ++ - Range(0, dataSize).map(_ => r.nextInt().toByte) - withNulls(values).toDF("a") + withNulls(gen.generateBytes(dataSize)).toDF("a") } private def generateShorts(): DataFrame = { - val r = new Random(0) - val values = Seq(Short.MinValue, Short.MaxValue) ++ - Range(0, dataSize).map(_ => r.nextInt().toShort) - withNulls(values).toDF("a") + withNulls(gen.generateShorts(dataSize)).toDF("a") } private def generateInts(): DataFrame = { - val r = new Random(0) - val values = Seq(Int.MinValue, Int.MaxValue) ++ - Range(0, dataSize).map(_ => r.nextInt()) - withNulls(values).toDF("a") + withNulls(gen.generateInts(dataSize)).toDF("a") } private def generateLongs(): DataFrame = { - val r = new Random(0) - val values = Seq(Long.MinValue, Long.MaxValue) ++ - Range(0, dataSize).map(_ => r.nextLong()) - withNulls(values).toDF("a") + withNulls(gen.generateLongs(dataSize)).toDF("a") } private def generateDecimalsPrecision10Scale2(): DataFrame = { @@ -902,28 +866,6 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper { } } - // TODO Commented out to work around scalafix since this is currently unused. - // private def castFallbackTestTimezone( - // input: DataFrame, - // toType: DataType, - // expectedMessage: String): Unit = { - // withTempPath { dir => - // val data = roundtripParquet(input, dir).coalesce(1) - // data.createOrReplaceTempView("t") - // - // withSQLConf( - // (SQLConf.ANSI_ENABLED.key, "false"), - // (CometConf.COMET_CAST_ALLOW_INCOMPATIBLE.key, "true"), - // (SQLConf.SESSION_LOCAL_TIMEZONE.key, "America/Los_Angeles")) { - // val df = data.withColumn("converted", col("a").cast(toType)) - // df.collect() - // val str = - // new ExtendedExplainInfo().generateExtendedInfo(df.queryExecution.executedPlan) - // assert(str.contains(expectedMessage)) - // } - // } - // } - private def castTimestampTest(input: DataFrame, toType: DataType) = { withTempPath { dir => val data = roundtripParquet(input, dir).coalesce(1) diff --git a/spark/src/test/scala/org/apache/comet/DataGenerator.scala b/spark/src/test/scala/org/apache/comet/DataGenerator.scala index 8e361e803..19ef9eb9d 100644 --- a/spark/src/test/scala/org/apache/comet/DataGenerator.scala +++ b/spark/src/test/scala/org/apache/comet/DataGenerator.scala @@ -39,4 +39,53 @@ class DataGenerator(r: Random) { Range(0, n).map(_ => generateString(chars, maxLen)) } + def generateFloats(n: Int): Seq[Float] = { + Seq( + Float.MaxValue, + Float.MinPositiveValue, + Float.MinValue, + Float.NaN, + Float.PositiveInfinity, + Float.NegativeInfinity, + 1.0f, + -1.0f, + Short.MinValue.toFloat, + Short.MaxValue.toFloat, + 0.0f) ++ + Range(0, n).map(_ => r.nextFloat()) + } + + def generateDoubles(n: Int): Seq[Double] = { + Seq( + Double.MaxValue, + Double.MinPositiveValue, + Double.MinValue, + Double.NaN, + Double.PositiveInfinity, + Double.NegativeInfinity, + 0.0d) ++ + Range(0, n).map(_ => r.nextDouble()) + } + + def generateBytes(n: Int): Seq[Byte] = { + Seq(Byte.MinValue, Byte.MaxValue) ++ + Range(0, n).map(_ => r.nextInt().toByte) + } + + def generateShorts(n: Int): Seq[Short] = { + val r = new Random(0) + Seq(Short.MinValue, Short.MaxValue) ++ + Range(0, n).map(_ => r.nextInt().toShort) + } + + def generateInts(n: Int): Seq[Int] = { + Seq(Int.MinValue, Int.MaxValue) ++ + Range(0, n).map(_ => r.nextInt()) + } + + def generateLongs(n: Int): Seq[Long] = { + Seq(Long.MinValue, Long.MaxValue) ++ + Range(0, n).map(_ => r.nextLong()) + } + } From b109871a0f3d77dea70c5afbecf3a862fd70ecaa Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 14 May 2024 10:50:58 -0600 Subject: [PATCH 3/5] ignore some failing tests, update compat docs --- docs/source/user-guide/compatibility.md | 8 ++++---- .../org/apache/comet/expressions/CometCast.scala | 2 +- .../test/scala/org/apache/comet/CometCastSuite.scala | 11 ++++++----- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/docs/source/user-guide/compatibility.md b/docs/source/user-guide/compatibility.md index a4ed9289f..278edb848 100644 --- a/docs/source/user-guide/compatibility.md +++ b/docs/source/user-guide/compatibility.md @@ -110,10 +110,6 @@ The following cast operations are generally compatible with Spark except for the | decimal | float | | | decimal | double | | | string | boolean | | -| string | byte | | -| string | short | | -| string | integer | | -| string | long | | | string | binary | | | date | string | | | timestamp | long | | @@ -129,6 +125,10 @@ The following cast operations are not compatible with Spark for all inputs and a |-|-|-| | integer | decimal | No overflow check | | long | decimal | No overflow check | +| string | byte | Not all invalid inputs are detected | +| string | short | Not all invalid inputs are detected | +| string | integer | Not all invalid inputs are detected | +| string | long | Not all invalid inputs are detected | | string | timestamp | Not all valid formats are supported | | binary | string | Only works for binary data representing valid UTF-8 strings | diff --git a/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala b/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala index 795bdb428..9c3695ba5 100644 --- a/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala +++ b/spark/src/main/scala/org/apache/comet/expressions/CometCast.scala @@ -108,7 +108,7 @@ object CometCast { Compatible() case DataTypes.ByteType | DataTypes.ShortType | DataTypes.IntegerType | DataTypes.LongType => - Compatible() + Incompatible(Some("Not all invalid inputs are detected")) case DataTypes.BinaryType => Compatible() case DataTypes.FloatType | DataTypes.DoubleType => diff --git a/spark/src/test/scala/org/apache/comet/CometCastSuite.scala b/spark/src/test/scala/org/apache/comet/CometCastSuite.scala index 51fa0626b..6e7b2c0d3 100644 --- a/spark/src/test/scala/org/apache/comet/CometCastSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometCastSuite.scala @@ -38,7 +38,8 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper { /** Create a data generator using a fixed seed so that tests are reproducible */ private val gen = new DataGenerator(new Random(42)) - private val dataSize = 1000 + /** Number of random data items to generate in each test */ + private val dataSize = 10000 // we should eventually add more whitespace chars here as documented in // https://docs.oracle.com/javase/8/docs/api/java/lang/Character.html#isWhitespace-char- @@ -518,28 +519,28 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper { "9223372036854775808" // Long.MaxValue + 1 ) - test("cast StringType to ByteType") { + ignore("cast StringType to ByteType") { // test with hand-picked values castTest(castStringToIntegralInputs.toDF("a"), DataTypes.ByteType) // fuzz test castTest(gen.generateStrings(dataSize, numericPattern, 4).toDF("a"), DataTypes.ByteType) } - test("cast StringType to ShortType") { + ignore("cast StringType to ShortType") { // test with hand-picked values castTest(castStringToIntegralInputs.toDF("a"), DataTypes.ShortType) // fuzz test castTest(gen.generateStrings(dataSize, numericPattern, 5).toDF("a"), DataTypes.ShortType) } - test("cast StringType to IntegerType") { + ignore("cast StringType to IntegerType") { // test with hand-picked values castTest(castStringToIntegralInputs.toDF("a"), DataTypes.IntegerType) // fuzz test castTest(gen.generateStrings(dataSize, numericPattern, 8).toDF("a"), DataTypes.IntegerType) } - test("cast StringType to LongType") { + ignore("cast StringType to LongType") { // test with hand-picked values castTest(castStringToIntegralInputs.toDF("a"), DataTypes.LongType) // fuzz test From edf28d9e3461fc21de89e08fd022642afd3eb4ee Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 14 May 2024 11:00:44 -0600 Subject: [PATCH 4/5] address feedback --- spark/src/test/scala/org/apache/comet/CometCastSuite.scala | 2 +- spark/src/test/scala/org/apache/comet/DataGenerator.scala | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/spark/src/test/scala/org/apache/comet/CometCastSuite.scala b/spark/src/test/scala/org/apache/comet/CometCastSuite.scala index 6e7b2c0d3..50311a9b5 100644 --- a/spark/src/test/scala/org/apache/comet/CometCastSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometCastSuite.scala @@ -36,7 +36,7 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper { import testImplicits._ /** Create a data generator using a fixed seed so that tests are reproducible */ - private val gen = new DataGenerator(new Random(42)) + private val gen = DataGenerator.DEFAULT /** Number of random data items to generate in each test */ private val dataSize = 10000 diff --git a/spark/src/test/scala/org/apache/comet/DataGenerator.scala b/spark/src/test/scala/org/apache/comet/DataGenerator.scala index 19ef9eb9d..691a371b5 100644 --- a/spark/src/test/scala/org/apache/comet/DataGenerator.scala +++ b/spark/src/test/scala/org/apache/comet/DataGenerator.scala @@ -21,6 +21,13 @@ package org.apache.comet import scala.util.Random +object DataGenerator { + // note that we use `def` rather than `val` intentionally here so that + // each test suite starts with a fresh data generator to help ensure + // that tests are deterministic + def DEFAULT = new DataGenerator(new Random(42)) +} + class DataGenerator(r: Random) { /** Generate a random string using the specified characters */ From cf779dfec90788d98fdd13d04af443e91e1db8e6 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 14 May 2024 11:48:54 -0600 Subject: [PATCH 5/5] fix regression --- .../test/scala/org/apache/comet/CometExpressionSuite.scala | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala b/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala index 28027c5cb..ef1c5ae37 100644 --- a/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala +++ b/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala @@ -942,7 +942,9 @@ class CometExpressionSuite extends CometTestBase with AdaptiveSparkPlanHelper { test("Chr") { Seq(false, true).foreach { dictionary => - withSQLConf("parquet.enable.dictionary" -> dictionary.toString) { + withSQLConf( + "parquet.enable.dictionary" -> dictionary.toString, + CometConf.COMET_CAST_ALLOW_INCOMPATIBLE.key -> "true") { val table = "test" withTable(table) { sql(s"create table $table(col varchar(20)) using parquet")