Add fuzz tests for cast from string to other types #2898

Merged: 10 commits, Jul 13, 2021
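
In outline, the tests added below generate random strings from a restricted alphabet with a fixed seed, run the cast on both the CPU (plain Spark) and the GPU (RAPIDS plugin), and fail on the first row where the results differ. A minimal, self-contained sketch of that idea, independent of Spark (FuzzSketch and parseBoolean are illustrative stand-ins, not APIs from this repository):

    import scala.util.Random

    object FuzzSketch {
      // Build `count` random strings whose characters all come from `alphabet`.
      def randomStrings(alphabet: String, maxLen: Int, count: Int, seed: Long): Seq[String] = {
        val r = new Random(seed)
        (0 until count).map { _ =>
          val len = r.nextInt(maxLen + 1)
          (0 until len).map(_ => alphabet(r.nextInt(alphabet.length))).mkString
        }
      }

      // Illustrative reference only: Spark's real string-to-boolean rules differ in detail.
      private def parseBoolean(s: String): Option[Boolean] =
        s.trim.toLowerCase match {
          case "true" | "t" | "yes" | "y" | "1" => Some(true)
          case "false" | "f" | "no" | "n" | "0" => Some(false)
          case _ => None
        }

      def main(args: Array[String]): Unit = {
        val inputs = randomStrings(" \t\r\nFALSEfalseTRUEtrue01yesYESnoNO",
          maxLen = 3, count = 8192, seed = 0)
        for (s <- inputs) {
          val cpu = parseBoolean(s) // stand-in for Spark's CPU cast result
          val gpu = parseBoolean(s) // stand-in for the RAPIDS GPU cast result
          assert(cpu == gpu, s"Mismatch casting [$s]: CPU=$cpu GPU=$gpu")
        }
      }
    }

The diff below implements the same loop with the real Spark sessions: generateRandomStrings produces the inputs and testCastStringTo compares the CPU and GPU rows.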
tests/src/test/scala/com/nvidia/spark/rapids/CastOpSuite.scala (135 additions, 0 deletions)

@@ -24,6 +24,7 @@ import java.util.TimeZone

import ai.rapids.cudf.ColumnVector
import scala.collection.JavaConverters._
import scala.util.Random

import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
@@ -56,6 +57,140 @@ class CastOpSuite extends GpuExpressionTestSuite {
for (from <- supportedTypes; to <- supportedTypes) yield (from, to)
}

// Restricted alphabets that bias the fuzzer toward strings the casts might accept
private val BOOL_CHARS = " \t\r\nFALSEfalseTRUEtrue01yesYESnoNO"
private val NUMERIC_CHARS = "inf \t\r\n0123456789.+-eE"
private val DATE_CHARS = " \t\r\n0123456789:-/TZ"

ignore("Cast from string to boolean using random inputs") {
// Test ignored due to known issues
// https://github.com/NVIDIA/spark-rapids/issues/2902
testCastStringTo(DataTypes.BooleanType,
generateRandomStrings(Some(BOOL_CHARS), maxStringLen = 1))
testCastStringTo(DataTypes.BooleanType,
generateRandomStrings(Some(BOOL_CHARS), maxStringLen = 3))
testCastStringTo(DataTypes.BooleanType, generateRandomStrings(Some(BOOL_CHARS)))
}

ignore("Cast from string to boolean using hand-picked values") {
// Test ignored due to known issues
// https://github.com/NVIDIA/spark-rapids/issues/2902
testCastStringTo(DataTypes.BooleanType, Seq("\n\nN", "False", "FALSE", "false", "FaLsE",
"f", "F", "True", "TRUE", "true", "tRuE", "t", "T", "Y", "y", "10", "01", "0", "1"))
}

ignore("Cast from string to byte using random inputs") {
// Test ignored due to known issues
// https://github.com/NVIDIA/spark-rapids/issues/2899
testCastStringTo(DataTypes.ByteType, generateRandomStrings(Some(NUMERIC_CHARS)))
}

ignore("Cast from string to short using random inputs") {
// Test ignored due to known issues
// https://github.com/NVIDIA/spark-rapids/issues/2899
testCastStringTo(DataTypes.ShortType, generateRandomStrings(Some(NUMERIC_CHARS)))
}

ignore("Cast from string to int using random inputs") {
// Test ignored due to known issues
// https://github.com/NVIDIA/spark-rapids/issues/2899
testCastStringTo(DataTypes.IntegerType, generateRandomStrings(Some(NUMERIC_CHARS)))
}

ignore("Cast from string to long using random inputs") {
// Test ignored due to known issues
// https://github.com/NVIDIA/spark-rapids/issues/2899
testCastStringTo(DataTypes.LongType, generateRandomStrings(Some(NUMERIC_CHARS)))
}

ignore("Cast from string to float using random inputs") {
// Test ignored due to known issues
// https://github.com/NVIDIA/spark-rapids/issues/2900
testCastStringTo(DataTypes.FloatType, generateRandomStrings(Some(NUMERIC_CHARS)))
}

ignore("Cast from string to double using random inputs") {
// Test ignored due to known issues
// https://github.com/NVIDIA/spark-rapids/issues/2900
testCastStringTo(DataTypes.DoubleType, generateRandomStrings(Some(NUMERIC_CHARS)))
}

test("Cast from string to date using random inputs") {
testCastStringTo(DataTypes.DateType, generateRandomStrings(Some(DATE_CHARS), maxStringLen = 8))
}

test("Cast from string to date using random inputs with valid year prefix") {
testCastStringTo(DataTypes.DateType,
generateRandomStrings(Some(DATE_CHARS), maxStringLen = 8, Some("2021")))
}

ignore("Cast from string to timestamp using random inputs") {
// Test ignored due to known issues
// https://github.com/NVIDIA/spark-rapids/issues/2889
testCastStringTo(DataTypes.TimestampType,
generateRandomStrings(Some(DATE_CHARS), maxStringLen = 32, None))
}

ignore("Cast from string to timestamp using random inputs with valid year prefix") {
// Test ignored due to known issues
// https://github.com/NVIDIA/spark-rapids/issues/2889
testCastStringTo(DataTypes.TimestampType,
generateRandomStrings(Some(DATE_CHARS), maxStringLen = 32, Some("2021-")))
}

private def generateRandomStrings(
validChars: Option[String],
maxStringLen: Int = 12,
prefix: Option[String] = None): Seq[String] = {
val randomValueCount = 8192

// Fixed seed keeps the fuzz inputs reproducible across runs
val random = new Random(0)
val r = new EnhancedRandom(random,
FuzzerOptions(validChars, maxStringLen))

(0 until randomValueCount)
.map(_ => prefix.getOrElse("") + r.nextString())
}

private def testCastStringTo(toType: DataType, strings: Seq[String]): Unit = {

def castDf(spark: SparkSession): Seq[Row] = {
import spark.implicits._
val df = strings.zipWithIndex.toDF("c0", "id").repartition(2)
val castDf = df.withColumn("c1", col("c0").cast(toType))
castDf.collect()
}

// Column ordinals in the collected rows: c0 = input string, id = row id, c1 = cast result
val INDEX_ID = 1
val INDEX_C0 = 0
val INDEX_C1 = 2

// Sort by the generated row id so CPU and GPU rows can be zipped pairwise
val cpu = withCpuSparkSession(castDf)
.sortBy(_.getInt(INDEX_ID))

// Enable GPU string casts that are off by default due to known incompatibilities
val conf = new SparkConf()
.set(RapidsConf.EXPLAIN.key, "ALL")
.set(RapidsConf.INCOMPATIBLE_DATE_FORMATS.key, "true")
.set(RapidsConf.ENABLE_CAST_STRING_TO_TIMESTAMP.key, "true")
.set(RapidsConf.ENABLE_CAST_STRING_TO_FLOAT.key, "true")
.set(RapidsConf.ENABLE_CAST_STRING_TO_DECIMAL.key, "true")
.set(RapidsConf.ENABLE_CAST_STRING_TO_INTEGER.key, "true")

val gpu = withGpuSparkSession(castDf, conf)
.sortBy(_.getInt(INDEX_ID))

for ((cpuRow, gpuRow) <- cpu.zip(gpu)) {
assert(cpuRow.getString(INDEX_C0) === gpuRow.getString(INDEX_C0))
assert(cpuRow.getInt(INDEX_ID) === gpuRow.getInt(INDEX_ID))
val cpuValue = cpuRow.get(INDEX_C1)
val gpuValue = gpuRow.get(INDEX_C1)
if (!compare(cpuValue, gpuValue)) {
val inputValue = cpuRow.getString(INDEX_C0)
fail(s"Mismatch casting string [$inputValue] " +
s"to $toType. CPU: $cpuValue; GPU: $gpuValue")
}
}
}
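
// Hypothetical example (not part of this change): the same harness could
// exercise decimal targets, since ENABLE_CAST_STRING_TO_DECIMAL is already
// enabled in the conf above. Sketch only; the DecimalType bounds are arbitrary:
//   test("Cast from string to decimal using random inputs") {
//     testCastStringTo(DataTypes.createDecimalType(10, 2),
//       generateRandomStrings(Some(NUMERIC_CHARS)))
//   }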

test("Test all supported casts with in-range values") {
// test cast() and ansi_cast()
Seq(false, true).foreach { ansiEnabled =>
tests/src/test/scala/com/nvidia/spark/rapids/FuzzerUtils.scala (17 additions, 30 deletions)

@@ -38,10 +38,7 @@ object FuzzerUtils {
/**
* Default options when generating random data.
*/
- private val DEFAULT_OPTIONS = FuzzerOptions(
-   numbersAsStrings = true,
-   asciiStringsOnly = false,
-   maxStringLen = 64)
+ private val DEFAULT_OPTIONS = FuzzerOptions()

/**
* Create a schema with the specified data types.
@@ -331,20 +328,6 @@ class EnhancedRandom(protected val r: Random, protected val options: FuzzerOptions
}
}

- def nextString(): String = {
-   if (options.numbersAsStrings) {
-     r.nextInt(5) match {
-       case 0 => String.valueOf(r.nextInt())
-       case 1 => String.valueOf(r.nextLong())
-       case 2 => String.valueOf(r.nextFloat())
-       case 3 => String.valueOf(r.nextDouble())
-       case 4 => generateString()
-     }
-   } else {
-     generateString()
-   }
- }

def nextDate(): Date = {
val futureDate = 6321706291000L // Upper limit Sunday, April 29, 2170 9:31:31 PM
new Date((futureDate * r.nextDouble()).toLong)
@@ -355,22 +338,26 @@ class EnhancedRandom(protected val r: Random, protected val options: FuzzerOptions
new Timestamp((futureDate * r.nextDouble()).toLong)
}

- private def generateString(): String = {
-   if (options.asciiStringsOnly) {
-     val b = new StringBuilder()
-     for (_ <- 0 until options.maxStringLen) {
-       b.append(ASCII_CHARS.charAt(r.nextInt(ASCII_CHARS.length)))
-     }
-     b.toString
-   } else {
-     r.nextString(r.nextInt(options.maxStringLen))
+ def nextString(): String = {
+   val length = r.nextInt(options.maxStringLen)
+   options.validStringChars match {
+     case Some(ch) => nextString(ch, length)
+     case _ =>
+       // delegate to Scala's Random.nextString
+       r.nextString(length)
}
}

+ def nextString(validStringChars: String, maxStringLen: Int): String = {
+   val b = new StringBuilder(maxStringLen)
+   for (_ <- 0 until maxStringLen) {
+     b.append(validStringChars.charAt(r.nextInt(validStringChars.length)))
+   }
+   b.toString
+ }

private val ASCII_CHARS = "abcdefghijklmnopqrstuvwxyz"
}

case class FuzzerOptions(
- numbersAsStrings: Boolean = true,
- asciiStringsOnly: Boolean = false,
+ validStringChars: Option[String] = None,
maxStringLen: Int = 64)
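
For reference, a hedged sketch of how the reworked options are meant to be used, mirroring the constructor call in CastOpSuite above (the alphabet and lengths here are arbitrary):

    import scala.util.Random

    // Draw strings only from a date-like alphabet, at most 7 characters long
    // (nextString() picks a length in [0, maxStringLen)).
    val opts = FuzzerOptions(validStringChars = Some(" \t\r\n0123456789:-/TZ"), maxStringLen = 8)
    val rand = new EnhancedRandom(new Random(0), opts)
    val samples = (0 until 5).map(_ => rand.nextString())
    // With validStringChars = None, nextString() falls back to Random.nextString,
    // which can emit arbitrary (including non-printable) characters.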
tests/src/test/scala/com/nvidia/spark/rapids/HashAggregatesSuite.scala

@@ -73,8 +73,7 @@ class HashAggregatesSuite extends SparkQueryCompareTestSuite {
}

def firstDf(spark: SparkSession): DataFrame = {
- val options = FuzzerOptions(asciiStringsOnly = true, numbersAsStrings = false,
-   maxStringLen = 4)
+ val options = FuzzerOptions(maxStringLen = 4)
val schema = FuzzerUtils.createSchema(Seq(DataTypes.StringType, DataTypes.IntegerType))
FuzzerUtils.generateDataFrame(spark, schema, 100, options, seed = 0)
.withColumn("c2", col("c1").mod(lit(10)))
@@ -857,8 +856,7 @@ class HashAggregatesSuite extends SparkQueryCompareTestSuite {
private def randomDF(dataType: DataType)(spark: SparkSession) : DataFrame = {
val schema = FuzzerUtils.createSchema(Seq(DataTypes.StringType, dataType))
FuzzerUtils.generateDataFrame(spark, schema, rowCount = 1000,
- options = FuzzerOptions(numbersAsStrings = false, asciiStringsOnly = true,
-   maxStringLen = 2))
+ options = FuzzerOptions(maxStringLen = 2))
}

FLOAT_TEST_testSparkResultsAreEqual("empty df: reduction count", floatCsvDf,