-
-
Notifications
You must be signed in to change notification settings - Fork 355
Dataset Generator
Mahmoud Hanafy edited this page Apr 22, 2016
·
2 revisions
`DatasetGenerator` provides an easy way to generate arbitrary Datasets, so that you can check any property against them.
If you are not familiar with ScalaCheck, I suggest you read about it first to understand the concepts of properties and generators.
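You can generate arbitrary datasets using the `arbitraryDataset` method. Just create a generator for your required Dataset type, or use one of the generators that are supported by default. The examples below call `DatasetGenerator.genDataset`, which yields a `Gen[Dataset[T]]` that can be passed straight to `forAll`.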
Example: (Supported Generator)
/**
 * Example suite: property-checks generated `Dataset[String]` values, using the
 * `Arbitrary[String]` generator that ScalaCheck provides out of the box.
 */
class SampleDatasetGeneratorTest extends FunSuite with SharedSparkContext with Checkers {

  test("test generating Datasets[String]") {
    val sqlContext = new SQLContext(sc)
    // Spark SQL implicits used by genDataset and Dataset.map below.
    import sqlContext.implicits._

    // Generator of whole datasets, built from a generator of single elements.
    val stringDatasets =
      DatasetGenerator.genDataset[String](sqlContext)(Arbitrary.arbitrary[String])

    // Property: mapping every element to its length keeps the row count unchanged.
    val property = forAll(stringDatasets) { strings =>
      strings.map(_.length).count() == strings.count()
    }

    check(property)
  }
}
You can also create a custom generator for your own data type.
Example: (Custom Generator)
/**
 * Example suite: property-checks generated `Dataset[Car]` values, using a
 * hand-written generator for the `Car` case class.
 */
class SampleDatasetGeneratorTest extends FunSuite with SharedSparkContext with Checkers {

  test("test generating Datasets[Custom Class]") {
    val sqlContext = new SQLContext(sc)
    // Spark SQL implicits used by genDataset and Dataset.map below.
    import sqlContext.implicits._

    // Each Car is assembled from an arbitrary name and an arbitrary speed; the
    // element generator is handed to genDataset to obtain a generator of datasets.
    val carGen: Gen[Dataset[Car]] =
      DatasetGenerator.genDataset[Car](sqlContext) {
        for {
          name <- Arbitrary.arbitrary[String]
          speed <- Arbitrary.arbitrary[Int]
        } yield Car(name, speed)
      }

    // Property: projecting every car to its speed keeps the row count unchanged.
    val property = forAll(carGen) { cars =>
      cars.map(_.speed).count() == cars.count()
    }

    check(property)
  }
}
/**
 * Sample record type used by the custom-generator example.
 *
 * @param name  the car's name
 * @param speed the car's speed
 */
case class Car(
  name: String,
  speed: Int
)