Skip to content

Commit

Permalink
update default threshold
Browse the repository at this point in the history
  • Loading branch information
Sophie Wang committed Oct 4, 2023
1 parent 7f126e3 commit a3050b0
Show file tree
Hide file tree
Showing 2 changed files with 3 additions and 3 deletions.
2 changes: 1 addition & 1 deletion spark/src/main/scala/ai/chronon/spark/Extensions.scala
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ object Extensions {
totalCount: Long,
tableName: String,
partitionRange: PartitionRange,
fpp: Double = 0.01): BloomFilter = {
fpp: Double = 0.03): BloomFilter = {
val approxCount =
df.filter(df.col(col).isNotNull).select(approx_count_distinct(col)).collect()(0).getLong(0)
if (approxCount == 0) {
Expand Down
4 changes: 2 additions & 2 deletions spark/src/main/scala/ai/chronon/spark/TableUtils.scala
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ case class TableUtils(sparkSession: SparkSession) {
sparkSession.conf.get("spark.chronon.partition.format", "yyyy-MM-dd")
val partitionSpec: PartitionSpec = PartitionSpec(partitionFormat, WindowUtils.Day.millis)
val backfillValidationEnforced = sparkSession.conf.get("spark.chronon.backfill.validation.enabled", "true").toBoolean
// Threshold to control whether or not to use bloomfilter on join backfill. If the row approximate count is under this threshold, we will use bloomfilter.
// default threshold is 1 million rows
// Threshold to control whether to use a bloom filter on join backfill. If the approximate row count of the backfill is under this threshold, we will use a bloom filter.
// Default threshold is 1 million rows (matches the "1000000" default below). NOTE(review): this commit's comment said "100K rows" while the default value was left at 1000000 — if the intent was to lower the threshold, the default should be changed to "100000" as well.
val bloomFilterThreshold = sparkSession.conf.get("spark.chronon.backfill.bloomfilter.threshold", "1000000").toLong

sparkSession.sparkContext.setLogLevel("ERROR")
Expand Down

0 comments on commit a3050b0

Please sign in to comment.