Skip to content

Commit

Permalink
update default threshold
Browse the repository at this point in the history
  • Loading branch information
Sophie Wang committed Oct 4, 2023
1 parent 7f126e3 commit a3050b0
Show file tree
Hide file tree
Showing 2 changed files with 3 additions and 3 deletions.
2 changes: 1 addition & 1 deletion spark/src/main/scala/ai/chronon/spark/Extensions.scala
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ object Extensions {
totalCount: Long,
tableName: String,
partitionRange: PartitionRange,
fpp: Double = 0.01): BloomFilter = {
fpp: Double = 0.03): BloomFilter = {
val approxCount =
df.filter(df.col(col).isNotNull).select(approx_count_distinct(col)).collect()(0).getLong(0)
if (approxCount == 0) {
Expand Down
4 changes: 2 additions & 2 deletions spark/src/main/scala/ai/chronon/spark/TableUtils.scala
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ case class TableUtils(sparkSession: SparkSession) {
sparkSession.conf.get("spark.chronon.partition.format", "yyyy-MM-dd")
val partitionSpec: PartitionSpec = PartitionSpec(partitionFormat, WindowUtils.Day.millis)
val backfillValidationEnforced = sparkSession.conf.get("spark.chronon.backfill.validation.enabled", "true").toBoolean
// Threshold to control whether or not to use bloomfilter on join backfill. If the row approximate count is under this threshold, we will use bloomfilter.
// default threshold is 1 million rows
// Threshold to control whether to use a bloom filter on join backfill. If the approximate row count of the backfill is under this threshold, we will use a bloom filter.
// Default threshold is 1 million rows (matches the "1000000" default below). NOTE(review): this commit's comment said "100K rows" while the default value was left at 1000000 — if the intent was to lower the threshold, the default should be changed to "100000" as well.
val bloomFilterThreshold = sparkSession.conf.get("spark.chronon.backfill.bloomfilter.threshold", "1000000").toLong

sparkSession.sparkContext.setLogLevel("ERROR")
Expand Down

0 comments on commit a3050b0

Please sign in to comment.