diff --git a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxBackend.scala b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxBackend.scala index 61ed9a2de17f..56d6fb2e65c2 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxBackend.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxBackend.scala @@ -200,8 +200,13 @@ object VeloxBackendSettings extends BackendSettingsApi { return None } + val fileLimit = GlutenConfig.get.parquetEncryptionValidationFileLimit val encryptionResult = - ParquetMetadataUtils.validateEncryption(format, rootPaths, serializableHadoopConf) + ParquetMetadataUtils.validateEncryption( + format, + rootPaths, + serializableHadoopConf, + fileLimit) if (encryptionResult.ok()) { None } else { diff --git a/backends-velox/src/main/scala/org/apache/gluten/utils/ParquetMetadataUtils.scala b/backends-velox/src/main/scala/org/apache/gluten/utils/ParquetMetadataUtils.scala index 9f43575cf90c..48d0629268da 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/utils/ParquetMetadataUtils.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/utils/ParquetMetadataUtils.scala @@ -46,7 +46,8 @@ object ParquetMetadataUtils { def validateEncryption( format: ReadFileFormat, rootPaths: Seq[String], - serializableHadoopConf: Option[SerializableConfiguration] + serializableHadoopConf: Option[SerializableConfiguration], + fileLimit: Int ): ValidationResult = { if (format != ParquetReadFormat || rootPaths.isEmpty) { return ValidationResult.succeeded @@ -59,7 +60,7 @@ object ParquetMetadataUtils { val fs = new Path(rootPath).getFileSystem(conf) try { val encryptionDetected = - checkForEncryptionWithLimit(fs, new Path(rootPath), conf, fileLimit = 10) + checkForEncryptionWithLimit(fs, new Path(rootPath), conf, fileLimit = fileLimit) if (encryptionDetected) { return ValidationResult.failed("Encrypted Parquet file detected.") } diff --git 
a/shims/common/src/main/scala/org/apache/gluten/config/GlutenConfig.scala b/shims/common/src/main/scala/org/apache/gluten/config/GlutenConfig.scala
index 1cdc3d552af1..e9e2dbac54a0 100644
--- a/shims/common/src/main/scala/org/apache/gluten/config/GlutenConfig.scala
+++ b/shims/common/src/main/scala/org/apache/gluten/config/GlutenConfig.scala
@@ -503,6 +503,9 @@ class GlutenConfig(conf: SQLConf) extends Logging {
 
   def autoAdjustStageFallenNodeThreshold: Double =
     getConf(AUTO_ADJUST_STAGE_RESOURCES_FALLEN_NODE_RATIO_THRESHOLD)
+
+  /** Number of Parquet files to check for encryption when validating a scan. */
+  def parquetEncryptionValidationFileLimit: Int = getConf(ENCRYPTED_PARQUET_FALLBACK_FILE_LIMIT)
 }
 
 object GlutenConfig {
@@ -2310,4 +2313,14 @@ object GlutenConfig {
         "count exceeds the total node count ratio.")
       .doubleConf
       .createWithDefault(0.5d)
+
+  val ENCRYPTED_PARQUET_FALLBACK_FILE_LIMIT =
+    buildConf("spark.gluten.sql.fallbackEncryptedParquet.limit")
+      .internal()
+      .doc("Maximum number of Parquet files checked for encryption when deciding " +
+        "whether to fall back to the Java scan.")
+      .intConf
+      .checkValue(_ > 0, "must be positive.")
+      .createWithDefault(10)
+
 }