diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala index 1180501e8c738..6f799a542bc1e 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala @@ -462,6 +462,9 @@ private[spark] object SparkHadoopUtil { for ((key, value) <- conf.getAll if key.startsWith("spark.hadoop.")) { hadoopConf.set(key.substring("spark.hadoop.".length), value) } + if (conf.getOption("spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version").isEmpty) { + hadoopConf.set("mapreduce.fileoutputcommitter.algorithm.version", "1") + } } private def appendSparkHiveConfigs(conf: SparkConf, hadoopConf: Configuration): Unit = { diff --git a/docs/configuration.md b/docs/configuration.md index 8b6ae9d777cce..d825a589dfd31 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -1761,16 +1761,10 @@ Apart from these, the following properties are also available, and may be useful spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version - Dependent on environment + 1 The file output committer algorithm version, valid algorithm version number: 1 or 2. - Version 2 may have better performance, but version 1 may handle failures better in certain situations, - as per MAPREDUCE-4815. - The default value depends on the Hadoop version used in an environment: - 1 for Hadoop versions lower than 3.0 - 2 for Hadoop versions 3.0 and higher - It's important to note that this can change back to 1 again in the future once MAPREDUCE-7282 - is fixed and merged. + Note that version 2 may cause correctness issues such as MAPREDUCE-7282. 2.2.0