Changes from all commits
@@ -223,4 +223,13 @@ package object config {
         " bigger files.")
       .longConf
       .createWithDefault(4 * 1024 * 1024)
+
+  private[spark] val SECRET_REDACTION_PATTERN =
+    ConfigBuilder("spark.redaction.regex")
+      .doc("Regex to decide which Spark configuration properties and environment variables in " +
+        "driver and executor environments contain sensitive information. When this regex matches " +
+        "a property, its value is redacted from the environment UI and various logs like YARN " +
+        "and event logs.")
+      .stringConf
+      .createWithDefault("(?i)secret|password")
 }
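
Usage note (illustrative, not part of the diff): the value is an ordinary Java regex, so a deployment that also treats keys containing "token" as sensitive could widen the default pattern, for example:

    import org.apache.spark.SparkConf

    // Hypothetical example: extend the default "(?i)secret|password" pattern
    // so that keys containing "token" are redacted as well.
    val conf = new SparkConf()
      .set("spark.redaction.regex", "(?i)secret|password|token")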
@@ -153,7 +153,9 @@ private[spark] class EventLoggingListener(
 
   override def onTaskEnd(event: SparkListenerTaskEnd): Unit = logEvent(event)
 
-  override def onEnvironmentUpdate(event: SparkListenerEnvironmentUpdate): Unit = logEvent(event)
+  override def onEnvironmentUpdate(event: SparkListenerEnvironmentUpdate): Unit = {
+    logEvent(redactEvent(event))
+  }
 
   // Events that trigger a flush
   override def onStageCompleted(event: SparkListenerStageCompleted): Unit = {
@@ -231,6 +233,15 @@ private[spark] class EventLoggingListener(
     }
   }
 
+  private[spark] def redactEvent(
+      event: SparkListenerEnvironmentUpdate): SparkListenerEnvironmentUpdate = {
+    // "Spark Properties" entry will always exist because the map is always populated with it.
+    val redactedProps = Utils.redact(sparkConf, event.environmentDetails("Spark Properties"))
+    val redactedEnvironmentDetails = event.environmentDetails +
+      ("Spark Properties" -> redactedProps)
+    SparkListenerEnvironmentUpdate(redactedEnvironmentDetails)
+  }
+
 }
 
 private[spark] object EventLoggingListener extends Logging {
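
A side note on the "+" in redactEvent: Scala's immutable Map returns a new map with the given entry replaced, leaving every other section of the environment details untouched. A toy sketch with invented values:

    // Toy illustration of the map-update idiom used by redactEvent: only the
    // "Spark Properties" entry is replaced; other sections carry over as-is.
    val details: Map[String, Seq[(String, String)]] = Map(
      "Spark Properties" -> Seq("spark.my.password" -> "hunter2"),
      "JVM Information" -> Seq("Java Version" -> "1.8.0_112"))
    val updated = details +
      ("Spark Properties" -> Seq("spark.my.password" -> "*********(redacted)"))
    assert(updated("JVM Information") == details("JVM Information"))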
@@ -22,21 +22,17 @@ import javax.servlet.http.HttpServletRequest
 import scala.xml.Node
 
 import org.apache.spark.ui.{UIUtils, WebUIPage}
+import org.apache.spark.util.Utils
 
 private[ui] class EnvironmentPage(parent: EnvironmentTab) extends WebUIPage("") {
   private val listener = parent.listener
 
-  private def removePass(kv: (String, String)): (String, String) = {
-    if (kv._1.toLowerCase.contains("password") || kv._1.toLowerCase.contains("secret")) {
-      (kv._1, "******")
-    } else kv
-  }
-
   def render(request: HttpServletRequest): Seq[Node] = {
     val runtimeInformationTable = UIUtils.listingTable(
       propertyHeader, jvmRow, listener.jvmInformation, fixedWidth = true)
-    val sparkPropertiesTable = UIUtils.listingTable(
-      propertyHeader, propertyRow, listener.sparkProperties.map(removePass), fixedWidth = true)
+    val sparkPropertiesTable = UIUtils.listingTable(propertyHeader, propertyRow,
+      Utils.redact(parent.conf, listener.sparkProperties), fixedWidth = true)
+
     val systemPropertiesTable = UIUtils.listingTable(
       propertyHeader, propertyRow, listener.systemProperties, fixedWidth = true)
     val classpathEntriesTable = UIUtils.listingTable(
@@ -23,6 +23,7 @@ import org.apache.spark.ui._
 
 private[ui] class EnvironmentTab(parent: SparkUI) extends SparkUITab(parent, "environment") {
   val listener = parent.environmentListener
+  val conf = parent.conf
   attachPage(new EnvironmentPage(this))
 }
 
14 changes: 13 additions & 1 deletion core/src/main/scala/org/apache/spark/util/Utils.scala
Original file line number Diff line number Diff line change
@@ -55,7 +55,7 @@ import org.slf4j.Logger
 import org.apache.spark._
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.internal.Logging
-import org.apache.spark.internal.config.{DYN_ALLOCATION_INITIAL_EXECUTORS, DYN_ALLOCATION_MIN_EXECUTORS, EXECUTOR_INSTANCES}
+import org.apache.spark.internal.config._
 import org.apache.spark.network.util.JavaUtils
 import org.apache.spark.serializer.{DeserializationStream, SerializationStream, SerializerInstance}
 import org.apache.spark.util.logging.RollingFileAppender
@@ -2555,6 +2555,18 @@ private[spark] object Utils extends Logging {
       sparkJars.map(_.split(",")).map(_.filter(_.nonEmpty)).toSeq.flatten
     }
   }
+
+  private[util] val REDACTION_REPLACEMENT_TEXT = "*********(redacted)"
+
+  def redact(conf: SparkConf, kvs: Seq[(String, String)]): Seq[(String, String)] = {
+    val redactionPattern = conf.get(SECRET_REDACTION_PATTERN).r
+    kvs.map { kv =>
+      redactionPattern.findFirstIn(kv._1)
+        .map { ignore => (kv._1, REDACTION_REPLACEMENT_TEXT) }
+        .getOrElse(kv)
+    }
+  }
+
 }
 
 private[util] object CallerContext extends Logging {

Review thread on the line 'val redactionPattern = conf.get(SECRET_REDACTION_PATTERN).r':

Contributor: This is very expensive. How about a version that takes a list of tuples and redacts them?

Member (author): What part do you think is expensive? Going through all the configuration properties and matching them against the regex? If so, I agree, but that has to happen somewhere. All the callers of this function have a SparkConf they want things redacted from, so if this function accepted a list of tuples, each caller would have to run the regex check to build that list before calling redact(). Overall, unless I am missing something, I don't think we can avoid the expense.

Contributor: Compiling the regex once for every item in the list being redacted, instead of doing it once for the whole list.

Member (author): Ah, good point. Let me fix this.
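
The merged code above already reflects that fix: the pattern is compiled once per call, then applied to every key. A self-contained, runnable distillation of the same technique (object name invented; plain strings stand in for SparkConf):

    import scala.util.matching.Regex

    object RedactSketch {
      private val REDACTION_REPLACEMENT_TEXT = "*********(redacted)"

      // Compile the regex once per call (the review fix), then test every key.
      def redact(regex: String, kvs: Seq[(String, String)]): Seq[(String, String)] = {
        val pattern: Regex = regex.r
        kvs.map { kv =>
          pattern.findFirstIn(kv._1)
            .map { _ => (kv._1, REDACTION_REPLACEMENT_TEXT) }
            .getOrElse(kv)
        }
      }

      def main(args: Array[String]): Unit = {
        redact("(?i)secret|password", Seq(
          "spark.my.password" -> "hunter2",
          "spark.app.name" -> "demo")).foreach(println)
        // prints:
        // (spark.my.password,*********(redacted))
        // (spark.app.name,demo)
      }
    }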
@@ -95,6 +95,18 @@ class EventLoggingListenerSuite extends SparkFunSuite with LocalSparkContext wit
     }
   }
 
+  test("Event logging with password redaction") {
+    val key = "spark.executorEnv.HADOOP_CREDSTORE_PASSWORD"
+    val secretPassword = "secret_password"
+    val conf = getLoggingConf(testDirPath, None)
+      .set(key, secretPassword)
+    val eventLogger = new EventLoggingListener("test", None, testDirPath.toUri(), conf)
+    val envDetails = SparkEnv.environmentDetails(conf, "FIFO", Seq.empty, Seq.empty)
+    val event = SparkListenerEnvironmentUpdate(envDetails)
+    val redactedProps = eventLogger.redactEvent(event).environmentDetails("Spark Properties").toMap
+    assert(redactedProps(key) == "*********(redacted)")
+  }
+
   test("Log overwriting") {
     val logUri = EventLoggingListener.getLogPath(testDir.toURI, "test", None)
     val logPath = new URI(logUri).getPath
Review thread on the line 'val redactedProps = eventLogger.redactEvent(event).environmentDetails("Spark Properties").toMap':

Contributor: "Spark Properties" is begging to be turned into a constant somewhere...
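
A hedged sketch of that suggested cleanup (the object and constant names here are invented, not code from this PR):

    // Hypothetical follow-up: hoist the repeated magic string into a constant
    // shared by redactEvent and the tests.
    object EventLogKeys {
      val SPARK_PROPERTIES = "Spark Properties"
    }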
20 changes: 20 additions & 0 deletions core/src/test/scala/org/apache/spark/util/UtilsSuite.scala
Original file line number Diff line number Diff line change
@@ -974,4 +974,24 @@ class UtilsSuite extends SparkFunSuite with ResetSystemProperties with Logging {
 
     assert(pValue > threshold)
   }
+
+  test("redact sensitive information") {
+    val sparkConf = new SparkConf
+
+    // Set some secret keys
+    val secretKeys = Seq(
+      "spark.executorEnv.HADOOP_CREDSTORE_PASSWORD",
+      "spark.my.password",
+      "spark.my.sECreT")
+    secretKeys.foreach { key => sparkConf.set(key, "secret_password") }
+    // Set a non-secret key
+    sparkConf.set("spark.regular.property", "not_a_secret")
+
+    // Redact sensitive information
+    val redactedConf = Utils.redact(sparkConf, sparkConf.getAll).toMap
+
+    // Assert that secret information got redacted while the regular property remained the same
+    secretKeys.foreach { key => assert(redactedConf(key) === Utils.REDACTION_REPLACEMENT_TEXT) }
+    assert(redactedConf("spark.regular.property") === "not_a_secret")
+  }
 }
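
Why the mixed-case key "spark.my.sECreT" above is caught: the (?i) inline flag makes the whole pattern case-insensitive. A quick illustrative check:

    val pattern = "(?i)secret|password".r
    assert(pattern.findFirstIn("spark.my.sECreT").contains("sECreT"))  // matches, so redacted
    assert(pattern.findFirstIn("spark.regular.property").isEmpty)      // no match, left as-is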
9 changes: 9 additions & 0 deletions docs/configuration.md
Original file line number Diff line number Diff line change
@@ -356,6 +356,15 @@ Apart from these, the following properties are also available, and may be useful
     process. The user can specify multiple of these to set multiple environment variables.
   </td>
 </tr>
+<tr>
+  <td><code>spark.redaction.regex</code></td>
+  <td>(?i)secret|password</td>
+  <td>
+    Regex to decide which Spark configuration properties and environment variables in driver and
+    executor environments contain sensitive information. When this regex matches a property, its
+    value is redacted from the environment UI and various logs like YARN and event logs.
+  </td>
+</tr>
 <tr>
   <td><code>spark.python.profile</code></td>
   <td>false</td>
@@ -36,7 +36,6 @@ import org.apache.hadoop.yarn.ipc.YarnRPC
 import org.apache.hadoop.yarn.util.{ConverterUtils, Records}
 
 import org.apache.spark.{SecurityManager, SparkConf, SparkException}
-import org.apache.spark.deploy.yarn.config._
 import org.apache.spark.internal.Logging
 import org.apache.spark.internal.config._
 import org.apache.spark.launcher.YarnCommandBuilderUtils
@@ -75,7 +74,7 @@ private[yarn] class ExecutorRunnable(
     |===============================================================================
     |YARN executor launch context:
     |  env:
-    |${env.map { case (k, v) => s"    $k -> $v\n" }.mkString}
+    |${Utils.redact(sparkConf, env.toSeq).map { case (k, v) => s"    $k -> $v\n" }.mkString}
     |  command:
     |    ${commands.mkString(" \\ \n      ")}
     |
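
To visualize the effect on the YARN launch-context log, a toy sketch of the interpolation above (invented env values; assumes the default redaction pattern, which HADOOP_CREDSTORE_PASSWORD matches):

    // Toy reproduction of the log formatting, with the env already passed
    // through redaction. Only the matching key's value is replaced.
    val redactedEnv = Seq(
      "HADOOP_CREDSTORE_PASSWORD" -> "*********(redacted)",
      "JAVA_HOME" -> "/usr/java/latest")
    print(redactedEnv.map { case (k, v) => s"    $k -> $v\n" }.mkString)
    // prints:
    //     HADOOP_CREDSTORE_PASSWORD -> *********(redacted)
    //     JAVA_HOME -> /usr/java/latest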