From 9e757820af7990f37d1cb5f8cd9c989fcf815cdf Mon Sep 17 00:00:00 2001 From: Mark Grover Date: Thu, 2 Mar 2017 10:33:56 -0800 Subject: [PATCH 1/2] [SPARK-19720][CORE] Redact sensitive information from SparkSubmit console This change redacts sensitive information (based on default password and secret regex) from the Spark Submit console logs. Such sensitive information is already being redacted from event logs and yarn logs, etc. Testing was done manually to make sure that the console logs were not printing any sensitive information. Here's some output from the console: ``` Spark properties used, including those specified through --conf and those from the properties file /etc/spark2/conf/spark-defaults.conf: (spark.yarn.appMasterEnv.HADOOP_CREDSTORE_PASSWORD,*********(redacted)) (spark.authenticate,false) (spark.executorEnv.HADOOP_CREDSTORE_PASSWORD,*********(redacted)) ``` ``` System properties: (spark.yarn.appMasterEnv.HADOOP_CREDSTORE_PASSWORD,*********(redacted)) (spark.authenticate,false) (spark.executorEnv.HADOOP_CREDSTORE_PASSWORD,*********(redacted)) ``` There is a risk that if new print statements are added to the console down the road, sensitive information may still get leaked, since there is no test that asserts on the console log output. I considered it out of the scope of this JIRA to write an integration test to make sure new leaks don't happen in the future. Running unit tests to make sure nothing else is broken by this change. Using reference from Mark Grover Closes #17047 for the Spark 2.1.2 version. 
--- .../org/apache/spark/deploy/SparkSubmit.scala | 3 ++- .../spark/deploy/SparkSubmitArguments.scala | 12 +++++++++--- .../scala/org/apache/spark/util/Utils.scala | 18 ++++++++++++++++++ 3 files changed, 29 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index 443f1f5b084c..653830eac6c8 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -670,7 +670,8 @@ object SparkSubmit { if (verbose) { printStream.println(s"Main class:\n$childMainClass") printStream.println(s"Arguments:\n${childArgs.mkString("\n")}") - printStream.println(s"System properties:\n${sysProps.mkString("\n")}") + // sysProps may contain sensitive information, so redact before printing + printStream.println(s"System properties:\n${Utils.redact(sysProps).mkString("\n")}") printStream.println(s"Classpath elements:\n${childClasspath.mkString("\n")}") printStream.println("\n") } diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala index f1761e7c1ec9..b9cde3f8bfa8 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala @@ -84,9 +84,15 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S // scalastyle:off println if (verbose) SparkSubmit.printStream.println(s"Using properties file: $propertiesFile") Option(propertiesFile).foreach { filename => - Utils.getPropertiesFromFile(filename).foreach { case (k, v) => + val properties = Utils.getPropertiesFromFile(filename) + properties.foreach { case (k, v) => defaultProperties(k) = v - if (verbose) SparkSubmit.printStream.println(s"Adding default property: $k=$v") + } + // Property files may contain sensitive 
information, so redact before printing + if (verbose) { + Utils.redact(properties).foreach { case (k, v) => + SparkSubmit.printStream.println(s"Adding default property: $k=$v") + } } } // scalastyle:on println @@ -318,7 +324,7 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S | |Spark properties used, including those specified through | --conf and those from the properties file $propertiesFile: - |${sparkProperties.mkString(" ", "\n ", "\n")} + |${Utils.redact(sparkProperties.toMap).mkString(" ", "\n ", "\n")} """.stripMargin } diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 56de802c423c..8144ae827ff8 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -38,6 +38,7 @@ import scala.io.Source import scala.reflect.ClassTag import scala.util.Try import scala.util.control.{ControlThrowable, NonFatal} +import scala.util.matching.Regex import _root_.io.netty.channel.unix.Errors.NativeIoException import com.google.common.cache.{CacheBuilder, CacheLoader, LoadingCache} @@ -2571,6 +2572,23 @@ private[spark] object Utils extends Logging { sparkJars.map(_.split(",")).map(_.filter(_.nonEmpty)).toSeq.flatten } } + + private[util] val REDACTION_REPLACEMENT_TEXT = "*********(redacted)" + private[util] val SECRET_REDACTION_PATTERN = "(?i)secret|password".r + + def redact(kvs: Map[String, String]): Seq[(String, String)] = { + val redactionPattern = SECRET_REDACTION_PATTERN + redact(redactionPattern, kvs.toArray) + } + + private def redact(redactionPattern: Regex, kvs: Array[(String, String)]): Seq[(String, String)] = { + kvs.map { kv => + redactionPattern.findFirstIn(kv._1) + .map { _ => (kv._1, REDACTION_REPLACEMENT_TEXT) } + .getOrElse(kv) + } + } + } private[util] object CallerContext extends Logging { From f5362fce252e5f5a11093feff0f2727af7a657e0 Mon Sep 17 00:00:00 2001 From: Diogo Munaro 
Date: Fri, 28 Jul 2017 22:12:26 -0300 Subject: [PATCH 2/2] Fixing 100 line length code --- core/src/main/scala/org/apache/spark/util/Utils.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 8144ae827ff8..e18d4bcd45ac 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -2581,9 +2581,9 @@ private[spark] object Utils extends Logging { redact(redactionPattern, kvs.toArray) } - private def redact(redactionPattern: Regex, kvs: Array[(String, String)]): Seq[(String, String)] = { + private def redact(redactPattern: Regex, kvs: Array[(String, String)]): Seq[(String, String)] = { kvs.map { kv => - redactionPattern.findFirstIn(kv._1) + redactPattern.findFirstIn(kv._1) .map { _ => (kv._1, REDACTION_REPLACEMENT_TEXT) } .getOrElse(kv) }