From 9e757820af7990f37d1cb5f8cd9c989fcf815cdf Mon Sep 17 00:00:00 2001 From: Mark Grover Date: Thu, 2 Mar 2017 10:33:56 -0800 Subject: [PATCH 1/2] [SPARK-19720][CORE] Redact sensitive information from SparkSubmit console This change redacts sensitive information (based on default password and secret regex) from the Spark Submit console logs. Such sensitive information is already being redacted from event logs and yarn logs, etc. Testing was done manually to make sure that the console logs were not printing any sensitive information. Here's some output from the console: ``` Spark properties used, including those specified through --conf and those from the properties file /etc/spark2/conf/spark-defaults.conf: (spark.yarn.appMasterEnv.HADOOP_CREDSTORE_PASSWORD,*********(redacted)) (spark.authenticate,false) (spark.executorEnv.HADOOP_CREDSTORE_PASSWORD,*********(redacted)) ``` ``` System properties: (spark.yarn.appMasterEnv.HADOOP_CREDSTORE_PASSWORD,*********(redacted)) (spark.authenticate,false) (spark.executorEnv.HADOOP_CREDSTORE_PASSWORD,*********(redacted)) ``` There is a risk that if new print statements are added to the console down the road, sensitive information may still get leaked, since there is no test that asserts on the console log output. I considered it out of the scope of this JIRA to write an integration test to make sure new leaks don't happen in the future. Running unit tests to make sure nothing else is broken by this change. Using reference from Mark Grover Closes #17047 for the Spark 2.1.2 version. 
--- .../org/apache/spark/deploy/SparkSubmit.scala | 3 ++- .../spark/deploy/SparkSubmitArguments.scala | 12 +++++++++--- .../scala/org/apache/spark/util/Utils.scala | 18 ++++++++++++++++++ 3 files changed, 29 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala index 443f1f5b084c..653830eac6c8 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala @@ -670,7 +670,8 @@ object SparkSubmit { if (verbose) { printStream.println(s"Main class:\n$childMainClass") printStream.println(s"Arguments:\n${childArgs.mkString("\n")}") - printStream.println(s"System properties:\n${sysProps.mkString("\n")}") + // sysProps may contain sensitive information, so redact before printing + printStream.println(s"System properties:\n${Utils.redact(sysProps).mkString("\n")}") printStream.println(s"Classpath elements:\n${childClasspath.mkString("\n")}") printStream.println("\n") } diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala index f1761e7c1ec9..b9cde3f8bfa8 100644 --- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala +++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala @@ -84,9 +84,15 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S // scalastyle:off println if (verbose) SparkSubmit.printStream.println(s"Using properties file: $propertiesFile") Option(propertiesFile).foreach { filename => - Utils.getPropertiesFromFile(filename).foreach { case (k, v) => + val properties = Utils.getPropertiesFromFile(filename) + properties.foreach { case (k, v) => defaultProperties(k) = v - if (verbose) SparkSubmit.printStream.println(s"Adding default property: $k=$v") + } + // Property files may contain sensitive 
information, so redact before printing + if (verbose) { + Utils.redact(properties).foreach { case (k, v) => + SparkSubmit.printStream.println(s"Adding default property: $k=$v") + } } } // scalastyle:on println @@ -318,7 +324,7 @@ private[deploy] class SparkSubmitArguments(args: Seq[String], env: Map[String, S | |Spark properties used, including those specified through | --conf and those from the properties file $propertiesFile: - |${sparkProperties.mkString(" ", "\n ", "\n")} + |${Utils.redact(sparkProperties.toMap).mkString(" ", "\n ", "\n")} """.stripMargin } diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 56de802c423c..8144ae827ff8 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -38,6 +38,7 @@ import scala.io.Source import scala.reflect.ClassTag import scala.util.Try import scala.util.control.{ControlThrowable, NonFatal} +import scala.util.matching.Regex import _root_.io.netty.channel.unix.Errors.NativeIoException import com.google.common.cache.{CacheBuilder, CacheLoader, LoadingCache} @@ -2571,6 +2572,23 @@ private[spark] object Utils extends Logging { sparkJars.map(_.split(",")).map(_.filter(_.nonEmpty)).toSeq.flatten } } + + private[util] val REDACTION_REPLACEMENT_TEXT = "*********(redacted)" + private[util] val SECRET_REDACTION_PATTERN = "(?i)secret|password".r + + def redact(kvs: Map[String, String]): Seq[(String, String)] = { + val redactionPattern = SECRET_REDACTION_PATTERN + redact(redactionPattern, kvs.toArray) + } + + private def redact(redactionPattern: Regex, kvs: Array[(String, String)]): Seq[(String, String)] = { + kvs.map { kv => + redactionPattern.findFirstIn(kv._1) + .map { _ => (kv._1, REDACTION_REPLACEMENT_TEXT) } + .getOrElse(kv) + } + } + } private[util] object CallerContext extends Logging { From f5362fce252e5f5a11093feff0f2727af7a657e0 Mon Sep 17 00:00:00 2001 From: Diogo Munaro 
Date: Fri, 28 Jul 2017 22:12:26 -0300 Subject: [PATCH 2/2] Fixing 100 line length code --- core/src/main/scala/org/apache/spark/util/Utils.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala index 8144ae827ff8..e18d4bcd45ac 100644 --- a/core/src/main/scala/org/apache/spark/util/Utils.scala +++ b/core/src/main/scala/org/apache/spark/util/Utils.scala @@ -2581,9 +2581,9 @@ private[spark] object Utils extends Logging { redact(redactionPattern, kvs.toArray) } - private def redact(redactionPattern: Regex, kvs: Array[(String, String)]): Seq[(String, String)] = { + private def redact(redactPattern: Regex, kvs: Array[(String, String)]): Seq[(String, String)] = { kvs.map { kv => - redactionPattern.findFirstIn(kv._1) + redactPattern.findFirstIn(kv._1) .map { _ => (kv._1, REDACTION_REPLACEMENT_TEXT) } .getOrElse(kv) }