
feat: new LIME and KernelSHAP explainers #1077

Merged Jun 18, 2021 (87 commits)

Changes from all commits

Commits
830336e
Adding LassoRegression and LeastSquaresRegression, both supporting fi…
memoryz May 22, 2021
b5d6130
Speeding up lasso regression
memoryz May 23, 2021
a161fb0
Simpify the recurse function
memoryz May 23, 2021
370dad8
Add code comment
memoryz May 23, 2021
246a5a3
Fixing r-squared calculation; add loss to result.
memoryz May 23, 2021
ac3af96
WIP: LIME
memoryz May 26, 2021
1080899
WIP: LIME tabular
memoryz May 27, 2021
c30c5cd
Unit tests: Simple regression model with continuous variables working
memoryz May 27, 2021
df878e3
Unit tests: Categorical variable working
memoryz May 27, 2021
9cc5a7e
Outputting fitting metrics r2
memoryz May 27, 2021
6e3234c
Vector lime implementation
memoryz May 27, 2021
24e4b04
Adding unit tests for vector lime
memoryz May 27, 2021
f666ef8
Clean up and reorganize classes
memoryz May 28, 2021
d270f2b
LIME image sampler
memoryz May 28, 2021
f6d9a5a
WIP: Image LIME
memoryz May 29, 2021
be65511
Image LIME: unit test passing
memoryz May 29, 2021
7ed4de8
Efficiency fix for ImageLIME
memoryz May 30, 2021
2c22da5
Validate input schema
memoryz May 30, 2021
14c47f4
More refactoring
memoryz May 30, 2021
ba64847
WIP: TextLIME
memoryz May 30, 2021
069569f
Unit test for Text LIME
memoryz May 30, 2021
267e3d1
Add support for binary type images
memoryz May 30, 2021
f8c8ab4
Reorganize classes
memoryz May 30, 2021
dd7e5c6
More restructuring
memoryz May 30, 2021
6295659
renames
memoryz May 31, 2021
10216e9
KernelSHAP sampler
memoryz May 31, 2021
7de5756
WIP: TabularSHAP
memoryz May 31, 2021
cf861f7
Unit test for KernelSHAP tabular
memoryz Jun 1, 2021
c3e0a7c
KernelSHAP Vector Sampler
memoryz Jun 1, 2021
06a3a5c
WIP: Vector SHAP
memoryz Jun 1, 2021
4e6d0a2
Unit test for vector kernel shap
memoryz Jun 1, 2021
1aaed01
Simplify sampler class structure
memoryz Jun 2, 2021
c34a9ec
Restructure the samplers
memoryz Jun 2, 2021
ce5c101
Unit tests for image and text samplers
memoryz Jun 2, 2021
1091482
ImageSHAP and TextSHAP
memoryz Jun 2, 2021
8d8efcd
ImageLIME unit test
memoryz Jun 2, 2021
39fef38
Unit test for TextSHAP
memoryz Jun 2, 2021
7cda288
Logging
memoryz Jun 2, 2021
6ca7a50
pyspark layer
memoryz Jun 4, 2021
12a89e7
change LocalExplainer to internal class
memoryz Jun 4, 2021
c9ad1ad
Bug fix
memoryz Jun 4, 2021
fcd0b61
Bug fix
memoryz Jun 4, 2021
a972dba
Bug fix for tabular LIME categorical features
memoryz Jun 4, 2021
6dd2d77
Update copyright
memoryz Jun 4, 2021
caff75f
Explainers inherit from Transformer, implements readable/writable traits
memoryz Jun 6, 2021
7b1634e
Performance: Repartition samples
memoryz Jun 7, 2021
00fd239
JDK 1.8 compatible
memoryz Jun 7, 2021
c799722
Support multiple explain targets
memoryz Jun 7, 2021
d15f3be
Return coefficients as array of row vectors rather than matrix
memoryz Jun 8, 2021
9800d02
Performance: join hint for LHS of the join
memoryz Jun 9, 2021
a6cea8a
Adding explainers unit tests to CI pipeline
memoryz Jun 9, 2021
bd74c17
Style fix
memoryz Jun 9, 2021
d0b0953
Reorganize the tests and add fuzzing test support
memoryz Jun 9, 2021
f772374
Unit test issues
memoryz Jun 10, 2021
a54692b
More unit test fixes
memoryz Jun 10, 2021
1bbacfd
More unit test and style fix
memoryz Jun 10, 2021
e2cc08d
Style fix
memoryz Jun 10, 2021
bdcdd47
clean up
memoryz Jun 10, 2021
155b4eb
unit test OOM issue
memoryz Jun 10, 2021
4a910ea
oom issue
memoryz Jun 10, 2021
89f59fa
debugging python test failure
memoryz Jun 10, 2021
2a0e449
Rename test suites to avoid name conflicts with old classes
memoryz Jun 10, 2021
9691bd2
Change backgroundData to DataFrameParam and remove unneeded python wr…
memoryz Jun 10, 2021
032157d
Change to transformer fuzzing
memoryz Jun 10, 2021
17fac69
Set mini batch size to 1 to prevent OOM in unit test
memoryz Jun 10, 2021
fccdc21
OOM in unit test
memoryz Jun 11, 2021
f151952
Excluding SerializationFuzzing for SHAP suites due to error caused by…
memoryz Jun 11, 2021
c245975
Addressing code review comments
memoryz Jun 11, 2021
a7504ae
Code review feedback
memoryz Jun 11, 2021
6b86149
Code review feedback
memoryz Jun 12, 2021
54e8b1e
code review comments
memoryz Jun 12, 2021
ba73b50
code review feedbacks
memoryz Jun 12, 2021
a6ae582
more...
memoryz Jun 12, 2021
00aa703
more...
memoryz Jun 12, 2021
75ee15f
sort
memoryz Jun 12, 2021
9dc3ca8
Rename Spark vector imports
memoryz Jun 14, 2021
5f85475
use string constants
memoryz Jun 15, 2021
c4a9c43
Change regression base to support sparse vector as well
memoryz Jun 15, 2021
a85b879
Clean up printlns
memoryz Jun 15, 2021
77f0e3e
background dataframe should be mandatory.
memoryz Jun 15, 2021
392ac1e
Extracting slicer function
memoryz Jun 15, 2021
3106557
WIP: Rewrite sampler for kernel SHAP
memoryz Jun 16, 2021
352defd
Rewrite tabular LIME sampler to support non-numerial types
memoryz Jun 17, 2021
bf10732
Add file header, fixing unit tests.
memoryz Jun 17, 2021
8bc3ce3
Add header
memoryz Jun 17, 2021
df7f736
Add unit test to compare shap explainer with kernel explainer from ht…
memoryz Jun 17, 2021
1364d30
Fixing unit test
memoryz Jun 17, 2021
2 changes: 2 additions & 0 deletions pipeline.yaml
@@ -347,6 +347,8 @@ jobs:
PACKAGE: "core"
downloader:
PACKAGE: "downloader"
explainers:
PACKAGE: "explainers"
featurize:
PACKAGE: "featurize"
image:
10 changes: 7 additions & 3 deletions src/main/scala/com/microsoft/ml/spark/codegen/Wrappable.scala
@@ -6,12 +6,12 @@ package com.microsoft.ml.spark.codegen
import java.lang.reflect.ParameterizedType
import java.nio.charset.StandardCharsets
import java.nio.file.Files

import com.microsoft.ml.spark.core.env.FileUtilities
import com.microsoft.ml.spark.core.serialize.ComplexParam
import org.apache.spark.ml.evaluation.Evaluator
import org.apache.spark.ml.param._
import org.apache.spark.ml.{Estimator, Model, Transformer}
import org.apache.commons.lang.StringEscapeUtils

import scala.collection.Iterator.iterate
import scala.reflect.ClassTag
@@ -119,7 +119,7 @@ trait PythonWrappable extends BaseWrappable {
}
}

protected lazy val pyInheritedClasses =
protected lazy val pyInheritedClasses: Seq[String] =
Seq("ComplexParamsMixin", "JavaMLReadable", "JavaMLWritable", pyObjectBaseClass)

// TODO add default values
@@ -134,10 +134,14 @@ trait PythonWrappable extends BaseWrappable {
|""".stripMargin
}

private def escape(raw: String): String = {
StringEscapeUtils.escapeJava(raw)
}

protected lazy val pyParamsDefinitions: String = {
this.params.map { p =>
val typeConverterString = getParamInfo(p).pyTypeConverter.map(", typeConverter=" + _).getOrElse("")
s"""|${p.name} = Param(Params._dummy(), "${p.name}", "${p.doc}"$typeConverterString)
s"""|${p.name} = Param(Params._dummy(), "${p.name}", "${escape(p.doc)}"$typeConverterString)
Collaborator comment: TY!
|""".stripMargin
}.mkString("\n")
}
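The `escape` helper above exists because a param's `doc` string may contain quotes, backslashes, or newlines that would otherwise break the generated Python `Param` definition. A minimal Python sketch of the same idea (the `escape_java` helper here is hypothetical, standing in for `StringEscapeUtils.escapeJava`):

```python
def escape_java(raw: str) -> str:
    # Escape characters that would terminate or corrupt a double-quoted
    # string literal in generated source code.
    replacements = {"\\": "\\\\", '"': '\\"', "\n": "\\n", "\r": "\\r", "\t": "\\t"}
    return "".join(replacements.get(ch, ch) for ch in raw)

# A doc string with embedded quotes and a newline:
doc = 'Threshold for the "positive" label.\nDefault: 0.5'
definition = f'threshold = Param(Params._dummy(), "threshold", "{escape_java(doc)}")'
# The generated definition is now a single, valid source line.
```

Without the escaping step, the embedded quote would close the string literal early in the generated code.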
@@ -7,8 +7,6 @@ import org.apache.spark.ml.linalg.{DenseVector, SparseVector}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.StructType

import scala.collection.mutable

/** Contains methods for manipulating spark dataframes and datasets. */
object DatasetExtensions {

Expand All @@ -20,8 +18,7 @@ object DatasetExtensions {
* @return The unused column name.
*/
def withDerivativeCol(prefix: String): String = {
val columnNamesSet = mutable.HashSet(df.columns: _*)
findUnusedColumnName(prefix)(columnNamesSet)
findUnusedColumnName(prefix)(df.columns.toSet)
}

/** Gets the column values as the given type.
Expand All @@ -36,12 +33,12 @@ object DatasetExtensions {
/** Gets the spark sparse vector column.
* @return The spark sparse vector column.
*/
def getSVCol: String => Seq[SparseVector] = getColAs[SparseVector] _
def getSVCol: String => Seq[SparseVector] = getColAs[SparseVector]

/** Gets the spark dense vector column.
* @return The spark dense vector column.
*/
def getDVCol: String => Seq[DenseVector] = getColAs[DenseVector] _
def getDVCol: String => Seq[DenseVector] = getColAs[DenseVector]
}

/** Finds an unused column name given initial column name and a list of existing column names.
Expand All @@ -51,13 +48,8 @@ object DatasetExtensions {
* @return The unused column name.
*/
def findUnusedColumnName(prefix: String)(columnNames: scala.collection.Set[String]): String = {
var counter = 2
var unusedColumnName = prefix
while (columnNames.contains(unusedColumnName)) {
unusedColumnName += "_" + counter
counter += 1
}
unusedColumnName
val stream = Iterator(prefix) ++ Iterator.from(1, 1).map(prefix + "_" + _)
Collaborator comment: 😎
stream.dropWhile(columnNames.contains).next()
}

def findUnusedColumnName(prefix: String, schema: StructType): String = {
Expand All @@ -67,5 +59,4 @@ object DatasetExtensions {
def findUnusedColumnName(prefix: String, df: Dataset[_]): String = {
findUnusedColumnName(prefix, df.schema)
}

}
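The rewritten `findUnusedColumnName` replaces the mutable counter loop with a lazy stream of candidates. The same idea in Python (a sketch, not the library's API):

```python
from itertools import chain, count

def find_unused_column_name(prefix, column_names):
    # Lazily generate prefix, prefix_1, prefix_2, ... and take the first
    # candidate that is not already used as a column name.
    candidates = chain([prefix], (f"{prefix}_{i}" for i in count(1)))
    return next(c for c in candidates if c not in column_names)
```

Note that the old loop mutated `unusedColumnName` in place (`unusedColumnName += "_" + counter`), so repeated collisions would accumulate suffixes like `prefix_2_3`; the stream version produces a clean `prefix_N` candidate each time.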
@@ -22,7 +22,7 @@ object ModelEquality {
}

def assertEqual(m1: PipelineStage, m2: PipelineStage): Unit = {
assert(m1.getClass === m2.getClass)
assert(m1.getClass === m2.getClass, s"${m1.getClass} != ${m2.getClass}, assertion failed.")
val m1Params = m1.extractParamMap().toSeq.map(pp => pp.param.name).toSet
val m2Params = m2.extractParamMap().toSeq.map(pp => pp.param.name).toSet
assert(m1Params === m2Params)
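The added message makes a class mismatch self-describing instead of a bare assertion failure. The same pattern could be applied to the param-name comparison below it; a Python sketch (helper name hypothetical):

```python
def assert_same_params(left, right):
    # Compare two param-name sets and report exactly which names differ,
    # rather than failing with an opaque "assertion failed".
    left, right = set(left), set(right)
    assert left == right, (
        f"param mismatch: only-left={sorted(left - right)}, "
        f"only-right={sorted(right - left)}"
    )
```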
@@ -6,10 +6,13 @@ package com.microsoft.ml.spark.core.utils
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.GenericRow

// This class currently has no usage. Should we just remove it?
@deprecated("This is a copy of Row.merge function from Spark, which was marked deprecated.", "1.0.0-rc3")
Collaborator comment: yes we can remove
class RowUtils {

//TODO Deprecate later
def merge(rows: Row*): Row = {
Row.merge()
new GenericRow(rows.flatMap(_.toSeq).toArray)
}
}
@@ -0,0 +1,62 @@
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.core.utils

import org.apache.spark.injections.UDFUtils
import org.apache.spark.ml.linalg.SQLDataTypes.VectorType
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.types._

private[spark] object SlicerFunctions {
private def slice[T](values: Int => T, indices: Seq[Int])(implicit num: Numeric[_]): Vector = {
val n = num.asInstanceOf[Numeric[T]]
Vectors.dense(indices.map(values.apply).map(n.toDouble).toArray)
}

private val DataTypeToNumericMap: Map[NumericType, Numeric[_]] = Map(
FloatType -> implicitly[Numeric[Float]],
DoubleType -> implicitly[Numeric[Double]],
ByteType -> implicitly[Numeric[Byte]],
ShortType -> implicitly[Numeric[Short]],
IntegerType -> implicitly[Numeric[Int]],
LongType -> implicitly[Numeric[Long]]
)

/**
* A UDF that takes a vector, and a seq of indices. The function slices the given vector at given indices,
* and returns the result in a Vector.
*/
def vectorSlicer: UserDefinedFunction = {
implicit val num: Numeric[_] = DataTypeToNumericMap(DoubleType)
UDFUtils.oldUdf(
(v: Vector, indices: Seq[Int]) => slice(v.apply, indices),
VectorType
)
}

/**
* A UDF that takes an array of numeric types, and a seq of indices.
* The function slices the given array at given indices, and returns the result in a Vector.
*/
def arraySlicer(elementType: NumericType): UserDefinedFunction = {
implicit val num: Numeric[_] = DataTypeToNumericMap(elementType)
UDFUtils.oldUdf(
(v: Seq[Any], indices: Seq[Int]) => slice(v.apply, indices),
VectorType
)
}

/**
* A UDF that takes a map of integer keys and numeric values, and a seq of keys.
* The function slices the given map at the given keys, and returns the result in a Vector.
*/
def mapSlicer(valueType: NumericType): UserDefinedFunction = {
implicit val num: Numeric[_] = DataTypeToNumericMap(valueType)
UDFUtils.oldUdf(
(m: Map[Int, Any], indices: Seq[Int]) => slice(m.apply, indices),
VectorType
)
}
}
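All three UDFs above reduce to the same operation: look up each requested index in an indexable container and pack the results into a dense vector. A plain-Python sketch of that shared `slice` logic, with no Spark types involved:

```python
def slice_values(values, indices):
    # `values` can be anything indexable by int: a list (array column),
    # a dict with int keys (map column), or a vector lookup.
    # Results are converted to float, mirroring Numeric.toDouble,
    # and returned in the order the indices were requested.
    return [float(values[i]) for i in indices]
```

Missing indices raise, just as slicing past the end of a Spark array or looking up an absent map key would fail in the UDF.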
@@ -0,0 +1,36 @@
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.explainers

import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV, DenseMatrix => BDM}
import org.apache.spark.ml.linalg.{Vector, Vectors, Matrix, Matrices}

object BreezeUtils {
implicit class SparkVectorCanConvertToBreeze(sv: Vector) {
def toBreeze: BDV[Double] = {
BDV(sv.toArray)
}
}

implicit class SparkMatrixCanConvertToBreeze(mat: Matrix) {
def toBreeze: BDM[Double] = {
BDM(mat.rowIter.map(_.toBreeze).toArray: _*)
}
}

implicit class BreezeVectorCanConvertToSpark(bv: BV[Double]) {
def toSpark: Vector = {
bv match {
case v: BDV[Double] => Vectors.dense(v.toArray)
case v: BSV[Double] => Vectors.sparse(v.size, v.activeIterator.toSeq).compressed
}
}
}

implicit class BreezeMatrixCanConvertToSpark(bm: BDM[Double]) {
def toSpark: Matrix = {
Matrices.dense(bm.rows, bm.cols, bm.data)
}
}
}
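The `.compressed` call in `BreezeVectorCanConvertToSpark` lets Spark keep whichever of the dense or sparse encodings is smaller. A stdlib-only Python sketch of the two encodings and the round trip between them (no Breeze or Spark involved; function names are illustrative):

```python
def to_sparse(dense):
    # Keep only the non-zero entries, as (size, [(index, value), ...]),
    # analogous to Vectors.sparse(size, activeEntries).
    return len(dense), [(i, v) for i, v in enumerate(dense) if v != 0.0]

def to_dense(size, active):
    # Expand the active entries back into a full vector of zeros.
    out = [0.0] * size
    for i, v in active:
        out[i] = v
    return out
```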
@@ -0,0 +1,64 @@
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.explainers

import breeze.numerics.abs
import breeze.stats.distributions.{RandBasis, Uniform}

private[explainers] trait FeatureStats[T] {

def getRandomState(instance: T)(implicit randBasis: RandBasis): Double

def sample(state: Double): T

def getDistance(instance: T, sample: T): Double
}

private[explainers] final case class ContinuousFeatureStats(stddev: Double)
extends FeatureStats[Double] {
override def getRandomState(instance: Double)(implicit randBasis: RandBasis): Double = {
randBasis.gaussian(instance, this.stddev).sample
}

override def sample(state: Double): Double = {
state
}

override def getDistance(instance: Double, sample: Double): Double = {
if (this.stddev == 0d) {
0d
} else {
// Normalize by stddev
abs(sample - instance) / this.stddev
}
}
}

private[explainers] final case class DiscreteFeatureStats[V](freq: Map[V, Double])
extends FeatureStats[V] {

/**
* Returns the cumulative density function (CDF) of the given frequency table.
*/
private def cdf[T](freq: Seq[(T, Double)]): Seq[(T, Double)] = {
freq.map(_._1) zip freq.map(_._2).scanLeft(0d)(_ + _).drop(1)
}

private lazy val cdfTable: Seq[(V, Double)] = {
val freq = this.freq.toSeq
cdf(freq)
}

override def getRandomState(instance: V)(implicit randBasis: RandBasis): Double = {
Uniform(0d, freq.values.sum).sample
}

override def sample(state: Double): V = {
cdfTable.find(state <= _._2).get._1
}

override def getDistance(instance: V, sample: V): Double = {
if (instance == sample) 0d else 1d
}
}
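`DiscreteFeatureStats` samples a categorical value by drawing a uniform state in `[0, total_weight)` and walking the cumulative table to the first entry whose cumulative weight covers the state. A Python sketch of the same inverse-CDF sampling:

```python
import random
from itertools import accumulate

def build_cdf(freq):
    # freq: list of (value, weight) pairs.
    # Returns [(value, cumulative_weight), ...], like cdfTable above.
    values = [v for v, _ in freq]
    cums = list(accumulate(w for _, w in freq))
    return list(zip(values, cums))

def sample_discrete(freq, rng=random.random):
    # Draw a uniform state, then find the first entry whose
    # cumulative weight reaches it (cdfTable.find(state <= _._2)).
    cdf = build_cdf(freq)
    total = cdf[-1][1]
    state = rng() * total
    return next(v for v, c in cdf if state <= c)
```

Values with larger weights occupy wider intervals of the cumulative range, so they are drawn proportionally more often; the 0/1 `getDistance` simply records whether the sampled category changed.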
@@ -0,0 +1,29 @@
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.explainers

import com.microsoft.ml.spark.lime.{HasCellSize, HasModifier, SuperpixelTransformer}
import org.apache.spark.ml.param.shared.HasInputCol
import org.apache.spark.sql.DataFrame

/**
* Common preprocessing logic for image explainers
*/
trait ImageExplainer {
self: LocalExplainer
with HasCellSize
with HasModifier
with HasInputCol
with HasSuperpixelCol =>

protected override def preprocess(df: DataFrame): DataFrame = {
// Dataframe with new column containing superpixels (Array[Cluster]) for each row (image to explain)
new SuperpixelTransformer()
.setCellSize(getCellSize)
.setModifier(getModifier)
.setInputCol(getInputCol)
.setOutputCol(getSuperpixelCol)
.transform(df)
}
}