Merge branch 'master' into serena/addTranslator

microsoft · Jul 13, 2021 · 82f026e · 82f026e
2 parents 79b324b + d287be6
commit 82f026e
Show file tree

Hide file tree

Showing 23 changed files with 1,171 additions and 591 deletions.
diff --git a/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/CognitiveServiceBase.scala b/cognitive/src/main/scala/com/microsoft/ml/spark/cognitive/CognitiveServiceBase.scala
@@ -230,7 +230,7 @@ trait HasSetLinkedService extends Wrappable with HasURL with HasSubscriptionKey
   }
 
   def setLinkedService(v: String): this.type = {
-    val classPath = "mssparkutils.CognitiveServiceUtils"
+    val classPath = "mssparkutils.cognitiveService"
     val linkedServiceClass = ScalaClassLoader(getClass.getClassLoader).tryToLoadClass(classPath)
     val endpointMethod = linkedServiceClass.get.getMethod("getEndpoint", v.getClass)
     val keyMethod = linkedServiceClass.get.getMethod("getKey", v.getClass)

diff --git a/core/src/main/scala/com/microsoft/ml/spark/core/utils/ClusterUtil.scala b/core/src/main/scala/com/microsoft/ml/spark/core/utils/ClusterUtil.scala
@@ -104,8 +104,8 @@ object ClusterUtil {
     }
   }
 
-  def getDriverHost(dataset: Dataset[_]): String = {
-    val blockManager = BlockManagerUtils.getBlockManager(dataset)
+  def getDriverHost(spark: SparkSession): String = {
+    val blockManager = BlockManagerUtils.getBlockManager(spark)
     blockManager.master.getMemoryStatus.toList.flatMap({ case (blockManagerId, _) =>
       if (blockManagerId.executorId == "driver") Some(getHostToIP(blockManagerId.host))
       else None
@@ -120,11 +120,11 @@ object ClusterUtil {
   }
 
   /** Returns a list of executor id and host.
-    * @param dataset The dataset containing the current spark session.
+    * @param spark The current spark session.
     * @return List of executors as an array of (id,host).
     */
-  def getExecutors(dataset: Dataset[_]): Array[(Int, String)] = {
-    val blockManager = BlockManagerUtils.getBlockManager(dataset)
+  def getExecutors(spark: SparkSession): Array[(Int, String)] = {
+    val blockManager = BlockManagerUtils.getBlockManager(spark)
     blockManager.master.getMemoryStatus.toList.flatMap({ case (blockManagerId, _) =>
       if (blockManagerId.executorId == "driver") None
       else Some((blockManagerId.executorId.toInt, getHostToIP(blockManagerId.host)))
@@ -142,35 +142,33 @@ object ClusterUtil {
     * @param numTasksPerExec The number of tasks per executor.
     * @return The number of executors * number of tasks.
     */
-  def getNumExecutorTasks(dataset: Dataset[_], numTasksPerExec: Int, log: Logger): Int = {
-    val executors = getExecutors(dataset)
+  def getNumExecutorTasks(spark: SparkSession, numTasksPerExec: Int, log: Logger): Int = {
+    val executors = getExecutors(spark)
     log.info(s"Retrieving executors...")
     if (!executors.isEmpty) {
       log.info(s"Retrieved num executors ${executors.length} with num tasks per executor $numTasksPerExec")
       executors.length * numTasksPerExec
     } else {
       log.info(s"Could not retrieve executors from blockmanager, trying to get from configuration...")
-      val master = dataset.sparkSession.sparkContext.master
+      val master = spark.sparkContext.master
+
+      //TODO make this less brittle
       val rx = "local(?:\\[(\\*|\\d+)(?:,\\d+)?\\])?".r
       master match {
-        case rx(null)  => {
+        case rx(null)  =>
           log.info(s"Retrieved local() = 1 executor by default")
           1
-        }
-        case rx("*")   => {
+        case rx("*")   =>
           log.info(s"Retrieved local(*) = ${Runtime.getRuntime.availableProcessors()} executors")
           Runtime.getRuntime.availableProcessors()
-        }
-        case rx(cores) => {
+        case rx(cores) =>
           log.info(s"Retrieved local(cores) = $cores executors")
           cores.toInt
-        }
-        case _         => {
-          val numExecutors = BlockManagerUtils.getBlockManager(dataset)
+        case _         =>
+          val numExecutors = BlockManagerUtils.getBlockManager(spark)
             .master.getMemoryStatus.size
           log.info(s"Using default case = $numExecutors executors")
           numExecutors
-        }
       }
     }
   }

diff --git a/core/src/main/scala/com/microsoft/ml/spark/stages/PartitionConsolidator.scala b/core/src/main/scala/com/microsoft/ml/spark/stages/PartitionConsolidator.scala
@@ -137,8 +137,3 @@ class Consolidator[T] {
 
 }
 
-trait LocalAggregator[T] {
-  def prep(iter: Iterator[Row]): T
-
-  def merge(ts: Seq[T]): T
-}
diff --git a/core/src/main/scala/org/apache/spark/injections/BlockManagerUtils.scala b/core/src/main/scala/org/apache/spark/injections/BlockManagerUtils.scala
@@ -3,16 +3,16 @@
 
 package org.apache.spark.injections
 
-import org.apache.spark.sql.Dataset
+import org.apache.spark.sql.SparkSession
 import org.apache.spark.storage.BlockManager
 
 object BlockManagerUtils {
-  /** Returns the block manager from the dataframe's spark context.
+  /** Returns the block manager from the spark session's spark context.
     *
-    * @param data The dataframe to get the block manager from.
+    * @param spark The spark session to get the block manager from.
     * @return The block manager.
     */
-  def getBlockManager(data: Dataset[_]): BlockManager = {
-    data.sparkSession.sparkContext.env.blockManager
+  def getBlockManager(spark: SparkSession): BlockManager = {
+    spark.sparkContext.env.blockManager
   }
 }