Skip to content

Commit c2fe3a6

Browse files
jkbradley authored and mengxr committed
[SPARK-6120] [mllib] Warnings about memory in tree, ensemble model save
Issue: When the Python DecisionTree example in the programming guide is run, it runs out of Java Heap Space when using the default memory settings for the spark shell. This change prints a warning.

CC: mengxr

Author: Joseph K. Bradley <joseph@databricks.com>

Closes #4864 from jkbradley/dt-save-heap and squashes the following commits:

02e8daf [Joseph K. Bradley] fixed based on code review
7ecb1ed [Joseph K. Bradley] Added warnings about memory when calling tree and ensemble model save with too small a Java heap size
1 parent 7e53a79 commit c2fe3a6

File tree

2 files changed

+50
-4
lines changed

2 files changed

+50
-4
lines changed

mllib/src/main/scala/org/apache/spark/mllib/tree/model/DecisionTreeModel.scala

Lines changed: 25 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,7 @@ import org.json4s._
2323
import org.json4s.JsonDSL._
2424
import org.json4s.jackson.JsonMethods._
2525

26-
import org.apache.spark.SparkContext
26+
import org.apache.spark.{Logging, SparkContext}
2727
import org.apache.spark.annotation.Experimental
2828
import org.apache.spark.api.java.JavaRDD
2929
import org.apache.spark.mllib.linalg.Vector
@@ -32,6 +32,7 @@ import org.apache.spark.mllib.tree.configuration.Algo._
3232
import org.apache.spark.mllib.util.{Loader, Saveable}
3333
import org.apache.spark.rdd.RDD
3434
import org.apache.spark.sql.{DataFrame, Row, SQLContext}
35+
import org.apache.spark.util.Utils
3536

3637
/**
3738
* :: Experimental ::
@@ -115,7 +116,7 @@ class DecisionTreeModel(val topNode: Node, val algo: Algo) extends Serializable
115116
override protected def formatVersion: String = "1.0"
116117
}
117118

118-
object DecisionTreeModel extends Loader[DecisionTreeModel] {
119+
object DecisionTreeModel extends Loader[DecisionTreeModel] with Logging {
119120

120121
private[tree] object SaveLoadV1_0 {
121122

@@ -187,6 +188,28 @@ object DecisionTreeModel extends Loader[DecisionTreeModel] {
187188
val sqlContext = new SQLContext(sc)
188189
import sqlContext.implicits._
189190

191+
// SPARK-6120: We do a hacky check here so users understand why save() is failing
192+
// when they run the ML guide example.
193+
// TODO: Fix this issue for real.
194+
val memThreshold = 768
195+
if (sc.isLocal) {
196+
val driverMemory = sc.getConf.getOption("spark.driver.memory")
197+
.orElse(Option(System.getenv("SPARK_DRIVER_MEMORY")))
198+
.map(Utils.memoryStringToMb)
199+
.getOrElse(512)
200+
if (driverMemory <= memThreshold) {
201+
logWarning(s"$thisClassName.save() was called, but it may fail because of too little" +
202+
s" driver memory (${driverMemory}m)." +
203+
s" If failure occurs, try setting driver-memory ${memThreshold}m (or larger).")
204+
}
205+
} else {
206+
if (sc.executorMemory <= memThreshold) {
207+
logWarning(s"$thisClassName.save() was called, but it may fail because of too little" +
208+
s" executor memory (${sc.executorMemory}m)." +
209+
s" If failure occurs try setting executor-memory ${memThreshold}m (or larger).")
210+
}
211+
}
212+
190213
// Create JSON metadata.
191214
val metadata = compact(render(
192215
("class" -> thisClassName) ~ ("version" -> thisFormatVersion) ~

mllib/src/main/scala/org/apache/spark/mllib/tree/model/treeEnsembleModels.scala

Lines changed: 25 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,7 @@ import org.json4s._
2424
import org.json4s.JsonDSL._
2525
import org.json4s.jackson.JsonMethods._
2626

27-
import org.apache.spark.SparkContext
27+
import org.apache.spark.{Logging, SparkContext}
2828
import org.apache.spark.annotation.Experimental
2929
import org.apache.spark.api.java.JavaRDD
3030
import org.apache.spark.mllib.linalg.Vector
@@ -34,6 +34,7 @@ import org.apache.spark.mllib.tree.configuration.EnsembleCombiningStrategy._
3434
import org.apache.spark.mllib.util.{Loader, Saveable}
3535
import org.apache.spark.rdd.RDD
3636
import org.apache.spark.sql.SQLContext
37+
import org.apache.spark.util.Utils
3738

3839
/**
3940
* :: Experimental ::
@@ -250,7 +251,7 @@ private[tree] sealed class TreeEnsembleModel(
250251
def totalNumNodes: Int = trees.map(_.numNodes).sum
251252
}
252253

253-
private[tree] object TreeEnsembleModel {
254+
private[tree] object TreeEnsembleModel extends Logging {
254255

255256
object SaveLoadV1_0 {
256257

@@ -277,6 +278,28 @@ private[tree] object TreeEnsembleModel {
277278
val sqlContext = new SQLContext(sc)
278279
import sqlContext.implicits._
279280

281+
// SPARK-6120: We do a hacky check here so users understand why save() is failing
282+
// when they run the ML guide example.
283+
// TODO: Fix this issue for real.
284+
val memThreshold = 768
285+
if (sc.isLocal) {
286+
val driverMemory = sc.getConf.getOption("spark.driver.memory")
287+
.orElse(Option(System.getenv("SPARK_DRIVER_MEMORY")))
288+
.map(Utils.memoryStringToMb)
289+
.getOrElse(512)
290+
if (driverMemory <= memThreshold) {
291+
logWarning(s"$className.save() was called, but it may fail because of too little" +
292+
s" driver memory (${driverMemory}m)." +
293+
s" If failure occurs, try setting driver-memory ${memThreshold}m (or larger).")
294+
}
295+
} else {
296+
if (sc.executorMemory <= memThreshold) {
297+
logWarning(s"$className.save() was called, but it may fail because of too little" +
298+
s" executor memory (${sc.executorMemory}m)." +
299+
s" If failure occurs try setting executor-memory ${memThreshold}m (or larger).")
300+
}
301+
}
302+
280303
// Create JSON metadata.
281304
implicit val format = DefaultFormats
282305
val ensembleMetadata = Metadata(model.algo.toString, model.trees(0).algo.toString,

0 commit comments

Comments (0)