Commit abc86b2

Fix tests, remove perf test in LocalTreeIntegrationSuite, use Scala transpose in LocalDecisionTreeUtils.

Changes made to fix tests:
* Return correct impurity stats for splits that achieve a gain of 0 but don't violate user-specified constraints on min info gain or min instances per node.
* Previously, ImpurityStats.impurity was set incorrectly in ImpurityStats.getInvalidImpurityStats(), requiring a correction in LearningNode.toNode. This commit fixes the issue by setting impurity = -1 directly in getInvalidImpurityStats().
Parent: 9a7174e
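
To make the new validity rules concrete, here is a minimal standalone Scala sketch of the behavior described above. Stats, evaluateSplit, and minInfoGain are hypothetical stand-ins for the Spark classes changed below, not actual Spark API:

// Hypothetical, simplified model of the split-validity rules in this commit;
// Stats and evaluateSplit are illustrative stand-ins, not the Spark classes.
case class Stats(gain: Double, impurity: Double, valid: Boolean)

def evaluateSplit(gain: Double, impurity: Double, minInfoGain: Double): Stats = {
  if (gain < minInfoGain) {
    // Constraint violated: stats are unusable, so impurity is the sentinel -1
    // (previously patched up later in LearningNode.toNode).
    Stats(Double.MinValue, impurity = -1, valid = false)
  } else if (gain <= 0) {
    // Non-positive gain that doesn't violate the constraint: keep the correct
    // impurity, but mark the split as one that should not be performed.
    Stats(gain, impurity, valid = false)
  } else {
    Stats(gain, impurity, valid = true)
  }
}

With the default minimum info gain of 0.0, a zero-gain split no longer trips the constraint check, so it reaches the second branch and keeps its true impurity while still being rejected as a split.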

6 files changed: +19 −55 lines
mllib/src/main/scala/org/apache/spark/ml/tree/Node.scala

Lines changed: 2 additions & 8 deletions
@@ -278,14 +278,8 @@ private[tree] class LearningNode(
     } else {
       assert(stats != null, "Unknown error during Decision Tree learning. Could not convert " +
         "LearningNode to Node")
-      if (stats.valid) {
-        new LeafNode(stats.impurityCalculator.predict, stats.impurity,
-          stats.impurityCalculator)
-      } else {
-        // Here we want to keep same behavior with the old mllib.DecisionTreeModel
-        new LeafNode(stats.impurityCalculator.predict, -1.0, stats.impurityCalculator)
-      }
-
+      new LeafNode(stats.impurityCalculator.predict, stats.impurity,
+        stats.impurityCalculator)
     }
   }

mllib/src/main/scala/org/apache/spark/ml/tree/impl/ImpurityUtils.scala

Lines changed: 10 additions & 4 deletions
@@ -60,14 +60,20 @@ private[impl] object ImpurityUtils {
     val rightWeight = rightCount / totalCount.toDouble

     val gain = impurity - leftWeight * leftImpurity - rightWeight * rightImpurity
-    // if information gain doesn't satisfy minimum information gain,
+    // If information gain doesn't satisfy minimum information gain,
     // then this split is invalid, return invalid information gain stats.
-    // NOTE: We check gain < metadata.minInfoGain and gain <= 0 separately as this is what the
-    // original tree training logic did.
-    if (gain < metadata.minInfoGain || gain <= 0) {
+    if (gain < metadata.minInfoGain) {
       return ImpurityStats.getInvalidImpurityStats(parentImpurityCalculator)
     }

+    // If information gain is non-positive but doesn't violate the minimum info gain constraint,
+    // return a stats object with correct values but valid = false to indicate that we should not
+    // split.
+    if (gain <= 0) {
+      return new ImpurityStats(gain, impurity, parentImpurityCalculator, leftImpurityCalculator,
+        rightImpurityCalculator, valid = false)
+    }
+
     new ImpurityStats(gain, impurity, parentImpurityCalculator,
       leftImpurityCalculator, rightImpurityCalculator)
   }
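
As a quick worked example of the gain formula in this hunk (toy numbers in standalone Scala, not Spark code): a split whose children are exactly as impure as the parent yields zero gain, which is the case that now returns correct stats with valid = false:

// Toy numbers: an even split that leaves impurity unchanged.
val (leftCount, rightCount) = (50.0, 50.0)
val totalCount = leftCount + rightCount
val (impurity, leftImpurity, rightImpurity) = (0.5, 0.5, 0.5)

val gain = impurity -
  (leftCount / totalCount) * leftImpurity -
  (rightCount / totalCount) * rightImpurity
// gain == 0.0: with the default minInfoGain of 0.0 this no longer returns
// invalid stats; it returns correct stats marked valid = false.
assert(gain == 0.0)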

mllib/src/main/scala/org/apache/spark/ml/tree/impl/LocalDecisionTreeUtils.scala

Lines changed: 1 addition & 3 deletions
@@ -51,9 +51,7 @@ private[ml] object LocalDecisionTreeUtils extends Logging {
     val numFeatures = rowStore(0).length
     require(numFeatures > 0, "Local decision tree training requires numFeatures > 0.")
     // Return the transpose of the rowStore matrix
-    0.until(numFeatures).map { colIdx =>
-      rowStore.map(row => row(colIdx))
-    }.toArray
+    rowStore.transpose
   }

 }
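
As a sanity check that the built-in transpose matches the removed manual loop, a toy standalone Scala example (illustrative data, not Spark code):

// Toy data: two rows, three features.
val rowStore: Array[Array[Int]] = Array(Array(1, 2, 3), Array(4, 5, 6))

// The manual column-extraction loop this commit removes.
val manual: Array[Array[Int]] = (0 until rowStore(0).length).map { colIdx =>
  rowStore.map(row => row(colIdx))
}.toArray

// Both yield Array(Array(1, 4), Array(2, 5), Array(3, 6)).
assert(manual.map(_.toSeq).toSeq == rowStore.transpose.map(_.toSeq).toSeq)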

mllib/src/main/scala/org/apache/spark/ml/tree/impl/RandomForest.scala

Lines changed: 2 additions & 10 deletions
@@ -638,16 +638,8 @@ private[spark] object RandomForest extends Logging {

     // For each (feature, split), calculate the gain, and select the best (feature, split).
     val splitsAndImpurityInfo =
-      validFeatureSplits.flatMap { case (featureIndexIdx, featureIndex) =>
-        val (split, stats) = SplitUtils.chooseSplit(binAggregates,
-          featureIndex, featureIndexIdx, splits)
-        // Filter out invalid splits
-        // TODO(smurching): Better to use map + filter or flatmap?
-        if (stats.valid) {
-          Seq((split, stats))
-        } else {
-          Seq.empty
-        }
+      validFeatureSplits.map { case (featureIndexIdx, featureIndex) =>
+        SplitUtils.chooseSplit(binAggregates, featureIndex, featureIndexIdx, splits)
       }

     val (bestSplit, bestSplitStats) =
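
Since invalid splits are no longer filtered out here, the selection step that follows must tolerate them. A hedged Scala sketch of that idea, with hypothetical names (CandidateStats and the data are illustrative; the actual downstream logic is not shown in this diff):

// Keep every candidate, pick the highest-gain one, and let its valid flag
// decide whether the node actually splits or becomes a leaf.
case class CandidateStats(gain: Double, valid: Boolean)

val candidates = Seq(
  "split0" -> CandidateStats(gain = 0.0, valid = false),
  "split1" -> CandidateStats(gain = 0.3, valid = true))

val (bestSplit, bestStats) = candidates.maxBy { case (_, stats) => stats.gain }
val shouldSplit = bestStats.valid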

mllib/src/main/scala/org/apache/spark/mllib/tree/model/InformationGainStats.scala

Lines changed: 4 additions & 3 deletions
@@ -75,8 +75,9 @@ class InformationGainStats(
  * @param impurityCalculator impurity statistics for current node
  * @param leftImpurityCalculator impurity statistics for left child node
  * @param rightImpurityCalculator impurity statistics for right child node
- * @param valid whether the current split satisfies minimum info gain or
- *              minimum number of instances per node
+ * @param valid whether the current split should be performed; true if split
+ *              satisfies minimum info gain, minimum number of instances per node, and
+ *              has positive info gain.
  */
 private[spark] class ImpurityStats(
     val gain: Double,
@@ -112,7 +113,7 @@ private[spark] object ImpurityStats {
    * minimum number of instances per node.
    */
   def getInvalidImpurityStats(impurityCalculator: ImpurityCalculator): ImpurityStats = {
-    new ImpurityStats(Double.MinValue, impurityCalculator.calculate(),
+    new ImpurityStats(Double.MinValue, impurity = -1,
       impurityCalculator, null, null, false)
   }

mllib/src/test/scala/org/apache/spark/ml/tree/impl/LocalTreeIntegrationSuite.scala

Lines changed: 0 additions & 27 deletions
@@ -94,31 +94,4 @@ class LocalTreeIntegrationSuite extends SparkFunSuite with MLlibTestSparkContext
     testEquivalence(df, TreeTests.allParamSettings)
   }

-  // TODO(smurching): Probably remove this (since it depends on user env). Currently fails, partly
-  // because collecting data for local training is slow but also because local training is
-  // slightly slower than distributed training.
-  // test("Local tree training is faster than distributed training on a medium-sized dataset") {
-  //   val sqlContext = spark.sqlContext
-  //   import sqlContext.implicits._
-  //   val df = LogisticRegressionDataGenerator.generateLogisticRDD(spark.sparkContext,
-  //     nexamples = 100000, nfeatures = 5, eps = 2.0, nparts = 1, probOne = 0.2)
-  //     .map(_.asML).toDF().cache()
-  //
-  //   val timer = new TimeTracker()
-  //
-  //   timer.start("local")
-  //   val localTree = setParams(new LocalDecisionTreeRegressor(), TreeTests.allParamSettings)
-  //   localTree.fit(df)
-  //   val localTrainTime = timer.stop("local")
-  //
-  //   timer.start("distributed")
-  //   val distribTree = setParams(new DecisionTreeRegressor(), TreeTests.allParamSettings)
-  //   distribTree.fit(df)
-  //   val distribTrainTime = timer.stop("distributed")
-  //
-  //   assert(localTrainTime < distribTrainTime, s"Local tree training time ($localTrainTime) " +
-  //     s"should be less than distributed tree training time ($distribTrainTime).")
-  // }
-
-
 }
