
Commit e676da1

Updated documentation for DecisionTree
1 parent 37ca845 commit e676da1

File tree: 10 files changed, +170 -70 lines changed


examples/src/main/scala/org/apache/spark/examples/mllib/DecisionTreeRunner.scala

Lines changed: 1 addition & 2 deletions
@@ -156,9 +156,8 @@ object DecisionTreeRunner {
         throw new IllegalArgumentException("Algo ${params.algo} not supported.")
     }

-    println("opt3")
     // Split into training, test.
-    val splits = examples.randomSplit(Array(1.0 - params.fracTest, params.fracTest), seed = 12345)
+    val splits = examples.randomSplit(Array(1.0 - params.fracTest, params.fracTest))
     val training = splits(0).cache()
     val test = splits(1).cache()
     val numTraining = training.count()
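Note on the second change: without an explicit seed, randomSplit draws a new random seed on each run, so the train/test split (and hence any reported metrics) is no longer reproducible. A minimal sketch of the trade-off, using only the standard RDD.randomSplit API (the helper name below is illustrative, not part of this commit):

```scala
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

// Illustrative helper, not from this commit.
def splitExamples(
    examples: RDD[LabeledPoint],
    fracTest: Double): (RDD[LabeledPoint], RDD[LabeledPoint]) = {
  // No seed argument: a random seed is chosen, so each run yields a different split.
  // Passing a fixed seed, e.g. randomSplit(weights, seed = 12345L), restores the old
  // reproducible behavior.
  val splits = examples.randomSplit(Array(1.0 - fracTest, fracTest))
  (splits(0).cache(), splits(1).cache())
}
```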

mllib/src/main/scala/org/apache/spark/mllib/tree/DecisionTree.scala

Lines changed: 35 additions & 33 deletions
@@ -28,7 +28,7 @@ import org.apache.spark.mllib.tree.configuration.Strategy
 import org.apache.spark.mllib.tree.configuration.Algo._
 import org.apache.spark.mllib.tree.configuration.FeatureType._
 import org.apache.spark.mllib.tree.configuration.QuantileStrategy._
-import org.apache.spark.mllib.tree.impl.{DecisionTreeMetadata, DTStatsAggregator, TimeTracker, TreePoint}
+import org.apache.spark.mllib.tree.impl._
 import org.apache.spark.mllib.tree.impurity.{Impurities, Impurity}
 import org.apache.spark.mllib.tree.impurity._
 import org.apache.spark.mllib.tree.model._
@@ -122,7 +122,6 @@ class DecisionTree (private val strategy: Strategy) extends Serializable with Lo
     var break = false
     while (level <= maxDepth && !break) {

-      //println(s"LEVEL $level")
       logDebug("#####################################")
       logDebug("level = " + level)
       logDebug("#####################################")
@@ -198,14 +197,12 @@ class DecisionTree (private val strategy: Strategy) extends Serializable with Lo

     logInfo("Internal timing for DecisionTree:")
     logInfo(s"$timer")
-    println(s"$timer")

     new DecisionTreeModel(topNode, strategy.algo)
   }

 }

-
 object DecisionTree extends Serializable with Logging {

   /**
@@ -456,13 +453,21 @@ object DecisionTree extends Serializable with Logging {
    * This function mimics prediction, passing an example from the root node down to a node
    * at the current level being trained; that node's index is returned.
    *
+   * @param node  Node in tree from which to classify the given data point.
+   * @param binnedFeatures  Binned feature vector for data point.
+   * @param bins  possible bins for all features, indexed (numFeatures)(numBins)
+   * @param unorderedFeatures  Set of indices of unordered features.
    * @return  Leaf index if the data point reaches a leaf.
    *          Otherwise, last node reachable in tree matching this example.
    *          Note: This is the global node index, i.e., the index used in the tree.
    *                This index is different from the index used during training a particular
    *                set of nodes in a (level, group).
    */
-  def predictNodeIndex(node: Node, binnedFeatures: Array[Int], bins: Array[Array[Bin]], unorderedFeatures: Set[Int]): Int = {
+  def predictNodeIndex(
+      node: Node,
+      binnedFeatures: Array[Int],
+      bins: Array[Array[Bin]],
+      unorderedFeatures: Set[Int]): Int = {
     if (node.isLeaf) {
       node.id
     } else {
@@ -499,15 +504,18 @@ object DecisionTree extends Serializable with Logging {
   }

   /**
-   * Helper for binSeqOp.
+   * Helper for binSeqOp, for data containing some unordered (categorical) features.
    *
-   * @param agg  Array storing aggregate calculation.
-   *             For ordered features, this is of size:
-   *               numClasses * numBins * numFeatures * numNodes.
-   *             For unordered features, this is of size:
-   *               2 * numClasses * numBins * numFeatures * numNodes.
-   * @param treePoint  Data point being aggregated.
+   * For ordered features, a single bin is updated.
+   * For unordered features, bins correspond to subsets of categories; either the left or right bin
+   * for each subset is updated.
+   *
+   * @param agg  Array storing aggregate calculation, with a set of sufficient statistics for
+   *             each (node, feature, bin).
+   * @param treePoint  Data point being aggregated.
    * @param nodeIndex  Node corresponding to treePoint. Indexed from 0 at start of (level, group).
+   * @param bins  possible bins for all features, indexed (numFeatures)(numBins)
+   * @param unorderedFeatures  Set of indices of unordered features.
    */
   def someUnorderedBinSeqOp(
       agg: DTStatsAggregator,
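To make the new someUnorderedBinSeqOp comment concrete: for an unordered categorical feature, each candidate split corresponds to a subset of categories, and the aggregator keeps a left and a right bin per subset. A self-contained sketch of the per-point update, assuming subsets are encoded as bit masks over categories (an illustration, not the exact MLlib representation):

```scala
// For one data point and one unordered feature: visit every category subset and
// update that subset's left bin if the point's category is in the subset,
// else its right bin. Bins are laid out as [left bins | right bins].
def updateUnorderedFeature(
    featureValue: Int,            // this point's category for the feature
    numSubsets: Int,              // number of candidate splits (category subsets)
    updateBin: Int => Unit): Unit = {
  var subset = 0
  while (subset < numSubsets) {
    // Treat (subset + 1) as a bit mask: bit c set means category c goes left.
    val goesLeft = (((subset + 1) >> featureValue) & 1) == 1
    updateBin(if (goesLeft) subset else numSubsets + subset)
    subset += 1
  }
}
```

Note how this matches the doc change: unlike an ordered feature (one bin touched per point), every subset's bin pair is touched once per point.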
@@ -547,15 +555,13 @@ object DecisionTree extends Serializable with Logging {
   }

   /**
-   * Helper for binSeqOp: for regression and for classification with only ordered features.
+   * Helper for binSeqOp, for regression and for classification with only ordered features.
    *
-   * Performs a sequential aggregation over a partition for regression.
-   * For l nodes, k features,
-   * the count, sum, sum of squares of one of the p bins is incremented.
+   * For each feature, the sufficient statistics of one bin are updated.
    *
-   * @param agg  Array storing aggregate calculation, updated by this function.
-   *             Size: 3 * numBins * numFeatures * numNodes
-   * @param treePoint  Data point being aggregated.
+   * @param agg  Array storing aggregate calculation, with a set of sufficient statistics for
+   *             each (node, feature, bin).
+   * @param treePoint  Data point being aggregated.
    * @param nodeIndex  Node corresponding to treePoint. Indexed from 0 at start of (level, group).
    * @return  agg
    */
@@ -582,6 +588,7 @@ object DecisionTree extends Serializable with Logging {
    * @param parentImpurities  Impurities for all parent nodes for the current level
    * @param metadata  Learning and dataset metadata
    * @param level  Level of the tree
+   * @param nodes  Array of all nodes in the tree. Used for matching data points to nodes.
    * @param splits  possible splits for all features, indexed (numFeatures)(numSplits)
    * @param bins  possible bins for all features, indexed (numFeatures)(numBins)
    * @param numGroups  total number of node groups at the current level. Default value is set to 1.
@@ -663,19 +670,12 @@ object DecisionTree extends Serializable with Logging {

   /**
    * Performs a sequential aggregation over a partition.
-   * For l nodes, k features,
-   * For classification:
-   *   Either the left count or the right count of one of the bins is
-   *   incremented based upon whether the feature is classified as 0 or 1.
-   * For regression:
-   *   The count, sum, sum of squares of one of the bins is incremented.
    *
-   * @param agg  Array storing aggregate calculation, updated by this function.
-   *             Size for classification:
-   *               Ordered features: numNodes * numFeatures * numBins.
-   *               Unordered features: (2 * numNodes) * numFeatures * numBins.
-   *             Size for regression:
-   *               numNodes * numFeatures * numBins.
+   * Each data point contributes to one node. For each feature,
+   * the aggregate sufficient statistics are updated for the relevant bins.
+   *
+   * @param agg  Array storing aggregate calculation, with a set of sufficient statistics for
+   *             each (node, feature, bin).
    * @param treePoint  Data point being aggregated.
    * @return  agg
    */
@@ -883,8 +883,10 @@ object DecisionTree extends Serializable with Logging {
         val (bestFeatureSplitIndex, bestFeatureGainStats) =
           Range(0, numSplits).map { splitIndex =>
             val featureValue = categoriesSortedByCentroid(splitIndex)._1
-            val leftChildStats = binAggregates.getImpurityCalculator(nodeFeatureOffset, featureValue)
-            val rightChildStats = binAggregates.getImpurityCalculator(nodeFeatureOffset, lastCategory)
+            val leftChildStats =
+              binAggregates.getImpurityCalculator(nodeFeatureOffset, featureValue)
+            val rightChildStats =
+              binAggregates.getImpurityCalculator(nodeFeatureOffset, lastCategory)
             rightChildStats.subtract(leftChildStats)
             val gainStats =
               calculateGainForSplit(leftChildStats, rightChildStats, nodeImpurity, level, metadata)
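For readers following predictNodeIndex: the descent it performs can be sketched with simplified stand-in types. SimpleNode and SimpleSplit below are hypothetical, not the MLlib Node/Split classes, and this version tests raw continuous values rather than binned features:

```scala
// Hypothetical stand-ins for the MLlib tree model classes.
case class SimpleSplit(feature: Int, threshold: Double)
case class SimpleNode(
    id: Int,
    split: Option[SimpleSplit],
    left: Option[SimpleNode],
    right: Option[SimpleNode]) {
  def isLeaf: Boolean = split.isEmpty
}

// Walk from the root toward the frontier being trained; return the id of the
// deepest node this example reaches (a leaf, or a node whose children don't exist yet).
def predictNodeIndexSketch(node: SimpleNode, features: Array[Double]): Int = {
  if (node.isLeaf) {
    node.id
  } else {
    val s = node.split.get
    val child = if (features(s.feature) <= s.threshold) node.left else node.right
    child match {
      case Some(c) => predictNodeIndexSketch(c, features)
      case None => node.id  // children not trained yet: last reachable node
    }
  }
}
```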

mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DTStatsAggregator.scala

Lines changed: 20 additions & 8 deletions
@@ -19,36 +19,46 @@ package org.apache.spark.mllib.tree.impl

 import org.apache.spark.mllib.tree.impurity._

-import scala.collection.mutable
-
-
 /**
- * :: Experimental ::
  * DecisionTree statistics aggregator.
  * This holds a flat array of statistics for a set of (nodes, features, bins)
  * and helps with indexing.
- * TODO: Allow views of Vector types to replace some of the code in here.
  */
 private[tree] class DTStatsAggregator(
     metadata: DecisionTreeMetadata,
     val numNodes: Int) extends Serializable {

+  /**
+   * [[ImpurityAggregator]] instance specifying the impurity type.
+   */
   val impurityAggregator: ImpurityAggregator = metadata.impurity match {
     case Gini => new GiniAggregator(metadata.numClasses)
     case Entropy => new EntropyAggregator(metadata.numClasses)
     case Variance => new VarianceAggregator()
     case _ => throw new IllegalArgumentException(s"Bad impurity parameter: ${metadata.impurity}")
   }

+  /**
+   * Number of elements (Double values) used for the sufficient statistics of each bin.
+   */
   val statsSize: Int = impurityAggregator.statsSize

   val numFeatures: Int = metadata.numFeatures

+  /**
+   * Number of bins for each feature. This is indexed by the feature index.
+   */
   val numBins: Array[Int] = metadata.numBins

-  val isUnordered: Array[Boolean] =
-    Range(0, numFeatures).map(f => metadata.unorderedFeatures.contains(f)).toArray
+  /**
+   * Indicator for each feature of whether that feature is an unordered feature.
+   * TODO: Is Array[Boolean] any faster?
+   */
+  def isUnordered(featureIndex: Int): Boolean = metadata.isUnordered(featureIndex)

+  /**
+   * Offset for each feature for calculating indices into the [[allStats]] array.
+   */
   private val featureOffsets: Array[Int] = {
     def featureOffsetsCalc(total: Int, featureIndex: Int): Int = {
       if (isUnordered(featureIndex)) {
@@ -105,8 +115,9 @@ private[tree] class DTStatsAggregator(
   def getNodeOffset(nodeIndex: Int): Int = nodeIndex * nodeStride

   /**
+   * Faster version of [[update]].
    * Update the stats for a given (node, feature, bin) for ordered features, using the given label.
-   * This uses a pre-computed node offset from [[getNodeOffset]].
+   * @param nodeOffset  Pre-computed node offset from [[getNodeOffset]].
    */
   def nodeUpdate(nodeOffset: Int, featureIndex: Int, binIndex: Int, label: Double): Unit = {
     val i = nodeOffset + featureOffsets(featureIndex) + binIndex * statsSize
@@ -137,6 +148,7 @@ private[tree] class DTStatsAggregator(
   }

   /**
+   * Faster version of [[update]].
    * Update the stats for a given (node, feature, bin), using the given label.
    * @param nodeFeatureOffset  For ordered features, this is a pre-computed (node, feature) offset
    *                           from [[getNodeFeatureOffset]].
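The indexing scheme these comments describe can be sketched as follows: all statistics live in one flat Array[Double], and a (node, feature, bin) triple maps to nodeOffset + featureOffset + binIndex * statsSize. The "faster version" methods win by letting callers precompute the node offset once per data point. Names below are illustrative, not the actual internals:

```scala
// Sketch of a flat (node, feature, bin) statistics array, assuming all features ordered.
class FlatAggregatorSketch(numNodes: Int, numBinsPerFeature: Array[Int], statsSize: Int) {
  // featureOffsets(f) = start of feature f's bins within one node's block.
  private val featureOffsets: Array[Int] =
    numBinsPerFeature.scanLeft(0)((total, nBins) => total + nBins * statsSize)
  private val nodeStride: Int = featureOffsets.last    // size of one node's block
  val allStats = new Array[Double](numNodes * nodeStride)

  def getNodeOffset(nodeIndex: Int): Int = nodeIndex * nodeStride

  // "Faster version": nodeOffset is computed once by the caller, not once per feature.
  def nodeUpdate(nodeOffset: Int, featureIndex: Int, binIndex: Int,
      statIndex: Int, value: Double): Unit = {
    allStats(nodeOffset + featureOffsets(featureIndex) + binIndex * statsSize + statIndex) += value
  }
}
```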

mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala

Lines changed: 10 additions & 20 deletions
@@ -24,30 +24,17 @@ import org.apache.spark.mllib.tree.configuration.Algo._
 import org.apache.spark.mllib.tree.configuration.QuantileStrategy._
 import org.apache.spark.mllib.tree.configuration.Strategy
 import org.apache.spark.mllib.tree.impurity.Impurity
-import org.apache.spark.mllib.tree.DecisionTree
 import org.apache.spark.rdd.RDD

-
-/*
- * TODO: Add doc about ordered vs. unordered features.
- * Ensure numBins is always greater than the categories. For multiclass classification,
- * numBins should be greater than math.pow(2, maxCategories - 1) - 1.
- * It's a limitation of the current implementation but a reasonable trade-off since features
- * with large number of categories get favored over continuous features.
- *
- * This needs to be checked here instead of in Strategy since numBins can be determined
- * by the number of training examples.
- */
-
-
 /**
  * Learning and dataset metadata for DecisionTree.
  *
  * @param numClasses    For classification: labels can take values {0, ..., numClasses - 1}.
  *                      For regression: fixed at 0 (no meaning).
+ * @param maxBins  Maximum number of bins, for all features.
  * @param featureArity  Map: categorical feature index --> arity.
  *                      I.e., the feature takes values in {0, ..., arity - 1}.
- * @param numBins  numBins(featureIndex) = number of bins for feature
+ * @param numBins  Number of bins for each feature.
  */
 private[tree] class DecisionTreeMetadata(
     val numFeatures: Int,
@@ -82,6 +69,11 @@ private[tree] class DecisionTreeMetadata(

 private[tree] object DecisionTreeMetadata {

+  /**
+   * Construct a [[DecisionTreeMetadata]] instance for this dataset and parameters.
+   * This computes which categorical features will be ordered vs. unordered,
+   * as well as the number of splits and bins for each feature.
+   */
   def buildMetadata(input: RDD[LabeledPoint], strategy: Strategy): DecisionTreeMetadata = {

     val numFeatures = input.take(1)(0).features.size
@@ -94,6 +86,9 @@ private[tree] object DecisionTreeMetadata {
     val maxPossibleBins = math.min(strategy.maxBins, numExamples).toInt
     val log2MaxPossibleBinsp1 = math.log(maxPossibleBins + 1) / math.log(2.0)

+    // We check the number of bins here against maxPossibleBins.
+    // This needs to be checked here instead of in Strategy since maxPossibleBins can be modified
+    // based on the number of training examples.
     val unorderedFeatures = new mutable.HashSet[Int]()
     val numBins = Array.fill[Int](numFeatures)(maxPossibleBins)
     if (numClasses > 2) {
@@ -104,11 +99,6 @@ private[tree] object DecisionTreeMetadata {
           unorderedFeatures.add(f)
           numBins(f) = numUnorderedBins(k)
         } else {
-          // TODO: Check the below k <= maxBins.
-          //       Checking k <= maxPossibleBins should work.
-          //       However, there may have been a 1-off error later on allocating 1 extra
-          //       (unused) bin.
-          // TODO: Allow this case, where we simply will know nothing about some categories?
           require(k <= maxPossibleBins,
             s"maxBins (= $maxPossibleBins) should be greater than max categories " +
             s"in categorical features (>= $k)")
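The require above ties back to the bin budget for categorical features. For multiclass classification, a hedged sketch of the arithmetic, assuming each non-trivial category subset gets one left and one right bin (illustrative; the exact formula lives in DecisionTreeMetadata.numUnorderedBins):

```scala
// For a categorical feature with arity k there are 2^(k-1) - 1 distinct non-trivial
// category subsets (up to complement), so about twice that many bins are needed.
def unorderedBinsNeeded(arity: Int): Int = 2 * ((1 << (arity - 1)) - 1)

// A feature can be treated as unordered only if those bins fit within maxPossibleBins.
// Example: arity 4 needs 2 * 7 = 14 bins, so maxBins must be at least 14.
def fitsAsUnordered(arity: Int, maxPossibleBins: Int): Boolean =
  unorderedBinsNeeded(arity) <= maxPossibleBins
```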

mllib/src/main/scala/org/apache/spark/mllib/tree/impurity/Entropy.scala

Lines changed: 27 additions & 0 deletions
@@ -75,6 +75,12 @@ object Entropy extends Impurity {

 }

+/**
+ * Class for updating views of a vector of sufficient statistics,
+ * in order to compute impurity from a sample.
+ * Note: Instances of this class do not hold the data; they operate on views of the data.
+ * @param numClasses  Number of classes for label.
+ */
 private[tree] class EntropyAggregator(numClasses: Int)
   extends ImpurityAggregator(numClasses) with Serializable {

@@ -102,20 +108,41 @@ private[tree] class EntropyAggregator(numClasses: Int)

 }

+/**
+ * Stores statistics for one (node, feature, bin) for calculating impurity.
+ * Unlike [[EntropyAggregator]], this class stores its own data and is for a specific
+ * (node, feature, bin).
+ * @param stats  Array of sufficient statistics for a (node, feature, bin).
+ */
 private[tree] class EntropyCalculator(stats: Array[Double]) extends ImpurityCalculator(stats) {

+  /**
+   * Make a deep copy of this [[ImpurityCalculator]].
+   */
   def copy: EntropyCalculator = new EntropyCalculator(stats.clone())

+  /**
+   * Calculate the impurity from the stored sufficient statistics.
+   */
   def calculate(): Double = Entropy.calculate(stats, stats.sum)

+  /**
+   * Number of data points accounted for in the sufficient statistics.
+   */
   def count: Long = stats.sum.toLong

+  /**
+   * Prediction which should be made based on the sufficient statistics.
+   */
   def predict: Double = if (count == 0) {
     0
   } else {
     indexOfLargestArrayElement(stats)
   }

+  /**
+   * Probability of the label given by [[predict]].
+   */
   override def prob(label: Double): Double = {
     val lbl = label.toInt
     require(lbl < stats.length,