@@ -23,7 +23,6 @@ import java.util.Arrays.binarySearch
2323import scala .collection .JavaConverters ._
2424import scala .collection .mutable .ArrayBuffer
2525
26- import org .apache .commons .math3 .util .Precision
2726import org .json4s ._
2827import org .json4s .JsonDSL ._
2928import org .json4s .jackson .JsonMethods ._
@@ -272,8 +271,8 @@ class IsotonicRegression private (private var isotonic: Boolean) extends Seriali
272271 * @param input RDD of tuples (label, feature, weight) where label is dependent variable
273272 * for which we calculate isotonic regression, feature is independent variable
274273 * and weight represents number of measures with default 1.
275- * If multiple labels share the same feature value then they are ordered before
276- * the algorithm is executed.
274+ * If multiple labels share the same feature value then they are aggregated using
275+ * the weighted average before the algorithm is executed.
277276 * @return Isotonic regression model.
278277 */
279278 @ Since (" 1.3.0" )
@@ -298,8 +297,8 @@ class IsotonicRegression private (private var isotonic: Boolean) extends Seriali
298297 * @param input JavaRDD of tuples (label, feature, weight) where label is dependent variable
299298 * for which we calculate isotonic regression, feature is independent variable
300299 * and weight represents number of measures with default 1.
301- * If multiple labels share the same feature value then they are ordered before
302- * the algorithm is executed.
300+ * If multiple labels share the same feature value then they are aggregated using
301+ * the weighted average before the algorithm is executed.
303302 * @return Isotonic regression model.
304303 */
305304 @ Since (" 1.3.0" )
@@ -310,21 +309,14 @@ class IsotonicRegression private (private var isotonic: Boolean) extends Seriali
310309 /**
311310 * Aggregates points of duplicate feature values into a single point using as label the weighted
312311 * average of the labels of the points with duplicate feature values. All points for a unique
313- * feature values are aggregated as:
312+ * feature value are aggregated as:
314313 *
315- * - Aggregated label is the weighted average of all labels
316- * - Aggregated feature is the weighted average of all equal features[1]
317- * - Aggregated weight is the sum of all weights
314+ * - Aggregated label is the weighted average of all labels.
315+ * - Aggregated feature is the unique feature value.
316+ * - Aggregated weight is the sum of all weights.
318317 *
319- * [1] Note: It is possible that feature values to be equal up to a resolution due to
320- * representation errors, since we cannot know which feature value to use in that case, we
321- * compute the weighted average of the features. Ideally, all feature values will be equal and
322- * the weighted average is just the value at any point.
323- *
324- * @param input
325- * Input data of tuples (label, feature, weight). Weights must be non-negative.
326- * @return
327- * Points with unique feature values.
318+ * @param input Input data of tuples (label, feature, weight). Weights must be non-negative.
319+ * @return Points with unique feature values.
328320 */
329321 private [regression] def makeUnique (
330322 input : Array [(Double , Double , Double )]): Array [(Double , Double , Double )] = {
@@ -339,28 +331,28 @@ class IsotonicRegression private (private var isotonic: Boolean) extends Seriali
339331 if (cleanInput.length <= 1 ) {
340332 cleanInput
341333 } else {
342- // whether or not two double features are equal up to a precision
343- @ inline def areEqual (a : Double , b : Double ): Boolean = Precision .equals(a, b)
344-
345334 val pointsAccumulator = new IsotonicRegression .PointsAccumulator
346- var (_, prevFeature, _) = cleanInput.head
347-
348- // Go through input points, merging all points with approximately equal feature values into
349- // a single point. Equality of features is defined by areEqual method. The label of the
350- // accumulated points is the weighted average of the labels of all points of equal feature
351- // value. It is possible that feature values to be equal up to a resolution due to
352- // representation errors, since we cannot know which feature value to use in that case,
353- // we compute the weighted average of the features.
354- cleanInput.foreach { case point @ (_, feature, _) =>
355- if (areEqual(feature, prevFeature)) {
335+
336+ // Go through the input points, merging all points with equal feature values into a single
337+ // point. Features are compared exactly (==) by the accumulator's shouldAccumulate method. The
338+ // label of a merged point is the weighted average of the labels of the points it replaces.
339+
340+ // Initialize with first point
341+ pointsAccumulator := cleanInput.head
342+ // Accumulate the rest
343+ cleanInput.tail.foreach { case point @ (_, feature, _) =>
344+ if (pointsAccumulator.shouldAccumulate(feature)) {
345+ // Still on a duplicate feature, accumulate
356346 pointsAccumulator += point
357347 } else {
348+ // A new unique feature encountered:
349+ // - append the last accumulated point to unique features output
358350 pointsAccumulator.appendToOutput()
351+ // - and reset
359352 pointsAccumulator := point
360353 }
361- prevFeature = feature
362354 }
363- // Append the last accumulated point
355+ // Append the last accumulated point to unique features output
364356 pointsAccumulator.appendToOutput()
365357 pointsAccumulator.getOutput
366358 }
@@ -488,14 +480,14 @@ class IsotonicRegression private (private var isotonic: Boolean) extends Seriali
488480 // Points with same or adjacent features must collocate within the same partition.
489481 .partitionBy(new RangePartitioner (keyedInput.getNumPartitions, keyedInput))
490482 .values
491- // Lexicographically sort points by features then labels .
492- .mapPartitions(p => Iterator (p.toArray.sortBy(x => (x ._2, x._1) )))
483+ // Sort points by feature value.
484+ .mapPartitions(p => Iterator (p.toArray.sortBy(_ ._2)))
493485 // Aggregate points with equal features into a single point.
494486 .map(makeUnique)
495487 .flatMap(poolAdjacentViolators)
496488 .collect()
497489 // Sort again because collect() doesn't promise ordering.
498- .sortBy(x => (x ._2, x._1) )
490+ .sortBy(_ ._2)
499491 poolAdjacentViolators(parallelStepResult)
500492 }
501493}
@@ -511,30 +503,32 @@ object IsotonicRegression {
511503 private var (currentLabel : Double , currentFeature : Double , currentWeight : Double ) =
512504 (0d , 0d , 0d )
513505
506+ /** Whether or not this feature exactly equals the current accumulated feature. */
507+ @ inline def shouldAccumulate (feature : Double ): Boolean = currentFeature == feature
508+
514509 /** Resets the current value of the point accumulator using the provided point. */
515- def := (point : (Double , Double , Double )): Unit = {
510+ @ inline def := (point : (Double , Double , Double )): Unit = {
516511 val (label, feature, weight) = point
517512 currentLabel = label * weight
518- currentFeature = feature * weight
513+ currentFeature = feature
519514 currentWeight = weight
520515 }
521516
522517 /** Accumulates the provided point into the current value of the point accumulator. */
523- def += (point : (Double , Double , Double )): Unit = {
524- val (label, feature , weight) = point
518+ @ inline def += (point : (Double , Double , Double )): Unit = {
519+ val (label, _ , weight) = point
525520 currentLabel += label * weight
526- currentFeature += feature * weight
527521 currentWeight += weight
528522 }
529523
530524 /** Appends the current value of the point accumulator to the output. */
531- def appendToOutput (): Unit =
525+ @ inline def appendToOutput (): Unit =
532526 output += ((
533527 currentLabel / currentWeight,
534- currentFeature / currentWeight ,
528+ currentFeature,
535529 currentWeight))
536530
537531 /** Returns all accumulated points so far. */
538- def getOutput : Array [(Double , Double , Double )] = output.toArray
532+ @ inline def getOutput : Array [(Double , Double , Double )] = output.toArray
539533 }
540534}
0 commit comments