From 9cb04571d02a99e3e26a71c9addbfd8aba13e6d6 Mon Sep 17 00:00:00 2001 From: "Joseph K. Bradley" Date: Tue, 19 Apr 2016 23:00:21 -0700 Subject: [PATCH] Noted that StandardScaler uses the corrected sample std, not the unbiased std --- .../scala/org/apache/spark/ml/feature/StandardScaler.scala | 5 +++++ .../org/apache/spark/mllib/feature/StandardScaler.scala | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala index 118a6e3e6ad44..626e97efb47c6 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StandardScaler.scala @@ -66,6 +66,11 @@ private[feature] trait StandardScalerParams extends Params with HasInputCol with * :: Experimental :: * Standardizes features by removing the mean and scaling to unit variance using column summary * statistics on the samples in the training set. + * + * The scaling to unit variance uses the + * [[https://en.wikipedia.org/wiki/Standard_deviation#Corrected_sample_standard_deviation + * corrected sample standard deviation]], + * which is computed as the square root of the unbiased sample variance. */ @Experimental class StandardScaler(override val uid: String) extends Estimator[StandardScalerModel] diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala index 5c35e1b91c9bf..ee97045f34dc8 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala @@ -27,6 +27,11 @@ import org.apache.spark.rdd.RDD * Standardizes features by removing the mean and scaling to unit std using column summary * statistics on the samples in the training set. 
* + * The "unit std" is computed using the + * [[https://en.wikipedia.org/wiki/Standard_deviation#Corrected_sample_standard_deviation + * corrected sample standard deviation]], + * which is computed as the square root of the unbiased sample variance. + * * @param withMean False by default. Centers the data with mean before scaling. It will build a * dense output, so this does not work on sparse input and will raise an exception. * @param withStd True by default. Scales the data to unit standard deviation.