Commit a69ca83

rename param

1 parent ecf3dfe commit a69ca83

10 files changed: +51 -52 lines changed

This commit renames the shared expert param blockSizeInMB to maxBlockSizeInMB across the Scala and Python ML APIs, simplifies its doc string (the inferred-value wording no longer references dataset statistics), and tidies the blockification loop in Instance.scala.

mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala

Lines changed: 6 additions & 6 deletions
@@ -42,7 +42,7 @@ import org.apache.spark.storage.StorageLevel
 /** Params for linear SVM Classifier. */
 private[classification] trait LinearSVCParams extends ClassifierParams with HasRegParam
   with HasMaxIter with HasFitIntercept with HasTol with HasStandardization with HasWeightCol
-  with HasAggregationDepth with HasThreshold with HasBlockSizeInMB {
+  with HasAggregationDepth with HasThreshold with HasMaxBlockSizeInMB {

   /**
    * Param for threshold in binary classification prediction.
@@ -57,7 +57,7 @@ private[classification] trait LinearSVCParams extends ClassifierParams with HasR
     "threshold in binary classification prediction applied to rawPrediction")

   setDefault(regParam -> 0.0, maxIter -> 100, fitIntercept -> true, tol -> 1E-6,
-    standardization -> true, threshold -> 0.0, aggregationDepth -> 2, blockSizeInMB -> 0.0)
+    standardization -> true, threshold -> 0.0, aggregationDepth -> 2, maxBlockSizeInMB -> 0.0)
 }

 /**
@@ -153,13 +153,13 @@ class LinearSVC @Since("2.2.0") (
   def setAggregationDepth(value: Int): this.type = set(aggregationDepth, value)

   /**
-   * Sets the value of param [[blockSizeInMB]].
+   * Sets the value of param [[maxBlockSizeInMB]].
    * Default is 0.0.
    *
    * @group expertSetParam
    */
   @Since("3.1.0")
-  def setBlockSizeInMB(value: Double): this.type = set(blockSizeInMB, value)
+  def setMaxBlockSizeInMB(value: Double): this.type = set(maxBlockSizeInMB, value)

   @Since("2.2.0")
   override def copy(extra: ParamMap): LinearSVC = defaultCopy(extra)
@@ -169,7 +169,7 @@ class LinearSVC @Since("2.2.0") (
     instr.logDataset(dataset)
     instr.logParams(this, labelCol, weightCol, featuresCol, predictionCol, rawPredictionCol,
       regParam, maxIter, fitIntercept, tol, standardization, threshold, aggregationDepth,
-      blockSizeInMB)
+      maxBlockSizeInMB)

     if (dataset.storageLevel != StorageLevel.NONE) {
       instr.logWarning(s"Input instances will be standardized, blockified to blocks, and " +
@@ -191,7 +191,7 @@ class LinearSVC @Since("2.2.0") (
     instr.logNamedValue("highestLabelWeight", labelSummarizer.histogram.max.toString)
     instr.logSumOfWeights(summarizer.weightSum)

-    var actualBlockSizeInMB = $(blockSizeInMB)
+    var actualBlockSizeInMB = $(maxBlockSizeInMB)
     if (actualBlockSizeInMB == 0) {
       actualBlockSizeInMB = InstanceBlock.DefaultBlockSizeInMB
       require(actualBlockSizeInMB > 0, "inferred actual BlockSizeInMB must > 0")
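
For orientation, here is a minimal usage sketch of the renamed setter on the Scala side. It is a sketch only: `training` is an assumed DataFrame with "label" and "features" columns, and 0.25 is an arbitrary value; the setter names otherwise match the diff above.

import org.apache.spark.ml.classification.LinearSVC

// Sketch: `training` is an assumed DataFrame[label: Double, features: Vector].
val lsvc = new LinearSVC()
  .setMaxIter(10)
  .setRegParam(0.1)
  .setMaxBlockSizeInMB(0.25) // formerly setBlockSizeInMB; 0.0 (default) lets Spark infer a size

val model = lsvc.fit(training)
println(s"coefficients=${model.coefficients} intercept=${model.intercept}")

Since maxBlockSizeInMB is an expert param with default 0.0, pipelines that never set it are unaffected by anything here except the rename.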

mllib/src/main/scala/org/apache/spark/ml/feature/Instance.scala

Lines changed: 4 additions & 5 deletions
@@ -161,20 +161,19 @@ private[spark] object InstanceBlock {
     var blockMemUsage = 0L

     while (instanceIterator.hasNext && blockMemUsage < maxMemUsage) {
-      val instance: Instance = instanceIterator.next()
+      val instance = instanceIterator.next()
       if (numCols < 0L) numCols = instance.features.size
       require(numCols == instance.features.size)
-      val nnz = instance.features.numNonzeros

       buff += instance
       buffCnt += 1L
-      buffNnz += nnz
+      buffNnz += instance.features.numNonzeros
       buffUnitWeight &&= (instance.weight == 1)
       blockMemUsage = getBlockMemUsage(numCols, buffCnt, buffNnz, buffUnitWeight)
     }

-    // the block mem usage may slightly exceed threshold, not a big issue.
-    // and this ensure even if one row exceed block limit, each block has one row
+    // The block memory usage may slightly exceed the threshold; that is not a big issue.
+    // It also ensures that even if a single row exceeds the block limit, each block still gets one row.
     InstanceBlock.fromInstances(buff.result())
   }
 }
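
For intuition, a self-contained sketch of the blockification pattern this loop implements. `Row` and `estimateMemUsage` are hypothetical stand-ins for Spark's Instance and getBlockMemUsage; only the control flow mirrors the real code.

object BlockifySketch {
  // Hypothetical stand-in for Spark's Instance: a dense row plus a weight.
  final case class Row(values: Array[Double], weight: Double)

  // Crude dense-layout estimate: 8 bytes per value plus 8 bytes for the weight.
  def estimateMemUsage(numCols: Int, numRows: Long): Long =
    numRows * (numCols.toLong * 8L + 8L)

  // Pull rows until the estimated block size crosses the budget. Because the
  // check runs only after a row is appended, every block holds at least one
  // row, even when that single row already exceeds maxMemUsage.
  def nextBlock(it: Iterator[Row], maxMemUsage: Long): Seq[Row] = {
    val buff = Seq.newBuilder[Row]
    var count = 0L
    var memUsage = 0L
    while (it.hasNext && memUsage < maxMemUsage) {
      val row = it.next()
      buff += row
      count += 1L
      memUsage = estimateMemUsage(row.values.length, count)
    }
    buff.result()
  }
}

Calling nextBlock repeatedly over an iterator partitions it into blocks whose estimated footprint stays near the budget; overshooting by at most one row is accepted in exchange for a single-pass loop, which is the trade-off the rewritten comment describes.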

mllib/src/main/scala/org/apache/spark/ml/param/shared/SharedParamsCodeGen.scala

Lines changed: 4 additions & 4 deletions
@@ -109,10 +109,10 @@ private[shared] object SharedParamsCodeGen {
       "stacked within partitions. If block size is more than remaining data in a partition " +
       "then it is adjusted to the size of this data.",
       isValid = "ParamValidators.gt(0)", isExpertParam = true),
-    ParamDesc[Double]("blockSizeInMB", "Maximum memory in MB for stacking input data " +
-      "in blocks. Data is stacked within partitions. If more than remaining data size in a " +
-      "partition then it is adjusted to the data size. If 0, try to infer an appropriate value " +
-      "based on the statistics of dataset. Must be >= 0.",
+    ParamDesc[Double]("maxBlockSizeInMB", "Maximum memory in MB for stacking input data " +
+      "into blocks. Data is stacked within partitions. If more than remaining data size in a " +
+      "partition then it is adjusted to the data size. If 0, try to infer an appropriate " +
+      "value. Must be >= 0.",
       Some("0.0"), isValid = "ParamValidators.gtEq(0.0)", isExpertParam = true)
   )

mllib/src/main/scala/org/apache/spark/ml/param/shared/sharedParams.scala

Lines changed: 6 additions & 6 deletions
@@ -564,20 +564,20 @@ trait HasBlockSize extends Params {
 }

 /**
- * Trait for shared param blockSizeInMB (default: 0.0). This trait may be changed or
+ * Trait for shared param maxBlockSizeInMB (default: 0.0). This trait may be changed or
  * removed between minor versions.
  */
-trait HasBlockSizeInMB extends Params {
+trait HasMaxBlockSizeInMB extends Params {

   /**
-   * Param for Maximum memory in MB for stacking input data in blocks. Data is stacked within partitions. If more than remaining data size in a partition then it is adjusted to the data size. If 0, try to infer an appropriate value based on the statistics of dataset. Must be &gt;= 0..
+   * Param for Maximum memory in MB for stacking input data into blocks. Data is stacked within partitions. If more than remaining data size in a partition then it is adjusted to the data size. If 0, try to infer an appropriate value. Must be &gt;= 0..
    * @group expertParam
    */
-  final val blockSizeInMB: DoubleParam = new DoubleParam(this, "blockSizeInMB", "Maximum memory in MB for stacking input data in blocks. Data is stacked within partitions. If more than remaining data size in a partition then it is adjusted to the data size. If 0, try to infer an appropriate value based on the statistics of dataset. Must be >= 0.", ParamValidators.gtEq(0.0))
+  final val maxBlockSizeInMB: DoubleParam = new DoubleParam(this, "maxBlockSizeInMB", "Maximum memory in MB for stacking input data into blocks. Data is stacked within partitions. If more than remaining data size in a partition then it is adjusted to the data size. If 0, try to infer an appropriate value. Must be >= 0.", ParamValidators.gtEq(0.0))

-  setDefault(blockSizeInMB, 0.0)
+  setDefault(maxBlockSizeInMB, 0.0)

   /** @group expertGetParam */
-  final def getBlockSizeInMB: Double = $(blockSizeInMB)
+  final def getMaxBlockSizeInMB: Double = $(maxBlockSizeInMB)
 }
 // scalastyle:on
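
Every trait in this generated file follows the same three-step shape: declare the Param, register its default, expose a getter. A toy sketch of that shape, using simplified Param/Params stand-ins rather than Spark's real DoubleParam and ParamValidators machinery:

// Simplified stand-ins for Spark's Param/Params machinery; validators omitted.
final case class Param[T](name: String, doc: String)

trait Params {
  private val defaults = scala.collection.mutable.Map.empty[String, Any]
  protected def setDefault[T](param: Param[T], value: T): Unit =
    defaults(param.name) = value
  protected def getOrDefault[T](param: Param[T]): T =
    defaults(param.name).asInstanceOf[T]
}

// The three-step shape SharedParamsCodeGen emits: declare, default, getter.
trait HasMaxBlockSizeInMB extends Params {
  final val maxBlockSizeInMB: Param[Double] = Param(
    "maxBlockSizeInMB",
    "Maximum memory in MB for stacking input data into blocks. Must be >= 0.")

  setDefault(maxBlockSizeInMB, 0.0)

  final def getMaxBlockSizeInMB: Double = getOrDefault(maxBlockSizeInMB)
}

SharedParamsCodeGen emits exactly this pattern from the ParamDesc entry shown in the previous file, which is why the rename touches the codegen source and its generated output in lockstep.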

mllib/src/test/scala/org/apache/spark/ml/classification/LinearSVCSuite.scala

Lines changed: 1 addition & 1 deletion
@@ -215,7 +215,7 @@ class LinearSVCSuite extends MLTest with DefaultReadWriteTest {
       .setMaxIter(5)
     val model = lsvc.fit(dataset)
     Seq(0, 0.01, 0.1, 1, 2, 4).foreach { s =>
-      val model2 = lsvc.setBlockSizeInMB(s).fit(dataset)
+      val model2 = lsvc.setMaxBlockSizeInMB(s).fit(dataset)
       assert(model.intercept ~== model2.intercept relTol 1e-9)
       assert(model.coefficients ~== model2.coefficients relTol 1e-9)
     }

python/pyspark/ml/classification.py

Lines changed: 11 additions & 11 deletions
@@ -26,7 +26,7 @@
 from pyspark.ml import Estimator, Predictor, PredictionModel, Model
 from pyspark.ml.param.shared import HasRawPredictionCol, HasProbabilityCol, HasThresholds, \
     HasRegParam, HasMaxIter, HasFitIntercept, HasTol, HasStandardization, HasWeightCol, \
-    HasAggregationDepth, HasThreshold, HasBlockSize, HasBlockSizeInMB, Param, Params, \
+    HasAggregationDepth, HasThreshold, HasBlockSize, HasMaxBlockSizeInMB, Param, Params, \
     TypeConverters, HasElasticNetParam, HasSeed, HasStepSize, HasSolver, HasParallelism
 from pyspark.ml.tree import _DecisionTreeModel, _DecisionTreeParams, \
     _TreeEnsembleModel, _RandomForestParams, _GBTParams, \
@@ -504,7 +504,7 @@ def recallByThreshold(self):

 class _LinearSVCParams(_ClassifierParams, HasRegParam, HasMaxIter, HasFitIntercept, HasTol,
                        HasStandardization, HasWeightCol, HasAggregationDepth, HasThreshold,
-                       HasBlockSizeInMB):
+                       HasMaxBlockSizeInMB):
     """
     Params for :py:class:`LinearSVC` and :py:class:`LinearSVCModel`.

@@ -521,7 +521,7 @@ def __init__(self, *args):
         super(_LinearSVCParams, self).__init__(*args)
         self._setDefault(maxIter=100, regParam=0.0, tol=1e-6, fitIntercept=True,
                          standardization=True, threshold=0.0, aggregationDepth=2,
-                         blockSizeInMB=0.0)
+                         maxBlockSizeInMB=0.0)


 @inherit_doc
@@ -565,7 +565,7 @@ class LinearSVC(_JavaClassifier, _LinearSVCParams, JavaMLWritable, JavaMLReadabl
     LinearSVCModel...
     >>> model.getThreshold()
     0.5
-    >>> model.getBlockSizeInMB()
+    >>> model.getMaxBlockSizeInMB()
     0.0
     >>> model.coefficients
     DenseVector([0.0, -0.2792, -0.1833])
@@ -605,12 +605,12 @@ class LinearSVC(_JavaClassifier, _LinearSVCParams, JavaMLWritable, JavaMLReadabl
     def __init__(self, *, featuresCol="features", labelCol="label", predictionCol="prediction",
                  maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction",
                  fitIntercept=True, standardization=True, threshold=0.0, weightCol=None,
-                 aggregationDepth=2, blockSizeInMB=0.0):
+                 aggregationDepth=2, maxBlockSizeInMB=0.0):
         """
         __init__(self, \\*, featuresCol="features", labelCol="label", predictionCol="prediction", \
                  maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", \
                  fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, \
-                 aggregationDepth=2, blockSizeInMB=0.0):
+                 aggregationDepth=2, maxBlockSizeInMB=0.0):
         """
         super(LinearSVC, self).__init__()
         self._java_obj = self._new_java_obj(
@@ -623,12 +623,12 @@ def __init__(self, *, featuresCol="features", labelCol="label", predictionCol="p
     def setParams(self, *, featuresCol="features", labelCol="label", predictionCol="prediction",
                   maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction",
                   fitIntercept=True, standardization=True, threshold=0.0, weightCol=None,
-                  aggregationDepth=2, blockSizeInMB=0.0):
+                  aggregationDepth=2, maxBlockSizeInMB=0.0):
         """
         setParams(self, \\*, featuresCol="features", labelCol="label", predictionCol="prediction", \
                   maxIter=100, regParam=0.0, tol=1e-6, rawPredictionCol="rawPrediction", \
                   fitIntercept=True, standardization=True, threshold=0.0, weightCol=None, \
-                  aggregationDepth=2, blockSizeInMB=0.0):
+                  aggregationDepth=2, maxBlockSizeInMB=0.0):
         Sets params for Linear SVM Classifier.
         """
         kwargs = self._input_kwargs
@@ -694,11 +694,11 @@ def setAggregationDepth(self, value):
         return self._set(aggregationDepth=value)

     @since("3.1.0")
-    def setBlockSizeInMB(self, value):
+    def setMaxBlockSizeInMB(self, value):
         """
-        Sets the value of :py:attr:`blockSizeInMB`.
+        Sets the value of :py:attr:`maxBlockSizeInMB`.
         """
-        return self._set(blockSizeInMB=value)
+        return self._set(maxBlockSizeInMB=value)


 class LinearSVCModel(_JavaClassificationModel, _LinearSVCParams, JavaMLWritable, JavaMLReadable,

python/pyspark/ml/classification.pyi

Lines changed: 5 additions & 5 deletions
@@ -26,7 +26,7 @@ from pyspark.ml.base import _PredictorParams
 from pyspark.ml.param.shared import (
     HasAggregationDepth,
     HasBlockSize,
-    HasBlockSizeInMB,
+    HasMaxBlockSizeInMB,
     HasElasticNetParam,
     HasFitIntercept,
     HasMaxIter,
@@ -173,7 +173,7 @@ class _LinearSVCParams(
     HasWeightCol,
     HasAggregationDepth,
     HasThreshold,
-    HasBlockSizeInMB,
+    HasMaxBlockSizeInMB,
 ):
     threshold: Param[float]
     def __init__(self, *args: Any) -> None: ...
@@ -199,7 +199,7 @@ class LinearSVC(
         threshold: float = ...,
         weightCol: Optional[str] = ...,
         aggregationDepth: int = ...,
-        blockSizeInMB: float = ...
+        maxBlockSizeInMB: float = ...
     ) -> None: ...
     def setParams(
         self,
@@ -216,7 +216,7 @@ class LinearSVC(
         threshold: float = ...,
         weightCol: Optional[str] = ...,
         aggregationDepth: int = ...,
-        blockSizeInMB: float = ...
+        maxBlockSizeInMB: float = ...
     ) -> LinearSVC: ...
     def setMaxIter(self, value: int) -> LinearSVC: ...
     def setRegParam(self, value: float) -> LinearSVC: ...
@@ -226,7 +226,7 @@ class LinearSVC(
     def setThreshold(self, value: float) -> LinearSVC: ...
     def setWeightCol(self, value: str) -> LinearSVC: ...
     def setAggregationDepth(self, value: int) -> LinearSVC: ...
-    def setBlockSizeInMB(self, value: float) -> LinearSVC: ...
+    def setMaxBlockSizeInMB(self, value: float) -> LinearSVC: ...

 class LinearSVCModel(
     _JavaClassificationModel[Vector],

python/pyspark/ml/param/_shared_params_code_gen.py

Lines changed: 3 additions & 3 deletions
@@ -166,10 +166,10 @@ def get$Name(self):
     ("blockSize", "block size for stacking input data in matrices. Data is stacked within "
      "partitions. If block size is more than remaining data in a partition then it is "
      "adjusted to the size of this data.", None, "TypeConverters.toInt"),
-    ("blockSizeInMB", "maximum memory in MB for stacking input data in blocks. Data is " +
+    ("maxBlockSizeInMB", "maximum memory in MB for stacking input data into blocks. Data is " +
      "stacked within partitions. If more than remaining data size in a partition then it " +
-     "is adjusted to the data size. If 0, try to infer an appropriate value based on the " +
-     "statistics of dataset. Must be >= 0.", "0.0", "TypeConverters.toFloat")]
+     "is adjusted to the data size. If 0, try to infer an appropriate value. Must be >= 0.",
+     "0.0", "TypeConverters.toFloat")]

 code = []
 for name, doc, defaultValueStr, typeConverter in shared:

python/pyspark/ml/param/shared.py

Lines changed: 8 additions & 8 deletions
@@ -599,19 +599,19 @@ def getBlockSize(self):
         return self.getOrDefault(self.blockSize)


-class HasBlockSizeInMB(Params):
+class HasMaxBlockSizeInMB(Params):
     """
-    Mixin for param blockSizeInMB: maximum memory in MB for stacking input data in blocks. Data is stacked within partitions. If more than remaining data size in a partition then it is adjusted to the data size. If 0, try to infer an appropriate value based on the statistics of dataset. Must be >= 0.
+    Mixin for param maxBlockSizeInMB: maximum memory in MB for stacking input data into blocks. Data is stacked within partitions. If more than remaining data size in a partition then it is adjusted to the data size. If 0, try to infer an appropriate value. Must be >= 0.
     """

-    blockSizeInMB = Param(Params._dummy(), "blockSizeInMB", "maximum memory in MB for stacking input data in blocks. Data is stacked within partitions. If more than remaining data size in a partition then it is adjusted to the data size. If 0, try to infer an appropriate value based on the statistics of dataset. Must be >= 0.", typeConverter=TypeConverters.toFloat)
+    maxBlockSizeInMB = Param(Params._dummy(), "maxBlockSizeInMB", "maximum memory in MB for stacking input data into blocks. Data is stacked within partitions. If more than remaining data size in a partition then it is adjusted to the data size. If 0, try to infer an appropriate value. Must be >= 0.", typeConverter=TypeConverters.toFloat)

     def __init__(self):
-        super(HasBlockSizeInMB, self).__init__()
-        self._setDefault(blockSizeInMB=0.0)
+        super(HasMaxBlockSizeInMB, self).__init__()
+        self._setDefault(maxBlockSizeInMB=0.0)

-    def getBlockSizeInMB(self):
+    def getMaxBlockSizeInMB(self):
         """
-        Gets the value of blockSizeInMB or its default value.
+        Gets the value of maxBlockSizeInMB or its default value.
         """
-        return self.getOrDefault(self.blockSizeInMB)
+        return self.getOrDefault(self.maxBlockSizeInMB)

python/pyspark/ml/param/shared.pyi

Lines changed: 3 additions & 3 deletions
@@ -186,7 +186,7 @@ class HasBlockSize(Params):
     def __init__(self) -> None: ...
     def getBlockSize(self) -> int: ...

-class HasBlockSizeInMB(Params):
-    blockSizeInMB: Param[float]
+class HasMaxBlockSizeInMB(Params):
+    maxBlockSizeInMB: Param[float]
     def __init__(self) -> None: ...
-    def getBlockSizeInMB(self) -> float: ...
+    def getMaxBlockSizeInMB(self) -> float: ...
