diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
index 078bfa979d60..b24dc2398559 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
@@ -219,11 +219,6 @@ class KMeans private (
       data: RDD[(Vector, Double)],
       instr: Option[Instrumentation]): KMeansModel = {
 
-    if (data.getStorageLevel == StorageLevel.NONE) {
-      logWarning("The input data is not directly cached, which may hurt performance if its"
-        + " parent RDDs are also uncached.")
-    }
-
     // Compute squared norms and cache them.
     val norms = data.map { case (v, _) =>
       Vectors.norm(v, 2.0)
@@ -232,15 +227,13 @@ class KMeans private (
     val zippedData = data.zip(norms).map { case ((v, w), norm) =>
       (new VectorWithNorm(v, norm), w)
     }
-    zippedData.persist(StorageLevel.MEMORY_AND_DISK)
-    val model = runAlgorithmWithWeight(zippedData, instr)
-    zippedData.unpersist()
-
-    // Warn at the end of the run as well, for increased visibility.
     if (data.getStorageLevel == StorageLevel.NONE) {
-      logWarning("The input data was not directly cached, which may hurt performance if its"
-        + " parent RDDs are also uncached.")
+      zippedData.persist(StorageLevel.MEMORY_AND_DISK)
    }
+    val model = runAlgorithmWithWeight(zippedData, instr)
+    zippedData.unpersist()
+
     model
   }
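The diff drops the two "input not directly cached" warnings and instead persists `zippedData` only when the input RDD is itself uncached: if `data` is already cached, recomputing the zipped view from it is cheap, and persisting it again would just waste memory. The `unpersist()` call stays unconditional, which is safe because unpersisting an RDD that was never persisted is harmless. Below is a minimal, self-contained sketch of that persist-only-if-uncached pattern, not Spark's own code: `ConditionalPersistExample` and `multiPassComputation` are hypothetical names standing in for `runWithWeight` and `runAlgorithmWithWeight`.

```scala
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel

object ConditionalPersistExample {

  // Hypothetical stand-in for runAlgorithmWithWeight: any routine that
  // traverses the RDD more than once and therefore benefits from caching.
  def multiPassComputation(rdd: RDD[Double]): Double =
    rdd.sum() / math.max(rdd.count(), 1L)

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("ConditionalPersistExample")
    val sc = new SparkContext(conf)

    val input: RDD[Double] = sc.parallelize((1 to 100).map(_.toDouble))
    val derived = input.map(x => x * x)

    // Persist the derived RDD only when the input is not already cached.
    // If the input is cached, recomputing `derived` from it is cheap, and
    // persisting it as well would only waste memory; the code before this
    // diff persisted unconditionally and merely warned about the uncached
    // input.
    if (input.getStorageLevel == StorageLevel.NONE) {
      derived.persist(StorageLevel.MEMORY_AND_DISK)
    }
    val result = multiPassComputation(derived)
    // unpersist() is harmless on an RDD that was never persisted, so it can
    // be called unconditionally, mirroring the diff.
    derived.unpersist()

    println(s"mean of squares = $result")
    sc.stop()
  }
}
```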