Commit 1749aec

Try adding PMMLExportable to ML with KMeans
1 parent 2f6dd63 commit 1749aec

3 files changed, +104 -3 lines changed

mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala

Lines changed: 13 additions & 2 deletions
@@ -17,9 +17,12 @@
 
 package org.apache.spark.ml.clustering
 
+import javax.xml.transform.stream.StreamResult
+
 import org.apache.spark.annotation.{Since, Experimental}
 import org.apache.spark.ml.param.{Param, Params, IntParam, ParamMap}
 import org.apache.spark.ml.param.shared._
+import org.apache.spark.ml.pmml.PMMLExportable
 import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
 import org.apache.spark.ml.{Estimator, Model}
 import org.apache.spark.mllib.clustering.{KMeans => MLlibKMeans, KMeansModel => MLlibKMeansModel}
@@ -94,7 +97,8 @@ private[clustering] trait KMeansParams extends Params with HasMaxIter with HasFe
 @Experimental
 class KMeansModel private[ml] (
     @Since("1.5.0") override val uid: String,
-    private val parentModel: MLlibKMeansModel) extends Model[KMeansModel] with KMeansParams {
+    private val parentModel: MLlibKMeansModel) extends Model[KMeansModel] with KMeansParams
+    with PMMLExportable {
 
   @Since("1.5.0")
   override def copy(extra: ParamMap): KMeansModel = {
@@ -129,6 +133,14 @@ class KMeansModel private[ml] (
     val data = dataset.select(col($(featuresCol))).map { case Row(point: Vector) => point }
     parentModel.computeCost(data)
   }
+
+  /**
+   * Export the model to stream result in PMML format
+   */
+  @Since("1.6.0")
+  override def toPMML(streamResult: StreamResult): Unit = {
+    parentModel.toPMML(streamResult)
+  }
 }
 
 /**
@@ -209,4 +221,3 @@ class KMeans @Since("1.5.0") (
     validateAndTransformSchema(schema)
   }
 }
-
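
With this change a fitted spark.ml KMeansModel picks up the PMML export overloads from the new trait and delegates the actual serialization to its wrapped MLlib model. A minimal usage sketch, assuming an existing DataFrame named dataset with a "features" vector column; the output path is illustrative:

import org.apache.spark.ml.clustering.KMeans

val kmeans = new KMeans().setK(2).setSeed(1L)
val model = kmeans.fit(dataset)       // dataset: DataFrame with a "features" column (assumed)

model.toPMML("/tmp/kmeans.pmml")      // write the PMML document to a local file
val pmmlXml: String = model.toPMML()  // or render it as a String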
mllib/src/main/scala/org/apache/spark/ml/pmml/PMMLExportable.scala

Lines changed: 90 additions & 0 deletions
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.pmml
+
+import java.io.{File, OutputStream, StringWriter}
+import javax.xml.transform.stream.StreamResult
+
+import org.jpmml.model.JAXBUtil
+
+import org.apache.spark.SparkContext
+import org.apache.spark.annotation.{DeveloperApi, Experimental, Since}
+import org.apache.spark.mllib.pmml.export.PMMLModelExportFactory
+
+/**
+ * :: DeveloperApi ::
+ * Export model to the PMML format
+ * Predictive Model Markup Language (PMML) is an XML-based file format
+ * developed by the Data Mining Group (www.dmg.org).
+ * Based on [[org.apache.spark.mllib.pmml.Exportable]]
+ */
+@DeveloperApi
+@Since("1.6.0")
+trait PMMLExportable {
+
+  /**
+   * Export the model to the stream result in PMML format.
+   */
+  private[spark] def toPMML(streamResult: StreamResult): Unit
+
+  /**
+   * :: Experimental ::
+   * Export the model to a local file in PMML format
+   */
+  @Experimental
+  @Since("1.6.0")
+  def toPMML(localPath: String): Unit = {
+    toPMML(new StreamResult(new File(localPath)))
+  }
+
+  /**
+   * :: Experimental ::
+   * Export the model to a directory on a distributed file system in PMML format.
+   * Models should override if they may contain more data than
+   * is reasonable to store locally.
+   */
+  @Experimental
+  @Since("1.6.0")
+  def toPMML(sc: SparkContext, path: String): Unit = {
+    val pmml = toPMML()
+    sc.parallelize(Array(pmml), 1).saveAsTextFile(path)
+  }
+
+  /**
+   * :: Experimental ::
+   * Export the model to the OutputStream in PMML format
+   */
+  @Experimental
+  @Since("1.6.0")
+  def toPMML(outputStream: OutputStream): Unit = {
+    toPMML(new StreamResult(outputStream))
+  }
+
+  /**
+   * :: Experimental ::
+   * Export the model to a String in PMML format
+   */
+  @Experimental
+  @Since("1.6.0")
+  def toPMML(): String = {
+    val writer = new StringWriter
+    toPMML(new StreamResult(writer))
+    writer.toString
+  }
+
+}
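
Every public overload here (local file, OutputStream, String, and the SparkContext/path variant) funnels into the single abstract toPMML(StreamResult), so a concrete model only has to implement that one method. A hedged sketch of how another spark.ml wrapper could adopt the trait, mirroring the KMeansModel change above; MyModel and its package are hypothetical, and the class has to live under org.apache.spark because the abstract member is private[spark]:

package org.apache.spark.ml.example  // hypothetical package, kept under org.apache.spark

import javax.xml.transform.stream.StreamResult

import org.apache.spark.ml.pmml.PMMLExportable
import org.apache.spark.mllib.pmml.{PMMLExportable => MLlibPMMLExportable}

// Delegate PMML export to the wrapped MLlib model; the String, File and
// OutputStream overloads are then inherited from the trait.
class MyModel(private val parentModel: MLlibPMMLExportable) extends PMMLExportable {
  override def toPMML(streamResult: StreamResult): Unit =
    parentModel.toPMML(streamResult)
}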

mllib/src/main/scala/org/apache/spark/mllib/pmml/PMMLExportable.scala

Lines changed: 1 addition & 1 deletion
@@ -39,7 +39,7 @@ trait PMMLExportable {
   /**
    * Export the model to the stream result in PMML format
    */
-  private def toPMML(streamResult: StreamResult): Unit = {
+  private[spark] def toPMML(streamResult: StreamResult): Unit = {
     val pmmlModelExport = PMMLModelExportFactory.createPMMLModelExport(this)
     JAXBUtil.marshalPMML(pmmlModelExport.getPmml, streamResult)
   }
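
Widening this method from private to private[spark] is what makes the delegation above possible: code elsewhere under org.apache.spark, such as ml.clustering.KMeansModel, can now call the MLlib model's StreamResult-based export directly, while it stays hidden from user code. A small hedged sketch of the call this change enables; mllibModel stands in for any MLlib model that mixes in this trait:

import java.io.StringWriter
import javax.xml.transform.stream.StreamResult

// Compiles only from code inside org.apache.spark (e.g. the ml wrappers above).
val writer = new StringWriter
mllibModel.toPMML(new StreamResult(writer))  // mllibModel: a PMML-exportable MLlib model (assumed)
val pmml: String = writer.toString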
