diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaMinHashExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaMinHashExample.java new file mode 100644 index 000000000000..09b030dd50fe --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaMinHashExample.java @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.examples.ml; + +// $example on$ +import org.apache.spark.ml.feature.MinHash; +import org.apache.spark.ml.feature.MinHashModel; +import org.apache.spark.ml.linalg.Vector; +import org.apache.spark.ml.linalg.VectorUDT; +import org.apache.spark.ml.linalg.Vectors; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.types.Metadata; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; + +import java.util.Arrays; +import java.util.List; +// $example off$ + +public class JavaMinHashExample { + public static void main(String[] args) { + SparkSession spark = SparkSession + .builder() + .appName("JavaMinHashExample") + .getOrCreate(); + + // $example on$ + List data = Arrays.asList( + RowFactory.create(Vectors.sparse(100, new int[]{1, 3, 4}, new double[]{1.0, 1.0, 1.0})), + RowFactory.create(Vectors.sparse(100, new int[]{1, 3, 4}, new double[]{1.0, 1.0, 1.0})), + RowFactory.create(Vectors.sparse(100, new int[]{3, 4, 6}, new double[]{1.0, 1.0, 1.0})), + RowFactory.create(Vectors.sparse(100, new int[]{2, 8}, new double[]{1.0, 1.0})) + ); + StructType schema = new StructType(new StructField[]{ + new StructField("signatures", new VectorUDT(), false, Metadata.empty()), + }); + Dataset dataset = spark.createDataFrame(data, schema); + + MinHash minHash = new MinHash() + .setInputCol("signatures") + .setOutputCol("buckets") + .setOutputDim(2); + MinHashModel model = minHash.fit(dataset); + + // basic transformation with a new hash column + Dataset transformedDataset = model.transform(dataset); + transformedDataset.select("signatures", "buckets").show(); + + // approximate nearest neighbor search with a dataset and a key + Vector key = Vectors.sparse(100, new int[]{2, 3, 4}, new double[]{1.0, 1.0, 1.0}); + Dataset approxNearestNeighbors = model.approxNearestNeighbors(dataset, key, 3, false, "distance"); + approxNearestNeighbors.select("signatures", "distance").show(); + + // approximate similarity join of two datasets + List dataToJoin = Arrays.asList(RowFactory.create(key)); + Dataset datasetToJoin = spark.createDataFrame(dataToJoin, schema); + Dataset approxSimilarityJoin = model.approxSimilarityJoin(dataset, datasetToJoin, 1); + approxSimilarityJoin.select("datasetA", "datasetB", "distCol").show(); + // $example off$ + + spark.stop(); + } +} diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomProjectionExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomProjectionExample.java new file mode 100644 index 000000000000..444ed18d42d9 --- /dev/null +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaRandomProjectionExample.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.examples.ml; + +// $example on$ +import org.apache.spark.ml.feature.RandomProjection; +import org.apache.spark.ml.feature.RandomProjectionModel; +import org.apache.spark.ml.linalg.Vector; +import org.apache.spark.ml.linalg.VectorUDT; +import org.apache.spark.ml.linalg.Vectors; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.types.Metadata; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; + +import java.util.Arrays; +import java.util.List; +// $example off$ + +public class JavaRandomProjectionExample { + public static void main(String[] args) { + SparkSession spark = SparkSession + .builder() + .appName("JavaRandomProjectionExample") + .getOrCreate(); + + // $example on$ + List data = Arrays.asList( + RowFactory.create(Vectors.sparse(100, new int[]{1, 3, 4}, new double[]{1.0, 1.0, 1.0})), + RowFactory.create(Vectors.sparse(100, new int[]{1, 3, 4}, new double[]{1.0, 1.0, 1.0})), + RowFactory.create(Vectors.sparse(100, new int[]{3, 4, 6}, new double[]{1.0, 1.0, 1.0})), + RowFactory.create(Vectors.sparse(100, new int[]{2, 8}, new double[]{1.0, 1.0})) + ); + StructType schema = new StructType(new StructField[]{ + new StructField("signatures", new VectorUDT(), false, Metadata.empty()), + }); + Dataset dataset = spark.createDataFrame(data, schema); + + RandomProjection randomProjection = new RandomProjection() + .setInputCol("signatures") + .setOutputCol("results") + .setOutputDim(3) + .setBucketLength(2); + RandomProjectionModel model = randomProjection.fit(dataset); + + // basic transformation with a new hash column + Dataset transformedDataset = model.transform(dataset); + transformedDataset.select("signatures", "results").show(); + + // approximate nearest neighbor search with a dataset and a key + Vector key = Vectors.sparse(100, new int[]{1, 3, 4}, new double[]{1.0, 1.0, 1.0}); + Dataset approxNearestNeighbors = model.approxNearestNeighbors(dataset, key, 3, false, "distance"); + approxNearestNeighbors.select("signatures", "distance").show(); + + // approximate similarity join of two datasets + List dataToJoin = Arrays.asList(RowFactory.create(key)); + Dataset datasetToJoin = spark.createDataFrame(dataToJoin, schema); + Dataset approxSimilarityJoin = model.approxSimilarityJoin(dataset, datasetToJoin, 1); + approxSimilarityJoin.select("datasetA", "datasetB", "distCol").show(); + // $example off$ + + spark.stop(); + } +} diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/MinHashExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/MinHashExample.scala new file mode 100644 index 000000000000..7b8365875680 --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/ml/MinHashExample.scala @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.ml + +// $example on$ +import org.apache.spark.ml.feature.MinHash +import org.apache.spark.ml.linalg.Vectors +// $example off$ +import org.apache.spark.sql.SparkSession + +object MinHashExample { + def main(args: Array[String]) { + val spark = SparkSession + .builder + .appName("MinHashExample") + .getOrCreate() + + // $example on$ + val dataset = spark.createDataFrame(Seq( + (1, Vectors.sparse(100, Array(1, 3, 4), Array(1.0, 1.0, 1.0))), + (2, Vectors.sparse(100, Array(1, 3, 4), Array(1.0, 1.0, 1.0))), + (3, Vectors.sparse(100, Array(3, 4, 6), Array(1.0, 1.0, 1.0))), + (4, Vectors.sparse(100, Array(2, 8), Array(1.0, 1.0))) + )).toDF("id", "signatures") + + val minHash = new MinHash() + .setInputCol("signatures") + .setOutputCol("buckets") + .setOutputDim(2) + val model = minHash.fit(dataset) + + // basic transformation with a new hash column + val transformedDataset = model.transform(dataset) + transformedDataset.select("id", "signatures", "buckets").show + + // approximate nearest neighbor search with a dataset and a key + val key = Vectors.sparse(100, Array[Int](2, 3, 4), Array[Double](1.0, 1.0, 1.0)) + val approxNearestNeighbors = model.approxNearestNeighbors(dataset, key, 3, false, "distance") + approxNearestNeighbors.select("id", "signatures", "distance").show + + // approximate similarity join of two datasets + val datasetToJoin = spark.createDataFrame(Seq((5, key))).toDF("id", "signatures") + val approxSimilarityJoin = model.approxSimilarityJoin(dataset, datasetToJoin, 1) + approxSimilarityJoin.select("datasetA", "datasetB", "distCol").show + // $example off$ + + spark.stop() + } +} diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/RandomProjectionExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/RandomProjectionExample.scala new file mode 100644 index 000000000000..13568acf599a --- /dev/null +++ b/examples/src/main/scala/org/apache/spark/examples/ml/RandomProjectionExample.scala @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// scalastyle:off println +package org.apache.spark.examples.ml + +// $example on$ +import org.apache.spark.ml.feature.RandomProjection +import org.apache.spark.ml.linalg.Vectors +// $example off$ +import org.apache.spark.sql.SparkSession + +object RandomProjectionExample { + def main(args: Array[String]) { + val spark = SparkSession + .builder + .appName("RandomProjectionExample") + .getOrCreate() + + // $example on$ + val dataset = spark.createDataFrame(Seq( + (1, Vectors.sparse(100, Array(1, 3, 4), Array(1.0, 1.0, 1.0))), + (2, Vectors.sparse(100, Array(1, 3, 4), Array(1.0, 1.0, 1.0))), + (3, Vectors.sparse(100, Array(3, 4, 6), Array(1.0, 1.0, 1.0))), + (4, Vectors.sparse(100, Array(2, 8), Array(1.0, 1.0))) + )).toDF("id", "signatures") + + val randomProjection = new RandomProjection() + .setInputCol("signatures") + .setOutputCol("results") + .setOutputDim(3) + .setBucketLength(2); + val model = randomProjection.fit(dataset) + + // basic transformation with a new hash column + val transformedDataset = model.transform(dataset) + transformedDataset.select("id", "signatures", "results").show + + // approximate nearest neighbor search with a dataset and a key + val key = Vectors.sparse(100, Array[Int](1, 3, 4), Array[Double](1.0, 1.0, 1.0)) + val approxNearestNeighbors = model.approxNearestNeighbors(dataset, key, 3, false, "distance") + approxNearestNeighbors.select("id", "signatures", "distance").show + + // approximate similarity join of two datasets + val datasetToJoin = spark.createDataFrame(Seq((5, key))).toDF("id", "signatures") + val approxSimilarityJoin = model.approxSimilarityJoin(dataset, datasetToJoin, 1) + approxSimilarityJoin.select("datasetA", "datasetB", "distCol").show + // $example off$ + + spark.stop() + } +}