diff --git a/docs/mllib-frequent-pattern-mining.md b/docs/mllib-frequent-pattern-mining.md index 8ea4389266484..6c06550703d46 100644 --- a/docs/mllib-frequent-pattern-mining.md +++ b/docs/mllib-frequent-pattern-mining.md @@ -39,18 +39,30 @@ MLlib's FP-growth implementation takes the following (hyper-)parameters:
-[`FPGrowth`](api/scala/index.html#org.apache.spark.mllib.fpm.FPGrowth) implements the -FP-growth algorithm. -It take a `JavaRDD` of transactions, where each transaction is an `Iterable` of items of a generic type. +[`FPGrowth`](api/scala/index.html#org.apache.spark.mllib.fpm.FPGrowth) +implements the FP-growth algorithm. It takes an `RDD` of transactions, +where each transaction is an `Iterable` of items of a generic type. Calling `FPGrowth.run` with transactions returns an [`FPGrowthModel`](api/scala/index.html#org.apache.spark.mllib.fpm.FPGrowthModel) -that stores the frequent itemsets with their frequencies. +that stores the frequent itemsets with their frequencies. The following +example illustrates how to mine frequent itemsets and association rules +(see [Association +Rules](mllib-frequent-pattern-mining.html#association-rules) for +details) from `transactions`. + {% highlight scala %} import org.apache.spark.rdd.RDD import org.apache.spark.mllib.fpm.{FPGrowth, FPGrowthModel} -val transactions: RDD[Array[String]] = ... +val transactions: RDD[Array[String]] = sc.parallelize(Seq( + "r z h k p", + "z y x w v u t s", + "s x o n r", + "x z y m t s q e", + "z", + "x z y r q t p") + .map(_.split(" "))) val fpg = new FPGrowth() .setMinSupport(0.2) @@ -60,29 +72,48 @@ val model = fpg.run(transactions) model.freqItemsets.collect().foreach { itemset => println(itemset.items.mkString("[", ",", "]") + ", " + itemset.freq) } + +val minConfidence = 0.8 +model.generateAssociationRules(minConfidence).collect().foreach { rule => + println( + rule.antecedent.mkString("[", ",", "]") + + " => " + rule.consequent.mkString("[", ",", "]") + + ", " + rule.confidence) +} {% endhighlight %}
-[`FPGrowth`](api/java/org/apache/spark/mllib/fpm/FPGrowth.html) implements the -FP-growth algorithm. -It take an `RDD` of transactions, where each transaction is an `Array` of items of a generic type. -Calling `FPGrowth.run` with transactions returns an +[`FPGrowth`](api/java/org/apache/spark/mllib/fpm/FPGrowth.html) +implements the FP-growth algorithm. It takes a `JavaRDD` of +transactions, where each transaction is an `Array` of items of a generic +type. Calling `FPGrowth.run` with transactions returns an [`FPGrowthModel`](api/java/org/apache/spark/mllib/fpm/FPGrowthModel.html) -that stores the frequent itemsets with their frequencies. +that stores the frequent itemsets with their frequencies. The following +example illustrates how to mine frequent itemsets and association rules +(see [Association +Rules](mllib-frequent-pattern-mining.html#association-rules) for +details) from `transactions`. {% highlight java %} +import java.util.Arrays; import java.util.List; -import com.google.common.base.Joiner; - import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.mllib.fpm.AssociationRules; import org.apache.spark.mllib.fpm.FPGrowth; import org.apache.spark.mllib.fpm.FPGrowthModel; -JavaRDD<List<String>> transactions = ... 
+JavaRDD<List<String>> transactions = sc.parallelize(Arrays.asList( + Arrays.asList("r z h k p".split(" ")), + Arrays.asList("z y x w v u t s".split(" ")), + Arrays.asList("s x o n r".split(" ")), + Arrays.asList("x z y m t s q e".split(" ")), + Arrays.asList("z".split(" ")), + Arrays.asList("x z y r q t p".split(" "))), 2); FPGrowth fpg = new FPGrowth() .setMinSupport(0.2) @@ -90,7 +121,78 @@ FPGrowth fpg = new FPGrowth() FPGrowthModel<String> model = fpg.run(transactions); for (FPGrowth.FreqItemset<String> itemset: model.freqItemsets().toJavaRDD().collect()) { - System.out.println("[" + Joiner.on(",").join(s.javaItems()) + "], " + s.freq()); + System.out.println("[" + itemset.javaItems() + "], " + itemset.freq()); +} + +double minConfidence = 0.8; +for (AssociationRules.Rule<String> rule + : model.generateAssociationRules(minConfidence).toJavaRDD().collect()) { + System.out.println( + rule.javaAntecedent() + " => " + rule.javaConsequent() + ", " + rule.confidence()); +} +{% endhighlight %} + +
+
+ +## Association Rules + +
+
+[AssociationRules](api/scala/index.html#org.apache.spark.mllib.fpm.AssociationRules) +implements a parallel rule generation algorithm for constructing rules +that have a single item as the consequent. + +{% highlight scala %} +import org.apache.spark.rdd.RDD +import org.apache.spark.mllib.fpm.AssociationRules +import org.apache.spark.mllib.fpm.FPGrowth.FreqItemset + +val freqItemsets = sc.parallelize(Seq( + new FreqItemset(Array("a"), 15L), + new FreqItemset(Array("b"), 35L), + new FreqItemset(Array("a", "b"), 12L) +)); + +val ar = new AssociationRules() + .setMinConfidence(0.8) +val results = ar.run(freqItemsets) + +results.collect().foreach { rule => + println("[" + rule.antecedent.mkString(",") + + "=>" + + rule.consequent.mkString(",") + "]," + rule.confidence) +} +{% endhighlight %} + +
+ +
+[AssociationRules](api/java/org/apache/spark/mllib/fpm/AssociationRules.html) +implements a parallel rule generation algorithm for constructing rules +that have a single item as the consequent. + +{% highlight java %} +import java.util.Arrays; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.mllib.fpm.AssociationRules; +import org.apache.spark.mllib.fpm.FPGrowth.FreqItemset; + +JavaRDD<FreqItemset<String>> freqItemsets = sc.parallelize(Arrays.asList( + new FreqItemset<String>(new String[] {"a"}, 15L), + new FreqItemset<String>(new String[] {"b"}, 35L), + new FreqItemset<String>(new String[] {"a", "b"}, 12L) +)); + +AssociationRules arules = new AssociationRules() + .setMinConfidence(0.8); +JavaRDD<AssociationRules.Rule<String>> results = arules.run(freqItemsets); + +for (AssociationRules.Rule<String> rule: results.collect()) { + System.out.println( + rule.javaAntecedent() + " => " + rule.javaConsequent() + ", " + rule.confidence()); } {% endhighlight %} diff --git a/docs/mllib-guide.md b/docs/mllib-guide.md index 7851175b98230..0573e5d4bc5ff 100644 --- a/docs/mllib-guide.md +++ b/docs/mllib-guide.md @@ -48,6 +48,7 @@ This lists functionality included in `spark.mllib`, the main MLlib API.
* [Feature extraction and transformation](mllib-feature-extraction.html) * [Frequent pattern mining](mllib-frequent-pattern-mining.html) * [FP-growth](mllib-frequent-pattern-mining.html#fp-growth) + * [association rules](mllib-frequent-pattern-mining.html#association-rules) * [PrefixSpan](mllib-frequent-pattern-mining.html#prefix-span) * [Evaluation Metrics](mllib-evaluation-metrics.html) * [Optimization (developer)](mllib-optimization.html) diff --git a/mllib/src/test/java/org/apache/spark/mllib/fpm/JavaAssociationRulesSuite.java b/mllib/src/test/java/org/apache/spark/mllib/fpm/JavaAssociationRulesSuite.java index b3815ae6039c0..d7c2cb3ae2067 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/fpm/JavaAssociationRulesSuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/fpm/JavaAssociationRulesSuite.java @@ -49,7 +49,7 @@ public void runAssociationRules() { JavaRDD<FPGrowth.FreqItemset<String>> freqItemsets = sc.parallelize(Lists.newArrayList( new FreqItemset<String>(new String[] {"a"}, 15L), new FreqItemset<String>(new String[] {"b"}, 35L), - new FreqItemset<String>(new String[] {"a", "b"}, 18L) + new FreqItemset<String>(new String[] {"a", "b"}, 12L) )); JavaRDD<AssociationRules.Rule<String>> results = (new AssociationRules()).run(freqItemsets);