From d6ce7af1e8f2ab71e92514beed552c4275608e98 Mon Sep 17 00:00:00 2001
From: Yuhao Yang
Date: Fri, 31 Jul 2015 17:09:57 +0800
Subject: [PATCH 1/3] draft

---
 docs/ml-features.md | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/docs/ml-features.md b/docs/ml-features.md
index 54068debe2159..b97feb7969deb 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -217,21 +217,27 @@ for feature in result.select("result").take(3):
 [Tokenization](http://en.wikipedia.org/wiki/Lexical_analysis#Tokenization) is the process of taking text (such as a sentence) and breaking it into individual terms (usually words). A simple [Tokenizer](api/scala/index.html#org.apache.spark.ml.feature.Tokenizer) class provides this functionality. The example below shows how to split sentences into sequences of words.
 
-Note: A more advanced tokenizer is provided via [RegexTokenizer](api/scala/index.html#org.apache.spark.ml.feature.RegexTokenizer).
+[RegexTokenizer](api/scala/index.html#org.apache.spark.ml.feature.RegexTokenizer) allows more advanced tokenization based on regular expression (regex) matching. By default, the parameter "pattern" (regex, default: "\\s+") is used as the delimiter to split the input text. Alternatively, users can set the parameter "gaps" to false, indicating that the regex "pattern" denotes "tokens" rather than the gaps between them; all matching occurrences then form the tokenization result.
 {% highlight scala %}
-import org.apache.spark.ml.feature.Tokenizer
+import org.apache.spark.ml.feature.{Tokenizer, RegexTokenizer}
 
 val sentenceDataFrame = sqlContext.createDataFrame(Seq(
   (0, "Hi I heard about Spark"),
-  (0, "I wish Java could use case classes"),
-  (1, "Logistic regression models are neat")
+  (1, "I wish Java could use case classes"),
+  (2, "Logistic,regression,models,are,neat")
 )).toDF("label", "sentence")
 val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
-val wordsDataFrame = tokenizer.transform(sentenceDataFrame)
-wordsDataFrame.select("words", "label").take(3).foreach(println)
+val regexTokenizer = new RegexTokenizer().setInputCol("sentence").setOutputCol("words")
+  .setPattern("\\W")
+// alternatively .setPattern("\\w+").setGaps(false)
+
+val tokenized = tokenizer.transform(sentenceDataFrame)
+tokenized.select("words", "label").take(3).foreach(println)
+val regexTokenized = regexTokenizer.transform(sentenceDataFrame)
+regexTokenized.select("words", "label").take(3).foreach(println)
 {% endhighlight %}
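
The "gaps" semantics described in the new paragraph can be puzzling on first read. As a rough illustration (plain Scala string and regex operations, not part of the patch and not Spark ML API), splitting on the delimiter pattern and collecting matches of the token pattern yield the same tokens for the comma-separated sample sentence:

{% highlight scala %}
// Illustration only: plain Scala regex calls that mirror the two RegexTokenizer modes.
val sentence = "Logistic,regression,models,are,neat"

// gaps = true (default): the pattern describes the delimiters, so the text is split on it.
val splitOnGaps = sentence.split("\\W").toSeq
// -> "Logistic", "regression", "models", "are", "neat"

// gaps = false: the pattern describes the tokens, so every match is collected.
val matchedTokens = "\\w+".r.findAllIn(sentence).toSeq
// -> "Logistic", "regression", "models", "are", "neat"
{% endhighlight %}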
From f6eb1dda4f80c70bc7cfb2ce4667ab2e5beb1a7e Mon Sep 17 00:00:00 2001
From: Yuhao Yang
Date: Fri, 31 Jul 2015 18:07:21 +0800
Subject: [PATCH 2/3] add java and python

---
 docs/ml-features.md | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/docs/ml-features.md b/docs/ml-features.md
index b97feb7969deb..dfd206b0c5986 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -232,7 +232,7 @@ val sentenceDataFrame = sqlContext.createDataFrame(Seq(
 val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
 val regexTokenizer = new RegexTokenizer().setInputCol("sentence").setOutputCol("words")
   .setPattern("\\W")
-// alternatively .setPattern("\\w+").setGaps(false)
+  // alternatively .setPattern("\\w+").setGaps(false)
 
 val tokenized = tokenizer.transform(sentenceDataFrame)
 tokenized.select("words", "label").take(3).foreach(println)
@@ -246,6 +246,7 @@ regexTokenized.select("words", "label").take(3).foreach(println)
 import com.google.common.collect.Lists;
 
 import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.ml.feature.RegexTokenizer;
 import org.apache.spark.ml.feature.Tokenizer;
 import org.apache.spark.mllib.linalg.Vector;
 import org.apache.spark.sql.DataFrame;
@@ -258,8 +259,8 @@ import org.apache.spark.sql.types.StructType;
 
 JavaRDD<Row> jrdd = jsc.parallelize(Lists.newArrayList(
   RowFactory.create(0, "Hi I heard about Spark"),
-  RowFactory.create(0, "I wish Java could use case classes"),
-  RowFactory.create(1, "Logistic regression models are neat")
+  RowFactory.create(1, "I wish Java could use case classes"),
+  RowFactory.create(2, "Logistic regression models are neat")
 ));
 StructType schema = new StructType(new StructField[]{
   new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
@@ -273,22 +274,28 @@ for (Row r : wordsDataFrame.select("words", "label").take(3)) {
   for (String word : words) System.out.print(word + " ");
   System.out.println();
 }
+
+RegexTokenizer regexTokenizer = new RegexTokenizer().setInputCol("sentence").setOutputCol("words")
+  .setPattern("\\W");
+  // alternatively .setPattern("\\w+").setGaps(false);
 {% endhighlight %}
 {% highlight python %}
-from pyspark.ml.feature import Tokenizer
+from pyspark.ml.feature import Tokenizer, RegexTokenizer
 
 sentenceDataFrame = sqlContext.createDataFrame([
   (0, "Hi I heard about Spark"),
-  (0, "I wish Java could use case classes"),
-  (1, "Logistic regression models are neat")
+  (1, "I wish Java could use case classes"),
+  (2, "Logistic regression models are neat")
 ], ["label", "sentence"])
 tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
 wordsDataFrame = tokenizer.transform(sentenceDataFrame)
 for words_label in wordsDataFrame.select("words", "label").take(3):
   print(words_label)
+regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")
+# alternatively, pattern="\\w+", gaps(false)
 {% endhighlight %}
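
In all three languages the alternative configuration (matching tokens instead of splitting on gaps) is only mentioned in a comment and never exercised. Below is a minimal Scala sketch of what wiring it up might look like, reusing the sentenceDataFrame from the Scala example above; the variable names here are illustrative and not part of the patch:

{% highlight scala %}
import org.apache.spark.ml.feature.RegexTokenizer

// Match runs of word characters as the tokens instead of splitting on non-word delimiters.
val tokenMatchingTokenizer = new RegexTokenizer()
  .setInputCol("sentence")
  .setOutputCol("words")
  .setPattern("\\w+")
  .setGaps(false)

// On the comma-separated row this should produce the same tokens as splitting on "\\W".
val tokenMatched = tokenMatchingTokenizer.transform(sentenceDataFrame)
tokenMatched.select("words", "label").take(3).foreach(println)
{% endhighlight %}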
From 4970d78227b3b24e02ba8bdd8b98d12cf3ed45e5 Mon Sep 17 00:00:00 2001
From: Yuhao Yang
Date: Wed, 12 Aug 2015 17:14:37 +0800
Subject: [PATCH 3/3] style fix

---
 docs/ml-features.md | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/docs/ml-features.md b/docs/ml-features.md
index d7f8562d06b80..cec2cbe673407 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -217,7 +217,11 @@ for feature in result.select("result").take(3):
 [Tokenization](http://en.wikipedia.org/wiki/Lexical_analysis#Tokenization) is the process of taking text (such as a sentence) and breaking it into individual terms (usually words). A simple [Tokenizer](api/scala/index.html#org.apache.spark.ml.feature.Tokenizer) class provides this functionality. The example below shows how to split sentences into sequences of words.
 
-[RegexTokenizer](api/scala/index.html#org.apache.spark.ml.feature.RegexTokenizer) allows more advanced tokenization based on regular expression (regex) matching. By default, the parameter "pattern" (regex, default: "\\s+") is used as the delimiter to split the input text. Alternatively, users can set the parameter "gaps" to false, indicating that the regex "pattern" denotes "tokens" rather than the gaps between them; all matching occurrences then form the tokenization result.
+[RegexTokenizer](api/scala/index.html#org.apache.spark.ml.feature.RegexTokenizer) allows more
+ advanced tokenization based on regular expression (regex) matching.
+ By default, the parameter "pattern" (regex, default: "\\s+") is used as the delimiter to split the input text.
+ Alternatively, users can set the parameter "gaps" to false, indicating that the regex "pattern" denotes
+ "tokens" rather than the gaps between them; all matching occurrences then form the tokenization result.
@@ -230,9 +234,10 @@ val sentenceDataFrame = sqlContext.createDataFrame(Seq(
   (2, "Logistic,regression,models,are,neat")
 )).toDF("label", "sentence")
 val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
-val regexTokenizer = new RegexTokenizer().setInputCol("sentence").setOutputCol("words")
-  .setPattern("\\W")
-  // alternatively .setPattern("\\w+").setGaps(false)
+val regexTokenizer = new RegexTokenizer()
+  .setInputCol("sentence")
+  .setOutputCol("words")
+  .setPattern("\\W") // alternatively .setPattern("\\w+").setGaps(false)
 
 val tokenized = tokenizer.transform(sentenceDataFrame)
 tokenized.select("words", "label").take(3).foreach(println)
@@ -260,7 +265,7 @@ import org.apache.spark.sql.types.StructType;
 JavaRDD<Row> jrdd = jsc.parallelize(Lists.newArrayList(
   RowFactory.create(0, "Hi I heard about Spark"),
   RowFactory.create(1, "I wish Java could use case classes"),
-  RowFactory.create(2, "Logistic regression models are neat")
+  RowFactory.create(2, "Logistic,regression,models,are,neat")
 ));
 StructType schema = new StructType(new StructField[]{
   new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
@@ -275,9 +280,10 @@ for (Row r : wordsDataFrame.select("words", "label").take(3)) {
   System.out.println();
 }
 
-RegexTokenizer regexTokenizer = new RegexTokenizer().setInputCol("sentence").setOutputCol("words")
-  .setPattern("\\W");
-  // alternatively .setPattern("\\w+").setGaps(false);
+RegexTokenizer regexTokenizer = new RegexTokenizer()
+  .setInputCol("sentence")
+  .setOutputCol("words")
+  .setPattern("\\W"); // alternatively .setPattern("\\w+").setGaps(false);
 {% endhighlight %}
@@ -288,14 +294,14 @@ from pyspark.ml.feature import Tokenizer, RegexTokenizer
 sentenceDataFrame = sqlContext.createDataFrame([
   (0, "Hi I heard about Spark"),
   (1, "I wish Java could use case classes"),
-  (2, "Logistic regression models are neat")
+  (2, "Logistic,regression,models,are,neat")
 ], ["label", "sentence"])
 tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
 wordsDataFrame = tokenizer.transform(sentenceDataFrame)
 for words_label in wordsDataFrame.select("words", "label").take(3):
   print(words_label)
 regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")
-# alternatively, pattern="\\w+", gaps(false)
+# alternatively, pattern="\\w+", gaps=False
 {% endhighlight %}
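
Patch 3 switches the third sample sentence to a comma-separated string in all three languages, presumably so that Tokenizer and RegexTokenizer visibly disagree on it, but none of the examples show the resulting tokens. A rough sketch of the contrast, approximating the two transformers with plain Scala string operations rather than reproducing the actual DataFrame output:

{% highlight scala %}
// Illustration only: approximate what each transformer does to the comma-separated row.
val s = "Logistic,regression,models,are,neat"

// Tokenizer roughly lowercases and splits on whitespace, so the row stays a single token:
val simpleTokens = s.toLowerCase.split("\\s").toSeq
// -> one token: "logistic,regression,models,are,neat"

// RegexTokenizer with pattern "\\W" (and the default gaps = true) splits on non-word
// characters, so the same row becomes five separate tokens:
val regexTokens = s.split("\\W").toSeq
// -> "Logistic", "regression", "models", "are", "neat"
{% endhighlight %}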