@@ -60,7 +60,7 @@ public static void main(String[] args) {
       .setOutputCol("features");
     LogisticRegression lr = new LogisticRegression()
       .setMaxIter(10)
-      .setRegParam(0.01);
+      .setRegParam(0.001);
     Pipeline pipeline = new Pipeline()
       .setStages(new PipelineStage[] {tokenizer, hashingTF, lr});

@@ -71,7 +71,7 @@ public static void main(String[] args) {
     Dataset<Row>> test = spark.createDataFrame(Arrays.asList(
       new JavaDocument(4L, "spark i j k"),
       new JavaDocument(5L, "l m n"),
-      new JavaDocument(6L, "mapreduce spark"),
+      new JavaDocument(6L, "spark hadoop spark"),
       new JavaDocument(7L, "apache hadoop")
     ), JavaDocument.class);

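Note: the two hunks above lower the LogisticRegression regularization parameter to 0.001 and reword one test document. For readers following along outside Java, the sketch below is a minimal PySpark rendering of the same Tokenizer → HashingTF → LogisticRegression pipeline; it is an illustration against the standard spark.ml API, not code from this PR, and the toy data and app name are invented.

```python
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer

spark = SparkSession.builder.appName("PipelineSketch").getOrCreate()

# Toy labeled documents: (id, text, label)
training = spark.createDataFrame([
    (0, "a b c d e spark", 1.0),
    (1, "b d", 0.0),
    (2, "spark f g h", 1.0),
    (3, "hadoop mapreduce", 0.0)
], ["id", "text", "label"])

tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=10, regParam=0.001)  # same regParam as the updated Java example
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

model = pipeline.fit(training)

# Unlabeled test documents, including the reworded "spark hadoop spark" row
test = spark.createDataFrame([
    (4, "spark i j k"),
    (5, "l m n"),
    (6, "spark hadoop spark"),
    (7, "apache hadoop")
], ["id", "text"])

model.transform(test).select("id", "text", "prediction").show()

spark.stop()
```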

This file was deleted.

This file was deleted.

@@ -47,7 +47,7 @@ public static void main(String[] args) {
       .setOutputCol("filtered");

     List<Row> data = Arrays.asList(
-      RowFactory.create(Arrays.asList("I", "saw", "the", "red", "baloon")),
+      RowFactory.create(Arrays.asList("I", "saw", "the", "red", "balloon")),
       RowFactory.create(Arrays.asList("Mary", "had", "a", "little", "lamb"))
     );

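The hunk above only corrects the "baloon" → "balloon" spelling in the sample data. As a quick reference for what the example demonstrates, a minimal PySpark sketch of the same StopWordsRemover usage follows (illustrative only, not part of this change; column names mirror the Java example):

```python
from pyspark.sql import SparkSession
from pyspark.ml.feature import StopWordsRemover

spark = SparkSession.builder.appName("StopWordsRemoverSketch").getOrCreate()

# Two rows of pre-tokenized words, mirroring the Java example's data
df = spark.createDataFrame([
    (0, ["I", "saw", "the", "red", "balloon"]),
    (1, ["Mary", "had", "a", "little", "lamb"])
], ["id", "raw"])

remover = StopWordsRemover(inputCol="raw", outputCol="filtered")
remover.transform(df).show(truncate=False)

spark.stop()
```

With the default English stop-word list, words such as "I" and "the" are dropped, so the first row comes out as [saw, red, balloon].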
@@ -57,17 +57,24 @@ public static void main(String[] args) {

     Tokenizer tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words");

-    Dataset<Row> wordsDataFrame = tokenizer.transform(sentenceDataFrame);
-    for (Row r : wordsDataFrame.select("words", "label").takeAsList(3)) {
+    RegexTokenizer regexTokenizer = new RegexTokenizer()
+        .setInputCol("sentence")
+        .setOutputCol("words")
+        .setPattern("\\W");  // alternatively .setPattern("\\w+").setGaps(false);
+
+    Dataset<Row> tokenized = tokenizer.transform(sentenceDataFrame);
+    for (Row r : tokenized.select("words", "label").takeAsList(3)) {
       java.util.List<String> words = r.getList(0);
       for (String word : words) System.out.print(word + " ");
       System.out.println();
     }

-    RegexTokenizer regexTokenizer = new RegexTokenizer()
-        .setInputCol("sentence")
-        .setOutputCol("words")
-        .setPattern("\\W");  // alternatively .setPattern("\\w+").setGaps(false);
+    Dataset<Row> regexTokenized = regexTokenizer.transform(sentenceDataFrame);
+    for (Row r : regexTokenized.select("words", "label").takeAsList(3)) {
+      java.util.List<String> words = r.getList(0);
+      for (String word : words) System.out.print(word + " ");
+      System.out.println();
+    }
     // $example off$
     spark.stop();
   }
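The reorganization above moves the RegexTokenizer definition up beside the plain Tokenizer and gives each tokenizer its own transform-and-print loop, so both outputs are shown. A rough PySpark sketch of the reorganized flow (illustrative only; data, column, and variable names are meant to mirror the Java example):

```python
from pyspark.sql import SparkSession
from pyspark.ml.feature import RegexTokenizer, Tokenizer

spark = SparkSession.builder.appName("TokenizerSketch").getOrCreate()

sentenceDataFrame = spark.createDataFrame([
    (0, "Hi I heard about Spark"),
    (1, "I wish Java could use case classes"),
    (2, "Logistic,regression,models,are,neat")
], ["label", "sentence"])

tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words",
                                pattern="\\W")  # or pattern="\\w+", gaps=False

# Whitespace tokenizer, printed right after it is applied
tokenized = tokenizer.transform(sentenceDataFrame)
for row in tokenized.select("words", "label").take(3):
    print(" ".join(row.words))

# Regex tokenizer, printed in its own loop
regexTokenized = regexTokenizer.transform(sentenceDataFrame)
for row in regexTokenized.select("words", "label").take(3):
    print(" ".join(row.words))

spark.stop()
```

Tokenizer lowercases and splits on whitespace, while RegexTokenizer splits on the given pattern, so the comma-separated third sentence is only broken into separate words by the regex variant.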
examples/src/main/python/ml/aft_survival_regression.py (2 changes: 1 addition & 1 deletion)
@@ -32,7 +32,7 @@
 if __name__ == "__main__":
     spark = SparkSession \
         .builder \
-        .appName("PythonAFTSurvivalRegressionExample") \
+        .appName("AFTSurvivalRegressionExample") \
         .getOrCreate()

     # $example on$
examples/src/main/python/ml/bisecting_k_means_example.py (2 changes: 1 addition & 1 deletion)
@@ -31,7 +31,7 @@
 if __name__ == "__main__":
     spark = SparkSession\
         .builder\
-        .appName("PythonBisectingKMeansExample")\
+        .appName("BisectingKMeansExample")\
         .getOrCreate()

     # $example on$
examples/src/main/python/ml/cross_validator.py (3 changes: 2 additions & 1 deletion)
@@ -24,7 +24,7 @@
 from pyspark.ml.feature import HashingTF, Tokenizer
 from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
 # $example off$
-from pyspark.sql import Row, SparkSession
+from pyspark.sql import SparkSession

 """
 A simple example demonstrating model selection using CrossValidator.
@@ -39,6 +39,7 @@
         .builder\
         .appName("CrossValidatorExample")\
         .getOrCreate()
+
     # $example on$
     # Prepare training documents, which are labeled.
     training = spark.createDataFrame([
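The two hunks above only drop the unused Row import and add a blank line, but for context, cross_validator.py demonstrates model selection with CrossValidator. A condensed sketch of that pattern is below (illustrative only; the toy data and app name are invented, while the API calls are the standard pyspark.ml.tuning ones):

```python
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

spark = SparkSession.builder.appName("CrossValidatorSketch").getOrCreate()

# Small labeled corpus: (id, text, label)
training = spark.createDataFrame([
    (0, "a b c d e spark", 1.0),
    (1, "b d", 0.0),
    (2, "spark f g h", 1.0),
    (3, "hadoop mapreduce", 0.0),
    (4, "b spark who", 1.0),
    (5, "g d a y", 0.0),
    (6, "spark fly", 1.0),
    (7, "was mapreduce", 0.0)
], ["id", "text", "label"])

tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression(maxIter=10)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# 2 x 2 = 4 parameter settings to try
paramGrid = ParamGridBuilder() \
    .addGrid(hashingTF.numFeatures, [10, 100]) \
    .addGrid(lr.regParam, [0.1, 0.01]) \
    .build()

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=2)
cvModel = crossval.fit(training)

spark.stop()
```

CrossValidator fits the pipeline once per fold for every parameter combination, picks the combination with the best evaluator metric, and then refits it on the full training set.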
examples/src/main/python/ml/dataframe_example.py (11 changes: 6 additions & 5 deletions)
@@ -34,15 +34,16 @@
     if len(sys.argv) > 2:
         print("Usage: dataframe_example.py <libsvm file>", file=sys.stderr)
         exit(-1)
-    spark = SparkSession\
-        .builder\
-        .appName("DataFrameExample")\
-        .getOrCreate()
-    if len(sys.argv) == 2:
+    elif len(sys.argv) == 2:
         input = sys.argv[1]
     else:
         input = "data/mllib/sample_libsvm_data.txt"

+    spark = SparkSession \
+        .builder \
+        .appName("DataFrameExample") \
+        .getOrCreate()
+
     # Load input data
     print("Loading LIBSVM file with UDT from " + input + ".")
     df = spark.read.format("libsvm").load(input).cache()
@@ -31,7 +31,7 @@
 if __name__ == "__main__":
     spark = SparkSession\
         .builder\
-        .appName("decision_tree_classification_example")\
+        .appName("DecisionTreeClassificationExample")\
         .getOrCreate()

     # $example on$
examples/src/main/python/ml/estimator_transformer_param_example.py (20 changes: 12 additions & 8 deletions)
@@ -18,6 +18,7 @@
 """
 Estimator Transformer Param Example.
 """
+from __future__ import print_function

 # $example on$
 from pyspark.ml.linalg import Vectors
@@ -42,7 +43,7 @@
     # Create a LogisticRegression instance. This instance is an Estimator.
     lr = LogisticRegression(maxIter=10, regParam=0.01)
     # Print out the parameters, documentation, and any default values.
-    print "LogisticRegression parameters:\n" + lr.explainParams() + "\n"
+    print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

     # Learn a LogisticRegression model. This uses the parameters stored in lr.
     model1 = lr.fit(training)
@@ -51,8 +52,8 @@
     # we can view the parameters it used during fit().
     # This prints the parameter (name: value) pairs, where names are unique IDs for this
     # LogisticRegression instance.
-    print "Model 1 was fit using parameters: "
-    print model1.extractParamMap()
+    print("Model 1 was fit using parameters: ")
+    print(model1.extractParamMap())

     # We may alternatively specify parameters using a Python dictionary as a paramMap
     paramMap = {lr.maxIter: 20}
@@ -67,8 +68,8 @@
     # Now learn a new model using the paramMapCombined parameters.
     # paramMapCombined overrides all parameters set earlier via lr.set* methods.
     model2 = lr.fit(training, paramMapCombined)
-    print "Model 2 was fit using parameters: "
-    print model2.extractParamMap()
+    print("Model 2 was fit using parameters: ")
+    print(model2.extractParamMap())

     # Prepare test data
     test = spark.createDataFrame([
@@ -81,9 +82,12 @@
     # Note that model2.transform() outputs a "myProbability" column instead of the usual
     # 'probability' column since we renamed the lr.probabilityCol parameter previously.
     prediction = model2.transform(test)
-    selected = prediction.select("features", "label", "myProbability", "prediction")
-    for row in selected.collect():
-        print row
+    result = prediction.select("features", "label", "myProbability", "prediction") \
+        .collect()
+
+    for row in result:
+        print("features=%s, label=%s -> prob=%s, prediction=%s"
+              % (row.features, row.label, row.myProbability, row.prediction))
     # $example off$

     spark.stop()
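The hunks above port the example from Python 2 print statements to the print() function and print each result Row field by field. As a compact reference for the technique the example walks through, overriding Estimator params at fit() time with a param map and reading the renamed probability column, here is a standalone sketch (illustrative only; data and app name are invented):

```python
from __future__ import print_function

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("ParamMapSketch").getOrCreate()

training = spark.createDataFrame([
    (1.0, Vectors.dense([0.0, 1.1, 0.1])),
    (0.0, Vectors.dense([2.0, 1.0, -1.0])),
    (0.0, Vectors.dense([2.0, 1.3, 1.0])),
    (1.0, Vectors.dense([0.0, 1.2, -0.5]))
], ["label", "features"])

lr = LogisticRegression(maxIter=10, regParam=0.01)

# Override params for this fit() call only; also rename the probability output column
paramMap = {lr.maxIter: 30, lr.probabilityCol: "myProbability"}
model = lr.fit(training, paramMap)

test = spark.createDataFrame([
    (1.0, Vectors.dense([-1.0, 1.5, 1.3])),
    (0.0, Vectors.dense([3.0, 2.0, -0.1]))
], ["label", "features"])

# Python 3 style printing with explicit Row field access
result = model.transform(test).select("features", "label", "myProbability", "prediction").collect()
for row in result:
    print("features=%s, label=%s -> prob=%s, prediction=%s"
          % (row.features, row.label, row.myProbability, row.prediction))

spark.stop()
```

The dict passed to fit() applies only to that call; the lr instance itself keeps the params it was constructed with.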
examples/src/main/python/ml/gaussian_mixture_example.py (2 changes: 1 addition & 1 deletion)
@@ -31,7 +31,7 @@
 if __name__ == "__main__":
     spark = SparkSession\
         .builder\
-        .appName("PythonGuassianMixtureExample")\
+        .appName("GaussianMixtureExample")\
         .getOrCreate()

     # $example on$