Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 7 additions & 8 deletions R/pkg/R/mllib_classification.R
Original file line number Diff line number Diff line change
Expand Up @@ -75,9 +75,9 @@ setClass("NaiveBayesModel", representation(jobj = "jobj"))
#' @examples
#' \dontrun{
#' sparkR.session()
#' df <- createDataFrame(iris)
#' training <- df[df$Species %in% c("versicolor", "virginica"), ]
#' model <- spark.svmLinear(training, Species ~ ., regParam = 0.5)
#' t <- as.data.frame(Titanic)
#' training <- createDataFrame(t)
#' model <- spark.svmLinear(training, Survived ~ ., regParam = 0.5)
#' summary <- summary(model)
#'
#' # fitted values on training data
Expand Down Expand Up @@ -220,9 +220,9 @@ function(object, path, overwrite = FALSE) {
#' \dontrun{
#' sparkR.session()
#' # binary logistic regression
#' df <- createDataFrame(iris)
#' training <- df[df$Species %in% c("versicolor", "virginica"), ]
#' model <- spark.logit(training, Species ~ ., regParam = 0.5)
#' t <- as.data.frame(Titanic)
#' training <- createDataFrame(t)
#' model <- spark.logit(training, Survived ~ ., regParam = 0.5)
#' summary <- summary(model)
#'
#' # fitted values on training data
Expand All @@ -239,8 +239,7 @@ function(object, path, overwrite = FALSE) {
#'
#' # multinomial logistic regression
#'
#' df <- createDataFrame(iris)
#' model <- spark.logit(df, Species ~ ., regParam = 0.5)
#' model <- spark.logit(training, Class ~ ., regParam = 0.5)
#' summary <- summary(model)
#'
#' }
Expand Down
15 changes: 8 additions & 7 deletions R/pkg/R/mllib_clustering.R
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,9 @@ setClass("LDAModel", representation(jobj = "jobj"))
#' @examples
#' \dontrun{
#' sparkR.session()
#' df <- createDataFrame(iris)
#' model <- spark.bisectingKmeans(df, Sepal_Length ~ Sepal_Width, k = 4)
#' t <- as.data.frame(Titanic)
#' df <- createDataFrame(t)
#' model <- spark.bisectingKmeans(df, Class ~ Survived, k = 4)
#' summary(model)
#'
#' # get fitted result from a bisecting k-means model
Expand All @@ -82,7 +83,7 @@ setClass("LDAModel", representation(jobj = "jobj"))
#'
#' # fitted values on training data
#' fitted <- predict(model, df)
#' head(select(fitted, "Sepal_Length", "prediction"))
#' head(select(fitted, "Class", "prediction"))
#'
#' # save fitted model to input path
#' path <- "path/to/model"
Expand Down Expand Up @@ -338,14 +339,14 @@ setMethod("write.ml", signature(object = "GaussianMixtureModel", path = "charact
#' @examples
#' \dontrun{
#' sparkR.session()
#' data(iris)
#' df <- createDataFrame(iris)
#' model <- spark.kmeans(df, Sepal_Length ~ Sepal_Width, k = 4, initMode = "random")
#' t <- as.data.frame(Titanic)
#' df <- createDataFrame(t)
#' model <- spark.kmeans(df, Class ~ Survived, k = 4, initMode = "random")
#' summary(model)
#'
#' # fitted values on training data
#' fitted <- predict(model, df)
#' head(select(fitted, "Sepal_Length", "prediction"))
#' head(select(fitted, "Class", "prediction"))
#'
#' # save fitted model to input path
#' path <- "path/to/model"
Expand Down
14 changes: 7 additions & 7 deletions R/pkg/R/mllib_regression.R
Original file line number Diff line number Diff line change
Expand Up @@ -68,14 +68,14 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
#' @examples
#' \dontrun{
#' sparkR.session()
#' data(iris)
#' df <- createDataFrame(iris)
#' model <- spark.glm(df, Sepal_Length ~ Sepal_Width, family = "gaussian")
#' t <- as.data.frame(Titanic)
#' df <- createDataFrame(t)
#' model <- spark.glm(df, Freq ~ Sex + Age, family = "gaussian")
#' summary(model)
#'
#' # fitted values on training data
#' fitted <- predict(model, df)
#' head(select(fitted, "Sepal_Length", "prediction"))
#' head(select(fitted, "Freq", "prediction"))
#'
#' # save fitted model to input path
#' path <- "path/to/model"
Expand Down Expand Up @@ -137,9 +137,9 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
#' @examples
#' \dontrun{
#' sparkR.session()
#' data(iris)
#' df <- createDataFrame(iris)
#' model <- glm(Sepal_Length ~ Sepal_Width, df, family = "gaussian")
#' t <- as.data.frame(Titanic)
#' df <- createDataFrame(t)
#' model <- glm(Freq ~ Sex + Age, df, family = "gaussian")
#' summary(model)
#' }
#' @note glm since 1.5.0
Expand Down
18 changes: 10 additions & 8 deletions R/pkg/R/mllib_tree.R
Original file line number Diff line number Diff line change
Expand Up @@ -143,14 +143,15 @@ print.summary.treeEnsemble <- function(x) {
#'
#' # fit a Gradient Boosted Tree Classification Model
#' # label must be binary - Only binary classification is supported for GBT.
#' df <- createDataFrame(iris[iris$Species != "virginica", ])
#' model <- spark.gbt(df, Species ~ Petal_Length + Petal_Width, "classification")
#' t <- as.data.frame(Titanic)
#' df <- createDataFrame(t)
#' model <- spark.gbt(df, Survived ~ Age + Freq, "classification")
#'
#' # numeric label is also supported
#' iris2 <- iris[iris$Species != "virginica", ]
#' iris2$NumericSpecies <- ifelse(iris2$Species == "setosa", 0, 1)
#' df <- createDataFrame(iris2)
#' model <- spark.gbt(df, NumericSpecies ~ ., type = "classification")
#' t2 <- as.data.frame(Titanic)
#' t2$NumericGender <- ifelse(t2$Sex == "Male", 0, 1)
#' df <- createDataFrame(t2)
#' model <- spark.gbt(df, NumericGender ~ ., type = "classification")
#' }
#' @note spark.gbt since 2.1.0
setMethod("spark.gbt", signature(data = "SparkDataFrame", formula = "formula"),
Expand Down Expand Up @@ -351,8 +352,9 @@ setMethod("write.ml", signature(object = "GBTClassificationModel", path = "chara
#' summary(savedModel)
#'
#' # fit a Random Forest Classification Model
#' df <- createDataFrame(iris)
#' model <- spark.randomForest(df, Species ~ Petal_Length + Petal_Width, "classification")
#' t <- as.data.frame(Titanic)
#' df <- createDataFrame(t)
#' model <- spark.randomForest(df, Survived ~ Freq + Age, "classification")
#' }
#' @note spark.randomForest since 2.1.0
setMethod("spark.randomForest", signature(data = "SparkDataFrame", formula = "formula"),
Expand Down
47 changes: 25 additions & 22 deletions R/pkg/vignettes/sparkr-vignettes.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -565,11 +565,10 @@ We use a simple example to demonstrate `spark.logit` usage. In general, there ar
and 3). Obtain the coefficient matrix of the fitted model using `summary` and use the model for prediction with `predict`.

Binomial logistic regression
```{r, warning=FALSE}
df <- createDataFrame(iris)
# Create a DataFrame containing two classes
training <- df[df$Species %in% c("versicolor", "virginica"), ]
model <- spark.logit(training, Species ~ ., regParam = 0.00042)
```{r}
t <- as.data.frame(Titanic)
training <- createDataFrame(t)
model <- spark.logit(training, Survived ~ ., regParam = 0.04741301)
summary(model)
```

Expand All @@ -579,10 +578,11 @@ fitted <- predict(model, training)
```

Multinomial logistic regression against three classes
```{r, warning=FALSE}
df <- createDataFrame(iris)
```{r}
t <- as.data.frame(Titanic)
training <- createDataFrame(t)
# Note in this case, Spark infers it is multinomial logistic regression, so family = "multinomial" is optional.
model <- spark.logit(df, Species ~ ., regParam = 0.056)
model <- spark.logit(training, Class ~ ., regParam = 0.07815179)
summary(model)
```

Expand All @@ -609,11 +609,12 @@ MLPC employs backpropagation for learning the model. We use the logistic loss fu

`spark.mlp` requires at least two columns in `data`: one named `"label"` and the other one `"features"`. The `"features"` column should be in libSVM-format.

We use iris data set to show how to use `spark.mlp` in classification.
```{r, warning=FALSE}
df <- createDataFrame(iris)
We use the Titanic data set to show how to use `spark.mlp` for classification.
```{r}
t <- as.data.frame(Titanic)
training <- createDataFrame(t)
# fit a Multilayer Perceptron Classification Model
model <- spark.mlp(df, Species ~ ., blockSize = 128, layers = c(4, 3), solver = "l-bfgs", maxIter = 100, tol = 0.5, stepSize = 1, seed = 1, initialWeights = c(0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 9, 9, 9, 9, 9))
model <- spark.mlp(training, Survived ~ Age + Sex, blockSize = 128, layers = c(2, 3), solver = "l-bfgs", maxIter = 100, tol = 0.5, stepSize = 1, seed = 1, initialWeights = c( 0, 0, 0, 5, 5, 5, 9, 9, 9))
```

To avoid lengthy display, we only present partial results of the model summary. You can check the full result from your sparkR shell.
Expand All @@ -630,7 +631,7 @@ options(ops)
```
```{r}
# make predictions using the fitted model
predictions <- predict(model, df)
predictions <- predict(model, training)
head(select(predictions, predictions$prediction))
```

Expand Down Expand Up @@ -769,12 +770,13 @@ predictions <- predict(rfModel, df)

`spark.bisectingKmeans` is a kind of [hierarchical clustering](https://en.wikipedia.org/wiki/Hierarchical_clustering) using a divisive (or "top-down") approach: all observations start in one cluster, and splits are performed recursively as one moves down the hierarchy.

```{r, warning=FALSE}
df <- createDataFrame(iris)
model <- spark.bisectingKmeans(df, Sepal_Length ~ Sepal_Width, k = 4)
```{r}
t <- as.data.frame(Titanic)
training <- createDataFrame(t)
model <- spark.bisectingKmeans(training, Class ~ Survived, k = 4)
summary(model)
fitted <- predict(model, df)
head(select(fitted, "Sepal_Length", "prediction"))
fitted <- predict(model, training)
head(select(fitted, "Class", "prediction"))
```

#### Gaussian Mixture Model
Expand Down Expand Up @@ -912,9 +914,10 @@ testSummary

### Model Persistence
The following example shows how to save/load an ML model by SparkR.
```{r, warning=FALSE}
irisDF <- createDataFrame(iris)
gaussianGLM <- spark.glm(irisDF, Sepal_Length ~ Sepal_Width + Species, family = "gaussian")
```{r}
t <- as.data.frame(Titanic)
training <- createDataFrame(t)
gaussianGLM <- spark.glm(training, Freq ~ Sex + Age, family = "gaussian")

# Save and then load a fitted MLlib model
modelPath <- tempfile(pattern = "ml", fileext = ".tmp")
Expand All @@ -925,7 +928,7 @@ gaussianGLM2 <- read.ml(modelPath)
summary(gaussianGLM2)

# Check model prediction
gaussianPredictions <- predict(gaussianGLM2, irisDF)
gaussianPredictions <- predict(gaussianGLM2, training)
head(gaussianPredictions)

unlink(modelPath)
Expand Down
11 changes: 6 additions & 5 deletions examples/src/main/r/ml/bisectingKmeans.R
Original file line number Diff line number Diff line change
Expand Up @@ -25,20 +25,21 @@ library(SparkR)
sparkR.session(appName = "SparkR-ML-bisectingKmeans-example")

# $example on$
irisDF <- createDataFrame(iris)
t <- as.data.frame(Titanic)
training <- createDataFrame(t)

# Fit bisecting k-means model with four centers
model <- spark.bisectingKmeans(df, Sepal_Length ~ Sepal_Width, k = 4)
model <- spark.bisectingKmeans(training, Class ~ Survived, k = 4)

# get fitted result from a bisecting k-means model
fitted.model <- fitted(model, "centers")

# Model summary
summary(fitted.model)
head(summary(fitted.model))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

summary should print without having to do a head here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this case, summary returns a DataFrame. It won't print out the contents of the DataFrame.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok :) so we could add a print.summary.bisectingKMeansModel like other models :)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will do in follow-up PR.


# fitted values on training data
fitted <- predict(model, df)
head(select(fitted, "Sepal_Length", "prediction"))
fitted <- predict(model, training)
head(select(fitted, "Class", "prediction"))
# $example off$

sparkR.session.stop()
20 changes: 11 additions & 9 deletions examples/src/main/r/ml/glm.R
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,12 @@ library(SparkR)
sparkR.session(appName = "SparkR-ML-glm-example")

# $example on$
irisDF <- suppressWarnings(createDataFrame(iris))
training <- read.df("data/mllib/sample_multiclass_classification_data.txt", source = "libsvm")
# Fit a generalized linear model of family "gaussian" with spark.glm
gaussianDF <- irisDF
gaussianTestDF <- irisDF
gaussianGLM <- spark.glm(gaussianDF, Sepal_Length ~ Sepal_Width + Species, family = "gaussian")
df_list <- randomSplit(training, c(7,3), 2)
gaussianDF <- df_list[[1]]
gaussianTestDF <- df_list[[2]]
gaussianGLM <- spark.glm(gaussianDF, label ~ features, family = "gaussian")

# Model summary
summary(gaussianGLM)
Expand All @@ -39,14 +40,15 @@ gaussianPredictions <- predict(gaussianGLM, gaussianTestDF)
head(gaussianPredictions)

# Fit a generalized linear model with glm (R-compliant)
gaussianGLM2 <- glm(Sepal_Length ~ Sepal_Width + Species, gaussianDF, family = "gaussian")
gaussianGLM2 <- glm(label ~ features, gaussianDF, family = "gaussian")
summary(gaussianGLM2)

# Fit a generalized linear model of family "binomial" with spark.glm
# Note: Filter out "setosa" from label column (two labels left) to match "binomial" family.
binomialDF <- filter(irisDF, irisDF$Species != "setosa")
binomialTestDF <- binomialDF
binomialGLM <- spark.glm(binomialDF, Species ~ Sepal_Length + Sepal_Width, family = "binomial")
training2 <- read.df("data/mllib/sample_binary_classification_data.txt", source = "libsvm")
df_list2 <- randomSplit(training2, c(7,3), 2)
binomialDF <- df_list2[[1]]
binomialTestDF <- df_list2[[2]]
binomialGLM <- spark.glm(binomialDF, label ~ features, family = "binomial")

# Model summary
summary(binomialGLM)
Expand Down
10 changes: 6 additions & 4 deletions examples/src/main/r/ml/kmeans.R
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,12 @@ sparkR.session(appName = "SparkR-ML-kmeans-example")

# $example on$
# Fit a k-means model with spark.kmeans
irisDF <- suppressWarnings(createDataFrame(iris))
kmeansDF <- irisDF
kmeansTestDF <- irisDF
kmeansModel <- spark.kmeans(kmeansDF, ~ Sepal_Length + Sepal_Width + Petal_Length + Petal_Width,
t <- as.data.frame(Titanic)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

kmeans_data.txt and sample_kmeans_data.txt have fewer data points than Titanic. So in this case, I am still using the Titanic dataset.

training <- createDataFrame(t)
df_list <- randomSplit(training, c(7,3), 2)
kmeansDF <- df_list[[1]]
kmeansTestDF <- df_list[[2]]
kmeansModel <- spark.kmeans(kmeansDF, ~ Class + Sex + Age + Freq,
k = 3)

# Model summary
Expand Down
9 changes: 5 additions & 4 deletions examples/src/main/r/ml/ml.R
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,12 @@ sparkR.session(appName = "SparkR-ML-example")

############################ model read/write ##############################################
# $example on:read_write$
irisDF <- suppressWarnings(createDataFrame(iris))
training <- read.df("data/mllib/sample_multiclass_classification_data.txt", source = "libsvm")
# Fit a generalized linear model of family "gaussian" with spark.glm
gaussianDF <- irisDF
gaussianTestDF <- irisDF
gaussianGLM <- spark.glm(gaussianDF, Sepal_Length ~ Sepal_Width + Species, family = "gaussian")
df_list <- randomSplit(training, c(7,3), 2)
gaussianDF <- df_list[[1]]
gaussianTestDF <- df_list[[2]]
gaussianGLM <- spark.glm(gaussianDF, label ~ features, family = "gaussian")

# Save and then load a fitted MLlib model
modelPath <- tempfile(pattern = "ml", fileext = ".tmp")
Expand Down