192 changes: 192 additions & 0 deletions docs/algorithms-classification.md
@@ -127,6 +127,17 @@ Eqs. (1) and (2).
### Usage

<div class="codetabs">
<div data-lang="Python" markdown="1">
{% highlight python %}
import SystemML as sml
# C = 1/reg
logistic = sml.mllearn.LogisticRegression(sqlCtx, fit_intercept=True, max_iter=100, max_inner_iter=0, tol=0.000001, C=1.0)
# X_train, y_train and X_test can be NumPy matrices, Pandas DataFrames or SciPy sparse matrices
y_test = logistic.fit(X_train, y_train).predict(X_test)
# df_train is a DataFrame with two columns: "features" (of type Vector) and "label"; df_test is a DataFrame with a "features" column
y_test = logistic.fit(df_train).transform(df_test)
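# A hypothetical illustration (an assumption, not from the original docs):
# since C = 1/reg, a regularization strength of reg = 0.01 corresponds to C = 100.
logistic_strong = sml.mllearn.LogisticRegression(sqlCtx, C=1.0/0.01)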
{% endhighlight %}
</div>
<div data-lang="Hadoop" markdown="1">
hadoop jar SystemML.jar -f MultiLogReg.dml
-nvargs X=<file>
@@ -214,6 +225,58 @@ SystemML Language Reference for details.
### Examples

<div class="codetabs">
<div data-lang="Python" markdown="1">
{% highlight python %}
# Scikit-learn way
from sklearn import datasets, neighbors
import SystemML as sml
from pyspark.sql import SQLContext
sqlCtx = SQLContext(sc)
digits = datasets.load_digits()
X_digits = digits.data
y_digits = digits.target + 1
n_samples = len(X_digits)
n_train = int(.9 * n_samples)  # slice indices must be integers
X_train = X_digits[:n_train]
y_train = y_digits[:n_train]
X_test = X_digits[n_train:]
y_test = y_digits[n_train:]
logistic = sml.mllearn.LogisticRegression(sqlCtx)
print('LogisticRegression score: %f' % logistic.fit(X_train, y_train).score(X_test, y_test))
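# score() follows the scikit-learn convention: it returns the accuracy of the
# fitted model on the held-out split.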

# MLPipeline way
from pyspark.ml import Pipeline
import SystemML as sml
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.sql import SQLContext
sqlCtx = SQLContext(sc)
training = sqlCtx.createDataFrame([
(0L, "a b c d e spark", 1.0),
(1L, "b d", 2.0),
(2L, "spark f g h", 1.0),
(3L, "hadoop mapreduce", 2.0),
(4L, "b spark who", 1.0),
(5L, "g d a y", 2.0),
(6L, "spark fly", 1.0),
(7L, "was mapreduce", 2.0),
(8L, "e spark program", 1.0),
(9L, "a e c l", 2.0),
(10L, "spark compile", 1.0),
(11L, "hadoop software", 2.0)
], ["id", "text", "label"])
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20)
lr = sml.mllearn.LogisticRegression(sqlCtx)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
model = pipeline.fit(training)
test = sqlCtx.createDataFrame([
(12L, "spark i j k"),
(13L, "l m n"),
(14L, "mapreduce spark"),
(15L, "apache hadoop")], ["id", "text"])
prediction = model.transform(test)
prediction.show()
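# Optional (an assumption, not part of the original example): show just the
# columns of interest from the prediction DataFrame.
prediction.select("id", "text", "prediction").show()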
{% endhighlight %}
</div>
<div data-lang="Hadoop" markdown="1">
hadoop jar SystemML.jar -f MultiLogReg.dml
-nvargs X=/user/ml/X.mtx
@@ -393,6 +456,17 @@ support vector machine (`y` with domain size `2`).
**Binary-Class Support Vector Machines**:

<div class="codetabs">
<div data-lang="Python" markdown="1">
{% highlight python %}
import SystemML as sml
# C = 1/reg
svm = sml.mllearn.SVM(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, is_multi_class=False)
# X_train, y_train and X_test can be NumPy matrices, Pandas DataFrames or SciPy sparse matrices
y_test = svm.fit(X_train, y_train).predict(X_test)
# df_train is a DataFrame with two columns: "features" (of type Vector) and "label"; df_test is a DataFrame with a "features" column
y_test = svm.fit(df_train).transform(df_test)
{% endhighlight %}
</div>
<div data-lang="Hadoop" markdown="1">
hadoop jar SystemML.jar -f l2-svm.dml
-nvargs X=<file>
@@ -428,6 +502,14 @@ support vector machine (`y` with domain size `2`).
**Binary-Class Support Vector Machines Prediction**:

<div class="codetabs">
<div data-lang="Python" markdown="1">
{% highlight python %}
# X_test can be a NumPy matrix, Pandas DataFrame or SciPy sparse matrix
y_test = svm.predict(X_test)
# df_test is a DataFrame that contains a "features" column of type Vector
y_test = svm.transform(df_test)
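# A minimal evaluation sketch (assumes ground-truth labels y_true, which the
# original snippet does not define):
from sklearn.metrics import accuracy_score
print('Accuracy: %f' % accuracy_score(y_true, y_test))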
{% endhighlight %}
</div>
<div data-lang="Hadoop" markdown="1">
hadoop jar SystemML.jar -f l2-svm-predict.dml
-nvargs X=<file>
@@ -630,6 +712,17 @@ class labels.
**Multi-Class Support Vector Machines**:

<div class="codetabs">
<div data-lang="Python" markdown="1">
{% highlight python %}
import SystemML as sml
# C = 1/reg
svm = sml.mllearn.SVM(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, is_multi_class=True)
# X_train, y_train and X_test can be NumPy matrices, Pandas DataFrames or SciPy sparse matrices
y_test = svm.fit(X_train, y_train).predict(X_test)
# df_train is a DataFrame with two columns: "features" (of type Vector) and "label"; df_test is a DataFrame with a "features" column
y_test = svm.fit(df_train).transform(df_test)
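# Note: is_multi_class=True selects the multi-class formulation; the remaining
# parameters mirror the binary-class variant above.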
{% endhighlight %}
</div>
<div data-lang="Hadoop" markdown="1">
hadoop jar SystemML.jar -f m-svm.dml
-nvargs X=<file>
@@ -665,6 +758,14 @@ class labels.
**Multi-Class Support Vector Machines Prediction**:

<div class="codetabs">
<div data-lang="Python" markdown="1">
{% highlight python %}
# X_test can be a NumPy matrix, Pandas DataFrame or SciPy sparse matrix
y_test = svm.predict(X_test)
# df_test is a DataFrame that contains a "features" column of type Vector
y_test = svm.transform(df_test)
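# Alternatively (a sketch assuming ground-truth labels y_true are available),
# the estimator also exposes a scikit-learn-style score() method:
acc = svm.score(X_test, y_true)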
{% endhighlight %}
</div>
<div data-lang="Hadoop" markdown="1">
hadoop jar SystemML.jar -f m-svm-predict.dml
-nvargs X=<file>
@@ -747,6 +848,58 @@ SystemML Language Reference for details.
**Multi-Class Support Vector Machines**:

<div class="codetabs">
<div data-lang="Python" markdown="1">
{% highlight python %}
# Scikit-learn way
from sklearn import datasets, neighbors
import SystemML as sml
from pyspark.sql import SQLContext
sqlCtx = SQLContext(sc)
digits = datasets.load_digits()
X_digits = digits.data
y_digits = digits.target + 1  # labels start from 1, as in the logistic regression example above
n_samples = len(X_digits)
n_train = int(.9 * n_samples)  # slice indices must be integers
X_train = X_digits[:n_train]
y_train = y_digits[:n_train]
X_test = X_digits[n_train:]
y_test = y_digits[n_train:]
svm = sml.mllearn.SVM(sqlCtx, is_multi_class=True)
print('SVM score: %f' % svm.fit(X_train, y_train).score(X_test, y_test))

# MLPipeline way
from pyspark.ml import Pipeline
import SystemML as sml
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.sql import SQLContext
sqlCtx = SQLContext(sc)
training = sqlCtx.createDataFrame([
(0L, "a b c d e spark", 1.0),
(1L, "b d", 2.0),
(2L, "spark f g h", 1.0),
(3L, "hadoop mapreduce", 2.0),
(4L, "b spark who", 1.0),
(5L, "g d a y", 2.0),
(6L, "spark fly", 1.0),
(7L, "was mapreduce", 2.0),
(8L, "e spark program", 1.0),
(9L, "a e c l", 2.0),
(10L, "spark compile", 1.0),
(11L, "hadoop software", 2.0)
], ["id", "text", "label"])
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20)
svm = sml.mllearn.SVM(sqlCtx, is_multi_class=True)
pipeline = Pipeline(stages=[tokenizer, hashingTF, svm])
model = pipeline.fit(training)
test = sqlCtx.createDataFrame([
(12L, "spark i j k"),
(13L, "l m n"),
(14L, "mapreduce spark"),
(15L, "apache hadoop")], ["id", "text"])
prediction = model.transform(test)
prediction.show()
{% endhighlight %}
</div>
<div data-lang="Hadoop" markdown="1">
hadoop jar SystemML.jar -f m-svm.dml
-nvargs X=/user/ml/X.mtx
@@ -871,6 +1024,16 @@ applicable when all features are counts of categorical values.
**Naive Bayes**:

<div class="codetabs">
<div data-lang="Python" markdown="1">
{% highlight python %}
import SystemML as sml
nb = sml.mllearn.NaiveBayes(sqlCtx, laplace=1.0)
# X_train, y_train and X_test can be NumPy matrices, Pandas DataFrames or SciPy sparse matrices
y_test = nb.fit(X_train, y_train).predict(X_test)
# df_train is a DataFrame with two columns: "features" (of type Vector) and "label"; df_test is a DataFrame with a "features" column
y_test = nb.fit(df_train).transform(df_test)
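# A hypothetical variant (an assumption, not in the original docs): laplace is
# the additive-smoothing pseudo-count, so larger values smooth the per-class
# count estimates more aggressively.
nb_smooth = sml.mllearn.NaiveBayes(sqlCtx, laplace=10.0)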
{% endhighlight %}
</div>
<div data-lang="Hadoop" markdown="1">
hadoop jar SystemML.jar -f naive-bayes.dml
-nvargs X=<file>
@@ -902,6 +1065,14 @@ applicable when all features are counts of categorical values.
**Naive Bayes Prediction**:

<div class="codetabs">
<div data-lang="Python" markdown="1">
{% highlight python %}
# X_test can be a NumPy matrix, Pandas DataFrame or SciPy sparse matrix
y_test = nb.predict(X_test)
# df_test is a DataFrame that contains a "features" column of type Vector
y_test = nb.transform(df_test)
{% endhighlight %}
</div>
<div data-lang="Hadoop" markdown="1">
hadoop jar SystemML.jar -f naive-bayes-predict.dml
-nvargs X=<file>
@@ -974,6 +1145,27 @@ SystemML Language Reference for details.
**Naive Bayes**:

<div class="codetabs">
<div data-lang="Python" markdown="1">
{% highlight python %}
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
import SystemML as sml
from sklearn import metrics
from pyspark.sql import SQLContext
sqlCtx = SQLContext(sc)
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)
vectorizer = TfidfVectorizer()
# Both vectors and vectors_test are SciPy CSR matrices
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors_test = vectorizer.transform(newsgroups_test.data)
nb = sml.mllearn.NaiveBayes(sqlCtx)
nb.fit(vectors, newsgroups_train.target)
pred = nb.predict(vectors_test)
metrics.f1_score(newsgroups_test.target, pred, average='weighted')
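# A minor addition (not in the original example): print the score so it is
# visible when run as a script rather than in an interactive shell.
print('Weighted F1: %f' % metrics.f1_score(newsgroups_test.target, pred, average='weighted'))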
{% endhighlight %}
</div>
<div data-lang="Hadoop" markdown="1">
hadoop jar SystemML.jar -f naive-bayes.dml
-nvargs X=/user/ml/X.mtx
70 changes: 70 additions & 0 deletions docs/algorithms-regression.md
@@ -80,6 +80,17 @@ efficient when the number of features $m$ is relatively small
**Linear Regression - Direct Solve**:

<div class="codetabs">
<div data-lang="Python" markdown="1">
{% highlight python %}
import SystemML as sml
# C = 1/reg
lr = sml.mllearn.LinearRegression(sqlCtx, fit_intercept=True, C=1.0, solver='direct-solve')
# X_train, y_train and X_test can be NumPy matrices, Pandas DataFrames or SciPy sparse matrices
y_test = lr.fit(X_train, y_train).predict(X_test)
# df_train is a DataFrame with two columns: "features" (of type Vector) and "label"; df_test is a DataFrame with a "features" column
y_test = lr.fit(df_train).transform(df_test)
{% endhighlight %}
</div>
<div data-lang="Hadoop" markdown="1">
hadoop jar SystemML.jar -f LinearRegDS.dml
-nvargs X=<file>
@@ -111,6 +122,17 @@ efficient when the number of features $m$ is relatively small
**Linear Regression - Conjugate Gradient**:

<div class="codetabs">
<div data-lang="Python" markdown="1">
{% highlight python %}
import SystemML as sml
# C = 1/reg
lr = sml.mllearn.LinearRegression(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, solver='newton-cg')
# X_train, y_train and X_test can be NumPy matrices, Pandas DataFrames or SciPy sparse matrices
y_test = lr.fit(X_train, y_train).predict(X_test)
# df_train is a DataFrame with two columns: "features" (of type Vector) and "label"; df_test is a DataFrame with a "features" column
y_test = lr.fit(df_train).transform(df_test)
{% endhighlight %}
</div>
<div data-lang="Hadoop" markdown="1">
hadoop jar SystemML.jar -f LinearRegCG.dml
-nvargs X=<file>
@@ -196,6 +218,30 @@ SystemML Language Reference for details.
**Linear Regression - Direct Solve**:

<div class="codetabs">
<div data-lang="Python" markdown="1">
{% highlight python %}
import numpy as np
from sklearn import datasets
import SystemML as sml
from pyspark.sql import SQLContext
# Load the diabetes dataset
diabetes = datasets.load_diabetes()
# Use only one feature
diabetes_X = diabetes.data[:, np.newaxis, 2]
# Split the data into training/testing sets
diabetes_X_train = diabetes_X[:-20]
diabetes_X_test = diabetes_X[-20:]
# Split the targets into training/testing sets
diabetes_y_train = diabetes.target[:-20]
diabetes_y_test = diabetes.target[-20:]
# Create linear regression object
regr = sml.mllearn.LinearRegression(sqlCtx, solver='direct-solve')
# Train the model using the training sets
regr.fit(diabetes_X_train, diabetes_y_train)
# Compute the mean squared error on the test set
print("Mean squared error: %.2f" % np.mean((regr.predict(diabetes_X_test) - diabetes_y_test) ** 2))
{% endhighlight %}
</div>
<div data-lang="Hadoop" markdown="1">
hadoop jar SystemML.jar -f LinearRegDS.dml
-nvargs X=/user/ml/X.mtx
@@ -227,6 +273,30 @@ SystemML Language Reference for details.
**Linear Regression - Conjugate Gradient**:

<div class="codetabs">
<div data-lang="Python" markdown="1">
{% highlight python %}
import numpy as np
from sklearn import datasets
import SystemML as sml
from pyspark.sql import SQLContext
# Load the diabetes dataset
diabetes = datasets.load_diabetes()
# Use only one feature
diabetes_X = diabetes.data[:, np.newaxis, 2]
# Split the data into training/testing sets
diabetes_X_train = diabetes_X[:-20]
diabetes_X_test = diabetes_X[-20:]
# Split the targets into training/testing sets
diabetes_y_train = diabetes.target[:-20]
diabetes_y_test = diabetes.target[-20:]
# Create linear regression object
regr = sml.mllearn.LinearRegression(sqlCtx, solver='newton-cg')
# Train the model using the training sets
regr.fit(diabetes_X_train, diabetes_y_train)
# Compute the mean squared error on the test set
print("Mean squared error: %.2f" % np.mean((regr.predict(diabetes_X_test) - diabetes_y_test) ** 2))
{% endhighlight %}
</div>
<div data-lang="Hadoop" markdown="1">
hadoop jar SystemML.jar -f LinearRegCG.dml
-nvargs X=/user/ml/X.mtx
5 changes: 4 additions & 1 deletion scripts/algorithms/l2-svm.dml
@@ -160,4 +160,7 @@ extra_model_params[4,1] = dimensions
w = t(append(t(w), t(extra_model_params)))
write(w, $model, format=cmdLine_fmt)

-write(debug_str, $Log)
+logFile = $Log
+if(logFile != " ") {
+    write(debug_str, logFile)
+}
5 changes: 4 additions & 1 deletion scripts/algorithms/m-svm.dml
@@ -175,4 +175,7 @@ for(iter_class in 1:ncol(debug_mat)){
debug_str = append(debug_str, iter_class + "," + iter + "," + obj)
}
}
-write(debug_str, $Log)
+logFile = $Log
+if(logFile != " ") {
+    write(debug_str, logFile)
+}