Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/_layouts/global.html
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@
<li><b>Language Guides:</b></li>
<li><a href="dml-language-reference.html">DML Language Reference</a></li>
<li><a href="beginners-guide-to-dml-and-pydml.html">Beginner's Guide to DML and PyDML</a></li>
<li><a href="beginners-guide-python.html">Beginner's Guide for Python users</a></li>
<li class="divider"></li>
<li><b>ML Algorithms:</b></li>
<li><a href="algorithms-reference.html">Algorithms Reference</a></li>
Expand Down
36 changes: 18 additions & 18 deletions docs/algorithms-classification.md
Original file line number Diff line number Diff line change
Expand Up @@ -129,9 +129,9 @@ Eqs. (1) and (2).
<div class="codetabs">
<div data-lang="Python" markdown="1">
{% highlight python %}
import SystemML as sml
from SystemML.mllearn import LogisticRegression
# C = 1/reg
logistic = sml.mllearn.LogisticRegression(sqlCtx, fit_intercept=True, max_iter=100, max_inner_iter=0, tol=0.000001, C=1.0)
logistic = LogisticRegression(sqlCtx, fit_intercept=True, max_iter=100, max_inner_iter=0, tol=0.000001, C=1.0)
# X_train, y_train and X_test can be NumPy matrices, Pandas DataFrames or SciPy sparse matrices
y_test = logistic.fit(X_train, y_train).predict(X_test)
# df_train is a DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features"
Expand Down Expand Up @@ -229,7 +229,7 @@ SystemML Language Reference for details.
{% highlight python %}
# Scikit-learn way
from sklearn import datasets, neighbors
import SystemML as sml
from SystemML.mllearn import LogisticRegression
from pyspark.sql import SQLContext
sqlCtx = SQLContext(sc)
digits = datasets.load_digits()
Expand All @@ -240,12 +240,12 @@ X_train = X_digits[:.9 * n_samples]
y_train = y_digits[:.9 * n_samples]
X_test = X_digits[.9 * n_samples:]
y_test = y_digits[.9 * n_samples:]
logistic = sml.mllearn.LogisticRegression(sqlCtx)
logistic = LogisticRegression(sqlCtx)
print('LogisticRegression score: %f' % logistic.fit(X_train, y_train).score(X_test, y_test))

# MLPipeline way
from pyspark.ml import Pipeline
import SystemML as sml
from SystemML.mllearn import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.sql import SQLContext
sqlCtx = SQLContext(sc)
Expand All @@ -265,7 +265,7 @@ training = sqlCtx.createDataFrame([
], ["id", "text", "label"])
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20)
lr = sml.mllearn.LogisticRegression(sqlCtx)
lr = LogisticRegression(sqlCtx)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
model = pipeline.fit(training)
test = sqlCtx.createDataFrame([
Expand Down Expand Up @@ -458,9 +458,9 @@ support vector machine (`y` with domain size `2`).
<div class="codetabs">
<div data-lang="Python" markdown="1">
{% highlight python %}
import SystemML as sml
from SystemML.mllearn import SVM
# C = 1/reg
svm = sml.mllearn.SVM(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, is_multi_class=False)
svm = SVM(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, is_multi_class=False)
# X_train, y_train and X_test can be NumPy matrices, Pandas DataFrames or SciPy sparse matrices
y_test = svm.fit(X_train, y_train)
# df_train is a DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features"
Expand Down Expand Up @@ -714,9 +714,9 @@ class labels.
<div class="codetabs">
<div data-lang="Python" markdown="1">
{% highlight python %}
import SystemML as sml
from SystemML.mllearn import SVM
# C = 1/reg
svm = sml.mllearn.SVM(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, is_multi_class=True)
svm = SVM(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, is_multi_class=True)
# X_train, y_train and X_test can be NumPy matrices, Pandas DataFrames or SciPy sparse matrices
y_test = svm.fit(X_train, y_train)
# df_train is a DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features"
Expand Down Expand Up @@ -852,7 +852,7 @@ SystemML Language Reference for details.
{% highlight python %}
# Scikit-learn way
from sklearn import datasets, neighbors
import SystemML as sml
from SystemML.mllearn import SVM
from pyspark.sql import SQLContext
sqlCtx = SQLContext(sc)
digits = datasets.load_digits()
Expand All @@ -863,12 +863,12 @@ X_train = X_digits[:.9 * n_samples]
y_train = y_digits[:.9 * n_samples]
X_test = X_digits[.9 * n_samples:]
y_test = y_digits[.9 * n_samples:]
svm = sml.mllearn.SVM(sqlCtx, is_multi_class=True)
svm = SVM(sqlCtx, is_multi_class=True)
print('SVM score: %f' % svm.fit(X_train, y_train).score(X_test, y_test))

# MLPipeline way
from pyspark.ml import Pipeline
import SystemML as sml
from SystemML.mllearn import SVM
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.sql import SQLContext
sqlCtx = SQLContext(sc)
Expand All @@ -888,7 +888,7 @@ training = sqlCtx.createDataFrame([
], ["id", "text", "label"])
tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20)
svm = sml.mllearn.SVM(sqlCtx, is_multi_class=True)
svm = SVM(sqlCtx, is_multi_class=True)
pipeline = Pipeline(stages=[tokenizer, hashingTF, svm])
model = pipeline.fit(training)
test = sqlCtx.createDataFrame([
Expand Down Expand Up @@ -1026,8 +1026,8 @@ applicable when all features are counts of categorical values.
<div class="codetabs">
<div data-lang="Python" markdown="1">
{% highlight python %}
import SystemML as sml
nb = sml.mllearn.NaiveBayes(sqlCtx, laplace=1.0)
from SystemML.mllearn import NaiveBayes
nb = NaiveBayes(sqlCtx, laplace=1.0)
# X_train, y_train and X_test can be NumPy matrices, Pandas DataFrames or SciPy sparse matrices
y_test = nb.fit(X_train, y_train)
# df_train is a DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features"
Expand Down Expand Up @@ -1149,7 +1149,7 @@ SystemML Language Reference for details.
{% highlight python %}
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
import SystemML as sml
from SystemML.mllearn import NaiveBayes
from sklearn import metrics
from pyspark.sql import SQLContext
sqlCtx = SQLContext(sc)
Expand All @@ -1160,7 +1160,7 @@ vectorizer = TfidfVectorizer()
# Both vectors and vectors_test are SciPy CSR matrix
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors_test = vectorizer.transform(newsgroups_test.data)
nb = sml.mllearn.NaiveBayes(sqlCtx)
nb = NaiveBayes(sqlCtx)
nb.fit(vectors, newsgroups_train.target)
pred = nb.predict(vectors_test)
metrics.f1_score(newsgroups_test.target, pred, average='weighted')
Expand Down
16 changes: 8 additions & 8 deletions docs/algorithms-regression.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,9 +82,9 @@ efficient when the number of features $m$ is relatively small
<div class="codetabs">
<div data-lang="Python" markdown="1">
{% highlight python %}
import SystemML as sml
from SystemML.mllearn import LinearRegression
# C = 1/reg
lr = sml.mllearn.LinearRegression(sqlCtx, fit_intercept=True, C=1.0, solver='direct-solve')
lr = LinearRegression(sqlCtx, fit_intercept=True, C=1.0, solver='direct-solve')
# X_train, y_train and X_test can be NumPy matrices, Pandas DataFrames or SciPy sparse matrices
y_test = lr.fit(X_train, y_train)
# df_train is a DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features"
Expand Down Expand Up @@ -124,9 +124,9 @@ y_test = lr.fit(df_train)
<div class="codetabs">
<div data-lang="Python" markdown="1">
{% highlight python %}
import SystemML as sml
from SystemML.mllearn import LinearRegression
# C = 1/reg
lr = sml.mllearn.LinearRegression(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, solver='newton-cg')
lr = LinearRegression(sqlCtx, fit_intercept=True, max_iter=100, tol=0.000001, C=1.0, solver='newton-cg')
# X_train, y_train and X_test can be NumPy matrices, Pandas DataFrames or SciPy sparse matrices
y_test = lr.fit(X_train, y_train)
# df_train is a DataFrame that contains two columns: "features" (of type Vector) and "label". df_test is a DataFrame that contains the column "features"
Expand Down Expand Up @@ -222,7 +222,7 @@ SystemML Language Reference for details.
{% highlight python %}
import numpy as np
from sklearn import datasets
import SystemML as sml
from SystemML.mllearn import LinearRegression
from pyspark.sql import SQLContext
# Load the diabetes dataset
diabetes = datasets.load_diabetes()
Expand All @@ -235,7 +235,7 @@ diabetes_X_test = diabetes_X[-20:]
diabetes_y_train = diabetes.target[:-20]
diabetes_y_test = diabetes.target[-20:]
# Create linear regression object
regr = sml.mllearn.LinearRegression(sqlCtx, solver='direct-solve')
regr = LinearRegression(sqlCtx, solver='direct-solve')
# Train the model using the training sets
regr.fit(diabetes_X_train, diabetes_y_train)
# The mean square error
Expand Down Expand Up @@ -277,7 +277,7 @@ print("Residual sum of squares: %.2f" % np.mean((regr.predict(diabetes_X_test) -
{% highlight python %}
import numpy as np
from sklearn import datasets
import SystemML as sml
from SystemML.mllearn import LinearRegression
from pyspark.sql import SQLContext
# Load the diabetes dataset
diabetes = datasets.load_diabetes()
Expand All @@ -290,7 +290,7 @@ diabetes_X_test = diabetes_X[-20:]
diabetes_y_train = diabetes.target[:-20]
diabetes_y_test = diabetes.target[-20:]
# Create linear regression object
regr = sml.mllearn.LinearRegression(sqlCtx, solver='newton-cg')
regr = LinearRegression(sqlCtx, solver='newton-cg')
# Train the model using the training sets
regr.fit(diabetes_X_train, diabetes_y_train)
# The mean square error
Expand Down
Loading