From c40a18cdb2521ed63341bc2b5569901e49547507 Mon Sep 17 00:00:00 2001 From: movelikeriver Date: Mon, 8 Feb 2016 15:54:47 -0800 Subject: [PATCH 1/2] Refine naive Bayes example by checking model after loading it --- .../main/python/mllib/naive_bayes_example.py | 26 ++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/examples/src/main/python/mllib/naive_bayes_example.py b/examples/src/main/python/mllib/naive_bayes_example.py index f5e120c678fc..1f17094b2770 100644 --- a/examples/src/main/python/mllib/naive_bayes_example.py +++ b/examples/src/main/python/mllib/naive_bayes_example.py @@ -17,9 +17,16 @@ """ NaiveBayes Example. + +Usage: + `spark-submit --master local[4] examples/src/main/python/mllib/naive_bayes_example.py` """ + from __future__ import print_function +from os import path +import shutil + from pyspark import SparkContext # $example on$ from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel @@ -38,8 +45,12 @@ def parseLine(line): sc = SparkContext(appName="PythonNaiveBayesExample") + WORK_DIR = './' + # $example on$ - data = sc.textFile('data/mllib/sample_naive_bayes_data.txt').map(parseLine) + data = sc.textFile(path.join(WORK_DIR, + 'data/mllib/sample_naive_bayes_data.txt') + ).map(parseLine) # Split data aproximately into training (60%) and test (40%) training, test = data.randomSplit([0.6, 0.4], seed=0) @@ -50,8 +61,17 @@ def parseLine(line): # Make prediction and test accuracy. 
predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label)) accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count() + print('\n\tmodel accuracy %.4f\n' % accuracy) # Save and load model - model.save(sc, "target/tmp/myNaiveBayesModel") - sameModel = NaiveBayesModel.load(sc, "target/tmp/myNaiveBayesModel") + output_dir = '/tmp/myNaiveBayesModel' + shutil.rmtree(output_dir, ignore_errors=True) + model.save(sc, output_dir) + print('\n\tSaved to path %s\n' % output_dir) + sameModel = NaiveBayesModel.load(sc, output_dir) + print('\n\tLoaded from path %s\n' % output_dir) + predictionAndLabel = test.map(lambda p: (sameModel.predict(p.features), p.label)) + accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count() + print('\n\tsameModel accuracy %.4f\n' % accuracy) + # $example off$ From 7e3ea32fdd51f2e5a631602b23576b6330d9f112 Mon Sep 17 00:00:00 2001 From: movelikeriver Date: Thu, 11 Feb 2016 17:47:17 -0800 Subject: [PATCH 2/2] Address review comments: restore relative paths, simplify accuracy output --- .../src/main/python/mllib/naive_bayes_example.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/examples/src/main/python/mllib/naive_bayes_example.py b/examples/src/main/python/mllib/naive_bayes_example.py index 1f17094b2770..e7d5893d6741 100644 --- a/examples/src/main/python/mllib/naive_bayes_example.py +++ b/examples/src/main/python/mllib/naive_bayes_example.py @@ -24,7 +24,6 @@ from __future__ import print_function -from os import path import shutil from pyspark import SparkContext @@ -45,12 +44,8 @@ def parseLine(line): sc = SparkContext(appName="PythonNaiveBayesExample") - WORK_DIR = './' - # $example on$ - data = sc.textFile(path.join(WORK_DIR, - 'data/mllib/sample_naive_bayes_data.txt') - ).map(parseLine) + data = sc.textFile('data/mllib/sample_naive_bayes_data.txt').map(parseLine) # Split data aproximately into training (60%) and test (40%) training, test = data.randomSplit([0.6, 0.4], seed=0) @@ -61,17 
+56,15 @@ def parseLine(line): # Make prediction and test accuracy. predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label)) accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count() - print('\n\tmodel accuracy %.4f\n' % accuracy) + print('model accuracy {}'.format(accuracy)) # Save and load model - output_dir = '/tmp/myNaiveBayesModel' + output_dir = 'target/tmp/myNaiveBayesModel' shutil.rmtree(output_dir, ignore_errors=True) model.save(sc, output_dir) - print('\n\tSaved to path %s\n' % output_dir) sameModel = NaiveBayesModel.load(sc, output_dir) - print('\n\tLoaded from path %s\n' % output_dir) predictionAndLabel = test.map(lambda p: (sameModel.predict(p.features), p.label)) accuracy = 1.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count() - print('\n\tsameModel accuracy %.4f\n' % accuracy) + print('sameModel accuracy {}'.format(accuracy)) # $example off$