Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Final code for linear regression and SVR, and condo/residential datasets #16

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
774 changes: 771 additions & 3 deletions code/regression/regression.ipynb

Large diffs are not rendered by default.

345 changes: 345 additions & 0 deletions code/svm_regression/.ipynb_checkpoints/SVM_RBF-checkpoint.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,345 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#\n",
"# Import needed packages\n",
"#\n",
"import numpy as np\n",
"import time\n",
"import pandas as pd\n",
"import csv\n",
"from numpy import genfromtxt\n",
"from sklearn import cross_validation\n",
"from sklearn.cross_validation import KFold\n",
"from sklearn.grid_search import GridSearchCV\n",
"from sklearn.learning_curve import learning_curve\n",
"from sklearn.svm import SVR\n",
"from sklearn import preprocessing\n",
"from sklearn.preprocessing import Imputer\n",
"from sklearn import metrics\n",
"import matplotlib.pyplot as plt\n",
"import matplotlib\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of data: 4844 \n",
"Number of variables: 110\n"
]
}
],
"source": [
"#\n",
"# Read data\n",
"#\n",
"dataset = genfromtxt('../../data/realestate/realestate.csv', delimiter=',')\n",
"dataset = dataset[1:,:]\n",
"\n",
"ndata=dataset.shape[0]\n",
"nvar=dataset.shape[1]\n",
"\n",
"X = dataset[:,:nvar-1]\n",
"y = dataset[:,nvar-1]\n",
"nvar = nvar - 1\n",
"\n",
"print('Number of data: %d \\nNumber of variables: %d' % (ndata,nvar) )"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[ 1.65500000e+05 5.35870927e+01 -1.13441766e+02 ..., 2.02500000e+03\n",
" 1.60500000e+03 2.84100000e+03]\n",
" [ 4.75500000e+05 5.36128236e+01 -1.13430047e+02 ..., 1.93000000e+03\n",
" 1.48000000e+03 2.09700000e+03]\n",
" [ 2.68000000e+05 5.35955564e+01 -1.13378465e+02 ..., 2.48500000e+03\n",
" 2.56900000e+03 1.60800000e+03]\n",
" ..., \n",
" [ 1.54000000e+05 5.34268642e+01 -1.13456094e+02 ..., 4.70400000e+03\n",
" 5.43900000e+03 1.71500000e+03]\n",
" [ 1.50500000e+05 5.34268642e+01 -1.13456094e+02 ..., 4.70400000e+03\n",
" 5.43900000e+03 1.71500000e+03]\n",
" [ 1.56000000e+05 5.34292446e+01 -1.13468327e+02 ..., 5.32900000e+03\n",
" 4.90800000e+03 1.12600000e+03]]\n"
]
}
],
"source": [
"#\n",
"# Min-max normalize each data attribute (column): (X - min) / (max - min)\n",
"#\n",
"min_X = np.amin(X,axis=0)\n",
"max_X = np.amax(X,axis=0)\n",
"print(X)\n",
"diff_X = max_X-min_X;\n",
"nrm_X = np.zeros((ndata,nvar))\n",
"for i in range(ndata):\n",
" nrm_X[i,:] = (X[i,:]-min_X) / diff_X"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#\n",
"# define parameters for cross-validation\n",
"#\n",
"nfold=5\n",
"\n",
"minsigma=-4\n",
"maxsigma=0\n",
"nsigma=5\n",
"\n",
"mincost=3\n",
"maxcost=8\n",
"ncost=5\n",
"\n",
"cvsigma=np.logspace(minsigma, maxsigma, nsigma)\n",
"cvcost=np.logspace(mincost,maxcost,ncost)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Working on: Fold= 0.0000, Sigma= 0.0001, Cost= 1000.0000\n",
" Working on: Fold= 0.0000, Sigma= 0.0001, Cost=17782.7941"
]
}
],
"source": [
"#\n",
"# 5-fold cross-validation to select the best sigma and cost parameters\n",
"#\n",
"i=0\n",
"kf = KFold(ndata, n_folds=nfold, shuffle=True)\n",
"cvape=np.zeros((nsigma,ncost,nfold))\n",
"for train, test in kf:\n",
" X_train=nrm_X[train]\n",
" y_train=y[train]\n",
" X_test=nrm_X[test]\n",
" y_test=y[test]\n",
" for j in range(nsigma):\n",
" for k in range(ncost):\n",
" print (\" Working on: Fold=%10.4f, Sigma=%10.4f, Cost=%10.4f\" % (i,cvsigma[j],cvcost[k]) )\n",
" svr_rbf=SVR(kernel='rbf', C=cvcost[k], gamma=cvsigma[j])\n",
" y_pred=svr_rbf.fit(X_train, y_train).predict(X_test)\n",
" cvape[i,j,k] = np.mean ( np.abs((y_test - y_pred)/y_test) )\n",
" i = i+1 \n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#\n",
"# Mean absolute percentage error of each (sigma, cost) pair, averaged over the folds\n",
"#\n",
"cvmape=np.mean(cvape,axis=0)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#\n",
"# Get the indices of the parameter combination with the minimum error\n",
"#\n",
"idxsigma, idxcost = np.unravel_index(cvmape.argmin(), cvmape.shape)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#\n",
"# Print out Results\n",
"#\n",
"optsigma=cvsigma[idxsigma];\n",
"optcost=cvcost[idxcost];\n",
"print(\"Best parameters: \\n Sigma = %10.4f\\n Cost = %10.4f\\n Relative Accuracy = %10.4f\" % (cvsigma[idxsigma],cvcost[idxcost],cvmape[idxsigma,idxcost]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [],
"source": [
"#\n",
"# Plot results for all combinations tested\n",
"#\n",
"matplotlib.rcParams.update({'font.size': 14})\n",
"fig, ax = plt.subplots(1,figsize=(9,9))\n",
"ax.imshow(cvmape,interpolation='nearest')\n",
"for i in range(nsigma):\n",
" for j in range(ncost):\n",
" ax.text(i,j,(\"%.4f\" % cvmape[j,i]), va='center', ha='center')\n",
" \n",
"fig.suptitle('Average Absolute Percentual Error', fontsize=20)\n",
"plt.xlabel('Cost')\n",
"plt.ylabel('Sigma')\n",
"\n",
"plt.xticks(range(ncost),np.char.mod('%.2e', cvcost))\n",
"plt.yticks(range(nsigma),cvsigma)\n",
"\n",
"fig.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#\n",
"# Fit and predict on the entire data set with the optimal parameters\n",
"#\n",
"svr_rbf=SVR(kernel='rbf', C=optcost, gamma=optsigma)\n",
"y_pred=svr_rbf.fit(nrm_X, y).predict(nrm_X)\n",
"trainmape = np.mean ( np.abs((y - y_pred)/y) )\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#\n",
"# Report Results\n",
"#\n",
"print(\"Cross-Validation Accuracy: %10.4f\\nTrain set Accuracy: %10.4f\" %(cvmape[idxsigma,idxcost],trainmape))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#\n",
"# Scatter plot of true vs. estimated values, with the identity line in red\n",
"#\n",
"fig, ax = plt.subplots(1,figsize=(9,9))\n",
"ax.scatter(y,y_pred)\n",
"ax.plot([0, np.max(y)], [0, np.max(y)], color=[1,0,0])\n",
"axy=ax.get_ylim()\n",
"axx=ax.get_xlim()\n",
"plt.xlim(0,np.max([axx[1],axy[1]]))\n",
"plt.ylim(0,np.max([axx[1],axy[1]]))\n",
"fig.suptitle('Results', fontsize=20)\n",
"plt.xlabel('True ($CAD)')\n",
"plt.ylabel('Estimated ($CAD)')\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"fig, ax = plt.subplots(1,figsize=(9,9))\n",
"ax.scatter(y,y_pred)\n",
"ax.plot([0, np.max(y)], [0, np.max(y)], color=[1,0,0])\n",
"axy=ax.get_ylim()\n",
"axx=ax.get_xlim()\n",
"plt.xlim(0,2000000)\n",
"plt.ylim(0,2000000)\n",
"fig.suptitle('Zoom in Results', fontsize=20)\n",
"plt.xlabel('True ($CAD)')\n",
"plt.ylabel('Estimated ($CAD)')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.0"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Loading