AIDeveloperCommunity · SiddharthaMishra · Mar 31, 2018
diff --git a/practise/KNN+Practise.ipynb b/practise/KNN+Practise.ipynb
@@ -24,14 +24,13 @@
     "ExecuteTime": {
      "end_time": "2017-03-09T17:11:00.462641Z",
      "start_time": "2017-03-09T12:11:00.457060-05:00"
-    },
-    "collapsed": true
+    }
    },
    "outputs": [],
    "source": [
     "from __future__ import print_function\n",
     "import os\n",
-    "data_path = ['data']"
+    "data_path = ['../data']"
    ]
   },
   {
@@ -53,8 +52,7 @@
     "ExecuteTime": {
      "end_time": "2017-03-09T17:11:00.913456Z",
      "start_time": "2017-03-09T12:11:00.883452-05:00"
-    },
-    "collapsed": true
+    }
    },
    "outputs": [],
    "source": [
@@ -86,8 +84,7 @@
     "ExecuteTime": {
      "end_time": "2017-03-09T17:11:01.564122Z",
      "start_time": "2017-03-09T12:11:01.557967-05:00"
-    },
-    "collapsed": true
+    }
    },
    "outputs": [],
    "source": [
@@ -109,6 +106,14 @@
     "data.columns"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Due to factors like differing income levels and competition, customers in some states/areas might be more prone than those in other areas to terminate their service under similar tariffs. So the state and area code can be useful information in predicting whether a customer will terminate the service or not.\n",
+    "The phone number will obviously not be a good feature to analyze, since it has no effect on whether a customer leaves and will only serve to pollute the data set.  "
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -126,17 +131,21 @@
     "ExecuteTime": {
      "end_time": "2017-03-09T17:11:04.545751Z",
      "start_time": "2017-03-09T12:11:04.509105-05:00"
-    },
-    "collapsed": true
+    }
    },
    "outputs": [],
    "source": [
     "from sklearn.preprocessing import LabelBinarizer\n",
     "\n",
     "lb = LabelBinarizer()\n",
-    "\n",
     "for col in ['intl_plan', 'voice_mail_plan', 'churned']:\n",
-    "    data[col] = lb.fit_transform(data[col])"
+    "    print(data.head(2)[col])\n",
+    "print()\n",
+    "for col in ['intl_plan', 'voice_mail_plan', 'churned']:\n",
+    "    data[col] = lb.fit_transform(data[col])\n",
+    "print()\n",
+    "for col in ['intl_plan', 'voice_mail_plan', 'churned']:\n",
+    "    print(data.head(2)[col])"
    ]
   },
   {
@@ -146,8 +155,7 @@
     "ExecuteTime": {
      "end_time": "2017-03-09T17:11:04.736451Z",
      "start_time": "2017-03-09T12:11:04.718049-05:00"
-    },
-    "collapsed": true
+    }
    },
    "outputs": [],
    "source": [
@@ -160,7 +168,8 @@
     "msc = MinMaxScaler()\n",
     "\n",
     "data = pd.DataFrame(msc.fit_transform(data),  # this is an np.array, not a dataframe.\n",
-    "                    columns=data.columns)"
+    "                    columns=data.columns)\n",
+    "print(data.head(2))"
    ]
   },
   {
@@ -180,8 +189,7 @@
     "ExecuteTime": {
      "end_time": "2017-03-09T17:11:50.280188Z",
      "start_time": "2017-03-09T12:11:50.269326-05:00"
-    },
-    "collapsed": true
+    }
    },
    "outputs": [],
    "source": [
@@ -191,7 +199,6 @@
     "# Split the data into two dataframes\n",
     "X_data = data[x_cols]\n",
     "y_data = data['churned']\n",
-    "\n",
     "# # alternatively:\n",
     "# X_data = data.copy()\n",
     "# y_data = X_data.pop('churned')"
@@ -204,8 +211,7 @@
     "ExecuteTime": {
      "end_time": "2017-03-09T17:11:50.989446Z",
      "start_time": "2017-03-09T12:11:50.498708-05:00"
-    },
-    "collapsed": true
+    }
    },
    "outputs": [],
    "source": [
@@ -237,15 +243,14 @@
     "ExecuteTime": {
      "end_time": "2017-03-09T17:11:50.997204Z",
      "start_time": "2017-03-09T12:11:50.991392-05:00"
-    },
-    "collapsed": true
+    }
    },
    "outputs": [],
    "source": [
     "# Function to calculate the % of values that were correctly predicted\n",
     "\n",
     "def accuracy(real, predict):\n",
-    "    return sum(y_data == y_pred) / float(real.shape[0])"
+    "    return sum(real == predict) / float(real.shape[0])"
    ]
   },
   {
@@ -281,12 +286,17 @@
     "ExecuteTime": {
      "end_time": "2017-03-09T17:11:52.047123Z",
      "start_time": "2017-03-09T12:11:51.538212-05:00"
-    },
-    "collapsed": true
+    }
    },
    "outputs": [],
    "source": [
-    "#Student writes code here"
+    "#Student writes code here\n",
+    "knn = KNeighborsClassifier(n_neighbors=3, weights='distance')\n",
+    "\n",
+    "knn = knn.fit(X_data, y_data)\n",
+    "\n",
+    "y_pred = knn.predict(X_data)\n",
+    "print(accuracy(y_data, y_pred))"
    ]
   },
   {
@@ -296,11 +306,18 @@
     "ExecuteTime": {
      "end_time": "2017-03-09T17:11:52.755941Z",
      "start_time": "2017-03-09T12:11:52.049816-05:00"
-    },
-    "collapsed": true
+    }
    },
    "outputs": [],
-   "source": []
+   "source": [
+    "knn = KNeighborsClassifier(n_neighbors=3, weights='distance', p=1)\n",
+    "\n",
+    "knn = knn.fit(X_data, y_data)\n",
+    "\n",
+    "y_pred = knn.predict(X_data)\n",
+    "\n",
+    "print(accuracy(y_data, y_pred))"
+   ]
   },
   {
    "cell_type": "markdown",
@@ -319,12 +336,17 @@
     "ExecuteTime": {
      "end_time": "2017-03-09T17:12:01.329053Z",
      "start_time": "2017-03-09T12:11:52.759302-05:00"
-    },
-    "collapsed": true
+    }
    },
    "outputs": [],
    "source": [
-    "#Student writes code here"
+    "#Student writes code here\n",
+    "ksAccuracy = {} #to store the accuracy for each k\n",
+    "for k in range(1, 21):\n",
+    "    knn = KNeighborsClassifier(n_neighbors=k)\n",
+    "    knn = knn.fit(X_data, y_data)\n",
+    "    y_pred = knn.predict(X_data)\n",
+    "    ksAccuracy[k] = accuracy(y_data, y_pred) "
    ]
   },
   {
@@ -334,11 +356,16 @@
     "ExecuteTime": {
      "end_time": "2017-03-09T17:12:01.829160Z",
      "start_time": "2017-03-09T12:12:01.331021-05:00"
-    },
-    "collapsed": true
+    }
    },
    "outputs": [],
-   "source": []
+   "source": [
+    "#To view the table\n",
+    "print(\"{:<8} {:<15}\".format(\"k\", \"accuracy\"))\n",
+    "print()\n",
+    "for k, acc in ksAccuracy.items():\n",
+    "    print(\"{:<8} {:<15}\".format(k, acc))"
+   ]
   },
   {
    "cell_type": "code",
@@ -347,28 +374,36 @@
     "ExecuteTime": {
      "end_time": "2017-03-09T17:12:02.238935Z",
      "start_time": "2017-03-09T12:12:01.831094-05:00"
-    },
-    "collapsed": true
+    }
    },
    "outputs": [],
-   "source": []
+   "source": [
+    "#Requires matplotlib\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "plt.plot(ksAccuracy.keys(), ksAccuracy.values(), 'ro')\n",
+    "plt.xticks(range(1,21))\n",
+    "plt.xlabel(\"Ks\")\n",
+    "plt.ylabel(\"Accuracy\")\n",
+    "plt.show()"
+   ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
+   "cell_type": "markdown",
    "metadata": {
     "collapsed": true
    },
-   "outputs": [],
-   "source": []
+   "source": [
+    "Since the \"training set\" is the same as the test set, at K=1 the nearest neighbour of every point is that point itself, so each prediction matches the point's own label. Therefore the accuracy is 100%."
+   ]
   }
  ],
  "metadata": {
   "anaconda-cloud": {},
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 2",
    "language": "python",
-   "name": "python3"
+   "name": "python2"
   },
   "language_info": {
    "codemirror_mode": {
@@ -380,7 +415,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.5.3"
+   "version": "3.5.2"
   },
   "name": "Linear_Regression_and_K_Nearest_Neighbors_Exercises-ANSWERS",
   "notebookId": 2125319687183902