Skip to content

Mercye assignment 9 #339

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: gh-pages
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
290 changes: 290 additions & 0 deletions class9/homework/Emelike_Mercy_9_1.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,290 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Use the pseudocode you came up with in class to write your own 5-fold cross-validation function that splits the data set into 5 equal-sized sets\n",
"- Don't forget to shuffle the input before assigning to sets\n",
"- You can use the fit(), predict(), and score() functions of your model in your functions\n",
"- Test the results with the sklearn cross_val_score\n",
"\n",
"In your PR, discuss what challenges you had creating this function and if it helped you better understand cross validation"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# All imports up front so the notebook survives Restart Kernel -> Run All.\n",
"# (Previously `datasets`, `tree` and `np` were only imported inside cv(),\n",
"# which does not put them in the notebook's global namespace.)\n",
"import numpy as np\n",
"from sklearn import datasets, tree\n",
"\n",
"iris = datasets.load_iris()\n",
"x = iris.data[:, 2:]  # petal length and petal width attributes\n",
"y = iris.target       # class labels (0, 1, 2)\n",
"dt = tree.DecisionTreeClassifier()\n",
"dt = dt.fit(x, y)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def cv(model, attributes, target, folds):\n",
"    \"\"\"Estimate model accuracy with k-fold cross-validation.\n",
"\n",
"    The data is shuffled ONCE with a fixed seed, then split into `folds`\n",
"    disjoint test folds.  For each fold the model is fit on the remaining\n",
"    rows and scored on the held-out fold.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    model : estimator exposing fit() and score() (e.g. a sklearn classifier)\n",
"    attributes : array-like feature matrix, one row per sample\n",
"    target : array-like labels, same length as `attributes`\n",
"    folds : int, number of folds (k)\n",
"\n",
"    Returns\n",
"    -------\n",
"    float\n",
"        Mean held-out accuracy over the k folds.\n",
"    \"\"\"\n",
"    import numpy as np\n",
"    from sklearn.utils import shuffle\n",
"\n",
"    # Shuffle once, with a fixed seed, so folds are random, DISJOINT and\n",
"    # reproducible.  (Reshuffling inside the loop lets the same rows land\n",
"    # in several test sets, which is not cross-validation; and\n",
"    # random.seed() returns None, making the old shuffle nondeterministic.)\n",
"    attributes_s, target_s = shuffle(np.asarray(attributes),\n",
"                                     np.asarray(target),\n",
"                                     random_state=0)\n",
"\n",
"    n_rows = len(attributes_s)\n",
"    fold_size = n_rows // folds\n",
"    scores = []\n",
"\n",
"    for i in range(folds):\n",
"        # Rows [start, stop) are the held-out test fold; the last fold\n",
"        # absorbs the remainder when n_rows is not divisible by folds.\n",
"        start = i * fold_size\n",
"        stop = n_rows if i == folds - 1 else (i + 1) * fold_size\n",
"\n",
"        test_attr = attributes_s[start:stop]\n",
"        test_targ = target_s[start:stop]\n",
"        # Train on everything outside the test fold -- all data gets used.\n",
"        train_attr = np.concatenate((attributes_s[:start], attributes_s[stop:]))\n",
"        train_targ = np.concatenate((target_s[:start], target_s[stop:]))\n",
"\n",
"        fitted = model.fit(train_attr, train_targ)\n",
"        scores.append(fitted.score(test_attr, test_targ))\n",
"\n",
"    return float(np.mean(scores))"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.96666666666666679"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Run our 5-fold cross-validation on the decision tree.\n",
"accuracy = cv(dt, x, y, 5)\n",
"accuracy"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Testing with cross_val_score"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;\n",
"# cross_val_score now lives in sklearn.model_selection.\n",
"from sklearn.model_selection import cross_val_score"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Per-fold accuracies from sklearn's own 5-fold cross-validation.\n",
"scores = cross_val_score(dt, x, y, cv=5)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.96000000000000019"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Use the ndarray's own .mean() -- `np` is not imported at notebook scope\n",
"# on a fresh kernel (the old np.mean call relied on leaked session state).\n",
"scores.mean()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Loading