Skip to content

Mercye assignment 9 #339

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: gh-pages
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
290 changes: 290 additions & 0 deletions class9/homework/Emelike_Mercy_9_1.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,290 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Use the pseudocode you came up with in class to write your own 5-fold cross-validation function that splits the data set into 5 equal-sized sets\n",
"- Don't forget to shuffle the input before assigning to sets\n",
"- You can use the fit(), predict(), and score() functions of your model in your functions\n",
"- Test the results with the sklearn cross_val_score\n",
"\n",
"In your PR, discuss what challenges you had creating this function and if it helped you better understand cross validation"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# All imports up front so the notebook survives Restart Kernel -> Run All.\n",
"# (Previously `datasets`, `tree` and `np` were only imported inside cv(),\n",
"# which does not put them in the notebook's global namespace.)\n",
"import numpy as np\n",
"from sklearn import datasets, tree\n",
"\n",
"iris = datasets.load_iris()\n",
"x = iris.data[:, 2:]  # petal length and petal width attributes\n",
"y = iris.target       # class labels (0, 1, 2)\n",
"dt = tree.DecisionTreeClassifier()\n",
"dt = dt.fit(x, y)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def cv(model, attributes, target, folds):\n",
"    \"\"\"Estimate model accuracy with k-fold cross-validation.\n",
"\n",
"    The data is shuffled ONCE with a fixed seed, then split into `folds`\n",
"    disjoint test folds.  For each fold the model is fit on the remaining\n",
"    rows and scored on the held-out fold.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    model : estimator exposing fit() and score() (e.g. a sklearn classifier)\n",
"    attributes : array-like feature matrix, one row per sample\n",
"    target : array-like labels, same length as `attributes`\n",
"    folds : int, number of folds (k)\n",
"\n",
"    Returns\n",
"    -------\n",
"    float\n",
"        Mean held-out accuracy over the k folds.\n",
"    \"\"\"\n",
"    import numpy as np\n",
"    from sklearn.utils import shuffle\n",
"\n",
"    # Shuffle once, with a fixed seed, so folds are random, DISJOINT and\n",
"    # reproducible.  (Reshuffling inside the loop lets the same rows land\n",
"    # in several test sets, which is not cross-validation; and\n",
"    # random.seed() returns None, making the old shuffle nondeterministic.)\n",
"    attributes_s, target_s = shuffle(np.asarray(attributes),\n",
"                                     np.asarray(target),\n",
"                                     random_state=0)\n",
"\n",
"    n_rows = len(attributes_s)\n",
"    fold_size = n_rows // folds\n",
"    scores = []\n",
"\n",
"    for i in range(folds):\n",
"        # Rows [start, stop) are the held-out test fold; the last fold\n",
"        # absorbs the remainder when n_rows is not divisible by folds.\n",
"        start = i * fold_size\n",
"        stop = n_rows if i == folds - 1 else (i + 1) * fold_size\n",
"\n",
"        test_attr = attributes_s[start:stop]\n",
"        test_targ = target_s[start:stop]\n",
"        # Train on everything outside the test fold -- all data gets used.\n",
"        train_attr = np.concatenate((attributes_s[:start], attributes_s[stop:]))\n",
"        train_targ = np.concatenate((target_s[:start], target_s[stop:]))\n",
"\n",
"        fitted = model.fit(train_attr, train_targ)\n",
"        scores.append(fitted.score(test_attr, test_targ))\n",
"\n",
"    return float(np.mean(scores))"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.96666666666666679"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Run our 5-fold cross-validation on the decision tree.\n",
"accuracy = cv(dt, x, y, 5)\n",
"accuracy"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Testing with cross_val_score"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;\n",
"# cross_val_score now lives in sklearn.model_selection.\n",
"from sklearn.model_selection import cross_val_score"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Per-fold accuracies from sklearn's own 5-fold cross-validation.\n",
"scores = cross_val_score(dt, x, y, cv=5)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.96000000000000019"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Use the ndarray's own .mean() -- `np` is not imported at notebook scope\n",
"# on a fresh kernel (the old np.mean call relied on leaked session state).\n",
"scores.mean()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.1"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Loading