Skip to content

Commit

Permalink
Merge pull request #6 from ipranjal/feature/decision_tree
Browse files Browse the repository at this point in the history
Feature/decision tree
  • Loading branch information
ipranjal authored Oct 7, 2023
2 parents 3b6e2f9 + ec08c35 commit 6af37cd
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 29 deletions.
56 changes: 35 additions & 21 deletions digits.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# Import datasets, classifiers and performance metrics
from utils import preprocess_data, tune_hparams, split_train_dev_test,read_digits,predict_and_eval
from joblib import load
import pandas as pd

# The digits dataset consists of 8x8 pixel images of digits. The images attribute of the dataset stores 8x8 arrays of grayscale values for each image. We will use these arrays to visualize the first 4 images. The target attribute of the dataset stores the digit each image represents and this is included in the title of the 4 plots below.
# Note: if we were working from image files (e.g., ‘png’ files), we would load them using matplotlib.pyplot.imread.
Expand All @@ -15,38 +16,51 @@

x, y = read_digits()

# Only one split configuration is evaluated; other fractions
# (e.g. [0.1, 0.2, 0.3]) can be listed here to widen the experiment.
test_sizes = [0.2]
dev_sizes = [0.2]
results = []

# The hyperparameter grids do not depend on the data split, so they are
# built once here instead of on every loop iteration.
gama_ranges = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
C_ranges = [0.1, 1, 2, 5, 10]
max_depth = [5, 10, 15, 20, 50, 100]
classifer_hparam = {
    'svm': [{'gamma': gamma, 'C': C} for gamma in gama_ranges for C in C_ranges],
    'tree': [{'max_depth': depth} for depth in max_depth],
}
models = ['svm', 'tree']

# Repeat the whole experiment five times; `i` is reported as run_index.
for i in range(5):
    for test_size in test_sizes:
        for dev_size in dev_sizes:
            train_size = 1 - (dev_size + test_size)

            # 3. Data splitting
            X_train, X_test, X_dev, y_train, y_test, y_dev = split_train_dev_test(
                x, y, test_size=test_size, dev_size=dev_size
            )

            # 4. Data preprocessing
            X_train = preprocess_data(X_train)
            X_test = preprocess_data(X_test)
            X_dev = preprocess_data(X_dev)

            # 6. Tune each model type on the dev set, then evaluate the best
            # saved model on every split.
            for model in models:
                best_hparams, best_model_path, best_accuracy = tune_hparams(
                    X_train, y_train, X_dev, y_dev,
                    classifer_hparam[model], model_type=model,
                )
                best_model = load(best_model_path)

                accuracy_test = predict_and_eval(best_model, X_test, y_test)
                accuracy_dev = predict_and_eval(best_model, X_dev, y_dev)
                accuracy_train = predict_and_eval(best_model, X_train, y_train)
                print(f"model={model} run_index={i} test_size={test_size} dev_size={dev_size} train_size={train_size} train_acc={accuracy_train} dev_acc={accuracy_dev} test_acc={accuracy_test}")
                # NOTE(review): each record is wrapped in a one-element list;
                # kept as-is because the consumer of `results` is not visible
                # here -- confirm that this nesting is intended.
                results.append([{'model': model, 'run_index': i, 'test_size': test_size, 'dev_size': dev_size, 'train_size': train_size, 'train_acc': accuracy_train, 'dev_acc': accuracy_dev, 'test_acc': accuracy_test}])



Expand Down
2 changes: 1 addition & 1 deletion test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def test_hparam_count():
def test_mode_saving():
    """Check that tuning writes the best model to the path it returns."""
    X_train, y_train, X_dev, y_dev = create_dummy_data()
    list_of_all_param_combination = create_dummy_hyperparamete()
    # Tune exactly once; the model type is passed by keyword so the call
    # stays valid if further optional parameters are added later.
    _, best_model_path, _ = tune_hparams(
        X_train, y_train, X_dev, y_dev,
        list_of_all_param_combination, model_type='svm',
    )
    assert os.path.exists(best_model_path)

def test_data_splitting():
Expand Down
25 changes: 18 additions & 7 deletions utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# Import datasets, classifiers and performance metrics
from sklearn import svm,datasets
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import tree
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from joblib import dump,load
Expand Down Expand Up @@ -32,6 +32,8 @@ def split_data(X,y,test_size=0.5,random_state=1):
def train_model(X, y, model_params, model_type='svm'):
    """Build the requested classifier, fit it on (X, y) and return it.

    Args:
        X: Training samples, passed unchanged to ``fit``.
        y: Training targets, passed unchanged to ``fit``.
        model_params: Keyword arguments forwarded to the estimator constructor.
        model_type: ``'svm'`` for ``svm.SVC`` or ``'tree'`` for
            ``tree.DecisionTreeClassifier``.

    Returns:
        The fitted estimator.

    Raises:
        ValueError: If ``model_type`` is not a supported name.
    """
    if model_type == 'svm':
        clf = svm.SVC(**model_params)
    elif model_type == 'tree':
        clf = tree.DecisionTreeClassifier(**model_params)
    else:
        # An unknown type used to fall through and fail later on the
        # unbound `clf`; fail early with an explicit message instead.
        raise ValueError(
            "Unsupported model_type {!r}; expected 'svm' or 'tree'".format(model_type)
        )
    clf.fit(X, y)
    return clf

Expand All @@ -47,20 +49,29 @@ def predict_and_eval(model, X, y):

return accuracy

def tune_hparams(X_train, Y_train, X_dev, y_dev, list_of_all_param_combination):
def tune_hparams(X_train, Y_train, X_dev, y_dev, list_of_all_param_combination, model_type='svm'):
best_accuracy_so_far = -1
best_model = None
best_model_path = ""

for param_combination in list_of_all_param_combination:
cur_model = train_model(X_train, Y_train, {'gamma': param_combination['gamma'],'C':param_combination['C']}, model_type='svm')
if model_type == 'svm':
cur_model = train_model(X_train, Y_train, {'gamma': param_combination['gamma'],'C':param_combination['C']}, model_type='svm')
if model_type == 'tree':
cur_model = train_model(X_train, Y_train, {'max_depth': param_combination['max_depth']}, model_type='tree')

cur_accuracy = predict_and_eval(cur_model, X_dev, y_dev)
if cur_accuracy > best_accuracy_so_far:
best_accuracy_so_far = cur_accuracy
optimal_gamma = param_combination['gamma']
optimal_C = param_combination['C']
best_hparams = {'gamma': optimal_gamma,'C':optimal_C}
best_model_path = "./models/svm"+"_".join(["{}:{}".format(k,v) for k,v in best_hparams.items()])+".joblib"
if model_type == 'svm':
optimal_gamma = param_combination['gamma']
optimal_C = param_combination['C']
best_hparams = {'gamma': optimal_gamma,'C':optimal_C}
if model_type == 'tree':
optimal_max_depth = param_combination['max_depth']
best_hparams = {'max_depth': optimal_max_depth}
best_model_path = "./models/{}".format(model_type)+"_".join(["{}:{}".format(k,v) for k,v in best_hparams.items()])+".joblib"

best_model = cur_model

# save the best model
Expand Down

0 comments on commit 6af37cd

Please sign in to comment.