Merge pull request #6 from ipranjal/feature/decision_tree

Feature/decision tree
ipranjal · Oct 7, 2023 · 6af37cd · 6af37cd
2 parents 3b6e2f9 + ec08c35
commit 6af37cd
Show file tree

Hide file tree

Showing 3 changed files with 54 additions and 29 deletions.
diff --git a/digits.py b/digits.py
@@ -7,6 +7,7 @@
 # Import datasets, classifiers and performance metrics
 from utils import preprocess_data, tune_hparams, split_train_dev_test,read_digits,predict_and_eval
 from joblib import load
+import pandas as pd
 
 # The digits dataset consists of 8x8 pixel images of digits. The images attribute of the dataset stores 8x8 arrays of grayscale values for each image. We will use these arrays to visualize the first 4 images. The target attribute of the dataset stores the digit each image represents and this is included in the title of the 4 plots below.
 # Note: if we were working from image files (e.g., ‘png’ files), we would load them using matplotlib.pyplot.imread.
@@ -15,38 +16,51 @@
 
 x,y = read_digits()
 
-print("Total number of samples : ", len(x))
+#print("Total number of samples : ", len(x))
 
-print("(number of samples,length of image,height of image) is:",x.shape)
+#print("(number of samples,length of image,height of image) is:",x.shape)
 
-test_sizes = [0.1, 0.2, 0.3]
-dev_sizes = [0.1, 0.2, 0.3]
+# test_sizes = [0.1, 0.2, 0.3]
+# dev_sizes = [0.1, 0.2, 0.3]
 
-for test_size in test_sizes:
-    for dev_size in dev_sizes:
+test_sizes = [0.2]
+dev_sizes = [0.2]
+results = []
+
+for i in range(5):
+    for test_size in test_sizes:
+        for dev_size in dev_sizes:
         # 3. Data splitting
-        X_train, X_test,X_dev, y_train, y_test,y_dev = split_train_dev_test(x, y, test_size=test_size, dev_size=dev_size);
+            X_train, X_test,X_dev, y_train, y_test,y_dev = split_train_dev_test(x, y, test_size=test_size, dev_size=dev_size);
 
         # 4. Data Preprocessing
-        X_train = preprocess_data(X_train)
-        X_test = preprocess_data(X_test)
-        X_dev = preprocess_data(X_dev)
+            X_train = preprocess_data(X_train)
+            X_test = preprocess_data(X_test)
+            X_dev = preprocess_data(X_dev)
+
+            classifer_hparam = {}
+
+            gama_ranges = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
+            C_ranges = [0.1,1,2,5,10]
+            classifer_hparam['svm']= [{'gamma': gamma, 'C': C} for gamma in gama_ranges for C in C_ranges]
 
-        gama_ranges = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
-        C_ranges = [0.1,1,2,5,10]
-        list_of_all_param_combination = [{'gamma': gamma, 'C': C} for gamma in gama_ranges for C in C_ranges]
+            max_depth = [5,10,15,20,50,100]
+            classifer_hparam['tree'] = [{'max_depth': depth} for depth in max_depth]
+
+            models = ['svm','tree']
 
         # Predict the value of the digit on the test subset
         # 6.Predict and Evaluate 
-        best_hparams, best_model_path, best_accuracy = tune_hparams(X_train, y_train, X_dev, y_dev, list_of_all_param_combination)
-        best_model = load(best_model_path)
-
+            for model in models:
+                best_hparams, best_model_path, best_accuracy = tune_hparams(X_train, y_train, X_dev, y_dev, classifer_hparam[model], model_type=model)
+                best_model = load(best_model_path)
 
-        accuracy_test = predict_and_eval(best_model, X_test, y_test)
-        accuracy_dev = predict_and_eval(best_model, X_dev, y_dev)
-        accuracy_train = predict_and_eval(best_model, X_train, y_train)
-        print(f"test_size={test_size} dev_size={dev_size} train_size={1- (dev_size+test_size)} train_acc={accuracy_train} dev_acc={accuracy_dev} test_acc={accuracy_test}")
-        print(f"best_gamma={best_hparams['gamma']},best_C={best_hparams['C']}")
+                accuracy_test = predict_and_eval(best_model, X_test, y_test)
+                accuracy_dev = predict_and_eval(best_model, X_dev, y_dev)
+                accuracy_train = predict_and_eval(best_model, X_train, y_train)
+                print(f"model={model} run_index={i} test_size={test_size} dev_size={dev_size} train_size={1- (dev_size+test_size)} train_acc={accuracy_train} dev_acc={accuracy_dev} test_acc={accuracy_test}")
+                results.append([{'model':model,'run_index': i, 'test_size':test_size, 'dev_size':dev_size,'train_size': 1- (dev_size+test_size), 'train_acc':accuracy_train,'dev_acc':accuracy_dev,'test_acc':accuracy_test}])
+        #print(f"best_gamma={best_hparams['gamma']},best_C={best_hparams['C']}")
 
 
 

diff --git a/test_utils.py b/test_utils.py
@@ -32,7 +32,7 @@ def test_hparam_count():
 def test_mode_saving():
     X_train, y_train, X_dev, y_dev = create_dummy_data()
     list_of_all_param_combination = create_dummy_hyperparamete()
-    _, best_model_path, _ = tune_hparams(X_train, y_train, X_dev, y_dev, list_of_all_param_combination)
+    _, best_model_path, _ = tune_hparams(X_train, y_train, X_dev, y_dev, list_of_all_param_combination,'svm')
     assert os.path.exists(best_model_path)
 
 def test_data_splitting():

diff --git a/utils.py b/utils.py
@@ -2,7 +2,7 @@
 # Import datasets, classifiers and performance metrics
 from sklearn import svm,datasets
 from sklearn.model_selection import train_test_split
-from sklearn import metrics
+from sklearn import tree
 import matplotlib.pyplot as plt
 from sklearn.metrics import accuracy_score
 from joblib import dump,load
@@ -32,6 +32,8 @@ def split_data(X,y,test_size=0.5,random_state=1):
 def train_model(X, y, model_params,model_type = 'svm'):
     """Build the classifier named by ``model_type`` and fit it on ``X``, ``y``.

     Args:
         X: Training samples, passed straight to the classifier's ``fit``.
         y: Training targets, passed straight to the classifier's ``fit``.
         model_params: Keyword arguments forwarded to the classifier
             constructor (e.g. ``gamma``/``C`` for 'svm', ``max_depth``
             for 'tree').
         model_type: ``'svm'`` selects ``svm.SVC``; ``'tree'`` selects
             ``tree.DecisionTreeClassifier``.

     Returns:
         The fitted classifier instance.

     Raises:
         ValueError: If ``model_type`` is neither ``'svm'`` nor ``'tree'``.
     """
     if model_type == 'svm':
         clf = svm.SVC(**model_params)
     elif model_type == 'tree':
         clf = tree.DecisionTreeClassifier(**model_params)
     else:
         # Without this branch an unknown name left ``clf`` unbound and the
         # call below failed with an opaque UnboundLocalError.
         raise ValueError(
             f"Unsupported model_type {model_type!r}; expected 'svm' or 'tree'"
         )
     clf.fit(X, y)
     return clf
 
@@ -47,20 +49,29 @@ def predict_and_eval(model, X, y):
 
     return accuracy
 
-def tune_hparams(X_train, Y_train, X_dev, y_dev, list_of_all_param_combination):
+def tune_hparams(X_train, Y_train, X_dev, y_dev, list_of_all_param_combination, model_type='svm'):
     best_accuracy_so_far = -1
     best_model = None
     best_model_path = ""
 
     for param_combination in list_of_all_param_combination:
-        cur_model = train_model(X_train, Y_train, {'gamma': param_combination['gamma'],'C':param_combination['C']}, model_type='svm')
+        if model_type == 'svm':
+            cur_model = train_model(X_train, Y_train, {'gamma': param_combination['gamma'],'C':param_combination['C']}, model_type='svm')
+        if model_type == 'tree':
+            cur_model = train_model(X_train, Y_train, {'max_depth': param_combination['max_depth']}, model_type='tree')
+
         cur_accuracy = predict_and_eval(cur_model, X_dev, y_dev)
         if cur_accuracy > best_accuracy_so_far:
             best_accuracy_so_far = cur_accuracy
-            optimal_gamma = param_combination['gamma']
-            optimal_C = param_combination['C']
-            best_hparams = {'gamma': optimal_gamma,'C':optimal_C}
-            best_model_path = "./models/svm"+"_".join(["{}:{}".format(k,v) for k,v in best_hparams.items()])+".joblib"
+            if model_type == 'svm':
+                optimal_gamma = param_combination['gamma']
+                optimal_C = param_combination['C']
+                best_hparams = {'gamma': optimal_gamma,'C':optimal_C}
+            if model_type == 'tree':
+                optimal_max_depth = param_combination['max_depth']
+                best_hparams = {'max_depth': optimal_max_depth}
+            best_model_path = "./models/{}".format(model_type)+"_".join(["{}:{}".format(k,v) for k,v in best_hparams.items()])+".joblib"
+
             best_model = cur_model
 
     # save the best model