-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrain.py
165 lines (132 loc) · 5.24 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn import metrics
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import ConfusionMatrixDisplay, classification_report
from sklearn.impute import SimpleImputer
import joblib
import os
# Silence every warning category for the whole process. Note that this also
# hides deprecation notices raised by the imported libraries.
import warnings
warnings.filterwarnings('ignore')
# Output directory for the saved models, preprocessing artifacts and plots.
# This does not change the current working directory; it only makes sure the
# Kaggle output folder exists before anything is written into it.
working_dir = '/kaggle/working'
os.makedirs(working_dir, exist_ok=True)
def _age_group(age):
    """Map a numeric age to a coarse life-stage label."""
    if age < 4:
        return 'Toddler'
    if age < 12:
        return 'Kid'
    if age < 18:
        return 'Teenager'
    if age < 40:
        return 'Young'
    # A missing (NaN) age also ends up here, because every comparison above
    # evaluates to False for NaN.
    return 'Senior'


def _add_features(data):
    """Add the questionnaire total and the combined yes/no indicator in place."""
    # Total of the ten screening answers A1_Score .. A10_Score.
    data['sum_score'] = data.loc[:, 'A1_Score':'A10_Score'].sum(axis=1)
    # Sum of the three yes/no flags (already mapped to 1/0 by the caller).
    data['ind'] = data['austim'] + data['used_app_before'] + data['jaundice']
    return data


def _encode_labels(data):
    """Replace every text column with integer codes from a fresh LabelEncoder."""
    for col in data.columns:
        column = data[col]
        # Besides plain object columns, also match pandas' dedicated string
        # dtype, which compares unequal to 'object' and would otherwise be
        # left unencoded.
        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            data[col] = LabelEncoder().fit_transform(column)
    return data


def load_and_preprocess_data(file_path):
    """Read the CSV at ``file_path`` and return a fully numeric feature frame.

    Steps, in order:
      1. Map 'yes'/'no' to 1/0 and fold '?' and 'others' into 'Others'.
      2. Derive ``ageGroup`` from the raw ``age`` column.
      3. Add ``sum_score`` (sum of A1_Score..A10_Score) and ``ind``
         (austim + used_app_before + jaundice).
      4. Replace ``age`` with its natural logarithm.
      5. Label-encode every remaining text column.

    Args:
        file_path: Path (or anything ``pandas.read_csv`` accepts) of the CSV.
            The file must contain the columns ``age``, ``A1_Score`` through
            ``A10_Score``, ``austim``, ``used_app_before`` and ``jaundice``.

    Returns:
        The preprocessed ``pandas.DataFrame``.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    df = pd.read_csv(file_path)

    df = df.replace({'yes': 1, 'no': 0, '?': 'Others', 'others': 'Others'})
    # Older pandas releases silently downcast the replaced yes/no columns to a
    # numeric dtype; that implicit downcast is deprecated. Doing it explicitly
    # keeps those columns numeric on every version (and is a no-op where the
    # downcast already happened).
    df = df.infer_objects()

    # The age bucket must be computed before `age` is log-transformed below.
    df['ageGroup'] = df['age'].apply(_age_group)
    df = _add_features(df)

    # NOTE(review): ages <= 0 would produce -inf/NaN here; confirm the data
    # never contains them.
    df['age'] = np.log(df['age'])

    return _encode_labels(df)
def prepare_data(df):
    """Turn the preprocessed frame into train/validation matrices.

    The label column 'Class/ASD' is separated from the inputs, the columns
    'ID', 'age_desc', 'used_app_before' and 'austim' are discarded, and the
    rows are split 80/20 with a fixed seed. Missing values are filled with
    training-set column means, the minority class of the training part is
    randomly oversampled, and finally both parts are standardised with a
    scaler fitted on the oversampled training data.

    Args:
        df: DataFrame as produced by ``load_and_preprocess_data``.

    Returns:
        A 6-tuple ``(X_train, Y_train, X_val, Y_val, scaler, columns)`` where
        the training pair is imputed, oversampled and scaled, the validation
        inputs are imputed and scaled, ``scaler`` is the fitted
        ``StandardScaler`` and ``columns`` is the column index of the
        training inputs before imputation.
    """
    # Everything that must not reach the model, label included.
    excluded = ['ID', 'age_desc', 'used_app_before', 'austim', 'Class/ASD']
    inputs = df.drop(columns=excluded)
    labels = df['Class/ASD']

    # Hold out 20% of the rows for validation (seeded for repeatability).
    train_inputs, val_inputs, train_labels, val_labels = train_test_split(
        inputs,
        labels,
        test_size=0.2,
        random_state=10,
    )

    # Means are learned on the training rows only and reused for validation.
    mean_imputer = SimpleImputer(strategy='mean')
    train_filled = mean_imputer.fit_transform(train_inputs)
    val_filled = mean_imputer.transform(val_inputs)

    # Rebalance the training rows; the validation rows are left untouched.
    balancer = RandomOverSampler(sampling_strategy='minority', random_state=0)
    train_balanced, labels_balanced = balancer.fit_resample(train_filled, train_labels)

    # Standardise using statistics of the rebalanced training data.
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(train_balanced)
    val_scaled = scaler.transform(val_filled)

    return (
        train_scaled,
        labels_balanced,
        val_scaled,
        val_labels,
        scaler,
        train_inputs.columns,
    )
def train_and_save_models(X, Y, X_val, Y_val):
    """Fit each candidate classifier, evaluate it and persist it to disk.

    Three models are trained on ``(X, Y)``: logistic regression, XGBoost and
    an RBF-kernel SVM. Each fitted model is written to
    ``<working_dir>/<name>_model.joblib``.

    Args:
        X: Training feature matrix.
        Y: Training labels (binary).
        X_val: Validation feature matrix.
        Y_val: Validation labels (binary).

    Returns:
        A dict keyed by model name. Each value is a dict with the keys
        ``train_auc``, ``val_auc``, ``train_report``, ``val_report`` and
        ``model`` (the fitted estimator).
    """
    models = {
        'logistic_regression': LogisticRegression(max_iter=1000),
        'xgboost': XGBClassifier(),
        'svm': SVC(kernel='rbf', probability=True),
    }

    results = {}
    for name, model in models.items():
        model.fit(X, Y)

        # Hard class labels are what the classification reports need.
        train_pred = model.predict(X)
        val_pred = model.predict(X_val)

        # ROC AUC measures ranking quality, so it must be fed continuous
        # scores. Passing thresholded labels collapses the curve to a single
        # operating point and misreports the metric. Column 1 holds the
        # probability of the positive class.
        train_score = model.predict_proba(X)[:, 1]
        val_score = model.predict_proba(X_val)[:, 1]

        results[name] = {
            'train_auc': metrics.roc_auc_score(Y, train_score),
            'val_auc': metrics.roc_auc_score(Y_val, val_score),
            'train_report': classification_report(Y, train_pred),
            'val_report': classification_report(Y_val, val_pred),
            'model': model,
        }

        joblib.dump(model, os.path.join(working_dir, f'{name}_model.joblib'))

    return results
def main():
    """Run the full training pipeline and write all artifacts to ``working_dir``.

    Loads and preprocesses the training CSV, prepares the train/validation
    matrices, saves the fitted scaler and the list of feature columns, trains
    and saves the models, prints their metrics, and stores a confusion-matrix
    plot for the logistic-regression model.
    """
    train_path = '/kaggle/input/autismprediction/train.csv'
    df = load_and_preprocess_data(train_path)

    X, Y, X_val, Y_val, scaler, feature_columns = prepare_data(df)

    # Persist the scaler and the feature-column order alongside the models.
    joblib.dump(scaler, os.path.join(working_dir, 'feature_scaler.joblib'))
    joblib.dump(feature_columns.tolist(), os.path.join(working_dir, 'feature_columns.joblib'))

    results = train_and_save_models(X, Y, X_val, Y_val)

    for model_name, model_results in results.items():
        print(f"\n{model_name.upper()} Model Results:")
        print(f"Training AUC: {model_results['train_auc']}")
        print(f"Validation AUC: {model_results['val_auc']}")
        print("\nTraining Classification Report:")
        print(model_results['train_report'])
        print("\nValidation Classification Report:")
        print(model_results['val_report'])

    # Draw on an explicitly created axes. Without `ax`, from_estimator opens
    # its own figure, so a separately created sized figure would stay empty
    # and never be closed.
    fig, ax = plt.subplots(figsize=(8, 6))
    ConfusionMatrixDisplay.from_estimator(
        results['logistic_regression']['model'], X_val, Y_val, ax=ax
    )
    ax.set_title('Confusion Matrix - Logistic Regression')
    fig.tight_layout()
    fig.savefig(os.path.join(working_dir, 'confusion_matrix.png'))
    plt.close(fig)


if __name__ == '__main__':
    main()