-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrain_script.py
106 lines (76 loc) · 3.67 KB
/
train_script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.compose import ColumnTransformer
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import joblib
"""### Reading the database"""
file_path = r"D:\Sicherung\pythonProject\venv\Lib\data2.csv"
file = pd.read_csv(file_path, sep=',', encoding='utf-8')
df = pd.DataFrame(file)
df = df.fillna('-')
df = df[~df.iloc[:, 0].str.startswith('** Entfallen **')]
df.reset_index(drop=True, inplace=True)
"""### Feature processing"""
inputs = df.iloc[:, :-1]
target = pd.DataFrame(df.iloc[:, -1])
numerical_columns = inputs.select_dtypes(include=['int', 'float']).columns.tolist()
all_columns = list(inputs.columns)
if numerical_columns is not None:
text_columns = [col for col in all_columns if col not in numerical_columns]
text_transformers = [('text' + str(i), CountVectorizer(), column) for i, column in enumerate(text_columns, start=1)]
numerical_transformers = [('numerical' + str(i), MinMaxScaler(), [column]) for i, column in enumerate(numerical_columns, start=1)]
transformers = text_transformers + numerical_transformers
else:
text_columns = all_columns
transformers = [('text' + str(i), CountVectorizer(), column) for i, column in enumerate(text_columns, start=1)]
preprocessor = ColumnTransformer(transformers=transformers)
"""### Train test split"""
x_train, x_test, y_train, y_test = train_test_split(inputs, target,
test_size=0.2,
random_state=42)
"""### Tokenizing the Input Room Names"""
# The 'fit_transform()' method is to be applied *only* on the training data!
x_train_transf = preprocessor.fit_transform(x_train)
x_test_transf = preprocessor.transform(x_test)
# Check how the 'x_train_transf' matrix looks like.
print(x_train_transf.shape)
print(x_test_transf.shape)
print(x_test_transf)
"""### Performing the classification"""
# Create an instance of the Multinomial Naive Bayes classifier
clf = MultinomialNB()
# Fit the model to the training data
clf.fit(x_train_transf, y_train)
"""### Performing the evaluation on the test dataset"""
# Predict the target of the observations in the test set
y_test_pred = clf.predict(x_test_transf)
y_test_pred_prob = clf.predict_proba(x_test_transf)
# The classification report consists of the precision, recall and f1-score of each class as well as
# the overall accuracy of the model.
print(classification_report(y_test, y_test_pred))
# Export the heatmap
def save_heatmap(y_true, y_pred, file_name, figsize=(10, 7)):
conf_matrix = confusion_matrix(y_true, y_pred)
plt.figure(figsize=figsize)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.title("Confusion Matrix Heatmap")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.savefig(file_name)
print(f"Heatmap saved as {file_name}")
plt.close()
y_pred = clf.predict(x_test_transf)
save_heatmap(y_test, y_pred, 'heatmap.png')
# Export the model
def save_model(model, preprocessor, model_file_name, preprocessor_file_name):
joblib.dump(model, model_file_name)
joblib.dump(preprocessor, preprocessor_file_name)
print(f"Model saved as {model_file_name}")
print(f"Preprocessor saved as {preprocessor_file_name}")
save_model(clf, preprocessor, 'trained_classifier.joblib','preprocessor.joblib')