# modelo.py
import pandas as pd
import numpy as np
import os
import datetime
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from joblib import dump, load

# Load the dataset from a CSV file
def load_data(file_path):
    """
    Load the data from a CSV file, with columns 'url' and 'phishing'.
    """
    data = pd.read_csv(file_path)
    return data

# Preprocess the data (vectorize the URLs using TF-IDF)
def preprocess_data(data):
    """
    Preprocess the data by splitting it into training and testing sets, and vectorizing the URLs using the TF-IDF method.
    """
    X = data['url']
    y = data['phishing']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    vectorizer = TfidfVectorizer(ngram_range=(1, 2))
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)
    return X_train, X_test, y_train, y_test, vectorizer
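
# The original file sketched an alternative that embeds each URL with pre-trained
# FastText vectors instead of TF-IDF, but it referenced an undefined tokenizer and
# its result was discarded before use. Below is a self-contained, hedged version of
# that idea. It is illustrative only: the helper name and the regex tokenizer are
# assumptions, it expects the pre-trained binaries 'cc.en.300.bin' and
# 'cc.es.300.bin' to be available locally, and it is not called anywhere in this script.
def embed_urls_fasttext(urls):
    """
    Return a (n_urls, 300) array of averaged FastText word vectors, one row per URL.
    """
    import re
    import fasttext  # requires the `fasttext` package and the pre-trained .bin files
    ft_eng = fasttext.load_model('cc.en.300.bin')
    ft_esp = fasttext.load_model('cc.es.300.bin')
    eng_vocab = set(ft_eng.get_words())
    esp_vocab = set(ft_esp.get_words())

    def tokenize(url):
        # Split the URL into lowercase alphanumeric tokens.
        return re.findall(r'[a-z0-9]+', url.lower())

    vectors = []
    for url in urls:
        token_vecs = []
        for token in tokenize(url):
            # Prefer the English vectors, fall back to Spanish, skip unknown tokens.
            if token in eng_vocab:
                token_vecs.append(ft_eng.get_word_vector(token))
            elif token in esp_vocab:
                token_vecs.append(ft_esp.get_word_vector(token))
        vectors.append(np.mean(token_vecs, axis=0) if token_vecs else np.zeros(300))
    return np.array(vectors)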

# Train the model using logistic regression or random forest, with grid search for hyperparameter tuning
def train_model(X_train, y_train):
    """
    Train the model using logistic regression and random forest classifiers, and perform grid search to find the best hyperparameters.
    """
    # Define the pipeline; the 'clf' step is filled in by the grid search
    pipe = Pipeline([
        ('clf', None)
    ])
    # Set the hyperparameters to search
    param_grid = [
        {
            'clf': [LogisticRegression(solver='liblinear', max_iter=1000)],  # Increased the number of iterations
            'clf__C': np.logspace(-4, 4, 20),
            'clf__penalty': ['l1', 'l2']
        },
        {
            'clf': [RandomForestClassifier()],
            'clf__n_estimators': [10, 50, 100, 200],
            'clf__max_depth': [None, 10, 20, 30],
            'clf__min_samples_split': [2, 5, 10],
            'clf__min_samples_leaf': [1, 2, 4]
        }
    ]
    # Perform grid search
    grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_
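
# Example (illustrative, not part of the original workflow): after train_model()
# returns, the winning classifier and its hyperparameters can be inspected from
# the fitted pipeline, e.g.:
#
#     model = train_model(X_train, y_train)
#     best_clf = model.named_steps['clf']
#     print(type(best_clf).__name__)   # "LogisticRegression" or "RandomForestClassifier"
#     print(best_clf.get_params())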

# Evaluate the model
def evaluate_model(model, X_test, y_test):
    """
    Evaluate the trained model on the test set and print the classification report.
    """
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))

# Save the model
def save_model(model, vectorizer, file_name):
    """
    Save the trained model and vectorizer to a file, including the current timestamp.
    """
    timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    file_name = f"{file_name}_{timestamp}.joblib"
    dump({'model': model, 'vectorizer': vectorizer}, file_name)
    print(f"Model saved to {file_name}")

# Load the saved model
def load_saved_model(file_name):
    """
    Load the saved model and vectorizer from a file.
    """
    loaded = load(file_name)
    return loaded['model'], loaded['vectorizer']
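
# Illustrative helper (not in the original script): classify a single URL with a
# model/vectorizer pair as returned by load_saved_model(). The function name and
# its return convention are assumptions, not part of the original design.
def predict_url(url, model, vectorizer):
    """
    Return the predicted label for one URL, plus the top class probability when available.
    """
    features = vectorizer.transform([url])
    label = model.predict(features)[0]
    proba = model.predict_proba(features)[0].max() if hasattr(model, 'predict_proba') else None
    return label, proba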

def get_existing_models(directory):
    """
    Get a list of existing model files in the specified directory.
    """
    model_files = [f for f in os.listdir(directory) if f.endswith('.joblib') and f.startswith('trained_model')]
    return model_files

def compare_models(new_model, existing_models, X_test, y_test):
    """
    Compare the new model to existing models and return the best model.

    Note: the saved models are scored on features produced by the current run's
    vectorizer, so the comparison is only meaningful while the vectorizer (and its
    fitted vocabulary) stays compatible across runs.
    """
    best_model = new_model
    best_score = new_model.score(X_test, y_test)
    for model_file in existing_models:
        loaded_model, _ = load_saved_model(model_file)
        loaded_score = loaded_model.score(X_test, y_test)
        if loaded_score > best_score:
            best_model = loaded_model
            best_score = loaded_score
    return best_model

if __name__ == '__main__':
    print("Loading data...")
    data = load_data('c:/Users/Nano/Desktop/APruebavirtualenv/Ahorasiquesi/.venv/datos_entrenamiento.csv')
    print("Preprocessing data...")
    X_train, X_test, y_train, y_test, vectorizer = preprocess_data(data)
    print("Training the model...")
    model = train_model(X_train, y_train)
    print("Evaluating the model...")
    evaluate_model(model, X_test, y_test)
    print("Checking for existing models...")
    existing_models = get_existing_models('.')
    print("Comparing models...")
    best_model = compare_models(model, existing_models, X_test, y_test)
    if best_model is model:
        print("Saving the new model...")
        save_model(model, vectorizer, 'trained_model')
    else:
        print("The new model is not better than the existing models. No new model will be saved.")