# -*- coding: utf-8 -*-
"""
Created on Sun Feb 26 19:25:06 2023
@author: imyaash-admin
Helper functions for theOneFunction (Machine Learning Model Selector, Builder & Hyper-Parameter Tuner).
"""
import random
from scipy.stats import ttest_ind, ks_2samp
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder, Normalizer, StandardScaler, MinMaxScaler
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.model_selection import RandomizedSearchCV
from regCandidateModels import regressionEnsembleModel
from clfCandidateModels import classificationEnsembleModel
from itertools import combinations
def findBestSubset(data, n_subsets, subsetSize, randomState):
"""
Randomly draws n_subsets subsets from the dataset, runs a column-wise two-sample t-test
between each subset and the full dataset, and returns the subset with the highest mean
p-value, i.e. the subset that best represents the dataset.
Parameters:
data (DataFrame): Dataset to select subsets from
n_subsets (int): Number of subsets to randomly select
subsetSize (int): Size of each subset
randomState (int): Setting a seed value
Returns:
(DataFrame): Subset that best represents the dataset
"""
bestSubset = None
best_pvalue = 0
# Setting a seed value
random.seed(randomState)
for i in range(n_subsets):
# Randomly selecting a subset
subsetIndices = random.sample(range(data.shape[0]), subsetSize)
subset = data.iloc[subsetIndices]
# Performing hypothesis testing
t_stat, pvalue = ttest_ind(data, subset)
# Checking if pvalue for the subset is better than the current best pvalue
if np.mean(pvalue) > best_pvalue:
bestSubset = subset
best_pvalue = np.mean(pvalue)
return bestSubset
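# Illustrative usage (a sketch, not called anywhere in this module): assuming `df` is an
# all-numeric pandas DataFrame with a few hundred rows, a representative 100-row sample
# could be drawn as
#   representative = findBestSubset(df, n_subsets = 20, subsetSize = 100, randomState = 42)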
def cleaner(data):
"""
Cleans a data frame by removing columns with more than 30% NaNs/empty rows and
imputing NaNs/empty rows with the median (if there are outliers) or the mean (otherwise).
Parameters:
data (DataFrame): The data frame to clean.
Returns:
DataFrame: The cleaned data frame.
"""
# Calculate the percentage of NaNs/empty rows in each column
nanPercentages = data.isna().mean() * 100
# Identify columns with more than 30% NaNs/empty rows
toDrop = nanPercentages[nanPercentages > 30].index.tolist()
# Remove the identified columns
data = data.drop(toDrop, axis=1)
# Impute NaNs/empty rows with the median (if there are outliers) or the mean (otherwise)
for col in data.columns:
median = data[col].median()
# nanpercentile ignores the NaNs that are about to be imputed
q1, q3 = np.nanpercentile(data[col], [25, 75])
iqr = q3 - q1
lBound = q1 - (1.5 * iqr)
uBound = q3 + (1.5 * iqr)
# Impute with the median when the column contains outliers, with the mean otherwise
if ((data[col] < lBound) | (data[col] > uBound)).any():
data[col] = data[col].fillna(median)
else:
data[col] = data[col].fillna(data[col].mean())
return data
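# Illustrative usage sketch, assuming a mostly numeric DataFrame read from a placeholder path:
#   raw = pd.read_csv("data.csv")   # "data.csv" is a placeholder, not a file shipped with this project
#   cleaned = cleaner(raw)          # drops columns with > 30% NaNs and imputes the rest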
def selector(data, targetVar, isContinuous, randomState):
"""
Selects the 5 most important features from a given data using random forest
based feature selection.
Parameters:
data : pandas.DataFrame
Input data containing the features and target variable.
targetVar : str
Name of the target variable in the input data.
isContinuous : bool
Whether the target variable is continuous or categorical.
randomState : int
Setting a seed value.
Returns:
data : pandas.DataFrame
A DataFrame containing the 5 selected features and the target variable.
"""
# Splitting the data into features (X) and target variable (y)
X = data.drop(targetVar, axis = 1)
y = data[targetVar]
# Creating a Random Forest Model
if isContinuous:
model = RandomForestRegressor(random_state = randomState)
else:
model = RandomForestClassifier(random_state = randomState)
# Fitting the model on the dataset
model.fit(X, y)
# Getting the feature importances
featureImportances = pd.Series(model.feature_importances_, index = X.columns)
# Sorting the feature importances in descending order
sortedImportances = featureImportances.sort_values(ascending = False)
# Selecting the top 5 features
selectedFeatures = sortedImportances.head(5).index.tolist()
# Adding target variable to the selected features
selectedFeatures.append(targetVar)
return data[selectedFeatures]
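# Illustrative usage sketch, assuming `cleaned` is a numeric DataFrame with an example target
# column named "price":
#   reduced = selector(cleaned, targetVar = "price", isContinuous = True, randomState = 42)
#   # `reduced` holds the 5 most important features plus the "price" column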
def encoder(data):
"""
Encodes non-numeric columns of a given pandas DataFrame using ordinal encoding.
Parameters:
-----------
data : pandas DataFrame
The input DataFrame to be encoded.
Returns:
--------
pandas DataFrame
The encoded DataFrame, with non-numeric columns transformed using ordinal encoding.
Notes:
------
1. If no non-numeric columns are found in the input DataFrame, the function returns the original DataFrame.
2. The function modifies the input DataFrame in place and does not create a new copy.
"""
# Checking if any column is not numerical
nonNumCols = data.select_dtypes(exclude = ["int", "float"]).columns
if len(nonNumCols) == 0:
return data
# Initialising the ordinal encoder
ordinalEncoder = OrdinalEncoder()
# Fitting and transforming the encoder on the non numerical columns
data[nonNumCols] = ordinalEncoder.fit_transform(data[nonNumCols])
return data
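# Illustrative usage sketch, assuming `df` contains string columns (e.g. an example "colour" column):
#   encoded = encoder(df)   # non-numeric columns are replaced in place with ordinal integer codes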
def normaliser(X):
"""
This function applies scikit-learn's Normalizer, rescaling each sample (row) to unit norm, and returns the normalised data.
Parameters
----------
X : array-like or sparse matrix, shape (n_samples, n_features)
Input data containing the features.
Returns
-------
ndarray or sparse matrix, shape (n_samples, n_features)
Normalised input data.
"""
norm = Normalizer()
return norm.fit_transform(X)
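# Illustrative usage sketch. Note that Normalizer rescales each row to unit norm, which is
# different from the column-wise scaling offered by normaliser_v2 further down:
#   X_unit = normaliser(encoded.drop("price", axis = 1))   # "price" is the example target name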
def classBalanceChecker(data, targetVar): # Unused
"""
Checks if the classes in a dataset are balanced by calculating the class ratio and returns a class_weight dictionary
if the ratio is greater than 5.
Parameters:
-----------
data : pandas DataFrame
The input dataset.
targetVar : str
The name of the column that contains the target variable.
Returns:
--------
dict or None
Returns a dictionary of class weights if the class ratio is greater than 5, otherwise returns None.
Raises:
-------
ValueError
If the number of unique classes in the dataset is less than 2.
"""
# Getting the count of unique classes in the data
numClasses = len(data[targetVar].unique())
# Raising an exception if the data has fewer than 2 classes
if numClasses < 2:
raise ValueError("Data should have 2 or more classes.")
# Calculating the class ratio for each class label
classLabels = data[targetVar].unique()
classRatios = {}
for label in classLabels:
classRatios[label] = (data[targetVar] == label).sum() / len(data)
# Checking if the class ratio is greater than 5
if max(classRatios.values()) / min(classRatios.values()) > 5:
# Calculating class weights, keyed by class label as expected by scikit-learn's class_weight
total = sum(classRatios.values())
classWeights = {}
for label in classLabels:
classWeights[label] = total / (numClasses * classRatios[label])
return classWeights
else:
return None
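# Illustrative usage sketch, assuming `df` has an example categorical target column "churn":
#   weights = classBalanceChecker(df, targetVar = "churn")
#   # `weights` is None for roughly balanced data, otherwise a {class label: weight} dict
#   # that could be passed to a classifier's class_weight parameter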
def selector_v2(data, targetVar, isContinuous, randomState, k = 5):
"""
Performs mutual information-based feature selection on the input data.
Parameters:
data : pandas.DataFrame
Input data containing the features and target variable.
targetVar : str
Name of the target variable in the input data.
isContinuous : bool
Whether the target variable is continuous or categorical.
randomState : int
Setting a seed value.
k : int, default = 5
Number of top features to select.
Returns:
data : pandas.DataFrame
A DataFrame containing the selected features and the target variable.
selected_features : list of str
A list of the selected feature names.
"""
# Separating the features and target variable
features = data.drop(targetVar, axis = 1)
target = data[targetVar]
# Performing mutual information-based feature selection
if isContinuous:
mi = mutual_info_regression(features, target, random_state=randomState)
else:
mi = mutual_info_classif(features, target, random_state=randomState)
# Ranking the features by their mutual information with the target variable
ranked_features = pd.Series(mi, index=features.columns)
ranked_features.sort_values(ascending=False, inplace=True)
# Selecting the top k features
selected_features = ranked_features.index[:k].tolist()
# Creating a new DataFrame with the selected features and the target variable
selected_data = pd.concat([features[selected_features], target], axis=1)
return selected_data, selected_features
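# Illustrative usage sketch, assuming `cleaned` is numeric with an example target column "price":
#   sel_data, sel_features = selector_v2(cleaned, "price", isContinuous = True, randomState = 42, k = 5)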
def trainer(models, sub, targetVar, randomState):
"""
Trains each model using cross-validation to find the best hyper-parameters.
Parameters:
models : list of tuples
A list of tuples containing the model, hyperparameters to search over, and scoring method.
sub : pandas.DataFrame
Input data containing the subset of features and target variable.
targetVar : str
Name of the target variable column in the subset data.
randomState : int
Setting a seed value.
Returns:
bestModel : sklearn.model_selection._search.RandomizedSearchCV
The best model based on the cross-validation results.
bestScore : float
The mean cross-validation score of the best model.
"""
# Splitting the stratified subset into features and target
features = sub.drop(targetVar, axis = 1)
target = sub[targetVar]
# Training each model and finding the best hyper-parameters by cross-validation
bestModel = None
bestScore = 0
for model, hyperparams, scoring in models:
tuner = RandomizedSearchCV(model, hyperparams, cv = 5, n_jobs = -1, n_iter = 25, scoring = scoring, verbose = 2, random_state = randomState)
# Using the raw features for tree-based models and scaled features for the rest
if type(model).__name__ in ["DecisionTreeRegressor", "DecisionTreeClassifier", "RandomForestRegressor", "RandomForestClassifier", "ExtraTreesRegressor", "ExtraTreesClassifier", "GradientBoostingRegressor", "GradientBoostingClassifier"]:
tuner.fit(features, target)
elif type(model).__name__ in ["LinearRegression", "Ridge", "Lasso", "SGDRegressor", "ElasticNet", "BayesianRidge", "KNeighborsRegressor", "RadiusNeighborsRegressor", "GaussianProcessRegressor"]:
normalisedFeatures = normaliser_v2(features, "standard")
tuner.fit(normalisedFeatures, target)
else:
normalisedFeatures = normaliser_v2(features, "minmax")
tuner.fit(normalisedFeatures, target)
score = tuner.best_score_
if np.nanmean(score) > bestScore:
bestModel = tuner
bestScore = np.nanmean(score)
return bestModel, bestScore
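# Illustrative sketch of the expected `models` argument: a list of (estimator, hyper-parameter
# distributions, scoring string) tuples. The grids below are arbitrary example values:
#   from sklearn.linear_model import Ridge
#   models = [
#       (Ridge(), {"alpha": [0.1, 1.0, 10.0]}, "r2"),
#       (RandomForestRegressor(), {"n_estimators": [100, 300]}, "r2"),
#   ]
#   bestModel, bestScore = trainer(models, sub = reduced, targetVar = "price", randomState = 42)
#   # `reduced` and "price" are the example DataFrame and target name used above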
def normaliser_v2(X, method):
"""
This function performs either standard scaling or min-max scaling on the input data
and returns the normalised data.
Parameters
----------
X : array-like or sparse matrix, shape (n_samples, n_features)
Input data containing the features.
method : str {"standard", "minmax"}
Normalization method to be applied.
Returns
-------
ndarray or sparse matrix, shape (n_samples, n_features)
Normalised input data.
Raises
------
ValueError
If method is not one of "standard" or "minmax".
"""
if method == "standard":
normaliser = StandardScaler()
elif method == "minmax":
normaliser = MinMaxScaler()
else:
raise ValueError("method must be one of 'standard' or 'minmax'.")
return normaliser.fit_transform(X)
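# Illustrative usage sketch:
#   X_std = normaliser_v2(features, "standard")   # zero mean, unit variance per column
#   X_mm = normaliser_v2(features, "minmax")      # each column rescaled to [0, 1]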
def cleaner_v2(data):
"""
Cleans a data frame by removing columns with more than 30% NaNs/empty rows,
imputing NaNs/empty rows with the median (if there are outliers) or the mean (otherwise),
and dropping any duplicate rows.
Parameters:
data (DataFrame): The data frame to clean.
Returns:
DataFrame: The cleaned data frame.
"""
# Identify duplicate rows and drop them
data.drop_duplicates(inplace=True)
# Calculate the percentage of NaNs/empty rows in each column
nanPercentages = data.isna().mean() * 100
# Identify columns with more than 30% NaNs/empty rows
toDrop = nanPercentages[nanPercentages > 30].index.tolist()
# Remove the identified columns
data = data.drop(toDrop, axis=1)
# Impute NaNs/empty rows with the median (if there are outliers) or the mean (otherwise)
for col in data.columns:
median = data[col].median()
# nanpercentile ignores the NaNs that are about to be imputed
q1, q3 = np.nanpercentile(data[col], [25, 75])
iqr = q3 - q1
lBound = q1 - (1.5 * iqr)
uBound = q3 + (1.5 * iqr)
# Impute with the median when the column contains outliers, with the mean otherwise
if ((data[col] < lBound) | (data[col] > uBound)).any():
data[col] = data[col].fillna(median)
else:
data[col] = data[col].fillna(data[col].mean())
return data
def subsetFinder(data, nSubsets, subsetSize, randomState):
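"""
Randomly draws nSubsets subsets of size subsetSize from the data and compares each subset
to the full dataset with a column-wise two-sample Kolmogorov-Smirnov test.
Parameters:
data (DataFrame): Dataset to select subsets from
nSubsets (int): Number of subsets to randomly draw
subsetSize (int): Size of each subset
randomState (int): Setting a seed value
Returns:
(DataFrame): The subset with the highest average p-value, i.e. the one that best represents the dataset
"""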
bestSubset = None
bestAvgPValue = 0
# Setting a seed value
random.seed(randomState)
for i in range(nSubsets):
# Randomly selecting a subset
subsetIndices = random.sample(range(data.shape[0]), subsetSize)
subset = data.iloc[subsetIndices]
# Compute p-value for each column
pValues = []
for col in data.columns:
_, pValue = ks_2samp(data[col], subset[col])
pValues.append(pValue)
# Checking if the average p-value of the subset is better than the current best average p-value
if np.mean(pValues) > bestAvgPValue:
bestSubset = subset
bestAvgPValue = np.mean(pValues)
return bestSubset
def retrainer(bestModel, data, targetVar):
"""
Retrains the best model using the non-scaled data for tree-based models and
the standard or minmax scaled data for other models.
Parameters:
bestModel : sklearn.model_selection._search.RandomizedSearchCV
The best model based on the cross-validation results.
data : pandas.DataFrame
Input data containing the features and target variable.
targetVar : str
The name of the target variable in the data.
Returns:
bestModel : sklearn.model_selection._search.RandomizedSearchCV
The retrained best model based on the cross-validation results.
bestScore : float
The mean cross-validation score of the retrained model.
bestParams : dict
The best hyper-parameters found by cross-validation of the retrained model.
"""
# Splitting the data into features and target
features = data.drop(targetVar, axis = 1)
target = data[targetVar]
# Re-fitting the best model on the whole dataset, using the non-scaled data for tree-based models
if type(bestModel.best_estimator_).__name__ in ["DecisionTreeRegressor", "DecisionTreeClassifier", "RandomForestRegressor", "RandomForestClassifier", "ExtraTreesRegressor", "ExtraTreesClassifier", "GradientBoostingRegressor", "GradientBoostingClassifier"]:
bestModel.fit(features, target)
elif type(bestModel.best_estimator_).__name__ in ["LinearRegression", "Ridge", "Lasso", "SGDRegressor", "ElasticNet", "BayesianRidge", "KNeighborsRegressor", "RadiusNeighborsRegressor", "GaussianProcessRegressor"]:
normalisedFeatures = normaliser_v2(features, "standard")
bestModel.fit(normalisedFeatures, target)
else:
normalisedFeatures = normaliser_v2(features, "minmax")
bestModel.fit(normalisedFeatures, target)
bestScore = bestModel.best_score_
bestParams = bestModel.best_params_
return bestModel, bestScore, bestParams
def trainer_v2(models, sub, targetVar, randomState):
"""
Trains a set of machine learning models on a stratified subset of data, and returns the best model
and the top three models that are not tree-based.
Parameters:
-----------
models: list of tuples
A list of tuples, where each tuple contains a machine learning model, its hyperparameters, and a scoring metric.
sub: pandas DataFrame
A stratified subset of the data to train the models on.
targetVar: str
The name of the target variable in the dataset.
randomState: int
Random state to be used for reproducibility.
Returns:
--------
bestModel: RandomizedSearchCV object
The best model selected based on the cross-validation performance.
bestScore: float
The mean cross-validation score of the best model.
top3Models: list of RandomizedSearchCV objects
The top three models that are not tree-based, based on their cross-validation scores.
"""
# Splitting the stratified subset into features and target
features = sub.drop(targetVar, axis = 1)
target = sub[targetVar]
# Training each model and finding the best hyper-parameters by cross validation
bestModel = None
bestScore = 0
top3Models = []
top3Scores = []
for model, hyperparams, scoring in models:
tuner = RandomizedSearchCV(model, hyperparams, cv = 5, n_jobs = -1, n_iter = 25, scoring = scoring, verbose = 5, random_state = randomState)
# Using the raw features for tree-based models and scaled features for the rest
if type(model).__name__ in ["DecisionTreeRegressor", "DecisionTreeClassifier", "RandomForestRegressor", "RandomForestClassifier", "ExtraTreesRegressor", "ExtraTreesClassifier", "GradientBoostingRegressor", "GradientBoostingClassifier"]:
tuner.fit(features, target)
elif type(model).__name__ in ["LinearRegression", "Ridge", "Lasso", "SGDRegressor", "ElasticNet", "BayesianRidge", "KNeighborsRegressor", "RadiusNeighborsRegressor", "GaussianProcessRegressor"]:
normalisedFeatures = normaliser_v2(features, "standard")
tuner.fit(normalisedFeatures, target)
else:
normalisedFeatures = normaliser_v2(features, "minmax")
tuner.fit(normalisedFeatures, target)
score = tuner.best_score_
if np.nanmean(score) > bestScore:
bestModel = tuner
bestScore = np.nanmean(score)
# Checking if the top 3 model list has fewer than 3 models
if len(top3Models) < 3:
if type(bestModel.best_estimator_).__name__ not in ["DecisionTreeRegressor", "DecisionTreeClassifier", "RandomForestRegressor", "RandomForestClassifier", "ExtraTreesRegressor", "ExtraTreesClassifier", "GradientBoostingRegressor", "GradientBoostingClassifier"]:
top3Models.append(bestModel)
top3Scores.append(bestScore)
else:
if type(bestModel.best_estimator_).__name__ not in ["DecisionTreeRegressor", "DecisionTreeClassifier", "RandomForestRegressor", "RandomForestClassifier", "ExtraTreesRegressor", "ExtraTreesClassifier", "GradientBoostingRegressor", "GradientBoostingClassifier"]:
# Comparing score of the new model with the ones in the list
if bestScore > min(top3Scores):
# Adding new model and score to the list
top3Models.append(bestModel)
top3Scores.append(bestScore)
# Removing the worst model from the best model list
top3Models.pop(top3Scores.index(min(top3Scores)))
top3Scores.pop(top3Scores.index(min(top3Scores)))
return bestModel, bestScore, top3Models
def ensembler(top3Models, sub, targetVar, isContinuous, randomState):
"""
This function takes in a list of top 3 models, a subset of data, target
variable, a boolean value indicating whether the target variable is continuous
or not, and a random state.
The function creates an ensemble model by combining different subsets of
the top 3 models and selects the best ensemble model using a random search
cross-validation approach. If the target variable is continuous, it uses a
regression ensemble model, and if it is categorical, it uses a classification
ensemble model.
Parameters:
-----------
top3Models: list
The top three non-tree-based models returned by trainer_v2.
sub: pandas DataFrame
A stratified subset of the data to train the models on.
targetVar: str
The name of the target variable in the dataset.
isContinuous: bool
boolean value indicating whether the target variable is continuous or not
randomState: int
Random state to be used for reproducibility.
Returns:
--------
bestEnsembleModel: RandomizedSearchCV object
The best model selected based on the cross-validation performance.
bestEnsembleScore: float
The mean cross-validation score of the best model.
"""
# Splitting the subset of the data into features and target variable
features = sub.drop(targetVar, axis = 1)
target = sub[targetVar]
# Initialising the best ensemble model and best score to zero
bestEnsembleModel = None
bestEnsembleScore = 0
# Looping through the different combinations of models
for i in range(1, len(top3Models) + 1):
for combo in combinations(top3Models, i):
# If the target variable is continuous
if isContinuous:
# Creating a regression ensemble model
ensembleModel = regressionEnsembleModel(combo, randomState)
# Performing a randomised search cross-validation with 5 folds
tuner = RandomizedSearchCV(ensembleModel[0], ensembleModel[1], cv = 5, n_jobs = -1, n_iter = 25, scoring = ensembleModel[2], verbose = 5, random_state = randomState)
# Normalising the features with standard scaling
normalisedFeatures = normaliser_v2(features, "standard")
# Fitting the model on the normalised features
tuner.fit(normalisedFeatures, target)
# If target variable is categorical
else:
# Creating a classification ensemble model
ensembleModel = classificationEnsembleModel(combo, randomState)
# Performing a randomised search cross-validation with 5 folds
tuner = RandomizedSearchCV(ensembleModel[0], ensembleModel[1], cv = 5, n_jobs = -1, n_iter = 25, scoring = ensembleModel[2], verbose = 5, random_state = randomState)
# Normalising the features with minmax scaling
normalisedFeatures = normaliser_v2(features, "minmax")
# Fitting the model on the normalised features
tuner.fit(normalisedFeatures, target)
# Getting the best cross-validation score of the model
score = tuner.best_score_
# If the current ensemble model has better score, update the best ensemble model and best score
if np.nanmean(score) > bestEnsembleScore:
bestEnsembleModel = tuner
bestEnsembleScore = np.nanmean(score)
return bestEnsembleModel, bestEnsembleScore
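# Illustrative usage sketch, assuming `sub` is a stratified subset with example target "price"
# and `models` is a candidate list in the same (model, hyperparams, scoring) format shown after trainer():
#   bestModel, bestScore, top3Models = trainer_v2(models, sub, "price", randomState = 42)
#   bestEnsembleModel, bestEnsembleScore = ensembler(top3Models, sub, "price", isContinuous = True, randomState = 42)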
def compareNselect(bestModel, bestScore, bestEnsembleModel, bestEnsembleScore):
"""
This function compares the scores of two models and returns the model with the higher score.
Parameters:
-----------
bestModel: RandomizedSearchCV object
The best single model selected based on the cross-validation performance.
bestScore: float
The mean cross-validation score of the best single model.
bestEnsembleModel: RandomizedSearchCV object
The best ensemble model selected based on the cross-validation performance.
bestEnsembleScore: float
The mean cross-validation score of the best ensemble model.
Returns:
--------
bestModel, bestScore
The best single model and its score, if it outperforms the ensemble.
||or||
bestEnsembleModel, bestEnsembleScore
The best ensemble model and its score, otherwise (ties go to the ensemble).
"""
# Checking if the score of the ensemble is lower than the score of the input best model
if bestEnsembleScore < bestScore:
# Returning the input best model and its score
return bestModel, bestScore
else:
# Returning the best ensemble model and its score
return bestEnsembleModel, bestEnsembleScore
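# Illustrative usage sketch, continuing the example after ensembler():
#   finalModel, finalScore = compareNselect(bestModel, bestScore, bestEnsembleModel, bestEnsembleScore)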
def retrainer_v2(bestModel, data, targetVar):
"""
Retrains the best model using the non-scaled data for tree-based models and
the standard or minmax scaled data for other models.
Parameters:
bestModel : sklearn.model_selection._search.RandomizedSearchCV
The best model based on the cross-validation results.
data : pandas.DataFrame
Input data containing the features and target variable.
targetVar : str
The name of the target variable in the data.
Returns:
bestModel : sklearn.model_selection._search.RandomizedSearchCV
The retrained best model based on the cross-validation results.
bestScore : float
The mean cross-validation score of the retrained model.
bestParams : dict
The best hyper-parameters found by cross-validation of the retrained model.
"""
# Splitting the data into features and target
features = data.drop(targetVar, axis = 1)
target = data[targetVar]
# Re-fitting the best model on the whole dataset, using the non-scaled data for tree-based models
if type(bestModel.best_estimator_).__name__ in ["DecisionTreeRegressor", "DecisionTreeClassifier", "RandomForestRegressor", "RandomForestClassifier", "ExtraTreesRegressor", "ExtraTreesClassifier", "GradientBoostingRegressor", "GradientBoostingClassifier"]:
bestModel.fit(features, target)
elif type(bestModel.best_estimator_).__name__ in ["LinearRegression", "Ridge", "Lasso", "SGDRegressor", "ElasticNet", "BayesianRidge", "KNeighborsRegressor", "RadiusNeighborsRegressor", "GaussianProcessRegressor", "AdaBoostRegressor"]:
normalisedFeatures = normaliser_v2(features, "standard")
bestModel.fit(normalisedFeatures, target)
else:
normalisedFeatures = normaliser_v2(features, "minmax")
bestModel.fit(normalisedFeatures, target)
bestScore = bestModel.best_score_
bestParams = bestModel.best_params_
return bestModel, bestScore, bestParams
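# One plausible end-to-end chain of these helpers (a sketch only; variable names, the file path
# and the "target" column are placeholders, not part of this module):
#   data = encoder(cleaner_v2(pd.read_csv("data.csv")))
#   selected, _ = selector_v2(data, "target", isContinuous = True, randomState = 42)
#   sub = subsetFinder(selected, nSubsets = 20, subsetSize = 200, randomState = 42)
#   bestModel, bestScore, top3 = trainer_v2(models, sub, "target", randomState = 42)
#   bestEns, bestEnsScore = ensembler(top3, sub, "target", isContinuous = True, randomState = 42)
#   final, score = compareNselect(bestModel, bestScore, bestEns, bestEnsScore)
#   final, score, params = retrainer_v2(final, selected, "target")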