Merge pull request #46 from Alex-Lekov/release-2023.3.10
Release 2023.3.10
Alex-Lekov authored Mar 9, 2023
2 parents a5b01e7 + 21acebc commit 5347aa9
Showing 33 changed files with 6,103 additions and 16,964 deletions.
8 changes: 0 additions & 8 deletions .deepsource.toml

This file was deleted.

32 changes: 0 additions & 32 deletions .devcontainer/devcontainer.json

This file was deleted.

5 changes: 3 additions & 2 deletions .gitignore
@@ -135,9 +135,8 @@ dmypy.json
 */.DS_Store
 alexautoml/.DS_Store
 .catboost_info
-*.catboost_info/
+**/catboost_info
 catboost_info/*
-*/catboost_info
 *.pkl
 test_de.zip
 .gitignore
@@ -147,3 +146,5 @@ de.zip
 test_save.zip
 examples/prod_sample/catboost_info/*
 result
+.devcontainer
+poetry.lock
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file.
 
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
+## [2023.3.9]
+### Changed
+- Update dependencies
+### Fix
+- ValueError: X and y both have indexes, but they do not match.
+
 
 ## [1.3.10]
 ### Fix
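
Note on the ValueError entry above: that error arises when pandas X and y carry row indexes that disagree, typically after one of them has been filtered or shuffled. A minimal standalone sketch of the situation and the usual remedy (illustrative only; this is not the actual patch shipped in the release):

import pandas as pd

X = pd.DataFrame({"feature": [1, 2, 3]})      # index: 0, 1, 2
y = pd.Series([0, 1, 0], index=[10, 11, 12])  # e.g. left over from a filter or shuffle

if not X.index.equals(y.index):
    # Re-align both to a clean positional index; row order is unchanged.
    X = X.reset_index(drop=True)
    y = y.reset_index(drop=True)

assert X.index.equals(y.index)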
2 changes: 1 addition & 1 deletion Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.9-buster
+FROM python:3.10-buster
 
 # Uncomment the following COPY line and the corresponding lines in the `RUN` command if you wish to
 # include your requirements in the image itself. It is suggested that you only do this if your
2 changes: 1 addition & 1 deletion README.md
@@ -165,7 +165,7 @@ $ optuna-dashboard sqlite:///db.sqlite3
 
 - [x] Add opt Pruners
 
-- [x] Docs Site
+- [ ] Docs Site
 
 - [ ] DL Encoders
 
3 changes: 2 additions & 1 deletion automl_alex/__init__.py
@@ -6,4 +6,5 @@
 from .cross_validation import *
 from .optimizer import *
 from ._logger import *
-from .__version__ import __version__
+
+__version__ = "2023.3.10"
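
With the version string now defined directly in __init__.py (and automl_alex/__version__.py deleted below), the installed package reports the release as usual:

import automl_alex

print(automl_alex.__version__)  # "2023.3.10" for this release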
1 change: 0 additions & 1 deletion automl_alex/__version__.py

This file was deleted.

7 changes: 3 additions & 4 deletions automl_alex/_base.py
@@ -228,13 +228,13 @@ def opt(
         timeout=200,  # optimization time in seconds
         metric=None,
         metric_round=4,
-        combined_score_opt=False,
         cold_start=30,
         auto_parameters=True,
         folds=7,
         score_folds=2,
         opt_lvl=2,
         early_stoping=100,
+        feature_selection=False,
         verbose=1,
     ):
         """
@@ -249,7 +249,7 @@ def opt(
             opt_lvl=None (None or int):
             direction=None (None or str):
             early_stoping=100 (int):
-            feature_selection=True (bool):
+            feature_selection=False (bool):
             verbose=1 (int):
 
         Returns:
@@ -273,8 +273,7 @@ def opt(
             clean_and_encod_data=False,
            type_of_estimator=self._type_of_estimator,
            models_names=[self.__name__],
-            target_encoders_names=[],
-            feature_selection=False,
+            feature_selection=feature_selection,
            auto_parameters=auto_parameters,
            folds=folds,
            score_folds=score_folds,
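
The hunks above add a feature_selection argument to opt() and forward it to the underlying search (replacing the hard-coded False), while dropping combined_score_opt and target_encoders_names. A hedged usage sketch; only the keyword arguments shown in this diff are taken from the commit, while the LightGBM import path, its constructor argument, and the positional X/y are assumptions based on the package's documented API:

import pandas as pd
from sklearn.datasets import make_classification
from automl_alex.models import LightGBM  # assumed import path

X, y = make_classification(n_samples=200, n_features=10, random_state=42)
X, y = pd.DataFrame(X), pd.Series(y)

model = LightGBM(type_of_estimator="classifier")  # assumed constructor
model.opt(
    X, y,
    timeout=200,             # optimization budget in seconds, as in the signature above
    feature_selection=True,  # now forwarded instead of being hard-coded to False
)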
18 changes: 0 additions & 18 deletions automl_alex/_encoders.py
@@ -10,12 +10,6 @@
     OrdinalEncoder,
     BaseNEncoder,
 )
-from category_encoders import (
-    TargetEncoder,
-    CatBoostEncoder,
-    WOEEncoder,
-    JamesSteinEncoder,
-)
 from category_encoders.count import CountEncoder
 
 ################################################################
@@ -32,15 +26,3 @@
     "BaseNEncoder": BaseNEncoder,
     "CountEncoder": CountEncoder,
 }
-
-
-################################################################
-# Target Encoders
-################################################################
-
-target_encoders_names = {
-    "TargetEncoder": TargetEncoder,
-    "CatBoostEncoder": CatBoostEncoder,
-    "WOEEncoder": WOEEncoder,
-    "JamesSteinEncoder": JamesSteinEncoder,
-}
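
After this deletion the encoder registry keeps only encoders that fit without access to the target, so they can be applied once, outside the cross-validation loop, with no risk of label leakage. A small sketch with two of the remaining encoders (synthetic data):

import pandas as pd
from category_encoders import OrdinalEncoder
from category_encoders.count import CountEncoder

df = pd.DataFrame({"city": ["kyiv", "lviv", "kyiv", "odesa"]})

print(OrdinalEncoder().fit_transform(df))  # one integer code per category
print(CountEncoder().fit_transform(df))    # occurrence count per category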
30 changes: 15 additions & 15 deletions automl_alex/automl_alex.py
@@ -317,7 +317,7 @@ def fit(
         score_folds: int = 3,
         opt_lvl: int = 2,
         early_stoping: int = 100,
-        feature_selection: bool = False,
+        feature_selection: bool = True,
         verbose: int = 3,
     ) -> None:
         """
@@ -412,15 +412,9 @@ def fit(
                 "OneHotEncoder",
                 "CountEncoder",
                 "HashingEncoder",
-                "BackwardDifferenceEncoder",
             ],
-            "target_encoders_names": [
-                "TargetEncoder",
-                "JamesSteinEncoder",
-                "CatBoostEncoder",
-            ],
             "clean_outliers": [True, False],
-            "num_generator_select_operations": True,
+            "num_generator_select_operations": False,
             "num_generator_operations": ["/", "*", "-", "+"],
             #'iteration_check': False,
         }
@@ -429,15 +423,21 @@ def fit(
         logger.info(50 * "#")
         logger.info("> Start Fit Models 2")
         logger.info(50 * "#")
 
-        # Model 2
-        self.model_2 = automl_alex.BestSingleModel(
-            models_names=[
-                # "LinearModel",
+        if self._type_of_estimator == "classifier":
+            models_names = [
                 "LightGBM",
-                # "ExtraTrees",
-                # "RandomForest",
-                # "MLP",
-            ],
+                "XGBoost",
+            ]
+        elif self._type_of_estimator == "regression":
+            models_names = [
+                "LinearModel",
+                "LightGBM",
+            ]
+
+        self.model_2 = automl_alex.BestSingleModel(
+            models_names=models_names,
             **params,
         )
 
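
Model 2 now picks its candidate list from the task type: LightGBM plus XGBoost for classification, LinearModel plus LightGBM for regression. A hedged construction sketch; the two keyword arguments mirror ones visible elsewhere in this commit's diffs, anything beyond that is an assumption:

import automl_alex

model_2 = automl_alex.BestSingleModel(
    models_names=["LinearModel", "LightGBM"],  # the regression branch above
    type_of_estimator="regression",            # passed the same way in _base.py
)

Note that, as committed, an estimator type other than these two would leave models_names unbound before the BestSingleModel call.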
98 changes: 0 additions & 98 deletions automl_alex/cross_validation.py
@@ -72,7 +72,6 @@ class CrossValidation(object):
     def __init__(
         self,
         estimator: Callable,  # model
-        target_encoders_names: List[str] = [],
         folds: int = 7,
         score_folds: int = 5,
         n_repeats: int = 1,
@@ -87,8 +86,6 @@ def __init__(
         estimator : Callable
             model object from automl_alex.models
             The object to use to fit model.
-        target_encoders_names : List[str]
-            name encoders (from automl_alex._encoders.target_encoders_names)
         folds : int, optional
             Number of folds., by default 7
         score_folds : int, optional
@@ -113,7 +110,6 @@ def __init__(
         self.n_repeats = n_repeats
         self.print_metric = print_metric
         self.metric_round = metric_round
-        self.target_encoders_names = target_encoders_names
 
         if metric is None:
             if estimator._type_of_estimator == "classifier":
@@ -173,30 +169,6 @@ def fit(
 
         for i, (train_idx, valid_idx) in enumerate(self.cv_split_idx):
             train_x, train_y = X.iloc[train_idx], y.iloc[train_idx]
-            # Target Encoder
-            if len(self.target_encoders_names) > 0:
-                train_x_copy = train_x[self.cat_features].copy()
-                for target_enc_name in self.target_encoders_names:
-                    self._fit_target_enc[
-                        f"{target_enc_name} _fold_{i}"
-                    ] = copy.deepcopy(
-                        target_encoders_names[target_enc_name](drop_invariant=True)
-                    )
-
-                    self._fit_target_enc[
-                        f"{target_enc_name} _fold_{i}"
-                    ] = self._fit_target_enc[f"{target_enc_name} _fold_{i}"].fit(
-                        train_x_copy, train_y
-                    )
-
-                    data_encodet = self._fit_target_enc[
-                        f"{target_enc_name} _fold_{i}"
-                    ].transform(train_x_copy)
-                    data_encodet = data_encodet.add_prefix(target_enc_name + "_")
-
-                    train_x = train_x.join(data_encodet.reset_index(drop=True))
-                train_x_copy = None
-                train_x.fillna(0, inplace=True)
 
             # Fit
             self.estimator.fit(X_train=train_x, y_train=train_y)
@@ -214,17 +186,6 @@ def predict_test(self, X_test):
 
         for i in range(self.folds * self.n_repeats):
             X_test_tmp = X_test.copy()
-            # Target Encoder
-            if len(self.target_encoders_names) > 0:
-                X_cat_features = X_test_tmp[self.cat_features].copy()
-                for target_enc_name in self.target_encoders_names:
-                    data_encodet = self._fit_target_enc[
-                        f"{target_enc_name} _fold_{i}"
-                    ].transform(X_cat_features)
-                    data_encodet = data_encodet.add_prefix(target_enc_name + "_")
-
-                    X_test_tmp = X_test_tmp.join(data_encodet.reset_index(drop=True))
-                X_test_tmp.fillna(0, inplace=True)
             # Predict
             y_pred_test = self.fited_models[
                 f"model_{self.estimator.__name__}_fold_{i}"
@@ -242,18 +203,6 @@ def predict_train(self, X):
 
         for i, (train_idx, valid_idx) in enumerate(self.cv_split_idx):
             val_x = X.iloc[valid_idx]
-            # Target Encoder
-            if len(self.target_encoders_names) > 0:
-                val_x_copy = val_x[self.cat_features].copy()
-                for target_enc_name in self.target_encoders_names:
-                    data_encodet = self._fit_target_enc[
-                        f"{target_enc_name} _fold_{i}"
-                    ].transform(val_x_copy)
-                    data_encodet = data_encodet.add_prefix(target_enc_name + "_")
-                    val_x = val_x.join(data_encodet.reset_index(drop=True))
-                val_x_copy = None
-                val_x.fillna(0, inplace=True)
-
             y_pred = self.fited_models[
                 f"model_{self.estimator.__name__}_fold_{i}"
             ].predict_or_predict_proba(val_x)
@@ -274,16 +223,6 @@ def get_feature_importance(self, X):
 
         for i in range(self.folds * self.n_repeats):
             X_tmp = X.copy()
-            # Target Encoder
-            if len(self.target_encoders_names) > 0:
-                X_cat_features = X[self.cat_features].copy()
-                for target_enc_name in self.target_encoders_names:
-                    data_encodet = self._fit_target_enc[
-                        f"{target_enc_name} _fold_{i}"
-                    ].transform(X_cat_features)
-                    data_encodet = data_encodet.add_prefix(target_enc_name + "_")
-
-                    X_tmp = X_tmp.join(data_encodet.reset_index(drop=True))
-                X_tmp.fillna(0, inplace=True)
             # Get feature_importance
             if i == 0:
@@ -321,30 +260,6 @@ def fit_score(
         for i, (train_idx, valid_idx) in enumerate(self.cv_split_idx):
             train_x, train_y = X.iloc[train_idx], y.iloc[train_idx]
             val_x, val_y = X.iloc[valid_idx], y.iloc[valid_idx]
-            # Target Encoder
-            if len(self.target_encoders_names) > 0:
-                val_x_copy = val_x[cat_features].copy()
-                train_x_copy = train_x[cat_features].copy()
-                for target_enc_name in self.target_encoders_names:
-                    target_enc = target_encoders_names[target_enc_name](
-                        drop_invariant=True
-                    )
-
-                    data_encodet = target_enc.fit_transform(train_x_copy, train_y)
-                    data_encodet = data_encodet.add_prefix(target_enc_name + "_")
-                    train_x = train_x.join(data_encodet.reset_index(drop=True))
-                    data_encodet = None
-
-                    val_x_data_encodet = target_enc.transform(val_x_copy)
-                    val_x_data_encodet = val_x_data_encodet.add_prefix(
-                        target_enc_name + "_"
-                    )
-                    val_x = val_x.join(val_x_data_encodet.reset_index(drop=True))
-                    val_x_data_encodet = None
-                val_x_copy = None
-                train_x_copy = None
-                train_x.fillna(0, inplace=True)
-                val_x.fillna(0, inplace=True)
 
             # Fit
@@ -397,13 +312,6 @@ def save(self, name="cv_dump", folder="./", verbose=1):
         self._clean_temp_folder()
 
         for i in range(self.folds * self.n_repeats):
-            # Target Encoder
-            if len(self.target_encoders_names) > 0:
-                for target_enc_name in self.target_encoders_names:
-                    joblib.dump(
-                        self._fit_target_enc[f"{target_enc_name} _fold_{i}"],
-                        f"{dir_tmp}{target_enc_name} _fold_{i}.pkl",
-                    )
             # Models
             self.fited_models[f"model_{self.estimator.__name__}_fold_{i}"].save(
                 f"{dir_tmp}model_{self.estimator.__name__}_fold_{i}", verbose=0
@@ -426,12 +334,6 @@ def load(self, name="cv_dump", folder="./", verbose=1):
         cv = joblib.load(dir_tmp + "CV" + ".pkl")
 
         for i in range(cv.folds * cv.n_repeats):
-            # Target Encoder
-            if len(self.target_encoders_names) > 0:
-                for target_enc_name in self.target_encoders_names:
-                    self._fit_target_enc[f"{target_enc_name} _fold_{i}"] = joblib.load(
-                        f"{dir_tmp}{target_enc_name} _fold_{i}.pkl"
-                    )
             # Models
             cv.fited_models[
                 f"model_{self.estimator.__name__}_fold_{i}"
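
For reference, the code removed above implemented the standard per-fold target-encoding pattern: fit each encoder on the training split only, then transform the validation split, so validation targets never leak into the encoding. A minimal standalone sketch of that pattern with category_encoders (synthetic data; TargetEncoder itself was dropped from _encoders.py earlier in this commit):

import pandas as pd
from category_encoders import TargetEncoder
from sklearn.model_selection import KFold

X = pd.DataFrame({"cat": list("ababcbacab")})
y = pd.Series([1, 0, 1, 1, 0, 0, 1, 0, 1, 0])

for fold, (train_idx, valid_idx) in enumerate(KFold(n_splits=5).split(X)):
    enc = TargetEncoder(drop_invariant=True)
    # Fit on the training split only...
    train_encoded = enc.fit_transform(X.iloc[train_idx], y.iloc[train_idx])
    # ...then apply the fitted encoder to the held-out fold.
    valid_encoded = enc.transform(X.iloc[valid_idx])

With this commit, CrossValidation no longer carries any of that machinery; encoding is limited to the target-independent encoders kept in _encoders.py.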