diff --git a/InsurAutoML/VERSION b/InsurAutoML/VERSION index 3a4036f..53a75d6 100644 --- a/InsurAutoML/VERSION +++ b/InsurAutoML/VERSION @@ -1 +1 @@ -0.2.5 +0.2.6 diff --git a/InsurAutoML/constant.py b/InsurAutoML/constant.py index 61b288f..0441778 100644 --- a/InsurAutoML/constant.py +++ b/InsurAutoML/constant.py @@ -5,13 +5,13 @@ Mathematics Department, University of Illinois at Urbana-Champaign (UIUC) Project: InsurAutoML -Latest Version: 0.2.5 +Latest Version: 0.2.6 Relative Path: /InsurAutoML/constant.py File: _constant.py Author: Panyi Dong (panyid2@illinois.edu) ----- -Last Modified: Sunday, 10th December 2023 11:13:54 am +Last Modified: Wednesday, 24th April 2024 2:57:57 pm Modified By: Panyi Dong (panyid2@illinois.edu) ----- @@ -205,7 +205,8 @@ def __str__(self): # maximum unique classes determined as categorical variable # 31 is capped by days in a month -UNI_CLASS = 31 +# 120 is capped by age of a person +UNI_CLASS = 120 # maximum iteration allowed for the algorithm MAX_ITER = 1024 diff --git a/InsurAutoML/encoding/encoding.py b/InsurAutoML/encoding/encoding.py index 542a763..7a88458 100644 --- a/InsurAutoML/encoding/encoding.py +++ b/InsurAutoML/encoding/encoding.py @@ -5,13 +5,13 @@ Mathematics Department, University of Illinois at Urbana-Champaign (UIUC) Project: InsurAutoML -Latest Version: 0.2.5 +Latest Version: 0.2.6 Relative Path: /InsurAutoML/encoding/encoding.py File Created: Monday, 24th October 2022 11:56:57 pm Author: Panyi Dong (panyid2@illinois.edu) ----- -Last Modified: Tuesday, 5th December 2023 5:27:01 pm +Last Modified: Thursday, 5th September 2024 7:14:40 pm Modified By: Panyi Dong (panyid2@illinois.edu) ----- @@ -47,10 +47,10 @@ from ..utils.base import is_date from ..utils.data import formatting from .base import BaseEncoder +from ..constant import UNI_CLASS class DataEncoding(BaseEncoder, formatting): - """ Data preprocessing 1. convert string type features to numerical categorical/dummy variables @@ -86,8 +86,8 @@ def fit(self, _df: pd.DataFrame) -> pd.DataFrame: for column in features: if ( df[column].dtype == object - and is_date(df[[column]]) - and len(df[column].dropna().unique()) > 31 + and is_date(df[column]) + and len(df[column].dropna().unique()) > UNI_CLASS ): df[column] = pd.to_numeric(pd.to_datetime(df[column])) elif (df[column].dtype == object) or (str(df[column].dtype) == "category"): @@ -115,10 +115,8 @@ def fit(self, _df: pd.DataFrame) -> pd.DataFrame: axis=1, ) for i in range(len(unique_value)): - df.loc[df[column] == unique_value[i], column] = i - df.loc[~df[column].isnull(), column] = df.loc[ - ~df[column].isnull(), column - ].astype(int) + df.loc[df[column] == unique_value[i], column] = int(i) + df[column] = pd.to_numeric(df[column]) else: df.loc[~df[column].isnull(), column] = df.loc[ ~df[column].isnull(), column @@ -151,6 +149,7 @@ def fit(self, _df: pd.DataFrame) -> pd.DataFrame: df.drop(columns=list(self.category.columns), inplace=True) self._fitted = True + df.columns = df.columns.astype(str) return df @@ -162,8 +161,8 @@ def refit(self, _df: pd.DataFrame) -> pd.DataFrame: for column in list(df.columns): if ( df[column].dtype == object - and is_date(df[[column]]) - and len(df[column].dropna().unique()) > 31 + and is_date(df[column]) + and len(df[column].dropna().unique()) > UNI_CLASS ): df[column] = pd.to_numeric(pd.to_datetime(df[column])) elif df[column].dtype == object or str(df[column].dtype) == "category": @@ -190,9 +189,7 @@ def refit(self, _df: pd.DataFrame) -> pd.DataFrame: df.loc[~df[column].isin(unique_values), column] = np.NaN for i in range(len(unique_values)): df.loc[df[column] == unique_values[i], column] = i - df.loc[~df[column].isnull(), column] = df.loc[ - ~df[column].isnull(), column - ].astype(int) + df[column] = pd.to_numeric(df[column]) else: df.loc[~df[column].isnull(), column] = df.loc[ ~df[column].isnull(), column @@ -223,12 +220,12 @@ def refit(self, _df: pd.DataFrame) -> pd.DataFrame: # remove categorical variables if self.dummy_coding: df.drop(columns=list(self.category.columns), inplace=True) + df.columns = df.columns.astype(str) return df class CategoryShift(BaseEncoder): - """ Add 3 to every cateogry diff --git a/InsurAutoML/hpo/base.py b/InsurAutoML/hpo/base.py index bfd1be8..feb671e 100644 --- a/InsurAutoML/hpo/base.py +++ b/InsurAutoML/hpo/base.py @@ -5,13 +5,13 @@ Mathematics Department, University of Illinois at Urbana-Champaign (UIUC) Project: InsurAutoML -Latest Version: 0.2.5 +Latest Version: 0.2.6 Relative Path: /InsurAutoML/hpo/base.py File Created: Friday, 12th May 2023 10:11:52 am Author: Panyi Dong (panyid2@illinois.edu) ----- -Last Modified: Saturday, 16th December 2023 8:18:35 pm +Last Modified: Tuesday, 30th April 2024 8:00:54 pm Modified By: Panyi Dong (panyid2@illinois.edu) ----- @@ -38,7 +38,6 @@ SOFTWARE. """ - from __future__ import annotations from typing import Union, List, Callable, Dict, Tuple import os @@ -111,7 +110,6 @@ class AutoTabularBase: - """ " Base class module for AutoTabular (for classification and regression tasks) @@ -331,7 +329,7 @@ def __init__( self.cpu_threads = cpu_threads self.use_gpu = use_gpu self.reset_index = reset_index - self.seed = seed + self.seed = seed if seed else 42 self._iter = 0 # record iteration number self._fitted = False # record whether the model has been fitted diff --git a/InsurAutoML/utils/base.py b/InsurAutoML/utils/base.py index dfc17ef..de09a3e 100644 --- a/InsurAutoML/utils/base.py +++ b/InsurAutoML/utils/base.py @@ -5,13 +5,13 @@ Mathematics Department, University of Illinois at Urbana-Champaign (UIUC) Project: InsurAutoML -Latest Version: 0.2.5 +Latest Version: 0.2.6 Relative Path: /InsurAutoML/utils/base.py File Created: Monday, 24th October 2022 11:56:57 pm Author: Panyi Dong (panyid2@illinois.edu) ----- -Last Modified: Thursday, 1st June 2023 9:42:56 am +Last Modified: Thursday, 5th September 2024 6:49:39 pm Modified By: Panyi Dong (panyid2@illinois.edu) ----- @@ -38,7 +38,6 @@ SOFTWARE. """ - from typing import List, Union, Callable, Any, Dict import logging import warnings @@ -117,7 +116,7 @@ def random_list(vlist: list, seed: int = None) -> np.ndarray: # rule = 'any' will consider the column as date type as long as one value is date type, # rule = 'all' will consider the column as date type only when all values # are date type. -def is_date(df: pd.DataFrame, rule: str = "any") -> List[bool]: +def is_date(df: pd.Series, rule: str = "all") -> List[bool]: def _is_date(string, fuzzy=False): try: parse(string, fuzzy=fuzzy) @@ -128,7 +127,7 @@ def _is_date(string, fuzzy=False): _check = [] for item in df.values: - _check.append(_is_date(str(item[0]))) + _check.append(_is_date(str(item))) if rule == "any": return any(_check) elif rule == "all": diff --git a/InsurAutoML/utils/metadata.py b/InsurAutoML/utils/metadata.py index 358adb9..076317d 100644 --- a/InsurAutoML/utils/metadata.py +++ b/InsurAutoML/utils/metadata.py @@ -5,13 +5,13 @@ Mathematics Department, University of Illinois at Urbana-Champaign (UIUC) Project: InsurAutoML -Latest Version: 0.2.5 +Latest Version: 0.2.6 Relative Path: /InsurAutoML/utils/metadata.py File: _metadata.py Author: Panyi Dong (panyid2@illinois.edu) ----- -Last Modified: Wednesday, 12th July 2023 8:19:42 pm +Last Modified: Thursday, 5th September 2024 6:35:35 pm Modified By: Panyi Dong (panyid2@illinois.edu) ----- @@ -72,6 +72,7 @@ def meta_map_object(data: pd.Series) -> str: return "Text" except BaseException: return "Categorical" + return "Categorical" class get_details: @@ -129,11 +130,13 @@ def _merge_details_numerical( # get details of categorical data @staticmethod def _get_details_categorical(data: pd.Series) -> Dict[str, dict]: - return { - "unique_count": { - key: value for key, value in zip(*np.unique(data, return_counts=True)) - } - } + # Using pandas value_counts to avoid nan values error in np.unique + # return { + # "unique_count": { + # key: value for key, value in zip(*np.unique(data, return_counts=True)) + # } + # } + return {"unique_count": dict(data.value_counts(dropna=True))} @staticmethod def _merge_details_categorical( @@ -151,7 +154,6 @@ def _merge_details_categorical( class MetaData: - """ MetaData class is used to store the metadata and details of a dataset. """ @@ -413,7 +415,6 @@ def get_from_df(self, data: pd.DataFrame) -> Dict[Tuple, List] or dict: class ChunkMetaData(MetaData): - """ Metadata for chunk data to deal with very large dataset. """ diff --git a/requirements.txt b/requirements.txt index b116ea8..791e580 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,29 +1,29 @@ -func-timeout -cython -gensim -rpy2 -ray<2.0.0 -pyarrow -fastparquet -scikit-learn>=1.2.0 -redis -ray[tune] -tensorboardX -pandas flaml -tqdm -threadpoolctl>2.2.0 +fastparquet +hyperopt +optuna bayesian_optimization==1.4.0 -nevergrad +tensorboardX +seaborn>=0.11.0 +gensim +redis xgboost -mlflow -optuna +pandas +ray<2.0.0 lightgbm -scipy -hyperopt -pygam setuptools +pygam +threadpoolctl>2.2.0 +mlflow +func-timeout +cython matplotlib -seaborn>=0.11.0 -colorama==0.4.4 +pyarrow +nevergrad numpy<1.24.0 +ray[tune] +colorama==0.4.4 +rpy2 +tqdm +scipy +scikit-learn>=1.2.0 diff --git a/requirements_nn.txt b/requirements_nn.txt index d86d0d7..ea8ed1e 100644 --- a/requirements_nn.txt +++ b/requirements_nn.txt @@ -1,20 +1,20 @@ -func-timeout -cython +fastparquet +tensorboardX +seaborn>=0.11.0 gensim -rpy2 +redis +pandas ray<2.0.0 +setuptools +threadpoolctl>2.2.0 +func-timeout +cython +matplotlib pyarrow -fastparquet -scikit-learn>=1.2.0 -redis +numpy<1.24.0 torch ray[tune] -tensorboardX -pandas +rpy2 tqdm -threadpoolctl>2.2.0 scipy -setuptools -matplotlib -seaborn>=0.11.0 -numpy<1.24.0 +scikit-learn>=1.2.0