Skip to content

Commit

Permalink
fix metadata with missing & numerical saved as object
Browse files Browse the repository at this point in the history
  • Loading branch information
PanyiDong committed Sep 6, 2024
1 parent 34bbc5b commit d8aa6c1
Show file tree
Hide file tree
Showing 8 changed files with 69 additions and 73 deletions.
2 changes: 1 addition & 1 deletion InsurAutoML/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.2.5
0.2.6
7 changes: 4 additions & 3 deletions InsurAutoML/constant.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@
Mathematics Department, University of Illinois at Urbana-Champaign (UIUC)
Project: InsurAutoML
Latest Version: 0.2.5
Latest Version: 0.2.6
Relative Path: /InsurAutoML/constant.py
File: _constant.py
Author: Panyi Dong (panyid2@illinois.edu)
-----
Last Modified: Sunday, 10th December 2023 11:13:54 am
Last Modified: Wednesday, 24th April 2024 2:57:57 pm
Modified By: Panyi Dong (panyid2@illinois.edu)
-----
Expand Down Expand Up @@ -205,7 +205,8 @@ def __str__(self):

# maximum number of unique classes for a variable to be treated as categorical
# 31 is capped by days in a month
UNI_CLASS = 31
# 120 is capped by the maximum age of a person
UNI_CLASS = 120

# maximum iteration allowed for the algorithm
MAX_ITER = 1024
Expand Down
27 changes: 12 additions & 15 deletions InsurAutoML/encoding/encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@
Mathematics Department, University of Illinois at Urbana-Champaign (UIUC)
Project: InsurAutoML
Latest Version: 0.2.5
Latest Version: 0.2.6
Relative Path: /InsurAutoML/encoding/encoding.py
File Created: Monday, 24th October 2022 11:56:57 pm
Author: Panyi Dong (panyid2@illinois.edu)
-----
Last Modified: Tuesday, 5th December 2023 5:27:01 pm
Last Modified: Thursday, 5th September 2024 7:14:40 pm
Modified By: Panyi Dong (panyid2@illinois.edu)
-----
Expand Down Expand Up @@ -47,10 +47,10 @@
from ..utils.base import is_date
from ..utils.data import formatting
from .base import BaseEncoder
from ..constant import UNI_CLASS


class DataEncoding(BaseEncoder, formatting):

"""
Data preprocessing
1. convert string type features to numerical categorical/dummy variables
Expand Down Expand Up @@ -86,8 +86,8 @@ def fit(self, _df: pd.DataFrame) -> pd.DataFrame:
for column in features:
if (
df[column].dtype == object
and is_date(df[[column]])
and len(df[column].dropna().unique()) > 31
and is_date(df[column])
and len(df[column].dropna().unique()) > UNI_CLASS
):
df[column] = pd.to_numeric(pd.to_datetime(df[column]))
elif (df[column].dtype == object) or (str(df[column].dtype) == "category"):
Expand Down Expand Up @@ -115,10 +115,8 @@ def fit(self, _df: pd.DataFrame) -> pd.DataFrame:
axis=1,
)
for i in range(len(unique_value)):
df.loc[df[column] == unique_value[i], column] = i
df.loc[~df[column].isnull(), column] = df.loc[
~df[column].isnull(), column
].astype(int)
df.loc[df[column] == unique_value[i], column] = int(i)
df[column] = pd.to_numeric(df[column])
else:
df.loc[~df[column].isnull(), column] = df.loc[
~df[column].isnull(), column
Expand Down Expand Up @@ -151,6 +149,7 @@ def fit(self, _df: pd.DataFrame) -> pd.DataFrame:
df.drop(columns=list(self.category.columns), inplace=True)

self._fitted = True
df.columns = df.columns.astype(str)

return df

Expand All @@ -162,8 +161,8 @@ def refit(self, _df: pd.DataFrame) -> pd.DataFrame:
for column in list(df.columns):
if (
df[column].dtype == object
and is_date(df[[column]])
and len(df[column].dropna().unique()) > 31
and is_date(df[column])
and len(df[column].dropna().unique()) > UNI_CLASS
):
df[column] = pd.to_numeric(pd.to_datetime(df[column]))
elif df[column].dtype == object or str(df[column].dtype) == "category":
Expand All @@ -190,9 +189,7 @@ def refit(self, _df: pd.DataFrame) -> pd.DataFrame:
df.loc[~df[column].isin(unique_values), column] = np.NaN
for i in range(len(unique_values)):
df.loc[df[column] == unique_values[i], column] = i
df.loc[~df[column].isnull(), column] = df.loc[
~df[column].isnull(), column
].astype(int)
df[column] = pd.to_numeric(df[column])
else:
df.loc[~df[column].isnull(), column] = df.loc[
~df[column].isnull(), column
Expand Down Expand Up @@ -223,12 +220,12 @@ def refit(self, _df: pd.DataFrame) -> pd.DataFrame:
# remove categorical variables
if self.dummy_coding:
df.drop(columns=list(self.category.columns), inplace=True)
df.columns = df.columns.astype(str)

return df


class CategoryShift(BaseEncoder):

"""
Add 3 to every category
Expand Down
8 changes: 3 additions & 5 deletions InsurAutoML/hpo/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@
Mathematics Department, University of Illinois at Urbana-Champaign (UIUC)
Project: InsurAutoML
Latest Version: 0.2.5
Latest Version: 0.2.6
Relative Path: /InsurAutoML/hpo/base.py
File Created: Friday, 12th May 2023 10:11:52 am
Author: Panyi Dong (panyid2@illinois.edu)
-----
Last Modified: Saturday, 16th December 2023 8:18:35 pm
Last Modified: Tuesday, 30th April 2024 8:00:54 pm
Modified By: Panyi Dong (panyid2@illinois.edu)
-----
Expand All @@ -38,7 +38,6 @@
SOFTWARE.
"""


from __future__ import annotations
from typing import Union, List, Callable, Dict, Tuple
import os
Expand Down Expand Up @@ -111,7 +110,6 @@


class AutoTabularBase:

""" "
Base class module for AutoTabular (for classification and regression tasks)
Expand Down Expand Up @@ -331,7 +329,7 @@ def __init__(
self.cpu_threads = cpu_threads
self.use_gpu = use_gpu
self.reset_index = reset_index
self.seed = seed
self.seed = seed if seed else 42

self._iter = 0 # record iteration number
self._fitted = False # record whether the model has been fitted
Expand Down
9 changes: 4 additions & 5 deletions InsurAutoML/utils/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@
Mathematics Department, University of Illinois at Urbana-Champaign (UIUC)
Project: InsurAutoML
Latest Version: 0.2.5
Latest Version: 0.2.6
Relative Path: /InsurAutoML/utils/base.py
File Created: Monday, 24th October 2022 11:56:57 pm
Author: Panyi Dong (panyid2@illinois.edu)
-----
Last Modified: Thursday, 1st June 2023 9:42:56 am
Last Modified: Thursday, 5th September 2024 6:49:39 pm
Modified By: Panyi Dong (panyid2@illinois.edu)
-----
Expand All @@ -38,7 +38,6 @@
SOFTWARE.
"""


from typing import List, Union, Callable, Any, Dict
import logging
import warnings
Expand Down Expand Up @@ -117,7 +116,7 @@ def random_list(vlist: list, seed: int = None) -> np.ndarray:
# rule = 'any' will consider the column as date type as long as one value is date type,
# rule = 'all' will consider the column as date type only when all values
# are date type.
def is_date(df: pd.DataFrame, rule: str = "any") -> List[bool]:
def is_date(df: pd.Series, rule: str = "all") -> List[bool]:
def _is_date(string, fuzzy=False):
try:
parse(string, fuzzy=fuzzy)
Expand All @@ -128,7 +127,7 @@ def _is_date(string, fuzzy=False):

_check = []
for item in df.values:
_check.append(_is_date(str(item[0])))
_check.append(_is_date(str(item)))
if rule == "any":
return any(_check)
elif rule == "all":
Expand Down
19 changes: 10 additions & 9 deletions InsurAutoML/utils/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@
Mathematics Department, University of Illinois at Urbana-Champaign (UIUC)
Project: InsurAutoML
Latest Version: 0.2.5
Latest Version: 0.2.6
Relative Path: /InsurAutoML/utils/metadata.py
File: _metadata.py
Author: Panyi Dong (panyid2@illinois.edu)
-----
Last Modified: Wednesday, 12th July 2023 8:19:42 pm
Last Modified: Thursday, 5th September 2024 6:35:35 pm
Modified By: Panyi Dong (panyid2@illinois.edu)
-----
Expand Down Expand Up @@ -72,6 +72,7 @@ def meta_map_object(data: pd.Series) -> str:
return "Text"
except BaseException:
return "Categorical"
return "Categorical"


class get_details:
Expand Down Expand Up @@ -129,11 +130,13 @@ def _merge_details_numerical(
# get details of categorical data
@staticmethod
def _get_details_categorical(data: pd.Series) -> Dict[str, dict]:
return {
"unique_count": {
key: value for key, value in zip(*np.unique(data, return_counts=True))
}
}
# Use pandas value_counts to avoid the error np.unique raises on NaN values
# return {
# "unique_count": {
# key: value for key, value in zip(*np.unique(data, return_counts=True))
# }
# }
return {"unique_count": dict(data.value_counts(dropna=True))}

@staticmethod
def _merge_details_categorical(
Expand All @@ -151,7 +154,6 @@ def _merge_details_categorical(


class MetaData:

"""
MetaData class is used to store the metadata and details of a dataset.
"""
Expand Down Expand Up @@ -413,7 +415,6 @@ def get_from_df(self, data: pd.DataFrame) -> Dict[Tuple, List] or dict:


class ChunkMetaData(MetaData):

"""
Metadata for chunk data to deal with very large dataset.
"""
Expand Down
44 changes: 22 additions & 22 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,29 +1,29 @@
func-timeout
cython
gensim
rpy2
ray<2.0.0
pyarrow
fastparquet
scikit-learn>=1.2.0
redis
ray[tune]
tensorboardX
pandas
flaml
tqdm
threadpoolctl>2.2.0
fastparquet
hyperopt
optuna
bayesian_optimization==1.4.0
nevergrad
tensorboardX
seaborn>=0.11.0
gensim
redis
xgboost
mlflow
optuna
pandas
ray<2.0.0
lightgbm
scipy
hyperopt
pygam
setuptools
pygam
threadpoolctl>2.2.0
mlflow
func-timeout
cython
matplotlib
seaborn>=0.11.0
colorama==0.4.4
pyarrow
nevergrad
numpy<1.24.0
ray[tune]
colorama==0.4.4
rpy2
tqdm
scipy
scikit-learn>=1.2.0
26 changes: 13 additions & 13 deletions requirements_nn.txt
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
func-timeout
cython
fastparquet
tensorboardX
seaborn>=0.11.0
gensim
rpy2
redis
pandas
ray<2.0.0
setuptools
threadpoolctl>2.2.0
func-timeout
cython
matplotlib
pyarrow
fastparquet
scikit-learn>=1.2.0
redis
numpy<1.24.0
torch
ray[tune]
tensorboardX
pandas
rpy2
tqdm
threadpoolctl>2.2.0
scipy
setuptools
matplotlib
seaborn>=0.11.0
numpy<1.24.0
scikit-learn>=1.2.0

0 comments on commit d8aa6c1

Please sign in to comment.