Fix metadata handling of missing values and of numerical columns saved as object dtype

PanyiDong · Sep 6, 2024 · d8aa6c1 · d8aa6c1
1 parent 34bbc5b
commit d8aa6c1
Show file tree

Hide file tree

Showing 8 changed files with 69 additions and 73 deletions.
diff --git a/InsurAutoML/VERSION b/InsurAutoML/VERSION
@@ -1 +1 @@
-0.2.5
+0.2.6
diff --git a/InsurAutoML/constant.py b/InsurAutoML/constant.py
@@ -5,13 +5,13 @@
 Mathematics Department, University of Illinois at Urbana-Champaign (UIUC)
 
 Project: InsurAutoML
-Latest Version: 0.2.5
+Latest Version: 0.2.6
 Relative Path: /InsurAutoML/constant.py
 File: _constant.py
 Author: Panyi Dong (panyid2@illinois.edu)
 
 -----
-Last Modified: Sunday, 10th December 2023 11:13:54 am
+Last Modified: Wednesday, 24th April 2024 2:57:57 pm
 Modified By: Panyi Dong (panyid2@illinois.edu)
 
 -----
@@ -205,7 +205,8 @@ def __str__(self):
 
 # maximum unique classes determined as categorical variable
 # 31 is capped by days in a month
-UNI_CLASS = 31
+# 120 is capped by the maximum age of a person
+UNI_CLASS = 120
 
 # maximum iteration allowed for the algorithm
 MAX_ITER = 1024

diff --git a/InsurAutoML/encoding/encoding.py b/InsurAutoML/encoding/encoding.py
@@ -5,13 +5,13 @@
 Mathematics Department, University of Illinois at Urbana-Champaign (UIUC)
 
 Project: InsurAutoML
-Latest Version: 0.2.5
+Latest Version: 0.2.6
 Relative Path: /InsurAutoML/encoding/encoding.py
 File Created: Monday, 24th October 2022 11:56:57 pm
 Author: Panyi Dong (panyid2@illinois.edu)
 
 -----
-Last Modified: Tuesday, 5th December 2023 5:27:01 pm
+Last Modified: Thursday, 5th September 2024 7:14:40 pm
 Modified By: Panyi Dong (panyid2@illinois.edu)
 
 -----
@@ -47,10 +47,10 @@
 from ..utils.base import is_date
 from ..utils.data import formatting
 from .base import BaseEncoder
+from ..constant import UNI_CLASS
 
 
 class DataEncoding(BaseEncoder, formatting):
-
     """
     Data preprocessing
     1. convert string type features to numerical categorical/dummy variables
@@ -86,8 +86,8 @@ def fit(self, _df: pd.DataFrame) -> pd.DataFrame:
         for column in features:
             if (
                 df[column].dtype == object
-                and is_date(df[[column]])
-                and len(df[column].dropna().unique()) > 31
+                and is_date(df[column])
+                and len(df[column].dropna().unique()) > UNI_CLASS
             ):
                 df[column] = pd.to_numeric(pd.to_datetime(df[column]))
             elif (df[column].dtype == object) or (str(df[column].dtype) == "category"):
@@ -115,10 +115,8 @@ def fit(self, _df: pd.DataFrame) -> pd.DataFrame:
                             axis=1,
                         )
                     for i in range(len(unique_value)):
-                        df.loc[df[column] == unique_value[i], column] = i
-                    df.loc[~df[column].isnull(), column] = df.loc[
-                        ~df[column].isnull(), column
-                    ].astype(int)
+                        df.loc[df[column] == unique_value[i], column] = int(i)
+                    df[column] = pd.to_numeric(df[column])
             else:
                 df.loc[~df[column].isnull(), column] = df.loc[
                     ~df[column].isnull(), column
@@ -151,6 +149,7 @@ def fit(self, _df: pd.DataFrame) -> pd.DataFrame:
             df.drop(columns=list(self.category.columns), inplace=True)
 
         self._fitted = True
+        df.columns = df.columns.astype(str)
 
         return df
 
@@ -162,8 +161,8 @@ def refit(self, _df: pd.DataFrame) -> pd.DataFrame:
         for column in list(df.columns):
             if (
                 df[column].dtype == object
-                and is_date(df[[column]])
-                and len(df[column].dropna().unique()) > 31
+                and is_date(df[column])
+                and len(df[column].dropna().unique()) > UNI_CLASS
             ):
                 df[column] = pd.to_numeric(pd.to_datetime(df[column]))
             elif df[column].dtype == object or str(df[column].dtype) == "category":
@@ -190,9 +189,7 @@ def refit(self, _df: pd.DataFrame) -> pd.DataFrame:
                         df.loc[~df[column].isin(unique_values), column] = np.NaN
                         for i in range(len(unique_values)):
                             df.loc[df[column] == unique_values[i], column] = i
-                        df.loc[~df[column].isnull(), column] = df.loc[
-                            ~df[column].isnull(), column
-                        ].astype(int)
+                        df[column] = pd.to_numeric(df[column])
             else:
                 df.loc[~df[column].isnull(), column] = df.loc[
                     ~df[column].isnull(), column
@@ -223,12 +220,12 @@ def refit(self, _df: pd.DataFrame) -> pd.DataFrame:
         # remove categorical variables
         if self.dummy_coding:
             df.drop(columns=list(self.category.columns), inplace=True)
+        df.columns = df.columns.astype(str)
 
         return df
 
 
 class CategoryShift(BaseEncoder):
-
     """
     Add 3 to every category
 

diff --git a/InsurAutoML/hpo/base.py b/InsurAutoML/hpo/base.py
@@ -5,13 +5,13 @@
 Mathematics Department, University of Illinois at Urbana-Champaign (UIUC)
 
 Project: InsurAutoML
-Latest Version: 0.2.5
+Latest Version: 0.2.6
 Relative Path: /InsurAutoML/hpo/base.py
 File Created: Friday, 12th May 2023 10:11:52 am
 Author: Panyi Dong (panyid2@illinois.edu)
 
 -----
-Last Modified: Saturday, 16th December 2023 8:18:35 pm
+Last Modified: Tuesday, 30th April 2024 8:00:54 pm
 Modified By: Panyi Dong (panyid2@illinois.edu)
 
 -----
@@ -38,7 +38,6 @@
 SOFTWARE.
 """
 
-
 from __future__ import annotations
 from typing import Union, List, Callable, Dict, Tuple
 import os
@@ -111,7 +110,6 @@
 
 
 class AutoTabularBase:
-
     """ "
     Base class module for AutoTabular (for classification and regression tasks)
 
@@ -331,7 +329,7 @@ def __init__(
         self.cpu_threads = cpu_threads
         self.use_gpu = use_gpu
         self.reset_index = reset_index
-        self.seed = seed
+        self.seed = seed if seed else 42
 
         self._iter = 0  # record iteration number
         self._fitted = False  # record whether the model has been fitted

diff --git a/InsurAutoML/utils/base.py b/InsurAutoML/utils/base.py
@@ -5,13 +5,13 @@
 Mathematics Department, University of Illinois at Urbana-Champaign (UIUC)
 
 Project: InsurAutoML
-Latest Version: 0.2.5
+Latest Version: 0.2.6
 Relative Path: /InsurAutoML/utils/base.py
 File Created: Monday, 24th October 2022 11:56:57 pm
 Author: Panyi Dong (panyid2@illinois.edu)
 
 -----
-Last Modified: Thursday, 1st June 2023 9:42:56 am
+Last Modified: Thursday, 5th September 2024 6:49:39 pm
 Modified By: Panyi Dong (panyid2@illinois.edu)
 
 -----
@@ -38,7 +38,6 @@
 SOFTWARE.
 """
 
-
 from typing import List, Union, Callable, Any, Dict
 import logging
 import warnings
@@ -117,7 +116,7 @@ def random_list(vlist: list, seed: int = None) -> np.ndarray:
 # rule = 'any' will consider the column as date type as long as one value is date type,
 # rule = 'all' will consider the column as date type only when all values
 # are date type.
-def is_date(df: pd.DataFrame, rule: str = "any") -> List[bool]:
+def is_date(df: pd.Series, rule: str = "all") -> List[bool]:
     def _is_date(string, fuzzy=False):
         try:
             parse(string, fuzzy=fuzzy)
@@ -128,7 +127,7 @@ def _is_date(string, fuzzy=False):
 
     _check = []
     for item in df.values:
-        _check.append(_is_date(str(item[0])))
+        _check.append(_is_date(str(item)))
     if rule == "any":
         return any(_check)
     elif rule == "all":

diff --git a/InsurAutoML/utils/metadata.py b/InsurAutoML/utils/metadata.py
@@ -5,13 +5,13 @@
 Mathematics Department, University of Illinois at Urbana-Champaign (UIUC)
 
 Project: InsurAutoML
-Latest Version: 0.2.5
+Latest Version: 0.2.6
 Relative Path: /InsurAutoML/utils/metadata.py
 File: _metadata.py
 Author: Panyi Dong (panyid2@illinois.edu)
 
 -----
-Last Modified: Wednesday, 12th July 2023 8:19:42 pm
+Last Modified: Thursday, 5th September 2024 6:35:35 pm
 Modified By: Panyi Dong (panyid2@illinois.edu)
 
 -----
@@ -72,6 +72,7 @@ def meta_map_object(data: pd.Series) -> str:
             return "Text"
     except BaseException:
         return "Categorical"
+    return "Categorical"
 
 
 class get_details:
@@ -129,11 +130,13 @@ def _merge_details_numerical(
     # get details of categorical data
     @staticmethod
     def _get_details_categorical(data: pd.Series) -> Dict[str, dict]:
-        return {
-            "unique_count": {
-                key: value for key, value in zip(*np.unique(data, return_counts=True))
-            }
-        }
+        # Use pandas value_counts (dropping NaN) to avoid the NaN-related error in np.unique
+        # return {
+        #     "unique_count": {
+        #         key: value for key, value in zip(*np.unique(data, return_counts=True))
+        #     }
+        # }
+        return {"unique_count": dict(data.value_counts(dropna=True))}
 
     @staticmethod
     def _merge_details_categorical(
@@ -151,7 +154,6 @@ def _merge_details_categorical(
 
 
 class MetaData:
-
     """
     MetaData class is used to store the metadata and details of a dataset.
     """
@@ -413,7 +415,6 @@ def get_from_df(self, data: pd.DataFrame) -> Dict[Tuple, List] or dict:
 
 
 class ChunkMetaData(MetaData):
-
     """
     Metadata for chunk data to deal with very large dataset.
     """

diff --git a/requirements.txt b/requirements.txt
@@ -1,29 +1,29 @@
-func-timeout
-cython
-gensim
-rpy2
-ray<2.0.0
-pyarrow
-fastparquet
-scikit-learn>=1.2.0
-redis
-ray[tune]
-tensorboardX
-pandas
 flaml
-tqdm
-threadpoolctl>2.2.0
+fastparquet
+hyperopt
+optuna
 bayesian_optimization==1.4.0
-nevergrad
+tensorboardX
+seaborn>=0.11.0
+gensim
+redis
 xgboost
-mlflow
-optuna
+pandas
+ray<2.0.0
 lightgbm
-scipy
-hyperopt
-pygam
 setuptools
+pygam
+threadpoolctl>2.2.0
+mlflow
+func-timeout
+cython
 matplotlib
-seaborn>=0.11.0
-colorama==0.4.4
+pyarrow
+nevergrad
 numpy<1.24.0
+ray[tune]
+colorama==0.4.4
+rpy2
+tqdm
+scipy
+scikit-learn>=1.2.0
diff --git a/requirements_nn.txt b/requirements_nn.txt
@@ -1,20 +1,20 @@
-func-timeout
-cython
+fastparquet
+tensorboardX
+seaborn>=0.11.0
 gensim
-rpy2
+redis
+pandas
 ray<2.0.0
+setuptools
+threadpoolctl>2.2.0
+func-timeout
+cython
+matplotlib
 pyarrow
-fastparquet
-scikit-learn>=1.2.0
-redis
+numpy<1.24.0
 torch
 ray[tune]
-tensorboardX
-pandas
+rpy2
 tqdm
-threadpoolctl>2.2.0
 scipy
-setuptools
-matplotlib
-seaborn>=0.11.0
-numpy<1.24.0
+scikit-learn>=1.2.0