Skip to content

Commit

Permalink
cudf23.06 robust fixes
Browse files (browse the repository at this point in the history)
  • Loading branch information
dcolinmorgan committed Dec 30, 2023
1 parent 6f929a8 commit a35149a
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 27 deletions.
40 changes: 14 additions & 26 deletions cu_cat/_gap_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@

def make_safe_gpu_dataframes(X, y, engine):
cudf = deps.cudf
# if 'cudf' in str(getmodule(X)) and parse_version(cudf.__version__) > parse_version("23.04"):
# X=X.apply(lambda x: str((x)).zfill(4)) ## need at least >3 chars for gap encoder
if cudf:
assert cudf is not None
new_kwargs = {}
Expand All @@ -79,15 +81,6 @@ def make_safe_gpu_dataframes(X, y, engine):
new_kwargs[key] = cudf.from_pandas(value)
else:
new_kwargs[key] = value

try:
if 'cudf' in str(getmodule(X)) and parse_version(cudf.__version__) < parse_version("23.10"):
new_kwargs[key] = cudf.from_pandas(new_kwargs[key].to_pandas().convert_dtypes())
else:
new_kwargs[key] = new_kwargs[key].convert_dtypes()
except:
pass

return new_kwargs['X'], new_kwargs['y']
else:
return X, y
Expand Down Expand Up @@ -165,7 +158,7 @@ def __init__(
elif 'cuml' in engine_resolved:
# _, _, engine, gmem = lazy_cuml_import_has_dependancy()
engine = deps.cuml
from cuml.feature_extraction.text import CountVectorizer,HashingVectorizer
from sklearn.feature_extraction.text import CountVectorizer,HashingVectorizer
gmem = get_gpu_memory()
smem = get_sys_memory()

Expand Down Expand Up @@ -230,8 +223,8 @@ def _init_vars(self, X) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
# Init H_dict_ with empty dict to train from scratch
self.H_dict_ = dict()
if deps.cudf and parse_version(cuml.__version__) > parse_version("23.04"):
# X = X.replace('nan',np.nan).fillna('0o0o0') ## must be string w/len >= 3 (otherwise wont pass to gap encoder)
X=X[X.str.len()>=3]
X = X.replace('nan',np.nan).fillna('0o0o0') ## must be string w/len >= 3 (otherwise wont pass to gap encoder)
X = X.apply(lambda x: str((x)).zfill(4)) ## need at least >3 chars for gap encoder
# X.convert_dtypes()
# Build the n-grams counts matrix unq_V on unique elements of X
X, y = make_safe_gpu_dataframes(X, None, self.engine)
Expand All @@ -240,7 +233,7 @@ def _init_vars(self, X) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
elif 'cudf' in str(getmodule(X)) and 'cuml' in self.engine:
unq_X = X.unique()
tmp, lookup = np.unique(X.to_arrow(), return_inverse=True)
unq_V = self.ngrams_count_.fit_transform(unq_X)
unq_V = self.ngrams_count_.fit_transform(unq_X.to_arrow())
if self.add_words: # Add word counts to unq_V
unq_V2 = self.word_count_.fit_transform(unq_X)
unq_V = sparse.hstack((unq_V, unq_V2), format="csr")
Expand Down Expand Up @@ -346,8 +339,8 @@ def fit(self, X, y=None) -> "GapEncoderColumn":
self.Xt_= df_type(X)
# Make n-grams counts matrix unq_V
if deps.cudf and parse_version(cuml.__version__) > parse_version("23.04"):
# X = X.replace('nan',np.nan).fillna('0o0o0')
X=X[X.str.len()>=3]
X = X.replace('nan',np.nan).fillna('0o0o0')
X = X.apply(lambda x: str((x)).zfill(4)) ## need at least >3 chars for gap encoder
unq_X, unq_V, lookup = self._init_vars(X)
n_batch = (len(X) - 1) // self.batch_size + 1
# Get activations unq_H
Expand Down Expand Up @@ -574,7 +567,8 @@ def transform(self, X) -> np.array:
check_is_fitted(self, "H_dict_")
# Check if first item has str or np.str_ type
if deps.cudf and parse_version(cuml.__version__) > parse_version("23.04"):
X=X[X.str.len()>=3] #replace('nan',np.nan).fillna('0o0o0')
X.replace('nan',np.nan).fillna('0o0o0')
X = X.apply(lambda x: str((x)).zfill(4)) ## need at least >3 chars for gap encoder
unq_X = X.unique()
# Build the n-grams counts matrix V for the string data to encode
unq_V = self.ngrams_count_.transform(unq_X)#.astype(str))
Expand Down Expand Up @@ -955,12 +949,9 @@ def fit(self, X, y=None) -> "GapEncoder":
X = self._handle_missing(X)
self.fitted_models_ = []
for k in range(X.shape[1]):
# for k in X.columns:
col_enc = self._create_column_gap_encoder()
self.fitted_models_.append(col_enc.fit(X.iloc[:, k]))
if k == len(X):
break
else:
continue
self.fitted_models_.append(col_enc.fit(X.iloc[:,k]))#[k]))
return self

def transform(self, X) -> np.array:
Expand Down Expand Up @@ -989,12 +980,9 @@ def transform(self, X) -> np.array:
X = check_input(X)
X = self._handle_missing(X)
X_enc = []
# if 'cudf' in str(getmodule(X)) or 'cuml' == self.engine:
# for k in range(X.shape[1]):
# X_enc.append(self.fitted_models_[k].transform(X.iloc[:, k]))
# else:
for k in range(X.shape[1]):
X_enc.append(self.fitted_models_[k].transform(X.iloc[:, k]))
# for k in X.columns:
X_enc.append(self.fitted_models_[k].transform(X.iloc[:,k]))#[k]))
X_enc = np.hstack(X_enc)
return X_enc

Expand Down
2 changes: 1 addition & 1 deletion cu_cat/_table_vectorizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -710,7 +710,7 @@ def fit_transform(self, X, y=None):
if 'cudf' not in str(getmodule(X)) and deps.cudf:
# if deps.cudf and 'cudf' not in str(getmodule(X)):
X = cudf.from_pandas(X)#,nan_as_null=True) ### see how flag acts
X.fillna(0.0,inplace=True)
# X.fillna(0.0,inplace=True)
X, y = make_safe_gpu_dataframes(X, None, self.engine_)

if (self.datetime_transformer_ == "passthrough") and (datetime_columns !=[]):
Expand Down

0 comments on commit a35149a

Please sign in to comment.