Skip to content

Commit

Permalink
make hashing-hash chunk arguments explicit
Browse files Browse the repository at this point in the history
  • Loading branch information
PaulWestenthanner committed Nov 11, 2023
1 parent 5c94e27 commit 5203c12
Showing 1 changed file with 7 additions and 12 deletions.
19 changes: 7 additions & 12 deletions category_encoders/hashing.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
"""The hashing module contains all methods and classes related to the hashing trick."""

- import sys
import hashlib
import category_encoders.utils as util
import multiprocessing
@@ -179,8 +178,7 @@ def _transform(self, X, override_return_df=False):
return X

@staticmethod
- def hash_chunk(args):
- hash_method, np_df, N = args
+ def hash_chunk(hash_method: str, np_df: np.ndarray, N: int) -> np.ndarray:
# Calling getattr outside the loop saves some time in the loop
hasher_constructor = getattr(hashlib, hash_method)
# Same when the call to getattr is implicit
@@ -202,31 +200,28 @@ def hash_chunk(args):
result[i, column_index] += 1
return result

- def hashing_trick_with_np_parallel(self, df, N: int):
+ def hashing_trick_with_np_parallel(self, df: pd.DataFrame, N: int) -> pd.DataFrame:
np_df = df.to_numpy()
ctx = multiprocessing.get_context(self.process_creation_method)

with ProcessPoolExecutor(max_workers=self.max_process, mp_context=ctx) as executor:
result = np.concatenate(list(
executor.map(
self.hash_chunk,
- zip(
- [self.hash_method]*self.max_process,
- np.array_split(np_df, self.max_process),
- [N]*self.max_process
- )
+ [self.hash_method]*self.max_process,
+ np.array_split(np_df, self.max_process),
+ [N]*self.max_process
)
))

return pd.DataFrame(result, index=df.index)

- def hashing_trick_with_np_no_parallel(self, df, N):
+ def hashing_trick_with_np_no_parallel(self, df: pd.DataFrame, N: int) -> pd.DataFrame:
np_df = df.to_numpy()

- result = HashingEncoder.hash_chunk((self.hash_method, np_df, N))
+ result = HashingEncoder.hash_chunk(self.hash_method, np_df, N)

return pd.DataFrame(result, index=df.index)


def hashing_trick(self, X_in, hashing_method='md5', N=2, cols=None, make_copy=False):
"""A basic hashing implementation with configurable dimensionality/precision
Expand Down

0 comments on commit 5203c12

Please sign in to comment.