Make HashingEncoder.hash_chunk arguments explicit

scikit-learn-contrib · Nov 11, 2023 · 5203c12
1 parent 5c94e27
commit 5203c12
Showing 1 changed file with 7 additions and 12 deletions.
diff --git a/category_encoders/hashing.py b/category_encoders/hashing.py
@@ -1,6 +1,5 @@
 """The hashing module contains all methods and classes related to the hashing trick."""
 
-import sys
 import hashlib
 import category_encoders.utils as util
 import multiprocessing
@@ -179,8 +178,7 @@ def _transform(self, X, override_return_df=False):
  return X
 
  @staticmethod
- def hash_chunk(args):
- hash_method, np_df, N = args
+ def hash_chunk(hash_method: str, np_df: np.ndarray, N: int) -> np.ndarray:
  # Calling getattr outside the loop saves some time in the loop
  hasher_constructor = getattr(hashlib, hash_method)
  # Same when the call to getattr is implicit
@@ -202,31 +200,28 @@ def hash_chunk(args):
  result[i, column_index] += 1
  return result
 
- def hashing_trick_with_np_parallel(self, df, N: int):
+ def hashing_trick_with_np_parallel(self, df: pd.DataFrame, N: int) -> pd.DataFrame:
  np_df = df.to_numpy()
  ctx = multiprocessing.get_context(self.process_creation_method)
 
  with ProcessPoolExecutor(max_workers=self.max_process, mp_context=ctx) as executor:
  result = np.concatenate(list(
  executor.map(
  self.hash_chunk,
- zip(
- [self.hash_method]*self.max_process,
- np.array_split(np_df, self.max_process),
- [N]*self.max_process
- )
+ [self.hash_method]*self.max_process,
+ np.array_split(np_df, self.max_process),
+ [N]*self.max_process
  )
  ))
 
  return pd.DataFrame(result, index=df.index)
 
- def hashing_trick_with_np_no_parallel(self, df, N):
+ def hashing_trick_with_np_no_parallel(self, df: pd.DataFrame, N: int) -> pd.DataFrame:
  np_df = df.to_numpy()
 
- result = HashingEncoder.hash_chunk((self.hash_method, np_df, N))
+ result = HashingEncoder.hash_chunk(self.hash_method, np_df, N)
 
  return pd.DataFrame(result, index=df.index)
-
 
  def hashing_trick(self, X_in, hashing_method='md5', N=2, cols=None, make_copy=False):
  """A basic hashing implementation with configurable dimensionality/precision