From ccfc95a623e13d59a6e4f640ee7c022bda35f763 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 12 Nov 2024 10:03:06 -0500
Subject: [PATCH] Add new nvtext minhash_permuted API (#16756)

Introduce new nvtext minhash API that takes a single seed for hashing and 2 parameter vectors to calculate the minhash results from the seed hash:
```
std::unique_ptr<cudf::column> minhash_permuted(
  cudf::strings_column_view const& input,
  uint32_t seed,
  cudf::device_span<uint32_t const> parameter_a,
  cudf::device_span<uint32_t const> parameter_b,
  cudf::size_type width,
  rmm::cuda_stream_view stream,
  rmm::device_async_resource_ref mr);
```
The `seed` is used to hash the `input` using rolling set of substrings `width` characters wide.
The hashes are then combined with the values in `parameter_a` and `parameter_b` to calculate a set of 32-bit (or 64-bit) values for each row. Only the minimum value is returned per element of `a` and `b` when combined with all the hashes for a row. Each output row is a set of M values where `M = parameter_a.size() = parameter_b.size()`

This implementation is significantly faster than the current minhash which computes hashes for multiple seeds.

Included in this PR is also the `minhash64_permuted()` API that is identical but uses 64-bit values for the seed and the parameter values. Also included are new tests and a benchmark as well as the pylibcudf and cudf interfaces.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Matthew Murray (https://github.com/Matt711)
  - Lawrence Mitchell (https://github.com/wence-)
  - Karthikeyan (https://github.com/karthikeyann)
  - Yunsong Wang (https://github.com/PointKernel)

URL: https://github.com/rapidsai/cudf/pull/16756
---
 cpp/benchmarks/CMakeLists.txt                 |   4 +-
 cpp/benchmarks/text/minhash.cpp               |  38 +-
 cpp/include/nvtext/minhash.hpp                |  94 +++++
 cpp/src/text/minhash.cu                       | 390 ++++++++++++++++++
 cpp/tests/CMakeLists.txt                      |   1 +
 cpp/tests/text/minhash_tests.cpp              | 267 ++++++------
 python/cudf/cudf/_lib/nvtext/minhash.pyx      |  28 ++
 python/cudf/cudf/_lib/strings/__init__.py     |   2 +
 python/cudf/cudf/core/column/string.py        | 107 +++++
 .../cudf/cudf/tests/text/test_text_methods.py |  48 +--
 .../pylibcudf/libcudf/nvtext/minhash.pxd      |  16 +
 python/pylibcudf/pylibcudf/nvtext/minhash.pxd |  16 +
 python/pylibcudf/pylibcudf/nvtext/minhash.pyx | 103 +++++
 .../pylibcudf/tests/test_nvtext_minhash.py    |  12 +-
 14 files changed, 949 insertions(+), 177 deletions(-)

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index ad090be99f3..59f5602fd5a 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -348,8 +348,8 @@ ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binary
 ConfigureBench(TEXT_BENCH text/subword.cpp)
 
 ConfigureNVBench(
-  TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/ngrams.cpp
-  text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp
+  TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp
+  text/ngrams.cpp text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp
 )
 
 # ##################################################################################################
diff --git a/cpp/benchmarks/text/minhash.cpp b/cpp/benchmarks/text/minhash.cpp
index 31ce60d8f9a..a80d0dcbdb8 100644
--- a/cpp/benchmarks/text/minhash.cpp
+++ b/cpp/benchmarks/text/minhash.cpp
@@ -20,8 +20,6 @@
 
 #include <nvtext/minhash.hpp>
 
-#include <rmm/device_buffer.hpp>
-
 #include <nvbench/nvbench.cuh>
 
 static void bench_minhash(nvbench::state& state)
@@ -29,26 +27,25 @@ static void bench_minhash(nvbench::state& state)
   auto const num_rows   = static_cast<cudf::size_type>(state.get_int64("num_rows"));
   auto const row_width  = static_cast<cudf::size_type>(state.get_int64("row_width"));
   auto const hash_width = static_cast<cudf::size_type>(state.get_int64("hash_width"));
-  auto const seed_count = static_cast<cudf::size_type>(state.get_int64("seed_count"));
+  auto const parameters = static_cast<cudf::size_type>(state.get_int64("parameters"));
   auto const base64     = state.get_int64("hash_type") == 64;
 
-  if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
-      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
-    state.skip("Skip benchmarks greater than size_type limit");
-  }
-
   data_profile const strings_profile = data_profile_builder().distribution(
     cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
   auto const strings_table =
     create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile);
   cudf::strings_column_view input(strings_table->view().column(0));
 
-  data_profile const seeds_profile = data_profile_builder().null_probability(0).distribution(
-    cudf::type_to_id<cudf::hash_value_type>(), distribution_id::NORMAL, 0, row_width);
-  auto const seed_type   = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32;
-  auto const seeds_table = create_random_table({seed_type}, row_count{seed_count}, seeds_profile);
-  auto seeds             = seeds_table->get_column(0);
-  seeds.set_null_mask(rmm::device_buffer{}, 0);
+  data_profile const param_profile = data_profile_builder().no_validity().distribution(
+    cudf::type_to_id<cudf::hash_value_type>(),
+    distribution_id::NORMAL,
+    0u,
+    std::numeric_limits<cudf::hash_value_type>::max());
+  auto const param_type = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32;
+  auto const param_table =
+    create_random_table({param_type, param_type}, row_count{parameters}, param_profile);
+  auto const parameters_a = param_table->view().column(0);
+  auto const parameters_b = param_table->view().column(1);
 
   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
 
@@ -57,15 +54,16 @@ static void bench_minhash(nvbench::state& state)
   state.add_global_memory_writes<nvbench::int32_t>(num_rows);  // output are hashes
 
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-    auto result = base64 ? nvtext::minhash64(input, seeds.view(), hash_width)
-                         : nvtext::minhash(input, seeds.view(), hash_width);
+    auto result = base64
+                    ? nvtext::minhash64_permuted(input, 0, parameters_a, parameters_b, hash_width)
+                    : nvtext::minhash_permuted(input, 0, parameters_a, parameters_b, hash_width);
   });
 }
 
 NVBENCH_BENCH(bench_minhash)
   .set_name("minhash")
-  .add_int64_axis("num_rows", {1024, 8192, 16364, 131072})
-  .add_int64_axis("row_width", {128, 512, 2048})
-  .add_int64_axis("hash_width", {5, 10})
-  .add_int64_axis("seed_count", {2, 26})
+  .add_int64_axis("num_rows", {15000, 30000, 60000})
+  .add_int64_axis("row_width", {6000, 28000, 50000})
+  .add_int64_axis("hash_width", {12, 24})
+  .add_int64_axis("parameters", {26, 260})
   .add_int64_axis("hash_type", {32, 64});
diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp
index 42124461cdf..b2c1a23f57e 100644
--- a/cpp/include/nvtext/minhash.hpp
+++ b/cpp/include/nvtext/minhash.hpp
@@ -94,6 +94,53 @@ namespace CUDF_EXPORT nvtext {
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
+/**
+ * @brief Returns the minhash values for each string
+ *
+ * This function uses MurmurHash3_x86_32 for the hash algorithm.
+ *
+ * The input strings are first hashed using the given `seed` over substrings
+ * of `width` characters. These hash values are then combined with the `a`
+ * and `b` parameter values using the following formula:
+ * ```
+ *   max_hash = max of uint32
+ *   mp = (1 << 61) - 1
+ *   hv[i] = hash value of a substring at i
+ *   pv[i] = ((hv[i] * a[i] + b[i]) % mp) & max_hash
+ * ```
+ *
+ * This calculation is performed on each substring and the minimum value is computed
+ * as follows:
+ * ```
+ *   mh[j,i] = min(pv[i]) for all substrings in row j
+ *                        and where i=[0,a.size())
+ * ```
+ *
+ * Any null row entries result in corresponding null output rows.
+ *
+ * @throw std::invalid_argument if the width < 2
+ * @throw std::invalid_argument if parameter_a is empty
+ * @throw std::invalid_argument if `parameter_b.size() != parameter_a.size()`
+ * @throw std::overflow_error if `parameter_a.size() * input.size()` exceeds the column size limit
+ *
+ * @param input Strings column to compute minhash
+ * @param seed Seed value used for the hash algorithm
+ * @param parameter_a Values used for the permuted calculation
+ * @param parameter_b Values used for the permuted calculation
+ * @param width The character width of substrings to hash for each row
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return List column of minhash values for each string per seed
+ */
+std::unique_ptr<cudf::column> minhash_permuted(
+  cudf::strings_column_view const& input,
+  uint32_t seed,
+  cudf::device_span<uint32_t const> parameter_a,
+  cudf::device_span<uint32_t const> parameter_b,
+  cudf::size_type width,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
+
 /**
  * @brief Returns the minhash value for each string
  *
@@ -159,6 +206,53 @@ namespace CUDF_EXPORT nvtext {
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
+/**
+ * @brief Returns the minhash values for each string
+ *
+ * This function uses MurmurHash3_x64_128 for the hash algorithm.
+ *
+ * The input strings are first hashed using the given `seed` over substrings
+ * of `width` characters. These hash values are then combined with the `a`
+ * and `b` parameter values using the following formula:
+ * ```
+ *   max_hash = max of uint64
+ *   mp = (1 << 61) - 1
+ *   hv[i] = hash value of a substring at i
+ *   pv[i] = ((hv[i] * a[i] + b[i]) % mp) & max_hash
+ * ```
+ *
+ * This calculation is performed on each substring and the minimum value is computed
+ * as follows:
+ * ```
+ *   mh[j,i] = min(pv[i]) for all substrings in row j
+ *                        and where i=[0,a.size())
+ * ```
+ *
+ * Any null row entries result in corresponding null output rows.
+ *
+ * @throw std::invalid_argument if the width < 2
+ * @throw std::invalid_argument if parameter_a is empty
+ * @throw std::invalid_argument if `parameter_b.size() != parameter_a.size()`
+ * @throw std::overflow_error if `parameter_a.size() * input.size()` exceeds the column size limit
+ *
+ * @param input Strings column to compute minhash
+ * @param seed Seed value used for the hash algorithm
+ * @param parameter_a Values used for the permuted calculation
+ * @param parameter_b Values used for the permuted calculation
+ * @param width The character width of substrings to hash for each row
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return List column of minhash values for each string per seed
+ */
+std::unique_ptr<cudf::column> minhash64_permuted(
+  cudf::strings_column_view const& input,
+  uint64_t seed,
+  cudf::device_span<uint64_t const> parameter_a,
+  cudf::device_span<uint64_t const> parameter_b,
+  cudf::size_type width,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
+
 /**
  * @brief Returns the minhash values for each row of strings per seed
  *
diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu
index a03a34f5fa7..aee83ab35ed 100644
--- a/cpp/src/text/minhash.cu
+++ b/cpp/src/text/minhash.cu
@@ -18,6 +18,7 @@
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/copy.hpp>
+#include <cudf/detail/device_scalar.hpp>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/sequence.hpp>
@@ -37,9 +38,13 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 
+#include <cooperative_groups.h>
 #include <cuda/atomic>
+#include <thrust/binary_search.h>
 #include <thrust/execution_policy.h>
 #include <thrust/fill.h>
+#include <thrust/sequence.h>
+#include <thrust/sort.h>
 
 #include <limits>
 
@@ -162,6 +167,339 @@ std::unique_ptr<cudf::column> minhash_fn(cudf::strings_column_view const& input,
   return hashes;
 }
 
+constexpr cudf::thread_index_type block_size = 256;
+// for potentially tuning minhash_seed_kernel independently from block_size
+constexpr cudf::thread_index_type tile_size = block_size;
+
+// Number of a/b parameter values to process per thread.
+// The intermediate values are stored in shared-memory and therefore limits this count.
+// This value was found to be an efficient size for both uint32 and uint64
+// hash types based on benchmarks.
+constexpr cuda::std::size_t params_per_thread = 16;
+
+// Separate kernels are used to process strings above and below this value (in bytes).
+constexpr cudf::size_type wide_string_threshold = 1 << 18;  // 256K
+// The number of blocks per string for the above-threshold kernel processing.
+constexpr cudf::size_type blocks_per_string = 64;
+// The above values were determined using the redpajama and books_sample datasets
+
+/**
+ * @brief Hashing kernel launched as a thread per tile-size (block or warp)
+ *
+ * This kernel computes the hashes for each string using the seed and the specified
+ * hash function. The width is used to compute rolling substrings to hash over.
+ * The hashes are stored in d_hashes to be used in the minhash_permuted_kernel.
+ *
+ * This kernel also counts the number of strings above the wide_string_threshold
+ * and proactively initializes the output values for those strings.
+ *
+ * @tparam HashFunction The hash function to use for this kernel
+ * @tparam hash_value_type Derived from HashFunction result_type
+ *
+ * @param d_strings The input strings to hash
+ * @param seed The seed used for the hash function
+ * @param width Width in characters used for determining substrings to hash
+ * @param d_hashes The resulting hash values are stored here
+ * @param threshold_count Stores the number of strings above wide_string_threshold
+ * @param param_count Number of parameters (used for the proactive initialize)
+ * @param d_results Final results vector (used for the proactive initialize)
+ */
+template <typename HashFunction, typename hash_value_type = typename HashFunction::result_type>
+CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings,
+                                     hash_value_type seed,
+                                     cudf::size_type width,
+                                     hash_value_type* d_hashes,
+                                     cudf::size_type* threshold_count,
+                                     cudf::size_type param_count,
+                                     hash_value_type* d_results)
+{
+  auto const tid     = cudf::detail::grid_1d::global_thread_id();
+  auto const str_idx = tid / tile_size;
+  if (str_idx >= d_strings.size()) { return; }
+  if (d_strings.is_null(str_idx)) { return; }
+
+  // retrieve this string's offset to locate the output position in d_hashes
+  auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index);
+  auto const offsets_itr =
+    cudf::detail::input_offsetalator(offsets.head(), offsets.type(), d_strings.offset());
+  auto const offset     = offsets_itr[str_idx];
+  auto const size_bytes = static_cast<cudf::size_type>(offsets_itr[str_idx + 1] - offset);
+  if (size_bytes == 0) { return; }
+
+  auto const d_str    = cudf::string_view(d_strings.head<char>() + offset, size_bytes);
+  auto const lane_idx = tid % tile_size;
+
+  // hashes for this string/thread are stored here
+  auto seed_hashes = d_hashes + offset - offsets_itr[0] + lane_idx;
+
+  auto const begin  = d_str.data() + lane_idx;
+  auto const end    = d_str.data() + d_str.size_bytes();
+  auto const hasher = HashFunction(seed);
+
+  for (auto itr = begin; itr < end; itr += tile_size, seed_hashes += tile_size) {
+    if (cudf::strings::detail::is_utf8_continuation_char(*itr)) {
+      *seed_hashes = 0;
+      continue;
+    }
+    auto const check_str =  // used for counting 'width' characters
+      cudf::string_view(itr, static_cast<cudf::size_type>(thrust::distance(itr, end)));
+    auto const [bytes, left] = cudf::strings::detail::bytes_to_character_position(check_str, width);
+    if ((itr != d_str.data()) && (left > 0)) {
+      // true itr+width is past the end of the string
+      *seed_hashes = 0;
+      continue;
+    }
+
+    auto const hash_str = cudf::string_view(itr, bytes);
+    hash_value_type hv;
+    if constexpr (std::is_same_v<hash_value_type, uint32_t>) {
+      hv = hasher(hash_str);
+    } else {
+      hv = thrust::get<0>(hasher(hash_str));
+    }
+    // disallowing hash to zero case
+    *seed_hashes = cuda::std::max(hv, hash_value_type{1});
+  }
+
+  // logic appended here so an extra kernel is not required
+  if (size_bytes >= wide_string_threshold) {
+    if (lane_idx == 0) {
+      // count the number of wide strings
+      cuda::atomic_ref<cudf::size_type, cuda::thread_scope_device> ref{*threshold_count};
+      ref.fetch_add(1, cuda::std::memory_order_relaxed);
+    }
+    // initialize the output -- only needed for wider strings
+    auto d_output = d_results + (str_idx * param_count);
+    for (auto i = lane_idx; i < param_count; i += tile_size) {
+      d_output[i] = std::numeric_limits<hash_value_type>::max();
+    }
+  }
+}
+
+/**
+ * @brief Permutation calculation kernel
+ *
+ * This kernel uses the hashes from the minhash_seed_kernel and the parameter_a and
+ * parameter_b values to compute the final output results.
+ * The output is the number of input rows (N) by the number of parameter values (M).
+ * Each output[i] is the calculated result for parameter_a/b[0:M].
+ *
+ * This kernel is launched with either blocks per strings of 1 for strings
+ * below the wide_strings_threshold or blocks per string = blocks_per_strings
+ * for strings above wide_strings_threshold.
+ *
+ * @tparam hash_value_type Derived from HashFunction result_type
+ * @tparam blocks_per_string Number of blocks used to process each string
+ *
+ * @param d_strings The input strings to hash
+ * @param indices The indices of the strings in d_strings to process
+ * @param parameter_a 1st set of parameters for the calculation result
+ * @param parameter_b 2nd set of parameters for the calculation result
+ * @param width Used for calculating the number of available hashes in each string
+ * @param d_hashes The hash values computed in minhash_seed_kernel
+ * @param d_results Final results vector of calculate values
+ */
+template <typename hash_value_type, int blocks_per_string>
+CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_strings,
+                                         cudf::device_span<cudf::size_type const> indices,
+                                         cudf::device_span<hash_value_type const> parameter_a,
+                                         cudf::device_span<hash_value_type const> parameter_b,
+                                         cudf::size_type width,
+                                         hash_value_type const* d_hashes,
+                                         hash_value_type* d_results)
+{
+  auto const tid = cudf::detail::grid_1d::global_thread_id();
+  auto const idx = (tid / blocks_per_string) / block_size;
+  if (idx >= indices.size()) { return; }
+  auto const str_idx = indices[idx];
+  if (d_strings.is_null(str_idx)) { return; }
+
+  auto const block      = cooperative_groups::this_thread_block();
+  int const section_idx = block.group_index().x % blocks_per_string;
+
+  auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index);
+  auto const offsets_itr =
+    cudf::detail::input_offsetalator(offsets.head(), offsets.type(), d_strings.offset());
+  auto const offset     = offsets_itr[str_idx];
+  auto const size_bytes = static_cast<cudf::size_type>(offsets_itr[str_idx + 1] - offset);
+
+  // number of items to process in this block;
+  // last block also includes any remainder values from the size_bytes/blocks_per_string truncation
+  // example:
+  //  each section_size for string with size 588090 and blocks_per_string=64 is 9188
+  //  except the last section which is 9188 + (588090 % 64) = 9246
+  auto const section_size =
+    (size_bytes / blocks_per_string) +
+    (section_idx < (blocks_per_string - 1) ? 0 : size_bytes % blocks_per_string);
+  auto const section_offset = section_idx * (size_bytes / blocks_per_string);
+
+  // hash values for this block/section
+  auto const seed_hashes = d_hashes + offset - offsets_itr[0] + section_offset;
+  // width used here as a max value since a string's char-count <= byte-count
+  auto const hashes_size =
+    section_idx < (blocks_per_string - 1)
+      ? section_size
+      : cuda::std::max(static_cast<cudf::size_type>(size_bytes > 0), section_size - width + 1);
+
+  auto const init     = size_bytes == 0 ? 0 : std::numeric_limits<hash_value_type>::max();
+  auto const lane_idx = block.thread_rank();
+  auto const d_output = d_results + (str_idx * parameter_a.size());
+
+  auto const begin = seed_hashes + lane_idx;
+  auto const end   = seed_hashes + hashes_size;
+
+  // constants used in the permutation calculations
+  constexpr uint64_t mersenne_prime  = (1UL << 61) - 1;
+  constexpr hash_value_type hash_max = std::numeric_limits<hash_value_type>::max();
+
+  // found to be an efficient shared memory size for both hash types
+  __shared__ hash_value_type block_values[block_size * params_per_thread];
+
+  for (std::size_t i = 0; i < parameter_a.size(); i += params_per_thread) {
+    // initialize this block's chunk of shared memory
+    // each thread handles params_per_thread of values
+    auto const chunk_values = block_values + (lane_idx * params_per_thread);
+    thrust::uninitialized_fill(thrust::seq, chunk_values, chunk_values + params_per_thread, init);
+    block.sync();
+
+    auto const param_count =
+      cuda::std::min(static_cast<cuda::std::size_t>(params_per_thread), parameter_a.size() - i);
+
+    // each lane accumulates min hashes in its shared memory
+    for (auto itr = begin; itr < end; itr += block_size) {
+      auto const hv = *itr;
+      // 0 is used as a skip sentinel for UTF-8 and trailing bytes
+      if (hv == 0) { continue; }
+
+      for (std::size_t param_idx = i; param_idx < (i + param_count); ++param_idx) {
+        // permutation formula used by datatrove
+        hash_value_type const v =
+          ((hv * parameter_a[param_idx] + parameter_b[param_idx]) % mersenne_prime) & hash_max;
+        auto const block_idx    = ((param_idx % params_per_thread) * block_size) + lane_idx;
+        block_values[block_idx] = cuda::std::min(v, block_values[block_idx]);
+      }
+    }
+    block.sync();
+
+    // reduce each parameter values vector to a single min value;
+    // assumes that the block_size > params_per_thread;
+    // each thread reduces a block_size of parameter values (thread per parameter)
+    if (lane_idx < param_count) {
+      auto const values = block_values + (lane_idx * block_size);
+      // cooperative groups does not have a min function and cub::BlockReduce was slower
+      auto const minv =
+        thrust::reduce(thrust::seq, values, values + block_size, init, thrust::minimum{});
+      if constexpr (blocks_per_string > 1) {
+        // accumulates mins for each block into d_output
+        cuda::atomic_ref<hash_value_type, cuda::thread_scope_block> ref{d_output[lane_idx + i]};
+        ref.fetch_min(minv, cuda::std::memory_order_relaxed);
+      } else {
+        d_output[lane_idx + i] = minv;
+      }
+    }
+    block.sync();
+  }
+}
+
+template <typename HashFunction, typename hash_value_type = typename HashFunction::result_type>
+std::unique_ptr<cudf::column> minhash_fn(cudf::strings_column_view const& input,
+                                         hash_value_type seed,
+                                         cudf::device_span<hash_value_type const> parameter_a,
+                                         cudf::device_span<hash_value_type const> parameter_b,
+                                         cudf::size_type width,
+                                         rmm::cuda_stream_view stream,
+                                         rmm::device_async_resource_ref mr)
+{
+  CUDF_EXPECTS(width >= 2,
+               "Parameter width should be an integer value of 2 or greater",
+               std::invalid_argument);
+  CUDF_EXPECTS(!parameter_a.empty(), "Parameters A and B cannot be empty", std::invalid_argument);
+  CUDF_EXPECTS(parameter_a.size() == parameter_b.size(),
+               "Parameters A and B should have the same number of elements",
+               std::invalid_argument);
+  CUDF_EXPECTS(
+    (static_cast<std::size_t>(input.size()) * parameter_a.size()) <
+      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max()),
+    "The number of parameters times the number of input rows exceeds the column size limit",
+    std::overflow_error);
+
+  auto const output_type = cudf::data_type{cudf::type_to_id<hash_value_type>()};
+  if (input.is_empty()) { return cudf::make_empty_column(output_type); }
+
+  auto const d_strings = cudf::column_device_view::create(input.parent(), stream);
+
+  auto results =
+    cudf::make_numeric_column(output_type,
+                              input.size() * static_cast<cudf::size_type>(parameter_a.size()),
+                              cudf::mask_state::UNALLOCATED,
+                              stream,
+                              mr);
+  auto d_results = results->mutable_view().data<hash_value_type>();
+
+  cudf::detail::grid_1d grid{static_cast<cudf::thread_index_type>(input.size()) * block_size,
+                             block_size};
+  auto const hashes_size = input.chars_size(stream);
+  auto d_hashes          = rmm::device_uvector<hash_value_type>(hashes_size, stream);
+  auto d_threshold_count = cudf::detail::device_scalar<cudf::size_type>(0, stream);
+
+  minhash_seed_kernel<HashFunction>
+    <<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(*d_strings,
+                                                                         seed,
+                                                                         width,
+                                                                         d_hashes.data(),
+                                                                         d_threshold_count.data(),
+                                                                         parameter_a.size(),
+                                                                         d_results);
+  auto const threshold_count = d_threshold_count.value(stream);
+
+  auto indices = rmm::device_uvector<cudf::size_type>(input.size(), stream);
+  thrust::sequence(rmm::exec_policy(stream), indices.begin(), indices.end());
+  cudf::size_type threshold_index = threshold_count < input.size() ? input.size() : 0;
+
+  // if we counted a split of above/below threshold then
+  // compute partitions based on the size of each string
+  if ((threshold_count > 0) && (threshold_count < input.size())) {
+    auto sizes = rmm::device_uvector<cudf::size_type>(input.size(), stream);
+    thrust::transform(rmm::exec_policy_nosync(stream),
+                      thrust::counting_iterator<cudf::size_type>(0),
+                      thrust::counting_iterator<cudf::size_type>(input.size()),
+                      sizes.data(),
+                      cuda::proclaim_return_type<cudf::size_type>(
+                        [d_strings = *d_strings] __device__(auto idx) -> cudf::size_type {
+                          if (d_strings.is_null(idx)) { return 0; }
+                          return d_strings.element<cudf::string_view>(idx).size_bytes();
+                        }));
+    thrust::sort_by_key(
+      rmm::exec_policy_nosync(stream), sizes.begin(), sizes.end(), indices.begin());
+    auto const lb = thrust::lower_bound(
+      rmm::exec_policy_nosync(stream), sizes.begin(), sizes.end(), wide_string_threshold);
+    threshold_index = static_cast<cudf::size_type>(thrust::distance(sizes.begin(), lb));
+  }
+
+  // handle the strings below the threshold width
+  if (threshold_index > 0) {
+    auto d_indices = cudf::device_span<cudf::size_type const>(indices.data(), threshold_index);
+    cudf::detail::grid_1d grid{static_cast<cudf::thread_index_type>(d_indices.size()) * block_size,
+                               block_size};
+    minhash_permuted_kernel<hash_value_type, 1>
+      <<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
+        *d_strings, d_indices, parameter_a, parameter_b, width, d_hashes.data(), d_results);
+  }
+
+  // handle the strings above the threshold width
+  if (threshold_index < input.size()) {
+    auto const count = static_cast<cudf::thread_index_type>(input.size() - threshold_index);
+    auto d_indices =
+      cudf::device_span<cudf::size_type const>(indices.data() + threshold_index, count);
+    cudf::detail::grid_1d grid{count * block_size * blocks_per_string, block_size};
+    minhash_permuted_kernel<hash_value_type, blocks_per_string>
+      <<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
+        *d_strings, d_indices, parameter_a, parameter_b, width, d_hashes.data(), d_results);
+  }
+
+  return results;
+}
+
 /**
  * @brief Compute the minhash of each list row of strings for each seed
  *
@@ -309,6 +647,20 @@ std::unique_ptr<cudf::column> minhash(cudf::strings_column_view const& input,
   return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr);
 }
 
+std::unique_ptr<cudf::column> minhash(cudf::strings_column_view const& input,
+                                      uint32_t seed,
+                                      cudf::device_span<uint32_t const> parameter_a,
+                                      cudf::device_span<uint32_t const> parameter_b,
+                                      cudf::size_type width,
+                                      rmm::cuda_stream_view stream,
+                                      rmm::device_async_resource_ref mr)
+{
+  using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32<cudf::string_view>;
+  auto hashes =
+    detail::minhash_fn<HashFunction>(input, seed, parameter_a, parameter_b, width, stream, mr);
+  return build_list_result(input.parent(), std::move(hashes), parameter_a.size(), stream, mr);
+}
+
 std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
                                         cudf::numeric_scalar<uint64_t> const& seed,
                                         cudf::size_type width,
@@ -333,6 +685,20 @@ std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
   return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr);
 }
 
+std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
+                                        uint64_t seed,
+                                        cudf::device_span<uint64_t const> parameter_a,
+                                        cudf::device_span<uint64_t const> parameter_b,
+                                        cudf::size_type width,
+                                        rmm::cuda_stream_view stream,
+                                        rmm::device_async_resource_ref mr)
+{
+  using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128<cudf::string_view>;
+  auto hashes =
+    detail::minhash_fn<HashFunction>(input, seed, parameter_a, parameter_b, width, stream, mr);
+  return build_list_result(input.parent(), std::move(hashes), parameter_a.size(), stream, mr);
+}
+
 std::unique_ptr<cudf::column> word_minhash(cudf::lists_column_view const& input,
                                            cudf::device_span<uint32_t const> seeds,
                                            rmm::cuda_stream_view stream,
@@ -374,6 +740,18 @@ std::unique_ptr<cudf::column> minhash(cudf::strings_column_view const& input,
   return detail::minhash(input, seeds, width, stream, mr);
 }
 
+std::unique_ptr<cudf::column> minhash_permuted(cudf::strings_column_view const& input,
+                                               uint32_t seed,
+                                               cudf::device_span<uint32_t const> parameter_a,
+                                               cudf::device_span<uint32_t const> parameter_b,
+                                               cudf::size_type width,
+                                               rmm::cuda_stream_view stream,
+                                               rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::minhash(input, seed, parameter_a, parameter_b, width, stream, mr);
+}
+
 std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
                                         cudf::numeric_scalar<uint64_t> seed,
                                         cudf::size_type width,
@@ -394,6 +772,18 @@ std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
   return detail::minhash64(input, seeds, width, stream, mr);
 }
 
+std::unique_ptr<cudf::column> minhash64_permuted(cudf::strings_column_view const& input,
+                                                 uint64_t seed,
+                                                 cudf::device_span<uint64_t const> parameter_a,
+                                                 cudf::device_span<uint64_t const> parameter_b,
+                                                 cudf::size_type width,
+                                                 rmm::cuda_stream_view stream,
+                                                 rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::minhash64(input, seed, parameter_a, parameter_b, width, stream, mr);
+}
+
 std::unique_ptr<cudf::column> word_minhash(cudf::lists_column_view const& input,
                                            cudf::device_span<uint32_t const> seeds,
                                            rmm::cuda_stream_view stream,
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 3a9b930830b..cbca0ceef77 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -610,6 +610,7 @@ ConfigureTest(
   text/bpe_tests.cpp
   text/edit_distance_tests.cpp
   text/jaccard_tests.cpp
+  text/minhash_tests.cpp
   text/ngrams_tests.cpp
   text/ngrams_tokenize_tests.cpp
   text/normalize_tests.cpp
diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp
index ef35a4472cf..042ac44621e 100644
--- a/cpp/tests/text/minhash_tests.cpp
+++ b/cpp/tests/text/minhash_tests.cpp
@@ -28,155 +28,169 @@
 
 struct MinHashTest : public cudf::test::BaseFixture {};
 
-TEST_F(MinHashTest, Basic)
+TEST_F(MinHashTest, Permuted)
 {
-  auto validity = cudf::test::iterators::null_at(1);
   auto input =
     cudf::test::strings_column_wrapper({"doc 1",
-                                        "",
                                         "this is doc 2",
-                                        "",
                                         "doc 3",
                                         "d",
-                                        "The quick brown fox jumpéd over the lazy brown dog."},
-                                       validity);
+                                        "The quick brown fox jumpéd over the lazy brown dog.",
+                                        "line six",
+                                        "line seven",
+                                        "line eight",
+                                        "line nine",
+                                        "line ten"});
 
   auto view = cudf::strings_column_view(input);
 
-  auto results = nvtext::minhash(view);
+  auto first  = thrust::counting_iterator<uint32_t>(10);
+  auto params = cudf::test::fixed_width_column_wrapper<uint32_t>(first, first + 3);
+  auto results =
+    nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4);
 
-  auto expected = cudf::test::fixed_width_column_wrapper<uint32_t>(
-    {1207251914u, 0u, 21141582u, 0u, 1207251914u, 655955059u, 86520422u}, validity);
+  using LCW32 = cudf::test::lists_column_wrapper<uint32_t>;
+  // clang-format off
+  LCW32 expected({
+    LCW32{1392101586u,  394869177u,  811528444u},
+    LCW32{ 211415830u,  187088503u,  130291444u},
+    LCW32{2098117052u,  394869177u,  799753544u},
+    LCW32{2264583304u, 2920538364u, 3576493424u},
+    LCW32{ 253327882u,   41747273u,  302030804u},
+    LCW32{2109809594u, 1017470651u,  326988172u},
+    LCW32{1303819864u,  850676747u,  147107852u},
+    LCW32{ 736021564u,  720812292u, 1405158760u},
+    LCW32{ 902780242u,  134064807u, 1613944636u},
+    LCW32{ 547084870u, 1748895564u,  656501844u}
+  });
+  // clang-format on
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 
-  auto results64  = nvtext::minhash64(view);
-  auto expected64 = cudf::test::fixed_width_column_wrapper<uint64_t>({774489391575805754ul,
-                                                                      0ul,
-                                                                      3232308021562742685ul,
-                                                                      0ul,
-                                                                      13145552576991307582ul,
-                                                                      14660046701545912182ul,
-                                                                      398062025280761388ul},
-                                                                     validity);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64);
-}
+  auto params64  = cudf::test::fixed_width_column_wrapper<uint64_t, uint32_t>(first, first + 3);
+  auto results64 = nvtext::minhash64_permuted(
+    view, 0, cudf::column_view(params64), cudf::column_view(params64), 4);
 
-TEST_F(MinHashTest, LengthEqualsWidth)
-{
-  auto input   = cudf::test::strings_column_wrapper({"abcdé", "fghjk", "lmnop", "qrstu", "vwxyz"});
-  auto view    = cudf::strings_column_view(input);
-  auto results = nvtext::minhash(view, 0, 5);
-  auto expected = cudf::test::fixed_width_column_wrapper<uint32_t>(
-    {3825281041u, 2728681928u, 1984332911u, 3965004915u, 192452857u});
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+  using LCW64 = cudf::test::lists_column_wrapper<uint64_t>;
+  // clang-format off
+  LCW64 expected64({
+    LCW64{ 827364888116975697ul, 1601854279692781452ul,  70500662054893256ul},
+    LCW64{  18312093741021833ul,  133793446674258329ul,  21974512489226198ul},
+    LCW64{  22474244732520567ul, 1638811775655358395ul, 949306297364502264ul},
+    LCW64{1332357434996402861ul, 2157346081260151330ul, 676491718310205848ul},
+    LCW64{  65816830624808020ul,   43323600380520789ul,  63511816333816345ul},
+    LCW64{ 629657184954525200ul,   49741036507643002ul,  97466271004074331ul},
+    LCW64{ 301611977846331113ul,  101188874709594830ul,  97466271004074331ul},
+    LCW64{ 121498891461700668ul,  171065800427907402ul,  97466271004074331ul},
+    LCW64{  54617739511834072ul,  231454301607238929ul,  97466271004074331ul},
+    LCW64{ 576418665851990314ul,  231454301607238929ul,  97466271004074331ul}
+  });
+  // clang-format on
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64);
 }
 
-TEST_F(MinHashTest, MultiSeed)
+TEST_F(MinHashTest, PermutedWide)
 {
-  auto input =
-    cudf::test::strings_column_wrapper({"doc 1",
-                                        "this is doc 2",
-                                        "doc 3",
-                                        "d",
-                                        "The quick brown fox jumpéd over the lazy brown dog."});
-
-  auto view = cudf::strings_column_view(input);
+  std::string const small(2 << 10, 'x');  // below wide_string_threshold
+  std::string const wide(2 << 19, 'y');   // above wide_string_threshold
+  auto input = cudf::test::strings_column_wrapper({small, wide});
+  auto view  = cudf::strings_column_view(input);
 
-  auto seeds   = cudf::test::fixed_width_column_wrapper<uint32_t>({0, 1, 2});
-  auto results = nvtext::minhash(view, cudf::column_view(seeds));
+  auto first  = thrust::counting_iterator<uint32_t>(20);
+  auto params = cudf::test::fixed_width_column_wrapper<uint32_t>(first, first + 3);
+  auto results =
+    nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4);
 
-  using LCW = cudf::test::lists_column_wrapper<uint32_t>;
+  using LCW32 = cudf::test::lists_column_wrapper<uint32_t>;
   // clang-format off
-  LCW expected({LCW{1207251914u, 1677652962u, 1061355987u},
-                LCW{  21141582u,  580916568u, 1258052021u},
-                LCW{1207251914u,  943567174u, 1109272887u},
-                LCW{ 655955059u,  488346356u, 2394664816u},
-                LCW{  86520422u,  236622901u,  102546228u}});
+  LCW32 expected({
+    LCW32{1731998032u,  315359380u, 3193688024u},
+    LCW32{1293098788u, 2860992281u,  133918478u}
+  });
   // clang-format on
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 
-  auto seeds64   = cudf::test::fixed_width_column_wrapper<uint64_t>({0, 1, 2});
-  auto results64 = nvtext::minhash64(view, cudf::column_view(seeds64));
+  auto params64  = cudf::test::fixed_width_column_wrapper<uint64_t, uint32_t>(first, first + 3);
+  auto results64 = nvtext::minhash64_permuted(
+    view, 0, cudf::column_view(params64), cudf::column_view(params64), 4);
 
   using LCW64 = cudf::test::lists_column_wrapper<uint64_t>;
   // clang-format off
-  LCW64 expected64({LCW64{  774489391575805754ul, 10435654231793485448ul, 1188598072697676120ul},
-                    LCW64{ 3232308021562742685ul,  4445611509348165860ul, 1188598072697676120ul},
-                    LCW64{13145552576991307582ul,  6846192680998069919ul, 1188598072697676120ul},
-                    LCW64{14660046701545912182ul, 17106501326045553694ul, 17713478494106035784ul},
-                    LCW64{  398062025280761388ul,   377720198157450084ul,  984941365662009329ul}});
+   LCW64 expected64({
+     LCW64{1818322427062143853ul, 641024893347719371ul, 1769570368846988848ul},
+     LCW64{1389920339306667795ul, 421787002125838902ul, 1759496674158703968ul}
+   });
   // clang-format on
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64);
 }
 
-TEST_F(MinHashTest, MultiSeedWithNullInputRow)
+TEST_F(MinHashTest, PermutedManyParameters)
 {
-  auto validity = cudf::test::iterators::null_at(1);
-  auto input    = cudf::test::strings_column_wrapper({"abcdéfgh", "", "", "stuvwxyz"}, validity);
-  auto view     = cudf::strings_column_view(input);
+  std::string const small(2 << 10, 'x');
+  std::string const wide(2 << 19, 'y');
+  auto input = cudf::test::strings_column_wrapper({small, wide});
+  auto view  = cudf::strings_column_view(input);
 
-  auto seeds   = cudf::test::fixed_width_column_wrapper<uint32_t>({1, 2});
-  auto results = nvtext::minhash(view, cudf::column_view(seeds));
+  auto first = thrust::counting_iterator<uint32_t>(20);
+  // more than params_per_thread
+  auto params = cudf::test::fixed_width_column_wrapper<uint32_t>(first, first + 31);
+  auto results =
+    nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4);
 
-  using LCW = cudf::test::lists_column_wrapper<uint32_t>;
-  LCW expected({LCW{484984072u, 1074168784u}, LCW{}, LCW{0u, 0u}, LCW{571652169u, 173528385u}},
-               validity);
+  using LCW32 = cudf::test::lists_column_wrapper<uint32_t>;
+  // clang-format off
+  LCW32 expected({
+    LCW32{1731998032u,  315359380u, 3193688024u, 1777049372u,  360410720u, 3238739364u, 1822100712u,  405462060u,
+          3283790704u, 1867152052u,  450513400u, 3328842044u, 1912203392u,  495564740u, 3373893384u, 1957254732u,
+           540616080u, 3418944724u, 2002306072u,  585667420u, 3463996064u, 2047357412u,  630718760u, 3509047404u,
+          2092408752u,  675770100u, 3554098744u, 2137460092u,  720821440u, 3599150084u, 2182511432u},
+    LCW32{1293098788u, 2860992281u,  133918478u, 1701811971u, 3269705464u,  542631661u, 2110525154u, 3678418647u,
+           951344844u, 2519238337u, 4087131830u, 1360058027u, 2927951520u,  200877717u, 1768771210u, 3336664703u,
+           609590900u, 2177484393u, 3745377886u, 1018304083u, 2586197576u, 4154091069u, 1427017266u, 2994910759u,
+           267836956u, 1835730449u, 3403623942u,  676550139u, 2244443632u, 3812337125u, 1085263322u}
+  });
+  // clang-format on
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 
-  auto seeds64   = cudf::test::fixed_width_column_wrapper<uint64_t>({11, 22});
-  auto results64 = nvtext::minhash64(view, cudf::column_view(seeds64));
+  // more than params_per_thread
+  auto params64  = cudf::test::fixed_width_column_wrapper<uint64_t, uint32_t>(first, first + 31);
+  auto results64 = nvtext::minhash64_permuted(
+    view, 0, cudf::column_view(params64), cudf::column_view(params64), 4);
 
   using LCW64 = cudf::test::lists_column_wrapper<uint64_t>;
-  LCW64 expected64({LCW64{2597399324547032480ul, 4461410998582111052ul},
-                    LCW64{},
-                    LCW64{0ul, 0ul},
-                    LCW64{2717781266371273264ul, 6977325820868387259ul}},
-                   validity);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64);
-}
-
-TEST_F(MinHashTest, WordsMinHash)
-{
-  using LCWS    = cudf::test::lists_column_wrapper<cudf::string_view>;
-  auto validity = cudf::test::iterators::null_at(1);
-
-  LCWS input(
-    {LCWS({"hello", "abcdéfgh"}),
-     LCWS{},
-     LCWS({"rapids", "moré", "test", "text"}),
-     LCWS({"The", "quick", "brown", "fox", "jumpéd", "over", "the", "lazy", "brown", "dog"})},
-    validity);
-
-  auto view = cudf::lists_column_view(input);
-
-  auto seeds   = cudf::test::fixed_width_column_wrapper<uint32_t>({1, 2});
-  auto results = nvtext::word_minhash(view, cudf::column_view(seeds));
-  using LCW32  = cudf::test::lists_column_wrapper<uint32_t>;
-  LCW32 expected({LCW32{2069617641u, 1975382903u},
-                  LCW32{},
-                  LCW32{657297235u, 1010955999u},
-                  LCW32{644643885u, 310002789u}},
-                 validity);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
-
-  auto seeds64   = cudf::test::fixed_width_column_wrapper<uint64_t>({11, 22});
-  auto results64 = nvtext::word_minhash64(view, cudf::column_view(seeds64));
-  using LCW64    = cudf::test::lists_column_wrapper<uint64_t>;
-  LCW64 expected64({LCW64{1940333969930105370ul, 272615362982418219ul},
-                    LCW64{},
-                    LCW64{5331949571924938590ul, 2088583894581919741ul},
-                    LCW64{3400468157617183341ul, 2398577492366130055ul}},
-                   validity);
+  // clang-format off
+   LCW64 expected64({
+     LCW64{1818322427062143853,  641024893347719371, 1769570368846988848, 592272835132564366,
+           1720818310631833835,  543520776917409353, 1672066252416678822, 494768718702254348,
+           1623314194201523817,  446016660487099335, 1574562135986368804, 397264602271944322,
+           1525810077771213799,  348512544056789317, 1477058019556058786, 299760485841634304,
+           1428305961340903773,  251008427626479291, 1379553903125748768, 202256369411324286,
+           1330801844910593755,  153504311196169273, 1282049786695438742, 104752252981014268,
+           1233297728480283737,   56000194765859255, 1184545670265128724,   7248136550704242,
+           1135793612049973719, 2264339087549243188, 1087041553834818706},
+     LCW64{1389920339306667795,  421787002125838902, 1759496674158703968,  791363336977875075,
+           2129073009010740141, 1160939671829911248,  192806334649082363, 1530516006681947421,
+            562382669501118536, 1900092341533983602,  931959004353154709, 2269668676386019775,
+           1301535339205190882,  333402002024361997, 1671111674057227055,  702978336876398170,
+           2040688008909263228, 1072554671728434343,  104421334547605450, 1442131006580470516,
+            473997669399641631, 1811707341432506689,  843574004251677804, 2181283676284542862,
+           1213150339103713977,  245017001922885084, 1582726673955750150,  614593336774921257,
+           1952303008807786323,  984169671626957438,   16036334446128545}
+   });
+  // clang-format on
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64);
 }
 
 TEST_F(MinHashTest, EmptyTest)
 {
-  auto input   = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
-  auto view    = cudf::strings_column_view(input->view());
-  auto results = nvtext::minhash(view);
+  auto input  = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
+  auto view   = cudf::strings_column_view(input->view());
+  auto params = cudf::test::fixed_width_column_wrapper<uint32_t>({1, 2, 3});
+  auto results =
+    nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4);
   EXPECT_EQ(results->size(), 0);
-  results = nvtext::minhash64(view);
+  auto params64 = cudf::test::fixed_width_column_wrapper<uint64_t>({1, 2, 3});
+  results       = nvtext::minhash64_permuted(
+    view, 0, cudf::column_view(params64), cudf::column_view(params64), 4);
   EXPECT_EQ(results->size(), 0);
 }
 
@@ -184,20 +198,39 @@ TEST_F(MinHashTest, ErrorsTest)
 {
   auto input = cudf::test::strings_column_wrapper({"this string intentionally left blank"});
   auto view  = cudf::strings_column_view(input);
-  EXPECT_THROW(nvtext::minhash(view, 0, 0), std::invalid_argument);
-  EXPECT_THROW(nvtext::minhash64(view, 0, 0), std::invalid_argument);
-  auto seeds = cudf::test::fixed_width_column_wrapper<uint32_t>();
-  EXPECT_THROW(nvtext::minhash(view, cudf::column_view(seeds)), std::invalid_argument);
-  auto seeds64 = cudf::test::fixed_width_column_wrapper<uint64_t>();
-  EXPECT_THROW(nvtext::minhash64(view, cudf::column_view(seeds64)), std::invalid_argument);
+  auto empty = cudf::test::fixed_width_column_wrapper<uint32_t>();
+  EXPECT_THROW(
+    nvtext::minhash_permuted(view, 0, cudf::column_view(empty), cudf::column_view(empty), 0),
+    std::invalid_argument);
+  auto empty64 = cudf::test::fixed_width_column_wrapper<uint64_t>();
+  EXPECT_THROW(
+    nvtext::minhash64_permuted(view, 0, cudf::column_view(empty64), cudf::column_view(empty64), 0),
+    std::invalid_argument);
+  EXPECT_THROW(
+    nvtext::minhash_permuted(view, 0, cudf::column_view(empty), cudf::column_view(empty), 4),
+    std::invalid_argument);
+  EXPECT_THROW(
+    nvtext::minhash64_permuted(view, 0, cudf::column_view(empty64), cudf::column_view(empty64), 4),
+    std::invalid_argument);
 
   std::vector<std::string> h_input(50000, "");
   input = cudf::test::strings_column_wrapper(h_input.begin(), h_input.end());
   view  = cudf::strings_column_view(input);
 
   auto const zeroes = thrust::constant_iterator<uint32_t>(0);
-  seeds             = cudf::test::fixed_width_column_wrapper<uint32_t>(zeroes, zeroes + 50000);
-  EXPECT_THROW(nvtext::minhash(view, cudf::column_view(seeds)), std::overflow_error);
-  seeds64 = cudf::test::fixed_width_column_wrapper<uint64_t>(zeroes, zeroes + 50000);
-  EXPECT_THROW(nvtext::minhash64(view, cudf::column_view(seeds64)), std::overflow_error);
+  auto params       = cudf::test::fixed_width_column_wrapper<uint32_t>(zeroes, zeroes + 50000);
+  EXPECT_THROW(
+    nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4),
+    std::overflow_error);
+  auto params64 = cudf::test::fixed_width_column_wrapper<uint64_t>(zeroes, zeroes + 50000);
+  EXPECT_THROW(nvtext::minhash64_permuted(
+                 view, 0, cudf::column_view(params64), cudf::column_view(params64), 4),
+               std::overflow_error);
+
+  EXPECT_THROW(
+    nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(empty), 4),
+    std::invalid_argument);
+  EXPECT_THROW(
+    nvtext::minhash64_permuted(view, 0, cudf::column_view(params64), cudf::column_view(empty64), 4),
+    std::invalid_argument);
 }
diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx
index 5e39cafa47b..25cfcf99ca6 100644
--- a/python/cudf/cudf/_lib/nvtext/minhash.pyx
+++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx
@@ -1,5 +1,7 @@
 # Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
+from libc.stdint cimport uint32_t, uint64_t
+
 from cudf.core.buffer import acquire_spill_lock
 
 from cudf._lib.column cimport Column
@@ -17,6 +19,19 @@ def minhash(Column input, Column seeds, int width=4):
     return Column.from_pylibcudf(result)
 
 
+@acquire_spill_lock()
+def minhash_permuted(Column input, uint32_t seed, Column a, Column b, int width):
+    return Column.from_pylibcudf(
+        nvtext.minhash.minhash_permuted(
+            input.to_pylibcudf(mode="read"),
+            seed,
+            a.to_pylibcudf(mode="read"),
+            b.to_pylibcudf(mode="read"),
+            width,
+        )
+    )
+
+
 @acquire_spill_lock()
 def minhash64(Column input, Column seeds, int width=4):
     result = nvtext.minhash.minhash64(
@@ -27,6 +42,19 @@ def minhash64(Column input, Column seeds, int width=4):
     return Column.from_pylibcudf(result)
 
 
+@acquire_spill_lock()
+def minhash64_permuted(Column input, uint64_t seed, Column a, Column b, int width):
+    return Column.from_pylibcudf(
+        nvtext.minhash.minhash64_permuted(
+            input.to_pylibcudf(mode="read"),
+            seed,
+            a.to_pylibcudf(mode="read"),
+            b.to_pylibcudf(mode="read"),
+            width,
+        )
+    )
+
+
 @acquire_spill_lock()
 def word_minhash(Column input, Column seeds):
     result = nvtext.minhash.word_minhash(
diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py
index ffa5e603408..4c0ec2d9ac5 100644
--- a/python/cudf/cudf/_lib/strings/__init__.py
+++ b/python/cudf/cudf/_lib/strings/__init__.py
@@ -9,6 +9,8 @@
 from cudf._lib.nvtext.minhash import (
     minhash,
     minhash64,
+    minhash64_permuted,
+    minhash_permuted,
     word_minhash,
     word_minhash64,
 )
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 856ce0f75de..3d70b01b7e4 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -5350,11 +5350,65 @@ def minhash(
             libstrings.minhash(self._column, seeds_column, width)
         )
 
+    def minhash_permuted(
+        self, seed: np.uint32, a: ColumnLike, b: ColumnLike, width: int
+    ) -> SeriesOrIndex:
+        """
+        Compute the minhash of a strings column.
+
+        This uses the MurmurHash3_x86_32 algorithm for the hash function.
+
+        Calculation uses the formula (hv * a + b) % mersenne_prime
+        where hv is the hash of a substring of width characters,
+        a and b are provided values and mersenne_prime is 2^61-1.
+
+        Parameters
+        ----------
+        seed : uint32
+            The seed used for the hash algorithm.
+        a : ColumnLike
+            Values for minhash calculation.
+            Must be of type uint32.
+        b : ColumnLike
+            Values for minhash calculation.
+            Must be of type uint32.
+        width : int
+            The width of the substring to hash.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> import numpy as np
+        >>> s = cudf.Series(['this is my', 'favorite book'])
+        >>> a = cudf.Series([1, 2, 3], dtype=np.uint32)
+        >>> b = cudf.Series([4, 5, 6], dtype=np.uint32)
+        >>> s.str.minhash_permuted(0, a=a, b=b, width=5)
+        0    [1305480171, 462824409, 74608232]
+        1       [32665388, 65330773, 97996158]
+        dtype: list
+        """
+        a_column = column.as_column(a)
+        if a_column.dtype != np.uint32:
+            raise ValueError(
+                f"Expecting a Series with dtype uint32, got {type(a)}"
+            )
+        b_column = column.as_column(b)
+        if b_column.dtype != np.uint32:
+            raise ValueError(
+                f"Expecting a Series with dtype uint32, got {type(b)}"
+            )
+        return self._return_or_inplace(
+            libstrings.minhash_permuted(
+                self._column, seed, a_column, b_column, width
+            )
+        )
+
     def minhash64(
         self, seeds: ColumnLike | None = None, width: int = 4
     ) -> SeriesOrIndex:
         """
         Compute the minhash of a strings column.
+
         This uses the MurmurHash3_x64_128 algorithm for the hash function.
         This function generates 2 uint64 values but only the first
         uint64 value is used.
@@ -5390,6 +5444,59 @@ def minhash64(
             libstrings.minhash64(self._column, seeds_column, width)
         )
 
+    def minhash64_permuted(
+        self, seed: np.uint64, a: ColumnLike, b: ColumnLike, width: int
+    ) -> SeriesOrIndex:
+        """
+        Compute the minhash of a strings column.
+        This uses the MurmurHash3_x64_128 algorithm for the hash function.
+
+        Calculation uses the formula (hv * a + b) % mersenne_prime
+        where hv is the hash of a substring of width characters,
+        a and b are provided values and mersenne_prime is 2^61-1.
+
+        Parameters
+        ----------
+        seed : uint64
+            The seed used for the hash algorithm.
+        a : ColumnLike
+            Values for minhash calculation.
+            Must be of type uint64.
+        b : ColumnLike
+            Values for minhash calculation.
+            Must be of type uint64.
+        width : int
+            The width of the substring to hash.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> import numpy as np
+        >>> s = cudf.Series(['this is my', 'favorite book', 'to read'])
+        >>> a = cudf.Series([2, 3], dtype=np.uint64)
+        >>> b = cudf.Series([5, 6], dtype=np.uint64)
+        >>> s.str.minhash64_permuted(0, a=a, b=b, width=5)
+        0    [172452388517576012, 316595762085180527]
+        1      [71427536958126239, 58787297728258215]
+        2    [423885828176437114, 1140588505926961370]
+        dtype: list
+        """
+        a_column = column.as_column(a)
+        if a_column.dtype != np.uint64:
+            raise ValueError(
+                f"Expecting a Series with dtype uint64, got {type(a)}"
+            )
+        b_column = column.as_column(b)
+        if b_column.dtype != np.uint64:
+            raise ValueError(
+                f"Expecting a Series with dtype uint64, got {type(b)}"
+            )
+        return self._return_or_inplace(
+            libstrings.minhash64_permuted(
+                self._column, seed, a_column, b_column, width
+            )
+        )
+
     def word_minhash(self, seeds: ColumnLike | None = None) -> SeriesOrIndex:
         """
         Compute the minhash of a list column of strings.
diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py
index 997ca357986..47e541fdcef 100644
--- a/python/cudf/cudf/tests/text/test_text_methods.py
+++ b/python/cudf/cudf/tests/text/test_text_methods.py
@@ -882,68 +882,48 @@ def test_is_vowel_consonant():
     assert_eq(expected, actual)
 
 
-def test_minhash():
+def test_minhash_permuted():
     strings = cudf.Series(["this is my", "favorite book", None, ""])
 
+    params = cudf.Series([1, 2, 3], dtype=np.uint32)
     expected = cudf.Series(
         [
-            cudf.Series([21141582], dtype=np.uint32),
-            cudf.Series([962346254], dtype=np.uint32),
-            None,
-            cudf.Series([0], dtype=np.uint32),
-        ]
-    )
-    actual = strings.str.minhash()
-    assert_eq(expected, actual)
-    seeds = cudf.Series([0, 1, 2], dtype=np.uint32)
-    expected = cudf.Series(
-        [
-            cudf.Series([1305480167, 668155704, 34311509], dtype=np.uint32),
-            cudf.Series([32665384, 3470118, 363147162], dtype=np.uint32),
+            cudf.Series([1305480168, 462824406, 74608229], dtype=np.uint32),
+            cudf.Series([32665385, 65330770, 97996155], dtype=np.uint32),
             None,
             cudf.Series([0, 0, 0], dtype=np.uint32),
         ]
     )
-    actual = strings.str.minhash(seeds=seeds, width=5)
+    actual = strings.str.minhash_permuted(0, a=params, b=params, width=5)
     assert_eq(expected, actual)
 
-    expected = cudf.Series(
-        [
-            cudf.Series([3232308021562742685], dtype=np.uint64),
-            cudf.Series([23008204270530356], dtype=np.uint64),
-            None,
-            cudf.Series([0], dtype=np.uint64),
-        ]
-    )
-    actual = strings.str.minhash64()
-    assert_eq(expected, actual)
-    seeds = cudf.Series([0, 1, 2], dtype=np.uint64)
+    params = cudf.Series([1, 2, 3], dtype=np.uint64)
     expected = cudf.Series(
         [
             cudf.Series(
-                [7082801294247314046, 185949556058924788, 167570629329462454],
+                [105531920695060180, 172452388517576009, 316595762085180524],
                 dtype=np.uint64,
             ),
             cudf.Series(
-                [382665377781028452, 86243762733551437, 7688750597953083512],
+                [35713768479063122, 71427536958126236, 58787297728258212],
                 dtype=np.uint64,
             ),
             None,
             cudf.Series([0, 0, 0], dtype=np.uint64),
         ]
     )
-    actual = strings.str.minhash64(seeds=seeds, width=5)
+    actual = strings.str.minhash64_permuted(0, a=params, b=params, width=5)
     assert_eq(expected, actual)
 
     # test wrong seed types
     with pytest.raises(ValueError):
-        strings.str.minhash(seeds="a")
+        strings.str.minhash_permuted(1, a="a", b="b", width=7)
     with pytest.raises(ValueError):
-        seeds = cudf.Series([0, 1, 2], dtype=np.int32)
-        strings.str.minhash(seeds=seeds)
+        params = cudf.Series([0, 1, 2], dtype=np.int32)
+        strings.str.minhash_permuted(1, a=params, b=params, width=6)
     with pytest.raises(ValueError):
-        seeds = cudf.Series([0, 1, 2], dtype=np.uint32)
-        strings.str.minhash64(seeds=seeds)
+        params = cudf.Series([0, 1, 2], dtype=np.uint32)
+        strings.str.minhash64_permuted(1, a=params, b=params, width=8)
 
 
 def test_word_minhash():
diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd
index 41250037dcf..ebf8eda1ce3 100644
--- a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd
@@ -22,6 +22,14 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil:
         const size_type width,
     ) except +
 
+    cdef unique_ptr[column] minhash_permuted(
+        const column_view &strings,
+        const uint32_t seed,
+        const column_view &a,
+        const column_view &b,
+        const size_type width,
+    ) except +
+
     cdef unique_ptr[column] minhash64(
         const column_view &strings,
         const column_view &seeds,
@@ -34,6 +42,14 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil:
         const size_type width,
     ) except +
 
+    cdef unique_ptr[column] minhash64_permuted(
+        const column_view &strings,
+        const uint64_t seed,
+        const column_view &a,
+        const column_view &b,
+        const size_type width,
+    ) except +
+
     cdef unique_ptr[column] word_minhash(
         const column_view &input,
         const column_view &seeds
diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd
index 97e8c9dc83c..6b544282f44 100644
--- a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd
+++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd
@@ -11,8 +11,24 @@ ctypedef fused ColumnOrScalar:
 
 cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=*)
 
+cpdef Column minhash_permuted(
+    Column input,
+    uint32_t seed,
+    Column a,
+    Column b,
+    size_type width
+)
+
 cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=*)
 
+cpdef Column minhash64_permuted(
+    Column input,
+    uint64_t seed,
+    Column a,
+    Column b,
+    size_type width
+)
+
 cpdef Column word_minhash(Column input, Column seeds)
 
 cpdef Column word_minhash64(Column input, Column seeds)
diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx
index f1e012e60e5..5a51e32b287 100644
--- a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx
+++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx
@@ -8,6 +8,8 @@ from pylibcudf.libcudf.column.column cimport column
 from pylibcudf.libcudf.nvtext.minhash cimport (
     minhash as cpp_minhash,
     minhash64 as cpp_minhash64,
+    minhash64_permuted as cpp_minhash64_permuted,
+    minhash_permuted as cpp_minhash_permuted,
     word_minhash as cpp_word_minhash,
     word_minhash64 as cpp_word_minhash64,
 )
@@ -16,6 +18,7 @@ from pylibcudf.libcudf.types cimport size_type
 from pylibcudf.scalar cimport Scalar
 
 from cython.operator import dereference
+import warnings
 
 
 cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4):
@@ -40,6 +43,12 @@ cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4):
     Column
         List column of minhash values for each string per seed
     """
+    warnings.warn(
+        "Starting in version 25.02, the signature of this function will "
+        "be changed to match pylibcudf.nvtext.minhash_permuted.",
+        FutureWarning
+    )
+
     cdef unique_ptr[column] c_result
 
     if not isinstance(seeds, (Column, Scalar)):
@@ -55,6 +64,50 @@ cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4):
 
     return Column.from_libcudf(move(c_result))
 
+cpdef Column minhash_permuted(
+    Column input,
+    uint32_t seed,
+    Column a,
+    Column b,
+    size_type width
+):
+    """
+    Returns the minhash values for each string.
+    This function uses MurmurHash3_x86_32 for the hash algorithm.
+
+    For details, see :cpp:func:`minhash_permuted`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings column to compute minhash
+    seed : uint32_t
+        Seed used for the hash function
+    a : Column
+        1st parameter value used for the minhash algorithm.
+    b : Column
+        2nd parameter value used for the minhash algorithm.
+    width : size_type
+        Character width used for apply substrings;
+
+    Returns
+    -------
+    Column
+        List column of minhash values for each string per seed
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = cpp_minhash_permuted(
+            input.view(),
+            seed,
+            a.view(),
+            b.view(),
+            width
+        )
+
+    return Column.from_libcudf(move(c_result))
+
 cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4):
     """
     Returns the minhash values for each string per seed.
@@ -77,6 +130,12 @@ cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4):
     Column
         List column of minhash values for each string per seed
     """
+    warnings.warn(
+        "Starting in version 25.02, the signature of this function will "
+        "be changed to match pylibcudf.nvtext.minhash64_permuted.",
+        FutureWarning
+    )
+
     cdef unique_ptr[column] c_result
 
     if not isinstance(seeds, (Column, Scalar)):
@@ -92,6 +151,50 @@ cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4):
 
     return Column.from_libcudf(move(c_result))
 
+cpdef Column minhash64_permuted(
+    Column input,
+    uint64_t seed,
+    Column a,
+    Column b,
+    size_type width
+):
+    """
+    Returns the minhash values for each string.
+    This function uses MurmurHash3_x64_128 for the hash algorithm.
+
+    For details, see :cpp:func:`minhash64_permuted`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings column to compute minhash
+    seed : uint64_t
+        Seed used for the hash function
+    a : Column
+        1st parameter value used for the minhash algorithm.
+    b : Column
+        2nd parameter value used for the minhash algorithm.
+    width : size_type
+        Character width used for apply substrings;
+
+    Returns
+    -------
+    Column
+        List column of minhash values for each string per seed
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = cpp_minhash64_permuted(
+            input.view(),
+            seed,
+            a.view(),
+            b.view(),
+            width
+        )
+
+    return Column.from_libcudf(move(c_result))
+
 cpdef Column word_minhash(Column input, Column seeds):
     """
     Returns the minhash values for each row of strings per seed.
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py
index ead9ee094af..ec533e64307 100644
--- a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py
@@ -21,15 +21,19 @@ def word_minhash_input_data(request):
 
 
 @pytest.mark.parametrize("width", [5, 12])
-def test_minhash(minhash_input_data, width):
+def test_minhash_permuted(minhash_input_data, width):
     input_arr, seeds, seed_type = minhash_input_data
     minhash_func = (
-        plc.nvtext.minhash.minhash
+        plc.nvtext.minhash.minhash_permuted
         if seed_type == pa.uint32()
-        else plc.nvtext.minhash.minhash64
+        else plc.nvtext.minhash.minhash64_permuted
     )
     result = minhash_func(
-        plc.interop.from_arrow(input_arr), plc.interop.from_arrow(seeds), width
+        plc.interop.from_arrow(input_arr),
+        0,
+        plc.interop.from_arrow(seeds),
+        plc.interop.from_arrow(seeds),
+        width,
     )
     pa_result = plc.interop.to_arrow(result)
     assert all(len(got) == len(seeds) for got, s in zip(pa_result, input_arr))