Add new nvtext minhash_permuted API (#16756)

Introduce new nvtext minhash API that takes a single seed for hashing and 2 parameter vectors to calculate the minhash results from the seed hash: ``` std::unique_ptr<cudf::column> minhash_permuted( cudf::strings_column_view const& input, uint32_t seed, cudf::device_span<uint32_t const> parameter_a, cudf::device_span<uint32_t const> parameter_b, cudf::size_type width, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); ``` The `seed` is used to hash the `input` using rolling set of substrings `width` characters wide. The hashes are then combined with the values in `parameter_a` and `parameter_b` to calculate a set of 32-bit (or 64-bit) values for each row. Only the minimum value is returned per element of `a` and `b` when combined with all the hashes for a row. Each output row is a set of M values where `M = parameter_a.size() = parameter_b.size()` This implementation is significantly faster than the current minhash which computes hashes for multiple seeds. Included in this PR is also the `minhash64_permuted()` API that is identical but uses 64-bit values for the seed and the parameter values. Also included are new tests and a benchmark as well as the pylibcudf and cudf interfaces. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Matthew Murray (https://github.com/Matt711) - Lawrence Mitchell (https://github.com/wence-) - Karthikeyan (https://github.com/karthikeyann) - Yunsong Wang (https://github.com/PointKernel) URL: #16756
rapidsai · Nov 12, 2024 · ccfc95a · ccfc95a
1 parent 043bcbd
commit ccfc95a
Show file tree

Hide file tree

Showing 14 changed files with 949 additions and 177 deletions.
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
@@ -348,8 +348,8 @@ ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binary
 ConfigureBench(TEXT_BENCH text/subword.cpp)
 
 ConfigureNVBench(
-  TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/ngrams.cpp
-  text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp
+  TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp
+  text/ngrams.cpp text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp
 )
 
 # ##################################################################################################

diff --git a/cpp/benchmarks/text/minhash.cpp b/cpp/benchmarks/text/minhash.cpp
@@ -20,35 +20,32 @@
 
 #include <nvtext/minhash.hpp>
 
-#include <rmm/device_buffer.hpp>
-
 #include <nvbench/nvbench.cuh>
 
 static void bench_minhash(nvbench::state& state)
 {
   auto const num_rows   = static_cast<cudf::size_type>(state.get_int64("num_rows"));
   auto const row_width  = static_cast<cudf::size_type>(state.get_int64("row_width"));
   auto const hash_width = static_cast<cudf::size_type>(state.get_int64("hash_width"));
-  auto const seed_count = static_cast<cudf::size_type>(state.get_int64("seed_count"));
+  auto const parameters = static_cast<cudf::size_type>(state.get_int64("parameters"));
   auto const base64     = state.get_int64("hash_type") == 64;
 
-  if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
-      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
-    state.skip("Skip benchmarks greater than size_type limit");
-  }
-
   data_profile const strings_profile = data_profile_builder().distribution(
     cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
   auto const strings_table =
     create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile);
   cudf::strings_column_view input(strings_table->view().column(0));
 
-  data_profile const seeds_profile = data_profile_builder().null_probability(0).distribution(
-    cudf::type_to_id<cudf::hash_value_type>(), distribution_id::NORMAL, 0, row_width);
-  auto const seed_type   = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32;
-  auto const seeds_table = create_random_table({seed_type}, row_count{seed_count}, seeds_profile);
-  auto seeds             = seeds_table->get_column(0);
-  seeds.set_null_mask(rmm::device_buffer{}, 0);
+  data_profile const param_profile = data_profile_builder().no_validity().distribution(
+    cudf::type_to_id<cudf::hash_value_type>(),
+    distribution_id::NORMAL,
+    0u,
+    std::numeric_limits<cudf::hash_value_type>::max());
+  auto const param_type = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32;
+  auto const param_table =
+    create_random_table({param_type, param_type}, row_count{parameters}, param_profile);
+  auto const parameters_a = param_table->view().column(0);
+  auto const parameters_b = param_table->view().column(1);
 
   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
 
@@ -57,15 +54,16 @@ static void bench_minhash(nvbench::state& state)
   state.add_global_memory_writes<nvbench::int32_t>(num_rows);  // output are hashes
 
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-    auto result = base64 ? nvtext::minhash64(input, seeds.view(), hash_width)
-                         : nvtext::minhash(input, seeds.view(), hash_width);
+    auto result = base64
+                    ? nvtext::minhash64_permuted(input, 0, parameters_a, parameters_b, hash_width)
+                    : nvtext::minhash_permuted(input, 0, parameters_a, parameters_b, hash_width);
   });
 }
 
 NVBENCH_BENCH(bench_minhash)
   .set_name("minhash")
-  .add_int64_axis("num_rows", {1024, 8192, 16364, 131072})
-  .add_int64_axis("row_width", {128, 512, 2048})
-  .add_int64_axis("hash_width", {5, 10})
-  .add_int64_axis("seed_count", {2, 26})
+  .add_int64_axis("num_rows", {15000, 30000, 60000})
+  .add_int64_axis("row_width", {6000, 28000, 50000})
+  .add_int64_axis("hash_width", {12, 24})
+  .add_int64_axis("parameters", {26, 260})
   .add_int64_axis("hash_type", {32, 64});
diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp
@@ -94,6 +94,53 @@ namespace CUDF_EXPORT nvtext {
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
+/**
+ * @brief Returns the minhash values for each string
+ *
+ * This function uses MurmurHash3_x86_32 for the hash algorithm.
+ *
+ * The input strings are first hashed using the given `seed` over substrings
+ * of `width` characters. These hash values are then combined with the `a`
+ * and `b` parameter values using the following formula:
+ * ```
+ *   max_hash = max of uint32
+ *   mp = (1 << 61) - 1
+ *   hv[i] = hash value of a substring at i
+ *   pv[i] = ((hv[i] * a[i] + b[i]) % mp) & max_hash
+ * ```
+ *
+ * This calculation is performed on each substring and the minimum value is computed
+ * as follows:
+ * ```
+ *   mh[j,i] = min(pv[i]) for all substrings in row j
+ *                        and where i=[0,a.size())
+ * ```
+ *
+ * Any null row entries result in corresponding null output rows.
+ *
+ * @throw std::invalid_argument if the width < 2
+ * @throw std::invalid_argument if parameter_a is empty
+ * @throw std::invalid_argument if `parameter_b.size() != parameter_a.size()`
+ * @throw std::overflow_error if `parameter_a.size() * input.size()` exceeds the column size limit
+ *
+ * @param input Strings column to compute minhash
+ * @param seed Seed value used for the hash algorithm
+ * @param parameter_a Values used for the permuted calculation
+ * @param parameter_b Values used for the permuted calculation
+ * @param width The character width of substrings to hash for each row
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return List column of minhash values for each string per seed
+ */
+std::unique_ptr<cudf::column> minhash_permuted(
+  cudf::strings_column_view const& input,
+  uint32_t seed,
+  cudf::device_span<uint32_t const> parameter_a,
+  cudf::device_span<uint32_t const> parameter_b,
+  cudf::size_type width,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
+
 /**
  * @brief Returns the minhash value for each string
  *
@@ -159,6 +206,53 @@ namespace CUDF_EXPORT nvtext {
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
+/**
+ * @brief Returns the minhash values for each string
+ *
+ * This function uses MurmurHash3_x64_128 for the hash algorithm.
+ *
+ * The input strings are first hashed using the given `seed` over substrings
+ * of `width` characters. These hash values are then combined with the `a`
+ * and `b` parameter values using the following formula:
+ * ```
+ *   max_hash = max of uint64
+ *   mp = (1 << 61) - 1
+ *   hv[i] = hash value of a substring at i
+ *   pv[i] = ((hv[i] * a[i] + b[i]) % mp) & max_hash
+ * ```
+ *
+ * This calculation is performed on each substring and the minimum value is computed
+ * as follows:
+ * ```
+ *   mh[j,i] = min(pv[i]) for all substrings in row j
+ *                        and where i=[0,a.size())
+ * ```
+ *
+ * Any null row entries result in corresponding null output rows.
+ *
+ * @throw std::invalid_argument if the width < 2
+ * @throw std::invalid_argument if parameter_a is empty
+ * @throw std::invalid_argument if `parameter_b.size() != parameter_a.size()`
+ * @throw std::overflow_error if `parameter_a.size() * input.size()` exceeds the column size limit
+ *
+ * @param input Strings column to compute minhash
+ * @param seed Seed value used for the hash algorithm
+ * @param parameter_a Values used for the permuted calculation
+ * @param parameter_b Values used for the permuted calculation
+ * @param width The character width of substrings to hash for each row
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return List column of minhash values for each string per seed
+ */
+std::unique_ptr<cudf::column> minhash64_permuted(
+  cudf::strings_column_view const& input,
+  uint64_t seed,
+  cudf::device_span<uint64_t const> parameter_a,
+  cudf::device_span<uint64_t const> parameter_b,
+  cudf::size_type width,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
+
 /**
  * @brief Returns the minhash values for each row of strings per seed
  *