From ccfc95a623e13d59a6e4f640ee7c022bda35f763 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 12 Nov 2024 10:03:06 -0500 Subject: [PATCH] Add new nvtext minhash_permuted API (#16756) Introduce a new nvtext minhash API that takes a single seed for hashing and 2 parameter vectors to calculate the minhash results from the seed hash: ``` std::unique_ptr<cudf::column> minhash_permuted( cudf::strings_column_view const& input, uint32_t seed, cudf::device_span<uint32_t const> parameter_a, cudf::device_span<uint32_t const> parameter_b, cudf::size_type width, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); ``` The `seed` is used to hash the `input` using a rolling set of substrings `width` characters wide. The hashes are then combined with the values in `parameter_a` and `parameter_b` to calculate a set of 32-bit (or 64-bit) values for each row. Only the minimum value is returned per element of `a` and `b` when combined with all the hashes for a row. Each output row is a set of M values where `M = parameter_a.size() = parameter_b.size()`. This implementation is significantly faster than the current minhash, which computes hashes for multiple seeds. Included in this PR is also the `minhash64_permuted()` API that is identical but uses 64-bit values for the seed and the parameter values. Also included are new tests and a benchmark as well as the pylibcudf and cudf interfaces. 
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Matthew Murray (https://github.com/Matt711) - Lawrence Mitchell (https://github.com/wence-) - Karthikeyan (https://github.com/karthikeyann) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/16756 --- cpp/benchmarks/CMakeLists.txt | 4 +- cpp/benchmarks/text/minhash.cpp | 38 +- cpp/include/nvtext/minhash.hpp | 94 +++++ cpp/src/text/minhash.cu | 390 ++++++++++++++++++ cpp/tests/CMakeLists.txt | 1 + cpp/tests/text/minhash_tests.cpp | 267 ++++++------ python/cudf/cudf/_lib/nvtext/minhash.pyx | 28 ++ python/cudf/cudf/_lib/strings/__init__.py | 2 + python/cudf/cudf/core/column/string.py | 107 +++++ .../cudf/cudf/tests/text/test_text_methods.py | 48 +-- .../pylibcudf/libcudf/nvtext/minhash.pxd | 16 + python/pylibcudf/pylibcudf/nvtext/minhash.pxd | 16 + python/pylibcudf/pylibcudf/nvtext/minhash.pyx | 103 +++++ .../pylibcudf/tests/test_nvtext_minhash.py | 12 +- 14 files changed, 949 insertions(+), 177 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index ad090be99f3..59f5602fd5a 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -348,8 +348,8 @@ ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binary ConfigureBench(TEXT_BENCH text/subword.cpp) ConfigureNVBench( - TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/ngrams.cpp - text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp + TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp + text/ngrams.cpp text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp ) # ################################################################################################## diff --git a/cpp/benchmarks/text/minhash.cpp b/cpp/benchmarks/text/minhash.cpp index 31ce60d8f9a..a80d0dcbdb8 100644 --- a/cpp/benchmarks/text/minhash.cpp +++ 
b/cpp/benchmarks/text/minhash.cpp @@ -20,8 +20,6 @@ #include -#include - #include static void bench_minhash(nvbench::state& state) @@ -29,26 +27,25 @@ static void bench_minhash(nvbench::state& state) auto const num_rows = static_cast(state.get_int64("num_rows")); auto const row_width = static_cast(state.get_int64("row_width")); auto const hash_width = static_cast(state.get_int64("hash_width")); - auto const seed_count = static_cast(state.get_int64("seed_count")); + auto const parameters = static_cast(state.get_int64("parameters")); auto const base64 = state.get_int64("hash_type") == 64; - if (static_cast(num_rows) * static_cast(row_width) >= - static_cast(std::numeric_limits::max())) { - state.skip("Skip benchmarks greater than size_type limit"); - } - data_profile const strings_profile = data_profile_builder().distribution( cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width); auto const strings_table = create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile); cudf::strings_column_view input(strings_table->view().column(0)); - data_profile const seeds_profile = data_profile_builder().null_probability(0).distribution( - cudf::type_to_id(), distribution_id::NORMAL, 0, row_width); - auto const seed_type = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32; - auto const seeds_table = create_random_table({seed_type}, row_count{seed_count}, seeds_profile); - auto seeds = seeds_table->get_column(0); - seeds.set_null_mask(rmm::device_buffer{}, 0); + data_profile const param_profile = data_profile_builder().no_validity().distribution( + cudf::type_to_id(), + distribution_id::NORMAL, + 0u, + std::numeric_limits::max()); + auto const param_type = base64 ? 
cudf::type_id::UINT64 : cudf::type_id::UINT32; + auto const param_table = + create_random_table({param_type, param_type}, row_count{parameters}, param_profile); + auto const parameters_a = param_table->view().column(0); + auto const parameters_b = param_table->view().column(1); state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); @@ -57,15 +54,16 @@ static void bench_minhash(nvbench::state& state) state.add_global_memory_writes(num_rows); // output are hashes state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = base64 ? nvtext::minhash64(input, seeds.view(), hash_width) - : nvtext::minhash(input, seeds.view(), hash_width); + auto result = base64 + ? nvtext::minhash64_permuted(input, 0, parameters_a, parameters_b, hash_width) + : nvtext::minhash_permuted(input, 0, parameters_a, parameters_b, hash_width); }); } NVBENCH_BENCH(bench_minhash) .set_name("minhash") - .add_int64_axis("num_rows", {1024, 8192, 16364, 131072}) - .add_int64_axis("row_width", {128, 512, 2048}) - .add_int64_axis("hash_width", {5, 10}) - .add_int64_axis("seed_count", {2, 26}) + .add_int64_axis("num_rows", {15000, 30000, 60000}) + .add_int64_axis("row_width", {6000, 28000, 50000}) + .add_int64_axis("hash_width", {12, 24}) + .add_int64_axis("parameters", {26, 260}) .add_int64_axis("hash_type", {32, 64}); diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index 42124461cdf..b2c1a23f57e 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -94,6 +94,53 @@ namespace CUDF_EXPORT nvtext { rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Returns the minhash values for each string + * + * This function uses MurmurHash3_x86_32 for the hash algorithm. + * + * The input strings are first hashed using the given `seed` over substrings + * of `width` characters. 
These hash values are then combined with the `a` + * and `b` parameter values using the following formula: + * ``` + * max_hash = max of uint32 + * mp = (1 << 61) - 1 + * hv[i] = hash value of a substring at i + * pv[i] = ((hv[i] * a[i] + b[i]) % mp) & max_hash + * ``` + * + * This calculation is performed on each substring and the minimum value is computed + * as follows: + * ``` + * mh[j,i] = min(pv[i]) for all substrings in row j + * and where i=[0,a.size()) + * ``` + * + * Any null row entries result in corresponding null output rows. + * + * @throw std::invalid_argument if the width < 2 + * @throw std::invalid_argument if parameter_a is empty + * @throw std::invalid_argument if `parameter_b.size() != parameter_a.size()` + * @throw std::overflow_error if `parameter_a.size() * input.size()` exceeds the column size limit + * + * @param input Strings column to compute minhash + * @param seed Seed value used for the hash algorithm + * @param parameter_a Values used for the permuted calculation + * @param parameter_b Values used for the permuted calculation + * @param width The character width of substrings to hash for each row + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return List column of minhash values for each string, one per `parameter_a`/`parameter_b` pair + */ +std::unique_ptr minhash_permuted( + cudf::strings_column_view const& input, + uint32_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + /** * @brief Returns the minhash value for each string * @@ -159,6 +206,53 @@ namespace CUDF_EXPORT nvtext { rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Returns the 
minhash values for each string + * + * This function uses MurmurHash3_x64_128 for the hash algorithm. + * + * The input strings are first hashed using the given `seed` over substrings + * of `width` characters. These hash values are then combined with the `a` + * and `b` parameter values using the following formula: + * ``` + * max_hash = max of uint64 + * mp = (1 << 61) - 1 + * hv[i] = hash value of a substring at i + * pv[i] = ((hv[i] * a[i] + b[i]) % mp) & max_hash + * ``` + * + * This calculation is performed on each substring and the minimum value is computed + * as follows: + * ``` + * mh[j,i] = min(pv[i]) for all substrings in row j + * and where i=[0,a.size()) + * ``` + * + * Any null row entries result in corresponding null output rows. + * + * @throw std::invalid_argument if the width < 2 + * @throw std::invalid_argument if parameter_a is empty + * @throw std::invalid_argument if `parameter_b.size() != parameter_a.size()` + * @throw std::overflow_error if `parameter_a.size() * input.size()` exceeds the column size limit + * + * @param input Strings column to compute minhash + * @param seed Seed value used for the hash algorithm + * @param parameter_a Values used for the permuted calculation + * @param parameter_b Values used for the permuted calculation + * @param width The character width of substrings to hash for each row + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return List column of minhash values for each string, one per `parameter_a`/`parameter_b` pair + */ +std::unique_ptr minhash64_permuted( + cudf::strings_column_view const& input, + uint64_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + /** * @brief Returns the minhash values for each row of strings per 
seed * diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index a03a34f5fa7..aee83ab35ed 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -37,9 +38,13 @@ #include #include +#include #include +#include #include #include +#include +#include #include @@ -162,6 +167,339 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, return hashes; } +constexpr cudf::thread_index_type block_size = 256; +// for potentially tuning minhash_seed_kernel independently from block_size +constexpr cudf::thread_index_type tile_size = block_size; + +// Number of a/b parameter values to process per thread. +// The intermediate values are stored in shared-memory and therefore limits this count. +// This value was found to be an efficient size for both uint32 and uint64 +// hash types based on benchmarks. +constexpr cuda::std::size_t params_per_thread = 16; + +// Separate kernels are used to process strings above and below this value (in bytes). +constexpr cudf::size_type wide_string_threshold = 1 << 18; // 256K +// The number of blocks per string for the above-threshold kernel processing. +constexpr cudf::size_type blocks_per_string = 64; +// The above values were determined using the redpajama and books_sample datasets + +/** + * @brief Hashing kernel launched as a thread per tile-size (block or warp) + * + * This kernel computes the hashes for each string using the seed and the specified + * hash function. The width is used to compute rolling substrings to hash over. + * The hashes are stored in d_hashes to be used in the minhash_permuted_kernel. + * + * This kernel also counts the number of strings above the wide_string_threshold + * and proactively initializes the output values for those strings. 
+ * + * @tparam HashFunction The hash function to use for this kernel + * @tparam hash_value_type Derived from HashFunction result_type + * + * @param d_strings The input strings to hash + * @param seed The seed used for the hash function + * @param width Width in characters used for determining substrings to hash + * @param d_hashes The resulting hash values are stored here + * @param threshold_count Stores the number of strings above wide_string_threshold + * @param param_count Number of parameters (used for the proactive initialize) + * @param d_results Final results vector (used for the proactive initialize) + */ +template +CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings, + hash_value_type seed, + cudf::size_type width, + hash_value_type* d_hashes, + cudf::size_type* threshold_count, + cudf::size_type param_count, + hash_value_type* d_results) +{ + auto const tid = cudf::detail::grid_1d::global_thread_id(); + auto const str_idx = tid / tile_size; + if (str_idx >= d_strings.size()) { return; } + if (d_strings.is_null(str_idx)) { return; } + + // retrieve this string's offset to locate the output position in d_hashes + auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index); + auto const offsets_itr = + cudf::detail::input_offsetalator(offsets.head(), offsets.type(), d_strings.offset()); + auto const offset = offsets_itr[str_idx]; + auto const size_bytes = static_cast(offsets_itr[str_idx + 1] - offset); + if (size_bytes == 0) { return; } + + auto const d_str = cudf::string_view(d_strings.head() + offset, size_bytes); + auto const lane_idx = tid % tile_size; + + // hashes for this string/thread are stored here + auto seed_hashes = d_hashes + offset - offsets_itr[0] + lane_idx; + + auto const begin = d_str.data() + lane_idx; + auto const end = d_str.data() + d_str.size_bytes(); + auto const hasher = HashFunction(seed); + + for (auto itr = begin; itr < end; itr += tile_size, seed_hashes += tile_size) { + if 
(cudf::strings::detail::is_utf8_continuation_char(*itr)) { + *seed_hashes = 0; + continue; + } + auto const check_str = // used for counting 'width' characters + cudf::string_view(itr, static_cast(thrust::distance(itr, end))); + auto const [bytes, left] = cudf::strings::detail::bytes_to_character_position(check_str, width); + if ((itr != d_str.data()) && (left > 0)) { + // true itr+width is past the end of the string + *seed_hashes = 0; + continue; + } + + auto const hash_str = cudf::string_view(itr, bytes); + hash_value_type hv; + if constexpr (std::is_same_v) { + hv = hasher(hash_str); + } else { + hv = thrust::get<0>(hasher(hash_str)); + } + // disallowing hash to zero case + *seed_hashes = cuda::std::max(hv, hash_value_type{1}); + } + + // logic appended here so an extra kernel is not required + if (size_bytes >= wide_string_threshold) { + if (lane_idx == 0) { + // count the number of wide strings + cuda::atomic_ref ref{*threshold_count}; + ref.fetch_add(1, cuda::std::memory_order_relaxed); + } + // initialize the output -- only needed for wider strings + auto d_output = d_results + (str_idx * param_count); + for (auto i = lane_idx; i < param_count; i += tile_size) { + d_output[i] = std::numeric_limits::max(); + } + } +} + +/** + * @brief Permutation calculation kernel + * + * This kernel uses the hashes from the minhash_seed_kernel and the parameter_a and + * parameter_b values to compute the final output results. + * The output is the number of input rows (N) by the number of parameter values (M). + * Each output[i] is the calculated result for parameter_a/b[0:M]. + * + * This kernel is launched with either 1 block per string for strings + * below wide_string_threshold or blocks_per_string blocks per string + * for strings at or above wide_string_threshold. 
+ * + * @tparam hash_value_type Derived from HashFunction result_type + * @tparam blocks_per_string Number of blocks used to process each string + * + * @param d_strings The input strings to hash + * @param indices The indices of the strings in d_strings to process + * @param parameter_a 1st set of parameters for the calculation result + * @param parameter_b 2nd set of parameters for the calculation result + * @param width Used for calculating the number of available hashes in each string + * @param d_hashes The hash values computed in minhash_seed_kernel + * @param d_results Final results vector of calculated values + */ +template +CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_strings, + cudf::device_span indices, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, + hash_value_type const* d_hashes, + hash_value_type* d_results) +{ + auto const tid = cudf::detail::grid_1d::global_thread_id(); + auto const idx = (tid / blocks_per_string) / block_size; + if (idx >= indices.size()) { return; } + auto const str_idx = indices[idx]; + if (d_strings.is_null(str_idx)) { return; } + + auto const block = cooperative_groups::this_thread_block(); + int const section_idx = block.group_index().x % blocks_per_string; + + auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index); + auto const offsets_itr = + cudf::detail::input_offsetalator(offsets.head(), offsets.type(), d_strings.offset()); + auto const offset = offsets_itr[str_idx]; + auto const size_bytes = static_cast(offsets_itr[str_idx + 1] - offset); + + // number of items to process in this block; + // last block also includes any remainder values from the size_bytes/blocks_per_string truncation + // example: + // each section_size for string with size 588090 and blocks_per_string=64 is 9188 + // except the last section which is 9188 + (588090 % 64) = 9246 + auto const section_size = + (size_bytes / blocks_per_string) + + 
(section_idx < (blocks_per_string - 1) ? 0 : size_bytes % blocks_per_string); + auto const section_offset = section_idx * (size_bytes / blocks_per_string); + + // hash values for this block/section + auto const seed_hashes = d_hashes + offset - offsets_itr[0] + section_offset; + // width used here as a max value since a string's char-count <= byte-count + auto const hashes_size = + section_idx < (blocks_per_string - 1) + ? section_size + : cuda::std::max(static_cast(size_bytes > 0), section_size - width + 1); + + auto const init = size_bytes == 0 ? 0 : std::numeric_limits::max(); + auto const lane_idx = block.thread_rank(); + auto const d_output = d_results + (str_idx * parameter_a.size()); + + auto const begin = seed_hashes + lane_idx; + auto const end = seed_hashes + hashes_size; + + // constants used in the permutation calculations + constexpr uint64_t mersenne_prime = (1UL << 61) - 1; + constexpr hash_value_type hash_max = std::numeric_limits::max(); + + // found to be an efficient shared memory size for both hash types + __shared__ hash_value_type block_values[block_size * params_per_thread]; + + for (std::size_t i = 0; i < parameter_a.size(); i += params_per_thread) { + // initialize this block's chunk of shared memory + // each thread handles params_per_thread of values + auto const chunk_values = block_values + (lane_idx * params_per_thread); + thrust::uninitialized_fill(thrust::seq, chunk_values, chunk_values + params_per_thread, init); + block.sync(); + + auto const param_count = + cuda::std::min(static_cast(params_per_thread), parameter_a.size() - i); + + // each lane accumulates min hashes in its shared memory + for (auto itr = begin; itr < end; itr += block_size) { + auto const hv = *itr; + // 0 is used as a skip sentinel for UTF-8 and trailing bytes + if (hv == 0) { continue; } + + for (std::size_t param_idx = i; param_idx < (i + param_count); ++param_idx) { + // permutation formula used by datatrove + hash_value_type const v = + ((hv * 
parameter_a[param_idx] + parameter_b[param_idx]) % mersenne_prime) & hash_max; + auto const block_idx = ((param_idx % params_per_thread) * block_size) + lane_idx; + block_values[block_idx] = cuda::std::min(v, block_values[block_idx]); + } + } + block.sync(); + + // reduce each parameter values vector to a single min value; + // assumes that the block_size > params_per_thread; + // each thread reduces a block_size of parameter values (thread per parameter) + if (lane_idx < param_count) { + auto const values = block_values + (lane_idx * block_size); + // cooperative groups does not have a min function and cub::BlockReduce was slower + auto const minv = + thrust::reduce(thrust::seq, values, values + block_size, init, thrust::minimum{}); + if constexpr (blocks_per_string > 1) { + // accumulates mins for each block into d_output + cuda::atomic_ref ref{d_output[lane_idx + i]}; + ref.fetch_min(minv, cuda::std::memory_order_relaxed); + } else { + d_output[lane_idx + i] = minv; + } + } + block.sync(); + } +} + +template +std::unique_ptr minhash_fn(cudf::strings_column_view const& input, + hash_value_type seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_EXPECTS(width >= 2, + "Parameter width should be an integer value of 2 or greater", + std::invalid_argument); + CUDF_EXPECTS(!parameter_a.empty(), "Parameters A and B cannot be empty", std::invalid_argument); + CUDF_EXPECTS(parameter_a.size() == parameter_b.size(), + "Parameters A and B should have the same number of elements", + std::invalid_argument); + CUDF_EXPECTS( + (static_cast(input.size()) * parameter_a.size()) < + static_cast(std::numeric_limits::max()), + "The number of parameters times the number of input rows exceeds the column size limit", + std::overflow_error); + + auto const output_type = cudf::data_type{cudf::type_to_id()}; + if (input.is_empty()) { return 
cudf::make_empty_column(output_type); } + + auto const d_strings = cudf::column_device_view::create(input.parent(), stream); + + auto results = + cudf::make_numeric_column(output_type, + input.size() * static_cast(parameter_a.size()), + cudf::mask_state::UNALLOCATED, + stream, + mr); + auto d_results = results->mutable_view().data(); + + cudf::detail::grid_1d grid{static_cast(input.size()) * block_size, + block_size}; + auto const hashes_size = input.chars_size(stream); + auto d_hashes = rmm::device_uvector(hashes_size, stream); + auto d_threshold_count = cudf::detail::device_scalar(0, stream); + + minhash_seed_kernel + <<>>(*d_strings, + seed, + width, + d_hashes.data(), + d_threshold_count.data(), + parameter_a.size(), + d_results); + auto const threshold_count = d_threshold_count.value(stream); + + auto indices = rmm::device_uvector(input.size(), stream); + thrust::sequence(rmm::exec_policy(stream), indices.begin(), indices.end()); + cudf::size_type threshold_index = threshold_count < input.size() ? 
input.size() : 0; + + // if we counted a split of above/below threshold then + // compute partitions based on the size of each string + if ((threshold_count > 0) && (threshold_count < input.size())) { + auto sizes = rmm::device_uvector(input.size(), stream); + thrust::transform(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(input.size()), + sizes.data(), + cuda::proclaim_return_type( + [d_strings = *d_strings] __device__(auto idx) -> cudf::size_type { + if (d_strings.is_null(idx)) { return 0; } + return d_strings.element(idx).size_bytes(); + })); + thrust::sort_by_key( + rmm::exec_policy_nosync(stream), sizes.begin(), sizes.end(), indices.begin()); + auto const lb = thrust::lower_bound( + rmm::exec_policy_nosync(stream), sizes.begin(), sizes.end(), wide_string_threshold); + threshold_index = static_cast(thrust::distance(sizes.begin(), lb)); + } + + // handle the strings below the threshold width + if (threshold_index > 0) { + auto d_indices = cudf::device_span(indices.data(), threshold_index); + cudf::detail::grid_1d grid{static_cast(d_indices.size()) * block_size, + block_size}; + minhash_permuted_kernel + <<>>( + *d_strings, d_indices, parameter_a, parameter_b, width, d_hashes.data(), d_results); + } + + // handle the strings above the threshold width + if (threshold_index < input.size()) { + auto const count = static_cast(input.size() - threshold_index); + auto d_indices = + cudf::device_span(indices.data() + threshold_index, count); + cudf::detail::grid_1d grid{count * block_size * blocks_per_string, block_size}; + minhash_permuted_kernel + <<>>( + *d_strings, d_indices, parameter_a, parameter_b, width, d_hashes.data(), d_results); + } + + return results; +} + /** * @brief Compute the minhash of each list row of strings for each seed * @@ -309,6 +647,20 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); } 
+std::unique_ptr minhash(cudf::strings_column_view const& input, + uint32_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32; + auto hashes = + detail::minhash_fn(input, seed, parameter_a, parameter_b, width, stream, mr); + return build_list_result(input.parent(), std::move(hashes), parameter_a.size(), stream, mr); +} + std::unique_ptr minhash64(cudf::strings_column_view const& input, cudf::numeric_scalar const& seed, cudf::size_type width, @@ -333,6 +685,20 @@ std::unique_ptr minhash64(cudf::strings_column_view const& input, return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr); } +std::unique_ptr minhash64(cudf::strings_column_view const& input, + uint64_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128; + auto hashes = + detail::minhash_fn(input, seed, parameter_a, parameter_b, width, stream, mr); + return build_list_result(input.parent(), std::move(hashes), parameter_a.size(), stream, mr); +} + std::unique_ptr word_minhash(cudf::lists_column_view const& input, cudf::device_span seeds, rmm::cuda_stream_view stream, @@ -374,6 +740,18 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, return detail::minhash(input, seeds, width, stream, mr); } +std::unique_ptr minhash_permuted(cudf::strings_column_view const& input, + uint32_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::minhash(input, seed, parameter_a, parameter_b, width, stream, mr); +} + std::unique_ptr minhash64(cudf::strings_column_view 
const& input, cudf::numeric_scalar seed, cudf::size_type width, @@ -394,6 +772,18 @@ std::unique_ptr minhash64(cudf::strings_column_view const& input, return detail::minhash64(input, seeds, width, stream, mr); } +std::unique_ptr minhash64_permuted(cudf::strings_column_view const& input, + uint64_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + cudf::size_type width, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::minhash64(input, seed, parameter_a, parameter_b, width, stream, mr); +} + std::unique_ptr word_minhash(cudf::lists_column_view const& input, cudf::device_span seeds, rmm::cuda_stream_view stream, diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 3a9b930830b..cbca0ceef77 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -610,6 +610,7 @@ ConfigureTest( text/bpe_tests.cpp text/edit_distance_tests.cpp text/jaccard_tests.cpp + text/minhash_tests.cpp text/ngrams_tests.cpp text/ngrams_tokenize_tests.cpp text/normalize_tests.cpp diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp index ef35a4472cf..042ac44621e 100644 --- a/cpp/tests/text/minhash_tests.cpp +++ b/cpp/tests/text/minhash_tests.cpp @@ -28,155 +28,169 @@ struct MinHashTest : public cudf::test::BaseFixture {}; -TEST_F(MinHashTest, Basic) +TEST_F(MinHashTest, Permuted) { - auto validity = cudf::test::iterators::null_at(1); auto input = cudf::test::strings_column_wrapper({"doc 1", - "", "this is doc 2", - "", "doc 3", "d", - "The quick brown fox jumpéd over the lazy brown dog."}, - validity); + "The quick brown fox jumpéd over the lazy brown dog.", + "line six", + "line seven", + "line eight", + "line nine", + "line ten"}); auto view = cudf::strings_column_view(input); - auto results = nvtext::minhash(view); + auto first = thrust::counting_iterator(10); + auto params = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results = + 
nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4); - auto expected = cudf::test::fixed_width_column_wrapper( - {1207251914u, 0u, 21141582u, 0u, 1207251914u, 655955059u, 86520422u}, validity); + using LCW32 = cudf::test::lists_column_wrapper; + // clang-format off + LCW32 expected({ + LCW32{1392101586u, 394869177u, 811528444u}, + LCW32{ 211415830u, 187088503u, 130291444u}, + LCW32{2098117052u, 394869177u, 799753544u}, + LCW32{2264583304u, 2920538364u, 3576493424u}, + LCW32{ 253327882u, 41747273u, 302030804u}, + LCW32{2109809594u, 1017470651u, 326988172u}, + LCW32{1303819864u, 850676747u, 147107852u}, + LCW32{ 736021564u, 720812292u, 1405158760u}, + LCW32{ 902780242u, 134064807u, 1613944636u}, + LCW32{ 547084870u, 1748895564u, 656501844u} + }); + // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto results64 = nvtext::minhash64(view); - auto expected64 = cudf::test::fixed_width_column_wrapper({774489391575805754ul, - 0ul, - 3232308021562742685ul, - 0ul, - 13145552576991307582ul, - 14660046701545912182ul, - 398062025280761388ul}, - validity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); -} + auto params64 = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results64 = nvtext::minhash64_permuted( + view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); -TEST_F(MinHashTest, LengthEqualsWidth) -{ - auto input = cudf::test::strings_column_wrapper({"abcdé", "fghjk", "lmnop", "qrstu", "vwxyz"}); - auto view = cudf::strings_column_view(input); - auto results = nvtext::minhash(view, 0, 5); - auto expected = cudf::test::fixed_width_column_wrapper( - {3825281041u, 2728681928u, 1984332911u, 3965004915u, 192452857u}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + using LCW64 = cudf::test::lists_column_wrapper; + // clang-format off + LCW64 expected64({ + LCW64{ 827364888116975697ul, 1601854279692781452ul, 70500662054893256ul}, + LCW64{ 18312093741021833ul, 
133793446674258329ul, 21974512489226198ul}, + LCW64{ 22474244732520567ul, 1638811775655358395ul, 949306297364502264ul}, + LCW64{1332357434996402861ul, 2157346081260151330ul, 676491718310205848ul}, + LCW64{ 65816830624808020ul, 43323600380520789ul, 63511816333816345ul}, + LCW64{ 629657184954525200ul, 49741036507643002ul, 97466271004074331ul}, + LCW64{ 301611977846331113ul, 101188874709594830ul, 97466271004074331ul}, + LCW64{ 121498891461700668ul, 171065800427907402ul, 97466271004074331ul}, + LCW64{ 54617739511834072ul, 231454301607238929ul, 97466271004074331ul}, + LCW64{ 576418665851990314ul, 231454301607238929ul, 97466271004074331ul} + }); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); } -TEST_F(MinHashTest, MultiSeed) +TEST_F(MinHashTest, PermutedWide) { - auto input = - cudf::test::strings_column_wrapper({"doc 1", - "this is doc 2", - "doc 3", - "d", - "The quick brown fox jumpéd over the lazy brown dog."}); - - auto view = cudf::strings_column_view(input); + std::string const small(2 << 10, 'x'); // below wide_string_threshold + std::string const wide(2 << 19, 'y'); // above wide_string_threshold + auto input = cudf::test::strings_column_wrapper({small, wide}); + auto view = cudf::strings_column_view(input); - auto seeds = cudf::test::fixed_width_column_wrapper({0, 1, 2}); - auto results = nvtext::minhash(view, cudf::column_view(seeds)); + auto first = thrust::counting_iterator(20); + auto params = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results = + nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4); - using LCW = cudf::test::lists_column_wrapper; + using LCW32 = cudf::test::lists_column_wrapper; // clang-format off - LCW expected({LCW{1207251914u, 1677652962u, 1061355987u}, - LCW{ 21141582u, 580916568u, 1258052021u}, - LCW{1207251914u, 943567174u, 1109272887u}, - LCW{ 655955059u, 488346356u, 2394664816u}, - LCW{ 86520422u, 236622901u, 102546228u}}); + LCW32 
expected({ + LCW32{1731998032u, 315359380u, 3193688024u}, + LCW32{1293098788u, 2860992281u, 133918478u} + }); // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto seeds64 = cudf::test::fixed_width_column_wrapper({0, 1, 2}); - auto results64 = nvtext::minhash64(view, cudf::column_view(seeds64)); + auto params64 = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results64 = nvtext::minhash64_permuted( + view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); using LCW64 = cudf::test::lists_column_wrapper; // clang-format off - LCW64 expected64({LCW64{ 774489391575805754ul, 10435654231793485448ul, 1188598072697676120ul}, - LCW64{ 3232308021562742685ul, 4445611509348165860ul, 1188598072697676120ul}, - LCW64{13145552576991307582ul, 6846192680998069919ul, 1188598072697676120ul}, - LCW64{14660046701545912182ul, 17106501326045553694ul, 17713478494106035784ul}, - LCW64{ 398062025280761388ul, 377720198157450084ul, 984941365662009329ul}}); + LCW64 expected64({ + LCW64{1818322427062143853ul, 641024893347719371ul, 1769570368846988848ul}, + LCW64{1389920339306667795ul, 421787002125838902ul, 1759496674158703968ul} + }); // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); } -TEST_F(MinHashTest, MultiSeedWithNullInputRow) +TEST_F(MinHashTest, PermutedManyParameters) { - auto validity = cudf::test::iterators::null_at(1); - auto input = cudf::test::strings_column_wrapper({"abcdéfgh", "", "", "stuvwxyz"}, validity); - auto view = cudf::strings_column_view(input); + std::string const small(2 << 10, 'x'); + std::string const wide(2 << 19, 'y'); + auto input = cudf::test::strings_column_wrapper({small, wide}); + auto view = cudf::strings_column_view(input); - auto seeds = cudf::test::fixed_width_column_wrapper({1, 2}); - auto results = nvtext::minhash(view, cudf::column_view(seeds)); + auto first = thrust::counting_iterator(20); + // more than params_per_thread + auto params = 
cudf::test::fixed_width_column_wrapper(first, first + 31); + auto results = + nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4); - using LCW = cudf::test::lists_column_wrapper; - LCW expected({LCW{484984072u, 1074168784u}, LCW{}, LCW{0u, 0u}, LCW{571652169u, 173528385u}}, - validity); + using LCW32 = cudf::test::lists_column_wrapper; + // clang-format off + LCW32 expected({ + LCW32{1731998032u, 315359380u, 3193688024u, 1777049372u, 360410720u, 3238739364u, 1822100712u, 405462060u, + 3283790704u, 1867152052u, 450513400u, 3328842044u, 1912203392u, 495564740u, 3373893384u, 1957254732u, + 540616080u, 3418944724u, 2002306072u, 585667420u, 3463996064u, 2047357412u, 630718760u, 3509047404u, + 2092408752u, 675770100u, 3554098744u, 2137460092u, 720821440u, 3599150084u, 2182511432u}, + LCW32{1293098788u, 2860992281u, 133918478u, 1701811971u, 3269705464u, 542631661u, 2110525154u, 3678418647u, + 951344844u, 2519238337u, 4087131830u, 1360058027u, 2927951520u, 200877717u, 1768771210u, 3336664703u, + 609590900u, 2177484393u, 3745377886u, 1018304083u, 2586197576u, 4154091069u, 1427017266u, 2994910759u, + 267836956u, 1835730449u, 3403623942u, 676550139u, 2244443632u, 3812337125u, 1085263322u} + }); + // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - auto seeds64 = cudf::test::fixed_width_column_wrapper({11, 22}); - auto results64 = nvtext::minhash64(view, cudf::column_view(seeds64)); + // more than params_per_thread + auto params64 = cudf::test::fixed_width_column_wrapper(first, first + 31); + auto results64 = nvtext::minhash64_permuted( + view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); using LCW64 = cudf::test::lists_column_wrapper; - LCW64 expected64({LCW64{2597399324547032480ul, 4461410998582111052ul}, - LCW64{}, - LCW64{0ul, 0ul}, - LCW64{2717781266371273264ul, 6977325820868387259ul}}, - validity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); -} - -TEST_F(MinHashTest, 
WordsMinHash) -{ - using LCWS = cudf::test::lists_column_wrapper; - auto validity = cudf::test::iterators::null_at(1); - - LCWS input( - {LCWS({"hello", "abcdéfgh"}), - LCWS{}, - LCWS({"rapids", "moré", "test", "text"}), - LCWS({"The", "quick", "brown", "fox", "jumpéd", "over", "the", "lazy", "brown", "dog"})}, - validity); - - auto view = cudf::lists_column_view(input); - - auto seeds = cudf::test::fixed_width_column_wrapper({1, 2}); - auto results = nvtext::word_minhash(view, cudf::column_view(seeds)); - using LCW32 = cudf::test::lists_column_wrapper; - LCW32 expected({LCW32{2069617641u, 1975382903u}, - LCW32{}, - LCW32{657297235u, 1010955999u}, - LCW32{644643885u, 310002789u}}, - validity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - - auto seeds64 = cudf::test::fixed_width_column_wrapper({11, 22}); - auto results64 = nvtext::word_minhash64(view, cudf::column_view(seeds64)); - using LCW64 = cudf::test::lists_column_wrapper; - LCW64 expected64({LCW64{1940333969930105370ul, 272615362982418219ul}, - LCW64{}, - LCW64{5331949571924938590ul, 2088583894581919741ul}, - LCW64{3400468157617183341ul, 2398577492366130055ul}}, - validity); + // clang-format off + LCW64 expected64({ + LCW64{1818322427062143853, 641024893347719371, 1769570368846988848, 592272835132564366, + 1720818310631833835, 543520776917409353, 1672066252416678822, 494768718702254348, + 1623314194201523817, 446016660487099335, 1574562135986368804, 397264602271944322, + 1525810077771213799, 348512544056789317, 1477058019556058786, 299760485841634304, + 1428305961340903773, 251008427626479291, 1379553903125748768, 202256369411324286, + 1330801844910593755, 153504311196169273, 1282049786695438742, 104752252981014268, + 1233297728480283737, 56000194765859255, 1184545670265128724, 7248136550704242, + 1135793612049973719, 2264339087549243188, 1087041553834818706}, + LCW64{1389920339306667795, 421787002125838902, 1759496674158703968, 791363336977875075, + 2129073009010740141, 1160939671829911248, 
192806334649082363, 1530516006681947421, + 562382669501118536, 1900092341533983602, 931959004353154709, 2269668676386019775, + 1301535339205190882, 333402002024361997, 1671111674057227055, 702978336876398170, + 2040688008909263228, 1072554671728434343, 104421334547605450, 1442131006580470516, + 473997669399641631, 1811707341432506689, 843574004251677804, 2181283676284542862, + 1213150339103713977, 245017001922885084, 1582726673955750150, 614593336774921257, + 1952303008807786323, 984169671626957438, 16036334446128545} + }); + // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); } TEST_F(MinHashTest, EmptyTest) { - auto input = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); - auto view = cudf::strings_column_view(input->view()); - auto results = nvtext::minhash(view); + auto input = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); + auto view = cudf::strings_column_view(input->view()); + auto params = cudf::test::fixed_width_column_wrapper({1, 2, 3}); + auto results = + nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4); EXPECT_EQ(results->size(), 0); - results = nvtext::minhash64(view); + auto params64 = cudf::test::fixed_width_column_wrapper({1, 2, 3}); + results = nvtext::minhash64_permuted( + view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); EXPECT_EQ(results->size(), 0); } @@ -184,20 +198,39 @@ TEST_F(MinHashTest, ErrorsTest) { auto input = cudf::test::strings_column_wrapper({"this string intentionally left blank"}); auto view = cudf::strings_column_view(input); - EXPECT_THROW(nvtext::minhash(view, 0, 0), std::invalid_argument); - EXPECT_THROW(nvtext::minhash64(view, 0, 0), std::invalid_argument); - auto seeds = cudf::test::fixed_width_column_wrapper(); - EXPECT_THROW(nvtext::minhash(view, cudf::column_view(seeds)), std::invalid_argument); - auto seeds64 = cudf::test::fixed_width_column_wrapper(); - EXPECT_THROW(nvtext::minhash64(view, 
cudf::column_view(seeds64)), std::invalid_argument); + auto empty = cudf::test::fixed_width_column_wrapper(); + EXPECT_THROW( + nvtext::minhash_permuted(view, 0, cudf::column_view(empty), cudf::column_view(empty), 0), + std::invalid_argument); + auto empty64 = cudf::test::fixed_width_column_wrapper(); + EXPECT_THROW( + nvtext::minhash64_permuted(view, 0, cudf::column_view(empty64), cudf::column_view(empty64), 0), + std::invalid_argument); + EXPECT_THROW( + nvtext::minhash_permuted(view, 0, cudf::column_view(empty), cudf::column_view(empty), 4), + std::invalid_argument); + EXPECT_THROW( + nvtext::minhash64_permuted(view, 0, cudf::column_view(empty64), cudf::column_view(empty64), 4), + std::invalid_argument); std::vector h_input(50000, ""); input = cudf::test::strings_column_wrapper(h_input.begin(), h_input.end()); view = cudf::strings_column_view(input); auto const zeroes = thrust::constant_iterator(0); - seeds = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); - EXPECT_THROW(nvtext::minhash(view, cudf::column_view(seeds)), std::overflow_error); - seeds64 = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); - EXPECT_THROW(nvtext::minhash64(view, cudf::column_view(seeds64)), std::overflow_error); + auto params = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); + EXPECT_THROW( + nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4), + std::overflow_error); + auto params64 = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); + EXPECT_THROW(nvtext::minhash64_permuted( + view, 0, cudf::column_view(params64), cudf::column_view(params64), 4), + std::overflow_error); + + EXPECT_THROW( + nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(empty), 4), + std::invalid_argument); + EXPECT_THROW( + nvtext::minhash64_permuted(view, 0, cudf::column_view(params64), cudf::column_view(empty64), 4), + std::invalid_argument); } diff --git 
a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx index 5e39cafa47b..25cfcf99ca6 100644 --- a/python/cudf/cudf/_lib/nvtext/minhash.pyx +++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx @@ -1,5 +1,7 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. +from libc.stdint cimport uint32_t, uint64_t + from cudf.core.buffer import acquire_spill_lock from cudf._lib.column cimport Column @@ -17,6 +19,19 @@ def minhash(Column input, Column seeds, int width=4): return Column.from_pylibcudf(result) +@acquire_spill_lock() +def minhash_permuted(Column input, uint32_t seed, Column a, Column b, int width): + return Column.from_pylibcudf( + nvtext.minhash.minhash_permuted( + input.to_pylibcudf(mode="read"), + seed, + a.to_pylibcudf(mode="read"), + b.to_pylibcudf(mode="read"), + width, + ) + ) + + @acquire_spill_lock() def minhash64(Column input, Column seeds, int width=4): result = nvtext.minhash.minhash64( @@ -27,6 +42,19 @@ def minhash64(Column input, Column seeds, int width=4): return Column.from_pylibcudf(result) +@acquire_spill_lock() +def minhash64_permuted(Column input, uint64_t seed, Column a, Column b, int width): + return Column.from_pylibcudf( + nvtext.minhash.minhash64_permuted( + input.to_pylibcudf(mode="read"), + seed, + a.to_pylibcudf(mode="read"), + b.to_pylibcudf(mode="read"), + width, + ) + ) + + @acquire_spill_lock() def word_minhash(Column input, Column seeds): result = nvtext.minhash.word_minhash( diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index ffa5e603408..4c0ec2d9ac5 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -9,6 +9,8 @@ from cudf._lib.nvtext.minhash import ( minhash, minhash64, + minhash64_permuted, + minhash_permuted, word_minhash, word_minhash64, ) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 856ce0f75de..3d70b01b7e4 100644 --- 
a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5350,11 +5350,65 @@ def minhash( libstrings.minhash(self._column, seeds_column, width) ) + def minhash_permuted( + self, seed: np.uint32, a: ColumnLike, b: ColumnLike, width: int + ) -> SeriesOrIndex: + """ + Compute the minhash of a strings column. + + This uses the MurmurHash3_x86_32 algorithm for the hash function. + + Calculation uses the formula (hv * a + b) % mersenne_prime + where hv is the hash of a substring of width characters, + a and b are provided values and mersenne_prime is 2^61-1. + + Parameters + ---------- + seed : uint32 + The seed used for the hash algorithm. + a : ColumnLike + Values for minhash calculation. + Must be of type uint32. + b : ColumnLike + Values for minhash calculation. + Must be of type uint32. + width : int + The width of the substring to hash. + + Examples + -------- + >>> import cudf + >>> import numpy as np + >>> s = cudf.Series(['this is my', 'favorite book']) + >>> a = cudf.Series([1, 2, 3], dtype=np.uint32) + >>> b = cudf.Series([4, 5, 6], dtype=np.uint32) + >>> s.str.minhash_permuted(0, a=a, b=b, width=5) + 0 [1305480171, 462824409, 74608232] + 1 [32665388, 65330773, 97996158] + dtype: list + """ + a_column = column.as_column(a) + if a_column.dtype != np.uint32: + raise ValueError( + f"Expecting a Series with dtype uint32, got {type(a)}" + ) + b_column = column.as_column(b) + if b_column.dtype != np.uint32: + raise ValueError( + f"Expecting a Series with dtype uint32, got {type(b)}" + ) + return self._return_or_inplace( + libstrings.minhash_permuted( + self._column, seed, a_column, b_column, width + ) + ) + def minhash64( self, seeds: ColumnLike | None = None, width: int = 4 ) -> SeriesOrIndex: """ Compute the minhash of a strings column. + This uses the MurmurHash3_x64_128 algorithm for the hash function. This function generates 2 uint64 values but only the first uint64 value is used. 
@@ -5390,6 +5444,59 @@ def minhash64( libstrings.minhash64(self._column, seeds_column, width) ) + def minhash64_permuted( + self, seed: np.uint64, a: ColumnLike, b: ColumnLike, width: int + ) -> SeriesOrIndex: + """ + Compute the minhash of a strings column. + This uses the MurmurHash3_x64_128 algorithm for the hash function. + + Calculation uses the formula (hv * a + b) % mersenne_prime + where hv is the hash of a substring of width characters, + a and b are provided values and mersenne_prime is 2^61-1. + + Parameters + ---------- + seed : uint64 + The seed used for the hash algorithm. + a : ColumnLike + Values for minhash calculation. + Must be of type uint64. + b : ColumnLike + Values for minhash calculation. + Must be of type uint64. + width : int + The width of the substring to hash. + + Examples + -------- + >>> import cudf + >>> import numpy as np + >>> s = cudf.Series(['this is my', 'favorite book', 'to read']) + >>> a = cudf.Series([2, 3], dtype=np.uint64) + >>> b = cudf.Series([5, 6], dtype=np.uint64) + >>> s.str.minhash64_permuted(0, a=a, b=b, width=5) + 0 [172452388517576012, 316595762085180527] + 1 [71427536958126239, 58787297728258215] + 2 [423885828176437114, 1140588505926961370] + dtype: list + """ + a_column = column.as_column(a) + if a_column.dtype != np.uint64: + raise ValueError( + f"Expecting a Series with dtype uint64, got {type(a)}" + ) + b_column = column.as_column(b) + if b_column.dtype != np.uint64: + raise ValueError( + f"Expecting a Series with dtype uint64, got {type(b)}" + ) + return self._return_or_inplace( + libstrings.minhash64_permuted( + self._column, seed, a_column, b_column, width + ) + ) + def word_minhash(self, seeds: ColumnLike | None = None) -> SeriesOrIndex: """ Compute the minhash of a list column of strings. 
diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index 997ca357986..47e541fdcef 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -882,68 +882,48 @@ def test_is_vowel_consonant(): assert_eq(expected, actual) -def test_minhash(): +def test_minhash_permuted(): strings = cudf.Series(["this is my", "favorite book", None, ""]) + params = cudf.Series([1, 2, 3], dtype=np.uint32) expected = cudf.Series( [ - cudf.Series([21141582], dtype=np.uint32), - cudf.Series([962346254], dtype=np.uint32), - None, - cudf.Series([0], dtype=np.uint32), - ] - ) - actual = strings.str.minhash() - assert_eq(expected, actual) - seeds = cudf.Series([0, 1, 2], dtype=np.uint32) - expected = cudf.Series( - [ - cudf.Series([1305480167, 668155704, 34311509], dtype=np.uint32), - cudf.Series([32665384, 3470118, 363147162], dtype=np.uint32), + cudf.Series([1305480168, 462824406, 74608229], dtype=np.uint32), + cudf.Series([32665385, 65330770, 97996155], dtype=np.uint32), None, cudf.Series([0, 0, 0], dtype=np.uint32), ] ) - actual = strings.str.minhash(seeds=seeds, width=5) + actual = strings.str.minhash_permuted(0, a=params, b=params, width=5) assert_eq(expected, actual) - expected = cudf.Series( - [ - cudf.Series([3232308021562742685], dtype=np.uint64), - cudf.Series([23008204270530356], dtype=np.uint64), - None, - cudf.Series([0], dtype=np.uint64), - ] - ) - actual = strings.str.minhash64() - assert_eq(expected, actual) - seeds = cudf.Series([0, 1, 2], dtype=np.uint64) + params = cudf.Series([1, 2, 3], dtype=np.uint64) expected = cudf.Series( [ cudf.Series( - [7082801294247314046, 185949556058924788, 167570629329462454], + [105531920695060180, 172452388517576009, 316595762085180524], dtype=np.uint64, ), cudf.Series( - [382665377781028452, 86243762733551437, 7688750597953083512], + [35713768479063122, 71427536958126236, 58787297728258212], dtype=np.uint64, ), None, 
cudf.Series([0, 0, 0], dtype=np.uint64), ] ) - actual = strings.str.minhash64(seeds=seeds, width=5) + actual = strings.str.minhash64_permuted(0, a=params, b=params, width=5) assert_eq(expected, actual) # test wrong seed types with pytest.raises(ValueError): - strings.str.minhash(seeds="a") + strings.str.minhash_permuted(1, a="a", b="b", width=7) with pytest.raises(ValueError): - seeds = cudf.Series([0, 1, 2], dtype=np.int32) - strings.str.minhash(seeds=seeds) + params = cudf.Series([0, 1, 2], dtype=np.int32) + strings.str.minhash_permuted(1, a=params, b=params, width=6) with pytest.raises(ValueError): - seeds = cudf.Series([0, 1, 2], dtype=np.uint32) - strings.str.minhash64(seeds=seeds) + params = cudf.Series([0, 1, 2], dtype=np.uint32) + strings.str.minhash64_permuted(1, a=params, b=params, width=8) def test_word_minhash(): diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd index 41250037dcf..ebf8eda1ce3 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd @@ -22,6 +22,14 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: const size_type width, ) except + + cdef unique_ptr[column] minhash_permuted( + const column_view &strings, + const uint32_t seed, + const column_view &a, + const column_view &b, + const size_type width, + ) except + + cdef unique_ptr[column] minhash64( const column_view &strings, const column_view &seeds, @@ -34,6 +42,14 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: const size_type width, ) except + + cdef unique_ptr[column] minhash64_permuted( + const column_view &strings, + const uint64_t seed, + const column_view &a, + const column_view &b, + const size_type width, + ) except + + cdef unique_ptr[column] word_minhash( const column_view &input, const column_view &seeds diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd 
b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd index 97e8c9dc83c..6b544282f44 100644 --- a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd @@ -11,8 +11,24 @@ ctypedef fused ColumnOrScalar: cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=*) +cpdef Column minhash_permuted( + Column input, + uint32_t seed, + Column a, + Column b, + size_type width +) + cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=*) +cpdef Column minhash64_permuted( + Column input, + uint64_t seed, + Column a, + Column b, + size_type width +) + cpdef Column word_minhash(Column input, Column seeds) cpdef Column word_minhash64(Column input, Column seeds) diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx index f1e012e60e5..5a51e32b287 100644 --- a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx @@ -8,6 +8,8 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.nvtext.minhash cimport ( minhash as cpp_minhash, minhash64 as cpp_minhash64, + minhash64_permuted as cpp_minhash64_permuted, + minhash_permuted as cpp_minhash_permuted, word_minhash as cpp_word_minhash, word_minhash64 as cpp_word_minhash64, ) @@ -16,6 +18,7 @@ from pylibcudf.libcudf.types cimport size_type from pylibcudf.scalar cimport Scalar from cython.operator import dereference +import warnings cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4): @@ -40,6 +43,12 @@ cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4): Column List column of minhash values for each string per seed """ + warnings.warn( + "Starting in version 25.02, the signature of this function will " + "be changed to match pylibcudf.nvtext.minhash_permuted.", + FutureWarning + ) + cdef unique_ptr[column] c_result if not isinstance(seeds, (Column, Scalar)): @@ -55,6 +64,50 @@ cpdef Column 
minhash(Column input, ColumnOrScalar seeds, size_type width=4): return Column.from_libcudf(move(c_result)) +cpdef Column minhash_permuted( + Column input, + uint32_t seed, + Column a, + Column b, + size_type width +): + """ + Returns the minhash values for each string. + This function uses MurmurHash3_x86_32 for the hash algorithm. + + For details, see :cpp:func:`minhash_permuted`. + + Parameters + ---------- + input : Column + Strings column to compute minhash + seed : uint32_t + Seed used for the hash function + a : Column + 1st parameter value used for the minhash algorithm. + b : Column + 2nd parameter value used for the minhash algorithm. + width : size_type + Character width of the substrings to hash. + + Returns + ------- + Column + List column of minhash values for each string, one per (a, b) pair + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_minhash_permuted( + input.view(), + seed, + a.view(), + b.view(), + width + ) + + return Column.from_libcudf(move(c_result)) + cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4): """ Returns the minhash values for each string per seed. @@ -77,6 +130,12 @@ cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4): Column List column of minhash values for each string per seed """ + warnings.warn( + "Starting in version 25.02, the signature of this function will " + "be changed to match pylibcudf.nvtext.minhash64_permuted.", + FutureWarning + ) + cdef unique_ptr[column] c_result if not isinstance(seeds, (Column, Scalar)): @@ -92,6 +151,50 @@ cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4): return Column.from_libcudf(move(c_result)) +cpdef Column minhash64_permuted( + Column input, + uint64_t seed, + Column a, + Column b, + size_type width +): + """ + Returns the minhash values for each string. + This function uses MurmurHash3_x64_128 for the hash algorithm. + + For details, see :cpp:func:`minhash64_permuted`.
+ + Parameters + ---------- + input : Column + Strings column to compute minhash + seed : uint64_t + Seed used for the hash function + a : Column + 1st parameter value used for the minhash algorithm. + b : Column + 2nd parameter value used for the minhash algorithm. + width : size_type + Character width of the substrings to hash. + + Returns + ------- + Column + List column of minhash values for each string, one per (a, b) pair + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_minhash64_permuted( + input.view(), + seed, + a.view(), + b.view(), + width + ) + + return Column.from_libcudf(move(c_result)) + cpdef Column word_minhash(Column input, Column seeds): """ Returns the minhash values for each row of strings per seed. diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py index ead9ee094af..ec533e64307 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py @@ -21,15 +21,19 @@ def word_minhash_input_data(request): @pytest.mark.parametrize("width", [5, 12]) -def test_minhash(minhash_input_data, width): +def test_minhash_permuted(minhash_input_data, width): input_arr, seeds, seed_type = minhash_input_data minhash_func = ( - plc.nvtext.minhash.minhash + plc.nvtext.minhash.minhash_permuted if seed_type == pa.uint32() - else plc.nvtext.minhash.minhash64 + else plc.nvtext.minhash.minhash64_permuted ) result = minhash_func( - plc.interop.from_arrow(input_arr), plc.interop.from_arrow(seeds), width + plc.interop.from_arrow(input_arr), + 0, + plc.interop.from_arrow(seeds), + plc.interop.from_arrow(seeds), + width, ) pa_result = plc.interop.to_arrow(result) assert all(len(got) == len(seeds) for got, s in zip(pa_result, input_arr))