Skip to content

Commit

Permalink
feat(gpu): implement subarray search
Browse files Browse the repository at this point in the history
  • Loading branch information
guillermo-oyarzun committed Dec 18, 2024
1 parent 9195753 commit 419edbf
Show file tree
Hide file tree
Showing 15 changed files with 3,286 additions and 10 deletions.
36 changes: 36 additions & 0 deletions backends/tfhe-cuda-backend/cuda/include/integer/integer.h
Original file line number Diff line number Diff line change
Expand Up @@ -440,5 +440,41 @@ void cleanup_cuda_integer_abs_inplace(void *const *streams,
uint32_t gpu_count,
int8_t **mem_ptr_void);

void scratch_cuda_integer_are_all_comparisons_block_true_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory);

void cuda_integer_are_all_comparisons_block_true_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void const *lwe_array_in, int8_t *mem_ptr,
void *const *bsks, void *const *ksks, uint32_t num_radix_blocks);

void cleanup_cuda_integer_are_all_comparisons_block_true(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);

void scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory);

void cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void const *lwe_array_in, int8_t *mem_ptr,
void *const *bsks, void *const *ksks, uint32_t num_radix_blocks);

void cleanup_cuda_integer_is_at_least_one_comparisons_block_true(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);

} // extern C
#endif // CUDA_INTEGER_H
88 changes: 88 additions & 0 deletions backends/tfhe-cuda-backend/cuda/src/integer/comparison.cu
Original file line number Diff line number Diff line change
Expand Up @@ -94,3 +94,91 @@ void cleanup_cuda_integer_comparison(void *const *streams,
(int_comparison_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
}

void scratch_cuda_integer_are_all_comparisons_block_true_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory) {

int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus);

scratch_cuda_integer_radix_comparison_check_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_comparison_buffer<uint64_t> **)mem_ptr, num_radix_blocks, params, EQ,
false, allocate_gpu_memory);
}

void cuda_integer_are_all_comparisons_block_true_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void const *lwe_array_in, int8_t *mem_ptr,
void *const *bsks, void *const *ksks, uint32_t num_radix_blocks) {

int_comparison_buffer<uint64_t> *buffer =
(int_comparison_buffer<uint64_t> *)mem_ptr;

host_integer_are_all_comparisons_block_true_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_array_in), buffer, bsks,
(uint64_t **)(ksks), num_radix_blocks);
}

void cleanup_cuda_integer_are_all_comparisons_block_true(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {

int_comparison_buffer<uint64_t> *mem_ptr =
(int_comparison_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
}

void scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t big_lwe_dimension, uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory) {

int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus);

scratch_cuda_integer_radix_comparison_check_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_comparison_buffer<uint64_t> **)mem_ptr, num_radix_blocks, params, EQ,
false, allocate_gpu_memory);
}

void cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void const *lwe_array_in, int8_t *mem_ptr,
void *const *bsks, void *const *ksks, uint32_t num_radix_blocks) {

int_comparison_buffer<uint64_t> *buffer =
(int_comparison_buffer<uint64_t> *)mem_ptr;

host_integer_is_at_least_one_comparisons_block_true_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_array_in), buffer, bsks,
(uint64_t **)(ksks), num_radix_blocks);
}

void cleanup_cuda_integer_is_at_least_one_comparisons_block_true(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {

int_comparison_buffer<uint64_t> *mem_ptr =
(int_comparison_buffer<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
}
35 changes: 33 additions & 2 deletions backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ __host__ void accumulate_all_blocks(cudaStream_t stream, uint32_t gpu_index,
template <typename Torus>
__host__ void are_all_comparisons_block_true(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_array_in,
uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks, uint32_t num_radix_blocks) {

Expand Down Expand Up @@ -167,7 +167,7 @@ __host__ void are_all_comparisons_block_true(
template <typename Torus>
__host__ void is_at_least_one_comparisons_block_true(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus *lwe_array_in,
uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks, uint32_t num_radix_blocks) {

Expand Down Expand Up @@ -626,4 +626,35 @@ __host__ void host_integer_radix_maxmin_kb(
mem_ptr->cmux_buffer, bsks, ksks, total_num_radix_blocks);
}

template <typename Torus>
__host__ void host_integer_are_all_comparisons_block_true_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks, uint32_t num_radix_blocks) {

auto eq_buffer = mem_ptr->eq_buffer;

// It returns a block encrypting 1 if all input blocks are 1
// otherwise the block encrypts 0
are_all_comparisons_block_true<Torus>(streams, gpu_indexes, gpu_count,
lwe_array_out, lwe_array_in, mem_ptr,
bsks, ksks, num_radix_blocks);
}

template <typename Torus>
__host__ void host_integer_is_at_least_one_comparisons_block_true_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks, uint32_t num_radix_blocks) {

auto eq_buffer = mem_ptr->eq_buffer;

// It returns a block encrypting 1 if all input blocks are 1
// otherwise the block encrypts 0
is_at_least_one_comparisons_block_true<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, mem_ptr,
bsks, ksks, num_radix_blocks);
}
#endif
86 changes: 86 additions & 0 deletions backends/tfhe-cuda-backend/src/bindings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1083,6 +1083,92 @@ extern "C" {
mem_ptr_void: *mut *mut i8,
);
}
extern "C" {
pub fn scratch_cuda_integer_are_all_comparisons_block_true_kb_64(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
polynomial_size: u32,
big_lwe_dimension: u32,
small_lwe_dimension: u32,
ks_level: u32,
ks_base_log: u32,
pbs_level: u32,
pbs_base_log: u32,
grouping_factor: u32,
num_radix_blocks: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: PBS_TYPE,
allocate_gpu_memory: bool,
);
}
extern "C" {
pub fn cuda_integer_are_all_comparisons_block_true_kb_64(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
lwe_array_out: *mut ffi::c_void,
lwe_array_in: *const ffi::c_void,
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
num_radix_blocks: u32,
);
}
extern "C" {
pub fn cleanup_cuda_integer_are_all_comparisons_block_true(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
mem_ptr_void: *mut *mut i8,
);
}
extern "C" {
pub fn scratch_cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
mem_ptr: *mut *mut i8,
glwe_dimension: u32,
polynomial_size: u32,
big_lwe_dimension: u32,
small_lwe_dimension: u32,
ks_level: u32,
ks_base_log: u32,
pbs_level: u32,
pbs_base_log: u32,
grouping_factor: u32,
num_radix_blocks: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: PBS_TYPE,
allocate_gpu_memory: bool,
);
}
extern "C" {
pub fn cuda_integer_is_at_least_one_comparisons_block_true_kb_64(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
lwe_array_out: *mut ffi::c_void,
lwe_array_in: *const ffi::c_void,
mem_ptr: *mut i8,
bsks: *const *mut ffi::c_void,
ksks: *const *mut ffi::c_void,
num_radix_blocks: u32,
);
}
extern "C" {
pub fn cleanup_cuda_integer_is_at_least_one_comparisons_block_true(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
mem_ptr_void: *mut *mut i8,
);
}
extern "C" {
pub fn cuda_keyswitch_lwe_ciphertext_vector_32(
stream: *mut ffi::c_void,
Expand Down
36 changes: 30 additions & 6 deletions tfhe/src/high_level_api/integers/unsigned/base.rs
Original file line number Diff line number Diff line change
Expand Up @@ -779,9 +779,22 @@ where
}
}
#[cfg(feature = "gpu")]
InternalServerKey::Cuda(_) => {
panic!("Cuda devices do not support match_value yet");
}
InternalServerKey::Cuda(cuda_key) => with_thread_local_cuda_streams(|streams| {
let (result, matched) =
cuda_key
.key
.key
.match_value(&self.ciphertext.on_gpu(), matches, streams);
let target_num_blocks = OutId::num_blocks(cuda_key.key.key.message_modulus);
if target_num_blocks >= result.ciphertext.d_blocks.lwe_ciphertext_count().0 {
Ok((
FheUint::new(result, cuda_key.tag.clone()),
FheBool::new(matched, cuda_key.tag.clone()),
))
} else {
Err(crate::Error::new("Output type does not have enough bits to represent all possible output values".to_string()))
}
}),
})
}

Expand Down Expand Up @@ -844,9 +857,20 @@ where
}
}
#[cfg(feature = "gpu")]
InternalServerKey::Cuda(_) => {
panic!("Cuda devices do not support match_value_or yet");
}
InternalServerKey::Cuda(cuda_key) => with_thread_local_cuda_streams(|streams| {
let result = cuda_key.key.key.match_value_or(
&self.ciphertext.on_gpu(),
matches,
or_value,
streams,
);
let target_num_blocks = OutId::num_blocks(cuda_key.key.key.message_modulus);
if target_num_blocks >= result.ciphertext.d_blocks.lwe_ciphertext_count().0 {
Ok(FheUint::new(result, cuda_key.tag.clone()))
} else {
Err(crate::Error::new("Output type does not have enough bits to represent all possible output values".to_string()))
}
}),
})
}

Expand Down
37 changes: 37 additions & 0 deletions tfhe/src/integer/gpu/ciphertext/info.rs
Original file line number Diff line number Diff line change
Expand Up @@ -525,6 +525,43 @@ impl CudaRadixCiphertextInfo {
}
}

pub(crate) fn after_block_comparisons(&self) -> Self {
Self {
blocks: self
.blocks
.iter()
.enumerate()
.map(|(i, block)| CudaBlockInfo {
degree: if i == 0 {
Degree::new(1)
} else {
Degree::new(0)
},
message_modulus: block.message_modulus,
carry_modulus: block.carry_modulus,
pbs_order: block.pbs_order,
noise_level: NoiseLevel::NOMINAL,
})
.collect(),
}
}

pub(crate) fn after_aggregate_one_hot_vector(&self) -> Self {
Self {
blocks: self
.blocks
.iter()
.map(|left| CudaBlockInfo {
degree: Degree::new(left.message_modulus.0 - 1),
message_modulus: left.message_modulus,
carry_modulus: left.carry_modulus,
pbs_order: left.pbs_order,
noise_level: NoiseLevel::NOMINAL,
})
.collect(),
}
}

pub(crate) fn after_ne(&self) -> Self {
Self {
blocks: self
Expand Down
Loading

0 comments on commit 419edbf

Please sign in to comment.