From 5a7e8ab43a20f5cdd236c39858b4773c522a4bfc Mon Sep 17 00:00:00 2001 From: Agnes Leroy Date: Thu, 5 Sep 2024 13:52:35 +0200 Subject: [PATCH] chore(gpu): pass over all cuda bind --- .../tfhe-cuda-backend/cuda/include/device.h | 10 +- .../cuda/include/helper_multi_gpu.h | 2 +- .../cuda/src/utils/helper_multi_gpu.cu | 4 +- backends/tfhe-cuda-backend/src/cuda_bind.rs | 1150 +++++++---------- 4 files changed, 504 insertions(+), 662 deletions(-) diff --git a/backends/tfhe-cuda-backend/cuda/include/device.h b/backends/tfhe-cuda-backend/cuda/include/device.h index 203db79b82..fd7dbf1e28 100644 --- a/backends/tfhe-cuda-backend/cuda/include/device.h +++ b/backends/tfhe-cuda-backend/cuda/include/device.h @@ -39,10 +39,6 @@ void *cuda_malloc_async(uint64_t size, cudaStream_t stream, uint32_t gpu_index); void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index); -bool cuda_check_support_cooperative_groups(); - -bool cuda_check_support_thread_block_clusters(); - void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size, cudaStream_t stream, uint32_t gpu_index); @@ -62,9 +58,13 @@ void cuda_synchronize_device(uint32_t gpu_index); void cuda_drop(void *ptr, uint32_t gpu_index); void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index); +} int cuda_get_max_shared_memory(uint32_t gpu_index); -} + +bool cuda_check_support_cooperative_groups(); + +bool cuda_check_support_thread_block_clusters(); template void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index, diff --git a/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h b/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h index cbdd17fbf8..788fe416b3 100644 --- a/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h +++ b/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h @@ -8,7 +8,7 @@ extern std::mutex m; extern bool p2p_enabled; extern "C" { -int cuda_setup_multi_gpu(); +int32_t cuda_setup_multi_gpu(); } // Define a variant type that can be either a vector or a single pointer diff --git a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cu b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cu index a6d6cdd540..04b3d9b488 100644 --- a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cu +++ b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cu @@ -6,7 +6,7 @@ std::mutex m; bool p2p_enabled = false; -int cuda_setup_multi_gpu() { +int32_t cuda_setup_multi_gpu() { int num_gpus = cuda_get_number_of_gpus(); if (num_gpus == 0) PANIC("GPU error: the number of GPUs should be > 0.") @@ -32,7 +32,7 @@ int cuda_setup_multi_gpu() { } m.unlock(); } - return num_used_gpus; + return (int32_t)(num_used_gpus); } int get_active_gpu_count(int num_inputs, int gpu_count) { diff --git a/backends/tfhe-cuda-backend/src/cuda_bind.rs b/backends/tfhe-cuda-backend/src/cuda_bind.rs index 17353d48af..740d99bc2f 100644 --- a/backends/tfhe-cuda-backend/src/cuda_bind.rs +++ b/backends/tfhe-cuda-backend/src/cuda_bind.rs @@ -3,18 +3,19 @@ use std::ffi::c_void; #[link(name = "tfhe_cuda_backend", kind = "static")] extern "C" { - /// Create a new Cuda stream on GPU `gpu_index` pub fn cuda_create_stream(gpu_index: u32) -> *mut c_void; - /// Destroy the Cuda stream `v_stream` pub fn cuda_destroy_stream(stream: *mut c_void, gpu_index: u32); - /// Allocate `size` memory on GPU `gpu_index` asynchronously + pub fn cuda_synchronize_stream(stream: *mut c_void, gpu_index: u32); + + pub fn cuda_malloc(size: u64, gpu_index: u32) -> *mut c_void; + pub fn cuda_malloc_async(size: u64, stream: *mut c_void, gpu_index: u32) -> *mut c_void; - /// Copy `size` memory asynchronously from `src` on GPU `gpu_index` to `dest` on CPU using - /// the Cuda stream `v_stream`. - pub fn cuda_memcpy_async_to_cpu( + pub fn cuda_check_valid_malloc(size: u64, gpu_index: u32); + + pub fn cuda_memcpy_async_to_gpu( dest: *mut c_void, src: *const c_void, size: u64, @@ -22,9 +23,7 @@ extern "C" { gpu_index: u32, ); - /// Copy `size` memory asynchronously from `src` on CPU to `dest` on GPU `gpu_index` using - /// the Cuda stream `v_stream`. - pub fn cuda_memcpy_async_to_gpu( + pub fn cuda_memcpy_async_gpu_to_gpu( dest: *mut c_void, src: *const c_void, size: u64, @@ -32,9 +31,7 @@ extern "C" { gpu_index: u32, ); - /// Copy `size` memory asynchronously from `src` to `dest` on the same GPU `gpu_index` using - /// the Cuda stream `v_stream`. - pub fn cuda_memcpy_async_gpu_to_gpu( + pub fn cuda_memcpy_async_to_cpu( dest: *mut c_void, src: *const c_void, size: u64, @@ -42,65 +39,22 @@ extern "C" { gpu_index: u32, ); - /// Copy `size` memory asynchronously from `src` on CPU to `dest` on GPU `gpu_index` using - /// the Cuda stream `v_stream`. pub fn cuda_memset_async( dest: *mut c_void, - value: u64, + val: u64, size: u64, stream: *mut c_void, gpu_index: u32, ); - /// Get the total number of Nvidia GPUs detected on the platform pub fn cuda_get_number_of_gpus() -> i32; - /// Synchronize all streams on GPU `gpu_index` pub fn cuda_synchronize_device(gpu_index: u32); - /// Synchronize Cuda stream - pub fn cuda_synchronize_stream(stream: *mut c_void, gpu_index: u32); - - /// Free memory for pointer `ptr` on GPU `gpu_index` asynchronously, using stream `v_stream` - pub fn cuda_drop_async(ptr: *mut c_void, stream: *mut c_void, gpu_index: u32); - - /// Free memory for pointer `ptr` on GPU `gpu_index` synchronously pub fn cuda_drop(ptr: *mut c_void, gpu_index: u32); - pub fn cuda_setup_multi_gpu() -> i32; - - /// Copy a bootstrap key `src` represented with 64 bits in the standard domain from the CPU to - /// the GPU `gpu_index` using the stream `v_stream`, and convert it to the Fourier domain on the - /// GPU. The resulting bootstrap key `dest` on the GPU is an array of f64 values. - pub fn cuda_convert_lwe_programmable_bootstrap_key_64( - stream: *mut c_void, - gpu_index: u32, - dest: *mut c_void, - src: *const c_void, - input_lwe_dim: u32, - glwe_dim: u32, - level_count: u32, - polynomial_size: u32, - ); - - /// Copy a multi-bit bootstrap key `src` represented with 64 bits in the standard domain from - /// the CPU to the GPU `gpu_index` using the stream `v_stream`. The resulting bootstrap key - /// `dest` on the GPU is an array of uint64_t values. - pub fn cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64( - stream: *mut c_void, - gpu_index: u32, - dest: *mut c_void, - src: *const c_void, - input_lwe_dim: u32, - glwe_dim: u32, - level_count: u32, - polynomial_size: u32, - grouping_factor: u32, - ); + pub fn cuda_drop_async(ptr: *mut c_void, stream: *mut c_void, gpu_index: u32); - /// Copy `number_of_cts` LWE ciphertext represented with 64 bits in the standard domain from the - /// CPU to the GPU `gpu_index` using the stream `v_stream`. All ciphertexts must be - /// concatenated. pub fn cuda_convert_lwe_ciphertext_vector_to_gpu_64( stream: *mut c_void, gpu_index: u32, @@ -110,9 +64,6 @@ extern "C" { lwe_dimension: u32, ); - /// Copy `number_of_cts` LWE ciphertext represented with 64 bits in the standard domain from the - /// GPU to the CPU `gpu_index` using the stream `v_stream`. All ciphertexts must be - /// concatenated. pub fn cuda_convert_lwe_ciphertext_vector_to_cpu_64( stream: *mut c_void, gpu_index: u32, @@ -122,400 +73,15 @@ extern "C" { lwe_dimension: u32, ); - /// This scratch function allocates the necessary amount of data on the GPU for - /// the low latency PBS on 64-bit inputs, into `pbs_buffer`. It also configures SM - /// options on the GPU in case FULLSM or PARTIALSM mode are going to be used. - pub fn scratch_cuda_programmable_bootstrap_64( - stream: *mut c_void, - gpu_index: u32, - pbs_buffer: *mut *mut i8, - glwe_dimension: u32, - polynomial_size: u32, - level_count: u32, - input_lwe_ciphertext_count: u32, - allocate_gpu_memory: bool, - ); - - /// Perform bootstrapping on a batch of input u64 LWE ciphertexts. - /// - /// - `v_stream` is a void pointer to the Cuda stream to be used in the kernel launch - /// - `gpu_index` is the index of the GPU to be used in the kernel launch - /// - `lwe_array_out`: output batch of num_samples bootstrapped ciphertexts c = (a0,..an-1,b) - /// where n is the LWE dimension - /// - `lut_vector`: should hold as many test vectors of size polynomial_size as there are input - /// ciphertexts, but actually holds `num_lut_vectors` vectors to reduce memory usage - /// - `lut_vector_indexes`: stores the index corresponding to which test vector to use for each - /// sample in `lut_vector` - /// - `lwe_array_in`: input batch of num_samples LWE ciphertexts, containing n mask values + 1 - /// body value - /// - `bootstrapping_key`: GGSW encryption of the LWE secret key sk1 under secret key sk2. bsk = - /// Z + sk1 H where H is the gadget matrix and Z is a matrix (k+1).l containing GLWE - /// encryptions of 0 under sk2. bsk is thus a tensor of size (k+1)^2.l.N.n where l is the - /// number of decomposition levels and k is the GLWE dimension, N is the polynomial size for - /// GLWE. The polynomial size for GLWE and the test vector are the same because they have to - /// be in the same ring to be multiplied. - /// - `pbs_buffer`: a preallocated buffer to store temporary results - /// - `lwe_dimension`: size of the Torus vector used to encrypt the input LWE ciphertexts - - /// referred to as n above (~ 600) - /// - `glwe_dimension`: size of the polynomial vector used to encrypt the LUT GLWE ciphertexts - - /// referred to as k above. Only the value 1 is supported for this parameter. - /// - `polynomial_size`: size of the test polynomial (test vector) and size of the GLWE - /// polynomial (~1024) - /// - `base_log`: log base used for the gadget matrix - B = 2^base_log (~8) - /// - `level_count`: number of decomposition levels in the gadget matrix (~4) - /// - `num_samples`: number of encrypted input messages - /// - /// This function calls a wrapper to a device kernel that performs the - /// bootstrapping: - /// - the kernel is templatized based on integer discretization and polynomial degree - /// - num_samples * level_count * (glwe_dimension + 1) blocks of threads are launched, where - /// each thread is going to handle one or more polynomial coefficients at each stage, for a - /// given level of decomposition, either for the LUT mask or its body: - /// - perform the blind rotation - /// - round the result - /// - get the decomposition for the current level - /// - switch to the FFT domain - /// - multiply with the bootstrapping key - /// - come back to the coefficients representation - /// - between each stage a synchronization of the threads is necessary (some synchronizations - /// happen at the block level, some happen between blocks, using cooperative groups). - /// - in case the device has enough shared memory, temporary arrays used for the different - /// stages (accumulators) are stored into the shared memory - /// - the accumulators serve to combine the results for all decomposition levels - /// - the constant memory (64K) is used for storing the roots of identity values for the FFT - pub fn cuda_programmable_bootstrap_lwe_ciphertext_vector_64( - stream: *mut c_void, - gpu_index: u32, - lwe_array_out: *mut c_void, - lwe_output_indexes: *const c_void, - lut_vector: *const c_void, - lut_vector_indexes: *const c_void, - lwe_array_in: *const c_void, - lwe_input_indexes: *const c_void, - bootstrapping_key: *const c_void, - pbs_buffer: *mut i8, - lwe_dimension: u32, - glwe_dimension: u32, - polynomial_size: u32, - base_log: u32, - level: u32, - num_samples: u32, - ); - - /// This cleanup function frees the data for the low latency PBS on GPU - /// contained in pbs_buffer for 32 or 64-bit inputs. - pub fn cleanup_cuda_programmable_bootstrap( - stream: *mut c_void, - gpu_index: u32, - pbs_buffer: *mut *mut i8, - ); - - /// This scratch function allocates the necessary amount of data on the GPU for - /// the multi-bit PBS on 64-bit inputs into `pbs_buffer`. - pub fn scratch_cuda_multi_bit_programmable_bootstrap_64( - stream: *mut c_void, - gpu_index: u32, - pbs_buffer: *mut *mut i8, - lwe_dimension: u32, - glwe_dimension: u32, - polynomial_size: u32, - level_count: u32, - grouping_factor: u32, - input_lwe_ciphertext_count: u32, - allocate_gpu_memory: bool, - ); - - /// Perform bootstrapping on a batch of input u64 LWE ciphertexts using the multi-bit algorithm. - /// - /// - `v_stream` is a void pointer to the Cuda stream to be used in the kernel launch - /// - `gpu_index` is the index of the GPU to be used in the kernel launch - /// - `lwe_array_out`: output batch of num_samples bootstrapped ciphertexts c = (a0,..an-1,b) - /// where n is the LWE dimension - /// - `lut_vector`: should hold as many test vectors of size polynomial_size as there are input - /// ciphertexts, but actually holds `num_lut_vectors` vectors to reduce memory usage - /// - `lut_vector_indexes`: stores the index corresponding to which test vector to use for each - /// sample in `lut_vector` - /// - `lwe_array_in`: input batch of num_samples LWE ciphertexts, containing n mask values + 1 - /// body value - /// - `bootstrapping_key`: GGSW encryption of elements of the LWE secret key as in the classical - /// PBS, but this time we follow Zhou's trick and encrypt combinations of elements of the key - /// - `pbs_buffer`: a preallocated buffer to store temporary results - /// - `lwe_dimension`: size of the Torus vector used to encrypt the input LWE ciphertexts - - /// referred to as n above (~ 600) - /// - `glwe_dimension`: size of the polynomial vector used to encrypt the LUT GLWE ciphertexts - - /// referred to as k above. Only the value 1 is supported for this parameter. - /// - `polynomial_size`: size of the test polynomial (test vector) and size of the GLWE - /// polynomial (~1024) - /// - `grouping_factor`: number of elements of the LWE secret key combined per GGSW of the - /// bootstrap key - /// - `base_log`: log base used for the gadget matrix - B = 2^base_log (~8) - /// - `level_count`: number of decomposition levels in the gadget matrix (~4) - /// - `num_samples`: number of encrypted input messages - pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( - stream: *mut c_void, - gpu_index: u32, - lwe_array_out: *mut c_void, - lwe_output_indexes: *const c_void, - lut_vector: *const c_void, - lut_vector_indexes: *const c_void, - lwe_array_in: *const c_void, - lwe_input_indexes: *const c_void, - bootstrapping_key: *const c_void, - pbs_buffer: *mut i8, - lwe_dimension: u32, - glwe_dimension: u32, - polynomial_size: u32, - grouping_factor: u32, - base_log: u32, - level: u32, - num_samples: u32, - ); - - /// This cleanup function frees the data for the multi-bit PBS on GPU - /// contained in pbs_buffer for 64-bit inputs. - pub fn cleanup_cuda_multi_bit_programmable_bootstrap( - stream: *mut c_void, - gpu_index: u32, - pbs_buffer: *mut *mut i8, - ); - - /// Perform keyswitch on a batch of 64 bits input LWE ciphertexts. - /// - /// - `v_stream` is a void pointer to the Cuda stream to be used in the kernel launch - /// - `gpu_index` is the index of the GPU to be used in the kernel launch - /// - `lwe_array_out`: output batch of num_samples keyswitched ciphertexts c = (a0,..an-1,b) - /// where n is the output LWE dimension (lwe_dimension_out) - /// - `lwe_array_in`: input batch of num_samples LWE ciphertexts, containing lwe_dimension_in - /// mask values + 1 body value - /// - `ksk`: the keyswitch key to be used in the operation - /// - `base_log`: the log of the base used in the decomposition (should be the one used to - /// create the ksk). - /// - `level_count`: the number of levels used in the decomposition (should be the one used to - /// create the ksk). - /// - `num_samples`: the number of input and output LWE ciphertexts. - /// - /// This function calls a wrapper to a device kernel that performs the keyswitch. - /// `num_samples` blocks of threads are launched - pub fn cuda_keyswitch_lwe_ciphertext_vector_64( - stream: *mut c_void, - gpu_index: u32, - lwe_array_out: *mut c_void, - lwe_output_indexes: *const c_void, - lwe_array_in: *const c_void, - lwe_input_indexes: *const c_void, - keyswitch_key: *const c_void, - input_lwe_dimension: u32, - output_lwe_dimension: u32, - base_log: u32, - level_count: u32, - num_samples: u32, - ); - - /// This scratch function allocates the necessary amount of data on the GPU for - /// the public function packing keyswitch implementation on 64-bit - pub fn scratch_packing_keyswitch_lwe_list_to_glwe_64( - stream: *mut c_void, - gpu_index: u32, - fp_ks_buffer: *mut *mut i8, - glwe_dimension: u32, - polynomial_size: u32, - input_lwe_ciphertext_count: u32, - allocate_gpu_memory: bool, - ); - - /// Perform public functional packing keyswitch on a vector of 64-bit LWE ciphertexts - pub fn cuda_packing_keyswitch_lwe_list_to_glwe_64( - stream: *mut c_void, - gpu_index: u32, - glwe_array_out: *mut c_void, - lwe_array_in: *const c_void, - fp_ksk_array: *const c_void, - fp_ks_buffer: *mut i8, - input_lwe_dimension: u32, - output_glwe_dimension: u32, - polynomial_size: u32, - base_log: u32, - level_count: u32, - num_lwes: u32, - ); - - pub fn cleanup_packing_keyswitch_lwe_list_to_glwe( - stream: *mut c_void, - gpu_index: u32, - fp_ks_buffer: *mut *mut i8, - ); - - /// Perform the negation of a u64 input LWE ciphertext vector. - /// - `v_stream` is a void pointer to the Cuda stream to be used in the kernel launch - /// - `gpu_index` is the index of the GPU to be used in the kernel launch - /// - `lwe_array_out` is an array of size `(input_lwe_dimension + 1) * - /// input_lwe_ciphertext_count` that should have been allocated on the GPU before calling this - /// function, and that will hold the result of the computation. - /// - `lwe_array_in` is the LWE ciphertext vector used as input, it should have been allocated - /// and initialized before calling this function. It has the same size as the output array. - /// - `input_lwe_dimension` is the number of mask elements in the two input and in the output - /// ciphertext vectors - /// - `input_lwe_ciphertext_count` is the number of ciphertexts contained in each input LWE - /// ciphertext vector, as well as in the output. - /// - /// Each element (mask element or body) of the input LWE ciphertext vector is negated. - /// The result is stored in the output LWE ciphertext vector. The input LWE ciphertext vector - /// is left unchanged. This function is a wrapper to a device function that performs the - /// operation on the GPU. - pub fn cuda_negate_lwe_ciphertext_vector_64( - stream: *mut c_void, - gpu_index: u32, - lwe_array_out: *mut c_void, - lwe_array_in: *const c_void, - input_lwe_dimension: u32, - input_lwe_ciphertext_count: u32, - ); - - pub fn cuda_negate_integer_radix_ciphertext_64_inplace( - streams: *const *mut c_void, - gpu_index: *const u32, - gpu_indexes: u32, - lwe_array: *mut c_void, - lwe_dimension: u32, - lwe_ciphertext_count: u32, - message_modulus: u32, - carry_modulus: u32, - ); - - /// Perform the addition of two u64 input LWE ciphertext vectors. - /// - `v_stream` is a void pointer to the Cuda stream to be used in the kernel launch - /// - `gpu_index` is the index of the GPU to be used in the kernel launch - /// - `lwe_array_out` is an array of size `(input_lwe_dimension + 1) * - /// input_lwe_ciphertext_count` that should have been allocated on the GPU before calling this - /// function, and that will hold the result of the computation. - /// - `lwe_array_in_1` is the first LWE ciphertext vector used as input, it should have been - /// allocated and initialized before calling this function. It has the same size as the output - /// array. - /// - `lwe_array_in_2` is the second LWE ciphertext vector used as input, it should have been - /// allocated and initialized before calling this function. It has the same size as the output - /// array. - /// - `input_lwe_dimension` is the number of mask elements in the two input and in the output - /// ciphertext vectors - /// - `input_lwe_ciphertext_count` is the number of ciphertexts contained in each input LWE - /// ciphertext vector, as well as in the output. - /// - /// Each element (mask element or body) of the input LWE ciphertext vector 1 is added to the - /// corresponding element in the input LWE ciphertext 2. The result is stored in the output LWE - /// ciphertext vector. The two input LWE ciphertext vectors are left unchanged. This function is - /// a wrapper to a device function that performs the operation on the GPU. - pub fn cuda_add_lwe_ciphertext_vector_64( - stream: *mut c_void, - gpu_index: u32, - lwe_array_out: *mut c_void, - lwe_array_in_1: *const c_void, - lwe_array_in_2: *const c_void, - input_lwe_dimension: u32, - input_lwe_ciphertext_count: u32, - ); - - /// Perform the addition of a u64 input LWE ciphertext vector with a u64 input plaintext vector. - /// - `v_stream` is a void pointer to the Cuda stream to be used in the kernel launch - /// - `gpu_index` is the index of the GPU to be used in the kernel launch - /// - `lwe_array_out` is an array of size `(input_lwe_dimension + 1) * - /// input_lwe_ciphertext_count` that should have been allocated on the GPU before calling this - /// function, and that will hold the result of the computation. - /// - `lwe_array_in` is the LWE ciphertext vector used as input, it should have been allocated - /// and initialized before calling this function. It has the same size as the output array. - /// - `plaintext_array_in` is the plaintext vector used as input, it should have been allocated - /// and initialized before calling this function. It should be of size - /// `input_lwe_ciphertext_count`. - /// - `input_lwe_dimension` is the number of mask elements in the input and output LWE - /// ciphertext vectors - /// - `input_lwe_ciphertext_count` is the number of ciphertexts contained in the input LWE - /// ciphertext vector, as well as in the output. It is also the number of plaintexts in the - /// input plaintext vector. - /// - /// Each plaintext of the input plaintext vector is added to the body of the corresponding LWE - /// ciphertext in the LWE ciphertext vector. The result of the operation is stored in the output - /// LWE ciphertext vector. The two input vectors are unchanged. This function is a - /// wrapper to a device function that performs the operation on the GPU. - pub fn cuda_add_lwe_ciphertext_vector_plaintext_vector_64( - stream: *mut c_void, - gpu_index: u32, - lwe_array_out: *mut c_void, - lwe_array_in: *const c_void, - plaintext_array_in: *const c_void, - input_lwe_dimension: u32, - input_lwe_ciphertext_count: u32, - ); - - /// Perform the multiplication of a u64 input LWE ciphertext vector with a u64 input cleartext - /// vector. - /// - `v_stream` is a void pointer to the Cuda stream to be used in the kernel launch - /// - `gpu_index` is the index of the GPU to be used in the kernel launch - /// - `lwe_array_out` is an array of size `(input_lwe_dimension + 1) * - /// input_lwe_ciphertext_count` that should have been allocated on the GPU before calling this - /// function, and that will hold the result of the computation. - /// - `lwe_array_in` is the LWE ciphertext vector used as input, it should have been allocated - /// and initialized before calling this function. It has the same size as the output array. - /// - `cleartext_array_in` is the cleartext vector used as input, it should have been allocated - /// and initialized before calling this function. It should be of size - /// `input_lwe_ciphertext_count`. - /// - `input_lwe_dimension` is the number of mask elements in the input and output LWE - /// ciphertext vectors - /// - `input_lwe_ciphertext_count` is the number of ciphertexts contained in the input LWE - /// ciphertext vector, as well as in the output. It is also the number of cleartexts in the - /// input cleartext vector. - /// - /// Each cleartext of the input cleartext vector is multiplied to the mask and body of the - /// corresponding LWE ciphertext in the LWE ciphertext vector. - /// The result of the operation is stored in the output - /// LWE ciphertext vector. The two input vectors are unchanged. This function is a - /// wrapper to a device function that performs the operation on the GPU. - pub fn cuda_mult_lwe_ciphertext_vector_cleartext_vector_64( + pub fn cuda_glwe_sample_extract_64( stream: *mut c_void, gpu_index: u32, lwe_array_out: *mut c_void, - lwe_array_in: *const c_void, - cleartext_array_in: *const c_void, - input_lwe_dimension: u32, - input_lwe_ciphertext_count: u32, - ); - - pub fn scratch_cuda_integer_mult_radix_ciphertext_kb_64( - streams: *const *mut c_void, - gpu_indexes: *const u32, - gpu_count: u32, - mem_ptr: *mut *mut i8, - message_modulus: u32, - carry_modulus: u32, + glwe_array_in: *const c_void, + nth_array: *const u32, + num_glwes: u32, glwe_dimension: u32, - lwe_dimension: u32, polynomial_size: u32, - pbs_base_log: u32, - pbs_level: u32, - ks_base_log: u32, - ks_level: u32, - grouping_factor: u32, - num_blocks: u32, - pbs_type: u32, - allocate_gpu_memory: bool, - ); - - pub fn cuda_integer_mult_radix_ciphertext_kb_64( - streams: *const *mut c_void, - gpu_indexes: *const u32, - gpu_count: u32, - radix_lwe_out: *mut c_void, - radix_lwe_left: *const c_void, - radix_lwe_right: *const c_void, - bsks: *const *mut c_void, - ksks: *const *mut c_void, - mem_ptr: *mut i8, - polynomial_size: u32, - num_blocks: u32, - ); - - pub fn cleanup_cuda_integer_mult( - streams: *const *mut c_void, - gpu_indexes: *const u32, - gpu_count: u32, - mem_ptr: *mut *mut i8, ); pub fn scratch_cuda_integer_compress_radix_ciphertext_64( @@ -536,6 +102,7 @@ extern "C" { storage_log_modulus: u32, allocate_gpu_memory: bool, ); + pub fn scratch_cuda_integer_decompress_radix_ciphertext_64( streams: *const *mut c_void, gpu_indexes: *const u32, @@ -563,7 +130,7 @@ extern "C" { glwe_array_out: *mut c_void, lwe_array_in: *const c_void, fp_ksk: *const *mut c_void, - num_lwes: u32, + num_nths: u32, mem_ptr: *mut i8, ); @@ -571,8 +138,8 @@ extern "C" { streams: *const *mut c_void, gpu_indexes: *const u32, gpu_count: u32, - lwe_out: *mut c_void, - glwe_array_in: *const c_void, + lwe_array_out: *mut c_void, + glwe_in: *const c_void, indexes_array: *const c_void, indexes_array_size: u32, bsks: *const *mut c_void, @@ -585,6 +152,7 @@ extern "C" { gpu_count: u32, mem_ptr: *mut *mut i8, ); + pub fn cleanup_cuda_integer_decompress_radix_ciphertext_64( streams: *const *mut c_void, gpu_indexes: *const u32, @@ -592,191 +160,54 @@ extern "C" { mem_ptr: *mut *mut i8, ); - pub fn cuda_scalar_addition_integer_radix_ciphertext_64_inplace( - streams: *const *mut c_void, - gpu_indexes: *const u32, - gpu_count: u32, - lwe_array: *mut c_void, - scalar_input: *const c_void, - lwe_dimension: u32, - lwe_ciphertext_count: u32, - message_modulus: u32, - carry_modulus: u32, - ); + pub fn cuda_setup_multi_gpu() -> i32; - pub fn scratch_cuda_integer_scalar_mul_kb_64( + pub fn scratch_cuda_apply_univariate_lut_kb_64( streams: *const *mut c_void, gpu_indexes: *const u32, gpu_count: u32, mem_ptr: *mut *mut i8, - glwe_dimension: u32, - polynomial_size: u32, - lwe_dimension: u32, - ks_level: u32, - ks_base_log: u32, - pbs_level: u32, - pbs_base_log: u32, - grouping_factor: u32, - num_blocks: u32, - message_modulus: u32, - carry_modulus: u32, - pbs_type: u32, - allocate_gpu_memory: bool, - ); - - pub fn cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace( - streams: *const *mut c_void, - gpu_indexes: *const u32, - gpu_count: u32, - lwe_array: *mut c_void, - decomposed_scalar: *const u64, - has_at_least_one_set: *const u64, - mem: *mut i8, - bsks: *const *mut c_void, - ksks: *const *mut c_void, + input_lut: *const c_void, lwe_dimension: u32, - polynomial_size: u32, - message_modulus: u32, - num_blocks: u32, - num_scalars: u32, - ); - - pub fn cleanup_cuda_integer_radix_scalar_mul( - streams: *const *mut c_void, - gpu_indexes: *const u32, - gpu_count: u32, - mem_ptr: *mut *mut i8, - ); - - pub fn scratch_cuda_integer_radix_bitop_kb_64( - streams: *const *mut c_void, - gpu_indexes: *const u32, - gpu_count: u32, - mem_ptr: *mut *mut i8, - glwe_dimension: u32, - polynomial_size: u32, - big_lwe_dimension: u32, - small_lwe_dimension: u32, - ks_level: u32, - ks_base_log: u32, - pbs_level: u32, - pbs_base_log: u32, - grouping_factor: u32, - num_blocks: u32, - message_modulus: u32, - carry_modulus: u32, - pbs_type: u32, - op_type: u32, - allocate_gpu_memory: bool, - ); - - pub fn cuda_bitop_integer_radix_ciphertext_kb_64( - streams: *const *mut c_void, - gpu_indexes: *const u32, - gpu_count: u32, - radix_lwe_out: *mut c_void, - radix_lwe_left: *const c_void, - radix_lwe_right: *const c_void, - mem_ptr: *mut i8, - bsks: *const *mut c_void, - ksks: *const *mut c_void, - num_blocks: u32, - ); - - pub fn cuda_scalar_bitop_integer_radix_ciphertext_kb_64( - streams: *const *mut c_void, - gpu_indexes: *const u32, - gpu_count: u32, - radix_lwe_output: *mut c_void, - radix_lwe_input: *mut c_void, - clear_blocks: *const c_void, - num_clear_blocks: u32, - mem_ptr: *mut i8, - bsks: *const *mut c_void, - ksks: *const *mut c_void, - num_blocks: u32, - op_type: u32, - ); - - pub fn cleanup_cuda_integer_bitop( - streams: *const *mut c_void, - gpu_indexes: *const u32, - gpu_count: u32, - mem_ptr: *mut *mut i8, - ); - pub fn cuda_glwe_sample_extract_64( - stream: *mut c_void, - gpu_index: u32, - lwe_array_out: *mut c_void, - glwe_array_in: *const c_void, - nth_array: *const u32, - num_glwes: u32, glwe_dimension: u32, polynomial_size: u32, - ); - - pub fn scratch_cuda_integer_radix_comparison_kb_64( - streams: *const *mut c_void, - gpu_indexes: *const u32, - gpu_count: u32, - mem_ptr: *mut *mut i8, - glwe_dimension: u32, - polynomial_size: u32, - big_lwe_dimension: u32, - small_lwe_dimension: u32, ks_level: u32, ks_base_log: u32, pbs_level: u32, pbs_base_log: u32, grouping_factor: u32, - num_blocks: u32, + input_lwe_ciphertext_count: u32, message_modulus: u32, carry_modulus: u32, pbs_type: u32, - op_type: u32, - is_signed: bool, allocate_gpu_memory: bool, ); - pub fn cuda_comparison_integer_radix_ciphertext_kb_64( + pub fn cuda_apply_univariate_lut_kb_64( streams: *const *mut c_void, gpu_indexes: *const u32, gpu_count: u32, - radix_lwe_out: *mut c_void, - radix_lwe_left: *const c_void, - radix_lwe_right: *const c_void, + output_radix_lwe: *mut c_void, + input_radix_lwe: *const c_void, mem_ptr: *mut i8, - bsks: *const *mut c_void, ksks: *const *mut c_void, + bsks: *const *mut c_void, num_blocks: u32, ); - pub fn cleanup_cuda_integer_comparison( + pub fn cleanup_cuda_apply_univariate_lut_kb_64( streams: *const *mut c_void, gpu_indexes: *const u32, gpu_count: u32, mem_ptr: *mut *mut i8, ); - pub fn cuda_scalar_comparison_integer_radix_ciphertext_kb_64( - streams: *const *mut c_void, - gpu_indexes: *const u32, - gpu_count: u32, - radix_lwe_out: *mut c_void, - radix_lwe_in: *const c_void, - scalar_blocks: *const c_void, - mem_ptr: *mut i8, - bsks: *const *mut c_void, - ksks: *const *mut c_void, - num_blocks: u32, - num_scalar_blocks: u32, - ); - - pub fn scratch_cuda_full_propagation_64( + pub fn scratch_cuda_apply_bivariate_lut_kb_64( streams: *const *mut c_void, gpu_indexes: *const u32, gpu_count: u32, mem_ptr: *mut *mut i8, + input_lut: *const c_void, lwe_dimension: u32, glwe_dimension: u32, polynomial_size: u32, @@ -785,36 +216,39 @@ extern "C" { pbs_level: u32, pbs_base_log: u32, grouping_factor: u32, + input_lwe_ciphertext_count: u32, message_modulus: u32, carry_modulus: u32, pbs_type: u32, allocate_gpu_memory: bool, ); - pub fn cuda_full_propagation_64_inplace( + pub fn cuda_apply_bivariate_lut_kb_64( streams: *const *mut c_void, gpu_indexes: *const u32, gpu_count: u32, - radix_lwe_right: *mut c_void, + output_radix_lwe: *mut c_void, + input_radix_lwe_1: *const c_void, + input_radix_lwe_2: *const c_void, mem_ptr: *mut i8, ksks: *const *mut c_void, bsks: *const *mut c_void, num_blocks: u32, + shift: u32, ); - pub fn cleanup_cuda_full_propagation( + pub fn cleanup_cuda_apply_bivariate_lut_kb_64( streams: *const *mut c_void, gpu_indexes: *const u32, gpu_count: u32, mem_ptr: *mut *mut i8, ); - pub fn scratch_cuda_apply_univariate_lut_kb_64( + pub fn scratch_cuda_full_propagation_64( streams: *const *mut c_void, gpu_indexes: *const u32, gpu_count: u32, mem_ptr: *mut *mut i8, - input_lut: *const c_void, lwe_dimension: u32, glwe_dimension: u32, polynomial_size: u32, @@ -823,74 +257,94 @@ extern "C" { pbs_level: u32, pbs_base_log: u32, grouping_factor: u32, - num_blocks: u32, message_modulus: u32, carry_modulus: u32, pbs_type: u32, allocate_gpu_memory: bool, ); - pub fn cuda_apply_univariate_lut_kb_64( + pub fn cuda_full_propagation_64_inplace( streams: *const *mut c_void, gpu_indexes: *const u32, gpu_count: u32, - output_radix_lwe: *mut c_void, - input_radix_lwe: *const c_void, + input_blocks: *mut c_void, mem_ptr: *mut i8, ksks: *const *mut c_void, bsks: *const *mut c_void, num_blocks: u32, ); - pub fn cleanup_cuda_apply_univariate_lut_kb_64( + pub fn cleanup_cuda_full_propagation( streams: *const *mut c_void, gpu_indexes: *const u32, gpu_count: u32, mem_ptr: *mut *mut i8, ); - pub fn scratch_cuda_apply_bivariate_lut_kb_64( + pub fn scratch_cuda_integer_mult_radix_ciphertext_kb_64( streams: *const *mut c_void, gpu_indexes: *const u32, gpu_count: u32, mem_ptr: *mut *mut i8, - input_lut: *const c_void, - lwe_dimension: u32, + message_modulus: u32, + carry_modulus: u32, glwe_dimension: u32, + lwe_dimension: u32, polynomial_size: u32, - ks_level: u32, - ks_base_log: u32, - pbs_level: u32, pbs_base_log: u32, + pbs_level: u32, + ks_base_log: u32, + ks_level: u32, grouping_factor: u32, num_blocks: u32, - message_modulus: u32, - carry_modulus: u32, pbs_type: u32, allocate_gpu_memory: bool, ); - pub fn cuda_apply_bivariate_lut_kb_64( + pub fn cuda_integer_mult_radix_ciphertext_kb_64( streams: *const *mut c_void, gpu_indexes: *const u32, gpu_count: u32, - output_radix_lwe: *mut c_void, - input_radix_lwe_1: *const c_void, - input_radix_lwe_2: *const c_void, - mem_ptr: *mut i8, - ksks: *const *mut c_void, + radix_lwe_out: *mut c_void, + radix_lwe_left: *const c_void, + radix_lwe_right: *const c_void, bsks: *const *mut c_void, + ksks: *const *mut c_void, + mem_ptr: *mut i8, + polynomial_size: u32, num_blocks: u32, - shift: u32, ); - pub fn cleanup_cuda_apply_bivariate_lut_kb_64( + pub fn cleanup_cuda_integer_mult( streams: *const *mut c_void, gpu_indexes: *const u32, gpu_count: u32, mem_ptr: *mut *mut i8, ); + pub fn cuda_negate_integer_radix_ciphertext_64_inplace( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + lwe_array: *mut c_void, + lwe_dimension: u32, + lwe_ciphertext_count: u32, + message_modulus: u32, + carry_modulus: u32, + ); + + pub fn cuda_scalar_addition_integer_radix_ciphertext_64_inplace( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + lwe_array: *mut c_void, + scalar_input: *const c_void, + lwe_dimension: u32, + lwe_ciphertext_count: u32, + message_modulus: u32, + carry_modulus: u32, + ); + pub fn scratch_cuda_integer_radix_logical_scalar_shift_kb_64( streams: *const *mut c_void, gpu_indexes: *const u32, @@ -917,7 +371,7 @@ extern "C" { streams: *const *mut c_void, gpu_indexes: *const u32, gpu_count: u32, - radix_lwe: *mut c_void, + lwe_array: *mut c_void, shift: u32, mem_ptr: *mut i8, bsks: *const *mut c_void, @@ -951,7 +405,7 @@ extern "C" { streams: *const *mut c_void, gpu_indexes: *const u32, gpu_count: u32, - radix_lwe: *mut c_void, + lwe_array: *mut c_void, shift: u32, mem_ptr: *mut i8, bsks: *const *mut c_void, @@ -1000,8 +454,8 @@ extern "C" { streams: *const *mut c_void, gpu_indexes: *const u32, gpu_count: u32, - radix_lwe: *mut c_void, - radix_shift: *const c_void, + lwe_array: *mut c_void, + lwe_shift: *const c_void, mem_ptr: *mut i8, bsks: *const *mut c_void, ksks: *const *mut c_void, @@ -1015,6 +469,120 @@ extern "C" { mem_ptr: *mut *mut i8, ); + pub fn scratch_cuda_integer_radix_comparison_kb_64( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + mem_ptr: *mut *mut i8, + glwe_dimension: u32, + polynomial_size: u32, + big_lwe_dimension: u32, + small_lwe_dimension: u32, + ks_level: u32, + ks_base_log: u32, + pbs_level: u32, + pbs_base_log: u32, + grouping_factor: u32, + lwe_ciphertext_count: u32, + message_modulus: u32, + carry_modulus: u32, + pbs_type: u32, + op_type: u32, + is_signed: bool, + allocate_gpu_memory: bool, + ); + + pub fn cuda_comparison_integer_radix_ciphertext_kb_64( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + lwe_array_out: *mut c_void, + lwe_array_1: *const c_void, + lwe_array_2: *const c_void, + mem_ptr: *mut i8, + bsks: *const *mut c_void, + ksks: *const *mut c_void, + lwe_ciphertext_count: u32, + ); + + pub fn cuda_scalar_comparison_integer_radix_ciphertext_kb_64( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + lwe_array_out: *mut c_void, + lwe_array_in: *const c_void, + scalar_blocks: *const c_void, + mem_ptr: *mut i8, + bsks: *const *mut c_void, + ksks: *const *mut c_void, + lwe_ciphertext_count: u32, + num_scalar_blocks: u32, + ); + + pub fn cleanup_cuda_integer_comparison( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + mem_ptr: *mut *mut i8, + ); + + pub fn scratch_cuda_integer_radix_bitop_kb_64( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + mem_ptr: *mut *mut i8, + glwe_dimension: u32, + polynomial_size: u32, + big_lwe_dimension: u32, + small_lwe_dimension: u32, + ks_level: u32, + ks_base_log: u32, + pbs_level: u32, + pbs_base_log: u32, + grouping_factor: u32, + lwe_ciphertext_count: u32, + message_modulus: u32, + carry_modulus: u32, + pbs_type: u32, + op_type: u32, + allocate_gpu_memory: bool, + ); + + pub fn cuda_bitop_integer_radix_ciphertext_kb_64( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + lwe_array_out: *mut c_void, + lwe_array_1: *const c_void, + lwe_array_2: *const c_void, + mem_ptr: *mut i8, + bsks: *const *mut c_void, + ksks: *const *mut c_void, + lwe_ciphertext_count: u32, + ); + + pub fn cuda_scalar_bitop_integer_radix_ciphertext_kb_64( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + lwe_array_out: *mut c_void, + lwe_array_input: *const c_void, + clear_blocks: *const c_void, + num_clear_blocks: u32, + mem_ptr: *mut i8, + bsks: *const *mut c_void, + ksks: *const *mut c_void, + lwe_ciphertext_count: u32, + op: u32, + ); + + pub fn cleanup_cuda_integer_bitop( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + mem_ptr: *mut *mut i8, + ); + pub fn scratch_cuda_integer_radix_cmux_kb_64( streams: *const *mut c_void, gpu_indexes: *const u32, @@ -1029,7 +597,7 @@ extern "C" { pbs_level: u32, pbs_base_log: u32, grouping_factor: u32, - num_blocks: u32, + lwe_ciphertext_count: u32, message_modulus: u32, carry_modulus: u32, pbs_type: u32, @@ -1047,7 +615,7 @@ extern "C" { mem_ptr: *mut i8, bsks: *const *mut c_void, ksks: *const *mut c_void, - num_blocks: u32, + lwe_ciphertext_count: u32, ); pub fn cleanup_cuda_integer_radix_cmux( @@ -1083,7 +651,7 @@ extern "C" { streams: *const *mut c_void, gpu_indexes: *const u32, gpu_count: u32, - radix_lwe: *mut c_void, + lwe_array: *mut c_void, n: u32, mem_ptr: *mut i8, bsks: *const *mut c_void, @@ -1123,7 +691,7 @@ extern "C" { streams: *const *mut c_void, gpu_indexes: *const u32, gpu_count: u32, - radix_lwe: *mut c_void, + lwe_array: *mut c_void, carry_out: *mut c_void, mem_ptr: *mut i8, bsks: *const *mut c_void, @@ -1135,7 +703,7 @@ extern "C" { streams: *const *mut c_void, gpu_indexes: *const u32, gpu_count: u32, - radix_lwe: *mut c_void, + lwe_array: *mut c_void, carry_out: *mut c_void, input_carries: *mut c_void, mem_ptr: *mut i8, @@ -1164,43 +732,84 @@ extern "C" { pbs_level: u32, pbs_base_log: u32, grouping_factor: u32, - num_blocks_in_radix: u32, - max_num_radix_in_vec: u32, + num_blocks_in_radix: u32, + max_num_radix_in_vec: u32, + message_modulus: u32, + carry_modulus: u32, + pbs_type: u32, + allocate_gpu_memory: bool, + ); + + pub fn cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + radix_lwe_out: *mut c_void, + radix_lwe_vec: *const c_void, + num_radix_in_vec: u32, + mem_ptr: *mut i8, + bsks: *const *mut c_void, + ksks: *const *mut c_void, + num_blocks_in_radix: u32, + ); + + pub fn cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + mem_ptr: *mut *mut i8, + ); + + pub fn scratch_cuda_integer_radix_overflowing_sub_kb_64( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + mem_ptr: *mut *mut i8, + glwe_dimension: u32, + polynomial_size: u32, + big_lwe_dimension: u32, + small_lwe_dimension: u32, + ks_level: u32, + ks_base_log: u32, + pbs_level: u32, + pbs_base_log: u32, + grouping_factor: u32, + num_blocks: u32, message_modulus: u32, carry_modulus: u32, pbs_type: u32, allocate_gpu_memory: bool, ); - pub fn cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64( + pub fn cuda_integer_radix_overflowing_sub_kb_64( streams: *const *mut c_void, gpu_indexes: *const u32, gpu_count: u32, radix_lwe_out: *mut c_void, - radix_lwe_vec: *mut c_void, - num_radix_in_vec: u32, + radix_lwe_overflowed: *mut c_void, + radix_lwe_left: *const c_void, + radix_lwe_right: *const c_void, mem_ptr: *mut i8, bsks: *const *mut c_void, ksks: *const *mut c_void, num_blocks_in_radix: u32, ); - pub fn cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec( + pub fn cleanup_cuda_integer_radix_overflowing_sub( streams: *const *mut c_void, gpu_indexes: *const u32, gpu_count: u32, mem_ptr: *mut *mut i8, ); - pub fn scratch_cuda_integer_radix_overflowing_sub_kb_64( + pub fn scratch_cuda_integer_scalar_mul_kb_64( streams: *const *mut c_void, gpu_indexes: *const u32, gpu_count: u32, mem_ptr: *mut *mut i8, glwe_dimension: u32, polynomial_size: u32, - big_lwe_dimension: u32, - small_lwe_dimension: u32, + lwe_dimension: u32, ks_level: u32, ks_base_log: u32, pbs_level: u32, @@ -1213,21 +822,24 @@ extern "C" { allocate_gpu_memory: bool, ); - pub fn cuda_integer_radix_overflowing_sub_kb_64( + pub fn cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace( streams: *const *mut c_void, gpu_indexes: *const u32, gpu_count: u32, - radix_lwe_out: *mut c_void, - radix_lwe_overflowed: *mut c_void, - radix_lwe_left: *const c_void, - radix_lwe_right: *const c_void, + lwe_array: *mut c_void, + decomposed_scalar: *const u64, + has_at_least_one_set: *const u64, mem_ptr: *mut i8, bsks: *const *mut c_void, ksks: *const *mut c_void, + lwe_dimension: u32, + polynomial_size: u32, + message_modulus: u32, num_blocks: u32, + num_scalars: u32, ); - pub fn cleanup_cuda_integer_radix_overflowing_sub( + pub fn cleanup_cuda_integer_radix_scalar_mul( streams: *const *mut c_void, gpu_indexes: *const u32, gpu_count: u32, @@ -1266,7 +878,7 @@ extern "C" { mem_ptr: *mut i8, bsks: *const *mut c_void, ksks: *const *mut c_void, - num_blocks: u32, + num_blocks_in_radix: u32, ); pub fn cleanup_cuda_integer_div_rem( @@ -1309,7 +921,7 @@ extern "C" { mem_ptr: *mut i8, bsks: *const *mut c_void, ksks: *const *mut c_void, - num_blocks: u32, + num_blocks_in_radix: u32, ); pub fn cleanup_signed_overflowing_add_or_sub( @@ -1318,6 +930,7 @@ extern "C" { gpu_count: u32, mem_ptr: *mut *mut i8, ); + pub fn scratch_cuda_integer_compute_prefix_sum_hillis_steele_64( streams: *const *mut c_void, gpu_indexes: *const u32, @@ -1332,7 +945,7 @@ extern "C" { pbs_level: u32, pbs_base_log: u32, grouping_factor: u32, - num_blocks: u32, + num_radix_blocks: u32, message_modulus: u32, carry_modulus: u32, pbs_type: u32, @@ -1363,8 +976,237 @@ extern "C" { streams: *const *mut c_void, gpu_indexes: *const u32, gpu_count: u32, - output_radix_lwe: *mut c_void, + lwe_array: *mut c_void, num_blocks: u32, lwe_size: u32, ); + + pub fn cuda_keyswitch_lwe_ciphertext_vector_64( + stream: *mut c_void, + gpu_index: u32, + lwe_array_out: *mut c_void, + lwe_output_indexes: *const c_void, + lwe_array_in: *const c_void, + lwe_input_indexes: *const c_void, + ksk: *const c_void, + lwe_dimension_in: u32, + lwe_dimension_out: u32, + base_log: u32, + level_count: u32, + num_samples: u32, + ); + + pub fn scratch_packing_keyswitch_lwe_list_to_glwe_64( + stream: *mut c_void, + gpu_index: u32, + fp_ks_buffer: *mut *mut i8, + glwe_dimension: u32, + polynomial_size: u32, + num_lwes: u32, + allocate_gpu_memory: bool, + ); + + pub fn cuda_packing_keyswitch_lwe_list_to_glwe_64( + stream: *mut c_void, + gpu_index: u32, + glwe_array_out: *mut c_void, + lwe_array_in: *const c_void, + fp_ksk_array: *const c_void, + fp_ks_buffer: *mut i8, + input_lwe_dimension: u32, + output_glwe_dimension: u32, + output_polynomial_size: u32, + base_log: u32, + level_count: u32, + num_lwes: u32, + ); + + pub fn cleanup_packing_keyswitch_lwe_list_to_glwe( + stream: *mut c_void, + gpu_index: u32, + fp_ks_buffer: *mut *mut i8, + ); + + pub fn cuda_negate_lwe_ciphertext_vector_64( + stream: *mut c_void, + gpu_index: u32, + lwe_array_out: *mut c_void, + lwe_array_in: *const c_void, + input_lwe_dimension: u32, + input_lwe_ciphertext_count: u32, + ); + + pub fn cuda_add_lwe_ciphertext_vector_64( + stream: *mut c_void, + gpu_index: u32, + lwe_array_out: *mut c_void, + lwe_array_in_1: *const c_void, + lwe_array_in_2: *const c_void, + input_lwe_dimension: u32, + input_lwe_ciphertext_count: u32, + ); + + pub fn cuda_add_lwe_ciphertext_vector_plaintext_vector_64( + stream: *mut c_void, + gpu_index: u32, + lwe_array_out: *mut c_void, + lwe_array_in: *const c_void, + plaintext_array_in: *const c_void, + input_lwe_dimension: u32, + input_lwe_ciphertext_count: u32, + ); + + pub fn cuda_mult_lwe_ciphertext_vector_cleartext_vector_64( + stream: *mut c_void, + gpu_index: u32, + lwe_array_out: *mut c_void, + lwe_array_in: *const c_void, + cleartext_array_in: *const c_void, + input_lwe_dimension: u32, + input_lwe_ciphertext_count: u32, + ); + + pub fn cuda_fourier_polynomial_mul( + stream: *mut c_void, + gpu_index: u32, + input1: *const c_void, + input2: *const c_void, + output: *mut c_void, + polynomial_size: u32, + total_polynomials: u32, + ); + + pub fn cuda_convert_lwe_programmable_bootstrap_key_64( + stream: *mut c_void, + gpu_index: u32, + dest: *mut c_void, + src: *const c_void, + input_lwe_dim: u32, + glwe_dim: u32, + level_count: u32, + polynomial_size: u32, + ); + + pub fn scratch_cuda_programmable_bootstrap_amortized_64( + stream: *mut c_void, + gpu_index: u32, + pbs_buffer: *mut *mut i8, + glwe_dimension: u32, + polynomial_size: u32, + input_lwe_ciphertext_count: u32, + allocate_gpu_memory: bool, + ); + + pub fn cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( + stream: *mut c_void, + gpu_index: u32, + lwe_array_out: *mut c_void, + lwe_output_indexes: *const c_void, + lut_vector: *const c_void, + lut_vector_indexes: *const c_void, + lwe_array_in: *const c_void, + lwe_input_indexes: *const c_void, + bootstrapping_key: *const c_void, + pbs_buffer: *mut i8, + lwe_dimension: u32, + glwe_dimension: u32, + polynomial_size: u32, + base_log: u32, + level_count: u32, + num_samples: u32, + ); + + pub fn cleanup_cuda_programmable_bootstrap_amortized( + stream: *mut c_void, + gpu_index: u32, + pbs_buffer: *mut *mut i8, + ); + + pub fn scratch_cuda_programmable_bootstrap_64( + stream: *mut c_void, + gpu_index: u32, + pbs_buffer: *mut *mut i8, + glwe_dimension: u32, + polynomial_size: u32, + level_count: u32, + input_lwe_ciphertext_count: u32, + allocate_gpu_memory: bool, + ); + + pub fn cuda_programmable_bootstrap_lwe_ciphertext_vector_64( + stream: *mut c_void, + gpu_index: u32, + lwe_array_out: *mut c_void, + lwe_output_indexes: *const c_void, + lut_vector: *const c_void, + lut_vector_indexes: *const c_void, + lwe_array_in: *const c_void, + lwe_input_indexes: *const c_void, + bootstrapping_key: *const c_void, + buffer: *mut i8, + lwe_dimension: u32, + glwe_dimension: u32, + polynomial_size: u32, + base_log: u32, + level_count: u32, + num_samples: u32, + ); + + pub fn cleanup_cuda_programmable_bootstrap( + stream: *mut c_void, + gpu_index: u32, + pbs_buffer: *mut *mut i8, + ); + + pub fn cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64( + stream: *mut c_void, + gpu_index: u32, + dest: *mut c_void, + src: *const c_void, + input_lwe_dim: u32, + glwe_dim: u32, + level_count: u32, + polynomial_size: u32, + grouping_factor: u32, + ); + + pub fn scratch_cuda_multi_bit_programmable_bootstrap_64( + stream: *mut c_void, + gpu_index: u32, + pbs_buffer: *mut *mut i8, + lwe_dimension: u32, + glwe_dimension: u32, + polynomial_size: u32, + level_count: u32, + grouping_factor: u32, + input_lwe_ciphertext_count: u32, + allocate_gpu_memory: bool, + ); + + pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( + stream: *mut c_void, + gpu_index: u32, + lwe_array_out: *mut c_void, + lwe_output_indexes: *const c_void, + lut_vector: *const c_void, + lut_vector_indexes: *const c_void, + lwe_array_in: *const c_void, + lwe_input_indexes: *const c_void, + bootstrapping_key: *const c_void, + buffer: *mut i8, + lwe_dimension: u32, + glwe_dimension: u32, + polynomial_size: u32, + grouping_factor: u32, + base_log: u32, + level_count: u32, + num_samples: u32, + ); + + pub fn cleanup_cuda_multi_bit_programmable_bootstrap( + stream: *mut c_void, + gpu_index: u32, + pbs_buffer: *mut *mut i8, + ); + } // extern "C"