From 5a7e8ab43a20f5cdd236c39858b4773c522a4bfc Mon Sep 17 00:00:00 2001
From: Agnes Leroy <agnes.leroy@zama.ai>
Date: Thu, 5 Sep 2024 13:52:35 +0200
Subject: [PATCH] chore(gpu): pass over all cuda bind

---
 .../tfhe-cuda-backend/cuda/include/device.h   |   10 +-
 .../cuda/include/helper_multi_gpu.h           |    2 +-
 .../cuda/src/utils/helper_multi_gpu.cu        |    4 +-
 backends/tfhe-cuda-backend/src/cuda_bind.rs   | 1150 +++++++----------
 4 files changed, 504 insertions(+), 662 deletions(-)
diff --git a/backends/tfhe-cuda-backend/cuda/include/device.h b/backends/tfhe-cuda-backend/cuda/include/device.h
index 203db79b82..fd7dbf1e28 100644
--- a/backends/tfhe-cuda-backend/cuda/include/device.h
+++ b/backends/tfhe-cuda-backend/cuda/include/device.h
@@ -39,10 +39,6 @@ void *cuda_malloc_async(uint64_t size, cudaStream_t stream, uint32_t gpu_index);
 
 void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
 
-bool cuda_check_support_cooperative_groups();
-
-bool cuda_check_support_thread_block_clusters();
-
 void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
                               cudaStream_t stream, uint32_t gpu_index);
 
@@ -62,9 +58,13 @@ void cuda_synchronize_device(uint32_t gpu_index);
 void cuda_drop(void *ptr, uint32_t gpu_index);
 
 void cuda_drop_async(void *ptr, cudaStream_t stream, uint32_t gpu_index);
+}
 
 int cuda_get_max_shared_memory(uint32_t gpu_index);
-}
+
+bool cuda_check_support_cooperative_groups();
+
+bool cuda_check_support_thread_block_clusters();
 
 template <typename Torus>
 void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
diff --git a/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h b/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h
index cbdd17fbf8..788fe416b3 100644
--- a/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h
+++ b/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h
@@ -8,7 +8,7 @@ extern std::mutex m;
 extern bool p2p_enabled;
 
 extern "C" {
-int cuda_setup_multi_gpu();
+int32_t cuda_setup_multi_gpu();
 }
 
 // Define a variant type that can be either a vector or a single pointer
diff --git a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cu b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cu
index a6d6cdd540..04b3d9b488 100644
--- a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cu
@@ -6,7 +6,7 @@
 std::mutex m;
 bool p2p_enabled = false;
 
-int cuda_setup_multi_gpu() {
+int32_t cuda_setup_multi_gpu() {
   int num_gpus = cuda_get_number_of_gpus();
   if (num_gpus == 0)
     PANIC("GPU error: the number of GPUs should be > 0.")
@@ -32,7 +32,7 @@ int cuda_setup_multi_gpu() {
     }
     m.unlock();
   }
-  return num_used_gpus;
+  return (int32_t)(num_used_gpus);
 }
 
 int get_active_gpu_count(int num_inputs, int gpu_count) {
diff --git a/backends/tfhe-cuda-backend/src/cuda_bind.rs b/backends/tfhe-cuda-backend/src/cuda_bind.rs
index 17353d48af..740d99bc2f 100644
--- a/backends/tfhe-cuda-backend/src/cuda_bind.rs
+++ b/backends/tfhe-cuda-backend/src/cuda_bind.rs
@@ -3,18 +3,19 @@ use std::ffi::c_void;
 #[link(name = "tfhe_cuda_backend", kind = "static")]
 extern "C" {
 
-    /// Create a new Cuda stream on GPU `gpu_index`
     pub fn cuda_create_stream(gpu_index: u32) -> *mut c_void;
 
-    /// Destroy the Cuda stream `v_stream`
     pub fn cuda_destroy_stream(stream: *mut c_void, gpu_index: u32);
 
-    /// Allocate `size` memory on GPU `gpu_index` asynchronously
+    pub fn cuda_synchronize_stream(stream: *mut c_void, gpu_index: u32);
+
+    pub fn cuda_malloc(size: u64, gpu_index: u32) -> *mut c_void;
+
     pub fn cuda_malloc_async(size: u64, stream: *mut c_void, gpu_index: u32) -> *mut c_void;
 
-    /// Copy `size` memory asynchronously from `src` on GPU `gpu_index` to `dest` on CPU using
-    /// the Cuda stream `v_stream`.
-    pub fn cuda_memcpy_async_to_cpu(
+    pub fn cuda_check_valid_malloc(size: u64, gpu_index: u32);
+
+    pub fn cuda_memcpy_async_to_gpu(
         dest: *mut c_void,
         src: *const c_void,
         size: u64,
@@ -22,9 +23,7 @@ extern "C" {
         gpu_index: u32,
     );
 
-    /// Copy `size` memory asynchronously from `src` on CPU to `dest` on GPU `gpu_index` using
-    /// the Cuda stream `v_stream`.
-    pub fn cuda_memcpy_async_to_gpu(
+    pub fn cuda_memcpy_async_gpu_to_gpu(
         dest: *mut c_void,
         src: *const c_void,
         size: u64,
@@ -32,9 +31,7 @@ extern "C" {
         gpu_index: u32,
     );
 
-    /// Copy `size` memory asynchronously from `src` to `dest` on the same GPU `gpu_index` using
-    /// the Cuda stream `v_stream`.
-    pub fn cuda_memcpy_async_gpu_to_gpu(
+    pub fn cuda_memcpy_async_to_cpu(
         dest: *mut c_void,
         src: *const c_void,
         size: u64,
@@ -42,65 +39,22 @@ extern "C" {
         gpu_index: u32,
     );
 
-    /// Copy `size` memory asynchronously from `src` on CPU to `dest` on GPU `gpu_index` using
-    /// the Cuda stream `v_stream`.
     pub fn cuda_memset_async(
         dest: *mut c_void,
-        value: u64,
+        val: u64,
         size: u64,
         stream: *mut c_void,
         gpu_index: u32,
     );
 
-    /// Get the total number of Nvidia GPUs detected on the platform
     pub fn cuda_get_number_of_gpus() -> i32;
 
-    /// Synchronize all streams on GPU `gpu_index`
     pub fn cuda_synchronize_device(gpu_index: u32);
 
-    /// Synchronize Cuda stream
-    pub fn cuda_synchronize_stream(stream: *mut c_void, gpu_index: u32);
-
-    /// Free memory for pointer `ptr` on GPU `gpu_index` asynchronously, using stream `v_stream`
-    pub fn cuda_drop_async(ptr: *mut c_void, stream: *mut c_void, gpu_index: u32);
-
-    /// Free memory for pointer `ptr` on GPU `gpu_index` synchronously
     pub fn cuda_drop(ptr: *mut c_void, gpu_index: u32);
 
-    pub fn cuda_setup_multi_gpu() -> i32;
-
-    /// Copy a bootstrap key `src` represented with 64 bits in the standard domain from the CPU to
-    /// the GPU `gpu_index` using the stream `v_stream`, and convert it to the Fourier domain on the
-    /// GPU. The resulting bootstrap key `dest` on the GPU is an array of f64 values.
-    pub fn cuda_convert_lwe_programmable_bootstrap_key_64(
-        stream: *mut c_void,
-        gpu_index: u32,
-        dest: *mut c_void,
-        src: *const c_void,
-        input_lwe_dim: u32,
-        glwe_dim: u32,
-        level_count: u32,
-        polynomial_size: u32,
-    );
-
-    /// Copy a multi-bit bootstrap key `src` represented with 64 bits in the standard domain from
-    /// the CPU to the GPU `gpu_index` using the stream `v_stream`. The resulting bootstrap key
-    /// `dest` on the GPU is an array of uint64_t values.
-    pub fn cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
-        stream: *mut c_void,
-        gpu_index: u32,
-        dest: *mut c_void,
-        src: *const c_void,
-        input_lwe_dim: u32,
-        glwe_dim: u32,
-        level_count: u32,
-        polynomial_size: u32,
-        grouping_factor: u32,
-    );
+    pub fn cuda_drop_async(ptr: *mut c_void, stream: *mut c_void, gpu_index: u32);
 
-    /// Copy `number_of_cts` LWE ciphertext represented with 64 bits in the standard domain from the
-    /// CPU to the GPU `gpu_index` using the stream `v_stream`. All ciphertexts must be
-    /// concatenated.
     pub fn cuda_convert_lwe_ciphertext_vector_to_gpu_64(
         stream: *mut c_void,
         gpu_index: u32,
@@ -110,9 +64,6 @@ extern "C" {
         lwe_dimension: u32,
     );
 
-    /// Copy `number_of_cts` LWE ciphertext represented with 64 bits in the standard domain from the
-    /// GPU to the CPU `gpu_index` using the stream `v_stream`. All ciphertexts must be
-    /// concatenated.
     pub fn cuda_convert_lwe_ciphertext_vector_to_cpu_64(
         stream: *mut c_void,
         gpu_index: u32,
@@ -122,400 +73,15 @@ extern "C" {
         lwe_dimension: u32,
     );
 
-    /// This scratch function allocates the necessary amount of data on the GPU for
-    /// the low latency PBS on 64-bit inputs, into `pbs_buffer`. It also configures SM
-    /// options on the GPU in case FULLSM or PARTIALSM mode are going to be used.
-    pub fn scratch_cuda_programmable_bootstrap_64(
-        stream: *mut c_void,
-        gpu_index: u32,
-        pbs_buffer: *mut *mut i8,
-        glwe_dimension: u32,
-        polynomial_size: u32,
-        level_count: u32,
-        input_lwe_ciphertext_count: u32,
-        allocate_gpu_memory: bool,
-    );
-
-    /// Perform bootstrapping on a batch of input u64 LWE ciphertexts.
-    ///
-    /// - `v_stream` is a void pointer to the Cuda stream to be used in the kernel launch
-    /// - `gpu_index` is the index of the GPU to be used in the kernel launch
-    /// - `lwe_array_out`: output batch of num_samples bootstrapped ciphertexts c = (a0,..an-1,b)
-    ///   where n is the LWE dimension
-    /// - `lut_vector`: should hold as many test vectors of size polynomial_size as there are input
-    ///   ciphertexts, but actually holds `num_lut_vectors` vectors to reduce memory usage
-    /// - `lut_vector_indexes`: stores the index corresponding to which test vector to use for each
-    ///   sample in `lut_vector`
-    /// - `lwe_array_in`: input batch of num_samples LWE ciphertexts, containing n mask values + 1
-    ///   body value
-    /// - `bootstrapping_key`: GGSW encryption of the LWE secret key sk1 under secret key sk2. bsk =
-    ///   Z + sk1 H where H is the gadget matrix and Z is a matrix (k+1).l containing GLWE
-    ///   encryptions of 0 under sk2. bsk is thus a tensor of size (k+1)^2.l.N.n where l is the
-    ///   number of decomposition levels and k is the GLWE dimension, N is the polynomial size for
-    ///   GLWE. The polynomial size for GLWE and the test vector are the same because they have to
-    ///   be in the same ring to be multiplied.
-    /// - `pbs_buffer`: a preallocated buffer to store temporary results
-    /// - `lwe_dimension`: size of the Torus vector used to encrypt the input LWE ciphertexts -
-    ///   referred to as n above (~ 600)
-    /// - `glwe_dimension`: size of the polynomial vector used to encrypt the LUT GLWE ciphertexts -
-    ///   referred to as k above. Only the value 1 is supported for this parameter.
-    /// - `polynomial_size`: size of the test polynomial (test vector) and size of the GLWE
-    ///   polynomial (~1024)
-    /// - `base_log`: log base used for the gadget matrix - B = 2^base_log (~8)
-    /// - `level_count`: number of decomposition levels in the gadget matrix (~4)
-    /// - `num_samples`: number of encrypted input messages
-    ///
-    /// This function calls a wrapper to a device kernel that performs the
-    /// bootstrapping:
-    ///   - the kernel is templatized based on integer discretization and polynomial degree
-    ///   - num_samples * level_count * (glwe_dimension + 1) blocks of threads are launched, where
-    ///     each thread is going to handle one or more polynomial coefficients at each stage, for a
-    ///     given level of decomposition, either for the LUT mask or its body:
-    ///     - perform the blind rotation
-    ///     - round the result
-    ///     - get the decomposition for the current level
-    ///     - switch to the FFT domain
-    ///     - multiply with the bootstrapping key
-    ///     - come back to the coefficients representation
-    ///   - between each stage a synchronization of the threads is necessary (some synchronizations
-    ///     happen at the block level, some happen between blocks, using cooperative groups).
-    ///   - in case the device has enough shared memory, temporary arrays used for the different
-    ///     stages (accumulators) are stored into the shared memory
-    ///   - the accumulators serve to combine the results for all decomposition levels
-    ///   - the constant memory (64K) is used for storing the roots of identity values for the FFT
-    pub fn cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
-        stream: *mut c_void,
-        gpu_index: u32,
-        lwe_array_out: *mut c_void,
-        lwe_output_indexes: *const c_void,
-        lut_vector: *const c_void,
-        lut_vector_indexes: *const c_void,
-        lwe_array_in: *const c_void,
-        lwe_input_indexes: *const c_void,
-        bootstrapping_key: *const c_void,
-        pbs_buffer: *mut i8,
-        lwe_dimension: u32,
-        glwe_dimension: u32,
-        polynomial_size: u32,
-        base_log: u32,
-        level: u32,
-        num_samples: u32,
-    );
-
-    /// This cleanup function frees the data for the low latency PBS on GPU
-    /// contained in pbs_buffer for 32 or 64-bit inputs.
-    pub fn cleanup_cuda_programmable_bootstrap(
-        stream: *mut c_void,
-        gpu_index: u32,
-        pbs_buffer: *mut *mut i8,
-    );
-
-    /// This scratch function allocates the necessary amount of data on the GPU for
-    /// the multi-bit PBS on 64-bit inputs into `pbs_buffer`.
-    pub fn scratch_cuda_multi_bit_programmable_bootstrap_64(
-        stream: *mut c_void,
-        gpu_index: u32,
-        pbs_buffer: *mut *mut i8,
-        lwe_dimension: u32,
-        glwe_dimension: u32,
-        polynomial_size: u32,
-        level_count: u32,
-        grouping_factor: u32,
-        input_lwe_ciphertext_count: u32,
-        allocate_gpu_memory: bool,
-    );
-
-    /// Perform bootstrapping on a batch of input u64 LWE ciphertexts using the multi-bit algorithm.
-    ///
-    /// - `v_stream` is a void pointer to the Cuda stream to be used in the kernel launch
-    /// - `gpu_index` is the index of the GPU to be used in the kernel launch
-    /// - `lwe_array_out`: output batch of num_samples bootstrapped ciphertexts c = (a0,..an-1,b)
-    ///   where n is the LWE dimension
-    /// - `lut_vector`: should hold as many test vectors of size polynomial_size as there are input
-    ///   ciphertexts, but actually holds `num_lut_vectors` vectors to reduce memory usage
-    /// - `lut_vector_indexes`: stores the index corresponding to which test vector to use for each
-    ///   sample in `lut_vector`
-    /// - `lwe_array_in`: input batch of num_samples LWE ciphertexts, containing n mask values + 1
-    ///   body value
-    /// - `bootstrapping_key`: GGSW encryption of elements of the LWE secret key as in the classical
-    ///   PBS, but this time we follow Zhou's trick and encrypt combinations of elements of the key
-    /// - `pbs_buffer`: a preallocated buffer to store temporary results
-    /// - `lwe_dimension`: size of the Torus vector used to encrypt the input LWE ciphertexts -
-    ///   referred to as n above (~ 600)
-    /// - `glwe_dimension`: size of the polynomial vector used to encrypt the LUT GLWE ciphertexts -
-    ///   referred to as k above. Only the value 1 is supported for this parameter.
-    /// - `polynomial_size`: size of the test polynomial (test vector) and size of the GLWE
-    ///   polynomial (~1024)
-    /// - `grouping_factor`: number of elements of the LWE secret key combined per GGSW of the
-    ///   bootstrap key
-    /// - `base_log`: log base used for the gadget matrix - B = 2^base_log (~8)
-    /// - `level_count`: number of decomposition levels in the gadget matrix (~4)
-    /// - `num_samples`: number of encrypted input messages
-    pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
-        stream: *mut c_void,
-        gpu_index: u32,
-        lwe_array_out: *mut c_void,
-        lwe_output_indexes: *const c_void,
-        lut_vector: *const c_void,
-        lut_vector_indexes: *const c_void,
-        lwe_array_in: *const c_void,
-        lwe_input_indexes: *const c_void,
-        bootstrapping_key: *const c_void,
-        pbs_buffer: *mut i8,
-        lwe_dimension: u32,
-        glwe_dimension: u32,
-        polynomial_size: u32,
-        grouping_factor: u32,
-        base_log: u32,
-        level: u32,
-        num_samples: u32,
-    );
-
-    /// This cleanup function frees the data for the multi-bit PBS on GPU
-    /// contained in pbs_buffer for 64-bit inputs.
-    pub fn cleanup_cuda_multi_bit_programmable_bootstrap(
-        stream: *mut c_void,
-        gpu_index: u32,
-        pbs_buffer: *mut *mut i8,
-    );
-
-    /// Perform keyswitch on a batch of 64 bits input LWE ciphertexts.
-    ///
-    /// - `v_stream` is a void pointer to the Cuda stream to be used in the kernel launch
-    /// - `gpu_index` is the index of the GPU to be used in the kernel launch
-    /// - `lwe_array_out`: output batch of num_samples keyswitched ciphertexts c = (a0,..an-1,b)
-    ///   where n is the output LWE dimension (lwe_dimension_out)
-    /// - `lwe_array_in`: input batch of num_samples LWE ciphertexts, containing lwe_dimension_in
-    ///   mask values + 1 body value
-    /// - `ksk`: the keyswitch key to be used in the operation
-    /// - `base_log`: the log of the base used in the decomposition (should be the one used to
-    ///   create the ksk).
-    /// - `level_count`: the number of levels used in the decomposition (should be the one used to
-    ///   create the ksk).
-    /// - `num_samples`: the number of input and output LWE ciphertexts.
-    ///
-    /// This function calls a wrapper to a device kernel that performs the keyswitch.
-    /// `num_samples` blocks of threads are launched
-    pub fn cuda_keyswitch_lwe_ciphertext_vector_64(
-        stream: *mut c_void,
-        gpu_index: u32,
-        lwe_array_out: *mut c_void,
-        lwe_output_indexes: *const c_void,
-        lwe_array_in: *const c_void,
-        lwe_input_indexes: *const c_void,
-        keyswitch_key: *const c_void,
-        input_lwe_dimension: u32,
-        output_lwe_dimension: u32,
-        base_log: u32,
-        level_count: u32,
-        num_samples: u32,
-    );
-
-    /// This scratch function allocates the necessary amount of data on the GPU for
-    /// the public function packing keyswitch implementation on 64-bit
-    pub fn scratch_packing_keyswitch_lwe_list_to_glwe_64(
-        stream: *mut c_void,
-        gpu_index: u32,
-        fp_ks_buffer: *mut *mut i8,
-        glwe_dimension: u32,
-        polynomial_size: u32,
-        input_lwe_ciphertext_count: u32,
-        allocate_gpu_memory: bool,
-    );
-
-    /// Perform public functional packing keyswitch on a vector of 64-bit LWE ciphertexts
-    pub fn cuda_packing_keyswitch_lwe_list_to_glwe_64(
-        stream: *mut c_void,
-        gpu_index: u32,
-        glwe_array_out: *mut c_void,
-        lwe_array_in: *const c_void,
-        fp_ksk_array: *const c_void,
-        fp_ks_buffer: *mut i8,
-        input_lwe_dimension: u32,
-        output_glwe_dimension: u32,
-        polynomial_size: u32,
-        base_log: u32,
-        level_count: u32,
-        num_lwes: u32,
-    );
-
-    pub fn cleanup_packing_keyswitch_lwe_list_to_glwe(
-        stream: *mut c_void,
-        gpu_index: u32,
-        fp_ks_buffer: *mut *mut i8,
-    );
-
-    /// Perform the negation of a u64 input LWE ciphertext vector.
-    /// - `v_stream` is a void pointer to the Cuda stream to be used in the kernel launch
-    /// - `gpu_index` is the index of the GPU to be used in the kernel launch
-    /// - `lwe_array_out` is an array of size `(input_lwe_dimension + 1) *
-    ///   input_lwe_ciphertext_count` that should have been allocated on the GPU before calling this
-    ///   function, and that will hold the result of the computation.
-    /// - `lwe_array_in` is the LWE ciphertext vector used as input, it should have been allocated
-    ///   and initialized before calling this function. It has the same size as the output array.
-    /// - `input_lwe_dimension` is the number of mask elements in the two input and in the output
-    ///   ciphertext vectors
-    /// - `input_lwe_ciphertext_count` is the number of ciphertexts contained in each input LWE
-    ///   ciphertext vector, as well as in the output.
-    ///
-    /// Each element (mask element or body) of the input LWE ciphertext vector is negated.
-    /// The result is stored in the output LWE ciphertext vector. The input LWE ciphertext vector
-    /// is left unchanged. This function is a wrapper to a device function that performs the
-    /// operation on the GPU.
-    pub fn cuda_negate_lwe_ciphertext_vector_64(
-        stream: *mut c_void,
-        gpu_index: u32,
-        lwe_array_out: *mut c_void,
-        lwe_array_in: *const c_void,
-        input_lwe_dimension: u32,
-        input_lwe_ciphertext_count: u32,
-    );
-
-    pub fn cuda_negate_integer_radix_ciphertext_64_inplace(
-        streams: *const *mut c_void,
-        gpu_index: *const u32,
-        gpu_indexes: u32,
-        lwe_array: *mut c_void,
-        lwe_dimension: u32,
-        lwe_ciphertext_count: u32,
-        message_modulus: u32,
-        carry_modulus: u32,
-    );
-
-    /// Perform the addition of two u64 input LWE ciphertext vectors.
-    /// - `v_stream` is a void pointer to the Cuda stream to be used in the kernel launch
-    /// - `gpu_index` is the index of the GPU to be used in the kernel launch
-    /// - `lwe_array_out` is an array of size `(input_lwe_dimension + 1) *
-    ///   input_lwe_ciphertext_count` that should have been allocated on the GPU before calling this
-    ///   function, and that will hold the result of the computation.
-    /// - `lwe_array_in_1` is the first LWE ciphertext vector used as input, it should have been
-    ///   allocated and initialized before calling this function. It has the same size as the output
-    ///   array.
-    /// - `lwe_array_in_2` is the second LWE ciphertext vector used as input, it should have been
-    ///   allocated and initialized before calling this function. It has the same size as the output
-    ///   array.
-    /// - `input_lwe_dimension` is the number of mask elements in the two input and in the output
-    ///   ciphertext vectors
-    /// - `input_lwe_ciphertext_count` is the number of ciphertexts contained in each input LWE
-    ///   ciphertext vector, as well as in the output.
-    ///
-    /// Each element (mask element or body) of the input LWE ciphertext vector 1 is added to the
-    /// corresponding element in the input LWE ciphertext 2. The result is stored in the output LWE
-    /// ciphertext vector. The two input LWE ciphertext vectors are left unchanged. This function is
-    /// a wrapper to a device function that performs the operation on the GPU.
-    pub fn cuda_add_lwe_ciphertext_vector_64(
-        stream: *mut c_void,
-        gpu_index: u32,
-        lwe_array_out: *mut c_void,
-        lwe_array_in_1: *const c_void,
-        lwe_array_in_2: *const c_void,
-        input_lwe_dimension: u32,
-        input_lwe_ciphertext_count: u32,
-    );
-
-    /// Perform the addition of a u64 input LWE ciphertext vector with a u64 input plaintext vector.
-    /// - `v_stream` is a void pointer to the Cuda stream to be used in the kernel launch
-    /// - `gpu_index` is the index of the GPU to be used in the kernel launch
-    /// - `lwe_array_out` is an array of size `(input_lwe_dimension + 1) *
-    ///   input_lwe_ciphertext_count` that should have been allocated on the GPU before calling this
-    ///   function, and that will hold the result of the computation.
-    /// - `lwe_array_in` is the LWE ciphertext vector used as input, it should have been allocated
-    ///   and initialized before calling this function. It has the same size as the output array.
-    /// - `plaintext_array_in` is the plaintext vector used as input, it should have been allocated
-    ///   and initialized before calling this function. It should be of size
-    ///   `input_lwe_ciphertext_count`.
-    /// - `input_lwe_dimension` is the number of mask elements in the input and output LWE
-    ///   ciphertext vectors
-    /// - `input_lwe_ciphertext_count` is the number of ciphertexts contained in the input LWE
-    ///   ciphertext vector, as well as in the output. It is also the number of plaintexts in the
-    ///   input plaintext vector.
-    ///
-    /// Each plaintext of the input plaintext vector is added to the body of the corresponding LWE
-    /// ciphertext in the LWE ciphertext vector. The result of the operation is stored in the output
-    /// LWE ciphertext vector. The two input vectors are unchanged. This function is a
-    /// wrapper to a device function that performs the operation on the GPU.
-    pub fn cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
-        stream: *mut c_void,
-        gpu_index: u32,
-        lwe_array_out: *mut c_void,
-        lwe_array_in: *const c_void,
-        plaintext_array_in: *const c_void,
-        input_lwe_dimension: u32,
-        input_lwe_ciphertext_count: u32,
-    );
-
-    /// Perform the multiplication of a u64 input LWE ciphertext vector with a u64 input cleartext
-    /// vector.
-    /// - `v_stream` is a void pointer to the Cuda stream to be used in the kernel launch
-    /// - `gpu_index` is the index of the GPU to be used in the kernel launch
-    /// - `lwe_array_out` is an array of size `(input_lwe_dimension + 1) *
-    ///   input_lwe_ciphertext_count` that should have been allocated on the GPU before calling this
-    ///   function, and that will hold the result of the computation.
-    /// - `lwe_array_in` is the LWE ciphertext vector used as input, it should have been allocated
-    ///   and initialized before calling this function. It has the same size as the output array.
-    /// - `cleartext_array_in` is the cleartext vector used as input, it should have been allocated
-    ///   and initialized before calling this function. It should be of size
-    ///   `input_lwe_ciphertext_count`.
-    /// - `input_lwe_dimension` is the number of mask elements in the input and output LWE
-    ///   ciphertext vectors
-    /// - `input_lwe_ciphertext_count` is the number of ciphertexts contained in the input LWE
-    ///   ciphertext vector, as well as in the output. It is also the number of cleartexts in the
-    ///   input cleartext vector.
-    ///
-    /// Each cleartext of the input cleartext vector is multiplied to the mask and body of the
-    /// corresponding LWE ciphertext in the LWE ciphertext vector.
-    /// The result of the operation is stored in the output
-    /// LWE ciphertext vector. The two input vectors are unchanged. This function is a
-    /// wrapper to a device function that performs the operation on the GPU.
-    pub fn cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
+    pub fn cuda_glwe_sample_extract_64(
         stream: *mut c_void,
         gpu_index: u32,
         lwe_array_out: *mut c_void,
-        lwe_array_in: *const c_void,
-        cleartext_array_in: *const c_void,
-        input_lwe_dimension: u32,
-        input_lwe_ciphertext_count: u32,
-    );
-
-    pub fn scratch_cuda_integer_mult_radix_ciphertext_kb_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-        message_modulus: u32,
-        carry_modulus: u32,
+        glwe_array_in: *const c_void,
+        nth_array: *const u32,
+        num_glwes: u32,
         glwe_dimension: u32,
-        lwe_dimension: u32,
         polynomial_size: u32,
-        pbs_base_log: u32,
-        pbs_level: u32,
-        ks_base_log: u32,
-        ks_level: u32,
-        grouping_factor: u32,
-        num_blocks: u32,
-        pbs_type: u32,
-        allocate_gpu_memory: bool,
-    );
-
-    pub fn cuda_integer_mult_radix_ciphertext_kb_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        radix_lwe_out: *mut c_void,
-        radix_lwe_left: *const c_void,
-        radix_lwe_right: *const c_void,
-        bsks: *const *mut c_void,
-        ksks: *const *mut c_void,
-        mem_ptr: *mut i8,
-        polynomial_size: u32,
-        num_blocks: u32,
-    );
-
-    pub fn cleanup_cuda_integer_mult(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
     );
 
     pub fn scratch_cuda_integer_compress_radix_ciphertext_64(
@@ -536,6 +102,7 @@ extern "C" {
         storage_log_modulus: u32,
         allocate_gpu_memory: bool,
     );
+
     pub fn scratch_cuda_integer_decompress_radix_ciphertext_64(
         streams: *const *mut c_void,
         gpu_indexes: *const u32,
@@ -563,7 +130,7 @@ extern "C" {
         glwe_array_out: *mut c_void,
         lwe_array_in: *const c_void,
         fp_ksk: *const *mut c_void,
-        num_lwes: u32,
+        num_nths: u32,
         mem_ptr: *mut i8,
     );
 
@@ -571,8 +138,8 @@ extern "C" {
         streams: *const *mut c_void,
         gpu_indexes: *const u32,
         gpu_count: u32,
-        lwe_out: *mut c_void,
-        glwe_array_in: *const c_void,
+        lwe_array_out: *mut c_void,
+        glwe_in: *const c_void,
         indexes_array: *const c_void,
         indexes_array_size: u32,
         bsks: *const *mut c_void,
@@ -585,6 +152,7 @@ extern "C" {
         gpu_count: u32,
         mem_ptr: *mut *mut i8,
     );
+
     pub fn cleanup_cuda_integer_decompress_radix_ciphertext_64(
         streams: *const *mut c_void,
         gpu_indexes: *const u32,
@@ -592,191 +160,54 @@ extern "C" {
         mem_ptr: *mut *mut i8,
     );
 
-    pub fn cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        lwe_array: *mut c_void,
-        scalar_input: *const c_void,
-        lwe_dimension: u32,
-        lwe_ciphertext_count: u32,
-        message_modulus: u32,
-        carry_modulus: u32,
-    );
+    pub fn cuda_setup_multi_gpu() -> i32;
 
-    pub fn scratch_cuda_integer_scalar_mul_kb_64(
+    pub fn scratch_cuda_apply_univariate_lut_kb_64(
         streams: *const *mut c_void,
         gpu_indexes: *const u32,
         gpu_count: u32,
         mem_ptr: *mut *mut i8,
-        glwe_dimension: u32,
-        polynomial_size: u32,
-        lwe_dimension: u32,
-        ks_level: u32,
-        ks_base_log: u32,
-        pbs_level: u32,
-        pbs_base_log: u32,
-        grouping_factor: u32,
-        num_blocks: u32,
-        message_modulus: u32,
-        carry_modulus: u32,
-        pbs_type: u32,
-        allocate_gpu_memory: bool,
-    );
-
-    pub fn cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        lwe_array: *mut c_void,
-        decomposed_scalar: *const u64,
-        has_at_least_one_set: *const u64,
-        mem: *mut i8,
-        bsks: *const *mut c_void,
-        ksks: *const *mut c_void,
+        input_lut: *const c_void,
         lwe_dimension: u32,
-        polynomial_size: u32,
-        message_modulus: u32,
-        num_blocks: u32,
-        num_scalars: u32,
-    );
-
-    pub fn cleanup_cuda_integer_radix_scalar_mul(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-    );
-
-    pub fn scratch_cuda_integer_radix_bitop_kb_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-        glwe_dimension: u32,
-        polynomial_size: u32,
-        big_lwe_dimension: u32,
-        small_lwe_dimension: u32,
-        ks_level: u32,
-        ks_base_log: u32,
-        pbs_level: u32,
-        pbs_base_log: u32,
-        grouping_factor: u32,
-        num_blocks: u32,
-        message_modulus: u32,
-        carry_modulus: u32,
-        pbs_type: u32,
-        op_type: u32,
-        allocate_gpu_memory: bool,
-    );
-
-    pub fn cuda_bitop_integer_radix_ciphertext_kb_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        radix_lwe_out: *mut c_void,
-        radix_lwe_left: *const c_void,
-        radix_lwe_right: *const c_void,
-        mem_ptr: *mut i8,
-        bsks: *const *mut c_void,
-        ksks: *const *mut c_void,
-        num_blocks: u32,
-    );
-
-    pub fn cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        radix_lwe_output: *mut c_void,
-        radix_lwe_input: *mut c_void,
-        clear_blocks: *const c_void,
-        num_clear_blocks: u32,
-        mem_ptr: *mut i8,
-        bsks: *const *mut c_void,
-        ksks: *const *mut c_void,
-        num_blocks: u32,
-        op_type: u32,
-    );
-
-    pub fn cleanup_cuda_integer_bitop(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-    );
-    pub fn cuda_glwe_sample_extract_64(
-        stream: *mut c_void,
-        gpu_index: u32,
-        lwe_array_out: *mut c_void,
-        glwe_array_in: *const c_void,
-        nth_array: *const u32,
-        num_glwes: u32,
         glwe_dimension: u32,
         polynomial_size: u32,
-    );
-
-    pub fn scratch_cuda_integer_radix_comparison_kb_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        mem_ptr: *mut *mut i8,
-        glwe_dimension: u32,
-        polynomial_size: u32,
-        big_lwe_dimension: u32,
-        small_lwe_dimension: u32,
         ks_level: u32,
         ks_base_log: u32,
         pbs_level: u32,
         pbs_base_log: u32,
         grouping_factor: u32,
-        num_blocks: u32,
+        input_lwe_ciphertext_count: u32,
         message_modulus: u32,
         carry_modulus: u32,
         pbs_type: u32,
-        op_type: u32,
-        is_signed: bool,
         allocate_gpu_memory: bool,
     );
 
-    pub fn cuda_comparison_integer_radix_ciphertext_kb_64(
+    pub fn cuda_apply_univariate_lut_kb_64(
         streams: *const *mut c_void,
         gpu_indexes: *const u32,
         gpu_count: u32,
-        radix_lwe_out: *mut c_void,
-        radix_lwe_left: *const c_void,
-        radix_lwe_right: *const c_void,
+        output_radix_lwe: *mut c_void,
+        input_radix_lwe: *const c_void,
         mem_ptr: *mut i8,
-        bsks: *const *mut c_void,
         ksks: *const *mut c_void,
+        bsks: *const *mut c_void,
         num_blocks: u32,
     );
 
-    pub fn cleanup_cuda_integer_comparison(
+    pub fn cleanup_cuda_apply_univariate_lut_kb_64(
         streams: *const *mut c_void,
         gpu_indexes: *const u32,
         gpu_count: u32,
         mem_ptr: *mut *mut i8,
     );
 
-    pub fn cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
-        streams: *const *mut c_void,
-        gpu_indexes: *const u32,
-        gpu_count: u32,
-        radix_lwe_out: *mut c_void,
-        radix_lwe_in: *const c_void,
-        scalar_blocks: *const c_void,
-        mem_ptr: *mut i8,
-        bsks: *const *mut c_void,
-        ksks: *const *mut c_void,
-        num_blocks: u32,
-        num_scalar_blocks: u32,
-    );
-
-    pub fn scratch_cuda_full_propagation_64(
+    pub fn scratch_cuda_apply_bivariate_lut_kb_64(
         streams: *const *mut c_void,
         gpu_indexes: *const u32,
         gpu_count: u32,
         mem_ptr: *mut *mut i8,
+        input_lut: *const c_void,
         lwe_dimension: u32,
         glwe_dimension: u32,
         polynomial_size: u32,
@@ -785,36 +216,39 @@ extern "C" {
         pbs_level: u32,
         pbs_base_log: u32,
         grouping_factor: u32,
+        input_lwe_ciphertext_count: u32,
         message_modulus: u32,
         carry_modulus: u32,
         pbs_type: u32,
         allocate_gpu_memory: bool,
     );
 
-    pub fn cuda_full_propagation_64_inplace(
+    pub fn cuda_apply_bivariate_lut_kb_64(
         streams: *const *mut c_void,
         gpu_indexes: *const u32,
         gpu_count: u32,
-        radix_lwe_right: *mut c_void,
+        output_radix_lwe: *mut c_void,
+        input_radix_lwe_1: *const c_void,
+        input_radix_lwe_2: *const c_void,
         mem_ptr: *mut i8,
         ksks: *const *mut c_void,
         bsks: *const *mut c_void,
         num_blocks: u32,
+        shift: u32,
     );
 
-    pub fn cleanup_cuda_full_propagation(
+    pub fn cleanup_cuda_apply_bivariate_lut_kb_64(
         streams: *const *mut c_void,
         gpu_indexes: *const u32,
         gpu_count: u32,
         mem_ptr: *mut *mut i8,
     );
 
-    pub fn scratch_cuda_apply_univariate_lut_kb_64(
+    pub fn scratch_cuda_full_propagation_64(
         streams: *const *mut c_void,
         gpu_indexes: *const u32,
         gpu_count: u32,
         mem_ptr: *mut *mut i8,
-        input_lut: *const c_void,
         lwe_dimension: u32,
         glwe_dimension: u32,
         polynomial_size: u32,
@@ -823,74 +257,94 @@ extern "C" {
         pbs_level: u32,
         pbs_base_log: u32,
         grouping_factor: u32,
-        num_blocks: u32,
         message_modulus: u32,
         carry_modulus: u32,
         pbs_type: u32,
         allocate_gpu_memory: bool,
     );
 
-    pub fn cuda_apply_univariate_lut_kb_64(
+    pub fn cuda_full_propagation_64_inplace(
         streams: *const *mut c_void,
         gpu_indexes: *const u32,
         gpu_count: u32,
-        output_radix_lwe: *mut c_void,
-        input_radix_lwe: *const c_void,
+        input_blocks: *mut c_void,
         mem_ptr: *mut i8,
         ksks: *const *mut c_void,
         bsks: *const *mut c_void,
         num_blocks: u32,
     );
 
-    pub fn cleanup_cuda_apply_univariate_lut_kb_64(
+    pub fn cleanup_cuda_full_propagation(
         streams: *const *mut c_void,
         gpu_indexes: *const u32,
         gpu_count: u32,
         mem_ptr: *mut *mut i8,
     );
 
-    pub fn scratch_cuda_apply_bivariate_lut_kb_64(
+    pub fn scratch_cuda_integer_mult_radix_ciphertext_kb_64(
         streams: *const *mut c_void,
         gpu_indexes: *const u32,
         gpu_count: u32,
         mem_ptr: *mut *mut i8,
-        input_lut: *const c_void,
-        lwe_dimension: u32,
+        message_modulus: u32,
+        carry_modulus: u32,
         glwe_dimension: u32,
+        lwe_dimension: u32,
         polynomial_size: u32,
-        ks_level: u32,
-        ks_base_log: u32,
-        pbs_level: u32,
         pbs_base_log: u32,
+        pbs_level: u32,
+        ks_base_log: u32,
+        ks_level: u32,
         grouping_factor: u32,
         num_blocks: u32,
-        message_modulus: u32,
-        carry_modulus: u32,
         pbs_type: u32,
         allocate_gpu_memory: bool,
     );
 
-    pub fn cuda_apply_bivariate_lut_kb_64(
+    pub fn cuda_integer_mult_radix_ciphertext_kb_64(
         streams: *const *mut c_void,
         gpu_indexes: *const u32,
         gpu_count: u32,
-        output_radix_lwe: *mut c_void,
-        input_radix_lwe_1: *const c_void,
-        input_radix_lwe_2: *const c_void,
-        mem_ptr: *mut i8,
-        ksks: *const *mut c_void,
+        radix_lwe_out: *mut c_void,
+        radix_lwe_left: *const c_void,
+        radix_lwe_right: *const c_void,
         bsks: *const *mut c_void,
+        ksks: *const *mut c_void,
+        mem_ptr: *mut i8,
+        polynomial_size: u32,
         num_blocks: u32,
-        shift: u32,
     );
 
-    pub fn cleanup_cuda_apply_bivariate_lut_kb_64(
+    pub fn cleanup_cuda_integer_mult(
         streams: *const *mut c_void,
         gpu_indexes: *const u32,
         gpu_count: u32,
         mem_ptr: *mut *mut i8,
     );
 
+    pub fn cuda_negate_integer_radix_ciphertext_64_inplace(
+        streams: *const *mut c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        lwe_array: *mut c_void,
+        lwe_dimension: u32,
+        lwe_ciphertext_count: u32,
+        message_modulus: u32,
+        carry_modulus: u32,
+    );
+
+    pub fn cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
+        streams: *const *mut c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        lwe_array: *mut c_void,
+        scalar_input: *const c_void,
+        lwe_dimension: u32,
+        lwe_ciphertext_count: u32,
+        message_modulus: u32,
+        carry_modulus: u32,
+    );
+
     pub fn scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
         streams: *const *mut c_void,
         gpu_indexes: *const u32,
@@ -917,7 +371,7 @@ extern "C" {
         streams: *const *mut c_void,
         gpu_indexes: *const u32,
         gpu_count: u32,
-        radix_lwe: *mut c_void,
+        lwe_array: *mut c_void,
         shift: u32,
         mem_ptr: *mut i8,
         bsks: *const *mut c_void,
@@ -951,7 +405,7 @@ extern "C" {
         streams: *const *mut c_void,
         gpu_indexes: *const u32,
         gpu_count: u32,
-        radix_lwe: *mut c_void,
+        lwe_array: *mut c_void,
         shift: u32,
         mem_ptr: *mut i8,
         bsks: *const *mut c_void,
@@ -1000,8 +454,8 @@ extern "C" {
         streams: *const *mut c_void,
         gpu_indexes: *const u32,
         gpu_count: u32,
-        radix_lwe: *mut c_void,
-        radix_shift: *const c_void,
+        lwe_array: *mut c_void,
+        lwe_shift: *const c_void,
         mem_ptr: *mut i8,
         bsks: *const *mut c_void,
         ksks: *const *mut c_void,
@@ -1015,6 +469,120 @@ extern "C" {
         mem_ptr: *mut *mut i8,
     );
 
+    pub fn scratch_cuda_integer_radix_comparison_kb_64(
+        streams: *const *mut c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr: *mut *mut i8,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        big_lwe_dimension: u32,
+        small_lwe_dimension: u32,
+        ks_level: u32,
+        ks_base_log: u32,
+        pbs_level: u32,
+        pbs_base_log: u32,
+        grouping_factor: u32,
+        lwe_ciphertext_count: u32,
+        message_modulus: u32,
+        carry_modulus: u32,
+        pbs_type: u32,
+        op_type: u32,
+        is_signed: bool,
+        allocate_gpu_memory: bool,
+    );
+
+    pub fn cuda_comparison_integer_radix_ciphertext_kb_64(
+        streams: *const *mut c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        lwe_array_out: *mut c_void,
+        lwe_array_1: *const c_void,
+        lwe_array_2: *const c_void,
+        mem_ptr: *mut i8,
+        bsks: *const *mut c_void,
+        ksks: *const *mut c_void,
+        lwe_ciphertext_count: u32,
+    );
+
+    pub fn cuda_scalar_comparison_integer_radix_ciphertext_kb_64(
+        streams: *const *mut c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        lwe_array_out: *mut c_void,
+        lwe_array_in: *const c_void,
+        scalar_blocks: *const c_void,
+        mem_ptr: *mut i8,
+        bsks: *const *mut c_void,
+        ksks: *const *mut c_void,
+        lwe_ciphertext_count: u32,
+        num_scalar_blocks: u32,
+    );
+
+    pub fn cleanup_cuda_integer_comparison(
+        streams: *const *mut c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr: *mut *mut i8,
+    );
+
+    pub fn scratch_cuda_integer_radix_bitop_kb_64(
+        streams: *const *mut c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr: *mut *mut i8,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        big_lwe_dimension: u32,
+        small_lwe_dimension: u32,
+        ks_level: u32,
+        ks_base_log: u32,
+        pbs_level: u32,
+        pbs_base_log: u32,
+        grouping_factor: u32,
+        lwe_ciphertext_count: u32,
+        message_modulus: u32,
+        carry_modulus: u32,
+        pbs_type: u32,
+        op_type: u32,
+        allocate_gpu_memory: bool,
+    );
+
+    pub fn cuda_bitop_integer_radix_ciphertext_kb_64(
+        streams: *const *mut c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        lwe_array_out: *mut c_void,
+        lwe_array_1: *const c_void,
+        lwe_array_2: *const c_void,
+        mem_ptr: *mut i8,
+        bsks: *const *mut c_void,
+        ksks: *const *mut c_void,
+        lwe_ciphertext_count: u32,
+    );
+
+    pub fn cuda_scalar_bitop_integer_radix_ciphertext_kb_64(
+        streams: *const *mut c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        lwe_array_out: *mut c_void,
+        lwe_array_input: *const c_void,
+        clear_blocks: *const c_void,
+        num_clear_blocks: u32,
+        mem_ptr: *mut i8,
+        bsks: *const *mut c_void,
+        ksks: *const *mut c_void,
+        lwe_ciphertext_count: u32,
+        op: u32,
+    );
+
+    pub fn cleanup_cuda_integer_bitop(
+        streams: *const *mut c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr: *mut *mut i8,
+    );
+
     pub fn scratch_cuda_integer_radix_cmux_kb_64(
         streams: *const *mut c_void,
         gpu_indexes: *const u32,
@@ -1029,7 +597,7 @@ extern "C" {
         pbs_level: u32,
         pbs_base_log: u32,
         grouping_factor: u32,
-        num_blocks: u32,
+        lwe_ciphertext_count: u32,
         message_modulus: u32,
         carry_modulus: u32,
         pbs_type: u32,
@@ -1047,7 +615,7 @@ extern "C" {
         mem_ptr: *mut i8,
         bsks: *const *mut c_void,
         ksks: *const *mut c_void,
-        num_blocks: u32,
+        lwe_ciphertext_count: u32,
     );
 
     pub fn cleanup_cuda_integer_radix_cmux(
@@ -1083,7 +651,7 @@ extern "C" {
         streams: *const *mut c_void,
         gpu_indexes: *const u32,
         gpu_count: u32,
-        radix_lwe: *mut c_void,
+        lwe_array: *mut c_void,
         n: u32,
         mem_ptr: *mut i8,
         bsks: *const *mut c_void,
@@ -1123,7 +691,7 @@ extern "C" {
         streams: *const *mut c_void,
         gpu_indexes: *const u32,
         gpu_count: u32,
-        radix_lwe: *mut c_void,
+        lwe_array: *mut c_void,
         carry_out: *mut c_void,
         mem_ptr: *mut i8,
         bsks: *const *mut c_void,
@@ -1135,7 +703,7 @@ extern "C" {
         streams: *const *mut c_void,
         gpu_indexes: *const u32,
         gpu_count: u32,
-        radix_lwe: *mut c_void,
+        lwe_array: *mut c_void,
         carry_out: *mut c_void,
         input_carries: *mut c_void,
         mem_ptr: *mut i8,
@@ -1164,43 +732,84 @@ extern "C" {
         pbs_level: u32,
         pbs_base_log: u32,
         grouping_factor: u32,
-        num_blocks_in_radix: u32,
-        max_num_radix_in_vec: u32,
+        num_blocks_in_radix: u32,
+        max_num_radix_in_vec: u32,
+        message_modulus: u32,
+        carry_modulus: u32,
+        pbs_type: u32,
+        allocate_gpu_memory: bool,
+    );
+
+    pub fn cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
+        streams: *const *mut c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        radix_lwe_out: *mut c_void,
+        radix_lwe_vec: *const c_void,
+        num_radix_in_vec: u32,
+        mem_ptr: *mut i8,
+        bsks: *const *mut c_void,
+        ksks: *const *mut c_void,
+        num_blocks_in_radix: u32,
+    );
+
+    pub fn cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
+        streams: *const *mut c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr: *mut *mut i8,
+    );
+
+    pub fn scratch_cuda_integer_radix_overflowing_sub_kb_64(
+        streams: *const *mut c_void,
+        gpu_indexes: *const u32,
+        gpu_count: u32,
+        mem_ptr: *mut *mut i8,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        big_lwe_dimension: u32,
+        small_lwe_dimension: u32,
+        ks_level: u32,
+        ks_base_log: u32,
+        pbs_level: u32,
+        pbs_base_log: u32,
+        grouping_factor: u32,
+        num_blocks: u32,
         message_modulus: u32,
         carry_modulus: u32,
         pbs_type: u32,
         allocate_gpu_memory: bool,
     );
 
-    pub fn cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
+    pub fn cuda_integer_radix_overflowing_sub_kb_64(
         streams: *const *mut c_void,
         gpu_indexes: *const u32,
         gpu_count: u32,
         radix_lwe_out: *mut c_void,
-        radix_lwe_vec: *mut c_void,
-        num_radix_in_vec: u32,
+        radix_lwe_overflowed: *mut c_void,
+        radix_lwe_left: *const c_void,
+        radix_lwe_right: *const c_void,
         mem_ptr: *mut i8,
         bsks: *const *mut c_void,
         ksks: *const *mut c_void,
         num_blocks_in_radix: u32,
     );
 
-    pub fn cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
+    pub fn cleanup_cuda_integer_radix_overflowing_sub(
         streams: *const *mut c_void,
         gpu_indexes: *const u32,
         gpu_count: u32,
         mem_ptr: *mut *mut i8,
     );
 
-    pub fn scratch_cuda_integer_radix_overflowing_sub_kb_64(
+    pub fn scratch_cuda_integer_scalar_mul_kb_64(
         streams: *const *mut c_void,
         gpu_indexes: *const u32,
         gpu_count: u32,
         mem_ptr: *mut *mut i8,
         glwe_dimension: u32,
         polynomial_size: u32,
-        big_lwe_dimension: u32,
-        small_lwe_dimension: u32,
+        lwe_dimension: u32,
         ks_level: u32,
         ks_base_log: u32,
         pbs_level: u32,
@@ -1213,21 +822,24 @@ extern "C" {
         allocate_gpu_memory: bool,
     );
 
-    pub fn cuda_integer_radix_overflowing_sub_kb_64(
+    pub fn cuda_scalar_multiplication_integer_radix_ciphertext_64_inplace(
         streams: *const *mut c_void,
         gpu_indexes: *const u32,
         gpu_count: u32,
-        radix_lwe_out: *mut c_void,
-        radix_lwe_overflowed: *mut c_void,
-        radix_lwe_left: *const c_void,
-        radix_lwe_right: *const c_void,
+        lwe_array: *mut c_void,
+        decomposed_scalar: *const u64,
+        has_at_least_one_set: *const u64,
         mem_ptr: *mut i8,
         bsks: *const *mut c_void,
         ksks: *const *mut c_void,
+        lwe_dimension: u32,
+        polynomial_size: u32,
+        message_modulus: u32,
         num_blocks: u32,
+        num_scalars: u32,
     );
 
-    pub fn cleanup_cuda_integer_radix_overflowing_sub(
+    pub fn cleanup_cuda_integer_radix_scalar_mul(
         streams: *const *mut c_void,
         gpu_indexes: *const u32,
         gpu_count: u32,
@@ -1266,7 +878,7 @@ extern "C" {
         mem_ptr: *mut i8,
         bsks: *const *mut c_void,
         ksks: *const *mut c_void,
-        num_blocks: u32,
+        num_blocks_in_radix: u32,
     );
 
     pub fn cleanup_cuda_integer_div_rem(
@@ -1309,7 +921,7 @@ extern "C" {
         mem_ptr: *mut i8,
         bsks: *const *mut c_void,
         ksks: *const *mut c_void,
-        num_blocks: u32,
+        num_blocks_in_radix: u32,
     );
 
     pub fn cleanup_signed_overflowing_add_or_sub(
@@ -1318,6 +930,7 @@ extern "C" {
         gpu_count: u32,
         mem_ptr: *mut *mut i8,
     );
+
     pub fn scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
         streams: *const *mut c_void,
         gpu_indexes: *const u32,
@@ -1332,7 +945,7 @@ extern "C" {
         pbs_level: u32,
         pbs_base_log: u32,
         grouping_factor: u32,
-        num_blocks: u32,
+        num_radix_blocks: u32,
         message_modulus: u32,
         carry_modulus: u32,
         pbs_type: u32,
@@ -1363,8 +976,237 @@ extern "C" {
         streams: *const *mut c_void,
         gpu_indexes: *const u32,
         gpu_count: u32,
-        output_radix_lwe: *mut c_void,
+        lwe_array: *mut c_void,
         num_blocks: u32,
         lwe_size: u32,
     );
+
+    pub fn cuda_keyswitch_lwe_ciphertext_vector_64(
+        stream: *mut c_void,
+        gpu_index: u32,
+        lwe_array_out: *mut c_void,
+        lwe_output_indexes: *const c_void,
+        lwe_array_in: *const c_void,
+        lwe_input_indexes: *const c_void,
+        ksk: *const c_void,
+        lwe_dimension_in: u32,
+        lwe_dimension_out: u32,
+        base_log: u32,
+        level_count: u32,
+        num_samples: u32,
+    );
+
+    pub fn scratch_packing_keyswitch_lwe_list_to_glwe_64(
+        stream: *mut c_void,
+        gpu_index: u32,
+        fp_ks_buffer: *mut *mut i8,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        num_lwes: u32,
+        allocate_gpu_memory: bool,
+    );
+
+    pub fn cuda_packing_keyswitch_lwe_list_to_glwe_64(
+        stream: *mut c_void,
+        gpu_index: u32,
+        glwe_array_out: *mut c_void,
+        lwe_array_in: *const c_void,
+        fp_ksk_array: *const c_void,
+        fp_ks_buffer: *mut i8,
+        input_lwe_dimension: u32,
+        output_glwe_dimension: u32,
+        output_polynomial_size: u32,
+        base_log: u32,
+        level_count: u32,
+        num_lwes: u32,
+    );
+
+    pub fn cleanup_packing_keyswitch_lwe_list_to_glwe(
+        stream: *mut c_void,
+        gpu_index: u32,
+        fp_ks_buffer: *mut *mut i8,
+    );
+
+    pub fn cuda_negate_lwe_ciphertext_vector_64(
+        stream: *mut c_void,
+        gpu_index: u32,
+        lwe_array_out: *mut c_void,
+        lwe_array_in: *const c_void,
+        input_lwe_dimension: u32,
+        input_lwe_ciphertext_count: u32,
+    );
+
+    pub fn cuda_add_lwe_ciphertext_vector_64(
+        stream: *mut c_void,
+        gpu_index: u32,
+        lwe_array_out: *mut c_void,
+        lwe_array_in_1: *const c_void,
+        lwe_array_in_2: *const c_void,
+        input_lwe_dimension: u32,
+        input_lwe_ciphertext_count: u32,
+    );
+
+    pub fn cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
+        stream: *mut c_void,
+        gpu_index: u32,
+        lwe_array_out: *mut c_void,
+        lwe_array_in: *const c_void,
+        plaintext_array_in: *const c_void,
+        input_lwe_dimension: u32,
+        input_lwe_ciphertext_count: u32,
+    );
+
+    pub fn cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
+        stream: *mut c_void,
+        gpu_index: u32,
+        lwe_array_out: *mut c_void,
+        lwe_array_in: *const c_void,
+        cleartext_array_in: *const c_void,
+        input_lwe_dimension: u32,
+        input_lwe_ciphertext_count: u32,
+    );
+
+    pub fn cuda_fourier_polynomial_mul(
+        stream: *mut c_void,
+        gpu_index: u32,
+        input1: *const c_void,
+        input2: *const c_void,
+        output: *mut c_void,
+        polynomial_size: u32,
+        total_polynomials: u32,
+    );
+
+    pub fn cuda_convert_lwe_programmable_bootstrap_key_64(
+        stream: *mut c_void,
+        gpu_index: u32,
+        dest: *mut c_void,
+        src: *const c_void,
+        input_lwe_dim: u32,
+        glwe_dim: u32,
+        level_count: u32,
+        polynomial_size: u32,
+    );
+
+    pub fn scratch_cuda_programmable_bootstrap_amortized_64(
+        stream: *mut c_void,
+        gpu_index: u32,
+        pbs_buffer: *mut *mut i8,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        input_lwe_ciphertext_count: u32,
+        allocate_gpu_memory: bool,
+    );
+
+    pub fn cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
+        stream: *mut c_void,
+        gpu_index: u32,
+        lwe_array_out: *mut c_void,
+        lwe_output_indexes: *const c_void,
+        lut_vector: *const c_void,
+        lut_vector_indexes: *const c_void,
+        lwe_array_in: *const c_void,
+        lwe_input_indexes: *const c_void,
+        bootstrapping_key: *const c_void,
+        pbs_buffer: *mut i8,
+        lwe_dimension: u32,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        base_log: u32,
+        level_count: u32,
+        num_samples: u32,
+    );
+
+    pub fn cleanup_cuda_programmable_bootstrap_amortized(
+        stream: *mut c_void,
+        gpu_index: u32,
+        pbs_buffer: *mut *mut i8,
+    );
+
+    pub fn scratch_cuda_programmable_bootstrap_64(
+        stream: *mut c_void,
+        gpu_index: u32,
+        pbs_buffer: *mut *mut i8,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        level_count: u32,
+        input_lwe_ciphertext_count: u32,
+        allocate_gpu_memory: bool,
+    );
+
+    pub fn cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
+        stream: *mut c_void,
+        gpu_index: u32,
+        lwe_array_out: *mut c_void,
+        lwe_output_indexes: *const c_void,
+        lut_vector: *const c_void,
+        lut_vector_indexes: *const c_void,
+        lwe_array_in: *const c_void,
+        lwe_input_indexes: *const c_void,
+        bootstrapping_key: *const c_void,
+        buffer: *mut i8,
+        lwe_dimension: u32,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        base_log: u32,
+        level_count: u32,
+        num_samples: u32,
+    );
+
+    pub fn cleanup_cuda_programmable_bootstrap(
+        stream: *mut c_void,
+        gpu_index: u32,
+        pbs_buffer: *mut *mut i8,
+    );
+
+    pub fn cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
+        stream: *mut c_void,
+        gpu_index: u32,
+        dest: *mut c_void,
+        src: *const c_void,
+        input_lwe_dim: u32,
+        glwe_dim: u32,
+        level_count: u32,
+        polynomial_size: u32,
+        grouping_factor: u32,
+    );
+
+    pub fn scratch_cuda_multi_bit_programmable_bootstrap_64(
+        stream: *mut c_void,
+        gpu_index: u32,
+        pbs_buffer: *mut *mut i8,
+        lwe_dimension: u32,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        level_count: u32,
+        grouping_factor: u32,
+        input_lwe_ciphertext_count: u32,
+        allocate_gpu_memory: bool,
+    );
+
+    pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
+        stream: *mut c_void,
+        gpu_index: u32,
+        lwe_array_out: *mut c_void,
+        lwe_output_indexes: *const c_void,
+        lut_vector: *const c_void,
+        lut_vector_indexes: *const c_void,
+        lwe_array_in: *const c_void,
+        lwe_input_indexes: *const c_void,
+        bootstrapping_key: *const c_void,
+        buffer: *mut i8,
+        lwe_dimension: u32,
+        glwe_dimension: u32,
+        polynomial_size: u32,
+        grouping_factor: u32,
+        base_log: u32,
+        level_count: u32,
+        num_samples: u32,
+    );
+
+    pub fn cleanup_cuda_multi_bit_programmable_bootstrap(
+        stream: *mut c_void,
+        gpu_index: u32,
+        pbs_buffer: *mut *mut i8,
+    );
+
 } // extern "C"