Skip to content

Commit

Permalink
fix(gpu): fix compression on multi-gpu
Browse files Browse the repository at this point in the history
  • Loading branch information
pdroalves committed Sep 5, 2024
1 parent 358bcc9 commit 6f9337e
Show file tree
Hide file tree
Showing 12 changed files with 158 additions and 93 deletions.
1 change: 1 addition & 0 deletions .github/workflows/gpu_fast_h100_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@ jobs:
- name: Run core crypto and internal CUDA backend tests
run: |
BIG_TESTS_INSTANCE=TRUE make test_core_crypto_gpu
BIG_TESTS_INSTANCE=TRUE make test_integer_compression_gpu
BIG_TESTS_INSTANCE=TRUE make test_cuda_backend
- name: Run user docs tests
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/gpu_fast_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ jobs:
- name: Run core crypto and internal CUDA backend tests
run: |
make test_core_crypto_gpu
make test_integer_compression_gpu
make test_cuda_backend
- name: Run user docs tests
Expand Down
4 changes: 4 additions & 0 deletions .github/workflows/gpu_full_multi_gpu_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,10 @@ jobs:
if: ${{ !cancelled() }}
run: nvidia-smi

- name: Run multi-bit CUDA integer compression tests
run: |
BIG_TESTS_INSTANCE=TRUE make test_integer_compression_gpu
# No need to test core_crypto and classic PBS in integer since it's already tested on single GPU.
- name: Run multi-bit CUDA integer tests
run: |
Expand Down
7 changes: 7 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -481,6 +481,13 @@ test_integer_gpu: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::

.PHONY: test_integer_compression_gpu # Run the tests for integer compression on gpu backend
test_integer_compression_gpu: install_rs_build_toolchain
	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::ciphertext --test-threads=6
	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::ciphertext::compress

.PHONY: test_integer_gpu_ci # Run the tests for integer ci on gpu backend
test_integer_gpu_ci: install_rs_check_toolchain install_cargo_nextest
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
Expand Down
2 changes: 1 addition & 1 deletion backends/tfhe-cuda-backend/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ add_compile_definitions(CUDA_ARCH=${CUDA_ARCH})

# in production, should use -arch=sm_70 --ptxas-options=-v to see register spills -lineinfo for better debugging
# NOTE(review): this commit swapped -O3 for -O0 -G -g. -G disables all device
# optimizations and -O0/-g are debug flags — almost certainly leftover local
# debugging settings that must not ship in a release build. Restoring -O3.
# For occasional debugging, prefer passing -G -g via an explicit Debug build
# type instead of hard-coding it here.
set(CMAKE_CUDA_FLAGS
    "${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} -O3 \
-std=c++17 --no-exceptions --expt-relaxed-constexpr -rdc=true \
--use_fast_math -Xcompiler -fPIC")

Expand Down
11 changes: 7 additions & 4 deletions backends/tfhe-cuda-backend/cuda/include/compression.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ void scratch_cuda_integer_decompress_radix_ciphertext_64(
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t storage_log_modulus, bool allocate_gpu_memory);
PBS_TYPE pbs_type, uint32_t storage_log_modulus, uint32_t bodies_count,
bool allocate_gpu_memory);

void cuda_integer_compress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Expand Down Expand Up @@ -94,6 +95,7 @@ template <typename Torus> struct int_decompression {

uint32_t storage_log_modulus;

uint32_t num_lwes;
uint32_t body_count;

Torus *tmp_extracted_glwe;
Expand All @@ -104,12 +106,13 @@ template <typename Torus> struct int_decompression {
int_decompression(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, int_radix_params encryption_params,
int_radix_params compression_params,
uint32_t num_radix_blocks, uint32_t storage_log_modulus,
bool allocate_gpu_memory) {
uint32_t num_radix_blocks, uint32_t bodies_count,
uint32_t storage_log_modulus, bool allocate_gpu_memory) {
this->encryption_params = encryption_params;
this->compression_params = compression_params;
this->storage_log_modulus = storage_log_modulus;
this->body_count = num_radix_blocks;
this->num_lwes = num_radix_blocks;
this->body_count = bodies_count;

if (allocate_gpu_memory) {
Torus glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,24 +25,25 @@ void scratch_cuda_integer_decompress_radix_ciphertext_64(
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t storage_log_modulus, bool allocate_gpu_memory) {
PBS_TYPE pbs_type, uint32_t storage_log_modulus, uint32_t bodies_count,
bool allocate_gpu_memory) {

// Decompression doesn't keyswitch, so big and small dimensions are the same
int_radix_params encryption_params(
pbs_type, encryption_glwe_dimension, encryption_polynomial_size,
(encryption_glwe_dimension + 1) * encryption_polynomial_size,
lwe_dimension, 0, 0, pbs_level, pbs_base_log, 0, message_modulus,
carry_modulus);
lwe_dimension, lwe_dimension, 0, 0, pbs_level, pbs_base_log, 0,
message_modulus, carry_modulus);

int_radix_params compression_params(
pbs_type, compression_glwe_dimension, compression_polynomial_size,
(compression_glwe_dimension + 1) * compression_polynomial_size,
lwe_dimension, 0, 0, pbs_level, pbs_base_log, 0, message_modulus,
carry_modulus);
lwe_dimension, compression_glwe_dimension * compression_polynomial_size,
0, 0, pbs_level, pbs_base_log, 0, message_modulus, carry_modulus);

scratch_cuda_integer_decompress_radix_ciphertext_64(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_decompression<uint64_t> **)mem_ptr, num_lwes, encryption_params,
compression_params, storage_log_modulus, allocate_gpu_memory);
(int_decompression<uint64_t> **)mem_ptr, num_lwes, bodies_count,
encryption_params, compression_params, storage_log_modulus,
allocate_gpu_memory);
}
void cuda_integer_compress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@ __host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,
auto log_modulus = mem_ptr->storage_log_modulus;
auto in_len = params.glwe_dimension * params.polynomial_size + body_count;
auto number_bits_to_pack = in_len * log_modulus;

auto nbits = sizeof(Torus) * 8;
// number_bits_to_pack.div_ceil(Scalar::BITS)
auto len = (number_bits_to_pack + nbits - 1) / nbits;
Expand Down Expand Up @@ -80,6 +79,7 @@ __host__ void host_integer_compress(cudaStream_t *streams,
uint32_t glwe_out_size = (compression_params.glwe_dimension + 1) *
compression_params.polynomial_size;
uint32_t num_glwes = num_lwes / mem_ptr->lwe_per_glwe + 1;
auto body_count = min(num_lwes, mem_ptr->lwe_per_glwe);

// Keyswitch LWEs to GLWE
auto tmp_glwe_array_out = mem_ptr->tmp_glwe_array_out;
Expand All @@ -92,11 +92,9 @@ __host__ void host_integer_compress(cudaStream_t *streams,
streams[0], gpu_indexes[0], glwe_out, lwe_subset, fp_ksk[0],
fp_ks_buffer, input_lwe_dimension, compression_params.glwe_dimension,
compression_params.polynomial_size, compression_params.ks_base_log,
compression_params.ks_level, min(num_lwes, mem_ptr->lwe_per_glwe));
compression_params.ks_level, body_count);
}

auto body_count = min(num_lwes, mem_ptr->lwe_per_glwe);

// Modulus switch
host_modulus_switch_inplace(streams[0], gpu_indexes[0], tmp_glwe_array_out,
num_glwes *
Expand Down Expand Up @@ -156,8 +154,9 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
auto log_modulus = mem_ptr->storage_log_modulus;

uint32_t body_count = mem_ptr->body_count;

auto initial_out_len =
params.glwe_dimension * params.polynomial_size + body_count * body_count;
params.glwe_dimension * params.polynomial_size + body_count;

// We assure the tail of the glwe is zeroed
auto zeroed_slice =
Expand Down Expand Up @@ -187,7 +186,7 @@ host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes,
host_extract(streams[0], gpu_indexes[0], extracted_glwe, packed_glwe_in, 0,
mem_ptr);

auto num_lwes = mem_ptr->body_count;
auto num_lwes = mem_ptr->num_lwes;

// Sample extract
auto extracted_lwe = mem_ptr->tmp_extracted_lwe;
Expand All @@ -199,17 +198,58 @@ host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes,
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
auto encryption_params = mem_ptr->encryption_params;
auto carry_extract_lut = mem_ptr->carry_extract_lut;
execute_pbs_async<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out,
carry_extract_lut->lwe_indexes_out, carry_extract_lut->lut_vec,
carry_extract_lut->lut_indexes_vec, extracted_lwe,
carry_extract_lut->lwe_indexes_in, bsks, carry_extract_lut->buffer,
encryption_params.glwe_dimension,
compression_params.glwe_dimension * compression_params.polynomial_size,
encryption_params.polynomial_size, encryption_params.pbs_base_log,
encryption_params.pbs_level, encryption_params.grouping_factor, num_lwes,
encryption_params.pbs_type);
auto lut = mem_ptr->carry_extract_lut;
auto active_gpu_count = get_active_gpu_count(num_lwes, gpu_count);
if (active_gpu_count == 1) {
execute_pbs_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_out,
lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec, extracted_lwe,
lut->lwe_indexes_in, bsks, lut->buffer,
encryption_params.glwe_dimension,
compression_params.small_lwe_dimension,
encryption_params.polynomial_size, encryption_params.pbs_base_log,
encryption_params.pbs_level, encryption_params.grouping_factor,
num_lwes, encryption_params.pbs_type);
} else {
/// For multi GPU execution we create vectors of pointers for inputs and
/// outputs
std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;

/// Make sure all data that should be on GPU 0 is indeed there
cuda_synchronize_stream(streams[0], gpu_indexes[0]);

/// With multiple GPUs we push to the vectors on each GPU then when we
/// gather data to GPU 0 we can copy back to the original indexing
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec, extracted_lwe,
lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_lwes,
compression_params.small_lwe_dimension + 1);

/// Apply PBS
execute_pbs_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
lwe_array_in_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
encryption_params.glwe_dimension,
compression_params.small_lwe_dimension,
encryption_params.polynomial_size, encryption_params.pbs_base_log,
encryption_params.pbs_level, encryption_params.grouping_factor,
num_lwes, encryption_params.pbs_type);

/// Copy data back to GPU 0 and release vecs
multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
lwe_array_out, lwe_after_pbs_vec,
lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes, num_lwes,
encryption_params.big_lwe_dimension + 1);

/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}
}

template <typename Torus>
// Allocates and initializes the decompression scratch buffer.
//
// The diff residue here contained both the pre-change parameter list
// (num_lwes only) and the post-change one (num_lwes + bodies_count); this is
// the reconstructed post-change version. `num_lwes` is the number of LWEs to
// produce on decompression, while `bodies_count` is the number of bodies
// stored in the packed GLWE — the two differ on multi-GPU / multi-GLWE
// inputs, which is what this commit fixes.
//
// All parameters are forwarded verbatim to the int_decompression<Torus>
// constructor; memory is only allocated when allocate_gpu_memory is true
// (per the constructor signature visible in compression.h).
template <typename Torus>
__host__ void scratch_cuda_integer_decompress_radix_ciphertext_64(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    int_decompression<Torus> **mem_ptr, uint32_t num_lwes,
    uint32_t bodies_count, int_radix_params encryption_params,
    int_radix_params compression_params, uint32_t storage_log_modulus,
    bool allocate_gpu_memory) {

  *mem_ptr = new int_decompression<Torus>(
      streams, gpu_indexes, gpu_count, encryption_params, compression_params,
      num_lwes, bodies_count, storage_log_modulus, allocate_gpu_memory);
}
#endif
1 change: 1 addition & 0 deletions backends/tfhe-cuda-backend/src/cuda_bind.rs
Original file line number Diff line number Diff line change
Expand Up @@ -553,6 +553,7 @@ extern "C" {
carry_modulus: u32,
pbs_type: u32,
storage_log_modulus: u32,
bodies_count: u32,
allocate_gpu_memory: bool,
);

Expand Down
Loading

0 comments on commit 6f9337e

Please sign in to comment.