Skip to content

Commit

Permalink
fix(gpu): fix compression on multi-gpu
Browse files Browse the repository at this point in the history
  • Loading branch information
pdroalves committed Sep 5, 2024
1 parent 358bcc9 commit 6f9337e
Show file tree
Hide file tree
Showing 12 changed files with 158 additions and 93 deletions.
1 change: 1 addition & 0 deletions .github/workflows/gpu_fast_h100_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@ jobs:
- name: Run core crypto and internal CUDA backend tests
run: |
BIG_TESTS_INSTANCE=TRUE make test_core_crypto_gpu
BIG_TESTS_INSTANCE=TRUE make test_integer_compression_gpu
BIG_TESTS_INSTANCE=TRUE make test_cuda_backend
- name: Run user docs tests
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/gpu_fast_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ jobs:
- name: Run core crypto and internal CUDA backend tests
run: |
make test_core_crypto_gpu
make test_integer_compression_gpu
make test_cuda_backend
- name: Run user docs tests
Expand Down
4 changes: 4 additions & 0 deletions .github/workflows/gpu_full_multi_gpu_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,10 @@ jobs:
if: ${{ !cancelled() }}
run: nvidia-smi

- name: Run multi-bit CUDA integer compression tests
run: |
BIG_TESTS_INSTANCE=TRUE make test_integer_compression_gpu
# No need to test core_crypto and classic PBS in integer since it's already tested on single GPU.
- name: Run multi-bit CUDA integer tests
run: |
Expand Down
7 changes: 7 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -481,6 +481,13 @@ test_integer_gpu: install_rs_build_toolchain
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::server_key::

.PHONY: test_integer_compression_gpu # Run the tests for integer compression on gpu backend
test_integer_compression_gpu: install_rs_build_toolchain
	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile $(CARGO_PROFILE) \
		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::ciphertext --test-threads=6
	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --doc --profile $(CARGO_PROFILE) \
		--features=$(TARGET_ARCH_FEATURE),integer,gpu -p $(TFHE_SPEC) -- integer::gpu::ciphertext::compress

.PHONY: test_integer_gpu_ci # Run the tests for integer ci on gpu backend
test_integer_gpu_ci: install_rs_check_toolchain install_cargo_nextest
BIG_TESTS_INSTANCE="$(BIG_TESTS_INSTANCE)" \
Expand Down
2 changes: 1 addition & 1 deletion backends/tfhe-cuda-backend/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ add_compile_definitions(CUDA_ARCH=${CUDA_ARCH})

# in production, should use -arch=sm_70 --ptxas-options=-v to see register spills -lineinfo for better debugging
# NOTE(review): this commit swapped -O3 for -O0 -G -g. -G disables all device
# optimizations and -O0/-g are debug flags — almost certainly leftover local
# debugging settings that must not ship in a release build. Restoring -O3.
# For occasional debugging, prefer passing -G -g via an explicit Debug build
# type instead of hard-coding it here.
set(CMAKE_CUDA_FLAGS
    "${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} -O3 \
-std=c++17 --no-exceptions --expt-relaxed-constexpr -rdc=true \
--use_fast_math -Xcompiler -fPIC")

Expand Down
11 changes: 7 additions & 4 deletions backends/tfhe-cuda-backend/cuda/include/compression.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ void scratch_cuda_integer_decompress_radix_ciphertext_64(
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t storage_log_modulus, bool allocate_gpu_memory);
PBS_TYPE pbs_type, uint32_t storage_log_modulus, uint32_t bodies_count,
bool allocate_gpu_memory);

void cuda_integer_compress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Expand Down Expand Up @@ -94,6 +95,7 @@ template <typename Torus> struct int_decompression {

uint32_t storage_log_modulus;

uint32_t num_lwes;
uint32_t body_count;

Torus *tmp_extracted_glwe;
Expand All @@ -104,12 +106,13 @@ template <typename Torus> struct int_decompression {
int_decompression(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, int_radix_params encryption_params,
int_radix_params compression_params,
uint32_t num_radix_blocks, uint32_t storage_log_modulus,
bool allocate_gpu_memory) {
uint32_t num_radix_blocks, uint32_t bodies_count,
uint32_t storage_log_modulus, bool allocate_gpu_memory) {
this->encryption_params = encryption_params;
this->compression_params = compression_params;
this->storage_log_modulus = storage_log_modulus;
this->body_count = num_radix_blocks;
this->num_lwes = num_radix_blocks;
this->body_count = bodies_count;

if (allocate_gpu_memory) {
Torus glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,24 +25,25 @@ void scratch_cuda_integer_decompress_radix_ciphertext_64(
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t storage_log_modulus, bool allocate_gpu_memory) {
PBS_TYPE pbs_type, uint32_t storage_log_modulus, uint32_t bodies_count,
bool allocate_gpu_memory) {

// Decompression doesn't keyswitch, so big and small dimensions are the same
int_radix_params encryption_params(
pbs_type, encryption_glwe_dimension, encryption_polynomial_size,
(encryption_glwe_dimension + 1) * encryption_polynomial_size,
lwe_dimension, 0, 0, pbs_level, pbs_base_log, 0, message_modulus,
carry_modulus);
lwe_dimension, lwe_dimension, 0, 0, pbs_level, pbs_base_log, 0,
message_modulus, carry_modulus);

int_radix_params compression_params(
pbs_type, compression_glwe_dimension, compression_polynomial_size,
(compression_glwe_dimension + 1) * compression_polynomial_size,
lwe_dimension, 0, 0, pbs_level, pbs_base_log, 0, message_modulus,
carry_modulus);
lwe_dimension, compression_glwe_dimension * compression_polynomial_size,
0, 0, pbs_level, pbs_base_log, 0, message_modulus, carry_modulus);

scratch_cuda_integer_decompress_radix_ciphertext_64(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_decompression<uint64_t> **)mem_ptr, num_lwes, encryption_params,
compression_params, storage_log_modulus, allocate_gpu_memory);
(int_decompression<uint64_t> **)mem_ptr, num_lwes, bodies_count,
encryption_params, compression_params, storage_log_modulus,
allocate_gpu_memory);
}
void cuda_integer_compress_radix_ciphertext_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@ __host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,
auto log_modulus = mem_ptr->storage_log_modulus;
auto in_len = params.glwe_dimension * params.polynomial_size + body_count;
auto number_bits_to_pack = in_len * log_modulus;

auto nbits = sizeof(Torus) * 8;
// number_bits_to_pack.div_ceil(Scalar::BITS)
auto len = (number_bits_to_pack + nbits - 1) / nbits;
Expand Down Expand Up @@ -80,6 +79,7 @@ __host__ void host_integer_compress(cudaStream_t *streams,
uint32_t glwe_out_size = (compression_params.glwe_dimension + 1) *
compression_params.polynomial_size;
uint32_t num_glwes = num_lwes / mem_ptr->lwe_per_glwe + 1;
auto body_count = min(num_lwes, mem_ptr->lwe_per_glwe);

// Keyswitch LWEs to GLWE
auto tmp_glwe_array_out = mem_ptr->tmp_glwe_array_out;
Expand All @@ -92,11 +92,9 @@ __host__ void host_integer_compress(cudaStream_t *streams,
streams[0], gpu_indexes[0], glwe_out, lwe_subset, fp_ksk[0],
fp_ks_buffer, input_lwe_dimension, compression_params.glwe_dimension,
compression_params.polynomial_size, compression_params.ks_base_log,
compression_params.ks_level, min(num_lwes, mem_ptr->lwe_per_glwe));
compression_params.ks_level, body_count);
}

auto body_count = min(num_lwes, mem_ptr->lwe_per_glwe);

// Modulus switch
host_modulus_switch_inplace(streams[0], gpu_indexes[0], tmp_glwe_array_out,
num_glwes *
Expand Down Expand Up @@ -156,8 +154,9 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
auto log_modulus = mem_ptr->storage_log_modulus;

uint32_t body_count = mem_ptr->body_count;

auto initial_out_len =
params.glwe_dimension * params.polynomial_size + body_count * body_count;
params.glwe_dimension * params.polynomial_size + body_count;

// We assure the tail of the glwe is zeroed
auto zeroed_slice =
Expand Down Expand Up @@ -187,7 +186,7 @@ host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes,
host_extract(streams[0], gpu_indexes[0], extracted_glwe, packed_glwe_in, 0,
mem_ptr);

auto num_lwes = mem_ptr->body_count;
auto num_lwes = mem_ptr->num_lwes;

// Sample extract
auto extracted_lwe = mem_ptr->tmp_extracted_lwe;
Expand All @@ -199,17 +198,58 @@ host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes,
/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
auto encryption_params = mem_ptr->encryption_params;
auto carry_extract_lut = mem_ptr->carry_extract_lut;
execute_pbs_async<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out,
carry_extract_lut->lwe_indexes_out, carry_extract_lut->lut_vec,
carry_extract_lut->lut_indexes_vec, extracted_lwe,
carry_extract_lut->lwe_indexes_in, bsks, carry_extract_lut->buffer,
encryption_params.glwe_dimension,
compression_params.glwe_dimension * compression_params.polynomial_size,
encryption_params.polynomial_size, encryption_params.pbs_base_log,
encryption_params.pbs_level, encryption_params.grouping_factor, num_lwes,
encryption_params.pbs_type);
auto lut = mem_ptr->carry_extract_lut;
auto active_gpu_count = get_active_gpu_count(num_lwes, gpu_count);
if (active_gpu_count == 1) {
execute_pbs_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_out,
lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec, extracted_lwe,
lut->lwe_indexes_in, bsks, lut->buffer,
encryption_params.glwe_dimension,
compression_params.small_lwe_dimension,
encryption_params.polynomial_size, encryption_params.pbs_base_log,
encryption_params.pbs_level, encryption_params.grouping_factor,
num_lwes, encryption_params.pbs_type);
} else {
/// For multi GPU execution we create vectors of pointers for inputs and
/// outputs
std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;

/// Make sure all data that should be on GPU 0 is indeed there
cuda_synchronize_stream(streams[0], gpu_indexes[0]);

/// With multiple GPUs we push to the vectors on each GPU then when we
/// gather data to GPU 0 we can copy back to the original indexing
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec, extracted_lwe,
lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_lwes,
compression_params.small_lwe_dimension + 1);

/// Apply PBS
execute_pbs_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
lwe_array_in_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
encryption_params.glwe_dimension,
compression_params.small_lwe_dimension,
encryption_params.polynomial_size, encryption_params.pbs_base_log,
encryption_params.pbs_level, encryption_params.grouping_factor,
num_lwes, encryption_params.pbs_type);

/// Copy data back to GPU 0 and release vecs
multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
lwe_array_out, lwe_after_pbs_vec,
lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes, num_lwes,
encryption_params.big_lwe_dimension + 1);

/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}
}

template <typename Torus>
// Allocates and initializes the decompression scratch buffer.
//
// The diff residue here contained both the pre-change parameter list
// (num_lwes only) and the post-change one (num_lwes + bodies_count); this is
// the reconstructed post-change version. `num_lwes` is the number of LWEs to
// produce on decompression, while `bodies_count` is the number of bodies
// stored in the packed GLWE — the two differ on multi-GPU / multi-GLWE
// inputs, which is what this commit fixes.
//
// All parameters are forwarded verbatim to the int_decompression<Torus>
// constructor; memory is only allocated when allocate_gpu_memory is true
// (per the constructor signature visible in compression.h).
template <typename Torus>
__host__ void scratch_cuda_integer_decompress_radix_ciphertext_64(
    cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    int_decompression<Torus> **mem_ptr, uint32_t num_lwes,
    uint32_t bodies_count, int_radix_params encryption_params,
    int_radix_params compression_params, uint32_t storage_log_modulus,
    bool allocate_gpu_memory) {

  *mem_ptr = new int_decompression<Torus>(
      streams, gpu_indexes, gpu_count, encryption_params, compression_params,
      num_lwes, bodies_count, storage_log_modulus, allocate_gpu_memory);
}
#endif
1 change: 1 addition & 0 deletions backends/tfhe-cuda-backend/src/cuda_bind.rs
Original file line number Diff line number Diff line change
Expand Up @@ -553,6 +553,7 @@ extern "C" {
carry_modulus: u32,
pbs_type: u32,
storage_log_modulus: u32,
bodies_count: u32,
allocate_gpu_memory: bool,
);

Expand Down
Loading

0 comments on commit 6f9337e

Please sign in to comment.