Skip to content

Commit

Permalink
feat(gpu): implement fhe rand on gpu
Browse files Browse the repository at this point in the history
  • Loading branch information
guillermo-oyarzun committed Jan 14, 2025
1 parent 3d19986 commit 5d54d65
Show file tree
Hide file tree
Showing 15 changed files with 1,099 additions and 73 deletions.
5 changes: 5 additions & 0 deletions backends/tfhe-cuda-backend/cuda/include/linear_algebra.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,11 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_array_in, void const *cleartext_array_in,
uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_plaintext_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_array_in, const uint64_t plaintext_in,
const uint32_t input_lwe_dimension,
const uint32_t input_lwe_ciphertext_count);
}

#endif // CUDA_LINALG_H_
38 changes: 38 additions & 0 deletions backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cu
Original file line number Diff line number Diff line change
Expand Up @@ -114,3 +114,41 @@ void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
static_cast<const uint64_t *>(plaintext_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
}

/*
* Perform the addition of a u64 input LWE ciphertext vector with a u64 input
* plaintext scalar.
* - `stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - `lwe_array_out` is an array of size
* `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have
* been allocated on the GPU before calling this function, and that will hold
* the result of the computation.
* - `lwe_array_in` is the LWE ciphertext vector used as input, it should have
* been allocated and initialized before calling this function. It has the same
* size as the output array.
* - `plaintext_in` is the plaintext used as input.
* - `input_lwe_dimension` is the number of mask elements in the input and
* output LWE ciphertext vectors
* - `input_lwe_ciphertext_count` is the number of ciphertexts contained in the
* input LWE ciphertext vector, as well as in the output.
*
* The same input plaintext is added to the body of the
* LWE ciphertexts in the LWE ciphertext vector. The result of the
* operation is stored in the output LWE ciphertext vector. The two input
* vectors are unchanged. This function is a wrapper to a device function that
* performs the operation on the GPU.
*/
void cuda_add_lwe_ciphertext_vector_plaintext_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_array_in, const uint64_t plaintext_in,
const uint32_t input_lwe_dimension,
const uint32_t input_lwe_ciphertext_count) {

host_addition_plaintext_scalar<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_array_in), plaintext_in,
input_lwe_dimension, input_lwe_ciphertext_count);
}
36 changes: 36 additions & 0 deletions backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,21 @@ plaintext_addition(T *output, T const *lwe_input, T const *plaintext_input,
}
}

template <typename T>
__global__ void plaintext_addition_scalar(T *output, T const *lwe_input,
const T plaintext_input,
const uint32_t input_lwe_dimension,
const uint32_t num_entries) {

int tid = threadIdx.x;
int lwe_index = blockIdx.x * blockDim.x + tid;
if (lwe_index < num_entries) {
int index = lwe_index * (input_lwe_dimension + 1) + input_lwe_dimension;
// Here we take advantage of the wrapping behaviour of uint
output[index] = lwe_input[index] + plaintext_input;
}
}

template <typename T>
__host__ void
host_addition_plaintext(cudaStream_t stream, uint32_t gpu_index, T *output,
Expand All @@ -48,6 +63,27 @@ host_addition_plaintext(cudaStream_t stream, uint32_t gpu_index, T *output,
check_cuda_error(cudaGetLastError());
}

template <typename T>
__host__ void host_addition_plaintext_scalar(
cudaStream_t stream, uint32_t gpu_index, T *output, T const *lwe_input,
const T plaintext_input, const uint32_t lwe_dimension,
const uint32_t lwe_ciphertext_count) {

cudaSetDevice(gpu_index);
int num_blocks = 0, num_threads = 0;
int num_entries = lwe_ciphertext_count;
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);

cuda_memcpy_async_gpu_to_gpu(
output, lwe_input, (lwe_dimension + 1) * lwe_ciphertext_count * sizeof(T),
stream, gpu_index);
plaintext_addition_scalar<T><<<grid, thds, 0, stream>>>(
output, lwe_input, plaintext_input, lwe_dimension, num_entries);
check_cuda_error(cudaGetLastError());
}

template <typename T>
__global__ void addition(T *output, T const *input_1, T const *input_2,
uint32_t num_entries) {
Expand Down
11 changes: 11 additions & 0 deletions backends/tfhe-cuda-backend/src/bindings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1345,6 +1345,17 @@ extern "C" {
input_lwe_ciphertext_count: u32,
);
}
extern "C" {
pub fn cuda_add_lwe_ciphertext_vector_plaintext_64(
stream: *mut ffi::c_void,
gpu_index: u32,
lwe_array_out: *mut ffi::c_void,
lwe_array_in: *const ffi::c_void,
plaintext_in: u64,
input_lwe_dimension: u32,
input_lwe_ciphertext_count: u32,
);
}
extern "C" {
pub fn cuda_fourier_polynomial_mul(
stream: *mut ffi::c_void,
Expand Down
80 changes: 79 additions & 1 deletion tfhe/benches/integer/bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1307,11 +1307,12 @@ define_server_key_bench_default_fn!(
#[cfg(feature = "gpu")]
mod cuda {
use super::*;
use criterion::criterion_group;
use criterion::{black_box, criterion_group};
use tfhe::core_crypto::gpu::CudaStreams;
use tfhe::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
use tfhe::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext;
use tfhe::integer::gpu::server_key::CudaServerKey;
use tfhe_csprng::seeders::Seed;

fn bench_cuda_server_key_unary_function_clean_inputs<F>(
c: &mut Criterion,
Expand Down Expand Up @@ -1731,6 +1732,81 @@ mod cuda {
bench_group.finish()
}

pub fn cuda_unsigned_oprf(c: &mut Criterion) {
let bench_name = "integer::cuda::unsigned_oprf";

let mut bench_group = c.benchmark_group(bench_name);
bench_group
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(30));

let stream = CudaStreams::new_multi_gpu();

for (param, num_block, bit_size) in ParamsAndNumBlocksIter::default() {
let param_name = param.name();

let bench_id;

match BENCH_TYPE.get().unwrap() {
BenchmarkType::Latency => {
bench_id = format!("{bench_name}::{param_name}::{bit_size}_bits");
bench_group.bench_function(&bench_id, |b| {
let (cks, _cpu_sks) =
KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let gpu_sks = CudaServerKey::new(&cks, &stream);

b.iter(|| {
_ = black_box(
gpu_sks
.par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
Seed(0),
bit_size as u64,
num_block as u64,
&stream,
),
);
})
});
}
BenchmarkType::Throughput => {
bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
let elements = throughput_num_threads(num_block);
bench_group.throughput(Throughput::Elements(elements));

bench_group.bench_function(&bench_id, |b| {
let (cks, _cpu_sks) =
KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let gpu_sks = CudaServerKey::new(&cks, &stream);

b.iter(|| {
(0..elements).into_par_iter().for_each(|_| {
gpu_sks
.par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
Seed(0),
bit_size as u64,
num_block as u64,
&stream,
);
})
})
});
}
}

write_to_json::<u64, _>(
&bench_id,
param,
param.name(),
"oprf",
&OperatorType::Atomic,
bit_size as u32,
vec![param.message_modulus().0.ilog2(); num_block],
);
}

bench_group.finish()
}

macro_rules! define_cuda_server_key_bench_clean_input_unary_fn (
(method_name: $server_key_method:ident, display_name:$name:ident) => {
::paste::paste!{
Expand Down Expand Up @@ -2376,6 +2452,7 @@ mod cuda {
cuda_trailing_zeros,
cuda_trailing_ones,
cuda_ilog2,
cuda_unsigned_oprf,
);

criterion_group!(
Expand All @@ -2395,6 +2472,7 @@ mod cuda {
cuda_scalar_mul,
cuda_scalar_div,
cuda_scalar_rem,
cuda_unsigned_oprf,
);

criterion_group!(
Expand Down
25 changes: 25 additions & 0 deletions tfhe/src/core_crypto/gpu/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -483,6 +483,31 @@ pub unsafe fn add_lwe_ciphertext_vector_plaintext_vector_async<T: UnsignedIntege
);
}

/// Addition of a vector of LWE ciphertexts with a plaintext scalar
///
/// # Safety
///
/// [CudaStreams::synchronize] __must__ be called as soon as synchronization is
/// required
pub unsafe fn add_lwe_ciphertext_vector_plaintext_scalar_async<T: UnsignedInteger>(
streams: &CudaStreams,
lwe_array_out: &mut CudaVec<T>,
lwe_array_in: &CudaVec<T>,
plaintext_in: u64,
lwe_dimension: LweDimension,
num_samples: u32,
) {
cuda_add_lwe_ciphertext_vector_plaintext_64(
streams.ptr[0],
streams.gpu_indexes[0].0,
lwe_array_out.as_mut_c_ptr(0),
lwe_array_in.as_c_ptr(0),
plaintext_in,
lwe_dimension.0 as u32,
num_samples,
);
}

/// Assigned addition of a vector of LWE ciphertexts with a vector of plaintexts
///
/// # Safety
Expand Down
34 changes: 26 additions & 8 deletions tfhe/src/high_level_api/booleans/oprf.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
use super::FheBool;
use super::{FheBool, InnerBoolean};
use crate::high_level_api::global_state;
#[cfg(feature = "gpu")]
use crate::high_level_api::global_state::with_thread_local_cuda_streams;
use crate::high_level_api::keys::InternalServerKey;
#[cfg(feature = "gpu")]
use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
#[cfg(feature = "gpu")]
use crate::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext;
use crate::integer::BooleanBlock;
use tfhe_csprng::seeders::Seed;

Expand All @@ -24,16 +30,28 @@ impl FheBool {
/// let dec_result: bool = ct_res.decrypt(&client_key);
/// ```
pub fn generate_oblivious_pseudo_random(seed: Seed) -> Self {
global_state::with_internal_keys(|key| match key {
let (ciphertext, tag) = global_state::with_internal_keys(|key| match key {
InternalServerKey::Cpu(key) => {
let ct = key.pbs_key().key.generate_oblivious_pseudo_random(seed, 1);

Self::new(BooleanBlock(ct), key.tag.clone())
(
InnerBoolean::Cpu(BooleanBlock::new_unchecked(ct)),
key.tag.clone(),
)
}
#[cfg(feature = "gpu")]
InternalServerKey::Cuda(_) => {
todo!("Cuda devices do not yet support oblivious pseudo random generation")
}
})
InternalServerKey::Cuda(cuda_key) => with_thread_local_cuda_streams(|streams| {
let d_ct: CudaUnsignedRadixCiphertext = cuda_key
.key
.key
.generate_oblivious_pseudo_random(seed, 1, streams);
(
InnerBoolean::Cuda(CudaBooleanBlock::from_cuda_radix_ciphertext(
d_ct.ciphertext,
)),
cuda_key.tag.clone(),
)
}),
});
Self::new(ciphertext, tag)
}
}
Loading

0 comments on commit 5d54d65

Please sign in to comment.