try std bootstrap
BourgerieQuentin committed Dec 10, 2024
1 parent ad106f7 commit 7fc2663
Showing 2 changed files with 74 additions and 60 deletions.
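In substance, the commit moves the GPU runtime off the amortized PBS entry points of the CUDA backend and onto the standard ones, threading the decomposition level count (level_count) through the buffer plumbing so the scratch allocation can be sized for it. The renaming, read off the call sites in the hunks below rather than from the backend headers, is summarized in this sketch (argument lists abridged, names illustrative):

// Correspondence used throughout this commit:
//   scratch_cuda_programmable_bootstrap_amortized_64(stream, gpu_idx, &buf,
//       glwe_dim, poly_size, num_samples, alloc)
//     -> scratch_cuda_programmable_bootstrap_64(stream, gpu_idx, &buf,
//            glwe_dim, poly_size, level_count, num_samples, alloc)
//   cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(...)
//     -> cuda_programmable_bootstrap_lwe_ciphertext_vector_64(..., num_samples, 1, 1)
//   cleanup_cuda_programmable_bootstrap_amortized(stream, gpu_idx, &buf)
//     -> cleanup_cuda_programmable_bootstrap(stream, gpu_idx, &buf)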
74 changes: 42 additions & 32 deletions compilers/concrete-compiler/compiler/lib/Runtime/GPUDFG.cpp
@@ -117,22 +117,25 @@ struct Dependence;
  // is required.
  struct PBS_buffer {
  PBS_buffer(void *stream, uint32_t gpu_idx, uint32_t glwe_dimension,
- uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count)
+ uint32_t polynomial_size, uint32_t level_count,
+ uint32_t input_lwe_ciphertext_count)
  : max_pbs_buffer_samples(input_lwe_ciphertext_count),
- glwe_dim(glwe_dimension), poly_size(polynomial_size),
- gpu_stream(stream), gpu_index(gpu_idx) {
- scratch_cuda_programmable_bootstrap_amortized_64(
- gpu_stream, gpu_index, &pbs_buffer, glwe_dim, poly_size,
- max_pbs_buffer_samples, true);
+ glwe_dim(glwe_dimension), _level_count(level_count),
+ poly_size(polynomial_size), gpu_stream(stream), gpu_index(gpu_idx) {
+ scratch_cuda_programmable_bootstrap_64(gpu_stream, gpu_index, &pbs_buffer,
+ glwe_dim, poly_size, _level_count,
+ max_pbs_buffer_samples, true);
  }
  ~PBS_buffer() {
- cleanup_cuda_programmable_bootstrap_amortized(gpu_stream, gpu_index, &pbs_buffer);
+ cleanup_cuda_programmable_bootstrap(gpu_stream, gpu_index, &pbs_buffer);
  }
  int8_t *get_pbs_buffer(void *stream, uint32_t gpu_idx,
  uint32_t glwe_dimension, uint32_t polynomial_size,
+ uint32_t level_count,
  uint32_t input_lwe_ciphertext_count) {
  assert(glwe_dimension <= glwe_dim);
  assert(polynomial_size <= poly_size);
+ assert(level_count <= _level_count);
  assert(input_lwe_ciphertext_count <= max_pbs_buffer_samples);
  assert(stream == gpu_stream);
  assert(gpu_idx == gpu_index);
@@ -144,6 +147,7 @@ struct PBS_buffer {
  uint32_t max_pbs_buffer_samples;
  uint32_t glwe_dim;
  uint32_t poly_size;
+ uint32_t _level_count;
  void *gpu_stream;
  uint32_t gpu_index;
  };
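With the hunk above, the cached buffer also records the level count it was sized for, and the new assert mirrors the existing dimension checks: a buffer may only serve requests that fit within every parameter it was built with. A minimal sketch of that invariant (values are illustrative, not taken from the diff):

// Build a buffer for up to 128 samples at level_count = 2 ...
PBS_buffer cache(stream, /*gpu_idx=*/0, /*glwe_dimension=*/1,
                 /*polynomial_size=*/2048, /*level_count=*/2,
                 /*input_lwe_ciphertext_count=*/128);
// ... and reuse it for any request that fits (fewer samples, smaller level count):
int8_t *buf = cache.get_pbs_buffer(stream, /*gpu_idx=*/0, /*glwe_dimension=*/1,
                                   /*polynomial_size=*/2048, /*level_count=*/1,
                                   /*input_lwe_ciphertext_count=*/64);
// A request with level_count = 3 would trip the new assert; the caching layer in
// GPU_state::get_pbs_buffer (next hunk) instead deletes and rebuilds the buffer
// when any requested parameter no longer matches the cached one.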
@@ -163,20 +167,22 @@ struct GPU_state {
  cuda_destroy_stream((cudaStream_t)gpu_stream, gpu_idx);
  }
  inline int8_t *get_pbs_buffer(uint32_t glwe_dimension,
- uint32_t polynomial_size,
+ uint32_t polynomial_size, uint32_t level_count,
  uint32_t input_lwe_ciphertext_count) {
  if (pbs_buffer != nullptr && (pbs_buffer->glwe_dim != glwe_dimension ||
  pbs_buffer->poly_size != polynomial_size ||
+ pbs_buffer->_level_count != level_count ||
  pbs_buffer->get_max_pbs_buffer_samples() <
  input_lwe_ciphertext_count)) {
  delete pbs_buffer;
  pbs_buffer = nullptr;
  }
  if (pbs_buffer == nullptr)
  pbs_buffer = new PBS_buffer(get_gpu_stream(), gpu_idx, glwe_dimension,
- polynomial_size, input_lwe_ciphertext_count);
+ polynomial_size, level_count,
+ input_lwe_ciphertext_count);
  return pbs_buffer->get_pbs_buffer(get_gpu_stream(), gpu_idx, glwe_dimension,
- polynomial_size,
+ polynomial_size, level_count,
  input_lwe_ciphertext_count);
  }
  inline void *get_gpu_stream() {
@@ -216,16 +222,17 @@ struct GPU_DFG {
  to_free_list.clear();
  }
  inline int8_t *get_pbs_buffer(uint32_t glwe_dimension,
- uint32_t polynomial_size,
+ uint32_t polynomial_size, uint32_t level_count,
  uint32_t input_lwe_ciphertext_count) {
  if (pbs_buffer == nullptr) {
- int8_t *ret = gpus[gpu_idx].get_pbs_buffer(
- glwe_dimension, polynomial_size, input_lwe_ciphertext_count);
+ int8_t *ret =
+ gpus[gpu_idx].get_pbs_buffer(glwe_dimension, polynomial_size,
+ level_count, input_lwe_ciphertext_count);
  pbs_buffer = gpus[gpu_idx].pbs_buffer;
  return ret;
  }
  return pbs_buffer->get_pbs_buffer(gpu_stream, gpu_idx, glwe_dimension,
- polynomial_size,
+ polynomial_size, level_count,
  input_lwe_ciphertext_count);
  }
  inline void *get_gpu_stream(int32_t loc) {
@@ -422,8 +429,8 @@ struct Dependence {
  }
  inline void free_data(GPU_DFG *dfg, bool immediate = false) {
  if (device_data != nullptr) {
- cuda_drop_async(device_data,
- (cudaStream_t)dfg->get_gpu_stream(location), location);
+ cuda_drop_async(device_data, (cudaStream_t)dfg->get_gpu_stream(location),
+ location);
  }
  if (onHostReady && host_data.allocated != nullptr && hostAllocated) {
  // As streams are not synchronized aside from the GET operation,
@@ -1080,16 +1087,18 @@ void memref_keyswitch_lwe_u64_process(Process *p, int32_t loc, int32_t chunk_id,
  void *ksk_gpu = p->ctx.val->get_ksk_gpu(
  p->level.val, p->input_lwe_dim.val, p->output_lwe_dim.val, loc, s,
  p->sk_index.val);
- // Initialize indexes
- uint64_t *indexes = (uint64_t*) malloc(num_samples * sizeof(uint64_t));
+ // Initialize indexes
+ uint64_t *indexes = (uint64_t *)malloc(num_samples * sizeof(uint64_t));
  for (uint32_t i = 0; i < num_samples; i++) {
- indexes[i] = i;
+ indexes[i] = i;
  }
- void *indexes_gpu = alloc_and_memcpy_async_to_gpu(indexes, 0, num_samples, loc, s);
+ void *indexes_gpu =
+ alloc_and_memcpy_async_to_gpu(indexes, 0, num_samples, loc, s);

  cuda_keyswitch_lwe_ciphertext_vector_64(
- s, loc, out_gpu, indexes_gpu, ct0_gpu, indexes_gpu, ksk_gpu, p->input_lwe_dim.val,
- p->output_lwe_dim.val, p->base_log.val, p->level.val, num_samples);
+ s, loc, out_gpu, indexes_gpu, ct0_gpu, indexes_gpu, ksk_gpu,
+ p->input_lwe_dim.val, p->output_lwe_dim.val, p->base_log.val,
+ p->level.val, num_samples);
  cuda_drop_async(indexes_gpu, s, loc);
  Dependence *dep =
  new Dependence(loc, out, out_gpu, false, false, d->chunk_id);
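Both reworked kernel calls (the keyswitch above and the bootstrap in the next hunk) now take explicit per-sample index arrays for outputs and inputs; this commit simply passes the identity mapping for both sides. A condensed sketch of that pattern, reusing the names from the hunk above (s, loc, num_samples, out_gpu, ct0_gpu, ksk_gpu and the p-> parameters are assumed in scope):

// Identity indexes: sample i of the output comes from sample i of the input.
uint64_t *indexes = (uint64_t *)malloc(num_samples * sizeof(uint64_t));
for (uint32_t i = 0; i < num_samples; i++)
  indexes[i] = i;
// Upload once, pass the same device array as both output and input indexes.
void *indexes_gpu = alloc_and_memcpy_async_to_gpu(indexes, 0, num_samples, loc, s);
cuda_keyswitch_lwe_ciphertext_vector_64(
    s, loc, out_gpu, /*output indexes*/ indexes_gpu, ct0_gpu,
    /*input indexes*/ indexes_gpu, ksk_gpu, p->input_lwe_dim.val,
    p->output_lwe_dim.val, p->base_log.val, p->level.val, num_samples);
// Release the device-side index array once the kernel has been queued.
cuda_drop_async(indexes_gpu, s, loc);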
@@ -1188,23 +1197,25 @@ void memref_bootstrap_lwe_u64_process(Process *p, int32_t loc, int32_t chunk_id,
  cuda_memcpy_async_to_gpu(test_vector_idxes_gpu, (void *)test_vector_idxes,
  test_vector_idxes_size, s, loc);
  // Initialize indexes
- uint64_t *indexes = (uint64_t*) malloc(num_samples * sizeof(uint64_t));
+ uint64_t *indexes = (uint64_t *)malloc(num_samples * sizeof(uint64_t));
  for (uint32_t i = 0; i < num_samples; i++) {
- indexes[i] = i;
+ indexes[i] = i;
  }
- void *indexes_gpu = alloc_and_memcpy_async_to_gpu(indexes, 0, num_samples, loc, s);
+ void *indexes_gpu =
+ alloc_and_memcpy_async_to_gpu(indexes, 0, num_samples, loc, s);

  int8_t *pbs_buffer = p->dfg->gpus[loc].get_pbs_buffer(
- p->glwe_dim.val, p->poly_size.val, num_samples);
+ p->glwe_dim.val, p->poly_size.val, p->level.val, num_samples);
  void *ct0_gpu = d0->device_data;
  void *out_gpu = cuda_malloc_async(data_size, s, loc);
  void *fbsk_gpu = p->ctx.val->get_bsk_gpu(
  p->input_lwe_dim.val, p->poly_size.val, p->level.val, p->glwe_dim.val,
  loc, s, p->sk_index.val);
- cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
- s, loc, out_gpu, indexes_gpu, glwe_ct_gpu, test_vector_idxes_gpu, ct0_gpu, indexes_gpu,
- fbsk_gpu, (int8_t *)pbs_buffer, p->input_lwe_dim.val, p->glwe_dim.val,
- p->poly_size.val, p->base_log.val, p->level.val, num_samples);
+ cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
+ s, loc, out_gpu, indexes_gpu, glwe_ct_gpu, test_vector_idxes_gpu,
+ ct0_gpu, indexes_gpu, fbsk_gpu, (int8_t *)pbs_buffer,
+ p->input_lwe_dim.val, p->glwe_dim.val, p->poly_size.val,
+ p->base_log.val, p->level.val, num_samples, 1, 1);
  cuda_drop_async(test_vector_idxes_gpu, s, loc);
  cuda_drop_async(glwe_ct_gpu, s, loc);
  cuda_drop_async(indexes_gpu, s, loc);
@@ -1442,8 +1453,7 @@ void memref_negate_lwe_ciphertext_u64_process(Process *p, int32_t loc,
  Dependence *idep0 = p->input_streams[0]->get(loc, chunk_id);
  if (p->output_streams[0]->need_new_gen(chunk_id))
  p->output_streams[0]->put(
- sched(idep0, (cudaStream_t)p->dfg->get_gpu_stream(loc), loc),
- chunk_id);
+ sched(idep0, (cudaStream_t)p->dfg->get_gpu_stream(loc), loc), chunk_id);
  }

  } // namespace
60 changes: 32 additions & 28 deletions compilers/concrete-compiler/compiler/lib/Runtime/wrappers.cpp
@@ -134,17 +134,19 @@ void memref_batched_keyswitch_lwe_cuda_u64(
  void *ct0_gpu = alloc_and_memcpy_async_to_gpu(
  ct0_aligned, ct0_offset, ct0_batch_size, gpu_idx, (cudaStream_t)stream);
  // Initialize indexes
- uint64_t *indexes = (uint64_t*) malloc(num_samples * sizeof(uint64_t));
+ uint64_t *indexes = (uint64_t *)malloc(num_samples * sizeof(uint64_t));
  for (uint32_t i = 0; i < num_samples; i++) {
  indexes[i] = i;
  }
- void *indexes_gpu = alloc_and_memcpy_async_to_gpu(indexes, 0, num_samples * sizeof(uint64_t), gpu_idx, (cudaStream_t)stream);
+ void *indexes_gpu =
+ alloc_and_memcpy_async_to_gpu(indexes, 0, num_samples * sizeof(uint64_t),
+ gpu_idx, (cudaStream_t)stream);
  void *out_gpu = cuda_malloc_async(out_batch_size * sizeof(uint64_t),
  (cudaStream_t)stream, gpu_idx);
  // Run the keyswitch kernel on the GPU
  cuda_keyswitch_lwe_ciphertext_vector_64(
- stream, gpu_idx, out_gpu, indexes_gpu, ct0_gpu, indexes_gpu, ksk_gpu, input_lwe_dim, output_lwe_dim,
- base_log, level, num_samples);
+ stream, gpu_idx, out_gpu, indexes_gpu, ct0_gpu, indexes_gpu, ksk_gpu,
+ input_lwe_dim, output_lwe_dim, base_log, level, num_samples);
  // Copy the output batch of ciphertext back to CPU
  memcpy_async_to_cpu(out_aligned, out_offset, out_batch_size, out_gpu, gpu_idx,
  stream);
@@ -213,27 +215,28 @@ void memref_batched_bootstrap_lwe_cuda_u64(
  test_vector_idxes_size = num_samples * sizeof(uint64_t);
  void *test_vector_idxes = malloc(test_vector_idxes_size);
  memset(test_vector_idxes, 0, test_vector_idxes_size);
- void *test_vector_idxes_gpu = cuda_malloc_async(
- test_vector_idxes_size, (cudaStream_t)stream, gpu_idx);
+ void *test_vector_idxes_gpu =
+ cuda_malloc_async(test_vector_idxes_size, (cudaStream_t)stream, gpu_idx);
  cuda_memcpy_async_to_gpu(test_vector_idxes_gpu, test_vector_idxes,
  test_vector_idxes_size, (cudaStream_t)stream,
  gpu_idx);
  // Initialize indexes
- uint64_t *indexes = (uint64_t*) malloc(num_samples * sizeof(uint64_t));
+ uint64_t *indexes = (uint64_t *)malloc(num_samples * sizeof(uint64_t));
  for (uint32_t i = 0; i < num_samples; i++) {
  indexes[i] = i;
  }
- void *indexes_gpu = alloc_and_memcpy_async_to_gpu(indexes, 0, num_samples * sizeof(uint64_t), gpu_idx, (cudaStream_t)stream);
+ void *indexes_gpu =
+ alloc_and_memcpy_async_to_gpu(indexes, 0, num_samples * sizeof(uint64_t),
+ gpu_idx, (cudaStream_t)stream);
  // Allocate PBS buffer on GPU
- scratch_cuda_programmable_bootstrap_amortized_64(
- stream, gpu_idx, &pbs_buffer, glwe_dim, poly_size, num_samples,
- true);
+ scratch_cuda_programmable_bootstrap_64(stream, gpu_idx, &pbs_buffer, glwe_dim,
+ poly_size, level, num_samples, true);
  // Run the bootstrap kernel on the GPU
- cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
- stream, gpu_idx, out_gpu, indexes_gpu, glwe_ct_gpu, test_vector_idxes_gpu, ct0_gpu, indexes_gpu,
- fbsk_gpu, pbs_buffer, input_lwe_dim, glwe_dim, poly_size, base_log, level,
- num_samples);
- cleanup_cuda_programmable_bootstrap_amortized(stream, gpu_idx, &pbs_buffer);
+ cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
+ stream, gpu_idx, out_gpu, indexes_gpu, glwe_ct_gpu, test_vector_idxes_gpu,
+ ct0_gpu, indexes_gpu, fbsk_gpu, pbs_buffer, input_lwe_dim, glwe_dim,
+ poly_size, base_log, level, num_samples, 1, 1);
+ cleanup_cuda_programmable_bootstrap(stream, gpu_idx, &pbs_buffer);
  // Copy the output batch of ciphertext back to CPU
  memcpy_async_to_cpu(out_aligned, out_offset, out_batch_size, out_gpu, gpu_idx,
  stream);
@@ -318,28 +321,29 @@ void memref_batched_mapped_bootstrap_lwe_cuda_u64(
  for (size_t i = 0; i < num_lut_vectors; ++i)
  test_vector_idxes[i] = i;
  }
- void *test_vector_idxes_gpu = cuda_malloc_async(
- test_vector_idxes_size, (cudaStream_t)stream, gpu_idx);
+ void *test_vector_idxes_gpu =
+ cuda_malloc_async(test_vector_idxes_size, (cudaStream_t)stream, gpu_idx);
  cuda_memcpy_async_to_gpu(test_vector_idxes_gpu, (void *)test_vector_idxes,
  test_vector_idxes_size, (cudaStream_t)stream,
  gpu_idx);
  // Initialize indexes
- uint64_t *indexes = (uint64_t*) malloc(num_samples * sizeof(uint64_t));
+ uint64_t *indexes = (uint64_t *)malloc(num_samples * sizeof(uint64_t));
  for (uint32_t i = 0; i < num_samples; i++) {
  indexes[i] = i;
  }
- void *indexes_gpu = alloc_and_memcpy_async_to_gpu(indexes, 0, num_samples * sizeof(uint64_t), gpu_idx, (cudaStream_t)stream);
+ void *indexes_gpu =
+ alloc_and_memcpy_async_to_gpu(indexes, 0, num_samples * sizeof(uint64_t),
+ gpu_idx, (cudaStream_t)stream);

  // Allocate PBS buffer on GPU
- scratch_cuda_programmable_bootstrap_amortized_64(
- stream, gpu_idx, &pbs_buffer, glwe_dim, poly_size, num_samples,
- true);
+ scratch_cuda_programmable_bootstrap_64(stream, gpu_idx, &pbs_buffer, glwe_dim,
+ poly_size, level, num_samples, true);
  // Run the bootstrap kernel on the GPU
- cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
- stream, gpu_idx, out_gpu, indexes_gpu, glwe_ct_gpu, test_vector_idxes_gpu, ct0_gpu, indexes_gpu,
- fbsk_gpu, pbs_buffer, input_lwe_dim, glwe_dim, poly_size, base_log, level,
- num_samples);
- cleanup_cuda_programmable_bootstrap_amortized(stream, gpu_idx, &pbs_buffer);
+ cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
+ stream, gpu_idx, out_gpu, indexes_gpu, glwe_ct_gpu, test_vector_idxes_gpu,
+ ct0_gpu, indexes_gpu, fbsk_gpu, pbs_buffer, input_lwe_dim, glwe_dim,
+ poly_size, base_log, level, num_samples, 1, 1);
+ cleanup_cuda_programmable_bootstrap(stream, gpu_idx, &pbs_buffer);
  // Copy the output batch of ciphertext back to CPU
  memcpy_async_to_cpu(out_aligned, out_offset, out_batch_size, out_gpu, gpu_idx,
  stream);
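Taken together, both batched wrappers above now drive the standard PBS with the same three-step shape as before: size and allocate a scratch buffer, run the kernel, release the buffer. A condensed sketch of that sequence with the renamed entry points (identifiers as in the hunks above; pbs_buffer is presumably the int8_t * scratch pointer the surrounding function already declares, and the trailing 1, 1 arguments are reproduced verbatim from the diff):

// 1. Allocate the PBS scratch buffer for these parameters.
scratch_cuda_programmable_bootstrap_64(stream, gpu_idx, &pbs_buffer, glwe_dim,
                                       poly_size, level, num_samples, true);
// 2. Run the standard (non-amortized) programmable bootstrap over the batch.
cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
    stream, gpu_idx, out_gpu, indexes_gpu, glwe_ct_gpu, test_vector_idxes_gpu,
    ct0_gpu, indexes_gpu, fbsk_gpu, pbs_buffer, input_lwe_dim, glwe_dim,
    poly_size, base_log, level, num_samples, 1, 1);
// 3. Free the scratch buffer once the kernel has been queued.
cleanup_cuda_programmable_bootstrap(stream, gpu_idx, &pbs_buffer);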
