
Added preliminary on-disk diffusion implementation (#5)
carljohnsen committed Mar 7, 2024
1 parent 87b9da0 commit 212a70a
Showing 2 changed files with 203 additions and 11 deletions.
201 changes: 190 additions & 11 deletions src/lib/cpp/cpu/diffusion.cc
@@ -1,16 +1,21 @@
 #include "diffusion.hh"
 
+#include <fcntl.h>
+#include <iostream>
+#include <unistd.h>
+#include <filesystem>
+
 namespace cpu_par {
 
-    void diffusion_core(const float *__restrict__ input, const float *__restrict__ kernel, float *__restrict__ output, const shape_t &N, const int64_t dim, const int64_t radius, const int64_t padding) {
+    void diffusion_core(const float *__restrict__ input, const float *__restrict__ kernel, float *__restrict__ output, const shape_t &N, const int64_t dim, const int64_t radius) {
         #pragma omp parallel for collapse(3)
-        for (int64_t i = 0; i < N.z+padding; i++) {
-            for (int64_t j = 0; j < N.y+padding; j++) {
-                for (int64_t k = 0; k < N.x+padding; k++) {
+        for (int64_t i = 0; i < N.z; i++) {
+            for (int64_t j = 0; j < N.y; j++) {
+                for (int64_t k = 0; k < N.x; k++) {
                     const int64_t
                         X[3] = {i, j, k},
-                        stride[3] = {(N.y+padding)*(N.x+padding), N.x+padding, 1},
-                        Ns[3] = {N.z+padding, N.y+padding, N.x+padding},
+                        stride[3] = {N.y*N.x, N.x, 1},
+                        Ns[3] = {N.z, N.y, N.x},
                         ranges[2] = {
                             -std::min(radius, X[dim]), std::min(radius, Ns[dim]-X[dim]-1)
                         },
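The rewritten diffusion_core clamps the 1D kernel window at the volume borders through `ranges` instead of relying on padded extents. A minimal standalone sketch of the per-voxel pass along one axis — the names `convolve_1d_at`, `pos`, and `len` are illustrative and not from this commit:

    #include <algorithm>
    #include <cstdint>

    // One output sample of a border-clamped 1D convolution along an axis with
    // the given stride; `kernel[radius + r]` indexes a kernel centered at `radius`.
    float convolve_1d_at(const float *in, const float *kernel, int64_t flat_idx,
                         int64_t pos, int64_t len, int64_t stride, int64_t radius) {
        const int64_t lo = -std::min(radius, pos),           // clamp at lower border
                      hi =  std::min(radius, len - pos - 1); // clamp at upper border
        float sum = 0.0f;
        for (int64_t r = lo; r <= hi; r++) {
            sum += in[flat_idx + r*stride] * kernel[radius + r];
        }
        return sum;
    }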
@@ -32,20 +37,109 @@ namespace cpu_par {
         }
     }
 
+    // padding is padding*ny*nx - i.e. number of padding layers in flat size
+    template <typename T>
+    void load_partial(FILE *f, T *__restrict__ buffer, const int64_t offset, const int64_t size, const int64_t total_size, const int64_t padding) {
+        int64_t
+            disk_begin = std::max((int64_t) 0, offset-padding),
+            disk_end = std::min(offset+size+padding, total_size),
+            disk_size = disk_end - disk_begin,
+            buffer_begin = disk_begin < padding ? 0 : padding;
+
+        fseek(f, disk_begin*sizeof(T), SEEK_SET);
+        fread(buffer+buffer_begin, sizeof(T), disk_size, f);
+    }
+
+    template <typename T>
+    void store_partial(FILE *f, const T *__restrict__ buffer, const int64_t offset, const int64_t size, const int64_t padding) {
+        fseek(f, offset*sizeof(T), SEEK_SET);
+        fwrite(buffer+padding, sizeof(T), size, f);
+    }
+
+    FILE* open_file_read(const std::string &path) {
+        FILE *file = fopen(path.c_str(), "rb");
+        if (file == NULL) {
+            throw std::runtime_error("Could not open file: " + path);
+        }
+        return file;
+    }
+
+    FILE* open_file_write(const std::string &path, const int64_t size) {
+        FILE *file = fopen(path.c_str(), "wb");
+        if (file == NULL) {
+            throw std::runtime_error("Could not open file: " + path);
+        }
+        ftruncate(fileno(file), size);
+        return file;
+    }
+
+    FILE* open_file_read_write(const std::string &path, const int64_t size) {
+        FILE *file = fopen(path.c_str(), "w+b");
+        if (file == NULL) {
+            throw std::runtime_error("Could not open file: " + path);
+        }
+        ftruncate(fileno(file), size);
+        return file;
+    }
+
+    void convert_float_to_uint8(const float *__restrict__ src, uint8_t *__restrict__ dst, const int64_t total_flat_size) {
+        #pragma omp parallel for
+        for (int64_t i = 0; i < total_flat_size; i++) {
+            dst[i] = (uint8_t) std::floor(src[i] * 255.0f);
+        }
+    }
+
+    void convert_float_to_uint8(const std::string &src, const std::string &dst, const int64_t total_flat_size) {
+        constexpr int64_t
+            disk_block_size = 4096,
+            chunk_size = 2048*disk_block_size;
+        FILE *file_src = open_file_read(src);
+        FILE *file_dst = open_file_write(dst, total_flat_size*sizeof(uint8_t));
+        float *buffer_src = (float *) aligned_alloc(disk_block_size, chunk_size*sizeof(float));
+        uint8_t *buffer_dst = (uint8_t *) aligned_alloc(disk_block_size, chunk_size*sizeof(uint8_t));
+
+        for (int64_t chunk = 0; chunk < total_flat_size; chunk += chunk_size) {
+            int64_t size = std::min(chunk_size, total_flat_size - chunk);
+            load_partial(file_src, buffer_src, chunk, size, total_flat_size, 0);
+            convert_float_to_uint8(buffer_src, buffer_dst, size);
+            store_partial(file_dst, buffer_dst, chunk, size, 0);
+        }
+
+        free(buffer_dst);
+        free(buffer_src);
+        fclose(file_dst);
+        fclose(file_src);
+    }
+
+    void convert_uint8_to_float(const uint8_t *__restrict__ src, float *__restrict__ dst, const int64_t total_flat_size) {
+        #pragma omp parallel for
+        for (int64_t i = 0; i < total_flat_size; i++) {
+            dst[i] = src[i] > 0 ? 1.0f : 0.0f;
+        }
+    }
+
+    void convert_uint8_to_float(const std::string &src, const std::string &dst, const int64_t total_flat_size) {
+        constexpr int64_t
+            disk_block_size = 4096,
+            chunk_size = 2048*disk_block_size;
+        FILE *file_src = open_file_read(src);
+        FILE *file_dst = open_file_write(dst, total_flat_size*sizeof(float));
+        uint8_t *buffer_src = (uint8_t *) aligned_alloc(disk_block_size, chunk_size*sizeof(uint8_t));
+        float *buffer_dst = (float *) aligned_alloc(disk_block_size, chunk_size*sizeof(float));
+
+        for (int64_t chunk = 0; chunk < total_flat_size; chunk += chunk_size) {
+            int64_t size = std::min(chunk_size, total_flat_size - chunk);
+            load_partial(file_src, buffer_src, chunk, size, total_flat_size, 0);
+            convert_uint8_to_float(buffer_src, buffer_dst, size);
+            store_partial(file_dst, buffer_dst, chunk, size, 0);
+        }
+
+        free(buffer_dst);
+        free(buffer_src);
+        fclose(file_dst);
+        fclose(file_src);
+    }
+
     void illuminate(const uint8_t *__restrict__ mask, float *__restrict__ output, const int64_t local_flat_size) {
         #pragma omp parallel for
         for (int64_t i = 0; i < local_flat_size; i++) {
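The index arithmetic in load_partial above is easiest to check with small numbers. A worked sketch with hypothetical values: with `padding = 2`, `size = 4`, and `total_size = 10`, the first block has no lower padding available on disk, so its data lands at the start of the buffer, while an interior block lands at `buffer + padding`:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t padding = 2, size = 4, total_size = 10;
        for (int64_t offset : {int64_t(0), int64_t(4)}) {  // first block, then an interior block
            const int64_t
                disk_begin   = std::max((int64_t) 0, offset - padding),
                disk_end     = std::min(offset + size + padding, total_size),
                buffer_begin = disk_begin < padding ? 0 : padding;
            // Prints: offset=0: disk=[0,6) buffer_begin=0
            //         offset=4: disk=[2,10) buffer_begin=2
            printf("offset=%lld: disk=[%lld,%lld) buffer_begin=%lld\n",
                   (long long) offset, (long long) disk_begin,
                   (long long) disk_end, (long long) buffer_begin);
        }
        return 0;
    }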
@@ -55,6 +149,21 @@
         }
     }
 
+    void diffusion_step(const uint8_t *__restrict__ voxels, float **buffers, const shape_t &N, const float *__restrict__ kernel, const int64_t radius) {
+        for (int64_t dim = 0; dim < 3; dim++) {
+            diffusion_core(buffers[0], kernel, buffers[1], N, dim, radius);
+            std::swap(buffers[0], buffers[1]);
+        }
+        illuminate(voxels, buffers[0], N.z*N.y*N.x);
+    }
+
+    void store_mask(const float *__restrict__ input, uint8_t *__restrict__ mask, const int64_t local_flat_size) {
+        #pragma omp parallel for
+        for (int64_t i = 0; i < local_flat_size; i++) {
+            mask[i] = input[i] == 1.0f ? 1 : 0; // The implant will always be 1.0f
+        }
+    }
+
     void diffusion_in_memory(const uint8_t *__restrict__ voxels, const shape_t &N, const float *__restrict__ kernel, const int64_t kernel_size, const int64_t repititions, uint8_t *__restrict__ output) {
         const int64_t
             total_size = N.z*N.y*N.x,
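For reference, a hedged usage sketch of this in-memory entry point. The volume size, seed voxel, and kernel values are made up; `shape_t` and `cpu_par::diffusion_in_memory` come from this repository's diffusion.hh:

    #include <cstdint>
    #include <vector>
    #include "diffusion.hh"

    int main() {
        const shape_t N = {64, 64, 64};                          // hypothetical volume
        std::vector<uint8_t> voxels(64*64*64, 0), out(voxels.size());
        voxels[32*64*64 + 32*64 + 32] = 1;                       // a single seed voxel
        const std::vector<float> kernel = {0.25f, 0.5f, 0.25f};  // tiny gaussian, radius 1
        cpu_par::diffusion_in_memory(voxels.data(), N, kernel.data(),
                                     (int64_t) kernel.size(), 5, out.data());
        return 0;
    }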
@@ -65,11 +174,7 @@ namespace cpu_par {
 
         convert_uint8_to_float(voxels, buffers[0], total_size);
         for (int64_t rep = 0; rep < repititions; rep++) {
-            for (int64_t dim = 0; dim < 3; dim++) {
-                diffusion_core(buffers[0], kernel, buffers[1], N, dim, radius, 0);
-                std::swap(buffers[0], buffers[1]);
-            }
-            illuminate(voxels, buffers[0], total_size);
+            diffusion_step(voxels, buffers, N, kernel, radius);
         }
         convert_float_to_uint8(buffers[0], output, total_size);

@@ -78,4 +183,78 @@ namespace cpu_par {
         delete[] buffers;
     }
 
+    void diffusion_on_disk(const std::string &input_file, const float *__restrict__ kernel, const int64_t kernel_size, const std::string &output_file, const shape_t &total_shape, const shape_t &global_shape, const int64_t repititions, const bool verbose) {
+        std::string
+            temp_folder = "/tmp/maxibone",
+            temp0 = temp_folder + "/diffusion-temp0.float32",
+            temp1 = temp_folder + "/diffusion-temp1.float32";
+
+        // Create temp folder
+        std::filesystem::create_directories(temp_folder);
+
+        // Compute the number of global blocks
+        const int64_t
+            radius = kernel_size / 2,
+            padding = kernel_size - 1, // kernel is assumed to be odd
+            disk_block_size = 4096, // TODO Generate a header file during configuration
+            total_flat_size = total_shape.z * total_shape.y * total_shape.x,
+            global_blocks = (int64_t) std::ceil((float)total_shape.z / (float)global_shape.z),
+            global_flat_size = global_shape.z * global_shape.y * global_shape.x,
+            global_flat_size_padded = (global_shape.z+padding) * global_shape.y * global_shape.x,
+            layer_flat_size = global_shape.y * global_shape.x,
+            disk_total_flat_size = ((total_flat_size*sizeof(float) / disk_block_size) + (total_flat_size*sizeof(float) % disk_block_size == 0 ? 0 : 1)) * disk_block_size,
+            disk_global_flat_size = ((global_flat_size_padded*sizeof(float) / disk_block_size) + (global_flat_size_padded*sizeof(float) % disk_block_size == 0 ? 0 : 1)) * disk_block_size,
+            disk_mask_flat_size = ((global_flat_size_padded*sizeof(bool) / disk_block_size) + (global_flat_size_padded*sizeof(bool) % disk_block_size == 0 ? 0 : 1)) * disk_block_size;
+
+        const shape_t global_shape_padded = {global_shape.z+padding, global_shape.y, global_shape.x};
+
+        if (verbose) {
+            // Print the number of blocks
+            std::cout << "Global blocks: " << global_blocks << std::endl;
+        }
+
+        // Allocate memory. Aligned to block_size, and should overallocate to ensure alignment.
+        // TODO since the I/O functions handle alignment with buffers, the allocations don't need to be aligned. Although, it might make sense to do so anyway, since this can avoid the need for a buffer. However, checking this is complicated, and is left for later.
+        float **buffers = new float*[2];
+        buffers[0] = (float *) aligned_alloc(disk_block_size, disk_global_flat_size);
+        buffers[1] = (float *) aligned_alloc(disk_block_size, disk_global_flat_size);
+        uint8_t *mask = (uint8_t *) aligned_alloc(disk_block_size, disk_mask_flat_size);
+
+        FILE
+            *tmp0 = open_file_read_write(temp0, disk_total_flat_size),
+            *tmp1 = open_file_read_write(temp1, disk_total_flat_size);
+
+        // Convert to float
+        convert_uint8_to_float(input_file, temp0, total_flat_size);
+
+        for (int64_t rep = 0; rep < repititions; rep++) {
+            for (int64_t global_block = 0; global_block < global_blocks; global_block++) {
+                int64_t this_block_size = std::min(global_shape.z, total_shape.z - global_block*global_shape.z) * layer_flat_size;
+
+                // Load the global block
+                load_partial(tmp0, buffers[0], global_block*layer_flat_size, this_block_size, total_flat_size, padding*layer_flat_size);
+
+                // Diffuse the global block
+                store_mask(buffers[0], mask, global_flat_size_padded);
+                diffusion_step(mask, buffers, global_shape_padded, kernel, radius);
+
+                // Store the global block
+                store_partial(tmp1, buffers[0], global_block*global_flat_size, this_block_size, padding*layer_flat_size);
+            }
+            std::swap(temp0, temp1);
+            std::swap(tmp0, tmp1);
+        }
+
+        // Convert to uint8
+        convert_float_to_uint8(output_file, temp0, total_flat_size);
+
+        // Free memory
+        free(buffers[0]);
+        free(buffers[1]);
+        delete[] buffers;
+        free(mask);
+        fclose(tmp0);
+        fclose(tmp1);
+    }
+
 }
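The three `disk_*_flat_size` expressions above all round a byte count up to a whole number of 4096-byte disk blocks, so file lengths match the aligned_alloc'd buffers. The same computation as a small helper — the name `round_up_to_block` is illustrative, not from this commit:

    #include <cstdint>

    constexpr int64_t round_up_to_block(int64_t bytes, int64_t block = 4096) {
        // ceil(bytes / block) * block, written with integer arithmetic
        return ((bytes / block) + (bytes % block == 0 ? 0 : 1)) * block;
    }

    static_assert(round_up_to_block(4096) == 4096, "exact multiple is unchanged");
    static_assert(round_up_to_block(4097) == 8192, "partial block rounds up");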
13 changes: 13 additions & 0 deletions src/pybind/diffusion-pybind.cc
@@ -18,10 +18,23 @@ namespace python_api {
         NS::diffusion_in_memory(voxels, N, kernel, kernel_size, repititions, output);
     }
 
+    void diffusion_on_disk(const std::string &input_file, const np_array<float> &np_kernel, const std::string &output_file, const std::tuple<int64_t, int64_t, int64_t> &py_total_shape, const std::tuple<int64_t, int64_t, int64_t> &py_global_shape, const int64_t repititions, const bool verbose = false) {
+        shape_t
+            total_shape = {std::get<0>(py_total_shape), std::get<1>(py_total_shape), std::get<2>(py_total_shape)},
+            global_shape = {std::get<0>(py_global_shape), std::get<1>(py_global_shape), std::get<2>(py_global_shape)};
+
+        auto kernel_info = np_kernel.request();
+        const float *kernel = static_cast<const float*>(kernel_info.ptr);
+        const int64_t kernel_size = kernel_info.shape[0];
+
+        NS::diffusion_on_disk(input_file, kernel, kernel_size, output_file, total_shape, global_shape, repititions, verbose);
+    }
+
 }
 
 PYBIND11_MODULE(diffusion, m) {
     m.doc() = "Diffusion approximation using 3 1D gauss convolutions."; // optional module docstring
 
     m.def("diffusion", &python_api::diffusion_in_memory, py::arg("np_voxels"), py::arg("np_kernel"), py::arg("np_output").noconvert(), py::arg("repititions") = 1);
+    m.def("diffusion", &python_api::diffusion_on_disk, py::arg("input_file"), py::arg("np_kernel"), py::arg("output_file"), py::arg("total_shape"), py::arg("global_shape"), py::arg("repititions") = 1, py::arg("verbose") = false);
 }
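Registering two m.def("diffusion", ...) bindings with the same name is pybind11's standard overloading mechanism: a call tries each overload in registration order until one accepts the argument types, so NumPy-array arguments reach diffusion_in_memory while file-path strings reach diffusion_on_disk. A minimal self-contained illustration of the same pattern — module and function names here are made up:

    #include <pybind11/pybind11.h>
    #include <string>
    namespace py = pybind11;

    int scale(int x) { return x * 2; }
    std::string scale(const std::string &s) { return s + s; }

    PYBIND11_MODULE(example, m) {
        m.def("scale", py::overload_cast<int>(&scale));                 // tried first
        m.def("scale", py::overload_cast<const std::string &>(&scale)); // fallback
    }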
