From 045c1f9b03c527baa71cd0d662176c5f1f6e2c75 Mon Sep 17 00:00:00 2001 From: Beniel Thileepan Date: Thu, 16 Feb 2023 18:16:28 +0000 Subject: [PATCH] completed heat code. HW run tested --- FPGA/Xilinx/Batched/heat3D/SLR.cpp | 86 ++++ FPGA/Xilinx/Batched/heat3D/heat3D_app.cpp | 383 ++++++++++++++++++ FPGA/Xilinx/Batched/heat3D/heat3D_common.h | 3 +- FPGA/Xilinx/Batched/heat3D/heat3D_cpu.cpp | 4 +- FPGA/Xilinx/Batched/heat3D/heat3D_cpu.h | 4 +- FPGA/Xilinx/Batched/heat3D/mem2stream.cpp | 134 ++++++ FPGA/Xilinx/Batched/heat3D/stencil.cpp | 238 +++++++++++ FPGA/Xilinx/Batched/heat3D/stencil.h | 92 +++++ FPGA/Xilinx/Batched/heat3D/stencil_3_SLR.cfg | 22 + .../Batched/heat3D/stencil_single_SLR.cfg | 9 + FPGA/Xilinx/Batched/heat3D/xcl2.cpp | 114 ++++++ FPGA/Xilinx/Batched/heat3D/xcl2.hpp | 105 +++++ 12 files changed, 1189 insertions(+), 5 deletions(-) create mode 100644 FPGA/Xilinx/Batched/heat3D/SLR.cpp create mode 100644 FPGA/Xilinx/Batched/heat3D/heat3D_app.cpp create mode 100644 FPGA/Xilinx/Batched/heat3D/mem2stream.cpp create mode 100644 FPGA/Xilinx/Batched/heat3D/stencil.cpp create mode 100644 FPGA/Xilinx/Batched/heat3D/stencil.h create mode 100644 FPGA/Xilinx/Batched/heat3D/stencil_3_SLR.cfg create mode 100644 FPGA/Xilinx/Batched/heat3D/stencil_single_SLR.cfg create mode 100644 FPGA/Xilinx/Batched/heat3D/xcl2.cpp create mode 100644 FPGA/Xilinx/Batched/heat3D/xcl2.hpp diff --git a/FPGA/Xilinx/Batched/heat3D/SLR.cpp b/FPGA/Xilinx/Batched/heat3D/SLR.cpp new file mode 100644 index 0000000..a8ca89a --- /dev/null +++ b/FPGA/Xilinx/Batched/heat3D/SLR.cpp @@ -0,0 +1,86 @@ +#include +#include +#include +#include +#include "stencil.h" +#include "stencil.cpp" + +void process_SLR(hls::stream &in, hls::stream &out, const int xdim0, const unsigned short size_x, + const unsigned int size_y, const unsigned int size_z, const unsigned short batches, const float calcParam_K) +{ + hls::stream streamArray[SLR_P_STAGE + 1]; +#pragma HLS STREAM variable = streamArray depth = 10 + + data_G 
data_g; + data_g.sizex = size_x; + data_g.sizey = size_y; + data_g.sizez = size_z; + data_g.offset_x = 0; + data_g.grid_size_x = xdim0; + data_g.xblocks = (data_g.grid_size_x >> SHIFT_BITS); + data_g.offset_y = 0; + data_g.grid_size_y = size_y + 2; + data_g.offset_z = 0; + data_g.grid_size_z = size_z + 2; + data_g.batches = batches; + data_g.limit_z = size_z + 3; + + unsigned short tile_y_1 = data_g.grid_size_y - 1; + unsigned int plane_size = data_g.xblocks * data_g.grid_size_y; + + data_g.plane_diff = data_g.xblocks * tile_y_1; + data_g.line_diff = data_g.xblocks - 1; + data_g.gridsize_pr = plane_size * register_it(data_g.grid_size_z * batches + 1); + data_g.gridsize_da = register_it(plane_size * data_g.grid_size_z) * batches; + + const float coefficients[7] = {calcParam_K, calcParam_K, calcParam_K, 1-6*calcParam_K, calcParam_K, calcParam_K, calcParam_K}; +#pragma HLS ARRAY_PARTITION variable=coefficients complete dim=1 + +#pragma HLS DATAFLOW + { + axis2_fifo256(in, streamArray[0], data_g.gridsize_da); + + for (int i = 0; i < SLR_P_STAGE; i++) + { +#pragma HLS unroll + process_grid(streamArray[i], streamArray[i+1], data_g, coefficients); + } + + fifo256_2axis(streamArray[SLR_P_STAGE], out, data_g.gridsize_da); + + } + +} + +extern "C" +{ + void stencil_SLR( + const int sizex, + const int sizey, + const int sizez, + const int xdim0, + const int batches, + const int count, + const float calcParam_K, + hls::stream &in, + hls::stream &out) + { +#pragma HLS INTERFACE axis port = in register +#pragma HLS INTERFACE axis port = out register + +#pragma HLS INTERFACE s_axilite port = sizex bundle = control +#pragma HLS INTERFACE s_axilite port = sizey bundle = control +#pragma HLS INTERFACE s_axilite port = sizez bundle = control +#pragma HLS INTERFACE s_axilite port = xdim0 bundle = control +#pragma HLS INTERFACE s_axilite port = batches bundle = control +#pragma HLS INTERFACE s_axilite port = count bundle = control +#pragma HLS INTERFACE s_axilite port = calcParam_K 
bundle = control +#pragma HLS INTERFACE s_axilite port = return bundle = control + + for (unsigned int i = 0; i < count * 2; i++) + { + process_SLR(in, out, xdim0, sizex, sizey, sizez, batches, calcParam_K); + } + + } +} diff --git a/FPGA/Xilinx/Batched/heat3D/heat3D_app.cpp b/FPGA/Xilinx/Batched/heat3D/heat3D_app.cpp new file mode 100644 index 0000000..cbe219d --- /dev/null +++ b/FPGA/Xilinx/Batched/heat3D/heat3D_app.cpp @@ -0,0 +1,383 @@ + +// standard headers +#include +#include +#include +#include +#include +#include "xcl2.hpp" +#include "heat3D_cpu.h" +#include "heat3D_common.h" + +//#define DEBUG_VERBOSE +#define VERIFICATION +#define MULTI_SLR +//#define FPGA_RUN_ONLY + +int main(int argc, char **argv) +{ + GridParameter gridData; + + gridData.logical_size_x = 100; + gridData.logical_size_y = 100; + gridData.logical_size_z = 100; + gridData.batch = 10; + gridData.num_iter = 1000; + + unsigned int vectorization_factor = 8; + + // setting grid parameters given by user + const char * pch; + + for ( int n = 1; n < argc; n++ ) + { + pch = strstr(argv[n], "-size="); + + if(pch != NULL) + { + gridData.logical_size_x = atoi ( pch + 6 ); continue; // "-size=" is 6 characters long; an offset of 7 dropped the first digit + } + + pch = strstr(argv[n], "-iters="); + + if(pch != NULL) + { + gridData.num_iter = atoi ( argv[n] + 7 ); continue; + } + pch = strstr(argv[n], "-batch="); + + if(pch != NULL) + { + gridData.batch = atoi ( argv[n] + 7 ); continue; + } + } + + printf("Grid: %dx1 , %d iterations, %d batches\n", gridData.logical_size_x, gridData.num_iter, gridData.batch); + + //adding halo + gridData.act_size_x = gridData.logical_size_x + 2; + gridData.act_size_y = gridData.logical_size_y + 2; + gridData.act_size_z = gridData.logical_size_z + 2; + + //padding each row as multiples of vectorization factor + gridData.grid_size_x = (gridData.act_size_x % vectorization_factor) != 0 ? 
+ (gridData.act_size_x/vectorization_factor + 1) * vectorization_factor : + gridData.act_size_x; + gridData.grid_size_y = gridData.act_size_y; + gridData.grid_size_z = gridData.act_size_z; + + //allocating memory buffer + unsigned int data_size_bytes = gridData.grid_size_x * gridData.grid_size_y + * gridData.grid_size_z * sizeof(float) * gridData.batch; + + if (data_size_bytes >= 4000000000) + { + std::cerr << "Maximum buffer size is exceeded!" << std::endl; + } + + heat3DParameter calcParam; + + + calcParam.alpha = 1.5/1000; //diffusivity + calcParam.h = 1.0f/gridData.act_size_x; //grid spacing; floating-point division, the integer form 1/act_size_x evaluates to 0 + calcParam.delta_t = 0.5; //0.5s + calcParam.K = calcParam.alpha * calcParam.delta_t / (calcParam.h * calcParam.h); + + float * grid_u1_cpu = (float*) aligned_alloc(4096, data_size_bytes); + float * grid_u2_cpu = (float*) aligned_alloc(4096, data_size_bytes); + + float * grid_u1_d = (float*) aligned_alloc(4096, data_size_bytes); + float * grid_u2_d = (float*) aligned_alloc(4096, data_size_bytes); + + + auto init_start_clk_point = std::chrono::high_resolution_clock::now(); + initialize_grid(grid_u1_cpu, gridData); + copy_grid(grid_u1_cpu, grid_u1_d, gridData); + auto init_stop_clk_point = std::chrono::high_resolution_clock::now(); + double runtime_init = std::chrono::duration (init_stop_clk_point - init_start_clk_point).count(); + copy_grid(grid_u1_cpu, grid_u2_cpu, gridData); + + +#ifdef DEBUG_VERBOSE + std::cout << std::endl; + std::cout << "*********************************************" << std::endl; + std::cout << "** intial grid values **" << std::endl; + std::cout << "*********************************************" << std::endl; + + for (unsigned int bat = 0; bat < gridData.batch; bat++) + { + int offset = bat * gridData.grid_size_x * gridData.grid_size_y * gridData.grid_size_z; // one batch spans x * y * z cells + + std::cout << "---------------------------------------------" << std::endl; + std::cout << " batch: " << bat << std::endl; + std::cout << "---------------------------------------------" << std::endl; + + for 
(unsigned int k = 0; k < gridData.grid_size_z; k++) + { + for (unsigned int j = 0; j < gridData.grid_size_y; j++) + { + for (unsigned int i = 0; i < gridData.grid_size_x; i++) + { + int index = offset + k * gridData.grid_size_x * gridData.grid_size_y + + j * gridData.grid_size_x + i; + std::cout << "grid_id: (" << i << ", " << j << ", " << k << ") initial_val: " + << grid_u1_cpu[index]<< std::endl; + } + } + } + } + std::cout << "=============================================" << std::endl << std::endl; +#endif + +#ifndef FPGA_RUN_ONLY + //golden stencil computation on the CPU + + std::vector calcParams(gridData.batch); + + for (unsigned int bat = 0; bat < gridData.batch; bat++) + { + calcParams[bat] = calcParam; + } + + auto naive_cpu_start_clk_point = std::chrono::high_resolution_clock::now(); + heat3D_explicit(grid_u1_cpu, grid_u2_cpu, gridData, calcParams); + auto naive_cpu_stop_clk_point = std::chrono::high_resolution_clock::now(); + double runtime_naive_cpu_stencil = std::chrono::duration (naive_cpu_stop_clk_point - naive_cpu_start_clk_point).count(); + +#endif + //OPENCL HOST CODE START + auto bindaryFile = argv[1]; + cl_int err; + + auto devices = xcl::get_xil_devices(); + auto device = devices[0]; + + OCL_CHECK(err, cl::Context context(device, NULL, NULL, NULL, &err)); + OCL_CHECK(err, cl::CommandQueue queue(context, CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err)); + OCL_CHECK(err, std::string device_name = device.getInfo(&err)); + + //Create Program and Kernel + auto fileBuf = xcl::read_binary_file(bindaryFile); + cl::Program::Binaries bins{{fileBuf.data(), fileBuf.size()}}; + + OCL_CHECK(err, cl::Program program(context, {device}, bins, NULL, &err)); + OCL_CHECK(err, cl::Kernel krnl_slr0(program, "stencil_SLR", &err)); + +#ifdef MULTI_SLR + OCL_CHECK(err, cl::Kernel krnl_slr1(program, "stencil_SLR", &err)); + OCL_CHECK(err, cl::Kernel krnl_slr2(program, "stencil_SLR", &err)); +#endif + OCL_CHECK(err, cl::Kernel 
krnl_mem2stream(program, "stencil_mem2stream", &err)); + + //Allocation Buffer in Global Memory; both buffers are read and written by stencil_mem2stream (ping-pong) + OCL_CHECK(err, cl::Buffer buff_curr(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, data_size_bytes, grid_u1_d, &err)); + OCL_CHECK(err, cl::Buffer buff_next(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, data_size_bytes, grid_u2_d, &err)); + +#ifdef MULTI_SLR + unsigned int total_SLR = 3; +#else + unsigned int total_SLR = 1; +#endif + + unsigned number_of_process_grid_per_SLR = NUM_OF_PROCESS_GRID_PER_SLR; + unsigned int total_process_grid_per_iter = total_SLR * number_of_process_grid_per_SLR * 2; + unsigned int num_iter = gridData.num_iter / total_process_grid_per_iter; + + //set Kernel arguments + + /* + * void stencil_SLR( + const int sizex, + const int sizey, + const int sizez, + const int xdim0, + const int batches, + const int count, + const float calcParam_K, + hls::stream &in, + hls::stream &out) + */ + int narg = 0; + OCL_CHECK(err, err = krnl_slr0.setArg(narg++, gridData.logical_size_x)); + OCL_CHECK(err, err = krnl_slr0.setArg(narg++, gridData.logical_size_y)); + OCL_CHECK(err, err = krnl_slr0.setArg(narg++, gridData.logical_size_z)); + OCL_CHECK(err, err = krnl_slr0.setArg(narg++, gridData.grid_size_x)); + OCL_CHECK(err, err = krnl_slr0.setArg(narg++, gridData.batch)); + OCL_CHECK(err, err = krnl_slr0.setArg(narg++, num_iter)); + OCL_CHECK(err, err = krnl_slr0.setArg(narg++, calcParam.K)); + +#ifdef MULTI_SLR + narg = 0; + OCL_CHECK(err, err = krnl_slr1.setArg(narg++, gridData.logical_size_x)); + OCL_CHECK(err, err = krnl_slr1.setArg(narg++, gridData.logical_size_y)); + OCL_CHECK(err, err = krnl_slr1.setArg(narg++, gridData.logical_size_z)); + OCL_CHECK(err, err = krnl_slr1.setArg(narg++, gridData.grid_size_x)); + OCL_CHECK(err, err = krnl_slr1.setArg(narg++, gridData.batch)); + OCL_CHECK(err, err = krnl_slr1.setArg(narg++, num_iter)); + OCL_CHECK(err, err = krnl_slr1.setArg(narg++, calcParam.K)); + + narg = 0; + OCL_CHECK(err, err = 
krnl_slr2.setArg(narg++, gridData.logical_size_x)); + OCL_CHECK(err, err = krnl_slr2.setArg(narg++, gridData.logical_size_y)); + OCL_CHECK(err, err = krnl_slr2.setArg(narg++, gridData.logical_size_z)); + OCL_CHECK(err, err = krnl_slr2.setArg(narg++, gridData.grid_size_x)); + OCL_CHECK(err, err = krnl_slr2.setArg(narg++, gridData.batch)); + OCL_CHECK(err, err = krnl_slr2.setArg(narg++, num_iter)); + OCL_CHECK(err, err = krnl_slr2.setArg(narg++, calcParam.K)); +#endif + + /* + * void stencil_mem2stream( + uint512_dt* arg0, + uint512_dt* arg1, + const int count, + const int xdim0, + const int ydim0, + const int zdim0, + const int batch, + hls::stream &in, + hls::stream &out) + */ + narg = 0; + OCL_CHECK(err, err = krnl_mem2stream.setArg(narg++, buff_curr)); + OCL_CHECK(err, err = krnl_mem2stream.setArg(narg++, buff_next)); + OCL_CHECK(err, err = krnl_mem2stream.setArg(narg++, num_iter)); + OCL_CHECK(err, err = krnl_mem2stream.setArg(narg++, gridData.grid_size_x)); + OCL_CHECK(err, err = krnl_mem2stream.setArg(narg++, gridData.grid_size_y)); + OCL_CHECK(err, err = krnl_mem2stream.setArg(narg++, gridData.grid_size_z)); + OCL_CHECK(err, err = krnl_mem2stream.setArg(narg++, gridData.batch)); + + //Copy input buffer to device + auto h_to_d_start_point = std::chrono::high_resolution_clock::now(); + OCL_CHECK(err, err = queue.enqueueMigrateMemObjects({buff_curr, buff_next}, 0)); + queue.finish(); + auto h_to_d_stop_kernels_start_point = std::chrono::high_resolution_clock::now(); +#ifdef MULTI_SLR + OCL_CHECK(err, err = queue.enqueueTask(krnl_slr2)); + OCL_CHECK(err, err = queue.enqueueTask(krnl_slr1)); +#endif + OCL_CHECK(err, err = queue.enqueueTask(krnl_slr0)); + OCL_CHECK(err, err = queue.enqueueTask(krnl_mem2stream)); + queue.finish(); + auto kernels_stop_d_to_h_start_point = std::chrono::high_resolution_clock::now(); + OCL_CHECK(err, err = queue.enqueueMigrateMemObjects({buff_curr}, CL_MIGRATE_MEM_OBJECT_HOST)); + queue.finish(); + auto d_to_h_stop_point = 
std::chrono::high_resolution_clock::now(); + + double h_to_d_runtime = std::chrono::duration + (h_to_d_stop_kernels_start_point - h_to_d_start_point).count(); + double kernels_runtime = std::chrono::duration + (kernels_stop_d_to_h_start_point - h_to_d_stop_kernels_start_point).count(); + double d_to_h_runtime = std::chrono::duration + (d_to_h_stop_point - kernels_stop_d_to_h_start_point).count(); + +#ifdef VERIFICATION + std::cout << std::endl; + std::cout << "*********************************************" << std::endl; + std::cout << "** Verification **" << std::endl; + std::cout << "*********************************************" << std::endl; + + for (unsigned int bat = 0; bat < gridData.batch; bat++) + { + int offset = bat * gridData.grid_size_x * gridData.grid_size_y * gridData.grid_size_z; // one batch spans x * y * z cells + + std::cout << "---------------------------------------------" << std::endl; + std::cout << " batch: " << bat << std::endl; + std::cout << "---------------------------------------------" << std::endl; + + bool passed = true; + + for (unsigned int k = 0; k < gridData.grid_size_z; k++) + { + for (unsigned int j = 0; j < gridData.grid_size_y; j++) + { + for (unsigned int i = 0; i < gridData.grid_size_x; i++) + { + int index = offset + k * gridData.grid_size_x * gridData.grid_size_y + + j * gridData.grid_size_x + i; + if (abs(grid_u1_cpu[index] - grid_u1_d[index]) > EPSILON) + { + std::cerr << "Value Mismatch index: (" << i << ", " << j << ", " << k << "), naive_cpu_val: " + << grid_u1_cpu[index] << ", and fpga_val: " << grid_u1_d[index] << std::endl; + passed = false; + } + } + } + } + + std::cout << "---------------------------------------------" << std::endl; + std::cout << " batch: " << bat << " "; + + if (passed) + std::cout << "Verification passed "; + else + std::cout << "Verification failed "; + + std::cout << std::endl; + std::cout << "---------------------------------------------" << std::endl; + + } + std::cout << "=============================================" << 
std::endl << std::endl; +#endif + +#ifdef DEBUG_VERBOSE + std::cout << std::endl; + std::cout << "*********************************************" << std::endl; + std::cout << "** Debug info after calculations **" << std::endl; + std::cout << "*********************************************" << std::endl; + + for (unsigned int bat = 0; bat < gridData.batch; bat++) + { + int offset = bat * gridData.grid_size_x * gridData.grid_size_y * gridData.grid_size_z; // one batch spans x * y * z cells + + std::cout << "---------------------------------------------" << std::endl; + std::cout << " batch: " << bat << std::endl; + std::cout << "---------------------------------------------" << std::endl; + + for (unsigned int k = 0; k < gridData.grid_size_z; k++) + { + for (unsigned int j = 0; j < gridData.grid_size_y; j++) + { + for (unsigned int i = 0; i < gridData.grid_size_x; i++) + { + int index = offset + k * gridData.grid_size_x * gridData.grid_size_y + + j * gridData.grid_size_x + i; + std::cout << "grid_id: (" << i << ", " << j << ", " << k << "), " +#ifndef FPGA_RUN_ONLY + << "golden_val: " << grid_u1_cpu[index] +#endif + << "fpga_explicit_val: " << grid_u1_d[index] << std::endl; + } + } + } + } + + std::cout << "=============================================" << std::endl << std::endl; +#endif + + std::cout << std::endl; + std::cout << "*********************************************" << std::endl; + std::cout << "** runtime summery **" << std::endl; + std::cout << "*********************************************" << std::endl; + +#ifndef FPGA_RUN_ONLY + + std::cout << " * naive stencil runtime : " << runtime_init + runtime_naive_cpu_stencil<< " us" << std::endl; + std::cout << " |--> grid_init time : " << runtime_init << " us" << std::endl; + std::cout << " |--> calc time : " << runtime_naive_cpu_stencil << " us" << std::endl; +#endif + std::cout << " * fpga runtime : " << runtime_init + h_to_d_runtime + + kernels_runtime + d_to_h_runtime << " us" << std::endl; + std::cout << " |--> grid_init time : " << 
runtime_init<< " us" << std::endl; + std::cout << " |--> h_to_d : " << h_to_d_runtime << " us" << std::endl; + std::cout << " |--> d_to_h : " << d_to_h_runtime << " us" << std::endl; + std::cout << " |--> kernels_runtime: " << kernels_runtime << " us" << std::endl; + std::cout << "=============================================" << std::endl << std::endl; + + free(grid_u1_cpu); + free(grid_u2_cpu); + free(grid_u1_d); + free(grid_u2_d); + + return 0; +} diff --git a/FPGA/Xilinx/Batched/heat3D/heat3D_common.h b/FPGA/Xilinx/Batched/heat3D/heat3D_common.h index 7fab8ef..4a4e6c6 100644 --- a/FPGA/Xilinx/Batched/heat3D/heat3D_common.h +++ b/FPGA/Xilinx/Batched/heat3D/heat3D_common.h @@ -6,6 +6,7 @@ #define EPSILON 0.0001 #define ERROR_TOL 10e-6 +#define NUM_OF_PROCESS_GRID_PER_SLR 8 struct GridParameter { @@ -31,4 +32,4 @@ struct heat3DParameter float alpha; //diffusivity float delta_t; float K; -}; \ No newline at end of file +}; diff --git a/FPGA/Xilinx/Batched/heat3D/heat3D_cpu.cpp b/FPGA/Xilinx/Batched/heat3D/heat3D_cpu.cpp index 56d13db..30dffeb 100644 --- a/FPGA/Xilinx/Batched/heat3D/heat3D_cpu.cpp +++ b/FPGA/Xilinx/Batched/heat3D/heat3D_cpu.cpp @@ -4,7 +4,7 @@ #include "heat3D_cpu.h" -int heat3D_explicit(float * current, float *next, GridParameter gridData, std::vector calcParam) +int heat3D_explicit(float * current, float *next, GridParameter gridData, std::vector & calcParam) { assert(calcParam.size() == gridData.batch); @@ -116,7 +116,7 @@ void initialize_grid(float* grid, GridParameter gridData) } } -int copy_grid(float* grid_s, float* grid_d, GridParameter gridData) +void copy_grid(float* grid_s, float* grid_d, GridParameter gridData) { for (unsigned int bat = 0; bat < gridData.batch; bat++) { diff --git a/FPGA/Xilinx/Batched/heat3D/heat3D_cpu.h b/FPGA/Xilinx/Batched/heat3D/heat3D_cpu.h index 1392dcc..ccb35c8 100644 --- a/FPGA/Xilinx/Batched/heat3D/heat3D_cpu.h +++ b/FPGA/Xilinx/Batched/heat3D/heat3D_cpu.h @@ -13,8 +13,8 @@ #include #include "heat3D_common.h" 
-int heat3D_explicit(float * current, float *next, GridParameter gridData, std::vector calcParam); +int heat3D_explicit(float * current, float *next, GridParameter gridData, std::vector & calcParam); void initialize_grid(float* grid, GridParameter gridData); -int copy_grid(float* grid_s, float* grid_d, GridParameter gridData); \ No newline at end of file +void copy_grid(float* grid_s, float* grid_d, GridParameter gridData); diff --git a/FPGA/Xilinx/Batched/heat3D/mem2stream.cpp b/FPGA/Xilinx/Batched/heat3D/mem2stream.cpp new file mode 100644 index 0000000..ded4a22 --- /dev/null +++ b/FPGA/Xilinx/Batched/heat3D/mem2stream.cpp @@ -0,0 +1,134 @@ +#include +#include +#include +#include +#include +#include "heat3D_common.h" +#include "stencil.h" +#include "stencil.cpp" + +// coalesced memory access at 512 bit to get maximum out of memory bandwidth +// Single pipelined loop below will be mapped to single memory transfer +// which will further split into multiple transfers by axim module. +static void read_grid(uint512_dt* arg0, hls::stream &rd_buffer, const unsigned int total_itr) +{ + for (int itr = 0; itr < total_itr; itr++){ + #pragma HLS PIPELINE II=1 + #pragma HLS loop_tripcount min=min_grid max=max_grid avg=avg_grid + rd_buffer << arg0[itr]; + } +} + +// data width conversion to support 256 bit width compute pipeline +static void stream_convert_512_256(hls::stream &in, hls::stream &out, + const unsigned int total_itr_512, const unsigned int total_iter_256) +{ + unsigned int total_itr = total_itr_512; + bool flag = total_iter_256 & 0x1; + + for (int itr = 0; itr < total_itr; itr++){ + #pragma HLS PIPELINE II=2 + #pragma HLS loop_tripcount min=min_grid max=max_grid avg=avg_grid + uint512_dt tmp = in.read(); + uint256_dt var_l = tmp.range(255,0); + uint256_dt var_h = tmp.range(511,256);; + out << var_l; + if(!flag || itr < total_itr -1){ // logical NOT: drop the padding upper half of the last word when the 256-bit count is odd + out << var_h; + } + } +} + +// data width conversion to support 512 bit width memory write interface +static void 
stream_convert_256_512(hls::stream &in, hls::stream &out, + const unsigned int total_itr_512, const unsigned int total_itr_256) +{ + unsigned int total_itr = total_itr_512; + bool flag = total_itr_256 & 0x1; + for (int itr = 0; itr < total_itr; itr++){ + #pragma HLS PIPELINE II=2 + #pragma HLS loop_tripcount min=min_grid max=max_grid avg=avg_grid + uint512_dt tmp = 0; + tmp.range(255,0) = in.read(); + if(!flag || itr < total_itr -1){ // logical NOT: no second 256-bit word exists for the last beat when the count is odd + tmp.range(511,256) = in.read(); + } + out << tmp; + } +} + +// coalesced memory write using 512 bit to get maximum out of memory bandwidth +// Single pipelined loop below will be mapped to single memory transfer +// which will further split into multiple transfers by axim module. +static void write_grid(uint512_dt* arg0, hls::stream &wr_buffer, const unsigned int total_itr) +{ + for (int itr = 0; itr < total_itr; itr++){ + #pragma HLS PIPELINE II=1 + #pragma HLS loop_tripcount min=min_grid max=max_grid avg=avg_grid + arg0[itr] = wr_buffer.read(); + } +} + + +void process_mem2stream(uint512_dt* arg0, uint512_dt* arg1, const int count, const int xdim0, const int ydim0, const int zdim0, + const int batch, hls::stream &in, hls::stream &out) +{ + static hls::stream streamArray[2]; + static hls::stream rd_buffer; + static hls::stream wr_buffer; + + // depth of rd_buffer and wr_buffer set such that burst transfers can be supported. 
+ #pragma HLS STREAM variable = streamArray depth = 10 + #pragma HLS STREAM variable = rd_buffer depth = max_depth_16 + #pragma HLS STREAM variable = wr_buffer depth = max_depth_16 + + int end_index = (xdim0 >> SHIFT_BITS); + + unsigned int total_itr_256 = zdim0 * ydim0 * end_index * batch; + unsigned int total_itr_512 = (zdim0 * ydim0 * end_index * batch + 1) >> 1; + + #pragma HLS DATAFLOW + read_grid(arg0, rd_buffer, total_itr_512); + stream_convert_512_256(rd_buffer, streamArray[0], total_itr_512, total_itr_256); + fifo256_2axis(streamArray[0], out, total_itr_256); + axis2_fifo256(in, streamArray[1], total_itr_256); + stream_convert_256_512(streamArray[1], wr_buffer, total_itr_512, total_itr_256); + write_grid(arg1, wr_buffer, total_itr_512); + +} +extern "C" { + + void stencil_mem2stream( + uint512_dt* arg0, + uint512_dt* arg1, + const int count, + const int xdim0, + const int ydim0, + const int zdim0, + const int batch, + hls::stream &in, + hls::stream &out) + { + #pragma HLS INTERFACE depth=4096 m_axi port = arg0 offset = slave bundle = gmem0 max_read_burst_length=64 max_write_burst_length=64 \ + num_read_outstanding=4 num_write_outstanding=4 + #pragma HLS INTERFACE depth=4096 m_axi port = arg1 offset = slave bundle = gmem1 + #pragma HLS INTERFACE s_axilite port = arg0 bundle = control + #pragma HLS INTERFACE s_axilite port = arg1 bundle = control + #pragma HLS INTERFACE s_axilite port = count bundle = control + #pragma HLS INTERFACE s_axilite port = xdim0 bundle = control + #pragma HLS INTERFACE s_axilite port = ydim0 bundle = control + #pragma HLS INTERFACE s_axilite port = zdim0 bundle = control + #pragma HLS INTERFACE s_axilite port = batch bundle = control + #pragma HLS INTERFACE axis port = in register + #pragma HLS INTERFACE axis port = out register + #pragma HLS INTERFACE s_axilite port = return bundle = control + + + for (int i = 0; i < count; i++) + { + process_mem2stream(arg0, arg1, count, xdim0, ydim0, zdim0, batch, in, out); + 
process_mem2stream(arg1, arg0, count, xdim0, ydim0, zdim0, batch, in, out); + } + } +} + diff --git a/FPGA/Xilinx/Batched/heat3D/stencil.cpp b/FPGA/Xilinx/Batched/heat3D/stencil.cpp new file mode 100644 index 0000000..0a67a5d --- /dev/null +++ b/FPGA/Xilinx/Batched/heat3D/stencil.cpp @@ -0,0 +1,238 @@ +#include "stencil.h" + +#define OPTIMIZED_REDUCTION +#define DEBUG_VERBOSE + +template +static T register_it(T x) +{ +#pragma HLS inline off + T temp = x; + return temp; +} + +static void axis2_fifo256(hls::stream &in, hls::stream &out, const unsigned int total_itr) +{ + for (int itr = 0; itr < total_itr; itr++){ + #pragma HLS PIPELINE II=1 + #pragma HLS loop_tripcount min=min_grid max=max_grid avg=avg_grid + t_pkt tmp = in.read(); +// printf("reading 256 bit input from SLR. iter: %d\n", itr); + out << tmp.data; + } +} + +static void fifo256_2axis(hls::stream &in, hls::stream &out, const unsigned int total_itr) +{ + for (int itr = 0; itr < total_itr; itr++){ + #pragma HLS PIPELINE II=1 + #pragma HLS loop_tripcount min=min_grid max=max_grid avg=avg_grid + t_pkt tmp; + tmp.data = in.read(); + out.write(tmp); + } +} + +static void inline get_stencil_coefficent(const float & alpha, const float & beta, + const float & delta_t, const int & init_idx, float * a, + float * b, float * c) +{ + for (int i = 0; i < VEC_FACTOR; i++) + { +#pragma HLS UNROLL + + int idx = init_idx + i; + a[i] = 0.5 * (alpha * std::pow(idx,2) - beta * idx); + b[i] = 1 - alpha * std::pow(idx,2) - beta; + c[i] = 0.5 * (alpha * std::pow(idx,2) + beta * idx); + } +} + +static void process_grid(hls::stream &rd_buffer, hls::stream &wr_buffer, struct data_G data_g, + const float coefficients[7]) +{ + float s_1_1_2_arr[VEC_FACTOR]; + float s_1_2_1_arr[VEC_FACTOR]; + float s_1_1_1_arr[VEC_FACTOR+2]; + float s_1_0_1_arr[VEC_FACTOR]; + float s_1_1_0_arr[VEC_FACTOR]; + + float mem_wr[VEC_FACTOR]; + +#pragma HLS ARRAY_PARTITION variable=s_1_1_2_arr complete dim=1 +#pragma HLS ARRAY_PARTITION variable=s_1_2_1_arr 
complete dim=1 +#pragma HLS ARRAY_PARTITION variable=s_1_1_1_arr complete dim=1 +#pragma HLS ARRAY_PARTITION variable=s_1_0_1_arr complete dim=1 +#pragma HLS ARRAY_PARTITION variable=s_1_1_0_arr complete dim=1 +#pragma HLS ARRAY_PARTITION variable=mem_wr complete dim=1 + + uint256_dt windowBuff_1[max_depth_xy]; + uint256_dt windowBuff_2[max_depth_8]; + uint256_dt windowBuff_3[max_depth_8]; + uint256_dt windowBuff_4[max_depth_xy]; + +#pragma HLS BIND_STORAGE variable=windowBuff_1 type=ram_t2p impl=uram latency=1 +#pragma HLS BIND_STORAGE variable=windowBuff_2 type=ram_t2p impl=uram latency=1 +#pragma HLS BIND_STORAGE variable=windowBuff_3 type=ram_t2p impl=uram latency=1 +#pragma HLS BIND_STORAGE variable=windowBuff_4 type=ram_t2p impl=uram latency=1 + + uint256_dt s_1_1_2, s_1_2_1, s_1_1_1, s_1_1_1b, s_1_1_1f, s_1_0_1, s_1_1_0; + uint256_dt update_j; + + unsigned short i = 0, j = 0, k = 0; + unsigned short j_p = 0, j_l = 0; + + for(unsigned int itr = 0; itr < data_g.gridsize_pr; itr++) + { +#pragma HLS loop_tripcount min=min_grid max=max_grid avg=avg_grid +#pragma HLS PIPELINE II=1 + + spc_temp_blocking_read: + { + bool cond_x = (i == data_g.xblocks); + bool cond_y = (j == data_g.grid_size_y - 1); + bool cond_z = (k == data_g.limit_z - 1); + + if (cond_x) + i = 0; + + if (cond_y && cond_x) + j = 0; + else if(cond_x) + j++; + + if (cond_x && cond_y && cond_z) + k = 1; + else if(cond_y && cond_x) + k++; + + s_1_1_0 = windowBuff_4[j_p]; + + s_1_0_1 = windowBuff_3[j_l]; + windowBuff_4[j_p] = s_1_0_1; + + s_1_1_1b = s_1_1_1; + windowBuff_3[j_l] = s_1_1_1b; + + s_1_1_1 = s_1_1_1f; + s_1_1_1f = windowBuff_2[j_l]; + + s_1_2_1 = windowBuff_1[j_p]; + windowBuff_2[j_l] = s_1_2_1; + + bool cond_read = (itr < data_g.gridsize_da); + + if (cond_read) + { +// printf("reading data for iteration %d\n", itr); + s_1_1_2 = rd_buffer.read(); + } + + windowBuff_1[j_p] = s_1_1_2; + + bool cond_eo_plane = (j_p == data_g.plane_diff); + bool cond_eo_line = (j_l == data_g.line_diff); + + if 
(cond_eo_plane) + j_p = 0; + else + j_p++; + + if(cond_eo_line) + j_l = 0; + else + j_l++; + } + + vec2arr: + { + for (int id = 0; id < VEC_FACTOR; id++) + + { +#pragma HLS UNROLL + + data_conv s_1_1_2_u, s_1_2_1_u, s_1_1_1_u, s_1_0_1_u, s_1_1_0_u; + + s_1_1_2_u.i = s_1_1_2.range(DATATYPE_SIZE * (id + 1) - 1, id * DATATYPE_SIZE); + s_1_2_1_u.i = s_1_2_1.range(DATATYPE_SIZE * (id + 1) - 1, id * DATATYPE_SIZE); + s_1_1_1_u.i = s_1_1_1.range(DATATYPE_SIZE * (id + 1) - 1, id * DATATYPE_SIZE); + s_1_0_1_u.i = s_1_0_1.range(DATATYPE_SIZE * (id + 1) - 1, id * DATATYPE_SIZE); + s_1_1_0_u.i = s_1_1_0.range(DATATYPE_SIZE * (id + 1) - 1, id * DATATYPE_SIZE); + + s_1_1_2_arr[id] = s_1_1_2_u.f; + s_1_2_1_arr[id] = s_1_2_1_u.f; + s_1_1_1_arr[id+1] = s_1_1_1_u.f; + s_1_0_1_arr[id] = s_1_0_1_u.f; + s_1_1_0_arr[id] = s_1_1_0_u.f; + } + + data_conv tmp1_o1, tmp2_o2; + tmp1_o1.i = s_1_1_1b.range(DATATYPE_SIZE * (VEC_FACTOR) - 1, (VEC_FACTOR-1) * DATATYPE_SIZE); + tmp2_o2.i = s_1_1_1f.range(DATATYPE_SIZE * (0 + 1) - 1, 0 * DATATYPE_SIZE); + s_1_1_1_arr[0] = tmp1_o1.f; + s_1_1_1_arr[VEC_FACTOR + 1] = tmp2_o2.f; + } + + process: + { + unsigned short y_index = j + data_g.offset_y; + + for (short q = 0; q < VEC_FACTOR; q++) + { +#pragma HLS UNROLL + + short index = (i << SHIFT_BITS) + q + data_g.offset_x; + + float r1_1_2 = s_1_1_2_arr[q] * coefficients[0]; + float r1_2_1 = s_1_2_1_arr[q] * coefficients[1]; + float r0_1_1 = s_1_1_1_arr[q] * coefficients[2]; + float r1_1_1 = s_1_1_1_arr[q+1] * coefficients[3]; + float r2_1_1 = s_1_1_1_arr[q+2] * coefficients[4]; + float r1_0_1 = s_1_0_1_arr[q] * coefficients[5]; + float r1_1_0 = s_1_1_0_arr[q] * coefficients[6]; + +#ifdef OPTIMIZED_REDUCTION + float f1 = r1_1_2 + r1_2_1; + float f2 = r0_1_1 + r1_1_1; + float f3 = r2_1_1 + r1_0_1; + +#pragma HLS BIND_OP variable=f1 op=fadd +#pragma HLS BIND_OP variable=f2 op=fadd + + float r1 = f1 + f2; + float r2 = f3 + r1_1_0; + + float result = r1 + r2; +#else + float result = r1_1_2 + r1_2_1 + r0_1_1 + 
r1_1_1 + r2_1_1 + r1_0_1 + r1_1_0; +#endif + + bool cond_change = register_it (index <= data_g.offset_x || index > data_g.sizex + || (k <= 1) || (k >= data_g.limit_z -1) || (y_index <= 0) || (y_index >= data_g.grid_size_y - 1)); + + mem_wr[q] = cond_change ? s_1_1_1_arr[q+1] : result; // halo cells pass the centre float through; indexing the packed 256-bit word selects a single bit + } + } + + array2vec: for (int q = 0; q < VEC_FACTOR; q++) + { +#pragma HLS UNROLL + data_conv tmp; + tmp.f = mem_wr[q]; + update_j.range(DATATYPE_SIZE * (q + 1) - 1, q * DATATYPE_SIZE) = tmp.i; + } + + write: + { + bool cond_wr = (k >= 1) && (k < data_g.limit_z); + + if (cond_wr) + wr_buffer << update_j; + } + + // move cell block + i++; + + } + +} diff --git a/FPGA/Xilinx/Batched/heat3D/stencil.h b/FPGA/Xilinx/Batched/heat3D/stencil.h new file mode 100644 index 0000000..242360c --- /dev/null +++ b/FPGA/Xilinx/Batched/heat3D/stencil.h @@ -0,0 +1,92 @@ +#include +#include +#include +#include +#include +#include +#include "heat3D_common.h" + +#pragma once + +typedef ap_uint<512> uint512_dt; +typedef ap_uint<256> uint256_dt; +typedef ap_axiu<256,0,0,0> t_pkt; +typedef ap_axiu<32,0,0,0> t_pkt_32; + +#define SLR_P_STAGE NUM_OF_PROCESS_GRID_PER_SLR + +//Maximum Tile Size +#define MAX_SIZE_X 304 +#define MAX_DEPTH_16 (MAX_SIZE_X/16) + +//user function +#define VEC_FACTOR 8 +#define SHIFT_BITS 3 +#define DATATYPE_SIZE 32 // single precision operations + + +const int max_size_y = MAX_SIZE_X; +const int min_size_y = 20; +const int avg_size_y = MAX_SIZE_X; + +const int max_block_x = MAX_SIZE_X/VEC_FACTOR + 1; +const int min_block_x = 20/VEC_FACTOR + 1; +const int avg_block_x = MAX_SIZE_X/VEC_FACTOR + 1; + +const int max_grid = max_block_x * max_size_y * max_size_y; +const int min_grid = min_block_x * min_size_y * min_size_y; +const int avg_grid = avg_block_x * avg_size_y * avg_size_y; + +const int vec_factor = VEC_FACTOR; +const int max_depth_16 = MAX_DEPTH_16; +const int max_depth_8 = MAX_DEPTH_16 *2; +const int max_depth_xy = max_block_x * max_size_y; + +// union to reinterpret float as integer 
and vice versa +// NOTE(review): reading the union member that was not written last is not +// defined by ISO C++; this relies on the compiler supporting union type punning. +typedef union { + int i; + float f; +} data_conv; + + +// structure to hold grid parameters to avoid recalculation in +// different processes +// (commented-out alternative definition of data_G; the active one follows) +//struct data_G{ +// unsigned short sizex; +// unsigned short sizey; +// unsigned short xdim0; +// unsigned short end_index; +// unsigned short end_row; +// unsigned int gridsize; +// unsigned int total_itr_512; +// unsigned int total_itr_256; +// unsigned short outer_loop_limit; +// unsigned short endrow_plus2; +// unsigned short endrow_plus1; +// unsigned short endrow_minus1; +// unsigned short endindex_minus1; +//}; + +// Active data_G: grid parameters bundled into one plain struct. The stencil +// code in this patch reads offset_x, sizex, limit_z and grid_size_y from a +// variable named data_g when choosing between the computed stencil result and +// the pass-through value. The meaning and units of the individual fields are +// not documented here -- TODO confirm against the code that fills the struct in. +struct data_G{ + unsigned short sizex; + unsigned short sizey; + unsigned short sizez; + unsigned short xblocks; + unsigned short grid_size_x; + unsigned short grid_size_y; + unsigned short grid_size_z; + unsigned short limit_z; + unsigned short offset_x; + unsigned short offset_y; + unsigned short offset_z; + unsigned int plane_size; + unsigned int gridsize_pr; + unsigned int gridsize_da; + unsigned int plane_diff; + unsigned int line_diff; + unsigned short outer_loop_limit; + unsigned int total_itr; + bool last_half; + unsigned short batches; +}; + diff --git a/FPGA/Xilinx/Batched/heat3D/stencil_3_SLR.cfg b/FPGA/Xilinx/Batched/heat3D/stencil_3_SLR.cfg new file mode 100644 index 0000000..35aa0b8 --- /dev/null +++ b/FPGA/Xilinx/Batched/heat3D/stencil_3_SLR.cfg @@ -0,0 +1,22 @@ +[connectivity] +nk=stencil_SLR:1:stencil_SLR_2 +nk=stencil_SLR:1:stencil_SLR_3 + +stream_connect=stencil_mem2stream_1.out:stencil_SLR_1.in:128 +stream_connect=stencil_SLR_1.out:stencil_SLR_2.in:128 +stream_connect=stencil_SLR_2.out:stencil_SLR_3.in:128 +stream_connect=stencil_SLR_3.out:stencil_mem2stream_1.in:128 + +sp=stencil_mem2stream_1.arg0:HBM[0] +sp=stencil_mem2stream_1.arg1:HBM[1] + +slr=stencil_mem2stream_1:SLR0 +slr=stencil_SLR_1:SLR0 +slr=stencil_SLR_2:SLR1 +slr=stencil_SLR_3:SLR2 + +[profile] +data=all:all:all +memory=all +stall=all:all +exec=all:all +diff --git 
a/FPGA/Xilinx/Batched/heat3D/stencil_single_SLR.cfg b/FPGA/Xilinx/Batched/heat3D/stencil_single_SLR.cfg new file mode 100644 index 0000000..047324d --- /dev/null +++ b/FPGA/Xilinx/Batched/heat3D/stencil_single_SLR.cfg @@ -0,0 +1,9 @@ +[connectivity] +stream_connect=stencil_mem2stream_1.out:stencil_SLR_1.in +stream_connect=stencil_SLR_1.out:stencil_mem2stream_1.in + +sp=stencil_mem2stream_1.arg0:HBM[0] +sp=stencil_mem2stream_1.arg1:HBM[1] + +slr=stencil_mem2stream_1:SLR1 +slr=stencil_SLR_1:SLR0 diff --git a/FPGA/Xilinx/Batched/heat3D/xcl2.cpp b/FPGA/Xilinx/Batched/heat3D/xcl2.cpp new file mode 100644 index 0000000..893b79d --- /dev/null +++ b/FPGA/Xilinx/Batched/heat3D/xcl2.cpp @@ -0,0 +1,114 @@ +/********** +Copyright (c) 2018, Xilinx, Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation +and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors +may be used to endorse or promote products derived from this software +without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********/
+
+#include "xcl2.hpp"
+#include <climits>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <sys/stat.h>
+#include <unistd.h>
+
+namespace xcl {
+// Returns every CL_DEVICE_TYPE_ACCELERATOR device of the first OpenCL platform
+// whose CL_PLATFORM_NAME equals vendor_name. Terminates the process with
+// EXIT_FAILURE when no platform matches or when an OpenCL call reports an
+// error (through OCL_CHECK).
+std::vector<cl::Device> get_devices(const std::string &vendor_name) {
+    size_t i;
+    cl_int err;
+    std::vector<cl::Platform> platforms;
+    OCL_CHECK(err, err = cl::Platform::get(&platforms));
+    cl::Platform platform;
+    for (i = 0; i < platforms.size(); i++) {
+        platform = platforms[i];
+        // OCL_CHECK expands to bare statements, so platformName stays declared
+        // in this scope and can be inspected below.
+        OCL_CHECK(err,
+                  std::string platformName =
+                      platform.getInfo<CL_PLATFORM_NAME>(&err));
+        if (platformName == vendor_name) {
+            std::cout << "Found Platform" << std::endl;
+            std::cout << "Platform Name: " << platformName.c_str() << std::endl;
+            break;
+        }
+    }
+    if (i == platforms.size()) {
+        // Name the vendor that was actually requested instead of a fixed one.
+        std::cout << "Error: Failed to find " << vendor_name << " platform" << std::endl;
+        exit(EXIT_FAILURE);
+    }
+    // Collect all accelerator devices of that platform; choosing one of them
+    // is left to the caller.
+    std::vector<cl::Device> devices;
+    OCL_CHECK(err,
+              err = platform.getDevices(CL_DEVICE_TYPE_ACCELERATOR, &devices));
+    return devices;
+}
+
+// Convenience wrapper: accelerator devices of the platform named "Xilinx".
+std::vector<cl::Device> get_xil_devices() { return get_devices("Xilinx"); }
+
+
+// Loads the whole file xclbin_file_name into memory and returns its bytes.
+// Terminates the process with EXIT_FAILURE when the file is not readable,
+// cannot be opened or sized, or cannot be read completely.
+std::vector<unsigned char> read_binary_file(const std::string &xclbin_file_name) {
+    std::cout << "INFO: Reading " << xclbin_file_name << std::endl;
+
+    if (access(xclbin_file_name.c_str(), R_OK) != 0) {
+        printf("ERROR: %s xclbin not available please build\n",
+               xclbin_file_name.c_str());
+        exit(EXIT_FAILURE);
+    }
+    //Loading XCL Bin into char buffer
+    std::cout << "Loading: '" << xclbin_file_name.c_str() << "'\n";
+    std::ifstream bin_file(xclbin_file_name.c_str(), std::ifstream::binary);
+    bin_file.seekg(0, bin_file.end);
+    const std::streamoff nb = bin_file.tellg();
+    // tellg() yields -1 when the open or the seek failed; sizing the buffer
+    // with that value would request an absurd allocation.
+    if (!bin_file || nb < 0) {
+        printf("ERROR: %s xclbin could not be opened\n",
+               xclbin_file_name.c_str());
+        exit(EXIT_FAILURE);
+    }
+    bin_file.seekg(0, bin_file.beg);
+    std::vector<unsigned char> buf(static_cast<size_t>(nb));
+    bin_file.read(reinterpret_cast<char *>(buf.data()), nb);
+    // A short read would hand a partially filled image to the caller.
+    if (bin_file.gcount() != nb) {
+        printf("ERROR: %s xclbin could not be read completely\n",
+               xclbin_file_name.c_str());
+        exit(EXIT_FAILURE);
+    }
+    return buf;
+}
+
+// True when the environment variable XCL_EMULATION_MODE is set (any value).
+bool is_emulation() {
+    return getenv("XCL_EMULATION_MODE") != nullptr;
+}
+
+// True when XCL_EMULATION_MODE is set to exactly "hw_emu".
+bool is_hw_emulation() {
+    const char *xcl_mode = getenv("XCL_EMULATION_MODE");
+    return (xcl_mode != nullptr) && (strcmp(xcl_mode, "hw_emu") == 0);
+}
+
+// True when device_name contains the substring "xpr".
+// device_name must be a valid NUL-terminated string (it is passed to strstr).
+bool is_xpr_device(const char *device_name) {
+    return strstr(device_name, "xpr") != nullptr;
+}
+} // namespace xcl
diff --git a/FPGA/Xilinx/Batched/heat3D/xcl2.hpp b/FPGA/Xilinx/Batched/heat3D/xcl2.hpp
new file mode 100644
index 0000000..7dbcb6d
--- /dev/null
+++ b/FPGA/Xilinx/Batched/heat3D/xcl2.hpp
@@ -0,0 +1,105 @@
+/**********
+Copyright (c) 2018, Xilinx, Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its contributors
+may be used to endorse or promote products derived from this software
+without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********/ + + +#pragma once + +// The CL_HPP_* macros configure the Khronos OpenCL C++ bindings for OpenCL 1.2; +// they only take effect when defined before the bindings header is included. +#define CL_HPP_CL_1_2_DEFAULT_BUILD +#define CL_HPP_TARGET_OPENCL_VERSION 120 +#define CL_HPP_MINIMUM_OPENCL_VERSION 120 +#define CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY 1 +#define CL_USE_DEPRECATED_OPENCL_1_2_APIS + +//OCL_CHECK executes `call`, then prints the call site and the error code and +//exits with EXIT_FAILURE when `error` is not CL_SUCCESS. +//It deliberately expands to two bare statements instead of a do { } while (0) +//block: a variable declared inside `call` must stay visible to the code that +//follows the macro (get_devices in xcl2.cpp relies on this). Do not use it as +//the unbraced body of an if/else/for. +//OCL_CHECK doesn't work if call contains a top-level comma (for example a +//function template call with several template arguments), because the +//preprocessor would split it into extra macro arguments. +#define OCL_CHECK(error,call) \ + call; \ + if (error != CL_SUCCESS) { \ + printf("%s:%d Error calling " #call ", error code is: %d\n", \ + __FILE__,__LINE__, error); \ + exit(EXIT_FAILURE); \ + } + +#include +#include +#include +#include +// When creating a buffer with a user pointer (CL_MEM_USE_HOST_PTR), under the hood +// the user pointer is used if and only if it is properly aligned (page aligned). When +// it is not aligned, the runtime has no choice but to create its own host-side buffer +// that backs the user pointer. This in turn implies that all operations that move data +// to and from the device incur an extra memcpy to move data to/from the runtime's own +// host buffer from/to the user pointer. So it is recommended to use this allocator if +// the user wishes to create a Buffer/Memory Object with CL_MEM_USE_HOST_PTR, so that the +// user buffer is aligned to the page boundary. This ensures that the user buffer is +// used when the user creates a Buffer/Mem Object with CL_MEM_USE_HOST_PTR. 
+// Minimal stateless allocator that returns 4096-byte aligned storage obtained
+// from posix_memalign and releases it with free. Intended for containers such
+// as std::vector<T, aligned_allocator<T>>.
+template <typename T>
+struct aligned_allocator
+{
+  using value_type = T;
+
+  aligned_allocator() noexcept = default;
+  // Converting constructor required by the Allocator requirements: containers
+  // may rebind the allocator to another element type.
+  template <typename U>
+  aligned_allocator(const aligned_allocator<U>&) noexcept {}
+
+  // Allocates storage for num objects of type T.
+  // Throws std::bad_alloc when the byte count would overflow or when
+  // posix_memalign fails.
+  T* allocate(std::size_t num)
+  {
+    // Guard the multiplication below against wrapping around.
+    if (num > static_cast<std::size_t>(-1) / sizeof(T))
+      throw std::bad_alloc();
+    void* ptr = nullptr;
+    if (posix_memalign(&ptr,4096,num*sizeof(T)))
+      throw std::bad_alloc();
+    return static_cast<T*>(ptr);
+  }
+  // Releases storage obtained from allocate(); the element count is not needed.
+  void deallocate(T* p, std::size_t /*num*/) noexcept
+  {
+    free(p);
+  }
+};
+
+// The allocator holds no state, so any two instances are interchangeable.
+template <typename T, typename U>
+bool operator==(const aligned_allocator<T>&, const aligned_allocator<U>&) noexcept
+{
+  return true;
+}
+template <typename T, typename U>
+bool operator!=(const aligned_allocator<T>&, const aligned_allocator<U>&) noexcept
+{
+  return false;
+}
+
+namespace xcl {
+  // Accelerator devices of the platform named "Xilinx".
+  std::vector<cl::Device> get_xil_devices();
+  // Accelerator devices of the first platform whose name equals vendor_name.
+  std::vector<cl::Device> get_devices(const std::string& vendor_name);
+  // Contents of the given file as a byte vector.
+  std::vector<unsigned char> read_binary_file(const std::string &xclbin_file_name);
+  // Checks based on the XCL_EMULATION_MODE environment variable.
+  bool is_emulation ();
+  bool is_hw_emulation ();
+  // True when device_name contains "xpr".
+  bool is_xpr_device (const char *device_name);
+  // Holds the entry points of the stream extension functions, resolved at
+  // run time by init().
+  // NOTE(review): the static members are only declared here; each one needs a
+  // single out-of-class definition in some translation unit before code using
+  // Stream can link. None is present in the xcl2.cpp hunk of this patch.
+  class Stream{
+    public:
+      static decltype(&clCreateStream) createStream;
+      static decltype(&clReleaseStream) releaseStream;
+      static decltype(&clReadStream) readStream;
+      static decltype(&clWriteStream) writeStream;
+      static decltype(&clPollStreams) pollStreams;
+      // Looks up the five extension functions on the given platform. A lookup
+      // that fails leaves the corresponding member null; no check is done here.
+      static void init(const cl_platform_id& platform) {
+          createStream = reinterpret_cast<decltype(&clCreateStream)>(
+              clGetExtensionFunctionAddressForPlatform(platform, "clCreateStream"));
+          releaseStream = reinterpret_cast<decltype(&clReleaseStream)>(
+              clGetExtensionFunctionAddressForPlatform(platform, "clReleaseStream"));
+          readStream = reinterpret_cast<decltype(&clReadStream)>(
+              clGetExtensionFunctionAddressForPlatform(platform, "clReadStream"));
+          writeStream = reinterpret_cast<decltype(&clWriteStream)>(
+              clGetExtensionFunctionAddressForPlatform(platform, "clWriteStream"));
+          pollStreams = reinterpret_cast<decltype(&clPollStreams)>(
+              clGetExtensionFunctionAddressForPlatform(platform, "clPollStreams"));
+      }
+  };
+}