From 045c1f9b03c527baa71cd0d662176c5f1f6e2c75 Mon Sep 17 00:00:00 2001
From: Beniel Thileepan <thileepanbeniel@gmail.com>
Date: Thu, 16 Feb 2023 18:16:28 +0000
Subject: [PATCH] completed heat code. HW run tested

---
 FPGA/Xilinx/Batched/heat3D/SLR.cpp            |  86 ++++
 FPGA/Xilinx/Batched/heat3D/heat3D_app.cpp     | 383 ++++++++++++++++++
 FPGA/Xilinx/Batched/heat3D/heat3D_common.h    |   3 +-
 FPGA/Xilinx/Batched/heat3D/heat3D_cpu.cpp     |   4 +-
 FPGA/Xilinx/Batched/heat3D/heat3D_cpu.h       |   4 +-
 FPGA/Xilinx/Batched/heat3D/mem2stream.cpp     | 134 ++++++
 FPGA/Xilinx/Batched/heat3D/stencil.cpp        | 238 +++++++++++
 FPGA/Xilinx/Batched/heat3D/stencil.h          |  92 +++++
 FPGA/Xilinx/Batched/heat3D/stencil_3_SLR.cfg  |  22 +
 .../Batched/heat3D/stencil_single_SLR.cfg     |   9 +
 FPGA/Xilinx/Batched/heat3D/xcl2.cpp           | 114 ++++++
 FPGA/Xilinx/Batched/heat3D/xcl2.hpp           | 105 +++++
 12 files changed, 1189 insertions(+), 5 deletions(-)
 create mode 100644 FPGA/Xilinx/Batched/heat3D/SLR.cpp
 create mode 100644 FPGA/Xilinx/Batched/heat3D/heat3D_app.cpp
 create mode 100644 FPGA/Xilinx/Batched/heat3D/mem2stream.cpp
 create mode 100644 FPGA/Xilinx/Batched/heat3D/stencil.cpp
 create mode 100644 FPGA/Xilinx/Batched/heat3D/stencil.h
 create mode 100644 FPGA/Xilinx/Batched/heat3D/stencil_3_SLR.cfg
 create mode 100644 FPGA/Xilinx/Batched/heat3D/stencil_single_SLR.cfg
 create mode 100644 FPGA/Xilinx/Batched/heat3D/xcl2.cpp
 create mode 100644 FPGA/Xilinx/Batched/heat3D/xcl2.hpp

diff --git a/FPGA/Xilinx/Batched/heat3D/SLR.cpp b/FPGA/Xilinx/Batched/heat3D/SLR.cpp
new file mode 100644
index 0000000..a8ca89a
--- /dev/null
+++ b/FPGA/Xilinx/Batched/heat3D/SLR.cpp
@@ -0,0 +1,86 @@
+#include <ap_int.h>
+#include <hls_stream.h>
+#include <ap_axi_sdata.h>
+#include <math.h>
+#include "stencil.h"
+#include "stencil.cpp"
+// Per-SLR compute stage: unpacks the AXI stream, chains SLR_P_STAGE process_grid passes, and repacks the result.
+void process_SLR(hls::stream <t_pkt> &in, hls::stream<t_pkt> &out, const int xdim0, const unsigned short size_x,
+		const unsigned int size_y, const unsigned int size_z, const unsigned short batches, const float calcParam_K)
+{
+	hls::stream<uint256_dt> streamArray[SLR_P_STAGE + 1];
+#pragma HLS STREAM variable = streamArray depth = 10
+	// grid descriptor that is passed to every process_grid stage
+	data_G data_g;
+	data_g.sizex = size_x;
+	data_g.sizey = size_y;
+	data_g.sizez = size_z;
+	data_g.offset_x = 0;
+	data_g.grid_size_x = xdim0;
+	data_g.xblocks = (data_g.grid_size_x >> SHIFT_BITS);
+	data_g.offset_y = 0;
+	data_g.grid_size_y = size_y + 2;
+	data_g.offset_z = 0;
+	data_g.grid_size_z = size_z + 2;
+	data_g.batches = batches;
+	data_g.limit_z = size_z + 3;
+	// plane_size: 256-bit words per z-plane (xblocks words per row times grid_size_y rows)
+	unsigned short tile_y_1 = data_g.grid_size_y - 1;
+	unsigned int plane_size = data_g.xblocks * data_g.grid_size_y;
+	// plane_diff / line_diff: wrap-around values of the window-buffer indices inside process_grid
+	data_g.plane_diff = data_g.xblocks * tile_y_1;
+	data_g.line_diff = data_g.xblocks - 1;
+	data_g.gridsize_pr = plane_size * register_it(data_g.grid_size_z * batches + 1); // loop iterations per pass: all data planes plus one extra plane
+	data_g.gridsize_da = register_it(plane_size * data_g.grid_size_z) * batches; // 256-bit data words per pass, over all batches
+	// 7-point stencil weights: K for each of the six neighbours, 1-6K for the centre cell (index 3)
+	const float coefficients[7] = {calcParam_K, calcParam_K, calcParam_K, 1-6*calcParam_K, calcParam_K, calcParam_K, calcParam_K};
+#pragma HLS ARRAY_PARTITION variable=coefficients complete dim=1
+
+#pragma HLS DATAFLOW
+	{
+		axis2_fifo256(in, streamArray[0], data_g.gridsize_da);
+
+		for (int i = 0; i < SLR_P_STAGE; i++)
+		{
+#pragma HLS unroll
+			process_grid(streamArray[i], streamArray[i+1], data_g, coefficients);
+		}
+
+		fifo256_2axis(streamArray[SLR_P_STAGE], out, data_g.gridsize_da);
+
+	}
+
+}
+// Kernel entry point: runs process_SLR count * 2 times on the in/out AXI streams.
+extern "C"
+{
+	void stencil_SLR(
+			const int sizex,
+			const int sizey,
+			const int sizez,
+			const int xdim0,
+			const int batches,
+			const int count,
+			const float calcParam_K,
+			hls::stream <t_pkt> &in,
+			hls::stream <t_pkt> &out)
+	{
+#pragma HLS INTERFACE axis port = in register
+#pragma HLS INTERFACE axis port = out register
+// scalar arguments and the return port are mapped to the s_axilite bundle "control"
+#pragma HLS INTERFACE s_axilite port = sizex bundle = control
+#pragma HLS INTERFACE s_axilite port = sizey bundle = control
+#pragma HLS INTERFACE s_axilite port = sizez bundle = control
+#pragma HLS INTERFACE s_axilite port = xdim0 bundle = control
+#pragma HLS INTERFACE s_axilite port = batches bundle = control
+#pragma HLS INTERFACE s_axilite port = count bundle = control
+#pragma HLS INTERFACE s_axilite port = calcParam_K bundle = control
+#pragma HLS INTERFACE s_axilite port = return bundle = control
+// two passes per count, matching the two process_mem2stream calls per count in stencil_mem2stream
+		for (unsigned int i = 0; i < count * 2; i++) // NOTE(review): unsigned i vs signed count; a negative count would wrap to a huge trip count
+		{
+			process_SLR(in, out, xdim0, sizex, sizey, sizez, batches, calcParam_K);
+		}
+
+	}
+}
diff --git a/FPGA/Xilinx/Batched/heat3D/heat3D_app.cpp b/FPGA/Xilinx/Batched/heat3D/heat3D_app.cpp
new file mode 100644
index 0000000..cbe219d
--- /dev/null
+++ b/FPGA/Xilinx/Batched/heat3D/heat3D_app.cpp
@@ -0,0 +1,383 @@
+
+// standard headers
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <chrono>
+#include <iostream>
+#include "xcl2.hpp"
+#include "heat3D_cpu.h"
+#include "heat3D_common.h"
+
+//#define DEBUG_VERBOSE
+#define VERIFICATION
+#define MULTI_SLR
+//#define FPGA_RUN_ONLY
+
+int main(int argc, char **argv)
+{
+    // Host driver: runs the CPU reference and the FPGA kernels on the same batched grid and compares the results.
+    if (argc < 2)
+    {
+        std::cerr << "Usage: " << (argc > 0 ? argv[0] : "heat3D_app") << " <xclbin> [-size=N] [-iters=N] [-batch=N]" << std::endl;
+        return EXIT_FAILURE;
+    }
+    GridParameter gridData;
+    gridData.logical_size_x = 100;
+    gridData.logical_size_y = 100;
+    gridData.logical_size_z = 100;
+    gridData.batch = 10;
+    gridData.num_iter = 1000;
+    unsigned int vectorization_factor = 8;
+
+    // setting grid parameters given by user
+    const char * pch;
+    for ( int n = 1; n < argc; n++ )
+    {
+        pch = strstr(argv[n], "-size=");
+        if(pch != NULL)
+        {
+            // the value starts 6 characters after the match ("-size=" is one shorter than the other options)
+            gridData.logical_size_x = atoi ( pch + 6 ); continue;
+        }
+        pch = strstr(argv[n], "-iters=");
+        if(pch != NULL)
+        {
+            gridData.num_iter = atoi ( pch + 7 ); continue;
+        }
+        pch = strstr(argv[n], "-batch=");
+        if(pch != NULL)
+        {
+            gridData.batch = atoi ( pch + 7 ); continue;
+        }
+    }
+
+    printf("Grid: %dx1 , %d iterations, %d batches\n", gridData.logical_size_x, gridData.num_iter, gridData.batch);
+
+    //adding halo
+    gridData.act_size_x = gridData.logical_size_x + 2;
+    gridData.act_size_y = gridData.logical_size_y + 2;
+    gridData.act_size_z = gridData.logical_size_z + 2;
+
+    //padding each row as multiples of vectorization factor
+    gridData.grid_size_x = (gridData.act_size_x % vectorization_factor) != 0 ?
+            (gridData.act_size_x/vectorization_factor + 1) * vectorization_factor :
+            gridData.act_size_x;
+    gridData.grid_size_y = gridData.act_size_y;
+    gridData.grid_size_z = gridData.act_size_z;
+
+    //allocating memory buffer; the size is computed in size_t so that large grids cannot wrap around 32 bits
+    const size_t data_size_bytes = static_cast<size_t>(gridData.grid_size_x) * gridData.grid_size_y
+            * gridData.grid_size_z * sizeof(float) * gridData.batch;
+
+    if (data_size_bytes >= 4000000000ULL)
+    {
+        std::cerr << "Maximum buffer size is exceeded!" << std::endl;
+        return EXIT_FAILURE;
+    }
+    heat3DParameter calcParam;
+
+    // h must be computed in floating point: the former 1/act_size_x was an integer division that yields 0,
+    // which made K = alpha * delta_t / h^2 infinite.
+    // NOTE(review): with the defaults (act_size_x = 102) K is about 7.8; confirm alpha/delta_t are intended.
+    calcParam.alpha = 1.5/1000; //diffusivity
+    calcParam.h = 1.0f/gridData.act_size_x;
+    calcParam.delta_t = 0.5; //0.5s
+    calcParam.K = calcParam.alpha * calcParam.delta_t / (calcParam.h * calcParam.h);
+
+    float * grid_u1_cpu = (float*) aligned_alloc(4096, data_size_bytes);
+    float * grid_u2_cpu = (float*) aligned_alloc(4096, data_size_bytes);
+    float * grid_u1_d = (float*) aligned_alloc(4096, data_size_bytes);
+    float * grid_u2_d = (float*) aligned_alloc(4096, data_size_bytes);
+
+    auto init_start_clk_point = std::chrono::high_resolution_clock::now();
+    initialize_grid(grid_u1_cpu, gridData);
+    copy_grid(grid_u1_cpu, grid_u1_d, gridData);
+    auto init_stop_clk_point = std::chrono::high_resolution_clock::now();
+    double runtime_init = std::chrono::duration<double, std::micro> (init_stop_clk_point - init_start_clk_point).count();
+    copy_grid(grid_u1_cpu, grid_u2_cpu, gridData);
+
+
+#ifdef DEBUG_VERBOSE
+    std::cout << std::endl;
+    std::cout << "*********************************************"  << std::endl;
+    std::cout << "**            intial grid values           **"  << std::endl;
+    std::cout << "*********************************************"  << std::endl;
+
+    for (unsigned int bat = 0; bat < gridData.batch; bat++)
+    {
+        // one batch spans a full x*y*z volume (the stride previously multiplied grid_size_y twice)
+        int offset = bat * gridData.grid_size_x * gridData.grid_size_y * gridData.grid_size_z;
+        std::cout << "---------------------------------------------" << std::endl;
+        std::cout << "               batch: " << bat << std::endl;
+        std::cout << "---------------------------------------------" << std::endl;
+
+        for (unsigned int k = 0; k < gridData.grid_size_z; k++)
+        {
+            for (unsigned int j = 0; j < gridData.grid_size_y; j++)
+            {
+                for (unsigned int i = 0; i < gridData.grid_size_x; i++)
+                {
+                    int index = offset + k * gridData.grid_size_x * gridData.grid_size_y
+                            + j * gridData.grid_size_x + i;
+                    std::cout << "grid_id: (" << i << ", " << j << ", " << k << ") initial_val: "
+                            << grid_u1_cpu[index]<< std::endl;
+                }
+            }
+        }
+    }
+    std::cout << "============================================="  << std::endl << std::endl;
+#endif
+
+#ifndef FPGA_RUN_ONLY
+    //golden stencil computation on the CPU
+
+    std::vector<heat3DParameter> calcParams(gridData.batch);
+
+    for (unsigned int bat = 0; bat < gridData.batch; bat++)
+    {
+        calcParams[bat] = calcParam;
+    }
+
+    auto naive_cpu_start_clk_point = std::chrono::high_resolution_clock::now();
+    heat3D_explicit(grid_u1_cpu, grid_u2_cpu, gridData, calcParams);
+    auto naive_cpu_stop_clk_point = std::chrono::high_resolution_clock::now();
+    double runtime_naive_cpu_stencil = std::chrono::duration<double, std::micro> (naive_cpu_stop_clk_point - naive_cpu_start_clk_point).count();
+
+#endif
+    //OPENCL HOST CODE START
+    const char * binaryFile = argv[1];
+    cl_int err;
+
+    auto devices = xcl::get_xil_devices();
+    auto device = devices[0];
+
+    OCL_CHECK(err, cl::Context context(device, NULL, NULL, NULL, &err));
+    OCL_CHECK(err, cl::CommandQueue queue(context, CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err));
+    OCL_CHECK(err, std::string device_name = device.getInfo<CL_DEVICE_NAME>(&err));
+
+    //Create Program and Kernel
+    auto fileBuf = xcl::read_binary_file(binaryFile);
+    cl::Program::Binaries bins{{fileBuf.data(), fileBuf.size()}};
+
+    OCL_CHECK(err, cl::Program program(context, {device}, bins, NULL, &err));
+    OCL_CHECK(err, cl::Kernel krnl_slr0(program, "stencil_SLR", &err));
+
+#ifdef MULTI_SLR
+    OCL_CHECK(err, cl::Kernel krnl_slr1(program, "stencil_SLR", &err));
+    OCL_CHECK(err, cl::Kernel krnl_slr2(program, "stencil_SLR", &err));
+#endif
+    OCL_CHECK(err, cl::Kernel krnl_mem2stream(program, "stencil_mem2stream", &err));
+
+    //Allocation Buffer in Global Memory; stencil_mem2stream writes both buffers, so both must be READ_WRITE
+    OCL_CHECK(err, cl::Buffer buff_curr(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, data_size_bytes, grid_u1_d, &err));
+    OCL_CHECK(err, cl::Buffer buff_next(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, data_size_bytes, grid_u2_d, &err));
+
+#ifdef MULTI_SLR
+    unsigned int total_SLR = 3;
+#else
+    unsigned int total_SLR = 1;
+#endif
+
+    unsigned number_of_process_grid_per_SLR = NUM_OF_PROCESS_GRID_PER_SLR;
+    unsigned int total_process_grid_per_iter = total_SLR * number_of_process_grid_per_SLR * 2;
+    // NOTE(review): integer division; if num_iter is not a multiple of total_process_grid_per_iter the kernels run fewer steps than gridData.num_iter
+    unsigned int num_iter = gridData.num_iter / total_process_grid_per_iter;
+    //set Kernel arguments
+
+    /*
+     *  void stencil_SLR(
+            const int sizex,
+            const int sizey,
+            const int sizez,
+            const int xdim0,
+            const int batches,
+            const int count,
+            const float calcParam_K,
+            hls::stream <t_pkt> &in,
+            hls::stream <t_pkt> &out)
+     */
+    int narg = 0;
+    OCL_CHECK(err, err = krnl_slr0.setArg(narg++, gridData.logical_size_x));
+    OCL_CHECK(err, err = krnl_slr0.setArg(narg++, gridData.logical_size_y));
+    OCL_CHECK(err, err = krnl_slr0.setArg(narg++, gridData.logical_size_z));
+    OCL_CHECK(err, err = krnl_slr0.setArg(narg++, gridData.grid_size_x));
+    OCL_CHECK(err, err = krnl_slr0.setArg(narg++, gridData.batch));
+    OCL_CHECK(err, err = krnl_slr0.setArg(narg++, num_iter));
+    OCL_CHECK(err, err = krnl_slr0.setArg(narg++, calcParam.K));
+
+#ifdef MULTI_SLR
+    narg = 0;
+    OCL_CHECK(err, err = krnl_slr1.setArg(narg++, gridData.logical_size_x));
+    OCL_CHECK(err, err = krnl_slr1.setArg(narg++, gridData.logical_size_y));
+    OCL_CHECK(err, err = krnl_slr1.setArg(narg++, gridData.logical_size_z));
+    OCL_CHECK(err, err = krnl_slr1.setArg(narg++, gridData.grid_size_x));
+    OCL_CHECK(err, err = krnl_slr1.setArg(narg++, gridData.batch));
+    OCL_CHECK(err, err = krnl_slr1.setArg(narg++, num_iter));
+    OCL_CHECK(err, err = krnl_slr1.setArg(narg++, calcParam.K));
+
+    narg = 0;
+    OCL_CHECK(err, err = krnl_slr2.setArg(narg++, gridData.logical_size_x));
+    OCL_CHECK(err, err = krnl_slr2.setArg(narg++, gridData.logical_size_y));
+    OCL_CHECK(err, err = krnl_slr2.setArg(narg++, gridData.logical_size_z));
+    OCL_CHECK(err, err = krnl_slr2.setArg(narg++, gridData.grid_size_x));
+    OCL_CHECK(err, err = krnl_slr2.setArg(narg++, gridData.batch));
+    OCL_CHECK(err, err = krnl_slr2.setArg(narg++, num_iter));
+    OCL_CHECK(err, err = krnl_slr2.setArg(narg++, calcParam.K));
+#endif
+
+    /*
+     *  void stencil_mem2stream(
+            uint512_dt* arg0,
+            uint512_dt* arg1,
+            const int count,
+            const int xdim0,
+            const int ydim0,
+            const int zdim0,
+            const int batch,
+            hls::stream <t_pkt> &in,
+            hls::stream <t_pkt> &out)
+     */
+    narg = 0;
+    OCL_CHECK(err, err = krnl_mem2stream.setArg(narg++, buff_curr));
+    OCL_CHECK(err, err = krnl_mem2stream.setArg(narg++, buff_next));
+    OCL_CHECK(err, err = krnl_mem2stream.setArg(narg++, num_iter));
+    OCL_CHECK(err, err = krnl_mem2stream.setArg(narg++, gridData.grid_size_x));
+    OCL_CHECK(err, err = krnl_mem2stream.setArg(narg++, gridData.grid_size_y));
+    OCL_CHECK(err, err = krnl_mem2stream.setArg(narg++, gridData.grid_size_z));
+    OCL_CHECK(err, err = krnl_mem2stream.setArg(narg++, gridData.batch));
+
+    //Copy input buffer to device
+    auto h_to_d_start_point = std::chrono::high_resolution_clock::now();
+    OCL_CHECK(err, err = queue.enqueueMigrateMemObjects({buff_curr, buff_next}, 0));
+    queue.finish();
+    auto h_to_d_stop_kernels_start_point = std::chrono::high_resolution_clock::now();
+#ifdef MULTI_SLR
+    OCL_CHECK(err, err = queue.enqueueTask(krnl_slr2));
+    OCL_CHECK(err, err = queue.enqueueTask(krnl_slr1));
+#endif
+    OCL_CHECK(err, err = queue.enqueueTask(krnl_slr0));
+    OCL_CHECK(err, err = queue.enqueueTask(krnl_mem2stream));
+    queue.finish();
+    auto kernels_stop_d_to_h_start_point = std::chrono::high_resolution_clock::now();
+    OCL_CHECK(err, err = queue.enqueueMigrateMemObjects({buff_curr}, CL_MIGRATE_MEM_OBJECT_HOST));
+    queue.finish();
+    auto d_to_h_stop_point = std::chrono::high_resolution_clock::now();
+
+    double h_to_d_runtime = std::chrono::duration<double, std::micro>
+            (h_to_d_stop_kernels_start_point - h_to_d_start_point).count();
+    double kernels_runtime = std::chrono::duration<double, std::micro>
+            (kernels_stop_d_to_h_start_point - h_to_d_stop_kernels_start_point).count();
+    double d_to_h_runtime = std::chrono::duration<double, std::micro>
+            (d_to_h_stop_point - kernels_stop_d_to_h_start_point).count();
+
+#ifdef VERIFICATION
+    std::cout << std::endl;
+    std::cout << "*********************************************"  << std::endl;
+    std::cout << "**               Verification              **"  << std::endl;
+    std::cout << "*********************************************"  << std::endl;
+
+    for (unsigned int bat = 0; bat < gridData.batch; bat++)
+    {
+        // one batch spans a full x*y*z volume (the stride previously multiplied grid_size_y twice)
+        int offset = bat * gridData.grid_size_x * gridData.grid_size_y * gridData.grid_size_z;
+        std::cout << "---------------------------------------------" << std::endl;
+        std::cout << "               batch: " << bat << std::endl;
+        std::cout << "---------------------------------------------" << std::endl;
+
+        bool passed = true;
+
+        for (unsigned int k = 0; k < gridData.grid_size_z; k++)
+        {
+            for (unsigned int j = 0; j < gridData.grid_size_y; j++)
+            {
+                for (unsigned int i = 0; i < gridData.grid_size_x; i++)
+                {
+                    int index = offset + k * gridData.grid_size_x * gridData.grid_size_y
+                            + j * gridData.grid_size_x + i;
+                    if (fabs(grid_u1_cpu[index] - grid_u1_d[index]) > EPSILON) // fabs: never resolves to the integer abs()
+                    {
+                        std::cerr << "Value Mismatch index: (" << i << ", " << j << ", " << k << "), naive_cpu_val: "
+                                << grid_u1_cpu[index] << ", and fpga_val: " << grid_u1_d[index] << std::endl;
+                        passed = false;
+                    }
+                }
+            }
+        }
+
+        std::cout << "---------------------------------------------" << std::endl;
+        std::cout << "               batch: " << bat << " ";
+
+        if (passed)
+            std::cout << "Verification passed ";
+        else
+            std::cout << "Verification failed ";
+
+        std::cout << std::endl;
+        std::cout << "---------------------------------------------" << std::endl;
+
+    }
+    std::cout << "============================================="  << std::endl << std::endl;
+#endif
+
+#ifdef DEBUG_VERBOSE
+    std::cout << std::endl;
+    std::cout << "*********************************************"  << std::endl;
+    std::cout << "**      Debug info after calculations      **"  << std::endl;
+    std::cout << "*********************************************"  << std::endl;
+
+    for (unsigned int bat = 0; bat < gridData.batch; bat++)
+    {
+        int offset = bat * gridData.grid_size_x * gridData.grid_size_y * gridData.grid_size_z;
+
+        std::cout << "---------------------------------------------" << std::endl;
+        std::cout << "               batch: " << bat << std::endl;
+        std::cout << "---------------------------------------------" << std::endl;
+
+        for (unsigned int k = 0; k < gridData.grid_size_z; k++)
+        {
+            for (unsigned int j = 0; j < gridData.grid_size_y; j++)
+            {
+                for (unsigned int i = 0; i < gridData.grid_size_x; i++)
+                {
+                    int index = offset + k * gridData.grid_size_x * gridData.grid_size_y
+                            + j * gridData.grid_size_x + i;
+                    std::cout << "grid_id: (" << i << ", " << j << ", " << k << "), "
+#ifndef FPGA_RUN_ONLY
+                            << "golden_val: " << grid_u1_cpu[index]
+#endif
+                            << "fpga_explicit_val: " << grid_u1_d[index] << std::endl;
+                }
+            }
+        }
+    }
+
+    std::cout << "============================================="  << std::endl << std::endl;
+#endif
+
+    std::cout << std::endl;
+    std::cout << "*********************************************"  << std::endl;
+    std::cout << "**            runtime summery              **"  << std::endl;
+    std::cout << "*********************************************"  << std::endl;
+
+#ifndef FPGA_RUN_ONLY
+
+    std::cout << " * naive stencil runtime  : " << runtime_init + runtime_naive_cpu_stencil<< " us" << std::endl;
+    std::cout << "      |--> grid_init time : " << runtime_init << " us" << std::endl;
+    std::cout << "      |--> calc time      : " << runtime_naive_cpu_stencil << " us" << std::endl;
+#endif
+    std::cout << " * fpga runtime           : " << runtime_init + h_to_d_runtime
+                + kernels_runtime + d_to_h_runtime << " us" << std::endl;
+    std::cout << "      |--> grid_init time : " << runtime_init<< " us" << std::endl;
+    std::cout << "      |--> h_to_d         : " << h_to_d_runtime << " us" << std::endl;
+    std::cout << "      |--> d_to_h         : " << d_to_h_runtime << " us" << std::endl;
+    std::cout << "      |--> kernels_runtime: " << kernels_runtime << " us" << std::endl;
+    std::cout << "============================================="  << std::endl << std::endl;
+
+    free(grid_u1_cpu);
+    free(grid_u2_cpu);
+    free(grid_u1_d);
+    free(grid_u2_d);
+
+    return 0;
+}
diff --git a/FPGA/Xilinx/Batched/heat3D/heat3D_common.h b/FPGA/Xilinx/Batched/heat3D/heat3D_common.h
index 7fab8ef..4a4e6c6 100644
--- a/FPGA/Xilinx/Batched/heat3D/heat3D_common.h
+++ b/FPGA/Xilinx/Batched/heat3D/heat3D_common.h
@@ -6,6 +6,7 @@
 
 #define EPSILON 0.0001
 #define ERROR_TOL 10e-6
+#define NUM_OF_PROCESS_GRID_PER_SLR 8
 
 struct GridParameter
 {
@@ -31,4 +32,4 @@ struct heat3DParameter
     float alpha; //diffusivity
     float delta_t;
     float K;
-};
\ No newline at end of file
+};
diff --git a/FPGA/Xilinx/Batched/heat3D/heat3D_cpu.cpp b/FPGA/Xilinx/Batched/heat3D/heat3D_cpu.cpp
index 56d13db..30dffeb 100644
--- a/FPGA/Xilinx/Batched/heat3D/heat3D_cpu.cpp
+++ b/FPGA/Xilinx/Batched/heat3D/heat3D_cpu.cpp
@@ -4,7 +4,7 @@
 
 #include "heat3D_cpu.h"
 
-int heat3D_explicit(float * current, float *next, GridParameter gridData, std::vector<heat3DParameter> calcParam)
+int heat3D_explicit(float * current, float *next, GridParameter gridData, std::vector<heat3DParameter> & calcParam)
 {
     assert(calcParam.size() == gridData.batch);
 
@@ -116,7 +116,7 @@ void initialize_grid(float* grid, GridParameter gridData)
     }
 }
 
-int copy_grid(float* grid_s, float* grid_d, GridParameter gridData)
+void copy_grid(float* grid_s, float* grid_d, GridParameter gridData)
 {
     for (unsigned int bat = 0; bat < gridData.batch; bat++)
     {
diff --git a/FPGA/Xilinx/Batched/heat3D/heat3D_cpu.h b/FPGA/Xilinx/Batched/heat3D/heat3D_cpu.h
index 1392dcc..ccb35c8 100644
--- a/FPGA/Xilinx/Batched/heat3D/heat3D_cpu.h
+++ b/FPGA/Xilinx/Batched/heat3D/heat3D_cpu.h
@@ -13,8 +13,8 @@
 #include <chrono>
 #include "heat3D_common.h"
 
-int heat3D_explicit(float * current, float *next, GridParameter gridData, std::vector<heat3DParameter> calcParam);
+int heat3D_explicit(float * current, float *next, GridParameter gridData, std::vector<heat3DParameter> & calcParam);
 
 void initialize_grid(float* grid, GridParameter gridData);
 
-int copy_grid(float* grid_s, float* grid_d, GridParameter gridData);
\ No newline at end of file
+void copy_grid(float* grid_s, float* grid_d, GridParameter gridData);
diff --git a/FPGA/Xilinx/Batched/heat3D/mem2stream.cpp b/FPGA/Xilinx/Batched/heat3D/mem2stream.cpp
new file mode 100644
index 0000000..ded4a22
--- /dev/null
+++ b/FPGA/Xilinx/Batched/heat3D/mem2stream.cpp
@@ -0,0 +1,134 @@
+#include <ap_int.h>
+#include <hls_stream.h>
+#include <ap_axi_sdata.h>
+#include <math.h>
+#include <stdio.h>
+#include "heat3D_common.h"
+#include "stencil.h"
+#include "stencil.cpp"
+
+// Streams total_itr 512-bit words from arg0 into rd_buffer, one word per pipelined iteration (II=1).
+// The 512-bit coalesced access is meant to get the most out of the memory bandwidth: the single pipelined
+// loop is expected to map to one memory transfer, which the AXI master module splits into multiple transfers.
+static void read_grid(uint512_dt*  arg0, hls::stream<uint512_dt> &rd_buffer, const unsigned int total_itr)
+{
+	for (int itr = 0; itr < total_itr; itr++){ // NOTE(review): signed itr compared against unsigned total_itr
+		#pragma HLS PIPELINE II=1
+		#pragma HLS loop_tripcount min=min_grid max=max_grid avg=avg_grid
+		rd_buffer << arg0[itr];
+	}
+}
+
+// Data width conversion for the 256-bit compute pipeline: splits every 512-bit word into two 256-bit words.
+// When total_iter_256 is odd, the upper half of the last 512-bit word is padding and is not forwarded.
+static void stream_convert_512_256(hls::stream<uint512_dt> &in, hls::stream<uint256_dt> &out,
+		const unsigned int total_itr_512, const unsigned int total_iter_256)
+{
+	const bool odd_count = total_iter_256 & 0x1;
+	for (unsigned int itr = 0; itr < total_itr_512; itr++){
+		#pragma HLS PIPELINE II=2
+		#pragma HLS loop_tripcount min=min_grid max=max_grid avg=avg_grid
+		uint512_dt tmp = in.read();
+		uint256_dt var_l = tmp.range(255,0);
+		uint256_dt var_h = tmp.range(511,256);
+		out << var_l;
+		// logical NOT is required: ~ on a bool promotes to int and is never zero, so the guard was always true
+		if(!odd_count || itr < total_itr_512 - 1){
+			out << var_h;
+		}
+	}
+}
+
+// Data width conversion for the 512-bit memory write interface: packs two 256-bit words per 512-bit word (odd tail: lower half only).
+static void stream_convert_256_512(hls::stream<uint256_dt> &in, hls::stream<uint512_dt> &out,
+		const unsigned int total_itr_512, const unsigned int total_itr_256)
+{
+	const bool odd_count = total_itr_256 & 0x1;
+	for (unsigned int itr = 0; itr < total_itr_512; itr++){
+		#pragma HLS PIPELINE II=2
+		#pragma HLS loop_tripcount min=min_grid max=max_grid avg=avg_grid
+		uint512_dt tmp = 0; // zero-fill so the unused upper half of a final odd word is deterministic
+		tmp.range(255,0) = in.read();
+		// logical NOT is required: ~ on a bool promotes to int and is never zero, which forced an extra read on an odd count
+		if(!odd_count || itr < total_itr_512 - 1){
+			tmp.range(511,256) = in.read();
+		}
+		out << tmp;
+	}
+}
+
+// Writes total_itr 512-bit words from wr_buffer to arg0, one word per pipelined iteration (II=1).
+// The 512-bit coalesced write is meant to get the most out of the memory bandwidth: the single pipelined
+// loop is expected to map to one memory transfer, which the AXI master module splits into multiple transfers.
+static void write_grid(uint512_dt*  arg0, hls::stream<uint512_dt> &wr_buffer, const unsigned int total_itr)
+{
+	for (int itr = 0; itr < total_itr; itr++){ // NOTE(review): signed itr compared against unsigned total_itr
+		#pragma HLS PIPELINE II=1
+		#pragma HLS loop_tripcount min=min_grid max=max_grid avg=avg_grid
+		arg0[itr] = wr_buffer.read();
+	}
+}
+
+// One streaming pass: reads the grid from arg0, sends it out on `out`, and writes what arrives on `in` to arg1.
+void process_mem2stream(uint512_dt* arg0, uint512_dt* arg1, const int count, const int xdim0, const int ydim0, const int zdim0,
+			const int batch, hls::stream <t_pkt> &in, hls::stream <t_pkt> &out)
+{
+	static hls::stream<uint256_dt> streamArray[2];
+	static hls::stream<uint512_dt> rd_buffer;
+	static hls::stream<uint512_dt> wr_buffer;
+
+	// depth of rd_buffer and wr_buffer set such that burst transfers can be supported.
+	#pragma HLS STREAM variable = streamArray depth = 10
+	#pragma HLS STREAM variable = rd_buffer depth = max_depth_16
+	#pragma HLS STREAM variable = wr_buffer depth = max_depth_16
+	// end_index: xdim0 >> SHIFT_BITS, i.e. 256-bit words per row when xdim0 is the row length in floats
+	int end_index = (xdim0 >> SHIFT_BITS);
+	// word counts for the whole batched grid; the 512-bit count is rounded up
+	unsigned int total_itr_256 = zdim0 * ydim0 * end_index * batch;
+	unsigned int total_itr_512 = (zdim0 * ydim0 * end_index * batch + 1) >> 1;
+
+	#pragma HLS DATAFLOW
+	read_grid(arg0, rd_buffer, total_itr_512);
+	stream_convert_512_256(rd_buffer, streamArray[0], total_itr_512, total_itr_256);
+	fifo256_2axis(streamArray[0], out, total_itr_256);
+	axis2_fifo256(in, streamArray[1], total_itr_256);
+	stream_convert_256_512(streamArray[1], wr_buffer, total_itr_512, total_itr_256);
+	write_grid(arg1, wr_buffer, total_itr_512);
+	// NOTE(review): the count parameter is not used in this function
+}
+extern "C" {
+	// Kernel entry point: ping-pongs the grid between arg0 and arg1 through the in/out streams, two passes per count.
+	void stencil_mem2stream(
+			uint512_dt* arg0,
+			uint512_dt* arg1,
+			const int count,
+			const int xdim0,
+			const int ydim0,
+			const int zdim0,
+			const int batch,
+			hls::stream <t_pkt> &in,
+			hls::stream <t_pkt> &out)
+	{
+			#pragma HLS INTERFACE depth=4096 m_axi port = arg0 offset = slave bundle = gmem0 max_read_burst_length=64 max_write_burst_length=64 \
+									num_read_outstanding=4 num_write_outstanding=4
+			#pragma HLS INTERFACE depth=4096 m_axi port = arg1 offset = slave bundle = gmem1
+			#pragma HLS INTERFACE s_axilite port = arg0 bundle = control
+			#pragma HLS INTERFACE s_axilite port = arg1 bundle = control
+			#pragma HLS INTERFACE s_axilite port = count bundle = control
+			#pragma HLS INTERFACE s_axilite port = xdim0 bundle = control
+			#pragma HLS INTERFACE s_axilite port = ydim0 bundle = control
+			#pragma HLS INTERFACE s_axilite port = zdim0 bundle = control
+			#pragma HLS INTERFACE s_axilite port = batch bundle = control
+			#pragma HLS INTERFACE axis port = in  register
+			#pragma HLS INTERFACE axis port = out register
+			#pragma HLS INTERFACE s_axilite port = return bundle = control
+
+			// each iteration does arg0 -> arg1 and then arg1 -> arg0, so the last pass writes into arg0
+			for (int i = 0; i < count; i++)
+			{
+				process_mem2stream(arg0, arg1, count, xdim0, ydim0, zdim0, batch, in, out);
+				process_mem2stream(arg1, arg0, count, xdim0, ydim0, zdim0, batch, in, out);
+			}
+	}
+}
+
diff --git a/FPGA/Xilinx/Batched/heat3D/stencil.cpp b/FPGA/Xilinx/Batched/heat3D/stencil.cpp
new file mode 100644
index 0000000..0a67a5d
--- /dev/null
+++ b/FPGA/Xilinx/Batched/heat3D/stencil.cpp
@@ -0,0 +1,238 @@
+#include "stencil.h"
+
+#define OPTIMIZED_REDUCTION
+#define DEBUG_VERBOSE
+// Returns x unchanged from a non-inlined function (HLS inline off); presumably used to force a register stage (TODO confirm).
+template <typename T>
+static T register_it(T x)
+{
+#pragma HLS inline off
+	T temp = x;
+	return temp;
+}
+// Reads total_itr AXI-stream packets from `in` and forwards the data field of each packet to `out`.
+static void axis2_fifo256(hls::stream <t_pkt> &in, hls::stream<uint256_dt> &out,  const unsigned int total_itr)
+{
+	for (int itr = 0; itr < total_itr; itr++){
+		#pragma HLS PIPELINE II=1
+		#pragma HLS loop_tripcount min=min_grid max=max_grid avg=avg_grid
+		t_pkt tmp = in.read();
+//		printf("reading 256 bit input from SLR. iter: %d\n", itr);
+		out << tmp.data;
+	}
+}
+// Wraps total_itr 256-bit words from `in` into AXI-stream packets on `out`; only the data field of each packet is set.
+static void fifo256_2axis(hls::stream <uint256_dt> &in, hls::stream<t_pkt> &out, const unsigned int total_itr)
+{
+	for (int itr = 0; itr < total_itr; itr++){
+		#pragma HLS PIPELINE II=1
+		#pragma HLS loop_tripcount min=min_grid max=max_grid avg=avg_grid
+		t_pkt tmp;
+		tmp.data = in.read();
+		out.write(tmp);
+	}
+}
+// Fills a[], b[] and c[] (VEC_FACTOR entries each) from alpha, beta and the cell index init_idx + i; delta_t is not used.
+static void inline get_stencil_coefficent(const float & alpha, const float & beta,
+		const float & delta_t, const int & init_idx, float * a,
+		float * b, float * c)
+{
+	for (int i = 0; i < VEC_FACTOR; i++)
+	{
+#pragma HLS UNROLL
+		const int idx = init_idx + i;
+		const double idx_sq = static_cast<double>(idx) * idx; // squared once by multiplication instead of three std::pow(idx, 2) calls
+		a[i] = 0.5 * (alpha * idx_sq - beta * idx);
+		b[i] = 1 - alpha * idx_sq - beta;
+		c[i] = 0.5 * (alpha * idx_sq + beta * idx);
+	}
+}
+// One pass of the 7-point stencil over the batched grid: consumes 256-bit words from rd_buffer and emits updated words to wr_buffer.
+static void process_grid(hls::stream<uint256_dt> &rd_buffer, hls::stream<uint256_dt> &wr_buffer, struct data_G data_g,
+		const float coefficients[7])
+{
+	float s_1_1_2_arr[VEC_FACTOR];
+	float s_1_2_1_arr[VEC_FACTOR];
+	float s_1_1_1_arr[VEC_FACTOR+2];
+	float s_1_0_1_arr[VEC_FACTOR];
+	float s_1_1_0_arr[VEC_FACTOR];
+
+	float mem_wr[VEC_FACTOR];
+
+#pragma HLS ARRAY_PARTITION variable=s_1_1_2_arr complete dim=1
+#pragma HLS ARRAY_PARTITION variable=s_1_2_1_arr complete dim=1
+#pragma HLS ARRAY_PARTITION variable=s_1_1_1_arr complete dim=1
+#pragma HLS ARRAY_PARTITION variable=s_1_0_1_arr complete dim=1
+#pragma HLS ARRAY_PARTITION variable=s_1_1_0_arr complete dim=1
+#pragma HLS ARRAY_PARTITION variable=mem_wr complete dim=1
+	// window buffers: windowBuff_1/4 are indexed by j_p (wraps at plane_diff), windowBuff_2/3 by j_l (wraps at line_diff)
+	uint256_dt windowBuff_1[max_depth_xy];
+	uint256_dt windowBuff_2[max_depth_8];
+	uint256_dt windowBuff_3[max_depth_8];
+	uint256_dt windowBuff_4[max_depth_xy];
+
+#pragma HLS BIND_STORAGE variable=windowBuff_1 type=ram_t2p impl=uram latency=1
+#pragma HLS BIND_STORAGE variable=windowBuff_2 type=ram_t2p impl=uram latency=1
+#pragma HLS BIND_STORAGE variable=windowBuff_3 type=ram_t2p impl=uram latency=1
+#pragma HLS BIND_STORAGE variable=windowBuff_4 type=ram_t2p impl=uram latency=1
+
+	uint256_dt s_1_1_2, s_1_2_1, s_1_1_1, s_1_1_1b, s_1_1_1f, s_1_0_1, s_1_1_0;
+	uint256_dt update_j;
+
+	unsigned short i = 0, j = 0, k = 0;
+	unsigned short j_p = 0, j_l = 0;
+
+	for(unsigned int itr = 0; itr < data_g.gridsize_pr; itr++)
+	{
+#pragma HLS loop_tripcount min=min_grid max=max_grid avg=avg_grid
+#pragma HLS PIPELINE II=1
+		// advance the (i, j, k) block counters, then shift the sliding window by one 256-bit word
+		spc_temp_blocking_read:
+		{
+			bool cond_x = (i == data_g.xblocks);
+			bool cond_y = (j == data_g.grid_size_y - 1);
+			bool cond_z = (k == data_g.limit_z - 1);
+
+			if (cond_x)
+				i = 0;
+
+			if (cond_y && cond_x)
+				j = 0;
+			else if(cond_x)
+				j++;
+
+			if (cond_x && cond_y && cond_z)
+				k = 1;
+			else if(cond_y && cond_x)
+				k++;
+			// s_1_1_0 is the oldest word of the window; s_1_1_2 (read below) is the newest
+			s_1_1_0 = windowBuff_4[j_p];
+
+			s_1_0_1 = windowBuff_3[j_l];
+			windowBuff_4[j_p] = s_1_0_1;
+
+			s_1_1_1b = s_1_1_1;
+			windowBuff_3[j_l] = s_1_1_1b;
+
+			s_1_1_1 = s_1_1_1f;
+			s_1_1_1f = windowBuff_2[j_l];
+
+			s_1_2_1 = windowBuff_1[j_p];
+			windowBuff_2[j_l] = s_1_2_1;
+			// only the first gridsize_da iterations consume input; the remaining ones drain the window
+			bool cond_read = (itr < data_g.gridsize_da);
+
+			if (cond_read)
+			{
+//				printf("reading data for iteration %d\n", itr);
+				s_1_1_2 = rd_buffer.read();
+			}
+
+			windowBuff_1[j_p] = s_1_1_2;
+
+			bool cond_eo_plane = (j_p == data_g.plane_diff);
+			bool cond_eo_line = (j_l == data_g.line_diff);
+
+			if (cond_eo_plane)
+				j_p = 0;
+			else
+				j_p++;
+
+			if(cond_eo_line)
+				j_l = 0;
+			else
+				j_l++;
+		}
+
+		vec2arr:
+		{
+			for (int id = 0; id < VEC_FACTOR; id++)
+
+			{
+#pragma HLS UNROLL
+
+				data_conv s_1_1_2_u, s_1_2_1_u, s_1_1_1_u, s_1_0_1_u, s_1_1_0_u;
+
+				s_1_1_2_u.i = s_1_1_2.range(DATATYPE_SIZE * (id + 1) - 1, id * DATATYPE_SIZE);
+				s_1_2_1_u.i = s_1_2_1.range(DATATYPE_SIZE * (id + 1) - 1, id * DATATYPE_SIZE);
+				s_1_1_1_u.i = s_1_1_1.range(DATATYPE_SIZE * (id + 1) - 1, id * DATATYPE_SIZE);
+				s_1_0_1_u.i = s_1_0_1.range(DATATYPE_SIZE * (id + 1) - 1, id * DATATYPE_SIZE);
+				s_1_1_0_u.i = s_1_1_0.range(DATATYPE_SIZE * (id + 1) - 1, id * DATATYPE_SIZE);
+
+				s_1_1_2_arr[id]   =  s_1_1_2_u.f;
+				s_1_2_1_arr[id]   =  s_1_2_1_u.f;
+				s_1_1_1_arr[id+1] =  s_1_1_1_u.f;
+				s_1_0_1_arr[id]   =  s_1_0_1_u.f;
+				s_1_1_0_arr[id]   =  s_1_1_0_u.f;
+			}
+			// x-neighbours across the word boundary: last float of the previous word, first float of the next word
+			data_conv tmp1_o1, tmp2_o2;
+			tmp1_o1.i = s_1_1_1b.range(DATATYPE_SIZE * (VEC_FACTOR) - 1, (VEC_FACTOR-1) * DATATYPE_SIZE);
+			tmp2_o2.i = s_1_1_1f.range(DATATYPE_SIZE * (0 + 1) - 1, 0 * DATATYPE_SIZE);
+			s_1_1_1_arr[0] = tmp1_o1.f;
+			s_1_1_1_arr[VEC_FACTOR + 1] = tmp2_o2.f;
+		}
+
+		process:
+		{
+			unsigned short y_index = j + data_g.offset_y;
+
+			for (short q = 0; q < VEC_FACTOR; q++)
+			{
+#pragma HLS UNROLL
+
+				short index = (i << SHIFT_BITS) + q + data_g.offset_x;
+
+				float r1_1_2 =  s_1_1_2_arr[q] * coefficients[0];
+				float r1_2_1 =  s_1_2_1_arr[q] * coefficients[1];
+				float r0_1_1 =  s_1_1_1_arr[q] * coefficients[2];
+				float r1_1_1 =  s_1_1_1_arr[q+1] * coefficients[3];
+				float r2_1_1 =  s_1_1_1_arr[q+2] * coefficients[4];
+				float r1_0_1 =  s_1_0_1_arr[q] * coefficients[5];
+				float r1_1_0 =  s_1_1_0_arr[q] * coefficients[6];
+
+#ifdef OPTIMIZED_REDUCTION
+				float f1 = r1_1_2 + r1_2_1;
+				float f2 = r0_1_1 + r1_1_1;
+				float f3 = r2_1_1 + r1_0_1;
+
+#pragma HLS BIND_OP variable=f1 op=fadd
+#pragma HLS BIND_OP variable=f2 op=fadd
+
+				float r1 = f1 + f2;
+				float r2 = f3 + r1_1_0;
+
+				float result = r1 + r2;
+#else
+				float result = r1_1_2 + r1_2_1 + r0_1_1 + r1_1_1 + r2_1_1 + r1_0_1 + r1_1_0;
+#endif
+
+				bool cond_change = register_it <bool> (index <= data_g.offset_x || index > data_g.sizex
+						|| (k <= 1) || (k >= data_g.limit_z -1) || (y_index <= 0) || (y_index >= data_g.grid_size_y - 1));
+				// halo cells keep their centre value; take it from the float array (s_1_1_1[q+1] on the ap_uint word is a single bit)
+				mem_wr[q] = cond_change ? s_1_1_1_arr[q+1] : result;
+			}
+		}
+
+		array2vec: for (int q = 0; q < VEC_FACTOR; q++)
+		{
+#pragma HLS UNROLL
+			data_conv tmp;
+			tmp.f = mem_wr[q];
+			update_j.range(DATATYPE_SIZE * (q + 1) - 1, q * DATATYPE_SIZE) = tmp.i;
+		}
+
+		write:
+		{
+			bool cond_wr = (k >= 1) && (k < data_g.limit_z);
+
+			if (cond_wr)
+				wr_buffer << update_j;
+		}
+
+		// move cell block
+		i++;
+
+	}
+
+}
diff --git a/FPGA/Xilinx/Batched/heat3D/stencil.h b/FPGA/Xilinx/Batched/heat3D/stencil.h
new file mode 100644
index 0000000..242360c
--- /dev/null
+++ b/FPGA/Xilinx/Batched/heat3D/stencil.h
@@ -0,0 +1,92 @@
+#include <ap_int.h>
+#include <hls_stream.h>
+#include <ap_axi_sdata.h>
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include "heat3D_common.h"
+
+#pragma once
+
+typedef ap_uint<512> uint512_dt; // 512-bit unsigned word
+typedef ap_uint<256> uint256_dt; // 256-bit unsigned word (VEC_FACTOR * DATATYPE_SIZE bits)
+typedef ap_axiu<256,0,0,0> t_pkt; // AXI stream packet with a 256-bit data field; element type of the kernel in/out streams
+typedef ap_axiu<32,0,0,0> t_pkt_32; // AXI stream packet with a 32-bit data field
+
+#define SLR_P_STAGE NUM_OF_PROCESS_GRID_PER_SLR // alias; the right-hand macro is not defined in this header (presumably heat3D_common.h)
+
+//Maximum Tile Size
+#define MAX_SIZE_X 304 // also reused below as the bound for the y extent
+#define MAX_DEPTH_16 (MAX_SIZE_X/16) // integer division: 304/16 = 19
+
+//user function
+#define VEC_FACTOR 8 // elements packed into one vector word
+#define SHIFT_BITS 3 // log2(VEC_FACTOR): (block index << SHIFT_BITS) is the index of the block's first element
+#define DATATYPE_SIZE 32  // bits per element; single precision operations
+
+
+const int max_size_y = MAX_SIZE_X; // NOTE(review): the max_/min_/avg_ constants look like HLS loop_tripcount bounds - confirm at their use sites
+const int min_size_y = 20;
+const int avg_size_y = MAX_SIZE_X;
+
+const int max_block_x = MAX_SIZE_X/VEC_FACTOR + 1; // 304/8 + 1 = 39 vector blocks per row
+const int min_block_x = 20/VEC_FACTOR + 1; // 20/8 + 1 = 3
+const int avg_block_x = MAX_SIZE_X/VEC_FACTOR + 1; // same value as max_block_x
+
+const int max_grid = max_block_x * max_size_y * max_size_y; // 39 * 304 * 304 = 3,604,224 vector blocks
+const int min_grid = min_block_x * min_size_y * min_size_y; // 3 * 20 * 20 = 1,200
+const int avg_grid = avg_block_x * avg_size_y * avg_size_y; // same value as max_grid
+
+const int vec_factor = VEC_FACTOR; // typed copy of the macro
+const int max_depth_16 = MAX_DEPTH_16; // 19
+const int max_depth_8 = MAX_DEPTH_16 *2; // 38
+const int max_depth_xy = max_block_x * max_size_y; // 39 * 304 = 11,856 vector blocks in one x-y plane
+
+// union to reinterpret float as integer and vice versa (same 32 bits seen through either member)
+typedef union  {
+   int i; // raw bit pattern; the stencil code assigns slices of a wide vector word to this member
+   float f; // the same bits read back as a single-precision value
+} data_conv; // NOTE(review): reading the member that was not last written relies on compiler support for union punning - confirm for every toolchain that builds this header
+
+
+// structure to hold grid parameters to avoid recalculation in
+// different processes
+//struct data_G{
+//	unsigned short sizex;
+//	unsigned short sizey;
+//	unsigned short xdim0;
+//	unsigned short end_index;
+//	unsigned short end_row;
+//	unsigned int gridsize;
+//    unsigned int total_itr_512;
+//    unsigned int total_itr_256;
+//	unsigned short outer_loop_limit;
+//	unsigned short endrow_plus2;
+//	unsigned short endrow_plus1;
+//	unsigned short endrow_minus1;
+//	unsigned short endindex_minus1;
+//};
+
+struct data_G{ // precomputed grid parameters handed to the processing stages; only fields whose use is visible in stencil.cpp are annotated
+	unsigned short sizex; // upper x bound in the stencil boundary test (element index > sizex keeps the old value)
+	unsigned short sizey;
+	unsigned short sizez;
+	unsigned short xblocks;
+	unsigned short grid_size_x;
+	unsigned short grid_size_y; // rows with y_index >= grid_size_y - 1 are treated as boundary
+	unsigned short grid_size_z;
+	unsigned short limit_z; // planes k >= limit_z - 1 are boundary; output is written only while 1 <= k < limit_z
+	unsigned short offset_x; // added to the element's x index; index <= offset_x is treated as boundary
+	unsigned short offset_y; // added to the row counter to form y_index
+	unsigned short offset_z;
+	unsigned int plane_size;
+	unsigned int gridsize_pr;
+	unsigned int gridsize_da;
+	unsigned int plane_diff;
+	unsigned int line_diff;
+	unsigned short outer_loop_limit;
+	unsigned int total_itr;
+	bool last_half;
+	unsigned short batches;
+};
+
diff --git a/FPGA/Xilinx/Batched/heat3D/stencil_3_SLR.cfg b/FPGA/Xilinx/Batched/heat3D/stencil_3_SLR.cfg
new file mode 100644
index 0000000..35aa0b8
--- /dev/null
+++ b/FPGA/Xilinx/Batched/heat3D/stencil_3_SLR.cfg
@@ -0,0 +1,22 @@
+[connectivity]
+nk=stencil_SLR:1:stencil_SLR_2
+nk=stencil_SLR:1:stencil_SLR_3
+
+stream_connect=stencil_mem2stream_1.out:stencil_SLR_1.in:128
+stream_connect=stencil_SLR_1.out:stencil_SLR_2.in:128
+stream_connect=stencil_SLR_2.out:stencil_SLR_3.in:128
+stream_connect=stencil_SLR_3.out:stencil_mem2stream_1.in:128
+
+sp=stencil_mem2stream_1.arg0:HBM[0]
+sp=stencil_mem2stream_1.arg1:HBM[1]
+
+slr=stencil_mem2stream_1:SLR0
+slr=stencil_SLR_1:SLR0
+slr=stencil_SLR_2:SLR1
+slr=stencil_SLR_3:SLR2
+
+[profile]
+data=all:all:all
+memory=all
+stall=all:all
+exec=all:all
diff --git a/FPGA/Xilinx/Batched/heat3D/stencil_single_SLR.cfg b/FPGA/Xilinx/Batched/heat3D/stencil_single_SLR.cfg
new file mode 100644
index 0000000..047324d
--- /dev/null
+++ b/FPGA/Xilinx/Batched/heat3D/stencil_single_SLR.cfg
@@ -0,0 +1,9 @@
+[connectivity]
+stream_connect=stencil_mem2stream_1.out:stencil_SLR_1.in
+stream_connect=stencil_SLR_1.out:stencil_mem2stream_1.in
+
+sp=stencil_mem2stream_1.arg0:HBM[0]
+sp=stencil_mem2stream_1.arg1:HBM[1]
+
+slr=stencil_mem2stream_1:SLR1
+slr=stencil_SLR_1:SLR0
diff --git a/FPGA/Xilinx/Batched/heat3D/xcl2.cpp b/FPGA/Xilinx/Batched/heat3D/xcl2.cpp
new file mode 100644
index 0000000..893b79d
--- /dev/null
+++ b/FPGA/Xilinx/Batched/heat3D/xcl2.cpp
@@ -0,0 +1,114 @@
+/**********
+Copyright (c) 2018, Xilinx, Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its contributors
+may be used to endorse or promote products derived from this software
+without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********/
+
+#include "xcl2.hpp"
+#include <limits.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+namespace xcl {
+std::vector<cl::Device> get_devices(const std::string &vendor_name) { // accelerator devices of the platform whose name equals vendor_name; exits the process on any failure
+    size_t i; // declared outside the loop because it is tested afterwards to detect "no match"
+    cl_int err;
+    std::vector<cl::Platform> platforms;
+    OCL_CHECK(err, err = cl::Platform::get(&platforms));
+    cl::Platform platform;
+    for (i = 0; i < platforms.size(); i++) {
+        platform = platforms[i];
+        OCL_CHECK(err,
+                  std::string platformName =
+                      platform.getInfo<CL_PLATFORM_NAME>(&err));
+        if (platformName == vendor_name) { // exact, case-sensitive comparison with CL_PLATFORM_NAME
+            std::cout << "Found Platform" << std::endl;
+            std::cout << "Platform Name: " << platformName.c_str() << std::endl;
+            break; // keep the first matching platform in `platform`
+        }
+    }
+    if (i == platforms.size()) { // loop ran to completion (or there were no platforms): nothing matched
+        std::cout << "Error: Failed to find Xilinx platform" << std::endl; // text says "Xilinx" whatever vendor_name was
+        exit(EXIT_FAILURE);
+    }
+    //Get every ACCELERATOR device of the matched platform; the whole list is returned, no device is selected here
+    std::vector<cl::Device> devices;
+    OCL_CHECK(err,
+              err = platform.getDevices(CL_DEVICE_TYPE_ACCELERATOR, &devices));
+    return devices;
+}
+
+std::vector<cl::Device> get_xil_devices() { return get_devices("Xilinx"); } // convenience wrapper: accelerator devices of the platform named exactly "Xilinx"
+
+
+std::vector<unsigned char> read_binary_file(const std::string &xclbin_file_name) { // returns the full contents of the file as bytes; exits the process if it is not readable
+    std::cout << "INFO: Reading " << xclbin_file_name << std::endl;
+
+    if (access(xclbin_file_name.c_str(), R_OK) != 0) { // access() from <unistd.h>: non-zero means missing or not readable
+        printf("ERROR: %s xclbin not available please build\n",
+               xclbin_file_name.c_str());
+        exit(EXIT_FAILURE);
+    }
+    //Loading XCL Bin into char buffer
+    std::cout << "Loading: '" << xclbin_file_name.c_str() << "'\n";
+    std::ifstream bin_file(xclbin_file_name.c_str(), std::ifstream::binary);
+    bin_file.seekg(0, bin_file.end); // jump to the end so tellg() reports the file size
+    auto nb = bin_file.tellg();
+    bin_file.seekg(0, bin_file.beg); // rewind before reading
+    std::vector<unsigned char> buf;
+    buf.resize(nb); // NOTE(review): the open, tellg() and read() results are not checked after the access() test
+    bin_file.read(reinterpret_cast<char*>(buf.data()), nb);
+    return buf;
+}
+
+bool is_emulation() { // true when the XCL_EMULATION_MODE environment variable is set, whatever its value
+    bool ret = false;
+    char *xcl_mode = getenv("XCL_EMULATION_MODE"); // NULL when the variable is absent
+    if (xcl_mode != NULL) {
+        ret = true;
+    }
+    return ret;
+}
+
+bool is_hw_emulation() { // true only when XCL_EMULATION_MODE is set and equals "hw_emu" exactly
+    bool ret = false;
+    char *xcl_mode = getenv("XCL_EMULATION_MODE"); // NULL when the variable is absent
+    if ((xcl_mode != NULL) && !strcmp(xcl_mode, "hw_emu")) { // strcmp() returns 0 on equality, hence the negation
+        ret = true;
+    }
+    return ret;
+}
+
+bool is_xpr_device(const char *device_name) { // true when device_name contains the substring "xpr"; device_name must be a valid C string
+    const char *output = strstr(device_name, "xpr"); // NULL when the substring does not occur
+
+    if (output == NULL) {
+        return false;
+    } else {
+        return true;
+    }
+}
+}; // namespace xcl
diff --git a/FPGA/Xilinx/Batched/heat3D/xcl2.hpp b/FPGA/Xilinx/Batched/heat3D/xcl2.hpp
new file mode 100644
index 0000000..7dbcb6d
--- /dev/null
+++ b/FPGA/Xilinx/Batched/heat3D/xcl2.hpp
@@ -0,0 +1,105 @@
+/**********
+Copyright (c) 2018, Xilinx, Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation
+and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its contributors
+may be used to endorse or promote products derived from this software
+without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********/
+
+
+#pragma once
+
+#define CL_HPP_CL_1_2_DEFAULT_BUILD // cl2.hpp option: build programs as OpenCL C 1.2 by default; like the macros below it is set before <CL/cl2.hpp> is included
+#define CL_HPP_TARGET_OPENCL_VERSION 120 // cl2.hpp option: target OpenCL version, 120 = 1.2
+#define CL_HPP_MINIMUM_OPENCL_VERSION 120 // cl2.hpp option: minimum supported OpenCL version, 120 = 1.2
+#define CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY 1 // cl2.hpp option: keep the older pointer/size form of cl::Program::Binaries
+#define CL_USE_DEPRECATED_OPENCL_1_2_APIS // OpenCL headers: keep APIs deprecated after 1.2 usable - TODO confirm against the installed headers
+
+//OCL_CHECK runs `call`, then prints file/line/code and exits if `error` != CL_SUCCESS. It expands to bare statements (no enclosing block), so `call` may be a declaration. It doesn't work if call has a templatized function call - presumably when the template argument list contains a comma, which the preprocessor treats as an argument separator.
+#define OCL_CHECK(error,call)                                       \
+    call;                                                           \
+    if (error != CL_SUCCESS) {                                      \
+      printf("%s:%d Error calling " #call ", error code is: %d\n",  \
+              __FILE__,__LINE__, error);                            \
+      exit(EXIT_FAILURE);                                           \
+    }                                       
+
+#include <CL/cl2.hpp>
+#include <iostream>
+#include <fstream>
+#include <CL/cl_ext_xilinx.h>
+// When creating a buffer with user pointer (CL_MEM_USE_HOST_PTR), under the hood
+// User ptr is used if and only if it is properly aligned (page aligned). When not 
+// aligned, runtime has no choice but to create its own host side buffer that backs
+// user ptr. This in turn implies that all operations that move data to and from 
+// device incur an extra memcpy to move data to/from runtime's own host buffer 
+// from/to user pointer. So it is recommended to use this allocator if user wish to
+// Create Buffer/Memory Object with CL_MEM_USE_HOST_PTR to align user buffer to the
+// page boundary. It will ensure that user buffer will be used when user create 
+// Buffer/Mem Object with CL_MEM_USE_HOST_PTR.
+template <typename T>
+struct aligned_allocator // minimal allocator whose storage is aligned to 4096 bytes
+{
+  using value_type = T;
+  T* allocate(std::size_t num) // uninitialised storage for num objects of T; throws std::bad_alloc on failure
+  {
+    void* ptr = nullptr;
+    if (posix_memalign(&ptr,4096,num*sizeof(T))) // a non-zero return value signals failure
+      throw std::bad_alloc();
+    return reinterpret_cast<T*>(ptr);
+  }
+  void deallocate(T* p, std::size_t num) // num is ignored; the storage is released with free()
+  {
+    free(p);
+  }
+};
+
+namespace xcl { // host-side helpers; the free functions are implemented in xcl2.cpp
+  std::vector<cl::Device> get_xil_devices(); // get_devices("Xilinx")
+  std::vector<cl::Device> get_devices(const std::string& vendor_name); // accelerator devices of the platform named vendor_name
+  std::vector<unsigned char> read_binary_file(const std::string &xclbin_file_name); // whole file as bytes
+  bool is_emulation (); // XCL_EMULATION_MODE is set
+  bool is_hw_emulation (); // XCL_EMULATION_MODE equals "hw_emu"
+  bool is_xpr_device (const char *device_name); // device_name contains "xpr"
+    class Stream{ // holder for function pointers to the stream extension entry points
+      public:
+        static decltype(&clCreateStream) createStream; // NOTE(review): these statics are only declared here and xcl2.cpp in this patch does not define them - confirm a definition exists before using init()
+        static decltype(&clReleaseStream) releaseStream;
+        static decltype(&clReadStream) readStream;
+        static decltype(&clWriteStream) writeStream;
+        static decltype(&clPollStreams) pollStreams;
+        static void init(const cl_platform_id& platform) { // looks up each entry point by name on `platform` and stores it; the results are not checked for NULL
+            void *bar = clGetExtensionFunctionAddressForPlatform(platform, "clCreateStream");
+            createStream = (decltype(&clCreateStream))bar;
+            bar = clGetExtensionFunctionAddressForPlatform(platform, "clReleaseStream");
+            releaseStream = (decltype(&clReleaseStream))bar;
+            bar = clGetExtensionFunctionAddressForPlatform(platform, "clReadStream");
+            readStream = (decltype(&clReadStream))bar;
+            bar = clGetExtensionFunctionAddressForPlatform(platform, "clWriteStream");
+            writeStream = (decltype(&clWriteStream))bar;
+            bar = clGetExtensionFunctionAddressForPlatform(platform, "clPollStreams");
+            pollStreams = (decltype(&clPollStreams))bar;
+        }
+    };
+}