#include <stdio.h>

/********************************************************************
 *
 * VecAdd: element-wise vector addition, C[i] = A[i] + B[i] for 0 <= i < n.
 *
 *   n : number of elements in each vector
 *   A : input vector of n floats, readable from device code
 *   B : input vector of n floats, readable from device code
 *   C : output vector of n floats, writable from device code
 *
 * Launch layout: only the .x components of the grid and block are used.
 * Any 1D grid size is valid: each thread starts at its flat global index
 * and advances by the total number of launched threads until it passes n,
 * so a grid smaller than n still covers every element and a grid larger
 * than n simply leaves the surplus threads idle. When n <= 0 nothing is
 * written.
 *
 ********************************************************************/
__global__ void VecAdd(int n, const float *A, const float *B, float* C) {

    // Total number of threads in the launch. Kept in 64 bits so that
    // "i += stride" cannot overflow when n is close to INT_MAX.
    const long long stride = (long long)gridDim.x * blockDim.x;

    // Flatten (block, thread) into a single global element index.
    long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;

    // The loop condition doubles as the bounds guard for the grid tail.
    for (; i < n; i += stride) {
        C[i] = A[i] + B[i];
    }
}

/*
 * basicVecAdd: host-side wrapper that launches the VecAdd kernel to compute
 * C = A + B over n elements.
 *
 *   A, B : input vectors of n floats
 *   C    : output vector of n floats
 *   n    : number of elements; n <= 0 is a no-op
 *
 * The pointers are handed to the kernel unchanged, so they must be
 * addresses that device code can dereference.
 * NOTE(review): presumably cudaMalloc'd buffers -- confirm against the caller.
 *
 * The launch goes to the default stream and this function does not
 * synchronize; it only checks that the launch itself was accepted and
 * reports a rejected launch on stderr.
 */
void basicVecAdd( float *A,  float *B, float *C, int n)
{
    // Nothing to add. Returning early also avoids the signed/unsigned
    // wrap-around in the block-count computation below, which would
    // otherwise request millions of idle blocks for n == 0.
    if (n <= 0) {
        return;
    }

    // Initialize thread block and kernel grid dimensions

    const unsigned int BLOCK_SIZE = 256;
    // The kernel is launched with two BLOCK_SIZE units (512 threads) per block.
    const unsigned int THREADS_PER_BLOCK = BLOCK_SIZE * 2;

    // Ceiling division: enough blocks to give every element its own thread.
    // n >= 1 here, so (n - 1) is non-negative and converts to unsigned safely.
    const unsigned int numBlocks =
        (static_cast<unsigned int>(n) - 1u) / THREADS_PER_BLOCK + 1u;

    dim3 dim_grid(numBlocks, 1, 1);
    dim3 dim_block(THREADS_PER_BLOCK, 1, 1);

    // Invoke the VecAdd kernel on the GPU.
    VecAdd<<<dim_grid,dim_block>>>(n,A,B,C);

    // Kernel launches return no status; a bad configuration only surfaces
    // through cudaGetLastError().
    const cudaError_t launchStatus = cudaGetLastError();
    if (launchStatus != cudaSuccess) {
        fprintf(stderr, "basicVecAdd: VecAdd launch failed: %s\n",
                cudaGetErrorString(launchStatus));
    }
}