tlb.cu
// Compile: nvcc *.cu -o test
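/*
 * tlb.cu - GPU TLB/latency microbenchmark. (Summary inferred from the code
 * below, not part of the original source.) create_pattern () builds a
 * pointer-chase chain in a managed array, global_latency () walks the chain
 * on the device and timestamps every dependent load with clock (), and
 * measure_global () sweeps array sizes and start offsets, printing per-hop
 * latencies so that jumps in latency expose TLB and cache boundaries.
 */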
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>
#include "cuda_runtime.h"
#define OFFSETS 16
#define INVALID -1LL
#define L2_ENTRIES 1025
#define GB (1024LU*1024LU*1024LU)
#define MB (1024LU*1024LU)
#define KB (1024LU)
__global__ void global_latency (unsigned long *, unsigned long, unsigned long, unsigned long);
void measure_global (unsigned int, unsigned int, unsigned int);
void create_pattern (unsigned long *arr, unsigned long stride, unsigned long size, unsigned long start_idx) {
    unsigned long temp = start_idx, i;
    // [Style 1]: Jump by one stride per step
    for (i = start_idx; i < size; i++) {
        temp = (i + stride);
        // Each index points to the next index!
        arr[i] = (temp >= size) ? start_idx : temp;
    }
    // [Style 2]: Jump twice as much as stride
    /*stride = 2 * stride;
    i = 1;
    while (i < L2_ENTRIES) {
        arr[temp] = temp + stride;
        temp = temp + stride;
        i++;
    }
    // New temp will be (start_idx + original stride) (stride was doubled above)
    arr[temp] = start_idx + (stride / 2);
    i = 1;
    temp = start_idx + (stride / 2);
    // Jump twice as much as stride
    while (i < L2_ENTRIES) {
        arr[temp] = temp + stride;
        temp = temp + stride;
        i++;
    }
    // Wrap the chain back to the start!
    arr[temp] = start_idx;*/
    // [Style 3]: To create a stride sequence like (1, 2, 4, 1, ...) x stride
    /*int cycle = 3;
    srand (time (0));
    unsigned long next_idx = start_idx;
    temp = (1UL << (rand () % cycle)) * stride;  // 1x, 2x or 4x the stride
    while (next_idx + temp < size) {
        arr[next_idx] = next_idx + temp;
        next_idx = next_idx + temp;
        temp = (1UL << (rand () % cycle)) * stride;
    }
    // Wrap the chain back to the start!
    arr[next_idx] = start_idx;*/
    // [Style 4]: Create L2_ENTRIES accesses, then one somewhere in between!
    /*for (i = 0; i < L2_ENTRIES; i++) {
        arr[temp] = (temp + stride) % size;
        temp = (temp + stride) % size;
    }
    i = start_idx + (stride / 2);
    arr[temp] = i;
    temp = i + stride;
    arr[i] = temp;
    // Wrap the chain back to the start!
    arr[temp] = start_idx;*/
    // [Style 5]: Visit the stride-aligned elements in random order!
    /*long long probe_points[size];
    int count, idx;
    // Collect all elements that need to be accessed!
    for (i = 0, count = 0; i < size; i++, count++) {
        temp = start_idx + (i + 1) * stride;
        if (temp >= size)
            break;
        probe_points[i] = temp;
    }
    // Now create a random access pattern out of them!
    srand (time (0));
    temp = start_idx;
    while (i > 0) {
        idx = rand () % count;
        if (probe_points[idx] != INVALID) {
            arr[temp] = (unsigned long) probe_points[idx];
            temp = (unsigned long) probe_points[idx];
            probe_points[idx] = INVALID;
            i--;
        }
    }
    // Wrap the chain back to the start!
    arr[temp] = start_idx;*/
}
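/* A minimal worked example of the active Style 1 pattern (not in the original
   source): with start_idx = 0, stride = 4, size = 12 the relevant entries are
       arr[0] = 4, arr[4] = 8, arr[8] = 0,
   so the chase visits 0 -> 4 -> 8 -> 0, touching one element per stride-sized
   region; a stride of at least the page size forces every hop onto a fresh
   TLB entry. */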
int main (int argc, char **argv) {
    if (argc < 4) {
        printf ("Usage: %s from_mb to_mb stride_size_kb\n", argv[0]);
        return 0;
    }
    /* Smallest array size in megabytes, 1st argument. */
    unsigned int from_mb = (unsigned int) atof (argv[1]);
    /* Largest array size in megabytes, 2nd argument. */
    unsigned int to_mb = (unsigned int) atof (argv[2]);
    /* Stride size in kilobytes, 3rd argument. */
    unsigned int stride_size_kb = (unsigned int) atof (argv[3]);
    measure_global (from_mb, to_mb, stride_size_kb);
    return 0;
}
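/* Example invocation (illustrative values, not from the original source):
       ./test 1 32 4
   sweeps array sizes from 1 MB to 32 MB in 4 KB-stride steps and prints one
   latency line per pointer-chase hop for each size. */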
void measure_global (unsigned int from_mb, unsigned int to_mb, unsigned int stride_kb) {
    cudaError_t error_id;
    int e_size = sizeof (unsigned long);
    unsigned long ol_limit = (from_mb * MB) / e_size, l_limit;
    unsigned long ou_limit = (to_mb * MB) / e_size, u_limit;
    unsigned long stride = (stride_kb * KB) / e_size;
    /* Always allocate a fixed 2 GB for now. */
    unsigned long a_size = (2 * GB) / e_size;
    unsigned long l_stride = (4 * MB) / e_size;
    cudaSetDevice (0);
    /* Allocate the chase array in unified (managed) memory. */
    unsigned long *d_a;
    error_id = cudaMallocManaged ((void **) &d_a, e_size * a_size);
    if (error_id != cudaSuccess) {
        printf ("Error 1.0 is %s\n", cudaGetErrorString (error_id));
        return;
    }
    /* Pick a start index; each outer iteration shifts it by one l_stride. */
    unsigned long start_idx = 0, temp, elements, k, j;
    // printf ("Start Addr: %p, Idx: %lu\n", (void *) &d_a[start_idx], start_idx);
    int times = 1;
    for (; times <= OFFSETS; start_idx += l_stride, times++) {
        /* The limits are end indices when indexing starts at 0;
           shift them by start_idx! */
        l_limit = ol_limit + start_idx;
        u_limit = ou_limit + start_idx;
        if (u_limit >= a_size)
            break;
        // printf ("lLimit: %lu, uLimit: %lu\n", l_limit, u_limit);
        for (k = l_limit; k <= u_limit; k += stride) {
            // start_idx = start_idx + (256 * KB / e_size);
            /* Initialize array elements on the CPU with indices into d_a. */
            create_pattern (d_a, stride, k, start_idx);
            d_a[k + 1] = 0;
            d_a[k + 2] = 0;
            cudaDeviceSynchronize ();
            dim3 Db = dim3 (1);
            dim3 Dg = dim3 (1, 1, 1);
            /* Launch the kernel with a single thread. */
            elements = (k - start_idx) / stride;
            global_latency <<<Dg, Db>>>(d_a, k, start_idx, elements);
            cudaDeviceSynchronize ();
            error_id = cudaGetLastError ();
            if (error_id != cudaSuccess) {
                printf ("Error kernel is %s\n", cudaGetErrorString (error_id));
            }
            printf ("===============================================\n");
            printf ("%lu.%02d, %lu KB\n", e_size * (k - start_idx) / KB, times, (stride * e_size) / KB);
            printf ("%lu\n", start_idx);
            /* One line per hop: [element address], [element index], [cycles to load it]. */
            temp = start_idx;
            for (j = 0; j < elements; j++) {
                printf ("%p, %lu, %lu\n", (void *) &d_a[temp], temp, d_a[temp + 1]);
                /* Pointer chase! */
                temp = d_a[temp];
            }
            printf ("===============================================\n\n");
        }
    }
    /* Free memory on the GPU. */
    cudaFree (d_a);
    cudaDeviceReset ();
}
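/* Shape of the sweep above, summarized: OFFSETS start offsets (4 MB apart) x
   array sizes from from_mb to to_mb in stride-sized steps; every (offset, size)
   pair gets a fresh chain, one single-thread kernel launch, and a latency dump. */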
__global__ void global_latency (unsigned long *my_array, unsigned long array_length, unsigned long s_idx, unsigned long elements) {
    unsigned long j = s_idx, old_j, k, start_time, end_time, res = 0, res_1 = 0, iter = 3;
    // First round would warm the TLB:
    // for (k = 0; k < elements; k++) {
    //     j = my_array[j];
    // }
    // Second round would begin the timed chase:
    // j = (j != s_idx) ? s_idx : j;
    for (k = 0; k < elements * iter; k++) {
        old_j = j;
        __threadfence ();
        start_time = clock ();
        j = my_array[j];
        res += j;      /* Data dependency keeps the load inside the timed region. */
        res_1 += res;
        end_time = clock ();
        /* Store the latency next to the element that was just loaded. The element
           sits at a stride boundary, so the slot at offset +1 is free as long as
           the stride stays larger than a cache line. */
        my_array[old_j + 1] = (end_time - start_time);
    }
    /* Sink stores so the compiler cannot elide the chase; j has wrapped back to
       s_idx by this point. */
    my_array[j + 2 /* array_length + 1 */] = res + res_1;
    my_array[j + 3 /* array_length + 2 */] = my_array[j];
}
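/*
 * Note (not in the original source): clock () returns a per-SM 32-bit counter,
 * so very long chases can wrap between samples; CUDA also provides clock64 (),
 * e.g. `long long start_time = clock64 ();`, which avoids wrap-around at the
 * cost of wider stores for the recorded latencies.
 */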