-
Notifications
You must be signed in to change notification settings - Fork 6
/
kernel1.cu
103 lines (89 loc) · 3.16 KB
/
kernel1.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#include <stdio.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include "common.h"
// Divisor applied to m * n in msplitm to size the per-tile element budget (MAX).
const int num_submatrix = 8;
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool msplitmCudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "msplitm: %s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Tiled matrix multiply driven from the host.
 *
 * B is cut into tiles of `subCols` columns and A into tiles of `subRows`
 * rows; a trailing partial tile is zero-padded to full tile size. For every
 * (B tile, A tile) pair the two tiles are uploaded, multiplied with
 * doMultiply2Matrices, downloaded, and passed to copyElements together with
 * C, the tile indices and the number of valid rows/columns in a partial tile
 * (0 for a full tile).
 *
 * Parameters:
 *   m, n, k      - dimensions; A is indexed as A[row * k + col] (m rows),
 *                  B as B[row * n + col] (k rows).
 *   alpha        - forwarded to doMultiply2Matrices.
 *   beta, C      - forwarded to copyElements.
 *   transa, transb, lda, ldb, ldc - accepted for interface compatibility but
 *                  not consulted by this function.
 *
 * Tile sizing: the per-tile budget is m * n / num_submatrix elements, rounded
 * down to a multiple of k. If that rounds to zero the budget falls back to k
 * (one row of A / one column of B), and a matrix smaller than the budget is
 * processed as a single tile.
 *
 * Errors: if any dimension is zero, or a host/device allocation or copy
 * fails, a message is written to stderr and the function returns after
 * releasing its buffers. Tiles already handed to copyElements are not
 * rolled back.
 */
void msplitm(char transa, char transb, unsigned long long m, unsigned long long n, unsigned long long k, float alpha, const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc)
{
    printf("entering msplitm \n");

    // Unused by this implementation; kept in the signature for callers.
    (void)transa;
    (void)transb;
    (void)lda;
    (void)ldb;
    (void)ldc;

    // Every size below is derived by dividing by k or by a k-based budget.
    if (m == 0 || n == 0 || k == 0) {
        fprintf(stderr, "msplitm: m, n and k must all be non-zero\n");
        return;
    }

    const unsigned long long A_sz = m * k;
    const unsigned long long B_sz = n * k;

    // Element budget per tile, aligned to whole rows/columns of length k.
    unsigned long long MAX = m * n / num_submatrix;
    MAX -= MAX % k;
    if (MAX == 0) {
        MAX = k;  // budget smaller than one row: use the smallest possible tile
    }
    printf("MAX: %llu\n", MAX);
    printf("B_sz: %llu\n", B_sz);

    unsigned long long numSubMatrixB = B_sz / MAX;
    if (numSubMatrixB == 0) {
        numSubMatrixB = 1;  // B fits inside the budget: a single tile
    }
    printf("SubmatriciesB: %llu\n", numSubMatrixB);
    const unsigned long long SMB_sz = B_sz / numSubMatrixB;
    printf("SMB_sz: %llu\n", SMB_sz);
    const unsigned long long subCols = B_sz / (numSubMatrixB * k);
    printf("subCols: %llu\n", subCols);

    unsigned long long numSubMatrixA = A_sz / MAX;
    if (numSubMatrixA == 0) {
        numSubMatrixA = 1;  // A fits inside the budget: a single tile
    }
    const unsigned long long SMA_sz = A_sz / numSubMatrixA;
    const unsigned long long subRows = A_sz / (numSubMatrixA * k);
    printf("subrows: %llu\n", subRows);
    printf("SMA_sz: %llu\n", SMA_sz);
    printf("submatriciesA: %llu\n", numSubMatrixA);

    // Rows/columns left over after the full tiles; they form one extra, padded tile.
    const unsigned long long overflowA = m % subRows;
    const unsigned long long overflowB = n % subCols;
    printf("overflowB: %llu\n", overflowB);
    printf("overflowA: %llu\n", overflowA);

    // Tile sizes are loop-invariant, so all staging buffers are allocated once.
    const size_t bTileBytes = sizeof(float) * subCols * k;
    const size_t aTileBytes = sizeof(float) * subRows * k;
    const size_t cTileBytes = sizeof(float) * subRows * subCols;

    float *hostB = (float*) malloc(bTileBytes);
    float *hostA = (float*) malloc(aTileBytes);
    // The result tile is subRows x subCols, which can exceed the subRows x k
    // A tile, so it needs its own buffer.
    float *hostC = (float*) malloc(cTileBytes);
    float *b = 0;
    float *a = 0;
    float *c = 0;

    bool ok = (hostB != 0) && (hostA != 0) && (hostC != 0);
    if (!ok) {
        fprintf(stderr, "msplitm: host allocation failed\n");
    }
    ok = ok && msplitmCudaOk(cudaMalloc((void**) &b, bTileBytes), "cudaMalloc(b)");
    ok = ok && msplitmCudaOk(cudaMalloc((void**) &a, aTileBytes), "cudaMalloc(a)");
    ok = ok && msplitmCudaOk(cudaMalloc((void**) &c, cTileBytes), "cudaMalloc(c)");

    for (unsigned long long i = 0; ok && i <= numSubMatrixB; ++i) {
        // The extra iteration exists only for a partial trailing column tile.
        if (i == numSubMatrixB && overflowB == 0) {
            break;
        }

        // Gather column tile i of B, zero-padding columns past n.
        const unsigned long long colBase = i * subCols;
        for (unsigned long long j = 0; j < k; ++j) {
            for (unsigned long long x = 0; x < subCols; ++x) {
                const unsigned long long col = colBase + x;
                hostB[j * subCols + x] = (col < n) ? B[j * n + col] : 0.0f;
            }
        }
        ok = msplitmCudaOk(cudaMemcpy(b, hostB, bTileBytes, cudaMemcpyHostToDevice), "cudaMemcpy(b)");
        if (!ok) {
            break;
        }

        for (unsigned long long y = 0; y <= numSubMatrixA; ++y) {
            // The extra iteration exists only for a partial trailing row tile.
            if (y == numSubMatrixA && overflowA == 0) {
                break;
            }

            // Gather row tile y of A, zero-padding rows past m.
            const unsigned long long rowBase = y * subRows;
            for (unsigned long long j = 0; j < subRows; ++j) {
                const unsigned long long row = rowBase + j;
                for (unsigned long long x = 0; x < k; ++x) {
                    hostA[j * k + x] = (row < m) ? A[row * k + x] : 0.0f;
                }
            }
            ok = msplitmCudaOk(cudaMemcpy(a, hostA, aTileBytes, cudaMemcpyHostToDevice), "cudaMemcpy(a)");
            if (!ok) {
                break;
            }

            // NOTE(review): assumes doMultiply2Matrices fully overwrites c - confirm.
            doMultiply2Matrices(subRows, k, a, k, subCols, b, c, alpha);

            // Blocking copy; also surfaces any failure from the multiply above.
            ok = msplitmCudaOk(cudaMemcpy(hostC, c, cTileBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(c)");
            if (!ok) {
                break;
            }

            // Partial tiles pass their valid row/column count; full tiles pass 0.
            const unsigned long long rowOverflow = (y == numSubMatrixA) ? overflowA : 0;
            const unsigned long long colOverflow = (i == numSubMatrixB) ? overflowB : 0;
            copyElements(C, hostC, subRows, subCols, m, n, y, i, rowOverflow, colOverflow, beta);
        }
    }

    // cudaFree and free both accept null pointers, so partial setups clean up safely.
    cudaFree(a);
    cudaFree(b);
    cudaFree(c);
    free(hostA);
    free(hostB);
    free(hostC);
}