-
Notifications
You must be signed in to change notification settings - Fork 1
/
jacobi_kernel.cu
112 lines (110 loc) · 19.7 KB
/
jacobi_kernel.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#include "jacobi_kernel.hu"
/*
 * Helper for kernel0_1: applies the 5-point stencil to a single column of a
 * contiguous range of rows.
 *
 * A is addressed as A[(p * 4096 + r) * 4096 + c] with p in {0, 1}. For every
 * row r in [row_first, row_last] (ascending) this reads plane (1 - dst_plane)
 * at (r, col), (r, col - 1), (r, col + 1), (r + 1, col), (r - 1, col), adds
 * them in exactly that order, scales by 0.2f and stores the result to plane
 * dst_plane at (r, col). An empty range (row_first > row_last) does nothing.
 *
 * The caller is responsible for keeping r - 1, r + 1, col - 1 and col + 1
 * inside the array.
 */
static __device__ __forceinline__ void kernel0_1_relax_rows(float *A, int dst_plane,
                                                            int row_first, int row_last,
                                                            int col)
{
    const int N = 4096;
    const int src_plane = 1 - dst_plane;

    for (int row = row_first; row <= row_last; row += 1) {
        const int s = (src_plane * N + row) * N + col;
        /* Summation order is part of the result (float rounding): keep it. */
        A[(dst_plane * N + row) * N + col] =
            0.2f * ((((A[s] + A[s - 1]) + A[s + 1]) + A[s + N]) + A[s - N]);
    }
}

/*
 * Stencil kernel over rows 12 * blockIdx.x - 5 .. 12 * blockIdx.x + 1.
 *
 * Parameters:
 *   A  - device buffer addressed as A[(p * 4096 + r) * 4096 + c], p in {0, 1};
 *        updated in place, alternating between the two planes.
 *   h0 - host-supplied step index (its meaning is not visible in this file).
 *        It bounds the g7 loop and gates the phases: the first two phases run
 *        only when h0 >= 1, the last two only when h0 <= 2047.
 *
 * Each g7 step runs up to four phases. Phase k (k = 0..3) works on column
 * x - k, where x = threadIdx.x + 128 * g7, and the phases alternate between
 * writing plane 1 and plane 0. Blocks with blockIdx.x == 0 only touch row 1.
 *
 * Every condition that guards a __syncthreads() depends only on h0, g7 and
 * blockIdx.x, so all threads of a block reach the same barriers.
 *
 * Launch layout: only blockIdx.x and threadIdx.x are read; threadIdx.y is
 * ignored. The 128 * g7 column stride suggests blockDim.x == 128 --
 * TODO confirm against the host launch code.
 */
__global__ void kernel0_1(float *A, int h0)
{
    const int b0 = blockIdx.x;
    const int t1 = threadIdx.x;
    const int row_base = 12 * b0;

    /* Rows below row_base exist only for b0 >= 1; block 0 starts at row 1. */
    const int wide_first = (b0 >= 1) ? row_base - 5 : row_base + 1;

    /* Last g7 step: the smaller of 32 and 96 - (h0 + 32) / 32. */
    int g7_last = -((h0 + 32) / 32) + 96;
    if (g7_last > 32)
        g7_last = 32;

    for (int g7 = 0; g7 <= g7_last; g7 += 1) {
        const int x = t1 + 128 * g7;

        if (h0 >= 1 && g7 <= 31) {
            if (b0 >= 1) {
                /* Phase 0: plane 0 -> plane 1, column x, rows -4 .. 0. */
                if (x >= 1 && x <= 4094)
                    kernel0_1_relax_rows(A, 1, row_base - 4, row_base, x);
                __syncthreads();
            }

            /* Phase 1: plane 1 -> plane 0, column x - 1, rows -5 .. +1. */
            if (x >= 2)
                kernel0_1_relax_rows(A, 0, wide_first, row_base + 1, x - 1);
            __syncthreads();
        }

        if (h0 <= 2047) {
            /* Phase 2: plane 0 -> plane 1, column x - 2, rows -5 .. +1. */
            if (x >= 3 && x <= 4096)
                kernel0_1_relax_rows(A, 1, wide_first, row_base + 1, x - 2);
            __syncthreads();

            if (b0 >= 1) {
                /* Phase 3: plane 1 -> plane 0, column x - 3, rows -4 .. 0. */
                if (x >= 4 && x <= 4097)
                    kernel0_1_relax_rows(A, 0, row_base - 4, row_base, x - 3);
                __syncthreads();
            }
        }
    }
}
/*
 * Helper for kernel1_1: applies the 5-point stencil to a single column of a
 * contiguous range of rows.
 *
 * A is addressed as A[(p * 4096 + r) * 4096 + c] with p in {0, 1}. For every
 * row r in [row_first, row_last] (ascending) this reads plane (1 - dst_plane)
 * at (r, col), (r, col - 1), (r, col + 1), (r + 1, col), (r - 1, col), adds
 * them in exactly that order, scales by 0.2f and stores the result to plane
 * dst_plane at (r, col). An empty range (row_first > row_last) does nothing.
 *
 * The caller is responsible for keeping r - 1, r + 1, col - 1 and col + 1
 * inside the array.
 */
static __device__ __forceinline__ void kernel1_1_relax_rows(float *A, int dst_plane,
                                                            int row_first, int row_last,
                                                            int col)
{
    const int N = 4096;
    const int src_plane = 1 - dst_plane;

    for (int row = row_first; row <= row_last; row += 1) {
        const int s = (src_plane * N + row) * N + col;
        /* Summation order is part of the result (float rounding): keep it. */
        A[(dst_plane * N + row) * N + col] =
            0.2f * ((((A[s] + A[s - 1]) + A[s + 1]) + A[s + N]) + A[s - N]);
    }
}

/*
 * Stencil kernel over rows 12 * blockIdx.x + 1 .. 12 * blockIdx.x + 7.
 *
 * Parameters:
 *   A  - device buffer addressed as A[(p * 4096 + r) * 4096 + c], p in {0, 1};
 *        updated in place, alternating between the two planes.
 *   h0 - not read by this kernel; kept so the signature stays unchanged.
 *
 * Each of the 33 g7 steps runs up to four phases. Phase k (k = 0..3) works on
 * column x - k, where x = threadIdx.x + 128 * g7, and the phases alternate
 * between writing plane 1 and plane 0. The first two phases are skipped on
 * the final step (g7 == 32). Blocks with blockIdx.x > 340 stop at row
 * 12 * blockIdx.x + 2 in every phase.
 *
 * Every __syncthreads() is guarded only by g7, so all threads of a block
 * reach the same barriers.
 *
 * Launch layout: only blockIdx.x and threadIdx.x are read; threadIdx.y is
 * ignored. The 128 * g7 column stride suggests blockDim.x == 128 --
 * TODO confirm against the host launch code.
 */
__global__ void kernel1_1(float *A, int h0)
{
    const int b0 = blockIdx.x;
    const int t1 = threadIdx.x;
    const int row_base = 12 * b0;

    /* Blocks past 340 stop at row +2 in every phase. */
    const bool full_tile = (b0 <= 340);
    const int narrow_last = full_tile ? row_base + 6 : row_base + 2;
    const int wide_last = full_tile ? row_base + 7 : row_base + 2;

    for (int g7 = 0; g7 <= 32; g7 += 1) {
        const int x = t1 + 128 * g7;

        if (g7 <= 31) {
            /* Phase 0: plane 0 -> plane 1, column x, rows +2 .. +6. */
            if (x >= 1 && x <= 4094)
                kernel1_1_relax_rows(A, 1, row_base + 2, narrow_last, x);
            __syncthreads();

            /* Phase 1: plane 1 -> plane 0, column x - 1, rows +1 .. +7. */
            if (x >= 2)
                kernel1_1_relax_rows(A, 0, row_base + 1, wide_last, x - 1);
            __syncthreads();
        }

        /* Phase 2: plane 0 -> plane 1, column x - 2, rows +1 .. +7. */
        if (x >= 3 && x <= 4096)
            kernel1_1_relax_rows(A, 1, row_base + 1, wide_last, x - 2);
        __syncthreads();

        /* Phase 3: plane 1 -> plane 0, column x - 3, rows +2 .. +6. */
        if (x >= 4 && x <= 4097)
            kernel1_1_relax_rows(A, 0, row_base + 2, narrow_last, x - 3);
        __syncthreads();
    }
}