Skip to content

Commit

Permalink
Nontemporal loads and stores
Browse files Browse the repository at this point in the history
Maybe helps CLEAN=1 keep trig data in cache on Radeon 7, timings
may be lower but within the marginn of error.
  • Loading branch information
gwoltman authored and preda committed Dec 19, 2024
1 parent 65f374d commit cf5ef23
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 18 deletions.
4 changes: 4 additions & 0 deletions src/cl/base.cl
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,10 @@ typedef double2 T2;
#define P(x) global x * restrict
#define CP(x) const P(x)

// Macros for non-temporal load and store (in case we later want to provide a -use option to turn this off)
#define NTLOAD(mem) __builtin_nontemporal_load(&(mem))
#define NTSTORE(mem,val) __builtin_nontemporal_store(val, &(mem))

// For reasons unknown, loading trig values into nVidia's constant cache has terrible performance
#if AMDGPU
typedef constant const T2* Trig;
Expand Down
32 changes: 16 additions & 16 deletions src/cl/middle.cl
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,10 @@ void writeCarryFusedLine(T2 *u, P(T2) out, u32 line) {
#if PADDING
u32 BIG_PAD_SIZE = (PAD_SIZE/2+1)*PAD_SIZE;
out += line * WIDTH + line * PAD_SIZE + line / SMALL_HEIGHT * BIG_PAD_SIZE + (u32) get_local_id(0); // One pad every line + a big pad every SMALL_HEIGHT lines
for (u32 i = 0; i < NW; ++i) { out[i * G_W] = u[i]; }
for (u32 i = 0; i < NW; ++i) { NTSTORE(out[i * G_W], u[i]); }
#else
out += line * WIDTH + (u32) get_local_id(0);
for (u32 i = 0; i < NW; ++i) { out[i * G_W] = u[i]; }
for (u32 i = 0; i < NW; ++i) { NTSTORE(out[i * G_W], u[i]); }
#endif
}

Expand All @@ -65,10 +65,10 @@ void readMiddleInLine(T2 *u, CP(T2) in, u32 y, u32 x) {
// Rather than having u[i] also increment by one, we choose a larger pad increment
u32 BIG_PAD_SIZE = (PAD_SIZE/2+1)*PAD_SIZE;
in += y * WIDTH + y * PAD_SIZE + (y / SMALL_HEIGHT) * BIG_PAD_SIZE + x;
for (i32 i = 0; i < MIDDLE; ++i) { u[i] = in[i * (SMALL_HEIGHT * (WIDTH + PAD_SIZE) + BIG_PAD_SIZE)]; }
for (i32 i = 0; i < MIDDLE; ++i) { u[i] = NTLOAD(in[i * (SMALL_HEIGHT * (WIDTH + PAD_SIZE) + BIG_PAD_SIZE)]); }
#else
in += y * WIDTH + x;
for (i32 i = 0; i < MIDDLE; ++i) { u[i] = in[i * SMALL_HEIGHT * WIDTH]; }
for (i32 i = 0; i < MIDDLE; ++i) { u[i] = NTLOAD(in[i * SMALL_HEIGHT * WIDTH]); }
#endif
}

Expand Down Expand Up @@ -102,7 +102,7 @@ void writeMiddleInLine (P(T2) out, T2 *u, u32 chunk_y, u32 chunk_x)
// = SMALL_HEIGHT / (IN_WG / IN_SIZEX) * (MIDDLE * IN_WG + PAD_SIZE)
// = SMALL_HEIGHT * MIDDLE * IN_SIZEX + SMALL_HEIGHT / SIZEY * PAD_SIZE
// Write each u[i] sequentially
for (int i = 0; i < MIDDLE; ++i) { out[i * IN_WG] = u[i]; }
for (int i = 0; i < MIDDLE; ++i) { NTSTORE(out[i * IN_WG], u[i]); }

#else

Expand All @@ -112,7 +112,7 @@ void writeMiddleInLine (P(T2) out, T2 *u, u32 chunk_y, u32 chunk_x)
// = MIDDLE * SMALL_HEIGHT / (IN_WG / IN_SIZEX) * IN_WG
// = MIDDLE * SMALL_HEIGHT * IN_SIZEX
// Write each u[i] sequentially
for (int i = 0; i < MIDDLE; ++i) { out[i * IN_WG] = u[i]; }
for (int i = 0; i < MIDDLE; ++i) { NTSTORE(out[i * IN_WG], u[i]); }

#endif
}
Expand Down Expand Up @@ -141,7 +141,7 @@ void readTailFusedLine(CP(T2) in, T2 *u, u32 line, u32 me) {
for (i32 i = 0; i < NH; ++i) {
u32 fftMiddleIn_y = i * G_H + me; // The fftMiddleIn y value
u32 chunk_y = fftMiddleIn_y / SIZEY; // The fftMiddleIn chunk_y value
u[i] = in[chunk_y * (MIDDLE * IN_WG + PAD_SIZE)]; // Adjust in pointer the same way writeMiddleInLine did
u[i] = NTLOAD(in[chunk_y * (MIDDLE * IN_WG + PAD_SIZE)]); // Adjust in pointer the same way writeMiddleInLine did
}

#else // Read data that was not rotated or padded
Expand All @@ -162,7 +162,7 @@ void readTailFusedLine(CP(T2) in, T2 *u, u32 line, u32 me) {
for (i32 i = 0; i < NH; ++i) {
u32 fftMiddleIn_y = i * G_H + me; // The fftMiddleIn y value
u32 chunk_y = fftMiddleIn_y / SIZEY; // The fftMiddleIn chunk_y value
u[i] = in[chunk_y * (MIDDLE * IN_WG)]; // Adjust in pointer the same way writeMiddleInLine did
u[i] = NTLOAD(in[chunk_y * (MIDDLE * IN_WG)]); // Adjust in pointer the same way writeMiddleInLine did
}

#endif
Expand Down Expand Up @@ -192,10 +192,10 @@ void writeTailFusedLine(T2 *u, P(T2) out, u32 line, u32 me) {
#else
out += line * (SMALL_HEIGHT + PAD_SIZE) + me; // Pad every output line
#endif
for (u32 i = 0; i < NH; ++i) { out[i * G_H] = u[i]; }
for (u32 i = 0; i < NH; ++i) { NTSTORE(out[i * G_H], u[i]); }
#else // No padding, might be better on nVidia cards
out += line * SMALL_HEIGHT + me;
for (u32 i = 0; i < NH; ++i) { out[i * G_H] = u[i]; }
for (u32 i = 0; i < NH; ++i) { NTSTORE(out[i * G_H], u[i]); }
#endif
}

Expand All @@ -209,10 +209,10 @@ void readMiddleOutLine(T2 *u, CP(T2) in, u32 y, u32 x) {
#else
in += y * MIDDLE * (SMALL_HEIGHT + PAD_SIZE) + x;
#endif
for (i32 i = 0; i < MIDDLE; ++i) { u[i] = in[i * (SMALL_HEIGHT + PAD_SIZE)]; }
for (i32 i = 0; i < MIDDLE; ++i) { u[i] = NTLOAD(in[i * (SMALL_HEIGHT + PAD_SIZE)]); }
#else // No rotation, might be better on nVidia cards
in += y * MIDDLE * SMALL_HEIGHT + x;
for (i32 i = 0; i < MIDDLE; ++i) { u[i] = in[i * SMALL_HEIGHT]; }
for (i32 i = 0; i < MIDDLE; ++i) { u[i] = NTLOAD(in[i * SMALL_HEIGHT]); }
#endif
}

Expand Down Expand Up @@ -279,7 +279,7 @@ void writeMiddleOutLine (P(T2) out, T2 *u, u32 chunk_y, u32 chunk_x)
// = WIDTH / (OUT_WG / OUT_SIZEX) * (MIDDLE * OUT_WG + PAD_SIZE)
// = WIDTH * MIDDLE * OUT_SIZEX + WIDTH / SIZEY * PAD_SIZE
// Write each u[i] sequentially
for (int i = 0; i < MIDDLE; ++i) { out[i * OUT_WG] = u[i]; }
for (int i = 0; i < MIDDLE; ++i) { NTSTORE(out[i * OUT_WG], u[i]); }

#else

Expand All @@ -289,7 +289,7 @@ void writeMiddleOutLine (P(T2) out, T2 *u, u32 chunk_y, u32 chunk_x)
// = MIDDLE * WIDTH / (OUT_WG / OUT_SIZEX) * OUT_WG
// = MIDDLE * WIDTH * OUT_SIZEX
// Write each u[i] sequentially
for (int i = 0; i < MIDDLE; ++i) { out[i * OUT_WG] = u[i]; }
for (int i = 0; i < MIDDLE; ++i) { NTSTORE(out[i * OUT_WG], u[i]); }

#endif
}
Expand Down Expand Up @@ -318,7 +318,7 @@ void readCarryFusedLine(CP(T2) in, T2 *u, u32 line) {
for (i32 i = 0; i < NW; ++i) {
u32 fftMiddleOut_y = i * G_W + me; // The fftMiddleOut y value
u32 chunk_y = fftMiddleOut_y / SIZEY; // The fftMiddleOut chunk_y value
u[i] = in[chunk_y * (MIDDLE * OUT_WG + PAD_SIZE)]; // Adjust in pointer the same way writeMiddleOutLine did
u[i] = NTLOAD(in[chunk_y * (MIDDLE * OUT_WG + PAD_SIZE)]); // Adjust in pointer the same way writeMiddleOutLine did
}

#else // Read data that was not rotated or padded
Expand All @@ -339,7 +339,7 @@ void readCarryFusedLine(CP(T2) in, T2 *u, u32 line) {
for (i32 i = 0; i < NW; ++i) {
u32 fftMiddleOut_y = i * G_W + me; // The fftMiddleOut y value
u32 chunk_y = fftMiddleOut_y / SIZEY; // The fftMiddleOut chunk_y value
u[i] = in[chunk_y * MIDDLE * OUT_WG]; // Adjust in pointer the same way writeMiddleOutLine did
u[i] = NTLOAD(in[chunk_y * MIDDLE * OUT_WG]); // Adjust in pointer the same way writeMiddleOutLine did
}

#endif
Expand Down
4 changes: 2 additions & 2 deletions src/cl/tailsquare.cl
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ KERNEL(G_H) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig) {
// The trig values used here are pre-computed and stored after the fft_HEIGHT trig values.
u32 height_trigs = SMALL_HEIGHT/NH*(NH-1);
// Read pre-computed trig values
T2 trig = smallTrig[height_trigs + line1*G_H + me];
T2 trig = NTLOAD(smallTrig[height_trigs + line1*G_H + me]);
#endif

#if SINGLE_KERNEL
Expand Down Expand Up @@ -259,7 +259,7 @@ KERNEL(G_H * 2) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig) {
// The trig values used here are pre-computed and stored after the fft_HEIGHT trig values.
u32 height_trigs = SMALL_HEIGHT/NH*(NH-1);
// Read pre-computed trig values
T2 trig = smallTrig[height_trigs + line_u*G_H*2 + me];
T2 trig = NTLOAD(smallTrig[height_trigs + line_u*G_H*2 + me]);
#endif

bar(G_H);
Expand Down

0 comments on commit cf5ef23

Please sign in to comment.