From cf5ef236a7c1e845cc7f9ae0f5e1dff68804928b Mon Sep 17 00:00:00 2001
From: George Woltman <woltman@alum.mit.edu>
Date: Thu, 19 Dec 2024 01:53:46 +0000
Subject: [PATCH] Nontemporal loads and stores Maybe helps CLEAN=1 keep trig
 data in cache on Radeon 7, timings may be lower but within the marginn of
 error.

---
 src/cl/base.cl       |  4 ++++
 src/cl/middle.cl     | 32 ++++++++++++++++----------------
 src/cl/tailsquare.cl |  4 ++--
 3 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/src/cl/base.cl b/src/cl/base.cl
index 92db740..ec97095 100644
--- a/src/cl/base.cl
+++ b/src/cl/base.cl
@@ -126,6 +126,10 @@ typedef double2 T2;
 #define P(x) global x * restrict
 #define CP(x) const P(x)
 
+// Macros for non-temporal load and store (in case we later want to provide a -use option to turn this off)
+#define NTLOAD(mem)        __builtin_nontemporal_load(&(mem))
+#define NTSTORE(mem,val)   __builtin_nontemporal_store(val, &(mem))
+
 // For reasons unknown, loading trig values into nVidia's constant cache has terrible performance
 #if AMDGPU
 typedef constant const T2* Trig;
diff --git a/src/cl/middle.cl b/src/cl/middle.cl
index 97d5d03..a76e32e 100644
--- a/src/cl/middle.cl
+++ b/src/cl/middle.cl
@@ -52,10 +52,10 @@ void writeCarryFusedLine(T2 *u, P(T2) out, u32 line) {
 #if PADDING
   u32 BIG_PAD_SIZE = (PAD_SIZE/2+1)*PAD_SIZE;
   out += line * WIDTH + line * PAD_SIZE + line / SMALL_HEIGHT * BIG_PAD_SIZE + (u32) get_local_id(0); // One pad every line + a big pad every SMALL_HEIGHT lines
-  for (u32 i = 0; i < NW; ++i) { out[i * G_W] = u[i]; }
+  for (u32 i = 0; i < NW; ++i) { NTSTORE(out[i * G_W], u[i]); }
 #else
   out += line * WIDTH + (u32) get_local_id(0);
-  for (u32 i = 0; i < NW; ++i) { out[i * G_W] = u[i]; }
+  for (u32 i = 0; i < NW; ++i) { NTSTORE(out[i * G_W], u[i]); }
 #endif
 }
 
@@ -65,10 +65,10 @@ void readMiddleInLine(T2 *u, CP(T2) in, u32 y, u32 x) {
   // Rather than having u[i] also increment by one, we choose a larger pad increment
   u32 BIG_PAD_SIZE = (PAD_SIZE/2+1)*PAD_SIZE;
   in += y * WIDTH + y * PAD_SIZE + (y / SMALL_HEIGHT) * BIG_PAD_SIZE + x;
-  for (i32 i = 0; i < MIDDLE; ++i) { u[i] = in[i * (SMALL_HEIGHT * (WIDTH + PAD_SIZE) + BIG_PAD_SIZE)]; }
+  for (i32 i = 0; i < MIDDLE; ++i) { u[i] = NTLOAD(in[i * (SMALL_HEIGHT * (WIDTH + PAD_SIZE) + BIG_PAD_SIZE)]); }
 #else
   in += y * WIDTH + x;
-  for (i32 i = 0; i < MIDDLE; ++i) { u[i] = in[i * SMALL_HEIGHT * WIDTH]; }
+  for (i32 i = 0; i < MIDDLE; ++i) { u[i] = NTLOAD(in[i * SMALL_HEIGHT * WIDTH]); }
 #endif
 }
 
@@ -102,7 +102,7 @@ void writeMiddleInLine (P(T2) out, T2 *u, u32 chunk_y, u32 chunk_x)
                                                         //              = SMALL_HEIGHT / (IN_WG / IN_SIZEX) * (MIDDLE * IN_WG + PAD_SIZE)
                                                         //              = SMALL_HEIGHT * MIDDLE * IN_SIZEX + SMALL_HEIGHT / SIZEY * PAD_SIZE
   // Write each u[i] sequentially
-  for (int i = 0; i < MIDDLE; ++i) { out[i * IN_WG] = u[i]; }
+  for (int i = 0; i < MIDDLE; ++i) { NTSTORE(out[i * IN_WG], u[i]); }
 
 #else
 
@@ -112,7 +112,7 @@ void writeMiddleInLine (P(T2) out, T2 *u, u32 chunk_y, u32 chunk_x)
                                                         //                       = MIDDLE * SMALL_HEIGHT / (IN_WG / IN_SIZEX) * IN_WG
                                                         //                       = MIDDLE * SMALL_HEIGHT * IN_SIZEX
   // Write each u[i] sequentially
-  for (int i = 0; i < MIDDLE; ++i) { out[i * IN_WG] = u[i]; }
+  for (int i = 0; i < MIDDLE; ++i) { NTSTORE(out[i * IN_WG], u[i]); }
 
 #endif
 }
@@ -141,7 +141,7 @@ void readTailFusedLine(CP(T2) in, T2 *u, u32 line, u32 me) {
   for (i32 i = 0; i < NH; ++i) {
     u32 fftMiddleIn_y = i * G_H + me;                           // The fftMiddleIn y value
     u32 chunk_y = fftMiddleIn_y / SIZEY;                        // The fftMiddleIn chunk_y value
-    u[i] = in[chunk_y * (MIDDLE * IN_WG + PAD_SIZE)];           // Adjust in pointer the same way writeMiddleInLine did
+    u[i] = NTLOAD(in[chunk_y * (MIDDLE * IN_WG + PAD_SIZE)]);   // Adjust in pointer the same way writeMiddleInLine did
   }
 
 #else                                                           // Read data that was not rotated or padded
@@ -162,7 +162,7 @@ void readTailFusedLine(CP(T2) in, T2 *u, u32 line, u32 me) {
   for (i32 i = 0; i < NH; ++i) {
     u32 fftMiddleIn_y = i * G_H + me;                           // The fftMiddleIn y value
     u32 chunk_y = fftMiddleIn_y / SIZEY;                        // The fftMiddleIn chunk_y value
-    u[i] = in[chunk_y * (MIDDLE * IN_WG)];                      // Adjust in pointer the same way writeMiddleInLine did
+    u[i] = NTLOAD(in[chunk_y * (MIDDLE * IN_WG)]);              // Adjust in pointer the same way writeMiddleInLine did
   }
 
 #endif
@@ -192,10 +192,10 @@ void writeTailFusedLine(T2 *u, P(T2) out, u32 line, u32 me) {
 #else
   out += line * (SMALL_HEIGHT + PAD_SIZE) + me;                         // Pad every output line
 #endif
-  for (u32 i = 0; i < NH; ++i) { out[i * G_H] = u[i]; }
+  for (u32 i = 0; i < NH; ++i) { NTSTORE(out[i * G_H], u[i]); }
 #else                                                                   // No padding, might be better on nVidia cards
   out += line * SMALL_HEIGHT + me;
-  for (u32 i = 0; i < NH; ++i) { out[i * G_H] = u[i]; }
+  for (u32 i = 0; i < NH; ++i) { NTSTORE(out[i * G_H], u[i]); }
 #endif
 }
 
@@ -209,10 +209,10 @@ void readMiddleOutLine(T2 *u, CP(T2) in, u32 y, u32 x) {
 #else
   in += y * MIDDLE * (SMALL_HEIGHT + PAD_SIZE) + x;
 #endif
-  for (i32 i = 0; i < MIDDLE; ++i) { u[i] = in[i * (SMALL_HEIGHT + PAD_SIZE)]; }
+  for (i32 i = 0; i < MIDDLE; ++i) { u[i] = NTLOAD(in[i * (SMALL_HEIGHT + PAD_SIZE)]); }
 #else                                                                   // No rotation, might be better on nVidia cards
   in += y * MIDDLE * SMALL_HEIGHT + x;
-  for (i32 i = 0; i < MIDDLE; ++i) { u[i] = in[i * SMALL_HEIGHT]; }
+  for (i32 i = 0; i < MIDDLE; ++i) { u[i] = NTLOAD(in[i * SMALL_HEIGHT]); }
 #endif
 }
 
@@ -279,7 +279,7 @@ void writeMiddleOutLine (P(T2) out, T2 *u, u32 chunk_y, u32 chunk_x)
                                                         //              = WIDTH / (OUT_WG / OUT_SIZEX) * (MIDDLE * OUT_WG + PAD_SIZE)
                                                         //              = WIDTH * MIDDLE * OUT_SIZEX + WIDTH / SIZEY * PAD_SIZE
   // Write each u[i] sequentially
-  for (int i = 0; i < MIDDLE; ++i) { out[i * OUT_WG] = u[i]; }
+  for (int i = 0; i < MIDDLE; ++i) { NTSTORE(out[i * OUT_WG], u[i]); }
 
 #else
 
@@ -289,7 +289,7 @@ void writeMiddleOutLine (P(T2) out, T2 *u, u32 chunk_y, u32 chunk_x)
                                         //                       = MIDDLE * WIDTH / (OUT_WG / OUT_SIZEX) * OUT_WG
                                         //                       = MIDDLE * WIDTH * OUT_SIZEX
   // Write each u[i] sequentially
-  for (int i = 0; i < MIDDLE; ++i) { out[i * OUT_WG] = u[i]; }
+  for (int i = 0; i < MIDDLE; ++i) { NTSTORE(out[i * OUT_WG], u[i]); }
 
 #endif
 }
@@ -318,7 +318,7 @@ void readCarryFusedLine(CP(T2) in, T2 *u, u32 line) {
   for (i32 i = 0; i < NW; ++i) {
     u32 fftMiddleOut_y = i * G_W + me;                          // The fftMiddleOut y value
     u32 chunk_y = fftMiddleOut_y / SIZEY;                       // The fftMiddleOut chunk_y value
-    u[i] = in[chunk_y * (MIDDLE * OUT_WG + PAD_SIZE)];          // Adjust in pointer the same way writeMiddleOutLine did
+    u[i] = NTLOAD(in[chunk_y * (MIDDLE * OUT_WG + PAD_SIZE)]);  // Adjust in pointer the same way writeMiddleOutLine did
   }
 
 #else                                                           // Read data that was not rotated or padded
@@ -339,7 +339,7 @@ void readCarryFusedLine(CP(T2) in, T2 *u, u32 line) {
   for (i32 i = 0; i < NW; ++i) {
     u32 fftMiddleOut_y = i * G_W + me;                          // The fftMiddleOut y value
     u32 chunk_y = fftMiddleOut_y / SIZEY;                       // The fftMiddleOut chunk_y value
-    u[i] = in[chunk_y * MIDDLE * OUT_WG];                       // Adjust in pointer the same way writeMiddleOutLine did
+    u[i] = NTLOAD(in[chunk_y * MIDDLE * OUT_WG]);               // Adjust in pointer the same way writeMiddleOutLine did
   }
 
 #endif
diff --git a/src/cl/tailsquare.cl b/src/cl/tailsquare.cl
index 2351a15..fc70854 100644
--- a/src/cl/tailsquare.cl
+++ b/src/cl/tailsquare.cl
@@ -148,7 +148,7 @@ KERNEL(G_H) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig) {
   // The trig values used here are pre-computed and stored after the fft_HEIGHT trig values.
   u32 height_trigs = SMALL_HEIGHT/NH*(NH-1);
   // Read pre-computed trig values
-  T2 trig = smallTrig[height_trigs + line1*G_H + me];
+  T2 trig = NTLOAD(smallTrig[height_trigs + line1*G_H + me]);
 #endif
 
 #if SINGLE_KERNEL
@@ -259,7 +259,7 @@ KERNEL(G_H * 2) tailSquare(P(T2) out, CP(T2) in, Trig smallTrig) {
   // The trig values used here are pre-computed and stored after the fft_HEIGHT trig values.
   u32 height_trigs = SMALL_HEIGHT/NH*(NH-1);
   // Read pre-computed trig values
-  T2 trig = smallTrig[height_trigs + line_u*G_H*2 + me];
+  T2 trig = NTLOAD(smallTrig[height_trigs + line_u*G_H*2 + me]);
 #endif
 
   bar(G_H);