148 changes: 148 additions & 0 deletions challenges/hard/57_sliding_window_attn/challenge.html
@@ -0,0 +1,148 @@
<p>
Implement <strong>Sliding Window Self-Attention</strong> for a given set of matrices.
Before introducing the sliding window version, let's first recall standard Self-Attention.
</p>

<h3>1. Standard Softmax Attention</h3>
<p>
Given query matrix <code>Q</code>, key matrix <code>K</code>, and value matrix <code>V</code>, each position <code>i</code> attends to all positions <code>j</code> using a softmax-weighted sum:
</p>

<p style="text-align:center;">
\( \text{score}_{i,j} = \frac{Q_i \cdot K_j}{\sqrt{d}} \)
</p>

<p style="text-align:center;">
\( \text{output}_i = \sum_{j=1}^{M} \text{softmax}(\text{score}_{i,*})_j \cdot V_j \)
</p>

<p>
In other words, each query computes similarity with all keys, applies a softmax to get attention weights, and then computes a weighted sum of values.
</p>
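<p>
As a point of reference, the following is a minimal PyTorch sketch of the two formulas above (the helper name <code>standard_attention</code> is illustrative and not part of the challenge interface):
</p>

<pre><code>import torch

def standard_attention(Q: torch.Tensor, K: torch.Tensor, V: torch.Tensor) -> torch.Tensor:
    # score[i, j] = (Q_i . K_j) / sqrt(d)
    d = Q.shape[1]
    scores = (Q @ K.T) / (d ** 0.5)
    # softmax over j, then a weighted sum of the value rows
    attn = torch.softmax(scores, dim=1)
    return attn @ V
</code></pre>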

<h3>2. Sliding Window Self-Attention</h3>
<p>
Sliding Window Attention modifies standard attention by restricting each query to attend only to a local window around its position.
</p>

<ul>
<li>For each position <code>i</code>, only consider the keys and values within a window of size <code>window_size</code> around <code>i</code> (positions <code>[i-window_size, ..., i+window_size]</code>, clipped to the valid sequence range).</li>
<li>Compute similarity scores between <code>Q<sub>i</sub></code> and the keys in this window:</li>
</ul>

<p style="text-align:center;">
\( \text{score}_{i,j} = \frac{Q_i \cdot K_j}{\sqrt{d}} \)
</p>

<ul>
<li>Apply <code>softmax</code> over these local scores to obtain attention weights.</li>
<li>Use the weights to compute a weighted average of the values in the same window:</li>
</ul>

<p style="text-align:center;">
\( \text{output}_i = \sum_{j \in [i-\text{window_size}, \, i+\text{window_size}]} \text{softmax}(\text{score}_{i,*})_j \cdot V_j \)
</p>

<p>
In short, each query only attends to its nearby neighbors.
</p>
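<p>
To make the windowing concrete, here is a small per-position PyTorch sketch (the helper name <code>sliding_window_attention</code> is illustrative and not the required <code>solve</code> interface); each row's window is <code>[i-window_size, i+window_size]</code> intersected with the valid positions:
</p>

<pre><code>import torch

def sliding_window_attention(Q, K, V, window_size):
    M, d = Q.shape
    output = torch.empty_like(V)
    for i in range(M):
        # window clipped to the valid sequence range
        lo, hi = max(0, i - window_size), min(M, i + window_size + 1)
        scores = (K[lo:hi] @ Q[i]) / (d ** 0.5)   # local similarity scores
        weights = torch.softmax(scores, dim=0)    # softmax over the window only
        output[i] = weights @ V[lo:hi]            # weighted sum of local values
    return output
</code></pre>

<p>
An equivalent vectorized formulation masks every score with <code>|i - j| &gt; window_size</code> to <code>-inf</code> before applying a single row-wise softmax.
</p>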


<h2>Implementation Requirements</h2>
<ul>
<li>Use only native features (external libraries are not permitted)</li>
<li>The
<code>solve</code> function signature must remain unchanged
</li>
<li>The final result must be stored in the output matrix
<code>output</code>
</li>
</ul>
<h2>Example 1:</h2>
<p>
<strong>Input:</strong><br>
<code>Q</code> (2×4):
\[
\begin{bmatrix}
1.0 & 0.0 & 0.0 & 0.0 \\
0.0 & 1.0 & 0.0 & 0.0
\end{bmatrix}
\]
<code>K</code> (2×4):
\[
\begin{bmatrix}
1.0 & 0.0 & 0.0 & 0.0 \\
0.0 & 1.0 & 0.0 & 0.0
\end{bmatrix}
\]
<code>V</code> (2×4):
\[
\begin{bmatrix}
1.0 & 2.0 & 3.0 & 4.0 \\
5.0 & 6.0 & 7.0 & 8.0
\end{bmatrix}
\]
<code>window_size</code>: 1
</p>

<p>
<strong>Output:</strong><br>
<code>output</code> (2×4):
\[
\begin{bmatrix}
2.5101628 & 3.5101628 & 4.510163 & 5.510163 \\
3.4898374 & 4.4898376 & 5.4898376 & 6.489837
\end{bmatrix}
\]
</p>
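<p>
Sanity check: with <code>M = 2</code> and <code>window_size = 1</code>, each window already covers both rows, so this coincides with full attention. For row 0 the scores are <code>[0.5, 0.0]</code>, the softmax weights are approximately <code>[0.622, 0.378]</code>, and the first output entry is <code>0.622·1.0 + 0.378·5.0 ≈ 2.51</code>.
</p>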


<h2>Example 2:</h2>
<p>
<strong>Input:</strong><br>
<code>Q</code> (2×3):
\[
\begin{bmatrix}
0.0 & 0.0 & 0.0 \\
0.0 & 1.0 & 0.0
\end{bmatrix}
\]
<code>K</code> (2×3):
\[
\begin{bmatrix}
1.0 & 0.0 & 0.0 \\
0.0 & 1.0 & 0.0
\end{bmatrix}
\]
<code>V</code> (2×3):
\[
\begin{bmatrix}
1.0 & 2.0 & 3.0 \\
5.0 & 6.0 & 7.0
\end{bmatrix}
\]
<code>window_size</code>: 1
</p>

<p>
<strong>Output:</strong><br>
<code>output</code> (2×3):
\[
\begin{bmatrix}
3.0 & 4.0 & 5.0 \\
3.5618298 & 4.56183 & 5.5618296
\end{bmatrix}
\]
</p>
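<p>
Here row 0 has an all-zero query, so both scores are <code>0</code>, the weights are exactly <code>[0.5, 0.5]</code>, and the output row is the column-wise average of <code>V</code>.
</p>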



<h2>Constraints</h2>
<ul>
<li>Matrices <code>Q</code>, <code>K</code>, and <code>V</code> are all of size <code>M×d</code></li>
<li>1 &le; <code>M</code> &le; 10000</li>
<li>1 &le; <code>d</code> &le; 128</li>
<li>1 &le; <code>window_size</code> &le; 32</li>
<li>All elements in <code>Q</code>, <code>K</code>, and <code>V</code> are sampled from <code>[-100.0, 100.0]</code></li>
<li>Data type for all matrices is <code>float32</code></li>
</ul>
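<p>
With entries up to <code>100.0</code> in magnitude and <code>d</code> up to 128, raw scores can reach roughly <code>10<sup>5</sup></code>, which overflows <code>exp</code> in <code>float32</code>. If you implement the softmax yourself, a numerically stable variant is advisable; a minimal sketch using row-wise max subtraction:
</p>

<pre><code>import torch

def stable_softmax(scores: torch.Tensor) -> torch.Tensor:
    # subtracting the row-wise max keeps exp() in range for float32
    shifted = scores - scores.max(dim=-1, keepdim=True).values
    e = torch.exp(shifted)
    return e / e.sum(dim=-1, keepdim=True)
</code></pre>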
107 changes: 107 additions & 0 deletions challenges/hard/57_sliding_window_attn/challenge.py
@@ -0,0 +1,107 @@
import ctypes
from typing import Any, List, Dict
import torch
from core.challenge_base import ChallengeBase

class Challenge(ChallengeBase):
def __init__(self):
super().__init__(
name="Sliding Window Self-Attention",
atol=1e-05,
rtol=1e-05,
num_gpus=1,
access_tier="free"
)

def reference_impl(self, Q: torch.Tensor, K: torch.Tensor, V: torch.Tensor, output: torch.Tensor, M: int, d: int, window_size: int):
        assert Q.shape == K.shape == V.shape == output.shape == (M, d)

        # Scaled dot-product scores between every pair of positions
        scores = (Q @ K.T) / (d ** 0.5)

        # Mask every pair with |i - j| > window_size to -inf before the softmax
        idxs = torch.arange(M)
        mask = (idxs[None, :] - idxs[:, None]).abs() > window_size
        mask = mask.to(Q.device)
        scores.masked_fill_(mask, float('-inf'))
        attn = torch.softmax(scores, dim=1)

        # Weighted sum of values, written directly into the output tensor
        torch.matmul(attn, V, out=output)

def get_solve_signature(self) -> Dict[str, Any]:
return {
"Q": ctypes.POINTER(ctypes.c_float),
"K": ctypes.POINTER(ctypes.c_float),
"V": ctypes.POINTER(ctypes.c_float),
"output": ctypes.POINTER(ctypes.c_float),
"M": ctypes.c_int,
"d": ctypes.c_int,
"window_size": ctypes.c_int,
}

def generate_example_test(self) -> Dict[str, Any]:
dtype = torch.float32
Q = torch.tensor([[1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0]], device="cuda", dtype=dtype)
        K = torch.tensor([[1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0]], device="cuda", dtype=dtype)
        V = torch.tensor([[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0]], device="cuda", dtype=dtype)
output = torch.empty(2, 4, device="cuda", dtype=dtype)
return {"Q": Q, "K": K, "V": V, "output": output, "M": 2, "d": 4, "window_size": 1}

def generate_functional_test(self) -> List[Dict[str, Any]]:
dtype = torch.float32
tests = []

        # basic_example (matches Example 1 in the statement)
tests.append({
"Q": torch.tensor([[1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0]], device="cuda", dtype=dtype),
"K": torch.tensor([[1.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0]], device="cuda", dtype=dtype),
"V": torch.tensor([[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0]], device="cuda", dtype=dtype),
"output": torch.empty(2, 4, device="cuda", dtype=dtype),
"M": 2, "d": 4, "window_size" : 1
})

        # basic_example (matches Example 2 in the statement)
tests.append({
"Q": torch.tensor([[0.0, 0.0, 0.0], [0.0, 1.0, 0.0]], device="cuda", dtype=dtype),
"K": torch.tensor([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]], device="cuda", dtype=dtype),
"V": torch.tensor([[1.0, 2.0, 3.0], [5.0, 6.0, 7.0]], device="cuda", dtype=dtype),
"output": torch.empty(2, 3, device="cuda", dtype=dtype),
"M": 2, "d": 3, "window_size" : 1
})


# zero_matrices
tests.append({
"Q": torch.zeros((3, 5), device="cuda", dtype=dtype),
"K": torch.zeros((3, 5), device="cuda", dtype=dtype),
"V": torch.zeros((3, 5), device="cuda", dtype=dtype),
"output": torch.empty(3, 5, device="cuda", dtype=dtype),
"M": 3, "d": 5, "window_size" : 2
})

# mixed_values
tests.append({
"Q": torch.tensor([[-1.0, 2.0, -3.0], [4.0, -5.0, 6.0], [-7.0, 8.0, -9.0], [10.0, -11.0, 12.0]], device="cuda", dtype=dtype),
"K": torch.tensor([[2.0, -1.0, 3.0], [-4.0, 5.0, -6.0], [7.0, -8.0, 9.0], [-10.0, 11.0, -12.0]], device="cuda", dtype=dtype),
"V": torch.tensor([[1.0, 0.5, -0.5], [-1.0, 2.0, 3.0], [4.0, -2.0, 1.0], [0.0, 1.0, -1.0]], device="cuda", dtype=dtype),
"output": torch.empty(4, 3, device="cuda", dtype=dtype),
"M": 4, "d": 3, "window_size" : 2
})

# large_matrices
tests.append({
"Q": torch.empty((128, 32), device="cuda", dtype=dtype).uniform_(-0.1, 0.1),
"K": torch.empty((128, 32), device="cuda", dtype=dtype).uniform_(-0.1, 0.1),
"V": torch.empty((128, 32), device="cuda", dtype=dtype).uniform_(-0.1, 0.1),
"output": torch.empty(128, 32, device="cuda", dtype=dtype),
"M": 128, "d": 32, "window_size" : 8
})

return tests

def generate_performance_test(self) -> Dict[str, Any]:
dtype = torch.float32
M, d, window_size = 10000, 128, 32
Q = torch.empty((M, d), device="cuda", dtype=dtype).uniform_(-100, 100)
K = torch.empty((M, d), device="cuda", dtype=dtype).uniform_(-100, 100)
V = torch.empty((M, d), device="cuda", dtype=dtype).uniform_(-100, 100)
output = torch.empty(M, d, device="cuda", dtype=dtype)
return {"Q": Q, "K": K, "V": V, "output": output, "M": M, "d": d, "window_size" : window_size}
6 changes: 6 additions & 0 deletions challenges/hard/57_sliding_window_attn/starter/starter.cu
@@ -0,0 +1,6 @@
#include <cuda_runtime.h>

// Q, K, V, output are device pointers
extern "C" void solve(const float* Q, const float* K, const float* V, float* output, int M, int d, int window_size) {

}
10 changes: 10 additions & 0 deletions challenges/hard/57_sliding_window_attn/starter/starter.mojo
@@ -0,0 +1,10 @@
from gpu.host import DeviceContext
from gpu.id import block_dim, block_idx, thread_idx
from memory import UnsafePointer
from math import ceildiv

# Q, K, V, output are device pointers (i.e. pointers to memory on the GPU)
@export
def solve(Q: UnsafePointer[Float32], K: UnsafePointer[Float32], V: UnsafePointer[Float32],
output: UnsafePointer[Float32], M: Int32, d: Int32, window_size: Int32):
pass
@@ -0,0 +1,6 @@
import torch

# Q, K, V, output are tensors on the GPU
def solve(Q: torch.Tensor, K: torch.Tensor, V: torch.Tensor, output: torch.Tensor,
M: int, d: int, window_size: int):
pass
@@ -0,0 +1,7 @@
import torch
import triton
import triton.language as tl

# Q, K, V, output are tensors on the GPU
def solve(Q: torch.Tensor, K: torch.Tensor, V: torch.Tensor, output: torch.Tensor, M: int, d: int, window_size: int):
pass