diff --git a/challenges/medium/55_max_2d_subarray_sum/challenge.html b/challenges/medium/55_max_2d_subarray_sum/challenge.html
new file mode 100644
index 0000000..ceedace
--- /dev/null
+++ b/challenges/medium/55_max_2d_subarray_sum/challenge.html
@@ -0,0 +1,35 @@
+
+ Implement a program that computes the maximum sum over all contiguous 2D subarrays of size exactly window_size x window_size. You are given a 2D array input of size N x N consisting of 32-bit signed integers, and an integer window_size.
+
+
+Implementation Requirements
+
+ - Use only native features (external libraries are not permitted)
+ - The solve function signature must remain unchanged
+ - The final result must be stored in the output variable
+
+
+Example 1:
+
+Input: input = [[1, 2, 3],
+ [4, 5, 1],
+ [5, 1, 7]]
+ window_size = 2
+Output: output = 15
+
+
+Example 2:
+
+Input: input = [[-1, -2, -3],
+ [-4, -5, -1],
+ [-5, -1, -7]]
+ window_size = 2
+Output: output = -11
+
+
+Constraints
+
+ - 1 ≤ N ≤ 5,000
+ - -10 ≤ input[i][j] ≤ 10
+ - 1 ≤ window_size ≤ N
+
\ No newline at end of file
diff --git a/challenges/medium/55_max_2d_subarray_sum/challenge.py b/challenges/medium/55_max_2d_subarray_sum/challenge.py
new file mode 100644
index 0000000..109751d
--- /dev/null
+++ b/challenges/medium/55_max_2d_subarray_sum/challenge.py
@@ -0,0 +1,139 @@
+import ctypes
+from typing import Any, List, Dict
+import torch
+from core.challenge_base import ChallengeBase
+
+class Challenge(ChallengeBase):
+    """Max 2D Subarray Sum challenge: reference implementation, solve signature and test inputs."""
+
+    def __init__(self):
+        """Pass the challenge name, comparison tolerances, GPU count and access tier to ChallengeBase."""
+        super().__init__(
+            name="Max 2D Subarray Sum",
+            atol=1e-05,
+            rtol=1e-05,
+            num_gpus=1,
+            access_tier="free",
+        )
+
+    def reference_impl(self, input: torch.Tensor, output: torch.Tensor, N: int, window_size: int):
+        """Store the maximum window_size x window_size window sum of ``input`` in ``output[0]``.
+
+        Args:
+            input: int32 tensor of shape (N, N).
+            output: int32 tensor of shape (1,) that receives the result.
+            N: side length of ``input``.
+            window_size: side length of the square window.
+
+        Raises:
+            AssertionError: if a shape or dtype does not match the above.
+        """
+        # Validate input types and shapes
+        assert input.shape == (N, N)
+        assert output.shape == (1,)
+        assert input.dtype == torch.int32
+        assert output.dtype == torch.int32
+
+        # Zero-padded 2D prefix sums: padded[i, j] == input[:i, :j].sum().
+        # The buffer is allocated on input's device so that the prefix sums are not
+        # copied to the CPU and the remaining arithmetic stays on that device.
+        psum = input.cumsum(dim=0).cumsum(dim=1)
+        padded = torch.zeros((N + 1, N + 1), dtype=torch.int32, device=input.device)
+        padded[1:, 1:] = psum
+
+        # Inclusion-exclusion over the four corners of each window gives all
+        # (N - window_size + 1)^2 window sums at once.
+        top_left = padded[:-window_size, :-window_size]
+        top_right = padded[:-window_size, window_size:]
+        bottom_left = padded[window_size:, :-window_size]
+        bottom_right = padded[window_size:, window_size:]
+        window_sums = bottom_right - top_right - bottom_left + top_left
+
+        max_sum = torch.max(window_sums)
+        output[0] = max_sum
+
+    def get_solve_signature(self) -> Dict[str, Any]:
+        """Map each ``solve`` parameter name to its ctypes type."""
+        int_ptr = ctypes.POINTER(ctypes.c_int)
+        return {
+            "input": int_ptr,
+            "output": int_ptr,
+            "N": ctypes.c_int,
+            "window_size": ctypes.c_int,
+        }
+
+    def generate_example_test(self) -> Dict[str, Any]:
+        """Return the 3 x 3 example with window_size 2 (its maximum window sum is 15)."""
+        dtype = torch.int32
+        input = torch.tensor([[1, 2, 3], [4, 5, 1], [5, 1, 7]], device="cuda", dtype=dtype)
+        output = torch.empty(1, device="cuda", dtype=dtype)
+        return {
+            "input": input,
+            "output": output,
+            "N": 3,
+            "window_size": 2,
+        }
+
+    def generate_functional_test(self) -> List[Dict[str, Any]]:
+        """Return the functional test cases as a list of ``solve`` argument dicts."""
+        dtype = torch.int32
+
+        def make_case(grid: torch.Tensor, window_size: int) -> Dict[str, Any]:
+            # Each case gets its own uninitialised one-element output buffer.
+            return {
+                "input": grid,
+                "output": torch.empty(1, device="cuda", dtype=dtype),
+                "N": grid.shape[0],
+                "window_size": window_size,
+            }
+
+        def constant_grid(value: int, n: int) -> torch.Tensor:
+            return torch.tensor([[value] * n] * n, device="cuda", dtype=dtype)
+
+        def random_grid(low: int, high: int, n: int) -> torch.Tensor:
+            # torch.randint's upper bound is exclusive.
+            return torch.randint(low, high, (n, n), device="cuda", dtype=dtype)
+
+        return [
+            # basic_example (all-negative 3 x 3 grid)
+            make_case(
+                torch.tensor([[-1, -2, -3], [-4, -5, -1], [-5, -1, -7]], device="cuda", dtype=dtype),
+                2,
+            ),
+            # all_same_value: window equal to the grid, one smaller than the grid, and a single cell
+            make_case(constant_grid(2, 16), 16),
+            make_case(constant_grid(2, 16), 15),
+            make_case(constant_grid(2, 16), 1),
+            # all_minus_value
+            make_case(constant_grid(-10, 10), 5),
+            # all_negative_random (values in [-10, -1])
+            make_case(random_grid(-10, 0, 123), 7),
+            # mixed_sign_random (values in [-10, 10])
+            make_case(random_grid(-10, 11, 123), 7),
+            # medium_size
+            make_case(random_grid(-10, 11, 1000), 476),
+            # large_size
+            make_case(random_grid(-10, 11, 3000), 2011),
+        ]
+
+    def generate_performance_test(self) -> Dict[str, Any]:
+        """Return the performance case: a random 5000 x 5000 grid with window_size 2500."""
+        dtype = torch.int32
+        input = torch.randint(-10, 11, (5000, 5000), device="cuda", dtype=dtype)
+        output = torch.empty(1, device="cuda", dtype=dtype)
+        return {
+            "input": input,
+            "output": output,
+            "N": 5000,
+            "window_size": 2500,
+        }
\ No newline at end of file
diff --git a/challenges/medium/55_max_2d_subarray_sum/starter/starter.cu b/challenges/medium/55_max_2d_subarray_sum/starter/starter.cu
new file mode 100644
index 0000000..a6b1393
--- /dev/null
+++ b/challenges/medium/55_max_2d_subarray_sum/starter/starter.cu
@@ -0,0 +1,6 @@
+#include <cuda_runtime.h>
+
+// Starter stub: the solution body is to be written by the challenge taker.
+// input, output are device pointers (i.e. pointers to memory on the GPU)
+// input holds the N x N grid of ints; the maximum window_size x window_size
+// window sum is to be stored in output.
+extern "C" void solve(const int* input, int* output, int N, int window_size) {
+
+}
\ No newline at end of file
diff --git a/challenges/medium/55_max_2d_subarray_sum/starter/starter.mojo b/challenges/medium/55_max_2d_subarray_sum/starter/starter.mojo
new file mode 100644
index 0000000..fb1ba38
--- /dev/null
+++ b/challenges/medium/55_max_2d_subarray_sum/starter/starter.mojo
@@ -0,0 +1,9 @@
+from gpu.host import DeviceContext
+from gpu.id import block_dim, block_idx, thread_idx
+from memory import UnsafePointer
+from math import ceildiv
+
+# input, output are device pointers (i.e. pointers to memory on the GPU)
+@export
+def solve(input: UnsafePointer[Int32], output: UnsafePointer[Int32], N: Int32, window_size: Int32):
+ # Starter stub: the solution body is to be written by the challenge taker.
+ # It should compute the maximum sum over all window_size x window_size windows
+ # of the N x N `input` grid and store that value in `output`.
+ pass
diff --git a/challenges/medium/55_max_2d_subarray_sum/starter/starter.pytorch.py b/challenges/medium/55_max_2d_subarray_sum/starter/starter.pytorch.py
new file mode 100644
index 0000000..59ec21d
--- /dev/null
+++ b/challenges/medium/55_max_2d_subarray_sum/starter/starter.pytorch.py
@@ -0,0 +1,5 @@
+import torch
+
+# input, output are tensors on the GPU
+def solve(input: torch.Tensor, output: torch.Tensor, N: int, window_size: int):
+    """Starter stub: store the maximum window_size x window_size window sum of input in output."""
diff --git a/challenges/medium/55_max_2d_subarray_sum/starter/starter.triton.py b/challenges/medium/55_max_2d_subarray_sum/starter/starter.triton.py
new file mode 100644
index 0000000..3a4a1ec
--- /dev/null
+++ b/challenges/medium/55_max_2d_subarray_sum/starter/starter.triton.py
@@ -0,0 +1,7 @@
+import torch
+import triton
+import triton.language as tl
+
+# input, output are tensors on the GPU
+def solve(input: torch.Tensor, output: torch.Tensor, N: int, window_size: int):
+    """Starter stub: store the maximum window_size x window_size window sum of input in output."""
\ No newline at end of file