diff --git a/challenges/medium/55_max_2d_subarray_sum/challenge.html b/challenges/medium/55_max_2d_subarray_sum/challenge.html new file mode 100644 index 0000000..ceedace --- /dev/null +++ b/challenges/medium/55_max_2d_subarray_sum/challenge.html @@ -0,0 +1,35 @@ +

+ Implement a program that computes the maximum sum over all contiguous 2D subarrays (windows) of size exactly window_size x window_size. You are given a 2D array input of size N x N consisting of 32-bit signed integers, and an integer window_size. +

+ +

Implementation Requirements

+ + +

Example 1:

+
+Input:  input = [[1, 2, 3],
+                 [4, 5, 1],
+                 [5, 1, 7]]
+        window_size = 2
+Output: output = 15
+
+ +

Example 2:

+
+Input:  input = [[-1, -2, -3],
+                 [-4, -5, -1],
+                 [-5, -1, -7]]
+        window_size = 2
+Output: output = -11
+
+ +

Constraints

+
\ No newline at end of file
diff --git a/challenges/medium/55_max_2d_subarray_sum/challenge.py b/challenges/medium/55_max_2d_subarray_sum/challenge.py
new file mode 100644
index 0000000..109751d
--- /dev/null
+++ b/challenges/medium/55_max_2d_subarray_sum/challenge.py
@@ -0,0 +1,161 @@
+import ctypes
+from typing import Any, List, Dict
+import torch
+from core.challenge_base import ChallengeBase
+
+class Challenge(ChallengeBase):
+    """Max-sum search over every window_size x window_size window of an N x N int32 matrix."""
+
+    def __init__(self):
+        super().__init__(
+            name="Max 2D Subarray Sum",
+            atol=1e-05,
+            rtol=1e-05,
+            num_gpus=1,
+            access_tier="free"
+        )
+
+    def reference_impl(self, input: torch.Tensor, output: torch.Tensor, N: int, window_size: int):
+        """Write the largest window sum of ``input`` into ``output[0]``.
+
+        Args:
+            input: int32 matrix of shape (N, N).
+            output: int32 tensor of shape (1,) that receives the result.
+            N: side length of ``input``.
+            window_size: side length of the square window; 1 <= window_size <= N.
+        """
+        # Validate input types and shapes
+        assert input.shape == (N, N)
+        assert output.shape == (1,)
+        assert input.dtype == torch.int32
+        assert output.dtype == torch.int32
+        # A window of size 0 or larger than N leaves nothing to take the max over.
+        assert 1 <= window_size <= N
+
+        # 2D inclusive prefix sums, accumulated in int64 so partial sums do
+        # not wrap at 32 bits.
+        psum = input.cumsum(dim=0, dtype=torch.int64).cumsum(dim=1)
+        # Prepend a zero row and column so each window sum is four lookups.
+        # Allocate on input's device so the computation stays on that device.
+        padded = torch.zeros((N + 1, N + 1), dtype=torch.int64, device=input.device)
+        padded[1:, 1:] = psum
+
+        # Corner views of the padded table: element (i, j) of each view
+        # belongs to the window whose top-left cell is input[i, j].
+        top_left = padded[:-window_size, :-window_size]
+        top_right = padded[:-window_size, window_size:]
+        bottom_left = padded[window_size:, :-window_size]
+        bottom_right = padded[window_size:, window_size:]
+        window_sums = bottom_right - top_right - bottom_left + top_left
+
+        output[0] = torch.max(window_sums)
+
+    def get_solve_signature(self) -> Dict[str, Any]:
+        """Return the ctypes type of each ``solve`` argument, keyed by parameter name."""
+        return {
+            "input": ctypes.POINTER(ctypes.c_int),
+            "output": ctypes.POINTER(ctypes.c_int),
+            "N": ctypes.c_int,
+            "window_size": ctypes.c_int
+        }
+
+    def generate_example_test(self) -> Dict[str, Any]:
+        """Return a 3 x 3 example with window_size 2 (expected maximum: 15)."""
+        dtype = torch.int32
+        input = torch.tensor([[1, 2, 3], [4, 5, 1], [5, 1, 7]], device="cuda", dtype=dtype)
+        output = torch.empty(1, device="cuda", dtype=dtype)
+        return {
+            "input": input,
+            "output": output,
+            "N": 3,
+            "window_size": 2
+        }
+
+    def generate_functional_test(self) -> List[Dict[str, Any]]:
+        """Return fixed and random functional cases, including window_size of 1 and of N."""
+        dtype = torch.int32
+        tests = []
+
+        # basic_example
+        tests.append({
+            "input": torch.tensor([[-1, -2, -3], [-4, -5, -1], [-5, -1, -7]], device="cuda", dtype=dtype),
+            "output": torch.empty(1, device="cuda", dtype=dtype),
+            "N": 3,
+            "window_size": 2
+        })
+
+        # all_same_value
+        tests.append({
+            "input": torch.tensor([[2]*16] * 16, device="cuda", dtype=dtype),
+            "output": torch.empty(1, device="cuda", dtype=dtype),
+            "N": 16,
+            "window_size": 16
+        })
+
+        tests.append({
+            "input": torch.tensor([[2]*16] * 16, device="cuda", dtype=dtype),
+            "output": torch.empty(1, device="cuda", dtype=dtype),
+            "N": 16,
+            "window_size": 15
+        })
+
+        tests.append({
+            "input": torch.tensor([[2]*16] * 16, device="cuda", dtype=dtype),
+            "output": torch.empty(1, device="cuda", dtype=dtype),
+            "N": 16,
+            "window_size": 1
+        })
+
+        # all_minus_value
+        tests.append({
+            "input": torch.tensor([[-10]*10]*10, device="cuda", dtype=dtype),
+            "output": torch.empty(1, device="cuda", dtype=dtype),
+            "N": 10,
+            "window_size": 5
+        })
+
+        # random_negative_values
+        tests.append({
+            "input": torch.randint(-10, 0, (123, 123), device="cuda", dtype=dtype),
+            "output": torch.empty(1, device="cuda", dtype=dtype),
+            "N": 123,
+            "window_size": 7
+        })
+
+        # random_mixed_sign
+        tests.append({
+            "input": torch.randint(-10, 11, (123, 123), device="cuda", dtype=dtype),
+            "output": torch.empty(1, device="cuda", dtype=dtype),
+            "N": 123,
+            "window_size": 7
+        })
+
+        # medium_size
+        tests.append({
+            "input": torch.randint(-10, 11, (1000, 1000), device="cuda", dtype=dtype),
+            "output": torch.empty(1, device="cuda", dtype=dtype),
+            "N": 1000,
+            "window_size": 476
+        })
+
+        # large_size
+        tests.append({
+            "input": torch.randint(-10, 11, (3000, 3000), device="cuda", dtype=dtype),
+            "output": torch.empty(1, device="cuda", dtype=dtype),
+            "N": 3000,
+            "window_size": 2011
+        })
+
+        return tests
+
+    def generate_performance_test(self) -> Dict[str, Any]:
+        """Return the performance case: a random 5000 x 5000 matrix with window_size 2500."""
+        dtype = torch.int32
+        input = torch.randint(-10, 11, (5000, 5000), device="cuda", dtype=dtype)
+        output = torch.empty(1, device="cuda", dtype=dtype)
+        return {
+            "input": input,
+            "output": output,
+            "N": 5000,
+            "window_size": 2500
+        }
\ No newline at end of file
diff --git a/challenges/medium/55_max_2d_subarray_sum/starter/starter.cu b/challenges/medium/55_max_2d_subarray_sum/starter/starter.cu
new file mode 100644
index 0000000..a6b1393
--- /dev/null
+++ b/challenges/medium/55_max_2d_subarray_sum/starter/starter.cu
@@ -0,0 +1,6 @@
+#include <cuda_runtime.h>
+
+// input, output are device pointers (i.e. pointers to memory on the GPU)
+extern "C" void solve(const int* input, int* output, int N, int window_size) {
+
+}
\ No newline at end of file
diff --git a/challenges/medium/55_max_2d_subarray_sum/starter/starter.mojo b/challenges/medium/55_max_2d_subarray_sum/starter/starter.mojo
new file mode 100644
index 0000000..fb1ba38
--- /dev/null
+++ b/challenges/medium/55_max_2d_subarray_sum/starter/starter.mojo
@@ -0,0 +1,9 @@
+from gpu.host import DeviceContext
+from gpu.id import block_dim, block_idx, thread_idx
+from memory import UnsafePointer
+from math import ceildiv
+
+# input, output are device pointers (i.e. pointers to memory on the GPU)
+@export
+def solve(input: UnsafePointer[Int32], output: UnsafePointer[Int32], N: Int32, window_size: Int32):
+    pass
diff --git a/challenges/medium/55_max_2d_subarray_sum/starter/starter.pytorch.py b/challenges/medium/55_max_2d_subarray_sum/starter/starter.pytorch.py
new file mode 100644
index 0000000..59ec21d
--- /dev/null
+++ b/challenges/medium/55_max_2d_subarray_sum/starter/starter.pytorch.py
@@ -0,0 +1,5 @@
+import torch
+
+# input, output are tensors on the GPU
+def solve(input: torch.Tensor, output: torch.Tensor, N: int, window_size: int):
+    pass
diff --git a/challenges/medium/55_max_2d_subarray_sum/starter/starter.triton.py b/challenges/medium/55_max_2d_subarray_sum/starter/starter.triton.py
new file mode 100644
index 0000000..3a4a1ec
--- /dev/null
+++ b/challenges/medium/55_max_2d_subarray_sum/starter/starter.triton.py
@@ -0,0 +1,7 @@
+import torch
+import triton
+import triton.language as tl
+
+# input, output are tensors on the GPU
+def solve(input: torch.Tensor, output: torch.Tensor, N: int, window_size: int):
+    pass
\ No newline at end of file