Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions challenges/medium/55_max_2d_subarray_sum/challenge.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
<p>
Implement a program that computes the maximum sum over all contiguous 2D subarrays (windows) of size exactly <code>window_size x window_size</code>. You are given a 2D array <code>input</code> of size <code>N x N</code> consisting of 32-bit signed integers, and an integer <code>window_size</code>.
</p>

<h2>Implementation Requirements</h2>
<ul>
<li>Use only native features (external libraries are not permitted)</li>
<li>The <code>solve</code> function signature must remain unchanged</li>
<li>The final result must be stored in the <code>output</code> variable</li>
</ul>

<h2>Example 1:</h2>
<pre>
Input: input = [[1, 2, 3],
[4, 5, 1],
[5, 1, 7]]
window_size = 2
Output: output = 15
</pre>

<h2>Example 2:</h2>
<pre>
Input: input = [[-1, -2, -3],
[-4, -5, -1],
[-5, -1, -7]]
window_size = 2
Output: output = -11
</pre>

<h2>Constraints</h2>
<ul>
<li>1 &le; <code>N</code> &le; 5,000</li>
<li>-10 &le; <code>input[i][j]</code> &le; 10</li>
<li>1 &le; <code>window_size</code> &le; <code>N</code></li>
</ul>
139 changes: 139 additions & 0 deletions challenges/medium/55_max_2d_subarray_sum/challenge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
import ctypes
from typing import Any, List, Dict
import torch
from core.challenge_base import ChallengeBase

class Challenge(ChallengeBase):
    """Challenge definition for "Max 2D Subarray Sum".

    Given an ``N x N`` int32 matrix and a ``window_size``, the task is to find
    the maximum sum over every contiguous ``window_size x window_size`` window.
    This class supplies the reference implementation, the ``solve`` signature
    and the example / functional / performance test inputs.
    """

    def __init__(self):
        super().__init__(
            name="Max 2D Subarray Sum",
            atol=1e-05,
            rtol=1e-05,
            num_gpus=1,
            access_tier="free",
        )

    def reference_impl(self, input: torch.Tensor, output: torch.Tensor, N: int, window_size: int):
        """Write the maximum ``window_size x window_size`` window sum to ``output[0]``.

        Uses a 2D prefix-sum table so every window sum is obtained from four
        table lookups (inclusion-exclusion).

        Args:
            input: ``(N, N)`` int32 tensor.
            output: ``(1,)`` int32 tensor that receives the result.
            N: Side length of ``input``.
            window_size: Side length of the square window, ``1 <= window_size <= N``.
        """
        # Validate input types and shapes
        assert input.shape == (N, N)
        assert output.shape == (1,)
        assert input.dtype == torch.int32
        assert output.dtype == torch.int32
        # A zero or oversized window would make the slices below empty/misaligned.
        assert 1 <= window_size <= N

        # 2D inclusive prefix sums: psum[i, j] = sum(input[:i + 1, :j + 1]).
        psum = input.cumsum(dim=0).cumsum(dim=1)

        # Pad with a leading zero row/column so windows touching the top/left
        # border need no special case. Allocate on the input's device so the
        # whole computation stays where the data lives instead of silently
        # falling back to the default (CPU) device.
        padded = torch.zeros((N + 1, N + 1), dtype=torch.int32, device=input.device)
        padded[1:, 1:] = psum

        # Four shifted views of the table; entry (r, c) of each view is the
        # corresponding corner term for the window whose top-left is (r, c).
        top_left = padded[:-window_size, :-window_size]
        top_right = padded[:-window_size, window_size:]
        bottom_left = padded[window_size:, :-window_size]
        bottom_right = padded[window_size:, window_size:]
        window_sums = bottom_right - top_right - bottom_left + top_left

        output[0] = torch.max(window_sums)

    def get_solve_signature(self) -> Dict[str, Any]:
        """Return the ctypes argument types of ``solve``, keyed by parameter name."""
        return {
            "input": ctypes.POINTER(ctypes.c_int),
            "output": ctypes.POINTER(ctypes.c_int),
            "N": ctypes.c_int,
            "window_size": ctypes.c_int,
        }

    def generate_example_test(self) -> Dict[str, Any]:
        """Return the 3x3 example case with ``window_size`` 2."""
        dtype = torch.int32
        input = torch.tensor([[1, 2, 3], [4, 5, 1], [5, 1, 7]], device="cuda", dtype=dtype)
        output = torch.empty(1, device="cuda", dtype=dtype)
        return {
            "input": input,
            "output": output,
            "N": 3,
            "window_size": 2,
        }

    def generate_functional_test(self) -> List[Dict[str, Any]]:
        """Return the functional test cases (fixed, constant and random matrices)."""
        dtype = torch.int32

        def make_case(matrix: torch.Tensor, window_size: int) -> Dict[str, Any]:
            # Every case shares the same layout; N is the side of the square matrix.
            return {
                "input": matrix,
                "output": torch.empty(1, device="cuda", dtype=dtype),
                "N": matrix.shape[0],
                "window_size": window_size,
            }

        tests = []

        # basic_example (all-negative 3x3)
        tests.append(make_case(
            torch.tensor([[-1, -2, -3], [-4, -5, -1], [-5, -1, -7]], device="cuda", dtype=dtype),
            2,
        ))

        # all_same_value: window covering the full matrix, N - 1, and a single cell
        for window_size in (16, 15, 1):
            tests.append(make_case(
                torch.tensor([[2] * 16] * 16, device="cuda", dtype=dtype),
                window_size,
            ))

        # all_minus_value (constant -10)
        tests.append(make_case(
            torch.tensor([[-10] * 10] * 10, device="cuda", dtype=dtype),
            5,
        ))

        # random_negative_values (every entry in [-10, -1])
        tests.append(make_case(
            torch.randint(-10, 0, (123, 123), device="cuda", dtype=dtype),
            7,
        ))

        # random_mixed_values (every entry in [-10, 10])
        tests.append(make_case(
            torch.randint(-10, 11, (123, 123), device="cuda", dtype=dtype),
            7,
        ))

        # medium_size
        tests.append(make_case(
            torch.randint(-10, 11, (1000, 1000), device="cuda", dtype=dtype),
            476,
        ))

        # large_size
        tests.append(make_case(
            torch.randint(-10, 11, (3000, 3000), device="cuda", dtype=dtype),
            2011,
        ))

        return tests

    def generate_performance_test(self) -> Dict[str, Any]:
        """Return the 5000x5000 random case with ``window_size`` 2500."""
        dtype = torch.int32
        input = torch.randint(-10, 11, (5000, 5000), device="cuda", dtype=dtype)
        output = torch.empty(1, device="cuda", dtype=dtype)
        return {
            "input": input,
            "output": output,
            "N": 5000,
            "window_size": 2500,
        }
6 changes: 6 additions & 0 deletions challenges/medium/55_max_2d_subarray_sum/starter/starter.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#include <cuda_runtime.h>

// input, output are device pointers (i.e. pointers to memory on the GPU)
//
// Entry point to implement: compute the maximum sum over all contiguous
// window_size x window_size windows of the N x N matrix `input` and store it
// in `output`.
//   input       - N * N 32-bit signed integers (presumably flattened row-major;
//                 confirm against the test harness)
//   output      - single int that receives the result
//   N           - side length of the square input matrix
//   window_size - side length of the square window, 1 <= window_size <= N
extern "C" void solve(const int* input, int* output, int N, int window_size) {

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from gpu.host import DeviceContext
from gpu.id import block_dim, block_idx, thread_idx
from memory import UnsafePointer
from math import ceildiv

# input, output are device pointers (i.e. pointers to memory on the GPU)
#
# Entry point to implement: compute the maximum sum over all contiguous
# window_size x window_size windows of the N x N matrix `input` and store it
# in `output`.
#   input       - N * N Int32 values (presumably flattened row-major; confirm
#                 against the test harness)
#   output      - single Int32 that receives the result
#   N           - side length of the square input matrix
#   window_size - side length of the square window, 1 <= window_size <= N
@export
def solve(input: UnsafePointer[Int32], output: UnsafePointer[Int32], N: Int32, window_size: Int32):
    pass
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
import torch

# input, output are tensors on the GPU
#
# Entry point to implement: compute the maximum sum over all contiguous
# window_size x window_size windows of `input` and store it in `output`.
#   input       - (N, N) int32 tensor
#   output      - (1,) int32 tensor that receives the result
#   N           - side length of the square input matrix
#   window_size - side length of the square window, 1 <= window_size <= N
def solve(input: torch.Tensor, output: torch.Tensor, N: int, window_size: int):
    pass
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
import torch
import triton
import triton.language as tl

# input, output are tensors on the GPU
#
# Entry point to implement (Triton starter): compute the maximum sum over all
# contiguous window_size x window_size windows of `input` and store it in
# `output`.
#   input       - (N, N) int32 tensor
#   output      - (1,) int32 tensor that receives the result
#   N           - side length of the square input matrix
#   window_size - side length of the square window, 1 <= window_size <= N
def solve(input: torch.Tensor, output: torch.Tensor, N: int, window_size: int):
    pass