Skip to content

Commit 5084bab

Browse files
oandreeva-nvjasonqinzhou
authored andcommitted
fix: FullyContiguous Memory Region Size Bug (#3175)
Signed-off-by: Olga Andreeva <oandreeva@nvidia.com> Signed-off-by: Jason Zhou <jasonzho@nvidia.com>
1 parent b01443a commit 5084bab

File tree

3 files changed

+231
-4
lines changed

3 files changed

+231
-4
lines changed

lib/llm/src/block_manager/block/data/local.rs

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,15 @@ impl<S: Storage> BlockDataViews<S> for LocalBlockData<S> {
124124
if self.is_fully_contiguous() {
125125
let mr = self.layout.memory_region(self.block_idx, 0, 0)?;
126126
let offset = mr.addr();
127-
let size = mr.size() * self.num_layers();
127+
let size = mr.size()
128+
.checked_mul(self.num_layers())
129+
.and_then(|intermediate| intermediate.checked_mul(self.num_outer_dims()))
130+
.ok_or_else(|| {
131+
BlockError::InvalidState(format!(
132+
"Block size calculation overflow: region_size={} * layers={} * outer_dims={}",
133+
mr.size(), self.num_layers(), self.num_outer_dims()
134+
))
135+
})?;
128136
let storage_type = mr.storage_type();
129137
unsafe { view::BlockView::new(self, offset, size, storage_type) }
130138
} else {
@@ -138,7 +146,15 @@ impl<S: Storage> BlockDataViews<S> for LocalBlockData<S> {
138146
if self.is_fully_contiguous() {
139147
let mr = self.layout.memory_region(self.block_idx, 0, 0)?;
140148
let offset = mr.addr();
141-
let size = mr.size() * self.num_layers();
149+
let size = mr.size()
150+
.checked_mul(self.num_layers())
151+
.and_then(|intermediate| intermediate.checked_mul(self.num_outer_dims()))
152+
.ok_or_else(|| {
153+
BlockError::InvalidState(format!(
154+
"Block size calculation overflow: region_size={} * layers={} * outer_dims={}",
155+
mr.size(), self.num_layers(), self.num_outer_dims()
156+
))
157+
})?;
142158
let storage_type = mr.storage_type();
143159
unsafe { view::BlockViewMut::new(self, offset, size, storage_type) }
144160
} else {

lib/llm/src/block_manager/layout.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -325,7 +325,7 @@ impl FullyContiguousConfig {
325325
let outer_dim_stride_in_bytes =
326326
config.page_size * config.inner_dim * config.dtype_width_bytes;
327327
let layer_stride_in_bytes = outer_dim_stride_in_bytes * config.outer_dim;
328-
let memory_region_size = layer_stride_in_bytes;
328+
let memory_region_size = outer_dim_stride_in_bytes;
329329
let natural_block_stride = config.num_layers * layer_stride_in_bytes;
330330

331331
let block_stride_in_bytes = if alignment > 1 {

lib/llm/src/block_manager/layout/utils.rs

Lines changed: 212 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,135 @@
11
// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
// SPDX-License-Identifier: Apache-2.0
33

4-
use crate::block_manager::layout::{BlockLayoutConfig, LayoutError};
4+
use crate::block_manager::layout::{BlockLayoutConfig, GenericBlockLayout, LayoutError};
55
use crate::block_manager::storage::Storage;
66

77
use validator::ValidationError;
88

9+
/// Verification result for a memory region
10+
#[derive(Debug, Clone)]
11+
#[allow(dead_code)]
12+
pub struct RegionVerificationResult {
13+
/// Block index that was verified
14+
pub block_idx: usize,
15+
/// Layer index that was verified
16+
pub layer_idx: usize,
17+
/// Outer dimension index that was verified
18+
pub outer_idx: usize,
19+
/// Expected memory address for this region
20+
pub expected_addr: usize,
21+
/// Actual memory address for this region
22+
pub actual_addr: usize,
23+
/// Expected size in bytes for this region
24+
pub expected_size: usize,
25+
/// Actual size in bytes for this region
26+
pub actual_size: usize,
27+
/// Whether the addresses match
28+
pub addr_matches: bool,
29+
/// Whether the sizes match
30+
pub size_matches: bool,
31+
}
32+
33+
/// Layout verification statistics
34+
#[derive(Debug, Clone, Default)]
35+
#[allow(dead_code)]
36+
pub struct LayoutVerificationStats {
37+
/// Total number of memory regions verified
38+
pub total_regions: usize,
39+
/// Number of regions with address mismatches
40+
pub addr_mismatches: usize,
41+
/// Number of regions with size mismatches
42+
pub size_mismatches: usize,
43+
/// Number of regions that passed all verifications
44+
pub successful_verifications: usize,
45+
}
46+
47+
#[derive(Debug)]
48+
#[allow(dead_code)]
49+
pub struct WorkerLayoutVerifier {
50+
stats: LayoutVerificationStats,
51+
}
52+
53+
#[allow(dead_code)]
54+
impl WorkerLayoutVerifier {
55+
// Constructor: Start with clean slate
56+
pub fn new() -> Self {
57+
Self {
58+
stats: LayoutVerificationStats::default(),
59+
}
60+
}
61+
62+
pub fn verify_layout_consistency<L: GenericBlockLayout>(
63+
&mut self,
64+
layout: &L,
65+
) -> Result<Vec<RegionVerificationResult>, LayoutError> {
66+
// This is the main orchestrator method.
67+
// It systematically checks every memory region in
68+
// the layout to ensure consistency.
69+
70+
self.stats = LayoutVerificationStats::default();
71+
let mut results = Vec::new();
72+
73+
// Iterate over all blocks, layers, and outer dimensions
74+
for block_idx in 0..layout.num_blocks() {
75+
for layer_idx in 0..layout.num_layers() {
76+
for outer_idx in 0..layout.outer_dim() {
77+
let result =
78+
self.verify_memory_region(layout, block_idx, layer_idx, outer_idx)?;
79+
self.update_stats(&result);
80+
results.push(result);
81+
}
82+
}
83+
}
84+
85+
Ok(results)
86+
}
87+
88+
pub fn verify_memory_region<L: GenericBlockLayout>(
89+
&mut self,
90+
layout: &L,
91+
block_idx: usize,
92+
layer_idx: usize,
93+
outer_idx: usize,
94+
) -> Result<RegionVerificationResult, LayoutError> {
95+
let memory_region = layout.memory_region(block_idx, layer_idx, outer_idx)?;
96+
97+
let config = layout.config();
98+
let expected_size = config.page_size * config.inner_dim * config.dtype_width_bytes;
99+
100+
let expected_addr = memory_region.addr();
101+
102+
Ok(RegionVerificationResult {
103+
block_idx,
104+
layer_idx,
105+
outer_idx,
106+
expected_addr,
107+
actual_addr: memory_region.addr(),
108+
expected_size,
109+
actual_size: memory_region.size(),
110+
addr_matches: expected_addr == memory_region.addr(),
111+
size_matches: expected_size == memory_region.size(),
112+
})
113+
}
114+
115+
fn update_stats(&mut self, result: &RegionVerificationResult) {
116+
self.stats.total_regions += 1;
117+
if !result.addr_matches {
118+
self.stats.addr_mismatches += 1;
119+
}
120+
if !result.size_matches {
121+
self.stats.size_mismatches += 1;
122+
}
123+
if result.addr_matches && result.size_matches {
124+
self.stats.successful_verifications += 1;
125+
}
126+
}
127+
128+
pub fn has_critical_mismatches(&self) -> bool {
129+
self.stats.size_mismatches > 0
130+
}
131+
}
132+
9133
/// Validation function for Option<usize> to check if it's Some(power_of_2).
10134
pub fn validate_power_of_2(alignment: usize) -> Result<(), ValidationError> {
11135
if !alignment.is_power_of_two() {
@@ -87,3 +211,90 @@ pub fn validate_indices<C: BlockLayoutConfig>(
87211

88212
Ok(())
89213
}
214+
215+
#[cfg(test)]
216+
mod worker_verification_tests {
217+
use super::*;
218+
use crate::block_manager::LayoutConfig;
219+
use crate::block_manager::layout::{FullyContiguous, LayerSeparate};
220+
use crate::block_manager::storage::tests::{NullDeviceAllocator, NullDeviceStorage};
221+
222+
// Test constants (same as layout.rs tests)
223+
const NUM_BLOCKS: usize = 7;
224+
const NUM_LAYERS: usize = 5;
225+
const OUTER_DIM: usize = 2;
226+
const PAGE_SIZE: usize = 4;
227+
const INNER_DIM: usize = 13;
228+
const DTYPE_WIDTH_BYTES: usize = 4;
229+
230+
fn create_test_config() -> LayoutConfig {
231+
LayoutConfig {
232+
num_blocks: NUM_BLOCKS,
233+
num_layers: NUM_LAYERS,
234+
outer_dim: OUTER_DIM,
235+
page_size: PAGE_SIZE,
236+
inner_dim: INNER_DIM,
237+
alignment: 1,
238+
dtype_width_bytes: DTYPE_WIDTH_BYTES,
239+
}
240+
}
241+
242+
fn create_fully_contiguous_layout() -> Result<FullyContiguous<NullDeviceStorage>, LayoutError> {
243+
let config = create_test_config();
244+
FullyContiguous::allocate(config, &NullDeviceAllocator)
245+
}
246+
247+
fn create_layer_separate_layout() -> Result<LayerSeparate<NullDeviceStorage>, LayoutError> {
248+
let config = create_test_config();
249+
LayerSeparate::allocate(config, &NullDeviceAllocator, true) // outer_contiguous = true
250+
}
251+
252+
#[test]
253+
fn test_verify_initialisation() {
254+
let verifier = WorkerLayoutVerifier::new();
255+
256+
assert_eq!(verifier.stats.total_regions, 0);
257+
assert_eq!(verifier.stats.addr_mismatches, 0);
258+
assert_eq!(verifier.stats.size_mismatches, 0);
259+
assert_eq!(verifier.stats.successful_verifications, 0);
260+
261+
assert!(!verifier.has_critical_mismatches());
262+
}
263+
264+
#[test]
265+
fn test_layer_separate_verification() {
266+
let layout = create_layer_separate_layout().expect("Failed to create LayerSeparate layout");
267+
let mut verifier = WorkerLayoutVerifier::new();
268+
let results = verifier
269+
.verify_layout_consistency(&layout)
270+
.expect("Failed to verify layout");
271+
272+
assert_eq!(results.len(), NUM_BLOCKS * NUM_LAYERS * OUTER_DIM);
273+
assert!(
274+
!verifier.has_critical_mismatches(),
275+
"Expected no critical mismatches but got: total={}, size_mismatches={}, successful={}",
276+
verifier.stats.total_regions,
277+
verifier.stats.size_mismatches,
278+
verifier.stats.successful_verifications
279+
);
280+
}
281+
282+
#[test]
283+
fn test_fully_contiguous_verification() {
284+
let layout =
285+
create_fully_contiguous_layout().expect("Failed to create FullyContiguous layout");
286+
let mut verifier = WorkerLayoutVerifier::new();
287+
let results = verifier
288+
.verify_layout_consistency(&layout)
289+
.expect("Failed to verify layout");
290+
assert_eq!(results.len(), NUM_BLOCKS * NUM_LAYERS * OUTER_DIM);
291+
292+
assert!(
293+
!verifier.has_critical_mismatches(),
294+
"Expected no critical mismatches but got: total={}, size_mismatches={}, successful={}",
295+
verifier.stats.total_regions,
296+
verifier.stats.size_mismatches,
297+
verifier.stats.successful_verifications
298+
);
299+
}
300+
}

0 commit comments

Comments
 (0)