@@ -1598,7 +1598,6 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
15981598 const int warpid = threadIdx .x / WARP_SIZE;
15991599 const int laneid = threadIdx .x % WARP_SIZE;
16001600 const int lane2id = laneid % 2 ;
1601- const int lane4id = laneid % 4 ;
16021601 const int lane16id = laneid % 16 ;
16031602 const int rowid = laneid / 16 ;
16041603
@@ -1745,7 +1744,6 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
17451744 const cache_t * k_ptr2 = k_ptr + kblock_number * kv_block_stride;
17461745 const int klocal_token_idx =
17471746 TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id;
1748- const int kglobal_token_idx = partition_start_token_idx + klocal_token_idx;
17491747 const int kphysical_block_offset = klocal_token_idx % BLOCK_SIZE;
17501748 const cache_t * k_ptr3 = k_ptr2 + kphysical_block_offset * KX;
17511749
@@ -2368,7 +2366,6 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
23682366 const int warpid = threadIdx .x / WARP_SIZE;
23692367 const int laneid = threadIdx .x % WARP_SIZE;
23702368 const int lane2id = laneid % 2 ;
2371- const int lane4id = laneid % 4 ;
23722369 const int lane16id = laneid % 16 ;
23732370 const int rowid = laneid / 16 ;
23742371
@@ -2514,7 +2511,6 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
25142511 const cache_t * k_ptr2 = k_ptr + kblock_number * kv_block_stride;
25152512 const int klocal_token_idx =
25162513 TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id;
2517- const int kglobal_token_idx = partition_start_token_idx + klocal_token_idx;
25182514 const int kphysical_block_offset = klocal_token_idx % BLOCK_SIZE;
25192515 const cache_t * k_ptr3 = k_ptr2 + kphysical_block_offset * KX;
25202516
0 commit comments