
Commit 3c4ae35

nlutsenko authored and meta-codesync[bot] committed

clang-format | Format fbsource with clang-format 21.

Reviewed By: ChristianK275
Differential Revision: D85317706
fbshipit-source-id: b399c5c4b75252999442b7d7d2778e7a241b0025
1 parent 6a9e616 commit 3c4ae35

File tree

60 files changed: +865 / −714 lines


bench/BenchUtils.h

Lines changed: 3 additions & 2 deletions
@@ -274,8 +274,9 @@ void performance_test(
   aligned_vector<float> B(Bint.begin(), Bint.end());
   std::vector<std::unique_ptr<PackedGemmMatrixB<btype>>> Bp;
   for (int i = 0; i < num_instances; ++i) {
-    Bp.emplace_back(std::unique_ptr<PackedGemmMatrixB<btype>>(
-        new PackedGemmMatrixB<btype>(btran, k, n, alpha, B.data())));
+    Bp.emplace_back(
+        std::unique_ptr<PackedGemmMatrixB<btype>>(
+            new PackedGemmMatrixB<btype>(btran, k, n, alpha, B.data())));
   }
   auto kAligned = ((k * sizeof(float) + 64) & ~63) / sizeof(float);
   auto nAligned = ((n * sizeof(float) + 64) & ~63) / sizeof(float);
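
The kAligned/nAligned context lines above round each row's byte size up past a 64-byte boundary before converting back to a float count. A minimal standalone sketch of what that expression computes (illustrative values only, not FBGEMM code):

#include <cstddef>
#include <cstdio>

// Same expression as the kAligned/nAligned lines above: bump the byte count
// up to the next multiple of 64 bytes, then convert back to float elements.
size_t aligned_elems(size_t k) {
  return ((k * sizeof(float) + 64) & ~63) / sizeof(float);
}

int main() {
  std::printf("%zu\n", aligned_elems(100)); // 400 bytes -> 448 bytes -> 112 floats
  std::printf("%zu\n", aligned_elems(64)); // 256 bytes -> 320 bytes -> 80 floats
  return 0;
}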

fbgemm_gpu/codegen/training/index_select/batch_index_select_dim0_cpu_host.cpp

Lines changed: 3 additions & 2 deletions
@@ -403,8 +403,9 @@ class BatchIndexSelectDim0TensorCPUOp
         input_columns,
         permute_output_dim_0_1);
     ctx->saved_data["permute_output_dim_0_1"] = permute_output_dim_0_1;
-    ctx->save_for_backward(std::vector<Tensor>{
-        indices, input_num_indices, input_rows, input_columns, res[1]});
+    ctx->save_for_backward(
+        std::vector<Tensor>{
+            indices, input_num_indices, input_rows, input_columns, res[1]});
     res.resize(1);
     return res;
   }

fbgemm_gpu/codegen/training/index_select/batch_index_select_dim0_host.cpp

Lines changed: 6 additions & 4 deletions
@@ -264,8 +264,9 @@ class BatchIndexSelectDim0GPUOp

     ctx->saved_data["permute_output_dim_0_1"] = permute_output_dim_0_1;

-    ctx->save_for_backward(std::vector<Tensor>{
-        inputs, indices, res[1], res[2], res[3], res[4], res[5], res[6]});
+    ctx->save_for_backward(
+        std::vector<Tensor>{
+            inputs, indices, res[1], res[2], res[3], res[4], res[5], res[6]});

     res.resize(1);
     return res;
@@ -584,8 +585,9 @@ class BatchIndexSelectDim0TensorGPUOp

     ctx->saved_data["permute_output_dim_0_1"] = permute_output_dim_0_1;

-    ctx->save_for_backward(std::vector<Tensor>{
-        inputs, indices, res[1], res[2], res[3], res[4], res[5], res[6]});
+    ctx->save_for_backward(
+        std::vector<Tensor>{
+            inputs, indices, res[1], res[2], res[3], res[4], res[5], res[6]});

     // res.resize(1);
     return res;

fbgemm_gpu/experimental/gen_ai/src/kv_cache/kv_cache_defs.cpp

Lines changed: 15 additions & 10 deletions
@@ -18,16 +18,21 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
   m.def("rope_qkv_varseq_prefill(Tensor XQ, Tensor(a!)? XK, Tensor? XV, Tensor(b!) cache_K, Tensor(c!) cache_V, Tensor varseq_batch, Tensor varseq_seqpos, float theta, int? num_groups=1, Tensor? block_tables=None, int page_size=" STRING(
       DEFAULT_PAGE_SIZE) ", Tensor? actual_batch_size=None, Tensor? varseq_cache_seqpos=None, int cache_logical_dtype_int=0, bool rope_scaling=False, int old_context_len=8192"
       ", float scaling_factor=16, float lo_freq_factor=1, float hi_freq_factor=32, Tensor? qparam_k=None, Tensor? qparam_v=None, bool write_k_back=False, bool k_norm=False,bool update_kv=True, Tensor?amax_qkv=None, Tensor?kv_quant_scale_precomputed=None) -> Tensor");
-  m.def("rope_qkv_decoding(Tensor XQ, Tensor? XK, Tensor? XV, Tensor(a!) cache_K, Tensor(b!) cache_V, Tensor seqpos, float theta, int? num_groups=1, Tensor? block_tables=None, int page_size=" STRING(
-      DEFAULT_PAGE_SIZE) ", Tensor? actual_batch_size=None, Tensor? batch=None, Tensor? cache_seqpos=None, int cache_logical_dtype_int=0, bool rope_scaling=False, int old_context_len=8192, float scaling_factor=16, float lo_freq_factor=1, float hi_freq_factor=32, Tensor? qparam_k=None, Tensor? qparam_v=None, bool k_norm=False, bool update_kv=True, Tensor?amax_qkv=None) -> Tensor");
-  m.def("nope_qkv_varseq_prefill(Tensor XQ, Tensor? XK, Tensor? XV, Tensor(a!) cache_K, Tensor(b!) cache_V, Tensor varseq_batch, Tensor varseq_seqpos, Tensor? block_tables=None, int page_size=" STRING(
-      DEFAULT_PAGE_SIZE) ", Tensor? actual_batch_size=None, Tensor? varseq_cache_seqpos=None, int cache_logical_dtype_int=0, int? num_groups=1, Tensor? qparam_k=None, Tensor? qparam_v=None, bool k_norm=False, bool update_kv=True, Tensor?amax_qkv=None, Tensor?kv_quant_scale_precomputed=None) -> Tensor");
-  m.def("nope_qkv_decoding(Tensor XQ, Tensor? XK, Tensor? XV, Tensor(a!) cache_K, Tensor(b!) cache_V, Tensor seqpos, Tensor? block_tables=None, int page_size=" STRING(
-      DEFAULT_PAGE_SIZE) ", Tensor? actual_batch_size=None, Tensor? batch=None, Tensor? cache_seqpos=None, int cache_logical_dtype_int=0, int? num_groups=1, Tensor? qparam_k=None, Tensor? qparam_v=None, bool k_norm=False, bool update_kv=True, Tensor?amax_qkv=None) -> Tensor");
-  m.def("xpos_qkv_varseq_prefill(Tensor XQ, Tensor XK, Tensor XV, Tensor(a!) cache_K, Tensor(b!) cache_V, Tensor varseq_batch, Tensor varseq_seqpos, float theta, float gamma, float scale_base, float exponent_offset, int? num_groups=1, Tensor? block_tables=None, int page_size=" STRING(
-      DEFAULT_PAGE_SIZE) ", Tensor? actual_batch_size=None, Tensor? varseq_cache_seqpos=None, int cache_logical_dtype_int=0, bool rope_scaling=False, int old_context_len=8192, float scaling_factor=16, float lo_freq_factor=1, float hi_freq_factor=32, Tensor? qparam_k=None, Tensor? qparam_v=None) -> Tensor");
-  m.def("xpos_qkv_decoding(Tensor XQ, Tensor XK, Tensor XV, Tensor(a!) cache_K, Tensor(b!) cache_V, Tensor seqpos, float theta, float gamma, float scale_base, float exponent_offset, int? num_groups=1, Tensor? block_tables=None, int page_size=" STRING(
-      DEFAULT_PAGE_SIZE) ", Tensor? actual_batch_size=None, Tensor? batch=None, Tensor? cache_seqpos=None, int cache_logical_dtype_int=0, bool rope_scaling=False, int old_context_len=8192, float scaling_factor=16, float lo_freq_factor=1, float hi_freq_factor=32, Tensor? qparam_k=None, Tensor? qparam_v=None) -> Tensor");
+  m.def(
+      "rope_qkv_decoding(Tensor XQ, Tensor? XK, Tensor? XV, Tensor(a!) cache_K, Tensor(b!) cache_V, Tensor seqpos, float theta, int? num_groups=1, Tensor? block_tables=None, int page_size=" STRING(
+          DEFAULT_PAGE_SIZE) ", Tensor? actual_batch_size=None, Tensor? batch=None, Tensor? cache_seqpos=None, int cache_logical_dtype_int=0, bool rope_scaling=False, int old_context_len=8192, float scaling_factor=16, float lo_freq_factor=1, float hi_freq_factor=32, Tensor? qparam_k=None, Tensor? qparam_v=None, bool k_norm=False, bool update_kv=True, Tensor?amax_qkv=None) -> Tensor");
+  m.def(
+      "nope_qkv_varseq_prefill(Tensor XQ, Tensor? XK, Tensor? XV, Tensor(a!) cache_K, Tensor(b!) cache_V, Tensor varseq_batch, Tensor varseq_seqpos, Tensor? block_tables=None, int page_size=" STRING(
+          DEFAULT_PAGE_SIZE) ", Tensor? actual_batch_size=None, Tensor? varseq_cache_seqpos=None, int cache_logical_dtype_int=0, int? num_groups=1, Tensor? qparam_k=None, Tensor? qparam_v=None, bool k_norm=False, bool update_kv=True, Tensor?amax_qkv=None, Tensor?kv_quant_scale_precomputed=None) -> Tensor");
+  m.def(
+      "nope_qkv_decoding(Tensor XQ, Tensor? XK, Tensor? XV, Tensor(a!) cache_K, Tensor(b!) cache_V, Tensor seqpos, Tensor? block_tables=None, int page_size=" STRING(
+          DEFAULT_PAGE_SIZE) ", Tensor? actual_batch_size=None, Tensor? batch=None, Tensor? cache_seqpos=None, int cache_logical_dtype_int=0, int? num_groups=1, Tensor? qparam_k=None, Tensor? qparam_v=None, bool k_norm=False, bool update_kv=True, Tensor?amax_qkv=None) -> Tensor");
+  m.def(
+      "xpos_qkv_varseq_prefill(Tensor XQ, Tensor XK, Tensor XV, Tensor(a!) cache_K, Tensor(b!) cache_V, Tensor varseq_batch, Tensor varseq_seqpos, float theta, float gamma, float scale_base, float exponent_offset, int? num_groups=1, Tensor? block_tables=None, int page_size=" STRING(
+          DEFAULT_PAGE_SIZE) ", Tensor? actual_batch_size=None, Tensor? varseq_cache_seqpos=None, int cache_logical_dtype_int=0, bool rope_scaling=False, int old_context_len=8192, float scaling_factor=16, float lo_freq_factor=1, float hi_freq_factor=32, Tensor? qparam_k=None, Tensor? qparam_v=None) -> Tensor");
+  m.def(
+      "xpos_qkv_decoding(Tensor XQ, Tensor XK, Tensor XV, Tensor(a!) cache_K, Tensor(b!) cache_V, Tensor seqpos, float theta, float gamma, float scale_base, float exponent_offset, int? num_groups=1, Tensor? block_tables=None, int page_size=" STRING(
+          DEFAULT_PAGE_SIZE) ", Tensor? actual_batch_size=None, Tensor? batch=None, Tensor? cache_seqpos=None, int cache_logical_dtype_int=0, bool rope_scaling=False, int old_context_len=8192, float scaling_factor=16, float lo_freq_factor=1, float hi_freq_factor=32, Tensor? qparam_k=None, Tensor? qparam_v=None) -> Tensor");
   m.def(
       "dequantize_int4_cache(Tensor cache_K, Tensor cache_V, Tensor kv_seqlen, int? num_groups=1, Tensor? qparam_k=None, Tensor? qparam_v=None) -> (Tensor, Tensor)");
   m.def(
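
The schema strings above splice DEFAULT_PAGE_SIZE into the literal through the STRING(...) macro plus adjacent string-literal concatenation. A minimal sketch of that idiom, with an assumed macro definition and page-size value (the real FBGEMM definitions may differ):

#include <cstdio>

// Two-level expansion so the macro argument is expanded before stringification.
#define STRINGIFY(x) #x
#define STRING(x) STRINGIFY(x)
#define DEFAULT_PAGE_SIZE 64 // assumed value, for illustration only

// "int page_size=" STRING(DEFAULT_PAGE_SIZE) expands to "int page_size=" "64";
// the adjacent literals are then concatenated by the compiler into one string.
static const char* kSchema = "int page_size=" STRING(DEFAULT_PAGE_SIZE);

int main() {
  std::printf("%s\n", kSchema); // prints: int page_size=64
  return 0;
}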

fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/bf16_grouped/kernels/bf16_grouped_kernel_manifest.h

Lines changed: 1 addition & 2 deletions
@@ -13,8 +13,7 @@

 #include <ATen/ATen.h>

-#define KERNEL_NAME_MAP_ENTRY(name) \
-  { #name, name }
+#define KERNEL_NAME_MAP_ENTRY(name) {#name, name}

 template <typename InputType, typename OutputType>
 using GroupedKernel =
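
The KERNEL_NAME_MAP_ENTRY macro pairs a kernel's stringized name with the kernel itself in a braced initializer. A minimal sketch of how such a macro is typically used to build a name-to-function lookup table (the function type and map here are stand-ins, not FBGEMM's GroupedKernel):

#include <cstdio>
#include <map>
#include <string>

#define KERNEL_NAME_MAP_ENTRY(name) {#name, name}

// Stand-in kernel signature for illustration.
using DemoKernel = int (*)(int);

int kernel_a(int x) { return x + 1; }
int kernel_b(int x) { return x * 2; }

// Each entry expands to {"kernel_a", kernel_a}, and so on.
static const std::map<std::string, DemoKernel> kKernels = {
    KERNEL_NAME_MAP_ENTRY(kernel_a),
    KERNEL_NAME_MAP_ENTRY(kernel_b),
};

int main() {
  std::printf("%d\n", kKernels.at("kernel_b")(21)); // prints 42
  return 0;
}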

fbgemm_gpu/experimental/gen_ai/src/quantize/quantize.cu

Lines changed: 14 additions & 10 deletions
@@ -444,8 +444,9 @@ __global__ void scaleMatrix1(
     const int64_t lda) {
   for (int64_t i = threadIdx.x + blockIdx.x * blockDim.x; i < numel;
        i += (size_t)blockDim.x * gridDim.x) {
-    output[i] = T_OUT(scale<QUANTIZE>(
-        static_cast<float>(input[i]), static_cast<float>(input_scale[0])));
+    output[i] = T_OUT(
+        scale<QUANTIZE>(
+            static_cast<float>(input[i]), static_cast<float>(input_scale[0])));
   }
 }

@@ -534,9 +535,10 @@ __global__ void scaleMatrixRowwise1(
     const int64_t lda) {
   for (int64_t i = threadIdx.x + blockIdx.x * blockDim.x; i < numel;
        i += (size_t)blockDim.x * gridDim.x) {
-    output[i] = T_OUT(scale<QUANTIZE>(
-        static_cast<float>(input[i]),
-        static_cast<float>(input_scale[i / lda])));
+    output[i] = T_OUT(
+        scale<QUANTIZE>(
+            static_cast<float>(input[i]),
+            static_cast<float>(input_scale[i / lda])));
   }
 }

@@ -549,9 +551,10 @@ __global__ void scaleMatrixColwise(
     const int64_t lda) {
   for (int64_t i = threadIdx.x + blockIdx.x * blockDim.x; i < numel;
        i += (size_t)blockDim.x * gridDim.x) {
-    output[i] = T_OUT(scale<QUANTIZE>(
-        static_cast<float>(input[i]),
-        static_cast<float>(input_scale[i % lda])));
+    output[i] = T_OUT(
+        scale<QUANTIZE>(
+            static_cast<float>(input[i]),
+            static_cast<float>(input_scale[i % lda])));
   }
 }

@@ -1764,8 +1767,9 @@ __device__ __host__ __inline__ void compute_scale_with_global(
   const double two_level_scale =
       static_cast<double>(scale_format_max) * (elem_format_max / global_amax);

-  const double local_unscale_q = quantize_amax_e4m3(static_cast<float>(
-      local_unscale * two_level_scale)) /
+  const double local_unscale_q =
+      quantize_amax_e4m3(
+          static_cast<float>(local_unscale * two_level_scale)) /
       two_level_scale;
   double eps = FLT_MIN;
   scale = 1. / (local_unscale_q + eps);
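
All three scaleMatrix* kernels above iterate with a grid-stride loop, so a launch of any grid size covers the whole buffer. A minimal sketch of the pattern (an illustrative kernel, not FBGEMM's; names and launch shape are assumptions):

#include <cstdint>

// Each thread starts at its global index and advances by the total number of
// launched threads until it runs past numel.
__global__ void scale_all(float* out, const float* in, float s, int64_t numel) {
  for (int64_t i = threadIdx.x + (int64_t)blockIdx.x * blockDim.x; i < numel;
       i += (int64_t)blockDim.x * gridDim.x) {
    out[i] = in[i] * s;
  }
}

// Example launch: 256 threads per block, 4096 blocks for ~1M elements; the
// loop is still correct if fewer blocks are launched.
// scale_all<<<4096, 256>>>(d_out, d_in, 0.5f, int64_t{1} << 20);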

fbgemm_gpu/experimental/hstu/src/hstu_ampere/hstu_fwd.h

Lines changed: 4 additions & 7 deletions
@@ -564,13 +564,10 @@ inline __device__ void hstu_compute_attn_1rowblock(
   Tensor tOcO = gmem_thr_copy_O.partition_D(cO);
   Tensor tOpO = make_tensor<bool>(make_shape(size<2>(tOgO)));
   // Clear_OOB_K must be false since we don't want to write zeros to gmem
-  flash::
-      copy</*Is_even_MN=*/false, /*Clear_OOB_MN=*/false, /*Clear_OOB_K=*/false>(
-          gmem_tiled_copy_O,
-          tOrO,
-          tOgO,
-          tOcO,
-          actual_seqlen_q - m_block * kBlockM);
+  flash::copy</*Is_even_MN=*/false,
+              /*Clear_OOB_MN=*/false,
+              /*Clear_OOB_K=*/false>(
+      gmem_tiled_copy_O, tOrO, tOgO, tOcO, actual_seqlen_q - m_block * kBlockM);
 }

 template <typename Kernel_traits, typename Params>

fbgemm_gpu/experimental/hstu/src/hstu_hopper/epilogue_bwd_sm90_tma.hpp

Lines changed: 4 additions & 4 deletions
@@ -155,11 +155,11 @@ struct CollectiveEpilogueBwd {
         tdKrdK_out); // ((Atom,AtomNum), MMA_M, MMA_N)
     Tensor taccdVrdV = smem_thr_copy_dKV.retile_S(
         tdVrdV_out); // ((Atom,AtomNum), MMA_M, MMA_N)
-    Tensor taccdKsdK =
-        smem_thr_copy_dKV.partition_D(cute::conditional_return<!dKV_swapAB>(
+    Tensor taccdKsdK = smem_thr_copy_dKV.partition_D(
+        cute::conditional_return<!dKV_swapAB>(
             sdK, sdKt)); // ((Atom,AtomNum),PIPE_M,PIPE_N)
-    Tensor taccdVsdV =
-        smem_thr_copy_dKV.partition_D(cute::conditional_return<!dKV_swapAB>(
+    Tensor taccdVsdV = smem_thr_copy_dKV.partition_D(
+        cute::conditional_return<!dKV_swapAB>(
             sdV, sdVt)); // ((Atom,AtomNum),PIPE_M,PIPE_N)

     // Make sure all WGs have finished reading K and V

fbgemm_gpu/experimental/hstu/src/hstu_hopper/hstu_bwd_kernel.h

Lines changed: 2 additions & 4 deletions
@@ -408,12 +408,10 @@ __global__ void __launch_bounds__(
   using TileShape_MNK = typename Ktraits::TileShape_MNK;
   Tensor tdKrdK = partition_fragment_C(
       tiled_mma_dKV,
-      select < !dKV_swapAB ? 1 : 2,
-      !dKV_swapAB ? 2 : 1 > (TileShape_MNK{}));
+      select<!dKV_swapAB ? 1 : 2, !dKV_swapAB ? 2 : 1>(TileShape_MNK{}));
   Tensor tdVrdV = partition_fragment_C(
       tiled_mma_dKV,
-      select < !dKV_swapAB ? 1 : 2,
-      !dKV_swapAB ? 2 : 1 > (TileShape_MNK{}));
+      select<!dKV_swapAB ? 1 : 2, !dKV_swapAB ? 2 : 1>(TileShape_MNK{}));
   collective_mainloop.mma(
       mainloop_params,
       pipeline_q,

fbgemm_gpu/experimental/hstu/src/hstu_hopper/hstu_bwd_postprocess_kernel.h

Lines changed: 1 addition & 2 deletions
@@ -254,8 +254,7 @@ class FlashAttnBwdPostprocessConvertdQ {
     TiledMma tiled_mma_dQ;
     Tensor taccdQrdQaccum = partition_fragment_C(
         tiled_mma_dQ,
-        select < !dQ_swapAB ? 0 : 1,
-        !dQ_swapAB ? 1 : 0 > (TileShape_MK{}));
+        select<!dQ_swapAB ? 0 : 1, !dQ_swapAB ? 1 : 0>(TileShape_MK{}));
     CUTE_STATIC_ASSERT_V(size(taccdQrdQaccum) == size(tdQsdQaccum));
     Tensor tdQrdQaccum = s2r_thr_copy_dQaccum.retile_D(taccdQrdQaccum);
     cute::copy(s2r_tiled_copy_dQaccum, tdQsdQaccum, tdQrdQaccum);
