
Commit 4a03494

[Reland][CPU] Support int8 scaled embedding bag (#3060)
* re-enable scaled_embedding_bag
* only support fp32 out_dtype
1 parent 0d3217d commit 4a03494
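
For orientation, here is a minimal sketch (illustrative only, not code from this commit) of the computation the new tests use as their eager reference for the int8 path: gather the int8 rows of each bag, sum them in fp32 (mode="sum", include_last_offset=True), and apply the per-tensor weight scale. The fused torch.ops.torchao._scaled_embedding_bag kernel is checked against this within 1e-5. All names, shapes, and values below are made up for the example.

import torch

num_emb, emb_dim = 8, 4
qweight = torch.randint(-128, 127, (num_emb, emb_dim), dtype=torch.int8)  # int8 table
weight_scale = torch.tensor(0.05)                                         # per-tensor scale

indices = torch.tensor([0, 2, 5, 1, 7])
offsets = torch.tensor([0, 2, 5])  # include_last_offset=True -> 2 bags

batch_size = offsets.numel() - 1
ref = torch.zeros(batch_size, emb_dim)
for b in range(batch_size):
    bag = indices[int(offsets[b]) : int(offsets[b + 1])]
    # sum the quantized rows of the bag in fp32, then apply the weight scale
    ref[b] = qweight[bag].to(torch.float32).sum(dim=0) * weight_scale

print(ref.shape, ref.dtype)  # torch.Size([2, 4]) torch.float32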

File tree

3 files changed: +124, -46 lines

- test/test_ops.py
- torchao/csrc/cpu/aten_kernels/scaled_embedding_bag.cpp
- torchao/ops.py


test/test_ops.py

Lines changed: 41 additions & 15 deletions
@@ -863,19 +863,10 @@ def test_swizzle_mm():
     )
 
 
-@pytest.mark.skipif(
-    "CPU" not in torch._C._dispatch_dump("torchao::_scaled_embedding_bag"),
-    reason="cpp kernels not built",
-)
-@pytest.mark.parametrize(
-    "multi_hot, batch_size, vector_size, index_type",
-    EMBEDINGBAG_TEST_PARAMS,
-    ids=str,
-)
-def test_scaled_embedding_bag_cpu(multi_hot, batch_size, vector_size, index_type):
-    qtype = torch.float8_e4m3fn
+def _test_scaled_embedding_bag_cpu_helper(
+    multi_hot, batch_size, vector_size, index_type, qtype
+):
     dtype = torch.float32
-    weight_scale = torch.tensor([2.0])
     include_last_offset = True
     mode = "sum"
 
@@ -895,13 +886,18 @@ def test_scaled_embedding_bag_cpu(multi_hot, batch_size, vector_size, index_type
         dtype=dtype,
         include_last_offset=include_last_offset,
     )
-    fp8_weight = m.weight.data.to(qtype)
-    m.weight.data = fp8_weight.to(m.weight.dtype)
+    if qtype == torch.int8:
+        weight_scale = 127.0 / m.weight.data.abs().max()
+        qweight = (m.weight.data * weight_scale).to(qtype)
+    else:
+        weight_scale = torch.tensor([2.0])
+        qweight = m.weight.data.to(qtype)
+    m.weight.data = qweight.to(m.weight.dtype)
 
     with torch.no_grad():
         refe_out = m.forward(indices, offsets) * weight_scale
         test_out = torch.ops.torchao._scaled_embedding_bag(
-            fp8_weight,
+            qweight,
             indices,
             offsets,
             weight_scale,
@@ -912,6 +908,36 @@ def test_scaled_embedding_bag_cpu(multi_hot, batch_size, vector_size, index_type
     torch.testing.assert_close(refe_out, test_out, atol=1e-5, rtol=1e-5)
 
 
+@pytest.mark.skipif(
+    "CPU" not in torch._C._dispatch_dump("torchao::_scaled_embedding_bag"),
+    reason="cpp kernels not built",
+)
+@pytest.mark.parametrize(
+    "multi_hot, batch_size, vector_size, index_type",
+    EMBEDINGBAG_TEST_PARAMS,
+    ids=str,
+)
+def test_scaled_embedding_bag_int8_cpu(multi_hot, batch_size, vector_size, index_type):
+    _test_scaled_embedding_bag_cpu_helper(
+        multi_hot, batch_size, vector_size, index_type, torch.int8
+    )
+
+
+@pytest.mark.skipif(
+    "CPU" not in torch._C._dispatch_dump("torchao::_scaled_embedding_bag"),
+    reason="cpp kernels not built",
+)
+@pytest.mark.parametrize(
+    "multi_hot, batch_size, vector_size, index_type",
+    EMBEDINGBAG_TEST_PARAMS,
+    ids=str,
+)
+def test_scaled_embedding_bag_fp8_cpu(multi_hot, batch_size, vector_size, index_type):
+    _test_scaled_embedding_bag_cpu_helper(
+        multi_hot, batch_size, vector_size, index_type, torch.float8_e4m3fn
+    )
+
+
 @pytest.mark.skipif(
     "CPU" not in torch._C._dispatch_dump("torchao::float8_linear_prepack_cpu")
     or "CPU" not in torch._C._dispatch_dump("torchao::float8_linear_cpu"),

torchao/csrc/cpu/aten_kernels/scaled_embedding_bag.cpp

Lines changed: 79 additions & 30 deletions
@@ -11,19 +11,55 @@ namespace torchao {
 namespace {
 
 #if defined(CPU_CAPABILITY_AVX512)
+using CHUNK =
+    std::tuple<__m512, __m512, __m512, __m512, __m512, __m512, __m512, __m512>;
 static inline __m512 _mm512_load_e4m3_cvt_ps(const at::Float8_e4m3fn *x) {
   __m512 o;
   __m128i v = _mm_loadu_si128(reinterpret_cast<const __m128i *>(x));
   at::vec::CPU_CAPABILITY::cvtfp8e4m3_fp32(v, o);
   return o;
 }
+
+static inline __m512 _mm512_cvt_s8_ps(__m128i x) {
+  return _mm512_cvt_roundepi32_ps(
+      _mm512_cvtepi8_epi32(x), (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+}
+
+static inline CHUNK load_chunk(const at::Float8_e4m3fn *x) {
+  __m512 x0, x1, x2, x3, x4, x5, x6, x7;
+  x0 = _mm512_load_e4m3_cvt_ps(x + 0);
+  x1 = _mm512_load_e4m3_cvt_ps(x + 16);
+  x2 = _mm512_load_e4m3_cvt_ps(x + 32);
+  x3 = _mm512_load_e4m3_cvt_ps(x + 48);
+  x4 = _mm512_load_e4m3_cvt_ps(x + 64);
+  x5 = _mm512_load_e4m3_cvt_ps(x + 80);
+  x6 = _mm512_load_e4m3_cvt_ps(x + 96);
+  x7 = _mm512_load_e4m3_cvt_ps(x + 112);
+  return {x0, x1, x2, x3, x4, x5, x6, x7};
+}
+
+static inline CHUNK load_chunk(const int8_t *x) {
+  __m512i x00, x64;
+  __m512 x0, x1, x2, x3, x4, x5, x6, x7;
+  x00 = _mm512_load_si512(x);
+  x64 = _mm512_load_si512(x + 64);
+  x0 = _mm512_cvt_s8_ps(_mm512_extracti32x4_epi32(x00, 0));
+  x1 = _mm512_cvt_s8_ps(_mm512_extracti32x4_epi32(x00, 1));
+  x2 = _mm512_cvt_s8_ps(_mm512_extracti32x4_epi32(x00, 2));
+  x3 = _mm512_cvt_s8_ps(_mm512_extracti32x4_epi32(x00, 3));
+  x4 = _mm512_cvt_s8_ps(_mm512_extracti32x4_epi32(x64, 0));
+  x5 = _mm512_cvt_s8_ps(_mm512_extracti32x4_epi32(x64, 1));
+  x6 = _mm512_cvt_s8_ps(_mm512_extracti32x4_epi32(x64, 2));
+  x7 = _mm512_cvt_s8_ps(_mm512_extracti32x4_epi32(x64, 3));
+  return {x0, x1, x2, x3, x4, x5, x6, x7};
+}
 #endif
 
-template <typename index_t>
+template <typename index_t, typename data_t>
 inline void _scaled_embedding_bag_krnl(
     const int64_t bs_begin, const int64_t bs_end, const int64_t num_emb,
     const int64_t emb_dim, const index_t last_offset, const index_t *indices,
-    const index_t *offsets, const at::Float8_e4m3fn *weight, const double scale,
+    const index_t *offsets, const data_t *weight, const double scale,
     float *result, const int64_t num_batch) {
 #if defined(CPU_CAPABILITY_AVX512)
   if (emb_dim % 128 == 0) {
@@ -32,6 +68,7 @@ inline void _scaled_embedding_bag_krnl(
     __m512 scale_v = _mm512_set1_ps(scale);
     for (int64_t b = bs_begin; b < bs_end; ++b) {
       __m512 x0, x1, x2, x3, x4, x5, x6, x7;
+      __m512 y0, y1, y2, y3, y4, y5, y6, y7;
      int64_t start_idx = offsets[b];
      int64_t end_idx = ((b + 1) == num_batch && last_offset != -1)
                            ? last_offset
@@ -40,25 +77,19 @@ inline void _scaled_embedding_bag_krnl(
       // load first indices
       int64_t idx = indices[start_idx] * emb_dim + block_dim * block_id;
       float *block_result = result + block_dim * block_id;
-      x0 = _mm512_load_e4m3_cvt_ps(&weight[idx]);
-      x1 = _mm512_load_e4m3_cvt_ps(&weight[idx + 16]);
-      x2 = _mm512_load_e4m3_cvt_ps(&weight[idx + 32]);
-      x3 = _mm512_load_e4m3_cvt_ps(&weight[idx + 48]);
-      x4 = _mm512_load_e4m3_cvt_ps(&weight[idx + 64]);
-      x5 = _mm512_load_e4m3_cvt_ps(&weight[idx + 80]);
-      x6 = _mm512_load_e4m3_cvt_ps(&weight[idx + 96]);
-      x7 = _mm512_load_e4m3_cvt_ps(&weight[idx + 112]);
+      std::tie(x0, x1, x2, x3, x4, x5, x6, x7) = load_chunk(weight + idx);
       for (int64_t j = start_idx + 1; j < end_idx; ++j) {
         // add following idx
         idx = indices[j] * emb_dim + block_dim * block_id;
-        x0 = _mm512_add_ps(x0, _mm512_load_e4m3_cvt_ps(&weight[idx]));
-        x1 = _mm512_add_ps(x1, _mm512_load_e4m3_cvt_ps(&weight[idx + 16]));
-        x2 = _mm512_add_ps(x2, _mm512_load_e4m3_cvt_ps(&weight[idx + 32]));
-        x3 = _mm512_add_ps(x3, _mm512_load_e4m3_cvt_ps(&weight[idx + 48]));
-        x4 = _mm512_add_ps(x4, _mm512_load_e4m3_cvt_ps(&weight[idx + 64]));
-        x5 = _mm512_add_ps(x5, _mm512_load_e4m3_cvt_ps(&weight[idx + 80]));
-        x6 = _mm512_add_ps(x6, _mm512_load_e4m3_cvt_ps(&weight[idx + 96]));
-        x7 = _mm512_add_ps(x7, _mm512_load_e4m3_cvt_ps(&weight[idx + 112]));
+        std::tie(y0, y1, y2, y3, y4, y5, y6, y7) = load_chunk(weight + idx);
+        x0 = _mm512_add_ps(x0, y0);
+        x1 = _mm512_add_ps(x1, y1);
+        x2 = _mm512_add_ps(x2, y2);
+        x3 = _mm512_add_ps(x3, y3);
+        x4 = _mm512_add_ps(x4, y4);
+        x5 = _mm512_add_ps(x5, y5);
+        x6 = _mm512_add_ps(x6, y6);
+        x7 = _mm512_add_ps(x7, y7);
       }
       x0 = _mm512_mul_ps(x0, scale_v);
       x1 = _mm512_mul_ps(x1, scale_v);
@@ -143,6 +174,7 @@ at::Tensor _scaled_embedding_bag_impl(const at::Tensor &qweight,
   int64_t emb_dim = qweight.size(1);
 
   auto index_type = indices.scalar_type();
+  auto qtype = qweight.scalar_type();
   float w_scale = w_scales.data_ptr<float>()[0];
 
   TORCH_CHECK(indices.is_contiguous() && offsets.is_contiguous(),
@@ -154,22 +186,39 @@ at::Tensor _scaled_embedding_bag_impl(const at::Tensor &qweight,
               "_scaled_embedding_bag: only accept contiguous weight");
   TORCH_CHECK(qweight.dim() == 2,
               "_scaled_embedding_bag: only accept weight with dim == 2");
-  TORCH_CHECK(qweight.scalar_type() == c10::ScalarType::Float8_e4m3fn,
-              "_scaled_embedding_bag: only support e4m3fn weight")
+  TORCH_CHECK(qtype == c10::ScalarType::Float8_e4m3fn ||
+                  qtype == c10::ScalarType::Char,
+              "_scaled_embedding_bag: only support e4m3fn and int8 weight")
   // handle last offsets
   int64_t last_offset = indices.numel();
 
   at::Tensor output =
       at::empty({batch_size, emb_dim}, qweight.options().dtype(at::kFloat));
-  AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embeddingbag_cat", [&] {
-    at::Float8_e4m3fn *qweight_ptr = qweight.data_ptr<at::Float8_e4m3fn>();
-    index_t *indices_ptr = indices.data_ptr<index_t>();
-    index_t *offsets_ptr = offsets.data_ptr<index_t>();
-    float *output_ptr = output.data_ptr<float>();
-    _scaled_embedding_bag<index_t, at::Float8_e4m3fn>(
-        output_ptr, qweight_ptr, indices_ptr, offsets_ptr, batch_size, emb_dim,
-        last_offset, w_scale, o_scale);
-  });
+  if (qtype == c10::ScalarType::Float8_e4m3fn) {
+    AT_DISPATCH_INDEX_TYPES(
+        indices.scalar_type(), "_scaled_embedding_bag", [&] {
+          at::Float8_e4m3fn *qweight_ptr =
+              qweight.data_ptr<at::Float8_e4m3fn>();
+          index_t *indices_ptr = indices.data_ptr<index_t>();
+          index_t *offsets_ptr = offsets.data_ptr<index_t>();
+          float *output_ptr = output.data_ptr<float>();
+          _scaled_embedding_bag<index_t, at::Float8_e4m3fn>(
+              output_ptr, qweight_ptr, indices_ptr, offsets_ptr, batch_size,
+              emb_dim, last_offset, w_scale, o_scale);
+        });
+  } else {
+    AT_DISPATCH_INDEX_TYPES(
+        indices.scalar_type(), "_scaled_embedding_bag", [&] {
+          int8_t *qweight_ptr = qweight.data_ptr<int8_t>();
+          index_t *indices_ptr = indices.data_ptr<index_t>();
+          index_t *offsets_ptr = offsets.data_ptr<index_t>();
+          float *output_ptr = output.data_ptr<float>();
+          _scaled_embedding_bag<index_t, int8_t>(
+              output_ptr, qweight_ptr, indices_ptr, offsets_ptr, batch_size,
+              emb_dim, last_offset, w_scale, o_scale);
+        });
+  }
+
   return output;
 }
 
@@ -179,4 +228,4 @@ TORCH_LIBRARY_IMPL(torchao, CPU, m) {
   m.impl("torchao::_scaled_embedding_bag", &_scaled_embedding_bag_impl);
 }
 
-} // namespace torchao
+} // namespace torchao
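
To make the data layout of the AVX512 path easier to follow, here is a hypothetical Python emulation (not code from the repo): each 128-element block of a weight row is consumed as eight 16-lane fp32 vectors (the x0..x7 registers), and for int8 the new load_chunk overload does two 64-byte loads split into 16-byte pieces that are sign-extended and converted via _mm512_cvt_s8_ps, which is numerically just an elementwise cast to float.

import torch

def load_chunk_emulated(row_block: torch.Tensor) -> torch.Tensor:
    # row_block: 128 contiguous int8 (or fp8) weight elements for one 128-wide block
    assert row_block.numel() == 128
    # eight chunks of 16 lanes, each converted to fp32 -- the x0..x7 registers
    return row_block.reshape(8, 16).to(torch.float32)

block = torch.randint(-128, 127, (128,), dtype=torch.int8)
chunks = load_chunk_emulated(block)  # shape (8, 16), dtype float32
assert torch.equal(chunks.flatten(), block.to(torch.float32))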

torchao/ops.py

Lines changed: 4 additions & 1 deletion
@@ -1122,7 +1122,10 @@ def _(
     # Only support include_last_offset == True
     assert include_last_offset == True
     batch_size = offsets.shape[0] - 1
-    return qweight.new_empty(batch_size, qweight.shape[1], dtype=qweight.dtype)
+    # Only support out_dtype == torch.float32
+    # Next setp: support more out_dtype
+    out_dtype = torch.float32
+    return qweight.new_empty(batch_size, qweight.shape[1], dtype=out_dtype)
 
 
 def float8_linear_prepack_cpu(
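
A minimal sketch of the shape/dtype contract the updated fake implementation encodes (an illustrative restatement, not the registered meta kernel verbatim; scaled_embedding_bag_out_spec is a made-up name): with include_last_offset=True the number of bags is offsets.shape[0] - 1, and the output is fp32 regardless of the weight dtype.

import torch

def scaled_embedding_bag_out_spec(qweight: torch.Tensor, offsets: torch.Tensor) -> torch.Tensor:
    # only include_last_offset == True is supported, so offsets holds batch_size + 1 entries
    batch_size = offsets.shape[0] - 1
    # only out_dtype == torch.float32 is supported by this commit
    return qweight.new_empty(batch_size, qweight.shape[1], dtype=torch.float32)

qweight = torch.zeros(16, 128, dtype=torch.int8)
offsets = torch.tensor([0, 2, 5, 9])
out = scaled_embedding_bag_out_spec(qweight, offsets)
print(out.shape, out.dtype)  # torch.Size([3, 128]) torch.float32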
