@@ -11,55 +11,19 @@ namespace torchao {
 namespace {
 
 #if defined(CPU_CAPABILITY_AVX512)
-using CHUNK =
-    std::tuple<__m512, __m512, __m512, __m512, __m512, __m512, __m512, __m512>;
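+// Load 16 fp8 (e4m3) values (128 bits) and widen them to 16 fp32 lanes.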
 static inline __m512 _mm512_load_e4m3_cvt_ps(const at::Float8_e4m3fn *x) {
   __m512 o;
   __m128i v = _mm_loadu_si128(reinterpret_cast<const __m128i *>(x));
   at::vec::CPU_CAPABILITY::cvtfp8e4m3_fp32(v, o);
   return o;
 }
-
-static inline __m512 _mm512_cvt_s8_ps(__m128i x) {
-  return _mm512_cvt_roundepi32_ps(
-      _mm512_cvtepi8_epi32(x), (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
-}
-
-static inline CHUNK load_chunk(const at::Float8_e4m3fn *x) {
-  __m512 x0, x1, x2, x3, x4, x5, x6, x7;
-  x0 = _mm512_load_e4m3_cvt_ps(x + 0);
-  x1 = _mm512_load_e4m3_cvt_ps(x + 16);
-  x2 = _mm512_load_e4m3_cvt_ps(x + 32);
-  x3 = _mm512_load_e4m3_cvt_ps(x + 48);
-  x4 = _mm512_load_e4m3_cvt_ps(x + 64);
-  x5 = _mm512_load_e4m3_cvt_ps(x + 80);
-  x6 = _mm512_load_e4m3_cvt_ps(x + 96);
-  x7 = _mm512_load_e4m3_cvt_ps(x + 112);
-  return {x0, x1, x2, x3, x4, x5, x6, x7};
-}
-
-static inline CHUNK load_chunk(const int8_t *x) {
-  __m512i x00, x64;
-  __m512 x0, x1, x2, x3, x4, x5, x6, x7;
-  x00 = _mm512_load_si512(x);
-  x64 = _mm512_load_si512(x + 64);
-  x0 = _mm512_cvt_s8_ps(_mm512_extracti32x4_epi32(x00, 0));
-  x1 = _mm512_cvt_s8_ps(_mm512_extracti32x4_epi32(x00, 1));
-  x2 = _mm512_cvt_s8_ps(_mm512_extracti32x4_epi32(x00, 2));
-  x3 = _mm512_cvt_s8_ps(_mm512_extracti32x4_epi32(x00, 3));
-  x4 = _mm512_cvt_s8_ps(_mm512_extracti32x4_epi32(x64, 0));
-  x5 = _mm512_cvt_s8_ps(_mm512_extracti32x4_epi32(x64, 1));
-  x6 = _mm512_cvt_s8_ps(_mm512_extracti32x4_epi32(x64, 2));
-  x7 = _mm512_cvt_s8_ps(_mm512_extracti32x4_epi32(x64, 3));
-  return {x0, x1, x2, x3, x4, x5, x6, x7};
-}
 #endif
 
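+// Per bag, sums the fp8 (e4m3) weight rows selected by `indices`, widening
+// to fp32 before accumulation, then applies `scale`.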
-template <typename index_t, typename data_t>
+template <typename index_t>
 inline void _scaled_embedding_bag_krnl(
     const int64_t bs_begin, const int64_t bs_end, const int64_t num_emb,
     const int64_t emb_dim, const index_t last_offset, const index_t *indices,
-    const index_t *offsets, const data_t *weight, const double scale,
+    const index_t *offsets, const at::Float8_e4m3fn *weight, const double scale,
     float *result, const int64_t num_batch) {
 #if defined(CPU_CAPABILITY_AVX512)
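+  // Fast path: when emb_dim is a multiple of 128, each 128-wide block of a
+  // row fits in eight 16-lane fp32 registers.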
   if (emb_dim % 128 == 0) {
@@ -68,7 +32,6 @@ inline void _scaled_embedding_bag_krnl(
     __m512 scale_v = _mm512_set1_ps(scale);
     for (int64_t b = bs_begin; b < bs_end; ++b) {
       __m512 x0, x1, x2, x3, x4, x5, x6, x7;
-      __m512 y0, y1, y2, y3, y4, y5, y6, y7;
       int64_t start_idx = offsets[b];
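+      // The final bag may have no trailing offset entry; use last_offset
+      // (indices.numel()) as its end instead of offsets[b + 1].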
       int64_t end_idx = ((b + 1) == num_batch && last_offset != -1)
                             ? last_offset
@@ -77,19 +40,25 @@ inline void _scaled_embedding_bag_krnl(
         // load the first row of the bag
         int64_t idx = indices[start_idx] * emb_dim + block_dim * block_id;
         float *block_result = result + block_dim * block_id;
-        std::tie(x0, x1, x2, x3, x4, x5, x6, x7) = load_chunk(weight + idx);
+        x0 = _mm512_load_e4m3_cvt_ps(&weight[idx]);
+        x1 = _mm512_load_e4m3_cvt_ps(&weight[idx + 16]);
+        x2 = _mm512_load_e4m3_cvt_ps(&weight[idx + 32]);
+        x3 = _mm512_load_e4m3_cvt_ps(&weight[idx + 48]);
+        x4 = _mm512_load_e4m3_cvt_ps(&weight[idx + 64]);
+        x5 = _mm512_load_e4m3_cvt_ps(&weight[idx + 80]);
+        x6 = _mm512_load_e4m3_cvt_ps(&weight[idx + 96]);
+        x7 = _mm512_load_e4m3_cvt_ps(&weight[idx + 112]);
         for (int64_t j = start_idx + 1; j < end_idx; ++j) {
           // accumulate the remaining rows of the bag
           idx = indices[j] * emb_dim + block_dim * block_id;
-          std::tie(y0, y1, y2, y3, y4, y5, y6, y7) = load_chunk(weight + idx);
-          x0 = _mm512_add_ps(x0, y0);
-          x1 = _mm512_add_ps(x1, y1);
-          x2 = _mm512_add_ps(x2, y2);
-          x3 = _mm512_add_ps(x3, y3);
-          x4 = _mm512_add_ps(x4, y4);
-          x5 = _mm512_add_ps(x5, y5);
-          x6 = _mm512_add_ps(x6, y6);
-          x7 = _mm512_add_ps(x7, y7);
+          x0 = _mm512_add_ps(x0, _mm512_load_e4m3_cvt_ps(&weight[idx]));
+          x1 = _mm512_add_ps(x1, _mm512_load_e4m3_cvt_ps(&weight[idx + 16]));
+          x2 = _mm512_add_ps(x2, _mm512_load_e4m3_cvt_ps(&weight[idx + 32]));
+          x3 = _mm512_add_ps(x3, _mm512_load_e4m3_cvt_ps(&weight[idx + 48]));
+          x4 = _mm512_add_ps(x4, _mm512_load_e4m3_cvt_ps(&weight[idx + 64]));
+          x5 = _mm512_add_ps(x5, _mm512_load_e4m3_cvt_ps(&weight[idx + 80]));
+          x6 = _mm512_add_ps(x6, _mm512_load_e4m3_cvt_ps(&weight[idx + 96]));
+          x7 = _mm512_add_ps(x7, _mm512_load_e4m3_cvt_ps(&weight[idx + 112]));
         }
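+        // scale the accumulated fp32 sums before storing the result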
         x0 = _mm512_mul_ps(x0, scale_v);
         x1 = _mm512_mul_ps(x1, scale_v);
@@ -174,7 +143,6 @@ at::Tensor _scaled_embedding_bag_impl(const at::Tensor &qweight,
   int64_t emb_dim = qweight.size(1);
 
   auto index_type = indices.scalar_type();
-  auto qtype = qweight.scalar_type();
   float w_scale = w_scales.data_ptr<float>()[0];
 
   TORCH_CHECK(indices.is_contiguous() && offsets.is_contiguous(),
@@ -186,39 +154,22 @@ at::Tensor _scaled_embedding_bag_impl(const at::Tensor &qweight,
               "_scaled_embedding_bag: only accept contiguous weight");
   TORCH_CHECK(qweight.dim() == 2,
               "_scaled_embedding_bag: only accept weight with dim == 2");
-  TORCH_CHECK(qweight.scalar_type() == c10::ScalarType::Float8_e4m3fn ||
-                  qweight.scalar_type() == c10::ScalarType::Char,
-              "_scaled_embedding_bag: only support e4m3fn and int8 weight")
+  TORCH_CHECK(qweight.scalar_type() == c10::ScalarType::Float8_e4m3fn,
+              "_scaled_embedding_bag: only support e4m3fn weight");
   // handle the last offset
   int64_t last_offset = indices.numel();
 
   at::Tensor output =
       at::empty({batch_size, emb_dim}, qweight.options().dtype(at::kFloat));
-  if (qweight.scalar_type() == c10::ScalarType::Float8_e4m3fn) {
-    AT_DISPATCH_INDEX_TYPES(
-        indices.scalar_type(), "_scaled_embedding_bag", [&] {
-          at::Float8_e4m3fn *qweight_ptr =
-              qweight.data_ptr<at::Float8_e4m3fn>();
-          index_t *indices_ptr = indices.data_ptr<index_t>();
-          index_t *offsets_ptr = offsets.data_ptr<index_t>();
-          float *output_ptr = output.data_ptr<float>();
-          _scaled_embedding_bag<index_t, at::Float8_e4m3fn>(
-              output_ptr, qweight_ptr, indices_ptr, offsets_ptr, batch_size,
-              emb_dim, last_offset, w_scale, o_scale);
-        });
-  } else {
-    AT_DISPATCH_INDEX_TYPES(
-        indices.scalar_type(), "_scaled_embedding_bag", [&] {
-          int8_t *qweight_ptr = qweight.data_ptr<int8_t>();
-          index_t *indices_ptr = indices.data_ptr<index_t>();
-          index_t *offsets_ptr = offsets.data_ptr<index_t>();
-          float *output_ptr = output.data_ptr<float>();
-          _scaled_embedding_bag<index_t, int8_t>(
-              output_ptr, qweight_ptr, indices_ptr, offsets_ptr, batch_size,
-              emb_dim, last_offset, w_scale, o_scale);
-        });
-  }
-
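+  // Dispatch on the index dtype (int32 or int64) and run the fp8 kernel.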
+  AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "_scaled_embedding_bag", [&] {
+    at::Float8_e4m3fn *qweight_ptr = qweight.data_ptr<at::Float8_e4m3fn>();
+    index_t *indices_ptr = indices.data_ptr<index_t>();
+    index_t *offsets_ptr = offsets.data_ptr<index_t>();
+    float *output_ptr = output.data_ptr<float>();
+    _scaled_embedding_bag<index_t, at::Float8_e4m3fn>(
+        output_ptr, qweight_ptr, indices_ptr, offsets_ptr, batch_size, emb_dim,
+        last_offset, w_scale, o_scale);
+  });
   return output;
 }
 