From b9e677ca59166a0724653e8a26318be3bf4e8648 Mon Sep 17 00:00:00 2001
From: Emma Lin
Date: Mon, 27 Oct 2025 14:44:13 -0700
Subject: [PATCH] add auto feature score collection to EC (#5030)

Summary:
X-link: https://github.com/meta-pytorch/torchrec/pull/3474

X-link: https://github.com/facebookresearch/FBGEMM/pull/2043

Enable feature score auto collection in ShardedEmbeddingCollection based on a
static feature-to-score mapping. If a user needs a custom score for a specific
id, they can disable auto collection and change the model code explicitly to
collect a score for each id.

Here is a sample eviction policy config in the embedding_table config to
enable auto score collection:

    virtual_table_eviction_policy=FeatureScoreBasedEvictionPolicy(
        training_id_eviction_trigger_count=260_000_000,  # 260M
        training_id_keep_count=160_000_000,  # 160M
        enable_auto_feature_score_collection=True,
        feature_score_mapping={
            "sparse_public_original_content_creator": 1.0,
        },
        feature_score_default_value=0.5,
    ),

Additionally, the counter previously collected during EC dedup is not used by
the kvzch backend, so this diff removes that counter and allows the KJT to
transfer a single float32 weight tensor to the backend. This also enables
feature score collection for EBC, which may already carry a separate float
weight for pooling.

Reviewed By: EddyLXJ

Differential Revision: D83945722
---
 fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py                      | 2 +-
 .../src/dram_kv_embedding_cache/dram_kv_embedding_cache.h      | 4 +---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py b/fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py
index a497cf9a5b..32fb3991f7 100644
--- a/fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py
+++ b/fbgemm_gpu/fbgemm_gpu/tbe/ssd/training.py
@@ -2089,7 +2089,7 @@ def _prefetch(  # noqa C901
                     torch.tensor(
                         [weights.shape[0]], device="cpu", dtype=torch.long
                     ),
-                    weights.cpu().view(torch.float32).view(-1, 2),
+                    weights.cpu(),
                 )

             # Generate row addresses (pointing to either L1 or the current
diff --git a/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h b/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h
index 9738b846cc..98f3a44e35 100644
--- a/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h
+++ b/fbgemm_gpu/src/dram_kv_embedding_cache/dram_kv_embedding_cache.h
@@ -770,7 +770,6 @@ class DramKVEmbeddingCache : public kv_db::EmbeddingKVDB {
     CHECK_EQ(indices.size(0), engege_rates.size(0));
     auto indices_data_ptr = indices.data_ptr();
     auto engage_rate_ptr = engege_rates.data_ptr();
-    int64_t stride = 2;
     {
       auto before_write_lock_ts =
           facebook::WallClockUtil::NowInUsecFast();
@@ -785,8 +784,7 @@ class DramKVEmbeddingCache : public kv_db::EmbeddingKVDB {
          index_iter++) {
       const auto& id_index = *index_iter;
       auto id = int64_t(indices_data_ptr[id_index]);
-      float engege_rate =
-          float(engage_rate_ptr[id_index * stride + 0]);
+      float engege_rate = float(engage_rate_ptr[id_index]);
       // use mempool
       weight_type* block = nullptr;
       auto before_lookup_cache_ts =
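
For illustration only, here is a minimal sketch of how the static
feature-to-score mapping from the Summary is intended to resolve per-id
weights into the single flat float32 tensor the backend now reads. The helper
function and the second feature name are hypothetical and are not TorchRec's
actual API; only the config values are taken from the sample above.

    import torch

    # Config values from the sample eviction policy above.
    feature_score_mapping = {"sparse_public_original_content_creator": 1.0}
    feature_score_default_value = 0.5

    def resolve_feature_scores(feature_names, ids_per_feature):
        # One float32 score per id, chosen by the feature the id came from;
        # unmapped features fall back to the default value.
        per_feature = [
            torch.full(
                (num_ids,),
                feature_score_mapping.get(name, feature_score_default_value),
                dtype=torch.float32,
            )
            for name, num_ids in zip(feature_names, ids_per_feature)
        ]
        # A single flat float32 weight tensor of shape [N], matching the new
        # layout (the old layout viewed the tensor as [-1, 2] score/counter
        # pairs and strided over it).
        return torch.cat(per_feature)

    # 3 ids from the mapped feature, 2 ids from an unmapped one:
    weights = resolve_feature_scores(
        ["sparse_public_original_content_creator", "some_other_sparse_feature"],
        [3, 2],
    )
    # -> tensor([1.0000, 1.0000, 1.0000, 0.5000, 0.5000])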