From a155592256173ddd837fad5708d698b69c70057a Mon Sep 17 00:00:00 2001 From: JacoCheung Date: Fri, 24 Oct 2025 09:13:05 +0000 Subject: [PATCH 1/3] Refactor training folder and docs --- examples/commons/utils/logger.py | 20 +- examples/commons/utils/stringify.py | 6 +- examples/hstu/README.md | 2 +- examples/hstu/training/README.md | 51 ++++ examples/hstu/training/__init__.py | 2 - examples/hstu/training/benchmark/README.md | 9 +- .../benchmark/hstu_layer_benchmark.py | 2 +- .../benchmark/run_hstu_layer_benchmark.sh | 8 +- examples/hstu/training/pretrain_gr_ranking.py | 44 +-- .../hstu/training/pretrain_gr_retrieval.py | 53 ++-- examples/hstu/training/trainer/__init__.py | 0 .../{training_impl.py => trainer/training.py} | 2 +- .../{training_utils.py => trainer/utils.py} | 0 examples/hstu/utils/gin_args_doc.md | 250 ++++++++++++++++++ pyproject.toml | 1 + 15 files changed, 389 insertions(+), 61 deletions(-) create mode 100644 examples/hstu/training/README.md delete mode 100644 examples/hstu/training/__init__.py create mode 100644 examples/hstu/training/trainer/__init__.py rename examples/hstu/training/{training_impl.py => trainer/training.py} (99%) rename examples/hstu/training/{training_utils.py => trainer/utils.py} (100%) create mode 100644 examples/hstu/utils/gin_args_doc.md diff --git a/examples/commons/utils/logger.py b/examples/commons/utils/logger.py index 6f2f3225..e66ef6ab 100644 --- a/examples/commons/utils/logger.py +++ b/examples/commons/utils/logger.py @@ -12,16 +12,30 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from datetime import datetime +import logging import torch +from rich.console import Console +from rich.logging import RichHandler + +# Set up logger with RichHandler if not already configured + +console = Console() +_LOGGER = logging.getLogger("rich_rank0") + +if not _LOGGER.hasHandlers(): + handler = RichHandler( + console=console, show_time=True, show_path=False, rich_tracebacks=True + ) + _LOGGER.addHandler(handler) + _LOGGER.propagate = False + _LOGGER.setLevel(logging.INFO) def print_rank_0(message): """If distributed is initialized, print only on rank 0.""" if torch.distributed.is_initialized(): - now = datetime.now() if torch.distributed.get_rank() == 0: - print(f"[{now}] " + message, flush=True) + _LOGGER.info(message) else: print(message, flush=True) diff --git a/examples/commons/utils/stringify.py b/examples/commons/utils/stringify.py index ac834b38..93986bfc 100644 --- a/examples/commons/utils/stringify.py +++ b/examples/commons/utils/stringify.py @@ -34,11 +34,11 @@ def stringify_dict(input_dict, prefix="", sep=","): value.float() assert value.dim() == 0 value = value.cpu().item() - output += key + ":" + f"{value:6f}{sep}" + output += key + ": " + f"{value:6f}{sep}" elif isinstance(value, float): - output += key + ":" + f"{value:6f}{sep}" + output += key + ": " + f"{value:6f}{sep}" elif isinstance(value, int): - output += key + ":" + f"{value}{sep}" + output += key + ": " + f"{value}{sep}" else: assert RuntimeError(f"stringify dict not supports type {type(value)}") # remove the ending sep diff --git a/examples/hstu/README.md b/examples/hstu/README.md index 19e56e40..e2b2e9bf 100644 --- a/examples/hstu/README.md +++ b/examples/hstu/README.md @@ -1,4 +1,4 @@ -# Examples: to demonstrate how to train generative recommendation models +# Examples: to demonstrate how to do training and inference generative recommendation models 
## Generative Recommender Introduction
Meta's paper ["Actions Speak Louder Than Words"](https://arxiv.org/abs/2402.17152) introduces a novel paradigm for recommendation systems called **Generative Recommenders(GRs)**, which reformulates recommendation tasks as generative modeling problems. The work introduced Hierarchical Sequential Transduction Units (HSTU), a novel architecture designed to handle high-cardinality, non-stationary data streams in large-scale recommendation systems. HSTU enables both retrieval and ranking tasks. As noted in the paper, "HSTU-based GRs, with 1.5 trillion parameters, improve metrics in online A/B tests by 12.4% and have been deployed on multiple surfaces of a large internet platform with billions of users."
diff --git a/examples/hstu/training/README.md b/examples/hstu/training/README.md
new file mode 100644
index 00000000..4275590b
--- /dev/null
+++ b/examples/hstu/training/README.md
@@ -0,0 +1,51 @@
+# HSTU Training example
+
+We support both retrieval and ranking models whose backbones are HSTU layers. In this example collection, we allow users to specify the model structure via a gin-config file. Supported datasets are listed below. Regarding the gin-config interface, please refer to the [inline comments](../utils/gin_config_args.py).
+
+## Parallelism Introduction
+To support large embedding tables and the scaling laws of the dense HSTU layers, this example integrates **[TorchRec](https://github.com/pytorch/torchrec)**, which shards the embedding tables, and **[Megatron-LM](https://github.com/NVIDIA/Megatron-LM)**, which enables dense parallelism (e.g., Data, Tensor, Sequence, Pipeline, and Context parallelism).
+This integration ensures efficient training by coordinating sparse (embedding) and dense (context/data) parallelisms within a single model.
+![parallelism](../figs/parallelism.png)
+
+
+## Dataset Introduction
+
+We support several datasets, as listed in the following sections:
+
+### Dataset Information
+#### **MovieLens**
+Refer to [MovieLens 1M](https://grouplens.org/datasets/movielens/1m/) and [MovieLens 20M](https://www.kaggle.com/datasets/grouplens/movielens-20m-dataset) for details.
+#### **KuaiRand**
+
+| dataset       | # users | seqlen max | seqlen min | seqlen mean | seqlen median | # items    |
+|---------------|---------|------------|------------|-------------|---------------|------------|
+| kuairand_pure | 27285   | 910        | 1          | 1           | 39            | 7551       |
+| kuairand_1k   | 1000    | 49332      | 10         | 5038        | 3379          | 4369953    |
+| kuairand_27k  | 27285   | 228000     | 100        | 11796       | 8591          | 32038725   |
+
+Refer to [KuaiRand](https://kuairand.com/) for details.
+
+## Running the examples
+
+Before getting started, please make sure that all pre-requisites are fulfilled. You can refer to [Get Started][../../../README] section in the root directory of the repo to set up the environment.****
+
+
+### Start training
+The entrypoints for training are `pretrain_gr_retrieval.py` and `pretrain_gr_ranking.py`. We use gin-config to specify the model structure, training arguments, hyper-parameters, etc.
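For reference, a training gin file can look like the following minimal sketch. The values below are illustrative placeholders rather than the contents of the shipped `movielen_retrieval.gin`; the full set of configurable fields is documented in the inline comments of `gin_config_args.py`:

```python
# Illustrative gin bindings for a retrieval run (placeholder values, not the shipped config).
TrainerArgs.train_batch_size = 128
TrainerArgs.eval_batch_size = 128
TrainerArgs.max_train_iters = 1000

DatasetArgs.dataset_name = "ml-20m"
DatasetArgs.max_sequence_length = 200

# HSTU backbone: hidden_size = num_attention_heads * kv_channels here.
NetworkArgs.num_layers = 4
NetworkArgs.hidden_size = 256
NetworkArgs.num_attention_heads = 4
NetworkArgs.kv_channels = 64

OptimizerArgs.optimizer_str = "adam"
OptimizerArgs.learning_rate = 1e-3

RetrievalArgs.num_negatives = 128
```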
+ +Command to run retrieval task with `MovieLens 20m` dataset: + +```bash +# Before running the `pretrain_gr_retrieval.py`, make sure that current working directory is `hstu` +cd examples/hstu +PYTHONPATH=${PYTHONPATH}:$(realpath ../) torchrun --nproc_per_node 1 --master_addr localhost --master_port 6000 ./training/pretrain_gr_retrieval.py --gin-config-file ./training/configs/movielen_retrieval.gin +``` + +To run ranking task with `MovieLens 20m` dataset: +```bash +# Before running the `pretrain_gr_ranking.py`, make sure that current working directory is `hstu` +cd examples/hstu +PYTHONPATH=${PYTHONPATH}:$(realpath ../) torchrun --nproc_per_node 1 --master_addr localhost --master_port 6000 ./training/pretrain_gr_ranking.py --gin-config-file ./training/configs/movielen_ranking.gin +``` + + diff --git a/examples/hstu/training/__init__.py b/examples/hstu/training/__init__.py deleted file mode 100644 index 270ce28a..00000000 --- a/examples/hstu/training/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .training_impl import * # pylint: disable=wildcard-import -from .training_utils import * # pylint: disable=wildcard-import diff --git a/examples/hstu/training/benchmark/README.md b/examples/hstu/training/benchmark/README.md index 42f83069..27358ac9 100644 --- a/examples/hstu/training/benchmark/README.md +++ b/examples/hstu/training/benchmark/README.md @@ -13,7 +13,7 @@ You can run script `run_hstu_benchmark.sh` to see the performance over the base ## How to run -The test entry is `python ./benchmark/hstu_layer_benchmark.py run`, you can type `python ./benchmark/hstu_layer_benchmark.py run --help` to get the input arguments. 4 important arguments are : +The test entry is `python ./training/benchmark/hstu_layer_benchmark.py run`, you can type `python ./training/benchmark/hstu_layer_benchmark.py run --help` to get the input arguments. 4 important arguments are : 1. --kernel-backend: select the hstu mha backend. Could be `triton` or `cutlass`. 2. --fuse-norm-mul-dropout: knob of `layer norm + multiplication + dropout ` fusion. Could be `False` or `True` @@ -23,7 +23,9 @@ The test entry is `python ./benchmark/hstu_layer_benchmark.py run`, you can type Our baseline cmd example (1K): ```bash -python ./benchmark/hstu_layer_benchmark.py run \ + +cd recsys-examples/examples/hstu +python ./training/benchmark/hstu_layer_benchmark.py run \ --iters 100 \ --warmup-iters 50 \ --layer-type native \ @@ -40,7 +42,8 @@ python ./benchmark/hstu_layer_benchmark.py run \ You can also run a set of arguments with run.sh: ```bash -bash run_hstu_layer_benchmark.sh +cd recsys-examples/examples/hstu +bash ./training/benchmark/run_hstu_layer_benchmark.sh ``` After one run is done, a memory snapshot file in current working directory is generated, you can trace the memory usage with the file. Please refer to [PyTorch docs](https://docs.pytorch.org/docs/stable/torch_cuda_memory.html) on how to visualize the memory trace. 
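For reference, the memory snapshot can also be produced in a standalone script via PyTorch's memory-history hooks. The sketch below is illustrative and assumes PyTorch >= 2.1 with a CUDA device; the benchmark records its snapshot internally, so nothing here is required for the steps above:

```python
# Minimal sketch: record a CUDA memory snapshot around an arbitrary workload
# (assumes PyTorch >= 2.1; see the torch_cuda_memory docs linked above).
import torch

torch.cuda.memory._record_memory_history(max_entries=100_000)

# Stand-in workload; replace with the HSTU layer forward/backward of interest.
buffers = [torch.randn(1024, 1024, device="cuda") for _ in range(8)]
out = torch.stack(buffers).sum(dim=0)
torch.cuda.synchronize()

# Dump a pickle that can be loaded at https://pytorch.org/memory_viz
torch.cuda.memory._dump_snapshot("memory_snapshot.pickle")
torch.cuda.memory._record_memory_history(enabled=None)  # stop recording
```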
diff --git a/examples/hstu/training/benchmark/hstu_layer_benchmark.py b/examples/hstu/training/benchmark/hstu_layer_benchmark.py index c47742f6..1e2765ae 100644 --- a/examples/hstu/training/benchmark/hstu_layer_benchmark.py +++ b/examples/hstu/training/benchmark/hstu_layer_benchmark.py @@ -47,7 +47,7 @@ from modules.jagged_data import JaggedData from modules.native_hstu_layer import HSTULayer as NativeHSTULayer from ops.length_to_offsets import length_to_complete_offsets -from training.utils import cal_flops_single_rank +from training.trainer.utils import cal_flops_single_rank _backend_str_to_type = { "cutlass": KernelBackend.CUTLASS, diff --git a/examples/hstu/training/benchmark/run_hstu_layer_benchmark.sh b/examples/hstu/training/benchmark/run_hstu_layer_benchmark.sh index 3984dfe9..2bae6a38 100644 --- a/examples/hstu/training/benchmark/run_hstu_layer_benchmark.sh +++ b/examples/hstu/training/benchmark/run_hstu_layer_benchmark.sh @@ -32,7 +32,7 @@ for dim_per_head in "${dim_per_heads[@]}"; do fi echo -e "\n\033[32mbaseline hstu layer \033[0m:" ${nsys_profile_cmd//${baseline_profile_name}} \ - python ./benchmark/hstu_layer_benchmark.py run \ + python ./training/benchmark/hstu_layer_benchmark.py run \ --iters 100 \ --warmup-iters 50 \ --kernel-backend triton \ @@ -53,7 +53,7 @@ for dim_per_head in "${dim_per_heads[@]}"; do echo -e "\n\033[32m +cutlass\033[0m:" ${nsys_profile_cmd//${cutlass_profile_name}} \ - python ./benchmark/hstu_layer_benchmark.py run \ + python ./training/benchmark/hstu_layer_benchmark.py run \ --iters 100 \ --warmup-iters 50 \ --kernel-backend cutlass \ @@ -73,7 +73,7 @@ for dim_per_head in "${dim_per_heads[@]}"; do echo -e "\n\033[32m +fused\033[0m:" ${nsys_profile_cmd//${fused_profile_name}} \ - python ./benchmark/hstu_layer_benchmark.py run \ + python ./training/benchmark/hstu_layer_benchmark.py run \ --iters 100 \ --warmup-iters 50 \ --kernel-backend cutlass \ @@ -93,7 +93,7 @@ for dim_per_head in "${dim_per_heads[@]}"; do echo -e "\n\033[32m + recompute\033[0m:" ${nsys_profile_cmd//${recompute_profile_name}} \ - python ./benchmark/hstu_layer_benchmark.py run \ + python ./training/benchmark/hstu_layer_benchmark.py run \ --iters 100 \ --warmup-iters 50 \ --kernel-backend cutlass \ diff --git a/examples/hstu/training/pretrain_gr_ranking.py b/examples/hstu/training/pretrain_gr_ranking.py index 993fc753..9ff0b32e 100644 --- a/examples/hstu/training/pretrain_gr_ranking.py +++ b/examples/hstu/training/pretrain_gr_ranking.py @@ -18,7 +18,7 @@ warnings.filterwarnings("ignore", category=FutureWarning) warnings.filterwarnings("ignore", category=SyntaxWarning) import argparse -from functools import partial # pylint: disable-unused-import +from typing import List, Union import commons.utils.initialize as init import gin @@ -34,7 +34,8 @@ JaggedMegatronTrainNonePipeline, JaggedMegatronTrainPipelineSparseDist, ) -from training import ( +from trainer.training import maybe_load_ckpts, train_with_pipeline +from trainer.utils import ( create_dynamic_optitons_dict, create_embedding_configs, create_hstu_config, @@ -42,10 +43,11 @@ get_data_loader, get_dataset_and_embedding_args, get_embedding_vector_storage_multiplier, - maybe_load_ckpts, - train_with_pipeline, ) -from utils import ( +from utils import ( # from hstu.utils + BenchmarkDatasetArgs, + DatasetArgs, + EmbeddingArgs, NetworkArgs, OptimizerArgs, RankingArgs, @@ -53,20 +55,12 @@ TrainerArgs, ) -parser = argparse.ArgumentParser( - description="Distributed GR Arguments", allow_abbrev=False -) 
-parser.add_argument("--gin-config-file", type=str) -args = parser.parse_args() -gin.parse_config_file(args.gin_config_file) -trainer_args = TrainerArgs() -dataset_args, embedding_args = get_dataset_and_embedding_args() -network_args = NetworkArgs() -optimizer_args = OptimizerArgs() -tp_args = TensorModelParallelArgs() - -def create_ranking_config() -> RankingConfig: +def create_ranking_config( + dataset_args: Union[DatasetArgs, BenchmarkDatasetArgs], + network_args: NetworkArgs, + embedding_args: List[EmbeddingArgs], +) -> RankingConfig: ranking_args = RankingArgs() return RankingConfig( @@ -82,6 +76,18 @@ def create_ranking_config() -> RankingConfig: def main(): + parser = argparse.ArgumentParser( + description="HSTU Example Arguments", allow_abbrev=False + ) + parser.add_argument("--gin-config-file", type=str) + args = parser.parse_args() + gin.parse_config_file(args.gin_config_file) + trainer_args = TrainerArgs() + dataset_args, embedding_args = get_dataset_and_embedding_args() + network_args = NetworkArgs() + optimizer_args = OptimizerArgs() + tp_args = TensorModelParallelArgs() + init.initialize_distributed() init.initialize_model_parallel( tensor_model_parallel_size=tp_args.tensor_model_parallel_size @@ -92,7 +98,7 @@ def main(): f"distributed env initialization done. Free cuda memory: {free_memory / (1024 ** 2):.2f} MB" ) hstu_config = create_hstu_config(network_args, tp_args) - task_config = create_ranking_config() + task_config = create_ranking_config(dataset_args, network_args, embedding_args) model = get_ranking_model(hstu_config=hstu_config, task_config=task_config) dynamic_options_dict = create_dynamic_optitons_dict( diff --git a/examples/hstu/training/pretrain_gr_retrieval.py b/examples/hstu/training/pretrain_gr_retrieval.py index ec3d0486..c628c535 100644 --- a/examples/hstu/training/pretrain_gr_retrieval.py +++ b/examples/hstu/training/pretrain_gr_retrieval.py @@ -18,7 +18,7 @@ warnings.filterwarnings("ignore", category=FutureWarning) warnings.filterwarnings("ignore", category=SyntaxWarning) import argparse -from functools import partial # pylint: disable-unused-import +from typing import List, Union import commons.utils.initialize as init import gin @@ -32,18 +32,20 @@ JaggedMegatronTrainNonePipeline, JaggedMegatronTrainPipelineSparseDist, ) -from training import ( +from trainer.training import maybe_load_ckpts, train_with_pipeline +from trainer.utils import ( create_dynamic_optitons_dict, - create_embedding_config, + create_embedding_configs, create_hstu_config, create_optimizer_params, get_data_loader, get_dataset_and_embedding_args, get_embedding_vector_storage_multiplier, - maybe_load_ckpts, - train_with_pipeline, ) -from utils import ( +from utils import ( # from hstu.utils + BenchmarkDatasetArgs, + DatasetArgs, + EmbeddingArgs, NetworkArgs, OptimizerArgs, RetrievalArgs, @@ -51,27 +53,18 @@ TrainerArgs, ) -parser = argparse.ArgumentParser( - description="Distributed GR Arguments", allow_abbrev=False -) -parser.add_argument("--gin-config-file", type=str) -args = parser.parse_args() -gin.parse_config_file(args.gin_config_file) -trainer_args = TrainerArgs() -dataset_args, embedding_args = get_dataset_and_embedding_args() -network_args = NetworkArgs() -optimizer_args = OptimizerArgs() -tp_args = TensorModelParallelArgs() - -def create_retrieval_config() -> RetrievalConfig: +def create_retrieval_config( + dataset_args: Union[DatasetArgs, BenchmarkDatasetArgs], + network_args: NetworkArgs, + embedding_args: List[EmbeddingArgs], +) -> RetrievalConfig: retrieval_args = 
RetrievalArgs() return RetrievalConfig( - embedding_configs=[ - create_embedding_config(network_args.hidden_size, arg) - for arg in embedding_args - ], + embedding_configs=create_embedding_configs( + dataset_args, network_args, embedding_args + ), temperature=retrieval_args.temperature, l2_norm_eps=retrieval_args.l2_norm_eps, num_negatives=retrieval_args.num_negatives, @@ -80,6 +73,18 @@ def create_retrieval_config() -> RetrievalConfig: def main(): + parser = argparse.ArgumentParser( + description="Distributed GR Arguments", allow_abbrev=False + ) + parser.add_argument("--gin-config-file", type=str) + args = parser.parse_args() + gin.parse_config_file(args.gin_config_file) + trainer_args = TrainerArgs() + dataset_args, embedding_args = get_dataset_and_embedding_args() + network_args = NetworkArgs() + optimizer_args = OptimizerArgs() + tp_args = TensorModelParallelArgs() + init.initialize_distributed() init.initialize_model_parallel( tensor_model_parallel_size=tp_args.tensor_model_parallel_size @@ -87,7 +92,7 @@ def main(): init.set_random_seed(trainer_args.seed) hstu_config = create_hstu_config(network_args, tp_args) - task_config = create_retrieval_config() + task_config = create_retrieval_config(dataset_args, network_args, embedding_args) model = get_retrieval_model(hstu_config=hstu_config, task_config=task_config) dynamic_options_dict = create_dynamic_optitons_dict( diff --git a/examples/hstu/training/trainer/__init__.py b/examples/hstu/training/trainer/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/examples/hstu/training/training_impl.py b/examples/hstu/training/trainer/training.py similarity index 99% rename from examples/hstu/training/training_impl.py rename to examples/hstu/training/trainer/training.py index a3fe7a0b..a4b0254c 100644 --- a/examples/hstu/training/training_impl.py +++ b/examples/hstu/training/trainer/training.py @@ -31,7 +31,7 @@ JaggedMegatronTrainNonePipeline, JaggedMegatronTrainPipelineSparseDist, ) -from training.training_utils import cal_flops +from trainer.utils import cal_flops from utils import TrainerArgs diff --git a/examples/hstu/training/training_utils.py b/examples/hstu/training/trainer/utils.py similarity index 100% rename from examples/hstu/training/training_utils.py rename to examples/hstu/training/trainer/utils.py diff --git a/examples/hstu/utils/gin_args_doc.md b/examples/hstu/utils/gin_args_doc.md new file mode 100644 index 00000000..ff37215b --- /dev/null +++ b/examples/hstu/utils/gin_args_doc.md @@ -0,0 +1,250 @@ +# Gin Configurable Interfaces Documentation + +This document provides comprehensive documentation for all configurable hypara-params that used by both inference and training + + +## 1. TrainerArgs - Trainer Configuration + +Training-related parameters and settings. + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `train_batch_size` | int | - | **Required**. Batch size per GPU. When TP is enabled, the theoretical batch size is (train_batch_size × tp_size) | +| `eval_batch_size` | int | - | **Required**. 
Evaluation batch size | +| `eval_interval` | int | 100 | Evaluation interval in iterations | +| `log_interval` | int | 100 | Logging interval in iterations | +| `max_train_iters` | Optional[int] | None | Maximum training iterations | +| `max_eval_iters` | Optional[int] | None | Maximum evaluation iterations | +| `seed` | int | 1234 | Random seed | +| `profile` | bool | False | Enable profiling | +| `profile_step_start` | int | 100 | Profiling start step | +| `profile_step_end` | int | 200 | Profiling end step | +| `ckpt_save_interval` | int | -1 | Checkpoint save interval, -1 means no checkpoint saving | +| `ckpt_save_dir` | str | "./checkpoints" | Checkpoint save directory | +| `ckpt_load_dir` | str | "" | Checkpoint load directory | +| `pipeline_type` | str | "native" | Pipeline overlap type: `none` (no overlap), `native` (overlap h2d, input dist, fwd+bwd), `prefetch` (includes prefetch overlap) | + +--- + + +## 2. EmbeddingArgs - Embedding Configuration + +Base embedding layer configuration parameters. + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `feature_names` | List[str] | - | **Required**. List of feature names | +| `table_name` | str | - | **Required**. Embedding table name | +| `item_vocab_size_or_capacity` | int | - | **Required**. For dynamic embedding: capacity; for static embedding: vocabulary size | +| `sharding_type` | str | "None" | Sharding type, must be "data_parallel" or "model_parallel" | + +--- + +## 3. DynamicEmbeddingArgs - Dynamic Embedding Configuration + +Extends `EmbeddingArgs` with dynamic embedding-specific parameters. + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `global_hbm_for_values` | Optional[int] | None | Global HBM size in bytes (highest priority) | +| `item_vocab_gpu_capacity` | Optional[float] | None | Item vocabulary GPU capacity (second priority) | +| `item_vocab_gpu_capacity_ratio` | Optional[float] | None | Item vocabulary GPU capacity ratio (lowest priority) | +| `evict_strategy` | str | "lru" | Eviction strategy: "lru" or "lfu" | +| `caching` | bool | False | Enable caching on HMB. When caching is enabled, the global_hbm_for_values indicates the cache size | + +**Note**: `sharding_type` is automatically set to "model_parallel" + +**Precedence**: The first 3 params can be used for setting the HBM size for dynamic embedding, but there is a precedence relationship: `global_hbm_for_values` > `item_vocab_gpu_capacity` > `item_vocab_gpu_capacity_ratio`. When only `item_vocab_gpu_capacity_ratio` is given, `item_vocab_gpu_capacity = item_vocab_gpu_capacity_ratio * item_vocab_size_or_capacity` and `global_hbm_for_values` are deduced based on the optimizer and embedding dims. + +**Note**: A table could be only one of type EmbeddingArgs or DynamicEmbeddingArgs. + +**Note**: When movielen\* or kuairand\* dataset are used, DynamicEmbeddingArgs/EmbeddingArgs are predefined. See [get_dataset_and_embedding_args() func](../hstu/training/trainer/utils.py) + +--- + +## 4. DatasetArgs - Dataset Configuration + +Dataset-related configuration parameters. + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `dataset_name` | str | - | **Required**. Dataset name | +| `max_sequence_length` | int | - | **Required**. 
Maximum sequence length | +| `dataset_path` | Optional[str] | None | Path to dataset | +| `max_num_candidates` | int | 0 | Maximum number of candidates | +| `shuffle` | bool | False | Whether to shuffle data | + +**Note**: `dataset_path` could be none if your dataset is preprocessed and moved under /hstu/tmp_data folder or you're running with `BenchmarkDatasetArgs` which is a in-memory random data generator. Please refer to [example](../hstu/training/configs/benchmark_ranking.gin). + + +--- + +## 5. FeatureArgs - Feature Configuration + +Feature-specific configuration parameters. + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `feature_names` | List[str] | - | **Required**. List of feature names | +| `max_sequence_length` | int | - | **Required**. Maximum sequence length | +| `is_jagged` | bool | False | Whether features are jagged (variable length) | + FeatureArgs and DatasetArgs + +**Note**: `FeatureArgs` are only used when the dataset is of `BenchmarkDatasetArgs`. + +--- +## 6. BenchmarkDatasetArgs - Benchmark Dataset Configuration + +Configuration for benchmark datasets combining features and embeddings. + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `feature_args` | List[FeatureArgs] | - | **Required**. List of feature arguments | +| `embedding_args` | List[Union[EmbeddingArgs, DynamicEmbeddingArgs]] | - | **Required**. List of embedding arguments | +| `item_feature_name` | str | - | **Required**. Item feature name | +| `contextual_feature_names` | List[str] | - | **Required**. List of contextual feature names | +| `action_feature_name` | Optional[str] | None | Action feature name | +| `max_num_candidates` | int | 0 | Maximum number of candidates | + +--- + +## 7. NetworkArgs - Network Architecture Configuration + +Neural network architecture parameters. + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `num_layers` | int | - | **Required**. Number of layers | +| `hidden_size` | int | - | **Required**. Hidden layer size | +| `num_attention_heads` | int | - | **Required**. Number of attention heads | +| `kv_channels` | int | - | **Required**. Key-value channels | +| `hidden_dropout` | float | 0.2 | Hidden layer dropout rate | +| `norm_epsilon` | float | 1e-5 | Normalization epsilon | +| `is_causal` | bool | True | Use causal attention mask | +| `dtype_str` | str | "bfloat16" | Data type: "bfloat16" or "float16" | +| `kernel_backend` | str | "cutlass" | Kernel backend: "cutlass", "triton", or "pytorch" | +| `target_group_size` | int | 1 | Target group size | +| `num_position_buckets` | int | 8192 | Number of position buckets | +| `recompute_input_layernorm` | bool | False | Recompute input layer normalization | +| `recompute_input_silu` | bool | False | Recompute input SiLU activation | +| `item_embedding_dim` | int | -1 | Item embedding dimension | +| `contextual_embedding_dim` | int | -1 | Contextual embedding dimension | + +--- + +## 8. OptimizerArgs - Optimizer Configuration + +Optimizer-related parameters. + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `optimizer_str` | str | - | **Required**. Optimizer name | +| `learning_rate` | float | - | **Required**. Learning rate | +| `adam_beta1` | float | 0.9 | Adam optimizer beta1 parameter | +| `adam_beta2` | float | 0.999 | Adam optimizer beta2 parameter | +| `adam_eps` | float | 1e-8 | Adam optimizer epsilon parameter | + +--- + +## 9. 
TensorModelParallelArgs - Tensor Model Parallelism Configuration + +Tensor model parallelism settings. + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `tensor_model_parallel_size` | int | 1 | Tensor model parallel size (number of GPUs for model sharding) | + +**Note**: The data parallel size is deduced based on the `world_size` and `tensor_model_parallel_size`. + +--- + +## 10. RankingArgs - Ranking Task Configuration + +Configuration specific to ranking tasks. + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `prediction_head_arch` | List[int] | None | **Required**. Prediction head architecture (list of layer sizes) | +| `prediction_head_act_type` | str | "relu" | Prediction head activation type: "relu" or "gelu" | +| `prediction_head_bias` | bool | True | Whether to use bias in prediction head | +| `num_tasks` | int | 1 | Number of tasks (for multi-task learning) | +| `eval_metrics` | Tuple[str, ...] | ("AUC",) | Evaluation metrics tuple | + +--- + +## 11. RetrievalArgs - Retrieval Task Configuration + +Configuration specific to retrieval tasks. + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `num_negatives` | int | -1 | Number of negative samples | +| `temperature` | float | 0.05 | Temperature parameter for similarity scoring | +| `l2_norm_eps` | float | 1e-6 | Epsilon value for L2 normalization | +| `eval_metrics` | Tuple[str, ...] | ("HR@10", "NDCG@10") | Evaluation metrics tuple (Hit Rate, NDCG) | + +--- + +## Usage Examples + +### Example 1: Basic Configuration + +```python +# In your .gin config file + +# Trainer configuration +TrainerArgs.train_batch_size = 256 +TrainerArgs.eval_batch_size = 512 +TrainerArgs.max_train_iters = 10000 +TrainerArgs.pipeline_type = "prefetch" + +# Network configuration +NetworkArgs.num_layers = 4 +NetworkArgs.hidden_size = 256 +NetworkArgs.num_attention_heads = 8 +NetworkArgs.kv_channels = 32 +NetworkArgs.dtype_str = "bfloat16" + +# Optimizer configuration +OptimizerArgs.optimizer_str = "adam" +OptimizerArgs.learning_rate = 0.001 +``` + +### Example 2: Ranking Task Configuration + +```python +# Dataset +DatasetArgs.dataset_name = "criteo" +DatasetArgs.max_sequence_length = 128 + +# Ranking model +RankingArgs.prediction_head_arch = [512, 256, 1] +RankingArgs.prediction_head_act_type = "relu" +RankingArgs.eval_metrics = ("AUC") + +# Embeddings +EmbeddingArgs.feature_names = ["item_id", "category"] +EmbeddingArgs.table_name = "item_table" +EmbeddingArgs.item_vocab_size_or_capacity = 1000000 +EmbeddingArgs.sharding_type = "data_parallel" +``` + +### Example 3: Retrieval Task with Dynamic Embedding + +```python +# Retrieval configuration +RetrievalArgs.num_negatives = 100 +RetrievalArgs.temperature = 0.05 +RetrievalArgs.eval_metrics = ("HR@10", "HR@50", "NDCG@10") + +# Dynamic embedding +DynamicEmbeddingArgs.feature_names = ["user_id", "item_id"] +DynamicEmbeddingArgs.table_name = "user_item_table" +DynamicEmbeddingArgs.item_vocab_size_or_capacity = 10000000 +DynamicEmbeddingArgs.item_vocab_gpu_capacity_ratio = 0.1 +DynamicEmbeddingArgs.evict_strategy = "lru" +DynamicEmbeddingArgs.caching = True +``` + +--- + diff --git a/pyproject.toml b/pyproject.toml index f55eae22..87927543 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,6 @@ [tool.mypy] exclude = [ + "examples/hstu/tmp_data", "examples/hstu/ops/triton_ops/*", "examples/hstu/ops/fused_hstu_op.py", "corelib/*", From 
b4cf17506791fc4906729571a8061470351492e1 Mon Sep 17 00:00:00 2001 From: JacoCheung Date: Mon, 27 Oct 2025 05:18:32 +0000 Subject: [PATCH 2/3] Move root RM env setting up to training --- README.md | 38 ---- examples/hstu/training/README.md | 50 ++++- examples/hstu/utils/gin_args_doc.md | 250 ------------------------- examples/hstu/utils/gin_config_args.py | 199 +++++++++++++++++++- 4 files changed, 246 insertions(+), 291 deletions(-) delete mode 100644 examples/hstu/utils/gin_args_doc.md diff --git a/README.md b/README.md index 08b47440..8c1a47ec 100644 --- a/README.md +++ b/README.md @@ -35,44 +35,6 @@ The project includes: For more detailed release notes, please refer our [releases](https://github.com/NVIDIA/recsys-examples/releases). -# Environment Setup -## Start from dockerfile - -We provide [dockerfile](./docker/Dockerfile) for users to build environment. -``` -docker build -f docker/Dockerfile --platform linux/amd64 -t recsys-examples:latest . -``` -If you want to build image for Grace, you can use -``` -docker build -f docker/Dockerfile --platform linux/arm64 -t recsys-examples:latest . -``` -You can also set your own base image with args `--build-arg `. - -## Start from source file -Before running examples, build and install libs under corelib following instruction in documentation: -- [HSTU attention documentation](./corelib/hstu/README.md) -- [Dynamic Embeddings documentation](./corelib/dynamicemb/README.md) - -On top of those two core libs, Megatron-Core along with other libs are required. You can install them via pypi package: - -```bash -pip install torchx gin-config torchmetrics==1.0.3 typing-extensions iopath megatron-core==0.9.0 -``` - -If you fail to install the megatron-core package, usually due to the python version incompatibility, please try to clone and then install the source code. - -```bash -git clone -b core_r0.9.0 https://github.com/NVIDIA/Megatron-LM.git megatron-lm && \ -pip install -e ./megatron-lm -``` - -We provide our custom HSTU CUDA operators for enhanced performance. You need to install these operators using the following command: - -```bash -cd /workspace/recsys-examples/examples/hstu && \ -python setup.py install -``` - # Get Started The examples we supported: - [HSTU recommender examples](./examples/hstu/README.md) diff --git a/examples/hstu/training/README.md b/examples/hstu/training/README.md index 4275590b..385f13e7 100644 --- a/examples/hstu/training/README.md +++ b/examples/hstu/training/README.md @@ -7,8 +7,44 @@ To facilitate large embedding tables and scaling-laws of HSTU dense, we have int This integration ensures efficient training by coordinating sparse (embedding) and dense (context/data) parallelisms within a single model. ![parallelism](../figs/parallelism.png) +## Environment Setup +### Start from dockerfile -## Dataset Introduction +We provide [dockerfile](./docker/Dockerfile) for users to build environment. +``` +docker build -f docker/Dockerfile --platform linux/amd64 -t recsys-examples:latest . +``` +If you want to build image for Grace, you can use +``` +docker build -f docker/Dockerfile --platform linux/arm64 -t recsys-examples:latest . +``` +You can also set your own base image with args `--build-arg `. 
+
+### Start from source file
+Before running examples, build and install the libs under corelib following the instructions in the documentation:
+- [HSTU attention documentation](./corelib/hstu/README.md)
+- [Dynamic Embeddings documentation](./corelib/dynamicemb/README.md)
+
+On top of those two core libs, Megatron-Core and a few other libs are required. You can install them via pypi packages:
+
+```bash
+pip install torchx gin-config torchmetrics==1.0.3 typing-extensions iopath megatron-core==0.9.0
+```
+
+If you fail to install the megatron-core package, usually due to a Python version incompatibility, please try cloning and installing it from source.
+
+```bash
+git clone -b core_r0.9.0 https://github.com/NVIDIA/Megatron-LM.git megatron-lm && \
+pip install -e ./megatron-lm
+```
+
+We provide our custom HSTU CUDA operators for enhanced performance. You need to install these operators using the following command:
+
+```bash
+cd /workspace/recsys-examples/examples/hstu && \
+python setup.py install
+```
+### Dataset Introduction
 
 We support several datasets, as listed in the following sections:
 
@@ -27,8 +63,18 @@ Refer to [KuaiRand](https://kuairand.com/) for details.
 
 ## Running the examples
 
-Before getting started, please make sure that all pre-requisites are fulfilled. You can refer to [Get Started][../../../README] section in the root directory of the repo to set up the environment.****
+Before getting started, please make sure that all pre-requisites are fulfilled. You can refer to the [Get Started](../../../README) section in the root directory of the repo to set up the environment.
+
+### Dataset preprocessing
+
+To prepare a dataset for training, use the `preprocessor.py` script under the hstu example folder of the project.
+
+```bash
+cd /examples/hstu &&
+mkdir -p ./tmp_data && python3 ./preprocessor.py --dataset_name <"ml-1m"|"ml-20m"|"kuairand-pure"|"kuairand-1k"|"kuairand-27k">
+
+```
 ### Start training
 The entrypoints for training are `pretrain_gr_retrieval.py` and `pretrain_gr_ranking.py`. We use gin-config to specify the model structure, training arguments, hyper-parameters, etc.
diff --git a/examples/hstu/utils/gin_args_doc.md b/examples/hstu/utils/gin_args_doc.md
deleted file mode 100644
index ff37215b..00000000
--- a/examples/hstu/utils/gin_args_doc.md
+++ /dev/null
@@ -1,250 +0,0 @@
-# Gin Configurable Interfaces Documentation
-
-This document provides comprehensive documentation for all configurable hypara-params that used by both inference and training
-
-
-## 1. TrainerArgs - Trainer Configuration
-
-Training-related parameters and settings.
-
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `train_batch_size` | int | - | **Required**. Batch size per GPU. When TP is enabled, the theoretical batch size is (train_batch_size × tp_size) |
-| `eval_batch_size` | int | - | **Required**. 
Evaluation batch size | -| `eval_interval` | int | 100 | Evaluation interval in iterations | -| `log_interval` | int | 100 | Logging interval in iterations | -| `max_train_iters` | Optional[int] | None | Maximum training iterations | -| `max_eval_iters` | Optional[int] | None | Maximum evaluation iterations | -| `seed` | int | 1234 | Random seed | -| `profile` | bool | False | Enable profiling | -| `profile_step_start` | int | 100 | Profiling start step | -| `profile_step_end` | int | 200 | Profiling end step | -| `ckpt_save_interval` | int | -1 | Checkpoint save interval, -1 means no checkpoint saving | -| `ckpt_save_dir` | str | "./checkpoints" | Checkpoint save directory | -| `ckpt_load_dir` | str | "" | Checkpoint load directory | -| `pipeline_type` | str | "native" | Pipeline overlap type: `none` (no overlap), `native` (overlap h2d, input dist, fwd+bwd), `prefetch` (includes prefetch overlap) | - ---- - - -## 2. EmbeddingArgs - Embedding Configuration - -Base embedding layer configuration parameters. - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `feature_names` | List[str] | - | **Required**. List of feature names | -| `table_name` | str | - | **Required**. Embedding table name | -| `item_vocab_size_or_capacity` | int | - | **Required**. For dynamic embedding: capacity; for static embedding: vocabulary size | -| `sharding_type` | str | "None" | Sharding type, must be "data_parallel" or "model_parallel" | - ---- - -## 3. DynamicEmbeddingArgs - Dynamic Embedding Configuration - -Extends `EmbeddingArgs` with dynamic embedding-specific parameters. - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `global_hbm_for_values` | Optional[int] | None | Global HBM size in bytes (highest priority) | -| `item_vocab_gpu_capacity` | Optional[float] | None | Item vocabulary GPU capacity (second priority) | -| `item_vocab_gpu_capacity_ratio` | Optional[float] | None | Item vocabulary GPU capacity ratio (lowest priority) | -| `evict_strategy` | str | "lru" | Eviction strategy: "lru" or "lfu" | -| `caching` | bool | False | Enable caching on HMB. When caching is enabled, the global_hbm_for_values indicates the cache size | - -**Note**: `sharding_type` is automatically set to "model_parallel" - -**Precedence**: The first 3 params can be used for setting the HBM size for dynamic embedding, but there is a precedence relationship: `global_hbm_for_values` > `item_vocab_gpu_capacity` > `item_vocab_gpu_capacity_ratio`. When only `item_vocab_gpu_capacity_ratio` is given, `item_vocab_gpu_capacity = item_vocab_gpu_capacity_ratio * item_vocab_size_or_capacity` and `global_hbm_for_values` are deduced based on the optimizer and embedding dims. - -**Note**: A table could be only one of type EmbeddingArgs or DynamicEmbeddingArgs. - -**Note**: When movielen\* or kuairand\* dataset are used, DynamicEmbeddingArgs/EmbeddingArgs are predefined. See [get_dataset_and_embedding_args() func](../hstu/training/trainer/utils.py) - ---- - -## 4. DatasetArgs - Dataset Configuration - -Dataset-related configuration parameters. - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `dataset_name` | str | - | **Required**. Dataset name | -| `max_sequence_length` | int | - | **Required**. 
Maximum sequence length | -| `dataset_path` | Optional[str] | None | Path to dataset | -| `max_num_candidates` | int | 0 | Maximum number of candidates | -| `shuffle` | bool | False | Whether to shuffle data | - -**Note**: `dataset_path` could be none if your dataset is preprocessed and moved under /hstu/tmp_data folder or you're running with `BenchmarkDatasetArgs` which is a in-memory random data generator. Please refer to [example](../hstu/training/configs/benchmark_ranking.gin). - - ---- - -## 5. FeatureArgs - Feature Configuration - -Feature-specific configuration parameters. - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `feature_names` | List[str] | - | **Required**. List of feature names | -| `max_sequence_length` | int | - | **Required**. Maximum sequence length | -| `is_jagged` | bool | False | Whether features are jagged (variable length) | - FeatureArgs and DatasetArgs - -**Note**: `FeatureArgs` are only used when the dataset is of `BenchmarkDatasetArgs`. - ---- -## 6. BenchmarkDatasetArgs - Benchmark Dataset Configuration - -Configuration for benchmark datasets combining features and embeddings. - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `feature_args` | List[FeatureArgs] | - | **Required**. List of feature arguments | -| `embedding_args` | List[Union[EmbeddingArgs, DynamicEmbeddingArgs]] | - | **Required**. List of embedding arguments | -| `item_feature_name` | str | - | **Required**. Item feature name | -| `contextual_feature_names` | List[str] | - | **Required**. List of contextual feature names | -| `action_feature_name` | Optional[str] | None | Action feature name | -| `max_num_candidates` | int | 0 | Maximum number of candidates | - ---- - -## 7. NetworkArgs - Network Architecture Configuration - -Neural network architecture parameters. - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `num_layers` | int | - | **Required**. Number of layers | -| `hidden_size` | int | - | **Required**. Hidden layer size | -| `num_attention_heads` | int | - | **Required**. Number of attention heads | -| `kv_channels` | int | - | **Required**. Key-value channels | -| `hidden_dropout` | float | 0.2 | Hidden layer dropout rate | -| `norm_epsilon` | float | 1e-5 | Normalization epsilon | -| `is_causal` | bool | True | Use causal attention mask | -| `dtype_str` | str | "bfloat16" | Data type: "bfloat16" or "float16" | -| `kernel_backend` | str | "cutlass" | Kernel backend: "cutlass", "triton", or "pytorch" | -| `target_group_size` | int | 1 | Target group size | -| `num_position_buckets` | int | 8192 | Number of position buckets | -| `recompute_input_layernorm` | bool | False | Recompute input layer normalization | -| `recompute_input_silu` | bool | False | Recompute input SiLU activation | -| `item_embedding_dim` | int | -1 | Item embedding dimension | -| `contextual_embedding_dim` | int | -1 | Contextual embedding dimension | - ---- - -## 8. OptimizerArgs - Optimizer Configuration - -Optimizer-related parameters. - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `optimizer_str` | str | - | **Required**. Optimizer name | -| `learning_rate` | float | - | **Required**. Learning rate | -| `adam_beta1` | float | 0.9 | Adam optimizer beta1 parameter | -| `adam_beta2` | float | 0.999 | Adam optimizer beta2 parameter | -| `adam_eps` | float | 1e-8 | Adam optimizer epsilon parameter | - ---- - -## 9. 
TensorModelParallelArgs - Tensor Model Parallelism Configuration - -Tensor model parallelism settings. - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `tensor_model_parallel_size` | int | 1 | Tensor model parallel size (number of GPUs for model sharding) | - -**Note**: The data parallel size is deduced based on the `world_size` and `tensor_model_parallel_size`. - ---- - -## 10. RankingArgs - Ranking Task Configuration - -Configuration specific to ranking tasks. - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `prediction_head_arch` | List[int] | None | **Required**. Prediction head architecture (list of layer sizes) | -| `prediction_head_act_type` | str | "relu" | Prediction head activation type: "relu" or "gelu" | -| `prediction_head_bias` | bool | True | Whether to use bias in prediction head | -| `num_tasks` | int | 1 | Number of tasks (for multi-task learning) | -| `eval_metrics` | Tuple[str, ...] | ("AUC",) | Evaluation metrics tuple | - ---- - -## 11. RetrievalArgs - Retrieval Task Configuration - -Configuration specific to retrieval tasks. - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `num_negatives` | int | -1 | Number of negative samples | -| `temperature` | float | 0.05 | Temperature parameter for similarity scoring | -| `l2_norm_eps` | float | 1e-6 | Epsilon value for L2 normalization | -| `eval_metrics` | Tuple[str, ...] | ("HR@10", "NDCG@10") | Evaluation metrics tuple (Hit Rate, NDCG) | - ---- - -## Usage Examples - -### Example 1: Basic Configuration - -```python -# In your .gin config file - -# Trainer configuration -TrainerArgs.train_batch_size = 256 -TrainerArgs.eval_batch_size = 512 -TrainerArgs.max_train_iters = 10000 -TrainerArgs.pipeline_type = "prefetch" - -# Network configuration -NetworkArgs.num_layers = 4 -NetworkArgs.hidden_size = 256 -NetworkArgs.num_attention_heads = 8 -NetworkArgs.kv_channels = 32 -NetworkArgs.dtype_str = "bfloat16" - -# Optimizer configuration -OptimizerArgs.optimizer_str = "adam" -OptimizerArgs.learning_rate = 0.001 -``` - -### Example 2: Ranking Task Configuration - -```python -# Dataset -DatasetArgs.dataset_name = "criteo" -DatasetArgs.max_sequence_length = 128 - -# Ranking model -RankingArgs.prediction_head_arch = [512, 256, 1] -RankingArgs.prediction_head_act_type = "relu" -RankingArgs.eval_metrics = ("AUC") - -# Embeddings -EmbeddingArgs.feature_names = ["item_id", "category"] -EmbeddingArgs.table_name = "item_table" -EmbeddingArgs.item_vocab_size_or_capacity = 1000000 -EmbeddingArgs.sharding_type = "data_parallel" -``` - -### Example 3: Retrieval Task with Dynamic Embedding - -```python -# Retrieval configuration -RetrievalArgs.num_negatives = 100 -RetrievalArgs.temperature = 0.05 -RetrievalArgs.eval_metrics = ("HR@10", "HR@50", "NDCG@10") - -# Dynamic embedding -DynamicEmbeddingArgs.feature_names = ["user_id", "item_id"] -DynamicEmbeddingArgs.table_name = "user_item_table" -DynamicEmbeddingArgs.item_vocab_size_or_capacity = 10000000 -DynamicEmbeddingArgs.item_vocab_gpu_capacity_ratio = 0.1 -DynamicEmbeddingArgs.evict_strategy = "lru" -DynamicEmbeddingArgs.caching = True -``` - ---- - diff --git a/examples/hstu/utils/gin_config_args.py b/examples/hstu/utils/gin_config_args.py index 7d5738e0..d47501b1 100644 --- a/examples/hstu/utils/gin_config_args.py +++ b/examples/hstu/utils/gin_config_args.py @@ -21,6 +21,31 @@ @gin.configurable @dataclass class TrainerArgs: + """Trainer Configuration. 
+ + Training-related parameters and settings. + + Attributes: + train_batch_size (int): **Required**. Batch size per GPU. When TP is enabled, + the theoretical batch size is (train_batch_size × tp_size). + eval_batch_size (int): **Required**. Evaluation batch size. + eval_interval (int): Evaluation interval in iterations. Default: 100. + log_interval (int): Logging interval in iterations. Default: 100. + max_train_iters (Optional[int]): Maximum training iterations. Default: None. + max_eval_iters (Optional[int]): Maximum evaluation iterations. Default: None. + seed (int): Random seed. Default: 1234. + profile (bool): Enable profiling. Default: False. + profile_step_start (int): Profiling start step. Default: 100. + profile_step_end (int): Profiling end step. Default: 200. + ckpt_save_interval (int): Checkpoint save interval, -1 means no checkpoint saving. + Default: -1. + ckpt_save_dir (str): Checkpoint save directory. Default: "./checkpoints". + ckpt_load_dir (str): Checkpoint load directory. Default: "". + pipeline_type (str): Pipeline overlap type: 'none' (no overlap), 'native' + (overlap h2d, input dist, fwd+bwd), 'prefetch' (includes prefetch overlap). + Default: "native". + """ + # below batchsize is batchsize_per_gpu # when TP is enabled, the theoratical batchsize is (train_batch_size * tp_size) train_batch_size: int @@ -54,6 +79,17 @@ def __post_init__(self): @dataclass class BaseEmbeddingArgs: + """Base Embedding Arguments. + + Base class for embedding configuration parameters. + + Attributes: + feature_names (List[str]): List of feature names. + table_name (str): Embedding table name. + item_vocab_size_or_capacity (int): For dynamic embedding: capacity; + for static embedding: vocabulary size. + """ + # for dynamic emb, it serves as capacity, while for static emb, it serves as vocab size feature_names: List[str] table_name: str @@ -63,6 +99,25 @@ class BaseEmbeddingArgs: @gin.configurable @dataclass class EmbeddingArgs(BaseEmbeddingArgs): + """Embedding Configuration. + + Base embedding layer configuration parameters. + + Attributes: + feature_names (List[str]): **Required**. List of feature names. + table_name (str): **Required**. Embedding table name. + item_vocab_size_or_capacity (int): **Required**. For dynamic embedding: capacity; + for static embedding: vocabulary size. + sharding_type (str): Sharding type, must be "data_parallel" or "model_parallel". + Default: "None". + + Note: + A table could be only one of type `EmbeddingArgs` or `DynamicEmbeddingArgs`. + When movielen* or kuairand* datasets are used, `DynamicEmbeddingArgs`/`EmbeddingArgs` + are predefined. Setting the proper DatasetArgs.dataset_name in the gin config file will automatically set the proper EmbeddingArgs/DynamicEmbeddingArgs. + See `examples/hstu/training/trainer/utils.py::get_dataset_and_embedding_args()` for more details. + """ + sharding_type: str = "None" def __post_init__(self): @@ -75,7 +130,31 @@ def __post_init__(self): @gin.configurable @dataclass class DynamicEmbeddingArgs(EmbeddingArgs): - # the precedence is global_hbm_for_values > item_vocab_gpu_capacity > item_vocab_gpu_capacity_ratio + """Dynamic Embedding Configuration. + + Extends EmbeddingArgs with dynamic embedding-specific parameters. + + Attributes: + global_hbm_for_values (Optional[int]): Global HBM size in bytes (highest priority). + Default: None. + item_vocab_gpu_capacity (Optional[float]): Item vocabulary GPU capacity + (second priority). Default: None. 
+ item_vocab_gpu_capacity_ratio (Optional[float]): Item vocabulary GPU capacity ratio + (lowest priority). Default: None. + evict_strategy (str): Eviction strategy: "lru" or "lfu". Default: "lru". + caching (bool): Enable caching on HBM. When caching is enabled, the + global_hbm_for_values indicates the cache size. Default: False. + + Note: + - sharding_type is automatically set to "model_parallel". + - Precedence: The first 3 params can be used for setting the HBM size for dynamic + embedding, with precedence: `global_hbm_for_values` > `item_vocab_gpu_capacity` > + item_vocab_gpu_capacity_ratio. When only item_vocab_gpu_capacity_ratio is given, + `item_vocab_gpu_capacity` = `item_vocab_gpu_capacity_ratio` * `item_vocab_size_or_capacity` + and `global_hbm_for_values` are deduced based on the optimizer and embedding dims. + """ + + # the precedence is `global_hbm_for_values` > `item_vocab_gpu_capacity` > `item_vocab_gpu_capacity_ratio` # without optimizer consideration global_hbm_for_values: Optional[int] = None item_vocab_gpu_capacity: Optional[float] = None @@ -107,6 +186,23 @@ def calculate_and_reset_global_hbm_for_values(self, hidden_size, multiplier=1): @gin.configurable @dataclass class DatasetArgs: + """Dataset Configuration. + + Dataset-related configuration parameters. + + Attributes: + dataset_name (str): **Required**. Dataset name. + max_sequence_length (int): **Required**. Maximum sequence length. + dataset_path (Optional[str]): Path to dataset. Default: None. + max_num_candidates (int): Maximum number of candidates. Default: 0. + shuffle (bool): Whether to shuffle data. Default: False. + + Note: + dataset_path could be None if your dataset is preprocessed and moved under + /hstu/tmp_data folder or you're running with BenchmarkDatasetArgs + which is an in-memory random data generator. + """ + dataset_name: str max_sequence_length: int dataset_path: Optional[str] = None @@ -117,6 +213,19 @@ class DatasetArgs: @gin.configurable @dataclass class FeatureArgs: + """Feature Configuration. + + Feature-specific configuration parameters. + + Attributes: + feature_names (List[str]): **Required**. List of feature names. + max_sequence_length (int): **Required**. Maximum sequence length. + is_jagged (bool): Whether features are jagged (variable length). Default: False. + + Note: + `FeatureArgs` are only used when the dataset is of `BenchmarkDatasetArgs` type. + """ + feature_names: List[str] max_sequence_length: int is_jagged: bool = False @@ -125,6 +234,20 @@ class FeatureArgs: @gin.configurable @dataclass class BenchmarkDatasetArgs: + """Benchmark Dataset Configuration. + + Configuration for benchmark datasets combining features and embeddings. + + Attributes: + feature_args (List[FeatureArgs]): **Required**. List of feature arguments. + embedding_args (List[Union[EmbeddingArgs, DynamicEmbeddingArgs]]): **Required**. + List of embedding arguments. + item_feature_name (str): **Required**. Item feature name. + contextual_feature_names (List[str]): **Required**. List of contextual feature names. + action_feature_name (Optional[str]): Action feature name. Default: None. + max_num_candidates (int): Maximum number of candidates. Default: 0. + """ + feature_args: List[FeatureArgs] embedding_args: List[Union[EmbeddingArgs, DynamicEmbeddingArgs]] item_feature_name: str @@ -136,6 +259,29 @@ class BenchmarkDatasetArgs: @gin.configurable @dataclass class NetworkArgs: + """Network Architecture Configuration. + + Neural network architecture parameters. + + Attributes: + num_layers (int): **Required**. 
Number of layers. + hidden_size (int): **Required**. Hidden layer size. + num_attention_heads (int): **Required**. Number of attention heads. + kv_channels (int): **Required**. Key-value channels. + hidden_dropout (float): Hidden layer dropout rate. Default: 0.2. + norm_epsilon (float): Normalization epsilon. Default: 1e-5. + is_causal (bool): Use causal attention mask. Default: True. + dtype_str (str): Data type: "bfloat16" or "float16". Default: "bfloat16". + kernel_backend (str): Kernel backend: "cutlass", "triton", or "pytorch". + Default: "cutlass". + target_group_size (int): Target group size. Default: 1. + num_position_buckets (int): Number of position buckets. Default: 8192. + recompute_input_layernorm (bool): Recompute input layer normalization. Default: False. + recompute_input_silu (bool): Recompute input SiLU activation. Default: False. + item_embedding_dim (int): Item embedding dimension. Default: -1. + contextual_embedding_dim (int): Contextual embedding dimension. Default: -1. + """ + num_layers: int hidden_size: int num_attention_heads: int @@ -170,6 +316,18 @@ def __post_init__(self): @gin.configurable @dataclass class OptimizerArgs: + """Optimizer Configuration. + + Optimizer-related parameters. + + Attributes: + optimizer_str (str): **Required**. Optimizer name. + learning_rate (float): **Required**. Learning rate. + adam_beta1 (float): Adam optimizer beta1 parameter. Default: 0.9. + adam_beta2 (float): Adam optimizer beta2 parameter. Default: 0.999. + adam_eps (float): Adam optimizer epsilon parameter. Default: 1e-8. + """ + optimizer_str: str learning_rate: float adam_beta1: float = 0.9 @@ -180,12 +338,39 @@ class OptimizerArgs: @gin.configurable @dataclass class TensorModelParallelArgs: + """Tensor Model Parallelism Configuration. + + Tensor model parallelism settings. + + Attributes: + tensor_model_parallel_size (int): Tensor model parallel size (number of GPUs + for model sharding). Default: 1. + + Note: + The data parallel size is deduced based on the world_size and + tensor_model_parallel_size. + """ + tensor_model_parallel_size: int = 1 @gin.configurable @dataclass class RankingArgs: + """Ranking Task Configuration. + + Configuration specific to ranking tasks. + + Attributes: + prediction_head_arch (List[int]): **Required**. Prediction head architecture + (list of layer sizes). Default: None. + prediction_head_act_type (str): Prediction head activation type: "relu" or "gelu". + Default: "relu". + prediction_head_bias (bool): Whether to use bias in prediction head. Default: True. + num_tasks (int): Number of tasks (for multi-task learning). Default: 1. + eval_metrics (Tuple[str, ...]): Evaluation metrics tuple. Default: ("AUC",). + """ + prediction_head_arch: List[int] = cast(List[int], None) prediction_head_act_type: str = "relu" prediction_head_bias: bool = True @@ -206,6 +391,18 @@ def __post_init__(self): @gin.configurable @dataclass class RetrievalArgs: + """Retrieval Task Configuration. + + Configuration specific to retrieval tasks. + + Attributes: + num_negatives (int): Number of negative samples. Default: -1. + temperature (float): Temperature parameter for similarity scoring. Default: 0.05. + l2_norm_eps (float): Epsilon value for L2 normalization. Default: 1e-6. + eval_metrics (Tuple[str, ...]): Evaluation metrics tuple (Hit Rate, NDCG). + Default: ("HR@10", "NDCG@10"). 
+    """
+
    ### retrieval
    num_negatives: int = -1
    temperature = 0.05

From 9aaf5861c571b80c16a62b56622cc63bbfe032a6 Mon Sep 17 00:00:00 2001
From: JacoCheung
Date: Mon, 27 Oct 2025 05:18:32 +0000
Subject: [PATCH 3/3] Move root ReadMe env setting up to training

---
 examples/hstu/training/README.md | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/examples/hstu/training/README.md b/examples/hstu/training/README.md
index 385f13e7..9e27d44e 100644
--- a/examples/hstu/training/README.md
+++ b/examples/hstu/training/README.md
@@ -10,20 +10,22 @@ This integration ensures efficient training by coordinating sparse (embedding) a
 ## Environment Setup
 ### Start from dockerfile
 
-We provide [dockerfile](./docker/Dockerfile) for users to build environment.
+We provide a [dockerfile](../../../docker/Dockerfile) for users to build the environment.
 ```
+git clone https://github.com/NVIDIA/recsys-examples.git && cd recsys-examples
 docker build -f docker/Dockerfile --platform linux/amd64 -t recsys-examples:latest .
 ```
 If you want to build image for Grace, you can use
 ```
+git clone https://github.com/NVIDIA/recsys-examples.git && cd recsys-examples
 docker build -f docker/Dockerfile --platform linux/arm64 -t recsys-examples:latest .
 ```
 You can also set your own base image with args `--build-arg `.
 
 ### Start from source file
 Before running examples, build and install the libs under corelib following the instructions in the documentation:
-- [HSTU attention documentation](./corelib/hstu/README.md)
-- [Dynamic Embeddings documentation](./corelib/dynamicemb/README.md)
+- [HSTU attention documentation](../../../corelib/hstu/README.md)
+- [Dynamic Embeddings documentation](../../../corelib/dynamicemb/README.md)
 
 On top of those two core libs, Megatron-Core and a few other libs are required. You can install them via pypi packages: