diff --git a/README.md b/README.md index 08b47440..8c1a47ec 100644 --- a/README.md +++ b/README.md @@ -35,44 +35,6 @@ The project includes: For more detailed release notes, please refer our [releases](https://github.com/NVIDIA/recsys-examples/releases). -# Environment Setup -## Start from dockerfile - -We provide [dockerfile](./docker/Dockerfile) for users to build environment. -``` -docker build -f docker/Dockerfile --platform linux/amd64 -t recsys-examples:latest . -``` -If you want to build image for Grace, you can use -``` -docker build -f docker/Dockerfile --platform linux/arm64 -t recsys-examples:latest . -``` -You can also set your own base image with args `--build-arg `. - -## Start from source file -Before running examples, build and install libs under corelib following instruction in documentation: -- [HSTU attention documentation](./corelib/hstu/README.md) -- [Dynamic Embeddings documentation](./corelib/dynamicemb/README.md) - -On top of those two core libs, Megatron-Core along with other libs are required. You can install them via pypi package: - -```bash -pip install torchx gin-config torchmetrics==1.0.3 typing-extensions iopath megatron-core==0.9.0 -``` - -If you fail to install the megatron-core package, usually due to the python version incompatibility, please try to clone and then install the source code. - -```bash -git clone -b core_r0.9.0 https://github.com/NVIDIA/Megatron-LM.git megatron-lm && \ -pip install -e ./megatron-lm -``` - -We provide our custom HSTU CUDA operators for enhanced performance. You need to install these operators using the following command: - -```bash -cd /workspace/recsys-examples/examples/hstu && \ -python setup.py install -``` - # Get Started The examples we supported: - [HSTU recommender examples](./examples/hstu/README.md) diff --git a/examples/commons/utils/logger.py b/examples/commons/utils/logger.py index 6f2f3225..e66ef6ab 100644 --- a/examples/commons/utils/logger.py +++ b/examples/commons/utils/logger.py @@ -12,16 +12,30 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from datetime import datetime
+import logging
 
 import torch
+from rich.console import Console
+from rich.logging import RichHandler
+
+# Set up logger with RichHandler if not already configured
+
+console = Console()
+_LOGGER = logging.getLogger("rich_rank0")
+
+if not _LOGGER.hasHandlers():
+    handler = RichHandler(
+        console=console, show_time=True, show_path=False, rich_tracebacks=True
+    )
+    _LOGGER.addHandler(handler)
+    _LOGGER.propagate = False
+    _LOGGER.setLevel(logging.INFO)
 
 
 def print_rank_0(message):
     """If distributed is initialized, print only on rank 0."""
     if torch.distributed.is_initialized():
-        now = datetime.now()
         if torch.distributed.get_rank() == 0:
-            print(f"[{now}] " + message, flush=True)
+            _LOGGER.info(message)
     else:
         print(message, flush=True)
diff --git a/examples/commons/utils/stringify.py b/examples/commons/utils/stringify.py
index ac834b38..93986bfc 100644
--- a/examples/commons/utils/stringify.py
+++ b/examples/commons/utils/stringify.py
@@ -34,11 +34,11 @@ def stringify_dict(input_dict, prefix="", sep=","):
             value.float()
             assert value.dim() == 0
             value = value.cpu().item()
-            output += key + ":" + f"{value:6f}{sep}"
+            output += key + ": " + f"{value:6f}{sep}"
         elif isinstance(value, float):
-            output += key + ":" + f"{value:6f}{sep}"
+            output += key + ": " + f"{value:6f}{sep}"
         elif isinstance(value, int):
-            output += key + ":" + f"{value}{sep}"
+            output += key + ": " + f"{value}{sep}"
         else:
             assert RuntimeError(f"stringify dict not supports type {type(value)}")
     # remove the ending sep
diff --git a/examples/hstu/README.md b/examples/hstu/README.md
index 19e56e40..e2b2e9bf 100644
--- a/examples/hstu/README.md
+++ b/examples/hstu/README.md
@@ -1,4 +1,4 @@
-# Examples: to demonstrate how to train generative recommendation models
+# Examples: to demonstrate how to do training and inference with generative recommendation models
 
 ## Generative Recommender Introduction
 Meta's paper ["Actions Speak Louder Than Words"](https://arxiv.org/abs/2402.17152) introduces a novel paradigm for recommendation systems called **Generative Recommenders(GRs)**, which reformulates recommendation tasks as generative modeling problems. The work introduced Hierarchical Sequential Transduction Units (HSTU), a novel architecture designed to handle high-cardinality, non-stationary data streams in large-scale recommendation systems. HSTU enables both retrieval and ranking tasks. As noted in the paper, “HSTU-based GRs, with 1.5 trillion parameters, improve metrics in online A/B tests by 12.4% and have been deployed on multiple surfaces of a large internet platform with billions of users.”
diff --git a/examples/hstu/training/README.md b/examples/hstu/training/README.md
new file mode 100644
index 00000000..9e27d44e
--- /dev/null
+++ b/examples/hstu/training/README.md
@@ -0,0 +1,99 @@
+# HSTU Training example
+
+We support both retrieval and ranking models whose backbones are HSTU layers. In this example collection, users can specify the model structure via a gin-config file. Supported datasets are listed below. For the gin-config interface, please refer to the [inline comments](../utils/gin_config_args.py).
+
+## Parallelism Introduction
+To accommodate large embedding tables and the scaling laws of the dense HSTU layers, this example integrates **[TorchRec](https://github.com/pytorch/torchrec)**, which shards the embedding tables, and **[Megatron-LM](https://github.com/NVIDIA/Megatron-LM)**, which enables dense parallelism (e.g., data, tensor, sequence, pipeline, and context parallelism).
+This integration ensures efficient training by coordinating sparse (embedding) and dense (context/data) parallelisms within a single model.
+![parallelism](../figs/parallelism.png)
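+
+For intuition, the snippet below is a generic, minimal TorchRec sketch, not this example's actual wiring: `DistributedModelParallel` shards an embedding collection across ranks while the dense modules stay data parallel. The table and feature names are hypothetical, and it assumes a process group has already been initialized (e.g., launched via `torchrun`).
+
+```python
+import torch
+import torchrec
+from torchrec.distributed.model_parallel import DistributedModelParallel
+
+# One large table, declared on the meta device so the sharder decides placement.
+ebc = torchrec.EmbeddingBagCollection(
+    device=torch.device("meta"),
+    tables=[
+        torchrec.EmbeddingBagConfig(
+            name="item_table",          # hypothetical table name
+            embedding_dim=128,
+            num_embeddings=1_000_000,
+            feature_names=["item_id"],  # hypothetical feature name
+        )
+    ],
+)
+
+# DistributedModelParallel shards the table across ranks (model parallel for the
+# sparse part); dense modules are wrapped separately (e.g., DDP or Megatron-LM).
+sharded_ebc = DistributedModelParallel(ebc)
+```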
+
+## Environment Setup
+### Start from dockerfile
+
+We provide a [dockerfile](../../../docker/Dockerfile) for users to build the environment.
+```
+git clone https://github.com/NVIDIA/recsys-examples.git && cd recsys-examples
+docker build -f docker/Dockerfile --platform linux/amd64 -t recsys-examples:latest .
+```
+If you want to build an image for Grace, you can use
+```
+git clone https://github.com/NVIDIA/recsys-examples.git && cd recsys-examples
+docker build -f docker/Dockerfile --platform linux/arm64 -t recsys-examples:latest .
+```
+You can also set your own base image with the `--build-arg` option.
+
+### Start from source file
+Before running the examples, build and install the libs under corelib following the instructions in their documentation:
+- [HSTU attention documentation](../../../corelib/hstu/README.md)
+- [Dynamic Embeddings documentation](../../../corelib/dynamicemb/README.md)
+
+On top of those two core libs, Megatron-Core and a few other libs are required. You can install them via pypi packages:
+
+```bash
+pip install torchx gin-config torchmetrics==1.0.3 typing-extensions iopath megatron-core==0.9.0
+```
+
+If you fail to install the megatron-core package, usually due to Python version incompatibility, please try to clone and then install from source:
+
+```bash
+git clone -b core_r0.9.0 https://github.com/NVIDIA/Megatron-LM.git megatron-lm && \
+pip install -e ./megatron-lm
+```
+
+We provide our custom HSTU CUDA operators for enhanced performance. You need to install these operators using the following command:
+
+```bash
+cd /workspace/recsys-examples/examples/hstu && \
+python setup.py install
+```
+
+## Dataset Information
+
+We support the following datasets:
+
+#### **MovieLens**
+Refer to [MovieLens 1M](https://grouplens.org/datasets/movielens/1m/) and [MovieLens 20M](https://www.kaggle.com/datasets/grouplens/movielens-20m-dataset) for details.
+#### **KuaiRand**
+
+| dataset       | # users | seqlen max | seqlen min | seqlen mean | seqlen median | # items    |
+|---------------|---------|------------|------------|-------------|---------------|------------|
+| kuairand_pure | 27285   | 910        | 1          | 1           | 39            | 7551       |
+| kuairand_1k   | 1000    | 49332      | 10         | 5038        | 3379          | 4369953    |
+| kuairand_27k  | 27285   | 228000     | 100        | 11796       | 8591          | 32038725   |
+
+Refer to [KuaiRand](https://kuairand.com/) for details.
+
+## Running the examples
+
+Before getting started, please make sure that all prerequisites are fulfilled. You can refer to the [Get Started](../../../README.md) section in the root directory of the repo to set up the environment.
+
+### Dataset preprocessing
+
+To prepare a dataset for training, use `preprocessor.py` under the hstu example folder of the project.
+
+```bash
+cd /workspace/recsys-examples/examples/hstu && \
+mkdir -p ./tmp_data && python3 ./preprocessor.py --dataset_name <"ml-1m"|"ml-20m"|"kuairand-pure"|"kuairand-1k"|"kuairand-27k">
+```
+
+### Start training
+The entry points for training are `pretrain_gr_retrieval.py` and `pretrain_gr_ranking.py`. We use gin-config to specify the model structure, training arguments, hyperparameters, etc.
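+
+For orientation, a gin file is simply a set of bindings onto the `@gin.configurable` dataclasses in `utils/gin_config_args.py`. Below is a minimal, hypothetical sketch; the binding names come from those dataclasses, but the values are purely illustrative — see `./training/configs/` for the real files:
+
+```
+# Illustrative values only; consult the shipped .gin files for working configs.
+TrainerArgs.train_batch_size = 128
+TrainerArgs.eval_batch_size = 128
+TrainerArgs.max_train_iters = 1000
+
+DatasetArgs.dataset_name = "ml-20m"
+DatasetArgs.max_sequence_length = 200
+
+NetworkArgs.num_layers = 4
+NetworkArgs.hidden_size = 512
+NetworkArgs.num_attention_heads = 4
+NetworkArgs.kv_channels = 128
+
+OptimizerArgs.optimizer_str = "adam"
+OptimizerArgs.learning_rate = 1e-3
+```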
+
+Command to run the retrieval task with the `MovieLens 20m` dataset:
+
+```bash
+# Before running `pretrain_gr_retrieval.py`, make sure the current working directory is `hstu`
+cd examples/hstu
+PYTHONPATH=${PYTHONPATH}:$(realpath ../) torchrun --nproc_per_node 1 --master_addr localhost --master_port 6000 ./training/pretrain_gr_retrieval.py --gin-config-file ./training/configs/movielen_retrieval.gin
+```
+
+To run the ranking task with the `MovieLens 20m` dataset:
+```bash
+# Before running `pretrain_gr_ranking.py`, make sure the current working directory is `hstu`
+cd examples/hstu
+PYTHONPATH=${PYTHONPATH}:$(realpath ../) torchrun --nproc_per_node 1 --master_addr localhost --master_port 6000 ./training/pretrain_gr_ranking.py --gin-config-file ./training/configs/movielen_ranking.gin
+```
+
+
diff --git a/examples/hstu/training/__init__.py b/examples/hstu/training/__init__.py
deleted file mode 100644
index 270ce28a..00000000
--- a/examples/hstu/training/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-from .training_impl import *  # pylint: disable=wildcard-import
-from .training_utils import *  # pylint: disable=wildcard-import
diff --git a/examples/hstu/training/benchmark/README.md b/examples/hstu/training/benchmark/README.md
index 42f83069..27358ac9 100644
--- a/examples/hstu/training/benchmark/README.md
+++ b/examples/hstu/training/benchmark/README.md
@@ -13,7 +13,7 @@ You can run script `run_hstu_benchmark.sh` to see the performance over the base
 
 ## How to run
 
-The test entry is `python ./benchmark/hstu_layer_benchmark.py run`, you can type `python ./benchmark/hstu_layer_benchmark.py run --help` to get the input arguments. 4 important arguments are :
+The test entry point is `python ./training/benchmark/hstu_layer_benchmark.py run`; you can run `python ./training/benchmark/hstu_layer_benchmark.py run --help` to see the input arguments. Four important arguments are:
 
 1. --kernel-backend: select the hstu mha backend. Could be `triton` or `cutlass`.
 2. --fuse-norm-mul-dropout: knob of `layer norm + multiplication + dropout ` fusion. Could be `False` or `True`
@@ -23,7 +23,9 @@ The test entry is `python ./benchmark/hstu_layer_benchmark.py run`, you can type
 Our baseline cmd example (1K):
 
 ```bash
-python ./benchmark/hstu_layer_benchmark.py run \
+
+cd recsys-examples/examples/hstu
+python ./training/benchmark/hstu_layer_benchmark.py run \
   --iters 100 \
   --warmup-iters 50 \
   --layer-type native \
@@ -40,7 +42,8 @@ python ./benchmark/hstu_layer_benchmark.py run \
 You can also run a set of arguments with run.sh:
 
 ```bash
-bash run_hstu_layer_benchmark.sh
+cd recsys-examples/examples/hstu
+bash ./training/benchmark/run_hstu_layer_benchmark.sh
 ```
 
 After one run is done, a memory snapshot file in current working directory is generated, you can trace the memory usage with the file. Please refer to [PyTorch docs](https://docs.pytorch.org/docs/stable/torch_cuda_memory.html) on how to visualize the memory trace.
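+
+For reference, PyTorch memory snapshots are produced with the allocator-history API described in the linked docs; a minimal sketch of that record/dump pattern (with a hypothetical filename) looks like:
+
+```python
+import torch
+
+# Begin recording allocator events, keeping up to 100k entries.
+torch.cuda.memory._record_memory_history(max_entries=100_000)
+
+# ... run the workload to profile ...
+
+# Dump the recorded history; drag the file into https://pytorch.org/memory_viz to inspect it.
+torch.cuda.memory._dump_snapshot("snapshot.pickle")
+torch.cuda.memory._record_memory_history(enabled=None)  # stop recording
+```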
diff --git a/examples/hstu/training/benchmark/hstu_layer_benchmark.py b/examples/hstu/training/benchmark/hstu_layer_benchmark.py index c47742f6..1e2765ae 100644 --- a/examples/hstu/training/benchmark/hstu_layer_benchmark.py +++ b/examples/hstu/training/benchmark/hstu_layer_benchmark.py @@ -47,7 +47,7 @@ from modules.jagged_data import JaggedData from modules.native_hstu_layer import HSTULayer as NativeHSTULayer from ops.length_to_offsets import length_to_complete_offsets -from training.utils import cal_flops_single_rank +from training.trainer.utils import cal_flops_single_rank _backend_str_to_type = { "cutlass": KernelBackend.CUTLASS, diff --git a/examples/hstu/training/benchmark/run_hstu_layer_benchmark.sh b/examples/hstu/training/benchmark/run_hstu_layer_benchmark.sh index 3984dfe9..2bae6a38 100644 --- a/examples/hstu/training/benchmark/run_hstu_layer_benchmark.sh +++ b/examples/hstu/training/benchmark/run_hstu_layer_benchmark.sh @@ -32,7 +32,7 @@ for dim_per_head in "${dim_per_heads[@]}"; do fi echo -e "\n\033[32mbaseline hstu layer \033[0m:" ${nsys_profile_cmd//${baseline_profile_name}} \ - python ./benchmark/hstu_layer_benchmark.py run \ + python ./training/benchmark/hstu_layer_benchmark.py run \ --iters 100 \ --warmup-iters 50 \ --kernel-backend triton \ @@ -53,7 +53,7 @@ for dim_per_head in "${dim_per_heads[@]}"; do echo -e "\n\033[32m +cutlass\033[0m:" ${nsys_profile_cmd//${cutlass_profile_name}} \ - python ./benchmark/hstu_layer_benchmark.py run \ + python ./training/benchmark/hstu_layer_benchmark.py run \ --iters 100 \ --warmup-iters 50 \ --kernel-backend cutlass \ @@ -73,7 +73,7 @@ for dim_per_head in "${dim_per_heads[@]}"; do echo -e "\n\033[32m +fused\033[0m:" ${nsys_profile_cmd//${fused_profile_name}} \ - python ./benchmark/hstu_layer_benchmark.py run \ + python ./training/benchmark/hstu_layer_benchmark.py run \ --iters 100 \ --warmup-iters 50 \ --kernel-backend cutlass \ @@ -93,7 +93,7 @@ for dim_per_head in "${dim_per_heads[@]}"; do echo -e "\n\033[32m + recompute\033[0m:" ${nsys_profile_cmd//${recompute_profile_name}} \ - python ./benchmark/hstu_layer_benchmark.py run \ + python ./training/benchmark/hstu_layer_benchmark.py run \ --iters 100 \ --warmup-iters 50 \ --kernel-backend cutlass \ diff --git a/examples/hstu/training/pretrain_gr_ranking.py b/examples/hstu/training/pretrain_gr_ranking.py index 993fc753..9ff0b32e 100644 --- a/examples/hstu/training/pretrain_gr_ranking.py +++ b/examples/hstu/training/pretrain_gr_ranking.py @@ -18,7 +18,7 @@ warnings.filterwarnings("ignore", category=FutureWarning) warnings.filterwarnings("ignore", category=SyntaxWarning) import argparse -from functools import partial # pylint: disable-unused-import +from typing import List, Union import commons.utils.initialize as init import gin @@ -34,7 +34,8 @@ JaggedMegatronTrainNonePipeline, JaggedMegatronTrainPipelineSparseDist, ) -from training import ( +from trainer.training import maybe_load_ckpts, train_with_pipeline +from trainer.utils import ( create_dynamic_optitons_dict, create_embedding_configs, create_hstu_config, @@ -42,10 +43,11 @@ get_data_loader, get_dataset_and_embedding_args, get_embedding_vector_storage_multiplier, - maybe_load_ckpts, - train_with_pipeline, ) -from utils import ( +from utils import ( # from hstu.utils + BenchmarkDatasetArgs, + DatasetArgs, + EmbeddingArgs, NetworkArgs, OptimizerArgs, RankingArgs, @@ -53,20 +55,12 @@ TrainerArgs, ) -parser = argparse.ArgumentParser( - description="Distributed GR Arguments", allow_abbrev=False -) 
-parser.add_argument("--gin-config-file", type=str) -args = parser.parse_args() -gin.parse_config_file(args.gin_config_file) -trainer_args = TrainerArgs() -dataset_args, embedding_args = get_dataset_and_embedding_args() -network_args = NetworkArgs() -optimizer_args = OptimizerArgs() -tp_args = TensorModelParallelArgs() - -def create_ranking_config() -> RankingConfig: +def create_ranking_config( + dataset_args: Union[DatasetArgs, BenchmarkDatasetArgs], + network_args: NetworkArgs, + embedding_args: List[EmbeddingArgs], +) -> RankingConfig: ranking_args = RankingArgs() return RankingConfig( @@ -82,6 +76,18 @@ def create_ranking_config() -> RankingConfig: def main(): + parser = argparse.ArgumentParser( + description="HSTU Example Arguments", allow_abbrev=False + ) + parser.add_argument("--gin-config-file", type=str) + args = parser.parse_args() + gin.parse_config_file(args.gin_config_file) + trainer_args = TrainerArgs() + dataset_args, embedding_args = get_dataset_and_embedding_args() + network_args = NetworkArgs() + optimizer_args = OptimizerArgs() + tp_args = TensorModelParallelArgs() + init.initialize_distributed() init.initialize_model_parallel( tensor_model_parallel_size=tp_args.tensor_model_parallel_size @@ -92,7 +98,7 @@ def main(): f"distributed env initialization done. Free cuda memory: {free_memory / (1024 ** 2):.2f} MB" ) hstu_config = create_hstu_config(network_args, tp_args) - task_config = create_ranking_config() + task_config = create_ranking_config(dataset_args, network_args, embedding_args) model = get_ranking_model(hstu_config=hstu_config, task_config=task_config) dynamic_options_dict = create_dynamic_optitons_dict( diff --git a/examples/hstu/training/pretrain_gr_retrieval.py b/examples/hstu/training/pretrain_gr_retrieval.py index ec3d0486..c628c535 100644 --- a/examples/hstu/training/pretrain_gr_retrieval.py +++ b/examples/hstu/training/pretrain_gr_retrieval.py @@ -18,7 +18,7 @@ warnings.filterwarnings("ignore", category=FutureWarning) warnings.filterwarnings("ignore", category=SyntaxWarning) import argparse -from functools import partial # pylint: disable-unused-import +from typing import List, Union import commons.utils.initialize as init import gin @@ -32,18 +32,20 @@ JaggedMegatronTrainNonePipeline, JaggedMegatronTrainPipelineSparseDist, ) -from training import ( +from trainer.training import maybe_load_ckpts, train_with_pipeline +from trainer.utils import ( create_dynamic_optitons_dict, - create_embedding_config, + create_embedding_configs, create_hstu_config, create_optimizer_params, get_data_loader, get_dataset_and_embedding_args, get_embedding_vector_storage_multiplier, - maybe_load_ckpts, - train_with_pipeline, ) -from utils import ( +from utils import ( # from hstu.utils + BenchmarkDatasetArgs, + DatasetArgs, + EmbeddingArgs, NetworkArgs, OptimizerArgs, RetrievalArgs, @@ -51,27 +53,18 @@ TrainerArgs, ) -parser = argparse.ArgumentParser( - description="Distributed GR Arguments", allow_abbrev=False -) -parser.add_argument("--gin-config-file", type=str) -args = parser.parse_args() -gin.parse_config_file(args.gin_config_file) -trainer_args = TrainerArgs() -dataset_args, embedding_args = get_dataset_and_embedding_args() -network_args = NetworkArgs() -optimizer_args = OptimizerArgs() -tp_args = TensorModelParallelArgs() - -def create_retrieval_config() -> RetrievalConfig: +def create_retrieval_config( + dataset_args: Union[DatasetArgs, BenchmarkDatasetArgs], + network_args: NetworkArgs, + embedding_args: List[EmbeddingArgs], +) -> RetrievalConfig: retrieval_args = 
RetrievalArgs() return RetrievalConfig( - embedding_configs=[ - create_embedding_config(network_args.hidden_size, arg) - for arg in embedding_args - ], + embedding_configs=create_embedding_configs( + dataset_args, network_args, embedding_args + ), temperature=retrieval_args.temperature, l2_norm_eps=retrieval_args.l2_norm_eps, num_negatives=retrieval_args.num_negatives, @@ -80,6 +73,18 @@ def create_retrieval_config() -> RetrievalConfig: def main(): + parser = argparse.ArgumentParser( + description="Distributed GR Arguments", allow_abbrev=False + ) + parser.add_argument("--gin-config-file", type=str) + args = parser.parse_args() + gin.parse_config_file(args.gin_config_file) + trainer_args = TrainerArgs() + dataset_args, embedding_args = get_dataset_and_embedding_args() + network_args = NetworkArgs() + optimizer_args = OptimizerArgs() + tp_args = TensorModelParallelArgs() + init.initialize_distributed() init.initialize_model_parallel( tensor_model_parallel_size=tp_args.tensor_model_parallel_size @@ -87,7 +92,7 @@ def main(): init.set_random_seed(trainer_args.seed) hstu_config = create_hstu_config(network_args, tp_args) - task_config = create_retrieval_config() + task_config = create_retrieval_config(dataset_args, network_args, embedding_args) model = get_retrieval_model(hstu_config=hstu_config, task_config=task_config) dynamic_options_dict = create_dynamic_optitons_dict( diff --git a/examples/hstu/training/trainer/__init__.py b/examples/hstu/training/trainer/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/examples/hstu/training/training_impl.py b/examples/hstu/training/trainer/training.py similarity index 99% rename from examples/hstu/training/training_impl.py rename to examples/hstu/training/trainer/training.py index a3fe7a0b..a4b0254c 100644 --- a/examples/hstu/training/training_impl.py +++ b/examples/hstu/training/trainer/training.py @@ -31,7 +31,7 @@ JaggedMegatronTrainNonePipeline, JaggedMegatronTrainPipelineSparseDist, ) -from training.training_utils import cal_flops +from trainer.utils import cal_flops from utils import TrainerArgs diff --git a/examples/hstu/training/training_utils.py b/examples/hstu/training/trainer/utils.py similarity index 100% rename from examples/hstu/training/training_utils.py rename to examples/hstu/training/trainer/utils.py diff --git a/examples/hstu/utils/gin_config_args.py b/examples/hstu/utils/gin_config_args.py index 7d5738e0..d47501b1 100644 --- a/examples/hstu/utils/gin_config_args.py +++ b/examples/hstu/utils/gin_config_args.py @@ -21,6 +21,31 @@ @gin.configurable @dataclass class TrainerArgs: + """Trainer Configuration. + + Training-related parameters and settings. + + Attributes: + train_batch_size (int): **Required**. Batch size per GPU. When TP is enabled, + the theoretical batch size is (train_batch_size × tp_size). + eval_batch_size (int): **Required**. Evaluation batch size. + eval_interval (int): Evaluation interval in iterations. Default: 100. + log_interval (int): Logging interval in iterations. Default: 100. + max_train_iters (Optional[int]): Maximum training iterations. Default: None. + max_eval_iters (Optional[int]): Maximum evaluation iterations. Default: None. + seed (int): Random seed. Default: 1234. + profile (bool): Enable profiling. Default: False. + profile_step_start (int): Profiling start step. Default: 100. + profile_step_end (int): Profiling end step. Default: 200. + ckpt_save_interval (int): Checkpoint save interval, -1 means no checkpoint saving. + Default: -1. 
+        ckpt_save_dir (str): Checkpoint save directory. Default: "./checkpoints".
+        ckpt_load_dir (str): Checkpoint load directory. Default: "".
+        pipeline_type (str): Pipeline overlap type: 'none' (no overlap), 'native'
+            (overlap h2d, input dist, fwd+bwd), 'prefetch' (includes prefetch overlap).
+            Default: "native".
+    """
+
     # below batchsize is batchsize_per_gpu
     # when TP is enabled, the theoratical batchsize is (train_batch_size * tp_size)
     train_batch_size: int
@@ -54,6 +79,17 @@ def __post_init__(self):
 
 @dataclass
 class BaseEmbeddingArgs:
+    """Base Embedding Arguments.
+
+    Base class for embedding configuration parameters.
+
+    Attributes:
+        feature_names (List[str]): List of feature names.
+        table_name (str): Embedding table name.
+        item_vocab_size_or_capacity (int): For dynamic embedding: capacity;
+            for static embedding: vocabulary size.
+    """
+
     # for dynamic emb, it serves as capacity, while for static emb, it serves as vocab size
     feature_names: List[str]
     table_name: str
@@ -63,6 +99,25 @@ class BaseEmbeddingArgs:
 @gin.configurable
 @dataclass
 class EmbeddingArgs(BaseEmbeddingArgs):
+    """Embedding Configuration.
+
+    Base embedding layer configuration parameters.
+
+    Attributes:
+        feature_names (List[str]): **Required**. List of feature names.
+        table_name (str): **Required**. Embedding table name.
+        item_vocab_size_or_capacity (int): **Required**. For dynamic embedding: capacity;
+            for static embedding: vocabulary size.
+        sharding_type (str): Sharding type, must be "data_parallel" or "model_parallel".
+            Default: "None".
+
+    Note:
+        A table can be configured as only one of `EmbeddingArgs` or `DynamicEmbeddingArgs`.
+        When the movielen* or kuairand* datasets are used, the proper
+        `EmbeddingArgs`/`DynamicEmbeddingArgs` are predefined; setting the proper
+        `DatasetArgs.dataset_name` in the gin config file selects them automatically.
+        See `examples/hstu/training/trainer/utils.py::get_dataset_and_embedding_args()` for more details.
+    """
+
     sharding_type: str = "None"
 
     def __post_init__(self):
@@ -75,7 +130,31 @@ def __post_init__(self):
 @gin.configurable
 @dataclass
 class DynamicEmbeddingArgs(EmbeddingArgs):
-    # the precedence is global_hbm_for_values > item_vocab_gpu_capacity > item_vocab_gpu_capacity_ratio
+    """Dynamic Embedding Configuration.
+
+    Extends EmbeddingArgs with dynamic embedding-specific parameters.
+
+    Attributes:
+        global_hbm_for_values (Optional[int]): Global HBM size in bytes (highest priority).
+            Default: None.
+        item_vocab_gpu_capacity (Optional[float]): Item vocabulary GPU capacity
+            (second priority). Default: None.
+        item_vocab_gpu_capacity_ratio (Optional[float]): Item vocabulary GPU capacity ratio
+            (lowest priority). Default: None.
+        evict_strategy (str): Eviction strategy: "lru" or "lfu". Default: "lru".
+        caching (bool): Enable caching on HBM. When caching is enabled, the
+            global_hbm_for_values indicates the cache size. Default: False.
+
+    Note:
+        - sharding_type is automatically set to "model_parallel".
+        - Precedence: the first three params set the HBM size for dynamic
+          embedding, with precedence `global_hbm_for_values` > `item_vocab_gpu_capacity` >
+          `item_vocab_gpu_capacity_ratio`. When only `item_vocab_gpu_capacity_ratio` is given,
+          `item_vocab_gpu_capacity` = `item_vocab_gpu_capacity_ratio` * `item_vocab_size_or_capacity`,
+          and `global_hbm_for_values` is then deduced based on the optimizer and embedding dims.
+ """ + + # the precedence is `global_hbm_for_values` > `item_vocab_gpu_capacity` > `item_vocab_gpu_capacity_ratio` # without optimizer consideration global_hbm_for_values: Optional[int] = None item_vocab_gpu_capacity: Optional[float] = None @@ -107,6 +186,23 @@ def calculate_and_reset_global_hbm_for_values(self, hidden_size, multiplier=1): @gin.configurable @dataclass class DatasetArgs: + """Dataset Configuration. + + Dataset-related configuration parameters. + + Attributes: + dataset_name (str): **Required**. Dataset name. + max_sequence_length (int): **Required**. Maximum sequence length. + dataset_path (Optional[str]): Path to dataset. Default: None. + max_num_candidates (int): Maximum number of candidates. Default: 0. + shuffle (bool): Whether to shuffle data. Default: False. + + Note: + dataset_path could be None if your dataset is preprocessed and moved under + /hstu/tmp_data folder or you're running with BenchmarkDatasetArgs + which is an in-memory random data generator. + """ + dataset_name: str max_sequence_length: int dataset_path: Optional[str] = None @@ -117,6 +213,19 @@ class DatasetArgs: @gin.configurable @dataclass class FeatureArgs: + """Feature Configuration. + + Feature-specific configuration parameters. + + Attributes: + feature_names (List[str]): **Required**. List of feature names. + max_sequence_length (int): **Required**. Maximum sequence length. + is_jagged (bool): Whether features are jagged (variable length). Default: False. + + Note: + `FeatureArgs` are only used when the dataset is of `BenchmarkDatasetArgs` type. + """ + feature_names: List[str] max_sequence_length: int is_jagged: bool = False @@ -125,6 +234,20 @@ class FeatureArgs: @gin.configurable @dataclass class BenchmarkDatasetArgs: + """Benchmark Dataset Configuration. + + Configuration for benchmark datasets combining features and embeddings. + + Attributes: + feature_args (List[FeatureArgs]): **Required**. List of feature arguments. + embedding_args (List[Union[EmbeddingArgs, DynamicEmbeddingArgs]]): **Required**. + List of embedding arguments. + item_feature_name (str): **Required**. Item feature name. + contextual_feature_names (List[str]): **Required**. List of contextual feature names. + action_feature_name (Optional[str]): Action feature name. Default: None. + max_num_candidates (int): Maximum number of candidates. Default: 0. + """ + feature_args: List[FeatureArgs] embedding_args: List[Union[EmbeddingArgs, DynamicEmbeddingArgs]] item_feature_name: str @@ -136,6 +259,29 @@ class BenchmarkDatasetArgs: @gin.configurable @dataclass class NetworkArgs: + """Network Architecture Configuration. + + Neural network architecture parameters. + + Attributes: + num_layers (int): **Required**. Number of layers. + hidden_size (int): **Required**. Hidden layer size. + num_attention_heads (int): **Required**. Number of attention heads. + kv_channels (int): **Required**. Key-value channels. + hidden_dropout (float): Hidden layer dropout rate. Default: 0.2. + norm_epsilon (float): Normalization epsilon. Default: 1e-5. + is_causal (bool): Use causal attention mask. Default: True. + dtype_str (str): Data type: "bfloat16" or "float16". Default: "bfloat16". + kernel_backend (str): Kernel backend: "cutlass", "triton", or "pytorch". + Default: "cutlass". + target_group_size (int): Target group size. Default: 1. + num_position_buckets (int): Number of position buckets. Default: 8192. + recompute_input_layernorm (bool): Recompute input layer normalization. Default: False. 
+ recompute_input_silu (bool): Recompute input SiLU activation. Default: False. + item_embedding_dim (int): Item embedding dimension. Default: -1. + contextual_embedding_dim (int): Contextual embedding dimension. Default: -1. + """ + num_layers: int hidden_size: int num_attention_heads: int @@ -170,6 +316,18 @@ def __post_init__(self): @gin.configurable @dataclass class OptimizerArgs: + """Optimizer Configuration. + + Optimizer-related parameters. + + Attributes: + optimizer_str (str): **Required**. Optimizer name. + learning_rate (float): **Required**. Learning rate. + adam_beta1 (float): Adam optimizer beta1 parameter. Default: 0.9. + adam_beta2 (float): Adam optimizer beta2 parameter. Default: 0.999. + adam_eps (float): Adam optimizer epsilon parameter. Default: 1e-8. + """ + optimizer_str: str learning_rate: float adam_beta1: float = 0.9 @@ -180,12 +338,39 @@ class OptimizerArgs: @gin.configurable @dataclass class TensorModelParallelArgs: + """Tensor Model Parallelism Configuration. + + Tensor model parallelism settings. + + Attributes: + tensor_model_parallel_size (int): Tensor model parallel size (number of GPUs + for model sharding). Default: 1. + + Note: + The data parallel size is deduced based on the world_size and + tensor_model_parallel_size. + """ + tensor_model_parallel_size: int = 1 @gin.configurable @dataclass class RankingArgs: + """Ranking Task Configuration. + + Configuration specific to ranking tasks. + + Attributes: + prediction_head_arch (List[int]): **Required**. Prediction head architecture + (list of layer sizes). Default: None. + prediction_head_act_type (str): Prediction head activation type: "relu" or "gelu". + Default: "relu". + prediction_head_bias (bool): Whether to use bias in prediction head. Default: True. + num_tasks (int): Number of tasks (for multi-task learning). Default: 1. + eval_metrics (Tuple[str, ...]): Evaluation metrics tuple. Default: ("AUC",). + """ + prediction_head_arch: List[int] = cast(List[int], None) prediction_head_act_type: str = "relu" prediction_head_bias: bool = True @@ -206,6 +391,18 @@ def __post_init__(self): @gin.configurable @dataclass class RetrievalArgs: + """Retrieval Task Configuration. + + Configuration specific to retrieval tasks. + + Attributes: + num_negatives (int): Number of negative samples. Default: -1. + temperature (float): Temperature parameter for similarity scoring. Default: 0.05. + l2_norm_eps (float): Epsilon value for L2 normalization. Default: 1e-6. + eval_metrics (Tuple[str, ...]): Evaluation metrics tuple (Hit Rate, NDCG). + Default: ("HR@10", "NDCG@10"). + """ + ### retrieval num_negatives: int = -1 temperature = 0.05 diff --git a/pyproject.toml b/pyproject.toml index f55eae22..87927543 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,6 @@ [tool.mypy] exclude = [ + "examples/hstu/tmp_data", "examples/hstu/ops/triton_ops/*", "examples/hstu/ops/fused_hstu_op.py", "corelib/*",