From a155592256173ddd837fad5708d698b69c70057a Mon Sep 17 00:00:00 2001 From: JacoCheung Date: Fri, 24 Oct 2025 09:13:05 +0000 Subject: [PATCH 1/3] Refactor training folder and docs --- examples/commons/utils/logger.py | 20 +- examples/commons/utils/stringify.py | 6 +- examples/hstu/README.md | 2 +- examples/hstu/training/README.md | 51 ++++ examples/hstu/training/__init__.py | 2 - examples/hstu/training/benchmark/README.md | 9 +- .../benchmark/hstu_layer_benchmark.py | 2 +- .../benchmark/run_hstu_layer_benchmark.sh | 8 +- examples/hstu/training/pretrain_gr_ranking.py | 44 +-- .../hstu/training/pretrain_gr_retrieval.py | 53 ++-- examples/hstu/training/trainer/__init__.py | 0 .../{training_impl.py => trainer/training.py} | 2 +- .../{training_utils.py => trainer/utils.py} | 0 examples/hstu/utils/gin_args_doc.md | 250 ++++++++++++++++++ pyproject.toml | 1 + 15 files changed, 389 insertions(+), 61 deletions(-) create mode 100644 examples/hstu/training/README.md delete mode 100644 examples/hstu/training/__init__.py create mode 100644 examples/hstu/training/trainer/__init__.py rename examples/hstu/training/{training_impl.py => trainer/training.py} (99%) rename examples/hstu/training/{training_utils.py => trainer/utils.py} (100%) create mode 100644 examples/hstu/utils/gin_args_doc.md diff --git a/examples/commons/utils/logger.py b/examples/commons/utils/logger.py index 6f2f3225..e66ef6ab 100644 --- a/examples/commons/utils/logger.py +++ b/examples/commons/utils/logger.py @@ -12,16 +12,30 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from datetime import datetime +import logging import torch +from rich.console import Console +from rich.logging import RichHandler + +# Set up logger with RichHandler if not already configured + +console = Console() +_LOGGER = logging.getLogger("rich_rank0") + +if not _LOGGER.hasHandlers(): + handler = RichHandler( + console=console, show_time=True, show_path=False, rich_tracebacks=True + ) + _LOGGER.addHandler(handler) + _LOGGER.propagate = False + _LOGGER.setLevel(logging.INFO) def print_rank_0(message): """If distributed is initialized, print only on rank 0.""" if torch.distributed.is_initialized(): - now = datetime.now() if torch.distributed.get_rank() == 0: - print(f"[{now}] " + message, flush=True) + _LOGGER.info(message) else: print(message, flush=True) diff --git a/examples/commons/utils/stringify.py b/examples/commons/utils/stringify.py index ac834b38..93986bfc 100644 --- a/examples/commons/utils/stringify.py +++ b/examples/commons/utils/stringify.py @@ -34,11 +34,11 @@ def stringify_dict(input_dict, prefix="", sep=","): value.float() assert value.dim() == 0 value = value.cpu().item() - output += key + ":" + f"{value:6f}{sep}" + output += key + ": " + f"{value:6f}{sep}" elif isinstance(value, float): - output += key + ":" + f"{value:6f}{sep}" + output += key + ": " + f"{value:6f}{sep}" elif isinstance(value, int): - output += key + ":" + f"{value}{sep}" + output += key + ": " + f"{value}{sep}" else: assert RuntimeError(f"stringify dict not supports type {type(value)}") # remove the ending sep diff --git a/examples/hstu/README.md b/examples/hstu/README.md index 19e56e40..e2b2e9bf 100644 --- a/examples/hstu/README.md +++ b/examples/hstu/README.md @@ -1,4 +1,4 @@ -# Examples: to demonstrate how to train generative recommendation models +# Examples: to demonstrate how to do training and inference generative recommendation models 
## Generative Recommender Introduction
Meta's paper ["Actions Speak Louder Than Words"](https://arxiv.org/abs/2402.17152) introduces a novel paradigm for recommendation systems called **Generative Recommenders(GRs)**, which reformulates recommendation tasks as generative modeling problems. The work introduced Hierarchical Sequential Transduction Units (HSTU), a novel architecture designed to handle high-cardinality, non-stationary data streams in large-scale recommendation systems. HSTU enables both retrieval and ranking tasks. As noted in the paper, "HSTU-based GRs, with 1.5 trillion parameters, improve metrics in online A/B tests by 12.4% and have been deployed on multiple surfaces of a large internet platform with billions of users."
diff --git a/examples/hstu/training/README.md b/examples/hstu/training/README.md
new file mode 100644
index 00000000..4275590b
--- /dev/null
+++ b/examples/hstu/training/README.md
@@ -0,0 +1,51 @@
+# HSTU Training example
+
+We support both retrieval and ranking models whose backbones are HSTU layers. In this example collection, we allow users to specify the model structure via a gin-config file. Supported datasets are listed below. Regarding the gin-config interface, please refer to the [inline comments](../utils/gin_config_args.py).
+
+## Parallelism Introduction
+To support large embedding tables and the scaling laws of the dense HSTU layers, this example integrates **[TorchRec](https://github.com/pytorch/torchrec)**, which shards the embedding tables, and **[Megatron-LM](https://github.com/NVIDIA/Megatron-LM)**, which enables dense parallelism (e.g., Data, Tensor, Sequence, Pipeline, and Context parallelism).
+This integration ensures efficient training by coordinating sparse (embedding) and dense (context/data) parallelisms within a single model.
+![parallelism](../figs/parallelism.png)
+
+
+## Dataset Introduction
+
+We support several datasets, as listed in the following sections:
+
+### Dataset Information
+#### **MovieLens**
+Refer to [MovieLens 1M](https://grouplens.org/datasets/movielens/1m/) and [MovieLens 20M](https://www.kaggle.com/datasets/grouplens/movielens-20m-dataset) for details.
+#### **KuaiRand**
+
+| dataset       | # users | seqlen max | seqlen min | seqlen mean | seqlen median | # items    |
+|---------------|---------|------------|------------|-------------|---------------|------------|
+| kuairand_pure | 27285   | 910        | 1          | 1           | 39            | 7551       |
+| kuairand_1k   | 1000    | 49332      | 10         | 5038        | 3379          | 4369953    |
+| kuairand_27k  | 27285   | 228000     | 100        | 11796       | 8591          | 32038725   |
+
+Refer to [KuaiRand](https://kuairand.com/) for details.
+
+## Running the examples
+
+Before getting started, please make sure that all pre-requisites are fulfilled. You can refer to [Get Started][../../../README] section in the root directory of the repo to set up the environment.****
+
+
+### Start training
+The entrypoints for training are `pretrain_gr_retrieval.py` and `pretrain_gr_ranking.py`. We use gin-config to specify the model structure, training arguments, hyper-parameters, etc.
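For reference, a training gin file can look like the following minimal sketch. The values below are illustrative placeholders rather than the contents of the shipped `movielen_retrieval.gin`; the full set of configurable fields is documented in the inline comments of `gin_config_args.py`:

```python
# Illustrative gin bindings for a retrieval run (placeholder values, not the shipped config).
TrainerArgs.train_batch_size = 128
TrainerArgs.eval_batch_size = 128
TrainerArgs.max_train_iters = 1000

DatasetArgs.dataset_name = "ml-20m"
DatasetArgs.max_sequence_length = 200

# HSTU backbone: hidden_size = num_attention_heads * kv_channels here.
NetworkArgs.num_layers = 4
NetworkArgs.hidden_size = 256
NetworkArgs.num_attention_heads = 4
NetworkArgs.kv_channels = 64

OptimizerArgs.optimizer_str = "adam"
OptimizerArgs.learning_rate = 1e-3

RetrievalArgs.num_negatives = 128
```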
+ +Command to run retrieval task with `MovieLens 20m` dataset: + +```bash +# Before running the `pretrain_gr_retrieval.py`, make sure that current working directory is `hstu` +cd examples/hstu +PYTHONPATH=${PYTHONPATH}:$(realpath ../) torchrun --nproc_per_node 1 --master_addr localhost --master_port 6000 ./training/pretrain_gr_retrieval.py --gin-config-file ./training/configs/movielen_retrieval.gin +``` + +To run ranking task with `MovieLens 20m` dataset: +```bash +# Before running the `pretrain_gr_ranking.py`, make sure that current working directory is `hstu` +cd examples/hstu +PYTHONPATH=${PYTHONPATH}:$(realpath ../) torchrun --nproc_per_node 1 --master_addr localhost --master_port 6000 ./training/pretrain_gr_ranking.py --gin-config-file ./training/configs/movielen_ranking.gin +``` + + diff --git a/examples/hstu/training/__init__.py b/examples/hstu/training/__init__.py deleted file mode 100644 index 270ce28a..00000000 --- a/examples/hstu/training/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .training_impl import * # pylint: disable=wildcard-import -from .training_utils import * # pylint: disable=wildcard-import diff --git a/examples/hstu/training/benchmark/README.md b/examples/hstu/training/benchmark/README.md index 42f83069..27358ac9 100644 --- a/examples/hstu/training/benchmark/README.md +++ b/examples/hstu/training/benchmark/README.md @@ -13,7 +13,7 @@ You can run script `run_hstu_benchmark.sh` to see the performance over the base ## How to run -The test entry is `python ./benchmark/hstu_layer_benchmark.py run`, you can type `python ./benchmark/hstu_layer_benchmark.py run --help` to get the input arguments. 4 important arguments are : +The test entry is `python ./training/benchmark/hstu_layer_benchmark.py run`, you can type `python ./training/benchmark/hstu_layer_benchmark.py run --help` to get the input arguments. 4 important arguments are : 1. --kernel-backend: select the hstu mha backend. Could be `triton` or `cutlass`. 2. --fuse-norm-mul-dropout: knob of `layer norm + multiplication + dropout ` fusion. Could be `False` or `True` @@ -23,7 +23,9 @@ The test entry is `python ./benchmark/hstu_layer_benchmark.py run`, you can type Our baseline cmd example (1K): ```bash -python ./benchmark/hstu_layer_benchmark.py run \ + +cd recsys-examples/examples/hstu +python ./training/benchmark/hstu_layer_benchmark.py run \ --iters 100 \ --warmup-iters 50 \ --layer-type native \ @@ -40,7 +42,8 @@ python ./benchmark/hstu_layer_benchmark.py run \ You can also run a set of arguments with run.sh: ```bash -bash run_hstu_layer_benchmark.sh +cd recsys-examples/examples/hstu +bash ./training/benchmark/run_hstu_layer_benchmark.sh ``` After one run is done, a memory snapshot file in current working directory is generated, you can trace the memory usage with the file. Please refer to [PyTorch docs](https://docs.pytorch.org/docs/stable/torch_cuda_memory.html) on how to visualize the memory trace. 
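For reference, the memory snapshot can also be produced in a standalone script via PyTorch's memory-history hooks. The sketch below is illustrative and assumes PyTorch >= 2.1 with a CUDA device; the benchmark records its snapshot internally, so nothing here is required for the steps above:

```python
# Minimal sketch: record a CUDA memory snapshot around an arbitrary workload
# (assumes PyTorch >= 2.1; see the torch_cuda_memory docs linked above).
import torch

torch.cuda.memory._record_memory_history(max_entries=100_000)

# Stand-in workload; replace with the HSTU layer forward/backward of interest.
buffers = [torch.randn(1024, 1024, device="cuda") for _ in range(8)]
out = torch.stack(buffers).sum(dim=0)
torch.cuda.synchronize()

# Dump a pickle that can be loaded at https://pytorch.org/memory_viz
torch.cuda.memory._dump_snapshot("memory_snapshot.pickle")
torch.cuda.memory._record_memory_history(enabled=None)  # stop recording
```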
diff --git a/examples/hstu/training/benchmark/hstu_layer_benchmark.py b/examples/hstu/training/benchmark/hstu_layer_benchmark.py index c47742f6..1e2765ae 100644 --- a/examples/hstu/training/benchmark/hstu_layer_benchmark.py +++ b/examples/hstu/training/benchmark/hstu_layer_benchmark.py @@ -47,7 +47,7 @@ from modules.jagged_data import JaggedData from modules.native_hstu_layer import HSTULayer as NativeHSTULayer from ops.length_to_offsets import length_to_complete_offsets -from training.utils import cal_flops_single_rank +from training.trainer.utils import cal_flops_single_rank _backend_str_to_type = { "cutlass": KernelBackend.CUTLASS, diff --git a/examples/hstu/training/benchmark/run_hstu_layer_benchmark.sh b/examples/hstu/training/benchmark/run_hstu_layer_benchmark.sh index 3984dfe9..2bae6a38 100644 --- a/examples/hstu/training/benchmark/run_hstu_layer_benchmark.sh +++ b/examples/hstu/training/benchmark/run_hstu_layer_benchmark.sh @@ -32,7 +32,7 @@ for dim_per_head in "${dim_per_heads[@]}"; do fi echo -e "\n\033[32mbaseline hstu layer \033[0m:" ${nsys_profile_cmd//${baseline_profile_name}} \ - python ./benchmark/hstu_layer_benchmark.py run \ + python ./training/benchmark/hstu_layer_benchmark.py run \ --iters 100 \ --warmup-iters 50 \ --kernel-backend triton \ @@ -53,7 +53,7 @@ for dim_per_head in "${dim_per_heads[@]}"; do echo -e "\n\033[32m +cutlass\033[0m:" ${nsys_profile_cmd//${cutlass_profile_name}} \ - python ./benchmark/hstu_layer_benchmark.py run \ + python ./training/benchmark/hstu_layer_benchmark.py run \ --iters 100 \ --warmup-iters 50 \ --kernel-backend cutlass \ @@ -73,7 +73,7 @@ for dim_per_head in "${dim_per_heads[@]}"; do echo -e "\n\033[32m +fused\033[0m:" ${nsys_profile_cmd//${fused_profile_name}} \ - python ./benchmark/hstu_layer_benchmark.py run \ + python ./training/benchmark/hstu_layer_benchmark.py run \ --iters 100 \ --warmup-iters 50 \ --kernel-backend cutlass \ @@ -93,7 +93,7 @@ for dim_per_head in "${dim_per_heads[@]}"; do echo -e "\n\033[32m + recompute\033[0m:" ${nsys_profile_cmd//${recompute_profile_name}} \ - python ./benchmark/hstu_layer_benchmark.py run \ + python ./training/benchmark/hstu_layer_benchmark.py run \ --iters 100 \ --warmup-iters 50 \ --kernel-backend cutlass \ diff --git a/examples/hstu/training/pretrain_gr_ranking.py b/examples/hstu/training/pretrain_gr_ranking.py index 993fc753..9ff0b32e 100644 --- a/examples/hstu/training/pretrain_gr_ranking.py +++ b/examples/hstu/training/pretrain_gr_ranking.py @@ -18,7 +18,7 @@ warnings.filterwarnings("ignore", category=FutureWarning) warnings.filterwarnings("ignore", category=SyntaxWarning) import argparse -from functools import partial # pylint: disable-unused-import +from typing import List, Union import commons.utils.initialize as init import gin @@ -34,7 +34,8 @@ JaggedMegatronTrainNonePipeline, JaggedMegatronTrainPipelineSparseDist, ) -from training import ( +from trainer.training import maybe_load_ckpts, train_with_pipeline +from trainer.utils import ( create_dynamic_optitons_dict, create_embedding_configs, create_hstu_config, @@ -42,10 +43,11 @@ get_data_loader, get_dataset_and_embedding_args, get_embedding_vector_storage_multiplier, - maybe_load_ckpts, - train_with_pipeline, ) -from utils import ( +from utils import ( # from hstu.utils + BenchmarkDatasetArgs, + DatasetArgs, + EmbeddingArgs, NetworkArgs, OptimizerArgs, RankingArgs, @@ -53,20 +55,12 @@ TrainerArgs, ) -parser = argparse.ArgumentParser( - description="Distributed GR Arguments", allow_abbrev=False -) 
-parser.add_argument("--gin-config-file", type=str) -args = parser.parse_args() -gin.parse_config_file(args.gin_config_file) -trainer_args = TrainerArgs() -dataset_args, embedding_args = get_dataset_and_embedding_args() -network_args = NetworkArgs() -optimizer_args = OptimizerArgs() -tp_args = TensorModelParallelArgs() - -def create_ranking_config() -> RankingConfig: +def create_ranking_config( + dataset_args: Union[DatasetArgs, BenchmarkDatasetArgs], + network_args: NetworkArgs, + embedding_args: List[EmbeddingArgs], +) -> RankingConfig: ranking_args = RankingArgs() return RankingConfig( @@ -82,6 +76,18 @@ def create_ranking_config() -> RankingConfig: def main(): + parser = argparse.ArgumentParser( + description="HSTU Example Arguments", allow_abbrev=False + ) + parser.add_argument("--gin-config-file", type=str) + args = parser.parse_args() + gin.parse_config_file(args.gin_config_file) + trainer_args = TrainerArgs() + dataset_args, embedding_args = get_dataset_and_embedding_args() + network_args = NetworkArgs() + optimizer_args = OptimizerArgs() + tp_args = TensorModelParallelArgs() + init.initialize_distributed() init.initialize_model_parallel( tensor_model_parallel_size=tp_args.tensor_model_parallel_size @@ -92,7 +98,7 @@ def main(): f"distributed env initialization done. Free cuda memory: {free_memory / (1024 ** 2):.2f} MB" ) hstu_config = create_hstu_config(network_args, tp_args) - task_config = create_ranking_config() + task_config = create_ranking_config(dataset_args, network_args, embedding_args) model = get_ranking_model(hstu_config=hstu_config, task_config=task_config) dynamic_options_dict = create_dynamic_optitons_dict( diff --git a/examples/hstu/training/pretrain_gr_retrieval.py b/examples/hstu/training/pretrain_gr_retrieval.py index ec3d0486..c628c535 100644 --- a/examples/hstu/training/pretrain_gr_retrieval.py +++ b/examples/hstu/training/pretrain_gr_retrieval.py @@ -18,7 +18,7 @@ warnings.filterwarnings("ignore", category=FutureWarning) warnings.filterwarnings("ignore", category=SyntaxWarning) import argparse -from functools import partial # pylint: disable-unused-import +from typing import List, Union import commons.utils.initialize as init import gin @@ -32,18 +32,20 @@ JaggedMegatronTrainNonePipeline, JaggedMegatronTrainPipelineSparseDist, ) -from training import ( +from trainer.training import maybe_load_ckpts, train_with_pipeline +from trainer.utils import ( create_dynamic_optitons_dict, - create_embedding_config, + create_embedding_configs, create_hstu_config, create_optimizer_params, get_data_loader, get_dataset_and_embedding_args, get_embedding_vector_storage_multiplier, - maybe_load_ckpts, - train_with_pipeline, ) -from utils import ( +from utils import ( # from hstu.utils + BenchmarkDatasetArgs, + DatasetArgs, + EmbeddingArgs, NetworkArgs, OptimizerArgs, RetrievalArgs, @@ -51,27 +53,18 @@ TrainerArgs, ) -parser = argparse.ArgumentParser( - description="Distributed GR Arguments", allow_abbrev=False -) -parser.add_argument("--gin-config-file", type=str) -args = parser.parse_args() -gin.parse_config_file(args.gin_config_file) -trainer_args = TrainerArgs() -dataset_args, embedding_args = get_dataset_and_embedding_args() -network_args = NetworkArgs() -optimizer_args = OptimizerArgs() -tp_args = TensorModelParallelArgs() - -def create_retrieval_config() -> RetrievalConfig: +def create_retrieval_config( + dataset_args: Union[DatasetArgs, BenchmarkDatasetArgs], + network_args: NetworkArgs, + embedding_args: List[EmbeddingArgs], +) -> RetrievalConfig: retrieval_args = 
RetrievalArgs() return RetrievalConfig( - embedding_configs=[ - create_embedding_config(network_args.hidden_size, arg) - for arg in embedding_args - ], + embedding_configs=create_embedding_configs( + dataset_args, network_args, embedding_args + ), temperature=retrieval_args.temperature, l2_norm_eps=retrieval_args.l2_norm_eps, num_negatives=retrieval_args.num_negatives, @@ -80,6 +73,18 @@ def create_retrieval_config() -> RetrievalConfig: def main(): + parser = argparse.ArgumentParser( + description="Distributed GR Arguments", allow_abbrev=False + ) + parser.add_argument("--gin-config-file", type=str) + args = parser.parse_args() + gin.parse_config_file(args.gin_config_file) + trainer_args = TrainerArgs() + dataset_args, embedding_args = get_dataset_and_embedding_args() + network_args = NetworkArgs() + optimizer_args = OptimizerArgs() + tp_args = TensorModelParallelArgs() + init.initialize_distributed() init.initialize_model_parallel( tensor_model_parallel_size=tp_args.tensor_model_parallel_size @@ -87,7 +92,7 @@ def main(): init.set_random_seed(trainer_args.seed) hstu_config = create_hstu_config(network_args, tp_args) - task_config = create_retrieval_config() + task_config = create_retrieval_config(dataset_args, network_args, embedding_args) model = get_retrieval_model(hstu_config=hstu_config, task_config=task_config) dynamic_options_dict = create_dynamic_optitons_dict( diff --git a/examples/hstu/training/trainer/__init__.py b/examples/hstu/training/trainer/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/examples/hstu/training/training_impl.py b/examples/hstu/training/trainer/training.py similarity index 99% rename from examples/hstu/training/training_impl.py rename to examples/hstu/training/trainer/training.py index a3fe7a0b..a4b0254c 100644 --- a/examples/hstu/training/training_impl.py +++ b/examples/hstu/training/trainer/training.py @@ -31,7 +31,7 @@ JaggedMegatronTrainNonePipeline, JaggedMegatronTrainPipelineSparseDist, ) -from training.training_utils import cal_flops +from trainer.utils import cal_flops from utils import TrainerArgs diff --git a/examples/hstu/training/training_utils.py b/examples/hstu/training/trainer/utils.py similarity index 100% rename from examples/hstu/training/training_utils.py rename to examples/hstu/training/trainer/utils.py diff --git a/examples/hstu/utils/gin_args_doc.md b/examples/hstu/utils/gin_args_doc.md new file mode 100644 index 00000000..ff37215b --- /dev/null +++ b/examples/hstu/utils/gin_args_doc.md @@ -0,0 +1,250 @@ +# Gin Configurable Interfaces Documentation + +This document provides comprehensive documentation for all configurable hypara-params that used by both inference and training + + +## 1. TrainerArgs - Trainer Configuration + +Training-related parameters and settings. + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `train_batch_size` | int | - | **Required**. Batch size per GPU. When TP is enabled, the theoretical batch size is (train_batch_size × tp_size) | +| `eval_batch_size` | int | - | **Required**. 
Evaluation batch size | +| `eval_interval` | int | 100 | Evaluation interval in iterations | +| `log_interval` | int | 100 | Logging interval in iterations | +| `max_train_iters` | Optional[int] | None | Maximum training iterations | +| `max_eval_iters` | Optional[int] | None | Maximum evaluation iterations | +| `seed` | int | 1234 | Random seed | +| `profile` | bool | False | Enable profiling | +| `profile_step_start` | int | 100 | Profiling start step | +| `profile_step_end` | int | 200 | Profiling end step | +| `ckpt_save_interval` | int | -1 | Checkpoint save interval, -1 means no checkpoint saving | +| `ckpt_save_dir` | str | "./checkpoints" | Checkpoint save directory | +| `ckpt_load_dir` | str | "" | Checkpoint load directory | +| `pipeline_type` | str | "native" | Pipeline overlap type: `none` (no overlap), `native` (overlap h2d, input dist, fwd+bwd), `prefetch` (includes prefetch overlap) | + +--- + + +## 2. EmbeddingArgs - Embedding Configuration + +Base embedding layer configuration parameters. + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `feature_names` | List[str] | - | **Required**. List of feature names | +| `table_name` | str | - | **Required**. Embedding table name | +| `item_vocab_size_or_capacity` | int | - | **Required**. For dynamic embedding: capacity; for static embedding: vocabulary size | +| `sharding_type` | str | "None" | Sharding type, must be "data_parallel" or "model_parallel" | + +--- + +## 3. DynamicEmbeddingArgs - Dynamic Embedding Configuration + +Extends `EmbeddingArgs` with dynamic embedding-specific parameters. + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `global_hbm_for_values` | Optional[int] | None | Global HBM size in bytes (highest priority) | +| `item_vocab_gpu_capacity` | Optional[float] | None | Item vocabulary GPU capacity (second priority) | +| `item_vocab_gpu_capacity_ratio` | Optional[float] | None | Item vocabulary GPU capacity ratio (lowest priority) | +| `evict_strategy` | str | "lru" | Eviction strategy: "lru" or "lfu" | +| `caching` | bool | False | Enable caching on HMB. When caching is enabled, the global_hbm_for_values indicates the cache size | + +**Note**: `sharding_type` is automatically set to "model_parallel" + +**Precedence**: The first 3 params can be used for setting the HBM size for dynamic embedding, but there is a precedence relationship: `global_hbm_for_values` > `item_vocab_gpu_capacity` > `item_vocab_gpu_capacity_ratio`. When only `item_vocab_gpu_capacity_ratio` is given, `item_vocab_gpu_capacity = item_vocab_gpu_capacity_ratio * item_vocab_size_or_capacity` and `global_hbm_for_values` are deduced based on the optimizer and embedding dims. + +**Note**: A table could be only one of type EmbeddingArgs or DynamicEmbeddingArgs. + +**Note**: When movielen\* or kuairand\* dataset are used, DynamicEmbeddingArgs/EmbeddingArgs are predefined. See [get_dataset_and_embedding_args() func](../hstu/training/trainer/utils.py) + +--- + +## 4. DatasetArgs - Dataset Configuration + +Dataset-related configuration parameters. + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `dataset_name` | str | - | **Required**. Dataset name | +| `max_sequence_length` | int | - | **Required**. 
Maximum sequence length | +| `dataset_path` | Optional[str] | None | Path to dataset | +| `max_num_candidates` | int | 0 | Maximum number of candidates | +| `shuffle` | bool | False | Whether to shuffle data | + +**Note**: `dataset_path` could be none if your dataset is preprocessed and moved under /hstu/tmp_data folder or you're running with `BenchmarkDatasetArgs` which is a in-memory random data generator. Please refer to [example](../hstu/training/configs/benchmark_ranking.gin). + + +--- + +## 5. FeatureArgs - Feature Configuration + +Feature-specific configuration parameters. + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `feature_names` | List[str] | - | **Required**. List of feature names | +| `max_sequence_length` | int | - | **Required**. Maximum sequence length | +| `is_jagged` | bool | False | Whether features are jagged (variable length) | + FeatureArgs and DatasetArgs + +**Note**: `FeatureArgs` are only used when the dataset is of `BenchmarkDatasetArgs`. + +--- +## 6. BenchmarkDatasetArgs - Benchmark Dataset Configuration + +Configuration for benchmark datasets combining features and embeddings. + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `feature_args` | List[FeatureArgs] | - | **Required**. List of feature arguments | +| `embedding_args` | List[Union[EmbeddingArgs, DynamicEmbeddingArgs]] | - | **Required**. List of embedding arguments | +| `item_feature_name` | str | - | **Required**. Item feature name | +| `contextual_feature_names` | List[str] | - | **Required**. List of contextual feature names | +| `action_feature_name` | Optional[str] | None | Action feature name | +| `max_num_candidates` | int | 0 | Maximum number of candidates | + +--- + +## 7. NetworkArgs - Network Architecture Configuration + +Neural network architecture parameters. + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `num_layers` | int | - | **Required**. Number of layers | +| `hidden_size` | int | - | **Required**. Hidden layer size | +| `num_attention_heads` | int | - | **Required**. Number of attention heads | +| `kv_channels` | int | - | **Required**. Key-value channels | +| `hidden_dropout` | float | 0.2 | Hidden layer dropout rate | +| `norm_epsilon` | float | 1e-5 | Normalization epsilon | +| `is_causal` | bool | True | Use causal attention mask | +| `dtype_str` | str | "bfloat16" | Data type: "bfloat16" or "float16" | +| `kernel_backend` | str | "cutlass" | Kernel backend: "cutlass", "triton", or "pytorch" | +| `target_group_size` | int | 1 | Target group size | +| `num_position_buckets` | int | 8192 | Number of position buckets | +| `recompute_input_layernorm` | bool | False | Recompute input layer normalization | +| `recompute_input_silu` | bool | False | Recompute input SiLU activation | +| `item_embedding_dim` | int | -1 | Item embedding dimension | +| `contextual_embedding_dim` | int | -1 | Contextual embedding dimension | + +--- + +## 8. OptimizerArgs - Optimizer Configuration + +Optimizer-related parameters. + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `optimizer_str` | str | - | **Required**. Optimizer name | +| `learning_rate` | float | - | **Required**. Learning rate | +| `adam_beta1` | float | 0.9 | Adam optimizer beta1 parameter | +| `adam_beta2` | float | 0.999 | Adam optimizer beta2 parameter | +| `adam_eps` | float | 1e-8 | Adam optimizer epsilon parameter | + +--- + +## 9. 
TensorModelParallelArgs - Tensor Model Parallelism Configuration + +Tensor model parallelism settings. + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `tensor_model_parallel_size` | int | 1 | Tensor model parallel size (number of GPUs for model sharding) | + +**Note**: The data parallel size is deduced based on the `world_size` and `tensor_model_parallel_size`. + +--- + +## 10. RankingArgs - Ranking Task Configuration + +Configuration specific to ranking tasks. + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `prediction_head_arch` | List[int] | None | **Required**. Prediction head architecture (list of layer sizes) | +| `prediction_head_act_type` | str | "relu" | Prediction head activation type: "relu" or "gelu" | +| `prediction_head_bias` | bool | True | Whether to use bias in prediction head | +| `num_tasks` | int | 1 | Number of tasks (for multi-task learning) | +| `eval_metrics` | Tuple[str, ...] | ("AUC",) | Evaluation metrics tuple | + +--- + +## 11. RetrievalArgs - Retrieval Task Configuration + +Configuration specific to retrieval tasks. + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `num_negatives` | int | -1 | Number of negative samples | +| `temperature` | float | 0.05 | Temperature parameter for similarity scoring | +| `l2_norm_eps` | float | 1e-6 | Epsilon value for L2 normalization | +| `eval_metrics` | Tuple[str, ...] | ("HR@10", "NDCG@10") | Evaluation metrics tuple (Hit Rate, NDCG) | + +--- + +## Usage Examples + +### Example 1: Basic Configuration + +```python +# In your .gin config file + +# Trainer configuration +TrainerArgs.train_batch_size = 256 +TrainerArgs.eval_batch_size = 512 +TrainerArgs.max_train_iters = 10000 +TrainerArgs.pipeline_type = "prefetch" + +# Network configuration +NetworkArgs.num_layers = 4 +NetworkArgs.hidden_size = 256 +NetworkArgs.num_attention_heads = 8 +NetworkArgs.kv_channels = 32 +NetworkArgs.dtype_str = "bfloat16" + +# Optimizer configuration +OptimizerArgs.optimizer_str = "adam" +OptimizerArgs.learning_rate = 0.001 +``` + +### Example 2: Ranking Task Configuration + +```python +# Dataset +DatasetArgs.dataset_name = "criteo" +DatasetArgs.max_sequence_length = 128 + +# Ranking model +RankingArgs.prediction_head_arch = [512, 256, 1] +RankingArgs.prediction_head_act_type = "relu" +RankingArgs.eval_metrics = ("AUC") + +# Embeddings +EmbeddingArgs.feature_names = ["item_id", "category"] +EmbeddingArgs.table_name = "item_table" +EmbeddingArgs.item_vocab_size_or_capacity = 1000000 +EmbeddingArgs.sharding_type = "data_parallel" +``` + +### Example 3: Retrieval Task with Dynamic Embedding + +```python +# Retrieval configuration +RetrievalArgs.num_negatives = 100 +RetrievalArgs.temperature = 0.05 +RetrievalArgs.eval_metrics = ("HR@10", "HR@50", "NDCG@10") + +# Dynamic embedding +DynamicEmbeddingArgs.feature_names = ["user_id", "item_id"] +DynamicEmbeddingArgs.table_name = "user_item_table" +DynamicEmbeddingArgs.item_vocab_size_or_capacity = 10000000 +DynamicEmbeddingArgs.item_vocab_gpu_capacity_ratio = 0.1 +DynamicEmbeddingArgs.evict_strategy = "lru" +DynamicEmbeddingArgs.caching = True +``` + +--- + diff --git a/pyproject.toml b/pyproject.toml index f55eae22..87927543 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,6 @@ [tool.mypy] exclude = [ + "examples/hstu/tmp_data", "examples/hstu/ops/triton_ops/*", "examples/hstu/ops/fused_hstu_op.py", "corelib/*", From 
b4cf17506791fc4906729571a8061470351492e1 Mon Sep 17 00:00:00 2001 From: JacoCheung Date: Mon, 27 Oct 2025 05:18:32 +0000 Subject: [PATCH 2/3] Move root RM env setting up to training --- README.md | 38 ---- examples/hstu/training/README.md | 50 ++++- examples/hstu/utils/gin_args_doc.md | 250 ------------------------- examples/hstu/utils/gin_config_args.py | 199 +++++++++++++++++++- 4 files changed, 246 insertions(+), 291 deletions(-) delete mode 100644 examples/hstu/utils/gin_args_doc.md diff --git a/README.md b/README.md index 08b47440..8c1a47ec 100644 --- a/README.md +++ b/README.md @@ -35,44 +35,6 @@ The project includes: For more detailed release notes, please refer our [releases](https://github.com/NVIDIA/recsys-examples/releases). -# Environment Setup -## Start from dockerfile - -We provide [dockerfile](./docker/Dockerfile) for users to build environment. -``` -docker build -f docker/Dockerfile --platform linux/amd64 -t recsys-examples:latest . -``` -If you want to build image for Grace, you can use -``` -docker build -f docker/Dockerfile --platform linux/arm64 -t recsys-examples:latest . -``` -You can also set your own base image with args `--build-arg `. - -## Start from source file -Before running examples, build and install libs under corelib following instruction in documentation: -- [HSTU attention documentation](./corelib/hstu/README.md) -- [Dynamic Embeddings documentation](./corelib/dynamicemb/README.md) - -On top of those two core libs, Megatron-Core along with other libs are required. You can install them via pypi package: - -```bash -pip install torchx gin-config torchmetrics==1.0.3 typing-extensions iopath megatron-core==0.9.0 -``` - -If you fail to install the megatron-core package, usually due to the python version incompatibility, please try to clone and then install the source code. - -```bash -git clone -b core_r0.9.0 https://github.com/NVIDIA/Megatron-LM.git megatron-lm && \ -pip install -e ./megatron-lm -``` - -We provide our custom HSTU CUDA operators for enhanced performance. You need to install these operators using the following command: - -```bash -cd /workspace/recsys-examples/examples/hstu && \ -python setup.py install -``` - # Get Started The examples we supported: - [HSTU recommender examples](./examples/hstu/README.md) diff --git a/examples/hstu/training/README.md b/examples/hstu/training/README.md index 4275590b..385f13e7 100644 --- a/examples/hstu/training/README.md +++ b/examples/hstu/training/README.md @@ -7,8 +7,44 @@ To facilitate large embedding tables and scaling-laws of HSTU dense, we have int This integration ensures efficient training by coordinating sparse (embedding) and dense (context/data) parallelisms within a single model. ![parallelism](../figs/parallelism.png) +## Environment Setup +### Start from dockerfile -## Dataset Introduction +We provide [dockerfile](./docker/Dockerfile) for users to build environment. +``` +docker build -f docker/Dockerfile --platform linux/amd64 -t recsys-examples:latest . +``` +If you want to build image for Grace, you can use +``` +docker build -f docker/Dockerfile --platform linux/arm64 -t recsys-examples:latest . +``` +You can also set your own base image with args `--build-arg `. 
+
+### Start from source file
+Before running examples, build and install the libs under corelib following the instructions in the documentation:
+- [HSTU attention documentation](./corelib/hstu/README.md)
+- [Dynamic Embeddings documentation](./corelib/dynamicemb/README.md)
+
+On top of those two core libs, Megatron-Core and a few other libs are required. You can install them via pypi packages:
+
+```bash
+pip install torchx gin-config torchmetrics==1.0.3 typing-extensions iopath megatron-core==0.9.0
+```
+
+If you fail to install the megatron-core package, usually due to a Python version incompatibility, please try cloning and installing it from source.
+
+```bash
+git clone -b core_r0.9.0 https://github.com/NVIDIA/Megatron-LM.git megatron-lm && \
+pip install -e ./megatron-lm
+```
+
+We provide our custom HSTU CUDA operators for enhanced performance. You need to install these operators using the following command:
+
+```bash
+cd /workspace/recsys-examples/examples/hstu && \
+python setup.py install
+```
+### Dataset Introduction
 
 We support several datasets, as listed in the following sections:
 
@@ -27,8 +63,18 @@ Refer to [KuaiRand](https://kuairand.com/) for details.
 
 ## Running the examples
 
-Before getting started, please make sure that all pre-requisites are fulfilled. You can refer to [Get Started][../../../README] section in the root directory of the repo to set up the environment.****
+Before getting started, please make sure that all pre-requisites are fulfilled. You can refer to the [Get Started](../../../README) section in the root directory of the repo to set up the environment.
+
+### Dataset preprocessing
+
+To prepare a dataset for training, use the `preprocessor.py` script under the hstu example folder of the project.
+
+```bash
+cd /examples/hstu &&
+mkdir -p ./tmp_data && python3 ./preprocessor.py --dataset_name <"ml-1m"|"ml-20m"|"kuairand-pure"|"kuairand-1k"|"kuairand-27k">
+
+```
 ### Start training
 The entrypoints for training are `pretrain_gr_retrieval.py` and `pretrain_gr_ranking.py`. We use gin-config to specify the model structure, training arguments, hyper-parameters, etc.
diff --git a/examples/hstu/utils/gin_args_doc.md b/examples/hstu/utils/gin_args_doc.md
deleted file mode 100644
index ff37215b..00000000
--- a/examples/hstu/utils/gin_args_doc.md
+++ /dev/null
@@ -1,250 +0,0 @@
-# Gin Configurable Interfaces Documentation
-
-This document provides comprehensive documentation for all configurable hypara-params that used by both inference and training
-
-
-## 1. TrainerArgs - Trainer Configuration
-
-Training-related parameters and settings.
-
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| `train_batch_size` | int | - | **Required**. Batch size per GPU. When TP is enabled, the theoretical batch size is (train_batch_size × tp_size) |
-| `eval_batch_size` | int | - | **Required**. 
Evaluation batch size | -| `eval_interval` | int | 100 | Evaluation interval in iterations | -| `log_interval` | int | 100 | Logging interval in iterations | -| `max_train_iters` | Optional[int] | None | Maximum training iterations | -| `max_eval_iters` | Optional[int] | None | Maximum evaluation iterations | -| `seed` | int | 1234 | Random seed | -| `profile` | bool | False | Enable profiling | -| `profile_step_start` | int | 100 | Profiling start step | -| `profile_step_end` | int | 200 | Profiling end step | -| `ckpt_save_interval` | int | -1 | Checkpoint save interval, -1 means no checkpoint saving | -| `ckpt_save_dir` | str | "./checkpoints" | Checkpoint save directory | -| `ckpt_load_dir` | str | "" | Checkpoint load directory | -| `pipeline_type` | str | "native" | Pipeline overlap type: `none` (no overlap), `native` (overlap h2d, input dist, fwd+bwd), `prefetch` (includes prefetch overlap) | - ---- - - -## 2. EmbeddingArgs - Embedding Configuration - -Base embedding layer configuration parameters. - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `feature_names` | List[str] | - | **Required**. List of feature names | -| `table_name` | str | - | **Required**. Embedding table name | -| `item_vocab_size_or_capacity` | int | - | **Required**. For dynamic embedding: capacity; for static embedding: vocabulary size | -| `sharding_type` | str | "None" | Sharding type, must be "data_parallel" or "model_parallel" | - ---- - -## 3. DynamicEmbeddingArgs - Dynamic Embedding Configuration - -Extends `EmbeddingArgs` with dynamic embedding-specific parameters. - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `global_hbm_for_values` | Optional[int] | None | Global HBM size in bytes (highest priority) | -| `item_vocab_gpu_capacity` | Optional[float] | None | Item vocabulary GPU capacity (second priority) | -| `item_vocab_gpu_capacity_ratio` | Optional[float] | None | Item vocabulary GPU capacity ratio (lowest priority) | -| `evict_strategy` | str | "lru" | Eviction strategy: "lru" or "lfu" | -| `caching` | bool | False | Enable caching on HMB. When caching is enabled, the global_hbm_for_values indicates the cache size | - -**Note**: `sharding_type` is automatically set to "model_parallel" - -**Precedence**: The first 3 params can be used for setting the HBM size for dynamic embedding, but there is a precedence relationship: `global_hbm_for_values` > `item_vocab_gpu_capacity` > `item_vocab_gpu_capacity_ratio`. When only `item_vocab_gpu_capacity_ratio` is given, `item_vocab_gpu_capacity = item_vocab_gpu_capacity_ratio * item_vocab_size_or_capacity` and `global_hbm_for_values` are deduced based on the optimizer and embedding dims. - -**Note**: A table could be only one of type EmbeddingArgs or DynamicEmbeddingArgs. - -**Note**: When movielen\* or kuairand\* dataset are used, DynamicEmbeddingArgs/EmbeddingArgs are predefined. See [get_dataset_and_embedding_args() func](../hstu/training/trainer/utils.py) - ---- - -## 4. DatasetArgs - Dataset Configuration - -Dataset-related configuration parameters. - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `dataset_name` | str | - | **Required**. Dataset name | -| `max_sequence_length` | int | - | **Required**. 
Maximum sequence length | -| `dataset_path` | Optional[str] | None | Path to dataset | -| `max_num_candidates` | int | 0 | Maximum number of candidates | -| `shuffle` | bool | False | Whether to shuffle data | - -**Note**: `dataset_path` could be none if your dataset is preprocessed and moved under /hstu/tmp_data folder or you're running with `BenchmarkDatasetArgs` which is a in-memory random data generator. Please refer to [example](../hstu/training/configs/benchmark_ranking.gin). - - ---- - -## 5. FeatureArgs - Feature Configuration - -Feature-specific configuration parameters. - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `feature_names` | List[str] | - | **Required**. List of feature names | -| `max_sequence_length` | int | - | **Required**. Maximum sequence length | -| `is_jagged` | bool | False | Whether features are jagged (variable length) | - FeatureArgs and DatasetArgs - -**Note**: `FeatureArgs` are only used when the dataset is of `BenchmarkDatasetArgs`. - ---- -## 6. BenchmarkDatasetArgs - Benchmark Dataset Configuration - -Configuration for benchmark datasets combining features and embeddings. - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `feature_args` | List[FeatureArgs] | - | **Required**. List of feature arguments | -| `embedding_args` | List[Union[EmbeddingArgs, DynamicEmbeddingArgs]] | - | **Required**. List of embedding arguments | -| `item_feature_name` | str | - | **Required**. Item feature name | -| `contextual_feature_names` | List[str] | - | **Required**. List of contextual feature names | -| `action_feature_name` | Optional[str] | None | Action feature name | -| `max_num_candidates` | int | 0 | Maximum number of candidates | - ---- - -## 7. NetworkArgs - Network Architecture Configuration - -Neural network architecture parameters. - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `num_layers` | int | - | **Required**. Number of layers | -| `hidden_size` | int | - | **Required**. Hidden layer size | -| `num_attention_heads` | int | - | **Required**. Number of attention heads | -| `kv_channels` | int | - | **Required**. Key-value channels | -| `hidden_dropout` | float | 0.2 | Hidden layer dropout rate | -| `norm_epsilon` | float | 1e-5 | Normalization epsilon | -| `is_causal` | bool | True | Use causal attention mask | -| `dtype_str` | str | "bfloat16" | Data type: "bfloat16" or "float16" | -| `kernel_backend` | str | "cutlass" | Kernel backend: "cutlass", "triton", or "pytorch" | -| `target_group_size` | int | 1 | Target group size | -| `num_position_buckets` | int | 8192 | Number of position buckets | -| `recompute_input_layernorm` | bool | False | Recompute input layer normalization | -| `recompute_input_silu` | bool | False | Recompute input SiLU activation | -| `item_embedding_dim` | int | -1 | Item embedding dimension | -| `contextual_embedding_dim` | int | -1 | Contextual embedding dimension | - ---- - -## 8. OptimizerArgs - Optimizer Configuration - -Optimizer-related parameters. - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `optimizer_str` | str | - | **Required**. Optimizer name | -| `learning_rate` | float | - | **Required**. Learning rate | -| `adam_beta1` | float | 0.9 | Adam optimizer beta1 parameter | -| `adam_beta2` | float | 0.999 | Adam optimizer beta2 parameter | -| `adam_eps` | float | 1e-8 | Adam optimizer epsilon parameter | - ---- - -## 9. 
TensorModelParallelArgs - Tensor Model Parallelism Configuration - -Tensor model parallelism settings. - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `tensor_model_parallel_size` | int | 1 | Tensor model parallel size (number of GPUs for model sharding) | - -**Note**: The data parallel size is deduced based on the `world_size` and `tensor_model_parallel_size`. - ---- - -## 10. RankingArgs - Ranking Task Configuration - -Configuration specific to ranking tasks. - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `prediction_head_arch` | List[int] | None | **Required**. Prediction head architecture (list of layer sizes) | -| `prediction_head_act_type` | str | "relu" | Prediction head activation type: "relu" or "gelu" | -| `prediction_head_bias` | bool | True | Whether to use bias in prediction head | -| `num_tasks` | int | 1 | Number of tasks (for multi-task learning) | -| `eval_metrics` | Tuple[str, ...] | ("AUC",) | Evaluation metrics tuple | - ---- - -## 11. RetrievalArgs - Retrieval Task Configuration - -Configuration specific to retrieval tasks. - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `num_negatives` | int | -1 | Number of negative samples | -| `temperature` | float | 0.05 | Temperature parameter for similarity scoring | -| `l2_norm_eps` | float | 1e-6 | Epsilon value for L2 normalization | -| `eval_metrics` | Tuple[str, ...] | ("HR@10", "NDCG@10") | Evaluation metrics tuple (Hit Rate, NDCG) | - ---- - -## Usage Examples - -### Example 1: Basic Configuration - -```python -# In your .gin config file - -# Trainer configuration -TrainerArgs.train_batch_size = 256 -TrainerArgs.eval_batch_size = 512 -TrainerArgs.max_train_iters = 10000 -TrainerArgs.pipeline_type = "prefetch" - -# Network configuration -NetworkArgs.num_layers = 4 -NetworkArgs.hidden_size = 256 -NetworkArgs.num_attention_heads = 8 -NetworkArgs.kv_channels = 32 -NetworkArgs.dtype_str = "bfloat16" - -# Optimizer configuration -OptimizerArgs.optimizer_str = "adam" -OptimizerArgs.learning_rate = 0.001 -``` - -### Example 2: Ranking Task Configuration - -```python -# Dataset -DatasetArgs.dataset_name = "criteo" -DatasetArgs.max_sequence_length = 128 - -# Ranking model -RankingArgs.prediction_head_arch = [512, 256, 1] -RankingArgs.prediction_head_act_type = "relu" -RankingArgs.eval_metrics = ("AUC") - -# Embeddings -EmbeddingArgs.feature_names = ["item_id", "category"] -EmbeddingArgs.table_name = "item_table" -EmbeddingArgs.item_vocab_size_or_capacity = 1000000 -EmbeddingArgs.sharding_type = "data_parallel" -``` - -### Example 3: Retrieval Task with Dynamic Embedding - -```python -# Retrieval configuration -RetrievalArgs.num_negatives = 100 -RetrievalArgs.temperature = 0.05 -RetrievalArgs.eval_metrics = ("HR@10", "HR@50", "NDCG@10") - -# Dynamic embedding -DynamicEmbeddingArgs.feature_names = ["user_id", "item_id"] -DynamicEmbeddingArgs.table_name = "user_item_table" -DynamicEmbeddingArgs.item_vocab_size_or_capacity = 10000000 -DynamicEmbeddingArgs.item_vocab_gpu_capacity_ratio = 0.1 -DynamicEmbeddingArgs.evict_strategy = "lru" -DynamicEmbeddingArgs.caching = True -``` - ---- - diff --git a/examples/hstu/utils/gin_config_args.py b/examples/hstu/utils/gin_config_args.py index 7d5738e0..d47501b1 100644 --- a/examples/hstu/utils/gin_config_args.py +++ b/examples/hstu/utils/gin_config_args.py @@ -21,6 +21,31 @@ @gin.configurable @dataclass class TrainerArgs: + """Trainer Configuration. 
+ + Training-related parameters and settings. + + Attributes: + train_batch_size (int): **Required**. Batch size per GPU. When TP is enabled, + the theoretical batch size is (train_batch_size × tp_size). + eval_batch_size (int): **Required**. Evaluation batch size. + eval_interval (int): Evaluation interval in iterations. Default: 100. + log_interval (int): Logging interval in iterations. Default: 100. + max_train_iters (Optional[int]): Maximum training iterations. Default: None. + max_eval_iters (Optional[int]): Maximum evaluation iterations. Default: None. + seed (int): Random seed. Default: 1234. + profile (bool): Enable profiling. Default: False. + profile_step_start (int): Profiling start step. Default: 100. + profile_step_end (int): Profiling end step. Default: 200. + ckpt_save_interval (int): Checkpoint save interval, -1 means no checkpoint saving. + Default: -1. + ckpt_save_dir (str): Checkpoint save directory. Default: "./checkpoints". + ckpt_load_dir (str): Checkpoint load directory. Default: "". + pipeline_type (str): Pipeline overlap type: 'none' (no overlap), 'native' + (overlap h2d, input dist, fwd+bwd), 'prefetch' (includes prefetch overlap). + Default: "native". + """ + # below batchsize is batchsize_per_gpu # when TP is enabled, the theoratical batchsize is (train_batch_size * tp_size) train_batch_size: int @@ -54,6 +79,17 @@ def __post_init__(self): @dataclass class BaseEmbeddingArgs: + """Base Embedding Arguments. + + Base class for embedding configuration parameters. + + Attributes: + feature_names (List[str]): List of feature names. + table_name (str): Embedding table name. + item_vocab_size_or_capacity (int): For dynamic embedding: capacity; + for static embedding: vocabulary size. + """ + # for dynamic emb, it serves as capacity, while for static emb, it serves as vocab size feature_names: List[str] table_name: str @@ -63,6 +99,25 @@ class BaseEmbeddingArgs: @gin.configurable @dataclass class EmbeddingArgs(BaseEmbeddingArgs): + """Embedding Configuration. + + Base embedding layer configuration parameters. + + Attributes: + feature_names (List[str]): **Required**. List of feature names. + table_name (str): **Required**. Embedding table name. + item_vocab_size_or_capacity (int): **Required**. For dynamic embedding: capacity; + for static embedding: vocabulary size. + sharding_type (str): Sharding type, must be "data_parallel" or "model_parallel". + Default: "None". + + Note: + A table could be only one of type `EmbeddingArgs` or `DynamicEmbeddingArgs`. + When movielen* or kuairand* datasets are used, `DynamicEmbeddingArgs`/`EmbeddingArgs` + are predefined. Setting the proper DatasetArgs.dataset_name in the gin config file will automatically set the proper EmbeddingArgs/DynamicEmbeddingArgs. + See `examples/hstu/training/trainer/utils.py::get_dataset_and_embedding_args()` for more details. + """ + sharding_type: str = "None" def __post_init__(self): @@ -75,7 +130,31 @@ def __post_init__(self): @gin.configurable @dataclass class DynamicEmbeddingArgs(EmbeddingArgs): - # the precedence is global_hbm_for_values > item_vocab_gpu_capacity > item_vocab_gpu_capacity_ratio + """Dynamic Embedding Configuration. + + Extends EmbeddingArgs with dynamic embedding-specific parameters. + + Attributes: + global_hbm_for_values (Optional[int]): Global HBM size in bytes (highest priority). + Default: None. + item_vocab_gpu_capacity (Optional[float]): Item vocabulary GPU capacity + (second priority). Default: None. 
+ item_vocab_gpu_capacity_ratio (Optional[float]): Item vocabulary GPU capacity ratio + (lowest priority). Default: None. + evict_strategy (str): Eviction strategy: "lru" or "lfu". Default: "lru". + caching (bool): Enable caching on HBM. When caching is enabled, the + global_hbm_for_values indicates the cache size. Default: False. + + Note: + - sharding_type is automatically set to "model_parallel". + - Precedence: The first 3 params can be used for setting the HBM size for dynamic + embedding, with precedence: `global_hbm_for_values` > `item_vocab_gpu_capacity` > + item_vocab_gpu_capacity_ratio. When only item_vocab_gpu_capacity_ratio is given, + `item_vocab_gpu_capacity` = `item_vocab_gpu_capacity_ratio` * `item_vocab_size_or_capacity` + and `global_hbm_for_values` are deduced based on the optimizer and embedding dims. + """ + + # the precedence is `global_hbm_for_values` > `item_vocab_gpu_capacity` > `item_vocab_gpu_capacity_ratio` # without optimizer consideration global_hbm_for_values: Optional[int] = None item_vocab_gpu_capacity: Optional[float] = None @@ -107,6 +186,23 @@ def calculate_and_reset_global_hbm_for_values(self, hidden_size, multiplier=1): @gin.configurable @dataclass class DatasetArgs: + """Dataset Configuration. + + Dataset-related configuration parameters. + + Attributes: + dataset_name (str): **Required**. Dataset name. + max_sequence_length (int): **Required**. Maximum sequence length. + dataset_path (Optional[str]): Path to dataset. Default: None. + max_num_candidates (int): Maximum number of candidates. Default: 0. + shuffle (bool): Whether to shuffle data. Default: False. + + Note: + dataset_path could be None if your dataset is preprocessed and moved under + /hstu/tmp_data folder or you're running with BenchmarkDatasetArgs + which is an in-memory random data generator. + """ + dataset_name: str max_sequence_length: int dataset_path: Optional[str] = None @@ -117,6 +213,19 @@ class DatasetArgs: @gin.configurable @dataclass class FeatureArgs: + """Feature Configuration. + + Feature-specific configuration parameters. + + Attributes: + feature_names (List[str]): **Required**. List of feature names. + max_sequence_length (int): **Required**. Maximum sequence length. + is_jagged (bool): Whether features are jagged (variable length). Default: False. + + Note: + `FeatureArgs` are only used when the dataset is of `BenchmarkDatasetArgs` type. + """ + feature_names: List[str] max_sequence_length: int is_jagged: bool = False @@ -125,6 +234,20 @@ class FeatureArgs: @gin.configurable @dataclass class BenchmarkDatasetArgs: + """Benchmark Dataset Configuration. + + Configuration for benchmark datasets combining features and embeddings. + + Attributes: + feature_args (List[FeatureArgs]): **Required**. List of feature arguments. + embedding_args (List[Union[EmbeddingArgs, DynamicEmbeddingArgs]]): **Required**. + List of embedding arguments. + item_feature_name (str): **Required**. Item feature name. + contextual_feature_names (List[str]): **Required**. List of contextual feature names. + action_feature_name (Optional[str]): Action feature name. Default: None. + max_num_candidates (int): Maximum number of candidates. Default: 0. + """ + feature_args: List[FeatureArgs] embedding_args: List[Union[EmbeddingArgs, DynamicEmbeddingArgs]] item_feature_name: str @@ -136,6 +259,29 @@ class BenchmarkDatasetArgs: @gin.configurable @dataclass class NetworkArgs: + """Network Architecture Configuration. + + Neural network architecture parameters. + + Attributes: + num_layers (int): **Required**. 
Number of layers. + hidden_size (int): **Required**. Hidden layer size. + num_attention_heads (int): **Required**. Number of attention heads. + kv_channels (int): **Required**. Key-value channels. + hidden_dropout (float): Hidden layer dropout rate. Default: 0.2. + norm_epsilon (float): Normalization epsilon. Default: 1e-5. + is_causal (bool): Use causal attention mask. Default: True. + dtype_str (str): Data type: "bfloat16" or "float16". Default: "bfloat16". + kernel_backend (str): Kernel backend: "cutlass", "triton", or "pytorch". + Default: "cutlass". + target_group_size (int): Target group size. Default: 1. + num_position_buckets (int): Number of position buckets. Default: 8192. + recompute_input_layernorm (bool): Recompute input layer normalization. Default: False. + recompute_input_silu (bool): Recompute input SiLU activation. Default: False. + item_embedding_dim (int): Item embedding dimension. Default: -1. + contextual_embedding_dim (int): Contextual embedding dimension. Default: -1. + """ + num_layers: int hidden_size: int num_attention_heads: int @@ -170,6 +316,18 @@ def __post_init__(self): @gin.configurable @dataclass class OptimizerArgs: + """Optimizer Configuration. + + Optimizer-related parameters. + + Attributes: + optimizer_str (str): **Required**. Optimizer name. + learning_rate (float): **Required**. Learning rate. + adam_beta1 (float): Adam optimizer beta1 parameter. Default: 0.9. + adam_beta2 (float): Adam optimizer beta2 parameter. Default: 0.999. + adam_eps (float): Adam optimizer epsilon parameter. Default: 1e-8. + """ + optimizer_str: str learning_rate: float adam_beta1: float = 0.9 @@ -180,12 +338,39 @@ class OptimizerArgs: @gin.configurable @dataclass class TensorModelParallelArgs: + """Tensor Model Parallelism Configuration. + + Tensor model parallelism settings. + + Attributes: + tensor_model_parallel_size (int): Tensor model parallel size (number of GPUs + for model sharding). Default: 1. + + Note: + The data parallel size is deduced based on the world_size and + tensor_model_parallel_size. + """ + tensor_model_parallel_size: int = 1 @gin.configurable @dataclass class RankingArgs: + """Ranking Task Configuration. + + Configuration specific to ranking tasks. + + Attributes: + prediction_head_arch (List[int]): **Required**. Prediction head architecture + (list of layer sizes). Default: None. + prediction_head_act_type (str): Prediction head activation type: "relu" or "gelu". + Default: "relu". + prediction_head_bias (bool): Whether to use bias in prediction head. Default: True. + num_tasks (int): Number of tasks (for multi-task learning). Default: 1. + eval_metrics (Tuple[str, ...]): Evaluation metrics tuple. Default: ("AUC",). + """ + prediction_head_arch: List[int] = cast(List[int], None) prediction_head_act_type: str = "relu" prediction_head_bias: bool = True @@ -206,6 +391,18 @@ def __post_init__(self): @gin.configurable @dataclass class RetrievalArgs: + """Retrieval Task Configuration. + + Configuration specific to retrieval tasks. + + Attributes: + num_negatives (int): Number of negative samples. Default: -1. + temperature (float): Temperature parameter for similarity scoring. Default: 0.05. + l2_norm_eps (float): Epsilon value for L2 normalization. Default: 1e-6. + eval_metrics (Tuple[str, ...]): Evaluation metrics tuple (Hit Rate, NDCG). + Default: ("HR@10", "NDCG@10"). 
+    """
+
    ### retrieval
    num_negatives: int = -1
    temperature = 0.05

From 9aaf5861c571b80c16a62b56622cc63bbfe032a6 Mon Sep 17 00:00:00 2001
From: JacoCheung
Date: Mon, 27 Oct 2025 05:18:32 +0000
Subject: [PATCH 3/3] Move root ReadMe env setting up to training

---
 examples/hstu/training/README.md | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/examples/hstu/training/README.md b/examples/hstu/training/README.md
index 385f13e7..9e27d44e 100644
--- a/examples/hstu/training/README.md
+++ b/examples/hstu/training/README.md
@@ -10,20 +10,22 @@ This integration ensures efficient training by coordinating sparse (embedding) a
 ## Environment Setup
 ### Start from dockerfile
 
-We provide [dockerfile](./docker/Dockerfile) for users to build environment.
+We provide a [dockerfile](../../../docker/Dockerfile) for users to build the environment.
 ```
+git clone https://github.com/NVIDIA/recsys-examples.git && cd recsys-examples
 docker build -f docker/Dockerfile --platform linux/amd64 -t recsys-examples:latest .
 ```
 If you want to build image for Grace, you can use
 ```
+git clone https://github.com/NVIDIA/recsys-examples.git && cd recsys-examples
 docker build -f docker/Dockerfile --platform linux/arm64 -t recsys-examples:latest .
 ```
 You can also set your own base image with args `--build-arg `.
 
 ### Start from source file
 Before running examples, build and install the libs under corelib following the instructions in the documentation:
-- [HSTU attention documentation](./corelib/hstu/README.md)
-- [Dynamic Embeddings documentation](./corelib/dynamicemb/README.md)
+- [HSTU attention documentation](../../../corelib/hstu/README.md)
+- [Dynamic Embeddings documentation](../../../corelib/dynamicemb/README.md)
 
 On top of those two core libs, Megatron-Core and a few other libs are required. You can install them via pypi packages: