diff --git a/README.md b/README.md index 08b47440..8c1a47ec 100644 --- a/README.md +++ b/README.md @@ -35,44 +35,6 @@ The project includes: For more detailed release notes, please refer our [releases](https://github.com/NVIDIA/recsys-examples/releases). -# Environment Setup -## Start from dockerfile - -We provide [dockerfile](./docker/Dockerfile) for users to build environment. -``` -docker build -f docker/Dockerfile --platform linux/amd64 -t recsys-examples:latest . -``` -If you want to build image for Grace, you can use -``` -docker build -f docker/Dockerfile --platform linux/arm64 -t recsys-examples:latest . -``` -You can also set your own base image with args `--build-arg `. - -## Start from source file -Before running examples, build and install libs under corelib following instruction in documentation: -- [HSTU attention documentation](./corelib/hstu/README.md) -- [Dynamic Embeddings documentation](./corelib/dynamicemb/README.md) - -On top of those two core libs, Megatron-Core along with other libs are required. You can install them via pypi package: - -```bash -pip install torchx gin-config torchmetrics==1.0.3 typing-extensions iopath megatron-core==0.9.0 -``` - -If you fail to install the megatron-core package, usually due to the python version incompatibility, please try to clone and then install the source code. - -```bash -git clone -b core_r0.9.0 https://github.com/NVIDIA/Megatron-LM.git megatron-lm && \ -pip install -e ./megatron-lm -``` - -We provide our custom HSTU CUDA operators for enhanced performance. You need to install these operators using the following command: - -```bash -cd /workspace/recsys-examples/examples/hstu && \ -python setup.py install -``` - # Get Started The examples we supported: - [HSTU recommender examples](./examples/hstu/README.md) diff --git a/examples/commons/utils/logger.py b/examples/commons/utils/logger.py index 6f2f3225..e66ef6ab 100644 --- a/examples/commons/utils/logger.py +++ b/examples/commons/utils/logger.py @@ -12,16 +12,30 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from datetime import datetime
+import logging
 
 import torch
+from rich.console import Console
+from rich.logging import RichHandler
+
+# Set up logger with RichHandler if not already configured
+
+console = Console()
+_LOGGER = logging.getLogger("rich_rank0")
+
+if not _LOGGER.hasHandlers():
+    handler = RichHandler(
+        console=console, show_time=True, show_path=False, rich_tracebacks=True
+    )
+    _LOGGER.addHandler(handler)
+    _LOGGER.propagate = False
+    _LOGGER.setLevel(logging.INFO)
 
 
 def print_rank_0(message):
     """If distributed is initialized, print only on rank 0."""
     if torch.distributed.is_initialized():
-        now = datetime.now()
         if torch.distributed.get_rank() == 0:
-            print(f"[{now}] " + message, flush=True)
+            _LOGGER.info(message)
     else:
         print(message, flush=True)
diff --git a/examples/commons/utils/stringify.py b/examples/commons/utils/stringify.py
index ac834b38..93986bfc 100644
--- a/examples/commons/utils/stringify.py
+++ b/examples/commons/utils/stringify.py
@@ -34,11 +34,11 @@ def stringify_dict(input_dict, prefix="", sep=","):
             value.float()
             assert value.dim() == 0
             value = value.cpu().item()
-            output += key + ":" + f"{value:6f}{sep}"
+            output += key + ": " + f"{value:6f}{sep}"
         elif isinstance(value, float):
-            output += key + ":" + f"{value:6f}{sep}"
+            output += key + ": " + f"{value:6f}{sep}"
         elif isinstance(value, int):
-            output += key + ":" + f"{value}{sep}"
+            output += key + ": " + f"{value}{sep}"
         else:
             assert RuntimeError(f"stringify dict not supports type {type(value)}")
     # remove the ending sep
diff --git a/examples/hstu/README.md b/examples/hstu/README.md
index 19e56e40..e2b2e9bf 100644
--- a/examples/hstu/README.md
+++ b/examples/hstu/README.md
@@ -1,4 +1,4 @@
-# Examples: to demonstrate how to train generative recommendation models
+# Examples: to demonstrate how to do training and inference with generative recommendation models
 
 ## Generative Recommender Introduction
 Meta's paper ["Actions Speak Louder Than Words"](https://arxiv.org/abs/2402.17152) introduces a novel paradigm for recommendation systems called **Generative Recommenders(GRs)**, which reformulates recommendation tasks as generative modeling problems. The work introduced Hierarchical Sequential Transduction Units (HSTU), a novel architecture designed to handle high-cardinality, non-stationary data streams in large-scale recommendation systems. HSTU enables both retrieval and ranking tasks. As noted in the paper, “HSTU-based GRs, with 1.5 trillion parameters, improve metrics in online A/B tests by 12.4% and have been deployed on multiple surfaces of a large internet platform with billions of users.”
diff --git a/examples/hstu/training/README.md b/examples/hstu/training/README.md
new file mode 100644
index 00000000..9e27d44e
--- /dev/null
+++ b/examples/hstu/training/README.md
@@ -0,0 +1,99 @@
+# HSTU Training example
+
+We support both retrieval and ranking models whose backbones are HSTU layers. In this example collection, users can specify the model structure via a gin-config file. Supported datasets are listed below. For the gin-config interface, please refer to the [inline comments](../utils/gin_config_args.py).
+
+## Parallelism Introduction
+To accommodate large embedding tables and the scaling laws of the dense HSTU layers, this example integrates **[TorchRec](https://github.com/pytorch/torchrec)**, which shards the embedding tables, and **[Megatron-LM](https://github.com/NVIDIA/Megatron-LM)**, which enables dense parallelism (e.g., data, tensor, sequence, pipeline, and context parallelism).
+This integration ensures efficient training by coordinating sparse (embedding) and dense (context/data) parallelisms within a single model.
+![parallelism](../figs/parallelism.png)
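+
+For intuition, the snippet below is a generic, minimal TorchRec sketch, not this example's actual wiring: `DistributedModelParallel` shards an embedding collection across ranks while the dense modules stay data parallel. The table and feature names are hypothetical, and it assumes a process group has already been initialized (e.g., launched via `torchrun`).
+
+```python
+import torch
+import torchrec
+from torchrec.distributed.model_parallel import DistributedModelParallel
+
+# One large table, declared on the meta device so the sharder decides placement.
+ebc = torchrec.EmbeddingBagCollection(
+    device=torch.device("meta"),
+    tables=[
+        torchrec.EmbeddingBagConfig(
+            name="item_table",          # hypothetical table name
+            embedding_dim=128,
+            num_embeddings=1_000_000,
+            feature_names=["item_id"],  # hypothetical feature name
+        )
+    ],
+)
+
+# DistributedModelParallel shards the table across ranks (model parallel for the
+# sparse part); dense modules are wrapped separately (e.g., DDP or Megatron-LM).
+sharded_ebc = DistributedModelParallel(ebc)
+```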
+
+## Environment Setup
+### Start from dockerfile
+
+We provide a [dockerfile](../../../docker/Dockerfile) for users to build the environment.
+```
+git clone https://github.com/NVIDIA/recsys-examples.git && cd recsys-examples
+docker build -f docker/Dockerfile --platform linux/amd64 -t recsys-examples:latest .
+```
+If you want to build an image for Grace, you can use
+```
+git clone https://github.com/NVIDIA/recsys-examples.git && cd recsys-examples
+docker build -f docker/Dockerfile --platform linux/arm64 -t recsys-examples:latest .
+```
+You can also set your own base image with the `--build-arg` option.
+
+### Start from source file
+Before running the examples, build and install the libs under corelib following the instructions in their documentation:
+- [HSTU attention documentation](../../../corelib/hstu/README.md)
+- [Dynamic Embeddings documentation](../../../corelib/dynamicemb/README.md)
+
+On top of those two core libs, Megatron-Core and a few other libs are required. You can install them via pypi packages:
+
+```bash
+pip install torchx gin-config torchmetrics==1.0.3 typing-extensions iopath megatron-core==0.9.0
+```
+
+If you fail to install the megatron-core package, usually due to Python version incompatibility, please try to clone and then install from source:
+
+```bash
+git clone -b core_r0.9.0 https://github.com/NVIDIA/Megatron-LM.git megatron-lm && \
+pip install -e ./megatron-lm
+```
+
+We provide our custom HSTU CUDA operators for enhanced performance. You need to install these operators using the following command:
+
+```bash
+cd /workspace/recsys-examples/examples/hstu && \
+python setup.py install
+```
+
+## Dataset Information
+
+We support the following datasets:
+
+#### **MovieLens**
+Refer to [MovieLens 1M](https://grouplens.org/datasets/movielens/1m/) and [MovieLens 20M](https://www.kaggle.com/datasets/grouplens/movielens-20m-dataset) for details.
+#### **KuaiRand**
+
+| dataset       | # users | seqlen max | seqlen min | seqlen mean | seqlen median | # items    |
+|---------------|---------|------------|------------|-------------|---------------|------------|
+| kuairand_pure | 27285   | 910        | 1          | 1           | 39            | 7551       |
+| kuairand_1k   | 1000    | 49332      | 10         | 5038        | 3379          | 4369953    |
+| kuairand_27k  | 27285   | 228000     | 100        | 11796       | 8591          | 32038725   |
+
+Refer to [KuaiRand](https://kuairand.com/) for details.
+
+## Running the examples
+
+Before getting started, please make sure that all prerequisites are fulfilled. You can refer to the [Get Started](../../../README.md) section in the root directory of the repo to set up the environment.
+
+### Dataset preprocessing
+
+To prepare a dataset for training, use `preprocessor.py` under the hstu example folder of the project.
+
+```bash
+cd /workspace/recsys-examples/examples/hstu && \
+mkdir -p ./tmp_data && python3 ./preprocessor.py --dataset_name <"ml-1m"|"ml-20m"|"kuairand-pure"|"kuairand-1k"|"kuairand-27k">
+```
+
+### Start training
+The entry points for training are `pretrain_gr_retrieval.py` and `pretrain_gr_ranking.py`. We use gin-config to specify the model structure, training arguments, hyperparameters, etc.
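+
+For orientation, a gin file is simply a set of bindings onto the `@gin.configurable` dataclasses in `utils/gin_config_args.py`. Below is a minimal, hypothetical sketch; the binding names come from those dataclasses, but the values are purely illustrative — see `./training/configs/` for the real files:
+
+```
+# Illustrative values only; consult the shipped .gin files for working configs.
+TrainerArgs.train_batch_size = 128
+TrainerArgs.eval_batch_size = 128
+TrainerArgs.max_train_iters = 1000
+
+DatasetArgs.dataset_name = "ml-20m"
+DatasetArgs.max_sequence_length = 200
+
+NetworkArgs.num_layers = 4
+NetworkArgs.hidden_size = 512
+NetworkArgs.num_attention_heads = 4
+NetworkArgs.kv_channels = 128
+
+OptimizerArgs.optimizer_str = "adam"
+OptimizerArgs.learning_rate = 1e-3
+```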
+
+Command to run the retrieval task with the `MovieLens 20m` dataset:
+
+```bash
+# Before running `pretrain_gr_retrieval.py`, make sure the current working directory is `hstu`
+cd examples/hstu
+PYTHONPATH=${PYTHONPATH}:$(realpath ../) torchrun --nproc_per_node 1 --master_addr localhost --master_port 6000 ./training/pretrain_gr_retrieval.py --gin-config-file ./training/configs/movielen_retrieval.gin
+```
+
+To run the ranking task with the `MovieLens 20m` dataset:
+```bash
+# Before running `pretrain_gr_ranking.py`, make sure the current working directory is `hstu`
+cd examples/hstu
+PYTHONPATH=${PYTHONPATH}:$(realpath ../) torchrun --nproc_per_node 1 --master_addr localhost --master_port 6000 ./training/pretrain_gr_ranking.py --gin-config-file ./training/configs/movielen_ranking.gin
+```
+
+
diff --git a/examples/hstu/training/__init__.py b/examples/hstu/training/__init__.py
deleted file mode 100644
index 270ce28a..00000000
--- a/examples/hstu/training/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-from .training_impl import *  # pylint: disable=wildcard-import
-from .training_utils import *  # pylint: disable=wildcard-import
diff --git a/examples/hstu/training/benchmark/README.md b/examples/hstu/training/benchmark/README.md
index 42f83069..27358ac9 100644
--- a/examples/hstu/training/benchmark/README.md
+++ b/examples/hstu/training/benchmark/README.md
@@ -13,7 +13,7 @@ You can run script `run_hstu_benchmark.sh` to see the performance over the base
 
 ## How to run
 
-The test entry is `python ./benchmark/hstu_layer_benchmark.py run`, you can type `python ./benchmark/hstu_layer_benchmark.py run --help` to get the input arguments. 4 important arguments are :
+The test entry point is `python ./training/benchmark/hstu_layer_benchmark.py run`; you can run `python ./training/benchmark/hstu_layer_benchmark.py run --help` to see the input arguments. Four important arguments are:
 
 1. --kernel-backend: select the hstu mha backend. Could be `triton` or `cutlass`.
 2. --fuse-norm-mul-dropout: knob of `layer norm + multiplication + dropout ` fusion. Could be `False` or `True`
@@ -23,7 +23,9 @@ The test entry is `python ./benchmark/hstu_layer_benchmark.py run`, you can type
 Our baseline cmd example (1K):
 
 ```bash
-python ./benchmark/hstu_layer_benchmark.py run \
+
+cd recsys-examples/examples/hstu
+python ./training/benchmark/hstu_layer_benchmark.py run \
   --iters 100 \
   --warmup-iters 50 \
   --layer-type native \
@@ -40,7 +42,8 @@ python ./benchmark/hstu_layer_benchmark.py run \
 You can also run a set of arguments with run.sh:
 
 ```bash
-bash run_hstu_layer_benchmark.sh
+cd recsys-examples/examples/hstu
+bash ./training/benchmark/run_hstu_layer_benchmark.sh
 ```
 
 After one run is done, a memory snapshot file in current working directory is generated, you can trace the memory usage with the file. Please refer to [PyTorch docs](https://docs.pytorch.org/docs/stable/torch_cuda_memory.html) on how to visualize the memory trace.
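+
+For reference, PyTorch memory snapshots are produced with the allocator-history API described in the linked docs; a minimal sketch of that record/dump pattern (with a hypothetical filename) looks like:
+
+```python
+import torch
+
+# Begin recording allocator events, keeping up to 100k entries.
+torch.cuda.memory._record_memory_history(max_entries=100_000)
+
+# ... run the workload to profile ...
+
+# Dump the recorded history; drag the file into https://pytorch.org/memory_viz to inspect it.
+torch.cuda.memory._dump_snapshot("snapshot.pickle")
+torch.cuda.memory._record_memory_history(enabled=None)  # stop recording
+```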
diff --git a/examples/hstu/training/benchmark/hstu_layer_benchmark.py b/examples/hstu/training/benchmark/hstu_layer_benchmark.py index c47742f6..1e2765ae 100644 --- a/examples/hstu/training/benchmark/hstu_layer_benchmark.py +++ b/examples/hstu/training/benchmark/hstu_layer_benchmark.py @@ -47,7 +47,7 @@ from modules.jagged_data import JaggedData from modules.native_hstu_layer import HSTULayer as NativeHSTULayer from ops.length_to_offsets import length_to_complete_offsets -from training.utils import cal_flops_single_rank +from training.trainer.utils import cal_flops_single_rank _backend_str_to_type = { "cutlass": KernelBackend.CUTLASS, diff --git a/examples/hstu/training/benchmark/run_hstu_layer_benchmark.sh b/examples/hstu/training/benchmark/run_hstu_layer_benchmark.sh index 3984dfe9..2bae6a38 100644 --- a/examples/hstu/training/benchmark/run_hstu_layer_benchmark.sh +++ b/examples/hstu/training/benchmark/run_hstu_layer_benchmark.sh @@ -32,7 +32,7 @@ for dim_per_head in "${dim_per_heads[@]}"; do fi echo -e "\n\033[32mbaseline hstu layer \033[0m:" ${nsys_profile_cmd//${baseline_profile_name}} \ - python ./benchmark/hstu_layer_benchmark.py run \ + python ./training/benchmark/hstu_layer_benchmark.py run \ --iters 100 \ --warmup-iters 50 \ --kernel-backend triton \ @@ -53,7 +53,7 @@ for dim_per_head in "${dim_per_heads[@]}"; do echo -e "\n\033[32m +cutlass\033[0m:" ${nsys_profile_cmd//${cutlass_profile_name}} \ - python ./benchmark/hstu_layer_benchmark.py run \ + python ./training/benchmark/hstu_layer_benchmark.py run \ --iters 100 \ --warmup-iters 50 \ --kernel-backend cutlass \ @@ -73,7 +73,7 @@ for dim_per_head in "${dim_per_heads[@]}"; do echo -e "\n\033[32m +fused\033[0m:" ${nsys_profile_cmd//${fused_profile_name}} \ - python ./benchmark/hstu_layer_benchmark.py run \ + python ./training/benchmark/hstu_layer_benchmark.py run \ --iters 100 \ --warmup-iters 50 \ --kernel-backend cutlass \ @@ -93,7 +93,7 @@ for dim_per_head in "${dim_per_heads[@]}"; do echo -e "\n\033[32m + recompute\033[0m:" ${nsys_profile_cmd//${recompute_profile_name}} \ - python ./benchmark/hstu_layer_benchmark.py run \ + python ./training/benchmark/hstu_layer_benchmark.py run \ --iters 100 \ --warmup-iters 50 \ --kernel-backend cutlass \ diff --git a/examples/hstu/training/pretrain_gr_ranking.py b/examples/hstu/training/pretrain_gr_ranking.py index 993fc753..9ff0b32e 100644 --- a/examples/hstu/training/pretrain_gr_ranking.py +++ b/examples/hstu/training/pretrain_gr_ranking.py @@ -18,7 +18,7 @@ warnings.filterwarnings("ignore", category=FutureWarning) warnings.filterwarnings("ignore", category=SyntaxWarning) import argparse -from functools import partial # pylint: disable-unused-import +from typing import List, Union import commons.utils.initialize as init import gin @@ -34,7 +34,8 @@ JaggedMegatronTrainNonePipeline, JaggedMegatronTrainPipelineSparseDist, ) -from training import ( +from trainer.training import maybe_load_ckpts, train_with_pipeline +from trainer.utils import ( create_dynamic_optitons_dict, create_embedding_configs, create_hstu_config, @@ -42,10 +43,11 @@ get_data_loader, get_dataset_and_embedding_args, get_embedding_vector_storage_multiplier, - maybe_load_ckpts, - train_with_pipeline, ) -from utils import ( +from utils import ( # from hstu.utils + BenchmarkDatasetArgs, + DatasetArgs, + EmbeddingArgs, NetworkArgs, OptimizerArgs, RankingArgs, @@ -53,20 +55,12 @@ TrainerArgs, ) -parser = argparse.ArgumentParser( - description="Distributed GR Arguments", allow_abbrev=False -) 
-parser.add_argument("--gin-config-file", type=str) -args = parser.parse_args() -gin.parse_config_file(args.gin_config_file) -trainer_args = TrainerArgs() -dataset_args, embedding_args = get_dataset_and_embedding_args() -network_args = NetworkArgs() -optimizer_args = OptimizerArgs() -tp_args = TensorModelParallelArgs() - -def create_ranking_config() -> RankingConfig: +def create_ranking_config( + dataset_args: Union[DatasetArgs, BenchmarkDatasetArgs], + network_args: NetworkArgs, + embedding_args: List[EmbeddingArgs], +) -> RankingConfig: ranking_args = RankingArgs() return RankingConfig( @@ -82,6 +76,18 @@ def create_ranking_config() -> RankingConfig: def main(): + parser = argparse.ArgumentParser( + description="HSTU Example Arguments", allow_abbrev=False + ) + parser.add_argument("--gin-config-file", type=str) + args = parser.parse_args() + gin.parse_config_file(args.gin_config_file) + trainer_args = TrainerArgs() + dataset_args, embedding_args = get_dataset_and_embedding_args() + network_args = NetworkArgs() + optimizer_args = OptimizerArgs() + tp_args = TensorModelParallelArgs() + init.initialize_distributed() init.initialize_model_parallel( tensor_model_parallel_size=tp_args.tensor_model_parallel_size @@ -92,7 +98,7 @@ def main(): f"distributed env initialization done. Free cuda memory: {free_memory / (1024 ** 2):.2f} MB" ) hstu_config = create_hstu_config(network_args, tp_args) - task_config = create_ranking_config() + task_config = create_ranking_config(dataset_args, network_args, embedding_args) model = get_ranking_model(hstu_config=hstu_config, task_config=task_config) dynamic_options_dict = create_dynamic_optitons_dict( diff --git a/examples/hstu/training/pretrain_gr_retrieval.py b/examples/hstu/training/pretrain_gr_retrieval.py index ec3d0486..c628c535 100644 --- a/examples/hstu/training/pretrain_gr_retrieval.py +++ b/examples/hstu/training/pretrain_gr_retrieval.py @@ -18,7 +18,7 @@ warnings.filterwarnings("ignore", category=FutureWarning) warnings.filterwarnings("ignore", category=SyntaxWarning) import argparse -from functools import partial # pylint: disable-unused-import +from typing import List, Union import commons.utils.initialize as init import gin @@ -32,18 +32,20 @@ JaggedMegatronTrainNonePipeline, JaggedMegatronTrainPipelineSparseDist, ) -from training import ( +from trainer.training import maybe_load_ckpts, train_with_pipeline +from trainer.utils import ( create_dynamic_optitons_dict, - create_embedding_config, + create_embedding_configs, create_hstu_config, create_optimizer_params, get_data_loader, get_dataset_and_embedding_args, get_embedding_vector_storage_multiplier, - maybe_load_ckpts, - train_with_pipeline, ) -from utils import ( +from utils import ( # from hstu.utils + BenchmarkDatasetArgs, + DatasetArgs, + EmbeddingArgs, NetworkArgs, OptimizerArgs, RetrievalArgs, @@ -51,27 +53,18 @@ TrainerArgs, ) -parser = argparse.ArgumentParser( - description="Distributed GR Arguments", allow_abbrev=False -) -parser.add_argument("--gin-config-file", type=str) -args = parser.parse_args() -gin.parse_config_file(args.gin_config_file) -trainer_args = TrainerArgs() -dataset_args, embedding_args = get_dataset_and_embedding_args() -network_args = NetworkArgs() -optimizer_args = OptimizerArgs() -tp_args = TensorModelParallelArgs() - -def create_retrieval_config() -> RetrievalConfig: +def create_retrieval_config( + dataset_args: Union[DatasetArgs, BenchmarkDatasetArgs], + network_args: NetworkArgs, + embedding_args: List[EmbeddingArgs], +) -> RetrievalConfig: retrieval_args = 
RetrievalArgs() return RetrievalConfig( - embedding_configs=[ - create_embedding_config(network_args.hidden_size, arg) - for arg in embedding_args - ], + embedding_configs=create_embedding_configs( + dataset_args, network_args, embedding_args + ), temperature=retrieval_args.temperature, l2_norm_eps=retrieval_args.l2_norm_eps, num_negatives=retrieval_args.num_negatives, @@ -80,6 +73,18 @@ def create_retrieval_config() -> RetrievalConfig: def main(): + parser = argparse.ArgumentParser( + description="Distributed GR Arguments", allow_abbrev=False + ) + parser.add_argument("--gin-config-file", type=str) + args = parser.parse_args() + gin.parse_config_file(args.gin_config_file) + trainer_args = TrainerArgs() + dataset_args, embedding_args = get_dataset_and_embedding_args() + network_args = NetworkArgs() + optimizer_args = OptimizerArgs() + tp_args = TensorModelParallelArgs() + init.initialize_distributed() init.initialize_model_parallel( tensor_model_parallel_size=tp_args.tensor_model_parallel_size @@ -87,7 +92,7 @@ def main(): init.set_random_seed(trainer_args.seed) hstu_config = create_hstu_config(network_args, tp_args) - task_config = create_retrieval_config() + task_config = create_retrieval_config(dataset_args, network_args, embedding_args) model = get_retrieval_model(hstu_config=hstu_config, task_config=task_config) dynamic_options_dict = create_dynamic_optitons_dict( diff --git a/examples/hstu/training/trainer/__init__.py b/examples/hstu/training/trainer/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/examples/hstu/training/training_impl.py b/examples/hstu/training/trainer/training.py similarity index 99% rename from examples/hstu/training/training_impl.py rename to examples/hstu/training/trainer/training.py index a3fe7a0b..a4b0254c 100644 --- a/examples/hstu/training/training_impl.py +++ b/examples/hstu/training/trainer/training.py @@ -31,7 +31,7 @@ JaggedMegatronTrainNonePipeline, JaggedMegatronTrainPipelineSparseDist, ) -from training.training_utils import cal_flops +from trainer.utils import cal_flops from utils import TrainerArgs diff --git a/examples/hstu/training/training_utils.py b/examples/hstu/training/trainer/utils.py similarity index 100% rename from examples/hstu/training/training_utils.py rename to examples/hstu/training/trainer/utils.py diff --git a/examples/hstu/utils/gin_config_args.py b/examples/hstu/utils/gin_config_args.py index 7d5738e0..d47501b1 100644 --- a/examples/hstu/utils/gin_config_args.py +++ b/examples/hstu/utils/gin_config_args.py @@ -21,6 +21,31 @@ @gin.configurable @dataclass class TrainerArgs: + """Trainer Configuration. + + Training-related parameters and settings. + + Attributes: + train_batch_size (int): **Required**. Batch size per GPU. When TP is enabled, + the theoretical batch size is (train_batch_size × tp_size). + eval_batch_size (int): **Required**. Evaluation batch size. + eval_interval (int): Evaluation interval in iterations. Default: 100. + log_interval (int): Logging interval in iterations. Default: 100. + max_train_iters (Optional[int]): Maximum training iterations. Default: None. + max_eval_iters (Optional[int]): Maximum evaluation iterations. Default: None. + seed (int): Random seed. Default: 1234. + profile (bool): Enable profiling. Default: False. + profile_step_start (int): Profiling start step. Default: 100. + profile_step_end (int): Profiling end step. Default: 200. + ckpt_save_interval (int): Checkpoint save interval, -1 means no checkpoint saving. + Default: -1. 
+        ckpt_save_dir (str): Checkpoint save directory. Default: "./checkpoints".
+        ckpt_load_dir (str): Checkpoint load directory. Default: "".
+        pipeline_type (str): Pipeline overlap type: 'none' (no overlap), 'native'
+            (overlap h2d, input dist, fwd+bwd), 'prefetch' (includes prefetch overlap).
+            Default: "native".
+    """
+
     # below batchsize is batchsize_per_gpu
     # when TP is enabled, the theoratical batchsize is (train_batch_size * tp_size)
     train_batch_size: int
@@ -54,6 +79,17 @@ def __post_init__(self):
 
 @dataclass
 class BaseEmbeddingArgs:
+    """Base Embedding Arguments.
+
+    Base class for embedding configuration parameters.
+
+    Attributes:
+        feature_names (List[str]): List of feature names.
+        table_name (str): Embedding table name.
+        item_vocab_size_or_capacity (int): For dynamic embedding: capacity;
+            for static embedding: vocabulary size.
+    """
+
     # for dynamic emb, it serves as capacity, while for static emb, it serves as vocab size
     feature_names: List[str]
     table_name: str
@@ -63,6 +99,25 @@ class BaseEmbeddingArgs:
 @gin.configurable
 @dataclass
 class EmbeddingArgs(BaseEmbeddingArgs):
+    """Embedding Configuration.
+
+    Base embedding layer configuration parameters.
+
+    Attributes:
+        feature_names (List[str]): **Required**. List of feature names.
+        table_name (str): **Required**. Embedding table name.
+        item_vocab_size_or_capacity (int): **Required**. For dynamic embedding: capacity;
+            for static embedding: vocabulary size.
+        sharding_type (str): Sharding type, must be "data_parallel" or "model_parallel".
+            Default: "None".
+
+    Note:
+        A table can be configured as only one of `EmbeddingArgs` or `DynamicEmbeddingArgs`.
+        When the movielen* or kuairand* datasets are used, the proper
+        `EmbeddingArgs`/`DynamicEmbeddingArgs` are predefined; setting the proper
+        `DatasetArgs.dataset_name` in the gin config file selects them automatically.
+        See `examples/hstu/training/trainer/utils.py::get_dataset_and_embedding_args()` for more details.
+    """
+
     sharding_type: str = "None"
 
     def __post_init__(self):
@@ -75,7 +130,31 @@ def __post_init__(self):
 @gin.configurable
 @dataclass
 class DynamicEmbeddingArgs(EmbeddingArgs):
-    # the precedence is global_hbm_for_values > item_vocab_gpu_capacity > item_vocab_gpu_capacity_ratio
+    """Dynamic Embedding Configuration.
+
+    Extends EmbeddingArgs with dynamic embedding-specific parameters.
+
+    Attributes:
+        global_hbm_for_values (Optional[int]): Global HBM size in bytes (highest priority).
+            Default: None.
+        item_vocab_gpu_capacity (Optional[float]): Item vocabulary GPU capacity
+            (second priority). Default: None.
+        item_vocab_gpu_capacity_ratio (Optional[float]): Item vocabulary GPU capacity ratio
+            (lowest priority). Default: None.
+        evict_strategy (str): Eviction strategy: "lru" or "lfu". Default: "lru".
+        caching (bool): Enable caching on HBM. When caching is enabled, the
+            global_hbm_for_values indicates the cache size. Default: False.
+
+    Note:
+        - sharding_type is automatically set to "model_parallel".
+        - Precedence: the first three params set the HBM size for dynamic
+          embedding, with precedence `global_hbm_for_values` > `item_vocab_gpu_capacity` >
+          `item_vocab_gpu_capacity_ratio`. When only `item_vocab_gpu_capacity_ratio` is given,
+          `item_vocab_gpu_capacity` = `item_vocab_gpu_capacity_ratio` * `item_vocab_size_or_capacity`,
+          and `global_hbm_for_values` is then deduced based on the optimizer and embedding dims.
+ """ + + # the precedence is `global_hbm_for_values` > `item_vocab_gpu_capacity` > `item_vocab_gpu_capacity_ratio` # without optimizer consideration global_hbm_for_values: Optional[int] = None item_vocab_gpu_capacity: Optional[float] = None @@ -107,6 +186,23 @@ def calculate_and_reset_global_hbm_for_values(self, hidden_size, multiplier=1): @gin.configurable @dataclass class DatasetArgs: + """Dataset Configuration. + + Dataset-related configuration parameters. + + Attributes: + dataset_name (str): **Required**. Dataset name. + max_sequence_length (int): **Required**. Maximum sequence length. + dataset_path (Optional[str]): Path to dataset. Default: None. + max_num_candidates (int): Maximum number of candidates. Default: 0. + shuffle (bool): Whether to shuffle data. Default: False. + + Note: + dataset_path could be None if your dataset is preprocessed and moved under + /hstu/tmp_data folder or you're running with BenchmarkDatasetArgs + which is an in-memory random data generator. + """ + dataset_name: str max_sequence_length: int dataset_path: Optional[str] = None @@ -117,6 +213,19 @@ class DatasetArgs: @gin.configurable @dataclass class FeatureArgs: + """Feature Configuration. + + Feature-specific configuration parameters. + + Attributes: + feature_names (List[str]): **Required**. List of feature names. + max_sequence_length (int): **Required**. Maximum sequence length. + is_jagged (bool): Whether features are jagged (variable length). Default: False. + + Note: + `FeatureArgs` are only used when the dataset is of `BenchmarkDatasetArgs` type. + """ + feature_names: List[str] max_sequence_length: int is_jagged: bool = False @@ -125,6 +234,20 @@ class FeatureArgs: @gin.configurable @dataclass class BenchmarkDatasetArgs: + """Benchmark Dataset Configuration. + + Configuration for benchmark datasets combining features and embeddings. + + Attributes: + feature_args (List[FeatureArgs]): **Required**. List of feature arguments. + embedding_args (List[Union[EmbeddingArgs, DynamicEmbeddingArgs]]): **Required**. + List of embedding arguments. + item_feature_name (str): **Required**. Item feature name. + contextual_feature_names (List[str]): **Required**. List of contextual feature names. + action_feature_name (Optional[str]): Action feature name. Default: None. + max_num_candidates (int): Maximum number of candidates. Default: 0. + """ + feature_args: List[FeatureArgs] embedding_args: List[Union[EmbeddingArgs, DynamicEmbeddingArgs]] item_feature_name: str @@ -136,6 +259,29 @@ class BenchmarkDatasetArgs: @gin.configurable @dataclass class NetworkArgs: + """Network Architecture Configuration. + + Neural network architecture parameters. + + Attributes: + num_layers (int): **Required**. Number of layers. + hidden_size (int): **Required**. Hidden layer size. + num_attention_heads (int): **Required**. Number of attention heads. + kv_channels (int): **Required**. Key-value channels. + hidden_dropout (float): Hidden layer dropout rate. Default: 0.2. + norm_epsilon (float): Normalization epsilon. Default: 1e-5. + is_causal (bool): Use causal attention mask. Default: True. + dtype_str (str): Data type: "bfloat16" or "float16". Default: "bfloat16". + kernel_backend (str): Kernel backend: "cutlass", "triton", or "pytorch". + Default: "cutlass". + target_group_size (int): Target group size. Default: 1. + num_position_buckets (int): Number of position buckets. Default: 8192. + recompute_input_layernorm (bool): Recompute input layer normalization. Default: False. 
+ recompute_input_silu (bool): Recompute input SiLU activation. Default: False. + item_embedding_dim (int): Item embedding dimension. Default: -1. + contextual_embedding_dim (int): Contextual embedding dimension. Default: -1. + """ + num_layers: int hidden_size: int num_attention_heads: int @@ -170,6 +316,18 @@ def __post_init__(self): @gin.configurable @dataclass class OptimizerArgs: + """Optimizer Configuration. + + Optimizer-related parameters. + + Attributes: + optimizer_str (str): **Required**. Optimizer name. + learning_rate (float): **Required**. Learning rate. + adam_beta1 (float): Adam optimizer beta1 parameter. Default: 0.9. + adam_beta2 (float): Adam optimizer beta2 parameter. Default: 0.999. + adam_eps (float): Adam optimizer epsilon parameter. Default: 1e-8. + """ + optimizer_str: str learning_rate: float adam_beta1: float = 0.9 @@ -180,12 +338,39 @@ class OptimizerArgs: @gin.configurable @dataclass class TensorModelParallelArgs: + """Tensor Model Parallelism Configuration. + + Tensor model parallelism settings. + + Attributes: + tensor_model_parallel_size (int): Tensor model parallel size (number of GPUs + for model sharding). Default: 1. + + Note: + The data parallel size is deduced based on the world_size and + tensor_model_parallel_size. + """ + tensor_model_parallel_size: int = 1 @gin.configurable @dataclass class RankingArgs: + """Ranking Task Configuration. + + Configuration specific to ranking tasks. + + Attributes: + prediction_head_arch (List[int]): **Required**. Prediction head architecture + (list of layer sizes). Default: None. + prediction_head_act_type (str): Prediction head activation type: "relu" or "gelu". + Default: "relu". + prediction_head_bias (bool): Whether to use bias in prediction head. Default: True. + num_tasks (int): Number of tasks (for multi-task learning). Default: 1. + eval_metrics (Tuple[str, ...]): Evaluation metrics tuple. Default: ("AUC",). + """ + prediction_head_arch: List[int] = cast(List[int], None) prediction_head_act_type: str = "relu" prediction_head_bias: bool = True @@ -206,6 +391,18 @@ def __post_init__(self): @gin.configurable @dataclass class RetrievalArgs: + """Retrieval Task Configuration. + + Configuration specific to retrieval tasks. + + Attributes: + num_negatives (int): Number of negative samples. Default: -1. + temperature (float): Temperature parameter for similarity scoring. Default: 0.05. + l2_norm_eps (float): Epsilon value for L2 normalization. Default: 1e-6. + eval_metrics (Tuple[str, ...]): Evaluation metrics tuple (Hit Rate, NDCG). + Default: ("HR@10", "NDCG@10"). + """ + ### retrieval num_negatives: int = -1 temperature = 0.05 diff --git a/pyproject.toml b/pyproject.toml index f55eae22..87927543 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,6 @@ [tool.mypy] exclude = [ + "examples/hstu/tmp_data", "examples/hstu/ops/triton_ops/*", "examples/hstu/ops/fused_hstu_op.py", "corelib/*",