From fafd845ccb80f1e1ddcb6ebefa508cf71d723bc9 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Fri, 4 Oct 2024 12:36:44 -0700 Subject: [PATCH 1/3] remove 8x3b recipes Signed-off-by: Alexandros Koumparoulis --- nemo/collections/llm/recipes/mixtral_8x3b.py | 290 ------------------ .../llm/recipes/mixtral_8x3b_16k.py | 132 -------- .../llm/recipes/mixtral_8x3b_64k.py | 133 -------- .../llm/recipes/test_mixtral_8x3b.py | 110 ------- .../llm/recipes/test_mixtral_8x3b_16k.py | 84 ----- .../llm/recipes/test_mixtral_8x3b_64k.py | 84 ----- 6 files changed, 833 deletions(-) delete mode 100644 nemo/collections/llm/recipes/mixtral_8x3b.py delete mode 100644 nemo/collections/llm/recipes/mixtral_8x3b_16k.py delete mode 100644 nemo/collections/llm/recipes/mixtral_8x3b_64k.py delete mode 100644 tests/collections/llm/recipes/test_mixtral_8x3b.py delete mode 100644 tests/collections/llm/recipes/test_mixtral_8x3b_16k.py delete mode 100644 tests/collections/llm/recipes/test_mixtral_8x3b_64k.py diff --git a/nemo/collections/llm/recipes/mixtral_8x3b.py b/nemo/collections/llm/recipes/mixtral_8x3b.py deleted file mode 100644 index ca5b4e35039f..000000000000 --- a/nemo/collections/llm/recipes/mixtral_8x3b.py +++ /dev/null @@ -1,290 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from typing import Callable, Optional - -import nemo_run as run -import pytorch_lightning as pl -import torch -from megatron.core.distributed import DistributedDataParallelConfig -from pytorch_lightning.callbacks.callback import Callback - -from nemo import lightning as nl -from nemo.collections.llm.api import finetune, pretrain -from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.gpt.data.squad import SquadDataModule -from nemo.collections.llm.gpt.model.mixtral import MixtralConfig8x3B, MixtralModel -from nemo.collections.llm.peft.lora import LoRA -from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger -from nemo.collections.llm.recipes.optim.adam import distributed_fused_adam_with_cosine_annealing -from nemo.collections.llm.recipes.precision.mixed_precision import bf16_mixed -from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback -from nemo.lightning.pytorch.callbacks.moe_token_drop import MegatronTokenDropCallback -from nemo.utils.exp_manager import TimingCallback - -NAME = "mixtral_8x3b" - - -@run.cli.factory(name=NAME) -def model() -> run.Config[pl.LightningModule]: - """ - Factory function to create a Mixtral 8x3B model configuration. - - Returns: - run.Config[pl.LightningModule]: Configuration for the Mixtral 8x3B model. - - Examples: - CLI usage: - $ nemo llm pretrain model=mixtral_8x3b ... - - Python API usage: - >>> model_config = model() - >>> print(model_config) - """ - return run.Config(MixtralModel, config=run.Config(MixtralConfig8x3B)) - - -def trainer( - tensor_parallelism: int = 1, - pipeline_parallelism: int = 1, - pipeline_parallelism_type: Optional[torch.dtype] = None, - virtual_pipeline_parallelism: Optional[int] = None, - context_parallelism: int = 1, - sequence_parallelism: bool = False, - expert_parallelism: int = 4, - num_nodes: int = 2, - num_gpus_per_node: int = 8, - max_steps: int = 1168251, - callbacks: Optional[list[run.Config[Callback]]] = None, -) -> run.Config[nl.Trainer]: - """ - Configure the NeMo Lightning Trainer for Mixtral 8x3B model. - - This function sets up the distributed training strategy optimized for the Mixtral 8x3B model. - - Args: - tensor_parallelism (int): Degree of tensor model parallelism. - pipeline_parallelism (int): Degree of pipeline model parallelism. - pipeline_parallelism_type (Optional[torch.dtype]): Data type for pipeline parallelism. - virtual_pipeline_parallelism (Optional[int]): Size of virtual pipeline parallelism. - context_parallelism (int): Degree of context parallelism. - sequence_parallelism (bool): Whether to use sequence parallelism. - expert_parallelism (int): Degree of expert parallelism. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - max_steps (int): Maximum number of training steps. - callbacks (Optional[list[run.Config[Callback]]]): List of callback configurations. - - Returns: - run.Config[nl.Trainer]: Configuration for the NeMo Lightning Trainer. - - Examples: - CLI usage: - $ nemo llm pretrain trainer=mixtral_8x3b ... - - Python API usage: - >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) - >>> print(trainer_config) - """ - strategy = run.Config( - nl.MegatronStrategy, - tensor_model_parallel_size=tensor_parallelism, - pipeline_model_parallel_size=pipeline_parallelism, - pipeline_dtype=pipeline_parallelism_type, - virtual_pipeline_model_parallel_size=virtual_pipeline_parallelism, - context_parallel_size=context_parallelism, - sequence_parallel=sequence_parallelism, - expert_model_parallel_size=expert_parallelism, - gradient_as_bucket_view=True, - ckpt_async_save=True, - ckpt_parallel_load=True, - ddp=run.Config( - DistributedDataParallelConfig, - check_for_nan_in_grad=True, - grad_reduce_in_fp32=True, - overlap_grad_reduce=True, - overlap_param_gather=True, - ), - ) - - trainer = run.Config( - nl.Trainer, - accelerator="gpu", - accumulate_grad_batches=1, - callbacks=callbacks, - devices=num_gpus_per_node, - limit_test_batches=50, - limit_val_batches=32, - log_every_n_steps=10, - max_steps=max_steps, - num_nodes=num_nodes, - plugins=bf16_mixed(), - strategy=strategy, - use_distributed_sampler=False, - val_check_interval=2000, - ) - - return trainer - - -@run.cli.factory(target=pretrain, name=NAME) -def pretrain_recipe( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 2, num_gpus_per_node: int = 8, fn=pretrain -) -> run.Partial: - """ - Create a pre-training recipe for Mixtral 8x3B model. - - This function sets up a complete configuration for pre-training, including - model, trainer, and data settings. - - Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): Function to use for pre-training (default: nemo.collections.llm.api.pretrain). - - Returns: - run.Partial: Partial configuration for pre-training. - - Examples: - CLI usage: - $ nemo llm pretrain --factory mixtral_8x3b - $ nemo llm pretrain --factory "mixtral_8x3b(num_nodes=2, name='my_pretrain')" - - Python API usage: - >>> recipe = pretrain_recipe(name="mixtral_8x3b_pretrain", num_nodes=2) - >>> print(recipe) - """ - return run.Partial( - fn, - model=model(), - trainer=trainer( - num_nodes=num_nodes, - num_gpus_per_node=num_gpus_per_node, - callbacks=[run.Config(TimingCallback)], - ), - data=run.Config(MockDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1), - log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)), - optim=distributed_fused_adam_with_cosine_annealing(max_lr=3e-4), - resume=default_resume(), - ) - - -@run.cli.factory(target=pretrain, name=NAME + "_performance") -def pretrain_recipe_performance( - dir: Optional[str] = None, name: str = "default", num_nodes: int = 2, num_gpus_per_node: int = 8, fn=pretrain -) -> run.Partial: - """ - Create a performance-optimized pre-training recipe for Mixtral 8x3B model. - - This recipe enables performance optimizations that may not be suitable for all use cases. - It builds upon the standard pre-training recipe and adds additional performance enhancements. - - Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - fn (Callable): The pre-training function to use. - - Returns: - run.Partial: Partial configuration for performance-optimized pre-training. - - Examples: - CLI usage: - $ nemo llm pretrain --factory "mixtral_8x3b.pretrain_recipe_performance(num_nodes=2, name='perf_pretrain')" - - Python API usage: - >>> recipe = pretrain_recipe_performance(name="mixtral_8x3b", num_nodes=4) - >>> print(recipe) - - Note: - Use this recipe with caution and only when you need maximum performance. - It may not be suitable for all hardware configurations or use cases. - """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=fn) - - recipe.trainer.callbacks.extend( - [ - run.Config(MegatronTokenDropCallback), - run.Config(MegatronCommOverlapCallback), - ] - ) - - return recipe - - -def hf_resume() -> run.Config[nl.AutoResume]: - """ - Configure the Hugging Face model resuming for Mixtral 8x3B model. - - This function sets up the configuration for resuming training from a Hugging Face model. - - Returns: - run.Config[nl.AutoResume]: Configuration for resuming from a Hugging Face model. - - Examples: - CLI usage: - $ nemo llm finetune --factory "mixtral_8x3b(resume=hf_resume())" - - Python API usage: - >>> recipe = finetune_recipe(name="mixtral_8x3b_finetune", num_nodes=2) - >>> recipe.resume = hf_resume() - >>> print(recipe) - """ - return run.Config( - nl.AutoResume, - restore_config=run.Config(nl.RestoreConfig, path="hf://mistralai/Mixtral-8x7B-v0.1"), - ) - - -@run.cli.factory(target=finetune, name=NAME) -def finetune_recipe( - dir: Optional[str] = None, - name: str = "default", - num_nodes: int = 1, - num_gpus_per_node: int = 8, -) -> run.Partial: - """ - Create a fine-tuning recipe for Mixtral 8x3B model. - - This function sets up a complete configuration for fine-tuning, including - model, trainer, and data settings. - - Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the fine-tuning run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - - Returns: - run.Partial: Partial configuration for fine-tuning. - - Examples: - CLI usage: - $ nemo llm finetune --factory mixtral_8x3b - $ nemo llm finetune --factory "mixtral_8x3b(num_nodes=2, name='my_finetune')" - - Python API usage: - >>> recipe = finetune_recipe(name="mixtral_8x3b_finetune", num_nodes=2) - >>> print(recipe) - """ - recipe = pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node, fn=finetune) - - recipe.resume = hf_resume() - recipe.peft = run.Config(LoRA, target_modules=['linear_qkv', 'linear_proj'], dim=32) - recipe.data = run.Config(SquadDataModule, seq_length=8192, global_batch_size=512, micro_batch_size=1) - return recipe diff --git a/nemo/collections/llm/recipes/mixtral_8x3b_16k.py b/nemo/collections/llm/recipes/mixtral_8x3b_16k.py deleted file mode 100644 index 13ca1c2d4537..000000000000 --- a/nemo/collections/llm/recipes/mixtral_8x3b_16k.py +++ /dev/null @@ -1,132 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from typing import Optional - -import nemo_run as run -import pytorch_lightning as pl -import torch - -from nemo.collections.llm.api import finetune, pretrain -from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.gpt.data.squad import SquadDataModule -from nemo.collections.llm.recipes import mixtral_8x3b - -NAME = "mixtral_8x3b_16k" - - -@run.cli.factory(name=NAME) -def model() -> run.Config[pl.LightningModule]: - """ - Factory function to create a Mixtral 8x3B model configuration with 16k sequence length. - - Returns: - run.Config[pl.LightningModule]: Configuration for the Mixtral 8x3B model with 16k sequence length. - - Examples: - CLI usage: - $ nemo llm pretrain model=mixtral_8x3b_16k ... - - Python API usage: - >>> model_config = model() - >>> print(model_config) - """ - model_config = mixtral_8x3b.model() - model_config.config.seq_length = 16384 - model_config.config.max_position_embeddings = 16384 - return model_config - - -def trainer( - num_nodes: int = 1, - num_gpus_per_node: int = 8, -) -> run.Config: - """ - Configure the NeMo Lightning Trainer for Mixtral 8x3B model with 16k sequence length. - - This function sets up the distributed training strategy optimized for longer sequences. - - Args: - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - - Returns: - run.Config: Configuration for the NeMo Lightning Trainer. - - Examples: - CLI usage: - $ nemo llm pretrain trainer=mixtral_8x3b_16k ... - - Python API usage: - >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8) - >>> print(trainer_config) - - Note: - This configuration uses increased parallelism to handle the longer sequence length efficiently. - """ - return mixtral_8x3b.trainer( - tensor_parallelism=2, - pipeline_parallelism=2, - pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=8, - context_parallelism=2, - sequence_parallelism=True, - expert_parallelism=1, - num_nodes=num_nodes, - num_gpus_per_node=num_gpus_per_node, - ) - - -@run.cli.factory(target=pretrain, name=NAME) -def pretrain_recipe( - dir: Optional[str] = None, - name: str = "default", - num_nodes: int = 1, - num_gpus_per_node: int = 8, -) -> run.Partial: - """ - Create a pre-training recipe for Mixtral 8x3B model with 16k sequence length. - - This function sets up a complete configuration for pre-training, including - model, trainer, and data settings optimized for 16k sequence length. - - Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - - Returns: - run.Partial: Partial configuration for pre-training. - - Examples: - CLI usage: - $ nemo llm pretrain --factory mixtral_8x3b_16k - $ nemo llm pretrain --factory "mixtral_8x3b_16k(num_nodes=2, name='my_16k_pretrain')" - - Python API usage: - >>> recipe = pretrain_recipe(name="mixtral_8x3b_16k_pretrain", num_nodes=2) - >>> print(recipe) - - Note: - This recipe is optimized for handling longer sequences (16k) compared to the standard version. - """ - recipe = mixtral_8x3b.pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) - - recipe.model = model() - recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) - recipe.data = run.Config(MockDataModule, seq_length=16384, global_batch_size=512, micro_batch_size=1) - - return recipe diff --git a/nemo/collections/llm/recipes/mixtral_8x3b_64k.py b/nemo/collections/llm/recipes/mixtral_8x3b_64k.py deleted file mode 100644 index e21d85a13dcd..000000000000 --- a/nemo/collections/llm/recipes/mixtral_8x3b_64k.py +++ /dev/null @@ -1,133 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from typing import Optional - -import nemo_run as run -import pytorch_lightning as pl -import torch - -from nemo.collections.llm.api import finetune, pretrain -from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.gpt.data.squad import SquadDataModule -from nemo.collections.llm.recipes import mixtral_8x3b -from nemo.utils.exp_manager import TimingCallback - -NAME = "mixtral_8x3b_64k" - - -@run.cli.factory(name=NAME) -def model() -> run.Config[pl.LightningModule]: - """ - Factory function to create a Mixtral 8x3B model configuration with 64k sequence length. - - Returns: - run.Config[pl.LightningModule]: Configuration for the Mixtral 8x3B model with 64k sequence length. - - Examples: - CLI usage: - $ nemo llm pretrain model=mixtral_8x3b_64k ... - - Python API usage: - >>> model_config = model() - >>> print(model_config) - """ - model_config = mixtral_8x3b.model() - model_config.config.seq_length = 65536 - return model_config - - -def trainer( - num_nodes: int = 8, - num_gpus_per_node: int = 8, -) -> run.Config: - """ - Configure the NeMo Lightning Trainer for Mixtral 8x3B model with 64k sequence length. - - This function sets up the distributed training strategy optimized for long sequences. - - Args: - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - - Returns: - run.Config: Configuration for the NeMo Lightning Trainer. - - Examples: - CLI usage: - $ nemo llm pretrain trainer=mixtral_8x3b_64k ... - - Python API usage: - >>> trainer_config = trainer(num_nodes=8, num_gpus_per_node=8) - >>> print(trainer_config) - - Note: - This configuration uses significantly increased parallelism to handle the long sequence length efficiently. - """ - return mixtral_8x3b.trainer( - tensor_parallelism=4, - pipeline_parallelism=4, - pipeline_parallelism_type=torch.bfloat16, - virtual_pipeline_parallelism=8, - context_parallelism=4, - sequence_parallelism=True, - expert_parallelism=1, - num_nodes=num_nodes, - num_gpus_per_node=num_gpus_per_node, - callbacks=[run.Config(TimingCallback)], - ) - - -@run.cli.factory(target=pretrain, name=NAME) -def pretrain_recipe( - dir: Optional[str] = None, - name: str = "default", - num_nodes: int = 8, - num_gpus_per_node: int = 8, -) -> run.Partial: - """ - Create a pre-training recipe for Mixtral 8x3B model with 64k sequence length. - - This function sets up a complete configuration for pre-training, including - model, trainer, and data settings optimized for 64k sequence length. - - Args: - dir (Optional[str]): Directory for saving logs and checkpoints. - name (str): Name of the pre-training run. - num_nodes (int): Number of compute nodes to use. - num_gpus_per_node (int): Number of GPUs per node. - - Returns: - run.Partial: Partial configuration for pre-training. - - Examples: - CLI usage: - $ nemo llm pretrain --factory mixtral_8x3b_64k - $ nemo llm pretrain --factory "mixtral_8x3b_64k(num_nodes=8, name='my_64k_pretrain')" - - Python API usage: - >>> recipe = pretrain_recipe(name="mixtral_8x3b_64k_pretrain", num_nodes=8) - >>> print(recipe) - - Note: - This recipe is optimized for handling long sequences (64k) compared to the standard version. - It requires significant computational resources due to the extended sequence length. - """ - recipe = mixtral_8x3b.pretrain_recipe(name=name, dir=dir, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) - - recipe.model = model() - recipe.trainer = trainer(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) - recipe.data = run.Config(MockDataModule, seq_length=65536, global_batch_size=512, micro_batch_size=1) - return recipe diff --git a/tests/collections/llm/recipes/test_mixtral_8x3b.py b/tests/collections/llm/recipes/test_mixtral_8x3b.py deleted file mode 100644 index 238fec74e0e1..000000000000 --- a/tests/collections/llm/recipes/test_mixtral_8x3b.py +++ /dev/null @@ -1,110 +0,0 @@ -import nemo_run as run -import pytest - -from nemo.collections.llm.api import finetune, pretrain -from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.gpt.data.squad import SquadDataModule -from nemo.collections.llm.gpt.model.mixtral import MixtralConfig8x3B, MixtralModel -from nemo.collections.llm.peft.lora import LoRA -from nemo.collections.llm.recipes import mixtral_8x3b -from nemo.lightning import AutoResume, Trainer - - -class TestMixtral8x3B: - @pytest.fixture(scope="class") - def recipe_module(self): - return mixtral_8x3b - - def test_model(self, recipe_module): - model_config = recipe_module.model() - assert isinstance(model_config, run.Config) - assert model_config.__fn_or_cls__ == MixtralModel - assert isinstance(model_config.config, run.Config) - assert model_config.config.__fn_or_cls__ == MixtralConfig8x3B - - def test_trainer(self, recipe_module): - trainer_config = recipe_module.trainer() - assert isinstance(trainer_config, run.Config) - assert trainer_config.__fn_or_cls__ == Trainer - assert trainer_config.accelerator == "gpu" - assert trainer_config.devices == 8 - assert trainer_config.num_nodes == 2 - - # Check strategy configuration - assert isinstance(trainer_config.strategy, run.Config) - assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" - assert trainer_config.strategy.tensor_model_parallel_size == 1 - assert trainer_config.strategy.pipeline_model_parallel_size == 1 - assert trainer_config.strategy.pipeline_dtype is None - assert trainer_config.strategy.virtual_pipeline_model_parallel_size is None - assert trainer_config.strategy.context_parallel_size == 1 - assert trainer_config.strategy.sequence_parallel is False - assert trainer_config.strategy.expert_model_parallel_size == 4 - - def test_pretrain_recipe(self, recipe_module): - recipe = recipe_module.pretrain_recipe() - assert isinstance(recipe, run.Partial) - assert recipe.__fn_or_cls__ == pretrain - assert isinstance(recipe.model, run.Config) - assert recipe.model.__fn_or_cls__ == MixtralModel - assert isinstance(recipe.trainer, run.Config) - assert recipe.trainer.__fn_or_cls__ == Trainer - assert isinstance(recipe.data, run.Config) - assert recipe.data.__fn_or_cls__ == MockDataModule - assert recipe.data.seq_length == 8192 - assert recipe.data.global_batch_size == 512 - assert recipe.data.micro_batch_size == 1 - - def test_finetune_recipe(self, recipe_module): - recipe = recipe_module.finetune_recipe() - assert isinstance(recipe, run.Partial) - assert recipe.__fn_or_cls__ == finetune - assert isinstance(recipe.model, run.Config) - assert recipe.model.__fn_or_cls__ == MixtralModel - assert isinstance(recipe.trainer, run.Config) - assert recipe.trainer.__fn_or_cls__ == Trainer - assert isinstance(recipe.data, run.Config) - assert recipe.data.__fn_or_cls__ == SquadDataModule - assert recipe.data.seq_length == 8192 - assert recipe.data.global_batch_size == 512 - assert recipe.data.micro_batch_size == 1 - assert isinstance(recipe.peft, run.Config) - assert recipe.peft.__fn_or_cls__ == LoRA - assert recipe.peft.target_modules == ['linear_qkv', 'linear_proj'] - assert recipe.peft.dim == 32 - - @pytest.mark.parametrize("num_nodes,num_gpus_per_node", [(1, 8), (2, 4), (4, 2)]) - def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_nodes, num_gpus_per_node): - recipe = recipe_module.pretrain_recipe(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) - assert recipe.trainer.num_nodes == num_nodes - assert recipe.trainer.devices == num_gpus_per_node - - def test_hf_resume(self, recipe_module): - resume_config = recipe_module.hf_resume() - assert isinstance(resume_config, run.Config) - assert resume_config.__fn_or_cls__ == AutoResume - assert isinstance(resume_config.restore_config, run.Config) - assert resume_config.restore_config.path == "hf://mistralai/Mixtral-8x7B-v0.1" - - def test_trainer_parallelism_options(self, recipe_module): - trainer_config = recipe_module.trainer( - tensor_parallelism=8, - pipeline_parallelism=2, - context_parallelism=4, - sequence_parallelism=False, - expert_parallelism=2, - ) - assert trainer_config.strategy.tensor_model_parallel_size == 8 - assert trainer_config.strategy.pipeline_model_parallel_size == 2 - assert trainer_config.strategy.context_parallel_size == 4 - assert trainer_config.strategy.sequence_parallel is False - assert trainer_config.strategy.expert_model_parallel_size == 2 - - def test_model_config_parameters(self, recipe_module): - model_config = recipe_module.model() - mixtral_config = model_config.config - assert mixtral_config.num_layers == 32 - assert mixtral_config.hidden_size == 2560 - assert mixtral_config.num_attention_heads == 32 - assert mixtral_config.seq_length == 4096 - assert mixtral_config.num_moe_experts == 8 diff --git a/tests/collections/llm/recipes/test_mixtral_8x3b_16k.py b/tests/collections/llm/recipes/test_mixtral_8x3b_16k.py deleted file mode 100644 index 1f1b041584d8..000000000000 --- a/tests/collections/llm/recipes/test_mixtral_8x3b_16k.py +++ /dev/null @@ -1,84 +0,0 @@ -import nemo_run as run -import pytest -import torch - -from nemo.collections.llm.api import finetune, pretrain -from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.gpt.data.squad import SquadDataModule -from nemo.collections.llm.gpt.model.mixtral import MixtralConfig8x3B, MixtralModel -from nemo.collections.llm.recipes import mixtral_8x3b_16k -from nemo.lightning import Trainer - - -class TestMixtral8x3B_16k: - @pytest.fixture(scope="class") - def recipe_module(self): - return mixtral_8x3b_16k - - def test_model(self, recipe_module): - model_config = recipe_module.model() - assert isinstance(model_config, run.Config) - assert model_config.__fn_or_cls__ == MixtralModel - assert isinstance(model_config.config, run.Config) - assert model_config.config.__fn_or_cls__ == MixtralConfig8x3B - assert model_config.config.seq_length == 16384 - assert model_config.config.max_position_embeddings == 16384 - - def test_trainer(self, recipe_module): - trainer_config = recipe_module.trainer() - assert isinstance(trainer_config, run.Config) - assert trainer_config.__fn_or_cls__ == Trainer - assert trainer_config.accelerator == "gpu" - assert trainer_config.devices == 8 - assert trainer_config.num_nodes == 1 - - # Check strategy configuration - assert isinstance(trainer_config.strategy, run.Config) - assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" - assert trainer_config.strategy.tensor_model_parallel_size == 2 - assert trainer_config.strategy.pipeline_model_parallel_size == 2 - assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 8 - assert trainer_config.strategy.context_parallel_size == 2 - assert trainer_config.strategy.sequence_parallel is True - assert trainer_config.strategy.expert_model_parallel_size == 1 - - def test_pretrain_recipe(self, recipe_module): - recipe = recipe_module.pretrain_recipe() - assert isinstance(recipe, run.Partial) - assert recipe.__fn_or_cls__ == pretrain - assert isinstance(recipe.model, run.Config) - assert recipe.model.__fn_or_cls__ == MixtralModel - assert isinstance(recipe.trainer, run.Config) - assert recipe.trainer.__fn_or_cls__ == Trainer - assert isinstance(recipe.data, run.Config) - assert recipe.data.__fn_or_cls__ == MockDataModule - assert recipe.data.seq_length == 16384 - assert recipe.data.global_batch_size == 512 - assert recipe.data.micro_batch_size == 1 - - @pytest.mark.parametrize("num_nodes,num_gpus_per_node", [(1, 8), (2, 4), (4, 2)]) - def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_nodes, num_gpus_per_node): - recipe = recipe_module.pretrain_recipe(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) - assert recipe.trainer.num_nodes == num_nodes - assert recipe.trainer.devices == num_gpus_per_node - - def test_trainer_parallelism_options(self, recipe_module): - trainer_config = recipe_module.trainer() - assert trainer_config.strategy.tensor_model_parallel_size == 2 - assert trainer_config.strategy.pipeline_model_parallel_size == 2 - assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 8 - assert trainer_config.strategy.context_parallel_size == 2 - assert trainer_config.strategy.sequence_parallel is True - assert trainer_config.strategy.expert_model_parallel_size == 1 - - def test_model_config_parameters(self, recipe_module): - model_config = recipe_module.model() - mixtral_config = model_config.config - assert mixtral_config.num_layers == 32 - assert mixtral_config.hidden_size == 2560 - assert mixtral_config.num_attention_heads == 32 - assert mixtral_config.seq_length == 16384 - assert mixtral_config.max_position_embeddings == 16384 - assert mixtral_config.num_moe_experts == 8 diff --git a/tests/collections/llm/recipes/test_mixtral_8x3b_64k.py b/tests/collections/llm/recipes/test_mixtral_8x3b_64k.py deleted file mode 100644 index d71017649b1b..000000000000 --- a/tests/collections/llm/recipes/test_mixtral_8x3b_64k.py +++ /dev/null @@ -1,84 +0,0 @@ -import nemo_run as run -import pytest -import torch - -from nemo.collections.llm.api import finetune, pretrain -from nemo.collections.llm.gpt.data.mock import MockDataModule -from nemo.collections.llm.gpt.data.squad import SquadDataModule -from nemo.collections.llm.gpt.model.mixtral import MixtralConfig8x3B, MixtralModel -from nemo.collections.llm.recipes import mixtral_8x3b_64k -from nemo.lightning import Trainer - - -class TestMixtral8x3B_64k: - @pytest.fixture(scope="class") - def recipe_module(self): - return mixtral_8x3b_64k - - def test_model(self, recipe_module): - model_config = recipe_module.model() - assert isinstance(model_config, run.Config) - assert model_config.__fn_or_cls__ == MixtralModel - assert isinstance(model_config.config, run.Config) - assert model_config.config.__fn_or_cls__ == MixtralConfig8x3B - assert model_config.config.seq_length == 65536 - assert model_config.config.max_position_embeddings == 4096 - - def test_trainer(self, recipe_module): - trainer_config = recipe_module.trainer() - assert isinstance(trainer_config, run.Config) - assert trainer_config.__fn_or_cls__ == Trainer - assert trainer_config.accelerator == "gpu" - assert trainer_config.devices == 8 - assert trainer_config.num_nodes == 8 - - # Check strategy configuration - assert isinstance(trainer_config.strategy, run.Config) - assert trainer_config.strategy.__fn_or_cls__.__name__ == "MegatronStrategy" - assert trainer_config.strategy.tensor_model_parallel_size == 4 - assert trainer_config.strategy.pipeline_model_parallel_size == 4 - assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 8 - assert trainer_config.strategy.context_parallel_size == 4 - assert trainer_config.strategy.sequence_parallel is True - assert trainer_config.strategy.expert_model_parallel_size == 1 - - def test_pretrain_recipe(self, recipe_module): - recipe = recipe_module.pretrain_recipe() - assert isinstance(recipe, run.Partial) - assert recipe.__fn_or_cls__ == pretrain - assert isinstance(recipe.model, run.Config) - assert recipe.model.__fn_or_cls__ == MixtralModel - assert isinstance(recipe.trainer, run.Config) - assert recipe.trainer.__fn_or_cls__ == Trainer - assert isinstance(recipe.data, run.Config) - assert recipe.data.__fn_or_cls__ == MockDataModule - assert recipe.data.seq_length == 65536 - assert recipe.data.global_batch_size == 512 - assert recipe.data.micro_batch_size == 1 - - @pytest.mark.parametrize("num_nodes,num_gpus_per_node", [(32, 8), (64, 4), (128, 2)]) - def test_pretrain_recipe_with_different_configurations(self, recipe_module, num_nodes, num_gpus_per_node): - recipe = recipe_module.pretrain_recipe(num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node) - assert recipe.trainer.num_nodes == num_nodes - assert recipe.trainer.devices == num_gpus_per_node - - def test_trainer_parallelism_options(self, recipe_module): - trainer_config = recipe_module.trainer() - assert trainer_config.strategy.tensor_model_parallel_size == 4 - assert trainer_config.strategy.pipeline_model_parallel_size == 4 - assert trainer_config.strategy.pipeline_dtype == torch.bfloat16 - assert trainer_config.strategy.virtual_pipeline_model_parallel_size == 8 - assert trainer_config.strategy.context_parallel_size == 4 - assert trainer_config.strategy.sequence_parallel is True - assert trainer_config.strategy.expert_model_parallel_size == 1 - - def test_model_config_parameters(self, recipe_module): - model_config = recipe_module.model() - mixtral_config = model_config.config - assert mixtral_config.num_layers == 32 - assert mixtral_config.hidden_size == 2560 - assert mixtral_config.num_attention_heads == 32 - assert mixtral_config.seq_length == 65536 - assert mixtral_config.max_position_embeddings == 4096 - assert mixtral_config.num_moe_experts == 8 From 07f251d1341376407465e5c98cf26741890f2e97 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Fri, 4 Oct 2024 12:38:06 -0700 Subject: [PATCH 2/3] remove 8x3b from test_nemo_run Signed-off-by: Alexandros Koumparoulis --- tests/lightning/test_nemo_run.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/lightning/test_nemo_run.py b/tests/lightning/test_nemo_run.py index d651890b5fd3..8d7814bfe530 100644 --- a/tests/lightning/test_nemo_run.py +++ b/tests/lightning/test_nemo_run.py @@ -19,10 +19,6 @@ ("llama31_405b", "pretrain_recipe", "llama31_405b_pretrain"), ("mistral", "pretrain_recipe", "mistral_pretrain"), ("mistral", "finetune_recipe", "mistral_finetune"), - ("mixtral_8x3b", "pretrain_recipe", "mixtral_8x3b_pretrain"), - ("mixtral_8x3b", "finetune_recipe", "mixtral_8x3b_finetune"), - ("mixtral_8x3b_16k", "pretrain_recipe", "mixtral_8x3b_16k_pretrain"), - ("mixtral_8x3b_64k", "pretrain_recipe", "mixtral_8x3b_64k_pretrain"), ("mixtral_8x7b", "pretrain_recipe", "mixtral_8x7b_pretrain"), ("mixtral_8x7b", "finetune_recipe", "mixtral_8x7b_finetune"), ("mixtral_8x7b_16k", "pretrain_recipe", "mixtral_8x7b_16k_pretrain"), From 3898d916a6bf9336c9a7f04653e2772283d4101e Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis Date: Fri, 4 Oct 2024 15:08:38 -0700 Subject: [PATCH 3/3] rm from __init__ Signed-off-by: Alexandros Koumparoulis --- nemo/collections/llm/recipes/__init__.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/nemo/collections/llm/recipes/__init__.py b/nemo/collections/llm/recipes/__init__.py index 43c881110603..6bee8c882ffd 100644 --- a/nemo/collections/llm/recipes/__init__.py +++ b/nemo/collections/llm/recipes/__init__.py @@ -22,9 +22,6 @@ llama3_70b_64k, llama31_405b, mistral, - mixtral_8x3b, - mixtral_8x3b_16k, - mixtral_8x3b_64k, mixtral_8x7b, mixtral_8x7b_16k, mixtral_8x7b_64k, @@ -52,9 +49,6 @@ "llama3_70b_64k", "llama31_405b", "mistral", - "mixtral_8x3b", - "mixtral_8x3b_16k", - "mixtral_8x3b_64k", "mixtral_8x7b", "mixtral_8x7b_16k", "mixtral_8x7b_64k",