From cb86f4521b91e9f87e7fd23a4e836dfc63bbdc21 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Thu, 5 Jun 2025 15:26:02 -0400 Subject: [PATCH 01/43] Test all models --- fast_llm/layers/transformer/config.py | 8 +- tests/common.py | 470 ----------------------- tests/conftest.py | 9 +- tests/data/common.py | 2 +- tests/data/test_blending.py | 2 +- tests/data/test_concatenate.py | 2 +- tests/data/test_concatenated_memmap.py | 2 +- tests/data/test_dataset_from_file.py | 2 +- tests/data/test_fim.py | 2 +- tests/data/test_memmap.py | 2 +- tests/data/test_sampling.py | 2 +- tests/data/test_slice.py | 2 +- tests/layers/test_lm_head.py | 2 +- tests/test_checkpoint.py | 344 ++++++++--------- tests/test_config.py | 5 +- tests/test_functional.py | 2 +- tests/test_gpt_generate_and_forward.py | 69 ++-- tests/test_match_megatron.py | 156 +------- tests/test_mb.py | 68 ++-- tests/test_mb_seq_first.py | 39 +- tests/test_ms.py | 32 +- tests/test_mtp.py | 2 +- tests/test_multi_stage.py | 6 +- tests/test_seq_first.py | 39 +- tests/test_simple.py | 73 ++-- tests/test_ssms.py | 2 +- tests/test_triton_kernels.py | 2 +- tests/utils/__init__.py | 0 tests/{ => utils}/compare_tensor_logs.py | 0 tests/utils/dataset.py | 82 ++++ tests/utils/model_configs.py | 276 +++++++++++++ tests/utils/run_test_script.py | 118 ++++++ tests/utils/utils.py | 55 +++ 33 files changed, 885 insertions(+), 992 deletions(-) delete mode 100644 tests/common.py create mode 100644 tests/utils/__init__.py rename tests/{ => utils}/compare_tensor_logs.py (100%) create mode 100644 tests/utils/dataset.py create mode 100644 tests/utils/model_configs.py create mode 100644 tests/utils/run_test_script.py create mode 100644 tests/utils/utils.py diff --git a/fast_llm/layers/transformer/config.py b/fast_llm/layers/transformer/config.py index e7ef0b15f..235aa366e 100644 --- a/fast_llm/layers/transformer/config.py +++ b/fast_llm/layers/transformer/config.py @@ -711,13 +711,7 @@ def setup_tensor_space(self, tensor_space: TensorSpace) -> None: ) def do_use_flash_attention(self, distributed_config: DistributedConfig) -> bool: - use_flash_attention = self.use_flash_attention and distributed_config.training_dtype in ( + return self.use_flash_attention and distributed_config.training_dtype in ( DataType.float16, DataType.bfloat16, ) - - # Config parameter `window_size` only can be used with flash attention - if not use_flash_attention: - Assert.is_(self.window_size, None) - - return use_flash_attention diff --git a/tests/common.py b/tests/common.py deleted file mode 100644 index d531972e7..000000000 --- a/tests/common.py +++ /dev/null @@ -1,470 +0,0 @@ -import os -import pathlib -import random -import shutil -import string -import subprocess -import sys - -import numpy as np -import pytest -import torch -import yaml - -from fast_llm.data.dataset.gpt.memmap import GPTMemmapDataset -from fast_llm.data.dataset.gpt.sampled import GPTSample -from fast_llm.layers.ssm.config import SSMConfig -from fast_llm.layers.transformer.config import TransformerConfig -from fast_llm.models.gpt.config import ( - LlamaGPTHuggingfaceCheckpointFormat, - MistralGPTHuggingfaceCheckpointFormat, - MixtralGPTHuggingfaceCheckpointFormat, - MTPLlamaGPTHuggingfaceCheckpointFormat, - Qwen2GPTHuggingfaceCheckpointFormat, - Starcoder2GPTHuggingfaceCheckpointFormat, -) -from fast_llm.models.ssm.config import HybridSSMBaseModelConfig, LLambaHuggingfaceCheckpointFormat -from fast_llm.tools.train import CliTrainingConfig -from tests.compare_tensor_logs import CompareConfig, compare_tensor_logs 
- -# FIXME: figure out correct import of megatron modules without this hack -sys.path.append(os.getcwd()) - -# TODO: Use `pytest_addoption` instead? -# Keep all results in one place to allow recovering them for debugging in case of failure. -TEST_RESULTS_PATH = pathlib.Path(os.environ.get("TEST_RESULTS_PATH", "/tmp/fast_llm_tests")).resolve() -FORCE_REUSE_RESULTS = int(os.environ.get("FORCE_REUSE_RESULTS", 0)) != 0 -REUSE_RESULTS = FORCE_REUSE_RESULTS or int(os.environ.get("REUSE_RESULTS", 0)) != 0 -_LOG_LEVEL = int(os.environ.get("LOG_LEVEL", 13)) -TEST_MODEL = os.environ.get("MODEL", "llama") - -ARTIFACT_PATH = "runs/0/artifacts" - -TOKENIZER_PATH = TEST_RESULTS_PATH / "tokenizer" / "common" -TOKENIZER_FILE = TOKENIZER_PATH / "tokenizer.json" -DATASET_CACHE = TEST_RESULTS_PATH / "dataset" -DATASET_PREFIX = DATASET_CACHE / "common" / "dataset" -DATASET_SAMPLING_CACHE = TEST_RESULTS_PATH / "dataset" / "cache" - -TEST_VOCAB_SIZE = 8192 -# Random lowercase: 80.7% (3.1% each); space: 18.6%; doc end: 0.6% -TEST_CHARACTERS = (string.ascii_lowercase) * 5 + " " * 30 + "\n" -TEST_DATASET_TOKENS = 1000000 - -CONFIG_BASE_FAST_LLM = [ - "training.logs.interval=1", - "run.tensor_logs.save=True", - "run.tensor_logs.show=False", - "model.base_model.transformer.num_layers=2", - "model.base_model.transformer.hidden_size=256", - "model.base_model.transformer.num_attention_heads=8", - "model.base_model.transformer.init_method_std=0.022", - f"model.base_model.vocab_size={TEST_VOCAB_SIZE}", - f"model.multi_stage.debug_param_init={_LOG_LEVEL}", - f"model.multi_stage.debug_layer_outputs={_LOG_LEVEL}", - f"model.multi_stage.debug_layer_gradients={_LOG_LEVEL}", - f"model.multi_stage.debug_all_param_gradients={_LOG_LEVEL}", - "model.multi_stage.debug_tensor_parallel=True", - "model.distributed.reproducible_init=True", - "model.distributed.timeout=10", - "training.train_iters=2", - "training.num_workers=0", - "training.timeout=30", - "batch.batch_size=8", - "batch.sequence_length=512", - "data.datasets.training.type=slice", - "data.datasets.training.end=0.969", - "data.datasets.training.dataset.type=memmap", - f"data.datasets.training.dataset.path={DATASET_PREFIX}", - "data.datasets.validation.type=slice", - "data.datasets.validation.begin=0.969", - "data.datasets.validation.end=0.999", - "data.datasets.validation.dataset.type=memmap", - f"data.datasets.validation.dataset.path={DATASET_PREFIX}", - "data.datasets.test.type=slice", - "data.datasets.test.begin=0.999", - "data.datasets.test.end=1", - "data.datasets.test.dataset.type=memmap", - f"data.datasets.test.dataset.path={DATASET_PREFIX}", - "optimizer.learning_rate.base=0.0001", -] -CONFIG_BASE_MEGATRON = [ - "--num-layers=2", - "--hidden-size=256", - "--num-attention-heads=8", - "--log-interval=1", - "--train-iters=2", - "--eval-iters=0", - "--hidden-dropout=0", - "--attention-dropout=0", - f"--debug_param_init={_LOG_LEVEL}", - f"--debug_layer_outputs={_LOG_LEVEL}", - f"--debug_layer_gradients={_LOG_LEVEL}", - f"--debug_all_param_gradients={_LOG_LEVEL}", - "--debug_param_update=0", - "--global-batch-size=8", - "--max-position-embeddings=512", - "--seq-length=512", - "--init-method-std=0.022", - "--lr=0.0001", - "--num-workers=0", - "--valid-num-workers=0", - "--tokenizer-type=NullTokenizer", - # Megatron messes with the vocab size, so we have to subtract 1. 
- f"--vocab-size={TEST_VOCAB_SIZE-1}", - f"--data-path={DATASET_PREFIX}", - "--lr-decay-style=constant", - # Initialization is set up to match MCore models (MCore inverts self-attn qkv and dense layers compared to original Megatron) - "--use-mcore-models", - # local implementation doesn't allow for RMS norm. - "--transformer-impl=transformer_engine", -] - -CONFIG_SC1_FAST_LLM = CONFIG_BASE_FAST_LLM + ["model.base_model.max_position_embeddings=512"] -CONFIG_SC1_MEGATRON = CONFIG_BASE_MEGATRON + ["--group-query-attention"] -CONFIG_SC1_COMMON = CONFIG_SC1_FAST_LLM + ["model.distributed.training_dtype=bf16"] - -CONFIG_GPT2_FAST_LLM = CONFIG_SC1_FAST_LLM + ["model.base_model.transformer.head_groups=8"] -CONFIG_GPT2_MEGATRON = CONFIG_BASE_MEGATRON -CONFIG_GPT2_COMMON = CONFIG_GPT2_FAST_LLM + ["model.distributed.training_dtype=bf16"] - -CONFIG_SC2_FAST_LLM = CONFIG_BASE_FAST_LLM + [ - "model.base_model.transformer.head_groups=4", - "model.base_model.transformer.rotary.type=default", -] -CONFIG_SC2_MEGATRON = CONFIG_SC1_MEGATRON + [ - "--num-query-groups=4", - "--use-rotary-position-embeddings", - "--no-position-embedding", -] -CONFIG_SC2_COMMON = CONFIG_SC2_FAST_LLM + ["model.distributed.training_dtype=bf16"] - -CONFIG_LLAMA_MEGATRON = CONFIG_SC2_MEGATRON + [ - "--swiglu", - "--disable-bias-linear", - "--normalization=RMSNorm", - "--ffn-hidden-size=1024", - "--untie-embeddings-and-output-weights", -] -CONFIG_LLAMA_FAST_LLM = CONFIG_SC2_FAST_LLM + [ - "model.base_model.transformer.gated=True", - "model.base_model.transformer.activation_type=silu", - "model.base_model.transformer.add_linear_biases=False", - "model.base_model.transformer.normalization.type=rms_norm", - "model.base_model.transformer.ffn_hidden_size=1024", - "model.base_model.tie_word_embeddings=False", -] -CONFIG_LLAMA_COMMON = CONFIG_LLAMA_FAST_LLM + ["model.distributed.training_dtype=bf16"] - -# Megatron does not support Llama3-style Rotary Embeddings -CONFIG_LLAMA3_MEGATRON = None -CONFIG_LLAMA3_FAST_LLM = CONFIG_LLAMA_FAST_LLM + [ - "model.base_model.transformer.rotary.type=llama3", -] -CONFIG_LLAMA3_COMMON = CONFIG_LLAMA3_FAST_LLM + ["model.distributed.training_dtype=bf16"] - -# Megatron does not support per sub layer biases -CONFIG_QWEN2_MEGATRON = None -CONFIG_QWEN2_FAST_LLM = CONFIG_SC2_FAST_LLM + [ - "model.base_model.transformer.gated=True", - "model.base_model.transformer.activation_type=silu", - "model.base_model.transformer.add_linear_biases=only_attn_qkv", - "model.base_model.transformer.normalization.type=rms_norm", - "model.base_model.transformer.ffn_hidden_size=1024", - "model.base_model.tie_word_embeddings=False", -] -CONFIG_QWEN2_COMMON = CONFIG_QWEN2_FAST_LLM + ["model.distributed.training_dtype=bf16"] - -# Yarn-style Rotary Embeddings -CONFIG_LLAMA_YARN_MEGATRON = None -CONFIG_LLAMA_YARN_FAST_LLM = CONFIG_LLAMA_FAST_LLM + [ - "model.base_model.transformer.rotary.type=yarn", -] -CONFIG_LLAMA_YARN_COMMON = CONFIG_LLAMA_YARN_FAST_LLM + ["model.distributed.training_dtype=bf16"] - - -CONFIG_MIXTRAL_MEGATRON = CONFIG_LLAMA_MEGATRON + [ - "--num-experts=4", - "--moe-router-topk=4", -] -CONFIG_MIXTRAL_FAST_LLM = CONFIG_LLAMA_FAST_LLM + [ - "model.base_model.transformer.num_experts=4", - "model.base_model.transformer.num_experts_per_token=4", -] -CONFIG_MIXTRAL_COMMON = CONFIG_MIXTRAL_FAST_LLM + ["model.distributed.training_dtype=bf16"] -CONFIG_MIXTRAL_YARN_MEGATRON = None -CONFIG_MIXTRAL_YARN_FAST_LLM = CONFIG_MIXTRAL_FAST_LLM + [ - "model.base_model.transformer.rotary.type=yarn", -] -CONFIG_MIXTRAL_YARN_COMMON = 
CONFIG_MIXTRAL_YARN_FAST_LLM + ["model.distributed.training_dtype=bf16"] - -CONFIG_LLAMA_MTP_MEGATRON = None -CONFIG_LLAMA_MTP_FAST_LLM = CONFIG_LLAMA_FAST_LLM + [ - "model.base_model.prediction_heads=4", -] -CONFIG_LLAMA_MTP_COMMON = CONFIG_LLAMA_MTP_FAST_LLM + ["model.distributed.training_dtype=bf16"] - -CONFIG_LLAMBA_FAST_LLM = CONFIG_LLAMA_FAST_LLM + ["model.base_model.hybrid_block_layout==['t','m']"] -CONFIG_LLAMBA_MEGATRON = CONFIG_LLAMA_MEGATRON + [] -CONFIG_LLAMBA_COMMON = CONFIG_LLAMBA_FAST_LLM - -_CONFIGS = { - "gpt2": ("gpt", CONFIG_GPT2_FAST_LLM, CONFIG_GPT2_MEGATRON, CONFIG_GPT2_COMMON, None), - "sc1": ("gpt", CONFIG_SC1_FAST_LLM, CONFIG_SC1_MEGATRON, CONFIG_SC1_COMMON, None), - "starcoder2": ( - "gpt", - CONFIG_SC2_FAST_LLM, - CONFIG_SC2_MEGATRON, - CONFIG_SC2_COMMON, - Starcoder2GPTHuggingfaceCheckpointFormat, - ), - "llama": ( - "gpt", - CONFIG_LLAMA_FAST_LLM, - CONFIG_LLAMA_MEGATRON, - CONFIG_LLAMA_COMMON, - LlamaGPTHuggingfaceCheckpointFormat, - ), - "llama3": ( - "gpt", - CONFIG_LLAMA3_FAST_LLM, - CONFIG_LLAMA3_MEGATRON, - CONFIG_LLAMA3_COMMON, - LlamaGPTHuggingfaceCheckpointFormat, - ), - "qwen2": ( - "gpt", - CONFIG_QWEN2_FAST_LLM, - CONFIG_QWEN2_MEGATRON, - CONFIG_QWEN2_COMMON, - Qwen2GPTHuggingfaceCheckpointFormat, - ), - "llama-yarn": ( - "gpt", - CONFIG_LLAMA_YARN_FAST_LLM, - CONFIG_LLAMA_YARN_MEGATRON, - CONFIG_LLAMA_YARN_COMMON, - LlamaGPTHuggingfaceCheckpointFormat, - ), - "mistral": ( - "gpt", - CONFIG_LLAMA_FAST_LLM, - CONFIG_LLAMA_MEGATRON, - CONFIG_LLAMA_COMMON, - MistralGPTHuggingfaceCheckpointFormat, - ), - "mixtral": ( - "gpt", - CONFIG_MIXTRAL_FAST_LLM, - CONFIG_MIXTRAL_MEGATRON, - CONFIG_MIXTRAL_COMMON, - MixtralGPTHuggingfaceCheckpointFormat, - ), - "llamba": ( - "hybrid_ssm", - CONFIG_LLAMBA_FAST_LLM, - CONFIG_LLAMBA_MEGATRON, - CONFIG_LLAMBA_COMMON, - LLambaHuggingfaceCheckpointFormat, - ), - "mixtral-yarn": ( - "gpt", - CONFIG_MIXTRAL_YARN_FAST_LLM, - CONFIG_MIXTRAL_YARN_MEGATRON, - CONFIG_MIXTRAL_YARN_COMMON, - MixtralGPTHuggingfaceCheckpointFormat, - ), - "llama-mtp": ( - "gpt", - CONFIG_LLAMA_MTP_FAST_LLM, - CONFIG_LLAMA_MTP_MEGATRON, - CONFIG_LLAMA_MTP_COMMON, - MTPLlamaGPTHuggingfaceCheckpointFormat, - ), -} - -TEST_MODEL_TYPE, CONFIG_FAST_LLM, CONFIG_GPT2, CONFIG_COMMON, HUGGINGFACE_CHECKPOINT_FORMAT = _CONFIGS[TEST_MODEL] - - -requires_cuda = pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available") - - -def get_test_dataset( - prefix: pathlib.Path = DATASET_PREFIX, - seed: int = 1234, - num_tokens: int = TEST_DATASET_TOKENS, - characters: str = TEST_CHARACTERS, - vocab_size: int = TEST_VOCAB_SIZE, - max_spans: int = 0, -): - if not TOKENIZER_FILE.is_file(): - import transformers - - transformers.AutoTokenizer.from_pretrained("bigcode/santacoder").save_pretrained(TOKENIZER_PATH) - - if not ( - prefix.with_suffix(".idx").is_file() - and prefix.with_suffix(".bin").is_file() - and prefix.parent.joinpath("fast_llm_config.yaml").is_file() - ): - import transformers - - texts = "".join(random.Random(seed).choices(characters, k=num_tokens)).splitlines() - tokenizer = transformers.AutoTokenizer.from_pretrained(TOKENIZER_PATH) - - samples = [ - GPTSample(np.array(tokenizer(document)["input_ids"], dtype=np.uint16) % vocab_size) for document in texts - ] - if max_spans > 0: - lengths = np.array([max(len(sample.token_ids), 1) for sample in samples]) - spans = np.sort(np.random.RandomState(seed + 3847).randint(0, lengths[:, None], [len(samples), max_spans])) - for sample, span in zip(samples, spans): - span = np.unique(span) - 
sample.loss_masking_spans = span[: len(span) // 2 * 2].reshape(-1, 2) - - GPTMemmapDataset.write_dataset(prefix, samples) - yaml.safe_dump( - {"type": "memmap", "path": prefix.name}, prefix.parent.joinpath("fast_llm_config.yaml").open("w") - ) - - -def get_test_concatenated_memmap_dataset( - path: pathlib.Path, - num_files: int, - seed: int = 1234, - num_tokens: int = TEST_DATASET_TOKENS, - characters: str = TEST_CHARACTERS, - vocab_size: int = TEST_VOCAB_SIZE, - seed_shift: int = 55, -): - index_file = path / "index.txt" - if not index_file.is_file(): - for i in range(num_files): - get_test_dataset( - prefix=path / f"dataset_{i}", - seed=seed + i * seed_shift, - num_tokens=num_tokens, - characters=characters, - vocab_size=vocab_size, - ) - index_file.open("w").writelines([str(path / f"dataset_{i}") + "\n" for i in range(num_files)]) - - -@pytest.fixture(scope="session") -def run_test_script(worker_resources): - def do_run_test_script( - name: str, - script: list[str], - num_gpus: int = 1, - *, - model_type: str = TEST_MODEL_TYPE, - is_megatron: bool = False, - compare: str | None = None, - config: CompareConfig | None = None, - prepare_fn=None, - compare_fn=None, - do_compare: bool = True, - ): - if torch.cuda.device_count() < num_gpus: - pytest.skip(f"Not enough GPUs to run test ({torch.cuda.device_count()}<{num_gpus})") - env = os.environ.copy() - if is_megatron: - # Prevent Megatron from complaining. - env["CUDA_DEVICE_MAX_CONNECTIONS"] = "1" - env["NVTE_FLASH_ATTN"] = "0" - path = TEST_RESULTS_PATH / name - skip = False - artifact_path = path / ARTIFACT_PATH - if path.exists(): - assert path.is_dir() - # TODO: Better way to check if the previous attempt succeeded. - if ( - REUSE_RESULTS - and artifact_path.is_dir() - and len(list((artifact_path / "0").iterdir())) >= (1 if is_megatron else 3) - ): - skip = True - elif FORCE_REUSE_RESULTS: - raise RuntimeError(artifact_path) - else: - shutil.rmtree(path) - elif FORCE_REUSE_RESULTS: - raise RuntimeError(path) - if prepare_fn is not None: - skip = prepare_fn(TEST_RESULTS_PATH / name, None if compare is None else TEST_RESULTS_PATH / compare, skip) - if is_megatron: - script = [*script, f"--structured-logs-dir={path}", f"--data-cache-path={path}"] - else: - script = [model_type, *script, f"run.experiment_dir={path}"] - header = ["Megatron-LM/pretrain_gpt.py"] if is_megatron else ["--no-python", "fast-llm", "train"] - command = [ - "python", - "-m", - "torch.distributed.run", - f"--nproc-per-node={num_gpus}", - f"--rdzv-endpoint=localhost:{worker_resources.rendezvous_port}", - f"--master-port={worker_resources.torchrun_port}", - *header, - *script, - ] - print(" ".join(command)) - if skip: - print("Reusing existing run.") - else: - get_test_dataset() - if num_gpus == 1 and not is_megatron: - CliTrainingConfig.parse_and_run(script) - else: - completed_proc = subprocess.run(command, env=env, timeout=60) - if completed_proc.returncode: - raise RuntimeError(f"Process failed with return code {completed_proc.returncode}") - if compare and do_compare: - if compare_fn is not None: - compare_fn(TEST_RESULTS_PATH / name, TEST_RESULTS_PATH / compare) - compare_tensor_logs( - TEST_RESULTS_PATH / compare / ARTIFACT_PATH, - TEST_RESULTS_PATH / name / ARTIFACT_PATH, - config, - ) - - return do_run_test_script - - -def materialize_meta_tensors(model, tensor_space): - # Materialize parameters that are on meta device - for name, param in model.named_parameters(): - if param.device.type == "meta": - # Check if the parameter is a custom tensor type - if 
hasattr(param, "tensor_name") and hasattr(param, "init_parameter"): - param_data = param.new_empty(param.shape, device="cuda") - # Initialize param_data - param.init_parameter(param_data, tensor_space.distributed) - # Replace the parameter in the module - module_path, param_name = name.rsplit(".", 1) if "." in name else (None, name) - module = model - if module_path is not None: - for part in module_path.split("."): - module = getattr(module, part) - param = torch.nn.Parameter(param_data, requires_grad=param.requires_grad) - # TODO: add param_grad_is_zero etc., grad_buffer, etc., see test_mlp_recomputation - param.grad = None - param.grad_buffer = torch.empty_like(param) - param.param_grad_is_zero = True - module._parameters[param_name] = param - return model - - -def get_hybrid_config(hybrid_block_layout=["t", "m"], prediction_heads=1, default_mtp_type=None): - config = HybridSSMBaseModelConfig( - transformer=TransformerConfig(num_layers=len(hybrid_block_layout)), - ssm=SSMConfig(), - hybrid_block_layout=hybrid_block_layout, - prediction_heads=prediction_heads, - default_mtp_type=default_mtp_type, - init_method_std_embed=0.02, - init_method_min_embed=-0.02, - init_method_max_embed=0.02, - use_position_embeddings=True, - tie_word_embeddings=False, - ) - return config diff --git a/tests/conftest.py b/tests/conftest.py index edc52e034..3d1e940b0 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -10,7 +10,14 @@ from xdist.scheduler import LoadGroupScheduling # Make fixtures available globally without import -from tests.common import run_test_script # isort: skip +from tests.utils.run_test_script import ( # isort: skip + run_test_script, + run_test_script_base_path, + run_test_script_for_all_models, +) + +from tests.utils.model_configs import model_testing_config # isort: skip +from tests.utils.utils import result_path # isort: skip def pytest_addoption(parser): diff --git a/tests/data/common.py b/tests/data/common.py index cacb28e6b..2d3cb905f 100644 --- a/tests/data/common.py +++ b/tests/data/common.py @@ -23,7 +23,7 @@ from fast_llm.engine.distributed.distributed import Distributed from fast_llm.models.gpt.config import GPTBatchConfig from fast_llm.utils import Assert, div -from tests.common import TEST_VOCAB_SIZE +from tests.utils.dataset import TEST_VOCAB_SIZE def get_sampling_data( diff --git a/tests/data/test_blending.py b/tests/data/test_blending.py index de97eaa21..438782dfe 100644 --- a/tests/data/test_blending.py +++ b/tests/data/test_blending.py @@ -5,13 +5,13 @@ from fast_llm.data.dataset.gpt.config import GPTBlendedDatasetConfig from fast_llm.utils import Assert, normalize_probabilities -from tests.common import DATASET_CACHE, DATASET_PREFIX, get_test_dataset from tests.data.common import ( compare_sampled_dataset, get_dataset_config, get_sampling_data, get_test_data_and_compare_samples, ) +from tests.utils.dataset import DATASET_CACHE, DATASET_PREFIX, get_test_dataset _DATASET_PREFIX_MIX_1 = DATASET_CACHE / "blended_mix_1" / "dataset" diff --git a/tests/data/test_concatenate.py b/tests/data/test_concatenate.py index 1142d5364..e951cc2b1 100644 --- a/tests/data/test_concatenate.py +++ b/tests/data/test_concatenate.py @@ -1,5 +1,4 @@ from fast_llm.data.dataset.gpt.config import GPTConcatenatedDatasetConfig -from tests.common import DATASET_PREFIX, get_test_dataset from tests.data.common import ( compare_indexed_dataset, compare_sampled_dataset, @@ -8,6 +7,7 @@ get_test_data_and_compare_samples, ) from tests.data.test_memmap import MEMMAP_DATASET_LENGTH, 
MEMMAP_DATASET_SAMPLES, MEMMAP_DATASET_TOKENS +from tests.utils.dataset import DATASET_PREFIX, get_test_dataset GPT_CONCATENATED_SAMPLES = [ [4709, 819, 79, 207, 277, 1790], diff --git a/tests/data/test_concatenated_memmap.py b/tests/data/test_concatenated_memmap.py index 09929040d..0ab7c7fe4 100644 --- a/tests/data/test_concatenated_memmap.py +++ b/tests/data/test_concatenated_memmap.py @@ -1,5 +1,4 @@ from fast_llm.data.dataset.gpt.config import GPTConcatenatedMemmapConfig -from tests.common import DATASET_CACHE, get_test_concatenated_memmap_dataset from tests.data.common import ( compare_indexed_dataset, get_dataset_config, @@ -8,6 +7,7 @@ validate_indexed_dataset_sampling, ) from tests.data.test_memmap import MEMMAP_DATASET_SAMPLES +from tests.utils.dataset import DATASET_CACHE, get_test_concatenated_memmap_dataset _DATASET_PREFIX_MIX_CONCATENATED_MEMMAP = DATASET_CACHE / "concatenated_memmap" diff --git a/tests/data/test_dataset_from_file.py b/tests/data/test_dataset_from_file.py index 280b34137..3f7d1a139 100644 --- a/tests/data/test_dataset_from_file.py +++ b/tests/data/test_dataset_from_file.py @@ -1,7 +1,7 @@ from fast_llm.data.dataset.gpt.config import GPTDatasetFromFileConfig -from tests.common import DATASET_PREFIX, get_test_dataset from tests.data.common import compare_indexed_dataset, get_dataset_config from tests.data.test_memmap import MEMMAP_DATASET_LENGTH, MEMMAP_DATASET_SAMPLES, MEMMAP_DATASET_TOKENS +from tests.utils.dataset import DATASET_PREFIX, get_test_dataset def test_dataset_from_file(): diff --git a/tests/data/test_fim.py b/tests/data/test_fim.py index 7b614d2fe..7472f1958 100644 --- a/tests/data/test_fim.py +++ b/tests/data/test_fim.py @@ -1,13 +1,13 @@ from fast_llm.data.config import TokenizerConfig from fast_llm.data.dataset.gpt.config import GPTFimSampledDatasetConfig from fast_llm.data.tokenizer import Tokenizer -from tests.common import DATASET_PREFIX, TOKENIZER_PATH, get_test_dataset from tests.data.common import ( compare_sampled_dataset, get_dataset_config, get_sampling_data, get_test_data_and_compare_samples, ) +from tests.utils.dataset import DATASET_PREFIX, TOKENIZER_PATH, get_test_dataset GPT_FIM_SAMPLES = [ [4709, 819, 79, 207, 277, 1790], diff --git a/tests/data/test_memmap.py b/tests/data/test_memmap.py index be801220b..fcd7756db 100644 --- a/tests/data/test_memmap.py +++ b/tests/data/test_memmap.py @@ -3,8 +3,8 @@ import pytest from fast_llm.data.dataset.gpt.config import GPTMemmapDatasetConfig -from tests.common import DATASET_CACHE, DATASET_PREFIX, DATASET_SAMPLING_CACHE, get_test_dataset from tests.data.common import compare_indexed_dataset, get_dataset_config +from tests.utils.dataset import DATASET_CACHE, DATASET_PREFIX, DATASET_SAMPLING_CACHE, get_test_dataset MEMMAP_DATASET_LENGTH = 6153 MEMMAP_DATASET_TOKENS = 508327 diff --git a/tests/data/test_sampling.py b/tests/data/test_sampling.py index 386795826..32d76fa4c 100644 --- a/tests/data/test_sampling.py +++ b/tests/data/test_sampling.py @@ -7,13 +7,13 @@ from fast_llm.data.dataset.gpt.indexed import GPTIndexedDataset from fast_llm.data.dataset.gpt.sampled import GPTSample from fast_llm.utils import Assert -from tests.common import DATASET_PREFIX, get_test_dataset from tests.data.common import ( get_dataset_config, get_sampling_data, get_test_data_and_compare_samples, validate_indexed_dataset_sampling, ) +from tests.utils.dataset import DATASET_PREFIX, get_test_dataset try: from fast_llm.csrc.data import build_padded_token_cumsum # noqa diff --git a/tests/data/test_slice.py 
b/tests/data/test_slice.py index 299e2054e..f8eedc5bc 100644 --- a/tests/data/test_slice.py +++ b/tests/data/test_slice.py @@ -1,5 +1,4 @@ from fast_llm.data.dataset.gpt.config import GPTDatasetSliceConfig -from tests.common import DATASET_PREFIX, get_test_dataset from tests.data.common import ( compare_indexed_dataset, get_dataset_config, @@ -8,6 +7,7 @@ validate_indexed_dataset_sampling, ) from tests.data.test_memmap import MEMMAP_DATASET_SAMPLES +from tests.utils.dataset import DATASET_PREFIX, get_test_dataset GPT_SLICE_TRAINING_SAMPLES = [ [80, 268, 79, 260, 207, 3086], diff --git a/tests/layers/test_lm_head.py b/tests/layers/test_lm_head.py index 7578a5f05..95da48e7e 100644 --- a/tests/layers/test_lm_head.py +++ b/tests/layers/test_lm_head.py @@ -19,7 +19,7 @@ from fast_llm.models.gpt.config import GPTBaseModelConfig from fast_llm.models.gpt.model import GPTBaseModel from fast_llm.utils import Assert -from tests.common import requires_cuda +from tests.utils.utils import requires_cuda def _lm_head( diff --git a/tests/test_checkpoint.py b/tests/test_checkpoint.py index 216f7828a..e7929440a 100644 --- a/tests/test_checkpoint.py +++ b/tests/test_checkpoint.py @@ -15,34 +15,18 @@ ModelConfigType, ) from fast_llm.engine.multi_stage.config import FastLLMModelConfig, ShardName, StageMode -from fast_llm.models.auto import model_registry from fast_llm.tools.convert import ConvertConfig -from tests.common import ( - CONFIG_COMMON, - FORCE_REUSE_RESULTS, - HUGGINGFACE_CHECKPOINT_FORMAT, - REUSE_RESULTS, - TEST_MODEL, - TEST_MODEL_TYPE, - TEST_RESULTS_PATH, - requires_cuda, -) -from tests.compare_tensor_logs import CompareConfig, compare_logged_tensor - -TEST_MODEL_CONFIG_CLS = model_registry[TEST_MODEL_TYPE] -TEST_MODEL_HF_CLS = TEST_MODEL_CONFIG_CLS.get_huggingface_model_for_causal_lm_class() -TEST_MODEL_CLS = TEST_MODEL_CONFIG_CLS.get_model_class() -TEST_BASE_MODEL_CONFIG_CLS = TEST_MODEL_CONFIG_CLS.get_base_model_config_class() +from tests.utils.compare_tensor_logs import CompareConfig, compare_logged_tensor +from tests.utils.utils import requires_cuda -WEIGHT_SHARD_SAVE_NAME = f"{ShardName.weights}_shard" +_WEIGHT_SHARD_SAVE_NAME = f"{ShardName.weights}_shard" @requires_cuda -def test_checkpoint_and_eval(run_test_script): +def test_checkpoint_and_eval(run_test_script_for_all_models, model_testing_config): # A baseline config (single-gpu, bf16, flash-attn). - run_test_script( - f"test_{TEST_MODEL}_checkpoint_and_eval", - CONFIG_COMMON + run_test_script_for_all_models( + model_testing_config.config_args + [ "training.checkpoint.interval=1", "training.evaluations.validation.interval=2", @@ -72,168 +56,172 @@ def _compare_resume_fn(test_path: pathlib.Path, compare_path: pathlib.Path): @pytest.mark.depends(on=["test_checkpoint_and_eval"]) -def test_resume(run_test_script): +def test_resume(run_test_script_for_all_models): # Resume from iteration=1 and compare outputs with the baseline run. - run_test_script( - f"test_{TEST_MODEL}_resume", - CONFIG_COMMON - + [ + run_test_script_for_all_models( + [ "training.checkpoint.interval=1", "training.evaluations.validation.interval=2", "training.evaluations.validation.iterations=1", ], - compare=f"test_{TEST_MODEL}_checkpoint_and_eval", + compare=f"test_checkpoint_and_eval", prepare_fn=_prepare_resume_fn, compare_fn=_compare_resume_fn, ) @pytest.mark.depends(on=["test_checkpoint_and_eval"]) -def test_resume_frozen(run_test_script): +def test_resume_frozen(run_test_script_for_all_models): # Resume with frozen mlp. No comparison. 
- run_test_script( - f"test_{TEST_MODEL}_resume_frozen", - CONFIG_COMMON - + [ + run_test_script_for_all_models( + "test_resume_frozen", + [ "training.checkpoint.interval=1", "training.evaluations.validation.interval=2", "training.evaluations.validation.iterations=1", "model.base_model.transformer.mlp_lr_scale=0.", ], - compare=f"test_{TEST_MODEL}_checkpoint_and_eval", + compare="test_checkpoint_and_eval", prepare_fn=_prepare_resume_fn, do_compare=False, ) def _run_conversion(config: ConvertConfig): - if config.output.path.is_dir() and not REUSE_RESULTS: + if config.output.path.exists(): + assert config.output.path.is_dir() shutil.rmtree(config.output.path) - if not config.output.path.is_dir(): - if FORCE_REUSE_RESULTS: - raise RuntimeError(config.output.path) - config.run() + config.run() -_CKPT_PATH = TEST_RESULTS_PATH / f"test_{TEST_MODEL}_checkpoint_and_eval" / "checkpoint" / "2" -CONVERT_PATH = TEST_RESULTS_PATH / f"test_{TEST_MODEL}_convert_model" +@pytest.fixture(scope="module") +def convert_paths(run_test_script_base_path): + return { + "checkpoint": run_test_script_base_path / "test_checkpoint_and_eval" / "checkpoint" / "2", + "distributed_0": run_test_script_base_path / "test_convert_model" / "distributed_0", + "distributed_1": run_test_script_base_path / "test_convert_model" / "distributed_1", + "fast_llm_0": run_test_script_base_path / "test_convert_model" / "fast_llm_0", + "fast_llm_1": run_test_script_base_path / "test_convert_model" / "fast_llm_1", + "huggingface_0": run_test_script_base_path / "test_convert_model" / "huggingface_0", + "huggingface_1": run_test_script_base_path / "test_convert_model" / "huggingface_1", + } @pytest.mark.depends(on=["test_checkpoint_and_eval"]) -def test_convert_distributed_to_fast_llm(): +def test_convert_distributed_to_fast_llm(model_testing_config, convert_paths): _run_conversion( ConvertConfig( input=CheckpointLoadConfig( - path=_CKPT_PATH, + path=convert_paths["checkpoint"], format=DistributedCheckpointFormat, ), output=CheckpointSaveConfig( - path=CONVERT_PATH / "fast_llm_0", + path=convert_paths["fast_llm_0"], format=FastLLMCheckpointFormat, ), - model=TEST_MODEL_CONFIG_CLS, + model=model_testing_config.model_config_class, ) ) @pytest.mark.depends(on=["test_convert_distributed_to_fast_llm"]) -def test_convert_fast_llm_to_huggingface(): - if HUGGINGFACE_CHECKPOINT_FORMAT is None: - pytest.skip(f"Conversion not supported for {TEST_MODEL}") +def test_convert_fast_llm_to_huggingface(model_testing_config, convert_paths): + if model_testing_config.checkpoint_format is None: + pytest.skip(f"Conversion not supported for {model_testing_config.name}") _run_conversion( ConvertConfig( input=CheckpointLoadConfig( - path=CONVERT_PATH / "fast_llm_0", + path=convert_paths["fast_llm_0"], format=FastLLMCheckpointFormat, ), output=CheckpointSaveConfig( - path=CONVERT_PATH / "huggingface_0", - format=HUGGINGFACE_CHECKPOINT_FORMAT, + path=convert_paths["huggingface_0"], + format=model_testing_config.checkpoint_format, ), - model=TEST_MODEL_CONFIG_CLS, + model=model_testing_config.model_config_class, ) ) @pytest.mark.depends(on=["test_convert_fast_llm_to_huggingface"]) -def test_convert_huggingface_to_distributed(): +def test_convert_huggingface_to_distributed(model_testing_config, convert_paths): _run_conversion( ConvertConfig( input=CheckpointLoadConfig( - path=CONVERT_PATH / "huggingface_0", - format=HUGGINGFACE_CHECKPOINT_FORMAT, + path=convert_paths["huggingface_0"], + format=model_testing_config.checkpoint_format, ), output=CheckpointSaveConfig( - 
path=CONVERT_PATH / "distributed_0", + path=convert_paths["distributed_0"], format=DistributedCheckpointFormat, ), - model=TEST_MODEL_CONFIG_CLS, + model=model_testing_config.model_config_class, ) ) @pytest.mark.depends(on=["test_checkpoint_and_eval"]) -def test_convert_distributed_to_huggingface(): - if HUGGINGFACE_CHECKPOINT_FORMAT is None: - pytest.skip(f"Conversion not supported for {TEST_MODEL}") +def test_convert_distributed_to_huggingface(model_testing_config, convert_paths): + if model_testing_config.checkpoint_format is None: + pytest.skip(f"Conversion not supported for {model_testing_config.name}") _run_conversion( ConvertConfig( input=CheckpointLoadConfig( - path=_CKPT_PATH, + path=convert_paths["checkpoint"], format=DistributedCheckpointFormat, ), output=CheckpointSaveConfig( - path=CONVERT_PATH / "huggingface_1", - format=HUGGINGFACE_CHECKPOINT_FORMAT, + path=convert_paths["huggingface_1"], + format=model_testing_config.checkpoint_format, ), - model=TEST_MODEL_CONFIG_CLS, + model=model_testing_config.model_config_class, ) ) @pytest.mark.depends(on=["test_convert_distributed_to_huggingface"]) -def test_convert_huggingface_to_fast_llm(): +def test_convert_huggingface_to_fast_llm(model_testing_config, convert_paths): _run_conversion( ConvertConfig( input=CheckpointLoadConfig( - path=CONVERT_PATH / "huggingface_1", - format=HUGGINGFACE_CHECKPOINT_FORMAT, + path=convert_paths["huggingface_1"], + format=model_testing_config.checkpoint_format, ), output=CheckpointSaveConfig( - path=CONVERT_PATH / "fast_llm_1", + path=convert_paths["fast_llm_1"], format=FastLLMCheckpointFormat, ), - model=TEST_MODEL_CONFIG_CLS, + model=model_testing_config.model_config_class, ) ) @pytest.mark.depends(on=["test_convert_huggingface_to_fast_llm"]) -def test_convert_fast_llm_to_distributed(): +def test_convert_fast_llm_to_distributed(model_testing_config, convert_paths): _run_conversion( ConvertConfig( input=CheckpointLoadConfig( - path=CONVERT_PATH / "fast_llm_1", + path=convert_paths["fast_llm_1"], format=FastLLMCheckpointFormat, ), output=CheckpointSaveConfig( - path=CONVERT_PATH / "distributed_1", + path=convert_paths["distributed_1"], format=DistributedCheckpointFormat, ), - model=TEST_MODEL_CONFIG_CLS, + model=model_testing_config.model_config_class, ) ) @pytest.mark.depends(on=["test_convert_huggingface_to_distributed", "test_convert_fast_llm_to_distributed"]) -def test_converted_distributed(): +def test_converted_distributed(convert_paths): # Compare the fast llm weights # TODO: Compare configs - w = safetensors.torch.load_file(_CKPT_PATH / "rank_0.safetensors") - w0 = safetensors.torch.load_file(CONVERT_PATH / "distributed_0" / "rank_0.safetensors") - w1 = safetensors.torch.load_file(CONVERT_PATH / "distributed_1" / "rank_0.safetensors") - assert w.keys() >= {WEIGHT_SHARD_SAVE_NAME} - assert w0.keys() == w1.keys() == {WEIGHT_SHARD_SAVE_NAME} + w = safetensors.torch.load_file(convert_paths["checkpoint"] / "rank_0.safetensors") + w0 = safetensors.torch.load_file(convert_paths["distributed_0"] / "rank_0.safetensors") + w1 = safetensors.torch.load_file(convert_paths["distributed_1"] / "rank_0.safetensors") + assert w.keys() >= {_WEIGHT_SHARD_SAVE_NAME} + assert w0.keys() == w1.keys() == {_WEIGHT_SHARD_SAVE_NAME} for key in w0: assert w[key].shape == w0[key].shape, (key, w[key].shape, w0[key].shape) assert (w[key] == w0[key]).all(), (w[key], w0[key]) @@ -242,9 +230,9 @@ def test_converted_distributed(): @pytest.mark.depends(on=["test_convert_distributed_to_fast_llm", 
"test_convert_huggingface_to_fast_llm"]) -def test_converted_fast_llm(): - s0 = safetensors.torch.load_file(CONVERT_PATH / "fast_llm_0" / "model_0.safetensors") - s1 = safetensors.torch.load_file(CONVERT_PATH / "fast_llm_1" / "model_0.safetensors") +def test_converted_fast_llm(convert_paths): + s0 = safetensors.torch.load_file(convert_paths["fast_llm_0"] / "model_0.safetensors") + s1 = safetensors.torch.load_file(convert_paths["fast_llm_1"] / "model_0.safetensors") assert s0.keys() == s1.keys() for key in s0: assert s0[key].shape == s1[key].shape, (key, s0[key].shape, s1[key].shape) @@ -252,9 +240,9 @@ def test_converted_fast_llm(): @pytest.mark.depends(on=["test_convert_fast_llm_to_huggingface", "test_convert_distributed_to_huggingface"]) -def test_converted_huggingface(): - h0 = safetensors.torch.load_file(CONVERT_PATH / "huggingface_0" / "model_0.safetensors") - h1 = safetensors.torch.load_file(CONVERT_PATH / "huggingface_1" / "model_0.safetensors") +def test_converted_huggingface(convert_paths): + h0 = safetensors.torch.load_file(convert_paths["huggingface_0"] / "model_0.safetensors") + h1 = safetensors.torch.load_file(convert_paths["huggingface_1"] / "model_0.safetensors") assert h0.keys() == h1.keys() for key in h0: assert h0[key].shape == h1[key].shape, (key, h0[key].shape, h1[key].shape) @@ -270,45 +258,45 @@ def _compare_architectures(config_ref: FastLLMModelConfig, config_test: FastLLMM @pytest.mark.depends(on=["test_converted_distributed"]) -def test_load_pretrained_distributed_checkpoint(): - config = TEST_MODEL_CONFIG_CLS.from_dict( - yaml.safe_load((_CKPT_PATH / ".." / ".." / "config.yaml").open("r"))["model"], strict=False +def test_load_pretrained_distributed_checkpoint(model_testing_config, convert_paths): + config = model_testing_config.model_config_class.from_dict( + yaml.safe_load((convert_paths["checkpoint"] / ".." / ".." 
/ "config.yaml").open("r"))["model"], strict=False ) pretrained_config_ref = CheckpointLoadConfig( - path=_CKPT_PATH, + path=convert_paths["checkpoint"], format=DistributedCheckpointFormat, optimizer_state=True, load_config=ModelConfigType.model, ) - model = TEST_MODEL_CLS.from_pretrained(pretrained_config_ref) + model = model_testing_config.model_class.from_pretrained(pretrained_config_ref) _compare_model_configs(config, model.config) state_shards = safetensors.torch.load_file( - _CKPT_PATH / "rank_0.safetensors", device=str(model._distributed.device) + convert_paths["checkpoint"] / "rank_0.safetensors", device=str(model._distributed.device) ) for shard_name in model.state_shard_names: assert (state_shards[f"{shard_name}_shard"] == model.get_shard(shard_name)).all() @pytest.mark.depends(on=["test_load_pretrained_distributed_checkpoint"]) -def test_load_converted_distributed_checkpoint(): - config_ref = TEST_MODEL_CONFIG_CLS.from_pretrained( +def test_load_converted_distributed_checkpoint(model_testing_config, convert_paths): + config_ref = model_testing_config.model_config_class.from_pretrained( CheckpointLoadConfig( - path=_CKPT_PATH, + path=convert_paths["checkpoint"], format=DistributedCheckpointFormat, load_config=ModelConfigType.model, ) ) - model = TEST_MODEL_CLS.from_pretrained( + model = model_testing_config.model_class.from_pretrained( CheckpointLoadConfig( - path=CONVERT_PATH / "distributed_0", + path=convert_paths["distributed_0"], format=DistributedCheckpointFormat, load_config=ModelConfigType.model, ) ) - config_alt = TEST_MODEL_CONFIG_CLS.from_pretrained( + config_alt = model_testing_config.model_config_class.from_pretrained( CheckpointLoadConfig( - path=CONVERT_PATH / "distributed_1", + path=convert_paths["distributed_1"], format=DistributedCheckpointFormat, load_config=ModelConfigType.model, ) @@ -316,30 +304,30 @@ def test_load_converted_distributed_checkpoint(): _compare_architectures(config_ref, model.config) _compare_model_configs(model.config, config_alt) weight_shard = safetensors.torch.load_file( - _CKPT_PATH / "rank_0.safetensors", device=str(model._distributed.device) - )[WEIGHT_SHARD_SAVE_NAME] + convert_paths["checkpoint"] / "rank_0.safetensors", device=str(model._distributed.device) + )[_WEIGHT_SHARD_SAVE_NAME] assert (weight_shard == model.get_shard(ShardName.weights)).all() @pytest.mark.depends(on=["test_converted_fast_llm", "test_load_pretrained_distributed_checkpoint"]) -def test_load_converted_fast_llm_checkpoint(): - config_ref = TEST_MODEL_CONFIG_CLS.from_pretrained( +def test_load_converted_fast_llm_checkpoint(model_testing_config, convert_paths): + config_ref = model_testing_config.model_config_class.from_pretrained( CheckpointLoadConfig( - path=_CKPT_PATH, + path=convert_paths["checkpoint"], format=DistributedCheckpointFormat, load_config=ModelConfigType.model, ) ) - model = TEST_MODEL_CLS.from_pretrained( + model = model_testing_config.model_class.from_pretrained( CheckpointLoadConfig( - path=CONVERT_PATH / "fast_llm_0", + path=convert_paths["fast_llm_0"], format=FastLLMCheckpointFormat, load_config=ModelConfigType.model, ) ) - config_alt = TEST_MODEL_CONFIG_CLS.from_pretrained( + config_alt = model_testing_config.model_config_class.from_pretrained( CheckpointLoadConfig( - path=CONVERT_PATH / "fast_llm_1", + path=convert_paths["fast_llm_1"], format=FastLLMCheckpointFormat, load_config=ModelConfigType.model, ) @@ -347,48 +335,48 @@ def test_load_converted_fast_llm_checkpoint(): _compare_architectures(config_ref, model.config) 
_compare_architectures(config_ref, config_alt) weight_shard = safetensors.torch.load_file( - _CKPT_PATH / "rank_0.safetensors", device=str(model._distributed.device) - )[WEIGHT_SHARD_SAVE_NAME] + convert_paths["checkpoint"] / "rank_0.safetensors", device=str(model._distributed.device) + )[_WEIGHT_SHARD_SAVE_NAME] assert (weight_shard == model.get_shard(ShardName.weights)).all() @pytest.mark.depends(on=["test_converted_fast_llm", "test_load_pretrained_distributed_checkpoint"]) -def test_load_converted_huggingface_checkpoint(): - config_ref = TEST_MODEL_CONFIG_CLS.from_pretrained( +def test_load_converted_huggingface_checkpoint(model_testing_config, convert_paths): + config_ref = model_testing_config.model_config_class.from_pretrained( CheckpointLoadConfig( - path=_CKPT_PATH, + path=convert_paths["checkpoint"], format=DistributedCheckpointFormat, load_config=ModelConfigType.model, ) ) - model = TEST_MODEL_CLS.from_pretrained( + model = model_testing_config.model_class.from_pretrained( CheckpointLoadConfig( - path=CONVERT_PATH / "huggingface_1", - format=HUGGINGFACE_CHECKPOINT_FORMAT, + path=convert_paths["huggingface_1"], + format=model_testing_config.checkpoint_format, load_config=ModelConfigType.model, ), mode=StageMode.weights, ) - config_alt = TEST_MODEL_CONFIG_CLS.from_pretrained( + config_alt = model_testing_config.model_config_class.from_pretrained( CheckpointLoadConfig( - path=CONVERT_PATH / "huggingface_0", - format=HUGGINGFACE_CHECKPOINT_FORMAT, + path=convert_paths["huggingface_0"], + format=model_testing_config.checkpoint_format, load_config=ModelConfigType.model, ) ) _compare_architectures(config_ref, model.config) _compare_model_configs(model.config, config_alt) weight_shard = safetensors.torch.load_file( - _CKPT_PATH / "rank_0.safetensors", device=str(model._distributed.device) - )[WEIGHT_SHARD_SAVE_NAME] + convert_paths["checkpoint"] / "rank_0.safetensors", device=str(model._distributed.device) + )[_WEIGHT_SHARD_SAVE_NAME] assert (weight_shard == model.get_shard(ShardName.weights)).all() @pytest.mark.depends(on=["test_load_converted_fast_llm_checkpoint", "test_load_converted_huggingface_checkpoint"]) -def test_run_converted_model(): - model_ref = TEST_MODEL_HF_CLS.from_pretrained( +def test_run_converted_model(model_testing_config, convert_paths): + model_ref = model_testing_config.huggingface_model_for_causal_lm_class.from_pretrained( CheckpointLoadConfig( - path=_CKPT_PATH, + path=convert_paths["checkpoint"], format=DistributedCheckpointFormat, load_config=ModelConfigType.model, ) @@ -397,18 +385,20 @@ def test_run_converted_model(): 0, model_ref.config.fast_llm_config.base_model.vocab_size, size=(4, 100), dtype=torch.int64, device="cuda" ) output_ref = model_ref(test_input) - model_from_fast_llm = TEST_MODEL_HF_CLS.from_pretrained(CONVERT_PATH / "fast_llm_0") - model_from_hf = TEST_MODEL_HF_CLS.from_pretrained( + model_from_fast_llm = model_testing_config.huggingface_model_for_causal_lm_class.from_pretrained( + convert_paths["fast_llm_0"] + ) + model_from_hf = model_testing_config.huggingface_model_for_causal_lm_class.from_pretrained( CheckpointLoadConfig( - path=CONVERT_PATH / "huggingface_0", - format=HUGGINGFACE_CHECKPOINT_FORMAT, + path=convert_paths["huggingface_0"], + format=model_testing_config.checkpoint_format, load_config=ModelConfigType.model, ) ) errors = [] compare = CompareConfig() model_as_hf = transformers.AutoModelForCausalLM.from_pretrained( - CONVERT_PATH / "huggingface_0", trust_remote_code=HUGGINGFACE_CHECKPOINT_FORMAT.trust_remote_code + 
convert_paths["huggingface_0"], trust_remote_code=model_testing_config.checkpoint_format.trust_remote_code ).cuda() for name, model in zip( ("From state dict", "From Huggingface", "Native Huggingface"), @@ -434,14 +424,13 @@ def test_run_converted_model(): @pytest.mark.slow @pytest.mark.depends(on=["test_load_converted_distributed_checkpoint"]) -def test_load_pretrained_distributed_in_dp2(run_test_script): - run_test_script( - f"test_{TEST_MODEL}_load_pretrained_distributed_in_dp2", - CONFIG_COMMON - + [ +def test_load_pretrained_distributed_in_dp2(run_test_script_for_all_models, convert_paths): + run_test_script_for_all_models( + "test_load_pretrained_distributed_in_dp2", + [ "training.checkpoint.interval=1", "training.train_iters=1", - f"pretrained.path={CONVERT_PATH / 'distributed_0'}", + f"pretrained.path={convert_paths["distributed_0"]}", f"pretrained.format={DistributedCheckpointFormat.name}", "schedule.skip_step=True", ], @@ -450,14 +439,13 @@ def test_load_pretrained_distributed_in_dp2(run_test_script): @pytest.mark.depends(on=["test_load_converted_distributed_checkpoint"]) -def test_load_pretrained_distributed_with_config(run_test_script): - run_test_script( - f"test_{TEST_MODEL}_load_pretrained_distributed_with_config", - CONFIG_COMMON - + [ +def test_load_pretrained_distributed_with_config(run_test_script_for_all_models, convert_paths): + run_test_script_for_all_models( + "test_load_pretrained_distributed_with_config", + [ "training.checkpoint.interval=1", "training.train_iters=1", - f"pretrained.path={CONVERT_PATH / 'distributed_0'}", + f"pretrained.path={convert_paths["distributed_0"]}", f"pretrained.format={DistributedCheckpointFormat.name}", "schedule.skip_step=True", ], @@ -465,10 +453,10 @@ def test_load_pretrained_distributed_with_config(run_test_script): @pytest.mark.depends(on=["test_load_pretrained_distributed_in_dp2"]) -def test_load_pretrained_in_dp2_match_checkpoint(): - test_ckpt_path = TEST_RESULTS_PATH / f"test_{TEST_MODEL}_load_pretrained_distributed_in_dp2" / "checkpoint" / "1" +def test_load_pretrained_in_dp2_match_checkpoint(model_testing_config, convert_paths, run_test_script_base_path): + test_ckpt_path = run_test_script_base_path / "test_load_pretrained_distributed_in_dp2" / "checkpoint" / "1" pretrained_config_ref = CheckpointLoadConfig( - path=_CKPT_PATH, + path=convert_paths["checkpoint"], format=DistributedCheckpointFormat, load_config=ModelConfigType.fast_llm, ) @@ -477,21 +465,21 @@ def test_load_pretrained_in_dp2_match_checkpoint(): format=DistributedCheckpointFormat, load_config=ModelConfigType.fast_llm, ) - config_ref = TEST_MODEL_CONFIG_CLS.from_pretrained(pretrained_config_ref) - config_test = TEST_MODEL_CONFIG_CLS.from_pretrained(pretrained_config_test) + config_ref = model_testing_config.model_config_class.from_pretrained(pretrained_config_ref) + config_test = model_testing_config.model_config_class.from_pretrained(pretrained_config_test) _compare_model_configs(config_ref, config_test) - shards_ref = safetensors.torch.load_file(_CKPT_PATH / "rank_0.safetensors") + shards_ref = safetensors.torch.load_file(convert_paths["checkpoint"] / "rank_0.safetensors") shards_test = [safetensors.torch.load_file(test_ckpt_path / f"rank_{i}.safetensors") for i in range(2)] - ref_model = TEST_MODEL_CLS(config_ref) - test_model = TEST_MODEL_CLS(config_test) + ref_model = model_testing_config.model_class(config_ref) + test_model = model_testing_config.model_class(config_test) - weight_shard_ref_split = 
shards_ref[WEIGHT_SHARD_SAVE_NAME].split(ref_model._stage_weight_shard_sizes) + weight_shard_ref_split = shards_ref[_WEIGHT_SHARD_SAVE_NAME].split(ref_model._stage_weight_shard_sizes) weight_shards_test_split = [ - shard_test[WEIGHT_SHARD_SAVE_NAME].split(test_model._stage_weight_shard_sizes) for shard_test in shards_test + shard_test[_WEIGHT_SHARD_SAVE_NAME].split(test_model._stage_weight_shard_sizes) for shard_test in shards_test ] for shard_test in shards_test: for shard_name, shard in shard_test.items(): - if shard_name != WEIGHT_SHARD_SAVE_NAME: + if shard_name != _WEIGHT_SHARD_SAVE_NAME: assert (shard == 0).all() # noqa assert len(ref_model._stage_weight_shard_sizes) == len(test_model._stage_weight_shard_sizes) @@ -510,37 +498,36 @@ def test_load_pretrained_in_dp2_match_checkpoint(): @pytest.mark.slow @pytest.mark.depends(on=["test_load_pretrained_in_dp2_match_checkpoint"]) -def test_load_distributed_checkpoint_dp2(): +def test_load_distributed_checkpoint_dp2(model_testing_config, convert_paths, run_test_script_base_path): # This also tests conversion which uses `FastLLMModel.from_checkpoint` pretrained_config_ref = CheckpointLoadConfig( - path=_CKPT_PATH, + path=convert_paths["checkpoint"], format=DistributedCheckpointFormat, load_config=ModelConfigType.fast_llm, ) pretrained_config_test = CheckpointLoadConfig( - path=TEST_RESULTS_PATH / f"test_{TEST_MODEL}_load_pretrained_distributed_in_dp2" / "checkpoint" / "1", + path=run_test_script_base_path / "test_load_pretrained_distributed_in_dp2" / "checkpoint" / "1", format=DistributedCheckpointFormat, load_config=ModelConfigType.model, ) - config = TEST_MODEL_CONFIG_CLS.from_pretrained(pretrained_config_ref) - model = TEST_MODEL_CLS.from_pretrained(pretrained_config_test, mode=StageMode.weights) + config = model_testing_config.model_config_class.from_pretrained(pretrained_config_ref) + model = model_testing_config.model_class.from_pretrained(pretrained_config_test, mode=StageMode.weights) _compare_model_configs(config, model.config) weight_shard = safetensors.torch.load_file( - _CKPT_PATH / "rank_0.safetensors", device=str(model._distributed.device) - )[WEIGHT_SHARD_SAVE_NAME] + convert_paths["checkpoint"] / "rank_0.safetensors", device=str(model._distributed.device) + )[_WEIGHT_SHARD_SAVE_NAME] assert (weight_shard == model.get_shard(ShardName.weights)).all() @pytest.mark.slow @pytest.mark.depends(on=["test_load_converted_fast_llm_checkpoint", "test_load_pretrained_in_dp2_match_checkpoint"]) -def test_load_pretrained_fast_llm_in_dp2(run_test_script): +def test_load_pretrained_fast_llm_in_dp2(run_test_script, convert_paths, run_test_script_base_path): run_test_script( - f"test_{TEST_MODEL}_load_pretrained_fast_llm_in_dp2", - CONFIG_COMMON - + [ + "test_load_pretrained_fast_llm_in_dp2", + [ "training.checkpoint.interval=1", "training.train_iters=1", - f"pretrained.path={CONVERT_PATH / 'fast_llm_0'}", + f"pretrained.path={convert_paths["fast_llm_0"]}", f"pretrained.format=fast_llm", "schedule.skip_step=True", ], @@ -548,15 +535,15 @@ def test_load_pretrained_fast_llm_in_dp2(run_test_script): ) for rank in range(2): ref_shard = safetensors.torch.load_file( - TEST_RESULTS_PATH - / f"test_{TEST_MODEL}_load_pretrained_distributed_in_dp2" + run_test_script_base_path + / f"test_load_pretrained_distributed_in_dp2" / "checkpoint" / "1" / f"rank_{rank}.safetensors" ) test_shard = safetensors.torch.load_file( - TEST_RESULTS_PATH - / f"test_{TEST_MODEL}_load_pretrained_fast_llm_in_dp2" + run_test_script_base_path + / 
f"test_load_pretrained_fast_llm_in_dp2" / "checkpoint" / "1" / f"rank_{rank}.safetensors" @@ -567,30 +554,31 @@ def test_load_pretrained_fast_llm_in_dp2(run_test_script): @pytest.mark.slow @pytest.mark.depends(on=["test_load_converted_huggingface_checkpoint", "test_load_pretrained_in_dp2_match_checkpoint"]) -def test_load_pretrained_huggingface_in_dp2(run_test_script): - run_test_script( - f"test_{TEST_MODEL}_load_pretrained_huggingface_in_dp2", - CONFIG_COMMON - + [ +def test_load_pretrained_huggingface_in_dp2( + run_test_script_for_all_models, model_testing_config, run_test_script_base_path, convert_paths +): + run_test_script_for_all_models( + "test_load_pretrained_huggingface_in_dp2", + [ "training.checkpoint.interval=1", "training.train_iters=1", - f"pretrained.path={CONVERT_PATH / 'huggingface_0'}", - f"pretrained.format={HUGGINGFACE_CHECKPOINT_FORMAT.name}", + f"pretrained.path={convert_paths["huggingface_0"]}", + f"pretrained.format={model_testing_config.checkpoint_format.name}", "schedule.skip_step=True", ], num_gpus=2, ) for rank in range(2): ref_shard = safetensors.torch.load_file( - TEST_RESULTS_PATH - / f"test_{TEST_MODEL}_load_pretrained_distributed_in_dp2" + run_test_script_base_path + / f"test_load_pretrained_distributed_in_dp2" / "checkpoint" / "1" / f"rank_{rank}.safetensors" ) test_shard = safetensors.torch.load_file( - TEST_RESULTS_PATH - / f"test_{TEST_MODEL}_load_pretrained_huggingface_in_dp2" + run_test_script_base_path + / f"test_load_pretrained_huggingface_in_dp2" / "checkpoint" / "1" / f"rank_{rank}.safetensors" diff --git a/tests/test_config.py b/tests/test_config.py index 80bed418c..98a4c07c6 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -14,7 +14,6 @@ from fast_llm.models.auto import trainer_registry from fast_llm.models.gpt.config import GPTModelConfig, PretrainedGPTModelConfig from fast_llm.utils import Assert, check_equal_nested -from tests.common import TEST_RESULTS_PATH def run_without_import(cmd: str): @@ -101,8 +100,8 @@ def test_serialize_default_config_updates(cls, default): @pytest.mark.parametrize("load_config", tuple(ModelConfigType)) -def test_pretrained_config(load_config: ModelConfigType): - config_path = TEST_RESULTS_PATH / "pretrained_config" +def test_pretrained_config(load_config: ModelConfigType, result_path): + config_path = result_path / "pretrained_config" pretrained_model_config = GPTModelConfig.from_dict( { "base_model": { diff --git a/tests/test_functional.py b/tests/test_functional.py index 908a55374..03a0ae8a0 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -8,7 +8,7 @@ from fast_llm.functional.triton.mlp import mlp_autograd, mlp_autograd_looped, torch_mlp_activation from fast_llm.functional.triton.sparse_copy import get_sparse_map from fast_llm.utils import Assert -from tests.common import requires_cuda +from tests.utils.utils import requires_cuda def ref_log_probs_from_logits(logits: torch.Tensor, labels: torch.Tensor, temperature: float = 1.0) -> torch.Tensor: diff --git a/tests/test_gpt_generate_and_forward.py b/tests/test_gpt_generate_and_forward.py index a16d4c716..ca75cf3e8 100644 --- a/tests/test_gpt_generate_and_forward.py +++ b/tests/test_gpt_generate_and_forward.py @@ -9,13 +9,7 @@ from fast_llm.engine.schedule.runner import ScheduleRunner from fast_llm.models.gpt.config import LlamaGPTHuggingfaceCheckpointFormat, PretrainedGPTModelConfig from fast_llm.models.gpt.huggingface import HuggingfaceGPTModelForCausalLM -from tests.common import CONFIG_COMMON, 
HUGGINGFACE_CHECKPOINT_FORMAT, TEST_MODEL, TEST_RESULTS_PATH, requires_cuda - - -def _prepare_checkpoint(model: str) -> str: - path = TEST_RESULTS_PATH.resolve() / "generate/model" - model_path = huggingface_hub.snapshot_download(repo_id=model, local_dir=path) - return model_path +from tests.utils.utils import requires_cuda def _prepare_data(tokenizer, use_batch_size2: bool): @@ -179,12 +173,11 @@ def _test_for_batches( @pytest.fixture(scope="module") -def model_and_tokenizer(): - model = "HuggingFaceTB/SmolLM2-135M-Instruct" - fast_llm_checkpoint_format = LlamaGPTHuggingfaceCheckpointFormat - model_path = _prepare_checkpoint(model) - tokenizer = AutoTokenizer.from_pretrained(model_path) - return model_path, tokenizer, fast_llm_checkpoint_format +def model_path(result_path): + return huggingface_hub.snapshot_download( + repo_id="HuggingFaceTB/SmolLM2-135M-Instruct", + local_dir=result_path / "generate/model", + ) def _test_generate( @@ -224,35 +217,33 @@ def _test_generate( ], ) def test_generate( - model_and_tokenizer, + model_path, use_flash_attention, use_bf16, max_new_tokens, min_matching_tokens_batch_size_1, min_matching_tokens_batch_size_2, ): - model_path, tokenizer, fast_llm_checkpoint_format = model_and_tokenizer _test_generate( model_path, - fast_llm_checkpoint_format, + LlamaGPTHuggingfaceCheckpointFormat, use_flash_attention, use_bf16, max_new_tokens, min_matching_tokens_batch_size_1, min_matching_tokens_batch_size_2, - tokenizer=tokenizer, + tokenizer=AutoTokenizer.from_pretrained(model_path), ) +@pytest.mark.slow @requires_cuda -def test_export_for_generate(run_test_script): +def test_export_for_generate(run_test_script_for_all_models, model_testing_config): # Not really testing, anything, but handles dependencies more easily than a fixture. 
- run_test_script( - f"test_{TEST_MODEL}_export_for_generate", - CONFIG_COMMON - + [ + run_test_script_for_all_models( + [ "training.train_iters=1", - f"training.export.format={HUGGINGFACE_CHECKPOINT_FORMAT.name}", + f"training.export.format={model_testing_config.checkpoint_format.name}", "training.export.interval=1", ], ) @@ -273,6 +264,8 @@ def test_export_for_generate(run_test_script): ], ) def test_small_generate( + model_testing_config, + run_test_script_base_path, use_flash_attention, use_bf16, max_new_tokens, @@ -280,8 +273,8 @@ def test_small_generate( min_matching_tokens_batch_size_2, ): _test_generate( - TEST_RESULTS_PATH / f"test_{TEST_MODEL}_export_for_generate/export/{HUGGINGFACE_CHECKPOINT_FORMAT.name}/1", - HUGGINGFACE_CHECKPOINT_FORMAT, + run_test_script_base_path / f"test_export_for_generate/export/{model_testing_config.checkpoint_format.name}/1", + model_testing_config.checkpoint_format, use_flash_attention, use_bf16, max_new_tokens, @@ -312,20 +305,21 @@ def _test_generate_from_model(model_path, tokenizer, fast_llm_checkpoint_format) @requires_cuda @pytest.mark.extra_slow def test_generate_from_model( - model_and_tokenizer, + model_path, ): - model_path, tokenizer, fast_llm_checkpoint_format = model_and_tokenizer - _test_generate_from_model(model_path, tokenizer, fast_llm_checkpoint_format) + _test_generate_from_model( + model_path, AutoTokenizer.from_pretrained(model_path), LlamaGPTHuggingfaceCheckpointFormat + ) @requires_cuda @pytest.mark.slow @pytest.mark.depends(on=["test_export_for_generate"]) -def test_small_generate_from_model(): +def test_small_generate_from_model(model_testing_config, run_test_script_base_path): _test_generate_from_model( - TEST_RESULTS_PATH / f"test_{TEST_MODEL}_export_for_generate/export/{HUGGINGFACE_CHECKPOINT_FORMAT.name}/1", + run_test_script_base_path / f"test_export_for_generate/export/{model_testing_config.checkpoint_format.name}/1", None, - HUGGINGFACE_CHECKPOINT_FORMAT, + model_testing_config.checkpoint_format, ) @@ -361,16 +355,17 @@ def _test_forward_return_hidden_states( @pytest.mark.extra_slow @requires_cuda -def test_forward_return_hidden_states(model_and_tokenizer): - model_path, tokenizer, fast_llm_checkpoint_format = model_and_tokenizer - _test_forward_return_hidden_states(model_path, fast_llm_checkpoint_format, tokenizer.vocab_size) +def test_forward_return_hidden_states(model_path): + _test_forward_return_hidden_states( + model_path, LlamaGPTHuggingfaceCheckpointFormat, AutoTokenizer.from_pretrained(model_path).vocab_size + ) @pytest.mark.slow @requires_cuda @pytest.mark.depends(on=["test_export_for_generate"]) -def test_small_forward_return_hidden_states(): +def test_small_forward_return_hidden_states(model_testing_config, run_test_script_base_path): _test_forward_return_hidden_states( - TEST_RESULTS_PATH / f"test_{TEST_MODEL}_export_for_generate/export/{HUGGINGFACE_CHECKPOINT_FORMAT.name}/1", - HUGGINGFACE_CHECKPOINT_FORMAT, + run_test_script_base_path / f"test_export_for_generate/export/{model_testing_config.checkpoint_format.name}/1", + model_testing_config.checkpoint_format, ) diff --git a/tests/test_match_megatron.py b/tests/test_match_megatron.py index 1857f0f8f..a77906ae2 100644 --- a/tests/test_match_megatron.py +++ b/tests/test_match_megatron.py @@ -1,158 +1,32 @@ import pytest -from tests.common import ( - CONFIG_GPT2_FAST_LLM, - CONFIG_GPT2_MEGATRON, - CONFIG_LLAMA_FAST_LLM, - CONFIG_LLAMA_MEGATRON, - CONFIG_MIXTRAL_FAST_LLM, - CONFIG_MIXTRAL_MEGATRON, - CONFIG_SC1_FAST_LLM, - CONFIG_SC1_MEGATRON, - 
CONFIG_SC2_FAST_LLM, - CONFIG_SC2_MEGATRON, - DATASET_PREFIX, -) -from tests.compare_tensor_logs import CompareConfig +from tests.utils.compare_tensor_logs import CompareConfig +from tests.utils.dataset import DATASET_PREFIX @pytest.mark.slow -@pytest.mark.skip(reason="Skipping mostly redundant test") -def test_sc1_meg(run_test_script): - # Starcoder 1 (GPT2 with MQA) with Megatron. - run_test_script("test_sc1_meg", CONFIG_SC1_MEGATRON + ["--micro-batch-size=8"], is_megatron=True) - - -CONFIG_MATCH_MEGATRON = [ - "data.datasets={}", - f"data.path={DATASET_PREFIX}", -] - - -@pytest.mark.depends(on=["test_sc1_meg"]) -def test_sc1_match_meg(run_test_script): - # Starcoder 1 (GPT2 with MQA) with Fast-llm. - # QKV tensors are in a different format. - run_test_script( - "test_sc1_match_meg", - CONFIG_SC1_FAST_LLM + CONFIG_MATCH_MEGATRON + ["model.base_model.use_megatron_initialization=True"], - compare="test_sc1_meg", - config=CompareConfig( - ignore_tensors=[ - ".self_attn.query_key_value.", - ".self_attn.query.", - ".self_attn.key_value.", - ".mlp.layer_2.weight", - ] - ), - ) - - -@pytest.mark.slow -@pytest.mark.skip(reason="Skipping mostly redundant test") -@pytest.mark.depends(on=["test_sc1_match_meg"]) -def test_sc2_meg(run_test_script): - # Starcoder 2 (GPT2 with MQA and RoPE) with Megatron. - run_test_script("test_sc2_meg", CONFIG_SC2_MEGATRON + ["--micro-batch-size=8"], is_megatron=True) - - -@pytest.mark.depends(on=["test_sc2_meg"]) -def test_sc2_match_meg(run_test_script): - # Starcoder 2 (GPT2 with MQA and RoPE) with Fast-llm. - # QKV tensors are in a different format, - # dense not matching because of the way initialization is corrected for RoPE format. - run_test_script( - "test_sc2_match_meg", - CONFIG_SC2_FAST_LLM + CONFIG_MATCH_MEGATRON + ["model.base_model.use_megatron_initialization=True"], - compare="test_sc2_meg", - config=CompareConfig( - ignore_tensors=[ - ".self_attn.query_key_value.", - ".self_attn.query.", - ".self_attn.key_value.", - ".self_attn.dense.", - ".mlp.layer_2.weight", - ] - ), - ) - - -@pytest.mark.slow -def test_gpt2_meg(run_test_script): - # GPT2 (MHA, layer norm, absolute embeddings) with Megatron. - run_test_script("test_gpt2_meg", CONFIG_GPT2_MEGATRON + ["--micro-batch-size=8"], is_megatron=True) - - -@pytest.mark.depends(on=["test_gpt2_meg"]) -def test_gpt2_match_meg(run_test_script): - # GPT2 (MHA, layer norm, absolute embeddings) with Fast-llm. - # QKV tensors are in a different format. - run_test_script( - "test_gpt2_match_meg", - CONFIG_GPT2_FAST_LLM + CONFIG_MATCH_MEGATRON + ["model.base_model.use_megatron_initialization=True"], - compare="test_gpt2_meg", - config=CompareConfig( - ignore_tensors=[ - ".self_attn.query_key_value.", - ".self_attn.query.", - ".self_attn.key_value.", - ".mlp.layer_2.weight", - ] - ), - ) +def test_megatron(run_test_script_for_all_models, model_testing_config): + run_test_script_for_all_models(is_megatron=True) @pytest.mark.slow -def test_mistral_meg(run_test_script): - # Mistral with Megatron. - # No linear bias, swiglu activation, RMSNorm - run_test_script("test_mistral_meg", CONFIG_LLAMA_MEGATRON + ["--micro-batch-size=8"], is_megatron=True) - - -@pytest.mark.depends(on=["test_mistral_meg"]) -def test_mistral_match_meg(run_test_script): - # Mistral with Fast-LLM. 
- run_test_script( - "test_mistral_match_meg", - CONFIG_LLAMA_FAST_LLM + CONFIG_MATCH_MEGATRON + ["model.base_model.use_megatron_initialization=True"], - compare="test_mistral_meg", +@pytest.mark.depends(on=["test_megatron"]) +def test_match_megatron(run_test_script_for_all_models, model_testing_config): + run_test_script_for_all_models( + [ + "model.distributed.training_dtype=fp32", + "data.datasets={}", + f"data.path={DATASET_PREFIX}", + "model.base_model.use_megatron_initialization=True", + ], + compare="test_megatron", config=CompareConfig( ignore_tensors=[ ".self_attn.query_key_value.", ".self_attn.query.", ".self_attn.key_value.", - ".self_attn.dense.", ".mlp.layer_2.weight", ] ), - ) - - -@pytest.mark.slow -def test_mixtral_meg(run_test_script): - # Mistral with Megatron. - # No linear bias, swiglu activation, RMSNorm - run_test_script("test_mixtral_meg", CONFIG_MIXTRAL_MEGATRON + ["--micro-batch-size=8"], is_megatron=True) - - -@pytest.mark.depends(on=["test_mixtral_meg"]) -def test_mixtral_match_meg(run_test_script): - # Mistral with Fast-LLM. - run_test_script( - "test_mixtral_match_meg", - CONFIG_MIXTRAL_FAST_LLM + CONFIG_MATCH_MEGATRON + ["model.base_model.use_megatron_initialization=True"], - compare="test_mixtral_meg", - config=CompareConfig( - ignore_tensors=[ - ".self_attn.query_key_value.", - ".self_attn.query.", - ".self_attn.key_value.", - ".self_attn.dense.", - ".mlp.layer_1.weight", - ".mlp.layer_2.weight", - ".mlp.experts", - "Global layer 2 fw: Transformer layer 2 output", - ], - max_rel_tolerance=1.5e-1, - ), + use_performance_args=False, ) diff --git a/tests/test_mb.py b/tests/test_mb.py index 82ac4c25f..80350df9d 100644 --- a/tests/test_mb.py +++ b/tests/test_mb.py @@ -1,82 +1,84 @@ import pytest -from tests.common import CONFIG_COMMON, TEST_MODEL -from tests.compare_tensor_logs import CompareConfig - -CONFIG_DF = CONFIG_COMMON + ["batch.depth_first_micro_batches=4"] -CONFIG_BF = CONFIG_COMMON + ["batch.breadth_first_micro_batches=4"] -CONFIG_BF_DF = CONFIG_COMMON + ["batch.depth_first_micro_batches=2", "batch.breadth_first_micro_batches=2"] +from tests.utils.compare_tensor_logs import CompareConfig # TODO: Compare grads with simple -def test_model_df4(run_test_script): +def test_model_df4(run_test_script_for_all_models): # Depth-first gradient accumulation baseline. - run_test_script(f"test_{TEST_MODEL}_df4", CONFIG_DF) + run_test_script_for_all_models("test_model_df4", ["batch.depth_first_micro_batches=4"]) @pytest.mark.slow @pytest.mark.depends(on=["test_model_df4"]) -def test_model_df4_z3(run_test_script): +def test_model_df4_z3(run_test_script_for_all_models): # Gradient accumulation with ZeRO-3. - run_test_script( - f"test_{TEST_MODEL}_df4_z3", - CONFIG_DF + ["model.multi_stage.zero_stage=3"], + run_test_script_for_all_models( + "test_model_df4_z3", + ["model.multi_stage.zero_stage=3", "batch.depth_first_micro_batches=4"], num_gpus=2, - compare=f"test_{TEST_MODEL}_df4", + compare="test_model_df4", config=CompareConfig(ignore_duplicates=["Global gradient"]), ) @pytest.mark.depends(on=["test_model_df4"], scope="session") -def test_model_bf4(run_test_script): +def test_model_bf4(run_test_script_for_all_models): # Breadth-first gradient accumulation baseline. 
- run_test_script(f"test_{TEST_MODEL}_bf4", CONFIG_BF, compare=f"test_{TEST_MODEL}_df4") + run_test_script_for_all_models(["batch.breadth_first_micro_batches=4"], compare="test_model_df4") @pytest.mark.depends(on=["test_model_df4", "test_model_bf4"]) -def test_model_bf2_df2(run_test_script): +def test_model_bf2_df2(run_test_script_for_all_models): # Mixed gradient accumulation baseline. - run_test_script(f"test_{TEST_MODEL}_bf2_df2", CONFIG_BF_DF, compare=f"test_{TEST_MODEL}_df4") + run_test_script_for_all_models( + ["batch.depth_first_micro_batches=2", "batch.breadth_first_micro_batches=2"], compare="test_model_df4" + ) @pytest.mark.slow @pytest.mark.depends(on=["test_model_bf4"]) -def test_model_pp2s2_bf4(run_test_script): +def test_model_pp2s2_bf4(run_test_script_for_all_models): # Pipeline-parallel without tied weights. - run_test_script( - f"test_{TEST_MODEL}_pp2s2_bf4", - CONFIG_BF + ["model.distributed.pipeline_parallel=2", "model.multi_stage.layers_per_stage=2"], + run_test_script_for_all_models( + [ + "batch.breadth_first_micro_batches=4", + "model.distributed.pipeline_parallel=2", + "model.multi_stage.layers_per_stage=2", + ], num_gpus=2, - compare=f"test_{TEST_MODEL}_df4", + compare="test_model_df4", ) @pytest.mark.slow @pytest.mark.depends(on=["test_model_bf4"]) -def test_model_pp2s1_bf4(run_test_script): +def test_model_pp2s1_bf4(run_test_script_for_all_models): # Pipeline-parallel with tied weights. - run_test_script( - f"test_{TEST_MODEL}_pp2s1_bf4", - CONFIG_BF + ["model.distributed.pipeline_parallel=2", "model.multi_stage.layers_per_stage=1"], + run_test_script_for_all_models( + [ + "batch.breadth_first_micro_batches=4", + "model.distributed.pipeline_parallel=2", + "model.multi_stage.layers_per_stage=1", + ], num_gpus=2, - compare=f"test_{TEST_MODEL}_df4", + compare="test_model_df4", config=CompareConfig(ignore_duplicates=["layers.0.word_embeddings_weight"]), ) @pytest.mark.slow @pytest.mark.depends(on=["test_model_bf4"]) -def test_model_dp2_tp2_pp2s2_bf4(run_test_script): +def test_model_dp2_tp2_pp2s2_bf4(run_test_script_for_all_models): # Simple 3d parallelism # TODO: Test fails - run_test_script( - f"test_{TEST_MODEL}_dp2_tp2_pp2s2_bf4", - CONFIG_BF - + [ + run_test_script_for_all_models( + [ + "batch.breadth_first_micro_batches=4", "model.distributed.tensor_parallel=2", "model.distributed.pipeline_parallel=2", "model.multi_stage.layers_per_stage=1", ], num_gpus=8, - compare=f"test_{TEST_MODEL}_df4", + compare="test_model_df4", ) diff --git a/tests/test_mb_seq_first.py b/tests/test_mb_seq_first.py index 345a7bc49..5146dc9a9 100644 --- a/tests/test_mb_seq_first.py +++ b/tests/test_mb_seq_first.py @@ -1,57 +1,48 @@ import pytest -from tests.common import CONFIG_COMMON, TEST_MODEL -from tests.compare_tensor_logs import CompareConfig - -CONFIG_DF_SF = CONFIG_COMMON + ["batch.depth_first_micro_batches=4", "model.base_model.sequence_first=True"] -CONFIG_BF_SF = CONFIG_COMMON + ["batch.breadth_first_micro_batches=4", "model.base_model.sequence_first=True"] -CONFIG_BF_DF_SF = CONFIG_COMMON + [ - "batch.depth_first_micro_batches=2", - "batch.breadth_first_micro_batches=2", - "model.base_model.sequence_first=True", -] +from tests.utils.compare_tensor_logs import CompareConfig # TODO: Compare grads with simple -def test_model_df4_sf(run_test_script): +def test_model_df4_sf(run_test_script_for_all_models): # Sequence-first gradient accumulation baseline. 
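    # The other tests in this file compare their outputs against this run (`compare="test_model_df4_sf"`).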
- run_test_script(f"test_{TEST_MODEL}_df4_sf", CONFIG_DF_SF) + run_test_script_for_all_models(["batch.depth_first_micro_batches=4", "model.base_model.sequence_first=True"]) @pytest.mark.slow @pytest.mark.depends(on=["test_model_df4_sf"]) -def test_model_dp2_sp2_df4(run_test_script): +def test_model_dp2_sp2_df4(run_test_script_for_all_models): # Sequence-tensor-parallel with gradient accumulation. # TODO: Compiled cross-entropy broken for this config - run_test_script( - f"test_{TEST_MODEL}_dp2_sp2_df4", - CONFIG_BF_SF - + [ + run_test_script_for_all_models( + [ + "batch.breadth_first_micro_batches=4", + "model.base_model.sequence_first=True", "model.distributed.tensor_parallel=2", "model.distributed.sequence_tensor_parallel=True", "run.torch_dynamo_enable=False", ], num_gpus=4, - compare=f"test_{TEST_MODEL}_df4_sf", + compare="test_model_df4_sf", ) @pytest.mark.slow @pytest.mark.skip(reason="Test is broken.") @pytest.mark.depends(on=["test_model_df4_sf"]) -def test_model_dp2_sp2_pp2s1(run_test_script): +def test_model_dp2_sp2_pp2s1(run_test_script_for_all_models): # 3d-parallel with sequence-tensor-parallel. # TODO: Compiled cross-entropy broken for this config - run_test_script( - f"test_{TEST_MODEL}_dp2_sp2_pp2s1", - CONFIG_BF_SF - + [ + run_test_script_for_all_models( + [ + "batch.breadth_first_micro_batches=4", + "model.base_model.sequence_first=True", "model.distributed.tensor_parallel=2", "model.distributed.pipeline_parallel=2", "model.distributed.sequence_tensor_parallel=True", "run.torch_dynamo_enable=False", ], num_gpus=8, - compare=f"test_{TEST_MODEL}_df4_sf", + compare="test_model_df4_sf", config=CompareConfig(ignore_duplicates=["layers.0.word_embeddings_weight"]), ) diff --git a/tests/test_ms.py b/tests/test_ms.py index 90d166728..256eafe31 100644 --- a/tests/test_ms.py +++ b/tests/test_ms.py @@ -1,38 +1,36 @@ import pytest -from tests.common import CONFIG_COMMON, TEST_MODEL - -CONFIG_MS = CONFIG_COMMON + ["batch.micro_sequence_length=256"] - # TODO: Compare grads with simple -def test_model_ms256(run_test_script): +def test_model_ms256(run_test_script_for_all_models): # Micro-sequence baseline - run_test_script(f"test_{TEST_MODEL}_ms256", CONFIG_MS) + run_test_script_for_all_models(["batch.micro_sequence_length=256"]) @pytest.mark.slow @pytest.mark.depends(on=["test_model_ms256"]) -def test_model_pp2s2_ms256(run_test_script): +def test_model_pp2s2_ms256(run_test_script_for_all_models): # Sequence-pipeline-parallel - run_test_script( - f"test_{TEST_MODEL}_pp2s2_ms256", - CONFIG_MS + ["model.distributed.pipeline_parallel=2", "model.multi_stage.layers_per_stage=2"], + run_test_script_for_all_models( + [ + "batch.micro_sequence_length=256", + "model.distributed.pipeline_parallel=2", + "model.multi_stage.layers_per_stage=2", + ], num_gpus=2, - compare=f"test_{TEST_MODEL}_ms256", + compare="test_model_ms256", ) @pytest.mark.slow @pytest.mark.skip @pytest.mark.depends(on=["test_model_ms256"]) -def test_model_dp2s2_stp2_pp2s2_ms256(run_test_script): +def test_model_dp2s2_stp2_pp2s2_ms256(run_test_script_for_all_models): # TODO: Handle this case. 
# Sequence-3d-parallel - run_test_script( - f"test_{TEST_MODEL}_dp2s2_stp2_pp2s2_ms256", - CONFIG_MS - + [ + run_test_script_for_all_models( + [ + "batch.micro_sequence_length=256", "model.distributed.pipeline_parallel=2", "model.distributed.tensor_parallel=2", "model.distributed.sequence_tensor_parallel=True", @@ -40,5 +38,5 @@ def test_model_dp2s2_stp2_pp2s2_ms256(run_test_script): "model.multi_stage.layers_per_stage=2", ], num_gpus=8, - compare=f"test_{TEST_MODEL}_ms256", + compare="test_model_ms256", ) diff --git a/tests/test_mtp.py b/tests/test_mtp.py index edce4e74d..5c4660b73 100644 --- a/tests/test_mtp.py +++ b/tests/test_mtp.py @@ -14,7 +14,7 @@ from fast_llm.models.gpt.config import GPTBaseModelConfig from fast_llm.models.gpt.model import GPTBaseModel from fast_llm.utils import Assert -from tests.common import get_hybrid_config, materialize_meta_tensors, requires_cuda +from tests.utils.utils import get_hybrid_config, materialize_meta_tensors, requires_cuda try: from fast_llm.layers.ssm.discrete_mamba2 import DiscreteMamba2 diff --git a/tests/test_multi_stage.py b/tests/test_multi_stage.py index bb468cebe..6d3861ebf 100644 --- a/tests/test_multi_stage.py +++ b/tests/test_multi_stage.py @@ -4,7 +4,7 @@ from fast_llm.layers.transformer.transformer import TransformerLayer from fast_llm.tools.train import CliTrainingConfig from fast_llm.utils import Assert -from tests.common import CONFIG_COMMON, requires_cuda +from tests.utils.utils import requires_cuda def _get_trainer_from_args(args: list[str], model_type: str = "gpt") -> Trainer: @@ -17,8 +17,8 @@ def _get_trainer_from_args(args: list[str], model_type: str = "gpt") -> Trainer: @requires_cuda -def test_frozen_weights(): - args = CONFIG_COMMON + ["run.tensor_logs.save=False"] +def test_frozen_weights(model_testing_config): + args = model_testing_config.config_args + ["run.tensor_logs.save=False"] model_ref = _get_trainer_from_args(args)._multi_stage model_frozen = _get_trainer_from_args(args + ["model.base_model.transformer.mlp_lr_scale=[0]"])._multi_stage diff --git a/tests/test_seq_first.py b/tests/test_seq_first.py index a8f4c0363..3e8b7ea19 100644 --- a/tests/test_seq_first.py +++ b/tests/test_seq_first.py @@ -1,53 +1,48 @@ import pytest -from tests.common import CONFIG_COMMON, TEST_MODEL - -CONFIG_SF = CONFIG_COMMON + ["model.base_model.sequence_first=True"] - # TODO: Compare grads with simple -def test_model_sf(run_test_script): +def test_model_sf(run_test_script_for_all_models): # Sequence-first baseline. - run_test_script(f"test_{TEST_MODEL}_sf", CONFIG_SF) + run_test_script_for_all_models("test_model_sf", ["model.base_model.sequence_first=True"]) @pytest.mark.slow @pytest.mark.depends(on=["test_model_sf"]) -def test_model_sp2(run_test_script): +def test_model_sp2(run_test_script_for_all_models): # Sequence-tensor-parallel. 
- run_test_script( - f"test_{TEST_MODEL}_sp2", - CONFIG_SF + ["model.distributed.tensor_parallel=2", "model.distributed.sequence_tensor_parallel=True"], + run_test_script_for_all_models( + "test_model_sp2", + ["model.distributed.tensor_parallel=2", "model.distributed.sequence_tensor_parallel=True"], num_gpus=2, - compare=f"test_{TEST_MODEL}_sf", + compare="test_model_sf", ) @pytest.mark.slow @pytest.mark.depends(on=["test_model_sf"]) -def test_model_sdp2(run_test_script): +def test_model_sdp2(run_test_script_for_all_models): # Sequence-data-parallel - run_test_script( - f"test_{TEST_MODEL}_sdp2", - CONFIG_COMMON + ["model.distributed.sequence_data_parallel=2"], + run_test_script_for_all_models( + "test_model_sdp2", + ["model.distributed.sequence_data_parallel=2"], num_gpus=2, - compare=f"test_{TEST_MODEL}_sf", + compare="test_model_sf", ) @pytest.mark.slow @pytest.mark.depends(on=["test_model_sf"]) -def test_model_sp2_ce4(run_test_script): +def test_model_sp2_ce4(run_test_script_for_all_models): # Sequence-tensor-parallel with cross-entropy splits. - run_test_script( - f"test_{TEST_MODEL}_sp2_ce4", - CONFIG_SF - + [ + run_test_script_for_all_models( + "test_model_sp2_ce4", + [ "model.distributed.tensor_parallel=2", "model.distributed.sequence_tensor_parallel=True", "model.base_model.parallel_embeddings=False", "model.base_model.cross_entropy_splits=4", ], num_gpus=2, - compare=f"test_{TEST_MODEL}_sf", + compare="test_model_sf", ) diff --git a/tests/test_simple.py b/tests/test_simple.py index 3128626d3..bc48e26b4 100644 --- a/tests/test_simple.py +++ b/tests/test_simple.py @@ -1,14 +1,11 @@ import pytest -from tests.common import CONFIG_COMMON, CONFIG_FAST_LLM, TEST_MODEL - -def test_model_safe(run_test_script): +def test_model_safe(run_test_script_for_all_models): # The safest possible config, identical to the one in test_match_megatron except for the initialization. - run_test_script( - f"test_{TEST_MODEL}_safe", - CONFIG_FAST_LLM - + [ + run_test_script_for_all_models( + [ + "model.distributed.training_dtype=fp32", "run.torch_dynamo_enable=False", "schedule.data_overlap=False", "model.base_model.transformer.dropless_moe=False", @@ -17,29 +14,25 @@ def test_model_safe(run_test_script): @pytest.mark.depends(on=["test_model_safe"]) -def test_model(run_test_script): +def test_model(run_test_script_for_all_models): # A baseline config (single-gpu, bf16, flash-attn). # Also tests for multiple data loaders. - run_test_script( - f"test_{TEST_MODEL}", CONFIG_COMMON + ["training.num_workers=2"], compare=f"test_{TEST_MODEL}_safe" - ) + run_test_script_for_all_models(["training.num_workers=2"], compare="test_model_safe") @pytest.mark.slow @pytest.mark.depends(on=["test_model"]) -def test_model_dp2(run_test_script): +def test_model_dp2(run_test_script_for_all_models): # Simple data-parallel. - run_test_script(f"test_{TEST_MODEL}_dp2", CONFIG_COMMON, num_gpus=2, compare=f"test_{TEST_MODEL}") + run_test_script_for_all_models([], num_gpus=2, compare="test_model") @pytest.mark.slow -def test_model_dp2_timeout(run_test_script): +def test_model_dp2_timeout(run_test_script_for_all_models): # Test sampling timeout # TODO: Find a better way to test this - run_test_script( - f"test_{TEST_MODEL}_dp2_timeout", - CONFIG_COMMON - + [ + run_test_script_for_all_models( + [ # Use a short timeout "model.distributed.timeout=4", # Make a dataset that would timeout under the distributed timeout @@ -49,10 +42,10 @@ def test_model_dp2_timeout(run_test_script): # Use a bigger timeout for the dataset. 
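            # (It has to outlive the shorter distributed timeout set above so sampling can complete.)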
"training.timeout=10", # Remove testing clutter. - f"model.multi_stage.debug_param_init=0", - f"model.multi_stage.debug_layer_outputs=0", - f"model.multi_stage.debug_layer_gradients=0", - f"model.multi_stage.debug_all_param_gradients=0", + "model.multi_stage.debug_param_init=0", + "model.multi_stage.debug_layer_outputs=0", + "model.multi_stage.debug_layer_gradients=0", + "model.multi_stage.debug_all_param_gradients=0", ], num_gpus=2, ) @@ -60,45 +53,41 @@ def test_model_dp2_timeout(run_test_script): @pytest.mark.slow @pytest.mark.depends(on=["test_model"]) -def test_model_tp2(run_test_script): +def test_model_tp2(run_test_script_for_all_models): # Simple tensor-parallel. - run_test_script( - f"test_{TEST_MODEL}_tp2", - CONFIG_COMMON + ["model.distributed.tensor_parallel=2"], + run_test_script_for_all_models( + ["model.distributed.tensor_parallel=2"], num_gpus=2, - compare=f"test_{TEST_MODEL}", + compare="test_model", ) @pytest.mark.depends(on=["test_model"]) -def test_model_ce4(run_test_script): +def test_model_ce4(run_test_script_for_all_models): # Cross-entropy splits. - run_test_script( - f"test_{TEST_MODEL}_ce4", - CONFIG_COMMON + ["model.base_model.cross_entropy_splits=4"], - compare=f"test_{TEST_MODEL}", + run_test_script_for_all_models( + ["model.base_model.cross_entropy_splits=4"], + compare="test_model", ) @pytest.mark.slow @pytest.mark.depends(on=["test_model"]) -def test_model_dp2_z2(run_test_script): +def test_model_dp2_z2(run_test_script_for_all_models): # Data-parallel with zero stage 2. - run_test_script( - f"test_{TEST_MODEL}_dp2_z2", - CONFIG_COMMON + ["model.multi_stage.zero_stage=2"], + run_test_script_for_all_models( + ["model.multi_stage.zero_stage=2"], num_gpus=2, - compare=f"test_{TEST_MODEL}", + compare="test_model", ) @pytest.mark.slow @pytest.mark.depends(on=["test_model"]) -def test_model_dp2_z3(run_test_script): +def test_model_dp2_z3(run_test_script_for_all_models): # Data-parallel with zero stage 3. 
- run_test_script( - f"test_{TEST_MODEL}_dp2_z3", - CONFIG_COMMON + ["model.multi_stage.zero_stage=3"], + run_test_script_for_all_models( + ["model.multi_stage.zero_stage=3"], num_gpus=2, - compare=f"test_{TEST_MODEL}", + compare="test_model", ) diff --git a/tests/test_ssms.py b/tests/test_ssms.py index a6922a454..a1d460c28 100644 --- a/tests/test_ssms.py +++ b/tests/test_ssms.py @@ -16,7 +16,7 @@ from fast_llm.layers.transformer.config import TransformerKwargs from fast_llm.models.gpt.config import GPTBatchConfig, LlamaGPTHuggingfaceCheckpointFormat from fast_llm.models.ssm.config import LLambaHuggingfaceCheckpointFormat -from tests.common import get_hybrid_config, materialize_meta_tensors +from tests.utils.utils import get_hybrid_config, materialize_meta_tensors try: from fast_llm.layers.ssm.discrete_mamba2 import DiscreteMamba2 diff --git a/tests/test_triton_kernels.py b/tests/test_triton_kernels.py index 108a28982..9befe64fd 100644 --- a/tests/test_triton_kernels.py +++ b/tests/test_triton_kernels.py @@ -31,7 +31,7 @@ from fast_llm.layers.transformer.config import RotaryConfig, RotaryEmbeddingType from fast_llm.layers.transformer.preprocessing import get_rotary_frequencies from fast_llm.utils import Assert, rms_diff -from tests.common import requires_cuda +from tests.utils.utils import requires_cuda @requires_cuda diff --git a/tests/utils/__init__.py b/tests/utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/compare_tensor_logs.py b/tests/utils/compare_tensor_logs.py similarity index 100% rename from tests/compare_tensor_logs.py rename to tests/utils/compare_tensor_logs.py diff --git a/tests/utils/dataset.py b/tests/utils/dataset.py new file mode 100644 index 000000000..23c487a74 --- /dev/null +++ b/tests/utils/dataset.py @@ -0,0 +1,82 @@ +import pathlib +import random +import string + +import numpy as np +import yaml + +from fast_llm.data.dataset.gpt.memmap import GPTMemmapDataset +from fast_llm.data.dataset.gpt.sampled import GPTSample + +# TODO: Fixture +TEST_RESULTS_PATH = pathlib.Path("/tmp/fast_llm_tests") +TOKENIZER_PATH = TEST_RESULTS_PATH / "tokenizer" / "common" +TOKENIZER_FILE = TOKENIZER_PATH / "tokenizer.json" +DATASET_CACHE = TEST_RESULTS_PATH / "dataset" +DATASET_PREFIX = DATASET_CACHE / "common" / "dataset" +DATASET_SAMPLING_CACHE = TEST_RESULTS_PATH / "dataset" / "cache" +TEST_VOCAB_SIZE = 8192 +# Random lowercase: 80.7% (3.1% each); space: 18.6%; doc end: 0.6% +TEST_CHARACTERS = (string.ascii_lowercase) * 5 + " " * 30 + "\n" +TEST_DATASET_TOKENS = 1000000 + + +def get_test_dataset( + prefix: pathlib.Path = DATASET_PREFIX, + seed: int = 1234, + num_tokens: int = TEST_DATASET_TOKENS, + characters: str = TEST_CHARACTERS, + vocab_size: int = TEST_VOCAB_SIZE, + max_spans: int = 0, +): + if not TOKENIZER_FILE.is_file(): + import transformers + + transformers.AutoTokenizer.from_pretrained("bigcode/santacoder").save_pretrained(TOKENIZER_PATH) + + if not ( + prefix.with_suffix(".idx").is_file() + and prefix.with_suffix(".bin").is_file() + and prefix.parent.joinpath("fast_llm_config.yaml").is_file() + ): + import transformers + + texts = "".join(random.Random(seed).choices(characters, k=num_tokens)).splitlines() + tokenizer = transformers.AutoTokenizer.from_pretrained(TOKENIZER_PATH) + + samples = [ + GPTSample(np.array(tokenizer(document)["input_ids"], dtype=np.uint16) % vocab_size) for document in texts + ] + if max_spans > 0: + lengths = np.array([max(len(sample.token_ids), 1) for sample in samples]) + spans = 
np.sort(np.random.RandomState(seed + 3847).randint(0, lengths[:, None], [len(samples), max_spans])) + for sample, span in zip(samples, spans): + span = np.unique(span) + sample.loss_masking_spans = span[: len(span) // 2 * 2].reshape(-1, 2) + + GPTMemmapDataset.write_dataset(prefix, samples) + yaml.safe_dump( + {"type": "memmap", "path": prefix.name}, prefix.parent.joinpath("fast_llm_config.yaml").open("w") + ) + + +def get_test_concatenated_memmap_dataset( + path: pathlib.Path, + num_files: int, + seed: int = 1234, + num_tokens: int = TEST_DATASET_TOKENS, + characters: str = TEST_CHARACTERS, + vocab_size: int = TEST_VOCAB_SIZE, + seed_shift: int = 55, +): + index_file = path / "index.txt" + if not index_file.is_file(): + for i in range(num_files): + get_test_dataset( + prefix=path / f"dataset_{i}", + seed=seed + i * seed_shift, + num_tokens=num_tokens, + characters=characters, + vocab_size=vocab_size, + ) + index_file.open("w").writelines([str(path / f"dataset_{i}") + "\n" for i in range(num_files)]) diff --git a/tests/utils/model_configs.py b/tests/utils/model_configs.py new file mode 100644 index 000000000..963f6ae93 --- /dev/null +++ b/tests/utils/model_configs.py @@ -0,0 +1,276 @@ +import dataclasses +import functools +import os +import typing + +import pytest + +from fast_llm.engine.checkpoint.config import CheckpointFormat +from fast_llm.models.auto import model_registry +from fast_llm.models.gpt.config import ( + LlamaGPTHuggingfaceCheckpointFormat, + MistralGPTHuggingfaceCheckpointFormat, + MixtralGPTHuggingfaceCheckpointFormat, + MTPLlamaGPTHuggingfaceCheckpointFormat, + Qwen2GPTHuggingfaceCheckpointFormat, + Starcoder2GPTHuggingfaceCheckpointFormat, +) +from fast_llm.models.ssm.config import LLambaHuggingfaceCheckpointFormat +from tests.utils.dataset import DATASET_PREFIX, TEST_VOCAB_SIZE + +_LOG_LEVEL = int(os.environ.get("LOG_LEVEL", 13)) + + +@dataclasses.dataclass(kw_only=True, frozen=True) +class ModelTestingConfig: + name: str = None + model_type: str + config_args: list[str] + megatron_args: list[str] | None + checkpoint_format: CheckpointFormat | None + + @functools.cached_property + def model_config_class(self): + return model_registry[self.model_type] + + @functools.cached_property + def huggingface_model_for_causal_lm_class(self): + return self.model_config_class.get_huggingface_model_for_causal_lm_class() + + @functools.cached_property + def model_class(self): + return self.model_config_class.get_model_class() + + @functools.cached_property + def base_model_config_class(self): + return self.model_config_class.get_base_model_config_class() + + +def _update_and_add_testing_config( + old_name: str, + new_name: str, + *, + model_type: str | None = None, + extra_args: list[str] | None = None, + megatron_args: list[str] | None = ..., + checkpoint_format: CheckpointFormat | None = ..., +): + config = _MODEL_CONFIGS[old_name] + updates: dict[str, typing.Any] = {"name": new_name} + if model_type is not None: + updates["model_type"] = model_type + if extra_args is not None: + updates["config_args"] = config.config_args + extra_args + if megatron_args is not ...: + if megatron_args is None: + updates["megatron_args"] = None + elif config.megatron_args is None: + updates["megatron_args"] = megatron_args + else: + updates["megatron_args"] = config.megatron_args + megatron_args + if checkpoint_format is not ...: + updates["checkpoint_format"] = checkpoint_format + + _MODEL_CONFIGS[new_name] = dataclasses.replace(config, **updates) + + +_MODEL_CONFIGS: dict[str, ModelTestingConfig] 
= {} + + +_MODEL_CONFIGS["gpt2"] = ModelTestingConfig( + name="gpt2", + model_type="gpt", + config_args=[ + "training.logs.interval=1", + "run.tensor_logs.save=True", + "run.tensor_logs.show=False", + "model.base_model.max_position_embeddings=512", + "model.base_model.transformer.num_layers=2", + "model.base_model.transformer.hidden_size=256", + "model.base_model.transformer.num_attention_heads=8", + "model.base_model.transformer.head_groups=8", + "model.base_model.transformer.init_method_std=0.022", + f"model.base_model.vocab_size={TEST_VOCAB_SIZE}", + f"model.multi_stage.debug_param_init={_LOG_LEVEL}", + f"model.multi_stage.debug_layer_outputs={_LOG_LEVEL}", + f"model.multi_stage.debug_layer_gradients={_LOG_LEVEL}", + f"model.multi_stage.debug_all_param_gradients={_LOG_LEVEL}", + "model.multi_stage.debug_tensor_parallel=True", + "model.distributed.reproducible_init=True", + "model.distributed.timeout=10", + "model.distributed.training_dtype=bf16", + "training.train_iters=2", + "training.num_workers=0", + "training.timeout=30", + "batch.batch_size=8", + "batch.sequence_length=512", + "data.datasets.training.type=slice", + "data.datasets.training.end=0.969", + "data.datasets.training.dataset.type=memmap", + f"data.datasets.training.dataset.path={DATASET_PREFIX}", + "data.datasets.validation.type=slice", + "data.datasets.validation.begin=0.969", + "data.datasets.validation.end=0.999", + "data.datasets.validation.dataset.type=memmap", + f"data.datasets.validation.dataset.path={DATASET_PREFIX}", + "data.datasets.test.type=slice", + "data.datasets.test.begin=0.999", + "data.datasets.test.end=1", + "data.datasets.test.dataset.type=memmap", + f"data.datasets.test.dataset.path={DATASET_PREFIX}", + "optimizer.learning_rate.base=0.0001", + ], + megatron_args=[ + "--num-layers=2", + "--hidden-size=256", + "--num-attention-heads=8", + "--log-interval=1", + "--train-iters=2", + "--eval-iters=0", + "--hidden-dropout=0", + "--attention-dropout=0", + f"--debug_param_init={_LOG_LEVEL}", + f"--debug_layer_outputs={_LOG_LEVEL}", + f"--debug_layer_gradients={_LOG_LEVEL}", + f"--debug_all_param_gradients={_LOG_LEVEL}", + "--debug_param_update=0", + "--global-batch-size=8", + "--micro-batch-size=8", + "--max-position-embeddings=512", + "--seq-length=512", + "--init-method-std=0.022", + "--lr=0.0001", + "--num-workers=0", + "--valid-num-workers=0", + "--tokenizer-type=NullTokenizer", + # Megatron messes with the vocab size, so we have to subtract 1. + f"--vocab-size={TEST_VOCAB_SIZE - 1}", + f"--data-path={DATASET_PREFIX}", + "--lr-decay-style=constant", + # Initialization is set up to match MCore models (MCore inverts self-attn qkv and dense layers compared to original Megatron) + "--use-mcore-models", + # local implementation doesn't allow for RMS norm. 
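+        # (The llama-derived configs below append `--normalization=RMSNorm` to these arguments.)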
+ "--transformer-impl=transformer_engine", + ], + checkpoint_format=None, +) + +_update_and_add_testing_config( + "gpt2", + "starcoder", + extra_args=["model.base_model.transformer.head_groups=1"], + megatron_args=["--group-query-attention"], + checkpoint_format=None, +) + +_update_and_add_testing_config( + "gpt2", + "starcoder2", + extra_args=[ + "model.base_model.transformer.head_groups=4", + "model.base_model.transformer.rotary.type=default", + ], + megatron_args=[ + "--group-query-attention", + "--num-query-groups=4", + "--use-rotary-position-embeddings", + "--no-position-embedding", + ], + checkpoint_format=Starcoder2GPTHuggingfaceCheckpointFormat, +) + +_update_and_add_testing_config( + "starcoder2", + "llama", + extra_args=[ + "model.base_model.transformer.gated=True", + "model.base_model.transformer.activation_type=silu", + "model.base_model.transformer.add_linear_biases=False", + "model.base_model.transformer.normalization.type=rms_norm", + "model.base_model.transformer.ffn_hidden_size=1024", + "model.base_model.tie_word_embeddings=False", + ], + megatron_args=[ + "--swiglu", + "--disable-bias-linear", + "--normalization=RMSNorm", + "--ffn-hidden-size=1024", + "--untie-embeddings-and-output-weights", + ], + checkpoint_format=LlamaGPTHuggingfaceCheckpointFormat, +) + +_update_and_add_testing_config( + "llama", + "llama3", + extra_args=["model.base_model.transformer.rotary.type=llama3"], + # Megatron doesn't support Llama3-style Rotary Embeddings + megatron_args=None, + checkpoint_format=LlamaGPTHuggingfaceCheckpointFormat, +) + +_update_and_add_testing_config( + "llama", + "llama_yarn", + extra_args=["model.base_model.transformer.rotary.type=yarn"], + # Megatron doesn't support Yarn-style Rotary Embeddings + megatron_args=None, + checkpoint_format=LlamaGPTHuggingfaceCheckpointFormat, +) + +_update_and_add_testing_config( + "llama", + "llama_mtp", + extra_args=["model.base_model.prediction_heads=4"], + # Megatron doesn't support multi-token prediction. + megatron_args=None, + checkpoint_format=MTPLlamaGPTHuggingfaceCheckpointFormat, +) + +_update_and_add_testing_config( + "llama", + "qwen2", + extra_args=["model.base_model.transformer.add_linear_biases=only_attn_qkv"], + # Megatron doesn't support per sub layer biases + megatron_args=None, + checkpoint_format=Qwen2GPTHuggingfaceCheckpointFormat, +) + +_update_and_add_testing_config( + "llama", + "mistral", + extra_args=["model.base_model.transformer.window_size=128"], + # Megatron doesn't support sliding windows. + megatron_args=None, + checkpoint_format=MistralGPTHuggingfaceCheckpointFormat, +) + +_update_and_add_testing_config( + # We ignore sliding windows to enable comparison with Megatron. + "llama", + "mixtral", + extra_args=[ + "model.base_model.transformer.num_experts=4", + "model.base_model.transformer.num_experts_per_token=4", + ], + megatron_args=[ + "--num-experts=4", + "--moe-router-topk=4", + ], + checkpoint_format=MixtralGPTHuggingfaceCheckpointFormat, +) + +_update_and_add_testing_config( + # We ignore sliding windows to enable comparison with Megatron. 
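+    # (Deriving from "llama" rather than "mistral" means `window_size` is not inherited here.)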
+ "llama", + "llamba", + model_type="hybrid_ssm", + extra_args=["model.base_model.hybrid_block_layout=['t','m']"], + megatron_args=None, + checkpoint_format=LLambaHuggingfaceCheckpointFormat, +) + + +@pytest.fixture(scope="session", params=_MODEL_CONFIGS.keys()) +def model_testing_config(request) -> ModelTestingConfig: + return _MODEL_CONFIGS[request.param] diff --git a/tests/utils/run_test_script.py b/tests/utils/run_test_script.py new file mode 100644 index 000000000..c11d3f3b5 --- /dev/null +++ b/tests/utils/run_test_script.py @@ -0,0 +1,118 @@ +import os +import pathlib +import shutil +import subprocess +import sys + +import pytest +import torch + +from fast_llm.tools.train import CliTrainingConfig +from tests.utils.compare_tensor_logs import CompareConfig, compare_tensor_logs +from tests.utils.dataset import get_test_dataset + +# FIXME: figure out correct import of megatron modules without this hack +sys.path.append(os.getcwd()) + +_ARTIFACT_PATH = "runs/0/artifacts" + + +@pytest.fixture(scope="session") +def run_test_script(worker_resources): + def do_run_test_script( + path: pathlib.Path, + args: list[str], + num_gpus: int = 1, + *, + model_type: str, + is_megatron: bool = False, + compare_path: pathlib.Path | None = None, + config: CompareConfig | None = None, + prepare_fn=None, + compare_fn=None, + do_compare: bool = True, + ): + if torch.cuda.device_count() < num_gpus: + pytest.skip(f"Not enough GPUs to run test ({torch.cuda.device_count()}<{num_gpus})") + env = os.environ.copy() + if is_megatron: + # Prevent Megatron from complaining. + env["CUDA_DEVICE_MAX_CONNECTIONS"] = "1" + env["NVTE_FLASH_ATTN"] = "0" + skip = False + if path.exists(): + assert path.is_dir() + # TODO: Better way to check if the previous attempt succeeded. + shutil.rmtree(path) + if prepare_fn is not None: + skip = prepare_fn(path, None if compare_path is None else compare_path, skip) + if is_megatron: + args = [*args, f"--structured-logs-dir={path}", f"--data-cache-path={path}"] + else: + args = [model_type, *args, f"run.experiment_dir={path}"] + header = ["Megatron-LM/pretrain_gpt.py"] if is_megatron else ["--no-python", "fast-llm", "train"] + command = [ + "python", + "-m", + "torch.distributed.run", + f"--nproc-per-node={num_gpus}", + f"--rdzv-endpoint=localhost:{worker_resources.rendezvous_port}", + f"--master-port={worker_resources.torchrun_port}", + *header, + *args, + ] + print(" ".join(command)) + if skip: + print("Reusing existing run.") + else: + get_test_dataset() + if num_gpus == 1 and not is_megatron: + CliTrainingConfig.parse_and_run(args) + else: + completed_proc = subprocess.run(command, env=env, timeout=60) + if completed_proc.returncode: + raise RuntimeError(f"Process failed with return code {completed_proc.returncode}") + if compare_path is not None and do_compare: + if compare_fn is not None: + compare_fn(path, compare_path) + compare_tensor_logs( + compare_path / _ARTIFACT_PATH, + path / _ARTIFACT_PATH, + config, + ) + + return do_run_test_script + + +@pytest.fixture(scope="session") +def run_test_script_base_path(model_testing_config, result_path, request): + return result_path / "models" / model_testing_config.name + + +@pytest.fixture(scope="function") +def run_test_script_for_all_models(run_test_script, run_test_script_base_path, model_testing_config, request): + def do_run_test_script_for_all_models( + extra_args: list[str], + num_gpus: int = 1, + *, + is_megatron: bool = False, + compare: str | None = None, + config: CompareConfig | None = None, + prepare_fn=None, + 
compare_fn=None, + do_compare: bool = True, + ): + run_test_script( + run_test_script_base_path / request.node.originalname, + (model_testing_config.megatron_args if is_megatron else model_testing_config.config_args) + extra_args, + num_gpus, + model_type=model_testing_config.model_type, + is_megatron=is_megatron, + compare_path=None if compare is None else run_test_script_base_path / compare, + config=config, + prepare_fn=prepare_fn, + compare_fn=compare_fn, + do_compare=do_compare, + ) + + return do_run_test_script_for_all_models diff --git a/tests/utils/utils.py b/tests/utils/utils.py new file mode 100644 index 000000000..bf2059fa8 --- /dev/null +++ b/tests/utils/utils.py @@ -0,0 +1,55 @@ +import pathlib + +import pytest +import torch + +from fast_llm.layers.ssm.config import SSMConfig +from fast_llm.layers.transformer.config import TransformerConfig +from fast_llm.models.ssm.config import HybridSSMBaseModelConfig + +requires_cuda = pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available") + + +@pytest.fixture(scope="session") +def result_path(): + return pathlib.Path("/tmp/fast_llm_tests") + + +def materialize_meta_tensors(model, tensor_space): + # Materialize parameters that are on meta device + for name, param in model.named_parameters(): + if param.device.type == "meta": + # Check if the parameter is a custom tensor type + if hasattr(param, "tensor_name") and hasattr(param, "init_parameter"): + param_data = param.new_empty(param.shape, device="cuda") + # Initialize param_data + param.init_parameter(param_data, tensor_space.distributed) + # Replace the parameter in the module + module_path, param_name = name.rsplit(".", 1) if "." in name else (None, name) + module = model + if module_path is not None: + for part in module_path.split("."): + module = getattr(module, part) + param = torch.nn.Parameter(param_data, requires_grad=param.requires_grad) + # TODO: add param_grad_is_zero etc., grad_buffer, etc., see test_mlp_recomputation + param.grad = None + param.grad_buffer = torch.empty_like(param) + param.param_grad_is_zero = True + module._parameters[param_name] = param + return model + + +def get_hybrid_config(hybrid_block_layout=["t", "m"], prediction_heads=1, default_mtp_type=None): + config = HybridSSMBaseModelConfig( + transformer=TransformerConfig(num_layers=len(hybrid_block_layout)), + ssm=SSMConfig(), + hybrid_block_layout=hybrid_block_layout, + prediction_heads=prediction_heads, + default_mtp_type=default_mtp_type, + init_method_std_embed=0.02, + init_method_min_embed=-0.02, + init_method_max_embed=0.02, + use_position_embeddings=True, + tie_word_embeddings=False, + ) + return config From f8850e4c09e677ab94ca062c51272fbe3689699c Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Fri, 6 Jun 2025 14:41:00 -0400 Subject: [PATCH 02/43] Parametrized dependencies --- tests/conftest.py | 76 ++++++---- tests/test_checkpoint.py | 84 ++++++++--- tests/test_gpt_generate_and_forward.py | 6 +- tests/test_match_megatron.py | 2 +- tests/test_mb.py | 12 +- tests/test_mb_seq_first.py | 4 +- tests/test_ms.py | 4 +- tests/test_seq_first.py | 8 +- tests/test_simple.py | 12 +- tests/utils/depends.py | 200 +++++++++++++++++++++++++ 10 files changed, 337 insertions(+), 71 deletions(-) create mode 100644 tests/utils/depends.py diff --git a/tests/conftest.py b/tests/conftest.py index 3d1e940b0..4cf6158de 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,13 +2,12 @@ import math import os -import networkx import pytest -import pytest_depends -import 
pytest_depends.main import torch from xdist.scheduler import LoadGroupScheduling +from tests.utils.depends import DependencyManager + # Make fixtures available globally without import from tests.utils.run_test_script import ( # isort: skip run_test_script, @@ -20,14 +19,24 @@ from tests.utils.utils import result_path # isort: skip +manager: DependencyManager | None = None + + def pytest_addoption(parser): - parser.addoption("--skip-slow", action="store_true") - parser.addoption( + group = parser.getgroup("fast_llm") + group.addoption("--skip-slow", action="store_true") + group.addoption( "--run-extra-slow", action="store_true", default=False, help="Run tests marked as extra_slow", ) + group.addoption( + "--show-dependencies", + action="store_true", + default=False, + help="List all dependencies of all tests as a list of nodeids + the names that could not be resolved.", + ) @dataclasses.dataclass @@ -49,6 +58,7 @@ def pytest_configure(config): config.addinivalue_line( "markers", "extra_slow: Mark test as extra slow and skip unless --run-extra-slow is given." ) + config.addinivalue_line("markers", "depends_on(name='name', on=['other_name']): marks dependencies between tests.") # TODO: Spawned processes (multi-gpu, Megatron) ignore resource allocation. is_parallel = hasattr(config, "workerinput") if is_parallel: @@ -98,6 +108,8 @@ def pytest_configure(config): @pytest.hookimpl(trylast=True) def pytest_collection_modifyitems(config, items): + global manager + if config.getoption("--skip-slow"): skip_slow = pytest.mark.skip(reason="Skipping slow tests") for item in items: @@ -109,26 +121,40 @@ def pytest_collection_modifyitems(config, items): if "extra_slow" in item.keywords: item.add_marker(skip_extra_slow) - manager: pytest_depends.DependencyManager = pytest_depends.managers[-1] - # Build the undirected graph as in `DependencyManager.sorted_items`. - dag = networkx.DiGraph() - for item in manager.items: - node_id = pytest_depends.clean_nodeid(item.nodeid) - dag.add_node(node_id) - for dependency in manager.dependencies[node_id].dependencies: - dag.add_edge(dependency, node_id) - # Mark dependency groups for xdist. - manager.groups = {} - for i, node_ids in enumerate(sorted(networkx.weakly_connected_components(dag), key=len, reverse=True)): - if len(node_ids) > 1: - for node_id in node_ids: - manager.nodeid_to_item[node_id]._nodeid = ( - f"{manager.nodeid_to_item[node_id]._nodeid}@dependency_group_{i}" - ) - - old_clean_nodeid = pytest_depends.main.clean_nodeid - # Hack into `clean_nodeid` so pytest_depends recognizes the renamed nodes. - pytest_depends.main.clean_nodeid = lambda nodeid: old_clean_nodeid(nodeid.split("@dependency_group_")[0]) + manager = DependencyManager(items) + + # Show the extra information if requested + if config.getoption("show_dependencies"): + manager.print_name_map(config.getoption("verbose") > 1) + manager.print_processed_dependencies(config.getoption("color")) + + # Reorder the items so that tests run after their dependencies + items[:] = manager.items + + # If pytest-depends is installed, it will complain about renamed nodes whether it's used or not. + try: + import pytest_depends + except ImportError: + pass + else: + old_clean_nodeid = pytest_depends.main.clean_nodeid + # Hack into `clean_nodeid` so pytest_depends recognizes the renamed nodes. 
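+        # For example, a (hypothetical) nodeid "tests/test_mb.py::test_model_df4[llama]@dependency_group_3"
+        # is mapped back to "tests/test_mb.py::test_model_df4[llama]" before pytest_depends looks it up.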
+ pytest_depends.main.clean_nodeid = lambda nodeid: old_clean_nodeid(nodeid.split("@dependency_group_")[0]) + + +@pytest.hookimpl(tryfirst=True, hookwrapper=True) +def pytest_runtest_makereport(item: pytest.Function, call): + outcome = yield + manager.register_result(item, outcome.get_result()) + + +def pytest_runtest_call(item: pytest.Function): + manager.handle_missing(item) + + +def pytest_unconfigure(): + global manager + manager = None @pytest.fixture(scope="session") diff --git a/tests/test_checkpoint.py b/tests/test_checkpoint.py index e7929440a..6e6d5806c 100644 --- a/tests/test_checkpoint.py +++ b/tests/test_checkpoint.py @@ -55,7 +55,7 @@ def _compare_resume_fn(test_path: pathlib.Path, compare_path: pathlib.Path): shutil.copy(compare_path / path, test_path / path) -@pytest.mark.depends(on=["test_checkpoint_and_eval"]) +@pytest.mark.depends_on(on=["test_checkpoint_and_eval[{model_testing_config}]"]) def test_resume(run_test_script_for_all_models): # Resume from iteration=1 and compare outputs with the baseline run. run_test_script_for_all_models( @@ -70,7 +70,7 @@ def test_resume(run_test_script_for_all_models): ) -@pytest.mark.depends(on=["test_checkpoint_and_eval"]) +@pytest.mark.depends_on(on=["test_checkpoint_and_eval[{model_testing_config}]"]) def test_resume_frozen(run_test_script_for_all_models): # Resume with frozen mlp. No comparison. run_test_script_for_all_models( @@ -107,7 +107,7 @@ def convert_paths(run_test_script_base_path): } -@pytest.mark.depends(on=["test_checkpoint_and_eval"]) +@pytest.mark.depends_on(on=["test_checkpoint_and_eval[{model_testing_config}]"]) def test_convert_distributed_to_fast_llm(model_testing_config, convert_paths): _run_conversion( ConvertConfig( @@ -124,7 +124,7 @@ def test_convert_distributed_to_fast_llm(model_testing_config, convert_paths): ) -@pytest.mark.depends(on=["test_convert_distributed_to_fast_llm"]) +@pytest.mark.depends_on(on=["test_convert_distributed_to_fast_llm[{model_testing_config}]"]) def test_convert_fast_llm_to_huggingface(model_testing_config, convert_paths): if model_testing_config.checkpoint_format is None: pytest.skip(f"Conversion not supported for {model_testing_config.name}") @@ -143,7 +143,7 @@ def test_convert_fast_llm_to_huggingface(model_testing_config, convert_paths): ) -@pytest.mark.depends(on=["test_convert_fast_llm_to_huggingface"]) +@pytest.mark.depends_on(on=["test_convert_fast_llm_to_huggingface[{model_testing_config}]"]) def test_convert_huggingface_to_distributed(model_testing_config, convert_paths): _run_conversion( ConvertConfig( @@ -160,7 +160,7 @@ def test_convert_huggingface_to_distributed(model_testing_config, convert_paths) ) -@pytest.mark.depends(on=["test_checkpoint_and_eval"]) +@pytest.mark.depends_on(on=["test_checkpoint_and_eval[{model_testing_config}]"]) def test_convert_distributed_to_huggingface(model_testing_config, convert_paths): if model_testing_config.checkpoint_format is None: pytest.skip(f"Conversion not supported for {model_testing_config.name}") @@ -179,7 +179,7 @@ def test_convert_distributed_to_huggingface(model_testing_config, convert_paths) ) -@pytest.mark.depends(on=["test_convert_distributed_to_huggingface"]) +@pytest.mark.depends_on(on=["test_convert_distributed_to_huggingface[{model_testing_config}]"]) def test_convert_huggingface_to_fast_llm(model_testing_config, convert_paths): _run_conversion( ConvertConfig( @@ -196,7 +196,7 @@ def test_convert_huggingface_to_fast_llm(model_testing_config, convert_paths): ) 
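# Together, the conversions above and below exercise two round trips starting from the trained
# distributed checkpoint: distributed -> fast_llm -> huggingface -> distributed, and
# distributed -> huggingface -> fast_llm -> distributed. The `test_converted_*` tests below then
# check that both paths produce identical weights at each matching stage.
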
-@pytest.mark.depends(on=["test_convert_huggingface_to_fast_llm"]) +@pytest.mark.depends_on(on=["test_convert_huggingface_to_fast_llm[{model_testing_config}]"]) def test_convert_fast_llm_to_distributed(model_testing_config, convert_paths): _run_conversion( ConvertConfig( @@ -213,7 +213,12 @@ def test_convert_fast_llm_to_distributed(model_testing_config, convert_paths): ) -@pytest.mark.depends(on=["test_convert_huggingface_to_distributed", "test_convert_fast_llm_to_distributed"]) +@pytest.mark.depends_on( + on=[ + "test_convert_huggingface_to_distributed[{model_testing_config}]", + "test_convert_fast_llm_to_distributed[{model_testing_config}]", + ] +) def test_converted_distributed(convert_paths): # Compare the fast llm weights # TODO: Compare configs @@ -229,7 +234,12 @@ def test_converted_distributed(convert_paths): assert (w[key] == w1[key]).all(), (w[key], w1[key]) -@pytest.mark.depends(on=["test_convert_distributed_to_fast_llm", "test_convert_huggingface_to_fast_llm"]) +@pytest.mark.depends_on( + on=[ + "test_convert_distributed_to_fast_llm[{model_testing_config}]", + "test_convert_huggingface_to_fast_llm[{model_testing_config}]", + ] +) def test_converted_fast_llm(convert_paths): s0 = safetensors.torch.load_file(convert_paths["fast_llm_0"] / "model_0.safetensors") s1 = safetensors.torch.load_file(convert_paths["fast_llm_1"] / "model_0.safetensors") @@ -239,7 +249,12 @@ def test_converted_fast_llm(convert_paths): assert (s0[key] == s1[key]).all(), (key, s0, s1) -@pytest.mark.depends(on=["test_convert_fast_llm_to_huggingface", "test_convert_distributed_to_huggingface"]) +@pytest.mark.depends_on( + on=[ + "test_convert_fast_llm_to_huggingface[{model_testing_config}]", + "test_convert_distributed_to_huggingface[{model_testing_config}]", + ] +) def test_converted_huggingface(convert_paths): h0 = safetensors.torch.load_file(convert_paths["huggingface_0"] / "model_0.safetensors") h1 = safetensors.torch.load_file(convert_paths["huggingface_1"] / "model_0.safetensors") @@ -257,7 +272,7 @@ def _compare_architectures(config_ref: FastLLMModelConfig, config_test: FastLLMM config_ref.base_model.compare_architecture(config_test.base_model) -@pytest.mark.depends(on=["test_converted_distributed"]) +@pytest.mark.depends_on(on=["test_converted_distributed[{model_testing_config}]"]) def test_load_pretrained_distributed_checkpoint(model_testing_config, convert_paths): config = model_testing_config.model_config_class.from_dict( yaml.safe_load((convert_paths["checkpoint"] / ".." / ".." 
/ "config.yaml").open("r"))["model"], strict=False @@ -277,7 +292,7 @@ def test_load_pretrained_distributed_checkpoint(model_testing_config, convert_pa assert (state_shards[f"{shard_name}_shard"] == model.get_shard(shard_name)).all() -@pytest.mark.depends(on=["test_load_pretrained_distributed_checkpoint"]) +@pytest.mark.depends_on(on=["test_load_pretrained_distributed_checkpoint[{model_testing_config}]"]) def test_load_converted_distributed_checkpoint(model_testing_config, convert_paths): config_ref = model_testing_config.model_config_class.from_pretrained( CheckpointLoadConfig( @@ -309,7 +324,12 @@ def test_load_converted_distributed_checkpoint(model_testing_config, convert_pat assert (weight_shard == model.get_shard(ShardName.weights)).all() -@pytest.mark.depends(on=["test_converted_fast_llm", "test_load_pretrained_distributed_checkpoint"]) +@pytest.mark.depends_on( + on=[ + "test_converted_fast_llm[{model_testing_config}]", + "test_load_pretrained_distributed_checkpoint[{model_testing_config}]", + ] +) def test_load_converted_fast_llm_checkpoint(model_testing_config, convert_paths): config_ref = model_testing_config.model_config_class.from_pretrained( CheckpointLoadConfig( @@ -340,7 +360,12 @@ def test_load_converted_fast_llm_checkpoint(model_testing_config, convert_paths) assert (weight_shard == model.get_shard(ShardName.weights)).all() -@pytest.mark.depends(on=["test_converted_fast_llm", "test_load_pretrained_distributed_checkpoint"]) +@pytest.mark.depends_on( + on=[ + "test_converted_fast_llm[{model_testing_config}]", + "test_load_pretrained_distributed_checkpoint[{model_testing_config}]", + ] +) def test_load_converted_huggingface_checkpoint(model_testing_config, convert_paths): config_ref = model_testing_config.model_config_class.from_pretrained( CheckpointLoadConfig( @@ -372,7 +397,12 @@ def test_load_converted_huggingface_checkpoint(model_testing_config, convert_pat assert (weight_shard == model.get_shard(ShardName.weights)).all() -@pytest.mark.depends(on=["test_load_converted_fast_llm_checkpoint", "test_load_converted_huggingface_checkpoint"]) +@pytest.mark.depends_on( + on=[ + "test_load_converted_fast_llm_checkpoint[{model_testing_config}]", + "test_load_converted_huggingface_checkpoint[{model_testing_config}]", + ] +) def test_run_converted_model(model_testing_config, convert_paths): model_ref = model_testing_config.huggingface_model_for_causal_lm_class.from_pretrained( CheckpointLoadConfig( @@ -423,7 +453,7 @@ def test_run_converted_model(model_testing_config, convert_paths): @pytest.mark.slow -@pytest.mark.depends(on=["test_load_converted_distributed_checkpoint"]) +@pytest.mark.depends_on(on=["test_load_converted_distributed_checkpoint[{model_testing_config}]"]) def test_load_pretrained_distributed_in_dp2(run_test_script_for_all_models, convert_paths): run_test_script_for_all_models( "test_load_pretrained_distributed_in_dp2", @@ -438,7 +468,7 @@ def test_load_pretrained_distributed_in_dp2(run_test_script_for_all_models, conv ) -@pytest.mark.depends(on=["test_load_converted_distributed_checkpoint"]) +@pytest.mark.depends_on(on=["test_load_converted_distributed_checkpoint[{model_testing_config}]"]) def test_load_pretrained_distributed_with_config(run_test_script_for_all_models, convert_paths): run_test_script_for_all_models( "test_load_pretrained_distributed_with_config", @@ -452,7 +482,7 @@ def test_load_pretrained_distributed_with_config(run_test_script_for_all_models, ) -@pytest.mark.depends(on=["test_load_pretrained_distributed_in_dp2"]) 
+@pytest.mark.depends_on(on=["test_load_pretrained_distributed_in_dp2[{model_testing_config}]"]) def test_load_pretrained_in_dp2_match_checkpoint(model_testing_config, convert_paths, run_test_script_base_path): test_ckpt_path = run_test_script_base_path / "test_load_pretrained_distributed_in_dp2" / "checkpoint" / "1" pretrained_config_ref = CheckpointLoadConfig( @@ -497,7 +527,7 @@ def test_load_pretrained_in_dp2_match_checkpoint(model_testing_config, convert_p @pytest.mark.slow -@pytest.mark.depends(on=["test_load_pretrained_in_dp2_match_checkpoint"]) +@pytest.mark.depends_on(on=["test_load_pretrained_in_dp2_match_checkpoint[{model_testing_config}]"]) def test_load_distributed_checkpoint_dp2(model_testing_config, convert_paths, run_test_script_base_path): # This also tests conversion which uses `FastLLMModel.from_checkpoint` pretrained_config_ref = CheckpointLoadConfig( @@ -520,7 +550,12 @@ def test_load_distributed_checkpoint_dp2(model_testing_config, convert_paths, ru @pytest.mark.slow -@pytest.mark.depends(on=["test_load_converted_fast_llm_checkpoint", "test_load_pretrained_in_dp2_match_checkpoint"]) +@pytest.mark.depends_on( + on=[ + "test_load_converted_fast_llm_checkpoint[{model_testing_config}]", + "test_load_pretrained_in_dp2_match_checkpoint[{model_testing_config}]", + ] +) def test_load_pretrained_fast_llm_in_dp2(run_test_script, convert_paths, run_test_script_base_path): run_test_script( "test_load_pretrained_fast_llm_in_dp2", @@ -553,7 +588,12 @@ def test_load_pretrained_fast_llm_in_dp2(run_test_script, convert_paths, run_tes @pytest.mark.slow -@pytest.mark.depends(on=["test_load_converted_huggingface_checkpoint", "test_load_pretrained_in_dp2_match_checkpoint"]) +@pytest.mark.depends_on( + on=[ + "test_load_converted_huggingface_checkpoint[{model_testing_config}]", + "test_load_pretrained_in_dp2_match_checkpoint[{model_testing_config}]", + ] +) def test_load_pretrained_huggingface_in_dp2( run_test_script_for_all_models, model_testing_config, run_test_script_base_path, convert_paths ): diff --git a/tests/test_gpt_generate_and_forward.py b/tests/test_gpt_generate_and_forward.py index ca75cf3e8..4c920afde 100644 --- a/tests/test_gpt_generate_and_forward.py +++ b/tests/test_gpt_generate_and_forward.py @@ -251,7 +251,7 @@ def test_export_for_generate(run_test_script_for_all_models, model_testing_confi @pytest.mark.slow @requires_cuda -@pytest.mark.depends(on=["test_export_for_generate"]) +@pytest.mark.depends_on(on=["test_export_for_generate[{model_testing_config}]"]) @pytest.mark.parametrize( "use_flash_attention, use_bf16, max_new_tokens, min_matching_tokens_batch_size_1, min_matching_tokens_batch_size_2", [ @@ -314,7 +314,7 @@ def test_generate_from_model( @requires_cuda @pytest.mark.slow -@pytest.mark.depends(on=["test_export_for_generate"]) +@pytest.mark.depends_on(on=["test_export_for_generate[{model_testing_config}]"]) def test_small_generate_from_model(model_testing_config, run_test_script_base_path): _test_generate_from_model( run_test_script_base_path / f"test_export_for_generate/export/{model_testing_config.checkpoint_format.name}/1", @@ -363,7 +363,7 @@ def test_forward_return_hidden_states(model_path): @pytest.mark.slow @requires_cuda -@pytest.mark.depends(on=["test_export_for_generate"]) +@pytest.mark.depends_on(on=["test_export_for_generate[{model_testing_config}]"]) def test_small_forward_return_hidden_states(model_testing_config, run_test_script_base_path): _test_forward_return_hidden_states( run_test_script_base_path / 
f"test_export_for_generate/export/{model_testing_config.checkpoint_format.name}/1", diff --git a/tests/test_match_megatron.py b/tests/test_match_megatron.py index a77906ae2..5c0bbdaa1 100644 --- a/tests/test_match_megatron.py +++ b/tests/test_match_megatron.py @@ -10,7 +10,7 @@ def test_megatron(run_test_script_for_all_models, model_testing_config): @pytest.mark.slow -@pytest.mark.depends(on=["test_megatron"]) +@pytest.mark.depends_on(on=["test_megatron[{model_testing_config}]"]) def test_match_megatron(run_test_script_for_all_models, model_testing_config): run_test_script_for_all_models( [ diff --git a/tests/test_mb.py b/tests/test_mb.py index 80350df9d..e1f79fc14 100644 --- a/tests/test_mb.py +++ b/tests/test_mb.py @@ -10,7 +10,7 @@ def test_model_df4(run_test_script_for_all_models): @pytest.mark.slow -@pytest.mark.depends(on=["test_model_df4"]) +@pytest.mark.depends_on(on=["test_model_df4[{model_testing_config}]"]) def test_model_df4_z3(run_test_script_for_all_models): # Gradient accumulation with ZeRO-3. run_test_script_for_all_models( @@ -22,13 +22,13 @@ def test_model_df4_z3(run_test_script_for_all_models): ) -@pytest.mark.depends(on=["test_model_df4"], scope="session") +@pytest.mark.depends_on(on=["test_model_df4[{model_testing_config}]"], scope="session") def test_model_bf4(run_test_script_for_all_models): # Breadth-first gradient accumulation baseline. run_test_script_for_all_models(["batch.breadth_first_micro_batches=4"], compare="test_model_df4") -@pytest.mark.depends(on=["test_model_df4", "test_model_bf4"]) +@pytest.mark.depends_on(on=["test_model_df4[{model_testing_config}]", "test_model_bf4[{model_testing_config}]"]) def test_model_bf2_df2(run_test_script_for_all_models): # Mixed gradient accumulation baseline. run_test_script_for_all_models( @@ -37,7 +37,7 @@ def test_model_bf2_df2(run_test_script_for_all_models): @pytest.mark.slow -@pytest.mark.depends(on=["test_model_bf4"]) +@pytest.mark.depends_on(on=["test_model_bf4[{model_testing_config}]"]) def test_model_pp2s2_bf4(run_test_script_for_all_models): # Pipeline-parallel without tied weights. run_test_script_for_all_models( @@ -52,7 +52,7 @@ def test_model_pp2s2_bf4(run_test_script_for_all_models): @pytest.mark.slow -@pytest.mark.depends(on=["test_model_bf4"]) +@pytest.mark.depends_on(on=["test_model_bf4[{model_testing_config}]"]) def test_model_pp2s1_bf4(run_test_script_for_all_models): # Pipeline-parallel with tied weights. run_test_script_for_all_models( @@ -68,7 +68,7 @@ def test_model_pp2s1_bf4(run_test_script_for_all_models): @pytest.mark.slow -@pytest.mark.depends(on=["test_model_bf4"]) +@pytest.mark.depends_on(on=["test_model_bf4[{model_testing_config}]"]) def test_model_dp2_tp2_pp2s2_bf4(run_test_script_for_all_models): # Simple 3d parallelism # TODO: Test fails diff --git a/tests/test_mb_seq_first.py b/tests/test_mb_seq_first.py index 5146dc9a9..7d3cf5ad9 100644 --- a/tests/test_mb_seq_first.py +++ b/tests/test_mb_seq_first.py @@ -10,7 +10,7 @@ def test_model_df4_sf(run_test_script_for_all_models): @pytest.mark.slow -@pytest.mark.depends(on=["test_model_df4_sf"]) +@pytest.mark.depends_on(on=["test_model_df4_sf[{model_testing_config}]"]) def test_model_dp2_sp2_df4(run_test_script_for_all_models): # Sequence-tensor-parallel with gradient accumulation. 
# TODO: Compiled cross-entropy broken for this config @@ -29,7 +29,7 @@ def test_model_dp2_sp2_df4(run_test_script_for_all_models): @pytest.mark.slow @pytest.mark.skip(reason="Test is broken.") -@pytest.mark.depends(on=["test_model_df4_sf"]) +@pytest.mark.depends_on(on=["test_model_df4_sf[{model_testing_config}]"]) def test_model_dp2_sp2_pp2s1(run_test_script_for_all_models): # 3d-parallel with sequence-tensor-parallel. # TODO: Compiled cross-entropy broken for this config diff --git a/tests/test_ms.py b/tests/test_ms.py index 256eafe31..23ef60e64 100644 --- a/tests/test_ms.py +++ b/tests/test_ms.py @@ -8,7 +8,7 @@ def test_model_ms256(run_test_script_for_all_models): @pytest.mark.slow -@pytest.mark.depends(on=["test_model_ms256"]) +@pytest.mark.depends_on(on=["test_model_ms256[{model_testing_config}]"]) def test_model_pp2s2_ms256(run_test_script_for_all_models): # Sequence-pipeline-parallel run_test_script_for_all_models( @@ -24,7 +24,7 @@ def test_model_pp2s2_ms256(run_test_script_for_all_models): @pytest.mark.slow @pytest.mark.skip -@pytest.mark.depends(on=["test_model_ms256"]) +@pytest.mark.depends_on(on=["test_model_ms256[{model_testing_config}]"]) def test_model_dp2s2_stp2_pp2s2_ms256(run_test_script_for_all_models): # TODO: Handle this case. # Sequence-3d-parallel diff --git a/tests/test_seq_first.py b/tests/test_seq_first.py index 3e8b7ea19..3df31bb9d 100644 --- a/tests/test_seq_first.py +++ b/tests/test_seq_first.py @@ -4,11 +4,11 @@ # TODO: Compare grads with simple def test_model_sf(run_test_script_for_all_models): # Sequence-first baseline. - run_test_script_for_all_models("test_model_sf", ["model.base_model.sequence_first=True"]) + run_test_script_for_all_models("test_model_sf[{model_testing_config}]", ["model.base_model.sequence_first=True"]) @pytest.mark.slow -@pytest.mark.depends(on=["test_model_sf"]) +@pytest.mark.depends_on(on=["test_model_sf[{model_testing_config}]"]) def test_model_sp2(run_test_script_for_all_models): # Sequence-tensor-parallel. run_test_script_for_all_models( @@ -20,7 +20,7 @@ def test_model_sp2(run_test_script_for_all_models): @pytest.mark.slow -@pytest.mark.depends(on=["test_model_sf"]) +@pytest.mark.depends_on(on=["test_model_sf[{model_testing_config}]"]) def test_model_sdp2(run_test_script_for_all_models): # Sequence-data-parallel run_test_script_for_all_models( @@ -32,7 +32,7 @@ def test_model_sdp2(run_test_script_for_all_models): @pytest.mark.slow -@pytest.mark.depends(on=["test_model_sf"]) +@pytest.mark.depends_on(on=["test_model_sf[{model_testing_config}]"]) def test_model_sp2_ce4(run_test_script_for_all_models): # Sequence-tensor-parallel with cross-entropy splits. run_test_script_for_all_models( diff --git a/tests/test_simple.py b/tests/test_simple.py index bc48e26b4..8026f0127 100644 --- a/tests/test_simple.py +++ b/tests/test_simple.py @@ -13,7 +13,7 @@ def test_model_safe(run_test_script_for_all_models): ) -@pytest.mark.depends(on=["test_model_safe"]) +@pytest.mark.depends_on(on=["test_model_safe[{model_testing_config}]"]) def test_model(run_test_script_for_all_models): # A baseline config (single-gpu, bf16, flash-attn). # Also tests for multiple data loaders. @@ -21,7 +21,7 @@ def test_model(run_test_script_for_all_models): @pytest.mark.slow -@pytest.mark.depends(on=["test_model"]) +@pytest.mark.depends_on(on=["test_model[{model_testing_config}]"]) def test_model_dp2(run_test_script_for_all_models): # Simple data-parallel. 
run_test_script_for_all_models([], num_gpus=2, compare="test_model") @@ -52,7 +52,7 @@ def test_model_dp2_timeout(run_test_script_for_all_models): @pytest.mark.slow -@pytest.mark.depends(on=["test_model"]) +@pytest.mark.depends_on(on=["test_model[{model_testing_config}]"]) def test_model_tp2(run_test_script_for_all_models): # Simple tensor-parallel. run_test_script_for_all_models( @@ -62,7 +62,7 @@ def test_model_tp2(run_test_script_for_all_models): ) -@pytest.mark.depends(on=["test_model"]) +@pytest.mark.depends_on(on=["test_model[{model_testing_config}]"]) def test_model_ce4(run_test_script_for_all_models): # Cross-entropy splits. run_test_script_for_all_models( @@ -72,7 +72,7 @@ def test_model_ce4(run_test_script_for_all_models): @pytest.mark.slow -@pytest.mark.depends(on=["test_model"]) +@pytest.mark.depends_on(on=["test_model[{model_testing_config}]"]) def test_model_dp2_z2(run_test_script_for_all_models): # Data-parallel with zero stage 2. run_test_script_for_all_models( @@ -83,7 +83,7 @@ def test_model_dp2_z2(run_test_script_for_all_models): @pytest.mark.slow -@pytest.mark.depends(on=["test_model"]) +@pytest.mark.depends_on(on=["test_model[{model_testing_config}]"]) def test_model_dp2_z3(run_test_script_for_all_models): # Data-parallel with zero stage 3. run_test_script_for_all_models( diff --git a/tests/utils/depends.py b/tests/utils/depends.py new file mode 100644 index 000000000..c1e2e2509 --- /dev/null +++ b/tests/utils/depends.py @@ -0,0 +1,200 @@ +import re + +import colorama +import networkx +import pytest + +MARKER_NAME = "depends_on" +MARKER_KWARG_ID = "name" +MARKER_KWARG_DEPENDENCIES = "on" + +REGEX_PARAMETERS = re.compile(r"\[.+\]$") + + +def clean_nodeid(nodeid): + return nodeid.replace("::()::", "::").split("@dependency_group_")[0] + + +def get_names(item): + names = set() + + # Node id + nodeid = clean_nodeid(item.nodeid) + names.add(nodeid) + + # Node id without parameter + nodeid = REGEX_PARAMETERS.sub("", nodeid) + names.add(nodeid) + + # Node id scopes + while "::" in nodeid: + nodeid = nodeid.rsplit("::", 1)[0] + names.add(nodeid) + + # Custom name + for marker in item.iter_markers(): + if marker.name == MARKER_NAME and MARKER_KWARG_ID in marker.kwargs: + for name in as_list(marker.kwargs[MARKER_KWARG_ID]): + names.add(name) + + return names + + +def as_list(lst): + return [lst] if isinstance(lst, str) else lst + + +STEPS = ["setup", "call", "teardown"] +GOOD_OUTCOME = "passed" + + +class DependencyManager: + """Keep track of tests, their names and their dependencies.""" + + def __init__(self, items: list[pytest.Function]): + self._items = items + self._name_to_nodeids: dict[str, list[str]] = {} + self._nodeid_to_item: dict[str, pytest.Function] = {} + self._results: dict[str, dict[str, str]] = {} + self._dependencies: dict[str, set[str]] = {} + self._unresolved: dict[str, set[str]] = {} + + for item in self._items: + nodeid = clean_nodeid(item.nodeid) + # Add the mapping from nodeid to the test item + self._nodeid_to_item[nodeid] = item + # Add the mappings from all names to the node id + for name in get_names(item): + if name not in self._name_to_nodeids: + self._name_to_nodeids[name] = [] + self._name_to_nodeids[name].append(nodeid) + # Create the object that will contain the results of this test + self._results[nodeid] = {} + + for item in self._items: + # Process the dependencies of this test + # This uses the mappings created in the previous loop, and can thus not be merged into that loop + nodeid = clean_nodeid(item.nodeid) + self._dependencies[nodeid], 
self._unresolved[nodeid] = self._resolve_dependencies(item) + + self._items = self._sort_dependencies() + + @property + def items(self) -> list[pytest.Function]: + return self._items + + def register_result(self, item: pytest.Function, result: pytest.TestReport): + self._results[clean_nodeid(item.nodeid)][result.when] = result.outcome + + def handle_missing(self, item: pytest.Function): + nodeid = clean_nodeid(item.nodeid) + if missing := self._unresolved[nodeid]: + pytest.fail(f'{item.nodeid} depends on {", ".join(missing)}, which was not found', False) + + if failed := [ + dependency + for dependency in self._dependencies[nodeid] + if not all(self._results[dependency].get(step, None) == "passed" for step in ("setup", "call", "teardown")) + ]: + pytest.skip( + f'{item.nodeid} depends on {", ".join(failed)} ({self._dependencies[nodeid]} ;;;; { + [self._results[dependency] for dependency in self._dependencies[nodeid]]})' + ) + + def _resolve_dependencies(self, item: pytest.Function): + dependencies = set() + unresolved = set() + nodeid = clean_nodeid(item.nodeid) + + for marker in item.iter_markers(): + if marker.name == MARKER_NAME: + for dependency in as_list(marker.kwargs.get(MARKER_KWARG_DEPENDENCIES, [])): + dependency = dependency.format(**item.callspec.params) + + # If the name is not known, try to make it absolute (ie file::[class::]method) + if dependency not in self._name_to_nodeids: + absolute_dependency = self._get_absolute_nodeid(dependency, nodeid) + if absolute_dependency in self._name_to_nodeids: + dependency = absolute_dependency + + # Add all items matching the name + if dependency in self._name_to_nodeids: + for nodeid in self._name_to_nodeids[dependency]: + dependencies.add(nodeid) + else: + unresolved.add(dependency) + + return dependencies, unresolved + + def _sort_dependencies(self): + # Build a directed graph for sorting + dag = networkx.DiGraph() + + for item in self.items: + nodeid = clean_nodeid(item.nodeid) + dag.add_node(nodeid) + for dependency in self._dependencies[nodeid]: + dag.add_edge(dependency, nodeid) + + for i, nodeids in enumerate(sorted(networkx.weakly_connected_components(dag), key=len, reverse=True)): + if len(nodeids) > 1: + for nodeid in nodeids: + self._nodeid_to_item[nodeid]._nodeid = ( + f"{self._nodeid_to_item[nodeid]._nodeid}@dependency_group_{i}" + ) + + return [self._nodeid_to_item[nodeid] for nodeid in networkx.topological_sort(dag)] + + @staticmethod + def _get_absolute_nodeid(nodeid: str, scope: str): + parts = nodeid.split("::") + # Completely relative (test_name), so add the full current scope (either file::class or file) + if len(parts) == 1: + base_nodeid = scope.rsplit("::", 1)[0] + nodeid = f"{base_nodeid}::{nodeid}" + # Contains some scope already (Class::test_name), so only add the current file scope + elif "." 
not in parts[0]: + base_nodeid = scope.split("::", 1)[0] + nodeid = f"{base_nodeid}::{nodeid}" + return clean_nodeid(nodeid) + + def print_name_map(self, verbose: bool = False): + """Print a human-readable version of the name -> test mapping.""" + print("Available dependency names:") + for name, nodeids in sorted(self._name_to_nodeids.items(), key=lambda x: x[0]): + if len(nodeids) == 1: + if name == nodeids[0]: + # This is just the base name, only print this when verbose + if verbose: + print(f" {name}") + else: + # Name refers to a single node id, so use the short format + print(f" {name} -> {nodeids[0]}") + else: + # Name refers to multiple node ids, so use the long format + print(f" {name} ->") + for nodeid in sorted(nodeids): + print(f" {nodeid}") + + def print_processed_dependencies(self, colors: bool = False): + """Print a human-readable list of the processed dependencies.""" + missing = "MISSING" + if colors: + missing = f"{colorama.Fore.RED}{missing}{colorama.Fore.RESET}" + colorama.init() + try: + print("Dependencies:") + + for nodeid in sorted(self._dependencies): + descriptions = [] + for dependency in self._dependencies[nodeid]: + descriptions.append(dependency) + for dependency in self._unresolved[nodeid]: + descriptions.append(f"{dependency} ({missing})") + if descriptions: + print(f" {nodeid} depends on") + for description in sorted(descriptions): + print(f" {description}") + finally: + if colors: + colorama.deinit() From 478ac05220d37363e8128ffec40fd17c7a3078fe Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Fri, 6 Jun 2025 15:11:35 -0400 Subject: [PATCH 03/43] fixes --- tests/test_checkpoint.py | 5 ----- tests/test_mb.py | 3 +-- tests/test_seq_first.py | 4 +--- tests/utils/depends.py | 5 +---- tests/utils/model_configs.py | 2 +- 5 files changed, 4 insertions(+), 15 deletions(-) diff --git a/tests/test_checkpoint.py b/tests/test_checkpoint.py index 6e6d5806c..eea3ab0e8 100644 --- a/tests/test_checkpoint.py +++ b/tests/test_checkpoint.py @@ -74,7 +74,6 @@ def test_resume(run_test_script_for_all_models): def test_resume_frozen(run_test_script_for_all_models): # Resume with frozen mlp. No comparison. 
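A minimal usage sketch for the new marker (illustrative only; the follow-up test below is hypothetical and assumes the DependencyManager above is registered from conftest.py, as done later in this series). A dependency written as "test_model[{model_testing_config}]" is formatted with the test's own callspec parameters, so each model variant waits only on the matching variant of its prerequisite, and the "@dependency_group_N" suffix appended by _sort_dependencies keeps the whole group on one pytest-xdist worker under load-group scheduling.

import pytest

# Hypothetical follow-up test; "test_model_extra" does not exist in the suite.
@pytest.mark.depends_on(on=["test_model[{model_testing_config}]"])
def test_model_extra(run_test_script_for_all_models):
    # Runs only if test_model passed for the same model_testing_config
    # parametrization; the placeholder is filled from item.callspec.params in
    # DependencyManager._resolve_dependencies, otherwise the test is skipped.
    run_test_script_for_all_models([], compare="test_model")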
run_test_script_for_all_models( - "test_resume_frozen", [ "training.checkpoint.interval=1", "training.evaluations.validation.interval=2", @@ -456,7 +455,6 @@ def test_run_converted_model(model_testing_config, convert_paths): @pytest.mark.depends_on(on=["test_load_converted_distributed_checkpoint[{model_testing_config}]"]) def test_load_pretrained_distributed_in_dp2(run_test_script_for_all_models, convert_paths): run_test_script_for_all_models( - "test_load_pretrained_distributed_in_dp2", [ "training.checkpoint.interval=1", "training.train_iters=1", @@ -471,7 +469,6 @@ def test_load_pretrained_distributed_in_dp2(run_test_script_for_all_models, conv @pytest.mark.depends_on(on=["test_load_converted_distributed_checkpoint[{model_testing_config}]"]) def test_load_pretrained_distributed_with_config(run_test_script_for_all_models, convert_paths): run_test_script_for_all_models( - "test_load_pretrained_distributed_with_config", [ "training.checkpoint.interval=1", "training.train_iters=1", @@ -558,7 +555,6 @@ def test_load_distributed_checkpoint_dp2(model_testing_config, convert_paths, ru ) def test_load_pretrained_fast_llm_in_dp2(run_test_script, convert_paths, run_test_script_base_path): run_test_script( - "test_load_pretrained_fast_llm_in_dp2", [ "training.checkpoint.interval=1", "training.train_iters=1", @@ -598,7 +594,6 @@ def test_load_pretrained_huggingface_in_dp2( run_test_script_for_all_models, model_testing_config, run_test_script_base_path, convert_paths ): run_test_script_for_all_models( - "test_load_pretrained_huggingface_in_dp2", [ "training.checkpoint.interval=1", "training.train_iters=1", diff --git a/tests/test_mb.py b/tests/test_mb.py index e1f79fc14..fb09dcec6 100644 --- a/tests/test_mb.py +++ b/tests/test_mb.py @@ -6,7 +6,7 @@ # TODO: Compare grads with simple def test_model_df4(run_test_script_for_all_models): # Depth-first gradient accumulation baseline. - run_test_script_for_all_models("test_model_df4", ["batch.depth_first_micro_batches=4"]) + run_test_script_for_all_models(["batch.depth_first_micro_batches=4"]) @pytest.mark.slow @@ -14,7 +14,6 @@ def test_model_df4(run_test_script_for_all_models): def test_model_df4_z3(run_test_script_for_all_models): # Gradient accumulation with ZeRO-3. run_test_script_for_all_models( - "test_model_df4_z3", ["model.multi_stage.zero_stage=3", "batch.depth_first_micro_batches=4"], num_gpus=2, compare="test_model_df4", diff --git a/tests/test_seq_first.py b/tests/test_seq_first.py index 3df31bb9d..6e1eb07ac 100644 --- a/tests/test_seq_first.py +++ b/tests/test_seq_first.py @@ -4,7 +4,7 @@ # TODO: Compare grads with simple def test_model_sf(run_test_script_for_all_models): # Sequence-first baseline. - run_test_script_for_all_models("test_model_sf[{model_testing_config}]", ["model.base_model.sequence_first=True"]) + run_test_script_for_all_models(["model.base_model.sequence_first=True"]) @pytest.mark.slow @@ -12,7 +12,6 @@ def test_model_sf(run_test_script_for_all_models): def test_model_sp2(run_test_script_for_all_models): # Sequence-tensor-parallel. 
run_test_script_for_all_models( - "test_model_sp2", ["model.distributed.tensor_parallel=2", "model.distributed.sequence_tensor_parallel=True"], num_gpus=2, compare="test_model_sf", @@ -24,7 +23,6 @@ def test_model_sp2(run_test_script_for_all_models): def test_model_sdp2(run_test_script_for_all_models): # Sequence-data-parallel run_test_script_for_all_models( - "test_model_sdp2", ["model.distributed.sequence_data_parallel=2"], num_gpus=2, compare="test_model_sf", diff --git a/tests/utils/depends.py b/tests/utils/depends.py index c1e2e2509..8ddb5041c 100644 --- a/tests/utils/depends.py +++ b/tests/utils/depends.py @@ -96,10 +96,7 @@ def handle_missing(self, item: pytest.Function): for dependency in self._dependencies[nodeid] if not all(self._results[dependency].get(step, None) == "passed" for step in ("setup", "call", "teardown")) ]: - pytest.skip( - f'{item.nodeid} depends on {", ".join(failed)} ({self._dependencies[nodeid]} ;;;; { - [self._results[dependency] for dependency in self._dependencies[nodeid]]})' - ) + pytest.skip(f'{item.nodeid} depends on failed {", ".join(failed)}') def _resolve_dependencies(self, item: pytest.Function): dependencies = set() diff --git a/tests/utils/model_configs.py b/tests/utils/model_configs.py index 963f6ae93..d0c0d070e 100644 --- a/tests/utils/model_configs.py +++ b/tests/utils/model_configs.py @@ -84,7 +84,7 @@ def _update_and_add_testing_config( "training.logs.interval=1", "run.tensor_logs.save=True", "run.tensor_logs.show=False", - "model.base_model.max_position_embeddings=512", + # "model.base_model.max_position_embeddings=512", "model.base_model.transformer.num_layers=2", "model.base_model.transformer.hidden_size=256", "model.base_model.transformer.num_attention_heads=8", From d3b18a13ccd6be6c9f2d2a1b36d4deeaeebd3fc2 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Mon, 9 Jun 2025 13:30:19 -0400 Subject: [PATCH 04/43] stuff --- fast_llm/layers/transformer/config.py | 5 +- tests/conftest.py | 81 +++++++++++++- tests/test_checkpoint.py | 34 ++++-- tests/test_config.py | 29 ----- tests/test_functional.py | 12 +- tests/test_gpt_generate_and_forward.py | 10 +- tests/test_match_megatron.py | 10 +- tests/test_mb.py | 12 +- tests/test_mb_seq_first.py | 6 +- tests/test_ms.py | 7 +- tests/test_multi_stage.py | 4 + tests/test_seq_first.py | 10 +- tests/test_simple.py | 16 ++- tests/utils/depends.py | 4 + tests/utils/model_configs.py | 149 ++++++++++++++++++++++++- tests/utils/run_test_script.py | 2 +- 16 files changed, 314 insertions(+), 77 deletions(-) diff --git a/fast_llm/layers/transformer/config.py b/fast_llm/layers/transformer/config.py index 235aa366e..c0ed1472a 100644 --- a/fast_llm/layers/transformer/config.py +++ b/fast_llm/layers/transformer/config.py @@ -711,7 +711,4 @@ def setup_tensor_space(self, tensor_space: TensorSpace) -> None: ) def do_use_flash_attention(self, distributed_config: DistributedConfig) -> bool: - return self.use_flash_attention and distributed_config.training_dtype in ( - DataType.float16, - DataType.bfloat16, - ) + return self.use_flash_attention and distributed_config.training_dtype in (DataType.float16, DataType.bfloat16) diff --git a/tests/conftest.py b/tests/conftest.py index 4cf6158de..829e1696f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,10 +1,11 @@ import dataclasses +import datetime import math import os import pytest import torch -from xdist.scheduler import LoadGroupScheduling +import xdist.scheduler from tests.utils.depends import DependencyManager @@ -15,7 +16,7 @@ 
run_test_script_for_all_models, ) -from tests.utils.model_configs import model_testing_config # isort: skip +from tests.utils.model_configs import model_testing_config, ModelTestingConfig, testing_group_enabled # isort: skip from tests.utils.utils import result_path # isort: skip @@ -25,6 +26,8 @@ def pytest_addoption(parser): group = parser.getgroup("fast_llm") group.addoption("--skip-slow", action="store_true") + group.addoption("--show-skipped", action="store_true") + group.addoption("--models", nargs="*") group.addoption( "--run-extra-slow", action="store_true", @@ -59,6 +62,7 @@ def pytest_configure(config): "markers", "extra_slow: Mark test as extra slow and skip unless --run-extra-slow is given." ) config.addinivalue_line("markers", "depends_on(name='name', on=['other_name']): marks dependencies between tests.") + config.addinivalue_line("markers", "model_testing_group(group='group'): marks model testing group.") # TODO: Spawned processes (multi-gpu, Megatron) ignore resource allocation. is_parallel = hasattr(config, "workerinput") if is_parallel: @@ -107,8 +111,11 @@ def pytest_configure(config): @pytest.hookimpl(trylast=True) -def pytest_collection_modifyitems(config, items): +def pytest_collection_modifyitems(config, items: list[pytest.Function]): global manager + skip_slow = config.getoption("--skip-slow") + skip_extra_slow = not config.getoption("--run-extra-slow") + show_skipped = config.getoption("--show-skipped") if config.getoption("--skip-slow"): skip_slow = pytest.mark.skip(reason="Skipping slow tests") @@ -121,7 +128,23 @@ def pytest_collection_modifyitems(config, items): if "extra_slow" in item.keywords: item.add_marker(skip_extra_slow) - manager = DependencyManager(items) + new_items = [] + for item in items: + if skip_slow and "slow" in item.keywords: + if show_skipped: + item.add_marker(pytest.mark.skip(reason="Skipping slow tests")) + else: + continue + elif skip_extra_slow and "extra_slow" in item.keywords: + if show_skipped: + item.add_marker(pytest.mark.skip(reason="Skipping extra-slow tests")) + else: + continue + elif not testing_group_enabled(item, skip_slow, skip_extra_slow, show_skipped): + continue + new_items.append(item) + + manager = DependencyManager(new_items) # Show the extra information if requested if config.getoption("show_dependencies"): @@ -166,4 +189,52 @@ def worker_resources(request) -> WorkerResources: def pytest_xdist_make_scheduler(config, log): # Always use grouped load balancing to handle dependencies, and make it work with `-n`. 
assert config.getvalue("dist") == "load" - return LoadGroupScheduling(config, log) + return xdist.scheduler.LoadGroupScheduling(config, log) + + +def get_all_reports(terminalreporter): + """Reports for all stages and all outcomes""" + for reports in terminalreporter.stats.values(): + for report in reports: + if isinstance(report, pytest.TestReport): + yield report + + +def resource_usage_message(report): + """The resource usage message for a report""" + return ", ".join(content for (prefix, content) in report.get_sections(f"Captured resource {report.when}")) + + +def format_duration(seconds): + """Human-readable running time message""" + if seconds < 60: + duration_string = f"{seconds:.3f} seconds" + else: + duration_string = str(datetime.timedelta(seconds=round(seconds))) + return f"running time: {duration_string}" + + +# @pytest.hookimpl(tryfirst=True) +# def pytest_runtest_makereport(item, call): +# """Report running time of a test call""" +# if call.when == "call": +# item.add_report_section( +# call.when, "resource", format_duration(call.duration) +# ) +# +# +# @pytest.hookimpl +# def pytest_terminal_summary(terminalreporter): +# """Produce a resource usage report if any test asked for it""" +# resource_reports = [ +# (report, message) +# for report in get_all_reports(terminalreporter) +# if (message := resource_usage_message(report)) +# ] +# if not resource_reports: +# return +# terminalreporter.write_sep("=", "resource usage", bold=True) +# for report, message in resource_reports: +# terminalreporter.write_line( +# f"{report.nodeid} ({report.when}) {message}" +# ) diff --git a/tests/test_checkpoint.py b/tests/test_checkpoint.py index eea3ab0e8..06f69a96b 100644 --- a/tests/test_checkpoint.py +++ b/tests/test_checkpoint.py @@ -17,12 +17,12 @@ from fast_llm.engine.multi_stage.config import FastLLMModelConfig, ShardName, StageMode from fast_llm.tools.convert import ConvertConfig from tests.utils.compare_tensor_logs import CompareConfig, compare_logged_tensor -from tests.utils.utils import requires_cuda +from tests.utils.model_configs import ModelTestingGroup _WEIGHT_SHARD_SAVE_NAME = f"{ShardName.weights}_shard" -@requires_cuda +@pytest.mark.model_testing_group(ModelTestingGroup.basic) def test_checkpoint_and_eval(run_test_script_for_all_models, model_testing_config): # A baseline config (single-gpu, bf16, flash-attn). run_test_script_for_all_models( @@ -56,6 +56,7 @@ def _compare_resume_fn(test_path: pathlib.Path, compare_path: pathlib.Path): @pytest.mark.depends_on(on=["test_checkpoint_and_eval[{model_testing_config}]"]) +@pytest.mark.model_testing_group(ModelTestingGroup.basic) def test_resume(run_test_script_for_all_models): # Resume from iteration=1 and compare outputs with the baseline run. run_test_script_for_all_models( @@ -71,6 +72,7 @@ def test_resume(run_test_script_for_all_models): @pytest.mark.depends_on(on=["test_checkpoint_and_eval[{model_testing_config}]"]) +@pytest.mark.model_testing_group(ModelTestingGroup.basic) def test_resume_frozen(run_test_script_for_all_models): # Resume with frozen mlp. No comparison. 
run_test_script_for_all_models( @@ -107,6 +109,7 @@ def convert_paths(run_test_script_base_path): @pytest.mark.depends_on(on=["test_checkpoint_and_eval[{model_testing_config}]"]) +@pytest.mark.model_testing_group(ModelTestingGroup.convert) def test_convert_distributed_to_fast_llm(model_testing_config, convert_paths): _run_conversion( ConvertConfig( @@ -124,6 +127,7 @@ def test_convert_distributed_to_fast_llm(model_testing_config, convert_paths): @pytest.mark.depends_on(on=["test_convert_distributed_to_fast_llm[{model_testing_config}]"]) +@pytest.mark.model_testing_group(ModelTestingGroup.convert) def test_convert_fast_llm_to_huggingface(model_testing_config, convert_paths): if model_testing_config.checkpoint_format is None: pytest.skip(f"Conversion not supported for {model_testing_config.name}") @@ -143,6 +147,7 @@ def test_convert_fast_llm_to_huggingface(model_testing_config, convert_paths): @pytest.mark.depends_on(on=["test_convert_fast_llm_to_huggingface[{model_testing_config}]"]) +@pytest.mark.model_testing_group(ModelTestingGroup.convert) def test_convert_huggingface_to_distributed(model_testing_config, convert_paths): _run_conversion( ConvertConfig( @@ -160,6 +165,7 @@ def test_convert_huggingface_to_distributed(model_testing_config, convert_paths) @pytest.mark.depends_on(on=["test_checkpoint_and_eval[{model_testing_config}]"]) +@pytest.mark.model_testing_group(ModelTestingGroup.convert) def test_convert_distributed_to_huggingface(model_testing_config, convert_paths): if model_testing_config.checkpoint_format is None: pytest.skip(f"Conversion not supported for {model_testing_config.name}") @@ -179,6 +185,7 @@ def test_convert_distributed_to_huggingface(model_testing_config, convert_paths) @pytest.mark.depends_on(on=["test_convert_distributed_to_huggingface[{model_testing_config}]"]) +@pytest.mark.model_testing_group(ModelTestingGroup.convert) def test_convert_huggingface_to_fast_llm(model_testing_config, convert_paths): _run_conversion( ConvertConfig( @@ -196,6 +203,7 @@ def test_convert_huggingface_to_fast_llm(model_testing_config, convert_paths): @pytest.mark.depends_on(on=["test_convert_huggingface_to_fast_llm[{model_testing_config}]"]) +@pytest.mark.model_testing_group(ModelTestingGroup.convert) def test_convert_fast_llm_to_distributed(model_testing_config, convert_paths): _run_conversion( ConvertConfig( @@ -218,6 +226,7 @@ def test_convert_fast_llm_to_distributed(model_testing_config, convert_paths): "test_convert_fast_llm_to_distributed[{model_testing_config}]", ] ) +@pytest.mark.model_testing_group(ModelTestingGroup.convert) def test_converted_distributed(convert_paths): # Compare the fast llm weights # TODO: Compare configs @@ -239,6 +248,7 @@ def test_converted_distributed(convert_paths): "test_convert_huggingface_to_fast_llm[{model_testing_config}]", ] ) +@pytest.mark.model_testing_group(ModelTestingGroup.convert) def test_converted_fast_llm(convert_paths): s0 = safetensors.torch.load_file(convert_paths["fast_llm_0"] / "model_0.safetensors") s1 = safetensors.torch.load_file(convert_paths["fast_llm_1"] / "model_0.safetensors") @@ -254,6 +264,7 @@ def test_converted_fast_llm(convert_paths): "test_convert_distributed_to_huggingface[{model_testing_config}]", ] ) +@pytest.mark.model_testing_group(ModelTestingGroup.convert) def test_converted_huggingface(convert_paths): h0 = safetensors.torch.load_file(convert_paths["huggingface_0"] / "model_0.safetensors") h1 = safetensors.torch.load_file(convert_paths["huggingface_1"] / "model_0.safetensors") @@ -272,6 +283,7 @@ def 
_compare_architectures(config_ref: FastLLMModelConfig, config_test: FastLLMM @pytest.mark.depends_on(on=["test_converted_distributed[{model_testing_config}]"]) +@pytest.mark.model_testing_group(ModelTestingGroup.convert) def test_load_pretrained_distributed_checkpoint(model_testing_config, convert_paths): config = model_testing_config.model_config_class.from_dict( yaml.safe_load((convert_paths["checkpoint"] / ".." / ".." / "config.yaml").open("r"))["model"], strict=False @@ -292,6 +304,7 @@ def test_load_pretrained_distributed_checkpoint(model_testing_config, convert_pa @pytest.mark.depends_on(on=["test_load_pretrained_distributed_checkpoint[{model_testing_config}]"]) +@pytest.mark.model_testing_group(ModelTestingGroup.convert) def test_load_converted_distributed_checkpoint(model_testing_config, convert_paths): config_ref = model_testing_config.model_config_class.from_pretrained( CheckpointLoadConfig( @@ -329,6 +342,7 @@ def test_load_converted_distributed_checkpoint(model_testing_config, convert_pat "test_load_pretrained_distributed_checkpoint[{model_testing_config}]", ] ) +@pytest.mark.model_testing_group(ModelTestingGroup.convert) def test_load_converted_fast_llm_checkpoint(model_testing_config, convert_paths): config_ref = model_testing_config.model_config_class.from_pretrained( CheckpointLoadConfig( @@ -365,6 +379,7 @@ def test_load_converted_fast_llm_checkpoint(model_testing_config, convert_paths) "test_load_pretrained_distributed_checkpoint[{model_testing_config}]", ] ) +@pytest.mark.model_testing_group(ModelTestingGroup.convert) def test_load_converted_huggingface_checkpoint(model_testing_config, convert_paths): config_ref = model_testing_config.model_config_class.from_pretrained( CheckpointLoadConfig( @@ -402,6 +417,7 @@ def test_load_converted_huggingface_checkpoint(model_testing_config, convert_pat "test_load_converted_huggingface_checkpoint[{model_testing_config}]", ] ) +@pytest.mark.model_testing_group(ModelTestingGroup.convert) def test_run_converted_model(model_testing_config, convert_paths): model_ref = model_testing_config.huggingface_model_for_causal_lm_class.from_pretrained( CheckpointLoadConfig( @@ -451,8 +467,8 @@ def test_run_converted_model(model_testing_config, convert_paths): raise ValueError(f"Comparison failed ({len(errors)} errors)") -@pytest.mark.slow @pytest.mark.depends_on(on=["test_load_converted_distributed_checkpoint[{model_testing_config}]"]) +@pytest.mark.model_testing_group(ModelTestingGroup.convert, ModelTestingGroup.distributed) def test_load_pretrained_distributed_in_dp2(run_test_script_for_all_models, convert_paths): run_test_script_for_all_models( [ @@ -467,6 +483,7 @@ def test_load_pretrained_distributed_in_dp2(run_test_script_for_all_models, conv @pytest.mark.depends_on(on=["test_load_converted_distributed_checkpoint[{model_testing_config}]"]) +@pytest.mark.model_testing_group(ModelTestingGroup.convert) def test_load_pretrained_distributed_with_config(run_test_script_for_all_models, convert_paths): run_test_script_for_all_models( [ @@ -480,6 +497,7 @@ def test_load_pretrained_distributed_with_config(run_test_script_for_all_models, @pytest.mark.depends_on(on=["test_load_pretrained_distributed_in_dp2[{model_testing_config}]"]) +@pytest.mark.model_testing_group(ModelTestingGroup.convert, ModelTestingGroup.distributed) def test_load_pretrained_in_dp2_match_checkpoint(model_testing_config, convert_paths, run_test_script_base_path): test_ckpt_path = run_test_script_base_path / "test_load_pretrained_distributed_in_dp2" / "checkpoint" / "1" 
pretrained_config_ref = CheckpointLoadConfig( @@ -523,8 +541,8 @@ def test_load_pretrained_in_dp2_match_checkpoint(model_testing_config, convert_p assert (stage_shard_test[stage_shard_ref.numel() :] == 0).all() # noqa -@pytest.mark.slow @pytest.mark.depends_on(on=["test_load_pretrained_in_dp2_match_checkpoint[{model_testing_config}]"]) +@pytest.mark.model_testing_group(ModelTestingGroup.convert, ModelTestingGroup.distributed) def test_load_distributed_checkpoint_dp2(model_testing_config, convert_paths, run_test_script_base_path): # This also tests conversion which uses `FastLLMModel.from_checkpoint` pretrained_config_ref = CheckpointLoadConfig( @@ -546,15 +564,15 @@ def test_load_distributed_checkpoint_dp2(model_testing_config, convert_paths, ru assert (weight_shard == model.get_shard(ShardName.weights)).all() -@pytest.mark.slow @pytest.mark.depends_on( on=[ "test_load_converted_fast_llm_checkpoint[{model_testing_config}]", "test_load_pretrained_in_dp2_match_checkpoint[{model_testing_config}]", ] ) -def test_load_pretrained_fast_llm_in_dp2(run_test_script, convert_paths, run_test_script_base_path): - run_test_script( +@pytest.mark.model_testing_group(ModelTestingGroup.convert, ModelTestingGroup.distributed) +def test_load_pretrained_fast_llm_in_dp2(run_test_script_for_all_models, convert_paths, run_test_script_base_path): + run_test_script_for_all_models( [ "training.checkpoint.interval=1", "training.train_iters=1", @@ -583,13 +601,13 @@ def test_load_pretrained_fast_llm_in_dp2(run_test_script, convert_paths, run_tes assert (ref_shard[name] == test_shard[name]).all() -@pytest.mark.slow @pytest.mark.depends_on( on=[ "test_load_converted_huggingface_checkpoint[{model_testing_config}]", "test_load_pretrained_in_dp2_match_checkpoint[{model_testing_config}]", ] ) +@pytest.mark.model_testing_group(ModelTestingGroup.convert, ModelTestingGroup.distributed) def test_load_pretrained_huggingface_in_dp2( run_test_script_for_all_models, model_testing_config, run_test_script_base_path, convert_paths ): diff --git a/tests/test_config.py b/tests/test_config.py index 98a4c07c6..ed5d9b8a6 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,6 +1,5 @@ import pathlib import subprocess -import unittest.mock import pytest import yaml @@ -8,9 +7,7 @@ from fast_llm.config import NoAutoValidate from fast_llm.data.dataset.gpt.config import GPTSamplingConfig from fast_llm.engine.checkpoint.config import CheckpointSaveMetadataConfig, ModelConfigType -from fast_llm.engine.config_utils.data_type import DataType from fast_llm.engine.distributed.config import DistributedConfig -from fast_llm.layers.transformer.config import TransformerConfig from fast_llm.models.auto import trainer_registry from fast_llm.models.gpt.config import GPTModelConfig, PretrainedGPTModelConfig from fast_llm.utils import Assert, check_equal_nested @@ -63,32 +60,6 @@ def test_validate_example_config(): trainer_registry["gpt"].from_dict(fast_llm_config_dict) -def test_do_use_flash_attention(): - # Create a mock DistributedConfig - mock_distributed_config = unittest.mock.Mock(spec=DistributedConfig) - - # Test case 1: use_flash_attention is True and training_dtype is float16 - config = TransformerConfig(use_flash_attention=True, window_size=None) - mock_distributed_config.training_dtype = DataType.float16 - assert config.do_use_flash_attention(mock_distributed_config) is True - - # Test case 2: use_flash_attention is False - config = TransformerConfig(use_flash_attention=False, window_size=None) - mock_distributed_config.training_dtype 
= DataType.float16 - assert config.do_use_flash_attention(mock_distributed_config) is False - - # Test case 3: use_flash_attention is True but training_dtype is not float16 or bfloat16 - config = TransformerConfig(use_flash_attention=True, window_size=None) - mock_distributed_config.training_dtype = DataType.float32 - assert config.do_use_flash_attention(mock_distributed_config) is False - - # Test case 4: use_flash_attention is False and window_size is not None - config = TransformerConfig(use_flash_attention=False, window_size=512) - mock_distributed_config.training_dtype = DataType.float32 - with pytest.raises(AssertionError): - config.do_use_flash_attention(mock_distributed_config) - - @pytest.mark.parametrize( ("cls", "default"), ((GPTSamplingConfig, {}), (GPTModelConfig, {"distributed": {"world_size": 1, "rank": 0, "local_world_size": 1}})), diff --git a/tests/test_functional.py b/tests/test_functional.py index 03a0ae8a0..9c01f0840 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -57,9 +57,15 @@ def ref_packed_get_batch_logps( @pytest.mark.slow -@pytest.mark.parametrize("batch_size", [1, 2, 4, 8]) -@pytest.mark.parametrize("seq_length", [1024, 4096, 8192]) -@pytest.mark.parametrize("vocab_size", [1000, 2000, 8000]) +@pytest.mark.parametrize( + ("batch_size", "seq_length", "vocab_size"), + ( + (2, 32, 50), + (1, 32, 50), + (2, 100, 50), + (2, 32, 200), + ), +) def test_preference_logps(batch_size, seq_length, vocab_size): random.seed(0) torch.manual_seed(0) diff --git a/tests/test_gpt_generate_and_forward.py b/tests/test_gpt_generate_and_forward.py index 4c920afde..7f0b902f8 100644 --- a/tests/test_gpt_generate_and_forward.py +++ b/tests/test_gpt_generate_and_forward.py @@ -9,6 +9,7 @@ from fast_llm.engine.schedule.runner import ScheduleRunner from fast_llm.models.gpt.config import LlamaGPTHuggingfaceCheckpointFormat, PretrainedGPTModelConfig from fast_llm.models.gpt.huggingface import HuggingfaceGPTModelForCausalLM +from tests.utils.model_configs import ModelTestingGroup from tests.utils.utils import requires_cuda @@ -44,7 +45,7 @@ def _prepare_rand_data(vocab_size, use_batch_size2: bool): def _get_hf_model(model_path: str, use_flash_attention: bool, use_bf16: bool): - hf_kwargs = {} + hf_kwargs = {"trust_remote_code": True} if use_flash_attention: hf_kwargs["attn_implementation"] = "flash_attention_2" hf_kwargs["torch_dtype"] = torch.bfloat16 @@ -237,9 +238,11 @@ def test_generate( @pytest.mark.slow -@requires_cuda +@pytest.mark.model_testing_group(ModelTestingGroup.generate) def test_export_for_generate(run_test_script_for_all_models, model_testing_config): # Not really testing, anything, but handles dependencies more easily than a fixture. 
+ if model_testing_config.checkpoint_format is None: + pytest.skip(f"Conversion not supported for {model_testing_config.name}") run_test_script_for_all_models( [ "training.train_iters=1", @@ -263,6 +266,7 @@ def test_export_for_generate(run_test_script_for_all_models, model_testing_confi (True, True, 10, 10, 10), ], ) +@pytest.mark.model_testing_group(ModelTestingGroup.generate) def test_small_generate( model_testing_config, run_test_script_base_path, @@ -315,6 +319,7 @@ def test_generate_from_model( @requires_cuda @pytest.mark.slow @pytest.mark.depends_on(on=["test_export_for_generate[{model_testing_config}]"]) +@pytest.mark.model_testing_group(ModelTestingGroup.generate) def test_small_generate_from_model(model_testing_config, run_test_script_base_path): _test_generate_from_model( run_test_script_base_path / f"test_export_for_generate/export/{model_testing_config.checkpoint_format.name}/1", @@ -363,6 +368,7 @@ def test_forward_return_hidden_states(model_path): @pytest.mark.slow @requires_cuda +@pytest.mark.model_testing_group(ModelTestingGroup.generate) @pytest.mark.depends_on(on=["test_export_for_generate[{model_testing_config}]"]) def test_small_forward_return_hidden_states(model_testing_config, run_test_script_base_path): _test_forward_return_hidden_states( diff --git a/tests/test_match_megatron.py b/tests/test_match_megatron.py index 5c0bbdaa1..9b3b591b7 100644 --- a/tests/test_match_megatron.py +++ b/tests/test_match_megatron.py @@ -2,16 +2,19 @@ from tests.utils.compare_tensor_logs import CompareConfig from tests.utils.dataset import DATASET_PREFIX +from tests.utils.model_configs import ModelTestingGroup -@pytest.mark.slow +@pytest.mark.model_testing_group(ModelTestingGroup.megatron) def test_megatron(run_test_script_for_all_models, model_testing_config): - run_test_script_for_all_models(is_megatron=True) + run_test_script_for_all_models([], is_megatron=True) -@pytest.mark.slow @pytest.mark.depends_on(on=["test_megatron[{model_testing_config}]"]) +@pytest.mark.model_testing_group(ModelTestingGroup.megatron) def test_match_megatron(run_test_script_for_all_models, model_testing_config): + if model_testing_config.megatron_args is None: + pytest.skip(f"Megatron does not support model {model_testing_config.name}") run_test_script_for_all_models( [ "model.distributed.training_dtype=fp32", @@ -28,5 +31,4 @@ def test_match_megatron(run_test_script_for_all_models, model_testing_config): ".mlp.layer_2.weight", ] ), - use_performance_args=False, ) diff --git a/tests/test_mb.py b/tests/test_mb.py index fb09dcec6..806ccebce 100644 --- a/tests/test_mb.py +++ b/tests/test_mb.py @@ -1,16 +1,18 @@ import pytest from tests.utils.compare_tensor_logs import CompareConfig +from tests.utils.model_configs import ModelTestingGroup # TODO: Compare grads with simple +@pytest.mark.model_testing_group(ModelTestingGroup.basic) def test_model_df4(run_test_script_for_all_models): # Depth-first gradient accumulation baseline. run_test_script_for_all_models(["batch.depth_first_micro_batches=4"]) -@pytest.mark.slow @pytest.mark.depends_on(on=["test_model_df4[{model_testing_config}]"]) +@pytest.mark.model_testing_group(ModelTestingGroup.distributed) def test_model_df4_z3(run_test_script_for_all_models): # Gradient accumulation with ZeRO-3. 
run_test_script_for_all_models( @@ -22,12 +24,14 @@ def test_model_df4_z3(run_test_script_for_all_models): @pytest.mark.depends_on(on=["test_model_df4[{model_testing_config}]"], scope="session") +@pytest.mark.model_testing_group(ModelTestingGroup.distributed) def test_model_bf4(run_test_script_for_all_models): # Breadth-first gradient accumulation baseline. run_test_script_for_all_models(["batch.breadth_first_micro_batches=4"], compare="test_model_df4") @pytest.mark.depends_on(on=["test_model_df4[{model_testing_config}]", "test_model_bf4[{model_testing_config}]"]) +@pytest.mark.model_testing_group(ModelTestingGroup.distributed) def test_model_bf2_df2(run_test_script_for_all_models): # Mixed gradient accumulation baseline. run_test_script_for_all_models( @@ -35,8 +39,8 @@ def test_model_bf2_df2(run_test_script_for_all_models): ) -@pytest.mark.slow @pytest.mark.depends_on(on=["test_model_bf4[{model_testing_config}]"]) +@pytest.mark.model_testing_group(ModelTestingGroup.distributed) def test_model_pp2s2_bf4(run_test_script_for_all_models): # Pipeline-parallel without tied weights. run_test_script_for_all_models( @@ -50,8 +54,8 @@ def test_model_pp2s2_bf4(run_test_script_for_all_models): ) -@pytest.mark.slow @pytest.mark.depends_on(on=["test_model_bf4[{model_testing_config}]"]) +@pytest.mark.model_testing_group(ModelTestingGroup.distributed) def test_model_pp2s1_bf4(run_test_script_for_all_models): # Pipeline-parallel with tied weights. run_test_script_for_all_models( @@ -66,8 +70,8 @@ def test_model_pp2s1_bf4(run_test_script_for_all_models): ) -@pytest.mark.slow @pytest.mark.depends_on(on=["test_model_bf4[{model_testing_config}]"]) +@pytest.mark.model_testing_group(ModelTestingGroup.distributed) def test_model_dp2_tp2_pp2s2_bf4(run_test_script_for_all_models): # Simple 3d parallelism # TODO: Test fails diff --git a/tests/test_mb_seq_first.py b/tests/test_mb_seq_first.py index 7d3cf5ad9..5a8db0b98 100644 --- a/tests/test_mb_seq_first.py +++ b/tests/test_mb_seq_first.py @@ -1,16 +1,18 @@ import pytest from tests.utils.compare_tensor_logs import CompareConfig +from tests.utils.model_configs import ModelTestingGroup # TODO: Compare grads with simple +@pytest.mark.model_testing_group(ModelTestingGroup.basic) def test_model_df4_sf(run_test_script_for_all_models): # Sequence-first gradient accumulation baseline. run_test_script_for_all_models(["batch.depth_first_micro_batches=4", "model.base_model.sequence_first=True"]) -@pytest.mark.slow @pytest.mark.depends_on(on=["test_model_df4_sf[{model_testing_config}]"]) +@pytest.mark.model_testing_group(ModelTestingGroup.distributed) def test_model_dp2_sp2_df4(run_test_script_for_all_models): # Sequence-tensor-parallel with gradient accumulation. # TODO: Compiled cross-entropy broken for this config @@ -27,9 +29,9 @@ def test_model_dp2_sp2_df4(run_test_script_for_all_models): ) -@pytest.mark.slow @pytest.mark.skip(reason="Test is broken.") @pytest.mark.depends_on(on=["test_model_df4_sf[{model_testing_config}]"]) +@pytest.mark.model_testing_group(ModelTestingGroup.distributed) def test_model_dp2_sp2_pp2s1(run_test_script_for_all_models): # 3d-parallel with sequence-tensor-parallel. 
# TODO: Compiled cross-entropy broken for this config diff --git a/tests/test_ms.py b/tests/test_ms.py index 23ef60e64..b97f84e5d 100644 --- a/tests/test_ms.py +++ b/tests/test_ms.py @@ -1,14 +1,17 @@ import pytest +from tests.utils.model_configs import ModelTestingGroup + # TODO: Compare grads with simple +@pytest.mark.model_testing_group(ModelTestingGroup.basic) def test_model_ms256(run_test_script_for_all_models): # Micro-sequence baseline run_test_script_for_all_models(["batch.micro_sequence_length=256"]) -@pytest.mark.slow @pytest.mark.depends_on(on=["test_model_ms256[{model_testing_config}]"]) +@pytest.mark.model_testing_group(ModelTestingGroup.distributed) def test_model_pp2s2_ms256(run_test_script_for_all_models): # Sequence-pipeline-parallel run_test_script_for_all_models( @@ -22,9 +25,9 @@ def test_model_pp2s2_ms256(run_test_script_for_all_models): ) -@pytest.mark.slow @pytest.mark.skip @pytest.mark.depends_on(on=["test_model_ms256[{model_testing_config}]"]) +@pytest.mark.model_testing_group(ModelTestingGroup.distributed) def test_model_dp2s2_stp2_pp2s2_ms256(run_test_script_for_all_models): # TODO: Handle this case. # Sequence-3d-parallel diff --git a/tests/test_multi_stage.py b/tests/test_multi_stage.py index 6d3861ebf..06eca6854 100644 --- a/tests/test_multi_stage.py +++ b/tests/test_multi_stage.py @@ -1,9 +1,12 @@ +import pytest + from fast_llm.engine.distributed.distributed import Distributed from fast_llm.engine.training.config import TrainerConfig from fast_llm.engine.training.trainer import Trainer from fast_llm.layers.transformer.transformer import TransformerLayer from fast_llm.tools.train import CliTrainingConfig from fast_llm.utils import Assert +from tests.utils.model_configs import ModelTestingGroup from tests.utils.utils import requires_cuda @@ -17,6 +20,7 @@ def _get_trainer_from_args(args: list[str], model_type: str = "gpt") -> Trainer: @requires_cuda +@pytest.mark.model_testing_group(ModelTestingGroup.basic) def test_frozen_weights(model_testing_config): args = model_testing_config.config_args + ["run.tensor_logs.save=False"] model_ref = _get_trainer_from_args(args)._multi_stage diff --git a/tests/test_seq_first.py b/tests/test_seq_first.py index 6e1eb07ac..66b044df3 100644 --- a/tests/test_seq_first.py +++ b/tests/test_seq_first.py @@ -1,14 +1,17 @@ import pytest +from tests.utils.model_configs import ModelTestingGroup + # TODO: Compare grads with simple +@pytest.mark.model_testing_group(ModelTestingGroup.basic) def test_model_sf(run_test_script_for_all_models): # Sequence-first baseline. run_test_script_for_all_models(["model.base_model.sequence_first=True"]) -@pytest.mark.slow @pytest.mark.depends_on(on=["test_model_sf[{model_testing_config}]"]) +@pytest.mark.model_testing_group(ModelTestingGroup.distributed) def test_model_sp2(run_test_script_for_all_models): # Sequence-tensor-parallel. 
run_test_script_for_all_models( @@ -18,8 +21,8 @@ def test_model_sp2(run_test_script_for_all_models): ) -@pytest.mark.slow @pytest.mark.depends_on(on=["test_model_sf[{model_testing_config}]"]) +@pytest.mark.model_testing_group(ModelTestingGroup.distributed) def test_model_sdp2(run_test_script_for_all_models): # Sequence-data-parallel run_test_script_for_all_models( @@ -29,12 +32,11 @@ def test_model_sdp2(run_test_script_for_all_models): ) -@pytest.mark.slow @pytest.mark.depends_on(on=["test_model_sf[{model_testing_config}]"]) +@pytest.mark.model_testing_group(ModelTestingGroup.distributed) def test_model_sp2_ce4(run_test_script_for_all_models): # Sequence-tensor-parallel with cross-entropy splits. run_test_script_for_all_models( - "test_model_sp2_ce4", [ "model.distributed.tensor_parallel=2", "model.distributed.sequence_tensor_parallel=True", diff --git a/tests/test_simple.py b/tests/test_simple.py index 8026f0127..4616942c6 100644 --- a/tests/test_simple.py +++ b/tests/test_simple.py @@ -1,6 +1,9 @@ import pytest +from tests.utils.model_configs import ModelTestingGroup + +@pytest.mark.model_testing_group(ModelTestingGroup.basic) def test_model_safe(run_test_script_for_all_models): # The safest possible config, identical to the one in test_match_megatron except for the initialization. run_test_script_for_all_models( @@ -14,20 +17,22 @@ def test_model_safe(run_test_script_for_all_models): @pytest.mark.depends_on(on=["test_model_safe[{model_testing_config}]"]) +@pytest.mark.model_testing_group(ModelTestingGroup.basic) def test_model(run_test_script_for_all_models): # A baseline config (single-gpu, bf16, flash-attn). # Also tests for multiple data loaders. run_test_script_for_all_models(["training.num_workers=2"], compare="test_model_safe") -@pytest.mark.slow @pytest.mark.depends_on(on=["test_model[{model_testing_config}]"]) +@pytest.mark.model_testing_group(ModelTestingGroup.distributed) def test_model_dp2(run_test_script_for_all_models): # Simple data-parallel. run_test_script_for_all_models([], num_gpus=2, compare="test_model") -@pytest.mark.slow +@pytest.mark.skip(reason="Flaky") +@pytest.mark.model_testing_group(ModelTestingGroup.distributed) def test_model_dp2_timeout(run_test_script_for_all_models): # Test sampling timeout # TODO: Find a better way to test this @@ -51,8 +56,8 @@ def test_model_dp2_timeout(run_test_script_for_all_models): ) -@pytest.mark.slow @pytest.mark.depends_on(on=["test_model[{model_testing_config}]"]) +@pytest.mark.model_testing_group(ModelTestingGroup.distributed) def test_model_tp2(run_test_script_for_all_models): # Simple tensor-parallel. run_test_script_for_all_models( @@ -63,6 +68,7 @@ def test_model_tp2(run_test_script_for_all_models): @pytest.mark.depends_on(on=["test_model[{model_testing_config}]"]) +@pytest.mark.model_testing_group(ModelTestingGroup.basic) def test_model_ce4(run_test_script_for_all_models): # Cross-entropy splits. run_test_script_for_all_models( @@ -71,8 +77,8 @@ def test_model_ce4(run_test_script_for_all_models): ) -@pytest.mark.slow @pytest.mark.depends_on(on=["test_model[{model_testing_config}]"]) +@pytest.mark.model_testing_group(ModelTestingGroup.distributed) def test_model_dp2_z2(run_test_script_for_all_models): # Data-parallel with zero stage 2. 
run_test_script_for_all_models( @@ -82,8 +88,8 @@ def test_model_dp2_z2(run_test_script_for_all_models): ) -@pytest.mark.slow @pytest.mark.depends_on(on=["test_model[{model_testing_config}]"]) +@pytest.mark.model_testing_group(ModelTestingGroup.distributed) def test_model_dp2_z3(run_test_script_for_all_models): # Data-parallel with zero stage 3. run_test_script_for_all_models( diff --git a/tests/utils/depends.py b/tests/utils/depends.py index 8ddb5041c..5e6bcc711 100644 --- a/tests/utils/depends.py +++ b/tests/utils/depends.py @@ -101,6 +101,10 @@ def handle_missing(self, item: pytest.Function): def _resolve_dependencies(self, item: pytest.Function): dependencies = set() unresolved = set() + + if "skip" in item.keywords: + return dependencies, unresolved + nodeid = clean_nodeid(item.nodeid) for marker in item.iter_markers(): diff --git a/tests/utils/model_configs.py b/tests/utils/model_configs.py index d0c0d070e..65a063b5a 100644 --- a/tests/utils/model_configs.py +++ b/tests/utils/model_configs.py @@ -1,4 +1,5 @@ import dataclasses +import enum import functools import os import typing @@ -21,6 +22,17 @@ _LOG_LEVEL = int(os.environ.get("LOG_LEVEL", 13)) +class ModelTestingGroup(enum.StrEnum): + basic = "basic" + megatron = "megatron" + distributed = "distributed" + convert = "convert" + generate = "generate" + + +SLOW_TESTING_GROUPS = {ModelTestingGroup.megatron, ModelTestingGroup.distributed} + + @dataclasses.dataclass(kw_only=True, frozen=True) class ModelTestingConfig: name: str = None @@ -28,6 +40,11 @@ class ModelTestingConfig: config_args: list[str] megatron_args: list[str] | None checkpoint_format: CheckpointFormat | None + # The important groups we want to test. + testing_groups: list[ModelTestingGroup] + # Other supported groups, excluded by default because they are mostly unimportant and/or redundant. + # They can be run with `--run-extra-slow`. + other_groups: list[ModelTestingGroup] @functools.cached_property def model_config_class(self): @@ -54,9 +71,15 @@ def _update_and_add_testing_config( extra_args: list[str] | None = None, megatron_args: list[str] | None = ..., checkpoint_format: CheckpointFormat | None = ..., + testing_groups: list[ModelTestingGroup], + other_groups: list[ModelTestingGroup], ): config = _MODEL_CONFIGS[old_name] - updates: dict[str, typing.Any] = {"name": new_name} + updates: dict[str, typing.Any] = { + "name": new_name, + "testing_groups": testing_groups, + "other_groups": other_groups, + } if model_type is not None: updates["model_type"] = model_type if extra_args is not None: @@ -78,6 +101,7 @@ def _update_and_add_testing_config( _MODEL_CONFIGS["gpt2"] = ModelTestingConfig( + # Tests gpt2 features (absolute embeddings, layer norm, relu activation, tied embeddings, MHA, linear biases). name="gpt2", model_type="gpt", config_args=[ @@ -97,7 +121,7 @@ def _update_and_add_testing_config( f"model.multi_stage.debug_all_param_gradients={_LOG_LEVEL}", "model.multi_stage.debug_tensor_parallel=True", "model.distributed.reproducible_init=True", - "model.distributed.timeout=10", + "model.distributed.timeout=20", "model.distributed.training_dtype=bf16", "training.train_iters=2", "training.num_workers=0", @@ -153,17 +177,32 @@ def _update_and_add_testing_config( "--transformer-impl=transformer_engine", ], checkpoint_format=None, + testing_groups=[ + ModelTestingGroup.basic, + ModelTestingGroup.megatron, + ModelTestingGroup.distributed, + ], + other_groups=[], ) _update_and_add_testing_config( + # Tests MQA. 
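A hypothetical registration sketch showing how the new testing_groups / other_groups fields drive test selection (the model name and group choices below are made up; the snippet assumes the surrounding model_configs.py module context, i.e. _update_and_add_testing_config, ModelTestingGroup and the checkpoint-format imports). A test tagged with pytest.mark.model_testing_group runs for a model only when the group is listed in that model's testing_groups (subject to --skip-slow for the slow groups) or, with --run-extra-slow, in its other_groups; testing_group_enabled filters everything else out at collection time.

_update_and_add_testing_config(
    # Hypothetical variant reusing the llama baseline; not part of the patch.
    "llama",
    "llama_example",
    extra_args=["model.base_model.transformer.rotary.type=yarn"],
    megatron_args=None,
    checkpoint_format=LlamaGPTHuggingfaceCheckpointFormat,
    # Groups tested by default for this model.
    testing_groups=[
        ModelTestingGroup.basic,
        ModelTestingGroup.convert,
    ],
    # Mostly redundant with other models; only run with --run-extra-slow.
    other_groups=[
        ModelTestingGroup.distributed,
        ModelTestingGroup.generate,
    ],
)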
"gpt2", "starcoder", extra_args=["model.base_model.transformer.head_groups=1"], megatron_args=["--group-query-attention"], checkpoint_format=None, + testing_groups=[ + ModelTestingGroup.basic, + ], + other_groups=[ + ModelTestingGroup.megatron, + ModelTestingGroup.distributed, + ], ) _update_and_add_testing_config( + # Tests intermediate between gpt2 and llama, closest converter to gpt2. "gpt2", "starcoder2", extra_args=[ @@ -177,9 +216,19 @@ def _update_and_add_testing_config( "--no-position-embedding", ], checkpoint_format=Starcoder2GPTHuggingfaceCheckpointFormat, + testing_groups=[ + ModelTestingGroup.basic, + ModelTestingGroup.convert, + ], + other_groups=[ + ModelTestingGroup.megatron, + ModelTestingGroup.distributed, + ModelTestingGroup.generate, + ], ) _update_and_add_testing_config( + # Main tested model. "starcoder2", "llama", extra_args=[ @@ -198,55 +247,108 @@ def _update_and_add_testing_config( "--untie-embeddings-and-output-weights", ], checkpoint_format=LlamaGPTHuggingfaceCheckpointFormat, + testing_groups=[ + ModelTestingGroup.basic, + ModelTestingGroup.megatron, + ModelTestingGroup.distributed, + ModelTestingGroup.convert, + ModelTestingGroup.generate, + ], + other_groups=[], ) _update_and_add_testing_config( + # Tests llama3-style rotary embeddings. "llama", "llama3", extra_args=["model.base_model.transformer.rotary.type=llama3"], # Megatron doesn't support Llama3-style Rotary Embeddings megatron_args=None, checkpoint_format=LlamaGPTHuggingfaceCheckpointFormat, + testing_groups=[ + ModelTestingGroup.basic, + ], + other_groups=[ + ModelTestingGroup.distributed, + ModelTestingGroup.convert, + ModelTestingGroup.generate, + ], ) _update_and_add_testing_config( + # Tests yarn-style rotary embeddings. "llama", "llama_yarn", extra_args=["model.base_model.transformer.rotary.type=yarn"], # Megatron doesn't support Yarn-style Rotary Embeddings megatron_args=None, checkpoint_format=LlamaGPTHuggingfaceCheckpointFormat, + testing_groups=[ + ModelTestingGroup.basic, + ], + other_groups=[ + ModelTestingGroup.distributed, + ModelTestingGroup.convert, + ModelTestingGroup.generate, + ], ) _update_and_add_testing_config( + # Tests multi-token prediction, custom HF model and converter. "llama", "llama_mtp", extra_args=["model.base_model.prediction_heads=4"], # Megatron doesn't support multi-token prediction. megatron_args=None, checkpoint_format=MTPLlamaGPTHuggingfaceCheckpointFormat, + testing_groups=[ + ModelTestingGroup.basic, + ModelTestingGroup.convert, + ModelTestingGroup.generate, + ], + other_groups=[ + ModelTestingGroup.distributed, + ], ) _update_and_add_testing_config( + # Tests partial linear biases, Qwen2 converter. "llama", "qwen2", extra_args=["model.base_model.transformer.add_linear_biases=only_attn_qkv"], # Megatron doesn't support per sub layer biases megatron_args=None, checkpoint_format=Qwen2GPTHuggingfaceCheckpointFormat, + testing_groups=[ + ModelTestingGroup.basic, + ModelTestingGroup.convert, + ], + other_groups=[ + ModelTestingGroup.distributed, + ModelTestingGroup.generate, + ], ) _update_and_add_testing_config( + # Tests sliding window attention, mistral converter. "llama", "mistral", extra_args=["model.base_model.transformer.window_size=128"], # Megatron doesn't support sliding windows. 
megatron_args=None, checkpoint_format=MistralGPTHuggingfaceCheckpointFormat, + testing_groups=[ + ModelTestingGroup.basic, + ModelTestingGroup.convert, + ModelTestingGroup.generate, + ], + other_groups=[ + ModelTestingGroup.distributed, + ], ) _update_and_add_testing_config( - # We ignore sliding windows to enable comparison with Megatron. + # Tests mixture of experts, mixtral converter. "llama", "mixtral", extra_args=[ @@ -258,19 +360,58 @@ def _update_and_add_testing_config( "--moe-router-topk=4", ], checkpoint_format=MixtralGPTHuggingfaceCheckpointFormat, + testing_groups=[ + ModelTestingGroup.basic, + ModelTestingGroup.megatron, + ModelTestingGroup.distributed, + ModelTestingGroup.convert, + ModelTestingGroup.generate, + ], + other_groups=[], ) _update_and_add_testing_config( - # We ignore sliding windows to enable comparison with Megatron. + # Tests hybrid ssm, llamba converter. + # TODO: Conversion fails. "llama", "llamba", model_type="hybrid_ssm", extra_args=["model.base_model.hybrid_block_layout=['t','m']"], megatron_args=None, checkpoint_format=LLambaHuggingfaceCheckpointFormat, + testing_groups=[ + ModelTestingGroup.basic, + ModelTestingGroup.distributed, + ModelTestingGroup.convert, + ModelTestingGroup.generate, + ], + other_groups=[], ) @pytest.fixture(scope="session", params=_MODEL_CONFIGS.keys()) def model_testing_config(request) -> ModelTestingConfig: return _MODEL_CONFIGS[request.param] + + +def testing_group_enabled(item: pytest.Function, skip_slow: bool, skip_extra_slow: bool, show_skipped: bool) -> bool: + if "model_testing_group" in item.keywords: + assert "model_testing_config" in item.callspec.params, item.nodeid + groups: tuple[ModelTestingGroup] = item.keywords["model_testing_group"].args + model_testing_config = item.callspec.params["model_testing_config"] + model_config = _MODEL_CONFIGS[model_testing_config] + for group in groups: + if group in model_config.testing_groups and not (skip_slow and group in SLOW_TESTING_GROUPS): + pass + elif group in model_config.other_groups and not skip_extra_slow: + pass + elif show_skipped: + item.add_marker( + pytest.mark.skip(reason=f"Skipping testing group {group} for model {model_testing_config}.") + ) + else: + return False + elif hasattr(item, "callspec"): + assert "model_testing_config" not in item.callspec.params, item.nodeid + + return True diff --git a/tests/utils/run_test_script.py b/tests/utils/run_test_script.py index c11d3f3b5..26666df88 100644 --- a/tests/utils/run_test_script.py +++ b/tests/utils/run_test_script.py @@ -69,7 +69,7 @@ def do_run_test_script( if num_gpus == 1 and not is_megatron: CliTrainingConfig.parse_and_run(args) else: - completed_proc = subprocess.run(command, env=env, timeout=60) + completed_proc = subprocess.run(command, env=env, timeout=120) if completed_proc.returncode: raise RuntimeError(f"Process failed with return code {completed_proc.returncode}") if compare_path is not None and do_compare: From 8c64f03e3ab657c1a857cca4743c5f6962674184 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Mon, 9 Jun 2025 14:17:29 -0400 Subject: [PATCH 05/43] fix --- tests/test_match_megatron.py | 20 ++++++++++++-------- tests/utils/model_configs.py | 4 +++- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/tests/test_match_megatron.py b/tests/test_match_megatron.py index 9b3b591b7..4f82d5753 100644 --- a/tests/test_match_megatron.py +++ b/tests/test_match_megatron.py @@ -15,6 +15,17 @@ def test_megatron(run_test_script_for_all_models, model_testing_config): def 
test_match_megatron(run_test_script_for_all_models, model_testing_config): if model_testing_config.megatron_args is None: pytest.skip(f"Megatron does not support model {model_testing_config.name}") + + ignore_tensors = [ + ".self_attn.query_key_value.", + ".self_attn.query.", + ".self_attn.key_value.", + ".mlp.layer_2.weight", + ".mlp.experts.", + ] + if model_testing_config.name == "mixtral": + ignore_tensors.extend([".mlp.experts.", ".mlp.layer_1.weight"]) + run_test_script_for_all_models( [ "model.distributed.training_dtype=fp32", @@ -23,12 +34,5 @@ def test_match_megatron(run_test_script_for_all_models, model_testing_config): "model.base_model.use_megatron_initialization=True", ], compare="test_megatron", - config=CompareConfig( - ignore_tensors=[ - ".self_attn.query_key_value.", - ".self_attn.query.", - ".self_attn.key_value.", - ".mlp.layer_2.weight", - ] - ), + config=CompareConfig(ignore_tensors=ignore_tensors), ) diff --git a/tests/utils/model_configs.py b/tests/utils/model_configs.py index 65a063b5a..a444307e1 100644 --- a/tests/utils/model_configs.py +++ b/tests/utils/model_configs.py @@ -108,7 +108,7 @@ def _update_and_add_testing_config( "training.logs.interval=1", "run.tensor_logs.save=True", "run.tensor_logs.show=False", - # "model.base_model.max_position_embeddings=512", + "model.base_model.max_position_embeddings=512", "model.base_model.transformer.num_layers=2", "model.base_model.transformer.hidden_size=256", "model.base_model.transformer.num_attention_heads=8", @@ -208,6 +208,8 @@ def _update_and_add_testing_config( extra_args=[ "model.base_model.transformer.head_groups=4", "model.base_model.transformer.rotary.type=default", + # Unused, but prevents issues with conversion tests. + "model.base_model.max_position_embeddings=2048", ], megatron_args=[ "--group-query-attention", From c0f648cdbb97b902e4c9fc96636856ea17ea41c1 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Tue, 10 Jun 2025 12:48:02 -0400 Subject: [PATCH 06/43] fixes --- fast_llm/layers/transformer/transformer.py | 2 +- tests/test_mb.py | 7 +++- tests/test_multi_stage.py | 37 ++++++++++++++-------- tests/utils/model_configs.py | 24 ++++++++++---- 4 files changed, 48 insertions(+), 22 deletions(-) diff --git a/fast_llm/layers/transformer/transformer.py b/fast_llm/layers/transformer/transformer.py index 40dd2e00e..115629d6b 100644 --- a/fast_llm/layers/transformer/transformer.py +++ b/fast_llm/layers/transformer/transformer.py @@ -20,7 +20,7 @@ class BaseBlock(Layer, abc.ABC): """ - A transformer-like decoder base block block with abstract mixer. + A transformer-like decoder base block with abstract mixer. 
""" _mixer_module_name = "self_attn" diff --git a/tests/test_mb.py b/tests/test_mb.py index 806ccebce..781de6e85 100644 --- a/tests/test_mb.py +++ b/tests/test_mb.py @@ -66,7 +66,12 @@ def test_model_pp2s1_bf4(run_test_script_for_all_models): ], num_gpus=2, compare="test_model_df4", - config=CompareConfig(ignore_duplicates=["layers.0.word_embeddings_weight"]), + config=CompareConfig( + ignore_duplicates=[ + "layers.0.word_embeddings_weight", + "layers.0.position_embeddings_weight", + ] + ), ) diff --git a/tests/test_multi_stage.py b/tests/test_multi_stage.py index 06eca6854..8753cf482 100644 --- a/tests/test_multi_stage.py +++ b/tests/test_multi_stage.py @@ -3,6 +3,7 @@ from fast_llm.engine.distributed.distributed import Distributed from fast_llm.engine.training.config import TrainerConfig from fast_llm.engine.training.trainer import Trainer +from fast_llm.layers.ssm.llamba_block import LlambaBlock from fast_llm.layers.transformer.transformer import TransformerLayer from fast_llm.tools.train import CliTrainingConfig from fast_llm.utils import Assert @@ -23,31 +24,39 @@ def _get_trainer_from_args(args: list[str], model_type: str = "gpt") -> Trainer: @pytest.mark.model_testing_group(ModelTestingGroup.basic) def test_frozen_weights(model_testing_config): args = model_testing_config.config_args + ["run.tensor_logs.save=False"] - model_ref = _get_trainer_from_args(args)._multi_stage - model_frozen = _get_trainer_from_args(args + ["model.base_model.transformer.mlp_lr_scale=[0]"])._multi_stage + model_ref = _get_trainer_from_args(args, model_testing_config.model_type)._multi_stage + model_frozen = _get_trainer_from_args( + args + + [ + f"model.base_model.transformer.mlp_lr_scale={[0]*model_ref.config.base_model.transformer.num_experts}", + f"model.base_model.transformer.router_lr_scale=0", + ], + model_testing_config.model_type, + )._multi_stage Assert.eq( model_ref._num_stages, model_frozen._num_stages, ) - diff_by_layer = [ - sum(p.numel() for p in layer.mlp.parameters()) if isinstance(layer, TransformerLayer) else 0 + frozen_parameter_counts = [ + sum(p.numel() for p in layer.mlp.parameters()) if isinstance(layer, (TransformerLayer, LlambaBlock)) else 0 for layer in model_ref.base_model.layers ] - assert all((diff_by_layer[i] == 0) == (i in (0, len(diff_by_layer) - 1)) for i in range(len(diff_by_layer))) - total_diff = sum(diff_by_layer) - for weight_buffer_ref, weight_buffer_frozen in zip( model_ref._weight_buffers, model_frozen._weight_buffers, strict=True ): - assert weight_buffer_ref.numel() == weight_buffer_frozen.numel() + Assert.eq(weight_buffer_ref.numel() == weight_buffer_frozen.numel()) - for grad_buffer_ref, grad_buffer_frozen, diff in zip( - model_ref._grad_buffers, model_frozen._grad_buffers, diff_by_layer, strict=True + for grad_buffer_ref, grad_buffer_frozen, frozen_parameter_count in zip( + model_ref._grad_buffers, model_frozen._grad_buffers, frozen_parameter_counts, strict=True ): - Assert.eq(grad_buffer_ref.numel() - grad_buffer_frozen.numel() == diff) + Assert.eq(grad_buffer_ref.numel() - grad_buffer_frozen.numel() == frozen_parameter_count) - for shard_name, shard_diff in zip( - model_ref._shard_names, [0] + [total_diff] * (len(model_ref._all_shard_names) - 1), strict=True + for shard_name, shard_frozen_count in zip( + model_ref._shard_names, + [0] + [sum(frozen_parameter_counts)] * (len(model_ref._all_shard_names) - 1), + strict=True, ): - Assert.eq(model_ref.get_shard(shard_name).numel() - model_frozen.get_shard(shard_name).numel(), shard_diff) + Assert.eq( + 
model_ref.get_shard(shard_name).numel() - model_frozen.get_shard(shard_name).numel(), shard_frozen_count + ) diff --git a/tests/utils/model_configs.py b/tests/utils/model_configs.py index a444307e1..3f989f584 100644 --- a/tests/utils/model_configs.py +++ b/tests/utils/model_configs.py @@ -222,6 +222,7 @@ def _update_and_add_testing_config( ModelTestingGroup.basic, ModelTestingGroup.convert, ], + # TODO: Bring back `generate` to `testing_groups` when stable. other_groups=[ ModelTestingGroup.megatron, ModelTestingGroup.distributed, @@ -254,9 +255,11 @@ def _update_and_add_testing_config( ModelTestingGroup.megatron, ModelTestingGroup.distributed, ModelTestingGroup.convert, + ], + # TODO: Bring back `generate` to `testing_groups` when stable. + other_groups=[ ModelTestingGroup.generate, ], - other_groups=[], ) _update_and_add_testing_config( @@ -270,6 +273,7 @@ def _update_and_add_testing_config( testing_groups=[ ModelTestingGroup.basic, ], + # TODO: Bring back `generate` to `testing_groups` when stable. other_groups=[ ModelTestingGroup.distributed, ModelTestingGroup.convert, @@ -288,6 +292,7 @@ def _update_and_add_testing_config( testing_groups=[ ModelTestingGroup.basic, ], + # TODO: Bring back `generate` to `testing_groups` when stable. other_groups=[ ModelTestingGroup.distributed, ModelTestingGroup.convert, @@ -306,10 +311,11 @@ def _update_and_add_testing_config( testing_groups=[ ModelTestingGroup.basic, ModelTestingGroup.convert, - ModelTestingGroup.generate, ], + # TODO: Bring back `generate` to `testing_groups` when stable. other_groups=[ ModelTestingGroup.distributed, + ModelTestingGroup.generate, ], ) @@ -325,6 +331,7 @@ def _update_and_add_testing_config( ModelTestingGroup.basic, ModelTestingGroup.convert, ], + # TODO: Bring back `generate` to `testing_groups` when stable. other_groups=[ ModelTestingGroup.distributed, ModelTestingGroup.generate, @@ -342,10 +349,11 @@ def _update_and_add_testing_config( testing_groups=[ ModelTestingGroup.basic, ModelTestingGroup.convert, - ModelTestingGroup.generate, ], + # TODO: Bring back `generate` to `testing_groups` when stable. other_groups=[ ModelTestingGroup.distributed, + ModelTestingGroup.generate, ], ) @@ -367,14 +375,15 @@ def _update_and_add_testing_config( ModelTestingGroup.megatron, ModelTestingGroup.distributed, ModelTestingGroup.convert, + ], + # TODO: Bring back `generate` to `testing_groups` when stable. + other_groups=[ ModelTestingGroup.generate, ], - other_groups=[], ) _update_and_add_testing_config( # Tests hybrid ssm, llamba converter. - # TODO: Conversion fails. "llama", "llamba", model_type="hybrid_ssm", @@ -383,11 +392,14 @@ def _update_and_add_testing_config( checkpoint_format=LLambaHuggingfaceCheckpointFormat, testing_groups=[ ModelTestingGroup.basic, + ], + # TODO: Bring back `generate` to `testing_groups` when stable. 
+ other_groups=[ + # TODO: Fix and bring these back to `testing_groups` ModelTestingGroup.distributed, ModelTestingGroup.convert, ModelTestingGroup.generate, ], - other_groups=[], ) From e92c311845a92d5de67aac5a5c2ab0ae9d759849 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Wed, 11 Jun 2025 09:02:07 -0400 Subject: [PATCH 07/43] stuff --- tests/conftest.py | 111 ++++++++++++++++++++++------------------- tests/utils/depends.py | 4 +- 2 files changed, 62 insertions(+), 53 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 829e1696f..b688bb548 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,7 @@ import dataclasses -import datetime +import gc +import json +import logging import math import os @@ -27,6 +29,7 @@ def pytest_addoption(parser): group = parser.getgroup("fast_llm") group.addoption("--skip-slow", action="store_true") group.addoption("--show-skipped", action="store_true") + group.addoption("--show-gpu-memory", type=int, default=10) group.addoption("--models", nargs="*") group.addoption( "--run-extra-slow", @@ -166,9 +169,63 @@ def pytest_collection_modifyitems(config, items: list[pytest.Function]): @pytest.hookimpl(tryfirst=True, hookwrapper=True) -def pytest_runtest_makereport(item: pytest.Function, call): +def pytest_runtest_makereport(item: pytest.Function, call: pytest.CallInfo): outcome = yield - manager.register_result(item, outcome.get_result()) + result = outcome.get_result() + manager.register_result(item, result) + + # Measure GPU memory usage. (TODO: This excludes child processes) + if call.when == "call" and torch.cuda.is_available(): + torch._C._cuda_clearCublasWorkspaces() + gc.collect() + # This also frees memory for other processes. + torch.cuda.empty_cache() + item.add_report_section( + call.when, + "resource usage", + json.dumps( + { + "duration": call.duration, + "max_memory_reserved": torch.cuda.max_memory_reserved(), + "max_memory_allocated": torch.cuda.max_memory_allocated(), + "memory_reserved": torch.cuda.memory_reserved(), + "memory_allocated": torch.cuda.memory_allocated(), + } + ), + ) + torch.cuda.reset_peak_memory_stats() + + +@pytest.hookimpl +def pytest_terminal_summary(terminalreporter): + resource_reports = {} + for reports in terminalreporter.stats.values(): + for report in reports: + if isinstance(report, pytest.TestReport): + for _, section in report.get_sections("Captured resource usage"): + if report.nodeid in resource_reports: + logging.error(f"Duplicate resource report for {report.nodeid}") + resource_reports[report.nodeid] = json.loads(section) + + if not resource_reports: + return + + terminalreporter.write_sep("=", "Highest gpu memory usage", bold=True) + sorted_nodeids = sorted( + resource_reports.keys(), + key=lambda nodeid: resource_reports[nodeid]["max_memory_reserved"], + reverse=True, + ) + logging.error(f"sorted_nodeids {sorted_nodeids}") + for nodeid in sorted_nodeids[: terminalreporter.config.getoption("--show-gpu-memory")]: + terminalreporter.write_line( + f"{nodeid}:\n " + f"Max Reserved {resource_reports[nodeid]["max_memory_reserved"] / 1e6:.0f} MB | " + f"Max Allocated {resource_reports[nodeid]["max_memory_allocated"] / 1e6:.0f} MB | " + f"End Reserved {resource_reports[nodeid]["memory_reserved"] / 1e6:.0f} MB | " + f"End Allocated {resource_reports[nodeid]["memory_allocated"] / 1e6:.0f} MB | " + f"Duration {resource_reports[nodeid]["duration"]:.2f}" + ) def pytest_runtest_call(item: pytest.Function): @@ -190,51 +247,3 @@ def pytest_xdist_make_scheduler(config, log): # Always use grouped 
load balancing to handle dependencies, and make it work with `-n`. assert config.getvalue("dist") == "load" return xdist.scheduler.LoadGroupScheduling(config, log) - - -def get_all_reports(terminalreporter): - """Reports for all stages and all outcomes""" - for reports in terminalreporter.stats.values(): - for report in reports: - if isinstance(report, pytest.TestReport): - yield report - - -def resource_usage_message(report): - """The resource usage message for a report""" - return ", ".join(content for (prefix, content) in report.get_sections(f"Captured resource {report.when}")) - - -def format_duration(seconds): - """Human-readable running time message""" - if seconds < 60: - duration_string = f"{seconds:.3f} seconds" - else: - duration_string = str(datetime.timedelta(seconds=round(seconds))) - return f"running time: {duration_string}" - - -# @pytest.hookimpl(tryfirst=True) -# def pytest_runtest_makereport(item, call): -# """Report running time of a test call""" -# if call.when == "call": -# item.add_report_section( -# call.when, "resource", format_duration(call.duration) -# ) -# -# -# @pytest.hookimpl -# def pytest_terminal_summary(terminalreporter): -# """Produce a resource usage report if any test asked for it""" -# resource_reports = [ -# (report, message) -# for report in get_all_reports(terminalreporter) -# if (message := resource_usage_message(report)) -# ] -# if not resource_reports: -# return -# terminalreporter.write_sep("=", "resource usage", bold=True) -# for report, message in resource_reports: -# terminalreporter.write_line( -# f"{report.nodeid} ({report.when}) {message}" -# ) diff --git a/tests/utils/depends.py b/tests/utils/depends.py index 5e6bcc711..3fbb8f398 100644 --- a/tests/utils/depends.py +++ b/tests/utils/depends.py @@ -92,11 +92,11 @@ def handle_missing(self, item: pytest.Function): pytest.fail(f'{item.nodeid} depends on {", ".join(missing)}, which was not found', False) if failed := [ - dependency + f"{dependency} ({", ".join(f"{key}: {value}" for key, value in self._results[dependency].items()) if self._results[dependency] else "missing"})" for dependency in self._dependencies[nodeid] if not all(self._results[dependency].get(step, None) == "passed" for step in ("setup", "call", "teardown")) ]: - pytest.skip(f'{item.nodeid} depends on failed {", ".join(failed)}') + pytest.skip(f'{item.nodeid} depends on {", ".join(failed)}') def _resolve_dependencies(self, item: pytest.Function): dependencies = set() From b877fb27604be66c9ca87de11a88524e6cc5d7f9 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Wed, 11 Jun 2025 09:03:59 -0400 Subject: [PATCH 08/43] stuff --- Dockerfile | 9 ++++++++- setup.cfg | 11 +++++------ 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/Dockerfile b/Dockerfile index 8c2efa85e..983d785ea 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ # syntax=docker/dockerfile:1.7-labs -FROM nvcr.io/nvidia/pytorch:24.11-py3 +FROM nvcr.io/nvidia/pytorch:25.05-py3 # Install dependencies. RUN apt-get update \ @@ -24,6 +24,13 @@ RUN mkdir -m 777 /app/Megatron-LM /app/examples /app/fast_llm /app/tests /app/to /usr/local/lib/python3.12/dist-packages \ /usr/local/lib/python3.12/dist-packages/__pycache__ +# The base image enforces versions for things like pytest for no good reason. +ENV PIP_CONSTRAINT="" +# There is no pre-build mamba image for pytorch 2.8, we build it before the rest to avoid rebuilds. 
+# We need to compile from the repo because of https://github.com/state-spaces/mamba/issues/720 +# We set the number of workers to avoid OOM when compiling on laptop. (TODO: Can we make it configurable?) +RUN MAX_JOBS=4 pip install --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4" + # Copy dependency files with universal write permissions for all users. COPY --chmod=777 setup.py setup.cfg pyproject.toml ./ COPY --chmod=777 ./fast_llm/__init__.py fast_llm/ diff --git a/setup.cfg b/setup.cfg index 381225bf8..fac372eb8 100644 --- a/setup.cfg +++ b/setup.cfg @@ -17,13 +17,13 @@ install_requires = # FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE FLASH_ATTENTION_FORCE_BUILD=TRUE pip install -e ".[CORE]" --no-build-isolation CORE = # Available through the nvidia base image - torch>=2.5.0 + torch>=2.6.0 # Numpy major needs to match torch - numpy>=1.24.4,<2.0.0 + numpy>=1.26.4,<2.0.0 # Used for checkpoints safetensors>=0.4.4 # Update the base image (version fixed to ensure there is a wheel for the base image), may need --no-build-isolation - flash-attn==2.7.2.post1 + flash-attn==2.7.3 mamba_ssm[causal-conv1d]==2.2.4 @@ -41,17 +41,16 @@ OPTIONAL = omegaconf>=2.3.0 # Miscellaneous requests>=2.32.3 - tqdm>=4.66.3 + tqdm>=4.67.1 DEV = # Pre-commit git hook pre-commit>=4.0.1 # Required for testing pytest>=8.3.2 - pytest-depends>=1.0.1 pytest-xdist>=3.6.1 # Somehow needed for Megatron to work with base image 24.11 - setuptools>=75.6.0 + setuptools>=78.1.1 # Required for building the documentation DOCS = From 907aef09ad944a3741ff184f36923c7cd7bb84af Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Wed, 11 Jun 2025 09:45:29 -0400 Subject: [PATCH 09/43] attempt --- Dockerfile | 2 +- setup.cfg | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 983d785ea..ae6625d07 100644 --- a/Dockerfile +++ b/Dockerfile @@ -29,7 +29,7 @@ ENV PIP_CONSTRAINT="" # There is no pre-build mamba image for pytorch 2.8, we build it before the rest to avoid rebuilds. # We need to compile from the repo because of https://github.com/state-spaces/mamba/issues/720 # We set the number of workers to avoid OOM when compiling on laptop. (TODO: Can we make it configurable?) -RUN MAX_JOBS=4 pip install --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4" +RUN MAX_JOBS=4 pip install --no-build-isolation "mamba_ssm[causal-conv1d]@git+https://github.com/state-spaces/mamba@v2.2.4" # Copy dependency files with universal write permissions for all users. COPY --chmod=777 setup.py setup.cfg pyproject.toml ./ diff --git a/setup.cfg b/setup.cfg index fac372eb8..c0a7d57b6 100644 --- a/setup.cfg +++ b/setup.cfg @@ -17,14 +17,15 @@ install_requires = # FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE FLASH_ATTENTION_FORCE_BUILD=TRUE pip install -e ".[CORE]" --no-build-isolation CORE = # Available through the nvidia base image - torch>=2.6.0 + torch>=2.7.0 # Numpy major needs to match torch numpy>=1.26.4,<2.0.0 # Used for checkpoints safetensors>=0.4.4 # Update the base image (version fixed to ensure there is a wheel for the base image), may need --no-build-isolation flash-attn==2.7.3 - mamba_ssm[causal-conv1d]==2.2.4 + # mamba_ssm[causal-conv1d]=2.2.4 # Removed here because we need to compile from github. + mamba_ssm[causal-conv1d]@git+https://github.com/state-spaces/mamba@v2.2.4 # Required for some optional features and tools. 
@@ -48,6 +49,7 @@ DEV = pre-commit>=4.0.1 # Required for testing pytest>=8.3.2 + pytest-depends>=1.0.1 pytest-xdist>=3.6.1 # Somehow needed for Megatron to work with base image 24.11 setuptools>=78.1.1 From 1340903d5b31c8f1fc0c6afb9171b6f119f3c7a4 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Wed, 11 Jun 2025 11:56:45 -0400 Subject: [PATCH 10/43] attempt --- Dockerfile | 4 ++-- setup.cfg | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index ae6625d07..05c3870c5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -27,10 +27,10 @@ RUN mkdir -m 777 /app/Megatron-LM /app/examples /app/fast_llm /app/tests /app/to # The base image enforces versions for things like pytest for no good reason. ENV PIP_CONSTRAINT="" # There is no pre-build mamba image for pytorch 2.8, we build it before the rest to avoid rebuilds. -# We need to compile from the repo because of https://github.com/state-spaces/mamba/issues/720 +# We need to compile from the repo because of https://github.com/state-spaces/mamba/issues/720 (same for causal-conv1d) # We set the number of workers to avoid OOM when compiling on laptop. (TODO: Can we make it configurable?) +RUN MAX_JOBS=4 pip install --no-build-isolation "causal-conv1d@git+https://github.com/Dao-AILab/causal-conv1d.git@v1.5.0.post8" RUN MAX_JOBS=4 pip install --no-build-isolation "mamba_ssm[causal-conv1d]@git+https://github.com/state-spaces/mamba@v2.2.4" - # Copy dependency files with universal write permissions for all users. COPY --chmod=777 setup.py setup.cfg pyproject.toml ./ COPY --chmod=777 ./fast_llm/__init__.py fast_llm/ diff --git a/setup.cfg b/setup.cfg index c0a7d57b6..3345ff73a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -24,8 +24,7 @@ CORE = safetensors>=0.4.4 # Update the base image (version fixed to ensure there is a wheel for the base image), may need --no-build-isolation flash-attn==2.7.3 - # mamba_ssm[causal-conv1d]=2.2.4 # Removed here because we need to compile from github. - mamba_ssm[causal-conv1d]@git+https://github.com/state-spaces/mamba@v2.2.4 + mamba_ssm[causal-conv1d]==2.2.4 # Required for some optional features and tools. 
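Not part of the patch series itself, but as a rough sketch of how the source-built SSM kernels from the Dockerfile changes above could be sanity-checked before running the full test suite (assuming the `mamba_ssm.Mamba` module API and an available CUDA device), something like the following would do:

import torch
from mamba_ssm import Mamba  # compiled from source in the Dockerfile above

def smoke_test_mamba(d_model: int = 64, seq_len: int = 32) -> None:
    # Hypothetical smoke check (not part of the patch series): a tiny forward/backward
    # pass confirms the compiled selective-scan and causal-conv1d kernels load and run.
    layer = Mamba(d_model=d_model).to("cuda")
    x = torch.randn(2, seq_len, d_model, device="cuda", requires_grad=True)
    y = layer(x)
    assert y.shape == x.shape
    y.sum().backward()
    assert x.grad is not None

if __name__ == "__main__":
    smoke_test_mamba()

A check along these lines would surface a broken causal-conv1d or mamba_ssm build early, before the slower hybrid-SSM model tests import the kernels indirectly.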
From 8aed0a3e3b99edf44391f22215f69b72f640bff6 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Wed, 11 Jun 2025 18:27:06 -0400 Subject: [PATCH 11/43] Cleanup tests --- fast_llm/logging.py | 7 +- tests/conftest.py | 23 +++- tests/layers/test_lm_head.py | 86 +++++-------- tests/test_functional.py | 6 +- tests/test_mtp.py | 204 ----------------------------- tests/test_ssms.py | 241 ++--------------------------------- tests/utils/model_configs.py | 19 +++ tests/utils/utils.py | 71 +++++------ 8 files changed, 123 insertions(+), 534 deletions(-) delete mode 100644 tests/test_mtp.py diff --git a/fast_llm/logging.py b/fast_llm/logging.py index ffeb56f62..9c791ba64 100644 --- a/fast_llm/logging.py +++ b/fast_llm/logging.py @@ -323,16 +323,19 @@ def log_generator[ return log(f"{name} {tensor.view(dtype=torch.int64)[-8:].tolist()}", log_fn=log_fn) +_global_max_allocated = 0 _global_max_reserved = 0 def get_memory_usage_mib(reset_stats: bool = True, relative_to: dict[str, int] | None = None) -> dict[str, float]: - global _global_max_reserved + global _global_max_allocated, _global_max_reserved + max_allocated = torch.cuda.memory_allocated() / 2**20 max_reserved = torch.cuda.max_memory_reserved() / 2**20 + _global_max_allocated = max(max_allocated, _global_max_allocated) _global_max_reserved = max(max_reserved, _global_max_reserved) out = { "allocated": torch.cuda.memory_allocated() / 2**20, - "max_allocated": torch.cuda.max_memory_allocated() / 2**20, + "max_allocated": max_allocated, "reserved": torch.cuda.memory_reserved() / 2**20, "max_reserved": max_reserved, "global_max_reserved": _global_max_reserved, diff --git a/tests/conftest.py b/tests/conftest.py index b688bb548..cd4cc1d1a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -9,6 +9,7 @@ import torch import xdist.scheduler +import fast_llm.logging from tests.utils.depends import DependencyManager # Make fixtures available globally without import @@ -176,9 +177,14 @@ def pytest_runtest_makereport(item: pytest.Function, call: pytest.CallInfo): # Measure GPU memory usage. (TODO: This excludes child processes) if call.when == "call" and torch.cuda.is_available(): + # Free memory for more accurate reporting, and to reduce OOM risk with lots of workers. + # Cublas workspace can unnecessarily keep 100s of MBs of reserved memory. torch._C._cuda_clearCublasWorkspaces() - gc.collect() - # This also frees memory for other processes. + # Lots of tensors tend to stay allocated until the next garbage collection. + # Collect only if the remaining memory is significant enough since it's costly. + if torch.cuda.memory_allocated() > 1e7: + gc.collect() + # Actually free the memory. torch.cuda.empty_cache() item.add_report_section( call.when, @@ -186,14 +192,23 @@ def pytest_runtest_makereport(item: pytest.Function, call: pytest.CallInfo): json.dumps( { "duration": call.duration, - "max_memory_reserved": torch.cuda.max_memory_reserved(), - "max_memory_allocated": torch.cuda.max_memory_allocated(), + # Relevant value for OOM risk. Also look at global max since fast-llm resets stats. + "max_memory_reserved": max( + torch.cuda.max_memory_reserved(), fast_llm.logging._global_max_reserved + ), + # Actual memory usage from the test. + "max_memory_allocated": max( + torch.cuda.max_memory_allocated(), fast_llm.logging._global_max_allocated + ), "memory_reserved": torch.cuda.memory_reserved(), "memory_allocated": torch.cuda.memory_allocated(), } ), ) torch.cuda.reset_peak_memory_stats() + # Reset global stats for next test. 
+ fast_llm.logging._global_max_reserved = 0 + fast_llm.logging._global_max_allocated = 0 @pytest.hookimpl diff --git a/tests/layers/test_lm_head.py b/tests/layers/test_lm_head.py index 95da48e7e..cad95e539 100644 --- a/tests/layers/test_lm_head.py +++ b/tests/layers/test_lm_head.py @@ -5,21 +5,15 @@ from fast_llm.config import UpdateType from fast_llm.engine.config_utils.data_type import DataType -from fast_llm.engine.config_utils.tensor_space import TensorSpace -from fast_llm.engine.distributed.config import DistributedConfig -from fast_llm.engine.distributed.distributed import Distributed -from fast_llm.engine.multi_stage.config import StageConfig -from fast_llm.engine.multi_stage.stage import Stage from fast_llm.functional.config import CrossEntropyImpl from fast_llm.layers.common.config import NormalizationType from fast_llm.layers.language_model.config import LanguageModelKwargs from fast_llm.layers.language_model.embedding import WORD_EMBEDDINGS_WEIGHT from fast_llm.layers.language_model.head import OUTPUT_WEIGHTS, LanguageModelHead from fast_llm.layers.transformer.config import TransformerKwargs -from fast_llm.models.gpt.config import GPTBaseModelConfig -from fast_llm.models.gpt.model import GPTBaseModel +from fast_llm.models.gpt.config import GPTModelConfig from fast_llm.utils import Assert -from tests.utils.utils import requires_cuda +from tests.utils.utils import get_base_model, get_stage, requires_cuda def _lm_head( @@ -88,44 +82,41 @@ def test_lm_head( distributed_config_dict: dict[str, typing.Any], loss_masking: bool, ): - config = GPTBaseModelConfig.from_dict( + config = GPTModelConfig.from_dict( { - "transformer": { - "normalization": {"type": NormalizationType.rms_norm}, - "hidden_size": HIDDEN_SIZE, - "num_layers": 0, + "base_model": { + "transformer": { + "normalization": {"type": NormalizationType.rms_norm}, + "hidden_size": HIDDEN_SIZE, + "num_layers": 0, + }, + "vocab_size": VOCAB_SIZE, + "cross_entropy_impl": cross_entropy_impl, }, - "vocab_size": VOCAB_SIZE, - "cross_entropy_impl": cross_entropy_impl, + "distributed": distributed_config_dict, }, config_dict, update_type=UpdateType.update, ) - distributed_config = DistributedConfig.from_dict(distributed_config_dict) - distributed = Distributed(distributed_config) - tensor_space = TensorSpace(distributed_config) - config.setup_tensor_space(tensor_space) - tensor_space.setup(distributed) - model = GPTBaseModel(config, distributed_config) - model.setup(distributed) + model, distributed = get_base_model(config) - sequence_first = config.sequence_first or ( - config.cross_entropy_splits is not None and config.cross_entropy_splits > 1 + sequence_first = config.base_model.sequence_first or ( + config.base_model.cross_entropy_splits is not None and config.base_model.cross_entropy_splits > 1 ) input_ = torch.randn( (SEQUENCE_LENGTH, BATCH_SIZE, HIDDEN_SIZE) if sequence_first else (BATCH_SIZE, SEQUENCE_LENGTH, HIDDEN_SIZE), dtype=( - distributed_config.optimization_dtype.torch - if config.transformer.full_precision_residual - else distributed_config.training_dtype.torch + config.distributed.optimization_dtype.torch + if config.base_model.transformer.full_precision_residual + else config.distributed.training_dtype.torch ), device=distributed.device, requires_grad=True, ) label_shape = ( - (SEQUENCE_LENGTH + config.prediction_heads - 1, BATCH_SIZE) + (SEQUENCE_LENGTH + config.base_model.prediction_heads - 1, BATCH_SIZE) if sequence_first - else (BATCH_SIZE, SEQUENCE_LENGTH + config.prediction_heads - 1) + else (BATCH_SIZE, 
SEQUENCE_LENGTH + config.base_model.prediction_heads - 1) ) if loss_masking: loss_mask = torch.randint(0, 2, label_shape, dtype=torch.bool, device=distributed.device) @@ -135,7 +126,7 @@ def test_lm_head( TransformerKwargs.sequence_first: sequence_first, TransformerKwargs.grad_output: 1.0, } - if config.distillation_model is None: + if config.base_model.distillation_model is None: target = torch.randint( 0, VOCAB_SIZE, @@ -148,25 +139,25 @@ def test_lm_head( kwargs[LanguageModelKwargs.labels] = target else: - assert config.prediction_heads == 1 + assert config.base_model.prediction_heads == 1 target = torch.randn( input_.shape[:-1] + (VOCAB_SIZE,), dtype=input_.dtype, device=distributed.device, ) - kwargs[f"{config.distillation_model}_logits"] = target + kwargs[f"{config.base_model.distillation_model}_logits"] = target if loss_mask is not None: kwargs[LanguageModelKwargs.loss_mask] = loss_mask - if config.tie_word_embeddings or config.prediction_heads > 1: + if config.base_model.tie_word_embeddings or config.base_model.prediction_heads > 1: logit_weight = ( torch.empty( - VOCAB_SIZE, HIDDEN_SIZE, dtype=distributed_config.training_dtype.torch, device=distributed.device + VOCAB_SIZE, HIDDEN_SIZE, dtype=config.distributed.training_dtype.torch, device=distributed.device ) - .normal_(config.transformer.init_method_std) + .normal_(config.base_model.transformer.init_method_std) .requires_grad_(True) ) - kwargs[WORD_EMBEDDINGS_WEIGHT if config.tie_word_embeddings else OUTPUT_WEIGHTS] = logit_weight + kwargs[WORD_EMBEDDINGS_WEIGHT if config.base_model.tie_word_embeddings else OUTPUT_WEIGHTS] = logit_weight else: logit_weight = None @@ -175,18 +166,7 @@ def test_lm_head( head: LanguageModelHead = model[layer_index] Assert.custom(isinstance, head, LanguageModelHead) Assert.eq(head._prediction_distance, prediction_distance) - stage = Stage( - config=StageConfig(), - base_model=[head], - distributed_config=distributed_config, - begin=0, - end=1, - index=0, - ) - stage.setup(distributed=distributed) - stage.initialize_weights() - stage.restore_parameters() - stage.reset_gradients() + stage = get_stage([head], distributed) # Get reference outputs and grads if logit_weight is None: @@ -209,8 +189,8 @@ def test_lm_head( loss_mask, rms_weight=ref_rms_weight, logit_weight=ref_logit_weight, - logit_scale_factor=config.logits_scale_factor, - logit_z_loss=config.logit_z_loss, + logit_scale_factor=config.base_model.logits_scale_factor, + logit_z_loss=config.base_model.logit_z_loss, ) # Prepare LM head inputs @@ -231,10 +211,10 @@ def test_lm_head( output, context = stage.forward(head_input, kwargs, losses) stage.backward(output_grad, context) - threshold = 1e-5 if distributed_config.training_dtype == DataType.float32 else 5e-3 + threshold = 1e-5 if config.distributed.training_dtype == DataType.float32 else 5e-3 min_threshold = ( - 1e-5 if distributed_config.training_dtype == DataType.float32 else 1e-4 - ) * config.logits_scale_factor + 1e-5 if config.distributed.training_dtype == DataType.float32 else 1e-4 + ) * config.base_model.logits_scale_factor Assert.eq(losses.keys(), loss_keys) Assert.eq(len(losses[loss_name]), 1) diff --git a/tests/test_functional.py b/tests/test_functional.py index 9c01f0840..b049be855 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -226,9 +226,9 @@ def test_mlp_recomputation(gated, activation_type): def test_dropless_mlp(): num_experts = 4 experts_per_token = 4 - tokens = 1024 - hidden_size = 2048 - ffn_hidden_size = 4096 + tokens = 256 + hidden_size = 512 + 
ffn_hidden_size = 1024 std = 1 / 64 input_ = torch.randn(tokens, hidden_size, device="cuda", requires_grad=True) router_weight = torch.normal(0, std, (num_experts, hidden_size), device="cuda") diff --git a/tests/test_mtp.py b/tests/test_mtp.py deleted file mode 100644 index 5c4660b73..000000000 --- a/tests/test_mtp.py +++ /dev/null @@ -1,204 +0,0 @@ -import typing - -import pytest -import torch - -from fast_llm.config import UpdateType -from fast_llm.engine.distributed.config import DistributedConfig -from fast_llm.engine.distributed.distributed import Distributed -from fast_llm.layers.language_model.config import LanguageModelKwargs, LanguageModelLossNames -from fast_llm.layers.language_model.embedding import WORD_EMBEDDINGS_WEIGHT -from fast_llm.layers.language_model.head import OUTPUT_WEIGHTS, LanguageModelHead -from fast_llm.layers.transformer.config import TransformerKwargs -from fast_llm.layers.transformer.transformer import TransformerLayer -from fast_llm.models.gpt.config import GPTBaseModelConfig -from fast_llm.models.gpt.model import GPTBaseModel -from fast_llm.utils import Assert -from tests.utils.utils import get_hybrid_config, materialize_meta_tensors, requires_cuda - -try: - from fast_llm.layers.ssm.discrete_mamba2 import DiscreteMamba2 - from fast_llm.layers.ssm.mamba_layer import MambaLayer - from fast_llm.models.ssm.model import HybridSSMBaseModel -except ImportError: - MambaLayer, HybridSSMBaseModel, DiscreteMamba2 = ( - None, - None, - None, - ) - # Mamba not installed, skipping tests - - -run_hybrid_test = MambaLayer is not None and DiscreteMamba2 is not None and torch.cuda.is_available() - - -SEQUENCE_LENGTH = 200 -BATCH_SIZE = 4 -HIDDEN_SIZE = 256 -VOCAB_SIZE = 500 - - -@pytest.fixture -def distributed_config(): - return DistributedConfig( - tensor_parallel=1, - pipeline_parallel=1, - sequence_data_parallel=1, - local_world_size=1, - world_size=1, - ) - - -@pytest.fixture -def distributed(distributed_config): - return Distributed(config=distributed_config) - - -@requires_cuda -@pytest.mark.parametrize( - "config_dict", - ( - {"prediction_heads": 1}, - {"prediction_heads": 2, "tie_word_embeddings": False}, - {"prediction_heads": 5, "tie_word_embeddings": False}, - ), -) -def test_transformer_mtp(config_dict: dict[str, typing.Any]): - config = GPTBaseModelConfig.from_dict( - { - "transformer": { - "hidden_size": HIDDEN_SIZE, - "num_layers": 2, - }, - "vocab_size": VOCAB_SIZE, - }, - config_dict, - update_type=UpdateType.update, - ) - distributed_config = DistributedConfig.from_dict({}) - distributed = Distributed(distributed_config) - model = GPTBaseModel(config, distributed_config) - model.setup(distributed) - materialize_meta_tensors(model, model._tensor_space) - model.to("cuda") - - sequence_first = config.sequence_first or ( - config.cross_entropy_splits is not None and config.cross_entropy_splits > 1 - ) - target = torch.randint( - 0, - VOCAB_SIZE, - ( - (SEQUENCE_LENGTH + config.prediction_heads - 1, BATCH_SIZE) - if sequence_first - else (BATCH_SIZE, SEQUENCE_LENGTH + config.prediction_heads - 1) - ), - dtype=torch.int64, - device=distributed.device, - ) - input_ = torch.randint( - 0, - VOCAB_SIZE, - (SEQUENCE_LENGTH, BATCH_SIZE) if sequence_first else (BATCH_SIZE, SEQUENCE_LENGTH), - device=distributed.device, - ) - attention_mask = torch.ones((1, 1, 1, 1), device="cuda", dtype=torch.bool) - position_ids = torch.arange(SEQUENCE_LENGTH, device="cuda", dtype=torch.int64) - kwargs = { - "position_ids": position_ids, - TransformerKwargs.sequence_first: 
sequence_first, - TransformerKwargs.attention_mask: attention_mask, - TransformerKwargs.attention_mask_value: -100, - TransformerKwargs.grad_output: 1.0, - LanguageModelKwargs.labels: target, - } - if config.tie_word_embeddings: - kwargs[WORD_EMBEDDINGS_WEIGHT] = model.embedding.word_embeddings_weight - else: - kwargs[OUTPUT_WEIGHTS] = model.model_head.output_weights - losses = {LanguageModelLossNames.multi_token_prediction_loss(i): [] for i in range(model._config.prediction_heads)} - _ = model(input_, kwargs, losses=losses) - for loss_name, loss_values in losses.items(): - Assert.gt(len(loss_values), 0) - loss = sum( - [ - sum(losses[LanguageModelLossNames.multi_token_prediction_loss(i)]) - for i in range(model._config.prediction_heads) - ] - ) - loss.backward() - - -@pytest.mark.skip(reason="Too slow") -@requires_cuda -@pytest.mark.skipif(not run_hybrid_test, reason="No CUDA available or Mamba not installed") -@pytest.mark.parametrize( - ("hybrid_block_layout", "prediction_heads", "default_mtp_type"), - [ - (["m", "t"], 1, None), - (["t", "m"], 2, None), - (["m", "t"], 2, None), - (["t", "m2"], 3, None), - (["t", "m2"], 3, "m"), - ], -) -def test_hybrid_model_mtp(distributed_config, hybrid_block_layout, prediction_heads, default_mtp_type): - hybrid_config = get_hybrid_config( - hybrid_block_layout=hybrid_block_layout, prediction_heads=prediction_heads, default_mtp_type=default_mtp_type - ) - model = HybridSSMBaseModel(hybrid_config, distributed_config) - distributed = Distributed(distributed_config) - model.setup(distributed) - tensor_space = model._tensor_space - materialize_meta_tensors(model, tensor_space) - model.to("cuda") - - num_heads, num_mtp_blocks = 0, 0 - str_block_mapping = {"t": TransformerLayer, "m": MambaLayer, "m2": DiscreteMamba2} - mtp_block_type = default_mtp_type or hybrid_block_layout[-1] - for block in model.get_output_layers(): - if isinstance(block, LanguageModelHead): - num_heads += 1 - else: - block = getattr(block, "mixer", block) - Assert.custom( - lambda _: isinstance(block, str_block_mapping[mtp_block_type]), - f"Block {block} is not of type {str_block_mapping[mtp_block_type]}", - ) - num_mtp_blocks += 1 - Assert.eq(num_heads, prediction_heads) - Assert.eq(num_mtp_blocks, prediction_heads - 1) - - batch_size = 2 - seq_length = 32 - x = torch.randint(0, 49152, (batch_size, seq_length), device="cuda") - position_ids = torch.arange(seq_length, device="cuda", dtype=torch.int64) - attention_mask = torch.ones((1, 1, 1, 1), device="cuda", dtype=torch.bool) # will be broadcasted to right shape - labels = torch.randint(0, 49152, (batch_size, seq_length + model._config.prediction_heads - 1), device="cuda") - losses = {LanguageModelLossNames.multi_token_prediction_loss(i): [] for i in range(model._config.prediction_heads)} - kwargs = { - "position_ids": position_ids, - TransformerKwargs.sequence_first: False, - TransformerKwargs.attention_mask: attention_mask, - TransformerKwargs.attention_mask_value: -100, - TransformerKwargs.grad_output: True, - LanguageModelKwargs.labels: labels, - } - - if model._config.tie_word_embeddings: - kwargs[WORD_EMBEDDINGS_WEIGHT] = model.embedding.word_embeddings_weight - else: - kwargs[OUTPUT_WEIGHTS] = model.model_head.output_weights - - output = model( - x, - kwargs, - losses=losses, - ) - loss = sum( - [ - sum(losses[LanguageModelLossNames.multi_token_prediction_loss(i)]) - for i in range(model._config.prediction_heads) - ] - ) - loss.backward() diff --git a/tests/test_ssms.py b/tests/test_ssms.py index a1d460c28..52b51c8a5 100644 --- 
a/tests/test_ssms.py +++ b/tests/test_ssms.py @@ -1,84 +1,31 @@ import pathlib -from functools import partial import pytest import torch from fast_llm.config import NoAutoValidate from fast_llm.engine.checkpoint.config import CheckpointLoadConfig -from fast_llm.engine.config_utils.tensor_space import TensorSpace from fast_llm.engine.distributed.config import DistributedConfig, PhaseType -from fast_llm.engine.distributed.distributed import Distributed from fast_llm.engine.schedule.config import ScheduleConfig from fast_llm.engine.schedule.runner import ScheduleRunner from fast_llm.engine.schedule.schedule import Schedule -from fast_llm.layers.language_model.config import LanguageModelKwargs, LanguageModelLossNames from fast_llm.layers.transformer.config import TransformerKwargs -from fast_llm.models.gpt.config import GPTBatchConfig, LlamaGPTHuggingfaceCheckpointFormat +from fast_llm.models.gpt.config import GPTBatchConfig from fast_llm.models.ssm.config import LLambaHuggingfaceCheckpointFormat -from tests.utils.utils import get_hybrid_config, materialize_meta_tensors - -try: - from fast_llm.layers.ssm.discrete_mamba2 import DiscreteMamba2 - from fast_llm.layers.ssm.llamba_block import LlambaBlock - from fast_llm.layers.ssm.mamba_layer import MambaLayer - from fast_llm.models.ssm.model import HybridSSMBaseModel, HybridSSMModel -except ImportError: - MambaLayer, LlambaBlock, HybridSSMBaseModel, DiscreteMamba2 = ( - None, - None, - None, - None, - ) - # Mamba not installed, skipping tests +from fast_llm.models.ssm.model import HybridSSMModel try: from cartesia_pytorch.Llamba.llamba import LlambaLMHeadModel as LMHeadModel except ImportError: LMHeadModel = None -run_test = MambaLayer is not None and torch.cuda.is_available() - - -@pytest.fixture -def distributed_config(): - return DistributedConfig( - tensor_parallel=1, - pipeline_parallel=1, - sequence_data_parallel=1, - local_world_size=1, - world_size=1, - ) - - -@pytest.fixture -def distributed(distributed_config): - return Distributed(config=distributed_config) - - -def get_hf_llamba_out(input_ids, path, format): - if format == LLambaHuggingfaceCheckpointFormat: - from cartesia_pytorch.Llamba.llamba import LlambaLMHeadModel as LMHeadModel - elif format == LlamaGPTHuggingfaceCheckpointFormat: - from transformers import LlamaForCausalLM as LMHeadModel - else: - raise ValueError(f"Invalid format: {format}") - - model = LMHeadModel.from_pretrained(path, strict=True).to("cuda") - parameter_sum = sum(p.detach().cpu().numpy().sum() for p in model.parameters()) - print(f"Parameter sum: {parameter_sum}") - output = model(input_ids) - del model - torch.cuda.empty_cache() - return output, parameter_sum - @pytest.mark.slow @pytest.mark.skipif( - not run_test or LMHeadModel is None, - reason=f"Skipping because one of the following: cartesia_pytorch.Llamba not installed or no CUDA available or Mamba not installed", + LMHeadModel is None, + reason=f"cartesia_pytorch.Llamba not installed", ) -def test_load_from_llamba_checkpoint(distributed_config): +def test_load_from_llamba_checkpoint(): """ Test to check whether the of Fast-LLM and Huggingface checkpoint loading for Llamba-1B produce the same results. 
""" @@ -90,8 +37,12 @@ def test_load_from_llamba_checkpoint(distributed_config): format = LLambaHuggingfaceCheckpointFormat x = torch.randint(0, vocab_size, (batch_size, seq_length), device="cuda") - hf_logits, parameter_sum_hf = get_hf_llamba_out(x, path, format) - hf_logits = hf_logits["logits"].cpu() + + hf_model = LMHeadModel.from_pretrained(path, strict=True).to("cuda") + parameter_sum_hf = sum(p.detach().sum().cpu().item() for p in hf_model.parameters()) + hf_logits = hf_model(x)["logits"].cpu() + del hf_model + torch.cuda.empty_cache() # Create checkpoint load config checkpoint_config = CheckpointLoadConfig(path=path, format=format, model_weights=True, optimizer_state=False) @@ -109,7 +60,7 @@ def test_load_from_llamba_checkpoint(distributed_config): schedule_config = ScheduleConfig() with NoAutoValidate(): batch_config = GPTBatchConfig(micro_batch_size=batch_size, sequence_length=seq_length) - batch_config.setup(distributed_config) + batch_config.setup(DistributedConfig.from_dict({})) batch_config.validate() schedule_runner = ScheduleRunner( config=schedule_config, @@ -131,173 +82,7 @@ def test_load_from_llamba_checkpoint(distributed_config): } input_data = [(x, common_kwargs)] - losses, success, metrics = schedule_runner.run_step( - iter([input_data]), schedule, iteration=0, return_metrics=True, preprocessed=True - ) + schedule_runner.run_step(iter([input_data]), schedule, iteration=0, return_metrics=True, preprocessed=True) logits = input_data[0][1]["logits"].cpu() assert torch.allclose(logits, hf_logits, atol=1e-2) - - -@pytest.mark.extra_slow -@pytest.mark.skipif(not run_test, reason="No CUDA available or Mamba not installed") -@pytest.mark.parametrize( - "hybrid_block_layout,LAYER_CLS", - [ - (["m", "t"], MambaLayer), - (["m2", "t"], DiscreteMamba2), - ], - ids=["mamba", "discrete_mamba2"], -) -def test_mamba_layer(distributed_config, distributed, hybrid_block_layout, LAYER_CLS): - hybrid_config = get_hybrid_config(hybrid_block_layout=hybrid_block_layout) - tensor_space = TensorSpace(distributed_config=distributed_config) - hybrid_config.setup_tensor_space(tensor_space) - layer = LAYER_CLS(hybrid_config.ssm, layer_idx=0, tensor_space=tensor_space) - tensor_space.setup(distributed) - materialize_meta_tensors(layer, tensor_space) - layer.to(distributed.device) - - batch_size = 2 - seq_length = 32 - hidden_size = hybrid_config.transformer.hidden_size - x = torch.randn(batch_size, seq_length, hidden_size, device=distributed.device) - - # Run forward pass - output, _ = layer(x, {}) - - loss = output.sum() - loss.backward() - # Basic shape checkss - assert output.shape == x.shape - assert not torch.isnan(output).any() - assert not torch.isinf(output).any() - - -@pytest.mark.skipif(not run_test, reason="No CUDA available or Mamba not installed") -def test_mamba_block(distributed_config, distributed): - hybrid_config = get_hybrid_config(hybrid_block_layout=["m", "t"]) - tensor_space = TensorSpace(distributed_config=distributed_config) - tensor_space.setup(distributed) - hybrid_config.setup_tensor_space(tensor_space) - layer_idx = 0 - - mixer_cls = partial(MambaLayer, layer_idx=layer_idx) - block = LlambaBlock( - hybrid_config.transformer, - hybrid_config.ssm, - mixer_cls=mixer_cls, - tensor_space=tensor_space, - layer_index=layer_idx, - ) - - materialize_meta_tensors(block, tensor_space) - block.to("cuda") - - batch_size = 2 - seq_length = 32 - hidden_size = hybrid_config.transformer.hidden_size - x = torch.randn(batch_size, seq_length, hidden_size, device=distributed.device) - - 
hidden_states = block(x, {}) - loss = hidden_states.sum() - loss.backward() - - assert hidden_states.shape == x.shape - assert not torch.isnan(hidden_states).any() - assert not torch.isinf(hidden_states).any() - - -@pytest.mark.slow -@pytest.mark.skipif(not run_test, reason="No CUDA available or Mamba not installed") -@pytest.mark.parametrize( - ("hybrid_block_layout"), - [ - (["m", "t"]), - (["m2", "t"]), - ], - ids=["mamba", "discrete_mamba2"], -) -def test_hybrid_model_train_with_fast_mode(distributed_config, hybrid_block_layout): - hybrid_config = get_hybrid_config(hybrid_block_layout=hybrid_block_layout) - model = HybridSSMBaseModel(hybrid_config, distributed_config) - distributed = Distributed(distributed_config) - model.setup(distributed) - tensor_space = model._tensor_space - materialize_meta_tensors(model, tensor_space) - model.to("cuda") - - batch_size = 2 - seq_length = 32 - x = torch.randint(0, 49152, (batch_size, seq_length), device="cuda") - position_ids = torch.arange(seq_length, device="cuda", dtype=torch.int64) - attention_mask = torch.ones((1, 1, 1, 1), device="cuda", dtype=torch.bool) # will be broadcasted to right shape - labels = torch.randint(0, 49152, (batch_size, seq_length), device="cuda") - losses = {LanguageModelLossNames.language_model_loss: []} - output = model( - x, - { - "position_ids": position_ids, - TransformerKwargs.sequence_first: False, - TransformerKwargs.attention_mask: attention_mask, - TransformerKwargs.attention_mask_value: -100, - TransformerKwargs.grad_output: True, - LanguageModelKwargs.labels: labels, - }, - losses=losses, - ) - loss = sum(losses[LanguageModelLossNames.language_model_loss]) - loss.backward() - - -# TODO: added this when inference enabled -# No inference for now -# @dataclass -# class InferenceParams: -# max_seqlen: int -# max_batch_size: int -# sequence_len_offset: int = 0 -# key_value_memory_dict: dict = None - -# def __post_init__(self): -# if self.key_value_memory_dict is None: -# self.key_value_memory_dict = {} - - -# @pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA available") -# def test_hybrid_model_inference(distributed_config, hybrid_config): -# hybrid_config.ssm.use_fast_path = False -# model = HybridSSMBaseModel(hybrid_config, distributed_config) -# distributed = Distributed(distributed_config) -# model.setup(distributed) -# tensor_space = model._tensor_space -# materialize_meta_tensors(model, tensor_space) -# model.to("cuda") -# # print(model) - -# batch_size = 2 -# seq_length = 32 -# x = torch.randint(0, 49152, (batch_size, seq_length), device="cuda") -# position_ids = torch.arange(seq_length, device="cuda", dtype=torch.int64) -# attention_mask = torch.ones((1, 1, 1, 1), device="cuda", dtype=torch.bool) # will be broadcasted to right shape -# labels = torch.randint(0, 49152, (batch_size, seq_length), device="cuda") -# max_new_tokens = 10 - -# inference_params = InferenceParams( -# max_seqlen=len(x[0]) + max_new_tokens, max_batch_size=x.shape[0], sequence_len_offset=0 -# ) -# losses = {LanguageModelLossNames.language_model_loss: []} - -# output = model( -# x, -# { -# "position_ids": position_ids, -# TransformerKwargs.sequence_first: True, -# TransformerKwargs.attention_mask: attention_mask, -# TransformerKwargs.attention_mask_value: -100, -# TransformerKwargs.grad_output: True, -# LanguageModelKwargs.labels: labels, -# "inference_params": inference_params, -# }, -# losses=losses, -# ) diff --git a/tests/utils/model_configs.py b/tests/utils/model_configs.py index 3f989f584..1c3324960 100644 --- 
a/tests/utils/model_configs.py +++ b/tests/utils/model_configs.py @@ -403,6 +403,25 @@ def _update_and_add_testing_config( ) +_update_and_add_testing_config( + # Tests hybrid ssm, llamba converter. + "llama", + "hybrid_mamba_2", + model_type="hybrid_ssm", + extra_args=["model.base_model.hybrid_block_layout=['t','m2']"], + megatron_args=None, + checkpoint_format=None, + testing_groups=[ + ModelTestingGroup.basic, + ], + # TODO: Bring back `generate` to `testing_groups` when stable. + other_groups=[ + # TODO: Fix and bring back to `testing_groups` + ModelTestingGroup.distributed, + ], +) + + @pytest.fixture(scope="session", params=_MODEL_CONFIGS.keys()) def model_testing_config(request) -> ModelTestingConfig: return _MODEL_CONFIGS[request.param] diff --git a/tests/utils/utils.py b/tests/utils/utils.py index bf2059fa8..ea689bccf 100644 --- a/tests/utils/utils.py +++ b/tests/utils/utils.py @@ -3,9 +3,11 @@ import pytest import torch -from fast_llm.layers.ssm.config import SSMConfig -from fast_llm.layers.transformer.config import TransformerConfig -from fast_llm.models.ssm.config import HybridSSMBaseModelConfig +from fast_llm.engine.base_model.base_model import BaseModel, Layer +from fast_llm.engine.config_utils.tensor_space import TensorSpace +from fast_llm.engine.distributed.distributed import Distributed +from fast_llm.engine.multi_stage.config import FastLLMModelConfig, StageConfig +from fast_llm.engine.multi_stage.stage import Stage requires_cuda = pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available") @@ -15,41 +17,30 @@ def result_path(): return pathlib.Path("/tmp/fast_llm_tests") -def materialize_meta_tensors(model, tensor_space): - # Materialize parameters that are on meta device - for name, param in model.named_parameters(): - if param.device.type == "meta": - # Check if the parameter is a custom tensor type - if hasattr(param, "tensor_name") and hasattr(param, "init_parameter"): - param_data = param.new_empty(param.shape, device="cuda") - # Initialize param_data - param.init_parameter(param_data, tensor_space.distributed) - # Replace the parameter in the module - module_path, param_name = name.rsplit(".", 1) if "." in name else (None, name) - module = model - if module_path is not None: - for part in module_path.split("."): - module = getattr(module, part) - param = torch.nn.Parameter(param_data, requires_grad=param.requires_grad) - # TODO: add param_grad_is_zero etc., grad_buffer, etc., see test_mlp_recomputation - param.grad = None - param.grad_buffer = torch.empty_like(param) - param.param_grad_is_zero = True - module._parameters[param_name] = param - return model - - -def get_hybrid_config(hybrid_block_layout=["t", "m"], prediction_heads=1, default_mtp_type=None): - config = HybridSSMBaseModelConfig( - transformer=TransformerConfig(num_layers=len(hybrid_block_layout)), - ssm=SSMConfig(), - hybrid_block_layout=hybrid_block_layout, - prediction_heads=prediction_heads, - default_mtp_type=default_mtp_type, - init_method_std_embed=0.02, - init_method_min_embed=-0.02, - init_method_max_embed=0.02, - use_position_embeddings=True, - tie_word_embeddings=False, +def get_base_model(config: FastLLMModelConfig): + # Create a base model (and distributed). + # Using a full model config so we have the model type and distributed config in the same argument. 
+ distributed = Distributed(config.distributed) + tensor_space = TensorSpace(config.distributed) + config.base_model.setup_tensor_space(tensor_space) + tensor_space.setup(distributed) + base_model = config.get_model_class().base_model_class(config.base_model, config.distributed) + base_model.setup(distributed) + return base_model, distributed + + +def get_stage(base_model: BaseModel | list[Layer], distributed: Distributed): + # Create a fast-llm stage which allocates and initializes meta tensors correctly. + stage = Stage( + config=StageConfig(), + base_model=base_model, + distributed_config=distributed.config, + begin=0, + end=1, + index=0, ) - return config + stage.setup(distributed=distributed) + stage.initialize_weights() + stage.restore_parameters() + stage.reset_gradients() + return stage From 830a380b9d0a5835975d73f9c1fda7e2c987ce95 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Wed, 11 Jun 2025 19:21:39 -0400 Subject: [PATCH 12/43] fixes --- tests/conftest.py | 1 - tests/layers/test_lm_head.py | 67 +++++++++++++++++++----------------- tests/utils/model_configs.py | 14 ++++++-- 3 files changed, 47 insertions(+), 35 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index cd4cc1d1a..bfe9f50cf 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -231,7 +231,6 @@ def pytest_terminal_summary(terminalreporter): key=lambda nodeid: resource_reports[nodeid]["max_memory_reserved"], reverse=True, ) - logging.error(f"sorted_nodeids {sorted_nodeids}") for nodeid in sorted_nodeids[: terminalreporter.config.getoption("--show-gpu-memory")]: terminalreporter.write_line( f"{nodeid}:\n " diff --git a/tests/layers/test_lm_head.py b/tests/layers/test_lm_head.py index cad95e539..ea09d3b54 100644 --- a/tests/layers/test_lm_head.py +++ b/tests/layers/test_lm_head.py @@ -11,7 +11,7 @@ from fast_llm.layers.language_model.embedding import WORD_EMBEDDINGS_WEIGHT from fast_llm.layers.language_model.head import OUTPUT_WEIGHTS, LanguageModelHead from fast_llm.layers.transformer.config import TransformerKwargs -from fast_llm.models.gpt.config import GPTModelConfig +from fast_llm.models.gpt.config import GPTBaseModelConfig, GPTModelConfig from fast_llm.utils import Assert from tests.utils.utils import get_base_model, get_stage, requires_cuda @@ -82,41 +82,46 @@ def test_lm_head( distributed_config_dict: dict[str, typing.Any], loss_masking: bool, ): - config = GPTModelConfig.from_dict( + config = GPTBaseModelConfig.from_dict( { - "base_model": { - "transformer": { - "normalization": {"type": NormalizationType.rms_norm}, - "hidden_size": HIDDEN_SIZE, - "num_layers": 0, - }, - "vocab_size": VOCAB_SIZE, - "cross_entropy_impl": cross_entropy_impl, + "transformer": { + "normalization": {"type": NormalizationType.rms_norm}, + "hidden_size": HIDDEN_SIZE, + "num_layers": 0, }, - "distributed": distributed_config_dict, + "vocab_size": VOCAB_SIZE, + "cross_entropy_impl": cross_entropy_impl, }, config_dict, update_type=UpdateType.update, ) - model, distributed = get_base_model(config) - sequence_first = config.base_model.sequence_first or ( - config.base_model.cross_entropy_splits is not None and config.base_model.cross_entropy_splits > 1 + model, distributed = get_base_model( + GPTModelConfig.from_dict( + { + "base_model": config, + "distributed": distributed_config_dict, + }, + ) + ) + + sequence_first = config.sequence_first or ( + config.cross_entropy_splits is not None and config.cross_entropy_splits > 1 ) input_ = torch.randn( (SEQUENCE_LENGTH, BATCH_SIZE, HIDDEN_SIZE) if sequence_first else 
(BATCH_SIZE, SEQUENCE_LENGTH, HIDDEN_SIZE), dtype=( - config.distributed.optimization_dtype.torch - if config.base_model.transformer.full_precision_residual - else config.distributed.training_dtype.torch + distributed.config.optimization_dtype.torch + if config.transformer.full_precision_residual + else distributed.config.training_dtype.torch ), device=distributed.device, requires_grad=True, ) label_shape = ( - (SEQUENCE_LENGTH + config.base_model.prediction_heads - 1, BATCH_SIZE) + (SEQUENCE_LENGTH + config.prediction_heads - 1, BATCH_SIZE) if sequence_first - else (BATCH_SIZE, SEQUENCE_LENGTH + config.base_model.prediction_heads - 1) + else (BATCH_SIZE, SEQUENCE_LENGTH + config.prediction_heads - 1) ) if loss_masking: loss_mask = torch.randint(0, 2, label_shape, dtype=torch.bool, device=distributed.device) @@ -126,7 +131,7 @@ def test_lm_head( TransformerKwargs.sequence_first: sequence_first, TransformerKwargs.grad_output: 1.0, } - if config.base_model.distillation_model is None: + if config.distillation_model is None: target = torch.randint( 0, VOCAB_SIZE, @@ -139,25 +144,25 @@ def test_lm_head( kwargs[LanguageModelKwargs.labels] = target else: - assert config.base_model.prediction_heads == 1 + assert config.prediction_heads == 1 target = torch.randn( input_.shape[:-1] + (VOCAB_SIZE,), dtype=input_.dtype, device=distributed.device, ) - kwargs[f"{config.base_model.distillation_model}_logits"] = target + kwargs[f"{config.distillation_model}_logits"] = target if loss_mask is not None: kwargs[LanguageModelKwargs.loss_mask] = loss_mask - if config.base_model.tie_word_embeddings or config.base_model.prediction_heads > 1: + if config.tie_word_embeddings or config.prediction_heads > 1: logit_weight = ( torch.empty( - VOCAB_SIZE, HIDDEN_SIZE, dtype=config.distributed.training_dtype.torch, device=distributed.device + VOCAB_SIZE, HIDDEN_SIZE, dtype=distributed.config.training_dtype.torch, device=distributed.device ) - .normal_(config.base_model.transformer.init_method_std) + .normal_(config.transformer.init_method_std) .requires_grad_(True) ) - kwargs[WORD_EMBEDDINGS_WEIGHT if config.base_model.tie_word_embeddings else OUTPUT_WEIGHTS] = logit_weight + kwargs[WORD_EMBEDDINGS_WEIGHT if config.tie_word_embeddings else OUTPUT_WEIGHTS] = logit_weight else: logit_weight = None @@ -189,8 +194,8 @@ def test_lm_head( loss_mask, rms_weight=ref_rms_weight, logit_weight=ref_logit_weight, - logit_scale_factor=config.base_model.logits_scale_factor, - logit_z_loss=config.base_model.logit_z_loss, + logit_scale_factor=config.logits_scale_factor, + logit_z_loss=config.logit_z_loss, ) # Prepare LM head inputs @@ -211,10 +216,10 @@ def test_lm_head( output, context = stage.forward(head_input, kwargs, losses) stage.backward(output_grad, context) - threshold = 1e-5 if config.distributed.training_dtype == DataType.float32 else 5e-3 + threshold = 1e-5 if distributed.config.training_dtype == DataType.float32 else 5e-3 min_threshold = ( - 1e-5 if config.distributed.training_dtype == DataType.float32 else 1e-4 - ) * config.base_model.logits_scale_factor + 1e-5 if distributed.config.training_dtype == DataType.float32 else 1e-4 + ) * config.logits_scale_factor Assert.eq(losses.keys(), loss_keys) Assert.eq(len(losses[loss_name]), 1) diff --git a/tests/utils/model_configs.py b/tests/utils/model_configs.py index 1c3324960..3f334c64a 100644 --- a/tests/utils/model_configs.py +++ b/tests/utils/model_configs.py @@ -387,7 +387,13 @@ def _update_and_add_testing_config( "llama", "llamba", model_type="hybrid_ssm", - 
extra_args=["model.base_model.hybrid_block_layout=['t','m']"], + extra_args=[ + "model.base_model.hybrid_block_layout=['t','m']", + "model.base_model.ssm.state_size=8", + "model.base_model.ssm.chunk_size=32", + "model.base_model.ssm.n_qk_heads=8", + "model.base_model.ssm.n_v_heads=8", + ], megatron_args=None, checkpoint_format=LLambaHuggingfaceCheckpointFormat, testing_groups=[ @@ -405,10 +411,12 @@ def _update_and_add_testing_config( _update_and_add_testing_config( # Tests hybrid ssm, llamba converter. - "llama", + "llamba", "hybrid_mamba_2", model_type="hybrid_ssm", - extra_args=["model.base_model.hybrid_block_layout=['t','m2']"], + extra_args=[ + "model.base_model.hybrid_block_layout=['t','m2']", + ], megatron_args=None, checkpoint_format=None, testing_groups=[ From 13e1da5c9d91658ba9941a2d03d91d21e668143b Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Thu, 12 Jun 2025 10:41:22 -0400 Subject: [PATCH 13/43] fix --- fast_llm/functional/triton/mlp.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/fast_llm/functional/triton/mlp.py b/fast_llm/functional/triton/mlp.py index ee3ba304c..ab408368f 100644 --- a/fast_llm/functional/triton/mlp.py +++ b/fast_llm/functional/triton/mlp.py @@ -25,9 +25,6 @@ from fast_llm.functional.triton.sparse_linear import output_sparse_matmul from fast_llm.tensor import param_get_and_unset_is_zero -# Triton requires global variables to be annotated with `constexpr`. -_TritonActivationType: tl_constexpr = ActivationType - @triton_jit() def triton_mlp_activation_forward_kernel( @@ -50,18 +47,19 @@ def triton_mlp_activation_forward_kernel( input_ = tl.load(input_ptr, mask=mask).to(tl.float32) - if activation_type == _TritonActivationType.gelu: + # Triton doesn't like enums, so we use str instead of ActivationType. + if activation_type == "gelu": tanh_input = 0.79788456 * input_ * (1 + 0.044715 * input_ * input_) tanh = 1 - 2 / (1 + tl.exp(2 * tanh_input)) out = input_ * 0.5 * (1.0 + tanh) - elif activation_type == _TritonActivationType.silu: + elif activation_type == "silu": out = input_ / (1 + tl.exp(-input_)) - elif activation_type == _TritonActivationType.relu: + elif activation_type == "relu": out = tl.where(input_ > 0, input_, 0) - elif activation_type == _TritonActivationType.squared_relu: + elif activation_type == "squared_relu": relu_out = tl.where(input_ > 0, input_, 0) out = relu_out * relu_out - elif activation_type == _TritonActivationType.identity: + elif activation_type == "identity": out = input_ else: tl.static_assert(False, activation_type) @@ -100,28 +98,29 @@ def triton_mlp_activation_backward_kernel( input_ = tl.load(input_ptr, mask=mask).to(tl.float32) output_grad = tl.load(grad_output_ptr + output_offsets, mask=mask).to(tl.float32) - if activation_type == _TritonActivationType.gelu: + # Triton doesn't like enums, so we use str instead of ActivationType. 
+ if activation_type == "gelu": tanh_input = 0.79788456 * input_ * (1 + 0.044715 * input_ * input_) tanh = 1 - 2 / (1 + tl.exp(2 * tanh_input)) grad = 0.5 * input_ * ((1 - tanh * tanh) * (0.79788456 + 0.1070322243 * input_ * input_)) + 0.5 * (1 + tanh) if gated or recompute: out = input_ * 0.5 * (1.0 + tanh) - elif activation_type == _TritonActivationType.silu: + elif activation_type == "silu": exp = tl.exp(-input_) sigma = 1 / (1 + exp) grad = sigma * sigma + (1 + input_) / (2 + exp + 1 / exp) if gated or recompute: out = input_ * sigma - elif activation_type == _TritonActivationType.relu: + elif activation_type == "relu": grad = tl.where(input_ > 0, 1, 0) if gated or recompute: out = tl.where(input_ > 0, input_, 0) - elif activation_type == _TritonActivationType.squared_relu: + elif activation_type == "squared_relu": relu_out = tl.where(input_ > 0, input_, 0) grad = 2 * relu_out if gated or recompute: out = relu_out * relu_out - elif activation_type == _TritonActivationType.identity: + elif activation_type == "identity": grad = 1 if gated or recompute: out = input_ From 0dffe5c46ca31e0b8b1b13dfcbec6d0e712ab2d6 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Thu, 12 Jun 2025 14:27:13 -0400 Subject: [PATCH 14/43] fixes --- fast_llm/layers/ssm/discrete_mamba2.py | 41 ++++++++++++++++---------- fast_llm/layers/ssm/mamba_layer.py | 11 +++++-- setup.cfg | 29 +++++++++--------- tests/test_ssms.py | 2 +- 4 files changed, 50 insertions(+), 33 deletions(-) diff --git a/fast_llm/layers/ssm/discrete_mamba2.py b/fast_llm/layers/ssm/discrete_mamba2.py index 85916244e..ecf0b29d7 100644 --- a/fast_llm/layers/ssm/discrete_mamba2.py +++ b/fast_llm/layers/ssm/discrete_mamba2.py @@ -2,7 +2,6 @@ import math import einops -import mamba_ssm.ops.triton.ssd_combined import torch from fast_llm.engine.config_utils.tensor_space import TensorDim, TensorSpace @@ -13,12 +12,22 @@ logger = logging.getLogger(__name__) + try: - import causal_conv1d + from mamba_ssm.ops.triton.ssd_combined import mamba_chunk_scan_combined as _mamba_chunk_scan_combined # noqa + + _mamba_available = True except ImportError: - # this is needed since we cannot use causal_conv1d on B200 GPUs for now - logger.warning("Note, causal_conv1d not found, will use torch.nn.functional.conv1d instead") - causal_conv1d = None + _mamba_available = False + + +try: + from causal_conv1d import causal_conv1d_fn as _causal_conv1d_fn # noqa + + _causal_conv1d_available = True +except ImportError: + _causal_conv1d_available = False + """ This code is adapted from https://github.com/cartesia-ai/edge/blob/main/cartesia-pytorch/cartesia_pytorch/Llamba/mixers/discrete_mamba2.py @@ -148,6 +157,8 @@ def forward(self, hidden_states, kwargs): outputs["hidden_states"]: (B, L, D). outputs["state"]: inference cache. 
""" + + assert _mamba_available input_ = hidden_states outputs = {} # assert state is None @@ -201,7 +212,7 @@ def forward(self, hidden_states, kwargs): C = einops.rearrange(C, "b l (h n) -> b l h n", h=self.n_qk_heads) # SSM forward - result = mamba_ssm.ops.triton.ssd_combined.mamba_chunk_scan_combined( + result = _mamba_chunk_scan_combined( x=x / torch.nn.functional.softplus(A_log).to(x.dtype).unsqueeze(-1), dt=A_log, dt_softplus=True, @@ -234,11 +245,18 @@ def forward(self, hidden_states, kwargs): def convolutional_forward(self, xBC, padded_len): """Convolutional layer forward pass for the full sequence.""" - if causal_conv1d is None or self.activation_name not in [ + if _causal_conv1d_available and self.activation_name in ( "silu", "swish", "identity", - ]: + ): + xBC = _causal_conv1d_fn( + xBC.transpose(1, 2), + einops.rearrange(self.conv1d_weight, "d 1 w -> d w"), + self.conv1d_bias, + activation=None if self.activation_name == "identity" else self.activation_name, + ).transpose(1, 2) + else: xBC = self.act( torch.nn.functional.conv1d( xBC.transpose(1, 2), @@ -248,11 +266,4 @@ def convolutional_forward(self, xBC, padded_len): padding=self.conv_kernel_size - 1, )[..., :padded_len].transpose(1, 2) ) - else: - xBC = causal_conv1d.causal_conv1d_fn( - xBC.transpose(1, 2), - einops.rearrange(self.conv1d_weight, "d 1 w -> d w"), - self.conv1d_bias, - activation=None if self.activation_name == "identity" else self.activation_name, - ).transpose(1, 2) return xBC diff --git a/fast_llm/layers/ssm/mamba_layer.py b/fast_llm/layers/ssm/mamba_layer.py index 7d0ee48a4..7fd437894 100644 --- a/fast_llm/layers/ssm/mamba_layer.py +++ b/fast_llm/layers/ssm/mamba_layer.py @@ -2,7 +2,6 @@ from typing import Callable import einops -import mamba_ssm.ops.selective_scan_interface import torch from fast_llm.engine.config_utils.tensor_space import TensorDim, TensorSpace @@ -11,6 +10,13 @@ from fast_llm.tensor import ParameterMeta, init_ones_, kaiming_init_ from fast_llm.utils import get_lr_scale +try: + from mamba_ssm.ops.selective_scan_interface import mamba_inner_fn as _mamba_inner_fn # noqa + + _mamba_available = True +except ImportError: + _mamba_available = False + """ Note: this is mostly adapted from https://github.com/Zyphra/Zamba2, similar code is also in https://github.com/state-spaces/mamba. For now it only supports training and not inference. 
@@ -153,6 +159,7 @@ def __init__( self._return_input = return_input def forward(self, hidden_states, kwargs): + assert _mamba_available batch, seqlen, dim = hidden_states.shape # We do matmul and transpose BLH -> HBL at the same time @@ -167,7 +174,7 @@ def forward(self, hidden_states, kwargs): A = -torch.exp(self.A_log.float()) # (d_inner, d_state) # In the backward pass we write dx and dz next to each other to avoid torch.cat # not, if we wanbt to support inference, we would need to imp.lement slow path here, see https://github.com/Zyphra/Zamba2/blob/1b182f40f2257f822cc06dd785df53d67d691a15/mamba_layer.py#L172s - out = mamba_ssm.ops.selective_scan_interface.mamba_inner_fn( + out = _mamba_inner_fn( xz, self.conv1d_weight, self.conv1d_bias, diff --git a/setup.cfg b/setup.cfg index 3345ff73a..bc0de459d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -6,10 +6,10 @@ packages = find_namespace: include_package_data = True python_requires = >=3.12 install_requires = - requests>=2.32.3 - PyYAML>=6.0.1 - pybind11>=2.5.0 - packaging>=24.1 + requests>=2.32.4 + PyYAML>=6.0.2 + pybind11>=2.13.6 + packaging>=25.0 [options.extras_require] # Required to use the main functionality of Fast-LLM @@ -21,7 +21,7 @@ CORE = # Numpy major needs to match torch numpy>=1.26.4,<2.0.0 # Used for checkpoints - safetensors>=0.4.4 + safetensors>=0.5.3 # Update the base image (version fixed to ensure there is a wheel for the base image), may need --no-build-isolation flash-attn==2.7.3 mamba_ssm[causal-conv1d]==2.2.4 @@ -30,28 +30,27 @@ CORE = # Required for some optional features and tools. OPTIONAL = # Huggingface tools - transformers>=4.44.2 - hf-transfer>=0.1.8 - datasets>=3.1.0 - huggingface-hub>=0.28.1 + transformers>=4.52.4 + hf-transfer>=0.1.9 + datasets>=3.6.0 + huggingface-hub>=0.32.6 # Weights and biases - wandb>=0.17.7 + wandb>=0.20.1 # Hydra hydra-core>=1.3.2 omegaconf>=2.3.0 # Miscellaneous - requests>=2.32.3 tqdm>=4.67.1 DEV = # Pre-commit git hook - pre-commit>=4.0.1 + pre-commit>=4.2.0 # Required for testing - pytest>=8.3.2 + pytest>=8.4.0 pytest-depends>=1.0.1 - pytest-xdist>=3.6.1 + pytest-xdist>=3.7.0 # Somehow needed for Megatron to work with base image 24.11 - setuptools>=78.1.1 + setuptools>=80.9.0 # Required for building the documentation DOCS = diff --git a/tests/test_ssms.py b/tests/test_ssms.py index f3eb92617..ef5193b67 100644 --- a/tests/test_ssms.py +++ b/tests/test_ssms.py @@ -14,6 +14,7 @@ from fast_llm.engine.schedule.schedule import Schedule from fast_llm.layers.language_model.config import LanguageModelKwargs, LanguageModelLossNames from fast_llm.layers.ssm.config import SSMBlockType +from fast_llm.layers.ssm.llamba_block import LlambaBlock from fast_llm.layers.transformer.config import TransformerKwargs from fast_llm.models.gpt.config import GPTBatchConfig, LlamaGPTHuggingfaceCheckpointFormat from fast_llm.models.ssm.config import AprielSSMHHybridHuggingfaceCheckpointFormat, LLambaHuggingfaceCheckpointFormat @@ -21,7 +22,6 @@ try: from fast_llm.layers.ssm.discrete_mamba2 import DiscreteMamba2 - from fast_llm.layers.ssm.llamba_block import LlambaBlock from fast_llm.layers.ssm.mamba_layer import MambaLayer from fast_llm.models.ssm.model import HybridSSMBaseModel, HybridSSMModel except Exception: From dcc506464d175407c3d8711e73d05ae3b88c6c41 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Thu, 12 Jun 2025 14:30:29 -0400 Subject: [PATCH 15/43] fixes --- tests/test_ssms.py | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/tests/test_ssms.py 
b/tests/test_ssms.py index ef5193b67..36c7b6229 100644 --- a/tests/test_ssms.py +++ b/tests/test_ssms.py @@ -14,24 +14,15 @@ from fast_llm.engine.schedule.schedule import Schedule from fast_llm.layers.language_model.config import LanguageModelKwargs, LanguageModelLossNames from fast_llm.layers.ssm.config import SSMBlockType +from fast_llm.layers.ssm.discrete_mamba2 import DiscreteMamba2 from fast_llm.layers.ssm.llamba_block import LlambaBlock +from fast_llm.layers.ssm.mamba_layer import MambaLayer from fast_llm.layers.transformer.config import TransformerKwargs from fast_llm.models.gpt.config import GPTBatchConfig, LlamaGPTHuggingfaceCheckpointFormat from fast_llm.models.ssm.config import AprielSSMHHybridHuggingfaceCheckpointFormat, LLambaHuggingfaceCheckpointFormat +from fast_llm.models.ssm.model import HybridSSMBaseModel, HybridSSMModel from tests.common import get_hybrid_config, materialize_meta_tensors -try: - from fast_llm.layers.ssm.discrete_mamba2 import DiscreteMamba2 - from fast_llm.layers.ssm.mamba_layer import MambaLayer - from fast_llm.models.ssm.model import HybridSSMBaseModel, HybridSSMModel -except Exception: - MambaLayer, LlambaBlock, HybridSSMBaseModel, DiscreteMamba2 = ( - None, - None, - None, - None, - ) - try: from cartesia_pytorch.Llamba.llamba import LlambaLMHeadModel as LMHeadModel except ImportError: From 9d415bc6f29a083e326d856fcfcc949bdad3b638 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Thu, 12 Jun 2025 14:37:21 -0400 Subject: [PATCH 16/43] fixes --- .github/workflows/docs.yaml | 2 +- Dockerfile | 2 +- setup.cfg | 21 ++++++++++++++------- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 93191972e..b755993ce 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -33,7 +33,7 @@ jobs: pip install pybind11 FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE FLASH_ATTENTION_FORCE_BUILD=TRUE MAMBA_SKIP_CUDA_BUILD=TRUE \ MAMBA_FORCE_BUILD=TRUE CAUSAL_CONV1D_FORCE_BUILD=TRUE CAUSAL_CONV1D_SKIP_CUDA_BUILD=TRUE \ - pip install --no-build-isolation -e ".[CORE,OPTIONAL,DEV,DOCS]" + pip install --no-build-isolation -e ".[CORE,OPTIONAL,HUGGINGFACE,SSM,DEV,DOCS]" - name: Build the documentation run: mkdocs build diff --git a/Dockerfile b/Dockerfile index 05c3870c5..50810ed1e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -37,7 +37,7 @@ COPY --chmod=777 ./fast_llm/__init__.py fast_llm/ COPY --chmod=777 ./fast_llm/csrc/ fast_llm/csrc/ # Install dependencies within the virtual environment. -RUN pip install --no-cache-dir --no-build-isolation -e ".[CORE,OPTIONAL,DEV]" +RUN pip install --no-cache-dir --no-build-isolation -e ".[CORE,OPTIONAL,HUGGINGFACE,SSM,DEV]" # Copy the remaining source code with universal write permissions. COPY --chmod=777 ./Megatron-LM Megatron-LM diff --git a/setup.cfg b/setup.cfg index bc0de459d..8a446064d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -24,16 +24,10 @@ CORE = safetensors>=0.5.3 # Update the base image (version fixed to ensure there is a wheel for the base image), may need --no-build-isolation flash-attn==2.7.3 - mamba_ssm[causal-conv1d]==2.2.4 -# Required for some optional features and tools. +# Small packages required for some optional features and tools. 
OPTIONAL = - # Huggingface tools - transformers>=4.52.4 - hf-transfer>=0.1.9 - datasets>=3.6.0 - huggingface-hub>=0.32.6 # Weights and biases wandb>=0.20.1 # Hydra @@ -42,6 +36,19 @@ OPTIONAL = # Miscellaneous tqdm>=4.67.1 +# Huggingface tools +HUGGINGFACE = + transformers>=4.52.4 + hf-transfer>=0.1.9 + datasets>=3.6.0 + huggingface-hub>=0.32.6 + +# Required to run SSMs +# To install on cpu environment (ex. for IDE support): +# MAMBA_FORCE_BUILD=TRUE CAUSAL_CONV1D_FORCE_BUILD=TRUE CAUSAL_CONV1D_SKIP_CUDA_BUILD=TRUE pip install -e ".[CORE,SSM]" --no-build-isolation +SSM = + mamba_ssm[causal-conv1d]==2.2.4 + DEV = # Pre-commit git hook pre-commit>=4.2.0 From 68251c29eadeb1f25d23ba1090d8f43d6665cbf4 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Thu, 12 Jun 2025 16:00:41 -0400 Subject: [PATCH 17/43] fixes --- fast_llm/layers/ssm/config.py | 2 +- tests/conftest.py | 14 ++++++++++++-- tests/utils/model_configs.py | 8 ++++---- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/fast_llm/layers/ssm/config.py b/fast_llm/layers/ssm/config.py index 13418254c..6837507f9 100644 --- a/fast_llm/layers/ssm/config.py +++ b/fast_llm/layers/ssm/config.py @@ -21,7 +21,7 @@ class SSMDimNames: v_heads = "v_heads" # Number of V heads -class SSMBlockType(str, enum.Enum): +class SSMBlockType(enum.StrEnum): """ An enum for the available mamba types for the MLP layer. """ diff --git a/tests/conftest.py b/tests/conftest.py index bfe9f50cf..bc3d443cd 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -184,8 +184,12 @@ def pytest_runtest_makereport(item: pytest.Function, call: pytest.CallInfo): # Collect only if the remaining memory is significant enough since it's costly. if torch.cuda.memory_allocated() > 1e7: gc.collect() - # Actually free the memory. - torch.cuda.empty_cache() + try: + # Actually free the memory. + torch.cuda.empty_cache() + except RuntimeError: + # Happens if the test broke cuda. + return item.add_report_section( call.when, "resource usage", @@ -243,6 +247,12 @@ def pytest_terminal_summary(terminalreporter): def pytest_runtest_call(item: pytest.Function): + if torch.cuda.is_available(): + # Empty cache to check is cuda is still working (TODO: Is there a better way? Can we kill the worker?) + try: + torch.cuda.empty_cache() + except RuntimeError: + pytest.skip("Cuda runtime unavailable due to an error in an earlier test.") manager.handle_missing(item) diff --git a/tests/utils/model_configs.py b/tests/utils/model_configs.py index 3f334c64a..cf1246905 100644 --- a/tests/utils/model_configs.py +++ b/tests/utils/model_configs.py @@ -370,14 +370,14 @@ def _update_and_add_testing_config( "--moe-router-topk=4", ], checkpoint_format=MixtralGPTHuggingfaceCheckpointFormat, - testing_groups=[ + testing_groups=[], + # TODO: New base image broke mixtral + # TODO: Bring back `generate` to `testing_groups` when stable. + other_groups=[ ModelTestingGroup.basic, ModelTestingGroup.megatron, ModelTestingGroup.distributed, ModelTestingGroup.convert, - ], - # TODO: Bring back `generate` to `testing_groups` when stable. 
- other_groups=[ ModelTestingGroup.generate, ], ) From 639d6c261f8ddafae62d73631223e3f7b1cae72a Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Thu, 12 Jun 2025 16:54:51 -0400 Subject: [PATCH 18/43] doc --- fast_llm/layers/ssm/config.py | 1 - fast_llm/models/ssm/config.py | 8 ++++---- fast_llm/models/ssm/model.py | 6 +++--- setup.cfg | 1 - tests/utils/depends.py | 12 +++++++++++- tests/utils/model_configs.py | 2 +- 6 files changed, 19 insertions(+), 11 deletions(-) diff --git a/fast_llm/layers/ssm/config.py b/fast_llm/layers/ssm/config.py index 6837507f9..fd9c60ece 100644 --- a/fast_llm/layers/ssm/config.py +++ b/fast_llm/layers/ssm/config.py @@ -28,7 +28,6 @@ class SSMBlockType(enum.StrEnum): mamba = "m" mamba2_discrete = "m2d" - mamba2 = "m2" transformer = "t" diff --git a/fast_llm/models/ssm/config.py b/fast_llm/models/ssm/config.py index e27e52802..22f81fa15 100644 --- a/fast_llm/models/ssm/config.py +++ b/fast_llm/models/ssm/config.py @@ -30,14 +30,14 @@ class HybridSSMBaseModelConfig(LanguageModelBaseConfig): desc="Configuration for the transformer architecture.", hint=FieldHint.architecture, ) - hybrid_block_layout: list[str] | None = Field( + hybrid_block_layout: list[SSMBlockType] | None = Field( default=None, - desc=f"Pattern of blocks to use in the model. Availabel types: {SSMBlockType.__members__.values()}", + desc=f"Pattern of blocks to use in the model. Available types: {SSMBlockType.__members__.values()}", hint=FieldHint.core, ) - default_mtp_type: str | None = Field( + default_mtp_type: SSMBlockType | None = Field( default=None, - desc="Multi-token prediction mixer to use in the model. 't' for Transformer, 'm' for Mamba1, 'm2' for discrete Mamba2. If None, will use the last block type in `hybrid_block_layout`.", + desc="Multi-token prediction mixer to use in the model. 
If None, will use the last block type in `hybrid_block_layout`.", hint=FieldHint.optional, ) use_megatron_initialization: bool = Field( diff --git a/fast_llm/models/ssm/model.py b/fast_llm/models/ssm/model.py index 118a195b8..526d66c01 100644 --- a/fast_llm/models/ssm/model.py +++ b/fast_llm/models/ssm/model.py @@ -88,7 +88,7 @@ def get_layers(self) -> list[Layer]: # Create blocks according to pattern for i, block_type in enumerate(self._config.hybrid_block_layout): - if block_type == SSMBlockType.transformer.value: + if block_type == SSMBlockType.transformer: # Transformer block layers.append( TransformerLayer( @@ -100,7 +100,7 @@ def get_layers(self) -> list[Layer]: ), ) ) - elif block_type == SSMBlockType.mamba2_discrete.value: + elif block_type == SSMBlockType.mamba2_discrete: mamba_block = self.SSM_BLOCK_CLS( config_transformer=self._config.transformer, config_ssm=self._config.ssm, @@ -113,7 +113,7 @@ def get_layers(self) -> list[Layer]: ) layers.append(mamba_block) - elif block_type == SSMBlockType.mamba.value: + elif block_type == SSMBlockType.mamba: # Create Mamba block mamba_block = self.SSM_BLOCK_CLS( config_transformer=self._config.transformer, diff --git a/setup.cfg b/setup.cfg index 8a446064d..24efcaf35 100644 --- a/setup.cfg +++ b/setup.cfg @@ -54,7 +54,6 @@ DEV = pre-commit>=4.2.0 # Required for testing pytest>=8.4.0 - pytest-depends>=1.0.1 pytest-xdist>=3.7.0 # Somehow needed for Megatron to work with base image 24.11 setuptools>=80.9.0 diff --git a/tests/utils/depends.py b/tests/utils/depends.py index 3fbb8f398..3972a066d 100644 --- a/tests/utils/depends.py +++ b/tests/utils/depends.py @@ -49,7 +49,17 @@ def as_list(lst): class DependencyManager: - """Keep track of tests, their names and their dependencies.""" + """ + A simplified and improved version of pytest-depends. Main differences are the following: + * Add compatibility with pytest-xdist: group connected components of the dependency graph together, + and rename them with the `@dependency_group_{i}` suffix so they are run in the same worker, assuming + group scheduling is used. + * Improved parameterized dependencies so tests can depend on other tests with matching parametrization. + Ex. a test `test_model` with parameter `model` can depend on `test_other[{model}]`, + then `test_model[llama]` will depend on `test_other[llama]`, and so on. + * Improved description of missing/failed dependencies. + * Some option hard-coded for Fast-LLM. 
+ """ def __init__(self, items: list[pytest.Function]): self._items = items diff --git a/tests/utils/model_configs.py b/tests/utils/model_configs.py index cf1246905..d4889e948 100644 --- a/tests/utils/model_configs.py +++ b/tests/utils/model_configs.py @@ -415,7 +415,7 @@ def _update_and_add_testing_config( "hybrid_mamba_2", model_type="hybrid_ssm", extra_args=[ - "model.base_model.hybrid_block_layout=['t','m2']", + "model.base_model.hybrid_block_layout=['t','m2d']", ], megatron_args=None, checkpoint_format=None, From 746542847ed3045fe62819a31a12eacfa17aeb5e Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Thu, 12 Jun 2025 17:24:38 -0400 Subject: [PATCH 19/43] stuff --- fast_llm/layers/ssm/config.py | 3 +- fast_llm/layers/transformer/config.py | 11 +- fast_llm/layers/transformer/transformer.py | 2 +- fast_llm/logging.py | 7 +- fast_llm/models/ssm/config.py | 8 +- fast_llm/models/ssm/model.py | 6 +- setup.cfg | 1 - tests/common.py | 454 --------------------- tests/conftest.py | 183 +++++++-- tests/data/common.py | 2 +- tests/data/test_blending.py | 2 +- tests/data/test_concatenate.py | 2 +- tests/data/test_concatenated_memmap.py | 2 +- tests/data/test_dataset_from_file.py | 2 +- tests/data/test_fim.py | 2 +- tests/data/test_memmap.py | 2 +- tests/data/test_sampling.py | 2 +- tests/data/test_slice.py | 2 +- tests/layers/test_lm_head.py | 2 +- tests/test_checkpoint.py | 16 +- tests/test_config.py | 2 +- tests/test_functional.py | 2 +- tests/test_gpt_generate_and_forward.py | 4 +- tests/test_match_megatron.py | 6 +- tests/test_mb.py | 5 +- tests/test_mb_seq_first.py | 5 +- tests/test_ms.py | 3 +- tests/test_mtp.py | 2 +- tests/test_multi_stage.py | 3 +- tests/test_seq_first.py | 3 +- tests/test_simple.py | 3 +- tests/test_ssms.py | 2 +- tests/test_triton_kernels.py | 2 +- tests/utils/__init__.py | 0 tests/{ => utils}/compare_tensor_logs.py | 0 tests/utils/dataset.py | 80 ++++ tests/utils/depends.py | 211 ++++++++++ tests/utils/model_configs.py | 233 +++++++++++ tests/utils/run_test_script.py | 96 +++++ tests/utils/utils.py | 52 +++ 40 files changed, 881 insertions(+), 544 deletions(-) create mode 100644 tests/utils/__init__.py rename tests/{ => utils}/compare_tensor_logs.py (100%) create mode 100644 tests/utils/dataset.py create mode 100644 tests/utils/depends.py create mode 100644 tests/utils/model_configs.py create mode 100644 tests/utils/run_test_script.py create mode 100644 tests/utils/utils.py diff --git a/fast_llm/layers/ssm/config.py b/fast_llm/layers/ssm/config.py index 13418254c..fd9c60ece 100644 --- a/fast_llm/layers/ssm/config.py +++ b/fast_llm/layers/ssm/config.py @@ -21,14 +21,13 @@ class SSMDimNames: v_heads = "v_heads" # Number of V heads -class SSMBlockType(str, enum.Enum): +class SSMBlockType(enum.StrEnum): """ An enum for the available mamba types for the MLP layer. 
""" mamba = "m" mamba2_discrete = "m2d" - mamba2 = "m2" transformer = "t" diff --git a/fast_llm/layers/transformer/config.py b/fast_llm/layers/transformer/config.py index 9cc9510b5..3e619eb97 100644 --- a/fast_llm/layers/transformer/config.py +++ b/fast_llm/layers/transformer/config.py @@ -711,13 +711,4 @@ def setup_tensor_space(self, tensor_space: TensorSpace) -> None: ) def do_use_flash_attention(self, distributed_config: DistributedConfig) -> bool: - use_flash_attention = self.use_flash_attention and distributed_config.training_dtype in ( - DataType.float16, - DataType.bfloat16, - ) - - # Config parameter `window_size` only can be used with flash attention - if not use_flash_attention: - Assert.is_(self.window_size, None) - - return use_flash_attention + return self.use_flash_attention and distributed_config.training_dtype in (DataType.float16, DataType.bfloat16) diff --git a/fast_llm/layers/transformer/transformer.py b/fast_llm/layers/transformer/transformer.py index b51ba1e94..147452073 100644 --- a/fast_llm/layers/transformer/transformer.py +++ b/fast_llm/layers/transformer/transformer.py @@ -20,7 +20,7 @@ class BaseBlock(Layer, abc.ABC): """ - A transformer-like decoder base block block with abstract mixer. + A transformer-like decoder base block with abstract mixer. """ _mixer_module_name = "self_attn" diff --git a/fast_llm/logging.py b/fast_llm/logging.py index ffeb56f62..9c791ba64 100644 --- a/fast_llm/logging.py +++ b/fast_llm/logging.py @@ -323,16 +323,19 @@ def log_generator[ return log(f"{name} {tensor.view(dtype=torch.int64)[-8:].tolist()}", log_fn=log_fn) +_global_max_allocated = 0 _global_max_reserved = 0 def get_memory_usage_mib(reset_stats: bool = True, relative_to: dict[str, int] | None = None) -> dict[str, float]: - global _global_max_reserved + global _global_max_allocated, _global_max_reserved + max_allocated = torch.cuda.memory_allocated() / 2**20 max_reserved = torch.cuda.max_memory_reserved() / 2**20 + _global_max_allocated = max(max_allocated, _global_max_allocated) _global_max_reserved = max(max_reserved, _global_max_reserved) out = { "allocated": torch.cuda.memory_allocated() / 2**20, - "max_allocated": torch.cuda.max_memory_allocated() / 2**20, + "max_allocated": max_allocated, "reserved": torch.cuda.memory_reserved() / 2**20, "max_reserved": max_reserved, "global_max_reserved": _global_max_reserved, diff --git a/fast_llm/models/ssm/config.py b/fast_llm/models/ssm/config.py index e27e52802..22f81fa15 100644 --- a/fast_llm/models/ssm/config.py +++ b/fast_llm/models/ssm/config.py @@ -30,14 +30,14 @@ class HybridSSMBaseModelConfig(LanguageModelBaseConfig): desc="Configuration for the transformer architecture.", hint=FieldHint.architecture, ) - hybrid_block_layout: list[str] | None = Field( + hybrid_block_layout: list[SSMBlockType] | None = Field( default=None, - desc=f"Pattern of blocks to use in the model. Availabel types: {SSMBlockType.__members__.values()}", + desc=f"Pattern of blocks to use in the model. Available types: {SSMBlockType.__members__.values()}", hint=FieldHint.core, ) - default_mtp_type: str | None = Field( + default_mtp_type: SSMBlockType | None = Field( default=None, - desc="Multi-token prediction mixer to use in the model. 't' for Transformer, 'm' for Mamba1, 'm2' for discrete Mamba2. If None, will use the last block type in `hybrid_block_layout`.", + desc="Multi-token prediction mixer to use in the model. 
If None, will use the last block type in `hybrid_block_layout`.", hint=FieldHint.optional, ) use_megatron_initialization: bool = Field( diff --git a/fast_llm/models/ssm/model.py b/fast_llm/models/ssm/model.py index 118a195b8..526d66c01 100644 --- a/fast_llm/models/ssm/model.py +++ b/fast_llm/models/ssm/model.py @@ -88,7 +88,7 @@ def get_layers(self) -> list[Layer]: # Create blocks according to pattern for i, block_type in enumerate(self._config.hybrid_block_layout): - if block_type == SSMBlockType.transformer.value: + if block_type == SSMBlockType.transformer: # Transformer block layers.append( TransformerLayer( @@ -100,7 +100,7 @@ def get_layers(self) -> list[Layer]: ), ) ) - elif block_type == SSMBlockType.mamba2_discrete.value: + elif block_type == SSMBlockType.mamba2_discrete: mamba_block = self.SSM_BLOCK_CLS( config_transformer=self._config.transformer, config_ssm=self._config.ssm, @@ -113,7 +113,7 @@ def get_layers(self) -> list[Layer]: ) layers.append(mamba_block) - elif block_type == SSMBlockType.mamba.value: + elif block_type == SSMBlockType.mamba: # Create Mamba block mamba_block = self.SSM_BLOCK_CLS( config_transformer=self._config.transformer, diff --git a/setup.cfg b/setup.cfg index 8a446064d..24efcaf35 100644 --- a/setup.cfg +++ b/setup.cfg @@ -54,7 +54,6 @@ DEV = pre-commit>=4.2.0 # Required for testing pytest>=8.4.0 - pytest-depends>=1.0.1 pytest-xdist>=3.7.0 # Somehow needed for Megatron to work with base image 24.11 setuptools>=80.9.0 diff --git a/tests/common.py b/tests/common.py index d531972e7..a2dba74a6 100644 --- a/tests/common.py +++ b/tests/common.py @@ -1,470 +1,16 @@ import os -import pathlib -import random -import shutil -import string -import subprocess import sys -import numpy as np -import pytest -import torch -import yaml - -from fast_llm.data.dataset.gpt.memmap import GPTMemmapDataset -from fast_llm.data.dataset.gpt.sampled import GPTSample -from fast_llm.layers.ssm.config import SSMConfig -from fast_llm.layers.transformer.config import TransformerConfig -from fast_llm.models.gpt.config import ( - LlamaGPTHuggingfaceCheckpointFormat, - MistralGPTHuggingfaceCheckpointFormat, - MixtralGPTHuggingfaceCheckpointFormat, - MTPLlamaGPTHuggingfaceCheckpointFormat, - Qwen2GPTHuggingfaceCheckpointFormat, - Starcoder2GPTHuggingfaceCheckpointFormat, -) -from fast_llm.models.ssm.config import HybridSSMBaseModelConfig, LLambaHuggingfaceCheckpointFormat -from fast_llm.tools.train import CliTrainingConfig -from tests.compare_tensor_logs import CompareConfig, compare_tensor_logs - # FIXME: figure out correct import of megatron modules without this hack sys.path.append(os.getcwd()) # TODO: Use `pytest_addoption` instead? # Keep all results in one place to allow recovering them for debugging in case of failure. 
-TEST_RESULTS_PATH = pathlib.Path(os.environ.get("TEST_RESULTS_PATH", "/tmp/fast_llm_tests")).resolve() -FORCE_REUSE_RESULTS = int(os.environ.get("FORCE_REUSE_RESULTS", 0)) != 0 -REUSE_RESULTS = FORCE_REUSE_RESULTS or int(os.environ.get("REUSE_RESULTS", 0)) != 0 -_LOG_LEVEL = int(os.environ.get("LOG_LEVEL", 13)) -TEST_MODEL = os.environ.get("MODEL", "llama") - -ARTIFACT_PATH = "runs/0/artifacts" -TOKENIZER_PATH = TEST_RESULTS_PATH / "tokenizer" / "common" -TOKENIZER_FILE = TOKENIZER_PATH / "tokenizer.json" -DATASET_CACHE = TEST_RESULTS_PATH / "dataset" -DATASET_PREFIX = DATASET_CACHE / "common" / "dataset" -DATASET_SAMPLING_CACHE = TEST_RESULTS_PATH / "dataset" / "cache" - -TEST_VOCAB_SIZE = 8192 # Random lowercase: 80.7% (3.1% each); space: 18.6%; doc end: 0.6% -TEST_CHARACTERS = (string.ascii_lowercase) * 5 + " " * 30 + "\n" -TEST_DATASET_TOKENS = 1000000 - -CONFIG_BASE_FAST_LLM = [ - "training.logs.interval=1", - "run.tensor_logs.save=True", - "run.tensor_logs.show=False", - "model.base_model.transformer.num_layers=2", - "model.base_model.transformer.hidden_size=256", - "model.base_model.transformer.num_attention_heads=8", - "model.base_model.transformer.init_method_std=0.022", - f"model.base_model.vocab_size={TEST_VOCAB_SIZE}", - f"model.multi_stage.debug_param_init={_LOG_LEVEL}", - f"model.multi_stage.debug_layer_outputs={_LOG_LEVEL}", - f"model.multi_stage.debug_layer_gradients={_LOG_LEVEL}", - f"model.multi_stage.debug_all_param_gradients={_LOG_LEVEL}", - "model.multi_stage.debug_tensor_parallel=True", - "model.distributed.reproducible_init=True", - "model.distributed.timeout=10", - "training.train_iters=2", - "training.num_workers=0", - "training.timeout=30", - "batch.batch_size=8", - "batch.sequence_length=512", - "data.datasets.training.type=slice", - "data.datasets.training.end=0.969", - "data.datasets.training.dataset.type=memmap", - f"data.datasets.training.dataset.path={DATASET_PREFIX}", - "data.datasets.validation.type=slice", - "data.datasets.validation.begin=0.969", - "data.datasets.validation.end=0.999", - "data.datasets.validation.dataset.type=memmap", - f"data.datasets.validation.dataset.path={DATASET_PREFIX}", - "data.datasets.test.type=slice", - "data.datasets.test.begin=0.999", - "data.datasets.test.end=1", - "data.datasets.test.dataset.type=memmap", - f"data.datasets.test.dataset.path={DATASET_PREFIX}", - "optimizer.learning_rate.base=0.0001", -] -CONFIG_BASE_MEGATRON = [ - "--num-layers=2", - "--hidden-size=256", - "--num-attention-heads=8", - "--log-interval=1", - "--train-iters=2", - "--eval-iters=0", - "--hidden-dropout=0", - "--attention-dropout=0", - f"--debug_param_init={_LOG_LEVEL}", - f"--debug_layer_outputs={_LOG_LEVEL}", - f"--debug_layer_gradients={_LOG_LEVEL}", - f"--debug_all_param_gradients={_LOG_LEVEL}", - "--debug_param_update=0", - "--global-batch-size=8", - "--max-position-embeddings=512", - "--seq-length=512", - "--init-method-std=0.022", - "--lr=0.0001", - "--num-workers=0", - "--valid-num-workers=0", - "--tokenizer-type=NullTokenizer", - # Megatron messes with the vocab size, so we have to subtract 1. - f"--vocab-size={TEST_VOCAB_SIZE-1}", - f"--data-path={DATASET_PREFIX}", - "--lr-decay-style=constant", - # Initialization is set up to match MCore models (MCore inverts self-attn qkv and dense layers compared to original Megatron) - "--use-mcore-models", - # local implementation doesn't allow for RMS norm. 
- "--transformer-impl=transformer_engine", -] - -CONFIG_SC1_FAST_LLM = CONFIG_BASE_FAST_LLM + ["model.base_model.max_position_embeddings=512"] -CONFIG_SC1_MEGATRON = CONFIG_BASE_MEGATRON + ["--group-query-attention"] -CONFIG_SC1_COMMON = CONFIG_SC1_FAST_LLM + ["model.distributed.training_dtype=bf16"] - -CONFIG_GPT2_FAST_LLM = CONFIG_SC1_FAST_LLM + ["model.base_model.transformer.head_groups=8"] -CONFIG_GPT2_MEGATRON = CONFIG_BASE_MEGATRON -CONFIG_GPT2_COMMON = CONFIG_GPT2_FAST_LLM + ["model.distributed.training_dtype=bf16"] - -CONFIG_SC2_FAST_LLM = CONFIG_BASE_FAST_LLM + [ - "model.base_model.transformer.head_groups=4", - "model.base_model.transformer.rotary.type=default", -] -CONFIG_SC2_MEGATRON = CONFIG_SC1_MEGATRON + [ - "--num-query-groups=4", - "--use-rotary-position-embeddings", - "--no-position-embedding", -] -CONFIG_SC2_COMMON = CONFIG_SC2_FAST_LLM + ["model.distributed.training_dtype=bf16"] - -CONFIG_LLAMA_MEGATRON = CONFIG_SC2_MEGATRON + [ - "--swiglu", - "--disable-bias-linear", - "--normalization=RMSNorm", - "--ffn-hidden-size=1024", - "--untie-embeddings-and-output-weights", -] -CONFIG_LLAMA_FAST_LLM = CONFIG_SC2_FAST_LLM + [ - "model.base_model.transformer.gated=True", - "model.base_model.transformer.activation_type=silu", - "model.base_model.transformer.add_linear_biases=False", - "model.base_model.transformer.normalization.type=rms_norm", - "model.base_model.transformer.ffn_hidden_size=1024", - "model.base_model.tie_word_embeddings=False", -] -CONFIG_LLAMA_COMMON = CONFIG_LLAMA_FAST_LLM + ["model.distributed.training_dtype=bf16"] # Megatron does not support Llama3-style Rotary Embeddings -CONFIG_LLAMA3_MEGATRON = None -CONFIG_LLAMA3_FAST_LLM = CONFIG_LLAMA_FAST_LLM + [ - "model.base_model.transformer.rotary.type=llama3", -] -CONFIG_LLAMA3_COMMON = CONFIG_LLAMA3_FAST_LLM + ["model.distributed.training_dtype=bf16"] # Megatron does not support per sub layer biases -CONFIG_QWEN2_MEGATRON = None -CONFIG_QWEN2_FAST_LLM = CONFIG_SC2_FAST_LLM + [ - "model.base_model.transformer.gated=True", - "model.base_model.transformer.activation_type=silu", - "model.base_model.transformer.add_linear_biases=only_attn_qkv", - "model.base_model.transformer.normalization.type=rms_norm", - "model.base_model.transformer.ffn_hidden_size=1024", - "model.base_model.tie_word_embeddings=False", -] -CONFIG_QWEN2_COMMON = CONFIG_QWEN2_FAST_LLM + ["model.distributed.training_dtype=bf16"] # Yarn-style Rotary Embeddings -CONFIG_LLAMA_YARN_MEGATRON = None -CONFIG_LLAMA_YARN_FAST_LLM = CONFIG_LLAMA_FAST_LLM + [ - "model.base_model.transformer.rotary.type=yarn", -] -CONFIG_LLAMA_YARN_COMMON = CONFIG_LLAMA_YARN_FAST_LLM + ["model.distributed.training_dtype=bf16"] - - -CONFIG_MIXTRAL_MEGATRON = CONFIG_LLAMA_MEGATRON + [ - "--num-experts=4", - "--moe-router-topk=4", -] -CONFIG_MIXTRAL_FAST_LLM = CONFIG_LLAMA_FAST_LLM + [ - "model.base_model.transformer.num_experts=4", - "model.base_model.transformer.num_experts_per_token=4", -] -CONFIG_MIXTRAL_COMMON = CONFIG_MIXTRAL_FAST_LLM + ["model.distributed.training_dtype=bf16"] -CONFIG_MIXTRAL_YARN_MEGATRON = None -CONFIG_MIXTRAL_YARN_FAST_LLM = CONFIG_MIXTRAL_FAST_LLM + [ - "model.base_model.transformer.rotary.type=yarn", -] -CONFIG_MIXTRAL_YARN_COMMON = CONFIG_MIXTRAL_YARN_FAST_LLM + ["model.distributed.training_dtype=bf16"] - -CONFIG_LLAMA_MTP_MEGATRON = None -CONFIG_LLAMA_MTP_FAST_LLM = CONFIG_LLAMA_FAST_LLM + [ - "model.base_model.prediction_heads=4", -] -CONFIG_LLAMA_MTP_COMMON = CONFIG_LLAMA_MTP_FAST_LLM + ["model.distributed.training_dtype=bf16"] - 
-CONFIG_LLAMBA_FAST_LLM = CONFIG_LLAMA_FAST_LLM + ["model.base_model.hybrid_block_layout==['t','m']"] -CONFIG_LLAMBA_MEGATRON = CONFIG_LLAMA_MEGATRON + [] -CONFIG_LLAMBA_COMMON = CONFIG_LLAMBA_FAST_LLM - -_CONFIGS = { - "gpt2": ("gpt", CONFIG_GPT2_FAST_LLM, CONFIG_GPT2_MEGATRON, CONFIG_GPT2_COMMON, None), - "sc1": ("gpt", CONFIG_SC1_FAST_LLM, CONFIG_SC1_MEGATRON, CONFIG_SC1_COMMON, None), - "starcoder2": ( - "gpt", - CONFIG_SC2_FAST_LLM, - CONFIG_SC2_MEGATRON, - CONFIG_SC2_COMMON, - Starcoder2GPTHuggingfaceCheckpointFormat, - ), - "llama": ( - "gpt", - CONFIG_LLAMA_FAST_LLM, - CONFIG_LLAMA_MEGATRON, - CONFIG_LLAMA_COMMON, - LlamaGPTHuggingfaceCheckpointFormat, - ), - "llama3": ( - "gpt", - CONFIG_LLAMA3_FAST_LLM, - CONFIG_LLAMA3_MEGATRON, - CONFIG_LLAMA3_COMMON, - LlamaGPTHuggingfaceCheckpointFormat, - ), - "qwen2": ( - "gpt", - CONFIG_QWEN2_FAST_LLM, - CONFIG_QWEN2_MEGATRON, - CONFIG_QWEN2_COMMON, - Qwen2GPTHuggingfaceCheckpointFormat, - ), - "llama-yarn": ( - "gpt", - CONFIG_LLAMA_YARN_FAST_LLM, - CONFIG_LLAMA_YARN_MEGATRON, - CONFIG_LLAMA_YARN_COMMON, - LlamaGPTHuggingfaceCheckpointFormat, - ), - "mistral": ( - "gpt", - CONFIG_LLAMA_FAST_LLM, - CONFIG_LLAMA_MEGATRON, - CONFIG_LLAMA_COMMON, - MistralGPTHuggingfaceCheckpointFormat, - ), - "mixtral": ( - "gpt", - CONFIG_MIXTRAL_FAST_LLM, - CONFIG_MIXTRAL_MEGATRON, - CONFIG_MIXTRAL_COMMON, - MixtralGPTHuggingfaceCheckpointFormat, - ), - "llamba": ( - "hybrid_ssm", - CONFIG_LLAMBA_FAST_LLM, - CONFIG_LLAMBA_MEGATRON, - CONFIG_LLAMBA_COMMON, - LLambaHuggingfaceCheckpointFormat, - ), - "mixtral-yarn": ( - "gpt", - CONFIG_MIXTRAL_YARN_FAST_LLM, - CONFIG_MIXTRAL_YARN_MEGATRON, - CONFIG_MIXTRAL_YARN_COMMON, - MixtralGPTHuggingfaceCheckpointFormat, - ), - "llama-mtp": ( - "gpt", - CONFIG_LLAMA_MTP_FAST_LLM, - CONFIG_LLAMA_MTP_MEGATRON, - CONFIG_LLAMA_MTP_COMMON, - MTPLlamaGPTHuggingfaceCheckpointFormat, - ), -} - -TEST_MODEL_TYPE, CONFIG_FAST_LLM, CONFIG_GPT2, CONFIG_COMMON, HUGGINGFACE_CHECKPOINT_FORMAT = _CONFIGS[TEST_MODEL] - - -requires_cuda = pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available") - - -def get_test_dataset( - prefix: pathlib.Path = DATASET_PREFIX, - seed: int = 1234, - num_tokens: int = TEST_DATASET_TOKENS, - characters: str = TEST_CHARACTERS, - vocab_size: int = TEST_VOCAB_SIZE, - max_spans: int = 0, -): - if not TOKENIZER_FILE.is_file(): - import transformers - - transformers.AutoTokenizer.from_pretrained("bigcode/santacoder").save_pretrained(TOKENIZER_PATH) - - if not ( - prefix.with_suffix(".idx").is_file() - and prefix.with_suffix(".bin").is_file() - and prefix.parent.joinpath("fast_llm_config.yaml").is_file() - ): - import transformers - - texts = "".join(random.Random(seed).choices(characters, k=num_tokens)).splitlines() - tokenizer = transformers.AutoTokenizer.from_pretrained(TOKENIZER_PATH) - - samples = [ - GPTSample(np.array(tokenizer(document)["input_ids"], dtype=np.uint16) % vocab_size) for document in texts - ] - if max_spans > 0: - lengths = np.array([max(len(sample.token_ids), 1) for sample in samples]) - spans = np.sort(np.random.RandomState(seed + 3847).randint(0, lengths[:, None], [len(samples), max_spans])) - for sample, span in zip(samples, spans): - span = np.unique(span) - sample.loss_masking_spans = span[: len(span) // 2 * 2].reshape(-1, 2) - - GPTMemmapDataset.write_dataset(prefix, samples) - yaml.safe_dump( - {"type": "memmap", "path": prefix.name}, prefix.parent.joinpath("fast_llm_config.yaml").open("w") - ) - - -def get_test_concatenated_memmap_dataset( - path: 
pathlib.Path, - num_files: int, - seed: int = 1234, - num_tokens: int = TEST_DATASET_TOKENS, - characters: str = TEST_CHARACTERS, - vocab_size: int = TEST_VOCAB_SIZE, - seed_shift: int = 55, -): - index_file = path / "index.txt" - if not index_file.is_file(): - for i in range(num_files): - get_test_dataset( - prefix=path / f"dataset_{i}", - seed=seed + i * seed_shift, - num_tokens=num_tokens, - characters=characters, - vocab_size=vocab_size, - ) - index_file.open("w").writelines([str(path / f"dataset_{i}") + "\n" for i in range(num_files)]) - - -@pytest.fixture(scope="session") -def run_test_script(worker_resources): - def do_run_test_script( - name: str, - script: list[str], - num_gpus: int = 1, - *, - model_type: str = TEST_MODEL_TYPE, - is_megatron: bool = False, - compare: str | None = None, - config: CompareConfig | None = None, - prepare_fn=None, - compare_fn=None, - do_compare: bool = True, - ): - if torch.cuda.device_count() < num_gpus: - pytest.skip(f"Not enough GPUs to run test ({torch.cuda.device_count()}<{num_gpus})") - env = os.environ.copy() - if is_megatron: - # Prevent Megatron from complaining. - env["CUDA_DEVICE_MAX_CONNECTIONS"] = "1" - env["NVTE_FLASH_ATTN"] = "0" - path = TEST_RESULTS_PATH / name - skip = False - artifact_path = path / ARTIFACT_PATH - if path.exists(): - assert path.is_dir() - # TODO: Better way to check if the previous attempt succeeded. - if ( - REUSE_RESULTS - and artifact_path.is_dir() - and len(list((artifact_path / "0").iterdir())) >= (1 if is_megatron else 3) - ): - skip = True - elif FORCE_REUSE_RESULTS: - raise RuntimeError(artifact_path) - else: - shutil.rmtree(path) - elif FORCE_REUSE_RESULTS: - raise RuntimeError(path) - if prepare_fn is not None: - skip = prepare_fn(TEST_RESULTS_PATH / name, None if compare is None else TEST_RESULTS_PATH / compare, skip) - if is_megatron: - script = [*script, f"--structured-logs-dir={path}", f"--data-cache-path={path}"] - else: - script = [model_type, *script, f"run.experiment_dir={path}"] - header = ["Megatron-LM/pretrain_gpt.py"] if is_megatron else ["--no-python", "fast-llm", "train"] - command = [ - "python", - "-m", - "torch.distributed.run", - f"--nproc-per-node={num_gpus}", - f"--rdzv-endpoint=localhost:{worker_resources.rendezvous_port}", - f"--master-port={worker_resources.torchrun_port}", - *header, - *script, - ] - print(" ".join(command)) - if skip: - print("Reusing existing run.") - else: - get_test_dataset() - if num_gpus == 1 and not is_megatron: - CliTrainingConfig.parse_and_run(script) - else: - completed_proc = subprocess.run(command, env=env, timeout=60) - if completed_proc.returncode: - raise RuntimeError(f"Process failed with return code {completed_proc.returncode}") - if compare and do_compare: - if compare_fn is not None: - compare_fn(TEST_RESULTS_PATH / name, TEST_RESULTS_PATH / compare) - compare_tensor_logs( - TEST_RESULTS_PATH / compare / ARTIFACT_PATH, - TEST_RESULTS_PATH / name / ARTIFACT_PATH, - config, - ) - - return do_run_test_script - - -def materialize_meta_tensors(model, tensor_space): - # Materialize parameters that are on meta device - for name, param in model.named_parameters(): - if param.device.type == "meta": - # Check if the parameter is a custom tensor type - if hasattr(param, "tensor_name") and hasattr(param, "init_parameter"): - param_data = param.new_empty(param.shape, device="cuda") - # Initialize param_data - param.init_parameter(param_data, tensor_space.distributed) - # Replace the parameter in the module - module_path, param_name = name.rsplit(".", 1) if 
"." in name else (None, name) - module = model - if module_path is not None: - for part in module_path.split("."): - module = getattr(module, part) - param = torch.nn.Parameter(param_data, requires_grad=param.requires_grad) - # TODO: add param_grad_is_zero etc., grad_buffer, etc., see test_mlp_recomputation - param.grad = None - param.grad_buffer = torch.empty_like(param) - param.param_grad_is_zero = True - module._parameters[param_name] = param - return model - - -def get_hybrid_config(hybrid_block_layout=["t", "m"], prediction_heads=1, default_mtp_type=None): - config = HybridSSMBaseModelConfig( - transformer=TransformerConfig(num_layers=len(hybrid_block_layout)), - ssm=SSMConfig(), - hybrid_block_layout=hybrid_block_layout, - prediction_heads=prediction_heads, - default_mtp_type=default_mtp_type, - init_method_std_embed=0.02, - init_method_min_embed=-0.02, - init_method_max_embed=0.02, - use_position_embeddings=True, - tie_word_embeddings=False, - ) - return config diff --git a/tests/conftest.py b/tests/conftest.py index edc52e034..284f4140a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,26 +1,39 @@ import dataclasses +import gc +import json +import logging import math import os -import networkx import pytest -import pytest_depends -import pytest_depends.main import torch -from xdist.scheduler import LoadGroupScheduling +import xdist.scheduler + +import fast_llm.logging +from tests.utils.depends import DependencyManager # Make fixtures available globally without import -from tests.common import run_test_script # isort: skip + +manager: DependencyManager | None = None def pytest_addoption(parser): - parser.addoption("--skip-slow", action="store_true") - parser.addoption( + group = parser.getgroup("fast_llm") + group.addoption("--skip-slow", action="store_true") + group.addoption("--show-skipped", action="store_true") + group.addoption("--show-gpu-memory", type=int, default=10) + group.addoption( "--run-extra-slow", action="store_true", default=False, help="Run tests marked as extra_slow", ) + group.addoption( + "--show-dependencies", + action="store_true", + default=False, + help="List all dependencies of all tests as a list of nodeids + the names that could not be resolved.", + ) @dataclasses.dataclass @@ -42,6 +55,8 @@ def pytest_configure(config): config.addinivalue_line( "markers", "extra_slow: Mark test as extra slow and skip unless --run-extra-slow is given." ) + config.addinivalue_line("markers", "depends_on(name='name', on=['other_name']): marks dependencies between tests.") + config.addinivalue_line("markers", "model_testing_group(group='group'): marks model testing group.") # TODO: Spawned processes (multi-gpu, Megatron) ignore resource allocation. 
is_parallel = hasattr(config, "workerinput") if is_parallel: @@ -90,7 +105,12 @@ def pytest_configure(config): @pytest.hookimpl(trylast=True) -def pytest_collection_modifyitems(config, items): +def pytest_collection_modifyitems(config, items: list[pytest.Function]): + global manager + skip_slow = config.getoption("--skip-slow") + skip_extra_slow = not config.getoption("--run-extra-slow") + show_skipped = config.getoption("--show-skipped") + if config.getoption("--skip-slow"): skip_slow = pytest.mark.skip(reason="Skipping slow tests") for item in items: @@ -102,26 +122,131 @@ def pytest_collection_modifyitems(config, items): if "extra_slow" in item.keywords: item.add_marker(skip_extra_slow) - manager: pytest_depends.DependencyManager = pytest_depends.managers[-1] - # Build the undirected graph as in `DependencyManager.sorted_items`. - dag = networkx.DiGraph() - for item in manager.items: - node_id = pytest_depends.clean_nodeid(item.nodeid) - dag.add_node(node_id) - for dependency in manager.dependencies[node_id].dependencies: - dag.add_edge(dependency, node_id) - # Mark dependency groups for xdist. - manager.groups = {} - for i, node_ids in enumerate(sorted(networkx.weakly_connected_components(dag), key=len, reverse=True)): - if len(node_ids) > 1: - for node_id in node_ids: - manager.nodeid_to_item[node_id]._nodeid = ( - f"{manager.nodeid_to_item[node_id]._nodeid}@dependency_group_{i}" - ) - - old_clean_nodeid = pytest_depends.main.clean_nodeid - # Hack into `clean_nodeid` so pytest_depends recognizes the renamed nodes. - pytest_depends.main.clean_nodeid = lambda nodeid: old_clean_nodeid(nodeid.split("@dependency_group_")[0]) + new_items = [] + for item in items: + if skip_slow and "slow" in item.keywords: + if show_skipped: + item.add_marker(pytest.mark.skip(reason="Skipping slow tests")) + else: + continue + elif skip_extra_slow and "extra_slow" in item.keywords: + if show_skipped: + item.add_marker(pytest.mark.skip(reason="Skipping extra-slow tests")) + else: + continue + new_items.append(item) + + manager = DependencyManager(new_items) + + # Show the extra information if requested + if config.getoption("show_dependencies"): + manager.print_name_map(config.getoption("verbose") > 1) + manager.print_processed_dependencies(config.getoption("color")) + + # Reorder the items so that tests run after their dependencies + items[:] = manager.items + + # If pytest-depends is installed, it will complain about renamed nodes whether it's used or not. + try: + import pytest_depends + except ImportError: + pass + else: + old_clean_nodeid = pytest_depends.main.clean_nodeid + # Hack into `clean_nodeid` so pytest_depends recognizes the renamed nodes. + pytest_depends.main.clean_nodeid = lambda nodeid: old_clean_nodeid(nodeid.split("@dependency_group_")[0]) + + +@pytest.hookimpl(tryfirst=True, hookwrapper=True) +def pytest_runtest_makereport(item: pytest.Function, call: pytest.CallInfo): + outcome = yield + result = outcome.get_result() + manager.register_result(item, result) + + # Measure GPU memory usage. (TODO: This excludes child processes) + if call.when == "call" and torch.cuda.is_available(): + # Free memory for more accurate reporting, and to reduce OOM risk with lots of workers. + # Cublas workspace can unnecessarily keep 100s of MBs of reserved memory. + torch._C._cuda_clearCublasWorkspaces() + # Lots of tensors tend to stay allocated until the next garbage collection. + # Collect only if the remaining memory is significant enough since it's costly. 
+ if torch.cuda.memory_allocated() > 1e7: + gc.collect() + try: + # Actually free the memory. + torch.cuda.empty_cache() + except RuntimeError: + # Happens if the test broke cuda. + return + item.add_report_section( + call.when, + "resource usage", + json.dumps( + { + "duration": call.duration, + # Relevant value for OOM risk. Also look at global max since fast-llm resets stats. + "max_memory_reserved": max( + torch.cuda.max_memory_reserved(), fast_llm.logging._global_max_reserved + ), + # Actual memory usage from the test. + "max_memory_allocated": max( + torch.cuda.max_memory_allocated(), fast_llm.logging._global_max_allocated + ), + "memory_reserved": torch.cuda.memory_reserved(), + "memory_allocated": torch.cuda.memory_allocated(), + } + ), + ) + torch.cuda.reset_peak_memory_stats() + # Reset global stats for next test. + fast_llm.logging._global_max_reserved = 0 + fast_llm.logging._global_max_allocated = 0 + + +@pytest.hookimpl +def pytest_terminal_summary(terminalreporter): + resource_reports = {} + for reports in terminalreporter.stats.values(): + for report in reports: + if isinstance(report, pytest.TestReport): + for _, section in report.get_sections("Captured resource usage"): + if report.nodeid in resource_reports: + logging.error(f"Duplicate resource report for {report.nodeid}") + resource_reports[report.nodeid] = json.loads(section) + + if not resource_reports: + return + + terminalreporter.write_sep("=", "Highest gpu memory usage", bold=True) + sorted_nodeids = sorted( + resource_reports.keys(), + key=lambda nodeid: resource_reports[nodeid]["max_memory_reserved"], + reverse=True, + ) + for nodeid in sorted_nodeids[: terminalreporter.config.getoption("--show-gpu-memory")]: + terminalreporter.write_line( + f"{nodeid}:\n " + f"Max Reserved {resource_reports[nodeid]["max_memory_reserved"] / 1e6:.0f} MB | " + f"Max Allocated {resource_reports[nodeid]["max_memory_allocated"] / 1e6:.0f} MB | " + f"End Reserved {resource_reports[nodeid]["memory_reserved"] / 1e6:.0f} MB | " + f"End Allocated {resource_reports[nodeid]["memory_allocated"] / 1e6:.0f} MB | " + f"Duration {resource_reports[nodeid]["duration"]:.2f}" + ) + + +def pytest_runtest_call(item: pytest.Function): + if torch.cuda.is_available(): + # Empty cache to check if cuda is still working (TODO: Is there a better way? Can we kill the worker?) + try: + torch.cuda.empty_cache() + except RuntimeError: + pytest.skip("Cuda runtime unavailable due to an error in an earlier test.") + manager.handle_missing(item) + + +def pytest_unconfigure(): + global manager + manager = None @pytest.fixture(scope="session") @@ -133,4 +258,4 @@ def worker_resources(request) -> WorkerResources: def pytest_xdist_make_scheduler(config, log): # Always use grouped load balancing to handle dependencies, and make it work with `-n`.
assert config.getvalue("dist") == "load" - return LoadGroupScheduling(config, log) + return xdist.scheduler.LoadGroupScheduling(config, log) diff --git a/tests/data/common.py b/tests/data/common.py index cacb28e6b..2d3cb905f 100644 --- a/tests/data/common.py +++ b/tests/data/common.py @@ -23,7 +23,7 @@ from fast_llm.engine.distributed.distributed import Distributed from fast_llm.models.gpt.config import GPTBatchConfig from fast_llm.utils import Assert, div -from tests.common import TEST_VOCAB_SIZE +from tests.utils.dataset import TEST_VOCAB_SIZE def get_sampling_data( diff --git a/tests/data/test_blending.py b/tests/data/test_blending.py index de97eaa21..438782dfe 100644 --- a/tests/data/test_blending.py +++ b/tests/data/test_blending.py @@ -5,13 +5,13 @@ from fast_llm.data.dataset.gpt.config import GPTBlendedDatasetConfig from fast_llm.utils import Assert, normalize_probabilities -from tests.common import DATASET_CACHE, DATASET_PREFIX, get_test_dataset from tests.data.common import ( compare_sampled_dataset, get_dataset_config, get_sampling_data, get_test_data_and_compare_samples, ) +from tests.utils.dataset import DATASET_CACHE, DATASET_PREFIX, get_test_dataset _DATASET_PREFIX_MIX_1 = DATASET_CACHE / "blended_mix_1" / "dataset" diff --git a/tests/data/test_concatenate.py b/tests/data/test_concatenate.py index 1142d5364..e951cc2b1 100644 --- a/tests/data/test_concatenate.py +++ b/tests/data/test_concatenate.py @@ -1,5 +1,4 @@ from fast_llm.data.dataset.gpt.config import GPTConcatenatedDatasetConfig -from tests.common import DATASET_PREFIX, get_test_dataset from tests.data.common import ( compare_indexed_dataset, compare_sampled_dataset, @@ -8,6 +7,7 @@ get_test_data_and_compare_samples, ) from tests.data.test_memmap import MEMMAP_DATASET_LENGTH, MEMMAP_DATASET_SAMPLES, MEMMAP_DATASET_TOKENS +from tests.utils.dataset import DATASET_PREFIX, get_test_dataset GPT_CONCATENATED_SAMPLES = [ [4709, 819, 79, 207, 277, 1790], diff --git a/tests/data/test_concatenated_memmap.py b/tests/data/test_concatenated_memmap.py index 09929040d..0ab7c7fe4 100644 --- a/tests/data/test_concatenated_memmap.py +++ b/tests/data/test_concatenated_memmap.py @@ -1,5 +1,4 @@ from fast_llm.data.dataset.gpt.config import GPTConcatenatedMemmapConfig -from tests.common import DATASET_CACHE, get_test_concatenated_memmap_dataset from tests.data.common import ( compare_indexed_dataset, get_dataset_config, @@ -8,6 +7,7 @@ validate_indexed_dataset_sampling, ) from tests.data.test_memmap import MEMMAP_DATASET_SAMPLES +from tests.utils.dataset import DATASET_CACHE, get_test_concatenated_memmap_dataset _DATASET_PREFIX_MIX_CONCATENATED_MEMMAP = DATASET_CACHE / "concatenated_memmap" diff --git a/tests/data/test_dataset_from_file.py b/tests/data/test_dataset_from_file.py index 280b34137..3f7d1a139 100644 --- a/tests/data/test_dataset_from_file.py +++ b/tests/data/test_dataset_from_file.py @@ -1,7 +1,7 @@ from fast_llm.data.dataset.gpt.config import GPTDatasetFromFileConfig -from tests.common import DATASET_PREFIX, get_test_dataset from tests.data.common import compare_indexed_dataset, get_dataset_config from tests.data.test_memmap import MEMMAP_DATASET_LENGTH, MEMMAP_DATASET_SAMPLES, MEMMAP_DATASET_TOKENS +from tests.utils.dataset import DATASET_PREFIX, get_test_dataset def test_dataset_from_file(): diff --git a/tests/data/test_fim.py b/tests/data/test_fim.py index 7b614d2fe..7472f1958 100644 --- a/tests/data/test_fim.py +++ b/tests/data/test_fim.py @@ -1,13 +1,13 @@ from fast_llm.data.config import TokenizerConfig from 
fast_llm.data.dataset.gpt.config import GPTFimSampledDatasetConfig from fast_llm.data.tokenizer import Tokenizer -from tests.common import DATASET_PREFIX, TOKENIZER_PATH, get_test_dataset from tests.data.common import ( compare_sampled_dataset, get_dataset_config, get_sampling_data, get_test_data_and_compare_samples, ) +from tests.utils.dataset import DATASET_PREFIX, TOKENIZER_PATH, get_test_dataset GPT_FIM_SAMPLES = [ [4709, 819, 79, 207, 277, 1790], diff --git a/tests/data/test_memmap.py b/tests/data/test_memmap.py index be801220b..fcd7756db 100644 --- a/tests/data/test_memmap.py +++ b/tests/data/test_memmap.py @@ -3,8 +3,8 @@ import pytest from fast_llm.data.dataset.gpt.config import GPTMemmapDatasetConfig -from tests.common import DATASET_CACHE, DATASET_PREFIX, DATASET_SAMPLING_CACHE, get_test_dataset from tests.data.common import compare_indexed_dataset, get_dataset_config +from tests.utils.dataset import DATASET_CACHE, DATASET_PREFIX, DATASET_SAMPLING_CACHE, get_test_dataset MEMMAP_DATASET_LENGTH = 6153 MEMMAP_DATASET_TOKENS = 508327 diff --git a/tests/data/test_sampling.py b/tests/data/test_sampling.py index 386795826..32d76fa4c 100644 --- a/tests/data/test_sampling.py +++ b/tests/data/test_sampling.py @@ -7,13 +7,13 @@ from fast_llm.data.dataset.gpt.indexed import GPTIndexedDataset from fast_llm.data.dataset.gpt.sampled import GPTSample from fast_llm.utils import Assert -from tests.common import DATASET_PREFIX, get_test_dataset from tests.data.common import ( get_dataset_config, get_sampling_data, get_test_data_and_compare_samples, validate_indexed_dataset_sampling, ) +from tests.utils.dataset import DATASET_PREFIX, get_test_dataset try: from fast_llm.csrc.data import build_padded_token_cumsum # noqa diff --git a/tests/data/test_slice.py b/tests/data/test_slice.py index 299e2054e..f8eedc5bc 100644 --- a/tests/data/test_slice.py +++ b/tests/data/test_slice.py @@ -1,5 +1,4 @@ from fast_llm.data.dataset.gpt.config import GPTDatasetSliceConfig -from tests.common import DATASET_PREFIX, get_test_dataset from tests.data.common import ( compare_indexed_dataset, get_dataset_config, @@ -8,6 +7,7 @@ validate_indexed_dataset_sampling, ) from tests.data.test_memmap import MEMMAP_DATASET_SAMPLES +from tests.utils.dataset import DATASET_PREFIX, get_test_dataset GPT_SLICE_TRAINING_SAMPLES = [ [80, 268, 79, 260, 207, 3086], diff --git a/tests/layers/test_lm_head.py b/tests/layers/test_lm_head.py index 7578a5f05..95da48e7e 100644 --- a/tests/layers/test_lm_head.py +++ b/tests/layers/test_lm_head.py @@ -19,7 +19,7 @@ from fast_llm.models.gpt.config import GPTBaseModelConfig from fast_llm.models.gpt.model import GPTBaseModel from fast_llm.utils import Assert -from tests.common import requires_cuda +from tests.utils.utils import requires_cuda def _lm_head( diff --git a/tests/test_checkpoint.py b/tests/test_checkpoint.py index 216f7828a..05a621005 100644 --- a/tests/test_checkpoint.py +++ b/tests/test_checkpoint.py @@ -17,17 +17,11 @@ from fast_llm.engine.multi_stage.config import FastLLMModelConfig, ShardName, StageMode from fast_llm.models.auto import model_registry from fast_llm.tools.convert import ConvertConfig -from tests.common import ( - CONFIG_COMMON, - FORCE_REUSE_RESULTS, - HUGGINGFACE_CHECKPOINT_FORMAT, - REUSE_RESULTS, - TEST_MODEL, - TEST_MODEL_TYPE, - TEST_RESULTS_PATH, - requires_cuda, -) -from tests.compare_tensor_logs import CompareConfig, compare_logged_tensor +from tests.common import CONFIG_COMMON, HUGGINGFACE_CHECKPOINT_FORMAT, TEST_MODEL_TYPE +from tests.utils.compare_tensor_logs 
import CompareConfig, compare_logged_tensor +from tests.utils.model_configs import TEST_MODEL +from tests.utils.run_test_script import FORCE_REUSE_RESULTS, REUSE_RESULTS +from tests.utils.utils import TEST_RESULTS_PATH, requires_cuda TEST_MODEL_CONFIG_CLS = model_registry[TEST_MODEL_TYPE] TEST_MODEL_HF_CLS = TEST_MODEL_CONFIG_CLS.get_huggingface_model_for_causal_lm_class() diff --git a/tests/test_config.py b/tests/test_config.py index 80bed418c..e050cb230 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -14,7 +14,7 @@ from fast_llm.models.auto import trainer_registry from fast_llm.models.gpt.config import GPTModelConfig, PretrainedGPTModelConfig from fast_llm.utils import Assert, check_equal_nested -from tests.common import TEST_RESULTS_PATH +from tests.utils.utils import TEST_RESULTS_PATH def run_without_import(cmd: str): diff --git a/tests/test_functional.py b/tests/test_functional.py index 908a55374..03a0ae8a0 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -8,7 +8,7 @@ from fast_llm.functional.triton.mlp import mlp_autograd, mlp_autograd_looped, torch_mlp_activation from fast_llm.functional.triton.sparse_copy import get_sparse_map from fast_llm.utils import Assert -from tests.common import requires_cuda +from tests.utils.utils import requires_cuda def ref_log_probs_from_logits(logits: torch.Tensor, labels: torch.Tensor, temperature: float = 1.0) -> torch.Tensor: diff --git a/tests/test_gpt_generate_and_forward.py b/tests/test_gpt_generate_and_forward.py index a16d4c716..6e8d43601 100644 --- a/tests/test_gpt_generate_and_forward.py +++ b/tests/test_gpt_generate_and_forward.py @@ -9,7 +9,9 @@ from fast_llm.engine.schedule.runner import ScheduleRunner from fast_llm.models.gpt.config import LlamaGPTHuggingfaceCheckpointFormat, PretrainedGPTModelConfig from fast_llm.models.gpt.huggingface import HuggingfaceGPTModelForCausalLM -from tests.common import CONFIG_COMMON, HUGGINGFACE_CHECKPOINT_FORMAT, TEST_MODEL, TEST_RESULTS_PATH, requires_cuda +from tests.common import CONFIG_COMMON, HUGGINGFACE_CHECKPOINT_FORMAT +from tests.utils.model_configs import TEST_MODEL +from tests.utils.utils import TEST_RESULTS_PATH, requires_cuda def _prepare_checkpoint(model: str) -> str: diff --git a/tests/test_match_megatron.py b/tests/test_match_megatron.py index 1857f0f8f..3d8210860 100644 --- a/tests/test_match_megatron.py +++ b/tests/test_match_megatron.py @@ -1,6 +1,8 @@ import pytest -from tests.common import ( +from tests.utils.compare_tensor_logs import CompareConfig +from tests.utils.dataset import DATASET_PREFIX +from tests.utils.model_configs import ( CONFIG_GPT2_FAST_LLM, CONFIG_GPT2_MEGATRON, CONFIG_LLAMA_FAST_LLM, @@ -11,9 +13,7 @@ CONFIG_SC1_MEGATRON, CONFIG_SC2_FAST_LLM, CONFIG_SC2_MEGATRON, - DATASET_PREFIX, ) -from tests.compare_tensor_logs import CompareConfig @pytest.mark.slow diff --git a/tests/test_mb.py b/tests/test_mb.py index 82ac4c25f..fd6130565 100644 --- a/tests/test_mb.py +++ b/tests/test_mb.py @@ -1,7 +1,8 @@ import pytest -from tests.common import CONFIG_COMMON, TEST_MODEL -from tests.compare_tensor_logs import CompareConfig +from tests.common import CONFIG_COMMON +from tests.utils.compare_tensor_logs import CompareConfig +from tests.utils.model_configs import TEST_MODEL CONFIG_DF = CONFIG_COMMON + ["batch.depth_first_micro_batches=4"] CONFIG_BF = CONFIG_COMMON + ["batch.breadth_first_micro_batches=4"] diff --git a/tests/test_mb_seq_first.py b/tests/test_mb_seq_first.py index 345a7bc49..dd00fd5fc 100644 --- a/tests/test_mb_seq_first.py +++ 
b/tests/test_mb_seq_first.py @@ -1,7 +1,8 @@ import pytest -from tests.common import CONFIG_COMMON, TEST_MODEL -from tests.compare_tensor_logs import CompareConfig +from tests.common import CONFIG_COMMON +from tests.utils.compare_tensor_logs import CompareConfig +from tests.utils.model_configs import TEST_MODEL CONFIG_DF_SF = CONFIG_COMMON + ["batch.depth_first_micro_batches=4", "model.base_model.sequence_first=True"] CONFIG_BF_SF = CONFIG_COMMON + ["batch.breadth_first_micro_batches=4", "model.base_model.sequence_first=True"] diff --git a/tests/test_ms.py b/tests/test_ms.py index 90d166728..55032620b 100644 --- a/tests/test_ms.py +++ b/tests/test_ms.py @@ -1,6 +1,7 @@ import pytest -from tests.common import CONFIG_COMMON, TEST_MODEL +from tests.common import CONFIG_COMMON +from tests.utils.model_configs import TEST_MODEL CONFIG_MS = CONFIG_COMMON + ["batch.micro_sequence_length=256"] diff --git a/tests/test_mtp.py b/tests/test_mtp.py index 71c55e0fc..1f01954e8 100644 --- a/tests/test_mtp.py +++ b/tests/test_mtp.py @@ -15,7 +15,7 @@ from fast_llm.models.gpt.config import GPTBaseModelConfig from fast_llm.models.gpt.model import GPTBaseModel from fast_llm.utils import Assert -from tests.common import get_hybrid_config, materialize_meta_tensors, requires_cuda +from tests.utils.utils import get_hybrid_config, materialize_meta_tensors, requires_cuda try: from fast_llm.layers.ssm.discrete_mamba2 import DiscreteMamba2 diff --git a/tests/test_multi_stage.py b/tests/test_multi_stage.py index bb468cebe..f5f09b1b3 100644 --- a/tests/test_multi_stage.py +++ b/tests/test_multi_stage.py @@ -4,7 +4,8 @@ from fast_llm.layers.transformer.transformer import TransformerLayer from fast_llm.tools.train import CliTrainingConfig from fast_llm.utils import Assert -from tests.common import CONFIG_COMMON, requires_cuda +from tests.common import CONFIG_COMMON +from tests.utils.utils import requires_cuda def _get_trainer_from_args(args: list[str], model_type: str = "gpt") -> Trainer: diff --git a/tests/test_seq_first.py b/tests/test_seq_first.py index a8f4c0363..9ead58e88 100644 --- a/tests/test_seq_first.py +++ b/tests/test_seq_first.py @@ -1,6 +1,7 @@ import pytest -from tests.common import CONFIG_COMMON, TEST_MODEL +from tests.common import CONFIG_COMMON +from tests.utils.model_configs import TEST_MODEL CONFIG_SF = CONFIG_COMMON + ["model.base_model.sequence_first=True"] diff --git a/tests/test_simple.py b/tests/test_simple.py index 3128626d3..1523750f7 100644 --- a/tests/test_simple.py +++ b/tests/test_simple.py @@ -1,6 +1,7 @@ import pytest -from tests.common import CONFIG_COMMON, CONFIG_FAST_LLM, TEST_MODEL +from tests.common import CONFIG_COMMON, CONFIG_FAST_LLM +from tests.utils.model_configs import TEST_MODEL def test_model_safe(run_test_script): diff --git a/tests/test_ssms.py b/tests/test_ssms.py index 36c7b6229..9e7485447 100644 --- a/tests/test_ssms.py +++ b/tests/test_ssms.py @@ -21,7 +21,7 @@ from fast_llm.models.gpt.config import GPTBatchConfig, LlamaGPTHuggingfaceCheckpointFormat from fast_llm.models.ssm.config import AprielSSMHHybridHuggingfaceCheckpointFormat, LLambaHuggingfaceCheckpointFormat from fast_llm.models.ssm.model import HybridSSMBaseModel, HybridSSMModel -from tests.common import get_hybrid_config, materialize_meta_tensors +from tests.utils.utils import get_hybrid_config, materialize_meta_tensors try: from cartesia_pytorch.Llamba.llamba import LlambaLMHeadModel as LMHeadModel diff --git a/tests/test_triton_kernels.py b/tests/test_triton_kernels.py index 108a28982..9befe64fd 100644 --- 
a/tests/test_triton_kernels.py +++ b/tests/test_triton_kernels.py @@ -31,7 +31,7 @@ from fast_llm.layers.transformer.config import RotaryConfig, RotaryEmbeddingType from fast_llm.layers.transformer.preprocessing import get_rotary_frequencies from fast_llm.utils import Assert, rms_diff -from tests.common import requires_cuda +from tests.utils.utils import requires_cuda @requires_cuda diff --git a/tests/utils/__init__.py b/tests/utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/compare_tensor_logs.py b/tests/utils/compare_tensor_logs.py similarity index 100% rename from tests/compare_tensor_logs.py rename to tests/utils/compare_tensor_logs.py diff --git a/tests/utils/dataset.py b/tests/utils/dataset.py new file mode 100644 index 000000000..72888dfdb --- /dev/null +++ b/tests/utils/dataset.py @@ -0,0 +1,80 @@ +import pathlib +import random +import string + +import numpy as np +import yaml + +from fast_llm.data.dataset.gpt.memmap import GPTMemmapDataset +from fast_llm.data.dataset.gpt.sampled import GPTSample +from tests.utils.utils import TEST_RESULTS_PATH + +TOKENIZER_PATH = TEST_RESULTS_PATH / "tokenizer" / "common" +TOKENIZER_FILE = TOKENIZER_PATH / "tokenizer.json" +DATASET_CACHE = TEST_RESULTS_PATH / "dataset" +DATASET_PREFIX = DATASET_CACHE / "common" / "dataset" +DATASET_SAMPLING_CACHE = TEST_RESULTS_PATH / "dataset" / "cache" +TEST_VOCAB_SIZE = 8192 +TEST_CHARACTERS = (string.ascii_lowercase) * 5 + " " * 30 + "\n" +TEST_DATASET_TOKENS = 1000000 + + +def get_test_dataset( + prefix: pathlib.Path = DATASET_PREFIX, + seed: int = 1234, + num_tokens: int = TEST_DATASET_TOKENS, + characters: str = TEST_CHARACTERS, + vocab_size: int = TEST_VOCAB_SIZE, + max_spans: int = 0, +): + if not TOKENIZER_FILE.is_file(): + import transformers + + transformers.AutoTokenizer.from_pretrained("bigcode/santacoder").save_pretrained(TOKENIZER_PATH) + + if not ( + prefix.with_suffix(".idx").is_file() + and prefix.with_suffix(".bin").is_file() + and prefix.parent.joinpath("fast_llm_config.yaml").is_file() + ): + import transformers + + texts = "".join(random.Random(seed).choices(characters, k=num_tokens)).splitlines() + tokenizer = transformers.AutoTokenizer.from_pretrained(TOKENIZER_PATH) + + samples = [ + GPTSample(np.array(tokenizer(document)["input_ids"], dtype=np.uint16) % vocab_size) for document in texts + ] + if max_spans > 0: + lengths = np.array([max(len(sample.token_ids), 1) for sample in samples]) + spans = np.sort(np.random.RandomState(seed + 3847).randint(0, lengths[:, None], [len(samples), max_spans])) + for sample, span in zip(samples, spans): + span = np.unique(span) + sample.loss_masking_spans = span[: len(span) // 2 * 2].reshape(-1, 2) + + GPTMemmapDataset.write_dataset(prefix, samples) + yaml.safe_dump( + {"type": "memmap", "path": prefix.name}, prefix.parent.joinpath("fast_llm_config.yaml").open("w") + ) + + +def get_test_concatenated_memmap_dataset( + path: pathlib.Path, + num_files: int, + seed: int = 1234, + num_tokens: int = TEST_DATASET_TOKENS, + characters: str = TEST_CHARACTERS, + vocab_size: int = TEST_VOCAB_SIZE, + seed_shift: int = 55, +): + index_file = path / "index.txt" + if not index_file.is_file(): + for i in range(num_files): + get_test_dataset( + prefix=path / f"dataset_{i}", + seed=seed + i * seed_shift, + num_tokens=num_tokens, + characters=characters, + vocab_size=vocab_size, + ) + index_file.open("w").writelines([str(path / f"dataset_{i}") + "\n" for i in range(num_files)]) diff --git a/tests/utils/depends.py b/tests/utils/depends.py 
new file mode 100644 index 000000000..3972a066d --- /dev/null +++ b/tests/utils/depends.py @@ -0,0 +1,211 @@ +import re + +import colorama +import networkx +import pytest + +MARKER_NAME = "depends_on" +MARKER_KWARG_ID = "name" +MARKER_KWARG_DEPENDENCIES = "on" + +REGEX_PARAMETERS = re.compile(r"\[.+\]$") + + +def clean_nodeid(nodeid): + return nodeid.replace("::()::", "::").split("@dependency_group_")[0] + + +def get_names(item): + names = set() + + # Node id + nodeid = clean_nodeid(item.nodeid) + names.add(nodeid) + + # Node id without parameter + nodeid = REGEX_PARAMETERS.sub("", nodeid) + names.add(nodeid) + + # Node id scopes + while "::" in nodeid: + nodeid = nodeid.rsplit("::", 1)[0] + names.add(nodeid) + + # Custom name + for marker in item.iter_markers(): + if marker.name == MARKER_NAME and MARKER_KWARG_ID in marker.kwargs: + for name in as_list(marker.kwargs[MARKER_KWARG_ID]): + names.add(name) + + return names + + +def as_list(lst): + return [lst] if isinstance(lst, str) else lst + + +STEPS = ["setup", "call", "teardown"] +GOOD_OUTCOME = "passed" + + +class DependencyManager: + """ + A simplified and improved version of pytest-depends. Main differences are the following: + * Add compatibility with pytest-xdist: group connected components of the dependency graph together, + and rename them with the `@dependency_group_{i}` suffix so they are run in the same worker, assuming + group scheduling is used. + * Improved parameterized dependencies so tests can depend on other tests with matching parametrization. + Ex. a test `test_model` with parameter `model` can depend on `test_other[{model}]`, + then `test_model[llama]` will depend on `test_other[llama]`, and so on. + * Improved description of missing/failed dependencies. + * Some options are hard-coded for Fast-LLM.
+ """ + + def __init__(self, items: list[pytest.Function]): + self._items = items + self._name_to_nodeids: dict[str, list[str]] = {} + self._nodeid_to_item: dict[str, pytest.Function] = {} + self._results: dict[str, dict[str, str]] = {} + self._dependencies: dict[str, set[str]] = {} + self._unresolved: dict[str, set[str]] = {} + + for item in self._items: + nodeid = clean_nodeid(item.nodeid) + # Add the mapping from nodeid to the test item + self._nodeid_to_item[nodeid] = item + # Add the mappings from all names to the node id + for name in get_names(item): + if name not in self._name_to_nodeids: + self._name_to_nodeids[name] = [] + self._name_to_nodeids[name].append(nodeid) + # Create the object that will contain the results of this test + self._results[nodeid] = {} + + for item in self._items: + # Process the dependencies of this test + # This uses the mappings created in the previous loop, and can thus not be merged into that loop + nodeid = clean_nodeid(item.nodeid) + self._dependencies[nodeid], self._unresolved[nodeid] = self._resolve_dependencies(item) + + self._items = self._sort_dependencies() + + @property + def items(self) -> list[pytest.Function]: + return self._items + + def register_result(self, item: pytest.Function, result: pytest.TestReport): + self._results[clean_nodeid(item.nodeid)][result.when] = result.outcome + + def handle_missing(self, item: pytest.Function): + nodeid = clean_nodeid(item.nodeid) + if missing := self._unresolved[nodeid]: + pytest.fail(f'{item.nodeid} depends on {", ".join(missing)}, which was not found', False) + + if failed := [ + f"{dependency} ({", ".join(f"{key}: {value}" for key, value in self._results[dependency].items()) if self._results[dependency] else "missing"})" + for dependency in self._dependencies[nodeid] + if not all(self._results[dependency].get(step, None) == "passed" for step in ("setup", "call", "teardown")) + ]: + pytest.skip(f'{item.nodeid} depends on {", ".join(failed)}') + + def _resolve_dependencies(self, item: pytest.Function): + dependencies = set() + unresolved = set() + + if "skip" in item.keywords: + return dependencies, unresolved + + nodeid = clean_nodeid(item.nodeid) + + for marker in item.iter_markers(): + if marker.name == MARKER_NAME: + for dependency in as_list(marker.kwargs.get(MARKER_KWARG_DEPENDENCIES, [])): + dependency = dependency.format(**item.callspec.params) + + # If the name is not known, try to make it absolute (ie file::[class::]method) + if dependency not in self._name_to_nodeids: + absolute_dependency = self._get_absolute_nodeid(dependency, nodeid) + if absolute_dependency in self._name_to_nodeids: + dependency = absolute_dependency + + # Add all items matching the name + if dependency in self._name_to_nodeids: + for nodeid in self._name_to_nodeids[dependency]: + dependencies.add(nodeid) + else: + unresolved.add(dependency) + + return dependencies, unresolved + + def _sort_dependencies(self): + # Build a directed graph for sorting + dag = networkx.DiGraph() + + for item in self.items: + nodeid = clean_nodeid(item.nodeid) + dag.add_node(nodeid) + for dependency in self._dependencies[nodeid]: + dag.add_edge(dependency, nodeid) + + for i, nodeids in enumerate(sorted(networkx.weakly_connected_components(dag), key=len, reverse=True)): + if len(nodeids) > 1: + for nodeid in nodeids: + self._nodeid_to_item[nodeid]._nodeid = ( + f"{self._nodeid_to_item[nodeid]._nodeid}@dependency_group_{i}" + ) + + return [self._nodeid_to_item[nodeid] for nodeid in networkx.topological_sort(dag)] + + @staticmethod + def 
_get_absolute_nodeid(nodeid: str, scope: str): + parts = nodeid.split("::") + # Completely relative (test_name), so add the full current scope (either file::class or file) + if len(parts) == 1: + base_nodeid = scope.rsplit("::", 1)[0] + nodeid = f"{base_nodeid}::{nodeid}" + # Contains some scope already (Class::test_name), so only add the current file scope + elif "." not in parts[0]: + base_nodeid = scope.split("::", 1)[0] + nodeid = f"{base_nodeid}::{nodeid}" + return clean_nodeid(nodeid) + + def print_name_map(self, verbose: bool = False): + """Print a human-readable version of the name -> test mapping.""" + print("Available dependency names:") + for name, nodeids in sorted(self._name_to_nodeids.items(), key=lambda x: x[0]): + if len(nodeids) == 1: + if name == nodeids[0]: + # This is just the base name, only print this when verbose + if verbose: + print(f" {name}") + else: + # Name refers to a single node id, so use the short format + print(f" {name} -> {nodeids[0]}") + else: + # Name refers to multiple node ids, so use the long format + print(f" {name} ->") + for nodeid in sorted(nodeids): + print(f" {nodeid}") + + def print_processed_dependencies(self, colors: bool = False): + """Print a human-readable list of the processed dependencies.""" + missing = "MISSING" + if colors: + missing = f"{colorama.Fore.RED}{missing}{colorama.Fore.RESET}" + colorama.init() + try: + print("Dependencies:") + + for nodeid in sorted(self._dependencies): + descriptions = [] + for dependency in self._dependencies[nodeid]: + descriptions.append(dependency) + for dependency in self._unresolved[nodeid]: + descriptions.append(f"{dependency} ({missing})") + if descriptions: + print(f" {nodeid} depends on") + for description in sorted(descriptions): + print(f" {description}") + finally: + if colors: + colorama.deinit() diff --git a/tests/utils/model_configs.py b/tests/utils/model_configs.py new file mode 100644 index 000000000..26eebf4f1 --- /dev/null +++ b/tests/utils/model_configs.py @@ -0,0 +1,233 @@ +import os + +from fast_llm.models.gpt.config import ( + LlamaGPTHuggingfaceCheckpointFormat, + MistralGPTHuggingfaceCheckpointFormat, + MixtralGPTHuggingfaceCheckpointFormat, + MTPLlamaGPTHuggingfaceCheckpointFormat, + Qwen2GPTHuggingfaceCheckpointFormat, + Starcoder2GPTHuggingfaceCheckpointFormat, +) +from fast_llm.models.ssm.config import LLambaHuggingfaceCheckpointFormat +from tests.utils.dataset import DATASET_PREFIX, TEST_VOCAB_SIZE + +_LOG_LEVEL = int(os.environ.get("LOG_LEVEL", 13)) +TEST_MODEL = os.environ.get("MODEL", "llama") +CONFIG_BASE_FAST_LLM = [ + "training.logs.interval=1", + "run.tensor_logs.save=True", + "run.tensor_logs.show=False", + "model.base_model.transformer.num_layers=2", + "model.base_model.transformer.hidden_size=256", + "model.base_model.transformer.num_attention_heads=8", + "model.base_model.transformer.init_method_std=0.022", + f"model.base_model.vocab_size={TEST_VOCAB_SIZE}", + f"model.multi_stage.debug_param_init={_LOG_LEVEL}", + f"model.multi_stage.debug_layer_outputs={_LOG_LEVEL}", + f"model.multi_stage.debug_layer_gradients={_LOG_LEVEL}", + f"model.multi_stage.debug_all_param_gradients={_LOG_LEVEL}", + "model.multi_stage.debug_tensor_parallel=True", + "model.distributed.reproducible_init=True", + "model.distributed.timeout=10", + "training.train_iters=2", + "training.num_workers=0", + "training.timeout=30", + "batch.batch_size=8", + "batch.sequence_length=512", + "data.datasets.training.type=slice", + "data.datasets.training.end=0.969", + 
"data.datasets.training.dataset.type=memmap", + f"data.datasets.training.dataset.path={DATASET_PREFIX}", + "data.datasets.validation.type=slice", + "data.datasets.validation.begin=0.969", + "data.datasets.validation.end=0.999", + "data.datasets.validation.dataset.type=memmap", + f"data.datasets.validation.dataset.path={DATASET_PREFIX}", + "data.datasets.test.type=slice", + "data.datasets.test.begin=0.999", + "data.datasets.test.end=1", + "data.datasets.test.dataset.type=memmap", + f"data.datasets.test.dataset.path={DATASET_PREFIX}", + "optimizer.learning_rate.base=0.0001", +] +CONFIG_BASE_MEGATRON = [ + "--num-layers=2", + "--hidden-size=256", + "--num-attention-heads=8", + "--log-interval=1", + "--train-iters=2", + "--eval-iters=0", + "--hidden-dropout=0", + "--attention-dropout=0", + f"--debug_param_init={_LOG_LEVEL}", + f"--debug_layer_outputs={_LOG_LEVEL}", + f"--debug_layer_gradients={_LOG_LEVEL}", + f"--debug_all_param_gradients={_LOG_LEVEL}", + "--debug_param_update=0", + "--global-batch-size=8", + "--max-position-embeddings=512", + "--seq-length=512", + "--init-method-std=0.022", + "--lr=0.0001", + "--num-workers=0", + "--valid-num-workers=0", + "--tokenizer-type=NullTokenizer", + # Megatron messes with the vocab size, so we have to subtract 1. + f"--vocab-size={TEST_VOCAB_SIZE - 1}", + f"--data-path={DATASET_PREFIX}", + "--lr-decay-style=constant", + # Initialization is set up to match MCore models (MCore inverts self-attn qkv and dense layers compared to original Megatron) + "--use-mcore-models", + # local implementation doesn't allow for RMS norm. + "--transformer-impl=transformer_engine", +] +CONFIG_SC1_FAST_LLM = CONFIG_BASE_FAST_LLM + ["model.base_model.max_position_embeddings=512"] +CONFIG_SC1_MEGATRON = CONFIG_BASE_MEGATRON + ["--group-query-attention"] +CONFIG_SC1_COMMON = CONFIG_SC1_FAST_LLM + ["model.distributed.training_dtype=bf16"] +CONFIG_GPT2_FAST_LLM = CONFIG_SC1_FAST_LLM + ["model.base_model.transformer.head_groups=8"] +CONFIG_GPT2_MEGATRON = CONFIG_BASE_MEGATRON +CONFIG_GPT2_COMMON = CONFIG_GPT2_FAST_LLM + ["model.distributed.training_dtype=bf16"] +CONFIG_SC2_FAST_LLM = CONFIG_BASE_FAST_LLM + [ + "model.base_model.transformer.head_groups=4", + "model.base_model.transformer.rotary.type=default", +] +CONFIG_SC2_MEGATRON = CONFIG_SC1_MEGATRON + [ + "--num-query-groups=4", + "--use-rotary-position-embeddings", + "--no-position-embedding", +] +CONFIG_SC2_COMMON = CONFIG_SC2_FAST_LLM + ["model.distributed.training_dtype=bf16"] +CONFIG_LLAMA_MEGATRON = CONFIG_SC2_MEGATRON + [ + "--swiglu", + "--disable-bias-linear", + "--normalization=RMSNorm", + "--ffn-hidden-size=1024", + "--untie-embeddings-and-output-weights", +] +CONFIG_LLAMA_FAST_LLM = CONFIG_SC2_FAST_LLM + [ + "model.base_model.transformer.gated=True", + "model.base_model.transformer.activation_type=silu", + "model.base_model.transformer.add_linear_biases=False", + "model.base_model.transformer.normalization.type=rms_norm", + "model.base_model.transformer.ffn_hidden_size=1024", + "model.base_model.tie_word_embeddings=False", +] +CONFIG_LLAMA_COMMON = CONFIG_LLAMA_FAST_LLM + ["model.distributed.training_dtype=bf16"] +CONFIG_LLAMA3_MEGATRON = None +CONFIG_LLAMA3_FAST_LLM = CONFIG_LLAMA_FAST_LLM + [ + "model.base_model.transformer.rotary.type=llama3", +] +CONFIG_LLAMA3_COMMON = CONFIG_LLAMA3_FAST_LLM + ["model.distributed.training_dtype=bf16"] +CONFIG_QWEN2_MEGATRON = None +CONFIG_QWEN2_FAST_LLM = CONFIG_SC2_FAST_LLM + [ + "model.base_model.transformer.gated=True", + 
"model.base_model.transformer.activation_type=silu", + "model.base_model.transformer.add_linear_biases=only_attn_qkv", + "model.base_model.transformer.normalization.type=rms_norm", + "model.base_model.transformer.ffn_hidden_size=1024", + "model.base_model.tie_word_embeddings=False", +] +CONFIG_QWEN2_COMMON = CONFIG_QWEN2_FAST_LLM + ["model.distributed.training_dtype=bf16"] +CONFIG_LLAMA_YARN_MEGATRON = None +CONFIG_LLAMA_YARN_FAST_LLM = CONFIG_LLAMA_FAST_LLM + [ + "model.base_model.transformer.rotary.type=yarn", +] +CONFIG_LLAMA_YARN_COMMON = CONFIG_LLAMA_YARN_FAST_LLM + ["model.distributed.training_dtype=bf16"] +CONFIG_MIXTRAL_MEGATRON = CONFIG_LLAMA_MEGATRON + [ + "--num-experts=4", + "--moe-router-topk=4", +] +CONFIG_MIXTRAL_FAST_LLM = CONFIG_LLAMA_FAST_LLM + [ + "model.base_model.transformer.num_experts=4", + "model.base_model.transformer.num_experts_per_token=4", +] +CONFIG_MIXTRAL_COMMON = CONFIG_MIXTRAL_FAST_LLM + ["model.distributed.training_dtype=bf16"] +CONFIG_MIXTRAL_YARN_MEGATRON = None +CONFIG_MIXTRAL_YARN_FAST_LLM = CONFIG_MIXTRAL_FAST_LLM + [ + "model.base_model.transformer.rotary.type=yarn", +] +CONFIG_MIXTRAL_YARN_COMMON = CONFIG_MIXTRAL_YARN_FAST_LLM + ["model.distributed.training_dtype=bf16"] +CONFIG_LLAMA_MTP_MEGATRON = None +CONFIG_LLAMA_MTP_FAST_LLM = CONFIG_LLAMA_FAST_LLM + [ + "model.base_model.prediction_heads=4", +] +CONFIG_LLAMA_MTP_COMMON = CONFIG_LLAMA_MTP_FAST_LLM + ["model.distributed.training_dtype=bf16"] +CONFIG_LLAMBA_FAST_LLM = CONFIG_LLAMA_FAST_LLM + ["model.base_model.hybrid_block_layout==['t','m']"] +CONFIG_LLAMBA_MEGATRON = CONFIG_LLAMA_MEGATRON + [] +CONFIG_LLAMBA_COMMON = CONFIG_LLAMBA_FAST_LLM +_CONFIGS = { + "gpt2": ("gpt", CONFIG_GPT2_FAST_LLM, CONFIG_GPT2_MEGATRON, CONFIG_GPT2_COMMON, None), + "sc1": ("gpt", CONFIG_SC1_FAST_LLM, CONFIG_SC1_MEGATRON, CONFIG_SC1_COMMON, None), + "starcoder2": ( + "gpt", + CONFIG_SC2_FAST_LLM, + CONFIG_SC2_MEGATRON, + CONFIG_SC2_COMMON, + Starcoder2GPTHuggingfaceCheckpointFormat, + ), + "llama": ( + "gpt", + CONFIG_LLAMA_FAST_LLM, + CONFIG_LLAMA_MEGATRON, + CONFIG_LLAMA_COMMON, + LlamaGPTHuggingfaceCheckpointFormat, + ), + "llama3": ( + "gpt", + CONFIG_LLAMA3_FAST_LLM, + CONFIG_LLAMA3_MEGATRON, + CONFIG_LLAMA3_COMMON, + LlamaGPTHuggingfaceCheckpointFormat, + ), + "qwen2": ( + "gpt", + CONFIG_QWEN2_FAST_LLM, + CONFIG_QWEN2_MEGATRON, + CONFIG_QWEN2_COMMON, + Qwen2GPTHuggingfaceCheckpointFormat, + ), + "llama-yarn": ( + "gpt", + CONFIG_LLAMA_YARN_FAST_LLM, + CONFIG_LLAMA_YARN_MEGATRON, + CONFIG_LLAMA_YARN_COMMON, + LlamaGPTHuggingfaceCheckpointFormat, + ), + "mistral": ( + "gpt", + CONFIG_LLAMA_FAST_LLM, + CONFIG_LLAMA_MEGATRON, + CONFIG_LLAMA_COMMON, + MistralGPTHuggingfaceCheckpointFormat, + ), + "mixtral": ( + "gpt", + CONFIG_MIXTRAL_FAST_LLM, + CONFIG_MIXTRAL_MEGATRON, + CONFIG_MIXTRAL_COMMON, + MixtralGPTHuggingfaceCheckpointFormat, + ), + "llamba": ( + "hybrid_ssm", + CONFIG_LLAMBA_FAST_LLM, + CONFIG_LLAMBA_MEGATRON, + CONFIG_LLAMBA_COMMON, + LLambaHuggingfaceCheckpointFormat, + ), + "mixtral-yarn": ( + "gpt", + CONFIG_MIXTRAL_YARN_FAST_LLM, + CONFIG_MIXTRAL_YARN_MEGATRON, + CONFIG_MIXTRAL_YARN_COMMON, + MixtralGPTHuggingfaceCheckpointFormat, + ), + "llama-mtp": ( + "gpt", + CONFIG_LLAMA_MTP_FAST_LLM, + CONFIG_LLAMA_MTP_MEGATRON, + CONFIG_LLAMA_MTP_COMMON, + MTPLlamaGPTHuggingfaceCheckpointFormat, + ), +} + +TEST_MODEL_TYPE, CONFIG_FAST_LLM, CONFIG_GPT2, CONFIG_COMMON, HUGGINGFACE_CHECKPOINT_FORMAT = _CONFIGS[TEST_MODEL] diff --git a/tests/utils/run_test_script.py b/tests/utils/run_test_script.py new file mode 
100644 index 000000000..27d828693 --- /dev/null +++ b/tests/utils/run_test_script.py @@ -0,0 +1,96 @@ +import os +import shutil +import subprocess + +import pytest +import torch + +from fast_llm.tools.train import CliTrainingConfig +from tests.utils.compare_tensor_logs import CompareConfig, compare_tensor_logs +from tests.utils.dataset import get_test_dataset +from tests.utils.model_configs import TEST_MODEL_TYPE +from tests.utils.utils import TEST_RESULTS_PATH + +FORCE_REUSE_RESULTS = int(os.environ.get("FORCE_REUSE_RESULTS", 0)) != 0 +REUSE_RESULTS = FORCE_REUSE_RESULTS or int(os.environ.get("REUSE_RESULTS", 0)) != 0 +ARTIFACT_PATH = "runs/0/artifacts" + + +@pytest.fixture(scope="session") +def run_test_script(worker_resources): + def do_run_test_script( + name: str, + script: list[str], + num_gpus: int = 1, + *, + model_type: str = TEST_MODEL_TYPE, + is_megatron: bool = False, + compare: str | None = None, + config: CompareConfig | None = None, + prepare_fn=None, + compare_fn=None, + do_compare: bool = True, + ): + if torch.cuda.device_count() < num_gpus: + pytest.skip(f"Not enough GPUs to run test ({torch.cuda.device_count()}<{num_gpus})") + env = os.environ.copy() + if is_megatron: + # Prevent Megatron from complaining. + env["CUDA_DEVICE_MAX_CONNECTIONS"] = "1" + env["NVTE_FLASH_ATTN"] = "0" + path = TEST_RESULTS_PATH / name + skip = False + artifact_path = path / ARTIFACT_PATH + if path.exists(): + assert path.is_dir() + # TODO: Better way to check if the previous attempt succeeded. + if ( + REUSE_RESULTS + and artifact_path.is_dir() + and len(list((artifact_path / "0").iterdir())) >= (1 if is_megatron else 3) + ): + skip = True + elif FORCE_REUSE_RESULTS: + raise RuntimeError(artifact_path) + else: + shutil.rmtree(path) + elif FORCE_REUSE_RESULTS: + raise RuntimeError(path) + if prepare_fn is not None: + skip = prepare_fn(TEST_RESULTS_PATH / name, None if compare is None else TEST_RESULTS_PATH / compare, skip) + if is_megatron: + script = [*script, f"--structured-logs-dir={path}", f"--data-cache-path={path}"] + else: + script = [model_type, *script, f"run.experiment_dir={path}"] + header = ["Megatron-LM/pretrain_gpt.py"] if is_megatron else ["--no-python", "fast-llm", "train"] + command = [ + "python", + "-m", + "torch.distributed.run", + f"--nproc-per-node={num_gpus}", + f"--rdzv-endpoint=localhost:{worker_resources.rendezvous_port}", + f"--master-port={worker_resources.torchrun_port}", + *header, + *script, + ] + print(" ".join(command)) + if skip: + print("Reusing existing run.") + else: + get_test_dataset() + if num_gpus == 1 and not is_megatron: + CliTrainingConfig.parse_and_run(script) + else: + completed_proc = subprocess.run(command, env=env, timeout=60) + if completed_proc.returncode: + raise RuntimeError(f"Process failed with return code {completed_proc.returncode}") + if compare and do_compare: + if compare_fn is not None: + compare_fn(TEST_RESULTS_PATH / name, TEST_RESULTS_PATH / compare) + compare_tensor_logs( + TEST_RESULTS_PATH / compare / ARTIFACT_PATH, + TEST_RESULTS_PATH / name / ARTIFACT_PATH, + config, + ) + + return do_run_test_script diff --git a/tests/utils/utils.py b/tests/utils/utils.py new file mode 100644 index 000000000..f37c1cb27 --- /dev/null +++ b/tests/utils/utils.py @@ -0,0 +1,52 @@ +import os +import pathlib + +import pytest +import torch + +from fast_llm.layers.ssm.config import SSMConfig +from fast_llm.layers.transformer.config import TransformerConfig +from fast_llm.models.ssm.config import HybridSSMBaseModelConfig + +TEST_RESULTS_PATH = 
pathlib.Path(os.environ.get("TEST_RESULTS_PATH", "/tmp/fast_llm_tests")).resolve() +requires_cuda = pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available") + + +def materialize_meta_tensors(model, tensor_space): + # Materialize parameters that are on meta device + for name, param in model.named_parameters(): + if param.device.type == "meta": + # Check if the parameter is a custom tensor type + if hasattr(param, "tensor_name") and hasattr(param, "init_parameter"): + param_data = param.new_empty(param.shape, device="cuda") + # Initialize param_data + param.init_parameter(param_data, tensor_space.distributed) + # Replace the parameter in the module + module_path, param_name = name.rsplit(".", 1) if "." in name else (None, name) + module = model + if module_path is not None: + for part in module_path.split("."): + module = getattr(module, part) + param = torch.nn.Parameter(param_data, requires_grad=param.requires_grad) + # TODO: add param_grad_is_zero etc., grad_buffer, etc., see test_mlp_recomputation + param.grad = None + param.grad_buffer = torch.empty_like(param) + param.param_grad_is_zero = True + module._parameters[param_name] = param + return model + + +def get_hybrid_config(hybrid_block_layout=["t", "m"], prediction_heads=1, default_mtp_type=None): + config = HybridSSMBaseModelConfig( + transformer=TransformerConfig(num_layers=len(hybrid_block_layout)), + ssm=SSMConfig(), + hybrid_block_layout=hybrid_block_layout, + prediction_heads=prediction_heads, + default_mtp_type=default_mtp_type, + init_method_std_embed=0.02, + init_method_min_embed=-0.02, + init_method_max_embed=0.02, + use_position_embeddings=True, + tie_word_embeddings=False, + ) + return config From ced34e08ce29bd2f4ac121609a5c49e47beefe9b Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Thu, 12 Jun 2025 17:41:26 -0400 Subject: [PATCH 20/43] stuff --- tests/conftest.py | 4 ++- tests/test_checkpoint.py | 49 +++++++++++++------------- tests/test_gpt_generate_and_forward.py | 9 +++-- tests/test_match_megatron.py | 14 ++++---- tests/test_mb.py | 15 ++++---- tests/test_mb_seq_first.py | 7 ++-- tests/test_ms.py | 7 ++-- tests/test_multi_stage.py | 2 +- tests/test_seq_first.py | 9 +++-- tests/test_simple.py | 15 ++++---- tests/utils/depends.py | 3 +- 11 files changed, 67 insertions(+), 67 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 284f4140a..99490f1bf 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -13,6 +13,8 @@ from tests.utils.depends import DependencyManager # Make fixtures available globally without import +from tests.utils.run_test_script import run_test_script # isort: skip + manager: DependencyManager | None = None @@ -148,7 +150,7 @@ def pytest_collection_modifyitems(config, items: list[pytest.Function]): # If pytest-depends is installed, it will complain about renamed nodes whether it's used or not. 
try: - import pytest_depends + import pytest_depends.main except ImportError: pass else: diff --git a/tests/test_checkpoint.py b/tests/test_checkpoint.py index 05a621005..55d30d3fc 100644 --- a/tests/test_checkpoint.py +++ b/tests/test_checkpoint.py @@ -17,9 +17,8 @@ from fast_llm.engine.multi_stage.config import FastLLMModelConfig, ShardName, StageMode from fast_llm.models.auto import model_registry from fast_llm.tools.convert import ConvertConfig -from tests.common import CONFIG_COMMON, HUGGINGFACE_CHECKPOINT_FORMAT, TEST_MODEL_TYPE from tests.utils.compare_tensor_logs import CompareConfig, compare_logged_tensor -from tests.utils.model_configs import TEST_MODEL +from tests.utils.model_configs import CONFIG_COMMON, HUGGINGFACE_CHECKPOINT_FORMAT, TEST_MODEL, TEST_MODEL_TYPE from tests.utils.run_test_script import FORCE_REUSE_RESULTS, REUSE_RESULTS from tests.utils.utils import TEST_RESULTS_PATH, requires_cuda @@ -65,7 +64,7 @@ def _compare_resume_fn(test_path: pathlib.Path, compare_path: pathlib.Path): shutil.copy(compare_path / path, test_path / path) -@pytest.mark.depends(on=["test_checkpoint_and_eval"]) +@pytest.mark.depends_on(on=["test_checkpoint_and_eval"]) def test_resume(run_test_script): # Resume from iteration=1 and compare outputs with the baseline run. run_test_script( @@ -82,7 +81,7 @@ def test_resume(run_test_script): ) -@pytest.mark.depends(on=["test_checkpoint_and_eval"]) +@pytest.mark.depends_on(on=["test_checkpoint_and_eval"]) def test_resume_frozen(run_test_script): # Resume with frozen mlp. No comparison. run_test_script( @@ -113,7 +112,7 @@ def _run_conversion(config: ConvertConfig): CONVERT_PATH = TEST_RESULTS_PATH / f"test_{TEST_MODEL}_convert_model" -@pytest.mark.depends(on=["test_checkpoint_and_eval"]) +@pytest.mark.depends_on(on=["test_checkpoint_and_eval"]) def test_convert_distributed_to_fast_llm(): _run_conversion( ConvertConfig( @@ -130,7 +129,7 @@ def test_convert_distributed_to_fast_llm(): ) -@pytest.mark.depends(on=["test_convert_distributed_to_fast_llm"]) +@pytest.mark.depends_on(on=["test_convert_distributed_to_fast_llm"]) def test_convert_fast_llm_to_huggingface(): if HUGGINGFACE_CHECKPOINT_FORMAT is None: pytest.skip(f"Conversion not supported for {TEST_MODEL}") @@ -149,7 +148,7 @@ def test_convert_fast_llm_to_huggingface(): ) -@pytest.mark.depends(on=["test_convert_fast_llm_to_huggingface"]) +@pytest.mark.depends_on(on=["test_convert_fast_llm_to_huggingface"]) def test_convert_huggingface_to_distributed(): _run_conversion( ConvertConfig( @@ -166,7 +165,7 @@ def test_convert_huggingface_to_distributed(): ) -@pytest.mark.depends(on=["test_checkpoint_and_eval"]) +@pytest.mark.depends_on(on=["test_checkpoint_and_eval"]) def test_convert_distributed_to_huggingface(): if HUGGINGFACE_CHECKPOINT_FORMAT is None: pytest.skip(f"Conversion not supported for {TEST_MODEL}") @@ -185,7 +184,7 @@ def test_convert_distributed_to_huggingface(): ) -@pytest.mark.depends(on=["test_convert_distributed_to_huggingface"]) +@pytest.mark.depends_on(on=["test_convert_distributed_to_huggingface"]) def test_convert_huggingface_to_fast_llm(): _run_conversion( ConvertConfig( @@ -202,7 +201,7 @@ def test_convert_huggingface_to_fast_llm(): ) -@pytest.mark.depends(on=["test_convert_huggingface_to_fast_llm"]) +@pytest.mark.depends_on(on=["test_convert_huggingface_to_fast_llm"]) def test_convert_fast_llm_to_distributed(): _run_conversion( ConvertConfig( @@ -219,7 +218,7 @@ def test_convert_fast_llm_to_distributed(): ) -@pytest.mark.depends(on=["test_convert_huggingface_to_distributed", 
"test_convert_fast_llm_to_distributed"]) +@pytest.mark.depends_on(on=["test_convert_huggingface_to_distributed", "test_convert_fast_llm_to_distributed"]) def test_converted_distributed(): # Compare the fast llm weights # TODO: Compare configs @@ -235,7 +234,7 @@ def test_converted_distributed(): assert (w[key] == w1[key]).all(), (w[key], w1[key]) -@pytest.mark.depends(on=["test_convert_distributed_to_fast_llm", "test_convert_huggingface_to_fast_llm"]) +@pytest.mark.depends_on(on=["test_convert_distributed_to_fast_llm", "test_convert_huggingface_to_fast_llm"]) def test_converted_fast_llm(): s0 = safetensors.torch.load_file(CONVERT_PATH / "fast_llm_0" / "model_0.safetensors") s1 = safetensors.torch.load_file(CONVERT_PATH / "fast_llm_1" / "model_0.safetensors") @@ -245,7 +244,7 @@ def test_converted_fast_llm(): assert (s0[key] == s1[key]).all(), (key, s0, s1) -@pytest.mark.depends(on=["test_convert_fast_llm_to_huggingface", "test_convert_distributed_to_huggingface"]) +@pytest.mark.depends_on(on=["test_convert_fast_llm_to_huggingface", "test_convert_distributed_to_huggingface"]) def test_converted_huggingface(): h0 = safetensors.torch.load_file(CONVERT_PATH / "huggingface_0" / "model_0.safetensors") h1 = safetensors.torch.load_file(CONVERT_PATH / "huggingface_1" / "model_0.safetensors") @@ -263,7 +262,7 @@ def _compare_architectures(config_ref: FastLLMModelConfig, config_test: FastLLMM config_ref.base_model.compare_architecture(config_test.base_model) -@pytest.mark.depends(on=["test_converted_distributed"]) +@pytest.mark.depends_on(on=["test_converted_distributed"]) def test_load_pretrained_distributed_checkpoint(): config = TEST_MODEL_CONFIG_CLS.from_dict( yaml.safe_load((_CKPT_PATH / ".." / ".." / "config.yaml").open("r"))["model"], strict=False @@ -283,7 +282,7 @@ def test_load_pretrained_distributed_checkpoint(): assert (state_shards[f"{shard_name}_shard"] == model.get_shard(shard_name)).all() -@pytest.mark.depends(on=["test_load_pretrained_distributed_checkpoint"]) +@pytest.mark.depends_on(on=["test_load_pretrained_distributed_checkpoint"]) def test_load_converted_distributed_checkpoint(): config_ref = TEST_MODEL_CONFIG_CLS.from_pretrained( CheckpointLoadConfig( @@ -315,7 +314,7 @@ def test_load_converted_distributed_checkpoint(): assert (weight_shard == model.get_shard(ShardName.weights)).all() -@pytest.mark.depends(on=["test_converted_fast_llm", "test_load_pretrained_distributed_checkpoint"]) +@pytest.mark.depends_on(on=["test_converted_fast_llm", "test_load_pretrained_distributed_checkpoint"]) def test_load_converted_fast_llm_checkpoint(): config_ref = TEST_MODEL_CONFIG_CLS.from_pretrained( CheckpointLoadConfig( @@ -346,7 +345,7 @@ def test_load_converted_fast_llm_checkpoint(): assert (weight_shard == model.get_shard(ShardName.weights)).all() -@pytest.mark.depends(on=["test_converted_fast_llm", "test_load_pretrained_distributed_checkpoint"]) +@pytest.mark.depends_on(on=["test_converted_fast_llm", "test_load_pretrained_distributed_checkpoint"]) def test_load_converted_huggingface_checkpoint(): config_ref = TEST_MODEL_CONFIG_CLS.from_pretrained( CheckpointLoadConfig( @@ -378,7 +377,7 @@ def test_load_converted_huggingface_checkpoint(): assert (weight_shard == model.get_shard(ShardName.weights)).all() -@pytest.mark.depends(on=["test_load_converted_fast_llm_checkpoint", "test_load_converted_huggingface_checkpoint"]) +@pytest.mark.depends_on(on=["test_load_converted_fast_llm_checkpoint", "test_load_converted_huggingface_checkpoint"]) def test_run_converted_model(): model_ref = 
TEST_MODEL_HF_CLS.from_pretrained( CheckpointLoadConfig( @@ -427,7 +426,7 @@ def test_run_converted_model(): @pytest.mark.slow -@pytest.mark.depends(on=["test_load_converted_distributed_checkpoint"]) +@pytest.mark.depends_on(on=["test_load_converted_distributed_checkpoint"]) def test_load_pretrained_distributed_in_dp2(run_test_script): run_test_script( f"test_{TEST_MODEL}_load_pretrained_distributed_in_dp2", @@ -443,7 +442,7 @@ def test_load_pretrained_distributed_in_dp2(run_test_script): ) -@pytest.mark.depends(on=["test_load_converted_distributed_checkpoint"]) +@pytest.mark.depends_on(on=["test_load_converted_distributed_checkpoint"]) def test_load_pretrained_distributed_with_config(run_test_script): run_test_script( f"test_{TEST_MODEL}_load_pretrained_distributed_with_config", @@ -458,7 +457,7 @@ def test_load_pretrained_distributed_with_config(run_test_script): ) -@pytest.mark.depends(on=["test_load_pretrained_distributed_in_dp2"]) +@pytest.mark.depends_on(on=["test_load_pretrained_distributed_in_dp2"]) def test_load_pretrained_in_dp2_match_checkpoint(): test_ckpt_path = TEST_RESULTS_PATH / f"test_{TEST_MODEL}_load_pretrained_distributed_in_dp2" / "checkpoint" / "1" pretrained_config_ref = CheckpointLoadConfig( @@ -503,7 +502,7 @@ def test_load_pretrained_in_dp2_match_checkpoint(): @pytest.mark.slow -@pytest.mark.depends(on=["test_load_pretrained_in_dp2_match_checkpoint"]) +@pytest.mark.depends_on(on=["test_load_pretrained_in_dp2_match_checkpoint"]) def test_load_distributed_checkpoint_dp2(): # This also tests conversion which uses `FastLLMModel.from_checkpoint` pretrained_config_ref = CheckpointLoadConfig( @@ -526,7 +525,7 @@ def test_load_distributed_checkpoint_dp2(): @pytest.mark.slow -@pytest.mark.depends(on=["test_load_converted_fast_llm_checkpoint", "test_load_pretrained_in_dp2_match_checkpoint"]) +@pytest.mark.depends_on(on=["test_load_converted_fast_llm_checkpoint", "test_load_pretrained_in_dp2_match_checkpoint"]) def test_load_pretrained_fast_llm_in_dp2(run_test_script): run_test_script( f"test_{TEST_MODEL}_load_pretrained_fast_llm_in_dp2", @@ -560,7 +559,9 @@ def test_load_pretrained_fast_llm_in_dp2(run_test_script): @pytest.mark.slow -@pytest.mark.depends(on=["test_load_converted_huggingface_checkpoint", "test_load_pretrained_in_dp2_match_checkpoint"]) +@pytest.mark.depends_on( + on=["test_load_converted_huggingface_checkpoint", "test_load_pretrained_in_dp2_match_checkpoint"] +) def test_load_pretrained_huggingface_in_dp2(run_test_script): run_test_script( f"test_{TEST_MODEL}_load_pretrained_huggingface_in_dp2", diff --git a/tests/test_gpt_generate_and_forward.py b/tests/test_gpt_generate_and_forward.py index 6e8d43601..06cfd8037 100644 --- a/tests/test_gpt_generate_and_forward.py +++ b/tests/test_gpt_generate_and_forward.py @@ -9,8 +9,7 @@ from fast_llm.engine.schedule.runner import ScheduleRunner from fast_llm.models.gpt.config import LlamaGPTHuggingfaceCheckpointFormat, PretrainedGPTModelConfig from fast_llm.models.gpt.huggingface import HuggingfaceGPTModelForCausalLM -from tests.common import CONFIG_COMMON, HUGGINGFACE_CHECKPOINT_FORMAT -from tests.utils.model_configs import TEST_MODEL +from tests.utils.model_configs import CONFIG_COMMON, HUGGINGFACE_CHECKPOINT_FORMAT, TEST_MODEL from tests.utils.utils import TEST_RESULTS_PATH, requires_cuda @@ -262,7 +261,7 @@ def test_export_for_generate(run_test_script): @pytest.mark.slow @requires_cuda -@pytest.mark.depends(on=["test_export_for_generate"]) +@pytest.mark.depends_on(on=["test_export_for_generate"]) 
@pytest.mark.parametrize( "use_flash_attention, use_bf16, max_new_tokens, min_matching_tokens_batch_size_1, min_matching_tokens_batch_size_2", [ @@ -322,7 +321,7 @@ def test_generate_from_model( @requires_cuda @pytest.mark.slow -@pytest.mark.depends(on=["test_export_for_generate"]) +@pytest.mark.depends_on(on=["test_export_for_generate"]) def test_small_generate_from_model(): _test_generate_from_model( TEST_RESULTS_PATH / f"test_{TEST_MODEL}_export_for_generate/export/{HUGGINGFACE_CHECKPOINT_FORMAT.name}/1", @@ -370,7 +369,7 @@ def test_forward_return_hidden_states(model_and_tokenizer): @pytest.mark.slow @requires_cuda -@pytest.mark.depends(on=["test_export_for_generate"]) +@pytest.mark.depends_on(on=["test_export_for_generate"]) def test_small_forward_return_hidden_states(): _test_forward_return_hidden_states( TEST_RESULTS_PATH / f"test_{TEST_MODEL}_export_for_generate/export/{HUGGINGFACE_CHECKPOINT_FORMAT.name}/1", diff --git a/tests/test_match_megatron.py b/tests/test_match_megatron.py index 3d8210860..7d89c80a2 100644 --- a/tests/test_match_megatron.py +++ b/tests/test_match_megatron.py @@ -29,7 +29,7 @@ def test_sc1_meg(run_test_script): ] -@pytest.mark.depends(on=["test_sc1_meg"]) +@pytest.mark.depends_on(on=["test_sc1_meg"]) def test_sc1_match_meg(run_test_script): # Starcoder 1 (GPT2 with MQA) with Fast-llm. # QKV tensors are in a different format. @@ -50,13 +50,13 @@ def test_sc1_match_meg(run_test_script): @pytest.mark.slow @pytest.mark.skip(reason="Skipping mostly redundant test") -@pytest.mark.depends(on=["test_sc1_match_meg"]) +@pytest.mark.depends_on(on=["test_sc1_match_meg"]) def test_sc2_meg(run_test_script): # Starcoder 2 (GPT2 with MQA and RoPE) with Megatron. run_test_script("test_sc2_meg", CONFIG_SC2_MEGATRON + ["--micro-batch-size=8"], is_megatron=True) -@pytest.mark.depends(on=["test_sc2_meg"]) +@pytest.mark.depends_on(on=["test_sc2_meg"]) def test_sc2_match_meg(run_test_script): # Starcoder 2 (GPT2 with MQA and RoPE) with Fast-llm. # QKV tensors are in a different format, @@ -83,7 +83,7 @@ def test_gpt2_meg(run_test_script): run_test_script("test_gpt2_meg", CONFIG_GPT2_MEGATRON + ["--micro-batch-size=8"], is_megatron=True) -@pytest.mark.depends(on=["test_gpt2_meg"]) +@pytest.mark.depends_on(on=["test_gpt2_meg"]) def test_gpt2_match_meg(run_test_script): # GPT2 (MHA, layer norm, absolute embeddings) with Fast-llm. # QKV tensors are in a different format. @@ -109,7 +109,7 @@ def test_mistral_meg(run_test_script): run_test_script("test_mistral_meg", CONFIG_LLAMA_MEGATRON + ["--micro-batch-size=8"], is_megatron=True) -@pytest.mark.depends(on=["test_mistral_meg"]) +@pytest.mark.depends_on(on=["test_mistral_meg"]) def test_mistral_match_meg(run_test_script): # Mistral with Fast-LLM. run_test_script( @@ -135,9 +135,11 @@ def test_mixtral_meg(run_test_script): run_test_script("test_mixtral_meg", CONFIG_MIXTRAL_MEGATRON + ["--micro-batch-size=8"], is_megatron=True) -@pytest.mark.depends(on=["test_mixtral_meg"]) +@pytest.mark.depends_on(on=["test_mixtral_meg"]) def test_mixtral_match_meg(run_test_script): # Mistral with Fast-LLM. 
+ # TODO: Fix dropless MOE + pytest.fail("Test fails, aborting to avoid breaking cuda", False) run_test_script( "test_mixtral_match_meg", CONFIG_MIXTRAL_FAST_LLM + CONFIG_MATCH_MEGATRON + ["model.base_model.use_megatron_initialization=True"], diff --git a/tests/test_mb.py b/tests/test_mb.py index fd6130565..4df6e510a 100644 --- a/tests/test_mb.py +++ b/tests/test_mb.py @@ -1,8 +1,7 @@ import pytest -from tests.common import CONFIG_COMMON from tests.utils.compare_tensor_logs import CompareConfig -from tests.utils.model_configs import TEST_MODEL +from tests.utils.model_configs import CONFIG_COMMON, TEST_MODEL CONFIG_DF = CONFIG_COMMON + ["batch.depth_first_micro_batches=4"] CONFIG_BF = CONFIG_COMMON + ["batch.breadth_first_micro_batches=4"] @@ -16,7 +15,7 @@ def test_model_df4(run_test_script): @pytest.mark.slow -@pytest.mark.depends(on=["test_model_df4"]) +@pytest.mark.depends_on(on=["test_model_df4"]) def test_model_df4_z3(run_test_script): # Gradient accumulation with ZeRO-3. run_test_script( @@ -28,20 +27,20 @@ def test_model_df4_z3(run_test_script): ) -@pytest.mark.depends(on=["test_model_df4"], scope="session") +@pytest.mark.depends_on(on=["test_model_df4"], scope="session") def test_model_bf4(run_test_script): # Breadth-first gradient accumulation baseline. run_test_script(f"test_{TEST_MODEL}_bf4", CONFIG_BF, compare=f"test_{TEST_MODEL}_df4") -@pytest.mark.depends(on=["test_model_df4", "test_model_bf4"]) +@pytest.mark.depends_on(on=["test_model_df4", "test_model_bf4"]) def test_model_bf2_df2(run_test_script): # Mixed gradient accumulation baseline. run_test_script(f"test_{TEST_MODEL}_bf2_df2", CONFIG_BF_DF, compare=f"test_{TEST_MODEL}_df4") @pytest.mark.slow -@pytest.mark.depends(on=["test_model_bf4"]) +@pytest.mark.depends_on(on=["test_model_bf4"]) def test_model_pp2s2_bf4(run_test_script): # Pipeline-parallel without tied weights. run_test_script( @@ -53,7 +52,7 @@ def test_model_pp2s2_bf4(run_test_script): @pytest.mark.slow -@pytest.mark.depends(on=["test_model_bf4"]) +@pytest.mark.depends_on(on=["test_model_bf4"]) def test_model_pp2s1_bf4(run_test_script): # Pipeline-parallel with tied weights. run_test_script( @@ -66,7 +65,7 @@ def test_model_pp2s1_bf4(run_test_script): @pytest.mark.slow -@pytest.mark.depends(on=["test_model_bf4"]) +@pytest.mark.depends_on(on=["test_model_bf4"]) def test_model_dp2_tp2_pp2s2_bf4(run_test_script): # Simple 3d parallelism # TODO: Test fails diff --git a/tests/test_mb_seq_first.py b/tests/test_mb_seq_first.py index dd00fd5fc..bb3d1e270 100644 --- a/tests/test_mb_seq_first.py +++ b/tests/test_mb_seq_first.py @@ -1,8 +1,7 @@ import pytest -from tests.common import CONFIG_COMMON from tests.utils.compare_tensor_logs import CompareConfig -from tests.utils.model_configs import TEST_MODEL +from tests.utils.model_configs import CONFIG_COMMON, TEST_MODEL CONFIG_DF_SF = CONFIG_COMMON + ["batch.depth_first_micro_batches=4", "model.base_model.sequence_first=True"] CONFIG_BF_SF = CONFIG_COMMON + ["batch.breadth_first_micro_batches=4", "model.base_model.sequence_first=True"] @@ -20,7 +19,7 @@ def test_model_df4_sf(run_test_script): @pytest.mark.slow -@pytest.mark.depends(on=["test_model_df4_sf"]) +@pytest.mark.depends_on(on=["test_model_df4_sf"]) def test_model_dp2_sp2_df4(run_test_script): # Sequence-tensor-parallel with gradient accumulation. 
# TODO: Compiled cross-entropy broken for this config @@ -39,7 +38,7 @@ def test_model_dp2_sp2_df4(run_test_script): @pytest.mark.slow @pytest.mark.skip(reason="Test is broken.") -@pytest.mark.depends(on=["test_model_df4_sf"]) +@pytest.mark.depends_on(on=["test_model_df4_sf"]) def test_model_dp2_sp2_pp2s1(run_test_script): # 3d-parallel with sequence-tensor-parallel. # TODO: Compiled cross-entropy broken for this config diff --git a/tests/test_ms.py b/tests/test_ms.py index 55032620b..d937f0eb3 100644 --- a/tests/test_ms.py +++ b/tests/test_ms.py @@ -1,7 +1,6 @@ import pytest -from tests.common import CONFIG_COMMON -from tests.utils.model_configs import TEST_MODEL +from tests.utils.model_configs import CONFIG_COMMON, TEST_MODEL CONFIG_MS = CONFIG_COMMON + ["batch.micro_sequence_length=256"] @@ -13,7 +12,7 @@ def test_model_ms256(run_test_script): @pytest.mark.slow -@pytest.mark.depends(on=["test_model_ms256"]) +@pytest.mark.depends_on(on=["test_model_ms256"]) def test_model_pp2s2_ms256(run_test_script): # Sequence-pipeline-parallel run_test_script( @@ -26,7 +25,7 @@ def test_model_pp2s2_ms256(run_test_script): @pytest.mark.slow @pytest.mark.skip -@pytest.mark.depends(on=["test_model_ms256"]) +@pytest.mark.depends_on(on=["test_model_ms256"]) def test_model_dp2s2_stp2_pp2s2_ms256(run_test_script): # TODO: Handle this case. # Sequence-3d-parallel diff --git a/tests/test_multi_stage.py b/tests/test_multi_stage.py index f5f09b1b3..7424cd683 100644 --- a/tests/test_multi_stage.py +++ b/tests/test_multi_stage.py @@ -4,7 +4,7 @@ from fast_llm.layers.transformer.transformer import TransformerLayer from fast_llm.tools.train import CliTrainingConfig from fast_llm.utils import Assert -from tests.common import CONFIG_COMMON +from tests.utils.model_configs import CONFIG_COMMON from tests.utils.utils import requires_cuda diff --git a/tests/test_seq_first.py b/tests/test_seq_first.py index 9ead58e88..123d8a68f 100644 --- a/tests/test_seq_first.py +++ b/tests/test_seq_first.py @@ -1,7 +1,6 @@ import pytest -from tests.common import CONFIG_COMMON -from tests.utils.model_configs import TEST_MODEL +from tests.utils.model_configs import CONFIG_COMMON, TEST_MODEL CONFIG_SF = CONFIG_COMMON + ["model.base_model.sequence_first=True"] @@ -13,7 +12,7 @@ def test_model_sf(run_test_script): @pytest.mark.slow -@pytest.mark.depends(on=["test_model_sf"]) +@pytest.mark.depends_on(on=["test_model_sf"]) def test_model_sp2(run_test_script): # Sequence-tensor-parallel. run_test_script( @@ -25,7 +24,7 @@ def test_model_sp2(run_test_script): @pytest.mark.slow -@pytest.mark.depends(on=["test_model_sf"]) +@pytest.mark.depends_on(on=["test_model_sf"]) def test_model_sdp2(run_test_script): # Sequence-data-parallel run_test_script( @@ -37,7 +36,7 @@ def test_model_sdp2(run_test_script): @pytest.mark.slow -@pytest.mark.depends(on=["test_model_sf"]) +@pytest.mark.depends_on(on=["test_model_sf"]) def test_model_sp2_ce4(run_test_script): # Sequence-tensor-parallel with cross-entropy splits. 
run_test_script( diff --git a/tests/test_simple.py b/tests/test_simple.py index 1523750f7..36ce14245 100644 --- a/tests/test_simple.py +++ b/tests/test_simple.py @@ -1,7 +1,6 @@ import pytest -from tests.common import CONFIG_COMMON, CONFIG_FAST_LLM -from tests.utils.model_configs import TEST_MODEL +from tests.utils.model_configs import CONFIG_COMMON, CONFIG_FAST_LLM, TEST_MODEL def test_model_safe(run_test_script): @@ -17,7 +16,7 @@ def test_model_safe(run_test_script): ) -@pytest.mark.depends(on=["test_model_safe"]) +@pytest.mark.depends_on(on=["test_model_safe"]) def test_model(run_test_script): # A baseline config (single-gpu, bf16, flash-attn). # Also tests for multiple data loaders. @@ -27,7 +26,7 @@ def test_model(run_test_script): @pytest.mark.slow -@pytest.mark.depends(on=["test_model"]) +@pytest.mark.depends_on(on=["test_model"]) def test_model_dp2(run_test_script): # Simple data-parallel. run_test_script(f"test_{TEST_MODEL}_dp2", CONFIG_COMMON, num_gpus=2, compare=f"test_{TEST_MODEL}") @@ -60,7 +59,7 @@ def test_model_dp2_timeout(run_test_script): @pytest.mark.slow -@pytest.mark.depends(on=["test_model"]) +@pytest.mark.depends_on(on=["test_model"]) def test_model_tp2(run_test_script): # Simple tensor-parallel. run_test_script( @@ -71,7 +70,7 @@ def test_model_tp2(run_test_script): ) -@pytest.mark.depends(on=["test_model"]) +@pytest.mark.depends_on(on=["test_model"]) def test_model_ce4(run_test_script): # Cross-entropy splits. run_test_script( @@ -82,7 +81,7 @@ def test_model_ce4(run_test_script): @pytest.mark.slow -@pytest.mark.depends(on=["test_model"]) +@pytest.mark.depends_on(on=["test_model"]) def test_model_dp2_z2(run_test_script): # Data-parallel with zero stage 2. run_test_script( @@ -94,7 +93,7 @@ def test_model_dp2_z2(run_test_script): @pytest.mark.slow -@pytest.mark.depends(on=["test_model"]) +@pytest.mark.depends_on(on=["test_model"]) def test_model_dp2_z3(run_test_script): # Data-parallel with zero stage 3. 
run_test_script( diff --git a/tests/utils/depends.py b/tests/utils/depends.py index 3972a066d..6e10eac17 100644 --- a/tests/utils/depends.py +++ b/tests/utils/depends.py @@ -120,7 +120,8 @@ def _resolve_dependencies(self, item: pytest.Function): for marker in item.iter_markers(): if marker.name == MARKER_NAME: for dependency in as_list(marker.kwargs.get(MARKER_KWARG_DEPENDENCIES, [])): - dependency = dependency.format(**item.callspec.params) + if hasattr(item, "callspec"): + dependency = dependency.format(**item.callspec.params) # If the name is not known, try to make it absolute (ie file::[class::]method) if dependency not in self._name_to_nodeids: From b328f0710f5a6709e0df1c050899639379054bed Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Thu, 12 Jun 2025 17:46:04 -0400 Subject: [PATCH 21/43] stuff --- tests/test_config.py | 29 ----------------------------- 1 file changed, 29 deletions(-) diff --git a/tests/test_config.py b/tests/test_config.py index e050cb230..72eda809d 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,6 +1,5 @@ import pathlib import subprocess -import unittest.mock import pytest import yaml @@ -8,9 +7,7 @@ from fast_llm.config import NoAutoValidate from fast_llm.data.dataset.gpt.config import GPTSamplingConfig from fast_llm.engine.checkpoint.config import CheckpointSaveMetadataConfig, ModelConfigType -from fast_llm.engine.config_utils.data_type import DataType from fast_llm.engine.distributed.config import DistributedConfig -from fast_llm.layers.transformer.config import TransformerConfig from fast_llm.models.auto import trainer_registry from fast_llm.models.gpt.config import GPTModelConfig, PretrainedGPTModelConfig from fast_llm.utils import Assert, check_equal_nested @@ -64,32 +61,6 @@ def test_validate_example_config(): trainer_registry["gpt"].from_dict(fast_llm_config_dict) -def test_do_use_flash_attention(): - # Create a mock DistributedConfig - mock_distributed_config = unittest.mock.Mock(spec=DistributedConfig) - - # Test case 1: use_flash_attention is True and training_dtype is float16 - config = TransformerConfig(use_flash_attention=True, window_size=None) - mock_distributed_config.training_dtype = DataType.float16 - assert config.do_use_flash_attention(mock_distributed_config) is True - - # Test case 2: use_flash_attention is False - config = TransformerConfig(use_flash_attention=False, window_size=None) - mock_distributed_config.training_dtype = DataType.float16 - assert config.do_use_flash_attention(mock_distributed_config) is False - - # Test case 3: use_flash_attention is True but training_dtype is not float16 or bfloat16 - config = TransformerConfig(use_flash_attention=True, window_size=None) - mock_distributed_config.training_dtype = DataType.float32 - assert config.do_use_flash_attention(mock_distributed_config) is False - - # Test case 4: use_flash_attention is False and window_size is not None - config = TransformerConfig(use_flash_attention=False, window_size=512) - mock_distributed_config.training_dtype = DataType.float32 - with pytest.raises(AssertionError): - config.do_use_flash_attention(mock_distributed_config) - - @pytest.mark.parametrize( ("cls", "default"), ((GPTSamplingConfig, {}), (GPTModelConfig, {"distributed": {"world_size": 1, "rank": 0, "local_world_size": 1}})), From 7ed804b153146a58bafa3fb9f9b215eaa9b83048 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Thu, 12 Jun 2025 17:48:54 -0400 Subject: [PATCH 22/43] stuff --- tests/test_functional.py | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/tests/test_functional.py b/tests/test_functional.py index 03a0ae8a0..0689f4d8b 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -218,6 +218,8 @@ def test_mlp_recomputation(gated, activation_type): @pytest.mark.slow @requires_cuda def test_dropless_mlp(): + # TODO: Fix dropless MOE + pytest.fail("Test fails, aborting to avoid breaking cuda", False) num_experts = 4 experts_per_token = 4 tokens = 1024 From 6f000359bb2413f17552b617485f68bc1e07dfe1 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Thu, 12 Jun 2025 17:53:04 -0400 Subject: [PATCH 23/43] stuff --- tests/conftest.py | 2 +- tests/test_functional.py | 2 ++ tests/utils/depends.py | 3 ++- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index bc3d443cd..0d25fc5aa 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -160,7 +160,7 @@ def pytest_collection_modifyitems(config, items: list[pytest.Function]): # If pytest-depends is installed, it will complain about renamed nodes whether it's used or not. try: - import pytest_depends + import pytest_depends.main except ImportError: pass else: diff --git a/tests/test_functional.py b/tests/test_functional.py index b049be855..9211259c2 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -224,6 +224,8 @@ def test_mlp_recomputation(gated, activation_type): @pytest.mark.slow @requires_cuda def test_dropless_mlp(): + # TODO: Fix dropless MOE + pytest.fail("Test fails, aborting to avoid breaking cuda", False) num_experts = 4 experts_per_token = 4 tokens = 256 diff --git a/tests/utils/depends.py b/tests/utils/depends.py index 3972a066d..6e10eac17 100644 --- a/tests/utils/depends.py +++ b/tests/utils/depends.py @@ -120,7 +120,8 @@ def _resolve_dependencies(self, item: pytest.Function): for marker in item.iter_markers(): if marker.name == MARKER_NAME: for dependency in as_list(marker.kwargs.get(MARKER_KWARG_DEPENDENCIES, [])): - dependency = dependency.format(**item.callspec.params) + if hasattr(item, "callspec"): + dependency = dependency.format(**item.callspec.params) # If the name is not known, try to make it absolute (ie file::[class::]method) if dependency not in self._name_to_nodeids: From e45ff6aafacd981b5a3c21515b5e07c02b056f31 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Thu, 12 Jun 2025 17:56:02 -0400 Subject: [PATCH 24/43] stuff --- tests/common.py | 16 ---------------- tests/utils/dataset.py | 1 + 2 files changed, 1 insertion(+), 16 deletions(-) delete mode 100644 tests/common.py diff --git a/tests/common.py b/tests/common.py deleted file mode 100644 index a2dba74a6..000000000 --- a/tests/common.py +++ /dev/null @@ -1,16 +0,0 @@ -import os -import sys - -# FIXME: figure out correct import of megatron modules without this hack -sys.path.append(os.getcwd()) - -# TODO: Use `pytest_addoption` instead? -# Keep all results in one place to allow recovering them for debugging in case of failure. 
- -# Random lowercase: 80.7% (3.1% each); space: 18.6%; doc end: 0.6% - -# Megatron does not support Llama3-style Rotary Embeddings - -# Megatron does not support per sub layer biases - -# Yarn-style Rotary Embeddings diff --git a/tests/utils/dataset.py b/tests/utils/dataset.py index 72888dfdb..6f40347b1 100644 --- a/tests/utils/dataset.py +++ b/tests/utils/dataset.py @@ -15,6 +15,7 @@ DATASET_PREFIX = DATASET_CACHE / "common" / "dataset" DATASET_SAMPLING_CACHE = TEST_RESULTS_PATH / "dataset" / "cache" TEST_VOCAB_SIZE = 8192 +# Random lowercase: 80.7% (3.1% each); space: 18.6%; doc end: 0.6% TEST_CHARACTERS = (string.ascii_lowercase) * 5 + " " * 30 + "\n" TEST_DATASET_TOKENS = 1000000 From 67d3c92c9420af25a6b1c70e992c2c4195357a2f Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Thu, 12 Jun 2025 18:41:46 -0400 Subject: [PATCH 25/43] fix --- .github/workflows/ci.yaml | 7 ++++--- .github/workflows/docs.yaml | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 912ddaf5e..0bca2dd8d 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -27,10 +27,11 @@ jobs: - name: Install dependencies run: | - pip install "torch>=2.2.2" + pip install "torch>=2.7.0" pip install pybind11 - FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE FLASH_ATTENTION_FORCE_BUILD=TRUE MAMBA_SKIP_CUDA_BUILD=TRUE MAMBA_FORCE_BUILD=TRUE CAUSAL_CONV1D_FORCE_BUILD=TRUE CAUSAL_CONV1D_SKIP_CUDA_BUILD=TRUE pip install --no-build-isolation -e ".[CORE,OPTIONAL,DEV,DOCS]" - + FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE FLASH_ATTENTION_FORCE_BUILD=TRUE MAMBA_SKIP_CUDA_BUILD=TRUE \ + MAMBA_FORCE_BUILD=TRUE CAUSAL_CONV1D_FORCE_BUILD=TRUE CAUSAL_CONV1D_SKIP_CUDA_BUILD=TRUE \ + pip install --no-build-isolation -e ".[CORE,OPTIONAL,HUGGINGFACE,SSM,DEV,DOCS]" - name: Run tests run: pytest . 
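A quick illustrative check of the character-frequency comment added to tests/utils/dataset.py above (not part of any patch in this series): assuming the TEST_CHARACTERS definition shown in that hunk, the stated percentages follow directly from the character counts.

import string

# 26 lowercase letters * 5 + 30 spaces + 1 newline = 161 characters total
chars = string.ascii_lowercase * 5 + " " * 30 + "\n"
assert len(chars) == 161
print(f"{26 * 5 / len(chars):.1%}")  # 80.7% lowercase overall, about 3.1% per letter
print(f"{30 / len(chars):.1%}")      # 18.6% space
print(f"{1 / len(chars):.1%}")       # 0.6% document-end newline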
diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index b755993ce..632fa7b93 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -29,7 +29,7 @@ jobs: restore-keys: | mkdocs-material- - run: | - pip install "torch>=2.2.2" + pip install "torch>=2.7.0" pip install pybind11 FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE FLASH_ATTENTION_FORCE_BUILD=TRUE MAMBA_SKIP_CUDA_BUILD=TRUE \ MAMBA_FORCE_BUILD=TRUE CAUSAL_CONV1D_FORCE_BUILD=TRUE CAUSAL_CONV1D_SKIP_CUDA_BUILD=TRUE \ From c2ae03d830007a59745d4982a791ca32e3288f7b Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Fri, 13 Jun 2025 12:54:57 -0400 Subject: [PATCH 26/43] fix --- fast_llm/layers/ssm/discrete_mamba2.py | 4 ++-- fast_llm/layers/ssm/mamba_layer.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fast_llm/layers/ssm/discrete_mamba2.py b/fast_llm/layers/ssm/discrete_mamba2.py index ecf0b29d7..31e81e99b 100644 --- a/fast_llm/layers/ssm/discrete_mamba2.py +++ b/fast_llm/layers/ssm/discrete_mamba2.py @@ -17,7 +17,7 @@ from mamba_ssm.ops.triton.ssd_combined import mamba_chunk_scan_combined as _mamba_chunk_scan_combined # noqa _mamba_available = True -except ImportError: +except (ImportError, RuntimeError): _mamba_available = False @@ -25,7 +25,7 @@ from causal_conv1d import causal_conv1d_fn as _causal_conv1d_fn # noqa _causal_conv1d_available = True -except ImportError: +except (ImportError, RuntimeError): _causal_conv1d_available = False diff --git a/fast_llm/layers/ssm/mamba_layer.py b/fast_llm/layers/ssm/mamba_layer.py index 7fd437894..7c824d235 100644 --- a/fast_llm/layers/ssm/mamba_layer.py +++ b/fast_llm/layers/ssm/mamba_layer.py @@ -14,7 +14,7 @@ from mamba_ssm.ops.selective_scan_interface import mamba_inner_fn as _mamba_inner_fn # noqa _mamba_available = True -except ImportError: +except (ImportError, RuntimeError): _mamba_available = False """ From 31da2a80ff0575afa7fd6588a446a23cd3ae86c2 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Fri, 13 Jun 2025 15:32:48 -0400 Subject: [PATCH 27/43] misc --- fast_llm/config.py | 12 +- fast_llm/engine/multi_stage/config.py | 7 +- fast_llm/models/gpt/conversion.py | 3 +- fast_llm/models/ssm/config.py | 2 +- tests/utils/model_configs.py | 242 +++++++++++++------------- 5 files changed, 136 insertions(+), 130 deletions(-) diff --git a/fast_llm/config.py b/fast_llm/config.py index f2197224e..cdc1dd5d8 100644 --- a/fast_llm/config.py +++ b/fast_llm/config.py @@ -490,7 +490,7 @@ def _validate_element(cls, value, type_, name: str): elif issubclass(origin, dict): value = cls._validate_dict(value, type_, name) elif origin is type: - cls._validate_type(value, type_, name) + value = cls._validate_type(value, type_, name) else: raise FieldTypeError(f"Unsupported __origin__ `{origin}`") elif not isinstance(type_, type): @@ -585,10 +585,13 @@ def _validate_type(cls, value, type_: type | tuple[type, ...], name): args = list(getattr(type_, "__args__", [])) if len(args) != 1: raise FieldTypeError(f"Invalid type specification `{get_type_name(type_)}` for field `{name}`") + if issubclass(args[0], Config) and isinstance(value, str): + value = args[0].get_subclass(value) if not isinstance(value, type): raise ValidationError(f"Unexpected type `{get_type_name(type(value))}`") if not issubclass(value, args[0]): raise ValidationError(f"Field value `{value} is not a subclass of `{get_type_name(type_)}`") + return value @classmethod def _validate_element_type(cls, value, type_: type | tuple[type, ...], strict: bool = True): @@ -947,6 +950,13 @@ def 
get_subclass(cls, name: str | None): raise KeyError(f"Unknown type {name} for base class {cls.__name__}") return cls_ + @classmethod + def __fast_llm_serialize__(cls) -> str: + # Used to serialize config type fields, which only makes sense for dynamic types. + # Deserialization implemented in _validate_type. + assert cls.dynamic_type_name is not None + return cls.dynamic_type_name + def __init_subclass__(cls): """ We need to postpone validation until the class has been processed by the dataclass wrapper. diff --git a/fast_llm/engine/multi_stage/config.py b/fast_llm/engine/multi_stage/config.py index 5aa972c29..6ac157dfe 100644 --- a/fast_llm/engine/multi_stage/config.py +++ b/fast_llm/engine/multi_stage/config.py @@ -215,7 +215,6 @@ class FastLLMModelConfig(Config): DistributedCheckpointFormat, FastLLMCheckpointFormat, ) - model_name: typing.ClassVar[str] base_model: BaseModelConfig = Field(desc="Configuration for the base model.", hint=FieldHint.core) multi_stage: MultiStageConfig = Field( desc="Configuration for the stage breakdown of the model.", @@ -223,10 +222,6 @@ class FastLLMModelConfig(Config): ) distributed: DistributedConfig = Field(desc="Distributed configuration.", hint=FieldHint.core) - @classmethod - def __fast_llm_serialize__(cls) -> str: - return cls.model_name - @classmethod def get_checkpoint_format(cls, format: type[CheckpointFormat] | str) -> type[CheckpointFormat]: if isinstance(format, type) and issubclass(format, CheckpointFormat): @@ -236,7 +231,7 @@ def get_checkpoint_format(cls, format: type[CheckpointFormat] | str) -> type[Che for format_ in cls.checkpoint_formats: if format_.name == format: return format_ - raise ValueError(f"Checkpoint format {format} not supported for model {cls.model_name}") + raise ValueError(f"Checkpoint format {format} not supported for model {cls.dynamic_type_name}") @classmethod def get_checkpoint_handler_class(cls, format: type[CheckpointFormat] | str) -> type[CheckpointHandler]: diff --git a/fast_llm/models/gpt/conversion.py b/fast_llm/models/gpt/conversion.py index 5c6896294..93428954a 100644 --- a/fast_llm/models/gpt/conversion.py +++ b/fast_llm/models/gpt/conversion.py @@ -319,7 +319,8 @@ def _create_config_converters(cls) -> list[ParamConverter]: return super()._create_config_converters() + [ ConstantExportParamConverter(export_names=(("architectures",),), export_value=["Starcoder2ForCausalLM"]), ConstantImportParamConverter( - fast_llm_names=(("transformer", "rotary", "type"),), fast_llm_value=DefaultRotaryConfig + fast_llm_names=(("transformer", "rotary", "type"),), + fast_llm_value=DefaultRotaryConfig.dynamic_type_name, ), ConstantImportParamConverter( fast_llm_names=(("transformer", "normalization", "type"),), diff --git a/fast_llm/models/ssm/config.py b/fast_llm/models/ssm/config.py index 301aca7b0..386d2f504 100644 --- a/fast_llm/models/ssm/config.py +++ b/fast_llm/models/ssm/config.py @@ -169,7 +169,7 @@ def get_handler_class(cls) -> type[CheckpointHandler]: return AprielThinkerSSMHHybridHuggingfaceCheckpointHandler -@config_class() +@config_class(dynamic_type={FastLLMModelConfig: "hybrid_ssm"}) class HybridSSMModelConfig(FastLLMModelConfig): _abstract = False model_name: typing.ClassVar[str] = "hybrid_ssm" diff --git a/tests/utils/model_configs.py b/tests/utils/model_configs.py index d4889e948..8357bdbe4 100644 --- a/tests/utils/model_configs.py +++ b/tests/utils/model_configs.py @@ -7,7 +7,7 @@ import pytest from fast_llm.engine.checkpoint.config import CheckpointFormat -from fast_llm.models.auto import model_registry 
+from fast_llm.engine.multi_stage.config import FastLLMModelConfig from fast_llm.models.gpt.config import ( LlamaGPTHuggingfaceCheckpointFormat, MistralGPTHuggingfaceCheckpointFormat, @@ -30,6 +30,19 @@ class ModelTestingGroup(enum.StrEnum): generate = "generate" +class ModelTestingGroupAction(enum.StrEnum): + # Critical test, will always run. + main = "main" + # Standard test, treated as slow + normal = "normal" + # Feature is not important enough for frequent testing (ex. mostly redundant), treated as extra-slow. + unimportant = "unimportant" + # Test is known to fail, treated as extra-slow. + broken = "broken" + # Tested feature is unsupported for this model, skip unconditionally. + not_implemented = "not_implemented" + + SLOW_TESTING_GROUPS = {ModelTestingGroup.megatron, ModelTestingGroup.distributed} @@ -40,15 +53,12 @@ class ModelTestingConfig: config_args: list[str] megatron_args: list[str] | None checkpoint_format: CheckpointFormat | None - # The important groups we want to test. - testing_groups: list[ModelTestingGroup] - # Other supported groups, excluded by default because they are mostly unimportant and/or redundant. - # They can be run with `--run-extra-slow`. - other_groups: list[ModelTestingGroup] + groups: dict[ModelTestingGroup, ModelTestingGroupAction] @functools.cached_property def model_config_class(self): - return model_registry[self.model_type] + # TODO: Ok to assume the model and trainer have the same name? + return FastLLMModelConfig.get_subclass(self.model_type) @functools.cached_property def huggingface_model_for_causal_lm_class(self): @@ -71,14 +81,12 @@ def _update_and_add_testing_config( extra_args: list[str] | None = None, megatron_args: list[str] | None = ..., checkpoint_format: CheckpointFormat | None = ..., - testing_groups: list[ModelTestingGroup], - other_groups: list[ModelTestingGroup], + groups: dict[ModelTestingGroup, ModelTestingGroupAction], ): config = _MODEL_CONFIGS[old_name] updates: dict[str, typing.Any] = { "name": new_name, - "testing_groups": testing_groups, - "other_groups": other_groups, + "groups": groups, } if model_type is not None: updates["model_type"] = model_type @@ -177,12 +185,13 @@ def _update_and_add_testing_config( "--transformer-impl=transformer_engine", ], checkpoint_format=None, - testing_groups=[ - ModelTestingGroup.basic, - ModelTestingGroup.megatron, - ModelTestingGroup.distributed, - ], - other_groups=[], + groups={ + ModelTestingGroup.basic: ModelTestingGroupAction.main, + ModelTestingGroup.convert: ModelTestingGroupAction.not_implemented, + ModelTestingGroup.generate: ModelTestingGroupAction.not_implemented, + ModelTestingGroup.megatron: ModelTestingGroupAction.normal, + ModelTestingGroup.distributed: ModelTestingGroupAction.normal, + }, ) _update_and_add_testing_config( @@ -192,13 +201,13 @@ def _update_and_add_testing_config( extra_args=["model.base_model.transformer.head_groups=1"], megatron_args=["--group-query-attention"], checkpoint_format=None, - testing_groups=[ - ModelTestingGroup.basic, - ], - other_groups=[ - ModelTestingGroup.megatron, - ModelTestingGroup.distributed, - ], + groups={ + ModelTestingGroup.basic: ModelTestingGroupAction.normal, + ModelTestingGroup.convert: ModelTestingGroupAction.not_implemented, + ModelTestingGroup.generate: ModelTestingGroupAction.not_implemented, + ModelTestingGroup.megatron: ModelTestingGroupAction.unimportant, + ModelTestingGroup.distributed: ModelTestingGroupAction.unimportant, + }, ) _update_and_add_testing_config( @@ -218,16 +227,14 @@ def _update_and_add_testing_config( 
"--no-position-embedding", ], checkpoint_format=Starcoder2GPTHuggingfaceCheckpointFormat, - testing_groups=[ - ModelTestingGroup.basic, - ModelTestingGroup.convert, - ], - # TODO: Bring back `generate` to `testing_groups` when stable. - other_groups=[ - ModelTestingGroup.megatron, - ModelTestingGroup.distributed, - ModelTestingGroup.generate, - ], + # TODO: Add back generate as `normal` when stable. + groups={ + ModelTestingGroup.basic: ModelTestingGroupAction.normal, + ModelTestingGroup.convert: ModelTestingGroupAction.normal, + ModelTestingGroup.generate: ModelTestingGroupAction.broken, + ModelTestingGroup.megatron: ModelTestingGroupAction.unimportant, + ModelTestingGroup.distributed: ModelTestingGroupAction.unimportant, + }, ) _update_and_add_testing_config( @@ -250,16 +257,14 @@ def _update_and_add_testing_config( "--untie-embeddings-and-output-weights", ], checkpoint_format=LlamaGPTHuggingfaceCheckpointFormat, - testing_groups=[ - ModelTestingGroup.basic, - ModelTestingGroup.megatron, - ModelTestingGroup.distributed, - ModelTestingGroup.convert, - ], - # TODO: Bring back `generate` to `testing_groups` when stable. - other_groups=[ - ModelTestingGroup.generate, - ], + # TODO: Add back generate as `normal` when stable. + groups={ + ModelTestingGroup.basic: ModelTestingGroupAction.main, + ModelTestingGroup.convert: ModelTestingGroupAction.main, + ModelTestingGroup.generate: ModelTestingGroupAction.broken, + ModelTestingGroup.megatron: ModelTestingGroupAction.normal, + ModelTestingGroup.distributed: ModelTestingGroupAction.normal, + }, ) _update_and_add_testing_config( @@ -270,15 +275,13 @@ def _update_and_add_testing_config( # Megatron doesn't support Llama3-style Rotary Embeddings megatron_args=None, checkpoint_format=LlamaGPTHuggingfaceCheckpointFormat, - testing_groups=[ - ModelTestingGroup.basic, - ], - # TODO: Bring back `generate` to `testing_groups` when stable. - other_groups=[ - ModelTestingGroup.distributed, - ModelTestingGroup.convert, - ModelTestingGroup.generate, - ], + groups={ + ModelTestingGroup.basic: ModelTestingGroupAction.normal, + ModelTestingGroup.convert: ModelTestingGroupAction.unimportant, + ModelTestingGroup.generate: ModelTestingGroupAction.unimportant, + ModelTestingGroup.megatron: ModelTestingGroupAction.not_implemented, + ModelTestingGroup.distributed: ModelTestingGroupAction.unimportant, + }, ) _update_and_add_testing_config( @@ -289,15 +292,13 @@ def _update_and_add_testing_config( # Megatron doesn't support Yarn-style Rotary Embeddings megatron_args=None, checkpoint_format=LlamaGPTHuggingfaceCheckpointFormat, - testing_groups=[ - ModelTestingGroup.basic, - ], - # TODO: Bring back `generate` to `testing_groups` when stable. - other_groups=[ - ModelTestingGroup.distributed, - ModelTestingGroup.convert, - ModelTestingGroup.generate, - ], + groups={ + ModelTestingGroup.basic: ModelTestingGroupAction.normal, + ModelTestingGroup.convert: ModelTestingGroupAction.unimportant, + ModelTestingGroup.generate: ModelTestingGroupAction.unimportant, + ModelTestingGroup.megatron: ModelTestingGroupAction.not_implemented, + ModelTestingGroup.distributed: ModelTestingGroupAction.unimportant, + }, ) _update_and_add_testing_config( @@ -308,15 +309,14 @@ def _update_and_add_testing_config( # Megatron doesn't support multi-token prediction. megatron_args=None, checkpoint_format=MTPLlamaGPTHuggingfaceCheckpointFormat, - testing_groups=[ - ModelTestingGroup.basic, - ModelTestingGroup.convert, - ], - # TODO: Bring back `generate` to `testing_groups` when stable. 
- other_groups=[ - ModelTestingGroup.distributed, - ModelTestingGroup.generate, - ], + # TODO: Add back generate as `normal` when stable. + groups={ + ModelTestingGroup.basic: ModelTestingGroupAction.normal, + ModelTestingGroup.convert: ModelTestingGroupAction.normal, + ModelTestingGroup.generate: ModelTestingGroupAction.broken, + ModelTestingGroup.megatron: ModelTestingGroupAction.not_implemented, + ModelTestingGroup.distributed: ModelTestingGroupAction.unimportant, + }, ) _update_and_add_testing_config( @@ -327,15 +327,14 @@ def _update_and_add_testing_config( # Megatron doesn't support per sub layer biases megatron_args=None, checkpoint_format=Qwen2GPTHuggingfaceCheckpointFormat, - testing_groups=[ - ModelTestingGroup.basic, - ModelTestingGroup.convert, - ], - # TODO: Bring back `generate` to `testing_groups` when stable. - other_groups=[ - ModelTestingGroup.distributed, - ModelTestingGroup.generate, - ], + # TODO: Add back generate as `normal` when stable. + groups={ + ModelTestingGroup.basic: ModelTestingGroupAction.normal, + ModelTestingGroup.convert: ModelTestingGroupAction.normal, + ModelTestingGroup.generate: ModelTestingGroupAction.broken, + ModelTestingGroup.megatron: ModelTestingGroupAction.not_implemented, + ModelTestingGroup.distributed: ModelTestingGroupAction.unimportant, + }, ) _update_and_add_testing_config( @@ -346,15 +345,14 @@ def _update_and_add_testing_config( # Megatron doesn't support sliding windows. megatron_args=None, checkpoint_format=MistralGPTHuggingfaceCheckpointFormat, - testing_groups=[ - ModelTestingGroup.basic, - ModelTestingGroup.convert, - ], - # TODO: Bring back `generate` to `testing_groups` when stable. - other_groups=[ - ModelTestingGroup.distributed, - ModelTestingGroup.generate, - ], + # TODO: Add back generate as `normal` when stable. + groups={ + ModelTestingGroup.basic: ModelTestingGroupAction.normal, + ModelTestingGroup.convert: ModelTestingGroupAction.normal, + ModelTestingGroup.generate: ModelTestingGroupAction.broken, + ModelTestingGroup.megatron: ModelTestingGroupAction.not_implemented, + ModelTestingGroup.distributed: ModelTestingGroupAction.unimportant, + }, ) _update_and_add_testing_config( @@ -370,16 +368,14 @@ def _update_and_add_testing_config( "--moe-router-topk=4", ], checkpoint_format=MixtralGPTHuggingfaceCheckpointFormat, - testing_groups=[], # TODO: New base image broke mixtral - # TODO: Bring back `generate` to `testing_groups` when stable. - other_groups=[ - ModelTestingGroup.basic, - ModelTestingGroup.megatron, - ModelTestingGroup.distributed, - ModelTestingGroup.convert, - ModelTestingGroup.generate, - ], + groups={ + ModelTestingGroup.basic: ModelTestingGroupAction.broken, + ModelTestingGroup.convert: ModelTestingGroupAction.broken, + ModelTestingGroup.generate: ModelTestingGroupAction.broken, + ModelTestingGroup.megatron: ModelTestingGroupAction.broken, + ModelTestingGroup.distributed: ModelTestingGroupAction.broken, + }, ) _update_and_add_testing_config( @@ -396,16 +392,16 @@ def _update_and_add_testing_config( ], megatron_args=None, checkpoint_format=LLambaHuggingfaceCheckpointFormat, - testing_groups=[ - ModelTestingGroup.basic, - ], - # TODO: Bring back `generate` to `testing_groups` when stable. - other_groups=[ - # TODO: Fix and bring these back to `testing_groups` - ModelTestingGroup.distributed, - ModelTestingGroup.convert, - ModelTestingGroup.generate, - ], + # TODO: Add back generate as `normal` when stable. 
+ groups={ + ModelTestingGroup.basic: ModelTestingGroupAction.normal, + ModelTestingGroup.convert: ModelTestingGroupAction.broken, + # TODO: Fix and bring back to `testing_groups` + ModelTestingGroup.generate: ModelTestingGroupAction.broken, + ModelTestingGroup.megatron: ModelTestingGroupAction.not_implemented, + # TODO: Fix and bring back to `testing_groups` + ModelTestingGroup.distributed: ModelTestingGroupAction.broken, + }, ) @@ -419,14 +415,13 @@ def _update_and_add_testing_config( ], megatron_args=None, checkpoint_format=None, - testing_groups=[ - ModelTestingGroup.basic, - ], - # TODO: Bring back `generate` to `testing_groups` when stable. - other_groups=[ - # TODO: Fix and bring back to `testing_groups` - ModelTestingGroup.distributed, - ], + groups={ + ModelTestingGroup.basic: ModelTestingGroupAction.normal, + ModelTestingGroup.convert: ModelTestingGroupAction.not_implemented, + ModelTestingGroup.generate: ModelTestingGroupAction.not_implemented, + ModelTestingGroup.megatron: ModelTestingGroupAction.not_implemented, + ModelTestingGroup.distributed: ModelTestingGroupAction.unimportant, + }, ) @@ -440,12 +435,17 @@ def testing_group_enabled(item: pytest.Function, skip_slow: bool, skip_extra_slo assert "model_testing_config" in item.callspec.params, item.nodeid groups: tuple[ModelTestingGroup] = item.keywords["model_testing_group"].args model_testing_config = item.callspec.params["model_testing_config"] - model_config = _MODEL_CONFIGS[model_testing_config] + model_config: ModelTestingConfig = _MODEL_CONFIGS[model_testing_config] for group in groups: - if group in model_config.testing_groups and not (skip_slow and group in SLOW_TESTING_GROUPS): - pass - elif group in model_config.other_groups and not skip_extra_slow: - pass + action = model_config.groups[group] + if action == ModelTestingGroupAction.main: + return True + elif action == ModelTestingGroupAction.normal and not skip_slow: + return True + elif ( + action in (ModelTestingGroupAction.broken, ModelTestingGroupAction.unimportant) and not skip_extra_slow + ): + return True elif show_skipped: item.add_marker( pytest.mark.skip(reason=f"Skipping testing group {group} for model {model_testing_config}.") From c2ee8fee9d97dca477ed7fd700be5d440f5d6a3d Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Fri, 13 Jun 2025 15:37:13 -0400 Subject: [PATCH 28/43] stuff --- fast_llm/config.py | 12 +++++++++++- fast_llm/engine/multi_stage/config.py | 7 +------ fast_llm/layers/ssm/discrete_mamba2.py | 4 ++-- fast_llm/layers/ssm/mamba_layer.py | 2 +- fast_llm/models/gpt/conversion.py | 3 ++- fast_llm/models/ssm/config.py | 2 +- 6 files changed, 18 insertions(+), 12 deletions(-) diff --git a/fast_llm/config.py b/fast_llm/config.py index f2197224e..cdc1dd5d8 100644 --- a/fast_llm/config.py +++ b/fast_llm/config.py @@ -490,7 +490,7 @@ def _validate_element(cls, value, type_, name: str): elif issubclass(origin, dict): value = cls._validate_dict(value, type_, name) elif origin is type: - cls._validate_type(value, type_, name) + value = cls._validate_type(value, type_, name) else: raise FieldTypeError(f"Unsupported __origin__ `{origin}`") elif not isinstance(type_, type): @@ -585,10 +585,13 @@ def _validate_type(cls, value, type_: type | tuple[type, ...], name): args = list(getattr(type_, "__args__", [])) if len(args) != 1: raise FieldTypeError(f"Invalid type specification `{get_type_name(type_)}` for field `{name}`") + if issubclass(args[0], Config) and isinstance(value, str): + value = args[0].get_subclass(value) if not isinstance(value, type): 
raise ValidationError(f"Unexpected type `{get_type_name(type(value))}`") if not issubclass(value, args[0]): raise ValidationError(f"Field value `{value} is not a subclass of `{get_type_name(type_)}`") + return value @classmethod def _validate_element_type(cls, value, type_: type | tuple[type, ...], strict: bool = True): @@ -947,6 +950,13 @@ def get_subclass(cls, name: str | None): raise KeyError(f"Unknown type {name} for base class {cls.__name__}") return cls_ + @classmethod + def __fast_llm_serialize__(cls) -> str: + # Used to serialize config type fields, which only makes sense for dynamic types. + # Deserialization implemented in _validate_type. + assert cls.dynamic_type_name is not None + return cls.dynamic_type_name + def __init_subclass__(cls): """ We need to postpone validation until the class has been processed by the dataclass wrapper. diff --git a/fast_llm/engine/multi_stage/config.py b/fast_llm/engine/multi_stage/config.py index 5aa972c29..6ac157dfe 100644 --- a/fast_llm/engine/multi_stage/config.py +++ b/fast_llm/engine/multi_stage/config.py @@ -215,7 +215,6 @@ class FastLLMModelConfig(Config): DistributedCheckpointFormat, FastLLMCheckpointFormat, ) - model_name: typing.ClassVar[str] base_model: BaseModelConfig = Field(desc="Configuration for the base model.", hint=FieldHint.core) multi_stage: MultiStageConfig = Field( desc="Configuration for the stage breakdown of the model.", @@ -223,10 +222,6 @@ class FastLLMModelConfig(Config): ) distributed: DistributedConfig = Field(desc="Distributed configuration.", hint=FieldHint.core) - @classmethod - def __fast_llm_serialize__(cls) -> str: - return cls.model_name - @classmethod def get_checkpoint_format(cls, format: type[CheckpointFormat] | str) -> type[CheckpointFormat]: if isinstance(format, type) and issubclass(format, CheckpointFormat): @@ -236,7 +231,7 @@ def get_checkpoint_format(cls, format: type[CheckpointFormat] | str) -> type[Che for format_ in cls.checkpoint_formats: if format_.name == format: return format_ - raise ValueError(f"Checkpoint format {format} not supported for model {cls.model_name}") + raise ValueError(f"Checkpoint format {format} not supported for model {cls.dynamic_type_name}") @classmethod def get_checkpoint_handler_class(cls, format: type[CheckpointFormat] | str) -> type[CheckpointHandler]: diff --git a/fast_llm/layers/ssm/discrete_mamba2.py b/fast_llm/layers/ssm/discrete_mamba2.py index 31e81e99b..ecf0b29d7 100644 --- a/fast_llm/layers/ssm/discrete_mamba2.py +++ b/fast_llm/layers/ssm/discrete_mamba2.py @@ -17,7 +17,7 @@ from mamba_ssm.ops.triton.ssd_combined import mamba_chunk_scan_combined as _mamba_chunk_scan_combined # noqa _mamba_available = True -except (ImportError, RuntimeError): +except ImportError: _mamba_available = False @@ -25,7 +25,7 @@ from causal_conv1d import causal_conv1d_fn as _causal_conv1d_fn # noqa _causal_conv1d_available = True -except (ImportError, RuntimeError): +except ImportError: _causal_conv1d_available = False diff --git a/fast_llm/layers/ssm/mamba_layer.py b/fast_llm/layers/ssm/mamba_layer.py index 7c824d235..7fd437894 100644 --- a/fast_llm/layers/ssm/mamba_layer.py +++ b/fast_llm/layers/ssm/mamba_layer.py @@ -14,7 +14,7 @@ from mamba_ssm.ops.selective_scan_interface import mamba_inner_fn as _mamba_inner_fn # noqa _mamba_available = True -except (ImportError, RuntimeError): +except ImportError: _mamba_available = False """ diff --git a/fast_llm/models/gpt/conversion.py b/fast_llm/models/gpt/conversion.py index 5c6896294..93428954a 100644 --- 
a/fast_llm/models/gpt/conversion.py +++ b/fast_llm/models/gpt/conversion.py @@ -319,7 +319,8 @@ def _create_config_converters(cls) -> list[ParamConverter]: return super()._create_config_converters() + [ ConstantExportParamConverter(export_names=(("architectures",),), export_value=["Starcoder2ForCausalLM"]), ConstantImportParamConverter( - fast_llm_names=(("transformer", "rotary", "type"),), fast_llm_value=DefaultRotaryConfig + fast_llm_names=(("transformer", "rotary", "type"),), + fast_llm_value=DefaultRotaryConfig.dynamic_type_name, ), ConstantImportParamConverter( fast_llm_names=(("transformer", "normalization", "type"),), diff --git a/fast_llm/models/ssm/config.py b/fast_llm/models/ssm/config.py index 301aca7b0..386d2f504 100644 --- a/fast_llm/models/ssm/config.py +++ b/fast_llm/models/ssm/config.py @@ -169,7 +169,7 @@ def get_handler_class(cls) -> type[CheckpointHandler]: return AprielThinkerSSMHHybridHuggingfaceCheckpointHandler -@config_class() +@config_class(dynamic_type={FastLLMModelConfig: "hybrid_ssm"}) class HybridSSMModelConfig(FastLLMModelConfig): _abstract = False model_name: typing.ClassVar[str] = "hybrid_ssm" From 6c775e47bec481569e3ab69861c52c01a7ae231f Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Fri, 13 Jun 2025 15:53:49 -0400 Subject: [PATCH 29/43] stuff --- tests/test_match_megatron.py | 157 ++--------- tests/test_simple.py | 5 +- tests/utils/model_configs.py | 512 ++++++++++++++++++++--------------- 3 files changed, 322 insertions(+), 352 deletions(-) diff --git a/tests/test_match_megatron.py b/tests/test_match_megatron.py index 7d89c80a2..f464dd06d 100644 --- a/tests/test_match_megatron.py +++ b/tests/test_match_megatron.py @@ -2,25 +2,12 @@ from tests.utils.compare_tensor_logs import CompareConfig from tests.utils.dataset import DATASET_PREFIX -from tests.utils.model_configs import ( - CONFIG_GPT2_FAST_LLM, - CONFIG_GPT2_MEGATRON, - CONFIG_LLAMA_FAST_LLM, - CONFIG_LLAMA_MEGATRON, - CONFIG_MIXTRAL_FAST_LLM, - CONFIG_MIXTRAL_MEGATRON, - CONFIG_SC1_FAST_LLM, - CONFIG_SC1_MEGATRON, - CONFIG_SC2_FAST_LLM, - CONFIG_SC2_MEGATRON, -) +from tests.utils.model_configs import CONFIG_COMMON, CONFIG_MEGATRON, TEST_MODEL @pytest.mark.slow -@pytest.mark.skip(reason="Skipping mostly redundant test") -def test_sc1_meg(run_test_script): - # Starcoder 1 (GPT2 with MQA) with Megatron. - run_test_script("test_sc1_meg", CONFIG_SC1_MEGATRON + ["--micro-batch-size=8"], is_megatron=True) +def test_megatron(run_test_script): + run_test_script(f"test_{TEST_MODEL}_megatron", CONFIG_MEGATRON, is_megatron=True) CONFIG_MATCH_MEGATRON = [ @@ -29,42 +16,31 @@ def test_sc1_meg(run_test_script): ] -@pytest.mark.depends_on(on=["test_sc1_meg"]) -def test_sc1_match_meg(run_test_script): - # Starcoder 1 (GPT2 with MQA) with Fast-llm. - # QKV tensors are in a different format. 
- run_test_script( - "test_sc1_match_meg", - CONFIG_SC1_FAST_LLM + CONFIG_MATCH_MEGATRON + ["model.base_model.use_megatron_initialization=True"], - compare="test_sc1_meg", - config=CompareConfig( - ignore_tensors=[ - ".self_attn.query_key_value.", - ".self_attn.query.", - ".self_attn.key_value.", - ".mlp.layer_2.weight", - ] - ), - ) +@pytest.mark.depends_on(on=["test_megatron"]) +def test_match_megatron(run_test_script): + if CONFIG_MEGATRON is None: + pytest.skip(f"Megatron does not support model {TEST_MODEL}") + ignore_tensors = [ + ".self_attn.query_key_value.", + ".self_attn.query.", + ".self_attn.key_value.", + ".mlp.layer_2.weight", + ".mlp.experts.", + ] + if TEST_MODEL == "mixtral": + ignore_tensors.extend([".mlp.experts.", ".mlp.layer_1.weight"]) -@pytest.mark.slow -@pytest.mark.skip(reason="Skipping mostly redundant test") -@pytest.mark.depends_on(on=["test_sc1_match_meg"]) -def test_sc2_meg(run_test_script): - # Starcoder 2 (GPT2 with MQA and RoPE) with Megatron. - run_test_script("test_sc2_meg", CONFIG_SC2_MEGATRON + ["--micro-batch-size=8"], is_megatron=True) - - -@pytest.mark.depends_on(on=["test_sc2_meg"]) -def test_sc2_match_meg(run_test_script): - # Starcoder 2 (GPT2 with MQA and RoPE) with Fast-llm. - # QKV tensors are in a different format, - # dense not matching because of the way initialization is corrected for RoPE format. run_test_script( - "test_sc2_match_meg", - CONFIG_SC2_FAST_LLM + CONFIG_MATCH_MEGATRON + ["model.base_model.use_megatron_initialization=True"], - compare="test_sc2_meg", + f"test_{TEST_MODEL}_match_megatron", + CONFIG_COMMON + + [ + "model.distributed.training_dtype=fp32", + "data.datasets={}", + f"data.path={DATASET_PREFIX}", + "model.base_model.use_megatron_initialization=True", + ], + compare=f"test_{TEST_MODEL}_megatron", config=CompareConfig( ignore_tensors=[ ".self_attn.query_key_value.", @@ -75,86 +51,3 @@ def test_sc2_match_meg(run_test_script): ] ), ) - - -@pytest.mark.slow -def test_gpt2_meg(run_test_script): - # GPT2 (MHA, layer norm, absolute embeddings) with Megatron. - run_test_script("test_gpt2_meg", CONFIG_GPT2_MEGATRON + ["--micro-batch-size=8"], is_megatron=True) - - -@pytest.mark.depends_on(on=["test_gpt2_meg"]) -def test_gpt2_match_meg(run_test_script): - # GPT2 (MHA, layer norm, absolute embeddings) with Fast-llm. - # QKV tensors are in a different format. - run_test_script( - "test_gpt2_match_meg", - CONFIG_GPT2_FAST_LLM + CONFIG_MATCH_MEGATRON + ["model.base_model.use_megatron_initialization=True"], - compare="test_gpt2_meg", - config=CompareConfig( - ignore_tensors=[ - ".self_attn.query_key_value.", - ".self_attn.query.", - ".self_attn.key_value.", - ".mlp.layer_2.weight", - ] - ), - ) - - -@pytest.mark.slow -def test_mistral_meg(run_test_script): - # Mistral with Megatron. - # No linear bias, swiglu activation, RMSNorm - run_test_script("test_mistral_meg", CONFIG_LLAMA_MEGATRON + ["--micro-batch-size=8"], is_megatron=True) - - -@pytest.mark.depends_on(on=["test_mistral_meg"]) -def test_mistral_match_meg(run_test_script): - # Mistral with Fast-LLM. - run_test_script( - "test_mistral_match_meg", - CONFIG_LLAMA_FAST_LLM + CONFIG_MATCH_MEGATRON + ["model.base_model.use_megatron_initialization=True"], - compare="test_mistral_meg", - config=CompareConfig( - ignore_tensors=[ - ".self_attn.query_key_value.", - ".self_attn.query.", - ".self_attn.key_value.", - ".self_attn.dense.", - ".mlp.layer_2.weight", - ] - ), - ) - - -@pytest.mark.slow -def test_mixtral_meg(run_test_script): - # Mistral with Megatron. 
- # No linear bias, swiglu activation, RMSNorm - run_test_script("test_mixtral_meg", CONFIG_MIXTRAL_MEGATRON + ["--micro-batch-size=8"], is_megatron=True) - - -@pytest.mark.depends_on(on=["test_mixtral_meg"]) -def test_mixtral_match_meg(run_test_script): - # Mistral with Fast-LLM. - # TODO: Fix dropless MOE - pytest.fail("Test fails, aborting to avoid breaking cuda", False) - run_test_script( - "test_mixtral_match_meg", - CONFIG_MIXTRAL_FAST_LLM + CONFIG_MATCH_MEGATRON + ["model.base_model.use_megatron_initialization=True"], - compare="test_mixtral_meg", - config=CompareConfig( - ignore_tensors=[ - ".self_attn.query_key_value.", - ".self_attn.query.", - ".self_attn.key_value.", - ".self_attn.dense.", - ".mlp.layer_1.weight", - ".mlp.layer_2.weight", - ".mlp.experts", - "Global layer 2 fw: Transformer layer 2 output", - ], - max_rel_tolerance=1.5e-1, - ), - ) diff --git a/tests/test_simple.py b/tests/test_simple.py index 36ce14245..d67d06cdb 100644 --- a/tests/test_simple.py +++ b/tests/test_simple.py @@ -1,14 +1,15 @@ import pytest -from tests.utils.model_configs import CONFIG_COMMON, CONFIG_FAST_LLM, TEST_MODEL +from tests.utils.model_configs import CONFIG_COMMON, TEST_MODEL def test_model_safe(run_test_script): # The safest possible config, identical to the one in test_match_megatron except for the initialization. run_test_script( f"test_{TEST_MODEL}_safe", - CONFIG_FAST_LLM + CONFIG_COMMON + [ + "model.distributed.training_dtype=fp32", "run.torch_dynamo_enable=False", "schedule.data_overlap=False", "model.base_model.transformer.dropless_moe=False", diff --git a/tests/utils/model_configs.py b/tests/utils/model_configs.py index 26eebf4f1..c6c412d23 100644 --- a/tests/utils/model_configs.py +++ b/tests/utils/model_configs.py @@ -1,5 +1,10 @@ +import dataclasses +import functools import os +import typing +from fast_llm.engine.checkpoint.config import CheckpointFormat +from fast_llm.engine.multi_stage.config import FastLLMModelConfig from fast_llm.models.gpt.config import ( LlamaGPTHuggingfaceCheckpointFormat, MistralGPTHuggingfaceCheckpointFormat, @@ -12,222 +17,293 @@ from tests.utils.dataset import DATASET_PREFIX, TEST_VOCAB_SIZE _LOG_LEVEL = int(os.environ.get("LOG_LEVEL", 13)) + + +@dataclasses.dataclass(kw_only=True, frozen=True) +class ModelTestingConfig: + name: str = None + model_type: str + config_args: list[str] + megatron_args: list[str] | None + checkpoint_format: CheckpointFormat | None + + @functools.cached_property + def model_config_class(self): + # TODO: Ok to assume the model and trainer have the same name? 
+ return FastLLMModelConfig.get_subclass(self.model_type) + + @functools.cached_property + def huggingface_model_for_causal_lm_class(self): + return self.model_config_class.get_huggingface_model_for_causal_lm_class() + + @functools.cached_property + def model_class(self): + return self.model_config_class.get_model_class() + + @functools.cached_property + def base_model_config_class(self): + return self.model_config_class.get_base_model_config_class() + + +def _update_and_add_testing_config( + old_name: str, + new_name: str, + *, + model_type: str | None = None, + extra_args: list[str] | None = None, + megatron_args: list[str] | None = ..., + checkpoint_format: CheckpointFormat | None = ..., +): + config = _MODEL_CONFIGS[old_name] + updates: dict[str, typing.Any] = {"name": new_name} + if model_type is not None: + updates["model_type"] = model_type + if extra_args is not None: + updates["config_args"] = config.config_args + extra_args + if megatron_args is not ...: + if megatron_args is None: + updates["megatron_args"] = None + elif config.megatron_args is None: + updates["megatron_args"] = megatron_args + else: + updates["megatron_args"] = config.megatron_args + megatron_args + if checkpoint_format is not ...: + updates["checkpoint_format"] = checkpoint_format + + _MODEL_CONFIGS[new_name] = dataclasses.replace(config, **updates) + + +_MODEL_CONFIGS: dict[str, ModelTestingConfig] = {} + + +_MODEL_CONFIGS["gpt2"] = ModelTestingConfig( + # Tests gpt2 features (absolute embeddings, layer norm, relu activation, tied embeddings, MHA, linear biases). + name="gpt2", + model_type="gpt", + config_args=[ + "training.logs.interval=1", + "run.tensor_logs.save=True", + "run.tensor_logs.show=False", + "model.base_model.max_position_embeddings=512", + "model.base_model.transformer.num_layers=2", + "model.base_model.transformer.hidden_size=256", + "model.base_model.transformer.num_attention_heads=8", + "model.base_model.transformer.head_groups=8", + "model.base_model.transformer.init_method_std=0.022", + f"model.base_model.vocab_size={TEST_VOCAB_SIZE}", + f"model.multi_stage.debug_param_init={_LOG_LEVEL}", + f"model.multi_stage.debug_layer_outputs={_LOG_LEVEL}", + f"model.multi_stage.debug_layer_gradients={_LOG_LEVEL}", + f"model.multi_stage.debug_all_param_gradients={_LOG_LEVEL}", + "model.multi_stage.debug_tensor_parallel=True", + "model.distributed.reproducible_init=True", + "model.distributed.timeout=20", + "model.distributed.training_dtype=bf16", + "training.train_iters=2", + "training.num_workers=0", + "training.timeout=30", + "batch.batch_size=8", + "batch.sequence_length=512", + "data.datasets.training.type=slice", + "data.datasets.training.end=0.969", + "data.datasets.training.dataset.type=memmap", + f"data.datasets.training.dataset.path={DATASET_PREFIX}", + "data.datasets.validation.type=slice", + "data.datasets.validation.begin=0.969", + "data.datasets.validation.end=0.999", + "data.datasets.validation.dataset.type=memmap", + f"data.datasets.validation.dataset.path={DATASET_PREFIX}", + "data.datasets.test.type=slice", + "data.datasets.test.begin=0.999", + "data.datasets.test.end=1", + "data.datasets.test.dataset.type=memmap", + f"data.datasets.test.dataset.path={DATASET_PREFIX}", + "optimizer.learning_rate.base=0.0001", + ], + megatron_args=[ + "--num-layers=2", + "--hidden-size=256", + "--num-attention-heads=8", + "--log-interval=1", + "--train-iters=2", + "--eval-iters=0", + "--hidden-dropout=0", + "--attention-dropout=0", + f"--debug_param_init={_LOG_LEVEL}", + 
f"--debug_layer_outputs={_LOG_LEVEL}", + f"--debug_layer_gradients={_LOG_LEVEL}", + f"--debug_all_param_gradients={_LOG_LEVEL}", + "--debug_param_update=0", + "--global-batch-size=8", + "--micro-batch-size=8", + "--max-position-embeddings=512", + "--seq-length=512", + "--init-method-std=0.022", + "--lr=0.0001", + "--num-workers=0", + "--valid-num-workers=0", + "--tokenizer-type=NullTokenizer", + # Megatron messes with the vocab size, so we have to subtract 1. + f"--vocab-size={TEST_VOCAB_SIZE - 1}", + f"--data-path={DATASET_PREFIX}", + "--lr-decay-style=constant", + # Initialization is set up to match MCore models (MCore inverts self-attn qkv and dense layers compared to original Megatron) + "--use-mcore-models", + # local implementation doesn't allow for RMS norm. + "--transformer-impl=transformer_engine", + ], + checkpoint_format=None, +) + +_update_and_add_testing_config( + # Tests MQA. + "gpt2", + "starcoder", + extra_args=["model.base_model.transformer.head_groups=1"], + megatron_args=["--group-query-attention"], + checkpoint_format=None, +) + +_update_and_add_testing_config( + # Tests intermediate between gpt2 and llama, closest converter to gpt2. + "gpt2", + "starcoder2", + extra_args=[ + "model.base_model.transformer.head_groups=4", + "model.base_model.transformer.rotary.type=default", + # Unused, but prevents issues with conversion tests. + "model.base_model.max_position_embeddings=2048", + ], + megatron_args=[ + "--group-query-attention", + "--num-query-groups=4", + "--use-rotary-position-embeddings", + "--no-position-embedding", + ], + checkpoint_format=Starcoder2GPTHuggingfaceCheckpointFormat, +) + +_update_and_add_testing_config( + # Main tested model. + "starcoder2", + "llama", + extra_args=[ + "model.base_model.transformer.gated=True", + "model.base_model.transformer.activation_type=silu", + "model.base_model.transformer.add_linear_biases=False", + "model.base_model.transformer.normalization.type=rms_norm", + "model.base_model.transformer.ffn_hidden_size=1024", + "model.base_model.tie_word_embeddings=False", + ], + megatron_args=[ + "--swiglu", + "--disable-bias-linear", + "--normalization=RMSNorm", + "--ffn-hidden-size=1024", + "--untie-embeddings-and-output-weights", + ], + checkpoint_format=LlamaGPTHuggingfaceCheckpointFormat, +) + +_update_and_add_testing_config( + # Tests llama3-style rotary embeddings. + "llama", + "llama3", + extra_args=["model.base_model.transformer.rotary.type=llama3"], + # Megatron doesn't support Llama3-style Rotary Embeddings + megatron_args=None, + checkpoint_format=LlamaGPTHuggingfaceCheckpointFormat, +) + +_update_and_add_testing_config( + # Tests yarn-style rotary embeddings. + "llama", + "llama_yarn", + extra_args=["model.base_model.transformer.rotary.type=yarn"], + # Megatron doesn't support Yarn-style Rotary Embeddings + megatron_args=None, + checkpoint_format=LlamaGPTHuggingfaceCheckpointFormat, +) + +_update_and_add_testing_config( + # Tests multi-token prediction, custom HF model and converter. + "llama", + "llama_mtp", + extra_args=["model.base_model.prediction_heads=4"], + # Megatron doesn't support multi-token prediction. + megatron_args=None, + checkpoint_format=MTPLlamaGPTHuggingfaceCheckpointFormat, +) + +_update_and_add_testing_config( + # Tests partial linear biases, Qwen2 converter. 
+ "llama", + "qwen2", + extra_args=["model.base_model.transformer.add_linear_biases=only_attn_qkv"], + # Megatron doesn't support per sub layer biases + megatron_args=None, + checkpoint_format=Qwen2GPTHuggingfaceCheckpointFormat, +) + +_update_and_add_testing_config( + # Tests sliding window attention, mistral converter. + "llama", + "mistral", + extra_args=["model.base_model.transformer.window_size=128"], + # Megatron doesn't support sliding windows. + megatron_args=None, + checkpoint_format=MistralGPTHuggingfaceCheckpointFormat, +) + +_update_and_add_testing_config( + # Tests mixture of experts, mixtral converter. + "llama", + "mixtral", + extra_args=[ + "model.base_model.transformer.num_experts=4", + "model.base_model.transformer.num_experts_per_token=4", + ], + megatron_args=[ + "--num-experts=4", + "--moe-router-topk=4", + ], + checkpoint_format=MixtralGPTHuggingfaceCheckpointFormat, +) + +_update_and_add_testing_config( + # Tests hybrid ssm, llamba converter. + "llama", + "llamba", + model_type="hybrid_ssm", + extra_args=[ + "model.base_model.hybrid_block_layout=['t','m']", + "model.base_model.ssm.state_size=8", + "model.base_model.ssm.chunk_size=32", + "model.base_model.ssm.n_qk_heads=8", + "model.base_model.ssm.n_v_heads=8", + ], + megatron_args=None, + checkpoint_format=LLambaHuggingfaceCheckpointFormat, +) + + +_update_and_add_testing_config( + # Tests hybrid ssm, llamba converter. + "llamba", + "hybrid_mamba_2", + model_type="hybrid_ssm", + extra_args=[ + "model.base_model.hybrid_block_layout=['t','m2d']", + ], + megatron_args=None, + checkpoint_format=None, +) + TEST_MODEL = os.environ.get("MODEL", "llama") -CONFIG_BASE_FAST_LLM = [ - "training.logs.interval=1", - "run.tensor_logs.save=True", - "run.tensor_logs.show=False", - "model.base_model.transformer.num_layers=2", - "model.base_model.transformer.hidden_size=256", - "model.base_model.transformer.num_attention_heads=8", - "model.base_model.transformer.init_method_std=0.022", - f"model.base_model.vocab_size={TEST_VOCAB_SIZE}", - f"model.multi_stage.debug_param_init={_LOG_LEVEL}", - f"model.multi_stage.debug_layer_outputs={_LOG_LEVEL}", - f"model.multi_stage.debug_layer_gradients={_LOG_LEVEL}", - f"model.multi_stage.debug_all_param_gradients={_LOG_LEVEL}", - "model.multi_stage.debug_tensor_parallel=True", - "model.distributed.reproducible_init=True", - "model.distributed.timeout=10", - "training.train_iters=2", - "training.num_workers=0", - "training.timeout=30", - "batch.batch_size=8", - "batch.sequence_length=512", - "data.datasets.training.type=slice", - "data.datasets.training.end=0.969", - "data.datasets.training.dataset.type=memmap", - f"data.datasets.training.dataset.path={DATASET_PREFIX}", - "data.datasets.validation.type=slice", - "data.datasets.validation.begin=0.969", - "data.datasets.validation.end=0.999", - "data.datasets.validation.dataset.type=memmap", - f"data.datasets.validation.dataset.path={DATASET_PREFIX}", - "data.datasets.test.type=slice", - "data.datasets.test.begin=0.999", - "data.datasets.test.end=1", - "data.datasets.test.dataset.type=memmap", - f"data.datasets.test.dataset.path={DATASET_PREFIX}", - "optimizer.learning_rate.base=0.0001", -] -CONFIG_BASE_MEGATRON = [ - "--num-layers=2", - "--hidden-size=256", - "--num-attention-heads=8", - "--log-interval=1", - "--train-iters=2", - "--eval-iters=0", - "--hidden-dropout=0", - "--attention-dropout=0", - f"--debug_param_init={_LOG_LEVEL}", - f"--debug_layer_outputs={_LOG_LEVEL}", - f"--debug_layer_gradients={_LOG_LEVEL}", - 
f"--debug_all_param_gradients={_LOG_LEVEL}", - "--debug_param_update=0", - "--global-batch-size=8", - "--max-position-embeddings=512", - "--seq-length=512", - "--init-method-std=0.022", - "--lr=0.0001", - "--num-workers=0", - "--valid-num-workers=0", - "--tokenizer-type=NullTokenizer", - # Megatron messes with the vocab size, so we have to subtract 1. - f"--vocab-size={TEST_VOCAB_SIZE - 1}", - f"--data-path={DATASET_PREFIX}", - "--lr-decay-style=constant", - # Initialization is set up to match MCore models (MCore inverts self-attn qkv and dense layers compared to original Megatron) - "--use-mcore-models", - # local implementation doesn't allow for RMS norm. - "--transformer-impl=transformer_engine", -] -CONFIG_SC1_FAST_LLM = CONFIG_BASE_FAST_LLM + ["model.base_model.max_position_embeddings=512"] -CONFIG_SC1_MEGATRON = CONFIG_BASE_MEGATRON + ["--group-query-attention"] -CONFIG_SC1_COMMON = CONFIG_SC1_FAST_LLM + ["model.distributed.training_dtype=bf16"] -CONFIG_GPT2_FAST_LLM = CONFIG_SC1_FAST_LLM + ["model.base_model.transformer.head_groups=8"] -CONFIG_GPT2_MEGATRON = CONFIG_BASE_MEGATRON -CONFIG_GPT2_COMMON = CONFIG_GPT2_FAST_LLM + ["model.distributed.training_dtype=bf16"] -CONFIG_SC2_FAST_LLM = CONFIG_BASE_FAST_LLM + [ - "model.base_model.transformer.head_groups=4", - "model.base_model.transformer.rotary.type=default", -] -CONFIG_SC2_MEGATRON = CONFIG_SC1_MEGATRON + [ - "--num-query-groups=4", - "--use-rotary-position-embeddings", - "--no-position-embedding", -] -CONFIG_SC2_COMMON = CONFIG_SC2_FAST_LLM + ["model.distributed.training_dtype=bf16"] -CONFIG_LLAMA_MEGATRON = CONFIG_SC2_MEGATRON + [ - "--swiglu", - "--disable-bias-linear", - "--normalization=RMSNorm", - "--ffn-hidden-size=1024", - "--untie-embeddings-and-output-weights", -] -CONFIG_LLAMA_FAST_LLM = CONFIG_SC2_FAST_LLM + [ - "model.base_model.transformer.gated=True", - "model.base_model.transformer.activation_type=silu", - "model.base_model.transformer.add_linear_biases=False", - "model.base_model.transformer.normalization.type=rms_norm", - "model.base_model.transformer.ffn_hidden_size=1024", - "model.base_model.tie_word_embeddings=False", -] -CONFIG_LLAMA_COMMON = CONFIG_LLAMA_FAST_LLM + ["model.distributed.training_dtype=bf16"] -CONFIG_LLAMA3_MEGATRON = None -CONFIG_LLAMA3_FAST_LLM = CONFIG_LLAMA_FAST_LLM + [ - "model.base_model.transformer.rotary.type=llama3", -] -CONFIG_LLAMA3_COMMON = CONFIG_LLAMA3_FAST_LLM + ["model.distributed.training_dtype=bf16"] -CONFIG_QWEN2_MEGATRON = None -CONFIG_QWEN2_FAST_LLM = CONFIG_SC2_FAST_LLM + [ - "model.base_model.transformer.gated=True", - "model.base_model.transformer.activation_type=silu", - "model.base_model.transformer.add_linear_biases=only_attn_qkv", - "model.base_model.transformer.normalization.type=rms_norm", - "model.base_model.transformer.ffn_hidden_size=1024", - "model.base_model.tie_word_embeddings=False", -] -CONFIG_QWEN2_COMMON = CONFIG_QWEN2_FAST_LLM + ["model.distributed.training_dtype=bf16"] -CONFIG_LLAMA_YARN_MEGATRON = None -CONFIG_LLAMA_YARN_FAST_LLM = CONFIG_LLAMA_FAST_LLM + [ - "model.base_model.transformer.rotary.type=yarn", -] -CONFIG_LLAMA_YARN_COMMON = CONFIG_LLAMA_YARN_FAST_LLM + ["model.distributed.training_dtype=bf16"] -CONFIG_MIXTRAL_MEGATRON = CONFIG_LLAMA_MEGATRON + [ - "--num-experts=4", - "--moe-router-topk=4", -] -CONFIG_MIXTRAL_FAST_LLM = CONFIG_LLAMA_FAST_LLM + [ - "model.base_model.transformer.num_experts=4", - "model.base_model.transformer.num_experts_per_token=4", -] -CONFIG_MIXTRAL_COMMON = CONFIG_MIXTRAL_FAST_LLM + 
["model.distributed.training_dtype=bf16"] -CONFIG_MIXTRAL_YARN_MEGATRON = None -CONFIG_MIXTRAL_YARN_FAST_LLM = CONFIG_MIXTRAL_FAST_LLM + [ - "model.base_model.transformer.rotary.type=yarn", -] -CONFIG_MIXTRAL_YARN_COMMON = CONFIG_MIXTRAL_YARN_FAST_LLM + ["model.distributed.training_dtype=bf16"] -CONFIG_LLAMA_MTP_MEGATRON = None -CONFIG_LLAMA_MTP_FAST_LLM = CONFIG_LLAMA_FAST_LLM + [ - "model.base_model.prediction_heads=4", -] -CONFIG_LLAMA_MTP_COMMON = CONFIG_LLAMA_MTP_FAST_LLM + ["model.distributed.training_dtype=bf16"] -CONFIG_LLAMBA_FAST_LLM = CONFIG_LLAMA_FAST_LLM + ["model.base_model.hybrid_block_layout==['t','m']"] -CONFIG_LLAMBA_MEGATRON = CONFIG_LLAMA_MEGATRON + [] -CONFIG_LLAMBA_COMMON = CONFIG_LLAMBA_FAST_LLM -_CONFIGS = { - "gpt2": ("gpt", CONFIG_GPT2_FAST_LLM, CONFIG_GPT2_MEGATRON, CONFIG_GPT2_COMMON, None), - "sc1": ("gpt", CONFIG_SC1_FAST_LLM, CONFIG_SC1_MEGATRON, CONFIG_SC1_COMMON, None), - "starcoder2": ( - "gpt", - CONFIG_SC2_FAST_LLM, - CONFIG_SC2_MEGATRON, - CONFIG_SC2_COMMON, - Starcoder2GPTHuggingfaceCheckpointFormat, - ), - "llama": ( - "gpt", - CONFIG_LLAMA_FAST_LLM, - CONFIG_LLAMA_MEGATRON, - CONFIG_LLAMA_COMMON, - LlamaGPTHuggingfaceCheckpointFormat, - ), - "llama3": ( - "gpt", - CONFIG_LLAMA3_FAST_LLM, - CONFIG_LLAMA3_MEGATRON, - CONFIG_LLAMA3_COMMON, - LlamaGPTHuggingfaceCheckpointFormat, - ), - "qwen2": ( - "gpt", - CONFIG_QWEN2_FAST_LLM, - CONFIG_QWEN2_MEGATRON, - CONFIG_QWEN2_COMMON, - Qwen2GPTHuggingfaceCheckpointFormat, - ), - "llama-yarn": ( - "gpt", - CONFIG_LLAMA_YARN_FAST_LLM, - CONFIG_LLAMA_YARN_MEGATRON, - CONFIG_LLAMA_YARN_COMMON, - LlamaGPTHuggingfaceCheckpointFormat, - ), - "mistral": ( - "gpt", - CONFIG_LLAMA_FAST_LLM, - CONFIG_LLAMA_MEGATRON, - CONFIG_LLAMA_COMMON, - MistralGPTHuggingfaceCheckpointFormat, - ), - "mixtral": ( - "gpt", - CONFIG_MIXTRAL_FAST_LLM, - CONFIG_MIXTRAL_MEGATRON, - CONFIG_MIXTRAL_COMMON, - MixtralGPTHuggingfaceCheckpointFormat, - ), - "llamba": ( - "hybrid_ssm", - CONFIG_LLAMBA_FAST_LLM, - CONFIG_LLAMBA_MEGATRON, - CONFIG_LLAMBA_COMMON, - LLambaHuggingfaceCheckpointFormat, - ), - "mixtral-yarn": ( - "gpt", - CONFIG_MIXTRAL_YARN_FAST_LLM, - CONFIG_MIXTRAL_YARN_MEGATRON, - CONFIG_MIXTRAL_YARN_COMMON, - MixtralGPTHuggingfaceCheckpointFormat, - ), - "llama-mtp": ( - "gpt", - CONFIG_LLAMA_MTP_FAST_LLM, - CONFIG_LLAMA_MTP_MEGATRON, - CONFIG_LLAMA_MTP_COMMON, - MTPLlamaGPTHuggingfaceCheckpointFormat, - ), -} - -TEST_MODEL_TYPE, CONFIG_FAST_LLM, CONFIG_GPT2, CONFIG_COMMON, HUGGINGFACE_CHECKPOINT_FORMAT = _CONFIGS[TEST_MODEL] +_MODEL_CONFIG = _MODEL_CONFIGS[TEST_MODEL] + + +TEST_MODEL_TYPE = _MODEL_CONFIG.model_type +CONFIG_COMMON = _MODEL_CONFIG.config_args +CONFIG_MEGATRON = _MODEL_CONFIG.megatron_args +HUGGINGFACE_CHECKPOINT_FORMAT = _MODEL_CONFIG.checkpoint_format From d41e0d5a66e6b79ae9d67cec6cf086325429d3d8 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Fri, 13 Jun 2025 16:02:15 -0400 Subject: [PATCH 30/43] misc --- tests/test_match_megatron.py | 3 +-- tests/utils/model_configs.py | 3 --- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/test_match_megatron.py b/tests/test_match_megatron.py index 4f82d5753..7645de9e1 100644 --- a/tests/test_match_megatron.py +++ b/tests/test_match_megatron.py @@ -13,8 +13,7 @@ def test_megatron(run_test_script_for_all_models, model_testing_config): @pytest.mark.depends_on(on=["test_megatron[{model_testing_config}]"]) @pytest.mark.model_testing_group(ModelTestingGroup.megatron) def test_match_megatron(run_test_script_for_all_models, model_testing_config): - if 
model_testing_config.megatron_args is None: - pytest.skip(f"Megatron does not support model {model_testing_config.name}") + assert model_testing_config.megatron_args is not None ignore_tensors = [ ".self_attn.query_key_value.", diff --git a/tests/utils/model_configs.py b/tests/utils/model_configs.py index 8357bdbe4..ee9ad5cbc 100644 --- a/tests/utils/model_configs.py +++ b/tests/utils/model_configs.py @@ -43,9 +43,6 @@ class ModelTestingGroupAction(enum.StrEnum): not_implemented = "not_implemented" -SLOW_TESTING_GROUPS = {ModelTestingGroup.megatron, ModelTestingGroup.distributed} - - @dataclasses.dataclass(kw_only=True, frozen=True) class ModelTestingConfig: name: str = None From 59582c3a639002de1a53861b6e544f5a26ca05af Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Fri, 13 Jun 2025 16:02:55 -0400 Subject: [PATCH 31/43] misc --- .github/workflows/ci.yaml | 7 +++---- .github/workflows/docs.yaml | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 0bca2dd8d..912ddaf5e 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -27,11 +27,10 @@ jobs: - name: Install dependencies run: | - pip install "torch>=2.7.0" + pip install "torch>=2.2.2" pip install pybind11 - FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE FLASH_ATTENTION_FORCE_BUILD=TRUE MAMBA_SKIP_CUDA_BUILD=TRUE \ - MAMBA_FORCE_BUILD=TRUE CAUSAL_CONV1D_FORCE_BUILD=TRUE CAUSAL_CONV1D_SKIP_CUDA_BUILD=TRUE \ - pip install --no-build-isolation -e ".[CORE,OPTIONAL,HUGGINGFACE,SSM,DEV,DOCS]" + FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE FLASH_ATTENTION_FORCE_BUILD=TRUE MAMBA_SKIP_CUDA_BUILD=TRUE MAMBA_FORCE_BUILD=TRUE CAUSAL_CONV1D_FORCE_BUILD=TRUE CAUSAL_CONV1D_SKIP_CUDA_BUILD=TRUE pip install --no-build-isolation -e ".[CORE,OPTIONAL,DEV,DOCS]" + - name: Run tests run: pytest . diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 632fa7b93..b755993ce 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -29,7 +29,7 @@ jobs: restore-keys: | mkdocs-material- - run: | - pip install "torch>=2.7.0" + pip install "torch>=2.2.2" pip install pybind11 FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE FLASH_ATTENTION_FORCE_BUILD=TRUE MAMBA_SKIP_CUDA_BUILD=TRUE \ MAMBA_FORCE_BUILD=TRUE CAUSAL_CONV1D_FORCE_BUILD=TRUE CAUSAL_CONV1D_SKIP_CUDA_BUILD=TRUE \ From 8ecf81e4a6e69ebb94e1f7e02bd0c3f7d2633386 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Fri, 13 Jun 2025 16:03:35 -0400 Subject: [PATCH 32/43] fix --- tests/test_match_megatron.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/tests/test_match_megatron.py b/tests/test_match_megatron.py index f464dd06d..9f8614648 100644 --- a/tests/test_match_megatron.py +++ b/tests/test_match_megatron.py @@ -41,13 +41,5 @@ def test_match_megatron(run_test_script): "model.base_model.use_megatron_initialization=True", ], compare=f"test_{TEST_MODEL}_megatron", - config=CompareConfig( - ignore_tensors=[ - ".self_attn.query_key_value.", - ".self_attn.query.", - ".self_attn.key_value.", - ".self_attn.dense.", - ".mlp.layer_2.weight", - ] - ), + config=CompareConfig(ignore_tensors=ignore_tensors), ) From c5b29e257aa6067a96d76888878436d201d49a7e Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Fri, 13 Jun 2025 16:44:36 -0400 Subject: [PATCH 33/43] Revert "misc" This reverts commit 59582c3a639002de1a53861b6e544f5a26ca05af. 
--- .github/workflows/ci.yaml | 7 ++++--- .github/workflows/docs.yaml | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 912ddaf5e..0bca2dd8d 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -27,10 +27,11 @@ jobs: - name: Install dependencies run: | - pip install "torch>=2.2.2" + pip install "torch>=2.7.0" pip install pybind11 - FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE FLASH_ATTENTION_FORCE_BUILD=TRUE MAMBA_SKIP_CUDA_BUILD=TRUE MAMBA_FORCE_BUILD=TRUE CAUSAL_CONV1D_FORCE_BUILD=TRUE CAUSAL_CONV1D_SKIP_CUDA_BUILD=TRUE pip install --no-build-isolation -e ".[CORE,OPTIONAL,DEV,DOCS]" - + FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE FLASH_ATTENTION_FORCE_BUILD=TRUE MAMBA_SKIP_CUDA_BUILD=TRUE \ + MAMBA_FORCE_BUILD=TRUE CAUSAL_CONV1D_FORCE_BUILD=TRUE CAUSAL_CONV1D_SKIP_CUDA_BUILD=TRUE \ + pip install --no-build-isolation -e ".[CORE,OPTIONAL,HUGGINGFACE,SSM,DEV,DOCS]" - name: Run tests run: pytest . diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index b755993ce..632fa7b93 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -29,7 +29,7 @@ jobs: restore-keys: | mkdocs-material- - run: | - pip install "torch>=2.2.2" + pip install "torch>=2.7.0" pip install pybind11 FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE FLASH_ATTENTION_FORCE_BUILD=TRUE MAMBA_SKIP_CUDA_BUILD=TRUE \ MAMBA_FORCE_BUILD=TRUE CAUSAL_CONV1D_FORCE_BUILD=TRUE CAUSAL_CONV1D_SKIP_CUDA_BUILD=TRUE \ From edced8c829bef10e8b917195e4215a0f785ee5b6 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Fri, 13 Jun 2025 17:06:30 -0400 Subject: [PATCH 34/43] Cleanup tests --- tests/layers/test_lm_head.py | 49 +++--- tests/test_functional.py | 18 ++- tests/test_mtp.py | 209 -------------------------- tests/test_ssms.py | 282 ++--------------------------------- tests/utils/utils.py | 67 ++++----- 5 files changed, 72 insertions(+), 553 deletions(-) delete mode 100644 tests/test_mtp.py diff --git a/tests/layers/test_lm_head.py b/tests/layers/test_lm_head.py index ddb1521f5..9d124d4d0 100644 --- a/tests/layers/test_lm_head.py +++ b/tests/layers/test_lm_head.py @@ -5,20 +5,14 @@ from fast_llm.config import UpdateType from fast_llm.engine.config_utils.data_type import DataType -from fast_llm.engine.config_utils.tensor_space import TensorSpace -from fast_llm.engine.distributed.config import DistributedConfig -from fast_llm.engine.distributed.distributed import Distributed -from fast_llm.engine.multi_stage.config import StageConfig -from fast_llm.engine.multi_stage.stage import Stage from fast_llm.functional.config import CrossEntropyImpl from fast_llm.layers.language_model.config import LanguageModelKwargs from fast_llm.layers.language_model.embedding import WORD_EMBEDDINGS_WEIGHT from fast_llm.layers.language_model.head import OUTPUT_WEIGHTS, LanguageModelHead from fast_llm.layers.transformer.config import TransformerKwargs -from fast_llm.models.gpt.config import GPTBaseModelConfig -from fast_llm.models.gpt.model import GPTBaseModel +from fast_llm.models.gpt.config import GPTBaseModelConfig, GPTModelConfig from fast_llm.utils import Assert -from tests.utils.utils import requires_cuda +from tests.utils.utils import get_base_model, get_stage, requires_cuda def _lm_head( @@ -100,13 +94,15 @@ def test_lm_head( config_dict, update_type=UpdateType.update, ) - distributed_config = DistributedConfig.from_dict(distributed_config_dict) - distributed = Distributed(distributed_config) - tensor_space = TensorSpace(distributed_config) - 
config.setup_tensor_space(tensor_space) - tensor_space.setup(distributed) - model = GPTBaseModel(config, distributed_config) - model.setup(distributed) + + model, distributed = get_base_model( + GPTModelConfig.from_dict( + { + "base_model": config, + "distributed": distributed_config_dict, + }, + ) + ) sequence_first = config.sequence_first or ( config.cross_entropy_splits is not None and config.cross_entropy_splits > 1 @@ -114,9 +110,9 @@ def test_lm_head( input_ = torch.randn( (SEQUENCE_LENGTH, BATCH_SIZE, HIDDEN_SIZE) if sequence_first else (BATCH_SIZE, SEQUENCE_LENGTH, HIDDEN_SIZE), dtype=( - distributed_config.optimization_dtype.torch + distributed.config.optimization_dtype.torch if config.transformer.full_precision_residual - else distributed_config.training_dtype.torch + else distributed.config.training_dtype.torch ), device=distributed.device, requires_grad=True, @@ -160,7 +156,7 @@ def test_lm_head( if config.tie_word_embeddings or config.prediction_heads > 1: logit_weight = ( torch.empty( - VOCAB_SIZE, HIDDEN_SIZE, dtype=distributed_config.training_dtype.torch, device=distributed.device + VOCAB_SIZE, HIDDEN_SIZE, dtype=distributed.config.training_dtype.torch, device=distributed.device ) .normal_(config.transformer.init_method_std) .requires_grad_(True) @@ -174,18 +170,7 @@ def test_lm_head( head: LanguageModelHead = model[layer_index] Assert.custom(isinstance, head, LanguageModelHead) Assert.eq(head._prediction_distance, prediction_distance) - stage = Stage( - config=StageConfig(), - base_model=[head], - distributed_config=distributed_config, - begin=0, - end=1, - index=0, - ) - stage.setup(distributed=distributed) - stage.initialize_weights() - stage.restore_parameters() - stage.reset_gradients() + stage = get_stage([head], distributed) # Get reference outputs and grads if logit_weight is None: @@ -230,9 +215,9 @@ def test_lm_head( output, context = stage.forward(head_input, kwargs, losses) stage.backward(output_grad, context) - threshold = 1e-5 if distributed_config.training_dtype == DataType.float32 else 5e-3 + threshold = 1e-5 if distributed.config.training_dtype == DataType.float32 else 5e-3 min_threshold = ( - 1e-5 if distributed_config.training_dtype == DataType.float32 else 1e-4 + 1e-5 if distributed.config.training_dtype == DataType.float32 else 1e-4 ) * config.logits_scale_factor Assert.eq(losses.keys(), loss_keys) diff --git a/tests/test_functional.py b/tests/test_functional.py index 0689f4d8b..9211259c2 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -57,9 +57,15 @@ def ref_packed_get_batch_logps( @pytest.mark.slow -@pytest.mark.parametrize("batch_size", [1, 2, 4, 8]) -@pytest.mark.parametrize("seq_length", [1024, 4096, 8192]) -@pytest.mark.parametrize("vocab_size", [1000, 2000, 8000]) +@pytest.mark.parametrize( + ("batch_size", "seq_length", "vocab_size"), + ( + (2, 32, 50), + (1, 32, 50), + (2, 100, 50), + (2, 32, 200), + ), +) def test_preference_logps(batch_size, seq_length, vocab_size): random.seed(0) torch.manual_seed(0) @@ -222,9 +228,9 @@ def test_dropless_mlp(): pytest.fail("Test fails, aborting to avoid breaking cuda", False) num_experts = 4 experts_per_token = 4 - tokens = 1024 - hidden_size = 2048 - ffn_hidden_size = 4096 + tokens = 256 + hidden_size = 512 + ffn_hidden_size = 1024 std = 1 / 64 input_ = torch.randn(tokens, hidden_size, device="cuda", requires_grad=True) router_weight = torch.normal(0, std, (num_experts, hidden_size), device="cuda") diff --git a/tests/test_mtp.py b/tests/test_mtp.py deleted file mode 100644 index 
1f01954e8..000000000 --- a/tests/test_mtp.py +++ /dev/null @@ -1,209 +0,0 @@ -import typing - -import pytest -import torch - -from fast_llm.config import UpdateType -from fast_llm.engine.distributed.config import DistributedConfig -from fast_llm.engine.distributed.distributed import Distributed -from fast_llm.layers.language_model.config import LanguageModelKwargs, LanguageModelLossNames -from fast_llm.layers.language_model.embedding import WORD_EMBEDDINGS_WEIGHT -from fast_llm.layers.language_model.head import OUTPUT_WEIGHTS, LanguageModelHead -from fast_llm.layers.ssm.config import SSMBlockType -from fast_llm.layers.transformer.config import TransformerKwargs -from fast_llm.layers.transformer.transformer import TransformerLayer -from fast_llm.models.gpt.config import GPTBaseModelConfig -from fast_llm.models.gpt.model import GPTBaseModel -from fast_llm.utils import Assert -from tests.utils.utils import get_hybrid_config, materialize_meta_tensors, requires_cuda - -try: - from fast_llm.layers.ssm.discrete_mamba2 import DiscreteMamba2 - from fast_llm.layers.ssm.mamba_layer import MambaLayer - from fast_llm.models.ssm.model import HybridSSMBaseModel -except Exception: - MambaLayer, HybridSSMBaseModel, DiscreteMamba2 = ( - None, - None, - None, - ) - # Mamba not installed, skipping tests - - -run_hybrid_test = MambaLayer is not None and DiscreteMamba2 is not None and torch.cuda.is_available() - - -SEQUENCE_LENGTH = 200 -BATCH_SIZE = 4 -HIDDEN_SIZE = 256 -VOCAB_SIZE = 500 - - -@pytest.fixture -def distributed_config(): - return DistributedConfig( - tensor_parallel=1, - pipeline_parallel=1, - sequence_data_parallel=1, - local_world_size=1, - world_size=1, - ) - - -@pytest.fixture -def distributed(distributed_config): - return Distributed(config=distributed_config) - - -@requires_cuda -@pytest.mark.parametrize( - "config_dict", - ( - {"prediction_heads": 1}, - {"prediction_heads": 2, "tie_word_embeddings": False}, - {"prediction_heads": 5, "tie_word_embeddings": False}, - ), -) -def test_transformer_mtp(config_dict: dict[str, typing.Any]): - config = GPTBaseModelConfig.from_dict( - { - "transformer": { - "hidden_size": HIDDEN_SIZE, - "num_layers": 2, - }, - "vocab_size": VOCAB_SIZE, - }, - config_dict, - update_type=UpdateType.update, - ) - distributed_config = DistributedConfig.from_dict({}) - distributed = Distributed(distributed_config) - model = GPTBaseModel(config, distributed_config) - model.setup(distributed) - materialize_meta_tensors(model, model._tensor_space) - model.to("cuda") - - sequence_first = config.sequence_first or ( - config.cross_entropy_splits is not None and config.cross_entropy_splits > 1 - ) - target = torch.randint( - 0, - VOCAB_SIZE, - ( - (SEQUENCE_LENGTH + config.prediction_heads - 1, BATCH_SIZE) - if sequence_first - else (BATCH_SIZE, SEQUENCE_LENGTH + config.prediction_heads - 1) - ), - dtype=torch.int64, - device=distributed.device, - ) - input_ = torch.randint( - 0, - VOCAB_SIZE, - (SEQUENCE_LENGTH, BATCH_SIZE) if sequence_first else (BATCH_SIZE, SEQUENCE_LENGTH), - device=distributed.device, - ) - attention_mask = torch.ones((1, 1, 1, 1), device="cuda", dtype=torch.bool) - position_ids = torch.arange(SEQUENCE_LENGTH, device="cuda", dtype=torch.int64) - kwargs = { - "position_ids": position_ids, - TransformerKwargs.sequence_first: sequence_first, - TransformerKwargs.attention_mask: attention_mask, - TransformerKwargs.attention_mask_value: -100, - TransformerKwargs.grad_output: 1.0, - LanguageModelKwargs.labels: target, - } - if config.tie_word_embeddings: - 
kwargs[WORD_EMBEDDINGS_WEIGHT] = model.embedding.word_embeddings_weight - else: - kwargs[OUTPUT_WEIGHTS] = model.model_head.output_weights - losses = {LanguageModelLossNames.multi_token_prediction_loss(i): [] for i in range(model._config.prediction_heads)} - _ = model(input_, kwargs, losses=losses) - for loss_name, loss_values in losses.items(): - Assert.gt(len(loss_values), 0) - loss = sum( - [ - sum(losses[LanguageModelLossNames.multi_token_prediction_loss(i)]) - for i in range(model._config.prediction_heads) - ] - ) - loss.backward() - - -@pytest.mark.skip(reason="Too slow") -@requires_cuda -@pytest.mark.skipif(not run_hybrid_test, reason="No CUDA available or Mamba not installed") -@pytest.mark.parametrize( - ("hybrid_block_layout", "prediction_heads", "default_mtp_type"), - [ - ([SSMBlockType.mamba.value, SSMBlockType.transformer.value], 1, None), - ([SSMBlockType.transformer.value, SSMBlockType.mamba.value], 2, None), - ([SSMBlockType.mamba.value, SSMBlockType.transformer.value], 2, None), - ([SSMBlockType.transformer.value, SSMBlockType.mamba2_discrete.value], 3, None), - ([SSMBlockType.transformer.value, SSMBlockType.mamba2_discrete.value], 3, SSMBlockType.mamba.value), - ], -) -def test_hybrid_model_mtp(distributed_config, hybrid_block_layout, prediction_heads, default_mtp_type): - hybrid_config = get_hybrid_config( - hybrid_block_layout=hybrid_block_layout, prediction_heads=prediction_heads, default_mtp_type=default_mtp_type - ) - model = HybridSSMBaseModel(hybrid_config, distributed_config) - distributed = Distributed(distributed_config) - model.setup(distributed) - tensor_space = model._tensor_space - materialize_meta_tensors(model, tensor_space) - model.to("cuda") - - num_heads, num_mtp_blocks = 0, 0 - str_block_mapping = { - SSMBlockType.transformer: TransformerLayer, - SSMBlockType.mamba: MambaLayer, - SSMBlockType.mamba2_discrete: DiscreteMamba2, - } - mtp_block_type = default_mtp_type or hybrid_block_layout[-1] - for block in model.get_output_layers(): - if isinstance(block, LanguageModelHead): - num_heads += 1 - else: - block = getattr(block, "mixer", block) - Assert.custom( - lambda _: isinstance(block, str_block_mapping[mtp_block_type]), - f"Block {block} is not of type {str_block_mapping[mtp_block_type]}", - ) - num_mtp_blocks += 1 - Assert.eq(num_heads, prediction_heads) - Assert.eq(num_mtp_blocks, prediction_heads - 1) - - batch_size = 2 - seq_length = 32 - x = torch.randint(0, 49152, (batch_size, seq_length), device="cuda") - position_ids = torch.arange(seq_length, device="cuda", dtype=torch.int64) - attention_mask = torch.ones((1, 1, 1, 1), device="cuda", dtype=torch.bool) # will be broadcasted to right shape - labels = torch.randint(0, 49152, (batch_size, seq_length + model._config.prediction_heads - 1), device="cuda") - losses = {LanguageModelLossNames.multi_token_prediction_loss(i): [] for i in range(model._config.prediction_heads)} - kwargs = { - "position_ids": position_ids, - TransformerKwargs.sequence_first: False, - TransformerKwargs.attention_mask: attention_mask, - TransformerKwargs.attention_mask_value: -100, - TransformerKwargs.grad_output: True, - LanguageModelKwargs.labels: labels, - } - - if model._config.tie_word_embeddings: - kwargs[WORD_EMBEDDINGS_WEIGHT] = model.embedding.word_embeddings_weight - else: - kwargs[OUTPUT_WEIGHTS] = model.model_head.output_weights - - output = model( - x, - kwargs, - losses=losses, - ) - loss = sum( - [ - sum(losses[LanguageModelLossNames.multi_token_prediction_loss(i)]) - for i in 
range(model._config.prediction_heads)
- ]
- )
- loss.backward()
diff --git a/tests/test_ssms.py b/tests/test_ssms.py
index 9e7485447..52b51c8a5 100644
--- a/tests/test_ssms.py
+++ b/tests/test_ssms.py
@@ -1,75 +1,31 @@
import pathlib
-from functools import partial
import pytest
import torch
from fast_llm.config import NoAutoValidate
from fast_llm.engine.checkpoint.config import CheckpointLoadConfig
-from fast_llm.engine.config_utils.tensor_space import TensorSpace
from fast_llm.engine.distributed.config import DistributedConfig, PhaseType
-from fast_llm.engine.distributed.distributed import Distributed
from fast_llm.engine.schedule.config import ScheduleConfig
from fast_llm.engine.schedule.runner import ScheduleRunner
from fast_llm.engine.schedule.schedule import Schedule
-from fast_llm.layers.language_model.config import LanguageModelKwargs, LanguageModelLossNames
-from fast_llm.layers.ssm.config import SSMBlockType
-from fast_llm.layers.ssm.discrete_mamba2 import DiscreteMamba2
-from fast_llm.layers.ssm.llamba_block import LlambaBlock
-from fast_llm.layers.ssm.mamba_layer import MambaLayer
from fast_llm.layers.transformer.config import TransformerKwargs
-from fast_llm.models.gpt.config import GPTBatchConfig, LlamaGPTHuggingfaceCheckpointFormat
-from fast_llm.models.ssm.config import AprielSSMHHybridHuggingfaceCheckpointFormat, LLambaHuggingfaceCheckpointFormat
-from fast_llm.models.ssm.model import HybridSSMBaseModel, HybridSSMModel
-from tests.utils.utils import get_hybrid_config, materialize_meta_tensors
+from fast_llm.models.gpt.config import GPTBatchConfig
+from fast_llm.models.ssm.config import LLambaHuggingfaceCheckpointFormat
+from fast_llm.models.ssm.model import HybridSSMModel
try:
from cartesia_pytorch.Llamba.llamba import LlambaLMHeadModel as LMHeadModel
except ImportError:
LMHeadModel = None
-run_test = MambaLayer is not None and torch.cuda.is_available()
-
-
-@pytest.fixture
-def distributed_config():
- return DistributedConfig(
- tensor_parallel=1,
- pipeline_parallel=1,
- sequence_data_parallel=1,
- local_world_size=1,
- world_size=1,
- )
-
-
-@pytest.fixture
-def distributed(distributed_config):
- return Distributed(config=distributed_config)
-
-
-def get_hf_llamba_out(input_ids, path, format):
- if format == LLambaHuggingfaceCheckpointFormat:
- from cartesia_pytorch.Llamba.llamba import LlambaLMHeadModel as LMHeadModel
- elif format == LlamaGPTHuggingfaceCheckpointFormat:
- from transformers import LlamaForCausalLM as LMHeadModel
- else:
- raise ValueError(f"Invalid format: {format}")
-
- model = LMHeadModel.from_pretrained(path, strict=True).to("cuda")
- parameter_sum = sum(p.detach().cpu().numpy().sum() for p in model.parameters())
- print(f"Parameter sum: {parameter_sum}")
- output = model(input_ids)
- del model
- torch.cuda.empty_cache()
- return output, parameter_sum
-
@pytest.mark.slow
@pytest.mark.skipif(
- not run_test or LMHeadModel is None,
- reason=f"Skipping because one of the following: cartesia_pytorch.Llamba not installed or no CUDA available or Mamba not installed",
+ LMHeadModel is None,
+ reason=f"cartesia_pytorch.Llamba not installed",
)
-def test_load_from_llamba_checkpoint(distributed_config):
+def test_load_from_llamba_checkpoint():
"""
Test to check whether Fast-LLM and Huggingface checkpoint loading for Llamba-1B produce the same results.
""" @@ -81,8 +37,12 @@ def test_load_from_llamba_checkpoint(distributed_config): format = LLambaHuggingfaceCheckpointFormat x = torch.randint(0, vocab_size, (batch_size, seq_length), device="cuda") - hf_logits, parameter_sum_hf = get_hf_llamba_out(x, path, format) - hf_logits = hf_logits["logits"].cpu() + + hf_model = LMHeadModel.from_pretrained(path, strict=True).to("cuda") + parameter_sum_hf = sum(p.detach().sum().cpu().item() for p in hf_model.parameters()) + hf_logits = hf_model(x)["logits"].cpu() + del hf_model + torch.cuda.empty_cache() # Create checkpoint load config checkpoint_config = CheckpointLoadConfig(path=path, format=format, model_weights=True, optimizer_state=False) @@ -100,7 +60,7 @@ def test_load_from_llamba_checkpoint(distributed_config): schedule_config = ScheduleConfig() with NoAutoValidate(): batch_config = GPTBatchConfig(micro_batch_size=batch_size, sequence_length=seq_length) - batch_config.setup(distributed_config) + batch_config.setup(DistributedConfig.from_dict({})) batch_config.validate() schedule_runner = ScheduleRunner( config=schedule_config, @@ -122,221 +82,7 @@ def test_load_from_llamba_checkpoint(distributed_config): } input_data = [(x, common_kwargs)] - losses, success, metrics = schedule_runner.run_step( - iter([input_data]), schedule, iteration=0, return_metrics=True, preprocessed=True - ) + schedule_runner.run_step(iter([input_data]), schedule, iteration=0, return_metrics=True, preprocessed=True) logits = input_data[0][1]["logits"].cpu() assert torch.allclose(logits, hf_logits, atol=1e-2) - - -def get_hf_apriel_hybrid_out(input_ids, path, format): - from fast_llm.models.ssm.external.apriel_hybrid.modeling_ssm_hybrid_apriel import AprielSSMHybridForCausalLM - - model = AprielSSMHybridForCausalLM.from_pretrained(path, strict=True).to("cuda") - parameter_sum = sum(p.detach().cpu().numpy().sum() for p in model.parameters()) - print(f"Parameter sum: {parameter_sum}") - output = model(input_ids) - del model - torch.cuda.empty_cache() - return output, parameter_sum - - -@pytest.mark.slow -@pytest.mark.skipif( - not run_test - and not pathlib.Path("/mnt/checkpoints/ssm/apriel_ssm_instruct_hybrid_ssm2nd_init_mambainlama_debug").exists(), - reason=f"Skipping because no CUDA available or Mamba not installed", -) -def test_load_from_hybridssm_checkpoint(distributed_config): - """ - Test to check whether the of Fast-LLM and Huggingface checkpoint loading for Llamba-1B produce the same results. 
- """ - vocab_size = 131072 # from https://huggingface.co/cartesia-ai/Llamba-1B/blob/main/config.json - batch_size = 2 - seq_length = 32 - - path = pathlib.Path("/mnt/checkpoints/ssm/apriel_ssm_instruct_hybrid_ssm2nd_init_mambainlama_debug") - format = AprielSSMHHybridHuggingfaceCheckpointFormat - - x = torch.randint(0, vocab_size, (batch_size, seq_length), device="cuda") - hf_logits, parameter_sum_hf = get_hf_apriel_hybrid_out(x, path, format) - hf_logits = hf_logits["logits"].cpu() - - # Create checkpoint load config - checkpoint_config = CheckpointLoadConfig(path=path, format=format, model_weights=True, optimizer_state=False) - # Initialize model - model = HybridSSMModel.from_pretrained(checkpoint_config) - param_sum = 0 - for stage in model.stages: - for fsdp in stage.fsdps: - if hasattr(fsdp, "_weight_shard"): - param_sum += torch.sum(fsdp._weight_shard).item() - assert torch.abs(torch.tensor(param_sum) - parameter_sum_hf) < 1e-1 - - -@pytest.mark.extra_slow -@pytest.mark.skipif(not run_test, reason="No CUDA available or Mamba not installed") -@pytest.mark.parametrize( - "hybrid_block_layout,LAYER_CLS", - [ - ([SSMBlockType.mamba, SSMBlockType.transformer], MambaLayer), - ([SSMBlockType.mamba2_discrete, SSMBlockType.transformer], DiscreteMamba2), - ], - ids=["mamba", "discrete_mamba2"], -) -def test_mamba_layer(distributed_config, distributed, hybrid_block_layout, LAYER_CLS): - hybrid_config = get_hybrid_config(hybrid_block_layout=hybrid_block_layout) - tensor_space = TensorSpace(distributed_config=distributed_config) - hybrid_config.setup_tensor_space(tensor_space) - layer = LAYER_CLS(hybrid_config.ssm, layer_idx=0, tensor_space=tensor_space) - tensor_space.setup(distributed) - materialize_meta_tensors(layer, tensor_space) - layer.to(distributed.device) - - batch_size = 2 - seq_length = 32 - hidden_size = hybrid_config.transformer.hidden_size - x = torch.randn(batch_size, seq_length, hidden_size, device=distributed.device) - - # Run forward pass - output, _ = layer(x, {}) - - loss = output.sum() - loss.backward() - # Basic shape checkss - assert output.shape == x.shape - assert not torch.isnan(output).any() - assert not torch.isinf(output).any() - - -@pytest.mark.skipif(not run_test, reason="No CUDA available or Mamba not installed") -def test_mamba_block(distributed_config, distributed): - hybrid_config = get_hybrid_config(hybrid_block_layout=["m", "t"]) - tensor_space = TensorSpace(distributed_config=distributed_config) - tensor_space.setup(distributed) - hybrid_config.setup_tensor_space(tensor_space) - layer_idx = 0 - - mixer_cls = partial(MambaLayer, layer_idx=layer_idx) - block = LlambaBlock( - hybrid_config.transformer, - hybrid_config.ssm, - mixer_cls=mixer_cls, - tensor_space=tensor_space, - layer_index=layer_idx, - ) - - materialize_meta_tensors(block, tensor_space) - block.to("cuda") - - batch_size = 2 - seq_length = 32 - hidden_size = hybrid_config.transformer.hidden_size - x = torch.randn(batch_size, seq_length, hidden_size, device=distributed.device) - - hidden_states = block(x, {}) - loss = hidden_states.sum() - loss.backward() - - assert hidden_states.shape == x.shape - assert not torch.isnan(hidden_states).any() - assert not torch.isinf(hidden_states).any() - - -@pytest.mark.slow -@pytest.mark.skipif(not run_test, reason="No CUDA available or Mamba not installed") -@pytest.mark.parametrize( - ("hybrid_block_layout"), - [ - (["m", "t"]), - (["m2d", "t"]), - ], - ids=["mamba", "discrete_mamba2"], -) -def test_hybrid_model_train_with_fast_mode(distributed_config, 
hybrid_block_layout): - hybrid_config = get_hybrid_config(hybrid_block_layout=hybrid_block_layout) - model = HybridSSMBaseModel(hybrid_config, distributed_config) - distributed = Distributed(distributed_config) - model.setup(distributed) - tensor_space = model._tensor_space - materialize_meta_tensors(model, tensor_space) - model.to("cuda") - - batch_size = 2 - seq_length = 32 - x = torch.randint(0, 49152, (batch_size, seq_length), device="cuda") - position_ids = torch.arange(seq_length, device="cuda", dtype=torch.int64) - attention_mask = torch.ones((1, 1, 1, 1), device="cuda", dtype=torch.bool) # will be broadcasted to right shape - labels = torch.randint(0, 49152, (batch_size, seq_length), device="cuda") - losses = {LanguageModelLossNames.language_model_loss: []} - output = model( - x, - { - "position_ids": position_ids, - TransformerKwargs.sequence_first: False, - TransformerKwargs.attention_mask: attention_mask, - TransformerKwargs.attention_mask_value: -100, - TransformerKwargs.grad_output: True, - LanguageModelKwargs.labels: labels, - }, - losses=losses, - ) - loss = sum(losses[LanguageModelLossNames.language_model_loss]) - loss.backward() - - -# TODO: added this when inference enabled -# No inference for now -# @dataclass -# class InferenceParams: -# max_seqlen: int -# max_batch_size: int -# sequence_len_offset: int = 0 -# key_value_memory_dict: dict = None - -# def __post_init__(self): -# if self.key_value_memory_dict is None: -# self.key_value_memory_dict = {} - - -# @pytest.mark.skipif(not torch.cuda.is_available(), reason="No CUDA available") -# def test_hybrid_model_inference(distributed_config, hybrid_config): -# hybrid_config.ssm.use_fast_path = False -# model = HybridSSMBaseModel(hybrid_config, distributed_config) -# distributed = Distributed(distributed_config) -# model.setup(distributed) -# tensor_space = model._tensor_space -# materialize_meta_tensors(model, tensor_space) -# model.to("cuda") -# # print(model) - -# batch_size = 2 -# seq_length = 32 -# x = torch.randint(0, 49152, (batch_size, seq_length), device="cuda") -# position_ids = torch.arange(seq_length, device="cuda", dtype=torch.int64) -# attention_mask = torch.ones((1, 1, 1, 1), device="cuda", dtype=torch.bool) # will be broadcasted to right shape -# labels = torch.randint(0, 49152, (batch_size, seq_length), device="cuda") -# max_new_tokens = 10 - -# inference_params = InferenceParams( -# max_seqlen=len(x[0]) + max_new_tokens, max_batch_size=x.shape[0], sequence_len_offset=0 -# ) -# losses = {LanguageModelLossNames.language_model_loss: []} - -# output = model( -# x, -# { -# "position_ids": position_ids, -# TransformerKwargs.sequence_first: True, -# TransformerKwargs.attention_mask: attention_mask, -# TransformerKwargs.attention_mask_value: -100, -# TransformerKwargs.grad_output: True, -# LanguageModelKwargs.labels: labels, -# "inference_params": inference_params, -# }, -# losses=losses, -# ) - -if __name__ == "__main__": - pytest.main(["-s", __file__]) diff --git a/tests/utils/utils.py b/tests/utils/utils.py index f37c1cb27..11b7e4036 100644 --- a/tests/utils/utils.py +++ b/tests/utils/utils.py @@ -4,49 +4,40 @@ import pytest import torch -from fast_llm.layers.ssm.config import SSMConfig -from fast_llm.layers.transformer.config import TransformerConfig -from fast_llm.models.ssm.config import HybridSSMBaseModelConfig +from fast_llm.engine.base_model.base_model import BaseModel, Layer +from fast_llm.engine.config_utils.tensor_space import TensorSpace +from fast_llm.engine.distributed.distributed import Distributed 
+from fast_llm.engine.multi_stage.config import FastLLMModelConfig, StageConfig +from fast_llm.engine.multi_stage.stage import Stage TEST_RESULTS_PATH = pathlib.Path(os.environ.get("TEST_RESULTS_PATH", "/tmp/fast_llm_tests")).resolve() requires_cuda = pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available") -def materialize_meta_tensors(model, tensor_space): - # Materialize parameters that are on meta device - for name, param in model.named_parameters(): - if param.device.type == "meta": - # Check if the parameter is a custom tensor type - if hasattr(param, "tensor_name") and hasattr(param, "init_parameter"): - param_data = param.new_empty(param.shape, device="cuda") - # Initialize param_data - param.init_parameter(param_data, tensor_space.distributed) - # Replace the parameter in the module - module_path, param_name = name.rsplit(".", 1) if "." in name else (None, name) - module = model - if module_path is not None: - for part in module_path.split("."): - module = getattr(module, part) - param = torch.nn.Parameter(param_data, requires_grad=param.requires_grad) - # TODO: add param_grad_is_zero etc., grad_buffer, etc., see test_mlp_recomputation - param.grad = None - param.grad_buffer = torch.empty_like(param) - param.param_grad_is_zero = True - module._parameters[param_name] = param - return model +def get_base_model(config: FastLLMModelConfig): + # Create a base model (and distributed). + # Using a full model config so we have the model type and distributed config in the same argument. + distributed = Distributed(config.distributed) + tensor_space = TensorSpace(config.distributed) + config.base_model.setup_tensor_space(tensor_space) + tensor_space.setup(distributed) + base_model = config.get_model_class().base_model_class(config.base_model, config.distributed) + base_model.setup(distributed) + return base_model, distributed -def get_hybrid_config(hybrid_block_layout=["t", "m"], prediction_heads=1, default_mtp_type=None): - config = HybridSSMBaseModelConfig( - transformer=TransformerConfig(num_layers=len(hybrid_block_layout)), - ssm=SSMConfig(), - hybrid_block_layout=hybrid_block_layout, - prediction_heads=prediction_heads, - default_mtp_type=default_mtp_type, - init_method_std_embed=0.02, - init_method_min_embed=-0.02, - init_method_max_embed=0.02, - use_position_embeddings=True, - tie_word_embeddings=False, +def get_stage(base_model: BaseModel | list[Layer], distributed: Distributed): + # Create a fast-llm stage which allocates and initializes meta tensors correctly. 
+ stage = Stage( + config=StageConfig(), + base_model=base_model, + distributed_config=distributed.config, + begin=0, + end=1, + index=0, ) - return config + stage.setup(distributed=distributed) + stage.initialize_weights() + stage.restore_parameters() + stage.reset_gradients() + return stage From 58677d291f37d4625307b80b323e19264b53957f Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Fri, 13 Jun 2025 17:31:51 -0400 Subject: [PATCH 35/43] fix --- fast_llm/layers/ssm/discrete_mamba2.py | 4 ++-- fast_llm/layers/ssm/mamba_layer.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fast_llm/layers/ssm/discrete_mamba2.py b/fast_llm/layers/ssm/discrete_mamba2.py index ecf0b29d7..31e81e99b 100644 --- a/fast_llm/layers/ssm/discrete_mamba2.py +++ b/fast_llm/layers/ssm/discrete_mamba2.py @@ -17,7 +17,7 @@ from mamba_ssm.ops.triton.ssd_combined import mamba_chunk_scan_combined as _mamba_chunk_scan_combined # noqa _mamba_available = True -except ImportError: +except (ImportError, RuntimeError): _mamba_available = False @@ -25,7 +25,7 @@ from causal_conv1d import causal_conv1d_fn as _causal_conv1d_fn # noqa _causal_conv1d_available = True -except ImportError: +except (ImportError, RuntimeError): _causal_conv1d_available = False diff --git a/fast_llm/layers/ssm/mamba_layer.py b/fast_llm/layers/ssm/mamba_layer.py index 7fd437894..7c824d235 100644 --- a/fast_llm/layers/ssm/mamba_layer.py +++ b/fast_llm/layers/ssm/mamba_layer.py @@ -14,7 +14,7 @@ from mamba_ssm.ops.selective_scan_interface import mamba_inner_fn as _mamba_inner_fn # noqa _mamba_available = True -except ImportError: +except (ImportError, RuntimeError): _mamba_available = False """ From e125fa9ff06f9ae148af41e14cae1c58717c88a7 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Fri, 13 Jun 2025 17:37:01 -0400 Subject: [PATCH 36/43] move to directory --- tests/models/__init__.py | 0 tests/{ => models}/test_checkpoint.py | 0 .../{test_gpt_generate_and_forward.py => models/test_generate.py} | 0 tests/{ => models}/test_match_megatron.py | 0 tests/{ => models}/test_mb.py | 0 tests/{ => models}/test_mb_seq_first.py | 0 tests/{ => models}/test_ms.py | 0 tests/{ => models}/test_seq_first.py | 0 tests/{ => models}/test_simple.py | 0 9 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/models/__init__.py rename tests/{ => models}/test_checkpoint.py (100%) rename tests/{test_gpt_generate_and_forward.py => models/test_generate.py} (100%) rename tests/{ => models}/test_match_megatron.py (100%) rename tests/{ => models}/test_mb.py (100%) rename tests/{ => models}/test_mb_seq_first.py (100%) rename tests/{ => models}/test_ms.py (100%) rename tests/{ => models}/test_seq_first.py (100%) rename tests/{ => models}/test_simple.py (100%) diff --git a/tests/models/__init__.py b/tests/models/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_checkpoint.py b/tests/models/test_checkpoint.py similarity index 100% rename from tests/test_checkpoint.py rename to tests/models/test_checkpoint.py diff --git a/tests/test_gpt_generate_and_forward.py b/tests/models/test_generate.py similarity index 100% rename from tests/test_gpt_generate_and_forward.py rename to tests/models/test_generate.py diff --git a/tests/test_match_megatron.py b/tests/models/test_match_megatron.py similarity index 100% rename from tests/test_match_megatron.py rename to tests/models/test_match_megatron.py diff --git a/tests/test_mb.py b/tests/models/test_mb.py similarity index 100% rename from tests/test_mb.py rename to 
tests/models/test_mb.py diff --git a/tests/test_mb_seq_first.py b/tests/models/test_mb_seq_first.py similarity index 100% rename from tests/test_mb_seq_first.py rename to tests/models/test_mb_seq_first.py diff --git a/tests/test_ms.py b/tests/models/test_ms.py similarity index 100% rename from tests/test_ms.py rename to tests/models/test_ms.py diff --git a/tests/test_seq_first.py b/tests/models/test_seq_first.py similarity index 100% rename from tests/test_seq_first.py rename to tests/models/test_seq_first.py diff --git a/tests/test_simple.py b/tests/models/test_simple.py similarity index 100% rename from tests/test_simple.py rename to tests/models/test_simple.py From d164f25718878aae5c4724985513912356310f12 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Mon, 16 Jun 2025 17:05:07 -0400 Subject: [PATCH 37/43] fixes --- setup.cfg | 2 ++ tests/test_match_megatron.py | 1 + 2 files changed, 3 insertions(+) diff --git a/setup.cfg b/setup.cfg index fff7503ae..b3b1df036 100644 --- a/setup.cfg +++ b/setup.cfg @@ -57,6 +57,8 @@ DEV = pytest-xdist>=3.7.0 # Somehow needed for Megatron to work with base image 24.11 setuptools>=80.9.0 + # dependency manager needs it. + colorama>=0.4.6 # Required for building the documentation DOCS = diff --git a/tests/test_match_megatron.py b/tests/test_match_megatron.py index 9f8614648..5e7f3d37a 100644 --- a/tests/test_match_megatron.py +++ b/tests/test_match_megatron.py @@ -16,6 +16,7 @@ def test_megatron(run_test_script): ] +@pytest.mark.slow @pytest.mark.depends_on(on=["test_megatron"]) def test_match_megatron(run_test_script): if CONFIG_MEGATRON is None: From 917912789f923290b0d6f9b0dec03ae86daf662e Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Mon, 16 Jun 2025 17:38:08 -0400 Subject: [PATCH 38/43] fix --- tests/utils/model_configs.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/tests/utils/model_configs.py b/tests/utils/model_configs.py index 191358157..481ec6116 100644 --- a/tests/utils/model_configs.py +++ b/tests/utils/model_configs.py @@ -301,13 +301,21 @@ def _update_and_add_testing_config( ) _update_and_add_testing_config( - # Tests yarn-style rotary embeddings. + # Tests diffusion llama converter. "llama_yarn", "diffusion_llama", extra_args=[], # Megatron doesn't support Yarn-style Rotary Embeddings megatron_args=None, checkpoint_format=DiffusionLlamaGPTHuggingfaceCheckpointFormat, + # TODO: Add back generate as `normal` when stable. + groups={ + ModelTestingGroup.basic: ModelTestingGroupAction.unimportant, + ModelTestingGroup.convert: ModelTestingGroupAction.normal, + ModelTestingGroup.generate: ModelTestingGroupAction.broken, + ModelTestingGroup.megatron: ModelTestingGroupAction.not_implemented, + ModelTestingGroup.distributed: ModelTestingGroupAction.unimportant, + }, ) _update_and_add_testing_config( @@ -347,13 +355,21 @@ def _update_and_add_testing_config( ) _update_and_add_testing_config( - # Diffusion dream converter. + # Tests diffusion dream converter. "qwen2", "dream", extra_args=[], # Megatron doesn't support per sub layer biases. megatron_args=None, checkpoint_format=DiffusionDreamGPTHuggingfaceCheckpointFormat, + # TODO: Add back generate as `normal` when stable. 
+ groups={ + ModelTestingGroup.basic: ModelTestingGroupAction.unimportant, + ModelTestingGroup.convert: ModelTestingGroupAction.normal, + ModelTestingGroup.generate: ModelTestingGroupAction.broken, + ModelTestingGroup.megatron: ModelTestingGroupAction.not_implemented, + ModelTestingGroup.distributed: ModelTestingGroupAction.unimportant, + }, ) _update_and_add_testing_config( From d97e4c10c209da96339152b446f3f1f7b9305566 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Mon, 16 Jun 2025 18:21:14 -0400 Subject: [PATCH 39/43] fix --- tests/models/test_checkpoint.py | 6 +++--- tests/utils/model_configs.py | 19 +++++++++++++++++-- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/tests/models/test_checkpoint.py b/tests/models/test_checkpoint.py index 5132ba4f4..9cf60e91a 100644 --- a/tests/models/test_checkpoint.py +++ b/tests/models/test_checkpoint.py @@ -22,7 +22,7 @@ _WEIGHT_SHARD_SAVE_NAME = f"{ShardName.weights}_shard" -@pytest.mark.model_testing_group(ModelTestingGroup.basic) +@pytest.mark.model_testing_group(ModelTestingGroup.checkpoint) def test_checkpoint_and_eval(run_test_script_for_all_models, model_testing_config): # A baseline config (single-gpu, bf16, flash-attn). run_test_script_for_all_models( @@ -56,7 +56,7 @@ def _compare_resume_fn(test_path: pathlib.Path, compare_path: pathlib.Path): @pytest.mark.depends_on(on=["test_checkpoint_and_eval[{model_testing_config}]"]) -@pytest.mark.model_testing_group(ModelTestingGroup.basic) +@pytest.mark.model_testing_group(ModelTestingGroup.checkpoint) def test_resume(run_test_script_for_all_models): # Resume from iteration=1 and compare outputs with the baseline run. run_test_script_for_all_models( @@ -72,7 +72,7 @@ def test_resume(run_test_script_for_all_models): @pytest.mark.depends_on(on=["test_checkpoint_and_eval[{model_testing_config}]"]) -@pytest.mark.model_testing_group(ModelTestingGroup.basic) +@pytest.mark.model_testing_group(ModelTestingGroup.checkpoint) def test_resume_frozen(run_test_script_for_all_models): # Resume with frozen mlp. No comparison. run_test_script_for_all_models( diff --git a/tests/utils/model_configs.py b/tests/utils/model_configs.py index 481ec6116..3d654a0fb 100644 --- a/tests/utils/model_configs.py +++ b/tests/utils/model_configs.py @@ -26,10 +26,11 @@ class ModelTestingGroup(enum.StrEnum): basic = "basic" - megatron = "megatron" - distributed = "distributed" + checkpoint = "checkpoint" convert = "convert" generate = "generate" + megatron = "megatron" + distributed = "distributed" class ModelTestingGroupAction(enum.StrEnum): @@ -186,6 +187,7 @@ def _update_and_add_testing_config( checkpoint_format=None, groups={ ModelTestingGroup.basic: ModelTestingGroupAction.main, + ModelTestingGroup.checkpoint: ModelTestingGroupAction.main, ModelTestingGroup.convert: ModelTestingGroupAction.not_implemented, ModelTestingGroup.generate: ModelTestingGroupAction.not_implemented, ModelTestingGroup.megatron: ModelTestingGroupAction.normal, @@ -202,6 +204,7 @@ def _update_and_add_testing_config( checkpoint_format=None, groups={ ModelTestingGroup.basic: ModelTestingGroupAction.normal, + ModelTestingGroup.checkpoint: ModelTestingGroupAction.normal, ModelTestingGroup.convert: ModelTestingGroupAction.not_implemented, ModelTestingGroup.generate: ModelTestingGroupAction.not_implemented, ModelTestingGroup.megatron: ModelTestingGroupAction.unimportant, @@ -229,6 +232,7 @@ def _update_and_add_testing_config( # TODO: Add back generate as `normal` when stable. 
groups={ ModelTestingGroup.basic: ModelTestingGroupAction.normal, + ModelTestingGroup.checkpoint: ModelTestingGroupAction.normal, ModelTestingGroup.convert: ModelTestingGroupAction.normal, ModelTestingGroup.generate: ModelTestingGroupAction.broken, ModelTestingGroup.megatron: ModelTestingGroupAction.unimportant, @@ -259,6 +263,7 @@ def _update_and_add_testing_config( # TODO: Add back generate as `normal` when stable. groups={ ModelTestingGroup.basic: ModelTestingGroupAction.main, + ModelTestingGroup.checkpoint: ModelTestingGroupAction.main, ModelTestingGroup.convert: ModelTestingGroupAction.main, ModelTestingGroup.generate: ModelTestingGroupAction.broken, ModelTestingGroup.megatron: ModelTestingGroupAction.normal, @@ -276,6 +281,7 @@ def _update_and_add_testing_config( checkpoint_format=LlamaGPTHuggingfaceCheckpointFormat, groups={ ModelTestingGroup.basic: ModelTestingGroupAction.normal, + ModelTestingGroup.checkpoint: ModelTestingGroupAction.normal, ModelTestingGroup.convert: ModelTestingGroupAction.unimportant, ModelTestingGroup.generate: ModelTestingGroupAction.unimportant, ModelTestingGroup.megatron: ModelTestingGroupAction.not_implemented, @@ -293,6 +299,7 @@ def _update_and_add_testing_config( checkpoint_format=LlamaGPTHuggingfaceCheckpointFormat, groups={ ModelTestingGroup.basic: ModelTestingGroupAction.normal, + ModelTestingGroup.checkpoint: ModelTestingGroupAction.normal, ModelTestingGroup.convert: ModelTestingGroupAction.unimportant, ModelTestingGroup.generate: ModelTestingGroupAction.unimportant, ModelTestingGroup.megatron: ModelTestingGroupAction.not_implemented, @@ -311,6 +318,7 @@ def _update_and_add_testing_config( # TODO: Add back generate as `normal` when stable. groups={ ModelTestingGroup.basic: ModelTestingGroupAction.unimportant, + ModelTestingGroup.checkpoint: ModelTestingGroupAction.normal, ModelTestingGroup.convert: ModelTestingGroupAction.normal, ModelTestingGroup.generate: ModelTestingGroupAction.broken, ModelTestingGroup.megatron: ModelTestingGroupAction.not_implemented, @@ -329,6 +337,7 @@ def _update_and_add_testing_config( # TODO: Add back generate as `normal` when stable. groups={ ModelTestingGroup.basic: ModelTestingGroupAction.normal, + ModelTestingGroup.checkpoint: ModelTestingGroupAction.normal, ModelTestingGroup.convert: ModelTestingGroupAction.normal, ModelTestingGroup.generate: ModelTestingGroupAction.broken, ModelTestingGroup.megatron: ModelTestingGroupAction.not_implemented, @@ -347,6 +356,7 @@ def _update_and_add_testing_config( # TODO: Add back generate as `normal` when stable. groups={ ModelTestingGroup.basic: ModelTestingGroupAction.normal, + ModelTestingGroup.checkpoint: ModelTestingGroupAction.normal, ModelTestingGroup.convert: ModelTestingGroupAction.normal, ModelTestingGroup.generate: ModelTestingGroupAction.broken, ModelTestingGroup.megatron: ModelTestingGroupAction.not_implemented, @@ -365,6 +375,7 @@ def _update_and_add_testing_config( # TODO: Add back generate as `normal` when stable. groups={ ModelTestingGroup.basic: ModelTestingGroupAction.unimportant, + ModelTestingGroup.checkpoint: ModelTestingGroupAction.normal, ModelTestingGroup.convert: ModelTestingGroupAction.normal, ModelTestingGroup.generate: ModelTestingGroupAction.broken, ModelTestingGroup.megatron: ModelTestingGroupAction.not_implemented, @@ -383,6 +394,7 @@ def _update_and_add_testing_config( # TODO: Add back generate as `normal` when stable. 
groups={
ModelTestingGroup.basic: ModelTestingGroupAction.normal,
+ ModelTestingGroup.checkpoint: ModelTestingGroupAction.normal,
ModelTestingGroup.convert: ModelTestingGroupAction.normal,
ModelTestingGroup.generate: ModelTestingGroupAction.broken,
ModelTestingGroup.megatron: ModelTestingGroupAction.not_implemented,
@@ -406,6 +418,7 @@ def _update_and_add_testing_config(
# TODO: New base image broke mixtral
groups={
ModelTestingGroup.basic: ModelTestingGroupAction.broken,
+ ModelTestingGroup.checkpoint: ModelTestingGroupAction.broken,
ModelTestingGroup.convert: ModelTestingGroupAction.broken,
ModelTestingGroup.generate: ModelTestingGroupAction.broken,
ModelTestingGroup.megatron: ModelTestingGroupAction.broken,
@@ -430,6 +443,7 @@ def _update_and_add_testing_config(
# TODO: Add back generate as `normal` when stable.
groups={
ModelTestingGroup.basic: ModelTestingGroupAction.normal,
+ ModelTestingGroup.checkpoint: ModelTestingGroupAction.normal,
ModelTestingGroup.convert: ModelTestingGroupAction.broken, # TODO: Fix and bring back to `testing_groups`
ModelTestingGroup.generate: ModelTestingGroupAction.broken,
@@ -452,6 +466,7 @@ def _update_and_add_testing_config(
checkpoint_format=None,
groups={
ModelTestingGroup.basic: ModelTestingGroupAction.normal,
+ ModelTestingGroup.checkpoint: ModelTestingGroupAction.normal,
ModelTestingGroup.convert: ModelTestingGroupAction.not_implemented,
ModelTestingGroup.generate: ModelTestingGroupAction.not_implemented,
ModelTestingGroup.megatron: ModelTestingGroupAction.not_implemented,
From c95e8ebee8f6afc61450cda6e9644fc76ad5772f Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier
Date: Tue, 17 Jun 2025 16:12:23 -0400
Subject: [PATCH 40/43] Fix dropless mlp
---
fast_llm/functional/config.py | 2 +-
fast_llm/functional/triton/sparse_copy.py | 5 +
fast_llm/functional/triton/sparse_linear.py | 27 +--
setup.cfg | 4 +-
tests/conftest.py | 3 +
tests/functional/__init__.py | 0
tests/{ => functional}/test_functional.py | 4 +-
tests/functional/test_sparse_matmul.py | 154 ++++++++++++++++++
tests/{ => functional}/test_triton_kernels.py | 0
tests/utils/model_configs.py | 12 +-
10 files changed, 190 insertions(+), 21 deletions(-)
create mode 100644 tests/functional/__init__.py
rename tests/{ => functional}/test_functional.py (98%)
create mode 100644 tests/functional/test_sparse_matmul.py
rename tests/{ => functional}/test_triton_kernels.py (100%)
diff --git a/fast_llm/functional/config.py b/fast_llm/functional/config.py
index 22f23174b..0b7b14ab1 100644
--- a/fast_llm/functional/config.py
+++ b/fast_llm/functional/config.py
@@ -15,7 +15,7 @@ class TritonConfig:
MAX_BLOCK_SIZE_BYTES = 65536
-class MLPRecomputeLevel(str, enum.Enum):
+class MLPRecomputeLevel(enum.StrEnum):
none = "none"
activation = "activation"
activation_and_input = "activation_and_input"
diff --git a/fast_llm/functional/triton/sparse_copy.py b/fast_llm/functional/triton/sparse_copy.py
index 258a2578b..7c803689c 100644
--- a/fast_llm/functional/triton/sparse_copy.py
+++ b/fast_llm/functional/triton/sparse_copy.py
@@ -11,10 +11,15 @@
@dataclasses.dataclass()
class SparseMap:
sparse_rows: torch.Tensor
+ # The end row for each expert, including padding. `expert_ends[i] = expert_begins[i] + padded_tokens_per_expert[i]`
expert_ends: torch.Tensor
+ # The end row for each expert, excluding padding. `expert_pad_begins[i] = expert_begins[i] + unpadded_tokens_per_expert[i]`
expert_pad_begins: torch.Tensor
+ # The number of rows in the dense tensor, i.e., the number of tokens.
num_rows_dense: int + # The number of sparse rows, including padding. `num_rows = expert_ends[-1]` num_rows: int + # The number of sparse rows, excluding padding. `num_rows_unpadded = num_rows_dense * num_experts_per_token` num_rows_unpadded: int num_experts: int num_experts_per_token: int diff --git a/fast_llm/functional/triton/sparse_linear.py b/fast_llm/functional/triton/sparse_linear.py index 9a0864944..ae46655ea 100644 --- a/fast_llm/functional/triton/sparse_linear.py +++ b/fast_llm/functional/triton/sparse_linear.py @@ -1,10 +1,12 @@ +import os + import torch from fast_llm.functional.triton import TritonConfig, tl, tl_constexpr, triton, triton_autotune, triton_jit from fast_llm.functional.triton.sparse_copy import SparseMap from fast_llm.utils import Assert, div -autotune_configs = [ +autotune_configs = ( TritonConfig( {"block_size_row": 128, "block_size_col": 256, "block_size_inner": 64, "group_size_row": 8}, num_stages=3, @@ -45,7 +47,10 @@ num_stages=5, num_warps=2, ), -] +) + +if os.environ.get("FAST_LLM_SKIP_TRITON_AUTOTUNE"): + autotune_configs = (autotune_configs[2],) @triton_autotune( @@ -255,13 +260,13 @@ def output_sparse_matmul_kernel( def output_sparse_matmul( lhs: torch.Tensor, rhs: torch.Tensor, - sparse_map: SparseMap | None, + sparse_map: SparseMap | None = None, out: torch.Tensor | None = None, accumulate: bool = False, ) -> torch.Tensor: """ - Output-sparse matrix multiplication with a sparse column dimension, - i.e., with a mapping row_index -> sparse_index (obtained from expert_ends). + Output-sparse matrix multiplication with a sparse column dimension + and a mapping row_index -> sparse_index (obtained from expert_ends). Ex.: MLP layer 1 forward (Y = X x W1^T), MLP layer 2 input grad (gY = gZ x W2). Formula: out[i, js] = sum_k(lhs[i, k] * rhs[k, jd]), where jd = js + col_sparse_dim * sparse_index[i] sparse_index[i] = sum(expert_ends <= i) @@ -381,13 +386,13 @@ def input_inner_sparse_matmul_kernel( def input_inner_sparse_matmul( lhs: torch.Tensor, rhs: torch.Tensor, - sparse_map: SparseMap | None, + sparse_map: SparseMap | None = None, out: torch.Tensor | None = None, accumulate: bool = False, ) -> torch.Tensor: """ - Left-input-sparse matrix multiplication with a sparse inner dimension, - i.e., with a mapping row_index -> sparse_index (obtained from expert_ends). + Left-input-sparse matrix multiplication with a sparse inner dimension + and a mapping row_index -> sparse_index (obtained from expert_ends). Ex.: MLP layer 2 forward (Z = Y x W2^T), MLP layer 1 input grad (gX = gY x W1). Formula: out[i, j] = sum_ks(lhs[i, ks] * rhs[kd, j]), where kd = ks + inner_sparse_dim * sparse_index[i] sparse_index[i] = sum(expert_ends <= i) @@ -511,13 +516,13 @@ def input_row_sparse_matmul_kernel( def input_row_sparse_matmul( lhs: torch.Tensor, rhs: torch.Tensor, - sparse_map: SparseMap | None, + sparse_map: SparseMap | None = None, out: torch.Tensor | None = None, accumulate: bool = False, ) -> torch.Tensor: """ - Left-input-sparse matrix multiplication with a sparse row dimension, - i.e., with a mapping inner_index -> sparse_index. + Left-input-sparse matrix multiplication with a sparse row dimension + and a mapping inner_index -> sparse_index. Ex.: MLP layer 1 weight grad (gW1 = gY^T x X), MLP layer 2 weight grad (gW2^T = Y^T x gZ). 
Formula: out[id, j] = sum_ks(lhs[is, ks] * rhs[ks, j]), where sparse_begin[sparse_index[id]] <= ks < sparse_end[sparse_index[id]], diff --git a/setup.cfg b/setup.cfg index b3b1df036..3b79a1d03 100644 --- a/setup.cfg +++ b/setup.cfg @@ -24,6 +24,8 @@ CORE = safetensors>=0.5.3 # Update the base image (version fixed to ensure there is a wheel for the base image), may need --no-build-isolation flash-attn==2.7.3 + # Dropless MLP is broken with triton 3.3.0 and 3.3.1, probably because of a bug in triton. TODO: Fix + triton==3.2.0 # Small packages required for some optional features and tools. @@ -57,7 +59,7 @@ DEV = pytest-xdist>=3.7.0 # Somehow needed for Megatron to work with base image 24.11 setuptools>=80.9.0 - # dependency manager needs it. + # Dependency manager needs colorama to show colors. colorama>=0.4.6 # Required for building the documentation diff --git a/tests/conftest.py b/tests/conftest.py index 0d25fc5aa..11757176e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -113,6 +113,9 @@ def pytest_configure(config): rendezvous_port=TORCHRUN_DEFAULT_PORT + 2 * worker_id + 1, ) + # Skip slow autotune for tests. The default config has the highest block size, so this shouldn't hide any bug. + os.environ["FAST_LLM_SKIP_TRITON_AUTOTUNE"] = "TRUE" + @pytest.hookimpl(trylast=True) def pytest_collection_modifyitems(config, items: list[pytest.Function]): diff --git a/tests/functional/__init__.py b/tests/functional/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/test_functional.py b/tests/functional/test_functional.py similarity index 98% rename from tests/test_functional.py rename to tests/functional/test_functional.py index 9211259c2..3ddd5d4fe 100644 --- a/tests/test_functional.py +++ b/tests/functional/test_functional.py @@ -224,8 +224,6 @@ def test_mlp_recomputation(gated, activation_type): @pytest.mark.slow @requires_cuda def test_dropless_mlp(): - # TODO: Fix dropless MOE - pytest.fail("Test fails, aborting to avoid breaking cuda", False) num_experts = 4 experts_per_token = 4 tokens = 256 @@ -273,7 +271,7 @@ def test_dropless_mlp(): sparse_map = get_sparse_map(top_experts, num_experts) for i, recompute_level in enumerate(MLPRecomputeLevel): - print(recompute_level.value) # noqa + print("recompute_level", recompute_level) # noqa input_.grad = None scores.grad = None for param in params: diff --git a/tests/functional/test_sparse_matmul.py b/tests/functional/test_sparse_matmul.py new file mode 100644 index 000000000..899dad967 --- /dev/null +++ b/tests/functional/test_sparse_matmul.py @@ -0,0 +1,154 @@ +import dataclasses +import functools + +import pytest +import torch + +from fast_llm.functional.triton.sparse_copy import SparseMap +from fast_llm.functional.triton.sparse_linear import ( + dense_matmul, + input_inner_sparse_matmul, + input_row_sparse_matmul, + output_sparse_matmul, +) +from fast_llm.utils import Assert +from tests.utils.utils import requires_cuda + + +@dataclasses.dataclass +class _SparseTestData: + dense_dim: int + sparse_dim: int + expert_ends: tuple[int, ...] + tokens_per_expert: tuple[int, ...] 
+ std: float = 0.125 + + @functools.cached_property + def expert_begins(self) -> tuple[int, ...]: + return (0,) + self.expert_ends[:-1] + + @functools.cached_property + def expert_pad_begins(self) -> tuple[int, ...]: + return tuple( + expert_begin + expert_tokens + for expert_begin, expert_tokens in zip(self.expert_begins, self.tokens_per_expert, strict=True) + ) + + @functools.cached_property + def token_dim(self) -> int: + return self.expert_ends[-1] + + @property + def sparse_dim_expanded(self) -> int: + return self.sparse_dim * self.num_experts + + @functools.cached_property + def num_experts(self) -> int: + return len(self.expert_begins) + + @functools.cached_property + def sparse_map(self) -> SparseMap: + return SparseMap( + num_experts=self.num_experts, + expert_ends=torch.tensor(self.expert_ends, device="cuda"), + expert_pad_begins=torch.tensor(self.expert_pad_begins, device="cuda"), + num_rows=self.expert_ends[-1], + # Not needed + sparse_rows=None, + num_rows_dense=None, + num_rows_unpadded=None, + num_experts_per_token=None, + ) + + def normal(self, dim_0: int, dim_1: int) -> torch.Tensor: + return torch.normal(0, self.std, (dim_0, dim_1), device="cuda") + + +_SPARSE_TEST_DATAS = ( + _SparseTestData( + dense_dim=384, + sparse_dim=256, + expert_ends=(128, 384, 512), + tokens_per_expert=(78, 256, 54), + ), + _SparseTestData( + dense_dim=256, + sparse_dim=512, + expert_ends=(128, 256, 256, 384), + tokens_per_expert=(52, 125, 0, 97), + ), +) + + +@requires_cuda +@pytest.mark.slow +@pytest.mark.parametrize("sparse_test_data", _SPARSE_TEST_DATAS) +def test_dense_matmul(sparse_test_data): + lhs = sparse_test_data.normal(sparse_test_data.token_dim, sparse_test_data.dense_dim) + rhs = sparse_test_data.normal(sparse_test_data.dense_dim, sparse_test_data.sparse_dim) + + output = dense_matmul(lhs, rhs) + output_ref = torch.matmul(lhs, rhs) + Assert.rms_close(output, output_ref, 1e-3) + + +@requires_cuda +@pytest.mark.slow +@pytest.mark.parametrize("sparse_test_data", _SPARSE_TEST_DATAS) +def test_output_sparse_matmul(sparse_test_data): + lhs = sparse_test_data.normal(sparse_test_data.token_dim, sparse_test_data.dense_dim) + rhs = sparse_test_data.normal(sparse_test_data.dense_dim, sparse_test_data.sparse_dim_expanded) + + # Randomly initialize the output to ensure padded values have no effect. + out = sparse_test_data.normal(sparse_test_data.token_dim, sparse_test_data.sparse_dim) + output = output_sparse_matmul(lhs, rhs, sparse_test_data.sparse_map, out) + + output_ref = torch.zeros_like(output) + for i in range(sparse_test_data.num_experts): + # Padded tokens are treated like regular ones. + output_ref[sparse_test_data.expert_begins[i] : sparse_test_data.expert_ends[i]] = torch.matmul( + lhs[sparse_test_data.expert_begins[i] : sparse_test_data.expert_ends[i]], + rhs[:, i * sparse_test_data.sparse_dim : (i + 1) * sparse_test_data.sparse_dim], + ) + + Assert.rms_close(output, output_ref, 1e-3) + + +@requires_cuda +@pytest.mark.slow +@pytest.mark.parametrize("sparse_test_data", _SPARSE_TEST_DATAS) +def test_input_inner_sparse_matmul(sparse_test_data): + lhs = sparse_test_data.normal(sparse_test_data.token_dim, sparse_test_data.sparse_dim) + rhs = sparse_test_data.normal(sparse_test_data.sparse_dim_expanded, sparse_test_data.dense_dim) + + output = input_inner_sparse_matmul(lhs, rhs, sparse_test_data.sparse_map) + + output_ref = torch.zeros_like(output) + for i in range(sparse_test_data.num_experts): + # Padded tokens are treated like regular ones. 
+ output_ref[sparse_test_data.expert_begins[i] : sparse_test_data.expert_ends[i]] = torch.matmul( + lhs[sparse_test_data.expert_begins[i] : sparse_test_data.expert_ends[i]], + rhs[i * sparse_test_data.sparse_dim : (i + 1) * sparse_test_data.sparse_dim], + ) + + Assert.rms_close(output, output_ref, 1e-3) + + +@requires_cuda +@pytest.mark.slow +@pytest.mark.parametrize("sparse_test_data", _SPARSE_TEST_DATAS) +def test_input_row_sparse_matmul(sparse_test_data): + lhs = sparse_test_data.normal(sparse_test_data.sparse_dim, sparse_test_data.token_dim) + rhs = sparse_test_data.normal(sparse_test_data.token_dim, sparse_test_data.dense_dim) + + output = input_row_sparse_matmul(lhs, rhs, sparse_test_data.sparse_map) + + output_ref = torch.zeros_like(output) + for i in range(sparse_test_data.num_experts): + # Padded tokens are excluded from the sum. + output_ref[i * sparse_test_data.sparse_dim : (i + 1) * sparse_test_data.sparse_dim] = torch.matmul( + lhs[:, sparse_test_data.expert_begins[i] : sparse_test_data.expert_pad_begins[i]], + rhs[sparse_test_data.expert_begins[i] : sparse_test_data.expert_pad_begins[i]], + ) + + Assert.rms_close(output, output_ref, 1e-3) diff --git a/tests/test_triton_kernels.py b/tests/functional/test_triton_kernels.py similarity index 100% rename from tests/test_triton_kernels.py rename to tests/functional/test_triton_kernels.py diff --git a/tests/utils/model_configs.py b/tests/utils/model_configs.py index 3d654a0fb..4c2254227 100644 --- a/tests/utils/model_configs.py +++ b/tests/utils/model_configs.py @@ -315,11 +315,12 @@ def _update_and_add_testing_config( # Megatron doesn't support Yarn-style Rotary Embeddings megatron_args=None, checkpoint_format=DiffusionLlamaGPTHuggingfaceCheckpointFormat, + # TODO: Conversion is broken. # TODO: Add back generate as `normal` when stable. groups={ ModelTestingGroup.basic: ModelTestingGroupAction.unimportant, ModelTestingGroup.checkpoint: ModelTestingGroupAction.normal, - ModelTestingGroup.convert: ModelTestingGroupAction.normal, + ModelTestingGroup.convert: ModelTestingGroupAction.broken, ModelTestingGroup.generate: ModelTestingGroupAction.broken, ModelTestingGroup.megatron: ModelTestingGroupAction.not_implemented, ModelTestingGroup.distributed: ModelTestingGroupAction.unimportant, @@ -372,11 +373,12 @@ def _update_and_add_testing_config( # Megatron doesn't support per sub layer biases. megatron_args=None, checkpoint_format=DiffusionDreamGPTHuggingfaceCheckpointFormat, + # TODO: Conversion is broken. # TODO: Add back generate as `normal` when stable. 
groups={ ModelTestingGroup.basic: ModelTestingGroupAction.unimportant, ModelTestingGroup.checkpoint: ModelTestingGroupAction.normal, - ModelTestingGroup.convert: ModelTestingGroupAction.normal, + ModelTestingGroup.convert: ModelTestingGroupAction.broken, ModelTestingGroup.generate: ModelTestingGroupAction.broken, ModelTestingGroup.megatron: ModelTestingGroupAction.not_implemented, ModelTestingGroup.distributed: ModelTestingGroupAction.unimportant, @@ -489,13 +491,13 @@ def testing_group_enabled(item: pytest.Function, skip_slow: bool, skip_extra_slo for group in groups: action = model_config.groups[group] if action == ModelTestingGroupAction.main: - return True + pass elif action == ModelTestingGroupAction.normal and not skip_slow: - return True + pass elif ( action in (ModelTestingGroupAction.broken, ModelTestingGroupAction.unimportant) and not skip_extra_slow ): - return True + pass elif show_skipped: item.add_marker( pytest.mark.skip(reason=f"Skipping testing group {group} for model {model_testing_config}.") From 468ed7eb04446fdbd7ab3beb79a1f75e321a1b01 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Tue, 17 Jun 2025 16:42:46 -0400 Subject: [PATCH 41/43] fix --- setup.cfg | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.cfg b/setup.cfg index 3b79a1d03..b1e44e814 100644 --- a/setup.cfg +++ b/setup.cfg @@ -24,8 +24,8 @@ CORE = safetensors>=0.5.3 # Update the base image (version fixed to ensure there is a wheel for the base image), may need --no-build-isolation flash-attn==2.7.3 - # Dropless MLP is broken with triton 3.3.0 and 3.3.1, probably because of a bug in triton. TODO: Fix - triton==3.2.0 + # Dropless MLP is broken with triton 3.2.0, 3.3.0 and 3.3.1. TODO: Remove once a working triton version is released. + triton==3.1.0 # Small packages required for some optional features and tools. From eb734bd5b880ee4e383fa2a9a88f6f262201f028 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Tue, 17 Jun 2025 16:55:48 -0400 Subject: [PATCH 42/43] fix --- tests/utils/model_configs.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/utils/model_configs.py b/tests/utils/model_configs.py index 3d654a0fb..4c2254227 100644 --- a/tests/utils/model_configs.py +++ b/tests/utils/model_configs.py @@ -315,11 +315,12 @@ def _update_and_add_testing_config( # Megatron doesn't support Yarn-style Rotary Embeddings megatron_args=None, checkpoint_format=DiffusionLlamaGPTHuggingfaceCheckpointFormat, + # TODO: Conversion is broken. # TODO: Add back generate as `normal` when stable. groups={ ModelTestingGroup.basic: ModelTestingGroupAction.unimportant, ModelTestingGroup.checkpoint: ModelTestingGroupAction.normal, - ModelTestingGroup.convert: ModelTestingGroupAction.normal, + ModelTestingGroup.convert: ModelTestingGroupAction.broken, ModelTestingGroup.generate: ModelTestingGroupAction.broken, ModelTestingGroup.megatron: ModelTestingGroupAction.not_implemented, ModelTestingGroup.distributed: ModelTestingGroupAction.unimportant, @@ -372,11 +373,12 @@ def _update_and_add_testing_config( # Megatron doesn't support per sub layer biases. megatron_args=None, checkpoint_format=DiffusionDreamGPTHuggingfaceCheckpointFormat, + # TODO: Conversion is broken. # TODO: Add back generate as `normal` when stable. 
groups={ ModelTestingGroup.basic: ModelTestingGroupAction.unimportant, ModelTestingGroup.checkpoint: ModelTestingGroupAction.normal, - ModelTestingGroup.convert: ModelTestingGroupAction.normal, + ModelTestingGroup.convert: ModelTestingGroupAction.broken, ModelTestingGroup.generate: ModelTestingGroupAction.broken, ModelTestingGroup.megatron: ModelTestingGroupAction.not_implemented, ModelTestingGroup.distributed: ModelTestingGroupAction.unimportant, @@ -489,13 +491,13 @@ def testing_group_enabled(item: pytest.Function, skip_slow: bool, skip_extra_slo for group in groups: action = model_config.groups[group] if action == ModelTestingGroupAction.main: - return True + pass elif action == ModelTestingGroupAction.normal and not skip_slow: - return True + pass elif ( action in (ModelTestingGroupAction.broken, ModelTestingGroupAction.unimportant) and not skip_extra_slow ): - return True + pass elif show_skipped: item.add_marker( pytest.mark.skip(reason=f"Skipping testing group {group} for model {model_testing_config}.") From c338d444e403b0147d182b480846d0db5060fd59 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Thu, 19 Jun 2025 16:36:41 -0400 Subject: [PATCH 43/43] fixes --- fast_llm/models/ssm/model.py | 5 ++--- tests/models/test_checkpoint.py | 13 +++++++++---- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/fast_llm/models/ssm/model.py b/fast_llm/models/ssm/model.py index 526d66c01..d6a2f7e1a 100644 --- a/fast_llm/models/ssm/model.py +++ b/fast_llm/models/ssm/model.py @@ -3,14 +3,13 @@ from fast_llm.engine.base_model.base_model import Layer from fast_llm.engine.distributed.config import DistributedConfig -from fast_llm.engine.multi_stage.fast_llm_model import FastLLMModel from fast_llm.layers.language_model.embedding import LanguageModelEmbedding from fast_llm.layers.language_model.head import LanguageModelHead from fast_llm.layers.ssm.discrete_mamba2 import DiscreteMamba2 from fast_llm.layers.ssm.llamba_block import LlambaBlock from fast_llm.layers.ssm.mamba_layer import MambaLayer from fast_llm.layers.transformer.transformer import TransformerLayer -from fast_llm.models.gpt.model import GPTBaseModel +from fast_llm.models.gpt.model import GPTBaseModel, GPTModel from fast_llm.models.ssm.config import HybridSSMBaseModelConfig, HybridSSMModelConfig, SSMBlockType logger = logging.getLogger(__name__) @@ -135,7 +134,7 @@ def get_layers(self) -> list[Layer]: return layers -class HybridSSMModel[ConfigType: HybridSSMModelConfig](FastLLMModel[ConfigType]): +class HybridSSMModel[ConfigType: HybridSSMModelConfig](GPTModel[ConfigType]): """ A hybrid model that combines Transformer and SSM blocks. 
""" diff --git a/tests/models/test_checkpoint.py b/tests/models/test_checkpoint.py index 39fd0840e..aff7d991f 100644 --- a/tests/models/test_checkpoint.py +++ b/tests/models/test_checkpoint.py @@ -30,7 +30,7 @@ def test_checkpoint_and_eval(run_test_script_for_all_models, model_testing_confi + [ "training.checkpoint.interval=1", "training.evaluators.validation.interval=2", - "training.evaluators.validation.evaluators.iterations=1", + "training.evaluators.validation.evaluator.iterations=1", ], ) @@ -63,7 +63,7 @@ def test_resume(run_test_script_for_all_models): [ "training.checkpoint.interval=1", "training.evaluators.validation.interval=2", - "training.evaluators.validation.evaluators.iterations=1", + "training.evaluators.validation.evaluator.iterations=1", ], compare=f"test_checkpoint_and_eval", prepare_fn=_prepare_resume_fn, @@ -79,7 +79,7 @@ def test_resume_frozen(run_test_script_for_all_models): [ "training.checkpoint.interval=1", "training.evaluators.validation.interval=2", - "training.evaluators.validation.evaluators.iterations=1", + "training.evaluators.validation.evaluator.iterations=1", "model.base_model.transformer.mlp_lr_scale=0.", ], compare="test_checkpoint_and_eval", @@ -442,7 +442,12 @@ def test_run_converted_model(model_testing_config, convert_paths): ) errors = [] compare = CompareConfig() - model_as_hf = transformers.AutoModel.from_pretrained( + auto_model = ( + transformers.AutoModel + if model_testing_config.name in ("diffusion_llama", "dream") + else transformers.AutoModelForCausalLM + ) + model_as_hf = auto_model.from_pretrained( convert_paths["huggingface_0"], trust_remote_code=model_testing_config.checkpoint_format.trust_remote_code ).cuda() for name, model in zip(