From 6084122ad14af4190588eb0c4bc263b0a11f0d08 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Fri, 3 Oct 2025 23:51:53 -0400 Subject: [PATCH 1/3] language_model --- fast_llm/layers/block/block.py | 1 + fast_llm/layers/language_model/config.py | 33 ++++++++------- .../layers/language_model/language_model.py | 40 ++++++++++++------- fast_llm/models/gpt/config.py | 6 +++ fast_llm/models/gpt/conversion/llama.py | 16 ++++---- fast_llm/models/gpt/model.py | 8 ++-- fast_llm/utils.py | 4 +- tests/layers/test_lm_head.py | 2 +- 8 files changed, 64 insertions(+), 46 deletions(-) diff --git a/fast_llm/layers/block/block.py b/fast_llm/layers/block/block.py index ab6cb22b..67ce5eea 100644 --- a/fast_llm/layers/block/block.py +++ b/fast_llm/layers/block/block.py @@ -103,6 +103,7 @@ def __init__( config: ConfigType, distributed_config: DistributedConfig, *, + # TODO: Review. Use `input_dim(s)` and `output_dim(s)` instead? hidden_dim: TensorDim, lr_scale: float | None, peft: PeftConfig | None, diff --git a/fast_llm/layers/language_model/config.py b/fast_llm/layers/language_model/config.py index d2fbc490..25fa2d91 100644 --- a/fast_llm/layers/language_model/config.py +++ b/fast_llm/layers/language_model/config.py @@ -2,7 +2,6 @@ import typing from fast_llm.config import Field, FieldHint, check_field, config_class, skip_valid_if_none -from fast_llm.engine.base_model.config import ModuleConfig from fast_llm.engine.config_utils.parameter import OptionalParameterConfig, ParameterConfig, combine_lr_scales from fast_llm.engine.config_utils.tensor_dim import TensorDim from fast_llm.engine.distributed.config import DistributedConfig @@ -16,6 +15,7 @@ if typing.TYPE_CHECKING: from fast_llm.layers.language_model.embedding import LanguageModelEmbedding from fast_llm.layers.language_model.head import LanguageModelHead, LanguageModelHeadBase + from fast_llm.layers.language_model.language_model import LanguageModel from fast_llm.layers.language_model.multi_token_prediction import MultiTokenPrediction @@ -41,12 +41,6 @@ class LanguageModelEmbeddingsConfig(BlockConfig): desc="Configuration for the word embedding (weight).", hint=FieldHint.architecture, ) - hidden_size: int = Field( - default=1024, - desc="Size of the model's main hidden dimension, e.g., for its input and output layers.", - hint=FieldHint.architecture, - valid=check_field(Assert.gt, 0), - ) vocab_size: int = Field( default=49152, desc="Size of the vocabulary, i.e., number of vocabulary embeddings and logits.", @@ -295,24 +289,29 @@ def max_prediction_distance(self) -> int: @config_class() -class LanguageModelConfig(ModuleConfig): - # TODO: block +class LanguageModelConfig(BlockConfig): decoder: BlockSequenceConfig = Field( desc="Configuration for the language model decoder.", hint=FieldHint.architecture, ) - embeddings: LanguageModelEmbeddingsConfig = Field() - head: LanguageModelHeadBaseConfig = Field() - # TODO: Allow overriding in sub-models? - peft: PeftConfig = Field( - desc="Configuration for parameter-efficient fine tuning.", + embeddings: LanguageModelEmbeddingsConfig = Field( hint=FieldHint.architecture, + desc="Configuration for the language model embeddings.", + ) + head: LanguageModelHeadBaseConfig = Field( + hint=FieldHint.architecture, desc="Configuration for the language model head(s)." 
) tied_embedding_weight: bool = Field( default=False, desc="Tie the output weights (logits) with the vocabulary embedding.", hint=FieldHint.architecture, ) + hidden_size: int = Field( + default=1024, + desc="Size of the model's main hidden dimension, e.g., for its input and output layers.", + hint=FieldHint.architecture, + valid=check_field(Assert.gt, 0), + ) sequence_first: bool | None = Field( default=None, desc="Override the default dimension ordering", @@ -321,3 +320,9 @@ class LanguageModelConfig(ModuleConfig): " Setting this parameter overrides the default choice. Note that setting to `False` will either do nothing or raise an error.", hint=FieldHint.testing, ) + + @property + def layer_class(self) -> "type[LanguageModel]": + from fast_llm.layers.language_model.language_model import LanguageModel + + return LanguageModel diff --git a/fast_llm/layers/language_model/language_model.py b/fast_llm/layers/language_model/language_model.py index 9a3bef19..56d41dc3 100644 --- a/fast_llm/layers/language_model/language_model.py +++ b/fast_llm/layers/language_model/language_model.py @@ -1,52 +1,64 @@ import logging import typing -from fast_llm.config import Configurable -from fast_llm.engine.base_model.base_model import Layer, LayerBase +import torch + +from fast_llm.engine.base_model.base_model import Layer from fast_llm.engine.base_model.config import LossDef from fast_llm.engine.config_utils.tensor_dim import TensorDim from fast_llm.engine.distributed.config import DistributedConfig +from fast_llm.layers.block.block import BlockBase +from fast_llm.layers.common.peft.config import PeftConfig from fast_llm.layers.language_model.config import LanguageModelConfig from fast_llm.layers.language_model.embedding import LanguageModelEmbedding logger = logging.getLogger(__name__) -class LanguageModel[ConfigType: LanguageModelConfig](Configurable[ConfigType], LayerBase): +class LanguageModel[ConfigType: LanguageModelConfig](BlockBase[ConfigType]): _config: ConfigType def __init__( self, config: ConfigType, distributed_config: DistributedConfig, + *, + # Unused, but required by the `BlockBase` interface. + hidden_dim: TensorDim | None = None, + lr_scale: float | None, + peft: PeftConfig | None, ): - super().__init__(config, distributed_config) - - self._hidden_dim = TensorDim("hidden", config.embeddings.hidden_size) + super().__init__( + config, + distributed_config, + hidden_dim=TensorDim("hidden", self._config.hidden_size), + lr_scale=lr_scale, + peft=peft, + ) self.embeddings: LanguageModelEmbedding = self._config.embeddings.get_layer( distributed_config, hidden_dim=self._hidden_dim, - lr_scale=None, - peft=self._config.peft, + lr_scale=self._lr_scale, + peft=self._peft, ) self.decoder = self._config.decoder.get_layer( distributed_config, self._hidden_dim, - lr_scale=None, - peft=self._config.peft, + lr_scale=self._lr_scale, + peft=self._peft, ) self.head = self._config.head.get_layer( distributed_config, self._config.embeddings, hidden_dim=self._hidden_dim, - lr_scale=None, - peft=self._config.peft, + lr_scale=self._lr_scale, + peft=self._peft, ) - def get_layers(self) -> list["Layer"]: + def get_layers(self) -> list[Layer]: return self.embeddings.get_layers() + self.decoder.get_layers() + self.head.get_layers() - def preprocess(self, batch: "torch.Tensor", kwargs: dict[str, typing.Any]) -> None: + def preprocess(self, batch: torch.Tensor, kwargs: dict[str, typing.Any]) -> None: # Needed because the base class uses `get_layers` which may bypass the decoder and head. TODO: Avoidable? 
self.embeddings.preprocess(batch, kwargs) self.decoder.preprocess(batch, kwargs) diff --git a/fast_llm/models/gpt/config.py b/fast_llm/models/gpt/config.py index 1e57f3b8..a901a046 100644 --- a/fast_llm/models/gpt/config.py +++ b/fast_llm/models/gpt/config.py @@ -10,6 +10,7 @@ from fast_llm.engine.multi_stage.config import FastLLMModelConfig, PretrainedFastLLMModelConfig from fast_llm.engine.schedule.config import BatchConfig from fast_llm.engine.training.config import TrainerConfig +from fast_llm.layers.common.peft.config import PeftConfig from fast_llm.layers.language_model.config import LanguageModelConfig, MultiTokenPredictionConfig from fast_llm.models.gpt.conversion.config import ( AprielHybridSSMCheckpointFormat, @@ -84,6 +85,11 @@ def micro_batch_splits(self) -> int: class GPTBaseModelConfig(LanguageModelConfig, BaseModelConfig): _abstract = False + # TODO: Allow overriding in sub-models? + peft: PeftConfig = Field( + desc="Configuration for parameter-efficient fine tuning.", + hint=FieldHint.architecture, + ) # Debug, to get an exact match with megatron init. use_megatron_initialization: bool = Field( default=False, desc="Exactly match the initialization of a Megatron model.", hint=FieldHint.testing diff --git a/fast_llm/models/gpt/conversion/llama.py b/fast_llm/models/gpt/conversion/llama.py index 786d923f..a9249226 100644 --- a/fast_llm/models/gpt/conversion/llama.py +++ b/fast_llm/models/gpt/conversion/llama.py @@ -449,19 +449,13 @@ def get_converters( class LlamaEmbeddingsConverter: @classmethod def import_config(cls, config: dict) -> dict: - return { - "vocab_size": config["vocab_size"], - "hidden_size": config["hidden_size"], - } + return {"vocab_size": config["vocab_size"]} @classmethod def export_config(cls, config: LanguageModelEmbeddingsConfig) -> dict: Assert.custom(isinstance, config, LanguageModelEmbeddingsConfig) assert not config.position_embeddings.enabled - return { - "vocab_size": config.vocab_size, - "hidden_size": config.hidden_size, - } + return {"vocab_size": config.vocab_size} @classmethod def get_converters( @@ -516,6 +510,7 @@ def import_config(cls, config: dict) -> dict: "embeddings": cls.embeddings_converter_class.import_config(config), "decoder": cls.decoder_converter_class.import_config(config), "head": cls.head_converter_class.import_config(config), + "hidden_size": config["hidden_size"], "tied_embedding_weight": config["tie_word_embeddings"], } @@ -526,7 +521,10 @@ def export_config(cls, config: GPTBaseModelConfig) -> dict: cls.embeddings_converter_class.export_config(config.embeddings), cls.decoder_converter_class.export_config(config.decoder), cls.head_converter_class.export_config(config.head), - {"tie_word_embeddings": config.tied_embedding_weight}, + { + "tie_word_embeddings": config.tied_embedding_weight, + "hidden_size": config.hidden_size, + }, ) @classmethod diff --git a/fast_llm/models/gpt/model.py b/fast_llm/models/gpt/model.py index 2c1fb0e4..158bbd92 100644 --- a/fast_llm/models/gpt/model.py +++ b/fast_llm/models/gpt/model.py @@ -30,16 +30,14 @@ class GPTBaseModel[ConfigType: GPTBaseModelConfig](LanguageModel[ConfigType], Ba def __init__( self, - config: GPTBaseModelConfig, + config: ConfigType, distributed_config: DistributedConfig, ): - super().__init__(config, distributed_config) + super().__init__(config, distributed_config, lr_scale=self._config.lr_scale, peft=self._config.peft) if self._config.use_megatron_initialization: for param in self.parameters(): Assert.custom(isinstance, param, ParameterMeta) - param.init_parameter = 
get_init_megatron(
-                    param, self._config.decoder.block, config.embeddings.hidden_size
-                )  # Noqa
+                param.init_parameter = get_init_megatron(param, self._config.decoder.block, config.hidden_size)  # Noqa
 
     def preprocess_meta(
         self, batch_meta: GPTBatchConfig | torch.Tensor, phase: PhaseType
diff --git a/fast_llm/utils.py b/fast_llm/utils.py
index bbd69ae8..1f9feceb 100644
--- a/fast_llm/utils.py
+++ b/fast_llm/utils.py
@@ -316,9 +316,7 @@ def new_decorator(*args, **kwargs):
     return new_decorator
 
 
-def compare_nested(
-    config_a, config_b, errors: list | None = None, prefix: tuple = (), ignore_missing: tuple[str, ...] = ()
-):
+def compare_nested(config_a, config_b, errors: list | None = None, prefix: tuple = ()):
     if errors is None:
         errors = []
     # Check for equality of both values and types.
diff --git a/tests/layers/test_lm_head.py b/tests/layers/test_lm_head.py
index 0de823e2..d65d33a8 100644
--- a/tests/layers/test_lm_head.py
+++ b/tests/layers/test_lm_head.py
@@ -255,7 +255,7 @@ def test_lm_head(
         logit_weight = torch.nn.Parameter(
             torch.empty(
                 VOCAB_SIZE, HIDDEN_SIZE, dtype=distributed.config.compute_dtype.torch, device=distributed.device
-            ).normal_(config.embeddings.hidden_size**-0.5)
+            ).normal_(config.hidden_size**-0.5)
         )
     else:
         logit_weight = None

From 4a9698003c78326108f465d89da5b82800dc4366 Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier
Date: Mon, 6 Oct 2025 16:27:07 -0400
Subject: [PATCH 2/3] fixes

---
 Dockerfile                                  |  5 ++++
 examples/mistral.yaml                       |  2 +-
 fast_llm/engine/checkpoint/huggingface.py   |  5 ++--
 .../layers/language_model/language_model.py |  4 +--
 fast_llm/models/gpt/model.py                |  2 +-
 tests/layers/test_lm_head.py                | 26 +++++++++----------
 tests/test_config.py                        | 13 ++++------
 tests/utils/model_configs.py                |  2 +-
 8 files changed, 29 insertions(+), 30 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 526026fa..00e13d95 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -47,3 +47,8 @@ COPY --chmod=777 ./tests tests
 COPY --chmod=777 ./tools tools
 COPY --chmod=777 ./fast_llm_external_models fast_llm_external_models
 COPY --chmod=777 --exclude=./fast_llm/csrc/ ./fast_llm/ fast_llm/
+
+# Set a dummy default user so we don't run as root by default.
+# The image is still compatible with any user id.
+RUN useradd user +USER user diff --git a/examples/mistral.yaml b/examples/mistral.yaml index 2e4a57de..904325c5 100644 --- a/examples/mistral.yaml +++ b/examples/mistral.yaml @@ -28,7 +28,6 @@ optimizer: model: base_model: embeddings: - hidden_size: 4096 vocab_size: 32000 dropout: 0.0 decoder: @@ -58,6 +57,7 @@ model: normalization: type: rms_norm epsilon: 1.0e-05 + hidden_size: 4096 tied_embedding_weight: false multi_stage: zero_stage: 2 diff --git a/fast_llm/engine/checkpoint/huggingface.py b/fast_llm/engine/checkpoint/huggingface.py index afe38129..96fb5332 100644 --- a/fast_llm/engine/checkpoint/huggingface.py +++ b/fast_llm/engine/checkpoint/huggingface.py @@ -150,7 +150,6 @@ def _load_weights( ].values() } elif (config.path / transformers.utils.WEIGHTS_NAME).is_file(): - # TODO: Prevent unsafe by default paths = {config.path / transformers.utils.WEIGHTS_NAME} elif (config.path / transformers.utils.WEIGHTS_INDEX_NAME).is_file(): logger.info(f"Loading index from {config.path / transformers.utils.WEIGHTS_INDEX_NAME}") @@ -170,7 +169,7 @@ def _load_weights( for key in f.keys(): yield key, "weights", f.get_slice(key) elif path.suffix == ".bin": - # TODO: Prevent unsafe by default - yield from torch.load(path) + # TODO: Confirm that loading works with `weights_only=True` + yield from torch.load(path, weights_only=True) else: raise NotImplementedError(f"Unknown file format for {path}") diff --git a/fast_llm/layers/language_model/language_model.py b/fast_llm/layers/language_model/language_model.py index 56d41dc3..2e46bb57 100644 --- a/fast_llm/layers/language_model/language_model.py +++ b/fast_llm/layers/language_model/language_model.py @@ -23,7 +23,7 @@ def __init__( config: ConfigType, distributed_config: DistributedConfig, *, - # Unused, but required by the `BlockBase` interface. + # TODO: Unused, but required by the `BlockBase` interface. 
hidden_dim: TensorDim | None = None, lr_scale: float | None, peft: PeftConfig | None, @@ -31,7 +31,7 @@ def __init__( super().__init__( config, distributed_config, - hidden_dim=TensorDim("hidden", self._config.hidden_size), + hidden_dim=TensorDim("hidden", config.hidden_size), lr_scale=lr_scale, peft=peft, ) diff --git a/fast_llm/models/gpt/model.py b/fast_llm/models/gpt/model.py index 158bbd92..efa348ec 100644 --- a/fast_llm/models/gpt/model.py +++ b/fast_llm/models/gpt/model.py @@ -33,7 +33,7 @@ def __init__( config: ConfigType, distributed_config: DistributedConfig, ): - super().__init__(config, distributed_config, lr_scale=self._config.lr_scale, peft=self._config.peft) + super().__init__(config, distributed_config, lr_scale=config.lr_scale, peft=config.peft) if self._config.use_megatron_initialization: for param in self.parameters(): Assert.custom(isinstance, param, ParameterMeta) diff --git a/tests/layers/test_lm_head.py b/tests/layers/test_lm_head.py index d65d33a8..5c044596 100644 --- a/tests/layers/test_lm_head.py +++ b/tests/layers/test_lm_head.py @@ -171,22 +171,20 @@ def test_lm_head( } config = GPTBaseModelConfig.from_dict( { - "decoder": { - "num_blocks": 0, - }, - "embeddings": { - "vocab_size": VOCAB_SIZE, - "hidden_size": HIDDEN_SIZE, - }, + "decoder": {"num_blocks": 0}, + "embeddings": {"vocab_size": VOCAB_SIZE}, "head": ( - head_config - if prediction_heads == 1 - else { - "type": "multi_token_prediction", - "head": head_config, - "prediction_heads": prediction_heads, - } + ( + head_config + if prediction_heads == 1 + else { + "type": "multi_token_prediction", + "head": head_config, + "prediction_heads": prediction_heads, + } + ), ), + "hidden_size": HIDDEN_SIZE, }, config_dict, update_type=UpdateType.update, diff --git a/tests/test_config.py b/tests/test_config.py index 32620053..63f2606f 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -74,9 +74,6 @@ def test_pretrained_config(load_config: ModelConfigType, result_path): pretrained_model_config = GPTModelConfig.from_dict( { "base_model": { - "embeddings": { - "hidden_size": 1024, # Default - }, "decoder": { "block": { "mixer": { @@ -92,6 +89,7 @@ def test_pretrained_config(load_config: ModelConfigType, result_path): }, "num_blocks": 12, # Default }, + "hidden_size": 1024, # Default "tied_embedding_weight": False, }, "multi_stage": {"zero_stage": 3}, @@ -105,7 +103,7 @@ def test_pretrained_config(load_config: ModelConfigType, result_path): pretrained_model_config.save_metadata(save_config) base_model_update = { - "embeddings": {"hidden_size": 512, "vocab_size": 1000}, + "embeddings": {"vocab_size": 1000}, "decoder": { "block": { "mixer": { @@ -115,6 +113,7 @@ def test_pretrained_config(load_config: ModelConfigType, result_path): "normalization": {"implementation": "triton"}, # Update non-default nested }, }, + "hidden_size": 512, "peft": {"type": "lora", "freeze_others": False}, # Update default nested, change type } pretrained_config = PretrainedGPTModelConfig.from_dict( @@ -134,10 +133,7 @@ def test_pretrained_config(load_config: ModelConfigType, result_path): expected_config["distributed"].update({"seed": 1234, "compute_dtype": "float16"}) if load_config in (ModelConfigType.fast_llm, ModelConfigType.model): expected_config["base_model"] = { - "embeddings": { - "hidden_size": 512, - "vocab_size": 1000, - }, + "embeddings": {"vocab_size": 1000}, "decoder": { "block": { "mixer": { @@ -152,6 +148,7 @@ def test_pretrained_config(load_config: ModelConfigType, result_path): }, "num_blocks": 12, }, + 
"hidden_size": 512, "tied_embedding_weight": False, "peft": {"freeze_others": False}, } diff --git a/tests/utils/model_configs.py b/tests/utils/model_configs.py index 6b313aa8..77d03825 100644 --- a/tests/utils/model_configs.py +++ b/tests/utils/model_configs.py @@ -192,7 +192,6 @@ def _update_and_add_testing_config( "embeddings": { "word_embeddings": init_1, "position_embeddings": {"enabled": True, **init_1}, - "hidden_size": 256, "num_position_embeddings": 512, "vocab_size": MODEL_TEST_VOCAB_SIZE, }, @@ -216,6 +215,7 @@ def _update_and_add_testing_config( "num_blocks": 2, }, "head": {"output_weight": init_1}, + "hidden_size": 256, "tied_embedding_weight": True, }, "multi_stage": { From 785413806b34b12ea5d720d3fee2db640885a044 Mon Sep 17 00:00:00 2001 From: Joel Lamy-Poirier Date: Mon, 6 Oct 2025 17:09:37 -0400 Subject: [PATCH 3/3] fixes --- fast_llm/engine/config_utils/parameter.py | 5 +++-- tests/layers/test_lm_head.py | 16 +++++++--------- tests/utils/model_configs.py | 2 ++ 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/fast_llm/engine/config_utils/parameter.py b/fast_llm/engine/config_utils/parameter.py index 76416d36..c0910c09 100644 --- a/fast_llm/engine/config_utils/parameter.py +++ b/fast_llm/engine/config_utils/parameter.py @@ -1,7 +1,8 @@ import math import typing -from fast_llm.config import Config, Field, FieldHint, config_class +from fast_llm.config import Field, FieldHint, config_class +from fast_llm.engine.base_model.config import ModuleConfig from fast_llm.engine.config_utils.initialization import Initialization, InitializationConfig from fast_llm.engine.config_utils.tensor_dim import TensorDim from fast_llm.layers.common.peft.config import PeftConfig @@ -36,7 +37,7 @@ def combine_lr_scales(*lr_scales: float | None | tuple[float | None, ...]): @config_class() -class ParameterConfig(Config): +class ParameterConfig(ModuleConfig): initialization: InitializationConfig = Field( desc="If provided, override the default initialization method set by the parent layer.", hint=FieldHint.feature, diff --git a/tests/layers/test_lm_head.py b/tests/layers/test_lm_head.py index 5c044596..0dc2421a 100644 --- a/tests/layers/test_lm_head.py +++ b/tests/layers/test_lm_head.py @@ -174,15 +174,13 @@ def test_lm_head( "decoder": {"num_blocks": 0}, "embeddings": {"vocab_size": VOCAB_SIZE}, "head": ( - ( - head_config - if prediction_heads == 1 - else { - "type": "multi_token_prediction", - "head": head_config, - "prediction_heads": prediction_heads, - } - ), + head_config + if prediction_heads == 1 + else { + "type": "multi_token_prediction", + "head": head_config, + "prediction_heads": prediction_heads, + } ), "hidden_size": HIDDEN_SIZE, }, diff --git a/tests/utils/model_configs.py b/tests/utils/model_configs.py index 77d03825..c02521d7 100644 --- a/tests/utils/model_configs.py +++ b/tests/utils/model_configs.py @@ -344,6 +344,8 @@ def _update_and_add_testing_config( ModelTestingGroup.distributed: ModelTestingGroupAction.unimportant, }, ) +del MODEL_CONFIGS["starcoder_2"].config_dict["model"]["base_model"]["embeddings"]["num_position_embeddings"] + _update_and_add_testing_config( # Main tested model.