From a46d581130718f394665ddfa7abb7633f3c3775d Mon Sep 17 00:00:00 2001 From: bigximik Date: Mon, 12 May 2025 07:50:46 +0000 Subject: [PATCH 01/19] copy from denis/generate of only generate support part, with some changes --- fast_llm/engine/inference/config.py | 3 +- fast_llm/engine/inference/huggingface.py | 102 ++++++++++++++++--- fast_llm/engine/inference/runner.py | 39 +++++-- fast_llm/engine/multi_stage/config.py | 4 +- fast_llm/engine/multi_stage/stage.py | 18 ++++ fast_llm/layers/language_model/head.py | 10 +- fast_llm/layers/transformer/preprocessing.py | 2 +- fast_llm/models/custom/config.py | 2 +- fast_llm/models/gpt/config.py | 2 +- fast_llm/models/gpt/huggingface.py | 58 ++++++++--- tests/test_checkpoint.py | 2 +- 11 files changed, 194 insertions(+), 48 deletions(-) diff --git a/fast_llm/engine/inference/config.py b/fast_llm/engine/inference/config.py index d4b46bcc0..c18daa48d 100644 --- a/fast_llm/engine/inference/config.py +++ b/fast_llm/engine/inference/config.py @@ -91,7 +91,8 @@ def __eq__(self, other) -> bool: def to_dict(self) -> dict[str, typing.Any]: out = super().to_dict() - out["fast_llm_config"] = self.fast_llm_config.to_dict(verbose=FieldVerboseLevel.everything) + if self.fast_llm_config is not None: + out["fast_llm_config"] = self.fast_llm_config.to_dict(verbose=FieldVerboseLevel.everything) return out def to_diff_dict(self) -> dict[str, typing.Any]: diff --git a/fast_llm/engine/inference/huggingface.py b/fast_llm/engine/inference/huggingface.py index 196310b4d..d0ec2bd53 100644 --- a/fast_llm/engine/inference/huggingface.py +++ b/fast_llm/engine/inference/huggingface.py @@ -2,13 +2,19 @@ import pathlib import typing +import torch import transformers.modeling_outputs +import transformers.generation.utils from fast_llm.engine.checkpoint.config import CheckpointLoadConfig, FastLLMCheckpointFormat from fast_llm.engine.inference.config import HuggingfaceModelConfig from fast_llm.engine.inference.runner import InferenceRunner from fast_llm.engine.multi_stage.config import StageMode from fast_llm.engine.multi_stage.fast_llm_model import FastLLMModel +from fast_llm.engine.schedule.config import BatchConfig, ScheduleConfig +from fast_llm.engine.schedule.runner import ScheduleRunner +from fast_llm.engine.schedule.schedule import Schedule +from fast_llm.engine.training.config import TrainerConfig class HuggingfacePreTrainedModel(transformers.PreTrainedModel): @@ -20,31 +26,91 @@ class HuggingfacePreTrainedModel(transformers.PreTrainedModel): # _supports_cache_class = False # _tied_weights_keys = [] - def __init__(self, config: HuggingfaceModelConfig, fast_llm_model: FastLLMModel, **kwargs): + def __init__( + self, + config: HuggingfaceModelConfig, + fast_llm_model: FastLLMModel, + micro_batch_size: int | None = None, + runner: ScheduleRunner | None = None, + **kwargs, + ): assert self.runner_class.model_class.config_class is config.model_config_class assert config.fast_llm_config is fast_llm_model.config assert isinstance(config, self.config_class) + # The HF constructor performs a deep copy of the config, + # but config.fast_llm_config may contain non-picklable items like process groups. + # Temporarily remove it before the call and restore it afterward. 
+ fast_llm_config = config.fast_llm_config + config.fast_llm_config = None super().__init__(config, **kwargs) + config.fast_llm_config = fast_llm_config + + self._inference_runner = self.runner_class(fast_llm_model, micro_batch_size, runner) + + # A model can be created from pretrained which set it up in the current HF wrapper api + # or set existing model which also must be setup, so, do not accept not setup model + assert fast_llm_model.is_setup + + # We only support data parallel for now + assert ( + fast_llm_model.distributed.config.model_parallel == 1 + and fast_llm_model.distributed.config.sequence_tensor_parallel == 1 + ) - self._inference_runner = self.runner_class(fast_llm_model) - if not fast_llm_model.is_setup: - fast_llm_model.setup(mode=StageMode.inference) self._inference_runner.setup() + # Transformers needs to be able to inspect the base model. self.fast_llm_base_model = fast_llm_model.base_model - # TODO: Support distributed models? - assert fast_llm_model.config.distributed.world_size == 1 + # # TODO: Support distributed models? + # assert fast_llm_model.config.distributed.world_size == 1 with transformers.modeling_utils.no_init_weights(): self.post_init() + def forward( + self, + input_ids: torch.Tensor | None = None, + attention_mask: torch.Tensor | None = None, + position_ids: torch.Tensor | None = None, + past_key_values=None, + inputs_embeds: torch.FloatTensor | None = None, + labels: torch.LongTensor | None = None, + use_cache: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + ) -> tuple | transformers.modeling_outputs.CausalLMOutputWithPast: + # Meant to be overridden in derived classes + raise NotImplementedError() + + @classmethod + def from_model( + cls, + fast_llm_model: FastLLMModel, + micro_batch_size: int | None = None, + runner: ScheduleRunner | None = None, + **kwargs, + ): + config = cls.config_class(fast_llm_model.config) + return cls( + config, + fast_llm_model, + micro_batch_size=micro_batch_size, + runner=runner, + **kwargs, + ) + @classmethod def from_pretrained( cls, pretrained_model_name_or_path: str | os.PathLike | CheckpointLoadConfig, - *, - mode: StageMode = StageMode.inference, + *updates: dict[str | tuple[str, ...], typing.Any], + optimizer_state_names: tuple[str, ...] | None = None, + # setup: bool = True, + mode: StageMode = StageMode.training, + use_cpu: bool = False, + stage_filter: set | None = None, **kwargs, ) -> typing.Self: # Pretrained config. 
@@ -54,18 +120,24 @@ def from_pretrained( format=FastLLMCheckpointFormat, ) - updates = {} - torch_dtype = kwargs.pop("torch_dtype", None) - if torch_dtype is not None: - updates[("distributed", "training_dtype")] = torch_dtype - # Create the model + # always set up model and crate distributed instance internally for now fast_llm_model = cls.runner_class.model_class.from_pretrained( - pretrained_model_name_or_path, updates, mode=mode + pretrained_model_name_or_path, + *updates, + optimizer_state_names=optimizer_state_names, + # setup=setup, + mode=mode, + use_cpu=use_cpu, + stage_filter=stage_filter, ) - config = cls.config_class(fast_llm_model.config) + config = cls.config_class(fast_llm_model.config) return cls(config, fast_llm_model, **kwargs) def _init_weights(self, module) -> None: raise NotImplementedError(module) + + +class HuggingfaceBaseModelForCausalLM(HuggingfacePreTrainedModel, transformers.generation.utils.GenerationMixin): + pass diff --git a/fast_llm/engine/inference/runner.py b/fast_llm/engine/inference/runner.py index 30f836b77..52eff82b6 100644 --- a/fast_llm/engine/inference/runner.py +++ b/fast_llm/engine/inference/runner.py @@ -7,27 +7,43 @@ from fast_llm.engine.schedule.config import BatchConfig, ScheduleConfig from fast_llm.engine.schedule.runner import ScheduleRunner from fast_llm.engine.schedule.schedule import Schedule +from fast_llm.engine.training.config import TrainerConfig class InferenceRunner(abc.ABC): model_class: typing.ClassVar[type[FastLLMModel]] = FastLLMModel batch_config_class: typing.ClassVar[type[BatchConfig]] = BatchConfig - def __init__(self, fast_llm_model: FastLLMModel): + def __init__( + self, + fast_llm_model: FastLLMModel, + micro_batch_size: int | None = None, + runner: ScheduleRunner | None = None, + ): assert isinstance(fast_llm_model, self.model_class) self._fast_llm_model = fast_llm_model - # We only need a basic schedule and don't care about dimensions. - self._schedule_config = ScheduleConfig() - # TODO: Sort things out. + with NoAutoValidate(): - self._batch_config = self.batch_config_class() + self._batch_config = self.batch_config_class(micro_batch_size=micro_batch_size) self._batch_config.setup(self._fast_llm_model.config.distributed) self._batch_config.validate() - self._runner = ScheduleRunner( - config=self._schedule_config, - multi_stage=self._fast_llm_model, - distributed_config=self._fast_llm_model.config.distributed, - ) + + if runner is None: + # We only need a basic schedule and don't care about dimensions. + self._schedule_config = ScheduleConfig() + # TODO: Sort things out. + + self._runner = ScheduleRunner( + config=self._schedule_config, + multi_stage=self._fast_llm_model, + distributed_config=self._fast_llm_model.config.distributed, + ) + else: + self._schedule_config = runner.config + self._runner = runner + # External runner from training loop must be already setup + assert runner._is_setup + # TODO: Random state? 
(Distributed.set_step) self._schedule = Schedule( multi_stage=self._fast_llm_model, @@ -42,7 +58,8 @@ def fast_llm_model(self) -> FastLLMModel: return self._fast_llm_model def setup(self): - self._runner.setup(self._fast_llm_model.distributed) + if not self._runner._is_setup: + self._runner.setup(self._fast_llm_model.distributed) def forward( self, input_, kwargs: dict, *, iteration: int = 1, return_metrics: bool = False diff --git a/fast_llm/engine/multi_stage/config.py b/fast_llm/engine/multi_stage/config.py index e2d04f80f..ae3abc706 100644 --- a/fast_llm/engine/multi_stage/config.py +++ b/fast_llm/engine/multi_stage/config.py @@ -30,7 +30,7 @@ from fast_llm.utils import Assert if typing.TYPE_CHECKING: - from fast_llm.engine.inference.huggingface import HuggingfacePreTrainedModel + from fast_llm.engine.inference.huggingface import HuggingfaceBaseModelForCausalLM from fast_llm.engine.multi_stage.fast_llm_model import FastLLMModel logger = logging.getLogger(__name__) @@ -247,7 +247,7 @@ def get_model_class(cls) -> type["FastLLMModel"]: raise NotImplementedError @classmethod - def get_huggingface_model_class(cls) -> type["HuggingfacePreTrainedModel"]: + def get_huggingface_model_for_causal_lm_class(cls) -> type["HuggingfaceBaseModelForCausalLM"]: raise NotImplementedError @classmethod diff --git a/fast_llm/engine/multi_stage/stage.py b/fast_llm/engine/multi_stage/stage.py index 675e878b3..a3ac98e8b 100644 --- a/fast_llm/engine/multi_stage/stage.py +++ b/fast_llm/engine/multi_stage/stage.py @@ -13,6 +13,9 @@ from fast_llm.tensor import ParameterMeta, TensorMeta, accumulate_gradient from fast_llm.utils import Assert +if typing.TYPE_CHECKING: + from fast_llm.core.distributed import ProcessGroup + logger = logging.getLogger(__name__) @@ -111,6 +114,21 @@ def forward( metrics, ) self._log_layer_forward(output, kwargs, i) + + # TODO: very slow and memory consuming, only use for debugging for now + # TODO: decide if and how we want to return + # HF transformer style details from forward properly + if "output_hidden_states" in kwargs and kwargs["output_hidden_states"]: + # Last layer does not provide output + if output is not None: + meta = self._meta_outputs[i] + output_global, _ = meta.local_to_global(output.detach(), distributed=self._distributed) + else: + output_global = None + kwargs["hidden_states"][self._layer_range[i]] = { + "layer_type": type(layer).__name__, + "tensor": output_global, + } return None if output is None else output.detach(), (input_, output) def backward( diff --git a/fast_llm/layers/language_model/head.py b/fast_llm/layers/language_model/head.py index 813dcc076..79c9f61b1 100644 --- a/fast_llm/layers/language_model/head.py +++ b/fast_llm/layers/language_model/head.py @@ -5,7 +5,7 @@ from torch.distributed import all_reduce from fast_llm.config import Configurable -from fast_llm.core.ops import split_op +from fast_llm.core.ops import gather_op, split_op from fast_llm.engine.base_model.base_model import Layer from fast_llm.engine.config_utils.tensor_space import DefaultDimNames, TensorDim, TensorSpace from fast_llm.engine.distributed.config import DistributedDimNames @@ -175,6 +175,14 @@ def _forward_backward( with torch.enable_grad(): ln_output = self.final_norm(input_) + if "output_hidden_states" in kwargs and kwargs["output_hidden_states"]: + # The last hidden layer output is returned normalized in the HF Transformers-style output, at least for LLama style models. + # So, if needed, we gather the data after normalization and set it as the output of the previous layer. 
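+            # When the output is sequence-parallel, each rank only holds its shard of the sequence (dim 0),
+            # so it is gathered across the tensor group before being exposed; otherwise the local tensor is used as-is.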
+ group = self._tensor_space.distributed.tensor_group if self._parallel_embeddings else None + sequence_parallel = self._sequence_parallel and self._parallel_embeddings + hidden_state = gather_op(ln_output.detach(), group, dim=0) if sequence_parallel else ln_output.detach() + kwargs["hidden_states"][len(kwargs["hidden_states"]) - 1]["tensor"] = hidden_state + grad_output = kwargs[TransformerKwargs.grad_output] / ( self._group_size if self._sequence_parallel_logits else 1 ) diff --git a/fast_llm/layers/transformer/preprocessing.py b/fast_llm/layers/transformer/preprocessing.py index 2415a2f91..1e07c1c12 100644 --- a/fast_llm/layers/transformer/preprocessing.py +++ b/fast_llm/layers/transformer/preprocessing.py @@ -239,7 +239,7 @@ def preprocess(self, batch, kwargs: dict[str, typing.Any]) -> None: ] if (sequence_lengths := kwargs.get(TransformerKwargs.sequence_lengths, None)) is not None: seq_ids = torch.stack( - [torch.cat([torch.arange(x) for x in sample_lens]) for sample_lens in sequence_lengths] + [torch.cat([torch.full((x,), i) for i, x in enumerate(sample_lens)]) for sample_lens in sequence_lengths] ) document_mask = (seq_ids[:, None, :] == seq_ids[:, :, None]).to(self._tensor_space.distributed.device) kwargs[TransformerKwargs.attention_mask] = ( diff --git a/fast_llm/models/custom/config.py b/fast_llm/models/custom/config.py index 8be45e1c2..f9805e643 100644 --- a/fast_llm/models/custom/config.py +++ b/fast_llm/models/custom/config.py @@ -35,7 +35,7 @@ def get_model_class(cls) -> type["CustomModel"]: return CustomModel @classmethod - def get_huggingface_model_class(cls) -> type["HuggingfaceCustomModelForCausalLM"]: + def get_huggingface_model_for_causal_lm_class(cls) -> type["HuggingfaceCustomModelForCausalLM"]: from fast_llm.models.custom.huggingface import HuggingfaceCustomModelForCausalLM return HuggingfaceCustomModelForCausalLM diff --git a/fast_llm/models/gpt/config.py b/fast_llm/models/gpt/config.py index 418f948e3..3852d83fc 100644 --- a/fast_llm/models/gpt/config.py +++ b/fast_llm/models/gpt/config.py @@ -147,7 +147,7 @@ def get_model_class(cls) -> type["GPTModel"]: return GPTModel @classmethod - def get_huggingface_model_class(cls) -> type["HuggingfaceGPTModelForCausalLM"]: + def get_huggingface_model_for_causal_lm_class(cls) -> type["HuggingfaceGPTModelForCausalLM"]: from fast_llm.models.gpt.huggingface import HuggingfaceGPTModelForCausalLM return HuggingfaceGPTModelForCausalLM diff --git a/fast_llm/models/gpt/huggingface.py b/fast_llm/models/gpt/huggingface.py index 0da4acbb4..7e668e735 100644 --- a/fast_llm/models/gpt/huggingface.py +++ b/fast_llm/models/gpt/huggingface.py @@ -5,10 +5,11 @@ import torch import transformers.modeling_outputs + from fast_llm.data.data.gpt.data import GPTBatch from fast_llm.engine.distributed.config import PhaseType from fast_llm.engine.inference.config import HuggingfaceModelConfig -from fast_llm.engine.inference.huggingface import HuggingfacePreTrainedModel +from fast_llm.engine.inference.huggingface import HuggingfaceBaseModelForCausalLM from fast_llm.layers.transformer.config import TransformerKwargs from fast_llm.models.gpt.config import GPTModelConfig from fast_llm.models.gpt.model import GPTBaseModel, GPTInferenceRunner @@ -22,7 +23,7 @@ class HuggingfaceGPTModelConfig(HuggingfaceModelConfig): fast_llm_config: GPTModelConfig -class HuggingfaceGPTModelForCausalLM(HuggingfacePreTrainedModel): +class HuggingfaceGPTModelForCausalLM(HuggingfaceBaseModelForCausalLM): config_class = HuggingfaceGPTModelConfig config: HuggingfaceGPTModelConfig 
runner_class: typing.ClassVar[type[GPTInferenceRunner]] = GPTInferenceRunner @@ -55,21 +56,33 @@ def forward( if output_attentions: raise NotImplementedError() - if output_hidden_states: - raise NotImplementedError() - if attention_mask is not None: - raise NotImplementedError() - if position_ids is not None: - raise NotImplementedError() if inputs_embeds is not None: raise NotImplementedError() if labels is not None: raise NotImplementedError() + # NOTE: We are ignoring position_ids as we reconstruct them from attention_mask via sequence_lenghts. + if attention_mask is not None: + + # First non zero indexes or zero index if the row is all zeros (invalid row) + first_non_zero_indexes = attention_mask.argmax(dim=1) + + # Check if the sequence is left-padded and if the remaining ones are continuous 1-ns + assert (attention_mask.sum(axis=1) == (attention_mask.shape[1] - first_non_zero_indexes)).all() + + sequence_lenghts = [ + torch.tensor( + [attention_mask.shape[1]] if el == 0 else [el, attention_mask.shape[1] - el], dtype=torch.int64 + ) + for el in first_non_zero_indexes.tolist() + ] + else: + sequence_lenghts = None + # Iteration serves as a random seed, using random module because it's not seeded by Fast LLM iteration = random.randint(0, 2**32) batch = self.fast_llm_base_model.preprocess( - GPTBatch(input_ids), phase=PhaseType.inference, iteration=iteration + GPTBatch(input_ids, sequence_lengths=sequence_lenghts), phase=PhaseType.inference, iteration=iteration ) ((input_, kwargs),) = batch @@ -82,23 +95,40 @@ def forward( # The transformers will save the present keys and values to this list. kwargs[TransformerKwargs.presents] = [] + if output_hidden_states: + kwargs["output_hidden_states"] = True + kwargs["hidden_states"] = {} + else: + kwargs["output_hidden_states"] = False + self._inference_runner.forward(input_, kwargs, iteration=iteration) # TODO: Make a proper way of returning the model output. 
logits = kwargs["logits"] + # TODO: convert hidden state form dict to list to be the same as with HFs + hidden_states = None + if output_hidden_states: + hidden_states = kwargs["hidden_states"] + if not return_dict: - outputs = (logits,) + # TODO: check hidden state go before past in the tuple + if output_hidden_states: + outputs = (logits, hidden_states) + else: + outputs = (logits,) + if use_cache: outputs += (kwargs[TransformerKwargs.presents],) return outputs return transformers.modeling_outputs.CausalLMOutputWithPast( logits=logits, + hidden_states=hidden_states, past_key_values=kwargs[TransformerKwargs.presents], ) - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - raise NotImplementedError() + # def prepare_inputs_for_generation( + # self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + # ): + # raise NotImplementedError() diff --git a/tests/test_checkpoint.py b/tests/test_checkpoint.py index 257947e96..91d629422 100644 --- a/tests/test_checkpoint.py +++ b/tests/test_checkpoint.py @@ -32,7 +32,7 @@ from tests.compare_tensor_logs import CompareConfig, compare_logged_tensor TEST_MODEL_CONFIG_CLS = model_registry[TEST_MODEL_TYPE] -TEST_MODEL_HF_CLS = TEST_MODEL_CONFIG_CLS.get_huggingface_model_class() +TEST_MODEL_HF_CLS = TEST_MODEL_CONFIG_CLS.get_huggingface_model_for_causal_lm_class() TEST_MODEL_CLS = TEST_MODEL_CONFIG_CLS.get_model_class() TEST_BASE_MODEL_CONFIG_CLS = TEST_MODEL_CONFIG_CLS.get_base_model_config_class() From 9a81f8906dfd9526bcacb431b396a2f207a9c9c7 Mon Sep 17 00:00:00 2001 From: bigximik Date: Mon, 12 May 2025 11:00:25 +0000 Subject: [PATCH 02/19] fix to use right config param --- fast_llm/engine/inference/huggingface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fast_llm/engine/inference/huggingface.py b/fast_llm/engine/inference/huggingface.py index d0ec2bd53..3dc79aa0c 100644 --- a/fast_llm/engine/inference/huggingface.py +++ b/fast_llm/engine/inference/huggingface.py @@ -55,7 +55,7 @@ def __init__( # We only support data parallel for now assert ( fast_llm_model.distributed.config.model_parallel == 1 - and fast_llm_model.distributed.config.sequence_tensor_parallel == 1 + and fast_llm_model.distributed.config.sequence_data_parallel == 1 ) self._inference_runner.setup() From 667aacf88bf7d30545819fe839a2d04d26836fac Mon Sep 17 00:00:00 2001 From: bigximik Date: Mon, 12 May 2025 11:01:25 +0000 Subject: [PATCH 03/19] added basic generate tests --- tests/test_generate.py | 144 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 tests/test_generate.py diff --git a/tests/test_generate.py b/tests/test_generate.py new file mode 100644 index 000000000..a652a741f --- /dev/null +++ b/tests/test_generate.py @@ -0,0 +1,144 @@ +import pytest +import torch + +import huggingface_hub + +from transformers import AutoTokenizer, AutoModelForCausalLM + +from fast_llm.engine.checkpoint.config import CheckpointLoadConfig +from fast_llm.models.gpt.config import LlamaGPTHuggingfaceCheckpointFormat +from fast_llm.models.gpt.huggingface import HuggingfaceGPTModelForCausalLM + +from tests.common import requires_cuda, TEST_RESULTS_PATH + + +def _prepare_checkpoint(model: str) -> str: + path = TEST_RESULTS_PATH.resolve() / "generate/model" + model_path = huggingface_hub.snapshot_download(repo_id=model, local_dir=path) + return model_path + + +def _prepare_data(tokenizer, use_batch_size2: bool): + messages = [ + 
{"role": "user", "content": "What is gravity?"}, + {"role": "user", "content": "Who is the president of EU?"}, + ] + if not use_batch_size2: + messages = messages[0:1] + + input_text = [tokenizer.apply_chat_template([el], tokenize=False) for el in messages] + + tokenizer.padding_side = "left" + inputs = tokenizer(input_text, padding="longest", return_tensors="pt").to("cuda") + return inputs + + +def _get_hf_model(model_path: str, use_flash_attention: bool, use_bf16: bool): + hf_kwargs = {} + if use_flash_attention: + hf_kwargs["attn_implementation"] = "flash_attention_2" + hf_kwargs["torch_dtype"] = torch.bfloat16 + elif use_bf16: + hf_kwargs["torch_dtype"] = torch.bfloat16 + return AutoModelForCausalLM.from_pretrained(model_path, **hf_kwargs).to("cuda") + + +def _get_fast_llm_model( + model_path: str, use_flash_attention: bool, use_bf16: bool, checkpoint_format=LlamaGPTHuggingfaceCheckpointFormat +): + updates = {} + if use_flash_attention: + updates[("base_model", "transformer", "use_flash_attention")] = True + updates[("distributed", "training_dtype")] = "bf16" + else: + updates[("base_model", "transformer", "use_flash_attention")] = False + if use_bf16: + updates[("distributed", "training_dtype")] = "bf16" + return HuggingfaceGPTModelForCausalLM.from_pretrained( + CheckpointLoadConfig( + path=model_path, + format=checkpoint_format, + model_weights=True, + ), + updates, + ) + + +def _trim_output(output, inputs): + res = [] + for output_row, input_row in zip(output, inputs["input_ids"]): + res.append(output_row[len(input_row) :]) + return res + + +def _generate_with_params( + tokenizer, + model_path: str, + use_flash_attention: bool, + use_bf16: bool, + use_batch_size2: bool, + max_new_tokens: int, + fast_llm_checkpoint_format=LlamaGPTHuggingfaceCheckpointFormat, +): + inputs = _prepare_data(tokenizer, use_batch_size2) + + hf_model = _get_hf_model(model_path, use_flash_attention, use_bf16) + fast_llm_model = _get_fast_llm_model(model_path, use_flash_attention, use_bf16, fast_llm_checkpoint_format) + + return { + "hf": _trim_output(hf_model.generate(**inputs, max_new_tokens=max_new_tokens, use_cache=False), inputs), + "fast_llm": _trim_output( + fast_llm_model.generate(**inputs, max_new_tokens=max_new_tokens, use_cache=False), inputs + ), + } + + +def _compare_gen_outputs(outputs: dict[str, list], min_matching_tokens: int | None = None): + for hf_output, fast_llm_output in zip(outputs["hf"], outputs["fast_llm"]): + if min_matching_tokens is not None: + hf_output = hf_output[:min_matching_tokens] + fast_llm_output = fast_llm_output[:min_matching_tokens] + assert len(hf_output) == len(fast_llm_output) and all( + hf_char == fast_llm_char for hf_char, fast_llm_char in zip(hf_output, fast_llm_output) + ) + + +@pytest.fixture(scope="module") +def model_and_tokenizer(): + model = "HuggingFaceTB/SmolLM2-135M-Instruct" + fast_llm_checkpoint_format = LlamaGPTHuggingfaceCheckpointFormat + model_path = _prepare_checkpoint(model) + tokenizer = AutoTokenizer.from_pretrained(model_path) + return model_path, tokenizer, fast_llm_checkpoint_format + + +@pytest.mark.slow +@requires_cuda +@pytest.mark.parametrize( + "use_flash_attention, use_bf16, use_batch_size2, max_new_tokens, min_matching_tokens", + [ + # No flash attention + no bf16 + (False, False, False, 10, 10), + (False, False, True, 10, 10), + # No flash attention + with bf16 + (False, True, False, 10, 10), + (False, True, True, 10, 10), + # Flash attention must be paired with bf16 + (True, True, False, 10, 10), + (True, True, True, 10, 10), + ], 
+) +def test_generate( + model_and_tokenizer, use_flash_attention, use_bf16, use_batch_size2, max_new_tokens, min_matching_tokens +): + model_path, tokenizer, fast_llm_checkpoint_format = model_and_tokenizer + outputs = _generate_with_params( + tokenizer, + model_path, + use_flash_attention=use_flash_attention, + use_bf16=use_bf16, + use_batch_size2=use_batch_size2, + max_new_tokens=max_new_tokens, + fast_llm_checkpoint_format=fast_llm_checkpoint_format, + ) + _compare_gen_outputs(outputs, min_matching_tokens=min_matching_tokens) From 124cfa7f0ad79c3bf53e300030687b7e60ed7ef5 Mon Sep 17 00:00:00 2001 From: bigximik Date: Mon, 12 May 2025 11:16:06 +0000 Subject: [PATCH 04/19] clean up --- fast_llm/engine/inference/huggingface.py | 2 -- tests/test_generate.py | 6 ++---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/fast_llm/engine/inference/huggingface.py b/fast_llm/engine/inference/huggingface.py index 3dc79aa0c..ba2700180 100644 --- a/fast_llm/engine/inference/huggingface.py +++ b/fast_llm/engine/inference/huggingface.py @@ -62,8 +62,6 @@ def __init__( # Transformers needs to be able to inspect the base model. self.fast_llm_base_model = fast_llm_model.base_model - # # TODO: Support distributed models? - # assert fast_llm_model.config.distributed.world_size == 1 with transformers.modeling_utils.no_init_weights(): self.post_init() diff --git a/tests/test_generate.py b/tests/test_generate.py index a652a741f..07a8a917d 100644 --- a/tests/test_generate.py +++ b/tests/test_generate.py @@ -93,14 +93,12 @@ def _generate_with_params( } -def _compare_gen_outputs(outputs: dict[str, list], min_matching_tokens: int | None = None): +def _compare_gen_outputs(outputs: dict[str, list[torch.Tensor]], min_matching_tokens: int | None = None): for hf_output, fast_llm_output in zip(outputs["hf"], outputs["fast_llm"]): if min_matching_tokens is not None: hf_output = hf_output[:min_matching_tokens] fast_llm_output = fast_llm_output[:min_matching_tokens] - assert len(hf_output) == len(fast_llm_output) and all( - hf_char == fast_llm_char for hf_char, fast_llm_char in zip(hf_output, fast_llm_output) - ) + assert torch.equal(hf_output, fast_llm_output) @pytest.fixture(scope="module") From fcef337fac9a69d436a024655f725f0b3758737e Mon Sep 17 00:00:00 2001 From: bigximik Date: Tue, 13 May 2025 07:53:30 +0000 Subject: [PATCH 05/19] updated interface and clean up --- fast_llm/engine/inference/huggingface.py | 30 ++++++------------------ fast_llm/engine/inference/runner.py | 3 +-- fast_llm/models/gpt/huggingface.py | 10 ++------ 3 files changed, 10 insertions(+), 33 deletions(-) diff --git a/fast_llm/engine/inference/huggingface.py b/fast_llm/engine/inference/huggingface.py index ba2700180..ea8f134d6 100644 --- a/fast_llm/engine/inference/huggingface.py +++ b/fast_llm/engine/inference/huggingface.py @@ -28,12 +28,14 @@ class HuggingfacePreTrainedModel(transformers.PreTrainedModel): def __init__( self, - config: HuggingfaceModelConfig, fast_llm_model: FastLLMModel, - micro_batch_size: int | None = None, + config: HuggingfaceModelConfig | None = None, runner: ScheduleRunner | None = None, **kwargs, ): + if config is None: + config = self.config_class(fast_llm_model.config) + assert self.runner_class.model_class.config_class is config.model_config_class assert config.fast_llm_config is fast_llm_model.config assert isinstance(config, self.config_class) @@ -46,7 +48,7 @@ def __init__( super().__init__(config, **kwargs) config.fast_llm_config = fast_llm_config - self._inference_runner = 
self.runner_class(fast_llm_model, micro_batch_size, runner) + self._inference_runner = self.runner_class(fast_llm_model, runner) # A model can be created from pretrained which set it up in the current HF wrapper api # or set existing model which also must be setup, so, do not accept not setup model @@ -82,23 +84,6 @@ def forward( # Meant to be overridden in derived classes raise NotImplementedError() - @classmethod - def from_model( - cls, - fast_llm_model: FastLLMModel, - micro_batch_size: int | None = None, - runner: ScheduleRunner | None = None, - **kwargs, - ): - config = cls.config_class(fast_llm_model.config) - return cls( - config, - fast_llm_model, - micro_batch_size=micro_batch_size, - runner=runner, - **kwargs, - ) - @classmethod def from_pretrained( cls, @@ -124,14 +109,13 @@ def from_pretrained( pretrained_model_name_or_path, *updates, optimizer_state_names=optimizer_state_names, - # setup=setup, + setup=True, mode=mode, use_cpu=use_cpu, stage_filter=stage_filter, ) - config = cls.config_class(fast_llm_model.config) - return cls(config, fast_llm_model, **kwargs) + return cls(fast_llm_model, **kwargs) def _init_weights(self, module) -> None: raise NotImplementedError(module) diff --git a/fast_llm/engine/inference/runner.py b/fast_llm/engine/inference/runner.py index 52eff82b6..6e8084601 100644 --- a/fast_llm/engine/inference/runner.py +++ b/fast_llm/engine/inference/runner.py @@ -17,14 +17,13 @@ class InferenceRunner(abc.ABC): def __init__( self, fast_llm_model: FastLLMModel, - micro_batch_size: int | None = None, runner: ScheduleRunner | None = None, ): assert isinstance(fast_llm_model, self.model_class) self._fast_llm_model = fast_llm_model with NoAutoValidate(): - self._batch_config = self.batch_config_class(micro_batch_size=micro_batch_size) + self._batch_config = self.batch_config_class() self._batch_config.setup(self._fast_llm_model.config.distributed) self._batch_config.validate() diff --git a/fast_llm/models/gpt/huggingface.py b/fast_llm/models/gpt/huggingface.py index 7e668e735..500c454d9 100644 --- a/fast_llm/models/gpt/huggingface.py +++ b/fast_llm/models/gpt/huggingface.py @@ -61,9 +61,8 @@ def forward( if labels is not None: raise NotImplementedError() - # NOTE: We are ignoring position_ids as we reconstruct them from attention_mask via sequence_lenghts. + # NOTE: We are ignoring position_ids as we reconstruct them from attention_mask via sequence_lengths. 
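+        # For a left-padded batch, each row is split into [n_padding_tokens, n_real_tokens]
+        # (or a single [sequence_length] entry when there is no padding), so the padding ends up
+        # as its own "document" that the real tokens do not attend to.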
if attention_mask is not None: - # First non zero indexes or zero index if the row is all zeros (invalid row) first_non_zero_indexes = attention_mask.argmax(dim=1) @@ -112,7 +111,7 @@ def forward( hidden_states = kwargs["hidden_states"] if not return_dict: - # TODO: check hidden state go before past in the tuple + # TODO: Then implementing cache, check hidden state goes before past in the tuple if output_hidden_states: outputs = (logits, hidden_states) else: @@ -127,8 +126,3 @@ def forward( hidden_states=hidden_states, past_key_values=kwargs[TransformerKwargs.presents], ) - - # def prepare_inputs_for_generation( - # self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - # ): - # raise NotImplementedError() From 47ad6d04b30aed767e3bc29b0294683c5ac88b8c Mon Sep 17 00:00:00 2001 From: bigximik Date: Tue, 13 May 2025 07:54:12 +0000 Subject: [PATCH 06/19] added from model case, renamed --- ...{test_generate.py => test_gpt_generate.py} | 142 ++++++++++++++---- 1 file changed, 113 insertions(+), 29 deletions(-) rename tests/{test_generate.py => test_gpt_generate.py} (54%) diff --git a/tests/test_generate.py b/tests/test_gpt_generate.py similarity index 54% rename from tests/test_generate.py rename to tests/test_gpt_generate.py index 07a8a917d..15dbfd6d1 100644 --- a/tests/test_generate.py +++ b/tests/test_gpt_generate.py @@ -6,9 +6,13 @@ from transformers import AutoTokenizer, AutoModelForCausalLM from fast_llm.engine.checkpoint.config import CheckpointLoadConfig -from fast_llm.models.gpt.config import LlamaGPTHuggingfaceCheckpointFormat +from fast_llm.engine.distributed.distributed import Distributed +from fast_llm.engine.schedule.config import ScheduleConfig +from fast_llm.engine.schedule.runner import ScheduleRunner +from fast_llm.models.gpt.config import LlamaGPTHuggingfaceCheckpointFormat, PretrainedGPTModelConfig from fast_llm.models.gpt.huggingface import HuggingfaceGPTModelForCausalLM + from tests.common import requires_cuda, TEST_RESULTS_PATH @@ -64,6 +68,42 @@ def _get_fast_llm_model( ) +def _get_fast_llm_model_from_model( + model_path: str, use_flash_attention: bool, use_bf16: bool, checkpoint_format=LlamaGPTHuggingfaceCheckpointFormat +): + updates = { + ("pretrained", "path"): model_path, + ("pretrained", "model_weights"): True, + ("pretrained", "format"): checkpoint_format.name, + } + + if use_flash_attention: + updates[("model", "base_model", "transformer", "use_flash_attention")] = True + updates[("model", "distributed", "training_dtype")] = "bf16" + else: + updates[("model", "base_model", "transformer", "use_flash_attention")] = False + if use_bf16: + updates[("model", "distributed", "training_dtype")] = "bf16" + + config = PretrainedGPTModelConfig.from_dict({}, updates) + multi_stage = config.model.get_model_class()(config.model) + schedule_config = ScheduleConfig() + runner = ScheduleRunner( + config=schedule_config, + multi_stage=multi_stage, + distributed_config=config.model.distributed, + ) + distributed = Distributed(config.model.distributed) + + with torch.no_grad(): + multi_stage.setup(distributed) + runner.setup(distributed) + + multi_stage.load_checkpoint(config.pretrained) + + return HuggingfaceGPTModelForCausalLM(multi_stage, runner=runner) + + def _trim_output(output, inputs): res = [] for output_row, input_row in zip(output, inputs["input_ids"]): @@ -71,20 +111,12 @@ def _trim_output(output, inputs): return res -def _generate_with_params( - tokenizer, - model_path: str, - use_flash_attention: bool, - use_bf16: bool, - 
use_batch_size2: bool, +def _generate( + inputs, + hf_model, + fast_llm_model, max_new_tokens: int, - fast_llm_checkpoint_format=LlamaGPTHuggingfaceCheckpointFormat, ): - inputs = _prepare_data(tokenizer, use_batch_size2) - - hf_model = _get_hf_model(model_path, use_flash_attention, use_bf16) - fast_llm_model = _get_fast_llm_model(model_path, use_flash_attention, use_bf16, fast_llm_checkpoint_format) - return { "hf": _trim_output(hf_model.generate(**inputs, max_new_tokens=max_new_tokens, use_cache=False), inputs), "fast_llm": _trim_output( @@ -101,6 +133,33 @@ def _compare_gen_outputs(outputs: dict[str, list[torch.Tensor]], min_matching_to assert torch.equal(hf_output, fast_llm_output) +def _test_for_batches( + hf_model, + fast_llm_model, + tokenizer, + max_new_tokens, + min_matching_tokens_batch_seize_1, + min_matching_tokens_batch_seize_2, +): + inputs = _prepare_data(tokenizer, use_batch_size2=False) + outputs = _generate( + inputs, + hf_model, + fast_llm_model, + max_new_tokens=max_new_tokens, + ) + _compare_gen_outputs(outputs, min_matching_tokens=min_matching_tokens_batch_seize_1) + + inputs = _prepare_data(tokenizer, use_batch_size2=True) + outputs = _generate( + inputs, + hf_model, + fast_llm_model, + max_new_tokens=max_new_tokens, + ) + _compare_gen_outputs(outputs, min_matching_tokens=min_matching_tokens_batch_seize_2) + + @pytest.fixture(scope="module") def model_and_tokenizer(): model = "HuggingFaceTB/SmolLM2-135M-Instruct" @@ -113,30 +172,55 @@ def model_and_tokenizer(): @pytest.mark.slow @requires_cuda @pytest.mark.parametrize( - "use_flash_attention, use_bf16, use_batch_size2, max_new_tokens, min_matching_tokens", + "use_flash_attention, use_bf16, max_new_tokens, min_matching_tokens_batch_seize_1, min_matching_tokens_batch_seize_2", [ # No flash attention + no bf16 - (False, False, False, 10, 10), - (False, False, True, 10, 10), + (False, False, 10, 10, 10), # No flash attention + with bf16 - (False, True, False, 10, 10), - (False, True, True, 10, 10), + (False, True, 10, 10, 10), # Flash attention must be paired with bf16 - (True, True, False, 10, 10), - (True, True, True, 10, 10), + (True, True, 10, 10, 10), ], ) def test_generate( - model_and_tokenizer, use_flash_attention, use_bf16, use_batch_size2, max_new_tokens, min_matching_tokens + model_and_tokenizer, + use_flash_attention, + use_bf16, + max_new_tokens, + min_matching_tokens_batch_seize_1, + min_matching_tokens_batch_seize_2, +): + model_path, tokenizer, fast_llm_checkpoint_format = model_and_tokenizer + hf_model = _get_hf_model(model_path, use_flash_attention, use_bf16) + fast_llm_model = _get_fast_llm_model(model_path, use_flash_attention, use_bf16, fast_llm_checkpoint_format) + + _test_for_batches( + hf_model, + fast_llm_model, + tokenizer, + max_new_tokens, + min_matching_tokens_batch_seize_1, + min_matching_tokens_batch_seize_2, + ) + + +def test_generate_from_model( + model_and_tokenizer, ): + max_new_tokens = 10 + min_matching_tokens_batch_seize_1 = 10 + min_matching_tokens_batch_seize_2 = 10 + + # Use flash attention for speed model_path, tokenizer, fast_llm_checkpoint_format = model_and_tokenizer - outputs = _generate_with_params( + hf_model = _get_hf_model(model_path, True, True) + fast_llm_model = _get_fast_llm_model_from_model(model_path, True, True, fast_llm_checkpoint_format) + + _test_for_batches( + hf_model, + fast_llm_model, tokenizer, - model_path, - use_flash_attention=use_flash_attention, - use_bf16=use_bf16, - use_batch_size2=use_batch_size2, - max_new_tokens=max_new_tokens, - 
fast_llm_checkpoint_format=fast_llm_checkpoint_format, + max_new_tokens, + min_matching_tokens_batch_seize_1, + min_matching_tokens_batch_seize_2, ) - _compare_gen_outputs(outputs, min_matching_tokens=min_matching_tokens) From e066170915079e5c4a71e22ef4a74b1fc41764ec Mon Sep 17 00:00:00 2001 From: bigximik Date: Tue, 13 May 2025 07:57:48 +0000 Subject: [PATCH 07/19] added decorators to the new test --- tests/test_gpt_generate.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_gpt_generate.py b/tests/test_gpt_generate.py index 15dbfd6d1..2a448d9ab 100644 --- a/tests/test_gpt_generate.py +++ b/tests/test_gpt_generate.py @@ -204,6 +204,8 @@ def test_generate( ) +@pytest.mark.slow +@requires_cuda def test_generate_from_model( model_and_tokenizer, ): From 24b3e1c6c78b4a7448f1d535dec84840d61675fc Mon Sep 17 00:00:00 2001 From: bigximik Date: Tue, 13 May 2025 08:40:46 +0000 Subject: [PATCH 08/19] added docs --- docs/recipes/generate.md | 78 ++++++++++++++++++++++++++++++++++++++++ mkdocs.yaml | 1 + 2 files changed, 79 insertions(+) create mode 100644 docs/recipes/generate.md diff --git a/docs/recipes/generate.md b/docs/recipes/generate.md new file mode 100644 index 000000000..e35d759d4 --- /dev/null +++ b/docs/recipes/generate.md @@ -0,0 +1,78 @@ +--- +title: How to Generate with a Fast-LLM Model +--- + +Fast-LLM models support `generate` and `forward` operations through Hugging Face–compatible wrappers. + +> ⚠️ Limitations: +> - No support for `cache`, `past_key_values`, `labels`, `attention` outputs, or `inputs_embeds` +> - `position_ids` are ignored and reconstructed from the attention mask +> - Only **data-parallel** generation is supported + +--- + +### 🔧 Generating Text from a Fast-LLM Model + +Below is a step-by-step example of how to generate text using a Fast-LLM model checkpoint from Hugging Face Hub. 
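+The example assumes a CUDA-capable GPU and network access to the Hugging Face Hub; the `HuggingFaceTB/SmolLM2-135M-Instruct` checkpoint is used here, but any Llama-style checkpoint loadable with `LlamaGPTHuggingfaceCheckpointFormat` should work the same way.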
+ +```python +# Import dependencies +import huggingface_hub +from transformers import AutoTokenizer +from fast_llm.engine.checkpoint.config import CheckpointLoadConfig +from fast_llm.models.gpt.config import LlamaGPTHuggingfaceCheckpointFormat +from fast_llm.models.gpt.huggingface import HuggingfaceGPTModelForCausalLM + +# Specify model and configuration +model = "HuggingFaceTB/SmolLM2-135M-Instruct" +checkpoint_format = LlamaGPTHuggingfaceCheckpointFormat +max_new_tokens = 50 + +# Download model checkpoint from the Hugging Face Hub to a local directory +model_path = huggingface_hub.snapshot_download(repo_id=model, local_dir="/tmp") + +# Load tokenizer from the downloaded model +tokenizer = AutoTokenizer.from_pretrained(model_path) + +# Optional: updates to Fast-LLM config before loading the model +updates = { + ("base_model", "transformer", "use_flash_attention"): True, + ("distributed", "training_dtype"): "bf16" +} + +# Load the model from the checkpoint with the given configuration +model = HuggingfaceGPTModelForCausalLM.from_pretrained( + CheckpointLoadConfig( + path=model_path, + format=checkpoint_format, + model_weights=True, + ), + updates, +) + +# Example input messages formatted for chat-style generation +messages = [ + {"role": "user", "content": "What is gravity?"}, + {"role": "user", "content": "Who is the president of EU?"}, +] + +# Convert messages into model input format using chat template +input_text = [tokenizer.apply_chat_template([el], tokenize=False) for el in messages] + +# Prepare tokenized input for the model +tokenizer.padding_side = "left" # Important for correct padding +inputs = tokenizer(input_text, padding="longest", return_tensors="pt").to("cuda") + +# Generate text using the model +outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, use_cache=False) + +# Decode and display outputs +outputs = [tokenizer.decode(el, skip_special_tokens=True) for el in outputs] + +print("--------------------------------------------------------------------") +for el in outputs: + print(el) + print("--------------------------------------------------------------------") +``` + + diff --git a/mkdocs.yaml b/mkdocs.yaml index a080bc83f..ab71bc231 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -173,6 +173,7 @@ nav: - Continue training a model: recipes/continue-training.md - Upcycle Llama 3B to MoE: recipes/upcycle-llama-3b-to-moe.md - Instruction Finetuning: recipes/instruction-finetuning.md + - Generate: recipes/generate.md - Reference: - User Guide: - Configuration: user_guide/configuration.md From a574f946ec9bbd508046e5766940bae17e81eca2 Mon Sep 17 00:00:00 2001 From: bigximik Date: Tue, 13 May 2025 08:57:10 +0000 Subject: [PATCH 09/19] fixed typo --- tests/test_gpt_generate.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/tests/test_gpt_generate.py b/tests/test_gpt_generate.py index 2a448d9ab..c1c924b94 100644 --- a/tests/test_gpt_generate.py +++ b/tests/test_gpt_generate.py @@ -138,8 +138,8 @@ def _test_for_batches( fast_llm_model, tokenizer, max_new_tokens, - min_matching_tokens_batch_seize_1, - min_matching_tokens_batch_seize_2, + min_matching_tokens_batch_size_1, + min_matching_tokens_batch_size_2, ): inputs = _prepare_data(tokenizer, use_batch_size2=False) outputs = _generate( @@ -148,7 +148,7 @@ def _test_for_batches( fast_llm_model, max_new_tokens=max_new_tokens, ) - _compare_gen_outputs(outputs, min_matching_tokens=min_matching_tokens_batch_seize_1) + _compare_gen_outputs(outputs, 
min_matching_tokens=min_matching_tokens_batch_size_1) inputs = _prepare_data(tokenizer, use_batch_size2=True) outputs = _generate( @@ -157,7 +157,7 @@ def _test_for_batches( fast_llm_model, max_new_tokens=max_new_tokens, ) - _compare_gen_outputs(outputs, min_matching_tokens=min_matching_tokens_batch_seize_2) + _compare_gen_outputs(outputs, min_matching_tokens=min_matching_tokens_batch_size_2) @pytest.fixture(scope="module") @@ -172,7 +172,7 @@ def model_and_tokenizer(): @pytest.mark.slow @requires_cuda @pytest.mark.parametrize( - "use_flash_attention, use_bf16, max_new_tokens, min_matching_tokens_batch_seize_1, min_matching_tokens_batch_seize_2", + "use_flash_attention, use_bf16, max_new_tokens, min_matching_tokens_batch_size_1, min_matching_tokens_batch_size_2", [ # No flash attention + no bf16 (False, False, 10, 10, 10), @@ -187,8 +187,8 @@ def test_generate( use_flash_attention, use_bf16, max_new_tokens, - min_matching_tokens_batch_seize_1, - min_matching_tokens_batch_seize_2, + min_matching_tokens_batch_size_1, + min_matching_tokens_batch_size_2, ): model_path, tokenizer, fast_llm_checkpoint_format = model_and_tokenizer hf_model = _get_hf_model(model_path, use_flash_attention, use_bf16) @@ -199,8 +199,8 @@ def test_generate( fast_llm_model, tokenizer, max_new_tokens, - min_matching_tokens_batch_seize_1, - min_matching_tokens_batch_seize_2, + min_matching_tokens_batch_size_1, + min_matching_tokens_batch_size_2, ) @@ -210,8 +210,8 @@ def test_generate_from_model( model_and_tokenizer, ): max_new_tokens = 10 - min_matching_tokens_batch_seize_1 = 10 - min_matching_tokens_batch_seize_2 = 10 + min_matching_tokens_batch_size_1 = 10 + min_matching_tokens_batch_size_2 = 10 # Use flash attention for speed model_path, tokenizer, fast_llm_checkpoint_format = model_and_tokenizer @@ -223,6 +223,6 @@ def test_generate_from_model( fast_llm_model, tokenizer, max_new_tokens, - min_matching_tokens_batch_seize_1, - min_matching_tokens_batch_seize_2, + min_matching_tokens_batch_size_1, + min_matching_tokens_batch_size_2, ) From 2094b603742a973a5fed4cc71a41a5c79f597429 Mon Sep 17 00:00:00 2001 From: Denis Kocetkov Date: Tue, 13 May 2025 17:10:12 +0300 Subject: [PATCH 10/19] docs updates --- docs/recipes/generate.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/recipes/generate.md b/docs/recipes/generate.md index e35d759d4..a017fcbb4 100644 --- a/docs/recipes/generate.md +++ b/docs/recipes/generate.md @@ -4,10 +4,11 @@ title: How to Generate with a Fast-LLM Model Fast-LLM models support `generate` and `forward` operations through Hugging Face–compatible wrappers. 
-> ⚠️ Limitations: -> - No support for `cache`, `past_key_values`, `labels`, `attention` outputs, or `inputs_embeds` -> - `position_ids` are ignored and reconstructed from the attention mask -> - Only **data-parallel** generation is supported +⚠️ Limitations: + +- No support for `cache`, `past_key_values`, `labels`, `attention` outputs, or `inputs_embeds` +- `position_ids` are ignored and reconstructed from the attention mask +- **model-parallel** and **sequence-data-parallel** generation is **not** supported --- From 196e73bfb6d0325731b0de29ce50ee39128d5f41 Mon Sep 17 00:00:00 2001 From: Denis Kocetkov Date: Tue, 13 May 2025 17:10:51 +0300 Subject: [PATCH 11/19] cairosvg downgrade --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index 9b944b27f..2e3b549fc 100644 --- a/setup.cfg +++ b/setup.cfg @@ -65,6 +65,7 @@ DOCS = mkdocs-git-revision-date-localized-plugin pypandoc_binary mkdocs-bibtex + cairosvg==2.7.0 [options.entry_points] console_scripts = From 7fe218d70cee06f1ce7c9071ce7101cf955d9ded Mon Sep 17 00:00:00 2001 From: Denis Kocetkov Date: Tue, 13 May 2025 17:57:05 +0300 Subject: [PATCH 12/19] style filters applied --- docs/recipes/generate.md | 2 -- fast_llm/engine/inference/huggingface.py | 5 +---- fast_llm/engine/inference/runner.py | 1 - fast_llm/layers/transformer/preprocessing.py | 5 ++++- fast_llm/models/gpt/huggingface.py | 1 - tests/test_gpt_generate.py | 10 +++------- 6 files changed, 8 insertions(+), 16 deletions(-) diff --git a/docs/recipes/generate.md b/docs/recipes/generate.md index a017fcbb4..e6bda8031 100644 --- a/docs/recipes/generate.md +++ b/docs/recipes/generate.md @@ -75,5 +75,3 @@ for el in outputs: print(el) print("--------------------------------------------------------------------") ``` - - diff --git a/fast_llm/engine/inference/huggingface.py b/fast_llm/engine/inference/huggingface.py index ea8f134d6..2518ed96a 100644 --- a/fast_llm/engine/inference/huggingface.py +++ b/fast_llm/engine/inference/huggingface.py @@ -3,18 +3,15 @@ import typing import torch -import transformers.modeling_outputs import transformers.generation.utils +import transformers.modeling_outputs from fast_llm.engine.checkpoint.config import CheckpointLoadConfig, FastLLMCheckpointFormat from fast_llm.engine.inference.config import HuggingfaceModelConfig from fast_llm.engine.inference.runner import InferenceRunner from fast_llm.engine.multi_stage.config import StageMode from fast_llm.engine.multi_stage.fast_llm_model import FastLLMModel -from fast_llm.engine.schedule.config import BatchConfig, ScheduleConfig from fast_llm.engine.schedule.runner import ScheduleRunner -from fast_llm.engine.schedule.schedule import Schedule -from fast_llm.engine.training.config import TrainerConfig class HuggingfacePreTrainedModel(transformers.PreTrainedModel): diff --git a/fast_llm/engine/inference/runner.py b/fast_llm/engine/inference/runner.py index 6e8084601..df727288d 100644 --- a/fast_llm/engine/inference/runner.py +++ b/fast_llm/engine/inference/runner.py @@ -7,7 +7,6 @@ from fast_llm.engine.schedule.config import BatchConfig, ScheduleConfig from fast_llm.engine.schedule.runner import ScheduleRunner from fast_llm.engine.schedule.schedule import Schedule -from fast_llm.engine.training.config import TrainerConfig class InferenceRunner(abc.ABC): diff --git a/fast_llm/layers/transformer/preprocessing.py b/fast_llm/layers/transformer/preprocessing.py index 1e07c1c12..0697bd216 100644 --- a/fast_llm/layers/transformer/preprocessing.py +++ 
b/fast_llm/layers/transformer/preprocessing.py @@ -239,7 +239,10 @@ def preprocess(self, batch, kwargs: dict[str, typing.Any]) -> None: ] if (sequence_lengths := kwargs.get(TransformerKwargs.sequence_lengths, None)) is not None: seq_ids = torch.stack( - [torch.cat([torch.full((x,), i) for i, x in enumerate(sample_lens)]) for sample_lens in sequence_lengths] + [ + torch.cat([torch.full((x,), i) for i, x in enumerate(sample_lens)]) + for sample_lens in sequence_lengths + ] ) document_mask = (seq_ids[:, None, :] == seq_ids[:, :, None]).to(self._tensor_space.distributed.device) kwargs[TransformerKwargs.attention_mask] = ( diff --git a/fast_llm/models/gpt/huggingface.py b/fast_llm/models/gpt/huggingface.py index 500c454d9..cf7da3872 100644 --- a/fast_llm/models/gpt/huggingface.py +++ b/fast_llm/models/gpt/huggingface.py @@ -5,7 +5,6 @@ import torch import transformers.modeling_outputs - from fast_llm.data.data.gpt.data import GPTBatch from fast_llm.engine.distributed.config import PhaseType from fast_llm.engine.inference.config import HuggingfaceModelConfig diff --git a/tests/test_gpt_generate.py b/tests/test_gpt_generate.py index c1c924b94..ac2b6362d 100644 --- a/tests/test_gpt_generate.py +++ b/tests/test_gpt_generate.py @@ -1,9 +1,7 @@ +import huggingface_hub import pytest import torch - -import huggingface_hub - -from transformers import AutoTokenizer, AutoModelForCausalLM +from transformers import AutoModelForCausalLM, AutoTokenizer from fast_llm.engine.checkpoint.config import CheckpointLoadConfig from fast_llm.engine.distributed.distributed import Distributed @@ -11,9 +9,7 @@ from fast_llm.engine.schedule.runner import ScheduleRunner from fast_llm.models.gpt.config import LlamaGPTHuggingfaceCheckpointFormat, PretrainedGPTModelConfig from fast_llm.models.gpt.huggingface import HuggingfaceGPTModelForCausalLM - - -from tests.common import requires_cuda, TEST_RESULTS_PATH +from tests.common import TEST_RESULTS_PATH, requires_cuda def _prepare_checkpoint(model: str) -> str: From 5325b50770a064d9f6582e11d8bf050cd0c9d416 Mon Sep 17 00:00:00 2001 From: bigximik Date: Fri, 16 May 2025 11:29:49 +0000 Subject: [PATCH 13/19] changed of handling of unwanted config deepcopy --- fast_llm/engine/inference/config.py | 20 ++++++++++++++++++-- fast_llm/engine/inference/huggingface.py | 6 ------ 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/fast_llm/engine/inference/config.py b/fast_llm/engine/inference/config.py index c18daa48d..b09c88baf 100644 --- a/fast_llm/engine/inference/config.py +++ b/fast_llm/engine/inference/config.py @@ -1,3 +1,4 @@ +import copy import logging import os import pathlib @@ -36,6 +37,22 @@ def save_pretrained(self, save_directory: str | os.PathLike, push_to_hub: bool = finally: transformers.configuration_utils.CONFIG_NAME = _backup + def __deepcopy__(self, memo): + # Hugging Face's PretrainedModel will deep copy the config + # when `generate` is enabled. However, `fast_llm_config` + # cannot be deep copied if the world size is greater than 1, + # as it will contain references to process groups. + # Therefore, we copy it by reference instead. 
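+        # The rest follows the standard __deepcopy__ protocol: the new instance is registered
+        # in `memo` before its attributes are copied so that cyclic references resolve correctly.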
+ cls = self.__class__ + copied = cls.__new__(cls) + memo[id(self)] = copied + for k, v in self.__dict__.items(): + if k == "fast_llm_config": + setattr(copied, k, v) # Keep the same reference + else: + setattr(copied, k, copy.deepcopy(v, memo)) + return copied + @classmethod def _get_config_dict( cls, pretrained_model_name_or_path: str | os.PathLike | CheckpointLoadMetadataConfig, **kwargs @@ -91,8 +108,7 @@ def __eq__(self, other) -> bool: def to_dict(self) -> dict[str, typing.Any]: out = super().to_dict() - if self.fast_llm_config is not None: - out["fast_llm_config"] = self.fast_llm_config.to_dict(verbose=FieldVerboseLevel.everything) + out["fast_llm_config"] = self.fast_llm_config.to_dict(verbose=FieldVerboseLevel.everything) return out def to_diff_dict(self) -> dict[str, typing.Any]: diff --git a/fast_llm/engine/inference/huggingface.py b/fast_llm/engine/inference/huggingface.py index 2518ed96a..2e219e076 100644 --- a/fast_llm/engine/inference/huggingface.py +++ b/fast_llm/engine/inference/huggingface.py @@ -37,13 +37,7 @@ def __init__( assert config.fast_llm_config is fast_llm_model.config assert isinstance(config, self.config_class) - # The HF constructor performs a deep copy of the config, - # but config.fast_llm_config may contain non-picklable items like process groups. - # Temporarily remove it before the call and restore it afterward. - fast_llm_config = config.fast_llm_config - config.fast_llm_config = None super().__init__(config, **kwargs) - config.fast_llm_config = fast_llm_config self._inference_runner = self.runner_class(fast_llm_model, runner) From 277892559d86108895e30b68285983c18c457d43 Mon Sep 17 00:00:00 2001 From: bigximik Date: Fri, 16 May 2025 11:36:05 +0000 Subject: [PATCH 14/19] moved forward declaration to the right class --- fast_llm/engine/inference/huggingface.py | 32 +++++++++++------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/fast_llm/engine/inference/huggingface.py b/fast_llm/engine/inference/huggingface.py index 2e219e076..0645eef4b 100644 --- a/fast_llm/engine/inference/huggingface.py +++ b/fast_llm/engine/inference/huggingface.py @@ -59,22 +59,6 @@ def __init__( with transformers.modeling_utils.no_init_weights(): self.post_init() - def forward( - self, - input_ids: torch.Tensor | None = None, - attention_mask: torch.Tensor | None = None, - position_ids: torch.Tensor | None = None, - past_key_values=None, - inputs_embeds: torch.FloatTensor | None = None, - labels: torch.LongTensor | None = None, - use_cache: bool | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, - ) -> tuple | transformers.modeling_outputs.CausalLMOutputWithPast: - # Meant to be overridden in derived classes - raise NotImplementedError() - @classmethod def from_pretrained( cls, @@ -113,4 +97,18 @@ def _init_weights(self, module) -> None: class HuggingfaceBaseModelForCausalLM(HuggingfacePreTrainedModel, transformers.generation.utils.GenerationMixin): - pass + def forward( + self, + input_ids: torch.Tensor | None = None, + attention_mask: torch.Tensor | None = None, + position_ids: torch.Tensor | None = None, + past_key_values=None, + inputs_embeds: torch.FloatTensor | None = None, + labels: torch.LongTensor | None = None, + use_cache: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + ) -> tuple | transformers.modeling_outputs.CausalLMOutputWithPast: + # Meant to be overridden in derived 
classes + raise NotImplementedError() From 4829b0d66ca28db10f5ba7d7e17904d5e52e289f Mon Sep 17 00:00:00 2001 From: bigximik Date: Fri, 16 May 2025 11:47:23 +0000 Subject: [PATCH 15/19] changed asserts for clarity --- fast_llm/engine/inference/huggingface.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fast_llm/engine/inference/huggingface.py b/fast_llm/engine/inference/huggingface.py index 0645eef4b..e679cfd6f 100644 --- a/fast_llm/engine/inference/huggingface.py +++ b/fast_llm/engine/inference/huggingface.py @@ -12,6 +12,7 @@ from fast_llm.engine.multi_stage.config import StageMode from fast_llm.engine.multi_stage.fast_llm_model import FastLLMModel from fast_llm.engine.schedule.runner import ScheduleRunner +from fast_llm.utils import Assert class HuggingfacePreTrainedModel(transformers.PreTrainedModel): @@ -46,10 +47,8 @@ def __init__( assert fast_llm_model.is_setup # We only support data parallel for now - assert ( - fast_llm_model.distributed.config.model_parallel == 1 - and fast_llm_model.distributed.config.sequence_data_parallel == 1 - ) + Assert.eq(fast_llm_model.distributed.config.model_parallel, 1) + Assert.eq(fast_llm_model.distributed.config.sequence_data_parallel, 1) self._inference_runner.setup() From 1aa434458b95a9f40ab6501e437cf7af04c789f7 Mon Sep 17 00:00:00 2001 From: bigximik Date: Fri, 16 May 2025 11:47:54 +0000 Subject: [PATCH 16/19] Added assert as fail safe --- fast_llm/engine/inference/runner.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fast_llm/engine/inference/runner.py b/fast_llm/engine/inference/runner.py index df727288d..3003c5f9d 100644 --- a/fast_llm/engine/inference/runner.py +++ b/fast_llm/engine/inference/runner.py @@ -7,6 +7,7 @@ from fast_llm.engine.schedule.config import BatchConfig, ScheduleConfig from fast_llm.engine.schedule.runner import ScheduleRunner from fast_llm.engine.schedule.schedule import Schedule +from fast_llm.utils import Assert class InferenceRunner(abc.ABC): @@ -58,6 +59,9 @@ def fast_llm_model(self) -> FastLLMModel: def setup(self): if not self._runner._is_setup: self._runner.setup(self._fast_llm_model.distributed) + else: + # Means external runner was passed, check it has the same distributed class as the model + Assert.is_(self._runner._distributed, self._fast_llm_model.distributed) def forward( self, input_, kwargs: dict, *, iteration: int = 1, return_metrics: bool = False From 2b5fda6762733a3c40994a90ccaa0a9db8311917 Mon Sep 17 00:00:00 2001 From: bigximik Date: Fri, 16 May 2025 15:55:23 +0000 Subject: [PATCH 17/19] changes fixes and added test for return of hidden_states --- fast_llm/engine/multi_stage/stage.py | 12 ++++----- fast_llm/layers/language_model/head.py | 16 +++++++++--- ...te.py => test_gpt_generate_and_forward.py} | 26 +++++++++++++++++++ 3 files changed, 43 insertions(+), 11 deletions(-) rename tests/{test_gpt_generate.py => test_gpt_generate_and_forward.py} (87%) diff --git a/fast_llm/engine/multi_stage/stage.py b/fast_llm/engine/multi_stage/stage.py index a3ac98e8b..280c5c235 100644 --- a/fast_llm/engine/multi_stage/stage.py +++ b/fast_llm/engine/multi_stage/stage.py @@ -14,7 +14,7 @@ from fast_llm.utils import Assert if typing.TYPE_CHECKING: - from fast_llm.core.distributed import ProcessGroup + pass logger = logging.getLogger(__name__) @@ -123,12 +123,10 @@ def forward( if output is not None: meta = self._meta_outputs[i] output_global, _ = meta.local_to_global(output.detach(), distributed=self._distributed) - else: - output_global = None - 
kwargs["hidden_states"][self._layer_range[i]] = { - "layer_type": type(layer).__name__, - "tensor": output_global, - } + kwargs["hidden_states"][self._layer_range[i]] = { + "layer_type": type(layer).__name__, + "tensor": output_global, + } return None if output is None else output.detach(), (input_, output) def backward( diff --git a/fast_llm/layers/language_model/head.py b/fast_llm/layers/language_model/head.py index cc108cee4..d6d1b8a54 100644 --- a/fast_llm/layers/language_model/head.py +++ b/fast_llm/layers/language_model/head.py @@ -5,7 +5,7 @@ from torch.distributed import all_reduce from fast_llm.config import Configurable -from fast_llm.core.ops import gather_op, split_op +from fast_llm.core.ops import split_op from fast_llm.engine.base_model.base_model import Layer from fast_llm.engine.config_utils.tensor_space import DefaultDimNames, TensorDim, TensorSpace from fast_llm.engine.distributed.config import DistributedDimNames @@ -183,9 +183,17 @@ def _forward_backward( if "output_hidden_states" in kwargs and kwargs["output_hidden_states"]: # The last hidden layer output is returned normalized in the HF Transformers-style output, at least for LLama style models. # So, if needed, we gather the data after normalization and set it as the output of the previous layer. - group = self._tensor_space.distributed.tensor_group if self._parallel_embeddings else None - sequence_parallel = self._sequence_parallel and self._parallel_embeddings - hidden_state = gather_op(ln_output.detach(), group, dim=0) if sequence_parallel else ln_output.detach() + dims = list(kwargs[TransformerKwargs.hidden_dims]) + sequence_index = 1 - int(kwargs[TransformerKwargs.sequence_first]) + dims[sequence_index] = ( + TensorDim( + TransformerDimNames.sequence_q_tp, dims[sequence_index].global_size, DistributedDimNames.tensor + ) + if self._sequence_parallel_logits + else TensorDim(TransformerDimNames.sequence_q, dims[sequence_index].global_size) + ) + meta = TensorMeta.from_dims(tuple(dims), tensor_name="transformer hidden_state", dtype=ln_output.dtype) + hidden_state, _ = meta.local_to_global(ln_output.detach(), distributed=self._tensor_space.distributed) kwargs["hidden_states"][len(kwargs["hidden_states"]) - 1]["tensor"] = hidden_state grad_output = kwargs[TransformerKwargs.grad_output] / ( diff --git a/tests/test_gpt_generate.py b/tests/test_gpt_generate_and_forward.py similarity index 87% rename from tests/test_gpt_generate.py rename to tests/test_gpt_generate_and_forward.py index ac2b6362d..1a3f3a35b 100644 --- a/tests/test_gpt_generate.py +++ b/tests/test_gpt_generate_and_forward.py @@ -222,3 +222,29 @@ def test_generate_from_model( min_matching_tokens_batch_size_1, min_matching_tokens_batch_size_2, ) + + +@pytest.mark.slow +@requires_cuda +def test_forward_return_hidden_states( + model_and_tokenizer, +): + # Use flash attention for speed + model_path, tokenizer, fast_llm_checkpoint_format = model_and_tokenizer + fast_llm_model = _get_fast_llm_model(model_path, True, True, fast_llm_checkpoint_format) + + inputs_ids = torch.randint(1, tokenizer.vocab_size, [1, 10], dtype=torch.int64).cuda() + + # TODO: hidden states have differences between HF and Fast-LLM despite resulting in the similar logits, + # decide if to leave as it. 
+ # hf_model = _get_hf_model(model_path, True, True) + # res_hf = hf_model.forward(input_ids=inputs_ids, output_hidden_states=True, return_dict=True, use_cache=False) + + res_fast_llm = fast_llm_model.forward( + input_ids=inputs_ids, output_hidden_states=True, return_dict=True, use_cache=False + ) + + # hidden_states include embeddings layer + assert ( + len(res_fast_llm.hidden_states) - 1 == fast_llm_model.config.fast_llm_config.base_model.transformer.num_layers + ) From 77e7cbc38c9f32bbfa856420a143c3f1de8b9b3a Mon Sep 17 00:00:00 2001 From: bigximik Date: Mon, 19 May 2025 06:50:26 +0000 Subject: [PATCH 18/19] added extra slow tests mark --- tests/conftest.py | 14 ++++++++++++++ tests/test_gpt_generate_and_forward.py | 6 +++--- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 445f59bb1..1c718c21e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,10 +3,19 @@ def pytest_addoption(parser): parser.addoption("--skip-slow", action="store_true") + parser.addoption( + "--run-extra-slow", + action="store_true", + default=False, + help="Run tests marked as extra_slow", + ) def pytest_configure(config): config.addinivalue_line("markers", "slow: Test is slow.") + config.addinivalue_line( + "markers", "extra_slow: Mark test as extra slow and skip unless --run-extra-slow is given." + ) def pytest_collection_modifyitems(config, items): @@ -15,3 +24,8 @@ def pytest_collection_modifyitems(config, items): for item in items: if "slow" in item.keywords: item.add_marker(skip_slow) + if not config.getoption("--run-extra-slow"): + skip_extra_slow = pytest.mark.skip(reason="need --run-extra-slow option to run") + for item in items: + if "extra_slow" in item.keywords: + item.add_marker(skip_extra_slow) diff --git a/tests/test_gpt_generate_and_forward.py b/tests/test_gpt_generate_and_forward.py index 1a3f3a35b..730d1f4b9 100644 --- a/tests/test_gpt_generate_and_forward.py +++ b/tests/test_gpt_generate_and_forward.py @@ -165,7 +165,7 @@ def model_and_tokenizer(): return model_path, tokenizer, fast_llm_checkpoint_format -@pytest.mark.slow +@pytest.mark.extra_slow @requires_cuda @pytest.mark.parametrize( "use_flash_attention, use_bf16, max_new_tokens, min_matching_tokens_batch_size_1, min_matching_tokens_batch_size_2", @@ -200,7 +200,7 @@ def test_generate( ) -@pytest.mark.slow +@pytest.mark.extra_slow @requires_cuda def test_generate_from_model( model_and_tokenizer, @@ -224,7 +224,7 @@ def test_generate_from_model( ) -@pytest.mark.slow +@pytest.mark.extra_slow @requires_cuda def test_forward_return_hidden_states( model_and_tokenizer, From 4497436d8447f5e1537360c21778243165f153db Mon Sep 17 00:00:00 2001 From: bigximik Date: Mon, 19 May 2025 09:24:13 +0000 Subject: [PATCH 19/19] added faster tests with placeholder model --- tests/test_gpt_generate_and_forward.py | 163 ++++++++++++++++++++++--- 1 file changed, 143 insertions(+), 20 deletions(-) diff --git a/tests/test_gpt_generate_and_forward.py b/tests/test_gpt_generate_and_forward.py index 730d1f4b9..134b51e68 100644 --- a/tests/test_gpt_generate_and_forward.py +++ b/tests/test_gpt_generate_and_forward.py @@ -33,6 +33,22 @@ def _prepare_data(tokenizer, use_batch_size2: bool): return inputs +def _prepare_rand_data(vocab_size, use_batch_size2: bool): + inputs = torch.randint( + 1, + vocab_size, + [2 if use_batch_size2 else 1, 10], + dtype=torch.int64, + generator=torch.Generator().manual_seed(42), + ).cuda() + attention_mask = torch.ones_like(inputs) + # simulate left padding on one of the rows + if 
use_batch_size2: + inputs[1, :5] = 0 + attention_mask[1, :5] = 0 + return {"input_ids": inputs, "attention_mask": attention_mask} + + def _get_hf_model(model_path: str, use_flash_attention: bool, use_bf16: bool): hf_kwargs = {} if use_flash_attention: @@ -132,12 +148,15 @@ def _compare_gen_outputs(outputs: dict[str, list[torch.Tensor]], min_matching_to def _test_for_batches( hf_model, fast_llm_model, - tokenizer, max_new_tokens, min_matching_tokens_batch_size_1, min_matching_tokens_batch_size_2, + tokenizer=None, ): - inputs = _prepare_data(tokenizer, use_batch_size2=False) + if tokenizer is not None: + inputs = _prepare_data(tokenizer, use_batch_size2=False) + else: + inputs = _prepare_rand_data(fast_llm_model.config.fast_llm_config.base_model.vocab_size, use_batch_size2=False) outputs = _generate( inputs, hf_model, @@ -146,7 +165,10 @@ def _test_for_batches( ) _compare_gen_outputs(outputs, min_matching_tokens=min_matching_tokens_batch_size_1) - inputs = _prepare_data(tokenizer, use_batch_size2=True) + if tokenizer is not None: + inputs = _prepare_data(tokenizer, use_batch_size2=True) + else: + inputs = _prepare_rand_data(fast_llm_model.config.fast_llm_config.base_model.vocab_size, use_batch_size2=True) outputs = _generate( inputs, hf_model, @@ -165,6 +187,42 @@ def model_and_tokenizer(): return model_path, tokenizer, fast_llm_checkpoint_format +@pytest.fixture(scope="module") +def small_model(): + from .common import _CONFIGS, TEST_RESULTS_PATH, run_test_script + + _, _, _, common_config, fast_llm_checkpoint_format = _CONFIGS["llama"] + run_test_script( + f"test_llama_generate_and_forward", + common_config + + ["training.checkpoint.interval=1", "training.export.format=llama", "training.export.interval=1"], + ) + return TEST_RESULTS_PATH / "test_llama_generate_and_forward/export/llama/2", fast_llm_checkpoint_format + + +def _test_generate( + model_path, + fast_llm_checkpoint_format, + use_flash_attention, + use_bf16, + max_new_tokens, + min_matching_tokens_batch_size_1, + min_matching_tokens_batch_size_2, + tokenizer=None, +): + hf_model = _get_hf_model(model_path, use_flash_attention, use_bf16) + fast_llm_model = _get_fast_llm_model(model_path, use_flash_attention, use_bf16, fast_llm_checkpoint_format) + + _test_for_batches( + hf_model, + fast_llm_model, + max_new_tokens, + min_matching_tokens_batch_size_1, + min_matching_tokens_batch_size_2, + tokenizer=tokenizer, + ) + + @pytest.mark.extra_slow @requires_cuda @pytest.mark.parametrize( @@ -187,59 +245,108 @@ def test_generate( min_matching_tokens_batch_size_2, ): model_path, tokenizer, fast_llm_checkpoint_format = model_and_tokenizer - hf_model = _get_hf_model(model_path, use_flash_attention, use_bf16) - fast_llm_model = _get_fast_llm_model(model_path, use_flash_attention, use_bf16, fast_llm_checkpoint_format) - - _test_for_batches( - hf_model, - fast_llm_model, - tokenizer, + _test_generate( + model_path, + fast_llm_checkpoint_format, + use_flash_attention, + use_bf16, max_new_tokens, min_matching_tokens_batch_size_1, min_matching_tokens_batch_size_2, + tokenizer=tokenizer, ) -@pytest.mark.extra_slow +@pytest.mark.slow @requires_cuda -def test_generate_from_model( - model_and_tokenizer, +@pytest.mark.parametrize( + "use_flash_attention, use_bf16, max_new_tokens, min_matching_tokens_batch_size_1, min_matching_tokens_batch_size_2", + [ + # No flash attention + no bf16 + (False, False, 10, 10, 10), + # No flash attention + with bf16 + (False, True, 10, 10, 10), + # Flash attention must be paired with bf16 + (True, True, 10, 10, 10), + ], 
+) +def test_small_generate( + small_model, + use_flash_attention, + use_bf16, + max_new_tokens, + min_matching_tokens_batch_size_1, + min_matching_tokens_batch_size_2, ): + model_path, fast_llm_checkpoint_format = small_model + _test_generate( + model_path, + fast_llm_checkpoint_format, + use_flash_attention, + use_bf16, + max_new_tokens, + min_matching_tokens_batch_size_1, + min_matching_tokens_batch_size_2, + ) + + +def _test_generate_from_model(model_path, tokenizer, fast_llm_checkpoint_format): max_new_tokens = 10 min_matching_tokens_batch_size_1 = 10 min_matching_tokens_batch_size_2 = 10 # Use flash attention for speed - model_path, tokenizer, fast_llm_checkpoint_format = model_and_tokenizer hf_model = _get_hf_model(model_path, True, True) fast_llm_model = _get_fast_llm_model_from_model(model_path, True, True, fast_llm_checkpoint_format) _test_for_batches( hf_model, fast_llm_model, - tokenizer, max_new_tokens, min_matching_tokens_batch_size_1, min_matching_tokens_batch_size_2, + tokenizer=tokenizer, ) @pytest.mark.extra_slow @requires_cuda -def test_forward_return_hidden_states( +def test_generate_from_model( model_and_tokenizer, ): - # Use flash attention for speed model_path, tokenizer, fast_llm_checkpoint_format = model_and_tokenizer - fast_llm_model = _get_fast_llm_model(model_path, True, True, fast_llm_checkpoint_format) + _test_generate_from_model(model_path, tokenizer, fast_llm_checkpoint_format) - inputs_ids = torch.randint(1, tokenizer.vocab_size, [1, 10], dtype=torch.int64).cuda() +@pytest.mark.slow +@requires_cuda +def test_small_generate_from_model( + small_model, +): + model_path, fast_llm_checkpoint_format = small_model + _test_generate_from_model(model_path, None, fast_llm_checkpoint_format) + + +def _test_forward_return_hidden_states( + model_path, + fast_llm_checkpoint_format, + vocab_size: int | None = None, +): + # Use flash attention for speed # TODO: hidden states have differences between HF and Fast-LLM despite resulting in the similar logits, # decide if to leave as it. # hf_model = _get_hf_model(model_path, True, True) - # res_hf = hf_model.forward(input_ids=inputs_ids, output_hidden_states=True, return_dict=True, use_cache=False) + fast_llm_model = _get_fast_llm_model(model_path, True, True, fast_llm_checkpoint_format) + + inputs_ids = torch.randint( + 1, + fast_llm_model.config.fast_llm_config.base_model.vocab_size if vocab_size is None else vocab_size, + [1, 10], + dtype=torch.int64, + generator=torch.Generator().manual_seed(42), + ).cuda() + # res_hf = hf_model.forward(input_ids=inputs_ids, output_hidden_states=True, return_dict=True, use_cache=False) res_fast_llm = fast_llm_model.forward( input_ids=inputs_ids, output_hidden_states=True, return_dict=True, use_cache=False ) @@ -248,3 +355,19 @@ def test_forward_return_hidden_states( assert ( len(res_fast_llm.hidden_states) - 1 == fast_llm_model.config.fast_llm_config.base_model.transformer.num_layers ) + + +@pytest.mark.extra_slow +@requires_cuda +def test_forward_return_hidden_states( + model_and_tokenizer, +): + model_path, tokenizer, fast_llm_checkpoint_format = model_and_tokenizer + _test_forward_return_hidden_states(model_path, fast_llm_checkpoint_format, tokenizer.vocab_size) + + +@pytest.mark.slow +@requires_cuda +def test_small_forward_return_hidden_states(small_model): + model_path, fast_llm_checkpoint_format = small_model + _test_forward_return_hidden_states(model_path, fast_llm_checkpoint_format)
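
For context on what the new hidden-state tests in this series actually exercise, below is a minimal, hedged usage sketch of the same forward path. It mirrors test_small_forward_return_hidden_states: load a converted checkpoint through the test helper, run a single forward with output_hidden_states=True, and check that the returned hidden states cover the embedding output plus one entry per transformer layer. The checkpoint path, the choice of FastLLMCheckpointFormat, and importing the helper from the tests package are illustrative assumptions, not part of these patches; the format class must match how the checkpoint was exported, and a CUDA device is required as in the tests.

    import torch

    # Assumptions (not part of the patches): the tests package is importable and the
    # checkpoint at `model_path` was exported in a format compatible with the class below.
    from fast_llm.engine.checkpoint.config import FastLLMCheckpointFormat
    from tests.test_gpt_generate_and_forward import _get_fast_llm_model

    model_path = "/path/to/exported/checkpoint"  # placeholder path

    # Flash attention + bf16, matching the "use flash attention for speed" tests above.
    model = _get_fast_llm_model(model_path, True, True, FastLLMCheckpointFormat)

    # Random prompt of 10 tokens, seeded the same way as the placeholder-model tests.
    input_ids = torch.randint(
        1,
        model.config.fast_llm_config.base_model.vocab_size,
        [1, 10],
        dtype=torch.int64,
        generator=torch.Generator().manual_seed(42),
    ).cuda()

    out = model.forward(
        input_ids=input_ids, output_hidden_states=True, return_dict=True, use_cache=False
    )

    # hidden_states also contains the embedding-layer output, hence the extra entry.
    assert (
        len(out.hidden_states)
        == model.config.fast_llm_config.base_model.transformer.num_layers + 1
    )

With the markers added in this series, the fast placeholder-model variants of these tests run under the default slow tier, while the real-checkpoint variants are gated behind the new --run-extra-slow pytest option.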