From a46d581130718f394665ddfa7abb7633f3c3775d Mon Sep 17 00:00:00 2001 From: bigximik Date: Mon, 12 May 2025 07:50:46 +0000 Subject: [PATCH 01/19] copy from denis/generate of only generate support part, with some changes --- fast_llm/engine/inference/config.py | 3 +- fast_llm/engine/inference/huggingface.py | 102 ++++++++++++++++--- fast_llm/engine/inference/runner.py | 39 +++++-- fast_llm/engine/multi_stage/config.py | 4 +- fast_llm/engine/multi_stage/stage.py | 18 ++++ fast_llm/layers/language_model/head.py | 10 +- fast_llm/layers/transformer/preprocessing.py | 2 +- fast_llm/models/custom/config.py | 2 +- fast_llm/models/gpt/config.py | 2 +- fast_llm/models/gpt/huggingface.py | 58 ++++++++--- tests/test_checkpoint.py | 2 +- 11 files changed, 194 insertions(+), 48 deletions(-) diff --git a/fast_llm/engine/inference/config.py b/fast_llm/engine/inference/config.py index d4b46bcc0..c18daa48d 100644 --- a/fast_llm/engine/inference/config.py +++ b/fast_llm/engine/inference/config.py @@ -91,7 +91,8 @@ def __eq__(self, other) -> bool: def to_dict(self) -> dict[str, typing.Any]: out = super().to_dict() - out["fast_llm_config"] = self.fast_llm_config.to_dict(verbose=FieldVerboseLevel.everything) + if self.fast_llm_config is not None: + out["fast_llm_config"] = self.fast_llm_config.to_dict(verbose=FieldVerboseLevel.everything) return out def to_diff_dict(self) -> dict[str, typing.Any]: diff --git a/fast_llm/engine/inference/huggingface.py b/fast_llm/engine/inference/huggingface.py index 196310b4d..d0ec2bd53 100644 --- a/fast_llm/engine/inference/huggingface.py +++ b/fast_llm/engine/inference/huggingface.py @@ -2,13 +2,19 @@ import pathlib import typing +import torch import transformers.modeling_outputs +import transformers.generation.utils from fast_llm.engine.checkpoint.config import CheckpointLoadConfig, FastLLMCheckpointFormat from fast_llm.engine.inference.config import HuggingfaceModelConfig from fast_llm.engine.inference.runner import InferenceRunner from fast_llm.engine.multi_stage.config import StageMode from fast_llm.engine.multi_stage.fast_llm_model import FastLLMModel +from fast_llm.engine.schedule.config import BatchConfig, ScheduleConfig +from fast_llm.engine.schedule.runner import ScheduleRunner +from fast_llm.engine.schedule.schedule import Schedule +from fast_llm.engine.training.config import TrainerConfig class HuggingfacePreTrainedModel(transformers.PreTrainedModel): @@ -20,31 +26,91 @@ class HuggingfacePreTrainedModel(transformers.PreTrainedModel): # _supports_cache_class = False # _tied_weights_keys = [] - def __init__(self, config: HuggingfaceModelConfig, fast_llm_model: FastLLMModel, **kwargs): + def __init__( + self, + config: HuggingfaceModelConfig, + fast_llm_model: FastLLMModel, + micro_batch_size: int | None = None, + runner: ScheduleRunner | None = None, + **kwargs, + ): assert self.runner_class.model_class.config_class is config.model_config_class assert config.fast_llm_config is fast_llm_model.config assert isinstance(config, self.config_class) + # The HF constructor performs a deep copy of the config, + # but config.fast_llm_config may contain non-picklable items like process groups. + # Temporarily remove it before the call and restore it afterward. 
+ fast_llm_config = config.fast_llm_config + config.fast_llm_config = None super().__init__(config, **kwargs) + config.fast_llm_config = fast_llm_config + + self._inference_runner = self.runner_class(fast_llm_model, micro_batch_size, runner) + + # A model can be created from pretrained which set it up in the current HF wrapper api + # or set existing model which also must be setup, so, do not accept not setup model + assert fast_llm_model.is_setup + + # We only support data parallel for now + assert ( + fast_llm_model.distributed.config.model_parallel == 1 + and fast_llm_model.distributed.config.sequence_tensor_parallel == 1 + ) - self._inference_runner = self.runner_class(fast_llm_model) - if not fast_llm_model.is_setup: - fast_llm_model.setup(mode=StageMode.inference) self._inference_runner.setup() + # Transformers needs to be able to inspect the base model. self.fast_llm_base_model = fast_llm_model.base_model - # TODO: Support distributed models? - assert fast_llm_model.config.distributed.world_size == 1 + # # TODO: Support distributed models? + # assert fast_llm_model.config.distributed.world_size == 1 with transformers.modeling_utils.no_init_weights(): self.post_init() + def forward( + self, + input_ids: torch.Tensor | None = None, + attention_mask: torch.Tensor | None = None, + position_ids: torch.Tensor | None = None, + past_key_values=None, + inputs_embeds: torch.FloatTensor | None = None, + labels: torch.LongTensor | None = None, + use_cache: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + ) -> tuple | transformers.modeling_outputs.CausalLMOutputWithPast: + # Meant to be overridden in derived classes + raise NotImplementedError() + + @classmethod + def from_model( + cls, + fast_llm_model: FastLLMModel, + micro_batch_size: int | None = None, + runner: ScheduleRunner | None = None, + **kwargs, + ): + config = cls.config_class(fast_llm_model.config) + return cls( + config, + fast_llm_model, + micro_batch_size=micro_batch_size, + runner=runner, + **kwargs, + ) + @classmethod def from_pretrained( cls, pretrained_model_name_or_path: str | os.PathLike | CheckpointLoadConfig, - *, - mode: StageMode = StageMode.inference, + *updates: dict[str | tuple[str, ...], typing.Any], + optimizer_state_names: tuple[str, ...] | None = None, + # setup: bool = True, + mode: StageMode = StageMode.training, + use_cpu: bool = False, + stage_filter: set | None = None, **kwargs, ) -> typing.Self: # Pretrained config. 
@@ -54,18 +120,24 @@ def from_pretrained( format=FastLLMCheckpointFormat, ) - updates = {} - torch_dtype = kwargs.pop("torch_dtype", None) - if torch_dtype is not None: - updates[("distributed", "training_dtype")] = torch_dtype - # Create the model + # always set up model and crate distributed instance internally for now fast_llm_model = cls.runner_class.model_class.from_pretrained( - pretrained_model_name_or_path, updates, mode=mode + pretrained_model_name_or_path, + *updates, + optimizer_state_names=optimizer_state_names, + # setup=setup, + mode=mode, + use_cpu=use_cpu, + stage_filter=stage_filter, ) - config = cls.config_class(fast_llm_model.config) + config = cls.config_class(fast_llm_model.config) return cls(config, fast_llm_model, **kwargs) def _init_weights(self, module) -> None: raise NotImplementedError(module) + + +class HuggingfaceBaseModelForCausalLM(HuggingfacePreTrainedModel, transformers.generation.utils.GenerationMixin): + pass diff --git a/fast_llm/engine/inference/runner.py b/fast_llm/engine/inference/runner.py index 30f836b77..52eff82b6 100644 --- a/fast_llm/engine/inference/runner.py +++ b/fast_llm/engine/inference/runner.py @@ -7,27 +7,43 @@ from fast_llm.engine.schedule.config import BatchConfig, ScheduleConfig from fast_llm.engine.schedule.runner import ScheduleRunner from fast_llm.engine.schedule.schedule import Schedule +from fast_llm.engine.training.config import TrainerConfig class InferenceRunner(abc.ABC): model_class: typing.ClassVar[type[FastLLMModel]] = FastLLMModel batch_config_class: typing.ClassVar[type[BatchConfig]] = BatchConfig - def __init__(self, fast_llm_model: FastLLMModel): + def __init__( + self, + fast_llm_model: FastLLMModel, + micro_batch_size: int | None = None, + runner: ScheduleRunner | None = None, + ): assert isinstance(fast_llm_model, self.model_class) self._fast_llm_model = fast_llm_model - # We only need a basic schedule and don't care about dimensions. - self._schedule_config = ScheduleConfig() - # TODO: Sort things out. + with NoAutoValidate(): - self._batch_config = self.batch_config_class() + self._batch_config = self.batch_config_class(micro_batch_size=micro_batch_size) self._batch_config.setup(self._fast_llm_model.config.distributed) self._batch_config.validate() - self._runner = ScheduleRunner( - config=self._schedule_config, - multi_stage=self._fast_llm_model, - distributed_config=self._fast_llm_model.config.distributed, - ) + + if runner is None: + # We only need a basic schedule and don't care about dimensions. + self._schedule_config = ScheduleConfig() + # TODO: Sort things out. + + self._runner = ScheduleRunner( + config=self._schedule_config, + multi_stage=self._fast_llm_model, + distributed_config=self._fast_llm_model.config.distributed, + ) + else: + self._schedule_config = runner.config + self._runner = runner + # External runner from training loop must be already setup + assert runner._is_setup + # TODO: Random state? 
(Distributed.set_step) self._schedule = Schedule( multi_stage=self._fast_llm_model, @@ -42,7 +58,8 @@ def fast_llm_model(self) -> FastLLMModel: return self._fast_llm_model def setup(self): - self._runner.setup(self._fast_llm_model.distributed) + if not self._runner._is_setup: + self._runner.setup(self._fast_llm_model.distributed) def forward( self, input_, kwargs: dict, *, iteration: int = 1, return_metrics: bool = False diff --git a/fast_llm/engine/multi_stage/config.py b/fast_llm/engine/multi_stage/config.py index e2d04f80f..ae3abc706 100644 --- a/fast_llm/engine/multi_stage/config.py +++ b/fast_llm/engine/multi_stage/config.py @@ -30,7 +30,7 @@ from fast_llm.utils import Assert if typing.TYPE_CHECKING: - from fast_llm.engine.inference.huggingface import HuggingfacePreTrainedModel + from fast_llm.engine.inference.huggingface import HuggingfaceBaseModelForCausalLM from fast_llm.engine.multi_stage.fast_llm_model import FastLLMModel logger = logging.getLogger(__name__) @@ -247,7 +247,7 @@ def get_model_class(cls) -> type["FastLLMModel"]: raise NotImplementedError @classmethod - def get_huggingface_model_class(cls) -> type["HuggingfacePreTrainedModel"]: + def get_huggingface_model_for_causal_lm_class(cls) -> type["HuggingfaceBaseModelForCausalLM"]: raise NotImplementedError @classmethod diff --git a/fast_llm/engine/multi_stage/stage.py b/fast_llm/engine/multi_stage/stage.py index 675e878b3..a3ac98e8b 100644 --- a/fast_llm/engine/multi_stage/stage.py +++ b/fast_llm/engine/multi_stage/stage.py @@ -13,6 +13,9 @@ from fast_llm.tensor import ParameterMeta, TensorMeta, accumulate_gradient from fast_llm.utils import Assert +if typing.TYPE_CHECKING: + from fast_llm.core.distributed import ProcessGroup + logger = logging.getLogger(__name__) @@ -111,6 +114,21 @@ def forward( metrics, ) self._log_layer_forward(output, kwargs, i) + + # TODO: very slow and memory consuming, only use for debugging for now + # TODO: decide if and how we want to return + # HF transformer style details from forward properly + if "output_hidden_states" in kwargs and kwargs["output_hidden_states"]: + # Last layer does not provide output + if output is not None: + meta = self._meta_outputs[i] + output_global, _ = meta.local_to_global(output.detach(), distributed=self._distributed) + else: + output_global = None + kwargs["hidden_states"][self._layer_range[i]] = { + "layer_type": type(layer).__name__, + "tensor": output_global, + } return None if output is None else output.detach(), (input_, output) def backward( diff --git a/fast_llm/layers/language_model/head.py b/fast_llm/layers/language_model/head.py index 813dcc076..79c9f61b1 100644 --- a/fast_llm/layers/language_model/head.py +++ b/fast_llm/layers/language_model/head.py @@ -5,7 +5,7 @@ from torch.distributed import all_reduce from fast_llm.config import Configurable -from fast_llm.core.ops import split_op +from fast_llm.core.ops import gather_op, split_op from fast_llm.engine.base_model.base_model import Layer from fast_llm.engine.config_utils.tensor_space import DefaultDimNames, TensorDim, TensorSpace from fast_llm.engine.distributed.config import DistributedDimNames @@ -175,6 +175,14 @@ def _forward_backward( with torch.enable_grad(): ln_output = self.final_norm(input_) + if "output_hidden_states" in kwargs and kwargs["output_hidden_states"]: + # The last hidden layer output is returned normalized in the HF Transformers-style output, at least for LLama style models. + # So, if needed, we gather the data after normalization and set it as the output of the previous layer. 
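+            # When the output is sequence-parallel, each rank only holds its shard of the sequence (dim 0),
+            # so it is gathered across the tensor group before being exposed; otherwise the local tensor is used as-is.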
+ group = self._tensor_space.distributed.tensor_group if self._parallel_embeddings else None + sequence_parallel = self._sequence_parallel and self._parallel_embeddings + hidden_state = gather_op(ln_output.detach(), group, dim=0) if sequence_parallel else ln_output.detach() + kwargs["hidden_states"][len(kwargs["hidden_states"]) - 1]["tensor"] = hidden_state + grad_output = kwargs[TransformerKwargs.grad_output] / ( self._group_size if self._sequence_parallel_logits else 1 ) diff --git a/fast_llm/layers/transformer/preprocessing.py b/fast_llm/layers/transformer/preprocessing.py index 2415a2f91..1e07c1c12 100644 --- a/fast_llm/layers/transformer/preprocessing.py +++ b/fast_llm/layers/transformer/preprocessing.py @@ -239,7 +239,7 @@ def preprocess(self, batch, kwargs: dict[str, typing.Any]) -> None: ] if (sequence_lengths := kwargs.get(TransformerKwargs.sequence_lengths, None)) is not None: seq_ids = torch.stack( - [torch.cat([torch.arange(x) for x in sample_lens]) for sample_lens in sequence_lengths] + [torch.cat([torch.full((x,), i) for i, x in enumerate(sample_lens)]) for sample_lens in sequence_lengths] ) document_mask = (seq_ids[:, None, :] == seq_ids[:, :, None]).to(self._tensor_space.distributed.device) kwargs[TransformerKwargs.attention_mask] = ( diff --git a/fast_llm/models/custom/config.py b/fast_llm/models/custom/config.py index 8be45e1c2..f9805e643 100644 --- a/fast_llm/models/custom/config.py +++ b/fast_llm/models/custom/config.py @@ -35,7 +35,7 @@ def get_model_class(cls) -> type["CustomModel"]: return CustomModel @classmethod - def get_huggingface_model_class(cls) -> type["HuggingfaceCustomModelForCausalLM"]: + def get_huggingface_model_for_causal_lm_class(cls) -> type["HuggingfaceCustomModelForCausalLM"]: from fast_llm.models.custom.huggingface import HuggingfaceCustomModelForCausalLM return HuggingfaceCustomModelForCausalLM diff --git a/fast_llm/models/gpt/config.py b/fast_llm/models/gpt/config.py index 418f948e3..3852d83fc 100644 --- a/fast_llm/models/gpt/config.py +++ b/fast_llm/models/gpt/config.py @@ -147,7 +147,7 @@ def get_model_class(cls) -> type["GPTModel"]: return GPTModel @classmethod - def get_huggingface_model_class(cls) -> type["HuggingfaceGPTModelForCausalLM"]: + def get_huggingface_model_for_causal_lm_class(cls) -> type["HuggingfaceGPTModelForCausalLM"]: from fast_llm.models.gpt.huggingface import HuggingfaceGPTModelForCausalLM return HuggingfaceGPTModelForCausalLM diff --git a/fast_llm/models/gpt/huggingface.py b/fast_llm/models/gpt/huggingface.py index 0da4acbb4..7e668e735 100644 --- a/fast_llm/models/gpt/huggingface.py +++ b/fast_llm/models/gpt/huggingface.py @@ -5,10 +5,11 @@ import torch import transformers.modeling_outputs + from fast_llm.data.data.gpt.data import GPTBatch from fast_llm.engine.distributed.config import PhaseType from fast_llm.engine.inference.config import HuggingfaceModelConfig -from fast_llm.engine.inference.huggingface import HuggingfacePreTrainedModel +from fast_llm.engine.inference.huggingface import HuggingfaceBaseModelForCausalLM from fast_llm.layers.transformer.config import TransformerKwargs from fast_llm.models.gpt.config import GPTModelConfig from fast_llm.models.gpt.model import GPTBaseModel, GPTInferenceRunner @@ -22,7 +23,7 @@ class HuggingfaceGPTModelConfig(HuggingfaceModelConfig): fast_llm_config: GPTModelConfig -class HuggingfaceGPTModelForCausalLM(HuggingfacePreTrainedModel): +class HuggingfaceGPTModelForCausalLM(HuggingfaceBaseModelForCausalLM): config_class = HuggingfaceGPTModelConfig config: HuggingfaceGPTModelConfig 
runner_class: typing.ClassVar[type[GPTInferenceRunner]] = GPTInferenceRunner @@ -55,21 +56,33 @@ def forward( if output_attentions: raise NotImplementedError() - if output_hidden_states: - raise NotImplementedError() - if attention_mask is not None: - raise NotImplementedError() - if position_ids is not None: - raise NotImplementedError() if inputs_embeds is not None: raise NotImplementedError() if labels is not None: raise NotImplementedError() + # NOTE: We are ignoring position_ids as we reconstruct them from attention_mask via sequence_lenghts. + if attention_mask is not None: + + # First non zero indexes or zero index if the row is all zeros (invalid row) + first_non_zero_indexes = attention_mask.argmax(dim=1) + + # Check if the sequence is left-padded and if the remaining ones are continuous 1-ns + assert (attention_mask.sum(axis=1) == (attention_mask.shape[1] - first_non_zero_indexes)).all() + + sequence_lenghts = [ + torch.tensor( + [attention_mask.shape[1]] if el == 0 else [el, attention_mask.shape[1] - el], dtype=torch.int64 + ) + for el in first_non_zero_indexes.tolist() + ] + else: + sequence_lenghts = None + # Iteration serves as a random seed, using random module because it's not seeded by Fast LLM iteration = random.randint(0, 2**32) batch = self.fast_llm_base_model.preprocess( - GPTBatch(input_ids), phase=PhaseType.inference, iteration=iteration + GPTBatch(input_ids, sequence_lengths=sequence_lenghts), phase=PhaseType.inference, iteration=iteration ) ((input_, kwargs),) = batch @@ -82,23 +95,40 @@ def forward( # The transformers will save the present keys and values to this list. kwargs[TransformerKwargs.presents] = [] + if output_hidden_states: + kwargs["output_hidden_states"] = True + kwargs["hidden_states"] = {} + else: + kwargs["output_hidden_states"] = False + self._inference_runner.forward(input_, kwargs, iteration=iteration) # TODO: Make a proper way of returning the model output. 
logits = kwargs["logits"] + # TODO: convert hidden state form dict to list to be the same as with HFs + hidden_states = None + if output_hidden_states: + hidden_states = kwargs["hidden_states"] + if not return_dict: - outputs = (logits,) + # TODO: check hidden state go before past in the tuple + if output_hidden_states: + outputs = (logits, hidden_states) + else: + outputs = (logits,) + if use_cache: outputs += (kwargs[TransformerKwargs.presents],) return outputs return transformers.modeling_outputs.CausalLMOutputWithPast( logits=logits, + hidden_states=hidden_states, past_key_values=kwargs[TransformerKwargs.presents], ) - def prepare_inputs_for_generation( - self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - raise NotImplementedError() + # def prepare_inputs_for_generation( + # self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + # ): + # raise NotImplementedError() diff --git a/tests/test_checkpoint.py b/tests/test_checkpoint.py index 257947e96..91d629422 100644 --- a/tests/test_checkpoint.py +++ b/tests/test_checkpoint.py @@ -32,7 +32,7 @@ from tests.compare_tensor_logs import CompareConfig, compare_logged_tensor TEST_MODEL_CONFIG_CLS = model_registry[TEST_MODEL_TYPE] -TEST_MODEL_HF_CLS = TEST_MODEL_CONFIG_CLS.get_huggingface_model_class() +TEST_MODEL_HF_CLS = TEST_MODEL_CONFIG_CLS.get_huggingface_model_for_causal_lm_class() TEST_MODEL_CLS = TEST_MODEL_CONFIG_CLS.get_model_class() TEST_BASE_MODEL_CONFIG_CLS = TEST_MODEL_CONFIG_CLS.get_base_model_config_class() From 9a81f8906dfd9526bcacb431b396a2f207a9c9c7 Mon Sep 17 00:00:00 2001 From: bigximik Date: Mon, 12 May 2025 11:00:25 +0000 Subject: [PATCH 02/19] fix to use right config param --- fast_llm/engine/inference/huggingface.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fast_llm/engine/inference/huggingface.py b/fast_llm/engine/inference/huggingface.py index d0ec2bd53..3dc79aa0c 100644 --- a/fast_llm/engine/inference/huggingface.py +++ b/fast_llm/engine/inference/huggingface.py @@ -55,7 +55,7 @@ def __init__( # We only support data parallel for now assert ( fast_llm_model.distributed.config.model_parallel == 1 - and fast_llm_model.distributed.config.sequence_tensor_parallel == 1 + and fast_llm_model.distributed.config.sequence_data_parallel == 1 ) self._inference_runner.setup() From 667aacf88bf7d30545819fe839a2d04d26836fac Mon Sep 17 00:00:00 2001 From: bigximik Date: Mon, 12 May 2025 11:01:25 +0000 Subject: [PATCH 03/19] added basic generate tests --- tests/test_generate.py | 144 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 tests/test_generate.py diff --git a/tests/test_generate.py b/tests/test_generate.py new file mode 100644 index 000000000..a652a741f --- /dev/null +++ b/tests/test_generate.py @@ -0,0 +1,144 @@ +import pytest +import torch + +import huggingface_hub + +from transformers import AutoTokenizer, AutoModelForCausalLM + +from fast_llm.engine.checkpoint.config import CheckpointLoadConfig +from fast_llm.models.gpt.config import LlamaGPTHuggingfaceCheckpointFormat +from fast_llm.models.gpt.huggingface import HuggingfaceGPTModelForCausalLM + +from tests.common import requires_cuda, TEST_RESULTS_PATH + + +def _prepare_checkpoint(model: str) -> str: + path = TEST_RESULTS_PATH.resolve() / "generate/model" + model_path = huggingface_hub.snapshot_download(repo_id=model, local_dir=path) + return model_path + + +def _prepare_data(tokenizer, use_batch_size2: bool): + messages = [ + 
{"role": "user", "content": "What is gravity?"}, + {"role": "user", "content": "Who is the president of EU?"}, + ] + if not use_batch_size2: + messages = messages[0:1] + + input_text = [tokenizer.apply_chat_template([el], tokenize=False) for el in messages] + + tokenizer.padding_side = "left" + inputs = tokenizer(input_text, padding="longest", return_tensors="pt").to("cuda") + return inputs + + +def _get_hf_model(model_path: str, use_flash_attention: bool, use_bf16: bool): + hf_kwargs = {} + if use_flash_attention: + hf_kwargs["attn_implementation"] = "flash_attention_2" + hf_kwargs["torch_dtype"] = torch.bfloat16 + elif use_bf16: + hf_kwargs["torch_dtype"] = torch.bfloat16 + return AutoModelForCausalLM.from_pretrained(model_path, **hf_kwargs).to("cuda") + + +def _get_fast_llm_model( + model_path: str, use_flash_attention: bool, use_bf16: bool, checkpoint_format=LlamaGPTHuggingfaceCheckpointFormat +): + updates = {} + if use_flash_attention: + updates[("base_model", "transformer", "use_flash_attention")] = True + updates[("distributed", "training_dtype")] = "bf16" + else: + updates[("base_model", "transformer", "use_flash_attention")] = False + if use_bf16: + updates[("distributed", "training_dtype")] = "bf16" + return HuggingfaceGPTModelForCausalLM.from_pretrained( + CheckpointLoadConfig( + path=model_path, + format=checkpoint_format, + model_weights=True, + ), + updates, + ) + + +def _trim_output(output, inputs): + res = [] + for output_row, input_row in zip(output, inputs["input_ids"]): + res.append(output_row[len(input_row) :]) + return res + + +def _generate_with_params( + tokenizer, + model_path: str, + use_flash_attention: bool, + use_bf16: bool, + use_batch_size2: bool, + max_new_tokens: int, + fast_llm_checkpoint_format=LlamaGPTHuggingfaceCheckpointFormat, +): + inputs = _prepare_data(tokenizer, use_batch_size2) + + hf_model = _get_hf_model(model_path, use_flash_attention, use_bf16) + fast_llm_model = _get_fast_llm_model(model_path, use_flash_attention, use_bf16, fast_llm_checkpoint_format) + + return { + "hf": _trim_output(hf_model.generate(**inputs, max_new_tokens=max_new_tokens, use_cache=False), inputs), + "fast_llm": _trim_output( + fast_llm_model.generate(**inputs, max_new_tokens=max_new_tokens, use_cache=False), inputs + ), + } + + +def _compare_gen_outputs(outputs: dict[str, list], min_matching_tokens: int | None = None): + for hf_output, fast_llm_output in zip(outputs["hf"], outputs["fast_llm"]): + if min_matching_tokens is not None: + hf_output = hf_output[:min_matching_tokens] + fast_llm_output = fast_llm_output[:min_matching_tokens] + assert len(hf_output) == len(fast_llm_output) and all( + hf_char == fast_llm_char for hf_char, fast_llm_char in zip(hf_output, fast_llm_output) + ) + + +@pytest.fixture(scope="module") +def model_and_tokenizer(): + model = "HuggingFaceTB/SmolLM2-135M-Instruct" + fast_llm_checkpoint_format = LlamaGPTHuggingfaceCheckpointFormat + model_path = _prepare_checkpoint(model) + tokenizer = AutoTokenizer.from_pretrained(model_path) + return model_path, tokenizer, fast_llm_checkpoint_format + + +@pytest.mark.slow +@requires_cuda +@pytest.mark.parametrize( + "use_flash_attention, use_bf16, use_batch_size2, max_new_tokens, min_matching_tokens", + [ + # No flash attention + no bf16 + (False, False, False, 10, 10), + (False, False, True, 10, 10), + # No flash attention + with bf16 + (False, True, False, 10, 10), + (False, True, True, 10, 10), + # Flash attention must be paired with bf16 + (True, True, False, 10, 10), + (True, True, True, 10, 10), + ], 
+) +def test_generate( + model_and_tokenizer, use_flash_attention, use_bf16, use_batch_size2, max_new_tokens, min_matching_tokens +): + model_path, tokenizer, fast_llm_checkpoint_format = model_and_tokenizer + outputs = _generate_with_params( + tokenizer, + model_path, + use_flash_attention=use_flash_attention, + use_bf16=use_bf16, + use_batch_size2=use_batch_size2, + max_new_tokens=max_new_tokens, + fast_llm_checkpoint_format=fast_llm_checkpoint_format, + ) + _compare_gen_outputs(outputs, min_matching_tokens=min_matching_tokens) From 124cfa7f0ad79c3bf53e300030687b7e60ed7ef5 Mon Sep 17 00:00:00 2001 From: bigximik Date: Mon, 12 May 2025 11:16:06 +0000 Subject: [PATCH 04/19] clean up --- fast_llm/engine/inference/huggingface.py | 2 -- tests/test_generate.py | 6 ++---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/fast_llm/engine/inference/huggingface.py b/fast_llm/engine/inference/huggingface.py index 3dc79aa0c..ba2700180 100644 --- a/fast_llm/engine/inference/huggingface.py +++ b/fast_llm/engine/inference/huggingface.py @@ -62,8 +62,6 @@ def __init__( # Transformers needs to be able to inspect the base model. self.fast_llm_base_model = fast_llm_model.base_model - # # TODO: Support distributed models? - # assert fast_llm_model.config.distributed.world_size == 1 with transformers.modeling_utils.no_init_weights(): self.post_init() diff --git a/tests/test_generate.py b/tests/test_generate.py index a652a741f..07a8a917d 100644 --- a/tests/test_generate.py +++ b/tests/test_generate.py @@ -93,14 +93,12 @@ def _generate_with_params( } -def _compare_gen_outputs(outputs: dict[str, list], min_matching_tokens: int | None = None): +def _compare_gen_outputs(outputs: dict[str, list[torch.Tensor]], min_matching_tokens: int | None = None): for hf_output, fast_llm_output in zip(outputs["hf"], outputs["fast_llm"]): if min_matching_tokens is not None: hf_output = hf_output[:min_matching_tokens] fast_llm_output = fast_llm_output[:min_matching_tokens] - assert len(hf_output) == len(fast_llm_output) and all( - hf_char == fast_llm_char for hf_char, fast_llm_char in zip(hf_output, fast_llm_output) - ) + assert torch.equal(hf_output, fast_llm_output) @pytest.fixture(scope="module") From fcef337fac9a69d436a024655f725f0b3758737e Mon Sep 17 00:00:00 2001 From: bigximik Date: Tue, 13 May 2025 07:53:30 +0000 Subject: [PATCH 05/19] updated interface and clean up --- fast_llm/engine/inference/huggingface.py | 30 ++++++------------------ fast_llm/engine/inference/runner.py | 3 +-- fast_llm/models/gpt/huggingface.py | 10 ++------ 3 files changed, 10 insertions(+), 33 deletions(-) diff --git a/fast_llm/engine/inference/huggingface.py b/fast_llm/engine/inference/huggingface.py index ba2700180..ea8f134d6 100644 --- a/fast_llm/engine/inference/huggingface.py +++ b/fast_llm/engine/inference/huggingface.py @@ -28,12 +28,14 @@ class HuggingfacePreTrainedModel(transformers.PreTrainedModel): def __init__( self, - config: HuggingfaceModelConfig, fast_llm_model: FastLLMModel, - micro_batch_size: int | None = None, + config: HuggingfaceModelConfig | None = None, runner: ScheduleRunner | None = None, **kwargs, ): + if config is None: + config = self.config_class(fast_llm_model.config) + assert self.runner_class.model_class.config_class is config.model_config_class assert config.fast_llm_config is fast_llm_model.config assert isinstance(config, self.config_class) @@ -46,7 +48,7 @@ def __init__( super().__init__(config, **kwargs) config.fast_llm_config = fast_llm_config - self._inference_runner = 
self.runner_class(fast_llm_model, micro_batch_size, runner) + self._inference_runner = self.runner_class(fast_llm_model, runner) # A model can be created from pretrained which set it up in the current HF wrapper api # or set existing model which also must be setup, so, do not accept not setup model @@ -82,23 +84,6 @@ def forward( # Meant to be overridden in derived classes raise NotImplementedError() - @classmethod - def from_model( - cls, - fast_llm_model: FastLLMModel, - micro_batch_size: int | None = None, - runner: ScheduleRunner | None = None, - **kwargs, - ): - config = cls.config_class(fast_llm_model.config) - return cls( - config, - fast_llm_model, - micro_batch_size=micro_batch_size, - runner=runner, - **kwargs, - ) - @classmethod def from_pretrained( cls, @@ -124,14 +109,13 @@ def from_pretrained( pretrained_model_name_or_path, *updates, optimizer_state_names=optimizer_state_names, - # setup=setup, + setup=True, mode=mode, use_cpu=use_cpu, stage_filter=stage_filter, ) - config = cls.config_class(fast_llm_model.config) - return cls(config, fast_llm_model, **kwargs) + return cls(fast_llm_model, **kwargs) def _init_weights(self, module) -> None: raise NotImplementedError(module) diff --git a/fast_llm/engine/inference/runner.py b/fast_llm/engine/inference/runner.py index 52eff82b6..6e8084601 100644 --- a/fast_llm/engine/inference/runner.py +++ b/fast_llm/engine/inference/runner.py @@ -17,14 +17,13 @@ class InferenceRunner(abc.ABC): def __init__( self, fast_llm_model: FastLLMModel, - micro_batch_size: int | None = None, runner: ScheduleRunner | None = None, ): assert isinstance(fast_llm_model, self.model_class) self._fast_llm_model = fast_llm_model with NoAutoValidate(): - self._batch_config = self.batch_config_class(micro_batch_size=micro_batch_size) + self._batch_config = self.batch_config_class() self._batch_config.setup(self._fast_llm_model.config.distributed) self._batch_config.validate() diff --git a/fast_llm/models/gpt/huggingface.py b/fast_llm/models/gpt/huggingface.py index 7e668e735..500c454d9 100644 --- a/fast_llm/models/gpt/huggingface.py +++ b/fast_llm/models/gpt/huggingface.py @@ -61,9 +61,8 @@ def forward( if labels is not None: raise NotImplementedError() - # NOTE: We are ignoring position_ids as we reconstruct them from attention_mask via sequence_lenghts. + # NOTE: We are ignoring position_ids as we reconstruct them from attention_mask via sequence_lengths. 
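+        # For a left-padded batch, each row is split into [n_padding_tokens, n_real_tokens]
+        # (or a single [sequence_length] entry when there is no padding), so the padding ends up
+        # as its own "document" that the real tokens do not attend to.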
if attention_mask is not None: - # First non zero indexes or zero index if the row is all zeros (invalid row) first_non_zero_indexes = attention_mask.argmax(dim=1) @@ -112,7 +111,7 @@ def forward( hidden_states = kwargs["hidden_states"] if not return_dict: - # TODO: check hidden state go before past in the tuple + # TODO: Then implementing cache, check hidden state goes before past in the tuple if output_hidden_states: outputs = (logits, hidden_states) else: @@ -127,8 +126,3 @@ def forward( hidden_states=hidden_states, past_key_values=kwargs[TransformerKwargs.presents], ) - - # def prepare_inputs_for_generation( - # self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs - # ): - # raise NotImplementedError() From 47ad6d04b30aed767e3bc29b0294683c5ac88b8c Mon Sep 17 00:00:00 2001 From: bigximik Date: Tue, 13 May 2025 07:54:12 +0000 Subject: [PATCH 06/19] added from model case, renamed --- ...{test_generate.py => test_gpt_generate.py} | 142 ++++++++++++++---- 1 file changed, 113 insertions(+), 29 deletions(-) rename tests/{test_generate.py => test_gpt_generate.py} (54%) diff --git a/tests/test_generate.py b/tests/test_gpt_generate.py similarity index 54% rename from tests/test_generate.py rename to tests/test_gpt_generate.py index 07a8a917d..15dbfd6d1 100644 --- a/tests/test_generate.py +++ b/tests/test_gpt_generate.py @@ -6,9 +6,13 @@ from transformers import AutoTokenizer, AutoModelForCausalLM from fast_llm.engine.checkpoint.config import CheckpointLoadConfig -from fast_llm.models.gpt.config import LlamaGPTHuggingfaceCheckpointFormat +from fast_llm.engine.distributed.distributed import Distributed +from fast_llm.engine.schedule.config import ScheduleConfig +from fast_llm.engine.schedule.runner import ScheduleRunner +from fast_llm.models.gpt.config import LlamaGPTHuggingfaceCheckpointFormat, PretrainedGPTModelConfig from fast_llm.models.gpt.huggingface import HuggingfaceGPTModelForCausalLM + from tests.common import requires_cuda, TEST_RESULTS_PATH @@ -64,6 +68,42 @@ def _get_fast_llm_model( ) +def _get_fast_llm_model_from_model( + model_path: str, use_flash_attention: bool, use_bf16: bool, checkpoint_format=LlamaGPTHuggingfaceCheckpointFormat +): + updates = { + ("pretrained", "path"): model_path, + ("pretrained", "model_weights"): True, + ("pretrained", "format"): checkpoint_format.name, + } + + if use_flash_attention: + updates[("model", "base_model", "transformer", "use_flash_attention")] = True + updates[("model", "distributed", "training_dtype")] = "bf16" + else: + updates[("model", "base_model", "transformer", "use_flash_attention")] = False + if use_bf16: + updates[("model", "distributed", "training_dtype")] = "bf16" + + config = PretrainedGPTModelConfig.from_dict({}, updates) + multi_stage = config.model.get_model_class()(config.model) + schedule_config = ScheduleConfig() + runner = ScheduleRunner( + config=schedule_config, + multi_stage=multi_stage, + distributed_config=config.model.distributed, + ) + distributed = Distributed(config.model.distributed) + + with torch.no_grad(): + multi_stage.setup(distributed) + runner.setup(distributed) + + multi_stage.load_checkpoint(config.pretrained) + + return HuggingfaceGPTModelForCausalLM(multi_stage, runner=runner) + + def _trim_output(output, inputs): res = [] for output_row, input_row in zip(output, inputs["input_ids"]): @@ -71,20 +111,12 @@ def _trim_output(output, inputs): return res -def _generate_with_params( - tokenizer, - model_path: str, - use_flash_attention: bool, - use_bf16: bool, - 
use_batch_size2: bool, +def _generate( + inputs, + hf_model, + fast_llm_model, max_new_tokens: int, - fast_llm_checkpoint_format=LlamaGPTHuggingfaceCheckpointFormat, ): - inputs = _prepare_data(tokenizer, use_batch_size2) - - hf_model = _get_hf_model(model_path, use_flash_attention, use_bf16) - fast_llm_model = _get_fast_llm_model(model_path, use_flash_attention, use_bf16, fast_llm_checkpoint_format) - return { "hf": _trim_output(hf_model.generate(**inputs, max_new_tokens=max_new_tokens, use_cache=False), inputs), "fast_llm": _trim_output( @@ -101,6 +133,33 @@ def _compare_gen_outputs(outputs: dict[str, list[torch.Tensor]], min_matching_to assert torch.equal(hf_output, fast_llm_output) +def _test_for_batches( + hf_model, + fast_llm_model, + tokenizer, + max_new_tokens, + min_matching_tokens_batch_seize_1, + min_matching_tokens_batch_seize_2, +): + inputs = _prepare_data(tokenizer, use_batch_size2=False) + outputs = _generate( + inputs, + hf_model, + fast_llm_model, + max_new_tokens=max_new_tokens, + ) + _compare_gen_outputs(outputs, min_matching_tokens=min_matching_tokens_batch_seize_1) + + inputs = _prepare_data(tokenizer, use_batch_size2=True) + outputs = _generate( + inputs, + hf_model, + fast_llm_model, + max_new_tokens=max_new_tokens, + ) + _compare_gen_outputs(outputs, min_matching_tokens=min_matching_tokens_batch_seize_2) + + @pytest.fixture(scope="module") def model_and_tokenizer(): model = "HuggingFaceTB/SmolLM2-135M-Instruct" @@ -113,30 +172,55 @@ def model_and_tokenizer(): @pytest.mark.slow @requires_cuda @pytest.mark.parametrize( - "use_flash_attention, use_bf16, use_batch_size2, max_new_tokens, min_matching_tokens", + "use_flash_attention, use_bf16, max_new_tokens, min_matching_tokens_batch_seize_1, min_matching_tokens_batch_seize_2", [ # No flash attention + no bf16 - (False, False, False, 10, 10), - (False, False, True, 10, 10), + (False, False, 10, 10, 10), # No flash attention + with bf16 - (False, True, False, 10, 10), - (False, True, True, 10, 10), + (False, True, 10, 10, 10), # Flash attention must be paired with bf16 - (True, True, False, 10, 10), - (True, True, True, 10, 10), + (True, True, 10, 10, 10), ], ) def test_generate( - model_and_tokenizer, use_flash_attention, use_bf16, use_batch_size2, max_new_tokens, min_matching_tokens + model_and_tokenizer, + use_flash_attention, + use_bf16, + max_new_tokens, + min_matching_tokens_batch_seize_1, + min_matching_tokens_batch_seize_2, +): + model_path, tokenizer, fast_llm_checkpoint_format = model_and_tokenizer + hf_model = _get_hf_model(model_path, use_flash_attention, use_bf16) + fast_llm_model = _get_fast_llm_model(model_path, use_flash_attention, use_bf16, fast_llm_checkpoint_format) + + _test_for_batches( + hf_model, + fast_llm_model, + tokenizer, + max_new_tokens, + min_matching_tokens_batch_seize_1, + min_matching_tokens_batch_seize_2, + ) + + +def test_generate_from_model( + model_and_tokenizer, ): + max_new_tokens = 10 + min_matching_tokens_batch_seize_1 = 10 + min_matching_tokens_batch_seize_2 = 10 + + # Use flash attention for speed model_path, tokenizer, fast_llm_checkpoint_format = model_and_tokenizer - outputs = _generate_with_params( + hf_model = _get_hf_model(model_path, True, True) + fast_llm_model = _get_fast_llm_model_from_model(model_path, True, True, fast_llm_checkpoint_format) + + _test_for_batches( + hf_model, + fast_llm_model, tokenizer, - model_path, - use_flash_attention=use_flash_attention, - use_bf16=use_bf16, - use_batch_size2=use_batch_size2, - max_new_tokens=max_new_tokens, - 
fast_llm_checkpoint_format=fast_llm_checkpoint_format, + max_new_tokens, + min_matching_tokens_batch_seize_1, + min_matching_tokens_batch_seize_2, ) - _compare_gen_outputs(outputs, min_matching_tokens=min_matching_tokens) From e066170915079e5c4a71e22ef4a74b1fc41764ec Mon Sep 17 00:00:00 2001 From: bigximik Date: Tue, 13 May 2025 07:57:48 +0000 Subject: [PATCH 07/19] added decorators to the new test --- tests/test_gpt_generate.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_gpt_generate.py b/tests/test_gpt_generate.py index 15dbfd6d1..2a448d9ab 100644 --- a/tests/test_gpt_generate.py +++ b/tests/test_gpt_generate.py @@ -204,6 +204,8 @@ def test_generate( ) +@pytest.mark.slow +@requires_cuda def test_generate_from_model( model_and_tokenizer, ): From 24b3e1c6c78b4a7448f1d535dec84840d61675fc Mon Sep 17 00:00:00 2001 From: bigximik Date: Tue, 13 May 2025 08:40:46 +0000 Subject: [PATCH 08/19] added docs --- docs/recipes/generate.md | 78 ++++++++++++++++++++++++++++++++++++++++ mkdocs.yaml | 1 + 2 files changed, 79 insertions(+) create mode 100644 docs/recipes/generate.md diff --git a/docs/recipes/generate.md b/docs/recipes/generate.md new file mode 100644 index 000000000..e35d759d4 --- /dev/null +++ b/docs/recipes/generate.md @@ -0,0 +1,78 @@ +--- +title: How to Generate with a Fast-LLM Model +--- + +Fast-LLM models support `generate` and `forward` operations through Hugging Face–compatible wrappers. + +> ⚠️ Limitations: +> - No support for `cache`, `past_key_values`, `labels`, `attention` outputs, or `inputs_embeds` +> - `position_ids` are ignored and reconstructed from the attention mask +> - Only **data-parallel** generation is supported + +--- + +### 🔧 Generating Text from a Fast-LLM Model + +Below is a step-by-step example of how to generate text using a Fast-LLM model checkpoint from Hugging Face Hub. 
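+The example assumes a CUDA-capable GPU and network access to the Hugging Face Hub; the `HuggingFaceTB/SmolLM2-135M-Instruct` checkpoint is used here, but any Llama-style checkpoint loadable with `LlamaGPTHuggingfaceCheckpointFormat` should work the same way.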
+ +```python +# Import dependencies +import huggingface_hub +from transformers import AutoTokenizer +from fast_llm.engine.checkpoint.config import CheckpointLoadConfig +from fast_llm.models.gpt.config import LlamaGPTHuggingfaceCheckpointFormat +from fast_llm.models.gpt.huggingface import HuggingfaceGPTModelForCausalLM + +# Specify model and configuration +model = "HuggingFaceTB/SmolLM2-135M-Instruct" +checkpoint_format = LlamaGPTHuggingfaceCheckpointFormat +max_new_tokens = 50 + +# Download model checkpoint from the Hugging Face Hub to a local directory +model_path = huggingface_hub.snapshot_download(repo_id=model, local_dir="/tmp") + +# Load tokenizer from the downloaded model +tokenizer = AutoTokenizer.from_pretrained(model_path) + +# Optional: updates to Fast-LLM config before loading the model +updates = { + ("base_model", "transformer", "use_flash_attention"): True, + ("distributed", "training_dtype"): "bf16" +} + +# Load the model from the checkpoint with the given configuration +model = HuggingfaceGPTModelForCausalLM.from_pretrained( + CheckpointLoadConfig( + path=model_path, + format=checkpoint_format, + model_weights=True, + ), + updates, +) + +# Example input messages formatted for chat-style generation +messages = [ + {"role": "user", "content": "What is gravity?"}, + {"role": "user", "content": "Who is the president of EU?"}, +] + +# Convert messages into model input format using chat template +input_text = [tokenizer.apply_chat_template([el], tokenize=False) for el in messages] + +# Prepare tokenized input for the model +tokenizer.padding_side = "left" # Important for correct padding +inputs = tokenizer(input_text, padding="longest", return_tensors="pt").to("cuda") + +# Generate text using the model +outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, use_cache=False) + +# Decode and display outputs +outputs = [tokenizer.decode(el, skip_special_tokens=True) for el in outputs] + +print("--------------------------------------------------------------------") +for el in outputs: + print(el) + print("--------------------------------------------------------------------") +``` + + diff --git a/mkdocs.yaml b/mkdocs.yaml index a080bc83f..ab71bc231 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -173,6 +173,7 @@ nav: - Continue training a model: recipes/continue-training.md - Upcycle Llama 3B to MoE: recipes/upcycle-llama-3b-to-moe.md - Instruction Finetuning: recipes/instruction-finetuning.md + - Generate: recipes/generate.md - Reference: - User Guide: - Configuration: user_guide/configuration.md From a574f946ec9bbd508046e5766940bae17e81eca2 Mon Sep 17 00:00:00 2001 From: bigximik Date: Tue, 13 May 2025 08:57:10 +0000 Subject: [PATCH 09/19] fixed typo --- tests/test_gpt_generate.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/tests/test_gpt_generate.py b/tests/test_gpt_generate.py index 2a448d9ab..c1c924b94 100644 --- a/tests/test_gpt_generate.py +++ b/tests/test_gpt_generate.py @@ -138,8 +138,8 @@ def _test_for_batches( fast_llm_model, tokenizer, max_new_tokens, - min_matching_tokens_batch_seize_1, - min_matching_tokens_batch_seize_2, + min_matching_tokens_batch_size_1, + min_matching_tokens_batch_size_2, ): inputs = _prepare_data(tokenizer, use_batch_size2=False) outputs = _generate( @@ -148,7 +148,7 @@ def _test_for_batches( fast_llm_model, max_new_tokens=max_new_tokens, ) - _compare_gen_outputs(outputs, min_matching_tokens=min_matching_tokens_batch_seize_1) + _compare_gen_outputs(outputs, 
min_matching_tokens=min_matching_tokens_batch_size_1) inputs = _prepare_data(tokenizer, use_batch_size2=True) outputs = _generate( @@ -157,7 +157,7 @@ def _test_for_batches( fast_llm_model, max_new_tokens=max_new_tokens, ) - _compare_gen_outputs(outputs, min_matching_tokens=min_matching_tokens_batch_seize_2) + _compare_gen_outputs(outputs, min_matching_tokens=min_matching_tokens_batch_size_2) @pytest.fixture(scope="module") @@ -172,7 +172,7 @@ def model_and_tokenizer(): @pytest.mark.slow @requires_cuda @pytest.mark.parametrize( - "use_flash_attention, use_bf16, max_new_tokens, min_matching_tokens_batch_seize_1, min_matching_tokens_batch_seize_2", + "use_flash_attention, use_bf16, max_new_tokens, min_matching_tokens_batch_size_1, min_matching_tokens_batch_size_2", [ # No flash attention + no bf16 (False, False, 10, 10, 10), @@ -187,8 +187,8 @@ def test_generate( use_flash_attention, use_bf16, max_new_tokens, - min_matching_tokens_batch_seize_1, - min_matching_tokens_batch_seize_2, + min_matching_tokens_batch_size_1, + min_matching_tokens_batch_size_2, ): model_path, tokenizer, fast_llm_checkpoint_format = model_and_tokenizer hf_model = _get_hf_model(model_path, use_flash_attention, use_bf16) @@ -199,8 +199,8 @@ def test_generate( fast_llm_model, tokenizer, max_new_tokens, - min_matching_tokens_batch_seize_1, - min_matching_tokens_batch_seize_2, + min_matching_tokens_batch_size_1, + min_matching_tokens_batch_size_2, ) @@ -210,8 +210,8 @@ def test_generate_from_model( model_and_tokenizer, ): max_new_tokens = 10 - min_matching_tokens_batch_seize_1 = 10 - min_matching_tokens_batch_seize_2 = 10 + min_matching_tokens_batch_size_1 = 10 + min_matching_tokens_batch_size_2 = 10 # Use flash attention for speed model_path, tokenizer, fast_llm_checkpoint_format = model_and_tokenizer @@ -223,6 +223,6 @@ def test_generate_from_model( fast_llm_model, tokenizer, max_new_tokens, - min_matching_tokens_batch_seize_1, - min_matching_tokens_batch_seize_2, + min_matching_tokens_batch_size_1, + min_matching_tokens_batch_size_2, ) From 2094b603742a973a5fed4cc71a41a5c79f597429 Mon Sep 17 00:00:00 2001 From: Denis Kocetkov Date: Tue, 13 May 2025 17:10:12 +0300 Subject: [PATCH 10/19] docs updates --- docs/recipes/generate.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/recipes/generate.md b/docs/recipes/generate.md index e35d759d4..a017fcbb4 100644 --- a/docs/recipes/generate.md +++ b/docs/recipes/generate.md @@ -4,10 +4,11 @@ title: How to Generate with a Fast-LLM Model Fast-LLM models support `generate` and `forward` operations through Hugging Face–compatible wrappers. 
-> ⚠️ Limitations: -> - No support for `cache`, `past_key_values`, `labels`, `attention` outputs, or `inputs_embeds` -> - `position_ids` are ignored and reconstructed from the attention mask -> - Only **data-parallel** generation is supported +⚠️ Limitations: + +- No support for `cache`, `past_key_values`, `labels`, `attention` outputs, or `inputs_embeds` +- `position_ids` are ignored and reconstructed from the attention mask +- **model-parallel** and **sequence-data-parallel** generation is **not** supported --- From 196e73bfb6d0325731b0de29ce50ee39128d5f41 Mon Sep 17 00:00:00 2001 From: Denis Kocetkov Date: Tue, 13 May 2025 17:10:51 +0300 Subject: [PATCH 11/19] cairosvg downgrade --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index 9b944b27f..2e3b549fc 100644 --- a/setup.cfg +++ b/setup.cfg @@ -65,6 +65,7 @@ DOCS = mkdocs-git-revision-date-localized-plugin pypandoc_binary mkdocs-bibtex + cairosvg==2.7.0 [options.entry_points] console_scripts = From 7fe218d70cee06f1ce7c9071ce7101cf955d9ded Mon Sep 17 00:00:00 2001 From: Denis Kocetkov Date: Tue, 13 May 2025 17:57:05 +0300 Subject: [PATCH 12/19] style filters applied --- docs/recipes/generate.md | 2 -- fast_llm/engine/inference/huggingface.py | 5 +---- fast_llm/engine/inference/runner.py | 1 - fast_llm/layers/transformer/preprocessing.py | 5 ++++- fast_llm/models/gpt/huggingface.py | 1 - tests/test_gpt_generate.py | 10 +++------- 6 files changed, 8 insertions(+), 16 deletions(-) diff --git a/docs/recipes/generate.md b/docs/recipes/generate.md index a017fcbb4..e6bda8031 100644 --- a/docs/recipes/generate.md +++ b/docs/recipes/generate.md @@ -75,5 +75,3 @@ for el in outputs: print(el) print("--------------------------------------------------------------------") ``` - - diff --git a/fast_llm/engine/inference/huggingface.py b/fast_llm/engine/inference/huggingface.py index ea8f134d6..2518ed96a 100644 --- a/fast_llm/engine/inference/huggingface.py +++ b/fast_llm/engine/inference/huggingface.py @@ -3,18 +3,15 @@ import typing import torch -import transformers.modeling_outputs import transformers.generation.utils +import transformers.modeling_outputs from fast_llm.engine.checkpoint.config import CheckpointLoadConfig, FastLLMCheckpointFormat from fast_llm.engine.inference.config import HuggingfaceModelConfig from fast_llm.engine.inference.runner import InferenceRunner from fast_llm.engine.multi_stage.config import StageMode from fast_llm.engine.multi_stage.fast_llm_model import FastLLMModel -from fast_llm.engine.schedule.config import BatchConfig, ScheduleConfig from fast_llm.engine.schedule.runner import ScheduleRunner -from fast_llm.engine.schedule.schedule import Schedule -from fast_llm.engine.training.config import TrainerConfig class HuggingfacePreTrainedModel(transformers.PreTrainedModel): diff --git a/fast_llm/engine/inference/runner.py b/fast_llm/engine/inference/runner.py index 6e8084601..df727288d 100644 --- a/fast_llm/engine/inference/runner.py +++ b/fast_llm/engine/inference/runner.py @@ -7,7 +7,6 @@ from fast_llm.engine.schedule.config import BatchConfig, ScheduleConfig from fast_llm.engine.schedule.runner import ScheduleRunner from fast_llm.engine.schedule.schedule import Schedule -from fast_llm.engine.training.config import TrainerConfig class InferenceRunner(abc.ABC): diff --git a/fast_llm/layers/transformer/preprocessing.py b/fast_llm/layers/transformer/preprocessing.py index 1e07c1c12..0697bd216 100644 --- a/fast_llm/layers/transformer/preprocessing.py +++ 
b/fast_llm/layers/transformer/preprocessing.py @@ -239,7 +239,10 @@ def preprocess(self, batch, kwargs: dict[str, typing.Any]) -> None: ] if (sequence_lengths := kwargs.get(TransformerKwargs.sequence_lengths, None)) is not None: seq_ids = torch.stack( - [torch.cat([torch.full((x,), i) for i, x in enumerate(sample_lens)]) for sample_lens in sequence_lengths] + [ + torch.cat([torch.full((x,), i) for i, x in enumerate(sample_lens)]) + for sample_lens in sequence_lengths + ] ) document_mask = (seq_ids[:, None, :] == seq_ids[:, :, None]).to(self._tensor_space.distributed.device) kwargs[TransformerKwargs.attention_mask] = ( diff --git a/fast_llm/models/gpt/huggingface.py b/fast_llm/models/gpt/huggingface.py index 500c454d9..cf7da3872 100644 --- a/fast_llm/models/gpt/huggingface.py +++ b/fast_llm/models/gpt/huggingface.py @@ -5,7 +5,6 @@ import torch import transformers.modeling_outputs - from fast_llm.data.data.gpt.data import GPTBatch from fast_llm.engine.distributed.config import PhaseType from fast_llm.engine.inference.config import HuggingfaceModelConfig diff --git a/tests/test_gpt_generate.py b/tests/test_gpt_generate.py index c1c924b94..ac2b6362d 100644 --- a/tests/test_gpt_generate.py +++ b/tests/test_gpt_generate.py @@ -1,9 +1,7 @@ +import huggingface_hub import pytest import torch - -import huggingface_hub - -from transformers import AutoTokenizer, AutoModelForCausalLM +from transformers import AutoModelForCausalLM, AutoTokenizer from fast_llm.engine.checkpoint.config import CheckpointLoadConfig from fast_llm.engine.distributed.distributed import Distributed @@ -11,9 +9,7 @@ from fast_llm.engine.schedule.runner import ScheduleRunner from fast_llm.models.gpt.config import LlamaGPTHuggingfaceCheckpointFormat, PretrainedGPTModelConfig from fast_llm.models.gpt.huggingface import HuggingfaceGPTModelForCausalLM - - -from tests.common import requires_cuda, TEST_RESULTS_PATH +from tests.common import TEST_RESULTS_PATH, requires_cuda def _prepare_checkpoint(model: str) -> str: From 5325b50770a064d9f6582e11d8bf050cd0c9d416 Mon Sep 17 00:00:00 2001 From: bigximik Date: Fri, 16 May 2025 11:29:49 +0000 Subject: [PATCH 13/19] changed of handling of unwanted config deepcopy --- fast_llm/engine/inference/config.py | 20 ++++++++++++++++++-- fast_llm/engine/inference/huggingface.py | 6 ------ 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/fast_llm/engine/inference/config.py b/fast_llm/engine/inference/config.py index c18daa48d..b09c88baf 100644 --- a/fast_llm/engine/inference/config.py +++ b/fast_llm/engine/inference/config.py @@ -1,3 +1,4 @@ +import copy import logging import os import pathlib @@ -36,6 +37,22 @@ def save_pretrained(self, save_directory: str | os.PathLike, push_to_hub: bool = finally: transformers.configuration_utils.CONFIG_NAME = _backup + def __deepcopy__(self, memo): + # Hugging Face's PretrainedModel will deep copy the config + # when `generate` is enabled. However, `fast_llm_config` + # cannot be deep copied if the world size is greater than 1, + # as it will contain references to process groups. + # Therefore, we copy it by reference instead. 
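+        # The rest follows the standard __deepcopy__ protocol: the new instance is registered
+        # in `memo` before its attributes are copied so that cyclic references resolve correctly.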
+ cls = self.__class__ + copied = cls.__new__(cls) + memo[id(self)] = copied + for k, v in self.__dict__.items(): + if k == "fast_llm_config": + setattr(copied, k, v) # Keep the same reference + else: + setattr(copied, k, copy.deepcopy(v, memo)) + return copied + @classmethod def _get_config_dict( cls, pretrained_model_name_or_path: str | os.PathLike | CheckpointLoadMetadataConfig, **kwargs @@ -91,8 +108,7 @@ def __eq__(self, other) -> bool: def to_dict(self) -> dict[str, typing.Any]: out = super().to_dict() - if self.fast_llm_config is not None: - out["fast_llm_config"] = self.fast_llm_config.to_dict(verbose=FieldVerboseLevel.everything) + out["fast_llm_config"] = self.fast_llm_config.to_dict(verbose=FieldVerboseLevel.everything) return out def to_diff_dict(self) -> dict[str, typing.Any]: diff --git a/fast_llm/engine/inference/huggingface.py b/fast_llm/engine/inference/huggingface.py index 2518ed96a..2e219e076 100644 --- a/fast_llm/engine/inference/huggingface.py +++ b/fast_llm/engine/inference/huggingface.py @@ -37,13 +37,7 @@ def __init__( assert config.fast_llm_config is fast_llm_model.config assert isinstance(config, self.config_class) - # The HF constructor performs a deep copy of the config, - # but config.fast_llm_config may contain non-picklable items like process groups. - # Temporarily remove it before the call and restore it afterward. - fast_llm_config = config.fast_llm_config - config.fast_llm_config = None super().__init__(config, **kwargs) - config.fast_llm_config = fast_llm_config self._inference_runner = self.runner_class(fast_llm_model, runner) From 277892559d86108895e30b68285983c18c457d43 Mon Sep 17 00:00:00 2001 From: bigximik Date: Fri, 16 May 2025 11:36:05 +0000 Subject: [PATCH 14/19] moved forward declaration to the right class --- fast_llm/engine/inference/huggingface.py | 32 +++++++++++------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/fast_llm/engine/inference/huggingface.py b/fast_llm/engine/inference/huggingface.py index 2e219e076..0645eef4b 100644 --- a/fast_llm/engine/inference/huggingface.py +++ b/fast_llm/engine/inference/huggingface.py @@ -59,22 +59,6 @@ def __init__( with transformers.modeling_utils.no_init_weights(): self.post_init() - def forward( - self, - input_ids: torch.Tensor | None = None, - attention_mask: torch.Tensor | None = None, - position_ids: torch.Tensor | None = None, - past_key_values=None, - inputs_embeds: torch.FloatTensor | None = None, - labels: torch.LongTensor | None = None, - use_cache: bool | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, - ) -> tuple | transformers.modeling_outputs.CausalLMOutputWithPast: - # Meant to be overridden in derived classes - raise NotImplementedError() - @classmethod def from_pretrained( cls, @@ -113,4 +97,18 @@ def _init_weights(self, module) -> None: class HuggingfaceBaseModelForCausalLM(HuggingfacePreTrainedModel, transformers.generation.utils.GenerationMixin): - pass + def forward( + self, + input_ids: torch.Tensor | None = None, + attention_mask: torch.Tensor | None = None, + position_ids: torch.Tensor | None = None, + past_key_values=None, + inputs_embeds: torch.FloatTensor | None = None, + labels: torch.LongTensor | None = None, + use_cache: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + ) -> tuple | transformers.modeling_outputs.CausalLMOutputWithPast: + # Meant to be overridden in derived 
classes + raise NotImplementedError() From 4829b0d66ca28db10f5ba7d7e17904d5e52e289f Mon Sep 17 00:00:00 2001 From: bigximik Date: Fri, 16 May 2025 11:47:23 +0000 Subject: [PATCH 15/19] changed asserts for clarity --- fast_llm/engine/inference/huggingface.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fast_llm/engine/inference/huggingface.py b/fast_llm/engine/inference/huggingface.py index 0645eef4b..e679cfd6f 100644 --- a/fast_llm/engine/inference/huggingface.py +++ b/fast_llm/engine/inference/huggingface.py @@ -12,6 +12,7 @@ from fast_llm.engine.multi_stage.config import StageMode from fast_llm.engine.multi_stage.fast_llm_model import FastLLMModel from fast_llm.engine.schedule.runner import ScheduleRunner +from fast_llm.utils import Assert class HuggingfacePreTrainedModel(transformers.PreTrainedModel): @@ -46,10 +47,8 @@ def __init__( assert fast_llm_model.is_setup # We only support data parallel for now - assert ( - fast_llm_model.distributed.config.model_parallel == 1 - and fast_llm_model.distributed.config.sequence_data_parallel == 1 - ) + Assert.eq(fast_llm_model.distributed.config.model_parallel, 1) + Assert.eq(fast_llm_model.distributed.config.sequence_data_parallel, 1) self._inference_runner.setup() From 1aa434458b95a9f40ab6501e437cf7af04c789f7 Mon Sep 17 00:00:00 2001 From: bigximik Date: Fri, 16 May 2025 11:47:54 +0000 Subject: [PATCH 16/19] Added assert as fail safe --- fast_llm/engine/inference/runner.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fast_llm/engine/inference/runner.py b/fast_llm/engine/inference/runner.py index df727288d..3003c5f9d 100644 --- a/fast_llm/engine/inference/runner.py +++ b/fast_llm/engine/inference/runner.py @@ -7,6 +7,7 @@ from fast_llm.engine.schedule.config import BatchConfig, ScheduleConfig from fast_llm.engine.schedule.runner import ScheduleRunner from fast_llm.engine.schedule.schedule import Schedule +from fast_llm.utils import Assert class InferenceRunner(abc.ABC): @@ -58,6 +59,9 @@ def fast_llm_model(self) -> FastLLMModel: def setup(self): if not self._runner._is_setup: self._runner.setup(self._fast_llm_model.distributed) + else: + # Means external runner was passed, check it has the same distributed class as the model + Assert.is_(self._runner._distributed, self._fast_llm_model.distributed) def forward( self, input_, kwargs: dict, *, iteration: int = 1, return_metrics: bool = False From 2b5fda6762733a3c40994a90ccaa0a9db8311917 Mon Sep 17 00:00:00 2001 From: bigximik Date: Fri, 16 May 2025 15:55:23 +0000 Subject: [PATCH 17/19] changes fixes and added test for return of hidden_states --- fast_llm/engine/multi_stage/stage.py | 12 ++++----- fast_llm/layers/language_model/head.py | 16 +++++++++--- ...te.py => test_gpt_generate_and_forward.py} | 26 +++++++++++++++++++ 3 files changed, 43 insertions(+), 11 deletions(-) rename tests/{test_gpt_generate.py => test_gpt_generate_and_forward.py} (87%) diff --git a/fast_llm/engine/multi_stage/stage.py b/fast_llm/engine/multi_stage/stage.py index a3ac98e8b..280c5c235 100644 --- a/fast_llm/engine/multi_stage/stage.py +++ b/fast_llm/engine/multi_stage/stage.py @@ -14,7 +14,7 @@ from fast_llm.utils import Assert if typing.TYPE_CHECKING: - from fast_llm.core.distributed import ProcessGroup + pass logger = logging.getLogger(__name__) @@ -123,12 +123,10 @@ def forward( if output is not None: meta = self._meta_outputs[i] output_global, _ = meta.local_to_global(output.detach(), distributed=self._distributed) - else: - output_global = None - 
kwargs["hidden_states"][self._layer_range[i]] = { - "layer_type": type(layer).__name__, - "tensor": output_global, - } + kwargs["hidden_states"][self._layer_range[i]] = { + "layer_type": type(layer).__name__, + "tensor": output_global, + } return None if output is None else output.detach(), (input_, output) def backward( diff --git a/fast_llm/layers/language_model/head.py b/fast_llm/layers/language_model/head.py index cc108cee4..d6d1b8a54 100644 --- a/fast_llm/layers/language_model/head.py +++ b/fast_llm/layers/language_model/head.py @@ -5,7 +5,7 @@ from torch.distributed import all_reduce from fast_llm.config import Configurable -from fast_llm.core.ops import gather_op, split_op +from fast_llm.core.ops import split_op from fast_llm.engine.base_model.base_model import Layer from fast_llm.engine.config_utils.tensor_space import DefaultDimNames, TensorDim, TensorSpace from fast_llm.engine.distributed.config import DistributedDimNames @@ -183,9 +183,17 @@ def _forward_backward( if "output_hidden_states" in kwargs and kwargs["output_hidden_states"]: # The last hidden layer output is returned normalized in the HF Transformers-style output, at least for LLama style models. # So, if needed, we gather the data after normalization and set it as the output of the previous layer. - group = self._tensor_space.distributed.tensor_group if self._parallel_embeddings else None - sequence_parallel = self._sequence_parallel and self._parallel_embeddings - hidden_state = gather_op(ln_output.detach(), group, dim=0) if sequence_parallel else ln_output.detach() + dims = list(kwargs[TransformerKwargs.hidden_dims]) + sequence_index = 1 - int(kwargs[TransformerKwargs.sequence_first]) + dims[sequence_index] = ( + TensorDim( + TransformerDimNames.sequence_q_tp, dims[sequence_index].global_size, DistributedDimNames.tensor + ) + if self._sequence_parallel_logits + else TensorDim(TransformerDimNames.sequence_q, dims[sequence_index].global_size) + ) + meta = TensorMeta.from_dims(tuple(dims), tensor_name="transformer hidden_state", dtype=ln_output.dtype) + hidden_state, _ = meta.local_to_global(ln_output.detach(), distributed=self._tensor_space.distributed) kwargs["hidden_states"][len(kwargs["hidden_states"]) - 1]["tensor"] = hidden_state grad_output = kwargs[TransformerKwargs.grad_output] / ( diff --git a/tests/test_gpt_generate.py b/tests/test_gpt_generate_and_forward.py similarity index 87% rename from tests/test_gpt_generate.py rename to tests/test_gpt_generate_and_forward.py index ac2b6362d..1a3f3a35b 100644 --- a/tests/test_gpt_generate.py +++ b/tests/test_gpt_generate_and_forward.py @@ -222,3 +222,29 @@ def test_generate_from_model( min_matching_tokens_batch_size_1, min_matching_tokens_batch_size_2, ) + + +@pytest.mark.slow +@requires_cuda +def test_forward_return_hidden_states( + model_and_tokenizer, +): + # Use flash attention for speed + model_path, tokenizer, fast_llm_checkpoint_format = model_and_tokenizer + fast_llm_model = _get_fast_llm_model(model_path, True, True, fast_llm_checkpoint_format) + + inputs_ids = torch.randint(1, tokenizer.vocab_size, [1, 10], dtype=torch.int64).cuda() + + # TODO: hidden states have differences between HF and Fast-LLM despite resulting in the similar logits, + # decide if to leave as it. 
+ # hf_model = _get_hf_model(model_path, True, True) + # res_hf = hf_model.forward(input_ids=inputs_ids, output_hidden_states=True, return_dict=True, use_cache=False) + + res_fast_llm = fast_llm_model.forward( + input_ids=inputs_ids, output_hidden_states=True, return_dict=True, use_cache=False + ) + + # hidden_states include embeddings layer + assert ( + len(res_fast_llm.hidden_states) - 1 == fast_llm_model.config.fast_llm_config.base_model.transformer.num_layers + ) From 77e7cbc38c9f32bbfa856420a143c3f1de8b9b3a Mon Sep 17 00:00:00 2001 From: bigximik Date: Mon, 19 May 2025 06:50:26 +0000 Subject: [PATCH 18/19] added extra slow tests mark --- tests/conftest.py | 14 ++++++++++++++ tests/test_gpt_generate_and_forward.py | 6 +++--- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 445f59bb1..1c718c21e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,10 +3,19 @@ def pytest_addoption(parser): parser.addoption("--skip-slow", action="store_true") + parser.addoption( + "--run-extra-slow", + action="store_true", + default=False, + help="Run tests marked as extra_slow", + ) def pytest_configure(config): config.addinivalue_line("markers", "slow: Test is slow.") + config.addinivalue_line( + "markers", "extra_slow: Mark test as extra slow and skip unless --run-extra-slow is given." + ) def pytest_collection_modifyitems(config, items): @@ -15,3 +24,8 @@ def pytest_collection_modifyitems(config, items): for item in items: if "slow" in item.keywords: item.add_marker(skip_slow) + if not config.getoption("--run-extra-slow"): + skip_extra_slow = pytest.mark.skip(reason="need --run-extra-slow option to run") + for item in items: + if "extra_slow" in item.keywords: + item.add_marker(skip_extra_slow) diff --git a/tests/test_gpt_generate_and_forward.py b/tests/test_gpt_generate_and_forward.py index 1a3f3a35b..730d1f4b9 100644 --- a/tests/test_gpt_generate_and_forward.py +++ b/tests/test_gpt_generate_and_forward.py @@ -165,7 +165,7 @@ def model_and_tokenizer(): return model_path, tokenizer, fast_llm_checkpoint_format -@pytest.mark.slow +@pytest.mark.extra_slow @requires_cuda @pytest.mark.parametrize( "use_flash_attention, use_bf16, max_new_tokens, min_matching_tokens_batch_size_1, min_matching_tokens_batch_size_2", @@ -200,7 +200,7 @@ def test_generate( ) -@pytest.mark.slow +@pytest.mark.extra_slow @requires_cuda def test_generate_from_model( model_and_tokenizer, @@ -224,7 +224,7 @@ def test_generate_from_model( ) -@pytest.mark.slow +@pytest.mark.extra_slow @requires_cuda def test_forward_return_hidden_states( model_and_tokenizer, From 4497436d8447f5e1537360c21778243165f153db Mon Sep 17 00:00:00 2001 From: bigximik Date: Mon, 19 May 2025 09:24:13 +0000 Subject: [PATCH 19/19] added faster tests with placeholder model --- tests/test_gpt_generate_and_forward.py | 163 ++++++++++++++++++++++--- 1 file changed, 143 insertions(+), 20 deletions(-) diff --git a/tests/test_gpt_generate_and_forward.py b/tests/test_gpt_generate_and_forward.py index 730d1f4b9..134b51e68 100644 --- a/tests/test_gpt_generate_and_forward.py +++ b/tests/test_gpt_generate_and_forward.py @@ -33,6 +33,22 @@ def _prepare_data(tokenizer, use_batch_size2: bool): return inputs +def _prepare_rand_data(vocab_size, use_batch_size2: bool): + inputs = torch.randint( + 1, + vocab_size, + [2 if use_batch_size2 else 1, 10], + dtype=torch.int64, + generator=torch.Generator().manual_seed(42), + ).cuda() + attention_mask = torch.ones_like(inputs) + # simulate left padding on one of the rows + if 
use_batch_size2: + inputs[1, :5] = 0 + attention_mask[1, :5] = 0 + return {"input_ids": inputs, "attention_mask": attention_mask} + + def _get_hf_model(model_path: str, use_flash_attention: bool, use_bf16: bool): hf_kwargs = {} if use_flash_attention: @@ -132,12 +148,15 @@ def _compare_gen_outputs(outputs: dict[str, list[torch.Tensor]], min_matching_to def _test_for_batches( hf_model, fast_llm_model, - tokenizer, max_new_tokens, min_matching_tokens_batch_size_1, min_matching_tokens_batch_size_2, + tokenizer=None, ): - inputs = _prepare_data(tokenizer, use_batch_size2=False) + if tokenizer is not None: + inputs = _prepare_data(tokenizer, use_batch_size2=False) + else: + inputs = _prepare_rand_data(fast_llm_model.config.fast_llm_config.base_model.vocab_size, use_batch_size2=False) outputs = _generate( inputs, hf_model, @@ -146,7 +165,10 @@ def _test_for_batches( ) _compare_gen_outputs(outputs, min_matching_tokens=min_matching_tokens_batch_size_1) - inputs = _prepare_data(tokenizer, use_batch_size2=True) + if tokenizer is not None: + inputs = _prepare_data(tokenizer, use_batch_size2=True) + else: + inputs = _prepare_rand_data(fast_llm_model.config.fast_llm_config.base_model.vocab_size, use_batch_size2=True) outputs = _generate( inputs, hf_model, @@ -165,6 +187,42 @@ def model_and_tokenizer(): return model_path, tokenizer, fast_llm_checkpoint_format +@pytest.fixture(scope="module") +def small_model(): + from .common import _CONFIGS, TEST_RESULTS_PATH, run_test_script + + _, _, _, common_config, fast_llm_checkpoint_format = _CONFIGS["llama"] + run_test_script( + f"test_llama_generate_and_forward", + common_config + + ["training.checkpoint.interval=1", "training.export.format=llama", "training.export.interval=1"], + ) + return TEST_RESULTS_PATH / "test_llama_generate_and_forward/export/llama/2", fast_llm_checkpoint_format + + +def _test_generate( + model_path, + fast_llm_checkpoint_format, + use_flash_attention, + use_bf16, + max_new_tokens, + min_matching_tokens_batch_size_1, + min_matching_tokens_batch_size_2, + tokenizer=None, +): + hf_model = _get_hf_model(model_path, use_flash_attention, use_bf16) + fast_llm_model = _get_fast_llm_model(model_path, use_flash_attention, use_bf16, fast_llm_checkpoint_format) + + _test_for_batches( + hf_model, + fast_llm_model, + max_new_tokens, + min_matching_tokens_batch_size_1, + min_matching_tokens_batch_size_2, + tokenizer=tokenizer, + ) + + @pytest.mark.extra_slow @requires_cuda @pytest.mark.parametrize( @@ -187,59 +245,108 @@ def test_generate( min_matching_tokens_batch_size_2, ): model_path, tokenizer, fast_llm_checkpoint_format = model_and_tokenizer - hf_model = _get_hf_model(model_path, use_flash_attention, use_bf16) - fast_llm_model = _get_fast_llm_model(model_path, use_flash_attention, use_bf16, fast_llm_checkpoint_format) - - _test_for_batches( - hf_model, - fast_llm_model, - tokenizer, + _test_generate( + model_path, + fast_llm_checkpoint_format, + use_flash_attention, + use_bf16, max_new_tokens, min_matching_tokens_batch_size_1, min_matching_tokens_batch_size_2, + tokenizer=tokenizer, ) -@pytest.mark.extra_slow +@pytest.mark.slow @requires_cuda -def test_generate_from_model( - model_and_tokenizer, +@pytest.mark.parametrize( + "use_flash_attention, use_bf16, max_new_tokens, min_matching_tokens_batch_size_1, min_matching_tokens_batch_size_2", + [ + # No flash attention + no bf16 + (False, False, 10, 10, 10), + # No flash attention + with bf16 + (False, True, 10, 10, 10), + # Flash attention must be paired with bf16 + (True, True, 10, 10, 10), + ], 
+) +def test_small_generate( + small_model, + use_flash_attention, + use_bf16, + max_new_tokens, + min_matching_tokens_batch_size_1, + min_matching_tokens_batch_size_2, ): + model_path, fast_llm_checkpoint_format = small_model + _test_generate( + model_path, + fast_llm_checkpoint_format, + use_flash_attention, + use_bf16, + max_new_tokens, + min_matching_tokens_batch_size_1, + min_matching_tokens_batch_size_2, + ) + + +def _test_generate_from_model(model_path, tokenizer, fast_llm_checkpoint_format): max_new_tokens = 10 min_matching_tokens_batch_size_1 = 10 min_matching_tokens_batch_size_2 = 10 # Use flash attention for speed - model_path, tokenizer, fast_llm_checkpoint_format = model_and_tokenizer hf_model = _get_hf_model(model_path, True, True) fast_llm_model = _get_fast_llm_model_from_model(model_path, True, True, fast_llm_checkpoint_format) _test_for_batches( hf_model, fast_llm_model, - tokenizer, max_new_tokens, min_matching_tokens_batch_size_1, min_matching_tokens_batch_size_2, + tokenizer=tokenizer, ) @pytest.mark.extra_slow @requires_cuda -def test_forward_return_hidden_states( +def test_generate_from_model( model_and_tokenizer, ): - # Use flash attention for speed model_path, tokenizer, fast_llm_checkpoint_format = model_and_tokenizer - fast_llm_model = _get_fast_llm_model(model_path, True, True, fast_llm_checkpoint_format) + _test_generate_from_model(model_path, tokenizer, fast_llm_checkpoint_format) - inputs_ids = torch.randint(1, tokenizer.vocab_size, [1, 10], dtype=torch.int64).cuda() +@pytest.mark.slow +@requires_cuda +def test_small_generate_from_model( + small_model, +): + model_path, fast_llm_checkpoint_format = small_model + _test_generate_from_model(model_path, None, fast_llm_checkpoint_format) + + +def _test_forward_return_hidden_states( + model_path, + fast_llm_checkpoint_format, + vocab_size: int | None = None, +): + # Use flash attention for speed # TODO: hidden states have differences between HF and Fast-LLM despite resulting in the similar logits, # decide if to leave as it. # hf_model = _get_hf_model(model_path, True, True) - # res_hf = hf_model.forward(input_ids=inputs_ids, output_hidden_states=True, return_dict=True, use_cache=False) + fast_llm_model = _get_fast_llm_model(model_path, True, True, fast_llm_checkpoint_format) + + inputs_ids = torch.randint( + 1, + fast_llm_model.config.fast_llm_config.base_model.vocab_size if vocab_size is None else vocab_size, + [1, 10], + dtype=torch.int64, + generator=torch.Generator().manual_seed(42), + ).cuda() + # res_hf = hf_model.forward(input_ids=inputs_ids, output_hidden_states=True, return_dict=True, use_cache=False) res_fast_llm = fast_llm_model.forward( input_ids=inputs_ids, output_hidden_states=True, return_dict=True, use_cache=False ) @@ -248,3 +355,19 @@ def test_forward_return_hidden_states( assert ( len(res_fast_llm.hidden_states) - 1 == fast_llm_model.config.fast_llm_config.base_model.transformer.num_layers ) + + +@pytest.mark.extra_slow +@requires_cuda +def test_forward_return_hidden_states( + model_and_tokenizer, +): + model_path, tokenizer, fast_llm_checkpoint_format = model_and_tokenizer + _test_forward_return_hidden_states(model_path, fast_llm_checkpoint_format, tokenizer.vocab_size) + + +@pytest.mark.slow +@requires_cuda +def test_small_forward_return_hidden_states(small_model): + model_path, fast_llm_checkpoint_format = small_model + _test_forward_return_hidden_states(model_path, fast_llm_checkpoint_format)
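
For context on what the new hidden-state tests in this series actually exercise, below is a minimal, hedged usage sketch of the same forward path. It mirrors test_small_forward_return_hidden_states: load a converted checkpoint through the test helper, run a single forward with output_hidden_states=True, and check that the returned hidden states cover the embedding output plus one entry per transformer layer. The checkpoint path, the choice of FastLLMCheckpointFormat, and importing the helper from the tests package are illustrative assumptions, not part of these patches; the format class must match how the checkpoint was exported, and a CUDA device is required as in the tests.

    import torch

    # Assumptions (not part of the patches): the tests package is importable and the
    # checkpoint at `model_path` was exported in a format compatible with the class below.
    from fast_llm.engine.checkpoint.config import FastLLMCheckpointFormat
    from tests.test_gpt_generate_and_forward import _get_fast_llm_model

    model_path = "/path/to/exported/checkpoint"  # placeholder path

    # Flash attention + bf16, matching the "use flash attention for speed" tests above.
    model = _get_fast_llm_model(model_path, True, True, FastLLMCheckpointFormat)

    # Random prompt of 10 tokens, seeded the same way as the placeholder-model tests.
    input_ids = torch.randint(
        1,
        model.config.fast_llm_config.base_model.vocab_size,
        [1, 10],
        dtype=torch.int64,
        generator=torch.Generator().manual_seed(42),
    ).cuda()

    out = model.forward(
        input_ids=input_ids, output_hidden_states=True, return_dict=True, use_cache=False
    )

    # hidden_states also contains the embedding-layer output, hence the extra entry.
    assert (
        len(out.hidden_states)
        == model.config.fast_llm_config.base_model.transformer.num_layers + 1
    )

With the markers added in this series, the fast placeholder-model variants of these tests run under the default slow tier, while the real-checkpoint variants are gated behind the new --run-extra-slow pytest option.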