Merged
Changes from all commits
421 commits
546efee
fix
qgallouedec Jul 10, 2025
7b077ae
nice
qgallouedec Jul 10, 2025
ebcae9a
where i am at
ArthurZucker Jul 10, 2025
528b3c8
Bro this works
ArthurZucker Jul 10, 2025
9c61a8c
Merge pull request #16 from huggingface/fix-attention
qgallouedec Jul 10, 2025
297e47e
Update src/transformers/integrations/tensor_parallel.py
ArthurZucker Jul 11, 2025
2f852e2
Merge pull request #11 from huggingface/tp_embed_parallel
ArthurZucker Jul 11, 2025
3d25cf7
cleanups
ArthurZucker Jul 11, 2025
ff0544b
Merge branch 'add-oai' into add-fast-flash-kernel
ArthurZucker Jul 11, 2025
29454d2
yups that was breaking
ArthurZucker Jul 11, 2025
b3582fc
Merge branch 'add-fast-flash-kernel' of github.com:huggingface/new-mo…
ArthurZucker Jul 11, 2025
f33a74d
Merge pull request #15 from huggingface/add-fast-flash-kernel
ArthurZucker Jul 11, 2025
1f3ae2b
Merge branch 'main' of github.com:huggingface/new-model-addition-open…
ArthurZucker Jul 11, 2025
15c85e0
Update src/transformers/models/openai_moe/modeling_openai_moe.py
ArthurZucker Jul 11, 2025
0c7379a
merge
ArthurZucker Jul 11, 2025
ad0fc38
gather on experts and not mlp
SunMarc Jul 17, 2025
4fb7345
add changes for latest convert branch
edbeeching Jul 18, 2025
968238c
adds options to get output_router_logits from config
edbeeching Jul 18, 2025
4bc5557
bring chat temlate + special tokens back into the script.
Vaibhavs10 Jul 22, 2025
68fd833
Merge pull request #22 from huggingface/vb/special-tok
ArthurZucker Jul 22, 2025
410435a
Merge pull request #21 from huggingface/ed-fix-modeling
ArthurZucker Jul 22, 2025
07bd34d
initial commmit
MekkCyber Jul 22, 2025
b7987d2
update
MekkCyber Jul 22, 2025
2c0fd4d
working with shards
MekkCyber Jul 22, 2025
1d03f3a
add model.safetensors.index.json
MekkCyber Jul 22, 2025
40e379d
fix
MekkCyber Jul 22, 2025
b68aa6b
fix
MekkCyber Jul 22, 2025
a87db4f
mxfp4 flag
MekkCyber Jul 22, 2025
c3c01f0
rm print
MekkCyber Jul 22, 2025
863630d
Fix PAD/EOS/BOS (#18)
qgallouedec Jul 10, 2025
eab251f
add some doc
MekkCyber Jul 23, 2025
928b9b6
Merge pull request #23 from huggingface/update_conversion_script
SunMarc Jul 23, 2025
9280e59
special tokens based on harmony.
Vaibhavs10 Jul 23, 2025
b382c5e
add in tokenizer config as well.
Vaibhavs10 Jul 23, 2025
7cdd0be
Merge pull request #25 from huggingface/vb/upd-conversion-script
Vaibhavs10 Jul 23, 2025
f8f3e40
prepare for rebase with main
ArthurZucker Jul 24, 2025
c9dc8f2
Merge branches 'add-oai' and 'add-oai' of github.com:huggingface/new-…
ArthurZucker Jul 24, 2025
0ce752c
merge with main
ArthurZucker Jul 24, 2025
60af841
Fix for initialize_tensor_parallelism now returning 4-tuple
edbeeching Jul 24, 2025
1ce172b
mxfp4
SunMarc Jun 10, 2025
c0bee22
mxfp4 draft
SunMarc Jun 12, 2025
fe896d3
fix
SunMarc Jun 12, 2025
174147d
fix import
SunMarc Jun 23, 2025
b8215dd
draft
SunMarc Jun 23, 2025
62f77e1
draft impl
SunMarc Jul 4, 2025
6e9d0c7
finally working !
SunMarc Jul 4, 2025
6b8b279
simplify
SunMarc Jul 8, 2025
ea5c364
add import
SunMarc Jul 8, 2025
1175ab4
working version
SunMarc Jul 8, 2025
d53cb49
consider blocks and scales
SunMarc Jul 10, 2025
8c43631
device mesh fix
SunMarc Jul 10, 2025
4f515eb
initial commit
MekkCyber Jul 16, 2025
0ff6727
add working dequant + quant logic
MekkCyber Jul 16, 2025
13cb07b
update
MekkCyber Jul 17, 2025
3988856
non nan, gibberish output
MekkCyber Jul 21, 2025
b9c8138
working EP + quantization finally !
MekkCyber Jul 22, 2025
5117d71
start cleaning
MekkCyber Jul 23, 2025
3733a34
remove reversing process
MekkCyber Jul 23, 2025
6587359
style
MekkCyber Jul 23, 2025
7961073
some cleaning
MekkCyber Jul 23, 2025
0de006a
initial commmit
MekkCyber Jul 22, 2025
12a9e80
more cleaning
MekkCyber Jul 23, 2025
3904783
more cleaning
MekkCyber Jul 24, 2025
75e0f21
simplify
MekkCyber Jul 24, 2025
c8ce047
more cleaning
MekkCyber Jul 24, 2025
8b162f7
rm duplicated function
MekkCyber Jul 24, 2025
8a00f60
changing tp_plan
MekkCyber Jul 24, 2025
d760f30
update tp plan check
MekkCyber Jul 24, 2025
b34570e
add loading attribute
MekkCyber Jul 24, 2025
a4950aa
dequantizing logic
MekkCyber Jul 24, 2025
89b0671
use subfunctions
MekkCyber Jul 24, 2025
7bfdca6
import cleaning
MekkCyber Jul 24, 2025
21872bd
update_param_name
MekkCyber Jul 24, 2025
b68ece8
adds clamped swiglu
edbeeching Jul 24, 2025
3e106d6
add clamping to training path
edbeeching Jul 28, 2025
1716e6d
simplify dequant logic
MekkCyber Jul 28, 2025
f49bcbb
Merge branch 'main' of github.com:huggingface/new-model-addition-open…
ArthurZucker Jul 28, 2025
b8b0023
update
ArthurZucker Jul 28, 2025
6400fb2
Merge branch 'add-oai' of github.com:huggingface/new-model-addition-o…
ArthurZucker Jul 28, 2025
6976169
Bad merge
ArthurZucker Jul 28, 2025
195cca6
more simplifications & tests
MekkCyber Jul 28, 2025
345afb1
fix !
ArthurZucker Jul 28, 2025
7b18304
Merge pull request #26 from huggingface/add-clamp-swiglu
ArthurZucker Jul 28, 2025
009355a
fix registering custom attention
ArthurZucker Jul 28, 2025
d237a90
fix order
MekkCyber Jul 29, 2025
ccffc0b
fixes
MekkCyber Jul 29, 2025
f92878a
some test nits
MekkCyber Jul 29, 2025
90522c4
nits
MekkCyber Jul 29, 2025
dbb8b20
nit
MekkCyber Jul 29, 2025
d5634bd
Merge branch 'add-oai' into adding_packing_format_option
MekkCyber Jul 29, 2025
587d8da
Merge pull request #20 from huggingface/adding_packing_format_option
MekkCyber Jul 29, 2025
edd9232
fix
MekkCyber Jul 29, 2025
c0ef156
Merge pull request #27 from huggingface/guard_kernels_imports
SunMarc Jul 29, 2025
dc2b16f
Clamp sink logits
lewtun Jul 29, 2025
b050830
Clean
lewtun Jul 30, 2025
e0e406e
Soft-max trick
lewtun Jul 30, 2025
54e8825
Clean up
lewtun Jul 30, 2025
0378ae8
p
lewtun Jul 30, 2025
a208980
Merge pull request #28 from huggingface/fix-train-bsz
ArthurZucker Jul 30, 2025
077cfee
fix deepspeed
MekkCyber Jul 30, 2025
bec11b7
update both modeling and modular for cleanup
ArthurZucker Jul 30, 2025
7d8ac2e
contiguous
MekkCyber Jul 30, 2025
42ab108
update tests
ArthurZucker Jul 30, 2025
e9f130a
fix top_k router call
ArthurZucker Jul 30, 2025
da77d5e
revert renaming
ArthurZucker Jul 30, 2025
5b0bd40
test nits
ArthurZucker Jul 30, 2025
9af87b2
Merge branch 'add-oai' of github.com:huggingface/new-model-addition-o…
ArthurZucker Jul 30, 2025
b43d2cd
small fixes for EP
ArthurZucker Jul 30, 2025
13ec4ef
fix path for our local tests
ArthurZucker Jul 30, 2025
0b5a0e9
Merge branch 'add-oai' of github.com:huggingface/new-model-addition-o…
ArthurZucker Jul 30, 2025
0276225
update as I should not have broken that!
ArthurZucker Jul 30, 2025
f1cf951
Merge branch 'add-oai' of github.com:huggingface/new-model-addition-o…
ArthurZucker Jul 30, 2025
a34b39c
fix the loss of mixtral
ArthurZucker Jul 30, 2025
e7cc591
revert part of the changes related to router_scores, kernel probably …
ArthurZucker Jul 30, 2025
b7a9e4a
Merge branch 'add-oai' of github.com:huggingface/new-model-addition-o…
ArthurZucker Jul 30, 2025
f1245b4
deleting a small nit
ArthurZucker Jul 30, 2025
8a6fbf9
Merge branches 'add-oai' and 'add-oai' of github.com:huggingface/new-…
ArthurZucker Jul 30, 2025
9b387ca
update arch
SunMarc Jul 30, 2025
6c0effa
fix post processing
MekkCyber Jul 30, 2025
ab0f929
update
SunMarc Jul 30, 2025
e030193
Merge pull request #30 from huggingface/fix-conversion-architecture
Vaibhavs10 Jul 30, 2025
c80bd44
running version but not expected output
SunMarc Jul 30, 2025
6c55b12
Merge pull request #29 from huggingface/fix_ds
SunMarc Jul 30, 2025
740f3aa
Merge remote-tracking branch 'origin/add-oai' into update-triton-kernels
SunMarc Jul 30, 2025
dc12518
moving to cuda
MekkCyber Jul 31, 2025
20dfa56
initial commit
MekkCyber Jul 31, 2025
228a982
revert
MekkCyber Jul 31, 2025
5a59733
erroring when loading on cpu
MekkCyber Jul 31, 2025
910ccfe
updates
MekkCyber Jul 31, 2025
212acd0
del blocks, scales
MekkCyber Jul 31, 2025
5c6d3b2
fix
SunMarc Jul 31, 2025
5ec240f
style
SunMarc Jul 31, 2025
2faa7ca
rm comm
SunMarc Jul 31, 2025
c5b8cec
comment
MekkCyber Jul 31, 2025
79dd4fc
add comment
SunMarc Jul 31, 2025
93f0816
Merge pull request #36 from huggingface/default_to_dequantize_training
SunMarc Jul 31, 2025
c5e7bfc
Merge branch 'add-oai' into update-triton-kernels
SunMarc Jul 31, 2025
d238ea4
style
SunMarc Jul 31, 2025
76f9088
Merge pull request #31 from huggingface/update-triton-kernels
SunMarc Jul 31, 2025
a7dd97f
remove duplicated lines
SunMarc Jul 31, 2025
cf4843b
Fix minor issue with weight_map conversion script
SunMarc Aug 1, 2025
8b7a73f
fix sampling params
zhuohan123 Aug 1, 2025
08b031b
rename to final name
ArthurZucker Aug 1, 2025
a39ebae
Merge branch 'add-oai' into zhuohan/fix-sampling-parmsl
pcuenca Aug 1, 2025
8430860
Merge pull request #37 from huggingface/zhuohan/fix-sampling-parmsl
Vaibhavs10 Aug 1, 2025
0d1a2da
upate pre-final version of template
Vaibhavs10 Aug 1, 2025
5f3de46
Update src/transformers/models/gpt_oss/convert_gpt_oss_weights_to_hf.py
pcuenca Aug 1, 2025
ce4e912
Merge pull request #38 from huggingface/vb/upd-template
Vaibhavs10 Aug 1, 2025
bddc8c2
fix batched inference
MekkCyber Aug 1, 2025
b2b1ca5
Merge pull request #39 from huggingface/fix_batched_inference
MekkCyber Aug 1, 2025
06b35eb
serve fixes
Aug 1, 2025
0de8f62
swizzle !
SunMarc Aug 1, 2025
a29c5a2
Merge branch 'add-oai' into swizzle
SunMarc Aug 1, 2025
aca1e72
update final chat template by Matt.
Vaibhavs10 Aug 1, 2025
a8c3c49
fix responses; pin oai
Aug 1, 2025
33636c9
sinplify
SunMarc Aug 1, 2025
af6fb99
Thanks Matt for his tireless efforts!
Vaibhavs10 Aug 1, 2025
22e8236
`transformer serve` fixes for oai (mostly hide CoT)
gante Aug 1, 2025
6f91a55
Update src/transformers/models/gpt_oss/convert_gpt_oss_weights_to_hf.py
Vaibhavs10 Aug 1, 2025
afe8912
fix
SunMarc Aug 1, 2025
b7dc08c
Merge pull request #42 from huggingface/swizzle
SunMarc Aug 1, 2025
e991ef4
Merge pull request #41 from huggingface/vb/up-template-2
Vaibhavs10 Aug 1, 2025
7e540fc
Use ROCm kernels from HUB
ahadnagy Aug 1, 2025
3e4ad36
Make kernel modes explicit
ahadnagy Aug 1, 2025
fa6eee9
Merge pull request #43 from huggingface/rocm-kernels-support
ahadnagy Aug 1, 2025
e946804
update final chat template by Matt. x2
Vaibhavs10 Aug 1, 2025
1a8728d
Thanks Matt for his tireless efforts!
Vaibhavs10 Aug 1, 2025
f322506
Merge pull request #44 from huggingface/vb/up-template-3
Vaibhavs10 Aug 1, 2025
50b8250
Fix installation
lewtun Aug 1, 2025
dec98d8
Update setup.py
lewtun Aug 1, 2025
0c6f911
allow no content
qgallouedec Aug 1, 2025
181c625
fix: update message handling in write_tokenizer function
qgallouedec Aug 1, 2025
fa7a66d
Merge pull request #45 from huggingface/fix-install
lewtun Aug 1, 2025
7c74123
Fix template logic for user message role
qgallouedec Aug 2, 2025
672bc17
Merge pull request #47 from huggingface/fix-chat-template
lewtun Aug 2, 2025
402976d
Merge branch 'main' of github.com:huggingface/new-model-addition-open…
ArthurZucker Aug 2, 2025
5509620
Merge branch 'add-oai' of github.com:huggingface/new-model-addition-o…
ArthurZucker Aug 2, 2025
9d27880
last nits for CB and flash_paged!
ArthurZucker Aug 2, 2025
4cf6186
there was one bad merge
ArthurZucker Aug 2, 2025
cac4c09
fix CB (hardcode for now, its just using kv groups instead)
ArthurZucker Aug 2, 2025
eeef8c8
fix
MekkCyber Aug 2, 2025
45fbc18
better fix for device_map
SunMarc Aug 2, 2025
92a2a49
Merge pull request #48 from huggingface/fix_target_device
SunMarc Aug 2, 2025
6dd3a72
minor device fix
SunMarc Aug 2, 2025
5ef7f3f
Fix flash paged
ArthurZucker Aug 3, 2025
47ae152
Merge branch 'add-oai' of github.com:huggingface/new-model-addition-o…
ArthurZucker Aug 3, 2025
d2303c7
updates
ArthurZucker Aug 3, 2025
ed511f2
Revert "remove dtensors, not explicit (#39840)"
ArthurZucker Aug 3, 2025
d8092b9
Merge pull request #46 from huggingface/fix-tool-chat-template
lewtun Aug 3, 2025
e9b3708
update
ArthurZucker Aug 3, 2025
70750d9
Revert "remove dtensors, not explicit (#39840)"
ArthurZucker Aug 3, 2025
3557689
fix merge
ArthurZucker Aug 3, 2025
fbc6815
Merge branch 'add-oai' of github.com:huggingface/new-model-addition-o…
ArthurZucker Aug 3, 2025
b939303
fix
MekkCyber Aug 3, 2025
d238182
Fix line break when custom model indentity
qgallouedec Aug 3, 2025
7c364da
Merge pull request #49 from huggingface/fix_import_triton_kernels
MekkCyber Aug 4, 2025
088a607
nits testing
ArthurZucker Aug 4, 2025
d91814b
to locals first and pass sliding window to flash paged
ArthurZucker Aug 4, 2025
b392bc5
Merge branch 'add-oai' of github.com:huggingface/new-model-addition-o…
ArthurZucker Aug 4, 2025
27bd828
register modes for MegaBlocksMoeMlp
ArthurZucker Aug 4, 2025
b667b7c
add integration test in fixtures -> now update the tests to use it!
ArthurZucker Aug 4, 2025
afffd58
update integration tests
ArthurZucker Aug 4, 2025
00d6703
initial fix
MekkCyber Aug 4, 2025
6a8710e
style and update tests
ArthurZucker Aug 4, 2025
4cb0a93
fix
MekkCyber Aug 4, 2025
b696531
Merge pull request #53 from huggingface/fix_warning
MekkCyber Aug 4, 2025
a9b7b39
Merge pull request #52 from huggingface/fix_kernels
MekkCyber Aug 4, 2025
b9f34dd
chore(gpt oss): remove mlp_bias from configuration
tengomucho Aug 4, 2025
eb942a6
stats
SunMarc Aug 4, 2025
94a85f0
Integration tests
LysandreJik Aug 4, 2025
210067a
whoops
LysandreJik Aug 4, 2025
e60807a
Shouldn't move model
LysandreJik Aug 4, 2025
2718a7c
Merge pull request #57 from huggingface/add-oai-integration-test-fixes
LysandreJik Aug 4, 2025
093ffd5
Merge pull request #50 from huggingface/fix-line-break
Vaibhavs10 Aug 4, 2025
c954ef7
Ensure assistant messages without thinking always go to "final" channel
Rocketknight1 Aug 4, 2025
13f6756
More checks to ensure expected format
Rocketknight1 Aug 4, 2025
6ef5c34
Merge pull request #54 from huggingface/remove-mlp_bias
ArthurZucker Aug 4, 2025
bee0515
Add pad_token_id to model configuration in write_model function (#51)
qgallouedec Aug 4, 2025
e1f46b4
Add oai fix fast tests (#59)
LysandreJik Aug 4, 2025
e29f659
Update src/transformers/models/gpt_oss/convert_gpt_oss_weights_to_hf.py
Rocketknight1 Aug 4, 2025
5c6255e
Update src/transformers/models/gpt_oss/convert_gpt_oss_weights_to_hf.py
Rocketknight1 Aug 4, 2025
889fe01
Update src/transformers/models/gpt_oss/convert_gpt_oss_weights_to_hf.py
Rocketknight1 Aug 4, 2025
25e8bd8
Merge pull request #58 from huggingface/update-template
Rocketknight1 Aug 4, 2025
9844308
reasoning -> Reasoning
Vaibhavs10 Aug 4, 2025
563b5cf
Merge pull request #61 from huggingface/vb/upd-chat-temp-reasoning
Vaibhavs10 Aug 4, 2025
b222c6f
Add additional integration tests
LysandreJik Aug 4, 2025
8421054
fixup
LysandreJik Aug 4, 2025
6001771
Slight fixes
LysandreJik Aug 4, 2025
e360f17
align chat template with harmony
qgallouedec Aug 5, 2025
5fe06b9
simplify
qgallouedec Aug 5, 2025
ba792c9
Add comment
LysandreJik Aug 5, 2025
afc0fc4
torch testing assert close
LysandreJik Aug 5, 2025
7bddb91
torch testing assert close
LysandreJik Aug 5, 2025
4068437
torch testing assert close
LysandreJik Aug 5, 2025
94f11c5
torch testing assert close
LysandreJik Aug 5, 2025
3660b2b
torch testing assert close
LysandreJik Aug 5, 2025
974987f
torch testing assert close
LysandreJik Aug 5, 2025
768b582
Merge pull request #56 from huggingface/better-stats
SunMarc Aug 5, 2025
d881a20
Revert fixup
LysandreJik Aug 5, 2025
0c7db23
Merge pull request #62 from huggingface/add-new-integration-tests
LysandreJik Aug 5, 2025
6698004
skip 2 test remove todo
ArthurZucker Aug 5, 2025
208b83c
Merge branch 'add-oai' of github.com:huggingface/new-model-addition-o…
ArthurZucker Aug 5, 2025
54cf55f
merge
ArthurZucker Aug 5, 2025
f19e04b
padding side should be left for integration tests
ArthurZucker Aug 5, 2025
1f7cad0
fix modular wrt to changes made to modeling
ArthurZucker Aug 5, 2025
6973ba4
style
ArthurZucker Aug 5, 2025
9ab5897
Merge branch 'main' of github.com:huggingface/transformers into add-oai
ArthurZucker Aug 5, 2025
1f47841
isort
ArthurZucker Aug 5, 2025
865b368
fix opies for the loss
ArthurZucker Aug 5, 2025
75f13d0
mmmm
ArthurZucker Aug 5, 2025
2 changes: 2 additions & 0 deletions docs/source/en/_toctree.yml
@@ -617,6 +617,8 @@
title: OLMoE
- local: model_doc/open-llama
title: Open-Llama
- local: model_doc/openai_moe
title: OpenAIMoe
- local: model_doc/opt
title: OPT
- local: model_doc/pegasus
58 changes: 58 additions & 0 deletions docs/source/en/model_doc/openai_moe.md
@@ -0,0 +1,58 @@
<!--Copyright 2025 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->

<div style="float: right;">
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
">
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
</div>

# OpenAIMoE

## Overview

The OpenAIMoE model was proposed in [<INSERT PAPER NAME HERE>](<INSERT PAPER LINK HERE>) by <INSERT AUTHORS HERE>.
<INSERT SHORT SUMMARY HERE>

The abstract from the paper is the following:

*<INSERT PAPER ABSTRACT HERE>*

Tips:

<INSERT TIPS ABOUT MODEL HERE>

This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/<INSERT YOUR HF USERNAME HERE>).
The original code can be found [here](<INSERT LINK TO GITHUB REPO HERE>).


## OpenAIMoeConfig

[[autodoc]] OpenAIMoeConfig

## OpenAIMoeModel

[[autodoc]] OpenAIMoeModel
- forward

## OpenAIMoeForCausalLM

[[autodoc]] OpenAIMoeForCausalLM
- forward
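
The new model doc above is still the template; as a quick orientation, a minimal generation sketch could look like the following (the checkpoint id and the `torch_dtype`/`device_map` settings are assumptions — the classes are renamed to `gpt_oss` later in this PR and the final hub ids are not part of this diff):

```python
# Minimal sketch, not from the PR: checkpoint id and settings are placeholders.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "openai/gpt-oss-20b"  # hypothetical hub id for the converted weights
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map="auto")

messages = [{"role": "user", "content": "Explain mixture-of-experts routing in one sentence."}]
inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)

outputs = model.generate(inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True))
```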
4 changes: 2 additions & 2 deletions setup.py
@@ -128,7 +128,7 @@
# Keras pin - this is to make sure Keras 3 doesn't destroy us. Remove or change when we have proper support.
"keras>2.9,<2.16",
"keras-nlp>=0.3.1,<0.14.0", # keras-nlp 0.14 doesn't support keras 2, see pin on keras.
"kernels>=0.6.1,<0.7",
"kernels>=0.6.1,<=0.9",
"librosa",
"natten>=0.14.6,<0.15.0",
"nltk<=3.8.1",
@@ -137,7 +137,7 @@
"onnxconverter-common",
"onnxruntime-tools>=1.4.2",
"onnxruntime>=1.4.0",
"openai",
"openai>=1.98.0",
"opencv-python",
"optimum-benchmark>=0.3.0",
"optuna",
1 change: 1 addition & 0 deletions src/transformers/__init__.py
@@ -277,6 +277,7 @@
"GPTQConfig",
"HiggsConfig",
"HqqConfig",
"Mxfp4Config",
"QuantoConfig",
"QuarkConfig",
"FPQuantConfig",
72 changes: 69 additions & 3 deletions src/transformers/commands/serving.py
@@ -909,7 +909,16 @@ def generate_chat_completion(self, req: dict) -> Generator[str, None, None]:
inputs = inputs.to(model.device)
request_id = req.get("request_id", "req_0")

generation_streamer = TextIteratorStreamer(processor, skip_special_tokens=True, skip_prompt=True)
# Temporary hack for GPTOSS 1: don't filter special tokens
skip_special_tokens = True
if "gptoss" in model.config.architectures[0].lower():
skip_special_tokens = False

generation_streamer = TextIteratorStreamer(
processor,
skip_special_tokens=skip_special_tokens,
skip_prompt=True,
)
generation_config = create_generation_config_from_req(req, model_generation_config=model.generation_config)

last_kv_cache = None
@@ -925,12 +934,21 @@
}

def stream_chat_completion(streamer, _request_id):
# Temporary hack for GPTOSS 2: filter out the CoT tokens. Full solution here implies defining new output
# classes and piping the reasoning trace into a new field
filter_cot = False
cot_trace_end = None
if "gptoss" in model.config.architectures[0].lower():
filter_cot = True
cot_trace_end = "<|channel|>final<|message|>"

# Thin wrapper to save the KV cache after generation
def generate_with_cache(**kwargs):
generate_output = model.generate(**kwargs)
self.last_kv_cache = generate_output.past_key_values

thread = Thread(target=generate_with_cache, kwargs=generation_kwargs)
results = ""

try:
thread.start()
@@ -941,6 +959,20 @@
yield self.build_chat_completion_chunk(request_id, role="assistant", model=model_id_and_revision)

for result in streamer:
# Temporary hack for GPTOSS 3: don't emit the final "<|return|>"
if "gptoss" in model.config.architectures[0].lower():
if result.endswith("<|return|>"):
result = result[: -len("<|return|>")]
results += result

# (related to temporary hack 2)
if filter_cot:
if cot_trace_end in results: # end of reasoning trace observed -> stop filtering
filter_cot = False
continue
else:
continue

# ====== TOOL CALL LOGIC ======
if tool_model_family is not None:
# Start of a tool call: reset state variables, set `inside_tool_call`
@@ -1064,7 +1096,16 @@ def generate_response(self, req: dict) -> Generator[str, None, None]:
inputs = inputs.to(model.device)
request_id = req.get("previous_response_id", "req_0")

generation_streamer = TextIteratorStreamer(processor, skip_special_tokens=True, skip_prompt=True)
# Temporary hack for GPTOSS 1: don't filter special tokens
skip_special_tokens = True
if "gptoss" in model.config.architectures[0].lower():
skip_special_tokens = False

generation_streamer = TextIteratorStreamer(
processor,
skip_special_tokens=skip_special_tokens,
skip_prompt=True,
)
generation_config = create_generation_config_from_req(req, model_generation_config=model.generation_config)

last_kv_cache = None
@@ -1081,6 +1122,14 @@ def generate_response(self, req: dict) -> Generator[str, None, None]:
}

def stream_response(streamer, _request_id):
# Temporary hack for GPTOSS 2: filter out the CoT tokens. Full solution here implies defining new output
# classes and piping the reasoning trace into a new field
filter_cot = False
cot_trace_end = None
if "gptoss" in model.config.architectures[0].lower():
filter_cot = True
cot_trace_end = "<|channel|>final<|message|>"

# Thin wrapper to save the KV cache after generation
def generate_with_cache(**kwargs):
generate_output = model.generate(**kwargs)
@@ -1167,14 +1216,29 @@ def generate_with_cache(**kwargs):
# Stream the actual generated text
results = ""
for result in streamer:
# Temporary hack for GPTOSS 3: don't emit the final "<|return|>"
if "gptoss" in model.config.architectures[0].lower():
if result.endswith("<|return|>"):
result = result[: -len("<|return|>")]
results += result

# (related to temporary hack 2)
if filter_cot:
if cot_trace_end in results: # end of reasoning trace observed -> stop filtering
filter_cot = False
results = "" # reset the results -> results will now track the final response
continue
else:
continue

response_output_text_delta = ResponseTextDeltaEvent(
type="response.output_text.delta",
item_id=f"msg_{request_id}",
sequence_number=sequence_number,
output_index=output_index,
content_index=content_index,
delta=result,
logprobs=[{"token": "", "logprob": 99.9}], # TODO: add actual logprobs
)
sequence_number += 1
yield self.build_response_event(response_output_text_delta)
Expand All @@ -1187,6 +1251,7 @@ def generate_with_cache(**kwargs):
output_index=output_index,
content_index=0,
text=results,
logprobs=[{"token": "", "logprob": 99.9}], # TODO: add actual logprobs
)
sequence_number += 1
yield self.build_response_event(response_output_text_done)
@@ -1446,9 +1511,10 @@ def _load_model_and_data_processor(self, model_id_and_revision: str):
"attn_implementation": args.attn_implementation,
"torch_dtype": torch_dtype,
"device_map": "auto",
"quantization_config": quantization_config,
"trust_remote_code": args.trust_remote_code,
}
if quantization_config is not None:
model_kwargs["quantization_config"] = quantization_config

config = AutoConfig.from_pretrained(model_id, **model_kwargs)
architecture = getattr(transformers, config.architectures[0])
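
Taken together, the serving changes above apply three GPT-OSS-specific hacks to the token stream: keep special tokens in the streamer, strip a trailing `<|return|>`, and suppress output until the `<|channel|>final<|message|>` marker ends the reasoning trace. A self-contained sketch of that filtering follows; the helper name and the toy stream are illustrative, not part of the PR:

```python
# Illustrative sketch of the CoT filtering used in stream_chat_completion / stream_response above.
COT_END = "<|channel|>final<|message|>"
RETURN_TOKEN = "<|return|>"

def filter_final_channel(stream):
    """Yield only the final-channel text, dropping the reasoning trace and the closing token."""
    results = ""
    filter_cot = True
    for result in stream:
        if result.endswith(RETURN_TOKEN):   # hack 3: never emit the final "<|return|>"
            result = result[: -len(RETURN_TOKEN)]
        results += result
        if filter_cot:
            if COT_END in results:          # end of reasoning trace observed -> stop filtering
                filter_cot = False
            continue                        # the marker chunk itself is not emitted either
        yield result

toy_stream = ["<|channel|>analysis<|message|>let me think...", "<|channel|>final<|message|>", "Hello", " world", "<|return|>"]
print("".join(filter_final_channel(toy_stream)))  # -> "Hello world"
```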
4 changes: 2 additions & 2 deletions src/transformers/dependency_versions_table.py
@@ -34,7 +34,7 @@
"kenlm": "kenlm",
"keras": "keras>2.9,<2.16",
"keras-nlp": "keras-nlp>=0.3.1,<0.14.0",
"kernels": "kernels>=0.6.1,<0.7",
"kernels": "kernels>=0.6.1,<=0.9",
"librosa": "librosa",
"natten": "natten>=0.14.6,<0.15.0",
"nltk": "nltk<=3.8.1",
@@ -43,7 +43,7 @@
"onnxconverter-common": "onnxconverter-common",
"onnxruntime-tools": "onnxruntime-tools>=1.4.2",
"onnxruntime": "onnxruntime>=1.4.0",
"openai": "openai",
"openai": "openai>=1.98.0",
"opencv-python": "opencv-python",
"optimum-benchmark": "optimum-benchmark>=0.3.0",
"optuna": "optuna",
37 changes: 23 additions & 14 deletions src/transformers/generation/continuous_batching.py
@@ -182,27 +182,29 @@ def __init__(
f"Number of key value heads {num_key_value_heads} must be divisible by tensor parallel size {tp_size}."
)
# If the model is using tensor parallelism, we need to adjust the number of heads accordingly.
self.num_key_value_heads //= tp_size
# self.num_key_value_heads //= tp_size

self.head_dim = (
config.head_dim if hasattr(config, "head_dim") else config.hidden_size // config.num_attention_heads
)
self.num_hidden_layers = config.num_hidden_layers

# Calculate optimal block size and number if not provided
num_blocks = getattr(generation_config, "num_blocks", None)
num_blocks = getattr(generation_config, "num_blocks", 1024)
block_size = getattr(generation_config, "block_size", 32)
max_memory_percent = getattr(generation_config, "max_memory", 0.9)
num_blocks, max_batch_tokens = compute_optimal_blocks(
generation_config.max_new_tokens,
block_size=block_size,
head_dim=self.head_dim,
num_layers=self.num_hidden_layers,
num_heads=self.num_key_value_heads,
max_memory_percent=max_memory_percent,
dtype=dtype,
num_blocks=num_blocks,
)
max_batch_tokens = getattr(generation_config, "max_batch_tokens", 256)
if num_blocks is None or max_batch_tokens is None:
num_blocks, max_batch_tokens = compute_optimal_blocks(
generation_config.max_new_tokens,
block_size=block_size,
head_dim=self.head_dim,
num_layers=self.num_hidden_layers,
num_heads=self.num_key_value_heads,
max_memory_percent=max_memory_percent,
dtype=dtype,
num_blocks=num_blocks,
)
logger.warning(
f"Using calculated num_blocks={num_blocks}, block_size={block_size}, max concurrent requests {max_batch_tokens}"
)
@@ -960,7 +962,14 @@ def _build_tensors(

@traced
def _sync(self):
return self.output_ids.tolist()[0] # should be the only synch we do
if self.output_ids is not None:
try:
out = self.output_ids.tolist()[0] # should be the only synch we do
except Exception:
out = [0, 1]
else:
out = [0, 0]
return out

@traced
def _maybe_send_output(self, state: RequestState, token: int):
@@ -1250,7 +1259,7 @@ def _run_generation_loop(self):
self.model.device,
self.model.dtype,
num_requests=len(self.input_queue.queue),
tp_size=getattr(self.model, "tp_size"),
tp_size=getattr(self.model, "_tp_size", 8),  # TODO: quantized converted checkpoints don't set this
)

scheduler = None
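
The cache-sizing change above now reads `num_blocks`, `block_size`, `max_batch_tokens` and `max_memory` from the generation config and only falls back to `compute_optimal_blocks` when one of them resolves to `None`. A sketch of how a caller could pin these values explicitly — the attribute names mirror the `getattr` lookups above, and the numbers are arbitrary examples:

```python
# Sketch: pinning the paged KV-cache geometry through GenerationConfig attributes.
from transformers import GenerationConfig

generation_config = GenerationConfig(max_new_tokens=128)
generation_config.num_blocks = 1024       # number of KV-cache blocks
generation_config.block_size = 32         # tokens per block
generation_config.max_batch_tokens = 256  # max tokens scheduled per step
generation_config.max_memory = 0.9        # fraction of memory the cache may use
```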
15 changes: 15 additions & 0 deletions src/transformers/integrations/__init__.py
@@ -119,6 +119,14 @@
"run_hp_search_sigopt",
"run_hp_search_wandb",
],
"mxfp4": [
"Mxfp4GptOssExperts",
"convert_moe_packed_tensors",
"dequantize",
"load_and_swizzle_mxfp4",
"quantize_to_mxfp4",
"replace_with_mxfp4_linear",
],
"peft": ["PeftAdapterMixin"],
"quanto": ["replace_with_quanto_layers"],
"spqr": ["replace_with_spqr_linear"],
@@ -255,6 +263,13 @@
run_hp_search_sigopt,
run_hp_search_wandb,
)
from .mxfp4 import (
Mxfp4GptOssExperts,
dequantize,
load_and_swizzle_mxfp4,
quantize_to_mxfp4,
replace_with_mxfp4_linear,
)
from .peft import PeftAdapterMixin
from .quanto import replace_with_quanto_layers
from .spqr import replace_with_spqr_linear
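
With the `mxfp4` helpers exported above (and `Mxfp4Config` added to the top-level `__init__` earlier in this diff), loading the quantized checkpoint could look roughly as follows. The `dequantize` flag is an assumption drawn from the `default_to_dequantize_training` commits, not something shown in this hunk, and the checkpoint id is a placeholder:

```python
# Rough sketch, assuming Mxfp4Config is the public entry point for the MXFP4 path.
from transformers import AutoModelForCausalLM, Mxfp4Config

model_id = "openai/gpt-oss-20b"  # hypothetical checkpoint id

# Keep the native MXFP4 weights (needs the triton kernels and a supported GPU).
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")

# Or dequantize to a higher-precision dtype, e.g. for fine-tuning (assumed flag).
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=Mxfp4Config(dequantize=True),
    torch_dtype="auto",
    device_map="auto",
)
```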
6 changes: 4 additions & 2 deletions src/transformers/integrations/flash_paged.py
@@ -50,8 +50,10 @@ def paged_attention_forward(
"""
k, v = cache.update(k, v, module.layer_idx, cumulative_seqlens_k=cumulative_seqlens_k, **kwargs)

sliding_window = (-1, -1) if not getattr(module, "sliding_window", False) else (module.sliding_window, 0)
if implementation is not None:
flash_attn_varlen_func = implementation.flash_attn_varlen_func
custom_kwargs = {"s_aux": kwargs.get("s_aux")}
attn_output = flash_attn_varlen_func(
q.transpose(1, 2).squeeze(0).contiguous(),
k.transpose(1, 2).squeeze(0).contiguous(),
@@ -62,9 +64,9 @@
max_seqlen_k,
softmax_scale=module.scaling,
causal=True, # kind of a must, it automatically aligns the mask for q < k
window_size=(-1, -1), # -1 means infinite context window
window_size=sliding_window, # -1 means infinite context window
# block_table=block_tables, -> torch.Tensor
# **kwargs,
**custom_kwargs,
)
if isinstance(attn_output, tuple):
attn_output = attn_output[0]
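
The paged-attention change derives flash-attention's `window_size` tuple from the module's `sliding_window` attribute, and the new `s_aux` kwarg presumably forwards the attention-sink term to the hub kernel (given the "Clamp sink logits" commits). A small sketch of the window convention used above — `(-1, -1)` means an unbounded window while `(w, 0)` limits attention to the previous `w` tokens:

```python
# Sketch of the window_size convention passed to flash_attn_varlen_func above.
def window_size_for(module) -> tuple[int, int]:
    sliding_window = getattr(module, "sliding_window", None)
    return (sliding_window, 0) if sliding_window else (-1, -1)
```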